From e4ca0e59c39442546866f3dd514a3a5956577daf Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 3 Sep 2024 20:59:04 +0300
Subject: types: Complement the aligned types with signed 64-bit one

Some users may want to use an aligned signed 64-bit type.
Provide it for them.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20240903180218.3640501-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/uapi/linux/types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index 6375a0684052..48b933938877 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -53,6 +53,7 @@ typedef __u32 __bitwise __wsum;
  * No conversions are necessary between 32-bit user-space and a 64-bit kernel.
  */
 #define __aligned_u64 __u64 __attribute__((aligned(8)))
+#define __aligned_s64 __s64 __attribute__((aligned(8)))
 #define __aligned_be64 __be64 __attribute__((aligned(8)))
 #define __aligned_le64 __le64 __attribute__((aligned(8)))
 
-- 
cgit v1.2.3


From f69767a1ada3ac74be2e1ac0795a05e1d1384eff Mon Sep 17 00:00:00 2001
From: Wei Huang <wei.huang2@amd.com>
Date: Wed, 2 Oct 2024 11:59:50 -0500
Subject: PCI: Add TLP Processing Hints (TPH) support

Add support for PCIe TLP Processing Hints (TPH) (see PCIe r6.2,
sec 6.17).

Add TPH register definitions in pci_regs.h, including the TPH Requester
capability register, TPH Requester control register, TPH Completer
capability, and the ST fields of MSI-X entry.

Introduce pcie_enable_tph() and pcie_disable_tph(), enabling drivers to
toggle TPH support and configure specific ST mode as needed. Also add a new
kernel parameter, "pci=notph", allowing users to disable TPH support across
the entire system.

Link: https://lore.kernel.org/r/20241002165954.128085-2-wei.huang2@amd.com
Co-developed-by: Jing Liu <jing2.liu@intel.com>
Co-developed-by: Paul Luse <paul.e.luse@linux.intel.com>
Co-developed-by: Eric Van Tassell <Eric.VanTassell@amd.com>
Signed-off-by: Jing Liu <jing2.liu@intel.com>
Signed-off-by: Paul Luse <paul.e.luse@linux.intel.com>
Signed-off-by: Eric Van Tassell <Eric.VanTassell@amd.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
---
 Documentation/admin-guide/kernel-parameters.txt |   4 +
 drivers/pci/Kconfig                             |   9 ++
 drivers/pci/Makefile                            |   1 +
 drivers/pci/pci.c                               |   4 +
 drivers/pci/pci.h                               |  12 ++
 drivers/pci/probe.c                             |   1 +
 drivers/pci/tph.c                               | 197 ++++++++++++++++++++++++
 include/linux/pci-tph.h                         |  21 +++
 include/linux/pci.h                             |   7 +
 include/uapi/linux/pci_regs.h                   |  37 ++++-
 10 files changed, 285 insertions(+), 8 deletions(-)
 create mode 100644 drivers/pci/tph.c
 create mode 100644 include/linux/pci-tph.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1518343bbe22..178995b07451 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4678,6 +4678,10 @@
 		nomio		[S390] Do not use MIO instructions.
 		norid		[S390] ignore the RID field and force use of
 				one PCI domain per PCI function
+		notph		[PCIE] If the PCIE_TPH kernel config parameter
+				is enabled, this kernel boot option can be used
+				to disable PCIe TLP Processing Hints support
+				system-wide.
 
 	pcie_aspm=	[PCIE] Forcibly enable or ignore PCIe Active State Power
 			Management.
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 0d94e4a967d8..2f270e4414b3 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -173,6 +173,15 @@ config PCI_PASID
 
 	  If unsure, say N.
 
+config PCIE_TPH
+	bool "TLP Processing Hints"
+	help
+	  This option adds support for PCIe TLP Processing Hints (TPH).
+	  TPH allows endpoint devices to provide optimization hints, such as
+	  desired caching behavior, for requests that target memory space.
+	  These hints, called Steering Tags, can empower the system hardware
+	  to optimize the utilization of platform resources.
+
 config PCI_P2PDMA
 	bool "PCI peer-to-peer transfer support"
 	depends on ZONE_DEVICE
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 374c5c06d92f..b2a100f2e24a 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_VGA_ARB)		+= vgaarb.o
 obj-$(CONFIG_PCI_DOE)		+= doe.o
 obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o
 obj-$(CONFIG_PCI_NPEM)		+= npem.o
+obj-$(CONFIG_PCIE_TPH)		+= tph.o
 
 # Endpoint library must be initialized before its users
 obj-$(CONFIG_PCI_ENDPOINT)	+= endpoint/
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7d85c04fbba2..89dafecc869b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1828,6 +1828,7 @@ int pci_save_state(struct pci_dev *dev)
 	pci_save_dpc_state(dev);
 	pci_save_aer_state(dev);
 	pci_save_ptm_state(dev);
+	pci_save_tph_state(dev);
 	return pci_save_vc_state(dev);
 }
 EXPORT_SYMBOL(pci_save_state);
@@ -1933,6 +1934,7 @@ void pci_restore_state(struct pci_dev *dev)
 	pci_restore_rebar_state(dev);
 	pci_restore_dpc_state(dev);
 	pci_restore_ptm_state(dev);
+	pci_restore_tph_state(dev);
 
 	pci_aer_clear_status(dev);
 	pci_restore_aer_state(dev);
@@ -6896,6 +6898,8 @@ static int __init pci_setup(char *str)
 				pci_no_domains();
 			} else if (!strncmp(str, "noari", 5)) {
 				pcie_ari_disabled = true;
+			} else if (!strncmp(str, "notph", 5)) {
+				pci_no_tph();
 			} else if (!strncmp(str, "cbiosize=", 9)) {
 				pci_cardbus_io_size = memparse(str + 9, &str);
 			} else if (!strncmp(str, "cbmemsize=", 10)) {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 14d00ce45bfa..d89fdbf04f36 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -597,6 +597,18 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
 
 #endif /* CONFIG_PCI_IOV */
 
+#ifdef CONFIG_PCIE_TPH
+void pci_restore_tph_state(struct pci_dev *dev);
+void pci_save_tph_state(struct pci_dev *dev);
+void pci_no_tph(void);
+void pci_tph_init(struct pci_dev *dev);
+#else
+static inline void pci_restore_tph_state(struct pci_dev *dev) { }
+static inline void pci_save_tph_state(struct pci_dev *dev) { }
+static inline void pci_no_tph(void) { }
+static inline void pci_tph_init(struct pci_dev *dev) { }
+#endif
+
 #ifdef CONFIG_PCIE_PTM
 void pci_ptm_init(struct pci_dev *dev);
 void pci_save_ptm_state(struct pci_dev *dev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..b086d53a9048 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2495,6 +2495,7 @@ static void pci_init_capabilities(struct pci_dev *dev)
 	pci_dpc_init(dev);		/* Downstream Port Containment */
 	pci_rcec_init(dev);		/* Root Complex Event Collector */
 	pci_doe_init(dev);		/* Data Object Exchange */
+	pci_tph_init(dev);		/* TLP Processing Hints */
 
 	pcie_report_downtraining(dev);
 	pci_init_reset_methods(dev);
diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
new file mode 100644
index 000000000000..4d6317cbf8a6
--- /dev/null
+++ b/drivers/pci/tph.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * TPH (TLP Processing Hints) support
+ *
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ *     Eric Van Tassell <Eric.VanTassell@amd.com>
+ *     Wei Huang <wei.huang2@amd.com>
+ */
+#include <linux/pci.h>
+#include <linux/bitfield.h>
+#include <linux/pci-tph.h>
+
+#include "pci.h"
+
+/* System-wide TPH disabled */
+static bool pci_tph_disabled;
+
+static u8 get_st_modes(struct pci_dev *pdev)
+{
+	u32 reg;
+
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CAP, &reg);
+	reg &= PCI_TPH_CAP_ST_NS | PCI_TPH_CAP_ST_IV | PCI_TPH_CAP_ST_DS;
+
+	return reg;
+}
+
+/* Return device's Root Port completer capability */
+static u8 get_rp_completer_type(struct pci_dev *pdev)
+{
+	struct pci_dev *rp;
+	u32 reg;
+	int ret;
+
+	rp = pcie_find_root_port(pdev);
+	if (!rp)
+		return 0;
+
+	ret = pcie_capability_read_dword(rp, PCI_EXP_DEVCAP2, &reg);
+	if (ret)
+		return 0;
+
+	return FIELD_GET(PCI_EXP_DEVCAP2_TPH_COMP_MASK, reg);
+}
+
+/**
+ * pcie_disable_tph - Turn off TPH support for device
+ * @pdev: PCI device
+ *
+ * Return: none
+ */
+void pcie_disable_tph(struct pci_dev *pdev)
+{
+	if (!pdev->tph_cap)
+		return;
+
+	if (!pdev->tph_enabled)
+		return;
+
+	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, 0);
+
+	pdev->tph_mode = 0;
+	pdev->tph_req_type = 0;
+	pdev->tph_enabled = 0;
+}
+EXPORT_SYMBOL(pcie_disable_tph);
+
+/**
+ * pcie_enable_tph - Enable TPH support for device using a specific ST mode
+ * @pdev: PCI device
+ * @mode: ST mode to enable. Currently supported modes include:
+ *
+ *   - PCI_TPH_ST_NS_MODE: No ST Mode
+ *   - PCI_TPH_ST_IV_MODE: Interrupt Vector Mode
+ *   - PCI_TPH_ST_DS_MODE: Device Specific Mode
+ *
+ * Check whether the mode is actually supported by the device before enabling
+ * and return an error if not. Additionally determine what types of requests,
+ * TPH or extended TPH, can be issued by the device based on its TPH requester
+ * capability and the Root Port's completer capability.
+ *
+ * Return: 0 on success, otherwise negative value (-errno)
+ */
+int pcie_enable_tph(struct pci_dev *pdev, int mode)
+{
+	u32 reg;
+	u8 dev_modes;
+	u8 rp_req_type;
+
+	/* Honor "notph" kernel parameter */
+	if (pci_tph_disabled)
+		return -EINVAL;
+
+	if (!pdev->tph_cap)
+		return -EINVAL;
+
+	if (pdev->tph_enabled)
+		return -EBUSY;
+
+	/* Sanitize and check ST mode compatibility */
+	mode &= PCI_TPH_CTRL_MODE_SEL_MASK;
+	dev_modes = get_st_modes(pdev);
+	if (!((1 << mode) & dev_modes))
+		return -EINVAL;
+
+	pdev->tph_mode = mode;
+
+	/* Get req_type supported by device and its Root Port */
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CAP, &reg);
+	if (FIELD_GET(PCI_TPH_CAP_EXT_TPH, reg))
+		pdev->tph_req_type = PCI_TPH_REQ_EXT_TPH;
+	else
+		pdev->tph_req_type = PCI_TPH_REQ_TPH_ONLY;
+
+	rp_req_type = get_rp_completer_type(pdev);
+
+	/* Final req_type is the smallest value of two */
+	pdev->tph_req_type = min(pdev->tph_req_type, rp_req_type);
+
+	if (pdev->tph_req_type == PCI_TPH_REQ_DISABLE)
+		return -EINVAL;
+
+	/* Write them into TPH control register */
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, &reg);
+
+	reg &= ~PCI_TPH_CTRL_MODE_SEL_MASK;
+	reg |= FIELD_PREP(PCI_TPH_CTRL_MODE_SEL_MASK, pdev->tph_mode);
+
+	reg &= ~PCI_TPH_CTRL_REQ_EN_MASK;
+	reg |= FIELD_PREP(PCI_TPH_CTRL_REQ_EN_MASK, pdev->tph_req_type);
+
+	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, reg);
+
+	pdev->tph_enabled = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(pcie_enable_tph);
+
+void pci_restore_tph_state(struct pci_dev *pdev)
+{
+	struct pci_cap_saved_state *save_state;
+	u32 *cap;
+
+	if (!pdev->tph_cap)
+		return;
+
+	if (!pdev->tph_enabled)
+		return;
+
+	save_state = pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_TPH);
+	if (!save_state)
+		return;
+
+	/* Restore control register */
+	cap = &save_state->cap.data[0];
+	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, *cap++);
+}
+
+void pci_save_tph_state(struct pci_dev *pdev)
+{
+	struct pci_cap_saved_state *save_state;
+	u32 *cap;
+
+	if (!pdev->tph_cap)
+		return;
+
+	if (!pdev->tph_enabled)
+		return;
+
+	save_state = pci_find_saved_ext_cap(pdev, PCI_EXT_CAP_ID_TPH);
+	if (!save_state)
+		return;
+
+	/* Save control register */
+	cap = &save_state->cap.data[0];
+	pci_read_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, cap++);
+}
+
+void pci_no_tph(void)
+{
+	pci_tph_disabled = true;
+
+	pr_info("PCIe TPH is disabled\n");
+}
+
+void pci_tph_init(struct pci_dev *pdev)
+{
+	u32 save_size;
+
+	pdev->tph_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_TPH);
+	if (!pdev->tph_cap)
+		return;
+
+	save_size = sizeof(u32);
+	pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_TPH, save_size);
+}
diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h
new file mode 100644
index 000000000000..58654a334ffb
--- /dev/null
+++ b/include/linux/pci-tph.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * TPH (TLP Processing Hints)
+ *
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ *     Eric Van Tassell <Eric.VanTassell@amd.com>
+ *     Wei Huang <wei.huang2@amd.com>
+ */
+#ifndef LINUX_PCI_TPH_H
+#define LINUX_PCI_TPH_H
+
+#ifdef CONFIG_PCIE_TPH
+void pcie_disable_tph(struct pci_dev *pdev);
+int pcie_enable_tph(struct pci_dev *pdev, int mode);
+#else
+static inline void pcie_disable_tph(struct pci_dev *pdev) { }
+static inline int pcie_enable_tph(struct pci_dev *pdev, int mode)
+{ return -EINVAL; }
+#endif
+
+#endif /* LINUX_PCI_TPH_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..8351d76b6e12 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -434,6 +434,7 @@ struct pci_dev {
 	unsigned int	ats_enabled:1;		/* Address Translation Svc */
 	unsigned int	pasid_enabled:1;	/* Process Address Space ID */
 	unsigned int	pri_enabled:1;		/* Page Request Interface */
+	unsigned int	tph_enabled:1;		/* TLP Processing Hints */
 	unsigned int	is_managed:1;		/* Managed via devres */
 	unsigned int	is_msi_managed:1;	/* MSI release via devres installed */
 	unsigned int	needs_freset:1;		/* Requires fundamental reset */
@@ -534,6 +535,12 @@ struct pci_dev {
 
 	/* These methods index pci_reset_fn_methods[] */
 	u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */
+
+#ifdef CONFIG_PCIE_TPH
+	u16		tph_cap;	/* TPH capability offset */
+	u8		tph_mode;	/* TPH mode */
+	u8		tph_req_type;	/* TPH requester type */
+#endif
 };
 
 static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 12323b3334a9..155dea741615 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -340,7 +340,8 @@
 #define PCI_MSIX_ENTRY_UPPER_ADDR	0x4  /* Message Upper Address */
 #define PCI_MSIX_ENTRY_DATA		0x8  /* Message Data */
 #define PCI_MSIX_ENTRY_VECTOR_CTRL	0xc  /* Vector Control */
-#define  PCI_MSIX_ENTRY_CTRL_MASKBIT	0x00000001
+#define  PCI_MSIX_ENTRY_CTRL_MASKBIT	0x00000001  /* Mask Bit */
+#define  PCI_MSIX_ENTRY_CTRL_ST		0xffff0000  /* Steering Tag */
 
 /* CompactPCI Hotswap Register */
 
@@ -659,6 +660,7 @@
 #define  PCI_EXP_DEVCAP2_ATOMIC_COMP64	0x00000100 /* 64b AtomicOp completion */
 #define  PCI_EXP_DEVCAP2_ATOMIC_COMP128	0x00000200 /* 128b AtomicOp completion */
 #define  PCI_EXP_DEVCAP2_LTR		0x00000800 /* Latency tolerance reporting */
+#define  PCI_EXP_DEVCAP2_TPH_COMP_MASK	0x00003000 /* TPH completer support */
 #define  PCI_EXP_DEVCAP2_OBFF_MASK	0x000c0000 /* OBFF support mechanism */
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
 #define  PCI_EXP_DEVCAP2_OBFF_WAKE	0x00080000 /* Re-use WAKE# for OBFF */
@@ -1023,15 +1025,34 @@
 #define  PCI_DPA_CAP_SUBSTATE_MASK	0x1F	/* # substates - 1 */
 #define PCI_DPA_BASE_SIZEOF	16	/* size with 0 substates */
 
+/* TPH Completer Support */
+#define PCI_EXP_DEVCAP2_TPH_COMP_NONE		0x0 /* None */
+#define PCI_EXP_DEVCAP2_TPH_COMP_TPH_ONLY	0x1 /* TPH only */
+#define PCI_EXP_DEVCAP2_TPH_COMP_EXT_TPH	0x3 /* TPH and Extended TPH */
+
 /* TPH Requester */
 #define PCI_TPH_CAP		4	/* capability register */
-#define  PCI_TPH_CAP_LOC_MASK	0x600	/* location mask */
-#define   PCI_TPH_LOC_NONE	0x000	/* no location */
-#define   PCI_TPH_LOC_CAP	0x200	/* in capability */
-#define   PCI_TPH_LOC_MSIX	0x400	/* in MSI-X */
-#define PCI_TPH_CAP_ST_MASK	0x07FF0000	/* ST table mask */
-#define PCI_TPH_CAP_ST_SHIFT	16	/* ST table shift */
-#define PCI_TPH_BASE_SIZEOF	0xc	/* size with no ST table */
+#define  PCI_TPH_CAP_ST_NS	0x00000001 /* No ST Mode Supported */
+#define  PCI_TPH_CAP_ST_IV	0x00000002 /* Interrupt Vector Mode Supported */
+#define  PCI_TPH_CAP_ST_DS	0x00000004 /* Device Specific Mode Supported */
+#define  PCI_TPH_CAP_EXT_TPH	0x00000100 /* Ext TPH Requester Supported */
+#define  PCI_TPH_CAP_LOC_MASK	0x00000600 /* ST Table Location */
+#define   PCI_TPH_LOC_NONE	0x00000000 /* Not present */
+#define   PCI_TPH_LOC_CAP	0x00000200 /* In capability */
+#define   PCI_TPH_LOC_MSIX	0x00000400 /* In MSI-X */
+#define  PCI_TPH_CAP_ST_MASK	0x07FF0000 /* ST Table Size */
+#define  PCI_TPH_CAP_ST_SHIFT	16	/* ST Table Size shift */
+#define PCI_TPH_BASE_SIZEOF	0xc	/* Size with no ST table */
+
+#define PCI_TPH_CTRL		8	/* control register */
+#define  PCI_TPH_CTRL_MODE_SEL_MASK	0x00000007 /* ST Mode Select */
+#define   PCI_TPH_ST_NS_MODE		0x0 /* No ST Mode */
+#define   PCI_TPH_ST_IV_MODE		0x1 /* Interrupt Vector Mode */
+#define   PCI_TPH_ST_DS_MODE		0x2 /* Device Specific Mode */
+#define  PCI_TPH_CTRL_REQ_EN_MASK	0x00000300 /* TPH Requester Enable */
+#define   PCI_TPH_REQ_DISABLE		0x0 /* No TPH requests allowed */
+#define   PCI_TPH_REQ_TPH_ONLY		0x1 /* TPH only requests allowed */
+#define   PCI_TPH_REQ_EXT_TPH		0x3 /* Extended TPH requests allowed */
 
 /* Downstream Port Containment */
 #define PCI_EXP_DPC_CAP			0x04	/* DPC Capability */
-- 
cgit v1.2.3


From 4f647a780f3606acbd2116248d51eadb4d865615 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Mon, 16 Sep 2024 02:17:10 -0700
Subject: bpf: __bpf_fastcall for bpf_get_smp_processor_id in uapi

Since [1], the kernel supports the __bpf_fastcall attribute for the helper
function bpf_get_smp_processor_id(). Update the uapi definition for this
helper so that the attribute appears in the generated bpf_helper_defs.h.

[1] commit 91b7fbf3936f ("bpf, x86, riscv, arm: no_caller_saved_registers for bpf_get_smp_processor_id()")

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240916091712.2929279-3-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 2 ++
 tools/include/uapi/linux/bpf.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c6cd7c7aeeee..8ab4d8184b9d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1970,6 +1970,8 @@ union bpf_attr {
  * 		program.
  * 	Return
  * 		The SMP id of the processor running the program.
+ * 	Attributes
+ * 		__bpf_fastcall
  *
  * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
  * 	Description
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1fb3cb2636e6..7610883c8191 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1970,6 +1970,8 @@ union bpf_attr {
  * 		program.
  * 	Return
  * 		The SMP id of the processor running the program.
+ * 	Attributes
+ * 		__bpf_fastcall
  *
  * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
  * 	Description
-- 
cgit v1.2.3


From 91e102e79740ae43ded050ccac71aa3371db4f33 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:58:43 +0100
Subject: prctl: arch-agnostic prctl for shadow stack

Three architectures (x86, aarch64, riscv) have announced support for
shadow stacks with fairly similar functionality.  While x86 is using
arch_prctl() to control the functionality neither arm64 nor riscv uses
that interface so this patch adds arch-agnostic prctl() support to
get and set status of shadow stacks and lock the current configuration to
prevent further changes, with support for turning on and off individual
subfeatures so applications can limit their exposure to features that
they do not need.  The features are:

  - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks,
    including allocation of a shadow stack if one is not already
    allocated.
  - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow
    stack.
  - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack.

These features are expected to be inherited by new threads and cleared
on exec(), unknown features should be rejected for enable but accepted
for locking (in order to allow for future proofing).

This is based on a patch originally written by Deepak Gupta but modified
fairly heavily, support for indirect landing pads is removed, additional
modes added and the locking interface reworked.  The set status prctl()
is also reworked to just set flags, if setting/reading the shadow stack
pointer is required this could be a separate prctl.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Yury Khrustalev <yury.khrustalev@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Deepak Gupta <debug@rivosinc.com>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-4-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/mm.h         |  4 ++++
 include/uapi/linux/prctl.h | 22 ++++++++++++++++++++++
 kernel/sys.c               | 30 ++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 182bad0c55df..56654306a832 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4221,4 +4221,8 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
 }
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
+int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
+int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
+int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
+
 #endif /* _LINUX_MM_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879..557a3d2ac1d4 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,26 @@ struct prctl_mm_map {
 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_MASK		0x1f
 
+/*
+ * Get the current shadow stack configuration for the current thread,
+ * this will be the value configured via PR_SET_SHADOW_STACK_STATUS.
+ */
+#define PR_GET_SHADOW_STACK_STATUS      74
+
+/*
+ * Set the current shadow stack configuration.  Enabling the shadow
+ * stack will cause a shadow stack to be allocated for the thread.
+ */
+#define PR_SET_SHADOW_STACK_STATUS      75
+# define PR_SHADOW_STACK_ENABLE         (1UL << 0)
+# define PR_SHADOW_STACK_WRITE		(1UL << 1)
+# define PR_SHADOW_STACK_PUSH		(1UL << 2)
+
+/*
+ * Prevent further changes to the specified shadow stack
+ * configuration.  All bits may be locked via this call, including
+ * undefined bits.
+ */
+#define PR_LOCK_SHADOW_STACK_STATUS      76
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda8..3d38a9c7c5c9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2324,6 +2324,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
 	return -EINVAL;
 }
 
+int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
 
 #ifdef CONFIG_ANON_VMA_NAME
@@ -2784,6 +2799,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
 		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
 		break;
+	case PR_GET_SHADOW_STACK_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2);
+		break;
+	case PR_SET_SHADOW_STACK_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_set_shadow_stack_status(me, arg2);
+		break;
+	case PR_LOCK_SHADOW_STACK_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_lock_shadow_stack_status(me, arg2);
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
cgit v1.2.3


From 7ec3b57cb29f8371bf12a725b6e8f75831a03f27 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 1 Oct 2024 23:59:06 +0100
Subject: arm64/ptrace: Expose GCS via ptrace and core files

Provide a new register type NT_ARM_GCS reporting the current GCS mode
and pointer for EL0.  Due to the interactions with allocation and
deallocation of Guarded Control Stacks we do not permit any changes to
the GCS mode via ptrace, only GCSPR_EL0 may be changed.

Reviewed-by: Thiago Jung Bauermann <thiago.bauermann@linaro.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-27-222b78d87eee@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/uapi/asm/ptrace.h |  8 +++++
 arch/arm64/kernel/ptrace.c           | 62 +++++++++++++++++++++++++++++++++++-
 include/uapi/linux/elf.h             |  1 +
 3 files changed, 70 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 7fa2f7036aa7..0f39ba4f3efd 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -324,6 +324,14 @@ struct user_za_header {
 #define ZA_PT_SIZE(vq)						\
 	(ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq))
 
+/* GCS state (NT_ARM_GCS) */
+
+struct user_gcs {
+	__u64 features_enabled;
+	__u64 features_locked;
+	__u64 gcspr_el0;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI__ASM_PTRACE_H */
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index b756578aeaee..6c1dcfe6d25a 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -34,6 +34,7 @@
 #include <asm/cpufeature.h>
 #include <asm/debug-monitors.h>
 #include <asm/fpsimd.h>
+#include <asm/gcs.h>
 #include <asm/mte.h>
 #include <asm/pointer_auth.h>
 #include <asm/stacktrace.h>
@@ -1473,6 +1474,67 @@ static int poe_set(struct task_struct *target, const struct
 }
 #endif
 
+#ifdef CONFIG_ARM64_GCS
+static int gcs_get(struct task_struct *target,
+		   const struct user_regset *regset,
+		   struct membuf to)
+{
+	struct user_gcs user_gcs;
+
+	if (!system_supports_gcs())
+		return -EINVAL;
+
+	if (target == current)
+		gcs_preserve_current_state();
+
+	user_gcs.features_enabled = target->thread.gcs_el0_mode;
+	user_gcs.features_locked = target->thread.gcs_el0_locked;
+	user_gcs.gcspr_el0 = target->thread.gcspr_el0;
+
+	return membuf_write(&to, &user_gcs, sizeof(user_gcs));
+}
+
+/*
+ * Regset write handler for NT_ARM_GCS: copy a struct user_gcs in from the
+ * caller and store it in @target's thread state.  Returns 0 on success,
+ * -EINVAL if system_supports_gcs() is false or features_enabled has bits
+ * outside PR_SHADOW_STACK_SUPPORTED_STATUS_MASK, or the copy-in error.
+ */
+static int gcs_set(struct task_struct *target, const struct
+		   user_regset *regset, unsigned int pos,
+		   unsigned int count, const void *kbuf, const
+		   void __user *ubuf)
+{
+	int ret;
+	struct user_gcs user_gcs;
+
+	if (!system_supports_gcs())
+		return -EINVAL;
+
+	/*
+	 * Seed the temporary with the task's current state so that a
+	 * zero-length or partial write leaves the unwritten fields
+	 * unchanged instead of consuming uninitialized stack contents.
+	 */
+	user_gcs.features_enabled = target->thread.gcs_el0_mode;
+	user_gcs.features_locked = target->thread.gcs_el0_locked;
+	user_gcs.gcspr_el0 = target->thread.gcspr_el0;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1);
+	if (ret)
+		return ret;
+
+	if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
+		return -EINVAL;
+
+	target->thread.gcs_el0_mode = user_gcs.features_enabled;
+	target->thread.gcs_el0_locked = user_gcs.features_locked;
+	target->thread.gcspr_el0 = user_gcs.gcspr_el0;
+
+	return 0;
+}
+#endif
+
 enum aarch64_regset {
 	REGSET_GPR,
 	REGSET_FPR,
@@ -1503,7 +1550,10 @@ enum aarch64_regset {
 	REGSET_TAGGED_ADDR_CTRL,
 #endif
 #ifdef CONFIG_ARM64_POE
-	REGSET_POE
+	REGSET_POE,
+#endif
+#ifdef CONFIG_ARM64_GCS
+	REGSET_GCS,
 #endif
 };
 
@@ -1674,6 +1724,16 @@ static const struct user_regset aarch64_regsets[] = {
 		.set = poe_set,
 	},
 #endif
+#ifdef CONFIG_ARM64_GCS
+	[REGSET_GCS] = {
+		.core_note_type = NT_ARM_GCS,
+		.n = sizeof(struct user_gcs) / sizeof(u64),
+		.size = sizeof(u64),
+		.align = sizeof(u64),
+		.regset_get = gcs_get,
+		.set = gcs_set,
+	},
+#endif
 };
 
 static const struct user_regset_view user_aarch64_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b9935988da5c..9adc218fb6df 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -443,6 +443,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_ZT	0x40d		/* ARM SME ZT registers */
 #define NT_ARM_FPMR	0x40e		/* ARM floating point mode register */
 #define NT_ARM_POE	0x40f		/* ARM POE registers */
+#define NT_ARM_GCS	0x410		/* ARM GCS state */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
 #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 #define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
-- 
cgit v1.2.3


From f858cc9eed5b05cbe38d7ffd2787c21e3718eb7d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 3 Oct 2024 12:12:18 +0000
Subject: net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute

Some network devices have the ability to offload EDT (Earliest
Departure Time) which is the model used for TCP pacing and FQ
packet scheduler.

Some of them implement the timing wheel mechanism described in
https://saeed.github.io/files/carousel-sigcomm17.pdf
with an associated 'timing wheel horizon'.

This patch adds dev->max_pacing_offload_horizon expressing
this timing wheel horizon in nsec units.

This is a read-only attribute.

Unless a driver sets it, dev->max_pacing_offload_horizon
is zero.

v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 )
v3: added yaml doc (also per Jakub feedback)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/rt_link.yaml               | 4 ++++
 Documentation/networking/net_cachelines/net_device.rst | 1 +
 include/linux/netdevice.h                              | 4 ++++
 include/uapi/linux/if_link.h                           | 1 +
 net/core/rtnetlink.c                                   | 4 ++++
 tools/include/uapi/linux/if_link.h                     | 1 +
 6 files changed, 15 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml
index 0c4d5d40cae9..d7131a1afadf 100644
--- a/Documentation/netlink/specs/rt_link.yaml
+++ b/Documentation/netlink/specs/rt_link.yaml
@@ -1137,6 +1137,10 @@ attribute-sets:
         name: dpll-pin
         type: nest
         nested-attributes: link-dpll-pin-attrs
+      -
+        name: max-pacing-offload-horizon
+        type: uint
+        doc: EDT offload horizon supported by the device (in nsec).
   -
     name: af-spec-attrs
     attributes:
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 22b07c814f4a..49f03cb78c6e 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -183,3 +183,4 @@ struct_devlink_port*                devlink_port
 struct_dpll_pin*                    dpll_pin                                                        
 struct hlist_head                   page_pools
 struct dim_irq_moder*               irq_moder
+u64                                 max_pacing_offload_horizon
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4d20c776a4ff..49a7e7db0883 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2009,6 +2009,8 @@ enum netdev_reg_state {
  *	@dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
  *		   where the clock is recovered.
  *
+ *	@max_pacing_offload_horizon: max EDT offload horizon in nsec.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2399,6 +2401,8 @@ struct net_device {
 	/** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
 	struct dim_irq_moder	*irq_moder;
 
+	u64			max_pacing_offload_horizon;
+
 	u8			priv[] ____cacheline_aligned
 				       __counted_by(priv_len);
 } ____cacheline_aligned;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6dc258993b17..506ba9c80e83 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -377,6 +377,7 @@ enum {
 	IFLA_GSO_IPV4_MAX_SIZE,
 	IFLA_GRO_IPV4_MAX_SIZE,
 	IFLA_DPLL_PIN,
+	IFLA_MAX_PACING_OFFLOAD_HORIZON,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f0a520987085..682d8d3127db 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1118,6 +1118,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
 	       + rtnl_devlink_port_size(dev)
 	       + rtnl_dpll_pin_size(dev)
+	       + nla_total_size(8)  /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
 	       + 0;
 }
 
@@ -1867,6 +1868,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 			READ_ONCE(dev->tso_max_size)) ||
 	    nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
 			READ_ONCE(dev->tso_max_segs)) ||
+	    nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON,
+			 READ_ONCE(dev->max_pacing_offload_horizon)) ||
 #ifdef CONFIG_RPS
 	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
 			READ_ONCE(dev->num_rx_queues)) ||
@@ -1975,6 +1978,7 @@ nla_put_failure:
 }
 
 static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+	[IFLA_UNSPEC]		= { .strict_start_type = IFLA_DPLL_PIN },
 	[IFLA_IFNAME]		= { .type = NLA_STRING, .len = IFNAMSIZ-1 },
 	[IFLA_ADDRESS]		= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
 	[IFLA_BROADCAST]	= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index f0d71b2a3f1e..96ec2b01e725 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -377,6 +377,7 @@ enum {
 	IFLA_GSO_IPV4_MAX_SIZE,
 	IFLA_GRO_IPV4_MAX_SIZE,
 	IFLA_DPLL_PIN,
+	IFLA_MAX_PACING_OFFLOAD_HORIZON,
 	__IFLA_MAX
 };
 
-- 
cgit v1.2.3


From f26080d47007df2ee90e65b7d390207ff3a588af Mon Sep 17 00:00:00 2001
From: Jeffrey Ji <jeffreyji@google.com>
Date: Thu, 3 Oct 2024 12:12:19 +0000
Subject: net_sched: sch_fq: add the ability to offload pacing

Some network devices have the ability to offload EDT (Earliest
Departure Time) which is the model used for TCP pacing and FQ packet
scheduler.

Some of them implement the timing wheel mechanism described in
https://saeed.github.io/files/carousel-sigcomm17.pdf
with an associated 'timing wheel horizon'.

This patch adds the TCA_FQ_OFFLOAD_HORIZON attribute to the FQ packet
scheduler.

Its value is capped by the device max_pacing_offload_horizon,
added in the prior patch.

It allows FQ to let packets within pacing offload horizon
to be delivered to the device, which will handle the needed
delay without host involvement.

Signed-off-by: Jeffrey Ji <jeffreyji@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20241003121219.2396589-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/pkt_sched.h |  2 ++
 net/sched/sch_fq.c             | 33 +++++++++++++++++++++++++++------
 2 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index a3cd0c2dc995..25a9a47001cd 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -836,6 +836,8 @@ enum {
 
 	TCA_FQ_WEIGHTS,		/* Weights for each band */
 
+	TCA_FQ_OFFLOAD_HORIZON, /* dequeue paced packets within this horizon immediately (us units) */
+
 	__TCA_FQ_MAX
 };
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 19a49af5a9e5..aeabf45c9200 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -111,6 +111,7 @@ struct fq_perband_flows {
 struct fq_sched_data {
 /* Read mostly cache line */
 
+	u64		offload_horizon;
 	u32		quantum;
 	u32		initial_quantum;
 	u32		flow_refill_delay;
@@ -299,7 +300,7 @@ static void fq_gc(struct fq_sched_data *q,
 }
 
 /* Fast path can be used if :
- * 1) Packet tstamp is in the past.
+ * 1) Packet tstamp is in the past, or within the pacing offload horizon.
  * 2) FQ qlen == 0   OR
  *   (no flow is currently eligible for transmit,
  *    AND fast path queue has less than 8 packets)
@@ -314,7 +315,7 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
 	const struct fq_sched_data *q = qdisc_priv(sch);
 	const struct sock *sk;
 
-	if (fq_skb_cb(skb)->time_to_send > now)
+	if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon)
 		return false;
 
 	if (sch->q.qlen != 0) {
@@ -595,15 +596,18 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 	unsigned long sample;
 	struct rb_node *p;
 
-	if (q->time_next_delayed_flow > now)
+	if (q->time_next_delayed_flow > now + q->offload_horizon)
 		return;
 
 	/* Update unthrottle latency EWMA.
 	 * This is cheap and can help diagnosing timer/latency problems.
 	 */
 	sample = (unsigned long)(now - q->time_next_delayed_flow);
-	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
-	q->unthrottle_latency_ns += sample >> 3;
+	if ((long)sample > 0) {
+		q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+		q->unthrottle_latency_ns += sample >> 3;
+	}
+	now += q->offload_horizon;
 
 	q->time_next_delayed_flow = ~0ULL;
 	while ((p = rb_first(&q->delayed)) != NULL) {
@@ -687,7 +691,7 @@ begin:
 		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
 					     f->time_next_packet);
 
-		if (now < time_next_packet) {
+		if (now + q->offload_horizon < time_next_packet) {
 			head->first = f->next;
 			f->time_next_packet = time_next_packet;
 			fq_flow_set_throttled(q, f);
@@ -925,6 +929,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
 	[TCA_FQ_PRIOMAP]		= NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)),
 	[TCA_FQ_WEIGHTS]		= NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)),
+	[TCA_FQ_OFFLOAD_HORIZON]	= { .type = NLA_U32 },
 };
 
 /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
@@ -1100,6 +1105,17 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 		WRITE_ONCE(q->horizon_drop,
 			   nla_get_u8(tb[TCA_FQ_HORIZON_DROP]));
 
+	if (tb[TCA_FQ_OFFLOAD_HORIZON]) {
+		u64 offload_horizon = (u64)NSEC_PER_USEC *
+				      nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]);
+
+		if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) {
+			WRITE_ONCE(q->offload_horizon, offload_horizon);
+		} else {
+			NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon");
+			err = -EINVAL;
+		}
+	}
 	if (!err) {
 
 		sch_tree_unlock(sch);
@@ -1183,6 +1199,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 		.bands = FQ_BANDS,
 	};
 	struct nlattr *opts;
+	u64 offload_horizon;
 	u64 ce_threshold;
 	s32 weights[3];
 	u64 horizon;
@@ -1199,6 +1216,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	horizon = READ_ONCE(q->horizon);
 	do_div(horizon, NSEC_PER_USEC);
 
+	offload_horizon = READ_ONCE(q->offload_horizon);
+	do_div(offload_horizon, NSEC_PER_USEC);
+
 	if (nla_put_u32(skb, TCA_FQ_PLIMIT,
 			READ_ONCE(sch->limit)) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT,
@@ -1224,6 +1244,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK,
 			READ_ONCE(q->timer_slack)) ||
 	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
+	    nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) ||
 	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP,
 		       READ_ONCE(q->horizon_drop)))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 65c4c93caaf1a9fca2855942e338530967162d25 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Tue, 10 Sep 2024 16:30:12 +0200
Subject: crypto: sig - Introduce sig_alg backend

Commit 6cb8815f41a9 ("crypto: sig - Add interface for sign/verify")
began a transition of asymmetric sign/verify operations from
crypto_akcipher to a new crypto_sig frontend.

Internally, the crypto_sig frontend still uses akcipher_alg as backend,
however:

   "The link between sig and akcipher is meant to be temporary.  The
    plan is to create a new low-level API for sig and then migrate
    the signature code over to that from akcipher."
    https://lore.kernel.org/r/ZrG6w9wsb-iiLZIF@gondor.apana.org.au/

   "having a separate alg for sig is definitely where we want to
    be since there is very little that the two types actually share."
    https://lore.kernel.org/r/ZrHlpz4qnre0zWJO@gondor.apana.org.au/

Take the next step of that migration and augment the crypto_sig frontend
with a sig_alg backend to which all algorithms can be moved.

During the migration, there will briefly be signature algorithms that
are still based on crypto_akcipher, whilst others are already based on
crypto_sig.  Allow for that by building a fork into crypto_sig_*() API
calls (i.e. crypto_sig_maxsize() and friends) such that one of the two
backends is selected based on the transform's cra_type.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/crypto/api-sig.rst      |  14 ++++
 Documentation/crypto/api.rst          |   1 +
 Documentation/crypto/architecture.rst |   2 +
 crypto/sig.c                          | 143 +++++++++++++++++++++++++++++++++-
 crypto/testmgr.c                      | 115 +++++++++++++++++++++++++++
 crypto/testmgr.h                      |  13 ++++
 include/crypto/internal/sig.h         |  80 +++++++++++++++++++
 include/crypto/sig.h                  |  61 +++++++++++++++
 include/uapi/linux/cryptouser.h       |   5 ++
 9 files changed, 432 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/crypto/api-sig.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/crypto/api-sig.rst b/Documentation/crypto/api-sig.rst
new file mode 100644
index 000000000000..a96dba66296b
--- /dev/null
+++ b/Documentation/crypto/api-sig.rst
@@ -0,0 +1,14 @@
+Asymmetric Signature Algorithm Definitions
+------------------------------------------
+
+.. kernel-doc:: include/crypto/sig.h
+   :functions: sig_alg
+
+Asymmetric Signature API
+------------------------
+
+.. kernel-doc:: include/crypto/sig.h
+   :doc: Generic Public Key Signature API
+
+.. kernel-doc:: include/crypto/sig.h
+   :functions: crypto_alloc_sig crypto_free_sig crypto_sig_set_pubkey crypto_sig_set_privkey crypto_sig_maxsize crypto_sig_sign crypto_sig_verify
diff --git a/Documentation/crypto/api.rst b/Documentation/crypto/api.rst
index ff31c30561d4..8b2a90521886 100644
--- a/Documentation/crypto/api.rst
+++ b/Documentation/crypto/api.rst
@@ -10,4 +10,5 @@ Programming Interface
    api-digest
    api-rng
    api-akcipher
+   api-sig
    api-kpp
diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst
index 646c3380a7ed..15dcd62fd22f 100644
--- a/Documentation/crypto/architecture.rst
+++ b/Documentation/crypto/architecture.rst
@@ -214,6 +214,8 @@ the aforementioned cipher types:
 
 -  CRYPTO_ALG_TYPE_AKCIPHER Asymmetric cipher
 
+-  CRYPTO_ALG_TYPE_SIG Asymmetric signature
+
 -  CRYPTO_ALG_TYPE_PCOMPRESS Enhanced version of
    CRYPTO_ALG_TYPE_COMPRESS allowing for segmented compression /
    decompression instead of performing the operation on one segment
diff --git a/crypto/sig.c b/crypto/sig.c
index 7645bedf3a1f..4f36ceb7a90b 100644
--- a/crypto/sig.c
+++ b/crypto/sig.c
@@ -21,14 +21,38 @@
 
 static const struct crypto_type crypto_sig_type;
 
+static void crypto_sig_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_sig *sig = __crypto_sig_tfm(tfm);
+	struct sig_alg *alg = crypto_sig_alg(sig);
+
+	alg->exit(sig);
+}
+
 static int crypto_sig_init_tfm(struct crypto_tfm *tfm)
 {
 	if (tfm->__crt_alg->cra_type != &crypto_sig_type)
 		return crypto_init_akcipher_ops_sig(tfm);
 
+	struct crypto_sig *sig = __crypto_sig_tfm(tfm);
+	struct sig_alg *alg = crypto_sig_alg(sig);
+
+	if (alg->exit)
+		sig->base.exit = crypto_sig_exit_tfm;
+
+	if (alg->init)
+		return alg->init(sig);
+
 	return 0;
 }
 
+static void crypto_sig_free_instance(struct crypto_instance *inst)
+{
+	struct sig_instance *sig = sig_instance(inst);
+
+	sig->free(sig);
+}
+
 static void __maybe_unused crypto_sig_show(struct seq_file *m,
 					   struct crypto_alg *alg)
 {
@@ -38,16 +62,17 @@ static void __maybe_unused crypto_sig_show(struct seq_file *m,
 static int __maybe_unused crypto_sig_report(struct sk_buff *skb,
 					    struct crypto_alg *alg)
 {
-	struct crypto_report_akcipher rsig = {};
+	struct crypto_report_sig rsig = {};
 
 	strscpy(rsig.type, "sig", sizeof(rsig.type));
 
-	return nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, sizeof(rsig), &rsig);
+	return nla_put(skb, CRYPTOCFGA_REPORT_SIG, sizeof(rsig), &rsig);
 }
 
 static const struct crypto_type crypto_sig_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_sig_init_tfm,
+	.free = crypto_sig_free_instance,
 #ifdef CONFIG_PROC_FS
 	.show = crypto_sig_show,
 #endif
@@ -68,6 +93,14 @@ EXPORT_SYMBOL_GPL(crypto_alloc_sig);
 
 int crypto_sig_maxsize(struct crypto_sig *tfm)
 {
+	if (crypto_sig_tfm(tfm)->__crt_alg->cra_type != &crypto_sig_type)
+		goto akcipher;
+
+	struct sig_alg *alg = crypto_sig_alg(tfm);
+
+	return alg->max_size(tfm);
+
+akcipher:
 	struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
 
 	return crypto_akcipher_maxsize(*ctx);
@@ -78,6 +111,14 @@ int crypto_sig_sign(struct crypto_sig *tfm,
 		    const void *src, unsigned int slen,
 		    void *dst, unsigned int dlen)
 {
+	if (crypto_sig_tfm(tfm)->__crt_alg->cra_type != &crypto_sig_type)
+		goto akcipher;
+
+	struct sig_alg *alg = crypto_sig_alg(tfm);
+
+	return alg->sign(tfm, src, slen, dst, dlen);
+
+akcipher:
 	struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
 	struct crypto_akcipher_sync_data data = {
 		.tfm = *ctx,
@@ -97,6 +138,14 @@ int crypto_sig_verify(struct crypto_sig *tfm,
 		      const void *src, unsigned int slen,
 		      const void *digest, unsigned int dlen)
 {
+	if (crypto_sig_tfm(tfm)->__crt_alg->cra_type != &crypto_sig_type)
+		goto akcipher;
+
+	struct sig_alg *alg = crypto_sig_alg(tfm);
+
+	return alg->verify(tfm, src, slen, digest, dlen);
+
+akcipher:
 	struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
 	struct crypto_akcipher_sync_data data = {
 		.tfm = *ctx,
@@ -120,6 +169,14 @@ EXPORT_SYMBOL_GPL(crypto_sig_verify);
 int crypto_sig_set_pubkey(struct crypto_sig *tfm,
 			  const void *key, unsigned int keylen)
 {
+	if (crypto_sig_tfm(tfm)->__crt_alg->cra_type != &crypto_sig_type)
+		goto akcipher;
+
+	struct sig_alg *alg = crypto_sig_alg(tfm);
+
+	return alg->set_pub_key(tfm, key, keylen);
+
+akcipher:
 	struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
 
 	return crypto_akcipher_set_pub_key(*ctx, key, keylen);
@@ -129,11 +186,93 @@ EXPORT_SYMBOL_GPL(crypto_sig_set_pubkey);
 int crypto_sig_set_privkey(struct crypto_sig *tfm,
 			  const void *key, unsigned int keylen)
 {
+	if (crypto_sig_tfm(tfm)->__crt_alg->cra_type != &crypto_sig_type)
+		goto akcipher;
+
+	struct sig_alg *alg = crypto_sig_alg(tfm);
+
+	return alg->set_priv_key(tfm, key, keylen);
+
+akcipher:
 	struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
 
 	return crypto_akcipher_set_priv_key(*ctx, key, keylen);
 }
 EXPORT_SYMBOL_GPL(crypto_sig_set_privkey);
 
+static void sig_prepare_alg(struct sig_alg *alg)
+{
+	struct crypto_alg *base = &alg->base;
+
+	base->cra_type = &crypto_sig_type;
+	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
+	base->cra_flags |= CRYPTO_ALG_TYPE_SIG;
+}
+
+static int sig_default_sign(struct crypto_sig *tfm,
+			    const void *src, unsigned int slen,
+			    void *dst, unsigned int dlen)
+{
+	return -ENOSYS;
+}
+
+static int sig_default_verify(struct crypto_sig *tfm,
+			      const void *src, unsigned int slen,
+			      const void *dst, unsigned int dlen)
+{
+	return -ENOSYS;
+}
+
+static int sig_default_set_key(struct crypto_sig *tfm,
+			       const void *key, unsigned int keylen)
+{
+	return -ENOSYS;
+}
+
+int crypto_register_sig(struct sig_alg *alg)
+{
+	struct crypto_alg *base = &alg->base;
+
+	if (!alg->sign)
+		alg->sign = sig_default_sign;
+	if (!alg->verify)
+		alg->verify = sig_default_verify;
+	if (!alg->set_priv_key)
+		alg->set_priv_key = sig_default_set_key;
+	if (!alg->set_pub_key)
+		return -EINVAL;
+	if (!alg->max_size)
+		return -EINVAL;
+
+	sig_prepare_alg(alg);
+	return crypto_register_alg(base);
+}
+EXPORT_SYMBOL_GPL(crypto_register_sig);
+
+void crypto_unregister_sig(struct sig_alg *alg)
+{
+	crypto_unregister_alg(&alg->base);
+}
+EXPORT_SYMBOL_GPL(crypto_unregister_sig);
+
+int sig_register_instance(struct crypto_template *tmpl,
+			  struct sig_instance *inst)
+{
+	if (WARN_ON(!inst->free))
+		return -EINVAL;
+	sig_prepare_alg(&inst->alg);
+	return crypto_register_instance(tmpl, sig_crypto_instance(inst));
+}
+EXPORT_SYMBOL_GPL(sig_register_instance);
+
+int crypto_grab_sig(struct crypto_sig_spawn *spawn,
+		    struct crypto_instance *inst,
+		    const char *name, u32 type, u32 mask)
+{
+	spawn->base.frontend = &crypto_sig_type;
+	return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_grab_sig);
+
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Public Key Signature Algorithms");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index ee8da628e9da..50c8d3e46e2b 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -33,6 +33,7 @@
 #include <crypto/akcipher.h>
 #include <crypto/kpp.h>
 #include <crypto/acompress.h>
+#include <crypto/sig.h>
 #include <crypto/internal/cipher.h>
 #include <crypto/internal/simd.h>
 
@@ -131,6 +132,11 @@ struct akcipher_test_suite {
 	unsigned int count;
 };
 
+struct sig_test_suite {
+	const struct sig_testvec *vecs;
+	unsigned int count;
+};
+
 struct kpp_test_suite {
 	const struct kpp_testvec *vecs;
 	unsigned int count;
@@ -151,6 +157,7 @@ struct alg_test_desc {
 		struct cprng_test_suite cprng;
 		struct drbg_test_suite drbg;
 		struct akcipher_test_suite akcipher;
+		struct sig_test_suite sig;
 		struct kpp_test_suite kpp;
 	} suite;
 };
@@ -4338,6 +4345,114 @@ static int alg_test_akcipher(const struct alg_test_desc *desc,
 	return err;
 }
 
+static int test_sig_one(struct crypto_sig *tfm, const struct sig_testvec *vecs)
+{
+	u8 *ptr, *key __free(kfree);
+	int err, sig_size;
+
+	key = kmalloc(vecs->key_len + 2 * sizeof(u32) + vecs->param_len,
+		      GFP_KERNEL);
+	if (!key)
+		return -ENOMEM;
+
+	/* ecrdsa expects additional parameters appended to the key */
+	memcpy(key, vecs->key, vecs->key_len);
+	ptr = key + vecs->key_len;
+	ptr = test_pack_u32(ptr, vecs->algo);
+	ptr = test_pack_u32(ptr, vecs->param_len);
+	memcpy(ptr, vecs->params, vecs->param_len);
+
+	if (vecs->public_key_vec)
+		err = crypto_sig_set_pubkey(tfm, key, vecs->key_len);
+	else
+		err = crypto_sig_set_privkey(tfm, key, vecs->key_len);
+	if (err)
+		return err;
+
+	/*
+	 * Run asymmetric signature verification first
+	 * (which does not require a private key)
+	 */
+	err = crypto_sig_verify(tfm, vecs->c, vecs->c_size,
+				vecs->m, vecs->m_size);
+	if (err) {
+		pr_err("alg: sig: verify test failed: err %d\n", err);
+		return err;
+	}
+
+	/*
+	 * Don't invoke sign test (which requires a private key)
+	 * for vectors with only a public key.
+	 */
+	if (vecs->public_key_vec)
+		return 0;
+
+	sig_size = crypto_sig_maxsize(tfm);
+	if (sig_size < vecs->c_size) {
+		pr_err("alg: sig: invalid maxsize %u\n", sig_size);
+		return -EINVAL;
+	}
+
+	u8 *sig __free(kfree) = kzalloc(sig_size, GFP_KERNEL);
+	if (!sig)
+		return -ENOMEM;
+
+	/* Run asymmetric signature generation */
+	err = crypto_sig_sign(tfm, vecs->m, vecs->m_size, sig, sig_size);
+	if (err) {
+		pr_err("alg: sig: sign test failed: err %d\n", err);
+		return err;
+	}
+
+	/* Verify that generated signature equals cooked signature */
+	if (memcmp(sig, vecs->c, vecs->c_size) ||
+	    memchr_inv(sig + vecs->c_size, 0, sig_size - vecs->c_size)) {
+		pr_err("alg: sig: sign test failed: invalid output\n");
+		hexdump(sig, sig_size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int test_sig(struct crypto_sig *tfm, const char *alg,
+		    const struct sig_testvec *vecs, unsigned int tcount)
+{
+	const char *algo = crypto_tfm_alg_driver_name(crypto_sig_tfm(tfm));
+	int ret, i;
+
+	for (i = 0; i < tcount; i++) {
+		ret = test_sig_one(tfm, vecs++);
+		if (ret) {
+			pr_err("alg: sig: test %d failed for %s: err %d\n",
+			       i + 1, algo, ret);
+			return ret;
+		}
+	}
+	return 0;
+}
+
+__maybe_unused
+static int alg_test_sig(const struct alg_test_desc *desc, const char *driver,
+			u32 type, u32 mask)
+{
+	struct crypto_sig *tfm;
+	int err = 0;
+
+	tfm = crypto_alloc_sig(driver, type, mask);
+	if (IS_ERR(tfm)) {
+		pr_err("alg: sig: Failed to load tfm for %s: %ld\n",
+		       driver, PTR_ERR(tfm));
+		return PTR_ERR(tfm);
+	}
+	if (desc->suite.sig.vecs)
+		err = test_sig(tfm, desc->alg, desc->suite.sig.vecs,
+			       desc->suite.sig.count);
+
+	crypto_free_sig(tfm);
+	return err;
+}
+
 static int alg_test_null(const struct alg_test_desc *desc,
 			     const char *driver, u32 type, u32 mask)
 {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index ed1640f3e352..39dd1d558883 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -162,6 +162,19 @@ struct akcipher_testvec {
 	enum OID algo;
 };
 
+struct sig_testvec {
+	const unsigned char *key;
+	const unsigned char *params;
+	const unsigned char *m;
+	const unsigned char *c;
+	unsigned int key_len;
+	unsigned int param_len;
+	unsigned int m_size;
+	unsigned int c_size;
+	bool public_key_vec;
+	enum OID algo;
+};
+
 struct kpp_testvec {
 	const unsigned char *secret;
 	const unsigned char *b_secret;
diff --git a/include/crypto/internal/sig.h b/include/crypto/internal/sig.h
index 97cb26ef8115..b16648c1a986 100644
--- a/include/crypto/internal/sig.h
+++ b/include/crypto/internal/sig.h
@@ -10,8 +10,88 @@
 #include <crypto/algapi.h>
 #include <crypto/sig.h>
 
+struct sig_instance {
+	void (*free)(struct sig_instance *inst);
+	union {
+		struct {
+			char head[offsetof(struct sig_alg, base)];
+			struct crypto_instance base;
+		};
+		struct sig_alg alg;
+	};
+};
+
+struct crypto_sig_spawn {
+	struct crypto_spawn base;
+};
+
 static inline void *crypto_sig_ctx(struct crypto_sig *tfm)
 {
 	return crypto_tfm_ctx(&tfm->base);
 }
+
+/**
+ * crypto_register_sig() -- Register public key signature algorithm
+ *
+ * Function registers an implementation of a public key signature algorithm
+ *
+ * @alg:	algorithm definition
+ *
+ * Return: zero on success; error code in case of error
+ */
+int crypto_register_sig(struct sig_alg *alg);
+
+/**
+ * crypto_unregister_sig() -- Unregister public key signature algorithm
+ *
+ * Function unregisters an implementation of a public key signature algorithm
+ *
+ * @alg:	algorithm definition
+ */
+void crypto_unregister_sig(struct sig_alg *alg);
+
+int sig_register_instance(struct crypto_template *tmpl,
+			  struct sig_instance *inst);
+
+static inline struct sig_instance *sig_instance(struct crypto_instance *inst)
+{
+	return container_of(&inst->alg, struct sig_instance, alg.base);
+}
+
+static inline struct sig_instance *sig_alg_instance(struct crypto_sig *tfm)
+{
+	return sig_instance(crypto_tfm_alg_instance(&tfm->base));
+}
+
+static inline struct crypto_instance *sig_crypto_instance(struct sig_instance
+									*inst)
+{
+	return container_of(&inst->alg.base, struct crypto_instance, alg);
+}
+
+static inline void *sig_instance_ctx(struct sig_instance *inst)
+{
+	return crypto_instance_ctx(sig_crypto_instance(inst));
+}
+
+int crypto_grab_sig(struct crypto_sig_spawn *spawn,
+		    struct crypto_instance *inst,
+		    const char *name, u32 type, u32 mask);
+
+static inline struct crypto_sig *crypto_spawn_sig(struct crypto_sig_spawn
+								   *spawn)
+{
+	return crypto_spawn_tfm2(&spawn->base);
+}
+
+static inline void crypto_drop_sig(struct crypto_sig_spawn *spawn)
+{
+	crypto_drop_spawn(&spawn->base);
+}
+
+static inline struct sig_alg *crypto_spawn_sig_alg(struct crypto_sig_spawn
+								    *spawn)
+{
+	return container_of(spawn->base.alg, struct sig_alg, base);
+}
 #endif
diff --git a/include/crypto/sig.h b/include/crypto/sig.h
index d25186bb2be3..f0f52a7c5ae7 100644
--- a/include/crypto/sig.h
+++ b/include/crypto/sig.h
@@ -19,6 +19,52 @@ struct crypto_sig {
 	struct crypto_tfm base;
 };
 
+/**
+ * struct sig_alg - generic public key signature algorithm
+ *
+ * @sign:	Function performs a sign operation as defined by public key
+ *		algorithm. Optional.
+ * @verify:	Function performs a complete verify operation as defined by
+ *		public key algorithm, returning verification status. Optional.
+ * @set_pub_key: Function invokes the algorithm specific set public key
+ *		function, which knows how to decode and interpret
+ *		the BER encoded public key and parameters. Mandatory.
+ * @set_priv_key: Function invokes the algorithm specific set private key
+ *		function, which knows how to decode and interpret
+ *		the BER encoded private key and parameters. Optional.
+ * @max_size:	Function returns key size. Mandatory.
+ * @init:	Initialize the cryptographic transformation object.
+ *		This function is used to initialize the cryptographic
+ *		transformation object. This function is called only once at
+ *		the instantiation time, right after the transformation context
+ *		was allocated. In case the cryptographic hardware has some
+ *		special requirements which need to be handled by software, this
+ *		function shall check for the precise requirement of the
+ *		transformation and put any software fallbacks in place.
+ * @exit:	Deinitialize the cryptographic transformation object. This is a
+ *		counterpart to @init, used to remove various changes set in
+ *		@init.
+ *
+ * @base:	Common crypto API algorithm data structure
+ */
+struct sig_alg {
+	int (*sign)(struct crypto_sig *tfm,
+		    const void *src, unsigned int slen,
+		    void *dst, unsigned int dlen);
+	int (*verify)(struct crypto_sig *tfm,
+		      const void *src, unsigned int slen,
+		      const void *digest, unsigned int dlen);
+	int (*set_pub_key)(struct crypto_sig *tfm,
+			   const void *key, unsigned int keylen);
+	int (*set_priv_key)(struct crypto_sig *tfm,
+			    const void *key, unsigned int keylen);
+	unsigned int (*max_size)(struct crypto_sig *tfm);
+	int (*init)(struct crypto_sig *tfm);
+	void (*exit)(struct crypto_sig *tfm);
+
+	struct crypto_alg base;
+};
+
 /**
  * DOC: Generic Public Key Signature API
  *
@@ -47,6 +93,21 @@ static inline struct crypto_tfm *crypto_sig_tfm(struct crypto_sig *tfm)
 	return &tfm->base;
 }
 
+static inline struct crypto_sig *__crypto_sig_tfm(struct crypto_tfm *tfm)
+{
+	return container_of(tfm, struct crypto_sig, base);
+}
+
+static inline struct sig_alg *__crypto_sig_alg(struct crypto_alg *alg)
+{
+	return container_of(alg, struct sig_alg, base);
+}
+
+static inline struct sig_alg *crypto_sig_alg(struct crypto_sig *tfm)
+{
+	return __crypto_sig_alg(crypto_sig_tfm(tfm)->__crt_alg);
+}
+
 /**
  * crypto_free_sig() - free signature tfm handle
  *
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 20a6c0fc149e..db05e0419972 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -64,6 +64,7 @@ enum crypto_attr_type_t {
 	CRYPTOCFGA_STAT_AKCIPHER,	/* No longer supported, do not use. */
 	CRYPTOCFGA_STAT_KPP,		/* No longer supported, do not use. */
 	CRYPTOCFGA_STAT_ACOMP,		/* No longer supported, do not use. */
+	CRYPTOCFGA_REPORT_SIG,		/* struct crypto_report_sig */
 	__CRYPTOCFGA_MAX
 
 #define CRYPTOCFGA_MAX (__CRYPTOCFGA_MAX - 1)
@@ -207,6 +208,10 @@ struct crypto_report_acomp {
 	char type[CRYPTO_MAX_NAME];
 };
 
+struct crypto_report_sig {
+	char type[CRYPTO_MAX_NAME];
+};
+
 #define CRYPTO_REPORT_MAXSIZE (sizeof(struct crypto_user_alg) + \
 			       sizeof(struct crypto_report_blkcipher))
 
-- 
cgit v1.2.3


From 5b553e06b3215fa97d222ebddc2bc964f1824c5b Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Tue, 10 Sep 2024 16:30:19 +0200
Subject: crypto: virtio - Drop sign/verify operations

The virtio crypto driver exposes akcipher sign/verify operations in a
user space ABI.  This blocks removal of sign/verify from akcipher_alg.

Herbert opines:

   "I would say that this is something that we can break.  Breaking it
    is no different to running virtio on a host that does not support
    these algorithms.  After all, a software implementation must always
    be present.

    I deliberately left akcipher out of crypto_user because the API
    is still in flux.  We should not let virtio constrain ourselves."
    https://lore.kernel.org/all/ZtqoNAgcnXnrYhZZ@gondor.apana.org.au/

   "I would remove virtio akcipher support in its entirety.  This API
    was never meant to be exposed outside of the kernel."
    https://lore.kernel.org/all/Ztqql_gqgZiMW8zz@gondor.apana.org.au/

Drop sign/verify support from virtio crypto.  There's no strong reason
to also remove encrypt/decrypt support, so keep it.

A key selling point of virtio crypto is to allow guest access to crypto
accelerators on the host.  So far the only akcipher algorithm supported
by virtio crypto is RSA.  Dropping sign/verify merely means that the
PKCS#1 padding is now always generated or verified inside the guest,
but the actual signature generation/verification (which is an RSA
decrypt/encrypt operation) may still use an accelerator on the host.

Generating or verifying the PKCS#1 padding is cheap, so a hardware
accelerator won't be of much help there.  Which begs the question
whether virtio crypto support for sign/verify makes sense at all.

It would make sense for the sign operation if the host has a security
chip to store asymmetric private keys.  But the kernel doesn't even
have an asymmetric_key_subtype yet for hardware-based private keys.
There's at least one rudimentary driver for such chips (atmel-ecc.c for
ATECC508A), but it doesn't implement the sign operation.  The kernel
would first have to grow support for a hardware asymmetric_key_subtype
and at least one driver implementing the sign operation before exposure
to guests via virtio makes sense.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 .../crypto/virtio/virtio_crypto_akcipher_algs.c    | 65 +++++++---------------
 include/uapi/linux/virtio_crypto.h                 |  1 +
 2 files changed, 22 insertions(+), 44 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
index cb92b7fa99c6..48fee07b7e51 100644
--- a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
+++ b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
@@ -83,23 +83,16 @@ static void virtio_crypto_dataq_akcipher_callback(struct virtio_crypto_request *
 	case VIRTIO_CRYPTO_BADMSG:
 		error = -EBADMSG;
 		break;
-
-	case VIRTIO_CRYPTO_KEY_REJECTED:
-		error = -EKEYREJECTED;
-		break;
-
 	default:
 		error = -EIO;
 		break;
 	}
 
 	akcipher_req = vc_akcipher_req->akcipher_req;
-	if (vc_akcipher_req->opcode != VIRTIO_CRYPTO_AKCIPHER_VERIFY) {
-		/* actuall length maybe less than dst buffer */
-		akcipher_req->dst_len = len - sizeof(vc_req->status);
-		sg_copy_from_buffer(akcipher_req->dst, sg_nents(akcipher_req->dst),
-				    vc_akcipher_req->dst_buf, akcipher_req->dst_len);
-	}
+	/* actual length maybe less than dst buffer */
+	akcipher_req->dst_len = len - sizeof(vc_req->status);
+	sg_copy_from_buffer(akcipher_req->dst, sg_nents(akcipher_req->dst),
+			    vc_akcipher_req->dst_buf, akcipher_req->dst_len);
 	virtio_crypto_akcipher_finalize_req(vc_akcipher_req, akcipher_req, error);
 }
 
@@ -230,36 +223,27 @@ static int __virtio_crypto_akcipher_do_req(struct virtio_crypto_akcipher_request
 	int node = dev_to_node(&vcrypto->vdev->dev);
 	unsigned long flags;
 	int ret;
-	bool verify = vc_akcipher_req->opcode == VIRTIO_CRYPTO_AKCIPHER_VERIFY;
-	unsigned int src_len = verify ? req->src_len + req->dst_len : req->src_len;
 
 	/* out header */
 	sg_init_one(&outhdr_sg, req_data, sizeof(*req_data));
 	sgs[num_out++] = &outhdr_sg;
 
 	/* src data */
-	src_buf = kcalloc_node(src_len, 1, GFP_KERNEL, node);
+	src_buf = kcalloc_node(req->src_len, 1, GFP_KERNEL, node);
 	if (!src_buf)
 		return -ENOMEM;
 
-	if (verify) {
-		/* for verify operation, both src and dst data work as OUT direction */
-		sg_copy_to_buffer(req->src, sg_nents(req->src), src_buf, src_len);
-		sg_init_one(&srcdata_sg, src_buf, src_len);
-		sgs[num_out++] = &srcdata_sg;
-	} else {
-		sg_copy_to_buffer(req->src, sg_nents(req->src), src_buf, src_len);
-		sg_init_one(&srcdata_sg, src_buf, src_len);
-		sgs[num_out++] = &srcdata_sg;
+	sg_copy_to_buffer(req->src, sg_nents(req->src), src_buf, req->src_len);
+	sg_init_one(&srcdata_sg, src_buf, req->src_len);
+	sgs[num_out++] = &srcdata_sg;
 
-		/* dst data */
-		dst_buf = kcalloc_node(req->dst_len, 1, GFP_KERNEL, node);
-		if (!dst_buf)
-			goto free_src;
+	/* dst data */
+	dst_buf = kcalloc_node(req->dst_len, 1, GFP_KERNEL, node);
+	if (!dst_buf)
+		goto free_src;
 
-		sg_init_one(&dstdata_sg, dst_buf, req->dst_len);
-		sgs[num_out + num_in++] = &dstdata_sg;
-	}
+	sg_init_one(&dstdata_sg, dst_buf, req->dst_len);
+	sgs[num_out + num_in++] = &dstdata_sg;
 
 	vc_akcipher_req->src_buf = src_buf;
 	vc_akcipher_req->dst_buf = dst_buf;
@@ -352,16 +336,6 @@ static int virtio_crypto_rsa_decrypt(struct akcipher_request *req)
 	return virtio_crypto_rsa_req(req, VIRTIO_CRYPTO_AKCIPHER_DECRYPT);
 }
 
-static int virtio_crypto_rsa_sign(struct akcipher_request *req)
-{
-	return virtio_crypto_rsa_req(req, VIRTIO_CRYPTO_AKCIPHER_SIGN);
-}
-
-static int virtio_crypto_rsa_verify(struct akcipher_request *req)
-{
-	return virtio_crypto_rsa_req(req, VIRTIO_CRYPTO_AKCIPHER_VERIFY);
-}
-
 static int virtio_crypto_rsa_set_key(struct crypto_akcipher *tfm,
 				     const void *key,
 				     unsigned int keylen,
@@ -524,16 +498,19 @@ static struct virtio_crypto_akcipher_algo virtio_crypto_akcipher_algs[] = {
 		.algo.base = {
 			.encrypt = virtio_crypto_rsa_encrypt,
 			.decrypt = virtio_crypto_rsa_decrypt,
-			.sign = virtio_crypto_rsa_sign,
-			.verify = virtio_crypto_rsa_verify,
+			/*
+			 * Must specify an arbitrary hash algorithm upon
+			 * set_{pub,priv}_key (even though it's not used
+			 * by encrypt/decrypt) because qemu checks for it.
+			 */
 			.set_pub_key = virtio_crypto_p1pad_rsa_sha1_set_pub_key,
 			.set_priv_key = virtio_crypto_p1pad_rsa_sha1_set_priv_key,
 			.max_size = virtio_crypto_rsa_max_size,
 			.init = virtio_crypto_rsa_init_tfm,
 			.exit = virtio_crypto_rsa_exit_tfm,
 			.base = {
-				.cra_name = "pkcs1pad(rsa,sha1)",
-				.cra_driver_name = "virtio-pkcs1-rsa-with-sha1",
+				.cra_name = "pkcs1pad(rsa)",
+				.cra_driver_name = "virtio-pkcs1-rsa",
 				.cra_priority = 150,
 				.cra_module = THIS_MODULE,
 				.cra_ctxsize = sizeof(struct virtio_crypto_akcipher_ctx),
diff --git a/include/uapi/linux/virtio_crypto.h b/include/uapi/linux/virtio_crypto.h
index 71a54a6849ca..2fccb64c9d6b 100644
--- a/include/uapi/linux/virtio_crypto.h
+++ b/include/uapi/linux/virtio_crypto.h
@@ -329,6 +329,7 @@ struct virtio_crypto_op_header {
 	VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x00)
 #define VIRTIO_CRYPTO_AKCIPHER_DECRYPT \
 	VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x01)
+	/* akcipher sign/verify opcodes are deprecated */
 #define VIRTIO_CRYPTO_AKCIPHER_SIGN \
 	VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x02)
 #define VIRTIO_CRYPTO_AKCIPHER_VERIFY \
-- 
cgit v1.2.3


From 4436df478860bb5da1864df2cd20f281a210f139 Mon Sep 17 00:00:00 2001
From: Erick Archer <erick.archer@outlook.com>
Date: Fri, 7 Jun 2024 18:19:12 +0200
Subject: batman-adv: Add flex array to struct batadv_tvlv_tt_data

The "struct batadv_tvlv_tt_data" uses a dynamically sized set of
trailing elements. Specifically, it uses an array of structures of type
"batadv_tvlv_tt_vlan_data". So, use the preferred way in the kernel
declaring a flexible array [1].

At the same time, prepare for the coming implementation by GCC and Clang
of the __counted_by attribute. Flexible array members annotated with
__counted_by can have their accesses bounds-checked at run-time via
CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for
strcpy/memcpy-family functions). In this case, it is important to note
that the attribute used is specifically __counted_by_be since variable
"num_vlan" is of type __be16.

The following change to the "batadv_tt_tvlv_ogm_handler_v1" function:

-	tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1);
-	tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan);

+	tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data
+						     + flex_size);

is intended to prevent the compiler from generating an "out-of-bounds"
notification due to the __counted_by attribute. The compiler can do a
pointer calculation using the vlan_data flexible array memory, or in
other words, this may be calculated as an array offset, since it is the
same as:

        &tt_data->vlan_data[num_vlan]

Therefore, we go past the end of the array. In other "multiple trailing
flexible array" situations, this has been solved by addressing from the
base pointer, since the compiler either knows the full allocation size
or it knows nothing about it (this case, since it came from a "void *"
function argument).

The order in which the structure batadv_tvlv_tt_data and the structure
batadv_tvlv_tt_vlan_data are defined must be swapped to avoid an incomplete
type error.

Also, avoid the open-coded arithmetic in memory allocator functions [2]
using the "struct_size" macro and use the "flex_array_size" helper to
clarify some calculations, when possible.

Moreover, the new structure member also allows us to avoid the open-coded
arithmetic on pointers in some situations. Take advantage of this.

This code was detected with the help of Coccinelle, and audited and
modified manually.

Link: https://www.kernel.org/doc/html/next/process/deprecated.html#zero-length-and-one-element-arrays [1]
Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [2]
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Erick Archer <erick.archer@outlook.com>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batadv_packet.h | 29 ++++++++++++----------
 net/batman-adv/translation-table.c | 49 ++++++++++++++++----------------------
 2 files changed, 36 insertions(+), 42 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h
index 6e25753015df..439132a819ea 100644
--- a/include/uapi/linux/batadv_packet.h
+++ b/include/uapi/linux/batadv_packet.h
@@ -9,6 +9,7 @@
 
 #include <asm/byteorder.h>
 #include <linux/if_ether.h>
+#include <linux/stddef.h>
 #include <linux/types.h>
 
 /**
@@ -592,19 +593,6 @@ struct batadv_tvlv_gateway_data {
 	__be32 bandwidth_up;
 };
 
-/**
- * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
- * @flags: translation table flags (see batadv_tt_data_flags)
- * @ttvn: translation table version number
- * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by
- *  one batadv_tvlv_tt_vlan_data object per announced vlan
- */
-struct batadv_tvlv_tt_data {
-	__u8   flags;
-	__u8   ttvn;
-	__be16 num_vlan;
-};
-
 /**
  * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through
  *  the tt tvlv container
@@ -618,6 +606,21 @@ struct batadv_tvlv_tt_vlan_data {
 	__u16  reserved;
 };
 
+/**
+ * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
+ * @flags: translation table flags (see batadv_tt_data_flags)
+ * @ttvn: translation table version number
+ * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by
+ *  one batadv_tvlv_tt_vlan_data object per announced vlan
+ * @vlan_data: array of batadv_tvlv_tt_vlan_data objects
+ */
+struct batadv_tvlv_tt_data {
+	__u8   flags;
+	__u8   ttvn;
+	__be16 num_vlan;
+	struct batadv_tvlv_tt_vlan_data vlan_data[] __counted_by_be(num_vlan);
+};
+
 /**
  * struct batadv_tvlv_tt_change - translation table diff data
  * @flags: status indicators concerning the non-mesh client (see
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 2243cec18ecc..6815d1262feb 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -28,6 +28,7 @@
 #include <linux/net.h>
 #include <linux/netdevice.h>
 #include <linux/netlink.h>
+#include <linux/overflow.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/skbuff.h>
@@ -856,8 +857,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
 		num_entries += atomic_read(&vlan->tt.num_entries);
 	}
 
-	change_offset = sizeof(**tt_data);
-	change_offset += num_vlan * sizeof(*tt_vlan);
+	change_offset = struct_size(*tt_data, vlan_data, num_vlan);
 
 	/* if tt_len is negative, allocate the space needed by the full table */
 	if (*tt_len < 0)
@@ -876,7 +876,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
 	(*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn);
 	(*tt_data)->num_vlan = htons(num_vlan);
 
-	tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
+	tt_vlan = (*tt_data)->vlan_data;
 	hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
 		tt_vlan->vid = htons(vlan->vid);
 		tt_vlan->crc = htonl(vlan->tt.crc);
@@ -936,8 +936,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
 		total_entries += vlan_entries;
 	}
 
-	change_offset = sizeof(**tt_data);
-	change_offset += num_vlan * sizeof(*tt_vlan);
+	change_offset = struct_size(*tt_data, vlan_data, num_vlan);
 
 	/* if tt_len is negative, allocate the space needed by the full table */
 	if (*tt_len < 0)
@@ -956,7 +955,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
 	(*tt_data)->ttvn = atomic_read(&bat_priv->tt.vn);
 	(*tt_data)->num_vlan = htons(num_vlan);
 
-	tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
+	tt_vlan = (*tt_data)->vlan_data;
 	hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) {
 		vlan_entries = atomic_read(&vlan->tt.num_entries);
 		if (vlan_entries < 1)
@@ -2916,7 +2915,6 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
 {
 	struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
 	struct batadv_tt_req_node *tt_req_node = NULL;
-	struct batadv_tvlv_tt_vlan_data *tt_vlan_req;
 	struct batadv_hard_iface *primary_if;
 	bool ret = false;
 	int i, size;
@@ -2932,7 +2930,7 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
 	if (!tt_req_node)
 		goto out;
 
-	size = sizeof(*tvlv_tt_data) + sizeof(*tt_vlan_req) * num_vlan;
+	size = struct_size(tvlv_tt_data, vlan_data, num_vlan);
 	tvlv_tt_data = kzalloc(size, GFP_ATOMIC);
 	if (!tvlv_tt_data)
 		goto out;
@@ -2944,12 +2942,10 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
 	/* send all the CRCs within the request. This is needed by intermediate
 	 * nodes to ensure they have the correct table before replying
 	 */
-	tt_vlan_req = (struct batadv_tvlv_tt_vlan_data *)(tvlv_tt_data + 1);
 	for (i = 0; i < num_vlan; i++) {
-		tt_vlan_req->vid = tt_vlan->vid;
-		tt_vlan_req->crc = tt_vlan->crc;
+		tvlv_tt_data->vlan_data[i].vid = tt_vlan->vid;
+		tvlv_tt_data->vlan_data[i].crc = tt_vlan->crc;
 
-		tt_vlan_req++;
 		tt_vlan++;
 	}
 
@@ -3001,7 +2997,6 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
 	struct batadv_orig_node *res_dst_orig_node = NULL;
 	struct batadv_tvlv_tt_change *tt_change;
 	struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
-	struct batadv_tvlv_tt_vlan_data *tt_vlan;
 	bool ret = false, full_table;
 	u8 orig_ttvn, req_ttvn;
 	u16 tvlv_len;
@@ -3024,10 +3019,9 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
 	orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn);
 	req_ttvn = tt_data->ttvn;
 
-	tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1);
 	/* this node doesn't have the requested data */
 	if (orig_ttvn != req_ttvn ||
-	    !batadv_tt_global_check_crc(req_dst_orig_node, tt_vlan,
+	    !batadv_tt_global_check_crc(req_dst_orig_node, tt_data->vlan_data,
 					ntohs(tt_data->num_vlan)))
 		goto out;
 
@@ -3370,7 +3364,6 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
 	struct batadv_orig_node *orig_node = NULL;
 	struct batadv_tvlv_tt_change *tt_change;
 	u8 *tvlv_ptr = (u8 *)tt_data;
-	u16 change_offset;
 
 	batadv_dbg(BATADV_DBG_TT, bat_priv,
 		   "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n",
@@ -3383,10 +3376,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
 
 	spin_lock_bh(&orig_node->tt_lock);
 
-	change_offset = sizeof(struct batadv_tvlv_tt_vlan_data);
-	change_offset *= ntohs(tt_data->num_vlan);
-	change_offset += sizeof(*tt_data);
-	tvlv_ptr += change_offset;
+	tvlv_ptr += struct_size(tt_data, vlan_data, ntohs(tt_data->num_vlan));
 
 	tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr;
 	if (tt_data->flags & BATADV_TT_FULL_TABLE) {
@@ -3985,10 +3975,10 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
 					  u8 flags, void *tvlv_value,
 					  u16 tvlv_value_len)
 {
-	struct batadv_tvlv_tt_vlan_data *tt_vlan;
 	struct batadv_tvlv_tt_change *tt_change;
 	struct batadv_tvlv_tt_data *tt_data;
 	u16 num_entries, num_vlan;
+	size_t flex_size;
 
 	if (tvlv_value_len < sizeof(*tt_data))
 		return;
@@ -3998,17 +3988,18 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
 
 	num_vlan = ntohs(tt_data->num_vlan);
 
-	if (tvlv_value_len < sizeof(*tt_vlan) * num_vlan)
+	flex_size = flex_array_size(tt_data, vlan_data, num_vlan);
+	if (tvlv_value_len < flex_size)
 		return;
 
-	tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1);
-	tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan);
-	tvlv_value_len -= sizeof(*tt_vlan) * num_vlan;
+	tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data
+						     + flex_size);
+	tvlv_value_len -= flex_size;
 
 	num_entries = batadv_tt_entries(tvlv_value_len);
 
-	batadv_tt_update_orig(bat_priv, orig, tt_vlan, num_vlan, tt_change,
-			      num_entries, tt_data->ttvn);
+	batadv_tt_update_orig(bat_priv, orig, tt_data->vlan_data, num_vlan,
+			      tt_change, num_entries, tt_data->ttvn);
 }
 
 /**
@@ -4039,8 +4030,8 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
 	tt_data = tvlv_value;
 	tvlv_value_len -= sizeof(*tt_data);
 
-	tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data);
-	tt_vlan_len *= ntohs(tt_data->num_vlan);
+	tt_vlan_len = flex_array_size(tt_data, vlan_data,
+				      ntohs(tt_data->num_vlan));
 
 	if (tvlv_value_len < tt_vlan_len)
 		return NET_RX_SUCCESS;
-- 
cgit v1.2.3


From 83134ef4609388f6b9ca31a384f531155196c2a7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Oct 2024 12:13:31 +0200
Subject: netkit: Add option for scrubbing skb meta data

Jordan reported that when running Cilium with netkit in per-endpoint-routes
mode, network policy misclassifies traffic. In this direct routing mode
of Cilium which is used in case of GKE/EKS/AKS, the Pod's BPF program to
enforce policy sits on the netkit primary device's egress side.

The issue here is that in case of netkit's netkit_prep_forward(), it will
clear meta data such as skb->mark and skb->priority before executing the
BPF program. Thus, identity data stored in there from earlier BPF programs
(e.g. from tcx ingress on the physical device) gets cleared instead of
being made available for the primary's program to process. While for traffic
egressing the Pod via the peer device this might be desired, this is
different for the primary one where compared to tcx egress on the host
veth this information would be available.

To address this, add a new parameter for the device orchestration to
allow control of skb->mark and skb->priority scrubbing, to make the two
accessible from BPF (and eventually leave it up to the program to scrub).
By default, the current behavior is retained. For netkit peer this also
enables the use case where applications could cooperate/signal intent to
the BPF program.

Note that struct netkit has a 4 byte hole between policy and bundle which
is used here, in other words, struct netkit's first cacheline content used
in fast-path does not get moved around.

Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device")
Reported-by: Jordan Rife <jrife@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://github.com/cilium/cilium/issues/34042
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/20241004101335.117711-1-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 drivers/net/netkit.c         | 68 +++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/if_link.h | 15 ++++++++++
 2 files changed, 70 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index 059269557d92..fba2c734f0ec 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -20,6 +20,7 @@ struct netkit {
 	struct net_device __rcu *peer;
 	struct bpf_mprog_entry __rcu *active;
 	enum netkit_action policy;
+	enum netkit_scrub scrub;
 	struct bpf_mprog_bundle	bundle;
 
 	/* Needed in slow-path */
@@ -50,12 +51,24 @@ netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
 	return ret;
 }
 
-static void netkit_prep_forward(struct sk_buff *skb, bool xnet)
+static void netkit_xnet(struct sk_buff *skb)
 {
-	skb_scrub_packet(skb, xnet);
 	skb->priority = 0;
+	skb->mark = 0;
+}
+
+static void netkit_prep_forward(struct sk_buff *skb,
+				bool xnet, bool xnet_scrub)
+{
+	skb_scrub_packet(skb, false);
 	nf_skip_egress(skb, true);
 	skb_reset_mac_header(skb);
+	if (!xnet)
+		return;
+	ipvs_reset(skb);
+	skb_clear_tstamp(skb);
+	if (xnet_scrub)
+		netkit_xnet(skb);
 }
 
 static struct netkit *netkit_priv(const struct net_device *dev)
@@ -80,7 +93,8 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
 		     !pskb_may_pull(skb, ETH_HLEN) ||
 		     skb_orphan_frags(skb, GFP_ATOMIC)))
 		goto drop;
-	netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)));
+	netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)),
+			    nk->scrub);
 	eth_skb_pkt_type(skb, peer);
 	skb->dev = peer;
 	entry = rcu_dereference(nk->active);
@@ -332,8 +346,10 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
 			   struct netlink_ext_ack *extack)
 {
 	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr;
-	enum netkit_action default_prim = NETKIT_PASS;
-	enum netkit_action default_peer = NETKIT_PASS;
+	enum netkit_action policy_prim = NETKIT_PASS;
+	enum netkit_action policy_peer = NETKIT_PASS;
+	enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
+	enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
 	enum netkit_mode mode = NETKIT_L3;
 	unsigned char ifname_assign_type;
 	struct ifinfomsg *ifmp = NULL;
@@ -362,17 +378,21 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
 				return err;
 			tbp = peer_tb;
 		}
+		if (data[IFLA_NETKIT_SCRUB])
+			scrub_prim = nla_get_u32(data[IFLA_NETKIT_SCRUB]);
+		if (data[IFLA_NETKIT_PEER_SCRUB])
+			scrub_peer = nla_get_u32(data[IFLA_NETKIT_PEER_SCRUB]);
 		if (data[IFLA_NETKIT_POLICY]) {
 			attr = data[IFLA_NETKIT_POLICY];
-			default_prim = nla_get_u32(attr);
-			err = netkit_check_policy(default_prim, attr, extack);
+			policy_prim = nla_get_u32(attr);
+			err = netkit_check_policy(policy_prim, attr, extack);
 			if (err < 0)
 				return err;
 		}
 		if (data[IFLA_NETKIT_PEER_POLICY]) {
 			attr = data[IFLA_NETKIT_PEER_POLICY];
-			default_peer = nla_get_u32(attr);
-			err = netkit_check_policy(default_peer, attr, extack);
+			policy_peer = nla_get_u32(attr);
+			err = netkit_check_policy(policy_peer, attr, extack);
 			if (err < 0)
 				return err;
 		}
@@ -409,7 +429,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
 
 	nk = netkit_priv(peer);
 	nk->primary = false;
-	nk->policy = default_peer;
+	nk->policy = policy_peer;
+	nk->scrub = scrub_peer;
 	nk->mode = mode;
 	bpf_mprog_bundle_init(&nk->bundle);
 
@@ -434,7 +455,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
 
 	nk = netkit_priv(dev);
 	nk->primary = true;
-	nk->policy = default_prim;
+	nk->policy = policy_prim;
+	nk->scrub = scrub_prim;
 	nk->mode = mode;
 	bpf_mprog_bundle_init(&nk->bundle);
 
@@ -874,6 +896,18 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 		return -EACCES;
 	}
 
+	if (data[IFLA_NETKIT_SCRUB]) {
+		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_SCRUB],
+				    "netkit scrubbing cannot be changed after device creation");
+		return -EACCES;
+	}
+
+	if (data[IFLA_NETKIT_PEER_SCRUB]) {
+		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_SCRUB],
+				    "netkit scrubbing cannot be changed after device creation");
+		return -EACCES;
+	}
+
 	if (data[IFLA_NETKIT_PEER_INFO]) {
 		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO],
 				    "netkit peer info cannot be changed after device creation");
@@ -908,8 +942,10 @@ static size_t netkit_get_size(const struct net_device *dev)
 {
 	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */
-	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
+	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_SCRUB */
+	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */
 	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
+	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
 	       0;
 }
 
@@ -924,11 +960,15 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		return -EMSGSIZE;
 	if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode))
 		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub))
+		return -EMSGSIZE;
 
 	if (peer) {
 		nk = netkit_priv(peer);
 		if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy))
 			return -EMSGSIZE;
+		if (nla_put_u32(skb, IFLA_NETKIT_PEER_SCRUB, nk->scrub))
+			return -EMSGSIZE;
 	}
 
 	return 0;
@@ -936,9 +976,11 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
 	[IFLA_NETKIT_PEER_INFO]		= { .len = sizeof(struct ifinfomsg) },
-	[IFLA_NETKIT_POLICY]		= { .type = NLA_U32 },
 	[IFLA_NETKIT_MODE]		= { .type = NLA_U32 },
+	[IFLA_NETKIT_POLICY]		= { .type = NLA_U32 },
 	[IFLA_NETKIT_PEER_POLICY]	= { .type = NLA_U32 },
+	[IFLA_NETKIT_SCRUB]		= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
+	[IFLA_NETKIT_PEER_SCRUB]	= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
 	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
 					    .reject_message = "Primary attribute is read-only" },
 };
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6dc258993b17..2acc7687e017 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1292,6 +1292,19 @@ enum netkit_mode {
 	NETKIT_L3,
 };
 
+/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
+ * the BPF program if attached. This also means the latter can
+ * consume the two fields if they were populated earlier.
+ *
+ * NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before
+ * invoking the attached BPF program when the peer device resides
+ * in a different network namespace. This is the default behavior.
+ */
+enum netkit_scrub {
+	NETKIT_SCRUB_NONE,
+	NETKIT_SCRUB_DEFAULT,
+};
+
 enum {
 	IFLA_NETKIT_UNSPEC,
 	IFLA_NETKIT_PEER_INFO,
@@ -1299,6 +1312,8 @@ enum {
 	IFLA_NETKIT_POLICY,
 	IFLA_NETKIT_PEER_POLICY,
 	IFLA_NETKIT_MODE,
+	IFLA_NETKIT_SCRUB,
+	IFLA_NETKIT_PEER_SCRUB,
 	__IFLA_NETKIT_MAX,
 };
 #define IFLA_NETKIT_MAX	(__IFLA_NETKIT_MAX - 1)
-- 
cgit v1.2.3


From a8f2cdd27d114ed6c3354a0e39502e6d56215804 Mon Sep 17 00:00:00 2001
From: Dmitry Perchanov <dmitry.perchanov@intel.com>
Date: Mon, 26 Aug 2024 16:04:23 +0300
Subject: media: v4l: Add luma 16-bit interlaced pixel format

The formats added by this patch are:

        V4L2_PIX_FMT_Y16I

Interlaced luma format, primarily used in RealSense Depth cameras with
a stereo stream for the left and right image sensors.

Signed-off-by: Dmitry Perchanov <dmitry.perchanov@intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://lore.kernel.org/r/568efbd75290e286b8ad9e7347b5f43745121020.camel@intel.com
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 .../userspace-api/media/v4l/pixfmt-y16i.rst        | 73 ++++++++++++++++++++++
 .../userspace-api/media/v4l/yuv-formats.rst        |  1 +
 drivers/media/v4l2-core/v4l2-ioctl.c               |  1 +
 include/uapi/linux/videodev2.h                     |  1 +
 4 files changed, 76 insertions(+)
 create mode 100644 Documentation/userspace-api/media/v4l/pixfmt-y16i.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-y16i.rst b/Documentation/userspace-api/media/v4l/pixfmt-y16i.rst
new file mode 100644
index 000000000000..74ba9e910a38
--- /dev/null
+++ b/Documentation/userspace-api/media/v4l/pixfmt-y16i.rst
@@ -0,0 +1,73 @@
+.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
+
+.. _V4L2-PIX-FMT-Y16I:
+
+**************************
+V4L2_PIX_FMT_Y16I ('Y16I')
+**************************
+
+Interleaved grey-scale image, e.g. from a stereo-pair
+
+
+Description
+===========
+
+This is a grey-scale image with a depth of 16 bits per pixel, but with pixels
+from 2 sources interleaved and unpacked. Each pixel is stored in a 16-bit word
+in the little-endian order. The first pixel is from the left source.
+
+**Pixel unpacked representation.**
+Left/Right pixels 16-bit unpacked - 16-bit for each interleaved pixel.
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - Y'\ :sub:`0L[7:0]`
+      - Y'\ :sub:`0L[15:8]`
+      - Y'\ :sub:`0R[7:0]`
+      - Y'\ :sub:`0R[15:8]`
+
+**Byte Order.**
+Each cell is one byte.
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+
+    * - start + 0:
+      - Y'\ :sub:`00Llow`
+      - Y'\ :sub:`00Lhigh`
+      - Y'\ :sub:`00Rlow`
+      - Y'\ :sub:`00Rhigh`
+      - Y'\ :sub:`01Llow`
+      - Y'\ :sub:`01Lhigh`
+      - Y'\ :sub:`01Rlow`
+      - Y'\ :sub:`01Rhigh`
+    * - start + 8:
+      - Y'\ :sub:`10Llow`
+      - Y'\ :sub:`10Lhigh`
+      - Y'\ :sub:`10Rlow`
+      - Y'\ :sub:`10Rhigh`
+      - Y'\ :sub:`11Llow`
+      - Y'\ :sub:`11Lhigh`
+      - Y'\ :sub:`11Rlow`
+      - Y'\ :sub:`11Rhigh`
+    * - start + 16:
+      - Y'\ :sub:`20Llow`
+      - Y'\ :sub:`20Lhigh`
+      - Y'\ :sub:`20Rlow`
+      - Y'\ :sub:`20Rhigh`
+      - Y'\ :sub:`21Llow`
+      - Y'\ :sub:`21Lhigh`
+      - Y'\ :sub:`21Rlow`
+      - Y'\ :sub:`21Rhigh`
+    * - start + 24:
+      - Y'\ :sub:`30Llow`
+      - Y'\ :sub:`30Lhigh`
+      - Y'\ :sub:`30Rlow`
+      - Y'\ :sub:`30Rhigh`
+      - Y'\ :sub:`31Llow`
+      - Y'\ :sub:`31Lhigh`
+      - Y'\ :sub:`31Rlow`
+      - Y'\ :sub:`31Rhigh`
diff --git a/Documentation/userspace-api/media/v4l/yuv-formats.rst b/Documentation/userspace-api/media/v4l/yuv-formats.rst
index 24b34cdfa6fe..78ee406d7647 100644
--- a/Documentation/userspace-api/media/v4l/yuv-formats.rst
+++ b/Documentation/userspace-api/media/v4l/yuv-formats.rst
@@ -269,5 +269,6 @@ image.
     pixfmt-yuv-luma
     pixfmt-y8i
     pixfmt-y12i
+    pixfmt-y16i
     pixfmt-uv8
     pixfmt-m420
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index e14db67be97c..b9a3c6b20282 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1327,6 +1327,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_Y14P:		descr = "14-bit Greyscale (MIPI Packed)"; break;
 	case V4L2_PIX_FMT_Y8I:		descr = "Interleaved 8-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y12I:		descr = "Interleaved 12-bit Greyscale"; break;
+	case V4L2_PIX_FMT_Y16I:		descr = "Interleaved 16-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Z16:		descr = "16-bit Depth"; break;
 	case V4L2_PIX_FMT_INZI:		descr = "Planar 10:16 Greyscale Depth"; break;
 	case V4L2_PIX_FMT_CNF4:		descr = "4-bit Depth Confidence (Packed)"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 27239cb64065..21a8aa575ea3 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -798,6 +798,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_S5C_UYVY_JPG v4l2_fourcc('S', '5', 'C', 'I') /* S5C73M3 interleaved UYVY/JPEG */
 #define V4L2_PIX_FMT_Y8I      v4l2_fourcc('Y', '8', 'I', ' ') /* Greyscale 8-bit L/R interleaved */
 #define V4L2_PIX_FMT_Y12I     v4l2_fourcc('Y', '1', '2', 'I') /* Greyscale 12-bit L/R interleaved */
+#define V4L2_PIX_FMT_Y16I     v4l2_fourcc('Y', '1', '6', 'I') /* Greyscale 16-bit L/R interleaved */
 #define V4L2_PIX_FMT_Z16      v4l2_fourcc('Z', '1', '6', ' ') /* Depth data 16-bit */
 #define V4L2_PIX_FMT_MT21C    v4l2_fourcc('M', 'T', '2', '1') /* Mediatek compressed block mode  */
 #define V4L2_PIX_FMT_MM21     v4l2_fourcc('M', 'M', '2', '1') /* Mediatek 8-bit block mode, two non-contiguous planes */
-- 
cgit v1.2.3


From 20503272422693d793b84f88bf23fe4e955d3a33 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sun, 6 Oct 2024 08:17:58 +0100
Subject: ptp: Add support for the AMZNC10C 'vmclock' device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vmclock device addresses the problem of live migration with
precision clocks. The tolerances of a hardware counter (e.g. TSC) are
typically around ±50PPM. A guest will use NTP/PTP/PPS to discipline that
counter against an external source of 'real' time, and track the precise
frequency of the counter as it changes with environmental conditions.

When a guest is live migrated, anything it knows about the frequency of
the underlying counter becomes invalid. It may move from a host where
the counter is running at -50PPM of its nominal frequency, to a host where
it runs at +50PPM. There will also be a step change in the value of the
counter, as the correctness of its absolute value at migration is
limited by the accuracy of the source and destination host's time
synchronization.

In its simplest form, the device merely advertises a 'disruption_marker'
which indicates that the guest should throw away any NTP synchronization
it thinks it has, and start again.

Because the shared memory region can be exposed all the way to userspace
through the /dev/vmclock0 node, applications can still use time from a
fast vDSO 'system call', and check the disruption marker to be sure that
their timestamp is indeed truthful.

The structure also allows for the precise time, as known by the host, to
be exposed directly to guests so that they don't have to wait for NTP to
resync from scratch. The PTP driver consumes this information if present.
Like the KVM PTP clock, this PTP driver can convert TSC-based cross
timestamps into KVM clock values. Unlike the KVM PTP clock, it does so
only when such is actually helpful.

The values and fields are based on the nascent virtio-rtc specification,
and the intent is that a version (hopefully precisely this version) of
this structure will be included as an optional part of that spec. In the
meantime, this driver supports the simple ACPI form of the device which
is being shipped in certain commercial hypervisors (and submitted for
inclusion in QEMU).

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                      |   7 +
 drivers/ptp/Kconfig              |  13 +
 drivers/ptp/Makefile             |   1 +
 drivers/ptp/ptp_vmclock.c        | 615 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vmclock-abi.h | 182 ++++++++++++
 5 files changed, 818 insertions(+)
 create mode 100644 drivers/ptp/ptp_vmclock.c
 create mode 100644 include/uapi/linux/vmclock-abi.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index af635dc60cfe..1389704c7d8d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18683,6 +18683,13 @@ S:	Maintained
 F:	drivers/ptp/ptp_vclock.c
 F:	net/ethtool/phc_vclocks.c
 
+PTP VMCLOCK SUPPORT
+M:	David Woodhouse <dwmw2@infradead.org>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/ptp/ptp_vmclock.c
+F:	include/uapi/linux/vmclock-abi.h
+
 PTRACE SUPPORT
 M:	Oleg Nesterov <oleg@redhat.com>
 S:	Maintained
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 604541dcb320..e98c9767e0ef 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -131,6 +131,19 @@ config PTP_1588_CLOCK_KVM
 	  To compile this driver as a module, choose M here: the module
 	  will be called ptp_kvm.
 
+config PTP_1588_CLOCK_VMCLOCK
+	tristate "Virtual machine PTP clock"
+	depends on X86_TSC || ARM_ARCH_TIMER
+	depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128
+	default y
+	help
+	  This driver adds support for using a virtual precision clock
+	  advertised by the hypervisor. This clock is only useful in virtual
+	  machines where such a device is present.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called ptp_vmclock.
+
 config PTP_1588_CLOCK_IDT82P33
 	tristate "IDT 82P33xxx PTP clock"
 	depends on PTP_1588_CLOCK && I2C
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 68bf02078053..01b5cd91eb61 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_DTE)	+= ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_INES)	+= ptp_ines.o
 obj-$(CONFIG_PTP_1588_CLOCK_PCH)	+= ptp_pch.o
 obj-$(CONFIG_PTP_1588_CLOCK_KVM)	+= ptp_kvm.o
+obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK)	+= ptp_vmclock.o
 obj-$(CONFIG_PTP_1588_CLOCK_QORIQ)	+= ptp-qoriq.o
 ptp-qoriq-y				+= ptp_qoriq.o
 ptp-qoriq-$(CONFIG_DEBUG_FS)		+= ptp_qoriq_debugfs.o
diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
new file mode 100644
index 000000000000..cdca8a3ad1aa
--- /dev/null
+++ b/drivers/ptp/ptp_vmclock.c
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with LM-safe VMclock device.
+ *
+ * Copyright © 2024 Amazon.com, Inc. or its affiliates.
+ */
+
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include <uapi/linux/vmclock-abi.h>
+
+#include <linux/ptp_clock_kernel.h>
+
+#ifdef CONFIG_X86
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#endif
+
+#ifdef CONFIG_KVM_GUEST
+#define SUPPORT_KVMCLOCK
+#endif
+
+static DEFINE_IDA(vmclock_ida);
+
+ACPI_MODULE_NAME("vmclock");
+
+struct vmclock_state {
+	struct resource res;
+	struct vmclock_abi *clk;
+	struct miscdevice miscdev;
+	struct ptp_clock_info ptp_clock_info;
+	struct ptp_clock *ptp_clock;
+	enum clocksource_ids cs_id, sys_cs_id;
+	int index;
+	char *name;
+};
+
+#define VMCLOCK_MAX_WAIT ms_to_ktime(100)
+
+/* Require at least the flags field to be present. All else can be optional. */
+#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)
+
+#define VMCLOCK_FIELD_PRESENT(_c, _f)			  \
+	(le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) +	\
+				     sizeof((_c)->_f)))
+
+/*
+ * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64
+ * and add the fractional second part of the reference time.
+ *
+ * The result is a 128-bit value, the top 64 bits of which are seconds, and
+ * the low 64 bits are (seconds >> 64).
+ */
+static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
+					uint64_t period, uint8_t shift,
+					uint64_t frac_sec)
+{
+	unsigned __int128 res = (unsigned __int128)delta * period;
+
+	res >>= shift;
+	res += frac_sec;
+	*res_hi = res >> 64;
+	return (uint64_t)res;
+}
+
+static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
+{
+	if (likely(clk->time_type == VMCLOCK_TIME_UTC))
+		return true;
+
+	if (clk->time_type == VMCLOCK_TIME_TAI &&
+	    (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
+		if (sec)
+			*sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
+		return true;
+	}
+	return false;
+}
+
+static int vmclock_get_crosststamp(struct vmclock_state *st,
+				   struct ptp_system_timestamp *sts,
+				   struct system_counterval_t *system_counter,
+				   struct timespec64 *tspec)
+{
+	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+	struct system_time_snapshot systime_snapshot;
+	uint64_t cycle, delta, seq, frac_sec;
+
+#ifdef CONFIG_X86
+	/*
+	 * We'd expect the hypervisor to know this and to report the clock
+	 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
+	 */
+	if (check_tsc_unstable())
+		return -EINVAL;
+#endif
+
+	while (1) {
+		seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;
+
+		/*
+		 * This pairs with a write barrier in the hypervisor
+		 * which populates this structure.
+		 */
+		virt_rmb();
+
+		if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
+			return -EINVAL;
+
+		/*
+		 * When invoked for gettimex64(), fill in the pre/post system
+		 * times. The simple case is when system time is based on the
+		 * same counter as st->cs_id, in which case all three times
+		 * will be derived from the *same* counter value.
+		 *
+		 * If the system isn't using the same counter, then the value
+		 * from ktime_get_snapshot() will still be used as pre_ts, and
+		 * ptp_read_system_postts() is called to populate postts after
+		 * calling get_cycles().
+		 *
+		 * The conversion to timespec64 happens further down, outside
+		 * the seq_count loop.
+		 */
+		if (sts) {
+			ktime_get_snapshot(&systime_snapshot);
+			if (systime_snapshot.cs_id == st->cs_id) {
+				cycle = systime_snapshot.cycles;
+			} else {
+				cycle = get_cycles();
+				ptp_read_system_postts(sts);
+			}
+		} else {
+			cycle = get_cycles();
+		}
+
+		delta = cycle - le64_to_cpu(st->clk->counter_value);
+
+		frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
+						   le64_to_cpu(st->clk->counter_period_frac_sec),
+						   st->clk->counter_period_shift,
+						   le64_to_cpu(st->clk->time_frac_sec));
+		tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
+		tspec->tv_sec += le64_to_cpu(st->clk->time_sec);
+
+		if (!tai_adjust(st->clk, &tspec->tv_sec))
+			return -EINVAL;
+
+		/*
+		 * This pairs with a write barrier in the hypervisor
+		 * which populates this structure.
+		 */
+		virt_rmb();
+		if (seq == le32_to_cpu(st->clk->seq_count))
+			break;
+
+		if (ktime_after(ktime_get(), deadline))
+			return -ETIMEDOUT;
+	}
+
+	if (system_counter) {
+		system_counter->cycles = cycle;
+		system_counter->cs_id = st->cs_id;
+	}
+
+	if (sts) {
+		sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
+		if (systime_snapshot.cs_id == st->cs_id)
+			sts->post_ts = sts->pre_ts;
+	}
+
+	return 0;
+}
+
+#ifdef SUPPORT_KVMCLOCK
+/*
+ * In the case where the system is using the KVM clock for timekeeping, convert
+ * the TSC value into a KVM clock time in order to return a paired reading that
+ * get_device_system_crosststamp() can cope with.
+ */
+static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
+					    struct ptp_system_timestamp *sts,
+					    struct system_counterval_t *system_counter,
+					    struct timespec64 *tspec)
+{
+	struct pvclock_vcpu_time_info *pvti = this_cpu_pvti();
+	unsigned int pvti_ver;
+	int ret;
+
+	preempt_disable_notrace();
+
+	do {
+		pvti_ver = pvclock_read_begin(pvti);
+
+		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
+		if (ret)
+			break;
+
+		system_counter->cycles = __pvclock_read_cycles(pvti,
+							       system_counter->cycles);
+		system_counter->cs_id = CSID_X86_KVM_CLK;
+
+		/*
+		 * This retry should never really happen; if the TSC is
+		 * stable and reliable enough across vCPUS that it is sane
+		 * for the hypervisor to expose a VMCLOCK device which uses
+		 * it as the reference counter, then the KVM clock should be
+		 * in 'master clock mode' and basically never changed. But
+		 * the KVM clock is a fickle and often broken thing, so do
+		 * it "properly" just in case.
+		 */
+	} while (pvclock_read_retry(pvti, pvti_ver));
+
+	preempt_enable_notrace();
+
+	return ret;
+}
+#endif
+
+static int ptp_vmclock_get_time_fn(ktime_t *device_time,
+				   struct system_counterval_t *system_counter,
+				   void *ctx)
+{
+	struct vmclock_state *st = ctx;
+	struct timespec64 tspec;
+	int ret;
+
+#ifdef SUPPORT_KVMCLOCK
+	if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
+		ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
+						       &tspec);
+	else
+#endif
+		ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);
+
+	if (!ret)
+		*device_time = timespec64_to_ktime(tspec);
+
+	return ret;
+}
+
+static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
+				      struct system_device_crosststamp *xtstamp)
+{
+	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
+						ptp_clock_info);
+	int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
+						NULL, xtstamp);
+#ifdef SUPPORT_KVMCLOCK
+	/*
+	 * On x86, the KVM clock may be used for the system time. We can
+	 * actually convert a TSC reading to that, and return a paired
+	 * timestamp that get_device_system_crosststamp() *can* handle.
+	 */
+	if (ret == -ENODEV) {
+		struct system_time_snapshot systime_snapshot;
+
+		ktime_get_snapshot(&systime_snapshot);
+
+		if (systime_snapshot.cs_id == CSID_X86_TSC ||
+		    systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
+			WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
+			ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
+							    st, NULL, xtstamp);
+		}
+	}
+#endif
+	return ret;
+}
+
+/*
+ * PTP clock operations
+ */
+
+static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
+			   const struct timespec64 *ts)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
+				struct ptp_system_timestamp *sts)
+{
+	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
+						ptp_clock_info);
+
+	return vmclock_get_crosststamp(st, sts, NULL, ts);
+}
+
+static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct ptp_clock_info ptp_vmclock_info = {
+	.owner		= THIS_MODULE,
+	.max_adj	= 0,
+	.n_ext_ts	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfine	= ptp_vmclock_adjfine,
+	.adjtime	= ptp_vmclock_adjtime,
+	.gettimex64	= ptp_vmclock_gettimex,
+	.settime64	= ptp_vmclock_settime,
+	.enable		= ptp_vmclock_enable,
+	.getcrosststamp = ptp_vmclock_getcrosststamp,
+};
+
+static struct ptp_clock *vmclock_ptp_register(struct device *dev,
+					      struct vmclock_state *st)
+{
+	enum clocksource_ids cs_id;
+
+	if (IS_ENABLED(CONFIG_ARM64) &&
+	    st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
+		/* Can we check it's the virtual counter? */
+		cs_id = CSID_ARM_ARCH_COUNTER;
+	} else if (IS_ENABLED(CONFIG_X86) &&
+		   st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
+		cs_id = CSID_X86_TSC;
+	} else {
+		return NULL;
+	}
+
+	/* Only UTC, or TAI with offset */
+	if (!tai_adjust(st->clk, NULL)) {
+		dev_info(dev, "vmclock does not provide unambiguous UTC\n");
+		return NULL;
+	}
+
+	st->sys_cs_id = cs_id;
+	st->cs_id = cs_id;
+	st->ptp_clock_info = ptp_vmclock_info;
+	strscpy(st->ptp_clock_info.name, st->name);
+
+	return ptp_clock_register(&st->ptp_clock_info, dev);
+}
+
+static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct vmclock_state *st = container_of(fp->private_data,
+						struct vmclock_state, miscdev);
+
+	if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
+		return -EROFS;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
+		return -EINVAL;
+
+	if (io_remap_pfn_range(vma, vma->vm_start,
+			       st->res.start >> PAGE_SHIFT, PAGE_SIZE,
+			       vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct vmclock_state *st = container_of(fp->private_data,
+						struct vmclock_state, miscdev);
+	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+	size_t max_count;
+	uint32_t seq;
+
+	if (*ppos >= PAGE_SIZE)
+		return 0;
+
+	max_count = PAGE_SIZE - *ppos;
+	if (count > max_count)
+		count = max_count;
+
+	while (1) {
+		seq = le32_to_cpu(st->clk->seq_count) & ~1U;
+		/* Pairs with hypervisor wmb */
+		virt_rmb();
+
+		if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
+			return -EFAULT;
+
+		/* Pairs with hypervisor wmb */
+		virt_rmb();
+		if (seq == le32_to_cpu(st->clk->seq_count))
+			break;
+
+		if (ktime_after(ktime_get(), deadline))
+			return -ETIMEDOUT;
+	}
+
+	*ppos += count;
+	return count;
+}
+
+static const struct file_operations vmclock_miscdev_fops = {
+	.mmap = vmclock_miscdev_mmap,
+	.read = vmclock_miscdev_read,
+};
+
+/* module operations */
+
+static void vmclock_remove(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct vmclock_state *st = dev_get_drvdata(dev);
+
+	if (st->ptp_clock)
+		ptp_clock_unregister(st->ptp_clock);
+
+	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
+		misc_deregister(&st->miscdev);
+}
+
+static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
+{
+	struct vmclock_state *st = data;
+	struct resource_win win;
+	struct resource *res = &win.res;
+
+	if (ares->type == ACPI_RESOURCE_TYPE_END_TAG)
+		return AE_OK;
+
+	/* There can be only one */
+	if (resource_type(&st->res) == IORESOURCE_MEM)
+		return AE_ERROR;
+
+	if (acpi_dev_resource_memory(ares, res) ||
+	    acpi_dev_resource_address_space(ares, &win)) {
+
+		if (resource_type(res) != IORESOURCE_MEM ||
+		    resource_size(res) < sizeof(st->clk))
+			return AE_ERROR;
+
+		st->res = *res;
+		return AE_OK;
+	}
+
+	return AE_ERROR;
+}
+
+static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
+{
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	acpi_status status;
+
+	/*
+	 * This should never happen as this function is only called when
+	 * has_acpi_companion(dev) is true, but the logic is sufficiently
+	 * complex that Coverity can't see the tautology.
+	 */
+	if (!adev)
+		return -ENODEV;
+
+	status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS,
+				     vmclock_acpi_resources, st);
+	if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) {
+		dev_err(dev, "failed to get resources\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static void vmclock_put_idx(void *data)
+{
+	struct vmclock_state *st = data;
+
+	ida_free(&vmclock_ida, st->index);
+}
+
+static int vmclock_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct vmclock_state *st;
+	int ret;
+
+	st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
+	if (!st)
+		return -ENOMEM;
+
+	if (has_acpi_companion(dev))
+		ret = vmclock_probe_acpi(dev, st);
+	else
+		ret = -EINVAL; /* Only ACPI for now */
+
+	if (ret) {
+		dev_info(dev, "Failed to obtain physical address: %d\n", ret);
+		goto out;
+	}
+
+	if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) {
+		dev_info(dev, "Region too small (0x%llx)\n",
+			 resource_size(&st->res));
+		ret = -EINVAL;
+		goto out;
+	}
+	st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res),
+				MEMREMAP_WB | MEMREMAP_DEC);
+	if (IS_ERR(st->clk)) {
+		ret = PTR_ERR(st->clk);
+		dev_info(dev, "failed to map shared memory\n");
+		st->clk = NULL;
+		goto out;
+	}
+
+	if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC ||
+	    le32_to_cpu(st->clk->size) > resource_size(&st->res) ||
+	    le16_to_cpu(st->clk->version) != 1) {
+		dev_info(dev, "vmclock magic fields invalid\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
+	if (ret < 0)
+		goto out;
+
+	st->index = ret;
+	ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st);
+	if (ret)
+		goto out;
+
+	st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index);
+	if (!st->name) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * If the structure is big enough, it can be mapped to userspace.
+	 * Theoretically a guest OS even using larger pages could still
+	 * use 4KiB PTEs to map smaller MMIO regions like this, but let's
+	 * cross that bridge if/when we come to it.
+	 */
+	if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) {
+		st->miscdev.minor = MISC_DYNAMIC_MINOR;
+		st->miscdev.fops = &vmclock_miscdev_fops;
+		st->miscdev.name = st->name;
+
+		ret = misc_register(&st->miscdev);
+		if (ret)
+			goto out;
+	}
+
+	/* If there is valid clock information, register a PTP clock */
+	if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) {
+		/* Can return a silent NULL, or an error. */
+		st->ptp_clock = vmclock_ptp_register(dev, st);
+		if (IS_ERR(st->ptp_clock)) {
+			ret = PTR_ERR(st->ptp_clock);
+			st->ptp_clock = NULL;
+			vmclock_remove(pdev);
+			goto out;
+		}
+	}
+
+	if (!st->miscdev.minor && !st->ptp_clock) {
+		/* Neither miscdev nor PTP registered */
+		dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n");
+		ret = -ENODEV;
+		goto out;
+	}
+
+	dev_info(dev, "%s: registered %s%s%s\n", st->name,
+		 st->miscdev.minor ? "miscdev" : "",
+		 (st->miscdev.minor && st->ptp_clock) ? ", " : "",
+		 st->ptp_clock ? "PTP" : "");
+
+	dev_set_drvdata(dev, st);
+
+ out:
+	return ret;
+}
+
+static const struct acpi_device_id vmclock_acpi_ids[] = {
+	{ "AMZNC10C", 0 },
+	{}
+};
+MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
+
+static struct platform_driver vmclock_platform_driver = {
+	.probe		= vmclock_probe,
+	.remove_new	= vmclock_remove,
+	.driver	= {
+		.name	= "vmclock",
+		.acpi_match_table = vmclock_acpi_ids,
+	},
+};
+
+module_platform_driver(vmclock_platform_driver)
+
+MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
+MODULE_DESCRIPTION("PTP clock using VMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h
new file mode 100644
index 000000000000..2d99b29ac44a
--- /dev/null
+++ b/include/uapi/linux/vmclock-abi.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
+
+/*
+ * This structure provides a vDSO-style clock to VM guests, exposing the
+ * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch
+ * counter, etc.) and real time. It is designed to address the problem of
+ * live migration, which other clock enlightenments do not.
+ *
+ * When a guest is live migrated, this affects the clock in two ways.
+ *
+ * First, even between identical hosts the actual frequency of the underlying
+ * counter will change within the tolerances of its specification (typically
+ * ±50PPM, or 4 seconds a day). This frequency also varies over time on the
+ * same host, but can be tracked by NTP as it generally varies slowly. With
+ * live migration there is a step change in the frequency, with no warning.
+ *
+ * Second, there may be a step change in the value of the counter itself, as
+ * its accuracy is limited by the precision of the NTP synchronization on the
+ * source and destination hosts.
+ *
+ * So any calibration (NTP, PTP, etc.) which the guest has done on the source
+ * host before migration is invalid, and needs to be redone on the new host.
+ *
+ * In its most basic mode, this structure provides only an indication to the
+ * guest that live migration has occurred. This allows the guest to know that
+ * its clock is invalid and take remedial action. For applications that need
+ * reliable accurate timestamps (e.g. distributed databases), the structure
+ * can be mapped all the way to userspace. This allows the application to see
+ * directly for itself that the clock is disrupted and take appropriate
+ * action, even when using a vDSO-style method to get the time instead of a
+ * system call.
+ *
+ * In its more advanced mode, this structure can also be used to expose the
+ * precise relationship of the CPU counter to real time, as calibrated by the
+ * host. This means that userspace applications can have accurate time
+ * immediately after live migration, rather than having to pause operations
+ * and wait for NTP to recover. This mode does, of course, rely on the
+ * counter being reliable and consistent across CPUs.
+ *
+ * Note that this must be true UTC, never with smeared leap seconds. If a
+ * guest wishes to construct a smeared clock, it can do so. Presenting a
+ * smeared clock through this interface would be problematic because it
+ * actually messes with the apparent counter *period*. A linear smearing
+ * of 1 ms per second would effectively tweak the counter period by 1000PPM
+ * at the start/end of the smearing period, while a sinusoidal smear would
+ * basically be impossible to represent.
+ *
+ * This structure is offered with the intent that it be adopted into the
+ * nascent virtio-rtc standard, as a virtio-rtc that does not address the live
+ * migration problem seems a little less than fit for purpose. For that
+ * reason, certain fields use precisely the same numeric definitions as in
+ * the virtio-rtc proposal. The structure can also be exposed through an ACPI
+ * device with the CID "VMCLOCK", modelled on the "VMGENID" device except for
+ * the fact that it uses a real _CRS to convey the address of the structure
+ * (which should be a full page, to allow for mapping directly to userspace).
+ */
+
+#ifndef __VMCLOCK_ABI_H__
+#define __VMCLOCK_ABI_H__
+
+#include <linux/types.h>
+
+struct vmclock_abi {
+	/* CONSTANT FIELDS */
+	__le32 magic;
+#define VMCLOCK_MAGIC	0x4b4c4356 /* "VCLK" */
+	__le32 size;		/* Size of region containing this structure */
+	__le16 version;	/* 1 */
+	__u8 counter_id; /* Matches VIRTIO_RTC_COUNTER_xxx except INVALID */
+#define VMCLOCK_COUNTER_ARM_VCNT	0
+#define VMCLOCK_COUNTER_X86_TSC		1
+#define VMCLOCK_COUNTER_INVALID		0xff
+	__u8 time_type; /* Matches VIRTIO_RTC_TYPE_xxx */
+#define VMCLOCK_TIME_UTC			0	/* Since 1970-01-01 00:00:00z */
+#define VMCLOCK_TIME_TAI			1	/* Since 1970-01-01 00:00:00z */
+#define VMCLOCK_TIME_MONOTONIC			2	/* Since undefined epoch */
+#define VMCLOCK_TIME_INVALID_SMEARED		3	/* Not supported */
+#define VMCLOCK_TIME_INVALID_MAYBE_SMEARED	4	/* Not supported */
+
+	/* NON-CONSTANT FIELDS PROTECTED BY SEQCOUNT LOCK */
+	__le32 seq_count;	/* Low bit means an update is in progress */
+	/*
+	 * This field changes to another non-repeating value when the CPU
+	 * counter is disrupted, for example on live migration. This lets
+	 * the guest know that it should discard any calibration it has
+	 * performed of the counter against external sources (NTP/PTP/etc.).
+	 */
+	__le64 disruption_marker;
+	__le64 flags;
+	/* Indicates that the tai_offset_sec field is valid */
+#define VMCLOCK_FLAG_TAI_OFFSET_VALID		(1 << 0)
+	/*
+	 * Optionally used to notify guests of pending maintenance events.
+	 * A guest which provides latency-sensitive services may wish to
+	 * remove itself from service if an event is coming up. Two flags
+	 * indicate the approximate imminence of the event.
+	 */
+#define VMCLOCK_FLAG_DISRUPTION_SOON		(1 << 1) /* About a day */
+#define VMCLOCK_FLAG_DISRUPTION_IMMINENT	(1 << 2) /* About an hour */
+#define VMCLOCK_FLAG_PERIOD_ESTERROR_VALID	(1 << 3)
+#define VMCLOCK_FLAG_PERIOD_MAXERROR_VALID	(1 << 4)
+#define VMCLOCK_FLAG_TIME_ESTERROR_VALID	(1 << 5)
+#define VMCLOCK_FLAG_TIME_MAXERROR_VALID	(1 << 6)
+	/*
+	 * If the MONOTONIC flag is set then (other than leap seconds) it is
+	 * guaranteed that the time calculated according to this structure at
+	 * any given moment shall never appear to be later than the time
+	 * calculated via the structure at any *later* moment.
+	 *
+	 * In particular, a timestamp based on a counter reading taken
+	 * immediately after setting the low bit of seq_count (and the
+	 * associated memory barrier), using the previously-valid time and
+	 * period fields, shall never be later than a timestamp based on
+	 * a counter reading taken immediately before *clearing* the low
+	 * bit again after the update, using the about-to-be-valid fields.
+	 */
+#define VMCLOCK_FLAG_TIME_MONOTONIC		(1 << 7)
+
+	__u8 pad[2];
+	__u8 clock_status;
+#define VMCLOCK_STATUS_UNKNOWN		0
+#define VMCLOCK_STATUS_INITIALIZING	1
+#define VMCLOCK_STATUS_SYNCHRONIZED	2
+#define VMCLOCK_STATUS_FREERUNNING	3
+#define VMCLOCK_STATUS_UNRELIABLE	4
+
+	/*
+	 * The time exposed through this device is never smeared. This field
+	 * corresponds to the 'subtype' field in virtio-rtc, which indicates
+	 * the smearing method. However in this case it provides a *hint* to
+	 * the guest operating system, such that *if* the guest OS wants to
+	 * provide its users with an alternative clock which does not follow
+	 * UTC, it may do so in a fashion consistent with the other systems
+	 * in the nearby environment.
+	 */
+	__u8 leap_second_smearing_hint; /* Matches VIRTIO_RTC_SUBTYPE_xxx */
+#define VMCLOCK_SMEARING_STRICT		0
+#define VMCLOCK_SMEARING_NOON_LINEAR	1
+#define VMCLOCK_SMEARING_UTC_SLS	2
+	__le16 tai_offset_sec; /* Actually two's complement signed */
+	__u8 leap_indicator;
+	/*
+	 * This field is based on the VIRTIO_RTC_LEAP_xxx values as defined
+	 * in the current draft of virtio-rtc, but since smearing cannot be
+	 * used with the shared memory device, some values are not used.
+	 *
+	 * The _POST_POS and _POST_NEG values allow the guest to perform
+	 * its own smearing during the day or so after a leap second when
+	 * such smearing may need to continue being applied for a leap
+	 * second which is now theoretically "historical".
+	 */
+#define VMCLOCK_LEAP_NONE	0x00	/* No known nearby leap second */
+#define VMCLOCK_LEAP_PRE_POS	0x01	/* Positive leap second at EOM */
+#define VMCLOCK_LEAP_PRE_NEG	0x02	/* Negative leap second at EOM */
+#define VMCLOCK_LEAP_POS	0x03	/* Set during 23:59:60 second */
+#define VMCLOCK_LEAP_POST_POS	0x04
+#define VMCLOCK_LEAP_POST_NEG	0x05
+
+	/* Bit shift for counter_period_frac_sec and its error rate */
+	__u8 counter_period_shift;
+	/*
+	 * Paired values of counter and UTC at a given point in time.
+	 */
+	__le64 counter_value;
+	/*
+	 * Counter period, and error margin of same. The unit of these
+	 * fields is 1/2^(64 + counter_period_shift) of a second.
+	 */
+	__le64 counter_period_frac_sec;
+	__le64 counter_period_esterror_rate_frac_sec;
+	__le64 counter_period_maxerror_rate_frac_sec;
+
+	/*
+	 * Time according to time_type field above.
+	 */
+	__le64 time_sec;		/* Seconds since time_type epoch */
+	__le64 time_frac_sec;		/* Units of 1/2^64 of a second */
+	__le64 time_esterror_nanosec;
+	__le64 time_maxerror_nanosec;
+};
+
+#endif /*  __VMCLOCK_ABI_H__ */
-- 
cgit v1.2.3


From 80c549cd1ab0241a7af262690a0ff9991fc74ec5 Mon Sep 17 00:00:00 2001
From: Alexander Zubkov <green@qrator.net>
Date: Tue, 8 Oct 2024 18:27:57 +0200
Subject: Fix misspelling of "accept*" in net

Several files have "accept*" misspelled as "accpet*" in the comments.
Fix all such occurrences.

Signed-off-by: Alexander Zubkov <green@qrator.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20241008162756.22618-2-green@qrator.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c | 4 ++--
 drivers/net/ethernet/natsemi/ns83820.c                        | 2 +-
 include/uapi/linux/udp.h                                      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
index 455a54708be4..96fd31d75dfd 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c
@@ -342,8 +342,8 @@ static struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl,
 {
 	struct sk_buff *skb;
 
-	/* Allocate space for cpl_pass_accpet_req which will be synthesized by
-	 * driver. Once driver synthesizes cpl_pass_accpet_req the skb will go
+	/* Allocate space for cpl_pass_accept_req which will be synthesized by
+	 * driver. Once driver synthesizes cpl_pass_accept_req the skb will go
 	 * through the regular cpl_pass_accept_req processing in TOM.
 	 */
 	skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req)
diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c
index 998586872599..bea969dfa536 100644
--- a/drivers/net/ethernet/natsemi/ns83820.c
+++ b/drivers/net/ethernet/natsemi/ns83820.c
@@ -2090,7 +2090,7 @@ static int ns83820_init_one(struct pci_dev *pci_dev,
 	 */
 	/* Ramit : 1024 DMA is not a good idea, it ends up banging
 	 * some DELL and COMPAQ SMP systems
-	 * Turn on ALP, only we are accpeting Jumbo Packets */
+	 * Turn on ALP, only we are accepting Jumbo Packets */
 	writel(RXCFG_AEP | RXCFG_ARP | RXCFG_AIRL | RXCFG_RX_FD
 		| RXCFG_STRIPCRC
 		//| RXCFG_ALP
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 1a0fe8b151fb..d85d671deed3 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -31,7 +31,7 @@ struct udphdr {
 #define UDP_CORK	1	/* Never send partially complete segments */
 #define UDP_ENCAP	100	/* Set the socket to accept encapsulated packets */
 #define UDP_NO_CHECK6_TX 101	/* Disable sending checksum for UDP6X */
-#define UDP_NO_CHECK6_RX 102	/* Disable accpeting checksum for UDP6 */
+#define UDP_NO_CHECK6_RX 102	/* Disable accepting checksum for UDP6 */
 #define UDP_SEGMENT	103	/* Set GSO segmentation size */
 #define UDP_GRO		104	/* This socket can receive UDP GRO packets */
 
-- 
cgit v1.2.3


From 04e65df94b3112a1b319b6deb5bab83fd740bc7d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 9 Oct 2024 10:09:48 +0200
Subject: netlink: spec: add shaper YAML spec

Define the user-space visible interface to query, configure and delete
network shapers via yaml definition.

Add dummy implementations for the relevant NL callbacks.

set() and delete() operations touch a single shaper creating/updating or
deleting it.
The group() operation creates a shaper's group, nesting multiple input
shapers under the specified output shaper.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/7a33a1ff370bdbcd0cd3f909575c912cd56f41da.1728460186.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/net_shaper.yaml | 274 ++++++++++++++++++++++++++++
 MAINTAINERS                                 |   1 +
 include/uapi/linux/net_shaper.h             |  78 ++++++++
 net/Kconfig                                 |   3 +
 net/Makefile                                |   1 +
 net/shaper/Makefile                         |   8 +
 net/shaper/shaper.c                         |  55 ++++++
 net/shaper/shaper_nl_gen.c                  | 125 +++++++++++++
 net/shaper/shaper_nl_gen.h                  |  34 ++++
 9 files changed, 579 insertions(+)
 create mode 100644 Documentation/netlink/specs/net_shaper.yaml
 create mode 100644 include/uapi/linux/net_shaper.h
 create mode 100644 net/shaper/Makefile
 create mode 100644 net/shaper/shaper.c
 create mode 100644 net/shaper/shaper_nl_gen.c
 create mode 100644 net/shaper/shaper_nl_gen.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml
new file mode 100644
index 000000000000..618fc09932ff
--- /dev/null
+++ b/Documentation/netlink/specs/net_shaper.yaml
@@ -0,0 +1,274 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+name: net-shaper
+
+doc: |
+  Networking HW rate limiting configuration.
+
+  This API allows configuring HW shapers available on the network
+  devices at different levels (queues, network device) and allows
+  arbitrary manipulation of the scheduling tree of the involved
+  shapers.
+
+  Each @shaper is identified within the given device, by a @handle,
+  comprising both a @scope and an @id.
+
+  Depending on the @scope value, the shapers are attached to specific
+  HW objects (queues, devices) or, for @node scope, represent a
+  scheduling group, that can be placed in an arbitrary location of
+  the scheduling tree.
+
+  Shapers can be created with two different operations: the @set
+  operation, to create and update a single "attached" shaper, and
+  the @group operation, to create and update a scheduling
+  group. Only the @group operation can create @node scope shapers.
+
+  Existing shapers can be deleted/reset via the @delete operation.
+
+  The user can query the running configuration via the @get operation.
+
+definitions:
+  -
+    type: enum
+    name: scope
+    doc: Defines the shaper @id interpretation.
+    render-max: true
+    entries:
+      - name: unspec
+        doc: The scope is not specified.
+      -
+        name: netdev
+        doc: The main shaper for the given network device.
+      -
+        name: queue
+        doc: |
+            The shaper is attached to the given device queue,
+            the @id represents the queue number.
+      -
+        name: node
+        doc: |
+             The shaper allows grouping of queues or other
+             node shapers; can be nested in either @netdev
+             shapers or other @node shapers, allowing placement
+             in any location of the scheduling tree, except
+             leaves and root.
+  -
+    type: enum
+    name: metric
+    doc: Different metric supported by the shaper.
+    entries:
+      -
+        name: bps
+        doc: Shaper operates on a bits per second basis.
+      -
+        name: pps
+        doc: Shaper operates on a packets per second basis.
+
+attribute-sets:
+  -
+    name: net-shaper
+    attributes:
+      -
+        name: handle
+        type: nest
+        nested-attributes: handle
+        doc: Unique identifier for the given shaper inside the owning device.
+      -
+        name: metric
+        type: u32
+        enum: metric
+        doc: Metric used by the given shaper for bw-min, bw-max and burst.
+      -
+        name: bw-min
+        type: uint
+        doc: Guaranteed bandwidth for the given shaper.
+      -
+        name: bw-max
+        type: uint
+        doc: Maximum bandwidth for the given shaper or 0 when unlimited.
+      -
+        name: burst
+        type: uint
+        doc: |
+          Maximum burst-size for shaping. Should not be interpreted
+          as a quantum.
+      -
+        name: priority
+        type: u32
+        doc: |
+          Scheduling priority for the given shaper. The priority
+          scheduling is applied to sibling shapers.
+      -
+        name: weight
+        type: u32
+        doc: |
+          Relative weight for round robin scheduling of the
+          given shaper.
+          The scheduling is applied to all sibling shapers
+          with the same priority.
+      -
+        name: ifindex
+        type: u32
+        doc: Interface index owning the specified shaper.
+      -
+        name: parent
+        type: nest
+        nested-attributes: handle
+        doc: |
+          Identifier for the parent of the affected shaper.
+          Only needed for @group operation.
+      -
+        name: leaves
+        type: nest
+        multi-attr: true
+        nested-attributes: leaf-info
+        doc: |
+           Describes a set of leaf shapers for a @group operation.
+  -
+    name: handle
+    attributes:
+      -
+        name: scope
+        type: u32
+        enum: scope
+        doc: Defines the shaper @id interpretation.
+      -
+        name: id
+        type: u32
+        doc: |
+          Numeric identifier of a shaper. The id semantic depends on
+          the scope. For @queue scope it's the queue id and for @node
+          scope it's the node identifier.
+  -
+    name: leaf-info
+    subset-of: net-shaper
+    attributes:
+      -
+        name: handle
+      -
+        name: priority
+      -
+        name: weight
+
+operations:
+  list:
+    -
+      name: get
+      doc: |
+        Get information about a shaper for a given device.
+      attribute-set: net-shaper
+
+      do:
+        pre: net-shaper-nl-pre-doit
+        post: net-shaper-nl-post-doit
+        request:
+          attributes: &ns-binding
+            - ifindex
+            - handle
+        reply:
+          attributes: &ns-attrs
+            - ifindex
+            - parent
+            - handle
+            - metric
+            - bw-min
+            - bw-max
+            - burst
+            - priority
+            - weight
+
+      dump:
+        pre: net-shaper-nl-pre-dumpit
+        post: net-shaper-nl-post-dumpit
+        request:
+          attributes:
+            - ifindex
+        reply:
+          attributes: *ns-attrs
+    -
+      name: set
+      doc: |
+        Create or update the specified shaper.
+        The set operation can't be used to create a @node scope shaper,
+        use the @group operation instead.
+      attribute-set: net-shaper
+      flags: [ admin-perm ]
+
+      do:
+        pre: net-shaper-nl-pre-doit
+        post: net-shaper-nl-post-doit
+        request:
+          attributes:
+            - ifindex
+            - handle
+            - metric
+            - bw-min
+            - bw-max
+            - burst
+            - priority
+            - weight
+
+    -
+      name: delete
+      doc: |
+        Clear (remove) the specified shaper. When deleting
+        a @node shaper, reattach all the node's leaves to the
+        deleted node's parent.
+        If, after the removal, the parent shaper has no more
+        leaves and the parent shaper scope is @node, the parent
+        node is deleted, recursively.
+        When deleting a @queue shaper or a @netdev shaper,
+        the shaper disappears from the hierarchy, but the
+        queue/device can still send traffic: it has an implicit
+        node with infinite bandwidth. The queue's implicit node
+        feeds an implicit RR node at the root of the hierarchy.
+      attribute-set: net-shaper
+      flags: [ admin-perm ]
+
+      do:
+        pre: net-shaper-nl-pre-doit
+        post: net-shaper-nl-post-doit
+        request:
+          attributes: *ns-binding
+
+    -
+      name: group
+      doc: |
+        Create or update a scheduling group, attaching the specified
+        @leaves shapers under the specified node identified by @handle.
+        The @leaves shapers scope must be @queue and the node shaper
+        scope must be either @node or @netdev.
+        When the node shaper has @node scope, if the @handle @id is not
+        specified, a new shaper of such scope is created, otherwise the
+        specified node must already exist.
+        When updating an existing node shaper, the specified @leaves are
+        added to the existing node; such node will also retain any preexisting
+        leaves.
+        The @parent handle for a new node shaper defaults to the parent
+        of all the leaves, provided all the leaves share the same parent.
+        Otherwise @parent handle must be specified.
+        The user can optionally provide shaping attributes for the node
+        shaper.
+        The operation is atomic, on failure no change is applied to
+        the device shaping configuration, otherwise the @node shaper
+        full identifier, comprising @binding and @handle, is provided
+        as the reply.
+      attribute-set: net-shaper
+      flags: [ admin-perm ]
+
+      do:
+        pre: net-shaper-nl-pre-doit
+        post: net-shaper-nl-post-doit
+        request:
+          attributes:
+            - ifindex
+            - parent
+            - handle
+            - metric
+            - bw-min
+            - bw-max
+            - burst
+            - priority
+            - weight
+            - leaves
+        reply:
+          attributes: *ns-binding
diff --git a/MAINTAINERS b/MAINTAINERS
index 1389704c7d8d..2927b44dda25 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16116,6 +16116,7 @@ F:	include/linux/platform_data/wiznet.h
 F:	include/uapi/linux/cn_proc.h
 F:	include/uapi/linux/ethtool_netlink.h
 F:	include/uapi/linux/if_*
+F:	include/uapi/linux/net_shaper.h
 F:	include/uapi/linux/netdev*
 F:	tools/testing/selftests/drivers/net/
 X:	Documentation/devicetree/bindings/net/bluetooth/
diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h
new file mode 100644
index 000000000000..9e3fa63618ee
--- /dev/null
+++ b/include/uapi/linux/net_shaper.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/net_shaper.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_NET_SHAPER_H
+#define _UAPI_LINUX_NET_SHAPER_H
+
+#define NET_SHAPER_FAMILY_NAME		"net-shaper"
+#define NET_SHAPER_FAMILY_VERSION	1
+
+/**
+ * enum net_shaper_scope - Defines the shaper @id interpretation.
+ * @NET_SHAPER_SCOPE_UNSPEC: The scope is not specified.
+ * @NET_SHAPER_SCOPE_NETDEV: The main shaper for the given network device.
+ * @NET_SHAPER_SCOPE_QUEUE: The shaper is attached to the given device queue,
+ *   the @id represents the queue number.
+ * @NET_SHAPER_SCOPE_NODE: The shaper allows grouping of queues or other node
+ *   shapers; can be nested in either @netdev shapers or other @node shapers,
+ *   allowing placement in any location of the scheduling tree, except leaves
+ *   and root.
+ */
+enum net_shaper_scope {
+	NET_SHAPER_SCOPE_UNSPEC,
+	NET_SHAPER_SCOPE_NETDEV,
+	NET_SHAPER_SCOPE_QUEUE,
+	NET_SHAPER_SCOPE_NODE,
+
+	/* private: */
+	__NET_SHAPER_SCOPE_MAX,
+	NET_SHAPER_SCOPE_MAX = (__NET_SHAPER_SCOPE_MAX - 1)
+};
+
+/**
+ * enum net_shaper_metric - Different metric supported by the shaper.
+ * @NET_SHAPER_METRIC_BPS: Shaper operates on a bits per second basis.
+ * @NET_SHAPER_METRIC_PPS: Shaper operates on a packets per second basis.
+ */
+enum net_shaper_metric {
+	NET_SHAPER_METRIC_BPS,
+	NET_SHAPER_METRIC_PPS,
+};
+
+enum {
+	NET_SHAPER_A_HANDLE = 1,
+	NET_SHAPER_A_METRIC,
+	NET_SHAPER_A_BW_MIN,
+	NET_SHAPER_A_BW_MAX,
+	NET_SHAPER_A_BURST,
+	NET_SHAPER_A_PRIORITY,
+	NET_SHAPER_A_WEIGHT,
+	NET_SHAPER_A_IFINDEX,
+	NET_SHAPER_A_PARENT,
+	NET_SHAPER_A_LEAVES,
+
+	__NET_SHAPER_A_MAX,
+	NET_SHAPER_A_MAX = (__NET_SHAPER_A_MAX - 1)
+};
+
+enum {
+	NET_SHAPER_A_HANDLE_SCOPE = 1,
+	NET_SHAPER_A_HANDLE_ID,
+
+	__NET_SHAPER_A_HANDLE_MAX,
+	NET_SHAPER_A_HANDLE_MAX = (__NET_SHAPER_A_HANDLE_MAX - 1)
+};
+
+enum {
+	NET_SHAPER_CMD_GET = 1,
+	NET_SHAPER_CMD_SET,
+	NET_SHAPER_CMD_DELETE,
+	NET_SHAPER_CMD_GROUP,
+
+	__NET_SHAPER_CMD_MAX,
+	NET_SHAPER_CMD_MAX = (__NET_SHAPER_CMD_MAX - 1)
+};
+
+#endif /* _UAPI_LINUX_NET_SHAPER_H */
diff --git a/net/Kconfig b/net/Kconfig
index a629f92dc86b..c3fca69a7c83 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -72,6 +72,9 @@ config NET_DEVMEM
 	depends on GENERIC_ALLOCATOR
 	depends on PAGE_POOL
 
+config NET_SHAPER
+	bool
+
 menu "Networking options"
 
 source "net/packet/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 65bb8c72a35e..60ed5190eda8 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -79,3 +79,4 @@ obj-$(CONFIG_XDP_SOCKETS)	+= xdp/
 obj-$(CONFIG_MPTCP)		+= mptcp/
 obj-$(CONFIG_MCTP)		+= mctp/
 obj-$(CONFIG_NET_HANDSHAKE)	+= handshake/
+obj-$(CONFIG_NET_SHAPER)	+= shaper/
diff --git a/net/shaper/Makefile b/net/shaper/Makefile
new file mode 100644
index 000000000000..54af7169a331
--- /dev/null
+++ b/net/shaper/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the net shaper infrastructure.
+#
+# Copyright (c) 2024, Red Hat, Inc.
+#
+
+obj-y += shaper.o shaper_nl_gen.o
diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c
new file mode 100644
index 000000000000..a1b20888f502
--- /dev/null
+++ b/net/shaper/shaper.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include "shaper_nl_gen.h"
+
+int net_shaper_nl_pre_doit(const struct genl_split_ops *ops,
+			   struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
+void net_shaper_nl_post_doit(const struct genl_split_ops *ops,
+			     struct sk_buff *skb, struct genl_info *info)
+{
+}
+
+int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
+int net_shaper_nl_get_dumpit(struct sk_buff *skb,
+			     struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
+int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
+int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
+int net_shaper_nl_pre_dumpit(struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
+int net_shaper_nl_post_dumpit(struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
+static int __init shaper_init(void)
+{
+	return genl_register_family(&net_shaper_nl_family);
+}
+
+subsys_initcall(shaper_init);
diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c
new file mode 100644
index 000000000000..34185c5989e6
--- /dev/null
+++ b/net/shaper/shaper_nl_gen.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/net_shaper.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "shaper_nl_gen.h"
+
+#include <uapi/linux/net_shaper.h>
+
+/* Common nested types */
+const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1] = {
+	[NET_SHAPER_A_HANDLE_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3),
+	[NET_SHAPER_A_HANDLE_ID] = { .type = NLA_U32, },
+};
+
+const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = {
+	[NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy),
+	[NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, },
+	[NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, },
+};
+
+/* NET_SHAPER_CMD_GET - do */
+static const struct nla_policy net_shaper_get_do_nl_policy[NET_SHAPER_A_IFINDEX + 1] = {
+	[NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, },
+	[NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy),
+};
+
+/* NET_SHAPER_CMD_GET - dump */
+static const struct nla_policy net_shaper_get_dump_nl_policy[NET_SHAPER_A_IFINDEX + 1] = {
+	[NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, },
+};
+
+/* NET_SHAPER_CMD_SET - do */
+static const struct nla_policy net_shaper_set_nl_policy[NET_SHAPER_A_IFINDEX + 1] = {
+	[NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, },
+	[NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy),
+	[NET_SHAPER_A_METRIC] = NLA_POLICY_MAX(NLA_U32, 1),
+	[NET_SHAPER_A_BW_MIN] = { .type = NLA_UINT, },
+	[NET_SHAPER_A_BW_MAX] = { .type = NLA_UINT, },
+	[NET_SHAPER_A_BURST] = { .type = NLA_UINT, },
+	[NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, },
+	[NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, },
+};
+
+/* NET_SHAPER_CMD_DELETE - do */
+static const struct nla_policy net_shaper_delete_nl_policy[NET_SHAPER_A_IFINDEX + 1] = {
+	[NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, },
+	[NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy),
+};
+
+/* NET_SHAPER_CMD_GROUP - do */
+static const struct nla_policy net_shaper_group_nl_policy[NET_SHAPER_A_LEAVES + 1] = {
+	[NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, },
+	[NET_SHAPER_A_PARENT] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy),
+	[NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy),
+	[NET_SHAPER_A_METRIC] = NLA_POLICY_MAX(NLA_U32, 1),
+	[NET_SHAPER_A_BW_MIN] = { .type = NLA_UINT, },
+	[NET_SHAPER_A_BW_MAX] = { .type = NLA_UINT, },
+	[NET_SHAPER_A_BURST] = { .type = NLA_UINT, },
+	[NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, },
+	[NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, },
+	[NET_SHAPER_A_LEAVES] = NLA_POLICY_NESTED(net_shaper_leaf_info_nl_policy),
+};
+
+/* Ops table for net_shaper */
+static const struct genl_split_ops net_shaper_nl_ops[] = {
+	{
+		.cmd		= NET_SHAPER_CMD_GET,
+		.pre_doit	= net_shaper_nl_pre_doit,
+		.doit		= net_shaper_nl_get_doit,
+		.post_doit	= net_shaper_nl_post_doit,
+		.policy		= net_shaper_get_do_nl_policy,
+		.maxattr	= NET_SHAPER_A_IFINDEX,
+		.flags		= GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= NET_SHAPER_CMD_GET,
+		.start		= net_shaper_nl_pre_dumpit,
+		.dumpit		= net_shaper_nl_get_dumpit,
+		.done		= net_shaper_nl_post_dumpit,
+		.policy		= net_shaper_get_dump_nl_policy,
+		.maxattr	= NET_SHAPER_A_IFINDEX,
+		.flags		= GENL_CMD_CAP_DUMP,
+	},
+	{
+		.cmd		= NET_SHAPER_CMD_SET,
+		.pre_doit	= net_shaper_nl_pre_doit,
+		.doit		= net_shaper_nl_set_doit,
+		.post_doit	= net_shaper_nl_post_doit,
+		.policy		= net_shaper_set_nl_policy,
+		.maxattr	= NET_SHAPER_A_IFINDEX,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= NET_SHAPER_CMD_DELETE,
+		.pre_doit	= net_shaper_nl_pre_doit,
+		.doit		= net_shaper_nl_delete_doit,
+		.post_doit	= net_shaper_nl_post_doit,
+		.policy		= net_shaper_delete_nl_policy,
+		.maxattr	= NET_SHAPER_A_IFINDEX,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= NET_SHAPER_CMD_GROUP,
+		.pre_doit	= net_shaper_nl_pre_doit,
+		.doit		= net_shaper_nl_group_doit,
+		.post_doit	= net_shaper_nl_post_doit,
+		.policy		= net_shaper_group_nl_policy,
+		.maxattr	= NET_SHAPER_A_LEAVES,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+};
+
+struct genl_family net_shaper_nl_family __ro_after_init = {
+	.name		= NET_SHAPER_FAMILY_NAME,
+	.version	= NET_SHAPER_FAMILY_VERSION,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.split_ops	= net_shaper_nl_ops,
+	.n_split_ops	= ARRAY_SIZE(net_shaper_nl_ops),
+};
diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h
new file mode 100644
index 000000000000..016cb6f3187b
--- /dev/null
+++ b/net/shaper/shaper_nl_gen.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/net_shaper.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_NET_SHAPER_GEN_H
+#define _LINUX_NET_SHAPER_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/net_shaper.h>
+
+/* Common nested types */
+extern const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1];
+extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1];
+
+int net_shaper_nl_pre_doit(const struct genl_split_ops *ops,
+			   struct sk_buff *skb, struct genl_info *info);
+void
+net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+			struct genl_info *info);
+int net_shaper_nl_pre_dumpit(struct netlink_callback *cb);
+int net_shaper_nl_post_dumpit(struct netlink_callback *cb);
+
+int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
+int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info);
+int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info);
+int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info);
+
+extern struct genl_family net_shaper_nl_family;
+
+#endif /* _LINUX_NET_SHAPER_GEN_H */
-- 
cgit v1.2.3


From 14bba9285aedefb99647d716b0f61bf32081e387 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 9 Oct 2024 10:09:54 +0200
Subject: netlink: spec: add shaper introspection support

Allow user-space to perform fine-grained queries of the shaping features
supported by the NIC on each domain.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/3ddd10e450e3fe7d4b944c0d0b886d4483529ee6.1728460186.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/net_shaper.yaml | 88 +++++++++++++++++++++++++++++
 include/uapi/linux/net_shaper.h             | 17 ++++++
 net/shaper/shaper.c                         | 32 +++++++++++
 net/shaper/shaper_nl_gen.c                  | 29 ++++++++++
 net/shaper/shaper_nl_gen.h                  | 10 ++++
 5 files changed, 176 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml
index 618fc09932ff..8ebad0d02904 100644
--- a/Documentation/netlink/specs/net_shaper.yaml
+++ b/Documentation/netlink/specs/net_shaper.yaml
@@ -26,6 +26,11 @@ doc: |
 
   The user can query the running configuration via the @get operation.
 
+  Different devices can provide different feature sets, e.g. with no
+  support for complex scheduling hierarchy, or for some shaping
+  parameters. The user can introspect the HW capabilities via the
+  @cap-get operation.
+
 definitions:
   -
     type: enum
@@ -148,6 +153,53 @@ attribute-sets:
         name: priority
       -
         name: weight
+  -
+    name: caps
+    attributes:
+      -
+        name: ifindex
+        type: u32
+        doc: Interface index queried for shapers capabilities.
+      -
+        name: scope
+        type: u32
+        enum: scope
+        doc: The scope to which the queried capabilities apply.
+      -
+        name: support-metric-bps
+        type: flag
+        doc: The device accepts 'bps' metric for bw-min, bw-max and burst.
+      -
+        name: support-metric-pps
+        type: flag
+        doc: The device accepts 'pps' metric for bw-min, bw-max and burst.
+      -
+        name: support-nesting
+        type: flag
+        doc: |
+          The device supports nesting shapers belonging to this scope
+          below 'node' scoped shapers. Only 'queue' and 'node'
+          scope can have flag 'support-nesting'.
+      -
+        name: support-bw-min
+        type: flag
+        doc: The device supports a minimum guaranteed B/W.
+      -
+        name: support-bw-max
+        type: flag
+        doc: The device supports maximum B/W shaping.
+      -
+        name: support-burst
+        type: flag
+        doc: The device supports a maximum burst size.
+      -
+        name: support-priority
+        type: flag
+        doc: The device supports priority scheduling.
+      -
+        name: support-weight
+        type: flag
+        doc: The device supports weighted round robin scheduling.
 
 operations:
   list:
@@ -272,3 +324,39 @@ operations:
             - leaves
         reply:
           attributes: *ns-binding
+
+    -
+      name: cap-get
+      doc: |
+        Get the shaper capabilities supported by the given device
+        for the specified scope.
+      attribute-set: caps
+
+      do:
+        pre: net-shaper-nl-cap-pre-doit
+        post: net-shaper-nl-cap-post-doit
+        request:
+          attributes:
+            - ifindex
+            - scope
+        reply:
+          attributes: &cap-attrs
+            - ifindex
+            - scope
+            - support-metric-bps
+            - support-metric-pps
+            - support-nesting
+            - support-bw-min
+            - support-bw-max
+            - support-burst
+            - support-priority
+            - support-weight
+
+      dump:
+        pre: net-shaper-nl-cap-pre-dumpit
+        post: net-shaper-nl-cap-post-dumpit
+        request:
+          attributes:
+            - ifindex
+        reply:
+          attributes: *cap-attrs
diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h
index 9e3fa63618ee..d8834b59f7d7 100644
--- a/include/uapi/linux/net_shaper.h
+++ b/include/uapi/linux/net_shaper.h
@@ -65,11 +65,28 @@ enum {
 	NET_SHAPER_A_HANDLE_MAX = (__NET_SHAPER_A_HANDLE_MAX - 1)
 };
 
+enum {
+	NET_SHAPER_A_CAPS_IFINDEX = 1,
+	NET_SHAPER_A_CAPS_SCOPE,
+	NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS,
+	NET_SHAPER_A_CAPS_SUPPORT_METRIC_PPS,
+	NET_SHAPER_A_CAPS_SUPPORT_NESTING,
+	NET_SHAPER_A_CAPS_SUPPORT_BW_MIN,
+	NET_SHAPER_A_CAPS_SUPPORT_BW_MAX,
+	NET_SHAPER_A_CAPS_SUPPORT_BURST,
+	NET_SHAPER_A_CAPS_SUPPORT_PRIORITY,
+	NET_SHAPER_A_CAPS_SUPPORT_WEIGHT,
+
+	__NET_SHAPER_A_CAPS_MAX,
+	NET_SHAPER_A_CAPS_MAX = (__NET_SHAPER_A_CAPS_MAX - 1)
+};
+
 enum {
 	NET_SHAPER_CMD_GET = 1,
 	NET_SHAPER_CMD_SET,
 	NET_SHAPER_CMD_DELETE,
 	NET_SHAPER_CMD_GROUP,
+	NET_SHAPER_CMD_CAP_GET,
 
 	__NET_SHAPER_CMD_MAX,
 	NET_SHAPER_CMD_MAX = (__NET_SHAPER_CMD_MAX - 1)
diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c
index 85ad172833fc..92c8da046391 100644
--- a/net/shaper/shaper.c
+++ b/net/shaper/shaper.c
@@ -598,6 +598,27 @@ int net_shaper_nl_post_dumpit(struct netlink_callback *cb)
 	return 0;
 }
 
+int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops,
+			       struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
+void net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops,
+				 struct sk_buff *skb, struct genl_info *info)
+{
+}
+
+int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
+int net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
 int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info)
 {
 	struct net_shaper_binding *binding;
@@ -1126,6 +1147,17 @@ free_msg:
 	goto free_leaves;
 }
 
+int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	return 0;
+}
+
+int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	return 0;
+}
+
 static void net_shaper_flush(struct net_shaper_binding *binding)
 {
 	struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding);
diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c
index 34185c5989e6..204c8ae8c7b1 100644
--- a/net/shaper/shaper_nl_gen.c
+++ b/net/shaper/shaper_nl_gen.c
@@ -65,6 +65,17 @@ static const struct nla_policy net_shaper_group_nl_policy[NET_SHAPER_A_LEAVES +
 	[NET_SHAPER_A_LEAVES] = NLA_POLICY_NESTED(net_shaper_leaf_info_nl_policy),
 };
 
+/* NET_SHAPER_CMD_CAP_GET - do */
+static const struct nla_policy net_shaper_cap_get_do_nl_policy[NET_SHAPER_A_CAPS_SCOPE + 1] = {
+	[NET_SHAPER_A_CAPS_IFINDEX] = { .type = NLA_U32, },
+	[NET_SHAPER_A_CAPS_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3),
+};
+
+/* NET_SHAPER_CMD_CAP_GET - dump */
+static const struct nla_policy net_shaper_cap_get_dump_nl_policy[NET_SHAPER_A_CAPS_IFINDEX + 1] = {
+	[NET_SHAPER_A_CAPS_IFINDEX] = { .type = NLA_U32, },
+};
+
 /* Ops table for net_shaper */
 static const struct genl_split_ops net_shaper_nl_ops[] = {
 	{
@@ -112,6 +123,24 @@ static const struct genl_split_ops net_shaper_nl_ops[] = {
 		.maxattr	= NET_SHAPER_A_LEAVES,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
+	{
+		.cmd		= NET_SHAPER_CMD_CAP_GET,
+		.pre_doit	= net_shaper_nl_cap_pre_doit,
+		.doit		= net_shaper_nl_cap_get_doit,
+		.post_doit	= net_shaper_nl_cap_post_doit,
+		.policy		= net_shaper_cap_get_do_nl_policy,
+		.maxattr	= NET_SHAPER_A_CAPS_SCOPE,
+		.flags		= GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= NET_SHAPER_CMD_CAP_GET,
+		.start		= net_shaper_nl_cap_pre_dumpit,
+		.dumpit		= net_shaper_nl_cap_get_dumpit,
+		.done		= net_shaper_nl_cap_post_dumpit,
+		.policy		= net_shaper_cap_get_dump_nl_policy,
+		.maxattr	= NET_SHAPER_A_CAPS_IFINDEX,
+		.flags		= GENL_CMD_CAP_DUMP,
+	},
 };
 
 struct genl_family net_shaper_nl_family __ro_after_init = {
diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h
index 016cb6f3187b..cb7f9026fc23 100644
--- a/net/shaper/shaper_nl_gen.h
+++ b/net/shaper/shaper_nl_gen.h
@@ -17,17 +17,27 @@ extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGH
 
 int net_shaper_nl_pre_doit(const struct genl_split_ops *ops,
 			   struct sk_buff *skb, struct genl_info *info);
+int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops,
+			       struct sk_buff *skb, struct genl_info *info);
 void
 net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
 			struct genl_info *info);
+void
+net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops,
+			    struct sk_buff *skb, struct genl_info *info);
 int net_shaper_nl_pre_dumpit(struct netlink_callback *cb);
+int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb);
 int net_shaper_nl_post_dumpit(struct netlink_callback *cb);
+int net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb);
 
 int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
 int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info);
 int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info);
 int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info);
+int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info);
+int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb,
+				 struct netlink_callback *cb);
 
 extern struct genl_family net_shaper_nl_family;
 
-- 
cgit v1.2.3


From 5bd48a3a14df4b3ee1be0757efcc0f40d4f57b35 Mon Sep 17 00:00:00 2001
From: Matteo Croce <teknoraver@meta.com>
Date: Thu, 10 Oct 2024 04:56:52 +0100
Subject: bpf: fix argument type in bpf_loop documentation

The `index` argument to bpf_loop() is treated as a u64.
This led to a subtle verifier denial where clang cloned the argument
in another register[1].

[1] https://github.com/systemd/systemd/pull/34650#issuecomment-2401092895

Signed-off-by: Matteo Croce <teknoraver@meta.com>
Link: https://lore.kernel.org/r/20241010035652.17830-1-technoboy85@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 2 +-
 kernel/bpf/verifier.c          | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8ab4d8184b9d..874af0186fe8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5371,7 +5371,7 @@ union bpf_attr {
  *		Currently, the **flags** must be 0. Currently, nr_loops is
  *		limited to 1 << 23 (~8 million) loops.
  *
- *		long (\*callback_fn)(u32 index, void \*ctx);
+ *		long (\*callback_fn)(u64 index, void \*ctx);
  *
  *		where **index** is the current index in the loop. The index
  *		is zero-indexed.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7d9b38ffd220..cfc62e0776bf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9917,7 +9917,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
 {
 	/* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
 	 *	    u64 flags);
-	 * callback_fn(u32 index, void *callback_ctx);
+	 * callback_fn(u64 index, void *callback_ctx);
 	 */
 	callee->regs[BPF_REG_1].type = SCALAR_VALUE;
 	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7610883c8191..5937c39069ba 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5371,7 +5371,7 @@ union bpf_attr {
  *		Currently, the **flags** must be 0. Currently, nr_loops is
  *		limited to 1 << 23 (~8 million) loops.
  *
- *		long (\*callback_fn)(u32 index, void \*ctx);
+ *		long (\*callback_fn)(u64 index, void \*ctx);
  *
  *		where **index** is the current index in the loop. The index
  *		is zero-indexed.
-- 
cgit v1.2.3


From c6ca31981b545ad3081007b6aa88b6aab1b0cece Mon Sep 17 00:00:00 2001
From: Martin Kelly <martin.kelly@crowdstrike.com>
Date: Thu, 10 Oct 2024 12:33:01 -0700
Subject: bpf: Update bpf_override_return() comment

The documentation says CONFIG_FUNCTION_ERROR_INJECTION is supported only
on x86. This was presumably true at the time of writing, but it's now
supported on many other architectures too. Drop this statement, since
it's not correct anymore and it fits better in other documentation
anyway.

Signed-off-by: Martin Kelly <martin.kelly@crowdstrike.com>
Link: https://lore.kernel.org/r/20241010193301.995909-1-martin.kelly@crowdstrike.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 4 ----
 tools/include/uapi/linux/bpf.h | 4 ----
 2 files changed, 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 874af0186fe8..627c4195f04f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3103,10 +3103,6 @@ union bpf_attr {
  * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
  * 		option, and in this case it only works on functions tagged with
  * 		**ALLOW_ERROR_INJECTION** in the kernel code.
- *
- * 		Also, the helper is only available for the architectures having
- * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
- * 		x86 architecture is the only one to support this feature.
  * 	Return
  * 		0
  *
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5937c39069ba..0e49ce2981a0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3103,10 +3103,6 @@ union bpf_attr {
  * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
  * 		option, and in this case it only works on functions tagged with
  * 		**ALLOW_ERROR_INJECTION** in the kernel code.
- *
- * 		Also, the helper is only available for the architectures having
- * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
- * 		x86 architecture is the only one to support this feature.
  * 	Return
  * 		0
  *
-- 
cgit v1.2.3


From 445936f9e258eca624c8239056bd8cd6e853b3fd Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Mon, 23 Sep 2024 11:59:57 +0200
Subject: thermal: core: Add user thresholds support

The user thresholds mechanism is a way for userspace to tell the
thermal framework to send a notification when a temperature limit
is crossed. There is no id, no hysteresis, just the temperature and
the direction of the limit crossing. That means we can be notified
when a threshold is crossed on the way up only, on the way down only,
or both ways. That allows hysteresis values to be created if needed.

A threshold can be added, deleted or flushed. The latter means all
thresholds belonging to a thermal zone will be deleted.

When a threshold is added:

 - if the same threshold (temperature and direction) exists, an error
   is returned

 - if a threshold is specified with the same temperature but a
   different direction, the specified direction is added

 - if there is no threshold with the same temperature then it is
   created

When a threshold is deleted:

 - if the same threshold (temperature and direction) exists, it is
   deleted

 - if a threshold is specified with the same temperature but a
   different direction, the specified direction is removed

 - if there is no threshold with the same temperature, then an error
   is returned

When the thresholds are flushed:

 - All thresholds related to a thermal zone are deleted

When a threshold is crossed:

 - the userspace does not need to know which threshold(s) have been
   crossed, it will be notified with the current temperature and the
   previous temperature

 - if multiple thresholds have been crossed between two updates, only
   one notification will be sent to userspace; it is pointless to
   send a notification per threshold crossed as userspace can
   handle that easily when it has the temperature delta information

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20240923100005.2532430-2-daniel.lezcano@linaro.org
[ rjw: Subject edit, use BIT(0) and BIT(1) in symbol definitions ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/Makefile             |   1 +
 drivers/thermal/thermal_core.h       |   2 +
 drivers/thermal/thermal_thresholds.c | 229 +++++++++++++++++++++++++++++++++++
 drivers/thermal/thermal_thresholds.h |  19 +++
 include/linux/thermal.h              |   3 +
 include/uapi/linux/thermal.h         |   2 +
 6 files changed, 256 insertions(+)
 create mode 100644 drivers/thermal/thermal_thresholds.c
 create mode 100644 drivers/thermal/thermal_thresholds.h

(limited to 'include/uapi/linux')

diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 41c4d56beb40..1e1559bb971e 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -6,6 +6,7 @@ CFLAGS_thermal_core.o		:= -I$(src)
 obj-$(CONFIG_THERMAL)		+= thermal_sys.o
 thermal_sys-y			+= thermal_core.o thermal_sysfs.o
 thermal_sys-y			+= thermal_trip.o thermal_helpers.o
+thermal_sys-y			+= thermal_thresholds.o
 
 # netlink interface to manage the thermal framework
 thermal_sys-$(CONFIG_THERMAL_NETLINK)		+= thermal_netlink.o
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index a64d39b1c86b..1ea91d59498b 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -13,6 +13,7 @@
 #include <linux/thermal.h>
 
 #include "thermal_netlink.h"
+#include "thermal_thresholds.h"
 #include "thermal_debugfs.h"
 
 struct thermal_attr {
@@ -139,6 +140,7 @@ struct thermal_zone_device {
 #ifdef CONFIG_THERMAL_DEBUGFS
 	struct thermal_debugfs *debugfs;
 #endif
+	struct list_head user_thresholds;
 	struct thermal_trip_desc trips[] __counted_by(num_trips);
 };
 
diff --git a/drivers/thermal/thermal_thresholds.c b/drivers/thermal/thermal_thresholds.c
new file mode 100644
index 000000000000..f33b6d5474d8
--- /dev/null
+++ b/drivers/thermal/thermal_thresholds.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2024 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * Thermal thresholds
+ */
+#include <linux/list.h>
+#include <linux/list_sort.h>
+#include <linux/slab.h>
+
+#include "thermal_core.h"
+#include "thermal_thresholds.h"
+
+/* Give @tz an empty list of user thresholds; cannot fail, so returns 0. */
+int thermal_thresholds_init(struct thermal_zone_device *tz)
+{
+	INIT_LIST_HEAD(&tz->user_thresholds);
+	return 0;
+}
+
+/* Free every user threshold of @tz, then trigger a thermal zone update. */
+void thermal_thresholds_flush(struct thermal_zone_device *tz)
+{
+	struct list_head *head = &tz->user_thresholds;
+	struct user_threshold *cur, *next;
+
+	lockdep_assert_held(&tz->lock);
+	list_for_each_entry_safe(cur, next, head, list_node) {
+		list_del(&cur->list_node);
+		kfree(cur);
+	}
+
+	__thermal_zone_device_update(tz, THERMAL_TZ_FLUSH_THRESHOLDS);
+}
+
+void thermal_thresholds_exit(struct thermal_zone_device *tz)
+{
+	thermal_thresholds_flush(tz);	/* free all thresholds; NOTE(review): flush asserts tz->lock held */
+}
+
+/* list_sort() comparator ordering thresholds by ascending temperature. */
+static int __thermal_thresholds_cmp(void *data, const struct list_head *l1,
+				    const struct list_head *l2)
+{
+	struct user_threshold *t1 = container_of(l1, struct user_threshold, list_node);
+	struct user_threshold *t2 = container_of(l2, struct user_threshold, list_node);
+	/* Compare, don't subtract: the difference of two ints may overflow. */
+	return (t1->temperature > t2->temperature) - (t1->temperature < t2->temperature);
+}
+
+/* Return the threshold set at exactly @temperature, or NULL if there is none. */
+static struct user_threshold *__thermal_thresholds_find(const struct list_head *thresholds,
+							int temperature)
+{
+	struct user_threshold *pos;
+
+	list_for_each_entry(pos, thresholds, list_node)
+		if (pos->temperature == temperature)
+			return pos;
+	return NULL;
+}
+
+/*
+ * Narrow the [*low, *high] window around @temperature using @threshold and
+ * report whether the move from @last_temperature crossed it in @direction.
+ */
+static bool __thermal_threshold_is_crossed(struct user_threshold *threshold, int temperature,
+					   int last_temperature, int direction,
+					   int *low, int *high)
+{
+	const int limit = threshold->temperature;
+	const bool armed = threshold->direction & direction;
+
+	if (temperature >= limit) {
+		/* At or above the limit: it may be the next one met going down. */
+		if (limit > *low && (threshold->direction & THERMAL_THRESHOLD_WAY_DOWN))
+			*low = limit;
+
+		return last_temperature < limit && armed;
+	}
+
+	/* Below the limit: it may be the next one met going up. */
+	if (limit < *high && (threshold->direction & THERMAL_THRESHOLD_WAY_UP))
+		*high = limit;
+
+	return last_temperature >= limit && armed;
+}
+
+/* Scan thresholds in list order; stop at the first one crossed the way up. */
+static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int temperature,
+					      int last_temperature, int *low, int *high)
+{
+	const int way = THERMAL_THRESHOLD_WAY_UP;
+	struct user_threshold *pos;
+
+	list_for_each_entry(pos, thresholds, list_node)
+		if (__thermal_threshold_is_crossed(pos, temperature, last_temperature,
+						   way, low, high))
+			return true;
+	return false;
+}
+
+/* Scan thresholds from the list tail; stop at the first one crossed the way down. */
+static bool thermal_thresholds_handle_dropping(struct list_head *thresholds, int temperature,
+					       int last_temperature, int *low, int *high)
+{
+	const int way = THERMAL_THRESHOLD_WAY_DOWN;
+	struct user_threshold *pos;
+
+	list_for_each_entry_reverse(pos, thresholds, list_node)
+		if (__thermal_threshold_is_crossed(pos, temperature, last_temperature,
+						   way, low, high))
+			return true;
+	return false;
+}
+
+/*
+ * Detect whether the move from tz->last_temperature to tz->temperature crossed
+ * a user threshold and log it with pr_debug(). *@low and *@high are tightened
+ * by the thresholds visited before a crossing is found.
+ */
+void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high)
+{
+	struct list_head *thresholds = &tz->user_thresholds;
+	const int now = tz->temperature;
+	const int prev = tz->last_temperature;
+	bool rising, crossed;
+
+	lockdep_assert_held(&tz->lock);
+
+	/*
+	 * Nothing can be detected without a previous reading, and an
+	 * unchanged temperature cannot have crossed anything.
+	 */
+	if (prev == THERMAL_TEMP_INVALID || prev == now)
+		return;
+
+	/*
+	 * The sign of the change tells which way thresholds were crossed:
+	 * up when the temperature increased, down when it decreased.
+	 */
+	rising = now > prev;
+	if (rising)
+		crossed = thermal_thresholds_handle_raising(thresholds, now, prev,
+							    low, high);
+	else
+		crossed = thermal_thresholds_handle_dropping(thresholds, now, prev,
+							     low, high);
+
+	if (!crossed)
+		return;
+
+	pr_debug("A threshold has been crossed the way %s, with a temperature=%d, last_temperature=%d\n",
+		 rising ? "up" : "down", now, prev);
+}
+
+/*
+ * Add @direction to the user threshold at @temperature, creating it if needed.
+ * Returns 0, -EEXIST if it already has exactly @direction, or -ENOMEM.
+ */
+int thermal_thresholds_add(struct thermal_zone_device *tz, int temperature, int direction)
+{
+	struct list_head *thresholds = &tz->user_thresholds;
+	struct user_threshold *t;
+
+	lockdep_assert_held(&tz->lock);
+
+	t = __thermal_thresholds_find(thresholds, temperature);
+	if (!t) {
+		t = kmalloc(sizeof(*t), GFP_KERNEL);
+		if (!t)
+			return -ENOMEM;
+		INIT_LIST_HEAD(&t->list_node);
+		t->temperature = temperature;
+		t->direction = direction;
+		list_add(&t->list_node, thresholds);
+		list_sort(NULL, thresholds, __thermal_thresholds_cmp);
+	} else if (t->direction != direction) {
+		t->direction |= direction;
+	} else {
+		return -EEXIST;
+	}
+
+	__thermal_zone_device_update(tz, THERMAL_TZ_ADD_THRESHOLD);
+	return 0;
+}
+
+/* Drop @direction from the threshold at @temperature; -ENOENT if none is set. */
+int thermal_thresholds_delete(struct thermal_zone_device *tz, int temperature, int direction)
+{
+	struct user_threshold *t;
+
+	lockdep_assert_held(&tz->lock);
+
+	t = __thermal_thresholds_find(&tz->user_thresholds, temperature);
+	if (!t)
+		return -ENOENT;
+
+	if (t->direction != direction) {
+		t->direction &= ~direction;
+	} else {
+		list_del(&t->list_node);
+		kfree(t);
+	}
+
+	__thermal_zone_device_update(tz, THERMAL_TZ_DEL_THRESHOLD);
+
+	return 0;
+}
+
+/* Call @cb on each threshold of @tz; the first non-zero result ends the walk. */
+int thermal_thresholds_for_each(struct thermal_zone_device *tz,
+				int (*cb)(struct user_threshold *, void *arg), void *arg)
+{
+	struct user_threshold *pos;
+	int err = 0;
+
+	lockdep_assert_held(&tz->lock);
+
+	list_for_each_entry(pos, &tz->user_thresholds, list_node) {
+		err = cb(pos, arg);
+		if (err)
+			break;
+	}
+
+	return err;
+}
diff --git a/drivers/thermal/thermal_thresholds.h b/drivers/thermal/thermal_thresholds.h
new file mode 100644
index 000000000000..232f4e8089af
--- /dev/null
+++ b/drivers/thermal/thermal_thresholds.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __THERMAL_THRESHOLDS_H__
+#define __THERMAL_THRESHOLDS_H__
+
+struct user_threshold {
+	struct list_head list_node;	/* link in thermal_zone_device::user_thresholds */
+	int temperature;		/* temperature the threshold is set at */
+	int direction;			/* mask of THERMAL_THRESHOLD_WAY_* bits */
+};
+
+int thermal_thresholds_init(struct thermal_zone_device *tz);
+void thermal_thresholds_exit(struct thermal_zone_device *tz);
+void thermal_thresholds_flush(struct thermal_zone_device *tz);
+void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high);
+int thermal_thresholds_add(struct thermal_zone_device *tz, int temperature, int direction);
+int thermal_thresholds_delete(struct thermal_zone_device *tz, int temperature, int direction);
+int thermal_thresholds_for_each(struct thermal_zone_device *tz,
+				int (*cb)(struct user_threshold *, void *arg), void *arg);
+#endif
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 25ea8fe2313e..bcaa92732e14 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -56,6 +56,9 @@ enum thermal_notify_event {
 	THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */
 	THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */
 	THERMAL_TZ_RESUME, /* Thermal zone is resuming after system sleep */
+	THERMAL_TZ_ADD_THRESHOLD, /* Threshold added */
+	THERMAL_TZ_DEL_THRESHOLD, /* Threshold deleted */
+	THERMAL_TZ_FLUSH_THRESHOLDS, /* All thresholds deleted */
 };
 
 /**
diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h
index fc78bf3aead7..2e6f60a36173 100644
--- a/include/uapi/linux/thermal.h
+++ b/include/uapi/linux/thermal.h
@@ -3,6 +3,8 @@
 #define _UAPI_LINUX_THERMAL_H
 
 #define THERMAL_NAME_LENGTH	20
+#define THERMAL_THRESHOLD_WAY_UP	0x1 /* literal: BIT() is unavailable to userspace */
+#define THERMAL_THRESHOLD_WAY_DOWN	0x2 /* may be ORed with THERMAL_THRESHOLD_WAY_UP */
 
 enum thermal_device_mode {
 	THERMAL_DEVICE_DISABLED = 0,
-- 
cgit v1.2.3


From 06f5531958dd5decaabb21c6fa1da3dcaf8dfc24 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Mon, 26 Aug 2024 17:24:09 +0000
Subject: media: videodev2: Add flag to unconditionally enumerate pixel formats

When the index is ORed with V4L2_FMTDESC_FLAG_ENUM_ALL the
driver clears the flag and enumerates all the possible formats,
ignoring any limitations from the current configuration.
Drivers which do not support this flag yet always return EINVAL.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
[hverkuil: improved doc when the new flag is not supported by the driver]
---
 .../userspace-api/media/v4l/vidioc-enum-fmt.rst        | 18 +++++++++++++++++-
 .../userspace-api/media/videodev2.h.rst.exceptions     |  1 +
 include/uapi/linux/videodev2.h                         |  3 +++
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst
index 3adb3d205531..0f69aa04607f 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst
@@ -85,7 +85,17 @@ the ``mbus_code`` field is handled differently:
     * - __u32
       - ``index``
       - Number of the format in the enumeration, set by the application.
-	This is in no way related to the ``pixelformat`` field.
+        This is in no way related to the ``pixelformat`` field.
+        When the index is ORed with ``V4L2_FMTDESC_FLAG_ENUM_ALL`` the
+        driver clears the flag and enumerates all the possible formats,
+        ignoring any limitations from the current configuration. Drivers
+        which do not support this flag always return an ``EINVAL``
+        error code without clearing this flag.
+        Formats enumerated when using ``V4L2_FMTDESC_FLAG_ENUM_ALL`` flag
+        shouldn't be used when calling :c:func:`VIDIOC_ENUM_FRAMESIZES`
+        or :c:func:`VIDIOC_ENUM_FRAMEINTERVALS`.
+        ``V4L2_FMTDESC_FLAG_ENUM_ALL`` should only be used by drivers that
+        can return a different format list depending on this flag.
     * - __u32
       - ``type``
       - Type of the data stream, set by the application. Only these types
@@ -234,6 +244,12 @@ the ``mbus_code`` field is handled differently:
 	valid. The buffer consists of ``height`` lines, each having ``width``
 	Data Units of data and the offset (in bytes) between the beginning of
 	each two consecutive lines is ``bytesperline``.
+    * - ``V4L2_FMTDESC_FLAG_ENUM_ALL``
+      - 0x80000000
+      - When the application ORs ``index`` with the ``V4L2_FMTDESC_FLAG_ENUM_ALL`` flag
+        the driver enumerates all the possible pixel formats without taking care
+        of any already set configuration. Drivers which do not support this flag
+        always return ``EINVAL`` without clearing this flag.
 
 Return Value
 ============
diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
index d67fd4038d22..429b5cdf05c3 100644
--- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions
+++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
@@ -217,6 +217,7 @@ replace define V4L2_FMT_FLAG_CSC_YCBCR_ENC fmtdesc-flags
 replace define V4L2_FMT_FLAG_CSC_HSV_ENC fmtdesc-flags
 replace define V4L2_FMT_FLAG_CSC_QUANTIZATION fmtdesc-flags
 replace define V4L2_FMT_FLAG_META_LINE_BASED fmtdesc-flags
+replace define V4L2_FMTDESC_FLAG_ENUM_ALL fmtdesc-flags
 
 # V4L2 timecode types
 replace define V4L2_TC_TYPE_24FPS timecode-type
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 21a8aa575ea3..ded023edac70 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -907,6 +907,9 @@ struct v4l2_fmtdesc {
 #define V4L2_FMT_FLAG_CSC_QUANTIZATION		0x0100
 #define V4L2_FMT_FLAG_META_LINE_BASED		0x0200
 
+/*  Format description flag, to be ORed with the index */
+#define V4L2_FMTDESC_FLAG_ENUM_ALL		0x80000000
+
 	/* Frame Size and frame rate enumeration */
 /*
  *	F R A M E   S I Z E   E N U M E R A T I O N
-- 
cgit v1.2.3


From 516010460011ae74ac3b7383cf90ed27e2711cd6 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 11 Oct 2024 18:44:57 +0000
Subject: netdev-genl: Dump napi_defer_hard_irqs

Support dumping defer_hard_irqs for a NAPI ID.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20241011184527.16393-3-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml | 8 ++++++++
 include/uapi/linux/netdev.h             | 1 +
 net/core/netdev-genl.c                  | 6 ++++++
 tools/include/uapi/linux/netdev.h       | 1 +
 4 files changed, 16 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 08412c279297..585e87ec3c16 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -248,6 +248,13 @@ attribute-sets:
              threaded mode. If NAPI is not in threaded mode (i.e. uses normal
              softirq context), the attribute will be absent.
         type: u32
+      -
+        name: defer-hard-irqs
+        doc: The number of consecutive empty polls before IRQ deferral ends
+             and hardware IRQs are re-enabled.
+        type: u32
+        checks:
+          max: s32-max
   -
     name: queue
     attributes:
@@ -636,6 +643,7 @@ operations:
             - ifindex
             - irq
             - pid
+            - defer-hard-irqs
       dump:
         request:
           attributes:
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7c308f04e7a0..13dc0b027e86 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -122,6 +122,7 @@ enum {
 	NETDEV_A_NAPI_ID,
 	NETDEV_A_NAPI_IRQ,
 	NETDEV_A_NAPI_PID,
+	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 358cba248796..f98e5d1d0d21 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -161,6 +161,7 @@ static int
 netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 			const struct genl_info *info)
 {
+	u32 napi_defer_hard_irqs;
 	void *hdr;
 	pid_t pid;
 
@@ -189,6 +190,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 			goto nla_put_failure;
 	}
 
+	napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi);
+	if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS,
+			napi_defer_hard_irqs))
+		goto nla_put_failure;
+
 	genlmsg_end(rsp, hdr);
 
 	return 0;
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 7c308f04e7a0..13dc0b027e86 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -122,6 +122,7 @@ enum {
 	NETDEV_A_NAPI_ID,
 	NETDEV_A_NAPI_IRQ,
 	NETDEV_A_NAPI_PID,
+	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
-- 
cgit v1.2.3


From 0137891e74576f77a7901718dc0ce08ca074ae74 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 11 Oct 2024 18:44:59 +0000
Subject: netdev-genl: Dump gro_flush_timeout

Support dumping gro_flush_timeout for a NAPI ID.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20241011184527.16393-5-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml | 9 +++++++++
 include/uapi/linux/netdev.h             | 1 +
 net/core/netdev-genl.c                  | 6 ++++++
 tools/include/uapi/linux/netdev.h       | 1 +
 4 files changed, 17 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 585e87ec3c16..7b47454c51dd 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -255,6 +255,14 @@ attribute-sets:
         type: u32
         checks:
           max: s32-max
+      -
+        name: gro-flush-timeout
+        doc: The timeout, in nanoseconds, of when to trigger the NAPI watchdog
+             timer which schedules NAPI processing. Additionally, a non-zero
+             value will also prevent GRO from flushing recent super-frames at
+             the end of a NAPI cycle. This may add receive latency in exchange
+             for reducing the number of frames processed by the network stack.
+        type: uint
   -
     name: queue
     attributes:
@@ -644,6 +652,7 @@ operations:
             - irq
             - pid
             - defer-hard-irqs
+            - gro-flush-timeout
       dump:
         request:
           attributes:
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 13dc0b027e86..cacd33359c76 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -123,6 +123,7 @@ enum {
 	NETDEV_A_NAPI_IRQ,
 	NETDEV_A_NAPI_PID,
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
+	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index f98e5d1d0d21..ac19f2e6cfbe 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -161,6 +161,7 @@ static int
 netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 			const struct genl_info *info)
 {
+	unsigned long gro_flush_timeout;
 	u32 napi_defer_hard_irqs;
 	void *hdr;
 	pid_t pid;
@@ -195,6 +196,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 			napi_defer_hard_irqs))
 		goto nla_put_failure;
 
+	gro_flush_timeout = napi_get_gro_flush_timeout(napi);
+	if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+			 gro_flush_timeout))
+		goto nla_put_failure;
+
 	genlmsg_end(rsp, hdr);
 
 	return 0;
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 13dc0b027e86..cacd33359c76 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -123,6 +123,7 @@ enum {
 	NETDEV_A_NAPI_IRQ,
 	NETDEV_A_NAPI_PID,
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
+	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
-- 
cgit v1.2.3


From 1287c1ae0fc227e5acef11a539eb4e75646e31c7 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Fri, 11 Oct 2024 18:45:01 +0000
Subject: netdev-genl: Support setting per-NAPI config values

Add support to set per-NAPI defer_hard_irqs and gro_flush_timeout.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20241011184527.16393-7-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml | 11 ++++++++
 include/uapi/linux/netdev.h             |  1 +
 net/core/netdev-genl-gen.c              | 18 +++++++++++++
 net/core/netdev-genl-gen.h              |  1 +
 net/core/netdev-genl.c                  | 45 +++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/netdev.h       |  1 +
 6 files changed, 77 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 7b47454c51dd..f9cb97d6106c 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -693,6 +693,17 @@ operations:
         reply:
           attributes:
             - id
+    -
+      name: napi-set
+      doc: Set configurable NAPI instance settings.
+      attribute-set: napi
+      flags: [ admin-perm ]
+      do:
+        request:
+          attributes:
+            - id
+            - defer-hard-irqs
+            - gro-flush-timeout
 
 kernel-family:
   headers: [ "linux/list.h"]
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index cacd33359c76..e3ebb49f60d2 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -201,6 +201,7 @@ enum {
 	NETDEV_CMD_NAPI_GET,
 	NETDEV_CMD_QSTATS_GET,
 	NETDEV_CMD_BIND_RX,
+	NETDEV_CMD_NAPI_SET,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index b28424ae06d5..e197bd84997c 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -22,6 +22,10 @@ static const struct netlink_range_validation netdev_a_page_pool_ifindex_range =
 	.max	= 2147483647ULL,
 };
 
+static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
+	.max	= 2147483647ULL,	/* s32-max, the limit given for defer-hard-irqs in netdev.yaml */
+};
+
 /* Common nested types */
 const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
 	[NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
@@ -87,6 +91,13 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1]
 	[NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
 };
 
+/* NETDEV_CMD_NAPI_SET - do */
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = {
+	[NETDEV_A_NAPI_ID] = { .type = NLA_U32, },	/* required by netdev_nl_napi_set_doit() */
+	[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
+	[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },	/* no range check attached */
+};
+
 /* Ops table for netdev */
 static const struct genl_split_ops netdev_nl_ops[] = {
 	{
@@ -171,6 +182,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.maxattr	= NETDEV_A_DMABUF_FD,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
+	{
+		.cmd		= NETDEV_CMD_NAPI_SET,
+		.doit		= netdev_nl_napi_set_doit,
+		.policy		= netdev_napi_set_nl_policy,
+		.maxattr	= NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
 };
 
 static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index 8cda334fd042..e09dd7539ff2 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
 				struct netlink_callback *cb);
 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
 
 enum {
 	NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index ac19f2e6cfbe..b49c3b4e5fbe 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -303,6 +303,51 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 	return err;
 }
 
+/*
+ * Apply the defer-hard-irqs and gro-flush-timeout attributes found in @info to
+ * @napi; an attribute that is absent leaves its setting untouched. Returns 0.
+ */
+static int
+netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
+{
+	struct nlattr *defer_attr = info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS];
+	struct nlattr *timeout_attr = info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT];
+
+	if (defer_attr)
+		napi_set_defer_hard_irqs(napi, nla_get_u32(defer_attr));
+
+	if (timeout_attr)
+		napi_set_gro_flush_timeout(napi, nla_get_uint(timeout_attr));
+
+	return 0;
+}
+
+/*
+ * NETDEV_CMD_NAPI_SET handler: look up the NAPI named by NETDEV_A_NAPI_ID under
+ * RTNL and apply the requested settings. Returns -EINVAL when the ID attribute
+ * is missing and -ENOENT when no NAPI has that ID.
+ */
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct napi_struct *napi;
+	int err = -ENOENT;
+
+	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+		return -EINVAL;
+
+	rtnl_lock();
+
+	napi = napi_by_id(nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]));
+	if (napi)
+		err = netdev_nl_napi_set_config(napi, info);
+	else
+		NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+
+	rtnl_unlock();
+
+	return err;
+}
+
 static int
 netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
 			 u32 q_idx, u32 q_type, const struct genl_info *info)
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index cacd33359c76..e3ebb49f60d2 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -201,6 +201,7 @@ enum {
 	NETDEV_CMD_NAPI_GET,
 	NETDEV_CMD_QSTATS_GET,
 	NETDEV_CMD_BIND_RX,
+	NETDEV_CMD_NAPI_SET,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
-- 
cgit v1.2.3


From 6390834c6f9b2c5e33f52f34579efa0d0df073db Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Date: Thu, 3 Oct 2024 13:31:10 +0300
Subject: media: uapi: Add meta formats for PiSP FE config and stats

Add two meta formats for PiSP FE: V4L2_META_FMT_RPI_FE_CFG and
V4L2_META_FMT_RPI_FE_STATS. The former is used to provide configuration
for the FE and the latter is used to read the statistics from the FE.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 .../userspace-api/media/v4l/meta-formats.rst       |  1 +
 .../userspace-api/media/v4l/metafmt-pisp-fe.rst    | 39 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c               |  2 ++
 include/uapi/linux/videodev2.h                     |  2 ++
 4 files changed, 44 insertions(+)
 create mode 100644 Documentation/userspace-api/media/v4l/metafmt-pisp-fe.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/meta-formats.rst b/Documentation/userspace-api/media/v4l/meta-formats.rst
index c6e56b5888bc..86ffb3bc8ade 100644
--- a/Documentation/userspace-api/media/v4l/meta-formats.rst
+++ b/Documentation/userspace-api/media/v4l/meta-formats.rst
@@ -16,6 +16,7 @@ These formats are used for the :ref:`metadata` interface only.
     metafmt-generic
     metafmt-intel-ipu3
     metafmt-pisp-be
+    metafmt-pisp-fe
     metafmt-rkisp1
     metafmt-uvc
     metafmt-vivid
diff --git a/Documentation/userspace-api/media/v4l/metafmt-pisp-fe.rst b/Documentation/userspace-api/media/v4l/metafmt-pisp-fe.rst
new file mode 100644
index 000000000000..fddeada83e4a
--- /dev/null
+++ b/Documentation/userspace-api/media/v4l/metafmt-pisp-fe.rst
@@ -0,0 +1,39 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _v4l2-meta-fmt-rpi-fe-cfg:
+
+************************
+V4L2_META_FMT_RPI_FE_CFG
+************************
+
+Raspberry Pi PiSP Front End configuration format
+================================================
+
+The Raspberry Pi PiSP Front End image signal processor is configured by
+userspace by providing a buffer of configuration parameters to the
+`rp1-cfe-fe-config` output video device node using the
+:c:type:`v4l2_meta_format` interface.
+
+The `Raspberry Pi PiSP technical specification
+<https://datasheets.raspberrypi.com/camera/raspberry-pi-image-signal-processor-specification.pdf>`_
+provides a detailed description of the Front End configuration and programming
+model.
+
+.. _v4l2-meta-fmt-rpi-fe-stats:
+
+**************************
+V4L2_META_FMT_RPI_FE_STATS
+**************************
+
+Raspberry Pi PiSP Front End statistics format
+=============================================
+
+The Raspberry Pi PiSP Front End image signal processor provides statistics data
+by writing to a buffer provided via the `rp1-cfe-fe-stats` capture video device
+node using the
+:c:type:`v4l2_meta_format` interface.
+
+The `Raspberry Pi PiSP technical specification
+<https://datasheets.raspberrypi.com/camera/raspberry-pi-image-signal-processor-specification.pdf>`_
+provides a detailed description of the Front End configuration and programming
+model.
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index b9a3c6b20282..0304daa8471d 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1468,6 +1468,8 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_Y212:		descr = "12-bit YUYV Packed"; break;
 	case V4L2_PIX_FMT_Y216:		descr = "16-bit YUYV Packed"; break;
 	case V4L2_META_FMT_RPI_BE_CFG:	descr = "RPi PiSP BE Config format"; break;
+	case V4L2_META_FMT_RPI_FE_CFG:  descr = "RPi PiSP FE Config format"; break;
+	case V4L2_META_FMT_RPI_FE_STATS: descr = "RPi PiSP FE Statistics format"; break;
 	case V4L2_META_FMT_GENERIC_8:	descr = "8-bit Generic Metadata"; break;
 	case V4L2_META_FMT_GENERIC_CSI2_10:	descr = "8-bit Generic Meta, 10b CSI-2"; break;
 	case V4L2_META_FMT_GENERIC_CSI2_12:	descr = "8-bit Generic Meta, 12b CSI-2"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index ded023edac70..e7c4dce39007 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -860,6 +860,8 @@ struct v4l2_pix_format {
 
 /* Vendor specific - used for RaspberryPi PiSP */
 #define V4L2_META_FMT_RPI_BE_CFG	v4l2_fourcc('R', 'P', 'B', 'C') /* PiSP BE configuration */
+#define V4L2_META_FMT_RPI_FE_CFG	v4l2_fourcc('R', 'P', 'F', 'C') /* PiSP FE configuration */
+#define V4L2_META_FMT_RPI_FE_STATS	v4l2_fourcc('R', 'P', 'F', 'S') /* PiSP FE stats */
 
 #ifdef __KERNEL__
 /*
-- 
cgit v1.2.3


From 6edb685abb2af445773876a326292b989dcb3c9f Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Date: Thu, 3 Oct 2024 13:31:12 +0300
Subject: media: raspberrypi: Add support for RP1-CFE

Add support for Raspberry Pi CFE. The CFE is a hardware block that
contains:

- MIPI D-PHY
- MIPI CSI-2 receiver
- Front End ISP (FE)

The driver has been ported from the Raspberry Pi kernel commit
88a681df9623 ("ARM: dts: bcm2712-rpi: Add i2c<n>_pins labels").

Co-developed-by: Naushir Patuck <naush@raspberrypi.com>
Signed-off-by: Naushir Patuck <naush@raspberrypi.com>
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 MAINTAINERS                                        |    7 +
 drivers/media/platform/raspberrypi/Kconfig         |    1 +
 drivers/media/platform/raspberrypi/Makefile        |    1 +
 drivers/media/platform/raspberrypi/rp1-cfe/Kconfig |   15 +
 .../media/platform/raspberrypi/rp1-cfe/Makefile    |    6 +
 .../media/platform/raspberrypi/rp1-cfe/cfe-fmts.h  |  332 +++
 .../media/platform/raspberrypi/rp1-cfe/cfe-trace.h |  202 ++
 drivers/media/platform/raspberrypi/rp1-cfe/cfe.c   | 2504 ++++++++++++++++++++
 drivers/media/platform/raspberrypi/rp1-cfe/cfe.h   |   43 +
 drivers/media/platform/raspberrypi/rp1-cfe/csi2.c  |  586 +++++
 drivers/media/platform/raspberrypi/rp1-cfe/csi2.h  |   89 +
 drivers/media/platform/raspberrypi/rp1-cfe/dphy.c  |  181 ++
 drivers/media/platform/raspberrypi/rp1-cfe/dphy.h  |   27 +
 .../media/platform/raspberrypi/rp1-cfe/pisp-fe.c   |  605 +++++
 .../media/platform/raspberrypi/rp1-cfe/pisp-fe.h   |   53 +
 .../uapi/linux/media/raspberrypi/pisp_fe_config.h  |  273 +++
 .../linux/media/raspberrypi/pisp_fe_statistics.h   |   64 +
 17 files changed, 4989 insertions(+)
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/Kconfig
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/Makefile
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/cfe-fmts.h
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/cfe-trace.h
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/cfe.c
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/cfe.h
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/csi2.c
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/csi2.h
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/dphy.c
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/dphy.h
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/pisp-fe.c
 create mode 100644 drivers/media/platform/raspberrypi/rp1-cfe/pisp-fe.h
 create mode 100644 include/uapi/linux/media/raspberrypi/pisp_fe_config.h
 create mode 100644 include/uapi/linux/media/raspberrypi/pisp_fe_statistics.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index a08759706711..7a14891a8fa9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19372,6 +19372,13 @@ F:	Documentation/devicetree/bindings/media/raspberrypi,pispbe.yaml
 F:	drivers/media/platform/raspberrypi/pisp_be/
 F:	include/uapi/linux/media/raspberrypi/
 
+RASPBERRY PI PISP CAMERA FRONT END
+M:	Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
+M:	Raspberry Pi Kernel Maintenance <kernel-list@raspberrypi.com>
+S:	Maintained
+F:	Documentation/devicetree/bindings/media/raspberrypi,rp1-cfe.yaml
+F:	drivers/media/platform/raspberrypi/rp1-cfe/
+
 RC-CORE / LIRC FRAMEWORK
 M:	Sean Young <sean@mess.org>
 L:	linux-media@vger.kernel.org
diff --git a/drivers/media/platform/raspberrypi/Kconfig b/drivers/media/platform/raspberrypi/Kconfig
index e928f979019e..bd5101ffefb5 100644
--- a/drivers/media/platform/raspberrypi/Kconfig
+++ b/drivers/media/platform/raspberrypi/Kconfig
@@ -3,3 +3,4 @@
 comment "Raspberry Pi media platform drivers"
 
 source "drivers/media/platform/raspberrypi/pisp_be/Kconfig"
+source "drivers/media/platform/raspberrypi/rp1-cfe/Kconfig"
diff --git a/drivers/media/platform/raspberrypi/Makefile b/drivers/media/platform/raspberrypi/Makefile
index c0d1a2dab486..af7fde84eefe 100644
--- a/drivers/media/platform/raspberrypi/Makefile
+++ b/drivers/media/platform/raspberrypi/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-y += pisp_be/
+obj-y += rp1-cfe/
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/Kconfig b/drivers/media/platform/raspberrypi/rp1-cfe/Kconfig
new file mode 100644
index 000000000000..327b61f1134b
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/Kconfig
@@ -0,0 +1,15 @@
+# RP1 V4L2 camera support
+
+config VIDEO_RP1_CFE
+	tristate "Raspberry Pi RP1 Camera Front End (CFE) video capture driver"
+	depends on VIDEO_DEV
+	depends on PM
+	select VIDEO_V4L2_SUBDEV_API
+	select MEDIA_CONTROLLER
+	select VIDEOBUF2_DMA_CONTIG
+	select V4L2_FWNODE
+	help
+	  Say Y here to enable support for the Raspberry Pi RP1 Camera Front End.
+
+	  To compile this driver as a module, choose M here. The module will be
+	  called rp1-cfe.
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/Makefile b/drivers/media/platform/raspberrypi/rp1-cfe/Makefile
new file mode 100644
index 000000000000..3f0d0fc8570e
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for RP1 Camera Front End driver
+#
+rp1-cfe-objs := cfe.o csi2.o pisp-fe.o dphy.o
+obj-$(CONFIG_VIDEO_RP1_CFE) += rp1-cfe.o
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/cfe-fmts.h b/drivers/media/platform/raspberrypi/rp1-cfe/cfe-fmts.h
new file mode 100644
index 000000000000..7aecf7f83733
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/cfe-fmts.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RP1 Camera Front End formats definition
+ *
+ * Copyright (C) 2021-2024 - Raspberry Pi Ltd.
+ */
+#ifndef _CFE_FMTS_H_
+#define _CFE_FMTS_H_
+
+#include "cfe.h"
+#include <media/mipi-csi2.h>
+
+static const struct cfe_fmt formats[] = {
+	/* YUV Formats */
+	{
+		.fourcc = V4L2_PIX_FMT_YUYV,
+		.code = MEDIA_BUS_FMT_YUYV8_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_YUV422_8B,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_UYVY,
+		.code = MEDIA_BUS_FMT_UYVY8_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_YUV422_8B,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_YVYU,
+		.code = MEDIA_BUS_FMT_YVYU8_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_YUV422_8B,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_VYUY,
+		.code = MEDIA_BUS_FMT_VYUY8_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_YUV422_8B,
+	},
+	{
+		/* RGB Formats */
+		.fourcc = V4L2_PIX_FMT_RGB565, /* gggbbbbb rrrrrggg */
+		.code = MEDIA_BUS_FMT_RGB565_2X8_LE,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RGB565,
+	},
+	{	.fourcc = V4L2_PIX_FMT_RGB565X, /* rrrrrggg gggbbbbb */
+		.code = MEDIA_BUS_FMT_RGB565_2X8_BE,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RGB565,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_RGB555, /* gggbbbbb arrrrrgg */
+		.code = MEDIA_BUS_FMT_RGB555_2X8_PADHI_LE,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RGB555,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_RGB555X, /* arrrrrgg gggbbbbb */
+		.code = MEDIA_BUS_FMT_RGB555_2X8_PADHI_BE,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RGB555,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_RGB24, /* rgb */
+		.code = MEDIA_BUS_FMT_RGB888_1X24,
+		.depth = 24,
+		.csi_dt = MIPI_CSI2_DT_RGB888,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_BGR24, /* bgr */
+		.code = MEDIA_BUS_FMT_BGR888_1X24,
+		.depth = 24,
+		.csi_dt = MIPI_CSI2_DT_RGB888,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_RGB32, /* argb */
+		.code = MEDIA_BUS_FMT_ARGB8888_1X32,
+		.depth = 32,
+		.csi_dt = 0x0,
+	},
+
+	/* Bayer Formats */
+	{
+		.fourcc = V4L2_PIX_FMT_SBGGR8,
+		.code = MEDIA_BUS_FMT_SBGGR8_1X8,
+		.depth = 8,
+		.csi_dt = MIPI_CSI2_DT_RAW8,
+		.remap = { V4L2_PIX_FMT_SBGGR16, V4L2_PIX_FMT_PISP_COMP1_BGGR },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGBRG8,
+		.code = MEDIA_BUS_FMT_SGBRG8_1X8,
+		.depth = 8,
+		.csi_dt = MIPI_CSI2_DT_RAW8,
+		.remap = { V4L2_PIX_FMT_SGBRG16, V4L2_PIX_FMT_PISP_COMP1_GBRG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGRBG8,
+		.code = MEDIA_BUS_FMT_SGRBG8_1X8,
+		.depth = 8,
+		.csi_dt = MIPI_CSI2_DT_RAW8,
+		.remap = { V4L2_PIX_FMT_SGRBG16, V4L2_PIX_FMT_PISP_COMP1_GRBG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SRGGB8,
+		.code = MEDIA_BUS_FMT_SRGGB8_1X8,
+		.depth = 8,
+		.csi_dt = MIPI_CSI2_DT_RAW8,
+		.remap = { V4L2_PIX_FMT_SRGGB16, V4L2_PIX_FMT_PISP_COMP1_RGGB },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SBGGR10P,
+		.code = MEDIA_BUS_FMT_SBGGR10_1X10,
+		.depth = 10,
+		.csi_dt = MIPI_CSI2_DT_RAW10,
+		.remap = { V4L2_PIX_FMT_SBGGR16, V4L2_PIX_FMT_PISP_COMP1_BGGR },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGBRG10P,
+		.code = MEDIA_BUS_FMT_SGBRG10_1X10,
+		.depth = 10,
+		.csi_dt = MIPI_CSI2_DT_RAW10,
+		.remap = { V4L2_PIX_FMT_SGBRG16, V4L2_PIX_FMT_PISP_COMP1_GBRG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGRBG10P,
+		.code = MEDIA_BUS_FMT_SGRBG10_1X10,
+		.depth = 10,
+		.csi_dt = MIPI_CSI2_DT_RAW10,
+		.remap = { V4L2_PIX_FMT_SGRBG16, V4L2_PIX_FMT_PISP_COMP1_GRBG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SRGGB10P,
+		.code = MEDIA_BUS_FMT_SRGGB10_1X10,
+		.depth = 10,
+		.csi_dt = MIPI_CSI2_DT_RAW10,
+		.remap = { V4L2_PIX_FMT_SRGGB16, V4L2_PIX_FMT_PISP_COMP1_RGGB },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SBGGR12P,
+		.code = MEDIA_BUS_FMT_SBGGR12_1X12,
+		.depth = 12,
+		.csi_dt = MIPI_CSI2_DT_RAW12,
+		.remap = { V4L2_PIX_FMT_SBGGR16, V4L2_PIX_FMT_PISP_COMP1_BGGR },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGBRG12P,
+		.code = MEDIA_BUS_FMT_SGBRG12_1X12,
+		.depth = 12,
+		.csi_dt = MIPI_CSI2_DT_RAW12,
+		.remap = { V4L2_PIX_FMT_SGBRG16, V4L2_PIX_FMT_PISP_COMP1_GBRG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGRBG12P,
+		.code = MEDIA_BUS_FMT_SGRBG12_1X12,
+		.depth = 12,
+		.csi_dt = MIPI_CSI2_DT_RAW12,
+		.remap = { V4L2_PIX_FMT_SGRBG16, V4L2_PIX_FMT_PISP_COMP1_GRBG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SRGGB12P,
+		.code = MEDIA_BUS_FMT_SRGGB12_1X12,
+		.depth = 12,
+		.csi_dt = MIPI_CSI2_DT_RAW12,
+		.remap = { V4L2_PIX_FMT_SRGGB16, V4L2_PIX_FMT_PISP_COMP1_RGGB },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SBGGR14P,
+		.code = MEDIA_BUS_FMT_SBGGR14_1X14,
+		.depth = 14,
+		.csi_dt = MIPI_CSI2_DT_RAW14,
+		.remap = { V4L2_PIX_FMT_SBGGR16, V4L2_PIX_FMT_PISP_COMP1_BGGR },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGBRG14P,
+		.code = MEDIA_BUS_FMT_SGBRG14_1X14,
+		.depth = 14,
+		.csi_dt = MIPI_CSI2_DT_RAW14,
+		.remap = { V4L2_PIX_FMT_SGBRG16, V4L2_PIX_FMT_PISP_COMP1_GBRG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGRBG14P,
+		.code = MEDIA_BUS_FMT_SGRBG14_1X14,
+		.depth = 14,
+		.csi_dt = MIPI_CSI2_DT_RAW14,
+		.remap = { V4L2_PIX_FMT_SGRBG16, V4L2_PIX_FMT_PISP_COMP1_GRBG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SRGGB14P,
+		.code = MEDIA_BUS_FMT_SRGGB14_1X14,
+		.depth = 14,
+		.csi_dt = MIPI_CSI2_DT_RAW14,
+		.remap = { V4L2_PIX_FMT_SRGGB16, V4L2_PIX_FMT_PISP_COMP1_RGGB },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SBGGR16,
+		.code = MEDIA_BUS_FMT_SBGGR16_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RAW16,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+		.remap = { V4L2_PIX_FMT_SBGGR16, V4L2_PIX_FMT_PISP_COMP1_BGGR },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGBRG16,
+		.code = MEDIA_BUS_FMT_SGBRG16_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RAW16,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+		.remap = { V4L2_PIX_FMT_SGBRG16, V4L2_PIX_FMT_PISP_COMP1_GBRG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SGRBG16,
+		.code = MEDIA_BUS_FMT_SGRBG16_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RAW16,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+		.remap = { V4L2_PIX_FMT_SGRBG16, V4L2_PIX_FMT_PISP_COMP1_GRBG },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_SRGGB16,
+		.code = MEDIA_BUS_FMT_SRGGB16_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RAW16,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+		.remap = { V4L2_PIX_FMT_SRGGB16, V4L2_PIX_FMT_PISP_COMP1_RGGB },
+	},
+	/* PiSP Compressed Mode 1 */
+	{
+		.fourcc = V4L2_PIX_FMT_PISP_COMP1_RGGB,
+		.code = MEDIA_BUS_FMT_SRGGB16_1X16,
+		.depth = 8,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_PISP_COMP1_BGGR,
+		.code = MEDIA_BUS_FMT_SBGGR16_1X16,
+		.depth = 8,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_PISP_COMP1_GBRG,
+		.code = MEDIA_BUS_FMT_SGBRG16_1X16,
+		.depth = 8,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_PISP_COMP1_GRBG,
+		.code = MEDIA_BUS_FMT_SGRBG16_1X16,
+		.depth = 8,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+	},
+	/* Greyscale format */
+	{
+		.fourcc = V4L2_PIX_FMT_GREY,
+		.code = MEDIA_BUS_FMT_Y8_1X8,
+		.depth = 8,
+		.csi_dt = MIPI_CSI2_DT_RAW8,
+		.remap = { V4L2_PIX_FMT_Y16, V4L2_PIX_FMT_PISP_COMP1_MONO },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_Y10P,
+		.code = MEDIA_BUS_FMT_Y10_1X10,
+		.depth = 10,
+		.csi_dt = MIPI_CSI2_DT_RAW10,
+		.remap = { V4L2_PIX_FMT_Y16, V4L2_PIX_FMT_PISP_COMP1_MONO },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_Y12P,
+		.code = MEDIA_BUS_FMT_Y12_1X12,
+		.depth = 12,
+		.csi_dt = MIPI_CSI2_DT_RAW12,
+		.remap = { V4L2_PIX_FMT_Y16, V4L2_PIX_FMT_PISP_COMP1_MONO },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_Y14P,
+		.code = MEDIA_BUS_FMT_Y14_1X14,
+		.depth = 14,
+		.csi_dt = MIPI_CSI2_DT_RAW14,
+		.remap = { V4L2_PIX_FMT_Y16, V4L2_PIX_FMT_PISP_COMP1_MONO },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_Y16,
+		.code = MEDIA_BUS_FMT_Y16_1X16,
+		.depth = 16,
+		.csi_dt = MIPI_CSI2_DT_RAW16,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+		.remap = { V4L2_PIX_FMT_Y16, V4L2_PIX_FMT_PISP_COMP1_MONO },
+	},
+	{
+		.fourcc = V4L2_PIX_FMT_PISP_COMP1_MONO,
+		.code = MEDIA_BUS_FMT_Y16_1X16,
+		.depth = 8,
+		.flags = CFE_FORMAT_FLAG_FE_OUT,
+	},
+
+	/* Embedded data formats */
+	{
+		.fourcc = V4L2_META_FMT_GENERIC_8,
+		.code = MEDIA_BUS_FMT_META_8,
+		.depth = 8,
+		.csi_dt = MIPI_CSI2_DT_EMBEDDED_8B,
+		.flags = CFE_FORMAT_FLAG_META_CAP,
+	},
+	{
+		.fourcc = V4L2_META_FMT_GENERIC_CSI2_10,
+		.code = MEDIA_BUS_FMT_META_10,
+		.depth = 10,
+		.csi_dt = MIPI_CSI2_DT_EMBEDDED_8B,
+		.flags = CFE_FORMAT_FLAG_META_CAP,
+	},
+	{
+		.fourcc = V4L2_META_FMT_GENERIC_CSI2_12,
+		.code = MEDIA_BUS_FMT_META_12,
+		.depth = 12,
+		.csi_dt = MIPI_CSI2_DT_EMBEDDED_8B,
+		.flags = CFE_FORMAT_FLAG_META_CAP,
+	},
+
+	/* Frontend formats */
+	{
+		.fourcc = V4L2_META_FMT_RPI_FE_CFG,
+		.code = MEDIA_BUS_FMT_FIXED,
+		.flags = CFE_FORMAT_FLAG_META_OUT,
+	},
+	{
+		.fourcc = V4L2_META_FMT_RPI_FE_STATS,
+		.code = MEDIA_BUS_FMT_FIXED,
+		.flags = CFE_FORMAT_FLAG_META_CAP,
+	},
+};
+
+#endif /* _CFE_FMTS_H_ */
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/cfe-trace.h b/drivers/media/platform/raspberrypi/rp1-cfe/cfe-trace.h
new file mode 100644
index 000000000000..1a36259f51b7
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/cfe-trace.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Raspberry Pi Ltd.
+ * Copyright (c) 2024 Ideas on Board Oy
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cfe
+
+#if !defined(_CFE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _CFE_TRACE_H
+
+#include <linux/tracepoint.h>
+#include <media/videobuf2-v4l2.h>
+
+TRACE_EVENT(cfe_return_buffer,
+	TP_PROTO(u32 node_id, u32 buf_idx, u32 queue_id),
+	TP_ARGS(node_id, buf_idx, queue_id),
+	TP_STRUCT__entry(
+		__field(u32, node_id)
+		__field(u32, buf_idx)
+		__field(u32, queue_id)
+	),
+	TP_fast_assign(
+		__entry->node_id = node_id;
+		__entry->buf_idx = buf_idx;
+		__entry->queue_id = queue_id;
+	),
+	TP_printk("node=%u buf=%u, queue=%u", __entry->node_id,
+		  __entry->buf_idx, __entry->queue_id)
+);
+
+DECLARE_EVENT_CLASS(cfe_buffer_template,
+	TP_PROTO(u32 node_id, struct vb2_buffer *buf),
+	TP_ARGS(node_id, buf),
+	TP_STRUCT__entry(
+		__field(u32, node_id)
+		__field(u32, buf_idx)
+	),
+	TP_fast_assign(
+		__entry->node_id = node_id;
+		__entry->buf_idx = buf->index;
+	),
+	TP_printk("node=%u buf=%u", __entry->node_id, __entry->buf_idx)
+);
+
+DEFINE_EVENT(cfe_buffer_template, cfe_buffer_prepare,
+	TP_PROTO(u32 node_id, struct vb2_buffer *buf),
+	TP_ARGS(node_id, buf));
+
+TRACE_EVENT(cfe_buffer_queue,
+	TP_PROTO(u32 node_id, struct vb2_buffer *buf, bool schedule_now),
+	TP_ARGS(node_id, buf, schedule_now),
+	TP_STRUCT__entry(
+		__field(u32, node_id)
+		__field(u32, buf_idx)
+		__field(bool, schedule_now)
+	),
+	TP_fast_assign(
+		__entry->node_id = node_id;
+		__entry->buf_idx = buf->index;
+		__entry->schedule_now = schedule_now;
+	),
+	TP_printk("node=%u buf=%u%s", __entry->node_id, __entry->buf_idx,
+		  __entry->schedule_now ? " schedule immediately" : "")
+);
+
+DEFINE_EVENT(cfe_buffer_template, cfe_csi2_schedule,
+	TP_PROTO(u32 node_id, struct vb2_buffer *buf),
+	TP_ARGS(node_id, buf));
+
+DEFINE_EVENT(cfe_buffer_template, cfe_fe_schedule,
+	TP_PROTO(u32 node_id, struct vb2_buffer *buf),
+	TP_ARGS(node_id, buf));
+
+TRACE_EVENT(cfe_buffer_complete,
+	TP_PROTO(u32 node_id, struct vb2_v4l2_buffer *buf),
+	TP_ARGS(node_id, buf),
+	TP_STRUCT__entry(
+		__field(u32, node_id)
+		__field(u32, buf_idx)
+		__field(u32, seq)
+		__field(u64, ts)
+	),
+	TP_fast_assign(
+		__entry->node_id = node_id;
+		__entry->buf_idx = buf->vb2_buf.index;
+		__entry->seq = buf->sequence;
+		__entry->ts = buf->vb2_buf.timestamp;
+	),
+	TP_printk("node=%u buf=%u seq=%u ts=%llu", __entry->node_id,
+		  __entry->buf_idx, __entry->seq, __entry->ts)
+);
+
+TRACE_EVENT(cfe_frame_start,
+	TP_PROTO(u32 node_id, u32 fs_count),
+	TP_ARGS(node_id, fs_count),
+	TP_STRUCT__entry(
+		__field(u32, node_id)
+		__field(u32, fs_count)
+	),
+	TP_fast_assign(
+		__entry->node_id = node_id;
+		__entry->fs_count = fs_count;
+	),
+	TP_printk("node=%u fs_count=%u", __entry->node_id, __entry->fs_count)
+);
+
+TRACE_EVENT(cfe_frame_end,
+	TP_PROTO(u32 node_id, u32 fs_count),
+	TP_ARGS(node_id, fs_count),
+	TP_STRUCT__entry(
+		__field(u32, node_id)
+		__field(u32, fs_count)
+	),
+	TP_fast_assign(
+		__entry->node_id = node_id;
+		__entry->fs_count = fs_count;
+	),
+	TP_printk("node=%u fs_count=%u", __entry->node_id, __entry->fs_count)
+);
+
+TRACE_EVENT(cfe_prepare_next_job,
+	TP_PROTO(bool fe_enabled),
+	TP_ARGS(fe_enabled),
+	TP_STRUCT__entry(
+		__field(bool, fe_enabled)
+	),
+	TP_fast_assign(
+		__entry->fe_enabled = fe_enabled;
+	),
+	TP_printk("fe_enabled=%u", __entry->fe_enabled)
+);
+
+/* These are copied from csi2.c */
+#define CSI2_STATUS_IRQ_FS(x)			(BIT(0) << (x))
+#define CSI2_STATUS_IRQ_FE(x)			(BIT(4) << (x))
+#define CSI2_STATUS_IRQ_FE_ACK(x)		(BIT(8) << (x))
+#define CSI2_STATUS_IRQ_LE(x)			(BIT(12) << (x))
+#define CSI2_STATUS_IRQ_LE_ACK(x)		(BIT(16) << (x))
+
+TRACE_EVENT(csi2_irq,
+	TP_PROTO(u32 channel, u32 status, u32 dbg),
+	TP_ARGS(channel, status, dbg),
+	TP_STRUCT__entry(
+		__field(u32, channel)
+		__field(u32, status)
+		__field(u32, dbg)
+	),
+	TP_fast_assign(
+		__entry->channel = channel;
+		__entry->status = status;
+		__entry->dbg = dbg;
+	),
+	TP_printk("ch=%u flags=[ %s%s%s%s%s] frame=%u line=%u",
+		  __entry->channel,
+		  (__entry->status & CSI2_STATUS_IRQ_FS(__entry->channel)) ?
+			"FS " : "",
+		  (__entry->status & CSI2_STATUS_IRQ_FE(__entry->channel)) ?
+			"FE " : "",
+		  (__entry->status & CSI2_STATUS_IRQ_FE_ACK(__entry->channel)) ?
+			"FE_ACK " : "",
+		  (__entry->status & CSI2_STATUS_IRQ_LE(__entry->channel)) ?
+			"LE " : "",
+		  (__entry->status & CSI2_STATUS_IRQ_LE_ACK(__entry->channel)) ?
+			"LE_ACK " : "",
+		  __entry->dbg >> 16, __entry->dbg & 0xffff)
+);
+
+TRACE_EVENT(fe_irq,
+	TP_PROTO(u32 status, u32 output_status, u32 frame_status,
+		 u32 error_status, u32 int_status),
+	TP_ARGS(status, output_status, frame_status, error_status, int_status),
+	TP_STRUCT__entry(
+		__field(u32, status)
+		__field(u32, output_status)
+		__field(u32, frame_status)
+		__field(u32, error_status)
+		__field(u32, int_status)
+	),
+	TP_fast_assign(
+		__entry->status = status;
+		__entry->output_status = output_status;
+		__entry->frame_status = frame_status;
+		__entry->error_status = error_status;
+		__entry->int_status = int_status;
+	),
+	TP_printk("status 0x%x out_status 0x%x frame_status 0x%x error_status 0x%x int_status 0x%x",
+		  __entry->status,
+		  __entry->output_status,
+		  __entry->frame_status,
+		  __entry->error_status,
+		  __entry->int_status)
+);
+
+#endif /* _CFE_TRACE_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE ../../drivers/media/platform/raspberrypi/rp1-cfe/cfe-trace
+#include <trace/define_trace.h>
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/cfe.c b/drivers/media/platform/raspberrypi/rp1-cfe/cfe.c
new file mode 100644
index 000000000000..045910de6c57
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/cfe.c
@@ -0,0 +1,2504 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RP1 Camera Front End Driver
+ *
+ * Copyright (c) 2021-2024 Raspberry Pi Ltd.
+ * Copyright (c) 2023-2024 Ideas on Board Oy
+ */
+
+#include <linux/clk.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/fwnode.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/lcm.h>
+#include <linux/math.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/property.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/videodev2.h>
+
+#include <media/v4l2-async.h>
+#include <media/v4l2-common.h>
+#include <media/v4l2-ctrls.h>
+#include <media/v4l2-dev.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-event.h>
+#include <media/v4l2-fwnode.h>
+#include <media/v4l2-ioctl.h>
+#include <media/v4l2-mc.h>
+#include <media/videobuf2-dma-contig.h>
+
+#include <linux/media/raspberrypi/pisp_fe_config.h>
+#include <linux/media/raspberrypi/pisp_fe_statistics.h>
+
+#include "cfe-fmts.h"
+#include "cfe.h"
+#include "csi2.h"
+#include "pisp-fe.h"
+
+#define CREATE_TRACE_POINTS
+#include "cfe-trace.h"
+
+#define CFE_MODULE_NAME	"rp1-cfe"
+#define CFE_VERSION	"1.0"
+
+#define cfe_dbg(cfe, fmt, arg...) dev_dbg(&(cfe)->pdev->dev, fmt, ##arg)
+#define cfe_info(cfe, fmt, arg...) dev_info(&(cfe)->pdev->dev, fmt, ##arg)
+#define cfe_err(cfe, fmt, arg...) dev_err(&(cfe)->pdev->dev, fmt, ##arg)
+
+/* MIPICFG registers */
+#define MIPICFG_CFG		0x004
+#define MIPICFG_INTR		0x028
+#define MIPICFG_INTE		0x02c
+#define MIPICFG_INTF		0x030
+#define MIPICFG_INTS		0x034
+
+#define MIPICFG_CFG_SEL_CSI	BIT(0)
+
+#define MIPICFG_INT_CSI_DMA	BIT(0)
+#define MIPICFG_INT_CSI_HOST	BIT(2)
+#define MIPICFG_INT_PISP_FE	BIT(4)
+
+#define BPL_ALIGNMENT 16
+#define MAX_BYTESPERLINE 0xffffff00
+#define MAX_BUFFER_SIZE  0xffffff00
+/*
+ * Max width is therefore determined by the max stride divided by the number of
+ * bits per pixel.
+ *
+ * However, to avoid overflow issues let's use a 16k maximum. This lets us
+ * calculate 16k * 16k * 4 with 32bits. If we need higher maximums, a careful
+ * review and adjustment of the code is needed so that it will deal with
+ * overflows correctly.
+ */
+#define MAX_WIDTH 16384
+#define MAX_HEIGHT MAX_WIDTH
+/* Define a nominal minimum image size */
+#define MIN_WIDTH 16
+#define MIN_HEIGHT 16
+
+#define MIN_META_WIDTH 4
+#define MIN_META_HEIGHT 1
+
+const struct v4l2_mbus_framefmt cfe_default_format = {
+	.width = 640,
+	.height = 480,
+	.code = MEDIA_BUS_FMT_SRGGB10_1X10,
+	.field = V4L2_FIELD_NONE,
+	.colorspace = V4L2_COLORSPACE_RAW,
+	.ycbcr_enc = V4L2_YCBCR_ENC_601,
+	.quantization = V4L2_QUANTIZATION_FULL_RANGE,
+	.xfer_func = V4L2_XFER_FUNC_NONE,
+};
+
+enum node_ids {
+	/* CSI2 HW output nodes first. */
+	CSI2_CH0,
+	CSI2_CH1,
+	CSI2_CH2,
+	CSI2_CH3,
+	/* FE only nodes from here on. */
+	FE_OUT0,
+	FE_OUT1,
+	FE_STATS,
+	FE_CONFIG,
+	NUM_NODES
+};
+
+struct node_description {
+	enum node_ids id;
+	const char *name;
+	unsigned int caps;
+	unsigned int pad_flags;
+	unsigned int link_pad;
+};
+
+/* Must match the ordering of enum node_ids */
+static const struct node_description node_desc[NUM_NODES] = {
+	[CSI2_CH0] = {
+		.name = "csi2-ch0",
+		.caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_META_CAPTURE,
+		.pad_flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT,
+		.link_pad = CSI2_PAD_FIRST_SOURCE + 0
+	},
+	/*
+	 * At the moment the main userspace component (libcamera) doesn't
+	 * support metadata with video nodes that support both video and
+	 * metadata. So for the time being this node is set to only support
+	 * V4L2_CAP_META_CAPTURE.
+	 */
+	[CSI2_CH1] = {
+		.name = "csi2-ch1",
+		.caps = V4L2_CAP_META_CAPTURE,
+		.pad_flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT,
+		.link_pad = CSI2_PAD_FIRST_SOURCE + 1
+	},
+	[CSI2_CH2] = {
+		.name = "csi2-ch2",
+		.caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_META_CAPTURE,
+		.pad_flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT,
+		.link_pad = CSI2_PAD_FIRST_SOURCE + 2
+	},
+	[CSI2_CH3] = {
+		.name = "csi2-ch3",
+		.caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_META_CAPTURE,
+		.pad_flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT,
+		.link_pad = CSI2_PAD_FIRST_SOURCE + 3
+	},
+	[FE_OUT0] = {
+		.name = "fe-image0",
+		.caps = V4L2_CAP_VIDEO_CAPTURE,
+		.pad_flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT,
+		.link_pad = FE_OUTPUT0_PAD
+	},
+	[FE_OUT1] = {
+		.name = "fe-image1",
+		.caps = V4L2_CAP_VIDEO_CAPTURE,
+		.pad_flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT,
+		.link_pad = FE_OUTPUT1_PAD
+	},
+	[FE_STATS] = {
+		.name = "fe-stats",
+		.caps = V4L2_CAP_META_CAPTURE,
+		.pad_flags = MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT,
+		.link_pad = FE_STATS_PAD
+	},
+	[FE_CONFIG] = {
+		.name = "fe-config",
+		.caps = V4L2_CAP_META_OUTPUT,
+		.pad_flags = MEDIA_PAD_FL_SOURCE | MEDIA_PAD_FL_MUST_CONNECT,
+		.link_pad = FE_CONFIG_PAD
+	},
+};
+
+#define is_fe_node(node) (((node)->id) >= FE_OUT0)
+#define is_csi2_node(node) (!is_fe_node(node))
+
+#define node_supports_image_output(node) \
+	(node_desc[(node)->id].caps & V4L2_CAP_VIDEO_CAPTURE)
+#define node_supports_meta_output(node) \
+	(node_desc[(node)->id].caps & V4L2_CAP_META_CAPTURE)
+#define node_supports_image_input(node) \
+	(node_desc[(node)->id].caps & V4L2_CAP_VIDEO_OUTPUT)
+#define node_supports_meta_input(node) \
+	(node_desc[(node)->id].caps & V4L2_CAP_META_OUTPUT)
+#define node_supports_image(node) \
+	(node_supports_image_output(node) || node_supports_image_input(node))
+#define node_supports_meta(node) \
+	(node_supports_meta_output(node) || node_supports_meta_input(node))
+
+#define is_image_output_node(node) \
+	((node)->buffer_queue.type == V4L2_BUF_TYPE_VIDEO_CAPTURE)
+#define is_image_input_node(node) \
+	((node)->buffer_queue.type == V4L2_BUF_TYPE_VIDEO_OUTPUT)
+#define is_image_node(node) \
+	(is_image_output_node(node) || is_image_input_node(node))
+#define is_meta_output_node(node) \
+	((node)->buffer_queue.type == V4L2_BUF_TYPE_META_CAPTURE)
+#define is_meta_input_node(node) \
+	((node)->buffer_queue.type == V4L2_BUF_TYPE_META_OUTPUT)
+#define is_meta_node(node) \
+	(is_meta_output_node(node) || is_meta_input_node(node))
+
+/* To track state across all nodes. */
+#define NODE_REGISTERED		BIT(0)
+#define NODE_ENABLED		BIT(1)
+#define NODE_STREAMING		BIT(2)
+#define FS_INT			BIT(3)
+#define FE_INT			BIT(4)
+#define NUM_STATES		5
+
+struct cfe_buffer {
+	struct vb2_v4l2_buffer vb;
+	struct list_head list;
+};
+
+struct cfe_config_buffer {
+	struct cfe_buffer buf;
+	struct pisp_fe_config config;
+};
+
+static inline struct cfe_buffer *to_cfe_buffer(struct vb2_buffer *vb)
+{
+	return container_of(vb, struct cfe_buffer, vb.vb2_buf);
+}
+
+static inline
+struct cfe_config_buffer *to_cfe_config_buffer(struct cfe_buffer *buf)
+{
+	return container_of(buf, struct cfe_config_buffer, buf);
+}
+
+struct cfe_node {
+	/* Node id */
+	enum node_ids id;
+	/* Pointer pointing to current v4l2_buffer */
+	struct cfe_buffer *cur_frm;
+	/* Pointer pointing to next v4l2_buffer */
+	struct cfe_buffer *next_frm;
+	/* Used to store current pixel format */
+	struct v4l2_format vid_fmt;
+	/* Used to store current meta format */
+	struct v4l2_format meta_fmt;
+	/* Buffer queue used in video-buf */
+	struct vb2_queue buffer_queue;
+	/* Queue of filled frames */
+	struct list_head dma_queue;
+	/* lock used to access this structure */
+	struct mutex lock;
+	/* Identifies video device for this channel */
+	struct video_device video_dev;
+	/* Pointer to the parent handle */
+	struct cfe_device *cfe;
+	/* Media pad for this node */
+	struct media_pad pad;
+	/* Frame-start counter */
+	unsigned int fs_count;
+	/* Timestamp of the current buffer */
+	u64 ts;
+};
+
+/* Driver state for one CFE instance. */
+struct cfe_device {
+	struct dentry *debugfs;
+	struct kref kref;
+
+	/* peripheral base address */
+	void __iomem *mipi_cfg_base;
+
+	struct clk *clk;
+
+	/* V4l2 device */
+	struct v4l2_device v4l2_dev;
+	struct media_device mdev;
+	struct media_pipeline pipe;
+
+	/* IRQ lock for node state and DMA queues */
+	spinlock_t state_lock;
+	/* Every enabled node has a buffer queued (see cfe_check_job_ready()) */
+	bool job_ready;
+	/*
+	 * A job has been handed to the hardware by cfe_prepare_next_job();
+	 * cleared again from cfe_sof_isr().
+	 */
+	bool job_queued;
+
+	/* parent device */
+	struct platform_device *pdev;
+	/* subdevice async Notifier */
+	struct v4l2_async_notifier notifier;
+
+	/* Source sub device */
+	struct v4l2_subdev *source_sd;
+	/* Source subdev's pad */
+	u32 source_pad;
+
+	struct cfe_node node[NUM_NODES];
+	/* NUM_STATES state bits per node, see check_state() and friends */
+	DECLARE_BITMAP(node_flags, NUM_STATES * NUM_NODES);
+
+	struct csi2_device csi2;
+	struct pisp_fe_device fe;
+
+	/* CSI-2 channel used by the front end, or -1 (see is_fe_enabled()) */
+	int fe_csi2_channel;
+
+	/* Mask of enabled streams */
+	u64 streams_mask;
+};
+
+/* The front end counts as enabled unless fe_csi2_channel holds -1. */
+static inline bool is_fe_enabled(struct cfe_device *cfe)
+{
+	const int no_channel = -1;
+
+	return !(cfe->fe_csi2_channel == no_channel);
+}
+
+/* Recover the cfe_device that embeds the given v4l2_device. */
+static inline struct cfe_device *to_cfe_device(struct v4l2_device *v4l2_dev)
+{
+	struct cfe_device *cfe;
+
+	cfe = container_of(v4l2_dev, struct cfe_device, v4l2_dev);
+
+	return cfe;
+}
+
+/* Read the 32-bit register at @offset from the MIPI config base. */
+static inline u32 cfg_reg_read(struct cfe_device *cfe, u32 offset)
+{
+	void __iomem *reg = cfe->mipi_cfg_base + offset;
+
+	return readl(reg);
+}
+
+/* Write @val to the 32-bit register at @offset from the MIPI config base. */
+static inline void cfg_reg_write(struct cfe_device *cfe, u32 offset, u32 val)
+{
+	void __iomem *reg = cfe->mipi_cfg_base + offset;
+
+	writel(val, reg);
+}
+
+/*
+ * Test whether every flag set in @state is also set for node @node_id.
+ *
+ * @state is a mask of the per-node state bits (NODE_*, FS_INT, FE_INT). The
+ * flags of a node live at bit offset (node_id * NUM_STATES) in
+ * cfe->node_flags.
+ *
+ * Returns true when all requested flags are set (and also when @state is 0).
+ */
+static bool check_state(struct cfe_device *cfe, unsigned long state,
+			unsigned int node_id)
+{
+	unsigned long bit;
+
+	/*
+	 * for_each_set_bit() takes the bitmap size in bits, not bytes. Only
+	 * the low NUM_STATES bits carry per-node flags, so scan exactly those.
+	 */
+	for_each_set_bit(bit, &state, NUM_STATES) {
+		if (!test_bit(bit + (node_id * NUM_STATES), cfe->node_flags))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Set every flag present in @state for node @node_id.
+ *
+ * @state is a mask of the per-node state bits (NODE_*, FS_INT, FE_INT). The
+ * flags of a node live at bit offset (node_id * NUM_STATES) in
+ * cfe->node_flags.
+ */
+static void set_state(struct cfe_device *cfe, unsigned long state,
+		      unsigned int node_id)
+{
+	unsigned long bit;
+
+	/*
+	 * for_each_set_bit() takes the bitmap size in bits, not bytes. Only
+	 * the low NUM_STATES bits carry per-node flags, so scan exactly those.
+	 */
+	for_each_set_bit(bit, &state, NUM_STATES)
+		set_bit(bit + (node_id * NUM_STATES), cfe->node_flags);
+}
+
+/*
+ * Clear every flag present in @state for node @node_id.
+ *
+ * @state is a mask of the per-node state bits (NODE_*, FS_INT, FE_INT). The
+ * flags of a node live at bit offset (node_id * NUM_STATES) in
+ * cfe->node_flags.
+ */
+static void clear_state(struct cfe_device *cfe, unsigned long state,
+			unsigned int node_id)
+{
+	unsigned long bit;
+
+	/*
+	 * for_each_set_bit() takes the bitmap size in bits, not bytes. Only
+	 * the low NUM_STATES bits carry per-node flags, so scan exactly those.
+	 */
+	for_each_set_bit(bit, &state, NUM_STATES)
+		clear_bit(bit + (node_id * NUM_STATES), cfe->node_flags);
+}
+
+/* Return true if at least one node has all the flags in @cond set. */
+static bool test_any_node(struct cfe_device *cfe, unsigned long cond)
+{
+	unsigned int id = 0;
+
+	while (id < NUM_NODES) {
+		if (check_state(cfe, cond, id))
+			return true;
+		id++;
+	}
+
+	return false;
+}
+
+/*
+ * Return true if every node that has all the flags in @precond set also has
+ * all the flags in @cond set. Nodes not matching @precond are ignored.
+ */
+static bool test_all_nodes(struct cfe_device *cfe, unsigned long precond,
+			   unsigned long cond)
+{
+	unsigned int id;
+
+	for (id = 0; id < NUM_NODES; id++) {
+		if (!check_state(cfe, precond, id))
+			continue;
+
+		if (!check_state(cfe, cond, id))
+			return false;
+	}
+
+	return true;
+}
+
+/* debugfs show callback: print the MIPICFG registers, one per line. */
+static int mipi_cfg_regs_show(struct seq_file *s, void *data)
+{
+	struct cfe_device *cfe = s->private;
+	int err;
+
+	/* Keep the device resumed for the duration of the register reads. */
+	err = pm_runtime_resume_and_get(&cfe->pdev->dev);
+	if (err)
+		return err;
+
+#define SHOW_REG(reg) \
+	seq_printf(s, #reg " \t0x%08x\n", cfg_reg_read(cfe, reg))
+	SHOW_REG(MIPICFG_CFG);
+	SHOW_REG(MIPICFG_INTR);
+	SHOW_REG(MIPICFG_INTE);
+	SHOW_REG(MIPICFG_INTF);
+	SHOW_REG(MIPICFG_INTS);
+#undef SHOW_REG
+
+	pm_runtime_put(&cfe->pdev->dev);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(mipi_cfg_regs);
+
+/* Format setup functions */
+/* Return the first formats[] entry whose media bus code is @code, or NULL. */
+const struct cfe_fmt *find_format_by_code(u32 code)
+{
+	const struct cfe_fmt *fmt = formats;
+	const struct cfe_fmt *const end = formats + ARRAY_SIZE(formats);
+
+	while (fmt != end) {
+		if (fmt->code == code)
+			return fmt;
+		fmt++;
+	}
+
+	return NULL;
+}
+
+/* Return the first formats[] entry whose fourcc is @pixelformat, or NULL. */
+const struct cfe_fmt *find_format_by_pix(u32 pixelformat)
+{
+	const struct cfe_fmt *fmt = formats;
+	const struct cfe_fmt *const end = formats + ARRAY_SIZE(formats);
+
+	while (fmt != end) {
+		if (fmt->fourcc == pixelformat)
+			return fmt;
+		fmt++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the first formats[] entry matching both the media bus @code and the
+ * @fourcc, or NULL if there is none.
+ */
+static const struct cfe_fmt *find_format_by_code_and_fourcc(u32 code,
+							    u32 fourcc)
+{
+	const struct cfe_fmt *fmt = formats;
+	const struct cfe_fmt *const end = formats + ARRAY_SIZE(formats);
+
+	for (; fmt != end; fmt++) {
+		if (fmt->code != code)
+			continue;
+		if (fmt->fourcc == fourcc)
+			return fmt;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up the media bus code of the 16 bit remapped variant of @code.
+ *
+ * Returns 0 when @code is unknown, has no 16 bit remap, or the remapped
+ * fourcc is not in the format table.
+ */
+u32 cfe_find_16bit_code(u32 code)
+{
+	const struct cfe_fmt *src = find_format_by_code(code);
+	const struct cfe_fmt *remapped;
+
+	if (!src)
+		return 0;
+	if (!src->remap[CFE_REMAP_16BIT])
+		return 0;
+
+	remapped = find_format_by_pix(src->remap[CFE_REMAP_16BIT]);
+
+	return remapped ? remapped->code : 0;
+}
+
+/*
+ * Look up the media bus code of the 8 bit compressed variant of @code.
+ *
+ * Returns 0 when @code is unknown, has no compressed remap, or the remapped
+ * fourcc is not in the format table.
+ */
+u32 cfe_find_compressed_code(u32 code)
+{
+	const struct cfe_fmt *src = find_format_by_code(code);
+	const struct cfe_fmt *remapped;
+
+	if (!src)
+		return 0;
+	if (!src->remap[CFE_REMAP_COMPRESSED])
+		return 0;
+
+	remapped = find_format_by_pix(src->remap[CFE_REMAP_COMPRESSED]);
+
+	return remapped ? remapped->code : 0;
+}
+
+/*
+ * Clamp the image size in @f to the supported range and derive bytesperline
+ * and sizeimage from it.
+ *
+ * A caller-supplied bytesperline is kept (rounded up to BPL_ALIGNMENT) only
+ * if it is larger than the minimum for the width and not above
+ * MAX_BYTESPERLINE; otherwise the minimum is used.
+ */
+static void cfe_calc_vid_format_size_bpl(struct cfe_device *cfe,
+					 const struct cfe_fmt *fmt,
+					 struct v4l2_format *f)
+{
+	struct v4l2_pix_format *pix = &f->fmt.pix;
+	unsigned int min_bpl;
+
+	v4l_bound_align_image(&pix->width, MIN_WIDTH, MAX_WIDTH, 2,
+			      &pix->height, MIN_HEIGHT, MAX_HEIGHT, 0, 0);
+
+	min_bpl = ALIGN((pix->width * fmt->depth) >> 3, BPL_ALIGNMENT);
+
+	if (pix->bytesperline <= min_bpl ||
+	    pix->bytesperline > MAX_BYTESPERLINE)
+		pix->bytesperline = min_bpl;
+	else
+		pix->bytesperline = ALIGN(pix->bytesperline, BPL_ALIGNMENT);
+
+	pix->sizeimage = pix->height * pix->bytesperline;
+
+	cfe_dbg(cfe, "%s: %p4cc size: %ux%u bpl:%u img_size:%u\n", __func__,
+		&pix->pixelformat, pix->width, pix->height,
+		pix->bytesperline, pix->sizeimage);
+}
+
+/*
+ * Clamp the metadata frame size in @f to the supported range and derive
+ * bytesperline and buffersize from it, using the bit depth of @fmt.
+ */
+static void cfe_calc_meta_format_size_bpl(struct cfe_device *cfe,
+					  const struct cfe_fmt *fmt,
+					  struct v4l2_format *f)
+{
+	struct v4l2_meta_format *meta = &f->fmt.meta;
+
+	v4l_bound_align_image(&meta->width, MIN_META_WIDTH, MAX_WIDTH, 2,
+			      &meta->height, MIN_META_HEIGHT, MAX_HEIGHT,
+			      0, 0);
+
+	meta->bytesperline = (meta->width * fmt->depth) >> 3;
+	/*
+	 * Use the meta member of the format union throughout: reading the
+	 * line length back through fmt.pix would only be correct as long as
+	 * the two union members happen to place bytesperline identically.
+	 */
+	meta->buffersize = meta->height * meta->bytesperline;
+
+	cfe_dbg(cfe, "%s: %p4cc size: %ux%u bpl:%u buf_size:%u\n", __func__,
+		&meta->dataformat, meta->width, meta->height,
+		meta->bytesperline, meta->buffersize);
+}
+
+/*
+ * For every streaming CSI-2 channel node, take the first buffer off the
+ * node's dma_queue, record it as next_frm and program it into the CSI-2 block.
+ */
+static void cfe_schedule_next_csi2_job(struct cfe_device *cfe)
+{
+	unsigned int ch;
+
+	for (ch = 0; ch < CSI2_NUM_CHANNELS; ch++) {
+		struct cfe_node *node = &cfe->node[ch];
+		struct cfe_buffer *next;
+		unsigned int stride, size;
+		dma_addr_t dma;
+
+		if (!check_state(cfe, NODE_STREAMING, ch))
+			continue;
+
+		next = list_first_entry(&node->dma_queue, struct cfe_buffer,
+					list);
+		node->next_frm = next;
+		list_del(&next->list);
+
+		trace_cfe_csi2_schedule(node->id, &next->vb.vb2_buf);
+
+		if (!is_meta_node(node)) {
+			size = node->vid_fmt.fmt.pix.sizeimage;
+			stride = node->vid_fmt.fmt.pix.bytesperline;
+		} else {
+			size = node->meta_fmt.fmt.meta.buffersize;
+			/* We use CSI2_CH_CTRL_PACK_BYTES, so stride == 0 */
+			stride = 0;
+		}
+
+		dma = vb2_dma_contig_plane_dma_addr(&next->vb.vb2_buf, 0);
+		csi2_set_buffer(&cfe->csi2, node->id, dma, stride, size);
+	}
+}
+
+/*
+ * Take the first queued buffer of every streaming front end node, record it
+ * as that node's next_frm and submit the set, together with the config held
+ * in the FE_CONFIG node's buffer, as one front end job.
+ */
+static void cfe_schedule_next_pisp_job(struct cfe_device *cfe)
+{
+	struct vb2_buffer *vb2_bufs[FE_NUM_PADS] = { 0 };
+	struct cfe_config_buffer *config_buf;
+	unsigned int id;
+
+	for (id = CSI2_NUM_CHANNELS; id < NUM_NODES; id++) {
+		struct cfe_node *node = &cfe->node[id];
+		struct cfe_buffer *next;
+
+		if (!check_state(cfe, NODE_STREAMING, id))
+			continue;
+
+		next = list_first_entry(&node->dma_queue, struct cfe_buffer,
+					list);
+
+		trace_cfe_fe_schedule(node->id, &next->vb.vb2_buf);
+
+		node->next_frm = next;
+		/* Buffers are indexed by the pad the node links to. */
+		vb2_bufs[node_desc[id].link_pad] = &next->vb.vb2_buf;
+		list_del(&next->list);
+	}
+
+	config_buf = to_cfe_config_buffer(cfe->node[FE_CONFIG].next_frm);
+	pisp_fe_submit_job(&cfe->fe, vb2_bufs, &config_buf->config);
+}
+
+/*
+ * A job is ready when no enabled node has an empty dma_queue, i.e. every
+ * enabled node can contribute a buffer.
+ */
+static bool cfe_check_job_ready(struct cfe_device *cfe)
+{
+	unsigned int id;
+
+	for (id = 0; id < NUM_NODES; id++) {
+		if (check_state(cfe, NODE_ENABLED, id) &&
+		    list_empty(&cfe->node[id].dma_queue))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Hand the next set of buffers to the hardware: always the CSI-2 channels,
+ * and the front end too when it is enabled. Marks a job as queued and
+ * recomputes whether a further job is already available.
+ */
+static void cfe_prepare_next_job(struct cfe_device *cfe)
+{
+	const bool fe_enabled = is_fe_enabled(cfe);
+
+	trace_cfe_prepare_next_job(fe_enabled);
+
+	cfe->job_queued = true;
+
+	cfe_schedule_next_csi2_job(cfe);
+	if (fe_enabled)
+		cfe_schedule_next_pisp_job(cfe);
+
+	/* Remember whether yet another job could follow this one. */
+	cfe->job_ready = cfe_check_job_ready(cfe);
+}
+
+/*
+ * Stamp the node's current buffer with its sequence number (frame-start
+ * count minus one) and return it to vb2 in the given @state.
+ */
+static void cfe_process_buffer_complete(struct cfe_node *node,
+					enum vb2_buffer_state state)
+{
+	struct vb2_v4l2_buffer *vbuf = &node->cur_frm->vb;
+
+	trace_cfe_buffer_complete(node->id, vbuf);
+
+	vbuf->sequence = node->fs_count - 1;
+	vb2_buffer_done(&vbuf->vb2_buf, state);
+}
+
+/* Queue a V4L2_EVENT_FRAME_SYNC event carrying the current frame sequence. */
+static void cfe_queue_event_sof(struct cfe_node *node)
+{
+	struct v4l2_event event = { 0 };
+
+	event.type = V4L2_EVENT_FRAME_SYNC;
+	event.u.frame_sync.frame_sequence = node->fs_count - 1;
+
+	v4l2_event_queue(&node->video_dev, &event);
+}
+
+/*
+ * Frame-start handling for one node.
+ *
+ * Promotes next_frm to cur_frm, bumps the frame-start counter, picks the
+ * timestamp for the frame and updates the FS_INT/FE_INT state bits. When
+ * every other streaming node has reached the same frame-start count,
+ * job_queued is cleared so that a new job may be prepared. Image output nodes
+ * additionally get a V4L2_EVENT_FRAME_SYNC event queued.
+ *
+ * Called from cfe_isr() with cfe->state_lock held.
+ */
+static void cfe_sof_isr(struct cfe_node *node)
+{
+	struct cfe_device *cfe = node->cfe;
+	bool matching_fs = true;
+
+	trace_cfe_frame_start(node->id, node->fs_count);
+
+	/*
+	 * If the sensor is producing unexpected frame event ordering over a
+	 * sustained period of time, guard against the possibility of coming
+	 * here and orphaning the cur_frm if it's not been dequeued already.
+	 * Unfortunately, there is not enough hardware state to tell if this
+	 * may have occurred.
+	 */
+	if (WARN(node->cur_frm, "%s: [%s] Orphanded frame at seq %u\n",
+		 __func__, node_desc[node->id].name, node->fs_count))
+		cfe_process_buffer_complete(node, VB2_BUF_STATE_ERROR);
+
+	node->cur_frm = node->next_frm;
+	node->next_frm = NULL;
+	node->fs_count++;
+
+	/* Default to "now"; may be replaced by another node's timestamp. */
+	node->ts = ktime_get_ns();
+	for (unsigned int i = 0; i < NUM_NODES; i++) {
+		if (!check_state(cfe, NODE_STREAMING, i) || i == node->id)
+			continue;
+		/*
+		 * This checks if any other node has seen a FS. If yes, use the
+		 * same timestamp, eventually across all node buffers.
+		 */
+		if (cfe->node[i].fs_count >= node->fs_count)
+			node->ts = cfe->node[i].ts;
+		/*
+		 * This checks if all other node have seen a matching FS. If
+		 * yes, we can flag another job to be queued.
+		 */
+		if (matching_fs && cfe->node[i].fs_count != node->fs_count)
+			matching_fs = false;
+	}
+
+	if (matching_fs)
+		cfe->job_queued = false;
+
+	/* cur_frm is NULL when no buffer had been scheduled for this frame. */
+	if (node->cur_frm)
+		node->cur_frm->vb.vb2_buf.timestamp = node->ts;
+
+	set_state(cfe, FS_INT, node->id);
+	clear_state(cfe, FE_INT, node->id);
+
+	if (is_image_output_node(node))
+		cfe_queue_event_sof(node);
+}
+
+/*
+ * Frame-end handling for one node: complete the current buffer, if any, as
+ * DONE and flip the node's state from FS_INT to FE_INT.
+ */
+static void cfe_eof_isr(struct cfe_node *node)
+{
+	struct cfe_device *cfe = node->cfe;
+
+	trace_cfe_frame_end(node->id, node->fs_count - 1);
+
+	if (node->cur_frm) {
+		cfe_process_buffer_complete(node, VB2_BUF_STATE_DONE);
+		node->cur_frm = NULL;
+	}
+
+	set_state(cfe, FE_INT, node->id);
+	clear_state(cfe, FS_INT, node->id);
+}
+
+/*
+ * Top-level interrupt handler.
+ *
+ * Reads MIPICFG_INTS, lets csi2_isr() and pisp_fe_isr() fill in the per-node
+ * sof[]/eof[] flags (the front end's flags start at index
+ * CSI2_NUM_CHANNELS), then, under state_lock, runs the frame-start and
+ * frame-end handlers for every streaming node that flagged an event and
+ * prepares the next job when one is ready and none is queued.
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t cfe_isr(int irq, void *dev)
+{
+	struct cfe_device *cfe = dev;
+	bool sof[NUM_NODES] = { 0 }, eof[NUM_NODES] = { 0 };
+	u32 sts;
+
+	sts = cfg_reg_read(cfe, MIPICFG_INTS);
+
+	if (sts & MIPICFG_INT_CSI_DMA)
+		csi2_isr(&cfe->csi2, sof, eof);
+
+	if (sts & MIPICFG_INT_PISP_FE)
+		pisp_fe_isr(&cfe->fe, sof + CSI2_NUM_CHANNELS,
+			    eof + CSI2_NUM_CHANNELS);
+
+	spin_lock(&cfe->state_lock);
+
+	for (unsigned int i = 0; i < NUM_NODES; i++) {
+		struct cfe_node *node = &cfe->node[i];
+
+		/*
+		 * The check_state(NODE_STREAMING) is to ensure we do not loop
+		 * over the CSI2_CHx nodes when the FE is active since they
+		 * generate interrupts even though the node is not streaming.
+		 */
+		if (!check_state(cfe, NODE_STREAMING, i) || !(sof[i] || eof[i]))
+			continue;
+
+		/*
+		 * There are 3 cases where we could get FS + FE_ACK at
+		 * the same time:
+		 * 1) FE of the current frame, and FS of the next frame.
+		 * 2) FS + FE of the same frame.
+		 * 3) FE of the current frame, and FS + FE of the next
+		 *    frame. To handle this, see the sof handler below.
+		 *
+		 * (1) is handled implicitly by the ordering of the FE and FS
+		 * handlers below.
+		 */
+		if (eof[i]) {
+			/*
+			 * The condition below tests for (2). Run the FS handler
+			 * first before the FE handler, both for the current
+			 * frame.
+			 */
+			if (sof[i] && !check_state(cfe, FS_INT, i)) {
+				cfe_sof_isr(node);
+				/* Consumed here; skip the sof block below. */
+				sof[i] = false;
+			}
+
+			cfe_eof_isr(node);
+		}
+
+		if (sof[i]) {
+			/*
+			 * The condition below tests for (3). In such cases, we
+			 * come in here with FS flag set in the node state from
+			 * the previous frame since it only gets cleared in
+			 * cfe_eof_isr(). Handle the FE for the previous
+			 * frame first before the FS handler for the current
+			 * frame.
+			 */
+			if (check_state(cfe, FS_INT, node->id) &&
+			    !check_state(cfe, FE_INT, node->id)) {
+				cfe_dbg(cfe, "%s: [%s] Handling missing previous FE interrupt\n",
+					__func__, node_desc[node->id].name);
+				cfe_eof_isr(node);
+			}
+
+			cfe_sof_isr(node);
+		}
+
+		/* job_queued may just have been cleared by cfe_sof_isr(). */
+		if (!cfe->job_queued && cfe->job_ready)
+			cfe_prepare_next_job(cfe);
+	}
+
+	spin_unlock(&cfe->state_lock);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Stream helpers
+ */
+
+/*
+ * Fallback used when the source cannot report a frame descriptor: assume
+ * virtual channel 0 and take the data type from the format on the CSI-2 sink
+ * pad (stream 0).
+ *
+ * Returns 0 on success, or -EINVAL if the sink format is missing or its code
+ * is not in the format table.
+ */
+static int cfe_get_vc_dt_fallback(struct cfe_device *cfe, u8 *vc, u8 *dt)
+{
+	struct v4l2_subdev_state *state =
+		v4l2_subdev_get_locked_active_state(&cfe->csi2.sd);
+	struct v4l2_mbus_framefmt *sink_fmt;
+	const struct cfe_fmt *cfe_fmt;
+
+	sink_fmt = v4l2_subdev_state_get_format(state, CSI2_PAD_SINK, 0);
+	if (!sink_fmt)
+		return -EINVAL;
+
+	cfe_fmt = find_format_by_code(sink_fmt->code);
+	if (!cfe_fmt)
+		return -EINVAL;
+
+	*dt = cfe_fmt->csi_dt;
+	*vc = 0;
+
+	return 0;
+}
+
+/*
+ * Find the CSI-2 virtual channel and data type feeding CSI-2 @channel.
+ *
+ * The sink stream routed to source pad (CSI2_PAD_FIRST_SOURCE + @channel) is
+ * looked up in the CSI-2 subdev routing table, and then searched for in the
+ * frame descriptor reported by the source subdev. If the source does not
+ * implement get_frame_desc, cfe_get_vc_dt_fallback() is used instead.
+ *
+ * The CSI-2 subdev's active state must be locked by the caller.
+ *
+ * Returns 0 and fills @vc and @dt on success, or a negative error code.
+ */
+static int cfe_get_vc_dt(struct cfe_device *cfe, unsigned int channel, u8 *vc,
+			 u8 *dt)
+{
+	struct v4l2_mbus_frame_desc remote_desc;
+	struct v4l2_subdev_state *state;
+	u32 sink_stream;
+	unsigned int i;
+	int ret;
+
+	state = v4l2_subdev_get_locked_active_state(&cfe->csi2.sd);
+
+	ret = v4l2_subdev_routing_find_opposite_end(&state->routing,
+		CSI2_PAD_FIRST_SOURCE + channel, 0, NULL, &sink_stream);
+	if (ret)
+		return ret;
+
+	ret = v4l2_subdev_call(cfe->source_sd, pad, get_frame_desc,
+			       cfe->source_pad, &remote_desc);
+	if (ret == -ENOIOCTLCMD) {
+		cfe_dbg(cfe, "source does not support get_frame_desc, use fallback\n");
+		return cfe_get_vc_dt_fallback(cfe, vc, dt);
+	} else if (ret) {
+		cfe_err(cfe, "Failed to get frame descriptor\n");
+		return ret;
+	}
+
+	if (remote_desc.type != V4L2_MBUS_FRAME_DESC_TYPE_CSI2) {
+		/* Terminate the message so it is not merged with the next. */
+		cfe_err(cfe, "Frame descriptor does not describe CSI-2 link\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < remote_desc.num_entries; i++) {
+		if (remote_desc.entry[i].stream == sink_stream)
+			break;
+	}
+
+	/* The loop ran to completion: no entry matched the stream. */
+	if (i == remote_desc.num_entries) {
+		cfe_err(cfe, "Stream %u not found in remote frame desc\n",
+			sink_stream);
+		return -EINVAL;
+	}
+
+	*vc = remote_desc.entry[i].bus.csi2.vc;
+	*dt = remote_desc.entry[i].bus.csi2.dt;
+
+	return 0;
+}
+
+/*
+ * Start the hardware for @node.
+ *
+ * When the front end is enabled and every enabled node is streaming, the
+ * front end and the CSI-2 channel feeding it are started first. If @node is a
+ * CSI-2 node, its own channel is then started in normal, remap or compressed
+ * mode depending on the node's pixel format. Finally, a job is prepared if
+ * one is ready and all enabled nodes are streaming.
+ *
+ * The CSI-2 subdev's active state must be locked by the caller.
+ *
+ * Returns 0 on success or the error from cfe_get_vc_dt().
+ */
+static int cfe_start_channel(struct cfe_node *node)
+{
+	struct cfe_device *cfe = node->cfe;
+	struct v4l2_subdev_state *state;
+	struct v4l2_mbus_framefmt *source_fmt;
+	const struct cfe_fmt *fmt;
+	unsigned long flags;
+	bool start_fe;
+	int ret;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	start_fe = is_fe_enabled(cfe) &&
+		   test_all_nodes(cfe, NODE_ENABLED, NODE_STREAMING);
+
+	state = v4l2_subdev_get_locked_active_state(&cfe->csi2.sd);
+
+	if (start_fe) {
+		unsigned int width, height;
+		u8 vc, dt;
+
+		cfe_dbg(cfe, "%s: %s using csi2 channel %d\n", __func__,
+			node_desc[FE_OUT0].name, cfe->fe_csi2_channel);
+
+		ret = cfe_get_vc_dt(cfe, cfe->fe_csi2_channel, &vc, &dt);
+		if (ret)
+			return ret;
+
+		/*
+		 * NOTE(review): neither source_fmt nor fmt is checked for NULL
+		 * before use; presumably the format was validated earlier -
+		 * confirm.
+		 */
+		source_fmt = v4l2_subdev_state_get_format(state,
+			node_desc[cfe->fe_csi2_channel].link_pad);
+		fmt = find_format_by_code(source_fmt->code);
+
+		width = source_fmt->width;
+		height = source_fmt->height;
+
+		/* Must have a valid CSI2 datatype. */
+		WARN_ON(!fmt->csi_dt);
+
+		/*
+		 * Start the associated CSI2 Channel as well.
+		 *
+		 * Must write to the ADDR register to latch the ctrl values
+		 * even if we are connected to the front end. Once running,
+		 * this is handled by the CSI2 AUTO_ARM mode.
+		 */
+		csi2_start_channel(&cfe->csi2, cfe->fe_csi2_channel,
+				   CSI2_MODE_FE_STREAMING,
+				   true, false, width, height, vc, dt);
+		csi2_set_buffer(&cfe->csi2, cfe->fe_csi2_channel, 0, 0, -1);
+		pisp_fe_start(&cfe->fe);
+	}
+
+	if (is_csi2_node(node)) {
+		unsigned int width = 0, height = 0;
+		u8 vc, dt;
+
+		ret = cfe_get_vc_dt(cfe, node->id, &vc, &dt);
+		if (ret) {
+			/* Undo the front end start done above. */
+			if (start_fe) {
+				csi2_stop_channel(&cfe->csi2,
+						  cfe->fe_csi2_channel);
+				pisp_fe_stop(&cfe->fe);
+			}
+
+			return ret;
+		}
+
+		u32 mode = CSI2_MODE_NORMAL;
+
+		/*
+		 * NOTE(review): as above, source_fmt and fmt are used
+		 * unchecked - confirm they cannot be NULL here.
+		 */
+		source_fmt = v4l2_subdev_state_get_format(state,
+			node_desc[node->id].link_pad);
+		fmt = find_format_by_code(source_fmt->code);
+
+		/* Must have a valid CSI2 datatype. */
+		WARN_ON(!fmt->csi_dt);
+
+		/* For nodes that are not image output, width/height stay 0. */
+		if (is_image_output_node(node)) {
+			u32  pixfmt;
+
+			width = source_fmt->width;
+			height = source_fmt->height;
+
+			pixfmt = node->vid_fmt.fmt.pix.pixelformat;
+
+			if (pixfmt == fmt->remap[CFE_REMAP_16BIT]) {
+				mode = CSI2_MODE_REMAP;
+			} else if (pixfmt == fmt->remap[CFE_REMAP_COMPRESSED]) {
+				mode = CSI2_MODE_COMPRESSED;
+				csi2_set_compression(&cfe->csi2, node->id,
+						     CSI2_COMPRESSION_DELTA, 0,
+						     0);
+			}
+		}
+		/* Unconditionally start this CSI2 channel. */
+		csi2_start_channel(&cfe->csi2, node->id,
+				   mode,
+				   /* Auto arm */
+				   false,
+				   /* Pack bytes */
+				   is_meta_node(node) ? true : false,
+				   width, height, vc, dt);
+	}
+
+	spin_lock_irqsave(&cfe->state_lock, flags);
+	if (cfe->job_ready && test_all_nodes(cfe, NODE_ENABLED, NODE_STREAMING))
+		cfe_prepare_next_job(cfe);
+	spin_unlock_irqrestore(&cfe->state_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Stop the hardware for @node. With @fe_stop set, the front end and the CSI-2
+ * channel feeding it are stopped first; the node's own CSI-2 channel is
+ * stopped if @node is a CSI-2 node.
+ */
+static void cfe_stop_channel(struct cfe_node *node, bool fe_stop)
+{
+	struct cfe_device *cfe = node->cfe;
+	struct csi2_device *csi2 = &cfe->csi2;
+
+	cfe_dbg(cfe, "%s: [%s] fe_stop %u\n", __func__,
+		node_desc[node->id].name, fe_stop);
+
+	if (fe_stop) {
+		csi2_stop_channel(csi2, cfe->fe_csi2_channel);
+		pisp_fe_stop(&cfe->fe);
+	}
+
+	if (!is_csi2_node(node))
+		return;
+
+	csi2_stop_channel(csi2, node->id);
+}
+
+/*
+ * Give every buffer the node still owns back to vb2 in @state: everything on
+ * dma_queue, then cur_frm and next_frm. Runs under state_lock.
+ */
+static void cfe_return_buffers(struct cfe_node *node,
+			       enum vb2_buffer_state state)
+{
+	struct cfe_device *cfe = node->cfe;
+	unsigned long irq_flags;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	spin_lock_irqsave(&cfe->state_lock, irq_flags);
+
+	/* Drain the queue from the front until nothing is left. */
+	while (!list_empty(&node->dma_queue)) {
+		struct cfe_buffer *queued;
+
+		queued = list_first_entry(&node->dma_queue, struct cfe_buffer,
+					  list);
+		list_del(&queued->list);
+		trace_cfe_return_buffer(node->id, queued->vb.vb2_buf.index, 2);
+		vb2_buffer_done(&queued->vb.vb2_buf, state);
+	}
+
+	if (node->cur_frm) {
+		trace_cfe_return_buffer(node->id,
+					node->cur_frm->vb.vb2_buf.index, 0);
+		vb2_buffer_done(&node->cur_frm->vb.vb2_buf, state);
+	}
+
+	/* Do not complete the same buffer twice. */
+	if (node->next_frm && node->cur_frm != node->next_frm) {
+		trace_cfe_return_buffer(node->id,
+					node->next_frm->vb.vb2_buf.index, 1);
+		vb2_buffer_done(&node->next_frm->vb.vb2_buf, state);
+	}
+
+	node->next_frm = NULL;
+	node->cur_frm = NULL;
+
+	spin_unlock_irqrestore(&cfe->state_lock, irq_flags);
+}
+
+/*
+ * vb2 ops
+ */
+
+/*
+ * vb2 queue_setup handler.
+ *
+ * Makes sure the queue ends up with at least three buffers, and uses a single
+ * plane sized for the node's current image or metadata format. If the caller
+ * supplies its own plane size (*nplanes != 0) it is accepted as long as it is
+ * not smaller than the format requires.
+ *
+ * Returns 0 on success or -EINVAL if the supplied plane size is too small.
+ */
+static int cfe_queue_setup(struct vb2_queue *vq, unsigned int *nbuffers,
+			   unsigned int *nplanes, unsigned int sizes[],
+			   struct device *alloc_devs[])
+{
+	struct cfe_node *node = vb2_get_drv_priv(vq);
+	struct cfe_device *cfe = node->cfe;
+	/*
+	 * Count the buffers already allocated on the queue. max_num_buffers
+	 * is the queue's capacity, not its current fill level, so testing it
+	 * would never raise the request.
+	 */
+	unsigned int q_num_bufs = vb2_get_num_buffers(vq);
+	unsigned int size = is_image_node(node) ?
+				    node->vid_fmt.fmt.pix.sizeimage :
+				    node->meta_fmt.fmt.meta.buffersize;
+
+	cfe_dbg(cfe, "%s: [%s] type:%u\n", __func__, node_desc[node->id].name,
+		node->buffer_queue.type);
+
+	if (q_num_bufs + *nbuffers < 3)
+		*nbuffers = 3 - q_num_bufs;
+
+	if (*nplanes) {
+		if (sizes[0] < size) {
+			cfe_err(cfe, "sizes[0] %u < size %u\n", sizes[0], size);
+			return -EINVAL;
+		}
+		size = sizes[0];
+	}
+
+	*nplanes = 1;
+	sizes[0] = size;
+
+	return 0;
+}
+
+/*
+ * vb2 buf_prepare handler.
+ *
+ * Rejects buffers whose plane is too small for the node's current format and
+ * sets the payload to the format size. For the FE_CONFIG node the buffer
+ * contents are additionally copied into the buffer's private config and
+ * validated against the FE_OUT0/FE_OUT1 formats.
+ */
+static int cfe_buffer_prepare(struct vb2_buffer *vb)
+{
+	struct cfe_node *node = vb2_get_drv_priv(vb->vb2_queue);
+	struct cfe_device *cfe = node->cfe;
+	struct cfe_buffer *buf = to_cfe_buffer(vb);
+	struct cfe_config_buffer *cfg_buf;
+	unsigned long size;
+
+	trace_cfe_buffer_prepare(node->id, vb);
+
+	if (is_image_node(node))
+		size = node->vid_fmt.fmt.pix.sizeimage;
+	else
+		size = node->meta_fmt.fmt.meta.buffersize;
+
+	if (vb2_plane_size(vb, 0) < size) {
+		cfe_err(cfe, "data will not fit into plane (%lu < %lu)\n",
+			vb2_plane_size(vb, 0), size);
+		return -EINVAL;
+	}
+
+	vb2_set_plane_payload(&buf->vb.vb2_buf, 0, size);
+
+	if (node->id != FE_CONFIG)
+		return 0;
+
+	/* Snapshot the config so that validation sees a stable copy. */
+	cfg_buf = to_cfe_config_buffer(buf);
+	memcpy(&cfg_buf->config, vb2_plane_vaddr(vb, 0),
+	       sizeof(struct pisp_fe_config));
+
+	return pisp_fe_validate_config(&cfe->fe, &cfg_buf->config,
+				       &cfe->node[FE_OUT0].vid_fmt,
+				       &cfe->node[FE_OUT1].vid_fmt);
+}
+
+/*
+ * vb2 buf_queue handler: append the buffer to the node's dma_queue and, if
+ * that completes a job while nothing is queued in hardware and all enabled
+ * nodes are streaming, schedule the job straight away.
+ */
+static void cfe_buffer_queue(struct vb2_buffer *vb)
+{
+	struct cfe_node *node = vb2_get_drv_priv(vb->vb2_queue);
+	struct cfe_device *cfe = node->cfe;
+	struct cfe_buffer *buf = to_cfe_buffer(vb);
+	unsigned long irq_flags;
+	bool schedule_now = false;
+
+	spin_lock_irqsave(&cfe->state_lock, irq_flags);
+
+	list_add_tail(&buf->list, &node->dma_queue);
+
+	/* Only re-evaluate readiness if it was not already established. */
+	cfe->job_ready = cfe->job_ready || cfe_check_job_ready(cfe);
+
+	if (!cfe->job_queued && cfe->job_ready)
+		schedule_now = test_all_nodes(cfe, NODE_ENABLED,
+					      NODE_STREAMING);
+
+	trace_cfe_buffer_queue(node->id, vb, schedule_now);
+
+	if (schedule_now)
+		cfe_prepare_next_job(cfe);
+
+	spin_unlock_irqrestore(&cfe->state_lock, irq_flags);
+}
+
+/*
+ * Query the link frequency of the source subdev.
+ *
+ * The CSI-2 subdev's active state must be locked by the caller.
+ *
+ * Returns the link frequency, -EINVAL if the single routed format is not in
+ * the format table, or the negative error from v4l2_get_link_freq().
+ */
+static s64 cfe_get_source_link_freq(struct cfe_device *cfe)
+{
+	struct v4l2_subdev_state *state;
+	s64 link_freq;
+	u32 bpp = 0;
+
+	state = v4l2_subdev_get_locked_active_state(&cfe->csi2.sd);
+
+	/*
+	 * v4l2_get_link_freq() uses V4L2_CID_LINK_FREQ first, and falls back
+	 * to V4L2_CID_PIXEL_RATE if V4L2_CID_LINK_FREQ is not available.
+	 *
+	 * With multistream input there is no single pixel rate, and thus we
+	 * cannot use V4L2_CID_PIXEL_RATE, so bpp is left at 0, which causes
+	 * v4l2_get_link_freq() to return an error if it falls back to
+	 * V4L2_CID_PIXEL_RATE.
+	 */
+	if (state->routing.num_routes == 1) {
+		struct v4l2_subdev_route *route = &state->routing.routes[0];
+		struct v4l2_mbus_framefmt *source_fmt;
+		const struct cfe_fmt *fmt;
+
+		source_fmt = v4l2_subdev_state_get_format(state,
+							  route->sink_pad,
+							  route->sink_stream);
+
+		fmt = find_format_by_code(source_fmt->code);
+		if (!fmt)
+			return -EINVAL;
+
+		bpp = fmt->depth;
+	}
+
+	link_freq = v4l2_get_link_freq(cfe->source_sd->ctrl_handler, bpp,
+				       2 * cfe->csi2.dphy.active_lanes);
+	if (link_freq >= 0)
+		return link_freq;
+
+	cfe_err(cfe, "failed to get link freq for subdev '%s'\n",
+		cfe->source_sd->name);
+
+	return link_freq;
+}
+
+/*
+ * vb2 start_streaming handler.
+ *
+ * Resumes the device, starts the media pipeline, marks the node as streaming
+ * and starts its channel. The device-wide setup (interrupt enable, data lane
+ * selection, link frequency, opening the CSI-2 receiver and enabling the
+ * source streams) only runs once every enabled node is streaming; until then
+ * the function returns 0 with streaming "on hold".
+ *
+ * On failure everything done so far is undone, the node's buffers are
+ * returned to vb2 in the QUEUED state and a negative error code is returned.
+ */
+static int cfe_start_streaming(struct vb2_queue *vq, unsigned int count)
+{
+	struct v4l2_mbus_config mbus_config = { 0 };
+	struct cfe_node *node = vb2_get_drv_priv(vq);
+	struct cfe_device *cfe = node->cfe;
+	struct v4l2_subdev_state *state;
+	struct v4l2_subdev_route *route;
+	s64 link_freq;
+	int ret;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	if (!check_state(cfe, NODE_ENABLED, node->id)) {
+		cfe_err(cfe, "%s node link is not enabled.\n",
+			node_desc[node->id].name);
+		ret = -EINVAL;
+		goto err_streaming;
+	}
+
+	ret = pm_runtime_resume_and_get(&cfe->pdev->dev);
+	if (ret < 0) {
+		cfe_err(cfe, "pm_runtime_resume_and_get failed\n");
+		goto err_streaming;
+	}
+
+	/* When using the Frontend, we must enable the FE_CONFIG node. */
+	if (is_fe_enabled(cfe) &&
+	    !check_state(cfe, NODE_ENABLED, cfe->node[FE_CONFIG].id)) {
+		cfe_err(cfe, "FE enabled, but FE_CONFIG node is not\n");
+		ret = -EINVAL;
+		goto err_pm_put;
+	}
+
+	ret = media_pipeline_start(&node->pad, &cfe->pipe);
+	if (ret < 0) {
+		cfe_err(cfe, "Failed to start media pipeline: %d\n", ret);
+		goto err_pm_put;
+	}
+
+	state = v4l2_subdev_lock_and_get_active_state(&cfe->csi2.sd);
+
+	clear_state(cfe, FS_INT | FE_INT, node->id);
+	set_state(cfe, NODE_STREAMING, node->id);
+	node->fs_count = 0;
+
+	ret = cfe_start_channel(node);
+	if (ret)
+		goto err_unlock_state;
+
+	if (!test_all_nodes(cfe, NODE_ENABLED, NODE_STREAMING)) {
+		cfe_dbg(cfe, "Streaming on hold, as all nodes are not set to streaming yet\n");
+		v4l2_subdev_unlock_state(state);
+		return 0;
+	}
+
+	cfg_reg_write(cfe, MIPICFG_CFG, MIPICFG_CFG_SEL_CSI);
+	cfg_reg_write(cfe, MIPICFG_INTE,
+		      MIPICFG_INT_CSI_DMA | MIPICFG_INT_PISP_FE);
+
+	/* A source without get_mbus_config is fine: max_lanes is used. */
+	ret = v4l2_subdev_call(cfe->source_sd, pad, get_mbus_config, 0,
+			       &mbus_config);
+	if (ret < 0 && ret != -ENOIOCTLCMD) {
+		cfe_err(cfe, "g_mbus_config failed\n");
+		goto err_clear_inte;
+	}
+
+	cfe->csi2.dphy.active_lanes = mbus_config.bus.mipi_csi2.num_data_lanes;
+	if (!cfe->csi2.dphy.active_lanes)
+		cfe->csi2.dphy.active_lanes = cfe->csi2.dphy.max_lanes;
+	if (cfe->csi2.dphy.active_lanes > cfe->csi2.dphy.max_lanes) {
+		cfe_err(cfe, "Device has requested %u data lanes, which is >%u configured in DT\n",
+			cfe->csi2.dphy.active_lanes, cfe->csi2.dphy.max_lanes);
+		ret = -EINVAL;
+		goto err_clear_inte;
+	}
+
+	link_freq = cfe_get_source_link_freq(cfe);
+	if (link_freq < 0) {
+		/*
+		 * Propagate the failure. Without this, ret would still hold
+		 * the get_mbus_config result (0 or -ENOIOCTLCMD) and the
+		 * caller could be told that streaming started.
+		 */
+		ret = link_freq;
+		goto err_clear_inte;
+	}
+
+	cfe->csi2.dphy.dphy_rate = div_s64(link_freq * 2, 1000000);
+	csi2_open_rx(&cfe->csi2);
+
+	cfe->streams_mask = 0;
+
+	for_each_active_route(&state->routing, route)
+		cfe->streams_mask |= BIT_ULL(route->sink_stream);
+
+	ret = v4l2_subdev_enable_streams(cfe->source_sd, cfe->source_pad,
+					 cfe->streams_mask);
+	if (ret) {
+		cfe_err(cfe, "stream on failed in subdev\n");
+		goto err_disable_cfe;
+	}
+
+	cfe_dbg(cfe, "Streaming enabled\n");
+
+	v4l2_subdev_unlock_state(state);
+
+	return 0;
+
+err_disable_cfe:
+	csi2_close_rx(&cfe->csi2);
+err_clear_inte:
+	cfg_reg_write(cfe, MIPICFG_INTE, 0);
+
+	cfe_stop_channel(node,
+			 is_fe_enabled(cfe) && test_all_nodes(cfe, NODE_ENABLED,
+							      NODE_STREAMING));
+err_unlock_state:
+	v4l2_subdev_unlock_state(state);
+	media_pipeline_stop(&node->pad);
+err_pm_put:
+	pm_runtime_put(&cfe->pdev->dev);
+err_streaming:
+	cfe_return_buffers(node, VB2_BUF_STATE_QUEUED);
+	clear_state(cfe, NODE_STREAMING, node->id);
+
+	return ret;
+}
+
+/*
+ * vb2 stop_streaming handler.
+ *
+ * Clears the node's streaming state and stops its channel (and the front end
+ * if all enabled nodes were streaming up to now). When this was the last
+ * streaming node, the source streams are disabled, the CSI-2 receiver is
+ * closed and interrupts are masked. Finally the media pipeline is stopped,
+ * all buffers owned by the node are returned in the ERROR state and the
+ * runtime PM reference is dropped.
+ */
+static void cfe_stop_streaming(struct vb2_queue *vq)
+{
+	struct cfe_node *node = vb2_get_drv_priv(vq);
+	struct cfe_device *cfe = node->cfe;
+	unsigned long flags;
+	bool fe_stop;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	spin_lock_irqsave(&cfe->state_lock, flags);
+	/* Evaluate before NODE_STREAMING is cleared for this node below. */
+	fe_stop = is_fe_enabled(cfe) &&
+		  test_all_nodes(cfe, NODE_ENABLED, NODE_STREAMING);
+
+	cfe->job_ready = false;
+	clear_state(cfe, NODE_STREAMING, node->id);
+	spin_unlock_irqrestore(&cfe->state_lock, flags);
+
+	cfe_stop_channel(node, fe_stop);
+
+	/* Last streaming node gone: shut down the shared parts as well. */
+	if (!test_any_node(cfe, NODE_STREAMING)) {
+		struct v4l2_subdev_state *state;
+		int ret;
+
+		state = v4l2_subdev_lock_and_get_active_state(&cfe->csi2.sd);
+
+		ret = v4l2_subdev_disable_streams(cfe->source_sd,
+						  cfe->source_pad,
+						  cfe->streams_mask);
+		if (ret)
+			cfe_err(cfe, "stream disable failed in subdev\n");
+
+		v4l2_subdev_unlock_state(state);
+
+		csi2_close_rx(&cfe->csi2);
+
+		cfg_reg_write(cfe, MIPICFG_INTE, 0);
+
+		cfe_dbg(cfe, "%s: Streaming disabled\n", __func__);
+	}
+
+	media_pipeline_stop(&node->pad);
+
+	/* Clear all queued buffers for the node */
+	cfe_return_buffers(node, VB2_BUF_STATE_ERROR);
+
+	pm_runtime_put(&cfe->pdev->dev);
+}
+
+/*
+ * vb2 queue operations: the wait_prepare/wait_finish hooks use the vb2
+ * helpers, the remaining hooks are implemented by this driver.
+ */
+static const struct vb2_ops cfe_video_qops = {
+	.wait_prepare = vb2_ops_wait_prepare,
+	.wait_finish = vb2_ops_wait_finish,
+	.queue_setup = cfe_queue_setup,
+	.buf_prepare = cfe_buffer_prepare,
+	.buf_queue = cfe_buffer_queue,
+	.start_streaming = cfe_start_streaming,
+	.stop_streaming = cfe_stop_streaming,
+};
+
+/*
+ * v4l2 ioctl ops
+ */
+
+/*
+ * VIDIOC_QUERYCAP: report the module name as driver and card, and add the
+ * capture and metadata capabilities to the capability mask.
+ */
+static int cfe_querycap(struct file *file, void *priv,
+			struct v4l2_capability *cap)
+{
+	const u32 extra_caps = V4L2_CAP_VIDEO_CAPTURE |
+			       V4L2_CAP_META_CAPTURE |
+			       V4L2_CAP_META_OUTPUT;
+
+	strscpy(cap->driver, CFE_MODULE_NAME, sizeof(cap->driver));
+	strscpy(cap->card, CFE_MODULE_NAME, sizeof(cap->card));
+
+	cap->capabilities |= extra_caps;
+
+	return 0;
+}
+
+/*
+ * VIDIOC_ENUM_FMT for image capture.
+ *
+ * Walks the format table and returns the f->index'th entry that passes all
+ * filters: matching f->mbus_code (if set), not a metadata format, and, on
+ * front end nodes, flagged as a front end output format.
+ *
+ * Returns 0 on success or -EINVAL when the node does not support image
+ * output or the index is out of range.
+ */
+static int cfe_enum_fmt_vid_cap(struct file *file, void *priv,
+				struct v4l2_fmtdesc *f)
+{
+	struct cfe_node *node = video_drvdata(file);
+	struct cfe_device *cfe = node->cfe;
+	unsigned int matches = 0;
+	unsigned int i;
+
+	if (!node_supports_image_output(node))
+		return -EINVAL;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	for (i = 0; i < ARRAY_SIZE(formats); i++) {
+		const struct cfe_fmt *fmt = &formats[i];
+
+		if (f->mbus_code && fmt->code != f->mbus_code)
+			continue;
+
+		if (fmt->flags &
+		    (CFE_FORMAT_FLAG_META_OUT | CFE_FORMAT_FLAG_META_CAP))
+			continue;
+
+		if (is_fe_node(node) && !(fmt->flags & CFE_FORMAT_FLAG_FE_OUT))
+			continue;
+
+		/* Count accepted entries until the requested one is hit. */
+		if (matches++ != f->index)
+			continue;
+
+		f->pixelformat = fmt->fourcc;
+		f->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * VIDIOC_G_FMT for image nodes: copy out the node's current image format.
+ * Returns -EINVAL on nodes that do not support images.
+ */
+static int cfe_g_fmt(struct file *file, void *priv, struct v4l2_format *f)
+{
+	struct cfe_node *node = video_drvdata(file);
+
+	if (node_supports_image(node)) {
+		*f = node->vid_fmt;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Adjust the image format in @f to something the node can produce.
+ *
+ * An unknown pixel format is replaced by the one for
+ * MEDIA_BUS_FMT_SBGGR10_1X10. On front end nodes a format with a 16 bit
+ * remap is replaced by that remap. The field is forced to V4L2_FIELD_NONE and
+ * the size, bytesperline and sizeimage are recomputed.
+ *
+ * Returns 0, or -EINVAL if the node does not support image output.
+ */
+static int cfe_validate_fmt_vid_cap(struct cfe_node *node,
+				    struct v4l2_format *f)
+{
+	struct cfe_device *cfe = node->cfe;
+	struct v4l2_pix_format *pix = &f->fmt.pix;
+	const struct cfe_fmt *fmt;
+
+	cfe_dbg(cfe, "%s: [%s] %ux%u, V4L2 pix %p4cc\n", __func__,
+		node_desc[node->id].name, pix->width, pix->height,
+		&pix->pixelformat);
+
+	if (!node_supports_image_output(node))
+		return -EINVAL;
+
+	/* Fall back to a format that works for both CSI2 and FE. */
+	fmt = find_format_by_pix(pix->pixelformat);
+	if (!fmt)
+		fmt = find_format_by_code(MEDIA_BUS_FMT_SBGGR10_1X10);
+
+	if (is_fe_node(node) && fmt->remap[CFE_REMAP_16BIT]) {
+		pix->pixelformat = fmt->remap[CFE_REMAP_16BIT];
+		fmt = find_format_by_pix(pix->pixelformat);
+	} else {
+		pix->pixelformat = fmt->fourcc;
+	}
+
+	pix->field = V4L2_FIELD_NONE;
+
+	cfe_calc_vid_format_size_bpl(cfe, fmt, f);
+
+	return 0;
+}
+
+/*
+ * VIDIOC_S_FMT for image capture: validate/adjust the requested format and
+ * store it as the node's image format. Returns -EBUSY while the queue is
+ * busy, or the error from validation.
+ */
+static int cfe_s_fmt_vid_cap(struct file *file, void *priv,
+			     struct v4l2_format *f)
+{
+	struct cfe_node *node = video_drvdata(file);
+	struct cfe_device *cfe = node->cfe;
+	const struct v4l2_pix_format *pix = &node->vid_fmt.fmt.pix;
+	int err;
+
+	if (vb2_is_busy(&node->buffer_queue))
+		return -EBUSY;
+
+	err = cfe_validate_fmt_vid_cap(node, f);
+	if (err)
+		return err;
+
+	node->vid_fmt = *f;
+
+	cfe_dbg(cfe, "%s: Set %ux%u, V4L2 pix %p4cc\n", __func__,
+		pix->width, pix->height, &pix->pixelformat);
+
+	return 0;
+}
+
+/*
+ * VIDIOC_TRY_FMT for image capture: validate/adjust the requested format
+ * without storing it.
+ */
+static int cfe_try_fmt_vid_cap(struct file *file, void *priv,
+			       struct v4l2_format *f)
+{
+	struct cfe_node *node = video_drvdata(file);
+	struct cfe_device *cfe = node->cfe;
+	int err;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	err = cfe_validate_fmt_vid_cap(node, f);
+
+	return err;
+}
+
+/*
+ * VIDIOC_ENUM_FMT for metadata.
+ *
+ * CSI-2 channel nodes enumerate three line-based generic formats; the
+ * FE_STATS and FE_CONFIG nodes each enumerate exactly one format.
+ *
+ * Returns 0 on success or -EINVAL when the node does not support metadata or
+ * the index is out of range.
+ */
+static int cfe_enum_fmt_meta(struct file *file, void *priv,
+			     struct v4l2_fmtdesc *f)
+{
+	static const u32 csi2_meta_formats[] = {
+		V4L2_META_FMT_GENERIC_8,
+		V4L2_META_FMT_GENERIC_CSI2_10,
+		V4L2_META_FMT_GENERIC_CSI2_12,
+	};
+	struct cfe_node *node = video_drvdata(file);
+	struct cfe_device *cfe = node->cfe;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	if (!node_supports_meta(node))
+		return -EINVAL;
+
+	switch (node->id) {
+	case CSI2_CH0...CSI2_CH3:
+		/* The flag is set even when the index turns out invalid. */
+		f->flags = V4L2_FMT_FLAG_META_LINE_BASED;
+
+		if (f->index >= ARRAY_SIZE(csi2_meta_formats))
+			return -EINVAL;
+
+		f->pixelformat = csi2_meta_formats[f->index];
+		return 0;
+	default:
+		break;
+	}
+
+	if (f->index != 0)
+		return -EINVAL;
+
+	if (node->id == FE_STATS)
+		f->pixelformat = V4L2_META_FMT_RPI_FE_STATS;
+	else if (node->id == FE_CONFIG)
+		f->pixelformat = V4L2_META_FMT_RPI_FE_CFG;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+static int cfe_validate_fmt_meta(struct cfe_node *node, struct v4l2_format *f)
+{
+	struct cfe_device *cfe = node->cfe;
+	const struct cfe_fmt *fmt;
+
+	switch (node->id) {
+	case CSI2_CH0...CSI2_CH3:
+		cfe_dbg(cfe, "%s: [%s] %ux%u, V4L2 meta %p4cc\n", __func__,
+			node_desc[node->id].name, f->fmt.meta.width,
+			f->fmt.meta.height, &f->fmt.meta.dataformat);
+		break;
+	case FE_STATS:
+	case FE_CONFIG:
+		cfe_dbg(cfe, "%s: [%s] %u bytes, V4L2 meta %p4cc\n", __func__,
+			node_desc[node->id].name, f->fmt.meta.buffersize,
+			&f->fmt.meta.dataformat);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!node_supports_meta(node))
+		return -EINVAL;
+	/* From here on *f is overwritten with what the node will use. */
+	switch (node->id) {
+	case CSI2_CH0...CSI2_CH3:
+		fmt = find_format_by_pix(f->fmt.meta.dataformat);
+		if (!fmt || !(fmt->flags & CFE_FORMAT_FLAG_META_CAP))
+			fmt = find_format_by_pix(V4L2_META_FMT_GENERIC_CSI2_10);
+		/* Report the (possibly fallback) fourcc back. */
+		f->fmt.meta.dataformat = fmt->fourcc;
+
+		cfe_calc_meta_format_size_bpl(cfe, fmt, f);
+
+		return 0;
+	case FE_STATS:
+		f->fmt.meta.dataformat = V4L2_META_FMT_RPI_FE_STATS;
+		f->fmt.meta.buffersize = sizeof(struct pisp_statistics);
+		return 0;
+	case FE_CONFIG:
+		f->fmt.meta.dataformat = V4L2_META_FMT_RPI_FE_CFG;
+		f->fmt.meta.buffersize = sizeof(struct pisp_fe_config);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int cfe_g_fmt_meta(struct file *file, void *priv, struct v4l2_format *f)
+{
+	struct cfe_node *node = video_drvdata(file);
+	struct cfe_device *cfe = node->cfe;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	if (!node_supports_meta(node))
+		return -EINVAL;
+	/* Hand back a copy of the node's stored metadata format. */
+	*f = node->meta_fmt;
+
+	return 0;
+}
+
+static int cfe_s_fmt_meta(struct file *file, void *priv, struct v4l2_format *f)
+{
+	struct cfe_node *node = video_drvdata(file);
+	struct cfe_device *cfe = node->cfe;
+	struct vb2_queue *q = &node->buffer_queue;
+	int ret;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+
+	if (vb2_is_busy(q))
+		return -EBUSY;
+
+	if (!node_supports_meta(node))
+		return -EINVAL;
+
+	ret = cfe_validate_fmt_meta(node, f);
+	if (ret)
+		return ret;
+	/* Validation succeeded: *f becomes the node's stored meta format. */
+	node->meta_fmt = *f;
+
+	cfe_dbg(cfe, "%s: Set %p4cc\n", __func__,
+		&node->meta_fmt.fmt.meta.dataformat);
+
+	return 0;
+}
+
+static int cfe_try_fmt_meta(struct file *file, void *priv,
+			    struct v4l2_format *f)
+{
+	struct cfe_node *node = video_drvdata(file);
+	struct cfe_device *cfe = node->cfe;
+
+	cfe_dbg(cfe, "%s: [%s]\n", __func__, node_desc[node->id].name);
+	return cfe_validate_fmt_meta(node, f); /* adjusts *f, stores nothing */
+}
+
+static int cfe_enum_framesizes(struct file *file, void *priv,
+			       struct v4l2_frmsizeenum *fsize)
+{
+	struct cfe_node *node = video_drvdata(file);
+	struct cfe_device *cfe = node->cfe;
+	const struct cfe_fmt *fmt;
+
+	cfe_dbg(cfe, "%s [%s]\n", __func__, node_desc[node->id].name);
+	/* Only a single (stepwise) entry is enumerated: index 0. */
+	if (fsize->index > 0)
+		return -EINVAL;
+
+	/* check for valid format */
+	fmt = find_format_by_pix(fsize->pixel_format);
+	if (!fmt) {
+		cfe_dbg(cfe, "Invalid pixel code: %x\n", fsize->pixel_format);
+		return -EINVAL;
+	}
+
+	/* TODO: Do we have limits on the step_width? */
+
+	fsize->type = V4L2_FRMSIZE_TYPE_STEPWISE;
+	fsize->stepwise.min_width = MIN_WIDTH;
+	fsize->stepwise.max_width = MAX_WIDTH;
+	fsize->stepwise.step_width = 2;
+	fsize->stepwise.min_height = MIN_HEIGHT;
+	fsize->stepwise.max_height = MAX_HEIGHT;
+	fsize->stepwise.step_height = 1;
+
+	return 0;
+}
+
+static int cfe_vb2_ioctl_reqbufs(struct file *file, void *priv,
+				 struct v4l2_requestbuffers *p)
+{
+	struct video_device *vdev = video_devdata(file);
+	struct cfe_node *node = video_get_drvdata(vdev);
+	struct cfe_device *cfe = node->cfe;
+	int ret;
+
+	cfe_dbg(cfe, "%s: [%s] type:%u\n", __func__, node_desc[node->id].name,
+		p->type);
+
+	if (p->type != V4L2_BUF_TYPE_VIDEO_CAPTURE &&
+	    p->type != V4L2_BUF_TYPE_META_CAPTURE &&
+	    p->type != V4L2_BUF_TYPE_META_OUTPUT)
+		return -EINVAL;
+	/* Retype the queue for this request before allocating buffers. */
+	ret = vb2_queue_change_type(vdev->queue, p->type);
+	if (ret)
+		return ret;
+
+	return vb2_ioctl_reqbufs(file, priv, p);
+}
+
+static int cfe_vb2_ioctl_create_bufs(struct file *file, void *priv,
+				     struct v4l2_create_buffers *p)
+{
+	struct video_device *vdev = video_devdata(file);
+	struct cfe_node *node = video_get_drvdata(vdev);
+	struct cfe_device *cfe = node->cfe;
+	int ret;
+
+	cfe_dbg(cfe, "%s: [%s] type:%u\n", __func__, node_desc[node->id].name,
+		p->format.type);
+
+	if (p->format.type != V4L2_BUF_TYPE_VIDEO_CAPTURE &&
+	    p->format.type != V4L2_BUF_TYPE_META_CAPTURE &&
+	    p->format.type != V4L2_BUF_TYPE_META_OUTPUT)
+		return -EINVAL;
+	/* Retype the queue for this request before creating buffers. */
+	ret = vb2_queue_change_type(vdev->queue, p->format.type);
+	if (ret)
+		return ret;
+
+	return vb2_ioctl_create_bufs(file, priv, p);
+}
+
+static int cfe_subscribe_event(struct v4l2_fh *fh,
+			       const struct v4l2_event_subscription *sub)
+{
+	struct cfe_node *node = video_get_drvdata(fh->vdev);
+
+	switch (sub->type) {
+	case V4L2_EVENT_FRAME_SYNC:
+		if (!node_supports_image_output(node))
+			break;
+
+		return v4l2_event_subscribe(fh, sub, 2, NULL);
+	case V4L2_EVENT_SOURCE_CHANGE:
+		if (!node_supports_image_output(node) &&
+		    !node_supports_meta_output(node))
+			break;
+
+		return v4l2_event_subscribe(fh, sub, 4, NULL);
+	}
+	/* Other types, and nodes failing the checks above, end up here. */
+	return v4l2_ctrl_subscribe_event(fh, sub);
+}
+
+static const struct v4l2_ioctl_ops cfe_ioctl_ops = {
+	.vidioc_querycap = cfe_querycap,
+	.vidioc_enum_fmt_vid_cap = cfe_enum_fmt_vid_cap,
+	.vidioc_g_fmt_vid_cap = cfe_g_fmt,
+	.vidioc_s_fmt_vid_cap = cfe_s_fmt_vid_cap,
+	.vidioc_try_fmt_vid_cap = cfe_try_fmt_vid_cap,
+
+	.vidioc_enum_fmt_meta_cap = cfe_enum_fmt_meta,
+	.vidioc_g_fmt_meta_cap = cfe_g_fmt_meta,
+	.vidioc_s_fmt_meta_cap = cfe_s_fmt_meta,
+	.vidioc_try_fmt_meta_cap = cfe_try_fmt_meta,
+	/* Metadata output shares the metadata capture handlers. */
+	.vidioc_enum_fmt_meta_out = cfe_enum_fmt_meta,
+	.vidioc_g_fmt_meta_out = cfe_g_fmt_meta,
+	.vidioc_s_fmt_meta_out = cfe_s_fmt_meta,
+	.vidioc_try_fmt_meta_out = cfe_try_fmt_meta,
+
+	.vidioc_enum_framesizes = cfe_enum_framesizes,
+
+	.vidioc_reqbufs = cfe_vb2_ioctl_reqbufs,
+	.vidioc_create_bufs = cfe_vb2_ioctl_create_bufs,
+	.vidioc_prepare_buf = vb2_ioctl_prepare_buf,
+	.vidioc_querybuf = vb2_ioctl_querybuf,
+	.vidioc_qbuf = vb2_ioctl_qbuf,
+	.vidioc_dqbuf = vb2_ioctl_dqbuf,
+	.vidioc_expbuf = vb2_ioctl_expbuf,
+	.vidioc_streamon = vb2_ioctl_streamon,
+	.vidioc_streamoff = vb2_ioctl_streamoff,
+
+	.vidioc_subscribe_event = cfe_subscribe_event,
+	.vidioc_unsubscribe_event = v4l2_event_unsubscribe,
+};
+
+static void cfe_notify(struct v4l2_subdev *sd, unsigned int notification,
+		       void *arg)
+{
+	struct cfe_device *cfe = to_cfe_device(sd->v4l2_dev);
+
+	switch (notification) {
+	case V4L2_DEVICE_NOTIFY_EVENT:
+		for (unsigned int i = 0; i < NUM_NODES; i++) {
+			struct cfe_node *node = &cfe->node[i];
+
+			if (!check_state(cfe, NODE_REGISTERED, i))
+				continue;
+			/* Forward the subdev event to each registered node. */
+			v4l2_event_queue(&node->video_dev, arg);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/* File operations shared by all cfe video nodes (cfe_register_node()). */
+static const struct v4l2_file_operations cfe_fops = {
+	.owner = THIS_MODULE,
+	.open = v4l2_fh_open,
+	.release = vb2_fop_release,
+	.poll = vb2_fop_poll,
+	.unlocked_ioctl = video_ioctl2,
+	.mmap = vb2_fop_mmap,
+};
+
+static int cfe_video_link_validate(struct media_link *link)
+{
+	struct video_device *vd = container_of(link->sink->entity,
+					       struct video_device, entity);
+	struct cfe_node *node = container_of(vd, struct cfe_node, video_dev);
+	struct cfe_device *cfe = node->cfe;
+	struct v4l2_mbus_framefmt *source_fmt;
+	struct v4l2_subdev_state *state;
+	struct v4l2_subdev *source_sd;
+	int ret = 0;
+
+	cfe_dbg(cfe, "%s: [%s] link \"%s\":%u -> \"%s\":%u\n", __func__,
+		node_desc[node->id].name,
+		link->source->entity->name, link->source->index,
+		link->sink->entity->name, link->sink->index);
+
+	if (!media_entity_remote_source_pad_unique(link->sink->entity)) {
+		cfe_err(cfe, "video node %s pad not connected\n", vd->name);
+		return -ENOTCONN;
+	}
+
+	source_sd = media_entity_to_v4l2_subdev(link->source->entity);
+	/* The source pad format is read from the locked active state. */
+	state = v4l2_subdev_lock_and_get_active_state(source_sd);
+
+	source_fmt = v4l2_subdev_state_get_format(state, link->source->index);
+	if (!source_fmt) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* Node types not matched below skip the format comparison. */
+	if (is_image_output_node(node)) {
+		struct v4l2_pix_format *pix_fmt = &node->vid_fmt.fmt.pix;
+		const struct cfe_fmt *fmt;
+
+		if (source_fmt->width != pix_fmt->width ||
+		    source_fmt->height != pix_fmt->height) {
+			cfe_err(cfe, "Wrong width or height %ux%u (remote pad set to %ux%u)\n",
+				pix_fmt->width, pix_fmt->height,
+				source_fmt->width, source_fmt->height);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		fmt = find_format_by_code_and_fourcc(source_fmt->code,
+						     pix_fmt->pixelformat);
+		if (!fmt) {
+			cfe_err(cfe, "Format mismatch!\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	} else if (is_csi2_node(node) && is_meta_output_node(node)) {
+		struct v4l2_meta_format *meta_fmt = &node->meta_fmt.fmt.meta;
+		const struct cfe_fmt *fmt;
+
+		if (source_fmt->width != meta_fmt->width ||
+		    source_fmt->height != meta_fmt->height) {
+			cfe_err(cfe, "Wrong width or height %ux%u (remote pad set to %ux%u)\n",
+				meta_fmt->width, meta_fmt->height,
+				source_fmt->width, source_fmt->height);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		fmt = find_format_by_code_and_fourcc(source_fmt->code,
+						     meta_fmt->dataformat);
+		if (!fmt) {
+			cfe_err(cfe, "Format mismatch!\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+out:
+	v4l2_subdev_unlock_state(state);
+
+	return ret;
+}
+
+static const struct media_entity_operations cfe_media_entity_ops = {
+	.link_validate = cfe_video_link_validate,
+};
+
+static int cfe_video_link_notify(struct media_link *link, u32 flags,
+				 unsigned int notification)
+{
+	struct media_device *mdev = link->graph_obj.mdev;
+	struct cfe_device *cfe = container_of(mdev, struct cfe_device, mdev);
+	struct media_entity *fe = &cfe->fe.sd.entity;
+	struct media_entity *csi2 = &cfe->csi2.sd.entity;
+	unsigned long lock_flags;
+
+	if (notification != MEDIA_DEV_NOTIFY_POST_LINK_CH)
+		return 0;
+
+	cfe_dbg(cfe, "%s: %s[%u] -> %s[%u] 0x%x\n", __func__,
+		link->source->entity->name, link->source->index,
+		link->sink->entity->name, link->sink->index, flags);
+	/* Mirror the link's enabled flag into the matching node's state. */
+	spin_lock_irqsave(&cfe->state_lock, lock_flags);
+
+	for (unsigned int i = 0; i < NUM_NODES; i++) {
+		if (link->sink->entity != &cfe->node[i].video_dev.entity &&
+		    link->source->entity != &cfe->node[i].video_dev.entity)
+			continue;
+
+		if (link->flags & MEDIA_LNK_FL_ENABLED)
+			set_state(cfe, NODE_ENABLED, i);
+		else
+			clear_state(cfe, NODE_ENABLED, i);
+
+		break;
+	}
+
+	spin_unlock_irqrestore(&cfe->state_lock, lock_flags);
+	/* Only a CSI2 -> FE pad 0 link updates fe_csi2_channel below. */
+	if (link->source->entity != csi2)
+		return 0;
+	if (link->sink->entity != fe)
+		return 0;
+	if (link->sink->index != 0)
+		return 0;
+	/* Default to no channel (-1); set one only if the link is enabled. */
+	cfe->fe_csi2_channel = -1;
+	if (link->flags & MEDIA_LNK_FL_ENABLED) {
+		if (link->source->index == node_desc[CSI2_CH0].link_pad)
+			cfe->fe_csi2_channel = CSI2_CH0;
+		else if (link->source->index == node_desc[CSI2_CH1].link_pad)
+			cfe->fe_csi2_channel = CSI2_CH1;
+		else if (link->source->index == node_desc[CSI2_CH2].link_pad)
+			cfe->fe_csi2_channel = CSI2_CH2;
+		else if (link->source->index == node_desc[CSI2_CH3].link_pad)
+			cfe->fe_csi2_channel = CSI2_CH3;
+	}
+
+	if (is_fe_enabled(cfe))
+		cfe_dbg(cfe, "%s: Found CSI2:%d -> FE:0 link\n", __func__,
+			cfe->fe_csi2_channel);
+	else
+		cfe_dbg(cfe, "%s: Unable to find CSI2:x -> FE:0 link\n",
+			__func__);
+
+	return 0;
+}
+
+static const struct media_device_ops cfe_media_device_ops = {
+	.link_notify = cfe_video_link_notify,
+};
+
+static void cfe_release(struct kref *kref)
+{
+	struct cfe_device *cfe = container_of(kref, struct cfe_device, kref);
+
+	media_device_cleanup(&cfe->mdev);
+	/* Runs on the last cfe_put(); nothing may touch cfe after this. */
+	kfree(cfe);
+}
+
+static void cfe_put(struct cfe_device *cfe)
+{
+	kref_put(&cfe->kref, cfe_release);
+}
+
+static void cfe_get(struct cfe_device *cfe)
+{
+	kref_get(&cfe->kref);
+}
+
+static void cfe_node_release(struct video_device *vdev)
+{
+	struct cfe_node *node = video_get_drvdata(vdev);
+
+	cfe_put(node->cfe); /* pairs with cfe_get() in cfe_register_node() */
+}
+
+static int cfe_register_node(struct cfe_device *cfe, int id)
+{
+	struct video_device *vdev;
+	const struct cfe_fmt *fmt;
+	struct vb2_queue *q;
+	struct cfe_node *node = &cfe->node[id];
+	int ret;
+
+	node->cfe = cfe;
+	node->id = id;
+	/* Image-capable nodes start out with cfe_default_format. */
+	if (node_supports_image(node)) {
+		if (node_supports_image_output(node))
+			node->vid_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+		else
+			node->vid_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+
+		fmt = find_format_by_code(cfe_default_format.code);
+		if (!fmt) {
+			cfe_err(cfe, "Failed to find format code\n");
+			return -EINVAL;
+		}
+
+		node->vid_fmt.fmt.pix.pixelformat = fmt->fourcc;
+		v4l2_fill_pix_format(&node->vid_fmt.fmt.pix,
+				     &cfe_default_format);
+
+		ret = cfe_validate_fmt_vid_cap(node, &node->vid_fmt);
+		if (ret)
+			return ret;
+	}
+	/* Meta-capable nodes let validation fill in the initial format. */
+	if (node_supports_meta(node)) {
+		if (node_supports_meta_output(node))
+			node->meta_fmt.type = V4L2_BUF_TYPE_META_CAPTURE;
+		else
+			node->meta_fmt.type = V4L2_BUF_TYPE_META_OUTPUT;
+
+		ret = cfe_validate_fmt_meta(node, &node->meta_fmt);
+		if (ret)
+			return ret;
+	}
+
+	mutex_init(&node->lock);
+	/* The queue type follows the image format if there is one. */
+	q = &node->buffer_queue;
+	q->type = node_supports_image(node) ? node->vid_fmt.type :
+					      node->meta_fmt.type;
+	q->io_modes = VB2_MMAP | VB2_DMABUF;
+	q->drv_priv = node;
+	q->ops = &cfe_video_qops;
+	q->mem_ops = &vb2_dma_contig_memops;
+	q->buf_struct_size = id == FE_CONFIG ? sizeof(struct cfe_config_buffer)
+					     : sizeof(struct cfe_buffer);
+	q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
+	q->lock = &node->lock;
+	q->min_queued_buffers = 1;
+	q->dev = &cfe->pdev->dev;
+
+	ret = vb2_queue_init(q);
+	if (ret) {
+		cfe_err(cfe, "vb2_queue_init() failed\n");
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&node->dma_queue);
+
+	vdev = &node->video_dev;
+	vdev->release = cfe_node_release;
+	vdev->fops = &cfe_fops;
+	vdev->ioctl_ops = &cfe_ioctl_ops;
+	vdev->entity.ops = &cfe_media_entity_ops;
+	vdev->v4l2_dev = &cfe->v4l2_dev;
+	vdev->vfl_dir = (node_supports_image_output(node) ||
+			 node_supports_meta_output(node)) ?
+				VFL_DIR_RX :
+				VFL_DIR_TX;
+	vdev->queue = q;
+	vdev->lock = &node->lock;
+	vdev->device_caps = node_desc[id].caps;
+	vdev->device_caps |= V4L2_CAP_STREAMING | V4L2_CAP_IO_MC;
+
+	/* Define the device names */
+	snprintf(vdev->name, sizeof(vdev->name), "%s-%s", CFE_MODULE_NAME,
+		 node_desc[id].name);
+
+	video_set_drvdata(vdev, node);
+	node->pad.flags = node_desc[id].pad_flags;
+	media_entity_pads_init(&vdev->entity, 1, &node->pad);
+	/* Frame size/interval enumeration is offered on image nodes only. */
+	if (!node_supports_image(node)) {
+		v4l2_disable_ioctl(&node->video_dev,
+				   VIDIOC_ENUM_FRAMEINTERVALS);
+		v4l2_disable_ioctl(&node->video_dev, VIDIOC_ENUM_FRAMESIZES);
+	}
+
+	ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1);
+	if (ret) {
+		cfe_err(cfe, "Unable to register video device %s\n",
+			vdev->name);
+		return ret;
+	}
+
+	cfe_info(cfe, "Registered [%s] node id %d as /dev/video%u\n",
+		 vdev->name, id, vdev->num);
+
+	/*
+	 * Acquire a reference to cfe. cfe_node_release() drops it once the
+	 * video device has been unregistered and userspace has closed all
+	 * open file handles.
+	 */
+	cfe_get(cfe);
+	set_state(cfe, NODE_REGISTERED, id);
+
+	return 0;
+}
+
+static void cfe_unregister_nodes(struct cfe_device *cfe)
+{
+	for (unsigned int i = 0; i < NUM_NODES; i++) {
+		struct cfe_node *node = &cfe->node[i];
+
+		if (check_state(cfe, NODE_REGISTERED, i)) {
+			clear_state(cfe, NODE_REGISTERED, i);
+			video_unregister_device(&node->video_dev);
+		}
+	}
+}
+
+static int cfe_link_node_pads(struct cfe_device *cfe)
+{
+	struct media_pad *remote_pad;
+	int ret;
+
+	/* Source -> CSI2 */
+
+	ret = v4l2_create_fwnode_links_to_pad(cfe->source_sd,
+					      &cfe->csi2.pad[CSI2_PAD_SINK],
+					      MEDIA_LNK_FL_IMMUTABLE | MEDIA_LNK_FL_ENABLED);
+
+	if (ret) {
+		cfe_err(cfe, "Failed to create links to the source: %d\n", ret);
+		return ret;
+	}
+
+	remote_pad = media_pad_remote_pad_unique(&cfe->csi2.pad[CSI2_PAD_SINK]);
+	if (IS_ERR(remote_pad)) {
+		ret = PTR_ERR(remote_pad);
+		cfe_err(cfe, "Failed to get unique remote source pad: %d\n",
+			ret);
+		return ret;
+	}
+	/* Remember which pad of the source feeds the CSI-2 sink. */
+	cfe->source_pad = remote_pad->index;
+
+	for (unsigned int i = 0; i < CSI2_NUM_CHANNELS; i++) {
+		struct cfe_node *node = &cfe->node[i];
+
+		if (!check_state(cfe, NODE_REGISTERED, i))
+			continue;
+
+		/* CSI2 channel # -> /dev/video# */
+		ret = media_create_pad_link(&cfe->csi2.sd.entity,
+					    node_desc[i].link_pad,
+					    &node->video_dev.entity, 0, 0);
+		if (ret)
+			return ret;
+
+		if (node_supports_image(node)) {
+			/* CSI2 channel # -> FE Input */
+			ret = media_create_pad_link(&cfe->csi2.sd.entity,
+						    node_desc[i].link_pad,
+						    &cfe->fe.sd.entity,
+						    FE_STREAM_PAD, 0);
+			if (ret)
+				return ret;
+		}
+	}
+	/* The remaining nodes link to or from the FE, by pad direction. */
+	for (unsigned int i = CSI2_NUM_CHANNELS; i < NUM_NODES; i++) {
+		struct cfe_node *node = &cfe->node[i];
+		struct media_entity *src, *dst;
+		unsigned int src_pad, dst_pad;
+
+		if (node_desc[i].pad_flags & MEDIA_PAD_FL_SINK) {
+			/* FE -> /dev/video# */
+			src = &cfe->fe.sd.entity;
+			src_pad = node_desc[i].link_pad;
+			dst = &node->video_dev.entity;
+			dst_pad = 0;
+		} else {
+			/* /dev/video# -> FE */
+			dst = &cfe->fe.sd.entity;
+			dst_pad = node_desc[i].link_pad;
+			src = &node->video_dev.entity;
+			src_pad = 0;
+		}
+
+		ret = media_create_pad_link(src, src_pad, dst, dst_pad, 0);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int cfe_probe_complete(struct cfe_device *cfe)
+{
+	int ret;
+
+	cfe->v4l2_dev.notify = cfe_notify;
+	/* Register all nodes; any failure unregisters those already added. */
+	for (unsigned int i = 0; i < NUM_NODES; i++) {
+		ret = cfe_register_node(cfe, i);
+		if (ret) {
+			cfe_err(cfe, "Unable to register video node %u.\n", i);
+			goto unregister;
+		}
+	}
+
+	ret = cfe_link_node_pads(cfe);
+	if (ret) {
+		cfe_err(cfe, "Unable to link node pads.\n");
+		goto unregister;
+	}
+
+	ret = v4l2_device_register_subdev_nodes(&cfe->v4l2_dev);
+	if (ret) {
+		cfe_err(cfe, "Unable to register subdev nodes.\n");
+		goto unregister;
+	}
+
+	return 0;
+
+unregister:
+	cfe_unregister_nodes(cfe);
+	return ret;
+}
+
+static int cfe_async_bound(struct v4l2_async_notifier *notifier,
+			   struct v4l2_subdev *subdev,
+			   struct v4l2_async_connection *asd)
+{
+	struct cfe_device *cfe = to_cfe_device(notifier->v4l2_dev);
+
+	if (cfe->source_sd) {
+		cfe_err(cfe, "Rejecting subdev %s (Already set!!)\n",
+			subdev->name);
+		return 0;
+	}
+	/* The first bound subdev becomes the capture source. */
+	cfe->source_sd = subdev;
+
+	cfe_dbg(cfe, "Using source %s for capture\n", subdev->name);
+
+	return 0;
+}
+
+static int cfe_async_complete(struct v4l2_async_notifier *notifier)
+{
+	struct cfe_device *cfe = to_cfe_device(notifier->v4l2_dev);
+
+	return cfe_probe_complete(cfe);
+}
+
+static const struct v4l2_async_notifier_operations cfe_async_ops = {
+	.bound = cfe_async_bound,
+	.complete = cfe_async_complete,
+};
+
+static int cfe_register_async_nf(struct cfe_device *cfe)
+{
+	struct platform_device *pdev = cfe->pdev;
+	struct v4l2_fwnode_endpoint ep = { .bus_type = V4L2_MBUS_CSI2_DPHY };
+	struct fwnode_handle *local_ep_fwnode;
+	struct v4l2_async_connection *asd;
+	int ret;
+
+	local_ep_fwnode = fwnode_graph_get_endpoint_by_id(pdev->dev.fwnode, 0,
+							  0, 0);
+	if (!local_ep_fwnode) {
+		cfe_err(cfe, "Failed to find local endpoint fwnode\n");
+		return -ENODEV;
+	}
+
+	/* Parse the local endpoint and validate its configuration. */
+	ret = v4l2_fwnode_endpoint_parse(local_ep_fwnode, &ep);
+	if (ret) {
+		cfe_err(cfe, "Failed to parse local endpoint fwnode\n");
+		goto err_put_local_fwnode;
+	}
+
+	for (unsigned int lane = 0; lane < ep.bus.mipi_csi2.num_data_lanes;
+	     lane++) {
+		if (ep.bus.mipi_csi2.data_lanes[lane] != lane + 1) {
+			cfe_err(cfe, "Data lanes reordering not supported\n");
+			ret = -EINVAL;
+			goto err_put_local_fwnode;
+		}
+	}
+	/* Lanes must be 1..N in order (checked above); record N and flags. */
+	cfe->csi2.dphy.max_lanes = ep.bus.mipi_csi2.num_data_lanes;
+	cfe->csi2.bus_flags = ep.bus.mipi_csi2.flags;
+
+	/* Initialize and register the async notifier. */
+	v4l2_async_nf_init(&cfe->notifier, &cfe->v4l2_dev);
+	cfe->notifier.ops = &cfe_async_ops;
+
+	asd = v4l2_async_nf_add_fwnode_remote(&cfe->notifier, local_ep_fwnode,
+					      struct v4l2_async_connection);
+	if (IS_ERR(asd)) {
+		ret = PTR_ERR(asd);
+		cfe_err(cfe, "Error adding subdevice: %d\n", ret);
+		goto err_put_local_fwnode;
+	}
+
+	ret = v4l2_async_nf_register(&cfe->notifier);
+	if (ret) {
+		cfe_err(cfe, "Error registering async notifier: %d\n", ret);
+		goto err_nf_cleanup;
+	}
+	/* Success: the local endpoint reference is no longer needed here. */
+	fwnode_handle_put(local_ep_fwnode);
+
+	return 0;
+
+err_nf_cleanup:
+	v4l2_async_nf_cleanup(&cfe->notifier);
+err_put_local_fwnode:
+	fwnode_handle_put(local_ep_fwnode);
+
+	return ret;
+}
+
+static int cfe_probe(struct platform_device *pdev)
+{
+	struct cfe_device *cfe;
+	char debugfs_name[32];
+	int ret;
+
+	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
+	if (!cfe)
+		return -ENOMEM;
+	/* cfe is kref-counted: every error path below ends in cfe_put(). */
+	platform_set_drvdata(pdev, cfe);
+
+	kref_init(&cfe->kref);
+	cfe->pdev = pdev;
+	cfe->fe_csi2_channel = -1;
+	spin_lock_init(&cfe->state_lock);
+
+	cfe->csi2.base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(cfe->csi2.base)) {
+		dev_err(&pdev->dev, "Failed to get dma io block\n");
+		ret = PTR_ERR(cfe->csi2.base);
+		goto err_cfe_put;
+	}
+
+	cfe->csi2.dphy.base = devm_platform_ioremap_resource(pdev, 1);
+	if (IS_ERR(cfe->csi2.dphy.base)) {
+		dev_err(&pdev->dev, "Failed to get host io block\n");
+		ret = PTR_ERR(cfe->csi2.dphy.base);
+		goto err_cfe_put;
+	}
+
+	cfe->mipi_cfg_base = devm_platform_ioremap_resource(pdev, 2);
+	if (IS_ERR(cfe->mipi_cfg_base)) {
+		dev_err(&pdev->dev, "Failed to get mipi cfg io block\n");
+		ret = PTR_ERR(cfe->mipi_cfg_base);
+		goto err_cfe_put;
+	}
+
+	cfe->fe.base = devm_platform_ioremap_resource(pdev, 3);
+	if (IS_ERR(cfe->fe.base)) {
+		dev_err(&pdev->dev, "Failed to get pisp fe io block\n");
+		ret = PTR_ERR(cfe->fe.base);
+		goto err_cfe_put;
+	}
+	/* platform_get_irq() returns a negative errno on failure, never 0. */
+	ret = platform_get_irq(pdev, 0);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "No IRQ resource\n");
+		goto err_cfe_put;
+	}
+
+	ret = devm_request_irq(&pdev->dev, ret, cfe_isr, 0, "rp1-cfe", cfe);
+	if (ret) {
+		dev_err(&pdev->dev, "Unable to request interrupt\n");
+		goto err_cfe_put;
+	}
+
+	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+	if (ret) {
+		dev_err(&pdev->dev, "DMA enable failed\n");
+		goto err_cfe_put;
+	}
+
+	/* TODO: Enable clock only when running. */
+	cfe->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(cfe->clk)) {
+		ret = dev_err_probe(&pdev->dev, PTR_ERR(cfe->clk),
+				    "clock not found\n");
+		goto err_cfe_put;
+	}
+
+	cfe->mdev.dev = &pdev->dev;
+	cfe->mdev.ops = &cfe_media_device_ops;
+	strscpy(cfe->mdev.model, CFE_MODULE_NAME, sizeof(cfe->mdev.model));
+	strscpy(cfe->mdev.serial, "", sizeof(cfe->mdev.serial));
+	snprintf(cfe->mdev.bus_info, sizeof(cfe->mdev.bus_info), "platform:%s",
+		 dev_name(&pdev->dev));
+
+	media_device_init(&cfe->mdev);
+
+	cfe->v4l2_dev.mdev = &cfe->mdev;
+
+	ret = v4l2_device_register(&pdev->dev, &cfe->v4l2_dev);
+	if (ret) {
+		cfe_err(cfe, "Unable to register v4l2 device.\n");
+		goto err_cfe_put;
+	}
+
+	snprintf(debugfs_name, sizeof(debugfs_name), "rp1-cfe:%s",
+		 dev_name(&pdev->dev));
+	cfe->debugfs = debugfs_create_dir(debugfs_name, NULL);
+	debugfs_create_file("regs", 0440, cfe->debugfs, cfe,
+			    &mipi_cfg_regs_fops);
+
+	/* Enable the block power domain */
+	pm_runtime_enable(&pdev->dev);
+
+	ret = pm_runtime_resume_and_get(&cfe->pdev->dev);
+	if (ret)
+		goto err_runtime_disable;
+
+	cfe->csi2.v4l2_dev = &cfe->v4l2_dev;
+	ret = csi2_init(&cfe->csi2, cfe->debugfs);
+	if (ret) {
+		cfe_err(cfe, "Failed to init csi2 (%d)\n", ret);
+		goto err_runtime_put;
+	}
+
+	cfe->fe.v4l2_dev = &cfe->v4l2_dev;
+	ret = pisp_fe_init(&cfe->fe, cfe->debugfs);
+	if (ret) {
+		cfe_err(cfe, "Failed to init pisp fe (%d)\n", ret);
+		goto err_csi2_uninit;
+	}
+
+	cfe->mdev.hw_revision = cfe->fe.hw_revision;
+	ret = media_device_register(&cfe->mdev);
+	if (ret < 0) {
+		cfe_err(cfe, "Unable to register media-controller device.\n");
+		goto err_pisp_fe_uninit;
+	}
+
+	ret = cfe_register_async_nf(cfe);
+	if (ret) {
+		cfe_err(cfe, "Failed to connect subdevs\n");
+		goto err_media_unregister;
+	}
+
+	pm_runtime_put(&cfe->pdev->dev);
+
+	return 0;
+
+err_media_unregister:
+	media_device_unregister(&cfe->mdev);
+err_pisp_fe_uninit:
+	pisp_fe_uninit(&cfe->fe);
+err_csi2_uninit:
+	csi2_uninit(&cfe->csi2);
+err_runtime_put:
+	pm_runtime_put(&cfe->pdev->dev);
+err_runtime_disable:
+	pm_runtime_disable(&pdev->dev);
+	debugfs_remove(cfe->debugfs);
+	v4l2_device_unregister(&cfe->v4l2_dev);
+err_cfe_put:
+	cfe_put(cfe);
+
+	return ret;
+}
+
+static void cfe_remove(struct platform_device *pdev)
+{
+	struct cfe_device *cfe = platform_get_drvdata(pdev);
+
+	debugfs_remove(cfe->debugfs);
+
+	v4l2_async_nf_unregister(&cfe->notifier);
+	v4l2_async_nf_cleanup(&cfe->notifier);
+
+	media_device_unregister(&cfe->mdev);
+	cfe_unregister_nodes(cfe);
+
+	pisp_fe_uninit(&cfe->fe);
+	csi2_uninit(&cfe->csi2);
+
+	pm_runtime_disable(&pdev->dev);
+
+	v4l2_device_unregister(&cfe->v4l2_dev);
+	/* Drop the reference taken by kref_init() in cfe_probe(). */
+	cfe_put(cfe);
+}
+
+static int cfe_runtime_suspend(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct cfe_device *cfe = platform_get_drvdata(pdev);
+
+	clk_disable_unprepare(cfe->clk); /* undone by cfe_runtime_resume() */
+
+	return 0;
+}
+
+static int cfe_runtime_resume(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct cfe_device *cfe = platform_get_drvdata(pdev);
+	int ret;
+
+	ret = clk_prepare_enable(cfe->clk); /* see cfe_runtime_suspend() */
+	if (ret) {
+		dev_err(dev, "Unable to enable clock\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static const struct dev_pm_ops cfe_pm_ops = {
+	SET_RUNTIME_PM_OPS(cfe_runtime_suspend, cfe_runtime_resume, NULL)
+	SET_LATE_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+				     pm_runtime_force_resume)
+};
+
+static const struct of_device_id cfe_of_match[] = {
+	{ .compatible = "raspberrypi,rp1-cfe" },
+	{ /* sentinel: empty entry terminates the match table */ },
+};
+MODULE_DEVICE_TABLE(of, cfe_of_match);
+
+static struct platform_driver cfe_driver = {
+	.probe		= cfe_probe,
+	.remove		= cfe_remove,
+	.driver = {
+		.name	= CFE_MODULE_NAME,
+		.of_match_table = cfe_of_match,
+		.pm = &cfe_pm_ops,
+	},
+};
+
+module_platform_driver(cfe_driver);
+
+MODULE_AUTHOR("Naushir Patuck <naush@raspberrypi.com>");
+MODULE_AUTHOR("Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>");
+MODULE_DESCRIPTION("Raspberry Pi RP1 Camera Front End driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(CFE_VERSION);
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/cfe.h b/drivers/media/platform/raspberrypi/rp1-cfe/cfe.h
new file mode 100644
index 000000000000..c63cc314be3c
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/cfe.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RP1 CFE Driver
+ *
+ * Copyright (c) 2021-2024 Raspberry Pi Ltd.
+ * Copyright (c) 2023-2024 Ideas on Board Oy
+ */
+#ifndef _RP1_CFE_
+#define _RP1_CFE_
+
+#include <linux/media-bus-format.h>
+#include <linux/types.h>
+#include <linux/videodev2.h>
+
+extern bool cfe_debug_verbose;
+
+enum cfe_remap_types {
+	CFE_REMAP_16BIT,	/* index into cfe_fmt.remap[] */
+	CFE_REMAP_COMPRESSED,	/* index into cfe_fmt.remap[] */
+	CFE_NUM_REMAP,		/* number of cfe_fmt.remap[] slots */
+};
+
+#define CFE_FORMAT_FLAG_META_OUT	BIT(0)
+#define CFE_FORMAT_FLAG_META_CAP	BIT(1)
+#define CFE_FORMAT_FLAG_FE_OUT		BIT(2)
+
+struct cfe_fmt {
+	u32 fourcc;		/* V4L2 fourcc (pixel or meta format) */
+	u32 code;		/* media bus code, cf. find_format_by_code() */
+	u8 depth;		/* presumably bits per pixel - TODO confirm */
+	u8 csi_dt;		/* presumably CSI-2 data type - TODO confirm */
+	u32 remap[CFE_NUM_REMAP];	/* fourcc per enum cfe_remap_types */
+	u32 flags;		/* CFE_FORMAT_FLAG_* bits */
+};
+
+extern const struct v4l2_mbus_framefmt cfe_default_format;
+
+const struct cfe_fmt *find_format_by_code(u32 code);
+const struct cfe_fmt *find_format_by_pix(u32 pixelformat);
+u32 cfe_find_16bit_code(u32 code);
+u32 cfe_find_compressed_code(u32 code);
+
+#endif
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/csi2.c b/drivers/media/platform/raspberrypi/rp1-cfe/csi2.c
new file mode 100644
index 000000000000..35c2ab1e2cd4
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/csi2.c
@@ -0,0 +1,586 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RP1 CSI-2 Driver
+ *
+ * Copyright (c) 2021-2024 Raspberry Pi Ltd.
+ * Copyright (c) 2023-2024 Ideas on Board Oy
+ */
+
+#include <linux/delay.h>
+#include <linux/moduleparam.h>
+#include <linux/pm_runtime.h>
+#include <linux/seq_file.h>
+
+#include <media/videobuf2-dma-contig.h>
+
+#include "cfe.h"
+#include "csi2.h"
+
+#include "cfe-trace.h"
+
+static bool csi2_track_errors;
+module_param_named(track_csi2_errors, csi2_track_errors, bool, 0);
+MODULE_PARM_DESC(track_csi2_errors, "track csi-2 errors");
+
+#define csi2_dbg(csi2, fmt, arg...) dev_dbg((csi2)->v4l2_dev->dev, fmt, ##arg)
+#define csi2_err(csi2, fmt, arg...) dev_err((csi2)->v4l2_dev->dev, fmt, ##arg)
+
+/* CSI2-DMA registers */
+#define CSI2_STATUS		0x000
+#define CSI2_QOS		0x004
+#define CSI2_DISCARDS_OVERFLOW	0x008
+#define CSI2_DISCARDS_INACTIVE	0x00c
+#define CSI2_DISCARDS_UNMATCHED	0x010
+#define CSI2_DISCARDS_LEN_LIMIT	0x014
+
+#define CSI2_DISCARDS_AMOUNT_SHIFT	0
+#define CSI2_DISCARDS_AMOUNT_MASK	GENMASK(23, 0)
+#define CSI2_DISCARDS_DT_SHIFT		24
+#define CSI2_DISCARDS_DT_MASK		GENMASK(29, 24)
+#define CSI2_DISCARDS_VC_SHIFT		30
+#define CSI2_DISCARDS_VC_MASK		GENMASK(31, 30)
+
+#define CSI2_LLEV_PANICS	0x018
+#define CSI2_ULEV_PANICS	0x01c
+#define CSI2_IRQ_MASK		0x020
+#define CSI2_IRQ_MASK_IRQ_OVERFLOW		BIT(0)
+#define CSI2_IRQ_MASK_IRQ_DISCARD_OVERFLOW	BIT(1)
+#define CSI2_IRQ_MASK_IRQ_DISCARD_LENGTH_LIMIT	BIT(2)
+#define CSI2_IRQ_MASK_IRQ_DISCARD_UNMATCHED	BIT(3)
+#define CSI2_IRQ_MASK_IRQ_DISCARD_INACTIVE	BIT(4)
+#define CSI2_IRQ_MASK_IRQ_ALL                                              \
+	(CSI2_IRQ_MASK_IRQ_OVERFLOW | CSI2_IRQ_MASK_IRQ_DISCARD_OVERFLOW | \
+	 CSI2_IRQ_MASK_IRQ_DISCARD_LENGTH_LIMIT |                          \
+	 CSI2_IRQ_MASK_IRQ_DISCARD_UNMATCHED |                             \
+	 CSI2_IRQ_MASK_IRQ_DISCARD_INACTIVE)
+
+#define CSI2_CTRL		0x024
+#define CSI2_CH_CTRL(x)		((x) * 0x40 + 0x28)
+#define CSI2_CH_ADDR0(x)	((x) * 0x40 + 0x2c)
+#define CSI2_CH_ADDR1(x)	((x) * 0x40 + 0x3c)
+#define CSI2_CH_STRIDE(x)	((x) * 0x40 + 0x30)
+#define CSI2_CH_LENGTH(x)	((x) * 0x40 + 0x34)
+#define CSI2_CH_DEBUG(x)	((x) * 0x40 + 0x38)
+#define CSI2_CH_FRAME_SIZE(x)	((x) * 0x40 + 0x40)
+#define CSI2_CH_COMP_CTRL(x)	((x) * 0x40 + 0x44)
+#define CSI2_CH_FE_FRAME_ID(x)	((x) * 0x40 + 0x48)
+
+/* CSI2_STATUS */
+#define CSI2_STATUS_IRQ_FS(x)			(BIT(0) << (x))
+#define CSI2_STATUS_IRQ_FE(x)			(BIT(4) << (x))
+#define CSI2_STATUS_IRQ_FE_ACK(x)		(BIT(8) << (x))
+#define CSI2_STATUS_IRQ_LE(x)			(BIT(12) << (x))
+#define CSI2_STATUS_IRQ_LE_ACK(x)		(BIT(16) << (x))
+#define CSI2_STATUS_IRQ_CH_MASK(x) \
+	(CSI2_STATUS_IRQ_FS(x) | CSI2_STATUS_IRQ_FE(x) | \
+	 CSI2_STATUS_IRQ_FE_ACK(x) | CSI2_STATUS_IRQ_LE(x) | \
+	 CSI2_STATUS_IRQ_LE_ACK(x))
+#define CSI2_STATUS_IRQ_OVERFLOW		BIT(20)
+#define CSI2_STATUS_IRQ_DISCARD_OVERFLOW	BIT(21)
+#define CSI2_STATUS_IRQ_DISCARD_LEN_LIMIT	BIT(22)
+#define CSI2_STATUS_IRQ_DISCARD_UNMATCHED	BIT(23)
+#define CSI2_STATUS_IRQ_DISCARD_INACTIVE	BIT(24)
+
+/* CSI2_CTRL */
+#define CSI2_CTRL_EOP_IS_EOL			BIT(0)
+
+/* CSI2_CH_CTRL */
+#define CSI2_CH_CTRL_DMA_EN			BIT(0)
+#define CSI2_CH_CTRL_FORCE			BIT(3)
+#define CSI2_CH_CTRL_AUTO_ARM			BIT(4)
+#define CSI2_CH_CTRL_IRQ_EN_FS			BIT(13)
+#define CSI2_CH_CTRL_IRQ_EN_FE			BIT(14)
+#define CSI2_CH_CTRL_IRQ_EN_FE_ACK		BIT(15)
+#define CSI2_CH_CTRL_IRQ_EN_LE			BIT(16)
+#define CSI2_CH_CTRL_IRQ_EN_LE_ACK		BIT(17)
+#define CSI2_CH_CTRL_FLUSH_FE			BIT(28)
+#define CSI2_CH_CTRL_PACK_LINE			BIT(29)
+#define CSI2_CH_CTRL_PACK_BYTES			BIT(30)
+#define CSI2_CH_CTRL_CH_MODE_MASK		GENMASK(2, 1)
+#define CSI2_CH_CTRL_VC_MASK			GENMASK(6, 5)
+#define CSI2_CH_CTRL_DT_MASK			GENMASK(12, 7)
+#define CSI2_CH_CTRL_LC_MASK			GENMASK(27, 18)
+
+/* CHx_COMPRESSION_CONTROL */
+#define CSI2_CH_COMP_CTRL_OFFSET_MASK		GENMASK(15, 0)
+#define CSI2_CH_COMP_CTRL_SHIFT_MASK		GENMASK(19, 16)
+#define CSI2_CH_COMP_CTRL_MODE_MASK		GENMASK(25, 24)
+
+static inline u32 csi2_reg_read(struct csi2_device *csi2, u32 offset)
+{
+	return readl(csi2->base + offset);
+}
+
+static inline void csi2_reg_write(struct csi2_device *csi2, u32 offset, u32 val)
+{
+	writel(val, csi2->base + offset);
+}
+
+/* Store @field (given unshifted) into the bits of *valp selected by @mask. */
+static inline void set_field(u32 *valp, u32 field, u32 mask)
+{
+	const u32 shifted = (field << __ffs(mask)) & mask;
+
+	/* Bits outside @mask are kept; __ffs() needs a non-zero @mask. */
+	*valp = (*valp & ~mask) | shifted;
+}
+
+/* debugfs show handler: dump the global and per-channel CSI2-DMA registers. */
+static int csi2_regs_show(struct seq_file *s, void *data)
+{
+	struct csi2_device *csi2 = s->private;
+	int ret = pm_runtime_resume_and_get(csi2->v4l2_dev->dev);
+
+	if (ret)
+		return ret;
+
+#define DUMP(reg) seq_printf(s, #reg " \t0x%08x\n", csi2_reg_read(csi2, reg))
+#define DUMP_CH(idx, reg) seq_printf(s, #reg "(%u) \t0x%08x\n", idx, \
+				     csi2_reg_read(csi2, reg(idx)))
+
+	DUMP(CSI2_STATUS);
+	DUMP(CSI2_DISCARDS_OVERFLOW);
+	DUMP(CSI2_DISCARDS_INACTIVE);
+	DUMP(CSI2_DISCARDS_UNMATCHED);
+	DUMP(CSI2_DISCARDS_LEN_LIMIT);
+	DUMP(CSI2_LLEV_PANICS);
+	DUMP(CSI2_ULEV_PANICS);
+	DUMP(CSI2_IRQ_MASK);
+	DUMP(CSI2_CTRL);
+
+	for (unsigned int ch = 0; ch < CSI2_NUM_CHANNELS; ++ch) {
+		DUMP_CH(ch, CSI2_CH_CTRL);
+		DUMP_CH(ch, CSI2_CH_ADDR0);
+		DUMP_CH(ch, CSI2_CH_ADDR1);
+		DUMP_CH(ch, CSI2_CH_STRIDE);
+		DUMP_CH(ch, CSI2_CH_LENGTH);
+		DUMP_CH(ch, CSI2_CH_DEBUG);
+		DUMP_CH(ch, CSI2_CH_FRAME_SIZE);
+		DUMP_CH(ch, CSI2_CH_COMP_CTRL);
+		DUMP_CH(ch, CSI2_CH_FE_FRAME_ID);
+	}
+
+#undef DUMP
+#undef DUMP_CH
+
+	pm_runtime_put(csi2->v4l2_dev->dev);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(csi2_regs);
+
+/* debugfs show handler: print the accumulated CSI-2 error counters. */
+static int csi2_errors_show(struct seq_file *s, void *data)
+{
+	struct csi2_device *csi2 = s->private;
+	unsigned long flags;
+	u32 discards_table[DISCARDS_TABLE_NUM_VCS][DISCARDS_TABLE_NUM_ENTRIES];
+	u32 discards_dt_table[DISCARDS_TABLE_NUM_ENTRIES];
+	u32 overflows;
+
+	spin_lock_irqsave(&csi2->errors_lock, flags);
+	/* Copy the counters to locals so they can be printed after unlocking. */
+	memcpy(discards_table, csi2->discards_table, sizeof(discards_table));
+	memcpy(discards_dt_table, csi2->discards_dt_table,
+	       sizeof(discards_dt_table));
+	overflows = csi2->overflows;
+	/* Every read of this file resets the counters to zero. */
+	csi2->overflows = 0;
+	memset(csi2->discards_table, 0, sizeof(discards_table));
+	memset(csi2->discards_dt_table, 0, sizeof(discards_dt_table));
+
+	spin_unlock_irqrestore(&csi2->errors_lock, flags);
+
+	seq_printf(s, "Overflows %u\n", overflows);
+	seq_puts(s, "Discards:\n");
+	seq_puts(s, "VC            OVLF        LEN  UNMATCHED   INACTIVE\n");
+
+	for (unsigned int vc = 0; vc < DISCARDS_TABLE_NUM_VCS; ++vc) {
+		seq_printf(s, "%u       %10u %10u %10u %10u\n", vc,
+			   discards_table[vc][DISCARDS_TABLE_OVERFLOW],
+			   discards_table[vc][DISCARDS_TABLE_LENGTH_LIMIT],
+			   discards_table[vc][DISCARDS_TABLE_UNMATCHED],
+			   discards_table[vc][DISCARDS_TABLE_INACTIVE]);
+	}
+
+	seq_printf(s, "Last DT %10u %10u %10u %10u\n",
+		   discards_dt_table[DISCARDS_TABLE_OVERFLOW],
+		   discards_dt_table[DISCARDS_TABLE_LENGTH_LIMIT],
+		   discards_dt_table[DISCARDS_TABLE_UNMATCHED],
+		   discards_dt_table[DISCARDS_TABLE_INACTIVE]);
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(csi2_errors);
+
+/* Fold the error bits of an IRQ @status word into the csi2 error counters. */
+static void csi2_isr_handle_errors(struct csi2_device *csi2, u32 status)
+{
+	spin_lock(&csi2->errors_lock);
+
+	if (status & CSI2_STATUS_IRQ_OVERFLOW)
+		csi2->overflows++;
+	/* Both lookup tables follow enum discards_table_index order. */
+	for (unsigned int i = 0; i < DISCARDS_TABLE_NUM_ENTRIES; ++i) {
+		static const u32 discard_bits[] = {
+			CSI2_STATUS_IRQ_DISCARD_OVERFLOW,
+			CSI2_STATUS_IRQ_DISCARD_LEN_LIMIT,
+			CSI2_STATUS_IRQ_DISCARD_UNMATCHED,
+			CSI2_STATUS_IRQ_DISCARD_INACTIVE,
+		};
+		static const u8 discard_regs[] = {
+			CSI2_DISCARDS_OVERFLOW,
+			CSI2_DISCARDS_LEN_LIMIT,
+			CSI2_DISCARDS_UNMATCHED,
+			CSI2_DISCARDS_INACTIVE,
+		};
+		u32 amount;
+		u8 dt, vc;
+		u32 v;
+
+		if (!(status & discard_bits[i]))
+			continue;
+		/* Fetch the discard record, then write the register back to 0. */
+		v = csi2_reg_read(csi2, discard_regs[i]);
+		csi2_reg_write(csi2, discard_regs[i], 0);
+
+		amount = (v & CSI2_DISCARDS_AMOUNT_MASK) >>
+			 CSI2_DISCARDS_AMOUNT_SHIFT;
+		dt = (v & CSI2_DISCARDS_DT_MASK) >> CSI2_DISCARDS_DT_SHIFT;
+		vc = (v & CSI2_DISCARDS_VC_MASK) >> CSI2_DISCARDS_VC_SHIFT;
+		/* Amounts add up per VC; only the latest DT value is kept. */
+		csi2->discards_table[vc][i] += amount;
+		csi2->discards_dt_table[i] = dt;
+	}
+	spin_unlock(&csi2->errors_lock);
+}
+
+/* Acknowledge the pending CSI2 IRQs and flag per-channel SOF/EOF events. */
+void csi2_isr(struct csi2_device *csi2, bool *sof, bool *eof)
+{
+	u32 status;
+
+	status = csi2_reg_read(csi2, CSI2_STATUS);
+
+	/* Write value back to clear the interrupts */
+	csi2_reg_write(csi2, CSI2_STATUS, status);
+
+	for (unsigned int i = 0; i < CSI2_NUM_CHANNELS; i++) {
+		u32 dbg;
+
+		if ((status & CSI2_STATUS_IRQ_CH_MASK(i)) == 0)
+			continue;
+
+		dbg = csi2_reg_read(csi2, CSI2_CH_DEBUG(i));
+		trace_csi2_irq(i, status, dbg);
+		/* Written only for channels with a pending IRQ; EOF is FE_ACK. */
+		sof[i] = !!(status & CSI2_STATUS_IRQ_FS(i));
+		eof[i] = !!(status & CSI2_STATUS_IRQ_FE_ACK(i));
+	}
+
+	if (csi2_track_errors)
+		csi2_isr_handle_errors(csi2, status);
+}
+
+void csi2_set_buffer(struct csi2_device *csi2, unsigned int channel,
+		     dma_addr_t dmaaddr, unsigned int stride, unsigned int size)
+{
+	const u64 addr16 = (u64)dmaaddr >> 4;
+
+	/*
+	 * All values are written shifted right by 4. ADDRESS0 must be written
+	 * last as it triggers the double buffering of all buffer registers.
+	 */
+	csi2_reg_write(csi2, CSI2_CH_LENGTH(channel), size >> 4);
+	csi2_reg_write(csi2, CSI2_CH_STRIDE(channel), stride >> 4);
+	csi2_reg_write(csi2, CSI2_CH_ADDR1(channel), addr16 >> 32);
+	csi2_reg_write(csi2, CSI2_CH_ADDR0(channel), addr16 & 0xffffffff);
+}
+
+void csi2_set_compression(struct csi2_device *csi2, unsigned int channel,
+			  enum csi2_compression_mode mode, unsigned int shift,
+			  unsigned int offset)
+{
+	u32 compression = 0; /* built up via set_field(valp, field, mask) */
+
+	set_field(&compression, offset, CSI2_CH_COMP_CTRL_OFFSET_MASK);
+	set_field(&compression, shift, CSI2_CH_COMP_CTRL_SHIFT_MASK);
+	set_field(&compression, mode, CSI2_CH_COMP_CTRL_MODE_MASK);
+	csi2_reg_write(csi2, CSI2_CH_COMP_CTRL(channel), compression);
+}
+
+/*
+ * Configure and start DMA channel @channel; caches @height in num_lines[].
+ */
+void csi2_start_channel(struct csi2_device *csi2, unsigned int channel,
+			enum csi2_mode mode, bool auto_arm, bool pack_bytes,
+			unsigned int width, unsigned int height,
+			u8 vc, u8 dt)
+{
+	/* Enable channel and FS/FE interrupts. */
+	u32 ctrl = CSI2_CH_CTRL_DMA_EN | CSI2_CH_CTRL_IRQ_EN_FS |
+		   CSI2_CH_CTRL_IRQ_EN_FE_ACK | CSI2_CH_CTRL_PACK_LINE;
+	/* A frame size is only programmed when both dimensions are non-zero. */
+	const bool sized = width && height;
+
+	csi2_dbg(csi2, "%s [%u]\n", __func__, channel);
+
+	/* Zero the channel control and debug registers, clear pending IRQs. */
+	csi2_reg_write(csi2, CSI2_CH_CTRL(channel), 0);
+	csi2_reg_write(csi2, CSI2_CH_DEBUG(channel), 0);
+	csi2_reg_write(csi2, CSI2_STATUS, CSI2_STATUS_IRQ_CH_MASK(channel));
+
+	/* PACK_BYTES ensures no striding for embedded data. */
+	if (pack_bytes)
+		ctrl |= CSI2_CH_CTRL_PACK_BYTES;
+	if (auto_arm)
+		ctrl |= CSI2_CH_CTRL_AUTO_ARM;
+
+	/* Without a frame size the channel mode field is forced to 0. */
+	set_field(&ctrl, sized ? mode : 0x0, CSI2_CH_CTRL_CH_MODE_MASK);
+	set_field(&ctrl, vc, CSI2_CH_CTRL_VC_MASK);
+	set_field(&ctrl, dt, CSI2_CH_CTRL_DT_MASK);
+
+	csi2_reg_write(csi2, CSI2_CH_FRAME_SIZE(channel),
+		       sized ? (height << 16) | width : 0);
+	csi2_reg_write(csi2, CSI2_CH_CTRL(channel), ctrl);
+	csi2->num_lines[channel] = height;
+}
+
+void csi2_stop_channel(struct csi2_device *csi2, unsigned int channel)
+{
+	csi2_dbg(csi2, "%s [%u]\n", __func__, channel);
+
+	/* Channel disable.  Use FORCE to allow stopping mid-frame. */
+	csi2_reg_write(csi2, CSI2_CH_CTRL(channel), CSI2_CH_CTRL_FORCE);
+	/* Latch the above change by writing to the ADDR0 register. */
+	csi2_reg_write(csi2, CSI2_CH_ADDR0(channel), 0);
+	/* Write this again, the HW needs it! */
+	csi2_reg_write(csi2, CSI2_CH_ADDR0(channel), 0);
+}
+
+void csi2_open_rx(struct csi2_device *csi2)
+{
+	csi2_reg_write(csi2, CSI2_IRQ_MASK,
+		       csi2_track_errors ? CSI2_IRQ_MASK_IRQ_ALL : 0);
+
+	dphy_start(&csi2->dphy);
+
+	csi2_reg_write(csi2, CSI2_CTRL, CSI2_CTRL_EOP_IS_EOL);
+}
+
+void csi2_close_rx(struct csi2_device *csi2)
+{
+	dphy_stop(&csi2->dphy);
+
+	csi2_reg_write(csi2, CSI2_IRQ_MASK, 0);
+}
+
+/*
+ * Subdev init_state hook.
+ * Installs the default routing: a single active route from stream 0 of the
+ * sink pad to stream 0 of the first source pad.
+ */
+static int csi2_init_state(struct v4l2_subdev *sd,
+			   struct v4l2_subdev_state *state)
+{
+	struct v4l2_subdev_route default_route = {
+		.sink_pad = CSI2_PAD_SINK,
+		.sink_stream = 0,
+		.source_pad = CSI2_PAD_FIRST_SOURCE,
+		.source_stream = 0,
+		.flags = V4L2_SUBDEV_ROUTE_FL_ACTIVE,
+	};
+
+	struct v4l2_subdev_krouting routing = {
+		.num_routes = 1,
+		.routes = &default_route,
+	};
+
+	/* The helper applies the routing together with cfe_default_format. */
+	return v4l2_subdev_set_routing_with_fmt(sd, state, &routing,
+						&cfe_default_format);
+}
+
+/*
+ * Subdev set_fmt hook.
+ * Sink pad: store the requested format on the sink stream and copy it to the
+ * opposite (source) stream with field forced to V4L2_FIELD_NONE; an mbus code
+ * unknown to find_format_by_code() falls back to MEDIA_BUS_FMT_SRGGB10_1X10.
+ * Source pad: only the mbus code can change; the resulting code is returned.
+ */
+static int csi2_pad_set_fmt(struct v4l2_subdev *sd,
+			    struct v4l2_subdev_state *state,
+			    struct v4l2_subdev_format *format)
+{
+	struct v4l2_mbus_framefmt *sink_fmt, *source_fmt;
+	u32 code = format->format.code;
+
+	if (format->pad == CSI2_PAD_SINK) {
+		/* Store the sink format and propagate it to the source. */
+		const struct cfe_fmt *cfe_fmt = find_format_by_code(code);
+
+		if (!cfe_fmt) {
+			cfe_fmt = find_format_by_code(MEDIA_BUS_FMT_SRGGB10_1X10);
+			format->format.code = cfe_fmt->code;
+		}
+
+		sink_fmt = v4l2_subdev_state_get_format(state, format->pad,
+							format->stream);
+		if (!sink_fmt)
+			return -EINVAL;
+
+		*sink_fmt = format->format;
+
+		source_fmt = v4l2_subdev_state_get_opposite_stream_format(state,
+									  format->pad,
+									  format->stream);
+		if (!source_fmt)
+			return -EINVAL;
+
+		/* The source stream and the reply carry V4L2_FIELD_NONE. */
+		format->format.field = V4L2_FIELD_NONE;
+		*source_fmt = format->format;
+
+		return 0;
+	}
+
+	/* Only allow changing the source pad mbus code. */
+	sink_fmt = v4l2_subdev_state_get_opposite_stream_format(state,
+								format->pad,
+								format->stream);
+	if (!sink_fmt)
+		return -EINVAL;
+
+	source_fmt = v4l2_subdev_state_get_format(state, format->pad,
+						  format->stream);
+	if (!source_fmt)
+		return -EINVAL;
+
+	/*
+	 * Only allow changing the mbus code to:
+	 * - The sink's mbus code
+	 * - The 16-bit version of the sink's mbus code
+	 * - The compressed version of the sink's mbus code
+	 */
+	if (code == sink_fmt->code ||
+	    code == cfe_find_16bit_code(sink_fmt->code) ||
+	    code == cfe_find_compressed_code(sink_fmt->code))
+		source_fmt->code = code;
+
+	format->format.code = source_fmt->code;
+
+	return 0;
+}
+
+/*
+ * Subdev set_routing hook. Accepts only routings that pass the 1-to-1 and
+ * no-source-multiplexing checks and that use stream 0 on every source pad;
+ * anything else is rejected before the state is touched. The accepted
+ * routing is applied together with cfe_default_format.
+ */
+static int csi2_set_routing(struct v4l2_subdev *sd,
+			    struct v4l2_subdev_state *state,
+			    enum v4l2_subdev_format_whence which,
+			    struct v4l2_subdev_krouting *routing)
+{
+	int ret;
+
+	ret = v4l2_subdev_routing_validate(sd, routing,
+					   V4L2_SUBDEV_ROUTING_ONLY_1_TO_1 |
+					   V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING);
+	if (ret)
+		return ret;
+
+	/* Only stream ID 0 allowed on source pads */
+	for (unsigned int i = 0; i < routing->num_routes; ++i) {
+		if (routing->routes[i].source_stream != 0)
+			return -EINVAL;
+	}
+
+	return v4l2_subdev_set_routing_with_fmt(sd, state, routing,
+						&cfe_default_format);
+}
+
+static const struct v4l2_subdev_pad_ops csi2_subdev_pad_ops = {
+	.get_fmt = v4l2_subdev_get_fmt,
+	.set_fmt = csi2_pad_set_fmt,
+	.set_routing = csi2_set_routing,
+	.link_validate = v4l2_subdev_link_validate_default,
+};
+
+static const struct media_entity_operations csi2_entity_ops = {
+	.link_validate = v4l2_subdev_link_validate,
+	.has_pad_interdep = v4l2_subdev_has_pad_interdep,
+};
+
+static const struct v4l2_subdev_ops csi2_subdev_ops = {
+	.pad = &csi2_subdev_pad_ops,
+};
+
+static const struct v4l2_subdev_internal_ops csi2_internal_ops = {
+	.init_state = csi2_init_state,
+};
+
+/* Set up debugfs, pads and the csi2 subdev, then register the subdev. */
+int csi2_init(struct csi2_device *csi2, struct dentry *debugfs)
+{
+	int ret; /* signed: holds the helpers' error codes, printed with %d */
+
+	spin_lock_init(&csi2->errors_lock);
+
+	csi2->dphy.dev = csi2->v4l2_dev->dev;
+	dphy_probe(&csi2->dphy);
+
+	debugfs_create_file("csi2_regs", 0440, debugfs, csi2, &csi2_regs_fops);
+	if (csi2_track_errors)
+		debugfs_create_file("csi2_errors", 0440, debugfs, csi2,
+				    &csi2_errors_fops);
+
+	csi2->pad[CSI2_PAD_SINK].flags = MEDIA_PAD_FL_SINK;
+
+	for (unsigned int i = CSI2_PAD_FIRST_SOURCE;
+	     i < CSI2_PAD_FIRST_SOURCE + CSI2_PAD_NUM_SOURCES; i++)
+		csi2->pad[i].flags = MEDIA_PAD_FL_SOURCE;
+
+	ret = media_entity_pads_init(&csi2->sd.entity, ARRAY_SIZE(csi2->pad),
+				     csi2->pad);
+	if (ret)
+		return ret;
+
+	/* Initialize subdev */
+	v4l2_subdev_init(&csi2->sd, &csi2_subdev_ops);
+	csi2->sd.internal_ops = &csi2_internal_ops;
+	csi2->sd.entity.function = MEDIA_ENT_F_VID_IF_BRIDGE;
+	csi2->sd.entity.ops = &csi2_entity_ops;
+	csi2->sd.flags = V4L2_SUBDEV_FL_HAS_DEVNODE | V4L2_SUBDEV_FL_STREAMS;
+	csi2->sd.owner = THIS_MODULE;
+	snprintf(csi2->sd.name, sizeof(csi2->sd.name), "csi2");
+
+	ret = v4l2_subdev_init_finalize(&csi2->sd);
+	if (ret)
+		goto err_entity_cleanup;
+
+	ret = v4l2_device_register_subdev(csi2->v4l2_dev, &csi2->sd);
+	if (ret) {
+		csi2_err(csi2, "Failed register csi2 subdev (%d)\n", ret);
+		goto err_subdev_cleanup;
+	}
+
+	return 0;
+
+err_subdev_cleanup:
+	v4l2_subdev_cleanup(&csi2->sd);
+err_entity_cleanup:
+	media_entity_cleanup(&csi2->sd.entity);
+
+	return ret;
+}
+
+void csi2_uninit(struct csi2_device *csi2)
+{
+	v4l2_device_unregister_subdev(&csi2->sd);
+	v4l2_subdev_cleanup(&csi2->sd);
+	media_entity_cleanup(&csi2->sd.entity);
+}
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/csi2.h b/drivers/media/platform/raspberrypi/rp1-cfe/csi2.h
new file mode 100644
index 000000000000..a8ee5de565fb
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/csi2.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * RP1 CSI-2 Driver
+ *
+ * Copyright (c) 2021-2024 Raspberry Pi Ltd.
+ * Copyright (c) 2023-2024 Ideas on Board Oy
+ */
+
+#ifndef _RP1_CSI2_
+#define _RP1_CSI2_
+
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/types.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-subdev.h>
+
+#include "dphy.h"
+
+#define CSI2_NUM_CHANNELS 4
+
+#define CSI2_PAD_SINK 0
+#define CSI2_PAD_FIRST_SOURCE 1
+#define CSI2_PAD_NUM_SOURCES 4
+#define CSI2_NUM_PADS 5
+
+#define DISCARDS_TABLE_NUM_VCS 4
+
+enum csi2_mode {
+	CSI2_MODE_NORMAL = 0,
+	CSI2_MODE_REMAP = 1,
+	CSI2_MODE_COMPRESSED = 2,
+	CSI2_MODE_FE_STREAMING = 3,
+};
+
+enum csi2_compression_mode {
+	CSI2_COMPRESSION_DELTA = 1,
+	CSI2_COMPRESSION_SIMPLE = 2,
+	CSI2_COMPRESSION_COMBINED = 3,
+};
+
+enum discards_table_index {
+	DISCARDS_TABLE_OVERFLOW = 0,
+	DISCARDS_TABLE_LENGTH_LIMIT,
+	DISCARDS_TABLE_UNMATCHED,
+	DISCARDS_TABLE_INACTIVE,
+	DISCARDS_TABLE_NUM_ENTRIES,
+};
+
+struct csi2_device {
+	/* Parent V4l2 device */
+	struct v4l2_device *v4l2_dev;
+
+	void __iomem *base;
+
+	struct dphy_data dphy;
+
+	enum v4l2_mbus_type bus_type;
+	unsigned int bus_flags;
+	unsigned int num_lines[CSI2_NUM_CHANNELS];
+
+	struct media_pad pad[CSI2_NUM_PADS];
+	struct v4l2_subdev sd;
+
+	/* lock for csi2 errors counters */
+	spinlock_t errors_lock;
+	u32 overflows;
+	u32 discards_table[DISCARDS_TABLE_NUM_VCS][DISCARDS_TABLE_NUM_ENTRIES];
+	u32 discards_dt_table[DISCARDS_TABLE_NUM_ENTRIES];
+};
+
+void csi2_isr(struct csi2_device *csi2, bool *sof, bool *eof);
+void csi2_set_buffer(struct csi2_device *csi2, unsigned int channel,
+		     dma_addr_t dmaaddr, unsigned int stride,
+		     unsigned int size);
+void csi2_set_compression(struct csi2_device *csi2, unsigned int channel,
+			  enum csi2_compression_mode mode, unsigned int shift,
+			  unsigned int offset);
+void csi2_start_channel(struct csi2_device *csi2, unsigned int channel,
+			enum csi2_mode mode, bool auto_arm,
+			bool pack_bytes, unsigned int width,
+			unsigned int height, u8 vc, u8 dt);
+void csi2_stop_channel(struct csi2_device *csi2, unsigned int channel);
+void csi2_open_rx(struct csi2_device *csi2);
+void csi2_close_rx(struct csi2_device *csi2);
+int csi2_init(struct csi2_device *csi2, struct dentry *debugfs);
+void csi2_uninit(struct csi2_device *csi2);
+
+#endif
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/dphy.c b/drivers/media/platform/raspberrypi/rp1-cfe/dphy.c
new file mode 100644
index 000000000000..b443f0f56ddc
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/dphy.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RP1 CSI-2 Driver
+ *
+ * Copyright (c) 2021-2024 Raspberry Pi Ltd.
+ * Copyright (c) 2023-2024 Ideas on Board Oy
+ */
+
+#include <linux/delay.h>
+#include <linux/pm_runtime.h>
+
+#include "dphy.h"
+
+#define dphy_dbg(dphy, fmt, arg...) dev_dbg((dphy)->dev, fmt, ##arg)
+#define dphy_err(dphy, fmt, arg...) dev_err((dphy)->dev, fmt, ##arg)
+
+/* DW dphy Host registers */
+#define DPHY_VERSION		0x000
+#define DPHY_N_LANES		0x004
+#define DPHY_RESETN		0x008
+#define DPHY_PHY_SHUTDOWNZ	0x040
+#define DPHY_PHY_RSTZ		0x044
+#define DPHY_PHY_RX		0x048
+#define	DPHY_PHY_STOPSTATE	0x04c
+#define DPHY_PHY_TST_CTRL0	0x050
+#define DPHY_PHY_TST_CTRL1	0x054
+#define DPHY_PHY2_TST_CTRL0	0x058
+#define DPHY_PHY2_TST_CTRL1	0x05c
+
+/* DW dphy Host Transactions */
+#define DPHY_HS_RX_CTRL_LANE0_OFFSET	0x44
+#define DPHY_PLL_INPUT_DIV_OFFSET	0x17
+#define DPHY_PLL_LOOP_DIV_OFFSET	0x18
+#define DPHY_PLL_DIV_CTRL_OFFSET	0x19
+
+static u32 dw_csi2_host_read(struct dphy_data *dphy, u32 offset)
+{
+	return readl(dphy->base + offset);
+}
+
+static void dw_csi2_host_write(struct dphy_data *dphy, u32 offset, u32 data)
+{
+	writel(data, dphy->base + offset);
+}
+
+static void set_tstclr(struct dphy_data *dphy, u32 val)
+{
+	u32 ctrl0 = dw_csi2_host_read(dphy, DPHY_PHY_TST_CTRL0);
+
+	dw_csi2_host_write(dphy, DPHY_PHY_TST_CTRL0, (ctrl0 & ~1) | val);
+}
+
+static void set_tstclk(struct dphy_data *dphy, u32 val)
+{
+	u32 ctrl0 = dw_csi2_host_read(dphy, DPHY_PHY_TST_CTRL0);
+
+	dw_csi2_host_write(dphy, DPHY_PHY_TST_CTRL0, (ctrl0 & ~2) | (val << 1));
+}
+
+static uint8_t get_tstdout(struct dphy_data *dphy)
+{
+	u32 ctrl1 = dw_csi2_host_read(dphy, DPHY_PHY_TST_CTRL1);
+
+	return ((ctrl1 >> 8) & 0xff);
+}
+
+static void set_testen(struct dphy_data *dphy, u32 val)
+{
+	u32 ctrl1 = dw_csi2_host_read(dphy, DPHY_PHY_TST_CTRL1);
+
+	dw_csi2_host_write(dphy, DPHY_PHY_TST_CTRL1,
+			   (ctrl1 & ~(1 << 16)) | (val << 16));
+}
+
+static void set_testdin(struct dphy_data *dphy, u32 val)
+{
+	u32 ctrl1 = dw_csi2_host_read(dphy, DPHY_PHY_TST_CTRL1);
+
+	dw_csi2_host_write(dphy, DPHY_PHY_TST_CTRL1, (ctrl1 & ~0xff) | val);
+}
+
+static uint8_t dphy_transaction(struct dphy_data *dphy, u8 test_code,
+				uint8_t test_data)
+{
+	/* See page 101 of the MIPI DPHY databook. */
+	set_tstclk(dphy, 1);
+	set_testen(dphy, 0);
+	set_testdin(dphy, test_code);
+	set_testen(dphy, 1);
+	set_tstclk(dphy, 0);
+	set_testen(dphy, 0);
+	set_testdin(dphy, test_data);
+	set_tstclk(dphy, 1);
+	return get_tstdout(dphy);
+}
+
+static void dphy_set_hsfreqrange(struct dphy_data *dphy, uint32_t mbps)
+{
+	/* See Table 5-1 on page 65 of dphy databook */
+	static const u16 hsfreqrange_table[][2] = {
+		{ 89, 0b000000 },   { 99, 0b010000 },	{ 109, 0b100000 },
+		{ 129, 0b000001 },  { 139, 0b010001 },	{ 149, 0b100001 },
+		{ 169, 0b000010 },  { 179, 0b010010 },	{ 199, 0b100010 },
+		{ 219, 0b000011 },  { 239, 0b010011 },	{ 249, 0b100011 },
+		{ 269, 0b000100 },  { 299, 0b010100 },	{ 329, 0b000101 },
+		{ 359, 0b010101 },  { 399, 0b100101 },	{ 449, 0b000110 },
+		{ 499, 0b010110 },  { 549, 0b000111 },	{ 599, 0b010111 },
+		{ 649, 0b001000 },  { 699, 0b011000 },	{ 749, 0b001001 },
+		{ 799, 0b011001 },  { 849, 0b101001 },	{ 899, 0b111001 },
+		{ 949, 0b001010 },  { 999, 0b011010 },	{ 1049, 0b101010 },
+		{ 1099, 0b111010 }, { 1149, 0b001011 }, { 1199, 0b011011 },
+		{ 1249, 0b101011 }, { 1299, 0b111011 }, { 1349, 0b001100 },
+		{ 1399, 0b011100 }, { 1449, 0b101100 }, { 1500, 0b111100 },
+	};
+	unsigned int i = 0;
+
+	if (mbps < 80 || mbps > 1500)
+		dphy_err(dphy, "DPHY: Datarate %u Mbps out of range\n", mbps);
+
+	/* First row whose limit is >= mbps; else fall back to the last one. */
+	while (i < ARRAY_SIZE(hsfreqrange_table) - 1 &&
+	       mbps > hsfreqrange_table[i][0])
+		i++;
+
+	dphy_transaction(dphy, DPHY_HS_RX_CTRL_LANE0_OFFSET,
+			 hsfreqrange_table[i][1] << 1);
+}
+
+static void dphy_init(struct dphy_data *dphy)
+{
+	dw_csi2_host_write(dphy, DPHY_PHY_RSTZ, 0);
+	dw_csi2_host_write(dphy, DPHY_PHY_SHUTDOWNZ, 0);
+	set_tstclk(dphy, 1);
+	set_testen(dphy, 0);
+	set_tstclr(dphy, 1);
+	usleep_range(15, 20);
+	set_tstclr(dphy, 0);
+	usleep_range(15, 20);
+
+	dphy_set_hsfreqrange(dphy, dphy->dphy_rate);
+
+	usleep_range(5, 10);
+	dw_csi2_host_write(dphy, DPHY_PHY_SHUTDOWNZ, 1);
+	usleep_range(5, 10);
+	dw_csi2_host_write(dphy, DPHY_PHY_RSTZ, 1);
+}
+
+void dphy_start(struct dphy_data *dphy)
+{
+	dphy_dbg(dphy, "%s: Link rate %u Mbps, %u data lanes\n", __func__,
+		 dphy->dphy_rate, dphy->active_lanes);
+
+	dw_csi2_host_write(dphy, DPHY_N_LANES, (dphy->active_lanes - 1));
+	dphy_init(dphy);
+	dw_csi2_host_write(dphy, DPHY_RESETN, 0xffffffff);
+	usleep_range(10, 50);
+}
+
+void dphy_stop(struct dphy_data *dphy)
+{
+	dphy_dbg(dphy, "%s\n", __func__);
+
+	/* Set only one lane (lane 0) as active (ON) */
+	dw_csi2_host_write(dphy, DPHY_N_LANES, 0);
+	dw_csi2_host_write(dphy, DPHY_RESETN, 0);
+}
+
+/*
+ * Read DPHY_VERSION and log the host version. Each of the three upper bytes
+ * is converted to a number by subtracting '0' (major, minor tens, minor ones).
+ */
+void dphy_probe(struct dphy_data *dphy)
+{
+	const u32 host_ver = dw_csi2_host_read(dphy, DPHY_VERSION);
+	const u8 major = (u8)((host_ver >> 24) - '0');
+	const u8 tens = (u8)((host_ver >> 16) - '0');
+	const u8 ones = (u8)((host_ver >> 8) - '0');
+	const u8 minor = (u8)(tens * 10 + ones);
+
+	dphy_dbg(dphy, "DW dphy Host HW v%u.%u\n", major, minor);
+}
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/dphy.h b/drivers/media/platform/raspberrypi/rp1-cfe/dphy.h
new file mode 100644
index 000000000000..84fa370957cc
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/dphy.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Raspberry Pi Ltd.
+ * Copyright (c) 2023-2024 Ideas on Board Oy
+ */
+
+#ifndef _RP1_DPHY_
+#define _RP1_DPHY_
+
+#include <linux/io.h>
+#include <linux/types.h>
+
+struct dphy_data {
+	struct device *dev;
+
+	void __iomem *base;
+
+	u32 dphy_rate;
+	u32 max_lanes;
+	u32 active_lanes;
+};
+
+void dphy_probe(struct dphy_data *dphy);
+void dphy_start(struct dphy_data *dphy);
+void dphy_stop(struct dphy_data *dphy);
+
+#endif
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/pisp-fe.c b/drivers/media/platform/raspberrypi/rp1-cfe/pisp-fe.c
new file mode 100644
index 000000000000..05762b1be2bc
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/pisp-fe.c
@@ -0,0 +1,605 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PiSP Front End Driver
+ *
+ * Copyright (c) 2021-2024 Raspberry Pi Ltd.
+ */
+
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/moduleparam.h>
+#include <linux/pm_runtime.h>
+#include <linux/seq_file.h>
+
+#include <media/videobuf2-dma-contig.h>
+
+#include "cfe.h"
+#include "pisp-fe.h"
+
+#include "cfe-trace.h"
+
+#define FE_VERSION		0x000
+#define FE_CONTROL		0x004
+#define FE_STATUS		0x008
+#define FE_FRAME_STATUS		0x00c
+#define FE_ERROR_STATUS		0x010
+#define FE_OUTPUT_STATUS	0x014
+#define FE_INT_EN		0x018
+#define FE_INT_STATUS		0x01c
+
+/* CONTROL */
+#define FE_CONTROL_QUEUE	BIT(0)
+#define FE_CONTROL_ABORT	BIT(1)
+#define FE_CONTROL_RESET	BIT(2)
+#define FE_CONTROL_LATCH_REGS	BIT(3)
+
+/* INT_EN / INT_STATUS */
+#define FE_INT_EOF		BIT(0)
+#define FE_INT_SOF		BIT(1)
+#define FE_INT_LINES0		BIT(8)
+#define FE_INT_LINES1		BIT(9)
+#define FE_INT_STATS		BIT(16)
+#define FE_INT_QREADY		BIT(24)
+
+/* STATUS */
+#define FE_STATUS_QUEUED	BIT(0)
+#define FE_STATUS_WAITING	BIT(1)
+#define FE_STATUS_ACTIVE	BIT(2)
+
+#define PISP_FE_CONFIG_BASE_OFFSET	0x0040
+
+#define PISP_FE_ENABLE_STATS_CLUSTER \
+	(PISP_FE_ENABLE_STATS_CROP | PISP_FE_ENABLE_DECIMATE    | \
+	 PISP_FE_ENABLE_BLC        | PISP_FE_ENABLE_CDAF_STATS  | \
+	 PISP_FE_ENABLE_AWB_STATS  | PISP_FE_ENABLE_RGBY        | \
+	 PISP_FE_ENABLE_LSC        | PISP_FE_ENABLE_AGC_STATS)
+
+#define PISP_FE_ENABLE_OUTPUT_CLUSTER(i)				\
+	((PISP_FE_ENABLE_CROP0     | PISP_FE_ENABLE_DOWNSCALE0 |	\
+	  PISP_FE_ENABLE_COMPRESS0 | PISP_FE_ENABLE_OUTPUT0) << (4 * (i)))
+
+struct pisp_fe_config_param {
+	u32 dirty_flags;	/* mask matched against pisp_fe_config.dirty_flags */
+	u32 dirty_flags_extra;	/* mask matched against .dirty_flags_extra */
+	size_t offset;		/* byte offset of the block in struct pisp_fe_config */
+	size_t size;		/* byte size of the block */
+};
+
+static const struct pisp_fe_config_param pisp_fe_config_map[] = {
+	/* Blocks selected by cfg->dirty_flags_extra (PISP_FE_DIRTY_* bits) */
+	{ 0, PISP_FE_DIRTY_GLOBAL,
+		offsetof(struct pisp_fe_config, global),
+		sizeof(struct pisp_fe_global_config) },
+	{ 0, PISP_FE_DIRTY_FLOATING,
+		offsetof(struct pisp_fe_config, floating_stats),
+		sizeof(struct pisp_fe_floating_stats_config) },
+	{ 0, PISP_FE_DIRTY_OUTPUT_AXI,
+		offsetof(struct pisp_fe_config, output_axi),
+		sizeof(struct pisp_fe_output_axi_config) },
+	/* Blocks selected by cfg->dirty_flags (PISP_FE_ENABLE_* bits) */
+	{ PISP_FE_ENABLE_INPUT, 0,
+		offsetof(struct pisp_fe_config, input),
+		sizeof(struct pisp_fe_input_config) },
+	{ PISP_FE_ENABLE_DECOMPRESS, 0,
+		offsetof(struct pisp_fe_config, decompress),
+		sizeof(struct pisp_decompress_config) },
+	{ PISP_FE_ENABLE_DECOMPAND, 0,
+		offsetof(struct pisp_fe_config, decompand),
+		sizeof(struct pisp_fe_decompand_config) },
+	{ PISP_FE_ENABLE_BLA, 0,
+		offsetof(struct pisp_fe_config, bla),
+		sizeof(struct pisp_bla_config) },
+	{ PISP_FE_ENABLE_DPC, 0,
+		offsetof(struct pisp_fe_config, dpc),
+		sizeof(struct pisp_fe_dpc_config) },
+	{ PISP_FE_ENABLE_STATS_CROP, 0,
+		offsetof(struct pisp_fe_config, stats_crop),
+		sizeof(struct pisp_fe_crop_config) },
+	{ PISP_FE_ENABLE_BLC, 0,
+		offsetof(struct pisp_fe_config, blc),
+		sizeof(struct pisp_bla_config) },
+	{ PISP_FE_ENABLE_CDAF_STATS, 0,
+		offsetof(struct pisp_fe_config, cdaf_stats),
+		sizeof(struct pisp_fe_cdaf_stats_config) },
+	{ PISP_FE_ENABLE_AWB_STATS, 0,
+		offsetof(struct pisp_fe_config, awb_stats),
+		sizeof(struct pisp_fe_awb_stats_config) },
+	{ PISP_FE_ENABLE_RGBY, 0,
+		offsetof(struct pisp_fe_config, rgby),
+		sizeof(struct pisp_fe_rgby_config) },
+	{ PISP_FE_ENABLE_LSC, 0,
+		offsetof(struct pisp_fe_config, lsc),
+		sizeof(struct pisp_fe_lsc_config) },
+	{ PISP_FE_ENABLE_AGC_STATS, 0,
+		offsetof(struct pisp_fe_config, agc_stats),
+		sizeof(struct pisp_fe_agc_stats_config) }, /* config, not output stats */
+	{ PISP_FE_ENABLE_CROP0, 0,
+		offsetof(struct pisp_fe_config, ch[0].crop),
+		sizeof(struct pisp_fe_crop_config) },
+	{ PISP_FE_ENABLE_DOWNSCALE0, 0,
+		offsetof(struct pisp_fe_config, ch[0].downscale),
+		sizeof(struct pisp_fe_downscale_config) },
+	{ PISP_FE_ENABLE_COMPRESS0, 0,
+		offsetof(struct pisp_fe_config, ch[0].compress),
+		sizeof(struct pisp_compress_config) },
+	{ PISP_FE_ENABLE_OUTPUT0, 0,
+		offsetof(struct pisp_fe_config, ch[0].output),
+		sizeof(struct pisp_fe_output_config) },
+	{ PISP_FE_ENABLE_CROP1, 0,
+		offsetof(struct pisp_fe_config, ch[1].crop),
+		sizeof(struct pisp_fe_crop_config) },
+	{ PISP_FE_ENABLE_DOWNSCALE1, 0,
+		offsetof(struct pisp_fe_config, ch[1].downscale),
+		sizeof(struct pisp_fe_downscale_config) },
+	{ PISP_FE_ENABLE_COMPRESS1, 0,
+		offsetof(struct pisp_fe_config, ch[1].compress),
+		sizeof(struct pisp_compress_config) },
+	{ PISP_FE_ENABLE_OUTPUT1, 0,
+		offsetof(struct pisp_fe_config, ch[1].output),
+		sizeof(struct pisp_fe_output_config) },
+};
+
+#define pisp_fe_dbg(fe, fmt, arg...) dev_dbg((fe)->v4l2_dev->dev, fmt, ##arg)
+#define pisp_fe_info(fe, fmt, arg...) dev_info((fe)->v4l2_dev->dev, fmt, ##arg)
+#define pisp_fe_err(fe, fmt, arg...) dev_err((fe)->v4l2_dev->dev, fmt, ##arg)
+
+static inline u32 pisp_fe_reg_read(struct pisp_fe_device *fe, u32 offset)
+{
+	return readl(fe->base + offset);
+}
+
+static inline void pisp_fe_reg_write(struct pisp_fe_device *fe, u32 offset,
+				     u32 val)
+{
+	writel(val, fe->base + offset);
+}
+
+static inline void pisp_fe_reg_write_relaxed(struct pisp_fe_device *fe,
+					     u32 offset, u32 val)
+{
+	writel_relaxed(val, fe->base + offset);
+}
+
+static int pisp_fe_regs_show(struct seq_file *s, void *data)
+{
+	struct pisp_fe_device *fe = s->private;
+	int ret;
+	/* Show callback of the "fe_regs" debugfs file: dump the FE registers */
+	ret = pm_runtime_resume_and_get(fe->v4l2_dev->dev);
+	if (ret)
+		return ret;
+	/* Same latch request that pisp_fe_isr() issues before reading status */
+	pisp_fe_reg_write(fe, FE_CONTROL, FE_CONTROL_LATCH_REGS);
+
+#define DUMP(reg) seq_printf(s, #reg " \t0x%08x\n", pisp_fe_reg_read(fe, reg))
+	DUMP(FE_VERSION);
+	DUMP(FE_CONTROL);
+	DUMP(FE_STATUS);
+	DUMP(FE_FRAME_STATUS);
+	DUMP(FE_ERROR_STATUS);
+	DUMP(FE_OUTPUT_STATUS);
+	DUMP(FE_INT_EN);
+	DUMP(FE_INT_STATUS);
+#undef DUMP
+
+	pm_runtime_put(fe->v4l2_dev->dev);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(pisp_fe_regs);
+
+static void pisp_fe_config_write(struct pisp_fe_device *fe,
+				 struct pisp_fe_config *config,
+				 unsigned int start_offset, unsigned int size)
+{
+	const unsigned int max_offset =
+		offsetof(struct pisp_fe_config, ch[PISP_FE_NUM_OUTPUTS]);
+	unsigned int end_offset;
+	u32 *cfg = (u32 *)config;
+	/* Clamp the byte range to end at ch[]: later fields are never written */
+	start_offset = min(start_offset, max_offset);
+	end_offset = min(start_offset + size, max_offset);
+	/* Offsets are in bytes, cfg is a 32-bit word pointer, hence the >> 2 */
+	cfg += start_offset >> 2;
+	for (unsigned int i = start_offset; i < end_offset; i += 4, cfg++)
+		pisp_fe_reg_write_relaxed(fe, PISP_FE_CONFIG_BASE_OFFSET + i,
+					  *cfg);
+}
+
+void pisp_fe_isr(struct pisp_fe_device *fe, bool *sof, bool *eof)
+{
+	u32 status, int_status, out_status, frame_status, error_status;
+	/* Latch, then snapshot the status registers (used only for tracing) */
+	pisp_fe_reg_write(fe, FE_CONTROL, FE_CONTROL_LATCH_REGS);
+	status = pisp_fe_reg_read(fe, FE_STATUS);
+	out_status = pisp_fe_reg_read(fe, FE_OUTPUT_STATUS);
+	frame_status = pisp_fe_reg_read(fe, FE_FRAME_STATUS);
+	error_status = pisp_fe_reg_read(fe, FE_ERROR_STATUS);
+	/* Write back what was read; presumably write-1-to-clear - TODO confirm */
+	int_status = pisp_fe_reg_read(fe, FE_INT_STATUS);
+	pisp_fe_reg_write(fe, FE_INT_STATUS, int_status);
+
+	trace_fe_irq(status, out_status, frame_status, error_status,
+		     int_status);
+
+	/* We do not report interrupts for the input/stream pad. */
+	for (unsigned int i = 0; i < FE_NUM_PADS - 1; i++) {
+		sof[i] = !!(int_status & FE_INT_SOF); /* same flag for every entry */
+		eof[i] = !!(int_status & FE_INT_EOF);
+	}
+}
+
+static bool pisp_fe_validate_output(struct pisp_fe_config const *cfg,
+				    unsigned int c, struct v4l2_format const *f)
+{
+	unsigned int wbytes;
+	/* Line width in bytes: doubled when any PISP_IMAGE_FORMAT_BPS_MASK bit is set */
+	wbytes = cfg->ch[c].output.format.width;
+	if (cfg->ch[c].output.format.format & PISP_IMAGE_FORMAT_BPS_MASK)
+		wbytes *= 2;
+
+	/* Check output image is at least 2x2 and fits the V4L2 format f */
+	if (cfg->ch[c].output.format.width < 2 ||
+	    cfg->ch[c].output.format.height < 2 ||
+	    cfg->ch[c].output.format.height > f->fmt.pix.height ||
+	    cfg->ch[c].output.format.stride > f->fmt.pix.bytesperline ||
+	    wbytes > f->fmt.pix.bytesperline)
+		return false;
+
+	/* Check for zero-sized crops, which could cause lockup */
+	if ((cfg->global.enables & PISP_FE_ENABLE_CROP(c)) &&
+	    ((cfg->ch[c].crop.offset_x >= (cfg->input.format.width & ~1) ||
+	      cfg->ch[c].crop.offset_y >= cfg->input.format.height ||
+	      cfg->ch[c].crop.width < 2 || cfg->ch[c].crop.height < 2)))
+		return false;
+	/* An enabled downscaler must also produce at least 2x2 */
+	if ((cfg->global.enables & PISP_FE_ENABLE_DOWNSCALE(c)) &&
+	    (cfg->ch[c].downscale.output_width < 2 ||
+	     cfg->ch[c].downscale.output_height < 2))
+		return false;
+
+	return true;
+}
+
+static bool pisp_fe_validate_stats(struct pisp_fe_config const *cfg)
+{
+	/* Zero-sized crop could cause lockup: OK if disabled or in-frame and >= 2x2 */
+	return (!(cfg->global.enables & PISP_FE_ENABLE_STATS_CROP) ||
+		(cfg->stats_crop.offset_x < (cfg->input.format.width & ~1) &&
+		 cfg->stats_crop.offset_y < cfg->input.format.height &&
+		 cfg->stats_crop.width >= 2 && cfg->stats_crop.height >= 2));
+}
+
+int pisp_fe_validate_config(struct pisp_fe_device *fe,
+			    struct pisp_fe_config *cfg,
+			    struct v4l2_format const *f0,
+			    struct v4l2_format const *f1)
+{
+	/*
+	 * Returns 0 if cfg is usable, -EINVAL otherwise. The input must be
+	 * enabled, streaming and at least 2x2 to avoid cases where the hardware
+	 * might lock up or read inputs from memory (unsupported by this driver).
+	 */
+	if (!(cfg->global.enables & PISP_FE_ENABLE_INPUT) ||
+	    cfg->input.streaming != 1 || cfg->input.format.width < 2 ||
+	    cfg->input.format.height < 2) {
+		pisp_fe_err(fe, "%s: Input config not valid\n", __func__);
+		return -EINVAL;
+	}
+
+	for (unsigned int i = 0; i < PISP_FE_NUM_OUTPUTS; i++) {
+		if (!(cfg->global.enables & PISP_FE_ENABLE_OUTPUT(i))) {
+			if (cfg->global.enables &
+					PISP_FE_ENABLE_OUTPUT_CLUSTER(i)) {
+				pisp_fe_err(fe, "%s: Output %u not valid\n",
+					    __func__, i);
+				return -EINVAL;
+			}
+			continue; /* whole output cluster is off: nothing to check */
+		}
+		/* Output i is validated against f0 (i == 0) or f1 (i == 1) */
+		if (!pisp_fe_validate_output(cfg, i, i ? f1 : f0))
+			return -EINVAL;
+	}
+
+	if ((cfg->global.enables & PISP_FE_ENABLE_STATS_CLUSTER) &&
+	    !pisp_fe_validate_stats(cfg)) {
+		pisp_fe_err(fe, "%s: Stats config not valid\n", __func__);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void pisp_fe_submit_job(struct pisp_fe_device *fe, struct vb2_buffer **vb2_bufs,
+			struct pisp_fe_config *cfg)
+{
+	u64 addr;
+	u32 status;
+	/* Patch buffer addresses into cfg, write it to the FE, then queue it */
+	/*
+	 * Store each enabled output's buffer DMA address in cfg. NOTE(review): buf
+	 * is not NULL-checked and nothing is disabled here - confirm caller checks.
+	 */
+	for (unsigned int i = 0; i < PISP_FE_NUM_OUTPUTS; i++) {
+		struct vb2_buffer *buf = vb2_bufs[FE_OUTPUT0_PAD + i];
+
+		if (!(cfg->global.enables & PISP_FE_ENABLE_OUTPUT(i)))
+			continue;
+
+		addr = vb2_dma_contig_plane_dma_addr(buf, 0);
+		cfg->output_buffer[i].addr_lo = addr & 0xffffffff;
+		cfg->output_buffer[i].addr_hi = addr >> 32;
+	}
+
+	if (vb2_bufs[FE_STATS_PAD]) {
+		addr = vb2_dma_contig_plane_dma_addr(vb2_bufs[FE_STATS_PAD], 0);
+		cfg->stats_buffer.addr_lo = addr & 0xffffffff;
+		cfg->stats_buffer.addr_hi = addr >> 32;
+	}
+
+	/* ILINES interrupt 3/4 of the way down each output, at least line 0x80 */
+	cfg->ch[0].output.ilines =
+		max(0x80u, (3u * cfg->ch[0].output.format.height) >> 2);
+	cfg->ch[1].output.ilines =
+		max(0x80u, (3u * cfg->ch[1].output.format.height) >> 2);
+
+	/*
+	 * The hardware must have consumed the previous config by now.
+	 * This read of status also serves as a memory barrier before the
+	 * sequence of relaxed writes which follow.
+	 */
+	status = pisp_fe_reg_read(fe, FE_STATUS);
+	if (WARN_ON(status & FE_STATUS_QUEUED))
+		return;
+
+	/*
+	 * Unconditionally write buffers, global and input parameters.
+	 * Write cropping and output parameters whenever they are enabled.
+	 * Selectively write other parameters that have been marked as
+	 * changed through the dirty flags.
+	 */
+	pisp_fe_config_write(fe, cfg, 0,
+			     offsetof(struct pisp_fe_config, decompress));
+	cfg->dirty_flags_extra &= ~PISP_FE_DIRTY_GLOBAL;
+	cfg->dirty_flags &= ~PISP_FE_ENABLE_INPUT;
+	cfg->dirty_flags |= (cfg->global.enables &
+			     (PISP_FE_ENABLE_STATS_CROP        |
+			      PISP_FE_ENABLE_OUTPUT_CLUSTER(0) |
+			      PISP_FE_ENABLE_OUTPUT_CLUSTER(1)));
+	for (unsigned int i = 0; i < ARRAY_SIZE(pisp_fe_config_map); i++) {
+		const struct pisp_fe_config_param *p = &pisp_fe_config_map[i];
+
+		if (cfg->dirty_flags & p->dirty_flags ||
+		    cfg->dirty_flags_extra & p->dirty_flags_extra)
+			pisp_fe_config_write(fe, cfg, p->offset, p->size);
+	}
+
+	/* This final non-relaxed write serves as a memory barrier */
+	pisp_fe_reg_write(fe, FE_CONTROL, FE_CONTROL_QUEUE);
+}
+
+void pisp_fe_start(struct pisp_fe_device *fe)
+{
+	pisp_fe_reg_write(fe, FE_CONTROL, FE_CONTROL_RESET);
+	pisp_fe_reg_write(fe, FE_INT_STATUS, ~0); /* presumably clears pending IRQs */
+	pisp_fe_reg_write(fe, FE_INT_EN, FE_INT_EOF | FE_INT_SOF |
+					 FE_INT_LINES0 | FE_INT_LINES1);
+	fe->inframe_count = 0;
+}
+
+void pisp_fe_stop(struct pisp_fe_device *fe)
+{
+	pisp_fe_reg_write(fe, FE_INT_EN, 0); /* mask all FE interrupts first */
+	pisp_fe_reg_write(fe, FE_CONTROL, FE_CONTROL_ABORT);
+	usleep_range(1000, 2000);
+	WARN_ON(pisp_fe_reg_read(fe, FE_STATUS)); /* expect idle (STATUS == 0) */
+	pisp_fe_reg_write(fe, FE_INT_STATUS, ~0);
+}
+
+static int pisp_fe_init_state(struct v4l2_subdev *sd,
+			      struct v4l2_subdev_state *state)
+{
+	struct v4l2_mbus_framefmt *fmt;
+	/* Stream and output pads all start as cfe_default_format with SRGGB16 */
+	fmt = v4l2_subdev_state_get_format(state, FE_STREAM_PAD);
+	*fmt = cfe_default_format;
+	fmt->code = MEDIA_BUS_FMT_SRGGB16_1X16;
+
+	fmt = v4l2_subdev_state_get_format(state, FE_CONFIG_PAD);
+	fmt->code = MEDIA_BUS_FMT_FIXED;
+	fmt->width = sizeof(struct pisp_fe_config); /* size in bytes, one "line" */
+	fmt->height = 1;
+
+	fmt = v4l2_subdev_state_get_format(state, FE_OUTPUT0_PAD);
+	*fmt = cfe_default_format;
+	fmt->code = MEDIA_BUS_FMT_SRGGB16_1X16;
+
+	fmt = v4l2_subdev_state_get_format(state, FE_OUTPUT1_PAD);
+	*fmt = cfe_default_format;
+	fmt->code = MEDIA_BUS_FMT_SRGGB16_1X16;
+
+	fmt = v4l2_subdev_state_get_format(state, FE_STATS_PAD);
+	fmt->code = MEDIA_BUS_FMT_FIXED;
+	fmt->width = sizeof(struct pisp_statistics); /* size in bytes, one "line" */
+	fmt->height = 1;
+
+	return 0;
+}
+
+static int pisp_fe_pad_set_fmt(struct v4l2_subdev *sd,
+			       struct v4l2_subdev_state *state,
+			       struct v4l2_subdev_format *format)
+{
+	struct v4l2_mbus_framefmt *fmt;
+	const struct cfe_fmt *cfe_fmt;
+	/* Codes without CFE_FORMAT_FLAG_FE_OUT fall back to SRGGB16_1X16 */
+	/* TODO: format propagation to source pads */
+	/* TODO: format validation */
+
+	switch (format->pad) {
+	case FE_STREAM_PAD:
+		cfe_fmt = find_format_by_code(format->format.code);
+		if (!cfe_fmt || !(cfe_fmt->flags & CFE_FORMAT_FLAG_FE_OUT))
+			cfe_fmt = find_format_by_code(MEDIA_BUS_FMT_SRGGB16_1X16);
+
+		format->format.code = cfe_fmt->code;
+		format->format.field = V4L2_FIELD_NONE;
+		/* The sink format is copied verbatim to both output pads */
+		fmt = v4l2_subdev_state_get_format(state, FE_STREAM_PAD);
+		*fmt = format->format;
+
+		fmt = v4l2_subdev_state_get_format(state, FE_OUTPUT0_PAD);
+		*fmt = format->format;
+
+		fmt = v4l2_subdev_state_get_format(state, FE_OUTPUT1_PAD);
+		*fmt = format->format;
+
+		return 0;
+
+	case FE_OUTPUT0_PAD:
+	case FE_OUTPUT1_PAD: {
+		/*
+		 * TODO: we should allow scaling and cropping by allowing the
+		 * user to set the size here.
+		 */
+		struct v4l2_mbus_framefmt *sink_fmt, *source_fmt;
+		u32 sink_code;
+		u32 code;
+
+		cfe_fmt = find_format_by_code(format->format.code);
+		if (!cfe_fmt || !(cfe_fmt->flags & CFE_FORMAT_FLAG_FE_OUT))
+			cfe_fmt = find_format_by_code(MEDIA_BUS_FMT_SRGGB16_1X16);
+
+		format->format.code = cfe_fmt->code;
+
+		sink_fmt = v4l2_subdev_state_get_format(state, FE_STREAM_PAD);
+		if (!sink_fmt)
+			return -EINVAL;
+
+		source_fmt = v4l2_subdev_state_get_format(state, format->pad);
+		if (!source_fmt)
+			return -EINVAL;
+
+		sink_code = sink_fmt->code;
+		code = format->format.code;
+
+		/*
+		 * If the source code from the user does not match the code in
+		 * the sink pad, check that the source code matches the
+		 * compressed version of the sink code.
+		 */
+		/* NOTE(review): any other code (even sink_code) changes nothing */
+		if (code != sink_code &&
+		    code == cfe_find_compressed_code(sink_code))
+			source_fmt->code = code;
+
+		return 0; /* NOTE(review): *format is not refreshed from source_fmt */
+	}
+
+	case FE_CONFIG_PAD:
+	case FE_STATS_PAD:
+	default:
+		return v4l2_subdev_get_fmt(sd, state, format); /* fixed: report only */
+	}
+}
+
+static const struct v4l2_subdev_pad_ops pisp_fe_subdev_pad_ops = {
+	.get_fmt = v4l2_subdev_get_fmt,
+	.set_fmt = pisp_fe_pad_set_fmt,
+	.link_validate = v4l2_subdev_link_validate_default,
+};
+
+static int pisp_fe_link_validate(struct media_link *link)
+{
+	struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(link->sink->entity);
+	struct pisp_fe_device *fe = container_of(sd, struct pisp_fe_device, sd);
+
+	pisp_fe_dbg(fe, "%s: link \"%s\":%u -> \"%s\":%u\n", __func__,
+		    link->source->entity->name, link->source->index,
+		    link->sink->entity->name, link->sink->index);
+	/* Stream pad: standard subdev validation; config pad: always accepted */
+	if (link->sink->index == FE_STREAM_PAD)
+		return v4l2_subdev_link_validate(link);
+
+	if (link->sink->index == FE_CONFIG_PAD)
+		return 0;
+
+	return -EINVAL; /* any other sink pad index is rejected */
+}
+
+static const struct media_entity_operations pisp_fe_entity_ops = {
+	.link_validate = pisp_fe_link_validate,
+};
+
+static const struct v4l2_subdev_ops pisp_fe_subdev_ops = {
+	.pad = &pisp_fe_subdev_pad_ops,
+};
+
+static const struct v4l2_subdev_internal_ops pisp_fe_internal_ops = {
+	.init_state = pisp_fe_init_state,
+};
+
+int pisp_fe_init(struct pisp_fe_device *fe, struct dentry *debugfs)
+{
+	int ret;
+	/* Expose pisp_fe_regs_show() as the read-only debugfs file "fe_regs" */
+	debugfs_create_file("fe_regs", 0440, debugfs, fe, &pisp_fe_regs_fops);
+
+	fe->hw_revision = pisp_fe_reg_read(fe, FE_VERSION);
+	pisp_fe_info(fe, "PiSP FE HW v%u.%u\n",
+		     (fe->hw_revision >> 24) & 0xff, /* printed major: bits 31:24 */
+		     (fe->hw_revision >> 20) & 0x0f); /* printed minor: bits 23:20 */
+
+	fe->pad[FE_STREAM_PAD].flags =
+		MEDIA_PAD_FL_SINK | MEDIA_PAD_FL_MUST_CONNECT;
+	fe->pad[FE_CONFIG_PAD].flags = MEDIA_PAD_FL_SINK;
+	fe->pad[FE_OUTPUT0_PAD].flags = MEDIA_PAD_FL_SOURCE;
+	fe->pad[FE_OUTPUT1_PAD].flags = MEDIA_PAD_FL_SOURCE;
+	fe->pad[FE_STATS_PAD].flags = MEDIA_PAD_FL_SOURCE;
+
+	ret = media_entity_pads_init(&fe->sd.entity, ARRAY_SIZE(fe->pad),
+				     fe->pad);
+	if (ret)
+		return ret;
+
+	/* Initialize subdev */
+	v4l2_subdev_init(&fe->sd, &pisp_fe_subdev_ops);
+	fe->sd.internal_ops = &pisp_fe_internal_ops;
+	fe->sd.entity.function = MEDIA_ENT_F_PROC_VIDEO_SCALER;
+	fe->sd.entity.ops = &pisp_fe_entity_ops;
+	fe->sd.entity.name = "pisp-fe";
+	fe->sd.flags = V4L2_SUBDEV_FL_HAS_DEVNODE;
+	fe->sd.owner = THIS_MODULE;
+	snprintf(fe->sd.name, sizeof(fe->sd.name), "pisp-fe");
+
+	ret = v4l2_subdev_init_finalize(&fe->sd);
+	if (ret)
+		goto err_entity_cleanup;
+
+	ret = v4l2_device_register_subdev(fe->v4l2_dev, &fe->sd);
+	if (ret) {
+		pisp_fe_err(fe, "Failed register pisp fe subdev (%d)\n", ret);
+		goto err_subdev_cleanup;
+	}
+
+	/* Must be in IDLE state (STATUS == 0) here. */
+	WARN_ON(pisp_fe_reg_read(fe, FE_STATUS));
+
+	return 0;
+	/* Unwind in reverse order of the setup steps above */
+err_subdev_cleanup:
+	v4l2_subdev_cleanup(&fe->sd);
+err_entity_cleanup:
+	media_entity_cleanup(&fe->sd.entity);
+
+	return ret;
+}
+
+void pisp_fe_uninit(struct pisp_fe_device *fe)
+{
+	v4l2_device_unregister_subdev(&fe->sd); /* reverse order of pisp_fe_init() */
+	v4l2_subdev_cleanup(&fe->sd);
+	media_entity_cleanup(&fe->sd.entity);
+}
diff --git a/drivers/media/platform/raspberrypi/rp1-cfe/pisp-fe.h b/drivers/media/platform/raspberrypi/rp1-cfe/pisp-fe.h
new file mode 100644
index 000000000000..54d506e19cf2
--- /dev/null
+++ b/drivers/media/platform/raspberrypi/rp1-cfe/pisp-fe.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PiSP Front End Driver
+ *
+ * Copyright (c) 2021-2024 Raspberry Pi Ltd.
+ */
+#ifndef _PISP_FE_H_
+#define _PISP_FE_H_
+
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/types.h>
+#include <linux/videodev2.h>
+
+#include <media/media-device.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-subdev.h>
+
+#include <linux/media/raspberrypi/pisp_fe_config.h>
+
+enum pisp_fe_pads {
+	FE_STREAM_PAD,		/* sink, must be connected (see pisp_fe_init()) */
+	FE_CONFIG_PAD,		/* sink */
+	FE_OUTPUT0_PAD,		/* source */
+	FE_OUTPUT1_PAD,		/* source */
+	FE_STATS_PAD,		/* source */
+	FE_NUM_PADS
+};
+
+struct pisp_fe_device {
+	/* Parent V4l2 device */
+	struct v4l2_device *v4l2_dev;
+	void __iomem *base;	/* MMIO base for the pisp_fe_reg_*() accessors */
+	u32 hw_revision;	/* raw FE_VERSION value, read in pisp_fe_init() */
+
+	u16 inframe_count;	/* reset to 0 by pisp_fe_start() */
+	struct media_pad pad[FE_NUM_PADS];
+	struct v4l2_subdev sd;
+};
+
+void pisp_fe_isr(struct pisp_fe_device *fe, bool *sof, bool *eof);
+int pisp_fe_validate_config(struct pisp_fe_device *fe,
+			    struct pisp_fe_config *cfg,
+			    struct v4l2_format const *f0,
+			    struct v4l2_format const *f1);
+void pisp_fe_submit_job(struct pisp_fe_device *fe, struct vb2_buffer **vb2_bufs,
+			struct pisp_fe_config *cfg);
+void pisp_fe_start(struct pisp_fe_device *fe);
+void pisp_fe_stop(struct pisp_fe_device *fe);
+int pisp_fe_init(struct pisp_fe_device *fe, struct dentry *debugfs);
+void pisp_fe_uninit(struct pisp_fe_device *fe);
+
+#endif
diff --git a/include/uapi/linux/media/raspberrypi/pisp_fe_config.h b/include/uapi/linux/media/raspberrypi/pisp_fe_config.h
new file mode 100644
index 000000000000..77237460a3b5
--- /dev/null
+++ b/include/uapi/linux/media/raspberrypi/pisp_fe_config.h
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+/*
+ * RP1 PiSP Front End Driver Configuration structures
+ *
+ * Copyright (C) 2021 - Raspberry Pi Ltd.
+ *
+ */
+#ifndef _UAPI_PISP_FE_CONFIG_
+#define _UAPI_PISP_FE_CONFIG_
+
+#include <linux/types.h>
+
+#include "pisp_common.h"
+#include "pisp_fe_statistics.h"
+
+#define PISP_FE_NUM_OUTPUTS 2
+
+enum pisp_fe_enable {
+	PISP_FE_ENABLE_INPUT = 0x000001,
+	PISP_FE_ENABLE_DECOMPRESS = 0x000002,
+	PISP_FE_ENABLE_DECOMPAND = 0x000004,
+	PISP_FE_ENABLE_BLA = 0x000008,
+	PISP_FE_ENABLE_DPC = 0x000010,
+	PISP_FE_ENABLE_STATS_CROP = 0x000020,
+	PISP_FE_ENABLE_DECIMATE = 0x000040,
+	PISP_FE_ENABLE_BLC = 0x000080,
+	PISP_FE_ENABLE_CDAF_STATS = 0x000100,
+	PISP_FE_ENABLE_AWB_STATS = 0x000200,
+	PISP_FE_ENABLE_RGBY = 0x000400,
+	PISP_FE_ENABLE_LSC = 0x000800,
+	PISP_FE_ENABLE_AGC_STATS = 0x001000,
+	PISP_FE_ENABLE_CROP0 = 0x010000,
+	PISP_FE_ENABLE_DOWNSCALE0 = 0x020000,
+	PISP_FE_ENABLE_COMPRESS0 = 0x040000,
+	PISP_FE_ENABLE_OUTPUT0 = 0x080000,
+	PISP_FE_ENABLE_CROP1 = 0x100000,
+	PISP_FE_ENABLE_DOWNSCALE1 = 0x200000,
+	PISP_FE_ENABLE_COMPRESS1 = 0x400000,
+	PISP_FE_ENABLE_OUTPUT1 = 0x800000
+};
+
+#define PISP_FE_ENABLE_CROP(i) (PISP_FE_ENABLE_CROP0 << (4 * (i)))
+#define PISP_FE_ENABLE_DOWNSCALE(i) (PISP_FE_ENABLE_DOWNSCALE0 << (4 * (i)))
+#define PISP_FE_ENABLE_COMPRESS(i) (PISP_FE_ENABLE_COMPRESS0 << (4 * (i)))
+#define PISP_FE_ENABLE_OUTPUT(i) (PISP_FE_ENABLE_OUTPUT0 << (4 * (i)))
+
+/*
+ * We use the enable flags to show when blocks are "dirty", but we need some
+ * extra ones too.
+ */
+enum pisp_fe_dirty {
+	PISP_FE_DIRTY_GLOBAL = 0x0001,
+	PISP_FE_DIRTY_FLOATING = 0x0002,
+	PISP_FE_DIRTY_OUTPUT_AXI = 0x0004
+};
+
+struct pisp_fe_global_config {
+	__u32 enables;
+	__u8 bayer_order;
+	__u8 pad[3];
+} __attribute__((packed));
+
+struct pisp_fe_input_axi_config {
+	/* burst length minus one, in the range 0..15; OR'd with flags */
+	__u8 maxlen_flags;
+	/* { prot[2:0], cache[3:0] } fields */
+	__u8 cache_prot;
+	/* QoS (only 4 LS bits are used) */
+	__u16 qos;
+} __attribute__((packed));
+
+struct pisp_fe_output_axi_config {
+	/* burst length minus one, in the range 0..15; OR'd with flags */
+	__u8 maxlen_flags;
+	/* { prot[2:0], cache[3:0] } fields */
+	__u8 cache_prot;
+	/* QoS (4 bitfields of 4 bits each for different panic levels) */
+	__u16 qos;
+	/*  For Panic mode: Output FIFO panic threshold */
+	__u16 thresh;
+	/*  For Panic mode: Output FIFO statistics throttle threshold */
+	__u16 throttle;
+} __attribute__((packed));
+
+struct pisp_fe_input_config {
+	__u8 streaming;
+	__u8 pad[3];
+	struct pisp_image_format_config format;
+	struct pisp_fe_input_axi_config axi;
+	/* Extra cycles delay before issuing each burst request */
+	__u8 holdoff;
+	__u8 pad2[3];
+} __attribute__((packed));
+
+struct pisp_fe_output_config {
+	struct pisp_image_format_config format;
+	__u16 ilines;
+	__u8 pad[2];
+} __attribute__((packed));
+
+struct pisp_fe_input_buffer_config {
+	__u32 addr_lo;
+	__u32 addr_hi;
+	__u16 frame_id;
+	__u16 pad;
+} __attribute__((packed));
+
+#define PISP_FE_DECOMPAND_LUT_SIZE 65
+
+struct pisp_fe_decompand_config {
+	__u16 lut[PISP_FE_DECOMPAND_LUT_SIZE];
+	__u16 pad;
+} __attribute__((packed));
+
+struct pisp_fe_dpc_config {
+	__u8 coeff_level;
+	__u8 coeff_range;
+	__u8 coeff_range2;
+#define PISP_FE_DPC_FLAG_FOLDBACK 1
+#define PISP_FE_DPC_FLAG_VFLAG 2
+	__u8 flags;
+} __attribute__((packed));
+
+#define PISP_FE_LSC_LUT_SIZE 16
+
+struct pisp_fe_lsc_config {
+	__u8 shift;
+	__u8 pad0;
+	__u16 scale;
+	__u16 centre_x;
+	__u16 centre_y;
+	__u16 lut[PISP_FE_LSC_LUT_SIZE];
+} __attribute__((packed));
+
+struct pisp_fe_rgby_config {
+	__u16 gain_r;
+	__u16 gain_g;
+	__u16 gain_b;
+	__u8 maxflag;
+	__u8 pad;
+} __attribute__((packed));
+
+struct pisp_fe_agc_stats_config {
+	__u16 offset_x;
+	__u16 offset_y;
+	__u16 size_x;
+	__u16 size_y;
+	/* each weight only 4 bits */
+	__u8 weights[PISP_AGC_STATS_NUM_ZONES / 2];
+	__u16 row_offset_x;
+	__u16 row_offset_y;
+	__u16 row_size_x;
+	__u16 row_size_y;
+	__u8 row_shift;
+	__u8 float_shift;
+	__u8 pad1[2];
+} __attribute__((packed));
+
+struct pisp_fe_awb_stats_config {
+	__u16 offset_x;
+	__u16 offset_y;
+	__u16 size_x;
+	__u16 size_y;
+	__u8 shift;
+	__u8 pad[3];
+	__u16 r_lo;
+	__u16 r_hi;
+	__u16 g_lo;
+	__u16 g_hi;
+	__u16 b_lo;
+	__u16 b_hi;
+} __attribute__((packed));
+
+struct pisp_fe_floating_stats_region {
+	__u16 offset_x;
+	__u16 offset_y;
+	__u16 size_x;
+	__u16 size_y;
+} __attribute__((packed));
+
+struct pisp_fe_floating_stats_config {
+	struct pisp_fe_floating_stats_region
+		regions[PISP_FLOATING_STATS_NUM_ZONES];
+} __attribute__((packed));
+
+#define PISP_FE_CDAF_NUM_WEIGHTS 8
+
+struct pisp_fe_cdaf_stats_config {
+	__u16 noise_constant;
+	__u16 noise_slope;
+	__u16 offset_x;
+	__u16 offset_y;
+	__u16 size_x;
+	__u16 size_y;
+	__u16 skip_x;
+	__u16 skip_y;
+	__u32 mode;
+} __attribute__((packed));
+
+struct pisp_fe_stats_buffer_config {
+	__u32 addr_lo;
+	__u32 addr_hi;
+} __attribute__((packed));
+
+struct pisp_fe_crop_config {
+	__u16 offset_x;
+	__u16 offset_y;
+	__u16 width;
+	__u16 height;
+} __attribute__((packed));
+
+enum pisp_fe_downscale_flags {
+	/* downscale the four Bayer components independently... */
+	DOWNSCALE_BAYER = 1,
+	/* ...without trying to preserve their spatial relationship */
+	DOWNSCALE_BIN = 2,
+};
+
+struct pisp_fe_downscale_config {
+	__u8 xin;
+	__u8 xout;
+	__u8 yin;
+	__u8 yout;
+	__u8 flags; /* enum pisp_fe_downscale_flags */
+	__u8 pad[3];
+	__u16 output_width;
+	__u16 output_height;
+} __attribute__((packed));
+
+struct pisp_fe_output_buffer_config {
+	__u32 addr_lo;
+	__u32 addr_hi;
+} __attribute__((packed));
+
+/* Each of the two output channels/branches: */
+struct pisp_fe_output_branch_config {
+	struct pisp_fe_crop_config crop;
+	struct pisp_fe_downscale_config downscale;
+	struct pisp_compress_config compress;
+	struct pisp_fe_output_config output;
+	__u32 pad;
+} __attribute__((packed));
+
+/* And finally one to rule them all: */
+struct pisp_fe_config {
+	/* I/O configuration: */
+	struct pisp_fe_stats_buffer_config stats_buffer;
+	struct pisp_fe_output_buffer_config output_buffer[PISP_FE_NUM_OUTPUTS];
+	struct pisp_fe_input_buffer_config input_buffer;
+	/* processing configuration: */
+	struct pisp_fe_global_config global;
+	struct pisp_fe_input_config input;
+	struct pisp_decompress_config decompress;
+	struct pisp_fe_decompand_config decompand;
+	struct pisp_bla_config bla;
+	struct pisp_fe_dpc_config dpc;
+	struct pisp_fe_crop_config stats_crop;
+	__u32 spare1; /* placeholder for future decimate configuration */
+	struct pisp_bla_config blc;
+	struct pisp_fe_rgby_config rgby;
+	struct pisp_fe_lsc_config lsc;
+	struct pisp_fe_agc_stats_config agc_stats;
+	struct pisp_fe_awb_stats_config awb_stats;
+	struct pisp_fe_cdaf_stats_config cdaf_stats;
+	struct pisp_fe_floating_stats_config floating_stats;
+	struct pisp_fe_output_axi_config output_axi;
+	struct pisp_fe_output_branch_config ch[PISP_FE_NUM_OUTPUTS];
+	/* non-register fields: */
+	__u32 dirty_flags; /* these use pisp_fe_enable */
+	__u32 dirty_flags_extra; /* these use pisp_fe_dirty */
+} __attribute__((packed));
+
+#endif /* _UAPI_PISP_FE_CONFIG_ */
diff --git a/include/uapi/linux/media/raspberrypi/pisp_fe_statistics.h b/include/uapi/linux/media/raspberrypi/pisp_fe_statistics.h
new file mode 100644
index 000000000000..a7d42985aee8
--- /dev/null
+++ b/include/uapi/linux/media/raspberrypi/pisp_fe_statistics.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+/*
+ * RP1 PiSP Front End statistics definitions
+ *
+ * Copyright (C) 2021 - Raspberry Pi Ltd.
+ *
+ */
+#ifndef _UAPI_PISP_FE_STATISTICS_H_
+#define _UAPI_PISP_FE_STATISTICS_H_
+
+#include <linux/types.h>
+
+#define PISP_FLOATING_STATS_NUM_ZONES 4
+#define PISP_AGC_STATS_NUM_BINS 1024
+#define PISP_AGC_STATS_SIZE 16
+#define PISP_AGC_STATS_NUM_ZONES (PISP_AGC_STATS_SIZE * PISP_AGC_STATS_SIZE)
+#define PISP_AGC_STATS_NUM_ROW_SUMS 512
+
+struct pisp_agc_statistics_zone {
+	__u64 Y_sum;
+	__u32 counted;
+	__u32 pad;
+} __attribute__((packed));
+
+struct pisp_agc_statistics {
+	__u32 row_sums[PISP_AGC_STATS_NUM_ROW_SUMS];
+	/*
+	 * 32-bits per bin means an image (just less than) 16384x16384 pixels
+	 * in size can weight every pixel from 0 to 15.
+	 */
+	__u32 histogram[PISP_AGC_STATS_NUM_BINS];
+	struct pisp_agc_statistics_zone floating[PISP_FLOATING_STATS_NUM_ZONES];
+} __attribute__((packed));
+
+#define PISP_AWB_STATS_SIZE 32
+#define PISP_AWB_STATS_NUM_ZONES (PISP_AWB_STATS_SIZE * PISP_AWB_STATS_SIZE)
+
+struct pisp_awb_statistics_zone {
+	__u32 R_sum;
+	__u32 G_sum;
+	__u32 B_sum;
+	__u32 counted;
+} __attribute__((packed));
+
+struct pisp_awb_statistics {
+	struct pisp_awb_statistics_zone zones[PISP_AWB_STATS_NUM_ZONES];
+	struct pisp_awb_statistics_zone floating[PISP_FLOATING_STATS_NUM_ZONES];
+} __attribute__((packed));
+
+#define PISP_CDAF_STATS_SIZE 8
+#define PISP_CDAF_STATS_NUM_FOMS (PISP_CDAF_STATS_SIZE * PISP_CDAF_STATS_SIZE)
+
+struct pisp_cdaf_statistics {
+	__u64 foms[PISP_CDAF_STATS_NUM_FOMS];
+	__u64 floating[PISP_FLOATING_STATS_NUM_ZONES];
+} __attribute__((packed));
+
+struct pisp_statistics {
+	struct pisp_awb_statistics awb;
+	struct pisp_agc_statistics agc;
+	struct pisp_cdaf_statistics cdaf;
+} __attribute__((packed));
+
+#endif /* _UAPI_PISP_FE_STATISTICS_H_ */
-- 
cgit v1.2.3


From 7b5a58952fc3b51905c2963647485565df1e5e26 Mon Sep 17 00:00:00 2001
From: Akash Kumar <quic_akakum@quicinc.com>
Date: Fri, 27 Sep 2024 20:51:38 +0530
Subject: usb: gadget: uvc: configfs: Add frame-based frame format support

Add support for frame-based frame format, which can be used to support
multiple formats like H264 or H265, in addition to MJPEG and YUV frames.

The frame-based format is set to H264 by default, but it can be updated
to other formats by modifying the GUID through the guid configfs
attribute. Different structures are used for all three formats, as
H264 has a different structure compared to MJPEG and uncompressed
formats. These structures will be passed to the frame make function
based on the active format, using a common frame structure with
additional parameters needed only for frame-based formats. These
parameters are handled at runtime in the UVC driver.

Signed-off-by: Akash Kumar <quic_akakum@quicinc.com>
Link: https://lore.kernel.org/r/20240927152138.31416-1-quic_akakum@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/configfs-usb-gadget-uvc |  64 ++++
 drivers/usb/gadget/function/uvc_configfs.c        | 348 +++++++++++++++++++++-
 drivers/usb/gadget/function/uvc_configfs.h        |  16 +
 drivers/usb/gadget/function/uvc_v4l2.c            |  11 +-
 include/uapi/linux/usb/video.h                    |  58 ++++
 5 files changed, 485 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/configfs-usb-gadget-uvc b/Documentation/ABI/testing/configfs-usb-gadget-uvc
index 4feb692c4c1d..b6720768d63d 100644
--- a/Documentation/ABI/testing/configfs-usb-gadget-uvc
+++ b/Documentation/ABI/testing/configfs-usb-gadget-uvc
@@ -342,6 +342,70 @@ Description:	Specific uncompressed frame descriptors
 					   support
 		=========================  =====================================
 
+What:           /config/usb-gadget/gadget/functions/uvc.name/streaming/framebased
+Date:           Sept 2024
+KernelVersion:  5.15
+Description:    Framebased format descriptors
+
+What:           /config/usb-gadget/gadget/functions/uvc.name/streaming/framebased/name
+Date:           Sept 2024
+KernelVersion:  5.15
+Description:    Specific framebased format descriptors
+
+                ==================      =======================================
+                bFormatIndex            unique id for this format descriptor;
+                                        only defined after parent header is
+                                        linked into the streaming class;
+                                        read-only
+                bmaControls             this format's data for bmaControls in
+                                        the streaming header
+                bmInterfaceFlags        specifies interlace information,
+                                        read-only
+                bAspectRatioY           the Y dimension of the picture aspect
+                                        ratio, read-only
+                bAspectRatioX           the X dimension of the picture aspect
+                                        ratio, read-only
+                bDefaultFrameIndex      optimum frame index for this stream
+                bBitsPerPixel           number of bits per pixel used to
+                                        specify color in the decoded video
+                                        frame
+                guidFormat              globally unique id used to identify
+                                        stream-encoding format
+                ==================      =======================================
+
+What:           /config/usb-gadget/gadget/functions/uvc.name/streaming/framebased/name/name
+Date:           Sept 2024
+KernelVersion:  5.15
+Description:    Specific framebased frame descriptors
+
+                =========================  =====================================
+                bFrameIndex                unique id for this frame descriptor;
+                                           only defined after parent format is
+                                           linked into the streaming header;
+                                           read-only
+                dwFrameInterval            indicates how frame interval can be
+                                           programmed; a number of values
+                                           separated by newline can be specified
+                dwDefaultFrameInterval     the frame interval the device would
+                                           like to use as default
+                dwBytesPerLine             Specifies the number of bytes per line
+                                           of video for packed fixed frame size
+                                           formats, allowing the receiver to
+                                           perform stride alignment of the video.
+                                           If the bVariableSize value (above) is
+                                           TRUE (1), or if the format does not
+                                           permit such alignment, this value shall
+                                           be set to zero (0).
+                dwMaxBitRate               the maximum bit rate at the shortest
+                                           frame interval in bps
+                dwMinBitRate               the minimum bit rate at the longest
+                                           frame interval in bps
+                wHeight                    height of decoded bitmap frame in px
+                wWidth                     width of decoded bitmap frame in px
+                bmCapabilities             still image support, fixed frame-rate
+                                           support
+                =========================  =====================================
+
 What:		/config/usb-gadget/gadget/functions/uvc.name/streaming/header
 Date:		Dec 2014
 KernelVersion:	4.0
diff --git a/drivers/usb/gadget/function/uvc_configfs.c b/drivers/usb/gadget/function/uvc_configfs.c
index 6fac696ea846..f131943254a4 100644
--- a/drivers/usb/gadget/function/uvc_configfs.c
+++ b/drivers/usb/gadget/function/uvc_configfs.c
@@ -1566,11 +1566,13 @@ static const struct uvcg_config_group_type uvcg_control_grp_type = {
 /* -----------------------------------------------------------------------------
  * streaming/uncompressed
  * streaming/mjpeg
+ * streaming/framebased
  */
 
 static const char * const uvcg_format_names[] = {
 	"uncompressed",
 	"mjpeg",
+	"framebased",
 };
 
 static struct uvcg_color_matching *
@@ -1777,6 +1779,9 @@ static int uvcg_streaming_header_allow_link(struct config_item *src,
 	target_fmt = container_of(to_config_group(target), struct uvcg_format,
 				  group);
 
+	if (!target_fmt)
+		goto out;
+
 	uvcg_format_set_indices(to_config_group(target));
 
 	format_ptr = kzalloc(sizeof(*format_ptr), GFP_KERNEL);
@@ -1816,6 +1821,9 @@ static void uvcg_streaming_header_drop_link(struct config_item *src,
 	target_fmt = container_of(to_config_group(target), struct uvcg_format,
 				  group);
 
+	if (!target_fmt)
+		goto out;
+
 	list_for_each_entry_safe(format_ptr, tmp, &src_hdr->formats, entry)
 		if (format_ptr->fmt == target_fmt) {
 			list_del(&format_ptr->entry);
@@ -1826,6 +1834,7 @@ static void uvcg_streaming_header_drop_link(struct config_item *src,
 
 	--target_fmt->linked;
 
+out:
 	mutex_unlock(&opts->lock);
 	mutex_unlock(su_mutex);
 }
@@ -2022,6 +2031,7 @@ UVCG_FRAME_ATTR(dw_min_bit_rate, dwMinBitRate, 32);
 UVCG_FRAME_ATTR(dw_max_bit_rate, dwMaxBitRate, 32);
 UVCG_FRAME_ATTR(dw_max_video_frame_buffer_size, dwMaxVideoFrameBufferSize, 32);
 UVCG_FRAME_ATTR(dw_default_frame_interval, dwDefaultFrameInterval, 32);
+UVCG_FRAME_ATTR(dw_bytes_perline, dwBytesPerLine, 32);
 
 #undef UVCG_FRAME_ATTR
 
@@ -2035,7 +2045,7 @@ static ssize_t uvcg_frame_dw_frame_interval_show(struct config_item *item,
 	int result, i;
 	char *pg = page;
 
-	mutex_lock(su_mutex); /* for navigating configfs hierarchy */
+	mutex_lock(su_mutex);	/* for navigating configfs hierarchy */
 
 	opts_item = frm->item.ci_parent->ci_parent->ci_parent->ci_parent;
 	opts = to_f_uvc_opts(opts_item);
@@ -2105,7 +2115,7 @@ end:
 
 UVC_ATTR(uvcg_frame_, dw_frame_interval, dwFrameInterval);
 
-static struct configfs_attribute *uvcg_frame_attrs[] = {
+static struct configfs_attribute *uvcg_frame_attrs1[] = {
 	&uvcg_frame_attr_b_frame_index,
 	&uvcg_frame_attr_bm_capabilities,
 	&uvcg_frame_attr_w_width,
@@ -2118,12 +2128,31 @@ static struct configfs_attribute *uvcg_frame_attrs[] = {
 	NULL,
 };
 
-static const struct config_item_type uvcg_frame_type = {
+static struct configfs_attribute *uvcg_frame_attrs2[] = {
+	&uvcg_frame_attr_b_frame_index,
+	&uvcg_frame_attr_bm_capabilities,
+	&uvcg_frame_attr_w_width,
+	&uvcg_frame_attr_w_height,
+	&uvcg_frame_attr_dw_min_bit_rate,
+	&uvcg_frame_attr_dw_max_bit_rate,
+	&uvcg_frame_attr_dw_default_frame_interval,
+	&uvcg_frame_attr_dw_frame_interval,
+	&uvcg_frame_attr_dw_bytes_perline,
+	NULL,
+};
+
+static const struct config_item_type uvcg_frame_type1 = {
 	.ct_item_ops	= &uvcg_config_item_ops,
-	.ct_attrs	= uvcg_frame_attrs,
+	.ct_attrs	= uvcg_frame_attrs1,
 	.ct_owner	= THIS_MODULE,
 };
 
+static const struct config_item_type uvcg_frame_type2 = {
+	.ct_item_ops    = &uvcg_config_item_ops,
+	.ct_attrs       = uvcg_frame_attrs2,
+	.ct_owner       = THIS_MODULE,
+};
+
 static struct config_item *uvcg_frame_make(struct config_group *group,
 					   const char *name)
 {
@@ -2145,6 +2174,7 @@ static struct config_item *uvcg_frame_make(struct config_group *group,
 	h->frame.dw_max_bit_rate		= 55296000;
 	h->frame.dw_max_video_frame_buffer_size	= 460800;
 	h->frame.dw_default_frame_interval	= 666666;
+	h->frame.dw_bytes_perline		= 0;
 
 	opts_item = group->cg_item.ci_parent->ci_parent->ci_parent;
 	opts = to_f_uvc_opts(opts_item);
@@ -2157,6 +2187,9 @@ static struct config_item *uvcg_frame_make(struct config_group *group,
 	} else if (fmt->type == UVCG_MJPEG) {
 		h->frame.b_descriptor_subtype = UVC_VS_FRAME_MJPEG;
 		h->fmt_type = UVCG_MJPEG;
+	} else if (fmt->type == UVCG_FRAMEBASED) {
+		h->frame.b_descriptor_subtype = UVC_VS_FRAME_FRAME_BASED;
+		h->fmt_type = UVCG_FRAMEBASED;
 	} else {
 		mutex_unlock(&opts->lock);
 		kfree(h);
@@ -2175,7 +2208,10 @@ static struct config_item *uvcg_frame_make(struct config_group *group,
 	++fmt->num_frames;
 	mutex_unlock(&opts->lock);
 
-	config_item_init_type_name(&h->item, name, &uvcg_frame_type);
+	if (fmt->type == UVCG_FRAMEBASED)
+		config_item_init_type_name(&h->item, name, &uvcg_frame_type2);
+	else
+		config_item_init_type_name(&h->item, name, &uvcg_frame_type1);
 
 	return &h->item;
 }
@@ -2215,9 +2251,6 @@ static void uvcg_format_set_indices(struct config_group *fmt)
 	list_for_each_entry(ci, &fmt->cg_children, ci_entry) {
 		struct uvcg_frame *frm;
 
-		if (ci->ci_type != &uvcg_frame_type)
-			continue;
-
 		frm = to_uvcg_frame(ci);
 		frm->frame.b_frame_index = i++;
 	}
@@ -2677,6 +2710,251 @@ static const struct uvcg_config_group_type uvcg_mjpeg_grp_type = {
 	.name = "mjpeg",
 };
 
+/* -----------------------------------------------------------------------------
+ * streaming/framebased/<NAME>
+ */
+
+static struct configfs_group_operations uvcg_framebased_group_ops = {
+	.make_item              = uvcg_frame_make,
+	.drop_item              = uvcg_frame_drop,
+};
+
+#define UVCG_FRAMEBASED_ATTR_RO(cname, aname, bits)			\
+static ssize_t uvcg_framebased_##cname##_show(struct config_item *item,	\
+		char *page)						\
+{									\
+	struct uvcg_framebased *u = to_uvcg_framebased(item);		\
+	struct f_uvc_opts *opts;					\
+	struct config_item *opts_item;					\
+	struct mutex *su_mutex = &u->fmt.group.cg_subsys->su_mutex;	\
+	int result;							\
+									\
+	mutex_lock(su_mutex); /* for navigating configfs hierarchy */	\
+									\
+	opts_item = u->fmt.group.cg_item.ci_parent->ci_parent->ci_parent;\
+	opts = to_f_uvc_opts(opts_item);				\
+									\
+	mutex_lock(&opts->lock);					\
+	result = sprintf(page, "%u\n", le##bits##_to_cpu(u->desc.aname));\
+	mutex_unlock(&opts->lock);					\
+									\
+	mutex_unlock(su_mutex);						\
+	return result;							\
+}									\
+									\
+UVC_ATTR_RO(uvcg_framebased_, cname, aname)
+
+#define UVCG_FRAMEBASED_ATTR(cname, aname, bits)			\
+static ssize_t uvcg_framebased_##cname##_show(struct config_item *item,	\
+		char *page)						\
+{									\
+	struct uvcg_framebased *u = to_uvcg_framebased(item);		\
+	struct f_uvc_opts *opts;					\
+	struct config_item *opts_item;					\
+	struct mutex *su_mutex = &u->fmt.group.cg_subsys->su_mutex;	\
+	int result;							\
+									\
+	mutex_lock(su_mutex); /* for navigating configfs hierarchy */	\
+									\
+	opts_item = u->fmt.group.cg_item.ci_parent->ci_parent->ci_parent;\
+	opts = to_f_uvc_opts(opts_item);				\
+									\
+	mutex_lock(&opts->lock);					\
+	result = sprintf(page, "%u\n", le##bits##_to_cpu(u->desc.aname));\
+	mutex_unlock(&opts->lock);					\
+									\
+	mutex_unlock(su_mutex);						\
+	return result;							\
+}									\
+									\
+static ssize_t								\
+uvcg_framebased_##cname##_store(struct config_item *item,		\
+		const char *page, size_t len)				\
+{									\
+	struct uvcg_framebased *u = to_uvcg_framebased(item);		\
+	struct f_uvc_opts *opts;					\
+	struct config_item *opts_item;					\
+	struct mutex *su_mutex = &u->fmt.group.cg_subsys->su_mutex;	\
+	int ret;							\
+	u8 num;								\
+									\
+	mutex_lock(su_mutex); /* for navigating configfs hierarchy */	\
+									\
+	opts_item = u->fmt.group.cg_item.ci_parent->ci_parent->ci_parent;\
+	opts = to_f_uvc_opts(opts_item);				\
+									\
+	mutex_lock(&opts->lock);					\
+	if (u->fmt.linked || opts->refcnt) {				\
+		ret = -EBUSY;						\
+		goto end;						\
+	}								\
+									\
+	ret = kstrtou8(page, 0, &num);					\
+	if (ret)							\
+		goto end;						\
+									\
+	if (num > 255) {						\
+		ret = -EINVAL;						\
+		goto end;						\
+	}								\
+	u->desc.aname = num;						\
+	ret = len;							\
+end:									\
+	mutex_unlock(&opts->lock);					\
+	mutex_unlock(su_mutex);						\
+	return ret;							\
+}									\
+									\
+UVC_ATTR(uvcg_framebased_, cname, aname)
+
+UVCG_FRAMEBASED_ATTR_RO(b_format_index, bFormatIndex, 8);
+UVCG_FRAMEBASED_ATTR_RO(b_bits_per_pixel, bBitsPerPixel, 8);
+UVCG_FRAMEBASED_ATTR(b_default_frame_index, bDefaultFrameIndex, 8);
+UVCG_FRAMEBASED_ATTR_RO(b_aspect_ratio_x, bAspectRatioX, 8);
+UVCG_FRAMEBASED_ATTR_RO(b_aspect_ratio_y, bAspectRatioY, 8);
+UVCG_FRAMEBASED_ATTR_RO(bm_interface_flags, bmInterfaceFlags, 8);
+
+#undef UVCG_FRAMEBASED_ATTR
+#undef UVCG_FRAMEBASED_ATTR_RO
+
+static ssize_t uvcg_framebased_guid_format_show(struct config_item *item,
+						char *page)
+{
+	struct uvcg_framebased *ch = to_uvcg_framebased(item);
+	struct f_uvc_opts *opts;
+	struct config_item *opts_item;
+	struct mutex *su_mutex = &ch->fmt.group.cg_subsys->su_mutex;
+
+	mutex_lock(su_mutex); /* for navigating configfs hierarchy */
+
+	opts_item = ch->fmt.group.cg_item.ci_parent->ci_parent->ci_parent;
+	opts = to_f_uvc_opts(opts_item);
+
+	mutex_lock(&opts->lock);
+	memcpy(page, ch->desc.guidFormat, sizeof(ch->desc.guidFormat));
+	mutex_unlock(&opts->lock);
+
+	mutex_unlock(su_mutex);
+
+	return sizeof(ch->desc.guidFormat);
+}
+
+static ssize_t uvcg_framebased_guid_format_store(struct config_item *item,
+						 const char *page, size_t len)
+{
+	struct uvcg_framebased *ch = to_uvcg_framebased(item);
+	struct f_uvc_opts *opts;
+	struct config_item *opts_item;
+	struct mutex *su_mutex = &ch->fmt.group.cg_subsys->su_mutex;
+	int ret;
+
+	mutex_lock(su_mutex); /* for navigating configfs hierarchy */
+
+	opts_item = ch->fmt.group.cg_item.ci_parent->ci_parent->ci_parent;
+	opts = to_f_uvc_opts(opts_item);
+
+	mutex_lock(&opts->lock);
+	if (ch->fmt.linked || opts->refcnt) {
+		ret = -EBUSY;
+		goto end;
+	}
+
+	memcpy(ch->desc.guidFormat, page,
+	       min(sizeof(ch->desc.guidFormat), len));
+	ret = sizeof(ch->desc.guidFormat);
+
+end:
+	mutex_unlock(&opts->lock);
+	mutex_unlock(su_mutex);
+	return ret;
+}
+
+UVC_ATTR(uvcg_framebased_, guid_format, guidFormat);
+
+static inline ssize_t
+uvcg_framebased_bma_controls_show(struct config_item *item, char *page)
+{
+	struct uvcg_framebased *u = to_uvcg_framebased(item);
+
+	return uvcg_format_bma_controls_show(&u->fmt, page);
+}
+
+static inline ssize_t
+uvcg_framebased_bma_controls_store(struct config_item *item,
+				   const char *page, size_t len)
+{
+	struct uvcg_framebased *u = to_uvcg_framebased(item);
+
+	return uvcg_format_bma_controls_store(&u->fmt, page, len);
+}
+
+UVC_ATTR(uvcg_framebased_, bma_controls, bmaControls);
+
+static struct configfs_attribute *uvcg_framebased_attrs[] = {
+	&uvcg_framebased_attr_b_format_index,
+	&uvcg_framebased_attr_b_default_frame_index,
+	&uvcg_framebased_attr_b_bits_per_pixel,
+	&uvcg_framebased_attr_b_aspect_ratio_x,
+	&uvcg_framebased_attr_b_aspect_ratio_y,
+	&uvcg_framebased_attr_bm_interface_flags,
+	&uvcg_framebased_attr_bma_controls,
+	&uvcg_framebased_attr_guid_format,
+	NULL,
+};
+
+static const struct config_item_type uvcg_framebased_type = {
+	.ct_item_ops    = &uvcg_config_item_ops,
+	.ct_group_ops   = &uvcg_framebased_group_ops,
+	.ct_attrs       = uvcg_framebased_attrs,
+	.ct_owner       = THIS_MODULE,
+};
+
+static struct config_group *uvcg_framebased_make(struct config_group *group,
+						 const char *name)
+{
+	static char guid[] = { /* Default GUID: declare the frame-based format as H264 */
+		'H',  '2',  '6',  '4', 0x00, 0x00, 0x10, 0x00,
+		0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71
+	};
+	struct uvcg_framebased *h;
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return ERR_PTR(-ENOMEM);
+
+	h->desc.bLength                 = UVC_DT_FORMAT_FRAMEBASED_SIZE;
+	h->desc.bDescriptorType         = USB_DT_CS_INTERFACE;
+	h->desc.bDescriptorSubType      = UVC_VS_FORMAT_FRAME_BASED;
+	memcpy(h->desc.guidFormat, guid, sizeof(guid));
+	h->desc.bBitsPerPixel           = 0;
+	h->desc.bDefaultFrameIndex      = 1;
+	h->desc.bAspectRatioX           = 0;
+	h->desc.bAspectRatioY           = 0;
+	h->desc.bmInterfaceFlags        = 0;
+	h->desc.bCopyProtect            = 0;
+	h->desc.bVariableSize           = 1;
+
+	INIT_LIST_HEAD(&h->fmt.frames);
+	h->fmt.type = UVCG_FRAMEBASED;
+	config_group_init_type_name(&h->fmt.group, name,
+				    &uvcg_framebased_type);
+
+	return &h->fmt.group;
+}
+
+static struct configfs_group_operations uvcg_framebased_grp_ops = {
+	.make_group             = uvcg_framebased_make,
+};
+
+static const struct uvcg_config_group_type uvcg_framebased_grp_type = {
+	.type = {
+		.ct_item_ops    = &uvcg_config_item_ops,
+		.ct_group_ops   = &uvcg_framebased_grp_ops,
+		.ct_owner       = THIS_MODULE,
+	},
+	.name = "framebased",
+};
+
 /* -----------------------------------------------------------------------------
  * streaming/color_matching/default
  */
@@ -2912,6 +3190,7 @@ static int __uvcg_iter_strm_cls(struct uvcg_streaming_header *h,
 		if (ret)
 			return ret;
 		grp = &f->fmt->group;
+		j = 0;
 		list_for_each_entry(item, &grp->cg_children, ci_entry) {
 			frm = to_uvcg_frame(item);
 			ret = fun(frm, priv2, priv3, j++, UVCG_FRAME);
@@ -2965,6 +3244,11 @@ static int __uvcg_cnt_strm(void *priv1, void *priv2, void *priv3, int n,
 				container_of(fmt, struct uvcg_mjpeg, fmt);
 
 			*size += sizeof(m->desc);
+		} else if (fmt->type == UVCG_FRAMEBASED) {
+			struct uvcg_framebased *f =
+				container_of(fmt, struct uvcg_framebased, fmt);
+
+			*size += sizeof(f->desc);
 		} else {
 			return -EINVAL;
 		}
@@ -2975,6 +3259,11 @@ static int __uvcg_cnt_strm(void *priv1, void *priv2, void *priv3, int n,
 		int sz = sizeof(frm->dw_frame_interval);
 
 		*size += sizeof(frm->frame);
+		/*
+		 * frm->frame holds one u32 more than any emitted descriptor
+		 * (the fill path copies sizeof(frm->frame) - 4), so subtract it
+		 */
+		*size -= sizeof(u32);
 		*size += frm->frame.b_frame_interval_type * sz;
 	}
 	break;
@@ -2991,6 +3280,27 @@ static int __uvcg_cnt_strm(void *priv1, void *priv2, void *priv3, int n,
 	return 0;
 }
 
+static int __uvcg_copy_framebased_desc(void *dest, struct uvcg_frame *frm,
+				       int sz)
+{
+	struct uvc_frame_framebased *desc = dest;
+
+	desc->bLength = frm->frame.b_length;
+	desc->bDescriptorType = frm->frame.b_descriptor_type;
+	desc->bDescriptorSubType = frm->frame.b_descriptor_subtype;
+	desc->bFrameIndex = frm->frame.b_frame_index;
+	desc->bmCapabilities = frm->frame.bm_capabilities;
+	desc->wWidth = frm->frame.w_width;
+	desc->wHeight = frm->frame.w_height;
+	desc->dwMinBitRate = frm->frame.dw_min_bit_rate;
+	desc->dwMaxBitRate = frm->frame.dw_max_bit_rate;
+	desc->dwDefaultFrameInterval = frm->frame.dw_default_frame_interval;
+	desc->bFrameIntervalType = frm->frame.b_frame_interval_type;
+	desc->dwBytesPerLine = frm->frame.dw_bytes_perline;
+
+	return 0;
+}
+
 /*
  * Fill an array of streaming descriptors.
  *
@@ -3045,6 +3355,15 @@ static int __uvcg_fill_strm(void *priv1, void *priv2, void *priv3, int n,
 			m->desc.bNumFrameDescriptors = fmt->num_frames;
 			memcpy(*dest, &m->desc, sizeof(m->desc));
 			*dest += sizeof(m->desc);
+		} else if (fmt->type == UVCG_FRAMEBASED) {
+			struct uvcg_framebased *f =
+				container_of(fmt, struct uvcg_framebased,
+					     fmt);
+
+			f->desc.bFormatIndex = n + 1;
+			f->desc.bNumFrameDescriptors = fmt->num_frames;
+			memcpy(*dest, &f->desc, sizeof(f->desc));
+			*dest += sizeof(f->desc);
 		} else {
 			return -EINVAL;
 		}
@@ -3054,8 +3373,11 @@ static int __uvcg_fill_strm(void *priv1, void *priv2, void *priv3, int n,
 		struct uvcg_frame *frm = priv1;
 		struct uvc_descriptor_header *h = *dest;
 
-		sz = sizeof(frm->frame);
-		memcpy(*dest, &frm->frame, sz);
+		sz = sizeof(frm->frame) - 4;
+		if (frm->fmt_type != UVCG_FRAMEBASED)
+			memcpy(*dest, &frm->frame, sz);
+		else
+			__uvcg_copy_framebased_desc(*dest, frm, sz);
 		*dest += sz;
 		sz = frm->frame.b_frame_interval_type *
 			sizeof(*frm->dw_frame_interval);
@@ -3066,7 +3388,10 @@ static int __uvcg_fill_strm(void *priv1, void *priv2, void *priv3, int n,
 				frm->frame.b_frame_interval_type);
 		else if (frm->fmt_type == UVCG_MJPEG)
 			h->bLength = UVC_DT_FRAME_MJPEG_SIZE(
-				frm->frame.b_frame_interval_type);
+					frm->frame.b_frame_interval_type);
+		else if (frm->fmt_type == UVCG_FRAMEBASED)
+			h->bLength = UVC_DT_FRAME_FRAMEBASED_SIZE(
+					frm->frame.b_frame_interval_type);
 	}
 	break;
 	case UVCG_COLOR_MATCHING: {
@@ -3285,6 +3610,7 @@ static const struct uvcg_config_group_type uvcg_streaming_grp_type = {
 		&uvcg_streaming_header_grp_type,
 		&uvcg_uncompressed_grp_type,
 		&uvcg_mjpeg_grp_type,
+		&uvcg_framebased_grp_type,
 		&uvcg_color_matching_grp_type,
 		&uvcg_streaming_class_grp_type,
 		NULL,
diff --git a/drivers/usb/gadget/function/uvc_configfs.h b/drivers/usb/gadget/function/uvc_configfs.h
index c6a690158138..2f78cd4f396f 100644
--- a/drivers/usb/gadget/function/uvc_configfs.h
+++ b/drivers/usb/gadget/function/uvc_configfs.h
@@ -49,6 +49,7 @@ container_of(group_ptr, struct uvcg_color_matching, group)
 enum uvcg_format_type {
 	UVCG_UNCOMPRESSED = 0,
 	UVCG_MJPEG,
+	UVCG_FRAMEBASED,
 };
 
 struct uvcg_format {
@@ -105,6 +106,7 @@ struct uvcg_frame {
 		u32	dw_max_video_frame_buffer_size;
 		u32	dw_default_frame_interval;
 		u8	b_frame_interval_type;
+		u32     dw_bytes_perline;
 	} __attribute__((packed)) frame;
 	u32 *dw_frame_interval;
 };
@@ -142,6 +144,20 @@ static inline struct uvcg_mjpeg *to_uvcg_mjpeg(struct config_item *item)
 	return container_of(to_uvcg_format(item), struct uvcg_mjpeg, fmt);
 }
 
+/* -----------------------------------------------------------------------------
+ * streaming/framebased/<NAME>
+ */
+
+struct uvcg_framebased {
+	struct uvcg_format              fmt;
+	struct uvc_format_framebased    desc;
+};
+
+static inline struct uvcg_framebased *to_uvcg_framebased(struct config_item *item)
+{
+	return container_of(to_uvcg_format(item), struct uvcg_framebased, fmt);
+}
+
 /* -----------------------------------------------------------------------------
  * control/extensions/<NAME>
  */
diff --git a/drivers/usb/gadget/function/uvc_v4l2.c b/drivers/usb/gadget/function/uvc_v4l2.c
index de1736f834e6..836b91c73f18 100644
--- a/drivers/usb/gadget/function/uvc_v4l2.c
+++ b/drivers/usb/gadget/function/uvc_v4l2.c
@@ -31,13 +31,22 @@ static const struct uvc_format_desc *to_uvc_format(struct uvcg_format *uformat)
 {
 	char guid[16] = UVC_GUID_FORMAT_MJPEG;
 	const struct uvc_format_desc *format;
-	struct uvcg_uncompressed *unc;
 
 	if (uformat->type == UVCG_UNCOMPRESSED) {
+		struct uvcg_uncompressed *unc;
+
 		unc = to_uvcg_uncompressed(&uformat->group.cg_item);
 		if (!unc)
 			return ERR_PTR(-EINVAL);
 
+		memcpy(guid, unc->desc.guidFormat, sizeof(guid));
+	} else if (uformat->type == UVCG_FRAMEBASED) {
+		struct uvcg_framebased *unc;
+
+		unc = to_uvcg_framebased(&uformat->group.cg_item);
+		if (!unc)
+			return ERR_PTR(-EINVAL);
+
 		memcpy(guid, unc->desc.guidFormat, sizeof(guid));
 	}
 
diff --git a/include/uapi/linux/usb/video.h b/include/uapi/linux/usb/video.h
index 2ff0e8a3a683..526b5155e23c 100644
--- a/include/uapi/linux/usb/video.h
+++ b/include/uapi/linux/usb/video.h
@@ -597,5 +597,63 @@ struct UVC_FRAME_MJPEG(n) {				\
 	__le32 dwFrameInterval[n];			\
 } __attribute__ ((packed))
 
+/* Frame Based Payload - 3.1.1. Frame Based Video Format Descriptor */
+struct uvc_format_framebased {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bFormatIndex;
+	__u8  bNumFrameDescriptors;
+	__u8  guidFormat[16];
+	__u8  bBitsPerPixel;
+	__u8  bDefaultFrameIndex;
+	__u8  bAspectRatioX;
+	__u8  bAspectRatioY;
+	__u8  bmInterfaceFlags;
+	__u8  bCopyProtect;
+	__u8  bVariableSize;
+} __attribute__((__packed__));
+
+#define UVC_DT_FORMAT_FRAMEBASED_SIZE                  28
+
+/* Frame Based Payload - 3.1.2. Frame Based Video Frame Descriptor */
+struct uvc_frame_framebased {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDescriptorSubType;
+	__u8  bFrameIndex;
+	__u8  bmCapabilities;
+	__u16 wWidth;
+	__u16 wHeight;
+	__u32 dwMinBitRate;
+	__u32 dwMaxBitRate;
+	__u32 dwDefaultFrameInterval;
+	__u8  bFrameIntervalType;
+	__u32 dwBytesPerLine;
+	__u32 dwFrameInterval[];
+} __attribute__((__packed__));
+
+#define UVC_DT_FRAME_FRAMEBASED_SIZE(n)                        (26+4*(n))
+
+#define UVC_FRAME_FRAMEBASED(n) \
+	uvc_frame_framebased_##n
+
+#define DECLARE_UVC_FRAME_FRAMEBASED(n)			\
+struct UVC_FRAME_FRAMEBASED(n) {			\
+	__u8  bLength;					\
+	__u8  bDescriptorType;				\
+	__u8  bDescriptorSubType;                       \
+	__u8  bFrameIndex;                              \
+	__u8  bmCapabilities;                           \
+	__u16 wWidth;                                   \
+	__u16 wHeight;                                  \
+	__u32 dwMinBitRate;                             \
+	__u32 dwMaxBitRate;                             \
+	__u32 dwDefaultFrameInterval;                   \
+	__u8  bFrameIntervalType;                       \
+	__u32 dwBytesPerLine;                           \
+	__u32 dwFrameInterval[n];                       \
+} __attribute__ ((packed))
+
 #endif /* __LINUX_USB_VIDEO_H */
 
-- 
cgit v1.2.3


From 522249f05c5551aec9ec0ba9b6438f1ec19c138d Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 3 Oct 2024 16:29:22 +0200
Subject: fanotify: allow reporting errors on failure to open fd

When working in "fd mode", fanotify_read() needs to open an fd
from a dentry to report event->fd to userspace.

Opening an fd from dentry can fail for several reasons.
For example, when tasks are gone and we try to open their
/proc files or we try to open a WRONLY file like in sysfs
or when trying to open a file that was deleted on the
remote network server.

Add a new flag FAN_REPORT_FD_ERROR for fanotify_init().
For a group with FAN_REPORT_FD_ERROR, we will send the
event with the error instead of the open fd, otherwise
userspace may not get the error at all.

For an overflow event, we report -EBADF to avoid confusing FAN_NOFD
with -EPERM.  Similarly for pidfd open errors we report either -ESRCH
or the open error instead of FAN_NOPIDFD and FAN_EPIDFD.

In any case, userspace will not know which file failed to
open, so add a debug print for further investigation.

Reported-by: Krishna Vivek Vitta <kvitta@microsoft.com>
Link: https://lore.kernel.org/linux-fsdevel/SI2P153MB07182F3424619EDDD1F393EED46D2@SI2P153MB0718.APCP153.PROD.OUTLOOK.COM/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20241003142922.111539-1-amir73il@gmail.com
---
 fs/notify/fanotify/fanotify_user.c | 85 +++++++++++++++++++++-----------------
 include/linux/fanotify.h           |  1 +
 include/uapi/linux/fanotify.h      |  1 +
 3 files changed, 50 insertions(+), 37 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9644bc72e457..8e2d43fc6f7c 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -266,13 +266,6 @@ static int create_fd(struct fsnotify_group *group, const struct path *path,
 			       group->fanotify_data.f_flags | __FMODE_NONOTIFY,
 			       current_cred());
 	if (IS_ERR(new_file)) {
-		/*
-		 * we still send an event even if we can't open the file.  this
-		 * can happen when say tasks are gone and we try to open their
-		 * /proc files or we try to open a WRONLY file like in sysfs
-		 * we just send the errno to userspace since there isn't much
-		 * else we can do.
-		 */
 		put_unused_fd(client_fd);
 		client_fd = PTR_ERR(new_file);
 	} else {
@@ -663,7 +656,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
 	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 	struct file *f = NULL, *pidfd_file = NULL;
-	int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
+	int ret, pidfd = -ESRCH, fd = -EBADF;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
@@ -691,10 +684,39 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
 	    path && path->mnt && path->dentry) {
 		fd = create_fd(group, path, &f);
-		if (fd < 0)
-			return fd;
+		/*
+		 * Opening an fd from dentry can fail for several reasons.
+		 * For example, when tasks are gone and we try to open their
+		 * /proc files or we try to open a WRONLY file like in sysfs
+		 * or when trying to open a file that was deleted on the
+		 * remote network server.
+		 *
+		 * For a group with FAN_REPORT_FD_ERROR, we will send the
+		 * event with the error instead of the open fd, otherwise
+		 * userspace may not get the error at all.
+		 * In any case, userspace will not know which file failed to
+		 * open, so add a debug print for further investigation.
+		 */
+		if (fd < 0) {
+			pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
+				 path->dentry, fd);
+			if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) {
+				/*
+				 * Historically, we've handled EOPENSTALE in a
+				 * special way and silently dropped such
+				 * events. Now we have to keep it to maintain
+				 * backward compatibility...
+				 */
+				if (fd == -EOPENSTALE)
+					fd = 0;
+				return fd;
+			}
+		}
 	}
-	metadata.fd = fd;
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
+		metadata.fd = fd;
+	else
+		metadata.fd = fd >= 0 ? fd : FAN_NOFD;
 
 	if (pidfd_mode) {
 		/*
@@ -709,18 +731,16 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		 * The PIDTYPE_TGID check for an event->pid is performed
 		 * preemptively in an attempt to catch out cases where the event
 		 * listener reads events after the event generating process has
-		 * already terminated. Report FAN_NOPIDFD to the event listener
-		 * in those cases, with all other pidfd creation errors being
-		 * reported as FAN_EPIDFD.
+		 * already terminated.  Depending on flag FAN_REPORT_FD_ERROR,
+		 * report either -ESRCH or FAN_NOPIDFD to the event listener in
+		 * those cases with all other pidfd creation errors reported as
+		 * the error code itself or as FAN_EPIDFD.
 		 */
-		if (metadata.pid == 0 ||
-		    !pid_has_task(event->pid, PIDTYPE_TGID)) {
-			pidfd = FAN_NOPIDFD;
-		} else {
+		if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
 			pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);
-			if (pidfd < 0)
-				pidfd = FAN_EPIDFD;
-		}
+
+		if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0)
+			pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD;
 	}
 
 	ret = -EFAULT;
@@ -737,9 +757,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	buf += FAN_EVENT_METADATA_LEN;
 	count -= FAN_EVENT_METADATA_LEN;
 
-	if (fanotify_is_perm_event(event->mask))
-		FANOTIFY_PERM(event)->fd = fd;
-
 	if (info_mode) {
 		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
 						buf, count);
@@ -753,15 +770,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (pidfd_file)
 		fd_install(pidfd, pidfd_file);
 
+	if (fanotify_is_perm_event(event->mask))
+		FANOTIFY_PERM(event)->fd = fd;
+
 	return metadata.event_len;
 
 out_close_fd:
-	if (fd != FAN_NOFD) {
+	if (f) {
 		put_unused_fd(fd);
 		fput(f);
 	}
 
-	if (pidfd >= 0) {
+	if (pidfd_file) {
 		put_unused_fd(pidfd);
 		fput(pidfd_file);
 	}
@@ -828,15 +848,6 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 		}
 
 		ret = copy_event_to_user(group, event, buf, count);
-		if (unlikely(ret == -EOPENSTALE)) {
-			/*
-			 * We cannot report events with stale fd so drop it.
-			 * Setting ret to 0 will continue the event loop and
-			 * do the right thing if there are no more events to
-			 * read (i.e. return bytes read, -EAGAIN or wait).
-			 */
-			ret = 0;
-		}
 
 		/*
 		 * Permission events get queued to wait for response.  Other
@@ -845,7 +856,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 		if (!fanotify_is_perm_event(event->mask)) {
 			fsnotify_destroy_event(group, &event->fse);
 		} else {
-			if (ret <= 0) {
+			if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) {
 				spin_lock(&group->notification_lock);
 				finish_permission_event(group,
 					FANOTIFY_PERM(event), FAN_DENY, NULL);
@@ -1954,7 +1965,7 @@ static int __init fanotify_user_setup(void)
 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
 
 	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
-	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
 
 	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 4f1c4f603118..89ff45bd6f01 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -36,6 +36,7 @@
 #define FANOTIFY_ADMIN_INIT_FLAGS	(FANOTIFY_PERM_CLASSES | \
 					 FAN_REPORT_TID | \
 					 FAN_REPORT_PIDFD | \
+					 FAN_REPORT_FD_ERROR | \
 					 FAN_UNLIMITED_QUEUE | \
 					 FAN_UNLIMITED_MARKS)
 
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index a37de58ca571..34f221d3a1b9 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -60,6 +60,7 @@
 #define FAN_REPORT_DIR_FID	0x00000400	/* Report unique directory id */
 #define FAN_REPORT_NAME		0x00000800	/* Report events with name */
 #define FAN_REPORT_TARGET_FID	0x00001000	/* Report dirent target id  */
+#define FAN_REPORT_FD_ERROR	0x00002000	/* event->fd can report error */
 
 /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */
 #define FAN_REPORT_DFID_NAME	(FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
-- 
cgit v1.2.3


From fb6f20ecb121cef4d7946f834a6ee867c4e21b4a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 17 Oct 2024 12:28:23 +0200
Subject: reiserfs: The last commit

The deprecation period of reiserfs ends with the end of this year, so it
is time to remove it from the kernel.

Acked-by: Darrick J. Wong <djwong@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 Documentation/filesystems/porting.rst              |    2 +-
 Documentation/userspace-api/ioctl/ioctl-number.rst |    2 +-
 MAINTAINERS                                        |    5 -
 arch/alpha/configs/defconfig                       |    1 -
 arch/arm/configs/pxa_defconfig                     |    4 -
 arch/m68k/configs/amiga_defconfig                  |    1 -
 arch/m68k/configs/apollo_defconfig                 |    1 -
 arch/m68k/configs/atari_defconfig                  |    1 -
 arch/m68k/configs/bvme6000_defconfig               |    1 -
 arch/m68k/configs/hp300_defconfig                  |    1 -
 arch/m68k/configs/mac_defconfig                    |    1 -
 arch/m68k/configs/multi_defconfig                  |    1 -
 arch/m68k/configs/mvme147_defconfig                |    1 -
 arch/m68k/configs/mvme16x_defconfig                |    1 -
 arch/m68k/configs/q40_defconfig                    |    1 -
 arch/m68k/configs/sun3_defconfig                   |    1 -
 arch/m68k/configs/sun3x_defconfig                  |    1 -
 arch/sh/configs/landisk_defconfig                  |    1 -
 arch/sh/configs/titan_defconfig                    |    1 -
 arch/um/configs/i386_defconfig                     |    1 -
 arch/um/configs/x86_64_defconfig                   |    1 -
 drivers/block/Kconfig                              |    2 +-
 fs/Kconfig                                         |    1 -
 fs/Makefile                                        |    1 -
 fs/buffer.c                                        |    3 +-
 fs/quota/Kconfig                                   |   15 +-
 fs/reiserfs/Kconfig                                |   91 -
 fs/reiserfs/Makefile                               |   30 -
 fs/reiserfs/README                                 |  151 -
 fs/reiserfs/acl.h                                  |   78 -
 fs/reiserfs/bitmap.c                               | 1476 -------
 fs/reiserfs/dir.c                                  |  346 --
 fs/reiserfs/do_balan.c                             | 1900 ---------
 fs/reiserfs/file.c                                 |  270 --
 fs/reiserfs/fix_node.c                             | 2822 -------------
 fs/reiserfs/hashes.c                               |  177 -
 fs/reiserfs/ibalance.c                             | 1161 ------
 fs/reiserfs/inode.c                                | 3416 ---------------
 fs/reiserfs/ioctl.c                                |  221 -
 fs/reiserfs/item_ops.c                             |  737 ----
 fs/reiserfs/journal.c                              | 4404 --------------------
 fs/reiserfs/lbalance.c                             | 1426 -------
 fs/reiserfs/lock.c                                 |  101 -
 fs/reiserfs/namei.c                                | 1725 --------
 fs/reiserfs/objectid.c                             |  216 -
 fs/reiserfs/prints.c                               |  792 ----
 fs/reiserfs/procfs.c                               |  490 ---
 fs/reiserfs/reiserfs.h                             | 3419 ---------------
 fs/reiserfs/resize.c                               |  230 -
 fs/reiserfs/stree.c                                | 2280 ----------
 fs/reiserfs/super.c                                | 2646 ------------
 fs/reiserfs/tail_conversion.c                      |  318 --
 fs/reiserfs/xattr.c                                | 1039 -----
 fs/reiserfs/xattr.h                                |  117 -
 fs/reiserfs/xattr_acl.c                            |  411 --
 fs/reiserfs/xattr_security.c                       |  127 -
 fs/reiserfs/xattr_trusted.c                        |   46 -
 fs/reiserfs/xattr_user.c                           |   43 -
 include/uapi/linux/reiserfs_fs.h                   |   27 -
 include/uapi/linux/reiserfs_xattr.h                |   25 -
 scripts/selinux/mdp/mdp.c                          |    3 -
 tools/objtool/noreturns.h                          |    1 -
 .../filesystems/statmount/statmount_test.c         |    2 +-
 63 files changed, 12 insertions(+), 32804 deletions(-)
 delete mode 100644 fs/reiserfs/Kconfig
 delete mode 100644 fs/reiserfs/Makefile
 delete mode 100644 fs/reiserfs/README
 delete mode 100644 fs/reiserfs/acl.h
 delete mode 100644 fs/reiserfs/bitmap.c
 delete mode 100644 fs/reiserfs/dir.c
 delete mode 100644 fs/reiserfs/do_balan.c
 delete mode 100644 fs/reiserfs/file.c
 delete mode 100644 fs/reiserfs/fix_node.c
 delete mode 100644 fs/reiserfs/hashes.c
 delete mode 100644 fs/reiserfs/ibalance.c
 delete mode 100644 fs/reiserfs/inode.c
 delete mode 100644 fs/reiserfs/ioctl.c
 delete mode 100644 fs/reiserfs/item_ops.c
 delete mode 100644 fs/reiserfs/journal.c
 delete mode 100644 fs/reiserfs/lbalance.c
 delete mode 100644 fs/reiserfs/lock.c
 delete mode 100644 fs/reiserfs/namei.c
 delete mode 100644 fs/reiserfs/objectid.c
 delete mode 100644 fs/reiserfs/prints.c
 delete mode 100644 fs/reiserfs/procfs.c
 delete mode 100644 fs/reiserfs/reiserfs.h
 delete mode 100644 fs/reiserfs/resize.c
 delete mode 100644 fs/reiserfs/stree.c
 delete mode 100644 fs/reiserfs/super.c
 delete mode 100644 fs/reiserfs/tail_conversion.c
 delete mode 100644 fs/reiserfs/xattr.c
 delete mode 100644 fs/reiserfs/xattr.h
 delete mode 100644 fs/reiserfs/xattr_acl.c
 delete mode 100644 fs/reiserfs/xattr_security.c
 delete mode 100644 fs/reiserfs/xattr_trusted.c
 delete mode 100644 fs/reiserfs/xattr_user.c
 delete mode 100644 include/uapi/linux/reiserfs_fs.h
 delete mode 100644 include/uapi/linux/reiserfs_xattr.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 92bffcc6747a..9ab2a3d6f2b4 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -177,7 +177,7 @@ settles down a bit.
 **mandatory**
 
 s_export_op is now required for exporting a filesystem.
-isofs, ext2, ext3, reiserfs, fat
+isofs, ext2, ext3, fat
 can be used as examples of very different filesystems.
 
 ---
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index e4be1378ba26..243f1f1b554a 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -375,7 +375,7 @@ Code  Seq#    Include File                                           Comments
 0xCB  00-1F                                                          CBM serial IEC bus in development:
                                                                      <mailto:michael.klein@puffin.lb.shuttle.de>
 0xCC  00-0F  drivers/misc/ibmvmc.h                                   pseries VMC driver
-0xCD  01     linux/reiserfs_fs.h
+0xCD  01     linux/reiserfs_fs.h                                     Dead since 6.13
 0xCE  01-02  uapi/linux/cxl_mem.h                                    Compute Express Link Memory Devices
 0xCF  02     fs/smb/client/cifs_ioctl.h
 0xDB  00-0F  drivers/char/mwave/mwavepub.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 7ad507f49324..02de04d4ae1e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19578,11 +19578,6 @@ F:	Documentation/devicetree/bindings/regmap/
 F:	drivers/base/regmap/
 F:	include/linux/regmap.h
 
-REISERFS FILE SYSTEM
-L:	reiserfs-devel@vger.kernel.org
-S:	Obsolete
-F:	fs/reiserfs/
-
 REMOTE PROCESSOR (REMOTEPROC) SUBSYSTEM
 M:	Bjorn Andersson <andersson@kernel.org>
 M:	Mathieu Poirier <mathieu.poirier@linaro.org>
diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig
index 1816c1dc22b1..3280bd9e6578 100644
--- a/arch/alpha/configs/defconfig
+++ b/arch/alpha/configs/defconfig
@@ -51,7 +51,6 @@ CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_CMOS=y
 CONFIG_EXT2_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_ISO9660_FS=y
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index e1cb170c2bf0..38916ac4bce4 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -583,10 +583,6 @@ CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT3_FS=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
-CONFIG_REISERFS_FS=m
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
 CONFIG_XFS_FS=m
 CONFIG_AUTOFS_FS=m
 CONFIG_FUSE_FS=m
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index d01dc47d52ea..fba7b68c235b 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -449,7 +449,6 @@ CONFIG_RTC_DRV_RP5C01=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 46808e581d7b..308655a98bb1 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -406,7 +406,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 4469a7839c9d..956a3aed97c6 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -426,7 +426,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index c0719322c028..8790b6756a76 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -398,7 +398,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 8d429e63f8f2..dfb2fface338 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -408,7 +408,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index bafd33da27c1..6577b4390c38 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -425,7 +425,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 6f5ca3f85ea1..ad2bbc92d8d1 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -511,7 +511,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index d16b328c7136..3b4a2d2d966f 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -397,7 +397,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 80f6c15a5ed5..9711f37d2ef7 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -398,7 +398,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 0e81589f0ee2..5ae3b707c849 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -415,7 +415,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 8cd785290339..55efa85492d8 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -396,7 +396,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 78035369f60f..cf1c78e02fda 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -396,7 +396,6 @@ CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_DAX=m
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=m
 CONFIG_JFS_FS=m
 CONFIG_OCFS2_FS=m
 # CONFIG_OCFS2_DEBUG_MASKLOG is not set
diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
index 0311380160f4..d871623955c5 100644
--- a/arch/sh/configs/landisk_defconfig
+++ b/arch/sh/configs/landisk_defconfig
@@ -95,7 +95,6 @@ CONFIG_USB_SISUSBVGA=m
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_REISERFS_FS=y
 CONFIG_ISO9660_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=y
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index c1032559ecd4..99bc0e889287 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -220,7 +220,6 @@ CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
-CONFIG_REISERFS_FS=m
 CONFIG_XFS_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_ISO9660_FS=m
diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
index e543cbac8792..9c9c77f1255a 100644
--- a/arch/um/configs/i386_defconfig
+++ b/arch/um/configs/i386_defconfig
@@ -61,7 +61,6 @@ CONFIG_UML_NET_DAEMON=y
 CONFIG_UML_NET_MCAST=y
 CONFIG_UML_NET_SLIRP=y
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS_FS=m
 CONFIG_ISO9660_FS=m
diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
index 939cb12318ca..03b10d3f6816 100644
--- a/arch/um/configs/x86_64_defconfig
+++ b/arch/um/configs/x86_64_defconfig
@@ -59,7 +59,6 @@ CONFIG_UML_NET_DAEMON=y
 CONFIG_UML_NET_MCAST=y
 CONFIG_UML_NET_SLIRP=y
 CONFIG_EXT4_FS=y
-CONFIG_REISERFS_FS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS_FS=m
 CONFIG_ISO9660_FS=m
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ed209f4f2798..a97f2c40c640 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -130,7 +130,7 @@ config BLK_DEV_UBD_SYNC
           kernel command line option.  Alternatively, you can say Y here to
           turn on synchronous operation by default for all block devices.
 
-          If you're running a journalling file system (like reiserfs, for
+          If you're running a journalling file system (like xfs, for
           example) in your virtual machine, you will want to say Y here.  If
           you care for the safety of the data in your virtual machine, Y is a
           wise choice too.  In all other cases (for example, if you're just
diff --git a/fs/Kconfig b/fs/Kconfig
index aae170fc2795..64d420e3c475 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,7 +43,6 @@ config FS_MBCACHE
 	default y if EXT4_FS=y
 	default m if EXT2_FS_XATTR || EXT4_FS
 
-source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
 
 source "fs/xfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 61679fd587b7..15df0a923d3a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_DLM)		+= dlm/
  
 # Do not add any filesystems before this line
 obj-$(CONFIG_NETFS_SUPPORT)	+= netfs/
-obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT4_FS)		+= ext4/
 # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
 # ext2 driver, which doesn't know about journalling!  Explicitly request ext2
diff --git a/fs/buffer.c b/fs/buffer.c
index 1fc9a50def0b..c17011bc7120 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -855,8 +855,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
  * done a sync().  Just drop the buffers from the inode list.
  *
  * NOTE: we take the inode's blockdev's mapping's i_private_lock.  Which
- * assumes that all the buffers are against the blockdev.  Not true
- * for reiserfs.
+ * assumes that all the buffers are against the blockdev.
  */
 void invalidate_inode_buffers(struct inode *inode)
 {
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 4c925e55dbcd..818083a36bef 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -9,14 +9,13 @@ config QUOTA
 	help
 	  If you say Y here, you will be able to set per user limits for disk
 	  usage (also called disk quotas). Currently, it works for the
-	  ext2, ext3, ext4, f2fs, jfs, ocfs2 and reiserfs file systems.
-	  Note that gfs2 and xfs use their own quota system.
-	  Ext3, ext4 and reiserfs also support journaled quotas for which
-	  you don't need to run quotacheck(8) after an unclean shutdown.
-	  For further details, read the Quota mini-HOWTO, available from
-	  <https://www.tldp.org/docs.html#howto>, or the documentation provided
-	  with the quota tools. Probably the quota support is only useful for
-	  multi user systems. If unsure, say N.
+	  ext2, ext3, ext4, f2fs, jfs and ocfs2 file systems. Note that gfs2
+	  and xfs use their own quota system. Ext3 and ext4 also support
+	  journaled quotas for which you don't need to run quotacheck(8) after
+	  an unclean shutdown. For further details, read the Quota mini-HOWTO,
+	  available from <https://www.tldp.org/docs.html#howto>, or the
+	  documentation provided with the quota tools. Probably the quota
+	  support is only useful for multi user systems. If unsure, say N.
 
 config QUOTA_NETLINK_INTERFACE
 	bool "Report quota messages through netlink interface"
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
deleted file mode 100644
index 0e6fe26458fe..000000000000
--- a/fs/reiserfs/Kconfig
+++ /dev/null
@@ -1,91 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config REISERFS_FS
-	tristate "Reiserfs support (deprecated)"
-	select BUFFER_HEAD
-	select CRC32
-	select LEGACY_DIRECT_IO
-	help
-	  Reiserfs is deprecated and scheduled to be removed from the kernel
-	  in 2025. If you are still using it, please migrate to another
-	  filesystem or tell us your usecase for reiserfs.
-
-	  Reiserfs stores not just filenames but the files themselves in a
-	  balanced tree.  Uses journalling.
-
-	  Balanced trees are more efficient than traditional file system
-	  architectural foundations.
-
-	  In general, ReiserFS is as fast as ext2, but is very efficient with
-	  large directories and small files.  Additional patches are needed
-	  for NFS and quotas, please see 
-	  <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
-
-	  It is more easily extended to have features currently found in
-	  database and keyword search systems than block allocation based file
-	  systems are.  The next version will be so extended, and will support
-	  plugins consistent with our motto ``It takes more than a license to
-	  make source code open.''
-
-	  Read <https://reiser4.wiki.kernel.org/index.php/Main_Page> 
-	  to learn more about reiserfs.
-
-	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
-
-	  If you like it, you can pay us to add new features to it that you
-	  need, buy a support contract, or pay us to port it to another OS.
-
-config REISERFS_CHECK
-	bool "Enable reiserfs debug mode"
-	depends on REISERFS_FS
-	help
-	  If you set this to Y, then ReiserFS will perform every check it can
-	  possibly imagine of its internal consistency throughout its
-	  operation.  It will also go substantially slower.  More than once we
-	  have forgotten that this was on, and then gone despondent over the
-	  latest benchmarks.:-) Use of this option allows our team to go all
-	  out in checking for consistency when debugging without fear of its
-	  effect on end users.  If you are on the verge of sending in a bug
-	  report, say Y and you might get a useful error message.  Almost
-	  everyone should say N.
-
-config REISERFS_PROC_INFO
-	bool "Stats in /proc/fs/reiserfs"
-	depends on REISERFS_FS && PROC_FS
-	help
-	  Create under /proc/fs/reiserfs a hierarchy of files, displaying
-	  various ReiserFS statistics and internal data at the expense of
-	  making your kernel or module slightly larger (+8 KB). This also
-	  increases the amount of kernel memory required for each mount.
-	  Almost everyone but ReiserFS developers and people fine-tuning
-	  reiserfs or tracing problems should say N.
-
-config REISERFS_FS_XATTR
-	bool "ReiserFS extended attributes"
-	depends on REISERFS_FS
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page for details).
-
-	  If unsure, say N.
-
-config REISERFS_FS_POSIX_ACL
-	bool "ReiserFS POSIX Access Control Lists"
-	depends on REISERFS_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  If you don't know what Access Control Lists are, say N
-
-config REISERFS_FS_SECURITY
-	bool "ReiserFS Security Labels"
-	depends on REISERFS_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ReiserFS filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
deleted file mode 100644
index bd29c58ccbd8..000000000000
--- a/fs/reiserfs/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux reiser-filesystem routines.
-#
-
-obj-$(CONFIG_REISERFS_FS) += reiserfs.o
-
-reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
-		 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
-		 hashes.o tail_conversion.o journal.o resize.o \
-		 item_ops.o ioctl.o xattr.o lock.o
-
-ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
-reiserfs-objs += procfs.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
-reiserfs-objs += xattr_user.o xattr_trusted.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
-reiserfs-objs += xattr_security.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
-reiserfs-objs += xattr_acl.o
-endif
-
-TAGS:
-	etags *.c
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
deleted file mode 100644
index 11e9ecf24b63..000000000000
--- a/fs/reiserfs/README
+++ /dev/null
@@ -1,151 +0,0 @@
-[LICENSING]
-
-ReiserFS is hereby licensed under the GNU General
-Public License version 2.
-
-Source code files that contain the phrase "licensing governed by
-reiserfs/README" are "governed files" throughout this file.  Governed
-files are licensed under the GPL.  The portions of them owned by Hans
-Reiser, or authorized to be licensed by him, have been in the past,
-and likely will be in the future, licensed to other parties under
-other licenses.  If you add your code to governed files, and don't
-want it to be owned by Hans Reiser, put your copyright label on that
-code so the poor blight and his customers can keep things straight.
-All portions of governed files not labeled otherwise are owned by Hans
-Reiser, and by adding your code to it, widely distributing it to
-others or sending us a patch, and leaving the sentence in stating that
-licensing is governed by the statement in this file, you accept this.
-It will be a kindness if you identify whether Hans Reiser is allowed
-to license code labeled as owned by you on your behalf other than
-under the GPL, because he wants to know if it is okay to do so and put
-a check in the mail to you (for non-trivial improvements) when he
-makes his next sale.  He makes no guarantees as to the amount if any,
-though he feels motivated to motivate contributors, and you can surely
-discuss this with him before or after contributing.  You have the
-right to decline to allow him to license your code contribution other
-than under the GPL.
-
-Further licensing options are available for commercial and/or other
-interests directly from Hans Reiser: hans@reiser.to.  If you interpret
-the GPL as not allowing those additional licensing options, you read
-it wrongly, and Richard Stallman agrees with me, when carefully read
-you can see that those restrictions on additional terms do not apply
-to the owner of the copyright, and my interpretation of this shall
-govern for this license.
-
-Finally, nothing in this license shall be interpreted to allow you to
-fail to fairly credit me, or to remove my credits, without my
-permission, unless you are an end user not redistributing to others.
-If you have doubts about how to properly do that, or about what is
-fair, ask.  (Last I spoke with him Richard was contemplating how best
-to address the fair crediting issue in the next GPL version.)
-
-[END LICENSING]
-
-Reiserfs is a file system based on balanced tree algorithms, which is
-described at https://reiser4.wiki.kernel.org/index.php/Main_Page 
-
-Stop reading here.  Go there, then return.
-
-Send bug reports to yura@namesys.botik.ru.
-
-mkreiserfs and other utilities are in reiserfs/utils, or wherever your
-Linux provider put them.  There is some disagreement about how useful
-it is for users to get their fsck and mkreiserfs out of sync with the
-version of reiserfs that is in their kernel, with many important
-distributors wanting them out of sync.:-) Please try to remember to
-recompile and reinstall fsck and mkreiserfs with every update of
-reiserfs, this is a common source of confusion.  Note that some of the
-utilities cannot be compiled without accessing the balancing code
-which is in the kernel code, and relocating the utilities may require
-you to specify where that code can be found.
-
-Yes, if you update your reiserfs kernel module you do have to
-recompile your kernel, most of the time.  The errors you get will be
-quite cryptic if your forget to do so.
-
-Real users, as opposed to folks who want to hack and then understand
-what went wrong, will want REISERFS_CHECK off.
-
-Hideous Commercial Pitch: Spread your development costs across other OS
-vendors.  Select from the best in the world, not the best in your
-building, by buying from third party OS component suppliers.  Leverage
-the software component development power of the internet.  Be the most
-aggressive in taking advantage of the commercial possibilities of
-decentralized internet development, and add value through your branded
-integration that you sell as an operating system.  Let your competitors
-be the ones to compete against the entire internet by themselves.  Be
-hip, get with the new economic trend, before your competitors do.  Send
-email to hans@reiser.to.
-
-To understand the code, after reading the website, start reading the
-code by reading reiserfs_fs.h first.
-
-Hans Reiser was the project initiator, primary architect, source of all
-funding for the first 5.5 years, and one of the programmers.  He owns
-the copyright.
-
-Vladimir Saveljev was one of the programmers, and he worked long hours
-writing the cleanest code.  He always made the effort to be the best he
-could be, and to make his code the best that it could be.  What resulted
-was quite remarkable. I don't think that money can ever motivate someone
-to work the way he did, he is one of the most selfless men I know.
-
-Yura helps with benchmarking, coding hashes, and block pre-allocation
-code.
-
-Anatoly Pinchuk is a former member of our team who worked closely with
-Vladimir throughout the project's development.  He wrote a quite
-substantial portion of the total code.  He realized that there was a
-space problem with packing tails of files for files larger than a node
-that start on a node aligned boundary (there are reasons to want to node
-align files), and he invented and implemented indirect items and
-unformatted nodes as the solution.
-
-Konstantin Shvachko was taking part in the early days.
-
-Mikhail Gilula was a brilliant innovator that has shown much generosity.
-
-Grigory Zaigralin was an extremely effective system administrator for
-our group.
-
-Igor Krasheninnikov was wonderful at hardware procurement, repair, and
-network installation.
-
-Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a
-textbook he got the algorithm from in the code.  Note that his analysis
-of how we could use the hashing code in making 32 bit NFS cookies work
-was probably more important than the actual algorithm.  Colin Plumb also
-contributed to it.
-
-Chris Mason dived right into our code, and in just a few months produced
-the journaling code that dramatically increased the value of ReiserFS.
-He is just an amazing programmer.
-
-Igor Zagorovsky is writing much of the new item handler and extent code
-for our next major release.
-
-Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
-resizer, and is hard at work on implementing allocate on flush.  SGI
-implemented allocate on flush before us for XFS, and generously took
-the time to convince me we should do it also.  They are great people,
-and a great company.
-
-Yuri Shevchuk and Nikita Danilov are doing squid cache optimization.
-
-Vitaly Fertman is doing fsck.
-
-Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably
-the endian safe patches which allow ReiserFS to run on any platform
-supported by the Linux kernel.
-
-SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the
-Alpha PC Company made it possible for me to not have a day job
-anymore, and to dramatically increase our staffing.  Ecila funded
-hypertext feature development, MP3.com funded journaling, SuSE funded
-core development, IntegratedLinux.com funded squid web cache
-appliances, bigstorage.com funded HSM, and the alpha PC company funded
-the alpha port.  Many of these tasks were helped by sponsors other
-than the ones just named.  SuSE has helped in much more than just
-funding....
-
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
deleted file mode 100644
index 2571b1a8be84..000000000000
--- a/fs/reiserfs/acl.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/init.h>
-#include <linux/posix_acl.h>
-
-#define REISERFS_ACL_VERSION	0x0001
-
-typedef struct {
-	__le16 e_tag;
-	__le16 e_perm;
-	__le32 e_id;
-} reiserfs_acl_entry;
-
-typedef struct {
-	__le16 e_tag;
-	__le16 e_perm;
-} reiserfs_acl_entry_short;
-
-typedef struct {
-	__le32 a_version;
-} reiserfs_acl_header;
-
-static inline size_t reiserfs_acl_size(int count)
-{
-	if (count <= 4) {
-		return sizeof(reiserfs_acl_header) +
-		    count * sizeof(reiserfs_acl_entry_short);
-	} else {
-		return sizeof(reiserfs_acl_header) +
-		    4 * sizeof(reiserfs_acl_entry_short) +
-		    (count - 4) * sizeof(reiserfs_acl_entry);
-	}
-}
-
-static inline int reiserfs_acl_count(size_t size)
-{
-	ssize_t s;
-	size -= sizeof(reiserfs_acl_header);
-	s = size - 4 * sizeof(reiserfs_acl_entry_short);
-	if (s < 0) {
-		if (size % sizeof(reiserfs_acl_entry_short))
-			return -1;
-		return size / sizeof(reiserfs_acl_entry_short);
-	} else {
-		if (s % sizeof(reiserfs_acl_entry))
-			return -1;
-		return s / sizeof(reiserfs_acl_entry) + 4;
-	}
-}
-
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
-int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct posix_acl *acl, int type);
-int reiserfs_acl_chmod(struct dentry *dentry);
-int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
-				 struct inode *dir, struct dentry *dentry,
-				 struct inode *inode);
-int reiserfs_cache_default_acl(struct inode *dir);
-
-#else
-
-#define reiserfs_cache_default_acl(inode) 0
-#define reiserfs_get_acl NULL
-#define reiserfs_set_acl NULL
-
-static inline int reiserfs_acl_chmod(struct dentry *dentry)
-{
-	return 0;
-}
-
-static inline int
-reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
-			     const struct inode *dir, struct dentry *dentry,
-			     struct inode *inode)
-{
-	return 0;
-}
-#endif
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
deleted file mode 100644
index bf708ac287b4..000000000000
--- a/fs/reiserfs/bitmap.c
+++ /dev/null
@@ -1,1476 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-/* Reiserfs block (de)allocator, bitmap-based. */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/buffer_head.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/vmalloc.h>
-#include <linux/quotaops.h>
-#include <linux/seq_file.h>
-
-#define PREALLOCATION_SIZE 9
-
-/* different reiserfs block allocator options */
-
-#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)
-
-#define  _ALLOC_concentrating_formatted_nodes 0
-#define  _ALLOC_displacing_large_files 1
-#define  _ALLOC_displacing_new_packing_localities 2
-#define  _ALLOC_old_hashed_relocation 3
-#define  _ALLOC_new_hashed_relocation 4
-#define  _ALLOC_skip_busy 5
-#define  _ALLOC_displace_based_on_dirid 6
-#define  _ALLOC_hashed_formatted_nodes 7
-#define  _ALLOC_old_way 8
-#define  _ALLOC_hundredth_slices 9
-#define  _ALLOC_dirid_groups 10
-#define  _ALLOC_oid_groups 11
-#define  _ALLOC_packing_groups 12
-
-#define  concentrating_formatted_nodes(s)	test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
-#define  displacing_large_files(s)		test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
-#define  displacing_new_packing_localities(s)	test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))
-
-#define SET_OPTION(optname) \
-   do { \
-	reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \
-	set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
-    } while(0)
-#define TEST_OPTION(optname, s) \
-    test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
-
-static inline void get_bit_address(struct super_block *s,
-				   b_blocknr_t block,
-				   unsigned int *bmap_nr,
-				   unsigned int *offset)
-{
-	/*
-	 * It is in the bitmap block number equal to the block
-	 * number divided by the number of bits in a block.
-	 */
-	*bmap_nr = block >> (s->s_blocksize_bits + 3);
-	/* Within that bitmap block it is located at bit offset *offset. */
-	*offset = block & ((s->s_blocksize << 3) - 1);
-}
-
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
-{
-	unsigned int bmap, offset;
-	unsigned int bmap_count = reiserfs_bmap_count(s);
-
-	if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
-		reiserfs_error(s, "vs-4010",
-			       "block number is out of range %lu (%u)",
-			       block, SB_BLOCK_COUNT(s));
-		return 0;
-	}
-
-	get_bit_address(s, block, &bmap, &offset);
-
-	/*
-	 * Old format filesystem? Unlikely, but the bitmaps are all
-	 * up front so we need to account for it.
-	 */
-	if (unlikely(test_bit(REISERFS_OLD_FORMAT,
-			      &REISERFS_SB(s)->s_properties))) {
-		b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
-		if (block >= bmap1 &&
-		    block <= bmap1 + bmap_count) {
-			reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) "
-				       "can't be freed or reused",
-				       block, bmap_count);
-			return 0;
-		}
-	} else {
-		if (offset == 0) {
-			reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) "
-				       "can't be freed or reused",
-				       block, bmap_count);
-			return 0;
-		}
-	}
-
-	if (bmap >= bmap_count) {
-		reiserfs_error(s, "vs-4030", "bitmap for requested block "
-			       "is out of range: block=%lu, bitmap_nr=%u",
-			       block, bmap);
-		return 0;
-	}
-
-	if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
-		reiserfs_error(s, "vs-4050", "this is root block (%u), "
-			       "it must be busy", SB_ROOT_BLOCK(s));
-		return 0;
-	}
-
-	return 1;
-}
-
-/*
- * Searches in journal structures for a given block number (bmap, off).
- * If block is found in reiserfs journal it suggests next free block
- * candidate to test.
- */
-static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
-				      int off, int *next)
-{
-	b_blocknr_t tmp;
-
-	if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) {
-		if (tmp) {	/* hint supplied */
-			*next = tmp;
-			PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
-		} else {
-			(*next) = off + 1;  /* inc offset to avoid looping. */
-			PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
-		}
-		PROC_INFO_INC(s, scan_bitmap.retry);
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * Searches for a window of zero bits with given minimum and maximum
- * lengths in one bitmap block
- */
-static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
-			     unsigned int bmap_n, int *beg, int boundary,
-			     int min, int max, int unfm)
-{
-	struct super_block *s = th->t_super;
-	struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
-	struct buffer_head *bh;
-	int end, next;
-	int org = *beg;
-
-	BUG_ON(!th->t_trans_id);
-	RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
-	       "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
-	PROC_INFO_INC(s, scan_bitmap.bmap);
-
-	if (!bi) {
-		reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
-			       "for bitmap %d", bmap_n);
-		return 0;
-	}
-
-	bh = reiserfs_read_bitmap_block(s, bmap_n);
-	if (bh == NULL)
-		return 0;
-
-	while (1) {
-cont:
-		if (bi->free_count < min) {
-			brelse(bh);
-			return 0;	/* No free blocks in this bitmap */
-		}
-
-		/* search for a first zero bit -- beginning of a window */
-		*beg = reiserfs_find_next_zero_le_bit
-		    ((unsigned long *)(bh->b_data), boundary, *beg);
-
-		/*
-		 * search for a zero bit fails or the rest of bitmap block
-		 * cannot contain a zero window of minimum size
-		 */
-		if (*beg + min > boundary) {
-			brelse(bh);
-			return 0;
-		}
-
-		if (unfm && is_block_in_journal(s, bmap_n, *beg, beg))
-			continue;
-		/* first zero bit found; we check next bits */
-		for (end = *beg + 1;; end++) {
-			if (end >= *beg + max || end >= boundary
-			    || reiserfs_test_le_bit(end, bh->b_data)) {
-				next = end;
-				break;
-			}
-
-			/*
-			 * finding the other end of zero bit window requires
-			 * looking into journal structures (in case of
-			 * searching for free blocks for unformatted nodes)
-			 */
-			if (unfm && is_block_in_journal(s, bmap_n, end, &next))
-				break;
-		}
-
-		/*
-		 * now (*beg) points to beginning of zero bits window,
-		 * (end) points to one bit after the window end
-		 */
-
-		/* found window of proper size */
-		if (end - *beg >= min) {
-			int i;
-			reiserfs_prepare_for_journal(s, bh, 1);
-			/*
-			 * try to set all blocks used checking are
-			 * they still free
-			 */
-			for (i = *beg; i < end; i++) {
-				/* Don't check in journal again. */
-				if (reiserfs_test_and_set_le_bit
-				    (i, bh->b_data)) {
-					/*
-					 * bit was set by another process while
-					 * we slept in prepare_for_journal()
-					 */
-					PROC_INFO_INC(s, scan_bitmap.stolen);
-
-					/*
-					 * we can continue with smaller set
-					 * of allocated blocks, if length of
-					 * this set is more or equal to `min'
-					 */
-					if (i >= *beg + min) {
-						end = i;
-						break;
-					}
-
-					/*
-					 * otherwise we clear all bit
-					 * were set ...
-					 */
-					while (--i >= *beg)
-						reiserfs_clear_le_bit
-						    (i, bh->b_data);
-					reiserfs_restore_prepared_buffer(s, bh);
-					*beg = org;
-
-					/*
-					 * Search again in current block
-					 * from beginning
-					 */
-					goto cont;
-				}
-			}
-			bi->free_count -= (end - *beg);
-			journal_mark_dirty(th, bh);
-			brelse(bh);
-
-			/* free block count calculation */
-			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
-						     1);
-			PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
-			journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
-
-			return end - (*beg);
-		} else {
-			*beg = next;
-		}
-	}
-}
-
-static int bmap_hash_id(struct super_block *s, u32 id)
-{
-	char *hash_in = NULL;
-	unsigned long hash;
-	unsigned bm;
-
-	if (id <= 2) {
-		bm = 1;
-	} else {
-		hash_in = (char *)(&id);
-		hash = keyed_hash(hash_in, 4);
-		bm = hash % reiserfs_bmap_count(s);
-		if (!bm)
-			bm = 1;
-	}
-	/* this can only be true when SB_BMAP_NR = 1 */
-	if (bm >= reiserfs_bmap_count(s))
-		bm = 0;
-	return bm;
-}
-
-/*
- * hashes the id and then returns > 0 if the block group for the
- * corresponding hash is full
- */
-static inline int block_group_used(struct super_block *s, u32 id)
-{
-	int bm = bmap_hash_id(s, id);
-	struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
-
-	/*
-	 * If we don't have cached information on this bitmap block, we're
-	 * going to have to load it later anyway. Loading it here allows us
-	 * to make a better decision. This favors long-term performance gain
-	 * with a better on-disk layout vs. a short term gain of skipping the
-	 * read and potentially having a bad placement.
-	 */
-	if (info->free_count == UINT_MAX) {
-		struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
-		brelse(bh);
-	}
-
-	if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) {
-		return 0;
-	}
-	return 1;
-}
-
-/*
- * the packing is returned in disk byte order
- */
-__le32 reiserfs_choose_packing(struct inode * dir)
-{
-	__le32 packing;
-	if (TEST_OPTION(packing_groups, dir->i_sb)) {
-		u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
-		/*
-		 * some versions of reiserfsck expect packing locality 1 to be
-		 * special
-		 */
-		if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
-			packing = INODE_PKEY(dir)->k_objectid;
-		else
-			packing = INODE_PKEY(dir)->k_dir_id;
-	} else
-		packing = INODE_PKEY(dir)->k_objectid;
-	return packing;
-}
-
-/*
- * Tries to find contiguous zero bit window (given size) in given region of
- * bitmap and place new blocks there. Returns number of allocated blocks.
- */
-static int scan_bitmap(struct reiserfs_transaction_handle *th,
-		       b_blocknr_t * start, b_blocknr_t finish,
-		       int min, int max, int unfm, sector_t file_block)
-{
-	int nr_allocated = 0;
-	struct super_block *s = th->t_super;
-	unsigned int bm, off;
-	unsigned int end_bm, end_off;
-	unsigned int off_max = s->s_blocksize << 3;
-
-	BUG_ON(!th->t_trans_id);
-	PROC_INFO_INC(s, scan_bitmap.call);
-
-	/* No point in looking for more free blocks */
-	if (SB_FREE_BLOCKS(s) <= 0)
-		return 0;
-
-	get_bit_address(s, *start, &bm, &off);
-	get_bit_address(s, finish, &end_bm, &end_off);
-	if (bm > reiserfs_bmap_count(s))
-		return 0;
-	if (end_bm > reiserfs_bmap_count(s))
-		end_bm = reiserfs_bmap_count(s);
-
-	/*
-	 * When the bitmap is more than 10% free, anyone can allocate.
-	 * When it's less than 10% free, only files that already use the
-	 * bitmap are allowed. Once we pass 80% full, this restriction
-	 * is lifted.
-	 *
-	 * We do this so that files that grow later still have space close to
-	 * their original allocation. This improves locality, and presumably
-	 * performance as a result.
-	 *
-	 * This is only an allocation policy and does not make up for getting a
-	 * bad hint. Decent hinting must be implemented for this to work well.
-	 */
-	if (TEST_OPTION(skip_busy, s)
-	    && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) {
-		for (; bm < end_bm; bm++, off = 0) {
-			if ((off && (!unfm || (file_block != 0)))
-			    || SB_AP_BITMAP(s)[bm].free_count >
-			    (s->s_blocksize << 3) / 10)
-				nr_allocated =
-				    scan_bitmap_block(th, bm, &off, off_max,
-						      min, max, unfm);
-			if (nr_allocated)
-				goto ret;
-		}
-		/* we know from above that start is a reasonable number */
-		get_bit_address(s, *start, &bm, &off);
-	}
-
-	for (; bm < end_bm; bm++, off = 0) {
-		nr_allocated =
-		    scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
-		if (nr_allocated)
-			goto ret;
-	}
-
-	nr_allocated =
-	    scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
-
-ret:
-	*start = bm * off_max + off;
-	return nr_allocated;
-
-}
-
-static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
-				 struct inode *inode, b_blocknr_t block,
-				 int for_unformatted)
-{
-	struct super_block *s = th->t_super;
-	struct reiserfs_super_block *rs;
-	struct buffer_head *sbh, *bmbh;
-	struct reiserfs_bitmap_info *apbi;
-	unsigned int nr, offset;
-
-	BUG_ON(!th->t_trans_id);
-	PROC_INFO_INC(s, free_block);
-	rs = SB_DISK_SUPER_BLOCK(s);
-	sbh = SB_BUFFER_WITH_SB(s);
-	apbi = SB_AP_BITMAP(s);
-
-	get_bit_address(s, block, &nr, &offset);
-
-	if (nr >= reiserfs_bmap_count(s)) {
-		reiserfs_error(s, "vs-4075", "block %lu is out of range",
-			       block);
-		return;
-	}
-
-	bmbh = reiserfs_read_bitmap_block(s, nr);
-	if (!bmbh)
-		return;
-
-	reiserfs_prepare_for_journal(s, bmbh, 1);
-
-	/* clear bit for the given block in bit map */
-	if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
-		reiserfs_error(s, "vs-4080",
-			       "block %lu: bit already cleared", block);
-	}
-	apbi[nr].free_count++;
-	journal_mark_dirty(th, bmbh);
-	brelse(bmbh);
-
-	reiserfs_prepare_for_journal(s, sbh, 1);
-	/* update super block */
-	set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
-
-	journal_mark_dirty(th, sbh);
-	if (for_unformatted) {
-		int depth = reiserfs_write_unlock_nested(s);
-		dquot_free_block_nodirty(inode, 1);
-		reiserfs_write_lock_nested(s, depth);
-	}
-}
-
-void reiserfs_free_block(struct reiserfs_transaction_handle *th,
-			 struct inode *inode, b_blocknr_t block,
-			 int for_unformatted)
-{
-	struct super_block *s = th->t_super;
-
-	BUG_ON(!th->t_trans_id);
-	RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
-	if (!is_reusable(s, block, 1))
-		return;
-
-	if (block > sb_block_count(REISERFS_SB(s)->s_rs)) {
-		reiserfs_error(th->t_super, "bitmap-4072",
-			       "Trying to free block outside file system "
-			       "boundaries (%lu > %lu)",
-			       block, sb_block_count(REISERFS_SB(s)->s_rs));
-		return;
-	}
-	/* mark it before we clear it, just in case */
-	journal_mark_freed(th, s, block);
-	_reiserfs_free_block(th, inode, block, for_unformatted);
-}
-
-/* preallocated blocks don't need to be run through journal_mark_freed */
-static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th,
-					 struct inode *inode, b_blocknr_t block)
-{
-	BUG_ON(!th->t_trans_id);
-	RFALSE(!th->t_super,
-	       "vs-4060: trying to free block on nonexistent device");
-	if (!is_reusable(th->t_super, block, 1))
-		return;
-	_reiserfs_free_block(th, inode, block, 1);
-}
-
-static void __discard_prealloc(struct reiserfs_transaction_handle *th,
-			       struct reiserfs_inode_info *ei)
-{
-	unsigned long save = ei->i_prealloc_block;
-	int dirty = 0;
-	struct inode *inode = &ei->vfs_inode;
-
-	BUG_ON(!th->t_trans_id);
-#ifdef CONFIG_REISERFS_CHECK
-	if (ei->i_prealloc_count < 0)
-		reiserfs_error(th->t_super, "zam-4001",
-			       "inode has negative prealloc blocks count.");
-#endif
-	while (ei->i_prealloc_count > 0) {
-		b_blocknr_t block_to_free;
-
-		/*
-		 * reiserfs_free_prealloc_block can drop the write lock,
-		 * which could allow another caller to free the same block.
-		 * We can protect against it by modifying the prealloc
-		 * state before calling it.
-		 */
-		block_to_free = ei->i_prealloc_block++;
-		ei->i_prealloc_count--;
-		reiserfs_free_prealloc_block(th, inode, block_to_free);
-		dirty = 1;
-	}
-	if (dirty)
-		reiserfs_update_sd(th, inode);
-	ei->i_prealloc_block = save;
-	list_del_init(&ei->i_prealloc_list);
-}
-
-/* FIXME: It should be inline function */
-void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
-			       struct inode *inode)
-{
-	struct reiserfs_inode_info *ei = REISERFS_I(inode);
-
-	BUG_ON(!th->t_trans_id);
-	if (ei->i_prealloc_count)
-		__discard_prealloc(th, ei);
-}
-
-void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
-{
-	struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
-
-	BUG_ON(!th->t_trans_id);
-	while (!list_empty(plist)) {
-		struct reiserfs_inode_info *ei;
-		ei = list_entry(plist->next, struct reiserfs_inode_info,
-				i_prealloc_list);
-#ifdef CONFIG_REISERFS_CHECK
-		if (!ei->i_prealloc_count) {
-			reiserfs_error(th->t_super, "zam-4001",
-				       "inode is in prealloc list but has "
-				       "no preallocated blocks.");
-		}
-#endif
-		__discard_prealloc(th, ei);
-	}
-}
-
-void reiserfs_init_alloc_options(struct super_block *s)
-{
-	set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
-	set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
-	set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
-}
-
-/* block allocator related options are parsed here */
-int reiserfs_parse_alloc_options(struct super_block *s, char *options)
-{
-	char *this_char, *value;
-
-	/* clear default settings */
-	REISERFS_SB(s)->s_alloc_options.bits = 0;
-
-	while ((this_char = strsep(&options, ":")) != NULL) {
-		if ((value = strchr(this_char, '=')) != NULL)
-			*value++ = 0;
-
-		if (!strcmp(this_char, "concentrating_formatted_nodes")) {
-			int temp;
-			SET_OPTION(concentrating_formatted_nodes);
-			temp = (value
-				&& *value) ? simple_strtoul(value, &value,
-							    0) : 10;
-			if (temp <= 0 || temp > 100) {
-				REISERFS_SB(s)->s_alloc_options.border = 10;
-			} else {
-				REISERFS_SB(s)->s_alloc_options.border =
-				    100 / temp;
-			}
-			continue;
-		}
-		if (!strcmp(this_char, "displacing_large_files")) {
-			SET_OPTION(displacing_large_files);
-			REISERFS_SB(s)->s_alloc_options.large_file_size =
-			    (value
-			     && *value) ? simple_strtoul(value, &value, 0) : 16;
-			continue;
-		}
-		if (!strcmp(this_char, "displacing_new_packing_localities")) {
-			SET_OPTION(displacing_new_packing_localities);
-			continue;
-		}
-
-		if (!strcmp(this_char, "old_hashed_relocation")) {
-			SET_OPTION(old_hashed_relocation);
-			continue;
-		}
-
-		if (!strcmp(this_char, "new_hashed_relocation")) {
-			SET_OPTION(new_hashed_relocation);
-			continue;
-		}
-
-		if (!strcmp(this_char, "dirid_groups")) {
-			SET_OPTION(dirid_groups);
-			continue;
-		}
-		if (!strcmp(this_char, "oid_groups")) {
-			SET_OPTION(oid_groups);
-			continue;
-		}
-		if (!strcmp(this_char, "packing_groups")) {
-			SET_OPTION(packing_groups);
-			continue;
-		}
-		if (!strcmp(this_char, "hashed_formatted_nodes")) {
-			SET_OPTION(hashed_formatted_nodes);
-			continue;
-		}
-
-		if (!strcmp(this_char, "skip_busy")) {
-			SET_OPTION(skip_busy);
-			continue;
-		}
-
-		if (!strcmp(this_char, "hundredth_slices")) {
-			SET_OPTION(hundredth_slices);
-			continue;
-		}
-
-		if (!strcmp(this_char, "old_way")) {
-			SET_OPTION(old_way);
-			continue;
-		}
-
-		if (!strcmp(this_char, "displace_based_on_dirid")) {
-			SET_OPTION(displace_based_on_dirid);
-			continue;
-		}
-
-		if (!strcmp(this_char, "preallocmin")) {
-			REISERFS_SB(s)->s_alloc_options.preallocmin =
-			    (value
-			     && *value) ? simple_strtoul(value, &value, 0) : 4;
-			continue;
-		}
-
-		if (!strcmp(this_char, "preallocsize")) {
-			REISERFS_SB(s)->s_alloc_options.preallocsize =
-			    (value
-			     && *value) ? simple_strtoul(value, &value,
-							 0) :
-			    PREALLOCATION_SIZE;
-			continue;
-		}
-
-		reiserfs_warning(s, "zam-4001", "unknown option - %s",
-				 this_char);
-		return 1;
-	}
-
-	reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
-	return 0;
-}
-
-static void print_sep(struct seq_file *seq, int *first)
-{
-	if (!*first)
-		seq_puts(seq, ":");
-	else
-		*first = 0;
-}
-
-void show_alloc_options(struct seq_file *seq, struct super_block *s)
-{
-	int first = 1;
-
-	if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) |
-		(1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups)))
-		return;
-
-	seq_puts(seq, ",alloc=");
-
-	if (TEST_OPTION(concentrating_formatted_nodes, s)) {
-		print_sep(seq, &first);
-		if (REISERFS_SB(s)->s_alloc_options.border != 10) {
-			seq_printf(seq, "concentrating_formatted_nodes=%d",
-				100 / REISERFS_SB(s)->s_alloc_options.border);
-		} else
-			seq_puts(seq, "concentrating_formatted_nodes");
-	}
-	if (TEST_OPTION(displacing_large_files, s)) {
-		print_sep(seq, &first);
-		if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) {
-			seq_printf(seq, "displacing_large_files=%lu",
-			    REISERFS_SB(s)->s_alloc_options.large_file_size);
-		} else
-			seq_puts(seq, "displacing_large_files");
-	}
-	if (TEST_OPTION(displacing_new_packing_localities, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "displacing_new_packing_localities");
-	}
-	if (TEST_OPTION(old_hashed_relocation, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "old_hashed_relocation");
-	}
-	if (TEST_OPTION(new_hashed_relocation, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "new_hashed_relocation");
-	}
-	if (TEST_OPTION(dirid_groups, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "dirid_groups");
-	}
-	if (TEST_OPTION(oid_groups, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "oid_groups");
-	}
-	if (TEST_OPTION(packing_groups, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "packing_groups");
-	}
-	if (TEST_OPTION(hashed_formatted_nodes, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "hashed_formatted_nodes");
-	}
-	if (TEST_OPTION(skip_busy, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "skip_busy");
-	}
-	if (TEST_OPTION(hundredth_slices, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "hundredth_slices");
-	}
-	if (TEST_OPTION(old_way, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "old_way");
-	}
-	if (TEST_OPTION(displace_based_on_dirid, s)) {
-		print_sep(seq, &first);
-		seq_puts(seq, "displace_based_on_dirid");
-	}
-	if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) {
-		print_sep(seq, &first);
-		seq_printf(seq, "preallocmin=%d",
-				REISERFS_SB(s)->s_alloc_options.preallocmin);
-	}
-	if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) {
-		print_sep(seq, &first);
-		seq_printf(seq, "preallocsize=%d",
-				REISERFS_SB(s)->s_alloc_options.preallocsize);
-	}
-}
-
-static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
-{
-	char *hash_in;
-
-	if (hint->formatted_node) {
-		hash_in = (char *)&hint->key.k_dir_id;
-	} else {
-		if (!hint->inode) {
-			/*hint->search_start = hint->beg;*/
-			hash_in = (char *)&hint->key.k_dir_id;
-		} else
-		    if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-			hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
-		else
-			hash_in =
-			    (char *)(&INODE_PKEY(hint->inode)->k_objectid);
-	}
-
-	hint->search_start =
-	    hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
-}
-
-/*
- * Relocation based on dirid, hashing them into a given bitmap block
- * files. Formatted nodes are unaffected, a separate policy covers them
- */
-static void dirid_groups(reiserfs_blocknr_hint_t * hint)
-{
-	unsigned long hash;
-	__u32 dirid = 0;
-	int bm = 0;
-	struct super_block *sb = hint->th->t_super;
-
-	if (hint->inode)
-		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
-	else if (hint->formatted_node)
-		dirid = hint->key.k_dir_id;
-
-	if (dirid) {
-		bm = bmap_hash_id(sb, dirid);
-		hash = bm * (sb->s_blocksize << 3);
-		/* give a portion of the block group to metadata */
-		if (hint->inode)
-			hash += sb->s_blocksize / 2;
-		hint->search_start = hash;
-	}
-}
-
-/*
- * Relocation based on oid, hashing them into a given bitmap block
- * files. Formatted nodes are unaffected, a separate policy covers them
- */
-static void oid_groups(reiserfs_blocknr_hint_t * hint)
-{
-	if (hint->inode) {
-		unsigned long hash;
-		__u32 oid;
-		__u32 dirid;
-		int bm;
-
-		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
-
-		/*
-		 * keep the root dir and it's first set of subdirs close to
-		 * the start of the disk
-		 */
-		if (dirid <= 2)
-			hash = (hint->inode->i_sb->s_blocksize << 3);
-		else {
-			oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
-			bm = bmap_hash_id(hint->inode->i_sb, oid);
-			hash = bm * (hint->inode->i_sb->s_blocksize << 3);
-		}
-		hint->search_start = hash;
-	}
-}
-
-/*
- * returns 1 if it finds an indirect item and gets valid hint info
- * from it, otherwise 0
- */
-static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
-{
-	struct treepath *path;
-	struct buffer_head *bh;
-	struct item_head *ih;
-	int pos_in_item;
-	__le32 *item;
-	int ret = 0;
-
-	/*
-	 * reiserfs code can call this function w/o pointer to path
-	 * structure supplied; then we rely on supplied search_start
-	 */
-	if (!hint->path)
-		return 0;
-
-	path = hint->path;
-	bh = get_last_bh(path);
-	RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
-	ih = tp_item_head(path);
-	pos_in_item = path->pos_in_item;
-	item = tp_item_body(path);
-
-	hint->search_start = bh->b_blocknr;
-
-	/*
-	 * for indirect item: go to left and look for the first non-hole entry
-	 * in the indirect item
-	 */
-	if (!hint->formatted_node && is_indirect_le_ih(ih)) {
-		if (pos_in_item == I_UNFM_NUM(ih))
-			pos_in_item--;
-		while (pos_in_item >= 0) {
-			int t = get_block_num(item, pos_in_item);
-			if (t) {
-				hint->search_start = t;
-				ret = 1;
-				break;
-			}
-			pos_in_item--;
-		}
-	}
-
-	/* does result value fit into specified region? */
-	return ret;
-}
-
-/*
- * should be, if formatted node, then try to put on first part of the device
- * specified as number of percent with mount option device, else try to put
- * on last of device.  This is not to say it is good code to do so,
- * but the effect should be measured.
- */
-static inline void set_border_in_hint(struct super_block *s,
-				      reiserfs_blocknr_hint_t * hint)
-{
-	b_blocknr_t border =
-	    SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
-
-	if (hint->formatted_node)
-		hint->end = border - 1;
-	else
-		hint->beg = border;
-}
-
-static inline void displace_large_file(reiserfs_blocknr_hint_t * hint)
-{
-	if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-		hint->search_start =
-		    hint->beg +
-		    keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id),
-			       4) % (hint->end - hint->beg);
-	else
-		hint->search_start =
-		    hint->beg +
-		    keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid),
-			       4) % (hint->end - hint->beg);
-}
-
-static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint)
-{
-	char *hash_in;
-
-	if (!hint->inode)
-		hash_in = (char *)&hint->key.k_dir_id;
-	else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
-	else
-		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
-
-	hint->search_start =
-	    hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
-}
-
-static inline int
-this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *
-						   hint)
-{
-	return hint->block ==
-	    REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
-}
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint)
-{
-	struct in_core_key *key = &hint->key;
-
-	hint->th->displace_new_blocks = 0;
-	hint->search_start =
-	    hint->beg + keyed_hash((char *)(&key->k_objectid),
-				   4) % (hint->end - hint->beg);
-}
-#endif
-
-static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint)
-{
-	b_blocknr_t border;
-	u32 hash_in;
-
-	if (hint->formatted_node || hint->inode == NULL) {
-		return 0;
-	}
-
-	hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
-	border =
-	    hint->beg + (u32) keyed_hash(((char *)(&hash_in)),
-					 4) % (hint->end - hint->beg - 1);
-	if (border > hint->search_start)
-		hint->search_start = border;
-
-	return 1;
-}
-
-static inline int old_way(reiserfs_blocknr_hint_t * hint)
-{
-	b_blocknr_t border;
-
-	if (hint->formatted_node || hint->inode == NULL) {
-		return 0;
-	}
-
-	border =
-	    hint->beg +
-	    le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end -
-							      hint->beg);
-	if (border > hint->search_start)
-		hint->search_start = border;
-
-	return 1;
-}
-
-static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint)
-{
-	struct in_core_key *key = &hint->key;
-	b_blocknr_t slice_start;
-
-	slice_start =
-	    (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100);
-	if (slice_start > hint->search_start
-	    || slice_start + (hint->end / 100) <= hint->search_start) {
-		hint->search_start = slice_start;
-	}
-}
-
-static void determine_search_start(reiserfs_blocknr_hint_t * hint,
-				   int amount_needed)
-{
-	struct super_block *s = hint->th->t_super;
-	int unfm_hint;
-
-	hint->beg = 0;
-	hint->end = SB_BLOCK_COUNT(s) - 1;
-
-	/* This is former border algorithm. Now with tunable border offset */
-	if (concentrating_formatted_nodes(s))
-		set_border_in_hint(s, hint);
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	/*
-	 * whenever we create a new directory, we displace it.  At first
-	 * we will hash for location, later we might look for a moderately
-	 * empty place for it
-	 */
-	if (displacing_new_packing_localities(s)
-	    && hint->th->displace_new_blocks) {
-		displace_new_packing_locality(hint);
-
-		/*
-		 * we do not continue determine_search_start,
-		 * if new packing locality is being displaced
-		 */
-		return;
-	}
-#endif
-
-	/*
-	 * all persons should feel encouraged to add more special cases
-	 * here and test them
-	 */
-
-	if (displacing_large_files(s) && !hint->formatted_node
-	    && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
-		displace_large_file(hint);
-		return;
-	}
-
-	/*
-	 * if none of our special cases is relevant, use the left
-	 * neighbor in the tree order of the new node we are allocating for
-	 */
-	if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
-		hash_formatted_node(hint);
-		return;
-	}
-
-	unfm_hint = get_left_neighbor(hint);
-
-	/*
-	 * Mimic old block allocator behaviour, that is if VFS allowed for
-	 * preallocation, new blocks are displaced based on directory ID.
-	 * Also, if suggested search_start is less than last preallocated
-	 * block, we start searching from it, assuming that HDD dataflow
-	 * is faster in forward direction
-	 */
-	if (TEST_OPTION(old_way, s)) {
-		if (!hint->formatted_node) {
-			if (!reiserfs_hashed_relocation(s))
-				old_way(hint);
-			else if (!reiserfs_no_unhashed_relocation(s))
-				old_hashed_relocation(hint);
-
-			if (hint->inode
-			    && hint->search_start <
-			    REISERFS_I(hint->inode)->i_prealloc_block)
-				hint->search_start =
-				    REISERFS_I(hint->inode)->i_prealloc_block;
-		}
-		return;
-	}
-
-	/* This is an approach proposed by Hans */
-	if (TEST_OPTION(hundredth_slices, s)
-	    && !(displacing_large_files(s) && !hint->formatted_node)) {
-		hundredth_slices(hint);
-		return;
-	}
-
-	/* old_hashed_relocation only works on unformatted */
-	if (!unfm_hint && !hint->formatted_node &&
-	    TEST_OPTION(old_hashed_relocation, s)) {
-		old_hashed_relocation(hint);
-	}
-
-	/* new_hashed_relocation works with both formatted/unformatted nodes */
-	if ((!unfm_hint || hint->formatted_node) &&
-	    TEST_OPTION(new_hashed_relocation, s)) {
-		new_hashed_relocation(hint);
-	}
-
-	/* dirid grouping works only on unformatted nodes */
-	if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
-		dirid_groups(hint);
-	}
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
-		dirid_groups(hint);
-	}
-#endif
-
-	/* oid grouping works only on unformatted nodes */
-	if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) {
-		oid_groups(hint);
-	}
-	return;
-}
-
-static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
-{
-	/* make minimum size a mount option and benchmark both ways */
-	/* we preallocate blocks only for regular files, specific size */
-	/* benchmark preallocating always and see what happens */
-
-	hint->prealloc_size = 0;
-
-	if (!hint->formatted_node && hint->preallocate) {
-		if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode)
-		    && hint->inode->i_size >=
-		    REISERFS_SB(hint->th->t_super)->s_alloc_options.
-		    preallocmin * hint->inode->i_sb->s_blocksize)
-			hint->prealloc_size =
-			    REISERFS_SB(hint->th->t_super)->s_alloc_options.
-			    preallocsize - 1;
-	}
-	return CARRY_ON;
-}
-
-static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
-						 b_blocknr_t * new_blocknrs,
-						 b_blocknr_t start,
-						 b_blocknr_t finish, int min,
-						 int amount_needed,
-						 int prealloc_size)
-{
-	int rest = amount_needed;
-	int nr_allocated;
-
-	while (rest > 0 && start <= finish) {
-		nr_allocated = scan_bitmap(hint->th, &start, finish, min,
-					   rest + prealloc_size,
-					   !hint->formatted_node, hint->block);
-
-		if (nr_allocated == 0)	/* no new blocks allocated, return */
-			break;
-
-		/* fill free_blocknrs array first */
-		while (rest > 0 && nr_allocated > 0) {
-			*new_blocknrs++ = start++;
-			rest--;
-			nr_allocated--;
-		}
-
-		/* do we have something to fill prealloc. array also ? */
-		if (nr_allocated > 0) {
-			/*
-			 * it means prealloc_size was greater that 0 and
-			 * we do preallocation
-			 */
-			list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
-				 &SB_JOURNAL(hint->th->t_super)->
-				 j_prealloc_list);
-			REISERFS_I(hint->inode)->i_prealloc_block = start;
-			REISERFS_I(hint->inode)->i_prealloc_count =
-			    nr_allocated;
-			break;
-		}
-	}
-
-	return (amount_needed - rest);
-}
-
-static inline int blocknrs_and_prealloc_arrays_from_search_start
-    (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs,
-     int amount_needed) {
-	struct super_block *s = hint->th->t_super;
-	b_blocknr_t start = hint->search_start;
-	b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
-	int passno = 0;
-	int nr_allocated = 0;
-	int depth;
-
-	determine_prealloc_size(hint);
-	if (!hint->formatted_node) {
-		int quota_ret;
-#ifdef REISERQUOTA_DEBUG
-		reiserfs_debug(s, REISERFS_DEBUG_CODE,
-			       "reiserquota: allocating %d blocks id=%u",
-			       amount_needed, hint->inode->i_uid);
-#endif
-		depth = reiserfs_write_unlock_nested(s);
-		quota_ret =
-		    dquot_alloc_block_nodirty(hint->inode, amount_needed);
-		if (quota_ret) {	/* Quota exceeded? */
-			reiserfs_write_lock_nested(s, depth);
-			return QUOTA_EXCEEDED;
-		}
-		if (hint->preallocate && hint->prealloc_size) {
-#ifdef REISERQUOTA_DEBUG
-			reiserfs_debug(s, REISERFS_DEBUG_CODE,
-				       "reiserquota: allocating (prealloc) %d blocks id=%u",
-				       hint->prealloc_size, hint->inode->i_uid);
-#endif
-			quota_ret = dquot_prealloc_block_nodirty(hint->inode,
-							 hint->prealloc_size);
-			if (quota_ret)
-				hint->preallocate = hint->prealloc_size = 0;
-		}
-		/* for unformatted nodes, force large allocations */
-		reiserfs_write_lock_nested(s, depth);
-	}
-
-	do {
-		switch (passno++) {
-		case 0:	/* Search from hint->search_start to end of disk */
-			start = hint->search_start;
-			finish = SB_BLOCK_COUNT(s) - 1;
-			break;
-		case 1:	/* Search from hint->beg to hint->search_start */
-			start = hint->beg;
-			finish = hint->search_start;
-			break;
-		case 2:	/* Last chance: Search from 0 to hint->beg */
-			start = 0;
-			finish = hint->beg;
-			break;
-		default:
-			/* We've tried searching everywhere, not enough space */
-			/* Free the blocks */
-			if (!hint->formatted_node) {
-#ifdef REISERQUOTA_DEBUG
-				reiserfs_debug(s, REISERFS_DEBUG_CODE,
-					       "reiserquota: freeing (nospace) %d blocks id=%u",
-					       amount_needed +
-					       hint->prealloc_size -
-					       nr_allocated,
-					       hint->inode->i_uid);
-#endif
-				/* Free not allocated blocks */
-				depth = reiserfs_write_unlock_nested(s);
-				dquot_free_block_nodirty(hint->inode,
-					amount_needed + hint->prealloc_size -
-					nr_allocated);
-				reiserfs_write_lock_nested(s, depth);
-			}
-			while (nr_allocated--)
-				reiserfs_free_block(hint->th, hint->inode,
-						    new_blocknrs[nr_allocated],
-						    !hint->formatted_node);
-
-			return NO_DISK_SPACE;
-		}
-	} while ((nr_allocated += allocate_without_wrapping_disk(hint,
-								 new_blocknrs +
-								 nr_allocated,
-								 start, finish,
-								 1,
-								 amount_needed -
-								 nr_allocated,
-								 hint->
-								 prealloc_size))
-		 < amount_needed);
-	if (!hint->formatted_node &&
-	    amount_needed + hint->prealloc_size >
-	    nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
-		/* Some of preallocation blocks were not allocated */
-#ifdef REISERQUOTA_DEBUG
-		reiserfs_debug(s, REISERFS_DEBUG_CODE,
-			       "reiserquota: freeing (failed prealloc) %d blocks id=%u",
-			       amount_needed + hint->prealloc_size -
-			       nr_allocated -
-			       REISERFS_I(hint->inode)->i_prealloc_count,
-			       hint->inode->i_uid);
-#endif
-
-		depth = reiserfs_write_unlock_nested(s);
-		dquot_free_block_nodirty(hint->inode, amount_needed +
-					 hint->prealloc_size - nr_allocated -
-					 REISERFS_I(hint->inode)->
-					 i_prealloc_count);
-		reiserfs_write_lock_nested(s, depth);
-	}
-
-	return CARRY_ON;
-}
-
-/* grab new blocknrs from preallocated list */
-/* return amount still needed after using them */
-static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
-					      b_blocknr_t * new_blocknrs,
-					      int amount_needed)
-{
-	struct inode *inode = hint->inode;
-
-	if (REISERFS_I(inode)->i_prealloc_count > 0) {
-		while (amount_needed) {
-
-			*new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++;
-			REISERFS_I(inode)->i_prealloc_count--;
-
-			amount_needed--;
-
-			if (REISERFS_I(inode)->i_prealloc_count <= 0) {
-				list_del(&REISERFS_I(inode)->i_prealloc_list);
-				break;
-			}
-		}
-	}
-	/* return amount still needed after using preallocated blocks */
-	return amount_needed;
-}
-
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
-			       b_blocknr_t *new_blocknrs,
-			       int amount_needed,
-			       /* Amount of blocks we have already reserved */
-			       int reserved_by_us)
-{
-	int initial_amount_needed = amount_needed;
-	int ret;
-	struct super_block *s = hint->th->t_super;
-
-	/* Check if there is enough space, taking into account reserved space */
-	if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
-	    amount_needed - reserved_by_us)
-		return NO_DISK_SPACE;
-	/* should this be if !hint->inode &&  hint->preallocate? */
-	/* do you mean hint->formatted_node can be removed ? - Zam */
-	/*
-	 * hint->formatted_node cannot be removed because we try to access
-	 * inode information here, and there is often no inode associated with
-	 * metadata allocations - green
-	 */
-
-	if (!hint->formatted_node && hint->preallocate) {
-		amount_needed = use_preallocated_list_if_available
-		    (hint, new_blocknrs, amount_needed);
-
-		/*
-		 * We have all the block numbers we need from the
-		 * prealloc list
-		 */
-		if (amount_needed == 0)
-			return CARRY_ON;
-		new_blocknrs += (initial_amount_needed - amount_needed);
-	}
-
-	/* find search start and save it in hint structure */
-	determine_search_start(hint, amount_needed);
-	if (hint->search_start >= SB_BLOCK_COUNT(s))
-		hint->search_start = SB_BLOCK_COUNT(s) - 1;
-
-	/* allocation itself; fill new_blocknrs and preallocation arrays */
-	ret = blocknrs_and_prealloc_arrays_from_search_start
-	    (hint, new_blocknrs, amount_needed);
-
-	/*
-	 * We used prealloc. list to fill (partially) new_blocknrs array.
-	 * If final allocation fails we need to return blocks back to
-	 * prealloc. list or just free them. -- Zam (I chose second
-	 * variant)
-	 */
-	if (ret != CARRY_ON) {
-		while (amount_needed++ < initial_amount_needed) {
-			reiserfs_free_block(hint->th, hint->inode,
-					    *(--new_blocknrs), 1);
-		}
-	}
-	return ret;
-}
-
-void reiserfs_cache_bitmap_metadata(struct super_block *sb,
-                                    struct buffer_head *bh,
-                                    struct reiserfs_bitmap_info *info)
-{
-	unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
-
-	/* The first bit must ALWAYS be 1 */
-	if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data))
-		reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is "
-			       "corrupted: first bit must be 1", bh->b_blocknr);
-
-	info->free_count = 0;
-
-	while (--cur >= (unsigned long *)bh->b_data) {
-		/* 0 and ~0 are special, we can optimize for them */
-		if (*cur == 0)
-			info->free_count += BITS_PER_LONG;
-		else if (*cur != ~0L)	/* A mix, investigate */
-			info->free_count += BITS_PER_LONG - hweight_long(*cur);
-	}
-}
-
-struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
-                                               unsigned int bitmap)
-{
-	b_blocknr_t block = (sb->s_blocksize << 3) * bitmap;
-	struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
-	struct buffer_head *bh;
-
-	/*
-	 * Way old format filesystems had the bitmaps packed up front.
-	 * I doubt there are any of these left, but just in case...
-	 */
-	if (unlikely(test_bit(REISERFS_OLD_FORMAT,
-			      &REISERFS_SB(sb)->s_properties)))
-		block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
-	else if (bitmap == 0)
-		block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
-
-	bh = sb_bread(sb, block);
-	if (bh == NULL)
-		reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
-		                 "reading failed", __func__, block);
-	else {
-		if (buffer_locked(bh)) {
-			int depth;
-			PROC_INFO_INC(sb, scan_bitmap.wait);
-			depth = reiserfs_write_unlock_nested(sb);
-			__wait_on_buffer(bh);
-			reiserfs_write_lock_nested(sb, depth);
-		}
-		BUG_ON(!buffer_uptodate(bh));
-		BUG_ON(atomic_read(&bh->b_count) == 0);
-
-		if (info->free_count == UINT_MAX)
-			reiserfs_cache_bitmap_metadata(sb, bh, info);
-	}
-
-	return bh;
-}
-
-int reiserfs_init_bitmap_cache(struct super_block *sb)
-{
-	struct reiserfs_bitmap_info *bitmap;
-	unsigned int bmap_nr = reiserfs_bmap_count(sb);
-
-	bitmap = vmalloc(array_size(bmap_nr, sizeof(*bitmap)));
-	if (bitmap == NULL)
-		return -ENOMEM;
-
-	memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr);
-
-	SB_AP_BITMAP(sb) = bitmap;
-
-	return 0;
-}
-
-void reiserfs_free_bitmap_cache(struct super_block *sb)
-{
-	if (SB_AP_BITMAP(sb)) {
-		vfree(SB_AP_BITMAP(sb));
-		SB_AP_BITMAP(sb) = NULL;
-	}
-}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
deleted file mode 100644
index 79ee2b436685..000000000000
--- a/fs/reiserfs/dir.c
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include <linux/stat.h>
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-extern const struct reiserfs_key MIN_KEY;
-
-static int reiserfs_readdir(struct file *, struct dir_context *);
-static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
-			      int datasync);
-
-const struct file_operations reiserfs_dir_operations = {
-	.llseek = generic_file_llseek,
-	.read = generic_read_dir,
-	.iterate_shared = reiserfs_readdir,
-	.fsync = reiserfs_dir_fsync,
-	.unlocked_ioctl = reiserfs_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl = reiserfs_compat_ioctl,
-#endif
-};
-
-static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
-			      int datasync)
-{
-	struct inode *inode = filp->f_mapping->host;
-	int err;
-
-	err = file_write_and_wait_range(filp, start, end);
-	if (err)
-		return err;
-
-	inode_lock(inode);
-	reiserfs_write_lock(inode->i_sb);
-	err = reiserfs_commit_for_inode(inode);
-	reiserfs_write_unlock(inode->i_sb);
-	inode_unlock(inode);
-	if (err < 0)
-		return err;
-	return 0;
-}
-
-#define store_ih(where,what) copy_item_head (where, what)
-
-static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
-{
-	struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
-	return (d_really_is_positive(privroot) &&
-	        deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid);
-}
-
-int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
-{
-
-	/* key of current position in the directory (key of directory entry) */
-	struct cpu_key pos_key;
-
-	INITIALIZE_PATH(path_to_entry);
-	struct buffer_head *bh;
-	int item_num, entry_num;
-	const struct reiserfs_key *rkey;
-	struct item_head *ih, tmp_ih;
-	int search_res;
-	char *local_buf;
-	loff_t next_pos;
-	char small_buf[32];	/* avoid kmalloc if we can */
-	struct reiserfs_dir_entry de;
-	int ret = 0;
-	int depth;
-
-	reiserfs_write_lock(inode->i_sb);
-
-	reiserfs_check_lock_depth(inode->i_sb, "readdir");
-
-	/*
-	 * form key for search the next directory entry using
-	 * f_pos field of file structure
-	 */
-	make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
-	next_pos = cpu_key_k_offset(&pos_key);
-
-	path_to_entry.reada = PATH_READA;
-	while (1) {
-research:
-		/*
-		 * search the directory item, containing entry with
-		 * specified key
-		 */
-		search_res =
-		    search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
-					&de);
-		if (search_res == IO_ERROR) {
-			/*
-			 * FIXME: we could just skip part of directory
-			 * which could not be read
-			 */
-			ret = -EIO;
-			goto out;
-		}
-		entry_num = de.de_entry_num;
-		bh = de.de_bh;
-		item_num = de.de_item_num;
-		ih = de.de_ih;
-		store_ih(&tmp_ih, ih);
-
-		/* we must have found item, that is item of this directory, */
-		RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key),
-		       "vs-9000: found item %h does not match to dir we readdir %K",
-		       ih, &pos_key);
-		RFALSE(item_num > B_NR_ITEMS(bh) - 1,
-		       "vs-9005 item_num == %d, item amount == %d",
-		       item_num, B_NR_ITEMS(bh));
-
-		/*
-		 * and entry must be not more than number of entries
-		 * in the item
-		 */
-		RFALSE(ih_entry_count(ih) < entry_num,
-		       "vs-9010: entry number is too big %d (%d)",
-		       entry_num, ih_entry_count(ih));
-
-		/*
-		 * go through all entries in the directory item beginning
-		 * from the entry, that has been found
-		 */
-		if (search_res == POSITION_FOUND
-		    || entry_num < ih_entry_count(ih)) {
-			struct reiserfs_de_head *deh =
-			    B_I_DEH(bh, ih) + entry_num;
-
-			for (; entry_num < ih_entry_count(ih);
-			     entry_num++, deh++) {
-				int d_reclen;
-				char *d_name;
-				ino_t d_ino;
-				loff_t cur_pos = deh_offset(deh);
-
-				/* it is hidden entry */
-				if (!de_visible(deh))
-					continue;
-				d_reclen = entry_length(bh, ih, entry_num);
-				d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
-
-				if (d_reclen <= 0 ||
-				    d_name + d_reclen > bh->b_data + bh->b_size) {
-					/*
-					 * There is corrupted data in entry,
-					 * We'd better stop here
-					 */
-					pathrelse(&path_to_entry);
-					ret = -EIO;
-					goto out;
-				}
-
-				if (!d_name[d_reclen - 1])
-					d_reclen = strlen(d_name);
-
-				/* too big to send back to VFS */
-				if (d_reclen >
-				    REISERFS_MAX_NAME(inode->i_sb->
-						      s_blocksize)) {
-					continue;
-				}
-
-				/* Ignore the .reiserfs_priv entry */
-				if (is_privroot_deh(inode, deh))
-					continue;
-
-				ctx->pos = deh_offset(deh);
-				d_ino = deh_objectid(deh);
-				if (d_reclen <= 32) {
-					local_buf = small_buf;
-				} else {
-					local_buf = kmalloc(d_reclen,
-							    GFP_NOFS);
-					if (!local_buf) {
-						pathrelse(&path_to_entry);
-						ret = -ENOMEM;
-						goto out;
-					}
-					if (item_moved(&tmp_ih, &path_to_entry)) {
-						kfree(local_buf);
-						goto research;
-					}
-				}
-
-				/*
-				 * Note, that we copy name to user space via
-				 * temporary buffer (local_buf) because
-				 * filldir will block if user space buffer is
-				 * swapped out. At that time entry can move to
-				 * somewhere else
-				 */
-				memcpy(local_buf, d_name, d_reclen);
-
-				/*
-				 * Since filldir might sleep, we can release
-				 * the write lock here for other waiters
-				 */
-				depth = reiserfs_write_unlock_nested(inode->i_sb);
-				if (!dir_emit
-				    (ctx, local_buf, d_reclen, d_ino,
-				     DT_UNKNOWN)) {
-					reiserfs_write_lock_nested(inode->i_sb, depth);
-					if (local_buf != small_buf) {
-						kfree(local_buf);
-					}
-					goto end;
-				}
-				reiserfs_write_lock_nested(inode->i_sb, depth);
-				if (local_buf != small_buf) {
-					kfree(local_buf);
-				}
-
-				/* deh_offset(deh) may be invalid now. */
-				next_pos = cur_pos + 1;
-
-				if (item_moved(&tmp_ih, &path_to_entry)) {
-					set_cpu_key_k_offset(&pos_key,
-							     next_pos);
-					goto research;
-				}
-			}	/* for */
-		}
-
-		/* end of directory has been reached */
-		if (item_num != B_NR_ITEMS(bh) - 1)
-			goto end;
-
-		/*
-		 * item we went through is last item of node. Using right
-		 * delimiting key check is it directory end
-		 */
-		rkey = get_rkey(&path_to_entry, inode->i_sb);
-		if (!comp_le_keys(rkey, &MIN_KEY)) {
-			/*
-			 * set pos_key to key, that is the smallest and greater
-			 * that key of the last entry in the item
-			 */
-			set_cpu_key_k_offset(&pos_key, next_pos);
-			continue;
-		}
-
-		/* end of directory has been reached */
-		if (COMP_SHORT_KEYS(rkey, &pos_key)) {
-			goto end;
-		}
-
-		/* directory continues in the right neighboring block */
-		set_cpu_key_k_offset(&pos_key,
-				     le_key_k_offset(KEY_FORMAT_3_5, rkey));
-
-	}			/* while */
-
-end:
-	ctx->pos = next_pos;
-	pathrelse(&path_to_entry);
-	reiserfs_check_path(&path_to_entry);
-out:
-	reiserfs_write_unlock(inode->i_sb);
-	return ret;
-}
-
-static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
-{
-	return reiserfs_readdir_inode(file_inode(file), ctx);
-}
-
-/*
- * compose directory item containing "." and ".." entries (entries are
- * not aligned to 4 byte boundary)
- */
-void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
-			    __le32 par_dirid, __le32 par_objid)
-{
-	struct reiserfs_de_head *dot, *dotdot;
-
-	memset(body, 0, EMPTY_DIR_SIZE_V1);
-	dot = (struct reiserfs_de_head *)body;
-	dotdot = dot + 1;
-
-	/* direntry header of "." */
-	put_deh_offset(dot, DOT_OFFSET);
-	/* these two are from make_le_item_head, and are LE */
-	dot->deh_dir_id = dirid;
-	dot->deh_objectid = objid;
-	dot->deh_state = 0;	/* Endian safe if 0 */
-	put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen("."));
-	mark_de_visible(dot);
-
-	/* direntry header of ".." */
-	put_deh_offset(dotdot, DOT_DOT_OFFSET);
-	/* key of ".." for the root directory */
-	/* these two are from the inode, and are LE */
-	dotdot->deh_dir_id = par_dirid;
-	dotdot->deh_objectid = par_objid;
-	dotdot->deh_state = 0;	/* Endian safe if 0 */
-	put_deh_location(dotdot, deh_location(dot) - strlen(".."));
-	mark_de_visible(dotdot);
-
-	/* copy ".." and "." */
-	memcpy(body + deh_location(dot), ".", 1);
-	memcpy(body + deh_location(dotdot), "..", 2);
-}
-
-/* compose directory item containing "." and ".." entries */
-void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
-			 __le32 par_dirid, __le32 par_objid)
-{
-	struct reiserfs_de_head *dot, *dotdot;
-
-	memset(body, 0, EMPTY_DIR_SIZE);
-	dot = (struct reiserfs_de_head *)body;
-	dotdot = dot + 1;
-
-	/* direntry header of "." */
-	put_deh_offset(dot, DOT_OFFSET);
-	/* these two are from make_le_item_head, and are LE */
-	dot->deh_dir_id = dirid;
-	dot->deh_objectid = objid;
-	dot->deh_state = 0;	/* Endian safe if 0 */
-	put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
-	mark_de_visible(dot);
-
-	/* direntry header of ".." */
-	put_deh_offset(dotdot, DOT_DOT_OFFSET);
-	/* key of ".." for the root directory */
-	/* these two are from the inode, and are LE */
-	dotdot->deh_dir_id = par_dirid;
-	dotdot->deh_objectid = par_objid;
-	dotdot->deh_state = 0;	/* Endian safe if 0 */
-	put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen("..")));
-	mark_de_visible(dotdot);
-
-	/* copy ".." and "." */
-	memcpy(body + deh_location(dot), ".", 1);
-	memcpy(body + deh_location(dotdot), "..", 2);
-}
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
deleted file mode 100644
index 5129efc6f2e6..000000000000
--- a/fs/reiserfs/do_balan.c
+++ /dev/null
@@ -1,1900 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- * Now we have all buffers that must be used in balancing of the tree
- * Further calculations can not cause schedule(), and thus the buffer
- * tree will be stable until the balancing will be finished
- * balance the tree according to the analysis made before,
- * and using buffers obtained after all above.
- */
-
-#include <linux/uaccess.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-#include <linux/kernel.h>
-
-static inline void buffer_info_init_left(struct tree_balance *tb,
-                                         struct buffer_info *bi)
-{
-	bi->tb          = tb;
-	bi->bi_bh       = tb->L[0];
-	bi->bi_parent   = tb->FL[0];
-	bi->bi_position = get_left_neighbor_position(tb, 0);
-}
-
-static inline void buffer_info_init_right(struct tree_balance *tb,
-                                          struct buffer_info *bi)
-{
-	bi->tb          = tb;
-	bi->bi_bh       = tb->R[0];
-	bi->bi_parent   = tb->FR[0];
-	bi->bi_position = get_right_neighbor_position(tb, 0);
-}
-
-static inline void buffer_info_init_tbS0(struct tree_balance *tb,
-                                         struct buffer_info *bi)
-{
-	bi->tb          = tb;
-	bi->bi_bh        = PATH_PLAST_BUFFER(tb->tb_path);
-	bi->bi_parent   = PATH_H_PPARENT(tb->tb_path, 0);
-	bi->bi_position = PATH_H_POSITION(tb->tb_path, 1);
-}
-
-static inline void buffer_info_init_bh(struct tree_balance *tb,
-                                       struct buffer_info *bi,
-                                       struct buffer_head *bh)
-{
-	bi->tb          = tb;
-	bi->bi_bh       = bh;
-	bi->bi_parent   = NULL;
-	bi->bi_position = 0;
-}
-
-inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
-				       struct buffer_head *bh, int flag)
-{
-	journal_mark_dirty(tb->transaction_handle, bh);
-}
-
-#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
-#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
-
-/*
- * summary:
- *  if deleting something ( tb->insert_size[0] < 0 )
- *    return(balance_leaf_when_delete()); (flag d handled here)
- *  else
- *    if lnum is larger than 0 we put items into the left node
- *    if rnum is larger than 0 we put items into the right node
- *    if snum1 is larger than 0 we put items into the new node s1
- *    if snum2 is larger than 0 we put items into the new node s2
- * Note that all *num* count new items being created.
- */
-
-static void balance_leaf_when_delete_del(struct tree_balance *tb)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int item_pos = PATH_LAST_POSITION(tb->tb_path);
-	struct buffer_info bi;
-#ifdef CONFIG_REISERFS_CHECK
-	struct item_head *ih = item_head(tbS0, item_pos);
-#endif
-
-	RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
-	       "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
-	       -tb->insert_size[0], ih);
-
-	buffer_info_init_tbS0(tb, &bi);
-	leaf_delete_items(&bi, 0, item_pos, 1, -1);
-
-	if (!item_pos && tb->CFL[0]) {
-		if (B_NR_ITEMS(tbS0)) {
-			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-		} else {
-			if (!PATH_H_POSITION(tb->tb_path, 1))
-				replace_key(tb, tb->CFL[0], tb->lkey[0],
-					    PATH_H_PPARENT(tb->tb_path, 0), 0);
-		}
-	}
-
-	RFALSE(!item_pos && !tb->CFL[0],
-	       "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
-	       tb->L[0]);
-}
-
-/* cut item in S[0] */
-static void balance_leaf_when_delete_cut(struct tree_balance *tb)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int item_pos = PATH_LAST_POSITION(tb->tb_path);
-	struct item_head *ih = item_head(tbS0, item_pos);
-	int pos_in_item = tb->tb_path->pos_in_item;
-	struct buffer_info bi;
-	buffer_info_init_tbS0(tb, &bi);
-
-	if (is_direntry_le_ih(ih)) {
-		/*
-		 * UFS unlink semantics are such that you can only
-		 * delete one directory entry at a time.
-		 *
-		 * when we cut a directory tb->insert_size[0] means
-		 * number of entries to be cut (always 1)
-		 */
-		tb->insert_size[0] = -1;
-		leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
-				     -tb->insert_size[0]);
-
-		RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
-		       "PAP-12030: can not change delimiting key. CFL[0]=%p",
-		       tb->CFL[0]);
-
-		if (!item_pos && !pos_in_item && tb->CFL[0])
-			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-	} else {
-		leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
-				     -tb->insert_size[0]);
-
-		RFALSE(!ih_item_len(ih),
-		       "PAP-12035: cut must leave non-zero dynamic "
-		       "length of item");
-	}
-}
-
-static int balance_leaf_when_delete_left(struct tree_balance *tb)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-
-	/* L[0] must be joined with S[0] */
-	if (tb->lnum[0] == -1) {
-		/* R[0] must be also joined with S[0] */
-		if (tb->rnum[0] == -1) {
-			if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
-				/*
-				 * all contents of all the
-				 * 3 buffers will be in L[0]
-				 */
-				if (PATH_H_POSITION(tb->tb_path, 1) == 0 &&
-				    1 < B_NR_ITEMS(tb->FR[0]))
-					replace_key(tb, tb->CFL[0],
-						    tb->lkey[0], tb->FR[0], 1);
-
-				leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1,
-						NULL);
-				leaf_move_items(LEAF_FROM_R_TO_L, tb,
-						B_NR_ITEMS(tb->R[0]), -1,
-						NULL);
-
-				reiserfs_invalidate_buffer(tb, tbS0);
-				reiserfs_invalidate_buffer(tb, tb->R[0]);
-
-				return 0;
-			}
-
-			/* all contents of all the 3 buffers will be in R[0] */
-			leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL);
-			leaf_move_items(LEAF_FROM_L_TO_R, tb,
-					B_NR_ITEMS(tb->L[0]), -1, NULL);
-
-			/* right_delimiting_key is correct in R[0] */
-			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-			reiserfs_invalidate_buffer(tb, tbS0);
-			reiserfs_invalidate_buffer(tb, tb->L[0]);
-
-			return -1;
-		}
-
-		RFALSE(tb->rnum[0] != 0,
-		       "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
-		/* all contents of L[0] and S[0] will be in L[0] */
-		leaf_shift_left(tb, n, -1);
-
-		reiserfs_invalidate_buffer(tb, tbS0);
-
-		return 0;
-	}
-
-	/*
-	 * a part of contents of S[0] will be in L[0] and
-	 * the rest part of S[0] will be in R[0]
-	 */
-
-	RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
-	       (tb->lnum[0] + tb->rnum[0] > n + 1),
-	       "PAP-12050: rnum(%d) and lnum(%d) and item "
-	       "number(%d) in S[0] are not consistent",
-	       tb->rnum[0], tb->lnum[0], n);
-	RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
-	       (tb->lbytes != -1 || tb->rbytes != -1),
-	       "PAP-12055: bad rbytes (%d)/lbytes (%d) "
-	       "parameters when items are not split",
-	       tb->rbytes, tb->lbytes);
-	RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
-	       (tb->lbytes < 1 || tb->rbytes != -1),
-	       "PAP-12060: bad rbytes (%d)/lbytes (%d) "
-	       "parameters when items are split",
-	       tb->rbytes, tb->lbytes);
-
-	leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-	leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-
-	reiserfs_invalidate_buffer(tb, tbS0);
-
-	return 0;
-}
-
-/*
- * Balance leaf node in case of delete or cut: insert_size[0] < 0
- *
- * lnum, rnum can have values >= -1
- *	-1 means that the neighbor must be joined with S
- *	 0 means that nothing should be done with the neighbor
- *	>0 means to shift entirely or partly the specified number of items
- *         to the neighbor
- */
-static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct buffer_info bi;
-	int n;
-
-	RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
-	       "vs- 12000: level: wrong FR %z", tb->FR[0]);
-	RFALSE(tb->blknum[0] > 1,
-	       "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
-	RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
-	       "PAP-12010: tree can not be empty");
-
-	buffer_info_init_tbS0(tb, &bi);
-
-	/* Delete or truncate the item */
-
-	BUG_ON(flag != M_DELETE && flag != M_CUT);
-	if (flag == M_DELETE)
-		balance_leaf_when_delete_del(tb);
-	else /* M_CUT */
-		balance_leaf_when_delete_cut(tb);
-
-
-	/*
-	 * the rule is that no shifting occurs unless by shifting
-	 * a node can be freed
-	 */
-	n = B_NR_ITEMS(tbS0);
-
-
-	/* L[0] takes part in balancing */
-	if (tb->lnum[0])
-		return balance_leaf_when_delete_left(tb);
-
-	if (tb->rnum[0] == -1) {
-		/* all contents of R[0] and S[0] will be in R[0] */
-		leaf_shift_right(tb, n, -1);
-		reiserfs_invalidate_buffer(tb, tbS0);
-		return 0;
-	}
-
-	RFALSE(tb->rnum[0],
-	       "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
-	return 0;
-}
-
-static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
-					     struct item_head *const ih,
-					     const char * const body)
-{
-	int ret;
-	struct buffer_info bi;
-	int n = B_NR_ITEMS(tb->L[0]);
-	unsigned body_shift_bytes = 0;
-
-	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
-		/* part of new item falls into L[0] */
-		int new_item_len, shift;
-
-		ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
-
-		/* Calculate item length to insert to S[0] */
-		new_item_len = ih_item_len(ih) - tb->lbytes;
-
-		/* Calculate and check item length to insert to L[0] */
-		put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
-
-		RFALSE(ih_item_len(ih) <= 0,
-		       "PAP-12080: there is nothing to insert into L[0]: "
-		       "ih_item_len=%d", ih_item_len(ih));
-
-		/* Insert new item into L[0] */
-		buffer_info_init_left(tb, &bi);
-		leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
-			     min_t(int, tb->zeroes_num, ih_item_len(ih)));
-
-		/*
-		 * Calculate key component, item length and body to
-		 * insert into S[0]
-		 */
-		shift = 0;
-		if (is_indirect_le_ih(ih))
-			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-
-		add_le_ih_k_offset(ih, tb->lbytes << shift);
-
-		put_ih_item_len(ih, new_item_len);
-		if (tb->lbytes > tb->zeroes_num) {
-			body_shift_bytes = tb->lbytes - tb->zeroes_num;
-			tb->zeroes_num = 0;
-		} else
-			tb->zeroes_num -= tb->lbytes;
-
-		RFALSE(ih_item_len(ih) <= 0,
-		       "PAP-12085: there is nothing to insert into S[0]: "
-		       "ih_item_len=%d", ih_item_len(ih));
-	} else {
-		/* new item in whole falls into L[0] */
-		/* Shift lnum[0]-1 items to L[0] */
-		ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
-
-		/* Insert new item into L[0] */
-		buffer_info_init_left(tb, &bi);
-		leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
-				     tb->zeroes_num);
-		tb->insert_size[0] = 0;
-		tb->zeroes_num = 0;
-	}
-	return body_shift_bytes;
-}
-
-static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
-						 struct item_head * const ih,
-						 const char * const body)
-{
-	int n = B_NR_ITEMS(tb->L[0]);
-	struct buffer_info bi;
-
-	RFALSE(tb->zeroes_num,
-	       "PAP-12090: invalid parameter in case of a directory");
-
-	/* directory item */
-	if (tb->lbytes > tb->pos_in_item) {
-		/* new directory entry falls into L[0] */
-		struct item_head *pasted;
-		int ret, l_pos_in_item = tb->pos_in_item;
-
-		/*
-		 * Shift lnum[0] - 1 items in whole.
-		 * Shift lbytes - 1 entries from given directory item
-		 */
-		ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
-		if (ret && !tb->item_pos) {
-			pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
-			l_pos_in_item += ih_entry_count(pasted) -
-					 (tb->lbytes - 1);
-		}
-
-		/* Append given directory entry to directory item */
-		buffer_info_init_left(tb, &bi);
-		leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
-				     l_pos_in_item, tb->insert_size[0],
-				     body, tb->zeroes_num);
-
-		/*
-		 * previous string prepared space for pasting new entry,
-		 * following string pastes this entry
-		 */
-
-		/*
-		 * when we have merge directory item, pos_in_item
-		 * has been changed too
-		 */
-
-		/* paste new directory entry. 1 is entry number */
-		leaf_paste_entries(&bi, n + tb->item_pos - ret,
-				   l_pos_in_item, 1,
-				   (struct reiserfs_de_head *) body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-		tb->insert_size[0] = 0;
-	} else {
-		/* new directory item doesn't fall into L[0] */
-		/*
-		 * Shift lnum[0]-1 items in whole. Shift lbytes
-		 * directory entries from directory item number lnum[0]
-		 */
-		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-	}
-
-	/* Calculate new position to append in item body */
-	tb->pos_in_item -= tb->lbytes;
-}
-
-static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
-						  struct item_head * const ih,
-						  const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tb->L[0]);
-	struct buffer_info bi;
-	int body_shift_bytes = 0;
-
-	if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
-		balance_leaf_paste_left_shift_dirent(tb, ih, body);
-		return 0;
-	}
-
-	RFALSE(tb->lbytes <= 0,
-	       "PAP-12095: there is nothing to shift to L[0]. "
-	       "lbytes=%d", tb->lbytes);
-	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
-	       "PAP-12100: incorrect position to paste: "
-	       "item_len=%d, pos_in_item=%d",
-	       ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item);
-
-	/* appended item will be in L[0] in whole */
-	if (tb->lbytes >= tb->pos_in_item) {
-		struct item_head *tbS0_pos_ih, *tbL0_ih;
-		struct item_head *tbS0_0_ih;
-		struct reiserfs_key *left_delim_key;
-		int ret, l_n, version, temp_l;
-
-		tbS0_pos_ih = item_head(tbS0, tb->item_pos);
-		tbS0_0_ih = item_head(tbS0, 0);
-
-		/*
-		 * this bytes number must be appended
-		 * to the last item of L[h]
-		 */
-		l_n = tb->lbytes - tb->pos_in_item;
-
-		/* Calculate new insert_size[0] */
-		tb->insert_size[0] -= l_n;
-
-		RFALSE(tb->insert_size[0] <= 0,
-		       "PAP-12105: there is nothing to paste into "
-		       "L[0]. insert_size=%d", tb->insert_size[0]);
-
-		ret = leaf_shift_left(tb, tb->lnum[0],
-				      ih_item_len(tbS0_pos_ih));
-
-		tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret);
-
-		/* Append to body of item in L[0] */
-		buffer_info_init_left(tb, &bi);
-		leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
-				     ih_item_len(tbL0_ih), l_n, body,
-				     min_t(int, l_n, tb->zeroes_num));
-
-		/*
-		 * 0-th item in S0 can be only of DIRECT type
-		 * when l_n != 0
-		 */
-		temp_l = l_n;
-
-		RFALSE(ih_item_len(tbS0_0_ih),
-		       "PAP-12106: item length must be 0");
-		RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
-		       leaf_key(tb->L[0], n + tb->item_pos - ret)),
-		       "PAP-12107: items must be of the same file");
-
-		if (is_indirect_le_ih(tbL0_ih)) {
-			int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-			temp_l = l_n << shift;
-		}
-		/* update key of first item in S0 */
-		version = ih_version(tbS0_0_ih);
-		add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l);
-
-		/* update left delimiting key */
-		left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]);
-		add_le_key_k_offset(version, left_delim_key, temp_l);
-
-		/*
-		 * Calculate new body, position in item and
-		 * insert_size[0]
-		 */
-		if (l_n > tb->zeroes_num) {
-			body_shift_bytes = l_n - tb->zeroes_num;
-			tb->zeroes_num = 0;
-		} else
-			tb->zeroes_num -= l_n;
-		tb->pos_in_item = 0;
-
-		RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
-					  leaf_key(tb->L[0],
-						 B_NR_ITEMS(tb->L[0]) - 1)) ||
-		       !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) ||
-		       !op_is_left_mergeable(left_delim_key, tbS0->b_size),
-		       "PAP-12120: item must be merge-able with left "
-		       "neighboring item");
-	} else {
-		/* only part of the appended item will be in L[0] */
-
-		/* Calculate position in item for append in S[0] */
-		tb->pos_in_item -= tb->lbytes;
-
-		RFALSE(tb->pos_in_item <= 0,
-		       "PAP-12125: no place for paste. pos_in_item=%d",
-		       tb->pos_in_item);
-
-		/*
-		 * Shift lnum[0] - 1 items in whole.
-		 * Shift lbytes - 1 byte from item number lnum[0]
-		 */
-		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-	}
-	return body_shift_bytes;
-}
-
-
-/* appended item will be in L[0] in whole */
-static void balance_leaf_paste_left_whole(struct tree_balance *tb,
-					  struct item_head * const ih,
-					  const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tb->L[0]);
-	struct buffer_info bi;
-	struct item_head *pasted;
-	int ret;
-
-	/* if we paste into first item of S[0] and it is left mergable */
-	if (!tb->item_pos &&
-	    op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) {
-		/*
-		 * then increment pos_in_item by the size of the
-		 * last item in L[0]
-		 */
-		pasted = item_head(tb->L[0], n - 1);
-		if (is_direntry_le_ih(pasted))
-			tb->pos_in_item += ih_entry_count(pasted);
-		else
-			tb->pos_in_item += ih_item_len(pasted);
-	}
-
-	/*
-	 * Shift lnum[0] - 1 items in whole.
-	 * Shift lbytes - 1 byte from item number lnum[0]
-	 */
-	ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-
-	/* Append to body of item in L[0] */
-	buffer_info_init_left(tb, &bi);
-	leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item,
-			     tb->insert_size[0], body, tb->zeroes_num);
-
-	/* if appended item is directory, paste entry */
-	pasted = item_head(tb->L[0], n + tb->item_pos - ret);
-	if (is_direntry_le_ih(pasted))
-		leaf_paste_entries(&bi, n + tb->item_pos - ret,
-				   tb->pos_in_item, 1,
-				   (struct reiserfs_de_head *)body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-	/*
-	 * if appended item is indirect item, put unformatted node
-	 * into un list
-	 */
-	if (is_indirect_le_ih(pasted))
-		set_ih_free_space(pasted, 0);
-
-	tb->insert_size[0] = 0;
-	tb->zeroes_num = 0;
-}
-
-static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
-					    struct item_head * const ih,
-					    const char * const body)
-{
-	/* we must shift the part of the appended item */
-	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
-		return balance_leaf_paste_left_shift(tb, ih, body);
-	else
-		balance_leaf_paste_left_whole(tb, ih, body);
-	return 0;
-}
-
-/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
-static unsigned int balance_leaf_left(struct tree_balance *tb,
-				      struct item_head * const ih,
-				      const char * const body, int flag)
-{
-	if (tb->lnum[0] <= 0)
-		return 0;
-
-	/* new item or it part falls to L[0], shift it too */
-	if (tb->item_pos < tb->lnum[0]) {
-		BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
-		if (flag == M_INSERT)
-			return balance_leaf_insert_left(tb, ih, body);
-		else /* M_PASTE */
-			return balance_leaf_paste_left(tb, ih, body);
-	} else
-		/* new item doesn't fall into L[0] */
-		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-	return 0;
-}
-
-
-static void balance_leaf_insert_right(struct tree_balance *tb,
-				      struct item_head * const ih,
-				      const char * const body)
-{
-
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-	struct buffer_info bi;
-
-	/* new item or part of it doesn't fall into R[0] */
-	if (n - tb->rnum[0] >= tb->item_pos) {
-		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-		return;
-	}
-
-	/* new item or its part falls to R[0] */
-
-	/* part of new item falls into R[0] */
-	if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
-		loff_t old_key_comp, old_len, r_zeroes_number;
-		const char *r_body;
-		int shift;
-		loff_t offset;
-
-		leaf_shift_right(tb, tb->rnum[0] - 1, -1);
-
-		/* Remember key component and item length */
-		old_key_comp = le_ih_k_offset(ih);
-		old_len = ih_item_len(ih);
-
-		/*
-		 * Calculate key component and item length to insert
-		 * into R[0]
-		 */
-		shift = 0;
-		if (is_indirect_le_ih(ih))
-			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-		offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift);
-		set_le_ih_k_offset(ih, offset);
-		put_ih_item_len(ih, tb->rbytes);
-
-		/* Insert part of the item into R[0] */
-		buffer_info_init_right(tb, &bi);
-		if ((old_len - tb->rbytes) > tb->zeroes_num) {
-			r_zeroes_number = 0;
-			r_body = body + (old_len - tb->rbytes) - tb->zeroes_num;
-		} else {
-			r_body = body;
-			r_zeroes_number = tb->zeroes_num -
-					  (old_len - tb->rbytes);
-			tb->zeroes_num -= r_zeroes_number;
-		}
-
-		leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
-
-		/* Replace right delimiting key by first key in R[0] */
-		replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-		/*
-		 * Calculate key component and item length to
-		 * insert into S[0]
-		 */
-		set_le_ih_k_offset(ih, old_key_comp);
-		put_ih_item_len(ih, old_len - tb->rbytes);
-
-		tb->insert_size[0] -= tb->rbytes;
-
-	} else {
-		/* whole new item falls into R[0] */
-
-		/* Shift rnum[0]-1 items to R[0] */
-		leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
-
-		/* Insert new item into R[0] */
-		buffer_info_init_right(tb, &bi);
-		leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1,
-				     ih, body, tb->zeroes_num);
-
-		if (tb->item_pos - n + tb->rnum[0] - 1 == 0)
-			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-		tb->zeroes_num = tb->insert_size[0] = 0;
-	}
-}
-
-
-static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
-				     struct item_head * const ih,
-				     const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct buffer_info bi;
-	int entry_count;
-
-	RFALSE(tb->zeroes_num,
-	       "PAP-12145: invalid parameter in case of a directory");
-	entry_count = ih_entry_count(item_head(tbS0, tb->item_pos));
-
-	/* new directory entry falls into R[0] */
-	if (entry_count - tb->rbytes < tb->pos_in_item) {
-		int paste_entry_position;
-
-		RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0],
-		       "PAP-12150: no enough of entries to shift to R[0]: "
-		       "rbytes=%d, entry_count=%d", tb->rbytes, entry_count);
-
-		/*
-		 * Shift rnum[0]-1 items in whole.
-		 * Shift rbytes-1 directory entries from directory
-		 * item number rnum[0]
-		 */
-		leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
-
-		/* Paste given directory entry to directory item */
-		paste_entry_position = tb->pos_in_item - entry_count +
-				       tb->rbytes - 1;
-		buffer_info_init_right(tb, &bi);
-		leaf_paste_in_buffer(&bi, 0, paste_entry_position,
-				     tb->insert_size[0], body, tb->zeroes_num);
-
-		/* paste entry */
-		leaf_paste_entries(&bi, 0, paste_entry_position, 1,
-				   (struct reiserfs_de_head *) body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-		/* change delimiting keys */
-		if (paste_entry_position == 0)
-			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-		tb->insert_size[0] = 0;
-		tb->pos_in_item++;
-	} else {
-		/* new directory entry doesn't fall into R[0] */
-		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-	}
-}
-
-static void balance_leaf_paste_right_shift(struct tree_balance *tb,
-				     struct item_head * const ih,
-				     const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n_shift, n_rem, r_zeroes_number, version;
-	unsigned long temp_rem;
-	const char *r_body;
-	struct buffer_info bi;
-
-	/* we append to directory item */
-	if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
-		balance_leaf_paste_right_shift_dirent(tb, ih, body);
-		return;
-	}
-
-	/* regular object */
-
-	/*
-	 * Calculate number of bytes which must be shifted
-	 * from appended item
-	 */
-	n_shift = tb->rbytes - tb->insert_size[0];
-	if (n_shift < 0)
-		n_shift = 0;
-
-	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
-	       "PAP-12155: invalid position to paste. ih_item_len=%d, "
-	       "pos_in_item=%d", tb->pos_in_item,
-	       ih_item_len(item_head(tbS0, tb->item_pos)));
-
-	leaf_shift_right(tb, tb->rnum[0], n_shift);
-
-	/*
-	 * Calculate number of bytes which must remain in body
-	 * after appending to R[0]
-	 */
-	n_rem = tb->insert_size[0] - tb->rbytes;
-	if (n_rem < 0)
-		n_rem = 0;
-
-	temp_rem = n_rem;
-
-	version = ih_version(item_head(tb->R[0], 0));
-
-	if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) {
-		int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-		temp_rem = n_rem << shift;
-	}
-
-	add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem);
-	add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]),
-			    temp_rem);
-
-	do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
-
-	/* Append part of body into R[0] */
-	buffer_info_init_right(tb, &bi);
-	if (n_rem > tb->zeroes_num) {
-		r_zeroes_number = 0;
-		r_body = body + n_rem - tb->zeroes_num;
-	} else {
-		r_body = body;
-		r_zeroes_number = tb->zeroes_num - n_rem;
-		tb->zeroes_num -= r_zeroes_number;
-	}
-
-	leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
-			     r_body, r_zeroes_number);
-
-	if (is_indirect_le_ih(item_head(tb->R[0], 0)))
-		set_ih_free_space(item_head(tb->R[0], 0), 0);
-
-	tb->insert_size[0] = n_rem;
-	if (!n_rem)
-		tb->pos_in_item++;
-}
-
-static void balance_leaf_paste_right_whole(struct tree_balance *tb,
-				     struct item_head * const ih,
-				     const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-	struct item_head *pasted;
-	struct buffer_info bi;
-
-	buffer_info_init_right(tb, &bi);
-	leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-
-	/* append item in R[0] */
-	if (tb->pos_in_item >= 0) {
-		buffer_info_init_right(tb, &bi);
-		leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0],
-				     tb->pos_in_item, tb->insert_size[0], body,
-				     tb->zeroes_num);
-	}
-
-	/* paste new entry, if item is directory item */
-	pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]);
-	if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) {
-		leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0],
-				   tb->pos_in_item, 1,
-				   (struct reiserfs_de_head *)body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-		if (!tb->pos_in_item) {
-
-			RFALSE(tb->item_pos - n + tb->rnum[0],
-			       "PAP-12165: directory item must be first "
-			       "item of node when pasting is in 0th position");
-
-			/* update delimiting keys */
-			replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-		}
-	}
-
-	if (is_indirect_le_ih(pasted))
-		set_ih_free_space(pasted, 0);
-	tb->zeroes_num = tb->insert_size[0] = 0;
-}
-
-static void balance_leaf_paste_right(struct tree_balance *tb,
-				     struct item_head * const ih,
-				     const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-
-	/* new item doesn't fall into R[0] */
-	if (n - tb->rnum[0] > tb->item_pos) {
-		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-		return;
-	}
-
-	/* pasted item or part of it falls to R[0] */
-
-	if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1)
-		/* we must shift the part of the appended item */
-		balance_leaf_paste_right_shift(tb, ih, body);
-	else
-		/* pasted item in whole falls into R[0] */
-		balance_leaf_paste_right_whole(tb, ih, body);
-}
-
-/* shift rnum[0] items from S[0] to the right neighbor R[0] */
-static void balance_leaf_right(struct tree_balance *tb,
-			       struct item_head * const ih,
-			       const char * const body, int flag)
-{
-	if (tb->rnum[0] <= 0)
-		return;
-
-	BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
-	if (flag == M_INSERT)
-		balance_leaf_insert_right(tb, ih, body);
-	else /* M_PASTE */
-		balance_leaf_paste_right(tb, ih, body);
-}
-
-static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
-					  struct item_head * const ih,
-					  const char * const body,
-					  struct item_head *insert_key,
-					  struct buffer_head **insert_ptr,
-					  int i)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-	struct buffer_info bi;
-	int shift;
-
-	/* new item or it part don't falls into S_new[i] */
-	if (n - tb->snum[i] >= tb->item_pos) {
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
-				tb->snum[i], tb->sbytes[i], tb->S_new[i]);
-		return;
-	}
-
-	/* new item or it's part falls to first new node S_new[i] */
-
-	/* part of new item falls into S_new[i] */
-	if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
-		int old_key_comp, old_len, r_zeroes_number;
-		const char *r_body;
-
-		/* Move snum[i]-1 items from S[0] to S_new[i] */
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
-				tb->S_new[i]);
-
-		/* Remember key component and item length */
-		old_key_comp = le_ih_k_offset(ih);
-		old_len = ih_item_len(ih);
-
-		/*
-		 * Calculate key component and item length to insert
-		 * into S_new[i]
-		 */
-		shift = 0;
-		if (is_indirect_le_ih(ih))
-			shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-		set_le_ih_k_offset(ih,
-				   le_ih_k_offset(ih) +
-				   ((old_len - tb->sbytes[i]) << shift));
-
-		put_ih_item_len(ih, tb->sbytes[i]);
-
-		/* Insert part of the item into S_new[i] before 0-th item */
-		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-
-		if ((old_len - tb->sbytes[i]) > tb->zeroes_num) {
-			r_zeroes_number = 0;
-			r_body = body + (old_len - tb->sbytes[i]) -
-					 tb->zeroes_num;
-		} else {
-			r_body = body;
-			r_zeroes_number = tb->zeroes_num - (old_len -
-					  tb->sbytes[i]);
-			tb->zeroes_num -= r_zeroes_number;
-		}
-
-		leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
-
-		/*
-		 * Calculate key component and item length to
-		 * insert into S[i]
-		 */
-		set_le_ih_k_offset(ih, old_key_comp);
-		put_ih_item_len(ih, old_len - tb->sbytes[i]);
-		tb->insert_size[0] -= tb->sbytes[i];
-	} else {
-		/* whole new item falls into S_new[i] */
-
-		/*
-		 * Shift snum[0] - 1 items to S_new[i]
-		 * (sbytes[i] of split item)
-		 */
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
-				tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]);
-
-		/* Insert new item into S_new[i] */
-		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-		leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1,
-				     ih, body, tb->zeroes_num);
-
-		tb->zeroes_num = tb->insert_size[0] = 0;
-	}
-}
-
-/* we append to directory item */
-static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
-					 struct item_head * const ih,
-					 const char * const body,
-					 struct item_head *insert_key,
-					 struct buffer_head **insert_ptr,
-					 int i)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
-	int entry_count = ih_entry_count(aux_ih);
-	struct buffer_info bi;
-
-	if (entry_count - tb->sbytes[i] < tb->pos_in_item &&
-	    tb->pos_in_item <= entry_count) {
-		/* new directory entry falls into S_new[i] */
-
-		RFALSE(!tb->insert_size[0],
-		       "PAP-12215: insert_size is already 0");
-		RFALSE(tb->sbytes[i] - 1 >= entry_count,
-		       "PAP-12220: there are no so much entries (%d), only %d",
-		       tb->sbytes[i] - 1, entry_count);
-
-		/*
-		 * Shift snum[i]-1 items in whole.
-		 * Shift sbytes[i] directory entries
-		 * from directory item number snum[i]
-		 */
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
-				tb->sbytes[i] - 1, tb->S_new[i]);
-
-		/*
-		 * Paste given directory entry to
-		 * directory item
-		 */
-		buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-		leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count +
-				     tb->sbytes[i] - 1, tb->insert_size[0],
-				     body, tb->zeroes_num);
-
-		/* paste new directory entry */
-		leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count +
-				   tb->sbytes[i] - 1, 1,
-				   (struct reiserfs_de_head *) body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-		tb->insert_size[0] = 0;
-		tb->pos_in_item++;
-	} else {
-		/* new directory entry doesn't fall into S_new[i] */
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
-				tb->sbytes[i], tb->S_new[i]);
-	}
-
-}
-
-static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
-					 struct item_head * const ih,
-					 const char * const body,
-					 struct item_head *insert_key,
-					 struct buffer_head **insert_ptr,
-					 int i)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
-	int n_shift, n_rem, r_zeroes_number, shift;
-	const char *r_body;
-	struct item_head *tmp;
-	struct buffer_info bi;
-
-	RFALSE(ih, "PAP-12210: ih must be 0");
-
-	if (is_direntry_le_ih(aux_ih)) {
-		balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key,
-						    insert_ptr, i);
-		return;
-	}
-
-	/* regular object */
-
-
-	RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) ||
-	       tb->insert_size[0] <= 0,
-	       "PAP-12225: item too short or insert_size <= 0");
-
-	/*
-	 * Calculate number of bytes which must be shifted from appended item
-	 */
-	n_shift = tb->sbytes[i] - tb->insert_size[0];
-	if (n_shift < 0)
-		n_shift = 0;
-	leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift,
-			tb->S_new[i]);
-
-	/*
-	 * Calculate number of bytes which must remain in body after
-	 * append to S_new[i]
-	 */
-	n_rem = tb->insert_size[0] - tb->sbytes[i];
-	if (n_rem < 0)
-		n_rem = 0;
-
-	/* Append part of body into S_new[0] */
-	buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-	if (n_rem > tb->zeroes_num) {
-		r_zeroes_number = 0;
-		r_body = body + n_rem - tb->zeroes_num;
-	} else {
-		r_body = body;
-		r_zeroes_number = tb->zeroes_num - n_rem;
-		tb->zeroes_num -= r_zeroes_number;
-	}
-
-	leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
-			     r_body, r_zeroes_number);
-
-	tmp = item_head(tb->S_new[i], 0);
-	shift = 0;
-	if (is_indirect_le_ih(tmp)) {
-		set_ih_free_space(tmp, 0);
-		shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-	}
-	add_le_ih_k_offset(tmp, n_rem << shift);
-
-	tb->insert_size[0] = n_rem;
-	if (!n_rem)
-		tb->pos_in_item++;
-}
-
-static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
-					       struct item_head * const ih,
-					       const char * const body,
-					       struct item_head *insert_key,
-					       struct buffer_head **insert_ptr,
-					       int i)
-
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-	int leaf_mi;
-	struct item_head *pasted;
-	struct buffer_info bi;
-
-#ifdef CONFIG_REISERFS_CHECK
-	struct item_head *ih_check = item_head(tbS0, tb->item_pos);
-
-	if (!is_direntry_le_ih(ih_check) &&
-	    (tb->pos_in_item != ih_item_len(ih_check) ||
-	    tb->insert_size[0] <= 0))
-		reiserfs_panic(tb->tb_sb,
-			     "PAP-12235",
-			     "pos_in_item must be equal to ih_item_len");
-#endif
-
-	leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
-				  tb->sbytes[i], tb->S_new[i]);
-
-	RFALSE(leaf_mi,
-	       "PAP-12240: unexpected value returned by leaf_move_items (%d)",
-	       leaf_mi);
-
-	/* paste into item */
-	buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-	leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i],
-			     tb->pos_in_item, tb->insert_size[0],
-			     body, tb->zeroes_num);
-
-	pasted = item_head(tb->S_new[i], tb->item_pos - n +
-			   tb->snum[i]);
-	if (is_direntry_le_ih(pasted))
-		leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i],
-				   tb->pos_in_item, 1,
-				   (struct reiserfs_de_head *)body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-	/* if we paste to indirect item update ih_free_space */
-	if (is_indirect_le_ih(pasted))
-		set_ih_free_space(pasted, 0);
-
-	tb->zeroes_num = tb->insert_size[0] = 0;
-
-}
-static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
-					 struct item_head * const ih,
-					 const char * const body,
-					 struct item_head *insert_key,
-					 struct buffer_head **insert_ptr,
-					 int i)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int n = B_NR_ITEMS(tbS0);
-
-	/* pasted item doesn't fall into S_new[i] */
-	if (n - tb->snum[i] > tb->item_pos) {
-		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
-				tb->snum[i], tb->sbytes[i], tb->S_new[i]);
-		return;
-	}
-
-	/* pasted item or part if it falls to S_new[i] */
-
-	if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1)
-		/* we must shift part of the appended item */
-		balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key,
-						   insert_ptr, i);
-	else
-		/* item falls wholly into S_new[i] */
-		balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key,
-						   insert_ptr, i);
-}
-
-/* Fill new nodes that appear in place of S[0] */
-static void balance_leaf_new_nodes(struct tree_balance *tb,
-				   struct item_head * const ih,
-				   const char * const body,
-				   struct item_head *insert_key,
-				   struct buffer_head **insert_ptr,
-				   int flag)
-{
-	int i;
-	for (i = tb->blknum[0] - 2; i >= 0; i--) {
-		BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
-		RFALSE(!tb->snum[i],
-		       "PAP-12200: snum[%d] == %d. Must be > 0", i,
-		       tb->snum[i]);
-
-		/* here we shift from S to S_new nodes */
-
-		tb->S_new[i] = get_FEB(tb);
-
-		/* initialized block type and tree level */
-		set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL);
-
-		if (flag == M_INSERT)
-			balance_leaf_new_nodes_insert(tb, ih, body, insert_key,
-						      insert_ptr, i);
-		else /* M_PASTE */
-			balance_leaf_new_nodes_paste(tb, ih, body, insert_key,
-						     insert_ptr, i);
-
-		memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE);
-		insert_ptr[i] = tb->S_new[i];
-
-		RFALSE(!buffer_journaled(tb->S_new[i])
-		       || buffer_journal_dirty(tb->S_new[i])
-		       || buffer_dirty(tb->S_new[i]),
-		       "PAP-12247: S_new[%d] : (%b)",
-		       i, tb->S_new[i]);
-	}
-}
-
-static void balance_leaf_finish_node_insert(struct tree_balance *tb,
-					    struct item_head * const ih,
-					    const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct buffer_info bi;
-	buffer_info_init_tbS0(tb, &bi);
-	leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num);
-
-	/* If we insert the first key change the delimiting key */
-	if (tb->item_pos == 0) {
-		if (tb->CFL[0])	/* can be 0 in reiserfsck */
-			replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-
-	}
-}
-
-static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
-						  struct item_head * const ih,
-						  const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct item_head *pasted = item_head(tbS0, tb->item_pos);
-	struct buffer_info bi;
-
-	if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) {
-		RFALSE(!tb->insert_size[0],
-		       "PAP-12260: insert_size is 0 already");
-
-		/* prepare space */
-		buffer_info_init_tbS0(tb, &bi);
-		leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item,
-				     tb->insert_size[0], body, tb->zeroes_num);
-
-		/* paste entry */
-		leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1,
-				   (struct reiserfs_de_head *)body,
-				   body + DEH_SIZE, tb->insert_size[0]);
-
-		if (!tb->item_pos && !tb->pos_in_item) {
-			RFALSE(!tb->CFL[0] || !tb->L[0],
-			       "PAP-12270: CFL[0]/L[0] must  be specified");
-			if (tb->CFL[0])
-				replace_key(tb, tb->CFL[0], tb->lkey[0],
-					    tbS0, 0);
-		}
-
-		tb->insert_size[0] = 0;
-	}
-}
-
-static void balance_leaf_finish_node_paste(struct tree_balance *tb,
-					   struct item_head * const ih,
-					   const char * const body)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	struct buffer_info bi;
-	struct item_head *pasted = item_head(tbS0, tb->item_pos);
-
-	/* when directory, may be new entry already pasted */
-	if (is_direntry_le_ih(pasted)) {
-		balance_leaf_finish_node_paste_dirent(tb, ih, body);
-		return;
-	}
-
-	/* regular object */
-
-	if (tb->pos_in_item == ih_item_len(pasted)) {
-		RFALSE(tb->insert_size[0] <= 0,
-		       "PAP-12275: insert size must not be %d",
-		       tb->insert_size[0]);
-		buffer_info_init_tbS0(tb, &bi);
-		leaf_paste_in_buffer(&bi, tb->item_pos,
-				     tb->pos_in_item, tb->insert_size[0], body,
-				     tb->zeroes_num);
-
-		if (is_indirect_le_ih(pasted))
-			set_ih_free_space(pasted, 0);
-
-		tb->insert_size[0] = 0;
-	}
-#ifdef CONFIG_REISERFS_CHECK
-	else if (tb->insert_size[0]) {
-		print_cur_tb("12285");
-		reiserfs_panic(tb->tb_sb, "PAP-12285",
-		    "insert_size must be 0 (%d)", tb->insert_size[0]);
-	}
-#endif
-}
-
-/*
- * if the affected item was not wholly shifted then we
- * perform all necessary operations on that part or whole
- * of the affected item which remains in S
- */
-static void balance_leaf_finish_node(struct tree_balance *tb,
-				      struct item_head * const ih,
-				      const char * const body, int flag)
-{
-	/* if we must insert or append into buffer S[0] */
-	if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
-		if (flag == M_INSERT)
-			balance_leaf_finish_node_insert(tb, ih, body);
-		else /* M_PASTE */
-			balance_leaf_finish_node_paste(tb, ih, body);
-	}
-}
-
-/**
- * balance_leaf - reiserfs tree balancing algorithm
- * @tb: tree balance state
- * @ih: item header of inserted item (little endian)
- * @body: body of inserted item or bytes to paste
- * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance)
- * passed back:
- * @insert_key: key to insert new nodes
- * @insert_ptr: array of nodes to insert at the next level
- *
- * In our processing of one level we sometimes determine what must be
- * inserted into the next higher level.  This insertion consists of a
- * key or two keys and their corresponding pointers.
- */
-static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
-			const char *body, int flag,
-			struct item_head *insert_key,
-			struct buffer_head **insert_ptr)
-{
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-
-	PROC_INFO_INC(tb->tb_sb, balance_at[0]);
-
-	/* Make balance in case insert_size[0] < 0 */
-	if (tb->insert_size[0] < 0)
-		return balance_leaf_when_delete(tb, flag);
-
-	tb->item_pos = PATH_LAST_POSITION(tb->tb_path),
-	tb->pos_in_item = tb->tb_path->pos_in_item,
-	tb->zeroes_num = 0;
-	if (flag == M_INSERT && !body)
-		tb->zeroes_num = ih_item_len(ih);
-
-	/*
-	 * for indirect item pos_in_item is measured in unformatted node
-	 * pointers. Recalculate to bytes
-	 */
-	if (flag != M_INSERT
-	    && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
-		tb->pos_in_item *= UNFM_P_SIZE;
-
-	body += balance_leaf_left(tb, ih, body, flag);
-
-	/* tb->lnum[0] > 0 */
-	/* Calculate new item position */
-	tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
-
-	balance_leaf_right(tb, ih, body, flag);
-
-	/* tb->rnum[0] > 0 */
-	RFALSE(tb->blknum[0] > 3,
-	       "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
-	RFALSE(tb->blknum[0] < 0,
-	       "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
-
-	/*
-	 * if while adding to a node we discover that it is possible to split
-	 * it in two, and merge the left part into the left neighbor and the
-	 * right part into the right neighbor, eliminating the node
-	 */
-	if (tb->blknum[0] == 0) {	/* node S[0] is empty now */
-
-		RFALSE(!tb->lnum[0] || !tb->rnum[0],
-		       "PAP-12190: lnum and rnum must not be zero");
-		/*
-		 * if insertion was done before 0-th position in R[0], right
-		 * delimiting key of the tb->L[0]'s and left delimiting key are
-		 * not set correctly
-		 */
-		if (tb->CFL[0]) {
-			if (!tb->CFR[0])
-				reiserfs_panic(tb->tb_sb, "vs-12195",
-					       "CFR not initialized");
-			copy_key(internal_key(tb->CFL[0], tb->lkey[0]),
-				 internal_key(tb->CFR[0], tb->rkey[0]));
-			do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
-		}
-
-		reiserfs_invalidate_buffer(tb, tbS0);
-		return 0;
-	}
-
-	balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag);
-
-	balance_leaf_finish_node(tb, ih, body, flag);
-
-#ifdef CONFIG_REISERFS_CHECK
-	if (flag == M_PASTE && tb->insert_size[0]) {
-		print_cur_tb("12290");
-		reiserfs_panic(tb->tb_sb,
-			       "PAP-12290", "insert_size is still not 0 (%d)",
-			       tb->insert_size[0]);
-	}
-#endif
-
-	/* Leaf level of the tree is balanced (end of balance_leaf) */
-	return 0;
-}
-
-/* Make empty node */
-void make_empty_node(struct buffer_info *bi)
-{
-	struct block_head *blkh;
-
-	RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
-
-	blkh = B_BLK_HEAD(bi->bi_bh);
-	set_blkh_nr_item(blkh, 0);
-	set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh));
-
-	if (bi->bi_parent)
-		B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0;	/* Endian safe if 0 */
-}
-
-/* Get first empty buffer */
-struct buffer_head *get_FEB(struct tree_balance *tb)
-{
-	int i;
-	struct buffer_info bi;
-
-	for (i = 0; i < MAX_FEB_SIZE; i++)
-		if (tb->FEB[i] != NULL)
-			break;
-
-	if (i == MAX_FEB_SIZE)
-		reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty");
-
-	buffer_info_init_bh(tb, &bi, tb->FEB[i]);
-	make_empty_node(&bi);
-	set_buffer_uptodate(tb->FEB[i]);
-	tb->used[i] = tb->FEB[i];
-	tb->FEB[i] = NULL;
-
-	return tb->used[i];
-}
-
-/* This is now used because reiserfs_free_block has to be able to schedule. */
-static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
-{
-	int i;
-
-	if (buffer_dirty(bh))
-		reiserfs_warning(tb->tb_sb, "reiserfs-12320",
-				 "called with dirty buffer");
-	for (i = 0; i < ARRAY_SIZE(tb->thrown); i++)
-		if (!tb->thrown[i]) {
-			tb->thrown[i] = bh;
-			get_bh(bh);	/* free_thrown puts this */
-			return;
-		}
-	reiserfs_warning(tb->tb_sb, "reiserfs-12321",
-			 "too many thrown buffers");
-}
-
-static void free_thrown(struct tree_balance *tb)
-{
-	int i;
-	b_blocknr_t blocknr;
-	for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) {
-		if (tb->thrown[i]) {
-			blocknr = tb->thrown[i]->b_blocknr;
-			if (buffer_dirty(tb->thrown[i]))
-				reiserfs_warning(tb->tb_sb, "reiserfs-12322",
-						 "called with dirty buffer %d",
-						 blocknr);
-			brelse(tb->thrown[i]);	/* incremented in store_thrown */
-			reiserfs_free_block(tb->transaction_handle, NULL,
-					    blocknr, 0);
-		}
-	}
-}
-
-void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh)
-{
-	struct block_head *blkh;
-	blkh = B_BLK_HEAD(bh);
-	set_blkh_level(blkh, FREE_LEVEL);
-	set_blkh_nr_item(blkh, 0);
-
-	clear_buffer_dirty(bh);
-	store_thrown(tb, bh);
-}
-
-/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/
-void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
-		 struct buffer_head *src, int n_src)
-{
-
-	RFALSE(dest == NULL || src == NULL,
-	       "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
-	       src, dest);
-	RFALSE(!B_IS_KEYS_LEVEL(dest),
-	       "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
-	       dest);
-	RFALSE(n_dest < 0 || n_src < 0,
-	       "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
-	RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
-	       "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
-	       n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
-
-	if (B_IS_ITEMS_LEVEL(src))
-		/* source buffer contains leaf node */
-		memcpy(internal_key(dest, n_dest), item_head(src, n_src),
-		       KEY_SIZE);
-	else
-		memcpy(internal_key(dest, n_dest), internal_key(src, n_src),
-		       KEY_SIZE);
-
-	do_balance_mark_internal_dirty(tb, dest, 0);
-}
-
-int get_left_neighbor_position(struct tree_balance *tb, int h)
-{
-	int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
-	RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL,
-	       "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
-	       h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h));
-
-	if (Sh_position == 0)
-		return B_NR_ITEMS(tb->FL[h]);
-	else
-		return Sh_position - 1;
-}
-
-int get_right_neighbor_position(struct tree_balance *tb, int h)
-{
-	int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
-	RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL,
-	       "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
-	       h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]);
-
-	if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h)))
-		return 0;
-	else
-		return Sh_position + 1;
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
-static void check_internal_node(struct super_block *s, struct buffer_head *bh,
-				char *mes)
-{
-	struct disk_child *dc;
-	int i;
-
-	RFALSE(!bh, "PAP-12336: bh == 0");
-
-	if (!bh || !B_IS_IN_TREE(bh))
-		return;
-
-	RFALSE(!buffer_dirty(bh) &&
-	       !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
-	       "PAP-12337: buffer (%b) must be dirty", bh);
-	dc = B_N_CHILD(bh, 0);
-
-	for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
-		if (!is_reusable(s, dc_block_number(dc), 1)) {
-			print_cur_tb(mes);
-			reiserfs_panic(s, "PAP-12338",
-				       "invalid child pointer %y in %b",
-				       dc, bh);
-		}
-	}
-}
-
-static int locked_or_not_in_tree(struct tree_balance *tb,
-				  struct buffer_head *bh, char *which)
-{
-	if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
-	    !B_IS_IN_TREE(bh)) {
-		reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh);
-		return 1;
-	}
-	return 0;
-}
-
-static int check_before_balancing(struct tree_balance *tb)
-{
-	int retval = 0;
-
-	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
-		reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
-			       "occurred based on cur_tb not being null at "
-			       "this point in code. do_balance cannot properly "
-			       "handle concurrent tree accesses on a same "
-			       "mount point.");
-	}
-
-	/*
-	 * double check that buffers that we will modify are unlocked.
-	 * (fix_nodes should already have prepped all of these for us).
-	 */
-	if (tb->lnum[0]) {
-		retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
-		retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
-		retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]");
-		check_leaf(tb->L[0]);
-	}
-	if (tb->rnum[0]) {
-		retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]");
-		retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]");
-		retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]");
-		check_leaf(tb->R[0]);
-	}
-	retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path),
-					"S[0]");
-	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
-
-	return retval;
-}
-
-static void check_after_balance_leaf(struct tree_balance *tb)
-{
-	if (tb->lnum[0]) {
-		if (B_FREE_SPACE(tb->L[0]) !=
-		    MAX_CHILD_SIZE(tb->L[0]) -
-		    dc_size(B_N_CHILD
-			    (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
-			print_cur_tb("12221");
-			reiserfs_panic(tb->tb_sb, "PAP-12355",
-				       "shift to left was incorrect");
-		}
-	}
-	if (tb->rnum[0]) {
-		if (B_FREE_SPACE(tb->R[0]) !=
-		    MAX_CHILD_SIZE(tb->R[0]) -
-		    dc_size(B_N_CHILD
-			    (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
-			print_cur_tb("12222");
-			reiserfs_panic(tb->tb_sb, "PAP-12360",
-				       "shift to right was incorrect");
-		}
-	}
-	if (PATH_H_PBUFFER(tb->tb_path, 1) &&
-	    (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) !=
-	     (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
-	      dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
-				PATH_H_POSITION(tb->tb_path, 1)))))) {
-		int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0));
-		int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
-			     dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
-					       PATH_H_POSITION(tb->tb_path,
-							       1))));
-		print_cur_tb("12223");
-		reiserfs_warning(tb->tb_sb, "reiserfs-12363",
-				 "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
-				 "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
-				 left,
-				 MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)),
-				 PATH_H_PBUFFER(tb->tb_path, 1),
-				 PATH_H_POSITION(tb->tb_path, 1),
-				 dc_size(B_N_CHILD
-					 (PATH_H_PBUFFER(tb->tb_path, 1),
-					  PATH_H_POSITION(tb->tb_path, 1))),
-				 right);
-		reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect");
-	}
-}
-
-static void check_leaf_level(struct tree_balance *tb)
-{
-	check_leaf(tb->L[0]);
-	check_leaf(tb->R[0]);
-	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
-}
-
-static void check_internal_levels(struct tree_balance *tb)
-{
-	int h;
-
-	/* check all internal nodes */
-	for (h = 1; tb->insert_size[h]; h++) {
-		check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h),
-				    "BAD BUFFER ON PATH");
-		if (tb->lnum[h])
-			check_internal_node(tb->tb_sb, tb->L[h], "BAD L");
-		if (tb->rnum[h])
-			check_internal_node(tb->tb_sb, tb->R[h], "BAD R");
-	}
-
-}
-
-#endif
-
-/*
- * Now we have all of the buffers that must be used in balancing of
- * the tree.  We rely on the assumption that schedule() will not occur
- * while do_balance works. ( Only interrupt handlers are acceptable.)
- * We balance the tree according to the analysis made before this,
- * using buffers already obtained.  For SMP support it will someday be
- * necessary to add ordered locking of tb.
- */
-
-/*
- * Some interesting rules of balancing:
- * we delete a maximum of two nodes per level per balancing: we never
- * delete R, when we delete two of three nodes L, S, R then we move
- * them into R.
- *
- * we only delete L if we are deleting two nodes, if we delete only
- * one node we delete S
- *
- * if we shift leaves then we shift as much as we can: this is a
- * deliberate policy of extremism in node packing which results in
- * higher average utilization after repeated random balance operations
- * at the cost of more memory copies and more balancing as a result of
- * small insertions to full nodes.
- *
- * if we shift internal nodes we try to evenly balance the node
- * utilization, with consequent less balancing at the cost of lower
- * utilization.
- *
- * one could argue that the policy for directories in leaves should be
- * that of internal nodes, but we will wait until another day to
- * evaluate this....  It would be nice to someday measure and prove
- * these assumptions as to what is optimal....
- */
-
-static inline void do_balance_starts(struct tree_balance *tb)
-{
-	/* use print_cur_tb() to see initial state of struct tree_balance */
-
-	/* store_print_tb (tb); */
-
-	/* do not delete, just comment it out */
-	/*
-	print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
-		 tb->tb_path->pos_in_item, tb, "check");
-	*/
-	RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
-#ifdef CONFIG_REISERFS_CHECK
-	REISERFS_SB(tb->tb_sb)->cur_tb = tb;
-#endif
-}
-
-static inline void do_balance_completed(struct tree_balance *tb)
-{
-
-#ifdef CONFIG_REISERFS_CHECK
-	check_leaf_level(tb);
-	check_internal_levels(tb);
-	REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
-#endif
-
-	/*
-	 * reiserfs_free_block is no longer schedule safe.  So, we need to
-	 * put the buffers we want freed on the thrown list during do_balance,
-	 * and then free them now
-	 */
-
-	REISERFS_SB(tb->tb_sb)->s_do_balance++;
-
-	/* release all nodes hold to perform the balancing */
-	unfix_nodes(tb);
-
-	free_thrown(tb);
-}
-
-/*
- * do_balance - balance the tree
- *
- * @tb: tree_balance structure
- * @ih: item header of inserted item
- * @body: body of inserted item or bytes to paste
- * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste
- *
- * Cut means delete part of an item (includes removing an entry from a
- * directory).
- *
- * Delete means delete whole item.
- *
- * Insert means add a new item into the tree.
- *
- * Paste means to append to the end of an existing file or to
- * insert a directory entry.
- */
-void do_balance(struct tree_balance *tb, struct item_head *ih,
-		const char *body, int flag)
-{
-	int child_pos;		/* position of a child node in its parent */
-	int h;			/* level of the tree being processed */
-
-	/*
-	 * in our processing of one level we sometimes determine what
-	 * must be inserted into the next higher level.  This insertion
-	 * consists of a key or two keys and their corresponding
-	 * pointers
-	 */
-	struct item_head insert_key[2];
-
-	/* inserted node-ptrs for the next level */
-	struct buffer_head *insert_ptr[2];
-
-	tb->tb_mode = flag;
-	tb->need_balance_dirty = 0;
-
-	if (FILESYSTEM_CHANGED_TB(tb)) {
-		reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has "
-			       "changed");
-	}
-	/* if we have no real work to do  */
-	if (!tb->insert_size[0]) {
-		reiserfs_warning(tb->tb_sb, "PAP-12350",
-				 "insert_size == 0, mode == %c", flag);
-		unfix_nodes(tb);
-		return;
-	}
-
-	atomic_inc(&fs_generation(tb->tb_sb));
-	do_balance_starts(tb);
-
-	/*
-	 * balance_leaf returns 0 except if combining L R and S into
-	 * one node.  see balance_internal() for explanation of this
-	 * line of code.
-	 */
-	child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
-	    balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
-
-#ifdef CONFIG_REISERFS_CHECK
-	check_after_balance_leaf(tb);
-#endif
-
-	/* Balance internal level of the tree. */
-	for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
-		child_pos = balance_internal(tb, h, child_pos, insert_key,
-					     insert_ptr);
-
-	do_balance_completed(tb);
-}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
deleted file mode 100644
index 8eb3ad3e8ae9..000000000000
--- a/fs/reiserfs/file.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/uaccess.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/quotaops.h>
-
-/*
- * We pack the tails of files on file close, not at the time they are written.
- * This implies an unnecessary copy of the tail and an unnecessary indirect item
- * insertion/balancing, for files that are written in one write.
- * It avoids unnecessary tail packings (balances) for files that are written in
- * multiple writes and are small enough to have tails.
- *
- * file_release is called by the VFS layer when the file is closed.  If
- * this is the last open file descriptor, and the file
- * small enough to have a tail, and the tail is currently in an
- * unformatted node, the tail is converted back into a direct item.
- *
- * We use reiserfs_truncate_file to pack the tail, since it already has
- * all the conditions coded.
- */
-static int reiserfs_file_release(struct inode *inode, struct file *filp)
-{
-
-	struct reiserfs_transaction_handle th;
-	int err;
-	int jbegin_failure = 0;
-
-	BUG_ON(!S_ISREG(inode->i_mode));
-
-	if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers,
-				       &REISERFS_I(inode)->tailpack))
-		return 0;
-
-	/* fast out for when nothing needs to be done */
-	if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
-	     !tail_has_to_be_packed(inode)) &&
-	    REISERFS_I(inode)->i_prealloc_count <= 0) {
-		mutex_unlock(&REISERFS_I(inode)->tailpack);
-		return 0;
-	}
-
-	reiserfs_write_lock(inode->i_sb);
-	/*
-	 * freeing preallocation only involves relogging blocks that
-	 * are already in the current transaction.  preallocation gets
-	 * freed at the end of each transaction, so it is impossible for
-	 * us to log any additional blocks (including quota blocks)
-	 */
-	err = journal_begin(&th, inode->i_sb, 1);
-	if (err) {
-		/*
-		 * uh oh, we can't allow the inode to go away while there
-		 * is still preallocation blocks pending.  Try to join the
-		 * aborted transaction
-		 */
-		jbegin_failure = err;
-		err = journal_join_abort(&th, inode->i_sb);
-
-		if (err) {
-			/*
-			 * hmpf, our choices here aren't good.  We can pin
-			 * the inode which will disallow unmount from ever
-			 * happening, we can do nothing, which will corrupt
-			 * random memory on unmount, or we can forcibly
-			 * remove the file from the preallocation list, which
-			 * will leak blocks on disk.  Lets pin the inode
-			 * and let the admin know what is going on.
-			 */
-			igrab(inode);
-			reiserfs_warning(inode->i_sb, "clm-9001",
-					 "pinning inode %lu because the "
-					 "preallocation can't be freed",
-					 inode->i_ino);
-			goto out;
-		}
-	}
-	reiserfs_update_inode_transaction(inode);
-
-#ifdef REISERFS_PREALLOCATE
-	reiserfs_discard_prealloc(&th, inode);
-#endif
-	err = journal_end(&th);
-
-	/* copy back the error code from journal_begin */
-	if (!err)
-		err = jbegin_failure;
-
-	if (!err &&
-	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
-	    tail_has_to_be_packed(inode)) {
-
-		/*
-		 * if regular file is released by last holder and it has been
-		 * appended (we append by unformatted node only) or its direct
-		 * item(s) had to be converted, then it may have to be
-		 * indirect2direct converted
-		 */
-		err = reiserfs_truncate_file(inode, 0);
-	}
-out:
-	reiserfs_write_unlock(inode->i_sb);
-	mutex_unlock(&REISERFS_I(inode)->tailpack);
-	return err;
-}
-
-static int reiserfs_file_open(struct inode *inode, struct file *file)
-{
-	int err = dquot_file_open(inode, file);
-
-	/* somebody might be tailpacking on final close; wait for it */
-        if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
-		mutex_lock(&REISERFS_I(inode)->tailpack);
-		atomic_inc(&REISERFS_I(inode)->openers);
-		mutex_unlock(&REISERFS_I(inode)->tailpack);
-	}
-	return err;
-}
-
-void reiserfs_vfs_truncate_file(struct inode *inode)
-{
-	mutex_lock(&REISERFS_I(inode)->tailpack);
-	reiserfs_truncate_file(inode, 1);
-	mutex_unlock(&REISERFS_I(inode)->tailpack);
-}
-
-/* Sync a reiserfs file. */
-
-/*
- * FIXME: sync_mapping_buffers() never has anything to sync.  Can
- * be removed...
- */
-
-static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
-			      int datasync)
-{
-	struct inode *inode = filp->f_mapping->host;
-	int err;
-	int barrier_done;
-
-	err = file_write_and_wait_range(filp, start, end);
-	if (err)
-		return err;
-
-	inode_lock(inode);
-	BUG_ON(!S_ISREG(inode->i_mode));
-	err = sync_mapping_buffers(inode->i_mapping);
-	reiserfs_write_lock(inode->i_sb);
-	barrier_done = reiserfs_commit_for_inode(inode);
-	reiserfs_write_unlock(inode->i_sb);
-	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev);
-	inode_unlock(inode);
-	if (barrier_done < 0)
-		return barrier_done;
-	return (err < 0) ? -EIO : 0;
-}
-
-/* taken fs/buffer.c:__block_commit_write */
-int reiserfs_commit_page(struct inode *inode, struct page *page,
-			 unsigned from, unsigned to)
-{
-	unsigned block_start, block_end;
-	int partial = 0;
-	unsigned blocksize;
-	struct buffer_head *bh, *head;
-	unsigned long i_size_index = inode->i_size >> PAGE_SHIFT;
-	int new;
-	int logit = reiserfs_file_data_log(inode);
-	struct super_block *s = inode->i_sb;
-	int bh_per_page = PAGE_SIZE / s->s_blocksize;
-	struct reiserfs_transaction_handle th;
-	int ret = 0;
-
-	th.t_trans_id = 0;
-	blocksize = i_blocksize(inode);
-
-	if (logit) {
-		reiserfs_write_lock(s);
-		ret = journal_begin(&th, s, bh_per_page + 1);
-		if (ret)
-			goto drop_write_lock;
-		reiserfs_update_inode_transaction(inode);
-	}
-	for (bh = head = page_buffers(page), block_start = 0;
-	     bh != head || !block_start;
-	     block_start = block_end, bh = bh->b_this_page) {
-
-		new = buffer_new(bh);
-		clear_buffer_new(bh);
-		block_end = block_start + blocksize;
-		if (block_end <= from || block_start >= to) {
-			if (!buffer_uptodate(bh))
-				partial = 1;
-		} else {
-			set_buffer_uptodate(bh);
-			if (logit) {
-				reiserfs_prepare_for_journal(s, bh, 1);
-				journal_mark_dirty(&th, bh);
-			} else if (!buffer_dirty(bh)) {
-				mark_buffer_dirty(bh);
-				/*
-				 * do data=ordered on any page past the end
-				 * of file and any buffer marked BH_New.
-				 */
-				if (reiserfs_data_ordered(inode->i_sb) &&
-				    (new || page->index >= i_size_index)) {
-					reiserfs_add_ordered_list(inode, bh);
-				}
-			}
-		}
-	}
-	if (logit) {
-		ret = journal_end(&th);
-drop_write_lock:
-		reiserfs_write_unlock(s);
-	}
-	/*
-	 * If this is a partial write which happened to make all buffers
-	 * uptodate then we can optimize away a bogus read_folio() for
-	 * the next read(). Here we 'discover' whether the page went
-	 * uptodate as a result of this (potentially partial) write.
-	 */
-	if (!partial)
-		SetPageUptodate(page);
-	return ret;
-}
-
-const struct file_operations reiserfs_file_operations = {
-	.unlocked_ioctl = reiserfs_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl = reiserfs_compat_ioctl,
-#endif
-	.mmap = generic_file_mmap,
-	.open = reiserfs_file_open,
-	.release = reiserfs_file_release,
-	.fsync = reiserfs_sync_file,
-	.read_iter = generic_file_read_iter,
-	.write_iter = generic_file_write_iter,
-	.splice_read = filemap_splice_read,
-	.splice_write = iter_file_splice_write,
-	.llseek = generic_file_llseek,
-};
-
-const struct inode_operations reiserfs_file_inode_operations = {
-	.setattr = reiserfs_setattr,
-	.listxattr = reiserfs_listxattr,
-	.permission = reiserfs_permission,
-	.get_inode_acl = reiserfs_get_acl,
-	.set_acl = reiserfs_set_acl,
-	.fileattr_get = reiserfs_fileattr_get,
-	.fileattr_set = reiserfs_fileattr_set,
-};
-
-const struct inode_operations reiserfs_priv_file_inode_operations = {
-	.setattr = reiserfs_setattr,
-	.permission = reiserfs_permission,
-	.fileattr_get = reiserfs_fileattr_get,
-	.fileattr_set = reiserfs_fileattr_set,
-};
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
deleted file mode 100644
index 6c13a8d9a73c..000000000000
--- a/fs/reiserfs/fix_node.c
+++ /dev/null
@@ -1,2822 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/*
- * To make any changes in the tree we find a node that contains item
- * to be changed/deleted or position in the node we insert a new item
- * to. We call this node S. To do balancing we need to decide what we
- * will shift to left/right neighbor, or to a new node, where new item
- * will be etc. To make this analysis simpler we build virtual
- * node. Virtual node is an array of items, that will replace items of
- * node S. (For instance if we are going to delete an item, virtual
- * node does not contain it). Virtual node keeps information about
- * item sizes and types, mergeability of first and last items, sizes
- * of all entries in directory item. We use this array of items when
- * calculating what we can shift to neighbors and how many nodes we
- * have to have if we do not any shiftings, if we shift to left/right
- * neighbor or to both.
- */
-
-/*
- * Takes item number in virtual node, returns number of item
- * that it has in source buffer
- */
-static inline int old_item_num(int new_num, int affected_item_num, int mode)
-{
-	if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
-		return new_num;
-
-	if (mode == M_INSERT) {
-
-		RFALSE(new_num == 0,
-		       "vs-8005: for INSERT mode and item number of inserted item");
-
-		return new_num - 1;
-	}
-
-	RFALSE(mode != M_DELETE,
-	       "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
-	       mode);
-	/* delete mode */
-	return new_num + 1;
-}
-
-static void create_virtual_node(struct tree_balance *tb, int h)
-{
-	struct item_head *ih;
-	struct virtual_node *vn = tb->tb_vn;
-	int new_num;
-	struct buffer_head *Sh;	/* this comes from tb->S[h] */
-
-	Sh = PATH_H_PBUFFER(tb->tb_path, h);
-
-	/* size of changed node */
-	vn->vn_size =
-	    MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
-
-	/* for internal nodes array if virtual items is not created */
-	if (h) {
-		vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
-		return;
-	}
-
-	/* number of items in virtual node  */
-	vn->vn_nr_item =
-	    B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
-	    ((vn->vn_mode == M_DELETE) ? 1 : 0);
-
-	/* first virtual item */
-	vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
-	memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
-	vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
-
-	/* first item in the node */
-	ih = item_head(Sh, 0);
-
-	/* define the mergeability for 0-th item (if it is not being deleted) */
-	if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
-	    && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
-		vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
-
-	/*
-	 * go through all items that remain in the virtual
-	 * node (except for the new (inserted) one)
-	 */
-	for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
-		int j;
-		struct virtual_item *vi = vn->vn_vi + new_num;
-		int is_affected =
-		    ((new_num != vn->vn_affected_item_num) ? 0 : 1);
-
-		if (is_affected && vn->vn_mode == M_INSERT)
-			continue;
-
-		/* get item number in source node */
-		j = old_item_num(new_num, vn->vn_affected_item_num,
-				 vn->vn_mode);
-
-		vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
-		vi->vi_ih = ih + j;
-		vi->vi_item = ih_item_body(Sh, ih + j);
-		vi->vi_uarea = vn->vn_free_ptr;
-
-		/*
-		 * FIXME: there is no check that item operation did not
-		 * consume too much memory
-		 */
-		vn->vn_free_ptr +=
-		    op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
-		if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
-			reiserfs_panic(tb->tb_sb, "vs-8030",
-				       "virtual node space consumed");
-
-		if (!is_affected)
-			/* this is not being changed */
-			continue;
-
-		if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
-			vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
-			/* pointer to data which is going to be pasted */
-			vi->vi_new_data = vn->vn_data;
-		}
-	}
-
-	/* virtual inserted item is not defined yet */
-	if (vn->vn_mode == M_INSERT) {
-		struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
-
-		RFALSE(vn->vn_ins_ih == NULL,
-		       "vs-8040: item header of inserted item is not specified");
-		vi->vi_item_len = tb->insert_size[0];
-		vi->vi_ih = vn->vn_ins_ih;
-		vi->vi_item = vn->vn_data;
-		vi->vi_uarea = vn->vn_free_ptr;
-
-		op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
-			     tb->insert_size[0]);
-	}
-
-	/*
-	 * set right merge flag we take right delimiting key and
-	 * check whether it is a mergeable item
-	 */
-	if (tb->CFR[0]) {
-		struct reiserfs_key *key;
-
-		key = internal_key(tb->CFR[0], tb->rkey[0]);
-		if (op_is_left_mergeable(key, Sh->b_size)
-		    && (vn->vn_mode != M_DELETE
-			|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
-			vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
-			    VI_TYPE_RIGHT_MERGEABLE;
-
-#ifdef CONFIG_REISERFS_CHECK
-		if (op_is_left_mergeable(key, Sh->b_size) &&
-		    !(vn->vn_mode != M_DELETE
-		      || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
-			/*
-			 * we delete last item and it could be merged
-			 * with right neighbor's first item
-			 */
-			if (!
-			    (B_NR_ITEMS(Sh) == 1
-			     && is_direntry_le_ih(item_head(Sh, 0))
-			     && ih_entry_count(item_head(Sh, 0)) == 1)) {
-				/*
-				 * node contains more than 1 item, or item
-				 * is not directory item, or this item
-				 * contains more than 1 entry
-				 */
-				print_block(Sh, 0, -1, -1);
-				reiserfs_panic(tb->tb_sb, "vs-8045",
-					       "rdkey %k, affected item==%d "
-					       "(mode==%c) Must be %c",
-					       key, vn->vn_affected_item_num,
-					       vn->vn_mode, M_DELETE);
-			}
-		}
-#endif
-
-	}
-}
-
-/*
- * Using virtual node check, how many items can be
- * shifted to left neighbor
- */
-static void check_left(struct tree_balance *tb, int h, int cur_free)
-{
-	int i;
-	struct virtual_node *vn = tb->tb_vn;
-	struct virtual_item *vi;
-	int d_size, ih_size;
-
-	RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
-
-	/* internal level */
-	if (h > 0) {
-		tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
-		return;
-	}
-
-	/* leaf level */
-
-	if (!cur_free || !vn->vn_nr_item) {
-		/* no free space or nothing to move */
-		tb->lnum[h] = 0;
-		tb->lbytes = -1;
-		return;
-	}
-
-	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
-	       "vs-8055: parent does not exist or invalid");
-
-	vi = vn->vn_vi;
-	if ((unsigned int)cur_free >=
-	    (vn->vn_size -
-	     ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
-		/* all contents of S[0] fits into L[0] */
-
-		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
-		       "vs-8055: invalid mode or balance condition failed");
-
-		tb->lnum[0] = vn->vn_nr_item;
-		tb->lbytes = -1;
-		return;
-	}
-
-	d_size = 0, ih_size = IH_SIZE;
-
-	/* first item may be merge with last item in left neighbor */
-	if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
-		d_size = -((int)IH_SIZE), ih_size = 0;
-
-	tb->lnum[0] = 0;
-	for (i = 0; i < vn->vn_nr_item;
-	     i++, ih_size = IH_SIZE, d_size = 0, vi++) {
-		d_size += vi->vi_item_len;
-		if (cur_free >= d_size) {
-			/* the item can be shifted entirely */
-			cur_free -= d_size;
-			tb->lnum[0]++;
-			continue;
-		}
-
-		/* the item cannot be shifted entirely, try to split it */
-		/*
-		 * check whether L[0] can hold ih and at least one byte
-		 * of the item body
-		 */
-
-		/* cannot shift even a part of the current item */
-		if (cur_free <= ih_size) {
-			tb->lbytes = -1;
-			return;
-		}
-		cur_free -= ih_size;
-
-		tb->lbytes = op_check_left(vi, cur_free, 0, 0);
-		if (tb->lbytes != -1)
-			/* count partially shifted item */
-			tb->lnum[0]++;
-
-		break;
-	}
-
-	return;
-}
-
-/*
- * Using virtual node check, how many items can be
- * shifted to right neighbor
- */
-static void check_right(struct tree_balance *tb, int h, int cur_free)
-{
-	int i;
-	struct virtual_node *vn = tb->tb_vn;
-	struct virtual_item *vi;
-	int d_size, ih_size;
-
-	RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
-
-	/* internal level */
-	if (h > 0) {
-		tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
-		return;
-	}
-
-	/* leaf level */
-
-	if (!cur_free || !vn->vn_nr_item) {
-		/* no free space  */
-		tb->rnum[h] = 0;
-		tb->rbytes = -1;
-		return;
-	}
-
-	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
-	       "vs-8075: parent does not exist or invalid");
-
-	vi = vn->vn_vi + vn->vn_nr_item - 1;
-	if ((unsigned int)cur_free >=
-	    (vn->vn_size -
-	     ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
-		/* all contents of S[0] fits into R[0] */
-
-		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
-		       "vs-8080: invalid mode or balance condition failed");
-
-		tb->rnum[h] = vn->vn_nr_item;
-		tb->rbytes = -1;
-		return;
-	}
-
-	d_size = 0, ih_size = IH_SIZE;
-
-	/* last item may be merge with first item in right neighbor */
-	if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
-		d_size = -(int)IH_SIZE, ih_size = 0;
-
-	tb->rnum[0] = 0;
-	for (i = vn->vn_nr_item - 1; i >= 0;
-	     i--, d_size = 0, ih_size = IH_SIZE, vi--) {
-		d_size += vi->vi_item_len;
-		if (cur_free >= d_size) {
-			/* the item can be shifted entirely */
-			cur_free -= d_size;
-			tb->rnum[0]++;
-			continue;
-		}
-
-		/*
-		 * check whether R[0] can hold ih and at least one
-		 * byte of the item body
-		 */
-
-		/* cannot shift even a part of the current item */
-		if (cur_free <= ih_size) {
-			tb->rbytes = -1;
-			return;
-		}
-
-		/*
-		 * R[0] can hold the header of the item and at least
-		 * one byte of its body
-		 */
-		cur_free -= ih_size;	/* cur_free is still > 0 */
-
-		tb->rbytes = op_check_right(vi, cur_free);
-		if (tb->rbytes != -1)
-			/* count partially shifted item */
-			tb->rnum[0]++;
-
-		break;
-	}
-
-	return;
-}
-
-/*
- * from - number of items, which are shifted to left neighbor entirely
- * to - number of item, which are shifted to right neighbor entirely
- * from_bytes - number of bytes of boundary item (or directory entries)
- *              which are shifted to left neighbor
- * to_bytes - number of bytes of boundary item (or directory entries)
- *            which are shifted to right neighbor
- */
-static int get_num_ver(int mode, struct tree_balance *tb, int h,
-		       int from, int from_bytes,
-		       int to, int to_bytes, short *snum012, int flow)
-{
-	int i;
-	int units;
-	struct virtual_node *vn = tb->tb_vn;
-	int total_node_size, max_node_size, current_item_size;
-	int needed_nodes;
-
-	/* position of item we start filling node from */
-	int start_item;
-
-	/* position of item we finish filling node by */
-	int end_item;
-
-	/*
-	 * number of first bytes (entries for directory) of start_item-th item
-	 * we do not include into node that is being filled
-	 */
-	int start_bytes;
-
-	/*
-	 * number of last bytes (entries for directory) of end_item-th item
-	 * we do node include into node that is being filled
-	 */
-	int end_bytes;
-
-	/*
-	 * these are positions in virtual item of items, that are split
-	 * between S[0] and S1new and S1new and S2new
-	 */
-	int split_item_positions[2];
-
-	split_item_positions[0] = -1;
-	split_item_positions[1] = -1;
-
-	/*
-	 * We only create additional nodes if we are in insert or paste mode
-	 * or we are in replace mode at the internal level. If h is 0 and
-	 * the mode is M_REPLACE then in fix_nodes we change the mode to
-	 * paste or insert before we get here in the code.
-	 */
-	RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
-	       "vs-8100: insert_size < 0 in overflow");
-
-	max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
-
-	/*
-	 * snum012 [0-2] - number of items, that lay
-	 * to S[0], first new node and second new node
-	 */
-	snum012[3] = -1;	/* s1bytes */
-	snum012[4] = -1;	/* s2bytes */
-
-	/* internal level */
-	if (h > 0) {
-		i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
-		if (i == max_node_size)
-			return 1;
-		return (i / max_node_size + 1);
-	}
-
-	/* leaf level */
-	needed_nodes = 1;
-	total_node_size = 0;
-
-	/* start from 'from'-th item */
-	start_item = from;
-	/* skip its first 'start_bytes' units */
-	start_bytes = ((from_bytes != -1) ? from_bytes : 0);
-
-	/* last included item is the 'end_item'-th one */
-	end_item = vn->vn_nr_item - to - 1;
-	/* do not count last 'end_bytes' units of 'end_item'-th item */
-	end_bytes = (to_bytes != -1) ? to_bytes : 0;
-
-	/*
-	 * go through all item beginning from the start_item-th item
-	 * and ending by the end_item-th item. Do not count first
-	 * 'start_bytes' units of 'start_item'-th item and last
-	 * 'end_bytes' of 'end_item'-th item
-	 */
-	for (i = start_item; i <= end_item; i++) {
-		struct virtual_item *vi = vn->vn_vi + i;
-		int skip_from_end = ((i == end_item) ? end_bytes : 0);
-
-		RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
-
-		/* get size of current item */
-		current_item_size = vi->vi_item_len;
-
-		/*
-		 * do not take in calculation head part (from_bytes)
-		 * of from-th item
-		 */
-		current_item_size -=
-		    op_part_size(vi, 0 /*from start */ , start_bytes);
-
-		/* do not take in calculation tail part of last item */
-		current_item_size -=
-		    op_part_size(vi, 1 /*from end */ , skip_from_end);
-
-		/* if item fits into current node entierly */
-		if (total_node_size + current_item_size <= max_node_size) {
-			snum012[needed_nodes - 1]++;
-			total_node_size += current_item_size;
-			start_bytes = 0;
-			continue;
-		}
-
-		/*
-		 * virtual item length is longer, than max size of item in
-		 * a node. It is impossible for direct item
-		 */
-		if (current_item_size > max_node_size) {
-			RFALSE(is_direct_le_ih(vi->vi_ih),
-			       "vs-8110: "
-			       "direct item length is %d. It can not be longer than %d",
-			       current_item_size, max_node_size);
-			/* we will try to split it */
-			flow = 1;
-		}
-
-		/* as we do not split items, take new node and continue */
-		if (!flow) {
-			needed_nodes++;
-			i--;
-			total_node_size = 0;
-			continue;
-		}
-
-		/*
-		 * calculate number of item units which fit into node being
-		 * filled
-		 */
-		{
-			int free_space;
-
-			free_space = max_node_size - total_node_size - IH_SIZE;
-			units =
-			    op_check_left(vi, free_space, start_bytes,
-					  skip_from_end);
-			/*
-			 * nothing fits into current node, take new
-			 * node and continue
-			 */
-			if (units == -1) {
-				needed_nodes++, i--, total_node_size = 0;
-				continue;
-			}
-		}
-
-		/* something fits into the current node */
-		start_bytes += units;
-		snum012[needed_nodes - 1 + 3] = units;
-
-		if (needed_nodes > 2)
-			reiserfs_warning(tb->tb_sb, "vs-8111",
-					 "split_item_position is out of range");
-		snum012[needed_nodes - 1]++;
-		split_item_positions[needed_nodes - 1] = i;
-		needed_nodes++;
-		/* continue from the same item with start_bytes != -1 */
-		start_item = i;
-		i--;
-		total_node_size = 0;
-	}
-
-	/*
-	 * sum012[4] (if it is not -1) contains number of units of which
-	 * are to be in S1new, snum012[3] - to be in S0. They are supposed
-	 * to be S1bytes and S2bytes correspondingly, so recalculate
-	 */
-	if (snum012[4] > 0) {
-		int split_item_num;
-		int bytes_to_r, bytes_to_l;
-		int bytes_to_S1new;
-
-		split_item_num = split_item_positions[1];
-		bytes_to_l =
-		    ((from == split_item_num
-		      && from_bytes != -1) ? from_bytes : 0);
-		bytes_to_r =
-		    ((end_item == split_item_num
-		      && end_bytes != -1) ? end_bytes : 0);
-		bytes_to_S1new =
-		    ((split_item_positions[0] ==
-		      split_item_positions[1]) ? snum012[3] : 0);
-
-		/* s2bytes */
-		snum012[4] =
-		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
-		    bytes_to_r - bytes_to_l - bytes_to_S1new;
-
-		if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
-		    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
-			reiserfs_warning(tb->tb_sb, "vs-8115",
-					 "not directory or indirect item");
-	}
-
-	/* now we know S2bytes, calculate S1bytes */
-	if (snum012[3] > 0) {
-		int split_item_num;
-		int bytes_to_r, bytes_to_l;
-		int bytes_to_S2new;
-
-		split_item_num = split_item_positions[0];
-		bytes_to_l =
-		    ((from == split_item_num
-		      && from_bytes != -1) ? from_bytes : 0);
-		bytes_to_r =
-		    ((end_item == split_item_num
-		      && end_bytes != -1) ? end_bytes : 0);
-		bytes_to_S2new =
-		    ((split_item_positions[0] == split_item_positions[1]
-		      && snum012[4] != -1) ? snum012[4] : 0);
-
-		/* s1bytes */
-		snum012[3] =
-		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
-		    bytes_to_r - bytes_to_l - bytes_to_S2new;
-	}
-
-	return needed_nodes;
-}
-
-
-/*
- * Set parameters for balancing.
- * Performs write of results of analysis of balancing into structure tb,
- * where it will later be used by the functions that actually do the balancing.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	lnum	number of items from S[h] that must be shifted to L[h];
- *	rnum	number of items from S[h] that must be shifted to R[h];
- *	blk_num	number of blocks that S[h] will be splitted into;
- *	s012	number of items that fall into splitted nodes.
- *	lbytes	number of bytes which flow to the left neighbor from the
- *              item that is not shifted entirely
- *	rbytes	number of bytes which flow to the right neighbor from the
- *              item that is not shifted entirely
- *	s1bytes	number of bytes which flow to the first  new node when
- *              S[0] splits (this number is contained in s012 array)
- */
-
-static void set_parameters(struct tree_balance *tb, int h, int lnum,
-			   int rnum, int blk_num, short *s012, int lb, int rb)
-{
-
-	tb->lnum[h] = lnum;
-	tb->rnum[h] = rnum;
-	tb->blknum[h] = blk_num;
-
-	/* only for leaf level */
-	if (h == 0) {
-		if (s012 != NULL) {
-			tb->s0num = *s012++;
-			tb->snum[0] = *s012++;
-			tb->snum[1] = *s012++;
-			tb->sbytes[0] = *s012++;
-			tb->sbytes[1] = *s012;
-		}
-		tb->lbytes = lb;
-		tb->rbytes = rb;
-	}
-	PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
-	PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
-
-	PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
-	PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
-}
-
-/*
- * check if node disappears if we shift tb->lnum[0] items to left
- * neighbor and tb->rnum[0] to the right one.
- */
-static int is_leaf_removable(struct tree_balance *tb)
-{
-	struct virtual_node *vn = tb->tb_vn;
-	int to_left, to_right;
-	int size;
-	int remain_items;
-
-	/*
-	 * number of items that will be shifted to left (right) neighbor
-	 * entirely
-	 */
-	to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
-	to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
-	remain_items = vn->vn_nr_item;
-
-	/* how many items remain in S[0] after shiftings to neighbors */
-	remain_items -= (to_left + to_right);
-
-	/* all content of node can be shifted to neighbors */
-	if (remain_items < 1) {
-		set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
-			       NULL, -1, -1);
-		return 1;
-	}
-
-	/* S[0] is not removable */
-	if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
-		return 0;
-
-	/* check whether we can divide 1 remaining item between neighbors */
-
-	/* get size of remaining item (in item units) */
-	size = op_unit_num(&vn->vn_vi[to_left]);
-
-	if (tb->lbytes + tb->rbytes >= size) {
-		set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
-			       tb->lbytes, -1);
-		return 1;
-	}
-
-	return 0;
-}
-
-/* check whether L, S, R can be joined in one node */
-static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
-{
-	struct virtual_node *vn = tb->tb_vn;
-	int ih_size;
-	struct buffer_head *S0;
-
-	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
-
-	ih_size = 0;
-	if (vn->vn_nr_item) {
-		if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
-			ih_size += IH_SIZE;
-
-		if (vn->vn_vi[vn->vn_nr_item - 1].
-		    vi_type & VI_TYPE_RIGHT_MERGEABLE)
-			ih_size += IH_SIZE;
-	} else {
-		/* there was only one item and it will be deleted */
-		struct item_head *ih;
-
-		RFALSE(B_NR_ITEMS(S0) != 1,
-		       "vs-8125: item number must be 1: it is %d",
-		       B_NR_ITEMS(S0));
-
-		ih = item_head(S0, 0);
-		if (tb->CFR[0]
-		    && !comp_short_le_keys(&ih->ih_key,
-					   internal_key(tb->CFR[0],
-							  tb->rkey[0])))
-			/*
-			 * Directory must be in correct state here: that is
-			 * somewhere at the left side should exist first
-			 * directory item. But the item being deleted can
-			 * not be that first one because its right neighbor
-			 * is item of the same directory. (But first item
-			 * always gets deleted in last turn). So, neighbors
-			 * of deleted item can be merged, so we can save
-			 * ih_size
-			 */
-			if (is_direntry_le_ih(ih)) {
-				ih_size = IH_SIZE;
-
-				/*
-				 * we might check that left neighbor exists
-				 * and is of the same directory
-				 */
-				RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
-				       "vs-8130: first directory item can not be removed until directory is not empty");
-			}
-
-	}
-
-	if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
-		set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
-		PROC_INFO_INC(tb->tb_sb, leaves_removable);
-		return 1;
-	}
-	return 0;
-
-}
-
-/* when we do not split item, lnum and rnum are numbers of entire items */
-#define SET_PAR_SHIFT_LEFT \
-if (h)\
-{\
-   int to_l;\
-   \
-   to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
-	      (MAX_NR_KEY(Sh) + 1 - lpar);\
-	      \
-	      set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
-}\
-else \
-{\
-   if (lset==LEFT_SHIFT_FLOW)\
-     set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
-		     tb->lbytes, -1);\
-   else\
-     set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
-		     -1, -1);\
-}
-
-#define SET_PAR_SHIFT_RIGHT \
-if (h)\
-{\
-   int to_r;\
-   \
-   to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
-   \
-   set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
-}\
-else \
-{\
-   if (rset==RIGHT_SHIFT_FLOW)\
-     set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
-		  -1, tb->rbytes);\
-   else\
-     set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
-		  -1, -1);\
-}
-
-static void free_buffers_in_tb(struct tree_balance *tb)
-{
-	int i;
-
-	pathrelse(tb->tb_path);
-
-	for (i = 0; i < MAX_HEIGHT; i++) {
-		brelse(tb->L[i]);
-		brelse(tb->R[i]);
-		brelse(tb->FL[i]);
-		brelse(tb->FR[i]);
-		brelse(tb->CFL[i]);
-		brelse(tb->CFR[i]);
-
-		tb->L[i] = NULL;
-		tb->R[i] = NULL;
-		tb->FL[i] = NULL;
-		tb->FR[i] = NULL;
-		tb->CFL[i] = NULL;
-		tb->CFR[i] = NULL;
-	}
-}
-
-/*
- * Get new buffers for storing new nodes that are created while balancing.
- * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
- *	        CARRY_ON - schedule didn't occur while the function worked;
- *	        NO_DISK_SPACE - no disk space.
- */
-/* The function is NOT SCHEDULE-SAFE! */
-static int get_empty_nodes(struct tree_balance *tb, int h)
-{
-	struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
-	b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
-	int counter, number_of_freeblk;
-	int  amount_needed;	/* number of needed empty blocks */
-	int  retval = CARRY_ON;
-	struct super_block *sb = tb->tb_sb;
-
-	/*
-	 * number_of_freeblk is the number of empty blocks which have been
-	 * acquired for use by the balancing algorithm minus the number of
-	 * empty blocks used in the previous levels of the analysis,
-	 * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
-	 * occurs after empty blocks are acquired, and the balancing analysis
-	 * is then restarted, amount_needed is the number needed by this
-	 * level (h) of the balancing analysis.
-	 *
-	 * Note that for systems with many processes writing, it would be
-	 * more layout optimal to calculate the total number needed by all
-	 * levels and then to run reiserfs_new_blocks to get all of them at
-	 * once.
-	 */
-
-	/*
-	 * Initiate number_of_freeblk to the amount acquired prior to the
-	 * restart of the analysis or 0 if not restarted, then subtract the
-	 * amount needed by all of the levels of the tree below h.
-	 */
-	/* blknum includes S[h], so we subtract 1 in this calculation */
-	for (counter = 0, number_of_freeblk = tb->cur_blknum;
-	     counter < h; counter++)
-		number_of_freeblk -=
-		    (tb->blknum[counter]) ? (tb->blknum[counter] -
-						   1) : 0;
-
-	/* Allocate missing empty blocks. */
-	/* if Sh == 0  then we are getting a new root */
-	amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
-	/*
-	 * Amount_needed = the amount that we need more than the
-	 * amount that we have.
-	 */
-	if (amount_needed > number_of_freeblk)
-		amount_needed -= number_of_freeblk;
-	else	/* If we have enough already then there is nothing to do. */
-		return CARRY_ON;
-
-	/*
-	 * No need to check quota - is not allocated for blocks used
-	 * for formatted nodes
-	 */
-	if (reiserfs_new_form_blocknrs(tb, blocknrs,
-				       amount_needed) == NO_DISK_SPACE)
-		return NO_DISK_SPACE;
-
-	/* for each blocknumber we just got, get a buffer and stick it on FEB */
-	for (blocknr = blocknrs, counter = 0;
-	     counter < amount_needed; blocknr++, counter++) {
-
-		RFALSE(!*blocknr,
-		       "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
-
-		new_bh = sb_getblk(sb, *blocknr);
-		RFALSE(buffer_dirty(new_bh) ||
-		       buffer_journaled(new_bh) ||
-		       buffer_journal_dirty(new_bh),
-		       "PAP-8140: journaled or dirty buffer %b for the new block",
-		       new_bh);
-
-		/* Put empty buffers into the array. */
-		RFALSE(tb->FEB[tb->cur_blknum],
-		       "PAP-8141: busy slot for new buffer");
-
-		set_buffer_journal_new(new_bh);
-		tb->FEB[tb->cur_blknum++] = new_bh;
-	}
-
-	if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
-		retval = REPEAT_SEARCH;
-
-	return retval;
-}
-
-/*
- * Get free space of the left neighbor, which is stored in the parent
- * node of the left neighbor.
- */
-static int get_lfree(struct tree_balance *tb, int h)
-{
-	struct buffer_head *l, *f;
-	int order;
-
-	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
-	    (l = tb->FL[h]) == NULL)
-		return 0;
-
-	if (f == l)
-		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
-	else {
-		order = B_NR_ITEMS(l);
-		f = l;
-	}
-
-	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
-}
-
-/*
- * Get free space of the right neighbor,
- * which is stored in the parent node of the right neighbor.
- */
-static int get_rfree(struct tree_balance *tb, int h)
-{
-	struct buffer_head *r, *f;
-	int order;
-
-	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
-	    (r = tb->FR[h]) == NULL)
-		return 0;
-
-	if (f == r)
-		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
-	else {
-		order = 0;
-		f = r;
-	}
-
-	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
-
-}
-
-/* Check whether left neighbor is in memory. */
-static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
-{
-	struct buffer_head *father, *left;
-	struct super_block *sb = tb->tb_sb;
-	b_blocknr_t left_neighbor_blocknr;
-	int left_neighbor_position;
-
-	/* Father of the left neighbor does not exist. */
-	if (!tb->FL[h])
-		return 0;
-
-	/* Calculate father of the node to be balanced. */
-	father = PATH_H_PBUFFER(tb->tb_path, h + 1);
-
-	RFALSE(!father ||
-	       !B_IS_IN_TREE(father) ||
-	       !B_IS_IN_TREE(tb->FL[h]) ||
-	       !buffer_uptodate(father) ||
-	       !buffer_uptodate(tb->FL[h]),
-	       "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
-	       father, tb->FL[h]);
-
-	/*
-	 * Get position of the pointer to the left neighbor
-	 * into the left father.
-	 */
-	left_neighbor_position = (father == tb->FL[h]) ?
-	    tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
-	/* Get left neighbor block number. */
-	left_neighbor_blocknr =
-	    B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
-	/* Look for the left neighbor in the cache. */
-	if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
-
-		RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
-		       "vs-8170: left neighbor (%b %z) is not in the tree",
-		       left, left);
-		put_bh(left);
-		return 1;
-	}
-
-	return 0;
-}
-
-#define LEFT_PARENTS  'l'
-#define RIGHT_PARENTS 'r'
-
-static void decrement_key(struct cpu_key *key)
-{
-	/* call item specific function for this key */
-	item_ops[cpu_key_k_type(key)]->decrement_key(key);
-}
-
-/*
- * Calculate far left/right parent of the left/right neighbor of the
- * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
- * of the parent F[h].
- * Calculate left/right common parent of the current node and L[h]/R[h].
- * Calculate left/right delimiting key position.
- * Returns:	PATH_INCORRECT    - path in the tree is not correct
- *		SCHEDULE_OCCURRED - schedule occurred while the function worked
- *	        CARRY_ON          - schedule didn't occur while the function
- *				    worked
- */
-static int get_far_parent(struct tree_balance *tb,
-			  int h,
-			  struct buffer_head **pfather,
-			  struct buffer_head **pcom_father, char c_lr_par)
-{
-	struct buffer_head *parent;
-	INITIALIZE_PATH(s_path_to_neighbor_father);
-	struct treepath *path = tb->tb_path;
-	struct cpu_key s_lr_father_key;
-	int counter,
-	    position = INT_MAX,
-	    first_last_position = 0,
-	    path_offset = PATH_H_PATH_OFFSET(path, h);
-
-	/*
-	 * Starting from F[h] go upwards in the tree, and look for the common
-	 * ancestor of F[h], and its neighbor l/r, that should be obtained.
-	 */
-
-	counter = path_offset;
-
-	RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
-	       "PAP-8180: invalid path length");
-
-	for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
-		/*
-		 * Check whether parent of the current buffer in the path
-		 * is really parent in the tree.
-		 */
-		if (!B_IS_IN_TREE
-		    (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
-			return REPEAT_SEARCH;
-
-		/* Check whether position in the parent is correct. */
-		if ((position =
-		     PATH_OFFSET_POSITION(path,
-					  counter - 1)) >
-		    B_NR_ITEMS(parent))
-			return REPEAT_SEARCH;
-
-		/*
-		 * Check whether parent at the path really points
-		 * to the child.
-		 */
-		if (B_N_CHILD_NUM(parent, position) !=
-		    PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
-			return REPEAT_SEARCH;
-
-		/*
-		 * Return delimiting key if position in the parent is not
-		 * equal to first/last one.
-		 */
-		if (c_lr_par == RIGHT_PARENTS)
-			first_last_position = B_NR_ITEMS(parent);
-		if (position != first_last_position) {
-			*pcom_father = parent;
-			get_bh(*pcom_father);
-			/*(*pcom_father = parent)->b_count++; */
-			break;
-		}
-	}
-
-	/* if we are in the root of the tree, then there is no common father */
-	if (counter == FIRST_PATH_ELEMENT_OFFSET) {
-		/*
-		 * Check whether first buffer in the path is the
-		 * root of the tree.
-		 */
-		if (PATH_OFFSET_PBUFFER
-		    (tb->tb_path,
-		     FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
-		    SB_ROOT_BLOCK(tb->tb_sb)) {
-			*pfather = *pcom_father = NULL;
-			return CARRY_ON;
-		}
-		return REPEAT_SEARCH;
-	}
-
-	RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
-	       "PAP-8185: (%b %z) level too small",
-	       *pcom_father, *pcom_father);
-
-	/* Check whether the common parent is locked. */
-
-	if (buffer_locked(*pcom_father)) {
-
-		/* Release the write lock while the buffer is busy */
-		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		__wait_on_buffer(*pcom_father);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (FILESYSTEM_CHANGED_TB(tb)) {
-			brelse(*pcom_father);
-			return REPEAT_SEARCH;
-		}
-	}
-
-	/*
-	 * So, we got common parent of the current node and its
-	 * left/right neighbor.  Now we are getting the parent of the
-	 * left/right neighbor.
-	 */
-
-	/* Form key to get parent of the left/right neighbor. */
-	le_key2cpu_key(&s_lr_father_key,
-		       internal_key(*pcom_father,
-				      (c_lr_par ==
-				       LEFT_PARENTS) ? (tb->lkey[h - 1] =
-							position -
-							1) : (tb->rkey[h -
-									   1] =
-							      position)));
-
-	if (c_lr_par == LEFT_PARENTS)
-		decrement_key(&s_lr_father_key);
-
-	if (search_by_key
-	    (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
-	     h + 1) == IO_ERROR)
-		/* path is released */
-		return IO_ERROR;
-
-	if (FILESYSTEM_CHANGED_TB(tb)) {
-		pathrelse(&s_path_to_neighbor_father);
-		brelse(*pcom_father);
-		return REPEAT_SEARCH;
-	}
-
-	*pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
-
-	RFALSE(B_LEVEL(*pfather) != h + 1,
-	       "PAP-8190: (%b %z) level too small", *pfather, *pfather);
-	RFALSE(s_path_to_neighbor_father.path_length <
-	       FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
-
-	s_path_to_neighbor_father.path_length--;
-	pathrelse(&s_path_to_neighbor_father);
-	return CARRY_ON;
-}
-
-/*
- * Get parents of neighbors of node in the path(S[path_offset]) and
- * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
- * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
- * CFR[path_offset].
- * Calculate numbers of left and right delimiting keys position:
- * lkey[path_offset], rkey[path_offset].
- * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked
- *	        CARRY_ON - schedule didn't occur while the function worked
- */
-static int get_parents(struct tree_balance *tb, int h)
-{
-	struct treepath *path = tb->tb_path;
-	int position,
-	    ret,
-	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
-	struct buffer_head *curf, *curcf;
-
-	/* Current node is the root of the tree or will be root of the tree */
-	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
-		/*
-		 * The root can not have parents.
-		 * Release nodes which previously were obtained as
-		 * parents of the current node neighbors.
-		 */
-		brelse(tb->FL[h]);
-		brelse(tb->CFL[h]);
-		brelse(tb->FR[h]);
-		brelse(tb->CFR[h]);
-		tb->FL[h]  = NULL;
-		tb->CFL[h] = NULL;
-		tb->FR[h]  = NULL;
-		tb->CFR[h] = NULL;
-		return CARRY_ON;
-	}
-
-	/* Get parent FL[path_offset] of L[path_offset]. */
-	position = PATH_OFFSET_POSITION(path, path_offset - 1);
-	if (position) {
-		/* Current node is not the first child of its parent. */
-		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
-		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
-		get_bh(curf);
-		get_bh(curf);
-		tb->lkey[h] = position - 1;
-	} else {
-		/*
-		 * Calculate current parent of L[path_offset], which is the
-		 * left neighbor of the current node.  Calculate current
-		 * common parent of L[path_offset] and the current node.
-		 * Note that CFL[path_offset] not equal FL[path_offset] and
-		 * CFL[path_offset] not equal F[path_offset].
-		 * Calculate lkey[path_offset].
-		 */
-		if ((ret = get_far_parent(tb, h + 1, &curf,
-						  &curcf,
-						  LEFT_PARENTS)) != CARRY_ON)
-			return ret;
-	}
-
-	brelse(tb->FL[h]);
-	tb->FL[h] = curf;	/* New initialization of FL[h]. */
-	brelse(tb->CFL[h]);
-	tb->CFL[h] = curcf;	/* New initialization of CFL[h]. */
-
-	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
-	       (curcf && !B_IS_IN_TREE(curcf)),
-	       "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
-
-	/* Get parent FR[h] of R[h]. */
-
-	/* Current node is the last child of F[h]. FR[h] != F[h]. */
-	if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
-		/*
-		 * Calculate current parent of R[h], which is the right
-		 * neighbor of F[h].  Calculate current common parent of
-		 * R[h] and current node. Note that CFR[h] not equal
-		 * FR[path_offset] and CFR[h] not equal F[h].
-		 */
-		if ((ret =
-		     get_far_parent(tb, h + 1, &curf, &curcf,
-				    RIGHT_PARENTS)) != CARRY_ON)
-			return ret;
-	} else {
-		/* Current node is not the last child of its parent F[h]. */
-		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
-		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
-		get_bh(curf);
-		get_bh(curf);
-		tb->rkey[h] = position;
-	}
-
-	brelse(tb->FR[h]);
-	/* New initialization of FR[path_offset]. */
-	tb->FR[h] = curf;
-
-	brelse(tb->CFR[h]);
-	/* New initialization of CFR[path_offset]. */
-	tb->CFR[h] = curcf;
-
-	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
-	       (curcf && !B_IS_IN_TREE(curcf)),
-	       "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
-
-	return CARRY_ON;
-}
-
-/*
- * it is possible to remove node as result of shiftings to
- * neighbors even when we insert or paste item.
- */
-static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
-				      struct tree_balance *tb, int h)
-{
-	struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
-	int levbytes = tb->insert_size[h];
-	struct item_head *ih;
-	struct reiserfs_key *r_key = NULL;
-
-	ih = item_head(Sh, 0);
-	if (tb->CFR[h])
-		r_key = internal_key(tb->CFR[h], tb->rkey[h]);
-
-	if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
-	    /* shifting may merge items which might save space */
-	    -
-	    ((!h
-	      && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
-	    -
-	    ((!h && r_key
-	      && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
-	    + ((h) ? KEY_SIZE : 0)) {
-		/* node can not be removed */
-		if (sfree >= levbytes) {
-			/* new item fits into node S[h] without any shifting */
-			if (!h)
-				tb->s0num =
-				    B_NR_ITEMS(Sh) +
-				    ((mode == M_INSERT) ? 1 : 0);
-			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-			return NO_BALANCING_NEEDED;
-		}
-	}
-	PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
-	return !NO_BALANCING_NEEDED;
-}
-
-/*
- * Check whether current node S[h] is balanced when increasing its size by
- * Inserting or Pasting.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- */
-/* ip means Inserting or Pasting */
-static int ip_check_balance(struct tree_balance *tb, int h)
-{
-	struct virtual_node *vn = tb->tb_vn;
-	/*
-	 * Number of bytes that must be inserted into (value is negative
-	 * if bytes are deleted) buffer which contains node being balanced.
-	 * The mnemonic is that the attempted change in node space used
-	 * level is levbytes bytes.
-	 */
-	int levbytes;
-	int ret;
-
-	int lfree, sfree, rfree /* free space in L, S and R */ ;
-
-	/*
-	 * nver is short for number of vertixes, and lnver is the number if
-	 * we shift to the left, rnver is the number if we shift to the
-	 * right, and lrnver is the number if we shift in both directions.
-	 * The goal is to minimize first the number of vertixes, and second,
-	 * the number of vertixes whose contents are changed by shifting,
-	 * and third the number of uncached vertixes whose contents are
-	 * changed by shifting and must be read from disk.
-	 */
-	int nver, lnver, rnver, lrnver;
-
-	/*
-	 * used at leaf level only, S0 = S[0] is the node being balanced,
-	 * sInum [ I = 0,1,2 ] is the number of items that will
-	 * remain in node SI after balancing.  S1 and S2 are new
-	 * nodes that might be created.
-	 */
-
-	/*
-	 * we perform 8 calls to get_num_ver().  For each call we
-	 * calculate five parameters.  where 4th parameter is s1bytes
-	 * and 5th - s2bytes
-	 *
-	 * s0num, s1num, s2num for 8 cases
-	 * 0,1 - do not shift and do not shift but bottle
-	 * 2   - shift only whole item to left
-	 * 3   - shift to left and bottle as much as possible
-	 * 4,5 - shift to right (whole items and as much as possible
-	 * 6,7 - shift to both directions (whole items and as much as possible)
-	 */
-	short snum012[40] = { 0, };
-
-	/* Sh is the node whose balance is currently being checked */
-	struct buffer_head *Sh;
-
-	Sh = PATH_H_PBUFFER(tb->tb_path, h);
-	levbytes = tb->insert_size[h];
-
-	/* Calculate balance parameters for creating new root. */
-	if (!Sh) {
-		if (!h)
-			reiserfs_panic(tb->tb_sb, "vs-8210",
-				       "S[0] can not be 0");
-		switch (ret = get_empty_nodes(tb, h)) {
-		/* no balancing for higher levels needed */
-		case CARRY_ON:
-			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-			return NO_BALANCING_NEEDED;
-
-		case NO_DISK_SPACE:
-		case REPEAT_SEARCH:
-			return ret;
-		default:
-			reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
-				       "return value of get_empty_nodes");
-		}
-	}
-
-	/* get parents of S[h] neighbors. */
-	ret = get_parents(tb, h);
-	if (ret != CARRY_ON)
-		return ret;
-
-	sfree = B_FREE_SPACE(Sh);
-
-	/* get free space of neighbors */
-	rfree = get_rfree(tb, h);
-	lfree = get_lfree(tb, h);
-
-	/* and new item fits into node S[h] without any shifting */
-	if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
-	    NO_BALANCING_NEEDED)
-		return NO_BALANCING_NEEDED;
-
-	create_virtual_node(tb, h);
-
-	/*
-	 * determine maximal number of items we can shift to the left
-	 * neighbor (in tb structure) and the maximal number of bytes
-	 * that can flow to the left neighbor from the left most liquid
-	 * item that cannot be shifted from S[0] entirely (returned value)
-	 */
-	check_left(tb, h, lfree);
-
-	/*
-	 * determine maximal number of items we can shift to the right
-	 * neighbor (in tb structure) and the maximal number of bytes
-	 * that can flow to the right neighbor from the right most liquid
-	 * item that cannot be shifted from S[0] entirely (returned value)
-	 */
-	check_right(tb, h, rfree);
-
-	/*
-	 * all contents of internal node S[h] can be moved into its
-	 * neighbors, S[h] will be removed after balancing
-	 */
-	if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
-		int to_r;
-
-		/*
-		 * Since we are working on internal nodes, and our internal
-		 * nodes have fixed size entries, then we can balance by the
-		 * number of items rather than the space they consume.  In this
-		 * routine we set the left node equal to the right node,
-		 * allowing a difference of less than or equal to 1 child
-		 * pointer.
-		 */
-		to_r =
-		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
-		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
-						tb->rnum[h]);
-		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
-			       -1, -1);
-		return CARRY_ON;
-	}
-
-	/*
-	 * this checks balance condition, that any two neighboring nodes
-	 * can not fit in one node
-	 */
-	RFALSE(h &&
-	       (tb->lnum[h] >= vn->vn_nr_item + 1 ||
-		tb->rnum[h] >= vn->vn_nr_item + 1),
-	       "vs-8220: tree is not balanced on internal level");
-	RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
-		      (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
-	       "vs-8225: tree is not balanced on leaf level");
-
-	/*
-	 * all contents of S[0] can be moved into its neighbors
-	 * S[0] will be removed after balancing.
-	 */
-	if (!h && is_leaf_removable(tb))
-		return CARRY_ON;
-
-	/*
-	 * why do we perform this check here rather than earlier??
-	 * Answer: we can win 1 node in some cases above. Moreover we
-	 * checked it above, when we checked, that S[0] is not removable
-	 * in principle
-	 */
-
-	 /* new item fits into node S[h] without any shifting */
-	if (sfree >= levbytes) {
-		if (!h)
-			tb->s0num = vn->vn_nr_item;
-		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-		return NO_BALANCING_NEEDED;
-	}
-
-	{
-		int lpar, rpar, nset, lset, rset, lrset;
-		/* regular overflowing of the node */
-
-		/*
-		 * get_num_ver works in 2 modes (FLOW & NO_FLOW)
-		 * lpar, rpar - number of items we can shift to left/right
-		 *              neighbor (including splitting item)
-		 * nset, lset, rset, lrset - shows, whether flowing items
-		 *                           give better packing
-		 */
-#define FLOW 1
-#define NO_FLOW 0		/* do not any splitting */
-
-		/* we choose one of the following */
-#define NOTHING_SHIFT_NO_FLOW	0
-#define NOTHING_SHIFT_FLOW	5
-#define LEFT_SHIFT_NO_FLOW	10
-#define LEFT_SHIFT_FLOW		15
-#define RIGHT_SHIFT_NO_FLOW	20
-#define RIGHT_SHIFT_FLOW	25
-#define LR_SHIFT_NO_FLOW	30
-#define LR_SHIFT_FLOW		35
-
-		lpar = tb->lnum[h];
-		rpar = tb->rnum[h];
-
-		/*
-		 * calculate number of blocks S[h] must be split into when
-		 * nothing is shifted to the neighbors, as well as number of
-		 * items in each part of the split node (s012 numbers),
-		 * and number of bytes (s1bytes) of the shared drop which
-		 * flow to S1 if any
-		 */
-		nset = NOTHING_SHIFT_NO_FLOW;
-		nver = get_num_ver(vn->vn_mode, tb, h,
-				   0, -1, h ? vn->vn_nr_item : 0, -1,
-				   snum012, NO_FLOW);
-
-		if (!h) {
-			int nver1;
-
-			/*
-			 * note, that in this case we try to bottle
-			 * between S[0] and S1 (S1 - the first new node)
-			 */
-			nver1 = get_num_ver(vn->vn_mode, tb, h,
-					    0, -1, 0, -1,
-					    snum012 + NOTHING_SHIFT_FLOW, FLOW);
-			if (nver > nver1)
-				nset = NOTHING_SHIFT_FLOW, nver = nver1;
-		}
-
-		/*
-		 * calculate number of blocks S[h] must be split into when
-		 * l_shift_num first items and l_shift_bytes of the right
-		 * most liquid item to be shifted are shifted to the left
-		 * neighbor, as well as number of items in each part of the
-		 * splitted node (s012 numbers), and number of bytes
-		 * (s1bytes) of the shared drop which flow to S1 if any
-		 */
-		lset = LEFT_SHIFT_NO_FLOW;
-		lnver = get_num_ver(vn->vn_mode, tb, h,
-				    lpar - ((h || tb->lbytes == -1) ? 0 : 1),
-				    -1, h ? vn->vn_nr_item : 0, -1,
-				    snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
-		if (!h) {
-			int lnver1;
-
-			lnver1 = get_num_ver(vn->vn_mode, tb, h,
-					     lpar -
-					     ((tb->lbytes != -1) ? 1 : 0),
-					     tb->lbytes, 0, -1,
-					     snum012 + LEFT_SHIFT_FLOW, FLOW);
-			if (lnver > lnver1)
-				lset = LEFT_SHIFT_FLOW, lnver = lnver1;
-		}
-
-		/*
-		 * calculate number of blocks S[h] must be split into when
-		 * r_shift_num first items and r_shift_bytes of the left most
-		 * liquid item to be shifted are shifted to the right neighbor,
-		 * as well as number of items in each part of the splitted
-		 * node (s012 numbers), and number of bytes (s1bytes) of the
-		 * shared drop which flow to S1 if any
-		 */
-		rset = RIGHT_SHIFT_NO_FLOW;
-		rnver = get_num_ver(vn->vn_mode, tb, h,
-				    0, -1,
-				    h ? (vn->vn_nr_item - rpar) : (rpar -
-								   ((tb->
-								     rbytes !=
-								     -1) ? 1 :
-								    0)), -1,
-				    snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
-		if (!h) {
-			int rnver1;
-
-			rnver1 = get_num_ver(vn->vn_mode, tb, h,
-					     0, -1,
-					     (rpar -
-					      ((tb->rbytes != -1) ? 1 : 0)),
-					     tb->rbytes,
-					     snum012 + RIGHT_SHIFT_FLOW, FLOW);
-
-			if (rnver > rnver1)
-				rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
-		}
-
-		/*
-		 * calculate number of blocks S[h] must be split into when
-		 * items are shifted in both directions, as well as number
-		 * of items in each part of the splitted node (s012 numbers),
-		 * and number of bytes (s1bytes) of the shared drop which
-		 * flow to S1 if any
-		 */
-		lrset = LR_SHIFT_NO_FLOW;
-		lrnver = get_num_ver(vn->vn_mode, tb, h,
-				     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
-				     -1,
-				     h ? (vn->vn_nr_item - rpar) : (rpar -
-								    ((tb->
-								      rbytes !=
-								      -1) ? 1 :
-								     0)), -1,
-				     snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
-		if (!h) {
-			int lrnver1;
-
-			lrnver1 = get_num_ver(vn->vn_mode, tb, h,
-					      lpar -
-					      ((tb->lbytes != -1) ? 1 : 0),
-					      tb->lbytes,
-					      (rpar -
-					       ((tb->rbytes != -1) ? 1 : 0)),
-					      tb->rbytes,
-					      snum012 + LR_SHIFT_FLOW, FLOW);
-			if (lrnver > lrnver1)
-				lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
-		}
-
-		/*
-		 * Our general shifting strategy is:
-		 * 1) to minimized number of new nodes;
-		 * 2) to minimized number of neighbors involved in shifting;
-		 * 3) to minimized number of disk reads;
-		 */
-
-		/* we can win TWO or ONE nodes by shifting in both directions */
-		if (lrnver < lnver && lrnver < rnver) {
-			RFALSE(h &&
-			       (tb->lnum[h] != 1 ||
-				tb->rnum[h] != 1 ||
-				lrnver != 1 || rnver != 2 || lnver != 2
-				|| h != 1), "vs-8230: bad h");
-			if (lrset == LR_SHIFT_FLOW)
-				set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
-					       lrnver, snum012 + lrset,
-					       tb->lbytes, tb->rbytes);
-			else
-				set_parameters(tb, h,
-					       tb->lnum[h] -
-					       ((tb->lbytes == -1) ? 0 : 1),
-					       tb->rnum[h] -
-					       ((tb->rbytes == -1) ? 0 : 1),
-					       lrnver, snum012 + lrset, -1, -1);
-
-			return CARRY_ON;
-		}
-
-		/*
-		 * if shifting doesn't lead to better packing
-		 * then don't shift
-		 */
-		if (nver == lrnver) {
-			set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
-				       -1);
-			return CARRY_ON;
-		}
-
-		/*
-		 * now we know that for better packing shifting in only one
-		 * direction either to the left or to the right is required
-		 */
-
-		/*
-		 * if shifting to the left is better than
-		 * shifting to the right
-		 */
-		if (lnver < rnver) {
-			SET_PAR_SHIFT_LEFT;
-			return CARRY_ON;
-		}
-
-		/*
-		 * if shifting to the right is better than
-		 * shifting to the left
-		 */
-		if (lnver > rnver) {
-			SET_PAR_SHIFT_RIGHT;
-			return CARRY_ON;
-		}
-
-		/*
-		 * now shifting in either direction gives the same number
-		 * of nodes and we can make use of the cached neighbors
-		 */
-		if (is_left_neighbor_in_cache(tb, h)) {
-			SET_PAR_SHIFT_LEFT;
-			return CARRY_ON;
-		}
-
-		/*
-		 * shift to the right independently on whether the
-		 * right neighbor in cache or not
-		 */
-		SET_PAR_SHIFT_RIGHT;
-		return CARRY_ON;
-	}
-}
-
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Cutting for INTERNAL node of S+tree.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- *
- * Note: Items of internal nodes have fixed size, so the balance condition for
- * the internal part of S+tree is as for the B-trees.
- */
-static int dc_check_balance_internal(struct tree_balance *tb, int h)
-{
-	struct virtual_node *vn = tb->tb_vn;
-
-	/*
-	 * Sh is the node whose balance is currently being checked,
-	 * and Fh is its father.
-	 */
-	struct buffer_head *Sh, *Fh;
-	int ret;
-	int lfree, rfree /* free space in L and R */ ;
-
-	Sh = PATH_H_PBUFFER(tb->tb_path, h);
-	Fh = PATH_H_PPARENT(tb->tb_path, h);
-
-	/*
-	 * using tb->insert_size[h], which is negative in this case,
-	 * create_virtual_node calculates:
-	 * new_nr_item = number of items node would have if operation is
-	 * performed without balancing (new_nr_item);
-	 */
-	create_virtual_node(tb, h);
-
-	if (!Fh) {		/* S[h] is the root. */
-		/* no balancing for higher levels needed */
-		if (vn->vn_nr_item > 0) {
-			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-			return NO_BALANCING_NEEDED;
-		}
-		/*
-		 * new_nr_item == 0.
-		 * Current root will be deleted resulting in
-		 * decrementing the tree height.
-		 */
-		set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
-		return CARRY_ON;
-	}
-
-	if ((ret = get_parents(tb, h)) != CARRY_ON)
-		return ret;
-
-	/* get free space of neighbors */
-	rfree = get_rfree(tb, h);
-	lfree = get_lfree(tb, h);
-
-	/* determine maximal number of items we can fit into neighbors */
-	check_left(tb, h, lfree);
-	check_right(tb, h, rfree);
-
-	/*
-	 * Balance condition for the internal node is valid.
-	 * In this case we balance only if it leads to better packing.
-	 */
-	if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
-		/*
-		 * Here we join S[h] with one of its neighbors,
-		 * which is impossible with greater values of new_nr_item.
-		 */
-		if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
-			/* All contents of S[h] can be moved to L[h]. */
-			if (tb->lnum[h] >= vn->vn_nr_item + 1) {
-				int n;
-				int order_L;
-
-				order_L =
-				    ((n =
-				      PATH_H_B_ITEM_ORDER(tb->tb_path,
-							  h)) ==
-				     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
-				n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
-				    (DC_SIZE + KEY_SIZE);
-				set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
-					       -1);
-				return CARRY_ON;
-			}
-
-			/* All contents of S[h] can be moved to R[h]. */
-			if (tb->rnum[h] >= vn->vn_nr_item + 1) {
-				int n;
-				int order_R;
-
-				order_R =
-				    ((n =
-				      PATH_H_B_ITEM_ORDER(tb->tb_path,
-							  h)) ==
-				     B_NR_ITEMS(Fh)) ? 0 : n + 1;
-				n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
-				    (DC_SIZE + KEY_SIZE);
-				set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
-					       -1);
-				return CARRY_ON;
-			}
-		}
-
-		/*
-		 * All contents of S[h] can be moved to the neighbors
-		 * (L[h] & R[h]).
-		 */
-		if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
-			int to_r;
-
-			to_r =
-			    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
-			     tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
-			    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
-			set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
-				       0, NULL, -1, -1);
-			return CARRY_ON;
-		}
-
-		/* Balancing does not lead to better packing. */
-		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-		return NO_BALANCING_NEEDED;
-	}
-
-	/*
-	 * Current node contain insufficient number of items.
-	 * Balancing is required.
-	 */
-	/* Check whether we can merge S[h] with left neighbor. */
-	if (tb->lnum[h] >= vn->vn_nr_item + 1)
-		if (is_left_neighbor_in_cache(tb, h)
-		    || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
-			int n;
-			int order_L;
-
-			order_L =
-			    ((n =
-			      PATH_H_B_ITEM_ORDER(tb->tb_path,
-						  h)) ==
-			     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
-			n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
-								      KEY_SIZE);
-			set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
-			return CARRY_ON;
-		}
-
-	/* Check whether we can merge S[h] with right neighbor. */
-	if (tb->rnum[h] >= vn->vn_nr_item + 1) {
-		int n;
-		int order_R;
-
-		order_R =
-		    ((n =
-		      PATH_H_B_ITEM_ORDER(tb->tb_path,
-					  h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
-		n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
-							      KEY_SIZE);
-		set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
-		return CARRY_ON;
-	}
-
-	/* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
-	if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
-		int to_r;
-
-		to_r =
-		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
-		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
-						tb->rnum[h]);
-		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
-			       -1, -1);
-		return CARRY_ON;
-	}
-
-	/* For internal nodes try to borrow item from a neighbor */
-	RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
-
-	/* Borrow one or two items from caching neighbor */
-	if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
-		int from_l;
-
-		from_l =
-		    (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
-		     1) / 2 - (vn->vn_nr_item + 1);
-		set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
-		return CARRY_ON;
-	}
-
-	set_parameters(tb, h, 0,
-		       -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
-			  1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
-	return CARRY_ON;
-}
-
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Truncating for LEAF node of S+tree.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	i - insert, p - paste;
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- */
-static int dc_check_balance_leaf(struct tree_balance *tb, int h)
-{
-	struct virtual_node *vn = tb->tb_vn;
-
-	/*
-	 * Number of bytes that must be deleted from
-	 * (value is negative if bytes are deleted) buffer which
-	 * contains node being balanced.  The mnemonic is that the
-	 * attempted change in node space used level is levbytes bytes.
-	 */
-	int levbytes;
-
-	/* the maximal item size */
-	int maxsize, ret;
-
-	/*
-	 * S0 is the node whose balance is currently being checked,
-	 * and F0 is its father.
-	 */
-	struct buffer_head *S0, *F0;
-	int lfree, rfree /* free space in L and R */ ;
-
-	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
-	F0 = PATH_H_PPARENT(tb->tb_path, 0);
-
-	levbytes = tb->insert_size[h];
-
-	maxsize = MAX_CHILD_SIZE(S0);	/* maximal possible size of an item */
-
-	if (!F0) {		/* S[0] is the root now. */
-
-		RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
-		       "vs-8240: attempt to create empty buffer tree");
-
-		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-		return NO_BALANCING_NEEDED;
-	}
-
-	if ((ret = get_parents(tb, h)) != CARRY_ON)
-		return ret;
-
-	/* get free space of neighbors */
-	rfree = get_rfree(tb, h);
-	lfree = get_lfree(tb, h);
-
-	create_virtual_node(tb, h);
-
-	/* if 3 leaves can be merge to one, set parameters and return */
-	if (are_leaves_removable(tb, lfree, rfree))
-		return CARRY_ON;
-
-	/*
-	 * determine maximal number of items we can shift to the left/right
-	 * neighbor and the maximal number of bytes that can flow to the
-	 * left/right neighbor from the left/right most liquid item that
-	 * cannot be shifted from S[0] entirely
-	 */
-	check_left(tb, h, lfree);
-	check_right(tb, h, rfree);
-
-	/* check whether we can merge S with left neighbor. */
-	if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
-		if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||	/* S can not be merged with R */
-		    !tb->FR[h]) {
-
-			RFALSE(!tb->FL[h],
-			       "vs-8245: dc_check_balance_leaf: FL[h] must exist");
-
-			/* set parameter to merge S[0] with its left neighbor */
-			set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
-			return CARRY_ON;
-		}
-
-	/* check whether we can merge S[0] with right neighbor. */
-	if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
-		set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
-		return CARRY_ON;
-	}
-
-	/*
-	 * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
-	 * Set parameters and return
-	 */
-	if (is_leaf_removable(tb))
-		return CARRY_ON;
-
-	/* Balancing is not required. */
-	tb->s0num = vn->vn_nr_item;
-	set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
-	return NO_BALANCING_NEEDED;
-}
-
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Cutting.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *	tb	tree_balance structure;
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	d - delete, c - cut.
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- */
-static int dc_check_balance(struct tree_balance *tb, int h)
-{
-	RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
-	       "vs-8250: S is not initialized");
-
-	if (h)
-		return dc_check_balance_internal(tb, h);
-	else
-		return dc_check_balance_leaf(tb, h);
-}
-
-/*
- * Check whether current node S[h] is balanced.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *
- *	tb	tree_balance structure:
- *
- *              tb is a large structure that must be read about in the header
- *		file at the same time as this procedure if the reader is
- *		to successfully understand this procedure
- *
- *	h	current level of the node;
- *	inum	item number in S[h];
- *	mode	i - insert, p - paste, d - delete, c - cut.
- * Returns:	1 - schedule occurred;
- *	        0 - balancing for higher levels needed;
- *	       -1 - no balancing for higher levels needed;
- *	       -2 - no disk space.
- */
-static int check_balance(int mode,
-			 struct tree_balance *tb,
-			 int h,
-			 int inum,
-			 int pos_in_item,
-			 struct item_head *ins_ih, const void *data)
-{
-	struct virtual_node *vn;
-
-	vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
-	vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
-	vn->vn_mode = mode;
-	vn->vn_affected_item_num = inum;
-	vn->vn_pos_in_item = pos_in_item;
-	vn->vn_ins_ih = ins_ih;
-	vn->vn_data = data;
-
-	RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
-	       "vs-8255: ins_ih can not be 0 in insert mode");
-
-	/* Calculate balance parameters when size of node is increasing. */
-	if (tb->insert_size[h] > 0)
-		return ip_check_balance(tb, h);
-
-	/* Calculate balance parameters when  size of node is decreasing. */
-	return dc_check_balance(tb, h);
-}
-
-/* Check whether parent at the path is the really parent of the current node.*/
-static int get_direct_parent(struct tree_balance *tb, int h)
-{
-	struct buffer_head *bh;
-	struct treepath *path = tb->tb_path;
-	int position,
-	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
-
-	/* We are in the root or in the new root. */
-	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
-
-		RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
-		       "PAP-8260: invalid offset in the path");
-
-		if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
-		    b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
-			/* Root is not changed. */
-			PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
-			PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
-			return CARRY_ON;
-		}
-		/* Root is changed and we must recalculate the path. */
-		return REPEAT_SEARCH;
-	}
-
-	/* Parent in the path is not in the tree. */
-	if (!B_IS_IN_TREE
-	    (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
-		return REPEAT_SEARCH;
-
-	if ((position =
-	     PATH_OFFSET_POSITION(path,
-				  path_offset - 1)) > B_NR_ITEMS(bh))
-		return REPEAT_SEARCH;
-
-	/* Parent in the path is not parent of the current node in the tree. */
-	if (B_N_CHILD_NUM(bh, position) !=
-	    PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
-		return REPEAT_SEARCH;
-
-	if (buffer_locked(bh)) {
-		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		__wait_on_buffer(bh);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (FILESYSTEM_CHANGED_TB(tb))
-			return REPEAT_SEARCH;
-	}
-
-	/*
-	 * Parent in the path is unlocked and really parent
-	 * of the current node.
-	 */
-	return CARRY_ON;
-}
-
-/*
- * Using lnum[h] and rnum[h] we should determine what neighbors
- * of S[h] we
- * need in order to balance S[h], and get them if necessary.
- * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
- *	        CARRY_ON - schedule didn't occur while the function worked;
- */
-static int get_neighbors(struct tree_balance *tb, int h)
-{
-	int child_position,
-	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
-	unsigned long son_number;
-	struct super_block *sb = tb->tb_sb;
-	struct buffer_head *bh;
-	int depth;
-
-	PROC_INFO_INC(sb, get_neighbors[h]);
-
-	if (tb->lnum[h]) {
-		/* We need left neighbor to balance S[h]. */
-		PROC_INFO_INC(sb, need_l_neighbor[h]);
-		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
-
-		RFALSE(bh == tb->FL[h] &&
-		       !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
-		       "PAP-8270: invalid position in the parent");
-
-		child_position =
-		    (bh ==
-		     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
-								       FL[h]);
-		son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
-		depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		bh = sb_bread(sb, son_number);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (!bh)
-			return IO_ERROR;
-		if (FILESYSTEM_CHANGED_TB(tb)) {
-			brelse(bh);
-			PROC_INFO_INC(sb, get_neighbors_restart[h]);
-			return REPEAT_SEARCH;
-		}
-
-		RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
-		       child_position > B_NR_ITEMS(tb->FL[h]) ||
-		       B_N_CHILD_NUM(tb->FL[h], child_position) !=
-		       bh->b_blocknr, "PAP-8275: invalid parent");
-		RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
-		RFALSE(!h &&
-		       B_FREE_SPACE(bh) !=
-		       MAX_CHILD_SIZE(bh) -
-		       dc_size(B_N_CHILD(tb->FL[0], child_position)),
-		       "PAP-8290: invalid child size of left neighbor");
-
-		brelse(tb->L[h]);
-		tb->L[h] = bh;
-	}
-
-	/* We need right neighbor to balance S[path_offset]. */
-	if (tb->rnum[h]) {
-		PROC_INFO_INC(sb, need_r_neighbor[h]);
-		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
-
-		RFALSE(bh == tb->FR[h] &&
-		       PATH_OFFSET_POSITION(tb->tb_path,
-					    path_offset) >=
-		       B_NR_ITEMS(bh),
-		       "PAP-8295: invalid position in the parent");
-
-		child_position =
-		    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
-		son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
-		depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		bh = sb_bread(sb, son_number);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (!bh)
-			return IO_ERROR;
-		if (FILESYSTEM_CHANGED_TB(tb)) {
-			brelse(bh);
-			PROC_INFO_INC(sb, get_neighbors_restart[h]);
-			return REPEAT_SEARCH;
-		}
-		brelse(tb->R[h]);
-		tb->R[h] = bh;
-
-		RFALSE(!h
-		       && B_FREE_SPACE(bh) !=
-		       MAX_CHILD_SIZE(bh) -
-		       dc_size(B_N_CHILD(tb->FR[0], child_position)),
-		       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
-		       B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
-		       dc_size(B_N_CHILD(tb->FR[0], child_position)));
-
-	}
-	return CARRY_ON;
-}
-
-static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
-{
-	int max_num_of_items;
-	int max_num_of_entries;
-	unsigned long blocksize = sb->s_blocksize;
-
-#define MIN_NAME_LEN 1
-
-	max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
-	max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
-	    (DEH_SIZE + MIN_NAME_LEN);
-
-	return sizeof(struct virtual_node) +
-	    max(max_num_of_items * sizeof(struct virtual_item),
-		sizeof(struct virtual_item) +
-		struct_size_t(struct direntry_uarea, entry_sizes,
-			      max_num_of_entries));
-}
-
-/*
- * maybe we should fail balancing we are going to perform when kmalloc
- * fails several times. But now it will loop until kmalloc gets
- * required memory
- */
-static int get_mem_for_virtual_node(struct tree_balance *tb)
-{
-	int check_fs = 0;
-	int size;
-	char *buf;
-
-	size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
-
-	/* we have to allocate more memory for virtual node */
-	if (size > tb->vn_buf_size) {
-		if (tb->vn_buf) {
-			/* free memory allocated before */
-			kfree(tb->vn_buf);
-			/* this is not needed if kfree is atomic */
-			check_fs = 1;
-		}
-
-		/* virtual node requires now more memory */
-		tb->vn_buf_size = size;
-
-		/* get memory for virtual item */
-		buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!buf) {
-			/*
-			 * getting memory with GFP_KERNEL priority may involve
-			 * balancing now (due to indirect_to_direct conversion
-			 * on dcache shrinking). So, release path and collected
-			 * resources here
-			 */
-			free_buffers_in_tb(tb);
-			buf = kmalloc(size, GFP_NOFS);
-			if (!buf) {
-				tb->vn_buf_size = 0;
-			}
-			tb->vn_buf = buf;
-			schedule();
-			return REPEAT_SEARCH;
-		}
-
-		tb->vn_buf = buf;
-	}
-
-	if (check_fs && FILESYSTEM_CHANGED_TB(tb))
-		return REPEAT_SEARCH;
-
-	return CARRY_ON;
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-static void tb_buffer_sanity_check(struct super_block *sb,
-				   struct buffer_head *bh,
-				   const char *descr, int level)
-{
-	if (bh) {
-		if (atomic_read(&(bh->b_count)) <= 0)
-
-			reiserfs_panic(sb, "jmacd-1", "negative or zero "
-				       "reference counter for buffer %s[%d] "
-				       "(%b)", descr, level, bh);
-
-		if (!buffer_uptodate(bh))
-			reiserfs_panic(sb, "jmacd-2", "buffer is not up "
-				       "to date %s[%d] (%b)",
-				       descr, level, bh);
-
-		if (!B_IS_IN_TREE(bh))
-			reiserfs_panic(sb, "jmacd-3", "buffer is not "
-				       "in tree %s[%d] (%b)",
-				       descr, level, bh);
-
-		if (bh->b_bdev != sb->s_bdev)
-			reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
-				       "device %s[%d] (%b)",
-				       descr, level, bh);
-
-		if (bh->b_size != sb->s_blocksize)
-			reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
-				       "blocksize %s[%d] (%b)",
-				       descr, level, bh);
-
-		if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
-			reiserfs_panic(sb, "jmacd-6", "buffer block "
-				       "number too high %s[%d] (%b)",
-				       descr, level, bh);
-	}
-}
-#else
-static void tb_buffer_sanity_check(struct super_block *sb,
-				   struct buffer_head *bh,
-				   const char *descr, int level)
-{;
-}
-#endif
-
-static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
-{
-	return reiserfs_prepare_for_journal(s, bh, 0);
-}
-
-static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
-{
-	struct buffer_head *locked;
-#ifdef CONFIG_REISERFS_CHECK
-	int repeat_counter = 0;
-#endif
-	int i;
-
-	do {
-
-		locked = NULL;
-
-		for (i = tb->tb_path->path_length;
-		     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
-			if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
-				/*
-				 * if I understand correctly, we can only
-				 * be sure the last buffer in the path is
-				 * in the tree --clm
-				 */
-#ifdef CONFIG_REISERFS_CHECK
-				if (PATH_PLAST_BUFFER(tb->tb_path) ==
-				    PATH_OFFSET_PBUFFER(tb->tb_path, i))
-					tb_buffer_sanity_check(tb->tb_sb,
-							       PATH_OFFSET_PBUFFER
-							       (tb->tb_path,
-								i), "S",
-							       tb->tb_path->
-							       path_length - i);
-#endif
-				if (!clear_all_dirty_bits(tb->tb_sb,
-							  PATH_OFFSET_PBUFFER
-							  (tb->tb_path,
-							   i))) {
-					locked =
-					    PATH_OFFSET_PBUFFER(tb->tb_path,
-								i);
-				}
-			}
-		}
-
-		for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
-		     i++) {
-
-			if (tb->lnum[i]) {
-
-				if (tb->L[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->L[i],
-							       "L", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->L[i]))
-						locked = tb->L[i];
-				}
-
-				if (!locked && tb->FL[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->FL[i],
-							       "FL", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->FL[i]))
-						locked = tb->FL[i];
-				}
-
-				if (!locked && tb->CFL[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->CFL[i],
-							       "CFL", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->CFL[i]))
-						locked = tb->CFL[i];
-				}
-
-			}
-
-			if (!locked && (tb->rnum[i])) {
-
-				if (tb->R[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->R[i],
-							       "R", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->R[i]))
-						locked = tb->R[i];
-				}
-
-				if (!locked && tb->FR[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->FR[i],
-							       "FR", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->FR[i]))
-						locked = tb->FR[i];
-				}
-
-				if (!locked && tb->CFR[i]) {
-					tb_buffer_sanity_check(tb->tb_sb,
-							       tb->CFR[i],
-							       "CFR", i);
-					if (!clear_all_dirty_bits
-					    (tb->tb_sb, tb->CFR[i]))
-						locked = tb->CFR[i];
-				}
-			}
-		}
-
-		/*
-		 * as far as I can tell, this is not required.  The FEB list
-		 * seems to be full of newly allocated nodes, which will
-		 * never be locked, dirty, or anything else.
-		 * To be safe, I'm putting in the checks and waits in.
-		 * For the moment, they are needed to keep the code in
-		 * journal.c from complaining about the buffer.
-		 * That code is inside CONFIG_REISERFS_CHECK as well.  --clm
-		 */
-		for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
-			if (tb->FEB[i]) {
-				if (!clear_all_dirty_bits
-				    (tb->tb_sb, tb->FEB[i]))
-					locked = tb->FEB[i];
-			}
-		}
-
-		if (locked) {
-			int depth;
-#ifdef CONFIG_REISERFS_CHECK
-			repeat_counter++;
-			if ((repeat_counter % 10000) == 0) {
-				reiserfs_warning(tb->tb_sb, "reiserfs-8200",
-						 "too many iterations waiting "
-						 "for buffer to unlock "
-						 "(%b)", locked);
-
-				/* Don't loop forever.  Try to recover from possible error. */
-
-				return (FILESYSTEM_CHANGED_TB(tb)) ?
-				    REPEAT_SEARCH : CARRY_ON;
-			}
-#endif
-			depth = reiserfs_write_unlock_nested(tb->tb_sb);
-			__wait_on_buffer(locked);
-			reiserfs_write_lock_nested(tb->tb_sb, depth);
-			if (FILESYSTEM_CHANGED_TB(tb))
-				return REPEAT_SEARCH;
-		}
-
-	} while (locked);
-
-	return CARRY_ON;
-}
-
-/*
- * Prepare for balancing, that is
- *	get all necessary parents, and neighbors;
- *	analyze what and where should be moved;
- *	get sufficient number of new nodes;
- * Balancing will start only after all resources will be collected at a time.
- *
- * When ported to SMP kernels, only at the last moment after all needed nodes
- * are collected in cache, will the resources be locked using the usual
- * textbook ordered lock acquisition algorithms.  Note that ensuring that
- * this code neither write locks what it does not need to write lock nor locks
- * out of order will be a pain in the butt that could have been avoided.
- * Grumble grumble. -Hans
- *
- * fix is meant in the sense of render unchanging
- *
- * Latency might be improved by first gathering a list of what buffers
- * are needed and then getting as many of them in parallel as possible? -Hans
- *
- * Parameters:
- *	op_mode	i - insert, d - delete, c - cut (truncate), p - paste (append)
- *	tb	tree_balance structure;
- *	inum	item number in S[h];
- *      pos_in_item - comment this if you can
- *      ins_ih	item head of item being inserted
- *	data	inserted item or data to be pasted
- * Returns:	1 - schedule occurred while the function worked;
- *	        0 - schedule didn't occur while the function worked;
- *             -1 - if no_disk_space
- */
-
-int fix_nodes(int op_mode, struct tree_balance *tb,
-	      struct item_head *ins_ih, const void *data)
-{
-	int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
-	int pos_in_item;
-
-	/*
-	 * we set wait_tb_buffers_run when we have to restore any dirty
-	 * bits cleared during wait_tb_buffers_run
-	 */
-	int wait_tb_buffers_run = 0;
-	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-
-	++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
-
-	pos_in_item = tb->tb_path->pos_in_item;
-
-	tb->fs_gen = get_generation(tb->tb_sb);
-
-	/*
-	 * we prepare and log the super here so it will already be in the
-	 * transaction when do_balance needs to change it.
-	 * This way do_balance won't have to schedule when trying to prepare
-	 * the super for logging
-	 */
-	reiserfs_prepare_for_journal(tb->tb_sb,
-				     SB_BUFFER_WITH_SB(tb->tb_sb), 1);
-	journal_mark_dirty(tb->transaction_handle,
-			   SB_BUFFER_WITH_SB(tb->tb_sb));
-	if (FILESYSTEM_CHANGED_TB(tb))
-		return REPEAT_SEARCH;
-
-	/* if it possible in indirect_to_direct conversion */
-	if (buffer_locked(tbS0)) {
-		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
-		__wait_on_buffer(tbS0);
-		reiserfs_write_lock_nested(tb->tb_sb, depth);
-		if (FILESYSTEM_CHANGED_TB(tb))
-			return REPEAT_SEARCH;
-	}
-#ifdef CONFIG_REISERFS_CHECK
-	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
-		print_cur_tb("fix_nodes");
-		reiserfs_panic(tb->tb_sb, "PAP-8305",
-			       "there is pending do_balance");
-	}
-
-	if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
-		reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
-			       "not uptodate at the beginning of fix_nodes "
-			       "or not in tree (mode %c)",
-			       tbS0, tbS0, op_mode);
-
-	/* Check parameters. */
-	switch (op_mode) {
-	case M_INSERT:
-		if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
-			reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
-				       "item number %d (in S0 - %d) in case "
-				       "of insert", item_num,
-				       B_NR_ITEMS(tbS0));
-		break;
-	case M_PASTE:
-	case M_DELETE:
-	case M_CUT:
-		if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
-			print_block(tbS0, 0, -1, -1);
-			reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
-				       "item number(%d); mode = %c "
-				       "insert_size = %d",
-				       item_num, op_mode,
-				       tb->insert_size[0]);
-		}
-		break;
-	default:
-		reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
-			       "of operation");
-	}
-#endif
-
-	if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
-		/* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
-		return REPEAT_SEARCH;
-
-	/* Starting from the leaf level; for all levels h of the tree. */
-	for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
-		ret = get_direct_parent(tb, h);
-		if (ret != CARRY_ON)
-			goto repeat;
-
-		ret = check_balance(op_mode, tb, h, item_num,
-				    pos_in_item, ins_ih, data);
-		if (ret != CARRY_ON) {
-			if (ret == NO_BALANCING_NEEDED) {
-				/* No balancing for higher levels needed. */
-				ret = get_neighbors(tb, h);
-				if (ret != CARRY_ON)
-					goto repeat;
-				if (h != MAX_HEIGHT - 1)
-					tb->insert_size[h + 1] = 0;
-				/*
-				 * ok, analysis and resource gathering
-				 * are complete
-				 */
-				break;
-			}
-			goto repeat;
-		}
-
-		ret = get_neighbors(tb, h);
-		if (ret != CARRY_ON)
-			goto repeat;
-
-		/*
-		 * No disk space, or schedule occurred and analysis may be
-		 * invalid and needs to be redone.
-		 */
-		ret = get_empty_nodes(tb, h);
-		if (ret != CARRY_ON)
-			goto repeat;
-
-		/*
-		 * We have a positive insert size but no nodes exist on this
-		 * level, this means that we are creating a new root.
-		 */
-		if (!PATH_H_PBUFFER(tb->tb_path, h)) {
-
-			RFALSE(tb->blknum[h] != 1,
-			       "PAP-8350: creating new empty root");
-
-			if (h < MAX_HEIGHT - 1)
-				tb->insert_size[h + 1] = 0;
-		} else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
-			/*
-			 * The tree needs to be grown, so this node S[h]
-			 * which is the root node is split into two nodes,
-			 * and a new node (S[h+1]) will be created to
-			 * become the root node.
-			 */
-			if (tb->blknum[h] > 1) {
-
-				RFALSE(h == MAX_HEIGHT - 1,
-				       "PAP-8355: attempt to create too high of a tree");
-
-				tb->insert_size[h + 1] =
-				    (DC_SIZE +
-				     KEY_SIZE) * (tb->blknum[h] - 1) +
-				    DC_SIZE;
-			} else if (h < MAX_HEIGHT - 1)
-				tb->insert_size[h + 1] = 0;
-		} else
-			tb->insert_size[h + 1] =
-			    (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
-	}
-
-	ret = wait_tb_buffers_until_unlocked(tb);
-	if (ret == CARRY_ON) {
-		if (FILESYSTEM_CHANGED_TB(tb)) {
-			wait_tb_buffers_run = 1;
-			ret = REPEAT_SEARCH;
-			goto repeat;
-		} else {
-			return CARRY_ON;
-		}
-	} else {
-		wait_tb_buffers_run = 1;
-		goto repeat;
-	}
-
-repeat:
-	/*
-	 * fix_nodes was unable to perform its calculation due to
-	 * filesystem got changed under us, lack of free disk space or i/o
-	 * failure. If the first is the case - the search will be
-	 * repeated. For now - free all resources acquired so far except
-	 * for the new allocated nodes
-	 */
-	{
-		int i;
-
-		/* Release path buffers. */
-		if (wait_tb_buffers_run) {
-			pathrelse_and_restore(tb->tb_sb, tb->tb_path);
-		} else {
-			pathrelse(tb->tb_path);
-		}
-		/* brelse all resources collected for balancing */
-		for (i = 0; i < MAX_HEIGHT; i++) {
-			if (wait_tb_buffers_run) {
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->L[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->R[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->FL[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->FR[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->
-								 CFL[i]);
-				reiserfs_restore_prepared_buffer(tb->tb_sb,
-								 tb->
-								 CFR[i]);
-			}
-
-			brelse(tb->L[i]);
-			brelse(tb->R[i]);
-			brelse(tb->FL[i]);
-			brelse(tb->FR[i]);
-			brelse(tb->CFL[i]);
-			brelse(tb->CFR[i]);
-
-			tb->L[i] = NULL;
-			tb->R[i] = NULL;
-			tb->FL[i] = NULL;
-			tb->FR[i] = NULL;
-			tb->CFL[i] = NULL;
-			tb->CFR[i] = NULL;
-		}
-
-		if (wait_tb_buffers_run) {
-			for (i = 0; i < MAX_FEB_SIZE; i++) {
-				if (tb->FEB[i])
-					reiserfs_restore_prepared_buffer
-					    (tb->tb_sb, tb->FEB[i]);
-			}
-		}
-		return ret;
-	}
-
-}
-
-void unfix_nodes(struct tree_balance *tb)
-{
-	int i;
-
-	/* Release path buffers. */
-	pathrelse_and_restore(tb->tb_sb, tb->tb_path);
-
-	/* brelse all resources collected for balancing */
-	for (i = 0; i < MAX_HEIGHT; i++) {
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
-		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
-
-		brelse(tb->L[i]);
-		brelse(tb->R[i]);
-		brelse(tb->FL[i]);
-		brelse(tb->FR[i]);
-		brelse(tb->CFL[i]);
-		brelse(tb->CFR[i]);
-	}
-
-	/* deal with list of allocated (used and unused) nodes */
-	for (i = 0; i < MAX_FEB_SIZE; i++) {
-		if (tb->FEB[i]) {
-			b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
-			/*
-			 * de-allocated block which was not used by
-			 * balancing and bforget about buffer for it
-			 */
-			brelse(tb->FEB[i]);
-			reiserfs_free_block(tb->transaction_handle, NULL,
-					    blocknr, 0);
-		}
-		if (tb->used[i]) {
-			/* release used as new nodes including a new root */
-			brelse(tb->used[i]);
-		}
-	}
-
-	kfree(tb->vn_buf);
-
-}
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
deleted file mode 100644
index 7a26c4fe6c46..000000000000
--- a/fs/reiserfs/hashes.c
+++ /dev/null
@@ -1,177 +0,0 @@
-
-/*
- * Keyed 32-bit hash function using TEA in a Davis-Meyer function
- *   H0 = Key
- *   Hi = E Mi(Hi-1) + Hi-1
- *
- * (see Applied Cryptography, 2nd edition, p448).
- *
- * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
- *
- * Jeremy has agreed to the contents of reiserfs/README. -Hans
- * Yura's function is added (04/07/2000)
- */
-
-#include <linux/kernel.h>
-#include "reiserfs.h"
-#include <asm/types.h>
-
-#define DELTA 0x9E3779B9
-#define FULLROUNDS 10		/* 32 is overkill, 16 is strong crypto */
-#define PARTROUNDS 6		/* 6 gets complete mixing */
-
-/* a, b, c, d - data; h0, h1 - accumulated hash */
-#define TEACORE(rounds)							\
-	do {								\
-		u32 sum = 0;						\
-		int n = rounds;						\
-		u32 b0, b1;						\
-									\
-		b0 = h0;						\
-		b1 = h1;						\
-									\
-		do							\
-		{							\
-			sum += DELTA;					\
-			b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);	\
-			b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);	\
-		} while(--n);						\
-									\
-		h0 += b0;						\
-		h1 += b1;						\
-	} while(0)
-
-u32 keyed_hash(const signed char *msg, int len)
-{
-	u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 };
-
-	u32 h0 = k[0], h1 = k[1];
-	u32 a, b, c, d;
-	u32 pad;
-	int i;
-
-	/*      assert(len >= 0 && len < 256); */
-
-	pad = (u32) len | ((u32) len << 8);
-	pad |= pad << 16;
-
-	while (len >= 16) {
-		a = (u32) msg[0] |
-		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-		b = (u32) msg[4] |
-		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
-		c = (u32) msg[8] |
-		    (u32) msg[9] << 8 |
-		    (u32) msg[10] << 16 | (u32) msg[11] << 24;
-		d = (u32) msg[12] |
-		    (u32) msg[13] << 8 |
-		    (u32) msg[14] << 16 | (u32) msg[15] << 24;
-
-		TEACORE(PARTROUNDS);
-
-		len -= 16;
-		msg += 16;
-	}
-
-	if (len >= 12) {
-		a = (u32) msg[0] |
-		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-		b = (u32) msg[4] |
-		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
-		c = (u32) msg[8] |
-		    (u32) msg[9] << 8 |
-		    (u32) msg[10] << 16 | (u32) msg[11] << 24;
-
-		d = pad;
-		for (i = 12; i < len; i++) {
-			d <<= 8;
-			d |= msg[i];
-		}
-	} else if (len >= 8) {
-		a = (u32) msg[0] |
-		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-		b = (u32) msg[4] |
-		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
-
-		c = d = pad;
-		for (i = 8; i < len; i++) {
-			c <<= 8;
-			c |= msg[i];
-		}
-	} else if (len >= 4) {
-		a = (u32) msg[0] |
-		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-
-		b = c = d = pad;
-		for (i = 4; i < len; i++) {
-			b <<= 8;
-			b |= msg[i];
-		}
-	} else {
-		a = b = c = d = pad;
-		for (i = 0; i < len; i++) {
-			a <<= 8;
-			a |= msg[i];
-		}
-	}
-
-	TEACORE(FULLROUNDS);
-
-/*	return 0;*/
-	return h0 ^ h1;
-}
-
-/*
- * What follows in this file is copyright 2000 by Hans Reiser, and the
- * licensing of what follows is governed by reiserfs/README
- */
-u32 yura_hash(const signed char *msg, int len)
-{
-	int j, pow;
-	u32 a, c;
-	int i;
-
-	for (pow = 1, i = 1; i < len; i++)
-		pow = pow * 10;
-
-	if (len == 1)
-		a = msg[0] - 48;
-	else
-		a = (msg[0] - 48) * pow;
-
-	for (i = 1; i < len; i++) {
-		c = msg[i] - 48;
-		for (pow = 1, j = i; j < len - 1; j++)
-			pow = pow * 10;
-		a = a + c * pow;
-	}
-
-	for (; i < 40; i++) {
-		c = '0' - 48;
-		for (pow = 1, j = i; j < len - 1; j++)
-			pow = pow * 10;
-		a = a + c * pow;
-	}
-
-	for (; i < 256; i++) {
-		c = i;
-		for (pow = 1, j = i; j < len - 1; j++)
-			pow = pow * 10;
-		a = a + c * pow;
-	}
-
-	a = a << 7;
-	return a;
-}
-
-u32 r5_hash(const signed char *msg, int len)
-{
-	u32 a = 0;
-	while (*msg) {
-		a += *msg << 4;
-		a += *msg >> 4;
-		a *= 11;
-		msg++;
-	}
-	return a;
-}
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
deleted file mode 100644
index 5db6f45b3fed..000000000000
--- a/fs/reiserfs/ibalance.c
+++ /dev/null
@@ -1,1161 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/uaccess.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/* this is one and only function that is used outside (do_balance.c) */
-int balance_internal(struct tree_balance *,
-		     int, int, struct item_head *, struct buffer_head **);
-
-/*
- * modes of internal_shift_left, internal_shift_right and
- * internal_insert_childs
- */
-#define INTERNAL_SHIFT_FROM_S_TO_L 0
-#define INTERNAL_SHIFT_FROM_R_TO_S 1
-#define INTERNAL_SHIFT_FROM_L_TO_S 2
-#define INTERNAL_SHIFT_FROM_S_TO_R 3
-#define INTERNAL_INSERT_TO_S 4
-#define INTERNAL_INSERT_TO_L 5
-#define INTERNAL_INSERT_TO_R 6
-
-static void internal_define_dest_src_infos(int shift_mode,
-					   struct tree_balance *tb,
-					   int h,
-					   struct buffer_info *dest_bi,
-					   struct buffer_info *src_bi,
-					   int *d_key, struct buffer_head **cf)
-{
-	memset(dest_bi, 0, sizeof(struct buffer_info));
-	memset(src_bi, 0, sizeof(struct buffer_info));
-	/* define dest, src, dest parent, dest position */
-	switch (shift_mode) {
-
-	/* used in internal_shift_left */
-	case INTERNAL_SHIFT_FROM_S_TO_L:
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->L[h];
-		dest_bi->bi_parent = tb->FL[h];
-		dest_bi->bi_position = get_left_neighbor_position(tb, h);
-		*d_key = tb->lkey[h];
-		*cf = tb->CFL[h];
-		break;
-	case INTERNAL_SHIFT_FROM_L_TO_S:
-		src_bi->tb = tb;
-		src_bi->bi_bh = tb->L[h];
-		src_bi->bi_parent = tb->FL[h];
-		src_bi->bi_position = get_left_neighbor_position(tb, h);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		/* dest position is analog of dest->b_item_order */
-		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		*d_key = tb->lkey[h];
-		*cf = tb->CFL[h];
-		break;
-
-	/* used in internal_shift_left */
-	case INTERNAL_SHIFT_FROM_R_TO_S:
-		src_bi->tb = tb;
-		src_bi->bi_bh = tb->R[h];
-		src_bi->bi_parent = tb->FR[h];
-		src_bi->bi_position = get_right_neighbor_position(tb, h);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		*d_key = tb->rkey[h];
-		*cf = tb->CFR[h];
-		break;
-
-	case INTERNAL_SHIFT_FROM_S_TO_R:
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->R[h];
-		dest_bi->bi_parent = tb->FR[h];
-		dest_bi->bi_position = get_right_neighbor_position(tb, h);
-		*d_key = tb->rkey[h];
-		*cf = tb->CFR[h];
-		break;
-
-	case INTERNAL_INSERT_TO_L:
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->L[h];
-		dest_bi->bi_parent = tb->FL[h];
-		dest_bi->bi_position = get_left_neighbor_position(tb, h);
-		break;
-
-	case INTERNAL_INSERT_TO_S:
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
-		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		break;
-
-	case INTERNAL_INSERT_TO_R:
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->R[h];
-		dest_bi->bi_parent = tb->FR[h];
-		dest_bi->bi_position = get_right_neighbor_position(tb, h);
-		break;
-
-	default:
-		reiserfs_panic(tb->tb_sb, "ibalance-1",
-			       "shift type is unknown (%d)",
-			       shift_mode);
-	}
-}
-
-/*
- * Insert count node pointers into buffer cur before position to + 1.
- * Insert count items into buffer cur before position to.
- * Items and node pointers are specified by inserted and bh respectively.
- */
-static void internal_insert_childs(struct buffer_info *cur_bi,
-				   int to, int count,
-				   struct item_head *inserted,
-				   struct buffer_head **bh)
-{
-	struct buffer_head *cur = cur_bi->bi_bh;
-	struct block_head *blkh;
-	int nr;
-	struct reiserfs_key *ih;
-	struct disk_child new_dc[2];
-	struct disk_child *dc;
-	int i;
-
-	if (count <= 0)
-		return;
-
-	blkh = B_BLK_HEAD(cur);
-	nr = blkh_nr_item(blkh);
-
-	RFALSE(count > 2, "too many children (%d) are to be inserted", count);
-	RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE),
-	       "no enough free space (%d), needed %d bytes",
-	       B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE));
-
-	/* prepare space for count disk_child */
-	dc = B_N_CHILD(cur, to + 1);
-
-	memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE);
-
-	/* copy to_be_insert disk children */
-	for (i = 0; i < count; i++) {
-		put_dc_size(&new_dc[i],
-			    MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
-		put_dc_block_number(&new_dc[i], bh[i]->b_blocknr);
-	}
-	memcpy(dc, new_dc, DC_SIZE * count);
-
-	/* prepare space for count items  */
-	ih = internal_key(cur, ((to == -1) ? 0 : to));
-
-	memmove(ih + count, ih,
-		(nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
-
-	/* copy item headers (keys) */
-	memcpy(ih, inserted, KEY_SIZE);
-	if (count > 1)
-		memcpy(ih + 1, inserted + 1, KEY_SIZE);
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count);
-	set_blkh_free_space(blkh,
-			    blkh_free_space(blkh) - count * (DC_SIZE +
-							     KEY_SIZE));
-
-	do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
-
-	/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	check_internal(cur);
-	/*&&&&&&&&&&&&&&&&&&&&&&&& */
-
-	if (cur_bi->bi_parent) {
-		struct disk_child *t_dc =
-		    B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
-		do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
-					       0);
-
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-		check_internal(cur_bi->bi_parent);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	}
-
-}
-
-/*
- * Delete del_num items and node pointers from buffer cur starting from
- * the first_i'th item and first_p'th pointers respectively.
- */
-static void internal_delete_pointers_items(struct buffer_info *cur_bi,
-					   int first_p,
-					   int first_i, int del_num)
-{
-	struct buffer_head *cur = cur_bi->bi_bh;
-	int nr;
-	struct block_head *blkh;
-	struct reiserfs_key *key;
-	struct disk_child *dc;
-
-	RFALSE(cur == NULL, "buffer is 0");
-	RFALSE(del_num < 0,
-	       "negative number of items (%d) can not be deleted", del_num);
-	RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1
-	       || first_i < 0,
-	       "first pointer order (%d) < 0 or "
-	       "no so many pointers (%d), only (%d) or "
-	       "first key order %d < 0", first_p, first_p + del_num,
-	       B_NR_ITEMS(cur) + 1, first_i);
-	if (del_num == 0)
-		return;
-
-	blkh = B_BLK_HEAD(cur);
-	nr = blkh_nr_item(blkh);
-
-	if (first_p == 0 && del_num == nr + 1) {
-		RFALSE(first_i != 0,
-		       "1st deleted key must have order 0, not %d", first_i);
-		make_empty_node(cur_bi);
-		return;
-	}
-
-	RFALSE(first_i + del_num > B_NR_ITEMS(cur),
-	       "first_i = %d del_num = %d "
-	       "no so many keys (%d) in the node (%b)(%z)",
-	       first_i, del_num, first_i + del_num, cur, cur);
-
-	/* deleting */
-	dc = B_N_CHILD(cur, first_p);
-
-	memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
-	key = internal_key(cur, first_i);
-	memmove(key, key + del_num,
-		(nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
-						       del_num) * DC_SIZE);
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
-	set_blkh_free_space(blkh,
-			    blkh_free_space(blkh) +
-			    (del_num * (KEY_SIZE + DC_SIZE)));
-
-	do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
-	/*&&&&&&&&&&&&&&&&&&&&&&& */
-	check_internal(cur);
-	/*&&&&&&&&&&&&&&&&&&&&&&& */
-
-	if (cur_bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE)));
-
-		do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
-					       0);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-		check_internal(cur_bi->bi_parent);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	}
-}
-
-/* delete n node pointers and items starting from given position */
-static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
-{
-	int i_from;
-
-	i_from = (from == 0) ? from : from - 1;
-
-	/*
-	 * delete n pointers starting from `from' position in CUR;
-	 * delete n keys starting from 'i_from' position in CUR;
-	 */
-	internal_delete_pointers_items(cur_bi, from, i_from, n);
-}
-
-/*
- * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer
- * dest
- * last_first == FIRST_TO_LAST means that we copy first items
- *                             from src to tail of dest
- * last_first == LAST_TO_FIRST means that we copy last items
- *                             from src to head of dest
- */
-static void internal_copy_pointers_items(struct buffer_info *dest_bi,
-					 struct buffer_head *src,
-					 int last_first, int cpy_num)
-{
-	/*
-	 * ATTENTION! Number of node pointers in DEST is equal to number
-	 * of items in DEST  as delimiting key have already inserted to
-	 * buffer dest.
-	 */
-	struct buffer_head *dest = dest_bi->bi_bh;
-	int nr_dest, nr_src;
-	int dest_order, src_order;
-	struct block_head *blkh;
-	struct reiserfs_key *key;
-	struct disk_child *dc;
-
-	nr_src = B_NR_ITEMS(src);
-
-	RFALSE(dest == NULL || src == NULL,
-	       "src (%p) or dest (%p) buffer is 0", src, dest);
-	RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
-	       "invalid last_first parameter (%d)", last_first);
-	RFALSE(nr_src < cpy_num - 1,
-	       "no so many items (%d) in src (%d)", cpy_num, nr_src);
-	RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
-	RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
-	       "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
-	       cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
-
-	if (cpy_num == 0)
-		return;
-
-	/* coping */
-	blkh = B_BLK_HEAD(dest);
-	nr_dest = blkh_nr_item(blkh);
-
-	/*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */
-	/*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */
-	(last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order =
-					 nr_src - cpy_num + 1) : (dest_order =
-								  nr_dest,
-								  src_order =
-								  0);
-
-	/* prepare space for cpy_num pointers */
-	dc = B_N_CHILD(dest, dest_order);
-
-	memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
-
-	/* insert pointers */
-	memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
-
-	/* prepare space for cpy_num - 1 item headers */
-	key = internal_key(dest, dest_order);
-	memmove(key + cpy_num - 1, key,
-		KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
-							       cpy_num));
-
-	/* insert headers */
-	memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1));
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
-	set_blkh_free_space(blkh,
-			    blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) +
-						     DC_SIZE * cpy_num));
-
-	do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
-
-	/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	check_internal(dest);
-	/*&&&&&&&&&&&&&&&&&&&&&&&& */
-
-	if (dest_bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) +
-					     DC_SIZE * cpy_num));
-
-		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
-					       0);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-		check_internal(dest_bi->bi_parent);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-	}
-
-}
-
-/*
- * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to
- * buffer dest.
- * Delete cpy_num - del_par items and node pointers from buffer src.
- * last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
- * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
- */
-static void internal_move_pointers_items(struct buffer_info *dest_bi,
-					 struct buffer_info *src_bi,
-					 int last_first, int cpy_num,
-					 int del_par)
-{
-	int first_pointer;
-	int first_item;
-
-	internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first,
-				     cpy_num);
-
-	if (last_first == FIRST_TO_LAST) {	/* shift_left occurs */
-		first_pointer = 0;
-		first_item = 0;
-		/*
-		 * delete cpy_num - del_par pointers and keys starting for
-		 * pointers with first_pointer, for key - with first_item
-		 */
-		internal_delete_pointers_items(src_bi, first_pointer,
-					       first_item, cpy_num - del_par);
-	} else {		/* shift_right occurs */
-		int i, j;
-
-		i = (cpy_num - del_par ==
-		     (j =
-		      B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num +
-		    del_par;
-
-		internal_delete_pointers_items(src_bi,
-					       j + 1 - cpy_num + del_par, i,
-					       cpy_num - del_par);
-	}
-}
-
-/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
-static void internal_insert_key(struct buffer_info *dest_bi,
-				/* insert key before key with n_dest number */
-				int dest_position_before,
-				struct buffer_head *src, int src_position)
-{
-	struct buffer_head *dest = dest_bi->bi_bh;
-	int nr;
-	struct block_head *blkh;
-	struct reiserfs_key *key;
-
-	RFALSE(dest == NULL || src == NULL,
-	       "source(%p) or dest(%p) buffer is 0", src, dest);
-	RFALSE(dest_position_before < 0 || src_position < 0,
-	       "source(%d) or dest(%d) key number less than 0",
-	       src_position, dest_position_before);
-	RFALSE(dest_position_before > B_NR_ITEMS(dest) ||
-	       src_position >= B_NR_ITEMS(src),
-	       "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
-	       dest_position_before, B_NR_ITEMS(dest),
-	       src_position, B_NR_ITEMS(src));
-	RFALSE(B_FREE_SPACE(dest) < KEY_SIZE,
-	       "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest));
-
-	blkh = B_BLK_HEAD(dest);
-	nr = blkh_nr_item(blkh);
-
-	/* prepare space for inserting key */
-	key = internal_key(dest, dest_position_before);
-	memmove(key + 1, key,
-		(nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
-
-	/* insert key */
-	memcpy(key, internal_key(src, src_position), KEY_SIZE);
-
-	/* Change dirt, free space, item number fields. */
-
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
-	set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE);
-
-	do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
-
-	if (dest_bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
-		put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE);
-
-		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
-					       0);
-	}
-}
-
-/*
- * Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
- * Copy pointer_amount node pointers and pointer_amount - 1 items from
- * buffer src to buffer dest.
- * Replace  d_key'th key in buffer cfl.
- * Delete pointer_amount items and node pointers from buffer src.
- */
-/* this can be invoked both to shift from S to L and from R to S */
-static void internal_shift_left(
-				/*
-				 * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S
-				 */
-				int mode,
-				struct tree_balance *tb,
-				int h, int pointer_amount)
-{
-	struct buffer_info dest_bi, src_bi;
-	struct buffer_head *cf;
-	int d_key_position;
-
-	internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
-				       &d_key_position, &cf);
-
-	/*printk("pointer_amount = %d\n",pointer_amount); */
-
-	if (pointer_amount) {
-		/*
-		 * insert delimiting key from common father of dest and
-		 * src to node dest into position B_NR_ITEM(dest)
-		 */
-		internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
-				    d_key_position);
-
-		if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
-			if (src_bi.bi_position /*src->b_item_order */  == 0)
-				replace_key(tb, cf, d_key_position,
-					    src_bi.
-					    bi_parent /*src->b_parent */ , 0);
-		} else
-			replace_key(tb, cf, d_key_position, src_bi.bi_bh,
-				    pointer_amount - 1);
-	}
-	/* last parameter is del_parameter */
-	internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
-				     pointer_amount, 0);
-
-}
-
-/*
- * Insert delimiting key to L[h].
- * Copy n node pointers and n - 1 items from buffer S[h] to L[h].
- * Delete n - 1 items and node pointers from buffer S[h].
- */
-/* it always shifts from S[h] to L[h] */
-static void internal_shift1_left(struct tree_balance *tb,
-				 int h, int pointer_amount)
-{
-	struct buffer_info dest_bi, src_bi;
-	struct buffer_head *cf;
-	int d_key_position;
-
-	internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
-				       &dest_bi, &src_bi, &d_key_position, &cf);
-
-	/* insert lkey[h]-th key  from CFL[h] to left neighbor L[h] */
-	if (pointer_amount > 0)
-		internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
-				    d_key_position);
-
-	/* last parameter is del_parameter */
-	internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
-				     pointer_amount, 1);
-}
-
-/*
- * Insert d_key'th (delimiting) key from buffer cfr to head of dest.
- * Copy n node pointers and n - 1 items from buffer src to buffer dest.
- * Replace  d_key'th key in buffer cfr.
- * Delete n items and node pointers from buffer src.
- */
-static void internal_shift_right(
-				 /*
-				  * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S
-				  */
-				 int mode,
-				 struct tree_balance *tb,
-				 int h, int pointer_amount)
-{
-	struct buffer_info dest_bi, src_bi;
-	struct buffer_head *cf;
-	int d_key_position;
-	int nr;
-
-	internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
-				       &d_key_position, &cf);
-
-	nr = B_NR_ITEMS(src_bi.bi_bh);
-
-	if (pointer_amount > 0) {
-		/*
-		 * insert delimiting key from common father of dest
-		 * and src to dest node into position 0
-		 */
-		internal_insert_key(&dest_bi, 0, cf, d_key_position);
-		if (nr == pointer_amount - 1) {
-			RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
-			       dest_bi.bi_bh != tb->R[h],
-			       "src (%p) must be == tb->S[h](%p) when it disappears",
-			       src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h));
-			/* when S[h] disappers replace left delemiting key as well */
-			if (tb->CFL[h])
-				replace_key(tb, cf, d_key_position, tb->CFL[h],
-					    tb->lkey[h]);
-		} else
-			replace_key(tb, cf, d_key_position, src_bi.bi_bh,
-				    nr - pointer_amount);
-	}
-
-	/* last parameter is del_parameter */
-	internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
-				     pointer_amount, 0);
-}
-
-/*
- * Insert delimiting key to R[h].
- * Copy n node pointers and n - 1 items from buffer S[h] to R[h].
- * Delete n - 1 items and node pointers from buffer S[h].
- */
-/* it always shift from S[h] to R[h] */
-static void internal_shift1_right(struct tree_balance *tb,
-				  int h, int pointer_amount)
-{
-	struct buffer_info dest_bi, src_bi;
-	struct buffer_head *cf;
-	int d_key_position;
-
-	internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
-				       &dest_bi, &src_bi, &d_key_position, &cf);
-
-	/* insert rkey from CFR[h] to right neighbor R[h] */
-	if (pointer_amount > 0)
-		internal_insert_key(&dest_bi, 0, cf, d_key_position);
-
-	/* last parameter is del_parameter */
-	internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
-				     pointer_amount, 1);
-}
-
-/*
- * Delete insert_num node pointers together with their left items
- * and balance current node.
- */
-static void balance_internal_when_delete(struct tree_balance *tb,
-					 int h, int child_pos)
-{
-	int insert_num;
-	int n;
-	struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
-	struct buffer_info bi;
-
-	insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
-
-	/* delete child-node-pointer(s) together with their left item(s) */
-	bi.tb = tb;
-	bi.bi_bh = tbSh;
-	bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-	bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
-	internal_delete_childs(&bi, child_pos, -insert_num);
-
-	RFALSE(tb->blknum[h] > 1,
-	       "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
-
-	n = B_NR_ITEMS(tbSh);
-
-	if (tb->lnum[h] == 0 && tb->rnum[h] == 0) {
-		if (tb->blknum[h] == 0) {
-			/* node S[h] (root of the tree) is empty now */
-			struct buffer_head *new_root;
-
-			RFALSE(n
-			       || B_FREE_SPACE(tbSh) !=
-			       MAX_CHILD_SIZE(tbSh) - DC_SIZE,
-			       "buffer must have only 0 keys (%d)", n);
-			RFALSE(bi.bi_parent, "root has parent (%p)",
-			       bi.bi_parent);
-
-			/* choose a new root */
-			if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1]))
-				new_root = tb->R[h - 1];
-			else
-				new_root = tb->L[h - 1];
-			/*
-			 * switch super block's tree root block
-			 * number to the new value */
-			PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
-			/*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */
-			PUT_SB_TREE_HEIGHT(tb->tb_sb,
-					   SB_TREE_HEIGHT(tb->tb_sb) - 1);
-
-			do_balance_mark_sb_dirty(tb,
-						 REISERFS_SB(tb->tb_sb)->s_sbh,
-						 1);
-			/*&&&&&&&&&&&&&&&&&&&&&& */
-			/* use check_internal if new root is an internal node */
-			if (h > 1)
-				check_internal(new_root);
-			/*&&&&&&&&&&&&&&&&&&&&&& */
-
-			/* do what is needed for buffer thrown from tree */
-			reiserfs_invalidate_buffer(tb, tbSh);
-			return;
-		}
-		return;
-	}
-
-	/* join S[h] with L[h] */
-	if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {
-
-		RFALSE(tb->rnum[h] != 0,
-		       "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
-		       h, tb->rnum[h]);
-
-		internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
-		reiserfs_invalidate_buffer(tb, tbSh);
-
-		return;
-	}
-
-	/* join S[h] with R[h] */
-	if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {
-		RFALSE(tb->lnum[h] != 0,
-		       "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
-		       h, tb->lnum[h]);
-
-		internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
-
-		reiserfs_invalidate_buffer(tb, tbSh);
-		return;
-	}
-
-	/* borrow from left neighbor L[h] */
-	if (tb->lnum[h] < 0) {
-		RFALSE(tb->rnum[h] != 0,
-		       "wrong tb->rnum[%d]==%d when borrow from L[h]", h,
-		       tb->rnum[h]);
-		internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
-				     -tb->lnum[h]);
-		return;
-	}
-
-	/* borrow from right neighbor R[h] */
-	if (tb->rnum[h] < 0) {
-		RFALSE(tb->lnum[h] != 0,
-		       "invalid tb->lnum[%d]==%d when borrow from R[h]",
-		       h, tb->lnum[h]);
-		internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);	/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */
-		return;
-	}
-
-	/* split S[h] into two parts and put them into neighbors */
-	if (tb->lnum[h] > 0) {
-		RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
-		       "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
-		       h, tb->lnum[h], h, tb->rnum[h], n);
-
-		internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);	/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */
-		internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
-				     tb->rnum[h]);
-
-		reiserfs_invalidate_buffer(tb, tbSh);
-
-		return;
-	}
-	reiserfs_panic(tb->tb_sb, "ibalance-2",
-		       "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
-		       h, tb->lnum[h], h, tb->rnum[h]);
-}
-
-/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/
-static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
-{
-	RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL,
-	       "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
-	       tb->L[h], tb->CFL[h]);
-
-	if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
-		return;
-
-	memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
-
-	do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
-}
-
-/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/
-static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
-{
-	RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL,
-	       "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
-	       tb->R[h], tb->CFR[h]);
-	RFALSE(B_NR_ITEMS(tb->R[h]) == 0,
-	       "R[h] can not be empty if it exists (item number=%d)",
-	       B_NR_ITEMS(tb->R[h]));
-
-	memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
-
-	do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
-}
-
-
-/*
- * if inserting/pasting {
- *   child_pos is the position of the node-pointer in S[h] that
- *   pointed to S[h-1] before balancing of the h-1 level;
- *   this means that new pointers and items must be inserted AFTER
- *   child_pos
- * } else {
- *   it is the position of the leftmost pointer that must be deleted
- *   (together with its corresponding key to the left of the pointer)
- *   as a result of the previous level's balancing.
- * }
- */
-
-int balance_internal(struct tree_balance *tb,
-		     int h,	/* level of the tree */
-		     int child_pos,
-		     /* key for insertion on higher level    */
-		     struct item_head *insert_key,
-		     /* node for insertion on higher level */
-		     struct buffer_head **insert_ptr)
-{
-	struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
-	struct buffer_info bi;
-
-	/*
-	 * we return this: it is 0 if there is no S[h],
-	 * else it is tb->S[h]->b_item_order
-	 */
-	int order;
-	int insert_num, n, k;
-	struct buffer_head *S_new;
-	struct item_head new_insert_key;
-	struct buffer_head *new_insert_ptr = NULL;
-	struct item_head *new_insert_key_addr = insert_key;
-
-	RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h);
-
-	PROC_INFO_INC(tb->tb_sb, balance_at[h]);
-
-	order =
-	    (tbSh) ? PATH_H_POSITION(tb->tb_path,
-				     h + 1) /*tb->S[h]->b_item_order */ : 0;
-
-	/*
-	 * Using insert_size[h] calculate the number insert_num of items
-	 * that must be inserted to or deleted from S[h].
-	 */
-	insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
-
-	/* Check whether insert_num is proper * */
-	RFALSE(insert_num < -2 || insert_num > 2,
-	       "incorrect number of items inserted to the internal node (%d)",
-	       insert_num);
-	RFALSE(h > 1 && (insert_num > 1 || insert_num < -1),
-	       "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
-	       insert_num, h);
-
-	/* Make balance in case insert_num < 0 */
-	if (insert_num < 0) {
-		balance_internal_when_delete(tb, h, child_pos);
-		return order;
-	}
-
-	k = 0;
-	if (tb->lnum[h] > 0) {
-		/*
-		 * shift lnum[h] items from S[h] to the left neighbor L[h].
-		 * check how many of new items fall into L[h] or CFL[h] after
-		 * shifting
-		 */
-		n = B_NR_ITEMS(tb->L[h]);	/* number of items in L[h] */
-		if (tb->lnum[h] <= child_pos) {
-			/* new items don't fall into L[h] or CFL[h] */
-			internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
-					    tb->lnum[h]);
-			child_pos -= tb->lnum[h];
-		} else if (tb->lnum[h] > child_pos + insert_num) {
-			/* all new items fall into L[h] */
-			internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
-					    tb->lnum[h] - insert_num);
-			/* insert insert_num keys and node-pointers into L[h] */
-			bi.tb = tb;
-			bi.bi_bh = tb->L[h];
-			bi.bi_parent = tb->FL[h];
-			bi.bi_position = get_left_neighbor_position(tb, h);
-			internal_insert_childs(&bi,
-					       /*tb->L[h], tb->S[h-1]->b_next */
-					       n + child_pos + 1,
-					       insert_num, insert_key,
-					       insert_ptr);
-
-			insert_num = 0;
-		} else {
-			struct disk_child *dc;
-
-			/*
-			 * some items fall into L[h] or CFL[h],
-			 * but some don't fall
-			 */
-			internal_shift1_left(tb, h, child_pos + 1);
-			/* calculate number of new items that fall into L[h] */
-			k = tb->lnum[h] - child_pos - 1;
-			bi.tb = tb;
-			bi.bi_bh = tb->L[h];
-			bi.bi_parent = tb->FL[h];
-			bi.bi_position = get_left_neighbor_position(tb, h);
-			internal_insert_childs(&bi,
-					       /*tb->L[h], tb->S[h-1]->b_next, */
-					       n + child_pos + 1, k,
-					       insert_key, insert_ptr);
-
-			replace_lkey(tb, h, insert_key + k);
-
-			/*
-			 * replace the first node-ptr in S[h] by
-			 * node-ptr to insert_ptr[k]
-			 */
-			dc = B_N_CHILD(tbSh, 0);
-			put_dc_size(dc,
-				    MAX_CHILD_SIZE(insert_ptr[k]) -
-				    B_FREE_SPACE(insert_ptr[k]));
-			put_dc_block_number(dc, insert_ptr[k]->b_blocknr);
-
-			do_balance_mark_internal_dirty(tb, tbSh, 0);
-
-			k++;
-			insert_key += k;
-			insert_ptr += k;
-			insert_num -= k;
-			child_pos = 0;
-		}
-	}
-	/* tb->lnum[h] > 0 */
-	if (tb->rnum[h] > 0) {
-		/*shift rnum[h] items from S[h] to the right neighbor R[h] */
-		/*
-		 * check how many of new items fall into R or CFR
-		 * after shifting
-		 */
-		n = B_NR_ITEMS(tbSh);	/* number of items in S[h] */
-		if (n - tb->rnum[h] >= child_pos)
-			/* new items fall into S[h] */
-			internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
-					     tb->rnum[h]);
-		else if (n + insert_num - tb->rnum[h] < child_pos) {
-			/* all new items fall into R[h] */
-			internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
-					     tb->rnum[h] - insert_num);
-
-			/* insert insert_num keys and node-pointers into R[h] */
-			bi.tb = tb;
-			bi.bi_bh = tb->R[h];
-			bi.bi_parent = tb->FR[h];
-			bi.bi_position = get_right_neighbor_position(tb, h);
-			internal_insert_childs(&bi,
-					       /*tb->R[h],tb->S[h-1]->b_next */
-					       child_pos - n - insert_num +
-					       tb->rnum[h] - 1,
-					       insert_num, insert_key,
-					       insert_ptr);
-			insert_num = 0;
-		} else {
-			struct disk_child *dc;
-
-			/* one of the items falls into CFR[h] */
-			internal_shift1_right(tb, h, n - child_pos + 1);
-			/* calculate number of new items that fall into R[h] */
-			k = tb->rnum[h] - n + child_pos - 1;
-			bi.tb = tb;
-			bi.bi_bh = tb->R[h];
-			bi.bi_parent = tb->FR[h];
-			bi.bi_position = get_right_neighbor_position(tb, h);
-			internal_insert_childs(&bi,
-					       /*tb->R[h], tb->R[h]->b_child, */
-					       0, k, insert_key + 1,
-					       insert_ptr + 1);
-
-			replace_rkey(tb, h, insert_key + insert_num - k - 1);
-
-			/*
-			 * replace the first node-ptr in R[h] by
-			 * node-ptr insert_ptr[insert_num-k-1]
-			 */
-			dc = B_N_CHILD(tb->R[h], 0);
-			put_dc_size(dc,
-				    MAX_CHILD_SIZE(insert_ptr
-						   [insert_num - k - 1]) -
-				    B_FREE_SPACE(insert_ptr
-						 [insert_num - k - 1]));
-			put_dc_block_number(dc,
-					    insert_ptr[insert_num - k -
-						       1]->b_blocknr);
-
-			do_balance_mark_internal_dirty(tb, tb->R[h], 0);
-
-			insert_num -= (k + 1);
-		}
-	}
-
-	/** Fill new node that appears instead of S[h] **/
-	RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
-	RFALSE(tb->blknum[h] < 0, "blknum can not be < 0");
-
-	if (!tb->blknum[h]) {	/* node S[h] is empty now */
-		RFALSE(!tbSh, "S[h] is equal NULL");
-
-		/* do what is needed for buffer thrown from tree */
-		reiserfs_invalidate_buffer(tb, tbSh);
-		return order;
-	}
-
-	if (!tbSh) {
-		/* create new root */
-		struct disk_child *dc;
-		struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1);
-		struct block_head *blkh;
-
-		if (tb->blknum[h] != 1)
-			reiserfs_panic(NULL, "ibalance-3", "One new node "
-				       "required for creating the new root");
-		/* S[h] = empty buffer from the list FEB. */
-		tbSh = get_FEB(tb);
-		blkh = B_BLK_HEAD(tbSh);
-		set_blkh_level(blkh, h + 1);
-
-		/* Put the unique node-pointer to S[h] that points to S[h-1]. */
-
-		dc = B_N_CHILD(tbSh, 0);
-		put_dc_block_number(dc, tbSh_1->b_blocknr);
-		put_dc_size(dc,
-			    (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1)));
-
-		tb->insert_size[h] -= DC_SIZE;
-		set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE);
-
-		do_balance_mark_internal_dirty(tb, tbSh, 0);
-
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-		check_internal(tbSh);
-		/*&&&&&&&&&&&&&&&&&&&&&&&& */
-
-		/* put new root into path structure */
-		PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) =
-		    tbSh;
-
-		/* Change root in structure super block. */
-		PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr);
-		PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1);
-		do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
-	}
-
-	if (tb->blknum[h] == 2) {
-		int snum;
-		struct buffer_info dest_bi, src_bi;
-
-		/* S_new = free buffer from list FEB */
-		S_new = get_FEB(tb);
-
-		set_blkh_level(B_BLK_HEAD(S_new), h + 1);
-
-		dest_bi.tb = tb;
-		dest_bi.bi_bh = S_new;
-		dest_bi.bi_parent = NULL;
-		dest_bi.bi_position = 0;
-		src_bi.tb = tb;
-		src_bi.bi_bh = tbSh;
-		src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
-		n = B_NR_ITEMS(tbSh);	/* number of items in S[h] */
-		snum = (insert_num + n + 1) / 2;
-		if (n - snum >= child_pos) {
-			/* new items don't fall into S_new */
-			/*  store the delimiting key for the next level */
-			/* new_insert_key = (n - snum)'th key in S[h] */
-			memcpy(&new_insert_key, internal_key(tbSh, n - snum),
-			       KEY_SIZE);
-			/* last parameter is del_par */
-			internal_move_pointers_items(&dest_bi, &src_bi,
-						     LAST_TO_FIRST, snum, 0);
-		} else if (n + insert_num - snum < child_pos) {
-			/* all new items fall into S_new */
-			/*  store the delimiting key for the next level */
-			/*
-			 * new_insert_key = (n + insert_item - snum)'th
-			 * key in S[h]
-			 */
-			memcpy(&new_insert_key,
-			       internal_key(tbSh, n + insert_num - snum),
-			       KEY_SIZE);
-			/* last parameter is del_par */
-			internal_move_pointers_items(&dest_bi, &src_bi,
-						     LAST_TO_FIRST,
-						     snum - insert_num, 0);
-
-			/*
-			 * insert insert_num keys and node-pointers
-			 * into S_new
-			 */
-			internal_insert_childs(&dest_bi,
-					       /*S_new,tb->S[h-1]->b_next, */
-					       child_pos - n - insert_num +
-					       snum - 1,
-					       insert_num, insert_key,
-					       insert_ptr);
-
-			insert_num = 0;
-		} else {
-			struct disk_child *dc;
-
-			/* some items fall into S_new, but some don't fall */
-			/* last parameter is del_par */
-			internal_move_pointers_items(&dest_bi, &src_bi,
-						     LAST_TO_FIRST,
-						     n - child_pos + 1, 1);
-			/* calculate number of new items that fall into S_new */
-			k = snum - n + child_pos - 1;
-
-			internal_insert_childs(&dest_bi, /*S_new, */ 0, k,
-					       insert_key + 1, insert_ptr + 1);
-
-			/* new_insert_key = insert_key[insert_num - k - 1] */
-			memcpy(&new_insert_key, insert_key + insert_num - k - 1,
-			       KEY_SIZE);
-			/*
-			 * replace first node-ptr in S_new by node-ptr
-			 * to insert_ptr[insert_num-k-1]
-			 */
-
-			dc = B_N_CHILD(S_new, 0);
-			put_dc_size(dc,
-				    (MAX_CHILD_SIZE
-				     (insert_ptr[insert_num - k - 1]) -
-				     B_FREE_SPACE(insert_ptr
-						  [insert_num - k - 1])));
-			put_dc_block_number(dc,
-					    insert_ptr[insert_num - k -
-						       1]->b_blocknr);
-
-			do_balance_mark_internal_dirty(tb, S_new, 0);
-
-			insert_num -= (k + 1);
-		}
-		/* new_insert_ptr = node_pointer to S_new */
-		new_insert_ptr = S_new;
-
-		RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new)
-		       || buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
-		       S_new);
-
-		/* S_new is released in unfix_nodes */
-	}
-
-	n = B_NR_ITEMS(tbSh);	/*number of items in S[h] */
-
-	if (0 <= child_pos && child_pos <= n && insert_num > 0) {
-		bi.tb = tb;
-		bi.bi_bh = tbSh;
-		bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
-		bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-		internal_insert_childs(&bi,	/*tbSh, */
-				       /*          ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next :  tb->S[h]->b_child->b_next, */
-				       child_pos, insert_num, insert_key,
-				       insert_ptr);
-	}
-
-	insert_ptr[0] = new_insert_ptr;
-	if (new_insert_ptr)
-		memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
-
-	return order;
-}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
deleted file mode 100644
index d39ee5f6c075..000000000000
--- a/fs/reiserfs/inode.c
+++ /dev/null
@@ -1,3416 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/exportfs.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include <linux/unaligned.h>
-#include <linux/buffer_head.h>
-#include <linux/mpage.h>
-#include <linux/writeback.h>
-#include <linux/quotaops.h>
-#include <linux/swap.h>
-#include <linux/uio.h>
-#include <linux/bio.h>
-
-int reiserfs_commit_write(struct file *f, struct page *page,
-			  unsigned from, unsigned to);
-
-void reiserfs_evict_inode(struct inode *inode)
-{
-	/*
-	 * We need blocks for transaction + (user+group) quota
-	 * update (possibly delete)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 2 +
-	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
-	struct reiserfs_transaction_handle th;
-	int err;
-
-	if (!inode->i_nlink && !is_bad_inode(inode))
-		dquot_initialize(inode);
-
-	truncate_inode_pages_final(&inode->i_data);
-	if (inode->i_nlink)
-		goto no_delete;
-
-	/*
-	 * The = 0 happens when we abort creating a new inode
-	 * for some reason like lack of space..
-	 * also handles bad_inode case
-	 */
-	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
-
-		reiserfs_delete_xattrs(inode);
-
-		reiserfs_write_lock(inode->i_sb);
-
-		if (journal_begin(&th, inode->i_sb, jbegin_count))
-			goto out;
-		reiserfs_update_inode_transaction(inode);
-
-		reiserfs_discard_prealloc(&th, inode);
-
-		err = reiserfs_delete_object(&th, inode);
-
-		/*
-		 * Do quota update inside a transaction for journaled quotas.
-		 * We must do that after delete_object so that quota updates
-		 * go into the same transaction as stat data deletion
-		 */
-		if (!err) {
-			int depth = reiserfs_write_unlock_nested(inode->i_sb);
-			dquot_free_inode(inode);
-			reiserfs_write_lock_nested(inode->i_sb, depth);
-		}
-
-		if (journal_end(&th))
-			goto out;
-
-		/*
-		 * check return value from reiserfs_delete_object after
-		 * ending the transaction
-		 */
-		if (err)
-		    goto out;
-
-		/*
-		 * all items of file are deleted, so we can remove
-		 * "save" link
-		 * we can't do anything about an error here
-		 */
-		remove_save_link(inode, 0 /* not truncate */);
-out:
-		reiserfs_write_unlock(inode->i_sb);
-	} else {
-		/* no object items are in the tree */
-		;
-	}
-
-	/* note this must go after the journal_end to prevent deadlock */
-	clear_inode(inode);
-
-	dquot_drop(inode);
-	inode->i_blocks = 0;
-	return;
-
-no_delete:
-	clear_inode(inode);
-	dquot_drop(inode);
-}
-
-static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
-			  __u32 objectid, loff_t offset, int type, int length)
-{
-	key->version = version;
-
-	key->on_disk_key.k_dir_id = dirid;
-	key->on_disk_key.k_objectid = objectid;
-	set_cpu_key_k_offset(key, offset);
-	set_cpu_key_k_type(key, type);
-	key->key_length = length;
-}
-
-/*
- * take base of inode_key (it comes from inode always) (dirid, objectid)
- * and version from an inode, set offset and type of key
- */
-void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
-		  int type, int length)
-{
-	_make_cpu_key(key, get_inode_item_key_version(inode),
-		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
-		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
-		      length);
-}
-
-/* when key is 0, do not set version and short key */
-inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
-			      int version,
-			      loff_t offset, int type, int length,
-			      int entry_count /*or ih_free_space */ )
-{
-	if (key) {
-		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
-		ih->ih_key.k_objectid =
-		    cpu_to_le32(key->on_disk_key.k_objectid);
-	}
-	put_ih_version(ih, version);
-	set_le_ih_k_offset(ih, offset);
-	set_le_ih_k_type(ih, type);
-	put_ih_item_len(ih, length);
-	/*    set_ih_free_space (ih, 0); */
-	/*
-	 * for directory items it is entry count, for directs and stat
-	 * datas - 0xffff, for indirects - 0
-	 */
-	put_ih_entry_count(ih, entry_count);
-}
-
-/*
- * FIXME: we might cache recently accessed indirect item
- * Ugh.  Not too eager for that....
- * I cut the code until such time as I see a convincing argument (benchmark).
- * I don't want a bloated inode struct..., and I don't like code complexity....
- */
-
-/*
- * cutting the code is fine, since it really isn't in use yet and is easy
- * to add back in.  But, Vladimir has a really good idea here.  Think
- * about what happens for reading a file.  For each page,
- * The VFS layer calls reiserfs_read_folio, who searches the tree to find
- * an indirect item.  This indirect item has X number of pointers, where
- * X is a big number if we've done the block allocation right.  But,
- * we only use one or two of these pointers during each call to read_folio,
- * needlessly researching again later on.
- *
- * The size of the cache could be dynamic based on the size of the file.
- *
- * I'd also like to see us cache the location the stat data item, since
- * we are needlessly researching for that frequently.
- *
- * --chris
- */
-
-/*
- * If this page has a file tail in it, and
- * it was read in by get_block_create_0, the page data is valid,
- * but tail is still sitting in a direct item, and we can't write to
- * it.  So, look through this page, and check all the mapped buffers
- * to make sure they have valid block numbers.  Any that don't need
- * to be unmapped, so that __block_write_begin will correctly call
- * reiserfs_get_block to convert the tail into an unformatted node
- */
-static inline void fix_tail_page_for_writing(struct page *page)
-{
-	struct buffer_head *head, *next, *bh;
-
-	if (page && page_has_buffers(page)) {
-		head = page_buffers(page);
-		bh = head;
-		do {
-			next = bh->b_this_page;
-			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
-				reiserfs_unmap_buffer(bh);
-			}
-			bh = next;
-		} while (bh != head);
-	}
-}
-
-/*
- * reiserfs_get_block does not need to allocate a block only if it has been
- * done already or non-hole position has been found in the indirect item
- */
-static inline int allocation_needed(int retval, b_blocknr_t allocated,
-				    struct item_head *ih,
-				    __le32 * item, int pos_in_item)
-{
-	if (allocated)
-		return 0;
-	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
-	    get_block_num(item, pos_in_item))
-		return 0;
-	return 1;
-}
-
-static inline int indirect_item_found(int retval, struct item_head *ih)
-{
-	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
-}
-
-static inline void set_block_dev_mapped(struct buffer_head *bh,
-					b_blocknr_t block, struct inode *inode)
-{
-	map_bh(bh, inode->i_sb, block);
-}
-
-/*
- * files which were created in the earlier version can not be longer,
- * than 2 gb
- */
-static int file_capable(struct inode *inode, sector_t block)
-{
-	/* it is new file. */
-	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
-	    /* old file, but 'block' is inside of 2gb */
-	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
-		return 1;
-
-	return 0;
-}
-
-static int restart_transaction(struct reiserfs_transaction_handle *th,
-			       struct inode *inode, struct treepath *path)
-{
-	struct super_block *s = th->t_super;
-	int err;
-
-	BUG_ON(!th->t_trans_id);
-	BUG_ON(!th->t_refcount);
-
-	pathrelse(path);
-
-	/* we cannot restart while nested */
-	if (th->t_refcount > 1) {
-		return 0;
-	}
-	reiserfs_update_sd(th, inode);
-	err = journal_end(th);
-	if (!err) {
-		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
-		if (!err)
-			reiserfs_update_inode_transaction(inode);
-	}
-	return err;
-}
-
-/*
- * it is called by get_block when create == 0. Returns block number
- * for 'block'-th logical block of file. When it hits direct item it
- * returns 0 (being called from bmap) or read direct item into piece
- * of page (bh_result)
- * Please improve the english/clarity in the comment above, as it is
- * hard to understand.
- */
-static int _get_block_create_0(struct inode *inode, sector_t block,
-			       struct buffer_head *bh_result, int args)
-{
-	INITIALIZE_PATH(path);
-	struct cpu_key key;
-	struct buffer_head *bh;
-	struct item_head *ih, tmp_ih;
-	b_blocknr_t blocknr;
-	char *p;
-	int chars;
-	int ret;
-	int result;
-	int done = 0;
-	unsigned long offset;
-
-	/* prepare the key to look for the 'block'-th block of file */
-	make_cpu_key(&key, inode,
-		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
-		     3);
-
-	result = search_for_position_by_key(inode->i_sb, &key, &path);
-	if (result != POSITION_FOUND) {
-		pathrelse(&path);
-		if (result == IO_ERROR)
-			return -EIO;
-		/*
-		 * We do not return -ENOENT if there is a hole but page is
-		 * uptodate, because it means that there is some MMAPED data
-		 * associated with it that is yet to be written to disk.
-		 */
-		if ((args & GET_BLOCK_NO_HOLE)
-		    && !PageUptodate(bh_result->b_page)) {
-			return -ENOENT;
-		}
-		return 0;
-	}
-
-	bh = get_last_bh(&path);
-	ih = tp_item_head(&path);
-	if (is_indirect_le_ih(ih)) {
-		__le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
-
-		/*
-		 * FIXME: here we could cache indirect item or part of it in
-		 * the inode to avoid search_by_key in case of subsequent
-		 * access to file
-		 */
-		blocknr = get_block_num(ind_item, path.pos_in_item);
-		ret = 0;
-		if (blocknr) {
-			map_bh(bh_result, inode->i_sb, blocknr);
-			if (path.pos_in_item ==
-			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
-				set_buffer_boundary(bh_result);
-			}
-		} else
-			/*
-			 * We do not return -ENOENT if there is a hole but
-			 * page is uptodate, because it means that there is
-			 * some MMAPED data associated with it that is
-			 * yet to be written to disk.
-			 */
-		if ((args & GET_BLOCK_NO_HOLE)
-			    && !PageUptodate(bh_result->b_page)) {
-			ret = -ENOENT;
-		}
-
-		pathrelse(&path);
-		return ret;
-	}
-	/* requested data are in direct item(s) */
-	if (!(args & GET_BLOCK_READ_DIRECT)) {
-		/*
-		 * we are called by bmap. FIXME: we can not map block of file
-		 * when it is stored in direct item(s)
-		 */
-		pathrelse(&path);
-		return -ENOENT;
-	}
-
-	/*
-	 * if we've got a direct item, and the buffer or page was uptodate,
-	 * we don't want to pull data off disk again.  skip to the
-	 * end, where we map the buffer and return
-	 */
-	if (buffer_uptodate(bh_result)) {
-		goto finished;
-	} else
-		/*
-		 * grab_tail_page can trigger calls to reiserfs_get_block on
-		 * up to date pages without any buffers.  If the page is up
-		 * to date, we don't want read old data off disk.  Set the up
-		 * to date bit on the buffer instead and jump to the end
-		 */
-	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
-		set_buffer_uptodate(bh_result);
-		goto finished;
-	}
-	/* read file tail into part of page */
-	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1);
-	copy_item_head(&tmp_ih, ih);
-
-	/*
-	 * we only want to kmap if we are reading the tail into the page.
-	 * this is not the common case, so we don't kmap until we are
-	 * sure we need to.  But, this means the item might move if
-	 * kmap schedules
-	 */
-	p = (char *)kmap(bh_result->b_page);
-	p += offset;
-	memset(p, 0, inode->i_sb->s_blocksize);
-	do {
-		if (!is_direct_le_ih(ih)) {
-			BUG();
-		}
-		/*
-		 * make sure we don't read more bytes than actually exist in
-		 * the file.  This can happen in odd cases where i_size isn't
-		 * correct, and when direct item padding results in a few
-		 * extra bytes at the end of the direct item
-		 */
-		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
-			break;
-		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
-			chars =
-			    inode->i_size - (le_ih_k_offset(ih) - 1) -
-			    path.pos_in_item;
-			done = 1;
-		} else {
-			chars = ih_item_len(ih) - path.pos_in_item;
-		}
-		memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
-
-		if (done)
-			break;
-
-		p += chars;
-
-		/*
-		 * we done, if read direct item is not the last item of
-		 * node FIXME: we could try to check right delimiting key
-		 * to see whether direct item continues in the right
-		 * neighbor or rely on i_size
-		 */
-		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
-			break;
-
-		/* update key to look for the next piece */
-		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
-		result = search_for_position_by_key(inode->i_sb, &key, &path);
-		if (result != POSITION_FOUND)
-			/* i/o error most likely */
-			break;
-		bh = get_last_bh(&path);
-		ih = tp_item_head(&path);
-	} while (1);
-
-	flush_dcache_page(bh_result->b_page);
-	kunmap(bh_result->b_page);
-
-finished:
-	pathrelse(&path);
-
-	if (result == IO_ERROR)
-		return -EIO;
-
-	/*
-	 * this buffer has valid data, but isn't valid for io.  mapping it to
-	 * block #0 tells the rest of reiserfs it just has a tail in it
-	 */
-	map_bh(bh_result, inode->i_sb, 0);
-	set_buffer_uptodate(bh_result);
-	return 0;
-}
-
-/*
- * this is called to create file map. So, _get_block_create_0 will not
- * read direct item
- */
-static int reiserfs_bmap(struct inode *inode, sector_t block,
-			 struct buffer_head *bh_result, int create)
-{
-	if (!file_capable(inode, block))
-		return -EFBIG;
-
-	reiserfs_write_lock(inode->i_sb);
-	/* do not read the direct item */
-	_get_block_create_0(inode, block, bh_result, 0);
-	reiserfs_write_unlock(inode->i_sb);
-	return 0;
-}
-
-/*
- * special version of get_block that is only used by grab_tail_page right
- * now.  It is sent to __block_write_begin, and when you try to get a
- * block past the end of the file (or a block from a hole) it returns
- * -ENOENT instead of a valid buffer.  __block_write_begin expects to
- * be able to do i/o on the buffers returned, unless an error value
- * is also returned.
- *
- * So, this allows __block_write_begin to be used for reading a single block
- * in a page.  Where it does not produce a valid page for holes, or past the
- * end of the file.  This turns out to be exactly what we need for reading
- * tails for conversion.
- *
- * The point of the wrapper is forcing a certain value for create, even
- * though the VFS layer is calling this function with create==1.  If you
- * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
- * don't use this function.
-*/
-static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
-				       struct buffer_head *bh_result,
-				       int create)
-{
-	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
-}
-
-/*
- * This is special helper for reiserfs_get_block in case we are executing
- * direct_IO request.
- */
-static int reiserfs_get_blocks_direct_io(struct inode *inode,
-					 sector_t iblock,
-					 struct buffer_head *bh_result,
-					 int create)
-{
-	int ret;
-
-	bh_result->b_page = NULL;
-
-	/*
-	 * We set the b_size before reiserfs_get_block call since it is
-	 * referenced in convert_tail_for_hole() that may be called from
-	 * reiserfs_get_block()
-	 */
-	bh_result->b_size = i_blocksize(inode);
-
-	ret = reiserfs_get_block(inode, iblock, bh_result,
-				 create | GET_BLOCK_NO_DANGLE);
-	if (ret)
-		goto out;
-
-	/* don't allow direct io onto tail pages */
-	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
-		/*
-		 * make sure future calls to the direct io funcs for this
-		 * offset in the file fail by unmapping the buffer
-		 */
-		clear_buffer_mapped(bh_result);
-		ret = -EINVAL;
-	}
-
-	/*
-	 * Possible unpacked tail. Flush the data before pages have
-	 * disappeared
-	 */
-	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
-		int err;
-
-		reiserfs_write_lock(inode->i_sb);
-
-		err = reiserfs_commit_for_inode(inode);
-		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
-		reiserfs_write_unlock(inode->i_sb);
-
-		if (err < 0)
-			ret = err;
-	}
-out:
-	return ret;
-}
-
-/*
- * helper function for when reiserfs_get_block is called for a hole
- * but the file tail is still in a direct item
- * bh_result is the buffer head for the hole
- * tail_offset is the offset of the start of the tail in the file
- *
- * This calls prepare_write, which will start a new transaction
- * you should not be in a transaction, or have any paths held when you
- * call this.
- */
-static int convert_tail_for_hole(struct inode *inode,
-				 struct buffer_head *bh_result,
-				 loff_t tail_offset)
-{
-	unsigned long index;
-	unsigned long tail_end;
-	unsigned long tail_start;
-	struct page *tail_page;
-	struct page *hole_page = bh_result->b_page;
-	int retval = 0;
-
-	if ((tail_offset & (bh_result->b_size - 1)) != 1)
-		return -EIO;
-
-	/* always try to read until the end of the block */
-	tail_start = tail_offset & (PAGE_SIZE - 1);
-	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
-
-	index = tail_offset >> PAGE_SHIFT;
-	/*
-	 * hole_page can be zero in case of direct_io, we are sure
-	 * that we cannot get here if we write with O_DIRECT into tail page
-	 */
-	if (!hole_page || index != hole_page->index) {
-		tail_page = grab_cache_page(inode->i_mapping, index);
-		retval = -ENOMEM;
-		if (!tail_page) {
-			goto out;
-		}
-	} else {
-		tail_page = hole_page;
-	}
-
-	/*
-	 * we don't have to make sure the conversion did not happen while
-	 * we were locking the page because anyone that could convert
-	 * must first take i_mutex.
-	 *
-	 * We must fix the tail page for writing because it might have buffers
-	 * that are mapped, but have a block number of 0.  This indicates tail
-	 * data that has been read directly into the page, and
-	 * __block_write_begin won't trigger a get_block in this case.
-	 */
-	fix_tail_page_for_writing(tail_page);
-	retval = __reiserfs_write_begin(tail_page, tail_start,
-				      tail_end - tail_start);
-	if (retval)
-		goto unlock;
-
-	/* tail conversion might change the data in the page */
-	flush_dcache_page(tail_page);
-
-	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
-
-unlock:
-	if (tail_page != hole_page) {
-		unlock_page(tail_page);
-		put_page(tail_page);
-	}
-out:
-	return retval;
-}
-
-static inline int _allocate_block(struct reiserfs_transaction_handle *th,
-				  sector_t block,
-				  struct inode *inode,
-				  b_blocknr_t * allocated_block_nr,
-				  struct treepath *path, int flags)
-{
-	BUG_ON(!th->t_trans_id);
-
-#ifdef REISERFS_PREALLOCATE
-	if (!(flags & GET_BLOCK_NO_IMUX)) {
-		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
-						  path, block);
-	}
-#endif
-	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
-					 block);
-}
-
-int reiserfs_get_block(struct inode *inode, sector_t block,
-		       struct buffer_head *bh_result, int create)
-{
-	int repeat, retval = 0;
-	/* b_blocknr_t is (unsigned) 32 bit int*/
-	b_blocknr_t allocated_block_nr = 0;
-	INITIALIZE_PATH(path);
-	int pos_in_item;
-	struct cpu_key key;
-	struct buffer_head *bh, *unbh = NULL;
-	struct item_head *ih, tmp_ih;
-	__le32 *item;
-	int done;
-	int fs_gen;
-	struct reiserfs_transaction_handle *th = NULL;
-	/*
-	 * space reserved in transaction batch:
-	 * . 3 balancings in direct->indirect conversion
-	 * . 1 block involved into reiserfs_update_sd()
-	 * XXX in practically impossible worst case direct2indirect()
-	 * can incur (much) more than 3 balancings.
-	 * quota update for user, group
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
-	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
-	int version;
-	int dangle = 1;
-	loff_t new_offset =
-	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
-
-	reiserfs_write_lock(inode->i_sb);
-	version = get_inode_item_key_version(inode);
-
-	if (!file_capable(inode, block)) {
-		reiserfs_write_unlock(inode->i_sb);
-		return -EFBIG;
-	}
-
-	/*
-	 * if !create, we aren't changing the FS, so we don't need to
-	 * log anything, so we don't need to start a transaction
-	 */
-	if (!(create & GET_BLOCK_CREATE)) {
-		int ret;
-		/* find number of block-th logical block of the file */
-		ret = _get_block_create_0(inode, block, bh_result,
-					  create | GET_BLOCK_READ_DIRECT);
-		reiserfs_write_unlock(inode->i_sb);
-		return ret;
-	}
-
-	/*
-	 * if we're already in a transaction, make sure to close
-	 * any new transactions we start in this func
-	 */
-	if ((create & GET_BLOCK_NO_DANGLE) ||
-	    reiserfs_transaction_running(inode->i_sb))
-		dangle = 0;
-
-	/*
-	 * If file is of such a size, that it might have a tail and
-	 * tails are enabled  we should mark it as possibly needing
-	 * tail packing on close
-	 */
-	if ((have_large_tails(inode->i_sb)
-	     && inode->i_size < i_block_size(inode) * 4)
-	    || (have_small_tails(inode->i_sb)
-		&& inode->i_size < i_block_size(inode)))
-		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
-
-	/* set the key of the first byte in the 'block'-th block of file */
-	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
-	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
-start_trans:
-		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
-		if (!th) {
-			retval = -ENOMEM;
-			goto failure;
-		}
-		reiserfs_update_inode_transaction(inode);
-	}
-research:
-
-	retval = search_for_position_by_key(inode->i_sb, &key, &path);
-	if (retval == IO_ERROR) {
-		retval = -EIO;
-		goto failure;
-	}
-
-	bh = get_last_bh(&path);
-	ih = tp_item_head(&path);
-	item = tp_item_body(&path);
-	pos_in_item = path.pos_in_item;
-
-	fs_gen = get_generation(inode->i_sb);
-	copy_item_head(&tmp_ih, ih);
-
-	if (allocation_needed
-	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
-		/* we have to allocate block for the unformatted node */
-		if (!th) {
-			pathrelse(&path);
-			goto start_trans;
-		}
-
-		repeat =
-		    _allocate_block(th, block, inode, &allocated_block_nr,
-				    &path, create);
-
-		/*
-		 * restart the transaction to give the journal a chance to free
-		 * some blocks.  releases the path, so we have to go back to
-		 * research if we succeed on the second try
-		 */
-		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
-			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
-			retval = restart_transaction(th, inode, &path);
-			if (retval)
-				goto failure;
-			repeat =
-			    _allocate_block(th, block, inode,
-					    &allocated_block_nr, NULL, create);
-
-			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
-				goto research;
-			}
-			if (repeat == QUOTA_EXCEEDED)
-				retval = -EDQUOT;
-			else
-				retval = -ENOSPC;
-			goto failure;
-		}
-
-		if (fs_changed(fs_gen, inode->i_sb)
-		    && item_moved(&tmp_ih, &path)) {
-			goto research;
-		}
-	}
-
-	if (indirect_item_found(retval, ih)) {
-		b_blocknr_t unfm_ptr;
-		/*
-		 * 'block'-th block is in the file already (there is
-		 * corresponding cell in some indirect item). But it may be
-		 * zero unformatted node pointer (hole)
-		 */
-		unfm_ptr = get_block_num(item, pos_in_item);
-		if (unfm_ptr == 0) {
-			/* use allocated block to plug the hole */
-			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
-			if (fs_changed(fs_gen, inode->i_sb)
-			    && item_moved(&tmp_ih, &path)) {
-				reiserfs_restore_prepared_buffer(inode->i_sb,
-								 bh);
-				goto research;
-			}
-			set_buffer_new(bh_result);
-			if (buffer_dirty(bh_result)
-			    && reiserfs_data_ordered(inode->i_sb))
-				reiserfs_add_ordered_list(inode, bh_result);
-			put_block_num(item, pos_in_item, allocated_block_nr);
-			unfm_ptr = allocated_block_nr;
-			journal_mark_dirty(th, bh);
-			reiserfs_update_sd(th, inode);
-		}
-		set_block_dev_mapped(bh_result, unfm_ptr, inode);
-		pathrelse(&path);
-		retval = 0;
-		if (!dangle && th)
-			retval = reiserfs_end_persistent_transaction(th);
-
-		reiserfs_write_unlock(inode->i_sb);
-
-		/*
-		 * the item was found, so new blocks were not added to the file
-		 * there is no need to make sure the inode is updated with this
-		 * transaction
-		 */
-		return retval;
-	}
-
-	if (!th) {
-		pathrelse(&path);
-		goto start_trans;
-	}
-
-	/*
-	 * desired position is not found or is in the direct item. We have
-	 * to append file with holes up to 'block'-th block converting
-	 * direct items to indirect one if necessary
-	 */
-	done = 0;
-	do {
-		if (is_statdata_le_ih(ih)) {
-			__le32 unp = 0;
-			struct cpu_key tmp_key;
-
-			/* indirect item has to be inserted */
-			make_le_item_head(&tmp_ih, &key, version, 1,
-					  TYPE_INDIRECT, UNFM_P_SIZE,
-					  0 /* free_space */ );
-
-			/*
-			 * we are going to add 'block'-th block to the file.
-			 * Use allocated block for that
-			 */
-			if (cpu_key_k_offset(&key) == 1) {
-				unp = cpu_to_le32(allocated_block_nr);
-				set_block_dev_mapped(bh_result,
-						     allocated_block_nr, inode);
-				set_buffer_new(bh_result);
-				done = 1;
-			}
-			tmp_key = key;	/* ;) */
-			set_cpu_key_k_offset(&tmp_key, 1);
-			PATH_LAST_POSITION(&path)++;
-
-			retval =
-			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
-						 inode, (char *)&unp);
-			if (retval) {
-				reiserfs_free_block(th, inode,
-						    allocated_block_nr, 1);
-				/*
-				 * retval == -ENOSPC, -EDQUOT or -EIO
-				 * or -EEXIST
-				 */
-				goto failure;
-			}
-		} else if (is_direct_le_ih(ih)) {
-			/* direct item has to be converted */
-			loff_t tail_offset;
-
-			tail_offset =
-			    ((le_ih_k_offset(ih) -
-			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
-
-			/*
-			 * direct item we just found fits into block we have
-			 * to map. Convert it into unformatted node: use
-			 * bh_result for the conversion
-			 */
-			if (tail_offset == cpu_key_k_offset(&key)) {
-				set_block_dev_mapped(bh_result,
-						     allocated_block_nr, inode);
-				unbh = bh_result;
-				done = 1;
-			} else {
-				/*
-				 * we have to pad file tail stored in direct
-				 * item(s) up to block size and convert it
-				 * to unformatted node. FIXME: this should
-				 * also get into page cache
-				 */
-
-				pathrelse(&path);
-				/*
-				 * ugly, but we can only end the transaction if
-				 * we aren't nested
-				 */
-				BUG_ON(!th->t_refcount);
-				if (th->t_refcount == 1) {
-					retval =
-					    reiserfs_end_persistent_transaction
-					    (th);
-					th = NULL;
-					if (retval)
-						goto failure;
-				}
-
-				retval =
-				    convert_tail_for_hole(inode, bh_result,
-							  tail_offset);
-				if (retval) {
-					if (retval != -ENOSPC)
-						reiserfs_error(inode->i_sb,
-							"clm-6004",
-							"convert tail failed "
-							"inode %lu, error %d",
-							inode->i_ino,
-							retval);
-					if (allocated_block_nr) {
-						/*
-						 * the bitmap, the super,
-						 * and the stat data == 3
-						 */
-						if (!th)
-							th = reiserfs_persistent_transaction(inode->i_sb, 3);
-						if (th)
-							reiserfs_free_block(th,
-									    inode,
-									    allocated_block_nr,
-									    1);
-					}
-					goto failure;
-				}
-				goto research;
-			}
-			retval =
-			    direct2indirect(th, inode, &path, unbh,
-					    tail_offset);
-			if (retval) {
-				reiserfs_unmap_buffer(unbh);
-				reiserfs_free_block(th, inode,
-						    allocated_block_nr, 1);
-				goto failure;
-			}
-			/*
-			 * it is important the set_buffer_uptodate is done
-			 * after the direct2indirect.  The buffer might
-			 * contain valid data newer than the data on disk
-			 * (read by read_folio, changed, and then sent here by
-			 * writepage).  direct2indirect needs to know if unbh
-			 * was already up to date, so it can decide if the
-			 * data in unbh needs to be replaced with data from
-			 * the disk
-			 */
-			set_buffer_uptodate(unbh);
-
-			/*
-			 * unbh->b_page == NULL in case of DIRECT_IO request,
-			 * this means buffer will disappear shortly, so it
-			 * should not be added to
-			 */
-			if (unbh->b_page) {
-				/*
-				 * we've converted the tail, so we must
-				 * flush unbh before the transaction commits
-				 */
-				reiserfs_add_tail_list(inode, unbh);
-
-				/*
-				 * mark it dirty now to prevent commit_write
-				 * from adding this buffer to the inode's
-				 * dirty buffer list
-				 */
-				/*
-				 * AKPM: changed __mark_buffer_dirty to
-				 * mark_buffer_dirty().  It's still atomic,
-				 * but it sets the page dirty too, which makes
-				 * it eligible for writeback at any time by the
-				 * VM (which was also the case with
-				 * __mark_buffer_dirty())
-				 */
-				mark_buffer_dirty(unbh);
-			}
-		} else {
-			/*
-			 * append indirect item with holes if needed, when
-			 * appending pointer to 'block'-th block use block,
-			 * which is already allocated
-			 */
-			struct cpu_key tmp_key;
-			/*
-			 * We use this in case we need to allocate
-			 * only one block which is a fastpath
-			 */
-			unp_t unf_single = 0;
-			unp_t *un;
-			__u64 max_to_insert =
-			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
-			    UNFM_P_SIZE;
-			__u64 blocks_needed;
-
-			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
-			       "vs-804: invalid position for append");
-			/*
-			 * indirect item has to be appended,
-			 * set up key of that position
-			 * (key type is unimportant)
-			 */
-			make_cpu_key(&tmp_key, inode,
-				     le_key_k_offset(version,
-						     &ih->ih_key) +
-				     op_bytes_number(ih,
-						     inode->i_sb->s_blocksize),
-				     TYPE_INDIRECT, 3);
-
-			RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
-			       "green-805: invalid offset");
-			blocks_needed =
-			    1 +
-			    ((cpu_key_k_offset(&key) -
-			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
-			     s_blocksize_bits);
-
-			if (blocks_needed == 1) {
-				un = &unf_single;
-			} else {
-				un = kcalloc(min(blocks_needed, max_to_insert),
-					     UNFM_P_SIZE, GFP_NOFS);
-				if (!un) {
-					un = &unf_single;
-					blocks_needed = 1;
-					max_to_insert = 0;
-				}
-			}
-			if (blocks_needed <= max_to_insert) {
-				/*
-				 * we are going to add target block to
-				 * the file. Use allocated block for that
-				 */
-				un[blocks_needed - 1] =
-				    cpu_to_le32(allocated_block_nr);
-				set_block_dev_mapped(bh_result,
-						     allocated_block_nr, inode);
-				set_buffer_new(bh_result);
-				done = 1;
-			} else {
-				/* paste hole to the indirect item */
-				/*
-				 * If kcalloc failed, max_to_insert becomes
-				 * zero and it means we only have space for
-				 * one block
-				 */
-				blocks_needed =
-				    max_to_insert ? max_to_insert : 1;
-			}
-			retval =
-			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
-						     (char *)un,
-						     UNFM_P_SIZE *
-						     blocks_needed);
-
-			if (blocks_needed != 1)
-				kfree(un);
-
-			if (retval) {
-				reiserfs_free_block(th, inode,
-						    allocated_block_nr, 1);
-				goto failure;
-			}
-			if (!done) {
-				/*
-				 * We need to mark new file size in case
-				 * this function will be interrupted/aborted
-				 * later on. And we may do this only for
-				 * holes.
-				 */
-				inode->i_size +=
-				    inode->i_sb->s_blocksize * blocks_needed;
-			}
-		}
-
-		if (done == 1)
-			break;
-
-		/*
-		 * this loop could log more blocks than we had originally
-		 * asked for.  So, we have to allow the transaction to end
-		 * if it is too big or too full.  Update the inode so things
-		 * are consistent if we crash before the function returns
-		 * release the path so that anybody waiting on the path before
-		 * ending their transaction will be able to continue.
-		 */
-		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
-			retval = restart_transaction(th, inode, &path);
-			if (retval)
-				goto failure;
-		}
-		/*
-		 * inserting indirect pointers for a hole can take a
-		 * long time.  reschedule if needed and also release the write
-		 * lock for others.
-		 */
-		reiserfs_cond_resched(inode->i_sb);
-
-		retval = search_for_position_by_key(inode->i_sb, &key, &path);
-		if (retval == IO_ERROR) {
-			retval = -EIO;
-			goto failure;
-		}
-		if (retval == POSITION_FOUND) {
-			reiserfs_warning(inode->i_sb, "vs-825",
-					 "%K should not be found", &key);
-			retval = -EEXIST;
-			if (allocated_block_nr)
-				reiserfs_free_block(th, inode,
-						    allocated_block_nr, 1);
-			pathrelse(&path);
-			goto failure;
-		}
-		bh = get_last_bh(&path);
-		ih = tp_item_head(&path);
-		item = tp_item_body(&path);
-		pos_in_item = path.pos_in_item;
-	} while (1);
-
-	retval = 0;
-
-failure:
-	if (th && (!dangle || (retval && !th->t_trans_id))) {
-		int err;
-		if (th->t_trans_id)
-			reiserfs_update_sd(th, inode);
-		err = reiserfs_end_persistent_transaction(th);
-		if (err)
-			retval = err;
-	}
-
-	reiserfs_write_unlock(inode->i_sb);
-	reiserfs_check_path(&path);
-	return retval;
-}
-
-static void reiserfs_readahead(struct readahead_control *rac)
-{
-	mpage_readahead(rac, reiserfs_get_block);
-}
-
-/*
- * Compute real number of used bytes by file
- * Following three functions can go away when we'll have enough space in
- * stat item
- */
-static int real_space_diff(struct inode *inode, int sd_size)
-{
-	int bytes;
-	loff_t blocksize = inode->i_sb->s_blocksize;
-
-	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
-		return sd_size;
-
-	/*
-	 * End of file is also in full block with indirect reference, so round
-	 * up to the next block.
-	 *
-	 * there is just no way to know if the tail is actually packed
-	 * on the file, so we have to assume it isn't.  When we pack the
-	 * tail, we add 4 bytes to pretend there really is an unformatted
-	 * node pointer
-	 */
-	bytes =
-	    ((inode->i_size +
-	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
-	    sd_size;
-	return bytes;
-}
-
-static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
-					int sd_size)
-{
-	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
-		return inode->i_size +
-		    (loff_t) (real_space_diff(inode, sd_size));
-	}
-	return ((loff_t) real_space_diff(inode, sd_size)) +
-	    (((loff_t) blocks) << 9);
-}
-
-/* Compute number of blocks used by file in ReiserFS counting */
-static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
-{
-	loff_t bytes = inode_get_bytes(inode);
-	loff_t real_space = real_space_diff(inode, sd_size);
-
-	/* keeps fsck and non-quota versions of reiserfs happy */
-	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
-		bytes += (loff_t) 511;
-	}
-
-	/*
-	 * files from before the quota patch might i_blocks such that
-	 * bytes < real_space.  Deal with that here to prevent it from
-	 * going negative.
-	 */
-	if (bytes < real_space)
-		return 0;
-	return (bytes - real_space) >> 9;
-}
-
-/*
- * BAD: new directories have stat data of new type and all other items
- * of old type. Version stored in the inode says about body items, so
- * in update_stat_data we can not rely on inode, but have to check
- * item version directly
- */
-
-/* called by read_locked_inode */
-static void init_inode(struct inode *inode, struct treepath *path)
-{
-	struct buffer_head *bh;
-	struct item_head *ih;
-	__u32 rdev;
-
-	bh = PATH_PLAST_BUFFER(path);
-	ih = tp_item_head(path);
-
-	copy_key(INODE_PKEY(inode), &ih->ih_key);
-
-	INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
-	REISERFS_I(inode)->i_flags = 0;
-	REISERFS_I(inode)->i_prealloc_block = 0;
-	REISERFS_I(inode)->i_prealloc_count = 0;
-	REISERFS_I(inode)->i_trans_id = 0;
-	REISERFS_I(inode)->i_jl = NULL;
-	reiserfs_init_xattr_rwsem(inode);
-
-	if (stat_data_v1(ih)) {
-		struct stat_data_v1 *sd =
-		    (struct stat_data_v1 *)ih_item_body(bh, ih);
-		unsigned long blocks;
-
-		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
-		set_inode_sd_version(inode, STAT_DATA_V1);
-		inode->i_mode = sd_v1_mode(sd);
-		set_nlink(inode, sd_v1_nlink(sd));
-		i_uid_write(inode, sd_v1_uid(sd));
-		i_gid_write(inode, sd_v1_gid(sd));
-		inode->i_size = sd_v1_size(sd);
-		inode_set_atime(inode, sd_v1_atime(sd), 0);
-		inode_set_mtime(inode, sd_v1_mtime(sd), 0);
-		inode_set_ctime(inode, sd_v1_ctime(sd), 0);
-
-		inode->i_blocks = sd_v1_blocks(sd);
-		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
-		blocks = (inode->i_size + 511) >> 9;
-		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
-
-		/*
-		 * there was a bug in <=3.5.23 when i_blocks could take
-		 * negative values. Starting from 3.5.17 this value could
-		 * even be stored in stat data. For such files we set
-		 * i_blocks based on file size. Just 2 notes: this can be
-		 * wrong for sparse files. On-disk value will be only
-		 * updated if file's inode will ever change
-		 */
-		if (inode->i_blocks > blocks) {
-			inode->i_blocks = blocks;
-		}
-
-		rdev = sd_v1_rdev(sd);
-		REISERFS_I(inode)->i_first_direct_byte =
-		    sd_v1_first_direct_byte(sd);
-
-		/*
-		 * an early bug in the quota code can give us an odd
-		 * number for the block count.  This is incorrect, fix it here.
-		 */
-		if (inode->i_blocks & 1) {
-			inode->i_blocks++;
-		}
-		inode_set_bytes(inode,
-				to_real_used_space(inode, inode->i_blocks,
-						   SD_V1_SIZE));
-		/*
-		 * nopack is initially zero for v1 objects. For v2 objects,
-		 * nopack is initialised from sd_attrs
-		 */
-		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
-	} else {
-		/*
-		 * new stat data found, but object may have old items
-		 * (directories and symlinks)
-		 */
-		struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
-
-		inode->i_mode = sd_v2_mode(sd);
-		set_nlink(inode, sd_v2_nlink(sd));
-		i_uid_write(inode, sd_v2_uid(sd));
-		inode->i_size = sd_v2_size(sd);
-		i_gid_write(inode, sd_v2_gid(sd));
-		inode_set_mtime(inode, sd_v2_mtime(sd), 0);
-		inode_set_atime(inode, sd_v2_atime(sd), 0);
-		inode_set_ctime(inode, sd_v2_ctime(sd), 0);
-		inode->i_blocks = sd_v2_blocks(sd);
-		rdev = sd_v2_rdev(sd);
-		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-			inode->i_generation =
-			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
-		else
-			inode->i_generation = sd_v2_generation(sd);
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
-		else
-			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
-		REISERFS_I(inode)->i_first_direct_byte = 0;
-		set_inode_sd_version(inode, STAT_DATA_V2);
-		inode_set_bytes(inode,
-				to_real_used_space(inode, inode->i_blocks,
-						   SD_V2_SIZE));
-		/*
-		 * read persistent inode attributes from sd and initialise
-		 * generic inode flags from them
-		 */
-		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
-		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
-	}
-
-	pathrelse(path);
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &reiserfs_file_inode_operations;
-		inode->i_fop = &reiserfs_file_operations;
-		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &reiserfs_dir_inode_operations;
-		inode->i_fop = &reiserfs_dir_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		inode->i_op = &reiserfs_symlink_inode_operations;
-		inode_nohighmem(inode);
-		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-	} else {
-		inode->i_blocks = 0;
-		inode->i_op = &reiserfs_special_inode_operations;
-		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
-	}
-}
-
-/* update new stat data with inode fields */
-static void inode2sd(void *sd, struct inode *inode, loff_t size)
-{
-	struct stat_data *sd_v2 = (struct stat_data *)sd;
-
-	set_sd_v2_mode(sd_v2, inode->i_mode);
-	set_sd_v2_nlink(sd_v2, inode->i_nlink);
-	set_sd_v2_uid(sd_v2, i_uid_read(inode));
-	set_sd_v2_size(sd_v2, size);
-	set_sd_v2_gid(sd_v2, i_gid_read(inode));
-	set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
-	set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
-	set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
-	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
-	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
-	else
-		set_sd_v2_generation(sd_v2, inode->i_generation);
-	set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs);
-}
-
-/* used to copy inode's fields to old stat data */
-static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
-{
-	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
-
-	set_sd_v1_mode(sd_v1, inode->i_mode);
-	set_sd_v1_uid(sd_v1, i_uid_read(inode));
-	set_sd_v1_gid(sd_v1, i_gid_read(inode));
-	set_sd_v1_nlink(sd_v1, inode->i_nlink);
-	set_sd_v1_size(sd_v1, size);
-	set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
-	set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
-	set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
-
-	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
-	else
-		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
-
-	/* Sigh. i_first_direct_byte is back */
-	set_sd_v1_first_direct_byte(sd_v1,
-				    REISERFS_I(inode)->i_first_direct_byte);
-}
-
-/*
- * NOTE, you must prepare the buffer head before sending it here,
- * and then log it after the call
- */
-static void update_stat_data(struct treepath *path, struct inode *inode,
-			     loff_t size)
-{
-	struct buffer_head *bh;
-	struct item_head *ih;
-
-	bh = PATH_PLAST_BUFFER(path);
-	ih = tp_item_head(path);
-
-	if (!is_statdata_le_ih(ih))
-		reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
-			       INODE_PKEY(inode), ih);
-
-	/* path points to old stat data */
-	if (stat_data_v1(ih)) {
-		inode2sd_v1(ih_item_body(bh, ih), inode, size);
-	} else {
-		inode2sd(ih_item_body(bh, ih), inode, size);
-	}
-
-	return;
-}
-
-void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
-			     struct inode *inode, loff_t size)
-{
-	struct cpu_key key;
-	INITIALIZE_PATH(path);
-	struct buffer_head *bh;
-	int fs_gen;
-	struct item_head *ih, tmp_ih;
-	int retval;
-
-	BUG_ON(!th->t_trans_id);
-
-	/* key type is unimportant */
-	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
-
-	for (;;) {
-		int pos;
-		/* look for the object's stat data */
-		retval = search_item(inode->i_sb, &key, &path);
-		if (retval == IO_ERROR) {
-			reiserfs_error(inode->i_sb, "vs-13050",
-				       "i/o failure occurred trying to "
-				       "update %K stat data", &key);
-			return;
-		}
-		if (retval == ITEM_NOT_FOUND) {
-			pos = PATH_LAST_POSITION(&path);
-			pathrelse(&path);
-			if (inode->i_nlink == 0) {
-				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
-				return;
-			}
-			reiserfs_warning(inode->i_sb, "vs-13060",
-					 "stat data of object %k (nlink == %d) "
-					 "not found (pos %d)",
-					 INODE_PKEY(inode), inode->i_nlink,
-					 pos);
-			reiserfs_check_path(&path);
-			return;
-		}
-
-		/*
-		 * sigh, prepare_for_journal might schedule.  When it
-		 * schedules the FS might change.  We have to detect that,
-		 * and loop back to the search if the stat data item has moved
-		 */
-		bh = get_last_bh(&path);
-		ih = tp_item_head(&path);
-		copy_item_head(&tmp_ih, ih);
-		fs_gen = get_generation(inode->i_sb);
-		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
-
-		/* Stat_data item has been moved after scheduling. */
-		if (fs_changed(fs_gen, inode->i_sb)
-		    && item_moved(&tmp_ih, &path)) {
-			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
-			continue;
-		}
-		break;
-	}
-	update_stat_data(&path, inode, size);
-	journal_mark_dirty(th, bh);
-	pathrelse(&path);
-	return;
-}
-
-/*
- * reiserfs_read_locked_inode is called to read the inode off disk, and it
- * does a make_bad_inode when things go wrong.  But, we need to make sure
- * and clear the key in the private portion of the inode, otherwise a
- * corresponding iput might try to delete whatever object the inode last
- * represented.
- */
-static void reiserfs_make_bad_inode(struct inode *inode)
-{
-	memset(INODE_PKEY(inode), 0, KEY_SIZE);
-	make_bad_inode(inode);
-}
-
-/*
- * initially this function was derived from minix or ext2's analog and
- * evolved as the prototype did
- */
-int reiserfs_init_locked_inode(struct inode *inode, void *p)
-{
-	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
-	inode->i_ino = args->objectid;
-	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
-	return 0;
-}
-
-/*
- * looks for stat data in the tree, and fills up the fields of in-core
- * inode stat data fields
- */
-void reiserfs_read_locked_inode(struct inode *inode,
-				struct reiserfs_iget_args *args)
-{
-	INITIALIZE_PATH(path_to_sd);
-	struct cpu_key key;
-	unsigned long dirino;
-	int retval;
-
-	dirino = args->dirid;
-
-	/*
-	 * set version 1, version 2 could be used too, because stat data
-	 * key is the same in both versions
-	 */
-	_make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3);
-
-	/* look for the object's stat data */
-	retval = search_item(inode->i_sb, &key, &path_to_sd);
-	if (retval == IO_ERROR) {
-		reiserfs_error(inode->i_sb, "vs-13070",
-			       "i/o failure occurred trying to find "
-			       "stat data of %K", &key);
-		reiserfs_make_bad_inode(inode);
-		return;
-	}
-
-	/* a stale NFS handle can trigger this without it being an error */
-	if (retval != ITEM_FOUND) {
-		pathrelse(&path_to_sd);
-		reiserfs_make_bad_inode(inode);
-		clear_nlink(inode);
-		return;
-	}
-
-	init_inode(inode, &path_to_sd);
-
-	/*
-	 * It is possible that knfsd is trying to access inode of a file
-	 * that is being removed from the disk by some other thread. As we
-	 * update sd on unlink all that is required is to check for nlink
-	 * here. This bug was first found by Sizif when debugging
-	 * SquidNG/Butterfly, forgotten, and found again after Philippe
-	 * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
-
-	 * More logical fix would require changes in fs/inode.c:iput() to
-	 * remove inode from hash-table _after_ fs cleaned disk stuff up and
-	 * in iget() to return NULL if I_FREEING inode is found in
-	 * hash-table.
-	 */
-
-	/*
-	 * Currently there is one place where it's ok to meet inode with
-	 * nlink==0: processing of open-unlinked and half-truncated files
-	 * during mount (fs/reiserfs/super.c:finish_unfinished()).
-	 */
-	if ((inode->i_nlink == 0) &&
-	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
-		reiserfs_warning(inode->i_sb, "vs-13075",
-				 "dead inode read from disk %K. "
-				 "This is likely to be race with knfsd. Ignore",
-				 &key);
-		reiserfs_make_bad_inode(inode);
-	}
-
-	/* init inode should be relsing */
-	reiserfs_check_path(&path_to_sd);
-
-	/*
-	 * Stat data v1 doesn't support ACLs.
-	 */
-	if (get_inode_sd_version(inode) == STAT_DATA_V1)
-		cache_no_acl(inode);
-}
-
-/*
- * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
- *
- * @inode:    inode from hash table to check
- * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
- *
- * This function is called by iget5_locked() to distinguish reiserfs inodes
- * having the same inode numbers. Such inodes can only exist due to some
- * error condition. One of them should be bad. Inodes with identical
- * inode numbers (objectids) are distinguished by parent directory ids.
- *
- */
-int reiserfs_find_actor(struct inode *inode, void *opaque)
-{
-	struct reiserfs_iget_args *args;
-
-	args = opaque;
-	/* args is already in CPU order */
-	return (inode->i_ino == args->objectid) &&
-	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
-}
-
-struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
-{
-	struct inode *inode;
-	struct reiserfs_iget_args args;
-	int depth;
-
-	args.objectid = key->on_disk_key.k_objectid;
-	args.dirid = key->on_disk_key.k_dir_id;
-	depth = reiserfs_write_unlock_nested(s);
-	inode = iget5_locked(s, key->on_disk_key.k_objectid,
-			     reiserfs_find_actor, reiserfs_init_locked_inode,
-			     (void *)(&args));
-	reiserfs_write_lock_nested(s, depth);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-
-	if (inode->i_state & I_NEW) {
-		reiserfs_read_locked_inode(inode, &args);
-		unlock_new_inode(inode);
-	}
-
-	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
-		/* either due to i/o error or a stale NFS handle */
-		iput(inode);
-		inode = NULL;
-	}
-	return inode;
-}
-
-static struct dentry *reiserfs_get_dentry(struct super_block *sb,
-	u32 objectid, u32 dir_id, u32 generation)
-
-{
-	struct cpu_key key;
-	struct inode *inode;
-
-	key.on_disk_key.k_objectid = objectid;
-	key.on_disk_key.k_dir_id = dir_id;
-	reiserfs_write_lock(sb);
-	inode = reiserfs_iget(sb, &key);
-	if (inode && !IS_ERR(inode) && generation != 0 &&
-	    generation != inode->i_generation) {
-		iput(inode);
-		inode = NULL;
-	}
-	reiserfs_write_unlock(sb);
-
-	return d_obtain_alias(inode);
-}
-
-struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	/*
-	 * fhtype happens to reflect the number of u32s encoded.
-	 * due to a bug in earlier code, fhtype might indicate there
-	 * are more u32s then actually fitted.
-	 * so if fhtype seems to be more than len, reduce fhtype.
-	 * Valid types are:
-	 *   2 - objectid + dir_id - legacy support
-	 *   3 - objectid + dir_id + generation
-	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
-	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
-	 *   6 - as above plus generation of directory
-	 * 6 does not fit in NFSv2 handles
-	 */
-	if (fh_type > fh_len) {
-		if (fh_type != 6 || fh_len != 5)
-			reiserfs_warning(sb, "reiserfs-13077",
-				"nfsd/reiserfs, fhtype=%d, len=%d - odd",
-				fh_type, fh_len);
-		fh_type = fh_len;
-	}
-	if (fh_len < 2)
-		return NULL;
-
-	return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
-		(fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
-}
-
-struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	if (fh_type > fh_len)
-		fh_type = fh_len;
-	if (fh_type < 4)
-		return NULL;
-
-	return reiserfs_get_dentry(sb,
-		(fh_type >= 5) ? fid->raw[3] : fid->raw[2],
-		(fh_type >= 5) ? fid->raw[4] : fid->raw[3],
-		(fh_type == 6) ? fid->raw[5] : 0);
-}
-
-int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
-		       struct inode *parent)
-{
-	int maxlen = *lenp;
-
-	if (parent && (maxlen < 5)) {
-		*lenp = 5;
-		return FILEID_INVALID;
-	} else if (maxlen < 3) {
-		*lenp = 3;
-		return FILEID_INVALID;
-	}
-
-	data[0] = inode->i_ino;
-	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
-	data[2] = inode->i_generation;
-	*lenp = 3;
-	if (parent) {
-		data[3] = parent->i_ino;
-		data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
-		*lenp = 5;
-		if (maxlen >= 6) {
-			data[5] = parent->i_generation;
-			*lenp = 6;
-		}
-	}
-	return *lenp;
-}
-
-/*
- * looks for stat data, then copies fields to it, marks the buffer
- * containing stat data as dirty
- */
-/*
- * reiserfs inodes are never really dirty, since the dirty inode call
- * always logs them.  This call allows the VFS inode marking routines
- * to properly mark inodes for datasync and such, but only actually
- * does something when called for a synchronous update.
- */
-int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	struct reiserfs_transaction_handle th;
-	int jbegin_count = 1;
-
-	if (sb_rdonly(inode->i_sb))
-		return -EROFS;
-	/*
-	 * memory pressure can sometimes initiate write_inode calls with
-	 * sync == 1,
-	 * these cases are just when the system needs ram, not when the
-	 * inode needs to reach disk for safety, and they can safely be
-	 * ignored because the altered inode has already been logged.
-	 */
-	if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
-		reiserfs_write_lock(inode->i_sb);
-		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
-			reiserfs_update_sd(&th, inode);
-			journal_end_sync(&th);
-		}
-		reiserfs_write_unlock(inode->i_sb);
-	}
-	return 0;
-}
-
-/*
- * stat data of new object is inserted already, this inserts the item
- * containing "." and ".." entries
- */
-static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
-				  struct inode *inode,
-				  struct item_head *ih, struct treepath *path,
-				  struct inode *dir)
-{
-	struct super_block *sb = th->t_super;
-	char empty_dir[EMPTY_DIR_SIZE];
-	char *body = empty_dir;
-	struct cpu_key key;
-	int retval;
-
-	BUG_ON(!th->t_trans_id);
-
-	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
-		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
-		      TYPE_DIRENTRY, 3 /*key length */ );
-
-	/*
-	 * compose item head for new item. Directories consist of items of
-	 * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
-	 * is done by reiserfs_new_inode
-	 */
-	if (old_format_only(sb)) {
-		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
-				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
-
-		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
-				       ih->ih_key.k_objectid,
-				       INODE_PKEY(dir)->k_dir_id,
-				       INODE_PKEY(dir)->k_objectid);
-	} else {
-		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
-				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
-
-		make_empty_dir_item(body, ih->ih_key.k_dir_id,
-				    ih->ih_key.k_objectid,
-				    INODE_PKEY(dir)->k_dir_id,
-				    INODE_PKEY(dir)->k_objectid);
-	}
-
-	/* look for place in the tree for new item */
-	retval = search_item(sb, &key, path);
-	if (retval == IO_ERROR) {
-		reiserfs_error(sb, "vs-13080",
-			       "i/o failure occurred creating new directory");
-		return -EIO;
-	}
-	if (retval == ITEM_FOUND) {
-		pathrelse(path);
-		reiserfs_warning(sb, "vs-13070",
-				 "object with this key exists (%k)",
-				 &(ih->ih_key));
-		return -EEXIST;
-	}
-
-	/* insert item, that is empty directory item */
-	return reiserfs_insert_item(th, path, &key, ih, inode, body);
-}
-
-/*
- * stat data of object has been inserted, this inserts the item
- * containing the body of symlink
- */
-static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
-				struct inode *inode,
-				struct item_head *ih,
-				struct treepath *path, const char *symname,
-				int item_len)
-{
-	struct super_block *sb = th->t_super;
-	struct cpu_key key;
-	int retval;
-
-	BUG_ON(!th->t_trans_id);
-
-	_make_cpu_key(&key, KEY_FORMAT_3_5,
-		      le32_to_cpu(ih->ih_key.k_dir_id),
-		      le32_to_cpu(ih->ih_key.k_objectid),
-		      1, TYPE_DIRECT, 3 /*key length */ );
-
-	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
-			  0 /*free_space */ );
-
-	/* look for place in the tree for new item */
-	retval = search_item(sb, &key, path);
-	if (retval == IO_ERROR) {
-		reiserfs_error(sb, "vs-13080",
-			       "i/o failure occurred creating new symlink");
-		return -EIO;
-	}
-	if (retval == ITEM_FOUND) {
-		pathrelse(path);
-		reiserfs_warning(sb, "vs-13080",
-				 "object with this key exists (%k)",
-				 &(ih->ih_key));
-		return -EEXIST;
-	}
-
-	/* insert item, that is body of symlink */
-	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
-}
-
-/*
- * inserts the stat data into the tree, and then calls
- * reiserfs_new_directory (to insert ".", ".." item if new object is
- * directory) or reiserfs_new_symlink (to insert symlink body if new
- * object is symlink) or nothing (if new object is regular file)
-
- * NOTE! uid and gid must already be set in the inode.  If we return
- * non-zero due to an error, we have to drop the quota previously allocated
- * for the fresh inode.  This can only be done outside a transaction, so
- * if we return non-zero, we also end the transaction.
- *
- * @th: active transaction handle
- * @dir: parent directory for new inode
- * @mode: mode of new inode
- * @symname: symlink contents if inode is symlink
- * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for
- *         symlinks
- * @inode: inode to be filled
- * @security: optional security context to associate with this inode
- */
-int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
-		       struct inode *dir, umode_t mode, const char *symname,
-		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
-		          strlen (symname) for symlinks) */
-		       loff_t i_size, struct dentry *dentry,
-		       struct inode *inode,
-		       struct reiserfs_security_handle *security)
-{
-	struct super_block *sb = dir->i_sb;
-	struct reiserfs_iget_args args;
-	INITIALIZE_PATH(path_to_key);
-	struct cpu_key key;
-	struct item_head ih;
-	struct stat_data sd;
-	int retval;
-	int err;
-	int depth;
-
-	BUG_ON(!th->t_trans_id);
-
-	depth = reiserfs_write_unlock_nested(sb);
-	err = dquot_alloc_inode(inode);
-	reiserfs_write_lock_nested(sb, depth);
-	if (err)
-		goto out_end_trans;
-	if (!dir->i_nlink) {
-		err = -EPERM;
-		goto out_bad_inode;
-	}
-
-	/* item head of new item */
-	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
-	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
-	if (!ih.ih_key.k_objectid) {
-		err = -ENOMEM;
-		goto out_bad_inode;
-	}
-	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-	if (old_format_only(sb))
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
-	else
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-	memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
-	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
-
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	err = insert_inode_locked4(inode, args.objectid,
-			     reiserfs_find_actor, &args);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-	if (err) {
-		err = -EINVAL;
-		goto out_bad_inode;
-	}
-
-	if (old_format_only(sb))
-		/*
-		 * not a perfect generation count, as object ids can be reused,
-		 * but this is as good as reiserfs can do right now.
-		 * note that the private part of inode isn't filled in yet,
-		 * we have to use the directory.
-		 */
-		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
-	else
-#if defined( USE_INODE_GENERATION_COUNTER )
-		inode->i_generation =
-		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
-#else
-		inode->i_generation = ++event;
-#endif
-
-	/* fill stat data */
-	set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
-
-	/* uid and gid must already be set by the caller for quota init */
-
-	simple_inode_init_ts(inode);
-	inode->i_size = i_size;
-	inode->i_blocks = 0;
-	inode->i_bytes = 0;
-	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
-	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
-
-	INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
-	REISERFS_I(inode)->i_flags = 0;
-	REISERFS_I(inode)->i_prealloc_block = 0;
-	REISERFS_I(inode)->i_prealloc_count = 0;
-	REISERFS_I(inode)->i_trans_id = 0;
-	REISERFS_I(inode)->i_jl = NULL;
-	REISERFS_I(inode)->i_attrs =
-	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
-	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
-	reiserfs_init_xattr_rwsem(inode);
-
-	/* key to search for correct place for new stat data */
-	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
-		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
-		      TYPE_STAT_DATA, 3 /*key length */ );
-
-	/* find proper place for inserting of stat data */
-	retval = search_item(sb, &key, &path_to_key);
-	if (retval == IO_ERROR) {
-		err = -EIO;
-		goto out_bad_inode;
-	}
-	if (retval == ITEM_FOUND) {
-		pathrelse(&path_to_key);
-		err = -EEXIST;
-		goto out_bad_inode;
-	}
-	if (old_format_only(sb)) {
-		/* i_uid or i_gid is too big to be stored in stat data v3.5 */
-		if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
-			pathrelse(&path_to_key);
-			err = -EINVAL;
-			goto out_bad_inode;
-		}
-		inode2sd_v1(&sd, inode, inode->i_size);
-	} else {
-		inode2sd(&sd, inode, inode->i_size);
-	}
-	/*
-	 * store in in-core inode the key of stat data and version all
-	 * object items will have (directory items will have old offset
-	 * format, other new objects will consist of new items)
-	 */
-	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
-		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
-	else
-		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
-	if (old_format_only(sb))
-		set_inode_sd_version(inode, STAT_DATA_V1);
-	else
-		set_inode_sd_version(inode, STAT_DATA_V2);
-
-	/* insert the stat data into the tree */
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	if (REISERFS_I(dir)->new_packing_locality)
-		th->displace_new_blocks = 1;
-#endif
-	retval =
-	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
-				 (char *)(&sd));
-	if (retval) {
-		err = retval;
-		reiserfs_check_path(&path_to_key);
-		goto out_bad_inode;
-	}
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	if (!th->displace_new_blocks)
-		REISERFS_I(dir)->new_packing_locality = 0;
-#endif
-	if (S_ISDIR(mode)) {
-		/* insert item with "." and ".." */
-		retval =
-		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
-	}
-
-	if (S_ISLNK(mode)) {
-		/* insert body of symlink */
-		if (!old_format_only(sb))
-			i_size = ROUND_UP(i_size);
-		retval =
-		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
-					 i_size);
-	}
-	if (retval) {
-		err = retval;
-		reiserfs_check_path(&path_to_key);
-		journal_end(th);
-		goto out_inserted_sd;
-	}
-
-	/*
-	 * Mark it private if we're creating the privroot
-	 * or something under it.
-	 */
-	if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root)
-		reiserfs_init_priv_inode(inode);
-
-	if (reiserfs_posixacl(inode->i_sb)) {
-		reiserfs_write_unlock(inode->i_sb);
-		retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
-		reiserfs_write_lock(inode->i_sb);
-		if (retval) {
-			err = retval;
-			reiserfs_check_path(&path_to_key);
-			journal_end(th);
-			goto out_inserted_sd;
-		}
-	} else if (inode->i_sb->s_flags & SB_POSIXACL) {
-		reiserfs_warning(inode->i_sb, "jdm-13090",
-				 "ACLs aren't enabled in the fs, "
-				 "but vfs thinks they are!");
-	}
-
-	if (security->name) {
-		reiserfs_write_unlock(inode->i_sb);
-		retval = reiserfs_security_write(th, inode, security);
-		reiserfs_write_lock(inode->i_sb);
-		if (retval) {
-			err = retval;
-			reiserfs_check_path(&path_to_key);
-			retval = journal_end(th);
-			if (retval)
-				err = retval;
-			goto out_inserted_sd;
-		}
-	}
-
-	reiserfs_update_sd(th, inode);
-	reiserfs_check_path(&path_to_key);
-
-	return 0;
-
-out_bad_inode:
-	/* Invalidate the object, nothing was inserted yet */
-	INODE_PKEY(inode)->k_objectid = 0;
-
-	/* Quota change must be inside a transaction for journaling */
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	dquot_free_inode(inode);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-
-out_end_trans:
-	journal_end(th);
-	/*
-	 * Drop can be outside and it needs more credits so it's better
-	 * to have it outside
-	 */
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	dquot_drop(inode);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-	inode->i_flags |= S_NOQUOTA;
-	make_bad_inode(inode);
-
-out_inserted_sd:
-	clear_nlink(inode);
-	th->t_trans_id = 0;	/* so the caller can't use this handle later */
-	if (inode->i_state & I_NEW)
-		unlock_new_inode(inode);
-	iput(inode);
-	return err;
-}
-
-/*
- * finds the tail page in the page cache,
- * reads the last block in.
- *
- * On success, page_result is set to a locked, pinned page, and bh_result
- * is set to an up to date buffer for the last block in the file.  returns 0.
- *
- * tail conversion is not done, so bh_result might not be valid for writing
- * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
- * trying to write the block.
- *
- * on failure, nonzero is returned, page_result and bh_result are untouched.
- */
-static int grab_tail_page(struct inode *inode,
-			  struct page **page_result,
-			  struct buffer_head **bh_result)
-{
-
-	/*
-	 * we want the page with the last byte in the file,
-	 * not the page that will hold the next byte for appending
-	 */
-	unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT;
-	unsigned long pos = 0;
-	unsigned long start = 0;
-	unsigned long blocksize = inode->i_sb->s_blocksize;
-	unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	struct folio *folio;
-	int error;
-
-	/*
-	 * we know that we are only called with inode->i_size > 0.
-	 * we also know that a file tail can never be as big as a block
-	 * If i_size % blocksize == 0, our file is currently block aligned
-	 * and it won't need converting or zeroing after a truncate.
-	 */
-	if ((offset & (blocksize - 1)) == 0) {
-		return -ENOENT;
-	}
-	folio = __filemap_get_folio(inode->i_mapping, index,
-			FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
-			mapping_gfp_mask(inode->i_mapping));
-	if (IS_ERR(folio))
-		return PTR_ERR(folio);
-	/* start within the page of the last block in the file */
-	start = (offset / blocksize) * blocksize;
-
-	error = __block_write_begin(folio, start, offset - start,
-				    reiserfs_get_block_create_0);
-	if (error)
-		goto unlock;
-
-	head = folio_buffers(folio);
-	bh = head;
-	do {
-		if (pos >= start) {
-			break;
-		}
-		bh = bh->b_this_page;
-		pos += blocksize;
-	} while (bh != head);
-
-	if (!buffer_uptodate(bh)) {
-		/*
-		 * note, this should never happen, prepare_write should be
-		 * taking care of this for us.  If the buffer isn't up to
-		 * date, I've screwed up the code to find the buffer, or the
-		 * code to call prepare_write
-		 */
-		reiserfs_error(inode->i_sb, "clm-6000",
-			       "error reading block %lu", bh->b_blocknr);
-		error = -EIO;
-		goto unlock;
-	}
-	*bh_result = bh;
-	*page_result = &folio->page;
-
-	return error;
-
-unlock:
-	folio_unlock(folio);
-	folio_put(folio);
-	return error;
-}
-
-/*
- * vfs version of truncate file.  Must NOT be called with
- * a transaction already started.
- *
- * some code taken from block_truncate_page
- */
-int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
-{
-	struct reiserfs_transaction_handle th;
-	/* we want the offset for the first byte after the end of the file */
-	unsigned long offset = inode->i_size & (PAGE_SIZE - 1);
-	unsigned blocksize = inode->i_sb->s_blocksize;
-	unsigned length;
-	struct page *page = NULL;
-	int error;
-	struct buffer_head *bh = NULL;
-	int err2;
-
-	reiserfs_write_lock(inode->i_sb);
-
-	if (inode->i_size > 0) {
-		error = grab_tail_page(inode, &page, &bh);
-		if (error) {
-			/*
-			 * -ENOENT means we truncated past the end of the
-			 * file, and get_block_create_0 could not find a
-			 * block to read in, which is ok.
-			 */
-			if (error != -ENOENT)
-				reiserfs_error(inode->i_sb, "clm-6001",
-					       "grab_tail_page failed %d",
-					       error);
-			page = NULL;
-			bh = NULL;
-		}
-	}
-
-	/*
-	 * so, if page != NULL, we have a buffer head for the offset at
-	 * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
-	 * then we have an unformatted node.  Otherwise, we have a direct item,
-	 * and no zeroing is required on disk.  We zero after the truncate,
-	 * because the truncate might pack the item anyway
-	 * (it will unmap bh if it packs).
-	 *
-	 * it is enough to reserve space in transaction for 2 balancings:
-	 * one for "save" link adding and another for the first
-	 * cut_from_item. 1 is for update_sd
-	 */
-	error = journal_begin(&th, inode->i_sb,
-			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
-	if (error)
-		goto out;
-	reiserfs_update_inode_transaction(inode);
-	if (update_timestamps)
-		/*
-		 * we are doing real truncate: if the system crashes
-		 * before the last transaction of truncating gets committed
-		 * - on reboot the file either appears truncated properly
-		 * or not truncated at all
-		 */
-		add_save_link(&th, inode, 1);
-	err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
-	error = journal_end(&th);
-	if (error)
-		goto out;
-
-	/* check reiserfs_do_truncate after ending the transaction */
-	if (err2) {
-		error = err2;
-  		goto out;
-	}
-	
-	if (update_timestamps) {
-		error = remove_save_link(inode, 1 /* truncate */);
-		if (error)
-			goto out;
-	}
-
-	if (page) {
-		length = offset & (blocksize - 1);
-		/* if we are not on a block boundary */
-		if (length) {
-			length = blocksize - length;
-			zero_user(page, offset, length);
-			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
-				mark_buffer_dirty(bh);
-			}
-		}
-		unlock_page(page);
-		put_page(page);
-	}
-
-	reiserfs_write_unlock(inode->i_sb);
-
-	return 0;
-out:
-	if (page) {
-		unlock_page(page);
-		put_page(page);
-	}
-
-	reiserfs_write_unlock(inode->i_sb);
-
-	return error;
-}
-
-static int map_block_for_writepage(struct inode *inode,
-				   struct buffer_head *bh_result,
-				   unsigned long block)
-{
-	struct reiserfs_transaction_handle th;
-	int fs_gen;
-	struct item_head tmp_ih;
-	struct item_head *ih;
-	struct buffer_head *bh;
-	__le32 *item;
-	struct cpu_key key;
-	INITIALIZE_PATH(path);
-	int pos_in_item;
-	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
-	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
-	int retval;
-	int use_get_block = 0;
-	int bytes_copied = 0;
-	int copy_size;
-	int trans_running = 0;
-
-	/*
-	 * catch places below that try to log something without
-	 * starting a trans
-	 */
-	th.t_trans_id = 0;
-
-	if (!buffer_uptodate(bh_result)) {
-		return -EIO;
-	}
-
-	kmap(bh_result->b_page);
-start_over:
-	reiserfs_write_lock(inode->i_sb);
-	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
-
-research:
-	retval = search_for_position_by_key(inode->i_sb, &key, &path);
-	if (retval != POSITION_FOUND) {
-		use_get_block = 1;
-		goto out;
-	}
-
-	bh = get_last_bh(&path);
-	ih = tp_item_head(&path);
-	item = tp_item_body(&path);
-	pos_in_item = path.pos_in_item;
-
-	/* we've found an unformatted node */
-	if (indirect_item_found(retval, ih)) {
-		if (bytes_copied > 0) {
-			reiserfs_warning(inode->i_sb, "clm-6002",
-					 "bytes_copied %d", bytes_copied);
-		}
-		if (!get_block_num(item, pos_in_item)) {
-			/* crap, we are writing to a hole */
-			use_get_block = 1;
-			goto out;
-		}
-		set_block_dev_mapped(bh_result,
-				     get_block_num(item, pos_in_item), inode);
-	} else if (is_direct_le_ih(ih)) {
-		char *p;
-		p = page_address(bh_result->b_page);
-		p += (byte_offset - 1) & (PAGE_SIZE - 1);
-		copy_size = ih_item_len(ih) - pos_in_item;
-
-		fs_gen = get_generation(inode->i_sb);
-		copy_item_head(&tmp_ih, ih);
-
-		if (!trans_running) {
-			/* vs-3050 is gone, no need to drop the path */
-			retval = journal_begin(&th, inode->i_sb, jbegin_count);
-			if (retval)
-				goto out;
-			reiserfs_update_inode_transaction(inode);
-			trans_running = 1;
-			if (fs_changed(fs_gen, inode->i_sb)
-			    && item_moved(&tmp_ih, &path)) {
-				reiserfs_restore_prepared_buffer(inode->i_sb,
-								 bh);
-				goto research;
-			}
-		}
-
-		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
-
-		if (fs_changed(fs_gen, inode->i_sb)
-		    && item_moved(&tmp_ih, &path)) {
-			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
-			goto research;
-		}
-
-		memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
-		       copy_size);
-
-		journal_mark_dirty(&th, bh);
-		bytes_copied += copy_size;
-		set_block_dev_mapped(bh_result, 0, inode);
-
-		/* are there still bytes left? */
-		if (bytes_copied < bh_result->b_size &&
-		    (byte_offset + bytes_copied) < inode->i_size) {
-			set_cpu_key_k_offset(&key,
-					     cpu_key_k_offset(&key) +
-					     copy_size);
-			goto research;
-		}
-	} else {
-		reiserfs_warning(inode->i_sb, "clm-6003",
-				 "bad item inode %lu", inode->i_ino);
-		retval = -EIO;
-		goto out;
-	}
-	retval = 0;
-
-out:
-	pathrelse(&path);
-	if (trans_running) {
-		int err = journal_end(&th);
-		if (err)
-			retval = err;
-		trans_running = 0;
-	}
-	reiserfs_write_unlock(inode->i_sb);
-
-	/* this is where we fill in holes in the file. */
-	if (use_get_block) {
-		retval = reiserfs_get_block(inode, block, bh_result,
-					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
-					    | GET_BLOCK_NO_DANGLE);
-		if (!retval) {
-			if (!buffer_mapped(bh_result)
-			    || bh_result->b_blocknr == 0) {
-				/* get_block failed to find a mapped unformatted node. */
-				use_get_block = 0;
-				goto start_over;
-			}
-		}
-	}
-	kunmap(bh_result->b_page);
-
-	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
-		/*
-		 * we've copied data from the page into the direct item, so the
-		 * buffer in the page is now clean, mark it to reflect that.
-		 */
-		lock_buffer(bh_result);
-		clear_buffer_dirty(bh_result);
-		unlock_buffer(bh_result);
-	}
-	return retval;
-}
-
-/*
- * mason@suse.com: updated in 2.5.54 to follow the same general io
- * start/recovery path as __block_write_full_folio, along with special
- * code to handle reiserfs tails.
- */
-static int reiserfs_write_folio(struct folio *folio,
-		struct writeback_control *wbc, void *data)
-{
-	struct inode *inode = folio->mapping->host;
-	unsigned long end_index = inode->i_size >> PAGE_SHIFT;
-	int error = 0;
-	unsigned long block;
-	sector_t last_block;
-	struct buffer_head *head, *bh;
-	int partial = 0;
-	int nr = 0;
-	int checked = folio_test_checked(folio);
-	struct reiserfs_transaction_handle th;
-	struct super_block *s = inode->i_sb;
-	int bh_per_page = PAGE_SIZE / s->s_blocksize;
-	th.t_trans_id = 0;
-
-	/* no logging allowed when nonblocking or from PF_MEMALLOC */
-	if (checked && (current->flags & PF_MEMALLOC)) {
-		folio_redirty_for_writepage(wbc, folio);
-		folio_unlock(folio);
-		return 0;
-	}
-
-	/*
-	 * The folio dirty bit is cleared before writepage is called, which
-	 * means we have to tell create_empty_buffers to make dirty buffers
-	 * The folio really should be up to date at this point, so tossing
-	 * in the BH_Uptodate is just a sanity check.
-	 */
-	head = folio_buffers(folio);
-	if (!head)
-		head = create_empty_buffers(folio, s->s_blocksize,
-				     (1 << BH_Dirty) | (1 << BH_Uptodate));
-
-	/*
-	 * last folio in the file, zero out any contents past the
-	 * last byte in the file
-	 */
-	if (folio->index >= end_index) {
-		unsigned last_offset;
-
-		last_offset = inode->i_size & (PAGE_SIZE - 1);
-		/* no file contents in this folio */
-		if (folio->index >= end_index + 1 || !last_offset) {
-			folio_unlock(folio);
-			return 0;
-		}
-		folio_zero_segment(folio, last_offset, folio_size(folio));
-	}
-	bh = head;
-	block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits);
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-	/* first map all the buffers, logging any direct items we find */
-	do {
-		if (block > last_block) {
-			/*
-			 * This can happen when the block size is less than
-			 * the folio size.  The corresponding bytes in the folio
-			 * were zero filled above
-			 */
-			clear_buffer_dirty(bh);
-			set_buffer_uptodate(bh);
-		} else if ((checked || buffer_dirty(bh)) &&
-			   (!buffer_mapped(bh) || bh->b_blocknr == 0)) {
-			/*
-			 * not mapped yet, or it points to a direct item, search
-			 * the btree for the mapping info, and log any direct
-			 * items found
-			 */
-			if ((error = map_block_for_writepage(inode, bh, block))) {
-				goto fail;
-			}
-		}
-		bh = bh->b_this_page;
-		block++;
-	} while (bh != head);
-
-	/*
-	 * we start the transaction after map_block_for_writepage,
-	 * because it can create holes in the file (an unbounded operation).
-	 * starting it here, we can make a reliable estimate for how many
-	 * blocks we're going to log
-	 */
-	if (checked) {
-		folio_clear_checked(folio);
-		reiserfs_write_lock(s);
-		error = journal_begin(&th, s, bh_per_page + 1);
-		if (error) {
-			reiserfs_write_unlock(s);
-			goto fail;
-		}
-		reiserfs_update_inode_transaction(inode);
-	}
-	/* now go through and lock any dirty buffers on the folio */
-	do {
-		get_bh(bh);
-		if (!buffer_mapped(bh))
-			continue;
-		if (buffer_mapped(bh) && bh->b_blocknr == 0)
-			continue;
-
-		if (checked) {
-			reiserfs_prepare_for_journal(s, bh, 1);
-			journal_mark_dirty(&th, bh);
-			continue;
-		}
-		/*
-		 * from this point on, we know the buffer is mapped to a
-		 * real block and not a direct item
-		 */
-		if (wbc->sync_mode != WB_SYNC_NONE) {
-			lock_buffer(bh);
-		} else {
-			if (!trylock_buffer(bh)) {
-				folio_redirty_for_writepage(wbc, folio);
-				continue;
-			}
-		}
-		if (test_clear_buffer_dirty(bh)) {
-			mark_buffer_async_write(bh);
-		} else {
-			unlock_buffer(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-
-	if (checked) {
-		error = journal_end(&th);
-		reiserfs_write_unlock(s);
-		if (error)
-			goto fail;
-	}
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-
-	/*
-	 * since any buffer might be the only dirty buffer on the folio,
-	 * the first submit_bh can bring the folio out of writeback.
-	 * be careful with the buffers.
-	 */
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			submit_bh(REQ_OP_WRITE, bh);
-			nr++;
-		}
-		put_bh(bh);
-		bh = next;
-	} while (bh != head);
-
-	error = 0;
-done:
-	if (nr == 0) {
-		/*
-		 * if this folio only had a direct item, it is very possible for
-		 * no io to be required without there being an error.  Or,
-		 * someone else could have locked them and sent them down the
-		 * pipe without locking the folio
-		 */
-		bh = head;
-		do {
-			if (!buffer_uptodate(bh)) {
-				partial = 1;
-				break;
-			}
-			bh = bh->b_this_page;
-		} while (bh != head);
-		if (!partial)
-			folio_mark_uptodate(folio);
-		folio_end_writeback(folio);
-	}
-	return error;
-
-fail:
-	/*
-	 * catches various errors, we need to make sure any valid dirty blocks
-	 * get to the media.  The folio is currently locked and not marked for
-	 * writeback
-	 */
-	folio_clear_uptodate(folio);
-	bh = head;
-	do {
-		get_bh(bh);
-		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
-			lock_buffer(bh);
-			mark_buffer_async_write(bh);
-		} else {
-			/*
-			 * clear any dirty bits that might have come from
-			 * getting attached to a dirty folio
-			 */
-			clear_buffer_dirty(bh);
-		}
-		bh = bh->b_this_page;
-	} while (bh != head);
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			clear_buffer_dirty(bh);
-			submit_bh(REQ_OP_WRITE, bh);
-			nr++;
-		}
-		put_bh(bh);
-		bh = next;
-	} while (bh != head);
-	goto done;
-}
-
-static int reiserfs_read_folio(struct file *f, struct folio *folio)
-{
-	return block_read_full_folio(folio, reiserfs_get_block);
-}
-
-static int reiserfs_writepages(struct address_space *mapping,
-		struct writeback_control *wbc)
-{
-	reiserfs_wait_on_write_block(mapping->host->i_sb);
-	return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL);
-}
-
-static void reiserfs_truncate_failed_write(struct inode *inode)
-{
-	truncate_inode_pages(inode->i_mapping, inode->i_size);
-	reiserfs_truncate_file(inode, 0);
-}
-
-static int reiserfs_write_begin(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len,
-				struct folio **foliop, void **fsdata)
-{
-	struct inode *inode;
-	struct folio *folio;
-	pgoff_t index;
-	int ret;
-	int old_ref = 0;
-
- 	inode = mapping->host;
-	index = pos >> PAGE_SHIFT;
-	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
-			mapping_gfp_mask(mapping));
-	if (IS_ERR(folio))
-		return PTR_ERR(folio);
-	*foliop = folio;
-
-	reiserfs_wait_on_write_block(inode->i_sb);
-	fix_tail_page_for_writing(&folio->page);
-	if (reiserfs_transaction_running(inode->i_sb)) {
-		struct reiserfs_transaction_handle *th;
-		th = (struct reiserfs_transaction_handle *)current->
-		    journal_info;
-		BUG_ON(!th->t_refcount);
-		BUG_ON(!th->t_trans_id);
-		old_ref = th->t_refcount;
-		th->t_refcount++;
-	}
-	ret = __block_write_begin(folio, pos, len, reiserfs_get_block);
-	if (ret && reiserfs_transaction_running(inode->i_sb)) {
-		struct reiserfs_transaction_handle *th = current->journal_info;
-		/*
-		 * this gets a little ugly.  If reiserfs_get_block returned an
-		 * error and left a transacstion running, we've got to close
-		 * it, and we've got to free handle if it was a persistent
-		 * transaction.
-		 *
-		 * But, if we had nested into an existing transaction, we need
-		 * to just drop the ref count on the handle.
-		 *
-		 * If old_ref == 0, the transaction is from reiserfs_get_block,
-		 * and it was a persistent trans.  Otherwise, it was nested
-		 * above.
-		 */
-		if (th->t_refcount > old_ref) {
-			if (old_ref)
-				th->t_refcount--;
-			else {
-				int err;
-				reiserfs_write_lock(inode->i_sb);
-				err = reiserfs_end_persistent_transaction(th);
-				reiserfs_write_unlock(inode->i_sb);
-				if (err)
-					ret = err;
-			}
-		}
-	}
-	if (ret) {
-		folio_unlock(folio);
-		folio_put(folio);
-		/* Truncate allocated blocks */
-		reiserfs_truncate_failed_write(inode);
-	}
-	return ret;
-}
-
-int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
-{
-	struct inode *inode = page->mapping->host;
-	int ret;
-	int old_ref = 0;
-	int depth;
-
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	reiserfs_wait_on_write_block(inode->i_sb);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-
-	fix_tail_page_for_writing(page);
-	if (reiserfs_transaction_running(inode->i_sb)) {
-		struct reiserfs_transaction_handle *th;
-		th = (struct reiserfs_transaction_handle *)current->
-		    journal_info;
-		BUG_ON(!th->t_refcount);
-		BUG_ON(!th->t_trans_id);
-		old_ref = th->t_refcount;
-		th->t_refcount++;
-	}
-
-	ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block);
-	if (ret && reiserfs_transaction_running(inode->i_sb)) {
-		struct reiserfs_transaction_handle *th = current->journal_info;
-		/*
-		 * this gets a little ugly.  If reiserfs_get_block returned an
-		 * error and left a transacstion running, we've got to close
-		 * it, and we've got to free handle if it was a persistent
-		 * transaction.
-		 *
-		 * But, if we had nested into an existing transaction, we need
-		 * to just drop the ref count on the handle.
-		 *
-		 * If old_ref == 0, the transaction is from reiserfs_get_block,
-		 * and it was a persistent trans.  Otherwise, it was nested
-		 * above.
-		 */
-		if (th->t_refcount > old_ref) {
-			if (old_ref)
-				th->t_refcount--;
-			else {
-				int err;
-				reiserfs_write_lock(inode->i_sb);
-				err = reiserfs_end_persistent_transaction(th);
-				reiserfs_write_unlock(inode->i_sb);
-				if (err)
-					ret = err;
-			}
-		}
-	}
-	return ret;
-
-}
-
-static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
-{
-	return generic_block_bmap(as, block, reiserfs_bmap);
-}
-
-static int reiserfs_write_end(struct file *file, struct address_space *mapping,
-			      loff_t pos, unsigned len, unsigned copied,
-			      struct folio *folio, void *fsdata)
-{
-	struct inode *inode = folio->mapping->host;
-	int ret = 0;
-	int update_sd = 0;
-	struct reiserfs_transaction_handle *th;
-	unsigned start;
-	bool locked = false;
-
-	reiserfs_wait_on_write_block(inode->i_sb);
-	if (reiserfs_transaction_running(inode->i_sb))
-		th = current->journal_info;
-	else
-		th = NULL;
-
-	start = pos & (PAGE_SIZE - 1);
-	if (unlikely(copied < len)) {
-		if (!folio_test_uptodate(folio))
-			copied = 0;
-
-		folio_zero_new_buffers(folio, start + copied, start + len);
-	}
-	flush_dcache_folio(folio);
-
-	reiserfs_commit_page(inode, &folio->page, start, start + copied);
-
-	/*
-	 * generic_commit_write does this for us, but does not update the
-	 * transaction tracking stuff when the size changes.  So, we have
-	 * to do the i_size updates here.
-	 */
-	if (pos + copied > inode->i_size) {
-		struct reiserfs_transaction_handle myth;
-		reiserfs_write_lock(inode->i_sb);
-		locked = true;
-		/*
-		 * If the file have grown beyond the border where it
-		 * can have a tail, unmark it as needing a tail
-		 * packing
-		 */
-		if ((have_large_tails(inode->i_sb)
-		     && inode->i_size > i_block_size(inode) * 4)
-		    || (have_small_tails(inode->i_sb)
-			&& inode->i_size > i_block_size(inode)))
-			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
-		ret = journal_begin(&myth, inode->i_sb, 1);
-		if (ret)
-			goto journal_error;
-
-		reiserfs_update_inode_transaction(inode);
-		inode->i_size = pos + copied;
-		/*
-		 * this will just nest into our transaction.  It's important
-		 * to use mark_inode_dirty so the inode gets pushed around on
-		 * the dirty lists, and so that O_SYNC works as expected
-		 */
-		mark_inode_dirty(inode);
-		reiserfs_update_sd(&myth, inode);
-		update_sd = 1;
-		ret = journal_end(&myth);
-		if (ret)
-			goto journal_error;
-	}
-	if (th) {
-		if (!locked) {
-			reiserfs_write_lock(inode->i_sb);
-			locked = true;
-		}
-		if (!update_sd)
-			mark_inode_dirty(inode);
-		ret = reiserfs_end_persistent_transaction(th);
-		if (ret)
-			goto out;
-	}
-
-out:
-	if (locked)
-		reiserfs_write_unlock(inode->i_sb);
-	folio_unlock(folio);
-	folio_put(folio);
-
-	if (pos + len > inode->i_size)
-		reiserfs_truncate_failed_write(inode);
-
-	return ret == 0 ? copied : ret;
-
-journal_error:
-	reiserfs_write_unlock(inode->i_sb);
-	locked = false;
-	if (th) {
-		if (!update_sd)
-			reiserfs_update_sd(th, inode);
-		ret = reiserfs_end_persistent_transaction(th);
-	}
-	goto out;
-}
-
-int reiserfs_commit_write(struct file *f, struct page *page,
-			  unsigned from, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to;
-	int ret = 0;
-	int update_sd = 0;
-	struct reiserfs_transaction_handle *th = NULL;
-	int depth;
-
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	reiserfs_wait_on_write_block(inode->i_sb);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-
-	if (reiserfs_transaction_running(inode->i_sb)) {
-		th = current->journal_info;
-	}
-	reiserfs_commit_page(inode, page, from, to);
-
-	/*
-	 * generic_commit_write does this for us, but does not update the
-	 * transaction tracking stuff when the size changes.  So, we have
-	 * to do the i_size updates here.
-	 */
-	if (pos > inode->i_size) {
-		struct reiserfs_transaction_handle myth;
-		/*
-		 * If the file have grown beyond the border where it
-		 * can have a tail, unmark it as needing a tail
-		 * packing
-		 */
-		if ((have_large_tails(inode->i_sb)
-		     && inode->i_size > i_block_size(inode) * 4)
-		    || (have_small_tails(inode->i_sb)
-			&& inode->i_size > i_block_size(inode)))
-			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
-		ret = journal_begin(&myth, inode->i_sb, 1);
-		if (ret)
-			goto journal_error;
-
-		reiserfs_update_inode_transaction(inode);
-		inode->i_size = pos;
-		/*
-		 * this will just nest into our transaction.  It's important
-		 * to use mark_inode_dirty so the inode gets pushed around
-		 * on the dirty lists, and so that O_SYNC works as expected
-		 */
-		mark_inode_dirty(inode);
-		reiserfs_update_sd(&myth, inode);
-		update_sd = 1;
-		ret = journal_end(&myth);
-		if (ret)
-			goto journal_error;
-	}
-	if (th) {
-		if (!update_sd)
-			mark_inode_dirty(inode);
-		ret = reiserfs_end_persistent_transaction(th);
-		if (ret)
-			goto out;
-	}
-
-out:
-	return ret;
-
-journal_error:
-	if (th) {
-		if (!update_sd)
-			reiserfs_update_sd(th, inode);
-		ret = reiserfs_end_persistent_transaction(th);
-	}
-
-	return ret;
-}
-
-void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
-{
-	if (reiserfs_attrs(inode->i_sb)) {
-		if (sd_attrs & REISERFS_SYNC_FL)
-			inode->i_flags |= S_SYNC;
-		else
-			inode->i_flags &= ~S_SYNC;
-		if (sd_attrs & REISERFS_IMMUTABLE_FL)
-			inode->i_flags |= S_IMMUTABLE;
-		else
-			inode->i_flags &= ~S_IMMUTABLE;
-		if (sd_attrs & REISERFS_APPEND_FL)
-			inode->i_flags |= S_APPEND;
-		else
-			inode->i_flags &= ~S_APPEND;
-		if (sd_attrs & REISERFS_NOATIME_FL)
-			inode->i_flags |= S_NOATIME;
-		else
-			inode->i_flags &= ~S_NOATIME;
-		if (sd_attrs & REISERFS_NOTAIL_FL)
-			REISERFS_I(inode)->i_flags |= i_nopack_mask;
-		else
-			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
-	}
-}
-
-/*
- * decide if this buffer needs to stay around for data logging or ordered
- * write purposes
- */
-static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh)
-{
-	int ret = 1;
-	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
-
-	lock_buffer(bh);
-	spin_lock(&j->j_dirty_buffers_lock);
-	if (!buffer_mapped(bh)) {
-		goto free_jh;
-	}
-	/*
-	 * the page is locked, and the only places that log a data buffer
-	 * also lock the page.
-	 */
-	if (reiserfs_file_data_log(inode)) {
-		/*
-		 * very conservative, leave the buffer pinned if
-		 * anyone might need it.
-		 */
-		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
-			ret = 0;
-		}
-	} else  if (buffer_dirty(bh)) {
-		struct reiserfs_journal_list *jl;
-		struct reiserfs_jh *jh = bh->b_private;
-
-		/*
-		 * why is this safe?
-		 * reiserfs_setattr updates i_size in the on disk
-		 * stat data before allowing vmtruncate to be called.
-		 *
-		 * If buffer was put onto the ordered list for this
-		 * transaction, we know for sure either this transaction
-		 * or an older one already has updated i_size on disk,
-		 * and this ordered data won't be referenced in the file
-		 * if we crash.
-		 *
-		 * if the buffer was put onto the ordered list for an older
-		 * transaction, we need to leave it around
-		 */
-		if (jh && (jl = jh->jl)
-		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
-			ret = 0;
-	}
-free_jh:
-	if (ret && bh->b_private) {
-		reiserfs_free_jh(bh);
-	}
-	spin_unlock(&j->j_dirty_buffers_lock);
-	unlock_buffer(bh);
-	return ret;
-}
-
-/* clm -- taken from fs/buffer.c:block_invalidate_folio */
-static void reiserfs_invalidate_folio(struct folio *folio, size_t offset,
-				    size_t length)
-{
-	struct buffer_head *head, *bh, *next;
-	struct inode *inode = folio->mapping->host;
-	unsigned int curr_off = 0;
-	unsigned int stop = offset + length;
-	int partial_page = (offset || length < folio_size(folio));
-	int ret = 1;
-
-	BUG_ON(!folio_test_locked(folio));
-
-	if (!partial_page)
-		folio_clear_checked(folio);
-
-	head = folio_buffers(folio);
-	if (!head)
-		goto out;
-
-	bh = head;
-	do {
-		unsigned int next_off = curr_off + bh->b_size;
-		next = bh->b_this_page;
-
-		if (next_off > stop)
-			goto out;
-
-		/*
-		 * is this block fully invalidated?
-		 */
-		if (offset <= curr_off) {
-			if (invalidate_folio_can_drop(inode, bh))
-				reiserfs_unmap_buffer(bh);
-			else
-				ret = 0;
-		}
-		curr_off = next_off;
-		bh = next;
-	} while (bh != head);
-
-	/*
-	 * We release buffers only if the entire page is being invalidated.
-	 * The get_block cached value has been unconditionally invalidated,
-	 * so real IO is not possible anymore.
-	 */
-	if (!partial_page && ret) {
-		ret = filemap_release_folio(folio, 0);
-		/* maybe should BUG_ON(!ret); - neilb */
-	}
-out:
-	return;
-}
-
-static bool reiserfs_dirty_folio(struct address_space *mapping,
-		struct folio *folio)
-{
-	if (reiserfs_file_data_log(mapping->host)) {
-		folio_set_checked(folio);
-		return filemap_dirty_folio(mapping, folio);
-	}
-	return block_dirty_folio(mapping, folio);
-}
-
-/*
- * Returns true if the folio's buffers were dropped.  The folio is locked.
- *
- * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
- * in the buffers at folio_buffers(folio).
- *
- * even in -o notail mode, we can't be sure an old mount without -o notail
- * didn't create files with tails.
- */
-static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
-{
-	struct inode *inode = folio->mapping->host;
-	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
-	struct buffer_head *head;
-	struct buffer_head *bh;
-	bool ret = true;
-
-	WARN_ON(folio_test_checked(folio));
-	spin_lock(&j->j_dirty_buffers_lock);
-	head = folio_buffers(folio);
-	bh = head;
-	do {
-		if (bh->b_private) {
-			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
-				reiserfs_free_jh(bh);
-			} else {
-				ret = false;
-				break;
-			}
-		}
-		bh = bh->b_this_page;
-	} while (bh != head);
-	if (ret)
-		ret = try_to_free_buffers(folio);
-	spin_unlock(&j->j_dirty_buffers_lock);
-	return ret;
-}
-
-/*
- * We thank Mingming Cao for helping us understand in great detail what
- * to do in this section of the code.
- */
-static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	size_t count = iov_iter_count(iter);
-	ssize_t ret;
-
-	ret = blockdev_direct_IO(iocb, inode, iter,
-				 reiserfs_get_blocks_direct_io);
-
-	/*
-	 * In case of error extending write may have instantiated a few
-	 * blocks outside i_size. Trim these off again.
-	 */
-	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
-		loff_t isize = i_size_read(inode);
-		loff_t end = iocb->ki_pos + count;
-
-		if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
-			truncate_setsize(inode, isize);
-			reiserfs_vfs_truncate_file(inode);
-		}
-	}
-
-	return ret;
-}
-
-int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct iattr *attr)
-{
-	struct inode *inode = d_inode(dentry);
-	unsigned int ia_valid;
-	int error;
-
-	error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
-	if (error)
-		return error;
-
-	/* must be turned off for recursive notify_change calls */
-	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
-
-	if (is_quota_modification(&nop_mnt_idmap, inode, attr)) {
-		error = dquot_initialize(inode);
-		if (error)
-			return error;
-	}
-	reiserfs_write_lock(inode->i_sb);
-	if (attr->ia_valid & ATTR_SIZE) {
-		/*
-		 * version 2 items will be caught by the s_maxbytes check
-		 * done for us in vmtruncate
-		 */
-		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
-		    attr->ia_size > MAX_NON_LFS) {
-			reiserfs_write_unlock(inode->i_sb);
-			error = -EFBIG;
-			goto out;
-		}
-
-		inode_dio_wait(inode);
-
-		/* fill in hole pointers in the expanding truncate case. */
-		if (attr->ia_size > inode->i_size) {
-			loff_t pos = attr->ia_size;
-
-			if ((pos & (inode->i_sb->s_blocksize - 1)) == 0)
-				pos++;
-			error = generic_cont_expand_simple(inode, pos);
-			if (REISERFS_I(inode)->i_prealloc_count > 0) {
-				int err;
-				struct reiserfs_transaction_handle th;
-				/* we're changing at most 2 bitmaps, inode + super */
-				err = journal_begin(&th, inode->i_sb, 4);
-				if (!err) {
-					reiserfs_discard_prealloc(&th, inode);
-					err = journal_end(&th);
-				}
-				if (err)
-					error = err;
-			}
-			if (error) {
-				reiserfs_write_unlock(inode->i_sb);
-				goto out;
-			}
-			/*
-			 * file size is changed, ctime and mtime are
-			 * to be updated
-			 */
-			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
-		}
-	}
-	reiserfs_write_unlock(inode->i_sb);
-
-	if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
-	     ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
-	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
-		/* stat data of format v3.5 has 16 bit uid and gid */
-		error = -EINVAL;
-		goto out;
-	}
-
-	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
-	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
-		struct reiserfs_transaction_handle th;
-		int jbegin_count =
-		    2 *
-		    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
-		     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
-		    2;
-
-		error = reiserfs_chown_xattrs(inode, attr);
-
-		if (error)
-			return error;
-
-		/*
-		 * (user+group)*(old+new) structure - we count quota
-		 * info and , inode write (sb, inode)
-		 */
-		reiserfs_write_lock(inode->i_sb);
-		error = journal_begin(&th, inode->i_sb, jbegin_count);
-		reiserfs_write_unlock(inode->i_sb);
-		if (error)
-			goto out;
-		error = dquot_transfer(&nop_mnt_idmap, inode, attr);
-		reiserfs_write_lock(inode->i_sb);
-		if (error) {
-			journal_end(&th);
-			reiserfs_write_unlock(inode->i_sb);
-			goto out;
-		}
-
-		/*
-		 * Update corresponding info in inode so that everything
-		 * is in one transaction
-		 */
-		if (attr->ia_valid & ATTR_UID)
-			inode->i_uid = attr->ia_uid;
-		if (attr->ia_valid & ATTR_GID)
-			inode->i_gid = attr->ia_gid;
-		mark_inode_dirty(inode);
-		error = journal_end(&th);
-		reiserfs_write_unlock(inode->i_sb);
-		if (error)
-			goto out;
-	}
-
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		error = inode_newsize_ok(inode, attr->ia_size);
-		if (!error) {
-			/*
-			 * Could race against reiserfs_file_release
-			 * if called from NFS, so take tailpack mutex.
-			 */
-			mutex_lock(&REISERFS_I(inode)->tailpack);
-			truncate_setsize(inode, attr->ia_size);
-			reiserfs_truncate_file(inode, 1);
-			mutex_unlock(&REISERFS_I(inode)->tailpack);
-		}
-	}
-
-	if (!error) {
-		setattr_copy(&nop_mnt_idmap, inode, attr);
-		mark_inode_dirty(inode);
-	}
-
-	if (!error && reiserfs_posixacl(inode->i_sb)) {
-		if (attr->ia_valid & ATTR_MODE)
-			error = reiserfs_acl_chmod(dentry);
-	}
-
-out:
-	return error;
-}
-
-const struct address_space_operations reiserfs_address_space_operations = {
-	.writepages = reiserfs_writepages,
-	.read_folio = reiserfs_read_folio,
-	.readahead = reiserfs_readahead,
-	.release_folio = reiserfs_release_folio,
-	.invalidate_folio = reiserfs_invalidate_folio,
-	.write_begin = reiserfs_write_begin,
-	.write_end = reiserfs_write_end,
-	.bmap = reiserfs_aop_bmap,
-	.direct_IO = reiserfs_direct_IO,
-	.dirty_folio = reiserfs_dirty_folio,
-	.migrate_folio = buffer_migrate_folio,
-};
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
deleted file mode 100644
index dd33f8cc6eda..000000000000
--- a/fs/reiserfs/ioctl.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include "reiserfs.h"
-#include <linux/time.h>
-#include <linux/uaccess.h>
-#include <linux/pagemap.h>
-#include <linux/compat.h>
-#include <linux/fileattr.h>
-
-int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
-{
-	struct inode *inode = d_inode(dentry);
-
-	if (!reiserfs_attrs(inode->i_sb))
-		return -ENOTTY;
-
-	fileattr_fill_flags(fa, REISERFS_I(inode)->i_attrs);
-
-	return 0;
-}
-
-int reiserfs_fileattr_set(struct mnt_idmap *idmap,
-			  struct dentry *dentry, struct fileattr *fa)
-{
-	struct inode *inode = d_inode(dentry);
-	unsigned int flags = fa->flags;
-	int err;
-
-	reiserfs_write_lock(inode->i_sb);
-
-	err = -ENOTTY;
-	if (!reiserfs_attrs(inode->i_sb))
-		goto unlock;
-
-	err = -EOPNOTSUPP;
-	if (fileattr_has_fsx(fa))
-		goto unlock;
-
-	/*
-	 * Is it quota file? Do not allow user to mess with it
-	 */
-	err = -EPERM;
-	if (IS_NOQUOTA(inode))
-		goto unlock;
-
-	if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) {
-		err = reiserfs_unpack(inode);
-		if (err)
-			goto unlock;
-	}
-	sd_attrs_to_i_attrs(flags, inode);
-	REISERFS_I(inode)->i_attrs = flags;
-	inode_set_ctime_current(inode);
-	mark_inode_dirty(inode);
-	err = 0;
-unlock:
-	reiserfs_write_unlock(inode->i_sb);
-
-	return err;
-}
-
-/*
- * reiserfs_ioctl - handler for ioctl for inode
- * supported commands:
- *  1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
- *                           and prevent packing file (argument arg has t
- *			      be non-zero)
- *  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
- *  3) That's all for a while ...
- */
-long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	struct inode *inode = file_inode(filp);
-	int err = 0;
-
-	reiserfs_write_lock(inode->i_sb);
-
-	switch (cmd) {
-	case REISERFS_IOC_UNPACK:
-		if (S_ISREG(inode->i_mode)) {
-			if (arg)
-				err = reiserfs_unpack(inode);
-		} else
-			err = -ENOTTY;
-		break;
-		/*
-		 * following two cases are taken from fs/ext2/ioctl.c by Remy
-		 * Card (card@masi.ibp.fr)
-		 */
-	case REISERFS_IOC_GETVERSION:
-		err = put_user(inode->i_generation, (int __user *)arg);
-		break;
-	case REISERFS_IOC_SETVERSION:
-		if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) {
-			err = -EPERM;
-			break;
-		}
-		err = mnt_want_write_file(filp);
-		if (err)
-			break;
-		if (get_user(inode->i_generation, (int __user *)arg)) {
-			err = -EFAULT;
-			goto setversion_out;
-		}
-		inode_set_ctime_current(inode);
-		mark_inode_dirty(inode);
-setversion_out:
-		mnt_drop_write_file(filp);
-		break;
-	default:
-		err = -ENOTTY;
-	}
-
-	reiserfs_write_unlock(inode->i_sb);
-
-	return err;
-}
-
-#ifdef CONFIG_COMPAT
-long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
-				unsigned long arg)
-{
-	/*
-	 * These are just misnamed, they actually
-	 * get/put from/to user an int
-	 */
-	switch (cmd) {
-	case REISERFS_IOC32_UNPACK:
-		cmd = REISERFS_IOC_UNPACK;
-		break;
-	case REISERFS_IOC32_GETVERSION:
-		cmd = REISERFS_IOC_GETVERSION;
-		break;
-	case REISERFS_IOC32_SETVERSION:
-		cmd = REISERFS_IOC_SETVERSION;
-		break;
-	default:
-		return -ENOIOCTLCMD;
-	}
-
-	return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-int reiserfs_commit_write(struct file *f, struct page *page,
-			  unsigned from, unsigned to);
-/*
- * reiserfs_unpack
- * Function try to convert tail from direct item into indirect.
- * It set up nopack attribute in the REISERFS_I(inode)->nopack
- */
-int reiserfs_unpack(struct inode *inode)
-{
-	int retval = 0;
-	int index;
-	struct page *page;
-	struct address_space *mapping;
-	unsigned long write_from;
-	unsigned long blocksize = inode->i_sb->s_blocksize;
-
-	if (inode->i_size == 0) {
-		REISERFS_I(inode)->i_flags |= i_nopack_mask;
-		return 0;
-	}
-	/* ioctl already done */
-	if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
-		return 0;
-	}
-
-	/* we need to make sure nobody is changing the file size beneath us */
-	{
-		int depth = reiserfs_write_unlock_nested(inode->i_sb);
-
-		inode_lock(inode);
-		reiserfs_write_lock_nested(inode->i_sb, depth);
-	}
-
-	reiserfs_write_lock(inode->i_sb);
-
-	write_from = inode->i_size & (blocksize - 1);
-	/* if we are on a block boundary, we are already unpacked.  */
-	if (write_from == 0) {
-		REISERFS_I(inode)->i_flags |= i_nopack_mask;
-		goto out;
-	}
-
-	/*
-	 * we unpack by finding the page with the tail, and calling
-	 * __reiserfs_write_begin on that page.  This will force a
-	 * reiserfs_get_block to unpack the tail for us.
-	 */
-	index = inode->i_size >> PAGE_SHIFT;
-	mapping = inode->i_mapping;
-	page = grab_cache_page(mapping, index);
-	retval = -ENOMEM;
-	if (!page) {
-		goto out;
-	}
-	retval = __reiserfs_write_begin(page, write_from, 0);
-	if (retval)
-		goto out_unlock;
-
-	/* conversion can change page contents, must flush */
-	flush_dcache_page(page);
-	retval = reiserfs_commit_write(NULL, page, write_from, write_from);
-	REISERFS_I(inode)->i_flags |= i_nopack_mask;
-
-out_unlock:
-	unlock_page(page);
-	put_page(page);
-
-out:
-	inode_unlock(inode);
-	reiserfs_write_unlock(inode->i_sb);
-	return retval;
-}
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
deleted file mode 100644
index 5011c10287c6..000000000000
--- a/fs/reiserfs/item_ops.c
+++ /dev/null
@@ -1,737 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-
-/*
- * this contains item handlers for old item types: sd, direct,
- * indirect, directory
- */
-
-/*
- * and where are the comments? how about saying where we can find an
- * explanation of each item handler method? -Hans
- */
-
-/* stat data functions */
-static int sd_bytes_number(struct item_head *ih, int block_size)
-{
-	return 0;
-}
-
-static void sd_decrement_key(struct cpu_key *key)
-{
-	key->on_disk_key.k_objectid--;
-	set_cpu_key_k_type(key, TYPE_ANY);
-	set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1));
-}
-
-static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
-{
-	return 0;
-}
-
-static void sd_print_item(struct item_head *ih, char *item)
-{
-	printk("\tmode | size | nlinks | first direct | mtime\n");
-	if (stat_data_v1(ih)) {
-		struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
-
-		printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd),
-		       sd_v1_size(sd), sd_v1_nlink(sd),
-		       sd_v1_first_direct_byte(sd),
-		       sd_v1_mtime(sd));
-	} else {
-		struct stat_data *sd = (struct stat_data *)item;
-
-		printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd),
-		       (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
-		       sd_v2_rdev(sd), sd_v2_mtime(sd));
-	}
-}
-
-static void sd_check_item(struct item_head *ih, char *item)
-{
-	/* unused */
-}
-
-static int sd_create_vi(struct virtual_node *vn,
-			struct virtual_item *vi,
-			int is_affected, int insert_size)
-{
-	vi->vi_index = TYPE_STAT_DATA;
-	return 0;
-}
-
-static int sd_check_left(struct virtual_item *vi, int free,
-			 int start_skip, int end_skip)
-{
-	BUG_ON(start_skip || end_skip);
-	return -1;
-}
-
-static int sd_check_right(struct virtual_item *vi, int free)
-{
-	return -1;
-}
-
-static int sd_part_size(struct virtual_item *vi, int first, int count)
-{
-	BUG_ON(count);
-	return 0;
-}
-
-static int sd_unit_num(struct virtual_item *vi)
-{
-	return vi->vi_item_len - IH_SIZE;
-}
-
-static void sd_print_vi(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "reiserfs-16100",
-			 "STATDATA, index %d, type 0x%x, %h",
-			 vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations stat_data_ops = {
-	.bytes_number = sd_bytes_number,
-	.decrement_key = sd_decrement_key,
-	.is_left_mergeable = sd_is_left_mergeable,
-	.print_item = sd_print_item,
-	.check_item = sd_check_item,
-
-	.create_vi = sd_create_vi,
-	.check_left = sd_check_left,
-	.check_right = sd_check_right,
-	.part_size = sd_part_size,
-	.unit_num = sd_unit_num,
-	.print_vi = sd_print_vi
-};
-
-/* direct item functions */
-static int direct_bytes_number(struct item_head *ih, int block_size)
-{
-	return ih_item_len(ih);
-}
-
-/* FIXME: this should probably switch to indirect as well */
-static void direct_decrement_key(struct cpu_key *key)
-{
-	cpu_key_k_offset_dec(key);
-	if (cpu_key_k_offset(key) == 0)
-		set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-static int direct_is_left_mergeable(struct reiserfs_key *key,
-				    unsigned long bsize)
-{
-	int version = le_key_version(key);
-	return ((le_key_k_offset(version, key) & (bsize - 1)) != 1);
-}
-
-static void direct_print_item(struct item_head *ih, char *item)
-{
-	int j = 0;
-
-/*    return; */
-	printk("\"");
-	while (j < ih_item_len(ih))
-		printk("%c", item[j++]);
-	printk("\"\n");
-}
-
-static void direct_check_item(struct item_head *ih, char *item)
-{
-	/* unused */
-}
-
-static int direct_create_vi(struct virtual_node *vn,
-			    struct virtual_item *vi,
-			    int is_affected, int insert_size)
-{
-	vi->vi_index = TYPE_DIRECT;
-	return 0;
-}
-
-static int direct_check_left(struct virtual_item *vi, int free,
-			     int start_skip, int end_skip)
-{
-	int bytes;
-
-	bytes = free - free % 8;
-	return bytes ? : -1;
-}
-
-static int direct_check_right(struct virtual_item *vi, int free)
-{
-	return direct_check_left(vi, free, 0, 0);
-}
-
-static int direct_part_size(struct virtual_item *vi, int first, int count)
-{
-	return count;
-}
-
-static int direct_unit_num(struct virtual_item *vi)
-{
-	return vi->vi_item_len - IH_SIZE;
-}
-
-static void direct_print_vi(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "reiserfs-16101",
-			 "DIRECT, index %d, type 0x%x, %h",
-			 vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations direct_ops = {
-	.bytes_number = direct_bytes_number,
-	.decrement_key = direct_decrement_key,
-	.is_left_mergeable = direct_is_left_mergeable,
-	.print_item = direct_print_item,
-	.check_item = direct_check_item,
-
-	.create_vi = direct_create_vi,
-	.check_left = direct_check_left,
-	.check_right = direct_check_right,
-	.part_size = direct_part_size,
-	.unit_num = direct_unit_num,
-	.print_vi = direct_print_vi
-};
-
-/* indirect item functions */
-static int indirect_bytes_number(struct item_head *ih, int block_size)
-{
-	return ih_item_len(ih) / UNFM_P_SIZE * block_size;
-}
-
-/* decrease offset, if it becomes 0, change type to stat data */
-static void indirect_decrement_key(struct cpu_key *key)
-{
-	cpu_key_k_offset_dec(key);
-	if (cpu_key_k_offset(key) == 0)
-		set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-/* if it is not first item of the body, then it is mergeable */
-static int indirect_is_left_mergeable(struct reiserfs_key *key,
-				      unsigned long bsize)
-{
-	int version = le_key_version(key);
-	return (le_key_k_offset(version, key) != 1);
-}
-
-/* printing of indirect item */
-static void start_new_sequence(__u32 * start, int *len, __u32 new)
-{
-	*start = new;
-	*len = 1;
-}
-
-static int sequence_finished(__u32 start, int *len, __u32 new)
-{
-	if (start == INT_MAX)
-		return 1;
-
-	if (start == 0 && new == 0) {
-		(*len)++;
-		return 0;
-	}
-	if (start != 0 && (start + *len) == new) {
-		(*len)++;
-		return 0;
-	}
-	return 1;
-}
-
-static void print_sequence(__u32 start, int len)
-{
-	if (start == INT_MAX)
-		return;
-
-	if (len == 1)
-		printk(" %d", start);
-	else
-		printk(" %d(%d)", start, len);
-}
-
-static void indirect_print_item(struct item_head *ih, char *item)
-{
-	int j;
-	__le32 *unp;
-	__u32 prev = INT_MAX;
-	int num = 0;
-
-	unp = (__le32 *) item;
-
-	if (ih_item_len(ih) % UNFM_P_SIZE)
-		reiserfs_warning(NULL, "reiserfs-16102", "invalid item len");
-
-	printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
-	for (j = 0; j < I_UNFM_NUM(ih); j++) {
-		if (sequence_finished(prev, &num, get_block_num(unp, j))) {
-			print_sequence(prev, num);
-			start_new_sequence(&prev, &num, get_block_num(unp, j));
-		}
-	}
-	print_sequence(prev, num);
-	printk("]\n");
-}
-
-static void indirect_check_item(struct item_head *ih, char *item)
-{
-	/* unused */
-}
-
-static int indirect_create_vi(struct virtual_node *vn,
-			      struct virtual_item *vi,
-			      int is_affected, int insert_size)
-{
-	vi->vi_index = TYPE_INDIRECT;
-	return 0;
-}
-
-static int indirect_check_left(struct virtual_item *vi, int free,
-			       int start_skip, int end_skip)
-{
-	int bytes;
-
-	bytes = free - free % UNFM_P_SIZE;
-	return bytes ? : -1;
-}
-
-static int indirect_check_right(struct virtual_item *vi, int free)
-{
-	return indirect_check_left(vi, free, 0, 0);
-}
-
-/*
- * return size in bytes of 'units' units. If first == 0 - calculate
- * from the head (left), otherwise - from tail (right)
- */
-static int indirect_part_size(struct virtual_item *vi, int first, int units)
-{
-	/* unit of indirect item is byte (yet) */
-	return units;
-}
-
-static int indirect_unit_num(struct virtual_item *vi)
-{
-	/* unit of indirect item is byte (yet) */
-	return vi->vi_item_len - IH_SIZE;
-}
-
-static void indirect_print_vi(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "reiserfs-16103",
-			 "INDIRECT, index %d, type 0x%x, %h",
-			 vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations indirect_ops = {
-	.bytes_number = indirect_bytes_number,
-	.decrement_key = indirect_decrement_key,
-	.is_left_mergeable = indirect_is_left_mergeable,
-	.print_item = indirect_print_item,
-	.check_item = indirect_check_item,
-
-	.create_vi = indirect_create_vi,
-	.check_left = indirect_check_left,
-	.check_right = indirect_check_right,
-	.part_size = indirect_part_size,
-	.unit_num = indirect_unit_num,
-	.print_vi = indirect_print_vi
-};
-
-/* direntry functions */
-static int direntry_bytes_number(struct item_head *ih, int block_size)
-{
-	reiserfs_warning(NULL, "vs-16090",
-			 "bytes number is asked for direntry");
-	return 0;
-}
-
-static void direntry_decrement_key(struct cpu_key *key)
-{
-	cpu_key_k_offset_dec(key);
-	if (cpu_key_k_offset(key) == 0)
-		set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-static int direntry_is_left_mergeable(struct reiserfs_key *key,
-				      unsigned long bsize)
-{
-	if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET)
-		return 0;
-	return 1;
-
-}
-
-static void direntry_print_item(struct item_head *ih, char *item)
-{
-	int i;
-	int namelen;
-	struct reiserfs_de_head *deh;
-	char *name;
-	static char namebuf[80];
-
-	printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name",
-	       "Key of pointed object", "Hash", "Gen number", "Status");
-
-	deh = (struct reiserfs_de_head *)item;
-
-	for (i = 0; i < ih_entry_count(ih); i++, deh++) {
-		namelen =
-		    (i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
-		    deh_location(deh);
-		name = item + deh_location(deh);
-		if (name[namelen - 1] == 0)
-			namelen = strlen(name);
-
-		scnprintf(namebuf, sizeof(namebuf), "\"%.*s\"",
-			  (int)sizeof(namebuf)-3, name);
-
-		printk("%d:  %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
-		       i, namebuf,
-		       deh_dir_id(deh), deh_objectid(deh),
-		       GET_HASH_VALUE(deh_offset(deh)),
-		       GET_GENERATION_NUMBER((deh_offset(deh))),
-		       (de_hidden(deh)) ? "HIDDEN" : "VISIBLE");
-	}
-}
-
-static void direntry_check_item(struct item_head *ih, char *item)
-{
-	int i;
-	struct reiserfs_de_head *deh;
-
-	/* unused */
-	deh = (struct reiserfs_de_head *)item;
-	for (i = 0; i < ih_entry_count(ih); i++, deh++) {
-		;
-	}
-}
-
-#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
-
-/*
- * function returns old entry number in directory item in real node
- * using new entry number in virtual item in virtual node
- */
-static inline int old_entry_num(int is_affected, int virtual_entry_num,
-				int pos_in_item, int mode)
-{
-	if (mode == M_INSERT || mode == M_DELETE)
-		return virtual_entry_num;
-
-	if (!is_affected)
-		/* cut or paste is applied to another item */
-		return virtual_entry_num;
-
-	if (virtual_entry_num < pos_in_item)
-		return virtual_entry_num;
-
-	if (mode == M_CUT)
-		return virtual_entry_num + 1;
-
-	RFALSE(mode != M_PASTE || virtual_entry_num == 0,
-	       "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'",
-	       mode);
-
-	return virtual_entry_num - 1;
-}
-
-/*
- * Create an array of sizes of directory entries for virtual
- * item. Return space used by an item. FIXME: no control over
- * consuming of space used by this item handler
- */
-static int direntry_create_vi(struct virtual_node *vn,
-			      struct virtual_item *vi,
-			      int is_affected, int insert_size)
-{
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-	int i, j;
-	int size = sizeof(struct direntry_uarea);
-	struct reiserfs_de_head *deh;
-
-	vi->vi_index = TYPE_DIRENTRY;
-
-	BUG_ON(!(vi->vi_ih) || !vi->vi_item);
-
-	dir_u->flags = 0;
-	if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET)
-		dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
-
-	deh = (struct reiserfs_de_head *)(vi->vi_item);
-
-	/* virtual directory item have this amount of entry after */
-	dir_u->entry_count = ih_entry_count(vi->vi_ih) +
-	    ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
-			      (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
-
-	for (i = 0; i < dir_u->entry_count; i++) {
-		j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
-				  vn->vn_mode);
-		dir_u->entry_sizes[i] =
-		    (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) -
-		    deh_location(&deh[j]) + DEH_SIZE;
-	}
-
-	size += (dir_u->entry_count * sizeof(short));
-
-	/* set size of pasted entry */
-	if (is_affected && vn->vn_mode == M_PASTE)
-		dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
-
-#ifdef CONFIG_REISERFS_CHECK
-	/* compare total size of entries with item length */
-	{
-		int k, l;
-
-		l = 0;
-		for (k = 0; k < dir_u->entry_count; k++)
-			l += dir_u->entry_sizes[k];
-
-		if (l + IH_SIZE != vi->vi_item_len +
-		    ((is_affected
-		      && (vn->vn_mode == M_PASTE
-			  || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
-			reiserfs_panic(NULL, "vs-8025", "(mode==%c, "
-				       "insert_size==%d), invalid length of "
-				       "directory item",
-				       vn->vn_mode, insert_size);
-		}
-	}
-#endif
-
-	return size;
-
-}
-
-/*
- * return number of entries which may fit into specified amount of
- * free space, or -1 if free space is not enough even for 1 entry
- */
-static int direntry_check_left(struct virtual_item *vi, int free,
-			       int start_skip, int end_skip)
-{
-	int i;
-	int entries = 0;
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
-		/* i-th entry doesn't fit into the remaining free space */
-		if (dir_u->entry_sizes[i] > free)
-			break;
-
-		free -= dir_u->entry_sizes[i];
-		entries++;
-	}
-
-	if (entries == dir_u->entry_count) {
-		reiserfs_panic(NULL, "item_ops-1",
-			       "free space %d, entry_count %d", free,
-			       dir_u->entry_count);
-	}
-
-	/* "." and ".." can not be separated from each other */
-	if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
-	    && entries < 2)
-		entries = 0;
-
-	return entries ? : -1;
-}
-
-static int direntry_check_right(struct virtual_item *vi, int free)
-{
-	int i;
-	int entries = 0;
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	for (i = dir_u->entry_count - 1; i >= 0; i--) {
-		/* i-th entry doesn't fit into the remaining free space */
-		if (dir_u->entry_sizes[i] > free)
-			break;
-
-		free -= dir_u->entry_sizes[i];
-		entries++;
-	}
-	BUG_ON(entries == dir_u->entry_count);
-
-	/* "." and ".." can not be separated from each other */
-	if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
-	    && entries > dir_u->entry_count - 2)
-		entries = dir_u->entry_count - 2;
-
-	return entries ? : -1;
-}
-
-/* sum of entry sizes between from-th and to-th entries including both edges */
-static int direntry_part_size(struct virtual_item *vi, int first, int count)
-{
-	int i, retval;
-	int from, to;
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	retval = 0;
-	if (first == 0)
-		from = 0;
-	else
-		from = dir_u->entry_count - count;
-	to = from + count - 1;
-
-	for (i = from; i <= to; i++)
-		retval += dir_u->entry_sizes[i];
-
-	return retval;
-}
-
-static int direntry_unit_num(struct virtual_item *vi)
-{
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	return dir_u->entry_count;
-}
-
-static void direntry_print_vi(struct virtual_item *vi)
-{
-	int i;
-	struct direntry_uarea *dir_u = vi->vi_uarea;
-
-	reiserfs_warning(NULL, "reiserfs-16104",
-			 "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
-			 vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
-	printk("%d entries: ", dir_u->entry_count);
-	for (i = 0; i < dir_u->entry_count; i++)
-		printk("%d ", dir_u->entry_sizes[i]);
-	printk("\n");
-}
-
-static struct item_operations direntry_ops = {
-	.bytes_number = direntry_bytes_number,
-	.decrement_key = direntry_decrement_key,
-	.is_left_mergeable = direntry_is_left_mergeable,
-	.print_item = direntry_print_item,
-	.check_item = direntry_check_item,
-
-	.create_vi = direntry_create_vi,
-	.check_left = direntry_check_left,
-	.check_right = direntry_check_right,
-	.part_size = direntry_part_size,
-	.unit_num = direntry_unit_num,
-	.print_vi = direntry_print_vi
-};
-
-/* Error catching functions to catch errors caused by incorrect item types. */
-static int errcatch_bytes_number(struct item_head *ih, int block_size)
-{
-	reiserfs_warning(NULL, "green-16001",
-			 "Invalid item type observed, run fsck ASAP");
-	return 0;
-}
-
-static void errcatch_decrement_key(struct cpu_key *key)
-{
-	reiserfs_warning(NULL, "green-16002",
-			 "Invalid item type observed, run fsck ASAP");
-}
-
-static int errcatch_is_left_mergeable(struct reiserfs_key *key,
-				      unsigned long bsize)
-{
-	reiserfs_warning(NULL, "green-16003",
-			 "Invalid item type observed, run fsck ASAP");
-	return 0;
-}
-
-static void errcatch_print_item(struct item_head *ih, char *item)
-{
-	reiserfs_warning(NULL, "green-16004",
-			 "Invalid item type observed, run fsck ASAP");
-}
-
-static void errcatch_check_item(struct item_head *ih, char *item)
-{
-	reiserfs_warning(NULL, "green-16005",
-			 "Invalid item type observed, run fsck ASAP");
-}
-
-static int errcatch_create_vi(struct virtual_node *vn,
-			      struct virtual_item *vi,
-			      int is_affected, int insert_size)
-{
-	reiserfs_warning(NULL, "green-16006",
-			 "Invalid item type observed, run fsck ASAP");
-	/*
-	 * We might return -1 here as well, but it won't help as
-	 * create_virtual_node() from where this operation is called
-	 * from is of return type void.
-	 */
-	return 0;
-}
-
-static int errcatch_check_left(struct virtual_item *vi, int free,
-			       int start_skip, int end_skip)
-{
-	reiserfs_warning(NULL, "green-16007",
-			 "Invalid item type observed, run fsck ASAP");
-	return -1;
-}
-
-static int errcatch_check_right(struct virtual_item *vi, int free)
-{
-	reiserfs_warning(NULL, "green-16008",
-			 "Invalid item type observed, run fsck ASAP");
-	return -1;
-}
-
-static int errcatch_part_size(struct virtual_item *vi, int first, int count)
-{
-	reiserfs_warning(NULL, "green-16009",
-			 "Invalid item type observed, run fsck ASAP");
-	return 0;
-}
-
-static int errcatch_unit_num(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "green-16010",
-			 "Invalid item type observed, run fsck ASAP");
-	return 0;
-}
-
-static void errcatch_print_vi(struct virtual_item *vi)
-{
-	reiserfs_warning(NULL, "green-16011",
-			 "Invalid item type observed, run fsck ASAP");
-}
-
-static struct item_operations errcatch_ops = {
-	.bytes_number = errcatch_bytes_number,
-	.decrement_key = errcatch_decrement_key,
-	.is_left_mergeable = errcatch_is_left_mergeable,
-	.print_item = errcatch_print_item,
-	.check_item = errcatch_check_item,
-
-	.create_vi = errcatch_create_vi,
-	.check_left = errcatch_check_left,
-	.check_right = errcatch_check_right,
-	.part_size = errcatch_part_size,
-	.unit_num = errcatch_unit_num,
-	.print_vi = errcatch_print_vi
-};
-
-#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
-#error Item types must use disk-format assigned values.
-#endif
-
-struct item_operations *item_ops[TYPE_ANY + 1] = {
-	&stat_data_ops,
-	&indirect_ops,
-	&direct_ops,
-	&direntry_ops,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	&errcatch_ops		/* This is to catch errors with invalid type (15th entry for TYPE_ANY) */
-};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
deleted file mode 100644
index e477ee0ff35d..000000000000
--- a/fs/reiserfs/journal.c
+++ /dev/null
@@ -1,4404 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Write ahead logging implementation copyright Chris Mason 2000
- *
- * The background commits make this code very interrelated, and
- * overly complex.  I need to rethink things a bit....The major players:
- *
- * journal_begin -- call with the number of blocks you expect to log.
- *                  If the current transaction is too
- *		    old, it will block until the current transaction is
- *		    finished, and then start a new one.
- *		    Usually, your transaction will get joined in with
- *                  previous ones for speed.
- *
- * journal_join  -- same as journal_begin, but won't block on the current
- *                  transaction regardless of age.  Don't ever call
- *                  this.  Ever.  There are only two places it should be
- *                  called from, and they are both inside this file.
- *
- * journal_mark_dirty -- adds blocks into this transaction.  clears any flags
- *                       that might make them get sent to disk
- *                       and then marks them BH_JDirty.  Puts the buffer head
- *                       into the current transaction hash.
- *
- * journal_end -- if the current transaction is batchable, it does nothing
- *                   otherwise, it could do an async/synchronous commit, or
- *                   a full flush of all log and real blocks in the
- *                   transaction.
- *
- * flush_old_commits -- if the current transaction is too old, it is ended and
- *                      commit blocks are sent to disk.  Forces commit blocks
- *                      to disk for all backgrounded commits that have been
- *                      around too long.
- *		     -- Note, if you call this as an immediate flush from
- *		        within kupdate, it will ignore the immediate flag
- */
-
-#include <linux/time.h>
-#include <linux/semaphore.h>
-#include <linux/vmalloc.h>
-#include "reiserfs.h"
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-#include <linux/workqueue.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-
-
-/* gets a struct reiserfs_journal_list * from a list head */
-#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
-                               j_list))
-
-/* must be correct to keep the desc and commit structs at 4k */
-#define JOURNAL_TRANS_HALF 1018
-#define BUFNR 64		/*read ahead */
-
-/* cnode stat bits.  Move these into reiserfs_fs.h */
-
-/* this block was freed, and can't be written.  */
-#define BLOCK_FREED 2
-/* this block was freed during this transaction, and can't be written */
-#define BLOCK_FREED_HOLDER 3
-
-/* used in flush_journal_list */
-#define BLOCK_NEEDS_FLUSH 4
-#define BLOCK_DIRTIED 5
-
-/* journal list state bits */
-#define LIST_TOUCHED 1
-#define LIST_DIRTY   2
-#define LIST_COMMIT_PENDING  4	/* someone will commit this list */
-
-/* flags for do_journal_end */
-#define FLUSH_ALL   1		/* flush commit and real blocks */
-#define COMMIT_NOW  2		/* end and commit this transaction */
-#define WAIT        4		/* wait for the log blocks to hit the disk */
-
-static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
-static int flush_journal_list(struct super_block *s,
-			      struct reiserfs_journal_list *jl, int flushall);
-static int flush_commit_list(struct super_block *s,
-			     struct reiserfs_journal_list *jl, int flushall);
-static int can_dirty(struct reiserfs_journal_cnode *cn);
-static int journal_join(struct reiserfs_transaction_handle *th,
-			struct super_block *sb);
-static void release_journal_dev(struct reiserfs_journal *journal);
-static void dirty_one_transaction(struct super_block *s,
-				 struct reiserfs_journal_list *jl);
-static void flush_async_commits(struct work_struct *work);
-static void queue_log_writer(struct super_block *s);
-
-/* values for join in do_journal_begin_r */
-enum {
-	JBEGIN_REG = 0,		/* regular journal begin */
-	/* join the running transaction if at all possible */
-	JBEGIN_JOIN = 1,
-	/* called from cleanup code, ignores aborted flag */
-	JBEGIN_ABORT = 2,
-};
-
-static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
-			      struct super_block *sb,
-			      unsigned long nblocks, int join);
-
-static void init_journal_hash(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	memset(journal->j_hash_table, 0,
-	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
-}
-
-/*
- * clears BH_Dirty and sticks the buffer on the clean list.  Called because
- * I can't allow refile_buffer to make schedule happen after I've freed a
- * block.  Look at remove_from_transaction and journal_mark_freed for
- * more details.
- */
-static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
-{
-	if (bh) {
-		clear_buffer_dirty(bh);
-		clear_buffer_journal_test(bh);
-	}
-	return 0;
-}
-
-static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
-							 *sb)
-{
-	struct reiserfs_bitmap_node *bn;
-	static int id;
-
-	bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
-	if (!bn) {
-		return NULL;
-	}
-	bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
-	if (!bn->data) {
-		kfree(bn);
-		return NULL;
-	}
-	bn->id = id++;
-	INIT_LIST_HEAD(&bn->list);
-	return bn;
-}
-
-static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_bitmap_node *bn = NULL;
-	struct list_head *entry = journal->j_bitmap_nodes.next;
-
-	journal->j_used_bitmap_nodes++;
-repeat:
-
-	if (entry != &journal->j_bitmap_nodes) {
-		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
-		list_del(entry);
-		memset(bn->data, 0, sb->s_blocksize);
-		journal->j_free_bitmap_nodes--;
-		return bn;
-	}
-	bn = allocate_bitmap_node(sb);
-	if (!bn) {
-		yield();
-		goto repeat;
-	}
-	return bn;
-}
-static inline void free_bitmap_node(struct super_block *sb,
-				    struct reiserfs_bitmap_node *bn)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	journal->j_used_bitmap_nodes--;
-	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
-		kfree(bn->data);
-		kfree(bn);
-	} else {
-		list_add(&bn->list, &journal->j_bitmap_nodes);
-		journal->j_free_bitmap_nodes++;
-	}
-}
-
-static void allocate_bitmap_nodes(struct super_block *sb)
-{
-	int i;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_bitmap_node *bn = NULL;
-	for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
-		bn = allocate_bitmap_node(sb);
-		if (bn) {
-			list_add(&bn->list, &journal->j_bitmap_nodes);
-			journal->j_free_bitmap_nodes++;
-		} else {
-			/* this is ok, we'll try again when more are needed */
-			break;
-		}
-	}
-}
-
-static int set_bit_in_list_bitmap(struct super_block *sb,
-				  b_blocknr_t block,
-				  struct reiserfs_list_bitmap *jb)
-{
-	unsigned int bmap_nr = block / (sb->s_blocksize << 3);
-	unsigned int bit_nr = block % (sb->s_blocksize << 3);
-
-	if (!jb->bitmaps[bmap_nr]) {
-		jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
-	}
-	set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
-	return 0;
-}
-
-static void cleanup_bitmap_list(struct super_block *sb,
-				struct reiserfs_list_bitmap *jb)
-{
-	int i;
-	if (jb->bitmaps == NULL)
-		return;
-
-	for (i = 0; i < reiserfs_bmap_count(sb); i++) {
-		if (jb->bitmaps[i]) {
-			free_bitmap_node(sb, jb->bitmaps[i]);
-			jb->bitmaps[i] = NULL;
-		}
-	}
-}
-
-/*
- * only call this on FS unmount.
- */
-static int free_list_bitmaps(struct super_block *sb,
-			     struct reiserfs_list_bitmap *jb_array)
-{
-	int i;
-	struct reiserfs_list_bitmap *jb;
-	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-		jb = jb_array + i;
-		jb->journal_list = NULL;
-		cleanup_bitmap_list(sb, jb);
-		vfree(jb->bitmaps);
-		jb->bitmaps = NULL;
-	}
-	return 0;
-}
-
-static int free_bitmap_nodes(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct list_head *next = journal->j_bitmap_nodes.next;
-	struct reiserfs_bitmap_node *bn;
-
-	while (next != &journal->j_bitmap_nodes) {
-		bn = list_entry(next, struct reiserfs_bitmap_node, list);
-		list_del(next);
-		kfree(bn->data);
-		kfree(bn);
-		next = journal->j_bitmap_nodes.next;
-		journal->j_free_bitmap_nodes--;
-	}
-
-	return 0;
-}
-
-/*
- * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
- * jb_array is the array to be filled in.
- */
-int reiserfs_allocate_list_bitmaps(struct super_block *sb,
-				   struct reiserfs_list_bitmap *jb_array,
-				   unsigned int bmap_nr)
-{
-	int i;
-	int failed = 0;
-	struct reiserfs_list_bitmap *jb;
-	int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
-
-	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-		jb = jb_array + i;
-		jb->journal_list = NULL;
-		jb->bitmaps = vzalloc(mem);
-		if (!jb->bitmaps) {
-			reiserfs_warning(sb, "clm-2000", "unable to "
-					 "allocate bitmaps for journal lists");
-			failed = 1;
-			break;
-		}
-	}
-	if (failed) {
-		free_list_bitmaps(sb, jb_array);
-		return -1;
-	}
-	return 0;
-}
-
-/*
- * find an available list bitmap.  If you can't find one, flush a commit list
- * and try again
- */
-static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
-						    struct reiserfs_journal_list
-						    *jl)
-{
-	int i, j;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_list_bitmap *jb = NULL;
-
-	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
-		i = journal->j_list_bitmap_index;
-		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
-		jb = journal->j_list_bitmap + i;
-		if (journal->j_list_bitmap[i].journal_list) {
-			flush_commit_list(sb,
-					  journal->j_list_bitmap[i].
-					  journal_list, 1);
-			if (!journal->j_list_bitmap[i].journal_list) {
-				break;
-			}
-		} else {
-			break;
-		}
-	}
-	/* double check to make sure if flushed correctly */
-	if (jb->journal_list)
-		return NULL;
-	jb->journal_list = jl;
-	return jb;
-}
-
-/*
- * allocates a new chunk of X nodes, and links them all together as a list.
- * Uses the cnode->next and cnode->prev pointers
- * returns NULL on failure
- */
-static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
-{
-	struct reiserfs_journal_cnode *head;
-	int i;
-	if (num_cnodes <= 0) {
-		return NULL;
-	}
-	head = vzalloc(array_size(num_cnodes,
-				  sizeof(struct reiserfs_journal_cnode)));
-	if (!head) {
-		return NULL;
-	}
-	head[0].prev = NULL;
-	head[0].next = head + 1;
-	for (i = 1; i < num_cnodes; i++) {
-		head[i].prev = head + (i - 1);
-		head[i].next = head + (i + 1);	/* if last one, overwrite it after the if */
-	}
-	head[num_cnodes - 1].next = NULL;
-	return head;
-}
-
-/* pulls a cnode off the free list, or returns NULL on failure */
-static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
-{
-	struct reiserfs_journal_cnode *cn;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	reiserfs_check_lock_depth(sb, "get_cnode");
-
-	if (journal->j_cnode_free <= 0) {
-		return NULL;
-	}
-	journal->j_cnode_used++;
-	journal->j_cnode_free--;
-	cn = journal->j_cnode_free_list;
-	if (!cn) {
-		return cn;
-	}
-	if (cn->next) {
-		cn->next->prev = NULL;
-	}
-	journal->j_cnode_free_list = cn->next;
-	memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
-	return cn;
-}
-
-/*
- * returns a cnode to the free list
- */
-static void free_cnode(struct super_block *sb,
-		       struct reiserfs_journal_cnode *cn)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	reiserfs_check_lock_depth(sb, "free_cnode");
-
-	journal->j_cnode_used--;
-	journal->j_cnode_free++;
-	/* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
-	cn->next = journal->j_cnode_free_list;
-	if (journal->j_cnode_free_list) {
-		journal->j_cnode_free_list->prev = cn;
-	}
-	cn->prev = NULL;	/* not needed with the memset, but I might kill the memset, and forget to do this */
-	journal->j_cnode_free_list = cn;
-}
-
-static void clear_prepared_bits(struct buffer_head *bh)
-{
-	clear_buffer_journal_prepared(bh);
-	clear_buffer_journal_restore_dirty(bh);
-}
-
-/*
- * return a cnode with same dev, block number and size in table,
- * or null if not found
- */
-static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
-								  super_block
-								  *sb,
-								  struct
-								  reiserfs_journal_cnode
-								  **table,
-								  long bl)
-{
-	struct reiserfs_journal_cnode *cn;
-	cn = journal_hash(table, sb, bl);
-	while (cn) {
-		if (cn->blocknr == bl && cn->sb == sb)
-			return cn;
-		cn = cn->hnext;
-	}
-	return (struct reiserfs_journal_cnode *)0;
-}
-
-/*
- * this actually means 'can this block be reallocated yet?'.  If you set
- * search_all, a block can only be allocated if it is not in the current
- * transaction, was not freed by the current transaction, and has no chance
- * of ever being overwritten by a replay after crashing.
- *
- * If you don't set search_all, a block can only be allocated if it is not
- * in the current transaction.  Since deleting a block removes it from the
- * current transaction, this case should never happen.  If you don't set
- * search_all, make sure you never write the block without logging it.
- *
- * next_zero_bit is a suggestion about the next block to try for find_forward.
- * when bl is rejected because it is set in a journal list bitmap, we search
- * for the next zero bit in the bitmap that rejected bl.  Then, we return
- * that through next_zero_bit for find_forward to try.
- *
- * Just because we return something in next_zero_bit does not mean we won't
- * reject it on the next call to reiserfs_in_journal
- */
-int reiserfs_in_journal(struct super_block *sb,
-			unsigned int bmap_nr, int bit_nr, int search_all,
-			b_blocknr_t * next_zero_bit)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_list_bitmap *jb;
-	int i;
-	unsigned long bl;
-
-	*next_zero_bit = 0;	/* always start this at zero. */
-
-	PROC_INFO_INC(sb, journal.in_journal);
-	/*
-	 * If we aren't doing a search_all, this is a metablock, and it
-	 * will be logged before use.  if we crash before the transaction
-	 * that freed it commits,  this transaction won't have committed
-	 * either, and the block will never be written
-	 */
-	if (search_all) {
-		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-			PROC_INFO_INC(sb, journal.in_journal_bitmap);
-			jb = journal->j_list_bitmap + i;
-			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
-			    test_bit(bit_nr,
-				     (unsigned long *)jb->bitmaps[bmap_nr]->
-				     data)) {
-				*next_zero_bit =
-				    find_next_zero_bit((unsigned long *)
-						       (jb->bitmaps[bmap_nr]->
-							data),
-						       sb->s_blocksize << 3,
-						       bit_nr + 1);
-				return 1;
-			}
-		}
-	}
-
-	bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
-	/* is it in any old transactions? */
-	if (search_all
-	    && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
-		return 1;
-	}
-
-	/* is it in the current transaction.  This should never happen */
-	if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
-		BUG();
-		return 1;
-	}
-
-	PROC_INFO_INC(sb, journal.in_journal_reusable);
-	/* safe for reuse */
-	return 0;
-}
-
-/* insert cn into table */
-static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
-				       struct reiserfs_journal_cnode *cn)
-{
-	struct reiserfs_journal_cnode *cn_orig;
-
-	cn_orig = journal_hash(table, cn->sb, cn->blocknr);
-	cn->hnext = cn_orig;
-	cn->hprev = NULL;
-	if (cn_orig) {
-		cn_orig->hprev = cn;
-	}
-	journal_hash(table, cn->sb, cn->blocknr) = cn;
-}
-
-/* lock the current transaction */
-static inline void lock_journal(struct super_block *sb)
-{
-	PROC_INFO_INC(sb, journal.lock_journal);
-
-	reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
-}
-
-/* unlock the current transaction */
-static inline void unlock_journal(struct super_block *sb)
-{
-	mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
-}
-
-static inline void get_journal_list(struct reiserfs_journal_list *jl)
-{
-	jl->j_refcount++;
-}
-
-static inline void put_journal_list(struct super_block *s,
-				    struct reiserfs_journal_list *jl)
-{
-	if (jl->j_refcount < 1) {
-		reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
-			       jl->j_trans_id, jl->j_refcount);
-	}
-	if (--jl->j_refcount == 0)
-		kfree(jl);
-}
-
-/*
- * this used to be much more involved, and I'm keeping it just in case
- * things get ugly again.  it gets called by flush_commit_list, and
- * cleans up any data stored about blocks freed during a transaction.
- */
-static void cleanup_freed_for_journal_list(struct super_block *sb,
-					   struct reiserfs_journal_list *jl)
-{
-
-	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
-	if (jb) {
-		cleanup_bitmap_list(sb, jb);
-	}
-	jl->j_list_bitmap->journal_list = NULL;
-	jl->j_list_bitmap = NULL;
-}
-
-static int journal_list_still_alive(struct super_block *s,
-				    unsigned int trans_id)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	struct list_head *entry = &journal->j_journal_list;
-	struct reiserfs_journal_list *jl;
-
-	if (!list_empty(entry)) {
-		jl = JOURNAL_LIST_ENTRY(entry->next);
-		if (jl->j_trans_id <= trans_id) {
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/*
- * If page->mapping was null, we failed to truncate this page for
- * some reason.  Most likely because it was truncated after being
- * logged via data=journal.
- *
- * This does a check to see if the buffer belongs to one of these
- * lost pages before doing the final put_bh.  If page->mapping was
- * null, it tries to free buffers on the page, which should make the
- * final put_page drop the page from the lru.
- */
-static void release_buffer_page(struct buffer_head *bh)
-{
-	struct folio *folio = bh->b_folio;
-	if (!folio->mapping && folio_trylock(folio)) {
-		folio_get(folio);
-		put_bh(bh);
-		if (!folio->mapping)
-			try_to_free_buffers(folio);
-		folio_unlock(folio);
-		folio_put(folio);
-	} else {
-		put_bh(bh);
-	}
-}
-
-static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
-{
-	if (buffer_journaled(bh)) {
-		reiserfs_warning(NULL, "clm-2084",
-				 "pinned buffer %lu:%pg sent to disk",
-				 bh->b_blocknr, bh->b_bdev);
-	}
-	if (uptodate)
-		set_buffer_uptodate(bh);
-	else
-		clear_buffer_uptodate(bh);
-
-	unlock_buffer(bh);
-	release_buffer_page(bh);
-}
-
-static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
-{
-	if (uptodate)
-		set_buffer_uptodate(bh);
-	else
-		clear_buffer_uptodate(bh);
-	unlock_buffer(bh);
-	put_bh(bh);
-}
-
-static void submit_logged_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_buffer_io_sync;
-	clear_buffer_journal_new(bh);
-	clear_buffer_dirty(bh);
-	if (!test_clear_buffer_journal_test(bh))
-		BUG();
-	if (!buffer_uptodate(bh))
-		BUG();
-	submit_bh(REQ_OP_WRITE, bh);
-}
-
-static void submit_ordered_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_ordered_io;
-	clear_buffer_dirty(bh);
-	if (!buffer_uptodate(bh))
-		BUG();
-	submit_bh(REQ_OP_WRITE, bh);
-}
-
-#define CHUNK_SIZE 32
-struct buffer_chunk {
-	struct buffer_head *bh[CHUNK_SIZE];
-	int nr;
-};
-
-static void write_chunk(struct buffer_chunk *chunk)
-{
-	int i;
-	for (i = 0; i < chunk->nr; i++) {
-		submit_logged_buffer(chunk->bh[i]);
-	}
-	chunk->nr = 0;
-}
-
-static void write_ordered_chunk(struct buffer_chunk *chunk)
-{
-	int i;
-	for (i = 0; i < chunk->nr; i++) {
-		submit_ordered_buffer(chunk->bh[i]);
-	}
-	chunk->nr = 0;
-}
-
-static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
-			spinlock_t * lock, void (fn) (struct buffer_chunk *))
-{
-	int ret = 0;
-	BUG_ON(chunk->nr >= CHUNK_SIZE);
-	chunk->bh[chunk->nr++] = bh;
-	if (chunk->nr >= CHUNK_SIZE) {
-		ret = 1;
-		if (lock) {
-			spin_unlock(lock);
-			fn(chunk);
-			spin_lock(lock);
-		} else {
-			fn(chunk);
-		}
-	}
-	return ret;
-}
-
-static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
-static struct reiserfs_jh *alloc_jh(void)
-{
-	struct reiserfs_jh *jh;
-	while (1) {
-		jh = kmalloc(sizeof(*jh), GFP_NOFS);
-		if (jh) {
-			atomic_inc(&nr_reiserfs_jh);
-			return jh;
-		}
-		yield();
-	}
-}
-
-/*
- * we want to free the jh when the buffer has been written
- * and waited on
- */
-void reiserfs_free_jh(struct buffer_head *bh)
-{
-	struct reiserfs_jh *jh;
-
-	jh = bh->b_private;
-	if (jh) {
-		bh->b_private = NULL;
-		jh->bh = NULL;
-		list_del_init(&jh->list);
-		kfree(jh);
-		if (atomic_read(&nr_reiserfs_jh) <= 0)
-			BUG();
-		atomic_dec(&nr_reiserfs_jh);
-		put_bh(bh);
-	}
-}
-
-static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
-			   int tail)
-{
-	struct reiserfs_jh *jh;
-
-	if (bh->b_private) {
-		spin_lock(&j->j_dirty_buffers_lock);
-		if (!bh->b_private) {
-			spin_unlock(&j->j_dirty_buffers_lock);
-			goto no_jh;
-		}
-		jh = bh->b_private;
-		list_del_init(&jh->list);
-	} else {
-no_jh:
-		get_bh(bh);
-		jh = alloc_jh();
-		spin_lock(&j->j_dirty_buffers_lock);
-		/*
-		 * buffer must be locked for __add_jh, should be able to have
-		 * two adds at the same time
-		 */
-		BUG_ON(bh->b_private);
-		jh->bh = bh;
-		bh->b_private = jh;
-	}
-	jh->jl = j->j_current_jl;
-	if (tail)
-		list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
-	else {
-		list_add_tail(&jh->list, &jh->jl->j_bh_list);
-	}
-	spin_unlock(&j->j_dirty_buffers_lock);
-	return 0;
-}
-
-int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
-{
-	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
-}
-int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
-{
-	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
-}
-
-#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
-static int write_ordered_buffers(spinlock_t * lock,
-				 struct reiserfs_journal *j,
-				 struct reiserfs_journal_list *jl,
-				 struct list_head *list)
-{
-	struct buffer_head *bh;
-	struct reiserfs_jh *jh;
-	int ret = j->j_errno;
-	struct buffer_chunk chunk;
-	struct list_head tmp;
-	INIT_LIST_HEAD(&tmp);
-
-	chunk.nr = 0;
-	spin_lock(lock);
-	while (!list_empty(list)) {
-		jh = JH_ENTRY(list->next);
-		bh = jh->bh;
-		get_bh(bh);
-		if (!trylock_buffer(bh)) {
-			if (!buffer_dirty(bh)) {
-				list_move(&jh->list, &tmp);
-				goto loop_next;
-			}
-			spin_unlock(lock);
-			if (chunk.nr)
-				write_ordered_chunk(&chunk);
-			wait_on_buffer(bh);
-			cond_resched();
-			spin_lock(lock);
-			goto loop_next;
-		}
-		/*
-		 * in theory, dirty non-uptodate buffers should never get here,
-		 * but the upper layer io error paths still have a few quirks.
-		 * Handle them here as gracefully as we can
-		 */
-		if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
-			clear_buffer_dirty(bh);
-			ret = -EIO;
-		}
-		if (buffer_dirty(bh)) {
-			list_move(&jh->list, &tmp);
-			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
-		} else {
-			reiserfs_free_jh(bh);
-			unlock_buffer(bh);
-		}
-loop_next:
-		put_bh(bh);
-		cond_resched_lock(lock);
-	}
-	if (chunk.nr) {
-		spin_unlock(lock);
-		write_ordered_chunk(&chunk);
-		spin_lock(lock);
-	}
-	while (!list_empty(&tmp)) {
-		jh = JH_ENTRY(tmp.prev);
-		bh = jh->bh;
-		get_bh(bh);
-		reiserfs_free_jh(bh);
-
-		if (buffer_locked(bh)) {
-			spin_unlock(lock);
-			wait_on_buffer(bh);
-			spin_lock(lock);
-		}
-		if (!buffer_uptodate(bh)) {
-			ret = -EIO;
-		}
-		/*
-		 * ugly interaction with invalidate_folio here.
-		 * reiserfs_invalidate_folio will pin any buffer that has a
-		 * valid journal head from an older transaction.  If someone
-		 * else sets our buffer dirty after we write it in the first
-		 * loop, and then someone truncates the page away, nobody
-		 * will ever write the buffer. We're safe if we write the
-		 * page one last time after freeing the journal header.
-		 */
-		if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) {
-			spin_unlock(lock);
-			write_dirty_buffer(bh, 0);
-			spin_lock(lock);
-		}
-		put_bh(bh);
-		cond_resched_lock(lock);
-	}
-	spin_unlock(lock);
-	return ret;
-}
-
-static int flush_older_commits(struct super_block *s,
-			       struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	struct reiserfs_journal_list *other_jl;
-	struct reiserfs_journal_list *first_jl;
-	struct list_head *entry;
-	unsigned int trans_id = jl->j_trans_id;
-	unsigned int other_trans_id;
-
-find_first:
-	/*
-	 * first we walk backwards to find the oldest uncommitted transation
-	 */
-	first_jl = jl;
-	entry = jl->j_list.prev;
-	while (1) {
-		other_jl = JOURNAL_LIST_ENTRY(entry);
-		if (entry == &journal->j_journal_list ||
-		    atomic_read(&other_jl->j_older_commits_done))
-			break;
-
-		first_jl = other_jl;
-		entry = other_jl->j_list.prev;
-	}
-
-	/* if we didn't find any older uncommitted transactions, return now */
-	if (first_jl == jl) {
-		return 0;
-	}
-
-	entry = &first_jl->j_list;
-	while (1) {
-		other_jl = JOURNAL_LIST_ENTRY(entry);
-		other_trans_id = other_jl->j_trans_id;
-
-		if (other_trans_id < trans_id) {
-			if (atomic_read(&other_jl->j_commit_left) != 0) {
-				flush_commit_list(s, other_jl, 0);
-
-				/* list we were called with is gone, return */
-				if (!journal_list_still_alive(s, trans_id))
-					return 1;
-
-				/*
-				 * the one we just flushed is gone, this means
-				 * all older lists are also gone, so first_jl
-				 * is no longer valid either.  Go back to the
-				 * beginning.
-				 */
-				if (!journal_list_still_alive
-				    (s, other_trans_id)) {
-					goto find_first;
-				}
-			}
-			entry = entry->next;
-			if (entry == &journal->j_journal_list)
-				return 0;
-		} else {
-			return 0;
-		}
-	}
-	return 0;
-}
-
-static int reiserfs_async_progress_wait(struct super_block *s)
-{
-	struct reiserfs_journal *j = SB_JOURNAL(s);
-
-	if (atomic_read(&j->j_async_throttle)) {
-		int depth;
-
-		depth = reiserfs_write_unlock_nested(s);
-		wait_var_event_timeout(&j->j_async_throttle,
-				       atomic_read(&j->j_async_throttle) == 0,
-				       HZ / 10);
-		reiserfs_write_lock_nested(s, depth);
-	}
-
-	return 0;
-}
-
-/*
- * if this journal list still has commit blocks unflushed, send them to disk.
- *
- * log areas must be flushed in order (transaction 2 can't commit before
- * transaction 1) Before the commit block can by written, every other log
- * block must be safely on disk
- */
-static int flush_commit_list(struct super_block *s,
-			     struct reiserfs_journal_list *jl, int flushall)
-{
-	int i;
-	b_blocknr_t bn;
-	struct buffer_head *tbh = NULL;
-	unsigned int trans_id = jl->j_trans_id;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int retval = 0;
-	int write_len;
-	int depth;
-
-	reiserfs_check_lock_depth(s, "flush_commit_list");
-
-	if (atomic_read(&jl->j_older_commits_done)) {
-		return 0;
-	}
-
-	/*
-	 * before we can put our commit blocks on disk, we have to make
-	 * sure everyone older than us is on disk too
-	 */
-	BUG_ON(jl->j_len <= 0);
-	BUG_ON(trans_id == journal->j_trans_id);
-
-	get_journal_list(jl);
-	if (flushall) {
-		if (flush_older_commits(s, jl) == 1) {
-			/*
-			 * list disappeared during flush_older_commits.
-			 * return
-			 */
-			goto put_jl;
-		}
-	}
-
-	/* make sure nobody is trying to flush this one at the same time */
-	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
-
-	if (!journal_list_still_alive(s, trans_id)) {
-		mutex_unlock(&jl->j_commit_mutex);
-		goto put_jl;
-	}
-	BUG_ON(jl->j_trans_id == 0);
-
-	/* this commit is done, exit */
-	if (atomic_read(&jl->j_commit_left) <= 0) {
-		if (flushall) {
-			atomic_set(&jl->j_older_commits_done, 1);
-		}
-		mutex_unlock(&jl->j_commit_mutex);
-		goto put_jl;
-	}
-
-	if (!list_empty(&jl->j_bh_list)) {
-		int ret;
-
-		/*
-		 * We might sleep in numerous places inside
-		 * write_ordered_buffers. Relax the write lock.
-		 */
-		depth = reiserfs_write_unlock_nested(s);
-		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
-					    journal, jl, &jl->j_bh_list);
-		if (ret < 0 && retval == 0)
-			retval = ret;
-		reiserfs_write_lock_nested(s, depth);
-	}
-	BUG_ON(!list_empty(&jl->j_bh_list));
-	/*
-	 * for the description block and all the log blocks, submit any buffers
-	 * that haven't already reached the disk.  Try to write at least 256
-	 * log blocks. later on, we will only wait on blocks that correspond
-	 * to this transaction, but while we're unplugging we might as well
-	 * get a chunk of data on there.
-	 */
-	atomic_inc(&journal->j_async_throttle);
-	write_len = jl->j_len + 1;
-	if (write_len < 256)
-		write_len = 256;
-	for (i = 0 ; i < write_len ; i++) {
-		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
-		    SB_ONDISK_JOURNAL_SIZE(s);
-		tbh = journal_find_get_block(s, bn);
-		if (tbh) {
-			if (buffer_dirty(tbh)) {
-		            depth = reiserfs_write_unlock_nested(s);
-			    write_dirty_buffer(tbh, 0);
-			    reiserfs_write_lock_nested(s, depth);
-			}
-			put_bh(tbh) ;
-		}
-	}
-	if (atomic_dec_and_test(&journal->j_async_throttle))
-		wake_up_var(&journal->j_async_throttle);
-
-	for (i = 0; i < (jl->j_len + 1); i++) {
-		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
-		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
-		tbh = journal_find_get_block(s, bn);
-
-		depth = reiserfs_write_unlock_nested(s);
-		__wait_on_buffer(tbh);
-		reiserfs_write_lock_nested(s, depth);
-		/*
-		 * since we're using ll_rw_blk above, it might have skipped
-		 * over a locked buffer.  Double check here
-		 */
-		/* redundant, sync_dirty_buffer() checks */
-		if (buffer_dirty(tbh)) {
-			depth = reiserfs_write_unlock_nested(s);
-			sync_dirty_buffer(tbh);
-			reiserfs_write_lock_nested(s, depth);
-		}
-		if (unlikely(!buffer_uptodate(tbh))) {
-#ifdef CONFIG_REISERFS_CHECK
-			reiserfs_warning(s, "journal-601",
-					 "buffer write failed");
-#endif
-			retval = -EIO;
-		}
-		/* once for journal_find_get_block */
-		put_bh(tbh);
-		/* once due to original getblk in do_journal_end */
-		put_bh(tbh);
-		atomic_dec(&jl->j_commit_left);
-	}
-
-	BUG_ON(atomic_read(&jl->j_commit_left) != 1);
-
-	/*
-	 * If there was a write error in the journal - we can't commit
-	 * this transaction - it will be invalid and, if successful,
-	 * will just end up propagating the write error out to
-	 * the file system.
-	 */
-	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
-		if (buffer_dirty(jl->j_commit_bh))
-			BUG();
-		mark_buffer_dirty(jl->j_commit_bh) ;
-		depth = reiserfs_write_unlock_nested(s);
-		if (reiserfs_barrier_flush(s))
-			__sync_dirty_buffer(jl->j_commit_bh,
-					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
-		else
-			sync_dirty_buffer(jl->j_commit_bh);
-		reiserfs_write_lock_nested(s, depth);
-	}
-
-	/*
-	 * If there was a write error in the journal - we can't commit this
-	 * transaction - it will be invalid and, if successful, will just end
-	 * up propagating the write error out to the filesystem.
-	 */
-	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
-#ifdef CONFIG_REISERFS_CHECK
-		reiserfs_warning(s, "journal-615", "buffer write failed");
-#endif
-		retval = -EIO;
-	}
-	bforget(jl->j_commit_bh);
-	if (journal->j_last_commit_id != 0 &&
-	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
-		reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
-				 journal->j_last_commit_id, jl->j_trans_id);
-	}
-	journal->j_last_commit_id = jl->j_trans_id;
-
-	/*
-	 * now, every commit block is on the disk.  It is safe to allow
-	 * blocks freed during this transaction to be reallocated
-	 */
-	cleanup_freed_for_journal_list(s, jl);
-
-	retval = retval ? retval : journal->j_errno;
-
-	/* mark the metadata dirty */
-	if (!retval)
-		dirty_one_transaction(s, jl);
-	atomic_dec(&jl->j_commit_left);
-
-	if (flushall) {
-		atomic_set(&jl->j_older_commits_done, 1);
-	}
-	mutex_unlock(&jl->j_commit_mutex);
-put_jl:
-	put_journal_list(s, jl);
-
-	if (retval)
-		reiserfs_abort(s, retval, "Journal write error in %s",
-			       __func__);
-	return retval;
-}
-
-/*
- * flush_journal_list frequently needs to find a newer transaction for a
- * given block.  This does that, or returns NULL if it can't find anything
- */
-static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
-							  reiserfs_journal_cnode
-							  *cn)
-{
-	struct super_block *sb = cn->sb;
-	b_blocknr_t blocknr = cn->blocknr;
-
-	cn = cn->hprev;
-	while (cn) {
-		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
-			return cn->jlist;
-		}
-		cn = cn->hprev;
-	}
-	return NULL;
-}
-
-static void remove_journal_hash(struct super_block *,
-				struct reiserfs_journal_cnode **,
-				struct reiserfs_journal_list *, unsigned long,
-				int);
-
-/*
- * once all the real blocks have been flushed, it is safe to remove them
- * from the journal list for this transaction.  Aside from freeing the
- * cnode, this also allows the block to be reallocated for data blocks
- * if it had been deleted.
- */
-static void remove_all_from_journal_list(struct super_block *sb,
-					 struct reiserfs_journal_list *jl,
-					 int debug)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_cnode *cn, *last;
-	cn = jl->j_realblock;
-
-	/*
-	 * which is better, to lock once around the whole loop, or
-	 * to lock for each call to remove_journal_hash?
-	 */
-	while (cn) {
-		if (cn->blocknr != 0) {
-			if (debug) {
-				reiserfs_warning(sb, "reiserfs-2201",
-						 "block %u, bh is %d, state %ld",
-						 cn->blocknr, cn->bh ? 1 : 0,
-						 cn->state);
-			}
-			cn->state = 0;
-			remove_journal_hash(sb, journal->j_list_hash_table,
-					    jl, cn->blocknr, 1);
-		}
-		last = cn;
-		cn = cn->next;
-		free_cnode(sb, last);
-	}
-	jl->j_realblock = NULL;
-}
-
-/*
- * if this timestamp is greater than the timestamp we wrote last to the
- * header block, write it to the header block.  once this is done, I can
- * safely say the log area for this transaction won't ever be replayed,
- * and I can start releasing blocks in this transaction for reuse as data
- * blocks.  called by flush_journal_list, before it calls
- * remove_all_from_journal_list
- */
-static int _update_journal_header_block(struct super_block *sb,
-					unsigned long offset,
-					unsigned int trans_id)
-{
-	struct reiserfs_journal_header *jh;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	int depth;
-
-	if (reiserfs_is_journal_aborted(journal))
-		return -EIO;
-
-	if (trans_id >= journal->j_last_flush_trans_id) {
-		if (buffer_locked((journal->j_header_bh))) {
-			depth = reiserfs_write_unlock_nested(sb);
-			__wait_on_buffer(journal->j_header_bh);
-			reiserfs_write_lock_nested(sb, depth);
-			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
-#ifdef CONFIG_REISERFS_CHECK
-				reiserfs_warning(sb, "journal-699",
-						 "buffer write failed");
-#endif
-				return -EIO;
-			}
-		}
-		journal->j_last_flush_trans_id = trans_id;
-		journal->j_first_unflushed_offset = offset;
-		jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
-							b_data);
-		jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
-		jh->j_first_unflushed_offset = cpu_to_le32(offset);
-		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
-
-		set_buffer_dirty(journal->j_header_bh);
-		depth = reiserfs_write_unlock_nested(sb);
-
-		if (reiserfs_barrier_flush(sb))
-			__sync_dirty_buffer(journal->j_header_bh,
-					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
-		else
-			sync_dirty_buffer(journal->j_header_bh);
-
-		reiserfs_write_lock_nested(sb, depth);
-		if (!buffer_uptodate(journal->j_header_bh)) {
-			reiserfs_warning(sb, "journal-837",
-					 "IO error during journal replay");
-			return -EIO;
-		}
-	}
-	return 0;
-}
-
-static int update_journal_header_block(struct super_block *sb,
-				       unsigned long offset,
-				       unsigned int trans_id)
-{
-	return _update_journal_header_block(sb, offset, trans_id);
-}
-
-/*
-** flush any and all journal lists older than you are
-** can only be called from flush_journal_list
-*/
-static int flush_older_journal_lists(struct super_block *sb,
-				     struct reiserfs_journal_list *jl)
-{
-	struct list_head *entry;
-	struct reiserfs_journal_list *other_jl;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	unsigned int trans_id = jl->j_trans_id;
-
-	/*
-	 * we know we are the only ones flushing things, no extra race
-	 * protection is required.
-	 */
-restart:
-	entry = journal->j_journal_list.next;
-	/* Did we wrap? */
-	if (entry == &journal->j_journal_list)
-		return 0;
-	other_jl = JOURNAL_LIST_ENTRY(entry);
-	if (other_jl->j_trans_id < trans_id) {
-		BUG_ON(other_jl->j_refcount <= 0);
-		/* do not flush all */
-		flush_journal_list(sb, other_jl, 0);
-
-		/* other_jl is now deleted from the list */
-		goto restart;
-	}
-	return 0;
-}
-
-static void del_from_work_list(struct super_block *s,
-			       struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	if (!list_empty(&jl->j_working_list)) {
-		list_del_init(&jl->j_working_list);
-		journal->j_num_work_lists--;
-	}
-}
-
-/*
- * flush a journal list, both commit and real blocks
- *
- * always set flushall to 1, unless you are calling from inside
- * flush_journal_list
- *
- * IMPORTANT.  This can only be called while there are no journal writers,
- * and the journal is locked.  That means it can only be called from
- * do_journal_end, or by journal_release
- */
-static int flush_journal_list(struct super_block *s,
-			      struct reiserfs_journal_list *jl, int flushall)
-{
-	struct reiserfs_journal_list *pjl;
-	struct reiserfs_journal_cnode *cn;
-	int count;
-	int was_jwait = 0;
-	int was_dirty = 0;
-	struct buffer_head *saved_bh;
-	unsigned long j_len_saved = jl->j_len;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int err = 0;
-	int depth;
-
-	BUG_ON(j_len_saved <= 0);
-
-	if (atomic_read(&journal->j_wcount) != 0) {
-		reiserfs_warning(s, "clm-2048", "called with wcount %d",
-				 atomic_read(&journal->j_wcount));
-	}
-
-	/* if flushall == 0, the lock is already held */
-	if (flushall) {
-		reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
-	} else if (mutex_trylock(&journal->j_flush_mutex)) {
-		BUG();
-	}
-
-	count = 0;
-	if (j_len_saved > journal->j_trans_max) {
-		reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
-			       j_len_saved, jl->j_trans_id);
-		return 0;
-	}
-
-	/* if all the work is already done, get out of here */
-	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
-	    atomic_read(&jl->j_commit_left) <= 0) {
-		goto flush_older_and_return;
-	}
-
-	/*
-	 * start by putting the commit list on disk.  This will also flush
-	 * the commit lists of any olders transactions
-	 */
-	flush_commit_list(s, jl, 1);
-
-	if (!(jl->j_state & LIST_DIRTY)
-	    && !reiserfs_is_journal_aborted(journal))
-		BUG();
-
-	/* are we done now? */
-	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
-	    atomic_read(&jl->j_commit_left) <= 0) {
-		goto flush_older_and_return;
-	}
-
-	/*
-	 * loop through each cnode, see if we need to write it,
-	 * or wait on a more recent transaction, or just ignore it
-	 */
-	if (atomic_read(&journal->j_wcount) != 0) {
-		reiserfs_panic(s, "journal-844", "journal list is flushing, "
-			       "wcount is not 0");
-	}
-	cn = jl->j_realblock;
-	while (cn) {
-		was_jwait = 0;
-		was_dirty = 0;
-		saved_bh = NULL;
-		/* blocknr of 0 is no longer in the hash, ignore it */
-		if (cn->blocknr == 0) {
-			goto free_cnode;
-		}
-
-		/*
-		 * This transaction failed commit.
-		 * Don't write out to the disk
-		 */
-		if (!(jl->j_state & LIST_DIRTY))
-			goto free_cnode;
-
-		pjl = find_newer_jl_for_cn(cn);
-		/*
-		 * the order is important here.  We check pjl to make sure we
-		 * don't clear BH_JDirty_wait if we aren't the one writing this
-		 * block to disk
-		 */
-		if (!pjl && cn->bh) {
-			saved_bh = cn->bh;
-
-			/*
-			 * we do this to make sure nobody releases the
-			 * buffer while we are working with it
-			 */
-			get_bh(saved_bh);
-
-			if (buffer_journal_dirty(saved_bh)) {
-				BUG_ON(!can_dirty(cn));
-				was_jwait = 1;
-				was_dirty = 1;
-			} else if (can_dirty(cn)) {
-				/*
-				 * everything with !pjl && jwait
-				 * should be writable
-				 */
-				BUG();
-			}
-		}
-
-		/*
-		 * if someone has this block in a newer transaction, just make
-		 * sure they are committed, and don't try writing it to disk
-		 */
-		if (pjl) {
-			if (atomic_read(&pjl->j_commit_left))
-				flush_commit_list(s, pjl, 1);
-			goto free_cnode;
-		}
-
-		/*
-		 * bh == NULL when the block got to disk on its own, OR,
-		 * the block got freed in a future transaction
-		 */
-		if (saved_bh == NULL) {
-			goto free_cnode;
-		}
-
-		/*
-		 * this should never happen.  kupdate_one_transaction has
-		 * this list locked while it works, so we should never see a
-		 * buffer here that is not marked JDirty_wait
-		 */
-		if ((!was_jwait) && !buffer_locked(saved_bh)) {
-			reiserfs_warning(s, "journal-813",
-					 "BAD! buffer %llu %cdirty %cjwait, "
-					 "not in a newer transaction",
-					 (unsigned long long)saved_bh->
-					 b_blocknr, was_dirty ? ' ' : '!',
-					 was_jwait ? ' ' : '!');
-		}
-		if (was_dirty) {
-			/*
-			 * we inc again because saved_bh gets decremented
-			 * at free_cnode
-			 */
-			get_bh(saved_bh);
-			set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
-			lock_buffer(saved_bh);
-			BUG_ON(cn->blocknr != saved_bh->b_blocknr);
-			if (buffer_dirty(saved_bh))
-				submit_logged_buffer(saved_bh);
-			else
-				unlock_buffer(saved_bh);
-			count++;
-		} else {
-			reiserfs_warning(s, "clm-2082",
-					 "Unable to flush buffer %llu in %s",
-					 (unsigned long long)saved_bh->
-					 b_blocknr, __func__);
-		}
-free_cnode:
-		cn = cn->next;
-		if (saved_bh) {
-			/*
-			 * we incremented this to keep others from
-			 * taking the buffer head away
-			 */
-			put_bh(saved_bh);
-			if (atomic_read(&saved_bh->b_count) < 0) {
-				reiserfs_warning(s, "journal-945",
-						 "saved_bh->b_count < 0");
-			}
-		}
-	}
-	if (count > 0) {
-		cn = jl->j_realblock;
-		while (cn) {
-			if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
-				if (!cn->bh) {
-					reiserfs_panic(s, "journal-1011",
-						       "cn->bh is NULL");
-				}
-
-				depth = reiserfs_write_unlock_nested(s);
-				__wait_on_buffer(cn->bh);
-				reiserfs_write_lock_nested(s, depth);
-
-				if (!cn->bh) {
-					reiserfs_panic(s, "journal-1012",
-						       "cn->bh is NULL");
-				}
-				if (unlikely(!buffer_uptodate(cn->bh))) {
-#ifdef CONFIG_REISERFS_CHECK
-					reiserfs_warning(s, "journal-949",
-							 "buffer write failed");
-#endif
-					err = -EIO;
-				}
-				/*
-				 * note, we must clear the JDirty_wait bit
-				 * after the up to date check, otherwise we
-				 * race against our flushpage routine
-				 */
-				BUG_ON(!test_clear_buffer_journal_dirty
-				       (cn->bh));
-
-				/* drop one ref for us */
-				put_bh(cn->bh);
-				/* drop one ref for journal_mark_dirty */
-				release_buffer_page(cn->bh);
-			}
-			cn = cn->next;
-		}
-	}
-
-	if (err)
-		reiserfs_abort(s, -EIO,
-			       "Write error while pushing transaction to disk in %s",
-			       __func__);
-flush_older_and_return:
-
-	/*
-	 * before we can update the journal header block, we _must_ flush all
-	 * real blocks from all older transactions to disk.  This is because
-	 * once the header block is updated, this transaction will not be
-	 * replayed after a crash
-	 */
-	if (flushall) {
-		flush_older_journal_lists(s, jl);
-	}
-
-	err = journal->j_errno;
-	/*
-	 * before we can remove everything from the hash tables for this
-	 * transaction, we must make sure it can never be replayed
-	 *
-	 * since we are only called from do_journal_end, we know for sure there
-	 * are no allocations going on while we are flushing journal lists.  So,
-	 * we only need to update the journal header block for the last list
-	 * being flushed
-	 */
-	if (!err && flushall) {
-		err =
-		    update_journal_header_block(s,
-						(jl->j_start + jl->j_len +
-						 2) % SB_ONDISK_JOURNAL_SIZE(s),
-						jl->j_trans_id);
-		if (err)
-			reiserfs_abort(s, -EIO,
-				       "Write error while updating journal header in %s",
-				       __func__);
-	}
-	remove_all_from_journal_list(s, jl, 0);
-	list_del_init(&jl->j_list);
-	journal->j_num_lists--;
-	del_from_work_list(s, jl);
-
-	if (journal->j_last_flush_id != 0 &&
-	    (jl->j_trans_id - journal->j_last_flush_id) != 1) {
-		reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
-				 journal->j_last_flush_id, jl->j_trans_id);
-	}
-	journal->j_last_flush_id = jl->j_trans_id;
-
-	/*
-	 * not strictly required since we are freeing the list, but it should
-	 * help find code using dead lists later on
-	 */
-	jl->j_len = 0;
-	atomic_set(&jl->j_nonzerolen, 0);
-	jl->j_start = 0;
-	jl->j_realblock = NULL;
-	jl->j_commit_bh = NULL;
-	jl->j_trans_id = 0;
-	jl->j_state = 0;
-	put_journal_list(s, jl);
-	if (flushall)
-		mutex_unlock(&journal->j_flush_mutex);
-	return err;
-}
-
-static int write_one_transaction(struct super_block *s,
-				 struct reiserfs_journal_list *jl,
-				 struct buffer_chunk *chunk)
-{
-	struct reiserfs_journal_cnode *cn;
-	int ret = 0;
-
-	jl->j_state |= LIST_TOUCHED;
-	del_from_work_list(s, jl);
-	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
-		return 0;
-	}
-
-	cn = jl->j_realblock;
-	while (cn) {
-		/*
-		 * if the blocknr == 0, this has been cleared from the hash,
-		 * skip it
-		 */
-		if (cn->blocknr == 0) {
-			goto next;
-		}
-		if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
-			struct buffer_head *tmp_bh;
-			/*
-			 * we can race against journal_mark_freed when we try
-			 * to lock_buffer(cn->bh), so we have to inc the buffer
-			 * count, and recheck things after locking
-			 */
-			tmp_bh = cn->bh;
-			get_bh(tmp_bh);
-			lock_buffer(tmp_bh);
-			if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
-				if (!buffer_journal_dirty(tmp_bh) ||
-				    buffer_journal_prepared(tmp_bh))
-					BUG();
-				add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
-				ret++;
-			} else {
-				/* note, cn->bh might be null now */
-				unlock_buffer(tmp_bh);
-			}
-			put_bh(tmp_bh);
-		}
-next:
-		cn = cn->next;
-		cond_resched();
-	}
-	return ret;
-}
-
-/* used by flush_commit_list */
-static void dirty_one_transaction(struct super_block *s,
-				 struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_journal_cnode *cn;
-	struct reiserfs_journal_list *pjl;
-
-	jl->j_state |= LIST_DIRTY;
-	cn = jl->j_realblock;
-	while (cn) {
-		/*
-		 * look for a more recent transaction that logged this
-		 * buffer.  Only the most recent transaction with a buffer in
-		 * it is allowed to send that buffer to disk
-		 */
-		pjl = find_newer_jl_for_cn(cn);
-		if (!pjl && cn->blocknr && cn->bh
-		    && buffer_journal_dirty(cn->bh)) {
-			BUG_ON(!can_dirty(cn));
-			/*
-			 * if the buffer is prepared, it will either be logged
-			 * or restored.  If restored, we need to make sure
-			 * it actually gets marked dirty
-			 */
-			clear_buffer_journal_new(cn->bh);
-			if (buffer_journal_prepared(cn->bh)) {
-				set_buffer_journal_restore_dirty(cn->bh);
-			} else {
-				set_buffer_journal_test(cn->bh);
-				mark_buffer_dirty(cn->bh);
-			}
-		}
-		cn = cn->next;
-	}
-}
-
-static int kupdate_transactions(struct super_block *s,
-				struct reiserfs_journal_list *jl,
-				struct reiserfs_journal_list **next_jl,
-				unsigned int *next_trans_id,
-				int num_blocks, int num_trans)
-{
-	int ret = 0;
-	int written = 0;
-	int transactions_flushed = 0;
-	unsigned int orig_trans_id = jl->j_trans_id;
-	struct buffer_chunk chunk;
-	struct list_head *entry;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	chunk.nr = 0;
-
-	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
-	if (!journal_list_still_alive(s, orig_trans_id)) {
-		goto done;
-	}
-
-	/*
-	 * we've got j_flush_mutex held, nobody is going to delete any
-	 * of these lists out from underneath us
-	 */
-	while ((num_trans && transactions_flushed < num_trans) ||
-	       (!num_trans && written < num_blocks)) {
-
-		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
-		    atomic_read(&jl->j_commit_left)
-		    || !(jl->j_state & LIST_DIRTY)) {
-			del_from_work_list(s, jl);
-			break;
-		}
-		ret = write_one_transaction(s, jl, &chunk);
-
-		if (ret < 0)
-			goto done;
-		transactions_flushed++;
-		written += ret;
-		entry = jl->j_list.next;
-
-		/* did we wrap? */
-		if (entry == &journal->j_journal_list) {
-			break;
-		}
-		jl = JOURNAL_LIST_ENTRY(entry);
-
-		/* don't bother with older transactions */
-		if (jl->j_trans_id <= orig_trans_id)
-			break;
-	}
-	if (chunk.nr) {
-		write_chunk(&chunk);
-	}
-
-done:
-	mutex_unlock(&journal->j_flush_mutex);
-	return ret;
-}
-
-/*
- * for o_sync and fsync heavy applications, they tend to use
- * all the journa list slots with tiny transactions.  These
- * trigger lots and lots of calls to update the header block, which
- * adds seeks and slows things down.
- *
- * This function tries to clear out a large chunk of the journal lists
- * at once, which makes everything faster since only the newest journal
- * list updates the header block
- */
-static int flush_used_journal_lists(struct super_block *s,
-				    struct reiserfs_journal_list *jl)
-{
-	unsigned long len = 0;
-	unsigned long cur_len;
-	int i;
-	int limit = 256;
-	struct reiserfs_journal_list *tjl;
-	struct reiserfs_journal_list *flush_jl;
-	unsigned int trans_id;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-
-	flush_jl = tjl = jl;
-
-	/* in data logging mode, try harder to flush a lot of blocks */
-	if (reiserfs_data_log(s))
-		limit = 1024;
-	/* flush for 256 transactions or limit blocks, whichever comes first */
-	for (i = 0; i < 256 && len < limit; i++) {
-		if (atomic_read(&tjl->j_commit_left) ||
-		    tjl->j_trans_id < jl->j_trans_id) {
-			break;
-		}
-		cur_len = atomic_read(&tjl->j_nonzerolen);
-		if (cur_len > 0) {
-			tjl->j_state &= ~LIST_TOUCHED;
-		}
-		len += cur_len;
-		flush_jl = tjl;
-		if (tjl->j_list.next == &journal->j_journal_list)
-			break;
-		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
-	}
-	get_journal_list(jl);
-	get_journal_list(flush_jl);
-
-	/*
-	 * try to find a group of blocks we can flush across all the
-	 * transactions, but only bother if we've actually spanned
-	 * across multiple lists
-	 */
-	if (flush_jl != jl)
-		kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
-
-	flush_journal_list(s, flush_jl, 1);
-	put_journal_list(s, flush_jl);
-	put_journal_list(s, jl);
-	return 0;
-}
-
-/*
- * removes any nodes in table with name block and dev as bh.
- * only touchs the hnext and hprev pointers.
- */
-static void remove_journal_hash(struct super_block *sb,
-			 struct reiserfs_journal_cnode **table,
-			 struct reiserfs_journal_list *jl,
-			 unsigned long block, int remove_freed)
-{
-	struct reiserfs_journal_cnode *cur;
-	struct reiserfs_journal_cnode **head;
-
-	head = &(journal_hash(table, sb, block));
-	if (!head) {
-		return;
-	}
-	cur = *head;
-	while (cur) {
-		if (cur->blocknr == block && cur->sb == sb
-		    && (jl == NULL || jl == cur->jlist)
-		    && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
-			if (cur->hnext) {
-				cur->hnext->hprev = cur->hprev;
-			}
-			if (cur->hprev) {
-				cur->hprev->hnext = cur->hnext;
-			} else {
-				*head = cur->hnext;
-			}
-			cur->blocknr = 0;
-			cur->sb = NULL;
-			cur->state = 0;
-			/*
-			 * anybody who clears the cur->bh will also
-			 * dec the nonzerolen
-			 */
-			if (cur->bh && cur->jlist)
-				atomic_dec(&cur->jlist->j_nonzerolen);
-			cur->bh = NULL;
-			cur->jlist = NULL;
-		}
-		cur = cur->hnext;
-	}
-}
-
-static void free_journal_ram(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	kfree(journal->j_current_jl);
-	journal->j_num_lists--;
-
-	vfree(journal->j_cnode_free_orig);
-	free_list_bitmaps(sb, journal->j_list_bitmap);
-	free_bitmap_nodes(sb);	/* must be after free_list_bitmaps */
-	if (journal->j_header_bh) {
-		brelse(journal->j_header_bh);
-	}
-	/*
-	 * j_header_bh is on the journal dev, make sure
-	 * not to release the journal dev until we brelse j_header_bh
-	 */
-	release_journal_dev(journal);
-	vfree(journal);
-}
-
-/*
- * call on unmount.  Only set error to 1 if you haven't made your way out
- * of read_super() yet.  Any other caller must keep error at 0.
- */
-static int do_journal_release(struct reiserfs_transaction_handle *th,
-			      struct super_block *sb, int error)
-{
-	struct reiserfs_transaction_handle myth;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	/*
-	 * we only want to flush out transactions if we were
-	 * called with error == 0
-	 */
-	if (!error && !sb_rdonly(sb)) {
-		/* end the current trans */
-		BUG_ON(!th->t_trans_id);
-		do_journal_end(th, FLUSH_ALL);
-
-		/*
-		 * make sure something gets logged to force
-		 * our way into the flush code
-		 */
-		if (!journal_join(&myth, sb)) {
-			reiserfs_prepare_for_journal(sb,
-						     SB_BUFFER_WITH_SB(sb),
-						     1);
-			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
-			do_journal_end(&myth, FLUSH_ALL);
-		}
-	}
-
-	/* this also catches errors during the do_journal_end above */
-	if (!error && reiserfs_is_journal_aborted(journal)) {
-		memset(&myth, 0, sizeof(myth));
-		if (!journal_join_abort(&myth, sb)) {
-			reiserfs_prepare_for_journal(sb,
-						     SB_BUFFER_WITH_SB(sb),
-						     1);
-			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
-			do_journal_end(&myth, FLUSH_ALL);
-		}
-	}
-
-
-	/*
-	 * We must release the write lock here because
-	 * the workqueue job (flush_async_commit) needs this lock
-	 */
-	reiserfs_write_unlock(sb);
-
-	/*
-	 * Cancel flushing of old commits. Note that neither of these works
-	 * will be requeued because superblock is being shutdown and doesn't
-	 * have SB_ACTIVE set.
-	 */
-	reiserfs_cancel_old_flush(sb);
-	/* wait for all commits to finish */
-	cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
-
-	free_journal_ram(sb);
-
-	reiserfs_write_lock(sb);
-
-	return 0;
-}
-
-/* * call on unmount.  flush all journal trans, release all alloc'd ram */
-int journal_release(struct reiserfs_transaction_handle *th,
-		    struct super_block *sb)
-{
-	return do_journal_release(th, sb, 0);
-}
-
-/* only call from an error condition inside reiserfs_read_super!  */
-int journal_release_error(struct reiserfs_transaction_handle *th,
-			  struct super_block *sb)
-{
-	return do_journal_release(th, sb, 1);
-}
-
-/*
- * compares description block with commit block.
- * returns 1 if they differ, 0 if they are the same
- */
-static int journal_compare_desc_commit(struct super_block *sb,
-				       struct reiserfs_journal_desc *desc,
-				       struct reiserfs_journal_commit *commit)
-{
-	if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
-	    get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
-	    get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
-	    get_commit_trans_len(commit) <= 0) {
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * returns 0 if it did not find a description block
- * returns -1 if it found a corrupt commit block
- * returns 1 if both desc and commit were valid
- * NOTE: only called during fs mount
- */
-static int journal_transaction_is_valid(struct super_block *sb,
-					struct buffer_head *d_bh,
-					unsigned int *oldest_invalid_trans_id,
-					unsigned long *newest_mount_id)
-{
-	struct reiserfs_journal_desc *desc;
-	struct reiserfs_journal_commit *commit;
-	struct buffer_head *c_bh;
-	unsigned long offset;
-
-	if (!d_bh)
-		return 0;
-
-	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
-	if (get_desc_trans_len(desc) > 0
-	    && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
-		if (oldest_invalid_trans_id && *oldest_invalid_trans_id
-		    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
-			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-				       "journal-986: transaction "
-				       "is valid returning because trans_id %d is greater than "
-				       "oldest_invalid %lu",
-				       get_desc_trans_id(desc),
-				       *oldest_invalid_trans_id);
-			return 0;
-		}
-		if (newest_mount_id
-		    && *newest_mount_id > get_desc_mount_id(desc)) {
-			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-				       "journal-1087: transaction "
-				       "is valid returning because mount_id %d is less than "
-				       "newest_mount_id %lu",
-				       get_desc_mount_id(desc),
-				       *newest_mount_id);
-			return -1;
-		}
-		if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
-			reiserfs_warning(sb, "journal-2018",
-					 "Bad transaction length %d "
-					 "encountered, ignoring transaction",
-					 get_desc_trans_len(desc));
-			return -1;
-		}
-		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-
-		/*
-		 * ok, we have a journal description block,
-		 * let's see if the transaction was valid
-		 */
-		c_bh =
-		    journal_bread(sb,
-				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-				  ((offset + get_desc_trans_len(desc) +
-				    1) % SB_ONDISK_JOURNAL_SIZE(sb)));
-		if (!c_bh)
-			return 0;
-		commit = (struct reiserfs_journal_commit *)c_bh->b_data;
-		if (journal_compare_desc_commit(sb, desc, commit)) {
-			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-				       "journal_transaction_is_valid, commit offset %ld had bad "
-				       "time %d or length %d",
-				       c_bh->b_blocknr -
-				       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-				       get_commit_trans_id(commit),
-				       get_commit_trans_len(commit));
-			brelse(c_bh);
-			if (oldest_invalid_trans_id) {
-				*oldest_invalid_trans_id =
-				    get_desc_trans_id(desc);
-				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-					       "journal-1004: "
-					       "transaction_is_valid setting oldest invalid trans_id "
-					       "to %d",
-					       get_desc_trans_id(desc));
-			}
-			return -1;
-		}
-		brelse(c_bh);
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal-1006: found valid "
-			       "transaction start offset %llu, len %d id %d",
-			       d_bh->b_blocknr -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-			       get_desc_trans_len(desc),
-			       get_desc_trans_id(desc));
-		return 1;
-	} else {
-		return 0;
-	}
-}
-
-static void brelse_array(struct buffer_head **heads, int num)
-{
-	int i;
-	for (i = 0; i < num; i++) {
-		brelse(heads[i]);
-	}
-}
-
-/*
- * given the start, and values for the oldest acceptable transactions,
- * this either reads in a replays a transaction, or returns because the
- * transaction is invalid, or too old.
- * NOTE: only called during fs mount
- */
-static int journal_read_transaction(struct super_block *sb,
-				    unsigned long cur_dblock,
-				    unsigned long oldest_start,
-				    unsigned int oldest_trans_id,
-				    unsigned long newest_mount_id)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_desc *desc;
-	struct reiserfs_journal_commit *commit;
-	unsigned int trans_id = 0;
-	struct buffer_head *c_bh;
-	struct buffer_head *d_bh;
-	struct buffer_head **log_blocks = NULL;
-	struct buffer_head **real_blocks = NULL;
-	unsigned int trans_offset;
-	int i;
-	int trans_half;
-
-	d_bh = journal_bread(sb, cur_dblock);
-	if (!d_bh)
-		return 1;
-	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
-	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
-		       "journal_read_transaction, offset %llu, len %d mount_id %d",
-		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-		       get_desc_trans_len(desc), get_desc_mount_id(desc));
-	if (get_desc_trans_id(desc) < oldest_trans_id) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
-			       "journal_read_trans skipping because %lu is too old",
-			       cur_dblock -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
-		brelse(d_bh);
-		return 1;
-	}
-	if (get_desc_mount_id(desc) != newest_mount_id) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
-			       "journal_read_trans skipping because %d is != "
-			       "newest_mount_id %lu", get_desc_mount_id(desc),
-			       newest_mount_id);
-		brelse(d_bh);
-		return 1;
-	}
-	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-			     ((trans_offset + get_desc_trans_len(desc) + 1) %
-			      SB_ONDISK_JOURNAL_SIZE(sb)));
-	if (!c_bh) {
-		brelse(d_bh);
-		return 1;
-	}
-	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
-	if (journal_compare_desc_commit(sb, desc, commit)) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal_read_transaction, "
-			       "commit offset %llu had bad time %d or length %d",
-			       c_bh->b_blocknr -
-			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-			       get_commit_trans_id(commit),
-			       get_commit_trans_len(commit));
-		brelse(c_bh);
-		brelse(d_bh);
-		return 1;
-	}
-
-	if (bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		brelse(c_bh);
-		brelse(d_bh);
-		return -EROFS;
-	}
-
-	trans_id = get_desc_trans_id(desc);
-	/*
-	 * now we know we've got a good transaction, and it was
-	 * inside the valid time ranges
-	 */
-	log_blocks = kmalloc_array(get_desc_trans_len(desc),
-				   sizeof(struct buffer_head *),
-				   GFP_NOFS);
-	real_blocks = kmalloc_array(get_desc_trans_len(desc),
-				    sizeof(struct buffer_head *),
-				    GFP_NOFS);
-	if (!log_blocks || !real_blocks) {
-		brelse(c_bh);
-		brelse(d_bh);
-		kfree(log_blocks);
-		kfree(real_blocks);
-		reiserfs_warning(sb, "journal-1169",
-				 "kmalloc failed, unable to mount FS");
-		return -1;
-	}
-	/* get all the buffer heads */
-	trans_half = journal_trans_half(sb->s_blocksize);
-	for (i = 0; i < get_desc_trans_len(desc); i++) {
-		log_blocks[i] =
-		    journal_getblk(sb,
-				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-				   (trans_offset + 1 +
-				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
-		if (i < trans_half) {
-			real_blocks[i] =
-			    sb_getblk(sb,
-				      le32_to_cpu(desc->j_realblock[i]));
-		} else {
-			real_blocks[i] =
-			    sb_getblk(sb,
-				      le32_to_cpu(commit->
-						  j_realblock[i - trans_half]));
-		}
-		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
-			reiserfs_warning(sb, "journal-1207",
-					 "REPLAY FAILURE fsck required! "
-					 "Block to replay is outside of "
-					 "filesystem");
-			goto abort_replay;
-		}
-		/* make sure we don't try to replay onto log or reserved area */
-		if (is_block_in_log_or_reserved_area
-		    (sb, real_blocks[i]->b_blocknr)) {
-			reiserfs_warning(sb, "journal-1204",
-					 "REPLAY FAILURE fsck required! "
-					 "Trying to replay onto a log block");
-abort_replay:
-			brelse_array(log_blocks, i);
-			brelse_array(real_blocks, i);
-			brelse(c_bh);
-			brelse(d_bh);
-			kfree(log_blocks);
-			kfree(real_blocks);
-			return -1;
-		}
-	}
-	/* read in the log blocks, memcpy to the corresponding real block */
-	bh_read_batch(get_desc_trans_len(desc), log_blocks);
-	for (i = 0; i < get_desc_trans_len(desc); i++) {
-
-		wait_on_buffer(log_blocks[i]);
-		if (!buffer_uptodate(log_blocks[i])) {
-			reiserfs_warning(sb, "journal-1212",
-					 "REPLAY FAILURE fsck required! "
-					 "buffer write failed");
-			brelse_array(log_blocks + i,
-				     get_desc_trans_len(desc) - i);
-			brelse_array(real_blocks, get_desc_trans_len(desc));
-			brelse(c_bh);
-			brelse(d_bh);
-			kfree(log_blocks);
-			kfree(real_blocks);
-			return -1;
-		}
-		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
-		       real_blocks[i]->b_size);
-		set_buffer_uptodate(real_blocks[i]);
-		brelse(log_blocks[i]);
-	}
-	/* flush out the real blocks */
-	for (i = 0; i < get_desc_trans_len(desc); i++) {
-		set_buffer_dirty(real_blocks[i]);
-		write_dirty_buffer(real_blocks[i], 0);
-	}
-	for (i = 0; i < get_desc_trans_len(desc); i++) {
-		wait_on_buffer(real_blocks[i]);
-		if (!buffer_uptodate(real_blocks[i])) {
-			reiserfs_warning(sb, "journal-1226",
-					 "REPLAY FAILURE, fsck required! "
-					 "buffer write failed");
-			brelse_array(real_blocks + i,
-				     get_desc_trans_len(desc) - i);
-			brelse(c_bh);
-			brelse(d_bh);
-			kfree(log_blocks);
-			kfree(real_blocks);
-			return -1;
-		}
-		brelse(real_blocks[i]);
-	}
-	cur_dblock =
-	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-	    ((trans_offset + get_desc_trans_len(desc) +
-	      2) % SB_ONDISK_JOURNAL_SIZE(sb));
-	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-		       "journal-1095: setting journal " "start to offset %ld",
-		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
-
-	/*
-	 * init starting values for the first transaction, in case
-	 * this is the last transaction to be replayed.
-	 */
-	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-	journal->j_last_flush_trans_id = trans_id;
-	journal->j_trans_id = trans_id + 1;
-	/* check for trans_id overflow */
-	if (journal->j_trans_id == 0)
-		journal->j_trans_id = 10;
-	brelse(c_bh);
-	brelse(d_bh);
-	kfree(log_blocks);
-	kfree(real_blocks);
-	return 0;
-}
-
-/*
- * This function reads blocks starting from block and to max_block of bufsize
- * size (but no more than BUFNR blocks at a time). This proved to improve
- * mounting speed on self-rebuilding raid5 arrays at least.
- * Right now it is only used from journal code. But later we might use it
- * from other places.
- * Note: Do not use journal_getblk/sb_getblk functions here!
- */
-static struct buffer_head *reiserfs_breada(struct block_device *dev,
-					   b_blocknr_t block, int bufsize,
-					   b_blocknr_t max_block)
-{
-	struct buffer_head *bhlist[BUFNR];
-	unsigned int blocks = BUFNR;
-	struct buffer_head *bh;
-	int i, j;
-
-	bh = __getblk(dev, block, bufsize);
-	if (!bh || buffer_uptodate(bh))
-		return (bh);
-
-	if (block + BUFNR > max_block) {
-		blocks = max_block - block;
-	}
-	bhlist[0] = bh;
-	j = 1;
-	for (i = 1; i < blocks; i++) {
-		bh = __getblk(dev, block + i, bufsize);
-		if (!bh)
-			break;
-		if (buffer_uptodate(bh)) {
-			brelse(bh);
-			break;
-		} else
-			bhlist[j++] = bh;
-	}
-	bh = bhlist[0];
-	bh_read_nowait(bh, 0);
-	bh_readahead_batch(j - 1, &bhlist[1], 0);
-	for (i = 1; i < j; i++)
-		brelse(bhlist[i]);
-	wait_on_buffer(bh);
-	if (buffer_uptodate(bh))
-		return bh;
-	brelse(bh);
-	return NULL;
-}
-
-/*
- * read and replay the log
- * on a clean unmount, the journal header's next unflushed pointer will be
- * to an invalid transaction.  This tests that before finding all the
- * transactions in the log, which makes normal mount times fast.
- *
- * After a crash, this starts with the next unflushed transaction, and
- * replays until it finds one too old, or invalid.
- *
- * On exit, it sets things up so the first transaction will work correctly.
- * NOTE: only called during fs mount
- */
-static int journal_read(struct super_block *sb)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_desc *desc;
-	unsigned int oldest_trans_id = 0;
-	unsigned int oldest_invalid_trans_id = 0;
-	time64_t start;
-	unsigned long oldest_start = 0;
-	unsigned long cur_dblock = 0;
-	unsigned long newest_mount_id = 9;
-	struct buffer_head *d_bh;
-	struct reiserfs_journal_header *jh;
-	int valid_journal_header = 0;
-	int replay_count = 0;
-	int continue_replay = 1;
-	int ret;
-
-	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-	reiserfs_info(sb, "checking transaction log (%pg)\n",
-		      file_bdev(journal->j_bdev_file));
-	start = ktime_get_seconds();
-
-	/*
-	 * step 1, read in the journal header block.  Check the transaction
-	 * it says is the first unflushed, and if that transaction is not
-	 * valid, replay is done
-	 */
-	journal->j_header_bh = journal_bread(sb,
-					     SB_ONDISK_JOURNAL_1st_BLOCK(sb)
-					     + SB_ONDISK_JOURNAL_SIZE(sb));
-	if (!journal->j_header_bh) {
-		return 1;
-	}
-	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
-	if (le32_to_cpu(jh->j_first_unflushed_offset) <
-	    SB_ONDISK_JOURNAL_SIZE(sb)
-	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
-		oldest_start =
-		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-		    le32_to_cpu(jh->j_first_unflushed_offset);
-		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
-		newest_mount_id = le32_to_cpu(jh->j_mount_id);
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal-1153: found in "
-			       "header: first_unflushed_offset %d, last_flushed_trans_id "
-			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
-			       le32_to_cpu(jh->j_last_flush_trans_id));
-		valid_journal_header = 1;
-
-		/*
-		 * now, we try to read the first unflushed offset.  If it
-		 * is not valid, there is nothing more we can do, and it
-		 * makes no sense to read through the whole log.
-		 */
-		d_bh =
-		    journal_bread(sb,
-				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-				  le32_to_cpu(jh->j_first_unflushed_offset));
-		ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
-		if (!ret) {
-			continue_replay = 0;
-		}
-		brelse(d_bh);
-		goto start_log_replay;
-	}
-
-	/*
-	 * ok, there are transactions that need to be replayed.  start
-	 * with the first log block, find all the valid transactions, and
-	 * pick out the oldest.
-	 */
-	while (continue_replay
-	       && cur_dblock <
-	       (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-		SB_ONDISK_JOURNAL_SIZE(sb))) {
-		/*
-		 * Note that it is required for blocksize of primary fs
-		 * device and journal device to be the same
-		 */
-		d_bh =
-		    reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
-				    sb->s_blocksize,
-				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-				    SB_ONDISK_JOURNAL_SIZE(sb));
-		ret =
-		    journal_transaction_is_valid(sb, d_bh,
-						 &oldest_invalid_trans_id,
-						 &newest_mount_id);
-		if (ret == 1) {
-			desc = (struct reiserfs_journal_desc *)d_bh->b_data;
-			if (oldest_start == 0) {	/* init all oldest_ values */
-				oldest_trans_id = get_desc_trans_id(desc);
-				oldest_start = d_bh->b_blocknr;
-				newest_mount_id = get_desc_mount_id(desc);
-				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-					       "journal-1179: Setting "
-					       "oldest_start to offset %llu, trans_id %lu",
-					       oldest_start -
-					       SB_ONDISK_JOURNAL_1st_BLOCK
-					       (sb), oldest_trans_id);
-			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
-				/* one we just read was older */
-				oldest_trans_id = get_desc_trans_id(desc);
-				oldest_start = d_bh->b_blocknr;
-				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-					       "journal-1180: Resetting "
-					       "oldest_start to offset %lu, trans_id %lu",
-					       oldest_start -
-					       SB_ONDISK_JOURNAL_1st_BLOCK
-					       (sb), oldest_trans_id);
-			}
-			if (newest_mount_id < get_desc_mount_id(desc)) {
-				newest_mount_id = get_desc_mount_id(desc);
-				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-					       "journal-1299: Setting "
-					       "newest_mount_id to %d",
-					       get_desc_mount_id(desc));
-			}
-			cur_dblock += get_desc_trans_len(desc) + 2;
-		} else {
-			cur_dblock++;
-		}
-		brelse(d_bh);
-	}
-
-start_log_replay:
-	cur_dblock = oldest_start;
-	if (oldest_trans_id) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal-1206: Starting replay "
-			       "from offset %llu, trans_id %lu",
-			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-			       oldest_trans_id);
-
-	}
-	replay_count = 0;
-	while (continue_replay && oldest_trans_id > 0) {
-		ret =
-		    journal_read_transaction(sb, cur_dblock, oldest_start,
-					     oldest_trans_id, newest_mount_id);
-		if (ret < 0) {
-			return ret;
-		} else if (ret != 0) {
-			break;
-		}
-		cur_dblock =
-		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
-		replay_count++;
-		if (cur_dblock == oldest_start)
-			break;
-	}
-
-	if (oldest_trans_id == 0) {
-		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-			       "journal-1225: No valid " "transactions found");
-	}
-	/*
-	 * j_start does not get set correctly if we don't replay any
-	 * transactions.  if we had a valid journal_header, set j_start
-	 * to the first unflushed transaction value, copy the trans_id
-	 * from the header
-	 */
-	if (valid_journal_header && replay_count == 0) {
-		journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
-		journal->j_trans_id =
-		    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
-		/* check for trans_id overflow */
-		if (journal->j_trans_id == 0)
-			journal->j_trans_id = 10;
-		journal->j_last_flush_trans_id =
-		    le32_to_cpu(jh->j_last_flush_trans_id);
-		journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
-	} else {
-		journal->j_mount_id = newest_mount_id + 1;
-	}
-	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
-		       "newest_mount_id to %lu", journal->j_mount_id);
-	journal->j_first_unflushed_offset = journal->j_start;
-	if (replay_count > 0) {
-		reiserfs_info(sb,
-			      "replayed %d transactions in %lu seconds\n",
-			      replay_count, ktime_get_seconds() - start);
-	}
-	/* needed to satisfy the locking in _update_journal_header_block */
-	reiserfs_write_lock(sb);
-	if (!bdev_read_only(sb->s_bdev) &&
-	    _update_journal_header_block(sb, journal->j_start,
-					 journal->j_last_flush_trans_id)) {
-		reiserfs_write_unlock(sb);
-		/*
-		 * replay failed, caller must call free_journal_ram and abort
-		 * the mount
-		 */
-		return -1;
-	}
-	reiserfs_write_unlock(sb);
-	return 0;
-}
-
-static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
-{
-	struct reiserfs_journal_list *jl;
-	jl = kzalloc(sizeof(struct reiserfs_journal_list),
-		     GFP_NOFS | __GFP_NOFAIL);
-	INIT_LIST_HEAD(&jl->j_list);
-	INIT_LIST_HEAD(&jl->j_working_list);
-	INIT_LIST_HEAD(&jl->j_tail_bh_list);
-	INIT_LIST_HEAD(&jl->j_bh_list);
-	mutex_init(&jl->j_commit_mutex);
-	SB_JOURNAL(s)->j_num_lists++;
-	get_journal_list(jl);
-	return jl;
-}
-
-static void journal_list_init(struct super_block *sb)
-{
-	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
-}
-
-static void release_journal_dev(struct reiserfs_journal *journal)
-{
-	if (journal->j_bdev_file) {
-		bdev_fput(journal->j_bdev_file);
-		journal->j_bdev_file = NULL;
-	}
-}
-
-static int journal_init_dev(struct super_block *super,
-			    struct reiserfs_journal *journal,
-			    const char *jdev_name)
-{
-	blk_mode_t blkdev_mode = BLK_OPEN_READ;
-	void *holder = journal;
-	int result;
-	dev_t jdev;
-
-	result = 0;
-
-	journal->j_bdev_file = NULL;
-	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
-	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
-
-	if (!bdev_read_only(super->s_bdev))
-		blkdev_mode |= BLK_OPEN_WRITE;
-
-	/* there is no "jdev" option and journal is on separate device */
-	if ((!jdev_name || !jdev_name[0])) {
-		if (jdev == super->s_dev)
-			holder = NULL;
-		journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode,
-							  holder, NULL);
-		if (IS_ERR(journal->j_bdev_file)) {
-			result = PTR_ERR(journal->j_bdev_file);
-			journal->j_bdev_file = NULL;
-			reiserfs_warning(super, "sh-458",
-					 "cannot init journal device unknown-block(%u,%u): %i",
-					 MAJOR(jdev), MINOR(jdev), result);
-			return result;
-		} else if (jdev != super->s_dev)
-			set_blocksize(journal->j_bdev_file, super->s_blocksize);
-
-		return 0;
-	}
-
-	journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode,
-						   holder, NULL);
-	if (IS_ERR(journal->j_bdev_file)) {
-		result = PTR_ERR(journal->j_bdev_file);
-		journal->j_bdev_file = NULL;
-		reiserfs_warning(super, "sh-457",
-				 "journal_init_dev: Cannot open '%s': %i",
-				 jdev_name, result);
-		return result;
-	}
-
-	set_blocksize(journal->j_bdev_file, super->s_blocksize);
-	reiserfs_info(super,
-		      "journal_init_dev: journal device: %pg\n",
-		      file_bdev(journal->j_bdev_file));
-	return 0;
-}
-
-/*
- * When creating/tuning a file system user can assign some
- * journal params within boundaries which depend on the ratio
- * blocksize/standard_blocksize.
- *
- * For blocks >= standard_blocksize transaction size should
- * be not less then JOURNAL_TRANS_MIN_DEFAULT, and not more
- * then JOURNAL_TRANS_MAX_DEFAULT.
- *
- * For blocks < standard_blocksize these boundaries should be
- * decreased proportionally.
- */
-#define REISERFS_STANDARD_BLKSIZE (4096)
-
-static int check_advise_trans_params(struct super_block *sb,
-				     struct reiserfs_journal *journal)
-{
-        if (journal->j_trans_max) {
-		/* Non-default journal params.  Do sanity check for them. */
-	        int ratio = 1;
-		if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
-		        ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
-
-		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
-		    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
-		    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
-		    JOURNAL_MIN_RATIO) {
-			reiserfs_warning(sb, "sh-462",
-					 "bad transaction max size (%u). "
-					 "FSCK?", journal->j_trans_max);
-			return 1;
-		}
-		if (journal->j_max_batch != (journal->j_trans_max) *
-		        JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
-			reiserfs_warning(sb, "sh-463",
-					 "bad transaction max batch (%u). "
-					 "FSCK?", journal->j_max_batch);
-			return 1;
-		}
-	} else {
-		/*
-		 * Default journal params.
-		 * The file system was created by old version
-		 * of mkreiserfs, so some fields contain zeros,
-		 * and we need to advise proper values for them
-		 */
-		if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
-			reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
-					 sb->s_blocksize);
-			return 1;
-		}
-		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
-		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
-		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
-	}
-	return 0;
-}
-
-/* must be called once on fs mount.  calls journal_read for you */
-int journal_init(struct super_block *sb, const char *j_dev_name,
-		 int old_format, unsigned int commit_max_age)
-{
-	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
-	struct buffer_head *bhjh;
-	struct reiserfs_super_block *rs;
-	struct reiserfs_journal_header *jh;
-	struct reiserfs_journal *journal;
-	struct reiserfs_journal_list *jl;
-	int ret;
-
-	journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
-	if (!journal) {
-		reiserfs_warning(sb, "journal-1256",
-				 "unable to get memory for journal structure");
-		return 1;
-	}
-	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
-	INIT_LIST_HEAD(&journal->j_prealloc_list);
-	INIT_LIST_HEAD(&journal->j_working_list);
-	INIT_LIST_HEAD(&journal->j_journal_list);
-	journal->j_persistent_trans = 0;
-	if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
-					   reiserfs_bmap_count(sb)))
-		goto free_and_return;
-
-	allocate_bitmap_nodes(sb);
-
-	/* reserved for journal area support */
-	SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
-						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
-						 / sb->s_blocksize +
-						 reiserfs_bmap_count(sb) +
-						 1 :
-						 REISERFS_DISK_OFFSET_IN_BYTES /
-						 sb->s_blocksize + 2);
-
-	/*
-	 * Sanity check to see is the standard journal fitting
-	 * within first bitmap (actual for small blocksizes)
-	 */
-	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
-	    (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
-	     SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
-		reiserfs_warning(sb, "journal-1393",
-				 "journal does not fit for area addressed "
-				 "by first of bitmap blocks. It starts at "
-				 "%u and its size is %u. Block size %ld",
-				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
-				 SB_ONDISK_JOURNAL_SIZE(sb),
-				 sb->s_blocksize);
-		goto free_and_return;
-	}
-
-	/*
-	 * Sanity check to see if journal first block is correct.
-	 * If journal first block is invalid it can cause
-	 * zeroing important superblock members.
-	 */
-	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
-	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
-		reiserfs_warning(sb, "journal-1393",
-				 "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d",
-				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
-				 SB_ONDISK_JOURNAL_1st_BLOCK(sb));
-		goto free_and_return;
-	}
-
-	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
-		reiserfs_warning(sb, "sh-462",
-				 "unable to initialize journal device");
-		goto free_and_return;
-	}
-
-	rs = SB_DISK_SUPER_BLOCK(sb);
-
-	/* read journal header */
-	bhjh = journal_bread(sb,
-			     SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-			     SB_ONDISK_JOURNAL_SIZE(sb));
-	if (!bhjh) {
-		reiserfs_warning(sb, "sh-459",
-				 "unable to read journal header");
-		goto free_and_return;
-	}
-	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
-
-	/* make sure that journal matches to the super block */
-	if (is_reiserfs_jr(rs)
-	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
-		sb_jp_journal_magic(rs))) {
-		reiserfs_warning(sb, "sh-460",
-				 "journal header magic %x (device %pg) does "
-				 "not match to magic found in super block %x",
-				 jh->jh_journal.jp_journal_magic,
-				 file_bdev(journal->j_bdev_file),
-				 sb_jp_journal_magic(rs));
-		brelse(bhjh);
-		goto free_and_return;
-	}
-
-	journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
-	journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
-	journal->j_max_commit_age =
-	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
-	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
-
-	if (check_advise_trans_params(sb, journal) != 0)
-	        goto free_and_return;
-	journal->j_default_max_commit_age = journal->j_max_commit_age;
-
-	if (commit_max_age != 0) {
-		journal->j_max_commit_age = commit_max_age;
-		journal->j_max_trans_age = commit_max_age;
-	}
-
-	reiserfs_info(sb, "journal params: device %pg, size %u, "
-		      "journal first block %u, max trans len %u, max batch %u, "
-		      "max commit age %u, max trans age %u\n",
-		      file_bdev(journal->j_bdev_file),
-		      SB_ONDISK_JOURNAL_SIZE(sb),
-		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
-		      journal->j_trans_max,
-		      journal->j_max_batch,
-		      journal->j_max_commit_age, journal->j_max_trans_age);
-
-	brelse(bhjh);
-
-	journal->j_list_bitmap_index = 0;
-	journal_list_init(sb);
-
-	memset(journal->j_list_hash_table, 0,
-	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
-
-	INIT_LIST_HEAD(&journal->j_dirty_buffers);
-	spin_lock_init(&journal->j_dirty_buffers_lock);
-
-	journal->j_start = 0;
-	journal->j_len = 0;
-	journal->j_len_alloc = 0;
-	atomic_set(&journal->j_wcount, 0);
-	atomic_set(&journal->j_async_throttle, 0);
-	journal->j_bcount = 0;
-	journal->j_trans_start_time = 0;
-	journal->j_last = NULL;
-	journal->j_first = NULL;
-	init_waitqueue_head(&journal->j_join_wait);
-	mutex_init(&journal->j_mutex);
-	mutex_init(&journal->j_flush_mutex);
-
-	journal->j_trans_id = 10;
-	journal->j_mount_id = 10;
-	journal->j_state = 0;
-	atomic_set(&journal->j_jlock, 0);
-	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
-	journal->j_cnode_free_orig = journal->j_cnode_free_list;
-	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
-	journal->j_cnode_used = 0;
-	journal->j_must_wait = 0;
-
-	if (journal->j_cnode_free == 0) {
-		reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
-		                 "allocation failed (%ld bytes). Journal is "
-		                 "too large for available memory. Usually "
-		                 "this is due to a journal that is too large.",
-		                 sizeof (struct reiserfs_journal_cnode) * num_cnodes);
-        	goto free_and_return;
-	}
-
-	init_journal_hash(sb);
-	jl = journal->j_current_jl;
-
-	/*
-	 * get_list_bitmap() may call flush_commit_list() which
-	 * requires the lock. Calling flush_commit_list() shouldn't happen
-	 * this early but I like to be paranoid.
-	 */
-	reiserfs_write_lock(sb);
-	jl->j_list_bitmap = get_list_bitmap(sb, jl);
-	reiserfs_write_unlock(sb);
-	if (!jl->j_list_bitmap) {
-		reiserfs_warning(sb, "journal-2005",
-				 "get_list_bitmap failed for journal list 0");
-		goto free_and_return;
-	}
-
-	ret = journal_read(sb);
-	if (ret < 0) {
-		reiserfs_warning(sb, "reiserfs-2006",
-				 "Replay Failure, unable to mount");
-		goto free_and_return;
-	}
-
-	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
-	journal->j_work_sb = sb;
-	return 0;
-free_and_return:
-	free_journal_ram(sb);
-	return 1;
-}
-
-/*
- * test for a polite end of the current transaction.  Used by file_write,
- * and should be used by delete to make sure they don't write more than
- * can fit inside a single transaction
- */
-int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
-				   int new_alloc)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
-	time64_t now = ktime_get_seconds();
-	/* cannot restart while nested */
-	BUG_ON(!th->t_trans_id);
-	if (th->t_refcount > 1)
-		return 0;
-	if (journal->j_must_wait > 0 ||
-	    (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
-	    atomic_read(&journal->j_jlock) ||
-	    (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
-	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
-		return 1;
-	}
-
-	journal->j_len_alloc += new_alloc;
-	th->t_blocks_allocated += new_alloc ;
-	return 0;
-}
-
-/* this must be called inside a transaction */
-void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
-	BUG_ON(!th->t_trans_id);
-	journal->j_must_wait = 1;
-	set_bit(J_WRITERS_BLOCKED, &journal->j_state);
-	return;
-}
-
-/* this must be called without a transaction started */
-void reiserfs_allow_writes(struct super_block *s)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
-	wake_up(&journal->j_join_wait);
-}
-
-/* this must be called without a transaction started */
-void reiserfs_wait_on_write_block(struct super_block *s)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	wait_event(journal->j_join_wait,
-		   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
-}
-
-static void queue_log_writer(struct super_block *s)
-{
-	wait_queue_entry_t wait;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	set_bit(J_WRITERS_QUEUED, &journal->j_state);
-
-	/*
-	 * we don't want to use wait_event here because
-	 * we only want to wait once.
-	 */
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&journal->j_join_wait, &wait);
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
-		int depth = reiserfs_write_unlock_nested(s);
-		schedule();
-		reiserfs_write_lock_nested(s, depth);
-	}
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&journal->j_join_wait, &wait);
-}
-
-static void wake_queued_writers(struct super_block *s)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
-		wake_up(&journal->j_join_wait);
-}
-
-static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	unsigned long bcount = journal->j_bcount;
-	while (1) {
-		int depth;
-
-		depth = reiserfs_write_unlock_nested(sb);
-		schedule_timeout_uninterruptible(1);
-		reiserfs_write_lock_nested(sb, depth);
-
-		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
-		while ((atomic_read(&journal->j_wcount) > 0 ||
-			atomic_read(&journal->j_jlock)) &&
-		       journal->j_trans_id == trans_id) {
-			queue_log_writer(sb);
-		}
-		if (journal->j_trans_id != trans_id)
-			break;
-		if (bcount == journal->j_bcount)
-			break;
-		bcount = journal->j_bcount;
-	}
-}
-
-/*
- * join == true if you must join an existing transaction.
- * join == false if you can deal with waiting for others to finish
- *
- * this will block until the transaction is joinable.  send the number of
- * blocks you expect to use in nblocks.
-*/
-static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
-			      struct super_block *sb, unsigned long nblocks,
-			      int join)
-{
-	time64_t now = ktime_get_seconds();
-	unsigned int old_trans_id;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_transaction_handle myth;
-	int retval;
-	int depth;
-
-	reiserfs_check_lock_depth(sb, "journal_begin");
-	BUG_ON(nblocks > journal->j_trans_max);
-
-	PROC_INFO_INC(sb, journal.journal_being);
-	/* set here for journal_join */
-	th->t_refcount = 1;
-	th->t_super = sb;
-
-relock:
-	lock_journal(sb);
-	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
-		unlock_journal(sb);
-		retval = journal->j_errno;
-		goto out_fail;
-	}
-	journal->j_bcount++;
-
-	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
-		unlock_journal(sb);
-		depth = reiserfs_write_unlock_nested(sb);
-		reiserfs_wait_on_write_block(sb);
-		reiserfs_write_lock_nested(sb, depth);
-		PROC_INFO_INC(sb, journal.journal_relock_writers);
-		goto relock;
-	}
-	now = ktime_get_seconds();
-
-	/*
-	 * if there is no room in the journal OR
-	 * if this transaction is too old, and we weren't called joinable,
-	 * wait for it to finish before beginning we don't sleep if there
-	 * aren't other writers
-	 */
-
-	if ((!join && journal->j_must_wait > 0) ||
-	    (!join
-	     && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
-	    || (!join && atomic_read(&journal->j_wcount) > 0
-		&& journal->j_trans_start_time > 0
-		&& (now - journal->j_trans_start_time) >
-		journal->j_max_trans_age) || (!join
-					      && atomic_read(&journal->j_jlock))
-	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
-
-		old_trans_id = journal->j_trans_id;
-		/* allow others to finish this transaction */
-		unlock_journal(sb);
-
-		if (!join && (journal->j_len_alloc + nblocks + 2) >=
-		    journal->j_max_batch &&
-		    ((journal->j_len + nblocks + 2) * 100) <
-		    (journal->j_len_alloc * 75)) {
-			if (atomic_read(&journal->j_wcount) > 10) {
-				queue_log_writer(sb);
-				goto relock;
-			}
-		}
-		/*
-		 * don't mess with joining the transaction if all we
-		 * have to do is wait for someone else to do a commit
-		 */
-		if (atomic_read(&journal->j_jlock)) {
-			while (journal->j_trans_id == old_trans_id &&
-			       atomic_read(&journal->j_jlock)) {
-				queue_log_writer(sb);
-			}
-			goto relock;
-		}
-		retval = journal_join(&myth, sb);
-		if (retval)
-			goto out_fail;
-
-		/* someone might have ended the transaction while we joined */
-		if (old_trans_id != journal->j_trans_id) {
-			retval = do_journal_end(&myth, 0);
-		} else {
-			retval = do_journal_end(&myth, COMMIT_NOW);
-		}
-
-		if (retval)
-			goto out_fail;
-
-		PROC_INFO_INC(sb, journal.journal_relock_wcount);
-		goto relock;
-	}
-	/* we are the first writer, set trans_id */
-	if (journal->j_trans_start_time == 0) {
-		journal->j_trans_start_time = ktime_get_seconds();
-	}
-	atomic_inc(&journal->j_wcount);
-	journal->j_len_alloc += nblocks;
-	th->t_blocks_logged = 0;
-	th->t_blocks_allocated = nblocks;
-	th->t_trans_id = journal->j_trans_id;
-	unlock_journal(sb);
-	INIT_LIST_HEAD(&th->t_list);
-	return 0;
-
-out_fail:
-	memset(th, 0, sizeof(*th));
-	/*
-	 * Re-set th->t_super, so we can properly keep track of how many
-	 * persistent transactions there are. We need to do this so if this
-	 * call is part of a failed restart_transaction, we can free it later
-	 */
-	th->t_super = sb;
-	return retval;
-}
-
-struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
-								    super_block
-								    *s,
-								    int nblocks)
-{
-	int ret;
-	struct reiserfs_transaction_handle *th;
-
-	/*
-	 * if we're nesting into an existing transaction.  It will be
-	 * persistent on its own
-	 */
-	if (reiserfs_transaction_running(s)) {
-		th = current->journal_info;
-		th->t_refcount++;
-		BUG_ON(th->t_refcount < 2);
-
-		return th;
-	}
-	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
-	if (!th)
-		return NULL;
-	ret = journal_begin(th, s, nblocks);
-	if (ret) {
-		kfree(th);
-		return NULL;
-	}
-
-	SB_JOURNAL(s)->j_persistent_trans++;
-	return th;
-}
-
-int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
-{
-	struct super_block *s = th->t_super;
-	int ret = 0;
-	if (th->t_trans_id)
-		ret = journal_end(th);
-	else
-		ret = -EIO;
-	if (th->t_refcount == 0) {
-		SB_JOURNAL(s)->j_persistent_trans--;
-		kfree(th);
-	}
-	return ret;
-}
-
-static int journal_join(struct reiserfs_transaction_handle *th,
-			struct super_block *sb)
-{
-	struct reiserfs_transaction_handle *cur_th = current->journal_info;
-
-	/*
-	 * this keeps do_journal_end from NULLing out the
-	 * current->journal_info pointer
-	 */
-	th->t_handle_save = cur_th;
-	BUG_ON(cur_th && cur_th->t_refcount > 1);
-	return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
-}
-
-int journal_join_abort(struct reiserfs_transaction_handle *th,
-		       struct super_block *sb)
-{
-	struct reiserfs_transaction_handle *cur_th = current->journal_info;
-
-	/*
-	 * this keeps do_journal_end from NULLing out the
-	 * current->journal_info pointer
-	 */
-	th->t_handle_save = cur_th;
-	BUG_ON(cur_th && cur_th->t_refcount > 1);
-	return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
-}
-
-int journal_begin(struct reiserfs_transaction_handle *th,
-		  struct super_block *sb, unsigned long nblocks)
-{
-	struct reiserfs_transaction_handle *cur_th = current->journal_info;
-	int ret;
-
-	th->t_handle_save = NULL;
-	if (cur_th) {
-		/* we are nesting into the current transaction */
-		if (cur_th->t_super == sb) {
-			BUG_ON(!cur_th->t_refcount);
-			cur_th->t_refcount++;
-			memcpy(th, cur_th, sizeof(*th));
-			if (th->t_refcount <= 1)
-				reiserfs_warning(sb, "reiserfs-2005",
-						 "BAD: refcount <= 1, but "
-						 "journal_info != 0");
-			return 0;
-		} else {
-			/*
-			 * we've ended up with a handle from a different
-			 * filesystem.  save it and restore on journal_end.
-			 * This should never really happen...
-			 */
-			reiserfs_warning(sb, "clm-2100",
-					 "nesting info a different FS");
-			th->t_handle_save = current->journal_info;
-			current->journal_info = th;
-		}
-	} else {
-		current->journal_info = th;
-	}
-	ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
-	BUG_ON(current->journal_info != th);
-
-	/*
-	 * I guess this boils down to being the reciprocal of clm-2100 above.
-	 * If do_journal_begin_r fails, we need to put it back, since
-	 * journal_end won't be called to do it. */
-	if (ret)
-		current->journal_info = th->t_handle_save;
-	else
-		BUG_ON(!th->t_refcount);
-
-	return ret;
-}
-
-/*
- * puts bh into the current transaction.  If it was already there, reorders
- * removes the old pointers from the hash, and puts new ones in (to make
- * sure replay happen in the right order).
- *
- * if it was dirty, cleans and files onto the clean list.  I can't let it
- * be dirty again until the transaction is committed.
- *
- * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
- */
-int journal_mark_dirty(struct reiserfs_transaction_handle *th,
-		       struct buffer_head *bh)
-{
-	struct super_block *sb = th->t_super;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_cnode *cn = NULL;
-	int count_already_incd = 0;
-	int prepared = 0;
-	BUG_ON(!th->t_trans_id);
-
-	PROC_INFO_INC(sb, journal.mark_dirty);
-	if (th->t_trans_id != journal->j_trans_id) {
-		reiserfs_panic(th->t_super, "journal-1577",
-			       "handle trans id %ld != current trans id %ld",
-			       th->t_trans_id, journal->j_trans_id);
-	}
-
-	prepared = test_clear_buffer_journal_prepared(bh);
-	clear_buffer_journal_restore_dirty(bh);
-	/* already in this transaction, we are done */
-	if (buffer_journaled(bh)) {
-		PROC_INFO_INC(sb, journal.mark_dirty_already);
-		return 0;
-	}
-
-	/*
-	 * this must be turned into a panic instead of a warning.  We can't
-	 * allow a dirty or journal_dirty or locked buffer to be logged, as
-	 * some changes could get to disk too early.  NOT GOOD.
-	 */
-	if (!prepared || buffer_dirty(bh)) {
-		reiserfs_warning(sb, "journal-1777",
-				 "buffer %llu bad state "
-				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
-				 (unsigned long long)bh->b_blocknr,
-				 prepared ? ' ' : '!',
-				 buffer_locked(bh) ? ' ' : '!',
-				 buffer_dirty(bh) ? ' ' : '!',
-				 buffer_journal_dirty(bh) ? ' ' : '!');
-	}
-
-	if (atomic_read(&journal->j_wcount) <= 0) {
-		reiserfs_warning(sb, "journal-1409",
-				 "returning because j_wcount was %d",
-				 atomic_read(&journal->j_wcount));
-		return 1;
-	}
-	/*
-	 * this error means I've screwed up, and we've overflowed
-	 * the transaction.  Nothing can be done here, except make the
-	 * FS readonly or panic.
-	 */
-	if (journal->j_len >= journal->j_trans_max) {
-		reiserfs_panic(th->t_super, "journal-1413",
-			       "j_len (%lu) is too big",
-			       journal->j_len);
-	}
-
-	if (buffer_journal_dirty(bh)) {
-		count_already_incd = 1;
-		PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
-		clear_buffer_journal_dirty(bh);
-	}
-
-	if (journal->j_len > journal->j_len_alloc) {
-		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
-	}
-
-	set_buffer_journaled(bh);
-
-	/* now put this guy on the end */
-	if (!cn) {
-		cn = get_cnode(sb);
-		if (!cn) {
-			reiserfs_panic(sb, "journal-4", "get_cnode failed!");
-		}
-
-		if (th->t_blocks_logged == th->t_blocks_allocated) {
-			th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
-			journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
-		}
-		th->t_blocks_logged++;
-		journal->j_len++;
-
-		cn->bh = bh;
-		cn->blocknr = bh->b_blocknr;
-		cn->sb = sb;
-		cn->jlist = NULL;
-		insert_journal_hash(journal->j_hash_table, cn);
-		if (!count_already_incd) {
-			get_bh(bh);
-		}
-	}
-	cn->next = NULL;
-	cn->prev = journal->j_last;
-	cn->bh = bh;
-	if (journal->j_last) {
-		journal->j_last->next = cn;
-		journal->j_last = cn;
-	} else {
-		journal->j_first = cn;
-		journal->j_last = cn;
-	}
-	reiserfs_schedule_old_flush(sb);
-	return 0;
-}
-
-int journal_end(struct reiserfs_transaction_handle *th)
-{
-	struct super_block *sb = th->t_super;
-	if (!current->journal_info && th->t_refcount > 1)
-		reiserfs_warning(sb, "REISER-NESTING",
-				 "th NULL, refcount %d", th->t_refcount);
-
-	if (!th->t_trans_id) {
-		WARN_ON(1);
-		return -EIO;
-	}
-
-	th->t_refcount--;
-	if (th->t_refcount > 0) {
-		struct reiserfs_transaction_handle *cur_th =
-		    current->journal_info;
-
-		/*
-		 * we aren't allowed to close a nested transaction on a
-		 * different filesystem from the one in the task struct
-		 */
-		BUG_ON(cur_th->t_super != th->t_super);
-
-		if (th != cur_th) {
-			memcpy(current->journal_info, th, sizeof(*th));
-			th->t_trans_id = 0;
-		}
-		return 0;
-	} else {
-		return do_journal_end(th, 0);
-	}
-}
-
-/*
- * removes from the current transaction, relsing and descrementing any counters.
- * also files the removed buffer directly onto the clean list
- *
- * called by journal_mark_freed when a block has been deleted
- *
- * returns 1 if it cleaned and relsed the buffer. 0 otherwise
- */
-static int remove_from_transaction(struct super_block *sb,
-				   b_blocknr_t blocknr, int already_cleaned)
-{
-	struct buffer_head *bh;
-	struct reiserfs_journal_cnode *cn;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	int ret = 0;
-
-	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
-	if (!cn || !cn->bh) {
-		return ret;
-	}
-	bh = cn->bh;
-	if (cn->prev) {
-		cn->prev->next = cn->next;
-	}
-	if (cn->next) {
-		cn->next->prev = cn->prev;
-	}
-	if (cn == journal->j_first) {
-		journal->j_first = cn->next;
-	}
-	if (cn == journal->j_last) {
-		journal->j_last = cn->prev;
-	}
-	remove_journal_hash(sb, journal->j_hash_table, NULL,
-			    bh->b_blocknr, 0);
-	clear_buffer_journaled(bh);	/* don't log this one */
-
-	if (!already_cleaned) {
-		clear_buffer_journal_dirty(bh);
-		clear_buffer_dirty(bh);
-		clear_buffer_journal_test(bh);
-		put_bh(bh);
-		if (atomic_read(&bh->b_count) < 0) {
-			reiserfs_warning(sb, "journal-1752",
-					 "b_count < 0");
-		}
-		ret = 1;
-	}
-	journal->j_len--;
-	journal->j_len_alloc--;
-	free_cnode(sb, cn);
-	return ret;
-}
-
-/*
- * for any cnode in a journal list, it can only be dirtied of all the
- * transactions that include it are committed to disk.
- * this checks through each transaction, and returns 1 if you are allowed
- * to dirty, and 0 if you aren't
- *
- * it is called by dirty_journal_list, which is called after
- * flush_commit_list has gotten all the log blocks for a given
- * transaction on disk
- *
- */
-static int can_dirty(struct reiserfs_journal_cnode *cn)
-{
-	struct super_block *sb = cn->sb;
-	b_blocknr_t blocknr = cn->blocknr;
-	struct reiserfs_journal_cnode *cur = cn->hprev;
-	int can_dirty = 1;
-
-	/*
-	 * first test hprev.  These are all newer than cn, so any node here
-	 * with the same block number and dev means this node can't be sent
-	 * to disk right now.
-	 */
-	while (cur && can_dirty) {
-		if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
-		    cur->blocknr == blocknr) {
-			can_dirty = 0;
-		}
-		cur = cur->hprev;
-	}
-	/*
-	 * then test hnext.  These are all older than cn.  As long as they
-	 * are committed to the log, it is safe to write cn to disk
-	 */
-	cur = cn->hnext;
-	while (cur && can_dirty) {
-		if (cur->jlist && cur->jlist->j_len > 0 &&
-		    atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
-		    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
-			can_dirty = 0;
-		}
-		cur = cur->hnext;
-	}
-	return can_dirty;
-}
-
-/*
- * syncs the commit blocks, but does not force the real buffers to disk
- * will wait until the current transaction is done/committed before returning
- */
-int journal_end_sync(struct reiserfs_transaction_handle *th)
-{
-	struct super_block *sb = th->t_super;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	BUG_ON(!th->t_trans_id);
-	/* you can sync while nested, very, very bad */
-	BUG_ON(th->t_refcount > 1);
-	if (journal->j_len == 0) {
-		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
-					     1);
-		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
-	}
-	return do_journal_end(th, COMMIT_NOW | WAIT);
-}
-
-/* writeback the pending async commits to disk */
-static void flush_async_commits(struct work_struct *work)
-{
-	struct reiserfs_journal *journal =
-		container_of(work, struct reiserfs_journal, j_work.work);
-	struct super_block *sb = journal->j_work_sb;
-	struct reiserfs_journal_list *jl;
-	struct list_head *entry;
-
-	reiserfs_write_lock(sb);
-	if (!list_empty(&journal->j_journal_list)) {
-		/* last entry is the youngest, commit it and you get everything */
-		entry = journal->j_journal_list.prev;
-		jl = JOURNAL_LIST_ENTRY(entry);
-		flush_commit_list(sb, jl, 1);
-	}
-	reiserfs_write_unlock(sb);
-}
-
-/*
- * flushes any old transactions to disk
- * ends the current transaction if it is too old
- */
-void reiserfs_flush_old_commits(struct super_block *sb)
-{
-	time64_t now;
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	now = ktime_get_seconds();
-	/*
-	 * safety check so we don't flush while we are replaying the log during
-	 * mount
-	 */
-	if (list_empty(&journal->j_journal_list))
-		return;
-
-	/*
-	 * check the current transaction.  If there are no writers, and it is
-	 * too old, finish it, and force the commit blocks to disk
-	 */
-	if (atomic_read(&journal->j_wcount) <= 0 &&
-	    journal->j_trans_start_time > 0 &&
-	    journal->j_len > 0 &&
-	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
-		if (!journal_join(&th, sb)) {
-			reiserfs_prepare_for_journal(sb,
-						     SB_BUFFER_WITH_SB(sb),
-						     1);
-			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
-
-			/*
-			 * we're only being called from kreiserfsd, it makes
-			 * no sense to do an async commit so that kreiserfsd
-			 * can do it later
-			 */
-			do_journal_end(&th, COMMIT_NOW | WAIT);
-		}
-	}
-}
-
-/*
- * returns 0 if do_journal_end should return right away, returns 1 if
- * do_journal_end should finish the commit
- *
- * if the current transaction is too old, but still has writers, this will
- * wait on j_join_wait until all the writers are done.  By the time it
- * wakes up, the transaction it was called has already ended, so it just
- * flushes the commit list and returns 0.
- *
- * Won't batch when flush or commit_now is set.  Also won't batch when
- * others are waiting on j_join_wait.
- *
- * Note, we can't allow the journal_end to proceed while there are still
- * writers in the log.
- */
-static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
-{
-
-	time64_t now;
-	int flush = flags & FLUSH_ALL;
-	int commit_now = flags & COMMIT_NOW;
-	int wait_on_commit = flags & WAIT;
-	struct reiserfs_journal_list *jl;
-	struct super_block *sb = th->t_super;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
-	BUG_ON(!th->t_trans_id);
-
-	if (th->t_trans_id != journal->j_trans_id) {
-		reiserfs_panic(th->t_super, "journal-1577",
-			       "handle trans id %ld != current trans id %ld",
-			       th->t_trans_id, journal->j_trans_id);
-	}
-
-	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
-	/* <= 0 is allowed.  unmounting might not call begin */
-	if (atomic_read(&journal->j_wcount) > 0)
-		atomic_dec(&journal->j_wcount);
-
-	/*
-	 * BUG, deal with case where j_len is 0, but people previously
-	 * freed blocks need to be released will be dealt with by next
-	 * transaction that actually writes something, but should be taken
-	 * care of in this trans
-	 */
-	BUG_ON(journal->j_len == 0);
-
-	/*
-	 * if wcount > 0, and we are called to with flush or commit_now,
-	 * we wait on j_join_wait.  We will wake up when the last writer has
-	 * finished the transaction, and started it on its way to the disk.
-	 * Then, we flush the commit or journal list, and just return 0
-	 * because the rest of journal end was already done for this
-	 * transaction.
-	 */
-	if (atomic_read(&journal->j_wcount) > 0) {
-		if (flush || commit_now) {
-			unsigned trans_id;
-
-			jl = journal->j_current_jl;
-			trans_id = jl->j_trans_id;
-			if (wait_on_commit)
-				jl->j_state |= LIST_COMMIT_PENDING;
-			atomic_set(&journal->j_jlock, 1);
-			if (flush) {
-				journal->j_next_full_flush = 1;
-			}
-			unlock_journal(sb);
-
-			/*
-			 * sleep while the current transaction is
-			 * still j_jlocked
-			 */
-			while (journal->j_trans_id == trans_id) {
-				if (atomic_read(&journal->j_jlock)) {
-					queue_log_writer(sb);
-				} else {
-					lock_journal(sb);
-					if (journal->j_trans_id == trans_id) {
-						atomic_set(&journal->j_jlock,
-							   1);
-					}
-					unlock_journal(sb);
-				}
-			}
-			BUG_ON(journal->j_trans_id == trans_id);
-
-			if (commit_now
-			    && journal_list_still_alive(sb, trans_id)
-			    && wait_on_commit) {
-				flush_commit_list(sb, jl, 1);
-			}
-			return 0;
-		}
-		unlock_journal(sb);
-		return 0;
-	}
-
-	/* deal with old transactions where we are the last writers */
-	now = ktime_get_seconds();
-	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
-		commit_now = 1;
-		journal->j_next_async_flush = 1;
-	}
-	/* don't batch when someone is waiting on j_join_wait */
-	/* don't batch when syncing the commit or flushing the whole trans */
-	if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
-	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
-	    && journal->j_len_alloc < journal->j_max_batch
-	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
-		journal->j_bcount++;
-		unlock_journal(sb);
-		return 0;
-	}
-
-	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
-		reiserfs_panic(sb, "journal-003",
-			       "j_start (%ld) is too high",
-			       journal->j_start);
-	}
-	return 1;
-}
-
-/*
- * Does all the work that makes deleting blocks safe.
- * when deleting a block mark BH_JNew, just remove it from the current
- * transaction, clean it's buffer_head and move on.
- *
- * otherwise:
- * set a bit for the block in the journal bitmap.  That will prevent it from
- * being allocated for unformatted nodes before this transaction has finished.
- *
- * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
- * That will prevent any old transactions with this block from trying to flush
- * to the real location.  Since we aren't removing the cnode from the
- * journal_list_hash, *the block can't be reallocated yet.
- *
- * Then remove it from the current transaction, decrementing any counters and
- * filing it on the clean list.
- */
-int journal_mark_freed(struct reiserfs_transaction_handle *th,
-		       struct super_block *sb, b_blocknr_t blocknr)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_cnode *cn = NULL;
-	struct buffer_head *bh = NULL;
-	struct reiserfs_list_bitmap *jb = NULL;
-	int cleaned = 0;
-	BUG_ON(!th->t_trans_id);
-
-	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
-	if (cn && cn->bh) {
-		bh = cn->bh;
-		get_bh(bh);
-	}
-	/* if it is journal new, we just remove it from this transaction */
-	if (bh && buffer_journal_new(bh)) {
-		clear_buffer_journal_new(bh);
-		clear_prepared_bits(bh);
-		reiserfs_clean_and_file_buffer(bh);
-		cleaned = remove_from_transaction(sb, blocknr, cleaned);
-	} else {
-		/*
-		 * set the bit for this block in the journal bitmap
-		 * for this transaction
-		 */
-		jb = journal->j_current_jl->j_list_bitmap;
-		if (!jb) {
-			reiserfs_panic(sb, "journal-1702",
-				       "journal_list_bitmap is NULL");
-		}
-		set_bit_in_list_bitmap(sb, blocknr, jb);
-
-		/* Note, the entire while loop is not allowed to schedule.  */
-
-		if (bh) {
-			clear_prepared_bits(bh);
-			reiserfs_clean_and_file_buffer(bh);
-		}
-		cleaned = remove_from_transaction(sb, blocknr, cleaned);
-
-		/*
-		 * find all older transactions with this block,
-		 * make sure they don't try to write it out
-		 */
-		cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
-					  blocknr);
-		while (cn) {
-			if (sb == cn->sb && blocknr == cn->blocknr) {
-				set_bit(BLOCK_FREED, &cn->state);
-				if (cn->bh) {
-					/*
-					 * remove_from_transaction will brelse
-					 * the buffer if it was in the current
-					 * trans
-					 */
-					if (!cleaned) {
-						clear_buffer_journal_dirty(cn->
-									   bh);
-						clear_buffer_dirty(cn->bh);
-						clear_buffer_journal_test(cn->
-									  bh);
-						cleaned = 1;
-						put_bh(cn->bh);
-						if (atomic_read
-						    (&cn->bh->b_count) < 0) {
-							reiserfs_warning(sb,
-								 "journal-2138",
-								 "cn->bh->b_count < 0");
-						}
-					}
-					/*
-					 * since we are clearing the bh,
-					 * we MUST dec nonzerolen
-					 */
-					if (cn->jlist) {
-						atomic_dec(&cn->jlist->
-							   j_nonzerolen);
-					}
-					cn->bh = NULL;
-				}
-			}
-			cn = cn->hnext;
-		}
-	}
-
-	if (bh)
-		release_buffer_page(bh); /* get_hash grabs the buffer */
-	return 0;
-}
-
-void reiserfs_update_inode_transaction(struct inode *inode)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
-	REISERFS_I(inode)->i_jl = journal->j_current_jl;
-	REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
-}
-
-/*
- * returns -1 on error, 0 if no commits/barriers were done and 1
- * if a transaction was actually committed and the barrier was done
- */
-static int __commit_trans_jl(struct inode *inode, unsigned long id,
-			     struct reiserfs_journal_list *jl)
-{
-	struct reiserfs_transaction_handle th;
-	struct super_block *sb = inode->i_sb;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	int ret = 0;
-
-	/*
-	 * is it from the current transaction,
-	 * or from an unknown transaction?
-	 */
-	if (id == journal->j_trans_id) {
-		jl = journal->j_current_jl;
-		/*
-		 * try to let other writers come in and
-		 * grow this transaction
-		 */
-		let_transaction_grow(sb, id);
-		if (journal->j_trans_id != id) {
-			goto flush_commit_only;
-		}
-
-		ret = journal_begin(&th, sb, 1);
-		if (ret)
-			return ret;
-
-		/* someone might have ended this transaction while we joined */
-		if (journal->j_trans_id != id) {
-			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
-						     1);
-			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
-			ret = journal_end(&th);
-			goto flush_commit_only;
-		}
-
-		ret = journal_end_sync(&th);
-		if (!ret)
-			ret = 1;
-
-	} else {
-		/*
-		 * this gets tricky, we have to make sure the journal list in
-		 * the inode still exists.  We know the list is still around
-		 * if we've got a larger transaction id than the oldest list
-		 */
-flush_commit_only:
-		if (journal_list_still_alive(inode->i_sb, id)) {
-			/*
-			 * we only set ret to 1 when we know for sure
-			 * the barrier hasn't been started yet on the commit
-			 * block.
-			 */
-			if (atomic_read(&jl->j_commit_left) > 1)
-				ret = 1;
-			flush_commit_list(sb, jl, 1);
-			if (journal->j_errno)
-				ret = journal->j_errno;
-		}
-	}
-	/* otherwise the list is gone, and long since committed */
-	return ret;
-}
-
-int reiserfs_commit_for_inode(struct inode *inode)
-{
-	unsigned int id = REISERFS_I(inode)->i_trans_id;
-	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
-
-	/*
-	 * for the whole inode, assume unset id means it was
-	 * changed in the current transaction.  More conservative
-	 */
-	if (!id || !jl) {
-		reiserfs_update_inode_transaction(inode);
-		id = REISERFS_I(inode)->i_trans_id;
-		/* jl will be updated in __commit_trans_jl */
-	}
-
-	return __commit_trans_jl(inode, id, jl);
-}
-
-void reiserfs_restore_prepared_buffer(struct super_block *sb,
-				      struct buffer_head *bh)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	PROC_INFO_INC(sb, journal.restore_prepared);
-	if (!bh) {
-		return;
-	}
-	if (test_clear_buffer_journal_restore_dirty(bh) &&
-	    buffer_journal_dirty(bh)) {
-		struct reiserfs_journal_cnode *cn;
-		reiserfs_write_lock(sb);
-		cn = get_journal_hash_dev(sb,
-					  journal->j_list_hash_table,
-					  bh->b_blocknr);
-		if (cn && can_dirty(cn)) {
-			set_buffer_journal_test(bh);
-			mark_buffer_dirty(bh);
-		}
-		reiserfs_write_unlock(sb);
-	}
-	clear_buffer_journal_prepared(bh);
-}
-
-extern struct tree_balance *cur_tb;
-/*
- * before we can change a metadata block, we have to make sure it won't
- * be written to disk while we are altering it.  So, we must:
- * clean it
- * wait on it.
- */
-int reiserfs_prepare_for_journal(struct super_block *sb,
-				 struct buffer_head *bh, int wait)
-{
-	PROC_INFO_INC(sb, journal.prepare);
-
-	if (!trylock_buffer(bh)) {
-		if (!wait)
-			return 0;
-		lock_buffer(bh);
-	}
-	set_buffer_journal_prepared(bh);
-	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
-		clear_buffer_journal_test(bh);
-		set_buffer_journal_restore_dirty(bh);
-	}
-	unlock_buffer(bh);
-	return 1;
-}
-
-/*
- * long and ugly.  If flush, will not return until all commit
- * blocks and all real buffers in the trans are on disk.
- * If no_async, won't return until all commit blocks are on disk.
- *
- * keep reading, there are comments as you go along
- *
- * If the journal is aborted, we just clean up. Things like flushing
- * journal lists, etc just won't happen.
- */
-static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
-{
-	struct super_block *sb = th->t_super;
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
-	struct reiserfs_journal_cnode *last_cn = NULL;
-	struct reiserfs_journal_desc *desc;
-	struct reiserfs_journal_commit *commit;
-	struct buffer_head *c_bh;	/* commit bh */
-	struct buffer_head *d_bh;	/* desc bh */
-	int cur_write_start = 0;	/* start index of current log write */
-	int i;
-	int flush;
-	int wait_on_commit;
-	struct reiserfs_journal_list *jl, *temp_jl;
-	struct list_head *entry, *safe;
-	unsigned long jindex;
-	unsigned int commit_trans_id;
-	int trans_half;
-	int depth;
-
-	BUG_ON(th->t_refcount > 1);
-	BUG_ON(!th->t_trans_id);
-	BUG_ON(!th->t_super);
-
-	/*
-	 * protect flush_older_commits from doing mistakes if the
-	 * transaction ID counter gets overflowed.
-	 */
-	if (th->t_trans_id == ~0U)
-		flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
-	flush = flags & FLUSH_ALL;
-	wait_on_commit = flags & WAIT;
-
-	current->journal_info = th->t_handle_save;
-	reiserfs_check_lock_depth(sb, "journal end");
-	if (journal->j_len == 0) {
-		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
-					     1);
-		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
-	}
-
-	lock_journal(sb);
-	if (journal->j_next_full_flush) {
-		flags |= FLUSH_ALL;
-		flush = 1;
-	}
-	if (journal->j_next_async_flush) {
-		flags |= COMMIT_NOW | WAIT;
-		wait_on_commit = 1;
-	}
-
-	/*
-	 * check_journal_end locks the journal, and unlocks if it does
-	 * not return 1 it tells us if we should continue with the
-	 * journal_end, or just return
-	 */
-	if (!check_journal_end(th, flags)) {
-		reiserfs_schedule_old_flush(sb);
-		wake_queued_writers(sb);
-		reiserfs_async_progress_wait(sb);
-		goto out;
-	}
-
-	/* check_journal_end might set these, check again */
-	if (journal->j_next_full_flush) {
-		flush = 1;
-	}
-
-	/*
-	 * j must wait means we have to flush the log blocks, and the
-	 * real blocks for this transaction
-	 */
-	if (journal->j_must_wait > 0) {
-		flush = 1;
-	}
-#ifdef REISERFS_PREALLOCATE
-	/*
-	 * quota ops might need to nest, setup the journal_info pointer
-	 * for them and raise the refcount so that it is > 0.
-	 */
-	current->journal_info = th;
-	th->t_refcount++;
-
-	/* it should not involve new blocks into the transaction */
-	reiserfs_discard_all_prealloc(th);
-
-	th->t_refcount--;
-	current->journal_info = th->t_handle_save;
-#endif
-
-	/* setup description block */
-	d_bh =
-	    journal_getblk(sb,
-			   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-			   journal->j_start);
-	set_buffer_uptodate(d_bh);
-	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
-	memset(d_bh->b_data, 0, d_bh->b_size);
-	memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
-	set_desc_trans_id(desc, journal->j_trans_id);
-
-	/*
-	 * setup commit block.  Don't write (keep it clean too) this one
-	 * until after everyone else is written
-	 */
-	c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-			      ((journal->j_start + journal->j_len +
-				1) % SB_ONDISK_JOURNAL_SIZE(sb)));
-	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
-	memset(c_bh->b_data, 0, c_bh->b_size);
-	set_commit_trans_id(commit, journal->j_trans_id);
-	set_buffer_uptodate(c_bh);
-
-	/* init this journal list */
-	jl = journal->j_current_jl;
-
-	/*
-	 * we lock the commit before doing anything because
-	 * we want to make sure nobody tries to run flush_commit_list until
-	 * the new transaction is fully setup, and we've already flushed the
-	 * ordered bh list
-	 */
-	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
-
-	/* save the transaction id in case we need to commit it later */
-	commit_trans_id = jl->j_trans_id;
-
-	atomic_set(&jl->j_older_commits_done, 0);
-	jl->j_trans_id = journal->j_trans_id;
-	jl->j_timestamp = journal->j_trans_start_time;
-	jl->j_commit_bh = c_bh;
-	jl->j_start = journal->j_start;
-	jl->j_len = journal->j_len;
-	atomic_set(&jl->j_nonzerolen, journal->j_len);
-	atomic_set(&jl->j_commit_left, journal->j_len + 2);
-	jl->j_realblock = NULL;
-
-	/*
-	 * The ENTIRE FOR LOOP MUST not cause schedule to occur.
-	 * for each real block, add it to the journal list hash,
-	 * copy into real block index array in the commit or desc block
-	 */
-	trans_half = journal_trans_half(sb->s_blocksize);
-	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
-		if (buffer_journaled(cn->bh)) {
-			jl_cn = get_cnode(sb);
-			if (!jl_cn) {
-				reiserfs_panic(sb, "journal-1676",
-					       "get_cnode returned NULL");
-			}
-			if (i == 0) {
-				jl->j_realblock = jl_cn;
-			}
-			jl_cn->prev = last_cn;
-			jl_cn->next = NULL;
-			if (last_cn) {
-				last_cn->next = jl_cn;
-			}
-			last_cn = jl_cn;
-			/*
-			 * make sure the block we are trying to log
-			 * is not a block of journal or reserved area
-			 */
-			if (is_block_in_log_or_reserved_area
-			    (sb, cn->bh->b_blocknr)) {
-				reiserfs_panic(sb, "journal-2332",
-					       "Trying to log block %lu, "
-					       "which is a log block",
-					       cn->bh->b_blocknr);
-			}
-			jl_cn->blocknr = cn->bh->b_blocknr;
-			jl_cn->state = 0;
-			jl_cn->sb = sb;
-			jl_cn->bh = cn->bh;
-			jl_cn->jlist = jl;
-			insert_journal_hash(journal->j_list_hash_table, jl_cn);
-			if (i < trans_half) {
-				desc->j_realblock[i] =
-				    cpu_to_le32(cn->bh->b_blocknr);
-			} else {
-				commit->j_realblock[i - trans_half] =
-				    cpu_to_le32(cn->bh->b_blocknr);
-			}
-		} else {
-			i--;
-		}
-	}
-	set_desc_trans_len(desc, journal->j_len);
-	set_desc_mount_id(desc, journal->j_mount_id);
-	set_desc_trans_id(desc, journal->j_trans_id);
-	set_commit_trans_len(commit, journal->j_len);
-
-	/*
-	 * special check in case all buffers in the journal
-	 * were marked for not logging
-	 */
-	BUG_ON(journal->j_len == 0);
-
-	/*
-	 * we're about to dirty all the log blocks, mark the description block
-	 * dirty now too.  Don't mark the commit block dirty until all the
-	 * others are on disk
-	 */
-	mark_buffer_dirty(d_bh);
-
-	/*
-	 * first data block is j_start + 1, so add one to
-	 * cur_write_start wherever you use it
-	 */
-	cur_write_start = journal->j_start;
-	cn = journal->j_first;
-	jindex = 1;	/* start at one so we don't get the desc again */
-	while (cn) {
-		clear_buffer_journal_new(cn->bh);
-		/* copy all the real blocks into log area.  dirty log blocks */
-		if (buffer_journaled(cn->bh)) {
-			struct buffer_head *tmp_bh;
-			char *addr;
-			struct page *page;
-			tmp_bh =
-			    journal_getblk(sb,
-					   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
-					   ((cur_write_start +
-					     jindex) %
-					    SB_ONDISK_JOURNAL_SIZE(sb)));
-			set_buffer_uptodate(tmp_bh);
-			page = cn->bh->b_page;
-			addr = kmap(page);
-			memcpy(tmp_bh->b_data,
-			       addr + offset_in_page(cn->bh->b_data),
-			       cn->bh->b_size);
-			kunmap(page);
-			mark_buffer_dirty(tmp_bh);
-			jindex++;
-			set_buffer_journal_dirty(cn->bh);
-			clear_buffer_journaled(cn->bh);
-		} else {
-			/*
-			 * JDirty cleared sometime during transaction.
-			 * don't log this one
-			 */
-			reiserfs_warning(sb, "journal-2048",
-					 "BAD, buffer in journal hash, "
-					 "but not JDirty!");
-			brelse(cn->bh);
-		}
-		next = cn->next;
-		free_cnode(sb, cn);
-		cn = next;
-		reiserfs_cond_resched(sb);
-	}
-
-	/*
-	 * we are done with both the c_bh and d_bh, but
-	 * c_bh must be written after all other commit blocks,
-	 * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
-	 */
-
-	journal->j_current_jl = alloc_journal_list(sb);
-
-	/* now it is safe to insert this transaction on the main list */
-	list_add_tail(&jl->j_list, &journal->j_journal_list);
-	list_add_tail(&jl->j_working_list, &journal->j_working_list);
-	journal->j_num_work_lists++;
-
-	/* reset journal values for the next transaction */
-	journal->j_start =
-	    (journal->j_start + journal->j_len +
-	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
-	atomic_set(&journal->j_wcount, 0);
-	journal->j_bcount = 0;
-	journal->j_last = NULL;
-	journal->j_first = NULL;
-	journal->j_len = 0;
-	journal->j_trans_start_time = 0;
-	/* check for trans_id overflow */
-	if (++journal->j_trans_id == 0)
-		journal->j_trans_id = 10;
-	journal->j_current_jl->j_trans_id = journal->j_trans_id;
-	journal->j_must_wait = 0;
-	journal->j_len_alloc = 0;
-	journal->j_next_full_flush = 0;
-	journal->j_next_async_flush = 0;
-	init_journal_hash(sb);
-
-	/*
-	 * make sure reiserfs_add_jh sees the new current_jl before we
-	 * write out the tails
-	 */
-	smp_mb();
-
-	/*
-	 * tail conversion targets have to hit the disk before we end the
-	 * transaction.  Otherwise a later transaction might repack the tail
-	 * before this transaction commits, leaving the data block unflushed
-	 * and clean, if we crash before the later transaction commits, the
-	 * data block is lost.
-	 */
-	if (!list_empty(&jl->j_tail_bh_list)) {
-		depth = reiserfs_write_unlock_nested(sb);
-		write_ordered_buffers(&journal->j_dirty_buffers_lock,
-				      journal, jl, &jl->j_tail_bh_list);
-		reiserfs_write_lock_nested(sb, depth);
-	}
-	BUG_ON(!list_empty(&jl->j_tail_bh_list));
-	mutex_unlock(&jl->j_commit_mutex);
-
-	/*
-	 * honor the flush wishes from the caller, simple commits can
-	 * be done outside the journal lock, they are done below
-	 *
-	 * if we don't flush the commit list right now, we put it into
-	 * the work queue so the people waiting on the async progress work
-	 * queue don't wait for this proc to flush journal lists and such.
-	 */
-	if (flush) {
-		flush_commit_list(sb, jl, 1);
-		flush_journal_list(sb, jl, 1);
-	} else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
-		/*
-		 * Avoid queueing work when sb is being shut down. Transaction
-		 * will be flushed on journal shutdown.
-		 */
-		if (sb->s_flags & SB_ACTIVE)
-			queue_delayed_work(REISERFS_SB(sb)->commit_wq,
-					   &journal->j_work, HZ / 10);
-	}
-
-	/*
-	 * if the next transaction has any chance of wrapping, flush
-	 * transactions that might get overwritten.  If any journal lists
-	 * are very old flush them as well.
-	 */
-first_jl:
-	list_for_each_safe(entry, safe, &journal->j_journal_list) {
-		temp_jl = JOURNAL_LIST_ENTRY(entry);
-		if (journal->j_start <= temp_jl->j_start) {
-			if ((journal->j_start + journal->j_trans_max + 1) >=
-			    temp_jl->j_start) {
-				flush_used_journal_lists(sb, temp_jl);
-				goto first_jl;
-			} else if ((journal->j_start +
-				    journal->j_trans_max + 1) <
-				   SB_ONDISK_JOURNAL_SIZE(sb)) {
-				/*
-				 * if we don't cross into the next
-				 * transaction and we don't wrap, there is
-				 * no way we can overlap any later transactions
-				 * break now
-				 */
-				break;
-			}
-		} else if ((journal->j_start +
-			    journal->j_trans_max + 1) >
-			   SB_ONDISK_JOURNAL_SIZE(sb)) {
-			if (((journal->j_start + journal->j_trans_max + 1) %
-			     SB_ONDISK_JOURNAL_SIZE(sb)) >=
-			    temp_jl->j_start) {
-				flush_used_journal_lists(sb, temp_jl);
-				goto first_jl;
-			} else {
-				/*
-				* we don't overlap anything from out start
-				* to the end of the log, and our wrapped
-				* portion doesn't overlap anything at
-				* the start of the log.  We can break
-				*/
-				break;
-			}
-		}
-	}
-
-	journal->j_current_jl->j_list_bitmap =
-	    get_list_bitmap(sb, journal->j_current_jl);
-
-	if (!(journal->j_current_jl->j_list_bitmap)) {
-		reiserfs_panic(sb, "journal-1996",
-			       "could not get a list bitmap");
-	}
-
-	atomic_set(&journal->j_jlock, 0);
-	unlock_journal(sb);
-	/* wake up any body waiting to join. */
-	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
-	wake_up(&journal->j_join_wait);
-
-	if (!flush && wait_on_commit &&
-	    journal_list_still_alive(sb, commit_trans_id)) {
-		flush_commit_list(sb, jl, 1);
-	}
-out:
-	reiserfs_check_lock_depth(sb, "journal end2");
-
-	memset(th, 0, sizeof(*th));
-	/*
-	 * Re-set th->t_super, so we can properly keep track of how many
-	 * persistent transactions there are. We need to do this so if this
-	 * call is part of a failed restart_transaction, we can free it later
-	 */
-	th->t_super = sb;
-
-	return journal->j_errno;
-}
-
-/* Send the file system read only and refuse new transactions */
-void reiserfs_abort_journal(struct super_block *sb, int errno)
-{
-	struct reiserfs_journal *journal = SB_JOURNAL(sb);
-	if (test_bit(J_ABORTED, &journal->j_state))
-		return;
-
-	if (!journal->j_errno)
-		journal->j_errno = errno;
-
-	sb->s_flags |= SB_RDONLY;
-	set_bit(J_ABORTED, &journal->j_state);
-
-#ifdef CONFIG_REISERFS_CHECK
-	dump_stack();
-#endif
-}
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
deleted file mode 100644
index 7f868569d4d0..000000000000
--- a/fs/reiserfs/lbalance.c
+++ /dev/null
@@ -1,1426 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/uaccess.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/*
- * copy copy_count entries from source directory item to dest buffer
- * (creating new item if needed)
- */
-static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
-				  struct buffer_head *source, int last_first,
-				  int item_num, int from, int copy_count)
-{
-	struct buffer_head *dest = dest_bi->bi_bh;
-	/*
-	 * either the number of target item, or if we must create a
-	 * new item, the number of the item we will create it next to
-	 */
-	int item_num_in_dest;
-
-	struct item_head *ih;
-	struct reiserfs_de_head *deh;
-	int copy_records_len;	/* length of all records in item to be copied */
-	char *records;
-
-	ih = item_head(source, item_num);
-
-	RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
-
-	/*
-	 * length of all record to be copied and first byte of
-	 * the last of them
-	 */
-	deh = B_I_DEH(source, ih);
-	if (copy_count) {
-		copy_records_len = (from ? deh_location(&deh[from - 1]) :
-				    ih_item_len(ih)) -
-		    deh_location(&deh[from + copy_count - 1]);
-		records =
-		    source->b_data + ih_location(ih) +
-		    deh_location(&deh[from + copy_count - 1]);
-	} else {
-		copy_records_len = 0;
-		records = NULL;
-	}
-
-	/* when copy last to first, dest buffer can contain 0 items */
-	item_num_in_dest =
-	    (last_first ==
-	     LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
-							       - 1);
-
-	/*
-	 * if there are no items in dest or the first/last item in
-	 * dest is not item of the same directory
-	 */
-	if ((item_num_in_dest == -1) ||
-	    (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
-	    (last_first == LAST_TO_FIRST
-	     && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
-							 leaf_key(dest,
-								  item_num_in_dest))))
-	{
-		/* create new item in dest */
-		struct item_head new_ih;
-
-		/* form item header */
-		memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
-		put_ih_version(&new_ih, KEY_FORMAT_3_5);
-		/* calculate item len */
-		put_ih_item_len(&new_ih,
-				DEH_SIZE * copy_count + copy_records_len);
-		put_ih_entry_count(&new_ih, 0);
-
-		if (last_first == LAST_TO_FIRST) {
-			/* form key by the following way */
-			if (from < ih_entry_count(ih)) {
-				set_le_ih_k_offset(&new_ih,
-						   deh_offset(&deh[from]));
-			} else {
-				/*
-				 * no entries will be copied to this
-				 * item in this function
-				 */
-				set_le_ih_k_offset(&new_ih, U32_MAX);
-				/*
-				 * this item is not yet valid, but we
-				 * want I_IS_DIRECTORY_ITEM to return 1
-				 * for it, so we -1
-				 */
-			}
-			set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
-					  TYPE_DIRENTRY);
-		}
-
-		/* insert item into dest buffer */
-		leaf_insert_into_buf(dest_bi,
-				     (last_first ==
-				      LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest),
-				     &new_ih, NULL, 0);
-	} else {
-		/* prepare space for entries */
-		leaf_paste_in_buffer(dest_bi,
-				     (last_first ==
-				      FIRST_TO_LAST) ? (B_NR_ITEMS(dest) -
-							1) : 0, MAX_US_INT,
-				     DEH_SIZE * copy_count + copy_records_len,
-				     records, 0);
-	}
-
-	item_num_in_dest =
-	    (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
-
-	leaf_paste_entries(dest_bi, item_num_in_dest,
-			   (last_first ==
-			    FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
-									  item_num_in_dest))
-			   : 0, copy_count, deh + from, records,
-			   DEH_SIZE * copy_count + copy_records_len);
-}
-
-/*
- * Copy the first (if last_first == FIRST_TO_LAST) or last
- * (last_first == LAST_TO_FIRST) item or part of it or nothing
- * (see the return 0 below) from SOURCE to the end (if last_first)
- * or beginning (!last_first) of the DEST
- */
-/* returns 1 if anything was copied, else 0 */
-static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
-				   struct buffer_head *src, int last_first,
-				   int bytes_or_entries)
-{
-	struct buffer_head *dest = dest_bi->bi_bh;
-	/* number of items in the source and destination buffers */
-	int dest_nr_item, src_nr_item;
-	struct item_head *ih;
-	struct item_head *dih;
-
-	dest_nr_item = B_NR_ITEMS(dest);
-
-	/*
-	 * if ( DEST is empty or first item of SOURCE and last item of
-	 * DEST are the items of different objects or of different types )
-	 * then there is no need to treat this item differently from the
-	 * other items that we copy, so we return
-	 */
-	if (last_first == FIRST_TO_LAST) {
-		ih = item_head(src, 0);
-		dih = item_head(dest, dest_nr_item - 1);
-
-		/* there is nothing to merge */
-		if (!dest_nr_item
-		    || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
-			return 0;
-
-		RFALSE(!ih_item_len(ih),
-		       "vs-10010: item can not have empty length");
-
-		if (is_direntry_le_ih(ih)) {
-			if (bytes_or_entries == -1)
-				/* copy all entries to dest */
-				bytes_or_entries = ih_entry_count(ih);
-			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0,
-					      bytes_or_entries);
-			return 1;
-		}
-
-		/*
-		 * copy part of the body of the first item of SOURCE
-		 * to the end of the body of the last item of the DEST
-		 * part defined by 'bytes_or_entries'; if bytes_or_entries
-		 * == -1 copy whole body; don't create new item header
-		 */
-		if (bytes_or_entries == -1)
-			bytes_or_entries = ih_item_len(ih);
-
-#ifdef CONFIG_REISERFS_CHECK
-		else {
-			if (bytes_or_entries == ih_item_len(ih)
-			    && is_indirect_le_ih(ih))
-				if (get_ih_free_space(ih))
-					reiserfs_panic(sb_from_bi(dest_bi),
-						       "vs-10020",
-						       "last unformatted node "
-						       "must be filled "
-						       "entirely (%h)", ih);
-		}
-#endif
-
-		/*
-		 * merge first item (or its part) of src buffer with the last
-		 * item of dest buffer. Both are of the same file
-		 */
-		leaf_paste_in_buffer(dest_bi,
-				     dest_nr_item - 1, ih_item_len(dih),
-				     bytes_or_entries, ih_item_body(src, ih), 0);
-
-		if (is_indirect_le_ih(dih)) {
-			RFALSE(get_ih_free_space(dih),
-			       "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
-			       ih);
-			if (bytes_or_entries == ih_item_len(ih))
-				set_ih_free_space(dih, get_ih_free_space(ih));
-		}
-
-		return 1;
-	}
-
-	/* copy boundary item to right (last_first == LAST_TO_FIRST) */
-
-	/*
-	 * (DEST is empty or last item of SOURCE and first item of DEST
-	 * are the items of different object or of different types)
-	 */
-	src_nr_item = B_NR_ITEMS(src);
-	ih = item_head(src, src_nr_item - 1);
-	dih = item_head(dest, 0);
-
-	if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
-		return 0;
-
-	if (is_direntry_le_ih(ih)) {
-		/*
-		 * bytes_or_entries = entries number in last
-		 * item body of SOURCE
-		 */
-		if (bytes_or_entries == -1)
-			bytes_or_entries = ih_entry_count(ih);
-
-		leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
-				      src_nr_item - 1,
-				      ih_entry_count(ih) - bytes_or_entries,
-				      bytes_or_entries);
-		return 1;
-	}
-
-	/*
-	 * copy part of the body of the last item of SOURCE to the
-	 * begin of the body of the first item of the DEST; part defined
-	 * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body;
-	 * change first item key of the DEST; don't create new item header
-	 */
-
-	RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
-	       "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
-	       ih);
-
-	if (bytes_or_entries == -1) {
-		/* bytes_or_entries = length of last item body of SOURCE */
-		bytes_or_entries = ih_item_len(ih);
-
-		RFALSE(le_ih_k_offset(dih) !=
-		       le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size),
-		       "vs-10050: items %h and %h do not match", ih, dih);
-
-		/* change first item key of the DEST */
-		set_le_ih_k_offset(dih, le_ih_k_offset(ih));
-
-		/* item becomes non-mergeable */
-		/* or mergeable if left item was */
-		set_le_ih_k_type(dih, le_ih_k_type(ih));
-	} else {
-		/* merge to right only part of item */
-		RFALSE(ih_item_len(ih) <= bytes_or_entries,
-		       "vs-10060: no so much bytes %lu (needed %lu)",
-		       (unsigned long)ih_item_len(ih),
-		       (unsigned long)bytes_or_entries);
-
-		/* change first item key of the DEST */
-		if (is_direct_le_ih(dih)) {
-			RFALSE(le_ih_k_offset(dih) <=
-			       (unsigned long)bytes_or_entries,
-			       "vs-10070: dih %h, bytes_or_entries(%d)", dih,
-			       bytes_or_entries);
-			set_le_ih_k_offset(dih,
-					   le_ih_k_offset(dih) -
-					   bytes_or_entries);
-		} else {
-			RFALSE(le_ih_k_offset(dih) <=
-			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
-			       "vs-10080: dih %h, bytes_or_entries(%d)",
-			       dih,
-			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size);
-			set_le_ih_k_offset(dih,
-					   le_ih_k_offset(dih) -
-					   ((bytes_or_entries / UNFM_P_SIZE) *
-					    dest->b_size));
-		}
-	}
-
-	leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
-			     ih_item_body(src,
-				       ih) + ih_item_len(ih) - bytes_or_entries,
-			     0);
-	return 1;
-}
-
-/*
- * copy cpy_mun items from buffer src to buffer dest
- * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning
- *                             from first-th item in src to tail of dest
- * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning
- *                             from first-th item in src to head of dest
- */
-static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
-				     struct buffer_head *src, int last_first,
-				     int first, int cpy_num)
-{
-	struct buffer_head *dest;
-	int nr, free_space;
-	int dest_before;
-	int last_loc, last_inserted_loc, location;
-	int i, j;
-	struct block_head *blkh;
-	struct item_head *ih;
-
-	RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
-	       "vs-10090: bad last_first parameter %d", last_first);
-	RFALSE(B_NR_ITEMS(src) - first < cpy_num,
-	       "vs-10100: too few items in source %d, required %d from %d",
-	       B_NR_ITEMS(src), cpy_num, first);
-	RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items");
-	RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items");
-
-	dest = dest_bi->bi_bh;
-
-	RFALSE(!dest, "vs-10130: can not copy negative amount of items");
-
-	if (cpy_num == 0)
-		return;
-
-	blkh = B_BLK_HEAD(dest);
-	nr = blkh_nr_item(blkh);
-	free_space = blkh_free_space(blkh);
-
-	/*
-	 * we will insert items before 0-th or nr-th item in dest buffer.
-	 * It depends of last_first parameter
-	 */
-	dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
-
-	/* location of head of first new item */
-	ih = item_head(dest, dest_before);
-
-	RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
-	       "vs-10140: not enough free space for headers %d (needed %d)",
-	       B_FREE_SPACE(dest), cpy_num * IH_SIZE);
-
-	/* prepare space for headers */
-	memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
-
-	/* copy item headers */
-	memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);
-
-	free_space -= (IH_SIZE * cpy_num);
-	set_blkh_free_space(blkh, free_space);
-
-	/* location of unmovable item */
-	j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1);
-	for (i = dest_before; i < nr + cpy_num; i++) {
-		location -= ih_item_len(ih + i - dest_before);
-		put_ih_location(ih + i - dest_before, location);
-	}
-
-	/* prepare space for items */
-	last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
-	last_inserted_loc = ih_location(&ih[cpy_num - 1]);
-
-	/* check free space */
-	RFALSE(free_space < j - last_inserted_loc,
-	       "vs-10150: not enough free space for items %d (needed %d)",
-	       free_space, j - last_inserted_loc);
-
-	memmove(dest->b_data + last_loc,
-		dest->b_data + last_loc + j - last_inserted_loc,
-		last_inserted_loc - last_loc);
-
-	/* copy items */
-	memcpy(dest->b_data + last_inserted_loc,
-	       item_body(src, (first + cpy_num - 1)),
-	       j - last_inserted_loc);
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, nr + cpy_num);
-	set_blkh_free_space(blkh, free_space - (j - last_inserted_loc));
-
-	do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0);
-
-	if (dest_bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
-		RFALSE(dc_block_number(t_dc) != dest->b_blocknr,
-		       "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
-		       (long unsigned)dest->b_blocknr,
-		       (long unsigned)dc_block_number(t_dc));
-		put_dc_size(t_dc,
-			    dc_size(t_dc) + (j - last_inserted_loc +
-					     IH_SIZE * cpy_num));
-
-		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
-					       0);
-	}
-}
-
-/*
- * This function splits the (liquid) item into two items (useful when
- * shifting part of an item into another node.)
- */
-static void leaf_item_bottle(struct buffer_info *dest_bi,
-			     struct buffer_head *src, int last_first,
-			     int item_num, int cpy_bytes)
-{
-	struct buffer_head *dest = dest_bi->bi_bh;
-	struct item_head *ih;
-
-	RFALSE(cpy_bytes == -1,
-	       "vs-10170: bytes == - 1 means: do not split item");
-
-	if (last_first == FIRST_TO_LAST) {
-		/*
-		 * if ( if item in position item_num in buffer SOURCE
-		 * is directory item )
-		 */
-		ih = item_head(src, item_num);
-		if (is_direntry_le_ih(ih))
-			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
-					      item_num, 0, cpy_bytes);
-		else {
-			struct item_head n_ih;
-
-			/*
-			 * copy part of the body of the item number 'item_num'
-			 * of SOURCE to the end of the DEST part defined by
-			 * 'cpy_bytes'; create new item header; change old
-			 * item_header (????); n_ih = new item_header;
-			 */
-			memcpy(&n_ih, ih, IH_SIZE);
-			put_ih_item_len(&n_ih, cpy_bytes);
-			if (is_indirect_le_ih(ih)) {
-				RFALSE(cpy_bytes == ih_item_len(ih)
-				       && get_ih_free_space(ih),
-				       "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
-				       (long unsigned)get_ih_free_space(ih));
-				set_ih_free_space(&n_ih, 0);
-			}
-
-			RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size),
-			       "vs-10190: bad mergeability of item %h", ih);
-			n_ih.ih_version = ih->ih_version;	/* JDM Endian safe, both le */
-			leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
-					     item_body(src, item_num), 0);
-		}
-	} else {
-		/*
-		 * if ( if item in position item_num in buffer
-		 * SOURCE is directory item )
-		 */
-		ih = item_head(src, item_num);
-		if (is_direntry_le_ih(ih))
-			leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
-					      item_num,
-					      ih_entry_count(ih) - cpy_bytes,
-					      cpy_bytes);
-		else {
-			struct item_head n_ih;
-
-			/*
-			 * copy part of the body of the item number 'item_num'
-			 * of SOURCE to the begin of the DEST part defined by
-			 * 'cpy_bytes'; create new item header;
-			 * n_ih = new item_header;
-			 */
-			memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE);
-
-			/* Endian safe, both le */
-			n_ih.ih_version = ih->ih_version;
-
-			if (is_direct_le_ih(ih)) {
-				set_le_ih_k_offset(&n_ih,
-						   le_ih_k_offset(ih) +
-						   ih_item_len(ih) - cpy_bytes);
-				set_le_ih_k_type(&n_ih, TYPE_DIRECT);
-				set_ih_free_space(&n_ih, MAX_US_INT);
-			} else {
-				/* indirect item */
-				RFALSE(!cpy_bytes && get_ih_free_space(ih),
-				       "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
-				set_le_ih_k_offset(&n_ih,
-						   le_ih_k_offset(ih) +
-						   (ih_item_len(ih) -
-						    cpy_bytes) / UNFM_P_SIZE *
-						   dest->b_size);
-				set_le_ih_k_type(&n_ih, TYPE_INDIRECT);
-				set_ih_free_space(&n_ih, get_ih_free_space(ih));
-			}
-
-			/* set item length */
-			put_ih_item_len(&n_ih, cpy_bytes);
-
-			/* Endian safe, both le */
-			n_ih.ih_version = ih->ih_version;
-
-			leaf_insert_into_buf(dest_bi, 0, &n_ih,
-					     item_body(src, item_num) +
-						ih_item_len(ih) - cpy_bytes, 0);
-		}
-	}
-}
-
-/*
- * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE
- * to DEST.  If cpy_bytes not equal to minus one than copy cpy_num-1 whole
- * items from SOURCE to DEST.  From last item copy cpy_num bytes for regular
- * item and cpy_num directory entries for directory item.
- */
-static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
-			   int last_first, int cpy_num, int cpy_bytes)
-{
-	struct buffer_head *dest;
-	int pos, i, src_nr_item, bytes;
-
-	dest = dest_bi->bi_bh;
-	RFALSE(!dest || !src, "vs-10210: !dest || !src");
-	RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
-	       "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
-	RFALSE(B_NR_ITEMS(src) < cpy_num,
-	       "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src),
-	       cpy_num);
-	RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num);
-
-	if (cpy_num == 0)
-		return 0;
-
-	if (last_first == FIRST_TO_LAST) {
-		/* copy items to left */
-		pos = 0;
-		if (cpy_num == 1)
-			bytes = cpy_bytes;
-		else
-			bytes = -1;
-
-		/*
-		 * copy the first item or it part or nothing to the end of
-		 * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes))
-		 */
-		i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
-		cpy_num -= i;
-		if (cpy_num == 0)
-			return i;
-		pos += i;
-		if (cpy_bytes == -1)
-			/*
-			 * copy first cpy_num items starting from position
-			 * 'pos' of SOURCE to end of DEST
-			 */
-			leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
-						 pos, cpy_num);
-		else {
-			/*
-			 * copy first cpy_num-1 items starting from position
-			 * 'pos-1' of the SOURCE to the end of the DEST
-			 */
-			leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
-						 pos, cpy_num - 1);
-
-			/*
-			 * copy part of the item which number is
-			 * cpy_num+pos-1 to the end of the DEST
-			 */
-			leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
-					 cpy_num + pos - 1, cpy_bytes);
-		}
-	} else {
-		/* copy items to right */
-		src_nr_item = B_NR_ITEMS(src);
-		if (cpy_num == 1)
-			bytes = cpy_bytes;
-		else
-			bytes = -1;
-
-		/*
-		 * copy the last item or it part or nothing to the
-		 * begin of the DEST
-		 * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes));
-		 */
-		i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
-
-		cpy_num -= i;
-		if (cpy_num == 0)
-			return i;
-
-		pos = src_nr_item - cpy_num - i;
-		if (cpy_bytes == -1) {
-			/*
-			 * starting from position 'pos' copy last cpy_num
-			 * items of SOURCE to begin of DEST
-			 */
-			leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
-						 pos, cpy_num);
-		} else {
-			/*
-			 * copy last cpy_num-1 items starting from position
-			 * 'pos+1' of the SOURCE to the begin of the DEST;
-			 */
-			leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
-						 pos + 1, cpy_num - 1);
-
-			/*
-			 * copy part of the item which number is pos to
-			 * the begin of the DEST
-			 */
-			leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
-					 cpy_bytes);
-		}
-	}
-	return i;
-}
-
-/*
- * there are types of coping: from S[0] to L[0], from S[0] to R[0],
- * from R[0] to L[0]. for each of these we have to define parent and
- * positions of destination and source buffers
- */
-static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
-				       struct buffer_info *dest_bi,
-				       struct buffer_info *src_bi,
-				       int *first_last,
-				       struct buffer_head *Snew)
-{
-	memset(dest_bi, 0, sizeof(struct buffer_info));
-	memset(src_bi, 0, sizeof(struct buffer_info));
-
-	/* define dest, src, dest parent, dest position */
-	switch (shift_mode) {
-	case LEAF_FROM_S_TO_L:	/* it is used in leaf_shift_left */
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-
-		/* src->b_item_order */
-		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->L[0];
-		dest_bi->bi_parent = tb->FL[0];
-		dest_bi->bi_position = get_left_neighbor_position(tb, 0);
-		*first_last = FIRST_TO_LAST;
-		break;
-
-	case LEAF_FROM_S_TO_R:	/* it is used in leaf_shift_right */
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->R[0];
-		dest_bi->bi_parent = tb->FR[0];
-		dest_bi->bi_position = get_right_neighbor_position(tb, 0);
-		*first_last = LAST_TO_FIRST;
-		break;
-
-	case LEAF_FROM_R_TO_L:	/* it is used in balance_leaf_when_delete */
-		src_bi->tb = tb;
-		src_bi->bi_bh = tb->R[0];
-		src_bi->bi_parent = tb->FR[0];
-		src_bi->bi_position = get_right_neighbor_position(tb, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->L[0];
-		dest_bi->bi_parent = tb->FL[0];
-		dest_bi->bi_position = get_left_neighbor_position(tb, 0);
-		*first_last = FIRST_TO_LAST;
-		break;
-
-	case LEAF_FROM_L_TO_R:	/* it is used in balance_leaf_when_delete */
-		src_bi->tb = tb;
-		src_bi->bi_bh = tb->L[0];
-		src_bi->bi_parent = tb->FL[0];
-		src_bi->bi_position = get_left_neighbor_position(tb, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = tb->R[0];
-		dest_bi->bi_parent = tb->FR[0];
-		dest_bi->bi_position = get_right_neighbor_position(tb, 0);
-		*first_last = LAST_TO_FIRST;
-		break;
-
-	case LEAF_FROM_S_TO_SNEW:
-		src_bi->tb = tb;
-		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
-		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
-		dest_bi->tb = tb;
-		dest_bi->bi_bh = Snew;
-		dest_bi->bi_parent = NULL;
-		dest_bi->bi_position = 0;
-		*first_last = LAST_TO_FIRST;
-		break;
-
-	default:
-		reiserfs_panic(sb_from_bi(src_bi), "vs-10250",
-			       "shift type is unknown (%d)", shift_mode);
-	}
-	RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
-	       "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
-	       shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
-}
-
-/*
- * copy mov_num items and mov_bytes of the (mov_num-1)th item to
- * neighbor. Delete them from source
- */
-int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
-		    int mov_bytes, struct buffer_head *Snew)
-{
-	int ret_value;
-	struct buffer_info dest_bi, src_bi;
-	int first_last;
-
-	leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi,
-				   &first_last, Snew);
-
-	ret_value =
-	    leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num,
-			    mov_bytes);
-
-	leaf_delete_items(&src_bi, first_last,
-			  (first_last ==
-			   FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) -
-						 mov_num), mov_num, mov_bytes);
-
-	return ret_value;
-}
-
-/*
- * Shift shift_num items (and shift_bytes of last shifted item if
- * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key
- */
-int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
-{
-	struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int i;
-
-	/*
-	 * move shift_num (and shift_bytes bytes) items from S[0]
-	 * to left neighbor L[0]
-	 */
-	i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
-
-	if (shift_num) {
-		/* number of items in S[0] == 0 */
-		if (B_NR_ITEMS(S0) == 0) {
-
-			RFALSE(shift_bytes != -1,
-			       "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
-			       shift_bytes);
-#ifdef CONFIG_REISERFS_CHECK
-			if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
-				print_cur_tb("vs-10275");
-				reiserfs_panic(tb->tb_sb, "vs-10275",
-					       "balance condition corrupted "
-					       "(%c)", tb->tb_mode);
-			}
-#endif
-
-			if (PATH_H_POSITION(tb->tb_path, 1) == 0)
-				replace_key(tb, tb->CFL[0], tb->lkey[0],
-					    PATH_H_PPARENT(tb->tb_path, 0), 0);
-
-		} else {
-			/* replace lkey in CFL[0] by 0-th key from S[0]; */
-			replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
-
-			RFALSE((shift_bytes != -1 &&
-				!(is_direntry_le_ih(item_head(S0, 0))
-				  && !ih_entry_count(item_head(S0, 0)))) &&
-			       (!op_is_left_mergeable
-				(leaf_key(S0, 0), S0->b_size)),
-			       "vs-10280: item must be mergeable");
-		}
-	}
-
-	return i;
-}
-
-/* CLEANING STOPPED HERE */
-
-/*
- * Shift shift_num (shift_bytes) items from S[0] to the right neighbor,
- * and replace the delimiting key
- */
-int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
-{
-	int ret_value;
-
-	/*
-	 * move shift_num (and shift_bytes) items from S[0] to
-	 * right neighbor R[0]
-	 */
-	ret_value =
-	    leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
-
-	/* replace rkey in CFR[0] by the 0-th key from R[0] */
-	if (shift_num) {
-		replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
-	}
-
-	return ret_value;
-}
-
-static void leaf_delete_items_entirely(struct buffer_info *bi,
-				       int first, int del_num);
-/*
- * If del_bytes == -1, starting from position 'first' delete del_num
- * items in whole in buffer CUR.
- *   If not.
- *   If last_first == 0. Starting from position 'first' delete del_num-1
- *   items in whole. Delete part of body of the first item. Part defined by
- *   del_bytes. Don't delete first item header
- *   If last_first == 1. Starting from position 'first+1' delete del_num-1
- *   items in whole. Delete part of body of the last item . Part defined by
- *   del_bytes. Don't delete last item header.
-*/
-void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
-		       int first, int del_num, int del_bytes)
-{
-	struct buffer_head *bh;
-	int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh);
-
-	RFALSE(!bh, "10155: bh is not defined");
-	RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d",
-	       del_num);
-	RFALSE(first < 0
-	       || first + del_num > item_amount,
-	       "10165: invalid number of first item to be deleted (%d) or "
-	       "no so much items (%d) to delete (only %d)", first,
-	       first + del_num, item_amount);
-
-	if (del_num == 0)
-		return;
-
-	if (first == 0 && del_num == item_amount && del_bytes == -1) {
-		make_empty_node(cur_bi);
-		do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0);
-		return;
-	}
-
-	if (del_bytes == -1)
-		/* delete del_num items beginning from item in position first */
-		leaf_delete_items_entirely(cur_bi, first, del_num);
-	else {
-		if (last_first == FIRST_TO_LAST) {
-			/*
-			 * delete del_num-1 items beginning from
-			 * item in position first
-			 */
-			leaf_delete_items_entirely(cur_bi, first, del_num - 1);
-
-			/*
-			 * delete the part of the first item of the bh
-			 * do not delete item header
-			 */
-			leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
-		} else {
-			struct item_head *ih;
-			int len;
-
-			/*
-			 * delete del_num-1 items beginning from
-			 * item in position first+1
-			 */
-			leaf_delete_items_entirely(cur_bi, first + 1,
-						   del_num - 1);
-
-			ih = item_head(bh, B_NR_ITEMS(bh) - 1);
-			if (is_direntry_le_ih(ih))
-				/* the last item is directory  */
-				/*
-				 * len = numbers of directory entries
-				 * in this item
-				 */
-				len = ih_entry_count(ih);
-			else
-				/* len = body len of item */
-				len = ih_item_len(ih);
-
-			/*
-			 * delete the part of the last item of the bh
-			 * do not delete item header
-			 */
-			leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
-					     len - del_bytes, del_bytes);
-		}
-	}
-}
-
-/* insert item into the leaf node in position before */
-void leaf_insert_into_buf(struct buffer_info *bi, int before,
-			  struct item_head * const inserted_item_ih,
-			  const char * const inserted_item_body,
-			  int zeros_number)
-{
-	struct buffer_head *bh = bi->bi_bh;
-	int nr, free_space;
-	struct block_head *blkh;
-	struct item_head *ih;
-	int i;
-	int last_loc, unmoved_loc;
-	char *to;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-	free_space = blkh_free_space(blkh);
-
-	/* check free space */
-	RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
-	       "vs-10170: not enough free space in block %z, new item %h",
-	       bh, inserted_item_ih);
-	RFALSE(zeros_number > ih_item_len(inserted_item_ih),
-	       "vs-10172: zero number == %d, item length == %d",
-	       zeros_number, ih_item_len(inserted_item_ih));
-
-	/* get item new item must be inserted before */
-	ih = item_head(bh, before);
-
-	/* prepare space for the body of new item */
-	last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size;
-	unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
-
-	memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
-		bh->b_data + last_loc, unmoved_loc - last_loc);
-
-	to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
-	memset(to, 0, zeros_number);
-	to += zeros_number;
-
-	/* copy body to prepared space */
-	if (inserted_item_body)
-		memmove(to, inserted_item_body,
-			ih_item_len(inserted_item_ih) - zeros_number);
-	else
-		memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
-
-	/* insert item header */
-	memmove(ih + 1, ih, IH_SIZE * (nr - before));
-	memmove(ih, inserted_item_ih, IH_SIZE);
-
-	/* change locations */
-	for (i = before; i < nr + 1; i++) {
-		unmoved_loc -= ih_item_len(&ih[i - before]);
-		put_ih_location(&ih[i - before], unmoved_loc);
-	}
-
-	/* sizes, free space, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
-	set_blkh_free_space(blkh,
-			    free_space - (IH_SIZE +
-					  ih_item_len(inserted_item_ih)));
-	do_balance_mark_leaf_dirty(bi->tb, bh, 1);
-
-	if (bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) + (IH_SIZE +
-					     ih_item_len(inserted_item_ih)));
-		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
-	}
-}
-
-/*
- * paste paste_size bytes to affected_item_num-th item.
- * When item is a directory, this only prepare space for new entries
- */
-void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
-			  int pos_in_item, int paste_size,
-			  const char *body, int zeros_number)
-{
-	struct buffer_head *bh = bi->bi_bh;
-	int nr, free_space;
-	struct block_head *blkh;
-	struct item_head *ih;
-	int i;
-	int last_loc, unmoved_loc;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-	free_space = blkh_free_space(blkh);
-
-	/* check free space */
-	RFALSE(free_space < paste_size,
-	       "vs-10175: not enough free space: needed %d, available %d",
-	       paste_size, free_space);
-
-#ifdef CONFIG_REISERFS_CHECK
-	if (zeros_number > paste_size) {
-		struct super_block *sb = NULL;
-		if (bi && bi->tb)
-			sb = bi->tb->tb_sb;
-		print_cur_tb("10177");
-		reiserfs_panic(sb, "vs-10177",
-			       "zeros_number == %d, paste_size == %d",
-			       zeros_number, paste_size);
-	}
-#endif				/* CONFIG_REISERFS_CHECK */
-
-	/* item to be appended */
-	ih = item_head(bh, affected_item_num);
-
-	last_loc = ih_location(&ih[nr - affected_item_num - 1]);
-	unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
-
-	/* prepare space */
-	memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
-		unmoved_loc - last_loc);
-
-	/* change locations */
-	for (i = affected_item_num; i < nr; i++)
-		put_ih_location(&ih[i - affected_item_num],
-				ih_location(&ih[i - affected_item_num]) -
-				paste_size);
-
-	if (body) {
-		if (!is_direntry_le_ih(ih)) {
-			if (!pos_in_item) {
-				/* shift data to right */
-				memmove(bh->b_data + ih_location(ih) +
-					paste_size,
-					bh->b_data + ih_location(ih),
-					ih_item_len(ih));
-				/* paste data in the head of item */
-				memset(bh->b_data + ih_location(ih), 0,
-				       zeros_number);
-				memcpy(bh->b_data + ih_location(ih) +
-				       zeros_number, body,
-				       paste_size - zeros_number);
-			} else {
-				memset(bh->b_data + unmoved_loc - paste_size, 0,
-				       zeros_number);
-				memcpy(bh->b_data + unmoved_loc - paste_size +
-				       zeros_number, body,
-				       paste_size - zeros_number);
-			}
-		}
-	} else
-		memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
-
-	put_ih_item_len(ih, ih_item_len(ih) + paste_size);
-
-	/* change free space */
-	set_blkh_free_space(blkh, free_space - paste_size);
-
-	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
-	if (bi->bi_parent) {
-		struct disk_child *t_dc =
-		    B_N_CHILD(bi->bi_parent, bi->bi_position);
-		put_dc_size(t_dc, dc_size(t_dc) + paste_size);
-		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
-	}
-}
-
-/*
- * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
- * does not have free space, so it moves DEHs and remaining records as
- * necessary. Return value is size of removed part of directory item
- * in bytes.
- */
-static int leaf_cut_entries(struct buffer_head *bh,
-			    struct item_head *ih, int from, int del_count)
-{
-	char *item;
-	struct reiserfs_de_head *deh;
-	int prev_record_offset;	/* offset of record, that is (from-1)th */
-	char *prev_record;	/* */
-	int cut_records_len;	/* length of all removed records */
-	int i;
-
-	/*
-	 * make sure that item is directory and there are enough entries to
-	 * remove
-	 */
-	RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
-	RFALSE(ih_entry_count(ih) < from + del_count,
-	       "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
-	       ih_entry_count(ih), from, del_count);
-
-	if (del_count == 0)
-		return 0;
-
-	/* first byte of item */
-	item = bh->b_data + ih_location(ih);
-
-	/* entry head array */
-	deh = B_I_DEH(bh, ih);
-
-	/*
-	 * first byte of remaining entries, those are BEFORE cut entries
-	 * (prev_record) and length of all removed records (cut_records_len)
-	 */
-	prev_record_offset =
-	    (from ? deh_location(&deh[from - 1]) : ih_item_len(ih));
-	cut_records_len = prev_record_offset /*from_record */  -
-	    deh_location(&deh[from + del_count - 1]);
-	prev_record = item + prev_record_offset;
-
-	/* adjust locations of remaining entries */
-	for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--)
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) -
-				 (DEH_SIZE * del_count));
-
-	for (i = 0; i < from; i++)
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) - (DEH_SIZE * del_count +
-							  cut_records_len));
-
-	put_ih_entry_count(ih, ih_entry_count(ih) - del_count);
-
-	/* shift entry head array and entries those are AFTER removed entries */
-	memmove((char *)(deh + from),
-		deh + from + del_count,
-		prev_record - cut_records_len - (char *)(deh + from +
-							 del_count));
-
-	/* shift records, those are BEFORE removed entries */
-	memmove(prev_record - cut_records_len - DEH_SIZE * del_count,
-		prev_record, item + ih_item_len(ih) - prev_record);
-
-	return DEH_SIZE * del_count + cut_records_len;
-}
-
-/*
- * when cut item is part of regular file
- *      pos_in_item - first byte that must be cut
- *      cut_size - number of bytes to be cut beginning from pos_in_item
- *
- * when cut item is part of directory
- *      pos_in_item - number of first deleted entry
- *      cut_size - count of deleted entries
- */
-void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
-			  int pos_in_item, int cut_size)
-{
-	int nr;
-	struct buffer_head *bh = bi->bi_bh;
-	struct block_head *blkh;
-	struct item_head *ih;
-	int last_loc, unmoved_loc;
-	int i;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-
-	/* item head of truncated item */
-	ih = item_head(bh, cut_item_num);
-
-	if (is_direntry_le_ih(ih)) {
-		/* first cut entry () */
-		cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size);
-		if (pos_in_item == 0) {
-			/* change key */
-			RFALSE(cut_item_num,
-			       "when 0-th enrty of item is cut, that item must be first in the node, not %d-th",
-			       cut_item_num);
-			/* change item key by key of first entry in the item */
-			set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
-		}
-	} else {
-		/* item is direct or indirect */
-		RFALSE(is_statdata_le_ih(ih), "10195: item is stat data");
-		RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
-		       "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
-		       (long unsigned)pos_in_item, (long unsigned)cut_size,
-		       (long unsigned)ih_item_len(ih));
-
-		/* shift item body to left if cut is from the head of item */
-		if (pos_in_item == 0) {
-			memmove(bh->b_data + ih_location(ih),
-				bh->b_data + ih_location(ih) + cut_size,
-				ih_item_len(ih) - cut_size);
-
-			/* change key of item */
-			if (is_direct_le_ih(ih))
-				set_le_ih_k_offset(ih,
-						   le_ih_k_offset(ih) +
-						   cut_size);
-			else {
-				set_le_ih_k_offset(ih,
-						   le_ih_k_offset(ih) +
-						   (cut_size / UNFM_P_SIZE) *
-						   bh->b_size);
-				RFALSE(ih_item_len(ih) == cut_size
-				       && get_ih_free_space(ih),
-				       "10205: invalid ih_free_space (%h)", ih);
-			}
-		}
-	}
-
-	/* location of the last item */
-	last_loc = ih_location(&ih[nr - cut_item_num - 1]);
-
-	/* location of the item, which is remaining at the same place */
-	unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
-
-	/* shift */
-	memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
-		unmoved_loc - last_loc - cut_size);
-
-	/* change item length */
-	put_ih_item_len(ih, ih_item_len(ih) - cut_size);
-
-	if (is_indirect_le_ih(ih)) {
-		if (pos_in_item)
-			set_ih_free_space(ih, 0);
-	}
-
-	/* change locations */
-	for (i = cut_item_num; i < nr; i++)
-		put_ih_location(&ih[i - cut_item_num],
-				ih_location(&ih[i - cut_item_num]) + cut_size);
-
-	/* size, free space */
-	set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size);
-
-	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
-	if (bi->bi_parent) {
-		struct disk_child *t_dc;
-		t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
-		put_dc_size(t_dc, dc_size(t_dc) - cut_size);
-		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
-	}
-}
-
-/* delete del_num items from buffer starting from the first'th item */
-static void leaf_delete_items_entirely(struct buffer_info *bi,
-				       int first, int del_num)
-{
-	struct buffer_head *bh = bi->bi_bh;
-	int nr;
-	int i, j;
-	int last_loc, last_removed_loc;
-	struct block_head *blkh;
-	struct item_head *ih;
-
-	RFALSE(bh == NULL, "10210: buffer is 0");
-	RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num);
-
-	if (del_num == 0)
-		return;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-
-	RFALSE(first < 0 || first + del_num > nr,
-	       "10220: first=%d, number=%d, there is %d items", first, del_num,
-	       nr);
-
-	if (first == 0 && del_num == nr) {
-		/* this does not work */
-		make_empty_node(bi);
-
-		do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-		return;
-	}
-
-	ih = item_head(bh, first);
-
-	/* location of unmovable item */
-	j = (first == 0) ? bh->b_size : ih_location(ih - 1);
-
-	/* delete items */
-	last_loc = ih_location(&ih[nr - 1 - first]);
-	last_removed_loc = ih_location(&ih[del_num - 1]);
-
-	memmove(bh->b_data + last_loc + j - last_removed_loc,
-		bh->b_data + last_loc, last_removed_loc - last_loc);
-
-	/* delete item headers */
-	memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
-
-	/* change item location */
-	for (i = first; i < nr - del_num; i++)
-		put_ih_location(&ih[i - first],
-				ih_location(&ih[i - first]) + (j -
-								 last_removed_loc));
-
-	/* sizes, item number */
-	set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
-	set_blkh_free_space(blkh,
-			    blkh_free_space(blkh) + (j - last_removed_loc +
-						     IH_SIZE * del_num));
-
-	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
-	if (bi->bi_parent) {
-		struct disk_child *t_dc =
-		    B_N_CHILD(bi->bi_parent, bi->bi_position);
-		put_dc_size(t_dc,
-			    dc_size(t_dc) - (j - last_removed_loc +
-					     IH_SIZE * del_num));
-		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
-	}
-}
-
-/*
- * paste new_entry_count entries (new_dehs, records) into position
- * before to item_num-th item
- */
-void leaf_paste_entries(struct buffer_info *bi,
-			int item_num,
-			int before,
-			int new_entry_count,
-			struct reiserfs_de_head *new_dehs,
-			const char *records, int paste_size)
-{
-	struct item_head *ih;
-	char *item;
-	struct reiserfs_de_head *deh;
-	char *insert_point;
-	int i;
-	struct buffer_head *bh = bi->bi_bh;
-
-	if (new_entry_count == 0)
-		return;
-
-	ih = item_head(bh, item_num);
-
-	/*
-	 * make sure, that item is directory, and there are enough
-	 * records in it
-	 */
-	RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
-	RFALSE(ih_entry_count(ih) < before,
-	       "10230: there are no entry we paste entries before. entry_count = %d, before = %d",
-	       ih_entry_count(ih), before);
-
-	/* first byte of dest item */
-	item = bh->b_data + ih_location(ih);
-
-	/* entry head array */
-	deh = B_I_DEH(bh, ih);
-
-	/* new records will be pasted at this point */
-	insert_point =
-	    item +
-	    (before ? deh_location(&deh[before - 1])
-	     : (ih_item_len(ih) - paste_size));
-
-	/* adjust locations of records that will be AFTER new records */
-	for (i = ih_entry_count(ih) - 1; i >= before; i--)
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) +
-				 (DEH_SIZE * new_entry_count));
-
-	/* adjust locations of records that will be BEFORE new records */
-	for (i = 0; i < before; i++)
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) + paste_size);
-
-	put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
-
-	/* prepare space for pasted records */
-	memmove(insert_point + paste_size, insert_point,
-		item + (ih_item_len(ih) - paste_size) - insert_point);
-
-	/* copy new records */
-	memcpy(insert_point + DEH_SIZE * new_entry_count, records,
-	       paste_size - DEH_SIZE * new_entry_count);
-
-	/* prepare space for new entry heads */
-	deh += before;
-	memmove((char *)(deh + new_entry_count), deh,
-		insert_point - (char *)deh);
-
-	/* copy new entry heads */
-	deh = (struct reiserfs_de_head *)((char *)deh);
-	memcpy(deh, new_dehs, DEH_SIZE * new_entry_count);
-
-	/* set locations of new records */
-	for (i = 0; i < new_entry_count; i++) {
-		put_deh_location(&deh[i],
-				 deh_location(&deh[i]) +
-				 (-deh_location
-				  (&new_dehs[new_entry_count - 1]) +
-				  insert_point + DEH_SIZE * new_entry_count -
-				  item));
-	}
-
-	/* change item key if necessary (when we paste before 0-th entry */
-	if (!before) {
-		set_le_ih_k_offset(ih, deh_offset(new_dehs));
-	}
-#ifdef CONFIG_REISERFS_CHECK
-	{
-		int prev, next;
-		/* check record locations */
-		deh = B_I_DEH(bh, ih);
-		for (i = 0; i < ih_entry_count(ih); i++) {
-			next =
-			    (i <
-			     ih_entry_count(ih) -
-			     1) ? deh_location(&deh[i + 1]) : 0;
-			prev = (i != 0) ? deh_location(&deh[i - 1]) : 0;
-
-			if (prev && prev <= deh_location(&deh[i]))
-				reiserfs_error(sb_from_bi(bi), "vs-10240",
-					       "directory item (%h) "
-					       "corrupted (prev %a, "
-					       "cur(%d) %a)",
-					       ih, deh + i - 1, i, deh + i);
-			if (next && next >= deh_location(&deh[i]))
-				reiserfs_error(sb_from_bi(bi), "vs-10250",
-					       "directory item (%h) "
-					       "corrupted (cur(%d) %a, "
-					       "next %a)",
-					       ih, i, deh + i, deh + i + 1);
-		}
-	}
-#endif
-
-}
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
deleted file mode 100644
index 46bd7bd63a71..000000000000
--- a/fs/reiserfs/lock.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/mutex.h>
-
-/*
- * The previous reiserfs locking scheme was heavily based on
- * the tricky properties of the Bkl:
- *
- * - it was acquired recursively by a same task
- * - the performances relied on the release-while-schedule() property
- *
- * Now that we replace it by a mutex, we still want to keep the same
- * recursive property to avoid big changes in the code structure.
- * We use our own lock_owner here because the owner field on a mutex
- * is only available in SMP or mutex debugging, also we only need this field
- * for this mutex, no need for a system wide mutex facility.
- *
- * Also this lock is often released before a call that could block because
- * reiserfs performances were partially based on the release while schedule()
- * property of the Bkl.
- */
-void reiserfs_write_lock(struct super_block *s)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
-	if (sb_i->lock_owner != current) {
-		mutex_lock(&sb_i->lock);
-		sb_i->lock_owner = current;
-	}
-
-	/* No need to protect it, only the current task touches it */
-	sb_i->lock_depth++;
-}
-
-void reiserfs_write_unlock(struct super_block *s)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
-	/*
-	 * Are we unlocking without even holding the lock?
-	 * Such a situation must raise a BUG() if we don't want
-	 * to corrupt the data.
-	 */
-	BUG_ON(sb_i->lock_owner != current);
-
-	if (--sb_i->lock_depth == -1) {
-		sb_i->lock_owner = NULL;
-		mutex_unlock(&sb_i->lock);
-	}
-}
-
-int __must_check reiserfs_write_unlock_nested(struct super_block *s)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-	int depth;
-
-	/* this can happen when the lock isn't always held */
-	if (sb_i->lock_owner != current)
-		return -1;
-
-	depth = sb_i->lock_depth;
-
-	sb_i->lock_depth = -1;
-	sb_i->lock_owner = NULL;
-	mutex_unlock(&sb_i->lock);
-
-	return depth;
-}
-
-void reiserfs_write_lock_nested(struct super_block *s, int depth)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
-	/* this can happen when the lock isn't always held */
-	if (depth == -1)
-		return;
-
-	mutex_lock(&sb_i->lock);
-	sb_i->lock_owner = current;
-	sb_i->lock_depth = depth;
-}
-
-/*
- * Utility function to force a BUG if it is called without the superblock
- * write lock held.  caller is the string printed just before calling BUG()
- */
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
-
-	WARN_ON(sb_i->lock_depth < 0);
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-void reiserfs_lock_check_recursive(struct super_block *sb)
-{
-	struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
-
-	WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
-}
-#endif
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
deleted file mode 100644
index 7e7b531fcc49..000000000000
--- a/fs/reiserfs/namei.c
+++ /dev/null
@@ -1,1725 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- *
- * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility
- *
- * Trivial Changes:
- * Rights granted to Hans Reiser to redistribute under other terms providing
- * he accepts all liability including but not limited to patent, fitness
- * for purpose, and direct or indirect claims arising from failure to perform.
- *
- * NO WARRANTY
- */
-
-#include <linux/time.h>
-#include <linux/bitops.h>
-#include <linux/slab.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/quotaops.h>
-
-#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
-#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
-
-/*
- * directory item contains array of entry headers. This performs
- * binary search through that array
- */
-static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
-{
-	struct item_head *ih = de->de_ih;
-	struct reiserfs_de_head *deh = de->de_deh;
-	int rbound, lbound, j;
-
-	lbound = 0;
-	rbound = ih_entry_count(ih) - 1;
-
-	for (j = (rbound + lbound) / 2; lbound <= rbound;
-	     j = (rbound + lbound) / 2) {
-		if (off < deh_offset(deh + j)) {
-			rbound = j - 1;
-			continue;
-		}
-		if (off > deh_offset(deh + j)) {
-			lbound = j + 1;
-			continue;
-		}
-		/* this is not name found, but matched third key component */
-		de->de_entry_num = j;
-		return NAME_FOUND;
-	}
-
-	de->de_entry_num = lbound;
-	return NAME_NOT_FOUND;
-}
-
-/*
- * comment?  maybe something like set de to point to what the path points to?
- */
-static inline void set_de_item_location(struct reiserfs_dir_entry *de,
-					struct treepath *path)
-{
-	de->de_bh = get_last_bh(path);
-	de->de_ih = tp_item_head(path);
-	de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
-	de->de_item_num = PATH_LAST_POSITION(path);
-}
-
-/*
- * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
- */
-inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
-{
-	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
-
-	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
-
-	de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
-	de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
-	de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh);
-	if (de->de_name[de->de_namelen - 1] == 0)
-		de->de_namelen = strlen(de->de_name);
-}
-
-/* what entry points to */
-static inline void set_de_object_key(struct reiserfs_dir_entry *de)
-{
-	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
-	de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]);
-	de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]);
-}
-
-static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
-{
-	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
-
-	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
-
-	/* store key of the found entry */
-	de->de_entry_key.version = KEY_FORMAT_3_5;
-	de->de_entry_key.on_disk_key.k_dir_id =
-	    le32_to_cpu(de->de_ih->ih_key.k_dir_id);
-	de->de_entry_key.on_disk_key.k_objectid =
-	    le32_to_cpu(de->de_ih->ih_key.k_objectid);
-	set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh));
-	set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY);
-}
-
-/*
- * We assign a key to each directory item, and place multiple entries in a
- * single directory item.  A directory item has a key equal to the key of
- * the first directory entry in it.
-
- * This function first calls search_by_key, then, if item whose first entry
- * matches is not found it looks for the entry inside directory item found
- * by search_by_key. Fills the path to the entry, and to the entry position
- * in the item
- */
-/* The function is NOT SCHEDULE-SAFE! */
-int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
-			struct treepath *path, struct reiserfs_dir_entry *de)
-{
-	int retval;
-
-	retval = search_item(sb, key, path);
-	switch (retval) {
-	case ITEM_NOT_FOUND:
-		if (!PATH_LAST_POSITION(path)) {
-			reiserfs_error(sb, "vs-7000", "search_by_key "
-				       "returned item position == 0");
-			pathrelse(path);
-			return IO_ERROR;
-		}
-		PATH_LAST_POSITION(path)--;
-		break;
-
-	case ITEM_FOUND:
-		break;
-
-	case IO_ERROR:
-		return retval;
-
-	default:
-		pathrelse(path);
-		reiserfs_error(sb, "vs-7002", "no path to here");
-		return IO_ERROR;
-	}
-
-	set_de_item_location(de, path);
-
-#ifdef CONFIG_REISERFS_CHECK
-	if (!is_direntry_le_ih(de->de_ih) ||
-	    COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
-		print_block(de->de_bh, 0, -1, -1);
-		reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
-			       "item or does not belong to the same directory "
-			       "as key %K", de->de_ih, key);
-	}
-#endif				/* CONFIG_REISERFS_CHECK */
-
-	/*
-	 * binary search in directory item by third component of the
-	 * key. sets de->de_entry_num of de
-	 */
-	retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
-	path->pos_in_item = de->de_entry_num;
-	if (retval != NAME_NOT_FOUND) {
-		/*
-		 * ugly, but rename needs de_bh, de_deh, de_name,
-		 * de_namelen, de_objectid set
-		 */
-		set_de_name_and_namelen(de);
-		set_de_object_key(de);
-	}
-	return retval;
-}
-
-/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
-
-/*
- * The third component is hashed, and you can choose from more than
- * one hash function.  Per directory hashes are not yet implemented
- * but are thought about. This function should be moved to hashes.c
- * Jedi, please do so.  -Hans
- */
-static __u32 get_third_component(struct super_block *s,
-				 const char *name, int len)
-{
-	__u32 res;
-
-	if (!len || (len == 1 && name[0] == '.'))
-		return DOT_OFFSET;
-	if (len == 2 && name[0] == '.' && name[1] == '.')
-		return DOT_DOT_OFFSET;
-
-	res = REISERFS_SB(s)->s_hash_function(name, len);
-
-	/* take bits from 7-th to 30-th including both bounds */
-	res = GET_HASH_VALUE(res);
-	if (res == 0)
-		/*
-		 * needed to have no names before "." and ".." those have hash
-		 * value == 0 and generation conters 1 and 2 accordingly
-		 */
-		res = 128;
-	return res + MAX_GENERATION_NUMBER;
-}
-
-static int reiserfs_match(struct reiserfs_dir_entry *de,
-			  const char *name, int namelen)
-{
-	int retval = NAME_NOT_FOUND;
-
-	if ((namelen == de->de_namelen) &&
-	    !memcmp(de->de_name, name, de->de_namelen))
-		retval =
-		    (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND :
-		     NAME_FOUND_INVISIBLE);
-
-	return retval;
-}
-
-/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
-
-/* used when hash collisions exist */
-
-static int linear_search_in_dir_item(struct cpu_key *key,
-				     struct reiserfs_dir_entry *de,
-				     const char *name, int namelen)
-{
-	struct reiserfs_de_head *deh = de->de_deh;
-	int retval;
-	int i;
-
-	i = de->de_entry_num;
-
-	if (i == ih_entry_count(de->de_ih) ||
-	    GET_HASH_VALUE(deh_offset(deh + i)) !=
-	    GET_HASH_VALUE(cpu_key_k_offset(key))) {
-		i--;
-	}
-
-	RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih),
-	       "vs-7010: array of entry headers not found");
-
-	deh += i;
-
-	for (; i >= 0; i--, deh--) {
-		/* hash value does not match, no need to check whole name */
-		if (GET_HASH_VALUE(deh_offset(deh)) !=
-		    GET_HASH_VALUE(cpu_key_k_offset(key))) {
-			return NAME_NOT_FOUND;
-		}
-
-		/* mark that this generation number is used */
-		if (de->de_gen_number_bit_string)
-			set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
-				de->de_gen_number_bit_string);
-
-		/* calculate pointer to name and namelen */
-		de->de_entry_num = i;
-		set_de_name_and_namelen(de);
-
-		/*
-		 * de's de_name, de_namelen, de_recordlen are set.
-		 * Fill the rest.
-		 */
-		if ((retval =
-		     reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
-
-			/* key of pointed object */
-			set_de_object_key(de);
-
-			store_de_entry_key(de);
-
-			/* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */
-			return retval;
-		}
-	}
-
-	if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
-		/*
-		 * we have reached left most entry in the node. In common we
-		 * have to go to the left neighbor, but if generation counter
-		 * is 0 already, we know for sure, that there is no name with
-		 * the same hash value
-		 */
-		/*
-		 * FIXME: this work correctly only because hash value can not
-		 *  be 0. Btw, in case of Yura's hash it is probably possible,
-		 * so, this is a bug
-		 */
-		return NAME_NOT_FOUND;
-
-	RFALSE(de->de_item_num,
-	       "vs-7015: two diritems of the same directory in one node?");
-
-	return GOTO_PREVIOUS_ITEM;
-}
-
-/*
- * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
- * FIXME: should add something like IOERROR
- */
-static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
-			       struct treepath *path_to_entry,
-			       struct reiserfs_dir_entry *de)
-{
-	struct cpu_key key_to_search;
-	int retval;
-
-	if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
-		return NAME_NOT_FOUND;
-
-	/* we will search for this key in the tree */
-	make_cpu_key(&key_to_search, dir,
-		     get_third_component(dir->i_sb, name, namelen),
-		     TYPE_DIRENTRY, 3);
-
-	while (1) {
-		retval =
-		    search_by_entry_key(dir->i_sb, &key_to_search,
-					path_to_entry, de);
-		if (retval == IO_ERROR) {
-			reiserfs_error(dir->i_sb, "zam-7001", "io error");
-			return IO_ERROR;
-		}
-
-		/* compare names for all entries having given hash value */
-		retval =
-		    linear_search_in_dir_item(&key_to_search, de, name,
-					      namelen);
-		/*
-		 * there is no need to scan directory anymore.
-		 * Given entry found or does not exist
-		 */
-		if (retval != GOTO_PREVIOUS_ITEM) {
-			path_to_entry->pos_in_item = de->de_entry_num;
-			return retval;
-		}
-
-		/*
-		 * there is left neighboring item of this directory
-		 * and given entry can be there
-		 */
-		set_cpu_key_k_offset(&key_to_search,
-				     le_ih_k_offset(de->de_ih) - 1);
-		pathrelse(path_to_entry);
-
-	}			/* while (1) */
-}
-
-static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
-				      unsigned int flags)
-{
-	int retval;
-	struct inode *inode = NULL;
-	struct reiserfs_dir_entry de;
-	INITIALIZE_PATH(path_to_entry);
-
-	if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
-		return ERR_PTR(-ENAMETOOLONG);
-
-	reiserfs_write_lock(dir->i_sb);
-
-	de.de_gen_number_bit_string = NULL;
-	retval =
-	    reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
-				&path_to_entry, &de);
-	pathrelse(&path_to_entry);
-	if (retval == NAME_FOUND) {
-		inode = reiserfs_iget(dir->i_sb,
-				      (struct cpu_key *)&de.de_dir_id);
-		if (!inode || IS_ERR(inode)) {
-			reiserfs_write_unlock(dir->i_sb);
-			return ERR_PTR(-EACCES);
-		}
-
-		/*
-		 * Propagate the private flag so we know we're
-		 * in the priv tree.  Also clear xattr support
-		 * since we don't have xattrs on xattr files.
-		 */
-		if (IS_PRIVATE(dir))
-			reiserfs_init_priv_inode(inode);
-	}
-	reiserfs_write_unlock(dir->i_sb);
-	if (retval == IO_ERROR) {
-		return ERR_PTR(-EIO);
-	}
-
-	return d_splice_alias(inode, dentry);
-}
-
-/*
- * looks up the dentry of the parent directory for child.
- * taken from ext2_get_parent
- */
-struct dentry *reiserfs_get_parent(struct dentry *child)
-{
-	int retval;
-	struct inode *inode = NULL;
-	struct reiserfs_dir_entry de;
-	INITIALIZE_PATH(path_to_entry);
-	struct inode *dir = d_inode(child);
-
-	if (dir->i_nlink == 0) {
-		return ERR_PTR(-ENOENT);
-	}
-	de.de_gen_number_bit_string = NULL;
-
-	reiserfs_write_lock(dir->i_sb);
-	retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de);
-	pathrelse(&path_to_entry);
-	if (retval != NAME_FOUND) {
-		reiserfs_write_unlock(dir->i_sb);
-		return ERR_PTR(-ENOENT);
-	}
-	inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id);
-	reiserfs_write_unlock(dir->i_sb);
-
-	return d_obtain_alias(inode);
-}
-
-/* add entry to the directory (entry can be hidden).
-
-insert definition of when hidden directories are used here -Hans
-
- Does not mark dir   inode dirty, do it after successesfull call to it */
-
-static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
-			      struct inode *dir, const char *name, int namelen,
-			      struct inode *inode, int visible)
-{
-	struct cpu_key entry_key;
-	struct reiserfs_de_head *deh;
-	INITIALIZE_PATH(path);
-	struct reiserfs_dir_entry de;
-	DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
-	int gen_number;
-
-	/*
-	 * 48 bytes now and we avoid kmalloc if we
-	 * create file with short name
-	 */
-	char small_buf[32 + DEH_SIZE];
-
-	char *buffer;
-	int buflen, paste_size;
-	int retval;
-
-	BUG_ON(!th->t_trans_id);
-
-	/* each entry has unique key. compose it */
-	make_cpu_key(&entry_key, dir,
-		     get_third_component(dir->i_sb, name, namelen),
-		     TYPE_DIRENTRY, 3);
-
-	/* get memory for composing the entry */
-	buflen = DEH_SIZE + ROUND_UP(namelen);
-	if (buflen > sizeof(small_buf)) {
-		buffer = kmalloc(buflen, GFP_NOFS);
-		if (!buffer)
-			return -ENOMEM;
-	} else
-		buffer = small_buf;
-
-	paste_size =
-	    (get_inode_sd_version(dir) ==
-	     STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
-
-	/*
-	 * fill buffer : directory entry head, name[, dir objectid | ,
-	 * stat data | ,stat data, dir objectid ]
-	 */
-	deh = (struct reiserfs_de_head *)buffer;
-	deh->deh_location = 0;	/* JDM Endian safe if 0 */
-	put_deh_offset(deh, cpu_key_k_offset(&entry_key));
-	deh->deh_state = 0;	/* JDM Endian safe if 0 */
-	/* put key (ino analog) to de */
-
-	/* safe: k_dir_id is le */
-	deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;
-	/* safe: k_objectid is le */
-	deh->deh_objectid = INODE_PKEY(inode)->k_objectid;
-
-	/* copy name */
-	memcpy((char *)(deh + 1), name, namelen);
-	/* padd by 0s to the 4 byte boundary */
-	padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
-
-	/*
-	 * entry is ready to be pasted into tree, set 'visibility'
-	 * and 'stat data in entry' attributes
-	 */
-	mark_de_without_sd(deh);
-	visible ? mark_de_visible(deh) : mark_de_hidden(deh);
-
-	/* find the proper place for the new entry */
-	memset(bit_string, 0, sizeof(bit_string));
-	de.de_gen_number_bit_string = bit_string;
-	retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
-	if (retval != NAME_NOT_FOUND) {
-		if (buffer != small_buf)
-			kfree(buffer);
-		pathrelse(&path);
-
-		if (retval == IO_ERROR) {
-			return -EIO;
-		}
-
-		if (retval != NAME_FOUND) {
-			reiserfs_error(dir->i_sb, "zam-7002",
-				       "reiserfs_find_entry() returned "
-				       "unexpected value (%d)", retval);
-		}
-
-		return -EEXIST;
-	}
-
-	gen_number =
-	    find_first_zero_bit(bit_string,
-				MAX_GENERATION_NUMBER + 1);
-	if (gen_number > MAX_GENERATION_NUMBER) {
-		/* there is no free generation number */
-		reiserfs_warning(dir->i_sb, "reiserfs-7010",
-				 "Congratulations! we have got hash function "
-				 "screwed up");
-		if (buffer != small_buf)
-			kfree(buffer);
-		pathrelse(&path);
-		return -EBUSY;
-	}
-	/* adjust offset of directory enrty */
-	put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
-	set_cpu_key_k_offset(&entry_key, deh_offset(deh));
-
-	/* update max-hash-collisions counter in reiserfs_sb_info */
-	PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
-
-	/* we need to re-search for the insertion point */
-	if (gen_number != 0) {
-		if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
-		    NAME_NOT_FOUND) {
-			reiserfs_warning(dir->i_sb, "vs-7032",
-					 "entry with this key (%K) already "
-					 "exists", &entry_key);
-
-			if (buffer != small_buf)
-				kfree(buffer);
-			pathrelse(&path);
-			return -EBUSY;
-		}
-	}
-
-	/* perform the insertion of the entry that we have prepared */
-	retval =
-	    reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
-				     paste_size);
-	if (buffer != small_buf)
-		kfree(buffer);
-	if (retval) {
-		reiserfs_check_path(&path);
-		return retval;
-	}
-
-	dir->i_size += paste_size;
-	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
-	if (!S_ISDIR(inode->i_mode) && visible)
-		/* reiserfs_mkdir or reiserfs_rename will do that by itself */
-		reiserfs_update_sd(th, dir);
-
-	reiserfs_check_path(&path);
-	return 0;
-}
-
-/*
- * quota utility function, call if you've had to abort after calling
- * new_inode_init, and have not called reiserfs_new_inode yet.
- * This should only be called on inodes that do not have stat data
- * inserted into the tree yet.
- */
-static int drop_new_inode(struct inode *inode)
-{
-	dquot_drop(inode);
-	make_bad_inode(inode);
-	inode->i_flags |= S_NOQUOTA;
-	iput(inode);
-	return 0;
-}
-
-/*
- * utility function that does setup for reiserfs_new_inode.
- * dquot_initialize needs lots of credits so it's better to have it
- * outside of a transaction, so we had to pull some bits of
- * reiserfs_new_inode out into this func.
- */
-static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
-{
-	/*
-	 * Make inode invalid - just in case we are going to drop it before
-	 * the initialization happens
-	 */
-	INODE_PKEY(inode)->k_objectid = 0;
-
-	/*
-	 * the quota init calls have to know who to charge the quota to, so
-	 * we have to set uid and gid here
-	 */
-	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
-	return dquot_initialize(inode);
-}
-
-static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir,
-			   struct dentry *dentry, umode_t mode, bool excl)
-{
-	int retval;
-	struct inode *inode;
-	/*
-	 * We need blocks for transaction + (user+group)*(quotas
-	 * for new inode + update of quota for directory owner)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 2 +
-	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
-		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_security_handle security;
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	if (!(inode = new_inode(dir->i_sb))) {
-		return -ENOMEM;
-	}
-	retval = new_inode_init(inode, dir, mode);
-	if (retval) {
-		drop_new_inode(inode);
-		return retval;
-	}
-
-	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
-	if (retval < 0) {
-		drop_new_inode(inode);
-		return retval;
-	}
-	jbegin_count += retval;
-	reiserfs_write_lock(dir->i_sb);
-
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_new_inode(inode);
-		goto out_failed;
-	}
-
-	retval =
-	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
-			       inode, &security);
-	if (retval)
-		goto out_failed;
-
-	inode->i_op = &reiserfs_file_inode_operations;
-	inode->i_fop = &reiserfs_file_operations;
-	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-
-	retval =
-	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
-			       dentry->d_name.len, inode, 1 /*visible */ );
-	if (retval) {
-		int err;
-		drop_nlink(inode);
-		reiserfs_update_sd(&th, inode);
-		err = journal_end(&th);
-		if (err)
-			retval = err;
-		unlock_new_inode(inode);
-		iput(inode);
-		goto out_failed;
-	}
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	d_instantiate_new(dentry, inode);
-	retval = journal_end(&th);
-
-out_failed:
-	reiserfs_write_unlock(dir->i_sb);
-	reiserfs_security_free(&security);
-	return retval;
-}
-
-static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
-			  struct dentry *dentry, umode_t mode, dev_t rdev)
-{
-	int retval;
-	struct inode *inode;
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_security_handle security;
-	/*
-	 * We need blocks for transaction + (user+group)*(quotas
-	 * for new inode + update of quota for directory owner)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 +
-	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
-		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	if (!(inode = new_inode(dir->i_sb))) {
-		return -ENOMEM;
-	}
-	retval = new_inode_init(inode, dir, mode);
-	if (retval) {
-		drop_new_inode(inode);
-		return retval;
-	}
-
-	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
-	if (retval < 0) {
-		drop_new_inode(inode);
-		return retval;
-	}
-	jbegin_count += retval;
-	reiserfs_write_lock(dir->i_sb);
-
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_new_inode(inode);
-		goto out_failed;
-	}
-
-	retval =
-	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
-			       inode, &security);
-	if (retval) {
-		goto out_failed;
-	}
-
-	inode->i_op = &reiserfs_special_inode_operations;
-	init_special_inode(inode, inode->i_mode, rdev);
-
-	/* FIXME: needed for block and char devices only */
-	reiserfs_update_sd(&th, inode);
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	retval =
-	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
-			       dentry->d_name.len, inode, 1 /*visible */ );
-	if (retval) {
-		int err;
-		drop_nlink(inode);
-		reiserfs_update_sd(&th, inode);
-		err = journal_end(&th);
-		if (err)
-			retval = err;
-		unlock_new_inode(inode);
-		iput(inode);
-		goto out_failed;
-	}
-
-	d_instantiate_new(dentry, inode);
-	retval = journal_end(&th);
-
-out_failed:
-	reiserfs_write_unlock(dir->i_sb);
-	reiserfs_security_free(&security);
-	return retval;
-}
-
-static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-			  struct dentry *dentry, umode_t mode)
-{
-	int retval;
-	struct inode *inode;
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_security_handle security;
-	/*
-	 * We need blocks for transaction + (user+group)*(quotas
-	 * for new inode + update of quota for directory owner)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 +
-	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
-		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	/*
-	 * set flag that new packing locality created and new blocks
-	 * for the content of that directory are not displaced yet
-	 */
-	REISERFS_I(dir)->new_packing_locality = 1;
-#endif
-	mode = S_IFDIR | mode;
-	if (!(inode = new_inode(dir->i_sb))) {
-		return -ENOMEM;
-	}
-	retval = new_inode_init(inode, dir, mode);
-	if (retval) {
-		drop_new_inode(inode);
-		return retval;
-	}
-
-	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
-	if (retval < 0) {
-		drop_new_inode(inode);
-		return retval;
-	}
-	jbegin_count += retval;
-	reiserfs_write_lock(dir->i_sb);
-
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_new_inode(inode);
-		goto out_failed;
-	}
-
-	/*
-	 * inc the link count now, so another writer doesn't overflow
-	 * it while we sleep later on.
-	 */
-	INC_DIR_INODE_NLINK(dir)
-
-	retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */,
-				    old_format_only(dir->i_sb) ?
-				    EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
-				    dentry, inode, &security);
-	if (retval) {
-		DEC_DIR_INODE_NLINK(dir)
-		goto out_failed;
-	}
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	inode->i_op = &reiserfs_dir_inode_operations;
-	inode->i_fop = &reiserfs_dir_operations;
-
-	/* note, _this_ add_entry will not update dir's stat data */
-	retval =
-	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
-			       dentry->d_name.len, inode, 1 /*visible */ );
-	if (retval) {
-		int err;
-		clear_nlink(inode);
-		DEC_DIR_INODE_NLINK(dir);
-		reiserfs_update_sd(&th, inode);
-		err = journal_end(&th);
-		if (err)
-			retval = err;
-		unlock_new_inode(inode);
-		iput(inode);
-		goto out_failed;
-	}
-	/* the above add_entry did not update dir's stat data */
-	reiserfs_update_sd(&th, dir);
-
-	d_instantiate_new(dentry, inode);
-	retval = journal_end(&th);
-out_failed:
-	reiserfs_write_unlock(dir->i_sb);
-	reiserfs_security_free(&security);
-	return retval;
-}
-
-static inline int reiserfs_empty_dir(struct inode *inode)
-{
-	/*
-	 * we can cheat because an old format dir cannot have
-	 * EMPTY_DIR_SIZE, and a new format dir cannot have
-	 * EMPTY_DIR_SIZE_V1.  So, if the inode is either size,
-	 * regardless of disk format version, the directory is empty.
-	 */
-	if (inode->i_size != EMPTY_DIR_SIZE &&
-	    inode->i_size != EMPTY_DIR_SIZE_V1) {
-		return 0;
-	}
-	return 1;
-}
-
-static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	int retval, err;
-	struct inode *inode;
-	struct reiserfs_transaction_handle th;
-	int jbegin_count;
-	INITIALIZE_PATH(path);
-	struct reiserfs_dir_entry de;
-
-	/*
-	 * we will be doing 2 balancings and update 2 stat data, we
-	 * change quotas of the owner of the directory and of the owner
-	 * of the parent directory.  The quota structure is possibly
-	 * deleted only on last iput => outside of this transaction
-	 */
-	jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
-	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	reiserfs_write_lock(dir->i_sb);
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval)
-		goto out_rmdir;
-
-	de.de_gen_number_bit_string = NULL;
-	if ((retval =
-	     reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
-				 &path, &de)) == NAME_NOT_FOUND) {
-		retval = -ENOENT;
-		goto end_rmdir;
-	} else if (retval == IO_ERROR) {
-		retval = -EIO;
-		goto end_rmdir;
-	}
-
-	inode = d_inode(dentry);
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	if (de.de_objectid != inode->i_ino) {
-		/*
-		 * FIXME: compare key of an object and a key found in the entry
-		 */
-		retval = -EIO;
-		goto end_rmdir;
-	}
-	if (!reiserfs_empty_dir(inode)) {
-		retval = -ENOTEMPTY;
-		goto end_rmdir;
-	}
-
-	/* cut entry from dir directory */
-	retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key,
-					dir, NULL,	/* page */
-					0 /*new file size - not used here */ );
-	if (retval < 0)
-		goto end_rmdir;
-
-	if (inode->i_nlink != 2 && inode->i_nlink != 1)
-		reiserfs_error(inode->i_sb, "reiserfs-7040",
-			       "empty directory has nlink != 2 (%d)",
-			       inode->i_nlink);
-
-	clear_nlink(inode);
-	inode_set_mtime_to_ts(dir,
-			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
-	reiserfs_update_sd(&th, inode);
-
-	DEC_DIR_INODE_NLINK(dir)
-	dir->i_size -= (DEH_SIZE + de.de_entrylen);
-	reiserfs_update_sd(&th, dir);
-
-	/* prevent empty directory from getting lost */
-	add_save_link(&th, inode, 0 /* not truncate */ );
-
-	retval = journal_end(&th);
-	reiserfs_check_path(&path);
-out_rmdir:
-	reiserfs_write_unlock(dir->i_sb);
-	return retval;
-
-end_rmdir:
-	/*
-	 * we must release path, because we did not call
-	 * reiserfs_cut_from_item, or reiserfs_cut_from_item does not
-	 * release path if operation was not complete
-	 */
-	pathrelse(&path);
-	err = journal_end(&th);
-	reiserfs_write_unlock(dir->i_sb);
-	return err ? err : retval;
-}
-
-static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int retval, err;
-	struct inode *inode;
-	struct reiserfs_dir_entry de;
-	INITIALIZE_PATH(path);
-	struct reiserfs_transaction_handle th;
-	int jbegin_count;
-	unsigned long savelink;
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	inode = d_inode(dentry);
-
-	/*
-	 * in this transaction we can be doing at max two balancings and
-	 * update two stat datas, we change quotas of the owner of the
-	 * directory and of the owner of the parent directory. The quota
-	 * structure is possibly deleted only on iput => outside of
-	 * this transaction
-	 */
-	jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
-	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-	reiserfs_write_lock(dir->i_sb);
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval)
-		goto out_unlink;
-
-	de.de_gen_number_bit_string = NULL;
-	if ((retval =
-	     reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
-				 &path, &de)) == NAME_NOT_FOUND) {
-		retval = -ENOENT;
-		goto end_unlink;
-	} else if (retval == IO_ERROR) {
-		retval = -EIO;
-		goto end_unlink;
-	}
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	if (de.de_objectid != inode->i_ino) {
-		/*
-		 * FIXME: compare key of an object and a key found in the entry
-		 */
-		retval = -EIO;
-		goto end_unlink;
-	}
-
-	if (!inode->i_nlink) {
-		reiserfs_warning(inode->i_sb, "reiserfs-7042",
-				 "deleting nonexistent file (%lu), %d",
-				 inode->i_ino, inode->i_nlink);
-		set_nlink(inode, 1);
-	}
-
-	drop_nlink(inode);
-
-	/*
-	 * we schedule before doing the add_save_link call, save the link
-	 * count so we don't race
-	 */
-	savelink = inode->i_nlink;
-
-	retval =
-	    reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL,
-				   0);
-	if (retval < 0) {
-		inc_nlink(inode);
-		goto end_unlink;
-	}
-	inode_set_ctime_current(inode);
-	reiserfs_update_sd(&th, inode);
-
-	dir->i_size -= (de.de_entrylen + DEH_SIZE);
-	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
-	reiserfs_update_sd(&th, dir);
-
-	if (!savelink)
-		/* prevent file from getting lost */
-		add_save_link(&th, inode, 0 /* not truncate */ );
-
-	retval = journal_end(&th);
-	reiserfs_check_path(&path);
-	reiserfs_write_unlock(dir->i_sb);
-	return retval;
-
-end_unlink:
-	pathrelse(&path);
-	err = journal_end(&th);
-	reiserfs_check_path(&path);
-	if (err)
-		retval = err;
-out_unlink:
-	reiserfs_write_unlock(dir->i_sb);
-	return retval;
-}
-
-static int reiserfs_symlink(struct mnt_idmap *idmap,
-			    struct inode *parent_dir, struct dentry *dentry,
-			    const char *symname)
-{
-	int retval;
-	struct inode *inode;
-	char *name;
-	int item_len;
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_security_handle security;
-	int mode = S_IFLNK | S_IRWXUGO;
-	/*
-	 * We need blocks for transaction + (user+group)*(quotas for
-	 * new inode + update of quota for directory owner)
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 +
-	    2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
-		 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
-
-	retval = dquot_initialize(parent_dir);
-	if (retval)
-		return retval;
-
-	if (!(inode = new_inode(parent_dir->i_sb))) {
-		return -ENOMEM;
-	}
-	retval = new_inode_init(inode, parent_dir, mode);
-	if (retval) {
-		drop_new_inode(inode);
-		return retval;
-	}
-
-	retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
-					&security);
-	if (retval < 0) {
-		drop_new_inode(inode);
-		return retval;
-	}
-	jbegin_count += retval;
-
-	reiserfs_write_lock(parent_dir->i_sb);
-	item_len = ROUND_UP(strlen(symname));
-	if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
-		retval = -ENAMETOOLONG;
-		drop_new_inode(inode);
-		goto out_failed;
-	}
-
-	name = kmalloc(item_len, GFP_NOFS);
-	if (!name) {
-		drop_new_inode(inode);
-		retval = -ENOMEM;
-		goto out_failed;
-	}
-	memcpy(name, symname, strlen(symname));
-	padd_item(name, item_len, strlen(symname));
-
-	retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_new_inode(inode);
-		kfree(name);
-		goto out_failed;
-	}
-
-	retval =
-	    reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
-			       dentry, inode, &security);
-	kfree(name);
-	if (retval) {		/* reiserfs_new_inode iputs for us */
-		goto out_failed;
-	}
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(parent_dir);
-
-	inode->i_op = &reiserfs_symlink_inode_operations;
-	inode_nohighmem(inode);
-	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-
-	retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
-				    dentry->d_name.len, inode, 1 /*visible */ );
-	if (retval) {
-		int err;
-		drop_nlink(inode);
-		reiserfs_update_sd(&th, inode);
-		err = journal_end(&th);
-		if (err)
-			retval = err;
-		unlock_new_inode(inode);
-		iput(inode);
-		goto out_failed;
-	}
-
-	d_instantiate_new(dentry, inode);
-	retval = journal_end(&th);
-out_failed:
-	reiserfs_write_unlock(parent_dir->i_sb);
-	reiserfs_security_free(&security);
-	return retval;
-}
-
-static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
-			 struct dentry *dentry)
-{
-	int retval;
-	struct inode *inode = d_inode(old_dentry);
-	struct reiserfs_transaction_handle th;
-	/*
-	 * We need blocks for transaction + update of quotas for
-	 * the owners of the directory
-	 */
-	int jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 +
-	    2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-	retval = dquot_initialize(dir);
-	if (retval)
-		return retval;
-
-	reiserfs_write_lock(dir->i_sb);
-	if (inode->i_nlink >= REISERFS_LINK_MAX) {
-		/* FIXME: sd_nlink is 32 bit for new files */
-		reiserfs_write_unlock(dir->i_sb);
-		return -EMLINK;
-	}
-
-	/* inc before scheduling so reiserfs_unlink knows we are here */
-	inc_nlink(inode);
-
-	retval = journal_begin(&th, dir->i_sb, jbegin_count);
-	if (retval) {
-		drop_nlink(inode);
-		reiserfs_write_unlock(dir->i_sb);
-		return retval;
-	}
-
-	/* create new entry */
-	retval =
-	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
-			       dentry->d_name.len, inode, 1 /*visible */ );
-
-	reiserfs_update_inode_transaction(inode);
-	reiserfs_update_inode_transaction(dir);
-
-	if (retval) {
-		int err;
-		drop_nlink(inode);
-		err = journal_end(&th);
-		reiserfs_write_unlock(dir->i_sb);
-		return err ? err : retval;
-	}
-
-	inode_set_ctime_current(inode);
-	reiserfs_update_sd(&th, inode);
-
-	ihold(inode);
-	d_instantiate(dentry, inode);
-	retval = journal_end(&th);
-	reiserfs_write_unlock(dir->i_sb);
-	return retval;
-}
-
-/* de contains information pointing to an entry which */
-static int de_still_valid(const char *name, int len,
-			  struct reiserfs_dir_entry *de)
-{
-	struct reiserfs_dir_entry tmp = *de;
-
-	/* recalculate pointer to name and name length */
-	set_de_name_and_namelen(&tmp);
-	/* FIXME: could check more */
-	if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
-		return 0;
-	return 1;
-}
-
-static int entry_points_to_object(const char *name, int len,
-				  struct reiserfs_dir_entry *de,
-				  struct inode *inode)
-{
-	if (!de_still_valid(name, len, de))
-		return 0;
-
-	if (inode) {
-		if (!de_visible(de->de_deh + de->de_entry_num))
-			reiserfs_panic(inode->i_sb, "vs-7042",
-				       "entry must be visible");
-		return (de->de_objectid == inode->i_ino) ? 1 : 0;
-	}
-
-	/* this must be added hidden entry */
-	if (de_visible(de->de_deh + de->de_entry_num))
-		reiserfs_panic(NULL, "vs-7043", "entry must be visible");
-
-	return 1;
-}
-
-/* sets key of objectid the entry has to point to */
-static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
-				 struct reiserfs_key *key)
-{
-	/* JDM These operations are endian safe - both are le */
-	de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
-	de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
-}
-
-/*
- * process, that is going to call fix_nodes/do_balance must hold only
- * one path. If it holds 2 or more, it can get into endless waiting in
- * get_empty_nodes or its clones
- */
-static int reiserfs_rename(struct mnt_idmap *idmap,
-			   struct inode *old_dir, struct dentry *old_dentry,
-			   struct inode *new_dir, struct dentry *new_dentry,
-			   unsigned int flags)
-{
-	int retval;
-	INITIALIZE_PATH(old_entry_path);
-	INITIALIZE_PATH(new_entry_path);
-	INITIALIZE_PATH(dot_dot_entry_path);
-	struct item_head new_entry_ih, old_entry_ih, dot_dot_ih;
-	struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
-	struct inode *old_inode, *new_dentry_inode;
-	struct reiserfs_transaction_handle th;
-	int jbegin_count;
-	unsigned long savelink = 1;
-	bool update_dir_parent = false;
-
-	if (flags & ~RENAME_NOREPLACE)
-		return -EINVAL;
-
-	/*
-	 * three balancings: (1) old name removal, (2) new name insertion
-	 * and (3) maybe "save" link insertion
-	 * stat data updates: (1) old directory,
-	 * (2) new directory and (3) maybe old object stat data (when it is
-	 * directory) and (4) maybe stat data of object to which new entry
-	 * pointed initially and (5) maybe block containing ".." of
-	 * renamed directory
-	 * quota updates: two parent directories
-	 */
-	jbegin_count =
-	    JOURNAL_PER_BALANCE_CNT * 3 + 5 +
-	    4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
-
-	retval = dquot_initialize(old_dir);
-	if (retval)
-		return retval;
-	retval = dquot_initialize(new_dir);
-	if (retval)
-		return retval;
-
-	old_inode = d_inode(old_dentry);
-	new_dentry_inode = d_inode(new_dentry);
-
-	/*
-	 * make sure that oldname still exists and points to an object we
-	 * are going to rename
-	 */
-	old_de.de_gen_number_bit_string = NULL;
-	reiserfs_write_lock(old_dir->i_sb);
-	retval =
-	    reiserfs_find_entry(old_dir, old_dentry->d_name.name,
-				old_dentry->d_name.len, &old_entry_path,
-				&old_de);
-	pathrelse(&old_entry_path);
-	if (retval == IO_ERROR) {
-		reiserfs_write_unlock(old_dir->i_sb);
-		return -EIO;
-	}
-
-	if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
-		reiserfs_write_unlock(old_dir->i_sb);
-		return -ENOENT;
-	}
-
-	if (S_ISDIR(old_inode->i_mode)) {
-		/*
-		 * make sure that directory being renamed has correct ".."
-		 * and that its new parent directory has not too many links
-		 * already
-		 */
-		if (new_dentry_inode) {
-			if (!reiserfs_empty_dir(new_dentry_inode)) {
-				reiserfs_write_unlock(old_dir->i_sb);
-				return -ENOTEMPTY;
-			}
-		}
-
-		if (old_dir != new_dir) {
-			/*
-			 * directory is renamed, its parent directory will be
-			 * changed, so find ".." entry
-			 */
-			dot_dot_de.de_gen_number_bit_string = NULL;
-			retval =
-			    reiserfs_find_entry(old_inode, "..", 2,
-					&dot_dot_entry_path,
-					&dot_dot_de);
-			pathrelse(&dot_dot_entry_path);
-			if (retval != NAME_FOUND) {
-				reiserfs_write_unlock(old_dir->i_sb);
-				return -EIO;
-			}
-
-			/* inode number of .. must equal old_dir->i_ino */
-			if (dot_dot_de.de_objectid != old_dir->i_ino) {
-				reiserfs_write_unlock(old_dir->i_sb);
-				return -EIO;
-			}
-			update_dir_parent = true;
-		}
-	}
-
-	retval = journal_begin(&th, old_dir->i_sb, jbegin_count);
-	if (retval) {
-		reiserfs_write_unlock(old_dir->i_sb);
-		return retval;
-	}
-
-	/* add new entry (or find the existing one) */
-	retval =
-	    reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name,
-			       new_dentry->d_name.len, old_inode, 0);
-	if (retval == -EEXIST) {
-		if (!new_dentry_inode) {
-			reiserfs_panic(old_dir->i_sb, "vs-7050",
-				       "new entry is found, new inode == 0");
-		}
-	} else if (retval) {
-		int err = journal_end(&th);
-		reiserfs_write_unlock(old_dir->i_sb);
-		return err ? err : retval;
-	}
-
-	reiserfs_update_inode_transaction(old_dir);
-	reiserfs_update_inode_transaction(new_dir);
-
-	/*
-	 * this makes it so an fsync on an open fd for the old name will
-	 * commit the rename operation
-	 */
-	reiserfs_update_inode_transaction(old_inode);
-
-	if (new_dentry_inode)
-		reiserfs_update_inode_transaction(new_dentry_inode);
-
-	while (1) {
-		/*
-		 * look for old name using corresponding entry key
-		 * (found by reiserfs_find_entry)
-		 */
-		if ((retval =
-		     search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
-					 &old_entry_path,
-					 &old_de)) != NAME_FOUND) {
-			pathrelse(&old_entry_path);
-			journal_end(&th);
-			reiserfs_write_unlock(old_dir->i_sb);
-			return -EIO;
-		}
-
-		copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path));
-
-		reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
-
-		/* look for new name by reiserfs_find_entry */
-		new_de.de_gen_number_bit_string = NULL;
-		retval =
-		    reiserfs_find_entry(new_dir, new_dentry->d_name.name,
-					new_dentry->d_name.len, &new_entry_path,
-					&new_de);
-		/*
-		 * reiserfs_add_entry should not return IO_ERROR,
-		 * because it is called with essentially same parameters from
-		 * reiserfs_add_entry above, and we'll catch any i/o errors
-		 * before we get here.
-		 */
-		if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
-			pathrelse(&new_entry_path);
-			pathrelse(&old_entry_path);
-			journal_end(&th);
-			reiserfs_write_unlock(old_dir->i_sb);
-			return -EIO;
-		}
-
-		copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path));
-
-		reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
-
-		if (update_dir_parent) {
-			if ((retval =
-			     search_by_entry_key(new_dir->i_sb,
-						 &dot_dot_de.de_entry_key,
-						 &dot_dot_entry_path,
-						 &dot_dot_de)) != NAME_FOUND) {
-				pathrelse(&dot_dot_entry_path);
-				pathrelse(&new_entry_path);
-				pathrelse(&old_entry_path);
-				journal_end(&th);
-				reiserfs_write_unlock(old_dir->i_sb);
-				return -EIO;
-			}
-			copy_item_head(&dot_dot_ih,
-				       tp_item_head(&dot_dot_entry_path));
-			/* node containing ".." gets into transaction */
-			reiserfs_prepare_for_journal(old_inode->i_sb,
-						     dot_dot_de.de_bh, 1);
-		}
-		/*
-		 * we should check seals here, not do
-		 * this stuff, yes? Then, having
-		 * gathered everything into RAM we
-		 * should lock the buffers, yes?  -Hans
-		 */
-		/*
-		 * probably.  our rename needs to hold more
-		 * than one path at once.  The seals would
-		 * have to be written to deal with multi-path
-		 * issues -chris
-		 */
-		/*
-		 * sanity checking before doing the rename - avoid races many
-		 * of the above checks could have scheduled.  We have to be
-		 * sure our items haven't been shifted by another process.
-		 */
-		if (item_moved(&new_entry_ih, &new_entry_path) ||
-		    !entry_points_to_object(new_dentry->d_name.name,
-					    new_dentry->d_name.len,
-					    &new_de, new_dentry_inode) ||
-		    item_moved(&old_entry_ih, &old_entry_path) ||
-		    !entry_points_to_object(old_dentry->d_name.name,
-					    old_dentry->d_name.len,
-					    &old_de, old_inode)) {
-			reiserfs_restore_prepared_buffer(old_inode->i_sb,
-							 new_de.de_bh);
-			reiserfs_restore_prepared_buffer(old_inode->i_sb,
-							 old_de.de_bh);
-			if (update_dir_parent)
-				reiserfs_restore_prepared_buffer(old_inode->
-								 i_sb,
-								 dot_dot_de.
-								 de_bh);
-			continue;
-		}
-		if (update_dir_parent) {
-			if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
-			    !entry_points_to_object("..", 2, &dot_dot_de,
-						    old_dir)) {
-				reiserfs_restore_prepared_buffer(old_inode->
-								 i_sb,
-								 old_de.de_bh);
-				reiserfs_restore_prepared_buffer(old_inode->
-								 i_sb,
-								 new_de.de_bh);
-				reiserfs_restore_prepared_buffer(old_inode->
-								 i_sb,
-								 dot_dot_de.
-								 de_bh);
-				continue;
-			}
-		}
-
-		RFALSE(update_dir_parent &&
-		       !buffer_journal_prepared(dot_dot_de.de_bh), "");
-
-		break;
-	}
-
-	/*
-	 * ok, all the changes can be done in one fell swoop when we
-	 * have claimed all the buffers needed.
-	 */
-
-	mark_de_visible(new_de.de_deh + new_de.de_entry_num);
-	set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
-	journal_mark_dirty(&th, new_de.de_bh);
-
-	mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
-	journal_mark_dirty(&th, old_de.de_bh);
-	/*
-	 * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
-	 * which adds ctime update of renamed object
-	 */
-	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
-
-	if (new_dentry_inode) {
-		/* adjust link number of the victim */
-		if (S_ISDIR(new_dentry_inode->i_mode)) {
-			clear_nlink(new_dentry_inode);
-		} else {
-			drop_nlink(new_dentry_inode);
-		}
-		savelink = new_dentry_inode->i_nlink;
-	}
-
-	if (update_dir_parent) {
-		/* adjust ".." of renamed directory */
-		set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
-		journal_mark_dirty(&th, dot_dot_de.de_bh);
-	}
-	if (S_ISDIR(old_inode->i_mode)) {
-		/*
-		 * there (in new_dir) was no directory, so it got new link
-		 * (".."  of renamed directory)
-		 */
-		if (!new_dentry_inode)
-			INC_DIR_INODE_NLINK(new_dir);
-
-		/* old directory lost one link - ".. " of renamed directory */
-		DEC_DIR_INODE_NLINK(old_dir);
-	}
-	/*
-	 * looks like in 2.3.99pre3 brelse is atomic.
-	 * so we can use pathrelse
-	 */
-	pathrelse(&new_entry_path);
-	pathrelse(&dot_dot_entry_path);
-
-	/*
-	 * FIXME: this reiserfs_cut_from_item's return value may screw up
-	 * anybody, but it will panic if will not be able to find the
-	 * entry. This needs one more clean up
-	 */
-	if (reiserfs_cut_from_item
-	    (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL,
-	     0) < 0)
-		reiserfs_error(old_dir->i_sb, "vs-7060",
-			       "couldn't not cut old name. Fsck later?");
-
-	old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
-
-	reiserfs_update_sd(&th, old_dir);
-	reiserfs_update_sd(&th, new_dir);
-	reiserfs_update_sd(&th, old_inode);
-
-	if (new_dentry_inode) {
-		if (savelink == 0)
-			add_save_link(&th, new_dentry_inode,
-				      0 /* not truncate */ );
-		reiserfs_update_sd(&th, new_dentry_inode);
-	}
-
-	retval = journal_end(&th);
-	reiserfs_write_unlock(old_dir->i_sb);
-	return retval;
-}
-
-static const struct inode_operations reiserfs_priv_dir_inode_operations = {
-	.create = reiserfs_create,
-	.lookup = reiserfs_lookup,
-	.link = reiserfs_link,
-	.unlink = reiserfs_unlink,
-	.symlink = reiserfs_symlink,
-	.mkdir = reiserfs_mkdir,
-	.rmdir = reiserfs_rmdir,
-	.mknod = reiserfs_mknod,
-	.rename = reiserfs_rename,
-	.setattr = reiserfs_setattr,
-	.permission = reiserfs_permission,
-	.fileattr_get = reiserfs_fileattr_get,
-	.fileattr_set = reiserfs_fileattr_set,
-};
-
-static const struct inode_operations reiserfs_priv_symlink_inode_operations = {
-	.get_link	= page_get_link,
-	.setattr = reiserfs_setattr,
-	.permission = reiserfs_permission,
-};
-
-static const struct inode_operations reiserfs_priv_special_inode_operations = {
-	.setattr = reiserfs_setattr,
-	.permission = reiserfs_permission,
-};
-
-void reiserfs_init_priv_inode(struct inode *inode)
-{
-	inode->i_flags |= S_PRIVATE;
-	inode->i_opflags &= ~IOP_XATTR;
-
-	if (S_ISREG(inode->i_mode))
-		inode->i_op = &reiserfs_priv_file_inode_operations;
-	else if (S_ISDIR(inode->i_mode))
-		inode->i_op = &reiserfs_priv_dir_inode_operations;
-	else if (S_ISLNK(inode->i_mode))
-		inode->i_op = &reiserfs_priv_symlink_inode_operations;
-	else
-		inode->i_op = &reiserfs_priv_special_inode_operations;
-}
-
-/* directories can handle most operations...  */
-const struct inode_operations reiserfs_dir_inode_operations = {
-	.create = reiserfs_create,
-	.lookup = reiserfs_lookup,
-	.link = reiserfs_link,
-	.unlink = reiserfs_unlink,
-	.symlink = reiserfs_symlink,
-	.mkdir = reiserfs_mkdir,
-	.rmdir = reiserfs_rmdir,
-	.mknod = reiserfs_mknod,
-	.rename = reiserfs_rename,
-	.setattr = reiserfs_setattr,
-	.listxattr = reiserfs_listxattr,
-	.permission = reiserfs_permission,
-	.get_inode_acl = reiserfs_get_acl,
-	.set_acl = reiserfs_set_acl,
-	.fileattr_get = reiserfs_fileattr_get,
-	.fileattr_set = reiserfs_fileattr_set,
-};
-
-/*
- * symlink operations.. same as page_symlink_inode_operations, with xattr
- * stuff added
- */
-const struct inode_operations reiserfs_symlink_inode_operations = {
-	.get_link	= page_get_link,
-	.setattr = reiserfs_setattr,
-	.listxattr = reiserfs_listxattr,
-	.permission = reiserfs_permission,
-};
-
-/*
- * special file operations.. just xattr/acl stuff
- */
-const struct inode_operations reiserfs_special_inode_operations = {
-	.setattr = reiserfs_setattr,
-	.listxattr = reiserfs_listxattr,
-	.permission = reiserfs_permission,
-	.get_inode_acl = reiserfs_get_acl,
-	.set_acl = reiserfs_set_acl,
-};
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
deleted file mode 100644
index 34baf5c0f265..000000000000
--- a/fs/reiserfs/objectid.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/uuid.h>
-#include "reiserfs.h"
-
-/* find where objectid map starts */
-#define objectid_map(s,rs) (old_format_only (s) ? \
-                         (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
-			 (__le32 *)((rs) + 1))
-
-#ifdef CONFIG_REISERFS_CHECK
-
-static void check_objectid_map(struct super_block *s, __le32 * map)
-{
-	if (le32_to_cpu(map[0]) != 1)
-		reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
-			       (long unsigned int)le32_to_cpu(map[0]));
-
-	/* FIXME: add something else here */
-}
-
-#else
-static void check_objectid_map(struct super_block *s, __le32 * map)
-{;
-}
-#endif
-
-/*
- * When we allocate objectids we allocate the first unused objectid.
- * Each sequence of objectids in use (the odd sequences) is followed
- * by a sequence of objectids not in use (the even sequences).  We
- * only need to record the last objectid in each of these sequences
- * (both the odd and even sequences) in order to fully define the
- * boundaries of the sequences.  A consequence of allocating the first
- * objectid not in use is that under most conditions this scheme is
- * extremely compact.  The exception is immediately after a sequence
- * of operations which deletes a large number of objects of
- * non-sequential objectids, and even then it will become compact
- * again as soon as more objects are created.  Note that many
- * interesting optimizations of layout could result from complicating
- * objectid assignment, but we have deferred making them for now.
- */
-
-/* get unique object identifier */
-__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
-{
-	struct super_block *s = th->t_super;
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
-	__le32 *map = objectid_map(s, rs);
-	__u32 unused_objectid;
-
-	BUG_ON(!th->t_trans_id);
-
-	check_objectid_map(s, map);
-
-	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-	/* comment needed -Hans */
-	unused_objectid = le32_to_cpu(map[1]);
-	if (unused_objectid == U32_MAX) {
-		reiserfs_warning(s, "reiserfs-15100", "no more object ids");
-		reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
-		return 0;
-	}
-
-	/*
-	 * This incrementation allocates the first unused objectid. That
-	 * is to say, the first entry on the objectid map is the first
-	 * unused objectid, and by incrementing it we use it.  See below
-	 * where we check to see if we eliminated a sequence of unused
-	 * objectids....
-	 */
-	map[1] = cpu_to_le32(unused_objectid + 1);
-
-	/*
-	 * Now we check to see if we eliminated the last remaining member of
-	 * the first even sequence (and can eliminate the sequence by
-	 * eliminating its last objectid from oids), and can collapse the
-	 * first two odd sequences into one sequence.  If so, then the net
-	 * result is to eliminate a pair of objectids from oids.  We do this
-	 * by shifting the entire map to the left.
-	 */
-	if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
-		memmove(map + 1, map + 3,
-			(sb_oid_cursize(rs) - 3) * sizeof(__u32));
-		set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
-	}
-
-	journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
-	return unused_objectid;
-}
-
-/* makes object identifier unused */
-void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
-			       __u32 objectid_to_release)
-{
-	struct super_block *s = th->t_super;
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
-	__le32 *map = objectid_map(s, rs);
-	int i = 0;
-
-	BUG_ON(!th->t_trans_id);
-	/*return; */
-	check_objectid_map(s, map);
-
-	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-	journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
-
-	/*
-	 * start at the beginning of the objectid map (i = 0) and go to
-	 * the end of it (i = disk_sb->s_oid_cursize).  Linear search is
-	 * what we use, though it is possible that binary search would be
-	 * more efficient after performing lots of deletions (which is
-	 * when oids is large.)  We only check even i's.
-	 */
-	while (i < sb_oid_cursize(rs)) {
-		if (objectid_to_release == le32_to_cpu(map[i])) {
-			/* This incrementation unallocates the objectid. */
-			le32_add_cpu(&map[i], 1);
-
-			/*
-			 * Did we unallocate the last member of an
-			 * odd sequence, and can shrink oids?
-			 */
-			if (map[i] == map[i + 1]) {
-				/* shrink objectid map */
-				memmove(map + i, map + i + 2,
-					(sb_oid_cursize(rs) - i -
-					 2) * sizeof(__u32));
-				set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
-
-				RFALSE(sb_oid_cursize(rs) < 2 ||
-				       sb_oid_cursize(rs) > sb_oid_maxsize(rs),
-				       "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
-				       sb_oid_cursize(rs), sb_oid_maxsize(rs));
-			}
-			return;
-		}
-
-		if (objectid_to_release > le32_to_cpu(map[i]) &&
-		    objectid_to_release < le32_to_cpu(map[i + 1])) {
-			/* size of objectid map is not changed */
-			if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
-				le32_add_cpu(&map[i + 1], -1);
-				return;
-			}
-
-			/*
-			 * JDM comparing two little-endian values for
-			 * equality -- safe
-			 */
-			/*
-			 * objectid map must be expanded, but
-			 * there is no space
-			 */
-			if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
-				PROC_INFO_INC(s, leaked_oid);
-				return;
-			}
-
-			/* expand the objectid map */
-			memmove(map + i + 3, map + i + 1,
-				(sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
-			map[i + 1] = cpu_to_le32(objectid_to_release);
-			map[i + 2] = cpu_to_le32(objectid_to_release + 1);
-			set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2);
-			return;
-		}
-		i += 2;
-	}
-
-	reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)",
-		       (long unsigned)objectid_to_release);
-}
-
-int reiserfs_convert_objectid_map_v1(struct super_block *s)
-{
-	struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s);
-	int cur_size = sb_oid_cursize(disk_sb);
-	int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
-	int old_max = sb_oid_maxsize(disk_sb);
-	struct reiserfs_super_block_v1 *disk_sb_v1;
-	__le32 *objectid_map;
-	int i;
-
-	disk_sb_v1 =
-	    (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
-	objectid_map = (__le32 *) (disk_sb_v1 + 1);
-
-	if (cur_size > new_size) {
-		/*
-		 * mark everyone used that was listed as free at
-		 * the end of the objectid map
-		 */
-		objectid_map[new_size - 1] = objectid_map[cur_size - 1];
-		set_sb_oid_cursize(disk_sb, new_size);
-	}
-	/* move the smaller objectid map past the end of the new super */
-	for (i = new_size - 1; i >= 0; i--) {
-		objectid_map[i + (old_max - new_size)] = objectid_map[i];
-	}
-
-	/* set the max size so we don't overflow later */
-	set_sb_oid_maxsize(disk_sb, new_size);
-
-	/* Zero out label and generate random UUID */
-	memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label));
-	generate_random_uuid(disk_sb->s_uuid);
-
-	/* finally, zero out the unused chunk of the new super */
-	memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused));
-	return 0;
-}
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
deleted file mode 100644
index 84a194b77f19..000000000000
--- a/fs/reiserfs/prints.c
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-
-#include <linux/stdarg.h>
-
-static char error_buf[1024];
-static char fmt_buf[1024];
-static char off_buf[80];
-
-static char *reiserfs_cpu_offset(struct cpu_key *key)
-{
-	if (cpu_key_k_type(key) == TYPE_DIRENTRY)
-		sprintf(off_buf, "%llu(%llu)",
-			(unsigned long long)
-			GET_HASH_VALUE(cpu_key_k_offset(key)),
-			(unsigned long long)
-			GET_GENERATION_NUMBER(cpu_key_k_offset(key)));
-	else
-		sprintf(off_buf, "0x%Lx",
-			(unsigned long long)cpu_key_k_offset(key));
-	return off_buf;
-}
-
-static char *le_offset(struct reiserfs_key *key)
-{
-	int version;
-
-	version = le_key_version(key);
-	if (le_key_k_type(version, key) == TYPE_DIRENTRY)
-		sprintf(off_buf, "%llu(%llu)",
-			(unsigned long long)
-			GET_HASH_VALUE(le_key_k_offset(version, key)),
-			(unsigned long long)
-			GET_GENERATION_NUMBER(le_key_k_offset(version, key)));
-	else
-		sprintf(off_buf, "0x%Lx",
-			(unsigned long long)le_key_k_offset(version, key));
-	return off_buf;
-}
-
-static char *cpu_type(struct cpu_key *key)
-{
-	if (cpu_key_k_type(key) == TYPE_STAT_DATA)
-		return "SD";
-	if (cpu_key_k_type(key) == TYPE_DIRENTRY)
-		return "DIR";
-	if (cpu_key_k_type(key) == TYPE_DIRECT)
-		return "DIRECT";
-	if (cpu_key_k_type(key) == TYPE_INDIRECT)
-		return "IND";
-	return "UNKNOWN";
-}
-
-static char *le_type(struct reiserfs_key *key)
-{
-	int version;
-
-	version = le_key_version(key);
-
-	if (le_key_k_type(version, key) == TYPE_STAT_DATA)
-		return "SD";
-	if (le_key_k_type(version, key) == TYPE_DIRENTRY)
-		return "DIR";
-	if (le_key_k_type(version, key) == TYPE_DIRECT)
-		return "DIRECT";
-	if (le_key_k_type(version, key) == TYPE_INDIRECT)
-		return "IND";
-	return "UNKNOWN";
-}
-
-/* %k */
-static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key)
-{
-	if (key)
-		return scnprintf(buf, size, "[%d %d %s %s]",
-				 le32_to_cpu(key->k_dir_id),
-				 le32_to_cpu(key->k_objectid), le_offset(key),
-				 le_type(key));
-	else
-		return scnprintf(buf, size, "[NULL]");
-}
-
-/* %K */
-static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key)
-{
-	if (key)
-		return scnprintf(buf, size, "[%d %d %s %s]",
-				 key->on_disk_key.k_dir_id,
-				 key->on_disk_key.k_objectid,
-				 reiserfs_cpu_offset(key), cpu_type(key));
-	else
-		return scnprintf(buf, size, "[NULL]");
-}
-
-static int scnprintf_de_head(char *buf, size_t size,
-			     struct reiserfs_de_head *deh)
-{
-	if (deh)
-		return scnprintf(buf, size,
-				 "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]",
-				 deh_offset(deh), deh_dir_id(deh),
-				 deh_objectid(deh), deh_location(deh),
-				 deh_state(deh));
-	else
-		return scnprintf(buf, size, "[NULL]");
-
-}
-
-static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih)
-{
-	if (ih) {
-		char *p = buf;
-		char * const end = buf + size;
-
-		p += scnprintf(p, end - p, "%s",
-			       (ih_version(ih) == KEY_FORMAT_3_6) ?
-			       "*3.6* " : "*3.5*");
-
-		p += scnprintf_le_key(p, end - p, &ih->ih_key);
-
-		p += scnprintf(p, end - p,
-			       ", item_len %d, item_location %d, free_space(entry_count) %d",
-			       ih_item_len(ih), ih_location(ih),
-			       ih_free_space(ih));
-		return p - buf;
-	} else
-		return scnprintf(buf, size, "[NULL]");
-}
-
-static int scnprintf_direntry(char *buf, size_t size,
-			      struct reiserfs_dir_entry *de)
-{
-	char name[20];
-
-	memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
-	name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
-	return scnprintf(buf, size, "\"%s\"==>[%d %d]",
-			 name, de->de_dir_id, de->de_objectid);
-}
-
-static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh)
-{
-	return scnprintf(buf, size,
-			 "level=%d, nr_items=%d, free_space=%d rdkey ",
-			 B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh));
-}
-
-static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh)
-{
-	return scnprintf(buf, size,
-			 "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
-			 bh->b_bdev, bh->b_size,
-			 (unsigned long long)bh->b_blocknr,
-			 atomic_read(&(bh->b_count)),
-			 bh->b_state, bh->b_page,
-			 buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
-			 buffer_dirty(bh) ? "DIRTY" : "CLEAN",
-			 buffer_locked(bh) ? "LOCKED" : "UNLOCKED");
-}
-
-static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc)
-{
-	return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]",
-			 dc_block_number(dc), dc_size(dc));
-}
-
-static char *is_there_reiserfs_struct(char *fmt, int *what)
-{
-	char *k = fmt;
-
-	while ((k = strchr(k, '%')) != NULL) {
-		if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
-		    k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
-			*what = k[1];
-			break;
-		}
-		k++;
-	}
-	return k;
-}
-
-/*
- * debugging reiserfs we used to print out a lot of different
- * variables, like keys, item headers, buffer heads etc. Values of
- * most fields matter. So it took a long time just to write
- * appropriative printk. With this reiserfs_warning you can use format
- * specification for complex structures like you used to do with
- * printfs for integers, doubles and pointers. For instance, to print
- * out key structure you have to write just:
- * reiserfs_warning ("bad key %k", key);
- * instead of
- * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
- *         key->k_offset, key->k_uniqueness);
- */
-static DEFINE_SPINLOCK(error_lock);
-static void prepare_error_buf(const char *fmt, va_list args)
-{
-	char *fmt1 = fmt_buf;
-	char *k;
-	char *p = error_buf;
-	char * const end = &error_buf[sizeof(error_buf)];
-	int what;
-
-	spin_lock(&error_lock);
-
-	if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) {
-		strscpy(error_buf, "format string too long", end - error_buf);
-		goto out_unlock;
-	}
-
-	while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) {
-		*k = 0;
-
-		p += vscnprintf(p, end - p, fmt1, args);
-
-		switch (what) {
-		case 'k':
-			p += scnprintf_le_key(p, end - p,
-					      va_arg(args, struct reiserfs_key *));
-			break;
-		case 'K':
-			p += scnprintf_cpu_key(p, end - p,
-					       va_arg(args, struct cpu_key *));
-			break;
-		case 'h':
-			p += scnprintf_item_head(p, end - p,
-						 va_arg(args, struct item_head *));
-			break;
-		case 't':
-			p += scnprintf_direntry(p, end - p,
-						va_arg(args, struct reiserfs_dir_entry *));
-			break;
-		case 'y':
-			p += scnprintf_disk_child(p, end - p,
-						  va_arg(args, struct disk_child *));
-			break;
-		case 'z':
-			p += scnprintf_block_head(p, end - p,
-						  va_arg(args, struct buffer_head *));
-			break;
-		case 'b':
-			p += scnprintf_buffer_head(p, end - p,
-						   va_arg(args, struct buffer_head *));
-			break;
-		case 'a':
-			p += scnprintf_de_head(p, end - p,
-					       va_arg(args, struct reiserfs_de_head *));
-			break;
-		}
-
-		fmt1 = k + 2;
-	}
-	p += vscnprintf(p, end - p, fmt1, args);
-out_unlock:
-	spin_unlock(&error_lock);
-
-}
-
-/*
- * in addition to usual conversion specifiers this accepts reiserfs
- * specific conversion specifiers:
- * %k to print little endian key,
- * %K to print cpu key,
- * %h to print item_head,
- * %t to print directory entry
- * %z to print block head (arg must be struct buffer_head *
- * %b to print buffer_head
- */
-
-#define do_reiserfs_warning(fmt)\
-{\
-    va_list args;\
-    va_start( args, fmt );\
-    prepare_error_buf( fmt, args );\
-    va_end( args );\
-}
-
-void __reiserfs_warning(struct super_block *sb, const char *id,
-			 const char *function, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-	if (sb)
-		printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: "
-		       "%s\n", sb->s_id, id ? id : "", id ? " " : "",
-		       function, error_buf);
-	else
-		printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n",
-		       id ? id : "", id ? " " : "", function, error_buf);
-}
-
-/* No newline.. reiserfs_info calls can be followed by printk's */
-void reiserfs_info(struct super_block *sb, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-	if (sb)
-		printk(KERN_NOTICE "REISERFS (device %s): %s",
-		       sb->s_id, error_buf);
-	else
-		printk(KERN_NOTICE "REISERFS %s:", error_buf);
-}
-
-/* No newline.. reiserfs_printk calls can be followed by printk's */
-static void reiserfs_printk(const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-	printk(error_buf);
-}
-
-void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
-{
-#ifdef CONFIG_REISERFS_CHECK
-	do_reiserfs_warning(fmt);
-	if (s)
-		printk(KERN_DEBUG "REISERFS debug (device %s): %s\n",
-		       s->s_id, error_buf);
-	else
-		printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf);
-#endif
-}
-
-/*
- * The format:
- *
- *          maintainer-errorid: [function-name:] message
- *
- *   where errorid is unique to the maintainer and function-name is
- *   optional, is recommended, so that anyone can easily find the bug
- *   with a simple grep for the short to type string
- *   maintainer-errorid.  Don't bother with reusing errorids, there are
- *   lots of numbers out there.
- *
- *   Example:
- *
- *   reiserfs_panic(
- *     p_sb, "reiser-29: reiserfs_new_blocknrs: "
- *     "one of search_start or rn(%d) is equal to MAX_B_NUM,"
- *     "which means that we are optimizing location based on the "
- *     "bogus location of a temp buffer (%p).",
- *     rn, bh
- *   );
- *
- *   Regular panic()s sometimes clear the screen before the message can
- *   be read, thus the need for the while loop.
- *
- *   Numbering scheme for panic used by Vladimir and Anatoly( Hans completely
- *   ignores this scheme, and considers it pointless complexity):
- *
- *   panics in reiserfs_fs.h have numbers from 1000 to 1999
- *   super.c			2000 to 2999
- *   preserve.c (unused)	3000 to 3999
- *   bitmap.c			4000 to 4999
- *   stree.c			5000 to 5999
- *   prints.c			6000 to 6999
- *   namei.c			7000 to 7999
- *   fix_nodes.c		8000 to 8999
- *   dir.c			9000 to 9999
- *   lbalance.c			10000 to 10999
- *   ibalance.c			11000 to 11999 not ready
- *   do_balan.c			12000 to 12999
- *   inode.c			13000 to 13999
- *   file.c			14000 to 14999
- *   objectid.c			15000 - 15999
- *   buffer.c			16000 - 16999
- *   symlink.c			17000 - 17999
- *
- *  .  */
-
-void __reiserfs_panic(struct super_block *sb, const char *id,
-		      const char *function, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-
-#ifdef CONFIG_REISERFS_CHECK
-	dump_stack();
-#endif
-	if (sb)
-		printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
-		      sb->s_id, id ? id : "", id ? " " : "",
-		      function, error_buf);
-	else
-		printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
-		      id ? id : "", id ? " " : "", function, error_buf);
-	BUG();
-}
-
-void __reiserfs_error(struct super_block *sb, const char *id,
-		      const char *function, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-
-	BUG_ON(sb == NULL);
-
-	if (reiserfs_error_panic(sb))
-		__reiserfs_panic(sb, id, function, error_buf);
-
-	if (id && id[0])
-		printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n",
-		       sb->s_id, id, function, error_buf);
-	else
-		printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n",
-		       sb->s_id, function, error_buf);
-
-	if (sb_rdonly(sb))
-		return;
-
-	reiserfs_info(sb, "Remounting filesystem read-only\n");
-	sb->s_flags |= SB_RDONLY;
-	reiserfs_abort_journal(sb, -EIO);
-}
-
-void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
-{
-	do_reiserfs_warning(fmt);
-
-	if (reiserfs_error_panic(sb)) {
-		panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id,
-		      error_buf);
-	}
-
-	if (reiserfs_is_journal_aborted(SB_JOURNAL(sb)))
-		return;
-
-	printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
-	       error_buf);
-
-	sb->s_flags |= SB_RDONLY;
-	reiserfs_abort_journal(sb, errno);
-}
-
-/*
- * this prints internal nodes (4 keys/items in line) (dc_number,
- * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
- * dc_size)...
- */
-static int print_internal(struct buffer_head *bh, int first, int last)
-{
-	struct reiserfs_key *key;
-	struct disk_child *dc;
-	int i;
-	int from, to;
-
-	if (!B_IS_KEYS_LEVEL(bh))
-		return 1;
-
-	check_internal(bh);
-
-	if (first == -1) {
-		from = 0;
-		to = B_NR_ITEMS(bh);
-	} else {
-		from = first;
-		to = min_t(int, last, B_NR_ITEMS(bh));
-	}
-
-	reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
-
-	dc = B_N_CHILD(bh, from);
-	reiserfs_printk("PTR %d: %y ", from, dc);
-
-	for (i = from, key = internal_key(bh, from), dc++; i < to;
-	     i++, key++, dc++) {
-		reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
-		if (i && i % 4 == 0)
-			printk("\n");
-	}
-	printk("\n");
-	return 0;
-}
-
-static int print_leaf(struct buffer_head *bh, int print_mode, int first,
-		      int last)
-{
-	struct block_head *blkh;
-	struct item_head *ih;
-	int i, nr;
-	int from, to;
-
-	if (!B_IS_ITEMS_LEVEL(bh))
-		return 1;
-
-	check_leaf(bh);
-
-	blkh = B_BLK_HEAD(bh);
-	ih = item_head(bh, 0);
-	nr = blkh_nr_item(blkh);
-
-	printk
-	    ("\n===================================================================\n");
-	reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
-
-	if (!(print_mode & PRINT_LEAF_ITEMS)) {
-		reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
-				&(ih->ih_key), &((ih + nr - 1)->ih_key));
-		return 0;
-	}
-
-	if (first < 0 || first > nr - 1)
-		from = 0;
-	else
-		from = first;
-
-	if (last < 0 || last > nr)
-		to = nr;
-	else
-		to = last;
-
-	ih += from;
-	printk
-	    ("-------------------------------------------------------------------------------\n");
-	printk
-	    ("|##|   type    |           key           | ilen | free_space | version | loc  |\n");
-	for (i = from; i < to; i++, ih++) {
-		printk
-		    ("-------------------------------------------------------------------------------\n");
-		reiserfs_printk("|%2d| %h |\n", i, ih);
-		if (print_mode & PRINT_LEAF_ITEMS)
-			op_print_item(ih, ih_item_body(bh, ih));
-	}
-
-	printk
-	    ("===================================================================\n");
-
-	return 0;
-}
-
-char *reiserfs_hashname(int code)
-{
-	if (code == YURA_HASH)
-		return "rupasov";
-	if (code == TEA_HASH)
-		return "tea";
-	if (code == R5_HASH)
-		return "r5";
-
-	return "unknown";
-}
-
-/* return 1 if this is not super block */
-static int print_super_block(struct buffer_head *bh)
-{
-	struct reiserfs_super_block *rs =
-	    (struct reiserfs_super_block *)(bh->b_data);
-	int skipped, data_blocks;
-	char *version;
-
-	if (is_reiserfs_3_5(rs)) {
-		version = "3.5";
-	} else if (is_reiserfs_3_6(rs)) {
-		version = "3.6";
-	} else if (is_reiserfs_jr(rs)) {
-		version = ((sb_version(rs) == REISERFS_VERSION_2) ?
-			   "3.6" : "3.5");
-	} else {
-		return 1;
-	}
-
-	printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
-	       (unsigned long long)bh->b_blocknr);
-	printk("Reiserfs version %s\n", version);
-	printk("Block count %u\n", sb_block_count(rs));
-	printk("Blocksize %d\n", sb_blocksize(rs));
-	printk("Free blocks %u\n", sb_free_blocks(rs));
-	/*
-	 * FIXME: this would be confusing if
-	 * someone stores reiserfs super block in some data block ;)
-//    skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
-	 */
-	skipped = bh->b_blocknr;
-	data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
-	    (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
-	     1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs);
-	printk
-	    ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n"
-	     "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs),
-	     (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
-	      sb_reserved_for_journal(rs)), data_blocks);
-	printk("Root block %u\n", sb_root_block(rs));
-	printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
-	printk("Journal dev %d\n", sb_jp_journal_dev(rs));
-	printk("Journal orig size %d\n", sb_jp_journal_size(rs));
-	printk("FS state %d\n", sb_fs_state(rs));
-	printk("Hash function \"%s\"\n",
-	       reiserfs_hashname(sb_hash_function_code(rs)));
-
-	printk("Tree height %d\n", sb_tree_height(rs));
-	return 0;
-}
-
-static int print_desc_block(struct buffer_head *bh)
-{
-	struct reiserfs_journal_desc *desc;
-
-	if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8))
-		return 1;
-
-	desc = (struct reiserfs_journal_desc *)(bh->b_data);
-	printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
-	       (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc),
-	       get_desc_mount_id(desc), get_desc_trans_len(desc));
-
-	return 0;
-}
-/* ..., int print_mode, int first, int last) */
-void print_block(struct buffer_head *bh, ...)
-{
-	va_list args;
-	int mode, first, last;
-
-	if (!bh) {
-		printk("print_block: buffer is NULL\n");
-		return;
-	}
-
-	va_start(args, bh);
-
-	mode = va_arg(args, int);
-	first = va_arg(args, int);
-	last = va_arg(args, int);
-	if (print_leaf(bh, mode, first, last))
-		if (print_internal(bh, first, last))
-			if (print_super_block(bh))
-				if (print_desc_block(bh))
-					printk
-					    ("Block %llu contains unformatted data\n",
-					     (unsigned long long)bh->b_blocknr);
-
-	va_end(args);
-}
-
-static char print_tb_buf[2048];
-
-/* this stores initial state of tree balance in the print_tb_buf */
-void store_print_tb(struct tree_balance *tb)
-{
-	int h = 0;
-	int i;
-	struct buffer_head *tbSh, *tbFh;
-
-	if (!tb)
-		return;
-
-	sprintf(print_tb_buf, "\n"
-		"BALANCING %d\n"
-		"MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n"
-		"=====================================================================\n"
-		"* h *    S    *    L    *    R    *   F   *   FL  *   FR  *  CFL  *  CFR  *\n",
-		REISERFS_SB(tb->tb_sb)->s_do_balance,
-		tb->tb_mode, PATH_LAST_POSITION(tb->tb_path),
-		tb->tb_path->pos_in_item);
-
-	for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) {
-		if (PATH_H_PATH_OFFSET(tb->tb_path, h) <=
-		    tb->tb_path->path_length
-		    && PATH_H_PATH_OFFSET(tb->tb_path,
-					  h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
-			tbSh = PATH_H_PBUFFER(tb->tb_path, h);
-			tbFh = PATH_H_PPARENT(tb->tb_path, h);
-		} else {
-			tbSh = NULL;
-			tbFh = NULL;
-		}
-		sprintf(print_tb_buf + strlen(print_tb_buf),
-			"* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
-			h,
-			(tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
-			(tbSh) ? atomic_read(&tbSh->b_count) : -1,
-			(tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
-			(tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1,
-			(tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
-			(tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1,
-			(tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
-			(tb->FL[h]) ? (long long)(tb->FL[h]->
-						  b_blocknr) : (-1LL),
-			(tb->FR[h]) ? (long long)(tb->FR[h]->
-						  b_blocknr) : (-1LL),
-			(tb->CFL[h]) ? (long long)(tb->CFL[h]->
-						   b_blocknr) : (-1LL),
-			(tb->CFR[h]) ? (long long)(tb->CFR[h]->
-						   b_blocknr) : (-1LL));
-	}
-
-	sprintf(print_tb_buf + strlen(print_tb_buf),
-		"=====================================================================\n"
-		"* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
-		"* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
-		tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
-		tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0],
-		tb->sbytes[0], tb->snum[1], tb->sbytes[1],
-		tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
-
-	/* this prints balance parameters for non-leaf levels */
-	h = 0;
-	do {
-		h++;
-		sprintf(print_tb_buf + strlen(print_tb_buf),
-			"* %d * %4d * %2d *    * %2d *    * %2d *\n",
-			h, tb->insert_size[h], tb->lnum[h], tb->rnum[h],
-			tb->blknum[h]);
-	} while (tb->insert_size[h]);
-
-	sprintf(print_tb_buf + strlen(print_tb_buf),
-		"=====================================================================\n"
-		"FEB list: ");
-
-	/* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */
-	h = 0;
-	for (i = 0; i < ARRAY_SIZE(tb->FEB); i++)
-		sprintf(print_tb_buf + strlen(print_tb_buf),
-			"%p (%llu %d)%s", tb->FEB[i],
-			tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
-			b_blocknr : 0ULL,
-			tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0,
-			(i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", ");
-
-	sprintf(print_tb_buf + strlen(print_tb_buf),
-		"======================== the end ====================================\n");
-}
-
-void print_cur_tb(char *mes)
-{
-	printk("%s\n%s", mes, print_tb_buf);
-}
-
-static void check_leaf_block_head(struct buffer_head *bh)
-{
-	struct block_head *blkh;
-	int nr;
-
-	blkh = B_BLK_HEAD(bh);
-	nr = blkh_nr_item(blkh);
-	if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
-		reiserfs_panic(NULL, "vs-6010", "invalid item number %z",
-			       bh);
-	if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
-		reiserfs_panic(NULL, "vs-6020", "invalid free space %z",
-			       bh);
-
-}
-
-static void check_internal_block_head(struct buffer_head *bh)
-{
-	if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
-		reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
-
-	if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
-		reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh);
-
-	if (B_FREE_SPACE(bh) !=
-	    bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
-	    DC_SIZE * (B_NR_ITEMS(bh) + 1))
-		reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh);
-
-}
-
-void check_leaf(struct buffer_head *bh)
-{
-	int i;
-	struct item_head *ih;
-
-	if (!bh)
-		return;
-	check_leaf_block_head(bh);
-	for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
-		op_check_item(ih, ih_item_body(bh, ih));
-}
-
-void check_internal(struct buffer_head *bh)
-{
-	if (!bh)
-		return;
-	check_internal_block_head(bh);
-}
-
-void print_statistics(struct super_block *s)
-{
-
-	/*
-	   printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
-	   bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
-	   REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
-	   REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
-	   REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
-	 */
-
-}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
deleted file mode 100644
index 5c68a4a52d78..000000000000
--- a/fs/reiserfs/procfs.c
+++ /dev/null
@@ -1,490 +0,0 @@
-/* -*- linux-c -*- */
-
-/* fs/reiserfs/procfs.c */
-
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/* proc info support a la one created by Sizif@Botik.RU for PGC */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include <linux/uaccess.h>
-#include "reiserfs.h"
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <linux/blkdev.h>
-
-/*
- * LOCKING:
- *
- * These guys are evicted from procfs as the very first step in ->kill_sb().
- *
- */
-
-static int show_version(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	char *format;
-
-	if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
-		format = "3.6";
-	} else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) {
-		format = "3.5";
-	} else {
-		format = "unknown";
-	}
-
-	seq_printf(m, "%s format\twith checks %s\n", format,
-#if defined( CONFIG_REISERFS_CHECK )
-		   "on"
-#else
-		   "off"
-#endif
-	    );
-	return 0;
-}
-
-#define SF( x ) ( r -> x )
-#define SFP( x ) SF( s_proc_info_data.x )
-#define SFPL( x ) SFP( x[ level ] )
-#define SFPF( x ) SFP( scan_bitmap.x )
-#define SFPJ( x ) SFP( journal.x )
-
-#define D2C( x ) le16_to_cpu( x )
-#define D4C( x ) le32_to_cpu( x )
-#define DF( x ) D2C( rs -> s_v1.x )
-#define DFL( x ) D4C( rs -> s_v1.x )
-
-#define objectid_map( s, rs ) (old_format_only (s) ?				\
-                         (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) :	\
-			 (__le32 *)(rs + 1))
-#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
-
-#define DJF( x ) le32_to_cpu( rs -> x )
-#define DJP( x ) le32_to_cpu( jp -> x )
-#define JF( x ) ( r -> s_journal -> x )
-
-static int show_super(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-
-	seq_printf(m, "state: \t%s\n"
-		   "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
-		   "gen. counter: \t%i\n"
-		   "s_disk_reads: \t%i\n"
-		   "s_disk_writes: \t%i\n"
-		   "s_fix_nodes: \t%i\n"
-		   "s_do_balance: \t%i\n"
-		   "s_unneeded_left_neighbor: \t%i\n"
-		   "s_good_search_by_key_reada: \t%i\n"
-		   "s_bmaps: \t%i\n"
-		   "s_bmaps_without_search: \t%i\n"
-		   "s_direct2indirect: \t%i\n"
-		   "s_indirect2direct: \t%i\n"
-		   "\n"
-		   "max_hash_collisions: \t%i\n"
-		   "breads: \t%lu\n"
-		   "bread_misses: \t%lu\n"
-		   "search_by_key: \t%lu\n"
-		   "search_by_key_fs_changed: \t%lu\n"
-		   "search_by_key_restarted: \t%lu\n"
-		   "insert_item_restarted: \t%lu\n"
-		   "paste_into_item_restarted: \t%lu\n"
-		   "cut_from_item_restarted: \t%lu\n"
-		   "delete_solid_item_restarted: \t%lu\n"
-		   "delete_item_restarted: \t%lu\n"
-		   "leaked_oid: \t%lu\n"
-		   "leaves_removable: \t%lu\n",
-		   SF(s_mount_state) == REISERFS_VALID_FS ?
-		   "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
-		   reiserfs_r5_hash(sb) ? "FORCE_R5 " : "",
-		   reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "",
-		   reiserfs_tea_hash(sb) ? "FORCE_TEA " : "",
-		   reiserfs_hash_detect(sb) ? "DETECT_HASH " : "",
-		   reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ",
-		   reiserfs_no_unhashed_relocation(sb) ?
-		   "NO_UNHASHED_RELOCATION " : "",
-		   reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "",
-		   reiserfs_test4(sb) ? "TEST4 " : "",
-		   have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ?
-		   "SMALL_TAILS " : "NO_TAILS ",
-		   replay_only(sb) ? "REPLAY_ONLY " : "",
-		   convert_reiserfs(sb) ? "CONV " : "",
-		   atomic_read(&r->s_generation_counter),
-		   SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
-		   SF(s_do_balance), SF(s_unneeded_left_neighbor),
-		   SF(s_good_search_by_key_reada), SF(s_bmaps),
-		   SF(s_bmaps_without_search), SF(s_direct2indirect),
-		   SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads),
-		   SFP(bread_miss), SFP(search_by_key),
-		   SFP(search_by_key_fs_changed), SFP(search_by_key_restarted),
-		   SFP(insert_item_restarted), SFP(paste_into_item_restarted),
-		   SFP(cut_from_item_restarted),
-		   SFP(delete_solid_item_restarted), SFP(delete_item_restarted),
-		   SFP(leaked_oid), SFP(leaves_removable));
-
-	return 0;
-}
-
-static int show_per_level(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-	int level;
-
-	seq_printf(m, "level\t"
-		   "     balances"
-		   " [sbk:  reads"
-		   "   fs_changed"
-		   "   restarted]"
-		   "   free space"
-		   "        items"
-		   "   can_remove"
-		   "         lnum"
-		   "         rnum"
-		   "       lbytes"
-		   "       rbytes"
-		   "     get_neig"
-		   " get_neig_res" "  need_l_neig" "  need_r_neig" "\n");
-
-	for (level = 0; level < MAX_HEIGHT; ++level) {
-		seq_printf(m, "%i\t"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12li"
-			   " %12li"
-			   " %12li"
-			   " %12li"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   " %12lu"
-			   "\n",
-			   level,
-			   SFPL(balance_at),
-			   SFPL(sbk_read_at),
-			   SFPL(sbk_fs_changed),
-			   SFPL(sbk_restarted),
-			   SFPL(free_at),
-			   SFPL(items_at),
-			   SFPL(can_node_be_removed),
-			   SFPL(lnum),
-			   SFPL(rnum),
-			   SFPL(lbytes),
-			   SFPL(rbytes),
-			   SFPL(get_neighbors),
-			   SFPL(get_neighbors_restart),
-			   SFPL(need_l_neighbor), SFPL(need_r_neighbor)
-		    );
-	}
-	return 0;
-}
-
-static int show_bitmap(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-
-	seq_printf(m, "free_block: %lu\n"
-		   "  scan_bitmap:"
-		   "          wait"
-		   "          bmap"
-		   "         retry"
-		   "        stolen"
-		   "  journal_hint"
-		   "journal_nohint"
-		   "\n"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   " %14lu"
-		   "\n",
-		   SFP(free_block),
-		   SFPF(call),
-		   SFPF(wait),
-		   SFPF(bmap),
-		   SFPF(retry),
-		   SFPF(stolen),
-		   SFPF(in_journal_hint), SFPF(in_journal_nohint));
-
-	return 0;
-}
-
-static int show_on_disk_super(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = sb_info->s_rs;
-	int hash_code = DFL(s_hash_function_code);
-	__u32 flags = DJF(s_flags);
-
-	seq_printf(m, "block_count: \t%i\n"
-		   "free_blocks: \t%i\n"
-		   "root_block: \t%i\n"
-		   "blocksize: \t%i\n"
-		   "oid_maxsize: \t%i\n"
-		   "oid_cursize: \t%i\n"
-		   "umount_state: \t%i\n"
-		   "magic: \t%10.10s\n"
-		   "fs_state: \t%i\n"
-		   "hash: \t%s\n"
-		   "tree_height: \t%i\n"
-		   "bmap_nr: \t%i\n"
-		   "version: \t%i\n"
-		   "flags: \t%x[%s]\n"
-		   "reserved_for_journal: \t%i\n",
-		   DFL(s_block_count),
-		   DFL(s_free_blocks),
-		   DFL(s_root_block),
-		   DF(s_blocksize),
-		   DF(s_oid_maxsize),
-		   DF(s_oid_cursize),
-		   DF(s_umount_state),
-		   rs->s_v1.s_magic,
-		   DF(s_fs_state),
-		   hash_code == TEA_HASH ? "tea" :
-		   (hash_code == YURA_HASH) ? "rupasov" :
-		   (hash_code == R5_HASH) ? "r5" :
-		   (hash_code == UNSET_HASH) ? "unset" : "unknown",
-		   DF(s_tree_height),
-		   DF(s_bmap_nr),
-		   DF(s_version), flags, (flags & reiserfs_attrs_cleared)
-		   ? "attrs_cleared" : "", DF(s_reserved_for_journal));
-
-	return 0;
-}
-
-static int show_oidmap(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = sb_info->s_rs;
-	unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
-	unsigned long total_used = 0;
-	int i;
-
-	for (i = 0; i < mapsize; ++i) {
-		__u32 right;
-
-		right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1);
-		seq_printf(m, "%s: [ %x .. %x )\n",
-			   (i & 1) ? "free" : "used", MAP(i), right);
-		if (!(i & 1)) {
-			total_used += right - MAP(i);
-		}
-	}
-#if defined( REISERFS_USE_OIDMAPF )
-	if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
-		loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
-		total_used += size / sizeof(reiserfs_oidinterval_d_t);
-	}
-#endif
-	seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
-		   mapsize,
-		   mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used);
-	return 0;
-}
-
-static time64_t ktime_mono_to_real_seconds(time64_t mono)
-{
-	ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2);
-
-	return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC);
-}
-
-static int show_journal(struct seq_file *m, void *unused)
-{
-	struct super_block *sb = m->private;
-	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = r->s_rs;
-	struct journal_params *jp = &rs->s_v1.s_journal;
-
-	seq_printf(m,		/* on-disk fields */
-		   "jp_journal_1st_block: \t%i\n"
-		   "jp_journal_dev: \t%pg[%x]\n"
-		   "jp_journal_size: \t%i\n"
-		   "jp_journal_trans_max: \t%i\n"
-		   "jp_journal_magic: \t%i\n"
-		   "jp_journal_max_batch: \t%i\n"
-		   "jp_journal_max_commit_age: \t%i\n"
-		   "jp_journal_max_trans_age: \t%i\n"
-		   /* incore fields */
-		   "j_1st_reserved_block: \t%i\n"
-		   "j_state: \t%li\n"
-		   "j_trans_id: \t%u\n"
-		   "j_mount_id: \t%lu\n"
-		   "j_start: \t%lu\n"
-		   "j_len: \t%lu\n"
-		   "j_len_alloc: \t%lu\n"
-		   "j_wcount: \t%i\n"
-		   "j_bcount: \t%lu\n"
-		   "j_first_unflushed_offset: \t%lu\n"
-		   "j_last_flush_trans_id: \t%u\n"
-		   "j_trans_start_time: \t%lli\n"
-		   "j_list_bitmap_index: \t%i\n"
-		   "j_must_wait: \t%i\n"
-		   "j_next_full_flush: \t%i\n"
-		   "j_next_async_flush: \t%i\n"
-		   "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n"
-		   /* reiserfs_proc_info_data_t.journal fields */
-		   "in_journal: \t%12lu\n"
-		   "in_journal_bitmap: \t%12lu\n"
-		   "in_journal_reusable: \t%12lu\n"
-		   "lock_journal: \t%12lu\n"
-		   "lock_journal_wait: \t%12lu\n"
-		   "journal_begin: \t%12lu\n"
-		   "journal_relock_writers: \t%12lu\n"
-		   "journal_relock_wcount: \t%12lu\n"
-		   "mark_dirty: \t%12lu\n"
-		   "mark_dirty_already: \t%12lu\n"
-		   "mark_dirty_notjournal: \t%12lu\n"
-		   "restore_prepared: \t%12lu\n"
-		   "prepare: \t%12lu\n"
-		   "prepare_retry: \t%12lu\n",
-		   DJP(jp_journal_1st_block),
-		   file_bdev(SB_JOURNAL(sb)->j_bdev_file),
-		   DJP(jp_journal_dev),
-		   DJP(jp_journal_size),
-		   DJP(jp_journal_trans_max),
-		   DJP(jp_journal_magic),
-		   DJP(jp_journal_max_batch),
-		   SB_JOURNAL(sb)->j_max_commit_age,
-		   DJP(jp_journal_max_trans_age),
-		   JF(j_1st_reserved_block),
-		   JF(j_state),
-		   JF(j_trans_id),
-		   JF(j_mount_id),
-		   JF(j_start),
-		   JF(j_len),
-		   JF(j_len_alloc),
-		   atomic_read(&r->s_journal->j_wcount),
-		   JF(j_bcount),
-		   JF(j_first_unflushed_offset),
-		   JF(j_last_flush_trans_id),
-		   ktime_mono_to_real_seconds(JF(j_trans_start_time)),
-		   JF(j_list_bitmap_index),
-		   JF(j_must_wait),
-		   JF(j_next_full_flush),
-		   JF(j_next_async_flush),
-		   JF(j_cnode_used),
-		   JF(j_cnode_free),
-		   SFPJ(in_journal),
-		   SFPJ(in_journal_bitmap),
-		   SFPJ(in_journal_reusable),
-		   SFPJ(lock_journal),
-		   SFPJ(lock_journal_wait),
-		   SFPJ(journal_being),
-		   SFPJ(journal_relock_writers),
-		   SFPJ(journal_relock_wcount),
-		   SFPJ(mark_dirty),
-		   SFPJ(mark_dirty_already),
-		   SFPJ(mark_dirty_notjournal),
-		   SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry)
-	    );
-	return 0;
-}
-
-static struct proc_dir_entry *proc_info_root = NULL;
-static const char proc_info_root_name[] = "fs/reiserfs";
-
-static void add_file(struct super_block *sb, char *name,
-		     int (*func) (struct seq_file *, void *))
-{
-	proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb);
-}
-
-int reiserfs_proc_info_init(struct super_block *sb)
-{
-	char b[BDEVNAME_SIZE];
-	char *s;
-
-	/* Some block devices use /'s */
-	strscpy(b, sb->s_id, BDEVNAME_SIZE);
-	s = strchr(b, '/');
-	if (s)
-		*s = '!';
-
-	spin_lock_init(&__PINFO(sb).lock);
-	REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
-	if (REISERFS_SB(sb)->procdir) {
-		add_file(sb, "version", show_version);
-		add_file(sb, "super", show_super);
-		add_file(sb, "per-level", show_per_level);
-		add_file(sb, "bitmap", show_bitmap);
-		add_file(sb, "on-disk-super", show_on_disk_super);
-		add_file(sb, "oidmap", show_oidmap);
-		add_file(sb, "journal", show_journal);
-		return 0;
-	}
-	reiserfs_warning(sb, "cannot create /proc/%s/%s",
-			 proc_info_root_name, b);
-	return 1;
-}
-
-int reiserfs_proc_info_done(struct super_block *sb)
-{
-	struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
-	if (de) {
-		char b[BDEVNAME_SIZE];
-		char *s;
-
-		/* Some block devices use /'s */
-		strscpy(b, sb->s_id, BDEVNAME_SIZE);
-		s = strchr(b, '/');
-		if (s)
-			*s = '!';
-
-		remove_proc_subtree(b, proc_info_root);
-		REISERFS_SB(sb)->procdir = NULL;
-	}
-	return 0;
-}
-
-int reiserfs_proc_info_global_init(void)
-{
-	if (proc_info_root == NULL) {
-		proc_info_root = proc_mkdir(proc_info_root_name, NULL);
-		if (!proc_info_root) {
-			reiserfs_warning(NULL, "cannot create /proc/%s",
-					 proc_info_root_name);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-int reiserfs_proc_info_global_done(void)
-{
-	if (proc_info_root != NULL) {
-		proc_info_root = NULL;
-		remove_proc_entry(proc_info_root_name, NULL);
-	}
-	return 0;
-}
-/*
- * Revision 1.1.8.2  2001/07/15 17:08:42  god
- *  . use get_super() in procfs.c
- *  . remove remove_save_link() from reiserfs_do_truncate()
- *
- * I accept terms and conditions stated in the Legal Agreement
- * (available at http://www.namesys.com/legalese.html)
- *
- * Revision 1.1.8.1  2001/07/11 16:48:50  god
- * proc info support
- *
- * I accept terms and conditions stated in the Legal Agreement
- * (available at http://www.namesys.com/legalese.html)
- *
- */
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
deleted file mode 100644
index 12fc20af8e17..000000000000
--- a/fs/reiserfs/reiserfs.h
+++ /dev/null
@@ -1,3419 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
- * licensing and copyright details
- */
-
-#include <linux/reiserfs_fs.h>
-
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/bug.h>
-#include <linux/workqueue.h>
-#include <linux/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/proc_fs.h>
-#include <linux/buffer_head.h>
-
-/* the 32 bit compat definitions with int argument */
-#define REISERFS_IOC32_UNPACK		_IOW(0xCD, 1, int)
-#define REISERFS_IOC32_GETVERSION	FS_IOC32_GETVERSION
-#define REISERFS_IOC32_SETVERSION	FS_IOC32_SETVERSION
-
-struct reiserfs_journal_list;
-
-/* bitmasks for i_flags field in reiserfs-specific part of inode */
-typedef enum {
-	/*
-	 * this says what format of key do all items (but stat data) of
-	 * an object have.  If this is set, that format is 3.6 otherwise - 3.5
-	 */
-	i_item_key_version_mask = 0x0001,
-
-	/*
-	 * If this is unset, object has 3.5 stat data, otherwise,
-	 * it has 3.6 stat data with 64bit size, 32bit nlink etc.
-	 */
-	i_stat_data_version_mask = 0x0002,
-
-	/* file might need tail packing on close */
-	i_pack_on_close_mask = 0x0004,
-
-	/* don't pack tail of file */
-	i_nopack_mask = 0x0008,
-
-	/*
-	 * If either of these are set, "safe link" was created for this
-	 * file during truncate or unlink. Safe link is used to avoid
-	 * leakage of disk space on crash with some files open, but unlinked.
-	 */
-	i_link_saved_unlink_mask = 0x0010,
-	i_link_saved_truncate_mask = 0x0020,
-
-	i_has_xattr_dir = 0x0040,
-	i_data_log = 0x0080,
-} reiserfs_inode_flags;
-
-struct reiserfs_inode_info {
-	__u32 i_key[4];		/* key is still 4 32 bit integers */
-
-	/*
-	 * transient inode flags that are never stored on disk. Bitmasks
-	 * for this field are defined above.
-	 */
-	__u32 i_flags;
-
-	/* offset of first byte stored in direct item. */
-	__u32 i_first_direct_byte;
-
-	/* copy of persistent inode flags read from sd_attrs. */
-	__u32 i_attrs;
-
-	/* first unused block of a sequence of unused blocks */
-	int i_prealloc_block;
-	int i_prealloc_count;	/* length of that sequence */
-
-	/* per-transaction list of inodes which  have preallocated blocks */
-	struct list_head i_prealloc_list;
-
-	/*
-	 * new_packing_locality is created; new blocks for the contents
-	 * of this directory should be displaced
-	 */
-	unsigned new_packing_locality:1;
-
-	/*
-	 * we use these for fsync or O_SYNC to decide which transaction
-	 * needs to be committed in order for this inode to be properly
-	 * flushed
-	 */
-	unsigned int i_trans_id;
-
-	struct reiserfs_journal_list *i_jl;
-	atomic_t openers;
-	struct mutex tailpack;
-#ifdef CONFIG_REISERFS_FS_XATTR
-	struct rw_semaphore i_xattr_sem;
-#endif
-#ifdef CONFIG_QUOTA
-	struct dquot __rcu *i_dquot[MAXQUOTAS];
-#endif
-
-	struct inode vfs_inode;
-};
-
-typedef enum {
-	reiserfs_attrs_cleared = 0x00000001,
-} reiserfs_super_block_flags;
-
-/*
- * struct reiserfs_super_block accessors/mutators since this is a disk
- * structure, it will always be in little endian format.
- */
-#define sb_block_count(sbp)         (le32_to_cpu((sbp)->s_v1.s_block_count))
-#define set_sb_block_count(sbp,v)   ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
-#define sb_free_blocks(sbp)         (le32_to_cpu((sbp)->s_v1.s_free_blocks))
-#define set_sb_free_blocks(sbp,v)   ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v))
-#define sb_root_block(sbp)          (le32_to_cpu((sbp)->s_v1.s_root_block))
-#define set_sb_root_block(sbp,v)    ((sbp)->s_v1.s_root_block = cpu_to_le32(v))
-
-#define sb_jp_journal_1st_block(sbp)  \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block))
-#define set_sb_jp_journal_1st_block(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v))
-#define sb_jp_journal_dev(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev))
-#define set_sb_jp_journal_dev(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v))
-#define sb_jp_journal_size(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size))
-#define set_sb_jp_journal_size(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v))
-#define sb_jp_journal_trans_max(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max))
-#define set_sb_jp_journal_trans_max(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v))
-#define sb_jp_journal_magic(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic))
-#define set_sb_jp_journal_magic(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v))
-#define sb_jp_journal_max_batch(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch))
-#define set_sb_jp_journal_max_batch(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v))
-#define sb_jp_jourmal_max_commit_age(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age))
-#define set_sb_jp_journal_max_commit_age(sbp,v) \
-              ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v))
-
-#define sb_blocksize(sbp)          (le16_to_cpu((sbp)->s_v1.s_blocksize))
-#define set_sb_blocksize(sbp,v)    ((sbp)->s_v1.s_blocksize = cpu_to_le16(v))
-#define sb_oid_maxsize(sbp)        (le16_to_cpu((sbp)->s_v1.s_oid_maxsize))
-#define set_sb_oid_maxsize(sbp,v)  ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v))
-#define sb_oid_cursize(sbp)        (le16_to_cpu((sbp)->s_v1.s_oid_cursize))
-#define set_sb_oid_cursize(sbp,v)  ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v))
-#define sb_umount_state(sbp)       (le16_to_cpu((sbp)->s_v1.s_umount_state))
-#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v))
-#define sb_fs_state(sbp)           (le16_to_cpu((sbp)->s_v1.s_fs_state))
-#define set_sb_fs_state(sbp,v)     ((sbp)->s_v1.s_fs_state = cpu_to_le16(v))
-#define sb_hash_function_code(sbp) \
-              (le32_to_cpu((sbp)->s_v1.s_hash_function_code))
-#define set_sb_hash_function_code(sbp,v) \
-              ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v))
-#define sb_tree_height(sbp)        (le16_to_cpu((sbp)->s_v1.s_tree_height))
-#define set_sb_tree_height(sbp,v)  ((sbp)->s_v1.s_tree_height = cpu_to_le16(v))
-#define sb_bmap_nr(sbp)            (le16_to_cpu((sbp)->s_v1.s_bmap_nr))
-#define set_sb_bmap_nr(sbp,v)      ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v))
-#define sb_version(sbp)            (le16_to_cpu((sbp)->s_v1.s_version))
-#define set_sb_version(sbp,v)      ((sbp)->s_v1.s_version = cpu_to_le16(v))
-
-#define sb_mnt_count(sbp)	   (le16_to_cpu((sbp)->s_mnt_count))
-#define set_sb_mnt_count(sbp, v)   ((sbp)->s_mnt_count = cpu_to_le16(v))
-
-#define sb_reserved_for_journal(sbp) \
-              (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal))
-#define set_sb_reserved_for_journal(sbp,v) \
-              ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v))
-
-/* LOGGING -- */
-
-/*
- * These all interelate for performance.
- *
- * If the journal block count is smaller than n transactions, you lose speed.
- * I don't know what n is yet, I'm guessing 8-16.
- *
- * typical transaction size depends on the application, how often fsync is
- * called, and how many metadata blocks you dirty in a 30 second period.
- * The more small files (<16k) you use, the larger your transactions will
- * be.
- *
- * If your journal fills faster than dirty buffers get flushed to disk, it
- * must flush them before allowing the journal to wrap, which slows things
- * down.  If you need high speed meta data updates, the journal should be
- * big enough to prevent wrapping before dirty meta blocks get to disk.
- *
- * If the batch max is smaller than the transaction max, you'll waste space
- * at the end of the journal because journal_end sets the next transaction
- * to start at 0 if the next transaction has any chance of wrapping.
- *
- * The large the batch max age, the better the speed, and the more meta
- * data changes you'll lose after a crash.
- */
-
-/* don't mess with these for a while */
-/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
-#define JOURNAL_BLOCK_SIZE  4096	/* BUG gotta get rid of this */
-#define JOURNAL_MAX_CNODE   1500	/* max cnodes to allocate. */
-#define JOURNAL_HASH_SIZE 8192
-
-/* number of copies of the bitmaps to have floating.  Must be >= 2 */
-#define JOURNAL_NUM_BITMAPS 5
-
-/*
- * One of these for every block in every transaction
- * Each one is in two hash tables.  First, a hash of the current transaction,
- * and after journal_end, a hash of all the in memory transactions.
- * next and prev are used by the current transaction (journal_hash).
- * hnext and hprev are used by journal_list_hash.  If a block is in more
- * than one transaction, the journal_list_hash links it in multiple times.
- * This allows flush_journal_list to remove just the cnode belonging to a
- * given transaction.
- */
-struct reiserfs_journal_cnode {
-	struct buffer_head *bh;	/* real buffer head */
-	struct super_block *sb;	/* dev of real buffer head */
-
-	/* block number of real buffer head, == 0 when buffer on disk */
-	__u32 blocknr;
-
-	unsigned long state;
-
-	/* journal list this cnode lives in */
-	struct reiserfs_journal_list *jlist;
-
-	struct reiserfs_journal_cnode *next;	/* next in transaction list */
-	struct reiserfs_journal_cnode *prev;	/* prev in transaction list */
-	struct reiserfs_journal_cnode *hprev;	/* prev in hash list */
-	struct reiserfs_journal_cnode *hnext;	/* next in hash list */
-};
-
-struct reiserfs_bitmap_node {
-	int id;
-	char *data;
-	struct list_head list;
-};
-
-struct reiserfs_list_bitmap {
-	struct reiserfs_journal_list *journal_list;
-	struct reiserfs_bitmap_node **bitmaps;
-};
-
-/*
- * one of these for each transaction.  The most important part here is the
- * j_realblock.  this list of cnodes is used to hash all the blocks in all
- * the commits, to mark all the real buffer heads dirty once all the commits
- * hit the disk, and to make sure every real block in a transaction is on
- * disk before allowing the log area to be overwritten
- */
-struct reiserfs_journal_list {
-	unsigned long j_start;
-	unsigned long j_state;
-	unsigned long j_len;
-	atomic_t j_nonzerolen;
-	atomic_t j_commit_left;
-
-	/* all commits older than this on disk */
-	atomic_t j_older_commits_done;
-
-	struct mutex j_commit_mutex;
-	unsigned int j_trans_id;
-	time64_t j_timestamp; /* write-only but useful for crash dump analysis */
-	struct reiserfs_list_bitmap *j_list_bitmap;
-	struct buffer_head *j_commit_bh;	/* commit buffer head */
-	struct reiserfs_journal_cnode *j_realblock;
-	struct reiserfs_journal_cnode *j_freedlist;	/* list of buffers that were freed during this trans.  free each of these on flush */
-	/* time ordered list of all active transactions */
-	struct list_head j_list;
-
-	/*
-	 * time ordered list of all transactions we haven't tried
-	 * to flush yet
-	 */
-	struct list_head j_working_list;
-
-	/* list of tail conversion targets in need of flush before commit */
-	struct list_head j_tail_bh_list;
-
-	/* list of data=ordered buffers in need of flush before commit */
-	struct list_head j_bh_list;
-	int j_refcount;
-};
-
-struct reiserfs_journal {
-	struct buffer_head **j_ap_blocks;	/* journal blocks on disk */
-	/* newest journal block */
-	struct reiserfs_journal_cnode *j_last;
-
-	/* oldest journal block.  start here for traverse */
-	struct reiserfs_journal_cnode *j_first;
-
-	struct file *j_bdev_file;
-
-	/* first block on s_dev of reserved area journal */
-	int j_1st_reserved_block;
-
-	unsigned long j_state;
-	unsigned int j_trans_id;
-	unsigned long j_mount_id;
-
-	/* start of current waiting commit (index into j_ap_blocks) */
-	unsigned long j_start;
-	unsigned long j_len;	/* length of current waiting commit */
-
-	/* number of buffers requested by journal_begin() */
-	unsigned long j_len_alloc;
-
-	atomic_t j_wcount;	/* count of writers for current commit */
-
-	/* batch count. allows turning X transactions into 1 */
-	unsigned long j_bcount;
-
-	/* first unflushed transactions offset */
-	unsigned long j_first_unflushed_offset;
-
-	/* last fully flushed journal timestamp */
-	unsigned j_last_flush_trans_id;
-
-	struct buffer_head *j_header_bh;
-
-	time64_t j_trans_start_time;	/* time this transaction started */
-	struct mutex j_mutex;
-	struct mutex j_flush_mutex;
-
-	/* wait for current transaction to finish before starting new one */
-	wait_queue_head_t j_join_wait;
-
-	atomic_t j_jlock;		/* lock for j_join_wait */
-	int j_list_bitmap_index;	/* number of next list bitmap to use */
-
-	/* no more journal begins allowed. MUST sleep on j_join_wait */
-	int j_must_wait;
-
-	/* next journal_end will flush all journal list */
-	int j_next_full_flush;
-
-	/* next journal_end will flush all async commits */
-	int j_next_async_flush;
-
-	int j_cnode_used;	/* number of cnodes on the used list */
-	int j_cnode_free;	/* number of cnodes on the free list */
-
-	/* max number of blocks in a transaction.  */
-	unsigned int j_trans_max;
-
-	/* max number of blocks to batch into a trans */
-	unsigned int j_max_batch;
-
-	/* in seconds, how old can an async commit be */
-	unsigned int j_max_commit_age;
-
-	/* in seconds, how old can a transaction be */
-	unsigned int j_max_trans_age;
-
-	/* the default for the max commit age */
-	unsigned int j_default_max_commit_age;
-
-	struct reiserfs_journal_cnode *j_cnode_free_list;
-
-	/* orig pointer returned from vmalloc */
-	struct reiserfs_journal_cnode *j_cnode_free_orig;
-
-	struct reiserfs_journal_list *j_current_jl;
-	int j_free_bitmap_nodes;
-	int j_used_bitmap_nodes;
-
-	int j_num_lists;	/* total number of active transactions */
-	int j_num_work_lists;	/* number that need attention from kreiserfsd */
-
-	/* debugging to make sure things are flushed in order */
-	unsigned int j_last_flush_id;
-
-	/* debugging to make sure things are committed in order */
-	unsigned int j_last_commit_id;
-
-	struct list_head j_bitmap_nodes;
-	struct list_head j_dirty_buffers;
-	spinlock_t j_dirty_buffers_lock;	/* protects j_dirty_buffers */
-
-	/* list of all active transactions */
-	struct list_head j_journal_list;
-
-	/* lists that haven't been touched by writeback attempts */
-	struct list_head j_working_list;
-
-	/* hash table for real buffer heads in current trans */
-	struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];
-
-	/* hash table for all the real buffer heads in all the transactions */
-	struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];
-
-	/* array of bitmaps to record the deleted blocks */
-	struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];
-
-	/* list of inodes which have preallocated blocks */
-	struct list_head j_prealloc_list;
-	int j_persistent_trans;
-	unsigned long j_max_trans_size;
-	unsigned long j_max_batch_size;
-
-	int j_errno;
-
-	/* when flushing ordered buffers, throttle new ordered writers */
-	struct delayed_work j_work;
-	struct super_block *j_work_sb;
-	atomic_t j_async_throttle;
-};
-
-enum journal_state_bits {
-	J_WRITERS_BLOCKED = 1,	/* set when new writers not allowed */
-	J_WRITERS_QUEUED,    /* set when log is full due to too many writers */
-	J_ABORTED,           /* set when log is aborted */
-};
-
-/* ick.  magic string to find desc blocks in the journal */
-#define JOURNAL_DESC_MAGIC "ReIsErLB"
-
-typedef __u32(*hashf_t) (const signed char *, int);
-
-struct reiserfs_bitmap_info {
-	__u32 free_count;
-};
-
-struct proc_dir_entry;
-
-#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
-typedef unsigned long int stat_cnt_t;
-typedef struct reiserfs_proc_info_data {
-	spinlock_t lock;
-	int exiting;
-	int max_hash_collisions;
-
-	stat_cnt_t breads;
-	stat_cnt_t bread_miss;
-	stat_cnt_t search_by_key;
-	stat_cnt_t search_by_key_fs_changed;
-	stat_cnt_t search_by_key_restarted;
-
-	stat_cnt_t insert_item_restarted;
-	stat_cnt_t paste_into_item_restarted;
-	stat_cnt_t cut_from_item_restarted;
-	stat_cnt_t delete_solid_item_restarted;
-	stat_cnt_t delete_item_restarted;
-
-	stat_cnt_t leaked_oid;
-	stat_cnt_t leaves_removable;
-
-	/*
-	 * balances per level.
-	 * Use explicit 5 as MAX_HEIGHT is not visible yet.
-	 */
-	stat_cnt_t balance_at[5];	/* XXX */
-	/* sbk == search_by_key */
-	stat_cnt_t sbk_read_at[5];	/* XXX */
-	stat_cnt_t sbk_fs_changed[5];
-	stat_cnt_t sbk_restarted[5];
-	stat_cnt_t items_at[5];	/* XXX */
-	stat_cnt_t free_at[5];	/* XXX */
-	stat_cnt_t can_node_be_removed[5];	/* XXX */
-	long int lnum[5];	/* XXX */
-	long int rnum[5];	/* XXX */
-	long int lbytes[5];	/* XXX */
-	long int rbytes[5];	/* XXX */
-	stat_cnt_t get_neighbors[5];
-	stat_cnt_t get_neighbors_restart[5];
-	stat_cnt_t need_l_neighbor[5];
-	stat_cnt_t need_r_neighbor[5];
-
-	stat_cnt_t free_block;
-	struct __scan_bitmap_stats {
-		stat_cnt_t call;
-		stat_cnt_t wait;
-		stat_cnt_t bmap;
-		stat_cnt_t retry;
-		stat_cnt_t in_journal_hint;
-		stat_cnt_t in_journal_nohint;
-		stat_cnt_t stolen;
-	} scan_bitmap;
-	struct __journal_stats {
-		stat_cnt_t in_journal;
-		stat_cnt_t in_journal_bitmap;
-		stat_cnt_t in_journal_reusable;
-		stat_cnt_t lock_journal;
-		stat_cnt_t lock_journal_wait;
-		stat_cnt_t journal_being;
-		stat_cnt_t journal_relock_writers;
-		stat_cnt_t journal_relock_wcount;
-		stat_cnt_t mark_dirty;
-		stat_cnt_t mark_dirty_already;
-		stat_cnt_t mark_dirty_notjournal;
-		stat_cnt_t restore_prepared;
-		stat_cnt_t prepare;
-		stat_cnt_t prepare_retry;
-	} journal;
-} reiserfs_proc_info_data_t;
-#else
-typedef struct reiserfs_proc_info_data {
-} reiserfs_proc_info_data_t;
-#endif
-
-/* Number of quota types we support */
-#define REISERFS_MAXQUOTAS 2
-
-/* reiserfs union of in-core super block data */
-struct reiserfs_sb_info {
-	/* Buffer containing the super block */
-	struct buffer_head *s_sbh;
-
-	/* Pointer to the on-disk super block in the buffer */
-	struct reiserfs_super_block *s_rs;
-	struct reiserfs_bitmap_info *s_ap_bitmap;
-
-	/* pointer to journal information */
-	struct reiserfs_journal *s_journal;
-
-	unsigned short s_mount_state;	/* reiserfs state (valid, invalid) */
-
-	/* Serialize writers access, replace the old bkl */
-	struct mutex lock;
-
-	/* Owner of the lock (can be recursive) */
-	struct task_struct *lock_owner;
-
-	/* Depth of the lock, start from -1 like the bkl */
-	int lock_depth;
-
-	struct workqueue_struct *commit_wq;
-
-	/* Comment? -Hans */
-	void (*end_io_handler) (struct buffer_head *, int);
-
-	/*
-	 * pointer to function which is used to sort names in directory.
-	 * Set on mount
-	 */
-	hashf_t s_hash_function;
-
-	/* reiserfs's mount options are set here */
-	unsigned long s_mount_opt;
-
-	/* This is a structure that describes block allocator options */
-	struct {
-		/* Bitfield for enable/disable kind of options */
-		unsigned long bits;
-
-		/*
-		 * size started from which we consider file
-		 * to be a large one (in blocks)
-		 */
-		unsigned long large_file_size;
-
-		int border;	/* percentage of disk, border takes */
-
-		/*
-		 * Minimal file size (in blocks) starting
-		 * from which we do preallocations
-		 */
-		int preallocmin;
-
-		/*
-		 * Number of blocks we try to prealloc when file
-		 * reaches preallocmin size (in blocks) or prealloc_list
-		 is empty.
-		 */
-		int preallocsize;
-	} s_alloc_options;
-
-	/* Comment? -Hans */
-	wait_queue_head_t s_wait;
-	/* increased by one every time the  tree gets re-balanced */
-	atomic_t s_generation_counter;
-
-	/* File system properties. Currently holds on-disk FS format */
-	unsigned long s_properties;
-
-	/* session statistics */
-	int s_disk_reads;
-	int s_disk_writes;
-	int s_fix_nodes;
-	int s_do_balance;
-	int s_unneeded_left_neighbor;
-	int s_good_search_by_key_reada;
-	int s_bmaps;
-	int s_bmaps_without_search;
-	int s_direct2indirect;
-	int s_indirect2direct;
-
-	/*
-	 * set up when it's ok for reiserfs_read_inode2() to read from
-	 * disk inode with nlink==0. Currently this is only used during
-	 * finish_unfinished() processing at mount time
-	 */
-	int s_is_unlinked_ok;
-
-	reiserfs_proc_info_data_t s_proc_info_data;
-	struct proc_dir_entry *procdir;
-
-	/* amount of blocks reserved for further allocations */
-	int reserved_blocks;
-
-
-	/* this lock on now only used to protect reserved_blocks variable */
-	spinlock_t bitmap_lock;
-	struct dentry *priv_root;	/* root of /.reiserfs_priv */
-	struct dentry *xattr_root;	/* root of /.reiserfs_priv/xattrs */
-	int j_errno;
-
-	int work_queued;              /* non-zero delayed work is queued */
-	struct delayed_work old_work; /* old transactions flush delayed work */
-	spinlock_t old_work_lock;     /* protects old_work and work_queued */
-
-#ifdef CONFIG_QUOTA
-	char *s_qf_names[REISERFS_MAXQUOTAS];
-	int s_jquota_fmt;
-#endif
-	char *s_jdev;		/* Stored jdev for mount option showing */
-#ifdef CONFIG_REISERFS_CHECK
-
-	/*
-	 * Detects whether more than one copy of tb exists per superblock
-	 * as a means of checking whether do_balance is executing
-	 * concurrently against another tree reader/writer on a same
-	 * mount point.
-	 */
-	struct tree_balance *cur_tb;
-#endif
-};
-
-/* Definitions of reiserfs on-disk properties: */
-#define REISERFS_3_5 0
-#define REISERFS_3_6 1
-#define REISERFS_OLD_FORMAT 2
-
-/* Mount options */
-enum reiserfs_mount_options {
-	/* large tails will be created in a session */
-	REISERFS_LARGETAIL,
-	/*
-	 * small (for files less than block size) tails will
-	 * be created in a session
-	 */
-	REISERFS_SMALLTAIL,
-
-	/* replay journal and return 0. Use by fsck */
-	REPLAYONLY,
-
-	/*
-	 * -o conv: causes conversion of old format super block to the
-	 * new format. If not specified - old partition will be dealt
-	 * with in a manner of 3.5.x
-	 */
-	REISERFS_CONVERT,
-
-	/*
-	 * -o hash={tea, rupasov, r5, detect} is meant for properly mounting
-	 * reiserfs disks from 3.5.19 or earlier.  99% of the time, this
-	 * option is not required.  If the normal autodection code can't
-	 * determine which hash to use (because both hashes had the same
-	 * value for a file) use this option to force a specific hash.
-	 * It won't allow you to override the existing hash on the FS, so
-	 * if you have a tea hash disk, and mount with -o hash=rupasov,
-	 * the mount will fail.
-	 */
-	FORCE_TEA_HASH,		/* try to force tea hash on mount */
-	FORCE_RUPASOV_HASH,	/* try to force rupasov hash on mount */
-	FORCE_R5_HASH,		/* try to force rupasov hash on mount */
-	FORCE_HASH_DETECT,	/* try to detect hash function on mount */
-
-	REISERFS_DATA_LOG,
-	REISERFS_DATA_ORDERED,
-	REISERFS_DATA_WRITEBACK,
-
-	/*
-	 * used for testing experimental features, makes benchmarking new
-	 * features with and without more convenient, should never be used by
-	 * users in any code shipped to users (ideally)
-	 */
-
-	REISERFS_NO_BORDER,
-	REISERFS_NO_UNHASHED_RELOCATION,
-	REISERFS_HASHED_RELOCATION,
-	REISERFS_ATTRS,
-	REISERFS_XATTRS_USER,
-	REISERFS_POSIXACL,
-	REISERFS_EXPOSE_PRIVROOT,
-	REISERFS_BARRIER_NONE,
-	REISERFS_BARRIER_FLUSH,
-
-	/* Actions on error */
-	REISERFS_ERROR_PANIC,
-	REISERFS_ERROR_RO,
-	REISERFS_ERROR_CONTINUE,
-
-	REISERFS_USRQUOTA,	/* User quota option specified */
-	REISERFS_GRPQUOTA,	/* Group quota option specified */
-
-	REISERFS_TEST1,
-	REISERFS_TEST2,
-	REISERFS_TEST3,
-	REISERFS_TEST4,
-	REISERFS_UNSUPPORTED_OPT,
-};
-
-#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
-#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
-#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH))
-#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT))
-#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER))
-#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
-#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
-#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4))
-
-#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
-#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
-#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
-#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
-#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
-#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
-#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
-#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
-#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
-#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
-#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
-#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
-#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
-#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
-#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
-
-#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC))
-#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO))
-
-void reiserfs_file_buffer(struct buffer_head *bh, int list);
-extern struct file_system_type reiserfs_fs_type;
-int reiserfs_resize(struct super_block *, unsigned long);
-
-#define CARRY_ON                0
-#define SCHEDULE_OCCURRED       1
-
-#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
-#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
-#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
-#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
-
-#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->)
-
-#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
-static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
-						*journal)
-{
-	return test_bit(J_ABORTED, &journal->j_state);
-}
-
-/*
- * Locking primitives. The write lock is a per superblock
- * special mutex that has properties close to the Big Kernel Lock
- * which was used in the previous locking scheme.
- */
-void reiserfs_write_lock(struct super_block *s);
-void reiserfs_write_unlock(struct super_block *s);
-int __must_check reiserfs_write_unlock_nested(struct super_block *s);
-void reiserfs_write_lock_nested(struct super_block *s, int depth);
-
-#ifdef CONFIG_REISERFS_CHECK
-void reiserfs_lock_check_recursive(struct super_block *s);
-#else
-static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
-#endif
-
-/*
- * Several mutexes depend on the write lock.
- * However sometimes we want to relax the write lock while we hold
- * these mutexes, according to the release/reacquire on schedule()
- * properties of the Bkl that were used.
- * Reiserfs performances and locking were based on this scheme.
- * Now that the write lock is a mutex and not the bkl anymore, doing so
- * may result in a deadlock:
- *
- * A acquire write_lock
- * A acquire j_commit_mutex
- * A release write_lock and wait for something
- * B acquire write_lock
- * B can't acquire j_commit_mutex and sleep
- * A can't acquire write lock anymore
- * deadlock
- *
- * What we do here is avoiding such deadlock by playing the same game
- * than the Bkl: if we can't acquire a mutex that depends on the write lock,
- * we release the write lock, wait a bit and then retry.
- *
- * The mutexes concerned by this hack are:
- * - The commit mutex of a journal list
- * - The flush mutex
- * - The journal lock
- * - The inode mutex
- */
-static inline void reiserfs_mutex_lock_safe(struct mutex *m,
-					    struct super_block *s)
-{
-	int depth;
-
-	depth = reiserfs_write_unlock_nested(s);
-	mutex_lock(m);
-	reiserfs_write_lock_nested(s, depth);
-}
-
-static inline void
-reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
-				struct super_block *s)
-{
-	int depth;
-
-	depth = reiserfs_write_unlock_nested(s);
-	mutex_lock_nested(m, subclass);
-	reiserfs_write_lock_nested(s, depth);
-}
-
-static inline void
-reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
-{
-       int depth;
-       depth = reiserfs_write_unlock_nested(s);
-       down_read(sem);
-       reiserfs_write_lock_nested(s, depth);
-}
-
-/*
- * When we schedule, we usually want to also release the write lock,
- * according to the previous bkl based locking scheme of reiserfs.
- */
-static inline void reiserfs_cond_resched(struct super_block *s)
-{
-	if (need_resched()) {
-		int depth;
-
-		depth = reiserfs_write_unlock_nested(s);
-		schedule();
-		reiserfs_write_lock_nested(s, depth);
-	}
-}
-
-struct fid;
-
-/*
- * in reading the #defines, it may help to understand that they employ
- *  the following abbreviations:
- *
- *  B = Buffer
- *  I = Item header
- *  H = Height within the tree (should be changed to LEV)
- *  N = Number of the item in the node
- *  STAT = stat data
- *  DEH = Directory Entry Header
- *  EC = Entry Count
- *  E = Entry number
- *  UL = Unsigned Long
- *  BLKH = BLocK Header
- *  UNFM = UNForMatted node
- *  DC = Disk Child
- *  P = Path
- *
- *  These #defines are named by concatenating these abbreviations,
- *  where first comes the arguments, and last comes the return value,
- *  of the macro.
- */
-
-#define USE_INODE_GENERATION_COUNTER
-
-#define REISERFS_PREALLOCATE
-#define DISPLACE_NEW_PACKING_LOCALITIES
-#define PREALLOCATION_SIZE 9
-
-/* n must be power of 2 */
-#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
-
-/*
- * to be ok for alpha and others we have to align structures to 8 byte
- * boundary.
- * FIXME: do not change 4 by anything else: there is code which relies on that
- */
-#define ROUND_UP(x) _ROUND_UP(x,8LL)
-
-/*
- * debug levels.  Right now, CONFIG_REISERFS_CHECK means print all debug
- * messages.
- */
-#define REISERFS_DEBUG_CODE 5	/* extra messages to help find/debug errors */
-
-void __reiserfs_warning(struct super_block *s, const char *id,
-			 const char *func, const char *fmt, ...);
-#define reiserfs_warning(s, id, fmt, args...) \
-	 __reiserfs_warning(s, id, __func__, fmt, ##args)
-/* assertions handling */
-
-/* always check a condition and panic if it's false. */
-#define __RASSERT(cond, scond, format, args...)			\
-do {									\
-	if (!(cond))							\
-		reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \
-			       __FILE__ ":%i:%s: " format "\n",		\
-			       __LINE__, __func__ , ##args);		\
-} while (0)
-
-#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args)
-
-#if defined( CONFIG_REISERFS_CHECK )
-#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args)
-#else
-#define RFALSE( cond, format, args... ) do {;} while( 0 )
-#endif
-
-#define CONSTF __attribute_const__
-/*
- * Disk Data Structures
- */
-
-/***************************************************************************
- *                             SUPER BLOCK                                 *
- ***************************************************************************/
-
-/*
- * Structure of super block on disk, a version of which in RAM is often
- * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger
- * structure containing fields never written to disk.
- */
-#define UNSET_HASH 0	/* Detect hash on disk */
-#define TEA_HASH  1
-#define YURA_HASH 2
-#define R5_HASH   3
-#define DEFAULT_HASH R5_HASH
-
-struct journal_params {
-	/* where does journal start from on its * device */
-	__le32 jp_journal_1st_block;
-
-	/* journal device st_rdev */
-	__le32 jp_journal_dev;
-
-	/* size of the journal */
-	__le32 jp_journal_size;
-
-	/* max number of blocks in a transaction. */
-	__le32 jp_journal_trans_max;
-
-	/*
-	 * random value made on fs creation
-	 * (this was sb_journal_block_count)
-	 */
-	__le32 jp_journal_magic;
-
-	/* max number of blocks to batch into a trans */
-	__le32 jp_journal_max_batch;
-
-	/* in seconds, how old can an async  commit be */
-	__le32 jp_journal_max_commit_age;
-
-	/* in seconds, how old can a transaction be */
-	__le32 jp_journal_max_trans_age;
-};
-
-/* this is the super from 3.5.X, where X >= 10 */
-struct reiserfs_super_block_v1 {
-	__le32 s_block_count;	/* blocks count         */
-	__le32 s_free_blocks;	/* free blocks count    */
-	__le32 s_root_block;	/* root block number    */
-	struct journal_params s_journal;
-	__le16 s_blocksize;	/* block size */
-
-	/* max size of object id array, see get_objectid() commentary  */
-	__le16 s_oid_maxsize;
-	__le16 s_oid_cursize;	/* current size of object id array */
-
-	/* this is set to 1 when filesystem was umounted, to 2 - when not */
-	__le16 s_umount_state;
-
-	/*
-	 * reiserfs magic string indicates that file system is reiserfs:
-	 * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs"
-	 */
-	char s_magic[10];
-
-	/*
-	 * it is set to used by fsck to mark which
-	 * phase of rebuilding is done
-	 */
-	__le16 s_fs_state;
-	/*
-	 * indicate, what hash function is being use
-	 * to sort names in a directory
-	 */
-	__le32 s_hash_function_code;
-	__le16 s_tree_height;	/* height of disk tree */
-
-	/*
-	 * amount of bitmap blocks needed to address
-	 * each block of file system
-	 */
-	__le16 s_bmap_nr;
-
-	/*
-	 * this field is only reliable on filesystem with non-standard journal
-	 */
-	__le16 s_version;
-
-	/*
-	 * size in blocks of journal area on main device, we need to
-	 * keep after making fs with non-standard journal
-	 */
-	__le16 s_reserved_for_journal;
-} __attribute__ ((__packed__));
-
-#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
-
-/* this is the on disk super block */
-struct reiserfs_super_block {
-	struct reiserfs_super_block_v1 s_v1;
-	__le32 s_inode_generation;
-
-	/* Right now used only by inode-attributes, if enabled */
-	__le32 s_flags;
-
-	unsigned char s_uuid[16];	/* filesystem unique identifier */
-	unsigned char s_label[16];	/* filesystem volume label */
-	__le16 s_mnt_count;		/* Count of mounts since last fsck */
-	__le16 s_max_mnt_count;		/* Maximum mounts before check */
-	__le32 s_lastcheck;		/* Timestamp of last fsck */
-	__le32 s_check_interval;	/* Interval between checks */
-
-	/*
-	 * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1()
-	 * so any additions must be updated there as well. */
-	char s_unused[76];
-} __attribute__ ((__packed__));
-
-#define SB_SIZE (sizeof(struct reiserfs_super_block))
-
-#define REISERFS_VERSION_1 0
-#define REISERFS_VERSION_2 2
-
-/* on-disk super block fields converted to cpu form */
-#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
-#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
-#define SB_BLOCKSIZE(s) \
-        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize))
-#define SB_BLOCK_COUNT(s) \
-        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count))
-#define SB_FREE_BLOCKS(s) \
-        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks))
-#define SB_REISERFS_MAGIC(s) \
-        (SB_V1_DISK_SUPER_BLOCK(s)->s_magic)
-#define SB_ROOT_BLOCK(s) \
-        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block))
-#define SB_TREE_HEIGHT(s) \
-        le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height))
-#define SB_REISERFS_STATE(s) \
-        le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state))
-#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version))
-#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr))
-
-#define PUT_SB_BLOCK_COUNT(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0)
-#define PUT_SB_FREE_BLOCKS(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0)
-#define PUT_SB_ROOT_BLOCK(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0)
-#define PUT_SB_TREE_HEIGHT(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0)
-#define PUT_SB_REISERFS_STATE(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0)
-#define PUT_SB_VERSION(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0)
-#define PUT_SB_BMAP_NR(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0)
-
-#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal)
-#define SB_ONDISK_JOURNAL_SIZE(s) \
-         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size))
-#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \
-         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block))
-#define SB_ONDISK_JOURNAL_DEVICE(s) \
-         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev))
-#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \
-         le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal))
-
-#define is_block_in_log_or_reserved_area(s, block) \
-         block >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \
-         && block < SB_JOURNAL_1st_RESERVED_BLOCK(s) +  \
-         ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \
-         SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s)))
-
-int is_reiserfs_3_5(struct reiserfs_super_block *rs);
-int is_reiserfs_3_6(struct reiserfs_super_block *rs);
-int is_reiserfs_jr(struct reiserfs_super_block *rs);
-
-/*
- * ReiserFS leaves the first 64k unused, so that partition labels have
- * enough space.  If someone wants to write a fancy bootloader that
- * needs more than 64k, let us know, and this will be increased in size.
- * This number must be larger than the largest block size on any
- * platform, or code will break.  -Hans
- */
-#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
-#define REISERFS_FIRST_BLOCK unused_define
-#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
-
-/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */
-#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024)
-
-/* reiserfs internal error code (used by search_by_key and fix_nodes)) */
-#define CARRY_ON      0
-#define REPEAT_SEARCH -1
-#define IO_ERROR      -2
-#define NO_DISK_SPACE -3
-#define NO_BALANCING_NEEDED  (-4)
-#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
-#define QUOTA_EXCEEDED -6
-
-typedef __u32 b_blocknr_t;
-typedef __le32 unp_t;
-
-struct unfm_nodeinfo {
-	unp_t unfm_nodenum;
-	unsigned short unfm_freespace;
-};
-
-/* there are two formats of keys: 3.5 and 3.6 */
-#define KEY_FORMAT_3_5 0
-#define KEY_FORMAT_3_6 1
-
-/* there are two stat datas */
-#define STAT_DATA_V1 0
-#define STAT_DATA_V2 1
-
-static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
-{
-	return container_of(inode, struct reiserfs_inode_info, vfs_inode);
-}
-
-static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-/*
- * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
- * which overflows on large file systems.
- */
-static inline __u32 reiserfs_bmap_count(struct super_block *sb)
-{
-	return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
-}
-
-static inline int bmap_would_wrap(unsigned bmap_nr)
-{
-	return bmap_nr > ((1LL << 16) - 1);
-}
-
-extern const struct xattr_handler * const reiserfs_xattr_handlers[];
-
-/*
- * this says about version of key of all items (but stat data) the
- * object consists of
- */
-#define get_inode_item_key_version( inode )                                    \
-    ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
-
-#define set_inode_item_key_version( inode, version )                           \
-         ({ if((version)==KEY_FORMAT_3_6)                                      \
-                REISERFS_I(inode)->i_flags |= i_item_key_version_mask;      \
-            else                                                               \
-                REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; })
-
-#define get_inode_sd_version(inode)                                            \
-    ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1)
-
-#define set_inode_sd_version(inode, version)                                   \
-         ({ if((version)==STAT_DATA_V2)                                        \
-                REISERFS_I(inode)->i_flags |= i_stat_data_version_mask;     \
-            else                                                               \
-                REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
-
-/*
- * This is an aggressive tail suppression policy, I am hoping it
- * improves our benchmarks. The principle behind it is that percentage
- * space saving is what matters, not absolute space saving.  This is
- * non-intuitive, but it helps to understand it if you consider that the
- * cost to access 4 blocks is not much more than the cost to access 1
- * block, if you have to do a seek and rotate.  A tail risks a
- * non-linear disk access that is significant as a percentage of total
- * time cost for a 4 block file and saves an amount of space that is
- * less significant as a percentage of space, or so goes the hypothesis.
- * -Hans
- */
-#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
-(\
-  (!(n_tail_size)) || \
-  (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \
-   ( (n_file_size) >= (n_block_size) * 4 ) || \
-   ( ( (n_file_size) >= (n_block_size) * 3 ) && \
-     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \
-   ( ( (n_file_size) >= (n_block_size) * 2 ) && \
-     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \
-   ( ( (n_file_size) >= (n_block_size) ) && \
-     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
-)
-
-/*
- * Another strategy for tails, this one means only create a tail if all the
- * file would fit into one DIRECT item.
- * Primary intention for this one is to increase performance by decreasing
- * seeking.
-*/
-#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
-(\
-  (!(n_tail_size)) || \
-  (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \
-)
-
-/*
- * values for s_umount_state field
- */
-#define REISERFS_VALID_FS    1
-#define REISERFS_ERROR_FS    2
-
-/*
- * there are 5 item types currently
- */
-#define TYPE_STAT_DATA 0
-#define TYPE_INDIRECT 1
-#define TYPE_DIRECT 2
-#define TYPE_DIRENTRY 3
-#define TYPE_MAXTYPE 3
-#define TYPE_ANY 15		/* FIXME: comment is required */
-
-/***************************************************************************
- *                       KEY & ITEM HEAD                                   *
- ***************************************************************************/
-
-/* * directories use this key as well as old files */
-struct offset_v1 {
-	__le32 k_offset;
-	__le32 k_uniqueness;
-} __attribute__ ((__packed__));
-
-struct offset_v2 {
-	__le64 v;
-} __attribute__ ((__packed__));
-
-static inline __u16 offset_v2_k_type(const struct offset_v2 *v2)
-{
-	__u8 type = le64_to_cpu(v2->v) >> 60;
-	return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY;
-}
-
-static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type)
-{
-	v2->v =
-	    (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60);
-}
-
-static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2)
-{
-	return le64_to_cpu(v2->v) & (~0ULL >> 4);
-}
-
-static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
-{
-	offset &= (~0ULL >> 4);
-	v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
-}
-
-/*
- * Key of an item determines its location in the S+tree, and
- * is composed of 4 components
- */
-struct reiserfs_key {
-	/* packing locality: by default parent directory object id */
-	__le32 k_dir_id;
-
-	__le32 k_objectid;	/* object identifier */
-	union {
-		struct offset_v1 k_offset_v1;
-		struct offset_v2 k_offset_v2;
-	} __attribute__ ((__packed__)) u;
-} __attribute__ ((__packed__));
-
-struct in_core_key {
-	/* packing locality: by default parent directory object id */
-	__u32 k_dir_id;
-	__u32 k_objectid;	/* object identifier */
-	__u64 k_offset;
-	__u8 k_type;
-};
-
-struct cpu_key {
-	struct in_core_key on_disk_key;
-	int version;
-	/* 3 in all cases but direct2indirect and indirect2direct conversion */
-	int key_length;
-};
-
-/*
- * Our function for comparing keys can compare keys of different
- * lengths.  It takes as a parameter the length of the keys it is to
- * compare.  These defines are used in determining what is to be passed
- * to it as that parameter.
- */
-#define REISERFS_FULL_KEY_LEN     4
-#define REISERFS_SHORT_KEY_LEN    2
-
-/* The result of the key compare */
-#define FIRST_GREATER 1
-#define SECOND_GREATER -1
-#define KEYS_IDENTICAL 0
-#define KEY_FOUND 1
-#define KEY_NOT_FOUND 0
-
-#define KEY_SIZE (sizeof(struct reiserfs_key))
-
-/* return values for search_by_key and clones */
-#define ITEM_FOUND 1
-#define ITEM_NOT_FOUND 0
-#define ENTRY_FOUND 1
-#define ENTRY_NOT_FOUND 0
-#define DIRECTORY_NOT_FOUND -1
-#define REGULAR_FILE_FOUND -2
-#define DIRECTORY_FOUND -3
-#define BYTE_FOUND 1
-#define BYTE_NOT_FOUND 0
-#define FILE_NOT_FOUND -1
-
-#define POSITION_FOUND 1
-#define POSITION_NOT_FOUND 0
-
-/* return values for reiserfs_find_entry and search_by_entry_key */
-#define NAME_FOUND 1
-#define NAME_NOT_FOUND 0
-#define GOTO_PREVIOUS_ITEM 2
-#define NAME_FOUND_INVISIBLE 3
-
-/*
- * Everything in the filesystem is stored as a set of items.  The
- * item head contains the key of the item, its free space (for
- * indirect items) and specifies the location of the item itself
- * within the block.
- */
-
-struct item_head {
-	/*
-	 * Everything in the tree is found by searching for it based on
-	 * its key.
-	 */
-	struct reiserfs_key ih_key;
-	union {
-		/*
-		 * The free space in the last unformatted node of an
-		 * indirect item if this is an indirect item.  This
-		 * equals 0xFFFF iff this is a direct item or stat data
-		 * item. Note that the key, not this field, is used to
-		 * determine the item type, and thus which field this
-		 * union contains.
-		 */
-		__le16 ih_free_space_reserved;
-
-		/*
-		 * Iff this is a directory item, this field equals the
-		 * number of directory entries in the directory item.
-		 */
-		__le16 ih_entry_count;
-	} __attribute__ ((__packed__)) u;
-	__le16 ih_item_len;	/* total size of the item body */
-
-	/* an offset to the item body within the block */
-	__le16 ih_item_location;
-
-	/*
-	 * 0 for all old items, 2 for new ones. Highest bit is set by fsck
-	 * temporary, cleaned after all done
-	 */
-	__le16 ih_version;
-} __attribute__ ((__packed__));
-/* size of item header     */
-#define IH_SIZE (sizeof(struct item_head))
-
-#define ih_free_space(ih)            le16_to_cpu((ih)->u.ih_free_space_reserved)
-#define ih_version(ih)               le16_to_cpu((ih)->ih_version)
-#define ih_entry_count(ih)           le16_to_cpu((ih)->u.ih_entry_count)
-#define ih_location(ih)              le16_to_cpu((ih)->ih_item_location)
-#define ih_item_len(ih)              le16_to_cpu((ih)->ih_item_len)
-
-#define put_ih_free_space(ih, val)   do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0)
-#define put_ih_version(ih, val)      do { (ih)->ih_version = cpu_to_le16(val); } while (0)
-#define put_ih_entry_count(ih, val)  do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0)
-#define put_ih_location(ih, val)     do { (ih)->ih_item_location = cpu_to_le16(val); } while (0)
-#define put_ih_item_len(ih, val)     do { (ih)->ih_item_len = cpu_to_le16(val); } while (0)
-
-#define unreachable_item(ih) (ih_version(ih) & (1 << 15))
-
-#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
-#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
-
-/*
- * these operate on indirect items, where you've got an array of ints
- * at a possibly unaligned location.  These are a noop on ia32
- *
- * p is the array of __u32, i is the index into the array, v is the value
- * to store there.
- */
-#define get_block_num(p, i) get_unaligned_le32((p) + (i))
-#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
-
-/* * in old version uniqueness field shows key type */
-#define V1_SD_UNIQUENESS 0
-#define V1_INDIRECT_UNIQUENESS 0xfffffffe
-#define V1_DIRECT_UNIQUENESS 0xffffffff
-#define V1_DIRENTRY_UNIQUENESS 500
-#define V1_ANY_UNIQUENESS 555	/* FIXME: comment is required */
-
-/* here are conversion routines */
-static inline int uniqueness2type(__u32 uniqueness) CONSTF;
-static inline int uniqueness2type(__u32 uniqueness)
-{
-	switch ((int)uniqueness) {
-	case V1_SD_UNIQUENESS:
-		return TYPE_STAT_DATA;
-	case V1_INDIRECT_UNIQUENESS:
-		return TYPE_INDIRECT;
-	case V1_DIRECT_UNIQUENESS:
-		return TYPE_DIRECT;
-	case V1_DIRENTRY_UNIQUENESS:
-		return TYPE_DIRENTRY;
-	case V1_ANY_UNIQUENESS:
-	default:
-		return TYPE_ANY;
-	}
-}
-
-static inline __u32 type2uniqueness(int type) CONSTF;
-static inline __u32 type2uniqueness(int type)
-{
-	switch (type) {
-	case TYPE_STAT_DATA:
-		return V1_SD_UNIQUENESS;
-	case TYPE_INDIRECT:
-		return V1_INDIRECT_UNIQUENESS;
-	case TYPE_DIRECT:
-		return V1_DIRECT_UNIQUENESS;
-	case TYPE_DIRENTRY:
-		return V1_DIRENTRY_UNIQUENESS;
-	case TYPE_ANY:
-	default:
-		return V1_ANY_UNIQUENESS;
-	}
-}
-
-/*
- * key is pointer to on disk key which is stored in le, result is cpu,
- * there is no way to get version of object from key, so, provide
- * version to these defines
- */
-static inline loff_t le_key_k_offset(int version,
-				     const struct reiserfs_key *key)
-{
-	return (version == KEY_FORMAT_3_5) ?
-	    le32_to_cpu(key->u.k_offset_v1.k_offset) :
-	    offset_v2_k_offset(&(key->u.k_offset_v2));
-}
-
-static inline loff_t le_ih_k_offset(const struct item_head *ih)
-{
-	return le_key_k_offset(ih_version(ih), &(ih->ih_key));
-}
-
-static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
-{
-	if (version == KEY_FORMAT_3_5) {
-		loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness);
-		return uniqueness2type(val);
-	} else
-		return offset_v2_k_type(&(key->u.k_offset_v2));
-}
-
-static inline loff_t le_ih_k_type(const struct item_head *ih)
-{
-	return le_key_k_type(ih_version(ih), &(ih->ih_key));
-}
-
-static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
-				       loff_t offset)
-{
-	if (version == KEY_FORMAT_3_5)
-		key->u.k_offset_v1.k_offset = cpu_to_le32(offset);
-	else
-		set_offset_v2_k_offset(&key->u.k_offset_v2, offset);
-}
-
-static inline void add_le_key_k_offset(int version, struct reiserfs_key *key,
-				       loff_t offset)
-{
-	set_le_key_k_offset(version, key,
-			    le_key_k_offset(version, key) + offset);
-}
-
-static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset)
-{
-	add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
-}
-
-static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
-{
-	set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
-}
-
-static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
-				     int type)
-{
-	if (version == KEY_FORMAT_3_5) {
-		type = type2uniqueness(type);
-		key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type);
-	} else
-	       set_offset_v2_k_type(&key->u.k_offset_v2, type);
-}
-
-static inline void set_le_ih_k_type(struct item_head *ih, int type)
-{
-	set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
-}
-
-static inline int is_direntry_le_key(int version, struct reiserfs_key *key)
-{
-	return le_key_k_type(version, key) == TYPE_DIRENTRY;
-}
-
-static inline int is_direct_le_key(int version, struct reiserfs_key *key)
-{
-	return le_key_k_type(version, key) == TYPE_DIRECT;
-}
-
-static inline int is_indirect_le_key(int version, struct reiserfs_key *key)
-{
-	return le_key_k_type(version, key) == TYPE_INDIRECT;
-}
-
-static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
-{
-	return le_key_k_type(version, key) == TYPE_STAT_DATA;
-}
-
-/* item header has version.  */
-static inline int is_direntry_le_ih(struct item_head *ih)
-{
-	return is_direntry_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_direct_le_ih(struct item_head *ih)
-{
-	return is_direct_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_indirect_le_ih(struct item_head *ih)
-{
-	return is_indirect_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_statdata_le_ih(struct item_head *ih)
-{
-	return is_statdata_le_key(ih_version(ih), &ih->ih_key);
-}
-
-/* key is pointer to cpu key, result is cpu */
-static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
-{
-	return key->on_disk_key.k_offset;
-}
-
-static inline loff_t cpu_key_k_type(const struct cpu_key *key)
-{
-	return key->on_disk_key.k_type;
-}
-
-static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset)
-{
-	key->on_disk_key.k_offset = offset;
-}
-
-static inline void set_cpu_key_k_type(struct cpu_key *key, int type)
-{
-	key->on_disk_key.k_type = type;
-}
-
-static inline void cpu_key_k_offset_dec(struct cpu_key *key)
-{
-	key->on_disk_key.k_offset--;
-}
-
-#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY)
-#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT)
-#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT)
-#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA)
-
-/* are these used ? */
-#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key)))
-#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key)))
-#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key)))
-#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key)))
-
-#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \
-    (!COMP_SHORT_KEYS(ih, key) && \
-	  I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize))
-
-/* maximal length of item */
-#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE)
-#define MIN_ITEM_LEN 1
-
-/* object identifier for root dir */
-#define REISERFS_ROOT_OBJECTID 2
-#define REISERFS_ROOT_PARENT_OBJECTID 1
-
-extern struct reiserfs_key root_key;
-
-/*
- * Picture represents a leaf of the S+tree
- *  ______________________________________________________
- * |      |  Array of     |                   |           |
- * |Block |  Object-Item  |      F r e e      |  Objects- |
- * | head |  Headers      |     S p a c e     |   Items   |
- * |______|_______________|___________________|___________|
- */
-
-/*
- * Header of a disk block.  More precisely, header of a formatted leaf
- * or internal node, and not the header of an unformatted node.
- */
-struct block_head {
-	__le16 blk_level;	/* Level of a block in the tree. */
-	__le16 blk_nr_item;	/* Number of keys/items in a block. */
-	__le16 blk_free_space;	/* Block free space in bytes. */
-	__le16 blk_reserved;
-	/* dump this in v4/planA */
-
-	/* kept only for compatibility */
-	struct reiserfs_key blk_right_delim_key;
-};
-
-#define BLKH_SIZE                     (sizeof(struct block_head))
-#define blkh_level(p_blkh)            (le16_to_cpu((p_blkh)->blk_level))
-#define blkh_nr_item(p_blkh)          (le16_to_cpu((p_blkh)->blk_nr_item))
-#define blkh_free_space(p_blkh)       (le16_to_cpu((p_blkh)->blk_free_space))
-#define blkh_reserved(p_blkh)         (le16_to_cpu((p_blkh)->blk_reserved))
-#define set_blkh_level(p_blkh,val)    ((p_blkh)->blk_level = cpu_to_le16(val))
-#define set_blkh_nr_item(p_blkh,val)  ((p_blkh)->blk_nr_item = cpu_to_le16(val))
-#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val))
-#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val))
-#define blkh_right_delim_key(p_blkh)  ((p_blkh)->blk_right_delim_key)
-#define set_blkh_right_delim_key(p_blkh,val)  ((p_blkh)->blk_right_delim_key = val)
-
-/* values for blk_level field of the struct block_head */
-
-/*
- * When node gets removed from the tree its blk_level is set to FREE_LEVEL.
- * It is then  used to see whether the node is still in the tree
- */
-#define FREE_LEVEL 0
-
-#define DISK_LEAF_NODE_LEVEL  1	/* Leaf node level. */
-
-/*
- * Given the buffer head of a formatted node, resolve to the
- * block head of that node.
- */
-#define B_BLK_HEAD(bh)			((struct block_head *)((bh)->b_data))
-/* Number of items that are in buffer. */
-#define B_NR_ITEMS(bh)			(blkh_nr_item(B_BLK_HEAD(bh)))
-#define B_LEVEL(bh)			(blkh_level(B_BLK_HEAD(bh)))
-#define B_FREE_SPACE(bh)		(blkh_free_space(B_BLK_HEAD(bh)))
-
-#define PUT_B_NR_ITEMS(bh, val)		do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0)
-#define PUT_B_LEVEL(bh, val)		do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0)
-#define PUT_B_FREE_SPACE(bh, val)	do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0)
-
-/* Get right delimiting key. -- little endian */
-#define B_PRIGHT_DELIM_KEY(bh)		(&(blk_right_delim_key(B_BLK_HEAD(bh))))
-
-/* Does the buffer contain a disk leaf. */
-#define B_IS_ITEMS_LEVEL(bh)		(B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL)
-
-/* Does the buffer contain a disk internal node */
-#define B_IS_KEYS_LEVEL(bh)      (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
-					    && B_LEVEL(bh) <= MAX_HEIGHT)
-
-/***************************************************************************
- *                             STAT DATA                                   *
- ***************************************************************************/
-
-/*
- * old stat data is 32 bytes long. We are going to distinguish new one by
- * different size
-*/
-struct stat_data_v1 {
-	__le16 sd_mode;		/* file type, permissions */
-	__le16 sd_nlink;	/* number of hard links */
-	__le16 sd_uid;		/* owner */
-	__le16 sd_gid;		/* group */
-	__le32 sd_size;		/* file size */
-	__le32 sd_atime;	/* time of last access */
-	__le32 sd_mtime;	/* time file was last modified  */
-
-	/*
-	 * time inode (stat data) was last changed
-	 * (except changes to sd_atime and sd_mtime)
-	 */
-	__le32 sd_ctime;
-	union {
-		__le32 sd_rdev;
-		__le32 sd_blocks;	/* number of blocks file uses */
-	} __attribute__ ((__packed__)) u;
-
-	/*
-	 * first byte of file which is stored in a direct item: except that if
-	 * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no
-	 * direct item.  The existence of this field really grates on me.
-	 * Let's replace it with a macro based on sd_size and our tail
-	 * suppression policy.  Someday.  -Hans
-	 */
-	__le32 sd_first_direct_byte;
-} __attribute__ ((__packed__));
-
-#define SD_V1_SIZE              (sizeof(struct stat_data_v1))
-#define stat_data_v1(ih)        (ih_version (ih) == KEY_FORMAT_3_5)
-#define sd_v1_mode(sdp)         (le16_to_cpu((sdp)->sd_mode))
-#define set_sd_v1_mode(sdp,v)   ((sdp)->sd_mode = cpu_to_le16(v))
-#define sd_v1_nlink(sdp)        (le16_to_cpu((sdp)->sd_nlink))
-#define set_sd_v1_nlink(sdp,v)  ((sdp)->sd_nlink = cpu_to_le16(v))
-#define sd_v1_uid(sdp)          (le16_to_cpu((sdp)->sd_uid))
-#define set_sd_v1_uid(sdp,v)    ((sdp)->sd_uid = cpu_to_le16(v))
-#define sd_v1_gid(sdp)          (le16_to_cpu((sdp)->sd_gid))
-#define set_sd_v1_gid(sdp,v)    ((sdp)->sd_gid = cpu_to_le16(v))
-#define sd_v1_size(sdp)         (le32_to_cpu((sdp)->sd_size))
-#define set_sd_v1_size(sdp,v)   ((sdp)->sd_size = cpu_to_le32(v))
-#define sd_v1_atime(sdp)        (le32_to_cpu((sdp)->sd_atime))
-#define set_sd_v1_atime(sdp,v)  ((sdp)->sd_atime = cpu_to_le32(v))
-#define sd_v1_mtime(sdp)        (le32_to_cpu((sdp)->sd_mtime))
-#define set_sd_v1_mtime(sdp,v)  ((sdp)->sd_mtime = cpu_to_le32(v))
-#define sd_v1_ctime(sdp)        (le32_to_cpu((sdp)->sd_ctime))
-#define set_sd_v1_ctime(sdp,v)  ((sdp)->sd_ctime = cpu_to_le32(v))
-#define sd_v1_rdev(sdp)         (le32_to_cpu((sdp)->u.sd_rdev))
-#define set_sd_v1_rdev(sdp,v)   ((sdp)->u.sd_rdev = cpu_to_le32(v))
-#define sd_v1_blocks(sdp)       (le32_to_cpu((sdp)->u.sd_blocks))
-#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v))
-#define sd_v1_first_direct_byte(sdp) \
-                                (le32_to_cpu((sdp)->sd_first_direct_byte))
-#define set_sd_v1_first_direct_byte(sdp,v) \
-                                ((sdp)->sd_first_direct_byte = cpu_to_le32(v))
-
-/* inode flags stored in sd_attrs (nee sd_reserved) */
-
-/*
- * we want common flags to have the same values as in ext2,
- * so chattr(1) will work without problems
- */
-#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
-#define REISERFS_APPEND_FL    FS_APPEND_FL
-#define REISERFS_SYNC_FL      FS_SYNC_FL
-#define REISERFS_NOATIME_FL   FS_NOATIME_FL
-#define REISERFS_NODUMP_FL    FS_NODUMP_FL
-#define REISERFS_SECRM_FL     FS_SECRM_FL
-#define REISERFS_UNRM_FL      FS_UNRM_FL
-#define REISERFS_COMPR_FL     FS_COMPR_FL
-#define REISERFS_NOTAIL_FL    FS_NOTAIL_FL
-
-/* persistent flags that file inherits from the parent directory */
-#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL |	\
-				REISERFS_SYNC_FL |	\
-				REISERFS_NOATIME_FL |	\
-				REISERFS_NODUMP_FL |	\
-				REISERFS_SECRM_FL |	\
-				REISERFS_COMPR_FL |	\
-				REISERFS_NOTAIL_FL )
-
-/*
- * Stat Data on disk (reiserfs version of UFS disk inode minus the
- * address blocks)
- */
-struct stat_data {
-	__le16 sd_mode;		/* file type, permissions */
-	__le16 sd_attrs;	/* persistent inode flags */
-	__le32 sd_nlink;	/* number of hard links */
-	__le64 sd_size;		/* file size */
-	__le32 sd_uid;		/* owner */
-	__le32 sd_gid;		/* group */
-	__le32 sd_atime;	/* time of last access */
-	__le32 sd_mtime;	/* time file was last modified  */
-
-	/*
-	 * time inode (stat data) was last changed
-	 * (except changes to sd_atime and sd_mtime)
-	 */
-	__le32 sd_ctime;
-	__le32 sd_blocks;
-	union {
-		__le32 sd_rdev;
-		__le32 sd_generation;
-	} __attribute__ ((__packed__)) u;
-} __attribute__ ((__packed__));
-
-/* this is 44 bytes long */
-#define SD_SIZE (sizeof(struct stat_data))
-#define SD_V2_SIZE              SD_SIZE
-#define stat_data_v2(ih)        (ih_version (ih) == KEY_FORMAT_3_6)
-#define sd_v2_mode(sdp)         (le16_to_cpu((sdp)->sd_mode))
-#define set_sd_v2_mode(sdp,v)   ((sdp)->sd_mode = cpu_to_le16(v))
-/* sd_reserved */
-/* set_sd_reserved */
-#define sd_v2_nlink(sdp)        (le32_to_cpu((sdp)->sd_nlink))
-#define set_sd_v2_nlink(sdp,v)  ((sdp)->sd_nlink = cpu_to_le32(v))
-#define sd_v2_size(sdp)         (le64_to_cpu((sdp)->sd_size))
-#define set_sd_v2_size(sdp,v)   ((sdp)->sd_size = cpu_to_le64(v))
-#define sd_v2_uid(sdp)          (le32_to_cpu((sdp)->sd_uid))
-#define set_sd_v2_uid(sdp,v)    ((sdp)->sd_uid = cpu_to_le32(v))
-#define sd_v2_gid(sdp)          (le32_to_cpu((sdp)->sd_gid))
-#define set_sd_v2_gid(sdp,v)    ((sdp)->sd_gid = cpu_to_le32(v))
-#define sd_v2_atime(sdp)        (le32_to_cpu((sdp)->sd_atime))
-#define set_sd_v2_atime(sdp,v)  ((sdp)->sd_atime = cpu_to_le32(v))
-#define sd_v2_mtime(sdp)        (le32_to_cpu((sdp)->sd_mtime))
-#define set_sd_v2_mtime(sdp,v)  ((sdp)->sd_mtime = cpu_to_le32(v))
-#define sd_v2_ctime(sdp)        (le32_to_cpu((sdp)->sd_ctime))
-#define set_sd_v2_ctime(sdp,v)  ((sdp)->sd_ctime = cpu_to_le32(v))
-#define sd_v2_blocks(sdp)       (le32_to_cpu((sdp)->sd_blocks))
-#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v))
-#define sd_v2_rdev(sdp)         (le32_to_cpu((sdp)->u.sd_rdev))
-#define set_sd_v2_rdev(sdp,v)   ((sdp)->u.sd_rdev = cpu_to_le32(v))
-#define sd_v2_generation(sdp)   (le32_to_cpu((sdp)->u.sd_generation))
-#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v))
-#define sd_v2_attrs(sdp)         (le16_to_cpu((sdp)->sd_attrs))
-#define set_sd_v2_attrs(sdp,v)   ((sdp)->sd_attrs = cpu_to_le16(v))
-
-/***************************************************************************
- *                      DIRECTORY STRUCTURE                                *
- ***************************************************************************/
-/*
- * Picture represents the structure of directory items
- * ________________________________________________
- * |  Array of     |   |     |        |       |   |
- * | directory     |N-1| N-2 | ....   |   1st |0th|
- * | entry headers |   |     |        |       |   |
- * |_______________|___|_____|________|_______|___|
- *                  <----   directory entries         ------>
- *
- * First directory item has k_offset component 1. We store "." and ".."
- * in one item, always, we never split "." and ".." into differing
- * items.  This makes, among other things, the code for removing
- * directories simpler.
- */
-#define SD_OFFSET  0
-#define SD_UNIQUENESS 0
-#define DOT_OFFSET 1
-#define DOT_DOT_OFFSET 2
-#define DIRENTRY_UNIQUENESS 500
-
-#define FIRST_ITEM_OFFSET 1
-
-/*
- * Q: How to get key of object pointed to by entry from entry?
- *
- * A: Each directory entry has its header. This header has deh_dir_id
- *    and deh_objectid fields, those are key of object, entry points to
- */
-
-/*
- * NOT IMPLEMENTED:
- * Directory will someday contain stat data of object
- */
-
-struct reiserfs_de_head {
-	__le32 deh_offset;	/* third component of the directory entry key */
-
-	/*
-	 * objectid of the parent directory of the object, that is referenced
-	 * by directory entry
-	 */
-	__le32 deh_dir_id;
-
-	/* objectid of the object, that is referenced by directory entry */
-	__le32 deh_objectid;
-	__le16 deh_location;	/* offset of name in the whole item */
-
-	/*
-	 * whether 1) entry contains stat data (for future), and
-	 * 2) whether entry is hidden (unlinked)
-	 */
-	__le16 deh_state;
-} __attribute__ ((__packed__));
-#define DEH_SIZE                  sizeof(struct reiserfs_de_head)
-#define deh_offset(p_deh)         (le32_to_cpu((p_deh)->deh_offset))
-#define deh_dir_id(p_deh)         (le32_to_cpu((p_deh)->deh_dir_id))
-#define deh_objectid(p_deh)       (le32_to_cpu((p_deh)->deh_objectid))
-#define deh_location(p_deh)       (le16_to_cpu((p_deh)->deh_location))
-#define deh_state(p_deh)          (le16_to_cpu((p_deh)->deh_state))
-
-#define put_deh_offset(p_deh,v)   ((p_deh)->deh_offset = cpu_to_le32((v)))
-#define put_deh_dir_id(p_deh,v)   ((p_deh)->deh_dir_id = cpu_to_le32((v)))
-#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v)))
-#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v)))
-#define put_deh_state(p_deh,v)    ((p_deh)->deh_state = cpu_to_le16((v)))
-
-/* empty directory contains two entries "." and ".." and their headers */
-#define EMPTY_DIR_SIZE \
-(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1))
-
-/* old format directories have this size when empty */
-#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
-
-#define DEH_Statdata 0		/* not used now */
-#define DEH_Visible 2
-
-/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */
-#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__)
-#   define ADDR_UNALIGNED_BITS  (3)
-#endif
-
-/*
- * These are only used to manipulate deh_state.
- * Because of this, we'll use the ext2_ bit routines,
- * since they are little endian
- */
-#ifdef ADDR_UNALIGNED_BITS
-
-#   define aligned_address(addr)           ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
-#   define unaligned_offset(addr)          (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3)
-
-#   define set_bit_unaligned(nr, addr)	\
-	__test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-#   define clear_bit_unaligned(nr, addr)	\
-	__test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-#   define test_bit_unaligned(nr, addr)	\
-	test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-
-#else
-
-#   define set_bit_unaligned(nr, addr)	__test_and_set_bit_le(nr, addr)
-#   define clear_bit_unaligned(nr, addr)	__test_and_clear_bit_le(nr, addr)
-#   define test_bit_unaligned(nr, addr)	test_bit_le(nr, addr)
-
-#endif
-
-#define mark_de_with_sd(deh)        set_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define mark_de_without_sd(deh)     clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define mark_de_visible(deh)	    set_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-#define mark_de_hidden(deh)	    clear_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-
-#define de_with_sd(deh)		    test_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define de_visible(deh)	    	    test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-#define de_hidden(deh)	    	    !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-
-extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
-				   __le32 par_dirid, __le32 par_objid);
-extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
-				__le32 par_dirid, __le32 par_objid);
-
-/* two entries per block (at least) */
-#define REISERFS_MAX_NAME(block_size) 255
-
-/*
- * this structure is used for operations on directory entries. It is
- * not a disk structure.
- *
- * When reiserfs_find_entry or search_by_entry_key find directory
- * entry, they return filled reiserfs_dir_entry structure
- */
-struct reiserfs_dir_entry {
-	struct buffer_head *de_bh;
-	int de_item_num;
-	struct item_head *de_ih;
-	int de_entry_num;
-	struct reiserfs_de_head *de_deh;
-	int de_entrylen;
-	int de_namelen;
-	char *de_name;
-	unsigned long *de_gen_number_bit_string;
-
-	__u32 de_dir_id;
-	__u32 de_objectid;
-
-	struct cpu_key de_entry_key;
-};
-
-/*
- * these defines are useful when a particular member of
- * a reiserfs_dir_entry is needed
- */
-
-/* pointer to file name, stored in entry */
-#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \
-				(ih_item_body(bh, ih) + deh_location(deh))
-
-/* length of name */
-#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
-(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0))
-
-/* hash value occupies bits from 7 up to 30 */
-#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL)
-/* generation number occupies 7 bits starting from 0 up to 6 */
-#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL)
-#define MAX_GENERATION_NUMBER  127
-
-#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number))
-
-/*
- * Picture represents an internal node of the reiserfs tree
- *  ______________________________________________________
- * |      |  Array of     |  Array of         |  Free     |
- * |block |    keys       |  pointers         | space     |
- * | head |      N        |      N+1          |           |
- * |______|_______________|___________________|___________|
- */
-
-/***************************************************************************
- *                      DISK CHILD                                         *
- ***************************************************************************/
-/*
- * Disk child pointer:
- * The pointer from an internal node of the tree to a node that is on disk.
- */
-struct disk_child {
-	__le32 dc_block_number;	/* Disk child's block number. */
-	__le16 dc_size;		/* Disk child's used space.   */
-	__le16 dc_reserved;
-};
-
-#define DC_SIZE (sizeof(struct disk_child))
-#define dc_block_number(dc_p)	(le32_to_cpu((dc_p)->dc_block_number))
-#define dc_size(dc_p)		(le16_to_cpu((dc_p)->dc_size))
-#define put_dc_block_number(dc_p, val)   do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0)
-#define put_dc_size(dc_p, val)   do { (dc_p)->dc_size = cpu_to_le16(val); } while(0)
-
-/* Get disk child by buffer header and position in the tree node. */
-#define B_N_CHILD(bh, n_pos)  ((struct disk_child *)\
-((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos)))
-
-/* Get disk child number by buffer header and position in the tree node. */
-#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos)))
-#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \
-				(put_dc_block_number(B_N_CHILD(bh, n_pos), val))
-
- /* maximal value of field child_size in structure disk_child */
- /* child size is the combined size of all items and their headers */
-#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE ))
-
-/* amount of used space in buffer (not including block head) */
-#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur)))
-
-/* max and min number of keys in internal node */
-#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
-#define MIN_NR_KEY(bh)    (MAX_NR_KEY(bh)/2)
-
-/***************************************************************************
- *                      PATH STRUCTURES AND DEFINES                        *
- ***************************************************************************/
-
-/*
- * search_by_key fills up the path from the root to the leaf as it descends
- * the tree looking for the key.  It uses reiserfs_bread to try to find
- * buffers in the cache given their block number.  If it does not find
- * them in the cache it reads them from disk.  For each node search_by_key
- * finds using reiserfs_bread it then uses bin_search to look through that
- * node.  bin_search will find the position of the block_number of the next
- * node if it is looking through an internal node.  If it is looking through
- * a leaf node bin_search will find the position of the item which has key
- * either equal to given key, or which is the maximal key less than the
- * given key.
- */
-
-struct path_element {
-	/* Pointer to the buffer at the path in the tree. */
-	struct buffer_head *pe_buffer;
-	/* Position in the tree node which is placed in the buffer above. */
-	int pe_position;
-};
-
-/*
- * maximal height of a tree. don't change this without
- * changing JOURNAL_PER_BALANCE_CNT
- */
-#define MAX_HEIGHT 5
-
-/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
-#define EXTENDED_MAX_HEIGHT         7
-
-/* Must be equal to at least 2. */
-#define FIRST_PATH_ELEMENT_OFFSET   2
-
-/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
-#define ILLEGAL_PATH_ELEMENT_OFFSET 1
-
-/* this MUST be MAX_HEIGHT + 1. See about FEB below */
-#define MAX_FEB_SIZE 6
-
-/*
- * We need to keep track of who the ancestors of nodes are.  When we
- * perform a search we record which nodes were visited while
- * descending the tree looking for the node we searched for. This list
- * of nodes is called the path.  This information is used while
- * performing balancing.  Note that this path information may become
- * invalid, and this means we must check it when using it to see if it
- * is still valid. You'll need to read search_by_key and the comments
- * in it, especially about decrement_counters_in_path(), to understand
- * this structure.
- *
- * Paths make the code so much harder to work with and debug.... An
- * enormous number of bugs are due to them, and trying to write or modify
- * code that uses them just makes my head hurt.  They are based on an
- * excessive effort to avoid disturbing the precious VFS code.:-( The
- * gods only know how we are going to SMP the code that uses them.
- * znodes are the way!
- */
-
-#define PATH_READA	0x1	/* do read ahead */
-#define PATH_READA_BACK 0x2	/* read backwards */
-
-struct treepath {
-	int path_length;	/* Length of the array above.   */
-	int reada;
-	/* Array of the path elements.  */
-	struct path_element path_elements[EXTENDED_MAX_HEIGHT];
-	int pos_in_item;
-};
-
-#define pos_in_item(path) ((path)->pos_in_item)
-
-#define INITIALIZE_PATH(var) \
-struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
-
-/* Get path element by path and path position. */
-#define PATH_OFFSET_PELEMENT(path, n_offset)  ((path)->path_elements + (n_offset))
-
-/* Get buffer header at the path by path and path position. */
-#define PATH_OFFSET_PBUFFER(path, n_offset)   (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer)
-
-/* Get position in the element at the path by path and path position. */
-#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
-
-#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
-
-/*
- * you know, to the person who didn't write this the macro name does not
- * at first suggest what it does.  Maybe POSITION_FROM_PATH_END? Or
- * maybe we should just focus on dumping paths... -Hans
- */
-#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
-
-/*
- * in do_balance leaf has h == 0 in contrast with path structure,
- * where root has level == 0. That is why we need these defines
- */
-
-/* tb->S[h] */
-#define PATH_H_PBUFFER(path, h) \
-			PATH_OFFSET_PBUFFER(path, path->path_length - (h))
-
-/* tb->F[h] or tb->S[0]->b_parent */
-#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1)
-
-#define PATH_H_POSITION(path, h) \
-			PATH_OFFSET_POSITION(path, path->path_length - (h))
-
-/* tb->S[h]->b_item_order */
-#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)
-
-#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
-
-static inline void *reiserfs_node_data(const struct buffer_head *bh)
-{
-	return bh->b_data + sizeof(struct block_head);
-}
-
-/* get key from internal node */
-static inline struct reiserfs_key *internal_key(struct buffer_head *bh,
-						int item_num)
-{
-	struct reiserfs_key *key = reiserfs_node_data(bh);
-
-	return &key[item_num];
-}
-
-/* get the item header from leaf node */
-static inline struct item_head *item_head(const struct buffer_head *bh,
-					  int item_num)
-{
-	struct item_head *ih = reiserfs_node_data(bh);
-
-	return &ih[item_num];
-}
-
-/* get the key from leaf node */
-static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh,
-					    int item_num)
-{
-	return &item_head(bh, item_num)->ih_key;
-}
-
-static inline void *ih_item_body(const struct buffer_head *bh,
-				 const struct item_head *ih)
-{
-	return bh->b_data + ih_location(ih);
-}
-
-/* get item body from leaf node */
-static inline void *item_body(const struct buffer_head *bh, int item_num)
-{
-	return ih_item_body(bh, item_head(bh, item_num));
-}
-
-static inline struct item_head *tp_item_head(const struct treepath *path)
-{
-	return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
-}
-
-static inline void *tp_item_body(const struct treepath *path)
-{
-	return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
-}
-
-#define get_last_bh(path) PATH_PLAST_BUFFER(path)
-#define get_item_pos(path) PATH_LAST_POSITION(path)
-#define item_moved(ih,path) comp_items(ih, path)
-#define path_changed(ih,path) comp_items (ih, path)
-
-/* array of the entry headers */
- /* get item body */
-#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih)))
-
-/*
- * length of the directory entry in directory item. This define
- * calculates length of i-th directory entry using directory entry
- * locations from dir entry head. When it calculates length of 0-th
- * directory entry, it uses length of whole item in place of entry
- * location of the non-existent following entry in the calculation.
- * See picture above.
- */
-static inline int entry_length(const struct buffer_head *bh,
-			       const struct item_head *ih, int pos_in_item)
-{
-	struct reiserfs_de_head *deh;
-
-	deh = B_I_DEH(bh, ih) + pos_in_item;
-	if (pos_in_item)
-		return deh_location(deh - 1) - deh_location(deh);
-
-	return ih_item_len(ih) - deh_location(deh);
-}
-
-/***************************************************************************
- *                       MISC                                              *
- ***************************************************************************/
-
-/* Size of pointer to the unformatted node. */
-#define UNFM_P_SIZE (sizeof(unp_t))
-#define UNFM_P_SHIFT 2
-
-/* in in-core inode key is stored on le form */
-#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
-
-#define MAX_UL_INT 0xffffffff
-#define MAX_INT    0x7ffffff
-#define MAX_US_INT 0xffff
-
-// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
-static inline loff_t max_reiserfs_offset(struct inode *inode)
-{
-	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
-		return (loff_t) U32_MAX;
-
-	return (loff_t) ((~(__u64) 0) >> 4);
-}
-
-#define MAX_KEY_OBJECTID	MAX_UL_INT
-
-#define MAX_B_NUM  MAX_UL_INT
-#define MAX_FC_NUM MAX_US_INT
-
-/* the purpose is to detect overflow of an unsigned short */
-#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
-
-/*
- * The following defines are used in reiserfs_insert_item
- * and reiserfs_append_item
- */
-#define REISERFS_KERNEL_MEM		0	/* kernel memory mode */
-#define REISERFS_USER_MEM		1	/* user memory mode */
-
-#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
-#define get_generation(s) atomic_read (&fs_generation(s))
-#define FILESYSTEM_CHANGED_TB(tb)  (get_generation((tb)->tb_sb) != (tb)->fs_gen)
-#define __fs_changed(gen,s) (gen != get_generation (s))
-#define fs_changed(gen,s)		\
-({					\
-	reiserfs_cond_resched(s);	\
-	__fs_changed(gen, s);		\
-})
-
-/***************************************************************************
- *                  FIXATE NODES                                           *
- ***************************************************************************/
-
-#define VI_TYPE_LEFT_MERGEABLE 1
-#define VI_TYPE_RIGHT_MERGEABLE 2
-
-/*
- * To make any changes in the tree we always first find node, that
- * contains item to be changed/deleted or place to insert a new
- * item. We call this node S. To do balancing we need to decide what
- * we will shift to left/right neighbor, or to a new node, where new
- * item will be etc. To make this analysis simpler we build virtual
- * node. Virtual node is an array of items, that will replace items of
- * node S. (For instance if we are going to delete an item, virtual
- * node does not contain it). Virtual node keeps information about
- * item sizes and types, mergeability of first and last items, sizes
- * of all entries in directory item. We use this array of items when
- * calculating what we can shift to neighbors and how many nodes we
- * have to have if we do not any shiftings, if we shift to left/right
- * neighbor or to both.
- */
-struct virtual_item {
-	int vi_index;		/* index in the array of item operations */
-	unsigned short vi_type;	/* left/right mergeability */
-
-	/* length of item that it will have after balancing */
-	unsigned short vi_item_len;
-
-	struct item_head *vi_ih;
-	const char *vi_item;	/* body of item (old or new) */
-	const void *vi_new_data;	/* 0 always but paste mode */
-	void *vi_uarea;		/* item specific area */
-};
-
-struct virtual_node {
-	/* this is a pointer to the free space in the buffer */
-	char *vn_free_ptr;
-
-	unsigned short vn_nr_item;	/* number of items in virtual node */
-
-	/*
-	 * size of node , that node would have if it has
-	 * unlimited size and no balancing is performed
-	 */
-	short vn_size;
-
-	/* mode of balancing (paste, insert, delete, cut) */
-	short vn_mode;
-
-	short vn_affected_item_num;
-	short vn_pos_in_item;
-
-	/* item header of inserted item, 0 for other modes */
-	struct item_head *vn_ins_ih;
-	const void *vn_data;
-
-	/* array of items (including a new one, excluding item to be deleted) */
-	struct virtual_item *vn_vi;
-};
-
-/* used by directory items when creating virtual nodes */
-struct direntry_uarea {
-	int flags;
-	__u16 entry_count;
-	__u16 entry_sizes[];
-} __attribute__ ((__packed__));
-
-/***************************************************************************
- *                  TREE BALANCE                                           *
- ***************************************************************************/
-
-/*
- * This temporary structure is used in tree balance algorithms, and
- * constructed as we go to the extent that its various parts are
- * needed.  It contains arrays of nodes that can potentially be
- * involved in the balancing of node S, and parameters that define how
- * each of the nodes must be balanced.  Note that in these algorithms
- * for balancing the worst case is to need to balance the current node
- * S and the left and right neighbors and all of their parents plus
- * create a new node.  We implement S1 balancing for the leaf nodes
- * and S0 balancing for the internal nodes (S1 and S0 are defined in
- * our papers.)
- */
-
-/* size of the array of buffers to free at end of do_balance */
-#define MAX_FREE_BLOCK 7
-
-/* maximum number of FEB blocknrs on a single level */
-#define MAX_AMOUNT_NEEDED 2
-
-/* someday somebody will prefix every field in this struct with tb_ */
-struct tree_balance {
-	int tb_mode;
-	int need_balance_dirty;
-	struct super_block *tb_sb;
-	struct reiserfs_transaction_handle *transaction_handle;
-	struct treepath *tb_path;
-
-	/* array of left neighbors of nodes in the path */
-	struct buffer_head *L[MAX_HEIGHT];
-
-	/* array of right neighbors of nodes in the path */
-	struct buffer_head *R[MAX_HEIGHT];
-
-	/* array of fathers of the left neighbors */
-	struct buffer_head *FL[MAX_HEIGHT];
-
-	/* array of fathers of the right neighbors */
-	struct buffer_head *FR[MAX_HEIGHT];
-	/* array of common parents of center node and its left neighbor */
-	struct buffer_head *CFL[MAX_HEIGHT];
-
-	/* array of common parents of center node and its right neighbor */
-	struct buffer_head *CFR[MAX_HEIGHT];
-
-	/*
-	 * array of empty buffers. Number of buffers in array equals
-	 * cur_blknum.
-	 */
-	struct buffer_head *FEB[MAX_FEB_SIZE];
-	struct buffer_head *used[MAX_FEB_SIZE];
-	struct buffer_head *thrown[MAX_FEB_SIZE];
-
-	/*
-	 * array of number of items which must be shifted to the left in
-	 * order to balance the current node; for leaves includes item that
-	 * will be partially shifted; for internal nodes, it is the number
-	 * of child pointers rather than items. It includes the new item
-	 * being created. The code sometimes subtracts one to get the
-	 * number of wholly shifted items for other purposes.
-	 */
-	int lnum[MAX_HEIGHT];
-
-	/* substitute right for left in comment above */
-	int rnum[MAX_HEIGHT];
-
-	/*
-	 * array indexed by height h mapping the key delimiting L[h] and
-	 * S[h] to its item number within the node CFL[h]
-	 */
-	int lkey[MAX_HEIGHT];
-
-	/* substitute r for l in comment above */
-	int rkey[MAX_HEIGHT];
-
-	/*
-	 * the number of bytes by we are trying to add or remove from
-	 * S[h]. A negative value means removing.
-	 */
-	int insert_size[MAX_HEIGHT];
-
-	/*
-	 * number of nodes that will replace node S[h] after balancing
-	 * on the level h of the tree.  If 0 then S is being deleted,
-	 * if 1 then S is remaining and no new nodes are being created,
-	 * if 2 or 3 then 1 or 2 new nodes is being created
-	 */
-	int blknum[MAX_HEIGHT];
-
-	/* fields that are used only for balancing leaves of the tree */
-
-	/* number of empty blocks having been already allocated */
-	int cur_blknum;
-
-	/* number of items that fall into left most node when S[0] splits */
-	int s0num;
-
-	/*
-	 * number of bytes which can flow to the left neighbor from the left
-	 * most liquid item that cannot be shifted from S[0] entirely
-	 * if -1 then nothing will be partially shifted
-	 */
-	int lbytes;
-
-	/*
-	 * number of bytes which will flow to the right neighbor from the right
-	 * most liquid item that cannot be shifted from S[0] entirely
-	 * if -1 then nothing will be partially shifted
-	 */
-	int rbytes;
-
-
-	/*
-	 * index into the array of item headers in
-	 * S[0] of the affected item
-	 */
-	int item_pos;
-
-	/* new nodes allocated to hold what could not fit into S */
-	struct buffer_head *S_new[2];
-
-	/*
-	 * number of items that will be placed into nodes in S_new
-	 * when S[0] splits
-	 */
-	int snum[2];
-
-	/*
-	 * number of bytes which flow to nodes in S_new when S[0] splits
-	 * note: if S[0] splits into 3 nodes, then items do not need to be cut
-	 */
-	int sbytes[2];
-
-	int pos_in_item;
-	int zeroes_num;
-
-	/*
-	 * buffers which are to be freed after do_balance finishes
-	 * by unfix_nodes
-	 */
-	struct buffer_head *buf_to_free[MAX_FREE_BLOCK];
-
-	/*
-	 * kmalloced memory. Used to create virtual node and keep
-	 * map of dirtied bitmap blocks
-	 */
-	char *vn_buf;
-
-	int vn_buf_size;	/* size of the vn_buf */
-
-	/* VN starts after bitmap of bitmap blocks */
-	struct virtual_node *tb_vn;
-
-	/*
-	 * saved value of `reiserfs_generation' counter see
-	 * FILESYSTEM_CHANGED() macro in reiserfs_fs.h
-	 */
-	int fs_gen;
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	/*
-	 * key pointer, to pass to block allocator or
-	 * another low-level subsystem
-	 */
-	struct in_core_key key;
-#endif
-};
-
-/* These are modes of balancing */
-
-/* When inserting an item. */
-#define M_INSERT	'i'
-/*
- * When inserting into (directories only) or appending onto an already
- * existent item.
- */
-#define M_PASTE		'p'
-/* When deleting an item. */
-#define M_DELETE	'd'
-/* When truncating an item or removing an entry from a (directory) item. */
-#define M_CUT		'c'
-
-/* used when balancing on leaf level skipped (in reiserfsck) */
-#define M_INTERNAL	'n'
-
-/*
- * When further balancing is not needed, then do_balance does not need
- * to be called.
- */
-#define M_SKIP_BALANCING		's'
-#define M_CONVERT	'v'
-
-/* modes of leaf_move_items */
-#define LEAF_FROM_S_TO_L 0
-#define LEAF_FROM_S_TO_R 1
-#define LEAF_FROM_R_TO_L 2
-#define LEAF_FROM_L_TO_R 3
-#define LEAF_FROM_S_TO_SNEW 4
-
-#define FIRST_TO_LAST 0
-#define LAST_TO_FIRST 1
-
-/*
- * used in do_balance for passing parent of node information that has
- * been gotten from tb struct
- */
-struct buffer_info {
-	struct tree_balance *tb;
-	struct buffer_head *bi_bh;
-	struct buffer_head *bi_parent;
-	int bi_position;
-};
-
-static inline struct super_block *sb_from_tb(struct tree_balance *tb)
-{
-	return tb ? tb->tb_sb : NULL;
-}
-
-static inline struct super_block *sb_from_bi(struct buffer_info *bi)
-{
-	return bi ? sb_from_tb(bi->tb) : NULL;
-}
-
-/*
- * there are 4 types of items: stat data, directory item, indirect, direct.
- * +-------------------+------------+--------------+------------+
- * |                   |  k_offset  | k_uniqueness | mergeable? |
- * +-------------------+------------+--------------+------------+
- * |     stat data     |     0      |      0       |   no       |
- * +-------------------+------------+--------------+------------+
- * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. |   no       |
- * | non 1st directory | hash value | UNIQUENESS   |   yes      |
- * |     item          |            |              |            |
- * +-------------------+------------+--------------+------------+
- * | indirect item     | offset + 1 |TYPE_INDIRECT |    [1]	|
- * +-------------------+------------+--------------+------------+
- * | direct item       | offset + 1 |TYPE_DIRECT   |    [2]     |
- * +-------------------+------------+--------------+------------+
- *
- * [1] if this is not the first indirect item of the object
- * [2] if this is not the first direct item of the object
-*/
-
-struct item_operations {
-	int (*bytes_number) (struct item_head * ih, int block_size);
-	void (*decrement_key) (struct cpu_key *);
-	int (*is_left_mergeable) (struct reiserfs_key * ih,
-				  unsigned long bsize);
-	void (*print_item) (struct item_head *, char *item);
-	void (*check_item) (struct item_head *, char *item);
-
-	int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi,
-			  int is_affected, int insert_size);
-	int (*check_left) (struct virtual_item * vi, int free,
-			   int start_skip, int end_skip);
-	int (*check_right) (struct virtual_item * vi, int free);
-	int (*part_size) (struct virtual_item * vi, int from, int to);
-	int (*unit_num) (struct virtual_item * vi);
-	void (*print_vi) (struct virtual_item * vi);
-};
-
-extern struct item_operations *item_ops[TYPE_ANY + 1];
-
-#define op_bytes_number(ih,bsize)                    item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
-#define op_is_left_mergeable(key,bsize)              item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
-#define op_print_item(ih,item)                       item_ops[le_ih_k_type (ih)]->print_item (ih, item)
-#define op_check_item(ih,item)                       item_ops[le_ih_k_type (ih)]->check_item (ih, item)
-#define op_create_vi(vn,vi,is_affected,insert_size)  item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size)
-#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip)
-#define op_check_right(vi,free)                      item_ops[(vi)->vi_index]->check_right (vi, free)
-#define op_part_size(vi,from,to)                     item_ops[(vi)->vi_index]->part_size (vi, from, to)
-#define op_unit_num(vi)				     item_ops[(vi)->vi_index]->unit_num (vi)
-#define op_print_vi(vi)                              item_ops[(vi)->vi_index]->print_vi (vi)
-
-#define COMP_SHORT_KEYS comp_short_keys
-
-/* number of blocks pointed to by the indirect item */
-#define I_UNFM_NUM(ih)	(ih_item_len(ih) / UNFM_P_SIZE)
-
-/*
- * the used space within the unformatted node corresponding
- * to pos within the item pointed to by ih
- */
-#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
-
-/*
- * number of bytes contained by the direct item or the
- * unformatted nodes the indirect item points to
- */
-
-/* following defines use reiserfs buffer header and item header */
-
-/* get stat-data */
-#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
-
-/* this is 3976 for size==4096 */
-#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
-
-/*
- * indirect items consist of entries which contain blocknrs, pos
- * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
- * blocknr contained by the entry pos points to
- */
-#define B_I_POS_UNFM_POINTER(bh, ih, pos)				\
-	le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos)))
-#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val)			\
-	(*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val))
-
-struct reiserfs_iget_args {
-	__u32 objectid;
-	__u32 dirid;
-};
-
-/***************************************************************************
- *                    FUNCTION DECLARATIONS                                *
- ***************************************************************************/
-
-#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
-
-#define journal_trans_half(blocksize) \
-	((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
-
-/* journal.c see journal.c for all the comments here */
-
-/* first block written in a commit.  */
-struct reiserfs_journal_desc {
-	__le32 j_trans_id;	/* id of commit */
-
-	/* length of commit. len +1 is the commit block */
-	__le32 j_len;
-
-	__le32 j_mount_id;	/* mount id of this trans */
-	__le32 j_realblock[];	/* real locations for each block */
-};
-
-#define get_desc_trans_id(d)   le32_to_cpu((d)->j_trans_id)
-#define get_desc_trans_len(d)  le32_to_cpu((d)->j_len)
-#define get_desc_mount_id(d)   le32_to_cpu((d)->j_mount_id)
-
-#define set_desc_trans_id(d,val)       do { (d)->j_trans_id = cpu_to_le32 (val); } while (0)
-#define set_desc_trans_len(d,val)      do { (d)->j_len = cpu_to_le32 (val); } while (0)
-#define set_desc_mount_id(d,val)       do { (d)->j_mount_id = cpu_to_le32 (val); } while (0)
-
-/* last block written in a commit */
-struct reiserfs_journal_commit {
-	__le32 j_trans_id;	/* must match j_trans_id from the desc block */
-	__le32 j_len;		/* ditto */
-	__le32 j_realblock[];	/* real locations for each block */
-};
-
-#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
-#define get_commit_trans_len(c)        le32_to_cpu((c)->j_len)
-#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id)
-
-#define set_commit_trans_id(c,val)     do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
-#define set_commit_trans_len(c,val)    do { (c)->j_len = cpu_to_le32 (val); } while (0)
-
-/*
- * this header block gets written whenever a transaction is considered
- * fully flushed, and is more recent than the last fully flushed transaction.
- * fully flushed means all the log blocks and all the real blocks are on
- * disk, and this transaction does not need to be replayed.
- */
-struct reiserfs_journal_header {
-	/* id of last fully flushed transaction */
-	__le32 j_last_flush_trans_id;
-
-	/* offset in the log of where to start replay after a crash */
-	__le32 j_first_unflushed_offset;
-
-	__le32 j_mount_id;
-	/* 12 */ struct journal_params jh_journal;
-};
-
-/* biggest tunable defines are right here */
-#define JOURNAL_BLOCK_COUNT 8192	/* number of blocks in the journal */
-
-/* biggest possible single transaction, don't change for now (8/3/99) */
-#define JOURNAL_TRANS_MAX_DEFAULT 1024
-#define JOURNAL_TRANS_MIN_DEFAULT 256
-
-/*
- * max blocks to batch into one transaction,
- * don't make this any bigger than 900
- */
-#define JOURNAL_MAX_BATCH_DEFAULT   900
-#define JOURNAL_MIN_RATIO 2
-#define JOURNAL_MAX_COMMIT_AGE 30
-#define JOURNAL_MAX_TRANS_AGE 30
-#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
-#define JOURNAL_BLOCKS_PER_OBJECT(sb)  (JOURNAL_PER_BALANCE_CNT * 3 + \
-					 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \
-					      REISERFS_QUOTA_TRANS_BLOCKS(sb)))
-
-#ifdef CONFIG_QUOTA
-#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA))
-/* We need to update data and inode (atime) */
-#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0)
-/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
-#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
-(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0)
-/* same as with INIT */
-#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
-(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0)
-#else
-#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0
-#define REISERFS_QUOTA_INIT_BLOCKS(s) 0
-#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
-#endif
-
-/*
- * both of these can be as low as 1, or as high as you want.  The min is the
- * number of 4k bitmap nodes preallocated on mount. New nodes are allocated
- * as needed, and released when transactions are committed.  On release, if
- * the current number of nodes is > max, the node is freed, otherwise,
- * it is put on a free list for faster use later.
-*/
-#define REISERFS_MIN_BITMAP_NODES 10
-#define REISERFS_MAX_BITMAP_NODES 100
-
-/* these are based on journal hash size of 8192 */
-#define JBH_HASH_SHIFT 13
-#define JBH_HASH_MASK 8191
-
-#define _jhashfn(sb,block)	\
-	(((unsigned long)sb>>L1_CACHE_SHIFT) ^ \
-	 (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
-#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
-
-/* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(\
-		file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
-		block, s->s_blocksize)
-#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
-		block, s->s_blocksize)
-
-enum reiserfs_bh_state_bits {
-	BH_JDirty = BH_PrivateStart,	/* buffer is in current transaction */
-	BH_JDirty_wait,
-	/*
-	 * disk block was taken off free list before being in a
-	 * finished transaction, or written to disk. Can be reused immed.
-	 */
-	BH_JNew,
-	BH_JPrepared,
-	BH_JRestore_dirty,
-	BH_JTest,		/* debugging only will go away */
-};
-
-BUFFER_FNS(JDirty, journaled);
-TAS_BUFFER_FNS(JDirty, journaled);
-BUFFER_FNS(JDirty_wait, journal_dirty);
-TAS_BUFFER_FNS(JDirty_wait, journal_dirty);
-BUFFER_FNS(JNew, journal_new);
-TAS_BUFFER_FNS(JNew, journal_new);
-BUFFER_FNS(JPrepared, journal_prepared);
-TAS_BUFFER_FNS(JPrepared, journal_prepared);
-BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
-TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
-BUFFER_FNS(JTest, journal_test);
-TAS_BUFFER_FNS(JTest, journal_test);
-
-/* transaction handle which is passed around for all journal calls */
-struct reiserfs_transaction_handle {
-	/*
-	 * super for this FS when journal_begin was called. saves calls to
-	 * reiserfs_get_super also used by nested transactions to make
-	 * sure they are nesting on the right FS _must_ be first
-	 * in the handle
-	 */
-	struct super_block *t_super;
-
-	int t_refcount;
-	int t_blocks_logged;	/* number of blocks this writer has logged */
-	int t_blocks_allocated;	/* number of blocks this writer allocated */
-
-	/* sanity check, equals the current trans id */
-	unsigned int t_trans_id;
-
-	void *t_handle_save;	/* save existing current->journal_info */
-
-	/*
-	 * if new block allocation occurres, that block
-	 * should be displaced from others
-	 */
-	unsigned displace_new_blocks:1;
-
-	struct list_head t_list;
-};
-
-/*
- * used to keep track of ordered and tail writes, attached to the buffer
- * head through b_journal_head.
- */
-struct reiserfs_jh {
-	struct reiserfs_journal_list *jl;
-	struct buffer_head *bh;
-	struct list_head list;
-};
-
-void reiserfs_free_jh(struct buffer_head *bh);
-int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
-int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
-int journal_mark_dirty(struct reiserfs_transaction_handle *,
-		       struct buffer_head *bh);
-
-static inline int reiserfs_file_data_log(struct inode *inode)
-{
-	if (reiserfs_data_log(inode->i_sb) ||
-	    (REISERFS_I(inode)->i_flags & i_data_log))
-		return 1;
-	return 0;
-}
-
-static inline int reiserfs_transaction_running(struct super_block *s)
-{
-	struct reiserfs_transaction_handle *th = current->journal_info;
-	if (th && th->t_super == s)
-		return 1;
-	if (th && th->t_super == NULL)
-		BUG();
-	return 0;
-}
-
-static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th)
-{
-	return th->t_blocks_allocated - th->t_blocks_logged;
-}
-
-struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
-								    super_block
-								    *,
-								    int count);
-int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
-void reiserfs_vfs_truncate_file(struct inode *inode);
-int reiserfs_commit_page(struct inode *inode, struct page *page,
-			 unsigned from, unsigned to);
-void reiserfs_flush_old_commits(struct super_block *);
-int reiserfs_commit_for_inode(struct inode *);
-int reiserfs_inode_needs_commit(struct inode *);
-void reiserfs_update_inode_transaction(struct inode *);
-void reiserfs_wait_on_write_block(struct super_block *s);
-void reiserfs_block_writes(struct reiserfs_transaction_handle *th);
-void reiserfs_allow_writes(struct super_block *s);
-void reiserfs_check_lock_depth(struct super_block *s, char *caller);
-int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh,
-				 int wait);
-void reiserfs_restore_prepared_buffer(struct super_block *,
-				      struct buffer_head *bh);
-int journal_init(struct super_block *, const char *j_dev_name, int old_format,
-		 unsigned int);
-int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
-int journal_release_error(struct reiserfs_transaction_handle *,
-			  struct super_block *);
-int journal_end(struct reiserfs_transaction_handle *);
-int journal_end_sync(struct reiserfs_transaction_handle *);
-int journal_mark_freed(struct reiserfs_transaction_handle *,
-		       struct super_block *, b_blocknr_t blocknr);
-int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
-int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
-			 int bit_nr, int searchall, b_blocknr_t *next);
-int journal_begin(struct reiserfs_transaction_handle *,
-		  struct super_block *sb, unsigned long);
-int journal_join_abort(struct reiserfs_transaction_handle *,
-		       struct super_block *sb);
-void reiserfs_abort_journal(struct super_block *sb, int errno);
-void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
-int reiserfs_allocate_list_bitmaps(struct super_block *s,
-				   struct reiserfs_list_bitmap *, unsigned int);
-
-void reiserfs_schedule_old_flush(struct super_block *s);
-void reiserfs_cancel_old_flush(struct super_block *s);
-void add_save_link(struct reiserfs_transaction_handle *th,
-		   struct inode *inode, int truncate);
-int remove_save_link(struct inode *inode, int truncate);
-
-/* objectid.c */
-__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th);
-void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
-			       __u32 objectid_to_release);
-int reiserfs_convert_objectid_map_v1(struct super_block *);
-
-/* stree.c */
-int B_IS_IN_TREE(const struct buffer_head *);
-extern void copy_item_head(struct item_head *to,
-			   const struct item_head *from);
-
-/* first key is in cpu form, second - le */
-extern int comp_short_keys(const struct reiserfs_key *le_key,
-			   const struct cpu_key *cpu_key);
-extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
-
-/* both are in le form */
-extern int comp_le_keys(const struct reiserfs_key *,
-			const struct reiserfs_key *);
-extern int comp_short_le_keys(const struct reiserfs_key *,
-			      const struct reiserfs_key *);
-
-/* * get key version from on disk key - kludge */
-static inline int le_key_version(const struct reiserfs_key *key)
-{
-	int type;
-
-	type = offset_v2_k_type(&(key->u.k_offset_v2));
-	if (type != TYPE_DIRECT && type != TYPE_INDIRECT
-	    && type != TYPE_DIRENTRY)
-		return KEY_FORMAT_3_5;
-
-	return KEY_FORMAT_3_6;
-
-}
-
-static inline void copy_key(struct reiserfs_key *to,
-			    const struct reiserfs_key *from)
-{
-	memcpy(to, from, KEY_SIZE);
-}
-
-int comp_items(const struct item_head *stored_ih, const struct treepath *path);
-const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
-				    const struct super_block *sb);
-int search_by_key(struct super_block *, const struct cpu_key *,
-		  struct treepath *, int);
-#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL)
-int search_for_position_by_key(struct super_block *sb,
-			       const struct cpu_key *cpu_key,
-			       struct treepath *search_path);
-extern void decrement_bcount(struct buffer_head *bh);
-void decrement_counters_in_path(struct treepath *search_path);
-void pathrelse(struct treepath *search_path);
-int reiserfs_check_path(struct treepath *p);
-void pathrelse_and_restore(struct super_block *s, struct treepath *search_path);
-
-int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
-			 struct treepath *path,
-			 const struct cpu_key *key,
-			 struct item_head *ih,
-			 struct inode *inode, const char *body);
-
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
-			     struct treepath *path,
-			     const struct cpu_key *key,
-			     struct inode *inode,
-			     const char *body, int paste_size);
-
-int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
-			   struct treepath *path,
-			   struct cpu_key *key,
-			   struct inode *inode,
-			   struct page *page, loff_t new_file_size);
-
-int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
-			 struct treepath *path,
-			 const struct cpu_key *key,
-			 struct inode *inode, struct buffer_head *un_bh);
-
-void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
-				struct inode *inode, struct reiserfs_key *key);
-int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
-			   struct inode *inode);
-int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
-			 struct inode *inode, struct page *,
-			 int update_timestamps);
-
-#define i_block_size(inode) ((inode)->i_sb->s_blocksize)
-#define file_size(inode) ((inode)->i_size)
-#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1))
-
-#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\
-!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 )
-
-void padd_item(char *item, int total_length, int length);
-
-/* inode.c */
-/* args for the create parameter of reiserfs_get_block */
-#define GET_BLOCK_NO_CREATE 0	 /* don't create new blocks or convert tails */
-#define GET_BLOCK_CREATE 1	 /* add anything you need to find block */
-#define GET_BLOCK_NO_HOLE 2	 /* return -ENOENT for file holes */
-#define GET_BLOCK_READ_DIRECT 4	 /* read the tail if indirect item not found */
-#define GET_BLOCK_NO_IMUX     8	 /* i_mutex is not held, don't preallocate */
-#define GET_BLOCK_NO_DANGLE   16 /* don't leave any transactions running */
-
-void reiserfs_read_locked_inode(struct inode *inode,
-				struct reiserfs_iget_args *args);
-int reiserfs_find_actor(struct inode *inode, void *p);
-int reiserfs_init_locked_inode(struct inode *inode, void *p);
-void reiserfs_evict_inode(struct inode *inode);
-int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-int reiserfs_get_block(struct inode *inode, sector_t block,
-		       struct buffer_head *bh_result, int create);
-struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
-				     int fh_len, int fh_type);
-struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
-				     int fh_len, int fh_type);
-int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
-		       struct inode *parent);
-
-int reiserfs_truncate_file(struct inode *, int update_timestamps);
-void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
-		  int type, int key_length);
-void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
-		       int version,
-		       loff_t offset, int type, int length, int entry_count);
-struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key);
-
-struct reiserfs_security_handle;
-int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
-		       struct inode *dir, umode_t mode,
-		       const char *symname, loff_t i_size,
-		       struct dentry *dentry, struct inode *inode,
-		       struct reiserfs_security_handle *security);
-
-void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
-			     struct inode *inode, loff_t size);
-
-static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
-				      struct inode *inode)
-{
-	reiserfs_update_sd_size(th, inode, inode->i_size);
-}
-
-void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
-int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct iattr *attr);
-
-int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
-
-/* namei.c */
-void reiserfs_init_priv_inode(struct inode *inode);
-void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
-int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
-			struct treepath *path, struct reiserfs_dir_entry *de);
-struct dentry *reiserfs_get_parent(struct dentry *);
-
-#ifdef CONFIG_REISERFS_PROC_INFO
-int reiserfs_proc_info_init(struct super_block *sb);
-int reiserfs_proc_info_done(struct super_block *sb);
-int reiserfs_proc_info_global_init(void);
-int reiserfs_proc_info_global_done(void);
-
-#define PROC_EXP( e )   e
-
-#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data
-#define PROC_INFO_MAX( sb, field, value )								\
-    __PINFO( sb ).field =												\
-        max( REISERFS_SB( sb ) -> s_proc_info_data.field, value )
-#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) )
-#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) )
-#define PROC_INFO_BH_STAT( sb, bh, level )							\
-    PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] );						\
-    PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) );	\
-    PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) )
-#else
-static inline int reiserfs_proc_info_init(struct super_block *sb)
-{
-	return 0;
-}
-
-static inline int reiserfs_proc_info_done(struct super_block *sb)
-{
-	return 0;
-}
-
-static inline int reiserfs_proc_info_global_init(void)
-{
-	return 0;
-}
-
-static inline int reiserfs_proc_info_global_done(void)
-{
-	return 0;
-}
-
-#define PROC_EXP( e )
-#define VOID_V ( ( void ) 0 )
-#define PROC_INFO_MAX( sb, field, value ) VOID_V
-#define PROC_INFO_INC( sb, field ) VOID_V
-#define PROC_INFO_ADD( sb, field, val ) VOID_V
-#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V
-#endif
-
-/* dir.c */
-extern const struct inode_operations reiserfs_dir_inode_operations;
-extern const struct inode_operations reiserfs_symlink_inode_operations;
-extern const struct inode_operations reiserfs_special_inode_operations;
-extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_inode(struct inode *, struct dir_context *);
-
-/* tail_conversion.c */
-int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
-		    struct treepath *, struct buffer_head *, loff_t);
-int indirect2direct(struct reiserfs_transaction_handle *, struct inode *,
-		    struct page *, struct treepath *, const struct cpu_key *,
-		    loff_t, char *);
-void reiserfs_unmap_buffer(struct buffer_head *);
-
-/* file.c */
-extern const struct inode_operations reiserfs_file_inode_operations;
-extern const struct inode_operations reiserfs_priv_file_inode_operations;
-extern const struct file_operations reiserfs_file_operations;
-extern const struct address_space_operations reiserfs_address_space_operations;
-
-/* fix_nodes.c */
-
-int fix_nodes(int n_op_mode, struct tree_balance *tb,
-	      struct item_head *ins_ih, const void *);
-void unfix_nodes(struct tree_balance *);
-
-/* prints.c */
-void __reiserfs_panic(struct super_block *s, const char *id,
-		      const char *function, const char *fmt, ...)
-    __attribute__ ((noreturn));
-#define reiserfs_panic(s, id, fmt, args...) \
-	__reiserfs_panic(s, id, __func__, fmt, ##args)
-void __reiserfs_error(struct super_block *s, const char *id,
-		      const char *function, const char *fmt, ...);
-#define reiserfs_error(s, id, fmt, args...) \
-	 __reiserfs_error(s, id, __func__, fmt, ##args)
-void reiserfs_info(struct super_block *s, const char *fmt, ...);
-void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...);
-void print_indirect_item(struct buffer_head *bh, int item_num);
-void store_print_tb(struct tree_balance *tb);
-void print_cur_tb(char *mes);
-void print_de(struct reiserfs_dir_entry *de);
-void print_bi(struct buffer_info *bi, char *mes);
-#define PRINT_LEAF_ITEMS 1	/* print all items */
-#define PRINT_DIRECTORY_ITEMS 2	/* print directory items */
-#define PRINT_DIRECT_ITEMS 4	/* print contents of direct items */
-void print_block(struct buffer_head *bh, ...);
-void print_bmap(struct super_block *s, int silent);
-void print_bmap_block(int i, char *data, int size, int silent);
-/*void print_super_block (struct super_block * s, char * mes);*/
-void print_objectid_map(struct super_block *s);
-void print_block_head(struct buffer_head *bh, char *mes);
-void check_leaf(struct buffer_head *bh);
-void check_internal(struct buffer_head *bh);
-void print_statistics(struct super_block *s);
-char *reiserfs_hashname(int code);
-
-/* lbalance.c */
-int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
-		    int mov_bytes, struct buffer_head *Snew);
-int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes);
-int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
-void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
-		       int del_num, int del_bytes);
-void leaf_insert_into_buf(struct buffer_info *bi, int before,
-			  struct item_head * const inserted_item_ih,
-			  const char * const inserted_item_body,
-			  int zeros_number);
-void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
-			  int pos_in_item, int paste_size,
-			  const char * const body, int zeros_number);
-void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
-			  int pos_in_item, int cut_size);
-void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
-			int new_entry_count, struct reiserfs_de_head *new_dehs,
-			const char *records, int paste_size);
-/* ibalance.c */
-int balance_internal(struct tree_balance *, int, int, struct item_head *,
-		     struct buffer_head **);
-
-/* do_balance.c */
-void do_balance_mark_leaf_dirty(struct tree_balance *tb,
-				struct buffer_head *bh, int flag);
-#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
-#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
-
-void do_balance(struct tree_balance *tb, struct item_head *ih,
-		const char *body, int flag);
-void reiserfs_invalidate_buffer(struct tree_balance *tb,
-				struct buffer_head *bh);
-
-int get_left_neighbor_position(struct tree_balance *tb, int h);
-int get_right_neighbor_position(struct tree_balance *tb, int h);
-void replace_key(struct tree_balance *tb, struct buffer_head *, int,
-		 struct buffer_head *, int);
-void make_empty_node(struct buffer_info *);
-struct buffer_head *get_FEB(struct tree_balance *);
-
-/* bitmap.c */
-
-/*
- * structure contains hints for block allocator, and it is a container for
- * arguments, such as node, search path, transaction_handle, etc.
- */
-struct __reiserfs_blocknr_hint {
-	/* inode passed to allocator, if we allocate unf. nodes */
-	struct inode *inode;
-
-	sector_t block;		/* file offset, in blocks */
-	struct in_core_key key;
-
-	/*
-	 * search path, used by allocator to deternine search_start by
-	 * various ways
-	 */
-	struct treepath *path;
-
-	/*
-	 * transaction handle is needed to log super blocks
-	 * and bitmap blocks changes
-	 */
-	struct reiserfs_transaction_handle *th;
-
-	b_blocknr_t beg, end;
-
-	/*
-	 * a field used to transfer search start value (block number)
-	 * between different block allocator procedures
-	 * (determine_search_start() and others)
-	 */
-	b_blocknr_t search_start;
-
-	/*
-	 * is set in determine_prealloc_size() function,
-	 * used by underlayed function that do actual allocation
-	 */
-	int prealloc_size;
-
-	/*
-	 * the allocator uses different polices for getting disk
-	 * space for formatted/unformatted blocks with/without preallocation
-	 */
-	unsigned formatted_node:1;
-	unsigned preallocate:1;
-};
-
-typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
-
-int reiserfs_parse_alloc_options(struct super_block *, char *);
-void reiserfs_init_alloc_options(struct super_block *s);
-
-/*
- * given a directory, this will tell you what packing locality
- * to use for a new object underneat it.  The locality is returned
- * in disk byte order (le).
- */
-__le32 reiserfs_choose_packing(struct inode *dir);
-
-void show_alloc_options(struct seq_file *seq, struct super_block *s);
-int reiserfs_init_bitmap_cache(struct super_block *sb);
-void reiserfs_free_bitmap_cache(struct super_block *sb);
-void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
-struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap);
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
-void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *,
-			 b_blocknr_t, int for_unformatted);
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int,
-			       int);
-static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb,
-					     b_blocknr_t * new_blocknrs,
-					     int amount_needed)
-{
-	reiserfs_blocknr_hint_t hint = {
-		.th = tb->transaction_handle,
-		.path = tb->tb_path,
-		.inode = NULL,
-		.key = tb->key,
-		.block = 0,
-		.formatted_node = 1
-	};
-	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed,
-					  0);
-}
-
-static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle
-					    *th, struct inode *inode,
-					    b_blocknr_t * new_blocknrs,
-					    struct treepath *path,
-					    sector_t block)
-{
-	reiserfs_blocknr_hint_t hint = {
-		.th = th,
-		.path = path,
-		.inode = inode,
-		.block = block,
-		.formatted_node = 0,
-		.preallocate = 0
-	};
-	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
-}
-
-#ifdef REISERFS_PREALLOCATE
-static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle
-					     *th, struct inode *inode,
-					     b_blocknr_t * new_blocknrs,
-					     struct treepath *path,
-					     sector_t block)
-{
-	reiserfs_blocknr_hint_t hint = {
-		.th = th,
-		.path = path,
-		.inode = inode,
-		.block = block,
-		.formatted_node = 0,
-		.preallocate = 1
-	};
-	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
-}
-
-void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
-			       struct inode *inode);
-void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th);
-#endif
-
-/* hashes.c */
-__u32 keyed_hash(const signed char *msg, int len);
-__u32 yura_hash(const signed char *msg, int len);
-__u32 r5_hash(const signed char *msg, int len);
-
-#define reiserfs_set_le_bit		__set_bit_le
-#define reiserfs_test_and_set_le_bit	__test_and_set_bit_le
-#define reiserfs_clear_le_bit		__clear_bit_le
-#define reiserfs_test_and_clear_le_bit	__test_and_clear_bit_le
-#define reiserfs_test_le_bit		test_bit_le
-#define reiserfs_find_next_zero_le_bit	find_next_zero_bit_le
-
-/*
- * sometimes reiserfs_truncate may require to allocate few new blocks
- * to perform indirect2direct conversion. People probably used to
- * think, that truncate should work without problems on a filesystem
- * without free disk space. They may complain that they can not
- * truncate due to lack of free disk space. This spare space allows us
- * to not worry about it. 500 is probably too much, but it should be
- * absolutely safe
- */
-#define SPARE_SPACE 500
-
-/* prototypes from ioctl.c */
-int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-int reiserfs_fileattr_set(struct mnt_idmap *idmap,
-			  struct dentry *dentry, struct fileattr *fa);
-long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-long reiserfs_compat_ioctl(struct file *filp,
-		   unsigned int cmd, unsigned long arg);
-int reiserfs_unpack(struct inode *inode);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
deleted file mode 100644
index 7b498a0d060b..000000000000
--- a/fs/reiserfs/resize.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- * Written by Alexander Zarochentcev.
- *
- * The kernel part of the (on-line) reiserfs resizer.
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
-{
-	int err = 0;
-	struct reiserfs_super_block *sb;
-	struct reiserfs_bitmap_info *bitmap;
-	struct reiserfs_bitmap_info *info;
-	struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
-	struct buffer_head *bh;
-	struct reiserfs_transaction_handle th;
-	unsigned int bmap_nr_new, bmap_nr;
-	unsigned int block_r_new, block_r;
-
-	struct reiserfs_list_bitmap *jb;
-	struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
-
-	unsigned long int block_count, free_blocks;
-	int i;
-	int copy_size;
-	int depth;
-
-	sb = SB_DISK_SUPER_BLOCK(s);
-
-	if (SB_BLOCK_COUNT(s) >= block_count_new) {
-		printk("can\'t shrink filesystem on-line\n");
-		return -EINVAL;
-	}
-
-	/* check the device size */
-	depth = reiserfs_write_unlock_nested(s);
-	bh = sb_bread(s, block_count_new - 1);
-	reiserfs_write_lock_nested(s, depth);
-	if (!bh) {
-		printk("reiserfs_resize: can\'t read last block\n");
-		return -EINVAL;
-	}
-	bforget(bh);
-
-	/*
-	 * old disk layout detection; those partitions can be mounted, but
-	 * cannot be resized
-	 */
-	if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
-	    != REISERFS_DISK_OFFSET_IN_BYTES) {
-		printk
-		    ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
-		return -ENOTSUPP;
-	}
-
-	/* count used bits in last bitmap block */
-	block_r = SB_BLOCK_COUNT(s) -
-			(reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8;
-
-	/* count bitmap blocks in new fs */
-	bmap_nr_new = block_count_new / (s->s_blocksize * 8);
-	block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
-	if (block_r_new)
-		bmap_nr_new++;
-	else
-		block_r_new = s->s_blocksize * 8;
-
-	/* save old values */
-	block_count = SB_BLOCK_COUNT(s);
-	bmap_nr = reiserfs_bmap_count(s);
-
-	/* resizing of reiserfs bitmaps (journal and real), if needed */
-	if (bmap_nr_new > bmap_nr) {
-		/* reallocate journal bitmaps */
-		if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
-			printk
-			    ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
-			return -ENOMEM;
-		}
-		/*
-		 * the new journal bitmaps are zero filled, now we copy i
-		 * the bitmap node pointers from the old journal bitmap
-		 * structs, and then transfer the new data structures
-		 * into the journal struct.
-		 *
-		 * using the copy_size var below allows this code to work for
-		 * both shrinking and expanding the FS.
-		 */
-		copy_size = min(bmap_nr_new, bmap_nr);
-		copy_size =
-		    copy_size * sizeof(struct reiserfs_list_bitmap_node *);
-		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
-			struct reiserfs_bitmap_node **node_tmp;
-			jb = SB_JOURNAL(s)->j_list_bitmap + i;
-			memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
-
-			/*
-			 * just in case vfree schedules on us, copy the new
-			 * pointer into the journal struct before freeing the
-			 * old one
-			 */
-			node_tmp = jb->bitmaps;
-			jb->bitmaps = jbitmap[i].bitmaps;
-			vfree(node_tmp);
-		}
-
-		/*
-		 * allocate additional bitmap blocks, reallocate
-		 * array of bitmap block pointers
-		 */
-		bitmap =
-		    vzalloc(array_size(bmap_nr_new,
-				       sizeof(struct reiserfs_bitmap_info)));
-		if (!bitmap) {
-			/*
-			 * Journal bitmaps are still supersized, but the
-			 * memory isn't leaked, so I guess it's ok
-			 */
-			printk("reiserfs_resize: unable to allocate memory.\n");
-			return -ENOMEM;
-		}
-		for (i = 0; i < bmap_nr; i++)
-			bitmap[i] = old_bitmap[i];
-
-		/*
-		 * This doesn't go through the journal, but it doesn't have to.
-		 * The changes are still atomic: We're synced up when the
-		 * journal transaction begins, and the new bitmaps don't
-		 * matter if the transaction fails.
-		 */
-		for (i = bmap_nr; i < bmap_nr_new; i++) {
-			int depth;
-			/*
-			 * don't use read_bitmap_block since it will cache
-			 * the uninitialized bitmap
-			 */
-			depth = reiserfs_write_unlock_nested(s);
-			bh = sb_bread(s, i * s->s_blocksize * 8);
-			reiserfs_write_lock_nested(s, depth);
-			if (!bh) {
-				vfree(bitmap);
-				return -EIO;
-			}
-			memset(bh->b_data, 0, sb_blocksize(sb));
-			reiserfs_set_le_bit(0, bh->b_data);
-			reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
-
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-			depth = reiserfs_write_unlock_nested(s);
-			sync_dirty_buffer(bh);
-			reiserfs_write_lock_nested(s, depth);
-			/* update bitmap_info stuff */
-			bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
-			brelse(bh);
-		}
-		/* free old bitmap blocks array */
-		SB_AP_BITMAP(s) = bitmap;
-		vfree(old_bitmap);
-	}
-
-	/*
-	 * begin transaction, if there was an error, it's fine. Yes, we have
-	 * incorrect bitmaps now, but none of it is ever going to touch the
-	 * disk anyway.
-	 */
-	err = journal_begin(&th, s, 10);
-	if (err)
-		return err;
-
-	/* Extend old last bitmap block - new blocks have been made available */
-	info = SB_AP_BITMAP(s) + bmap_nr - 1;
-	bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
-	if (!bh) {
-		int jerr = journal_end(&th);
-		if (jerr)
-			return jerr;
-		return -EIO;
-	}
-
-	reiserfs_prepare_for_journal(s, bh, 1);
-	for (i = block_r; i < s->s_blocksize * 8; i++)
-		reiserfs_clear_le_bit(i, bh->b_data);
-	info->free_count += s->s_blocksize * 8 - block_r;
-
-	journal_mark_dirty(&th, bh);
-	brelse(bh);
-
-	/* Correct new last bitmap block - It may not be full */
-	info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
-	bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
-	if (!bh) {
-		int jerr = journal_end(&th);
-		if (jerr)
-			return jerr;
-		return -EIO;
-	}
-
-	reiserfs_prepare_for_journal(s, bh, 1);
-	for (i = block_r_new; i < s->s_blocksize * 8; i++)
-		reiserfs_set_le_bit(i, bh->b_data);
-	journal_mark_dirty(&th, bh);
-	brelse(bh);
-
-	info->free_count -= s->s_blocksize * 8 - block_r_new;
-	/* update super */
-	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-	free_blocks = SB_FREE_BLOCKS(s);
-	PUT_SB_FREE_BLOCKS(s,
-			   free_blocks + (block_count_new - block_count -
-					  (bmap_nr_new - bmap_nr)));
-	PUT_SB_BLOCK_COUNT(s, block_count_new);
-	PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
-
-	journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-
-	SB_JOURNAL(s)->j_must_wait = 1;
-	return journal_end(&th);
-}
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
deleted file mode 100644
index 5faf702f8d15..000000000000
--- a/fs/reiserfs/stree.c
+++ /dev/null
@@ -1,2280 +0,0 @@
-/*
- *  Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- *  Written by Anatoly P. Pinchuk pap@namesys.botik.ru
- *  Programm System Institute
- *  Pereslavl-Zalessky Russia
- */
-
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/pagemap.h>
-#include <linux/bio.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-#include <linux/quotaops.h>
-
-/* Does the buffer contain a disk block which is in the tree. */
-inline int B_IS_IN_TREE(const struct buffer_head *bh)
-{
-
-	RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
-	       "PAP-1010: block (%b) has too big level (%z)", bh, bh);
-
-	return (B_LEVEL(bh) != FREE_LEVEL);
-}
-
-/* to get item head in le form */
-inline void copy_item_head(struct item_head *to,
-			   const struct item_head *from)
-{
-	memcpy(to, from, IH_SIZE);
-}
-
-/*
- * k1 is pointer to on-disk structure which is stored in little-endian
- * form. k2 is pointer to cpu variable. For key of items of the same
- * object this returns 0.
- * Returns: -1 if key1 < key2
- * 0 if key1 == key2
- * 1 if key1 > key2
- */
-inline int comp_short_keys(const struct reiserfs_key *le_key,
-			   const struct cpu_key *cpu_key)
-{
-	__u32 n;
-	n = le32_to_cpu(le_key->k_dir_id);
-	if (n < cpu_key->on_disk_key.k_dir_id)
-		return -1;
-	if (n > cpu_key->on_disk_key.k_dir_id)
-		return 1;
-	n = le32_to_cpu(le_key->k_objectid);
-	if (n < cpu_key->on_disk_key.k_objectid)
-		return -1;
-	if (n > cpu_key->on_disk_key.k_objectid)
-		return 1;
-	return 0;
-}
-
-/*
- * k1 is pointer to on-disk structure which is stored in little-endian
- * form. k2 is pointer to cpu variable.
- * Compare keys using all 4 key fields.
- * Returns: -1 if key1 < key2 0
- * if key1 = key2 1 if key1 > key2
- */
-static inline int comp_keys(const struct reiserfs_key *le_key,
-			    const struct cpu_key *cpu_key)
-{
-	int retval;
-
-	retval = comp_short_keys(le_key, cpu_key);
-	if (retval)
-		return retval;
-	if (le_key_k_offset(le_key_version(le_key), le_key) <
-	    cpu_key_k_offset(cpu_key))
-		return -1;
-	if (le_key_k_offset(le_key_version(le_key), le_key) >
-	    cpu_key_k_offset(cpu_key))
-		return 1;
-
-	if (cpu_key->key_length == 3)
-		return 0;
-
-	/* this part is needed only when tail conversion is in progress */
-	if (le_key_k_type(le_key_version(le_key), le_key) <
-	    cpu_key_k_type(cpu_key))
-		return -1;
-
-	if (le_key_k_type(le_key_version(le_key), le_key) >
-	    cpu_key_k_type(cpu_key))
-		return 1;
-
-	return 0;
-}
-
-inline int comp_short_le_keys(const struct reiserfs_key *key1,
-			      const struct reiserfs_key *key2)
-{
-	__u32 *k1_u32, *k2_u32;
-	int key_length = REISERFS_SHORT_KEY_LEN;
-
-	k1_u32 = (__u32 *) key1;
-	k2_u32 = (__u32 *) key2;
-	for (; key_length--; ++k1_u32, ++k2_u32) {
-		if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
-			return -1;
-		if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
-			return 1;
-	}
-	return 0;
-}
-
-inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
-{
-	int version;
-	to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
-	to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
-
-	/* find out version of the key */
-	version = le_key_version(from);
-	to->version = version;
-	to->on_disk_key.k_offset = le_key_k_offset(version, from);
-	to->on_disk_key.k_type = le_key_k_type(version, from);
-}
-
-/*
- * this does not say which one is bigger, it only returns 1 if keys
- * are not equal, 0 otherwise
- */
-inline int comp_le_keys(const struct reiserfs_key *k1,
-			const struct reiserfs_key *k2)
-{
-	return memcmp(k1, k2, sizeof(struct reiserfs_key));
-}
-
-/**************************************************************************
- *  Binary search toolkit function                                        *
- *  Search for an item in the array by the item key                       *
- *  Returns:    1 if found,  0 if not found;                              *
- *        *pos = number of the searched element if found, else the        *
- *        number of the first element that is larger than key.            *
- **************************************************************************/
-/*
- * For those not familiar with binary search: lbound is the leftmost item
- * that it could be, rbound the rightmost item that it could be.  We examine
- * the item halfway between lbound and rbound, and that tells us either
- * that we can increase lbound, or decrease rbound, or that we have found it,
- * or if lbound <= rbound that there are no possible items, and we have not
- * found it. With each examination we cut the number of possible items it
- * could be by one more than half rounded down, or we find it.
- */
-static inline int bin_search(const void *key,	/* Key to search for. */
-			     const void *base,	/* First item in the array. */
-			     int num,	/* Number of items in the array. */
-			     /*
-			      * Item size in the array.  searched. Lest the
-			      * reader be confused, note that this is crafted
-			      * as a general function, and when it is applied
-			      * specifically to the array of item headers in a
-			      * node, width is actually the item header size
-			      * not the item size.
-			      */
-			     int width,
-			     int *pos /* Number of the searched for element. */
-    )
-{
-	int rbound, lbound, j;
-
-	for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
-	     lbound <= rbound; j = (rbound + lbound) / 2)
-		switch (comp_keys
-			((struct reiserfs_key *)((char *)base + j * width),
-			 (struct cpu_key *)key)) {
-		case -1:
-			lbound = j + 1;
-			continue;
-		case 1:
-			rbound = j - 1;
-			continue;
-		case 0:
-			*pos = j;
-			return ITEM_FOUND;	/* Key found in the array.  */
-		}
-
-	/*
-	 * bin_search did not find given key, it returns position of key,
-	 * that is minimal and greater than the given one.
-	 */
-	*pos = lbound;
-	return ITEM_NOT_FOUND;
-}
-
-
-/* Minimal possible key. It is never in the tree. */
-const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
-
-/* Maximal possible key. It is never in the tree. */
-static const struct reiserfs_key MAX_KEY = {
-	cpu_to_le32(0xffffffff),
-	cpu_to_le32(0xffffffff),
-	{{cpu_to_le32(0xffffffff),
-	  cpu_to_le32(0xffffffff)},}
-};
-
-/*
- * Get delimiting key of the buffer by looking for it in the buffers in the
- * path, starting from the bottom of the path, and going upwards.  We must
- * check the path's validity at each step.  If the key is not in the path,
- * there is no delimiting key in the tree (buffer is first or last buffer
- * in tree), and in this case we return a special key, either MIN_KEY or
- * MAX_KEY.
- */
-static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
-						  const struct super_block *sb)
-{
-	int position, path_offset = chk_path->path_length;
-	struct buffer_head *parent;
-
-	RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
-	       "PAP-5010: invalid offset in the path");
-
-	/* While not higher in path than first element. */
-	while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
-
-		RFALSE(!buffer_uptodate
-		       (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
-		       "PAP-5020: parent is not uptodate");
-
-		/* Parent at the path is not in the tree now. */
-		if (!B_IS_IN_TREE
-		    (parent =
-		     PATH_OFFSET_PBUFFER(chk_path, path_offset)))
-			return &MAX_KEY;
-		/* Check whether position in the parent is correct. */
-		if ((position =
-		     PATH_OFFSET_POSITION(chk_path,
-					  path_offset)) >
-		    B_NR_ITEMS(parent))
-			return &MAX_KEY;
-		/* Check whether parent at the path really points to the child. */
-		if (B_N_CHILD_NUM(parent, position) !=
-		    PATH_OFFSET_PBUFFER(chk_path,
-					path_offset + 1)->b_blocknr)
-			return &MAX_KEY;
-		/*
-		 * Return delimiting key if position in the parent
-		 * is not equal to zero.
-		 */
-		if (position)
-			return internal_key(parent, position - 1);
-	}
-	/* Return MIN_KEY if we are in the root of the buffer tree. */
-	if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
-	    b_blocknr == SB_ROOT_BLOCK(sb))
-		return &MIN_KEY;
-	return &MAX_KEY;
-}
-
-/* Get delimiting key of the buffer at the path and its right neighbor. */
-inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
-					   const struct super_block *sb)
-{
-	int position, path_offset = chk_path->path_length;
-	struct buffer_head *parent;
-
-	RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
-	       "PAP-5030: invalid offset in the path");
-
-	while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
-
-		RFALSE(!buffer_uptodate
-		       (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
-		       "PAP-5040: parent is not uptodate");
-
-		/* Parent at the path is not in the tree now. */
-		if (!B_IS_IN_TREE
-		    (parent =
-		     PATH_OFFSET_PBUFFER(chk_path, path_offset)))
-			return &MIN_KEY;
-		/* Check whether position in the parent is correct. */
-		if ((position =
-		     PATH_OFFSET_POSITION(chk_path,
-					  path_offset)) >
-		    B_NR_ITEMS(parent))
-			return &MIN_KEY;
-		/*
-		 * Check whether parent at the path really points
-		 * to the child.
-		 */
-		if (B_N_CHILD_NUM(parent, position) !=
-		    PATH_OFFSET_PBUFFER(chk_path,
-					path_offset + 1)->b_blocknr)
-			return &MIN_KEY;
-
-		/*
-		 * Return delimiting key if position in the parent
-		 * is not the last one.
-		 */
-		if (position != B_NR_ITEMS(parent))
-			return internal_key(parent, position);
-	}
-
-	/* Return MAX_KEY if we are in the root of the buffer tree. */
-	if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
-	    b_blocknr == SB_ROOT_BLOCK(sb))
-		return &MAX_KEY;
-	return &MIN_KEY;
-}
-
-/*
- * Check whether a key is contained in the tree rooted from a buffer at a path.
- * This works by looking at the left and right delimiting keys for the buffer
- * in the last path_element in the path.  These delimiting keys are stored
- * at least one level above that buffer in the tree. If the buffer is the
- * first or last node in the tree order then one of the delimiting keys may
- * be absent, and in this case get_lkey and get_rkey return a special key
- * which is MIN_KEY or MAX_KEY.
- */
-static inline int key_in_buffer(
-				/* Path which should be checked. */
-				struct treepath *chk_path,
-				/* Key which should be checked. */
-				const struct cpu_key *key,
-				struct super_block *sb
-    )
-{
-
-	RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
-	       || chk_path->path_length > MAX_HEIGHT,
-	       "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
-	       key, chk_path->path_length);
-	RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
-	       "PAP-5060: device must not be NODEV");
-
-	if (comp_keys(get_lkey(chk_path, sb), key) == 1)
-		/* left delimiting key is bigger, that the key we look for */
-		return 0;
-	/*  if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
-	if (comp_keys(get_rkey(chk_path, sb), key) != 1)
-		/* key must be less than right delimitiing key */
-		return 0;
-	return 1;
-}
-
-int reiserfs_check_path(struct treepath *p)
-{
-	RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
-	       "path not properly relsed");
-	return 0;
-}
-
-/*
- * Drop the reference to each buffer in a path and restore
- * dirty bits clean when preparing the buffer for the log.
- * This version should only be called from fix_nodes()
- */
-void pathrelse_and_restore(struct super_block *sb,
-			   struct treepath *search_path)
-{
-	int path_offset = search_path->path_length;
-
-	RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
-	       "clm-4000: invalid path offset");
-
-	while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
-		struct buffer_head *bh;
-		bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
-		reiserfs_restore_prepared_buffer(sb, bh);
-		brelse(bh);
-	}
-	search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
-
-/* Drop the reference to each buffer in a path */
-void pathrelse(struct treepath *search_path)
-{
-	int path_offset = search_path->path_length;
-
-	RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
-	       "PAP-5090: invalid path offset");
-
-	while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
-		brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
-
-	search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
-
-static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih)
-{
-	struct reiserfs_de_head *deh;
-	int i;
-
-	deh = B_I_DEH(bh, ih);
-	for (i = 0; i < ih_entry_count(ih); i++) {
-		if (deh_location(&deh[i]) > ih_item_len(ih)) {
-			reiserfs_warning(NULL, "reiserfs-5094",
-					 "directory entry location seems wrong %h",
-					 &deh[i]);
-			return 0;
-		}
-	}
-
-	return 1;
-}
-
-static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
-{
-	struct block_head *blkh;
-	struct item_head *ih;
-	int used_space;
-	int prev_location;
-	int i;
-	int nr;
-
-	blkh = (struct block_head *)buf;
-	if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
-		reiserfs_warning(NULL, "reiserfs-5080",
-				 "this should be caught earlier");
-		return 0;
-	}
-
-	nr = blkh_nr_item(blkh);
-	if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
-		/* item number is too big or too small */
-		reiserfs_warning(NULL, "reiserfs-5081",
-				 "nr_item seems wrong: %z", bh);
-		return 0;
-	}
-	ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
-	used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
-
-	/* free space does not match to calculated amount of use space */
-	if (used_space != blocksize - blkh_free_space(blkh)) {
-		reiserfs_warning(NULL, "reiserfs-5082",
-				 "free space seems wrong: %z", bh);
-		return 0;
-	}
-	/*
-	 * FIXME: it is_leaf will hit performance too much - we may have
-	 * return 1 here
-	 */
-
-	/* check tables of item heads */
-	ih = (struct item_head *)(buf + BLKH_SIZE);
-	prev_location = blocksize;
-	for (i = 0; i < nr; i++, ih++) {
-		if (le_ih_k_type(ih) == TYPE_ANY) {
-			reiserfs_warning(NULL, "reiserfs-5083",
-					 "wrong item type for item %h",
-					 ih);
-			return 0;
-		}
-		if (ih_location(ih) >= blocksize
-		    || ih_location(ih) < IH_SIZE * nr) {
-			reiserfs_warning(NULL, "reiserfs-5084",
-					 "item location seems wrong: %h",
-					 ih);
-			return 0;
-		}
-		if (ih_item_len(ih) < 1
-		    || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
-			reiserfs_warning(NULL, "reiserfs-5085",
-					 "item length seems wrong: %h",
-					 ih);
-			return 0;
-		}
-		if (prev_location - ih_location(ih) != ih_item_len(ih)) {
-			reiserfs_warning(NULL, "reiserfs-5086",
-					 "item location seems wrong "
-					 "(second one): %h", ih);
-			return 0;
-		}
-		if (is_direntry_le_ih(ih)) {
-			if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) {
-				reiserfs_warning(NULL, "reiserfs-5093",
-						 "item entry count seems wrong %h",
-						 ih);
-				return 0;
-			}
-			return has_valid_deh_location(bh, ih);
-		}
-		prev_location = ih_location(ih);
-	}
-
-	/* one may imagine many more checks */
-	return 1;
-}
-
-/* returns 1 if buf looks like an internal node, 0 otherwise */
-static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
-{
-	struct block_head *blkh;
-	int nr;
-	int used_space;
-
-	blkh = (struct block_head *)buf;
-	nr = blkh_level(blkh);
-	if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
-		/* this level is not possible for internal nodes */
-		reiserfs_warning(NULL, "reiserfs-5087",
-				 "this should be caught earlier");
-		return 0;
-	}
-
-	nr = blkh_nr_item(blkh);
-	/* for internal which is not root we might check min number of keys */
-	if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
-		reiserfs_warning(NULL, "reiserfs-5088",
-				 "number of key seems wrong: %z", bh);
-		return 0;
-	}
-
-	used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
-	if (used_space != blocksize - blkh_free_space(blkh)) {
-		reiserfs_warning(NULL, "reiserfs-5089",
-				 "free space seems wrong: %z", bh);
-		return 0;
-	}
-
-	/* one may imagine many more checks */
-	return 1;
-}
-
-/*
- * make sure that bh contains formatted node of reiserfs tree of
- * 'level'-th level
- */
-static int is_tree_node(struct buffer_head *bh, int level)
-{
-	if (B_LEVEL(bh) != level) {
-		reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
-				 "not match to the expected one %d",
-				 B_LEVEL(bh), level);
-		return 0;
-	}
-	if (level == DISK_LEAF_NODE_LEVEL)
-		return is_leaf(bh->b_data, bh->b_size, bh);
-
-	return is_internal(bh->b_data, bh->b_size, bh);
-}
-
-#define SEARCH_BY_KEY_READA 16
-
-/*
- * The function is NOT SCHEDULE-SAFE!
- * It might unlock the write lock if we needed to wait for a block
- * to be read. Note that in this case it won't recover the lock to avoid
- * high contention resulting from too much lock requests, especially
- * the caller (search_by_key) will perform other schedule-unsafe
- * operations just after calling this function.
- *
- * @return depth of lock to be restored after read completes
- */
-static int search_by_key_reada(struct super_block *s,
-				struct buffer_head **bh,
-				b_blocknr_t *b, int num)
-{
-	int i, j;
-	int depth = -1;
-
-	for (i = 0; i < num; i++) {
-		bh[i] = sb_getblk(s, b[i]);
-	}
-	/*
-	 * We are going to read some blocks on which we
-	 * have a reference. It's safe, though we might be
-	 * reading blocks concurrently changed if we release
-	 * the lock. But it's still fine because we check later
-	 * if the tree changed
-	 */
-	for (j = 0; j < i; j++) {
-		/*
-		 * note, this needs attention if we are getting rid of the BKL
-		 * you have to make sure the prepared bit isn't set on this
-		 * buffer
-		 */
-		if (!buffer_uptodate(bh[j])) {
-			if (depth == -1)
-				depth = reiserfs_write_unlock_nested(s);
-			bh_readahead(bh[j], REQ_RAHEAD);
-		}
-		brelse(bh[j]);
-	}
-	return depth;
-}
-
-/*
- * This function fills up the path from the root to the leaf as it
- * descends the tree looking for the key.  It uses reiserfs_bread to
- * try to find buffers in the cache given their block number.  If it
- * does not find them in the cache it reads them from disk.  For each
- * node search_by_key finds using reiserfs_bread it then uses
- * bin_search to look through that node.  bin_search will find the
- * position of the block_number of the next node if it is looking
- * through an internal node.  If it is looking through a leaf node
- * bin_search will find the position of the item which has key either
- * equal to given key, or which is the maximal key less than the given
- * key.  search_by_key returns a path that must be checked for the
- * correctness of the top of the path but need not be checked for the
- * correctness of the bottom of the path
- */
-/*
- * search_by_key - search for key (and item) in stree
- * @sb: superblock
- * @key: pointer to key to search for
- * @search_path: Allocated and initialized struct treepath; Returned filled
- *		 on success.
- * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to
- *		stop at leaf level.
- *
- * The function is NOT SCHEDULE-SAFE!
- */
-int search_by_key(struct super_block *sb, const struct cpu_key *key,
-		  struct treepath *search_path, int stop_level)
-{
-	b_blocknr_t block_number;
-	int expected_level;
-	struct buffer_head *bh;
-	struct path_element *last_element;
-	int node_level, retval;
-	int fs_gen;
-	struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
-	b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
-	int reada_count = 0;
-
-#ifdef CONFIG_REISERFS_CHECK
-	int repeat_counter = 0;
-#endif
-
-	PROC_INFO_INC(sb, search_by_key);
-
-	/*
-	 * As we add each node to a path we increase its count.  This means
-	 * that we must be careful to release all nodes in a path before we
-	 * either discard the path struct or re-use the path struct, as we
-	 * do here.
-	 */
-
-	pathrelse(search_path);
-
-	/*
-	 * With each iteration of this loop we search through the items in the
-	 * current node, and calculate the next current node(next path element)
-	 * for the next iteration of this loop..
-	 */
-	block_number = SB_ROOT_BLOCK(sb);
-	expected_level = -1;
-	while (1) {
-
-#ifdef CONFIG_REISERFS_CHECK
-		if (!(++repeat_counter % 50000))
-			reiserfs_warning(sb, "PAP-5100",
-					 "%s: there were %d iterations of "
-					 "while loop looking for key %K",
-					 current->comm, repeat_counter,
-					 key);
-#endif
-
-		/* prep path to have another element added to it. */
-		last_element =
-		    PATH_OFFSET_PELEMENT(search_path,
-					 ++search_path->path_length);
-		fs_gen = get_generation(sb);
-
-		/*
-		 * Read the next tree node, and set the last element
-		 * in the path to have a pointer to it.
-		 */
-		if ((bh = last_element->pe_buffer =
-		     sb_getblk(sb, block_number))) {
-
-			/*
-			 * We'll need to drop the lock if we encounter any
-			 * buffers that need to be read. If all of them are
-			 * already up to date, we don't need to drop the lock.
-			 */
-			int depth = -1;
-
-			if (!buffer_uptodate(bh) && reada_count > 1)
-				depth = search_by_key_reada(sb, reada_bh,
-						    reada_blocks, reada_count);
-
-			if (!buffer_uptodate(bh) && depth == -1)
-				depth = reiserfs_write_unlock_nested(sb);
-
-			bh_read_nowait(bh, 0);
-			wait_on_buffer(bh);
-
-			if (depth != -1)
-				reiserfs_write_lock_nested(sb, depth);
-			if (!buffer_uptodate(bh))
-				goto io_error;
-		} else {
-io_error:
-			search_path->path_length--;
-			pathrelse(search_path);
-			return IO_ERROR;
-		}
-		reada_count = 0;
-		if (expected_level == -1)
-			expected_level = SB_TREE_HEIGHT(sb);
-		expected_level--;
-
-		/*
-		 * It is possible that schedule occurred. We must check
-		 * whether the key to search is still in the tree rooted
-		 * from the current buffer. If not then repeat search
-		 * from the root.
-		 */
-		if (fs_changed(fs_gen, sb) &&
-		    (!B_IS_IN_TREE(bh) ||
-		     B_LEVEL(bh) != expected_level ||
-		     !key_in_buffer(search_path, key, sb))) {
-			PROC_INFO_INC(sb, search_by_key_fs_changed);
-			PROC_INFO_INC(sb, search_by_key_restarted);
-			PROC_INFO_INC(sb,
-				      sbk_restarted[expected_level - 1]);
-			pathrelse(search_path);
-
-			/*
-			 * Get the root block number so that we can
-			 * repeat the search starting from the root.
-			 */
-			block_number = SB_ROOT_BLOCK(sb);
-			expected_level = -1;
-
-			/* repeat search from the root */
-			continue;
-		}
-
-		/*
-		 * only check that the key is in the buffer if key is not
-		 * equal to the MAX_KEY. Latter case is only possible in
-		 * "finish_unfinished()" processing during mount.
-		 */
-		RFALSE(comp_keys(&MAX_KEY, key) &&
-		       !key_in_buffer(search_path, key, sb),
-		       "PAP-5130: key is not in the buffer");
-#ifdef CONFIG_REISERFS_CHECK
-		if (REISERFS_SB(sb)->cur_tb) {
-			print_cur_tb("5140");
-			reiserfs_panic(sb, "PAP-5140",
-				       "schedule occurred in do_balance!");
-		}
-#endif
-
-		/*
-		 * make sure, that the node contents look like a node of
-		 * certain level
-		 */
-		if (!is_tree_node(bh, expected_level)) {
-			reiserfs_error(sb, "vs-5150",
-				       "invalid format found in block %ld. "
-				       "Fsck?", bh->b_blocknr);
-			pathrelse(search_path);
-			return IO_ERROR;
-		}
-
-		/* ok, we have acquired next formatted node in the tree */
-		node_level = B_LEVEL(bh);
-
-		PROC_INFO_BH_STAT(sb, bh, node_level - 1);
-
-		RFALSE(node_level < stop_level,
-		       "vs-5152: tree level (%d) is less than stop level (%d)",
-		       node_level, stop_level);
-
-		retval = bin_search(key, item_head(bh, 0),
-				      B_NR_ITEMS(bh),
-				      (node_level ==
-				       DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
-				      KEY_SIZE,
-				      &last_element->pe_position);
-		if (node_level == stop_level) {
-			return retval;
-		}
-
-		/* we are not in the stop level */
-		/*
-		 * item has been found, so we choose the pointer which
-		 * is to the right of the found one
-		 */
-		if (retval == ITEM_FOUND)
-			last_element->pe_position++;
-
-		/*
-		 * if item was not found we choose the position which is to
-		 * the left of the found item. This requires no code,
-		 * bin_search did it already.
-		 */
-
-		/*
-		 * So we have chosen a position in the current node which is
-		 * an internal node.  Now we calculate child block number by
-		 * position in the node.
-		 */
-		block_number =
-		    B_N_CHILD_NUM(bh, last_element->pe_position);
-
-		/*
-		 * if we are going to read leaf nodes, try for read
-		 * ahead as well
-		 */
-		if ((search_path->reada & PATH_READA) &&
-		    node_level == DISK_LEAF_NODE_LEVEL + 1) {
-			int pos = last_element->pe_position;
-			int limit = B_NR_ITEMS(bh);
-			struct reiserfs_key *le_key;
-
-			if (search_path->reada & PATH_READA_BACK)
-				limit = 0;
-			while (reada_count < SEARCH_BY_KEY_READA) {
-				if (pos == limit)
-					break;
-				reada_blocks[reada_count++] =
-				    B_N_CHILD_NUM(bh, pos);
-				if (search_path->reada & PATH_READA_BACK)
-					pos--;
-				else
-					pos++;
-
-				/*
-				 * check to make sure we're in the same object
-				 */
-				le_key = internal_key(bh, pos);
-				if (le32_to_cpu(le_key->k_objectid) !=
-				    key->on_disk_key.k_objectid) {
-					break;
-				}
-			}
-		}
-	}
-}
-
-/*
- * Form the path to an item and position in this item which contains
- * file byte defined by key. If there is no such item
- * corresponding to the key, we point the path to the item with
- * maximal key less than key, and *pos_in_item is set to one
- * past the last entry/byte in the item.  If searching for entry in a
- * directory item, and it is not found, *pos_in_item is set to one
- * entry more than the entry with maximal key which is less than the
- * sought key.
- *
- * Note that if there is no entry in this same node which is one more,
- * then we point to an imaginary entry.  for direct items, the
- * position is in units of bytes, for indirect items the position is
- * in units of blocknr entries, for directory items the position is in
- * units of directory entries.
- */
-/* The function is NOT SCHEDULE-SAFE! */
-int search_for_position_by_key(struct super_block *sb,
-			       /* Key to search (cpu variable) */
-			       const struct cpu_key *p_cpu_key,
-			       /* Filled up by this function. */
-			       struct treepath *search_path)
-{
-	struct item_head *p_le_ih;	/* pointer to on-disk structure */
-	int blk_size;
-	loff_t item_offset, offset;
-	struct reiserfs_dir_entry de;
-	int retval;
-
-	/* If searching for directory entry. */
-	if (is_direntry_cpu_key(p_cpu_key))
-		return search_by_entry_key(sb, p_cpu_key, search_path,
-					   &de);
-
-	/* If not searching for directory entry. */
-
-	/* If item is found. */
-	retval = search_item(sb, p_cpu_key, search_path);
-	if (retval == IO_ERROR)
-		return retval;
-	if (retval == ITEM_FOUND) {
-
-		RFALSE(!ih_item_len
-		       (item_head
-			(PATH_PLAST_BUFFER(search_path),
-			 PATH_LAST_POSITION(search_path))),
-		       "PAP-5165: item length equals zero");
-
-		pos_in_item(search_path) = 0;
-		return POSITION_FOUND;
-	}
-
-	RFALSE(!PATH_LAST_POSITION(search_path),
-	       "PAP-5170: position equals zero");
-
-	/* Item is not found. Set path to the previous item. */
-	p_le_ih =
-	    item_head(PATH_PLAST_BUFFER(search_path),
-			   --PATH_LAST_POSITION(search_path));
-	blk_size = sb->s_blocksize;
-
-	if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
-		return FILE_NOT_FOUND;
-
-	/* FIXME: quite ugly this far */
-
-	item_offset = le_ih_k_offset(p_le_ih);
-	offset = cpu_key_k_offset(p_cpu_key);
-
-	/* Needed byte is contained in the item pointed to by the path. */
-	if (item_offset <= offset &&
-	    item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
-		pos_in_item(search_path) = offset - item_offset;
-		if (is_indirect_le_ih(p_le_ih)) {
-			pos_in_item(search_path) /= blk_size;
-		}
-		return POSITION_FOUND;
-	}
-
-	/*
-	 * Needed byte is not contained in the item pointed to by the
-	 * path. Set pos_in_item out of the item.
-	 */
-	if (is_indirect_le_ih(p_le_ih))
-		pos_in_item(search_path) =
-		    ih_item_len(p_le_ih) / UNFM_P_SIZE;
-	else
-		pos_in_item(search_path) = ih_item_len(p_le_ih);
-
-	return POSITION_NOT_FOUND;
-}
-
-/* Compare given item and item pointed to by the path. */
-int comp_items(const struct item_head *stored_ih, const struct treepath *path)
-{
-	struct buffer_head *bh = PATH_PLAST_BUFFER(path);
-	struct item_head *ih;
-
-	/* Last buffer at the path is not in the tree. */
-	if (!B_IS_IN_TREE(bh))
-		return 1;
-
-	/* Last path position is invalid. */
-	if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
-		return 1;
-
-	/* we need only to know, whether it is the same item */
-	ih = tp_item_head(path);
-	return memcmp(stored_ih, ih, IH_SIZE);
-}
-
-/* prepare for delete or cut of direct item */
-static inline int prepare_for_direct_item(struct treepath *path,
-					  struct item_head *le_ih,
-					  struct inode *inode,
-					  loff_t new_file_length, int *cut_size)
-{
-	loff_t round_len;
-
-	if (new_file_length == max_reiserfs_offset(inode)) {
-		/* item has to be deleted */
-		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-		return M_DELETE;
-	}
-	/* new file gets truncated */
-	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
-		round_len = ROUND_UP(new_file_length);
-		/* this was new_file_length < le_ih ... */
-		if (round_len < le_ih_k_offset(le_ih)) {
-			*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-			return M_DELETE;	/* Delete this item. */
-		}
-		/* Calculate first position and size for cutting from item. */
-		pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
-		*cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
-
-		return M_CUT;	/* Cut from this item. */
-	}
-
-	/* old file: items may have any length */
-
-	if (new_file_length < le_ih_k_offset(le_ih)) {
-		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-		return M_DELETE;	/* Delete this item. */
-	}
-
-	/* Calculate first position and size for cutting from item. */
-	*cut_size = -(ih_item_len(le_ih) -
-		      (pos_in_item(path) =
-		       new_file_length + 1 - le_ih_k_offset(le_ih)));
-	return M_CUT;		/* Cut from this item. */
-}
-
-static inline int prepare_for_direntry_item(struct treepath *path,
-					    struct item_head *le_ih,
-					    struct inode *inode,
-					    loff_t new_file_length,
-					    int *cut_size)
-{
-	if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
-	    new_file_length == max_reiserfs_offset(inode)) {
-		RFALSE(ih_entry_count(le_ih) != 2,
-		       "PAP-5220: incorrect empty directory item (%h)", le_ih);
-		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-		/* Delete the directory item containing "." and ".." entry. */
-		return M_DELETE;
-	}
-
-	if (ih_entry_count(le_ih) == 1) {
-		/*
-		 * Delete the directory item such as there is one record only
-		 * in this item
-		 */
-		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-		return M_DELETE;
-	}
-
-	/* Cut one record from the directory item. */
-	*cut_size =
-	    -(DEH_SIZE +
-	      entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
-	return M_CUT;
-}
-
-#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
-
-/*
- * If the path points to a directory or direct item, calculate mode
- * and the size cut, for balance.
- * If the path points to an indirect item, remove some number of its
- * unformatted nodes.
- * In case of file truncate calculate whether this item must be
- * deleted/truncated or last unformatted node of this item will be
- * converted to a direct item.
- * This function returns a determination of what balance mode the
- * calling function should employ.
- */
-static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
-				      struct inode *inode,
-				      struct treepath *path,
-				      const struct cpu_key *item_key,
-				      /*
-				       * Number of unformatted nodes
-				       * which were removed from end
-				       * of the file.
-				       */
-				      int *removed,
-				      int *cut_size,
-				      /* MAX_KEY_OFFSET in case of delete. */
-				      unsigned long long new_file_length
-    )
-{
-	struct super_block *sb = inode->i_sb;
-	struct item_head *p_le_ih = tp_item_head(path);
-	struct buffer_head *bh = PATH_PLAST_BUFFER(path);
-
-	BUG_ON(!th->t_trans_id);
-
-	/* Stat_data item. */
-	if (is_statdata_le_ih(p_le_ih)) {
-
-		RFALSE(new_file_length != max_reiserfs_offset(inode),
-		       "PAP-5210: mode must be M_DELETE");
-
-		*cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
-		return M_DELETE;
-	}
-
-	/* Directory item. */
-	if (is_direntry_le_ih(p_le_ih))
-		return prepare_for_direntry_item(path, p_le_ih, inode,
-						 new_file_length,
-						 cut_size);
-
-	/* Direct item. */
-	if (is_direct_le_ih(p_le_ih))
-		return prepare_for_direct_item(path, p_le_ih, inode,
-					       new_file_length, cut_size);
-
-	/* Case of an indirect item. */
-	{
-	    int blk_size = sb->s_blocksize;
-	    struct item_head s_ih;
-	    int need_re_search;
-	    int delete = 0;
-	    int result = M_CUT;
-	    int pos = 0;
-
-	    if ( new_file_length == max_reiserfs_offset (inode) ) {
-		/*
-		 * prepare_for_delete_or_cut() is called by
-		 * reiserfs_delete_item()
-		 */
-		new_file_length = 0;
-		delete = 1;
-	    }
-
-	    do {
-		need_re_search = 0;
-		*cut_size = 0;
-		bh = PATH_PLAST_BUFFER(path);
-		copy_item_head(&s_ih, tp_item_head(path));
-		pos = I_UNFM_NUM(&s_ih);
-
-		while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
-		    __le32 *unfm;
-		    __u32 block;
-
-		    /*
-		     * Each unformatted block deletion may involve
-		     * one additional bitmap block into the transaction,
-		     * thereby the initial journal space reservation
-		     * might not be enough.
-		     */
-		    if (!delete && (*cut_size) != 0 &&
-			reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
-			break;
-
-		    unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
-		    block = get_block_num(unfm, 0);
-
-		    if (block != 0) {
-			reiserfs_prepare_for_journal(sb, bh, 1);
-			put_block_num(unfm, 0, 0);
-			journal_mark_dirty(th, bh);
-			reiserfs_free_block(th, inode, block, 1);
-		    }
-
-		    reiserfs_cond_resched(sb);
-
-		    if (item_moved (&s_ih, path))  {
-			need_re_search = 1;
-			break;
-		    }
-
-		    pos --;
-		    (*removed)++;
-		    (*cut_size) -= UNFM_P_SIZE;
-
-		    if (pos == 0) {
-			(*cut_size) -= IH_SIZE;
-			result = M_DELETE;
-			break;
-		    }
-		}
-		/*
-		 * a trick.  If the buffer has been logged, this will
-		 * do nothing.  If we've broken the loop without logging
-		 * it, it will restore the buffer
-		 */
-		reiserfs_restore_prepared_buffer(sb, bh);
-	    } while (need_re_search &&
-		     search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
-	    pos_in_item(path) = pos * UNFM_P_SIZE;
-
-	    if (*cut_size == 0) {
-		/*
-		 * Nothing was cut. maybe convert last unformatted node to the
-		 * direct item?
-		 */
-		result = M_CONVERT;
-	    }
-	    return result;
-	}
-}
-
-/* Calculate number of bytes which will be deleted or cut during balance */
-static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
-{
-	int del_size;
-	struct item_head *p_le_ih = tp_item_head(tb->tb_path);
-
-	if (is_statdata_le_ih(p_le_ih))
-		return 0;
-
-	del_size =
-	    (mode ==
-	     M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
-	if (is_direntry_le_ih(p_le_ih)) {
-		/*
-		 * return EMPTY_DIR_SIZE; We delete emty directories only.
-		 * we can't use EMPTY_DIR_SIZE, as old format dirs have a
-		 * different empty size.  ick. FIXME, is this right?
-		 */
-		return del_size;
-	}
-
-	if (is_indirect_le_ih(p_le_ih))
-		del_size = (del_size / UNFM_P_SIZE) *
-				(PATH_PLAST_BUFFER(tb->tb_path)->b_size);
-	return del_size;
-}
-
-static void init_tb_struct(struct reiserfs_transaction_handle *th,
-			   struct tree_balance *tb,
-			   struct super_block *sb,
-			   struct treepath *path, int size)
-{
-
-	BUG_ON(!th->t_trans_id);
-
-	memset(tb, '\0', sizeof(struct tree_balance));
-	tb->transaction_handle = th;
-	tb->tb_sb = sb;
-	tb->tb_path = path;
-	PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
-	PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
-	tb->insert_size[0] = size;
-}
-
-void padd_item(char *item, int total_length, int length)
-{
-	int i;
-
-	for (i = total_length; i > length;)
-		item[--i] = 0;
-}
-
-#ifdef REISERQUOTA_DEBUG
-char key2type(struct reiserfs_key *ih)
-{
-	if (is_direntry_le_key(2, ih))
-		return 'd';
-	if (is_direct_le_key(2, ih))
-		return 'D';
-	if (is_indirect_le_key(2, ih))
-		return 'i';
-	if (is_statdata_le_key(2, ih))
-		return 's';
-	return 'u';
-}
-
-char head2type(struct item_head *ih)
-{
-	if (is_direntry_le_ih(ih))
-		return 'd';
-	if (is_direct_le_ih(ih))
-		return 'D';
-	if (is_indirect_le_ih(ih))
-		return 'i';
-	if (is_statdata_le_ih(ih))
-		return 's';
-	return 'u';
-}
-#endif
-
-/*
- * Delete object item.
- * th       - active transaction handle
- * path     - path to the deleted item
- * item_key - key to search for the deleted item
- * indode   - used for updating i_blocks and quotas
- * un_bh    - NULL or unformatted node pointer
- */
-int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
-			 struct treepath *path, const struct cpu_key *item_key,
-			 struct inode *inode, struct buffer_head *un_bh)
-{
-	struct super_block *sb = inode->i_sb;
-	struct tree_balance s_del_balance;
-	struct item_head s_ih;
-	struct item_head *q_ih;
-	int quota_cut_bytes;
-	int ret_value, del_size, removed;
-	int depth;
-
-#ifdef CONFIG_REISERFS_CHECK
-	char mode;
-#endif
-
-	BUG_ON(!th->t_trans_id);
-
-	init_tb_struct(th, &s_del_balance, sb, path,
-		       0 /*size is unknown */ );
-
-	while (1) {
-		removed = 0;
-
-#ifdef CONFIG_REISERFS_CHECK
-		mode =
-#endif
-		    prepare_for_delete_or_cut(th, inode, path,
-					      item_key, &removed,
-					      &del_size,
-					      max_reiserfs_offset(inode));
-
-		RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
-
-		copy_item_head(&s_ih, tp_item_head(path));
-		s_del_balance.insert_size[0] = del_size;
-
-		ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
-		if (ret_value != REPEAT_SEARCH)
-			break;
-
-		PROC_INFO_INC(sb, delete_item_restarted);
-
-		/* file system changed, repeat search */
-		ret_value =
-		    search_for_position_by_key(sb, item_key, path);
-		if (ret_value == IO_ERROR)
-			break;
-		if (ret_value == FILE_NOT_FOUND) {
-			reiserfs_warning(sb, "vs-5340",
-					 "no items of the file %K found",
-					 item_key);
-			break;
-		}
-	}			/* while (1) */
-
-	if (ret_value != CARRY_ON) {
-		unfix_nodes(&s_del_balance);
-		return 0;
-	}
-
-	/* reiserfs_delete_item returns item length when success */
-	ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
-	q_ih = tp_item_head(path);
-	quota_cut_bytes = ih_item_len(q_ih);
-
-	/*
-	 * hack so the quota code doesn't have to guess if the file has a
-	 * tail.  On tail insert, we allocate quota for 1 unformatted node.
-	 * We test the offset because the tail might have been
-	 * split into multiple items, and we only want to decrement for
-	 * the unfm node once
-	 */
-	if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
-		if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
-			quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
-		} else {
-			quota_cut_bytes = 0;
-		}
-	}
-
-	if (un_bh) {
-		int off;
-		char *data;
-
-		/*
-		 * We are in direct2indirect conversion, so move tail contents
-		 * to the unformatted node
-		 */
-		/*
-		 * note, we do the copy before preparing the buffer because we
-		 * don't care about the contents of the unformatted node yet.
-		 * the only thing we really care about is the direct item's
-		 * data is in the unformatted node.
-		 *
-		 * Otherwise, we would have to call
-		 * reiserfs_prepare_for_journal on the unformatted node,
-		 * which might schedule, meaning we'd have to loop all the
-		 * way back up to the start of the while loop.
-		 *
-		 * The unformatted node must be dirtied later on.  We can't be
-		 * sure here if the entire tail has been deleted yet.
-		 *
-		 * un_bh is from the page cache (all unformatted nodes are
-		 * from the page cache) and might be a highmem page.  So, we
-		 * can't use un_bh->b_data.
-		 * -clm
-		 */
-
-		data = kmap_atomic(un_bh->b_page);
-		off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
-		memcpy(data + off,
-		       ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
-		       ret_value);
-		kunmap_atomic(data);
-	}
-
-	/* Perform balancing after all resources have been collected at once. */
-	do_balance(&s_del_balance, NULL, NULL, M_DELETE);
-
-#ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
-		       "reiserquota delete_item(): freeing %u, id=%u type=%c",
-		       quota_cut_bytes, inode->i_uid, head2type(&s_ih));
-#endif
-	depth = reiserfs_write_unlock_nested(inode->i_sb);
-	dquot_free_space_nodirty(inode, quota_cut_bytes);
-	reiserfs_write_lock_nested(inode->i_sb, depth);
-
-	/* Return deleted body length */
-	return ret_value;
-}
-
-/*
- * Summary Of Mechanisms For Handling Collisions Between Processes:
- *
- *  deletion of the body of the object is performed by iput(), with the
- *  result that if multiple processes are operating on a file, the
- *  deletion of the body of the file is deferred until the last process
- *  that has an open inode performs its iput().
- *
- *  writes and truncates are protected from collisions by use of
- *  semaphores.
- *
- *  creates, linking, and mknod are protected from collisions with other
- *  processes by making the reiserfs_add_entry() the last step in the
- *  creation, and then rolling back all changes if there was a collision.
- *  - Hans
-*/
-
-/* this deletes item which never gets split */
-void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
-				struct inode *inode, struct reiserfs_key *key)
-{
-	struct super_block *sb = th->t_super;
-	struct tree_balance tb;
-	INITIALIZE_PATH(path);
-	int item_len = 0;
-	int tb_init = 0;
-	struct cpu_key cpu_key = {};
-	int retval;
-	int quota_cut_bytes = 0;
-
-	BUG_ON(!th->t_trans_id);
-
-	le_key2cpu_key(&cpu_key, key);
-
-	while (1) {
-		retval = search_item(th->t_super, &cpu_key, &path);
-		if (retval == IO_ERROR) {
-			reiserfs_error(th->t_super, "vs-5350",
-				       "i/o failure occurred trying "
-				       "to delete %K", &cpu_key);
-			break;
-		}
-		if (retval != ITEM_FOUND) {
-			pathrelse(&path);
-			/*
-			 * No need for a warning, if there is just no free
-			 * space to insert '..' item into the
-			 * newly-created subdir
-			 */
-			if (!
-			    ((unsigned long long)
-			     GET_HASH_VALUE(le_key_k_offset
-					    (le_key_version(key), key)) == 0
-			     && (unsigned long long)
-			     GET_GENERATION_NUMBER(le_key_k_offset
-						   (le_key_version(key),
-						    key)) == 1))
-				reiserfs_warning(th->t_super, "vs-5355",
-						 "%k not found", key);
-			break;
-		}
-		if (!tb_init) {
-			tb_init = 1;
-			item_len = ih_item_len(tp_item_head(&path));
-			init_tb_struct(th, &tb, th->t_super, &path,
-				       -(IH_SIZE + item_len));
-		}
-		quota_cut_bytes = ih_item_len(tp_item_head(&path));
-
-		retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
-		if (retval == REPEAT_SEARCH) {
-			PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
-			continue;
-		}
-
-		if (retval == CARRY_ON) {
-			do_balance(&tb, NULL, NULL, M_DELETE);
-			/*
-			 * Should we count quota for item? (we don't
-			 * count quotas for save-links)
-			 */
-			if (inode) {
-				int depth;
-#ifdef REISERQUOTA_DEBUG
-				reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
-					       "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
-					       quota_cut_bytes, inode->i_uid,
-					       key2type(key));
-#endif
-				depth = reiserfs_write_unlock_nested(sb);
-				dquot_free_space_nodirty(inode,
-							 quota_cut_bytes);
-				reiserfs_write_lock_nested(sb, depth);
-			}
-			break;
-		}
-
-		/* IO_ERROR, NO_DISK_SPACE, etc */
-		reiserfs_warning(th->t_super, "vs-5360",
-				 "could not delete %K due to fix_nodes failure",
-				 &cpu_key);
-		unfix_nodes(&tb);
-		break;
-	}
-
-	reiserfs_check_path(&path);
-}
-
-int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
-			   struct inode *inode)
-{
-	int err;
-	inode->i_size = 0;
-	BUG_ON(!th->t_trans_id);
-
-	/* for directory this deletes item containing "." and ".." */
-	err =
-	    reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
-	if (err)
-		return err;
-
-#if defined( USE_INODE_GENERATION_COUNTER )
-	if (!old_format_only(th->t_super)) {
-		__le32 *inode_generation;
-
-		inode_generation =
-		    &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
-		le32_add_cpu(inode_generation, 1);
-	}
-/* USE_INODE_GENERATION_COUNTER */
-#endif
-	reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
-
-	return err;
-}
-
-static void unmap_buffers(struct page *page, loff_t pos)
-{
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	struct buffer_head *next;
-	unsigned long tail_index;
-	unsigned long cur_index;
-
-	if (page) {
-		if (page_has_buffers(page)) {
-			tail_index = pos & (PAGE_SIZE - 1);
-			cur_index = 0;
-			head = page_buffers(page);
-			bh = head;
-			do {
-				next = bh->b_this_page;
-
-				/*
-				 * we want to unmap the buffers that contain
-				 * the tail, and all the buffers after it
-				 * (since the tail must be at the end of the
-				 * file).  We don't want to unmap file data
-				 * before the tail, since it might be dirty
-				 * and waiting to reach disk
-				 */
-				cur_index += bh->b_size;
-				if (cur_index > tail_index) {
-					reiserfs_unmap_buffer(bh);
-				}
-				bh = next;
-			} while (bh != head);
-		}
-	}
-}
-
-static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
-				    struct inode *inode,
-				    struct page *page,
-				    struct treepath *path,
-				    const struct cpu_key *item_key,
-				    loff_t new_file_size, char *mode)
-{
-	struct super_block *sb = inode->i_sb;
-	int block_size = sb->s_blocksize;
-	int cut_bytes;
-	BUG_ON(!th->t_trans_id);
-	BUG_ON(new_file_size != inode->i_size);
-
-	/*
-	 * the page being sent in could be NULL if there was an i/o error
-	 * reading in the last block.  The user will hit problems trying to
-	 * read the file, but for now we just skip the indirect2direct
-	 */
-	if (atomic_read(&inode->i_count) > 1 ||
-	    !tail_has_to_be_packed(inode) ||
-	    !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
-		/* leave tail in an unformatted node */
-		*mode = M_SKIP_BALANCING;
-		cut_bytes =
-		    block_size - (new_file_size & (block_size - 1));
-		pathrelse(path);
-		return cut_bytes;
-	}
-
-	/* Perform the conversion to a direct_item. */
-	return indirect2direct(th, inode, page, path, item_key,
-			       new_file_size, mode);
-}
-
-/*
- * we did indirect_to_direct conversion. And we have inserted direct
- * item successesfully, but there were no disk space to cut unfm
- * pointer being converted. Therefore we have to delete inserted
- * direct item(s)
- */
-static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
-					 struct inode *inode, struct treepath *path)
-{
-	struct cpu_key tail_key;
-	int tail_len;
-	int removed;
-	BUG_ON(!th->t_trans_id);
-
-	make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
-	tail_key.key_length = 4;
-
-	tail_len =
-	    (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
-	while (tail_len) {
-		/* look for the last byte of the tail */
-		if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
-		    POSITION_NOT_FOUND)
-			reiserfs_panic(inode->i_sb, "vs-5615",
-				       "found invalid item");
-		RFALSE(path->pos_in_item !=
-		       ih_item_len(tp_item_head(path)) - 1,
-		       "vs-5616: appended bytes found");
-		PATH_LAST_POSITION(path)--;
-
-		removed =
-		    reiserfs_delete_item(th, path, &tail_key, inode,
-					 NULL /*unbh not needed */ );
-		RFALSE(removed <= 0
-		       || removed > tail_len,
-		       "vs-5617: there was tail %d bytes, removed item length %d bytes",
-		       tail_len, removed);
-		tail_len -= removed;
-		set_cpu_key_k_offset(&tail_key,
-				     cpu_key_k_offset(&tail_key) - removed);
-	}
-	reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
-			 "conversion has been rolled back due to "
-			 "lack of disk space");
-	mark_inode_dirty(inode);
-}
-
-/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
-int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
-			   struct treepath *path,
-			   struct cpu_key *item_key,
-			   struct inode *inode,
-			   struct page *page, loff_t new_file_size)
-{
-	struct super_block *sb = inode->i_sb;
-	/*
-	 * Every function which is going to call do_balance must first
-	 * create a tree_balance structure.  Then it must fill up this
-	 * structure by using the init_tb_struct and fix_nodes functions.
-	 * After that we can make tree balancing.
-	 */
-	struct tree_balance s_cut_balance;
-	struct item_head *p_le_ih;
-	int cut_size = 0;	/* Amount to be cut. */
-	int ret_value = CARRY_ON;
-	int removed = 0;	/* Number of the removed unformatted nodes. */
-	int is_inode_locked = 0;
-	char mode;		/* Mode of the balance. */
-	int retval2 = -1;
-	int quota_cut_bytes;
-	loff_t tail_pos = 0;
-	int depth;
-
-	BUG_ON(!th->t_trans_id);
-
-	init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
-		       cut_size);
-
-	/*
-	 * Repeat this loop until we either cut the item without needing
-	 * to balance, or we fix_nodes without schedule occurring
-	 */
-	while (1) {
-		/*
-		 * Determine the balance mode, position of the first byte to
-		 * be cut, and size to be cut.  In case of the indirect item
-		 * free unformatted nodes which are pointed to by the cut
-		 * pointers.
-		 */
-
-		mode =
-		    prepare_for_delete_or_cut(th, inode, path,
-					      item_key, &removed,
-					      &cut_size, new_file_size);
-		if (mode == M_CONVERT) {
-			/*
-			 * convert last unformatted node to direct item or
-			 * leave tail in the unformatted node
-			 */
-			RFALSE(ret_value != CARRY_ON,
-			       "PAP-5570: can not convert twice");
-
-			ret_value =
-			    maybe_indirect_to_direct(th, inode, page,
-						     path, item_key,
-						     new_file_size, &mode);
-			if (mode == M_SKIP_BALANCING)
-				/* tail has been left in the unformatted node */
-				return ret_value;
-
-			is_inode_locked = 1;
-
-			/*
-			 * removing of last unformatted node will
-			 * change value we have to return to truncate.
-			 * Save it
-			 */
-			retval2 = ret_value;
-
-			/*
-			 * So, we have performed the first part of the
-			 * conversion:
-			 * inserting the new direct item.  Now we are
-			 * removing the last unformatted node pointer.
-			 * Set key to search for it.
-			 */
-			set_cpu_key_k_type(item_key, TYPE_INDIRECT);
-			item_key->key_length = 4;
-			new_file_size -=
-			    (new_file_size & (sb->s_blocksize - 1));
-			tail_pos = new_file_size;
-			set_cpu_key_k_offset(item_key, new_file_size + 1);
-			if (search_for_position_by_key
-			    (sb, item_key,
-			     path) == POSITION_NOT_FOUND) {
-				print_block(PATH_PLAST_BUFFER(path), 3,
-					    PATH_LAST_POSITION(path) - 1,
-					    PATH_LAST_POSITION(path) + 1);
-				reiserfs_panic(sb, "PAP-5580", "item to "
-					       "convert does not exist (%K)",
-					       item_key);
-			}
-			continue;
-		}
-		if (cut_size == 0) {
-			pathrelse(path);
-			return 0;
-		}
-
-		s_cut_balance.insert_size[0] = cut_size;
-
-		ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
-		if (ret_value != REPEAT_SEARCH)
-			break;
-
-		PROC_INFO_INC(sb, cut_from_item_restarted);
-
-		ret_value =
-		    search_for_position_by_key(sb, item_key, path);
-		if (ret_value == POSITION_FOUND)
-			continue;
-
-		reiserfs_warning(sb, "PAP-5610", "item %K not found",
-				 item_key);
-		unfix_nodes(&s_cut_balance);
-		return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
-	}			/* while */
-
-	/* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
-	if (ret_value != CARRY_ON) {
-		if (is_inode_locked) {
-			/*
-			 * FIXME: this seems to be not needed: we are always
-			 * able to cut item
-			 */
-			indirect_to_direct_roll_back(th, inode, path);
-		}
-		if (ret_value == NO_DISK_SPACE)
-			reiserfs_warning(sb, "reiserfs-5092",
-					 "NO_DISK_SPACE");
-		unfix_nodes(&s_cut_balance);
-		return -EIO;
-	}
-
-	/* go ahead and perform balancing */
-
-	RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
-
-	/* Calculate number of bytes that need to be cut from the item. */
-	quota_cut_bytes =
-	    (mode ==
-	     M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
-	    insert_size[0];
-	if (retval2 == -1)
-		ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
-	else
-		ret_value = retval2;
-
-	/*
-	 * For direct items, we only change the quota when deleting the last
-	 * item.
-	 */
-	p_le_ih = tp_item_head(s_cut_balance.tb_path);
-	if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
-		if (mode == M_DELETE &&
-		    (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
-		    1) {
-			/* FIXME: this is to keep 3.5 happy */
-			REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
-			quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
-		} else {
-			quota_cut_bytes = 0;
-		}
-	}
-#ifdef CONFIG_REISERFS_CHECK
-	if (is_inode_locked) {
-		struct item_head *le_ih =
-		    tp_item_head(s_cut_balance.tb_path);
-		/*
-		 * we are going to complete indirect2direct conversion. Make
-		 * sure, that we exactly remove last unformatted node pointer
-		 * of the item
-		 */
-		if (!is_indirect_le_ih(le_ih))
-			reiserfs_panic(sb, "vs-5652",
-				       "item must be indirect %h", le_ih);
-
-		if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
-			reiserfs_panic(sb, "vs-5653", "completing "
-				       "indirect2direct conversion indirect "
-				       "item %h being deleted must be of "
-				       "4 byte long", le_ih);
-
-		if (mode == M_CUT
-		    && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
-			reiserfs_panic(sb, "vs-5654", "can not complete "
-				       "indirect2direct conversion of %h "
-				       "(CUT, insert_size==%d)",
-				       le_ih, s_cut_balance.insert_size[0]);
-		}
-		/*
-		 * it would be useful to make sure, that right neighboring
-		 * item is direct item of this file
-		 */
-	}
-#endif
-
-	do_balance(&s_cut_balance, NULL, NULL, mode);
-	if (is_inode_locked) {
-		/*
-		 * we've done an indirect->direct conversion.  when the
-		 * data block was freed, it was removed from the list of
-		 * blocks that must be flushed before the transaction
-		 * commits, make sure to unmap and invalidate it
-		 */
-		unmap_buffers(page, tail_pos);
-		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-	}
-#ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
-		       "reiserquota cut_from_item(): freeing %u id=%u type=%c",
-		       quota_cut_bytes, inode->i_uid, '?');
-#endif
-	depth = reiserfs_write_unlock_nested(sb);
-	dquot_free_space_nodirty(inode, quota_cut_bytes);
-	reiserfs_write_lock_nested(sb, depth);
-	return ret_value;
-}
-
-static void truncate_directory(struct reiserfs_transaction_handle *th,
-			       struct inode *inode)
-{
-	BUG_ON(!th->t_trans_id);
-	if (inode->i_nlink)
-		reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
-
-	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
-	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
-	reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
-	reiserfs_update_sd(th, inode);
-	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
-	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
-}
-
-/*
- * Truncate file to the new size. Note, this must be called with a
- * transaction already started
- */
-int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
-			 struct inode *inode,	/* ->i_size contains new size */
-			 struct page *page,	/* up to date for last block */
-			 /*
-			  * when it is called by file_release to convert
-			  * the tail - no timestamps should be updated
-			  */
-			 int update_timestamps
-    )
-{
-	INITIALIZE_PATH(s_search_path);	/* Path to the current object item. */
-	struct item_head *p_le_ih;	/* Pointer to an item header. */
-
-	/* Key to search for a previous file item. */
-	struct cpu_key s_item_key;
-	loff_t file_size,	/* Old file size. */
-	 new_file_size;	/* New file size. */
-	int deleted;		/* Number of deleted or truncated bytes. */
-	int retval;
-	int err = 0;
-
-	BUG_ON(!th->t_trans_id);
-	if (!
-	    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
-	     || S_ISLNK(inode->i_mode)))
-		return 0;
-
-	/* deletion of directory - no need to update timestamps */
-	if (S_ISDIR(inode->i_mode)) {
-		truncate_directory(th, inode);
-		return 0;
-	}
-
-	/* Get new file size. */
-	new_file_size = inode->i_size;
-
-	/* FIXME: note, that key type is unimportant here */
-	make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
-		     TYPE_DIRECT, 3);
-
-	retval =
-	    search_for_position_by_key(inode->i_sb, &s_item_key,
-				       &s_search_path);
-	if (retval == IO_ERROR) {
-		reiserfs_error(inode->i_sb, "vs-5657",
-			       "i/o failure occurred trying to truncate %K",
-			       &s_item_key);
-		err = -EIO;
-		goto out;
-	}
-	if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
-		reiserfs_error(inode->i_sb, "PAP-5660",
-			       "wrong result %d of search for %K", retval,
-			       &s_item_key);
-
-		err = -EIO;
-		goto out;
-	}
-
-	s_search_path.pos_in_item--;
-
-	/* Get real file size (total length of all file items) */
-	p_le_ih = tp_item_head(&s_search_path);
-	if (is_statdata_le_ih(p_le_ih))
-		file_size = 0;
-	else {
-		loff_t offset = le_ih_k_offset(p_le_ih);
-		int bytes =
-		    op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
-
-		/*
-		 * this may mismatch with real file size: if last direct item
-		 * had no padding zeros and last unformatted node had no free
-		 * space, this file would have this file size
-		 */
-		file_size = offset + bytes - 1;
-	}
-	/*
-	 * are we doing a full truncate or delete, if so
-	 * kick in the reada code
-	 */
-	if (new_file_size == 0)
-		s_search_path.reada = PATH_READA | PATH_READA_BACK;
-
-	if (file_size == 0 || file_size < new_file_size) {
-		goto update_and_out;
-	}
-
-	/* Update key to search for the last file item. */
-	set_cpu_key_k_offset(&s_item_key, file_size);
-
-	do {
-		/* Cut or delete file item. */
-		deleted =
-		    reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
-					   inode, page, new_file_size);
-		if (deleted < 0) {
-			reiserfs_warning(inode->i_sb, "vs-5665",
-					 "reiserfs_cut_from_item failed");
-			reiserfs_check_path(&s_search_path);
-			return 0;
-		}
-
-		RFALSE(deleted > file_size,
-		       "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
-		       deleted, file_size, &s_item_key);
-
-		/* Change key to search the last file item. */
-		file_size -= deleted;
-
-		set_cpu_key_k_offset(&s_item_key, file_size);
-
-		/*
-		 * While there are bytes to truncate and previous
-		 * file item is presented in the tree.
-		 */
-
-		/*
-		 * This loop could take a really long time, and could log
-		 * many more blocks than a transaction can hold.  So, we do
-		 * a polite journal end here, and if the transaction needs
-		 * ending, we make sure the file is consistent before ending
-		 * the current trans and starting a new one
-		 */
-		if (journal_transaction_should_end(th, 0) ||
-		    reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
-			pathrelse(&s_search_path);
-
-			if (update_timestamps) {
-				inode_set_mtime_to_ts(inode,
-						      current_time(inode));
-				inode_set_ctime_current(inode);
-			}
-			reiserfs_update_sd(th, inode);
-
-			err = journal_end(th);
-			if (err)
-				goto out;
-			err = journal_begin(th, inode->i_sb,
-					    JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
-			if (err)
-				goto out;
-			reiserfs_update_inode_transaction(inode);
-		}
-	} while (file_size > ROUND_UP(new_file_size) &&
-		 search_for_position_by_key(inode->i_sb, &s_item_key,
-					    &s_search_path) == POSITION_FOUND);
-
-	RFALSE(file_size > ROUND_UP(new_file_size),
-	       "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
-	       new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
-
-update_and_out:
-	if (update_timestamps) {
-		/* this is truncate, not file closing */
-		inode_set_mtime_to_ts(inode, current_time(inode));
-		inode_set_ctime_current(inode);
-	}
-	reiserfs_update_sd(th, inode);
-
-out:
-	pathrelse(&s_search_path);
-	return err;
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-/* this makes sure, that we __append__, not overwrite or add holes */
-static void check_research_for_paste(struct treepath *path,
-				     const struct cpu_key *key)
-{
-	struct item_head *found_ih = tp_item_head(path);
-
-	if (is_direct_le_ih(found_ih)) {
-		if (le_ih_k_offset(found_ih) +
-		    op_bytes_number(found_ih,
-				    get_last_bh(path)->b_size) !=
-		    cpu_key_k_offset(key)
-		    || op_bytes_number(found_ih,
-				       get_last_bh(path)->b_size) !=
-		    pos_in_item(path))
-			reiserfs_panic(NULL, "PAP-5720", "found direct item "
-				       "%h or position (%d) does not match "
-				       "to key %K", found_ih,
-				       pos_in_item(path), key);
-	}
-	if (is_indirect_le_ih(found_ih)) {
-		if (le_ih_k_offset(found_ih) +
-		    op_bytes_number(found_ih,
-				    get_last_bh(path)->b_size) !=
-		    cpu_key_k_offset(key)
-		    || I_UNFM_NUM(found_ih) != pos_in_item(path)
-		    || get_ih_free_space(found_ih) != 0)
-			reiserfs_panic(NULL, "PAP-5730", "found indirect "
-				       "item (%h) or position (%d) does not "
-				       "match to key (%K)",
-				       found_ih, pos_in_item(path), key);
-	}
-}
-#endif				/* config reiserfs check */
-
-/*
- * Paste bytes to the existing item.
- * Returns bytes number pasted into the item.
- */
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
-			     /* Path to the pasted item. */
-			     struct treepath *search_path,
-			     /* Key to search for the needed item. */
-			     const struct cpu_key *key,
-			     /* Inode item belongs to */
-			     struct inode *inode,
-			     /* Pointer to the bytes to paste. */
-			     const char *body,
-			     /* Size of pasted bytes. */
-			     int pasted_size)
-{
-	struct super_block *sb = inode->i_sb;
-	struct tree_balance s_paste_balance;
-	int retval;
-	int fs_gen;
-	int depth;
-
-	BUG_ON(!th->t_trans_id);
-
-	fs_gen = get_generation(inode->i_sb);
-
-#ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
-		       "reiserquota paste_into_item(): allocating %u id=%u type=%c",
-		       pasted_size, inode->i_uid,
-		       key2type(&key->on_disk_key));
-#endif
-
-	depth = reiserfs_write_unlock_nested(sb);
-	retval = dquot_alloc_space_nodirty(inode, pasted_size);
-	reiserfs_write_lock_nested(sb, depth);
-	if (retval) {
-		pathrelse(search_path);
-		return retval;
-	}
-	init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
-		       pasted_size);
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	s_paste_balance.key = key->on_disk_key;
-#endif
-
-	/* DQUOT_* can schedule, must check before the fix_nodes */
-	if (fs_changed(fs_gen, inode->i_sb)) {
-		goto search_again;
-	}
-
-	while ((retval =
-		fix_nodes(M_PASTE, &s_paste_balance, NULL,
-			  body)) == REPEAT_SEARCH) {
-search_again:
-		/* file system changed while we were in the fix_nodes */
-		PROC_INFO_INC(th->t_super, paste_into_item_restarted);
-		retval =
-		    search_for_position_by_key(th->t_super, key,
-					       search_path);
-		if (retval == IO_ERROR) {
-			retval = -EIO;
-			goto error_out;
-		}
-		if (retval == POSITION_FOUND) {
-			reiserfs_warning(inode->i_sb, "PAP-5710",
-					 "entry or pasted byte (%K) exists",
-					 key);
-			retval = -EEXIST;
-			goto error_out;
-		}
-#ifdef CONFIG_REISERFS_CHECK
-		check_research_for_paste(search_path, key);
-#endif
-	}
-
-	/*
-	 * Perform balancing after all resources are collected by fix_nodes,
-	 * and accessing them will not risk triggering schedule.
-	 */
-	if (retval == CARRY_ON) {
-		do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
-		return 0;
-	}
-	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-error_out:
-	/* this also releases the path */
-	unfix_nodes(&s_paste_balance);
-#ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
-		       "reiserquota paste_into_item(): freeing %u id=%u type=%c",
-		       pasted_size, inode->i_uid,
-		       key2type(&key->on_disk_key));
-#endif
-	depth = reiserfs_write_unlock_nested(sb);
-	dquot_free_space_nodirty(inode, pasted_size);
-	reiserfs_write_lock_nested(sb, depth);
-	return retval;
-}
-
-/*
- * Insert new item into the buffer at the path.
- * th   - active transaction handle
- * path - path to the inserted item
- * ih   - pointer to the item header to insert
- * body - pointer to the bytes to insert
- */
-int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
-			 struct treepath *path, const struct cpu_key *key,
-			 struct item_head *ih, struct inode *inode,
-			 const char *body)
-{
-	struct tree_balance s_ins_balance;
-	int retval;
-	int fs_gen = 0;
-	int quota_bytes = 0;
-
-	BUG_ON(!th->t_trans_id);
-
-	if (inode) {		/* Do we count quotas for item? */
-		int depth;
-		fs_gen = get_generation(inode->i_sb);
-		quota_bytes = ih_item_len(ih);
-
-		/*
-		 * hack so the quota code doesn't have to guess
-		 * if the file has a tail, links are always tails,
-		 * so there's no guessing needed
-		 */
-		if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
-			quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
-#ifdef REISERQUOTA_DEBUG
-		reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
-			       "reiserquota insert_item(): allocating %u id=%u type=%c",
-			       quota_bytes, inode->i_uid, head2type(ih));
-#endif
-		/*
-		 * We can't dirty inode here. It would be immediately
-		 * written but appropriate stat item isn't inserted yet...
-		 */
-		depth = reiserfs_write_unlock_nested(inode->i_sb);
-		retval = dquot_alloc_space_nodirty(inode, quota_bytes);
-		reiserfs_write_lock_nested(inode->i_sb, depth);
-		if (retval) {
-			pathrelse(path);
-			return retval;
-		}
-	}
-	init_tb_struct(th, &s_ins_balance, th->t_super, path,
-		       IH_SIZE + ih_item_len(ih));
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-	s_ins_balance.key = key->on_disk_key;
-#endif
-	/*
-	 * DQUOT_* can schedule, must check to be sure calling
-	 * fix_nodes is safe
-	 */
-	if (inode && fs_changed(fs_gen, inode->i_sb)) {
-		goto search_again;
-	}
-
-	while ((retval =
-		fix_nodes(M_INSERT, &s_ins_balance, ih,
-			  body)) == REPEAT_SEARCH) {
-search_again:
-		/* file system changed while we were in the fix_nodes */
-		PROC_INFO_INC(th->t_super, insert_item_restarted);
-		retval = search_item(th->t_super, key, path);
-		if (retval == IO_ERROR) {
-			retval = -EIO;
-			goto error_out;
-		}
-		if (retval == ITEM_FOUND) {
-			reiserfs_warning(th->t_super, "PAP-5760",
-					 "key %K already exists in the tree",
-					 key);
-			retval = -EEXIST;
-			goto error_out;
-		}
-	}
-
-	/* make balancing after all resources will be collected at a time */
-	if (retval == CARRY_ON) {
-		do_balance(&s_ins_balance, ih, body, M_INSERT);
-		return 0;
-	}
-
-	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-error_out:
-	/* also releases the path */
-	unfix_nodes(&s_ins_balance);
-#ifdef REISERQUOTA_DEBUG
-	if (inode)
-		reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
-		       "reiserquota insert_item(): freeing %u id=%u type=%c",
-		       quota_bytes, inode->i_uid, head2type(ih));
-#endif
-	if (inode) {
-		int depth = reiserfs_write_unlock_nested(inode->i_sb);
-		dquot_free_space_nodirty(inode, quota_bytes);
-		reiserfs_write_lock_nested(inode->i_sb, depth);
-	}
-	return retval;
-}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
deleted file mode 100644
index ab76468da02d..000000000000
--- a/fs/reiserfs/super.c
+++ /dev/null
@@ -1,2646 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- *
- * Trivial changes by Alan Cox to add the LFS fixes
- *
- * Trivial Changes:
- * Rights granted to Hans Reiser to redistribute under other terms providing
- * he accepts all liability including but not limited to patent, fitness
- * for purpose, and direct or indirect claims arising from failure to perform.
- *
- * NO WARRANTY
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/time.h>
-#include <linux/uaccess.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/init.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/exportfs.h>
-#include <linux/quotaops.h>
-#include <linux/vfs.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/crc32.h>
-#include <linux/seq_file.h>
-
-struct file_system_type reiserfs_fs_type;
-
-static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING;
-static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING;
-static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING;
-
-int is_reiserfs_3_5(struct reiserfs_super_block *rs)
-{
-	return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string,
-			strlen(reiserfs_3_5_magic_string));
-}
-
-int is_reiserfs_3_6(struct reiserfs_super_block *rs)
-{
-	return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string,
-			strlen(reiserfs_3_6_magic_string));
-}
-
-int is_reiserfs_jr(struct reiserfs_super_block *rs)
-{
-	return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string,
-			strlen(reiserfs_jr_magic_string));
-}
-
-static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
-{
-	return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) ||
-		is_reiserfs_jr(rs));
-}
-
-static int reiserfs_remount(struct super_block *s, int *flags, char *data);
-static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
-
-static int reiserfs_sync_fs(struct super_block *s, int wait)
-{
-	struct reiserfs_transaction_handle th;
-
-	/*
-	 * Writeback quota in non-journalled quota case - journalled quota has
-	 * no dirty dquots
-	 */
-	dquot_writeback_dquots(s, -1);
-	reiserfs_write_lock(s);
-	if (!journal_begin(&th, s, 1))
-		if (!journal_end_sync(&th))
-			reiserfs_flush_old_commits(s);
-	reiserfs_write_unlock(s);
-	return 0;
-}
-
-static void flush_old_commits(struct work_struct *work)
-{
-	struct reiserfs_sb_info *sbi;
-	struct super_block *s;
-
-	sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
-	s = sbi->s_journal->j_work_sb;
-
-	/*
-	 * We need s_umount for protecting quota writeback. We have to use
-	 * trylock as reiserfs_cancel_old_flush() may be waiting for this work
-	 * to complete with s_umount held.
-	 */
-	if (!down_read_trylock(&s->s_umount)) {
-		/* Requeue work if we are not cancelling it */
-		spin_lock(&sbi->old_work_lock);
-		if (sbi->work_queued == 1)
-			queue_delayed_work(system_long_wq, &sbi->old_work, HZ);
-		spin_unlock(&sbi->old_work_lock);
-		return;
-	}
-	spin_lock(&sbi->old_work_lock);
-	/* Avoid clobbering the cancel state... */
-	if (sbi->work_queued == 1)
-		sbi->work_queued = 0;
-	spin_unlock(&sbi->old_work_lock);
-
-	reiserfs_sync_fs(s, 1);
-	up_read(&s->s_umount);
-}
-
-void reiserfs_schedule_old_flush(struct super_block *s)
-{
-	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
-	unsigned long delay;
-
-	/*
-	 * Avoid scheduling flush when sb is being shut down. It can race
-	 * with journal shutdown and free still queued delayed work.
-	 */
-	if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE))
-		return;
-
-	spin_lock(&sbi->old_work_lock);
-	if (!sbi->work_queued) {
-		delay = msecs_to_jiffies(dirty_writeback_interval * 10);
-		queue_delayed_work(system_long_wq, &sbi->old_work, delay);
-		sbi->work_queued = 1;
-	}
-	spin_unlock(&sbi->old_work_lock);
-}
-
-void reiserfs_cancel_old_flush(struct super_block *s)
-{
-	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
-
-	spin_lock(&sbi->old_work_lock);
-	/* Make sure no new flushes will be queued */
-	sbi->work_queued = 2;
-	spin_unlock(&sbi->old_work_lock);
-	cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
-}
-
-static int reiserfs_freeze(struct super_block *s)
-{
-	struct reiserfs_transaction_handle th;
-
-	reiserfs_cancel_old_flush(s);
-
-	reiserfs_write_lock(s);
-	if (!sb_rdonly(s)) {
-		int err = journal_begin(&th, s, 1);
-		if (err) {
-			reiserfs_block_writes(&th);
-		} else {
-			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
-						     1);
-			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-			reiserfs_block_writes(&th);
-			journal_end_sync(&th);
-		}
-	}
-	reiserfs_write_unlock(s);
-	return 0;
-}
-
-static int reiserfs_unfreeze(struct super_block *s)
-{
-	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
-
-	reiserfs_allow_writes(s);
-	spin_lock(&sbi->old_work_lock);
-	/* Allow old_work to run again */
-	sbi->work_queued = 0;
-	spin_unlock(&sbi->old_work_lock);
-	return 0;
-}
-
-extern const struct in_core_key MAX_IN_CORE_KEY;
-
-/*
- * this is used to delete "save link" when there are no items of a
- * file it points to. It can either happen if unlink is completed but
- * "save unlink" removal, or if file has both unlink and truncate
- * pending and as unlink completes first (because key of "save link"
- * protecting unlink is bigger that a key lf "save link" which
- * protects truncate), so there left no items to make truncate
- * completion on
- */
-static int remove_save_link_only(struct super_block *s,
-				 struct reiserfs_key *key, int oid_free)
-{
-	struct reiserfs_transaction_handle th;
-	int err;
-
-	/* we are going to do one balancing */
-	err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT);
-	if (err)
-		return err;
-
-	reiserfs_delete_solid_item(&th, NULL, key);
-	if (oid_free)
-		/* removals are protected by direct items */
-		reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
-
-	return journal_end(&th);
-}
-
-#ifdef CONFIG_QUOTA
-static int reiserfs_quota_on_mount(struct super_block *, int);
-#endif
-
-/*
- * Look for uncompleted unlinks and truncates and complete them
- *
- * Called with superblock write locked.  If quotas are enabled, we have to
- * release/retake lest we call dquot_quota_on_mount(), proceed to
- * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per
- * cpu worklets to complete flush_async_commits() that in turn wait for the
- * superblock write lock.
- */
-static int finish_unfinished(struct super_block *s)
-{
-	INITIALIZE_PATH(path);
-	struct cpu_key max_cpu_key, obj_key;
-	struct reiserfs_key save_link_key, last_inode_key;
-	int retval = 0;
-	struct item_head *ih;
-	struct buffer_head *bh;
-	int item_pos;
-	char *item;
-	int done;
-	struct inode *inode;
-	int truncate;
-#ifdef CONFIG_QUOTA
-	int i;
-	int ms_active_set;
-	int quota_enabled[REISERFS_MAXQUOTAS];
-#endif
-
-	/* compose key to look for "save" links */
-	max_cpu_key.version = KEY_FORMAT_3_5;
-	max_cpu_key.on_disk_key.k_dir_id = ~0U;
-	max_cpu_key.on_disk_key.k_objectid = ~0U;
-	set_cpu_key_k_offset(&max_cpu_key, ~0U);
-	max_cpu_key.key_length = 3;
-
-	memset(&last_inode_key, 0, sizeof(last_inode_key));
-
-#ifdef CONFIG_QUOTA
-	/* Needed for iput() to work correctly and not trash data */
-	if (s->s_flags & SB_ACTIVE) {
-		ms_active_set = 0;
-	} else {
-		ms_active_set = 1;
-		s->s_flags |= SB_ACTIVE;
-	}
-	/* Turn on quotas so that they are updated correctly */
-	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
-		quota_enabled[i] = 1;
-		if (REISERFS_SB(s)->s_qf_names[i]) {
-			int ret;
-
-			if (sb_has_quota_active(s, i)) {
-				quota_enabled[i] = 0;
-				continue;
-			}
-			reiserfs_write_unlock(s);
-			ret = reiserfs_quota_on_mount(s, i);
-			reiserfs_write_lock(s);
-			if (ret < 0)
-				reiserfs_warning(s, "reiserfs-2500",
-						 "cannot turn on journaled "
-						 "quota: error %d", ret);
-		}
-	}
-#endif
-
-	done = 0;
-	REISERFS_SB(s)->s_is_unlinked_ok = 1;
-	while (!retval) {
-		int depth;
-		retval = search_item(s, &max_cpu_key, &path);
-		if (retval != ITEM_NOT_FOUND) {
-			reiserfs_error(s, "vs-2140",
-				       "search_by_key returned %d", retval);
-			break;
-		}
-
-		bh = get_last_bh(&path);
-		item_pos = get_item_pos(&path);
-		if (item_pos != B_NR_ITEMS(bh)) {
-			reiserfs_warning(s, "vs-2060",
-					 "wrong position found");
-			break;
-		}
-		item_pos--;
-		ih = item_head(bh, item_pos);
-
-		if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
-			/* there are no "save" links anymore */
-			break;
-
-		save_link_key = ih->ih_key;
-		if (is_indirect_le_ih(ih))
-			truncate = 1;
-		else
-			truncate = 0;
-
-		/* reiserfs_iget needs k_dirid and k_objectid only */
-		item = ih_item_body(bh, ih);
-		obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
-		obj_key.on_disk_key.k_objectid =
-		    le32_to_cpu(ih->ih_key.k_objectid);
-		obj_key.on_disk_key.k_offset = 0;
-		obj_key.on_disk_key.k_type = 0;
-
-		pathrelse(&path);
-
-		inode = reiserfs_iget(s, &obj_key);
-		if (IS_ERR_OR_NULL(inode)) {
-			/*
-			 * the unlink almost completed, it just did not
-			 * manage to remove "save" link and release objectid
-			 */
-			reiserfs_warning(s, "vs-2180", "iget failed for %K",
-					 &obj_key);
-			retval = remove_save_link_only(s, &save_link_key, 1);
-			continue;
-		}
-
-		if (!truncate && inode->i_nlink) {
-			/* file is not unlinked */
-			reiserfs_warning(s, "vs-2185",
-					 "file %K is not unlinked",
-					 &obj_key);
-			retval = remove_save_link_only(s, &save_link_key, 0);
-			continue;
-		}
-		depth = reiserfs_write_unlock_nested(inode->i_sb);
-		dquot_initialize(inode);
-		reiserfs_write_lock_nested(inode->i_sb, depth);
-
-		if (truncate && S_ISDIR(inode->i_mode)) {
-			/*
-			 * We got a truncate request for a dir which
-			 * is impossible.  The only imaginable way is to
-			 * execute unfinished truncate request then boot
-			 * into old kernel, remove the file and create dir
-			 * with the same key.
-			 */
-			reiserfs_warning(s, "green-2101",
-					 "impossible truncate on a "
-					 "directory %k. Please report",
-					 INODE_PKEY(inode));
-			retval = remove_save_link_only(s, &save_link_key, 0);
-			truncate = 0;
-			iput(inode);
-			continue;
-		}
-
-		if (truncate) {
-			REISERFS_I(inode)->i_flags |=
-			    i_link_saved_truncate_mask;
-			/*
-			 * not completed truncate found. New size was
-			 * committed together with "save" link
-			 */
-			reiserfs_info(s, "Truncating %k to %lld ..",
-				      INODE_PKEY(inode), inode->i_size);
-
-			/* don't update modification time */
-			reiserfs_truncate_file(inode, 0);
-
-			retval = remove_save_link(inode, truncate);
-		} else {
-			REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
-			/* not completed unlink (rmdir) found */
-			reiserfs_info(s, "Removing %k..", INODE_PKEY(inode));
-			if (memcmp(&last_inode_key, INODE_PKEY(inode),
-					sizeof(last_inode_key))){
-				last_inode_key = *INODE_PKEY(inode);
-				/* removal gets completed in iput */
-				retval = 0;
-			} else {
-				reiserfs_warning(s, "super-2189", "Dead loop "
-						 "in finish_unfinished "
-						 "detected, just remove "
-						 "save link\n");
-				retval = remove_save_link_only(s,
-							&save_link_key, 0);
-			}
-		}
-
-		iput(inode);
-		printk("done\n");
-		done++;
-	}
-	REISERFS_SB(s)->s_is_unlinked_ok = 0;
-
-#ifdef CONFIG_QUOTA
-	/* Turn quotas off */
-	reiserfs_write_unlock(s);
-	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
-		if (sb_dqopt(s)->files[i] && quota_enabled[i])
-			dquot_quota_off(s, i);
-	}
-	reiserfs_write_lock(s);
-	if (ms_active_set)
-		/* Restore the flag back */
-		s->s_flags &= ~SB_ACTIVE;
-#endif
-	pathrelse(&path);
-	if (done)
-		reiserfs_info(s, "There were %d uncompleted unlinks/truncates. "
-			      "Completed\n", done);
-	return retval;
-}
-
-/*
- * to protect file being unlinked from getting lost we "safe" link files
- * being unlinked. This link will be deleted in the same transaction with last
- * item of file. mounting the filesystem we scan all these links and remove
- * files which almost got lost
- */
-void add_save_link(struct reiserfs_transaction_handle *th,
-		   struct inode *inode, int truncate)
-{
-	INITIALIZE_PATH(path);
-	int retval;
-	struct cpu_key key;
-	struct item_head ih;
-	__le32 link;
-
-	BUG_ON(!th->t_trans_id);
-
-	/* file can only get one "save link" of each kind */
-	RFALSE(truncate &&
-	       (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask),
-	       "saved link already exists for truncated inode %lx",
-	       (long)inode->i_ino);
-	RFALSE(!truncate &&
-	       (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask),
-	       "saved link already exists for unlinked inode %lx",
-	       (long)inode->i_ino);
-
-	/* setup key of "save" link */
-	key.version = KEY_FORMAT_3_5;
-	key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
-	key.on_disk_key.k_objectid = inode->i_ino;
-	if (!truncate) {
-		/* unlink, rmdir, rename */
-		set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize);
-		set_cpu_key_k_type(&key, TYPE_DIRECT);
-
-		/* item head of "safe" link */
-		make_le_item_head(&ih, &key, key.version,
-				  1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
-				  4 /*length */ , 0xffff /*free space */ );
-	} else {
-		/* truncate */
-		if (S_ISDIR(inode->i_mode))
-			reiserfs_warning(inode->i_sb, "green-2102",
-					 "Adding a truncate savelink for "
-					 "a directory %k! Please report",
-					 INODE_PKEY(inode));
-		set_cpu_key_k_offset(&key, 1);
-		set_cpu_key_k_type(&key, TYPE_INDIRECT);
-
-		/* item head of "safe" link */
-		make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT,
-				  4 /*length */ , 0 /*free space */ );
-	}
-	key.key_length = 3;
-
-	/* look for its place in the tree */
-	retval = search_item(inode->i_sb, &key, &path);
-	if (retval != ITEM_NOT_FOUND) {
-		if (retval != -ENOSPC)
-			reiserfs_error(inode->i_sb, "vs-2100",
-				       "search_by_key (%K) returned %d", &key,
-				       retval);
-		pathrelse(&path);
-		return;
-	}
-
-	/* body of "save" link */
-	link = INODE_PKEY(inode)->k_dir_id;
-
-	/* put "save" link into tree, don't charge quota to anyone */
-	retval =
-	    reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
-	if (retval) {
-		if (retval != -ENOSPC)
-			reiserfs_error(inode->i_sb, "vs-2120",
-				       "insert_item returned %d", retval);
-	} else {
-		if (truncate)
-			REISERFS_I(inode)->i_flags |=
-			    i_link_saved_truncate_mask;
-		else
-			REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
-	}
-}
-
-/* this opens transaction unlike add_save_link */
-int remove_save_link(struct inode *inode, int truncate)
-{
-	struct reiserfs_transaction_handle th;
-	struct reiserfs_key key;
-	int err;
-
-	/* we are going to do one balancing only */
-	err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
-	if (err)
-		return err;
-
-	/* setup key of "save" link */
-	key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID);
-	key.k_objectid = INODE_PKEY(inode)->k_objectid;
-	if (!truncate) {
-		/* unlink, rmdir, rename */
-		set_le_key_k_offset(KEY_FORMAT_3_5, &key,
-				    1 + inode->i_sb->s_blocksize);
-		set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT);
-	} else {
-		/* truncate */
-		set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1);
-		set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
-	}
-
-	if ((truncate &&
-	     (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) ||
-	    (!truncate &&
-	     (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask)))
-		/* don't take quota bytes from anywhere */
-		reiserfs_delete_solid_item(&th, NULL, &key);
-	if (!truncate) {
-		reiserfs_release_objectid(&th, inode->i_ino);
-		REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask;
-	} else
-		REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
-
-	return journal_end(&th);
-}
-
-static void reiserfs_kill_sb(struct super_block *s)
-{
-	if (REISERFS_SB(s)) {
-		reiserfs_proc_info_done(s);
-		/*
-		 * Force any pending inode evictions to occur now. Any
-		 * inodes to be removed that have extended attributes
-		 * associated with them need to clean them up before
-		 * we can release the extended attribute root dentries.
-		 * shrink_dcache_for_umount will BUG if we don't release
-		 * those before it's called so ->put_super is too late.
-		 */
-		shrink_dcache_sb(s);
-
-		dput(REISERFS_SB(s)->xattr_root);
-		REISERFS_SB(s)->xattr_root = NULL;
-		dput(REISERFS_SB(s)->priv_root);
-		REISERFS_SB(s)->priv_root = NULL;
-	}
-
-	kill_block_super(s);
-}
-
-#ifdef CONFIG_QUOTA
-static int reiserfs_quota_off(struct super_block *sb, int type);
-
-static void reiserfs_quota_off_umount(struct super_block *s)
-{
-	int type;
-
-	for (type = 0; type < REISERFS_MAXQUOTAS; type++)
-		reiserfs_quota_off(s, type);
-}
-#else
-static inline void reiserfs_quota_off_umount(struct super_block *s)
-{
-}
-#endif
-
-static void reiserfs_put_super(struct super_block *s)
-{
-	struct reiserfs_transaction_handle th;
-	th.t_trans_id = 0;
-
-	reiserfs_quota_off_umount(s);
-
-	reiserfs_write_lock(s);
-
-	/*
-	 * change file system state to current state if it was mounted
-	 * with read-write permissions
-	 */
-	if (!sb_rdonly(s)) {
-		if (!journal_begin(&th, s, 10)) {
-			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
-						     1);
-			set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
-					    REISERFS_SB(s)->s_mount_state);
-			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-		}
-	}
-
-	/*
-	 * note, journal_release checks for readonly mount, and can
-	 * decide not to do a journal_end
-	 */
-	journal_release(&th, s);
-
-	reiserfs_free_bitmap_cache(s);
-
-	brelse(SB_BUFFER_WITH_SB(s));
-
-	print_statistics(s);
-
-	if (REISERFS_SB(s)->reserved_blocks != 0) {
-		reiserfs_warning(s, "green-2005", "reserved blocks left %d",
-				 REISERFS_SB(s)->reserved_blocks);
-	}
-
-	reiserfs_write_unlock(s);
-	mutex_destroy(&REISERFS_SB(s)->lock);
-	destroy_workqueue(REISERFS_SB(s)->commit_wq);
-	kfree(REISERFS_SB(s)->s_jdev);
-	kfree(s->s_fs_info);
-	s->s_fs_info = NULL;
-}
-
-static struct kmem_cache *reiserfs_inode_cachep;
-
-static struct inode *reiserfs_alloc_inode(struct super_block *sb)
-{
-	struct reiserfs_inode_info *ei;
-	ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL);
-	if (!ei)
-		return NULL;
-	atomic_set(&ei->openers, 0);
-	mutex_init(&ei->tailpack);
-#ifdef CONFIG_QUOTA
-	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
-#endif
-
-	return &ei->vfs_inode;
-}
-
-static void reiserfs_free_inode(struct inode *inode)
-{
-	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
-}
-
-static void init_once(void *foo)
-{
-	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
-
-	INIT_LIST_HEAD(&ei->i_prealloc_list);
-	inode_init_once(&ei->vfs_inode);
-}
-
-static int __init init_inodecache(void)
-{
-	reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
-						  sizeof(struct
-							 reiserfs_inode_info),
-						  0, (SLAB_RECLAIM_ACCOUNT|
-						      SLAB_ACCOUNT),
-						  init_once);
-	if (reiserfs_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
-static void destroy_inodecache(void)
-{
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-	kmem_cache_destroy(reiserfs_inode_cachep);
-}
-
-/* we don't mark inodes dirty, we just log them */
-static void reiserfs_dirty_inode(struct inode *inode, int flags)
-{
-	struct reiserfs_transaction_handle th;
-
-	int err = 0;
-
-	if (sb_rdonly(inode->i_sb)) {
-		reiserfs_warning(inode->i_sb, "clm-6006",
-				 "writing inode %lu on readonly FS",
-				 inode->i_ino);
-		return;
-	}
-	reiserfs_write_lock(inode->i_sb);
-
-	/*
-	 * this is really only used for atime updates, so they don't have
-	 * to be included in O_SYNC or fsync
-	 */
-	err = journal_begin(&th, inode->i_sb, 1);
-	if (err)
-		goto out;
-
-	reiserfs_update_sd(&th, inode);
-	journal_end(&th);
-
-out:
-	reiserfs_write_unlock(inode->i_sb);
-}
-
-static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
-{
-	struct super_block *s = root->d_sb;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	long opts = REISERFS_SB(s)->s_mount_opt;
-
-	if (opts & (1 << REISERFS_LARGETAIL))
-		seq_puts(seq, ",tails=on");
-	else if (!(opts & (1 << REISERFS_SMALLTAIL)))
-		seq_puts(seq, ",notail");
-	/* tails=small is default so we don't show it */
-
-	if (!(opts & (1 << REISERFS_BARRIER_FLUSH)))
-		seq_puts(seq, ",barrier=none");
-	/* barrier=flush is default so we don't show it */
-
-	if (opts & (1 << REISERFS_ERROR_CONTINUE))
-		seq_puts(seq, ",errors=continue");
-	else if (opts & (1 << REISERFS_ERROR_PANIC))
-		seq_puts(seq, ",errors=panic");
-	/* errors=ro is default so we don't show it */
-
-	if (opts & (1 << REISERFS_DATA_LOG))
-		seq_puts(seq, ",data=journal");
-	else if (opts & (1 << REISERFS_DATA_WRITEBACK))
-		seq_puts(seq, ",data=writeback");
-	/* data=ordered is default so we don't show it */
-
-	if (opts & (1 << REISERFS_ATTRS))
-		seq_puts(seq, ",attrs");
-
-	if (opts & (1 << REISERFS_XATTRS_USER))
-		seq_puts(seq, ",user_xattr");
-
-	if (opts & (1 << REISERFS_EXPOSE_PRIVROOT))
-		seq_puts(seq, ",expose_privroot");
-
-	if (opts & (1 << REISERFS_POSIXACL))
-		seq_puts(seq, ",acl");
-
-	if (REISERFS_SB(s)->s_jdev)
-		seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
-
-	if (journal->j_max_commit_age != journal->j_default_max_commit_age)
-		seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
-
-#ifdef CONFIG_QUOTA
-	if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
-		seq_show_option(seq, "usrjquota",
-				REISERFS_SB(s)->s_qf_names[USRQUOTA]);
-	else if (opts & (1 << REISERFS_USRQUOTA))
-		seq_puts(seq, ",usrquota");
-	if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
-		seq_show_option(seq, "grpjquota",
-				REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
-	else if (opts & (1 << REISERFS_GRPQUOTA))
-		seq_puts(seq, ",grpquota");
-	if (REISERFS_SB(s)->s_jquota_fmt) {
-		if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD)
-			seq_puts(seq, ",jqfmt=vfsold");
-		else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0)
-			seq_puts(seq, ",jqfmt=vfsv0");
-	}
-#endif
-
-	/* Block allocator options */
-	if (opts & (1 << REISERFS_NO_BORDER))
-		seq_puts(seq, ",block-allocator=noborder");
-	if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION))
-		seq_puts(seq, ",block-allocator=no_unhashed_relocation");
-	if (opts & (1 << REISERFS_HASHED_RELOCATION))
-		seq_puts(seq, ",block-allocator=hashed_relocation");
-	if (opts & (1 << REISERFS_TEST4))
-		seq_puts(seq, ",block-allocator=test4");
-	show_alloc_options(seq, s);
-	return 0;
-}
-
-#ifdef CONFIG_QUOTA
-static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
-				    size_t, loff_t);
-static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
-				   loff_t);
-
-static struct dquot __rcu **reiserfs_get_dquots(struct inode *inode)
-{
-	return REISERFS_I(inode)->i_dquot;
-}
-#endif
-
-static const struct super_operations reiserfs_sops = {
-	.alloc_inode = reiserfs_alloc_inode,
-	.free_inode = reiserfs_free_inode,
-	.write_inode = reiserfs_write_inode,
-	.dirty_inode = reiserfs_dirty_inode,
-	.evict_inode = reiserfs_evict_inode,
-	.put_super = reiserfs_put_super,
-	.sync_fs = reiserfs_sync_fs,
-	.freeze_fs = reiserfs_freeze,
-	.unfreeze_fs = reiserfs_unfreeze,
-	.statfs = reiserfs_statfs,
-	.remount_fs = reiserfs_remount,
-	.show_options = reiserfs_show_options,
-#ifdef CONFIG_QUOTA
-	.quota_read = reiserfs_quota_read,
-	.quota_write = reiserfs_quota_write,
-	.get_dquots = reiserfs_get_dquots,
-#endif
-};
-
-#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
-
-static int reiserfs_write_dquot(struct dquot *);
-static int reiserfs_acquire_dquot(struct dquot *);
-static int reiserfs_release_dquot(struct dquot *);
-static int reiserfs_mark_dquot_dirty(struct dquot *);
-static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, const struct path *);
-
-static const struct dquot_operations reiserfs_quota_operations = {
-	.write_dquot = reiserfs_write_dquot,
-	.acquire_dquot = reiserfs_acquire_dquot,
-	.release_dquot = reiserfs_release_dquot,
-	.mark_dirty = reiserfs_mark_dquot_dirty,
-	.write_info = reiserfs_write_info,
-	.alloc_dquot	= dquot_alloc,
-	.destroy_dquot	= dquot_destroy,
-	.get_next_id	= dquot_get_next_id,
-};
-
-static const struct quotactl_ops reiserfs_qctl_operations = {
-	.quota_on = reiserfs_quota_on,
-	.quota_off = reiserfs_quota_off,
-	.quota_sync = dquot_quota_sync,
-	.get_state = dquot_get_state,
-	.set_info = dquot_set_dqinfo,
-	.get_dqblk = dquot_get_dqblk,
-	.set_dqblk = dquot_set_dqblk,
-};
-#endif
-
-static const struct export_operations reiserfs_export_ops = {
-	.encode_fh = reiserfs_encode_fh,
-	.fh_to_dentry = reiserfs_fh_to_dentry,
-	.fh_to_parent = reiserfs_fh_to_parent,
-	.get_parent = reiserfs_get_parent,
-};
-
-/*
- * this struct is used in reiserfs_getopt () for containing the value for
- * those mount options that have values rather than being toggles.
- */
-typedef struct {
-	char *value;
-	/*
-	 * bitmask which is to set on mount_options bitmask
-	 * when this value is found, 0 is no bits are to be changed.
-	 */
-	int setmask;
-	/*
-	 * bitmask which is to clear on mount_options bitmask
-	 * when this value is found, 0 is no bits are to be changed.
-	 * This is applied BEFORE setmask
-	 */
-	int clrmask;
-} arg_desc_t;
-
-/* Set this bit in arg_required to allow empty arguments */
-#define REISERFS_OPT_ALLOWEMPTY 31
-
-/*
- * this struct is used in reiserfs_getopt() for describing the
- * set of reiserfs mount options
- */
-typedef struct {
-	char *option_name;
-
-	/* 0 if argument is not required, not 0 otherwise */
-	int arg_required;
-
-	/* list of values accepted by an option */
-	const arg_desc_t *values;
-
-	/*
-	 * bitmask which is to set on mount_options bitmask
-	 * when this value is found, 0 is no bits are to be changed.
-	 */
-	int setmask;
-
-	/*
-	 * bitmask which is to clear on mount_options bitmask
-	 * when this value is found, 0 is no bits are to be changed.
-	 * This is applied BEFORE setmask
-	 */
-	int clrmask;
-} opt_desc_t;
-
-/* possible values for -o data= */
-static const arg_desc_t logging_mode[] = {
-	{"ordered", 1 << REISERFS_DATA_ORDERED,
-	 (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)},
-	{"journal", 1 << REISERFS_DATA_LOG,
-	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)},
-	{"writeback", 1 << REISERFS_DATA_WRITEBACK,
-	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)},
-	{.value = NULL}
-};
-
-/* possible values for -o barrier= */
-static const arg_desc_t barrier_mode[] = {
-	{"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH},
-	{"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE},
-	{.value = NULL}
-};
-
-/*
- * possible values for "-o block-allocator=" and bits which are to be set in
- * s_mount_opt of reiserfs specific part of in-core super block
- */
-static const arg_desc_t balloc[] = {
-	{"noborder", 1 << REISERFS_NO_BORDER, 0},
-	{"border", 0, 1 << REISERFS_NO_BORDER},
-	{"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0},
-	{"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0},
-	{"test4", 1 << REISERFS_TEST4, 0},
-	{"notest4", 0, 1 << REISERFS_TEST4},
-	{NULL, 0, 0}
-};
-
-static const arg_desc_t tails[] = {
-	{"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL},
-	{"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
-	{"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL},
-	{NULL, 0, 0}
-};
-
-static const arg_desc_t error_actions[] = {
-	{"panic", 1 << REISERFS_ERROR_PANIC,
-	 (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
-	{"ro-remount", 1 << REISERFS_ERROR_RO,
-	 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
-#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG
-	{"continue", 1 << REISERFS_ERROR_CONTINUE,
-	 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
-#endif
-	{NULL, 0, 0},
-};
-
-/*
- * proceed only one option from a list *cur - string containing of mount
- * options
- * opts - array of options which are accepted
- * opt_arg - if option is found and requires an argument and if it is specifed
- * in the input - pointer to the argument is stored here
- * bit_flags - if option requires to set a certain bit - it is set here
- * return -1 if unknown option is found, opt->arg_required otherwise
- */
-static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
-			   char **opt_arg, unsigned long *bit_flags)
-{
-	char *p;
-	/*
-	 * foo=bar,
-	 * ^   ^  ^
-	 * |   |  +-- option_end
-	 * |   +-- arg_start
-	 * +-- option_start
-	 */
-	const opt_desc_t *opt;
-	const arg_desc_t *arg;
-
-	p = *cur;
-
-	/* assume argument cannot contain commas */
-	*cur = strchr(p, ',');
-	if (*cur) {
-		*(*cur) = '\0';
-		(*cur)++;
-	}
-
-	if (!strncmp(p, "alloc=", 6)) {
-		/*
-		 * Ugly special case, probably we should redo options
-		 * parser so that it can understand several arguments for
-		 * some options, also so that it can fill several bitfields
-		 * with option values.
-		 */
-		if (reiserfs_parse_alloc_options(s, p + 6)) {
-			return -1;
-		} else {
-			return 0;
-		}
-	}
-
-	/* for every option in the list */
-	for (opt = opts; opt->option_name; opt++) {
-		if (!strncmp(p, opt->option_name, strlen(opt->option_name))) {
-			if (bit_flags) {
-				if (opt->clrmask ==
-				    (1 << REISERFS_UNSUPPORTED_OPT))
-					reiserfs_warning(s, "super-6500",
-							 "%s not supported.\n",
-							 p);
-				else
-					*bit_flags &= ~opt->clrmask;
-				if (opt->setmask ==
-				    (1 << REISERFS_UNSUPPORTED_OPT))
-					reiserfs_warning(s, "super-6501",
-							 "%s not supported.\n",
-							 p);
-				else
-					*bit_flags |= opt->setmask;
-			}
-			break;
-		}
-	}
-	if (!opt->option_name) {
-		reiserfs_warning(s, "super-6502",
-				 "unknown mount option \"%s\"", p);
-		return -1;
-	}
-
-	p += strlen(opt->option_name);
-	switch (*p) {
-	case '=':
-		if (!opt->arg_required) {
-			reiserfs_warning(s, "super-6503",
-					 "the option \"%s\" does not "
-					 "require an argument\n",
-					 opt->option_name);
-			return -1;
-		}
-		break;
-
-	case 0:
-		if (opt->arg_required) {
-			reiserfs_warning(s, "super-6504",
-					 "the option \"%s\" requires an "
-					 "argument\n", opt->option_name);
-			return -1;
-		}
-		break;
-	default:
-		reiserfs_warning(s, "super-6505",
-				 "head of option \"%s\" is only correct\n",
-				 opt->option_name);
-		return -1;
-	}
-
-	/*
-	 * move to the argument, or to next option if argument is not
-	 * required
-	 */
-	p++;
-
-	if (opt->arg_required
-	    && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
-	    && !strlen(p)) {
-		/* this catches "option=," if not allowed */
-		reiserfs_warning(s, "super-6506",
-				 "empty argument for \"%s\"\n",
-				 opt->option_name);
-		return -1;
-	}
-
-	if (!opt->values) {
-		/* *=NULLopt_arg contains pointer to argument */
-		*opt_arg = p;
-		return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY);
-	}
-
-	/* values possible for this option are listed in opt->values */
-	for (arg = opt->values; arg->value; arg++) {
-		if (!strcmp(p, arg->value)) {
-			if (bit_flags) {
-				*bit_flags &= ~arg->clrmask;
-				*bit_flags |= arg->setmask;
-			}
-			return opt->arg_required;
-		}
-	}
-
-	reiserfs_warning(s, "super-6506",
-			 "bad value \"%s\" for option \"%s\"\n", p,
-			 opt->option_name);
-	return -1;
-}
-
-/* returns 0 if something is wrong in option string, 1 - otherwise */
-static int reiserfs_parse_options(struct super_block *s,
-
-				  /* string given via mount's -o */
-				  char *options,
-
-				  /*
-				   * after the parsing phase, contains the
-				   * collection of bitflags defining what
-				   * mount options were selected.
-				   */
-				  unsigned long *mount_options,
-
-				  /* strtol-ed from NNN of resize=NNN */
-				  unsigned long *blocks,
-				  char **jdev_name,
-				  unsigned int *commit_max_age,
-				  char **qf_names,
-				  unsigned int *qfmt)
-{
-	int c;
-	char *arg = NULL;
-	char *pos;
-	opt_desc_t opts[] = {
-		/*
-		 * Compatibility stuff, so that -o notail for old
-		 * setups still work
-		 */
-		{"tails",.arg_required = 't',.values = tails},
-		{"notail",.clrmask =
-		 (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
-		{"conv",.setmask = 1 << REISERFS_CONVERT},
-		{"attrs",.setmask = 1 << REISERFS_ATTRS},
-		{"noattrs",.clrmask = 1 << REISERFS_ATTRS},
-		{"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
-#ifdef CONFIG_REISERFS_FS_XATTR
-		{"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
-		{"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
-#else
-		{"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
-		{"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
-#endif
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-		{"acl",.setmask = 1 << REISERFS_POSIXACL},
-		{"noacl",.clrmask = 1 << REISERFS_POSIXACL},
-#else
-		{"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
-		{"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
-#endif
-		{.option_name = "nolog"},
-		{"replayonly",.setmask = 1 << REPLAYONLY},
-		{"block-allocator",.arg_required = 'a',.values = balloc},
-		{"data",.arg_required = 'd',.values = logging_mode},
-		{"barrier",.arg_required = 'b',.values = barrier_mode},
-		{"resize",.arg_required = 'r',.values = NULL},
-		{"jdev",.arg_required = 'j',.values = NULL},
-		{"nolargeio",.arg_required = 'w',.values = NULL},
-		{"commit",.arg_required = 'c',.values = NULL},
-		{"usrquota",.setmask = 1 << REISERFS_USRQUOTA},
-		{"grpquota",.setmask = 1 << REISERFS_GRPQUOTA},
-		{"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA},
-		{"errors",.arg_required = 'e',.values = error_actions},
-		{"usrjquota",.arg_required =
-		 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
-		{"grpjquota",.arg_required =
-		 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
-		{"jqfmt",.arg_required = 'f',.values = NULL},
-		{.option_name = NULL}
-	};
-
-	*blocks = 0;
-	if (!options || !*options)
-		/*
-		 * use default configuration: create tails, journaling on, no
-		 * conversion to newest format
-		 */
-		return 1;
-
-	for (pos = options; pos;) {
-		c = reiserfs_getopt(s, &pos, opts, &arg, mount_options);
-		if (c == -1)
-			/* wrong option is given */
-			return 0;
-
-		if (c == 'r') {
-			char *p;
-
-			p = NULL;
-			/* "resize=NNN" or "resize=auto" */
-
-			if (!strcmp(arg, "auto")) {
-				/* From JFS code, to auto-get the size. */
-				*blocks = sb_bdev_nr_blocks(s);
-			} else {
-				*blocks = simple_strtoul(arg, &p, 0);
-				if (*p != '\0') {
-					/* NNN does not look like a number */
-					reiserfs_warning(s, "super-6507",
-							 "bad value %s for "
-							 "-oresize\n", arg);
-					return 0;
-				}
-			}
-		}
-
-		if (c == 'c') {
-			char *p = NULL;
-			unsigned long val = simple_strtoul(arg, &p, 0);
-			/* commit=NNN (time in seconds) */
-			if (*p != '\0' || val >= (unsigned int)-1) {
-				reiserfs_warning(s, "super-6508",
-						 "bad value %s for -ocommit\n",
-						 arg);
-				return 0;
-			}
-			*commit_max_age = (unsigned int)val;
-		}
-
-		if (c == 'w') {
-			reiserfs_warning(s, "super-6509", "nolargeio option "
-					 "is no longer supported");
-			return 0;
-		}
-
-		if (c == 'j') {
-			if (arg && *arg && jdev_name) {
-				/* Hm, already assigned? */
-				if (*jdev_name) {
-					reiserfs_warning(s, "super-6510",
-							 "journal device was "
-							 "already specified to "
-							 "be %s", *jdev_name);
-					return 0;
-				}
-				*jdev_name = arg;
-			}
-		}
-#ifdef CONFIG_QUOTA
-		if (c == 'u' || c == 'g') {
-			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
-
-			if (sb_any_quota_loaded(s) &&
-			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
-				reiserfs_warning(s, "super-6511",
-						 "cannot change journaled "
-						 "quota options when quota "
-						 "turned on.");
-				return 0;
-			}
-			if (qf_names[qtype] !=
-			    REISERFS_SB(s)->s_qf_names[qtype])
-				kfree(qf_names[qtype]);
-			qf_names[qtype] = NULL;
-			if (*arg) {	/* Some filename specified? */
-				if (REISERFS_SB(s)->s_qf_names[qtype]
-				    && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
-					      arg)) {
-					reiserfs_warning(s, "super-6512",
-							 "%s quota file "
-							 "already specified.",
-							 QTYPE2NAME(qtype));
-					return 0;
-				}
-				if (strchr(arg, '/')) {
-					reiserfs_warning(s, "super-6513",
-							 "quotafile must be "
-							 "on filesystem root.");
-					return 0;
-				}
-				qf_names[qtype] = kstrdup(arg, GFP_KERNEL);
-				if (!qf_names[qtype]) {
-					reiserfs_warning(s, "reiserfs-2502",
-							 "not enough memory "
-							 "for storing "
-							 "quotafile name.");
-					return 0;
-				}
-				if (qtype == USRQUOTA)
-					*mount_options |= 1 << REISERFS_USRQUOTA;
-				else
-					*mount_options |= 1 << REISERFS_GRPQUOTA;
-			} else {
-				if (qtype == USRQUOTA)
-					*mount_options &= ~(1 << REISERFS_USRQUOTA);
-				else
-					*mount_options &= ~(1 << REISERFS_GRPQUOTA);
-			}
-		}
-		if (c == 'f') {
-			if (!strcmp(arg, "vfsold"))
-				*qfmt = QFMT_VFS_OLD;
-			else if (!strcmp(arg, "vfsv0"))
-				*qfmt = QFMT_VFS_V0;
-			else {
-				reiserfs_warning(s, "super-6514",
-						 "unknown quota format "
-						 "specified.");
-				return 0;
-			}
-			if (sb_any_quota_loaded(s) &&
-			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
-				reiserfs_warning(s, "super-6515",
-						 "cannot change journaled "
-						 "quota options when quota "
-						 "turned on.");
-				return 0;
-			}
-		}
-#else
-		if (c == 'u' || c == 'g' || c == 'f') {
-			reiserfs_warning(s, "reiserfs-2503", "journaled "
-					 "quota options not supported.");
-			return 0;
-		}
-#endif
-	}
-
-#ifdef CONFIG_QUOTA
-	if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
-	    && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
-		reiserfs_warning(s, "super-6515",
-				 "journaled quota format not specified.");
-		return 0;
-	}
-	if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) &&
-	       sb_has_quota_loaded(s, USRQUOTA)) ||
-	    (!(*mount_options & (1 << REISERFS_GRPQUOTA)) &&
-	       sb_has_quota_loaded(s, GRPQUOTA))) {
-		reiserfs_warning(s, "super-6516", "quota options must "
-				 "be present when quota is turned on.");
-		return 0;
-	}
-#endif
-
-	return 1;
-}
-
-static void switch_data_mode(struct super_block *s, unsigned long mode)
-{
-	REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
-					 (1 << REISERFS_DATA_ORDERED) |
-					 (1 << REISERFS_DATA_WRITEBACK));
-	REISERFS_SB(s)->s_mount_opt |= (1 << mode);
-}
-
-static void handle_data_mode(struct super_block *s, unsigned long mount_options)
-{
-	if (mount_options & (1 << REISERFS_DATA_LOG)) {
-		if (!reiserfs_data_log(s)) {
-			switch_data_mode(s, REISERFS_DATA_LOG);
-			reiserfs_info(s, "switching to journaled data mode\n");
-		}
-	} else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
-		if (!reiserfs_data_ordered(s)) {
-			switch_data_mode(s, REISERFS_DATA_ORDERED);
-			reiserfs_info(s, "switching to ordered data mode\n");
-		}
-	} else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
-		if (!reiserfs_data_writeback(s)) {
-			switch_data_mode(s, REISERFS_DATA_WRITEBACK);
-			reiserfs_info(s, "switching to writeback data mode\n");
-		}
-	}
-}
-
-static void handle_barrier_mode(struct super_block *s, unsigned long bits)
-{
-	int flush = (1 << REISERFS_BARRIER_FLUSH);
-	int none = (1 << REISERFS_BARRIER_NONE);
-	int all_barrier = flush | none;
-
-	if (bits & all_barrier) {
-		REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
-		if (bits & flush) {
-			REISERFS_SB(s)->s_mount_opt |= flush;
-			printk("reiserfs: enabling write barrier flush mode\n");
-		} else if (bits & none) {
-			REISERFS_SB(s)->s_mount_opt |= none;
-			printk("reiserfs: write barriers turned off\n");
-		}
-	}
-}
-
-static void handle_attrs(struct super_block *s)
-{
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
-
-	if (reiserfs_attrs(s)) {
-		if (old_format_only(s)) {
-			reiserfs_warning(s, "super-6517", "cannot support "
-					 "attributes on 3.5.x disk format");
-			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
-			return;
-		}
-		if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
-			reiserfs_warning(s, "super-6518", "cannot support "
-					 "attributes until flag is set in "
-					 "super-block");
-			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
-		}
-	}
-}
-
-#ifdef CONFIG_QUOTA
-static void handle_quota_files(struct super_block *s, char **qf_names,
-			       unsigned int *qfmt)
-{
-	int i;
-
-	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
-		if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
-			kfree(REISERFS_SB(s)->s_qf_names[i]);
-		REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
-	}
-	if (*qfmt)
-		REISERFS_SB(s)->s_jquota_fmt = *qfmt;
-}
-#endif
-
-static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
-{
-	struct reiserfs_super_block *rs;
-	struct reiserfs_transaction_handle th;
-	unsigned long blocks;
-	unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
-	unsigned long safe_mask = 0;
-	unsigned int commit_max_age = (unsigned int)-1;
-	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int err;
-	char *qf_names[REISERFS_MAXQUOTAS];
-	unsigned int qfmt = 0;
-#ifdef CONFIG_QUOTA
-	int i;
-#endif
-
-	sync_filesystem(s);
-	reiserfs_write_lock(s);
-
-#ifdef CONFIG_QUOTA
-	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
-#endif
-
-	rs = SB_DISK_SUPER_BLOCK(s);
-
-	if (!reiserfs_parse_options
-	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
-	    qf_names, &qfmt)) {
-#ifdef CONFIG_QUOTA
-		for (i = 0; i < REISERFS_MAXQUOTAS; i++)
-			if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
-				kfree(qf_names[i]);
-#endif
-		err = -EINVAL;
-		goto out_err_unlock;
-	}
-#ifdef CONFIG_QUOTA
-	handle_quota_files(s, qf_names, &qfmt);
-#endif
-
-	handle_attrs(s);
-
-	/* Add options that are safe here */
-	safe_mask |= 1 << REISERFS_SMALLTAIL;
-	safe_mask |= 1 << REISERFS_LARGETAIL;
-	safe_mask |= 1 << REISERFS_NO_BORDER;
-	safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
-	safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
-	safe_mask |= 1 << REISERFS_TEST4;
-	safe_mask |= 1 << REISERFS_ATTRS;
-	safe_mask |= 1 << REISERFS_XATTRS_USER;
-	safe_mask |= 1 << REISERFS_POSIXACL;
-	safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
-	safe_mask |= 1 << REISERFS_BARRIER_NONE;
-	safe_mask |= 1 << REISERFS_ERROR_RO;
-	safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
-	safe_mask |= 1 << REISERFS_ERROR_PANIC;
-	safe_mask |= 1 << REISERFS_USRQUOTA;
-	safe_mask |= 1 << REISERFS_GRPQUOTA;
-
-	/*
-	 * Update the bitmask, taking care to keep
-	 * the bits we're not allowed to change here
-	 */
-	REISERFS_SB(s)->s_mount_opt =
-	    (REISERFS_SB(s)->
-	     s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
-
-	if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
-		journal->j_max_commit_age = commit_max_age;
-		journal->j_max_trans_age = commit_max_age;
-	} else if (commit_max_age == 0) {
-		/* 0 means restore defaults. */
-		journal->j_max_commit_age = journal->j_default_max_commit_age;
-		journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
-	}
-
-	if (blocks) {
-		err = reiserfs_resize(s, blocks);
-		if (err != 0)
-			goto out_err_unlock;
-	}
-
-	if (*mount_flags & SB_RDONLY) {
-		reiserfs_write_unlock(s);
-		reiserfs_xattr_init(s, *mount_flags);
-		/* remount read-only */
-		if (sb_rdonly(s))
-			/* it is read-only already */
-			goto out_ok_unlocked;
-
-		err = dquot_suspend(s, -1);
-		if (err < 0)
-			goto out_err;
-
-		/* try to remount file system with read-only permissions */
-		if (sb_umount_state(rs) == REISERFS_VALID_FS
-		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
-			goto out_ok_unlocked;
-		}
-
-		reiserfs_write_lock(s);
-
-		err = journal_begin(&th, s, 10);
-		if (err)
-			goto out_err_unlock;
-
-		/* Mounting a rw partition read-only. */
-		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-		set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
-		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-	} else {
-		/* remount read-write */
-		if (!sb_rdonly(s)) {
-			reiserfs_write_unlock(s);
-			reiserfs_xattr_init(s, *mount_flags);
-			goto out_ok_unlocked;	/* We are read-write already */
-		}
-
-		if (reiserfs_is_journal_aborted(journal)) {
-			err = journal->j_errno;
-			goto out_err_unlock;
-		}
-
-		handle_data_mode(s, mount_options);
-		handle_barrier_mode(s, mount_options);
-		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
-
-		/* now it is safe to call journal_begin */
-		s->s_flags &= ~SB_RDONLY;
-		err = journal_begin(&th, s, 10);
-		if (err)
-			goto out_err_unlock;
-
-		/* Mount a partition which is read-only, read-write */
-		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
-		s->s_flags &= ~SB_RDONLY;
-		set_sb_umount_state(rs, REISERFS_ERROR_FS);
-		if (!old_format_only(s))
-			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
-		/* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
-		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-		REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
-	}
-	/* this will force a full flush of all journal lists */
-	SB_JOURNAL(s)->j_must_wait = 1;
-	err = journal_end(&th);
-	if (err)
-		goto out_err_unlock;
-
-	reiserfs_write_unlock(s);
-	if (!(*mount_flags & SB_RDONLY)) {
-		dquot_resume(s, -1);
-		reiserfs_write_lock(s);
-		finish_unfinished(s);
-		reiserfs_write_unlock(s);
-		reiserfs_xattr_init(s, *mount_flags);
-	}
-
-out_ok_unlocked:
-	return 0;
-
-out_err_unlock:
-	reiserfs_write_unlock(s);
-out_err:
-	return err;
-}
-
-static int read_super_block(struct super_block *s, int offset)
-{
-	struct buffer_head *bh;
-	struct reiserfs_super_block *rs;
-	int fs_blocksize;
-
-	bh = sb_bread(s, offset / s->s_blocksize);
-	if (!bh) {
-		reiserfs_warning(s, "sh-2006",
-				 "bread failed (dev %s, block %lu, size %lu)",
-				 s->s_id, offset / s->s_blocksize,
-				 s->s_blocksize);
-		return 1;
-	}
-
-	rs = (struct reiserfs_super_block *)bh->b_data;
-	if (!is_any_reiserfs_magic_string(rs)) {
-		brelse(bh);
-		return 1;
-	}
-	/*
-	 * ok, reiserfs signature (old or new) found in at the given offset
-	 */
-	fs_blocksize = sb_blocksize(rs);
-	brelse(bh);
-	sb_set_blocksize(s, fs_blocksize);
-
-	bh = sb_bread(s, offset / s->s_blocksize);
-	if (!bh) {
-		reiserfs_warning(s, "sh-2007",
-				 "bread failed (dev %s, block %lu, size %lu)",
-				 s->s_id, offset / s->s_blocksize,
-				 s->s_blocksize);
-		return 1;
-	}
-
-	rs = (struct reiserfs_super_block *)bh->b_data;
-	if (sb_blocksize(rs) != s->s_blocksize) {
-		reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
-				 "filesystem on (dev %s, block %llu, size %lu)",
-				 s->s_id,
-				 (unsigned long long)bh->b_blocknr,
-				 s->s_blocksize);
-		brelse(bh);
-		return 1;
-	}
-
-	if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
-		brelse(bh);
-		reiserfs_warning(s, "super-6519", "Unfinished reiserfsck "
-				 "--rebuild-tree run detected. Please run\n"
-				 "reiserfsck --rebuild-tree and wait for a "
-				 "completion. If that fails\n"
-				 "get newer reiserfsprogs package");
-		return 1;
-	}
-
-	reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and "
-		"scheduled to be removed from the kernel in 2025");
-	SB_BUFFER_WITH_SB(s) = bh;
-	SB_DISK_SUPER_BLOCK(s) = rs;
-
-	/*
-	 * magic is of non-standard journal filesystem, look at s_version to
-	 * find which format is in use
-	 */
-	if (is_reiserfs_jr(rs)) {
-		if (sb_version(rs) == REISERFS_VERSION_2)
-			reiserfs_info(s, "found reiserfs format \"3.6\""
-				      " with non-standard journal\n");
-		else if (sb_version(rs) == REISERFS_VERSION_1)
-			reiserfs_info(s, "found reiserfs format \"3.5\""
-				      " with non-standard journal\n");
-		else {
-			reiserfs_warning(s, "sh-2012", "found unknown "
-					 "format \"%u\" of reiserfs with "
-					 "non-standard magic", sb_version(rs));
-			return 1;
-		}
-	} else
-		/*
-		 * s_version of standard format may contain incorrect
-		 * information, so we just look at the magic string
-		 */
-		reiserfs_info(s,
-			      "found reiserfs format \"%s\" with standard journal\n",
-			      is_reiserfs_3_5(rs) ? "3.5" : "3.6");
-
-	s->s_op = &reiserfs_sops;
-	s->s_export_op = &reiserfs_export_ops;
-#ifdef CONFIG_QUOTA
-	s->s_qcop = &reiserfs_qctl_operations;
-	s->dq_op = &reiserfs_quota_operations;
-	s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
-#endif
-
-	/*
-	 * new format is limited by the 32 bit wide i_blocks field, want to
-	 * be one full block below that.
-	 */
-	s->s_maxbytes = (512LL << 32) - s->s_blocksize;
-	return 0;
-}
-
-/* after journal replay, reread all bitmap and super blocks */
-static int reread_meta_blocks(struct super_block *s)
-{
-	if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) {
-		reiserfs_warning(s, "reiserfs-2504", "error reading the super");
-		return 1;
-	}
-
-	return 0;
-}
-
-/* hash detection stuff */
-
-/*
- * if root directory is empty - we set default - Yura's - hash and
- * warn about it
- * FIXME: we look for only one name in a directory. If tea and yura
- * both have the same value - we ask user to send report to the
- * mailing list
- */
-static __u32 find_hash_out(struct super_block *s)
-{
-	int retval;
-	struct inode *inode;
-	struct cpu_key key;
-	INITIALIZE_PATH(path);
-	struct reiserfs_dir_entry de;
-	struct reiserfs_de_head *deh;
-	__u32 hash = DEFAULT_HASH;
-	__u32 deh_hashval, teahash, r5hash, yurahash;
-
-	inode = d_inode(s->s_root);
-
-	make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
-	retval = search_by_entry_key(s, &key, &path, &de);
-	if (retval == IO_ERROR) {
-		pathrelse(&path);
-		return UNSET_HASH;
-	}
-	if (retval == NAME_NOT_FOUND)
-		de.de_entry_num--;
-
-	set_de_name_and_namelen(&de);
-	deh = de.de_deh + de.de_entry_num;
-
-	if (deh_offset(deh) == DOT_DOT_OFFSET) {
-		/* allow override in this case */
-		if (reiserfs_rupasov_hash(s))
-			hash = YURA_HASH;
-		reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n");
-		goto out;
-	}
-
-	deh_hashval = GET_HASH_VALUE(deh_offset(deh));
-	r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
-	teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
-	yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
-
-	if ((teahash == r5hash && deh_hashval == r5hash) ||
-	    (teahash == yurahash && deh_hashval == yurahash) ||
-	    (r5hash == yurahash && deh_hashval == yurahash)) {
-		reiserfs_warning(s, "reiserfs-2506",
-				 "Unable to automatically detect hash "
-				 "function. Please mount with -o "
-				 "hash={tea,rupasov,r5}");
-		hash = UNSET_HASH;
-		goto out;
-	}
-
-	if (deh_hashval == yurahash)
-		hash = YURA_HASH;
-	else if (deh_hashval == teahash)
-		hash = TEA_HASH;
-	else if (deh_hashval == r5hash)
-		hash = R5_HASH;
-	else {
-		reiserfs_warning(s, "reiserfs-2506",
-				 "Unrecognised hash function");
-		hash = UNSET_HASH;
-	}
-out:
-	pathrelse(&path);
-	return hash;
-}
-
-/* finds out which hash names are sorted with */
-static int what_hash(struct super_block *s)
-{
-	__u32 code;
-
-	code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
-
-	/*
-	 * reiserfs_hash_detect() == true if any of the hash mount options
-	 * were used.  We must check them to make sure the user isn't
-	 * using a bad hash value
-	 */
-	if (code == UNSET_HASH || reiserfs_hash_detect(s))
-		code = find_hash_out(s);
-
-	if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
-		/*
-		 * detection has found the hash, and we must check against the
-		 * mount options
-		 */
-		if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
-			reiserfs_warning(s, "reiserfs-2507",
-					 "Error, %s hash detected, "
-					 "unable to force rupasov hash",
-					 reiserfs_hashname(code));
-			code = UNSET_HASH;
-		} else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
-			reiserfs_warning(s, "reiserfs-2508",
-					 "Error, %s hash detected, "
-					 "unable to force tea hash",
-					 reiserfs_hashname(code));
-			code = UNSET_HASH;
-		} else if (reiserfs_r5_hash(s) && code != R5_HASH) {
-			reiserfs_warning(s, "reiserfs-2509",
-					 "Error, %s hash detected, "
-					 "unable to force r5 hash",
-					 reiserfs_hashname(code));
-			code = UNSET_HASH;
-		}
-	} else {
-		/*
-		 * find_hash_out was not called or
-		 * could not determine the hash
-		 */
-		if (reiserfs_rupasov_hash(s)) {
-			code = YURA_HASH;
-		} else if (reiserfs_tea_hash(s)) {
-			code = TEA_HASH;
-		} else if (reiserfs_r5_hash(s)) {
-			code = R5_HASH;
-		}
-	}
-
-	/*
-	 * if we are mounted RW, and we have a new valid hash code, update
-	 * the super
-	 */
-	if (code != UNSET_HASH &&
-	    !sb_rdonly(s) &&
-	    code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
-		set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
-	}
-	return code;
-}
-
-/* return pointer to appropriate function */
-static hashf_t hash_function(struct super_block *s)
-{
-	switch (what_hash(s)) {
-	case TEA_HASH:
-		reiserfs_info(s, "Using tea hash to sort names\n");
-		return keyed_hash;
-	case YURA_HASH:
-		reiserfs_info(s, "Using rupasov hash to sort names\n");
-		return yura_hash;
-	case R5_HASH:
-		reiserfs_info(s, "Using r5 hash to sort names\n");
-		return r5_hash;
-	}
-	return NULL;
-}
-
-/* this is used to set up correct value for old partitions */
-static int function2code(hashf_t func)
-{
-	if (func == keyed_hash)
-		return TEA_HASH;
-	if (func == yura_hash)
-		return YURA_HASH;
-	if (func == r5_hash)
-		return R5_HASH;
-
-	BUG();			/* should never happen */
-
-	return 0;
-}
-
-#define SWARN(silent, s, id, ...)			\
-	if (!(silent))				\
-		reiserfs_warning(s, id, __VA_ARGS__)
-
-static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
-{
-	struct inode *root_inode;
-	struct reiserfs_transaction_handle th;
-	int old_format = 0;
-	unsigned long blocks;
-	unsigned int commit_max_age = 0;
-	int jinit_done = 0;
-	struct reiserfs_iget_args args;
-	struct reiserfs_super_block *rs;
-	char *jdev_name;
-	struct reiserfs_sb_info *sbi;
-	int errval = -EINVAL;
-	char *qf_names[REISERFS_MAXQUOTAS] = {};
-	unsigned int qfmt = 0;
-
-	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi)
-		return -ENOMEM;
-	s->s_fs_info = sbi;
-	/* Set default values for options: non-aggressive tails, RO on errors */
-	sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
-	sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
-	sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
-	/* no preallocation minimum, be smart in reiserfs_file_write instead */
-	sbi->s_alloc_options.preallocmin = 0;
-	/* Preallocate by 16 blocks (17-1) at once */
-	sbi->s_alloc_options.preallocsize = 17;
-	/* setup default block allocator options */
-	reiserfs_init_alloc_options(s);
-
-	spin_lock_init(&sbi->old_work_lock);
-	INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
-	mutex_init(&sbi->lock);
-	sbi->lock_depth = -1;
-
-	sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0,
-					 s->s_id);
-	if (!sbi->commit_wq) {
-		SWARN(silent, s, "", "Cannot allocate commit workqueue");
-		errval = -ENOMEM;
-		goto error_unlocked;
-	}
-
-	jdev_name = NULL;
-	if (reiserfs_parse_options
-	    (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name,
-	     &commit_max_age, qf_names, &qfmt) == 0) {
-		goto error_unlocked;
-	}
-	if (jdev_name && jdev_name[0]) {
-		sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
-		if (!sbi->s_jdev) {
-			SWARN(silent, s, "", "Cannot allocate memory for "
-				"journal device name");
-			goto error_unlocked;
-		}
-	}
-#ifdef CONFIG_QUOTA
-	handle_quota_files(s, qf_names, &qfmt);
-#endif
-
-	if (blocks) {
-		SWARN(silent, s, "jmacd-7", "resize option for remount only");
-		goto error_unlocked;
-	}
-
-	/*
-	 * try old format (undistributed bitmap, super block in 8-th 1k
-	 * block of a device)
-	 */
-	if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
-		old_format = 1;
-
-	/*
-	 * try new format (64-th 1k block), which can contain reiserfs
-	 * super block
-	 */
-	else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
-		SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
-		      s->s_id);
-		goto error_unlocked;
-	}
-
-	s->s_time_min = 0;
-	s->s_time_max = U32_MAX;
-
-	rs = SB_DISK_SUPER_BLOCK(s);
-	/*
-	 * Let's do basic sanity check to verify that underlying device is not
-	 * smaller than the filesystem. If the check fails then abort and
-	 * scream, because bad stuff will happen otherwise.
-	 */
-	if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
-		SWARN(silent, s, "", "Filesystem cannot be "
-		      "mounted because it is bigger than the device");
-		SWARN(silent, s, "", "You may need to run fsck "
-		      "or increase size of your LVM partition");
-		SWARN(silent, s, "", "Or may be you forgot to "
-		      "reboot after fdisk when it told you to");
-		goto error_unlocked;
-	}
-
-	sbi->s_mount_state = SB_REISERFS_STATE(s);
-	sbi->s_mount_state = REISERFS_VALID_FS;
-
-	if ((errval = reiserfs_init_bitmap_cache(s))) {
-		SWARN(silent, s, "jmacd-8", "unable to read bitmap");
-		goto error_unlocked;
-	}
-
-	errval = -EINVAL;
-#ifdef CONFIG_REISERFS_CHECK
-	SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
-	SWARN(silent, s, "", "- it is slow mode for debugging.");
-#endif
-
-	/* make data=ordered the default */
-	if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
-	    !reiserfs_data_writeback(s)) {
-		sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
-	}
-
-	if (reiserfs_data_log(s)) {
-		reiserfs_info(s, "using journaled data mode\n");
-	} else if (reiserfs_data_ordered(s)) {
-		reiserfs_info(s, "using ordered data mode\n");
-	} else {
-		reiserfs_info(s, "using writeback data mode\n");
-	}
-	if (reiserfs_barrier_flush(s)) {
-		printk("reiserfs: using flush barriers\n");
-	}
-
-	if (journal_init(s, jdev_name, old_format, commit_max_age)) {
-		SWARN(silent, s, "sh-2022",
-		      "unable to initialize journal space");
-		goto error_unlocked;
-	} else {
-		/*
-		 * once this is set, journal_release must be called
-		 * if we error out of the mount
-		 */
-		jinit_done = 1;
-	}
-
-	if (reread_meta_blocks(s)) {
-		SWARN(silent, s, "jmacd-9",
-		      "unable to reread meta blocks after journal init");
-		goto error_unlocked;
-	}
-
-	if (replay_only(s))
-		goto error_unlocked;
-
-	s->s_xattr = reiserfs_xattr_handlers;
-
-	if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
-		SWARN(silent, s, "clm-7000",
-		      "Detected readonly device, marking FS readonly");
-		s->s_flags |= SB_RDONLY;
-	}
-	args.objectid = REISERFS_ROOT_OBJECTID;
-	args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
-	root_inode =
-	    iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
-			 reiserfs_init_locked_inode, (void *)&args);
-	if (!root_inode) {
-		SWARN(silent, s, "jmacd-10", "get root inode failed");
-		goto error_unlocked;
-	}
-
-	/*
-	 * This path assumed to be called with the BKL in the old times.
-	 * Now we have inherited the big reiserfs lock from it and many
-	 * reiserfs helpers called in the mount path and elsewhere require
-	 * this lock to be held even if it's not always necessary. Let's be
-	 * conservative and hold it early. The window can be reduced after
-	 * careful review of the code.
-	 */
-	reiserfs_write_lock(s);
-
-	if (root_inode->i_state & I_NEW) {
-		reiserfs_read_locked_inode(root_inode, &args);
-		unlock_new_inode(root_inode);
-	}
-
-	if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) ||
-	    !root_inode->i_size) {
-		SWARN(silent, s, "", "corrupt root inode, run fsck");
-		iput(root_inode);
-		errval = -EUCLEAN;
-		goto error;
-	}
-
-	s->s_root = d_make_root(root_inode);
-	if (!s->s_root)
-		goto error;
-	/* define and initialize hash function */
-	sbi->s_hash_function = hash_function(s);
-	if (sbi->s_hash_function == NULL) {
-		dput(s->s_root);
-		s->s_root = NULL;
-		goto error;
-	}
-
-	if (is_reiserfs_3_5(rs)
-	    || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
-		set_bit(REISERFS_3_5, &sbi->s_properties);
-	else if (old_format)
-		set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties);
-	else
-		set_bit(REISERFS_3_6, &sbi->s_properties);
-
-	if (!sb_rdonly(s)) {
-
-		errval = journal_begin(&th, s, 1);
-		if (errval) {
-			dput(s->s_root);
-			s->s_root = NULL;
-			goto error;
-		}
-		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-
-		set_sb_umount_state(rs, REISERFS_ERROR_FS);
-		set_sb_fs_state(rs, 0);
-
-		/*
-		 * Clear out s_bmap_nr if it would wrap. We can handle this
-		 * case, but older revisions can't. This will cause the
-		 * file system to fail mount on those older implementations,
-		 * avoiding corruption. -jeffm
-		 */
-		if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
-		    sb_bmap_nr(rs) != 0) {
-			reiserfs_warning(s, "super-2030", "This file system "
-					"claims to use %u bitmap blocks in "
-					"its super block, but requires %u. "
-					"Clearing to zero.", sb_bmap_nr(rs),
-					reiserfs_bmap_count(s));
-
-			set_sb_bmap_nr(rs, 0);
-		}
-
-		if (old_format_only(s)) {
-			/*
-			 * filesystem of format 3.5 either with standard
-			 * or non-standard journal
-			 */
-			if (convert_reiserfs(s)) {
-				/* and -o conv is given */
-				if (!silent)
-					reiserfs_info(s,
-						      "converting 3.5 filesystem to the 3.6 format");
-
-				if (is_reiserfs_3_5(rs))
-					/*
-					 * put magic string of 3.6 format.
-					 * 2.2 will not be able to
-					 * mount this filesystem anymore
-					 */
-					memcpy(rs->s_v1.s_magic,
-					       reiserfs_3_6_magic_string,
-					       sizeof
-					       (reiserfs_3_6_magic_string));
-
-				set_sb_version(rs, REISERFS_VERSION_2);
-				reiserfs_convert_objectid_map_v1(s);
-				set_bit(REISERFS_3_6, &sbi->s_properties);
-				clear_bit(REISERFS_3_5, &sbi->s_properties);
-			} else if (!silent) {
-				reiserfs_info(s, "using 3.5.x disk format\n");
-			}
-		} else
-			set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
-
-
-		journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-		errval = journal_end(&th);
-		if (errval) {
-			dput(s->s_root);
-			s->s_root = NULL;
-			goto error;
-		}
-
-		reiserfs_write_unlock(s);
-		if ((errval = reiserfs_lookup_privroot(s)) ||
-		    (errval = reiserfs_xattr_init(s, s->s_flags))) {
-			dput(s->s_root);
-			s->s_root = NULL;
-			goto error_unlocked;
-		}
-		reiserfs_write_lock(s);
-
-		/*
-		 * look for files which were to be removed in previous session
-		 */
-		finish_unfinished(s);
-	} else {
-		if (old_format_only(s) && !silent) {
-			reiserfs_info(s, "using 3.5.x disk format\n");
-		}
-
-		reiserfs_write_unlock(s);
-		if ((errval = reiserfs_lookup_privroot(s)) ||
-		    (errval = reiserfs_xattr_init(s, s->s_flags))) {
-			dput(s->s_root);
-			s->s_root = NULL;
-			goto error_unlocked;
-		}
-		reiserfs_write_lock(s);
-	}
-	/*
-	 * mark hash in super block: it could be unset. overwrite should be ok
-	 */
-	set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
-
-	handle_attrs(s);
-
-	reiserfs_proc_info_init(s);
-
-	init_waitqueue_head(&(sbi->s_wait));
-	spin_lock_init(&sbi->bitmap_lock);
-
-	reiserfs_write_unlock(s);
-
-	return (0);
-
-error:
-	reiserfs_write_unlock(s);
-
-error_unlocked:
-	/* kill the commit thread, free journal ram */
-	if (jinit_done) {
-		reiserfs_write_lock(s);
-		journal_release_error(NULL, s);
-		reiserfs_write_unlock(s);
-	}
-
-	if (sbi->commit_wq)
-		destroy_workqueue(sbi->commit_wq);
-
-	reiserfs_cancel_old_flush(s);
-
-	reiserfs_free_bitmap_cache(s);
-	if (SB_BUFFER_WITH_SB(s))
-		brelse(SB_BUFFER_WITH_SB(s));
-#ifdef CONFIG_QUOTA
-	{
-		int j;
-		for (j = 0; j < REISERFS_MAXQUOTAS; j++)
-			kfree(qf_names[j]);
-	}
-#endif
-	kfree(sbi->s_jdev);
-	kfree(sbi);
-
-	s->s_fs_info = NULL;
-	return errval;
-}
-
-static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
-
-	buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
-	buf->f_bfree = sb_free_blocks(rs);
-	buf->f_bavail = buf->f_bfree;
-	buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
-	buf->f_bsize = dentry->d_sb->s_blocksize;
-	/* changed to accommodate gcc folks. */
-	buf->f_type = REISERFS_SUPER_MAGIC;
-	buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
-	buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
-				sizeof(rs->s_uuid)/2);
-
-	return 0;
-}
-
-#ifdef CONFIG_QUOTA
-static int reiserfs_write_dquot(struct dquot *dquot)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-	int depth;
-
-	reiserfs_write_lock(dquot->dq_sb);
-	ret =
-	    journal_begin(&th, dquot->dq_sb,
-			  REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
-	if (ret)
-		goto out;
-	depth = reiserfs_write_unlock_nested(dquot->dq_sb);
-	ret = dquot_commit(dquot);
-	reiserfs_write_lock_nested(dquot->dq_sb, depth);
-	err = journal_end(&th);
-	if (!ret && err)
-		ret = err;
-out:
-	reiserfs_write_unlock(dquot->dq_sb);
-	return ret;
-}
-
-static int reiserfs_acquire_dquot(struct dquot *dquot)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-	int depth;
-
-	reiserfs_write_lock(dquot->dq_sb);
-	ret =
-	    journal_begin(&th, dquot->dq_sb,
-			  REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
-	if (ret)
-		goto out;
-	depth = reiserfs_write_unlock_nested(dquot->dq_sb);
-	ret = dquot_acquire(dquot);
-	reiserfs_write_lock_nested(dquot->dq_sb, depth);
-	err = journal_end(&th);
-	if (!ret && err)
-		ret = err;
-out:
-	reiserfs_write_unlock(dquot->dq_sb);
-	return ret;
-}
-
-static int reiserfs_release_dquot(struct dquot *dquot)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-
-	reiserfs_write_lock(dquot->dq_sb);
-	ret =
-	    journal_begin(&th, dquot->dq_sb,
-			  REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
-	reiserfs_write_unlock(dquot->dq_sb);
-	if (ret) {
-		/* Release dquot anyway to avoid endless cycle in dqput() */
-		dquot_release(dquot);
-		goto out;
-	}
-	ret = dquot_release(dquot);
-	reiserfs_write_lock(dquot->dq_sb);
-	err = journal_end(&th);
-	if (!ret && err)
-		ret = err;
-	reiserfs_write_unlock(dquot->dq_sb);
-out:
-	return ret;
-}
-
-static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
-{
-	/* Are we journaling quotas? */
-	if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
-	    REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
-		dquot_mark_dquot_dirty(dquot);
-		return reiserfs_write_dquot(dquot);
-	} else
-		return dquot_mark_dquot_dirty(dquot);
-}
-
-static int reiserfs_write_info(struct super_block *sb, int type)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-	int depth;
-
-	/* Data block + inode block */
-	reiserfs_write_lock(sb);
-	ret = journal_begin(&th, sb, 2);
-	if (ret)
-		goto out;
-	depth = reiserfs_write_unlock_nested(sb);
-	ret = dquot_commit_info(sb, type);
-	reiserfs_write_lock_nested(sb, depth);
-	err = journal_end(&th);
-	if (!ret && err)
-		ret = err;
-out:
-	reiserfs_write_unlock(sb);
-	return ret;
-}
-
-/*
- * Turn on quotas during mount time - we need to find the quota file and such...
- */
-static int reiserfs_quota_on_mount(struct super_block *sb, int type)
-{
-	return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
-					REISERFS_SB(sb)->s_jquota_fmt, type);
-}
-
-/*
- * Standard function to be called on quota_on
- */
-static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     const struct path *path)
-{
-	int err;
-	struct inode *inode;
-	struct reiserfs_transaction_handle th;
-	int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
-
-	reiserfs_write_lock(sb);
-	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	/* Quotafile not on the same filesystem? */
-	if (path->dentry->d_sb != sb) {
-		err = -EXDEV;
-		goto out;
-	}
-	inode = d_inode(path->dentry);
-	/*
-	 * We must not pack tails for quota files on reiserfs for quota
-	 * IO to work
-	 */
-	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
-		err = reiserfs_unpack(inode);
-		if (err) {
-			reiserfs_warning(sb, "super-6520",
-				"Unpacking tail of quota file failed"
-				" (%d). Cannot turn on quotas.", err);
-			err = -EINVAL;
-			goto out;
-		}
-		mark_inode_dirty(inode);
-	}
-	/* Journaling quota? */
-	if (REISERFS_SB(sb)->s_qf_names[type]) {
-		/* Quotafile not of fs root? */
-		if (path->dentry->d_parent != sb->s_root)
-			reiserfs_warning(sb, "super-6521",
-				 "Quota file not on filesystem root. "
-				 "Journalled quota will not work.");
-	}
-
-	/*
-	 * When we journal data on quota file, we have to flush journal to see
-	 * all updates to the file when we bypass pagecache...
-	 */
-	if (reiserfs_file_data_log(inode)) {
-		/* Just start temporary transaction and finish it */
-		err = journal_begin(&th, sb, 1);
-		if (err)
-			goto out;
-		err = journal_end_sync(&th);
-		if (err)
-			goto out;
-	}
-	reiserfs_write_unlock(sb);
-	err = dquot_quota_on(sb, type, format_id, path);
-	if (!err) {
-		inode_lock(inode);
-		REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL |
-					      REISERFS_NOATIME_FL;
-		inode_set_flags(inode, S_IMMUTABLE | S_NOATIME,
-				S_IMMUTABLE | S_NOATIME);
-		inode_unlock(inode);
-		mark_inode_dirty(inode);
-	}
-	return err;
-out:
-	reiserfs_write_unlock(sb);
-	return err;
-}
-
-static int reiserfs_quota_off(struct super_block *sb, int type)
-{
-	int err;
-	struct inode *inode = sb_dqopt(sb)->files[type];
-
-	if (!inode || !igrab(inode))
-		goto out;
-
-	err = dquot_quota_off(sb, type);
-	if (err)
-		goto out_put;
-
-	inode_lock(inode);
-	REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL |
-					REISERFS_NOATIME_FL);
-	inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME);
-	inode_unlock(inode);
-	mark_inode_dirty(inode);
-out_put:
-	iput(inode);
-	return err;
-out:
-	return dquot_quota_off(sb, type);
-}
-
-/*
- * Read data from quotafile - avoid pagecache and such because we cannot afford
- * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and no one else should touch the files)
- * we don't have to be afraid of races
- */
-static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
-				   size_t len, loff_t off)
-{
-	struct inode *inode = sb_dqopt(sb)->files[type];
-	unsigned long blk = off >> sb->s_blocksize_bits;
-	int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
-	size_t toread;
-	struct buffer_head tmp_bh, *bh;
-	loff_t i_size = i_size_read(inode);
-
-	if (off > i_size)
-		return 0;
-	if (off + len > i_size)
-		len = i_size - off;
-	toread = len;
-	while (toread > 0) {
-		tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
-		tmp_bh.b_state = 0;
-		/*
-		 * Quota files are without tails so we can safely
-		 * use this function
-		 */
-		reiserfs_write_lock(sb);
-		err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
-		reiserfs_write_unlock(sb);
-		if (err)
-			return err;
-		if (!buffer_mapped(&tmp_bh))	/* A hole? */
-			memset(data, 0, tocopy);
-		else {
-			bh = sb_bread(sb, tmp_bh.b_blocknr);
-			if (!bh)
-				return -EIO;
-			memcpy(data, bh->b_data + offset, tocopy);
-			brelse(bh);
-		}
-		offset = 0;
-		toread -= tocopy;
-		data += tocopy;
-		blk++;
-	}
-	return len;
-}
-
-/*
- * Write to quotafile (we know the transaction is already started and has
- * enough credits)
- */
-static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
-				    const char *data, size_t len, loff_t off)
-{
-	struct inode *inode = sb_dqopt(sb)->files[type];
-	unsigned long blk = off >> sb->s_blocksize_bits;
-	int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
-	int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
-	size_t towrite = len;
-	struct buffer_head tmp_bh, *bh;
-
-	if (!current->journal_info) {
-		printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
-			(unsigned long long)off, (unsigned long long)len);
-		return -EIO;
-	}
-	while (towrite > 0) {
-		tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite);
-		tmp_bh.b_state = 0;
-		reiserfs_write_lock(sb);
-		err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
-		reiserfs_write_unlock(sb);
-		if (err)
-			goto out;
-		if (offset || tocopy != sb->s_blocksize)
-			bh = sb_bread(sb, tmp_bh.b_blocknr);
-		else
-			bh = sb_getblk(sb, tmp_bh.b_blocknr);
-		if (!bh) {
-			err = -EIO;
-			goto out;
-		}
-		lock_buffer(bh);
-		memcpy(bh->b_data + offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-		reiserfs_write_lock(sb);
-		reiserfs_prepare_for_journal(sb, bh, 1);
-		journal_mark_dirty(current->journal_info, bh);
-		if (!journal_quota)
-			reiserfs_add_ordered_list(inode, bh);
-		reiserfs_write_unlock(sb);
-		brelse(bh);
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
-	}
-out:
-	if (len == towrite)
-		return err;
-	if (inode->i_size < off + len - towrite)
-		i_size_write(inode, off + len - towrite);
-	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
-	mark_inode_dirty(inode);
-	return len - towrite;
-}
-
-#endif
-
-static struct dentry *get_super_block(struct file_system_type *fs_type,
-			   int flags, const char *dev_name,
-			   void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
-}
-
-static int __init init_reiserfs_fs(void)
-{
-	int ret;
-
-	ret = init_inodecache();
-	if (ret)
-		return ret;
-
-	reiserfs_proc_info_global_init();
-
-	ret = register_filesystem(&reiserfs_fs_type);
-	if (ret)
-		goto out;
-
-	return 0;
-out:
-	reiserfs_proc_info_global_done();
-	destroy_inodecache();
-
-	return ret;
-}
-
-static void __exit exit_reiserfs_fs(void)
-{
-	reiserfs_proc_info_global_done();
-	unregister_filesystem(&reiserfs_fs_type);
-	destroy_inodecache();
-}
-
-struct file_system_type reiserfs_fs_type = {
-	.owner = THIS_MODULE,
-	.name = "reiserfs",
-	.mount = get_super_block,
-	.kill_sb = reiserfs_kill_sb,
-	.fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("reiserfs");
-
-MODULE_DESCRIPTION("ReiserFS journaled filesystem");
-MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
-MODULE_LICENSE("GPL");
-
-module_init(init_reiserfs_fs);
-module_exit(exit_reiserfs_fs);
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
deleted file mode 100644
index 2cec61af2a9e..000000000000
--- a/fs/reiserfs/tail_conversion.c
+++ /dev/null
@@ -1,318 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
- * details
- */
-
-#include <linux/time.h>
-#include <linux/pagemap.h>
-#include <linux/buffer_head.h>
-#include "reiserfs.h"
-
-/*
- * access to tail : when one is going to read tail it must make sure, that is
- * not running.  direct2indirect and indirect2direct can not run concurrently
- */
-
-/*
- * Converts direct items to an unformatted node. Panics if file has no
- * tail. -ENOSPC if no disk space for conversion
- */
-/*
- * path points to first direct item of the file regardless of how many of
- * them are there
- */
-int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
-		    struct treepath *path, struct buffer_head *unbh,
-		    loff_t tail_offset)
-{
-	struct super_block *sb = inode->i_sb;
-	struct buffer_head *up_to_date_bh;
-	struct item_head *p_le_ih = tp_item_head(path);
-	unsigned long total_tail = 0;
-
-	/* Key to search for the last byte of the converted item. */
-	struct cpu_key end_key;
-
-	/*
-	 * new indirect item to be inserted or key
-	 * of unfm pointer to be pasted
-	 */
-	struct item_head ind_ih;
-	int blk_size;
-	/* returned value for reiserfs_insert_item and clones */
-	int  retval;
-	/* Handle on an unformatted node that will be inserted in the tree. */
-	unp_t unfm_ptr;
-
-	BUG_ON(!th->t_trans_id);
-
-	REISERFS_SB(sb)->s_direct2indirect++;
-
-	blk_size = sb->s_blocksize;
-
-	/*
-	 * and key to search for append or insert pointer to the new
-	 * unformatted node.
-	 */
-	copy_item_head(&ind_ih, p_le_ih);
-	set_le_ih_k_offset(&ind_ih, tail_offset);
-	set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
-
-	/* Set the key to search for the place for new unfm pointer */
-	make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
-
-	/* FIXME: we could avoid this */
-	if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
-		reiserfs_error(sb, "PAP-14030",
-			       "pasted or inserted byte exists in "
-			       "the tree %K. Use fsck to repair.", &end_key);
-		pathrelse(path);
-		return -EIO;
-	}
-
-	p_le_ih = tp_item_head(path);
-
-	unfm_ptr = cpu_to_le32(unbh->b_blocknr);
-
-	if (is_statdata_le_ih(p_le_ih)) {
-		/* Insert new indirect item. */
-		set_ih_free_space(&ind_ih, 0);	/* delete at nearest future */
-		put_ih_item_len(&ind_ih, UNFM_P_SIZE);
-		PATH_LAST_POSITION(path)++;
-		retval =
-		    reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
-					 (char *)&unfm_ptr);
-	} else {
-		/* Paste into last indirect item of an object. */
-		retval = reiserfs_paste_into_item(th, path, &end_key, inode,
-						    (char *)&unfm_ptr,
-						    UNFM_P_SIZE);
-	}
-	if (retval) {
-		return retval;
-	}
-	/*
-	 * note: from here there are two keys which have matching first
-	 *  three key components. They only differ by the fourth one.
-	 */
-
-	/* Set the key to search for the direct items of the file */
-	make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
-		     4);
-
-	/*
-	 * Move bytes from the direct items to the new unformatted node
-	 * and delete them.
-	 */
-	while (1) {
-		int tail_size;
-
-		/*
-		 * end_key.k_offset is set so, that we will always have found
-		 * last item of the file
-		 */
-		if (search_for_position_by_key(sb, &end_key, path) ==
-		    POSITION_FOUND)
-			reiserfs_panic(sb, "PAP-14050",
-				       "direct item (%K) not found", &end_key);
-		p_le_ih = tp_item_head(path);
-		RFALSE(!is_direct_le_ih(p_le_ih),
-		       "vs-14055: direct item expected(%K), found %h",
-		       &end_key, p_le_ih);
-		tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
-		    + ih_item_len(p_le_ih) - 1;
-
-		/*
-		 * we only send the unbh pointer if the buffer is not
-		 * up to date.  this avoids overwriting good data from
-		 * writepage() with old data from the disk or buffer cache
-		 * Special case: unbh->b_page will be NULL if we are coming
-		 * through DIRECT_IO handler here.
-		 */
-		if (!unbh->b_page || buffer_uptodate(unbh)
-		    || PageUptodate(unbh->b_page)) {
-			up_to_date_bh = NULL;
-		} else {
-			up_to_date_bh = unbh;
-		}
-		retval = reiserfs_delete_item(th, path, &end_key, inode,
-						up_to_date_bh);
-
-		total_tail += retval;
-
-		/* done: file does not have direct items anymore */
-		if (tail_size == retval)
-			break;
-
-	}
-	/*
-	 * if we've copied bytes from disk into the page, we need to zero
-	 * out the unused part of the block (it was not up to date before)
-	 */
-	if (up_to_date_bh) {
-		unsigned pgoff =
-		    (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);
-		char *kaddr = kmap_atomic(up_to_date_bh->b_page);
-		memset(kaddr + pgoff, 0, blk_size - total_tail);
-		kunmap_atomic(kaddr);
-	}
-
-	REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
-
-	return 0;
-}
-
-/* stolen from fs/buffer.c */
-void reiserfs_unmap_buffer(struct buffer_head *bh)
-{
-	lock_buffer(bh);
-	if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
-		BUG();
-	}
-	clear_buffer_dirty(bh);
-	/*
-	 * Remove the buffer from whatever list it belongs to. We are mostly
-	 * interested in removing it from per-sb j_dirty_buffers list, to avoid
-	 * BUG() on attempt to write not mapped buffer
-	 */
-	if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
-		struct inode *inode = bh->b_folio->mapping->host;
-		struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
-		spin_lock(&j->j_dirty_buffers_lock);
-		list_del_init(&bh->b_assoc_buffers);
-		reiserfs_free_jh(bh);
-		spin_unlock(&j->j_dirty_buffers_lock);
-	}
-	clear_buffer_mapped(bh);
-	clear_buffer_req(bh);
-	clear_buffer_new(bh);
-	bh->b_bdev = NULL;
-	unlock_buffer(bh);
-}
-
-/*
- * this first locks inode (neither reads nor sync are permitted),
- * reads tail through page cache, insert direct item. When direct item
- * inserted successfully inode is left locked. Return value is always
- * what we expect from it (number of cut bytes). But when tail remains
- * in the unformatted node, we set mode to SKIP_BALANCING and unlock
- * inode
- */
-int indirect2direct(struct reiserfs_transaction_handle *th,
-		    struct inode *inode, struct page *page,
-		    struct treepath *path,	/* path to the indirect item. */
-		    const struct cpu_key *item_key,	/* Key to look for
-							 * unformatted node
-							 * pointer to be cut. */
-		    loff_t n_new_file_size,	/* New file size. */
-		    char *mode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct item_head s_ih;
-	unsigned long block_size = sb->s_blocksize;
-	char *tail;
-	int tail_len, round_tail_len;
-	loff_t pos, pos1;	/* position of first byte of the tail */
-	struct cpu_key key;
-
-	BUG_ON(!th->t_trans_id);
-
-	REISERFS_SB(sb)->s_indirect2direct++;
-
-	*mode = M_SKIP_BALANCING;
-
-	/* store item head path points to. */
-	copy_item_head(&s_ih, tp_item_head(path));
-
-	tail_len = (n_new_file_size & (block_size - 1));
-	if (get_inode_sd_version(inode) == STAT_DATA_V2)
-		round_tail_len = ROUND_UP(tail_len);
-	else
-		round_tail_len = tail_len;
-
-	pos =
-	    le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
-					 1) * sb->s_blocksize;
-	pos1 = pos;
-
-	/*
-	 * we are protected by i_mutex. The tail can not disapper, not
-	 * append can be done either
-	 * we are in truncate or packing tail in file_release
-	 */
-
-	tail = (char *)kmap(page);	/* this can schedule */
-
-	if (path_changed(&s_ih, path)) {
-		/* re-search indirect item */
-		if (search_for_position_by_key(sb, item_key, path)
-		    == POSITION_NOT_FOUND)
-			reiserfs_panic(sb, "PAP-5520",
-				       "item to be converted %K does not exist",
-				       item_key);
-		copy_item_head(&s_ih, tp_item_head(path));
-#ifdef CONFIG_REISERFS_CHECK
-		pos = le_ih_k_offset(&s_ih) - 1 +
-		    (ih_item_len(&s_ih) / UNFM_P_SIZE -
-		     1) * sb->s_blocksize;
-		if (pos != pos1)
-			reiserfs_panic(sb, "vs-5530", "tail position "
-				       "changed while we were reading it");
-#endif
-	}
-
-	/* Set direct item header to insert. */
-	make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode),
-			  pos1 + 1, TYPE_DIRECT, round_tail_len,
-			  0xffff /*ih_free_space */ );
-
-	/*
-	 * we want a pointer to the first byte of the tail in the page.
-	 * the page was locked and this part of the page was up to date when
-	 * indirect2direct was called, so we know the bytes are still valid
-	 */
-	tail = tail + (pos & (PAGE_SIZE - 1));
-
-	PATH_LAST_POSITION(path)++;
-
-	key = *item_key;
-	set_cpu_key_k_type(&key, TYPE_DIRECT);
-	key.key_length = 4;
-	/* Insert tail as new direct item in the tree */
-	if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
-				 tail ? tail : NULL) < 0) {
-		/*
-		 * No disk memory. So we can not convert last unformatted node
-		 * to the direct item.  In this case we used to adjust
-		 * indirect items's ih_free_space. Now ih_free_space is not
-		 * used, it would be ideal to write zeros to corresponding
-		 * unformatted node. For now i_size is considered as guard for
-		 * going out of file size
-		 */
-		kunmap(page);
-		return block_size - round_tail_len;
-	}
-	kunmap(page);
-
-	/* make sure to get the i_blocks changes from reiserfs_insert_item */
-	reiserfs_update_sd(th, inode);
-
-	/*
-	 * note: we have now the same as in above direct2indirect
-	 * conversion: there are two keys which have matching first three
-	 * key components. They only differ by the fourth one.
-	 */
-
-	/*
-	 * We have inserted new direct item and must remove last
-	 * unformatted node.
-	 */
-	*mode = M_CUT;
-
-	/* we store position of first direct item in the in-core inode */
-	/* mark_file_with_tail (inode, pos1 + 1); */
-	REISERFS_I(inode)->i_first_direct_byte = pos1 + 1;
-
-	return block_size - round_tail_len;
-}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
deleted file mode 100644
index 998035a6388e..000000000000
--- a/fs/reiserfs/xattr.c
+++ /dev/null
@@ -1,1039 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/reiserfs/xattr.c
- *
- * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com>
- *
- */
-
-/*
- * In order to implement EA/ACLs in a clean, backwards compatible manner,
- * they are implemented as files in a "private" directory.
- * Each EA is in it's own file, with the directory layout like so (/ is assumed
- * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory,
- * directories named using the capital-hex form of the objectid and
- * generation number are used. Inside each directory are individual files
- * named with the name of the extended attribute.
- *
- * So, for objectid 12648430, we could have:
- * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access
- * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default
- * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type
- * .. or similar.
- *
- * The file contents are the text of the EA. The size is known based on the
- * stat data describing the file.
- *
- * In the case of system.posix_acl_access and system.posix_acl_default, since
- * these are special cases for filesystem ACLs, they are interpreted by the
- * kernel, in addition, they are negatively and positively cached and attached
- * to the inode so that unnecessary lookups are avoided.
- *
- * Locking works like so:
- * Directory components (xattr root, xattr dir) are protectd by their i_mutex.
- * The xattrs themselves are protected by the xattr_sem.
- */
-
-#include "reiserfs.h"
-#include <linux/capability.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include "acl.h"
-#include <linux/uaccess.h>
-#include <net/checksum.h>
-#include <linux/stat.h>
-#include <linux/quotaops.h>
-#include <linux/security.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-#define PRIVROOT_NAME ".reiserfs_priv"
-#define XAROOT_NAME   "xattrs"
-
-
-/*
- * Helpers for inode ops. We do this so that we don't have all the VFS
- * overhead and also for proper i_mutex annotation.
- * dir->i_mutex must be held for all of them.
- */
-#ifdef CONFIG_REISERFS_FS_XATTR
-static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
-{
-	BUG_ON(!inode_is_locked(dir));
-	return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true);
-}
-#endif
-
-static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	BUG_ON(!inode_is_locked(dir));
-	return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode);
-}
-
-/*
- * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
- * mutation ops aren't called during rename or splace, which are the
- * only other users of I_MUTEX_CHILD. It violates the ordering, but that's
- * better than allocating another subclass just for this code.
- */
-static int xattr_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int error;
-
-	BUG_ON(!inode_is_locked(dir));
-
-	inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
-	error = dir->i_op->unlink(dir, dentry);
-	inode_unlock(d_inode(dentry));
-
-	if (!error)
-		d_delete(dentry);
-	return error;
-}
-
-static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	int error;
-
-	BUG_ON(!inode_is_locked(dir));
-
-	inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
-	error = dir->i_op->rmdir(dir, dentry);
-	if (!error)
-		d_inode(dentry)->i_flags |= S_DEAD;
-	inode_unlock(d_inode(dentry));
-	if (!error)
-		d_delete(dentry);
-
-	return error;
-}
-
-#define xattr_may_create(flags)	(!flags || flags & XATTR_CREATE)
-
-static struct dentry *open_xa_root(struct super_block *sb, int flags)
-{
-	struct dentry *privroot = REISERFS_SB(sb)->priv_root;
-	struct dentry *xaroot;
-
-	if (d_really_is_negative(privroot))
-		return ERR_PTR(-EOPNOTSUPP);
-
-	inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
-
-	xaroot = dget(REISERFS_SB(sb)->xattr_root);
-	if (!xaroot)
-		xaroot = ERR_PTR(-EOPNOTSUPP);
-	else if (d_really_is_negative(xaroot)) {
-		int err = -ENODATA;
-
-		if (xattr_may_create(flags))
-			err = xattr_mkdir(d_inode(privroot), xaroot, 0700);
-		if (err) {
-			dput(xaroot);
-			xaroot = ERR_PTR(err);
-		}
-	}
-
-	inode_unlock(d_inode(privroot));
-	return xaroot;
-}
-
-static struct dentry *open_xa_dir(const struct inode *inode, int flags)
-{
-	struct dentry *xaroot, *xadir;
-	char namebuf[17];
-
-	xaroot = open_xa_root(inode->i_sb, flags);
-	if (IS_ERR(xaroot))
-		return xaroot;
-
-	snprintf(namebuf, sizeof(namebuf), "%X.%X",
-		 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
-		 inode->i_generation);
-
-	inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
-
-	xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
-	if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
-		int err = -ENODATA;
-
-		if (xattr_may_create(flags))
-			err = xattr_mkdir(d_inode(xaroot), xadir, 0700);
-		if (err) {
-			dput(xadir);
-			xadir = ERR_PTR(err);
-		}
-	}
-
-	inode_unlock(d_inode(xaroot));
-	dput(xaroot);
-	return xadir;
-}
-
-/*
- * The following are side effects of other operations that aren't explicitly
- * modifying extended attributes. This includes operations such as permissions
- * or ownership changes, object deletions, etc.
- */
-struct reiserfs_dentry_buf {
-	struct dir_context ctx;
-	struct dentry *xadir;
-	int count;
-	int err;
-	struct dentry *dentries[8];
-};
-
-static bool
-fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
-		   loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct reiserfs_dentry_buf *dbuf =
-		container_of(ctx, struct reiserfs_dentry_buf, ctx);
-	struct dentry *dentry;
-
-	WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
-
-	if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
-		return false;
-
-	if (name[0] == '.' && (namelen < 2 ||
-			       (namelen == 2 && name[1] == '.')))
-		return true;
-
-	dentry = lookup_one_len(name, dbuf->xadir, namelen);
-	if (IS_ERR(dentry)) {
-		dbuf->err = PTR_ERR(dentry);
-		return false;
-	} else if (d_really_is_negative(dentry)) {
-		/* A directory entry exists, but no file? */
-		reiserfs_error(dentry->d_sb, "xattr-20003",
-			       "Corrupted directory: xattr %pd listed but "
-			       "not found for file %pd.\n",
-			       dentry, dbuf->xadir);
-		dput(dentry);
-		dbuf->err = -EIO;
-		return false;
-	}
-
-	dbuf->dentries[dbuf->count++] = dentry;
-	return true;
-}
-
-static void
-cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
-{
-	int i;
-
-	for (i = 0; i < buf->count; i++)
-		if (buf->dentries[i])
-			dput(buf->dentries[i]);
-}
-
-static int reiserfs_for_each_xattr(struct inode *inode,
-				   int (*action)(struct dentry *, void *),
-				   void *data)
-{
-	struct dentry *dir;
-	int i, err = 0;
-	struct reiserfs_dentry_buf buf = {
-		.ctx.actor = fill_with_dentries,
-	};
-
-	/* Skip out, an xattr has no xattrs associated with it */
-	if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
-		return 0;
-
-	dir = open_xa_dir(inode, XATTR_REPLACE);
-	if (IS_ERR(dir)) {
-		err = PTR_ERR(dir);
-		goto out;
-	} else if (d_really_is_negative(dir)) {
-		err = 0;
-		goto out_dir;
-	}
-
-	inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
-
-	buf.xadir = dir;
-	while (1) {
-		err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
-		if (err)
-			break;
-		if (buf.err) {
-			err = buf.err;
-			break;
-		}
-		if (!buf.count)
-			break;
-		for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
-			struct dentry *dentry = buf.dentries[i];
-
-			if (!d_is_dir(dentry))
-				err = action(dentry, data);
-
-			dput(dentry);
-			buf.dentries[i] = NULL;
-		}
-		if (err)
-			break;
-		buf.count = 0;
-	}
-	inode_unlock(d_inode(dir));
-
-	cleanup_dentry_buf(&buf);
-
-	if (!err) {
-		/*
-		 * We start a transaction here to avoid a ABBA situation
-		 * between the xattr root's i_mutex and the journal lock.
-		 * This doesn't incur much additional overhead since the
-		 * new transaction will just nest inside the
-		 * outer transaction.
-		 */
-		int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
-			     4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
-		struct reiserfs_transaction_handle th;
-
-		reiserfs_write_lock(inode->i_sb);
-		err = journal_begin(&th, inode->i_sb, blocks);
-		reiserfs_write_unlock(inode->i_sb);
-		if (!err) {
-			int jerror;
-
-			inode_lock_nested(d_inode(dir->d_parent),
-					  I_MUTEX_XATTR);
-			err = action(dir, data);
-			reiserfs_write_lock(inode->i_sb);
-			jerror = journal_end(&th);
-			reiserfs_write_unlock(inode->i_sb);
-			inode_unlock(d_inode(dir->d_parent));
-			err = jerror ?: err;
-		}
-	}
-out_dir:
-	dput(dir);
-out:
-	/*
-	 * -ENODATA: this object doesn't have any xattrs
-	 * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk.
-	 * Neither are errors
-	 */
-	if (err == -ENODATA || err == -EOPNOTSUPP)
-		err = 0;
-	return err;
-}
-
-static int delete_one_xattr(struct dentry *dentry, void *data)
-{
-	struct inode *dir = d_inode(dentry->d_parent);
-
-	/* This is the xattr dir, handle specially. */
-	if (d_is_dir(dentry))
-		return xattr_rmdir(dir, dentry);
-
-	return xattr_unlink(dir, dentry);
-}
-
-static int chown_one_xattr(struct dentry *dentry, void *data)
-{
-	struct iattr *attrs = data;
-	int ia_valid = attrs->ia_valid;
-	int err;
-
-	/*
-	 * We only want the ownership bits. Otherwise, we'll do
-	 * things like change a directory to a regular file if
-	 * ATTR_MODE is set.
-	 */
-	attrs->ia_valid &= (ATTR_UID|ATTR_GID);
-	err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs);
-	attrs->ia_valid = ia_valid;
-
-	return err;
-}
-
-/* No i_mutex, but the inode is unconnected. */
-int reiserfs_delete_xattrs(struct inode *inode)
-{
-	int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
-
-	if (err)
-		reiserfs_warning(inode->i_sb, "jdm-20004",
-				 "Couldn't delete all xattrs (%d)\n", err);
-	return err;
-}
-
-/* inode->i_mutex: down */
-int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
-{
-	int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
-
-	if (err)
-		reiserfs_warning(inode->i_sb, "jdm-20007",
-				 "Couldn't chown all xattrs (%d)\n", err);
-	return err;
-}
-
-#ifdef CONFIG_REISERFS_FS_XATTR
-/*
- * Returns a dentry corresponding to a specific extended attribute file
- * for the inode. If flags allow, the file is created. Otherwise, a
- * valid or negative dentry, or an error is returned.
- */
-static struct dentry *xattr_lookup(struct inode *inode, const char *name,
-				    int flags)
-{
-	struct dentry *xadir, *xafile;
-	int err = 0;
-
-	xadir = open_xa_dir(inode, flags);
-	if (IS_ERR(xadir))
-		return ERR_CAST(xadir);
-
-	inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
-	xafile = lookup_one_len(name, xadir, strlen(name));
-	if (IS_ERR(xafile)) {
-		err = PTR_ERR(xafile);
-		goto out;
-	}
-
-	if (d_really_is_positive(xafile) && (flags & XATTR_CREATE))
-		err = -EEXIST;
-
-	if (d_really_is_negative(xafile)) {
-		err = -ENODATA;
-		if (xattr_may_create(flags))
-			err = xattr_create(d_inode(xadir), xafile,
-					      0700|S_IFREG);
-	}
-
-	if (err)
-		dput(xafile);
-out:
-	inode_unlock(d_inode(xadir));
-	dput(xadir);
-	if (err)
-		return ERR_PTR(err);
-	return xafile;
-}
-
-/* Internal operations on file data */
-static inline void reiserfs_put_page(struct page *page)
-{
-	kunmap(page);
-	put_page(page);
-}
-
-static struct page *reiserfs_get_page(struct inode *dir, size_t n)
-{
-	struct address_space *mapping = dir->i_mapping;
-	struct page *page;
-	/*
-	 * We can deadlock if we try to free dentries,
-	 * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
-	 */
-	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
-	if (!IS_ERR(page))
-		kmap(page);
-	return page;
-}
-
-static inline __u32 xattr_hash(const char *msg, int len)
-{
-	/*
-	 * csum_partial() gives different results for little-endian and
-	 * big endian hosts. Images created on little-endian hosts and
-	 * mounted on big-endian hosts(and vice versa) will see csum mismatches
-	 * when trying to fetch xattrs. Treating the hash as __wsum_t would
-	 * lower the frequency of mismatch.  This is an endianness bug in
-	 * reiserfs.  The return statement would result in a sparse warning. Do
-	 * not fix the sparse warning so as to not hide a reminder of the bug.
-	 */
-	return csum_partial(msg, len, 0);
-}
-
-int reiserfs_commit_write(struct file *f, struct page *page,
-			  unsigned from, unsigned to);
-
-static void update_ctime(struct inode *inode)
-{
-	struct timespec64 now = current_time(inode);
-	struct timespec64 ctime = inode_get_ctime(inode);
-
-	if (inode_unhashed(inode) || !inode->i_nlink ||
-	    timespec64_equal(&ctime, &now))
-		return;
-
-	inode_set_ctime_to_ts(inode, now);
-	mark_inode_dirty(inode);
-}
-
-static int lookup_and_delete_xattr(struct inode *inode, const char *name)
-{
-	int err = 0;
-	struct dentry *dentry, *xadir;
-
-	xadir = open_xa_dir(inode, XATTR_REPLACE);
-	if (IS_ERR(xadir))
-		return PTR_ERR(xadir);
-
-	inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
-	dentry = lookup_one_len(name, xadir, strlen(name));
-	if (IS_ERR(dentry)) {
-		err = PTR_ERR(dentry);
-		goto out_dput;
-	}
-
-	if (d_really_is_positive(dentry)) {
-		err = xattr_unlink(d_inode(xadir), dentry);
-		update_ctime(inode);
-	}
-
-	dput(dentry);
-out_dput:
-	inode_unlock(d_inode(xadir));
-	dput(xadir);
-	return err;
-}
-
-
-/* Generic extended attribute operations that can be used by xa plugins */
-
-/*
- * inode->i_mutex: down
- */
-int
-reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
-			  struct inode *inode, const char *name,
-			  const void *buffer, size_t buffer_size, int flags)
-{
-	int err = 0;
-	struct dentry *dentry;
-	struct page *page;
-	char *data;
-	size_t file_pos = 0;
-	size_t buffer_pos = 0;
-	size_t new_size;
-	__u32 xahash = 0;
-
-	if (get_inode_sd_version(inode) == STAT_DATA_V1)
-		return -EOPNOTSUPP;
-
-	if (!buffer) {
-		err = lookup_and_delete_xattr(inode, name);
-		return err;
-	}
-
-	dentry = xattr_lookup(inode, name, flags);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-
-	down_write(&REISERFS_I(inode)->i_xattr_sem);
-
-	xahash = xattr_hash(buffer, buffer_size);
-	while (buffer_pos < buffer_size || buffer_pos == 0) {
-		size_t chunk;
-		size_t skip = 0;
-		size_t page_offset = (file_pos & (PAGE_SIZE - 1));
-
-		if (buffer_size - buffer_pos > PAGE_SIZE)
-			chunk = PAGE_SIZE;
-		else
-			chunk = buffer_size - buffer_pos;
-
-		page = reiserfs_get_page(d_inode(dentry), file_pos);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto out_unlock;
-		}
-
-		lock_page(page);
-		data = page_address(page);
-
-		if (file_pos == 0) {
-			struct reiserfs_xattr_header *rxh;
-
-			skip = file_pos = sizeof(struct reiserfs_xattr_header);
-			if (chunk + skip > PAGE_SIZE)
-				chunk = PAGE_SIZE - skip;
-			rxh = (struct reiserfs_xattr_header *)data;
-			rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
-			rxh->h_hash = cpu_to_le32(xahash);
-		}
-
-		reiserfs_write_lock(inode->i_sb);
-		err = __reiserfs_write_begin(page, page_offset, chunk + skip);
-		if (!err) {
-			if (buffer)
-				memcpy(data + skip, buffer + buffer_pos, chunk);
-			err = reiserfs_commit_write(NULL, page, page_offset,
-						    page_offset + chunk +
-						    skip);
-		}
-		reiserfs_write_unlock(inode->i_sb);
-		unlock_page(page);
-		reiserfs_put_page(page);
-		buffer_pos += chunk;
-		file_pos += chunk;
-		skip = 0;
-		if (err || buffer_size == 0 || !buffer)
-			break;
-	}
-
-	new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
-	if (!err && new_size < i_size_read(d_inode(dentry))) {
-		struct iattr newattrs = {
-			.ia_ctime = current_time(inode),
-			.ia_size = new_size,
-			.ia_valid = ATTR_SIZE | ATTR_CTIME,
-		};
-
-		inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
-		inode_dio_wait(d_inode(dentry));
-
-		err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs);
-		inode_unlock(d_inode(dentry));
-	} else
-		update_ctime(inode);
-out_unlock:
-	up_write(&REISERFS_I(inode)->i_xattr_sem);
-	dput(dentry);
-	return err;
-}
-
-/* We need to start a transaction to maintain lock ordering */
-int reiserfs_xattr_set(struct inode *inode, const char *name,
-		       const void *buffer, size_t buffer_size, int flags)
-{
-
-	struct reiserfs_transaction_handle th;
-	int error, error2;
-	size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
-
-	/* Check before we start a transaction and then do nothing. */
-	if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
-		return -EOPNOTSUPP;
-
-	if (!(flags & XATTR_REPLACE))
-		jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
-
-	reiserfs_write_lock(inode->i_sb);
-	error = journal_begin(&th, inode->i_sb, jbegin_count);
-	reiserfs_write_unlock(inode->i_sb);
-	if (error) {
-		return error;
-	}
-
-	error = reiserfs_xattr_set_handle(&th, inode, name,
-					  buffer, buffer_size, flags);
-
-	reiserfs_write_lock(inode->i_sb);
-	error2 = journal_end(&th);
-	reiserfs_write_unlock(inode->i_sb);
-	if (error == 0)
-		error = error2;
-
-	return error;
-}
-
-/*
- * inode->i_mutex: down
- */
-int
-reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
-		   size_t buffer_size)
-{
-	ssize_t err = 0;
-	struct dentry *dentry;
-	size_t isize;
-	size_t file_pos = 0;
-	size_t buffer_pos = 0;
-	struct page *page;
-	__u32 hash = 0;
-
-	if (name == NULL)
-		return -EINVAL;
-
-	/*
-	 * We can't have xattrs attached to v1 items since they don't have
-	 * generation numbers
-	 */
-	if (get_inode_sd_version(inode) == STAT_DATA_V1)
-		return -EOPNOTSUPP;
-
-	/*
-	 * priv_root needn't be initialized during mount so allow initial
-	 * lookups to succeed.
-	 */
-	if (!REISERFS_SB(inode->i_sb)->priv_root)
-		return 0;
-
-	dentry = xattr_lookup(inode, name, XATTR_REPLACE);
-	if (IS_ERR(dentry)) {
-		err = PTR_ERR(dentry);
-		goto out;
-	}
-
-	down_read(&REISERFS_I(inode)->i_xattr_sem);
-
-	isize = i_size_read(d_inode(dentry));
-
-	/* Just return the size needed */
-	if (buffer == NULL) {
-		err = isize - sizeof(struct reiserfs_xattr_header);
-		goto out_unlock;
-	}
-
-	if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
-		err = -ERANGE;
-		goto out_unlock;
-	}
-
-	while (file_pos < isize) {
-		size_t chunk;
-		char *data;
-		size_t skip = 0;
-
-		if (isize - file_pos > PAGE_SIZE)
-			chunk = PAGE_SIZE;
-		else
-			chunk = isize - file_pos;
-
-		page = reiserfs_get_page(d_inode(dentry), file_pos);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto out_unlock;
-		}
-
-		lock_page(page);
-		data = page_address(page);
-		if (file_pos == 0) {
-			struct reiserfs_xattr_header *rxh =
-			    (struct reiserfs_xattr_header *)data;
-			skip = file_pos = sizeof(struct reiserfs_xattr_header);
-			chunk -= skip;
-			/* Magic doesn't match up.. */
-			if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
-				unlock_page(page);
-				reiserfs_put_page(page);
-				reiserfs_warning(inode->i_sb, "jdm-20001",
-						 "Invalid magic for xattr (%s) "
-						 "associated with %k", name,
-						 INODE_PKEY(inode));
-				err = -EIO;
-				goto out_unlock;
-			}
-			hash = le32_to_cpu(rxh->h_hash);
-		}
-		memcpy(buffer + buffer_pos, data + skip, chunk);
-		unlock_page(page);
-		reiserfs_put_page(page);
-		file_pos += chunk;
-		buffer_pos += chunk;
-		skip = 0;
-	}
-	err = isize - sizeof(struct reiserfs_xattr_header);
-
-	if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
-	    hash) {
-		reiserfs_warning(inode->i_sb, "jdm-20002",
-				 "Invalid hash for xattr (%s) associated "
-				 "with %k", name, INODE_PKEY(inode));
-		err = -EIO;
-	}
-
-out_unlock:
-	up_read(&REISERFS_I(inode)->i_xattr_sem);
-	dput(dentry);
-
-out:
-	return err;
-}
-
-/*
- * In order to implement different sets of xattr operations for each xattr
- * prefix with the generic xattr API, a filesystem should create a
- * null-terminated array of struct xattr_handler (one for each prefix) and
- * hang a pointer to it off of the s_xattr field of the superblock.
- *
- * The generic_fooxattr() functions will use this list to dispatch xattr
- * operations to the correct xattr_handler.
- */
-#define for_each_xattr_handler(handlers, handler)		\
-		for ((handler) = *(handlers)++;			\
-			(handler) != NULL;			\
-			(handler) = *(handlers)++)
-
-static inline bool reiserfs_posix_acl_list(const char *name,
-					   struct dentry *dentry)
-{
-	return (posix_acl_type(name) >= 0) &&
-	       IS_POSIXACL(d_backing_inode(dentry));
-}
-
-/* This is the implementation for the xattr plugin infrastructure */
-static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
-				       const char *name, struct dentry *dentry)
-{
-	if (handlers) {
-		const struct xattr_handler *xah = NULL;
-
-		for_each_xattr_handler(handlers, xah) {
-			const char *prefix = xattr_prefix(xah);
-
-			if (strncmp(prefix, name, strlen(prefix)))
-				continue;
-
-			if (!xattr_handler_can_list(xah, dentry))
-				return false;
-
-			return true;
-		}
-	}
-
-	return reiserfs_posix_acl_list(name, dentry);
-}
-
-struct listxattr_buf {
-	struct dir_context ctx;
-	size_t size;
-	size_t pos;
-	char *buf;
-	struct dentry *dentry;
-};
-
-static bool listxattr_filler(struct dir_context *ctx, const char *name,
-			    int namelen, loff_t offset, u64 ino,
-			    unsigned int d_type)
-{
-	struct listxattr_buf *b =
-		container_of(ctx, struct listxattr_buf, ctx);
-	size_t size;
-
-	if (name[0] != '.' ||
-	    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
-		if (!reiserfs_xattr_list(b->dentry->d_sb->s_xattr, name,
-					 b->dentry))
-			return true;
-		size = namelen + 1;
-		if (b->buf) {
-			if (b->pos + size > b->size) {
-				b->pos = -ERANGE;
-				return false;
-			}
-			memcpy(b->buf + b->pos, name, namelen);
-			b->buf[b->pos + namelen] = 0;
-		}
-		b->pos += size;
-	}
-	return true;
-}
-
-/*
- * Inode operation listxattr()
- *
- * We totally ignore the generic listxattr here because it would be stupid
- * not to. Since the xattrs are organized in a directory, we can just
- * readdir to find them.
- */
-ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
-{
-	struct dentry *dir;
-	int err = 0;
-	struct listxattr_buf buf = {
-		.ctx.actor = listxattr_filler,
-		.dentry = dentry,
-		.buf = buffer,
-		.size = buffer ? size : 0,
-	};
-
-	if (d_really_is_negative(dentry))
-		return -EINVAL;
-
-	if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
-		return -EOPNOTSUPP;
-
-	dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
-	if (IS_ERR(dir)) {
-		err = PTR_ERR(dir);
-		if (err == -ENODATA)
-			err = 0;  /* Not an error if there aren't any xattrs */
-		goto out;
-	}
-
-	inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
-	err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
-	inode_unlock(d_inode(dir));
-
-	if (!err)
-		err = buf.pos;
-
-	dput(dir);
-out:
-	return err;
-}
-
-static int create_privroot(struct dentry *dentry)
-{
-	int err;
-	struct inode *inode = d_inode(dentry->d_parent);
-
-	WARN_ON_ONCE(!inode_is_locked(inode));
-
-	err = xattr_mkdir(inode, dentry, 0700);
-	if (err || d_really_is_negative(dentry)) {
-		reiserfs_warning(dentry->d_sb, "jdm-20006",
-				 "xattrs/ACLs enabled and couldn't "
-				 "find/create .reiserfs_priv. "
-				 "Failing mount.");
-		return -EOPNOTSUPP;
-	}
-
-	reiserfs_init_priv_inode(d_inode(dentry));
-	reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
-		      "storage.\n", PRIVROOT_NAME);
-
-	return 0;
-}
-
-#else
-int __init reiserfs_xattr_register_handlers(void) { return 0; }
-void reiserfs_xattr_unregister_handlers(void) {}
-static int create_privroot(struct dentry *dentry) { return 0; }
-#endif
-
-/* Actual operations that are exported to VFS-land */
-const struct xattr_handler * const reiserfs_xattr_handlers[] = {
-#ifdef CONFIG_REISERFS_FS_XATTR
-	&reiserfs_xattr_user_handler,
-	&reiserfs_xattr_trusted_handler,
-#endif
-#ifdef CONFIG_REISERFS_FS_SECURITY
-	&reiserfs_xattr_security_handler,
-#endif
-	NULL
-};
-
-static int xattr_mount_check(struct super_block *s)
-{
-	/*
-	 * We need generation numbers to ensure that the oid mapping is correct
-	 * v3.5 filesystems don't have them.
-	 */
-	if (old_format_only(s)) {
-		if (reiserfs_xattrs_optional(s)) {
-			/*
-			 * Old format filesystem, but optional xattrs have
-			 * been enabled. Error out.
-			 */
-			reiserfs_warning(s, "jdm-2005",
-					 "xattrs/ACLs not supported "
-					 "on pre-v3.6 format filesystems. "
-					 "Failing mount.");
-			return -EOPNOTSUPP;
-		}
-	}
-
-	return 0;
-}
-
-int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode,
-			int mask)
-{
-	/*
-	 * We don't do permission checks on the internal objects.
-	 * Permissions are determined by the "owning" object.
-	 */
-	if (IS_PRIVATE(inode))
-		return 0;
-
-	return generic_permission(&nop_mnt_idmap, inode, mask);
-}
-
-static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	return -EPERM;
-}
-
-static const struct dentry_operations xattr_lookup_poison_ops = {
-	.d_revalidate = xattr_hide_revalidate,
-};
-
-int reiserfs_lookup_privroot(struct super_block *s)
-{
-	struct dentry *dentry;
-	int err = 0;
-
-	/* If we don't have the privroot located yet - go find it */
-	inode_lock(d_inode(s->s_root));
-	dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
-				strlen(PRIVROOT_NAME));
-	if (!IS_ERR(dentry)) {
-		REISERFS_SB(s)->priv_root = dentry;
-		d_set_d_op(dentry, &xattr_lookup_poison_ops);
-		if (d_really_is_positive(dentry))
-			reiserfs_init_priv_inode(d_inode(dentry));
-	} else
-		err = PTR_ERR(dentry);
-	inode_unlock(d_inode(s->s_root));
-
-	return err;
-}
-
-/*
- * We need to take a copy of the mount flags since things like
- * SB_RDONLY don't get set until *after* we're called.
- * mount_flags != mount_options
- */
-int reiserfs_xattr_init(struct super_block *s, int mount_flags)
-{
-	int err = 0;
-	struct dentry *privroot = REISERFS_SB(s)->priv_root;
-
-	err = xattr_mount_check(s);
-	if (err)
-		goto error;
-
-	if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) {
-		inode_lock(d_inode(s->s_root));
-		err = create_privroot(REISERFS_SB(s)->priv_root);
-		inode_unlock(d_inode(s->s_root));
-	}
-
-	if (d_really_is_positive(privroot)) {
-		inode_lock(d_inode(privroot));
-		if (!REISERFS_SB(s)->xattr_root) {
-			struct dentry *dentry;
-
-			dentry = lookup_one_len(XAROOT_NAME, privroot,
-						strlen(XAROOT_NAME));
-			if (!IS_ERR(dentry))
-				REISERFS_SB(s)->xattr_root = dentry;
-			else
-				err = PTR_ERR(dentry);
-		}
-		inode_unlock(d_inode(privroot));
-	}
-
-error:
-	if (err) {
-		clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt);
-		clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
-	}
-
-	/* The super_block SB_POSIXACL must mirror the (no)acl mount option. */
-	if (reiserfs_posixacl(s))
-		s->s_flags |= SB_POSIXACL;
-	else
-		s->s_flags &= ~SB_POSIXACL;
-
-	return err;
-}
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
deleted file mode 100644
index 5868a4e990e3..000000000000
--- a/fs/reiserfs/xattr.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/reiserfs_xattr.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/rwsem.h>
-#include <linux/xattr.h>
-
-struct inode;
-struct dentry;
-struct iattr;
-struct super_block;
-
-int reiserfs_xattr_register_handlers(void) __init;
-void reiserfs_xattr_unregister_handlers(void);
-int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
-int reiserfs_lookup_privroot(struct super_block *sb);
-int reiserfs_delete_xattrs(struct inode *inode);
-int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
-int reiserfs_permission(struct mnt_idmap *idmap,
-			struct inode *inode, int mask);
-
-#ifdef CONFIG_REISERFS_FS_XATTR
-#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
-ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-
-int reiserfs_xattr_get(struct inode *, const char *, void *, size_t);
-int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
-int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *,
-			      struct inode *, const char *, const void *,
-			      size_t, int);
-
-extern const struct xattr_handler reiserfs_xattr_user_handler;
-extern const struct xattr_handler reiserfs_xattr_trusted_handler;
-extern const struct xattr_handler reiserfs_xattr_security_handler;
-#ifdef CONFIG_REISERFS_FS_SECURITY
-int reiserfs_security_init(struct inode *dir, struct inode *inode,
-			   const struct qstr *qstr,
-			   struct reiserfs_security_handle *sec);
-int reiserfs_security_write(struct reiserfs_transaction_handle *th,
-			    struct inode *inode,
-			    struct reiserfs_security_handle *sec);
-void reiserfs_security_free(struct reiserfs_security_handle *sec);
-#endif
-
-static inline int reiserfs_xattrs_initialized(struct super_block *sb)
-{
-	return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root;
-}
-
-#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
-static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
-{
-	loff_t ret = 0;
-	if (reiserfs_file_data_log(inode)) {
-		ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize);
-		ret >>= inode->i_sb->s_blocksize_bits;
-	}
-	return ret;
-}
-
-/*
- * We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
- * Let's try to be smart about it.
- * xattr root: We cache it. If it's not cached, we may need to create it.
- * xattr dir: If anything has been loaded for this inode, we can set a flag
- *            saying so.
- * xattr file: Since we don't cache xattrs, we can't tell. We always include
- *             blocks for it.
- *
- * However, since root and dir can be created between calls - YOU MUST SAVE
- * THIS VALUE.
- */
-static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode)
-{
-	size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-
-	if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) {
-		nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-		if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root))
-			nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-	}
-
-	return nblocks;
-}
-
-static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
-{
-	init_rwsem(&REISERFS_I(inode)->i_xattr_sem);
-}
-
-#else
-
-#define reiserfs_listxattr NULL
-
-static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
-{
-}
-#endif  /*  CONFIG_REISERFS_FS_XATTR  */
-
-#ifndef CONFIG_REISERFS_FS_SECURITY
-static inline int reiserfs_security_init(struct inode *dir,
-					 struct inode *inode,
-					 const struct qstr *qstr,
-					 struct reiserfs_security_handle *sec)
-{
-	return 0;
-}
-static inline int
-reiserfs_security_write(struct reiserfs_transaction_handle *th,
-			struct inode *inode,
-			struct reiserfs_security_handle *sec)
-{
-	return 0;
-}
-static inline void reiserfs_security_free(struct reiserfs_security_handle *sec)
-{}
-#endif
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
deleted file mode 100644
index 064264992b49..000000000000
--- a/fs/reiserfs/xattr_acl.c
+++ /dev/null
@@ -1,411 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/posix_acl.h>
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include <linux/slab.h>
-#include <linux/posix_acl_xattr.h>
-#include "xattr.h"
-#include "acl.h"
-#include <linux/uaccess.h>
-
-static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
-			    struct inode *inode, int type,
-			    struct posix_acl *acl);
-
-
-int
-reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
-		 struct posix_acl *acl, int type)
-{
-	int error, error2;
-	struct reiserfs_transaction_handle th;
-	size_t jcreate_blocks;
-	int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
-	int update_mode = 0;
-	struct inode *inode = d_inode(dentry);
-	umode_t mode = inode->i_mode;
-
-	/*
-	 * Pessimism: We can't assume that anything from the xattr root up
-	 * has been created.
-	 */
-
-	jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
-			 reiserfs_xattr_nblocks(inode, size) * 2;
-
-	reiserfs_write_lock(inode->i_sb);
-	error = journal_begin(&th, inode->i_sb, jcreate_blocks);
-	reiserfs_write_unlock(inode->i_sb);
-	if (error == 0) {
-		if (type == ACL_TYPE_ACCESS && acl) {
-			error = posix_acl_update_mode(&nop_mnt_idmap, inode,
-						      &mode, &acl);
-			if (error)
-				goto unlock;
-			update_mode = 1;
-		}
-		error = __reiserfs_set_acl(&th, inode, type, acl);
-		if (!error && update_mode)
-			inode->i_mode = mode;
-unlock:
-		reiserfs_write_lock(inode->i_sb);
-		error2 = journal_end(&th);
-		reiserfs_write_unlock(inode->i_sb);
-		if (error2)
-			error = error2;
-	}
-
-	return error;
-}
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
-{
-	const char *end = (char *)value + size;
-	int n, count;
-	struct posix_acl *acl;
-
-	if (!value)
-		return NULL;
-	if (size < sizeof(reiserfs_acl_header))
-		return ERR_PTR(-EINVAL);
-	if (((reiserfs_acl_header *) value)->a_version !=
-	    cpu_to_le32(REISERFS_ACL_VERSION))
-		return ERR_PTR(-EINVAL);
-	value = (char *)value + sizeof(reiserfs_acl_header);
-	count = reiserfs_acl_count(size);
-	if (count < 0)
-		return ERR_PTR(-EINVAL);
-	if (count == 0)
-		return NULL;
-	acl = posix_acl_alloc(count, GFP_NOFS);
-	if (!acl)
-		return ERR_PTR(-ENOMEM);
-	for (n = 0; n < count; n++) {
-		reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value;
-		if ((char *)value + sizeof(reiserfs_acl_entry_short) > end)
-			goto fail;
-		acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
-		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
-		switch (acl->a_entries[n].e_tag) {
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			value = (char *)value +
-			    sizeof(reiserfs_acl_entry_short);
-			break;
-
-		case ACL_USER:
-			value = (char *)value + sizeof(reiserfs_acl_entry);
-			if ((char *)value > end)
-				goto fail;
-			acl->a_entries[n].e_uid = 
-				make_kuid(&init_user_ns,
-					  le32_to_cpu(entry->e_id));
-			break;
-		case ACL_GROUP:
-			value = (char *)value + sizeof(reiserfs_acl_entry);
-			if ((char *)value > end)
-				goto fail;
-			acl->a_entries[n].e_gid =
-				make_kgid(&init_user_ns,
-					  le32_to_cpu(entry->e_id));
-			break;
-
-		default:
-			goto fail;
-		}
-	}
-	if (value != end)
-		goto fail;
-	return acl;
-
-fail:
-	posix_acl_release(acl);
-	return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
-{
-	reiserfs_acl_header *ext_acl;
-	char *e;
-	int n;
-
-	*size = reiserfs_acl_size(acl->a_count);
-	ext_acl = kmalloc(sizeof(reiserfs_acl_header) +
-						  acl->a_count *
-						  sizeof(reiserfs_acl_entry),
-						  GFP_NOFS);
-	if (!ext_acl)
-		return ERR_PTR(-ENOMEM);
-	ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
-	e = (char *)ext_acl + sizeof(reiserfs_acl_header);
-	for (n = 0; n < acl->a_count; n++) {
-		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
-		reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e;
-		entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
-		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
-		switch (acl->a_entries[n].e_tag) {
-		case ACL_USER:
-			entry->e_id = cpu_to_le32(
-				from_kuid(&init_user_ns, acl_e->e_uid));
-			e += sizeof(reiserfs_acl_entry);
-			break;
-		case ACL_GROUP:
-			entry->e_id = cpu_to_le32(
-				from_kgid(&init_user_ns, acl_e->e_gid));
-			e += sizeof(reiserfs_acl_entry);
-			break;
-
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			e += sizeof(reiserfs_acl_entry_short);
-			break;
-
-		default:
-			goto fail;
-		}
-	}
-	return (char *)ext_acl;
-
-fail:
-	kfree(ext_acl);
-	return ERR_PTR(-EINVAL);
-}
-
-/*
- * Inode operation get_posix_acl().
- *
- * inode->i_mutex: down
- * BKL held [before 2.5.x]
- */
-struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu)
-{
-	char *name, *value;
-	struct posix_acl *acl;
-	int size;
-	int retval;
-
-	if (rcu)
-		return ERR_PTR(-ECHILD);
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		name = XATTR_NAME_POSIX_ACL_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		name = XATTR_NAME_POSIX_ACL_DEFAULT;
-		break;
-	default:
-		BUG();
-	}
-
-	size = reiserfs_xattr_get(inode, name, NULL, 0);
-	if (size < 0) {
-		if (size == -ENODATA || size == -ENOSYS)
-			return NULL;
-		return ERR_PTR(size);
-	}
-
-	value = kmalloc(size, GFP_NOFS);
-	if (!value)
-		return ERR_PTR(-ENOMEM);
-
-	retval = reiserfs_xattr_get(inode, name, value, size);
-	if (retval == -ENODATA || retval == -ENOSYS) {
-		/*
-		 * This shouldn't actually happen as it should have
-		 * been caught above.. but just in case
-		 */
-		acl = NULL;
-	} else if (retval < 0) {
-		acl = ERR_PTR(retval);
-	} else {
-		acl = reiserfs_posix_acl_from_disk(value, retval);
-	}
-
-	kfree(value);
-	return acl;
-}
-
-/*
- * Inode operation set_posix_acl().
- *
- * inode->i_mutex: down
- * BKL held [before 2.5.x]
- */
-static int
-__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
-		 int type, struct posix_acl *acl)
-{
-	char *name;
-	void *value = NULL;
-	size_t size = 0;
-	int error;
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		name = XATTR_NAME_POSIX_ACL_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		name = XATTR_NAME_POSIX_ACL_DEFAULT;
-		if (!S_ISDIR(inode->i_mode))
-			return acl ? -EACCES : 0;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if (acl) {
-		value = reiserfs_posix_acl_to_disk(acl, &size);
-		if (IS_ERR(value))
-			return (int)PTR_ERR(value);
-	}
-
-	error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0);
-
-	/*
-	 * Ensure that the inode gets dirtied if we're only using
-	 * the mode bits and an old ACL didn't exist. We don't need
-	 * to check if the inode is hashed here since we won't get
-	 * called by reiserfs_inherit_default_acl().
-	 */
-	if (error == -ENODATA) {
-		error = 0;
-		if (type == ACL_TYPE_ACCESS) {
-			inode_set_ctime_current(inode);
-			mark_inode_dirty(inode);
-		}
-	}
-
-	kfree(value);
-
-	if (!error)
-		set_cached_acl(inode, type, acl);
-
-	return error;
-}
-
-/*
- * dir->i_mutex: locked,
- * inode is new and not released into the wild yet
- */
-int
-reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
-			     struct inode *dir, struct dentry *dentry,
-			     struct inode *inode)
-{
-	struct posix_acl *default_acl, *acl;
-	int err = 0;
-
-	/* ACLs only get applied to files and directories */
-	if (S_ISLNK(inode->i_mode))
-		return 0;
-
-	/*
-	 * ACLs can only be used on "new" objects, so if it's an old object
-	 * there is nothing to inherit from
-	 */
-	if (get_inode_sd_version(dir) == STAT_DATA_V1)
-		goto apply_umask;
-
-	/*
-	 * Don't apply ACLs to objects in the .reiserfs_priv tree.. This
-	 * would be useless since permissions are ignored, and a pain because
-	 * it introduces locking cycles
-	 */
-	if (IS_PRIVATE(inode))
-		goto apply_umask;
-
-	err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
-	if (err)
-		return err;
-
-	if (default_acl) {
-		err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
-					 default_acl);
-		posix_acl_release(default_acl);
-	}
-	if (acl) {
-		if (!err)
-			err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
-						 acl);
-		posix_acl_release(acl);
-	}
-
-	return err;
-
-apply_umask:
-	/* no ACL, apply umask */
-	inode->i_mode &= ~current_umask();
-	return err;
-}
-
-/* This is used to cache the default acl before a new object is created.
- * The biggest reason for this is to get an idea of how many blocks will
- * actually be required for the create operation if we must inherit an ACL.
- * An ACL write can add up to 3 object creations and an additional file write
- * so we'd prefer not to reserve that many blocks in the journal if we can.
- * It also has the advantage of not loading the ACL with a transaction open,
- * this may seem silly, but if the owner of the directory is doing the
- * creation, the ACL may not be loaded since the permissions wouldn't require
- * it.
- * We return the number of blocks required for the transaction.
- */
-int reiserfs_cache_default_acl(struct inode *inode)
-{
-	struct posix_acl *acl;
-	int nblocks = 0;
-
-	if (IS_PRIVATE(inode))
-		return 0;
-
-	acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
-
-	if (acl && !IS_ERR(acl)) {
-		int size = reiserfs_acl_size(acl->a_count);
-
-		/* Other xattrs can be created during inode creation. We don't
-		 * want to claim too many blocks, so we check to see if we
-		 * need to create the tree to the xattrs, and then we
-		 * just want two files. */
-		nblocks = reiserfs_xattr_jcreate_nblocks(inode);
-		nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-
-		REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
-
-		/* We need to account for writes + bitmaps for two files */
-		nblocks += reiserfs_xattr_nblocks(inode, size) * 4;
-		posix_acl_release(acl);
-	}
-
-	return nblocks;
-}
-
-/*
- * Called under i_mutex
- */
-int reiserfs_acl_chmod(struct dentry *dentry)
-{
-	struct inode *inode = d_inode(dentry);
-
-	if (IS_PRIVATE(inode))
-		return 0;
-	if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
-	    !reiserfs_posixacl(inode->i_sb))
-		return 0;
-
-	return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode);
-}
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
deleted file mode 100644
index 078dd8cc312f..000000000000
--- a/fs/reiserfs/xattr_security.c
+++ /dev/null
@@ -1,127 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include <linux/slab.h>
-#include "xattr.h"
-#include <linux/security.h>
-#include <linux/uaccess.h>
-
-static int
-security_get(const struct xattr_handler *handler, struct dentry *unused,
-	     struct inode *inode, const char *name, void *buffer, size_t size)
-{
-	if (IS_PRIVATE(inode))
-		return -EPERM;
-
-	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
-				  buffer, size);
-}
-
-static int
-security_set(const struct xattr_handler *handler,
-	     struct mnt_idmap *idmap, struct dentry *unused,
-	     struct inode *inode, const char *name, const void *buffer,
-	     size_t size, int flags)
-{
-	if (IS_PRIVATE(inode))
-		return -EPERM;
-
-	return reiserfs_xattr_set(inode,
-				  xattr_full_name(handler, name),
-				  buffer, size, flags);
-}
-
-static bool security_list(struct dentry *dentry)
-{
-	return !IS_PRIVATE(d_inode(dentry));
-}
-
-static int
-reiserfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
-		    void *fs_info)
-{
-	struct reiserfs_security_handle *sec = fs_info;
-
-	sec->value = kmemdup(xattr_array->value, xattr_array->value_len,
-			     GFP_KERNEL);
-	if (!sec->value)
-		return -ENOMEM;
-
-	sec->name = xattr_array->name;
-	sec->length = xattr_array->value_len;
-	return 0;
-}
-
-/* Initializes the security context for a new inode and returns the number
- * of blocks needed for the transaction. If successful, reiserfs_security
- * must be released using reiserfs_security_free when the caller is done. */
-int reiserfs_security_init(struct inode *dir, struct inode *inode,
-			   const struct qstr *qstr,
-			   struct reiserfs_security_handle *sec)
-{
-	int blocks = 0;
-	int error;
-
-	sec->name = NULL;
-	sec->value = NULL;
-	sec->length = 0;
-
-	/* Don't add selinux attributes on xattrs - they'll never get used */
-	if (IS_PRIVATE(dir))
-		return 0;
-
-	error = security_inode_init_security(inode, dir, qstr,
-					     &reiserfs_initxattrs, sec);
-	if (error) {
-		sec->name = NULL;
-		sec->value = NULL;
-		sec->length = 0;
-		return error;
-	}
-
-	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
-		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
-			 reiserfs_xattr_nblocks(inode, sec->length);
-		/* We don't want to count the directories twice if we have
-		 * a default ACL. */
-		REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
-	}
-	return blocks;
-}
-
-int reiserfs_security_write(struct reiserfs_transaction_handle *th,
-			    struct inode *inode,
-			    struct reiserfs_security_handle *sec)
-{
-	char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX;
-	int error;
-
-	if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX)
-		return -EINVAL;
-
-	strlcat(xattr_name, sec->name, sizeof(xattr_name));
-
-	error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value,
-					  sec->length, XATTR_CREATE);
-	if (error == -ENODATA || error == -EOPNOTSUPP)
-		error = 0;
-
-	return error;
-}
-
-void reiserfs_security_free(struct reiserfs_security_handle *sec)
-{
-	kfree(sec->value);
-	sec->name = NULL;
-	sec->value = NULL;
-}
-
-const struct xattr_handler reiserfs_xattr_security_handler = {
-	.prefix = XATTR_SECURITY_PREFIX,
-	.get = security_get,
-	.set = security_set,
-	.list = security_list,
-};
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
deleted file mode 100644
index 0c0c74d8db0e..000000000000
--- a/fs/reiserfs/xattr_trusted.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include <linux/uaccess.h>
-
-static int
-trusted_get(const struct xattr_handler *handler, struct dentry *unused,
-	    struct inode *inode, const char *name, void *buffer, size_t size)
-{
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
-		return -EPERM;
-
-	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
-				  buffer, size);
-}
-
-static int
-trusted_set(const struct xattr_handler *handler,
-	    struct mnt_idmap *idmap, struct dentry *unused,
-	    struct inode *inode, const char *name, const void *buffer,
-	    size_t size, int flags)
-{
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
-		return -EPERM;
-
-	return reiserfs_xattr_set(inode,
-				  xattr_full_name(handler, name),
-				  buffer, size, flags);
-}
-
-static bool trusted_list(struct dentry *dentry)
-{
-	return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
-}
-
-const struct xattr_handler reiserfs_xattr_trusted_handler = {
-	.prefix = XATTR_TRUSTED_PREFIX,
-	.get = trusted_get,
-	.set = trusted_set,
-	.list = trusted_list,
-};
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
deleted file mode 100644
index 88195181e1d7..000000000000
--- a/fs/reiserfs/xattr_user.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include <linux/uaccess.h>
-
-static int
-user_get(const struct xattr_handler *handler, struct dentry *unused,
-	 struct inode *inode, const char *name, void *buffer, size_t size)
-{
-	if (!reiserfs_xattrs_user(inode->i_sb))
-		return -EOPNOTSUPP;
-	return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
-				  buffer, size);
-}
-
-static int
-user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap,
-	 struct dentry *unused,
-	 struct inode *inode, const char *name, const void *buffer,
-	 size_t size, int flags)
-{
-	if (!reiserfs_xattrs_user(inode->i_sb))
-		return -EOPNOTSUPP;
-	return reiserfs_xattr_set(inode,
-				  xattr_full_name(handler, name),
-				  buffer, size, flags);
-}
-
-static bool user_list(struct dentry *dentry)
-{
-	return reiserfs_xattrs_user(dentry->d_sb);
-}
-
-const struct xattr_handler reiserfs_xattr_user_handler = {
-	.prefix = XATTR_USER_PREFIX,
-	.get = user_get,
-	.set = user_set,
-	.list = user_list,
-};
diff --git a/include/uapi/linux/reiserfs_fs.h b/include/uapi/linux/reiserfs_fs.h
deleted file mode 100644
index 5bb921409f2b..000000000000
--- a/include/uapi/linux/reiserfs_fs.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details
- */
-#ifndef _LINUX_REISER_FS_H
-#define _LINUX_REISER_FS_H
-
-#include <linux/types.h>
-#include <linux/magic.h>
-
-/*
- *  include/linux/reiser_fs.h
- *
- *  Reiser File System constants and structures
- *
- */
-
-/* ioctl's command */
-#define REISERFS_IOC_UNPACK		_IOW(0xCD,1,long)
-/* define following flags to be the same as in ext2, so that chattr(1),
-   lsattr(1) will work with us. */
-#define REISERFS_IOC_GETFLAGS		FS_IOC_GETFLAGS
-#define REISERFS_IOC_SETFLAGS		FS_IOC_SETFLAGS
-#define REISERFS_IOC_GETVERSION		FS_IOC_GETVERSION
-#define REISERFS_IOC_SETVERSION		FS_IOC_SETVERSION
-
-#endif				/* _LINUX_REISER_FS_H */
diff --git a/include/uapi/linux/reiserfs_xattr.h b/include/uapi/linux/reiserfs_xattr.h
deleted file mode 100644
index 503ad018ce5b..000000000000
--- a/include/uapi/linux/reiserfs_xattr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
-  File: linux/reiserfs_xattr.h
-*/
-
-#ifndef _LINUX_REISERFS_XATTR_H
-#define _LINUX_REISERFS_XATTR_H
-
-#include <linux/types.h>
-
-/* Magic value in header */
-#define REISERFS_XATTR_MAGIC 0x52465841	/* "RFXA" */
-
-struct reiserfs_xattr_header {
-	__le32 h_magic;		/* magic number for identification */
-	__le32 h_hash;		/* hash of the value */
-};
-
-struct reiserfs_security_handle {
-	const char *name;
-	void *value;
-	__kernel_size_t length;
-};
-
-#endif  /*  _LINUX_REISERFS_XATTR_H  */
diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c
index 1415604c3d24..a413c157904d 100644
--- a/scripts/selinux/mdp/mdp.c
+++ b/scripts/selinux/mdp/mdp.c
@@ -171,9 +171,6 @@ int main(int argc, char *argv[])
 #ifdef CONFIG_JFS_SECURITY
 	FS_USE("xattr", "jfs");
 #endif
-#ifdef CONFIG_REISERFS_FS_SECURITY
-	FS_USE("xattr", "reiserfs");
-#endif
 #ifdef CONFIG_JFFS2_FS_SECURITY
 	FS_USE("xattr", "jffs2");
 #endif
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index e7da92489167..f37614cc2c1b 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -11,7 +11,6 @@ NORETURN(__ia32_sys_exit)
 NORETURN(__ia32_sys_exit_group)
 NORETURN(__kunit_abort)
 NORETURN(__module_put_and_kthread_exit)
-NORETURN(__reiserfs_panic)
 NORETURN(__stack_chk_fail)
 NORETURN(__tdx_hypercall_failed)
 NORETURN(__ubsan_handle_builtin_unreachable)
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
index c773334bbcc9..8eb6aa606a0d 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -27,7 +27,7 @@ static const char *const known_fs[] = {
 	"ipathfs", "iso9660", "jffs2", "jfs", "minix", "mqueue", "msdos",
 	"nfs", "nfs4", "nfsd", "nilfs2", "nsfs", "ntfs", "ntfs3", "ocfs2",
 	"ocfs2_dlmfs", "ocxlflash", "omfs", "openpromfs", "overlay", "pipefs",
-	"proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs", "reiserfs",
+	"proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs",
 	"resctrl", "romfs", "rootfs", "rpc_pipefs", "s390_hypfs", "secretmem",
 	"securityfs", "selinuxfs", "smackfs", "smb3", "sockfs", "spufs",
 	"squashfs", "sysfs", "sysv", "tmpfs", "tracefs", "ubifs", "udf",
-- 
cgit v1.2.3


From 59eaa01ce7a6cbc5c36b928f52888f99fca6b295 Mon Sep 17 00:00:00 2001
From: Uday Shankar <ushankar@purestorage.com>
Date: Mon, 7 Oct 2024 12:24:17 -0600
Subject: ublk: support device recovery without I/O queueing

ublk currently supports the following behaviors on ublk server exit:

A: outstanding I/Os get errors, subsequently issued I/Os get errors
B: outstanding I/Os get errors, subsequently issued I/Os queue
C: outstanding I/Os get reissued, subsequently issued I/Os queue

and the following behaviors for recovery of preexisting block devices by
a future incarnation of the ublk server:

1: ublk devices stopped on ublk server exit (no recovery possible)
2: ublk devices are recoverable using start/end_recovery commands

The userspace interface allows selection of combinations of these
behaviors using flags specified at device creation time, namely:

default behavior: A + 1
UBLK_F_USER_RECOVERY: B + 2
UBLK_F_USER_RECOVERY|UBLK_F_USER_RECOVERY_REISSUE: C + 2

The behavior A + 2 is currently unsupported. Add support for this
behavior under the new flag combination
UBLK_F_USER_RECOVERY|UBLK_F_USER_RECOVERY_FAIL_IO.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20241007182419.3263186-5-ushankar@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 78 ++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/ublk_cmd.h | 18 ++++++++++
 2 files changed, 81 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0e75283e3bda..59951e7c2593 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -60,10 +60,12 @@
 		| UBLK_F_UNPRIVILEGED_DEV \
 		| UBLK_F_CMD_IOCTL_ENCODE \
 		| UBLK_F_USER_COPY \
-		| UBLK_F_ZONED)
+		| UBLK_F_ZONED \
+		| UBLK_F_USER_RECOVERY_FAIL_IO)
 
 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
-		| UBLK_F_USER_RECOVERY_REISSUE)
+		| UBLK_F_USER_RECOVERY_REISSUE \
+		| UBLK_F_USER_RECOVERY_FAIL_IO)
 
 /* All UBLK_PARAM_TYPE_* should be included here */
 #define UBLK_PARAM_TYPE_ALL                                \
@@ -146,6 +148,7 @@ struct ublk_queue {
 	bool force_abort;
 	bool timeout;
 	bool canceling;
+	bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
 	unsigned short nr_io_ready;	/* how many ios setup */
 	spinlock_t		cancel_lock;
 	struct ublk_device *dev;
@@ -690,7 +693,8 @@ static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
  */
 static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
 {
-	return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
+	return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
+	       !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
 }
 
 /*
@@ -700,7 +704,8 @@ static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
  */
 static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
 {
-	return ubq->flags & UBLK_F_USER_RECOVERY;
+	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
+	       !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
 }
 
 /*
@@ -714,6 +719,12 @@ static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
 	return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
 }
 
+static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
+{
+	return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
+	       ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
+}
+
 static void ublk_free_disk(struct gendisk *disk)
 {
 	struct ublk_device *ub = disk->private_data;
@@ -1275,6 +1286,10 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *rq = bd->rq;
 	blk_status_t res;
 
+	if (unlikely(ubq->fail_io)) {
+		return BLK_STS_TARGET;
+	}
+
 	/* fill iod to slot in io cmd buffer */
 	res = ublk_setup_iod(ubq, rq);
 	if (unlikely(res != BLK_STS_OK))
@@ -1625,6 +1640,7 @@ static void ublk_nosrv_work(struct work_struct *work)
 {
 	struct ublk_device *ub =
 		container_of(work, struct ublk_device, nosrv_work);
+	int i;
 
 	if (ublk_nosrv_should_stop_dev(ub)) {
 		ublk_stop_dev(ub);
@@ -1634,7 +1650,18 @@ static void ublk_nosrv_work(struct work_struct *work)
 	mutex_lock(&ub->mutex);
 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
 		goto unlock;
-	__ublk_quiesce_dev(ub);
+
+	if (ublk_nosrv_dev_should_queue_io(ub)) {
+		__ublk_quiesce_dev(ub);
+	} else {
+		blk_mq_quiesce_queue(ub->ub_disk->queue);
+		ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
+		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+			ublk_get_queue(ub, i)->fail_io = true;
+		}
+		blk_mq_unquiesce_queue(ub->ub_disk->queue);
+	}
+
  unlock:
 	mutex_unlock(&ub->mutex);
 	ublk_cancel_dev(ub);
@@ -2387,8 +2414,13 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 		return -EPERM;
 
 	/* forbid nonsense combinations of recovery flags */
-	if ((info.flags & UBLK_F_USER_RECOVERY_REISSUE) &&
-	    !(info.flags & UBLK_F_USER_RECOVERY)) {
+	switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
+	case 0:
+	case UBLK_F_USER_RECOVERY:
+	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
+	case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
+		break;
+	default:
 		pr_warn("%s: invalid recovery flags %llx\n", __func__,
 			info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
 		return -EINVAL;
@@ -2729,14 +2761,18 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
 	 *     released.
 	 *
+	 * and one of the following holds
+	 *
 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
 	 *     (a)has quiesced request queue
 	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
 	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
 	 *     (d)has completed/camceled all ioucmds owned by ther dying process
+	 *
+	 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
+	 *     quiesced, but all I/O is being immediately errored
 	 */
-	if (test_bit(UB_STATE_OPEN, &ub->state) ||
-			ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+	if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -2760,6 +2796,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
 	int ublksrv_pid = (int)header->data[0];
 	int ret = -EINVAL;
+	int i;
 
 	pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
 			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
@@ -2774,18 +2811,29 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
 	if (ublk_nosrv_should_stop_dev(ub))
 		goto out_unlock;
 
-	if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+	if (!ublk_dev_in_recoverable_state(ub)) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
 	ub->dev_info.ublksrv_pid = ublksrv_pid;
 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
 			__func__, ublksrv_pid, header->dev_id);
-	blk_mq_unquiesce_queue(ub->ub_disk->queue);
-	pr_devel("%s: queue unquiesced, dev id %d.\n",
-			__func__, header->dev_id);
-	blk_mq_kick_requeue_list(ub->ub_disk->queue);
-	ub->dev_info.state = UBLK_S_DEV_LIVE;
+
+	if (ublk_nosrv_dev_should_queue_io(ub)) {
+		ub->dev_info.state = UBLK_S_DEV_LIVE;
+		blk_mq_unquiesce_queue(ub->ub_disk->queue);
+		pr_devel("%s: queue unquiesced, dev id %d.\n",
+				__func__, header->dev_id);
+		blk_mq_kick_requeue_list(ub->ub_disk->queue);
+	} else {
+		blk_mq_quiesce_queue(ub->ub_disk->queue);
+		ub->dev_info.state = UBLK_S_DEV_LIVE;
+		for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+			ublk_get_queue(ub, i)->fail_io = false;
+		}
+		blk_mq_unquiesce_queue(ub->ub_disk->queue);
+	}
+
 	ret = 0;
  out_unlock:
 	mutex_unlock(&ub->mutex);
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 12873639ea96..a8bc98bb69fc 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -147,8 +147,18 @@
  */
 #define UBLK_F_NEED_GET_DATA (1UL << 2)
 
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is met with errors
+ * - I/O issued while there is no ublk server queues
+ */
 #define UBLK_F_USER_RECOVERY	(1UL << 3)
 
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is reissued
+ * - I/O issued while there is no ublk server is queued
+ */
 #define UBLK_F_USER_RECOVERY_REISSUE	(1UL << 4)
 
 /*
@@ -190,10 +200,18 @@
  */
 #define UBLK_F_ZONED (1ULL << 8)
 
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is met with errors
+ * - I/O issued while there is no ublk server is met with errors
+ */
+#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
 #define UBLK_S_DEV_QUIESCED	2
+#define UBLK_S_DEV_FAIL_IO 	3
 
 /* shipped via sqe->cmd of io_uring command */
 struct ublksrv_ctrl_cmd {
-- 
cgit v1.2.3


From b21d948f4cc73e3296f2365c7afca721dd6893fa Mon Sep 17 00:00:00 2001
From: Greg Joyce <gjoyce@linux.ibm.com>
Date: Thu, 29 Aug 2024 12:56:11 -0500
Subject: block: sed-opal: add ioctl IOC_OPAL_SET_SID_PW

After a SED drive is provisioned, there is no way to change the SID
password via the ioctl() interface. A new ioctl IOC_OPAL_SET_SID_PW
will allow the password to be changed. The valid current password is
required.

Signed-off-by: Greg Joyce <gjoyce@linux.ibm.com>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Link: https://lore.kernel.org/r/20240829175639.6478-2-gjoyce@linux.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/sed-opal.c              | 26 ++++++++++++++++++++++++++
 include/linux/sed-opal.h      |  1 +
 include/uapi/linux/sed-opal.h |  1 +
 3 files changed, 28 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 598fd3e7fcc8..5a28f23f7f22 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -3037,6 +3037,29 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 	return ret;
 }
 
+static int opal_set_new_sid_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
+{
+	int ret;
+	struct opal_key *newkey = &opal_pw->new_user_pw.opal_key;
+	struct opal_key *oldkey = &opal_pw->session.opal_key;
+
+	const struct opal_step pw_steps[] = {
+		{ start_SIDASP_opal_session, oldkey },
+		{ set_sid_cpin_pin, newkey },
+		{ end_opal_session, }
+	};
+
+	if (!dev)
+		return -ENODEV;
+
+	mutex_lock(&dev->dev_lock);
+	setup_opal_dev(dev);
+	ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
+	mutex_unlock(&dev->dev_lock);
+
+	return ret;
+}
+
 static int opal_activate_user(struct opal_dev *dev,
 			      struct opal_session_info *opal_session)
 {
@@ -3286,6 +3309,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
 	case IOC_OPAL_DISCOVERY:
 		ret = opal_get_discv(dev, p);
 		break;
+	case IOC_OPAL_SET_SID_PW:
+		ret = opal_set_new_sid_pw(dev, p);
+		break;
 
 	default:
 		break;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 2ac50822554e..80f33a93f944 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -52,6 +52,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_GET_GEOMETRY:
 	case IOC_OPAL_DISCOVERY:
 	case IOC_OPAL_REVERT_LSP:
+	case IOC_OPAL_SET_SID_PW:
 		return true;
 	}
 	return false;
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index d3994b7716bc..9025dd5a4f0f 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -215,5 +215,6 @@ struct opal_revert_lsp {
 #define IOC_OPAL_GET_GEOMETRY       _IOR('p', 238, struct opal_geometry)
 #define IOC_OPAL_DISCOVERY          _IOW('p', 239, struct opal_discovery)
 #define IOC_OPAL_REVERT_LSP         _IOW('p', 240, struct opal_revert_lsp)
+#define IOC_OPAL_SET_SID_PW         _IOW('p', 241, struct opal_new_pw)
 
 #endif /* _UAPI_SED_OPAL_H */
-- 
cgit v1.2.3


From 3607798ad9bdef35ad08489a8239390fccaac6b5 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 9 Oct 2024 10:25:42 +0200
Subject: wifi: cfg80211: add option for vif allowed radios

This allows users to prevent a vif from affecting radios other than the
configured ones. This can be useful in cases where e.g. an AP is running
on one radio, and triggering a scan on another radio should not disturb it.

Changing the allowed radios list for a vif is supported, but only while
it is down.

While it is possible to achieve the same by always explicitly specifying
a frequency list for scan requests and ensuring that the wrong channel/band
is never accidentally set on an unrelated interface, this change makes
multi-radio wiphy setups a lot easier to deal with for CLI users.

By itself, this patch only enforces the radio mask for scanning requests
and remain-on-channel. Follow-up changes build on this to limit configured
frequencies.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://patch.msgid.link/eefcb218780f71a1549875d149f1196486762756.1728462320.git-series.nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 14 +++++++++++
 include/uapi/linux/nl80211.h |  5 ++++
 net/wireless/core.c          |  2 ++
 net/wireless/nl80211.c       | 60 ++++++++++++++++++++++++++++++++++++++------
 net/wireless/scan.c          | 10 +++++---
 net/wireless/util.c          | 29 +++++++++++++++++++++
 6 files changed, 109 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index c8ce5c2e14f4..95d05e67e69a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6221,6 +6221,7 @@ enum ieee80211_ap_reg_power {
  *	entered.
  * @links.cac_time_ms: CAC time in ms
  * @valid_links: bitmap describing what elements of @links are valid
+ * @radio_mask: Bitmask of radios that this interface is allowed to operate on.
  */
 struct wireless_dev {
 	struct wiphy *wiphy;
@@ -6333,6 +6334,8 @@ struct wireless_dev {
 		unsigned int cac_time_ms;
 	} links[IEEE80211_MLD_MAX_NUM_LINKS];
 	u16 valid_links;
+
+	u32 radio_mask;
 };
 
 static inline const u8 *wdev_address(struct wireless_dev *wdev)
@@ -6518,6 +6521,17 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan)
 bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio,
 				  const struct cfg80211_chan_def *chandef);
 
+/**
+ * cfg80211_wdev_channel_allowed - Check if the wdev may use the channel
+ *
+ * @wdev: the wireless device
+ * @chan: channel to check
+ *
+ * Return: whether or not the wdev may use the channel
+ */
+bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev,
+				   struct ieee80211_channel *chan);
+
 /**
  * ieee80211_get_response_rate - get basic rate for a given rate
  *
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f97f5adc8d51..d31ccee99cc7 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2868,6 +2868,9 @@ enum nl80211_commands {
  *	nested item, it contains attributes defined in
  *	&enum nl80211_if_combination_attrs.
  *
+ * @NL80211_ATTR_VIF_RADIO_MASK: Bitmask of allowed radios (u32).
+ *	A value of 0 means all radios.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3416,6 +3419,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_WIPHY_RADIOS,
 	NL80211_ATTR_WIPHY_INTERFACE_COMBINATIONS,
 
+	NL80211_ATTR_VIF_RADIO_MASK,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 4c8d8f167409..93d62a1d3a45 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1430,6 +1430,8 @@ void cfg80211_init_wdev(struct wireless_dev *wdev)
 	/* allow mac80211 to determine the timeout */
 	wdev->ps_timeout = -1;
 
+	wdev->radio_mask = BIT(wdev->wiphy->n_radio) - 1;
+
 	if ((wdev->iftype == NL80211_IFTYPE_STATION ||
 	     wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
 	     wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index fb35c03af34c..a330347dd7a3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -829,6 +829,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_MLO_TTLM_DLINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8),
 	[NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8),
 	[NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG },
+	[NL80211_ATTR_VIF_RADIO_MASK] = { .type = NLA_U32 },
 };
 
 /* policy for the key attributes */
@@ -3996,7 +3997,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	    nla_put_u32(msg, NL80211_ATTR_GENERATION,
 			rdev->devlist_generation ^
 			(cfg80211_rdev_list_generation << 2)) ||
-	    nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr))
+	    nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr) ||
+	    nla_put_u32(msg, NL80211_ATTR_VIF_RADIO_MASK, wdev->radio_mask))
 		goto nla_put_failure;
 
 	if (rdev->ops->get_channel && !wdev->valid_links) {
@@ -4312,6 +4314,29 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
 	return -EOPNOTSUPP;
 }
 
+static int nl80211_parse_vif_radio_mask(struct genl_info *info,
+					u32 *radio_mask)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct nlattr *attr = info->attrs[NL80211_ATTR_VIF_RADIO_MASK];
+	u32 mask, allowed;
+
+	if (!attr) {
+		*radio_mask = 0;
+		return 0;
+	}
+
+	allowed = BIT(rdev->wiphy.n_radio) - 1;
+	mask = nla_get_u32(attr);
+	if (mask & ~allowed)
+		return -EINVAL;
+	if (!mask)
+		mask = allowed;
+	*radio_mask = mask;
+
+	return 1;
+}
+
 static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -4319,6 +4344,8 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 	int err;
 	enum nl80211_iftype otype, ntype;
 	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	u32 radio_mask = 0;
 	bool change = false;
 
 	memset(&params, 0, sizeof(params));
@@ -4332,8 +4359,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (info->attrs[NL80211_ATTR_MESH_ID]) {
-		struct wireless_dev *wdev = dev->ieee80211_ptr;
-
 		if (ntype != NL80211_IFTYPE_MESH_POINT)
 			return -EINVAL;
 		if (otype != NL80211_IFTYPE_MESH_POINT)
@@ -4364,6 +4389,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 	if (err > 0)
 		change = true;
 
+	err = nl80211_parse_vif_radio_mask(info, &radio_mask);
+	if (err < 0)
+		return err;
+	if (err && netif_running(dev))
+		return -EBUSY;
+
 	if (change)
 		err = cfg80211_change_iface(rdev, dev, ntype, &params);
 	else
@@ -4372,11 +4403,11 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 	if (!err && params.use_4addr != -1)
 		dev->ieee80211_ptr->use_4addr = params.use_4addr;
 
-	if (change && !err) {
-		struct wireless_dev *wdev = dev->ieee80211_ptr;
+	if (radio_mask)
+		wdev->radio_mask = radio_mask;
 
+	if (change && !err)
 		nl80211_notify_iface(rdev, wdev, NL80211_CMD_SET_INTERFACE);
-	}
 
 	return err;
 }
@@ -4387,6 +4418,7 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 	struct vif_params params;
 	struct wireless_dev *wdev;
 	struct sk_buff *msg;
+	u32 radio_mask;
 	int err;
 	enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;
 
@@ -4424,6 +4456,10 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 	if (err < 0)
 		return err;
 
+	err = nl80211_parse_vif_radio_mask(info, &radio_mask);
+	if (err < 0)
+		return err;
+
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
 		return -ENOMEM;
@@ -4465,6 +4501,9 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 		break;
 	}
 
+	if (radio_mask)
+		wdev->radio_mask = radio_mask;
+
 	if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0,
 			       rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) {
 		nlmsg_free(msg);
@@ -9156,6 +9195,9 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev,
 
 	lockdep_assert_wiphy(wdev->wiphy);
 
+	if (!cfg80211_wdev_channel_allowed(wdev, chan))
+		return false;
+
 	if (!cfg80211_beaconing_iface_active(wdev))
 		return true;
 
@@ -9368,7 +9410,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 			}
 
 			/* ignore disabled channels */
-			if (chan->flags & IEEE80211_CHAN_DISABLED)
+			if (chan->flags & IEEE80211_CHAN_DISABLED ||
+			    !cfg80211_wdev_channel_allowed(wdev, chan))
 				continue;
 
 			request->channels[i] = chan;
@@ -9388,7 +9431,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 
 				chan = &wiphy->bands[band]->channels[j];
 
-				if (chan->flags & IEEE80211_CHAN_DISABLED)
+				if (chan->flags & IEEE80211_CHAN_DISABLED ||
+				    !cfg80211_wdev_channel_allowed(wdev, chan))
 					continue;
 
 				request->channels[i] = chan;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 8ba618f4734f..8e3d46bf4836 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -956,7 +956,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 		struct ieee80211_channel *chan =
 			ieee80211_get_channel(&rdev->wiphy, ap->center_freq);
 
-		if (!chan || chan->flags & IEEE80211_CHAN_DISABLED)
+		if (!chan || chan->flags & IEEE80211_CHAN_DISABLED ||
+		    !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan))
 			continue;
 
 		for (i = 0; i < rdev_req->n_channels; i++) {
@@ -3515,9 +3516,12 @@ int cfg80211_wext_siwscan(struct net_device *dev,
 			continue;
 
 		for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
+			struct ieee80211_channel *chan;
+
 			/* ignore disabled channels */
-			if (wiphy->bands[band]->channels[j].flags &
-						IEEE80211_CHAN_DISABLED)
+			chan = &wiphy->bands[band]->channels[j];
+			if (chan->flags & IEEE80211_CHAN_DISABLED ||
+			    !cfg80211_wdev_channel_allowed(creq->wdev, chan))
 				continue;
 
 			/* If we have a wireless request structure and the
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 93a9c32418a6..040d62051eb9 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -2923,3 +2923,32 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio,
 	return true;
 }
 EXPORT_SYMBOL(cfg80211_radio_chandef_valid);
+
+bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev,
+				   struct ieee80211_channel *chan)
+{
+	struct wiphy *wiphy = wdev->wiphy;
+	const struct wiphy_radio *radio;
+	struct cfg80211_chan_def chandef;
+	u32 radio_mask;
+	int i;
+
+	radio_mask = wdev->radio_mask;
+	if (!wiphy->n_radio || radio_mask == BIT(wiphy->n_radio) - 1)
+		return true;
+
+	cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_HT20);
+	for (i = 0; i < wiphy->n_radio; i++) {
+		if (!(radio_mask & BIT(i)))
+			continue;
+
+		radio = &wiphy->radio[i];
+		if (!cfg80211_radio_chandef_valid(radio, &chandef))
+			continue;
+
+		return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL(cfg80211_wdev_channel_allowed);
-- 
cgit v1.2.3


From ebda716ea4da03326ac4d0a71604d18aa8a2e695 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 9 Oct 2024 10:25:45 +0200
Subject: wifi: cfg80211: report per wiphy radio antenna mask

With multi-radio devices, each radio typically gets a fixed set of antennas.
In order to be able to disable specific antennas for some radios, user space
needs to know which antenna mask bits are assigned to which radio.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://patch.msgid.link/e0a26afa2c88eaa188ec96ec6d17ecac4e827641.1728462320.git-series.nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 4 ++++
 include/uapi/linux/nl80211.h | 3 +++
 net/wireless/nl80211.c       | 5 +++++
 3 files changed, 12 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 95d05e67e69a..3100733f3e23 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5434,6 +5434,8 @@ struct wiphy_radio_freq_range {
  * @iface_combinations: Valid interface combinations array, should not
  *	list single interface types.
  * @n_iface_combinations: number of entries in @iface_combinations array.
+ *
+ * @antenna_mask: bitmask of antennas connected to this radio.
  */
 struct wiphy_radio {
 	const struct wiphy_radio_freq_range *freq_range;
@@ -5441,6 +5443,8 @@ struct wiphy_radio {
 
 	const struct ieee80211_iface_combination *iface_combinations;
 	int n_iface_combinations;
+
+	u32 antenna_mask;
 };
 
 #define CFG80211_HW_TIMESTAMP_ALL_PEERS	0xffff
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index d31ccee99cc7..1b8827f920ff 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -8036,6 +8036,8 @@ enum nl80211_ap_settings_flags {
  * @NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION: Supported interface
  *	combination for this radio. Attribute may be present multiple times
  *	and contains attributes defined in &enum nl80211_if_combination_attrs.
+ * @NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK: bitmask (u32) of antennas
+ *	connected to this radio.
  *
  * @__NL80211_WIPHY_RADIO_ATTR_LAST: Internal
  * @NL80211_WIPHY_RADIO_ATTR_MAX: Highest attribute
@@ -8046,6 +8048,7 @@ enum nl80211_wiphy_radio_attrs {
 	NL80211_WIPHY_RADIO_ATTR_INDEX,
 	NL80211_WIPHY_RADIO_ATTR_FREQ_RANGE,
 	NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION,
+	NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK,
 
 	/* keep last */
 	__NL80211_WIPHY_RADIO_ATTR_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a330347dd7a3..aa78f18dd454 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2431,6 +2431,11 @@ static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx)
 	if (nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_INDEX, idx))
 		goto nla_put_failure;
 
+	if (r->antenna_mask &&
+	    nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK,
+			r->antenna_mask))
+		goto nla_put_failure;
+
 	for (i = 0; i < r->n_freq_range; i++) {
 		const struct wiphy_radio_freq_range *range = &r->freq_range[i];
 
-- 
cgit v1.2.3


From a77e527b470cc38754c730bce1483711f643bb60 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 9 Oct 2024 10:25:49 +0200
Subject: wifi: cfg80211: add monitor SKIP_TX flag

This can be used to indicate that the user is not interested in receiving
locally sent packets on the monitor interface.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://patch.msgid.link/f0c20f832eadd36c71fba9a2a16ba57d78389b6c.1728462320.git-series.nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 2 ++
 include/uapi/linux/nl80211.h | 2 ++
 net/wireless/nl80211.c       | 1 +
 3 files changed, 5 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5feb93ba0400..8f9853b1a5d1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2267,6 +2267,7 @@ static inline int cfg80211_get_station(struct net_device *dev,
  * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering
  * @MONITOR_FLAG_COOK_FRAMES: report frames after processing
  * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address
+ * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames
  */
 enum monitor_flags {
 	MONITOR_FLAG_CHANGED		= BIT(__NL80211_MNTR_FLAG_INVALID),
@@ -2276,6 +2277,7 @@ enum monitor_flags {
 	MONITOR_FLAG_OTHER_BSS		= BIT(NL80211_MNTR_FLAG_OTHER_BSS),
 	MONITOR_FLAG_COOK_FRAMES	= BIT(NL80211_MNTR_FLAG_COOK_FRAMES),
 	MONITOR_FLAG_ACTIVE		= BIT(NL80211_MNTR_FLAG_ACTIVE),
+	MONITOR_FLAG_SKIP_TX		= BIT(NL80211_MNTR_FLAG_SKIP_TX),
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 1b8827f920ff..6d11437596b9 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4703,6 +4703,7 @@ enum nl80211_survey_info {
  *	overrides all other flags.
  * @NL80211_MNTR_FLAG_ACTIVE: use the configured MAC address
  *	and ACK incoming unicast packets.
+ * @NL80211_MNTR_FLAG_SKIP_TX: do not pass local tx packets
  *
  * @__NL80211_MNTR_FLAG_AFTER_LAST: internal use
  * @NL80211_MNTR_FLAG_MAX: highest possible monitor flag
@@ -4715,6 +4716,7 @@ enum nl80211_mntr_flags {
 	NL80211_MNTR_FLAG_OTHER_BSS,
 	NL80211_MNTR_FLAG_COOK_FRAMES,
 	NL80211_MNTR_FLAG_ACTIVE,
+	NL80211_MNTR_FLAG_SKIP_TX,
 
 	/* keep last */
 	__NL80211_MNTR_FLAG_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 84015f56e93a..4a8c3b6d49d1 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4206,6 +4206,7 @@ static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = {
 	[NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG },
 	[NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG },
 	[NL80211_MNTR_FLAG_ACTIVE] = { .type = NLA_FLAG },
+	[NL80211_MNTR_FLAG_SKIP_TX] = { .type = NLA_FLAG },
 };
 
 static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags)
-- 
cgit v1.2.3


From cdda1f26e74bac732eca537a69f19f6a37b641be Mon Sep 17 00:00:00 2001
From: Luca Boccassi <luca.boccassi@gmail.com>
Date: Thu, 10 Oct 2024 16:52:32 +0100
Subject: pidfd: add ioctl to retrieve pid info

A common pattern when using pid fds is having to get information
about the process, which currently requires /proc being mounted,
resolving the fd to a pid, and then doing manual string parsing of
/proc/N/status and friends. This needs to be reimplemented over
and over in all userspace projects (e.g.: I have reimplemented
resolving in systemd, dbus, dbus-daemon, polkit so far), and
requires additional care in checking that the fd is still valid
after having parsed the data, to avoid races.

Having a programmatic API that can be used directly removes all
these requirements, including having /proc mounted.

As discussed at LPC24, add an ioctl with an extensible struct
so that more parameters can be added later if needed. Start with
returning pid/tgid/ppid and creds unconditionally, and cgroupid
optionally.

Signed-off-by: Luca Boccassi <luca.boccassi@gmail.com>
Link: https://lore.kernel.org/r/20241010155401.2268522-1-luca.boccassi@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c                                      | 86 ++++++++++++++++++++++++-
 include/uapi/linux/pidfd.h                      | 50 ++++++++++++++
 tools/testing/selftests/pidfd/pidfd_open_test.c | 82 ++++++++++++++++++++++-
 3 files changed, 214 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 80675b6bf884..618abb1fa1b8 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -2,6 +2,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/cgroup.h>
 #include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/pid.h>
@@ -114,6 +115,81 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	return poll_flags;
 }
 
+static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+	size_t usize = _IOC_SIZE(cmd);
+	struct pidfd_info kinfo = {};
+	struct user_namespace *user_ns;
+	const struct cred *c;
+	__u64 mask;
+#ifdef CONFIG_CGROUPS
+	struct cgroup *cgrp;
+#endif
+
+	if (!uinfo)
+		return -EINVAL;
+	if (usize < PIDFD_INFO_SIZE_VER0)
+		return -EINVAL; /* First version, no smaller struct possible */
+
+	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
+		return -EFAULT;
+
+	c = get_task_cred(task);
+	if (!c)
+		return -ESRCH;
+
+	/* Unconditionally return identifiers and credentials, the rest only on request */
+
+	user_ns = current_user_ns();
+	kinfo.ruid = from_kuid_munged(user_ns, c->uid);
+	kinfo.rgid = from_kgid_munged(user_ns, c->gid);
+	kinfo.euid = from_kuid_munged(user_ns, c->euid);
+	kinfo.egid = from_kgid_munged(user_ns, c->egid);
+	kinfo.suid = from_kuid_munged(user_ns, c->suid);
+	kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
+	kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
+	kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
+	kinfo.mask |= PIDFD_INFO_CREDS;
+	put_cred(c);
+
+#ifdef CONFIG_CGROUPS
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	kinfo.cgroupid = cgroup_id(cgrp);
+	kinfo.mask |= PIDFD_INFO_CGROUPID;
+	rcu_read_unlock();
+#endif
+
+	/*
+	 * Copy pid/tgid last, to reduce the chances the information might be
+	 * stale. Note that it is not possible to ensure it will be valid as the
+	 * task might return as soon as the copy_to_user finishes, but that's ok
+	 * and userspace expects that might happen and can act accordingly, so
+	 * this is just best-effort. What we can do however is checking that all
+	 * the fields are set correctly, or return ESRCH to avoid providing
+	 * incomplete information. */
+
+	kinfo.ppid = task_ppid_nr_ns(task, NULL);
+	kinfo.tgid = task_tgid_vnr(task);
+	kinfo.pid = task_pid_vnr(task);
+	kinfo.mask |= PIDFD_INFO_PID;
+
+	if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
+		return -ESRCH;
+
+	/*
+	 * If userspace and the kernel have the same struct size it can just
+	 * be copied. If userspace provides an older struct, only the bits that
+	 * userspace knows about will be copied. If userspace provides a new
+	 * struct, only the bits that the kernel knows about will be copied.
+	 */
+	if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
+		return -EFAULT;
+
+	return 0;
+}
+
 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct task_struct *task __free(put_task) = NULL;
@@ -122,13 +198,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct ns_common *ns_common = NULL;
 	struct pid_namespace *pid_ns;
 
-	if (arg)
-		return -EINVAL;
-
 	task = get_pid_task(pid, PIDTYPE_PID);
 	if (!task)
 		return -ESRCH;
 
+	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
+	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
+		return pidfd_info(task, cmd, arg);
+
+	if (arg)
+		return -EINVAL;
+
 	scoped_guard(task_lock, task) {
 		nsp = task->nsproxy;
 		if (nsp)
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 565fc0629fff..4540f6301b8c 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -16,6 +16,55 @@
 #define PIDFD_SIGNAL_THREAD_GROUP	(1UL << 1)
 #define PIDFD_SIGNAL_PROCESS_GROUP	(1UL << 2)
 
+/* Flags for pidfd_info. */
+#define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
+#define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
+#define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
+
+#define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
+
+struct pidfd_info {
+	/*
+	 * This mask is similar to the request_mask in statx(2).
+	 *
+	 * Userspace indicates what extensions or expensive-to-calculate fields
+	 * they want by setting the corresponding bits in mask. The kernel
+	 * will ignore bits that it does not know about.
+	 *
+	 * When filling the structure, the kernel will only set bits
+	 * corresponding to the fields that were actually filled by the kernel.
+	 * This also includes any future extensions that might be automatically
+	 * filled. If the structure size is too small to contain a field
+	 * (requested or not), to avoid confusion the mask will not
+	 * contain a bit for that field.
+	 *
+	 * As such, userspace MUST verify that mask contains the
+	 * corresponding flags after the ioctl(2) returns to ensure that it is
+	 * using valid data.
+	 */
+	__u64 mask;
+	/*
+	 * The information contained in the following fields might be stale at the
+	 * time it is received, as the target process might have exited as soon as
+	 * the IOCTL was processed, and there is no way to avoid that. However, it
+	 * is guaranteed that if the call was successful, then the information was
+	 * correct and referred to the intended process at the time the work was
+	 * performed. */
+	__u64 cgroupid;
+	__u32 pid;
+	__u32 tgid;
+	__u32 ppid;
+	__u32 ruid;
+	__u32 rgid;
+	__u32 euid;
+	__u32 egid;
+	__u32 suid;
+	__u32 sgid;
+	__u32 fsuid;
+	__u32 fsgid;
+	__u32 spare0[1];
+};
+
 #define PIDFS_IOCTL_MAGIC 0xFF
 
 #define PIDFD_GET_CGROUP_NAMESPACE            _IO(PIDFS_IOCTL_MAGIC, 1)
@@ -28,5 +77,6 @@
 #define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8)
 #define PIDFD_GET_USER_NAMESPACE              _IO(PIDFS_IOCTL_MAGIC, 9)
 #define PIDFD_GET_UTS_NAMESPACE               _IO(PIDFS_IOCTL_MAGIC, 10)
+#define PIDFD_GET_INFO                        _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
 
 #endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
index c62564c264b1..ce413a221bac 100644
--- a/tools/testing/selftests/pidfd/pidfd_open_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -13,6 +13,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <syscall.h>
+#include <sys/ioctl.h>
 #include <sys/mount.h>
 #include <sys/prctl.h>
 #include <sys/wait.h>
@@ -21,6 +22,32 @@
 #include "pidfd.h"
 #include "../kselftest.h"
 
+#ifndef PIDFS_IOCTL_MAGIC
+#define PIDFS_IOCTL_MAGIC 0xFF
+#endif
+
+#ifndef PIDFD_GET_INFO
+#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
+#define PIDFD_INFO_CGROUPID		(1UL << 2) /* must match the UAPI header; bit 0 is PIDFD_INFO_PID */
+
+struct pidfd_info {
+	__u64 request_mask; /* called "mask" in the UAPI struct; same offset and size */
+	__u64 cgroupid;
+	__u32 pid;
+	__u32 tgid;
+	__u32 ppid;
+	__u32 ruid;
+	__u32 rgid;
+	__u32 euid;
+	__u32 egid;
+	__u32 suid;
+	__u32 sgid;
+	__u32 fsuid;
+	__u32 fsgid;
+	__u32 spare0[1];
+};
+#endif
+
 static int safe_int(const char *numstr, int *converted)
 {
 	char *err = NULL;
@@ -120,10 +147,13 @@ out:
 
 int main(int argc, char **argv)
 {
+	struct pidfd_info info = {
+		.request_mask = PIDFD_INFO_CGROUPID,
+	};
 	int pidfd = -1, ret = 1;
 	pid_t pid;
 
-	ksft_set_plan(3);
+	ksft_set_plan(4);
 
 	pidfd = sys_pidfd_open(-1, 0);
 	if (pidfd >= 0) {
@@ -153,6 +183,56 @@ int main(int argc, char **argv)
 	pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
 	ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
 
+	if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0) {
+		ksft_print_msg("%s - failed to get info from pidfd\n", strerror(errno));
+		goto on_error;
+	}
+	if (info.pid != pid) {
+		ksft_print_msg("pid from fdinfo file %d does not match pid from ioctl %d\n",
+			       pid, info.pid);
+		goto on_error;
+	}
+	if (info.ppid != getppid()) {
+		ksft_print_msg("ppid %d does not match ppid from ioctl %d\n",
+			       getppid(), info.ppid);
+		goto on_error;
+	}
+	if (info.ruid != getuid()) {
+		ksft_print_msg("uid %d does not match uid from ioctl %d\n",
+			       getuid(), info.ruid);
+		goto on_error;
+	}
+	if (info.rgid != getgid()) {
+		ksft_print_msg("gid %d does not match gid from ioctl %d\n",
+			       getgid(), info.rgid);
+		goto on_error;
+	}
+	if (info.euid != geteuid()) {
+		ksft_print_msg("euid %d does not match euid from ioctl %d\n",
+			       geteuid(), info.euid);
+		goto on_error;
+	}
+	if (info.egid != getegid()) {
+		ksft_print_msg("egid %d does not match egid from ioctl %d\n",
+			       getegid(), info.egid);
+		goto on_error;
+	}
+	if (info.suid != geteuid()) {
+		ksft_print_msg("suid %d does not match suid from ioctl %d\n",
+			       geteuid(), info.suid);
+		goto on_error;
+	}
+	if (info.sgid != getegid()) {
+		ksft_print_msg("sgid %d does not match sgid from ioctl %d\n",
+			       getegid(), info.sgid);
+		goto on_error;
+	}
+	if ((info.request_mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) {
+		ksft_print_msg("cgroupid should not be 0 when PIDFD_INFO_CGROUPID is set\n");
+		goto on_error;
+	}
+	ksft_test_result_pass("get info from pidfd test: passed\n");
+
 	ret = 0;
 
 on_error:
-- 
cgit v1.2.3


From 1773572863c43a14a3e45f0591f28b7dec1ee52a Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 22 Oct 2024 17:51:42 +0200
Subject: thermal: netlink: Add the commands and the events for the thresholds

The thresholds exist, but there is neither a notification nor an action
code related to them yet.

These changes implement the netlink notifications sent when the
thresholds are crossed, added, deleted or flushed, as well as the
commands which allow userspace to list the thresholds, flush them,
and add or delete individual ones.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://patch.msgid.link/20241022155147.463475-3-daniel.lezcano@linaro.org
[ rjw: Use the thermal_zone guard for locking, subject edit ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_netlink.c    | 232 ++++++++++++++++++++++++++++++++++-
 drivers/thermal/thermal_netlink.h    |  34 +++++
 drivers/thermal/thermal_thresholds.c |  34 ++---
 drivers/thermal/thermal_thresholds.h |   2 +-
 include/uapi/linux/thermal.h         |  27 ++--
 5 files changed, 301 insertions(+), 28 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c
index 91f3fe8c8f07..315a76b01f6a 100644
--- a/drivers/thermal/thermal_netlink.c
+++ b/drivers/thermal/thermal_netlink.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/kernel.h>
+#include <net/sock.h>
 #include <net/genetlink.h>
 #include <uapi/linux/thermal.h>
 
@@ -49,6 +50,11 @@ static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] =
 	[THERMAL_GENL_ATTR_CPU_CAPABILITY_ID]		= { .type = NLA_U32 },
 	[THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE]	= { .type = NLA_U32 },
 	[THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY]	= { .type = NLA_U32 },
+
+	/* Thresholds */
+	[THERMAL_GENL_ATTR_THRESHOLD]		= { .type = NLA_NESTED },
+	[THERMAL_GENL_ATTR_THRESHOLD_TEMP]	= { .type = NLA_U32 },
+	[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]	= { .type = NLA_U32 },
 };
 
 struct param {
@@ -62,6 +68,8 @@ struct param {
 	int trip_type;
 	int trip_hyst;
 	int temp;
+	int prev_temp;
+	int direction;
 	int cdev_state;
 	int cdev_max_state;
 	struct thermal_genl_cpu_caps *cpu_capabilities;
@@ -234,6 +242,34 @@ out_cancel_nest:
 	return -EMSGSIZE;
 }
 
+static int thermal_genl_event_threshold_add(struct param *p)
+{
+	if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) ||
+	    nla_put_u32(p->msg, THERMAL_GENL_ATTR_THRESHOLD_TEMP, p->temp) ||
+	    nla_put_u32(p->msg, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, p->direction))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int thermal_genl_event_threshold_flush(struct param *p)
+{
+	if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int thermal_genl_event_threshold_up(struct param *p)
+{
+	if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) ||
+	    nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_PREV_TEMP, p->prev_temp) ||
+	    nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TEMP, p->temp))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 int thermal_genl_event_tz_delete(struct param *p)
 	__attribute__((alias("thermal_genl_event_tz")));
 
@@ -246,6 +282,12 @@ int thermal_genl_event_tz_disable(struct param *p)
 int thermal_genl_event_tz_trip_down(struct param *p)
 	__attribute__((alias("thermal_genl_event_tz_trip_up")));
 
+int thermal_genl_event_threshold_delete(struct param *p)
+	__attribute__((alias("thermal_genl_event_threshold_add")));
+
+int thermal_genl_event_threshold_down(struct param *p)
+	__attribute__((alias("thermal_genl_event_threshold_up")));
+
 static cb_t event_cb[] = {
 	[THERMAL_GENL_EVENT_TZ_CREATE]		= thermal_genl_event_tz_create,
 	[THERMAL_GENL_EVENT_TZ_DELETE]		= thermal_genl_event_tz_delete,
@@ -259,6 +301,11 @@ static cb_t event_cb[] = {
 	[THERMAL_GENL_EVENT_CDEV_STATE_UPDATE]	= thermal_genl_event_cdev_state_update,
 	[THERMAL_GENL_EVENT_TZ_GOV_CHANGE]	= thermal_genl_event_gov_change,
 	[THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE] = thermal_genl_event_cpu_capability_change,
+	[THERMAL_GENL_EVENT_THRESHOLD_ADD]	= thermal_genl_event_threshold_add,
+	[THERMAL_GENL_EVENT_THRESHOLD_DELETE]	= thermal_genl_event_threshold_delete,
+	[THERMAL_GENL_EVENT_THRESHOLD_FLUSH]	= thermal_genl_event_threshold_flush,
+	[THERMAL_GENL_EVENT_THRESHOLD_DOWN]	= thermal_genl_event_threshold_down,
+	[THERMAL_GENL_EVENT_THRESHOLD_UP]	= thermal_genl_event_threshold_up,
 };
 
 /*
@@ -401,6 +448,43 @@ int thermal_genl_cpu_capability_event(int count,
 }
 EXPORT_SYMBOL_GPL(thermal_genl_cpu_capability_event);
 
+int thermal_notify_threshold_add(const struct thermal_zone_device *tz,
+				 int temperature, int direction)
+{
+	struct param p = { .tz_id = tz->id, .temp = temperature, .direction = direction };
+
+	return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_ADD, &p);
+}
+
+int thermal_notify_threshold_delete(const struct thermal_zone_device *tz,
+				    int temperature, int direction)
+{
+	struct param p = { .tz_id = tz->id, .temp = temperature, .direction = direction };
+
+	return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_DELETE, &p);
+}
+
+int thermal_notify_threshold_flush(const struct thermal_zone_device *tz)
+{
+	struct param p = { .tz_id = tz->id };
+
+	return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_FLUSH, &p);
+}
+
+int thermal_notify_threshold_down(const struct thermal_zone_device *tz)
+{
+	struct param p = { .tz_id = tz->id, .temp = tz->temperature, .prev_temp = tz->last_temperature };
+
+	return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_DOWN, &p);
+}
+
+int thermal_notify_threshold_up(const struct thermal_zone_device *tz)
+{
+	struct param p = { .tz_id = tz->id, .temp = tz->temperature, .prev_temp = tz->last_temperature };
+
+	return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_UP, &p);
+}
+
 /*************************** Command encoding ********************************/
 
 static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz,
@@ -563,12 +647,128 @@ out_cancel_nest:
 	return ret;
 }
 
+static int __thermal_genl_cmd_threshold_get(struct user_threshold *threshold, void *arg)
+{
+	struct sk_buff *msg = arg;
+
+	if (nla_put_u32(msg, THERMAL_GENL_ATTR_THRESHOLD_TEMP, threshold->temperature) ||
+	    nla_put_u32(msg, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, threshold->direction))
+		return -1;
+
+	return 0;
+}
+
+static int thermal_genl_cmd_threshold_get(struct param *p)
+{
+	struct sk_buff *msg = p->msg;
+	struct nlattr *start_trip;
+	int id, ret;
+
+	if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID])
+		return -EINVAL;
+
+	id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]);
+
+	CLASS(thermal_zone_get_by_id, tz)(id);
+	if (!tz)
+		return -EINVAL;
+
+	start_trip = nla_nest_start(msg, THERMAL_GENL_ATTR_THRESHOLD);
+	if (!start_trip)
+		return -EMSGSIZE;
+
+	ret = thermal_thresholds_for_each(tz, __thermal_genl_cmd_threshold_get, msg);
+	if (ret)
+		return -EMSGSIZE;
+
+	nla_nest_end(msg, start_trip);
+
+	return 0;
+}
+
+static int thermal_genl_cmd_threshold_add(struct param *p)
+{
+	int id, temp, direction;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID] ||
+	    !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP] ||
+	    !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION])
+		return -EINVAL;
+
+	id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]);
+	temp = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP]);
+	direction = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]);
+
+	CLASS(thermal_zone_get_by_id, tz)(id);
+	if (!tz)
+		return -EINVAL;
+
+	guard(thermal_zone)(tz);
+
+	return thermal_thresholds_add(tz, temp, direction);
+}
+
+static int thermal_genl_cmd_threshold_delete(struct param *p)
+{
+	int id, temp, direction;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID] ||
+	    !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP] ||
+	    !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION])
+		return -EINVAL;
+
+	id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]);
+	temp = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP]);
+	direction = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]);
+
+	CLASS(thermal_zone_get_by_id, tz)(id);
+	if (!tz)
+		return -EINVAL;
+
+	guard(thermal_zone)(tz);
+
+	return thermal_thresholds_delete(tz, temp, direction);
+}
+
+static int thermal_genl_cmd_threshold_flush(struct param *p)
+{
+	int id;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID])
+		return -EINVAL;
+
+	id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]);
+
+	CLASS(thermal_zone_get_by_id, tz)(id);
+	if (!tz)
+		return -EINVAL;
+
+	guard(thermal_zone)(tz);
+
+	thermal_thresholds_flush(tz);
+
+	return 0;
+}
+
 static cb_t cmd_cb[] = {
-	[THERMAL_GENL_CMD_TZ_GET_ID]	= thermal_genl_cmd_tz_get_id,
-	[THERMAL_GENL_CMD_TZ_GET_TRIP]	= thermal_genl_cmd_tz_get_trip,
-	[THERMAL_GENL_CMD_TZ_GET_TEMP]	= thermal_genl_cmd_tz_get_temp,
-	[THERMAL_GENL_CMD_TZ_GET_GOV]	= thermal_genl_cmd_tz_get_gov,
-	[THERMAL_GENL_CMD_CDEV_GET]	= thermal_genl_cmd_cdev_get,
+	[THERMAL_GENL_CMD_TZ_GET_ID]		= thermal_genl_cmd_tz_get_id,
+	[THERMAL_GENL_CMD_TZ_GET_TRIP]		= thermal_genl_cmd_tz_get_trip,
+	[THERMAL_GENL_CMD_TZ_GET_TEMP]		= thermal_genl_cmd_tz_get_temp,
+	[THERMAL_GENL_CMD_TZ_GET_GOV]		= thermal_genl_cmd_tz_get_gov,
+	[THERMAL_GENL_CMD_CDEV_GET]		= thermal_genl_cmd_cdev_get,
+	[THERMAL_GENL_CMD_THRESHOLD_GET]	= thermal_genl_cmd_threshold_get,
+	[THERMAL_GENL_CMD_THRESHOLD_ADD]	= thermal_genl_cmd_threshold_add,
+	[THERMAL_GENL_CMD_THRESHOLD_DELETE]	= thermal_genl_cmd_threshold_delete,
+	[THERMAL_GENL_CMD_THRESHOLD_FLUSH]	= thermal_genl_cmd_threshold_flush,
 };
 
 static int thermal_genl_cmd_dumpit(struct sk_buff *skb,
@@ -679,6 +879,26 @@ static const struct genl_small_ops thermal_genl_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.dumpit = thermal_genl_cmd_dumpit,
 	},
+	{
+		.cmd = THERMAL_GENL_CMD_THRESHOLD_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = thermal_genl_cmd_doit,
+	},
+	{
+		.cmd = THERMAL_GENL_CMD_THRESHOLD_ADD,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = thermal_genl_cmd_doit,
+	},
+	{
+		.cmd = THERMAL_GENL_CMD_THRESHOLD_DELETE,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = thermal_genl_cmd_doit,
+	},
+	{
+		.cmd = THERMAL_GENL_CMD_THRESHOLD_FLUSH,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = thermal_genl_cmd_doit,
+	},
 };
 
 static struct genl_family thermal_genl_family __ro_after_init = {
@@ -691,7 +911,7 @@ static struct genl_family thermal_genl_family __ro_after_init = {
 	.unbind		= thermal_genl_unbind,
 	.small_ops	= thermal_genl_ops,
 	.n_small_ops	= ARRAY_SIZE(thermal_genl_ops),
-	.resv_start_op	= THERMAL_GENL_CMD_CDEV_GET + 1,
+	.resv_start_op	= __THERMAL_GENL_CMD_MAX,
 	.mcgrps		= thermal_genl_mcgrps,
 	.n_mcgrps	= ARRAY_SIZE(thermal_genl_mcgrps),
 };
diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h
index e01221e8816b..075e9ae85f3d 100644
--- a/drivers/thermal/thermal_netlink.h
+++ b/drivers/thermal/thermal_netlink.h
@@ -53,6 +53,13 @@ int thermal_notify_tz_gov_change(const struct thermal_zone_device *tz,
 int thermal_genl_sampling_temp(int id, int temp);
 int thermal_genl_cpu_capability_event(int count,
 				      struct thermal_genl_cpu_caps *caps);
+int thermal_notify_threshold_add(const struct thermal_zone_device *tz,
+				 int temperature, int direction);
+int thermal_notify_threshold_delete(const struct thermal_zone_device *tz,
+				    int temperature, int direction);
+int thermal_notify_threshold_flush(const struct thermal_zone_device *tz);
+int thermal_notify_threshold_down(const struct thermal_zone_device *tz);
+int thermal_notify_threshold_up(const struct thermal_zone_device *tz);
 #else
 static inline int thermal_netlink_init(void)
 {
@@ -139,6 +146,33 @@ static inline int thermal_genl_cpu_capability_event(int count, struct thermal_ge
 	return 0;
 }
 
+static inline int thermal_notify_threshold_add(const struct thermal_zone_device *tz,
+					       int temperature, int direction)
+{
+	return 0;
+}
+
+static inline int thermal_notify_threshold_delete(const struct thermal_zone_device *tz,
+						  int temperature, int direction)
+{
+	return 0;
+}
+
+static inline int thermal_notify_threshold_flush(const struct thermal_zone_device *tz)
+{
+	return 0;
+}
+
+static inline int thermal_notify_threshold_down(const struct thermal_zone_device *tz)
+{
+	return 0;
+}
+
+static inline int thermal_notify_threshold_up(const struct thermal_zone_device *tz)
+{
+	return 0;
+}
+
 static inline void __init thermal_netlink_exit(void) {}
 
 #endif /* CONFIG_THERMAL_NETLINK */
diff --git a/drivers/thermal/thermal_thresholds.c b/drivers/thermal/thermal_thresholds.c
index f33b6d5474d8..9b063199a789 100644
--- a/drivers/thermal/thermal_thresholds.c
+++ b/drivers/thermal/thermal_thresholds.c
@@ -32,6 +32,8 @@ void thermal_thresholds_flush(struct thermal_zone_device *tz)
 		kfree(entry);
 	}
 
+	thermal_notify_threshold_flush(tz);
+
 	__thermal_zone_device_update(tz, THERMAL_TZ_FLUSH_THRESHOLDS);
 }
 
@@ -122,7 +124,6 @@ void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *hi
 
 	int temperature = tz->temperature;
 	int last_temperature = tz->last_temperature;
-	bool notify;
 
 	lockdep_assert_held(&tz->lock);
 
@@ -144,19 +145,19 @@ void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *hi
 	 * - increased : thresholds are crossed the way up
 	 * - decreased : thresholds are crossed the way down
 	 */
-	if (temperature > last_temperature)
-		notify = thermal_thresholds_handle_raising(thresholds, temperature,
-							   last_temperature, low, high);
-	else
-		notify = thermal_thresholds_handle_dropping(thresholds, temperature,
-							    last_temperature, low, high);
-
-	if (notify)
-		pr_debug("A threshold has been crossed the way %s, with a temperature=%d, last_temperature=%d\n",
-			 temperature > last_temperature ? "up" : "down", temperature, last_temperature);
+	if (temperature > last_temperature) {
+		if (thermal_thresholds_handle_raising(thresholds, temperature,
+						      last_temperature, low, high))
+			thermal_notify_threshold_up(tz);
+	} else {
+		if (thermal_thresholds_handle_dropping(thresholds, temperature,
+						       last_temperature, low, high))
+			thermal_notify_threshold_down(tz);
+	}
 }
 
-int thermal_thresholds_add(struct thermal_zone_device *tz, int temperature, int direction)
+int thermal_thresholds_add(struct thermal_zone_device *tz,
+			   int temperature, int direction)
 {
 	struct list_head *thresholds = &tz->user_thresholds;
 	struct user_threshold *t;
@@ -182,12 +183,15 @@ int thermal_thresholds_add(struct thermal_zone_device *tz, int temperature, int
 		list_sort(NULL, thresholds, __thermal_thresholds_cmp);
 	}
 
+	thermal_notify_threshold_add(tz, temperature, direction);
+
 	__thermal_zone_device_update(tz, THERMAL_TZ_ADD_THRESHOLD);
 
 	return 0;
 }
 
-int thermal_thresholds_delete(struct thermal_zone_device *tz, int temperature, int direction)
+int thermal_thresholds_delete(struct thermal_zone_device *tz,
+			      int temperature, int direction)
 {
 	struct list_head *thresholds = &tz->user_thresholds;
 	struct user_threshold *t;
@@ -205,6 +209,8 @@ int thermal_thresholds_delete(struct thermal_zone_device *tz, int temperature, i
 		t->direction &= ~direction;
 	}
 
+	thermal_notify_threshold_delete(tz, temperature, direction);
+
 	__thermal_zone_device_update(tz, THERMAL_TZ_DEL_THRESHOLD);
 
 	return 0;
@@ -217,7 +223,7 @@ int thermal_thresholds_for_each(struct thermal_zone_device *tz,
 	struct user_threshold *entry;
 	int ret;
 
-	lockdep_assert_held(&tz->lock);
+	guard(thermal_zone)(tz);
 
 	list_for_each_entry(entry, thresholds, list_node) {
 		ret = cb(entry, arg);
diff --git a/drivers/thermal/thermal_thresholds.h b/drivers/thermal/thermal_thresholds.h
index 232f4e8089af..cb372659a20d 100644
--- a/drivers/thermal/thermal_thresholds.h
+++ b/drivers/thermal/thermal_thresholds.h
@@ -10,8 +10,8 @@ struct user_threshold {
 
 int thermal_thresholds_init(struct thermal_zone_device *tz);
 void thermal_thresholds_exit(struct thermal_zone_device *tz);
-void thermal_thresholds_flush(struct thermal_zone_device *tz);
 void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high);
+void thermal_thresholds_flush(struct thermal_zone_device *tz);
 int thermal_thresholds_add(struct thermal_zone_device *tz, int temperature, int direction);
 int thermal_thresholds_delete(struct thermal_zone_device *tz, int temperature, int direction);
 int thermal_thresholds_for_each(struct thermal_zone_device *tz,
diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h
index 2e6f60a36173..ba8604bdf206 100644
--- a/include/uapi/linux/thermal.h
+++ b/include/uapi/linux/thermal.h
@@ -20,7 +20,7 @@ enum thermal_trip_type {
 
 /* Adding event notification support elements */
 #define THERMAL_GENL_FAMILY_NAME		"thermal"
-#define THERMAL_GENL_VERSION			0x01
+#define THERMAL_GENL_VERSION			0x02
 #define THERMAL_GENL_SAMPLING_GROUP_NAME	"sampling"
 #define THERMAL_GENL_EVENT_GROUP_NAME		"event"
 
@@ -30,6 +30,7 @@ enum thermal_genl_attr {
 	THERMAL_GENL_ATTR_TZ,
 	THERMAL_GENL_ATTR_TZ_ID,
 	THERMAL_GENL_ATTR_TZ_TEMP,
+	THERMAL_GENL_ATTR_TZ_PREV_TEMP,
 	THERMAL_GENL_ATTR_TZ_TRIP,
 	THERMAL_GENL_ATTR_TZ_TRIP_ID,
 	THERMAL_GENL_ATTR_TZ_TRIP_TYPE,
@@ -50,6 +51,9 @@ enum thermal_genl_attr {
 	THERMAL_GENL_ATTR_CPU_CAPABILITY_ID,
 	THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE,
 	THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY,
+	THERMAL_GENL_ATTR_THRESHOLD,
+	THERMAL_GENL_ATTR_THRESHOLD_TEMP,
+	THERMAL_GENL_ATTR_THRESHOLD_DIRECTION,
 	__THERMAL_GENL_ATTR_MAX,
 };
 #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1)
@@ -77,6 +81,11 @@ enum thermal_genl_event {
 	THERMAL_GENL_EVENT_CDEV_STATE_UPDATE,	/* Cdev state updated */
 	THERMAL_GENL_EVENT_TZ_GOV_CHANGE,	/* Governor policy changed  */
 	THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE,	/* CPU capability changed */
+	THERMAL_GENL_EVENT_THRESHOLD_ADD,	/* A threshold has been added */
+	THERMAL_GENL_EVENT_THRESHOLD_DELETE,	/* A threshold has been deleted */
+	THERMAL_GENL_EVENT_THRESHOLD_FLUSH,	/* All thresholds have been deleted */
+	THERMAL_GENL_EVENT_THRESHOLD_UP,	/* A threshold has been crossed the way up */
+	THERMAL_GENL_EVENT_THRESHOLD_DOWN,	/* A threshold has been crossed the way down */
 	__THERMAL_GENL_EVENT_MAX,
 };
 #define THERMAL_GENL_EVENT_MAX (__THERMAL_GENL_EVENT_MAX - 1)
@@ -84,12 +93,16 @@ enum thermal_genl_event {
 /* Commands supported by the thermal_genl_family */
 enum thermal_genl_cmd {
 	THERMAL_GENL_CMD_UNSPEC,
-	THERMAL_GENL_CMD_TZ_GET_ID,	/* List of thermal zones id */
-	THERMAL_GENL_CMD_TZ_GET_TRIP,	/* List of thermal trips */
-	THERMAL_GENL_CMD_TZ_GET_TEMP,	/* Get the thermal zone temperature */
-	THERMAL_GENL_CMD_TZ_GET_GOV,	/* Get the thermal zone governor */
-	THERMAL_GENL_CMD_TZ_GET_MODE,	/* Get the thermal zone mode */
-	THERMAL_GENL_CMD_CDEV_GET,	/* List of cdev id */
+	THERMAL_GENL_CMD_TZ_GET_ID,		/* List of thermal zones id */
+	THERMAL_GENL_CMD_TZ_GET_TRIP,		/* List of thermal trips */
+	THERMAL_GENL_CMD_TZ_GET_TEMP,		/* Get the thermal zone temperature */
+	THERMAL_GENL_CMD_TZ_GET_GOV,		/* Get the thermal zone governor */
+	THERMAL_GENL_CMD_TZ_GET_MODE,		/* Get the thermal zone mode */
+	THERMAL_GENL_CMD_CDEV_GET,		/* List of cdev id */
+	THERMAL_GENL_CMD_THRESHOLD_GET,		/* List of thresholds */
+	THERMAL_GENL_CMD_THRESHOLD_ADD,		/* Add a threshold */
+	THERMAL_GENL_CMD_THRESHOLD_DELETE,	/* Delete a threshold */
+	THERMAL_GENL_CMD_THRESHOLD_FLUSH,	/* Flush all the thresholds */
 	__THERMAL_GENL_CMD_MAX,
 };
 #define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1)
-- 
cgit v1.2.3


From 09d6775f503b393d0457c7126aa43208e1724004 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel.holland@sifive.com>
Date: Wed, 16 Oct 2024 13:27:45 -0700
Subject: riscv: Add support for userspace pointer masking

RISC-V supports pointer masking with a variable number of tag bits
(which is called "PMLEN" in the specification) and which is configured
at the next higher privilege level.

Wire up the PR_SET_TAGGED_ADDR_CTRL and PR_GET_TAGGED_ADDR_CTRL prctls
so userspace can request a lower bound on the number of tag bits and
determine the actual number of tag bits. As with arm64's
PR_TAGGED_ADDR_ENABLE, the pointer masking configuration is
thread-scoped, inherited on clone() and fork() and cleared on execve().

Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Tested-by: Charlie Jenkins <charlie@rivosinc.com>
Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Link: https://lore.kernel.org/r/20241016202814.4061541-5-samuel.holland@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 Documentation/arch/riscv/uabi.rst  | 12 +++++
 arch/riscv/Kconfig                 | 11 +++++
 arch/riscv/include/asm/processor.h |  8 ++++
 arch/riscv/include/asm/switch_to.h | 11 +++++
 arch/riscv/kernel/process.c        | 91 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h         |  5 ++-
 6 files changed, 137 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/arch/riscv/uabi.rst b/Documentation/arch/riscv/uabi.rst
index 2b420bab0527..ddb8359a46ed 100644
--- a/Documentation/arch/riscv/uabi.rst
+++ b/Documentation/arch/riscv/uabi.rst
@@ -68,3 +68,15 @@ Misaligned accesses
 Misaligned scalar accesses are supported in userspace, but they may perform
 poorly.  Misaligned vector accesses are only supported if the Zicclsm extension
 is supported.
+
+Pointer masking
+---------------
+
+Support for pointer masking in userspace (the Supm extension) is provided via
+the ``PR_SET_TAGGED_ADDR_CTRL`` and ``PR_GET_TAGGED_ADDR_CTRL`` ``prctl()``
+operations. Pointer masking is disabled by default. To enable it, userspace
+must call ``PR_SET_TAGGED_ADDR_CTRL`` with the ``PR_PMLEN`` field set to the
+number of mask/tag bits needed by the application. ``PR_PMLEN`` is interpreted
+as a lower bound; if the kernel is unable to satisfy the request, the
+``PR_SET_TAGGED_ADDR_CTRL`` operation will fail. The actual number of tag bits
+is returned in ``PR_PMLEN`` by the ``PR_GET_TAGGED_ADDR_CTRL`` operation.
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 22dc5ea4196c..0ef449465378 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -531,6 +531,17 @@ config RISCV_ISA_C
 
 	  If you don't know what to do here, say Y.
 
+config RISCV_ISA_SUPM
+	bool "Supm extension for userspace pointer masking"
+	depends on 64BIT
+	default y
+	help
+	  Add support for pointer masking in userspace (Supm) when the
+	  underlying hardware extension (Smnpm or Ssnpm) is detected at boot.
+
+	  If this option is disabled, userspace will be unable to use
+	  the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.
+
 config RISCV_ISA_SVNAPOT
 	bool "Svnapot extension support for supervisor mode NAPOT pages"
 	depends on 64BIT && MMU
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index c1a492508835..5f56eb9d114a 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -178,6 +178,14 @@ extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val);
 #define RISCV_SET_ICACHE_FLUSH_CTX(arg1, arg2)	riscv_set_icache_flush_ctx(arg1, arg2)
 extern int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long per_thread);
 
+#ifdef CONFIG_RISCV_ISA_SUPM
+/* PR_{SET,GET}_TAGGED_ADDR_CTRL prctl */
+long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg);
+long get_tagged_addr_ctrl(struct task_struct *task);
+#define SET_TAGGED_ADDR_CTRL(arg)	set_tagged_addr_ctrl(current, arg)
+#define GET_TAGGED_ADDR_CTRL()		get_tagged_addr_ctrl(current)
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_RISCV_PROCESSOR_H */
diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h
index 9685cd85e57c..94e33216b2d9 100644
--- a/arch/riscv/include/asm/switch_to.h
+++ b/arch/riscv/include/asm/switch_to.h
@@ -70,6 +70,17 @@ static __always_inline bool has_fpu(void) { return false; }
 #define __switch_to_fpu(__prev, __next) do { } while (0)
 #endif
 
+static inline void envcfg_update_bits(struct task_struct *task,
+				      unsigned long mask, unsigned long val)
+{
+	unsigned long envcfg;
+
+	envcfg = (task->thread.envcfg & ~mask) | val;
+	task->thread.envcfg = envcfg;
+	if (task == current)
+		csr_write(CSR_ENVCFG, envcfg);
+}
+
 static inline void __switch_to_envcfg(struct task_struct *next)
 {
 	asm volatile (ALTERNATIVE("nop", "csrw " __stringify(CSR_ENVCFG) ", %0",
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index e3142d8a6e28..200d2ed64dfe 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -7,6 +7,7 @@
  * Copyright (C) 2017 SiFive
  */
 
+#include <linux/bitfield.h>
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -180,6 +181,10 @@ void flush_thread(void)
 	memset(&current->thread.vstate, 0, sizeof(struct __riscv_v_ext_state));
 	clear_tsk_thread_flag(current, TIF_RISCV_V_DEFER_RESTORE);
 #endif
+#ifdef CONFIG_RISCV_ISA_SUPM
+	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM))
+		envcfg_update_bits(current, ENVCFG_PMM, ENVCFG_PMM_PMLEN_0);
+#endif
 }
 
 void arch_release_task_struct(struct task_struct *tsk)
@@ -242,3 +247,89 @@ void __init arch_task_cache_init(void)
 {
 	riscv_v_setup_ctx_cache();
 }
+
+#ifdef CONFIG_RISCV_ISA_SUPM
+enum {
+	PMLEN_0 = 0,
+	PMLEN_7 = 7,
+	PMLEN_16 = 16,
+};
+
+static bool have_user_pmlen_7;
+static bool have_user_pmlen_16;
+
+long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg)
+{
+	unsigned long valid_mask = PR_PMLEN_MASK;
+	struct thread_info *ti = task_thread_info(task);
+	unsigned long pmm;
+	u8 pmlen;
+
+	if (is_compat_thread(ti))
+		return -EINVAL;
+
+	if (arg & ~valid_mask)
+		return -EINVAL;
+
+	/*
+	 * Prefer the smallest PMLEN that satisfies the user's request,
+	 * in case choosing a larger PMLEN has a performance impact.
+	 */
+	pmlen = FIELD_GET(PR_PMLEN_MASK, arg);
+	if (pmlen == PMLEN_0)
+		pmm = ENVCFG_PMM_PMLEN_0;
+	else if (pmlen <= PMLEN_7 && have_user_pmlen_7)
+		pmm = ENVCFG_PMM_PMLEN_7;
+	else if (pmlen <= PMLEN_16 && have_user_pmlen_16)
+		pmm = ENVCFG_PMM_PMLEN_16;
+	else
+		return -EINVAL;
+
+	envcfg_update_bits(task, ENVCFG_PMM, pmm);
+
+	return 0;
+}
+
+long get_tagged_addr_ctrl(struct task_struct *task)
+{
+	struct thread_info *ti = task_thread_info(task);
+	long ret = 0;
+
+	if (is_compat_thread(ti))
+		return -EINVAL;
+
+	switch (task->thread.envcfg & ENVCFG_PMM) {
+	case ENVCFG_PMM_PMLEN_7:
+		ret = FIELD_PREP(PR_PMLEN_MASK, PMLEN_7);
+		break;
+	case ENVCFG_PMM_PMLEN_16:
+		ret = FIELD_PREP(PR_PMLEN_MASK, PMLEN_16);
+		break;
+	}
+
+	return ret;
+}
+
+static bool try_to_set_pmm(unsigned long value)
+{
+	csr_set(CSR_ENVCFG, value);
+	return (csr_read_clear(CSR_ENVCFG, ENVCFG_PMM) & ENVCFG_PMM) == value;
+}
+
+static int __init tagged_addr_init(void)
+{
+	if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM))
+		return 0;
+
+	/*
+	 * envcfg.PMM is a WARL field. Detect which values are supported.
+	 * Assume the supported PMLEN values are the same on all harts.
+	 */
+	csr_clear(CSR_ENVCFG, ENVCFG_PMM);
+	have_user_pmlen_7 = try_to_set_pmm(ENVCFG_PMM_PMLEN_7);
+	have_user_pmlen_16 = try_to_set_pmm(ENVCFG_PMM_PMLEN_16);
+
+	return 0;
+}
+core_initcall(tagged_addr_init);
+#endif	/* CONFIG_RISCV_ISA_SUPM */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879..cefd656ebf43 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -230,7 +230,7 @@ struct prctl_mm_map {
 # define PR_PAC_APDBKEY			(1UL << 3)
 # define PR_PAC_APGAKEY			(1UL << 4)
 
-/* Tagged user address controls for arm64 */
+/* Tagged user address controls for arm64 and RISC-V */
 #define PR_SET_TAGGED_ADDR_CTRL		55
 #define PR_GET_TAGGED_ADDR_CTRL		56
 # define PR_TAGGED_ADDR_ENABLE		(1UL << 0)
@@ -244,6 +244,9 @@ struct prctl_mm_map {
 # define PR_MTE_TAG_MASK		(0xffffUL << PR_MTE_TAG_SHIFT)
 /* Unused; kept only for source compatibility */
 # define PR_MTE_TCF_SHIFT		1
+/* RISC-V pointer masking tag length */
+# define PR_PMLEN_SHIFT			24
+# define PR_PMLEN_MASK			(0x7fUL << PR_PMLEN_SHIFT)
 
 /* Control reclaim behavior when allocating memory */
 #define PR_SET_IO_FLUSHER		57
-- 
cgit v1.2.3


From 78844482a1c939a972681842f8ee2a8ddb202441 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel.holland@sifive.com>
Date: Wed, 16 Oct 2024 13:27:47 -0700
Subject: riscv: Allow ptrace control of the tagged address ABI

This allows a tracer to control the ABI of the tracee, as on arm64.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Link: https://lore.kernel.org/r/20241016202814.4061541-7-samuel.holland@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/ptrace.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/elf.h   |  1 +
 2 files changed, 43 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index 92731ff8c79a..ea67e9fb7a58 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -28,6 +28,9 @@ enum riscv_regset {
 #ifdef CONFIG_RISCV_ISA_V
 	REGSET_V,
 #endif
+#ifdef CONFIG_RISCV_ISA_SUPM
+	REGSET_TAGGED_ADDR_CTRL,
+#endif
 };
 
 static int riscv_gpr_get(struct task_struct *target,
@@ -152,6 +155,35 @@ static int riscv_vr_set(struct task_struct *target,
 }
 #endif
 
+#ifdef CONFIG_RISCV_ISA_SUPM
+static int tagged_addr_ctrl_get(struct task_struct *target,
+				const struct user_regset *regset,
+				struct membuf to)
+{
+	long ctrl = get_tagged_addr_ctrl(target);
+
+	if (IS_ERR_VALUE(ctrl))
+		return ctrl;
+
+	return membuf_write(&to, &ctrl, sizeof(ctrl));
+}
+
+static int tagged_addr_ctrl_set(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+	long ctrl;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1);
+	if (ret)
+		return ret;
+
+	return set_tagged_addr_ctrl(target, ctrl);
+}
+#endif
+
 static const struct user_regset riscv_user_regset[] = {
 	[REGSET_X] = {
 		.core_note_type = NT_PRSTATUS,
@@ -182,6 +214,16 @@ static const struct user_regset riscv_user_regset[] = {
 		.set = riscv_vr_set,
 	},
 #endif
+#ifdef CONFIG_RISCV_ISA_SUPM
+	[REGSET_TAGGED_ADDR_CTRL] = {
+		.core_note_type = NT_RISCV_TAGGED_ADDR_CTRL,
+		.n = 1,
+		.size = sizeof(long),
+		.align = sizeof(long),
+		.regset_get = tagged_addr_ctrl_get,
+		.set = tagged_addr_ctrl_set,
+	},
+#endif
 };
 
 static const struct user_regset_view riscv_user_native_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b9935988da5c..a920cf8934dc 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -450,6 +450,7 @@ typedef struct elf64_shdr {
 #define NT_MIPS_MSA	0x802		/* MIPS SIMD registers */
 #define NT_RISCV_CSR	0x900		/* RISC-V Control and Status Registers */
 #define NT_RISCV_VECTOR	0x901		/* RISC-V vector registers */
+#define NT_RISCV_TAGGED_ADDR_CTRL 0x902	/* RISC-V tagged address control (prctl()) */
 #define NT_LOONGARCH_CPUCFG	0xa00	/* LoongArch CPU config registers */
 #define NT_LOONGARCH_CSR	0xa01	/* LoongArch control and status registers */
 #define NT_LOONGARCH_LSX	0xa02	/* LoongArch Loongson SIMD Extension registers */
-- 
cgit v1.2.3


From 2f2d46959808e9b039ecb241ff13d50be2d6e231 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sat, 19 Oct 2024 18:15:42 +0100
Subject: firmware/psci: Add definitions for PSCI v1.3 specification

The v1.3 PSCI spec (https://developer.arm.com/documentation/den0022) adds
the SYSTEM_OFF2 function. Add definitions for it and its hibernation type
parameter.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Miguel Luis <miguel.luis@oracle.com>
Link: https://lore.kernel.org/r/20241019172459.2241939-2-dwmw2@infradead.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 include/uapi/linux/psci.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h
index 42a40ad3fb62..81759ff385e6 100644
--- a/include/uapi/linux/psci.h
+++ b/include/uapi/linux/psci.h
@@ -59,6 +59,7 @@
 #define PSCI_1_1_FN_SYSTEM_RESET2		PSCI_0_2_FN(18)
 #define PSCI_1_1_FN_MEM_PROTECT			PSCI_0_2_FN(19)
 #define PSCI_1_1_FN_MEM_PROTECT_CHECK_RANGE	PSCI_0_2_FN(20)
+#define PSCI_1_3_FN_SYSTEM_OFF2			PSCI_0_2_FN(21)
 
 #define PSCI_1_0_FN64_CPU_DEFAULT_SUSPEND	PSCI_0_2_FN64(12)
 #define PSCI_1_0_FN64_NODE_HW_STATE		PSCI_0_2_FN64(13)
@@ -68,6 +69,7 @@
 
 #define PSCI_1_1_FN64_SYSTEM_RESET2		PSCI_0_2_FN64(18)
 #define PSCI_1_1_FN64_MEM_PROTECT_CHECK_RANGE	PSCI_0_2_FN64(20)
+#define PSCI_1_3_FN64_SYSTEM_OFF2		PSCI_0_2_FN64(21)
 
 /* PSCI v0.2 power state encoding for CPU_SUSPEND function */
 #define PSCI_0_2_POWER_STATE_ID_MASK		0xffff
@@ -100,6 +102,9 @@
 #define PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET	0
 #define PSCI_1_1_RESET_TYPE_VENDOR_START	0x80000000U
 
+/* PSCI v1.3 hibernate type for SYSTEM_OFF2 */
+#define PSCI_1_3_OFF_TYPE_HIBERNATE_OFF		BIT(0)
+
 /* PSCI version decoding (independent of PSCI version) */
 #define PSCI_VERSION_MAJOR_SHIFT		16
 #define PSCI_VERSION_MINOR_MASK			\
-- 
cgit v1.2.3


From f4986a72d6e4be78ec0e4ee0e03531474621183f Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 25 Oct 2024 06:11:57 -0700
Subject: iommufd: Add IOMMU_IOAS_MAP_FILE

Define the IOMMU_IOAS_MAP_FILE ioctl interface, which allows a user to
register memory by passing a memfd plus offset and length.  Implement it
using the memfd_pin_folios() kAPI.

Link: https://patch.msgid.link/r/1729861919-234514-8-git-send-email-steven.sistare@oracle.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/io_pagetable.c    | 36 ++++++++++++++++++++++++-
 drivers/iommu/iommufd/io_pagetable.h    |  2 ++
 drivers/iommu/iommufd/ioas.c            | 47 +++++++++++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  5 ++++
 drivers/iommu/iommufd/main.c            |  2 ++
 drivers/iommu/iommufd/pages.c           | 23 ++++++++++++++++
 include/uapi/linux/iommufd.h            | 25 ++++++++++++++++++
 7 files changed, 139 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 874ee9ea7efc..8a790e597e12 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -268,7 +268,14 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
 		/* Use the first entry to guess the ideal IOVA alignment */
 		elm = list_first_entry(pages_list, struct iopt_pages_list,
 				       next);
-		start = elm->start_byte + (uintptr_t)elm->pages->uptr;
+		switch (elm->pages->type) {
+		case IOPT_ADDRESS_USER:
+			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
+			break;
+		case IOPT_ADDRESS_FILE:
+			start = elm->start_byte + elm->pages->start;
+			break;
+		}
 		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
 		if (rc)
 			goto out_unlock;
@@ -446,6 +453,33 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
 			       uptr - pages->uptr, iommu_prot, flags);
 }
 
+/**
+ * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
+ * @ictx: iommufd_ctx the iopt is part of
+ * @iopt: io_pagetable to act on
+ * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
+ *        the chosen iova on output. Otherwise is the iova to map to on input
+ * @file: file to map
+ * @start: map file starting at this byte offset
+ * @length: Number of bytes to map
+ * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
+ * @flags: IOPT_ALLOC_IOVA or zero
+ */
+int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+			unsigned long *iova, struct file *file,
+			unsigned long start, unsigned long length,
+			int iommu_prot, unsigned int flags)
+{
+	struct iopt_pages *pages;
+
+	pages = iopt_alloc_file_pages(file, start, length,
+				      iommu_prot & IOMMU_WRITE);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+	return iopt_map_common(ictx, iopt, pages, iova, length,
+			       start - pages->start, iommu_prot, flags);
+}
+
 struct iova_bitmap_fn_arg {
 	unsigned long flags;
 	struct io_pagetable *iopt;
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 5ac4eedc0be3..9b40b2237932 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -220,6 +220,8 @@ struct iopt_pages {
 
 struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
 					 unsigned long length, bool writable);
+struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+					 unsigned long length, bool writable);
 void iopt_release_pages(struct kref *kref);
 static inline void iopt_put_pages(struct iopt_pages *pages)
 {
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 2c4b2bb11e78..c05d33fc3a50 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -2,6 +2,7 @@
 /*
  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
  */
+#include <linux/file.h>
 #include <linux/interval_tree.h>
 #include <linux/iommu.h>
 #include <linux/iommufd.h>
@@ -197,6 +198,52 @@ static int conv_iommu_prot(u32 map_flags)
 	return iommu_prot;
 }
 
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_map_file *cmd = ucmd->cmd;
+	unsigned long iova = cmd->iova;
+	struct iommufd_ioas *ioas;
+	unsigned int flags = 0;
+	struct file *file;
+	int rc;
+
+	if (cmd->flags &
+	     ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+	       IOMMU_IOAS_MAP_READABLE))
+		return -EOPNOTSUPP;
+
+	if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+		return -EOVERFLOW;
+
+	if (!(cmd->flags &
+	      (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)))
+		return -EINVAL;
+
+	ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
+	if (IS_ERR(ioas))
+		return PTR_ERR(ioas);
+
+	if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+		flags = IOPT_ALLOC_IOVA;
+
+	file = fget(cmd->fd);
+	if (!file)
+		return -EBADF;
+
+	rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file,
+				 cmd->start, cmd->length,
+				 conv_iommu_prot(cmd->flags), flags);
+	if (rc)
+		goto out_put;
+
+	cmd->iova = iova;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+	iommufd_put_object(ucmd->ictx, &ioas->obj);
+	fput(file);
+	return rc;
+}
+
 int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
 {
 	struct iommu_ioas_map *cmd = ucmd->cmd;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f1d865e6fab6..8f3c21a664bd 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -69,6 +69,10 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
 			unsigned long *iova, void __user *uptr,
 			unsigned long length, int iommu_prot,
 			unsigned int flags);
+int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
+			unsigned long *iova, struct file *file,
+			unsigned long start, unsigned long length,
+			int iommu_prot, unsigned int flags);
 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
 		   unsigned long length, unsigned long *dst_iova,
 		   int iommu_prot, unsigned int flags);
@@ -276,6 +280,7 @@ void iommufd_ioas_destroy(struct iommufd_object *obj);
 int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index b5f5d27ee963..826a2b2be52f 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -378,6 +378,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 struct iommu_ioas_iova_ranges, out_iova_alignment),
 	IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
 		 iova),
+	IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file,
+		 struct iommu_ioas_map_file, iova),
 	IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
 		 length),
 	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 5f371fa88a4a..2ee6fcd2b551 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -45,6 +45,7 @@
  * last_iova + 1 can overflow. An iopt_pages index will always be much less than
  * ULONG_MAX so last_index + 1 cannot overflow.
  */
+#include <linux/file.h>
 #include <linux/highmem.h>
 #include <linux/iommu.h>
 #include <linux/iommufd.h>
@@ -1340,6 +1341,26 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
 	return pages;
 }
 
+struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start,
+					 unsigned long length, bool writable)
+
+{
+	struct iopt_pages *pages;
+	unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE);
+	unsigned long end;
+
+	if (length && check_add_overflow(start, length - 1, &end))
+		return ERR_PTR(-EOVERFLOW);
+
+	pages = iopt_alloc_pages(start - start_down, length, writable);
+	if (IS_ERR(pages))
+		return pages;
+	pages->file = get_file(file);
+	pages->start = start_down;
+	pages->type = IOPT_ADDRESS_FILE;
+	return pages;
+}
+
 void iopt_release_pages(struct kref *kref)
 {
 	struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);
@@ -1352,6 +1373,8 @@ void iopt_release_pages(struct kref *kref)
 	mutex_destroy(&pages->mutex);
 	put_task_struct(pages->source_task);
 	free_uid(pages->source_user);
+	if (pages->type == IOPT_ADDRESS_FILE)
+		fput(pages->file);
 	kfree(pages);
 }
 
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 72010f71c5e4..41b1a01e9293 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -51,6 +51,7 @@ enum {
 	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c,
 	IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
 	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
+	IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
 };
 
 /**
@@ -213,6 +214,30 @@ struct iommu_ioas_map {
 };
 #define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
 
+/**
+ * struct iommu_ioas_map_file - ioctl(IOMMU_IOAS_MAP_FILE)
+ * @size: sizeof(struct iommu_ioas_map_file)
+ * @flags: same as for iommu_ioas_map
+ * @ioas_id: same as for iommu_ioas_map
+ * @fd: the memfd to map
+ * @start: byte offset from start of file to map from
+ * @length: same as for iommu_ioas_map
+ * @iova: same as for iommu_ioas_map
+ *
+ * Set an IOVA mapping from a memfd file.  All other arguments and semantics
+ * match those of IOMMU_IOAS_MAP.
+ */
+struct iommu_ioas_map_file {
+	__u32 size;
+	__u32 flags;
+	__u32 ioas_id;
+	__s32 fd;
+	__aligned_u64 start;
+	__aligned_u64 length;
+	__aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP_FILE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP_FILE)
+
 /**
  * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
  * @size: sizeof(struct iommu_ioas_copy)
-- 
cgit v1.2.3


From cb67ff6272eceb5fcb2fe3b74f0293fa0706841a Mon Sep 17 00:00:00 2001
From: Jonathan Kim <jonathan.kim@amd.com>
Date: Tue, 22 Oct 2024 12:30:50 -0400
Subject: drm/amdkfd: flag per-queue reset support for gfx9

Flag KFD support for per-queue reset on GFX9 devices.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Harish Kasiviswanathan <harish.kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 2 ++
 include/uapi/linux/kfd_sysfs.h            | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3871591c9aec..9476e30d6baa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1998,6 +1998,8 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
 		if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 4, 2))
 			dev->node_props.capability |=
 				HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+
+		dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
 	} else {
 		dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 |
 					HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h
index 5e8d28617efa..859b8e91d4d3 100644
--- a/include/uapi/linux/kfd_sysfs.h
+++ b/include/uapi/linux/kfd_sysfs.h
@@ -60,7 +60,8 @@
 #define HSA_CAP_FLAGS_COHERENTHOSTACCESS			0x10000000
 #define HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED			0x20000000
 #define HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED	0x40000000
-#define HSA_CAP_RESERVED					0x800f8000
+#define HSA_CAP_PER_QUEUE_RESET_SUPPORTED			0x80000000
+#define HSA_CAP_RESERVED					0x000f8000
 
 /* debug_prop bits in node properties */
 #define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK     0x0000000f
-- 
cgit v1.2.3


From b7a0855eb95f6db8ac8e17596e76f7b94a790fe6 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@ziepe.ca>
Date: Mon, 28 Oct 2024 09:38:01 +0000
Subject: iommu: Add new flag to explicitly request PASID capable domain

Introduce a new flag (IOMMU_HWPT_ALLOC_PASID) for the domain_alloc_users()
op. If the IOMMU supports PASID it will allocate the domain; otherwise it
returns an error. In the error path, check for -EOPNOTSUPP and try to
allocate a non-PASID domain so that DMA-API mode also works fine for
drivers which do not support PASID.

Also modify __iommu_group_alloc_default_domain() to call
iommu_paging_domain_alloc_flags() with appropriate flag when allocating
paging domain.

Signed-off-by: Jason Gunthorpe <jgg@ziepe.ca>
Co-developed-by: Vasant Hegde <vasant.hegde@amd.com>
Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20241028093810.5901-4-vasant.hegde@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c        | 56 +++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/iommufd.h |  8 +++++++
 2 files changed, 53 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d82aa6563d84..97ad43144736 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -32,6 +32,7 @@
 #include <trace/events/iommu.h>
 #include <linux/sched/mm.h>
 #include <linux/msi.h>
+#include <uapi/linux/iommufd.h>
 
 #include "dma-iommu.h"
 #include "iommu-priv.h"
@@ -99,6 +100,9 @@ static int __iommu_attach_device(struct iommu_domain *domain,
 				 struct device *dev);
 static int __iommu_attach_group(struct iommu_domain *domain,
 				struct iommu_group *group);
+static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev,
+						       unsigned int type,
+						       unsigned int flags);
 
 enum {
 	IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0,
@@ -1585,8 +1589,30 @@ EXPORT_SYMBOL_GPL(fsl_mc_device_group);
 static struct iommu_domain *
 __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type)
 {
+	struct device *dev = iommu_group_first_dev(group);
+	struct iommu_domain *dom;
+
 	if (group->default_domain && group->default_domain->type == req_type)
 		return group->default_domain;
+
+	/*
+	 * When allocating the DMA API domain assume that the driver is going to
+	 * use PASID and make sure the RID's domain is PASID compatible.
+	 */
+	if (req_type & __IOMMU_DOMAIN_PAGING) {
+		dom = __iommu_paging_domain_alloc_flags(dev, req_type,
+			   dev->iommu->max_pasids ? IOMMU_HWPT_ALLOC_PASID : 0);
+
+		/*
+		 * If driver does not support PASID feature then
+		 * try to allocate non-PASID domain
+		 */
+		if (PTR_ERR(dom) == -EOPNOTSUPP)
+			dom = __iommu_paging_domain_alloc_flags(dev, req_type, 0);
+
+		return dom;
+	}
+
 	return __iommu_group_domain_alloc(group, req_type);
 }
 
@@ -1961,16 +1987,9 @@ __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type)
 	return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type);
 }
 
-/**
- * iommu_paging_domain_alloc_flags() - Allocate a paging domain
- * @dev: device for which the domain is allocated
- * @flags: Enum of iommufd_hwpt_alloc_flags
- *
- * Allocate a paging domain which will be managed by a kernel driver. Return
- * allocated domain if successful, or an ERR pointer for failure.
- */
-struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
-						     unsigned int flags)
+static struct iommu_domain *
+__iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type,
+				  unsigned int flags)
 {
 	const struct iommu_ops *ops;
 	struct iommu_domain *domain;
@@ -1994,9 +2013,24 @@ struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
 	if (!domain)
 		return ERR_PTR(-ENOMEM);
 
-	iommu_domain_init(domain, IOMMU_DOMAIN_UNMANAGED, ops);
+	iommu_domain_init(domain, type, ops);
 	return domain;
 }
+
+/**
+ * iommu_paging_domain_alloc_flags() - Allocate a paging domain
+ * @dev: device for which the domain is allocated
+ * @flags: Bitmap of iommufd_hwpt_alloc_flags
+ *
+ * Allocate a paging domain which will be managed by a kernel driver. Return
+ * allocated domain if successful, or an ERR pointer for failure.
+ */
+struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev,
+						     unsigned int flags)
+{
+	return __iommu_paging_domain_alloc_flags(dev,
+					 IOMMU_DOMAIN_UNMANAGED, flags);
+}
 EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags);
 
 void iommu_domain_free(struct iommu_domain *domain)
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 72010f71c5e4..0c0ed28ee113 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -359,11 +359,19 @@ struct iommu_vfio_ioas {
  *                                   enforced on device attachment
  * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is
  *                             valid.
+ * @IOMMU_HWPT_ALLOC_PASID: Requests a domain that can be used with PASID. The
+ *                          domain can be attached to any PASID on the device.
+ *                          Any domain attached to the non-PASID part of the
+ *                          device must also be flagged, otherwise attaching a
+ *                          PASID will be blocked.
+ *                          If IOMMU does not support PASID it will return
+ *                          error (-EOPNOTSUPP).
  */
 enum iommufd_hwpt_alloc_flags {
 	IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0,
 	IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1,
 	IOMMU_HWPT_FAULT_ID_VALID = 1 << 2,
+	IOMMU_HWPT_ALLOC_PASID = 1 << 3,
 };
 
 /**
-- 
cgit v1.2.3


From 1ddf9916ac09313128e40d6581cef889c0b4ce84 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 23 Oct 2024 12:53:42 +0200
Subject: xfrm: Add support for per cpu xfrm state handling.

Currently all flows for a certain SA must be processed by the same
cpu to avoid packet reordering and lock contention of the xfrm
state lock.

To get rid of this limitation, the IETF standardized per cpu SAs
in RFC 9611. This patch implements the xfrm part of it.

We add the cpu as a lookup key for xfrm states and a config option
to generate acquire messages for each cpu.

With that, we can have on each cpu a SA with identical traffic selector
so that flows can be processed in parallel on all cpus.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Tested-by: Antony Antony <antony.antony@secunet.com>
Tested-by: Tobias Brunner <tobias@strongswan.org>
---
 include/net/xfrm.h        |  5 ++--
 include/uapi/linux/xfrm.h |  2 ++
 net/key/af_key.c          |  7 +++---
 net/xfrm/xfrm_compat.c    |  6 +++--
 net/xfrm/xfrm_state.c     | 58 ++++++++++++++++++++++++++++++++++++++---------
 net/xfrm/xfrm_user.c      | 56 +++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 112 insertions(+), 22 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index a0bdd58f401c..f5275618e744 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -188,6 +188,7 @@ struct xfrm_state {
 	refcount_t		refcnt;
 	spinlock_t		lock;
 
+	u32			pcpu_num;
 	struct xfrm_id		id;
 	struct xfrm_selector	sel;
 	struct xfrm_mark	mark;
@@ -1684,7 +1685,7 @@ struct xfrmk_spdinfo {
 	u32 spdhmcnt;
 };
 
-struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq);
+struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num);
 int xfrm_state_delete(struct xfrm_state *x);
 int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync);
 int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid);
@@ -1796,7 +1797,7 @@ int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack);
 int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi,
 		   struct netlink_ext_ack *extack);
 struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark,
-				 u8 mode, u32 reqid, u32 if_id, u8 proto,
+				 u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto,
 				 const xfrm_address_t *daddr,
 				 const xfrm_address_t *saddr, int create,
 				 unsigned short family);
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index f28701500714..d73a97e3030a 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -322,6 +322,7 @@ enum xfrm_attr_type_t {
 	XFRMA_MTIMER_THRESH,	/* __u32 in seconds for input SA */
 	XFRMA_SA_DIR,		/* __u8 */
 	XFRMA_NAT_KEEPALIVE_INTERVAL,	/* __u32 in seconds for NAT keepalive */
+	XFRMA_SA_PCPU,		/* __u32 */
 	__XFRMA_MAX
 
 #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK	/* Compatibility */
@@ -437,6 +438,7 @@ struct xfrm_userpolicy_info {
 #define XFRM_POLICY_LOCALOK	1	/* Allow user to override global policy */
 	/* Automatically expand selector to include matching ICMP payloads. */
 #define XFRM_POLICY_ICMP	2
+#define XFRM_POLICY_CPU_ACQUIRE	4
 	__u8				share;
 };
 
diff --git a/net/key/af_key.c b/net/key/af_key.c
index f79fb99271ed..c56bb4f451e6 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1354,7 +1354,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
 	}
 
 	if (hdr->sadb_msg_seq) {
-		x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq);
+		x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX);
 		if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) {
 			xfrm_state_put(x);
 			x = NULL;
@@ -1362,7 +1362,8 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
 	}
 
 	if (!x)
-		x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family);
+		x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX,
+				  proto, xdaddr, xsaddr, 1, family);
 
 	if (x == NULL)
 		return -ENOENT;
@@ -1417,7 +1418,7 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb
 	if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0)
 		return 0;
 
-	x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq);
+	x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX);
 	if (x == NULL)
 		return 0;
 
diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c
index 91357ccaf4af..5b9ee63e30b6 100644
--- a/net/xfrm/xfrm_compat.c
+++ b/net/xfrm/xfrm_compat.c
@@ -132,6 +132,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = {
 	[XFRMA_MTIMER_THRESH]	= { .type = NLA_U32 },
 	[XFRMA_SA_DIR]          = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
 	[XFRMA_NAT_KEEPALIVE_INTERVAL]	= { .type = NLA_U32 },
+	[XFRMA_SA_PCPU]		= { .type = NLA_U32 },
 };
 
 static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb,
@@ -282,9 +283,10 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src)
 	case XFRMA_MTIMER_THRESH:
 	case XFRMA_SA_DIR:
 	case XFRMA_NAT_KEEPALIVE_INTERVAL:
+	case XFRMA_SA_PCPU:
 		return xfrm_nla_cpy(dst, src, nla_len(src));
 	default:
-		BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL);
+		BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU);
 		pr_warn_once("unsupported nla_type %d\n", src->nla_type);
 		return -EOPNOTSUPP;
 	}
@@ -439,7 +441,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
 	int err;
 
 	if (type > XFRMA_MAX) {
-		BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL);
+		BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU);
 		NL_SET_ERR_MSG(extack, "Bad attribute");
 		return -EOPNOTSUPP;
 	}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 37478d36a8df..ebef07b80afa 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -679,6 +679,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 		x->lft.hard_packet_limit = XFRM_INF;
 		x->replay_maxage = 0;
 		x->replay_maxdiff = 0;
+		x->pcpu_num = UINT_MAX;
 		spin_lock_init(&x->lock);
 	}
 	return x;
@@ -1155,6 +1156,12 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
 			       struct xfrm_state **best, int *acq_in_progress,
 			       int *error)
 {
+	/* We need the cpu id just as a lookup key,
+	 * we don't require it to be stable.
+	 */
+	unsigned int pcpu_id = get_cpu();
+	put_cpu();
+
 	/* Resolution logic:
 	 * 1. There is a valid state with matching selector. Done.
 	 * 2. Valid state with inappropriate selector. Skip.
@@ -1174,13 +1181,18 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
 							&fl->u.__fl_common))
 			return;
 
+		if (x->pcpu_num != UINT_MAX && x->pcpu_num != pcpu_id)
+			return;
+
 		if (!*best ||
+		    ((*best)->pcpu_num == UINT_MAX && x->pcpu_num == pcpu_id) ||
 		    (*best)->km.dying > x->km.dying ||
 		    ((*best)->km.dying == x->km.dying &&
 		     (*best)->curlft.add_time < x->curlft.add_time))
 			*best = x;
 	} else if (x->km.state == XFRM_STATE_ACQ) {
-		*acq_in_progress = 1;
+		if (!*best || x->pcpu_num == pcpu_id)
+			*acq_in_progress = 1;
 	} else if (x->km.state == XFRM_STATE_ERROR ||
 		   x->km.state == XFRM_STATE_EXPIRED) {
 		if ((!x->sel.family ||
@@ -1209,6 +1221,13 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 	unsigned short encap_family = tmpl->encap_family;
 	unsigned int sequence;
 	struct km_event c;
+	unsigned int pcpu_id;
+
+	/* We need the cpu id just as a lookup key,
+	 * we don't require it to be stable.
+	 */
+	pcpu_id = get_cpu();
+	put_cpu();
 
 	to_put = NULL;
 
@@ -1282,7 +1301,10 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 	}
 
 found:
-	x = best;
+	if (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) ||
+	    (best && (best->pcpu_num == pcpu_id)))
+		x = best;
+
 	if (!x && !error && !acquire_in_progress) {
 		if (tmpl->id.spi &&
 		    (x0 = __xfrm_state_lookup_all(net, mark, daddr,
@@ -1314,6 +1336,8 @@ found:
 		xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
 		memcpy(&x->mark, &pol->mark, sizeof(x->mark));
 		x->if_id = if_id;
+		if ((pol->flags & XFRM_POLICY_CPU_ACQUIRE) && best)
+			x->pcpu_num = pcpu_id;
 
 		error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
 		if (error) {
@@ -1392,6 +1416,11 @@ found:
 			x = NULL;
 			error = -ESRCH;
 		}
+
+		/* Use the already installed 'fallback' while the CPU-specific
+		 * SA acquire is handled*/
+		if (best)
+			x = best;
 	}
 out:
 	if (x) {
@@ -1524,12 +1553,14 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
 	unsigned int h;
 	u32 mark = xnew->mark.v & xnew->mark.m;
 	u32 if_id = xnew->if_id;
+	u32 cpu_id = xnew->pcpu_num;
 
 	h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family);
 	hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
 		if (x->props.family	== family &&
 		    x->props.reqid	== reqid &&
 		    x->if_id		== if_id &&
+		    x->pcpu_num		== cpu_id &&
 		    (mark & x->mark.m) == x->mark.v &&
 		    xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) &&
 		    xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family))
@@ -1552,7 +1583,7 @@ EXPORT_SYMBOL(xfrm_state_insert);
 static struct xfrm_state *__find_acq_core(struct net *net,
 					  const struct xfrm_mark *m,
 					  unsigned short family, u8 mode,
-					  u32 reqid, u32 if_id, u8 proto,
+					  u32 reqid, u32 if_id, u32 pcpu_num, u8 proto,
 					  const xfrm_address_t *daddr,
 					  const xfrm_address_t *saddr,
 					  int create)
@@ -1569,6 +1600,7 @@ static struct xfrm_state *__find_acq_core(struct net *net,
 		    x->id.spi       != 0 ||
 		    x->id.proto	    != proto ||
 		    (mark & x->mark.m) != x->mark.v ||
+		    x->pcpu_num != pcpu_num ||
 		    !xfrm_addr_equal(&x->id.daddr, daddr, family) ||
 		    !xfrm_addr_equal(&x->props.saddr, saddr, family))
 			continue;
@@ -1602,6 +1634,7 @@ static struct xfrm_state *__find_acq_core(struct net *net,
 			break;
 		}
 
+		x->pcpu_num = pcpu_num;
 		x->km.state = XFRM_STATE_ACQ;
 		x->id.proto = proto;
 		x->props.family = family;
@@ -1630,7 +1663,7 @@ static struct xfrm_state *__find_acq_core(struct net *net,
 	return x;
 }
 
-static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq);
+static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num);
 
 int xfrm_state_add(struct xfrm_state *x)
 {
@@ -1656,7 +1689,7 @@ int xfrm_state_add(struct xfrm_state *x)
 	}
 
 	if (use_spi && x->km.seq) {
-		x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq);
+		x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq, x->pcpu_num);
 		if (x1 && ((x1->id.proto != x->id.proto) ||
 		    !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) {
 			to_put = x1;
@@ -1666,7 +1699,7 @@ int xfrm_state_add(struct xfrm_state *x)
 
 	if (use_spi && !x1)
 		x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
-				     x->props.reqid, x->if_id, x->id.proto,
+				     x->props.reqid, x->if_id, x->pcpu_num, x->id.proto,
 				     &x->id.daddr, &x->props.saddr, 0);
 
 	__xfrm_state_bump_genids(x);
@@ -1791,6 +1824,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
 	x->props.flags = orig->props.flags;
 	x->props.extra_flags = orig->props.extra_flags;
 
+	x->pcpu_num = orig->pcpu_num;
 	x->if_id = orig->if_id;
 	x->tfcpad = orig->tfcpad;
 	x->replay_maxdiff = orig->replay_maxdiff;
@@ -2066,13 +2100,14 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
 
 struct xfrm_state *
 xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
-	      u32 if_id, u8 proto, const xfrm_address_t *daddr,
+	      u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr,
 	      const xfrm_address_t *saddr, int create, unsigned short family)
 {
 	struct xfrm_state *x;
 
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
-	x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create);
+	x = __find_acq_core(net, mark, family, mode, reqid, if_id, pcpu_num,
+			    proto, daddr, saddr, create);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 
 	return x;
@@ -2207,7 +2242,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
 
 /* Silly enough, but I'm lazy to build resolution list */
 
-static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq)
+static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num)
 {
 	unsigned int h = xfrm_seq_hash(net, seq);
 	struct xfrm_state *x;
@@ -2215,6 +2250,7 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s
 	hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) {
 		if (x->km.seq == seq &&
 		    (mark & x->mark.m) == x->mark.v &&
+		    x->pcpu_num == pcpu_num &&
 		    x->km.state == XFRM_STATE_ACQ) {
 			xfrm_state_hold(x);
 			return x;
@@ -2224,12 +2260,12 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s
 	return NULL;
 }
 
-struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq)
+struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num)
 {
 	struct xfrm_state *x;
 
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
-	x = __xfrm_find_acq_byseq(net, mark, seq);
+	x = __xfrm_find_acq_byseq(net, mark, seq, pcpu_num);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 	return x;
 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index e3b8ce89831a..e4d448950d05 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -460,6 +460,12 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		}
 	}
 
+	if (!sa_dir && attrs[XFRMA_SA_PCPU]) {
+		NL_SET_ERR_MSG(extack, "SA_PCPU only supported with SA_DIR");
+		err = -EINVAL;
+		goto out;
+	}
+
 out:
 	return err;
 }
@@ -841,6 +847,14 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 		x->nat_keepalive_interval =
 			nla_get_u32(attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]);
 
+	if (attrs[XFRMA_SA_PCPU]) {
+		x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]);
+		/* same error code as the other XFRMA_SA_PCPU parsers */
+		err = -EINVAL;
+		if (x->pcpu_num >= num_possible_cpus())
+			goto error;
+	}
+
 	err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack);
 	if (err)
 		goto error;
@@ -1296,6 +1308,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 		if (ret)
 			goto out;
 	}
+	if (x->pcpu_num != UINT_MAX) {
+		ret = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num);
+		if (ret)
+			goto out;
+	}
 	if (x->dir)
 		ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
 
@@ -1700,6 +1717,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 	u32 mark;
 	struct xfrm_mark m;
 	u32 if_id = 0;
+	u32 pcpu_num = UINT_MAX;
 
 	p = nlmsg_data(nlh);
 	err = verify_spi_info(p->info.id.proto, p->min, p->max, extack);
@@ -1716,8 +1734,16 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (attrs[XFRMA_IF_ID])
 		if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
 
+	if (attrs[XFRMA_SA_PCPU]) {
+		pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]);
+		if (pcpu_num >= num_possible_cpus()) {
+			err = -EINVAL;
+			goto out_noput;
+		}
+	}
+
 	if (p->info.seq) {
-		x = xfrm_find_acq_byseq(net, mark, p->info.seq);
+		x = xfrm_find_acq_byseq(net, mark, p->info.seq, pcpu_num);
 		if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
 			xfrm_state_put(x);
 			x = NULL;
@@ -1726,7 +1752,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (!x)
 		x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
-				  if_id, p->info.id.proto, daddr,
+				  if_id, pcpu_num, p->info.id.proto, daddr,
 				  &p->info.saddr, 1,
 				  family);
 	err = -ENOENT;
@@ -2526,7 +2552,8 @@ static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x)
 	       + nla_total_size(sizeof(struct xfrm_mark))
 	       + nla_total_size(4) /* XFRM_AE_RTHR */
 	       + nla_total_size(4) /* XFRM_AE_ETHR */
-	       + nla_total_size(sizeof(x->dir)); /* XFRMA_SA_DIR */
+	       + nla_total_size(sizeof(x->dir)) /* XFRMA_SA_DIR */
+	       + nla_total_size(4); /* XFRMA_SA_PCPU */
 }
 
 static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
@@ -2582,6 +2609,12 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
 	err = xfrm_if_id_put(skb, x->if_id);
 	if (err)
 		goto out_cancel;
+	/* pcpu_num defaults to UINT_MAX; only a real CPU number is emitted */
+	if (x->pcpu_num != UINT_MAX) {
+		err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num);
+		if (err)
+			goto out_cancel;
+	}
 
 	if (x->dir) {
 		err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
@@ -2852,6 +2881,13 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	xfrm_mark_get(attrs, &mark);
 
+	if (attrs[XFRMA_SA_PCPU]) {
+		x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]);
+		err = -EINVAL;
+		if (x->pcpu_num >= num_possible_cpus())
+			goto free_state;
+	}
+
 	err = verify_newpolicy_info(&ua->policy, extack);
 	if (err)
 		goto free_state;
@@ -3182,6 +3218,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_MTIMER_THRESH]   = { .type = NLA_U32 },
 	[XFRMA_SA_DIR]          = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
 	[XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 },
+	[XFRMA_SA_PCPU]		= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL_GPL(xfrma_policy);
 
@@ -3348,7 +3385,8 @@ static inline unsigned int xfrm_expire_msgsize(void)
 {
 	return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) +
 	       nla_total_size(sizeof(struct xfrm_mark)) +
-	       nla_total_size(sizeof_field(struct xfrm_state, dir));
+	       nla_total_size(sizeof_field(struct xfrm_state, dir)) +
+	       nla_total_size(4); /* XFRMA_SA_PCPU */
 }
 
 static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
@@ -3374,6 +3412,11 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
 	err = xfrm_if_id_put(skb, x->if_id);
 	if (err)
 		return err;
+	if (x->pcpu_num != UINT_MAX) {
+		err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num);
+		if (err)
+			return err;
+	}
 
 	if (x->dir) {
 		err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
@@ -3481,6 +3524,8 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
 	}
 	if (x->if_id)
 		l += nla_total_size(sizeof(x->if_id));
+	if (x->pcpu_num != UINT_MAX)
+		l += nla_total_size(sizeof(x->pcpu_num));
 
 	/* Must count x->lastused as it may become non-zero behind our back. */
 	l += nla_total_size_64bit(sizeof(u64));
@@ -3587,6 +3632,7 @@ static inline unsigned int xfrm_acquire_msgsize(struct xfrm_state *x,
 	       + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr)
 	       + nla_total_size(sizeof(struct xfrm_mark))
 	       + nla_total_size(xfrm_user_sec_ctx_size(x->security))
+	       + nla_total_size(4) /* XFRMA_SA_PCPU */
 	       + userpolicy_type_attrsize();
 }
 
@@ -3623,6 +3669,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 		err = xfrm_if_id_put(skb, xp->if_id);
 	if (!err && xp->xdo.dev)
 		err = copy_user_offload(&xp->xdo, skb);
+	if (!err && x->pcpu_num != UINT_MAX)
+		err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num);
 	if (err) {
 		nlmsg_cancel(skb, nlh);
 		return err;
-- 
cgit v1.2.3


From a377132154ab8404dafcc52e8bc0c73050a954c2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 24 Sep 2024 05:57:31 -0600
Subject: io_uring/msg_ring: add support for sending a sync message

Normally MSG_RING requires both a source and a destination ring. But
some users don't always have a ring available to send a message from, yet
they still need to notify a target ring.

Add support for using io_uring_register(2) without having a source ring,
using a file descriptor of -1 for that. Internally those are called
blind registration opcodes. Implement IORING_REGISTER_SEND_MSG_RING as a
blind opcode, which simply takes an sqe that the application can put on
the stack and use the normal liburing helpers to initialize it. Then the
app can call:

io_uring_register(-1, IORING_REGISTER_SEND_MSG_RING, &sqe, 1);

and get the same behavior in terms of the target, where a CQE is posted
with the details given in the sqe.

For now this takes a single sqe pointer argument, and hence arg must
be set to that, and nr_args must be 1. Could easily be extended to take
an array of sqes, but for now let's keep it simple.

Link: https://lore.kernel.org/r/20240924115932.116167-3-axboe@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  3 +++
 io_uring/msg_ring.c           | 29 +++++++++++++++++++++++++++++
 io_uring/msg_ring.h           |  1 +
 io_uring/register.c           | 30 ++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1fe79e750470..86cb385fe0b5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -612,6 +612,9 @@ enum io_uring_register_op {
 	/* clone registered buffers from source ring to current ring */
 	IORING_REGISTER_CLONE_BUFFERS		= 30,
 
+	/* send MSG_RING without having a ring */
+	IORING_REGISTER_SEND_MSG_RING		= 31,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index b8c527f08cd5..edea1ffd501c 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -331,6 +331,35 @@ done:
 	return IOU_OK;
 }
 
+int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
+{
+	struct io_msg io_msg = { };
+	struct fd f;
+	int ret;
+
+	ret = __io_msg_ring_prep(&io_msg, sqe);
+	if (unlikely(ret))
+		return ret;
+
+	/*
+	 * Only data sending supported, not IORING_MSG_SEND_FD as that one
+	 * doesn't make sense without a source ring to send files from.
+	 */
+	if (io_msg.cmd != IORING_MSG_DATA)
+		return -EINVAL;
+
+	ret = -EBADF;
+	f = fdget(sqe->fd);
+	if (fd_file(f)) {
+		ret = -EBADFD;
+		if (io_is_uring_fops(fd_file(f)))
+			ret = __io_msg_ring_data(fd_file(f)->private_data,
+						 &io_msg, IO_URING_F_UNLOCKED);
+		fdput(f);
+	}
+	return ret;
+}
+
 void io_msg_cache_free(const void *entry)
 {
 	struct io_kiocb *req = (struct io_kiocb *) entry;
diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h
index 3030f3942f0f..38e7f8f0c944 100644
--- a/io_uring/msg_ring.h
+++ b/io_uring/msg_ring.h
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
+int io_uring_sync_msg_ring(struct io_uring_sqe *sqe);
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
 void io_msg_ring_cleanup(struct io_kiocb *req);
diff --git a/io_uring/register.c b/io_uring/register.c
index eca26d4884d9..52b2f9b74af8 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -28,6 +28,7 @@
 #include "kbuf.h"
 #include "napi.h"
 #include "eventfd.h"
+#include "msg_ring.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -588,6 +589,32 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+/*
+ * "blind" registration opcodes are ones where there's no ring given, and
+ * hence the source fd must be -1.
+ */
+static int io_uring_register_blind(unsigned int opcode, void __user *arg,
+				   unsigned int nr_args)
+{
+	switch (opcode) {
+	case IORING_REGISTER_SEND_MSG_RING: {
+		struct io_uring_sqe sqe;
+
+		if (!arg || nr_args != 1)
+			return -EINVAL;
+		if (copy_from_user(&sqe, arg, sizeof(sqe)))
+			return -EFAULT;
+		/* no flags supported */
+		if (sqe.flags)
+			return -EINVAL;
+		if (sqe.opcode == IORING_OP_MSG_RING)
+			return io_uring_sync_msg_ring(&sqe);
+		}
+	}
+
+	return -EINVAL;
+}
+
 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 		void __user *, arg, unsigned int, nr_args)
 {
@@ -602,6 +629,9 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	if (opcode >= IORING_REGISTER_LAST)
 		return -EINVAL;
 
+	if (fd == -1)
+		return io_uring_register_blind(opcode, arg, nr_args);
+
 	file = io_uring_register_get_file(fd, use_registered_ring);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
-- 
cgit v1.2.3


From 79cfe9e59c2a12c3b3faeeefe38d23f3d8030972 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 21 Oct 2024 13:34:10 -0600
Subject: io_uring/register: add IORING_REGISTER_RESIZE_RINGS

Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.

For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.

Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.

Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flags associated with how the CQ ring is sized and clamped.

Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.

To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as a fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   7 ++
 include/uapi/linux/io_uring.h  |   5 +
 io_uring/io_uring.c            |   1 +
 io_uring/memmap.c              |   8 ++
 io_uring/register.c            | 215 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 236 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 6d3ee71bd832..841579dcdae9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -415,6 +415,13 @@ struct io_ring_ctx {
 	/* protected by ->completion_lock */
 	unsigned			evfd_last_cq_tail;
 
+	/*
+	 * Protection for resize vs mmap races - both the mmap and resize
+	 * side will need to grab this lock, to prevent either side from
+	 * being run concurrently with the other.
+	 */
+	struct mutex			resize_lock;
+
 	/*
 	 * If IORING_SETUP_NO_MMAP is used, then the below holds
 	 * the gup'ed pages for the two rings, and the sqes.
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 86cb385fe0b5..60b9c98595fa 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -615,6 +615,11 @@ enum io_uring_register_op {
 	/* send MSG_RING without having a ring */
 	IORING_REGISTER_SEND_MSG_RING		= 31,
 
+	/* 32 reserved for zc rx */
+
+	/* resize CQ ring */
+	IORING_REGISTER_RESIZE_RINGS		= 33,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b5974bdad48b..140cd47fbdb3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -353,6 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
 	io_napi_init(ctx);
+	mutex_init(&ctx->resize_lock);
 
 	return ctx;
 
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index d614824e17bd..85c66fa54956 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -251,6 +251,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned int npages;
 	void *ptr;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
@@ -274,6 +276,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 					 unsigned long len, unsigned long pgoff,
 					 unsigned long flags)
 {
+	struct io_ring_ctx *ctx = filp->private_data;
 	void *ptr;
 
 	/*
@@ -284,6 +287,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr)
 		return -EINVAL;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
 	if (IS_ERR(ptr))
 		return -ENOMEM;
@@ -329,8 +334,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long len, unsigned long pgoff,
 					 unsigned long flags)
 {
+	struct io_ring_ctx *ctx = file->private_data;
 	void *ptr;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(file, pgoff, len);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
diff --git a/io_uring/register.c b/io_uring/register.c
index 52b2f9b74af8..fc6c94d694b2 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -29,6 +29,7 @@
 #include "napi.h"
 #include "eventfd.h"
 #include "msg_ring.h"
+#include "memmap.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -361,6 +362,214 @@ static int io_register_clock(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+/*
+ * State to maintain until we can swap. Both new and old state, used for
+ * either mapping or freeing.
+ */
+struct io_ring_ctx_rings {
+	unsigned short n_ring_pages;
+	unsigned short n_sqe_pages;
+	struct page **ring_pages;
+	struct page **sqe_pages;
+	struct io_uring_sqe *sq_sqes;
+	struct io_rings *rings;
+};
+
+static void io_register_free_rings(struct io_uring_params *p,
+				   struct io_ring_ctx_rings *r)
+{
+	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
+				true);
+		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
+				true);
+	} else {
+		io_pages_free(&r->ring_pages, r->n_ring_pages);
+		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
+		vunmap(r->rings);
+		vunmap(r->sq_sqes);
+	}
+}
+
+#define swap_old(ctx, o, n, field)		\
+	do {					\
+		(o).field = (ctx)->field;	\
+		(ctx)->field = (n).field;	\
+	} while (0)
+
+#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
+#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
+			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+
+static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
+	size_t size, sq_array_offset;
+	struct io_uring_params p;
+	unsigned i, tail;
+	void *ptr;
+	int ret;
+
+	/* for single issuer, must be owner resizing */
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+	    current != ctx->submitter_task)
+		return -EEXIST;
+	if (copy_from_user(&p, arg, sizeof(p)))
+		return -EFAULT;
+	if (p.flags & ~RESIZE_FLAGS)
+		return -EINVAL;
+
+	/* properties that are always inherited */
+	p.flags |= (ctx->flags & COPY_FLAGS);
+
+	ret = io_uring_fill_params(p.sq_entries, &p);
+	if (unlikely(ret))
+		return ret;
+
+	/* nothing to do, but copy params back */
+	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
+		if (copy_to_user(arg, &p, sizeof(p)))
+			return -EFAULT;
+		return 0;
+	}
+
+	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
+				&sq_array_offset);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	if (!(p.flags & IORING_SETUP_NO_MMAP))
+		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
+	else
+		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
+						p.cq_off.user_addr, size);
+	if (IS_ERR(n.rings))
+		return PTR_ERR(n.rings);
+
+	n.rings->sq_ring_mask = p.sq_entries - 1;
+	n.rings->cq_ring_mask = p.cq_entries - 1;
+	n.rings->sq_ring_entries = p.sq_entries;
+	n.rings->cq_ring_entries = p.cq_entries;
+
+	if (copy_to_user(arg, &p, sizeof(p))) {
+		io_register_free_rings(&p, &n);
+		return -EFAULT;
+	}
+
+	if (p.flags & IORING_SETUP_SQE128)
+		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
+	else
+		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
+	if (size == SIZE_MAX) {
+		io_register_free_rings(&p, &n);
+		return -EOVERFLOW;
+	}
+
+	if (!(p.flags & IORING_SETUP_NO_MMAP))
+		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
+	else
+		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
+					p.sq_off.user_addr,
+					size);
+	if (IS_ERR(ptr)) {
+		io_register_free_rings(&p, &n);
+		return PTR_ERR(ptr);
+	}
+
+	/*
+	 * If using SQPOLL, park the thread
+	 */
+	if (ctx->sq_data) {
+		mutex_unlock(&ctx->uring_lock);
+		io_sq_thread_park(ctx->sq_data);
+		mutex_lock(&ctx->uring_lock);
+	}
+
+	/*
+	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
+	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
+	 * existing rings beyond this point will fail. Not that it could proceed
+	 * at this point anyway, as the io_uring mmap side needs to grab the
+	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
+	 * duration of the actual swap.
+	 */
+	mutex_lock(&ctx->resize_lock);
+	spin_lock(&ctx->completion_lock);
+	o.rings = ctx->rings;
+	ctx->rings = NULL;
+	o.sq_sqes = ctx->sq_sqes;
+	ctx->sq_sqes = NULL;
+
+	/*
+	 * Now copy SQ and CQ entries, if any. If either of the destination
+	 * rings can't hold what is already there, then fail the operation.
+	 */
+	n.sq_sqes = ptr;
+	tail = o.rings->sq.tail;
+	if (tail - o.rings->sq.head > p.sq_entries)
+		goto overflow;
+	for (i = o.rings->sq.head; i < tail; i++) {
+		unsigned src_head = i & (ctx->sq_entries - 1);
+		unsigned dst_head = i & n.rings->sq_ring_mask;
+
+		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+	}
+	n.rings->sq.head = o.rings->sq.head;
+	n.rings->sq.tail = o.rings->sq.tail;
+
+	tail = o.rings->cq.tail;
+	if (tail - o.rings->cq.head > p.cq_entries) {
+overflow:
+		/* restore old rings, and return -EOVERFLOW via cleanup path */
+		ctx->rings = o.rings;
+		ctx->sq_sqes = o.sq_sqes;
+		to_free = &n;
+		ret = -EOVERFLOW;
+		goto out;
+	}
+	for (i = o.rings->cq.head; i < tail; i++) {
+		unsigned src_head = i & (ctx->cq_entries - 1);
+		unsigned dst_head = i & n.rings->cq_ring_mask;
+
+		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+	}
+	n.rings->cq.head = o.rings->cq.head;
+	n.rings->cq.tail = o.rings->cq.tail;
+	/* invalidate cached cqe refill */
+	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
+
+	n.rings->sq_dropped = o.rings->sq_dropped;
+	n.rings->sq_flags = o.rings->sq_flags;
+	n.rings->cq_flags = o.rings->cq_flags;
+	n.rings->cq_overflow = o.rings->cq_overflow;
+
+	/* all done, store old pointers and assign new ones */
+	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
+
+	ctx->sq_entries = p.sq_entries;
+	ctx->cq_entries = p.cq_entries;
+
+	ctx->rings = n.rings;
+	ctx->sq_sqes = n.sq_sqes;
+	swap_old(ctx, o, n, n_ring_pages);
+	swap_old(ctx, o, n, n_sqe_pages);
+	swap_old(ctx, o, n, ring_pages);
+	swap_old(ctx, o, n, sqe_pages);
+	to_free = &o;
+	ret = 0;
+out:
+	spin_unlock(&ctx->completion_lock);
+	mutex_unlock(&ctx->resize_lock);
+	io_register_free_rings(&p, to_free);
+
+	if (ctx->sq_data)
+		io_sq_thread_unpark(ctx->sq_data);
+
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -549,6 +758,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clone_buffers(ctx, arg);
 		break;
+	case IORING_REGISTER_RESIZE_RINGS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_resize_rings(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.2.3


From aa00f67adc2c0d6439f81b5a81ff181377c47a7e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 22 Oct 2024 13:47:00 -0600
Subject: io_uring: add support for fixed wait regions

Generally applications have 1 or a few ways of waiting, yet they pass
in a struct io_uring_getevents_arg every time. This needs to get copied
and, in turn, the timeout value needs to get copied.

Rather than do this for every invocation, allow the application to
register a fixed set of wait regions that can simply be indexed when
asking the kernel to wait on events.

At ring setup time, the application can register a number of these wait
regions and initialize region/index 0 upfront:

	struct io_uring_reg_wait *reg;

	reg = io_uring_setup_reg_wait(ring, nr_regions, &ret);

	/* set timeout and mark as set, sigmask/sigmask_sz as needed */
	reg->ts.tv_sec = 0;
	reg->ts.tv_nsec = 100000;
	reg->flags = IORING_REG_WAIT_TS;

where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The
above initializes index 0, but 63 other regions can be initialized,
if needed. Now, instead of doing:

	struct __kernel_timespec timeout = { .tv_nsec = 100000, };

	io_uring_submit_and_wait_timeout(ring, &cqe, nr, &t, NULL);

to wait for events for each submit_and_wait, or just wait, operation, it
can just reference the above region at offset 0 and do:

	io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);

to achieve the same goal of waiting 100usec without needing to copy
both struct io_uring_getevents_arg (24b) and struct __kernel_timespec
(16b) for each invocation. Struct io_uring_reg_wait looks as follows:

struct io_uring_reg_wait {
	struct __kernel_timespec	ts;
	__u32				min_wait_usec;
	__u32				flags;
	__u64				sigmask;
	__u32				sigmask_sz;
	__u32				pad[3];
	__u64				pad2[2];
};

embedding the timeout itself in the region, rather than passing it as
a pointer as well. Note that the signal mask is still passed as a
pointer, both for compatibility reasons, but also because there doesn't
seem to be a lot of high frequency waits scenarios that involve setting
and resetting the signal mask for each wait.

The application is free to modify any region before a wait call, or it
can keep multiple regions with different settings to avoid needing to
modify the same one for wait calls. Up to a page size of regions is mapped
by default, allowing PAGE_SIZE / 64 available regions for use.

The registered region must fit within a page. On a 4kb page size system,
that allows for 64 wait regions if a full page is used, as the size of
struct io_uring_reg_wait is 64b. The region registered must be aligned
to io_uring_reg_wait in size. It's valid to register less than 64
entries.

In network performance testing with zero-copy, this reduced the time
spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4%
to 0.3%.

Wait regions are fixed for the lifetime of the ring - once registered,
they are persistent until the ring is torn down. The regions support
minimum wait timeout as well as the regular waits.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 10 ++++++
 include/uapi/linux/io_uring.h  | 41 +++++++++++++++++++++
 io_uring/io_uring.c            | 68 +++++++++++++++++++++++++++++------
 io_uring/register.c            | 82 ++++++++++++++++++++++++++++++++++++++++++
 io_uring/register.h            |  1 +
 5 files changed, 191 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 841579dcdae9..2f12828b22a4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -327,6 +327,14 @@ struct io_ring_ctx {
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
 		struct wait_queue_head	cq_wait;
+
+		/*
+		 * If registered with IORING_REGISTER_CQWAIT_REG, a single
+		 * page holds N entries, mapped in cq_wait_arg. cq_wait_index
+		 * is the maximum allowable index.
+		 */
+		struct io_uring_reg_wait	*cq_wait_arg;
+		unsigned char			cq_wait_index;
 	} ____cacheline_aligned_in_smp;
 
 	/* timeouts */
@@ -430,6 +438,8 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+
+	struct page			**cq_wait_page;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 60b9c98595fa..65b7417c1b05 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -518,6 +518,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_EXT_ARG		(1U << 3)
 #define IORING_ENTER_REGISTERED_RING	(1U << 4)
 #define IORING_ENTER_ABS_TIMER		(1U << 5)
+#define IORING_ENTER_EXT_ARG_REG	(1U << 6)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -620,6 +621,9 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
+	/* register fixed io_uring_reg_wait arguments */
+	IORING_REGISTER_CQWAIT_REG		= 34,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -803,6 +807,43 @@ enum io_uring_register_restriction_op {
 	IORING_RESTRICTION_LAST
 };
 
+enum {
+	IORING_REG_WAIT_TS		= (1U << 0),
+};
+
+/*
+ * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
+ * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
+ * called rather than pass in a wait argument structure separately.
+ */
+struct io_uring_cqwait_reg_arg {
+	__u32		flags;
+	__u32		struct_size;
+	__u32		nr_entries;
+	__u32		pad;
+	__u64		user_addr;
+	__u64		pad2[3];
+};
+
+/*
+ * Argument for io_uring_enter(2) with
+ * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
+ * is an index into a previously registered fixed wait region described by
+ * the below structure.
+ */
+struct io_uring_reg_wait {
+	struct __kernel_timespec	ts;
+	__u32				min_wait_usec;
+	__u32				flags;
+	__u64				sigmask;
+	__u32				sigmask_sz;
+	__u32				pad[3];
+	__u64				pad2[2];
+};
+
+/*
+ * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
+ */
 struct io_uring_getevents_arg {
 	__u64	sigmask;
 	__u32	sigmask_sz;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4cd0ee52710d..2863b957e373 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2736,6 +2736,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
+	io_unregister_cqwait_reg(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -3224,21 +3225,43 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
-static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
+			const struct io_uring_getevents_arg __user *uarg)
 {
-	if (flags & IORING_ENTER_EXT_ARG) {
-		struct io_uring_getevents_arg arg;
+	struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);
 
-		if (argsz != sizeof(arg))
+	if (arg) {
+		unsigned int index = (unsigned int) (uintptr_t) uarg;
+
+		if (index <= ctx->cq_wait_index)
+			return arg + index;
+	}
+
+	return ERR_PTR(-EFAULT);
+}
+
+static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+			       const void __user *argp, size_t argsz)
+{
+	struct io_uring_getevents_arg arg;
+
+	if (!(flags & IORING_ENTER_EXT_ARG))
+		return 0;
+
+	if (flags & IORING_ENTER_EXT_ARG_REG) {
+		if (argsz != sizeof(struct io_uring_reg_wait))
 			return -EINVAL;
-		if (copy_from_user(&arg, argp, sizeof(arg)))
-			return -EFAULT;
+		return PTR_ERR(io_get_ext_arg_reg(ctx, argp));
 	}
+	if (argsz != sizeof(arg))
+		return -EINVAL;
+	if (copy_from_user(&arg, argp, sizeof(arg)))
+		return -EFAULT;
 	return 0;
 }
 
-static int io_get_ext_arg(unsigned flags, const void __user *argp,
-			  struct ext_arg *ext_arg)
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+			  const void __user *argp, struct ext_arg *ext_arg)
 {
 	const struct io_uring_getevents_arg __user *uarg = argp;
 	struct io_uring_getevents_arg arg;
@@ -3252,6 +3275,28 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp,
 		return 0;
 	}
 
+	if (flags & IORING_ENTER_EXT_ARG_REG) {
+		struct io_uring_reg_wait *w;
+
+		if (ext_arg->argsz != sizeof(struct io_uring_reg_wait))
+			return -EINVAL;
+		w = io_get_ext_arg_reg(ctx, argp);
+		if (IS_ERR(w))
+			return PTR_ERR(w);
+
+		if (w->flags & ~IORING_REG_WAIT_TS)
+			return -EINVAL;
+		ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC;
+		ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask));
+		ext_arg->argsz = READ_ONCE(w->sigmask_sz);
+		if (w->flags & IORING_REG_WAIT_TS) {
+			ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec);
+			ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec);
+			ext_arg->ts_set = true;
+		}
+		return 0;
+	}
+
 	/*
 	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
 	 * timespec and sigset_t pointers if good.
@@ -3297,7 +3342,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
 			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
 			       IORING_ENTER_REGISTERED_RING |
-			       IORING_ENTER_ABS_TIMER)))
+			       IORING_ENTER_ABS_TIMER |
+			       IORING_ENTER_EXT_ARG_REG)))
 		return -EINVAL;
 
 	/*
@@ -3380,7 +3426,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			 */
 			mutex_lock(&ctx->uring_lock);
 iopoll_locked:
-			ret2 = io_validate_ext_arg(flags, argp, argsz);
+			ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
 			if (likely(!ret2)) {
 				min_complete = min(min_complete,
 						   ctx->cq_entries);
@@ -3390,7 +3436,7 @@ iopoll_locked:
 		} else {
 			struct ext_arg ext_arg = { .argsz = argsz };
 
-			ret2 = io_get_ext_arg(flags, argp, &ext_arg);
+			ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
 			if (likely(!ret2)) {
 				min_complete = min(min_complete,
 						   ctx->cq_entries);
diff --git a/io_uring/register.c b/io_uring/register.c
index fc6c94d694b2..1eb686eaa310 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,82 @@ out:
 	return ret;
 }
 
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
+{
+	unsigned short npages = 1;
+
+	if (!ctx->cq_wait_page)
+		return;
+
+	io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
+	ctx->cq_wait_arg = NULL;
+	if (ctx->user)
+		__io_unaccount_mem(ctx->user, 1);
+}
+
+/*
+ * Register a page holding N entries of struct io_uring_reg_wait, which can
+ * be used via io_uring_enter(2) if IORING_ENTER_EXT_ARG_REG is set.
+ * If that is set with IORING_ENTER_EXT_ARG, then instead of passing
+ * in a pointer for a struct io_uring_getevents_arg, an index into this
+ * registered array is passed, avoiding two (arg + timeout) copies per
+ * invocation.
+ */
+static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
+{
+	struct io_uring_cqwait_reg_arg arg;
+	struct io_uring_reg_wait *reg;
+	struct page **pages;
+	unsigned long len;
+	int nr_pages, poff;
+	int ret;
+
+	if (ctx->cq_wait_page || ctx->cq_wait_arg)
+		return -EBUSY;
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+	if (!arg.nr_entries || arg.flags)
+		return -EINVAL;
+	if (arg.struct_size != sizeof(*reg))
+		return -EINVAL;
+	if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
+		return -EOVERFLOW;
+	if (len > PAGE_SIZE)
+		return -EINVAL;
+	/* offset + len must fit within a page, and must be reg_wait aligned */
+	poff = arg.user_addr & ~PAGE_MASK;
+	if (len + poff > PAGE_SIZE)
+		return -EINVAL;
+	if (poff % arg.struct_size)
+		return -EINVAL;
+
+	pages = io_pin_pages(arg.user_addr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+	ret = -EINVAL;
+	if (nr_pages != 1)
+		goto out_free;
+	if (ctx->user) {
+		ret = __io_account_mem(ctx->user, 1);
+		if (ret)
+			goto out_free;
+	}
+
+	reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
+	if (reg) {
+		ctx->cq_wait_index = arg.nr_entries - 1;
+		WRITE_ONCE(ctx->cq_wait_page, pages);
+		WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
+		return 0;
+	}
+	ret = -ENOMEM;
+	if (ctx->user)
+		__io_unaccount_mem(ctx->user, 1);
+out_free:
+	io_pages_free(&pages, nr_pages);
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -764,6 +840,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
+	case IORING_REGISTER_CQWAIT_REG:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_cqwait_reg(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/register.h b/io_uring/register.h
index a5f39d5ef9e0..3e935e8fa4b2 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -5,5 +5,6 @@
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
 struct file *io_uring_register_get_file(unsigned int fd, bool registered);
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);
 
 #endif
-- 
cgit v1.2.3


From a85f31052bce52111b4e9d5a536003481d0421d0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 27 Oct 2024 08:59:10 -0600
Subject: io_uring/nop: add support for testing registered files and buffers

Useful for testing performance/efficiency impact of registered files
and buffers, vs (particularly) non-registered files.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  3 +++
 io_uring/nop.c                | 49 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 47 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 65b7417c1b05..024745283783 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -416,6 +416,9 @@ enum io_uring_msg_ring_flags {
  * IORING_NOP_INJECT_RESULT	Inject result from sqe->result
  */
 #define IORING_NOP_INJECT_RESULT	(1U << 0)
+#define IORING_NOP_FILE			(1U << 1)
+#define IORING_NOP_FIXED_FILE		(1U << 2)
+#define IORING_NOP_FIXED_BUFFER		(1U << 3)
 
 /*
  * IO completion data structure (Completion Queue Entry)
diff --git a/io_uring/nop.c b/io_uring/nop.c
index a5bcf3d6984f..2c7a22ba4053 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -8,35 +8,74 @@
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
+#include "rsrc.h"
 #include "nop.h"
 
 struct io_nop {
 	/* NOTE: kiocb has the file as the first member, so don't do it here */
 	struct file     *file;
 	int             result;
+	int		fd;
+	int		buffer;
+	unsigned int	flags;
 };
 
+#define NOP_FLAGS	(IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \
+			 IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE)
+
 int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	unsigned int flags;
 	struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop);
 
-	flags = READ_ONCE(sqe->nop_flags);
-	if (flags & ~IORING_NOP_INJECT_RESULT)
+	nop->flags = READ_ONCE(sqe->nop_flags);
+	if (nop->flags & ~NOP_FLAGS)
 		return -EINVAL;
 
-	if (flags & IORING_NOP_INJECT_RESULT)
+	if (nop->flags & IORING_NOP_INJECT_RESULT)
 		nop->result = READ_ONCE(sqe->len);
 	else
 		nop->result = 0;
+	if (nop->flags & IORING_NOP_FIXED_FILE)
+		nop->fd = READ_ONCE(sqe->fd);
+	if (nop->flags & IORING_NOP_FIXED_BUFFER)
+		nop->buffer = READ_ONCE(sqe->buf_index);
 	return 0;
 }
 
 int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop);
+	int ret = nop->result;
+
+	if (nop->flags & IORING_NOP_FILE) {
+		if (nop->flags & IORING_NOP_FIXED_FILE) {
+			req->file = io_file_get_fixed(req, nop->fd, issue_flags);
+			req->flags |= REQ_F_FIXED_FILE;
+		} else {
+			req->file = io_file_get_normal(req, nop->fd);
+		}
+		if (!req->file) {
+			ret = -EBADF;
+			goto done;
+		}
+	}
+	if (nop->flags & IORING_NOP_FIXED_BUFFER) {
+		struct io_ring_ctx *ctx = req->ctx;
+		struct io_mapped_ubuf *imu;
+		int idx;
 
-	if (nop->result < 0)
+		ret = -EFAULT;
+		io_ring_submit_lock(ctx, issue_flags);
+		if (nop->buffer < ctx->nr_user_bufs) {
+			idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
+			imu = READ_ONCE(ctx->user_bufs[idx]);
+			io_req_set_rsrc_node(req, ctx);
+			ret = 0;
+		}
+		io_ring_submit_unlock(ctx, issue_flags);
+	}
+done:
+	if (ret < 0)
 		req_set_fail(req);
 	io_req_set_res(req, nop->result, 0);
 	return IOU_OK;
-- 
cgit v1.2.3


From 128d333f0dff2fbe41c546581c6f151e9d68cd4c Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daehojeong@google.com>
Date: Thu, 17 Oct 2024 10:31:53 -0700
Subject: f2fs: introduce device aliasing file

F2FS should understand how the device aliasing file works and support
deleting the file after use. A device aliasing file can be created by
mkfs.f2fs tool and it can map the whole device with an extent, not
using node blocks. The file space should be pinned and normally used for
read-only usages.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 Documentation/filesystems/f2fs.rst | 44 +++++++++++++++++++++++++++++++++++++
 fs/f2fs/data.c                     |  5 +++++
 fs/f2fs/extent_cache.c             | 45 +++++++++++++++++++++++++++++++++++++-
 fs/f2fs/f2fs.h                     |  5 +++++
 fs/f2fs/file.c                     | 45 ++++++++++++++++++++++++++++++++++----
 fs/f2fs/inode.c                    | 19 +++++++++++++++-
 fs/f2fs/super.c                    |  4 ++++
 fs/f2fs/sysfs.c                    |  2 ++
 include/uapi/linux/f2fs.h          |  1 +
 9 files changed, 164 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index 68a0885fb5e6..fb7d2ee022bc 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -943,3 +943,47 @@ NVMe Zoned Namespace devices
   can start before the zone-capacity and span across zone-capacity boundary.
   Such spanning segments are also considered as usable segments. All blocks
   past the zone-capacity are considered unusable in these segments.
+
+Device aliasing feature
+-----------------------
+
+f2fs can utilize a special file called a "device aliasing file." This file allows
+the entire storage device to be mapped with a single, large extent, not using
+the usual f2fs node structures. This mapped area is pinned and primarily intended
+for holding the space.
+
+Essentially, this mechanism allows a portion of the f2fs area to be temporarily
+reserved and used by another filesystem or for different purposes. Once that
+external usage is complete, the device aliasing file can be deleted, releasing
+the reserved space back to F2FS for its own use.
+
+<use-case>
+
+# ls /dev/vd*
+/dev/vdb (32GB) /dev/vdc (32GB)
+# mkfs.ext4 /dev/vdc
+# mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb
+# mount /dev/vdb /mnt/f2fs
+# ls -l /mnt/f2fs
+vdc.file
+# df -h
+/dev/vdb                            64G   33G   32G  52% /mnt/f2fs
+
+# mount -o loop /dev/vdc /mnt/ext4
+# df -h
+/dev/vdb                            64G   33G   32G  52% /mnt/f2fs
+/dev/loop7                          32G   24K   30G   1% /mnt/ext4
+# umount /mnt/ext4
+
+# f2fs_io getflags /mnt/f2fs/vdc.file
+get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable
+# f2fs_io setflags noimmutable /mnt/f2fs/vdc.file
+get a flag on noimmutable ret=0, flags=800010
+set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable
+# rm /mnt/f2fs/vdc.file
+# df -h
+/dev/vdb                            64G  753M   64G   2% /mnt/f2fs
+
+So, the key idea is, user can do any file operations on /dev/vdc, and
+reclaim the space after the use, while the space is counted as /data.
+That doesn't require modifying partition size and filesystem format.
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 94f7b084f601..90fa8ab85194 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3441,6 +3441,11 @@ restart:
 
 	if (!f2fs_lookup_read_extent_cache_block(inode, index,
 						 &dn.data_blkaddr)) {
+		if (IS_DEVICE_ALIASING(inode)) {
+			err = -ENODATA;
+			goto out;
+		}
+
 		if (locked) {
 			err = f2fs_reserve_block(&dn, index);
 			goto out;
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 62ac440d9416..019c1f7b7fa5 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -24,6 +24,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
 	struct extent_info ei;
+	int devi;
 
 	get_read_extent_info(&ei, i_ext);
 
@@ -38,7 +39,36 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage)
 			  ei.blk, ei.fofs, ei.len);
 		return false;
 	}
-	return true;
+
+	if (!IS_DEVICE_ALIASING(inode))
+		return true;
+
+	for (devi = 0; devi < sbi->s_ndevs; devi++) {
+		if (FDEV(devi).start_blk != ei.blk ||
+				FDEV(devi).end_blk != ei.blk + ei.len - 1)
+			continue;
+
+		if (devi == 0) {
+			f2fs_warn(sbi,
+			    "%s: inode (ino=%lx) is an alias of meta device",
+			    __func__, inode->i_ino);
+			return false;
+		}
+
+		if (bdev_is_zoned(FDEV(devi).bdev)) {
+			f2fs_warn(sbi,
+			    "%s: device alias inode (ino=%lx)'s extent info "
+			    "[%u, %u, %u] maps to zoned block device",
+			    __func__, inode->i_ino, ei.blk, ei.fofs, ei.len);
+			return false;
+		}
+		return true;
+	}
+
+	f2fs_warn(sbi, "%s: device alias inode (ino=%lx)'s extent info "
+			"[%u, %u, %u] is inconsistent w/ any devices",
+			__func__, inode->i_ino, ei.blk, ei.fofs, ei.len);
+	return false;
 }
 
 static void __set_extent_info(struct extent_info *ei,
@@ -76,6 +106,9 @@ static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
 
 static bool __may_extent_tree(struct inode *inode, enum extent_type type)
 {
+	if (IS_DEVICE_ALIASING(inode) && type == EX_READ)
+		return true;
+
 	/*
 	 * for recovered files during mount do not create extents
 	 * if shrinker is not registered.
@@ -401,6 +434,11 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage)
 	if (atomic_read(&et->node_cnt) || !ei.len)
 		goto skip;
 
+	if (IS_DEVICE_ALIASING(inode)) {
+		et->largest = ei;
+		goto skip;
+	}
+
 	en = __attach_extent_node(sbi, et, &ei, NULL,
 				&et->root.rb_root.rb_node, true);
 	if (en) {
@@ -463,6 +501,11 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
 		goto out;
 	}
 
+	if (IS_DEVICE_ALIASING(inode)) {
+		ret = false;
+		goto out;
+	}
+
 	en = __lookup_extent_node(&et->root, et->cached_en, pgofs);
 	if (!en)
 		goto out;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index dd47dbf6d9e6..f3ef4dc50992 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -213,6 +213,7 @@ struct f2fs_mount_info {
 #define F2FS_FEATURE_CASEFOLD			0x00001000
 #define F2FS_FEATURE_COMPRESSION		0x00002000
 #define F2FS_FEATURE_RO				0x00004000
+#define F2FS_FEATURE_DEVICE_ALIAS		0x00008000
 
 #define __F2FS_HAS_FEATURE(raw_super, mask)				\
 	((raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -3046,6 +3047,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
 #define F2FS_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
 #define F2FS_PROJINHERIT_FL		0x20000000 /* Create with parents projid */
 #define F2FS_CASEFOLD_FL		0x40000000 /* Casefolded file */
+#define F2FS_DEVICE_ALIAS_FL		0x80000000 /* File for aliasing a device */
 
 #define F2FS_QUOTA_DEFAULT_FL		(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL)
 
@@ -3061,6 +3063,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
 /* Flags that are appropriate for non-directories/regular files. */
 #define F2FS_OTHER_FLMASK	(F2FS_NODUMP_FL | F2FS_NOATIME_FL)
 
+#define IS_DEVICE_ALIASING(inode)	(F2FS_I(inode)->i_flags & F2FS_DEVICE_ALIAS_FL)
+
 static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
 {
 	if (S_ISDIR(mode))
@@ -4526,6 +4530,7 @@ F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
 F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
 F2FS_FEATURE_FUNCS(compression, COMPRESSION);
 F2FS_FEATURE_FUNCS(readonly, RO);
+F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS);
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 348ef73bf8dd..75a8b22da664 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -725,6 +725,11 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
 
 	trace_f2fs_truncate_blocks_enter(inode, from);
 
+	if (IS_DEVICE_ALIASING(inode) && from) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
 	free_from = (pgoff_t)F2FS_BLK_ALIGN(from);
 
 	if (free_from >= max_file_blocks(inode))
@@ -739,6 +744,21 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
 		goto out;
 	}
 
+	if (IS_DEVICE_ALIASING(inode)) {
+		struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ];
+		struct extent_info ei = et->largest;
+		unsigned int i;
+
+		for (i = 0; i < ei.len; i++)
+			f2fs_invalidate_blocks(sbi, ei.blk + i);
+
+		dec_valid_block_count(sbi, inode, ei.len);
+		f2fs_update_time(sbi, REQ_TIME);
+
+		f2fs_put_page(ipage, 1);
+		goto out;
+	}
+
 	if (f2fs_has_inline_data(inode)) {
 		f2fs_truncate_inline_inode(inode, ipage, from);
 		f2fs_put_page(ipage, 1);
@@ -774,7 +794,7 @@ free_partial:
 	/* lastly zero out the first data page */
 	if (!err)
 		err = truncate_partial_data_page(inode, from, truncate_page);
-
+out_err:
 	trace_f2fs_truncate_blocks_exit(inode, err);
 	return err;
 }
@@ -992,7 +1012,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		return -EPERM;
 
 	if ((attr->ia_valid & ATTR_SIZE)) {
-		if (!f2fs_is_compress_backend_ready(inode))
+		if (!f2fs_is_compress_backend_ready(inode) ||
+				IS_DEVICE_ALIASING(inode))
 			return -EOPNOTSUPP;
 		if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) &&
 			!IS_ALIGNED(attr->ia_size,
@@ -1861,7 +1882,7 @@ static long f2fs_fallocate(struct file *file, int mode,
 		return -EIO;
 	if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode)))
 		return -ENOSPC;
-	if (!f2fs_is_compress_backend_ready(inode))
+	if (!f2fs_is_compress_backend_ready(inode) || IS_DEVICE_ALIASING(inode))
 		return -EOPNOTSUPP;
 
 	/* f2fs only support ->fallocate for regular file */
@@ -3297,6 +3318,9 @@ int f2fs_pin_file_control(struct inode *inode, bool inc)
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
+	if (IS_DEVICE_ALIASING(inode))
+		return -EINVAL;
+
 	if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) {
 		f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials",
 			  __func__, inode->i_ino, fi->i_gc_failures);
@@ -3327,6 +3351,9 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
 	if (f2fs_readonly(sbi->sb))
 		return -EROFS;
 
+	if (!pin && IS_DEVICE_ALIASING(inode))
+		return -EOPNOTSUPP;
+
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
@@ -3392,6 +3419,12 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg)
 	return put_user(pin, (u32 __user *)arg);
 }
 
+static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg)
+{
+	return put_user(IS_DEVICE_ALIASING(file_inode(filp)) ? 1 : 0,
+			(u32 __user *)arg);
+}
+
 int f2fs_precache_extents(struct inode *inode)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -4491,6 +4524,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_decompress_file(filp);
 	case F2FS_IOC_COMPRESS_FILE:
 		return f2fs_ioc_compress_file(filp);
+	case F2FS_IOC_GET_DEV_ALIAS_FILE:
+		return f2fs_ioc_get_dev_alias_file(filp, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -4766,7 +4801,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
 	else
 		return 0;
 
-	map.m_may_create = true;
+	if (!IS_DEVICE_ALIASING(inode))
+		map.m_may_create = true;
 	if (dio) {
 		map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi,
 						inode->i_write_hint);
@@ -5203,6 +5239,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case F2FS_IOC_SET_COMPRESS_OPTION:
 	case F2FS_IOC_DECOMPRESS_FILE:
 	case F2FS_IOC_COMPRESS_FILE:
+	case F2FS_IOC_GET_DEV_ALIAS_FILE:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 10780e37fc7b..282fd320bdb3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -372,6 +372,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
 		return false;
 	}
 
+	if (IS_DEVICE_ALIASING(inode)) {
+		if (!f2fs_sb_has_device_alias(sbi)) {
+			f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but the feature is off",
+				  __func__, inode->i_ino);
+			return false;
+		}
+		if (!f2fs_is_pinned_file(inode)) {
+			f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but is not pinned",
+				  __func__, inode->i_ino);
+			return false;
+		}
+	}
+
 	return true;
 }
 
@@ -825,7 +838,8 @@ void f2fs_evict_inode(struct inode *inode)
 	f2fs_bug_on(sbi, get_dirty_pages(inode));
 	f2fs_remove_dirty_inode(inode);
 
-	f2fs_destroy_extent_tree(inode);
+	if (!IS_DEVICE_ALIASING(inode))
+		f2fs_destroy_extent_tree(inode);
 
 	if (inode->i_nlink || is_bad_inode(inode))
 		goto no_delete;
@@ -881,6 +895,9 @@ retry:
 		goto retry;
 	}
 
+	if (IS_DEVICE_ALIASING(inode))
+		f2fs_destroy_extent_tree(inode);
+
 	if (err) {
 		f2fs_update_inode_page(inode);
 		if (dquot_initialize_needed(inode))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8d4ecb2e855e..aa14c8fce7d9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -834,6 +834,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 			set_opt(sbi, READ_EXTENT_CACHE);
 			break;
 		case Opt_noextent_cache:
+			if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) {
+				f2fs_err(sbi, "device aliasing requires extent cache");
+				return -EINVAL;
+			}
 			clear_opt(sbi, READ_EXTENT_CACHE);
 			break;
 		case Opt_noinline_data:
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index c56e8c873935..e51304bc65ea 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -1313,6 +1313,7 @@ F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM);
 F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD);
 F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION);
 F2FS_SB_FEATURE_RO_ATTR(readonly, RO);
+F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS);
 
 static struct attribute *f2fs_sb_feat_attrs[] = {
 	ATTR_LIST(sb_encryption),
@@ -1329,6 +1330,7 @@ static struct attribute *f2fs_sb_feat_attrs[] = {
 	ATTR_LIST(sb_casefold),
 	ATTR_LIST(sb_compression),
 	ATTR_LIST(sb_readonly),
+	ATTR_LIST(sb_device_alias),
 	NULL,
 };
 ATTRIBUTE_GROUPS(f2fs_sb_feat);
diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h
index 955d440be104..f7aaf8d23e20 100644
--- a/include/uapi/linux/f2fs.h
+++ b/include/uapi/linux/f2fs.h
@@ -43,6 +43,7 @@
 #define F2FS_IOC_DECOMPRESS_FILE	_IO(F2FS_IOCTL_MAGIC, 23)
 #define F2FS_IOC_COMPRESS_FILE		_IO(F2FS_IOCTL_MAGIC, 24)
 #define F2FS_IOC_START_ATOMIC_REPLACE	_IO(F2FS_IOCTL_MAGIC, 25)
+#define F2FS_IOC_GET_DEV_ALIAS_FILE	_IOR(F2FS_IOCTL_MAGIC, 26, __u32)
 
 /*
  * should be same as XFS_IOC_GOINGDOWN.
-- 
cgit v1.2.3


From b16e920a1909da6799c43000db730d8fcdcae907 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 28 Oct 2024 18:43:13 -0600
Subject: io_uring/rsrc: allow cloning at an offset

Right now buffer cloning is an all-or-nothing kind of thing - either the
whole table is cloned from a source to a destination ring, or nothing at
all.

However, it's not always desired to clone the whole thing. Allow for
the application to specify a source and destination offset, and a
number of buffers to clone. If the destination offset is non-zero, then
allocate sparse nodes upfront.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  5 ++++-
 io_uring/rsrc.c               | 32 ++++++++++++++++++++++++++------
 2 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 024745283783..cc8dbe78c126 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -719,7 +719,10 @@ enum {
 struct io_uring_clone_buffers {
 	__u32	src_fd;
 	__u32	flags;
-	__u32	pad[6];
+	__u32	src_off;
+	__u32	dst_off;
+	__u32	nr;
+	__u32	pad[3];
 };
 
 struct io_uring_buf {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 7ad91f180566..289866315ecf 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -927,10 +927,11 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 	return 0;
 }
 
-static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
+			    struct io_uring_clone_buffers *arg)
 {
+	int i, ret, nbufs, off, nr;
 	struct io_rsrc_data data;
-	int i, ret, nbufs;
 
 	/*
 	 * Drop our own lock here. We'll setup the data we need and reference
@@ -943,11 +944,29 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	nbufs = src_ctx->buf_table.nr;
 	if (!nbufs)
 		goto out_unlock;
-	ret = io_rsrc_data_alloc(&data, nbufs);
+	ret = -EINVAL;
+	if (!arg->nr)
+		arg->nr = nbufs;
+	else if (arg->nr > nbufs)
+		goto out_unlock;
+	ret = -EOVERFLOW;
+	if (check_add_overflow(arg->nr, arg->src_off, &off))
+		goto out_unlock;
+	if (off > nbufs)
+		goto out_unlock;
+	if (check_add_overflow(arg->nr, arg->dst_off, &off))
+		goto out_unlock;
+	ret = -EINVAL;
+	if (off > IORING_MAX_REG_BUFFERS)
+		goto out_unlock;
+	ret = io_rsrc_data_alloc(&data, off);
 	if (ret)
 		goto out_unlock;
 
-	for (i = 0; i < nbufs; i++) {
+	off = arg->dst_off;
+	i = arg->src_off;
+	nr = arg->nr;
+	while (nr--) {
 		struct io_rsrc_node *dst_node, *src_node;
 
 		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
@@ -963,7 +982,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 			refcount_inc(&src_node->buf->refs);
 			dst_node->buf = src_node->buf;
 		}
-		data.nodes[i] = dst_node;
+		data.nodes[off++] = dst_node;
+		i++;
 	}
 
 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
@@ -1018,7 +1038,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	file = io_uring_register_get_file(buf.src_fd, registered_src);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
-	ret = io_clone_buffers(ctx, file->private_data);
+	ret = io_clone_buffers(ctx, file->private_data, &buf);
 	if (!registered_src)
 		fput(file);
 	return ret;
-- 
cgit v1.2.3


From c1329532d5aabecf79788924941afb8a7b7c1024 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 29 Oct 2024 07:50:56 -0600
Subject: io_uring/rsrc: allow cloning with node replacements

Currently cloning a buffer table will fail if the destination already has
a table. But it should be possible to use it to replace existing elements.
Add an IORING_REGISTER_DST_REPLACE cloning flag which, if set, will allow
the destination to already have a buffer table. If that is the case,
then entries designated by offset + nr buffers will be replaced if they
already exist.

Note that it's allowed to use IORING_REGISTER_DST_REPLACE and not have
an existing table, in which case it'll work just like not having the
flag set and an empty table - it'll just assign the newly created table
for that case.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  3 +-
 io_uring/rsrc.c               | 66 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 54 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cc8dbe78c126..ce58c4590de6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -713,7 +713,8 @@ struct io_uring_clock_register {
 };
 
 enum {
-	IORING_REGISTER_SRC_REGISTERED = 1,
+	IORING_REGISTER_SRC_REGISTERED	= (1U << 0),
+	IORING_REGISTER_DST_REPLACE	= (1U << 1),
 };
 
 struct io_uring_clone_buffers {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 289866315ecf..60fa857985cb 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -930,8 +930,40 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
 			    struct io_uring_clone_buffers *arg)
 {
-	int i, ret, nbufs, off, nr;
 	struct io_rsrc_data data;
+	int i, ret, off, nr;
+	unsigned int nbufs;
+
+	/* if offsets are given, must have nr specified too */
+	if (!arg->nr && (arg->dst_off || arg->src_off))
+		return -EINVAL;
+	/* not allowed unless REPLACE is set */
+	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
+		return -EBUSY;
+
+	nbufs = READ_ONCE(src_ctx->buf_table.nr);
+	if (!arg->nr)
+		arg->nr = nbufs;
+	else if (arg->nr > nbufs)
+		return -EINVAL;
+	else if (arg->nr > IORING_MAX_REG_BUFFERS)
+		return -EINVAL;
+	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
+		return -EOVERFLOW;
+
+	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
+	if (ret)
+		return ret;
+
+	/* Fill entries in data from dst that won't overlap with src */
+	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
+		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+
+		if (src_node) {
+			data.nodes[i] = src_node;
+			src_node->refs++;
+		}
+	}
 
 	/*
 	 * Drop our own lock here. We'll setup the data we need and reference
@@ -954,14 +986,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		goto out_unlock;
 	if (off > nbufs)
 		goto out_unlock;
-	if (check_add_overflow(arg->nr, arg->dst_off, &off))
-		goto out_unlock;
-	ret = -EINVAL;
-	if (off > IORING_MAX_REG_BUFFERS)
-		goto out_unlock;
-	ret = io_rsrc_data_alloc(&data, off);
-	if (ret)
-		goto out_unlock;
 
 	off = arg->dst_off;
 	i = arg->src_off;
@@ -989,6 +1013,20 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
 	mutex_unlock(&src_ctx->uring_lock);
 	mutex_lock(&ctx->uring_lock);
+
+	/*
+	 * If asked for replace, put the old table. data->nodes[] holds both
+	 * old and new nodes at this point.
+	 */
+	if (arg->flags & IORING_REGISTER_DST_REPLACE)
+		io_rsrc_data_free(&ctx->buf_table);
+
+	/*
+	 * ctx->buf_table should be empty now - either the contents are being
+	 * replaced and we just freed the table, or someone raced setting up
+	 * a buffer table while the clone was happening. If not empty, fall
+	 * through to failure handling.
+	 */
 	if (!ctx->buf_table.nr) {
 		ctx->buf_table = data;
 		return 0;
@@ -998,14 +1036,14 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	mutex_lock(&src_ctx->uring_lock);
 	/* someone raced setting up buffers, dump ours */
 	ret = -EBUSY;
-	i = nbufs;
 out_put_free:
+	i = data.nr;
 	while (i--) {
 		io_buffer_unmap(src_ctx, data.nodes[i]);
 		kfree(data.nodes[i]);
 	}
-	io_rsrc_data_free(&data);
 out_unlock:
+	io_rsrc_data_free(&data);
 	mutex_unlock(&src_ctx->uring_lock);
 	mutex_lock(&ctx->uring_lock);
 	return ret;
@@ -1025,12 +1063,12 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	struct file *file;
 	int ret;
 
-	if (ctx->buf_table.nr)
-		return -EBUSY;
 	if (copy_from_user(&buf, arg, sizeof(buf)))
 		return -EFAULT;
-	if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
+	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
 		return -EINVAL;
+	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
+		return -EBUSY;
 	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 01ee194d1aba1202f0926d5047a2a4cf84d0e45d Mon Sep 17 00:00:00 2001
From: hexue <xue01.he@samsung.com>
Date: Fri, 1 Nov 2024 17:19:57 +0800
Subject: io_uring: add support for hybrid IOPOLL

A new hybrid poll is implemented on the io_uring layer. Once an IO is
issued, it will not poll immediately, but rather block first and re-run
before the IO completes, then poll to reap IO. While this poll method could
be a suboptimal solution when running on a single thread, it offers
performance lower than regular polling but higher than IRQ, and CPU
utilization is also lower than polling.

To use hybrid polling, the ring must be set up with both the
IORING_SETUP_IOPOLL and IORING_SETUP_HYBRID_IOPOLL flags set. Hybrid
polling has the same restrictions as IOPOLL, in that commands must
explicitly support it.

Signed-off-by: hexue <xue01.he@samsung.com>
Link: https://lore.kernel.org/r/20241101091957.564220-2-xue01.he@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 19 ++++++++-
 include/uapi/linux/io_uring.h  |  3 ++
 io_uring/io_uring.c            |  8 +++-
 io_uring/rw.c                  | 92 +++++++++++++++++++++++++++++++++++++-----
 4 files changed, 108 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 77fd508d043a..d52fec533c51 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -298,6 +298,11 @@ struct io_ring_ctx {
 		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
 		 */
 		struct hlist_head	cancelable_uring_cmd;
+		/*
+		 * For Hybrid IOPOLL, runtime in hybrid polling, without
+		 * scheduling time
+		 */
+		u64					hybrid_poll_time;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
@@ -449,6 +454,7 @@ enum {
 	REQ_F_LINK_TIMEOUT_BIT,
 	REQ_F_NEED_CLEANUP_BIT,
 	REQ_F_POLLED_BIT,
+	REQ_F_HYBRID_IOPOLL_STATE_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_BUFFER_RING_BIT,
 	REQ_F_REISSUE_BIT,
@@ -507,6 +513,8 @@ enum {
 	REQ_F_NEED_CLEANUP	= IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
 	/* already went through poll handler */
 	REQ_F_POLLED		= IO_REQ_FLAG(REQ_F_POLLED_BIT),
+	/* every req only blocks once in hybrid poll */
+	REQ_F_IOPOLL_STATE        = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT),
 	/* buffer already selected */
 	REQ_F_BUFFER_SELECTED	= IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
 	/* buffer selected from ring, needs commit */
@@ -639,8 +647,15 @@ struct io_kiocb {
 	atomic_t			refs;
 	bool				cancel_seq_set;
 	struct io_task_work		io_task_work;
-	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-	struct hlist_node		hash_node;
+	union {
+		/*
+		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
+		 * poll
+		 */
+		struct hlist_node	hash_node;
+		/* For IOPOLL setup queues, with hybrid polling */
+		u64                     iopoll_start;
+	};
 	/* internal polling, see IORING_FEAT_FAST_POLL */
 	struct async_poll		*apoll;
 	/* opcode allocated if it needs to store data for async defer */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ce58c4590de6..47977a5c65f5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit {
  */
 #define IORING_SETUP_NO_SQARRAY		(1U << 16)
 
+/* Use hybrid poll in iopoll process */
+#define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 44a772013c09..f08ea7fd5998 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -307,6 +307,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		goto err;
 
 	ctx->flags = p->flags;
+	ctx->hybrid_poll_time = LLONG_MAX;
 	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 	init_waitqueue_head(&ctx->sqo_sq_wait);
 	INIT_LIST_HEAD(&ctx->sqd_list);
@@ -3630,6 +3631,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
 		static_branch_inc(&io_key_has_sqarray);
 
+	/* HYBRID_IOPOLL only valid with IOPOLL */
+	if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) ==
+			IORING_SETUP_HYBRID_IOPOLL)
+		return -EINVAL;
+
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
 	    !(ctx->flags & IORING_SETUP_SQPOLL))
@@ -3785,7 +3791,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 30448f343c7f..1ea6be2ccc90 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -817,6 +817,11 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 		req->iopoll_completed = 0;
+		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
+			/* make sure every req only blocks once*/
+			req->flags &= ~REQ_F_IOPOLL_STATE;
+			req->iopoll_start = ktime_get_ns();
+		}
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
 			return -EINVAL;
@@ -1115,6 +1120,78 @@ void io_rw_fail(struct io_kiocb *req)
 	io_req_set_res(req, res, req->cqe.flags);
 }
 
+static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob,
+				unsigned int poll_flags)
+{
+	struct file *file = req->file;
+
+	if (req->opcode == IORING_OP_URING_CMD) {
+		struct io_uring_cmd *ioucmd;
+
+		ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+		return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
+	} else {
+		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+
+		return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+	}
+}
+
+static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	struct hrtimer_sleeper timer;
+	enum hrtimer_mode mode;
+	ktime_t kt;
+	u64 sleep_time;
+
+	if (req->flags & REQ_F_IOPOLL_STATE)
+		return 0;
+
+	if (ctx->hybrid_poll_time == LLONG_MAX)
+		return 0;
+
+	/* Using half the running time to do schedule */
+	sleep_time = ctx->hybrid_poll_time / 2;
+
+	kt = ktime_set(0, sleep_time);
+	req->flags |= REQ_F_IOPOLL_STATE;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+	set_current_state(TASK_INTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, mode);
+
+	if (timer.task)
+		io_schedule();
+
+	hrtimer_cancel(&timer.timer);
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+	return sleep_time;
+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+				struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	u64 runtime, sleep_time;
+	int ret;
+
+	sleep_time = io_hybrid_iopoll_delay(ctx, req);
+	ret = io_uring_classic_poll(req, iob, poll_flags);
+	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
+
+	/*
+	 * Use minimum sleep time if we're polling devices with different
+	 * latencies. We could get more completions from the faster ones.
+	 */
+	if (ctx->hybrid_poll_time > runtime)
+		ctx->hybrid_poll_time = runtime;
+
+	return ret;
+}
+
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
@@ -1131,7 +1208,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 
 	wq_list_for_each(pos, start, &ctx->iopoll_list) {
 		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
-		struct file *file = req->file;
 		int ret;
 
 		/*
@@ -1142,17 +1218,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		if (req->opcode == IORING_OP_URING_CMD) {
-			struct io_uring_cmd *ioucmd;
-
-			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-			ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
-								poll_flags);
-		} else {
-			struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
+			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+		else
+			ret = io_uring_classic_poll(req, &iob, poll_flags);
 
-			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
-		}
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
-- 
cgit v1.2.3


From a1afb959add1fad43cb337448c244ed70bac3109 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 30 Oct 2024 09:11:56 +0100
Subject: dpll: add clock quality level attribute and op

In order to allow a driver to expose the quality level of the clock it
is running, introduce a new netlink attr with an enum to carry it to
userspace. Also, introduce an op the dpll netlink code calls into the
driver to obtain the value.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20241030081157.966604-2-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/dpll.yaml | 41 +++++++++++++++++++++++++++++++++++
 drivers/dpll/dpll_netlink.c           | 24 ++++++++++++++++++++
 include/linux/dpll.h                  |  4 ++++
 include/uapi/linux/dpll.h             | 24 ++++++++++++++++++++
 4 files changed, 93 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index f2894ca35de8..8feefeae5376 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -85,6 +85,36 @@ definitions:
           This may happen for example if dpll device was previously
           locked on an input pin of type PIN_TYPE_SYNCE_ETH_PORT.
     render-max: true
+  -
+    type: enum
+    name: clock-quality-level
+    doc: |
+      level of quality of a clock device. This mainly applies when
+      the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER.
+      The current list is defined according to the table 11-7 contained
+      in ITU-T G.8264/Y.1364 document. One may extend this list freely
+      by other ITU-T defined clock qualities, or different ones defined
+      by another standardization body (for those, please use
+      different prefix).
+    entries:
+      -
+        name: itu-opt1-prc
+        value: 1
+      -
+        name: itu-opt1-ssu-a
+      -
+        name: itu-opt1-ssu-b
+      -
+        name: itu-opt1-eec1
+      -
+        name: itu-opt1-prtc
+      -
+        name: itu-opt1-eprtc
+      -
+        name: itu-opt1-eeec
+      -
+        name: itu-opt1-eprc
+    render-max: true
   -
     type: const
     name: temp-divider
@@ -252,6 +282,17 @@ attribute-sets:
         name: lock-status-error
         type: u32
         enum: lock-status-error
+      -
+        name: clock-quality-level
+        type: u32
+        enum: clock-quality-level
+        multi-attr: true
+        doc: |
+          Level of quality of a clock device. This mainly applies when
+          the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. This could
+          be put to message multiple times to indicate possible parallel
+          quality levels (e.g. one specified by ITU option 1 and another
+          one specified by option 2).
   -
     name: pin
     enum-name: dpll_a_pin
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index fc0280dcddd1..c130f87147fa 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -169,6 +169,27 @@ dpll_msg_add_temp(struct sk_buff *msg, struct dpll_device *dpll,
 	return 0;
 }
 
+static int
+dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll,
+				 struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) = { 0 };
+	enum dpll_clock_quality_level ql;
+	int ret;
+
+	if (!ops->clock_quality_level_get)
+		return 0;
+	ret = ops->clock_quality_level_get(dpll, dpll_priv(dpll), qls, extack);
+	if (ret)
+		return ret;
+	for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX)
+		if (nla_put_u32(msg, DPLL_A_CLOCK_QUALITY_LEVEL, ql))
+			return -EMSGSIZE;
+
+	return 0;
+}
+
 static int
 dpll_msg_add_pin_prio(struct sk_buff *msg, struct dpll_pin *pin,
 		      struct dpll_pin_ref *ref,
@@ -557,6 +578,9 @@ dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg,
 	if (ret)
 		return ret;
 	ret = dpll_msg_add_lock_status(msg, dpll, extack);
+	if (ret)
+		return ret;
+	ret = dpll_msg_add_clock_quality_level(msg, dpll, extack);
 	if (ret)
 		return ret;
 	ret = dpll_msg_add_mode(msg, dpll, extack);
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 81f7b623d0ba..5e4f9ab1cf75 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -26,6 +26,10 @@ struct dpll_device_ops {
 			       struct netlink_ext_ack *extack);
 	int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv,
 			s32 *temp, struct netlink_ext_ack *extack);
+	int (*clock_quality_level_get)(const struct dpll_device *dpll,
+				       void *dpll_priv,
+				       unsigned long *qls,
+				       struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_ops {
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index b0654ade7b7e..2b7ec2da4bcc 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -79,6 +79,29 @@ enum dpll_lock_status_error {
 	DPLL_LOCK_STATUS_ERROR_MAX = (__DPLL_LOCK_STATUS_ERROR_MAX - 1)
 };
 
+/**
+ * enum dpll_clock_quality_level - level of quality of a clock device. This
+ *   mainly applies when the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. The
+ *   current list is defined according to the table 11-7 contained in ITU-T
+ *   G.8264/Y.1364 document. One may extend this list freely by other ITU-T
+ *   defined clock qualities, or different ones defined by another
+ *   standardization body (for those, please use different prefix).
+ */
+enum dpll_clock_quality_level {
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_PRC = 1,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_SSU_A,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_SSU_B,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EEC1,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_PRTC,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRTC,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EEEC,
+	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRC,
+
+	/* private: */
+	__DPLL_CLOCK_QUALITY_LEVEL_MAX,
+	DPLL_CLOCK_QUALITY_LEVEL_MAX = (__DPLL_CLOCK_QUALITY_LEVEL_MAX - 1)
+};
+
 #define DPLL_TEMP_DIVIDER	1000
 
 /**
@@ -180,6 +203,7 @@ enum dpll_a {
 	DPLL_A_TEMP,
 	DPLL_A_TYPE,
 	DPLL_A_LOCK_STATUS_ERROR,
+	DPLL_A_CLOCK_QUALITY_LEVEL,
 
 	__DPLL_A_MAX,
 	DPLL_A_MAX = (__DPLL_A_MAX - 1)
-- 
cgit v1.2.3


From 43d3487035e9a86fad952de4240a518614240d43 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Tue, 29 Oct 2024 15:55:35 -0600
Subject: UAPI: ethtool: Use __struct_group() in struct ethtool_link_settings

Use the `__struct_group()` helper to create a new tagged
`struct ethtool_link_settings_hdr`. This structure groups together
all the members of the flexible `struct ethtool_link_settings`
except the flexible array. As a result, the array is effectively
separated from the rest of the members without modifying the memory
layout of the flexible structure.

This new tagged struct will be used to fix problematic declarations
of middle-flex-arrays in composite structs[1].

[1] https://git.kernel.org/linus/d88cabfd9abc

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://patch.msgid.link/9e9fb0bd72e5ba1e916acbb4995b1e358b86a689.1730238285.git.gustavoars@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index c405ed63acfa..fc1f54b065f9 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2511,21 +2511,24 @@ enum ethtool_reset_flags {
  *	autonegotiation; 0 if unknown or not applicable.  Read-only.
  */
 struct ethtool_link_settings {
-	__u32	cmd;
-	__u32	speed;
-	__u8	duplex;
-	__u8	port;
-	__u8	phy_address;
-	__u8	autoneg;
-	__u8	mdio_support;
-	__u8	eth_tp_mdix;
-	__u8	eth_tp_mdix_ctrl;
-	__s8	link_mode_masks_nwords;
-	__u8	transceiver;
-	__u8	master_slave_cfg;
-	__u8	master_slave_state;
-	__u8	rate_matching;
-	__u32	reserved[7];
+	/* New members MUST be added within the __struct_group() macro below. */
+	__struct_group(ethtool_link_settings_hdr, hdr, /* no attrs */,
+		__u32	cmd;
+		__u32	speed;
+		__u8	duplex;
+		__u8	port;
+		__u8	phy_address;
+		__u8	autoneg;
+		__u8	mdio_support;
+		__u8	eth_tp_mdix;
+		__u8	eth_tp_mdix_ctrl;
+		__s8	link_mode_masks_nwords;
+		__u8	transceiver;
+		__u8	master_slave_cfg;
+		__u8	master_slave_state;
+		__u8	rate_matching;
+		__u32	reserved[7];
+	);
 	__u32	link_mode_masks[];
 	/* layout of link_mode_masks fields:
 	 * __u32 map_supported[link_mode_masks_nwords];
-- 
cgit v1.2.3


From 9d2fe9cd02ca5f1e70a7eff0262fb3668a27db0c Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Fri, 1 Nov 2024 07:46:29 +0000
Subject: iio: Add channel type for attention

Add a new channel type representing the user's attention state to the
system. This usually means whether the user is looking at the screen or
not.

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Link: https://patch.msgid.link/20241101-hpd-v3-3-e9c80b7c7164@chromium.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/ABI/testing/sysfs-bus-iio | 8 ++++++++
 drivers/iio/industrialio-core.c         | 1 +
 include/uapi/linux/iio/types.h          | 1 +
 tools/iio/iio_event_monitor.c           | 2 ++
 4 files changed, 12 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 9641dd2a1e4b..f83bd6829285 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -2363,3 +2363,11 @@ KernelVersion:	6.10
 Contact:	linux-iio@vger.kernel.org
 Description:
 		The value of current sense resistor in Ohms.
+
+What:		/sys/.../iio:deviceX/in_attention_input
+KernelVersion:	6.13
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Value representing the user's attention to the system expressed
+		as a percentage. This usually means whether the user is
+		looking at the screen or not.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 4c543490e56c..a2117ad1337d 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -95,6 +95,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_DELTA_VELOCITY] = "deltavelocity",
 	[IIO_COLORTEMP] = "colortemp",
 	[IIO_CHROMATICITY] = "chromaticity",
+	[IIO_ATTENTION] = "attention",
 };
 
 static const char * const iio_modifier_names[] = {
diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
index f2e0b2d50e6b..12886d4465e4 100644
--- a/include/uapi/linux/iio/types.h
+++ b/include/uapi/linux/iio/types.h
@@ -51,6 +51,7 @@ enum iio_chan_type {
 	IIO_DELTA_VELOCITY,
 	IIO_COLORTEMP,
 	IIO_CHROMATICITY,
+	IIO_ATTENTION,
 };
 
 enum iio_modifier {
diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c
index d0b8e484826d..cccf62ea2b8f 100644
--- a/tools/iio/iio_event_monitor.c
+++ b/tools/iio/iio_event_monitor.c
@@ -63,6 +63,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_DELTA_VELOCITY] = "deltavelocity",
 	[IIO_COLORTEMP] = "colortemp",
 	[IIO_CHROMATICITY] = "chromaticity",
+	[IIO_ATTENTION] = "attention",
 };
 
 static const char * const iio_ev_type_text[] = {
@@ -183,6 +184,7 @@ static bool event_is_known(struct iio_event_data *event)
 	case IIO_DELTA_VELOCITY:
 	case IIO_COLORTEMP:
 	case IIO_CHROMATICITY:
+	case IIO_ATTENTION:
 		break;
 	default:
 		return false;
-- 
cgit v1.2.3


From 690e50dd69ee48e43e0f7c42396487da1b81be14 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 3 Nov 2024 08:53:14 -0800
Subject: tools: ynl-gen: de-kdocify enums with no doc for entries

Sometimes the names of the enum entries are self-explanatory
or come from standards. Forcing authors to write trivial kdoc
for each of such entries seems unreasonable, but kdoc would
complain about undocumented entries.

Detect enums which only have documentation for the entire
type and no documentation for entries. Render their doc
as a plain comment.

Link: https://patch.msgid.link/20241103165314.1631237-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/dpll.h   | 14 +++++++-------
 tools/net/ynl/lib/nlspec.py |  3 +++
 tools/net/ynl/ynl-gen-c.py  | 14 +++++++++-----
 3 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index 2b7ec2da4bcc..bf97d4b6d51f 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -79,13 +79,13 @@ enum dpll_lock_status_error {
 	DPLL_LOCK_STATUS_ERROR_MAX = (__DPLL_LOCK_STATUS_ERROR_MAX - 1)
 };
 
-/**
- * enum dpll_clock_quality_level - level of quality of a clock device. This
- *   mainly applies when the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. The
- *   current list is defined according to the table 11-7 contained in ITU-T
- *   G.8264/Y.1364 document. One may extend this list freely by other ITU-T
- *   defined clock qualities, or different ones defined by another
- *   standardization body (for those, please use different prefix).
+/*
+ * level of quality of a clock device. This mainly applies when the dpll
+ * lock-status is DPLL_LOCK_STATUS_HOLDOVER. The current list is defined
+ * according to the table 11-7 contained in ITU-T G.8264/Y.1364 document. One
+ * may extend this list freely by other ITU-T defined clock qualities, or
+ * different ones defined by another standardization body (for those, please
+ * use different prefix).
  */
 enum dpll_clock_quality_level {
 	DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_PRC = 1,
diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py
index b6d6f8aef423..a745739655ad 100644
--- a/tools/net/ynl/lib/nlspec.py
+++ b/tools/net/ynl/lib/nlspec.py
@@ -131,6 +131,9 @@ class SpecEnumSet(SpecElement):
     def has_doc(self):
         if 'doc' in self.yaml:
             return True
+        return self.has_entry_doc()
+
+    def has_entry_doc(self):
         for entry in self.entries.values():
             if entry.has_doc():
                 return True
diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py
index aa22eb092475..c48b69071111 100755
--- a/tools/net/ynl/ynl-gen-c.py
+++ b/tools/net/ynl/ynl-gen-c.py
@@ -2437,11 +2437,15 @@ def render_uapi(family, cw):
             enum = family.consts[const['name']]
 
             if enum.has_doc():
-                cw.p('/**')
-                doc = ''
-                if 'doc' in enum:
-                    doc = ' - ' + enum['doc']
-                cw.write_doc_line(enum.enum_name + doc)
+                if enum.has_entry_doc():
+                    cw.p('/**')
+                    doc = ''
+                    if 'doc' in enum:
+                        doc = ' - ' + enum['doc']
+                    cw.write_doc_line(enum.enum_name + doc)
+                else:
+                    cw.p('/*')
+                    cw.write_doc_line(enum['doc'], indent=False)
                 for entry in enum.entries.values():
                     if entry.has_doc():
                         doc = '@' + entry.c_name + ': ' + entry['doc']
-- 
cgit v1.2.3


From 35890f85573c2ebbbf3491dc66f7ee2ad63055af Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:45 -0300
Subject: vfio: Remove VFIO_TYPE1_NESTING_IOMMU

This control causes the ARM SMMU drivers to choose a stage 2
implementation for the IO pagetable (vs the stage 1 usual default),
however this choice has no significant visible impact to the VFIO
user. Further qemu never implemented this and no other userspace user is
known.

The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
SMMU translation services to the guest operating system" however the rest
of the API to set the guest table pointer for the stage 1 and manage
invalidation was never completed, or at least never upstreamed, rendering
this part useless dead code.

Upstream has now settled on iommufd as the uAPI for controlling nested
translation. Choosing the stage 2 implementation should be done through
the IOMMU_HWPT_ALLOC_NEST_PARENT flag during domain allocation.

Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
enable_nesting iommu_domain_op.

Just in case some userspace is using this, continue to treat
requesting it as a NOP, but do not advertise support any more.

Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Mostafa Saleh <smostafa@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Donald Dutile <ddutile@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 ----------------
 drivers/iommu/arm/arm-smmu/arm-smmu.c       | 16 ----------------
 drivers/iommu/iommu.c                       | 10 ----------
 drivers/iommu/iommufd/vfio_compat.c         |  7 +------
 drivers/vfio/vfio_iommu_type1.c             | 12 +-----------
 include/linux/iommu.h                       |  3 ---
 include/uapi/linux/vfio.h                   |  2 +-
 7 files changed, 3 insertions(+), 63 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 737c5b882355..acf250aeb18b 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3378,21 +3378,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-	int ret = 0;
-
-	mutex_lock(&smmu_domain->init_mutex);
-	if (smmu_domain->smmu)
-		ret = -EPERM;
-	else
-		smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
-	mutex_unlock(&smmu_domain->init_mutex);
-
-	return ret;
-}
-
 static int arm_smmu_of_xlate(struct device *dev,
 			     const struct of_phandle_args *args)
 {
@@ -3514,7 +3499,6 @@ static struct iommu_ops arm_smmu_ops = {
 		.flush_iotlb_all	= arm_smmu_flush_iotlb_all,
 		.iotlb_sync		= arm_smmu_iotlb_sync,
 		.iova_to_phys		= arm_smmu_iova_to_phys,
-		.enable_nesting		= arm_smmu_enable_nesting,
 		.free			= arm_smmu_domain_free_paging,
 	}
 };
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 8321962b3714..12b173eec454 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1558,21 +1558,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
-static int arm_smmu_enable_nesting(struct iommu_domain *domain)
-{
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-	int ret = 0;
-
-	mutex_lock(&smmu_domain->init_mutex);
-	if (smmu_domain->smmu)
-		ret = -EPERM;
-	else
-		smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
-	mutex_unlock(&smmu_domain->init_mutex);
-
-	return ret;
-}
-
 static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks)
 {
@@ -1656,7 +1641,6 @@ static struct iommu_ops arm_smmu_ops = {
 		.flush_iotlb_all	= arm_smmu_flush_iotlb_all,
 		.iotlb_sync		= arm_smmu_iotlb_sync,
 		.iova_to_phys		= arm_smmu_iova_to_phys,
-		.enable_nesting		= arm_smmu_enable_nesting,
 		.set_pgtable_quirks	= arm_smmu_set_pgtable_quirks,
 		.free			= arm_smmu_domain_free,
 	}
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 83c8e617a2c5..dbd70d5a4702 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2723,16 +2723,6 @@ static int __init iommu_init(void)
 }
 core_initcall(iommu_init);
 
-int iommu_enable_nesting(struct iommu_domain *domain)
-{
-	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
-		return -EINVAL;
-	if (!domain->ops->enable_nesting)
-		return -EINVAL;
-	return domain->ops->enable_nesting(domain);
-}
-EXPORT_SYMBOL_GPL(iommu_enable_nesting);
-
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirk)
 {
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index a3ad5f0b6c59..514aacd64009 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
 	case VFIO_DMA_CC_IOMMU:
 		return iommufd_vfio_cc_iommu(ictx);
 
-	/*
-	 * This is obsolete, and to be removed from VFIO. It was an incomplete
-	 * idea that got merged.
-	 * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
-	 */
-	case VFIO_TYPE1_NESTING_IOMMU:
+	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
 		return 0;
 
 	/*
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bf391b40e576..50ebc9593c9d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -72,7 +72,6 @@ struct vfio_iommu {
 	uint64_t		pgsize_bitmap;
 	uint64_t		num_non_pinned_groups;
 	bool			v2;
-	bool			nesting;
 	bool			dirty_page_tracking;
 	struct list_head	emulated_iommu_groups;
 };
@@ -2195,12 +2194,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		goto out_free_domain;
 	}
 
-	if (iommu->nesting) {
-		ret = iommu_enable_nesting(domain->domain);
-		if (ret)
-			goto out_domain;
-	}
-
 	ret = iommu_attach_group(domain->domain, group->iommu_group);
 	if (ret)
 		goto out_domain;
@@ -2541,9 +2534,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	switch (arg) {
 	case VFIO_TYPE1_IOMMU:
 		break;
-	case VFIO_TYPE1_NESTING_IOMMU:
-		iommu->nesting = true;
-		fallthrough;
+	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
 	case VFIO_TYPE1v2_IOMMU:
 		iommu->v2 = true;
 		break;
@@ -2638,7 +2629,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
 	switch (arg) {
 	case VFIO_TYPE1_IOMMU:
 	case VFIO_TYPE1v2_IOMMU:
-	case VFIO_TYPE1_NESTING_IOMMU:
 	case VFIO_UNMAP_ALL:
 		return 1;
 	case VFIO_UPDATE_VADDR:
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index bd722f473635..c88d18d2c928 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -635,7 +635,6 @@ struct iommu_ops {
  * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE,
  *                           including no-snoop TLPs on PCIe or other platform
  *                           specific mechanisms.
- * @enable_nesting: Enable nesting
  * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
  * @free: Release the domain after use.
  */
@@ -663,7 +662,6 @@ struct iommu_domain_ops {
 				    dma_addr_t iova);
 
 	bool (*enforce_cache_coherency)(struct iommu_domain *domain);
-	int (*enable_nesting)(struct iommu_domain *domain);
 	int (*set_pgtable_quirks)(struct iommu_domain *domain,
 				  unsigned long quirks);
 
@@ -844,7 +842,6 @@ extern void iommu_group_put(struct iommu_group *group);
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
 
-int iommu_enable_nesting(struct iommu_domain *domain);
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks);
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf190..c8dbf8219c4f 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -35,7 +35,7 @@
 #define VFIO_EEH			5
 
 /* Two-stage IOMMU */
-#define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
+#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
 
 #define VFIO_SPAPR_TCE_v2_IOMMU		7
 
-- 
cgit v1.2.3


From 6912ec91828b8d7f21b393befad1c36dadbcd751 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:49 -0300
Subject: iommu/arm-smmu-v3: Support IOMMU_GET_HW_INFO via struct
 arm_smmu_hw_info

For virtualization cases the IDR/IIDR/AIDR values of the actual SMMU
instance need to be available to the VMM so it can construct an
appropriate vSMMUv3 that reflects the correct HW capabilities.

For userspace page tables these values are required to constrain the valid
values within the CD table and the IOPTEs.

The kernel does not sanitize these values. If building a VMM then
userspace is required to only forward bits into a VM that it knows it can
implement. Some bits will also require a VMM to detect if appropriate
kernel support is available such as for ATS and BTM.

Start a new file and kconfig for the advanced iommufd support. This lets
it be compiled out for kernels that are not intended to support
virtualization, and allows distros to leave it disabled until they are
shipping a matching qemu too.

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Donald Dutile <ddutile@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/5-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/Kconfig                              |  9 ++++++
 drivers/iommu/arm/arm-smmu-v3/Makefile             |  1 +
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c    | 31 +++++++++++++++++++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c        |  1 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h        |  9 ++++++
 include/uapi/linux/iommufd.h                       | 35 ++++++++++++++++++++++
 6 files changed, 86 insertions(+)
 create mode 100644 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index b3aa1f5d5321..0c9bceb1653d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -415,6 +415,15 @@ config ARM_SMMU_V3_SVA
 	  Say Y here if your system supports SVA extensions such as PCIe PASID
 	  and PRI.
 
+config ARM_SMMU_V3_IOMMUFD
+	bool "Enable IOMMUFD features for ARM SMMUv3 (EXPERIMENTAL)"
+	depends on IOMMUFD
+	help
+	  Support for IOMMUFD features intended to support virtual machines
+	  with accelerated virtual IOMMUs.
+
+	  Say Y here if you are doing development and testing on this feature.
+
 config ARM_SMMU_V3_KUNIT_TEST
 	tristate "KUnit tests for arm-smmu-v3 driver"  if !KUNIT_ALL_TESTS
 	depends on KUNIT
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile
index dc98c88b48c8..493a659cc66b 100644
--- a/drivers/iommu/arm/arm-smmu-v3/Makefile
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
 arm_smmu_v3-y := arm-smmu-v3.o
+arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_IOMMUFD) += arm-smmu-v3-iommufd.o
 arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
 arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
new file mode 100644
index 000000000000..3d2671031c9b
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+
+#include <uapi/linux/iommufd.h>
+
+#include "arm-smmu-v3.h"
+
+void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type)
+{
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct iommu_hw_info_arm_smmuv3 *info;
+	u32 __iomem *base_idr;
+	unsigned int i;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	base_idr = master->smmu->base + ARM_SMMU_IDR0;
+	for (i = 0; i <= 5; i++)
+		info->idr[i] = readl_relaxed(base_idr + i);
+	info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR);
+	info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR);
+
+	*length = sizeof(*info);
+	*type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3;
+
+	return info;
+}
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 38725810c14e..996774d461ae 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3506,6 +3506,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.identity_domain	= &arm_smmu_identity_domain,
 	.blocked_domain		= &arm_smmu_blocked_domain,
 	.capable		= arm_smmu_capable,
+	.hw_info		= arm_smmu_hw_info,
 	.domain_alloc_paging    = arm_smmu_domain_alloc_paging,
 	.domain_alloc_sva       = arm_smmu_sva_domain_alloc,
 	.domain_alloc_user	= arm_smmu_domain_alloc_user,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 06e3d88932df..66261fd5bfb2 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -81,6 +81,8 @@ struct arm_smmu_device;
 #define IIDR_REVISION			GENMASK(15, 12)
 #define IIDR_IMPLEMENTER		GENMASK(11, 0)
 
+#define ARM_SMMU_AIDR			0x1C
+
 #define ARM_SMMU_CR0			0x20
 #define CR0_ATSCHK			(1 << 4)
 #define CR0_CMDQEN			(1 << 3)
@@ -956,4 +958,11 @@ tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
 	return ERR_PTR(-ENODEV);
 }
 #endif /* CONFIG_TEGRA241_CMDQV */
+
+#if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD)
+void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type);
+#else
+#define arm_smmu_hw_info NULL
+#endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */
+
 #endif /* _ARM_SMMU_V3_H */
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 72010f71c5e4..b5c94fecb94c 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -484,15 +484,50 @@ struct iommu_hw_info_vtd {
 	__aligned_u64 ecap_reg;
 };
 
+/**
+ * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information
+ *                                   (IOMMU_HW_INFO_TYPE_ARM_SMMUV3)
+ *
+ * @flags: Must be set to 0
+ * @__reserved: Must be 0
+ * @idr: Implemented features for ARM SMMU Non-secure programming interface
+ * @iidr: Information about the implementation and implementer of ARM SMMU,
+ *        and architecture version supported
+ * @aidr: ARM SMMU architecture version
+ *
+ * For the details of @idr, @iidr and @aidr, please refer to the chapters
+ * from 6.3.1 to 6.3.6 in the SMMUv3 Spec.
+ *
+ * User space should read the underlying ARM SMMUv3 hardware information for
+ * the list of supported features.
+ *
+ * Note that these values reflect the raw HW capability, without any insight if
+ * any required kernel driver support is present. Bits may be set indicating the
+ * HW has functionality that is lacking kernel software support, such as BTM. If
+ * a VMM is using this information to construct emulated copies of these
+ * registers it should only forward bits that it knows it can support.
+ *
+ * In future, presence of required kernel support will be indicated in flags.
+ */
+struct iommu_hw_info_arm_smmuv3 {
+	__u32 flags;
+	__u32 __reserved;
+	__u32 idr[6];
+	__u32 iidr;
+	__u32 aidr;
+};
+
 /**
  * enum iommu_hw_info_type - IOMMU Hardware Info Types
  * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware
  *                           info
  * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
+ * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
  */
 enum iommu_hw_info_type {
 	IOMMU_HW_INFO_TYPE_NONE = 0,
 	IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
+	IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
 };
 
 /**
-- 
cgit v1.2.3


From 18d92bb57c39504d9da11c6ef604f58eb1d5a117 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 22 Oct 2024 18:59:08 +0300
Subject: perf/core: Add aux_pause, aux_resume, aux_start_paused

Hardware traces, such as instruction traces, can produce a vast amount of
trace data, so being able to reduce tracing to more specific circumstances
can be useful.

The ability to pause or resume tracing when another event happens can do
that.

Add ability for an event to "pause" or "resume" AUX area tracing.

Add aux_pause bit to perf_event_attr to indicate that, if the event
happens, the associated AUX area tracing should be paused. Ditto
aux_resume. Do not allow aux_pause and aux_resume to be set together.

Add aux_start_paused bit to perf_event_attr to indicate to an AUX area
event that it should start in a "paused" state.

Add aux_paused to struct hw_perf_event for AUX area events to keep track of
the "paused" state. aux_paused is initialized to aux_start_paused.

Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start()
callbacks. Call as needed, during __perf_event_output(). Add
aux_in_pause_resume to struct perf_buffer to prevent races with the NMI
handler. Pause/resume in NMI context will miss out if it coincides with
another pause/resume.

To use aux_pause or aux_resume, an event must be in a group with the AUX
area event as the group leader.

Example (requires Intel PT and tools patches also):

 $ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname
 Linux
 [ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 0.043 MB perf.data ]
 $ perf script --call-trace
 uname   30805 [000] 24001.058782799: name: 0x7ffc9c1865b0
 uname   30805 [000] 24001.058784424:  psb offs: 0
 uname   30805 [000] 24001.058784424:  cbr: 39 freq: 3904 MHz (139%)
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])        debug_smp_processor_id
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])        __x64_sys_newuname
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])            down_read
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                __cond_resched
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_add
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                    in_lock_functions
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_sub
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])            up_read
 uname   30805 [000] 24001.058784629: ([kernel.kallsyms])                preempt_count_add
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                    in_lock_functions
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                preempt_count_sub
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])            _copy_to_user
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])        syscall_exit_to_user_mode
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])            syscall_exit_work
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                perf_syscall_exit
 uname   30805 [000] 24001.058784838: ([kernel.kallsyms])                    debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                    perf_trace_buf_alloc
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_swevent_get_recursion_context
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        debug_smp_processor_id
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                    perf_tp_event
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_trace_buf_update
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            tracing_gen_ctx_irq_test
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                        perf_swevent_event
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            __perf_event_account_interrupt
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                __this_cpu_preempt_check
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                            perf_event_output_forward
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                perf_event_aux_pause
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                    ring_buffer_get
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                        __rcu_read_lock
 uname   30805 [000] 24001.058785046: ([kernel.kallsyms])                                        __rcu_read_unlock
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                    pt_event_stop
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        debug_smp_processor_id
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        debug_smp_processor_id
 uname   30805 [000] 24001.058785254: ([kernel.kallsyms])                                        native_write_msr
 uname   30805 [000] 24001.058785463: ([kernel.kallsyms])                                        native_write_msr
 uname   30805 [000] 24001.058785639: 0x0

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: James Clark <james.clark@arm.com>
Link: https://lkml.kernel.org/r/20241022155920.17511-3-adrian.hunter@intel.com
---
 include/linux/perf_event.h      | 28 +++++++++++++++
 include/uapi/linux/perf_event.h | 11 +++++-
 kernel/events/core.c            | 75 ++++++++++++++++++++++++++++++++++++++---
 kernel/events/internal.h        |  1 +
 4 files changed, 110 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..91b310052a7c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -170,6 +170,12 @@ struct hw_perf_event {
 		};
 		struct { /* aux / Intel-PT */
 			u64		aux_config;
+			/*
+			 * For AUX area events, aux_paused cannot be a state
+			 * flag because it can be updated asynchronously to
+			 * state.
+			 */
+			unsigned int	aux_paused;
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
@@ -294,6 +300,7 @@ struct perf_event_pmu_context;
 #define PERF_PMU_CAP_NO_EXCLUDE			0x0040
 #define PERF_PMU_CAP_AUX_OUTPUT			0x0080
 #define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0100
+#define PERF_PMU_CAP_AUX_PAUSE			0x0200
 
 /**
  * pmu::scope
@@ -384,6 +391,8 @@ struct pmu {
 #define PERF_EF_START	0x01		/* start the counter when adding    */
 #define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
 #define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
+#define PERF_EF_PAUSE	0x08		/* AUX area event, pause tracing */
+#define PERF_EF_RESUME	0x10		/* AUX area event, resume tracing */
 
 	/*
 	 * Adds/Removes a counter to/from the PMU, can be done inside a
@@ -423,6 +432,18 @@ struct pmu {
 	 *
 	 * ->start() with PERF_EF_RELOAD will reprogram the counter
 	 *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
+	 *
+	 * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
+	 * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
+	 * PERF_EF_RESUME.
+	 *
+	 * ->start() with PERF_EF_RESUME will start as simply as possible but
+	 * only if the counter is not otherwise stopped. Will not overlap
+	 * another ->start() with PERF_EF_RESUME nor ->stop() with
+	 * PERF_EF_PAUSE.
+	 *
+	 * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other
+	 * ->stop()/->start() invocations, just not itself.
 	 */
 	void (*start)			(struct perf_event *event, int flags);
 	void (*stop)			(struct perf_event *event, int flags);
@@ -1679,6 +1700,13 @@ static inline bool has_aux(struct perf_event *event)
 	return event->pmu->setup_aux;
 }
 
+static inline bool has_aux_action(struct perf_event *event)
+{
+	return event->attr.aux_sample_size ||
+	       event->attr.aux_pause ||
+	       event->attr.aux_resume;
+}
+
 static inline bool is_write_backward(struct perf_event *event)
 {
 	return !!event->attr.write_backward;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 4842c36fdf80..0524d541d4e3 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -511,7 +511,16 @@ struct perf_event_attr {
 	__u16	sample_max_stack;
 	__u16	__reserved_2;
 	__u32	aux_sample_size;
-	__u32	__reserved_3;
+
+	union {
+		__u32	aux_action;
+		struct {
+			__u32	aux_start_paused :  1, /* start AUX area tracing paused */
+				aux_pause        :  1, /* on overflow, pause AUX area tracing */
+				aux_resume       :  1, /* on overflow, resume AUX area tracing */
+				__reserved_3     : 29;
+		};
+	};
 
 	/*
 	 * User provided data if sigtrap=1, passed back to user via
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e3589c4287cb..0e9cfe6f3535 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2146,7 +2146,7 @@ static void perf_put_aux_event(struct perf_event *event)
 
 static bool perf_need_aux_event(struct perf_event *event)
 {
-	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+	return event->attr.aux_output || has_aux_action(event);
 }
 
 static int perf_get_aux_event(struct perf_event *event,
@@ -2171,6 +2171,10 @@ static int perf_get_aux_event(struct perf_event *event,
 	    !perf_aux_output_match(event, group_leader))
 		return 0;
 
+	if ((event->attr.aux_pause || event->attr.aux_resume) &&
+	    !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+		return 0;
+
 	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
 		return 0;
 
@@ -8016,6 +8020,49 @@ void perf_prepare_header(struct perf_event_header *header,
 	WARN_ON_ONCE(header->size & 7);
 }
 
+static void __perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+	if (pause) {
+		if (!event->hw.aux_paused) {
+			event->hw.aux_paused = 1;
+			event->pmu->stop(event, PERF_EF_PAUSE);
+		}
+	} else {
+		if (event->hw.aux_paused) {
+			event->hw.aux_paused = 0;
+			event->pmu->start(event, PERF_EF_RESUME);
+		}
+	}
+}
+
+static void perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+	struct perf_buffer *rb;
+
+	if (WARN_ON_ONCE(!event))
+		return;
+
+	rb = ring_buffer_get(event);
+	if (!rb)
+		return;
+
+	scoped_guard (irqsave) {
+		/*
+		 * Guard against self-recursion here. Another event could trip
+		 * this same from NMI context.
+		 */
+		if (READ_ONCE(rb->aux_in_pause_resume))
+			break;
+
+		WRITE_ONCE(rb->aux_in_pause_resume, 1);
+		barrier();
+		__perf_event_aux_pause(event, pause);
+		barrier();
+		WRITE_ONCE(rb->aux_in_pause_resume, 0);
+	}
+	ring_buffer_put(rb);
+}
+
 static __always_inline int
 __perf_event_output(struct perf_event *event,
 		    struct perf_sample_data *data,
@@ -9818,9 +9865,12 @@ static int __perf_event_overflow(struct perf_event *event,
 
 	ret = __perf_event_account_interrupt(event, throttle);
 
+	if (event->attr.aux_pause)
+		perf_event_aux_pause(event->aux_event, true);
+
 	if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
 	    !bpf_overflow_handler(event, data, regs))
-		return ret;
+		goto out;
 
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
@@ -9882,6 +9932,9 @@ static int __perf_event_overflow(struct perf_event *event,
 		event->pending_wakeup = 1;
 		irq_work_queue(&event->pending_irq);
 	}
+out:
+	if (event->attr.aux_resume)
+		perf_event_aux_pause(event->aux_event, false);
 
 	return ret;
 }
@@ -12273,11 +12326,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	}
 
 	if (event->attr.aux_output &&
-	    !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+	    (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
+	     event->attr.aux_pause || event->attr.aux_resume)) {
 		err = -EOPNOTSUPP;
 		goto err_pmu;
 	}
 
+	if (event->attr.aux_pause && event->attr.aux_resume) {
+		err = -EINVAL;
+		goto err_pmu;
+	}
+
+	if (event->attr.aux_start_paused) {
+		if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
+			err = -EOPNOTSUPP;
+			goto err_pmu;
+		}
+		event->hw.aux_paused = 1;
+	}
+
 	if (cgroup_fd != -1) {
 		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
 		if (err)
@@ -13073,7 +13140,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 * Grouping is not supported for kernel events, neither is 'AUX',
 	 * make sure the caller's intentions are adjusted.
 	 */
-	if (attr->aux_output)
+	if (attr->aux_output || attr->aux_action)
 		return ERR_PTR(-EINVAL);
 
 	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index e072d995d670..249288d82b8d 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -52,6 +52,7 @@ struct perf_buffer {
 	void				(*free_aux)(void *);
 	refcount_t			aux_refcount;
 	int				aux_in_sampling;
+	int				aux_in_pause_resume;
 	void				**aux_pages;
 	void				*aux_priv;
 
-- 
cgit v1.2.3


From 84bfbfbbd32aee136afea4b6bf82581dce79c305 Mon Sep 17 00:00:00 2001
From: Maurice Lambert <mauricelambert434@gmail.com>
Date: Sun, 3 Nov 2024 23:39:50 +0100
Subject: netlink: typographical error in nlmsg_type constants definition

This commit fixes a typographical error in the netlink nlmsg_type constant definitions in include/uapi/linux/rtnetlink.h at line 177. The definition reads "#define RTM_NEWNVLAN RTM_NEWVLAN" instead of "#define RTM_NEWVLAN RTM_NEWVLAN".

Signed-off-by: Maurice Lambert <mauricelambert434@gmail.com>
Fixes: 8dcea187088b ("net: bridge: vlan: add rtm definitions and dump support")
Link: https://patch.msgid.link/20241103223950.230300-1-mauricelambert434@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/rtnetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 3b687d20c9ed..db7254d52d93 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -174,7 +174,7 @@ enum {
 #define RTM_GETLINKPROP	RTM_GETLINKPROP
 
 	RTM_NEWVLAN = 112,
-#define RTM_NEWNVLAN	RTM_NEWVLAN
+#define RTM_NEWVLAN	RTM_NEWVLAN
 	RTM_DELVLAN,
 #define RTM_DELVLAN	RTM_DELVLAN
 	RTM_GETVLAN,
-- 
cgit v1.2.3


From 6140be90ec70c39fa844741ca3cc807dd0866394 Mon Sep 17 00:00:00 2001
From: Christian Göttsche <cgzones@googlemail.com>
Date: Fri, 26 Apr 2024 18:20:14 +0200
Subject: fs/xattr: add *at family syscalls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the four syscalls setxattrat(), getxattrat(), listxattrat() and
removexattrat().  Those can be used to operate on extended attributes,
especially security related ones, either relative to a pinned directory
or on a file descriptor without read access, avoiding a
/proc/<pid>/fd/<fd> detour, requiring a mounted procfs.

One use case will be setfiles(8) setting SELinux file contexts
("security.selinux") without race conditions and without a file
descriptor opened with read access requiring SELinux read permission.

Use the do_{name}at() pattern from fs/open.c.

Pass the value of the extended attribute, its length, and for
setxattrat(2) the command (XATTR_CREATE or XATTR_REPLACE) via an added
struct xattr_args, so as not to exceed six syscall arguments and to
avoid merging the AT_* and XATTR_* flags.

[AV: fixes by Christian Brauner folded in, the entire thing rebased on
top of {filename,file}_...xattr() primitives, treatment of empty
pathnames regularized.  As the result, AT_EMPTY_PATH+NULL handling
is cheap, so f...(2) can use it]

Signed-off-by: Christian Göttsche <cgzones@googlemail.com>
Link: https://lore.kernel.org/r/20240426162042.191916-1-cgoettsche@seltendoof.de
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
CC: x86@kernel.org
CC: linux-alpha@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: linux-arm-kernel@lists.infradead.org
CC: linux-ia64@vger.kernel.org
CC: linux-m68k@lists.linux-m68k.org
CC: linux-mips@vger.kernel.org
CC: linux-parisc@vger.kernel.org
CC: linuxppc-dev@lists.ozlabs.org
CC: linux-s390@vger.kernel.org
CC: linux-sh@vger.kernel.org
CC: sparclinux@vger.kernel.org
CC: linux-fsdevel@vger.kernel.org
CC: audit@vger.kernel.org
CC: linux-arch@vger.kernel.org
CC: linux-api@vger.kernel.org
CC: linux-security-module@vger.kernel.org
CC: selinux@vger.kernel.org
[brauner: slight tweaks]
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   4 +
 arch/arm/tools/syscall.tbl                  |   4 +
 arch/arm64/tools/syscall_32.tbl             |   4 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   4 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   4 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   4 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   4 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   4 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   4 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   4 +
 arch/s390/kernel/syscalls/syscall.tbl       |   4 +
 arch/sh/kernel/syscalls/syscall.tbl         |   4 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   4 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   4 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   4 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   4 +
 fs/xattr.c                                  | 245 ++++++++++++++++++----------
 include/asm-generic/audit_change_attr.h     |   6 +
 include/linux/syscalls.h                    |  13 ++
 include/linux/xattr.h                       |   4 +
 include/uapi/asm-generic/unistd.h           |  11 +-
 include/uapi/linux/xattr.h                  |   7 +
 scripts/syscall.tbl                         |   4 +
 23 files changed, 268 insertions(+), 86 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 74720667fe09..c59d53d6d3f3 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -502,3 +502,7 @@
 570	common	lsm_set_self_attr		sys_lsm_set_self_attr
 571	common	lsm_list_modules		sys_lsm_list_modules
 572	common  mseal				sys_mseal
+573	common	setxattrat			sys_setxattrat
+574	common	getxattrat			sys_getxattrat
+575	common	listxattrat			sys_listxattrat
+576	common	removexattrat			sys_removexattrat
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 23c98203c40f..49eeb2ad8dbd 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -477,3 +477,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 9a37930d4e26..69a829912a05 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -474,3 +474,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 22a3cbd4c602..f5ed71f1910d 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -462,3 +462,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 2b81a6bd78b2..680f568b77f2 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -468,3 +468,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 953f5b7dc723..0b9b7e25b69a 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -401,3 +401,7 @@
 460	n32	lsm_set_self_attr		sys_lsm_set_self_attr
 461	n32	lsm_list_modules		sys_lsm_list_modules
 462	n32	mseal				sys_mseal
+463	n32	setxattrat			sys_setxattrat
+464	n32	getxattrat			sys_getxattrat
+465	n32	listxattrat			sys_listxattrat
+466	n32	removexattrat			sys_removexattrat
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 1464c6be6eb3..c844cd5cda62 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -377,3 +377,7 @@
 460	n64	lsm_set_self_attr		sys_lsm_set_self_attr
 461	n64	lsm_list_modules		sys_lsm_list_modules
 462	n64	mseal				sys_mseal
+463	n64	setxattrat			sys_setxattrat
+464	n64	getxattrat			sys_getxattrat
+465	n64	listxattrat			sys_listxattrat
+466	n64	removexattrat			sys_removexattrat
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 2439a2491cff..349b8aad1159 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -450,3 +450,7 @@
 460	o32	lsm_set_self_attr		sys_lsm_set_self_attr
 461	o32	lsm_list_modules		sys_lsm_list_modules
 462	o32	mseal				sys_mseal
+463	o32	setxattrat			sys_setxattrat
+464	o32	getxattrat			sys_getxattrat
+465	o32	listxattrat			sys_listxattrat
+466	o32	removexattrat			sys_removexattrat
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 66dc406b12e4..d9fc94c86965 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -461,3 +461,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index ebae8415dfbb..d8b4ab78bef0 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -553,3 +553,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 01071182763e..e9115b4d8b63 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -465,3 +465,7 @@
 460  common	lsm_set_self_attr	sys_lsm_set_self_attr		sys_lsm_set_self_attr
 461  common	lsm_list_modules	sys_lsm_list_modules		sys_lsm_list_modules
 462  common	mseal			sys_mseal			sys_mseal
+463  common	setxattrat		sys_setxattrat			sys_setxattrat
+464  common	getxattrat		sys_getxattrat			sys_getxattrat
+465  common	listxattrat		sys_listxattrat			sys_listxattrat
+466  common	removexattrat		sys_removexattrat		sys_removexattrat
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index c55fd7696d40..c8cad33bf250 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -466,3 +466,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index cfdfb3707c16..727f99d333b3 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -508,3 +508,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal 				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 534c74b14fab..4d0fb2fba7e2 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -468,3 +468,7 @@
 460	i386	lsm_set_self_attr	sys_lsm_set_self_attr
 461	i386	lsm_list_modules	sys_lsm_list_modules
 462	i386	mseal 			sys_mseal
+463	i386	setxattrat		sys_setxattrat
+464	i386	getxattrat		sys_getxattrat
+465	i386	listxattrat		sys_listxattrat
+466	i386	removexattrat		sys_removexattrat
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7093ee21c0d1..5eb708bff1c7 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -386,6 +386,10 @@
 460	common	lsm_set_self_attr	sys_lsm_set_self_attr
 461	common	lsm_list_modules	sys_lsm_list_modules
 462 	common  mseal			sys_mseal
+463	common	setxattrat		sys_setxattrat
+464	common	getxattrat		sys_getxattrat
+465	common	listxattrat		sys_listxattrat
+466	common	removexattrat		sys_removexattrat
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 67083fc1b2f5..37effc1b134e 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -433,3 +433,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal 				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
diff --git a/fs/xattr.c b/fs/xattr.c
index b76911b23293..deb336b821c9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -676,69 +676,90 @@ out:
 	return error;
 }
 
-static int path_setxattr(const char __user *pathname,
-			 const char __user *name, const void __user *value,
-			 size_t size, int flags, unsigned int lookup_flags)
+static int path_setxattrat(int dfd, const char __user *pathname,
+			   unsigned int at_flags, const char __user *name,
+			   const void __user *value, size_t size, int flags)
 {
 	struct xattr_name kname;
 	struct kernel_xattr_ctx ctx = {
-		.cvalue   = value,
-		.kvalue   = NULL,
-		.size     = size,
-		.kname    = &kname,
-		.flags    = flags,
+		.cvalue	= value,
+		.kvalue	= NULL,
+		.size	= size,
+		.kname	= &kname,
+		.flags	= flags,
 	};
+	struct filename *filename;
+	unsigned int lookup_flags = 0;
 	int error;
 
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+		lookup_flags = LOOKUP_FOLLOW;
+
 	error = setxattr_copy(name, &ctx);
 	if (error)
 		return error;
 
-	error = filename_setxattr(AT_FDCWD, getname(pathname), lookup_flags,
-				  &ctx);
+	filename = getname_maybe_null(pathname, at_flags);
+	if (!filename) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			error = -EBADF;
+		else
+			error = file_setxattr(fd_file(f), &ctx);
+	} else {
+		error = filename_setxattr(dfd, filename, lookup_flags, &ctx);
+	}
 	kvfree(ctx.kvalue);
 	return error;
 }
 
+SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+		const char __user *, name, const struct xattr_args __user *, uargs,
+		size_t, usize)
+{
+	struct xattr_args args = {};
+	int error;
+
+	BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+	if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+		return -EINVAL;
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+
+	error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+	if (error)
+		return error;
+
+	return path_setxattrat(dfd, pathname, at_flags, name,
+			       u64_to_user_ptr(args.value), args.size,
+			       args.flags);
+}
+
 SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
 		const char __user *, name, const void __user *, value,
 		size_t, size, int, flags)
 {
-	return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
+	return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags);
 }
 
 SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
 		const char __user *, name, const void __user *, value,
 		size_t, size, int, flags)
 {
-	return path_setxattr(pathname, name, value, size, flags, 0);
+	return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+			       value, size, flags);
 }
 
 SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 		const void __user *,value, size_t, size, int, flags)
 {
-	struct xattr_name kname;
-	struct kernel_xattr_ctx ctx = {
-		.cvalue   = value,
-		.kvalue   = NULL,
-		.size     = size,
-		.kname    = &kname,
-		.flags    = flags,
-	};
-	int error;
-
-	CLASS(fd, f)(fd);
-
-	if (fd_empty(f))
-		return -EBADF;
-
-	error = setxattr_copy(name, &ctx);
-	if (error)
-		return error;
-
-	error = file_setxattr(fd_file(f), &ctx);
-	kvfree(ctx.kvalue);
-	return error;
+	return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name,
+			       value, size, flags);
 }
 
 /*
@@ -804,11 +825,10 @@ out:
 	return error;
 }
 
-static ssize_t path_getxattr(const char __user *pathname,
-			     const char __user *name, void __user *value,
-			     size_t size, unsigned int lookup_flags)
+static ssize_t path_getxattrat(int dfd, const char __user *pathname,
+			       unsigned int at_flags, const char __user *name,
+			       void __user *value, size_t size)
 {
-	ssize_t error;
 	struct xattr_name kname;
 	struct kernel_xattr_ctx ctx = {
 		.value    = value,
@@ -816,44 +836,72 @@ static ssize_t path_getxattr(const char __user *pathname,
 		.kname    = &kname,
 		.flags    = 0,
 	};
+	struct filename *filename;
+	ssize_t error;
+
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
 
 	error = import_xattr_name(&kname, name);
 	if (error)
 		return error;
-	return filename_getxattr(AT_FDCWD, getname(pathname), lookup_flags, &ctx);
+
+	filename = getname_maybe_null(pathname, at_flags);
+	if (!filename) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			return -EBADF;
+		return file_getxattr(fd_file(f), &ctx);
+	} else {
+		int lookup_flags = 0;
+		if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+			lookup_flags = LOOKUP_FOLLOW;
+		return filename_getxattr(dfd, filename, lookup_flags, &ctx);
+	}
+}
+
+SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+		const char __user *, name, struct xattr_args __user *, uargs, size_t, usize)
+{
+	struct xattr_args args = {};
+	int error;
+
+	BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+	if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+		return -EINVAL;
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+
+	error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+	if (error)
+		return error;
+
+	if (args.flags != 0)
+		return -EINVAL;
+
+	return path_getxattrat(dfd, pathname, at_flags, name,
+			       u64_to_user_ptr(args.value), args.size);
 }
 
 SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
 		const char __user *, name, void __user *, value, size_t, size)
 {
-	return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
+	return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size);
 }
 
 SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
 		const char __user *, name, void __user *, value, size_t, size)
 {
-	return path_getxattr(pathname, name, value, size, 0);
+	return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+			       value, size);
 }
 
 SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
 		void __user *, value, size_t, size)
 {
-	ssize_t error;
-	struct xattr_name kname;
-	struct kernel_xattr_ctx ctx = {
-		.value    = value,
-		.size     = size,
-		.kname    = &kname,
-		.flags    = 0,
-	};
-	CLASS(fd, f)(fd);
-
-	if (fd_empty(f))
-		return -EBADF;
-	error = import_xattr_name(&kname, name);
-	if (error)
-		return error;
-	return file_getxattr(fd_file(f), &ctx);
+	return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size);
 }
 
 /*
@@ -918,32 +966,50 @@ out:
 	return error;
 }
 
-static ssize_t path_listxattr(const char __user *pathname, char __user *list,
-			      size_t size, unsigned int lookup_flags)
+static ssize_t path_listxattrat(int dfd, const char __user *pathname,
+				unsigned int at_flags, char __user *list,
+				size_t size)
+{
+	struct filename *filename;
+	int lookup_flags;
+
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	filename = getname_maybe_null(pathname, at_flags);
+	if (!filename) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			return -EBADF;
+		return file_listxattr(fd_file(f), list, size);
+	}
+
+	lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	return filename_listxattr(dfd, filename, lookup_flags, list, size);
+}
+
+SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname,
+		unsigned int, at_flags,
+		char __user *, list, size_t, size)
 {
-	return filename_listxattr(AT_FDCWD, getname(pathname), lookup_flags,
-				  list, size);
+	return path_listxattrat(dfd, pathname, at_flags, list, size);
 }
 
 SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
 		size_t, size)
 {
-	return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
+	return path_listxattrat(AT_FDCWD, pathname, 0, list, size);
 }
 
 SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
 		size_t, size)
 {
-	return path_listxattr(pathname, list, size, 0);
+	return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size);
 }
 
 SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
 {
-	CLASS(fd, f)(fd);
-
-	if (fd_empty(f))
-		return -EBADF;
-	return file_listxattr(fd_file(f), list, size);
+	return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size);
 }
 
 /*
@@ -996,44 +1062,53 @@ out:
 	return error;
 }
 
-static int path_removexattr(const char __user *pathname,
-			    const char __user *name, unsigned int lookup_flags)
+static int path_removexattrat(int dfd, const char __user *pathname,
+			      unsigned int at_flags, const char __user *name)
 {
 	struct xattr_name kname;
+	struct filename *filename;
+	unsigned int lookup_flags;
 	int error;
 
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
 	error = import_xattr_name(&kname, name);
 	if (error)
 		return error;
-	return filename_removexattr(AT_FDCWD, getname(pathname), lookup_flags,
-				    &kname);
+
+	filename = getname_maybe_null(pathname, at_flags);
+	if (!filename) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			return -EBADF;
+		return file_removexattr(fd_file(f), &kname);
+	}
+	lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	return filename_removexattr(dfd, filename, lookup_flags, &kname);
+}
+
+SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname,
+		unsigned int, at_flags, const char __user *, name)
+{
+	return path_removexattrat(dfd, pathname, at_flags, name);
 }
 
 SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
 		const char __user *, name)
 {
-	return path_removexattr(pathname, name, LOOKUP_FOLLOW);
+	return path_removexattrat(AT_FDCWD, pathname, 0, name);
 }
 
 SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
 		const char __user *, name)
 {
-	return path_removexattr(pathname, name, 0);
+	return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name);
 }
 
 SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 {
-	CLASS(fd, f)(fd);
-	struct xattr_name kname;
-	int error;
-
-	if (fd_empty(f))
-		return -EBADF;
-
-	error = import_xattr_name(&kname, name);
-	if (error)
-		return error;
-	return file_removexattr(fd_file(f), &kname);
+	return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name);
 }
 
 int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
diff --git a/include/asm-generic/audit_change_attr.h b/include/asm-generic/audit_change_attr.h
index 331670807cf0..cc840537885f 100644
--- a/include/asm-generic/audit_change_attr.h
+++ b/include/asm-generic/audit_change_attr.h
@@ -11,9 +11,15 @@ __NR_lchown,
 __NR_fchown,
 #endif
 __NR_setxattr,
+#ifdef __NR_setxattrat
+__NR_setxattrat,
+#endif
 __NR_lsetxattr,
 __NR_fsetxattr,
 __NR_removexattr,
+#ifdef __NR_removexattrat
+__NR_removexattrat,
+#endif
 __NR_lremovexattr,
 __NR_fremovexattr,
 #ifdef __NR_fchownat
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5758104921e6..c6333204d451 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -77,6 +77,7 @@ struct cachestat_range;
 struct cachestat;
 struct statmount;
 struct mnt_id_req;
+struct xattr_args;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -338,23 +339,35 @@ asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
 				void __user *arg, unsigned int nr_args);
 asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
 			     const void __user *value, size_t size, int flags);
+asmlinkage long sys_setxattrat(int dfd, const char __user *path, unsigned int at_flags,
+			       const char __user *name,
+			       const struct xattr_args __user *args, size_t size);
 asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name,
 			      const void __user *value, size_t size, int flags);
 asmlinkage long sys_fsetxattr(int fd, const char __user *name,
 			      const void __user *value, size_t size, int flags);
 asmlinkage long sys_getxattr(const char __user *path, const char __user *name,
 			     void __user *value, size_t size);
+asmlinkage long sys_getxattrat(int dfd, const char __user *path, unsigned int at_flags,
+			       const char __user *name,
+			       struct xattr_args __user *args, size_t size);
 asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name,
 			      void __user *value, size_t size);
 asmlinkage long sys_fgetxattr(int fd, const char __user *name,
 			      void __user *value, size_t size);
 asmlinkage long sys_listxattr(const char __user *path, char __user *list,
 			      size_t size);
+asmlinkage long sys_listxattrat(int dfd, const char __user *path,
+				unsigned int at_flags,
+				char __user *list, size_t size);
 asmlinkage long sys_llistxattr(const char __user *path, char __user *list,
 			       size_t size);
 asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size);
 asmlinkage long sys_removexattr(const char __user *path,
 				const char __user *name);
+asmlinkage long sys_removexattrat(int dfd, const char __user *path,
+				  unsigned int at_flags,
+				  const char __user *name);
 asmlinkage long sys_lremovexattr(const char __user *path,
 				 const char __user *name);
 asmlinkage long sys_fremovexattr(int fd, const char __user *name);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index d20051865800..86b0d47984a1 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -19,6 +19,10 @@
 #include <linux/user_namespace.h>
 #include <uapi/linux/xattr.h>
 
+/* List of all xattr_args "versions". */
+#define XATTR_ARGS_SIZE_VER0	16 /* sizeof first published struct */
+#define XATTR_ARGS_SIZE_LATEST	XATTR_ARGS_SIZE_VER0
+
 struct inode;
 struct dentry;
 
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 5bf6148cac2b..88dc393c2bca 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -841,8 +841,17 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 #define __NR_mseal 462
 __SYSCALL(__NR_mseal, sys_mseal)
 
+#define __NR_setxattrat 463
+__SYSCALL(__NR_setxattrat, sys_setxattrat)
+#define __NR_getxattrat 464
+__SYSCALL(__NR_getxattrat, sys_getxattrat)
+#define __NR_listxattrat 465
+__SYSCALL(__NR_listxattrat, sys_listxattrat)
+#define __NR_removexattrat 466
+__SYSCALL(__NR_removexattrat, sys_removexattrat)
+
 #undef __NR_syscalls
-#define __NR_syscalls 463
+#define __NR_syscalls 467
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 9463db2dfa9d..9854f9cff3c6 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -11,6 +11,7 @@
 */
 
 #include <linux/libc-compat.h>
+#include <linux/types.h>
 
 #ifndef _UAPI_LINUX_XATTR_H
 #define _UAPI_LINUX_XATTR_H
@@ -20,6 +21,12 @@
 
 #define XATTR_CREATE	0x1	/* set value, fail if attr already exists */
 #define XATTR_REPLACE	0x2	/* set value, fail if attr does not exist */
+
+struct xattr_args {
+	__aligned_u64 __user value;
+	__u32 size;
+	__u32 flags;
+};
 #endif
 
 /* Namespaces */
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index 845e24eb372e..ebbdb3c42e9f 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -403,3 +403,7 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	setxattrat			sys_setxattrat
+464	common	getxattrat			sys_getxattrat
+465	common	listxattrat			sys_listxattrat
+466	common	removexattrat			sys_removexattrat
-- 
cgit v1.2.3


From 6bf90bd8c58a305994948eb3409d91a7d8f2edae Mon Sep 17 00:00:00 2001
From: Olivier Langlois <olivier@trillion01.com>
Date: Sun, 13 Oct 2024 14:29:24 -0400
Subject: io_uring/napi: add static napi tracking strategy

Add the static napi tracking strategy. That allows the user to manually
manage the napi ids list for busy polling, and eliminate the overhead of
dynamically updating the list from the fast path.

Signed-off-by: Olivier Langlois <olivier@trillion01.com>
Link: https://lore.kernel.org/r/96943de14968c35a5c599352259ad98f3c0770ba.1728828877.git.olivier@trillion01.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +-
 include/uapi/linux/io_uring.h  | 32 +++++++++++++-
 io_uring/fdinfo.c              | 54 +++++++++++++++++------
 io_uring/napi.c                | 97 +++++++++++++++++++++++++++++++++++++-----
 io_uring/napi.h                |  2 +-
 5 files changed, 160 insertions(+), 27 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index fba2988accc3..072e65e93105 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -408,7 +408,7 @@ struct io_ring_ctx {
 	/* napi busy poll default timeout */
 	ktime_t			napi_busy_poll_dt;
 	bool			napi_prefer_busy_poll;
-	bool			napi_enabled;
+	u8			napi_track_mode;
 
 	DECLARE_HASHTABLE(napi_ht, 4);
 #endif
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 47977a5c65f5..5d08435b95a8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -790,12 +790,40 @@ struct io_uring_buf_status {
 	__u32	resv[8];
 };
 
+enum io_uring_napi_op {
+	/* register/unregister backward compatible opcode */
+	IO_URING_NAPI_REGISTER_OP = 0,
+
+	/* opcodes to update napi_list when static tracking is used */
+	IO_URING_NAPI_STATIC_ADD_ID = 1,
+	IO_URING_NAPI_STATIC_DEL_ID = 2
+};
+
+enum io_uring_napi_tracking_strategy {
+	/* value must be 0 for backward compatibility */
+	IO_URING_NAPI_TRACKING_DYNAMIC = 0,
+	IO_URING_NAPI_TRACKING_STATIC = 1,
+	IO_URING_NAPI_TRACKING_INACTIVE = 255
+};
+
 /* argument for IORING_(UN)REGISTER_NAPI */
 struct io_uring_napi {
 	__u32	busy_poll_to;
 	__u8	prefer_busy_poll;
-	__u8	pad[3];
-	__u64	resv;
+
+	/* an io_uring_napi_op value */
+	__u8	opcode;
+	__u8	pad[2];
+
+	/*
+	 * for IO_URING_NAPI_REGISTER_OP, it is an
+	 * io_uring_napi_tracking_strategy value.
+	 *
+	 * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID
+	 * it is the napi id to add/del from napi_list.
+	 */
+	__u32	op_param;
+	__u32	resv;
 };
 
 /*
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index efbec34ccb18..b214e5a407b5 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -46,6 +46,46 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
 	return 0;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+					       struct seq_file *m,
+					       const char *tracking_strategy)
+{
+	seq_puts(m, "NAPI:\tenabled\n");
+	seq_printf(m, "napi tracking:\t%s\n", tracking_strategy);
+	seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
+	if (ctx->napi_prefer_busy_poll)
+		seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+	else
+		seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+}
+
+static __cold void napi_show_fdinfo(struct io_ring_ctx *ctx,
+				    struct seq_file *m)
+{
+	unsigned int mode = READ_ONCE(ctx->napi_track_mode);
+
+	switch (mode) {
+	case IO_URING_NAPI_TRACKING_INACTIVE:
+		seq_puts(m, "NAPI:\tdisabled\n");
+		break;
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+		common_tracking_show_fdinfo(ctx, m, "dynamic");
+		break;
+	case IO_URING_NAPI_TRACKING_STATIC:
+		common_tracking_show_fdinfo(ctx, m, "static");
+		break;
+	default:
+		seq_printf(m, "NAPI:\tunknown mode (%u)\n", mode);
+	}
+}
+#else
+static inline void napi_show_fdinfo(struct io_ring_ctx *ctx,
+				    struct seq_file *m)
+{
+}
+#endif
+
 /*
  * Caller holds a reference to the file already, we don't need to do
  * anything else to get an extra reference.
@@ -219,18 +259,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 
 	}
 	spin_unlock(&ctx->completion_lock);
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	if (ctx->napi_enabled) {
-		seq_puts(m, "NAPI:\tenabled\n");
-		seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
-		if (ctx->napi_prefer_busy_poll)
-			seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
-		else
-			seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
-	} else {
-		seq_puts(m, "NAPI:\tdisabled\n");
-	}
-#endif
+	napi_show_fdinfo(ctx, m);
 }
 #endif
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 1de1543d8034..b1ade3fda30f 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -81,6 +81,27 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
 	return 0;
 }
 
+static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+{
+	struct hlist_head *hash_list;
+	struct io_napi_entry *e;
+
+	/* Non-NAPI IDs can be rejected. */
+	if (napi_id < MIN_NAPI_ID)
+		return -EINVAL;
+
+	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+	guard(spinlock)(&ctx->napi_lock);
+	e = io_napi_hash_find(hash_list, napi_id);
+	if (!e)
+		return -ENOENT;
+
+	list_del_rcu(&e->list);
+	hash_del_rcu(&e->node);
+	kfree_rcu(e, rcu);
+	return 0;
+}
+
 static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
 {
 	struct io_napi_entry *e;
@@ -136,9 +157,25 @@ static bool io_napi_busy_loop_should_end(void *data,
 	return false;
 }
 
-static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
-				   bool (*loop_end)(void *, unsigned long),
-				   void *loop_end_arg)
+/*
+ * never report stale entries
+ */
+static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+					 bool (*loop_end)(void *, unsigned long),
+					 void *loop_end_arg)
+{
+	struct io_napi_entry *e;
+
+	list_for_each_entry_rcu(e, &ctx->napi_list, list)
+		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+	return false;
+}
+
+static bool
+dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+			      bool (*loop_end)(void *, unsigned long),
+			      void *loop_end_arg)
 {
 	struct io_napi_entry *e;
 	bool is_stale = false;
@@ -154,6 +191,16 @@ static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
 	return is_stale;
 }
 
+static inline bool
+__io_napi_do_busy_loop(struct io_ring_ctx *ctx,
+		       bool (*loop_end)(void *, unsigned long),
+		       void *loop_end_arg)
+{
+	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+}
+
 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
 				       struct io_wait_queue *iowq)
 {
@@ -195,6 +242,7 @@ void io_napi_init(struct io_ring_ctx *ctx)
 	spin_lock_init(&ctx->napi_lock);
 	ctx->napi_prefer_busy_poll = false;
 	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
+	ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
 }
 
 /*
@@ -215,6 +263,24 @@ void io_napi_free(struct io_ring_ctx *ctx)
 	INIT_LIST_HEAD_RCU(&ctx->napi_list);
 }
 
+static int io_napi_register_napi(struct io_ring_ctx *ctx,
+				 struct io_uring_napi *napi)
+{
+	switch (napi->op_param) {
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+	case IO_URING_NAPI_TRACKING_STATIC:
+		break;
+	default:
+		return -EINVAL;
+	}
+	/* clean the napi list for new settings */
+	io_napi_free(ctx);
+	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
+	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
+	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+	return 0;
+}
+
 /*
  * io_napi_register() - Register napi with io-uring
  * @ctx: pointer to io-uring context structure
@@ -226,7 +292,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 {
 	const struct io_uring_napi curr = {
 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
-		.prefer_busy_poll = ctx->napi_prefer_busy_poll
+		.prefer_busy_poll = ctx->napi_prefer_busy_poll,
+		.op_param	  = ctx->napi_track_mode
 	};
 	struct io_uring_napi napi;
 
@@ -234,16 +301,26 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 		return -EINVAL;
 	if (copy_from_user(&napi, arg, sizeof(napi)))
 		return -EFAULT;
-	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
+	if (napi.pad[0] || napi.pad[1] || napi.resv)
 		return -EINVAL;
 
 	if (copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
-	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
-	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
-	WRITE_ONCE(ctx->napi_enabled, true);
-	return 0;
+	switch (napi.opcode) {
+	case IO_URING_NAPI_REGISTER_OP:
+		return io_napi_register_napi(ctx, &napi);
+	case IO_URING_NAPI_STATIC_ADD_ID:
+		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+			return -EINVAL;
+		return __io_napi_add_id(ctx, napi.op_param);
+	case IO_URING_NAPI_STATIC_DEL_ID:
+		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+			return -EINVAL;
+		return __io_napi_del_id(ctx, napi.op_param);
+	default:
+		return -EINVAL;
+	}
 }
 
 /*
@@ -266,7 +343,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 
 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
-	WRITE_ONCE(ctx->napi_enabled, false);
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	return 0;
 }
 
diff --git a/io_uring/napi.h b/io_uring/napi.h
index 4ae622f37b30..fa742f42e09b 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -44,7 +44,7 @@ static inline void io_napi_add(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct socket *sock;
 
-	if (!READ_ONCE(ctx->napi_enabled))
+	if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC)
 		return;
 
 	sock = sock_from_file(req->file);
-- 
cgit v1.2.3


From 9907cda95fcbf44141b1292faab89cf8ec542f22 Mon Sep 17 00:00:00 2001
From: Juraj Šarinay <juraj@sarinay.com>
Date: Sun, 3 Nov 2024 13:45:25 +0100
Subject: net: nfc: Propagate ISO14443 type A target ATS to userspace via
 netlink
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a 20-byte field ats to struct nfc_target and expose it as
NFC_ATTR_TARGET_ATS via the netlink interface. The payload contains
'historical bytes' that help to distinguish cards from one another.
The information is commonly used to assemble an emulated ATR similar
to that reported by smart cards with contacts.

Add a 20-byte field target_ats to struct nci_dev to hold the payload
obtained in nci_rf_intf_activated_ntf_packet() and copy it over to
nfc_target.ats in nci_activate_target(). The approach is similar
to the handling of 'general bytes' within ATR_RES.

Replace the hard-coded size of rats_res within struct
activation_params_nfca_poll_iso_dep by the equal constant NFC_ATS_MAXSIZE
now defined in nfc.h

Within NCI, the information corresponds to the 'RATS Response' activation
parameter that omits the initial length byte TL. This loses no
information and is consistent with our handling of SENSB_RES that
also drops the first (constant) byte.

Tested with nxp_nci_i2c on a few type A targets including an
ICAO 9303 compliant passport.

I refrain from the corresponding change to digital_in_recv_ats()
to have the few drivers based on digital.h fill nfc_target.ats,
as I have no way to test it. That class of drivers appears not to set
NFC_ATTR_TARGET_SENSB_RES either. Consider a separate patch to propagate
(all) the parameters.

Signed-off-by: Juraj Šarinay <juraj@sarinay.com>
Link: https://patch.msgid.link/20241103124525.8392-1-juraj@sarinay.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/nfc/nci.h      |  2 +-
 include/net/nfc/nci_core.h |  4 ++++
 include/net/nfc/nfc.h      |  4 ++++
 include/uapi/linux/nfc.h   |  3 +++
 net/nfc/nci/core.c         | 13 ++++++++++++-
 net/nfc/nci/ntf.c          | 32 +++++++++++++++++++++++++++++++-
 net/nfc/netlink.c          |  5 +++++
 7 files changed, 60 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h
index dc36519d16aa..09efcaed7c3f 100644
--- a/include/net/nfc/nci.h
+++ b/include/net/nfc/nci.h
@@ -475,7 +475,7 @@ struct nci_rf_discover_ntf {
 #define NCI_OP_RF_INTF_ACTIVATED_NTF	nci_opcode_pack(NCI_GID_RF_MGMT, 0x05)
 struct activation_params_nfca_poll_iso_dep {
 	__u8	rats_res_len;
-	__u8	rats_res[20];
+	__u8	rats_res[NFC_ATS_MAXSIZE];
 };
 
 struct activation_params_nfcb_poll_iso_dep {
diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index ea8595651c38..e180bdf2f82b 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -265,6 +265,10 @@ struct nci_dev {
 	/* stored during intf_activated_ntf */
 	__u8 remote_gb[NFC_MAX_GT_LEN];
 	__u8 remote_gb_len;
+
+	/* stored during intf_activated_ntf */
+	__u8 target_ats[NFC_ATS_MAXSIZE];
+	__u8 target_ats_len;
 };
 
 /* ----- NCI Devices ----- */
diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 3a3781838c67..127e6c7d910d 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -86,6 +86,8 @@ struct nfc_ops {
  *	is a type A one. The %sens_res most significant byte must be byte 2
  *	as described by the NFC Forum digital specification (i.e. the platform
  *	configuration one) while %sens_res least significant byte is byte 1.
+ * @ats_len: length of Answer To Select in bytes
+ * @ats: Answer To Select returned by an ISO 14443 Type A target upon activation
  */
 struct nfc_target {
 	u32 idx;
@@ -105,6 +107,8 @@ struct nfc_target {
 	u8 is_iso15693;
 	u8 iso15693_dsfid;
 	u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE];
+	u8 ats_len;
+	u8 ats[NFC_ATS_MAXSIZE];
 };
 
 /**
diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 4fa4e979e948..2f5b4be25261 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -164,6 +164,7 @@ enum nfc_commands {
  * @NFC_ATTR_VENDOR_SUBCMD: Vendor specific sub command
  * @NFC_ATTR_VENDOR_DATA: Vendor specific data, to be optionally passed
  *	to a vendor specific command implementation
+ * @NFC_ATTR_TARGET_ATS: ISO 14443 type A target Answer To Select
  */
 enum nfc_attrs {
 	NFC_ATTR_UNSPEC,
@@ -198,6 +199,7 @@ enum nfc_attrs {
 	NFC_ATTR_VENDOR_ID,
 	NFC_ATTR_VENDOR_SUBCMD,
 	NFC_ATTR_VENDOR_DATA,
+	NFC_ATTR_TARGET_ATS,
 /* private: internal use only */
 	__NFC_ATTR_AFTER_LAST
 };
@@ -225,6 +227,7 @@ enum nfc_sdp_attr {
 #define NFC_GB_MAXSIZE			48
 #define NFC_FIRMWARE_NAME_MAXSIZE	32
 #define NFC_ISO15693_UID_MAXSIZE	8
+#define NFC_ATS_MAXSIZE			20
 
 /* NFC protocols */
 #define NFC_PROTO_JEWEL		1
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index f456a5911e7d..1ec5955fe469 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -757,6 +757,14 @@ int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id)
 }
 EXPORT_SYMBOL(nci_core_conn_close);
 
+static void nci_set_target_ats(struct nfc_target *target, struct nci_dev *ndev)
+{
+	if (ndev->target_ats_len > 0) {
+		target->ats_len = ndev->target_ats_len;
+		memcpy(target->ats, ndev->target_ats, target->ats_len);
+	}
+}
+
 static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev)
 {
 	struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
@@ -939,8 +947,11 @@ static int nci_activate_target(struct nfc_dev *nfc_dev,
 				 msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT));
 	}
 
-	if (!rc)
+	if (!rc) {
 		ndev->target_active_prot = protocol;
+		if (protocol == NFC_PROTO_ISO14443)
+			nci_set_target_ats(target, ndev);
+	}
 
 	return rc;
 }
diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
index 994a0a1efb58..a818eff27e6b 100644
--- a/net/nfc/nci/ntf.c
+++ b/net/nfc/nci/ntf.c
@@ -402,7 +402,7 @@ static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev,
 	switch (ntf->activation_rf_tech_and_mode) {
 	case NCI_NFC_A_PASSIVE_POLL_MODE:
 		nfca_poll = &ntf->activation_params.nfca_poll_iso_dep;
-		nfca_poll->rats_res_len = min_t(__u8, *data++, 20);
+		nfca_poll->rats_res_len = min_t(__u8, *data++, NFC_ATS_MAXSIZE);
 		pr_debug("rats_res_len %d\n", nfca_poll->rats_res_len);
 		if (nfca_poll->rats_res_len > 0) {
 			memcpy(nfca_poll->rats_res,
@@ -531,6 +531,28 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev,
 	return NCI_STATUS_OK;
 }
 
+static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev,
+				     const struct nci_rf_intf_activated_ntf *ntf)
+{
+	ndev->target_ats_len = 0;
+
+	if (ntf->activation_params_len <= 0)
+		return NCI_STATUS_OK;
+
+	if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > NFC_ATS_MAXSIZE) {
+		pr_debug("ATS too long\n");
+		return NCI_STATUS_RF_PROTOCOL_ERROR;
+	}
+
+	if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > 0) {
+		ndev->target_ats_len = ntf->activation_params.nfca_poll_iso_dep.rats_res_len;
+		memcpy(ndev->target_ats, ntf->activation_params.nfca_poll_iso_dep.rats_res,
+		       ndev->target_ats_len);
+	}
+
+	return NCI_STATUS_OK;
+}
+
 static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev,
 					     const struct sk_buff *skb)
 {
@@ -660,6 +682,14 @@ exit:
 			if (err != NCI_STATUS_OK)
 				pr_err("unable to store general bytes\n");
 		}
+
+		/* store ATS to be reported later in nci_activate_target */
+		if (ntf.rf_interface == NCI_RF_INTERFACE_ISO_DEP &&
+		    ntf.activation_rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) {
+			err = nci_store_ats_nfc_iso_dep(ndev, &ntf);
+			if (err != NCI_STATUS_OK)
+				pr_err("unable to store ATS\n");
+		}
 	}
 
 	if (!(ntf.activation_rf_tech_and_mode & NCI_RF_TECH_MODE_LISTEN_MASK)) {
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index dd2ce73a24fb..6a40b8d0350d 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -96,6 +96,11 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target,
 			goto nla_put_failure;
 	}
 
+	if (target->ats_len > 0 &&
+	    nla_put(msg, NFC_ATTR_TARGET_ATS, target->ats_len,
+		    target->ats))
+		goto nla_put_failure;
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
-- 
cgit v1.2.3


From b855f02427e995cbc905e134cc3a7f4e503c0455 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Fri, 1 Nov 2024 10:12:03 +0100
Subject: media: replace obsolete hans.verkuil@cisco.com alias

The old hans.verkuil@cisco.com email address was discontinued years ago.

Replace it with the correct hansverk@cisco.com email.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 drivers/media/cec/core/cec-core.c                 | 2 +-
 drivers/media/cec/platform/cec-gpio/cec-gpio.c    | 2 +-
 drivers/media/i2c/adv7604.c                       | 2 +-
 drivers/media/i2c/adv7842.c                       | 2 +-
 drivers/media/i2c/ths7303.c                       | 2 +-
 drivers/media/mc/mc-request.c                     | 2 +-
 drivers/media/pci/cobalt/cobalt-driver.c          | 2 +-
 drivers/media/radio/radio-aimslab.c               | 2 +-
 drivers/media/radio/radio-gemtek.c                | 2 +-
 drivers/media/radio/radio-isa.c                   | 2 +-
 drivers/media/radio/radio-isa.h                   | 2 +-
 drivers/media/radio/radio-miropcm20.c             | 2 +-
 drivers/media/radio/radio-rtrack2.c               | 2 +-
 drivers/media/radio/radio-terratec.c              | 2 +-
 drivers/media/radio/radio-zoltrix.c               | 2 +-
 drivers/media/test-drivers/vicodec/vicodec-core.c | 2 +-
 include/media/i2c/ths7303.h                       | 2 +-
 include/media/media-request.h                     | 2 +-
 include/uapi/linux/v4l2-dv-timings.h              | 2 +-
 19 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/cec/core/cec-core.c b/drivers/media/cec/core/cec-core.c
index 48282d272fe6..ca0db8d457b4 100644
--- a/drivers/media/cec/core/cec-core.c
+++ b/drivers/media/cec/core/cec-core.c
@@ -438,6 +438,6 @@ static void __exit cec_devnode_exit(void)
 subsys_initcall(cec_devnode_init);
 module_exit(cec_devnode_exit)
 
-MODULE_AUTHOR("Hans Verkuil <hans.verkuil@cisco.com>");
+MODULE_AUTHOR("Hans Verkuil <hansverk@cisco.com>");
 MODULE_DESCRIPTION("Device node registration for cec drivers");
 MODULE_LICENSE("GPL");
diff --git a/drivers/media/cec/platform/cec-gpio/cec-gpio.c b/drivers/media/cec/platform/cec-gpio/cec-gpio.c
index 69351730ce86..cf64e8871fe5 100644
--- a/drivers/media/cec/platform/cec-gpio/cec-gpio.c
+++ b/drivers/media/cec/platform/cec-gpio/cec-gpio.c
@@ -288,6 +288,6 @@ static struct platform_driver cec_gpio_pdrv = {
 
 module_platform_driver(cec_gpio_pdrv);
 
-MODULE_AUTHOR("Hans Verkuil <hans.verkuil@cisco.com>");
+MODULE_AUTHOR("Hans Verkuil <hansverk@cisco.com>");
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("CEC GPIO driver");
diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c
index 4504909d95bc..d91b084b2c7a 100644
--- a/drivers/media/i2c/adv7604.c
+++ b/drivers/media/i2c/adv7604.c
@@ -42,7 +42,7 @@ module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "debug level (0-2)");
 
 MODULE_DESCRIPTION("Analog Devices ADV7604/10/11/12 video decoder driver");
-MODULE_AUTHOR("Hans Verkuil <hans.verkuil@cisco.com>");
+MODULE_AUTHOR("Hans Verkuil <hansverk@cisco.com>");
 MODULE_AUTHOR("Mats Randgaard <mats.randgaard@cisco.com>");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/media/i2c/adv7842.c b/drivers/media/i2c/adv7842.c
index 3c9e613af0ce..5545cd23e113 100644
--- a/drivers/media/i2c/adv7842.c
+++ b/drivers/media/i2c/adv7842.c
@@ -38,7 +38,7 @@ module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "debug level (0-2)");
 
 MODULE_DESCRIPTION("Analog Devices ADV7842 video decoder driver");
-MODULE_AUTHOR("Hans Verkuil <hans.verkuil@cisco.com>");
+MODULE_AUTHOR("Hans Verkuil <hansverk@cisco.com>");
 MODULE_AUTHOR("Martin Bugge <marbugge@cisco.com>");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/media/i2c/ths7303.c b/drivers/media/i2c/ths7303.c
index 7526fabc7ee4..b7cedc5b3e8e 100644
--- a/drivers/media/i2c/ths7303.c
+++ b/drivers/media/i2c/ths7303.c
@@ -7,7 +7,7 @@
  * Author: Chaithrika U S <chaithrika@ti.com>
  *
  * Contributors:
- *     Hans Verkuil <hans.verkuil@cisco.com>
+ *     Hans Verkuil <hansverk@cisco.com>
  *     Lad, Prabhakar <prabhakar.lad@ti.com>
  *     Martin Bugge <marbugge@cisco.com>
  *
diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c
index e064914c476e..1013c76ede27 100644
--- a/drivers/media/mc/mc-request.c
+++ b/drivers/media/mc/mc-request.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2018 Intel Corporation
  * Copyright (C) 2018 Google, Inc.
  *
- * Author: Hans Verkuil <hans.verkuil@cisco.com>
+ * Author: Hans Verkuil <hansverk@cisco.com>
  * Author: Sakari Ailus <sakari.ailus@linux.intel.com>
  */
 
diff --git a/drivers/media/pci/cobalt/cobalt-driver.c b/drivers/media/pci/cobalt/cobalt-driver.c
index 6e1a0614e6d0..39e25cc53edb 100644
--- a/drivers/media/pci/cobalt/cobalt-driver.c
+++ b/drivers/media/pci/cobalt/cobalt-driver.c
@@ -44,7 +44,7 @@ module_param_named(ignore_err, cobalt_ignore_err, int, 0644);
 MODULE_PARM_DESC(ignore_err,
 	"If set then ignore missing i2c adapters/receivers. Default: 0\n");
 
-MODULE_AUTHOR("Hans Verkuil <hans.verkuil@cisco.com> & Morten Hestnes");
+MODULE_AUTHOR("Hans Verkuil <hansverk@cisco.com> & Morten Hestnes");
 MODULE_DESCRIPTION("cobalt driver");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c
index 0e9a3787724c..3c8c17d64821 100644
--- a/drivers/media/radio/radio-aimslab.c
+++ b/drivers/media/radio/radio-aimslab.c
@@ -4,7 +4,7 @@
  *
  * Copyright 1997 M. Kirkwood
  *
- * Converted to the radio-isa framework by Hans Verkuil <hans.verkuil@cisco.com>
+ * Converted to the radio-isa framework by Hans Verkuil <hansverk@cisco.com>
  * Converted to V4L2 API by Mauro Carvalho Chehab <mchehab@kernel.org>
  * Converted to new API by Alan Cox <alan@lxorguk.ukuu.org.uk>
  * Various bugfixes and enhancements by Russell Kroll <rkroll@exploits.org>
diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c
index a532f63aa9d9..5ca6274c45bd 100644
--- a/drivers/media/radio/radio-gemtek.c
+++ b/drivers/media/radio/radio-gemtek.c
@@ -15,7 +15,7 @@
  *    Converted to new API by Alan Cox <alan@lxorguk.ukuu.org.uk>
  *    Various bugfixes and enhancements by Russell Kroll <rkroll@exploits.org>
  *
- * Converted to the radio-isa framework by Hans Verkuil <hans.verkuil@cisco.com>
+ * Converted to the radio-isa framework by Hans Verkuil <hansverk@cisco.com>
  * Converted to V4L2 API by Mauro Carvalho Chehab <mchehab@kernel.org>
  *
  * Note: this card seems to swap the left and right audio channels!
diff --git a/drivers/media/radio/radio-isa.c b/drivers/media/radio/radio-isa.c
index ad49151f5ff0..4f87c76a2a96 100644
--- a/drivers/media/radio/radio-isa.c
+++ b/drivers/media/radio/radio-isa.c
@@ -4,7 +4,7 @@
  * This takes care of all the V4L2 scaffolding, allowing the ISA drivers
  * to concentrate on the actual hardware operation.
  *
- * Copyright (C) 2012 Hans Verkuil <hans.verkuil@cisco.com>
+ * Copyright (C) 2012 Hans Verkuil <hansverk@cisco.com>
  */
 
 #include <linux/module.h>
diff --git a/drivers/media/radio/radio-isa.h b/drivers/media/radio/radio-isa.h
index c9159958203e..0f3db473da5e 100644
--- a/drivers/media/radio/radio-isa.h
+++ b/drivers/media/radio/radio-isa.h
@@ -4,7 +4,7 @@
  * This takes care of all the V4L2 scaffolding, allowing the ISA drivers
  * to concentrate on the actual hardware operation.
  *
- * Copyright (C) 2012 Hans Verkuil <hans.verkuil@cisco.com>
+ * Copyright (C) 2012 Hans Verkuil <hansverk@cisco.com>
  */
 
 #ifndef _RADIO_ISA_H_
diff --git a/drivers/media/radio/radio-miropcm20.c b/drivers/media/radio/radio-miropcm20.c
index 08be77b8f3b7..27f058c5e677 100644
--- a/drivers/media/radio/radio-miropcm20.c
+++ b/drivers/media/radio/radio-miropcm20.c
@@ -23,7 +23,7 @@
  * This code has been reintroduced and converted to use
  * the new V4L2 RDS API by:
  *
- * Hans Verkuil <hans.verkuil@cisco.com>
+ * Hans Verkuil <hansverk@cisco.com>
  */
 
 #include <linux/module.h>
diff --git a/drivers/media/radio/radio-rtrack2.c b/drivers/media/radio/radio-rtrack2.c
index 73d2c187f122..16b13a63bfed 100644
--- a/drivers/media/radio/radio-rtrack2.c
+++ b/drivers/media/radio/radio-rtrack2.c
@@ -7,7 +7,7 @@
  * Converted to new API by Alan Cox <alan@lxorguk.ukuu.org.uk>
  * Various bugfixes and enhancements by Russell Kroll <rkroll@exploits.org>
  *
- * Converted to the radio-isa framework by Hans Verkuil <hans.verkuil@cisco.com>
+ * Converted to the radio-isa framework by Hans Verkuil <hansverk@cisco.com>
  * Converted to V4L2 API by Mauro Carvalho Chehab <mchehab@kernel.org>
  *
  * Fully tested with actual hardware and the v4l2-compliance tool.
diff --git a/drivers/media/radio/radio-terratec.c b/drivers/media/radio/radio-terratec.c
index 621bb8523271..720080634454 100644
--- a/drivers/media/radio/radio-terratec.c
+++ b/drivers/media/radio/radio-terratec.c
@@ -17,7 +17,7 @@
  *  Frequency control is done digitally -- ie out(port,encodefreq(95.8));
  *  Volume Control is done digitally
  *
- * Converted to the radio-isa framework by Hans Verkuil <hans.verkuil@cisco.com>
+ * Converted to the radio-isa framework by Hans Verkuil <hansverk@cisco.com>
  * Converted to V4L2 API by Mauro Carvalho Chehab <mchehab@kernel.org>
  */
 
diff --git a/drivers/media/radio/radio-zoltrix.c b/drivers/media/radio/radio-zoltrix.c
index f3dc57c75131..099b7af6a410 100644
--- a/drivers/media/radio/radio-zoltrix.c
+++ b/drivers/media/radio/radio-zoltrix.c
@@ -30,7 +30,7 @@
  * 2006-07-24 - Converted to V4L2 API
  *		by Mauro Carvalho Chehab <mchehab@kernel.org>
  *
- * Converted to the radio-isa framework by Hans Verkuil <hans.verkuil@cisco.com>
+ * Converted to the radio-isa framework by Hans Verkuil <hansverk@cisco.com>
  *
  * Note that this is the driver for the Zoltrix Radio Plus.
  * This driver does not work for the Zoltrix Radio Plus 108 or the
diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c
index 88800969ae27..c45f5cf12ded 100644
--- a/drivers/media/test-drivers/vicodec/vicodec-core.c
+++ b/drivers/media/test-drivers/vicodec/vicodec-core.c
@@ -26,7 +26,7 @@
 #include "codec-v4l2-fwht.h"
 
 MODULE_DESCRIPTION("Virtual codec device");
-MODULE_AUTHOR("Hans Verkuil <hans.verkuil@cisco.com>");
+MODULE_AUTHOR("Hans Verkuil <hansverk@cisco.com>");
 MODULE_LICENSE("GPL v2");
 
 static bool multiplanar;
diff --git a/include/media/i2c/ths7303.h b/include/media/i2c/ths7303.h
index fee2818c558d..fc937025cdb4 100644
--- a/include/media/i2c/ths7303.h
+++ b/include/media/i2c/ths7303.h
@@ -5,7 +5,7 @@
  * Copyright 2013 Cisco Systems, Inc. and/or its affiliates.
  *
  * Contributors:
- *     Hans Verkuil <hans.verkuil@cisco.com>
+ *     Hans Verkuil <hansverk@cisco.com>
  *     Lad, Prabhakar <prabhakar.lad@ti.com>
  *     Martin Bugge <marbugge@cisco.com>
  */
diff --git a/include/media/media-request.h b/include/media/media-request.h
index 3cd25a2717ce..d4ac557678a7 100644
--- a/include/media/media-request.h
+++ b/include/media/media-request.h
@@ -5,7 +5,7 @@
  * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
  * Copyright (C) 2018 Intel Corporation
  *
- * Author: Hans Verkuil <hans.verkuil@cisco.com>
+ * Author: Hans Verkuil <hansverk@cisco.com>
  * Author: Sakari Ailus <sakari.ailus@linux.intel.com>
  */
 
diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h
index ef0128c7369c..44a16e0e5a12 100644
--- a/include/uapi/linux/v4l2-dv-timings.h
+++ b/include/uapi/linux/v4l2-dv-timings.h
@@ -2,7 +2,7 @@
 /*
  * V4L2 DV timings header.
  *
- * Copyright (C) 2012-2016  Hans Verkuil <hans.verkuil@cisco.com>
+ * Copyright (C) 2012-2016  Hans Verkuil <hansverk@cisco.com>
  */
 
 #ifndef _V4L2_DV_TIMINGS_H
-- 
cgit v1.2.3


From 580db513b4a9d52f306580015a1872eea0a0894e Mon Sep 17 00:00:00 2001
From: Khang Nguyen <khangng@os.amperecomputing.com>
Date: Tue, 5 Nov 2024 14:19:15 +0700
Subject: net: mctp: Expose transport binding identifier via IFLA attribute

MCTP control protocol implementations are transport binding dependent.
Endpoint discovery is mandatory based on transport binding.
Message timing requirements are specified in each respective transport
binding specification.

However, we currently have no means to get this information from MCTP
links.

Add an IFLA_MCTP_PHYS_BINDING netlink link attribute, which represents
the transport type using the DMTF DSP0239-defined type numbers, returned
as part of RTM_GETLINK data.

We get an IFLA_MCTP_PHYS_BINDING attribute for each MCTP link, for
example:

- 0x00 (unspec) for loopback interface;
- 0x01 (SMBus/I2C) for mctpi2c%d interfaces; and
- 0x05 (serial) for mctpserial%d interfaces.

Signed-off-by: Khang Nguyen <khangng@os.amperecomputing.com>
Reviewed-by: Matt Johnston <matt@codeconstruct.com.au>
Link: https://patch.msgid.link/20241105071915.821871-1-khangng@os.amperecomputing.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/mctp/mctp-i2c.c    |  3 ++-
 drivers/net/mctp/mctp-i3c.c    |  2 +-
 drivers/net/mctp/mctp-serial.c |  5 +++--
 include/net/mctp.h             | 18 ++++++++++++++++++
 include/net/mctpdevice.h       |  4 +++-
 include/uapi/linux/if_link.h   |  1 +
 net/mctp/device.c              | 12 +++++++++---
 7 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c
index e70fb6687994..d2b3f5a59141 100644
--- a/drivers/net/mctp/mctp-i2c.c
+++ b/drivers/net/mctp/mctp-i2c.c
@@ -880,7 +880,8 @@ static int mctp_i2c_add_netdev(struct mctp_i2c_client *mcli,
 		goto err;
 	}
 
-	rc = mctp_register_netdev(ndev, &mctp_i2c_mctp_ops);
+	rc = mctp_register_netdev(ndev, &mctp_i2c_mctp_ops,
+				  MCTP_PHYS_BINDING_SMBUS);
 	if (rc < 0) {
 		dev_err(&mcli->client->dev,
 			"register netdev \"%s\" failed %d\n",
diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c
index 1bc87a062686..9adad59b8676 100644
--- a/drivers/net/mctp/mctp-i3c.c
+++ b/drivers/net/mctp/mctp-i3c.c
@@ -607,7 +607,7 @@ __must_hold(&busdevs_lock)
 		goto err_free_uninit;
 	}
 
-	rc = mctp_register_netdev(ndev, NULL);
+	rc = mctp_register_netdev(ndev, NULL, MCTP_PHYS_BINDING_I3C);
 	if (rc < 0) {
 		dev_warn(&ndev->dev, "netdev register failed: %d\n", rc);
 		goto err_free_netdev;
diff --git a/drivers/net/mctp/mctp-serial.c b/drivers/net/mctp/mctp-serial.c
index e63720ec3238..26c9a33fd636 100644
--- a/drivers/net/mctp/mctp-serial.c
+++ b/drivers/net/mctp/mctp-serial.c
@@ -23,6 +23,7 @@
 
 #include <linux/mctp.h>
 #include <net/mctp.h>
+#include <net/mctpdevice.h>
 #include <net/pkt_sched.h>
 
 #define MCTP_SERIAL_MTU		68 /* base mtu (64) + mctp header */
@@ -470,7 +471,7 @@ static int mctp_serial_open(struct tty_struct *tty)
 	spin_lock_init(&dev->lock);
 	INIT_WORK(&dev->tx_work, mctp_serial_tx_work);
 
-	rc = register_netdev(ndev);
+	rc = mctp_register_netdev(ndev, NULL, MCTP_PHYS_BINDING_SERIAL);
 	if (rc)
 		goto free_netdev;
 
@@ -492,7 +493,7 @@ static void mctp_serial_close(struct tty_struct *tty)
 	struct mctp_serial *dev = tty->disc_data;
 	int idx = dev->idx;
 
-	unregister_netdev(dev->netdev);
+	mctp_unregister_netdev(dev->netdev);
 	ida_free(&mctp_serial_ida, idx);
 }
 
diff --git a/include/net/mctp.h b/include/net/mctp.h
index 28d59ae94ca3..1ecbff7116f6 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -298,4 +298,22 @@ void mctp_routes_exit(void);
 int mctp_device_init(void);
 void mctp_device_exit(void);
 
+/* MCTP IDs and Codes from DMTF specification
+ * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes"
+ * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf
+ */
+enum mctp_phys_binding {
+	MCTP_PHYS_BINDING_UNSPEC	= 0x00,
+	MCTP_PHYS_BINDING_SMBUS		= 0x01,
+	MCTP_PHYS_BINDING_PCIE_VDM	= 0x02,
+	MCTP_PHYS_BINDING_USB		= 0x03,
+	MCTP_PHYS_BINDING_KCS		= 0x04,
+	MCTP_PHYS_BINDING_SERIAL	= 0x05,
+	MCTP_PHYS_BINDING_I3C		= 0x06,
+	MCTP_PHYS_BINDING_MMBI		= 0x07,
+	MCTP_PHYS_BINDING_PCC		= 0x08,
+	MCTP_PHYS_BINDING_UCIE		= 0x09,
+	MCTP_PHYS_BINDING_VENDOR	= 0xFF,
+};
+
 #endif /* __NET_MCTP_H */
diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h
index 5c0d04b5c12c..957d9ef924c5 100644
--- a/include/net/mctpdevice.h
+++ b/include/net/mctpdevice.h
@@ -22,6 +22,7 @@ struct mctp_dev {
 	refcount_t		refs;
 
 	unsigned int		net;
+	enum mctp_phys_binding	binding;
 
 	const struct mctp_netdev_ops *ops;
 
@@ -44,7 +45,8 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev);
 struct mctp_dev *__mctp_dev_get(const struct net_device *dev);
 
 int mctp_register_netdev(struct net_device *dev,
-			 const struct mctp_netdev_ops *ops);
+			 const struct mctp_netdev_ops *ops,
+			 enum mctp_phys_binding binding);
 void mctp_unregister_netdev(struct net_device *dev);
 
 void mctp_dev_hold(struct mctp_dev *mdev);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8516c1ccd57a..2575e0cd9b48 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1958,6 +1958,7 @@ struct ifla_rmnet_flags {
 enum {
 	IFLA_MCTP_UNSPEC,
 	IFLA_MCTP_NET,
+	IFLA_MCTP_PHYS_BINDING,
 	__IFLA_MCTP_MAX,
 };
 
diff --git a/net/mctp/device.c b/net/mctp/device.c
index 3d75b919995d..26ce34b7e88e 100644
--- a/net/mctp/device.c
+++ b/net/mctp/device.c
@@ -371,6 +371,8 @@ static int mctp_fill_link_af(struct sk_buff *skb,
 		return -ENODATA;
 	if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net))
 		return -EMSGSIZE;
+	if (nla_put_u8(skb, IFLA_MCTP_PHYS_BINDING, mdev->binding))
+		return -EMSGSIZE;
 	return 0;
 }
 
@@ -385,6 +387,7 @@ static size_t mctp_get_link_af_size(const struct net_device *dev,
 	if (!mdev)
 		return 0;
 	ret = nla_total_size(4); /* IFLA_MCTP_NET */
+	ret += nla_total_size(1); /* IFLA_MCTP_PHYS_BINDING */
 	mctp_dev_put(mdev);
 	return ret;
 }
@@ -480,7 +483,8 @@ static int mctp_dev_notify(struct notifier_block *this, unsigned long event,
 }
 
 static int mctp_register_netdevice(struct net_device *dev,
-				   const struct mctp_netdev_ops *ops)
+				   const struct mctp_netdev_ops *ops,
+				   enum mctp_phys_binding binding)
 {
 	struct mctp_dev *mdev;
 
@@ -489,17 +493,19 @@ static int mctp_register_netdevice(struct net_device *dev,
 		return PTR_ERR(mdev);
 
 	mdev->ops = ops;
+	mdev->binding = binding;
 
 	return register_netdevice(dev);
 }
 
 int mctp_register_netdev(struct net_device *dev,
-			 const struct mctp_netdev_ops *ops)
+			 const struct mctp_netdev_ops *ops,
+			 enum mctp_phys_binding binding)
 {
 	int rc;
 
 	rtnl_lock();
-	rc = mctp_register_netdevice(dev, ops);
+	rc = mctp_register_netdevice(dev, ops, binding);
 	rtnl_unlock();
 
 	return rc;
-- 
cgit v1.2.3


From 6c83d153ed86eb17c46eafe4e78af4ce2071a052 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 21 Oct 2024 21:28:26 +0200
Subject: btrfs: add new ioctl to wait for cleaned subvolumes

Add a new unprivileged ioctl that will let the command
'btrfs subvolume sync' work without the (privileged) SEARCH_TREE ioctl.

There are several modes of operation, where the most common ones are to
wait on a specific subvolume or all currently queued for cleaning. This
is utilized e.g. in backup applications that delete subvolumes and wait
until they're cleaned to check for remaining space.

The other modes are for flexibility, e.g. for monitoring or
checkpoints in the queue of deleted subvolumes, again without the need
to use SEARCH_TREE.

Notes:

- waiting is interruptible, the timeout is set to 1 second and is not
  configurable

- repeated calls to the ioctl see a different state, so this is
  inherently racy when using e.g. the count or peek next/last

Use cases:

- a subvolume A was deleted, wait for cleaning (WAIT_FOR_ONE)

- a bunch of subvolumes were deleted, wait for all (WAIT_FOR_QUEUED or
  PEEK_LAST + WAIT_FOR_ONE)

- count how many are queued (not blocking), for monitoring purposes

- report progress (PEEK_NEXT), may miss some if cleaning is quick

- own waiting in user space (PEEK_LAST until it's 0)

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c           | 128 +++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/btrfs.h |  25 +++++++++
 2 files changed, 153 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 27a9342cd91c..2118c22625ca 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5027,6 +5027,132 @@ int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	return -EINVAL;
 }
 
+static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp)
+{
+	struct btrfs_root *root;
+	struct btrfs_ioctl_subvol_wait args = { 0 };
+	signed long sched_ret;
+	int refs;
+	u64 root_flags;
+	bool wait_for_deletion = false;
+	bool found = false;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	switch (args.mode) {
+	case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED:
+		/*
+		 * Wait for the first one deleted that waits until all previous
+		 * are cleaned.
+		 */
+		spin_lock(&fs_info->trans_lock);
+		if (!list_empty(&fs_info->dead_roots)) {
+			root = list_last_entry(&fs_info->dead_roots,
+					       struct btrfs_root, root_list);
+			args.subvolid = btrfs_root_id(root);
+			found = true;
+		}
+		spin_unlock(&fs_info->trans_lock);
+		if (!found)
+			return -ENOENT;
+
+		fallthrough;
+	case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE:
+		if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) ||
+		    BTRFS_LAST_FREE_OBJECTID < args.subvolid)
+			return -EINVAL;
+		break;
+	case BTRFS_SUBVOL_SYNC_COUNT:
+		spin_lock(&fs_info->trans_lock);
+		args.count = list_count_nodes(&fs_info->dead_roots);
+		spin_unlock(&fs_info->trans_lock);
+		if (copy_to_user(argp, &args, sizeof(args)))
+			return -EFAULT;
+		return 0;
+	case BTRFS_SUBVOL_SYNC_PEEK_FIRST:
+		spin_lock(&fs_info->trans_lock);
+		/* Last in the list was deleted first. */
+		if (!list_empty(&fs_info->dead_roots)) {
+			root = list_last_entry(&fs_info->dead_roots,
+					       struct btrfs_root, root_list);
+			args.subvolid = btrfs_root_id(root);
+		} else {
+			args.subvolid = 0;
+		}
+		spin_unlock(&fs_info->trans_lock);
+		if (copy_to_user(argp, &args, sizeof(args)))
+			return -EFAULT;
+		return 0;
+	case BTRFS_SUBVOL_SYNC_PEEK_LAST:
+		spin_lock(&fs_info->trans_lock);
+		/* First in the list was deleted last. */
+		if (!list_empty(&fs_info->dead_roots)) {
+			root = list_first_entry(&fs_info->dead_roots,
+						struct btrfs_root, root_list);
+			args.subvolid = btrfs_root_id(root);
+		} else {
+			args.subvolid = 0;
+		}
+		spin_unlock(&fs_info->trans_lock);
+		if (copy_to_user(argp, &args, sizeof(args)))
+			return -EFAULT;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+
+	/* 32bit limitation: fs_roots_radix key is not wide enough. */
+	if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX)
+		return -EOVERFLOW;
+
+	while (1) {
+		/* Wait for the specific one. */
+		if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR)
+			return -EINTR;
+		refs = -1;
+		spin_lock(&fs_info->fs_roots_radix_lock);
+		root = radix_tree_lookup(&fs_info->fs_roots_radix,
+					 (unsigned long)args.subvolid);
+		if (root) {
+			spin_lock(&root->root_item_lock);
+			refs = btrfs_root_refs(&root->root_item);
+			root_flags = btrfs_root_flags(&root->root_item);
+			spin_unlock(&root->root_item_lock);
+		}
+		spin_unlock(&fs_info->fs_roots_radix_lock);
+		up_read(&fs_info->subvol_sem);
+
+		/* Subvolume does not exist. */
+		if (!root)
+			return -ENOENT;
+
+		/* Subvolume not deleted at all. */
+		if (refs > 0)
+			return -EEXIST;
+		/* We've waited and now the subvolume is gone. */
+		if (wait_for_deletion && refs == -1) {
+			/* Return the one we waited for as the last one. */
+			if (copy_to_user(argp, &args, sizeof(args)))
+				return -EFAULT;
+			return 0;
+		}
+
+		/* Subvolume not found on the first try (deleted or never existed). */
+		if (refs == -1)
+			return -ENOENT;
+
+		wait_for_deletion = true;
+		ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD);
+		sched_ret = schedule_timeout_interruptible(HZ);
+		/* Early wake up or error. */
+		if (sched_ret != 0)
+			return -EINTR;
+	}
+
+	return 0;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -5178,6 +5304,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_ENCODED_WRITE_32:
 		return btrfs_ioctl_encoded_write(file, argp, true);
 #endif
+	case BTRFS_IOC_SUBVOL_SYNC_WAIT:
+		return btrfs_ioctl_subvol_sync(fs_info, argp);
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index cdf6ad872149..d3b222d7af24 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -1049,6 +1049,29 @@ struct btrfs_ioctl_encoded_io_args {
 #define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0
 #define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1
 
+/*
+ * Wait for subvolume cleaning process. This queries the kernel queue and it
+ * can change between the calls.
+ *
+ * - FOR_ONE	- specify the subvolid
+ * - FOR_QUEUED - wait for all currently queued
+ * - COUNT	- count number of queued
+ * - PEEK_FIRST - read which is the first in the queue (to be cleaned or being
+ * 		  cleaned already), or 0 if the queue is empty
+ * - PEEK_LAST  - read the last subvolid in the queue, or 0 if the queue is empty
+ */
+struct btrfs_ioctl_subvol_wait {
+	__u64 subvolid;
+	__u32 mode;
+	__u32 count;
+};
+
+#define BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE		(0)
+#define BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED	(1)
+#define BTRFS_SUBVOL_SYNC_COUNT			(2)
+#define BTRFS_SUBVOL_SYNC_PEEK_FIRST		(3)
+#define BTRFS_SUBVOL_SYNC_PEEK_LAST		(4)
+
 /* Error codes as returned by the kernel */
 enum btrfs_err_code {
 	BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
@@ -1181,6 +1204,8 @@ enum btrfs_err_code {
 				    struct btrfs_ioctl_encoded_io_args)
 #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \
 				     struct btrfs_ioctl_encoded_io_args)
+#define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \
+					struct btrfs_ioctl_subvol_wait)
 
 #ifdef __cplusplus
 }
-- 
cgit v1.2.3


From d920179b3d4842a0e27cae54fdddbe5ef3977e73 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 8 Nov 2024 14:45:34 +0100
Subject: bpf: Add support for uprobe multi session attach

Add support for attaching a BPF program to both the entry and return
probe of the same function. This is a common use case which at the
moment requires creating two uprobe multi links.

Add a new BPF_TRACE_UPROBE_SESSION attach type that instructs the
kernel to attach a single link program to both the entry and exit probe.

It's possible to control execution of the BPF program on the return
probe simply via the return value of the entry BPF program: returning
zero executes the BPF program on the return probe, returning non-zero
skips it.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20241108134544.480660-4-jolsa@kernel.org
---
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           |  9 +++++++--
 kernel/bpf/verifier.c          |  1 +
 kernel/trace/bpf_trace.c       | 36 +++++++++++++++++++++++++++---------
 tools/include/uapi/linux/bpf.h |  1 +
 tools/lib/bpf/libbpf.c         |  1 +
 6 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f28b6527e815..4162afc6b5d0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1116,6 +1116,7 @@ enum bpf_attach_type {
 	BPF_NETKIT_PRIMARY,
 	BPF_NETKIT_PEER,
 	BPF_TRACE_KPROBE_SESSION,
+	BPF_TRACE_UPROBE_SESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8254b2973157..58190ca724a2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4103,10 +4103,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 		if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
 		    attach_type != BPF_TRACE_UPROBE_MULTI)
 			return -EINVAL;
+		if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION &&
+		    attach_type != BPF_TRACE_UPROBE_SESSION)
+			return -EINVAL;
 		if (attach_type != BPF_PERF_EVENT &&
 		    attach_type != BPF_TRACE_KPROBE_MULTI &&
 		    attach_type != BPF_TRACE_KPROBE_SESSION &&
-		    attach_type != BPF_TRACE_UPROBE_MULTI)
+		    attach_type != BPF_TRACE_UPROBE_MULTI &&
+		    attach_type != BPF_TRACE_UPROBE_SESSION)
 			return -EINVAL;
 		return 0;
 	case BPF_PROG_TYPE_SCHED_CLS:
@@ -5359,7 +5363,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
 			 attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
 			ret = bpf_kprobe_multi_link_attach(attr, prog);
-		else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
+		else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI ||
+			 attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION)
 			ret = bpf_uprobe_multi_link_attach(attr, prog);
 		break;
 	default:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7d8ed377b35d..132fc172961f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -16027,6 +16027,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 	case BPF_PROG_TYPE_KPROBE:
 		switch (env->prog->expected_attach_type) {
 		case BPF_TRACE_KPROBE_SESSION:
+		case BPF_TRACE_UPROBE_SESSION:
 			range = retval_range(0, 1);
 			break;
 		default:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index db9e2792b42b..9c04b1364de2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1581,6 +1581,17 @@ static inline bool is_kprobe_session(const struct bpf_prog *prog)
 	return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
 }
 
+static inline bool is_uprobe_multi(const struct bpf_prog *prog)
+{
+	return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ||
+	       prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
+static inline bool is_uprobe_session(const struct bpf_prog *prog)
+{
+	return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
 static const struct bpf_func_proto *
 kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1598,13 +1609,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_func_ip:
 		if (is_kprobe_multi(prog))
 			return &bpf_get_func_ip_proto_kprobe_multi;
-		if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+		if (is_uprobe_multi(prog))
 			return &bpf_get_func_ip_proto_uprobe_multi;
 		return &bpf_get_func_ip_proto_kprobe;
 	case BPF_FUNC_get_attach_cookie:
 		if (is_kprobe_multi(prog))
 			return &bpf_get_attach_cookie_proto_kmulti;
-		if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+		if (is_uprobe_multi(prog))
 			return &bpf_get_attach_cookie_proto_umulti;
 		return &bpf_get_attach_cookie_proto_trace;
 	default:
@@ -3096,6 +3107,7 @@ struct bpf_uprobe {
 	u64 cookie;
 	struct uprobe *uprobe;
 	struct uprobe_consumer consumer;
+	bool session;
 };
 
 struct bpf_uprobe_multi_link {
@@ -3267,9 +3279,13 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
 			  __u64 *data)
 {
 	struct bpf_uprobe *uprobe;
+	int ret;
 
 	uprobe = container_of(con, struct bpf_uprobe, consumer);
-	return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
+	ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
+	if (uprobe->session)
+		return ret ? UPROBE_HANDLER_IGNORE : 0;
+	return 0;
 }
 
 static int
@@ -3279,7 +3295,8 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s
 	struct bpf_uprobe *uprobe;
 
 	uprobe = container_of(con, struct bpf_uprobe, consumer);
-	return uprobe_prog_run(uprobe, func, regs);
+	uprobe_prog_run(uprobe, func, regs);
+	return 0;
 }
 
 static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
@@ -3318,7 +3335,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (sizeof(u64) != sizeof(void *))
 		return -EOPNOTSUPP;
 
-	if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
+	if (!is_uprobe_multi(prog))
 		return -EINVAL;
 
 	flags = attr->link_create.uprobe_multi.flags;
@@ -3394,11 +3411,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 
 		uprobes[i].link = link;
 
-		if (flags & BPF_F_UPROBE_MULTI_RETURN)
-			uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
-		else
+		if (!(flags & BPF_F_UPROBE_MULTI_RETURN))
 			uprobes[i].consumer.handler = uprobe_multi_link_handler;
-
+		if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog))
+			uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
+		if (is_uprobe_session(prog))
+			uprobes[i].session = true;
 		if (pid)
 			uprobes[i].consumer.filter = uprobe_multi_link_filter;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f28b6527e815..4162afc6b5d0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1116,6 +1116,7 @@ enum bpf_attach_type {
 	BPF_NETKIT_PRIMARY,
 	BPF_NETKIT_PEER,
 	BPF_TRACE_KPROBE_SESSION,
+	BPF_TRACE_UPROBE_SESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 711173acbcef..faac1c79840d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -133,6 +133,7 @@ static const char * const attach_type_name[] = {
 	[BPF_NETKIT_PRIMARY]		= "netkit_primary",
 	[BPF_NETKIT_PEER]		= "netkit_peer",
 	[BPF_TRACE_KPROBE_SESSION]	= "trace_kprobe_session",
+	[BPF_TRACE_UPROBE_SESSION]	= "trace_uprobe_session",
 };
 
 static const char * const link_type_name[] = {
-- 
cgit v1.2.3


From d2bd39c0456b75be9dfc7d774b8d021355c26ae3 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 18 Oct 2024 17:47:49 +0300
Subject: PCI: Store all PCIe Supported Link Speeds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PCIe bandwidth controller added by a subsequent commit will require
selecting PCIe Link Speeds that are lower than the Maximum Link Speed.

The struct pci_bus only stores max_bus_speed. Even if PCIe r6.1 sec 8.2.1
currently disallows gaps in supported Link Speeds, the Implementation Note
in PCIe r6.1 sec 7.5.3.18 recommends determining supported Link Speeds
using the Supported Link Speeds Vector in the Link Capabilities 2 Register
(when available) to "avoid software being confused if a future
specification defines Links that do not require support for all slower
speeds."

Reuse code in pcie_get_speed_cap() to add pcie_get_supported_speeds() to
query the Supported Link Speeds Vector of a PCIe device. The value is taken
directly from the Supported Link Speeds Vector or synthesized from the Max
Link Speed in the Link Capabilities Register when the Link Capabilities 2
Register is not available.

The Supported Link Speeds Vector in the Link Capabilities 2 Register
corresponds to the bus below on Root Ports and Downstream Ports, whereas it
corresponds to the bus above on Upstream Ports and Endpoints (PCIe r6.1 sec
7.5.3.18):

  Supported Link Speeds Vector - This field indicates the supported Link
  speed(s) of the associated Port.

Add supported_speeds into the struct pci_dev that caches the
Supported Link Speeds Vector.

supported_speeds contains a set of Link Speeds only in the case where PCIe
Link Speed can be determined. Root Complex Integrated Endpoints do not have
a well-defined Link Speed because they do not implement either of the Link
Capabilities Registers, which is allowed by PCIe r6.1 sec 7.5.3 (the same
limitation applies to determining cur_bus_speed and max_bus_speed that are
PCI_SPEED_UNKNOWN in such case). This is of no concern from PCIe bandwidth
controller point of view because such devices are not attached into a PCIe
Root Port that could be controlled.

The supported_speeds field keeps the extra reserved zero at the least
significant bit to match the Link Capabilities 2 Register layout.

An attempt was made to store supported_speeds field into the struct pci_bus
as an intersection of both ends of the Link, however, the subordinate
struct pci_bus is not available early enough. The Target Speed quirk (in
pcie_failed_link_retrain()) can run either during initial scan or later,
requiring it to use the API provided by the PCIe bandwidth controller to
set the Target Link Speed in order to co-exist with the bandwidth
controller. When the Target Speed quirk is calling the bandwidth controller
during initial scan, the struct pci_bus is not yet initialized. As such,
storing supported_speeds into the struct pci_bus is not viable.

Suggested-by: Lukas Wunner <lukas@wunner.de>
Link: https://lore.kernel.org/r/20241018144755.7875-4-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: move pcie_get_supported_speeds() decl to drivers/pci/pci.h]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/pci/pci.c             | 58 +++++++++++++++++++++++++++++++------------
 drivers/pci/pci.h             |  1 +
 drivers/pci/probe.c           |  3 +++
 include/linux/pci.h           | 10 +++++++-
 include/uapi/linux/pci_regs.h |  1 +
 5 files changed, 56 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7d85c04fbba2..3d67e8b50ba2 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -6189,38 +6189,64 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 EXPORT_SYMBOL(pcie_bandwidth_available);
 
 /**
- * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * pcie_get_supported_speeds - query Supported Link Speed Vector
  * @dev: PCI device to query
  *
- * Query the PCI device speed capability.  Return the maximum link speed
- * supported by the device.
+ * Query @dev supported link speeds.
+ *
+ * Implementation Note in PCIe r6.0 sec 7.5.3.18 recommends determining
+ * supported link speeds using the Supported Link Speeds Vector in the Link
+ * Capabilities 2 Register (when available).
+ *
+ * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18.
+ *
+ * Without Link Capabilities 2, i.e., prior to PCIe r3.0, Supported Link
+ * Speeds field in Link Capabilities is used and only 2.5 GT/s and 5.0 GT/s
+ * speeds were defined.
+ *
+ * For @dev without Supported Link Speed Vector, the field is synthesized
+ * from the Max Link Speed field in the Link Capabilities Register.
+ *
+ * Return: Supported Link Speeds Vector (+ reserved 0 at LSB).
  */
-enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+u8 pcie_get_supported_speeds(struct pci_dev *dev)
 {
 	u32 lnkcap2, lnkcap;
+	u8 speeds;
 
 	/*
-	 * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18.  The
-	 * implementation note there recommends using the Supported Link
-	 * Speeds Vector in Link Capabilities 2 when supported.
-	 *
-	 * Without Link Capabilities 2, i.e., prior to PCIe r3.0, software
-	 * should use the Supported Link Speeds field in Link Capabilities,
-	 * where only 2.5 GT/s and 5.0 GT/s speeds were defined.
+	 * Speeds retain the reserved 0 at LSB before PCIe Supported Link
+	 * Speeds Vector to allow using SLS Vector bit defines directly.
 	 */
 	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2);
+	speeds = lnkcap2 & PCI_EXP_LNKCAP2_SLS;
 
 	/* PCIe r3.0-compliant */
-	if (lnkcap2)
-		return PCIE_LNKCAP2_SLS2SPEED(lnkcap2);
+	if (speeds)
+		return speeds;
 
 	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
+
+	/* Synthesize from the Max Link Speed field */
 	if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB)
-		return PCIE_SPEED_5_0GT;
+		speeds = PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB;
 	else if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_2_5GB)
-		return PCIE_SPEED_2_5GT;
+		speeds = PCI_EXP_LNKCAP2_SLS_2_5GB;
 
-	return PCI_SPEED_UNKNOWN;
+	return speeds;
+}
+
+/**
+ * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * @dev: PCI device to query
+ *
+ * Query the PCI device speed capability.
+ *
+ * Return: the maximum link speed supported by the device.
+ */
+enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+{
+	return PCIE_LNKCAP2_SLS2SPEED(dev->supported_speeds);
 }
 EXPORT_SYMBOL(pcie_get_speed_cap);
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 14d00ce45bfa..d0a46ecf7289 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -373,6 +373,7 @@ static inline int pcie_dev_speed_mbps(enum pci_bus_speed speed)
 	return -EINVAL;
 }
 
+u8 pcie_get_supported_speeds(struct pci_dev *dev);
 const char *pci_speed_string(enum pci_bus_speed speed);
 enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
 enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..af153a8e8225 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1947,6 +1947,9 @@ int pci_setup_device(struct pci_dev *dev)
 
 	set_pcie_untrusted(dev);
 
+	if (pci_is_pcie(dev))
+		dev->supported_speeds = pcie_get_supported_speeds(dev);
+
 	/* "Unknown power state" */
 	dev->current_state = PCI_UNKNOWN;
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index be5ed534c39c..99c6fa30d25b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -318,7 +318,14 @@ struct pci_sriov;
 struct pci_p2pdma;
 struct rcec_ea;
 
-/* The pci_dev structure describes PCI devices */
+/* struct pci_dev - describes a PCI device
+ *
+ * @supported_speeds:	PCIe Supported Link Speeds Vector (+ reserved 0 at
+ *			LSB). 0 when the supported speeds cannot be
+ *			determined (e.g., for Root Complex Integrated
+ *			Endpoints without the relevant Capability
+ *			Registers).
+ */
 struct pci_dev {
 	struct list_head bus_list;	/* Node in per-bus list */
 	struct pci_bus	*bus;		/* Bus this device is on */
@@ -522,6 +529,7 @@ struct pci_dev {
 	struct npem	*npem;		/* Native PCIe Enclosure Management */
 #endif
 	u16		acs_cap;	/* ACS Capability offset */
+	u8		supported_speeds; /* Supported Link Speeds Vector */
 	phys_addr_t	rom;		/* Physical address if not from BAR */
 	size_t		romlen;		/* Length if not from BAR */
 	/*
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 12323b3334a9..f3c9de0a497c 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -678,6 +678,7 @@
 #define PCI_EXP_DEVSTA2		0x2a	/* Device Status 2 */
 #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 0x2c	/* end of v2 EPs w/o link */
 #define PCI_EXP_LNKCAP2		0x2c	/* Link Capabilities 2 */
+#define  PCI_EXP_LNKCAP2_SLS		0x000000fe /* Supported Link Speeds Vector */
 #define  PCI_EXP_LNKCAP2_SLS_2_5GB	0x00000002 /* Supported Speed 2.5GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_5_0GB	0x00000004 /* Supported Speed 5GT/s */
 #define  PCI_EXP_LNKCAP2_SLS_8_0GB	0x00000008 /* Supported Speed 8GT/s */
-- 
cgit v1.2.3


From 5dc51ec86df6e2214d8398079c1e31736593ab53 Mon Sep 17 00:00:00 2001
From: Martin Karsten <mkarsten@uwaterloo.ca>
Date: Sat, 9 Nov 2024 05:02:31 +0000
Subject: net: Add napi_struct parameter irq_suspend_timeout

Add a per-NAPI IRQ suspension parameter, which can be read and set with
netdev-genl.

This patch doesn't change any behavior but prepares the code for other
changes in the following commits which use irq_suspend_timeout as a
timeout for IRQ suspension.

Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Co-developed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Tested-by: Joe Damato <jdamato@fastly.com>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml |  7 +++++++
 include/linux/netdevice.h               |  2 ++
 include/uapi/linux/netdev.h             |  1 +
 net/core/dev.c                          |  2 ++
 net/core/dev.h                          | 25 +++++++++++++++++++++++++
 net/core/netdev-genl-gen.c              |  5 +++--
 net/core/netdev-genl.c                  | 12 ++++++++++++
 tools/include/uapi/linux/netdev.h       |  1 +
 8 files changed, 53 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index f9cb97d6106c..cbb544bd6c84 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -263,6 +263,11 @@ attribute-sets:
              the end of a NAPI cycle. This may add receive latency in exchange
              for reducing the number of frames processed by the network stack.
         type: uint
+      -
+        name: irq-suspend-timeout
+        doc: The timeout, in nanoseconds, of how long to suspend irq
+             processing, if event polling finds events
+        type: uint
   -
     name: queue
     attributes:
@@ -653,6 +658,7 @@ operations:
             - pid
             - defer-hard-irqs
             - gro-flush-timeout
+            - irq-suspend-timeout
       dump:
         request:
           attributes:
@@ -704,6 +710,7 @@ operations:
             - id
             - defer-hard-irqs
             - gro-flush-timeout
+            - irq-suspend-timeout
 
 kernel-family:
   headers: [ "linux/list.h"]
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index df4483598628..0aae346d919e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -348,6 +348,7 @@ struct gro_list {
  */
 struct napi_config {
 	u64 gro_flush_timeout;
+	u64 irq_suspend_timeout;
 	u32 defer_hard_irqs;
 	unsigned int napi_id;
 };
@@ -384,6 +385,7 @@ struct napi_struct {
 	struct hrtimer		timer;
 	struct task_struct	*thread;
 	unsigned long		gro_flush_timeout;
+	unsigned long		irq_suspend_timeout;
 	u32			defer_hard_irqs;
 	/* control-path-only fields follow */
 	struct list_head	dev_list;
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index e3ebb49f60d2..e4be227d3ad6 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -124,6 +124,7 @@ enum {
 	NETDEV_A_NAPI_PID,
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+	NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
diff --git a/net/core/dev.c b/net/core/dev.c
index 6a31152e4606..4d910872963f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6666,6 +6666,7 @@ static void napi_restore_config(struct napi_struct *n)
 {
 	n->defer_hard_irqs = n->config->defer_hard_irqs;
 	n->gro_flush_timeout = n->config->gro_flush_timeout;
+	n->irq_suspend_timeout = n->config->irq_suspend_timeout;
 	/* a NAPI ID might be stored in the config, if so use it. if not, use
 	 * napi_hash_add to generate one for us. It will be saved to the config
 	 * in napi_disable.
@@ -6680,6 +6681,7 @@ static void napi_save_config(struct napi_struct *n)
 {
 	n->config->defer_hard_irqs = n->defer_hard_irqs;
 	n->config->gro_flush_timeout = n->gro_flush_timeout;
+	n->config->irq_suspend_timeout = n->irq_suspend_timeout;
 	n->config->napi_id = n->napi_id;
 	napi_hash_del(n);
 }
diff --git a/net/core/dev.h b/net/core/dev.h
index 7881bced70a9..d043dee25a68 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -236,6 +236,31 @@ static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
 		netdev->napi_config[i].gro_flush_timeout = timeout;
 }
 
+/**
+ * napi_get_irq_suspend_timeout - get the irq_suspend_timeout
+ * @n: napi struct to get the irq_suspend_timeout from
+ *
+ * Return: the per-NAPI value of the irq_suspend_timeout field.
+ */
+static inline unsigned long
+napi_get_irq_suspend_timeout(const struct napi_struct *n)
+{
+	return READ_ONCE(n->irq_suspend_timeout);
+}
+
+/**
+ * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi
+ * @n: napi struct to set the irq_suspend_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout
+ */
+static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
+						unsigned long timeout)
+{
+	WRITE_ONCE(n->irq_suspend_timeout, timeout);
+}
+
 int rps_cpumask_housekeeping(struct cpumask *mask);
 
 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 21de7e10be16..a89cbd8d87c3 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -92,10 +92,11 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1]
 };
 
 /* NETDEV_CMD_NAPI_SET - do */
-static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = {
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = {
 	[NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
 	[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
 	[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
+	[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
 };
 
 /* Ops table for netdev */
@@ -186,7 +187,7 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.cmd		= NETDEV_CMD_NAPI_SET,
 		.doit		= netdev_nl_napi_set_doit,
 		.policy		= netdev_napi_set_nl_policy,
-		.maxattr	= NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+		.maxattr	= NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 };
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index b49c3b4e5fbe..765ce7c9d73b 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -161,6 +161,7 @@ static int
 netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 			const struct genl_info *info)
 {
+	unsigned long irq_suspend_timeout;
 	unsigned long gro_flush_timeout;
 	u32 napi_defer_hard_irqs;
 	void *hdr;
@@ -196,6 +197,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 			napi_defer_hard_irqs))
 		goto nla_put_failure;
 
+	irq_suspend_timeout = napi_get_irq_suspend_timeout(napi);
+	if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+			 irq_suspend_timeout))
+		goto nla_put_failure;
+
 	gro_flush_timeout = napi_get_gro_flush_timeout(napi);
 	if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
 			 gro_flush_timeout))
@@ -306,6 +312,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 static int
 netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
 {
+	u64 irq_suspend_timeout = 0;
 	u64 gro_flush_timeout = 0;
 	u32 defer = 0;
 
@@ -314,6 +321,11 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
 		napi_set_defer_hard_irqs(napi, defer);
 	}
 
+	if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) {
+		irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]);
+		napi_set_irq_suspend_timeout(napi, irq_suspend_timeout);
+	}
+
 	if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) {
 		gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]);
 		napi_set_gro_flush_timeout(napi, gro_flush_timeout);
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index e3ebb49f60d2..e4be227d3ad6 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -124,6 +124,7 @@ enum {
 	NETDEV_A_NAPI_PID,
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+	NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
-- 
cgit v1.2.3


From ed9d95f691c29748f21bc019de9566b698fdfab7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 11 Nov 2024 10:09:56 -0500
Subject: fs: add the ability for statmount() to report the fs_subtype

/proc/self/mountinfo prints out the sb->s_subtype after the type. This
is particularly useful for disambiguating FUSE mounts (at least when the
userland driver bothers to set it). Add STATMOUNT_FS_SUBTYPE and claim
one of the __spare2 fields to point to the offset into the str[] array.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20241111-statmount-v4-2-2eaf35d07a80@kernel.org
Acked-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 19 +++++++++++++++++--
 include/uapi/linux/mount.h |  5 ++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 23187a414754..dbd89fffd919 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5004,6 +5004,14 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
 	return 0;
 }
 
+static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
+{
+	struct super_block *sb = s->mnt->mnt_sb;
+
+	if (sb->s_subtype)
+		seq_puts(seq, sb->s_subtype);
+}
+
 static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
 {
 	s->sm.mask |= STATMOUNT_MNT_NS_ID;
@@ -5040,7 +5048,7 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
 
 static int statmount_string(struct kstatmount *s, u64 flag)
 {
-	int ret;
+	int ret = 0;
 	size_t kbufsize;
 	struct seq_file *seq = &s->seq;
 	struct statmount *sm = &s->sm;
@@ -5063,6 +5071,10 @@ static int statmount_string(struct kstatmount *s, u64 flag)
 		sm->mnt_opts = start;
 		ret = statmount_mnt_opts(s, seq);
 		break;
+	case STATMOUNT_FS_SUBTYPE:
+		sm->fs_subtype = start;
+		statmount_fs_subtype(s, seq);
+		break;
 	default:
 		WARN_ON_ONCE(true);
 		return -EINVAL;
@@ -5208,6 +5220,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 	if (!err && s->mask & STATMOUNT_MNT_OPTS)
 		err = statmount_string(s, STATMOUNT_MNT_OPTS);
 
+	if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
+		err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
+
 	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
 		statmount_mnt_ns_id(s, ns);
 
@@ -5229,7 +5244,7 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
 }
 
 #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
-			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)
+			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE)
 
 static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
 			      struct statmount __user *buf, size_t bufsize,
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 225bc366ffcb..2e939dddf9cb 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -173,7 +173,9 @@ struct statmount {
 	__u32 mnt_root;		/* [str] Root of mount relative to root of fs */
 	__u32 mnt_point;	/* [str] Mountpoint relative to current root */
 	__u64 mnt_ns_id;	/* ID of the mount namespace */
-	__u64 __spare2[49];
+	__u32 fs_subtype;	/* [str] Subtype of fs_type (if any) */
+	__u32 __spare1[1];
+	__u64 __spare2[48];
 	char str[];		/* Variable size part containing strings */
 };
 
@@ -207,6 +209,7 @@ struct mnt_id_req {
 #define STATMOUNT_FS_TYPE		0x00000020U	/* Want/got fs_type */
 #define STATMOUNT_MNT_NS_ID		0x00000040U	/* Want/got mnt_ns_id */
 #define STATMOUNT_MNT_OPTS		0x00000080U	/* Want/got mnt_opts */
+#define STATMOUNT_FS_SUBTYPE		0x00000100U	/* Want/got fs_subtype */
 
 /*
  * Special @mnt_id values that can be passed to listmount
-- 
cgit v1.2.3


From 4db97c21ed07a7d4081ed9820599fa36857083d6 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:04:21 -0800
Subject: iommufd/viommu: Add IOMMU_VIOMMU_ALLOC ioctl

Add a new ioctl for user space to do a vIOMMU allocation. It must be based
on a nesting parent HWPT, so take its refcount.

IOMMU driver wanting to support vIOMMUs must define its IOMMU_VIOMMU_TYPE_
in the uAPI header and implement a viommu_alloc op in its iommu_ops.

Link: https://patch.msgid.link/r/dc2b8ba9ac935007beff07c1761c31cd097ed780.1730836219.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/Makefile          |  3 +-
 drivers/iommu/iommufd/iommufd_private.h |  3 ++
 drivers/iommu/iommufd/main.c            |  6 +++
 drivers/iommu/iommufd/viommu.c          | 81 +++++++++++++++++++++++++++++++++
 include/uapi/linux/iommufd.h            | 40 ++++++++++++++++
 5 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iommu/iommufd/viommu.c

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 83df9077063e..cb784da6cddc 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -7,7 +7,8 @@ iommufd-y := \
 	ioas.o \
 	main.o \
 	pages.o \
-	vfio_compat.o
+	vfio_compat.o \
+	viommu.o
 
 iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index be347f726fda..a8104d9d4cef 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -506,6 +506,9 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
 	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
 }
 
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_viommu_destroy(struct iommufd_object *obj);
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 30e6c2af3b45..cc514f9bc3e6 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -307,6 +307,7 @@ union ucmd_buffer {
 	struct iommu_ioas_unmap unmap;
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
+	struct iommu_viommu_alloc viommu;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -360,6 +361,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 val64),
 	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
 		 __reserved),
+	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
+		 struct iommu_viommu_alloc, out_viommu_id),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -495,6 +498,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_FAULT] = {
 		.destroy = iommufd_fault_destroy,
 	},
+	[IOMMUFD_OBJ_VIOMMU] = {
+		.destroy = iommufd_viommu_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
new file mode 100644
index 000000000000..888239b78667
--- /dev/null
+++ b/drivers/iommu/iommufd/viommu.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "iommufd_private.h"
+
+void iommufd_viommu_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_viommu *viommu =
+		container_of(obj, struct iommufd_viommu, obj);
+
+	if (viommu->ops && viommu->ops->destroy)
+		viommu->ops->destroy(viommu);
+	refcount_dec(&viommu->hwpt->common.obj.users);
+}
+
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_viommu_alloc *cmd = ucmd->cmd;
+	struct iommufd_hwpt_paging *hwpt_paging;
+	struct iommufd_viommu *viommu;
+	struct iommufd_device *idev;
+	const struct iommu_ops *ops;
+	int rc;
+
+	if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT)
+		return -EOPNOTSUPP;
+
+	idev = iommufd_get_device(ucmd, cmd->dev_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+
+	ops = dev_iommu_ops(idev->dev);
+	if (!ops->viommu_alloc) {
+		rc = -EOPNOTSUPP;
+		goto out_put_idev;
+	}
+
+	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+	if (IS_ERR(hwpt_paging)) {
+		rc = PTR_ERR(hwpt_paging);
+		goto out_put_idev;
+	}
+
+	if (!hwpt_paging->nest_parent) {
+		rc = -EINVAL;
+		goto out_put_hwpt;
+	}
+
+	viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain,
+				   ucmd->ictx, cmd->type);
+	if (IS_ERR(viommu)) {
+		rc = PTR_ERR(viommu);
+		goto out_put_hwpt;
+	}
+
+	viommu->type = cmd->type;
+	viommu->ictx = ucmd->ictx;
+	viommu->hwpt = hwpt_paging;
+	refcount_inc(&viommu->hwpt->common.obj.users);
+	/*
+	 * It is the most likely case that a physical IOMMU is unpluggable. A
+	 * pluggable IOMMU instance (if exists) is responsible for refcounting
+	 * on its own.
+	 */
+	viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
+
+	cmd->out_viommu_id = viommu->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_abort;
+	iommufd_object_finalize(ucmd->ictx, &viommu->obj);
+	goto out_put_hwpt;
+
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj);
+out_put_hwpt:
+	iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj);
+out_put_idev:
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+	return rc;
+}
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 41b1a01e9293..302844136b02 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -52,6 +52,7 @@ enum {
 	IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
 	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
 	IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
+	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
 };
 
 /**
@@ -822,4 +823,43 @@ struct iommu_fault_alloc {
 	__u32 out_fault_fd;
 };
 #define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
+
+/**
+ * enum iommu_viommu_type - Virtual IOMMU Type
+ * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_viommu_type {
+	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
+ * @size: sizeof(struct iommu_viommu_alloc)
+ * @flags: Must be 0
+ * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
+ * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
+ * @hwpt_id: ID of a nesting parent HWPT to associate to
+ * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ *
+ * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's
+ * virtualization support that is a security-isolated slice of the real IOMMU HW
+ * that is unique to a specific VM. Operations global to the IOMMU are connected
+ * to the vIOMMU, such as:
+ * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * - Non-device-affiliated event reporting, e.g. invalidation queue errors
+ * - Access to a sharable nesting parent pagetable across physical IOMMUs
+ * - Virtualization of various platform IDs, e.g. RIDs and others
+ * - Delivery of paravirtualized invalidation
+ * - Direct assigned invalidation queues
+ * - Direct assigned interrupts
+ */
+struct iommu_viommu_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 type;
+	__u32 dev_id;
+	__u32 hwpt_id;
+	__u32 out_viommu_id;
+};
+#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
 #endif
-- 
cgit v1.2.3


From 13a750180fc86d41695c8f64d8892412482a401d Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:04:23 -0800
Subject: iommufd: Allow pt_id to carry viommu_id for IOMMU_HWPT_ALLOC

Now a vIOMMU holds a shareable nesting parent HWPT. So, it can act like
that nesting parent HWPT to allocate a nested HWPT.

Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its kdoc.

Also, add an iommufd_viommu_alloc_hwpt_nested helper to allocate a nested
HWPT for a vIOMMU object. Since a vIOMMU object holds the parent hwpt's
refcount already, increase the refcount of the vIOMMU only.

Link: https://patch.msgid.link/r/a0f24f32bfada8b448d17587adcaedeeb50a67ed.1730836219.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 73 ++++++++++++++++++++++++++++++++-
 drivers/iommu/iommufd/iommufd_private.h |  1 +
 include/uapi/linux/iommufd.h            | 14 ++++---
 3 files changed, 81 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index d06bf6e6c19f..982bf4a35a2b 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -57,7 +57,10 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
 		container_of(obj, struct iommufd_hwpt_nested, common.obj);
 
 	__iommufd_hwpt_destroy(&hwpt_nested->common);
-	refcount_dec(&hwpt_nested->parent->common.obj.users);
+	if (hwpt_nested->viommu)
+		refcount_dec(&hwpt_nested->viommu->obj.users);
+	else
+		refcount_dec(&hwpt_nested->parent->common.obj.users);
 }
 
 void iommufd_hwpt_nested_abort(struct iommufd_object *obj)
@@ -260,6 +263,58 @@ out_abort:
 	return ERR_PTR(rc);
 }
 
+/**
+ * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU
+ * @viommu: vIOMMU object to associate the hwpt_nested/domain with
+ * @flags: Flags from userspace
+ * @user_data: user_data pointer. Must be valid
+ *
+ * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED
+ * hw_pagetable.
+ */
+static struct iommufd_hwpt_nested *
+iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
+				 const struct iommu_user_data *user_data)
+{
+	struct iommufd_hwpt_nested *hwpt_nested;
+	struct iommufd_hw_pagetable *hwpt;
+	int rc;
+
+	if (!user_data->len)
+		return ERR_PTR(-EOPNOTSUPP);
+	if (!viommu->ops || !viommu->ops->alloc_domain_nested)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	hwpt_nested = __iommufd_object_alloc(
+		viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj);
+	if (IS_ERR(hwpt_nested))
+		return ERR_CAST(hwpt_nested);
+	hwpt = &hwpt_nested->common;
+
+	hwpt_nested->viommu = viommu;
+	refcount_inc(&viommu->obj.users);
+	hwpt_nested->parent = viommu->hwpt;
+
+	hwpt->domain =
+		viommu->ops->alloc_domain_nested(viommu, flags, user_data);
+	if (IS_ERR(hwpt->domain)) {
+		rc = PTR_ERR(hwpt->domain);
+		hwpt->domain = NULL;
+		goto out_abort;
+	}
+	hwpt->domain->owner = viommu->iommu_dev->ops;
+
+	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
+		rc = -EINVAL;
+		goto out_abort;
+	}
+	return hwpt_nested;
+
+out_abort:
+	iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj);
+	return ERR_PTR(rc);
+}
+
 int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 {
 	struct iommu_hwpt_alloc *cmd = ucmd->cmd;
@@ -316,6 +371,22 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 			goto out_unlock;
 		}
 		hwpt = &hwpt_nested->common;
+	} else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+		struct iommufd_hwpt_nested *hwpt_nested;
+		struct iommufd_viommu *viommu;
+
+		viommu = container_of(pt_obj, struct iommufd_viommu, obj);
+		if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+		hwpt_nested = iommufd_viommu_alloc_hwpt_nested(
+			viommu, cmd->flags, &user_data);
+		if (IS_ERR(hwpt_nested)) {
+			rc = PTR_ERR(hwpt_nested);
+			goto out_unlock;
+		}
+		hwpt = &hwpt_nested->common;
 	} else {
 		rc = -EINVAL;
 		goto out_put_pt;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index a8104d9d4cef..e8f5ef550cc9 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -290,6 +290,7 @@ struct iommufd_hwpt_paging {
 struct iommufd_hwpt_nested {
 	struct iommufd_hw_pagetable common;
 	struct iommufd_hwpt_paging *parent;
+	struct iommufd_viommu *viommu;
 };
 
 static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 302844136b02..a498d4838f9a 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -435,7 +435,7 @@ enum iommu_hwpt_data_type {
  * @size: sizeof(struct iommu_hwpt_alloc)
  * @flags: Combination of enum iommufd_hwpt_alloc_flags
  * @dev_id: The device to allocate this HWPT for
- * @pt_id: The IOAS or HWPT to connect this HWPT to
+ * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to
  * @out_hwpt_id: The ID of the new HWPT
  * @__reserved: Must be 0
  * @data_type: One of enum iommu_hwpt_data_type
@@ -454,11 +454,13 @@ enum iommu_hwpt_data_type {
  * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
  * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
  *
- * A user-managed nested HWPT will be created from a given parent HWPT via
- * @pt_id, in which the parent HWPT must be allocated previously via the
- * same ioctl from a given IOAS (@pt_id). In this case, the @data_type
- * must be set to a pre-defined type corresponding to an I/O page table
- * type supported by the underlying IOMMU hardware.
+ * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a
+ * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be
+ * allocated previously via the same ioctl from a given IOAS (@pt_id). In this
+ * case, the @data_type must be set to a pre-defined type corresponding to an
+ * I/O page table type supported by the underlying IOMMU hardware. The device
+ * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU
+ * instance.
  *
  * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
  * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr
-- 
cgit v1.2.3


From 0ce5c2477af2e2284b9c70474e4dae85db211680 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:09 -0800
Subject: iommufd/viommu: Add IOMMUFD_OBJ_VDEVICE and IOMMU_VDEVICE_ALLOC ioctl

Introduce a new IOMMUFD_OBJ_VDEVICE to represent a physical device (struct
device) against a vIOMMU (struct iommufd_viommu) object in a VM.

This vDEVICE object (and its structure) holds all the information and attributes
in the VM, regarding the device related to the vIOMMU.

As an initial patch, add a per-vIOMMU virtual ID. This can be:
 - Virtual StreamID on a nested ARM SMMUv3, an index to a Stream Table
 - Virtual DeviceID on a nested AMD IOMMU, an index to a Device Table
 - Virtual RID on a nested Intel VT-D IOMMU, an index to a Context Table
Potentially, this vDEVICE structure would hold some vData for Confidential
Compute Architecture (CCA). Use this virtual ID to index a "vdevs" xarray
that belongs to a vIOMMU object.

Add a new ioctl for vDEVICE allocations. Since a vDEVICE is a connection
of a device object and an iommufd_viommu object, take two refcounts in the
ioctl handler.

Link: https://patch.msgid.link/r/cda8fd2263166e61b8191a3b3207e0d2b08545bf.1730836308.git.nicolinc@nvidia.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h | 18 ++++++++
 drivers/iommu/iommufd/main.c            |  6 +++
 drivers/iommu/iommufd/viommu.c          | 76 +++++++++++++++++++++++++++++++++
 include/linux/iommufd.h                 |  4 ++
 include/uapi/linux/iommufd.h            | 22 ++++++++++
 5 files changed, 126 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index e8f5ef550cc9..062656c19a07 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -507,8 +507,26 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev,
 	return iommu_group_replace_domain(idev->igroup->group, hwpt->domain);
 }
 
+static inline struct iommufd_viommu *
+iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
+{
+	return container_of(iommufd_get_object(ucmd->ictx, id,
+					       IOMMUFD_OBJ_VIOMMU),
+			    struct iommufd_viommu, obj);
+}
+
 int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_vdevice_destroy(struct iommufd_object *obj);
+
+struct iommufd_vdevice {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct iommufd_viommu *viommu;
+	struct device *dev;
+	u64 id; /* per-vIOMMU virtual ID */
+};
 
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index cc514f9bc3e6..d735fe04197f 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -308,6 +308,7 @@ union ucmd_buffer {
 	struct iommu_option option;
 	struct iommu_vfio_ioas vfio_ioas;
 	struct iommu_viommu_alloc viommu;
+	struct iommu_vdevice_alloc vdev;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -363,6 +364,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 __reserved),
 	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
 		 struct iommu_viommu_alloc, out_viommu_id),
+	IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
+		 struct iommu_vdevice_alloc, virt_id),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
@@ -501,6 +504,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_VIOMMU] = {
 		.destroy = iommufd_viommu_destroy,
 	},
+	[IOMMUFD_OBJ_VDEVICE] = {
+		.destroy = iommufd_vdevice_destroy,
+	},
 #ifdef CONFIG_IOMMUFD_TEST
 	[IOMMUFD_OBJ_SELFTEST] = {
 		.destroy = iommufd_selftest_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 888239b78667..69b88e8c7c26 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -11,6 +11,7 @@ void iommufd_viommu_destroy(struct iommufd_object *obj)
 	if (viommu->ops && viommu->ops->destroy)
 		viommu->ops->destroy(viommu);
 	refcount_dec(&viommu->hwpt->common.obj.users);
+	xa_destroy(&viommu->vdevs);
 }
 
 int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
@@ -53,6 +54,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_hwpt;
 	}
 
+	xa_init(&viommu->vdevs);
 	viommu->type = cmd->type;
 	viommu->ictx = ucmd->ictx;
 	viommu->hwpt = hwpt_paging;
@@ -79,3 +81,77 @@ out_put_idev:
 	iommufd_put_object(ucmd->ictx, &idev->obj);
 	return rc;
 }
+
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_vdevice *vdev =
+		container_of(obj, struct iommufd_vdevice, obj);
+	struct iommufd_viommu *viommu = vdev->viommu;
+
+	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
+	xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL);
+	refcount_dec(&viommu->obj.users);
+	put_device(vdev->dev);
+}
+
+int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_vdevice_alloc *cmd = ucmd->cmd;
+	struct iommufd_vdevice *vdev, *curr;
+	struct iommufd_viommu *viommu;
+	struct iommufd_device *idev;
+	u64 virt_id = cmd->virt_id;
+	int rc = 0;
+
+	/* virt_id indexes an xarray */
+	if (virt_id > ULONG_MAX)
+		return -EINVAL;
+
+	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+	if (IS_ERR(viommu))
+		return PTR_ERR(viommu);
+
+	idev = iommufd_get_device(ucmd, cmd->dev_id);
+	if (IS_ERR(idev)) {
+		rc = PTR_ERR(idev);
+		goto out_put_viommu;
+	}
+
+	if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) {
+		rc = -EINVAL;
+		goto out_put_idev;
+	}
+
+	vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE);
+	if (IS_ERR(vdev)) {
+		rc = PTR_ERR(vdev);
+		goto out_put_idev;
+	}
+
+	vdev->id = virt_id;
+	vdev->dev = idev->dev;
+	get_device(idev->dev);
+	vdev->viommu = viommu;
+	refcount_inc(&viommu->obj.users);
+
+	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
+	if (curr) {
+		rc = xa_err(curr) ?: -EEXIST;
+		goto out_abort;
+	}
+
+	cmd->out_vdevice_id = vdev->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_abort;
+	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
+	goto out_put_idev;
+
+out_abort:
+	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_put_idev:
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+out_put_viommu:
+	iommufd_put_object(ucmd->ictx, &viommu->obj);
+	return rc;
+}
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index de9b56265c9c..71fa1e343023 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/refcount.h>
 #include <linux/types.h>
+#include <linux/xarray.h>
 
 struct device;
 struct file;
@@ -31,6 +32,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_ACCESS,
 	IOMMUFD_OBJ_FAULT,
 	IOMMUFD_OBJ_VIOMMU,
+	IOMMUFD_OBJ_VDEVICE,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -89,6 +91,8 @@ struct iommufd_viommu {
 
 	const struct iommufd_viommu_ops *ops;
 
+	struct xarray vdevs;
+
 	unsigned int type;
 };
 
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index a498d4838f9a..9b5236004b8e 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -53,6 +53,7 @@ enum {
 	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
 	IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
 	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
+	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
 };
 
 /**
@@ -864,4 +865,25 @@ struct iommu_viommu_alloc {
 	__u32 out_viommu_id;
 };
 #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
+
+/**
+ * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
+ * @size: sizeof(struct iommu_vdevice_alloc)
+ * @viommu_id: vIOMMU ID to associate with the virtual device
+ * @dev_id: The physical device to allocate a virtual instance on the vIOMMU
+ * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTROY
+ * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID
+ *           of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table
+ *
+ * Allocate a virtual device instance (for a physical device) against a vIOMMU.
+ * This instance holds the device's information (related to its vIOMMU) in a VM.
+ */
+struct iommu_vdevice_alloc {
+	__u32 size;
+	__u32 viommu_id;
+	__u32 dev_id;
+	__u32 out_vdevice_id;
+	__aligned_u64 virt_id;
+};
+#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
 #endif
-- 
cgit v1.2.3


From 54ce69e36c71c88f258b1a322c54343d90954858 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 5 Nov 2024 12:05:12 -0800
Subject: iommufd: Allow hwpt_id to carry viommu_id for IOMMU_HWPT_INVALIDATE

With a vIOMMU object, user space can flush any IOMMU related cache that can
be directed via a vIOMMU object. It is similar to the IOMMU_HWPT_INVALIDATE
uAPI, but can cover a wider range than IOTLB, e.g. device/descriptor cache.

Allow hwpt_id of the iommu_hwpt_invalidate structure to carry a viommu_id,
and reuse the IOMMU_HWPT_INVALIDATE uAPI for vIOMMU invalidations. Drivers
can define different structures for vIOMMU invalidations vs. HWPT ones.

Since both the HWPT-based and vIOMMU-based invalidation pathways check their
own cache invalidation op, remove the WARN_ON_ONCE in the allocator.

Update the uAPI, kdoc, and selftest case accordingly.

Link: https://patch.msgid.link/r/b411e2245e303b8a964f39f49453a5dff280968f.1730836308.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 40 +++++++++++++++++++++++++--------
 include/uapi/linux/iommufd.h            |  9 +++++---
 tools/testing/selftests/iommu/iommufd.c |  4 ++--
 3 files changed, 39 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 982bf4a35a2b..702057655a81 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -251,8 +251,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
 	}
 	hwpt->domain->owner = ops;
 
-	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED ||
-			 !hwpt->domain->ops->cache_invalidate_user)) {
+	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
 		rc = -EINVAL;
 		goto out_abort;
 	}
@@ -483,7 +482,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
 		.entry_len = cmd->entry_len,
 		.entry_num = cmd->entry_num,
 	};
-	struct iommufd_hw_pagetable *hwpt;
+	struct iommufd_object *pt_obj;
 	u32 done_num = 0;
 	int rc;
 
@@ -497,17 +496,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd)
 		goto out;
 	}
 
-	hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id);
-	if (IS_ERR(hwpt)) {
-		rc = PTR_ERR(hwpt);
+	pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY);
+	if (IS_ERR(pt_obj)) {
+		rc = PTR_ERR(pt_obj);
 		goto out;
 	}
+	if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) {
+		struct iommufd_hw_pagetable *hwpt =
+			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+		if (!hwpt->domain->ops ||
+		    !hwpt->domain->ops->cache_invalidate_user) {
+			rc = -EOPNOTSUPP;
+			goto out_put_pt;
+		}
+		rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
+							      &data_array);
+	} else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+		struct iommufd_viommu *viommu =
+			container_of(pt_obj, struct iommufd_viommu, obj);
+
+		if (!viommu->ops || !viommu->ops->cache_invalidate) {
+			rc = -EOPNOTSUPP;
+			goto out_put_pt;
+		}
+		rc = viommu->ops->cache_invalidate(viommu, &data_array);
+	} else {
+		rc = -EINVAL;
+		goto out_put_pt;
+	}
 
-	rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain,
-						      &data_array);
 	done_num = data_array.entry_num;
 
-	iommufd_put_object(ucmd->ictx, &hwpt->obj);
+out_put_pt:
+	iommufd_put_object(ucmd->ictx, pt_obj);
 out:
 	cmd->entry_num = done_num;
 	if (iommufd_ucmd_respond(ucmd, sizeof(*cmd)))
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 9b5236004b8e..badb41c5bfa4 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -700,7 +700,7 @@ struct iommu_hwpt_vtd_s1_invalidate {
 /**
  * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
  * @size: sizeof(struct iommu_hwpt_invalidate)
- * @hwpt_id: ID of a nested HWPT for cache invalidation
+ * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation
  * @data_uptr: User pointer to an array of driver-specific cache invalidation
  *             data.
  * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data
@@ -711,8 +711,11 @@ struct iommu_hwpt_vtd_s1_invalidate {
  *             Output the number of requests successfully handled by kernel.
  * @__reserved: Must be 0.
  *
- * Invalidate the iommu cache for user-managed page table. Modifications on a
- * user-managed page table should be followed by this operation to sync cache.
+ * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications
+ * on a user-managed page table should be followed by this operation, if a HWPT
+ * is passed in via @hwpt_id. Other caches, such as device cache or descriptor
+ * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field.
+ *
  * Each ioctl can support one or more cache invalidation requests in the array
  * that has a total size of @entry_len * @entry_num.
  *
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index f3cb628753c9..8cb3e835ca97 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -367,9 +367,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
 		EXPECT_ERRNO(EBUSY,
 			     _test_ioctl_destroy(self->fd, parent_hwpt_id));
 
-		/* hwpt_invalidate only supports a user-managed hwpt (nested) */
+		/* hwpt_invalidate does not support a parent hwpt */
 		num_inv = 1;
-		test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs,
+		test_err_hwpt_invalidate(EINVAL, parent_hwpt_id, inv_reqs,
 					 IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
 					 sizeof(*inv_reqs), &num_inv);
 		assert(!num_inv);
-- 
cgit v1.2.3


From 69d9b312f38aa19f8c801e90bd23d70685be49f0 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:52 -0300
Subject: iommu/arm-smmu-v3: Support IOMMU_VIOMMU_ALLOC

Add a new driver-type for ARM SMMUv3 to enum iommu_viommu_type. Implement
an arm_vsmmu_alloc().

As an initial step, copy the VMID from s2_parent. A followup series is
required to give the VIOMMU object its own VMID that will be used in all
nesting configurations.

Link: https://patch.msgid.link/r/8-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c    | 45 ++++++++++++++++++++++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c        |  1 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h        | 13 +++++++
 include/uapi/linux/iommufd.h                       |  4 ++
 4 files changed, 63 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 3d2671031c9b..60dd9e907595 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -29,3 +29,48 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type)
 
 	return info;
 }
+
+static const struct iommufd_viommu_ops arm_vsmmu_ops = {
+};
+
+struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
+				       struct iommu_domain *parent,
+				       struct iommufd_ctx *ictx,
+				       unsigned int viommu_type)
+{
+	struct arm_smmu_device *smmu =
+		iommu_get_iommu_dev(dev, struct arm_smmu_device, iommu);
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct arm_smmu_domain *s2_parent = to_smmu_domain(parent);
+	struct arm_vsmmu *vsmmu;
+
+	if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (!(smmu->features & ARM_SMMU_FEAT_NESTING))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (s2_parent->smmu != master->smmu)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Must support some way to prevent the VM from bypassing the cache
+	 * because VFIO currently does not do any cache maintenance. canwbs
+	 * indicates the device is fully coherent and no cache maintenance is
+	 * ever required, even for PCI No-Snoop.
+	 */
+	if (!arm_smmu_master_canwbs(master))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core,
+				     &arm_vsmmu_ops);
+	if (IS_ERR(vsmmu))
+		return ERR_CAST(vsmmu);
+
+	vsmmu->smmu = smmu;
+	vsmmu->s2_parent = s2_parent;
+	/* FIXME Move VMID allocation from the S2 domain allocation to here */
+	vsmmu->vmid = s2_parent->s2_cfg.vmid;
+
+	return &vsmmu->core;
+}
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b4b03206afbf..c425fb923eb3 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3517,6 +3517,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.dev_disable_feat	= arm_smmu_dev_disable_feature,
 	.page_response		= arm_smmu_page_response,
 	.def_domain_type	= arm_smmu_def_domain_type,
+	.viommu_alloc		= arm_vsmmu_alloc,
 	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
 	.owner			= THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index c9e5290e995a..3b8013afcec0 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -10,6 +10,7 @@
 
 #include <linux/bitfield.h>
 #include <linux/iommu.h>
+#include <linux/iommufd.h>
 #include <linux/kernel.h>
 #include <linux/mmzone.h>
 #include <linux/sizes.h>
@@ -976,10 +977,22 @@ tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
 }
 #endif /* CONFIG_TEGRA241_CMDQV */
 
+struct arm_vsmmu {
+	struct iommufd_viommu core;
+	struct arm_smmu_device *smmu;
+	struct arm_smmu_domain *s2_parent;
+	u16 vmid;
+};
+
 #if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD)
 void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type);
+struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
+				       struct iommu_domain *parent,
+				       struct iommufd_ctx *ictx,
+				       unsigned int viommu_type);
 #else
 #define arm_smmu_hw_info NULL
+#define arm_vsmmu_alloc NULL
 #endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */
 
 #endif /* _ARM_SMMU_V3_H */
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index f4f76759b738..7cb13a29969d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -425,10 +425,12 @@ struct iommu_hwpt_vtd_s1 {
  * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
  * @IOMMU_HWPT_DATA_NONE: no data
  * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
  */
 enum iommu_hwpt_data_type {
 	IOMMU_HWPT_DATA_NONE = 0,
 	IOMMU_HWPT_DATA_VTD_S1 = 1,
+	IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
 };
 
 /**
@@ -868,9 +870,11 @@ struct iommu_fault_alloc {
 /**
  * enum iommu_viommu_type - Virtual IOMMU Type
  * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
+ * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type
  */
 enum iommu_viommu_type {
 	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
+	IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1,
 };
 
 /**
-- 
cgit v1.2.3


From 1e8be08d1c91d52a9b51d424db78ddbf88660bbb Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:53 -0300
Subject: iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED

For SMMUv3 an IOMMU_DOMAIN_NESTED is composed of an S2 iommu_domain acting
as the parent and a user provided STE fragment that defines the CD table
and related data with addresses translated by the S2 iommu_domain.

The kernel only permits userspace to control certain allowed bits of the
STE that are safe for user/guest control.

IOTLB maintenance is a bit subtle here, the S1 implicitly includes the S2
translation, but there is no way of knowing which S1 entries refer to a
range of S2.

For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to
flush all ASIDs from the VMID after flushing the S2 on any change to the
S2.

The IOMMU_DOMAIN_NESTED can only be created from inside a VIOMMU as the
invalidation path relies on the VIOMMU to translate virtual stream ID used
in the invalidation commands for the CD table and ATS.

Link: https://patch.msgid.link/r/9-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Reviewed-by: Donald Dutile <ddutile@redhat.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c    | 163 +++++++++++++++++++++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c        |  17 ++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h        |  26 ++++
 include/uapi/linux/iommufd.h                       |  20 +++
 4 files changed, 225 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 60dd9e907595..91247a2a2d2c 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -30,7 +30,170 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type)
 	return info;
 }
 
+static void arm_smmu_make_nested_cd_table_ste(
+	struct arm_smmu_ste *target, struct arm_smmu_master *master,
+	struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+	arm_smmu_make_s2_domain_ste(
+		target, master, nested_domain->vsmmu->s2_parent, ats_enabled);
+
+	target->data[0] = cpu_to_le64(STRTAB_STE_0_V |
+				      FIELD_PREP(STRTAB_STE_0_CFG,
+						 STRTAB_STE_0_CFG_NESTED));
+	target->data[0] |= nested_domain->ste[0] &
+			   ~cpu_to_le64(STRTAB_STE_0_CFG);
+	target->data[1] |= nested_domain->ste[1];
+}
+
+/*
+ * Create a physical STE from the virtual STE that userspace provided when it
+ * created the nested domain. Using the vSTE userspace can request:
+ * - Non-valid STE
+ * - Abort STE
+ * - Bypass STE (install the S2, no CD table)
+ * - CD table STE (install the S2 and the userspace CD table)
+ */
+static void arm_smmu_make_nested_domain_ste(
+	struct arm_smmu_ste *target, struct arm_smmu_master *master,
+	struct arm_smmu_nested_domain *nested_domain, bool ats_enabled)
+{
+	unsigned int cfg =
+		FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0]));
+
+	/*
+	 * Userspace can request a non-valid STE through the nesting interface.
+	 * We relay that into an abort physical STE with the intention that
+	 * C_BAD_STE for this SID can be generated to userspace.
+	 */
+	if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V)))
+		cfg = STRTAB_STE_0_CFG_ABORT;
+
+	switch (cfg) {
+	case STRTAB_STE_0_CFG_S1_TRANS:
+		arm_smmu_make_nested_cd_table_ste(target, master, nested_domain,
+						  ats_enabled);
+		break;
+	case STRTAB_STE_0_CFG_BYPASS:
+		arm_smmu_make_s2_domain_ste(target, master,
+					    nested_domain->vsmmu->s2_parent,
+					    ats_enabled);
+		break;
+	case STRTAB_STE_0_CFG_ABORT:
+	default:
+		arm_smmu_make_abort_ste(target);
+		break;
+	}
+}
+
+static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
+				      struct device *dev)
+{
+	struct arm_smmu_nested_domain *nested_domain =
+		to_smmu_nested_domain(domain);
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+	struct arm_smmu_attach_state state = {
+		.master = master,
+		.old_domain = iommu_get_domain_for_dev(dev),
+		.ssid = IOMMU_NO_PASID,
+		/* Currently invalidation of ATC is not supported */
+		.disable_ats = true,
+	};
+	struct arm_smmu_ste ste;
+	int ret;
+
+	if (nested_domain->vsmmu->smmu != master->smmu)
+		return -EINVAL;
+	if (arm_smmu_ssids_in_use(&master->cd_table))
+		return -EBUSY;
+
+	mutex_lock(&arm_smmu_asid_lock);
+	ret = arm_smmu_attach_prepare(&state, domain);
+	if (ret) {
+		mutex_unlock(&arm_smmu_asid_lock);
+		return ret;
+	}
+
+	arm_smmu_make_nested_domain_ste(&ste, master, nested_domain,
+					state.ats_enabled);
+	arm_smmu_install_ste_for_dev(master, &ste);
+	arm_smmu_attach_commit(&state);
+	mutex_unlock(&arm_smmu_asid_lock);
+	return 0;
+}
+
+static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
+{
+	kfree(to_smmu_nested_domain(domain));
+}
+
+static const struct iommu_domain_ops arm_smmu_nested_ops = {
+	.attach_dev = arm_smmu_attach_dev_nested,
+	.free = arm_smmu_domain_nested_free,
+};
+
+static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg)
+{
+	unsigned int cfg;
+
+	if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) {
+		memset(arg->ste, 0, sizeof(arg->ste));
+		return 0;
+	}
+
+	/* EIO is reserved for invalid STE data. */
+	if ((arg->ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) ||
+	    (arg->ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED))
+		return -EIO;
+
+	cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg->ste[0]));
+	if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS &&
+	    cfg != STRTAB_STE_0_CFG_S1_TRANS)
+		return -EIO;
+	return 0;
+}
+
+static struct iommu_domain *
+arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+			      const struct iommu_user_data *user_data)
+{
+	struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+	const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID;
+	struct arm_smmu_nested_domain *nested_domain;
+	struct iommu_hwpt_arm_smmuv3 arg;
+	int ret;
+
+	/*
+	 * Faults delivered to the nested domain are faults that originated by
+	 * the S1 in the domain. The core code will match all PASIDs when
+	 * delivering the fault due to user_pasid_table
+	 */
+	if (flags & ~SUPPORTED_FLAGS)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	ret = iommu_copy_struct_from_user(&arg, user_data,
+					  IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = arm_smmu_validate_vste(&arg);
+	if (ret)
+		return ERR_PTR(ret);
+
+	nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT);
+	if (!nested_domain)
+		return ERR_PTR(-ENOMEM);
+
+	nested_domain->domain.type = IOMMU_DOMAIN_NESTED;
+	nested_domain->domain.ops = &arm_smmu_nested_ops;
+	nested_domain->vsmmu = vsmmu;
+	nested_domain->ste[0] = arg.ste[0];
+	nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS);
+
+	return &nested_domain->domain;
+}
+
 static const struct iommufd_viommu_ops arm_vsmmu_ops = {
+	.alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
 };
 
 struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index c425fb923eb3..53f12b9d78ab 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -295,6 +295,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 	case CMDQ_OP_TLBI_NH_ASID:
 		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
 		fallthrough;
+	case CMDQ_OP_TLBI_NH_ALL:
 	case CMDQ_OP_TLBI_S12_VMALL:
 		cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
 		break;
@@ -2230,6 +2231,15 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size,
 	}
 	__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
 
+	if (smmu_domain->nest_parent) {
+		/*
+		 * When the S2 domain changes all the nested S1 ASIDs have to be
+		 * flushed too.
+		 */
+		cmd.opcode = CMDQ_OP_TLBI_NH_ALL;
+		arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd);
+	}
+
 	/*
 	 * Unfortunately, this can't be leaf-only since we may have
 	 * zapped an entire table.
@@ -2644,6 +2654,8 @@ to_smmu_domain_devices(struct iommu_domain *domain)
 	if ((domain->type & __IOMMU_DOMAIN_PAGING) ||
 	    domain->type == IOMMU_DOMAIN_SVA)
 		return to_smmu_domain(domain);
+	if (domain->type == IOMMU_DOMAIN_NESTED)
+		return to_smmu_nested_domain(domain)->vsmmu->s2_parent;
 	return NULL;
 }
 
@@ -2716,7 +2728,8 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
 		 * enabled if we have arm_smmu_domain, those always have page
 		 * tables.
 		 */
-		state->ats_enabled = arm_smmu_ats_supported(master);
+		state->ats_enabled = !state->disable_ats &&
+				     arm_smmu_ats_supported(master);
 	}
 
 	if (smmu_domain) {
@@ -3122,6 +3135,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags,
 			goto err_free;
 		}
 		smmu_domain->stage = ARM_SMMU_DOMAIN_S2;
+		smmu_domain->nest_parent = true;
 	}
 
 	smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
@@ -3518,6 +3532,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.page_response		= arm_smmu_page_response,
 	.def_domain_type	= arm_smmu_def_domain_type,
 	.viommu_alloc		= arm_vsmmu_alloc,
+	.user_pasid_table	= 1,
 	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
 	.owner			= THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 3b8013afcec0..3fabe187ea78 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -244,6 +244,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
 #define STRTAB_STE_0_CFG_BYPASS		4
 #define STRTAB_STE_0_CFG_S1_TRANS	5
 #define STRTAB_STE_0_CFG_S2_TRANS	6
+#define STRTAB_STE_0_CFG_NESTED		7
 
 #define STRTAB_STE_0_S1FMT		GENMASK_ULL(5, 4)
 #define STRTAB_STE_0_S1FMT_LINEAR	0
@@ -295,6 +296,15 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
 
 #define STRTAB_STE_3_S2TTB_MASK		GENMASK_ULL(51, 4)
 
+/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */
+#define STRTAB_STE_0_NESTING_ALLOWED                                         \
+	cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \
+		    STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX)
+#define STRTAB_STE_1_NESTING_ALLOWED                            \
+	cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |   \
+		    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |   \
+		    STRTAB_STE_1_S1STALLD)
+
 /*
  * Context descriptors.
  *
@@ -514,6 +524,7 @@ struct arm_smmu_cmdq_ent {
 			};
 		} cfgi;
 
+		#define CMDQ_OP_TLBI_NH_ALL     0x10
 		#define CMDQ_OP_TLBI_NH_ASID	0x11
 		#define CMDQ_OP_TLBI_NH_VA	0x12
 		#define CMDQ_OP_TLBI_EL2_ALL	0x20
@@ -815,10 +826,18 @@ struct arm_smmu_domain {
 	struct list_head		devices;
 	spinlock_t			devices_lock;
 	bool				enforce_cache_coherency : 1;
+	bool				nest_parent : 1;
 
 	struct mmu_notifier		mmu_notifier;
 };
 
+struct arm_smmu_nested_domain {
+	struct iommu_domain domain;
+	struct arm_vsmmu *vsmmu;
+
+	__le64 ste[2];
+};
+
 /* The following are exposed for testing purposes. */
 struct arm_smmu_entry_writer_ops;
 struct arm_smmu_entry_writer {
@@ -863,6 +882,12 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
 	return container_of(dom, struct arm_smmu_domain, domain);
 }
 
+static inline struct arm_smmu_nested_domain *
+to_smmu_nested_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct arm_smmu_nested_domain, domain);
+}
+
 extern struct xarray arm_smmu_asid_xa;
 extern struct mutex arm_smmu_asid_lock;
 
@@ -909,6 +934,7 @@ struct arm_smmu_attach_state {
 	struct iommu_domain *old_domain;
 	struct arm_smmu_master *master;
 	bool cd_needs_ats;
+	bool disable_ats;
 	ioasid_t ssid;
 	/* Resulting state */
 	bool ats_enabled;
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 7cb13a29969d..b6baaa1e55b1 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -421,6 +421,26 @@ struct iommu_hwpt_vtd_s1 {
 	__u32 __reserved;
 };
 
+/**
+ * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE
+ *                                (IOMMU_HWPT_DATA_ARM_SMMUV3)
+ *
+ * @ste: The first two double words of the user space Stream Table Entry for
+ *       the translation. Must be little-endian.
+ *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
+ *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
+ *       - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
+ *
+ * -EIO will be returned if @ste is not legal or contains any non-allowed field.
+ * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass
+ * nested domain will translate the same as the nesting parent. The S1 will
+ * install a Context Descriptor Table pointing at userspace memory translated
+ * by the nesting parent.
+ */
+struct iommu_hwpt_arm_smmuv3 {
+	__aligned_le64 ste[2];
+};
+
 /**
  * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
  * @IOMMU_HWPT_DATA_NONE: no data
-- 
cgit v1.2.3


From f27298a82ba09a1c8aecee8a209b2a312beac672 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:55 -0300
Subject: iommu/arm-smmu-v3: Allow ATS for IOMMU_DOMAIN_NESTED

The EATS flag needs to flow through the vSTE and into the pSTE, and ensure
physical ATS is enabled on the PCI device.

The physical ATS state must match the VM's idea of EATS as we rely on the
VM to issue the ATS invalidation commands. Thus ATS must remain off at the
device until EATS on a nesting domain turns it on. Attaching a nesting
domain is the point where the invalidation responsibility transfers to
userspace.

Update the ATS logic to track EATS for nesting domains and flush the
ATC whenever the S2 nesting parent changes.

Link: https://patch.msgid.link/r/11-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c    | 31 +++++++++++++++++++---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c        | 26 +++++++++++++++---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h        |  4 ++-
 include/uapi/linux/iommufd.h                       |  2 +-
 4 files changed, 53 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index a1c8fcd4797c..84c8a21c00ae 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -95,8 +95,6 @@ static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
 		.master = master,
 		.old_domain = iommu_get_domain_for_dev(dev),
 		.ssid = IOMMU_NO_PASID,
-		/* Currently invalidation of ATC is not supported */
-		.disable_ats = true,
 	};
 	struct arm_smmu_ste ste;
 	int ret;
@@ -107,6 +105,15 @@ static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
 		return -EBUSY;
 
 	mutex_lock(&arm_smmu_asid_lock);
+	/*
+	 * The VM has to control the actual ATS state at the PCI device because
+	 * we forward the invalidations directly from the VM. If the VM doesn't
+	 * think ATS is on it will not generate ATC flushes and the ATC will
+	 * become incoherent. Since we can't access the actual virtual PCI ATS
+	 * config bit here base this off the EATS value in the STE. If the EATS
+	 * is set then the VM must generate ATC flushes.
+	 */
+	state.disable_ats = !nested_domain->enable_ats;
 	ret = arm_smmu_attach_prepare(&state, domain);
 	if (ret) {
 		mutex_unlock(&arm_smmu_asid_lock);
@@ -131,8 +138,10 @@ static const struct iommu_domain_ops arm_smmu_nested_ops = {
 	.free = arm_smmu_domain_nested_free,
 };
 
-static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg)
+static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg,
+				  bool *enable_ats)
 {
+	unsigned int eats;
 	unsigned int cfg;
 
 	if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) {
@@ -149,6 +158,18 @@ static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg)
 	if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS &&
 	    cfg != STRTAB_STE_0_CFG_S1_TRANS)
 		return -EIO;
+
+	/*
+	 * Only Full ATS or ATS UR is supported
+	 * The EATS field will be set by arm_smmu_make_nested_domain_ste()
+	 */
+	eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg->ste[1]));
+	arg->ste[1] &= ~cpu_to_le64(STRTAB_STE_1_EATS);
+	if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS)
+		return -EIO;
+
+	if (cfg == STRTAB_STE_0_CFG_S1_TRANS)
+		*enable_ats = (eats == STRTAB_STE_1_EATS_TRANS);
 	return 0;
 }
 
@@ -160,6 +181,7 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 	const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID;
 	struct arm_smmu_nested_domain *nested_domain;
 	struct iommu_hwpt_arm_smmuv3 arg;
+	bool enable_ats = false;
 	int ret;
 
 	/*
@@ -175,7 +197,7 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 	if (ret)
 		return ERR_PTR(ret);
 
-	ret = arm_smmu_validate_vste(&arg);
+	ret = arm_smmu_validate_vste(&arg, &enable_ats);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -185,6 +207,7 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 
 	nested_domain->domain.type = IOMMU_DOMAIN_NESTED;
 	nested_domain->domain.ops = &arm_smmu_nested_ops;
+	nested_domain->enable_ats = enable_ats;
 	nested_domain->vsmmu = vsmmu;
 	nested_domain->ste[0] = arg.ste[0];
 	nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index de598d66b5c2..b47f80224781 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2107,7 +2107,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
 		if (!master->ats_enabled)
 			continue;
 
-		arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd);
+		if (master_domain->nested_ats_flush) {
+			/*
+			 * If a S2 used as a nesting parent is changed we have
+			 * no option but to completely flush the ATC.
+			 */
+			arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
+		} else {
+			arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size,
+						&cmd);
+		}
 
 		for (i = 0; i < master->num_streams; i++) {
 			cmd.atc.sid = master->streams[i].id;
@@ -2631,7 +2640,7 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
 static struct arm_smmu_master_domain *
 arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
 			    struct arm_smmu_master *master,
-			    ioasid_t ssid)
+			    ioasid_t ssid, bool nested_ats_flush)
 {
 	struct arm_smmu_master_domain *master_domain;
 
@@ -2640,7 +2649,8 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain,
 	list_for_each_entry(master_domain, &smmu_domain->devices,
 			    devices_elm) {
 		if (master_domain->master == master &&
-		    master_domain->ssid == ssid)
+		    master_domain->ssid == ssid &&
+		    master_domain->nested_ats_flush == nested_ats_flush)
 			return master_domain;
 	}
 	return NULL;
@@ -2671,13 +2681,18 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain);
 	struct arm_smmu_master_domain *master_domain;
+	bool nested_ats_flush = false;
 	unsigned long flags;
 
 	if (!smmu_domain)
 		return;
 
+	if (domain->type == IOMMU_DOMAIN_NESTED)
+		nested_ats_flush = to_smmu_nested_domain(domain)->enable_ats;
+
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
-	master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid);
+	master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid,
+						    nested_ats_flush);
 	if (master_domain) {
 		list_del(&master_domain->devices_elm);
 		kfree(master_domain);
@@ -2744,6 +2759,9 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
 			return -ENOMEM;
 		master_domain->master = master;
 		master_domain->ssid = state->ssid;
+		if (new_domain->type == IOMMU_DOMAIN_NESTED)
+			master_domain->nested_ats_flush =
+				to_smmu_nested_domain(new_domain)->enable_ats;
 
 		/*
 		 * During prepare we want the current smmu_domain and new
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 5a025d310dbe..01c1d16dc0c8 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -305,7 +305,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
 #define STRTAB_STE_1_NESTING_ALLOWED                            \
 	cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |   \
 		    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |   \
-		    STRTAB_STE_1_S1STALLD)
+		    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS)
 
 /*
  * Context descriptors.
@@ -837,6 +837,7 @@ struct arm_smmu_domain {
 struct arm_smmu_nested_domain {
 	struct iommu_domain domain;
 	struct arm_vsmmu *vsmmu;
+	bool enable_ats : 1;
 
 	__le64 ste[2];
 };
@@ -878,6 +879,7 @@ struct arm_smmu_master_domain {
 	struct list_head devices_elm;
 	struct arm_smmu_master *master;
 	ioasid_t ssid;
+	bool nested_ats_flush : 1;
 };
 
 static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index b6baaa1e55b1..a66eb0384cd6 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -429,7 +429,7 @@ struct iommu_hwpt_vtd_s1 {
  *       the translation. Must be little-endian.
  *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
  *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
- *       - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
+ *       - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
  *
  * -EIO will be returned if @ste is not legal or contains any non-allowed field.
  * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass
-- 
cgit v1.2.3


From d68beb276ba26cec47350a6d468e967673ee0c56 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 30 Oct 2024 21:20:56 -0300
Subject: iommu/arm-smmu-v3: Support IOMMU_HWPT_INVALIDATE using a VIOMMU
 object

Implement the vIOMMU's cache_invalidate op for user space to invalidate
the IOTLB entries, Device ATS and CD entries that are cached by hardware.

Add struct iommu_viommu_arm_smmuv3_invalidate defining invalidation
entries that are simply in the native format of a 128-bit TLBI
command. Scan those commands against the permitted command list and fix
their VMID/SID fields to match what is stored in the vIOMMU.

Link: https://patch.msgid.link/r/12-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com
Co-developed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Eric Auger <eric.auger@redhat.com>
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c    | 134 +++++++++++++++++++++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c        |   6 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h        |   5 +
 include/uapi/linux/iommufd.h                       |  24 ++++
 4 files changed, 166 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 84c8a21c00ae..c96cab6521a4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -215,8 +215,134 @@ arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 	return &nested_domain->domain;
 }
 
+static int arm_vsmmu_vsid_to_sid(struct arm_vsmmu *vsmmu, u32 vsid, u32 *sid)
+{
+	struct arm_smmu_master *master;
+	struct device *dev;
+	int ret = 0;
+
+	xa_lock(&vsmmu->core.vdevs);
+	dev = iommufd_viommu_find_dev(&vsmmu->core, (unsigned long)vsid);
+	if (!dev) {
+		ret = -EIO;
+		goto unlock;
+	}
+	master = dev_iommu_priv_get(dev);
+
+	/* At this moment, iommufd only supports PCI device that has one SID */
+	if (sid)
+		*sid = master->streams[0].id;
+unlock:
+	xa_unlock(&vsmmu->core.vdevs);
+	return ret;
+}
+
+/* This is basically iommu_viommu_arm_smmuv3_invalidate in u64 for conversion */
+struct arm_vsmmu_invalidation_cmd {
+	union {
+		u64 cmd[2];
+		struct iommu_viommu_arm_smmuv3_invalidate ucmd;
+	};
+};
+
+/*
+ * Convert, in place, the raw invalidation command into an internal format that
+ * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are
+ * stored in CPU endian.
+ *
+ * Enforce the VMID or SID on the command.
+ */
+static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu,
+				      struct arm_vsmmu_invalidation_cmd *cmd)
+{
+	/* Commands are le64 stored in u64 */
+	cmd->cmd[0] = le64_to_cpu(cmd->ucmd.cmd[0]);
+	cmd->cmd[1] = le64_to_cpu(cmd->ucmd.cmd[1]);
+
+	switch (cmd->cmd[0] & CMDQ_0_OP) {
+	case CMDQ_OP_TLBI_NSNH_ALL:
+		/* Convert to NH_ALL */
+		cmd->cmd[0] = CMDQ_OP_TLBI_NH_ALL |
+			      FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid);
+		cmd->cmd[1] = 0;
+		break;
+	case CMDQ_OP_TLBI_NH_VA:
+	case CMDQ_OP_TLBI_NH_VAA:
+	case CMDQ_OP_TLBI_NH_ALL:
+	case CMDQ_OP_TLBI_NH_ASID:
+		cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID;
+		cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid);
+		break;
+	case CMDQ_OP_ATC_INV:
+	case CMDQ_OP_CFGI_CD:
+	case CMDQ_OP_CFGI_CD_ALL: {
+		u32 sid, vsid = FIELD_GET(CMDQ_CFGI_0_SID, cmd->cmd[0]);
+
+		if (arm_vsmmu_vsid_to_sid(vsmmu, vsid, &sid))
+			return -EIO;
+		cmd->cmd[0] &= ~CMDQ_CFGI_0_SID;
+		cmd->cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, sid);
+		break;
+	}
+	default:
+		return -EIO;
+	}
+	return 0;
+}
+
+static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+				      struct iommu_user_data_array *array)
+{
+	struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
+	struct arm_smmu_device *smmu = vsmmu->smmu;
+	struct arm_vsmmu_invalidation_cmd *last;
+	struct arm_vsmmu_invalidation_cmd *cmds;
+	struct arm_vsmmu_invalidation_cmd *cur;
+	struct arm_vsmmu_invalidation_cmd *end;
+	int ret;
+
+	cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL);
+	if (!cmds)
+		return -ENOMEM;
+	cur = cmds;
+	end = cmds + array->entry_num;
+
+	static_assert(sizeof(*cmds) == 2 * sizeof(u64));
+	ret = iommu_copy_struct_from_full_user_array(
+		cmds, sizeof(*cmds), array,
+		IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3);
+	if (ret)
+		goto out;
+
+	last = cmds;
+	while (cur != end) {
+		ret = arm_vsmmu_convert_user_cmd(vsmmu, cur);
+		if (ret)
+			goto out;
+
+		/* FIXME work in blocks of CMDQ_BATCH_ENTRIES and copy each block? */
+		cur++;
+		if (cur != end && (cur - last) != CMDQ_BATCH_ENTRIES - 1)
+			continue;
+
+		/* FIXME always uses the main cmdq rather than trying to group by type */
+		ret = arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, last->cmd,
+						  cur - last, true);
+		if (ret) {
+			cur--;
+			goto out;
+		}
+		last = cur;
+	}
+out:
+	array->entry_num = cur - cmds;
+	kfree(cmds);
+	return ret;
+}
+
 static const struct iommufd_viommu_ops arm_vsmmu_ops = {
 	.alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
+	.cache_invalidate = arm_vsmmu_cache_invalidate,
 };
 
 struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
@@ -239,6 +365,14 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev,
 	if (s2_parent->smmu != master->smmu)
 		return ERR_PTR(-EINVAL);
 
+	/*
+	 * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW
+	 * defect is needed to determine if arm_vsmmu_cache_invalidate() needs
+	 * any change to remove this.
+	 */
+	if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC))
+		return ERR_PTR(-EOPNOTSUPP);
+
 	/*
 	 * Must support some way to prevent the VM from bypassing the cache
 	 * because VFIO currently does not do any cache maintenance. canwbs
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b47f80224781..2a9f2d1d3ed9 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -766,9 +766,9 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
  *   insert their own list of commands then all of the commands from one
  *   CPU will appear before any of the commands from the other CPU.
  */
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
-				       struct arm_smmu_cmdq *cmdq,
-				       u64 *cmds, int n, bool sync)
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+				struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
+				bool sync)
 {
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 01c1d16dc0c8..af25f092303f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -529,6 +529,7 @@ struct arm_smmu_cmdq_ent {
 		#define CMDQ_OP_TLBI_NH_ALL     0x10
 		#define CMDQ_OP_TLBI_NH_ASID	0x11
 		#define CMDQ_OP_TLBI_NH_VA	0x12
+		#define CMDQ_OP_TLBI_NH_VAA	0x13
 		#define CMDQ_OP_TLBI_EL2_ALL	0x20
 		#define CMDQ_OP_TLBI_EL2_ASID	0x21
 		#define CMDQ_OP_TLBI_EL2_VA	0x22
@@ -951,6 +952,10 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state);
 void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master,
 				  const struct arm_smmu_ste *target);
 
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
+				struct arm_smmu_cmdq *cmdq, u64 *cmds, int n,
+				bool sync);
+
 #ifdef CONFIG_ARM_SMMU_V3_SVA
 bool arm_smmu_sva_supported(struct arm_smmu_device *smmu);
 bool arm_smmu_master_sva_supported(struct arm_smmu_master *master);
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index a66eb0384cd6..747d3d9baa3d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -713,9 +713,11 @@ struct iommu_hwpt_get_dirty_bitmap {
  * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation
  *                                        Data Type
  * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1
+ * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3
  */
 enum iommu_hwpt_invalidate_data_type {
 	IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0,
+	IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1,
 };
 
 /**
@@ -754,6 +756,28 @@ struct iommu_hwpt_vtd_s1_invalidate {
 	__u32 __reserved;
 };
 
+/**
+ * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cache invalidation
+ *         (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3)
+ * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ.
+ *       Must be little-endian.
+ *
+ * Supported command list only when passing in a vIOMMU via @hwpt_id:
+ *     CMDQ_OP_TLBI_NSNH_ALL
+ *     CMDQ_OP_TLBI_NH_VA
+ *     CMDQ_OP_TLBI_NH_VAA
+ *     CMDQ_OP_TLBI_NH_ALL
+ *     CMDQ_OP_TLBI_NH_ASID
+ *     CMDQ_OP_ATC_INV
+ *     CMDQ_OP_CFGI_CD
+ *     CMDQ_OP_CFGI_CD_ALL
+ *
+ * -EIO will be returned if the command is not supported.
+ */
+struct iommu_viommu_arm_smmuv3_invalidate {
+	__aligned_le64 cmd[2];
+};
+
 /**
  * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
  * @size: sizeof(struct iommu_hwpt_invalidate)
-- 
cgit v1.2.3


From c532de5a67a70f8533d495f8f2aaa9a0491c3ad0 Mon Sep 17 00:00:00 2001
From: Xianglai Li <lixianglai@loongson.cn>
Date: Wed, 13 Nov 2024 16:18:27 +0800
Subject: LoongArch: KVM: Add IPI device support

Add device model for IPI interrupt controller, implement basic create &
destroy interfaces, and register device model to kvm device table.

Signed-off-by: Tianrui Zhao <zhaotianrui@loongson.cn>
Signed-off-by: Xianglai Li <lixianglai@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/include/asm/kvm_host.h |   4 ++
 arch/loongarch/include/asm/kvm_ipi.h  |  33 ++++++++++
 arch/loongarch/kvm/Makefile           |   1 +
 arch/loongarch/kvm/intc/ipi.c         | 112 ++++++++++++++++++++++++++++++++++
 arch/loongarch/kvm/main.c             |   7 ++-
 arch/loongarch/kvm/vcpu.c             |   3 +
 include/uapi/linux/kvm.h              |   4 ++
 7 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 arch/loongarch/include/asm/kvm_ipi.h
 create mode 100644 arch/loongarch/kvm/intc/ipi.c

(limited to 'include/uapi/linux')

diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index d6bb72424027..8e5393d21fcb 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -18,6 +18,7 @@
 
 #include <asm/inst.h>
 #include <asm/kvm_mmu.h>
+#include <asm/kvm_ipi.h>
 #include <asm/loongarch.h>
 
 /* Loongarch KVM register ids */
@@ -117,6 +118,7 @@ struct kvm_arch {
 
 	s64 time_offset;
 	struct kvm_context __percpu *vmcs;
+	struct loongarch_ipi *ipi;
 };
 
 #define CSR_MAX_NUMS		0x800
@@ -221,6 +223,8 @@ struct kvm_vcpu_arch {
 	int last_sched_cpu;
 	/* mp state */
 	struct kvm_mp_state mp_state;
+	/* ipi state */
+	struct ipi_state ipi_state;
 	/* cpucfg */
 	u32 cpucfg[KVM_MAX_CPUCFG_REGS];
 
diff --git a/arch/loongarch/include/asm/kvm_ipi.h b/arch/loongarch/include/asm/kvm_ipi.h
new file mode 100644
index 000000000000..baaa6253e4dc
--- /dev/null
+++ b/arch/loongarch/include/asm/kvm_ipi.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ */
+
+#ifndef __ASM_KVM_IPI_H
+#define __ASM_KVM_IPI_H
+
+#include <kvm/iodev.h>
+
+#define LARCH_INT_IPI			12
+
+struct loongarch_ipi {
+	spinlock_t lock;
+	struct kvm *kvm;
+	struct kvm_io_device device;
+};
+
+struct ipi_state {
+	spinlock_t lock;
+	uint32_t status;
+	uint32_t en;
+	uint32_t set;
+	uint32_t clear;
+	uint64_t buf[4];
+};
+
+#define IOCSR_IPI_BASE		0x1000
+#define IOCSR_IPI_SIZE		0x160
+
+int kvm_loongarch_register_ipi_device(void);
+
+#endif
diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile
index b2f4cbe01ae8..36c3009fe89c 100644
--- a/arch/loongarch/kvm/Makefile
+++ b/arch/loongarch/kvm/Makefile
@@ -18,5 +18,6 @@ kvm-y += timer.o
 kvm-y += tlb.o
 kvm-y += vcpu.o
 kvm-y += vm.o
+kvm-y += intc/ipi.o
 
 CFLAGS_exit.o	+= $(call cc-option,-Wno-override-init,)
diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c
new file mode 100644
index 000000000000..9e45571ad76e
--- /dev/null
+++ b/arch/loongarch/kvm/intc/ipi.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_ipi.h>
+#include <asm/kvm_vcpu.h>
+
+static int kvm_ipi_read(struct kvm_vcpu *vcpu,
+			struct kvm_io_device *dev,
+			gpa_t addr, int len, void *val)
+{
+	return 0;
+}
+
+static int kvm_ipi_write(struct kvm_vcpu *vcpu,
+			struct kvm_io_device *dev,
+			gpa_t addr, int len, const void *val)
+{
+	return 0;
+}
+
+static const struct kvm_io_device_ops kvm_ipi_ops = {
+	.read	= kvm_ipi_read,
+	.write	= kvm_ipi_write,
+};
+
+static int kvm_ipi_get_attr(struct kvm_device *dev,
+			struct kvm_device_attr *attr)
+{
+	return 0;
+}
+
+static int kvm_ipi_set_attr(struct kvm_device *dev,
+			struct kvm_device_attr *attr)
+{
+	return 0;
+}
+
+static int kvm_ipi_create(struct kvm_device *dev, u32 type)
+{
+	int ret;
+	struct kvm *kvm;
+	struct kvm_io_device *device;
+	struct loongarch_ipi *s;
+
+	if (!dev) {
+		kvm_err("%s: kvm_device ptr is invalid!\n", __func__);
+		return -EINVAL;
+	}
+
+	kvm = dev->kvm;
+	if (kvm->arch.ipi) {
+		kvm_err("%s: LoongArch IPI has already been created!\n", __func__);
+		return -EINVAL;
+	}
+
+	s = kzalloc(sizeof(struct loongarch_ipi), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	spin_lock_init(&s->lock);
+	s->kvm = kvm;
+
+	/*
+	 * Initialize IOCSR device
+	 */
+	device = &s->device;
+	kvm_iodevice_init(device, &kvm_ipi_ops);
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_IOCSR_BUS, IOCSR_IPI_BASE, IOCSR_IPI_SIZE, device);
+	mutex_unlock(&kvm->slots_lock);
+	if (ret < 0) {
+		kvm_err("%s: Initialize IOCSR dev failed, ret = %d\n", __func__, ret);
+		goto err;
+	}
+
+	kvm->arch.ipi = s;
+	return 0;
+
+err:
+	kfree(s);
+	return -EFAULT;
+}
+
+static void kvm_ipi_destroy(struct kvm_device *dev)
+{
+	struct kvm *kvm;
+	struct loongarch_ipi *ipi;
+
+	if (!dev || !dev->kvm || !dev->kvm->arch.ipi)
+		return;
+
+	kvm = dev->kvm;
+	ipi = kvm->arch.ipi;
+	kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &ipi->device);
+	kfree(ipi);
+}
+
+static struct kvm_device_ops kvm_ipi_dev_ops = {
+	.name = "kvm-loongarch-ipi",
+	.create = kvm_ipi_create,
+	.destroy = kvm_ipi_destroy,
+	.set_attr = kvm_ipi_set_attr,
+	.get_attr = kvm_ipi_get_attr,
+};
+
+int kvm_loongarch_register_ipi_device(void)
+{
+	return kvm_register_device_ops(&kvm_ipi_dev_ops, KVM_DEV_TYPE_LOONGARCH_IPI);
+}
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index 27e9b94c0a0b..14f3f69c5bb9 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -313,7 +313,7 @@ void kvm_arch_disable_virtualization_cpu(void)
 
 static int kvm_loongarch_env_init(void)
 {
-	int cpu, order;
+	int cpu, order, ret;
 	void *addr;
 	struct kvm_context *context;
 
@@ -368,7 +368,10 @@ static int kvm_loongarch_env_init(void)
 
 	kvm_init_gcsr_flag();
 
-	return 0;
+	/* Register LoongArch IPI interrupt controller interface. */
+	ret = kvm_loongarch_register_ipi_device();
+
+	return ret;
 }
 
 static void kvm_loongarch_env_exit(void)
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 174734a23d0a..cab1818be68d 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -1475,6 +1475,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	/* Init */
 	vcpu->arch.last_sched_cpu = -1;
 
+	/* Init ipi_state lock */
+	spin_lock_init(&vcpu->arch.ipi_state.lock);
+
 	/*
 	 * Initialize guest register state to valid architectural reset state.
 	 */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 637efc055145..9fff439c30ea 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1158,7 +1158,11 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_ARM_PV_TIME	KVM_DEV_TYPE_ARM_PV_TIME
 	KVM_DEV_TYPE_RISCV_AIA,
 #define KVM_DEV_TYPE_RISCV_AIA		KVM_DEV_TYPE_RISCV_AIA
+	KVM_DEV_TYPE_LOONGARCH_IPI,
+#define KVM_DEV_TYPE_LOONGARCH_IPI	KVM_DEV_TYPE_LOONGARCH_IPI
+
 	KVM_DEV_TYPE_MAX,
+
 };
 
 struct kvm_vfio_spapr_tce {
-- 
cgit v1.2.3


From 2e8b9df82631e714cc2b7bf302772c8259673180 Mon Sep 17 00:00:00 2001
From: Xianglai Li <lixianglai@loongson.cn>
Date: Wed, 13 Nov 2024 16:18:27 +0800
Subject: LoongArch: KVM: Add EIOINTC device support

Add device model for EIOINTC interrupt controller, implement basic
create & destroy interfaces, and register device model to kvm device
table.

Signed-off-by: Tianrui Zhao <zhaotianrui@loongson.cn>
Signed-off-by: Xianglai Li <lixianglai@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/include/asm/kvm_eiointc.h |  93 ++++++++++++++++++++++
 arch/loongarch/include/asm/kvm_host.h    |   4 +-
 arch/loongarch/kvm/Makefile              |   1 +
 arch/loongarch/kvm/intc/eiointc.c        | 132 +++++++++++++++++++++++++++++++
 arch/loongarch/kvm/main.c                |   6 ++
 include/uapi/linux/kvm.h                 |   2 +
 6 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100644 arch/loongarch/include/asm/kvm_eiointc.h
 create mode 100644 arch/loongarch/kvm/intc/eiointc.c

(limited to 'include/uapi/linux')

diff --git a/arch/loongarch/include/asm/kvm_eiointc.h b/arch/loongarch/include/asm/kvm_eiointc.h
new file mode 100644
index 000000000000..ed65de5a8168
--- /dev/null
+++ b/arch/loongarch/include/asm/kvm_eiointc.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ */
+
+#ifndef __ASM_KVM_EIOINTC_H
+#define __ASM_KVM_EIOINTC_H
+
+#include <kvm/iodev.h>
+
+#define EIOINTC_IRQS			256
+#define EIOINTC_ROUTE_MAX_VCPUS		256
+#define EIOINTC_IRQS_U8_NUMS		(EIOINTC_IRQS / 8)
+#define EIOINTC_IRQS_U16_NUMS		(EIOINTC_IRQS_U8_NUMS / 2)
+#define EIOINTC_IRQS_U32_NUMS		(EIOINTC_IRQS_U8_NUMS / 4)
+#define EIOINTC_IRQS_U64_NUMS		(EIOINTC_IRQS_U8_NUMS / 8)
+/* map to ipnum per 32 irqs */
+#define EIOINTC_IRQS_NODETYPE_COUNT	16
+
+#define EIOINTC_BASE			0x1400
+#define EIOINTC_SIZE			0x900
+
+#define EIOINTC_VIRT_BASE		(0x40000000)
+#define EIOINTC_VIRT_SIZE		(0x1000)
+
+#define LOONGSON_IP_NUM			8
+
+struct loongarch_eiointc {
+	spinlock_t lock;
+	struct kvm *kvm;
+	struct kvm_io_device device;
+	struct kvm_io_device device_vext;
+	uint32_t num_cpu;
+	uint32_t features;
+	uint32_t status;
+
+	/* hardware state */
+	union nodetype {
+		u64 reg_u64[EIOINTC_IRQS_NODETYPE_COUNT / 4];
+		u32 reg_u32[EIOINTC_IRQS_NODETYPE_COUNT / 2];
+		u16 reg_u16[EIOINTC_IRQS_NODETYPE_COUNT];
+		u8 reg_u8[EIOINTC_IRQS_NODETYPE_COUNT * 2];
+	} nodetype;
+
+	/* one bit shows the state of one irq */
+	union bounce {
+		u64 reg_u64[EIOINTC_IRQS_U64_NUMS];
+		u32 reg_u32[EIOINTC_IRQS_U32_NUMS];
+		u16 reg_u16[EIOINTC_IRQS_U16_NUMS];
+		u8 reg_u8[EIOINTC_IRQS_U8_NUMS];
+	} bounce;
+
+	union isr {
+		u64 reg_u64[EIOINTC_IRQS_U64_NUMS];
+		u32 reg_u32[EIOINTC_IRQS_U32_NUMS];
+		u16 reg_u16[EIOINTC_IRQS_U16_NUMS];
+		u8 reg_u8[EIOINTC_IRQS_U8_NUMS];
+	} isr;
+	union coreisr {
+		u64 reg_u64[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U64_NUMS];
+		u32 reg_u32[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U32_NUMS];
+		u16 reg_u16[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U16_NUMS];
+		u8 reg_u8[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U8_NUMS];
+	} coreisr;
+	union enable {
+		u64 reg_u64[EIOINTC_IRQS_U64_NUMS];
+		u32 reg_u32[EIOINTC_IRQS_U32_NUMS];
+		u16 reg_u16[EIOINTC_IRQS_U16_NUMS];
+		u8 reg_u8[EIOINTC_IRQS_U8_NUMS];
+	} enable;
+
+	/* use one byte to config ipmap for 32 irqs at once */
+	union ipmap {
+		u64 reg_u64;
+		u32 reg_u32[EIOINTC_IRQS_U32_NUMS / 4];
+		u16 reg_u16[EIOINTC_IRQS_U16_NUMS / 4];
+		u8 reg_u8[EIOINTC_IRQS_U8_NUMS / 4];
+	} ipmap;
+	/* use one byte to config coremap for one irq */
+	union coremap {
+		u64 reg_u64[EIOINTC_IRQS / 8];
+		u32 reg_u32[EIOINTC_IRQS / 4];
+		u16 reg_u16[EIOINTC_IRQS / 2];
+		u8 reg_u8[EIOINTC_IRQS];
+	} coremap;
+
+	DECLARE_BITMAP(sw_coreisr[EIOINTC_ROUTE_MAX_VCPUS][LOONGSON_IP_NUM], EIOINTC_IRQS);
+	uint8_t  sw_coremap[EIOINTC_IRQS];
+};
+
+int kvm_loongarch_register_eiointc_device(void);
+
+#endif /* __ASM_KVM_EIOINTC_H */
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index a1de884ebb44..2d0476f05148 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -19,6 +19,7 @@
 #include <asm/inst.h>
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_ipi.h>
+#include <asm/kvm_eiointc.h>
 #include <asm/loongarch.h>
 
 /* Loongarch KVM register ids */
@@ -87,7 +88,7 @@ struct kvm_world_switch {
  *
  *  For LOONGARCH_CSR_CPUID register, max CPUID size if 512
  *  For IPI hardware, max destination CPUID size 1024
- *  For extioi interrupt controller, max destination CPUID size is 256
+ *  For eiointc interrupt controller, max destination CPUID size is 256
  *  For msgint interrupt controller, max supported CPUID size is 65536
  *
  * Currently max CPUID is defined as 256 for KVM hypervisor, in future
@@ -121,6 +122,7 @@ struct kvm_arch {
 	s64 time_offset;
 	struct kvm_context __percpu *vmcs;
 	struct loongarch_ipi *ipi;
+	struct loongarch_eiointc *eiointc;
 };
 
 #define CSR_MAX_NUMS		0x800
diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile
index 36c3009fe89c..bb50fc799c29 100644
--- a/arch/loongarch/kvm/Makefile
+++ b/arch/loongarch/kvm/Makefile
@@ -19,5 +19,6 @@ kvm-y += tlb.o
 kvm-y += vcpu.o
 kvm-y += vm.o
 kvm-y += intc/ipi.o
+kvm-y += intc/eiointc.o
 
 CFLAGS_exit.o	+= $(call cc-option,-Wno-override-init,)
diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c
new file mode 100644
index 000000000000..10afa6163643
--- /dev/null
+++ b/arch/loongarch/kvm/intc/eiointc.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ */
+
+#include <asm/kvm_eiointc.h>
+#include <asm/kvm_vcpu.h>
+#include <linux/count_zeros.h>
+
+static int kvm_eiointc_read(struct kvm_vcpu *vcpu,
+			struct kvm_io_device *dev,
+			gpa_t addr, int len, void *val)
+{
+	return 0;
+}
+
+static int kvm_eiointc_write(struct kvm_vcpu *vcpu,
+			struct kvm_io_device *dev,
+			gpa_t addr, int len, const void *val)
+{
+	return 0;
+}
+
+static const struct kvm_io_device_ops kvm_eiointc_ops = {
+	.read	= kvm_eiointc_read,
+	.write	= kvm_eiointc_write,
+};
+
+static int kvm_eiointc_virt_read(struct kvm_vcpu *vcpu,
+				struct kvm_io_device *dev,
+				gpa_t addr, int len, void *val)
+{
+	return 0;
+}
+
+static int kvm_eiointc_virt_write(struct kvm_vcpu *vcpu,
+				struct kvm_io_device *dev,
+				gpa_t addr, int len, const void *val)
+{
+	return 0;
+}
+
+static const struct kvm_io_device_ops kvm_eiointc_virt_ops = {
+	.read	= kvm_eiointc_virt_read,
+	.write	= kvm_eiointc_virt_write,
+};
+
+static int kvm_eiointc_get_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	return 0;
+}
+
+static int kvm_eiointc_set_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	return 0;
+}
+
+static int kvm_eiointc_create(struct kvm_device *dev, u32 type)
+{
+	int ret;
+	struct loongarch_eiointc *s;
+	struct kvm_io_device *device, *device1;
+	struct kvm *kvm = dev->kvm;
+
+	/* eiointc has been created */
+	if (kvm->arch.eiointc)
+		return -EINVAL;
+
+	s = kzalloc(sizeof(struct loongarch_eiointc), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	spin_lock_init(&s->lock);
+	s->kvm = kvm;
+
+	/*
+	 * Initialize IOCSR device
+	 */
+	device = &s->device;
+	kvm_iodevice_init(device, &kvm_eiointc_ops);
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_IOCSR_BUS,
+			EIOINTC_BASE, EIOINTC_SIZE, device);
+	mutex_unlock(&kvm->slots_lock);
+	if (ret < 0) {
+		kfree(s);
+		return ret;
+	}
+
+	device1 = &s->device_vext;
+	kvm_iodevice_init(device1, &kvm_eiointc_virt_ops);
+	ret = kvm_io_bus_register_dev(kvm, KVM_IOCSR_BUS,
+			EIOINTC_VIRT_BASE, EIOINTC_VIRT_SIZE, device1);
+	if (ret < 0) {
+		kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &s->device);
+		kfree(s);
+		return ret;
+	}
+	kvm->arch.eiointc = s;
+
+	return 0;
+}
+
+static void kvm_eiointc_destroy(struct kvm_device *dev)
+{
+	struct kvm *kvm;
+	struct loongarch_eiointc *eiointc;
+
+	if (!dev || !dev->kvm || !dev->kvm->arch.eiointc)
+		return;
+
+	kvm = dev->kvm;
+	eiointc = kvm->arch.eiointc;
+	kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &eiointc->device);
+	kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &eiointc->device_vext);
+	kfree(eiointc);
+}
+
+static struct kvm_device_ops kvm_eiointc_dev_ops = {
+	.name = "kvm-loongarch-eiointc",
+	.create = kvm_eiointc_create,
+	.destroy = kvm_eiointc_destroy,
+	.set_attr = kvm_eiointc_set_attr,
+	.get_attr = kvm_eiointc_get_attr,
+};
+
+int kvm_loongarch_register_eiointc_device(void)
+{
+	return kvm_register_device_ops(&kvm_eiointc_dev_ops, KVM_DEV_TYPE_LOONGARCH_EIOINTC);
+}
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index 14f3f69c5bb9..8de366ade99c 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -9,6 +9,7 @@
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
 #include <asm/kvm_csr.h>
+#include <asm/kvm_eiointc.h>
 #include "trace.h"
 
 unsigned long vpid_mask;
@@ -370,6 +371,11 @@ static int kvm_loongarch_env_init(void)
 
 	/* Register LoongArch IPI interrupt controller interface. */
 	ret = kvm_loongarch_register_ipi_device();
+	if (ret)
+		return ret;
+
+	/* Register LoongArch EIOINTC interrupt controller interface. */
+	ret = kvm_loongarch_register_eiointc_device();
 
 	return ret;
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 9fff439c30ea..0ec5c631d9e9 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1160,6 +1160,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_RISCV_AIA		KVM_DEV_TYPE_RISCV_AIA
 	KVM_DEV_TYPE_LOONGARCH_IPI,
 #define KVM_DEV_TYPE_LOONGARCH_IPI	KVM_DEV_TYPE_LOONGARCH_IPI
+	KVM_DEV_TYPE_LOONGARCH_EIOINTC,
+#define KVM_DEV_TYPE_LOONGARCH_EIOINTC	KVM_DEV_TYPE_LOONGARCH_EIOINTC
 
 	KVM_DEV_TYPE_MAX,
 
-- 
cgit v1.2.3


From e785dfacf7e7fe94370fa0e8e3ff1bc8fe179831 Mon Sep 17 00:00:00 2001
From: Xianglai Li <lixianglai@loongson.cn>
Date: Wed, 13 Nov 2024 16:18:27 +0800
Subject: LoongArch: KVM: Add PCHPIC device support

Add device model for PCHPIC interrupt controller, implement basic
create & destroy interface, and register device model to kvm device
table.

Signed-off-by: Tianrui Zhao <zhaotianrui@loongson.cn>
Signed-off-by: Xianglai Li <lixianglai@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/include/asm/kvm_host.h    |  2 +
 arch/loongarch/include/asm/kvm_pch_pic.h | 31 +++++++++++
 arch/loongarch/kvm/Makefile              |  1 +
 arch/loongarch/kvm/intc/pch_pic.c        | 88 ++++++++++++++++++++++++++++++++
 arch/loongarch/kvm/main.c                |  6 +++
 include/uapi/linux/kvm.h                 |  2 +
 6 files changed, 130 insertions(+)
 create mode 100644 arch/loongarch/include/asm/kvm_pch_pic.h
 create mode 100644 arch/loongarch/kvm/intc/pch_pic.c

(limited to 'include/uapi/linux')

diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index a63c2bf6fae0..a6b82c8ab7bc 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -20,6 +20,7 @@
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_ipi.h>
 #include <asm/kvm_eiointc.h>
+#include <asm/kvm_pch_pic.h>
 #include <asm/loongarch.h>
 
 /* Loongarch KVM register ids */
@@ -125,6 +126,7 @@ struct kvm_arch {
 	struct kvm_context __percpu *vmcs;
 	struct loongarch_ipi *ipi;
 	struct loongarch_eiointc *eiointc;
+	struct loongarch_pch_pic *pch_pic;
 };
 
 #define CSR_MAX_NUMS		0x800
diff --git a/arch/loongarch/include/asm/kvm_pch_pic.h b/arch/loongarch/include/asm/kvm_pch_pic.h
new file mode 100644
index 000000000000..914be4fd35e9
--- /dev/null
+++ b/arch/loongarch/include/asm/kvm_pch_pic.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ */
+
+#ifndef __ASM_KVM_PCH_PIC_H
+#define __ASM_KVM_PCH_PIC_H
+
+#include <kvm/iodev.h>
+
+struct loongarch_pch_pic {
+	spinlock_t lock; /* set up by kvm_pch_pic_create() */
+	struct kvm *kvm; /* owning VM, set by kvm_pch_pic_create() */
+	struct kvm_io_device device; /* unregistered from KVM_MMIO_BUS on destroy */
+	uint64_t mask; /* 1:disable irq, 0:enable irq */
+	uint64_t htmsi_en; /* 1:msi */
+	uint64_t edge; /* 1:edge triggered, 0:level triggered */
+	uint64_t auto_ctrl0; /* only use default value 00b */
+	uint64_t auto_ctrl1; /* only use default value 00b */
+	uint64_t last_intirr; /* edge detection */
+	uint64_t irr; /* interrupt request register */
+	uint64_t isr; /* interrupt service register */
+	uint64_t polarity; /* 0: high level trigger, 1: low level trigger */
+	uint8_t  route_entry[64]; /* default value 0, route to int0: eiointc */
+	uint8_t  htmsi_vector[64]; /* irq route table for routing to eiointc */
+	uint64_t pch_pic_base; /* presumably the guest base address - TODO confirm */
+};
+
+int kvm_loongarch_register_pch_pic_device(void);
+
+#endif /* __ASM_KVM_PCH_PIC_H */
diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile
index bb50fc799c29..97b2adf08206 100644
--- a/arch/loongarch/kvm/Makefile
+++ b/arch/loongarch/kvm/Makefile
@@ -20,5 +20,6 @@ kvm-y += vcpu.o
 kvm-y += vm.o
 kvm-y += intc/ipi.o
 kvm-y += intc/eiointc.o
+kvm-y += intc/pch_pic.o
 
 CFLAGS_exit.o	+= $(call cc-option,-Wno-override-init,)
diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c
new file mode 100644
index 000000000000..564b7dcbbbed
--- /dev/null
+++ b/arch/loongarch/kvm/intc/pch_pic.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ */
+
+#include <asm/kvm_eiointc.h>
+#include <asm/kvm_pch_pic.h>
+#include <asm/kvm_vcpu.h>
+#include <linux/count_zeros.h>
+
+static int kvm_pch_pic_read(struct kvm_vcpu *vcpu,
+			struct kvm_io_device *dev,
+			gpa_t addr, int len, void *val)
+{
+	return 0;	/* stub: returns 0 and leaves *val untouched */
+}
+
+static int kvm_pch_pic_write(struct kvm_vcpu *vcpu,
+			struct kvm_io_device *dev,
+			gpa_t addr, int len, const void *val)
+{
+	return 0;	/* stub: returns 0 and ignores the written value */
+}
+
+static const struct kvm_io_device_ops kvm_pch_pic_ops = { /* not referenced elsewhere in this file yet */
+	.read	= kvm_pch_pic_read,
+	.write	= kvm_pch_pic_write,
+};
+
+static int kvm_pch_pic_get_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	return 0;	/* stub: attr is not examined */
+}
+
+static int kvm_pch_pic_set_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	return 0;	/* stub: attr is not examined */
+}
+
+static int kvm_pch_pic_create(struct kvm_device *dev, u32 type)
+{
+	struct kvm *kvm = dev->kvm;
+	struct loongarch_pch_pic *s;
+
+	/* pch pic should not have been created already */
+	if (kvm->arch.pch_pic)
+		return -EINVAL;
+
+	s = kzalloc(sizeof(struct loongarch_pch_pic), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	spin_lock_init(&s->lock);
+	s->kvm = kvm;
+	kvm->arch.pch_pic = s;	/* publish; a second create now gets -EINVAL */
+
+	return 0;
+}
+
+static void kvm_pch_pic_destroy(struct kvm_device *dev)
+{
+	struct kvm *kvm = dev ? dev->kvm : NULL;
+	struct loongarch_pch_pic *s = kvm ? kvm->arch.pch_pic : NULL;
+
+	if (s) {
+		/* unregister pch pic device (under slots_lock) and free its memory */
+		mutex_lock(&kvm->slots_lock);
+		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &s->device);
+		mutex_unlock(&kvm->slots_lock);
+		kfree(s);
+	}
+	kfree(dev);	/* ->destroy() must free the kvm_device itself */
+}
+
+static struct kvm_device_ops kvm_pch_pic_dev_ops = { /* for KVM_DEV_TYPE_LOONGARCH_PCHPIC */
+	.name = "kvm-loongarch-pch-pic",
+	.create = kvm_pch_pic_create,	/* allocates the per-VM pch pic state */
+	.destroy = kvm_pch_pic_destroy,	/* unregisters the io device, frees state */
+	.set_attr = kvm_pch_pic_set_attr,	/* currently a stub returning 0 */
+	.get_attr = kvm_pch_pic_get_attr,	/* currently a stub returning 0 */
+};
+
+int kvm_loongarch_register_pch_pic_device(void)
+{
+	return kvm_register_device_ops(&kvm_pch_pic_dev_ops, KVM_DEV_TYPE_LOONGARCH_PCHPIC);
+}
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index 8de366ade99c..396fed2665a5 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -10,6 +10,7 @@
 #include <asm/cpufeature.h>
 #include <asm/kvm_csr.h>
 #include <asm/kvm_eiointc.h>
+#include <asm/kvm_pch_pic.h>
 #include "trace.h"
 
 unsigned long vpid_mask;
@@ -376,6 +377,11 @@ static int kvm_loongarch_env_init(void)
 
 	/* Register LoongArch EIOINTC interrupt controller interface. */
 	ret = kvm_loongarch_register_eiointc_device();
+	if (ret)
+		return ret;
+
+	/* Register LoongArch PCH-PIC interrupt controller interface. */
+	ret = kvm_loongarch_register_pch_pic_device();
 
 	return ret;
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 0ec5c631d9e9..502ea63b5d2e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1162,6 +1162,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_LOONGARCH_IPI	KVM_DEV_TYPE_LOONGARCH_IPI
 	KVM_DEV_TYPE_LOONGARCH_EIOINTC,
 #define KVM_DEV_TYPE_LOONGARCH_EIOINTC	KVM_DEV_TYPE_LOONGARCH_EIOINTC
+	KVM_DEV_TYPE_LOONGARCH_PCHPIC,
+#define KVM_DEV_TYPE_LOONGARCH_PCHPIC	KVM_DEV_TYPE_LOONGARCH_PCHPIC
 
 	KVM_DEV_TYPE_MAX,
 
-- 
cgit v1.2.3


From 44010543fc8bedad172aa5b6c43480e5d2124497 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 11 Nov 2024 10:09:57 -0500
Subject: fs: add the ability for statmount() to report the sb_source

/proc/self/mountinfo displays the source for the mount, but statmount()
doesn't yet have a way to return it. Add a new STATMOUNT_SB_SOURCE flag,
claim the 32-bit __spare1 field to hold the offset into the str[] array.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20241111-statmount-v4-3-2eaf35d07a80@kernel.org
Acked-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 36 +++++++++++++++++++++++++++++++++++-
 include/uapi/linux/mount.h |  3 ++-
 2 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index dbd89fffd919..d32b5afa99dc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5012,6 +5012,32 @@ static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
 		seq_puts(seq, sb->s_subtype);
 }
 
+static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
+{
+	struct super_block *sb = s->mnt->mnt_sb;
+	struct mount *r = real_mount(s->mnt);
+
+	if (sb->s_op->show_devname) {
+		size_t start = seq->count;	/* where this string begins */
+		int ret;
+
+		ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
+		if (ret)
+			return ret;
+
+		if (unlikely(seq_has_overflowed(seq)))
+			return -EAGAIN;	/* seq buffer filled up */
+
+		/* NUL-terminate, unescape octal in place, commit the new length */
+		seq->buf[seq->count] = '\0';
+		seq->count = start;
+		seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
+	} else if (r->mnt_devname) {
+		seq_puts(seq, r->mnt_devname);	/* no ->show_devname(): use mnt_devname */
+	}
+	return 0;
+}
+
 static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
 {
 	s->sm.mask |= STATMOUNT_MNT_NS_ID;
@@ -5075,6 +5101,10 @@ static int statmount_string(struct kstatmount *s, u64 flag)
 		sm->fs_subtype = start;
 		statmount_fs_subtype(s, seq);
 		break;
+	case STATMOUNT_SB_SOURCE:
+		sm->sb_source = start;
+		ret = statmount_sb_source(s, seq);
+		break;
 	default:
 		WARN_ON_ONCE(true);
 		return -EINVAL;
@@ -5223,6 +5253,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 	if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
 		err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
 
+	if (!err && s->mask & STATMOUNT_SB_SOURCE)
+		err = statmount_string(s, STATMOUNT_SB_SOURCE);
+
 	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
 		statmount_mnt_ns_id(s, ns);
 
@@ -5244,7 +5277,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
 }
 
 #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
-			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE)
+			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
+			      STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE)
 
 static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
 			      struct statmount __user *buf, size_t bufsize,
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 2e939dddf9cb..2b49e9131d77 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -174,7 +174,7 @@ struct statmount {
 	__u32 mnt_point;	/* [str] Mountpoint relative to current root */
 	__u64 mnt_ns_id;	/* ID of the mount namespace */
 	__u32 fs_subtype;	/* [str] Subtype of fs_type (if any) */
-	__u32 __spare1[1];
+	__u32 sb_source;	/* [str] Source string of the mount */
 	__u64 __spare2[48];
 	char str[];		/* Variable size part containing strings */
 };
@@ -210,6 +210,7 @@ struct mnt_id_req {
 #define STATMOUNT_MNT_NS_ID		0x00000040U	/* Want/got mnt_ns_id */
 #define STATMOUNT_MNT_OPTS		0x00000080U	/* Want/got mnt_opts */
 #define STATMOUNT_FS_SUBTYPE		0x00000100U	/* Want/got fs_subtype */
+#define STATMOUNT_SB_SOURCE		0x00000200U	/* Want/got sb_source */
 
 /*
  * Special @mnt_id values that can be passed to listmount
-- 
cgit v1.2.3


From 2f4d4503e9e5ab765a7948f98bc5deef7850f607 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Tue, 12 Nov 2024 11:10:04 +0100
Subject: statmount: add flag to retrieve unescaped options

Filesystem options can be retrieved with STATMOUNT_MNT_OPTS, which
returns a string of comma separated options, where some characters are
escaped using the \OOO notation.

Add a new flag, STATMOUNT_OPT_ARRAY, which instead returns the raw
option values separated with '\0' characters.

Since escaped characters are rare, this interface is preferable for
non-libmount users which likely don't want to deal with option
de-escaping.

Example code:

	if (st->mask & STATMOUNT_OPT_ARRAY) {
		const char *opt = st->str + st->opt_array;

		for (unsigned int i = 0; i < st->opt_num; i++) {
			printf("opt_array[%i]: <%s>\n", i, opt);
			opt += strlen(opt) + 1;
		}
	}

Example output:

(1) mnt_opts: <lowerdir+=/l\054w\054r,lowerdir+=/l\054w\054r1,upperdir=/upp\054r,workdir=/w\054rk,redirect_dir=nofollow,uuid=null>

(2) opt_array[0]: <lowerdir+=/l,w,r>
    opt_array[1]: <lowerdir+=/l,w,r1>
    opt_array[2]: <upperdir=/upp,r>
    opt_array[3]: <workdir=/w,rk>
    opt_array[4]: <redirect_dir=nofollow>
    opt_array[5]: <uuid=null>

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20241112101006.30715-1-mszeredi@redhat.com
Acked-by: Jeff Layton <jlayton@kernel.org>
[brauner: tweak variable naming and parsing add example output]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 47 +++++++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/mount.h |  7 +++++--
 2 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index d32b5afa99dc..4f39c4aba85d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5072,6 +5072,43 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
 	return 0;
 }
 
+static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
+{
+	struct vfsmount *mnt = s->mnt;
+	struct super_block *sb = mnt->mnt_sb;
+	size_t start = seq->count;
+	char *buf_start, *buf_end, *opt_start, *opt_end;
+	u32 count = 0;
+	int err;
+
+	if (!sb->s_op->show_options)
+		return 0;
+
+	buf_start = seq->buf + start;
+	err = sb->s_op->show_options(seq, mnt->mnt_root);
+	if (err)
+		return err;
+
+	if (unlikely(seq_has_overflowed(seq)))
+		return -EAGAIN;
+
+	if (seq->count == start)
+		return 0;
+
+	buf_end = seq->buf + seq->count;
+	*buf_end = '\0';
+	for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) {
+		opt_end = strchrnul(opt_start, ',');
+		*opt_end = '\0';
+		buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1;
+		if (WARN_ON_ONCE(++count == 0))
+			return -EOVERFLOW;
+	}
+	seq->count = buf_start - 1 - seq->buf;
+	s->sm.opt_num = count;
+	return 0;
+}
+
 static int statmount_string(struct kstatmount *s, u64 flag)
 {
 	int ret = 0;
@@ -5097,6 +5134,10 @@ static int statmount_string(struct kstatmount *s, u64 flag)
 		sm->mnt_opts = start;
 		ret = statmount_mnt_opts(s, seq);
 		break;
+	case STATMOUNT_OPT_ARRAY:
+		sm->opt_array = start;
+		ret = statmount_opt_array(s, seq);
+		break;
 	case STATMOUNT_FS_SUBTYPE:
 		sm->fs_subtype = start;
 		statmount_fs_subtype(s, seq);
@@ -5250,6 +5291,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 	if (!err && s->mask & STATMOUNT_MNT_OPTS)
 		err = statmount_string(s, STATMOUNT_MNT_OPTS);
 
+	if (!err && s->mask & STATMOUNT_OPT_ARRAY)
+		err = statmount_string(s, STATMOUNT_OPT_ARRAY);
+
 	if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
 		err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
 
@@ -5278,7 +5322,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
 
 #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
 			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
-			      STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE)
+			      STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
+			      STATMOUNT_OPT_ARRAY)
 
 static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
 			      struct statmount __user *buf, size_t bufsize,
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 2b49e9131d77..c0fda4604187 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -154,7 +154,7 @@ struct mount_attr {
  */
 struct statmount {
 	__u32 size;		/* Total size, including strings */
-	__u32 mnt_opts;		/* [str] Mount options of the mount */
+	__u32 mnt_opts;		/* [str] Options (comma separated, escaped) */
 	__u64 mask;		/* What results were written */
 	__u32 sb_dev_major;	/* Device ID */
 	__u32 sb_dev_minor;
@@ -175,7 +175,9 @@ struct statmount {
 	__u64 mnt_ns_id;	/* ID of the mount namespace */
 	__u32 fs_subtype;	/* [str] Subtype of fs_type (if any) */
 	__u32 sb_source;	/* [str] Source string of the mount */
-	__u64 __spare2[48];
+	__u32 opt_num;		/* Number of fs options */
+	__u32 opt_array;	/* [str] Array of nul terminated fs options */
+	__u64 __spare2[47];
 	char str[];		/* Variable size part containing strings */
 };
 
@@ -211,6 +213,7 @@ struct mnt_id_req {
 #define STATMOUNT_MNT_OPTS		0x00000080U	/* Want/got mnt_opts */
 #define STATMOUNT_FS_SUBTYPE		0x00000100U	/* Want/got fs_subtype */
 #define STATMOUNT_SB_SOURCE		0x00000200U	/* Want/got sb_source */
+#define STATMOUNT_OPT_ARRAY		0x00000400U	/* Want/got opt_... */
 
 /*
  * Special @mnt_id values that can be passed to listmount
-- 
cgit v1.2.3


From 95f567f81e43a1bcb5fbf0559e55b7505707300d Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Fri, 1 Nov 2024 15:37:03 -0400
Subject: fs: Simplify getattr interface function checking AT_GETATTR_NOSEC
 flag

Commit 8a924db2d7b5 ("fs: Pass AT_GETATTR_NOSEC flag to getattr interface
function") introduced the AT_GETATTR_NOSEC flag to ensure that the
call paths only call vfs_getattr_nosec if it is set instead of vfs_getattr.
Now, simplify the getattr interface functions of filesystems where the flag
AT_GETATTR_NOSEC is checked.

There is only a single caller of inode_operations getattr function and it
is located in fs/stat.c in vfs_getattr_nosec. The caller there is the only
one from which the AT_GETATTR_NOSEC flag is passed.

Two filesystems are checking this flag in .getattr and the flag is always
passed to them unconditionally from only vfs_getattr_nosec:

- ecryptfs:  Simplify by always calling vfs_getattr_nosec in
             ecryptfs_getattr. From there the flag is passed to no other
             function and this function is not called otherwise.

- overlayfs: Simplify by always calling vfs_getattr_nosec in
             ovl_getattr. From there the flag is passed to no other
             function and this function is not called otherwise.

The query_flags in vfs_getattr_nosec will mask-out AT_GETATTR_NOSEC from
any caller using AT_STATX_SYNC_TYPE as mask so that the flag is not
important inside this function. Also, since no filesystem is checking the
flag anymore, remove the flag entirely now, including the BUG_ON check that
never triggered.

The net change of the changes here combined with the original commit is
that ecryptfs and overlayfs do not call vfs_getattr but only
vfs_getattr_nosec.

Fixes: 8a924db2d7b5 ("fs: Pass AT_GETATTR_NOSEC flag to getattr interface function")
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Closes: https://lore.kernel.org/linux-fsdevel/20241101011724.GN1350452@ZenIV/T/#u
Cc: Tyler Hicks <code@tyhicks.com>
Cc: ecryptfs@vger.kernel.org
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: linux-unionfs@vger.kernel.org
Cc: Christian Brauner <brauner@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/inode.c        | 12 ++----------
 fs/overlayfs/inode.c       | 10 +++++-----
 fs/overlayfs/overlayfs.h   |  8 --------
 fs/stat.c                  |  5 +----
 include/uapi/linux/fcntl.h |  4 ----
 5 files changed, 8 insertions(+), 31 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5ed1e4cf6c0b..644e973d5a77 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1008,14 +1008,6 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap,
 	return rc;
 }
 
-static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat,
-			       u32 request_mask, unsigned int flags)
-{
-	if (flags & AT_GETATTR_NOSEC)
-		return vfs_getattr_nosec(path, stat, request_mask, flags);
-	return vfs_getattr(path, stat, request_mask, flags);
-}
-
 static int ecryptfs_getattr(struct mnt_idmap *idmap,
 			    const struct path *path, struct kstat *stat,
 			    u32 request_mask, unsigned int flags)
@@ -1024,8 +1016,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
 	struct kstat lower_stat;
 	int rc;
 
-	rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry),
-				 &lower_stat, request_mask, flags);
+	rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry),
+			       &lower_stat, request_mask, flags);
 	if (!rc) {
 		fsstack_copy_attr_all(d_inode(dentry),
 				      ecryptfs_inode_to_lower(d_inode(dentry)));
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 35fd3e3e1778..8b31f44c12cd 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -170,7 +170,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
 
 	type = ovl_path_real(dentry, &realpath);
 	old_cred = ovl_override_creds(dentry->d_sb);
-	err = ovl_do_getattr(&realpath, stat, request_mask, flags);
+	err = vfs_getattr_nosec(&realpath, stat, request_mask, flags);
 	if (err)
 		goto out;
 
@@ -195,8 +195,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
 					(!is_dir ? STATX_NLINK : 0);
 
 			ovl_path_lower(dentry, &realpath);
-			err = ovl_do_getattr(&realpath, &lowerstat, lowermask,
-					     flags);
+			err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask,
+						flags);
 			if (err)
 				goto out;
 
@@ -248,8 +248,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
 
 			ovl_path_lowerdata(dentry, &realpath);
 			if (realpath.dentry) {
-				err = ovl_do_getattr(&realpath, &lowerdatastat,
-						     lowermask, flags);
+				err = vfs_getattr_nosec(&realpath, &lowerdatastat,
+							lowermask, flags);
 				if (err)
 					goto out;
 			} else {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0bfe35da4b7b..910dbbb2bb7b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -412,14 +412,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
 	return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
 }
 
-static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
-				 u32 request_mask, unsigned int flags)
-{
-	if (flags & AT_GETATTR_NOSEC)
-		return vfs_getattr_nosec(path, stat, request_mask, flags);
-	return vfs_getattr(path, stat, request_mask, flags);
-}
-
 /* util.c */
 int ovl_get_write_access(struct dentry *dentry);
 void ovl_put_write_access(struct dentry *dentry);
diff --git a/fs/stat.c b/fs/stat.c
index 855b995ad09b..011d2160b7af 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -165,7 +165,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
 	if (inode->i_op->getattr)
 		return inode->i_op->getattr(idmap, path, stat,
 					    request_mask,
-					    query_flags | AT_GETATTR_NOSEC);
+					    query_flags);
 
 	generic_fillattr(idmap, request_mask, inode, stat);
 	return 0;
@@ -198,9 +198,6 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
 {
 	int retval;
 
-	if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC))
-		return -EPERM;
-
 	retval = security_inode_getattr(path);
 	if (retval)
 		return retval;
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 87e2dec79fea..a40833bf2855 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -154,8 +154,4 @@
 					   usable with open_by_handle_at(2). */
 #define AT_HANDLE_MNT_ID_UNIQUE	0x001	/* Return the u64 unique mount ID. */
 
-#if defined(__KERNEL__)
-#define AT_GETATTR_NOSEC	0x80000000
-#endif
-
 #endif /* _UAPI_LINUX_FCNTL_H */
-- 
cgit v1.2.3


From 7c1ae151e81268db1fe8c8a473d922fc5ba47b72 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Wed, 13 Nov 2024 13:51:54 +0200
Subject: virtio_pci: Introduce device parts access commands

Introduce device parts access commands via the admin queue.

These commands and their structure adhere to the Virtio 1.4
specification.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20241113115200.209269-2-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/uapi/linux/virtio_pci.h | 131 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index a8208492e822..1beb317df1b9 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -40,6 +40,7 @@
 #define _LINUX_VIRTIO_PCI_H
 
 #include <linux/types.h>
+#include <linux/kernel.h>
 
 #ifndef VIRTIO_PCI_NO_LEGACY
 
@@ -240,6 +241,17 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ		0x5
 #define VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO		0x6
 
+/* Device parts access commands. */
+#define VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY		0x7
+#define VIRTIO_ADMIN_CMD_DEVICE_CAP_GET			0x8
+#define VIRTIO_ADMIN_CMD_DRIVER_CAP_SET			0x9
+#define VIRTIO_ADMIN_CMD_RESOURCE_OBJ_CREATE		0xa
+#define VIRTIO_ADMIN_CMD_RESOURCE_OBJ_DESTROY		0xd
+#define VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_GET		0xe
+#define VIRTIO_ADMIN_CMD_DEV_PARTS_GET			0xf
+#define VIRTIO_ADMIN_CMD_DEV_PARTS_SET			0x10
+#define VIRTIO_ADMIN_CMD_DEV_MODE_SET			0x11
+
 struct virtio_admin_cmd_hdr {
 	__le16 opcode;
 	/*
@@ -286,4 +298,123 @@ struct virtio_admin_cmd_notify_info_result {
 	struct virtio_admin_cmd_notify_info_data entries[VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO];
 };
 
+#define VIRTIO_DEV_PARTS_CAP 0x0000
+
+struct virtio_dev_parts_cap {
+	__u8 get_parts_resource_objects_limit;
+	__u8 set_parts_resource_objects_limit;
+};
+
+#define MAX_CAP_ID __KERNEL_DIV_ROUND_UP(VIRTIO_DEV_PARTS_CAP + 1, 64)
+
+struct virtio_admin_cmd_query_cap_id_result {
+	__le64 supported_caps[MAX_CAP_ID];
+};
+
+struct virtio_admin_cmd_cap_get_data {
+	__le16 id;
+	__u8 reserved[6];
+};
+
+struct virtio_admin_cmd_cap_set_data {
+	__le16 id;
+	__u8 reserved[6];
+	__u8 cap_specific_data[];
+};
+
+struct virtio_admin_cmd_resource_obj_cmd_hdr {
+	__le16 type;
+	__u8 reserved[2];
+	__le32 id; /* Indicates unique resource object id per resource object type */
+};
+
+struct virtio_admin_cmd_resource_obj_create_data {
+	struct virtio_admin_cmd_resource_obj_cmd_hdr hdr;
+	__le64 flags;
+	__u8 resource_obj_specific_data[];
+};
+
+#define VIRTIO_RESOURCE_OBJ_DEV_PARTS 0
+
+#define VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET 0
+#define VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET 1
+
+struct virtio_resource_obj_dev_parts {
+	__u8 type;
+	__u8 reserved[7];
+};
+
+#define VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE 0
+#define VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_COUNT 1
+#define VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_LIST 2
+
+struct virtio_admin_cmd_dev_parts_metadata_data {
+	struct virtio_admin_cmd_resource_obj_cmd_hdr hdr;
+	__u8 type;
+	__u8 reserved[7];
+};
+
+#define VIRTIO_DEV_PART_F_OPTIONAL 0
+
+struct virtio_dev_part_hdr {
+	__le16 part_type;
+	__u8 flags;
+	__u8 reserved;
+	union {
+		struct {
+			__le32 offset;
+			__le32 reserved;
+		} pci_common_cfg;
+		struct {
+			__le16 index;
+			__u8 reserved[6];
+		} vq_index;
+	} selector;
+	__le32 length;
+};
+
+struct virtio_dev_part {
+	struct virtio_dev_part_hdr hdr;
+	__u8 value[];
+};
+
+struct virtio_admin_cmd_dev_parts_metadata_result {
+	union {
+		struct {
+			__le32 size;
+			__le32 reserved;
+		} parts_size;
+		struct {
+			__le32 count;
+			__le32 reserved;
+		} hdr_list_count;
+		struct {
+			__le32 count;
+			__le32 reserved;
+			struct virtio_dev_part_hdr hdrs[];
+		} hdr_list;
+	};
+};
+
+#define VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_SELECTED 0
+#define VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL 1
+
+struct virtio_admin_cmd_dev_parts_get_data {
+	struct virtio_admin_cmd_resource_obj_cmd_hdr hdr;
+	__u8 type;
+	__u8 reserved[7];
+	struct virtio_dev_part_hdr hdr_list[];
+};
+
+struct virtio_admin_cmd_dev_parts_set_data {
+	struct virtio_admin_cmd_resource_obj_cmd_hdr hdr;
+	struct virtio_dev_part parts[];
+};
+
+#define VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED 0
+
+struct virtio_admin_cmd_dev_mode_set_data {
+	__u8 flags;
+};
+
 #endif
-- 
cgit v1.2.3


From aefff51e1c2986e16f2780ca8e4c97b784800ab5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 14 Nov 2024 16:31:27 +0100
Subject: statmount: retrieve security mount options

Add the ability to retrieve security mount options. Keep them separate
from filesystem specific mount options so it's easy to tell them apart.
Also allow to retrieve them separate from other mount options as most of
the time users won't be interested in security specific mount options.

Link: https://lore.kernel.org/r/20241114-radtour-ofenrohr-ff34b567b40a@brauner
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 74 +++++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/mount.h |  5 +++-
 2 files changed, 64 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 4f39c4aba85d..a9065a9ab971 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5072,13 +5072,30 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
 	return 0;
 }
 
+static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start)
+{
+	char *buf_end, *opt_start, *opt_end;
+	int count = 0;	/* number of options emitted; returned on success */
+
+	buf_end = seq->buf + seq->count;
+	*buf_end = '\0';	/* terminate so strchrnul() stops at the end */
+	for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) {
+		opt_end = strchrnul(opt_start, ',');
+		*opt_end = '\0';	/* cut the current option at its ',' */
+		buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1;
+		if (WARN_ON_ONCE(++count == INT_MAX))
+			return -EOVERFLOW;
+	}
+	seq->count = buf_start - 1 - seq->buf;	/* exclude the last NUL */
+	return count;
+}
+
 static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
 {
 	struct vfsmount *mnt = s->mnt;
 	struct super_block *sb = mnt->mnt_sb;
 	size_t start = seq->count;
-	char *buf_start, *buf_end, *opt_start, *opt_end;
-	u32 count = 0;
+	char *buf_start;
 	int err;
 
 	if (!sb->s_op->show_options)
@@ -5095,17 +5112,39 @@ static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
 	if (seq->count == start)
 		return 0;
 
-	buf_end = seq->buf + seq->count;
-	*buf_end = '\0';
-	for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) {
-		opt_end = strchrnul(opt_start, ',');
-		*opt_end = '\0';
-		buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1;
-		if (WARN_ON_ONCE(++count == 0))
-			return -EOVERFLOW;
-	}
-	seq->count = buf_start - 1 - seq->buf;
-	s->sm.opt_num = count;
+	err = statmount_opt_unescape(seq, buf_start);
+	if (err < 0)
+		return err;
+
+	s->sm.opt_num = err;
+	return 0;
+}
+
+static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
+{
+	struct vfsmount *mnt = s->mnt;
+	struct super_block *sb = mnt->mnt_sb;
+	size_t start = seq->count;
+	char *buf_start;
+	int err;
+
+	buf_start = seq->buf + start;
+
+	err = security_sb_show_options(seq, sb);
+	if (err)	/* hook failed: propagate instead of parsing */
+		return err;
+
+	if (unlikely(seq_has_overflowed(seq)))
+		return -EAGAIN;
+
+	if (seq->count == start)	/* no security options were emitted */
+		return 0;
+
+	err = statmount_opt_unescape(seq, buf_start);
+	if (err < 0)
+		return err;
+
+	s->sm.opt_sec_num = err;	/* non-negative here: the option count */
+	return 0;
+}
 
@@ -5138,6 +5177,10 @@ static int statmount_string(struct kstatmount *s, u64 flag)
 		sm->opt_array = start;
 		ret = statmount_opt_array(s, seq);
 		break;
+	case STATMOUNT_OPT_SEC_ARRAY:
+		sm->opt_sec_array = start;
+		ret = statmount_opt_sec_array(s, seq);
+		break;
 	case STATMOUNT_FS_SUBTYPE:
 		sm->fs_subtype = start;
 		statmount_fs_subtype(s, seq);
@@ -5294,6 +5337,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 	if (!err && s->mask & STATMOUNT_OPT_ARRAY)
 		err = statmount_string(s, STATMOUNT_OPT_ARRAY);
 
+	if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
+		err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
+
 	if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
 		err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
 
@@ -5323,7 +5369,7 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
 #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
 			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
 			      STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
-			      STATMOUNT_OPT_ARRAY)
+			      STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
 
 static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
 			      struct statmount __user *buf, size_t bufsize,
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index c0fda4604187..c07008816aca 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -177,7 +177,9 @@ struct statmount {
 	__u32 sb_source;	/* [str] Source string of the mount */
 	__u32 opt_num;		/* Number of fs options */
 	__u32 opt_array;	/* [str] Array of nul terminated fs options */
-	__u64 __spare2[47];
+	__u32 opt_sec_num;	/* Number of security options */
+	__u32 opt_sec_array;	/* [str] Array of nul terminated security options */
+	__u64 __spare2[46];
 	char str[];		/* Variable size part containing strings */
 };
 
@@ -214,6 +216,7 @@ struct mnt_id_req {
 #define STATMOUNT_FS_SUBTYPE		0x00000100U	/* Want/got fs_subtype */
 #define STATMOUNT_SB_SOURCE		0x00000200U	/* Want/got sb_source */
 #define STATMOUNT_OPT_ARRAY		0x00000400U	/* Want/got opt_... */
+#define STATMOUNT_OPT_SEC_ARRAY		0x00000800U	/* Want/got opt_sec... */
 
 /*
  * Special @mnt_id values that can be passed to listmount
-- 
cgit v1.2.3


From 829ed626499c11c9d11c65e93febc1e0da7cd61b Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Wed, 13 Nov 2024 11:51:36 -0800
Subject: iommufd: Add IOMMU_IOAS_CHANGE_PROCESS

Add an ioctl that updates all DMA mappings to reflect the current process.
Change the mm and transfer locked memory accounting from old to current mm.
This will be used for live update, allowing an old process to hand the
iommufd device descriptor to a new process.  The new process calls the
ioctl.

IOMMU_IOAS_CHANGE_PROCESS only supports DMA mappings created with
IOMMU_IOAS_MAP_FILE, because the kernel metadata for such mappings does
not depend on the userland VA of the pages (which is different in the new
process).
IOMMU_IOAS_CHANGE_PROCESS fails if other types of mappings are present.

This is a revised version of code originally provided by Jason.

Link: https://patch.msgid.link/r/1731527497-16091-4-git-send-email-steven.sistare@oracle.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/io_pagetable.h    |   1 +
 drivers/iommu/iommufd/ioas.c            | 147 ++++++++++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |   1 +
 drivers/iommu/iommufd/main.c            |   2 +
 include/uapi/linux/iommufd.h            |  23 +++++
 5 files changed, 174 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index f5f20fa639ef..10c928a9a463 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -173,6 +173,7 @@ enum {
 	IOPT_PAGES_ACCOUNT_NONE = 0,
 	IOPT_PAGES_ACCOUNT_USER = 1,
 	IOPT_PAGES_ACCOUNT_MM = 2,
+	IOPT_PAGES_ACCOUNT_MODE_NUM = 3,
 };
 
 enum iopt_address_type {
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index c82ed5a92e3b..1542c5fd10a8 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -439,6 +439,153 @@ static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
 	return 0;
 }
 
+static bool need_charge_update(struct iopt_pages *pages)
+{
+	switch (pages->account_mode) {
+	case IOPT_PAGES_ACCOUNT_NONE:
+		return false;
+	case IOPT_PAGES_ACCOUNT_MM:
+		return pages->source_mm != current->mm;
+	case IOPT_PAGES_ACCOUNT_USER:
+		/*
+		 * Update when mm changes because it also accounts
+		 * in mm->pinned_vm.
+		 */
+		return (pages->source_user != current_user()) ||
+		       (pages->source_mm != current->mm);
+	}
+	return true;
+}
+
+static int charge_current(unsigned long *npinned)
+{
+	struct iopt_pages tmp = {
+		.source_mm = current->mm,
+		.source_task = current->group_leader,
+		.source_user = current_user(),
+	};
+	unsigned int account_mode;
+	int rc;
+
+	for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM;
+	     account_mode++) {
+		if (!npinned[account_mode])
+			continue;
+
+		tmp.account_mode = account_mode;
+		rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true,
+					      NULL);
+		if (rc)
+			goto err_undo;
+	}
+	return 0;
+
+err_undo:
+	while (account_mode != 0) {
+		account_mode--;
+		if (!npinned[account_mode])
+			continue;
+		tmp.account_mode = account_mode;
+		iopt_pages_update_pinned(&tmp, npinned[account_mode], false,
+					 NULL);
+	}
+	return rc;
+}
+
+static void change_mm(struct iopt_pages *pages)
+{
+	struct task_struct *old_task = pages->source_task;
+	struct user_struct *old_user = pages->source_user;
+	struct mm_struct *old_mm = pages->source_mm;
+
+	pages->source_mm = current->mm;
+	mmgrab(pages->source_mm);
+	mmdrop(old_mm);
+
+	pages->source_task = current->group_leader;
+	get_task_struct(pages->source_task);
+	put_task_struct(old_task);
+
+	pages->source_user = get_uid(current_user());
+	free_uid(old_user);
+}
+
+#define for_each_ioas_area(_xa, _index, _ioas, _area) \
+	xa_for_each((_xa), (_index), (_ioas)) \
+		for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
+		     _area; \
+		     _area = iopt_area_iter_next(_area, 0, ULONG_MAX))
+
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_ioas_change_process *cmd = ucmd->cmd;
+	struct iommufd_ctx *ictx = ucmd->ictx;
+	unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {};
+	struct iommufd_ioas *ioas;
+	struct iopt_area *area;
+	struct iopt_pages *pages;
+	struct xarray ioas_list;
+	unsigned long index;
+	int rc;
+
+	if (cmd->__reserved)
+		return -EOPNOTSUPP;
+
+	xa_init(&ioas_list);
+	rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
+	if (rc)
+		return rc;
+
+	for_each_ioas_area(&ioas_list, index, ioas, area)  {
+		if (area->pages->type != IOPT_ADDRESS_FILE) {
+			rc = -EINVAL;
+			goto out;
+		}
+	}
+
+	/*
+	 * Count last_pinned pages, then clear it to avoid double counting
+	 * if the same iopt_pages is visited multiple times in this loop.
+	 * Since we are under all the locks, npinned == last_npinned, so we
+	 * can easily restore last_npinned before we return.
+	 */
+	for_each_ioas_area(&ioas_list, index, ioas, area)  {
+		pages = area->pages;
+
+		if (need_charge_update(pages)) {
+			all_npinned[pages->account_mode] += pages->last_npinned;
+			pages->last_npinned = 0;
+		}
+	}
+
+	rc = charge_current(all_npinned);
+
+	if (rc) {
+		/* Charge failed.  Fix last_npinned and bail. */
+		for_each_ioas_area(&ioas_list, index, ioas, area)
+			area->pages->last_npinned = area->pages->npinned;
+		goto out;
+	}
+
+	for_each_ioas_area(&ioas_list, index, ioas, area) {
+		pages = area->pages;
+
+		/* Uncharge the old one (which also restores last_npinned) */
+		if (need_charge_update(pages)) {
+			int r = iopt_pages_update_pinned(pages, pages->npinned,
+							 false, NULL);
+
+			if (WARN_ON(r))
+				rc = r;
+		}
+		change_mm(pages);
+	}
+
+out:
+	iommufd_release_all_iova_rwsem(ictx, &ioas_list);
+	return rc;
+}
+
 int iommufd_option_rlimit_mode(struct iommu_option *cmd,
 			       struct iommufd_ctx *ictx)
 {
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 57c0c8f0f6a5..b6d706cf2c66 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -255,6 +255,7 @@ int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 13ac2286035e..0a96cc8f27da 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -349,6 +349,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 struct iommu_ioas_alloc, out_ioas_id),
 	IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
 		 struct iommu_ioas_allow_iovas, allowed_iovas),
+	IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process,
+		 struct iommu_ioas_change_process, __reserved),
 	IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
 		 src_iova),
 	IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 747d3d9baa3d..4ae8b1ee0444 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -54,6 +54,7 @@ enum {
 	IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
 	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
 	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
+	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
 };
 
 /**
@@ -972,4 +973,26 @@ struct iommu_vdevice_alloc {
 	__aligned_u64 virt_id;
 };
 #define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
+
+/**
+ * struct iommu_ioas_change_process - ioctl(IOMMU_IOAS_CHANGE_PROCESS)
+ * @size: sizeof(struct iommu_ioas_change_process)
+ * @__reserved: Must be 0
+ *
+ * This transfers pinned memory counts for every memory map in every IOAS
+ * in the context to the current process.  This only supports maps created
+ * with IOMMU_IOAS_MAP_FILE, and returns EINVAL if other maps are present.
+ * If the ioctl returns a failure status, then nothing is changed.
+ *
+ * This API is useful for transferring operation of a device from one process
+ * to another, such as during userland live update.
+ */
+struct iommu_ioas_change_process {
+	__u32 size;
+	__u32 __reserved;
+};
+
+#define IOMMU_IOAS_CHANGE_PROCESS \
+	_IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
+
 #endif
-- 
cgit v1.2.3


From a12143e6084c502fc3cfaa8b717bffc8c14cf806 Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Thu, 14 Nov 2024 22:07:51 +0100
Subject: netfilter: bitwise: rename some boolean operation functions

In the next patch we add support for doing AND, OR and XOR operations
directly in the kernel, so rename some functions and an enum constant
related to mask-and-xor boolean operations.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 10 +++++++---
 net/netfilter/nft_bitwise.c              | 34 ++++++++++++++++----------------
 2 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 9e9079321380..487542234ccd 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -564,16 +564,20 @@ enum nft_immediate_attributes {
 /**
  * enum nft_bitwise_ops - nf_tables bitwise operations
  *
- * @NFT_BITWISE_BOOL: mask-and-xor operation used to implement NOT, AND, OR and
- *                    XOR boolean operations
+ * @NFT_BITWISE_MASK_XOR: mask-and-xor operation used to implement NOT, AND, OR
+ *                        and XOR boolean operations
  * @NFT_BITWISE_LSHIFT: left-shift operation
  * @NFT_BITWISE_RSHIFT: right-shift operation
  */
 enum nft_bitwise_ops {
-	NFT_BITWISE_BOOL,
+	NFT_BITWISE_MASK_XOR,
 	NFT_BITWISE_LSHIFT,
 	NFT_BITWISE_RSHIFT,
 };
+/*
+ * Old name for NFT_BITWISE_MASK_XOR.  Retained for backwards-compatibility.
+ */
+#define NFT_BITWISE_BOOL NFT_BITWISE_MASK_XOR
 
 /**
  * enum nft_bitwise_attributes - nf_tables bitwise expression netlink attributes
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 7de95674fd8c..7f6a4f800537 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -25,8 +25,8 @@ struct nft_bitwise {
 	struct nft_data		data;
 };
 
-static void nft_bitwise_eval_bool(u32 *dst, const u32 *src,
-				  const struct nft_bitwise *priv)
+static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src,
+				      const struct nft_bitwise *priv)
 {
 	unsigned int i;
 
@@ -68,8 +68,8 @@ void nft_bitwise_eval(const struct nft_expr *expr,
 	u32 *dst = &regs->data[priv->dreg];
 
 	switch (priv->op) {
-	case NFT_BITWISE_BOOL:
-		nft_bitwise_eval_bool(dst, src, priv);
+	case NFT_BITWISE_MASK_XOR:
+		nft_bitwise_eval_mask_xor(dst, src, priv);
 		break;
 	case NFT_BITWISE_LSHIFT:
 		nft_bitwise_eval_lshift(dst, src, priv);
@@ -90,8 +90,8 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
 	[NFTA_BITWISE_DATA]	= { .type = NLA_NESTED },
 };
 
-static int nft_bitwise_init_bool(struct nft_bitwise *priv,
-				 const struct nlattr *const tb[])
+static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv,
+				     const struct nlattr *const tb[])
 {
 	struct nft_data_desc mask = {
 		.type	= NFT_DATA_VALUE,
@@ -185,7 +185,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
 	if (tb[NFTA_BITWISE_OP]) {
 		priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP]));
 		switch (priv->op) {
-		case NFT_BITWISE_BOOL:
+		case NFT_BITWISE_MASK_XOR:
 		case NFT_BITWISE_LSHIFT:
 		case NFT_BITWISE_RSHIFT:
 			break;
@@ -193,12 +193,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
 			return -EOPNOTSUPP;
 		}
 	} else {
-		priv->op = NFT_BITWISE_BOOL;
+		priv->op = NFT_BITWISE_MASK_XOR;
 	}
 
 	switch(priv->op) {
-	case NFT_BITWISE_BOOL:
-		err = nft_bitwise_init_bool(priv, tb);
+	case NFT_BITWISE_MASK_XOR:
+		err = nft_bitwise_init_mask_xor(priv, tb);
 		break;
 	case NFT_BITWISE_LSHIFT:
 	case NFT_BITWISE_RSHIFT:
@@ -209,8 +209,8 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
 	return err;
 }
 
-static int nft_bitwise_dump_bool(struct sk_buff *skb,
-				 const struct nft_bitwise *priv)
+static int nft_bitwise_dump_mask_xor(struct sk_buff *skb,
+				     const struct nft_bitwise *priv)
 {
 	if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
 			  NFT_DATA_VALUE, priv->len) < 0)
@@ -248,8 +248,8 @@ static int nft_bitwise_dump(struct sk_buff *skb,
 		return -1;
 
 	switch (priv->op) {
-	case NFT_BITWISE_BOOL:
-		err = nft_bitwise_dump_bool(skb, priv);
+	case NFT_BITWISE_MASK_XOR:
+		err = nft_bitwise_dump_mask_xor(skb, priv);
 		break;
 	case NFT_BITWISE_LSHIFT:
 	case NFT_BITWISE_RSHIFT:
@@ -269,7 +269,7 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
 	const struct nft_bitwise *priv = nft_expr_priv(expr);
 	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
 
-	if (priv->op != NFT_BITWISE_BOOL)
+	if (priv->op != NFT_BITWISE_MASK_XOR)
 		return -EOPNOTSUPP;
 
 	if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) ||
@@ -406,7 +406,7 @@ nft_bitwise_fast_dump(struct sk_buff *skb,
 		return -1;
 	if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32))))
 		return -1;
-	if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_BOOL)))
+	if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR)))
 		return -1;
 
 	data.data[0] = priv->mask;
@@ -501,7 +501,7 @@ nft_bitwise_select_ops(const struct nft_ctx *ctx,
 		return &nft_bitwise_ops;
 
 	if (tb[NFTA_BITWISE_OP] &&
-	    ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_BOOL)
+	    ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR)
 		return &nft_bitwise_ops;
 
 	return &nft_bitwise_fast_ops;
-- 
cgit v1.2.3


From c374196b2b9f4b803fccd59ed82f0712041e21e1 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 11 Oct 2024 11:00:22 +0200
Subject: fs: name_to_handle_at() support for "explicit connectable" file
 handles

nfsd encodes "connectable" file handles for the subtree_check feature,
which can be resolved to an open file with a connected path.
So far, userspace nfs server could not make use of this functionality.

Introduce a new flag AT_HANDLE_CONNECTABLE to name_to_handle_at(2).
When used, the encoded file handle is "explicitly connectable".

The "explicitly connectable" file handle sets bits in the high 16bit of
the handle_type field, so open_by_handle_at(2) will know that it needs
to open a file with a connected path.

old kernels will not recognize the handle_type with high bits set,
so "explicitly connectable" file handles cannot be decoded by
open_by_handle_at(2) on old kernels.

The flag AT_HANDLE_CONNECTABLE is not allowed together with either
AT_HANDLE_FID or AT_EMPTY_PATH.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20241011090023.655623-3-amir73il@gmail.com
Fixes: 570df4e9c23f ("ceph: snapshot nfs re-export")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fhandle.c               | 48 +++++++++++++++++++++++++++++++++++++++++-----
 include/linux/exportfs.h   |  2 ++
 include/uapi/linux/fcntl.h |  1 +
 3 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 218511f38cbb..8339a1041025 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -31,6 +31,14 @@ static long do_sys_name_to_handle(const struct path *path,
 	if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags))
 		return -EOPNOTSUPP;
 
+	/*
+	 * A request to encode a connectable handle for a disconnected dentry
+	 * is unexpected since AT_EMPTY_PATH is not allowed.
+	 */
+	if (fh_flags & EXPORT_FH_CONNECTABLE &&
+	    WARN_ON(path->dentry->d_flags & DCACHE_DISCONNECTED))
+		return -EINVAL;
+
 	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
 		return -EFAULT;
 
@@ -45,7 +53,7 @@ static long do_sys_name_to_handle(const struct path *path,
 	/* convert handle size to multiple of sizeof(u32) */
 	handle_dwords = f_handle.handle_bytes >> 2;
 
-	/* we ask for a non connectable maybe decodeable file handle */
+	/* Encode a possibly decodeable/connectable file handle */
 	retval = exportfs_encode_fh(path->dentry,
 				    (struct fid *)handle->f_handle,
 				    &handle_dwords, fh_flags);
@@ -67,8 +75,23 @@ static long do_sys_name_to_handle(const struct path *path,
 		 * non variable part of the file_handle
 		 */
 		handle_bytes = 0;
-	} else
+	} else {
+		/*
+		 * When asked to encode a connectable file handle, encode this
+		 * property in the file handle itself, so that we later know
+		 * how to decode it.
+		 * For sanity, also encode in the file handle if the encoded
+		 * object is a directory and verify this during decode, because
+		 * decoding directory file handles is quite different than
+		 * decoding connectable non-directory file handles.
+		 */
+		if (fh_flags & EXPORT_FH_CONNECTABLE) {
+			handle->handle_type |= FILEID_IS_CONNECTABLE;
+			if (d_is_dir(path->dentry))
+				handle->handle_type |= FILEID_IS_DIR;
+		}
 		retval = 0;
+	}
 	/* copy the mount id */
 	if (unique_mntid) {
 		if (put_user(real_mount(path->mnt)->mnt_id_unique,
@@ -109,15 +132,30 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 {
 	struct path path;
 	int lookup_flags;
-	int fh_flags;
+	int fh_flags = 0;
 	int err;
 
 	if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
-		     AT_HANDLE_MNT_ID_UNIQUE))
+		     AT_HANDLE_MNT_ID_UNIQUE | AT_HANDLE_CONNECTABLE))
+		return -EINVAL;
+
+	/*
+	 * AT_HANDLE_FID means there is no intention to decode file handle
+	 * AT_HANDLE_CONNECTABLE means there is an intention to decode a
+	 * connected fd (with known path), so these flags are conflicting.
+	 * AT_EMPTY_PATH could be used along with a dfd that refers to a
+	 * disconnected non-directory, which cannot be used to encode a
+	 * connectable file handle, because its parent is unknown.
+	 */
+	if (flag & AT_HANDLE_CONNECTABLE &&
+	    flag & (AT_HANDLE_FID | AT_EMPTY_PATH))
 		return -EINVAL;
+	else if (flag & AT_HANDLE_FID)
+		fh_flags |= EXPORT_FH_FID;
+	else if (flag & AT_HANDLE_CONNECTABLE)
+		fh_flags |= EXPORT_FH_CONNECTABLE;
 
 	lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
-	fh_flags = (flag & AT_HANDLE_FID) ? EXPORT_FH_FID : 0;
 	if (flag & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;
 	err = user_path_at(dfd, name, lookup_flags, &path);
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 5e14d4500a75..4ee42b2cf4ab 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -169,6 +169,8 @@ struct fid {
 #define FILEID_USER_FLAGS(type) ((type) & FILEID_USER_FLAGS_MASK)
 
 /* Flags supported in encoded handle_type that is exported to user */
+#define FILEID_IS_CONNECTABLE	0x10000
+#define FILEID_IS_DIR		0x20000
 #define FILEID_VALID_USER_FLAGS	(0)
 
 /**
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 87e2dec79fea..56ff2100e021 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -153,6 +153,7 @@
 					   object identity and may not be
 					   usable with open_by_handle_at(2). */
 #define AT_HANDLE_MNT_ID_UNIQUE	0x001	/* Return the u64 unique mount ID. */
+#define AT_HANDLE_CONNECTABLE	0x002	/* Request a connectable file handle */
 
 #if defined(__KERNEL__)
 #define AT_GETATTR_NOSEC	0x80000000
-- 
cgit v1.2.3


From b0ccf4f53d968e794a4ea579d5135cc1aaf1a53f Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Thu, 14 Nov 2024 22:08:13 +0100
Subject: netfilter: bitwise: add support for doing AND, OR and XOR directly

Hitherto, these operations have been converted in user space to
mask-and-xor operations on one register and two immediate values, and it
is the latter which have been evaluated by the kernel.  We add support
for evaluating these operations directly in kernel space on one register
and either an immediate value or a second register.

Pablo made a few changes to the original patch:

- EINVAL if NFTA_BITWISE_SREG2 is used with fast version.
- Allow _AND,_OR,_XOR with _DATA != sizeof(u32)
- Dump _SREG2 or _DATA with _AND,_OR,_XOR

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   8 ++
 net/netfilter/nft_bitwise.c              | 134 ++++++++++++++++++++++++++++---
 2 files changed, 131 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 487542234ccd..49c944e78463 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -568,11 +568,17 @@ enum nft_immediate_attributes {
  *                        and XOR boolean operations
  * @NFT_BITWISE_LSHIFT: left-shift operation
  * @NFT_BITWISE_RSHIFT: right-shift operation
+ * @NFT_BITWISE_AND: and operation
+ * @NFT_BITWISE_OR: or operation
+ * @NFT_BITWISE_XOR: xor operation
  */
 enum nft_bitwise_ops {
 	NFT_BITWISE_MASK_XOR,
 	NFT_BITWISE_LSHIFT,
 	NFT_BITWISE_RSHIFT,
+	NFT_BITWISE_AND,
+	NFT_BITWISE_OR,
+	NFT_BITWISE_XOR,
 };
 /*
  * Old name for NFT_BITWISE_MASK_XOR.  Retained for backwards-compatibility.
@@ -590,6 +596,7 @@ enum nft_bitwise_ops {
  * @NFTA_BITWISE_OP: type of operation (NLA_U32: nft_bitwise_ops)
  * @NFTA_BITWISE_DATA: argument for non-boolean operations
  *                     (NLA_NESTED: nft_data_attributes)
+ * @NFTA_BITWISE_SREG2: second source register (NLA_U32: nft_registers)
  *
  * The bitwise expression supports boolean and shift operations.  It implements
  * the boolean operations by performing the following operation:
@@ -613,6 +620,7 @@ enum nft_bitwise_attributes {
 	NFTA_BITWISE_XOR,
 	NFTA_BITWISE_OP,
 	NFTA_BITWISE_DATA,
+	NFTA_BITWISE_SREG2,
 	__NFTA_BITWISE_MAX
 };
 #define NFTA_BITWISE_MAX	(__NFTA_BITWISE_MAX - 1)
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 7f6a4f800537..d550910aabec 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -17,6 +17,7 @@
 
 struct nft_bitwise {
 	u8			sreg;
+	u8			sreg2;
 	u8			dreg;
 	enum nft_bitwise_ops	op:8;
 	u8			len;
@@ -60,28 +61,72 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src,
 	}
 }
 
+static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2,
+				 const struct nft_bitwise *priv)
+{
+	unsigned int i, n;
+
+	for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+		dst[i] = src[i] & src2[i];
+}
+
+static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2,
+				const struct nft_bitwise *priv)
+{
+	unsigned int i, n;
+
+	for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+		dst[i] = src[i] | src2[i];
+}
+
+static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2,
+				 const struct nft_bitwise *priv)
+{
+	unsigned int i, n;
+
+	for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+		dst[i] = src[i] ^ src2[i];
+}
+
 void nft_bitwise_eval(const struct nft_expr *expr,
 		      struct nft_regs *regs, const struct nft_pktinfo *pkt)
 {
 	const struct nft_bitwise *priv = nft_expr_priv(expr);
-	const u32 *src = &regs->data[priv->sreg];
+	const u32 *src = &regs->data[priv->sreg], *src2;
 	u32 *dst = &regs->data[priv->dreg];
 
-	switch (priv->op) {
-	case NFT_BITWISE_MASK_XOR:
+	if (priv->op == NFT_BITWISE_MASK_XOR) {
 		nft_bitwise_eval_mask_xor(dst, src, priv);
-		break;
-	case NFT_BITWISE_LSHIFT:
+		return;
+	}
+	if (priv->op == NFT_BITWISE_LSHIFT) {
 		nft_bitwise_eval_lshift(dst, src, priv);
-		break;
-	case NFT_BITWISE_RSHIFT:
+		return;
+	}
+	if (priv->op == NFT_BITWISE_RSHIFT) {
 		nft_bitwise_eval_rshift(dst, src, priv);
-		break;
+		return;
+	}
+
+	src2 = priv->sreg2 ? &regs->data[priv->sreg2] : priv->data.data;
+
+	if (priv->op == NFT_BITWISE_AND) {
+		nft_bitwise_eval_and(dst, src, src2, priv);
+		return;
+	}
+	if (priv->op == NFT_BITWISE_OR) {
+		nft_bitwise_eval_or(dst, src, src2, priv);
+		return;
+	}
+	if (priv->op == NFT_BITWISE_XOR) {
+		nft_bitwise_eval_xor(dst, src, src2, priv);
+		return;
 	}
 }
 
 static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
 	[NFTA_BITWISE_SREG]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_SREG2]	= { .type = NLA_U32 },
 	[NFTA_BITWISE_DREG]	= { .type = NLA_U32 },
 	[NFTA_BITWISE_LEN]	= { .type = NLA_U32 },
 	[NFTA_BITWISE_MASK]	= { .type = NLA_NESTED },
@@ -105,7 +150,8 @@ static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv,
 	};
 	int err;
 
-	if (tb[NFTA_BITWISE_DATA])
+	if (tb[NFTA_BITWISE_DATA] ||
+	    tb[NFTA_BITWISE_SREG2])
 		return -EINVAL;
 
 	if (!tb[NFTA_BITWISE_MASK] ||
@@ -139,7 +185,8 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv,
 	int err;
 
 	if (tb[NFTA_BITWISE_MASK] ||
-	    tb[NFTA_BITWISE_XOR])
+	    tb[NFTA_BITWISE_XOR]  ||
+	    tb[NFTA_BITWISE_SREG2])
 		return -EINVAL;
 
 	if (!tb[NFTA_BITWISE_DATA])
@@ -157,6 +204,41 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv,
 	return 0;
 }
 
+static int nft_bitwise_init_bool(const struct nft_ctx *ctx,
+				 struct nft_bitwise *priv,
+				 const struct nlattr *const tb[])
+{
+	int err;
+
+	if (tb[NFTA_BITWISE_MASK] ||
+	    tb[NFTA_BITWISE_XOR])
+		return -EINVAL;
+
+	if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) ||
+	    (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2]))
+		return -EINVAL;
+
+	if (tb[NFTA_BITWISE_DATA]) {
+		struct nft_data_desc desc = {
+			.type	= NFT_DATA_VALUE,
+			.size	= sizeof(priv->data),
+			.len	= priv->len,
+		};
+
+		err = nft_data_init(NULL, &priv->data, &desc,
+				    tb[NFTA_BITWISE_DATA]);
+		if (err < 0)
+			return err;
+	} else {
+		err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2],
+					      &priv->sreg2, priv->len);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
 static int nft_bitwise_init(const struct nft_ctx *ctx,
 			    const struct nft_expr *expr,
 			    const struct nlattr * const tb[])
@@ -188,6 +270,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
 		case NFT_BITWISE_MASK_XOR:
 		case NFT_BITWISE_LSHIFT:
 		case NFT_BITWISE_RSHIFT:
+		case NFT_BITWISE_AND:
+		case NFT_BITWISE_OR:
+		case NFT_BITWISE_XOR:
 			break;
 		default:
 			return -EOPNOTSUPP;
@@ -204,6 +289,11 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
 	case NFT_BITWISE_RSHIFT:
 		err = nft_bitwise_init_shift(priv, tb);
 		break;
+	case NFT_BITWISE_AND:
+	case NFT_BITWISE_OR:
+	case NFT_BITWISE_XOR:
+		err = nft_bitwise_init_bool(ctx, priv, tb);
+		break;
 	}
 
 	return err;
@@ -232,6 +322,21 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb,
 	return 0;
 }
 
+static int nft_bitwise_dump_bool(struct sk_buff *skb,
+				 const struct nft_bitwise *priv)
+{
+	if (priv->sreg2) {
+		if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2))
+			return -1;
+	} else {
+		if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data,
+				  NFT_DATA_VALUE, priv->len) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
 static int nft_bitwise_dump(struct sk_buff *skb,
 			    const struct nft_expr *expr, bool reset)
 {
@@ -255,6 +360,11 @@ static int nft_bitwise_dump(struct sk_buff *skb,
 	case NFT_BITWISE_RSHIFT:
 		err = nft_bitwise_dump_shift(skb, priv);
 		break;
+	case NFT_BITWISE_AND:
+	case NFT_BITWISE_OR:
+	case NFT_BITWISE_XOR:
+		err = nft_bitwise_dump_bool(skb, priv);
+		break;
 	}
 
 	return err;
@@ -299,6 +409,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track,
 	    track->regs[priv->dreg].bitwise &&
 	    track->regs[priv->dreg].bitwise->ops == expr->ops &&
 	    priv->sreg == bitwise->sreg &&
+	    priv->sreg2 == bitwise->sreg2 &&
 	    priv->dreg == bitwise->dreg &&
 	    priv->op == bitwise->op &&
 	    priv->len == bitwise->len &&
@@ -375,7 +486,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
-	if (tb[NFTA_BITWISE_DATA])
+	if (tb[NFTA_BITWISE_DATA] ||
+	    tb[NFTA_BITWISE_SREG2])
 		return -EINVAL;
 
 	if (!tb[NFTA_BITWISE_MASK] ||
-- 
cgit v1.2.3


From 83e041522eb9c45479f4490b212687cf1e7e9999 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:40 +0000
Subject: io_uring: temporarily disable registered waits

Disable wait argument registration as it'll be replaced with a more
generic feature. We'll still need IORING_ENTER_EXT_ARG_REG parsing
in a few commits so leave it be.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/70b1d1d218c41ba77a76d1789c8641dab0b0563e.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 10 ------
 include/uapi/linux/io_uring.h  |  3 --
 io_uring/io_uring.c            | 10 ------
 io_uring/register.c            | 82 ------------------------------------------
 io_uring/register.h            |  1 -
 5 files changed, 106 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 072e65e93105..52a5da99a205 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -330,14 +330,6 @@ struct io_ring_ctx {
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
 		struct wait_queue_head	cq_wait;
-
-		/*
-		 * If registered with IORING_REGISTER_CQWAIT_REG, a single
-		 * page holds N entries, mapped in cq_wait_arg. cq_wait_index
-		 * is the maximum allowable index.
-		 */
-		struct io_uring_reg_wait	*cq_wait_arg;
-		unsigned char			cq_wait_index;
 	} ____cacheline_aligned_in_smp;
 
 	/* timeouts */
@@ -431,8 +423,6 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
-
-	struct page			**cq_wait_page;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5d08435b95a8..132f5db3d4e8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,9 +627,6 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
-	/* register fixed io_uring_reg_wait arguments */
-	IORING_REGISTER_CQWAIT_REG		= 34,
-
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 464a70bde7e6..286b7bb73978 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,7 +2709,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
-	io_unregister_cqwait_reg(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -3195,15 +3194,6 @@ void __io_uring_cancel(bool cancel_all)
 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 			const struct io_uring_getevents_arg __user *uarg)
 {
-	struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);
-
-	if (arg) {
-		unsigned int index = (unsigned int) (uintptr_t) uarg;
-
-		if (index <= ctx->cq_wait_index)
-			return arg + index;
-	}
-
 	return ERR_PTR(-EFAULT);
 }
 
diff --git a/io_uring/register.c b/io_uring/register.c
index 45edfc57963a..3c5a3cfb186b 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,82 +570,6 @@ out:
 	return ret;
 }
 
-void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
-{
-	unsigned short npages = 1;
-
-	if (!ctx->cq_wait_page)
-		return;
-
-	io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
-	ctx->cq_wait_arg = NULL;
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, 1);
-}
-
-/*
- * Register a page holding N entries of struct io_uring_reg_wait, which can
- * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
- * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
- * in a pointer for a struct io_uring_getevents_arg, an index into this
- * registered array is passed, avoiding two (arg + timeout) copies per
- * invocation.
- */
-static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
-{
-	struct io_uring_cqwait_reg_arg arg;
-	struct io_uring_reg_wait *reg;
-	struct page **pages;
-	unsigned long len;
-	int nr_pages, poff;
-	int ret;
-
-	if (ctx->cq_wait_page || ctx->cq_wait_arg)
-		return -EBUSY;
-	if (copy_from_user(&arg, uarg, sizeof(arg)))
-		return -EFAULT;
-	if (!arg.nr_entries || arg.flags)
-		return -EINVAL;
-	if (arg.struct_size != sizeof(*reg))
-		return -EINVAL;
-	if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
-		return -EOVERFLOW;
-	if (len > PAGE_SIZE)
-		return -EINVAL;
-	/* offset + len must fit within a page, and must be reg_wait aligned */
-	poff = arg.user_addr & ~PAGE_MASK;
-	if (len + poff > PAGE_SIZE)
-		return -EINVAL;
-	if (poff % arg.struct_size)
-		return -EINVAL;
-
-	pages = io_pin_pages(arg.user_addr, len, &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-	ret = -EINVAL;
-	if (nr_pages != 1)
-		goto out_free;
-	if (ctx->user) {
-		ret = __io_account_mem(ctx->user, 1);
-		if (ret)
-			goto out_free;
-	}
-
-	reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
-	if (reg) {
-		ctx->cq_wait_index = arg.nr_entries - 1;
-		WRITE_ONCE(ctx->cq_wait_page, pages);
-		WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
-		return 0;
-	}
-	ret = -ENOMEM;
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, 1);
-out_free:
-	io_pages_free(&pages, nr_pages);
-	return ret;
-}
-
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -840,12 +764,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
-	case IORING_REGISTER_CQWAIT_REG:
-		ret = -EINVAL;
-		if (!arg || nr_args != 1)
-			break;
-		ret = io_register_cqwait_reg(ctx, arg);
-		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/register.h b/io_uring/register.h
index 3e935e8fa4b2..a5f39d5ef9e0 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -5,6 +5,5 @@
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
 struct file *io_uring_register_get_file(unsigned int fd, bool registered);
-void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);
 
 #endif
-- 
cgit v1.2.3


From dfbbfbf191878e8dd422768ce009858d8b5b761e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:41 +0000
Subject: io_uring: introduce concept of memory regions

We've got a good number of mappings we share with userspace, including
the main rings, provided buffer rings, upcoming rings for zerocopy rx
and more. All of them duplicate user argument parsing and some internal
details as well (page pinning, huge page optimisations, mmap'ing, etc.).

Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boilerplate code and apply future optimisations
to all of them at once.

Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  6 ++++
 include/uapi/linux/io_uring.h  | 14 +++++++++
 io_uring/memmap.c              | 67 ++++++++++++++++++++++++++++++++++++++++++
 io_uring/memmap.h              | 14 +++++++++
 4 files changed, 101 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 52a5da99a205..1d3a37234ace 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -75,6 +75,12 @@ struct io_hash_table {
 	unsigned		hash_bits;
 };
 
+struct io_mapped_region {
+	struct page		**pages;
+	void			*vmap_ptr;
+	size_t			nr_pages;
+};
+
 /*
  * Arbitrary limit, can be raised if need be
  */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 132f5db3d4e8..5cbfd330c688 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -647,6 +647,20 @@ struct io_uring_files_update {
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+enum {
+	/* initialise with user provided memory pointed by user_addr */
+	IORING_MEM_REGION_TYPE_USER		= 1,
+};
+
+struct io_uring_region_desc {
+	__u64 user_addr;
+	__u64 size;
+	__u32 flags;
+	__u32 id;
+	__u64 mmap_offset;
+	__u64 __resv[4];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 6ab59c60dfd0..bbd9569a0120 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -12,6 +12,7 @@
 
 #include "memmap.h"
 #include "kbuf.h"
+#include "rsrc.h"
 
 static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
 				   size_t size, gfp_t gfp)
@@ -194,6 +195,87 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 	return ERR_PTR(-ENOMEM);
 }
 
+/*
+ * Release everything io_create_region() set up in @mr and reset it to the
+ * empty state, so calling this on an unset region is a no-op.
+ */
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+{
+	if (mr->pages) {
+		unpin_user_pages(mr->pages, mr->nr_pages);
+		kvfree(mr->pages);
+	}
+	if (mr->vmap_ptr)
+		vunmap(mr->vmap_ptr);
+	if (mr->nr_pages && ctx->user)
+		__io_unaccount_mem(ctx->user, mr->nr_pages);
+
+	memset(mr, 0, sizeof(*mr));
+}
+
+/*
+ * Pin the user memory described by @reg, account it against ctx->user (if
+ * set) and vmap() it, storing the result in @mr.
+ *
+ * Only IORING_MEM_REGION_TYPE_USER regions are accepted; user_addr and size
+ * must be page aligned. Returns 0 on success or a negative errno, in which
+ * case @mr is left untouched.
+ */
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+		     struct io_uring_region_desc *reg)
+{
+	int pages_accounted = 0;
+	struct page **pages;
+	int nr_pages, ret;
+	void *vptr;
+	u64 end;
+
+	if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
+		return -EFAULT;
+	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
+		return -EINVAL;
+	if (reg->flags != IORING_MEM_REGION_TYPE_USER)
+		return -EINVAL;
+	if (!reg->user_addr)
+		return -EFAULT;
+	if (!reg->size || reg->mmap_offset || reg->id)
+		return -EINVAL;
+	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
+		return -E2BIG;
+	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
+		return -EINVAL;
+	if (check_add_overflow(reg->user_addr, reg->size, &end))
+		return -EOVERFLOW;
+
+	pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	if (ctx->user) {
+		ret = __io_account_mem(ctx->user, nr_pages);
+		if (ret)
+			goto out_free;
+		pages_accounted = nr_pages;
+	}
+
+	vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!vptr) {
+		/* ret is 0 or unset here, so report the failure explicitly */
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	mr->pages = pages;
+	mr->vmap_ptr = vptr;
+	mr->nr_pages = nr_pages;
+	return 0;
+out_free:
+	if (pages_accounted)
+		__io_unaccount_mem(ctx->user, pages_accounted);
+	io_pages_free(&pages, nr_pages);
+	return ret;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
 					    size_t sz)
 {
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index 5cec5b7ac49a..f361a635b6c7 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -22,4 +22,18 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long flags);
 int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
 
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+		     struct io_uring_region_desc *reg);
+
+static inline void *io_region_get_ptr(struct io_mapped_region *mr)
+{
+	return mr->vmap_ptr;
+}
+
+static inline bool io_region_is_set(struct io_mapped_region *mr)
+{
+	return !!mr->nr_pages;
+}
+
 #endif
-- 
cgit v1.2.3


From 93238e66185524aad925acefb2312203b9e26d63 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:42 +0000
Subject: io_uring: add memory region registration

Regions will serve multiple purposes. First, with them we can decouple
ring/etc. object creation from registration / mapping of the memory they
will be placed in. We already have hacks that allow putting both SQ and
CQ into the same huge page; in the future we should be able to:

region = create_region(io_ring);
create_pbuf_ring(io_uring, region, offset=0);
create_pbuf_ring(io_uring, region, offset=N);

The second use case is efficiently passing parameters. The following
patch re-enables IORING_ENTER_EXT_ARG_REG on top of regions, which
optimises wait arguments. It'll also be useful for request arguments,
replacing iovecs, msghdr, etc. pointers. Eventually it would be handy
for BPF as well if it comes to fruition.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0798cf3a14fad19cfc96fc9feca5f3e11481691d.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  |  8 ++++++++
 io_uring/io_uring.c            |  1 +
 io_uring/register.c            | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1d3a37234ace..e1d69123e164 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -429,6 +429,9 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+
+	/* used for optimised request parameter and wait argument passing  */
+	struct io_mapped_region		param_region;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5cbfd330c688..1ee35890125b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,6 +627,8 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
+	IORING_REGISTER_MEM_REGION		= 34,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -661,6 +663,12 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+struct io_uring_mem_region_reg {
+	__u64 region_uptr; /* struct io_uring_region_desc * */
+	__u64 flags;
+	__u64 __resv[2];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 286b7bb73978..c640b8a4ceee 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,6 +2709,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
+	io_free_region(ctx, &ctx->param_region);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
diff --git a/io_uring/register.c b/io_uring/register.c
index 3c5a3cfb186b..2cbac3d9b288 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,37 @@ out:
 	return ret;
 }
 
+static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
+{
+	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
+	struct io_uring_mem_region_reg reg;
+	struct io_uring_region_desc __user *rd_uptr;
+	struct io_uring_region_desc rd;
+	int ret;
+
+	if (io_region_is_set(&ctx->param_region))
+		return -EBUSY;
+	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
+		return -EFAULT;
+	rd_uptr = u64_to_user_ptr(reg.region_uptr);
+	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
+		return -EFAULT;
+
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+		return -EINVAL;
+	if (reg.flags)
+		return -EINVAL;
+
+	ret = io_create_region(ctx, &ctx->param_region, &rd);
+	if (ret)
+		return ret;
+	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
+		io_free_region(ctx, &ctx->param_region);
+		return -EFAULT;
+	}
+	return 0;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -764,6 +795,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
+	case IORING_REGISTER_MEM_REGION:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_mem_region(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.2.3


From d617b3147d54c42351eac63b5398d4ddf4f4011b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:43 +0000
Subject: io_uring: restore back registered wait arguments

Now that we've got a more generic region registration API, place
IORING_ENTER_EXT_ARG_REG on top of it and re-enable it.

First, the user has to register a region with the
IORING_MEM_REGION_REG_WAIT_ARG flag set. It can only be done for a
ring in a disabled state, aka IORING_SETUP_R_DISABLED, to avoid races
with already running waiters. With that we should have stable constant
values for ctx->cq_wait_{size,arg} in io_get_ext_arg_reg() and hence no
READ_ONCE required.

The other API difference is that we're now passing byte offsets instead
of indexes. The user _must_ align all offsets / pointers to the native
word size; failing to do so may, but does not necessarily, lead to a
failure, usually returned as -EFAULT. liburing will hide these details
from users.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/81822c1b4ffbe8ad391b4f9ad1564def0d26d990.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  |  5 +++++
 io_uring/io_uring.c            | 14 +++++++++++++-
 io_uring/register.c            | 16 +++++++++++++++-
 4 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1d69123e164..aa5f5ea98076 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -324,6 +324,9 @@ struct io_ring_ctx {
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
 		unsigned		cq_extra;
+
+		void			*cq_wait_arg;
+		size_t			cq_wait_size;
 	} ____cacheline_aligned_in_smp;
 
 	/*
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1ee35890125b..4418d0192959 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -663,6 +663,11 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+enum {
+	/* expose the region as registered wait arguments */
+	IORING_MEM_REGION_REG_WAIT_ARG		= 1,
+};
+
 struct io_uring_mem_region_reg {
 	__u64 region_uptr; /* struct io_uring_region_desc * */
 	__u64 flags;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c640b8a4ceee..da8fd460977b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3195,7 +3195,19 @@ void __io_uring_cancel(bool cancel_all)
 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 			const struct io_uring_getevents_arg __user *uarg)
 {
-	return ERR_PTR(-EFAULT);
+	unsigned long size = sizeof(struct io_uring_reg_wait);
+	unsigned long offset = (uintptr_t)uarg;
+	unsigned long end;
+
+	if (unlikely(offset % sizeof(long)))
+		return ERR_PTR(-EFAULT);
+
+	/* also protects from NULL ->cq_wait_arg as the size would be 0 */
+	if (unlikely(check_add_overflow(offset, size, &end) ||
+		     end > ctx->cq_wait_size))
+		return ERR_PTR(-EFAULT);
+
+	return ctx->cq_wait_arg + offset;
 }
 
 static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
diff --git a/io_uring/register.c b/io_uring/register.c
index 2cbac3d9b288..1a60f4916649 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -588,7 +588,16 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 
 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
 		return -EINVAL;
-	if (reg.flags)
+	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
+		return -EINVAL;
+
+	/*
+	 * This ensures there are no waiters. Waiters are unlocked and it's
+	 * hard to synchronise with them, especially if we need to initialise
+	 * the region.
+	 */
+	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
+	    !(ctx->flags & IORING_SETUP_R_DISABLED))
 		return -EINVAL;
 
 	ret = io_create_region(ctx, &ctx->param_region, &rd);
@@ -598,6 +607,11 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 		io_free_region(ctx, &ctx->param_region);
 		return -EFAULT;
 	}
+
+	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
+		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
+		ctx->cq_wait_size = rd.size;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From c750629caeca01979da3403f4bebecda88713233 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 18 Nov 2024 15:14:34 +0000
Subject: io_uring: remove io_uring_cqwait_reg_arg

The separate wait argument registration API was removed; also delete the
leftover uapi definitions.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/143b6a53591badac23632d3e6fa3e5db4b342ee2.1731942445.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4418d0192959..aac9a4f8fa9a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -873,20 +873,6 @@ enum {
 	IORING_REG_WAIT_TS		= (1U << 0),
 };
 
-/*
- * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
- * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
- * called rather than pass in a wait argument structure separately.
- */
-struct io_uring_cqwait_reg_arg {
-	__u32		flags;
-	__u32		struct_size;
-	__u32		nr_entries;
-	__u32		pad;
-	__u64		user_addr;
-	__u64		pad2[3];
-};
-
 /*
  * Argument for io_uring_enter(2) with
  * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
-- 
cgit v1.2.3


From ebda123fe703f492d7d557a4da00888ddec4779e Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Fri, 15 Nov 2024 12:43:04 -0800
Subject: Revert "UAPI: ethtool: Use __struct_group() in struct
 ethtool_link_settings"

This reverts commit 43d3487035e9a86fad952de4240a518614240d43. We cannot
use tagged struct groups in UAPI because C++ will throw syntax errors
even under "extern C".

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20241115204308.3821419-2-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index fc1f54b065f9..c405ed63acfa 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2511,24 +2511,21 @@ enum ethtool_reset_flags {
  *	autonegotiation; 0 if unknown or not applicable.  Read-only.
  */
 struct ethtool_link_settings {
-	/* New members MUST be added within the __struct_group() macro below. */
-	__struct_group(ethtool_link_settings_hdr, hdr, /* no attrs */,
-		__u32	cmd;
-		__u32	speed;
-		__u8	duplex;
-		__u8	port;
-		__u8	phy_address;
-		__u8	autoneg;
-		__u8	mdio_support;
-		__u8	eth_tp_mdix;
-		__u8	eth_tp_mdix_ctrl;
-		__s8	link_mode_masks_nwords;
-		__u8	transceiver;
-		__u8	master_slave_cfg;
-		__u8	master_slave_state;
-		__u8	rate_matching;
-		__u32	reserved[7];
-	);
+	__u32	cmd;
+	__u32	speed;
+	__u8	duplex;
+	__u8	port;
+	__u8	phy_address;
+	__u8	autoneg;
+	__u8	mdio_support;
+	__u8	eth_tp_mdix;
+	__u8	eth_tp_mdix_ctrl;
+	__s8	link_mode_masks_nwords;
+	__u8	transceiver;
+	__u8	master_slave_cfg;
+	__u8	master_slave_state;
+	__u8	rate_matching;
+	__u32	reserved[7];
 	__u32	link_mode_masks[];
 	/* layout of link_mode_masks fields:
 	 * __u32 map_supported[link_mode_masks_nwords];
-- 
cgit v1.2.3


From 96c677fca54a28fcfea4dbab9c1f2530bd0a08d1 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Fri, 15 Nov 2024 12:43:05 -0800
Subject: UAPI: ethtool: Avoid flex-array in struct ethtool_link_settings

struct ethtool_link_settings tends to be used as a header for other
structures that have trailing bytes[1], but has a trailing flexible array
itself. Using this overlapped with other structures leads to ambiguous
object sizing in the compiler, so we want to avoid such situations (which
have caused real bugs in the past). Detecting this can be done with
-Wflex-array-member-not-at-end, which will need to be enabled globally.

Using a tagged struct_group() to create a new ethtool_link_settings_hdr
structure isn't possible as it seems we cannot use the tagged variant of
struct_group() due to syntax issues from C++'s perspective (even within
"extern C")[2]. Instead, we can just leave the offending member defined
in UAPI and remove it from the kernel's view of the structure, as Linux
doesn't actually use this member at all. There is also no change in
size since it was already a flexible array that didn't contribute to
size returned by any use of sizeof().

Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/lkml/20241109100213.262a2fa0@kernel.org/ [2]
Link: https://lore.kernel.org/lkml/0bc2809fe2a6c11dd4c8a9a10d9bd65cccdb559b.1730238285.git.gustavoars@kernel.org/ [1]
Signed-off-by: Kees Cook <kees@kernel.org>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20241115204308.3821419-3-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index c405ed63acfa..7e1b3820f91f 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2526,12 +2526,19 @@ struct ethtool_link_settings {
 	__u8	master_slave_state;
 	__u8	rate_matching;
 	__u32	reserved[7];
+#ifndef __KERNEL__
+	/* Linux builds with -Wflex-array-member-not-at-end but does
+	 * not use the "link_mode_masks" member. Leave it defined for
+	 * userspace for now, and when userspace wants to start using
+	 * -Wfamnae, we'll need a new solution.
+	 */
 	__u32	link_mode_masks[];
 	/* layout of link_mode_masks fields:
 	 * __u32 map_supported[link_mode_masks_nwords];
 	 * __u32 map_advertising[link_mode_masks_nwords];
 	 * __u32 map_lp_advertising[link_mode_masks_nwords];
 	 */
+#endif
 };
 
 /**
-- 
cgit v1.2.3