From e71da1fd0e84bc5c87a78b405e40713840eecc80 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Sat, 6 Feb 2021 16:13:48 +0100
Subject: HID: intel-ish-hid: Make remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of struct bus_type::remove()
because there is only little that can be done. To simplify the quest to
make this function return void, let struct ishtp_cl_driver::remove() return
void, too. All users already unconditionally return 0, this commit makes
it obvious that returning an error value is a bad idea.

Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/intel-ish-client-if.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index 0d6b4bc191c5..94669e21dc8b 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -36,7 +36,7 @@ struct ishtp_cl_driver {
 	const char *name;
 	const guid_t *guid;
 	int (*probe)(struct ishtp_cl_device *dev);
-	int (*remove)(struct ishtp_cl_device *dev);
+	void (*remove)(struct ishtp_cl_device *dev);
 	int (*reset)(struct ishtp_cl_device *dev);
 	const struct dev_pm_ops *pm;
 };
-- 
cgit v1.2.3


From c57179c73562e31d39139ac245b8a2d337e1823b Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Fri, 26 Mar 2021 14:34:58 +0000
Subject: HID: ishtp-hid-client: Fix 'suggest-attribute=format' compiler
 warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes the following W=1 kernel build warning(s):

 drivers/hid/intel-ish-hid/ishtp/bus.c: In function ‘ishtp_trace_callback’:
 drivers/hid/intel-ish-hid/ishtp/bus.c:876:29: warning: return type might be a candidate for a format attribute [-Wsuggest-attribute=format]
 876 | return cl_device->ishtp_dev->print_log;
 | ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~

Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: Daniel Drubin <daniel.drubin@intel.com>
Cc: linux-input@vger.kernel.org
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/intel-ish-hid/ishtp-hid-client.c | 4 ++--
 drivers/hid/intel-ish-hid/ishtp-hid.h        | 4 ++--
 drivers/hid/intel-ish-hid/ishtp/bus.c        | 4 ++--
 drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h  | 4 ++--
 include/linux/intel-ish-client-if.h          | 8 +++++++-
 5 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
index 042a7091802d..6b1fa971b33e 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c
+++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
@@ -784,7 +784,7 @@ static void hid_ishtp_cl_reset_handler(struct work_struct *work)
 	}
 }
 
-void (*hid_print_trace)(void *unused, const char *format, ...);
+ishtp_print_log ishtp_hid_print_trace;
 
 /**
  * hid_ishtp_cl_probe() - ISHTP client driver probe
@@ -823,7 +823,7 @@ static int hid_ishtp_cl_probe(struct ishtp_cl_device *cl_device)
 
 	INIT_WORK(&client_data->work, hid_ishtp_cl_reset_handler);
 
-	hid_print_trace = ishtp_trace_callback(cl_device);
+	ishtp_hid_print_trace = ishtp_trace_callback(cl_device);
 
 	rv = hid_ishtp_cl_init(hid_ishtp_cl, 0);
 	if (rv) {
diff --git a/drivers/hid/intel-ish-hid/ishtp-hid.h b/drivers/hid/intel-ish-hid/ishtp-hid.h
index e2423f7d2b54..f88443a7d935 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid.h
+++ b/drivers/hid/intel-ish-hid/ishtp-hid.h
@@ -16,9 +16,9 @@
 #define	IS_RESPONSE	0x80
 
 /* Used to dump to Linux trace buffer, if enabled */
-extern void (*hid_print_trace)(void *unused, const char *format, ...);
+extern ishtp_print_log ishtp_hid_print_trace;
 #define hid_ishtp_trace(client, ...) \
-		(hid_print_trace)(NULL, __VA_ARGS__)
+	(ishtp_hid_print_trace)(NULL, __VA_ARGS__)
 
 /* ISH HID message structure */
 struct hostif_msg_hdr {
diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c
index c1c7d5356208..f0802b047ed8 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
@@ -869,9 +869,9 @@ EXPORT_SYMBOL(ishtp_get_pci_device);
  *
  * This interface is used to return trace callback function pointer.
  *
- * Return: void *.
+ * Return: *ishtp_print_log()
  */
-void *ishtp_trace_callback(struct ishtp_cl_device *cl_device)
+ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device)
 {
 	return cl_device->ishtp_dev->print_log;
 }
diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
index 1cc6364aa957..f579b16e6d7a 100644
--- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
+++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/spinlock.h>
+#include <linux/intel-ish-client-if.h>
 #include "bus.h"
 #include "hbm.h"
 
@@ -202,8 +203,7 @@ struct ishtp_device {
 	uint64_t ishtp_host_dma_rx_buf_phys;
 
 	/* Dump to trace buffers if enabled*/
-	__printf(2, 3) void (*print_log)(struct ishtp_device *dev,
-					 const char *format, ...);
+	ishtp_print_log print_log;
 
 	/* Debug stats */
 	unsigned int	ipc_rx_cnt;
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index 94669e21dc8b..25e2b4e80502 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -8,11 +8,17 @@
 #ifndef _INTEL_ISH_CLIENT_IF_H_
 #define _INTEL_ISH_CLIENT_IF_H_
 
+#include <linux/device.h>
+#include <linux/uuid.h>
+
 struct ishtp_cl_device;
 struct ishtp_device;
 struct ishtp_cl;
 struct ishtp_fw_client;
 
+typedef __printf(2, 3) void (*ishtp_print_log)(struct ishtp_device *dev,
+					       const char *format, ...);
+
 /* Client state */
 enum cl_state {
 	ISHTP_CL_INITIALIZING = 0,
@@ -76,7 +82,7 @@ int ishtp_register_event_cb(struct ishtp_cl_device *device,
 /* Get the device * from ishtp device instance */
 struct device *ishtp_device(struct ishtp_cl_device *cl_device);
 /* Trace interface for clients */
-void *ishtp_trace_callback(struct ishtp_cl_device *cl_device);
+ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device);
 /* Get device pointer of PCI device for DMA acces */
 struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device);
 
-- 
cgit v1.2.3


From ffb37ca3bd16ce6ea2df2f87fde9a31e94ebb54b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Apr 2021 19:00:57 -0400
Subject: switch file_open_root() to struct path

... and provide file_open_root_mnt(), using the root of given mount.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst | 9 +++++++++
 arch/um/drivers/mconsole_kern.c       | 2 +-
 fs/coredump.c                         | 4 ++--
 fs/fhandle.c                          | 2 +-
 fs/internal.h                         | 2 +-
 fs/kernel_read_file.c                 | 2 +-
 fs/namei.c                            | 8 +++-----
 fs/open.c                             | 4 ++--
 fs/proc/proc_sysctl.c                 | 2 +-
 include/linux/fs.h                    | 8 +++++++-
 kernel/usermode_driver.c              | 2 +-
 11 files changed, 29 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 0302035781be..9bb2b35f90bb 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -890,3 +890,12 @@ been called or returned with non -EIOCBQUEUED code.
 
 mnt_want_write_file() can now only be paired with mnt_drop_write_file(),
 whereas previously it could be paired with mnt_drop_write() as well.
+
+---
+
+**mandatory**
+
+Calling conventions for file_open_root() changed; now it takes struct path *
+instead of passing mount and dentry separately.  For callers that used to
+pass <mnt, mnt->mnt_root> pair (i.e. the root of given mount), a new helper
+is provided - file_open_root_mnt().  In-tree users adjusted.
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 6d00af25ec6b..c42b10024e26 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req)
 		mconsole_reply(req, "Proc not available", 1, 0);
 		goto out;
 	}
-	file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0);
+	file = file_open_root_mnt(mnt, ptr, O_RDONLY, 0);
 	if (IS_ERR(file)) {
 		mconsole_reply(req, "Failed to open file", 1, 0);
 		printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file));
diff --git a/fs/coredump.c b/fs/coredump.c
index 1c0fdc1aa70b..087db444b06b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -755,8 +755,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 			task_lock(&init_task);
 			get_fs_root(init_task.fs, &root);
 			task_unlock(&init_task);
-			cprm.file = file_open_root(root.dentry, root.mnt,
-				cn.corename, open_flags, 0600);
+			cprm.file = file_open_root(&root, cn.corename,
+						   open_flags, 0600);
 			path_put(&root);
 		} else {
 			cprm.file = filp_open(cn.corename, open_flags, 0600);
diff --git a/fs/fhandle.c b/fs/fhandle.c
index ec6feeccc276..6630c69c23a2 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -229,7 +229,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
 		path_put(&path);
 		return fd;
 	}
-	file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
+	file = file_open_root(&path, "", open_flag, 0);
 	if (IS_ERR(file)) {
 		put_unused_fd(fd);
 		retval =  PTR_ERR(file);
diff --git a/fs/internal.h b/fs/internal.h
index 6aeae7ef3380..3ce8edbaa3ca 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -129,7 +129,7 @@ struct open_flags {
 };
 extern struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op);
-extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
+extern struct file *do_file_open_root(const struct path *,
 		const char *, const struct open_flags *);
 extern struct open_how build_open_how(int flags, umode_t mode);
 extern int build_open_flags(const struct open_how *how, struct open_flags *op);
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 90d255fbdd9b..87aac4c72c37 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -160,7 +160,7 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset,
 	get_fs_root(init_task.fs, &root);
 	task_unlock(&init_task);
 
-	file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0);
+	file = file_open_root(&root, path, O_RDONLY, 0);
 	path_put(&root);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
diff --git a/fs/namei.c b/fs/namei.c
index 48a2f288e802..4b6cf4974dd7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3533,7 +3533,7 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
 	return filp;
 }
 
-struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+struct file *do_file_open_root(const struct path *root,
 		const char *name, const struct open_flags *op)
 {
 	struct nameidata nd;
@@ -3541,16 +3541,14 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	struct filename *filename;
 	int flags = op->lookup_flags | LOOKUP_ROOT;
 
-	nd.root.mnt = mnt;
-	nd.root.dentry = dentry;
-
-	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
+	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
 	filename = getname_kernel(name);
 	if (IS_ERR(filename))
 		return ERR_CAST(filename);
 
+	nd.root = *root;
 	set_nameidata(&nd, -1, filename);
 	file = path_openat(&nd, op, flags | LOOKUP_RCU);
 	if (unlikely(file == ERR_PTR(-ECHILD)))
diff --git a/fs/open.c b/fs/open.c
index e53af13b5835..b3c904e82e2a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1156,7 +1156,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
 }
 EXPORT_SYMBOL(filp_open);
 
-struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+struct file *file_open_root(const struct path *root,
 			    const char *filename, int flags, umode_t mode)
 {
 	struct open_flags op;
@@ -1164,7 +1164,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	int err = build_open_flags(&how, &op);
 	if (err)
 		return ERR_PTR(err);
-	return do_file_open_root(dentry, mnt, filename, &op);
+	return do_file_open_root(root, filename, &op);
 }
 EXPORT_SYMBOL(file_open_root);
 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 984e42f8cb11..6606f21fc195 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1806,7 +1806,7 @@ static int process_sysctl_arg(char *param, char *val,
 		panic("%s: Failed to allocate path for %s\n", __func__, param);
 	strreplace(path, '.', '/');
 
-	file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0);
+	file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0);
 	if (IS_ERR(file)) {
 		err = PTR_ERR(file);
 		if (err == -ENOENT)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6a..1acea2bb9d60 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2632,8 +2632,14 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags,
 			umode_t mode);
 extern struct file *file_open_name(struct filename *, int, umode_t);
 extern struct file *filp_open(const char *, int, umode_t);
-extern struct file *file_open_root(struct dentry *, struct vfsmount *,
+extern struct file *file_open_root(const struct path *,
 				   const char *, int, umode_t);
+static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
+				   const char *name, int flags, umode_t mode)
+{
+	return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root},
+			      name, flags, mode);
+}
 extern struct file * dentry_open(const struct path *, int, const struct cred *);
 extern struct file * open_with_fake_path(const struct path *, int,
 					 struct inode*, const struct cred *);
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 0b35212ffc3d..78353cb73836 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -26,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
 	if (IS_ERR(mnt))
 		return mnt;
 
-	file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700);
+	file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700);
 	if (IS_ERR(file)) {
 		mntput(mnt);
 		return ERR_CAST(file);
-- 
cgit v1.2.3


From bcba1e7d0d520adba895d9e0800a056f734b0a6a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Apr 2021 22:03:41 -0400
Subject: take LOOKUP_{ROOT,ROOT_GRABBED,JUMPED} out of LOOKUP_... space

Separate field in nameidata (nd->state) holding the flags that
should be internal-only - that way we both get some spare bits
in LOOKUP_... and get simpler rules for nd->root lifetime rules,
since we can set the replacement of LOOKUP_ROOT (ND_ROOT_PRESET)
at the same time we set nd->root.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/path-lookup.rst |  6 ++--
 fs/namei.c                                | 54 ++++++++++++++++++-------------
 fs/nfs/nfstrace.h                         |  4 ---
 include/linux/namei.h                     |  3 --
 4 files changed, 34 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst
index c482e1619e77..ede67f705787 100644
--- a/Documentation/filesystems/path-lookup.rst
+++ b/Documentation/filesystems/path-lookup.rst
@@ -1321,18 +1321,18 @@ to lookup: RCU-walk, REF-walk, and REF-walk with forced revalidation.
 yet.  This is primarily used to tell the audit subsystem the full
 context of a particular access being audited.
 
-``LOOKUP_ROOT`` indicates that the ``root`` field in the ``nameidata`` was
+``ND_ROOT_PRESET`` indicates that the ``root`` field in the ``nameidata`` was
 provided by the caller, so it shouldn't be released when it is no
 longer needed.
 
-``LOOKUP_JUMPED`` means that the current dentry was chosen not because
+``ND_JUMPED`` means that the current dentry was chosen not because
 it had the right name but for some other reason.  This happens when
 following "``..``", following a symlink to ``/``, crossing a mount point
 or accessing a "``/proc/$PID/fd/$FD``" symlink (also known as a "magic
 link"). In this case the filesystem has not been asked to revalidate the
 name (with ``d_revalidate()``).  In such cases the inode may still need
 to be revalidated, so ``d_op->d_weak_revalidate()`` is called if
-``LOOKUP_JUMPED`` is set when the look completes - which may be at the
+``ND_JUMPED`` is set when the look completes - which may be at the
 final component or, when creating, unlinking, or renaming, at the penultimate component.
 
 Resolution-restriction flags
diff --git a/fs/namei.c b/fs/namei.c
index 4b6cf4974dd7..622b9f15bf1c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -554,7 +554,7 @@ struct nameidata {
 	struct qstr	last;
 	struct path	root;
 	struct inode	*inode; /* path.dentry.d_inode */
-	unsigned int	flags;
+	unsigned int	flags, state;
 	unsigned	seq, m_seq, r_seq;
 	int		last_type;
 	unsigned	depth;
@@ -573,6 +573,10 @@ struct nameidata {
 	umode_t		dir_mode;
 } __randomize_layout;
 
+#define ND_ROOT_PRESET 1
+#define ND_ROOT_GRABBED 2
+#define ND_JUMPED 4
+
 static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 {
 	struct nameidata *old = current->nameidata;
@@ -583,6 +587,7 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 	p->path.dentry = NULL;
 	p->total_link_count = old ? old->total_link_count : 0;
 	p->saved = old;
+	p->state = 0;
 	current->nameidata = p;
 }
 
@@ -645,9 +650,9 @@ static void terminate_walk(struct nameidata *nd)
 		path_put(&nd->path);
 		for (i = 0; i < nd->depth; i++)
 			path_put(&nd->stack[i].link);
-		if (nd->flags & LOOKUP_ROOT_GRABBED) {
+		if (nd->state & ND_ROOT_GRABBED) {
 			path_put(&nd->root);
-			nd->flags &= ~LOOKUP_ROOT_GRABBED;
+			nd->state &= ~ND_ROOT_GRABBED;
 		}
 	} else {
 		nd->flags &= ~LOOKUP_RCU;
@@ -710,9 +715,9 @@ static bool legitimize_root(struct nameidata *nd)
 	if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
 		return false;
 	/* Nothing to do if nd->root is zero or is managed by the VFS user. */
-	if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
+	if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
 		return true;
-	nd->flags |= LOOKUP_ROOT_GRABBED;
+	nd->state |= ND_ROOT_GRABBED;
 	return legitimize_path(nd, &nd->root, nd->root_seq);
 }
 
@@ -849,8 +854,9 @@ static int complete_walk(struct nameidata *nd)
 		 * We don't want to zero nd->root for scoped-lookups or
 		 * externally-managed nd->root.
 		 */
-		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
-			nd->root.mnt = NULL;
+		if (!(nd->state & ND_ROOT_PRESET))
+			if (!(nd->flags & LOOKUP_IS_SCOPED))
+				nd->root.mnt = NULL;
 		nd->flags &= ~LOOKUP_CACHED;
 		if (!try_to_unlazy(nd))
 			return -ECHILD;
@@ -877,7 +883,7 @@ static int complete_walk(struct nameidata *nd)
 			return -EXDEV;
 	}
 
-	if (likely(!(nd->flags & LOOKUP_JUMPED)))
+	if (likely(!(nd->state & ND_JUMPED)))
 		return 0;
 
 	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
@@ -915,7 +921,7 @@ static int set_root(struct nameidata *nd)
 		} while (read_seqcount_retry(&fs->seq, seq));
 	} else {
 		get_fs_root(fs, &nd->root);
-		nd->flags |= LOOKUP_ROOT_GRABBED;
+		nd->state |= ND_ROOT_GRABBED;
 	}
 	return 0;
 }
@@ -948,7 +954,7 @@ static int nd_jump_root(struct nameidata *nd)
 		path_get(&nd->path);
 		nd->inode = nd->path.dentry->d_inode;
 	}
-	nd->flags |= LOOKUP_JUMPED;
+	nd->state |= ND_JUMPED;
 	return 0;
 }
 
@@ -976,7 +982,7 @@ int nd_jump_link(struct path *path)
 	path_put(&nd->path);
 	nd->path = *path;
 	nd->inode = nd->path.dentry->d_inode;
-	nd->flags |= LOOKUP_JUMPED;
+	nd->state |= ND_JUMPED;
 	return 0;
 
 err:
@@ -1424,7 +1430,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 			if (mounted) {
 				path->mnt = &mounted->mnt;
 				dentry = path->dentry = mounted->mnt.mnt_root;
-				nd->flags |= LOOKUP_JUMPED;
+				nd->state |= ND_JUMPED;
 				*seqp = read_seqcount_begin(&dentry->d_seq);
 				*inode = dentry->d_inode;
 				/*
@@ -1469,7 +1475,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
 			ret = -EXDEV;
 		else
-			nd->flags |= LOOKUP_JUMPED;
+			nd->state |= ND_JUMPED;
 	}
 	if (unlikely(ret)) {
 		dput(path->dentry);
@@ -2220,7 +2226,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			case 2:
 				if (name[1] == '.') {
 					type = LAST_DOTDOT;
-					nd->flags |= LOOKUP_JUMPED;
+					nd->state |= ND_JUMPED;
 				}
 				break;
 			case 1:
@@ -2228,7 +2234,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		}
 		if (likely(type == LAST_NORM)) {
 			struct dentry *parent = nd->path.dentry;
-			nd->flags &= ~LOOKUP_JUMPED;
+			nd->state &= ~ND_JUMPED;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
 				struct qstr this = { { .hash_len = hash_len }, .name = name };
 				err = parent->d_op->d_hash(parent, &this);
@@ -2302,14 +2308,15 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	if (flags & LOOKUP_RCU)
 		rcu_read_lock();
 
-	nd->flags = flags | LOOKUP_JUMPED;
+	nd->flags = flags;
+	nd->state |= ND_JUMPED;
 	nd->depth = 0;
 
 	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
 	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
 	smp_rmb();
 
-	if (flags & LOOKUP_ROOT) {
+	if (nd->state & ND_ROOT_PRESET) {
 		struct dentry *root = nd->root.dentry;
 		struct inode *inode = root->d_inode;
 		if (*s && unlikely(!d_can_lookup(root)))
@@ -2384,7 +2391,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			nd->root_seq = nd->seq;
 		} else {
 			path_get(&nd->root);
-			nd->flags |= LOOKUP_ROOT_GRABBED;
+			nd->state |= ND_ROOT_GRABBED;
 		}
 	}
 	return s;
@@ -2423,7 +2430,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
 		;
 	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
 		err = handle_lookup_down(nd);
-		nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
+		nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
 	}
 	if (!err)
 		err = complete_walk(nd);
@@ -2447,11 +2454,11 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
 	struct nameidata nd;
 	if (IS_ERR(name))
 		return PTR_ERR(name);
+	set_nameidata(&nd, dfd, name);
 	if (unlikely(root)) {
 		nd.root = *root;
-		flags |= LOOKUP_ROOT;
+		nd.state = ND_ROOT_PRESET;
 	}
-	set_nameidata(&nd, dfd, name);
 	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
 	if (unlikely(retval == -ECHILD))
 		retval = path_lookupat(&nd, flags, path);
@@ -3539,7 +3546,7 @@ struct file *do_file_open_root(const struct path *root,
 	struct nameidata nd;
 	struct file *file;
 	struct filename *filename;
-	int flags = op->lookup_flags | LOOKUP_ROOT;
+	int flags = op->lookup_flags;
 
 	if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
@@ -3548,8 +3555,9 @@ struct file *do_file_open_root(const struct path *root,
 	if (IS_ERR(filename))
 		return ERR_CAST(filename);
 
-	nd.root = *root;
 	set_nameidata(&nd, -1, filename);
+	nd.root = *root;
+	nd.state = ND_ROOT_PRESET;
 	file = path_openat(&nd, op, flags | LOOKUP_RCU);
 	if (unlikely(file == ERR_PTR(-ECHILD)))
 		file = path_openat(&nd, op, flags);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 5a59dcdce0b2..cb7f49723bbf 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -271,8 +271,6 @@ TRACE_DEFINE_ENUM(LOOKUP_OPEN);
 TRACE_DEFINE_ENUM(LOOKUP_CREATE);
 TRACE_DEFINE_ENUM(LOOKUP_EXCL);
 TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET);
-TRACE_DEFINE_ENUM(LOOKUP_JUMPED);
-TRACE_DEFINE_ENUM(LOOKUP_ROOT);
 TRACE_DEFINE_ENUM(LOOKUP_EMPTY);
 TRACE_DEFINE_ENUM(LOOKUP_DOWN);
 
@@ -288,8 +286,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN);
 			{ LOOKUP_CREATE, "CREATE" }, \
 			{ LOOKUP_EXCL, "EXCL" }, \
 			{ LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \
-			{ LOOKUP_JUMPED, "JUMPED" }, \
-			{ LOOKUP_ROOT, "ROOT" }, \
 			{ LOOKUP_EMPTY, "EMPTY" }, \
 			{ LOOKUP_DOWN, "DOWN" })
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b9605b2b46e7..be9a2b349ca7 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -36,9 +36,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
 
 /* internal use only */
 #define LOOKUP_PARENT		0x0010
-#define LOOKUP_JUMPED		0x1000
-#define LOOKUP_ROOT		0x2000
-#define LOOKUP_ROOT_GRABBED	0x0008
 
 /* Scoping flags for lookup. */
 #define LOOKUP_NO_SYMLINKS	0x010000 /* No symlink crossing. */
-- 
cgit v1.2.3


From f9c82a4ea89c384d49ce03768ba88d049ed3f1f0 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Thu, 22 Apr 2021 14:27:08 +0200
Subject: Increase size of ucounts to atomic_long_t

RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK use unsigned long to store their
counters. As a preparation for moving rlimits based on ucounts, we need
to increase the size of the variable to long.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/257aa5fb1a7d81cf0f4c34f39ada2320c4284771.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h |  4 ++--
 kernel/ucount.c                | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index f6c5f784be5a..c242c10906c5 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -88,7 +88,7 @@ struct user_namespace {
 	struct ctl_table_header *sysctls;
 #endif
 	struct ucounts		*ucounts;
-	int ucount_max[UCOUNT_COUNTS];
+	long ucount_max[UCOUNT_COUNTS];
 } __randomize_layout;
 
 struct ucounts {
@@ -96,7 +96,7 @@ struct ucounts {
 	struct user_namespace *ns;
 	kuid_t uid;
 	int count;
-	atomic_t ucount[UCOUNT_COUNTS];
+	atomic_long_t ucount[UCOUNT_COUNTS];
 };
 
 extern struct user_namespace init_user_ns;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 11b1596e2542..04c561751af1 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -175,14 +175,14 @@ static void put_ucounts(struct ucounts *ucounts)
 	kfree(ucounts);
 }
 
-static inline bool atomic_inc_below(atomic_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
 {
-	int c, old;
-	c = atomic_read(v);
+	long c, old;
+	c = atomic_long_read(v);
 	for (;;) {
 		if (unlikely(c >= u))
 			return false;
-		old = atomic_cmpxchg(v, c, c+1);
+		old = atomic_long_cmpxchg(v, c, c+1);
 		if (likely(old == c))
 			return true;
 		c = old;
@@ -196,17 +196,17 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
 	struct user_namespace *tns;
 	ucounts = get_ucounts(ns, uid);
 	for (iter = ucounts; iter; iter = tns->ucounts) {
-		int max;
+		long max;
 		tns = iter->ns;
 		max = READ_ONCE(tns->ucount_max[type]);
-		if (!atomic_inc_below(&iter->ucount[type], max))
+		if (!atomic_long_inc_below(&iter->ucount[type], max))
 			goto fail;
 	}
 	return ucounts;
 fail:
 	bad = iter;
 	for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
-		atomic_dec(&iter->ucount[type]);
+		atomic_long_dec(&iter->ucount[type]);
 
 	put_ucounts(ucounts);
 	return NULL;
@@ -216,7 +216,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
 {
 	struct ucounts *iter;
 	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-		int dec = atomic_dec_if_positive(&iter->ucount[type]);
+		long dec = atomic_long_dec_if_positive(&iter->ucount[type]);
 		WARN_ON_ONCE(dec < 0);
 	}
 	put_ucounts(ucounts);
-- 
cgit v1.2.3


From 905ae01c4ae2ae3df05bb141801b1db4b7d83c61 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Thu, 22 Apr 2021 14:27:09 +0200
Subject: Add a reference to ucounts for each cred

For RLIMIT_NPROC and some other rlimits the user_struct that holds the
global limit is kept alive for the lifetime of a process by keeping it
in struct cred. Adding a pointer to ucounts in the struct cred will
allow to track RLIMIT_NPROC not only for user in the system, but for
user in the user_namespace.

Updating ucounts may require memory allocation which may fail. So, we
cannot change cred.ucounts in the commit_creds() because this function
cannot fail and it should always return 0. For this reason, we modify
cred.ucounts before calling the commit_creds().

Changelog

v6:
* Fix null-ptr-deref in is_ucounts_overlimit() detected by trinity. This
  error was caused by the fact that cred_alloc_blank() left the ucounts
  pointer empty.

Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/b37aaef28d8b9b0d757e07ba6dd27281bbe39259.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/exec.c                      |  4 ++++
 include/linux/cred.h           |  2 ++
 include/linux/user_namespace.h |  4 ++++
 kernel/cred.c                  | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                  |  6 ++++++
 kernel/sys.c                   | 12 ++++++++++++
 kernel/ucount.c                | 40 +++++++++++++++++++++++++++++++++++++---
 kernel/user_namespace.c        |  3 +++
 8 files changed, 108 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..d7c4187ca023 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1360,6 +1360,10 @@ int begin_new_exec(struct linux_binprm * bprm)
 	WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
 	flush_signal_handlers(me, 0);
 
+	retval = set_cred_ucounts(bprm->cred);
+	if (retval < 0)
+		goto out_unlock;
+
 	/*
 	 * install the new credentials for this executable
 	 */
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4c6350503697..66436e655032 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -144,6 +144,7 @@ struct cred {
 #endif
 	struct user_struct *user;	/* real user ID subscription */
 	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
+	struct ucounts *ucounts;
 	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
 	/* RCU deletion */
 	union {
@@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
 extern int set_create_files_as(struct cred *, struct inode *);
 extern int cred_fscmp(const struct cred *, const struct cred *);
 extern void __init cred_init(void);
+extern int set_cred_ucounts(struct cred *);
 
 /*
  * check for validity of credentials
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c242c10906c5..7919b80d57ed 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -100,11 +100,15 @@ struct ucounts {
 };
 
 extern struct user_namespace init_user_ns;
+extern struct ucounts init_ucounts;
 
 bool setup_userns_sysctls(struct user_namespace *ns);
 void retire_userns_sysctls(struct user_namespace *ns);
 struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
 void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
+struct ucounts *get_ucounts(struct ucounts *ucounts);
+void put_ucounts(struct ucounts *ucounts);
 
 #ifdef CONFIG_USER_NS
 
diff --git a/kernel/cred.c b/kernel/cred.c
index 421b1149c651..58a8a9e24347 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -60,6 +60,7 @@ struct cred init_cred = {
 	.user			= INIT_USER,
 	.user_ns		= &init_user_ns,
 	.group_info		= &init_groups,
+	.ucounts		= &init_ucounts,
 };
 
 static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
 	if (cred->group_info)
 		put_group_info(cred->group_info);
 	free_uid(cred->user);
+	if (cred->ucounts)
+		put_ucounts(cred->ucounts);
 	put_user_ns(cred->user_ns);
 	kmem_cache_free(cred_jar, cred);
 }
@@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void)
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	new->magic = CRED_MAGIC;
 #endif
+	new->ucounts = get_ucounts(&init_ucounts);
 
 	if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
 		goto error;
@@ -284,6 +288,11 @@ struct cred *prepare_creds(void)
 
 	if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
 		goto error;
+
+	new->ucounts = get_ucounts(new->ucounts);
+	if (!new->ucounts)
+		goto error;
+
 	validate_creds(new);
 	return new;
 
@@ -363,6 +372,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 		ret = create_user_ns(new);
 		if (ret < 0)
 			goto error_put;
+		if (set_cred_ucounts(new) < 0)
+			goto error_put;
 	}
 
 #ifdef CONFIG_KEYS
@@ -653,6 +664,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b)
 }
 EXPORT_SYMBOL(cred_fscmp);
 
+int set_cred_ucounts(struct cred *new)
+{
+	struct task_struct *task = current;
+	const struct cred *old = task->real_cred;
+	struct ucounts *old_ucounts = new->ucounts;
+
+	if (new->user == old->user && new->user_ns == old->user_ns)
+		return 0;
+
+	/*
+	 * This optimization is needed because alloc_ucounts() uses locks
+	 * for table lookups.
+	 */
+	if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+		return 0;
+
+	if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid)))
+		return -EAGAIN;
+
+	if (old_ucounts)
+		put_ucounts(old_ucounts);
+
+	return 0;
+}
+
 /*
  * initialise the credentials stuff
  */
@@ -719,6 +755,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
 		goto error;
 
+	new->ucounts = get_ucounts(new->ucounts);
+	if (!new->ucounts)
+		goto error;
+
 	put_cred(old);
 	validate_creds(new);
 	return new;
diff --git a/kernel/fork.c b/kernel/fork.c
index 426cd0c51f9e..321a5e31d817 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2995,6 +2995,12 @@ int ksys_unshare(unsigned long unshare_flags)
 	if (err)
 		goto bad_unshare_cleanup_cred;
 
+	if (new_cred) {
+		err = set_cred_ucounts(new_cred);
+		if (err)
+			goto bad_unshare_cleanup_cred;
+	}
+
 	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
 		if (do_sysvsem) {
 			/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 2e2e3f378d97..cabfc5b86175 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -552,6 +552,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
 	if (retval < 0)
 		goto error;
 
+	retval = set_cred_ucounts(new);
+	if (retval < 0)
+		goto error;
+
 	return commit_creds(new);
 
 error:
@@ -610,6 +614,10 @@ long __sys_setuid(uid_t uid)
 	if (retval < 0)
 		goto error;
 
+	retval = set_cred_ucounts(new);
+	if (retval < 0)
+		goto error;
+
 	return commit_creds(new);
 
 error:
@@ -685,6 +693,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 	if (retval < 0)
 		goto error;
 
+	retval = set_cred_ucounts(new);
+	if (retval < 0)
+		goto error;
+
 	return commit_creds(new);
 
 error:
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 04c561751af1..50cc1dfb7d28 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -8,6 +8,12 @@
 #include <linux/kmemleak.h>
 #include <linux/user_namespace.h>
 
+struct ucounts init_ucounts = {
+	.ns    = &init_user_ns,
+	.uid   = GLOBAL_ROOT_UID,
+	.count = 1,
+};
+
 #define UCOUNTS_HASHTABLE_BITS 10
 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
 static DEFINE_SPINLOCK(ucounts_lock);
@@ -125,7 +131,15 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc
 	return NULL;
 }
 
-static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+static void hlist_add_ucounts(struct ucounts *ucounts)
+{
+	struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
+	spin_lock_irq(&ucounts_lock);
+	hlist_add_head(&ucounts->node, hashent);
+	spin_unlock_irq(&ucounts_lock);
+}
+
+struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 {
 	struct hlist_head *hashent = ucounts_hashentry(ns, uid);
 	struct ucounts *ucounts, *new;
@@ -160,7 +174,26 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
 	return ucounts;
 }
 
-static void put_ucounts(struct ucounts *ucounts)
+struct ucounts *get_ucounts(struct ucounts *ucounts)
+{
+	unsigned long flags;
+
+	if (!ucounts)
+		return NULL;
+
+	spin_lock_irqsave(&ucounts_lock, flags);
+	if (ucounts->count == INT_MAX) {
+		WARN_ONCE(1, "ucounts: counter has reached its maximum value");
+		ucounts = NULL;
+	} else {
+		ucounts->count += 1;
+	}
+	spin_unlock_irqrestore(&ucounts_lock, flags);
+
+	return ucounts;
+}
+
+void put_ucounts(struct ucounts *ucounts)
 {
 	unsigned long flags;
 
@@ -194,7 +227,7 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
 {
 	struct ucounts *ucounts, *iter, *bad;
 	struct user_namespace *tns;
-	ucounts = get_ucounts(ns, uid);
+	ucounts = alloc_ucounts(ns, uid);
 	for (iter = ucounts; iter; iter = tns->ucounts) {
 		long max;
 		tns = iter->ns;
@@ -237,6 +270,7 @@ static __init int user_namespace_sysctl_init(void)
 	BUG_ON(!user_header);
 	BUG_ON(!setup_userns_sysctls(&init_user_ns));
 #endif
+	hlist_add_ucounts(&init_ucounts);
 	return 0;
 }
 subsys_initcall(user_namespace_sysctl_init);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9a4b980d695b..f1b7b4b8ffa2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1340,6 +1340,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
 	put_user_ns(cred->user_ns);
 	set_cred_user_ns(cred, get_user_ns(user_ns));
 
+	if (set_cred_ucounts(cred) < 0)
+		return -EINVAL;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From b6c336528926ef73b0f70260f2636de2c3b94c14 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Thu, 22 Apr 2021 14:27:10 +0200
Subject: Use atomic_t for ucounts reference counting

The current implementation of the ucounts reference counter requires the
use of spin_lock. We're going to use get_ucounts() in more performance
critical areas like a handling of RLIMIT_SIGPENDING.

Now we need to use spin_lock only if we want to change the hashtable.

v10:
* Always try to put ucounts in case we cannot increase ucounts->count.
  This will allow to cover the case when all consumers will return
  ucounts at once.

v9:
* Use a negative value to check that the ucounts->count is close to
  overflow.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/94d1dbecab060a6b116b0a2d1accd8ca1bbb4f5f.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h |  4 ++--
 kernel/ucount.c                | 53 +++++++++++++++---------------------------
 2 files changed, 21 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 7919b80d57ed..80b5bf12feae 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -95,7 +95,7 @@ struct ucounts {
 	struct hlist_node node;
 	struct user_namespace *ns;
 	kuid_t uid;
-	int count;
+	atomic_t count;
 	atomic_long_t ucount[UCOUNT_COUNTS];
 };
 
@@ -107,7 +107,7 @@ void retire_userns_sysctls(struct user_namespace *ns);
 struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
 void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
 struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
-struct ucounts *get_ucounts(struct ucounts *ucounts);
+struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
 void put_ucounts(struct ucounts *ucounts);
 
 #ifdef CONFIG_USER_NS
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 50cc1dfb7d28..365865f368ec 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -11,7 +11,7 @@
 struct ucounts init_ucounts = {
 	.ns    = &init_user_ns,
 	.uid   = GLOBAL_ROOT_UID,
-	.count = 1,
+	.count = ATOMIC_INIT(1),
 };
 
 #define UCOUNTS_HASHTABLE_BITS 10
@@ -139,6 +139,15 @@ static void hlist_add_ucounts(struct ucounts *ucounts)
 	spin_unlock_irq(&ucounts_lock);
 }
 
+struct ucounts *get_ucounts(struct ucounts *ucounts)
+{
+	if (ucounts && atomic_add_negative(1, &ucounts->count)) {
+		put_ucounts(ucounts);
+		ucounts = NULL;
+	}
+	return ucounts;
+}
+
 struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 {
 	struct hlist_head *hashent = ucounts_hashentry(ns, uid);
@@ -155,7 +164,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 
 		new->ns = ns;
 		new->uid = uid;
-		new->count = 0;
+		atomic_set(&new->count, 1);
 
 		spin_lock_irq(&ucounts_lock);
 		ucounts = find_ucounts(ns, uid, hashent);
@@ -163,33 +172,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 			kfree(new);
 		} else {
 			hlist_add_head(&new->node, hashent);
-			ucounts = new;
+			spin_unlock_irq(&ucounts_lock);
+			return new;
 		}
 	}
-	if (ucounts->count == INT_MAX)
-		ucounts = NULL;
-	else
-		ucounts->count += 1;
 	spin_unlock_irq(&ucounts_lock);
-	return ucounts;
-}
-
-struct ucounts *get_ucounts(struct ucounts *ucounts)
-{
-	unsigned long flags;
-
-	if (!ucounts)
-		return NULL;
-
-	spin_lock_irqsave(&ucounts_lock, flags);
-	if (ucounts->count == INT_MAX) {
-		WARN_ONCE(1, "ucounts: counter has reached its maximum value");
-		ucounts = NULL;
-	} else {
-		ucounts->count += 1;
-	}
-	spin_unlock_irqrestore(&ucounts_lock, flags);
-
+	ucounts = get_ucounts(ucounts);
 	return ucounts;
 }
 
@@ -197,15 +185,12 @@ void put_ucounts(struct ucounts *ucounts)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&ucounts_lock, flags);
-	ucounts->count -= 1;
-	if (!ucounts->count)
+	if (atomic_dec_and_test(&ucounts->count)) {
+		spin_lock_irqsave(&ucounts_lock, flags);
 		hlist_del_init(&ucounts->node);
-	else
-		ucounts = NULL;
-	spin_unlock_irqrestore(&ucounts_lock, flags);
-
-	kfree(ucounts);
+		spin_unlock_irqrestore(&ucounts_lock, flags);
+		kfree(ucounts);
+	}
 }
 
 static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
-- 
cgit v1.2.3


From 21d1c5e386bc751f1953b371d72cd5b7d9c9e270 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Thu, 22 Apr 2021 14:27:11 +0200
Subject: Reimplement RLIMIT_NPROC on top of ucounts

The rlimit counter is tied to uid in the user_namespace. This allows
rlimit values to be specified in userns even if they are already
globally exceeded by the user. However, the value of the previous
user_namespaces cannot be exceeded.

To illustrate the impact of rlimits, let's say there is a program that
does not fork. Some service-A wants to run this program as user X in
multiple containers. Since the program never fork the service wants to
set RLIMIT_NPROC=1.

service-A
 \- program (uid=1000, container1, rlimit_nproc=1)
 \- program (uid=1000, container2, rlimit_nproc=1)

The service-A sets RLIMIT_NPROC=1 and runs the program in container1.
When the service-A tries to run a program with RLIMIT_NPROC=1 in
container2 it fails since user X already has one running process.

We cannot use existing inc_ucounts / dec_ucounts because they do not
allow us to exceed the maximum for the counter. Some rlimits can be
overlimited by root or if the user has the appropriate capability.

Changelog

v11:
* Change inc_rlimit_ucounts() which now returns top value of ucounts.
* Drop inc_rlimit_ucounts_and_test() because the return code of
  inc_rlimit_ucounts() can be checked.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/exec.c                      |  2 +-
 include/linux/cred.h           |  2 ++
 include/linux/sched/user.h     |  1 -
 include/linux/user_namespace.h | 12 ++++++++++++
 kernel/cred.c                  | 10 +++++-----
 kernel/exit.c                  |  2 +-
 kernel/fork.c                  |  9 +++++----
 kernel/sys.c                   |  2 +-
 kernel/ucount.c                | 44 ++++++++++++++++++++++++++++++++++++++++++
 kernel/user.c                  |  1 -
 kernel/user_namespace.c        |  3 ++-
 11 files changed, 73 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index d7c4187ca023..f2bcdbeb3afb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1878,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename,
 	 * whether NPROC limit is still exceeded.
 	 */
 	if ((current->flags & PF_NPROC_EXCEEDED) &&
-	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
+	    is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
 		retval = -EAGAIN;
 		goto out_ret;
 	}
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 66436e655032..5ca1e8a1d035 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -372,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
 
 #define task_uid(task)		(task_cred_xxx((task), uid))
 #define task_euid(task)		(task_cred_xxx((task), euid))
+#define task_ucounts(task)	(task_cred_xxx((task), ucounts))
 
 #define current_cred_xxx(xxx)			\
 ({						\
@@ -388,6 +389,7 @@ static inline void put_cred(const struct cred *_cred)
 #define current_fsgid() 	(current_cred_xxx(fsgid))
 #define current_cap()		(current_cred_xxx(cap_effective))
 #define current_user()		(current_cred_xxx(user))
+#define current_ucounts()	(current_cred_xxx(ucounts))
 
 extern struct user_namespace init_user_ns;
 #ifdef CONFIG_USER_NS
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index a8ec3b6093fc..d33d867ad6c1 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -12,7 +12,6 @@
  */
 struct user_struct {
 	refcount_t __count;	/* reference count */
-	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
 #ifdef CONFIG_FANOTIFY
 	atomic_t fanotify_listeners;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 80b5bf12feae..4a97acc35990 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -50,9 +50,12 @@ enum ucount_type {
 	UCOUNT_INOTIFY_INSTANCES,
 	UCOUNT_INOTIFY_WATCHES,
 #endif
+	UCOUNT_RLIMIT_NPROC,
 	UCOUNT_COUNTS,
 };
 
+#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC
+
 struct user_namespace {
 	struct uid_gid_map	uid_map;
 	struct uid_gid_map	gid_map;
@@ -110,6 +113,15 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
 struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
 void put_ucounts(struct ucounts *ucounts);
 
+static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type)
+{
+	return atomic_long_read(&ucounts->ucount[type]);
+}
+
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
+
 #ifdef CONFIG_USER_NS
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
diff --git a/kernel/cred.c b/kernel/cred.c
index 58a8a9e24347..dcfa30b337c5 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -360,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 		kdebug("share_creds(%p{%d,%d})",
 		       p->cred, atomic_read(&p->cred->usage),
 		       read_cred_subscribers(p->cred));
-		atomic_inc(&p->cred->user->processes);
+		inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 		return 0;
 	}
 
@@ -395,8 +395,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	}
 #endif
 
-	atomic_inc(&new->user->processes);
 	p->cred = p->real_cred = get_cred(new);
+	inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 	alter_cred_subscribers(new, 2);
 	validate_creds(new);
 	return 0;
@@ -496,12 +496,12 @@ int commit_creds(struct cred *new)
 	 * in set_user().
 	 */
 	alter_cred_subscribers(new, 2);
-	if (new->user != old->user)
-		atomic_inc(&new->user->processes);
+	if (new->user != old->user || new->user_ns != old->user_ns)
+		inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user)
-		atomic_dec(&old->user->processes);
+		dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
 	alter_cred_subscribers(old, -2);
 
 	/* send notifications */
diff --git a/kernel/exit.c b/kernel/exit.c
index 04029e35e69a..61c0fe902b50 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -188,7 +188,7 @@ repeat:
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
-	atomic_dec(&__task_cred(p)->user->processes);
+	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 	rcu_read_unlock();
 
 	cgroup_release(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 321a5e31d817..ed7dfb07178d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -819,9 +819,11 @@ void __init fork_init(void)
 	init_task.signal->rlim[RLIMIT_SIGPENDING] =
 		init_task.signal->rlim[RLIMIT_NPROC];
 
-	for (i = 0; i < UCOUNT_COUNTS; i++)
+	for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
 		init_user_ns.ucount_max[i] = max_threads/2;
 
+	init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
+
 #ifdef CONFIG_VMAP_STACK
 	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
 			  NULL, free_vm_stack_cache);
@@ -1978,8 +1980,7 @@ static __latent_entropy struct task_struct *copy_process(
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
 	retval = -EAGAIN;
-	if (atomic_read(&p->real_cred->user->processes) >=
-			task_rlimit(p, RLIMIT_NPROC)) {
+	if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
 		if (p->real_cred->user != INIT_USER &&
 		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
 			goto bad_fork_free;
@@ -2382,7 +2383,7 @@ bad_fork_cleanup_threadgroup_lock:
 #endif
 	delayacct_tsk_free(p);
 bad_fork_cleanup_count:
-	atomic_dec(&p->cred->user->processes);
+	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 	exit_creds(p);
 bad_fork_free:
 	p->state = TASK_DEAD;
diff --git a/kernel/sys.c b/kernel/sys.c
index cabfc5b86175..00266a65a000 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -473,7 +473,7 @@ static int set_user(struct cred *new)
 	 * for programs doing set*uid()+execve() by harmlessly deferring the
 	 * failure to the execve() stage.
 	 */
-	if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
+	if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
 			new_user != INIT_USER)
 		current->flags |= PF_NPROC_EXCEEDED;
 	else
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 365865f368ec..6caa56f7dec8 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -80,6 +80,7 @@ static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_inotify_instances"),
 	UCOUNT_ENTRY("max_inotify_watches"),
 #endif
+	{ },
 	{ }
 };
 #endif /* CONFIG_SYSCTL */
@@ -240,6 +241,48 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
 	put_ucounts(ucounts);
 }
 
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+	struct ucounts *iter;
+	long ret = 0;
+
+	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+		long max = READ_ONCE(iter->ns->ucount_max[type]);
+		long new = atomic_long_add_return(v, &iter->ucount[type]);
+		if (new < 0 || new > max)
+			ret = LONG_MAX;
+		else if (iter == ucounts)
+			ret = new;
+	}
+	return ret;
+}
+
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+	struct ucounts *iter;
+	long new;
+	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+		long dec = atomic_long_add_return(-v, &iter->ucount[type]);
+		WARN_ON_ONCE(dec < 0);
+		if (iter == ucounts)
+			new = dec;
+	}
+	return (new == 0);
+}
+
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
+{
+	struct ucounts *iter;
+	if (get_ucounts_value(ucounts, type) > max)
+		return true;
+	for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+		max = READ_ONCE(iter->ns->ucount_max[type]);
+		if (get_ucounts_value(iter, type) > max)
+			return true;
+	}
+	return false;
+}
+
 static __init int user_namespace_sysctl_init(void)
 {
 #ifdef CONFIG_SYSCTL
@@ -256,6 +299,7 @@ static __init int user_namespace_sysctl_init(void)
 	BUG_ON(!setup_userns_sysctls(&init_user_ns));
 #endif
 	hlist_add_ucounts(&init_ucounts);
+	inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
 	return 0;
 }
 subsys_initcall(user_namespace_sysctl_init);
diff --git a/kernel/user.c b/kernel/user.c
index a2478cddf536..7f5ff498207a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
 /* root_user.__count is 1, for init task cred */
 struct user_struct root_user = {
 	.__count	= REFCOUNT_INIT(1),
-	.processes	= ATOMIC_INIT(1),
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
 	.uid		= GLOBAL_ROOT_UID,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f1b7b4b8ffa2..e6577c835072 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -119,9 +119,10 @@ int create_user_ns(struct cred *new)
 	ns->owner = owner;
 	ns->group = group;
 	INIT_WORK(&ns->work, free_user_ns);
-	for (i = 0; i < UCOUNT_COUNTS; i++) {
+	for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
 		ns->ucount_max[i] = INT_MAX;
 	}
+	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
 	ns->ucounts = ucounts;
 
 	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
-- 
cgit v1.2.3


From 6e52a9f0532f912af37bab4caf18b57d1b9845f4 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Thu, 22 Apr 2021 14:27:12 +0200
Subject: Reimplement RLIMIT_MSGQUEUE on top of ucounts

The rlimit counter is tied to uid in the user_namespace. This allows
rlimit values to be specified in userns even if they are already
globally exceeded by the user. However, the value of the previous
user_namespaces cannot be exceeded.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/2531f42f7884bbfee56a978040b3e0d25cdf6cde.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sched/user.h     |  4 ----
 include/linux/user_namespace.h |  1 +
 ipc/mqueue.c                   | 40 +++++++++++++++++++++-------------------
 kernel/fork.c                  |  1 +
 kernel/ucount.c                |  1 +
 kernel/user_namespace.c        |  1 +
 6 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index d33d867ad6c1..8a34446681aa 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -18,10 +18,6 @@ struct user_struct {
 #endif
 #ifdef CONFIG_EPOLL
 	atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
-#endif
-#ifdef CONFIG_POSIX_MQUEUE
-	/* protected by mq_lock	*/
-	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
 #endif
 	unsigned long locked_shm; /* How many pages of mlocked shm ? */
 	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4a97acc35990..5eeb86b00e68 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -51,6 +51,7 @@ enum ucount_type {
 	UCOUNT_INOTIFY_WATCHES,
 #endif
 	UCOUNT_RLIMIT_NPROC,
+	UCOUNT_RLIMIT_MSGQUEUE,
 	UCOUNT_COUNTS,
 };
 
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 8031464ed4ae..461fcf8c873d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -144,7 +144,7 @@ struct mqueue_inode_info {
 	struct pid *notify_owner;
 	u32 notify_self_exec_id;
 	struct user_namespace *notify_user_ns;
-	struct user_struct *user;	/* user who created, for accounting */
+	struct ucounts *ucounts;	/* user who created, for accounting */
 	struct sock *notify_sock;
 	struct sk_buff *notify_cookie;
 
@@ -292,7 +292,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		struct ipc_namespace *ipc_ns, umode_t mode,
 		struct mq_attr *attr)
 {
-	struct user_struct *u = current_user();
 	struct inode *inode;
 	int ret = -ENOMEM;
 
@@ -321,7 +320,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		info->notify_owner = NULL;
 		info->notify_user_ns = NULL;
 		info->qsize = 0;
-		info->user = NULL;	/* set when all is ok */
+		info->ucounts = NULL;	/* set when all is ok */
 		info->msg_tree = RB_ROOT;
 		info->msg_tree_rightmost = NULL;
 		info->node_cache = NULL;
@@ -371,19 +370,23 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		if (mq_bytes + mq_treesize < mq_bytes)
 			goto out_inode;
 		mq_bytes += mq_treesize;
-		spin_lock(&mq_lock);
-		if (u->mq_bytes + mq_bytes < u->mq_bytes ||
-		    u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
+		info->ucounts = get_ucounts(current_ucounts());
+		if (info->ucounts) {
+			long msgqueue;
+
+			spin_lock(&mq_lock);
+			msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
+			if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
+				dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
+				spin_unlock(&mq_lock);
+				put_ucounts(info->ucounts);
+				info->ucounts = NULL;
+				/* mqueue_evict_inode() releases info->messages */
+				ret = -EMFILE;
+				goto out_inode;
+			}
 			spin_unlock(&mq_lock);
-			/* mqueue_evict_inode() releases info->messages */
-			ret = -EMFILE;
-			goto out_inode;
 		}
-		u->mq_bytes += mq_bytes;
-		spin_unlock(&mq_lock);
-
-		/* all is ok */
-		info->user = get_uid(u);
 	} else if (S_ISDIR(mode)) {
 		inc_nlink(inode);
 		/* Some things misbehave if size == 0 on a directory */
@@ -497,7 +500,6 @@ static void mqueue_free_inode(struct inode *inode)
 static void mqueue_evict_inode(struct inode *inode)
 {
 	struct mqueue_inode_info *info;
-	struct user_struct *user;
 	struct ipc_namespace *ipc_ns;
 	struct msg_msg *msg, *nmsg;
 	LIST_HEAD(tmp_msg);
@@ -520,8 +522,7 @@ static void mqueue_evict_inode(struct inode *inode)
 		free_msg(msg);
 	}
 
-	user = info->user;
-	if (user) {
+	if (info->ucounts) {
 		unsigned long mq_bytes, mq_treesize;
 
 		/* Total amount of bytes accounted for the mqueue */
@@ -533,7 +534,7 @@ static void mqueue_evict_inode(struct inode *inode)
 					  info->attr.mq_msgsize);
 
 		spin_lock(&mq_lock);
-		user->mq_bytes -= mq_bytes;
+		dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
 		/*
 		 * get_ns_from_inode() ensures that the
 		 * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
@@ -543,7 +544,8 @@ static void mqueue_evict_inode(struct inode *inode)
 		if (ipc_ns)
 			ipc_ns->mq_queues_count--;
 		spin_unlock(&mq_lock);
-		free_uid(user);
+		put_ucounts(info->ucounts);
+		info->ucounts = NULL;
 	}
 	if (ipc_ns)
 		put_ipc_ns(ipc_ns);
diff --git a/kernel/fork.c b/kernel/fork.c
index ed7dfb07178d..a9c5097dfc86 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -823,6 +823,7 @@ void __init fork_init(void)
 		init_user_ns.ucount_max[i] = max_threads/2;
 
 	init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
+	init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE);
 
 #ifdef CONFIG_VMAP_STACK
 	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 6caa56f7dec8..6e6f936a5963 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -80,6 +80,7 @@ static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_inotify_instances"),
 	UCOUNT_ENTRY("max_inotify_watches"),
 #endif
+	{ },
 	{ },
 	{ }
 };
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e6577c835072..7eccc4f84549 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -123,6 +123,7 @@ int create_user_ns(struct cred *new)
 		ns->ucount_max[i] = INT_MAX;
 	}
 	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
+	ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE);
 	ns->ucounts = ucounts;
 
 	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
-- 
cgit v1.2.3


From d64696905554e919321e31afc210606653b8f6a4 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Thu, 22 Apr 2021 14:27:13 +0200
Subject: Reimplement RLIMIT_SIGPENDING on top of ucounts

The rlimit counter is tied to uid in the user_namespace. This allows
rlimit values to be specified in userns even if they are already
globally exceeded by the user. However, the value of the previous
user_namespaces cannot be exceeded.

Changelog

v11:
* Revert most of changes to fix performance issues.

v10:
* Fix memory leak on get_ucounts failure.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/df9d7764dddd50f28616b7840de74ec0f81711a8.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/array.c                |  2 +-
 include/linux/sched/user.h     |  1 -
 include/linux/signal_types.h   |  4 +++-
 include/linux/user_namespace.h |  1 +
 kernel/fork.c                  |  1 +
 kernel/signal.c                | 25 +++++++++++++------------
 kernel/ucount.c                |  1 +
 kernel/user.c                  |  1 -
 kernel/user_namespace.c        |  1 +
 9 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index bb87e4d89cd8..74b0ea4b7e38 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		collect_sigign_sigcatch(p, &ignored, &caught);
 		num_threads = get_nr_threads(p);
 		rcu_read_lock();  /* FIXME: is this correct? */
-		qsize = atomic_read(&__task_cred(p)->user->sigpending);
+		qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
 		rcu_read_unlock();
 		qlim = task_rlimit(p, RLIMIT_SIGPENDING);
 		unlock_task_sighand(p, &flags);
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 8a34446681aa..8ba9cec4fb99 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -12,7 +12,6 @@
  */
 struct user_struct {
 	refcount_t __count;	/* reference count */
-	atomic_t sigpending;	/* How many pending signals does this user have? */
 #ifdef CONFIG_FANOTIFY
 	atomic_t fanotify_listeners;
 #endif
diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
index 68e06c75c5b2..34cb28b8f16c 100644
--- a/include/linux/signal_types.h
+++ b/include/linux/signal_types.h
@@ -13,6 +13,8 @@ typedef struct kernel_siginfo {
 	__SIGINFO;
 } kernel_siginfo_t;
 
+struct ucounts;
+
 /*
  * Real Time signals may be queued.
  */
@@ -21,7 +23,7 @@ struct sigqueue {
 	struct list_head list;
 	int flags;
 	kernel_siginfo_t info;
-	struct user_struct *user;
+	struct ucounts *ucounts;
 };
 
 /* flags values. */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 5eeb86b00e68..58f417986472 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -52,6 +52,7 @@ enum ucount_type {
 #endif
 	UCOUNT_RLIMIT_NPROC,
 	UCOUNT_RLIMIT_MSGQUEUE,
+	UCOUNT_RLIMIT_SIGPENDING,
 	UCOUNT_COUNTS,
 };
 
diff --git a/kernel/fork.c b/kernel/fork.c
index a9c5097dfc86..03119926b27d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -824,6 +824,7 @@ void __init fork_init(void)
 
 	init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
 	init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE);
+	init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING);
 
 #ifdef CONFIG_VMAP_STACK
 	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
diff --git a/kernel/signal.c b/kernel/signal.c
index f2718350bf4b..9a6dab712123 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -413,8 +413,8 @@ static struct sigqueue *
 __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
 {
 	struct sigqueue *q = NULL;
-	struct user_struct *user;
-	int sigpending;
+	struct ucounts *ucounts = NULL;
+	long sigpending;
 
 	/*
 	 * Protect access to @t credentials. This can go away when all
@@ -425,27 +425,26 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	 * changes from/to zero.
 	 */
 	rcu_read_lock();
-	user = __task_cred(t)->user;
-	sigpending = atomic_inc_return(&user->sigpending);
+	ucounts = task_ucounts(t);
+	sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
 	if (sigpending == 1)
-		get_uid(user);
+		ucounts = get_ucounts(ucounts);
 	rcu_read_unlock();
 
-	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
+	if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	} else {
 		print_dropped_signal(sig);
 	}
 
 	if (unlikely(q == NULL)) {
-		if (atomic_dec_and_test(&user->sigpending))
-			free_uid(user);
+		if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1))
+			put_ucounts(ucounts);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
-		q->user = user;
+		q->ucounts = ucounts;
 	}
-
 	return q;
 }
 
@@ -453,8 +452,10 @@ static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
 		return;
-	if (atomic_dec_and_test(&q->user->sigpending))
-		free_uid(q->user);
+	if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) {
+		put_ucounts(q->ucounts);
+		q->ucounts = NULL;
+	}
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 6e6f936a5963..8ce62da6a62c 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -80,6 +80,7 @@ static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_inotify_instances"),
 	UCOUNT_ENTRY("max_inotify_watches"),
 #endif
+	{ },
 	{ },
 	{ },
 	{ }
diff --git a/kernel/user.c b/kernel/user.c
index 7f5ff498207a..6737327f83be 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
 /* root_user.__count is 1, for init task cred */
 struct user_struct root_user = {
 	.__count	= REFCOUNT_INIT(1),
-	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
 	.uid		= GLOBAL_ROOT_UID,
 	.ratelimit	= RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 7eccc4f84549..822eacee4588 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -124,6 +124,7 @@ int create_user_ns(struct cred *new)
 	}
 	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
 	ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE);
+	ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING);
 	ns->ucounts = ucounts;
 
 	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
-- 
cgit v1.2.3


From d7c9e99aee48e6bc0b427f3e3c658a6aba15001e Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Thu, 22 Apr 2021 14:27:14 +0200
Subject: Reimplement RLIMIT_MEMLOCK on top of ucounts

The rlimit counter is tied to uid in the user_namespace. This allows
rlimit values to be specified in userns even if they are already
globally exceeded by the user. However, the value of the previous
user_namespaces cannot be exceeded.

Changelog

v11:
* Fix issue found by lkp robot.

v8:
* Fix issues found by lkp-tests project.

v7:
* Keep only ucounts for RLIMIT_MEMLOCK checks instead of struct cred.

v6:
* Fix bug in hugetlb_file_setup() detected by trinity.

Reported-by: kernel test robot <oliver.sang@intel.com>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/970d50c70c71bfd4496e0e8d2a0a32feebebb350.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/hugetlbfs/inode.c           | 16 ++++++++--------
 include/linux/hugetlb.h        |  4 ++--
 include/linux/mm.h             |  4 ++--
 include/linux/sched/user.h     |  1 -
 include/linux/shmem_fs.h       |  2 +-
 include/linux/user_namespace.h |  1 +
 ipc/shm.c                      | 26 +++++++++++++-------------
 kernel/fork.c                  |  1 +
 kernel/ucount.c                |  1 +
 kernel/user.c                  |  1 -
 kernel/user_namespace.c        |  1 +
 mm/memfd.c                     |  4 ++--
 mm/mlock.c                     | 22 ++++++++++++++--------
 mm/mmap.c                      |  4 ++--
 mm/shmem.c                     | 10 +++++-----
 15 files changed, 53 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 701c82c36138..be519fc9559a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1443,7 +1443,7 @@ static int get_hstate_idx(int page_size_log)
  * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
  */
 struct file *hugetlb_file_setup(const char *name, size_t size,
-				vm_flags_t acctflag, struct user_struct **user,
+				vm_flags_t acctflag, struct ucounts **ucounts,
 				int creat_flags, int page_size_log)
 {
 	struct inode *inode;
@@ -1455,20 +1455,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 	if (hstate_idx < 0)
 		return ERR_PTR(-ENODEV);
 
-	*user = NULL;
+	*ucounts = NULL;
 	mnt = hugetlbfs_vfsmount[hstate_idx];
 	if (!mnt)
 		return ERR_PTR(-ENOENT);
 
 	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
-		*user = current_user();
-		if (user_shm_lock(size, *user)) {
+		*ucounts = current_ucounts();
+		if (user_shm_lock(size, *ucounts)) {
 			task_lock(current);
 			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
 				current->comm, current->pid);
 			task_unlock(current);
 		} else {
-			*user = NULL;
+			*ucounts = NULL;
 			return ERR_PTR(-EPERM);
 		}
 	}
@@ -1495,9 +1495,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 
 	iput(inode);
 out:
-	if (*user) {
-		user_shm_unlock(size, *user);
-		*user = NULL;
+	if (*ucounts) {
+		user_shm_unlock(size, *ucounts);
+		*ucounts = NULL;
 	}
 	return file;
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index cccd1aab69dd..96d63dbdec65 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -434,7 +434,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
-				struct user_struct **user, int creat_flags,
+				struct ucounts **ucounts, int creat_flags,
 				int page_size_log);
 
 static inline bool is_file_hugepages(struct file *file)
@@ -454,7 +454,7 @@ static inline struct hstate *hstate_inode(struct inode *i)
 #define is_file_hugepages(file)			false
 static inline struct file *
 hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
-		struct user_struct **user, int creat_flags,
+		struct ucounts **ucounts, int creat_flags,
 		int page_size_log)
 {
 	return ERR_PTR(-ENOSYS);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ba434287387..3b4e24738ce4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1670,8 +1670,8 @@ extern bool can_do_mlock(void);
 #else
 static inline bool can_do_mlock(void) { return false; }
 #endif
-extern int user_shm_lock(size_t, struct user_struct *);
-extern void user_shm_unlock(size_t, struct user_struct *);
+extern int user_shm_lock(size_t, struct ucounts *);
+extern void user_shm_unlock(size_t, struct ucounts *);
 
 /*
  * Parameter block passed down to zap_pte_range in exceptional cases.
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 8ba9cec4fb99..82bd2532da6b 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -18,7 +18,6 @@ struct user_struct {
 #ifdef CONFIG_EPOLL
 	atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
 #endif
-	unsigned long locked_shm; /* How many pages of mlocked shm ? */
 	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
 	atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
 
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d82b6f396588..aa77dcd1646f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -65,7 +65,7 @@ extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
 extern int shmem_zero_setup(struct vm_area_struct *);
 extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
-extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
 #ifdef CONFIG_SHMEM
 extern const struct address_space_operations shmem_aops;
 static inline bool shmem_mapping(struct address_space *mapping)
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 58f417986472..2a3177b9b8bf 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -53,6 +53,7 @@ enum ucount_type {
 	UCOUNT_RLIMIT_NPROC,
 	UCOUNT_RLIMIT_MSGQUEUE,
 	UCOUNT_RLIMIT_SIGPENDING,
+	UCOUNT_RLIMIT_MEMLOCK,
 	UCOUNT_COUNTS,
 };
 
diff --git a/ipc/shm.c b/ipc/shm.c
index febd88daba8c..003234fbbd17 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -60,7 +60,7 @@ struct shmid_kernel /* private to the kernel */
 	time64_t		shm_ctim;
 	struct pid		*shm_cprid;
 	struct pid		*shm_lprid;
-	struct user_struct	*mlock_user;
+	struct ucounts		*mlock_ucounts;
 
 	/* The task created the shm object.  NULL if the task is dead. */
 	struct task_struct	*shm_creator;
@@ -286,10 +286,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
 	shm_rmid(ns, shp);
 	shm_unlock(shp);
 	if (!is_file_hugepages(shm_file))
-		shmem_lock(shm_file, 0, shp->mlock_user);
-	else if (shp->mlock_user)
+		shmem_lock(shm_file, 0, shp->mlock_ucounts);
+	else if (shp->mlock_ucounts)
 		user_shm_unlock(i_size_read(file_inode(shm_file)),
-				shp->mlock_user);
+				shp->mlock_ucounts);
 	fput(shm_file);
 	ipc_update_pid(&shp->shm_cprid, NULL);
 	ipc_update_pid(&shp->shm_lprid, NULL);
@@ -625,7 +625,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 
 	shp->shm_perm.key = key;
 	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
-	shp->mlock_user = NULL;
+	shp->mlock_ucounts = NULL;
 
 	shp->shm_perm.security = NULL;
 	error = security_shm_alloc(&shp->shm_perm);
@@ -650,7 +650,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 		if (shmflg & SHM_NORESERVE)
 			acctflag = VM_NORESERVE;
 		file = hugetlb_file_setup(name, hugesize, acctflag,
-				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
+				  &shp->mlock_ucounts, HUGETLB_SHMFS_INODE,
 				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
 	} else {
 		/*
@@ -698,8 +698,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 no_id:
 	ipc_update_pid(&shp->shm_cprid, NULL);
 	ipc_update_pid(&shp->shm_lprid, NULL);
-	if (is_file_hugepages(file) && shp->mlock_user)
-		user_shm_unlock(size, shp->mlock_user);
+	if (is_file_hugepages(file) && shp->mlock_ucounts)
+		user_shm_unlock(size, shp->mlock_ucounts);
 	fput(file);
 	ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
 	return error;
@@ -1105,12 +1105,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
 		goto out_unlock0;
 
 	if (cmd == SHM_LOCK) {
-		struct user_struct *user = current_user();
+		struct ucounts *ucounts = current_ucounts();
 
-		err = shmem_lock(shm_file, 1, user);
+		err = shmem_lock(shm_file, 1, ucounts);
 		if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
 			shp->shm_perm.mode |= SHM_LOCKED;
-			shp->mlock_user = user;
+			shp->mlock_ucounts = ucounts;
 		}
 		goto out_unlock0;
 	}
@@ -1118,9 +1118,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
 	/* SHM_UNLOCK */
 	if (!(shp->shm_perm.mode & SHM_LOCKED))
 		goto out_unlock0;
-	shmem_lock(shm_file, 0, shp->mlock_user);
+	shmem_lock(shm_file, 0, shp->mlock_ucounts);
 	shp->shm_perm.mode &= ~SHM_LOCKED;
-	shp->mlock_user = NULL;
+	shp->mlock_ucounts = NULL;
 	get_file(shm_file);
 	ipc_unlock_object(&shp->shm_perm);
 	rcu_read_unlock();
diff --git a/kernel/fork.c b/kernel/fork.c
index 03119926b27d..610fd4de60d7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -825,6 +825,7 @@ void __init fork_init(void)
 	init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
 	init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE);
 	init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING);
+	init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK);
 
 #ifdef CONFIG_VMAP_STACK
 	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8ce62da6a62c..d316bac3e520 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -83,6 +83,7 @@ static struct ctl_table user_table[] = {
 	{ },
 	{ },
 	{ },
+	{ },
 	{ }
 };
 #endif /* CONFIG_SYSCTL */
diff --git a/kernel/user.c b/kernel/user.c
index 6737327f83be..c82399c1618a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
 /* root_user.__count is 1, for init task cred */
 struct user_struct root_user = {
 	.__count	= REFCOUNT_INIT(1),
-	.locked_shm     = 0,
 	.uid		= GLOBAL_ROOT_UID,
 	.ratelimit	= RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
 };
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 822eacee4588..892da1360862 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -125,6 +125,7 @@ int create_user_ns(struct cred *new)
 	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
 	ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE);
 	ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING);
+	ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK);
 	ns->ucounts = ucounts;
 
 	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
diff --git a/mm/memfd.c b/mm/memfd.c
index 2647c898990c..081dd33e6a61 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create,
 	}
 
 	if (flags & MFD_HUGETLB) {
-		struct user_struct *user = NULL;
+		struct ucounts *ucounts = NULL;
 
-		file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+		file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
 					HUGETLB_ANONHUGE_INODE,
 					(flags >> MFD_HUGE_SHIFT) &
 					MFD_HUGE_MASK);
diff --git a/mm/mlock.c b/mm/mlock.c
index f8f8cc32d03d..dd411aabf695 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall)
  */
 static DEFINE_SPINLOCK(shmlock_user_lock);
 
-int user_shm_lock(size_t size, struct user_struct *user)
+int user_shm_lock(size_t size, struct ucounts *ucounts)
 {
 	unsigned long lock_limit, locked;
+	long memlock;
 	int allowed = 0;
 
 	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user)
 		allowed = 1;
 	lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
-	if (!allowed &&
-	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+
+	if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+		goto out;
+	}
+	if (!get_ucounts(ucounts)) {
+		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 		goto out;
-	get_uid(user);
-	user->locked_shm += locked;
+	}
 	allowed = 1;
 out:
 	spin_unlock(&shmlock_user_lock);
 	return allowed;
 }
 
-void user_shm_unlock(size_t size, struct user_struct *user)
+void user_shm_unlock(size_t size, struct ucounts *ucounts)
 {
 	spin_lock(&shmlock_user_lock);
-	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
 	spin_unlock(&shmlock_user_lock);
-	free_uid(user);
+	put_ucounts(ucounts);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f287599a7a3..99f97d200aa4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1605,7 +1605,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 			goto out_fput;
 		}
 	} else if (flags & MAP_HUGETLB) {
-		struct user_struct *user = NULL;
+		struct ucounts *ucounts = NULL;
 		struct hstate *hs;
 
 		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1621,7 +1621,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 		 */
 		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
 				VM_NORESERVE,
-				&user, HUGETLB_ANONHUGE_INODE,
+				&ucounts, HUGETLB_ANONHUGE_INODE,
 				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
 		if (IS_ERR(file))
 			return PTR_ERR(file);
diff --git a/mm/shmem.c b/mm/shmem.c
index b2db4ed0fbc7..7ee6d27222e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2227,7 +2227,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 }
 #endif
 
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
 {
 	struct inode *inode = file_inode(file);
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2239,13 +2239,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
 	 * no serialization needed when called from shm_destroy().
 	 */
 	if (lock && !(info->flags & VM_LOCKED)) {
-		if (!user_shm_lock(inode->i_size, user))
+		if (!user_shm_lock(inode->i_size, ucounts))
 			goto out_nomem;
 		info->flags |= VM_LOCKED;
 		mapping_set_unevictable(file->f_mapping);
 	}
-	if (!lock && (info->flags & VM_LOCKED) && user) {
-		user_shm_unlock(inode->i_size, user);
+	if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+		user_shm_unlock(inode->i_size, ucounts);
 		info->flags &= ~VM_LOCKED;
 		mapping_clear_unevictable(file->f_mapping);
 	}
@@ -4093,7 +4093,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
 	return 0;
 }
 
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From c1ada3dc7219b02b3467aa906c2f5f8b098578d1 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Thu, 22 Apr 2021 14:27:16 +0200
Subject: ucounts: Set ucount_max to the largest positive value the type can
 hold

The ns->ucount_max[] is signed long which is less than the rlimit size.
We have to protect ucount_max[] from overflow and only use the largest
value that we can hold.

On 32bit using "long" instead of "unsigned long" to hold the counts has
the downside that RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK are limited to 2GiB
instead of 4GiB. I don't think anyone cares but it should be mentioned
in case someone does.

The RLIMIT_NPROC and RLIMIT_SIGPENDING used atomic_t so their maximum
hasn't changed.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/1825a5dfa18bc5a570e79feb05e2bd07fd57e7e3.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h | 6 ++++++
 kernel/fork.c                  | 8 ++++----
 kernel/user_namespace.c        | 8 ++++----
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 2a3177b9b8bf..61794ae32fa8 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -125,6 +125,12 @@ long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
 bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
 bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
 
+static inline void set_rlimit_ucount_max(struct user_namespace *ns,
+		enum ucount_type type, unsigned long max)
+{
+	ns->ucount_max[type] = max <= LONG_MAX ? max : LONG_MAX;
+}
+
 #ifdef CONFIG_USER_NS
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
diff --git a/kernel/fork.c b/kernel/fork.c
index 610fd4de60d7..c41820481b2e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -822,10 +822,10 @@ void __init fork_init(void)
 	for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
 		init_user_ns.ucount_max[i] = max_threads/2;
 
-	init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
-	init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE);
-	init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING);
-	init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK);
+	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
+	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
+	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
+	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
 
 #ifdef CONFIG_VMAP_STACK
 	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 892da1360862..d4a545bbab7f 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -122,10 +122,10 @@ int create_user_ns(struct cred *new)
 	for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
 		ns->ucount_max[i] = INT_MAX;
 	}
-	ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
-	ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE);
-	ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING);
-	ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK);
+	set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+	set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
+	set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
+	set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
 	ns->ucounts = ucounts;
 
 	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
-- 
cgit v1.2.3


From 6be388f4a35d2ce5ef7dbf635a8964a5da7f799f Mon Sep 17 00:00:00 2001
From: Anirudh Rayabharam <mail@anirudhrb.com>
Date: Sun, 25 Apr 2021 23:03:53 +0530
Subject: HID: usbhid: fix info leak in hid_submit_ctrl

In hid_submit_ctrl(), the way of calculating the report length doesn't
take into account that report->size can be zero. When running the
syzkaller reproducer, a report of size 0 causes hid_submit_ctrl) to
calculate transfer_buffer_length as 16384. When this urb is passed to
the usb core layer, KMSAN reports an info leak of 16384 bytes.

To fix this, first modify hid_report_len() to account for the zero
report size case by using DIV_ROUND_UP for the division. Then, call it
from hid_submit_ctrl().

Reported-by: syzbot+7c2bb71996f95a82524c@syzkaller.appspotmail.com
Signed-off-by: Anirudh Rayabharam <mail@anirudhrb.com>
Acked-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/usbhid/hid-core.c | 2 +-
 include/linux/hid.h           | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index 86257ce6d619..4e9077363c96 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -374,7 +374,7 @@ static int hid_submit_ctrl(struct hid_device *hid)
 	raw_report = usbhid->ctrl[usbhid->ctrltail].raw_report;
 	dir = usbhid->ctrl[usbhid->ctrltail].dir;
 
-	len = ((report->size - 1) >> 3) + 1 + (report->id > 0);
+	len = hid_report_len(report);
 	if (dir == USB_DIR_OUT) {
 		usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0);
 		usbhid->urbctrl->transfer_buffer_length = len;
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 271021e20a3f..10e922cee4eb 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -1167,8 +1167,7 @@ static inline void hid_hw_wait(struct hid_device *hdev)
  */
 static inline u32 hid_report_len(struct hid_report *report)
 {
-	/* equivalent to DIV_ROUND_UP(report->size, 8) + !!(report->id > 0) */
-	return ((report->size - 1) >> 3) + 1 + (report->id > 0);
+	return DIV_ROUND_UP(report->size, 8) + (report->id > 0);
 }
 
 int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size,
-- 
cgit v1.2.3


From 1333a6779501f4cc662ff5c8b36b0a22f3a7ddc6 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Sat, 24 Apr 2021 13:06:04 +0200
Subject: nvmem: core: allow specifying of_node

Until now, the of_node of the parent device is used. Some devices
provide more than just the nvmem provider. To avoid name space clashes,
add a way to allow specifying the nvmem cells in subnodes. Consider the
following example:

    flash@0 {
        compatible = "jedec,spi-nor";

        partitions {
            compatible = "fixed-partitions";
            #address-cells = <1>;
            #size-cells = <1>;

            partition@0 {
                reg = <0x000000 0x010000>;
            };
        };

        otp {
            compatible = "user-otp";
            #address-cells = <1>;
            #size-cells = <1>;

            serial-number@0 {
                reg = <0x0 0x8>;
            };
        };
    };

There the nvmem provider might be the MTD partition or the OTP region of
the flash.

Add a new config->of_node parameter, which if set, will be used instead
of the parent's of_node.

Signed-off-by: Michael Walle <michael@walle.cc>
Acked-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210424110608.15748-2-michael@walle.cc
---
 drivers/nvmem/core.c           | 4 +++-
 include/linux/nvmem-provider.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index bca671ff4e54..62d363a399d3 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -789,7 +789,9 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config)
 	nvmem->reg_write = config->reg_write;
 	nvmem->keepout = config->keepout;
 	nvmem->nkeepout = config->nkeepout;
-	if (!config->no_of_node)
+	if (config->of_node)
+		nvmem->dev.of_node = config->of_node;
+	else if (!config->no_of_node)
 		nvmem->dev.of_node = config->dev->of_node;
 
 	switch (config->id) {
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index e162b757b6d5..471cb7b9e896 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -57,6 +57,7 @@ struct nvmem_keepout {
  * @type:	Type of the nvmem storage
  * @read_only:	Device is read-only.
  * @root_only:	Device is accessibly to root only.
+ * @of_node:	If given, this will be used instead of the parent's of_node.
  * @no_of_node:	Device should not use the parent's of_node even if it's !NULL.
  * @reg_read:	Callback to read data.
  * @reg_write:	Callback to write data.
@@ -86,6 +87,7 @@ struct nvmem_config {
 	enum nvmem_type		type;
 	bool			read_only;
 	bool			root_only;
+	struct device_node	*of_node;
 	bool			no_of_node;
 	nvmem_reg_read_t	reg_read;
 	nvmem_reg_write_t	reg_write;
-- 
cgit v1.2.3


From 4b361cfa862479fbb1d14ddf01de4dbc7146dcc5 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Sat, 24 Apr 2021 13:06:08 +0200
Subject: mtd: core: add OTP nvmem provider support

Flash OTP regions can already be read via user space. Some boards have
their serial number or MAC addresses stored in the OTP regions. Add
support for them being a (read-only) nvmem provider.

The API to read the OTP data is already in place. It distinguishes
between factory and user OTP, thus there are up to two different
providers.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210424110608.15748-6-michael@walle.cc
---
 drivers/mtd/mtdcore.c   | 148 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/mtd.h |   2 +
 2 files changed, 150 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 1fb43eea3599..3f2e20e22501 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -776,6 +776,146 @@ static void mtd_set_dev_defaults(struct mtd_info *mtd)
 	mutex_init(&mtd->master.chrdev_lock);
 }
 
+static ssize_t mtd_otp_size(struct mtd_info *mtd, bool is_user)
+{
+	struct otp_info *info = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	ssize_t size = 0;
+	unsigned int i;
+	size_t retlen;
+	int ret;
+
+	if (is_user)
+		ret = mtd_get_user_prot_info(mtd, PAGE_SIZE, &retlen, info);
+	else
+		ret = mtd_get_fact_prot_info(mtd, PAGE_SIZE, &retlen, info);
+	if (ret)
+		goto err;
+
+	for (i = 0; i < retlen / sizeof(*info); i++) {
+		size += info->length;
+		info++;
+	}
+
+	kfree(info);
+	return size;
+
+err:
+	kfree(info);
+	return ret;
+}
+
+static struct nvmem_device *mtd_otp_nvmem_register(struct mtd_info *mtd,
+						   const char *compatible,
+						   int size,
+						   nvmem_reg_read_t reg_read)
+{
+	struct nvmem_device *nvmem = NULL;
+	struct nvmem_config config = {};
+	struct device_node *np;
+
+	/* DT binding is optional */
+	np = of_get_compatible_child(mtd->dev.of_node, compatible);
+
+	/* OTP nvmem will be registered on the physical device */
+	config.dev = mtd->dev.parent;
+	/* just reuse the compatible as name */
+	config.name = compatible;
+	config.id = NVMEM_DEVID_NONE;
+	config.owner = THIS_MODULE;
+	config.type = NVMEM_TYPE_OTP;
+	config.root_only = true;
+	config.reg_read = reg_read;
+	config.size = size;
+	config.of_node = np;
+	config.priv = mtd;
+
+	nvmem = nvmem_register(&config);
+	/* Just ignore if there is no NVMEM support in the kernel */
+	if (IS_ERR(nvmem) && PTR_ERR(nvmem) == -EOPNOTSUPP)
+		nvmem = NULL;
+
+	of_node_put(np);
+
+	return nvmem;
+}
+
+static int mtd_nvmem_user_otp_reg_read(void *priv, unsigned int offset,
+				       void *val, size_t bytes)
+{
+	struct mtd_info *mtd = priv;
+	size_t retlen;
+	int ret;
+
+	ret = mtd_read_user_prot_reg(mtd, offset, bytes, &retlen, val);
+	if (ret)
+		return ret;
+
+	return retlen == bytes ? 0 : -EIO;
+}
+
+static int mtd_nvmem_fact_otp_reg_read(void *priv, unsigned int offset,
+				       void *val, size_t bytes)
+{
+	struct mtd_info *mtd = priv;
+	size_t retlen;
+	int ret;
+
+	ret = mtd_read_fact_prot_reg(mtd, offset, bytes, &retlen, val);
+	if (ret)
+		return ret;
+
+	return retlen == bytes ? 0 : -EIO;
+}
+
+static int mtd_otp_nvmem_add(struct mtd_info *mtd)
+{
+	struct nvmem_device *nvmem;
+	ssize_t size;
+	int err;
+
+	if (mtd->_get_user_prot_info && mtd->_read_user_prot_reg) {
+		size = mtd_otp_size(mtd, true);
+		if (size < 0)
+			return size;
+
+		if (size > 0) {
+			nvmem = mtd_otp_nvmem_register(mtd, "user-otp", size,
+						       mtd_nvmem_user_otp_reg_read);
+			if (IS_ERR(nvmem)) {
+				dev_err(&mtd->dev, "Failed to register OTP NVMEM device\n");
+				return PTR_ERR(nvmem);
+			}
+			mtd->otp_user_nvmem = nvmem;
+		}
+	}
+
+	if (mtd->_get_fact_prot_info && mtd->_read_fact_prot_reg) {
+		size = mtd_otp_size(mtd, false);
+		if (size < 0) {
+			err = size;
+			goto err;
+		}
+
+		if (size > 0) {
+			nvmem = mtd_otp_nvmem_register(mtd, "factory-otp", size,
+						       mtd_nvmem_fact_otp_reg_read);
+			if (IS_ERR(nvmem)) {
+				dev_err(&mtd->dev, "Failed to register OTP NVMEM device\n");
+				err = PTR_ERR(nvmem);
+				goto err;
+			}
+			mtd->otp_factory_nvmem = nvmem;
+		}
+	}
+
+	return 0;
+
+err:
+	if (mtd->otp_user_nvmem)
+		nvmem_unregister(mtd->otp_user_nvmem);
+	return err;
+}
+
 /**
  * mtd_device_parse_register - parse partitions and register an MTD device.
  *
@@ -851,6 +991,8 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types,
 		register_reboot_notifier(&mtd->reboot_notifier);
 	}
 
+	ret = mtd_otp_nvmem_add(mtd);
+
 out:
 	if (ret && device_is_registered(&mtd->dev))
 		del_mtd_device(mtd);
@@ -872,6 +1014,12 @@ int mtd_device_unregister(struct mtd_info *master)
 	if (master->_reboot)
 		unregister_reboot_notifier(&master->reboot_notifier);
 
+	if (master->otp_user_nvmem)
+		nvmem_unregister(master->otp_user_nvmem);
+
+	if (master->otp_factory_nvmem)
+		nvmem_unregister(master->otp_factory_nvmem);
+
 	err = del_mtd_partitions(master);
 	if (err)
 		return err;
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index a89955f3cbc8..88227044fc86 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -380,6 +380,8 @@ struct mtd_info {
 	int usecount;
 	struct mtd_debug_info dbg;
 	struct nvmem_device *nvmem;
+	struct nvmem_device *otp_user_nvmem;
+	struct nvmem_device *otp_factory_nvmem;
 
 	/*
 	 * Parent device from the MTD partition point of view.
-- 
cgit v1.2.3


From 0e4768713e71dd224633fd7e00ad358bc48f433a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 23 Apr 2021 21:24:31 +0300
Subject: spi: pxa2xx: Replace header inclusions by forward declarations

When the data structure is only referred by pointer, compiler may not need
to see the contents of the data type. Thus, we may replace header inclusions
by respective forward declarations. Due to above add missed headers as well.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210423182441.50272-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-pxa2xx-dma.c   |  4 ++--
 drivers/spi/spi-pxa2xx-pci.c   |  1 +
 drivers/spi/spi-pxa2xx.c       |  2 ++
 drivers/spi/spi-pxa2xx.h       | 18 ++++++++++--------
 include/linux/spi/pxa2xx_spi.h |  2 ++
 5 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c
index 2e4a49567146..e00dbadd39ec 100644
--- a/drivers/spi/spi-pxa2xx-dma.c
+++ b/drivers/spi/spi-pxa2xx-dma.c
@@ -9,11 +9,11 @@
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
-#include <linux/pxa2xx_ssp.h>
 #include <linux/scatterlist.h>
 #include <linux/sizes.h>
-#include <linux/spi/spi.h>
+
 #include <linux/spi/pxa2xx_spi.h>
+#include <linux/spi/spi.h>
 
 #include "spi-pxa2xx.h"
 
diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c
index f588fad77fc0..a259be12d326 100644
--- a/drivers/spi/spi-pxa2xx-pci.c
+++ b/drivers/spi/spi-pxa2xx-pci.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
+
 #include <linux/spi/pxa2xx_spi.h>
 
 #include <linux/dmaengine.h>
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index 0f3f7d725937..1d4c7f4217ed 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -9,6 +9,7 @@
 #include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/device.h>
+#include <linux/dmaengine.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/gpio/consumer.h>
@@ -25,6 +26,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/property.h>
 #include <linux/slab.h>
+
 #include <linux/spi/pxa2xx_spi.h>
 #include <linux/spi/spi.h>
 
diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h
index 6724d7e056ce..739e264feaa6 100644
--- a/drivers/spi/spi-pxa2xx.h
+++ b/drivers/spi/spi-pxa2xx.h
@@ -7,16 +7,18 @@
 #ifndef SPI_PXA2XX_H
 #define SPI_PXA2XX_H
 
-#include <linux/atomic.h>
-#include <linux/dmaengine.h>
-#include <linux/errno.h>
-#include <linux/io.h>
 #include <linux/interrupt.h>
-#include <linux/pxa2xx_ssp.h>
-#include <linux/scatterlist.h>
+#include <linux/io.h>
+#include <linux/types.h>
 #include <linux/sizes.h>
-#include <linux/spi/spi.h>
-#include <linux/spi/pxa2xx_spi.h>
+
+#include <linux/pxa2xx_ssp.h>
+
+struct gpio_desc;
+struct pxa2xx_spi_controller;
+struct spi_controller;
+struct spi_device;
+struct spi_transfer;
 
 struct driver_data {
 	/* SSP Info */
diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h
index 31f00c7f4f59..1e0e2f136319 100644
--- a/include/linux/spi/pxa2xx_spi.h
+++ b/include/linux/spi/pxa2xx_spi.h
@@ -5,6 +5,8 @@
 #ifndef __linux_pxa2xx_spi_h
 #define __linux_pxa2xx_spi_h
 
+#include <linux/types.h>
+
 #include <linux/pxa2xx_ssp.h>
 
 #define PXA2XX_CS_ASSERT (0x01)
-- 
cgit v1.2.3


From 5edc24901f4d469f8fc943004f73655933e89dbf Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 23 Apr 2021 21:24:32 +0300
Subject: spi: pxa2xx: Unify ifdeffery used in the headers

The two headers have quite different ifdeffery to prevent multiple inclusion.
Unify them with the pattern that in particular reflects their location.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210423182441.50272-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/pxa2xx_ssp.h     | 6 +++---
 include/linux/spi/pxa2xx_spi.h | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 7f73b26ed22e..14b049840faf 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -11,8 +11,8 @@
  *       PXA3xx     SSP1, SSP2, SSP3, SSP4
  */
 
-#ifndef __LINUX_SSP_H
-#define __LINUX_SSP_H
+#ifndef __LINUX_PXA2XX_SSP_H
+#define __LINUX_PXA2XX_SSP_H
 
 #include <linux/bits.h>
 #include <linux/compiler_types.h>
@@ -270,4 +270,4 @@ static inline struct ssp_device *pxa_ssp_request_of(const struct device_node *n,
 static inline void pxa_ssp_free(struct ssp_device *ssp) {}
 #endif
 
-#endif
+#endif	/* __LINUX_PXA2XX_SSP_H */
diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h
index 1e0e2f136319..12ef04d0896d 100644
--- a/include/linux/spi/pxa2xx_spi.h
+++ b/include/linux/spi/pxa2xx_spi.h
@@ -2,8 +2,8 @@
 /*
  * Copyright (C) 2005 Stephen Street / StreetFire Sound Labs
  */
-#ifndef __linux_pxa2xx_spi_h
-#define __linux_pxa2xx_spi_h
+#ifndef __LINUX_SPI_PXA2XX_SPI_H
+#define __LINUX_SPI_PXA2XX_SPI_H
 
 #include <linux/types.h>
 
@@ -51,4 +51,5 @@ struct pxa2xx_spi_chip {
 extern void pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_controller *info);
 
 #endif
-#endif
+
+#endif	/* __LINUX_SPI_PXA2XX_SPI_H */
-- 
cgit v1.2.3


From 1beb37b0e3f98708bfb37778049764b4500756da Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 23 Apr 2021 21:24:33 +0300
Subject: spi: pxa2xx: Group Intel Quark specific definitions

DDS_RATE is Intel Quark specific definition. Move it to the rest
Intel Quark related.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210423182441.50272-7-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/pxa2xx_ssp.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 14b049840faf..1b6c1a0922bd 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -38,7 +38,6 @@ struct device_node;
 #define SSDR		(0x10)  /* SSP Data Write/Data Read Register */
 
 #define SSTO		(0x28)  /* SSP Time Out Register */
-#define DDS_RATE	(0x28)  /* SSP DDS Clock Rate Register (Intel Quark) */
 #define SSPSP		(0x2C)  /* SSP Programmable Serial Protocol */
 #define SSTSA		(0x30)  /* SSP Tx Timeslot Active */
 #define SSRSA		(0x34)  /* SSP Rx Timeslot Active */
@@ -105,6 +104,9 @@ struct device_node;
 #define CE4100_SSCR1_RFT	GENMASK(11, 10)	/* Receive FIFO Threshold (mask) */
 #define CE4100_SSCR1_RxTresh(x) (((x) - 1) << 10)	/* level [1..4] */
 
+/* Intel Quark X1000 */
+#define DDS_RATE		0x28		 /* SSP DDS Clock Rate Register */
+
 /* QUARK_X1000 SSCR0 bit definition */
 #define QUARK_X1000_SSCR0_DSS		GENMASK(4, 0)	/* Data Size Select (mask) */
 #define QUARK_X1000_SSCR0_DataSize(x)	((x) - 1)	/* Data Size Select [4..32] */
-- 
cgit v1.2.3


From 661ee6280931548f7b3b887ad26a157474ae5ac4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sat, 8 May 2021 14:15:38 +0200
Subject: cgroup: introduce cgroup.kill

Introduce the cgroup.kill file. It does what it says on the tin and
allows a caller to kill a cgroup by writing "1" into cgroup.kill.
The file is available in non-root cgroups.

Killing cgroups is a process directed operation, i.e. the whole
thread-group is affected. Consequently trying to write to cgroup.kill in
threaded cgroups will be rejected and EOPNOTSUPP returned. This behavior
aligns with cgroup.procs where reads in threaded-cgroups are rejected
with EOPNOTSUPP.

The cgroup.kill file is write-only since killing a cgroup is an event
not which makes it different from e.g. freezer where a cgroup
transitions between the two states.

As with all new cgroup features cgroup.kill is recursive by default.

Killing a cgroup is protected against concurrent migrations through the
cgroup mutex. To protect against forkbombs and to mitigate the effect of
racing forks a new CGRP_KILL css set lock protected flag is introduced
that is set prior to killing a cgroup and unset after the cgroup has
been killed. We can then check in cgroup_post_fork() where we hold the
css set lock already whether the cgroup is currently being killed. If so
we send the child a SIGKILL signal immediately taking it down as soon as
it returns to userspace. To make the killing of the child semantically
clean it is killed after all cgroup attachment operations have been
finalized.

There are various use-cases of this interface:
- Containers usually have a conservative layout where each container
  usually has a delegated cgroup. For such layouts there is a 1:1
  mapping between container and cgroup. If the container in addition
  uses a separate pid namespace then killing a container usually becomes
  a simple kill -9 <container-init-pid> from an ancestor pid namespace.
  However, there are quite a few scenarios where that isn't true. For
  example, there are containers that share the cgroup with other
  processes on purpose that are supposed to be bound to the lifetime of
  the container but are not in the same pidns of the container.
  Containers that are in a delegated cgroup but share the pid namespace
  with the host or other containers.
- Service managers such as systemd use cgroups to group and organize
  processes belonging to a service. They usually rely on a recursive
  algorithm now to kill a service. With cgroup.kill this becomes a
  simple write to cgroup.kill.
- Userspace OOM implementations can make good use of this feature to
  efficiently take down whole cgroups quickly.
- The kill program can gain a new
  kill --cgroup /sys/fs/cgroup/delegated
  flag to take down cgroups.

A few observations about the semantics:
- If parent and child are in the same cgroup and CLONE_INTO_CGROUP is
  not specified we are not taking cgroup mutex meaning the cgroup can be
  killed while a process in that cgroup is forking.
  If the kill request happens right before cgroup_can_fork() and before
  the parent grabs its siglock the parent is guaranteed to see the
  pending SIGKILL. In addition we perform another check in
  cgroup_post_fork() whether the cgroup is being killed and is so take
  down the child (see above). This is robust enough and protects gainst
  forkbombs. If userspace really really wants to have stricter
  protection the simple solution would be to grab the write side of the
  cgroup threadgroup rwsem which will force all ongoing forks to
  complete before killing starts. We concluded that this is not
  necessary as the semantics for concurrent forking should simply align
  with freezer where a similar check as cgroup_post_fork() is performed.

  For all other cases CLONE_INTO_CGROUP is required. In this case we
  will grab the cgroup mutex so the cgroup can't be killed while we
  fork. Once we're done with the fork and have dropped cgroup mutex we
  are visible and will be found by any subsequent kill request.
- We obviously don't kill kthreads. This means a cgroup that has a
  kthread will not become empty after killing and consequently no
  unpopulated event will be generated. The assumption is that kthreads
  should be in the root cgroup only anyway so this is not an issue.
- We skip killing tasks that already have pending fatal signals.
- Freezer doesn't care about tasks in different pid namespaces, i.e. if
  you have two tasks in different pid namespaces the cgroup would still
  be frozen. The cgroup.kill mechanism consequently behaves the same
  way, i.e. we kill all processes and ignore in which pid namespace they
  exist.
- If the caller is located in a cgroup that is killed the caller will
  obviously be killed as well.

Link: https://lore.kernel.org/r/20210503143922.3093755-1-brauner@kernel.org
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: cgroups@vger.kernel.org
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |   3 ++
 kernel/cgroup/cgroup.c      | 127 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 116 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 559ee05f86b2..43fef771009a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -71,6 +71,9 @@ enum {
 
 	/* Cgroup is frozen. */
 	CGRP_FROZEN,
+
+	/* Control group has to be killed. */
+	CGRP_KILL,
 };
 
 /* cgroup_root->flags */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e049edd66776..e640fc78d731 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3667,6 +3667,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	spin_lock_irq(&css_set_lock);
+	set_bit(CGRP_KILL, &cgrp->flags);
+	spin_unlock_irq(&css_set_lock);
+
+	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+	while ((task = css_task_iter_next(&it))) {
+		/* Ignore kernel threads here. */
+		if (task->flags & PF_KTHREAD)
+			continue;
+
+		/* Skip tasks that are already dying. */
+		if (__fatal_signal_pending(task))
+			continue;
+
+		send_sig(SIGKILL, task, 0);
+	}
+	css_task_iter_end(&it);
+
+	spin_lock_irq(&css_set_lock);
+	clear_bit(CGRP_KILL, &cgrp->flags);
+	spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup *dsct;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+		__cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+				 size_t nbytes, loff_t off)
+{
+	ssize_t ret = 0;
+	int kill;
+	struct cgroup *cgrp;
+
+	ret = kstrtoint(strstrip(buf), 0, &kill);
+	if (ret)
+		return ret;
+
+	if (kill != 1)
+		return -ERANGE;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENOENT;
+
+	/*
+	 * Killing is a process directed operation, i.e. the whole thread-group
+	 * is taken down so act like we do for cgroup.procs and only make this
+	 * writable in non-threaded cgroups.
+	 */
+	if (cgroup_is_threaded(cgrp))
+		ret = -EOPNOTSUPP;
+	else
+		cgroup_kill(cgrp);
+
+	cgroup_kn_unlock(of->kn);
+
+	return ret ?: nbytes;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of_cft(of);
@@ -4859,6 +4933,11 @@ static struct cftype cgroup_base_files[] = {
 		.seq_show = cgroup_freeze_show,
 		.write = cgroup_freeze_write,
 	},
+	{
+		.name = "cgroup.kill",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = cgroup_kill_write,
+	},
 	{
 		.name = "cpu.stat",
 		.seq_show = cpu_stat_show,
@@ -6085,6 +6164,8 @@ void cgroup_post_fork(struct task_struct *child,
 		      struct kernel_clone_args *kargs)
 	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
 {
+	unsigned long cgrp_flags = 0;
+	bool kill = false;
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
 	int i;
@@ -6096,6 +6177,11 @@ void cgroup_post_fork(struct task_struct *child,
 
 	/* init tasks are special, only link regular threads */
 	if (likely(child->pid)) {
+		if (kargs->cgrp)
+			cgrp_flags = kargs->cgrp->flags;
+		else
+			cgrp_flags = cset->dfl_cgrp->flags;
+
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
 		css_set_move_task(child, NULL, cset, false);
@@ -6104,23 +6190,32 @@ void cgroup_post_fork(struct task_struct *child,
 		cset = NULL;
 	}
 
-	/*
-	 * If the cgroup has to be frozen, the new task has too.  Let's set
-	 * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
-	 * frozen state.
-	 */
-	if (unlikely(cgroup_task_freeze(child))) {
-		spin_lock(&child->sighand->siglock);
-		WARN_ON_ONCE(child->frozen);
-		child->jobctl |= JOBCTL_TRAP_FREEZE;
-		spin_unlock(&child->sighand->siglock);
+	if (!(child->flags & PF_KTHREAD)) {
+		if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+			/*
+			 * If the cgroup has to be frozen, the new task has
+			 * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+			 * get the task into the frozen state.
+			 */
+			spin_lock(&child->sighand->siglock);
+			WARN_ON_ONCE(child->frozen);
+			child->jobctl |= JOBCTL_TRAP_FREEZE;
+			spin_unlock(&child->sighand->siglock);
+
+			/*
+			 * Calling cgroup_update_frozen() isn't required here,
+			 * because it will be called anyway a bit later from
+			 * do_freezer_trap(). So we avoid cgroup's transient
+			 * switch from the frozen state and back.
+			 */
+		}
 
 		/*
-		 * Calling cgroup_update_frozen() isn't required here,
-		 * because it will be called anyway a bit later from
-		 * do_freezer_trap(). So we avoid cgroup's transient switch
-		 * from the frozen state and back.
+		 * If the cgroup is to be killed notice it now and take the
+		 * child down right after we finished preparing it for
+		 * userspace.
 		 */
+		kill = test_bit(CGRP_KILL, &cgrp_flags);
 	}
 
 	spin_unlock_irq(&css_set_lock);
@@ -6143,6 +6238,10 @@ void cgroup_post_fork(struct task_struct *child,
 		put_css_set(rcset);
 	}
 
+	/* Cgroup has to be killed so take down child immediately. */
+	if (unlikely(kill))
+		do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
 	cgroup_css_set_put_fork(kargs);
 }
 
-- 
cgit v1.2.3


From bf067edf5d2f5b2948ee7197974a719aae3e526c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 8 May 2021 00:07:47 +0200
Subject: openrisc: always use unaligned-struct header

openrisc is the only architecture using the linux/unaligned/*memmove
infrastructure. There is a comment saying that this version is more
efficient, but this was added in 2011 before the openrisc gcc port
was merged upstream.

I checked a couple of files to see what the actual difference is with
the mainline gcc (9.4 and 11.1), and found that the generic header
seems to produce better code now, regardless of the gcc version.

Specifically, the be_memmove leads to allocating a stack slot and
copying the data one byte at a time, then reading the whole word
from the stack:

00000000 <test_get_unaligned_memmove>:
   0:	9c 21 ff f4 	l.addi r1,r1,-12
   4:	d4 01 10 04 	l.sw 4(r1),r2
   8:	8e 63 00 00 	l.lbz r19,0(r3)
   c:	9c 41 00 0c 	l.addi r2,r1,12
  10:	8e 23 00 01 	l.lbz r17,1(r3)
  14:	db e2 9f f4 	l.sb -12(r2),r19
  18:	db e2 8f f5 	l.sb -11(r2),r17
  1c:	8e 63 00 02 	l.lbz r19,2(r3)
  20:	8e 23 00 03 	l.lbz r17,3(r3)
  24:	d4 01 48 08 	l.sw 8(r1),r9
  28:	db e2 9f f6 	l.sb -10(r2),r19
  2c:	db e2 8f f7 	l.sb -9(r2),r17
  30:	85 62 ff f4 	l.lwz r11,-12(r2)
  34:	85 21 00 08 	l.lwz r9,8(r1)
  38:	84 41 00 04 	l.lwz r2,4(r1)
  3c:	44 00 48 00 	l.jr r9
  40:	9c 21 00 0c 	l.addi r1,r1,12

while the be_struct version reads each byte into a register
and does a shift to the right position:

00000000 <test_get_unaligned_struct>:
   0:	9c 21 ff f8 	l.addi r1,r1,-8
   4:	8e 63 00 00 	l.lbz r19,0(r3)
   8:	aa 20 00 18 	l.ori r17,r0,0x18
   c:	e2 73 88 08 	l.sll r19,r19,r17
  10:	8d 63 00 01 	l.lbz r11,1(r3)
  14:	aa 20 00 10 	l.ori r17,r0,0x10
  18:	e1 6b 88 08 	l.sll r11,r11,r17
  1c:	e1 6b 98 04 	l.or r11,r11,r19
  20:	8e 23 00 02 	l.lbz r17,2(r3)
  24:	aa 60 00 08 	l.ori r19,r0,0x8
  28:	e2 31 98 08 	l.sll r17,r17,r19
  2c:	d4 01 10 00 	l.sw 0(r1),r2
  30:	d4 01 48 04 	l.sw 4(r1),r9
  34:	9c 41 00 08 	l.addi r2,r1,8
  38:	e2 31 58 04 	l.or r17,r17,r11
  3c:	8d 63 00 03 	l.lbz r11,3(r3)
  40:	e1 6b 88 04 	l.or r11,r11,r17
  44:	84 41 00 00 	l.lwz r2,0(r1)
  48:	85 21 00 04 	l.lwz r9,4(r1)
  4c:	44 00 48 00 	l.jr r9
  50:	9c 21 00 08 	l.addi r1,r1,8

According to Stafford Horne, the new version should in fact perform
better.

In the trivial example, the struct version is a few instructions longer,
but building a whole kernel shows an overall reduction in code size,
presumably because it now has to manage fewer stack slots:

   text	   data	    bss	    dec	    hex	filename
4792010	 181480	  82324	5055814	 4d2546	vmlinux-unaligned-memmove
4790642	 181480	  82324	5054446	 4d1fee	vmlinux-unaligned-struct

Remove the memmove version completely and let openrisc use the same
code as everyone else, as a simplification.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Stafford Horne <shorne@gmail.com>
---
 arch/openrisc/include/asm/unaligned.h | 47 -----------------------------------
 include/linux/unaligned/be_memmove.h  | 37 ---------------------------
 include/linux/unaligned/le_memmove.h  | 37 ---------------------------
 include/linux/unaligned/memmove.h     | 46 ----------------------------------
 4 files changed, 167 deletions(-)
 delete mode 100644 arch/openrisc/include/asm/unaligned.h
 delete mode 100644 include/linux/unaligned/be_memmove.h
 delete mode 100644 include/linux/unaligned/le_memmove.h
 delete mode 100644 include/linux/unaligned/memmove.h

(limited to 'include/linux')

diff --git a/arch/openrisc/include/asm/unaligned.h b/arch/openrisc/include/asm/unaligned.h
deleted file mode 100644
index 14353f2101f2..000000000000
--- a/arch/openrisc/include/asm/unaligned.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * OpenRISC Linux
- *
- * Linux architectural port borrowing liberally from similar works of
- * others.  All original copyrights apply as per the original source
- * declaration.
- *
- * OpenRISC implementation:
- * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com>
- * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se>
- * et al.
- */
-
-#ifndef __ASM_OPENRISC_UNALIGNED_H
-#define __ASM_OPENRISC_UNALIGNED_H
-
-/*
- * This is copied from the generic implementation and the C-struct
- * variant replaced with the memmove variant.  The GCC compiler
- * for the OR32 arch optimizes too aggressively for the C-struct
- * variant to work, so use the memmove variant instead.
- *
- * It may be worth considering implementing the unaligned access
- * exception handler and allowing unaligned accesses (access_ok.h)...
- * not sure if it would be much of a performance win without further
- * investigation.
- */
-#include <asm/byteorder.h>
-
-#if defined(__LITTLE_ENDIAN)
-# include <linux/unaligned/le_memmove.h>
-# include <linux/unaligned/be_byteshift.h>
-# include <linux/unaligned/generic.h>
-# define get_unaligned	__get_unaligned_le
-# define put_unaligned	__put_unaligned_le
-#elif defined(__BIG_ENDIAN)
-# include <linux/unaligned/be_memmove.h>
-# include <linux/unaligned/le_byteshift.h>
-# include <linux/unaligned/generic.h>
-# define get_unaligned	__get_unaligned_be
-# define put_unaligned	__put_unaligned_be
-#else
-# error need to define endianess
-#endif
-
-#endif /* __ASM_OPENRISC_UNALIGNED_H */
diff --git a/include/linux/unaligned/be_memmove.h b/include/linux/unaligned/be_memmove.h
deleted file mode 100644
index 7164214a4ba1..000000000000
--- a/include/linux/unaligned/be_memmove.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_BE_MEMMOVE_H
-#define _LINUX_UNALIGNED_BE_MEMMOVE_H
-
-#include <linux/unaligned/memmove.h>
-
-static inline u16 get_unaligned_be16(const void *p)
-{
-	return __get_unaligned_memmove16((const u8 *)p);
-}
-
-static inline u32 get_unaligned_be32(const void *p)
-{
-	return __get_unaligned_memmove32((const u8 *)p);
-}
-
-static inline u64 get_unaligned_be64(const void *p)
-{
-	return __get_unaligned_memmove64((const u8 *)p);
-}
-
-static inline void put_unaligned_be16(u16 val, void *p)
-{
-	__put_unaligned_memmove16(val, p);
-}
-
-static inline void put_unaligned_be32(u32 val, void *p)
-{
-	__put_unaligned_memmove32(val, p);
-}
-
-static inline void put_unaligned_be64(u64 val, void *p)
-{
-	__put_unaligned_memmove64(val, p);
-}
-
-#endif /* _LINUX_UNALIGNED_LE_MEMMOVE_H */
diff --git a/include/linux/unaligned/le_memmove.h b/include/linux/unaligned/le_memmove.h
deleted file mode 100644
index 9202e864d026..000000000000
--- a/include/linux/unaligned/le_memmove.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_LE_MEMMOVE_H
-#define _LINUX_UNALIGNED_LE_MEMMOVE_H
-
-#include <linux/unaligned/memmove.h>
-
-static inline u16 get_unaligned_le16(const void *p)
-{
-	return __get_unaligned_memmove16((const u8 *)p);
-}
-
-static inline u32 get_unaligned_le32(const void *p)
-{
-	return __get_unaligned_memmove32((const u8 *)p);
-}
-
-static inline u64 get_unaligned_le64(const void *p)
-{
-	return __get_unaligned_memmove64((const u8 *)p);
-}
-
-static inline void put_unaligned_le16(u16 val, void *p)
-{
-	__put_unaligned_memmove16(val, p);
-}
-
-static inline void put_unaligned_le32(u32 val, void *p)
-{
-	__put_unaligned_memmove32(val, p);
-}
-
-static inline void put_unaligned_le64(u64 val, void *p)
-{
-	__put_unaligned_memmove64(val, p);
-}
-
-#endif /* _LINUX_UNALIGNED_LE_MEMMOVE_H */
diff --git a/include/linux/unaligned/memmove.h b/include/linux/unaligned/memmove.h
deleted file mode 100644
index ac71b53bc6dc..000000000000
--- a/include/linux/unaligned/memmove.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_MEMMOVE_H
-#define _LINUX_UNALIGNED_MEMMOVE_H
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-/* Use memmove here, so gcc does not insert a __builtin_memcpy. */
-
-static inline u16 __get_unaligned_memmove16(const void *p)
-{
-	u16 tmp;
-	memmove(&tmp, p, 2);
-	return tmp;
-}
-
-static inline u32 __get_unaligned_memmove32(const void *p)
-{
-	u32 tmp;
-	memmove(&tmp, p, 4);
-	return tmp;
-}
-
-static inline u64 __get_unaligned_memmove64(const void *p)
-{
-	u64 tmp;
-	memmove(&tmp, p, 8);
-	return tmp;
-}
-
-static inline void __put_unaligned_memmove16(u16 val, void *p)
-{
-	memmove(p, &val, 2);
-}
-
-static inline void __put_unaligned_memmove32(u32 val, void *p)
-{
-	memmove(p, &val, 4);
-}
-
-static inline void __put_unaligned_memmove64(u64 val, void *p)
-{
-	memmove(p, &val, 8);
-}
-
-#endif /* _LINUX_UNALIGNED_MEMMOVE_H */
-- 
cgit v1.2.3


From 0652035a57945e14e611dafae2ec5b46a05bc1d1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 8 May 2021 00:07:51 +0200
Subject: asm-generic: unaligned: remove byteshift helpers

In theory, compilers should be able to work this out themselves so we
can use a simpler version based on the swab() helpers.

I have verified that this works on all supported compiler versions
(gcc-4.9 and up, clang-10 and up). Looking at the object code produced by
gcc-11, I found that the impact is mostly a change in inlining decisions
that lead to slightly larger code.

In other cases, this version produces explicit byte swaps in place of
separate byte access, or comparing against pre-swapped constants.

While the source code is clearly simpler, I have not seen an indication
of the new version actually producing better code on Arm, so maybe
we want to skip this after all. From what I can tell, gcc recognizes
the byteswap pattern in the byteshift.h header and can turn it into
explicit instructions, but it does not turn a __builtin_bswap32() back
into individual bytes when that would result in better output, e.g.
when storing a byte-reversed constant.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/include/asm/unaligned.h       |  2 -
 include/asm-generic/unaligned.h        |  2 -
 include/linux/unaligned/be_byteshift.h | 71 ----------------------------------
 include/linux/unaligned/be_struct.h    | 30 ++++++++++++++
 include/linux/unaligned/le_byteshift.h | 71 ----------------------------------
 include/linux/unaligned/le_struct.h    | 30 ++++++++++++++
 6 files changed, 60 insertions(+), 146 deletions(-)
 delete mode 100644 include/linux/unaligned/be_byteshift.h
 delete mode 100644 include/linux/unaligned/le_byteshift.h

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/unaligned.h b/arch/arm/include/asm/unaligned.h
index ab905ffcf193..3c5248fb4cdc 100644
--- a/arch/arm/include/asm/unaligned.h
+++ b/arch/arm/include/asm/unaligned.h
@@ -10,13 +10,11 @@
 
 #if defined(__LITTLE_ENDIAN)
 # include <linux/unaligned/le_struct.h>
-# include <linux/unaligned/be_byteshift.h>
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_le
 # define put_unaligned	__put_unaligned_le
 #elif defined(__BIG_ENDIAN)
 # include <linux/unaligned/be_struct.h>
-# include <linux/unaligned/le_byteshift.h>
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_be
 # define put_unaligned	__put_unaligned_be
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h
index 374c940e9be1..d79df721ae60 100644
--- a/include/asm-generic/unaligned.h
+++ b/include/asm-generic/unaligned.h
@@ -16,7 +16,6 @@
 #if defined(__LITTLE_ENDIAN)
 # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 #  include <linux/unaligned/le_struct.h>
-#  include <linux/unaligned/be_byteshift.h>
 # endif
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_le
@@ -24,7 +23,6 @@
 #elif defined(__BIG_ENDIAN)
 # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 #  include <linux/unaligned/be_struct.h>
-#  include <linux/unaligned/le_byteshift.h>
 # endif
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_be
diff --git a/include/linux/unaligned/be_byteshift.h b/include/linux/unaligned/be_byteshift.h
deleted file mode 100644
index c43ff5918c8a..000000000000
--- a/include/linux/unaligned/be_byteshift.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H
-#define _LINUX_UNALIGNED_BE_BYTESHIFT_H
-
-#include <linux/types.h>
-
-static inline u16 __get_unaligned_be16(const u8 *p)
-{
-	return p[0] << 8 | p[1];
-}
-
-static inline u32 __get_unaligned_be32(const u8 *p)
-{
-	return p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
-}
-
-static inline u64 __get_unaligned_be64(const u8 *p)
-{
-	return (u64)__get_unaligned_be32(p) << 32 |
-	       __get_unaligned_be32(p + 4);
-}
-
-static inline void __put_unaligned_be16(u16 val, u8 *p)
-{
-	*p++ = val >> 8;
-	*p++ = val;
-}
-
-static inline void __put_unaligned_be32(u32 val, u8 *p)
-{
-	__put_unaligned_be16(val >> 16, p);
-	__put_unaligned_be16(val, p + 2);
-}
-
-static inline void __put_unaligned_be64(u64 val, u8 *p)
-{
-	__put_unaligned_be32(val >> 32, p);
-	__put_unaligned_be32(val, p + 4);
-}
-
-static inline u16 get_unaligned_be16(const void *p)
-{
-	return __get_unaligned_be16(p);
-}
-
-static inline u32 get_unaligned_be32(const void *p)
-{
-	return __get_unaligned_be32(p);
-}
-
-static inline u64 get_unaligned_be64(const void *p)
-{
-	return __get_unaligned_be64(p);
-}
-
-static inline void put_unaligned_be16(u16 val, void *p)
-{
-	__put_unaligned_be16(val, p);
-}
-
-static inline void put_unaligned_be32(u32 val, void *p)
-{
-	__put_unaligned_be32(val, p);
-}
-
-static inline void put_unaligned_be64(u64 val, void *p)
-{
-	__put_unaligned_be64(val, p);
-}
-
-#endif /* _LINUX_UNALIGNED_BE_BYTESHIFT_H */
diff --git a/include/linux/unaligned/be_struct.h b/include/linux/unaligned/be_struct.h
index 15ea503a13fc..76d9fe297c33 100644
--- a/include/linux/unaligned/be_struct.h
+++ b/include/linux/unaligned/be_struct.h
@@ -34,4 +34,34 @@ static inline void put_unaligned_be64(u64 val, void *p)
 	__put_unaligned_cpu64(val, p);
 }
 
+static inline u16 get_unaligned_le16(const void *p)
+{
+	return swab16(__get_unaligned_cpu16((const u8 *)p));
+}
+
+static inline u32 get_unaligned_le32(const void *p)
+{
+	return swab32(__get_unaligned_cpu32((const u8 *)p));
+}
+
+static inline u64 get_unaligned_le64(const void *p)
+{
+	return swab64(__get_unaligned_cpu64((const u8 *)p));
+}
+
+static inline void put_unaligned_le16(u16 val, void *p)
+{
+	__put_unaligned_cpu16(swab16(val), p);
+}
+
+static inline void put_unaligned_le32(u32 val, void *p)
+{
+	__put_unaligned_cpu32(swab32(val), p);
+}
+
+static inline void put_unaligned_le64(u64 val, void *p)
+{
+	__put_unaligned_cpu64(swab64(val), p);
+}
+
 #endif /* _LINUX_UNALIGNED_BE_STRUCT_H */
diff --git a/include/linux/unaligned/le_byteshift.h b/include/linux/unaligned/le_byteshift.h
deleted file mode 100644
index 2248dcb0df76..000000000000
--- a/include/linux/unaligned/le_byteshift.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H
-#define _LINUX_UNALIGNED_LE_BYTESHIFT_H
-
-#include <linux/types.h>
-
-static inline u16 __get_unaligned_le16(const u8 *p)
-{
-	return p[0] | p[1] << 8;
-}
-
-static inline u32 __get_unaligned_le32(const u8 *p)
-{
-	return p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24;
-}
-
-static inline u64 __get_unaligned_le64(const u8 *p)
-{
-	return (u64)__get_unaligned_le32(p + 4) << 32 |
-	       __get_unaligned_le32(p);
-}
-
-static inline void __put_unaligned_le16(u16 val, u8 *p)
-{
-	*p++ = val;
-	*p++ = val >> 8;
-}
-
-static inline void __put_unaligned_le32(u32 val, u8 *p)
-{
-	__put_unaligned_le16(val >> 16, p + 2);
-	__put_unaligned_le16(val, p);
-}
-
-static inline void __put_unaligned_le64(u64 val, u8 *p)
-{
-	__put_unaligned_le32(val >> 32, p + 4);
-	__put_unaligned_le32(val, p);
-}
-
-static inline u16 get_unaligned_le16(const void *p)
-{
-	return __get_unaligned_le16(p);
-}
-
-static inline u32 get_unaligned_le32(const void *p)
-{
-	return __get_unaligned_le32(p);
-}
-
-static inline u64 get_unaligned_le64(const void *p)
-{
-	return __get_unaligned_le64(p);
-}
-
-static inline void put_unaligned_le16(u16 val, void *p)
-{
-	__put_unaligned_le16(val, p);
-}
-
-static inline void put_unaligned_le32(u32 val, void *p)
-{
-	__put_unaligned_le32(val, p);
-}
-
-static inline void put_unaligned_le64(u64 val, void *p)
-{
-	__put_unaligned_le64(val, p);
-}
-
-#endif /* _LINUX_UNALIGNED_LE_BYTESHIFT_H */
diff --git a/include/linux/unaligned/le_struct.h b/include/linux/unaligned/le_struct.h
index 9977987883a6..22f90a4afaa5 100644
--- a/include/linux/unaligned/le_struct.h
+++ b/include/linux/unaligned/le_struct.h
@@ -34,4 +34,34 @@ static inline void put_unaligned_le64(u64 val, void *p)
 	__put_unaligned_cpu64(val, p);
 }
 
+static inline u16 get_unaligned_be16(const void *p)
+{
+	return swab16(__get_unaligned_cpu16((const u8 *)p));
+}
+
+static inline u32 get_unaligned_be32(const void *p)
+{
+	return swab32(__get_unaligned_cpu32((const u8 *)p));
+}
+
+static inline u64 get_unaligned_be64(const void *p)
+{
+	return swab64(__get_unaligned_cpu64((const u8 *)p));
+}
+
+static inline void put_unaligned_be16(u16 val, void *p)
+{
+	__put_unaligned_cpu16(swab16(val), p);
+}
+
+static inline void put_unaligned_be32(u32 val, void *p)
+{
+	__put_unaligned_cpu32(swab32(val), p);
+}
+
+static inline void put_unaligned_be64(u64 val, void *p)
+{
+	__put_unaligned_cpu64(swab64(val), p);
+}
+
 #endif /* _LINUX_UNALIGNED_LE_STRUCT_H */
-- 
cgit v1.2.3


From 778aaefb8e864fc61f850539ea479554dd4caea1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 8 May 2021 00:07:52 +0200
Subject: asm-generic: unaligned always use struct helpers

As found by Vineet Gupta and Linus Torvalds, gcc has somewhat unexpected
behavior when faced with overlapping unaligned pointers. The kernel's
unaligned/access-ok.h header technically invokes undefined behavior
that happens to usually work on the architectures using it, but if the
compiler optimizes code based on the assumption that undefined behavior
doesn't happen, it can create output that actually causes data corruption.

A related problem was previously found on 32-bit ARMv7, where most
instructions can be used on unaligned data, but 64-bit ldrd/strd causes
an exception. The workaround was to always use the unaligned/le_struct.h
helper instead of unaligned/access-ok.h, in commit 1cce91dfc8f7 ("ARM:
8715/1: add a private asm/unaligned.h").

The same solution should work on all other architectures as well, so
remove the access-ok.h variant and use the other one unconditionally on
all architectures, picking either the big-endian or little-endian version.

With this, the arm specific header can be removed as well, and the
only file including linux/unaligned/access_ok.h gets moved to including
the normal file.

Fortunately, this made almost no difference to the object code produced
by gcc-11. On x86, s390, powerpc, and arc, the resulting binary appears
to be identical to the previous version, while on arm64 and m68k there
are minimal differences that looks like an optimization pass went into
a different direction, usually using fewer stack spills on the new
version.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363
---
 arch/arm/include/asm/unaligned.h    | 25 --------------
 arch/mips/crypto/crc32-mips.c       |  2 +-
 include/asm-generic/unaligned.h     | 13 ++-----
 include/linux/unaligned/access_ok.h | 68 -------------------------------------
 4 files changed, 3 insertions(+), 105 deletions(-)
 delete mode 100644 arch/arm/include/asm/unaligned.h
 delete mode 100644 include/linux/unaligned/access_ok.h

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/unaligned.h b/arch/arm/include/asm/unaligned.h
deleted file mode 100644
index 3c5248fb4cdc..000000000000
--- a/arch/arm/include/asm/unaligned.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __ASM_ARM_UNALIGNED_H
-#define __ASM_ARM_UNALIGNED_H
-
-/*
- * We generally want to set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS on ARMv6+,
- * but we don't want to use linux/unaligned/access_ok.h since that can lead
- * to traps on unaligned stm/ldm or strd/ldrd.
- */
-#include <asm/byteorder.h>
-
-#if defined(__LITTLE_ENDIAN)
-# include <linux/unaligned/le_struct.h>
-# include <linux/unaligned/generic.h>
-# define get_unaligned	__get_unaligned_le
-# define put_unaligned	__put_unaligned_le
-#elif defined(__BIG_ENDIAN)
-# include <linux/unaligned/be_struct.h>
-# include <linux/unaligned/generic.h>
-# define get_unaligned	__get_unaligned_be
-# define put_unaligned	__put_unaligned_be
-#else
-# error need to define endianess
-#endif
-
-#endif /* __ASM_ARM_UNALIGNED_H */
diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c
index faa88a6a74c0..0a03529cf317 100644
--- a/arch/mips/crypto/crc32-mips.c
+++ b/arch/mips/crypto/crc32-mips.c
@@ -8,13 +8,13 @@
  * Copyright (C) 2018 MIPS Tech, LLC
  */
 
-#include <linux/unaligned/access_ok.h>
 #include <linux/cpufeature.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <asm/mipsregs.h>
+#include <asm/unaligned.h>
 
 #include <crypto/internal/hash.h>
 
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h
index d79df721ae60..36bf03aaa674 100644
--- a/include/asm-generic/unaligned.h
+++ b/include/asm-generic/unaligned.h
@@ -8,22 +8,13 @@
  */
 #include <asm/byteorder.h>
 
-/* Set by the arch if it can handle unaligned accesses in hardware. */
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-# include <linux/unaligned/access_ok.h>
-#endif
-
 #if defined(__LITTLE_ENDIAN)
-# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-#  include <linux/unaligned/le_struct.h>
-# endif
+# include <linux/unaligned/le_struct.h>
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_le
 # define put_unaligned	__put_unaligned_le
 #elif defined(__BIG_ENDIAN)
-# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-#  include <linux/unaligned/be_struct.h>
-# endif
+# include <linux/unaligned/be_struct.h>
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_be
 # define put_unaligned	__put_unaligned_be
diff --git a/include/linux/unaligned/access_ok.h b/include/linux/unaligned/access_ok.h
deleted file mode 100644
index 167aa849c0ce..000000000000
--- a/include/linux/unaligned/access_ok.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_ACCESS_OK_H
-#define _LINUX_UNALIGNED_ACCESS_OK_H
-
-#include <linux/kernel.h>
-#include <asm/byteorder.h>
-
-static __always_inline u16 get_unaligned_le16(const void *p)
-{
-	return le16_to_cpup((__le16 *)p);
-}
-
-static __always_inline u32 get_unaligned_le32(const void *p)
-{
-	return le32_to_cpup((__le32 *)p);
-}
-
-static __always_inline u64 get_unaligned_le64(const void *p)
-{
-	return le64_to_cpup((__le64 *)p);
-}
-
-static __always_inline u16 get_unaligned_be16(const void *p)
-{
-	return be16_to_cpup((__be16 *)p);
-}
-
-static __always_inline u32 get_unaligned_be32(const void *p)
-{
-	return be32_to_cpup((__be32 *)p);
-}
-
-static __always_inline u64 get_unaligned_be64(const void *p)
-{
-	return be64_to_cpup((__be64 *)p);
-}
-
-static __always_inline void put_unaligned_le16(u16 val, void *p)
-{
-	*((__le16 *)p) = cpu_to_le16(val);
-}
-
-static __always_inline void put_unaligned_le32(u32 val, void *p)
-{
-	*((__le32 *)p) = cpu_to_le32(val);
-}
-
-static __always_inline void put_unaligned_le64(u64 val, void *p)
-{
-	*((__le64 *)p) = cpu_to_le64(val);
-}
-
-static __always_inline void put_unaligned_be16(u16 val, void *p)
-{
-	*((__be16 *)p) = cpu_to_be16(val);
-}
-
-static __always_inline void put_unaligned_be32(u32 val, void *p)
-{
-	*((__be32 *)p) = cpu_to_be32(val);
-}
-
-static __always_inline void put_unaligned_be64(u64 val, void *p)
-{
-	*((__be64 *)p) = cpu_to_be64(val);
-}
-
-#endif /* _LINUX_UNALIGNED_ACCESS_OK_H */
-- 
cgit v1.2.3


From 9d9d415f0048e4f7a6109595e2d1657850569c6c Mon Sep 17 00:00:00 2001
From: "Radu Pirea (NXP OSS)" <radu-nicolae.pirea@oss.nxp.com>
Date: Mon, 10 May 2021 18:34:32 +0300
Subject: ptp: ptp_clock: make scaled_ppm_to_ppb static inline

Make scaled_ppm_to_ppb static inline to be able to build drivers that
use this function even with PTP_1588_CLOCK disabled.

Signed-off-by: Radu Pirea (NXP OSS) <radu-nicolae.pirea@oss.nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c          | 21 ---------------------
 include/linux/ptp_clock_kernel.h | 34 ++++++++++++++++++++++++++--------
 2 files changed, 26 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 03a246e60fd9..a780435331c8 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -63,27 +63,6 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue,
 	spin_unlock_irqrestore(&queue->lock, flags);
 }
 
-s32 scaled_ppm_to_ppb(long ppm)
-{
-	/*
-	 * The 'freq' field in the 'struct timex' is in parts per
-	 * million, but with a 16 bit binary fractional field.
-	 *
-	 * We want to calculate
-	 *
-	 *    ppb = scaled_ppm * 1000 / 2^16
-	 *
-	 * which simplifies to
-	 *
-	 *    ppb = scaled_ppm * 125 / 2^13
-	 */
-	s64 ppb = 1 + ppm;
-	ppb *= 125;
-	ppb >>= 13;
-	return (s32) ppb;
-}
-EXPORT_SYMBOL(scaled_ppm_to_ppb);
-
 /* posix clock implementation */
 
 static int ptp_clock_getres(struct posix_clock *pc, struct timespec64 *tp)
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 0d47fd33b228..a311bddd9e85 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -186,6 +186,32 @@ struct ptp_clock_event {
 	};
 };
 
+/**
+ * scaled_ppm_to_ppb() - convert scaled ppm to ppb
+ *
+ * @ppm:    Parts per million, but with a 16 bit binary fractional field
+ */
+static inline s32 scaled_ppm_to_ppb(long ppm)
+{
+	/*
+	 * The 'freq' field in the 'struct timex' is in parts per
+	 * million, but with a 16 bit binary fractional field.
+	 *
+	 * We want to calculate
+	 *
+	 *    ppb = scaled_ppm * 1000 / 2^16
+	 *
+	 * which simplifies to
+	 *
+	 *    ppb = scaled_ppm * 125 / 2^13
+	 */
+	s64 ppb = 1 + ppm;
+
+	ppb *= 125;
+	ppb >>= 13;
+	return (s32)ppb;
+}
+
 #if IS_REACHABLE(CONFIG_PTP_1588_CLOCK)
 
 /**
@@ -229,14 +255,6 @@ extern void ptp_clock_event(struct ptp_clock *ptp,
 
 extern int ptp_clock_index(struct ptp_clock *ptp);
 
-/**
- * scaled_ppm_to_ppb() - convert scaled ppm to ppb
- *
- * @ppm:    Parts per million, but with a 16 bit binary fractional field
- */
-
-extern s32 scaled_ppm_to_ppb(long ppm);
-
 /**
  * ptp_find_pin() - obtain the pin index of a given auxiliary function
  *
-- 
cgit v1.2.3


From 258ca95e2cd9a0fcc4508a1bf1742b1a3e9a7bbb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 23 Feb 2021 01:10:04 +0100
Subject: timer: Revert "timer: Add timer_curr_running()"

This reverts commit dcd42591ebb8a25895b551a5297ea9c24414ba54.
The only user was RCU/nocb.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/timer.h |  2 --
 kernel/time/timer.c   | 14 --------------
 2 files changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 4118a97e62fb..fda13c9d1256 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -192,8 +192,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer);
 
 #define del_singleshot_timer_sync(t) del_timer_sync(t)
 
-extern bool timer_curr_running(struct timer_list *timer);
-
 extern void init_timers(void);
 struct hrtimer;
 extern enum hrtimer_restart it_real_fn(struct hrtimer *);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d111adf4a0cb..84332f01dc57 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1237,20 +1237,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
 }
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
-bool timer_curr_running(struct timer_list *timer)
-{
-	int i;
-
-	for (i = 0; i < NR_BASES; i++) {
-		struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
-
-		if (base->running_timer == timer)
-			return true;
-	}
-
-	return false;
-}
-
 #ifdef CONFIG_PREEMPT_RT
 static __init void timer_base_init_expiry_lock(struct timer_base *base)
 {
-- 
cgit v1.2.3


From 7bf0a6141ab9c1d113bd85d6d13d43903a4278ba Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 9 Apr 2021 00:38:58 +0200
Subject: srcu: Unconditionally embed struct lockdep_map

Since struct lockdep_map has zero size when CONFIG_DEBUG_LOCK_ALLOC=n,
this commit removes the #ifdef from the srcu_struct structure's ->dep_map.
This change will simplify further manipulations of this field.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutree.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 9cfcc8a756ae..cb1f4351e8ba 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -82,9 +82,7 @@ struct srcu_struct {
 						/*  callback for the barrier */
 						/*  operation. */
 	struct delayed_work work;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 };
 
 /* Values for state variable (bottom bits of ->srcu_gp_seq). */
-- 
cgit v1.2.3


From 8e9c01c717df7e05c5bd1ca86aaa3a74b31f37f1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 9 Apr 2021 00:38:59 +0200
Subject: srcu: Initialize SRCU after timers

Once srcu_init() is called, the SRCU core will make use of delayed
workqueues, which rely on timers.  However init_timers() is called
several steps after rcu_init().  This means that a call_srcu() after
rcu_init() but before init_timers() would find itself within a dangerously
uninitialized timer core.

This commit therefore creates a separate call to srcu_init() after
init_timer() completes, which ensures that we stay in early SRCU mode
until timers are safe(r).

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcu.h  | 6 ++++++
 init/main.c           | 2 ++
 kernel/rcu/rcu.h      | 6 ------
 kernel/rcu/srcutree.c | 5 +++++
 kernel/rcu/tiny.c     | 1 -
 kernel/rcu/tree.c     | 1 -
 6 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index a0895bbf71ce..e6011a9975af 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -64,6 +64,12 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
 unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
 bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
 
+#ifdef CONFIG_SRCU
+void srcu_init(void);
+#else /* #ifdef CONFIG_SRCU */
+static inline void srcu_init(void) { }
+#endif /* #else #ifdef CONFIG_SRCU */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 /**
diff --git a/init/main.c b/init/main.c
index eb01e121d2f1..7b6f49c4d388 100644
--- a/init/main.c
+++ b/init/main.c
@@ -42,6 +42,7 @@
 #include <linux/profile.h>
 #include <linux/kfence.h>
 #include <linux/rcupdate.h>
+#include <linux/srcu.h>
 #include <linux/moduleparam.h>
 #include <linux/kallsyms.h>
 #include <linux/writeback.h>
@@ -979,6 +980,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 	tick_init();
 	rcu_init_nohz();
 	init_timers();
+	srcu_init();
 	hrtimers_init();
 	softirq_init();
 	timekeeping_init();
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bf0827d4b659..ca3f2af32bf8 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -422,12 +422,6 @@ do {									\
 
 #endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
 
-#ifdef CONFIG_SRCU
-void srcu_init(void);
-#else /* #ifdef CONFIG_SRCU */
-static inline void srcu_init(void) { }
-#endif /* #else #ifdef CONFIG_SRCU */
-
 #ifdef CONFIG_TINY_RCU
 /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
 static inline bool rcu_gp_is_normal(void) { return true; }
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index f4f0cbf7a02b..9ec35c158740 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1384,6 +1384,11 @@ void __init srcu_init(void)
 {
 	struct srcu_struct *ssp;
 
+	/*
+	 * Once that is set, call_srcu() can follow the normal path and
+	 * queue delayed work. This must follow RCU workqueues creation
+	 * and timers initialization.
+	 */
 	srcu_init_done = true;
 	while (!list_empty(&srcu_boot_list)) {
 		ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c8a029fbb114..340b3f8b090d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -221,5 +221,4 @@ void __init rcu_init(void)
 {
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 	rcu_early_boot_tests();
-	srcu_init();
 }
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e78b2430c16..c35b15229cff 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4735,7 +4735,6 @@ void __init rcu_init(void)
 	WARN_ON(!rcu_gp_wq);
 	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_par_gp_wq);
-	srcu_init();
 
 	/* Fill in default value for rcutree.qovld boot parameter. */
 	/* -After- the rcu_node ->lock fields are initialized! */
-- 
cgit v1.2.3


From c9e73e3d2b1eb1ea7ff068e05007eec3bd8ef1c9 Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Thu, 29 Apr 2021 14:46:56 +0100
Subject: bpf: verifier: Allocate idmap scratch in verifier env

func_states_equal makes a very short lived allocation for idmap,
probably because it's too large to fit on the stack. However the
function is called quite often, leading to a lot of alloc / free
churn. Replace the temporary allocation with dedicated scratch
space in struct bpf_verifier_env.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://lore.kernel.org/bpf/20210429134656.122225-4-lmb@cloudflare.com
---
 include/linux/bpf_verifier.h |  8 ++++++++
 kernel/bpf/verifier.c        | 46 +++++++++++++++-----------------------------
 2 files changed, 23 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 06841517ab1e..d4632aa3ca50 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -215,6 +215,13 @@ struct bpf_idx_pair {
 	u32 idx;
 };
 
+struct bpf_id_pair {
+	u32 old;
+	u32 cur;
+};
+
+/* Maximum number of register states that can exist at once */
+#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
 #define MAX_CALL_FRAMES 8
 struct bpf_verifier_state {
 	/* call stack tracking */
@@ -418,6 +425,7 @@ struct bpf_verifier_env {
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
 	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
+	struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE];
 	struct {
 		int *insn_state;
 		int *insn_stack;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 380c8ad49b7f..bdfdb54676ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9748,13 +9748,6 @@ static bool range_within(struct bpf_reg_state *old,
 	       old->s32_max_value >= cur->s32_max_value;
 }
 
-/* Maximum number of register states that can exist at once */
-#define ID_MAP_SIZE	(MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
-struct idpair {
-	u32 old;
-	u32 cur;
-};
-
 /* If in the old state two registers had the same id, then they need to have
  * the same id in the new state as well.  But that id could be different from
  * the old state, so we need to track the mapping from old to new ids.
@@ -9765,11 +9758,11 @@ struct idpair {
  * So we look through our idmap to see if this old id has been seen before.  If
  * so, we require the new id to match; otherwise, we add the id pair to the map.
  */
-static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
 {
 	unsigned int i;
 
-	for (i = 0; i < ID_MAP_SIZE; i++) {
+	for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
 		if (!idmap[i].old) {
 			/* Reached an empty slot; haven't seen this id before */
 			idmap[i].old = old_id;
@@ -9882,7 +9875,7 @@ next:
 
 /* Returns true if (rold safe implies rcur safe) */
 static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
-		    struct idpair *idmap)
+		    struct bpf_id_pair *idmap)
 {
 	bool equal;
 
@@ -10000,7 +9993,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 
 static bool stacksafe(struct bpf_func_state *old,
 		      struct bpf_func_state *cur,
-		      struct idpair *idmap)
+		      struct bpf_id_pair *idmap)
 {
 	int i, spi;
 
@@ -10097,32 +10090,23 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
  * whereas register type in current state is meaningful, it means that
  * the current state will reach 'bpf_exit' instruction safely
  */
-static bool func_states_equal(struct bpf_func_state *old,
+static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			      struct bpf_func_state *cur)
 {
-	struct idpair *idmap;
-	bool ret = false;
 	int i;
 
-	idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
-	/* If we failed to allocate the idmap, just say it's not safe */
-	if (!idmap)
-		return false;
-
-	for (i = 0; i < MAX_BPF_REG; i++) {
-		if (!regsafe(&old->regs[i], &cur->regs[i], idmap))
-			goto out_free;
-	}
+	memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
+			return false;
 
-	if (!stacksafe(old, cur, idmap))
-		goto out_free;
+	if (!stacksafe(old, cur, env->idmap_scratch))
+		return false;
 
 	if (!refsafe(old, cur))
-		goto out_free;
-	ret = true;
-out_free:
-	kfree(idmap);
-	return ret;
+		return false;
+
+	return true;
 }
 
 static bool states_equal(struct bpf_verifier_env *env,
@@ -10149,7 +10133,7 @@ static bool states_equal(struct bpf_verifier_env *env,
 	for (i = 0; i <= old->curframe; i++) {
 		if (old->frame[i]->callsite != cur->frame[i]->callsite)
 			return false;
-		if (!func_states_equal(old->frame[i], cur->frame[i]))
+		if (!func_states_equal(env, old->frame[i], cur->frame[i]))
 			return false;
 	}
 	return true;
-- 
cgit v1.2.3


From ce7c169dee28866539abb0e603b9a23055d30fdc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 30 Mar 2021 13:23:49 -0700
Subject: rcu: Remove the unused rcu_irq_exit_preempt() function

Commit 9ee01e0f69a9 ("x86/entry: Clean up idtentry_enter/exit()
leftovers") left the rcu_irq_exit_preempt() in place in order to avoid
conflicts with the -rcu tree.  Now that this change has long since hit
mainline, this commit removes the no-longer-used rcu_irq_exit_preempt()
function.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h |  1 -
 include/linux/rcutree.h |  1 -
 kernel/rcu/tree.c       | 22 ----------------------
 3 files changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 35e0be326ffc..953e70fafe38 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -86,7 +86,6 @@ static inline void rcu_irq_enter(void) { }
 static inline void rcu_irq_exit_irqson(void) { }
 static inline void rcu_irq_enter_irqson(void) { }
 static inline void rcu_irq_exit(void) { }
-static inline void rcu_irq_exit_preempt(void) { }
 static inline void rcu_irq_exit_check_preempt(void) { }
 #define rcu_is_idle_cpu(cpu) \
 	(is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq())
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index b89b54130f49..53209d669400 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -49,7 +49,6 @@ void rcu_idle_enter(void);
 void rcu_idle_exit(void);
 void rcu_irq_enter(void);
 void rcu_irq_exit(void);
-void rcu_irq_exit_preempt(void);
 void rcu_irq_enter_irqson(void);
 void rcu_irq_exit_irqson(void);
 bool rcu_is_idle_cpu(int cpu);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e78b2430c16..f6543b8004c0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -833,28 +833,6 @@ void noinstr rcu_irq_exit(void)
 	rcu_nmi_exit();
 }
 
-/**
- * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
- *			  towards in kernel preemption
- *
- * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
- * from RCU point of view. Invoked from return from interrupt before kernel
- * preemption.
- */
-void rcu_irq_exit_preempt(void)
-{
-	lockdep_assert_irqs_disabled();
-	rcu_nmi_exit();
-
-	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
-			 "RCU dynticks_nesting counter underflow/zero!");
-	RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
-			 DYNTICK_IRQ_NONIDLE,
-			 "Bad RCU  dynticks_nmi_nesting counter\n");
-	RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
-			 "RCU in extended quiescent state!");
-}
-
 #ifdef CONFIG_PROVE_RCU
 /**
  * rcu_irq_exit_check_preempt - Validate that scheduling is possible
-- 
cgit v1.2.3


From 3066820034b5dd4e89bd74a7739c51c2d6f5e554 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 5 Apr 2021 09:51:05 -0700
Subject: rcu: Reject RCU_LOCKDEP_WARN() false positives

If another lockdep report runs concurrently with an RCU lockdep report
from RCU_LOCKDEP_WARN(), the following sequence of events can occur:

1.	debug_lockdep_rcu_enabled() sees that lockdep is enabled
	when called from (say) synchronize_rcu().

2.	Lockdep is disabled by a concurrent lockdep report.

3.	debug_lockdep_rcu_enabled() evaluates its lockdep-expression
	argument, for example, lock_is_held(&rcu_bh_lock_map).

4.	Because lockdep is now disabled, lock_is_held() plays it safe and
	returns the constant 1.

5.	But in this case, the constant 1 is not safe, because invoking
	synchronize_rcu() under rcu_read_lock_bh() is disallowed.

6.	debug_lockdep_rcu_enabled() wrongly invokes lockdep_rcu_suspicious(),
	resulting in a false-positive splat.

This commit therefore changes RCU_LOCKDEP_WARN() to check
debug_lockdep_rcu_enabled() after checking the lockdep expression,
so that any "safe" returns from lock_is_held() are rejected by
debug_lockdep_rcu_enabled().  This requires memory ordering, which is
supplied by READ_ONCE(debug_locks).  The resulting volatile accesses
prevent the compiler from reordering and the fact that only one variable
is being accessed prevents the underlying hardware from reordering.
The combination works for IA64, which can reorder reads to the same
location, but this is defeated by the volatile accesses, which compile
to load instructions that provide ordering.

Reported-by: syzbot+dde0cc33951735441301@syzkaller.appspotmail.com
Reported-by: Matthew Wilcox <willy@infradead.org>
Reported-by: syzbot+88e4f02896967fe1ab0d@syzkaller.appspotmail.com
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 2 +-
 kernel/rcu/update.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 9455476c5ba2..1199ffd305d1 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -315,7 +315,7 @@ static inline int rcu_read_lock_any_held(void)
 #define RCU_LOCKDEP_WARN(c, s)						\
 	do {								\
 		static bool __section(".data.unlikely") __warned;	\
-		if (debug_lockdep_rcu_enabled() && !__warned && (c)) {	\
+		if ((c) && debug_lockdep_rcu_enabled() && !__warned) {	\
 			__warned = true;				\
 			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
 		}							\
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index b95ae86c40a7..dd94a602a6d2 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -277,7 +277,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
 
 noinstr int notrace debug_lockdep_rcu_enabled(void)
 {
-	return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
+	return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) &&
 	       current->lockdep_recursion == 0;
 }
 EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
-- 
cgit v1.2.3


From f4f809f66b7545b89bff4b132cdb37adc2d2c157 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 10 May 2021 14:39:46 -0700
Subject: cgroup: inline cgroup_task_freeze()

After the introduction of the cgroup.kill there is only one call site
of cgroup_task_freeze() left: cgroup_exit(). cgroup_task_freeze() is
currently taking rcu_read_lock() to read task's cgroup flags, but
because it's always called with css_set_lock locked, the rcu protection
is excessive.

Simplify the code by inlining cgroup_task_freeze().

v2: fix build

Signed-off-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 18 ------------------
 kernel/cgroup/cgroup.c |  3 ++-
 2 files changed, 2 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4f2f79de083e..a72764287cb5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -906,20 +906,6 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze);
 void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
 				 struct cgroup *dst);
 
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	bool ret;
-
-	if (task->flags & PF_KTHREAD)
-		return false;
-
-	rcu_read_lock();
-	ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
-	rcu_read_unlock();
-
-	return ret;
-}
-
 static inline bool cgroup_task_frozen(struct task_struct *task)
 {
 	return task->frozen;
@@ -929,10 +915,6 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
 
 static inline void cgroup_enter_frozen(void) { }
 static inline void cgroup_leave_frozen(bool always_leave) { }
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	return false;
-}
 static inline bool cgroup_task_frozen(struct task_struct *task)
 {
 	return false;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e640fc78d731..8e0d7092afbb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6267,7 +6267,8 @@ void cgroup_exit(struct task_struct *tsk)
 	cset->nr_tasks--;
 
 	WARN_ON_ONCE(cgroup_task_frozen(tsk));
-	if (unlikely(cgroup_task_freeze(tsk)))
+	if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
 		cgroup_update_frozen(task_dfl_cgroup(tsk));
 
 	spin_unlock_irq(&css_set_lock);
-- 
cgit v1.2.3


From 8a922805fb0950187ff037801e337aec010a6ccb Mon Sep 17 00:00:00 2001
From: Zhongjun Tan <tanzhongjun@yulong.com>
Date: Fri, 9 Apr 2021 13:48:41 +0800
Subject: selinux: delete selinux_xfrm_policy_lookup() useless argument

seliunx_xfrm_policy_lookup() is hooks of security_xfrm_policy_lookup().
The dir argument is uselss in security_xfrm_policy_lookup(). So
remove the dir argument from selinux_xfrm_policy_lookup() and
security_xfrm_policy_lookup().

Signed-off-by: Zhongjun Tan <tanzhongjun@yulong.com>
[PM: reformat the subject line]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h   | 3 +--
 include/linux/security.h        | 4 ++--
 net/xfrm/xfrm_policy.c          | 6 ++----
 security/security.c             | 4 ++--
 security/selinux/include/xfrm.h | 2 +-
 security/selinux/xfrm.c         | 2 +-
 6 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 04c01794de83..2adeea44c0d5 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -358,8 +358,7 @@ LSM_HOOK(int, 0, xfrm_state_alloc_acquire, struct xfrm_state *x,
 	 struct xfrm_sec_ctx *polsec, u32 secid)
 LSM_HOOK(void, LSM_RET_VOID, xfrm_state_free_security, struct xfrm_state *x)
 LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x)
-LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid,
-	 u8 dir)
+LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid)
 LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x,
 	 struct xfrm_policy *xp, const struct flowi_common *flic)
 LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid,
diff --git a/include/linux/security.h b/include/linux/security.h
index 06f7c50ce77f..24eda04221e9 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1681,7 +1681,7 @@ int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
 				      struct xfrm_sec_ctx *polsec, u32 secid);
 int security_xfrm_state_delete(struct xfrm_state *x);
 void security_xfrm_state_free(struct xfrm_state *x);
-int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir);
+int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid);
 int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
 				       struct xfrm_policy *xp,
 				       const struct flowi_common *flic);
@@ -1732,7 +1732,7 @@ static inline int security_xfrm_state_delete(struct xfrm_state *x)
 	return 0;
 }
 
-static inline int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir)
+static inline int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
 {
 	return 0;
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ce500f847b99..e70cf1d2c0e0 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1902,8 +1902,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
 
 	match = xfrm_selector_match(sel, fl, family);
 	if (match)
-		ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
-						  dir);
+		ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid);
 	return ret;
 }
 
@@ -2181,8 +2180,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 				goto out;
 			}
 			err = security_xfrm_policy_lookup(pol->security,
-						      fl->flowi_secid,
-						      dir);
+						      fl->flowi_secid);
 			if (!err) {
 				if (!xfrm_pol_hold_rcu(pol))
 					goto again;
diff --git a/security/security.c b/security/security.c
index b38155b2de83..0c1c9796e3e4 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2466,9 +2466,9 @@ void security_xfrm_state_free(struct xfrm_state *x)
 	call_void_hook(xfrm_state_free_security, x);
 }
 
-int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir)
+int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
 {
-	return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid, dir);
+	return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid);
 }
 
 int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 0a6f34a7a971..74159400eeee 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -23,7 +23,7 @@ int selinux_xfrm_state_alloc_acquire(struct xfrm_state *x,
 				     struct xfrm_sec_ctx *polsec, u32 secid);
 void selinux_xfrm_state_free(struct xfrm_state *x);
 int selinux_xfrm_state_delete(struct xfrm_state *x);
-int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir);
+int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid);
 int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x,
 				      struct xfrm_policy *xp,
 				      const struct flowi_common *flic);
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index 634f3db24da6..be83e5ce4469 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -150,7 +150,7 @@ static int selinux_xfrm_delete(struct xfrm_sec_ctx *ctx)
  * LSM hook implementation that authorizes that a flow can use a xfrm policy
  * rule.
  */
-int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir)
+int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
 {
 	int rc;
 
-- 
cgit v1.2.3


From 0c8ccd8b267fc735e4621774ce62728f27d42863 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 10 May 2021 15:41:29 +0300
Subject: spi: pxa2xx: Use pxa_ssp_enable()/pxa_ssp_disable() in the driver

There are few places that repeat the logic of pxa_ssp_enable() and
pxa_ssp_disable(). Use them instead of open coded variants.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210510124134.24638-10-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-pxa2xx-dma.c |  4 +---
 drivers/spi/spi-pxa2xx.c     | 36 ++++++++++++++++++------------------
 include/linux/pxa2xx_ssp.h   | 16 ++++++++++++++++
 sound/soc/pxa/pxa-ssp.c      | 16 ----------------
 4 files changed, 35 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c
index e00dbadd39ec..5ca01ad7f460 100644
--- a/drivers/spi/spi-pxa2xx-dma.c
+++ b/drivers/spi/spi-pxa2xx-dma.c
@@ -50,9 +50,7 @@ static void pxa2xx_spi_dma_transfer_complete(struct driver_data *drv_data,
 
 		if (error) {
 			/* In case we got an error we disable the SSP now */
-			pxa2xx_spi_write(drv_data, SSCR0,
-					 pxa2xx_spi_read(drv_data, SSCR0)
-					 & ~SSCR0_SSE);
+			pxa_ssp_disable(drv_data->ssp);
 			msg->status = -EIO;
 		}
 
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index 087c84e605b9..a27f51f5db65 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -286,13 +286,11 @@ static u32 pxa2xx_configure_sscr0(const struct driver_data *drv_data,
 	case QUARK_X1000_SSP:
 		return clk_div
 			| QUARK_X1000_SSCR0_Motorola
-			| QUARK_X1000_SSCR0_DataSize(bits > 32 ? 8 : bits)
-			| SSCR0_SSE;
+			| QUARK_X1000_SSCR0_DataSize(bits > 32 ? 8 : bits);
 	default:
 		return clk_div
 			| SSCR0_Motorola
 			| SSCR0_DataSize(bits > 16 ? bits - 16 : bits)
-			| SSCR0_SSE
 			| (bits > 16 ? SSCR0_EDSS : 0);
 	}
 }
@@ -498,8 +496,7 @@ static void pxa2xx_spi_off(struct driver_data *drv_data)
 	if (is_mmp2_ssp(drv_data))
 		return;
 
-	pxa2xx_spi_write(drv_data, SSCR0,
-			 pxa2xx_spi_read(drv_data, SSCR0) & ~SSCR0_SSE);
+	pxa_ssp_disable(drv_data->ssp);
 }
 
 static int null_writer(struct driver_data *drv_data)
@@ -1098,25 +1095,26 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 	    (pxa2xx_spi_read(drv_data, DDS_RATE) != chip->dds_rate))
 		pxa2xx_spi_write(drv_data, DDS_RATE, chip->dds_rate);
 
+	/* Stop the SSP */
+	if (!is_mmp2_ssp(drv_data))
+		pxa_ssp_disable(drv_data->ssp);
+
+	if (!pxa25x_ssp_comp(drv_data))
+		pxa2xx_spi_write(drv_data, SSTO, chip->timeout);
+
 	/* see if we need to reload the config registers */
 	if ((pxa2xx_spi_read(drv_data, SSCR0) != cr0)
 	    || (pxa2xx_spi_read(drv_data, SSCR1) & change_mask)
 	    != (cr1 & change_mask)) {
-		/* stop the SSP, and update the other bits */
-		if (!is_mmp2_ssp(drv_data))
-			pxa2xx_spi_write(drv_data, SSCR0, cr0 & ~SSCR0_SSE);
-		if (!pxa25x_ssp_comp(drv_data))
-			pxa2xx_spi_write(drv_data, SSTO, chip->timeout);
 		/* first set CR1 without interrupt and service enables */
 		pxa2xx_spi_write(drv_data, SSCR1, cr1 & change_mask);
-		/* restart the SSP */
+		/* Update the other bits */
 		pxa2xx_spi_write(drv_data, SSCR0, cr0);
-
-	} else {
-		if (!pxa25x_ssp_comp(drv_data))
-			pxa2xx_spi_write(drv_data, SSTO, chip->timeout);
 	}
 
+	/* Restart the SSP */
+	pxa_ssp_enable(drv_data->ssp);
+
 	if (is_mmp2_ssp(drv_data)) {
 		u8 tx_level = (pxa2xx_spi_read(drv_data, SSSR)
 					& SSSR_TFL_MASK) >> 8;
@@ -1786,8 +1784,9 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
 		controller->min_speed_hz =
 			DIV_ROUND_UP(controller->max_speed_hz, 512);
 
+	pxa_ssp_disable(ssp);
+
 	/* Load default SSP configuration */
-	pxa2xx_spi_write(drv_data, SSCR0, 0);
 	switch (drv_data->ssp_type) {
 	case QUARK_X1000_SSP:
 		tmp = QUARK_X1000_SSCR1_RxTresh(RX_THRESH_QUARK_X1000_DFLT) |
@@ -1928,7 +1927,7 @@ static int pxa2xx_spi_remove(struct platform_device *pdev)
 	spi_unregister_controller(drv_data->controller);
 
 	/* Disable the SSP at the peripheral and SOC level */
-	pxa2xx_spi_write(drv_data, SSCR0, 0);
+	pxa_ssp_disable(ssp);
 	clk_disable_unprepare(ssp->clk);
 
 	/* Release DMA */
@@ -1957,7 +1956,8 @@ static int pxa2xx_spi_suspend(struct device *dev)
 	status = spi_controller_suspend(drv_data->controller);
 	if (status != 0)
 		return status;
-	pxa2xx_spi_write(drv_data, SSCR0, 0);
+
+	pxa_ssp_disable(ssp);
 
 	if (!pm_runtime_suspended(dev))
 		clk_disable_unprepare(ssp->clk);
diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 1b6c1a0922bd..fdfbe17e15f4 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -254,6 +254,22 @@ static inline u32 pxa_ssp_read_reg(struct ssp_device *dev, u32 reg)
 	return __raw_readl(dev->mmio_base + reg);
 }
 
+static inline void pxa_ssp_enable(struct ssp_device *ssp)
+{
+	u32 sscr0;
+
+	sscr0 = pxa_ssp_read_reg(ssp, SSCR0) | SSCR0_SSE;
+	pxa_ssp_write_reg(ssp, SSCR0, sscr0);
+}
+
+static inline void pxa_ssp_disable(struct ssp_device *ssp)
+{
+	u32 sscr0;
+
+	sscr0 = pxa_ssp_read_reg(ssp, SSCR0) & ~SSCR0_SSE;
+	pxa_ssp_write_reg(ssp, SSCR0, sscr0);
+}
+
 #if IS_ENABLED(CONFIG_PXA_SSP)
 struct ssp_device *pxa_ssp_request(int port, const char *label);
 void pxa_ssp_free(struct ssp_device *);
diff --git a/sound/soc/pxa/pxa-ssp.c b/sound/soc/pxa/pxa-ssp.c
index b941adcbb8f9..939e7e28486a 100644
--- a/sound/soc/pxa/pxa-ssp.c
+++ b/sound/soc/pxa/pxa-ssp.c
@@ -61,22 +61,6 @@ static void dump_registers(struct ssp_device *ssp)
 		 pxa_ssp_read_reg(ssp, SSACD));
 }
 
-static void pxa_ssp_enable(struct ssp_device *ssp)
-{
-	uint32_t sscr0;
-
-	sscr0 = __raw_readl(ssp->mmio_base + SSCR0) | SSCR0_SSE;
-	__raw_writel(sscr0, ssp->mmio_base + SSCR0);
-}
-
-static void pxa_ssp_disable(struct ssp_device *ssp)
-{
-	uint32_t sscr0;
-
-	sscr0 = __raw_readl(ssp->mmio_base + SSCR0) & ~SSCR0_SSE;
-	__raw_writel(sscr0, ssp->mmio_base + SSCR0);
-}
-
 static void pxa_ssp_set_dma_params(struct ssp_device *ssp, int width4,
 			int out, struct snd_dmaengine_dai_dma_data *dma)
 {
-- 
cgit v1.2.3


From 3fdb59cf10b020b32b9f1dfc78611320623dcb3e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 10 May 2021 15:41:34 +0300
Subject: spi: pxa2xx: Introduce special type for Merrifield SPIs

Intel Merrifield SPI is actually more closer to PXA3xx. It has extended FIFO
(32 bytes) and additional registers to get or set FIFO thresholds.

Introduce new type for Intel Merrifield SPI host controllers and handle bigger
FIFO size.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210510124134.24638-15-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-pxa2xx-pci.c |  2 +-
 drivers/spi/spi-pxa2xx.c     | 32 +++++++++++++++++++++++++++++---
 include/linux/pxa2xx_ssp.h   | 16 ++++++++++++++++
 3 files changed, 46 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c
index a259be12d326..dce9ade9a4df 100644
--- a/drivers/spi/spi-pxa2xx-pci.c
+++ b/drivers/spi/spi-pxa2xx-pci.c
@@ -179,7 +179,7 @@ static struct pxa_spi_info spi_info_configs[] = {
 		.rx_param = &bsw2_rx_param,
 	},
 	[PORT_MRFLD] = {
-		.type = PXA27x_SSP,
+		.type = MRFLD_SSP,
 		.max_clk_rate = 25000000,
 		.setup = mrfld_spi_setup,
 	},
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index af3f01de8f5b..5985b39e2dd6 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -200,6 +200,11 @@ static bool is_mmp2_ssp(const struct driver_data *drv_data)
 	return drv_data->ssp_type == MMP2_SSP;
 }
 
+static bool is_mrfld_ssp(const struct driver_data *drv_data)
+{
+	return drv_data->ssp_type == MRFLD_SSP;
+}
+
 static void pxa2xx_spi_update(const struct driver_data *drv_data, u32 reg, u32 mask, u32 value)
 {
 	if ((pxa2xx_spi_read(drv_data, reg) & mask) != value)
@@ -1087,6 +1092,15 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 		pxa2xx_spi_update(drv_data, SSITF, GENMASK(15, 0), chip->lpss_tx_threshold);
 	}
 
+	if (is_mrfld_ssp(drv_data)) {
+		u32 thresh = 0;
+
+		thresh |= SFIFOTT_RxThresh(chip->lpss_rx_threshold);
+		thresh |= SFIFOTT_TxThresh(chip->lpss_tx_threshold);
+
+		pxa2xx_spi_update(drv_data, SFIFOTT, 0xffffffff, thresh);
+	}
+
 	if (is_quark_x1000_ssp(drv_data))
 		pxa2xx_spi_update(drv_data, DDS_RATE, GENMASK(23, 0), chip->dds_rate);
 
@@ -1253,6 +1267,11 @@ static int setup(struct spi_device *spi)
 		tx_hi_thres = 0;
 		rx_thres = RX_THRESH_QUARK_X1000_DFLT;
 		break;
+	case MRFLD_SSP:
+		tx_thres = TX_THRESH_MRFLD_DFLT;
+		tx_hi_thres = 0;
+		rx_thres = RX_THRESH_MRFLD_DFLT;
+		break;
 	case CE4100_SSP:
 		tx_thres = TX_THRESH_CE4100_DFLT;
 		tx_hi_thres = 0;
@@ -1328,9 +1347,16 @@ static int setup(struct spi_device *spi)
 		chip->cr1 |= SSCR1_SPH;
 	}
 
-	chip->lpss_rx_threshold = SSIRF_RxThresh(rx_thres);
-	chip->lpss_tx_threshold = SSITF_TxLoThresh(tx_thres)
-				| SSITF_TxHiThresh(tx_hi_thres);
+	if (is_lpss_ssp(drv_data)) {
+		chip->lpss_rx_threshold = SSIRF_RxThresh(rx_thres);
+		chip->lpss_tx_threshold = SSITF_TxLoThresh(tx_thres) |
+					  SSITF_TxHiThresh(tx_hi_thres);
+	}
+
+	if (is_mrfld_ssp(drv_data)) {
+		chip->lpss_rx_threshold = rx_thres;
+		chip->lpss_tx_threshold = tx_thres;
+	}
 
 	/* set dma burst and threshold outside of chip_info path so that if
 	 * chip_info goes away after setting chip->enable_dma, the
diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index fdfbe17e15f4..2b21bc1f3c73 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -183,6 +183,21 @@ struct device_node;
 #define SSACD_ACPS(x)		((x) << 4)	/* Audio clock PLL select */
 #define SSACD_SCDX8		BIT(7)		/* SYSCLK division ratio select */
 
+/* Intel Merrifield SSP */
+#define SFIFOL			0x68		/* FIFO level */
+#define SFIFOTT			0x6c		/* FIFO trigger threshold */
+
+#define RX_THRESH_MRFLD_DFLT	16
+#define TX_THRESH_MRFLD_DFLT	16
+
+#define SFIFOL_TFL_MASK		GENMASK(15, 0)	/* Transmit FIFO Level mask */
+#define SFIFOL_RFL_MASK		GENMASK(31, 16)	/* Receive FIFO Level mask */
+
+#define SFIFOTT_TFT		GENMASK(15, 0)	/* Transmit FIFO Threshold (mask) */
+#define SFIFOTT_TxThresh(x)	(((x) - 1) << 0)	/* TX FIFO trigger threshold / level */
+#define SFIFOTT_RFT		GENMASK(31, 16)	/* Receive FIFO Threshold (mask) */
+#define SFIFOTT_RxThresh(x)	(((x) - 1) << 16)	/* RX FIFO trigger threshold / level */
+
 /* LPSS SSP */
 #define SSITF			0x44		/* TX FIFO trigger level */
 #define SSITF_TxHiThresh(x)	(((x) - 1) << 0)
@@ -205,6 +220,7 @@ enum pxa_ssp_type {
 	MMP2_SSP,
 	PXA910_SSP,
 	CE4100_SSP,
+	MRFLD_SSP,
 	QUARK_X1000_SSP,
 	LPSS_LPT_SSP, /* Keep LPSS types sorted with lpss_platforms[] */
 	LPSS_BYT_SSP,
-- 
cgit v1.2.3


From 35f3f8504c3b60a1ae5576e178b27fc0ddd6157d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 10 May 2021 16:12:42 +0300
Subject: spi: Switch to signed types for *_native_cs SPI controller fields

While fixing undefined behaviour the commit f60d7270c8a3 ("spi: Avoid
undefined behaviour when counting unused native CSs") missed the case
when all CSs are GPIOs and thus unused_native_cs will be evaluated to
-1 in unsigned representation. This will falsely trigger a condition
in the spi_get_gpio_descs().

Switch to signed types for *_native_cs SPI controller fields to fix above.

Fixes: f60d7270c8a3 ("spi: Avoid undefined behaviour when counting unused native CSs")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210510131242.49455-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 360a3bc767ca..74239d65c7fd 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -644,8 +644,8 @@ struct spi_controller {
 	int			*cs_gpios;
 	struct gpio_desc	**cs_gpiods;
 	bool			use_gpio_descriptors;
-	u8			unused_native_cs;
-	u8			max_native_cs;
+	s8			unused_native_cs;
+	s8			max_native_cs;
 
 	/* statistics */
 	struct spi_statistics	statistics;
-- 
cgit v1.2.3


From 345e9f5ca798600e44c0843646621f2804eb99f4 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 11 May 2021 11:00:45 +0800
Subject: soundwire: bus: only use CLOCK_STOP_MODE0 and fix confusions

Existing devices and implementations only support the required
CLOCK_STOP_MODE0. All the code related to CLOCK_STOP_MODE1 has not
been tested and is highly questionable, with a clear confusion between
CLOCK_STOP_MODE1 and the simple clock stop state machine.

This patch removes all usages of CLOCK_STOP_MODE1 - which has no
impact on any solution - and fixes the use of the simple clock stop
state machine. The resulting code should be a lot more symmetrical and
easier to maintain.

Note that CLOCK_STOP_MODE1 is not supported in the SoundWire Device
Class specification so it's rather unlikely that we need to re-add
this mode later.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20210511030048.25622-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/bus.c       | 100 +++++++++++++++++-------------------------
 include/linux/soundwire/sdw.h |   2 -
 2 files changed, 40 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index a9e0aa72654d..dc4033b6f2e9 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -821,26 +821,6 @@ static void sdw_modify_slave_status(struct sdw_slave *slave,
 	mutex_unlock(&bus->bus_lock);
 }
 
-static enum sdw_clk_stop_mode sdw_get_clk_stop_mode(struct sdw_slave *slave)
-{
-	enum sdw_clk_stop_mode mode;
-
-	/*
-	 * Query for clock stop mode if Slave implements
-	 * ops->get_clk_stop_mode, else read from property.
-	 */
-	if (slave->ops && slave->ops->get_clk_stop_mode) {
-		mode = slave->ops->get_clk_stop_mode(slave);
-	} else {
-		if (slave->prop.clk_stop_mode1)
-			mode = SDW_CLK_STOP_MODE1;
-		else
-			mode = SDW_CLK_STOP_MODE0;
-	}
-
-	return mode;
-}
-
 static int sdw_slave_clk_stop_callback(struct sdw_slave *slave,
 				       enum sdw_clk_stop_mode mode,
 				       enum sdw_clk_stop_type type)
@@ -933,7 +913,6 @@ static int sdw_bus_wait_for_clk_prep_deprep(struct sdw_bus *bus, u16 dev_num)
  */
 int sdw_bus_prep_clk_stop(struct sdw_bus *bus)
 {
-	enum sdw_clk_stop_mode slave_mode;
 	bool simple_clk_stop = true;
 	struct sdw_slave *slave;
 	bool is_slave = false;
@@ -955,10 +934,8 @@ int sdw_bus_prep_clk_stop(struct sdw_bus *bus)
 		/* Identify if Slave(s) are available on Bus */
 		is_slave = true;
 
-		slave_mode = sdw_get_clk_stop_mode(slave);
-		slave->curr_clk_stop_mode = slave_mode;
-
-		ret = sdw_slave_clk_stop_callback(slave, slave_mode,
+		ret = sdw_slave_clk_stop_callback(slave,
+						  SDW_CLK_STOP_MODE0,
 						  SDW_CLK_PRE_PREPARE);
 		if (ret < 0) {
 			dev_err(&slave->dev,
@@ -966,22 +943,29 @@ int sdw_bus_prep_clk_stop(struct sdw_bus *bus)
 			return ret;
 		}
 
-		ret = sdw_slave_clk_stop_prepare(slave,
-						 slave_mode, true);
-		if (ret < 0) {
-			dev_err(&slave->dev,
-				"pre-prepare failed:%d", ret);
-			return ret;
-		}
-
-		if (slave_mode == SDW_CLK_STOP_MODE1)
+		/* Only prepare a Slave device if needed */
+		if (!slave->prop.simple_clk_stop_capable) {
 			simple_clk_stop = false;
+
+			ret = sdw_slave_clk_stop_prepare(slave,
+							 SDW_CLK_STOP_MODE0,
+							 true);
+			if (ret < 0) {
+				dev_err(&slave->dev,
+					"pre-prepare failed:%d", ret);
+				return ret;
+			}
+		}
 	}
 
 	/* Skip remaining clock stop preparation if no Slave is attached */
 	if (!is_slave)
 		return ret;
 
+	/*
+	 * Don't wait for all Slaves to be ready if they follow the simple
+	 * state machine
+	 */
 	if (!simple_clk_stop) {
 		ret = sdw_bus_wait_for_clk_prep_deprep(bus,
 						       SDW_BROADCAST_DEV_NUM);
@@ -998,17 +982,13 @@ int sdw_bus_prep_clk_stop(struct sdw_bus *bus)
 		    slave->status != SDW_SLAVE_ALERT)
 			continue;
 
-		slave_mode = slave->curr_clk_stop_mode;
-
-		if (slave_mode == SDW_CLK_STOP_MODE1) {
-			ret = sdw_slave_clk_stop_callback(slave,
-							  slave_mode,
-							  SDW_CLK_POST_PREPARE);
+		ret = sdw_slave_clk_stop_callback(slave,
+						  SDW_CLK_STOP_MODE0,
+						  SDW_CLK_POST_PREPARE);
 
-			if (ret < 0) {
-				dev_err(&slave->dev,
-					"post-prepare failed:%d", ret);
-			}
+		if (ret < 0) {
+			dev_err(&slave->dev,
+				"post-prepare failed:%d", ret);
 		}
 	}
 
@@ -1059,7 +1039,6 @@ EXPORT_SYMBOL(sdw_bus_clk_stop);
  */
 int sdw_bus_exit_clk_stop(struct sdw_bus *bus)
 {
-	enum sdw_clk_stop_mode mode;
 	bool simple_clk_stop = true;
 	struct sdw_slave *slave;
 	bool is_slave = false;
@@ -1081,31 +1060,33 @@ int sdw_bus_exit_clk_stop(struct sdw_bus *bus)
 		/* Identify if Slave(s) are available on Bus */
 		is_slave = true;
 
-		mode = slave->curr_clk_stop_mode;
-
-		if (mode == SDW_CLK_STOP_MODE1) {
-			simple_clk_stop = false;
-			continue;
-		}
-
-		ret = sdw_slave_clk_stop_callback(slave, mode,
+		ret = sdw_slave_clk_stop_callback(slave, SDW_CLK_STOP_MODE0,
 						  SDW_CLK_PRE_DEPREPARE);
 		if (ret < 0)
 			dev_warn(&slave->dev,
 				 "clk stop deprep failed:%d", ret);
 
-		ret = sdw_slave_clk_stop_prepare(slave, mode,
-						 false);
+		/* Only de-prepare a Slave device if needed */
+		if (!slave->prop.simple_clk_stop_capable) {
+			simple_clk_stop = false;
 
-		if (ret < 0)
-			dev_warn(&slave->dev,
-				 "clk stop deprep failed:%d", ret);
+			ret = sdw_slave_clk_stop_prepare(slave, SDW_CLK_STOP_MODE0,
+							 false);
+
+			if (ret < 0)
+				dev_warn(&slave->dev,
+					 "clk stop deprep failed:%d", ret);
+		}
 	}
 
 	/* Skip remaining clock stop de-preparation if no Slave is attached */
 	if (!is_slave)
 		return 0;
 
+	/*
+	 * Don't wait for all Slaves to be ready if they follow the simple
+	 * state machine
+	 */
 	if (!simple_clk_stop)
 		sdw_bus_wait_for_clk_prep_deprep(bus, SDW_BROADCAST_DEV_NUM);
 
@@ -1117,8 +1098,7 @@ int sdw_bus_exit_clk_stop(struct sdw_bus *bus)
 		    slave->status != SDW_SLAVE_ALERT)
 			continue;
 
-		mode = slave->curr_clk_stop_mode;
-		sdw_slave_clk_stop_callback(slave, mode,
+		sdw_slave_clk_stop_callback(slave, SDW_CLK_STOP_MODE0,
 					    SDW_CLK_POST_DEPREPARE);
 	}
 
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index ced07f8fde87..5d93d9949653 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -624,7 +624,6 @@ struct sdw_slave_ops {
 	int (*port_prep)(struct sdw_slave *slave,
 			 struct sdw_prepare_ch *prepare_ch,
 			 enum sdw_port_prep_ops pre_ops);
-	int (*get_clk_stop_mode)(struct sdw_slave *slave);
 	int (*clk_stop)(struct sdw_slave *slave,
 			enum sdw_clk_stop_mode mode,
 			enum sdw_clk_stop_type type);
@@ -675,7 +674,6 @@ struct sdw_slave {
 	struct list_head node;
 	struct completion port_ready[SDW_MAX_PORTS];
 	unsigned int m_port_map[SDW_MAX_PORTS];
-	enum sdw_clk_stop_mode curr_clk_stop_mode;
 	u16 dev_num;
 	u16 dev_num_sticky;
 	bool probed;
-- 
cgit v1.2.3


From 448df2d8fcab6cc50e0de4679ce3afe2ece282f2 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 11 May 2021 11:00:46 +0800
Subject: soundwire: add missing kernel-doc description

For some reason we never added a description for the clk_stop
callback.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20210511030048.25622-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 5d93d9949653..8ca736e92d5a 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -612,6 +612,7 @@ struct sdw_bus_params {
  * @update_status: Update Slave status
  * @bus_config: Update the bus config for Slave
  * @port_prep: Prepare the port with parameters
+ * @clk_stop: handle imp-def sequences before and after prepare and de-prepare
  */
 struct sdw_slave_ops {
 	int (*read_prop)(struct sdw_slave *sdw);
-- 
cgit v1.2.3


From bf30396cdf8132a199af5f8f0e60367876f455df Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Tue, 11 May 2021 16:42:22 +0200
Subject: net: wwan: Add unknown port type

Some devices may have ports with unknown type/protocol which need to
be tagged (though not supported by WWAN core). This will be the case
for cdc-wdm based drivers.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/wwan.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index aa05a253dcf9..7216c114d758 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -15,6 +15,7 @@
  * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control
  * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface
  * @WWAN_PORT_FIREHOSE: XML based command protocol
+ * @WWAN_PORT_UNKNOWN: Unknown port type
  * @WWAN_PORT_MAX: Number of supported port types
  */
 enum wwan_port_type {
@@ -23,7 +24,8 @@ enum wwan_port_type {
 	WWAN_PORT_QMI,
 	WWAN_PORT_QCDM,
 	WWAN_PORT_FIREHOSE,
-	WWAN_PORT_MAX,
+	WWAN_PORT_UNKNOWN,
+	WWAN_PORT_MAX = WWAN_PORT_UNKNOWN,
 };
 
 struct wwan_port;
-- 
cgit v1.2.3


From cac6fb015f719104e60b1c68c15ca5b734f57b9c Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Tue, 11 May 2021 16:42:23 +0200
Subject: usb: class: cdc-wdm: WWAN framework integration

The WWAN framework provides a unified way to handle WWAN/modems and its
control port(s). It has initially been introduced to support MHI/PCI
modems, offering the same control protocols as the USB variants such as
MBIM, QMI, AT... The WWAN framework exposes these control protocols as
character devices, similarly to cdc-wdm, but in a bus agnostic fashion.

This change adds registration of the USB modem cdc-wdm control endpoints
to the WWAN framework as standard control ports (wwanXpY...).

Exposing cdc-wdm through WWAN framework normally maintains backward
compatibility, e.g:
    $ qmicli --device-open-qmi -d /dev/wwan0p1QMI --dms-get-ids
instead of
    $ qmicli --device-open-qmi -d /dev/cdc-wdm0 --dms-get-ids

However, some tools may rely on cdc-wdm driver/device name for device
detection. It is then safer to keep the 'legacy' cdc-wdm character
device to prevent any breakage. This is handled in this change by
API mutual exclusion, only one access method can be used at a time,
either cdc-wdm chardev or WWAN API.

Note that unknown channel types (other than MBIM, AT or MBIM) are not
registered to the WWAN framework.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_mbim.c       |   1 +
 drivers/net/usb/huawei_cdc_ncm.c |   1 +
 drivers/net/usb/qmi_wwan.c       |   3 +-
 drivers/usb/class/cdc-wdm.c      | 180 ++++++++++++++++++++++++++++++++++++++-
 include/linux/usb/cdc-wdm.h      |   3 +-
 5 files changed, 182 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c
index 5db66272fc82..42fb75057c15 100644
--- a/drivers/net/usb/cdc_mbim.c
+++ b/drivers/net/usb/cdc_mbim.c
@@ -168,6 +168,7 @@ static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf)
 		subdriver = usb_cdc_wdm_register(ctx->control,
 						 &dev->status->desc,
 						 le16_to_cpu(ctx->mbim_desc->wMaxControlMessage),
+						 WWAN_PORT_MBIM,
 						 cdc_mbim_wdm_manage_power);
 	if (IS_ERR(subdriver)) {
 		ret = PTR_ERR(subdriver);
diff --git a/drivers/net/usb/huawei_cdc_ncm.c b/drivers/net/usb/huawei_cdc_ncm.c
index a87f0dabcdb7..849b77330bf2 100644
--- a/drivers/net/usb/huawei_cdc_ncm.c
+++ b/drivers/net/usb/huawei_cdc_ncm.c
@@ -96,6 +96,7 @@ static int huawei_cdc_ncm_bind(struct usbnet *usbnet_dev,
 		subdriver = usb_cdc_wdm_register(ctx->control,
 						 &usbnet_dev->status->desc,
 						 1024, /* wMaxCommand */
+						 WWAN_PORT_AT,
 						 huawei_cdc_ncm_wdm_manage_power);
 	if (IS_ERR(subdriver)) {
 		ret = PTR_ERR(subdriver);
diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 6700f1970b24..db157f21a322 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -710,7 +710,8 @@ static int qmi_wwan_register_subdriver(struct usbnet *dev)
 
 	/* register subdriver */
 	subdriver = usb_cdc_wdm_register(info->control, &dev->status->desc,
-					 4096, &qmi_wwan_cdc_wdm_manage_power);
+					 4096, WWAN_PORT_QMI,
+					 &qmi_wwan_cdc_wdm_manage_power);
 	if (IS_ERR(subdriver)) {
 		dev_err(&info->control->dev, "subdriver registration failed\n");
 		rv = PTR_ERR(subdriver);
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 508b1c3f8b73..457b00c6e984 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -21,8 +21,10 @@
 #include <linux/uaccess.h>
 #include <linux/bitops.h>
 #include <linux/poll.h>
+#include <linux/skbuff.h>
 #include <linux/usb.h>
 #include <linux/usb/cdc.h>
+#include <linux/wwan.h>
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 #include <linux/usb/cdc-wdm.h>
@@ -55,6 +57,7 @@ MODULE_DEVICE_TABLE (usb, wdm_ids);
 #define WDM_SUSPENDING		8
 #define WDM_RESETTING		9
 #define WDM_OVERFLOW		10
+#define WDM_WWAN_IN_USE		11
 
 #define WDM_MAX			16
 
@@ -106,6 +109,9 @@ struct wdm_device {
 
 	struct list_head	device_list;
 	int			(*manage_power)(struct usb_interface *, int);
+
+	enum wwan_port_type	wwanp_type;
+	struct wwan_port	*wwanp;
 };
 
 static struct usb_driver wdm_driver;
@@ -157,6 +163,8 @@ static void wdm_out_callback(struct urb *urb)
 	wake_up_all(&desc->wait);
 }
 
+static void wdm_wwan_rx(struct wdm_device *desc, int length);
+
 static void wdm_in_callback(struct urb *urb)
 {
 	unsigned long flags;
@@ -192,6 +200,11 @@ static void wdm_in_callback(struct urb *urb)
 		}
 	}
 
+	if (test_bit(WDM_WWAN_IN_USE, &desc->flags)) {
+		wdm_wwan_rx(desc, length);
+		goto out;
+	}
+
 	/*
 	 * only set a new error if there is no previous error.
 	 * Errors are only cleared during read/open
@@ -226,6 +239,7 @@ skip_error:
 		set_bit(WDM_READ, &desc->flags);
 		wake_up(&desc->wait);
 	}
+out:
 	spin_unlock_irqrestore(&desc->iuspin, flags);
 }
 
@@ -697,6 +711,11 @@ static int wdm_open(struct inode *inode, struct file *file)
 		goto out;
 	file->private_data = desc;
 
+	if (test_bit(WDM_WWAN_IN_USE, &desc->flags)) {
+		rv = -EBUSY;
+		goto out;
+	}
+
 	rv = usb_autopm_get_interface(desc->intf);
 	if (rv < 0) {
 		dev_err(&desc->intf->dev, "Error autopm - %d\n", rv);
@@ -792,6 +811,151 @@ static struct usb_class_driver wdm_class = {
 	.minor_base =	WDM_MINOR_BASE,
 };
 
+/* --- WWAN framework integration --- */
+#ifdef CONFIG_WWAN
+static int wdm_wwan_port_start(struct wwan_port *port)
+{
+	struct wdm_device *desc = wwan_port_get_drvdata(port);
+
+	/* The interface is both exposed via the WWAN framework and as a
+	 * legacy usbmisc chardev. If chardev is already open, just fail
+	 * to prevent concurrent usage. Otherwise, switch to WWAN mode.
+	 */
+	mutex_lock(&wdm_mutex);
+	if (desc->count) {
+		mutex_unlock(&wdm_mutex);
+		return -EBUSY;
+	}
+	set_bit(WDM_WWAN_IN_USE, &desc->flags);
+	mutex_unlock(&wdm_mutex);
+
+	desc->manage_power(desc->intf, 1);
+
+	/* tx is allowed */
+	wwan_port_txon(port);
+
+	/* Start getting events */
+	return usb_submit_urb(desc->validity, GFP_KERNEL);
+}
+
+static void wdm_wwan_port_stop(struct wwan_port *port)
+{
+	struct wdm_device *desc = wwan_port_get_drvdata(port);
+
+	/* Stop all transfers and disable WWAN mode */
+	kill_urbs(desc);
+	desc->manage_power(desc->intf, 0);
+	clear_bit(WDM_READ, &desc->flags);
+	clear_bit(WDM_WWAN_IN_USE, &desc->flags);
+}
+
+static void wdm_wwan_port_tx_complete(struct urb *urb)
+{
+	struct sk_buff *skb = urb->context;
+	struct wdm_device *desc = skb_shinfo(skb)->destructor_arg;
+
+	usb_autopm_put_interface(desc->intf);
+	wwan_port_txon(desc->wwanp);
+	kfree_skb(skb);
+}
+
+static int wdm_wwan_port_tx(struct wwan_port *port, struct sk_buff *skb)
+{
+	struct wdm_device *desc = wwan_port_get_drvdata(port);
+	struct usb_interface *intf = desc->intf;
+	struct usb_ctrlrequest *req = desc->orq;
+	int rv;
+
+	rv = usb_autopm_get_interface(intf);
+	if (rv)
+		return rv;
+
+	usb_fill_control_urb(
+		desc->command,
+		interface_to_usbdev(intf),
+		usb_sndctrlpipe(interface_to_usbdev(intf), 0),
+		(unsigned char *)req,
+		skb->data,
+		skb->len,
+		wdm_wwan_port_tx_complete,
+		skb
+	);
+
+	req->bRequestType = (USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE);
+	req->bRequest = USB_CDC_SEND_ENCAPSULATED_COMMAND;
+	req->wValue = 0;
+	req->wIndex = desc->inum;
+	req->wLength = cpu_to_le16(skb->len);
+
+	skb_shinfo(skb)->destructor_arg = desc;
+
+	rv = usb_submit_urb(desc->command, GFP_KERNEL);
+	if (rv)
+		usb_autopm_put_interface(intf);
+	else /* One transfer at a time, stop TX until URB completion */
+		wwan_port_txoff(port);
+
+	return rv;
+}
+
+static struct wwan_port_ops wdm_wwan_port_ops = {
+	.start = wdm_wwan_port_start,
+	.stop = wdm_wwan_port_stop,
+	.tx = wdm_wwan_port_tx,
+};
+
+static void wdm_wwan_init(struct wdm_device *desc)
+{
+	struct usb_interface *intf = desc->intf;
+	struct wwan_port *port;
+
+	/* Only register to WWAN core if protocol/type is known */
+	if (desc->wwanp_type == WWAN_PORT_UNKNOWN) {
+		dev_info(&intf->dev, "Unknown control protocol\n");
+		return;
+	}
+
+	port = wwan_create_port(&intf->dev, desc->wwanp_type, &wdm_wwan_port_ops, desc);
+	if (IS_ERR(port)) {
+		dev_err(&intf->dev, "%s: Unable to create WWAN port\n",
+			dev_name(intf->usb_dev));
+		return;
+	}
+
+	desc->wwanp = port;
+}
+
+static void wdm_wwan_deinit(struct wdm_device *desc)
+{
+	if (!desc->wwanp)
+		return;
+
+	wwan_remove_port(desc->wwanp);
+	desc->wwanp = NULL;
+}
+
+static void wdm_wwan_rx(struct wdm_device *desc, int length)
+{
+	struct wwan_port *port = desc->wwanp;
+	struct sk_buff *skb;
+
+	/* Forward data to WWAN port */
+	skb = alloc_skb(length, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	memcpy(skb_put(skb, length), desc->inbuf, length);
+	wwan_port_rx(port, skb);
+
+	/* inbuf has been copied, it is safe to check for outstanding data */
+	schedule_work(&desc->service_outs_intr);
+}
+#else /* CONFIG_WWAN */
+static void wdm_wwan_init(struct wdm_device *desc) {}
+static void wdm_wwan_deinit(struct wdm_device *desc) {}
+static void wdm_wwan_rx(struct wdm_device *desc, int length) {}
+#endif /* CONFIG_WWAN */
+
 /* --- error handling --- */
 static void wdm_rxwork(struct work_struct *work)
 {
@@ -836,7 +1000,8 @@ static void service_interrupt_work(struct work_struct *work)
 /* --- hotplug --- */
 
 static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor *ep,
-		u16 bufsize, int (*manage_power)(struct usb_interface *, int))
+		      u16 bufsize, enum wwan_port_type type,
+		      int (*manage_power)(struct usb_interface *, int))
 {
 	int rv = -ENOMEM;
 	struct wdm_device *desc;
@@ -853,6 +1018,7 @@ static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor
 	/* this will be expanded and needed in hardware endianness */
 	desc->inum = cpu_to_le16((u16)intf->cur_altsetting->desc.bInterfaceNumber);
 	desc->intf = intf;
+	desc->wwanp_type = type;
 	INIT_WORK(&desc->rxwork, wdm_rxwork);
 	INIT_WORK(&desc->service_outs_intr, service_interrupt_work);
 
@@ -933,6 +1099,9 @@ static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor
 		goto err;
 	else
 		dev_info(&intf->dev, "%s: USB WDM device\n", dev_name(intf->usb_dev));
+
+	wdm_wwan_init(desc);
+
 out:
 	return rv;
 err:
@@ -977,7 +1146,7 @@ static int wdm_probe(struct usb_interface *intf, const struct usb_device_id *id)
 		goto err;
 	ep = &iface->endpoint[0].desc;
 
-	rv = wdm_create(intf, ep, maxcom, &wdm_manage_power);
+	rv = wdm_create(intf, ep, maxcom, WWAN_PORT_UNKNOWN, &wdm_manage_power);
 
 err:
 	return rv;
@@ -988,6 +1157,7 @@ err:
  * @intf: usb interface the subdriver will associate with
  * @ep: interrupt endpoint to monitor for notifications
  * @bufsize: maximum message size to support for read/write
+ * @type: Type/protocol of the transported data (MBIM, QMI...)
  * @manage_power: call-back invoked during open and release to
  *                manage the device's power
  * Create WDM usb class character device and associate it with intf
@@ -1005,12 +1175,12 @@ err:
  */
 struct usb_driver *usb_cdc_wdm_register(struct usb_interface *intf,
 					struct usb_endpoint_descriptor *ep,
-					int bufsize,
+					int bufsize, enum wwan_port_type type,
 					int (*manage_power)(struct usb_interface *, int))
 {
 	int rv;
 
-	rv = wdm_create(intf, ep, bufsize, manage_power);
+	rv = wdm_create(intf, ep, bufsize, type, manage_power);
 	if (rv < 0)
 		goto err;
 
@@ -1029,6 +1199,8 @@ static void wdm_disconnect(struct usb_interface *intf)
 	desc = wdm_find_device(intf);
 	mutex_lock(&wdm_mutex);
 
+	wdm_wwan_deinit(desc);
+
 	/* the spinlock makes sure no new urbs are generated in the callbacks */
 	spin_lock_irqsave(&desc->iuspin, flags);
 	set_bit(WDM_DISCONNECTING, &desc->flags);
diff --git a/include/linux/usb/cdc-wdm.h b/include/linux/usb/cdc-wdm.h
index 9b895f93d8de..9f5a51f79ba5 100644
--- a/include/linux/usb/cdc-wdm.h
+++ b/include/linux/usb/cdc-wdm.h
@@ -12,11 +12,12 @@
 #ifndef __LINUX_USB_CDC_WDM_H
 #define __LINUX_USB_CDC_WDM_H
 
+#include <linux/wwan.h>
 #include <uapi/linux/usb/cdc-wdm.h>
 
 extern struct usb_driver *usb_cdc_wdm_register(struct usb_interface *intf,
 					struct usb_endpoint_descriptor *ep,
-					int bufsize,
+					int bufsize, enum wwan_port_type type,
 					int (*manage_power)(struct usb_interface *, int));
 
 #endif /* __LINUX_USB_CDC_WDM_H */
-- 
cgit v1.2.3


From c5895d3f06cbb80ccb311f1dcb37074651030cb6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 4 May 2021 22:43:42 +0200
Subject: sched: Simplify sched_info_on()

The situation around sched_info is somewhat complicated, it is used by
sched_stats and delayacct and, indirectly, kvm.

If SCHEDSTATS=Y (but disabled by default) sched_info_on() is
unconditionally true -- this is the case for all distro kernel configs
I checked.

If for some reason SCHEDSTATS=N, but TASK_DELAY_ACCT=Y, then
sched_info_on() can return false when delayacct is disabled,
presumably because there would be no other users left; except kvm is.

Instead of complicating matters further by accurately accounting
sched_stat and kvm state, simply unconditionally enable when
SCHED_INFO=Y, matching the common distro case.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20210505111525.121458839@infradead.org
---
 include/linux/sched/stat.h | 10 ++--------
 kernel/delayacct.c         |  1 -
 kernel/sched/stats.h       | 37 ++++++++++---------------------------
 3 files changed, 12 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 568286411b43..939c3ec9e1b9 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -3,6 +3,7 @@
 #define _LINUX_SCHED_STAT_H
 
 #include <linux/percpu.h>
+#include <linux/kconfig.h>
 
 /*
  * Various counters maintained by the scheduler and fork(),
@@ -23,14 +24,7 @@ extern unsigned long nr_iowait_cpu(int cpu);
 
 static inline int sched_info_on(void)
 {
-#ifdef CONFIG_SCHEDSTATS
-	return 1;
-#elif defined(CONFIG_TASK_DELAY_ACCT)
-	extern int delayacct_on;
-	return delayacct_on;
-#else
-	return 0;
-#endif
+	return IS_ENABLED(CONFIG_SCHED_INFO);
 }
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 3fe7cd52b459..3a0b910386d1 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,7 +15,6 @@
 #include <linux/module.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
-EXPORT_SYMBOL_GPL(delayacct_on);
 struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index ee7da12a7056..33ffd41935ba 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -150,11 +150,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO
-static inline void sched_info_reset_dequeued(struct task_struct *t)
-{
-	t->sched_info.last_queued = 0;
-}
-
 /*
  * We are interested in knowing how long it was from the *first* time a
  * task was queued to the time that it finally hit a CPU, we call this routine
@@ -163,13 +158,12 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
  */
 static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
 {
-	unsigned long long now = rq_clock(rq), delta = 0;
+	unsigned long long delta = 0;
 
-	if (sched_info_on()) {
-		if (t->sched_info.last_queued)
-			delta = now - t->sched_info.last_queued;
+	if (t->sched_info.last_queued) {
+		delta = rq_clock(rq) - t->sched_info.last_queued;
+		t->sched_info.last_queued = 0;
 	}
-	sched_info_reset_dequeued(t);
 	t->sched_info.run_delay += delta;
 
 	rq_sched_info_dequeue(rq, delta);
@@ -184,9 +178,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 {
 	unsigned long long now = rq_clock(rq), delta = 0;
 
-	if (t->sched_info.last_queued)
+	if (t->sched_info.last_queued) {
 		delta = now - t->sched_info.last_queued;
-	sched_info_reset_dequeued(t);
+		t->sched_info.last_queued = 0;
+	}
 	t->sched_info.run_delay += delta;
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcount++;
@@ -201,10 +196,8 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
  */
 static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
 {
-	if (sched_info_on()) {
-		if (!t->sched_info.last_queued)
-			t->sched_info.last_queued = rq_clock(rq);
-	}
+	if (!t->sched_info.last_queued)
+		t->sched_info.last_queued = rq_clock(rq);
 }
 
 /*
@@ -231,7 +224,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
  * the idle task.)  We are only called when prev != next.
  */
 static inline void
-__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
 	/*
 	 * prev now departs the CPU.  It's not interesting to record
@@ -245,18 +238,8 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
 		sched_info_arrive(rq, next);
 }
 
-static inline void
-sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
-{
-	if (sched_info_on())
-		__sched_info_switch(rq, prev, next);
-}
-
 #else /* !CONFIG_SCHED_INFO: */
 # define sched_info_enqueue(rq, t)	do { } while (0)
-# define sched_info_reset_dequeued(t)	do { } while (0)
 # define sched_info_dequeue(rq, t)	do { } while (0)
-# define sched_info_depart(rq, t)	do { } while (0)
-# define sched_info_arrive(rq, next)	do { } while (0)
 # define sched_info_switch(rq, t, next)	do { } while (0)
 #endif /* CONFIG_SCHED_INFO */
-- 
cgit v1.2.3


From eee4d9fee2544389e5ce5697ed92db67c86d7a9f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 4 May 2021 22:43:36 +0200
Subject: delayacct: Add static_branch in scheduler hooks

Cheaper when delayacct is disabled.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20210505111525.248028369@infradead.org
---
 include/linux/delayacct.h | 8 ++++++++
 kernel/delayacct.c        | 3 +++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 21651f946751..57fefa54b53a 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -58,8 +58,10 @@ struct task_delay_info {
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/jump_label.h>
 
 #ifdef CONFIG_TASK_DELAY_ACCT
+DECLARE_STATIC_KEY_TRUE(delayacct_key);
 extern int delayacct_on;	/* Delay accounting turned on/off */
 extern struct kmem_cache *delayacct_cache;
 extern void delayacct_init(void);
@@ -114,6 +116,9 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
 
 static inline void delayacct_blkio_start(void)
 {
+	if (!static_branch_likely(&delayacct_key))
+		return;
+
 	delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
 	if (current->delays)
 		__delayacct_blkio_start();
@@ -121,6 +126,9 @@ static inline void delayacct_blkio_start(void)
 
 static inline void delayacct_blkio_end(struct task_struct *p)
 {
+	if (!static_branch_likely(&delayacct_key))
+		return;
+
 	if (p->delays)
 		__delayacct_blkio_end(p);
 	delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 3a0b910386d1..63012fd39ae2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -14,6 +14,7 @@
 #include <linux/delayacct.h>
 #include <linux/module.h>
 
+DEFINE_STATIC_KEY_TRUE(delayacct_key);
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
 struct kmem_cache *delayacct_cache;
 
@@ -28,6 +29,8 @@ void delayacct_init(void)
 {
 	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
 	delayacct_tsk_init(&init_task);
+	if (!delayacct_on)
+		static_branch_disable(&delayacct_key);
 }
 
 void __delayacct_tsk_init(struct task_struct *tsk)
-- 
cgit v1.2.3


From e4042ad492357fa995921376462b04a025dd53b6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 4 May 2021 22:43:32 +0200
Subject: delayacct: Default disabled

Assuming this stuff isn't actually used much; disable it by default
and avoid allocating and tracking the task_delay_info structure.

taskstats is changed to still report the regular sched and sched_info
and only skip the missing task_delay_info fields instead of not
reporting anything.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210505111525.308018373@infradead.org
---
 Documentation/accounting/delay-accounting.rst   |  8 ++++----
 Documentation/admin-guide/kernel-parameters.txt |  2 +-
 include/linux/delayacct.h                       | 16 ++++------------
 kernel/delayacct.c                              | 19 +++++++++++--------
 4 files changed, 20 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 7cc7f5852da0..f20b282d6de0 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -69,13 +69,13 @@ Compile the kernel with::
 	CONFIG_TASK_DELAY_ACCT=y
 	CONFIG_TASKSTATS=y
 
-Delay accounting is enabled by default at boot up.
-To disable, add::
+Delay accounting is disabled by default at boot up.
+To enable, add::
 
-   nodelayacct
+   delayacct
 
 to the kernel boot options. The rest of the instructions
-below assume this has not been done.
+below assume this has been done.
 
 After the system has booted up, use a utility
 similar to  getdelays.c to access the delays
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cb89dbdedc46..ef5048c127a3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3244,7 +3244,7 @@
 
 	noclflush	[BUGS=X86] Don't use the CLFLUSH instruction
 
-	nodelayacct	[KNL] Disable per-task delay accounting
+	delayacct	[KNL] Enable per-task delay accounting
 
 	nodsp		[SH] Disable hardware DSP at boot time.
 
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 57fefa54b53a..225c8e01a111 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -61,7 +61,7 @@ struct task_delay_info {
 #include <linux/jump_label.h>
 
 #ifdef CONFIG_TASK_DELAY_ACCT
-DECLARE_STATIC_KEY_TRUE(delayacct_key);
+DECLARE_STATIC_KEY_FALSE(delayacct_key);
 extern int delayacct_on;	/* Delay accounting turned on/off */
 extern struct kmem_cache *delayacct_cache;
 extern void delayacct_init(void);
@@ -69,7 +69,7 @@ extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
 extern void __delayacct_blkio_start(void);
 extern void __delayacct_blkio_end(struct task_struct *);
-extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
+extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
 extern __u64 __delayacct_blkio_ticks(struct task_struct *);
 extern void __delayacct_freepages_start(void);
 extern void __delayacct_freepages_end(void);
@@ -116,7 +116,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
 
 static inline void delayacct_blkio_start(void)
 {
-	if (!static_branch_likely(&delayacct_key))
+	if (!static_branch_unlikely(&delayacct_key))
 		return;
 
 	delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
@@ -126,7 +126,7 @@ static inline void delayacct_blkio_start(void)
 
 static inline void delayacct_blkio_end(struct task_struct *p)
 {
-	if (!static_branch_likely(&delayacct_key))
+	if (!static_branch_unlikely(&delayacct_key))
 		return;
 
 	if (p->delays)
@@ -134,14 +134,6 @@ static inline void delayacct_blkio_end(struct task_struct *p)
 	delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
 }
 
-static inline int delayacct_add_tsk(struct taskstats *d,
-					struct task_struct *tsk)
-{
-	if (!delayacct_on || !tsk->delays)
-		return 0;
-	return __delayacct_add_tsk(d, tsk);
-}
-
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 {
 	if (tsk->delays)
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 63012fd39ae2..3f086903b295 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -14,23 +14,23 @@
 #include <linux/delayacct.h>
 #include <linux/module.h>
 
-DEFINE_STATIC_KEY_TRUE(delayacct_key);
-int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
+DEFINE_STATIC_KEY_FALSE(delayacct_key);
+int delayacct_on __read_mostly;	/* Delay accounting turned on/off */
 struct kmem_cache *delayacct_cache;
 
-static int __init delayacct_setup_disable(char *str)
+static int __init delayacct_setup_enable(char *str)
 {
-	delayacct_on = 0;
+	delayacct_on = 1;
 	return 1;
 }
-__setup("nodelayacct", delayacct_setup_disable);
+__setup("delayacct", delayacct_setup_enable);
 
 void delayacct_init(void)
 {
 	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
 	delayacct_tsk_init(&init_task);
-	if (!delayacct_on)
-		static_branch_disable(&delayacct_key);
+	if (delayacct_on)
+		static_branch_enable(&delayacct_key);
 }
 
 void __delayacct_tsk_init(struct task_struct *tsk)
@@ -83,7 +83,7 @@ void __delayacct_blkio_end(struct task_struct *p)
 	delayacct_end(&delays->lock, &delays->blkio_start, total, count);
 }
 
-int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 {
 	u64 utime, stime, stimescaled, utimescaled;
 	unsigned long long t2, t3;
@@ -118,6 +118,9 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->cpu_run_virtual_total =
 		(tmp < (s64)d->cpu_run_virtual_total) ?	0 : tmp;
 
+	if (!tsk->delays)
+		return 0;
+
 	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
 
 	raw_spin_lock_irqsave(&tsk->delays->lock, flags);
-- 
cgit v1.2.3


From 0cd7c741f01de13dc1eecf22557593b3514639bb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 10 May 2021 14:01:00 +0200
Subject: delayacct: Add sysctl to enable at runtime

Just like sched_schedstats, allow runtime enabling (and disabling) of
delayacct. This is useful if one forgot to add the delayacct boot time
option.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/YJkhebGJAywaZowX@hirez.programming.kicks-ass.net
---
 Documentation/accounting/delay-accounting.rst |  6 +++--
 include/linux/delayacct.h                     |  4 +++
 kernel/delayacct.c                            | 36 +++++++++++++++++++++++++--
 kernel/sysctl.c                               | 12 +++++++++
 4 files changed, 54 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index f20b282d6de0..1b8b46deeb29 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -74,8 +74,10 @@ To enable, add::
 
    delayacct
 
-to the kernel boot options. The rest of the instructions
-below assume this has been done.
+to the kernel boot options. The rest of the instructions below assume this has
+been done. Alternatively, use sysctl kernel.task_delayacct to switch the state
+at runtime. Note however that only tasks started after enabling it will have
+delayacct information.
 
 After the system has booted up, use a utility
 similar to  getdelays.c to access the delays
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 225c8e01a111..af7e6eb50283 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -65,6 +65,10 @@ DECLARE_STATIC_KEY_FALSE(delayacct_key);
 extern int delayacct_on;	/* Delay accounting turned on/off */
 extern struct kmem_cache *delayacct_cache;
 extern void delayacct_init(void);
+
+extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+			    size_t *lenp, loff_t *ppos);
+
 extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
 extern void __delayacct_blkio_start(void);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 3f086903b295..51530d5b15a8 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -18,6 +18,17 @@ DEFINE_STATIC_KEY_FALSE(delayacct_key);
 int delayacct_on __read_mostly;	/* Delay accounting turned on/off */
 struct kmem_cache *delayacct_cache;
 
+static void set_delayacct(bool enabled)
+{
+	if (enabled) {
+		static_branch_enable(&delayacct_key);
+		delayacct_on = 1;
+	} else {
+		delayacct_on = 0;
+		static_branch_disable(&delayacct_key);
+	}
+}
+
 static int __init delayacct_setup_enable(char *str)
 {
 	delayacct_on = 1;
@@ -29,9 +40,30 @@ void delayacct_init(void)
 {
 	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
 	delayacct_tsk_init(&init_task);
-	if (delayacct_on)
-		static_branch_enable(&delayacct_key);
+	set_delayacct(delayacct_on);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+		     size_t *lenp, loff_t *ppos)
+{
+	int state = delayacct_on;
+	struct ctl_table t;
+	int err;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_delayacct(state);
+	return err;
 }
+#endif
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 14edf84cc571..0afbfc83157a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -71,6 +71,7 @@
 #include <linux/coredump.h>
 #include <linux/latencytop.h>
 #include <linux/pid.h>
+#include <linux/delayacct.h>
 
 #include "../lib/kstrtox.h"
 
@@ -1727,6 +1728,17 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_TASK_DELAY_ACCT
+	{
+		.procname	= "task_delayacct",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_delayacct,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif /* CONFIG_TASK_DELAY_ACCT */
 #ifdef CONFIG_NUMA_BALANCING
 	{
 		.procname	= "numa_balancing",
-- 
cgit v1.2.3


From 8a311c740b53324ec584e0e3bb7077d56b123c28 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 17 Nov 2020 18:19:36 -0500
Subject: sched: Basic tracking of matching tasks

Introduce task_struct::core_cookie as an opaque identifier for core
scheduling. When enabled; core scheduling will only allow matching
task to be on the core; where idle matches everything.

When task_struct::core_cookie is set (and core scheduling is enabled)
these tasks are indexed in a second RB-tree, first on cookie value
then on scheduling function, such that matching task selection always
finds the most elegible match.

NOTE: *shudder* at the overhead...

NOTE: *sigh*, a 3rd copy of the scheduling function; the alternative
is per class tracking of cookies and that just duplicates a lot of
stuff for no raisin (the 2nd copy lives in the rt-mutex PI code).

[Joel: folded fixes]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210422123308.496975854@infradead.org
---
 include/linux/sched.h |   8 ++-
 kernel/sched/core.c   | 152 ++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/fair.c   |  46 ---------------
 kernel/sched/sched.h  |  55 ++++++++++++++++++
 4 files changed, 210 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c881384517..45eedccf86aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -700,10 +700,16 @@ struct task_struct {
 	const struct sched_class	*sched_class;
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
+	struct sched_dl_entity		dl;
+
+#ifdef CONFIG_SCHED_CORE
+	struct rb_node			core_node;
+	unsigned long			core_cookie;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group		*sched_task_group;
 #endif
-	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_UCLAMP_TASK
 	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 85147bea9d93..c057d471c025 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -88,6 +88,133 @@ __read_mostly int scheduler_running;
 
 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
 
+/* kernel prio, less is more */
+static inline int __task_prio(struct task_struct *p)
+{
+	if (p->sched_class == &stop_sched_class) /* trumps deadline */
+		return -2;
+
+	if (rt_prio(p->prio)) /* includes deadline */
+		return p->prio; /* [-1, 99] */
+
+	if (p->sched_class == &idle_sched_class)
+		return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+	return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+}
+
+/*
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b)  := l(b,a)
+ * ge(a,b) := !l(a,b)
+ */
+
+/* real prio, less is less */
+static inline bool prio_less(struct task_struct *a, struct task_struct *b)
+{
+
+	int pa = __task_prio(a), pb = __task_prio(b);
+
+	if (-pa < -pb)
+		return true;
+
+	if (-pb < -pa)
+		return false;
+
+	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
+		return !dl_time_before(a->dl.deadline, b->dl.deadline);
+
+	if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
+		u64 vruntime = b->se.vruntime;
+
+		/*
+		 * Normalize the vruntime if tasks are in different cpus.
+		 */
+		if (task_cpu(a) != task_cpu(b)) {
+			vruntime -= task_cfs_rq(b)->min_vruntime;
+			vruntime += task_cfs_rq(a)->min_vruntime;
+		}
+
+		return !((s64)(a->se.vruntime - vruntime) <= 0);
+	}
+
+	return false;
+}
+
+static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+{
+	if (a->core_cookie < b->core_cookie)
+		return true;
+
+	if (a->core_cookie > b->core_cookie)
+		return false;
+
+	/* flip prio, so high prio is leftmost */
+	if (prio_less(b, a))
+		return true;
+
+	return false;
+}
+
+#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
+
+static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
+{
+	return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
+}
+
+static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
+{
+	const struct task_struct *p = __node_2_sc(node);
+	unsigned long cookie = (unsigned long)key;
+
+	if (cookie < p->core_cookie)
+		return -1;
+
+	if (cookie > p->core_cookie)
+		return 1;
+
+	return 0;
+}
+
+static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+	rq->core->core_task_seq++;
+
+	if (!p->core_cookie)
+		return;
+
+	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
+}
+
+static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+{
+	rq->core->core_task_seq++;
+
+	if (!p->core_cookie)
+		return;
+
+	rb_erase(&p->core_node, &rq->core_tree);
+}
+
+/*
+ * Find left-most (aka, highest priority) task matching @cookie.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+	struct rb_node *node;
+
+	node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
+	/*
+	 * The idle task always matches any cookie!
+	 */
+	if (!node)
+		return idle_sched_class.pick_task(rq);
+
+	return __node_2_sc(node);
+}
+
 /*
  * Magic required such that:
  *
@@ -147,10 +274,16 @@ static void __sched_core_flip(bool enabled)
 	cpus_read_unlock();
 }
 
-static void __sched_core_enable(void)
+static void sched_core_assert_empty(void)
 {
-	// XXX verify there are no cookie tasks (yet)
+	int cpu;
 
+	for_each_possible_cpu(cpu)
+		WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
+}
+
+static void __sched_core_enable(void)
+{
 	static_branch_enable(&__sched_core_enabled);
 	/*
 	 * Ensure all previous instances of raw_spin_rq_*lock() have finished
@@ -158,12 +291,12 @@ static void __sched_core_enable(void)
 	 */
 	synchronize_rcu();
 	__sched_core_flip(true);
+	sched_core_assert_empty();
 }
 
 static void __sched_core_disable(void)
 {
-	// XXX verify there are no cookie tasks (left)
-
+	sched_core_assert_empty();
 	__sched_core_flip(false);
 	static_branch_disable(&__sched_core_enabled);
 }
@@ -205,6 +338,11 @@ void sched_core_put(void)
 		schedule_work(&_work);
 }
 
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
+static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+
 #endif /* CONFIG_SCHED_CORE */
 
 /*
@@ -1797,10 +1935,16 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 
 	uclamp_rq_inc(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
+
+	if (sched_core_enabled(rq))
+		sched_core_enqueue(rq, p);
 }
 
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	if (sched_core_enabled(rq))
+		sched_core_dequeue(rq, p);
+
 	if (!(flags & DEQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 51d72ab5b5ae..08be7a2eb05b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -268,33 +268,11 @@ const struct sched_class fair_sched_class;
  */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-	SCHED_WARN_ON(!entity_is_task(se));
-	return container_of(se, struct task_struct, se);
-}
 
 /* Walk up scheduling entities hierarchy */
 #define for_each_sched_entity(se) \
 		for (; se; se = se->parent)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-	return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-	return grp->my_q;
-}
-
 static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 {
 	if (!path)
@@ -455,33 +433,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #else	/* !CONFIG_FAIR_GROUP_SCHED */
 
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-	return container_of(se, struct task_struct, se);
-}
-
 #define for_each_sched_entity(se) \
 		for (; se; se = NULL)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-	struct task_struct *p = task_of(se);
-	struct rq *rq = task_rq(p);
-
-	return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-	return NULL;
-}
-
 static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 {
 	if (path)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fa990cd259ce..e43a2176d88f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1080,6 +1080,10 @@ struct rq {
 	/* per rq */
 	struct rq		*core;
 	unsigned int		core_enabled;
+	struct rb_root		core_tree;
+
+	/* shared state */
+	unsigned int		core_task_seq;
 #endif
 };
 
@@ -1243,6 +1247,57 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define raw_rq()		raw_cpu_ptr(&runqueues)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+	SCHED_WARN_ON(!entity_is_task(se));
+	return container_of(se, struct task_struct, se);
+}
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return grp->my_q;
+}
+
+#else
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+	return container_of(se, struct task_struct, se);
+}
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	struct task_struct *p = task_of(se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return NULL;
+}
+#endif
+
 extern void update_rq_clock(struct rq *rq);
 
 static inline u64 __rq_clock_broken(struct rq *rq)
-- 
cgit v1.2.3


From d2dfa17bc7de67e99685c4d6557837bf801a102c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 17 Nov 2020 18:19:43 -0500
Subject: sched: Trivial forced-newidle balancer

When a sibling is forced-idle to match the core-cookie; search for
matching tasks to fill the core.

rcu_read_unlock() can incur an infrequent deadlock in
sched_core_balance(). Fix this by using the RCU-sched flavor instead.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210422123308.800048269@infradead.org
---
 include/linux/sched.h |   1 +
 kernel/sched/core.c   | 130 +++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/idle.c   |   1 +
 kernel/sched/sched.h  |   6 +++
 4 files changed, 137 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 45eedccf86aa..9b822e383212 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -705,6 +705,7 @@ struct task_struct {
 #ifdef CONFIG_SCHED_CORE
 	struct rb_node			core_node;
 	unsigned long			core_cookie;
+	unsigned int			core_occupation;
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e45c1d21b371..b4988887510f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -204,6 +204,21 @@ static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
 	return __node_2_sc(node);
 }
 
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+{
+	struct rb_node *node = &p->core_node;
+
+	node = rb_next(node);
+	if (!node)
+		return NULL;
+
+	p = container_of(node, struct task_struct, core_node);
+	if (p->core_cookie != cookie)
+		return NULL;
+
+	return p;
+}
+
 /*
  * Magic required such that:
  *
@@ -5389,8 +5404,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	const struct sched_class *class;
 	const struct cpumask *smt_mask;
 	bool fi_before = false;
+	int i, j, cpu, occ = 0;
 	bool need_sync;
-	int i, j, cpu;
 
 	if (!sched_core_enabled(rq))
 		return __pick_next_task(rq, prev, rf);
@@ -5512,6 +5527,9 @@ again:
 			if (!p)
 				continue;
 
+			if (!is_task_rq_idle(p))
+				occ++;
+
 			rq_i->core_pick = p;
 			if (rq_i->idle == p && rq_i->nr_running) {
 				rq->core->core_forceidle = true;
@@ -5543,6 +5561,7 @@ again:
 
 						cpu_rq(j)->core_pick = NULL;
 					}
+					occ = 1;
 					goto again;
 				}
 			}
@@ -5588,6 +5607,8 @@ again:
 		if (!(fi_before && rq->core->core_forceidle))
 			task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
 
+		rq_i->core_pick->core_occupation = occ;
+
 		if (i == cpu) {
 			rq_i->core_pick = NULL;
 			continue;
@@ -5609,6 +5630,113 @@ done:
 	return next;
 }
 
+static bool try_steal_cookie(int this, int that)
+{
+	struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+	struct task_struct *p;
+	unsigned long cookie;
+	bool success = false;
+
+	local_irq_disable();
+	double_rq_lock(dst, src);
+
+	cookie = dst->core->core_cookie;
+	if (!cookie)
+		goto unlock;
+
+	if (dst->curr != dst->idle)
+		goto unlock;
+
+	p = sched_core_find(src, cookie);
+	if (p == src->idle)
+		goto unlock;
+
+	do {
+		if (p == src->core_pick || p == src->curr)
+			goto next;
+
+		if (!cpumask_test_cpu(this, &p->cpus_mask))
+			goto next;
+
+		if (p->core_occupation > dst->idle->core_occupation)
+			goto next;
+
+		p->on_rq = TASK_ON_RQ_MIGRATING;
+		deactivate_task(src, p, 0);
+		set_task_cpu(p, this);
+		activate_task(dst, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+
+		resched_curr(dst);
+
+		success = true;
+		break;
+
+next:
+		p = sched_core_next(p, cookie);
+	} while (p);
+
+unlock:
+	double_rq_unlock(dst, src);
+	local_irq_enable();
+
+	return success;
+}
+
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+	int i;
+
+	for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+		if (i == cpu)
+			continue;
+
+		if (need_resched())
+			break;
+
+		if (try_steal_cookie(cpu, i))
+			return true;
+	}
+
+	return false;
+}
+
+static void sched_core_balance(struct rq *rq)
+{
+	struct sched_domain *sd;
+	int cpu = cpu_of(rq);
+
+	preempt_disable();
+	rcu_read_lock();
+	raw_spin_rq_unlock_irq(rq);
+	for_each_domain(cpu, sd) {
+		if (need_resched())
+			break;
+
+		if (steal_cookie_task(cpu, sd))
+			break;
+	}
+	raw_spin_rq_lock_irq(rq);
+	rcu_read_unlock();
+	preempt_enable();
+}
+
+static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+
+void queue_core_balance(struct rq *rq)
+{
+	if (!sched_core_enabled(rq))
+		return;
+
+	if (!rq->core->core_cookie)
+		return;
+
+	if (!rq->nr_running) /* not forced idle */
+		return;
+
+	queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
+}
+
 static inline void sched_core_cpu_starting(unsigned int cpu)
 {
 	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 43646e7876d9..912b47aa99d8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -437,6 +437,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
 {
 	update_idle_core(rq);
 	schedstat_inc(rq->sched_goidle);
+	queue_core_balance(rq);
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4a898abc60ce..91ca1fee9fec 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1170,6 +1170,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
 
 bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
 
+extern void queue_core_balance(struct rq *rq);
+
 #else /* !CONFIG_SCHED_CORE */
 
 static inline bool sched_core_enabled(struct rq *rq)
@@ -1192,6 +1194,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
 	return &rq->__lock;
 }
 
+static inline void queue_core_balance(struct rq *rq)
+{
+}
+
 #endif /* CONFIG_SCHED_CORE */
 
 static inline void lockdep_assert_rq_held(struct rq *rq)
-- 
cgit v1.2.3


From 6e33cad0af49336952e5541464bd02f5b5fd433e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 26 Mar 2021 18:55:06 +0100
Subject: sched: Trivial core scheduling cookie management

In order to not have to use pid_struct, create a new, smaller,
structure to manage task cookies for core scheduling.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210422123308.919768100@infradead.org
---
 include/linux/sched.h     |   6 +++
 kernel/fork.c             |   1 +
 kernel/sched/Makefile     |   1 +
 kernel/sched/core.c       |   7 +--
 kernel/sched/core_sched.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h      |  16 +++++++
 6 files changed, 137 insertions(+), 3 deletions(-)
 create mode 100644 kernel/sched/core_sched.c

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9b822e383212..eab3f7c4251b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2179,4 +2179,10 @@ int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
+#ifdef CONFIG_SCHED_CORE
+extern void sched_core_free(struct task_struct *tsk);
+#else
+static inline void sched_core_free(struct task_struct *tsk) { }
+#endif
+
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index dc06afd725cb..d16c60c9daca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -742,6 +742,7 @@ void __put_task_struct(struct task_struct *tsk)
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 	put_signal_struct(tsk->signal);
+	sched_core_free(tsk);
 
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5fc9c9b70862..978fcfca5871 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
 obj-$(CONFIG_CPU_ISOLATION) += isolation.o
 obj-$(CONFIG_PSI) += psi.o
+obj-$(CONFIG_SCHED_CORE) += core_sched.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b4988887510f..55b2d9399e12 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -167,7 +167,7 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
 	return 0;
 }
 
-static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 {
 	rq->core->core_task_seq++;
 
@@ -177,14 +177,15 @@ static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 }
 
-static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p)
 {
 	rq->core->core_task_seq++;
 
-	if (!p->core_cookie)
+	if (!sched_core_enqueued(p))
 		return;
 
 	rb_erase(&p->core_node, &rq->core_tree);
+	RB_CLEAR_NODE(&p->core_node);
 }
 
 /*
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
new file mode 100644
index 000000000000..8d0869a9eb8c
--- /dev/null
+++ b/kernel/sched/core_sched.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "sched.h"
+
+/*
+ * A simple wrapper around refcount. An allocated sched_core_cookie's
+ * address is used to compute the cookie of the task.
+ */
+struct sched_core_cookie {
+	refcount_t refcnt;
+};
+
+unsigned long sched_core_alloc_cookie(void)
+{
+	struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
+	if (!ck)
+		return 0;
+
+	refcount_set(&ck->refcnt, 1);
+	sched_core_get();
+
+	return (unsigned long)ck;
+}
+
+void sched_core_put_cookie(unsigned long cookie)
+{
+	struct sched_core_cookie *ptr = (void *)cookie;
+
+	if (ptr && refcount_dec_and_test(&ptr->refcnt)) {
+		kfree(ptr);
+		sched_core_put();
+	}
+}
+
+unsigned long sched_core_get_cookie(unsigned long cookie)
+{
+	struct sched_core_cookie *ptr = (void *)cookie;
+
+	if (ptr)
+		refcount_inc(&ptr->refcnt);
+
+	return cookie;
+}
+
+/*
+ * sched_core_update_cookie - replace the cookie on a task
+ * @p: the task to update
+ * @cookie: the new cookie
+ *
+ * Effectively exchange the task cookie; caller is responsible for lifetimes on
+ * both ends.
+ *
+ * Returns: the old cookie
+ */
+unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+{
+	unsigned long old_cookie;
+	struct rq_flags rf;
+	struct rq *rq;
+	bool enqueued;
+
+	rq = task_rq_lock(p, &rf);
+
+	/*
+	 * Since creating a cookie implies sched_core_get(), and we cannot set
+	 * a cookie until after we've created it, similarly, we cannot destroy
+	 * a cookie until after we've removed it, we must have core scheduling
+	 * enabled here.
+	 */
+	SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
+
+	enqueued = sched_core_enqueued(p);
+	if (enqueued)
+		sched_core_dequeue(rq, p);
+
+	old_cookie = p->core_cookie;
+	p->core_cookie = cookie;
+
+	if (enqueued)
+		sched_core_enqueue(rq, p);
+
+	/*
+	 * If task is currently running, it may not be compatible anymore after
+	 * the cookie change, so enter the scheduler on its CPU to schedule it
+	 * away.
+	 */
+	if (task_running(rq, p))
+		resched_curr(rq);
+
+	task_rq_unlock(rq, p, &rf);
+
+	return old_cookie;
+}
+
+static unsigned long sched_core_clone_cookie(struct task_struct *p)
+{
+	unsigned long cookie, flags;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	cookie = sched_core_get_cookie(p->core_cookie);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	return cookie;
+}
+
+void sched_core_free(struct task_struct *p)
+{
+	sched_core_put_cookie(p->core_cookie);
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3878386a0a02..904c52b560d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1229,6 +1229,22 @@ static inline bool sched_group_cookie_match(struct rq *rq,
 
 extern void queue_core_balance(struct rq *rq);
 
+static inline bool sched_core_enqueued(struct task_struct *p)
+{
+	return !RB_EMPTY_NODE(&p->core_node);
+}
+
+extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+
+extern void sched_core_get(void);
+extern void sched_core_put(void);
+
+extern unsigned long sched_core_alloc_cookie(void);
+extern void sched_core_put_cookie(unsigned long cookie);
+extern unsigned long sched_core_get_cookie(unsigned long cookie);
+extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie);
+
 #else /* !CONFIG_SCHED_CORE */
 
 static inline bool sched_core_enabled(struct rq *rq)
-- 
cgit v1.2.3


From 85dd3f61203c5cfa72b308ff327b5fbf3fc1ce5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 29 Mar 2021 15:18:35 +0200
Subject: sched: Inherit task cookie on fork()

Note that sched_core_fork() is called from under tasklist_lock, and
not from sched_fork() earlier. This avoids a few races later.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210422123308.980003687@infradead.org
---
 include/linux/sched.h     | 2 ++
 kernel/fork.c             | 3 +++
 kernel/sched/core_sched.c | 6 ++++++
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eab3f7c4251b..fba47e52e482 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2181,8 +2181,10 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
+extern void sched_core_fork(struct task_struct *p);
 #else
 static inline void sched_core_free(struct task_struct *tsk) { }
+static inline void sched_core_fork(struct task_struct *p) { }
 #endif
 
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index d16c60c9daca..e7fd928fcafe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2251,6 +2251,8 @@ static __latent_entropy struct task_struct *copy_process(
 
 	klp_copy_process(p);
 
+	sched_core_fork(p);
+
 	spin_lock(&current->sighand->siglock);
 
 	/*
@@ -2338,6 +2340,7 @@ static __latent_entropy struct task_struct *copy_process(
 	return p;
 
 bad_fork_cancel_cgroup:
+	sched_core_free(p);
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p, args);
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 8d0869a9eb8c..dcbbeaefaaa3 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -103,6 +103,12 @@ static unsigned long sched_core_clone_cookie(struct task_struct *p)
 	return cookie;
 }
 
+void sched_core_fork(struct task_struct *p)
+{
+	RB_CLEAR_NODE(&p->core_node);
+	p->core_cookie = sched_core_clone_cookie(current);
+}
+
 void sched_core_free(struct task_struct *p)
 {
 	sched_core_put_cookie(p->core_cookie);
-- 
cgit v1.2.3


From 7ac592aa35a684ff1858fb9ec282886b9e3575ac Mon Sep 17 00:00:00 2001
From: Chris Hyser <chris.hyser@oracle.com>
Date: Wed, 24 Mar 2021 17:40:15 -0400
Subject: sched: prctl() core-scheduling interface

This patch provides support for setting and copying core scheduling
'task cookies' between threads (PID), processes (TGID), and process
groups (PGID).

The value of core scheduling isn't that tasks don't share a core,
'nosmt' can do that. The value lies in exploiting all the sharing
opportunities that exist to recover possible lost performance and that
requires a degree of flexibility in the API.

From a security perspective (and there are others), the thread,
process and process group distinction is an existent hierarchal
categorization of tasks that reflects many of the security concerns
about 'data sharing'. For example, protecting against cache-snooping
by a thread that can just read the memory directly isn't all that
useful.

With this in mind, subcommands to CREATE/SHARE (TO/FROM) provide a
mechanism to create and share cookies. CREATE/SHARE_TO specify a
target pid with enum pidtype used to specify the scope of the targeted
tasks. For example, PIDTYPE_TGID will share the cookie with the
process and all of it's threads as typically desired in a security
scenario.

API:

  prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, tgtpid, pidtype, &cookie)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, tgtpid, pidtype, NULL)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, tgtpid, pidtype, NULL)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, srcpid, pidtype, NULL)

where 'tgtpid/srcpid == 0' implies the current process and pidtype is
kernel enum pid_type {PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, ...}.

For return values, EINVAL, ENOMEM are what they say. ESRCH means the
tgtpid/srcpid was not found. EPERM indicates lack of PTRACE permission
access to tgtpid/srcpid. ENODEV indicates your machines lacks SMT.

[peterz: complete rewrite]
Signed-off-by: Chris Hyser <chris.hyser@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Don Hiatt <dhiatt@digitalocean.com>
Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20210422123309.039845339@infradead.org
---
 include/linux/sched.h            |   2 +
 include/uapi/linux/prctl.h       |   8 +++
 kernel/sched/core_sched.c        | 114 +++++++++++++++++++++++++++++++++++++++
 kernel/sys.c                     |   5 ++
 tools/include/uapi/linux/prctl.h |   8 +++
 5 files changed, 137 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fba47e52e482..c7e7d50e2fdc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2182,6 +2182,8 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
+extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+				unsigned long uaddr);
 #else
 static inline void sched_core_free(struct task_struct *tsk) { }
 static inline void sched_core_fork(struct task_struct *p) { }
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 18a9f59dc067..967d9c55323d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -259,4 +259,12 @@ struct prctl_mm_map {
 #define PR_PAC_SET_ENABLED_KEYS		60
 #define PR_PAC_GET_ENABLED_KEYS		61
 
+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE			62
+# define PR_SCHED_CORE_GET		0
+# define PR_SCHED_CORE_CREATE		1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_TO		2 /* push core_sched cookie to pid */
+# define PR_SCHED_CORE_SHARE_FROM	3 /* pull core_sched cookie to pid */
+# define PR_SCHED_CORE_MAX		4
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index dcbbeaefaaa3..9a80e9a474c0 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
+#include <linux/prctl.h>
 #include "sched.h"
 
 /*
@@ -113,3 +114,116 @@ void sched_core_free(struct task_struct *p)
 {
 	sched_core_put_cookie(p->core_cookie);
 }
+
+static void __sched_core_set(struct task_struct *p, unsigned long cookie)
+{
+	cookie = sched_core_get_cookie(cookie);
+	cookie = sched_core_update_cookie(p, cookie);
+	sched_core_put_cookie(cookie);
+}
+
+/* Called from prctl interface: PR_SCHED_CORE */
+int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+			 unsigned long uaddr)
+{
+	unsigned long cookie = 0, id = 0;
+	struct task_struct *task, *p;
+	struct pid *grp;
+	int err = 0;
+
+	if (!static_branch_likely(&sched_smt_present))
+		return -ENODEV;
+
+	if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
+	    (cmd != PR_SCHED_CORE_GET && uaddr))
+		return -EINVAL;
+
+	rcu_read_lock();
+	if (pid == 0) {
+		task = current;
+	} else {
+		task = find_task_by_vpid(pid);
+		if (!task) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+	}
+	get_task_struct(task);
+	rcu_read_unlock();
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. Use the regular "ptrace_may_access()" checks.
+	 */
+	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	switch (cmd) {
+	case PR_SCHED_CORE_GET:
+		if (type != PIDTYPE_PID || uaddr & 7) {
+			err = -EINVAL;
+			goto out;
+		}
+		cookie = sched_core_clone_cookie(task);
+		if (cookie) {
+			/* XXX improve ? */
+			ptr_to_hashval((void *)cookie, &id);
+		}
+		err = put_user(id, (u64 __user *)uaddr);
+		goto out;
+
+	case PR_SCHED_CORE_CREATE:
+		cookie = sched_core_alloc_cookie();
+		if (!cookie) {
+			err = -ENOMEM;
+			goto out;
+		}
+		break;
+
+	case PR_SCHED_CORE_SHARE_TO:
+		cookie = sched_core_clone_cookie(current);
+		break;
+
+	case PR_SCHED_CORE_SHARE_FROM:
+		if (type != PIDTYPE_PID) {
+			err = -EINVAL;
+			goto out;
+		}
+		cookie = sched_core_clone_cookie(task);
+		__sched_core_set(current, cookie);
+		goto out;
+
+	default:
+		err = -EINVAL;
+		goto out;
+	};
+
+	if (type == PIDTYPE_PID) {
+		__sched_core_set(task, cookie);
+		goto out;
+	}
+
+	read_lock(&tasklist_lock);
+	grp = task_pid_type(task, type);
+
+	do_each_pid_thread(grp, type, p) {
+		if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) {
+			err = -EPERM;
+			goto out_tasklist;
+		}
+	} while_each_pid_thread(grp, type, p);
+
+	do_each_pid_thread(grp, type, p) {
+		__sched_core_set(p, cookie);
+	} while_each_pid_thread(grp, type, p);
+out_tasklist:
+	read_unlock(&tasklist_lock);
+
+out:
+	sched_core_put_cookie(cookie);
+	put_task_struct(task);
+	return err;
+}
+
diff --git a/kernel/sys.c b/kernel/sys.c
index 3a583a29815f..9de46a4bf492 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2550,6 +2550,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		error = set_syscall_user_dispatch(arg2, arg3, arg4,
 						  (char __user *) arg5);
 		break;
+#ifdef CONFIG_SCHED_CORE
+	case PR_SCHED_CORE:
+		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
+		break;
+#endif
 	default:
 		error = -EINVAL;
 		break;
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 18a9f59dc067..967d9c55323d 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -259,4 +259,12 @@ struct prctl_mm_map {
 #define PR_PAC_SET_ENABLED_KEYS		60
 #define PR_PAC_GET_ENABLED_KEYS		61
 
+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE			62
+# define PR_SCHED_CORE_GET		0
+# define PR_SCHED_CORE_CREATE		1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_TO		2 /* push core_sched cookie to pid */
+# define PR_SCHED_CORE_SHARE_FROM	3 /* pull core_sched cookie to pid */
+# define PR_SCHED_CORE_MAX		4
+
 #endif /* _LINUX_PRCTL_H */
-- 
cgit v1.2.3


From fa5e5dc39669b4427830c546ede8709323b8276c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 6 May 2021 21:33:58 +0200
Subject: jump_label, x86: Introduce jump_entry_size()

This allows architectures to have variable sized jumps.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210506194157.786777050@infradead.org
---
 arch/x86/include/asm/jump_label.h | 4 ++--
 arch/x86/kernel/jump_label.c      | 7 +++++++
 include/linux/jump_label.h        | 9 +++++++++
 kernel/jump_label.c               | 2 +-
 4 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index dfdc2b1c17dd..d85802a00629 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -4,8 +4,6 @@
 
 #define HAVE_JUMP_LABEL_BATCH
 
-#define JUMP_LABEL_NOP_SIZE 5
-
 #include <asm/asm.h>
 #include <asm/nops.h>
 
@@ -47,6 +45,8 @@ l_yes:
 	return true;
 }
 
+extern int arch_jump_entry_size(struct jump_entry *entry);
+
 #endif	/* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 638d3b9be0ad..a29eecc14c94 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -16,6 +16,13 @@
 #include <asm/alternative.h>
 #include <asm/text-patching.h>
 
+#define JUMP_LABEL_NOP_SIZE	JMP32_INSN_SIZE
+
+int arch_jump_entry_size(struct jump_entry *entry)
+{
+	return JMP32_INSN_SIZE;
+}
+
 static const void *
 __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type)
 {
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 05f5554d860f..8c45f58292ac 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -176,6 +176,15 @@ static inline void jump_entry_set_init(struct jump_entry *entry)
 	entry->key |= 2;
 }
 
+static inline int jump_entry_size(struct jump_entry *entry)
+{
+#ifdef JUMP_LABEL_NOP_SIZE
+	return JUMP_LABEL_NOP_SIZE;
+#else
+	return arch_jump_entry_size(entry);
+#endif
+}
+
 #endif
 #endif
 
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index ba39fbb1f8e7..521cafcfcb69 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -309,7 +309,7 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit);
 static int addr_conflict(struct jump_entry *entry, void *start, void *end)
 {
 	if (jump_entry_code(entry) <= (unsigned long)end &&
-	    jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+	    jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start)
 		return 1;
 
 	return 0;
-- 
cgit v1.2.3


From 5af0ea293d78c8b8f0b87ae2b13f7ac584057bc3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 6 May 2021 21:34:00 +0200
Subject: jump_label: Free jump_entry::key bit1 for build use

Have jump_label_init() set jump_entry::key bit1 to either 0 ot 1
unconditionally. This makes it available for build-time games.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210506194157.906893264@infradead.org
---
 include/linux/jump_label.h |  7 +++++--
 kernel/jump_label.c        | 10 ++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 8c45f58292ac..48b9b2a82767 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -171,9 +171,12 @@ static inline bool jump_entry_is_init(const struct jump_entry *entry)
 	return (unsigned long)entry->key & 2UL;
 }
 
-static inline void jump_entry_set_init(struct jump_entry *entry)
+static inline void jump_entry_set_init(struct jump_entry *entry, bool set)
 {
-	entry->key |= 2;
+	if (set)
+		entry->key |= 2;
+	else
+		entry->key &= ~2;
 }
 
 static inline int jump_entry_size(struct jump_entry *entry)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 521cafcfcb69..bdb0681bece8 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -483,13 +483,14 @@ void __init jump_label_init(void)
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
 		struct static_key *iterk;
+		bool in_init;
 
 		/* rewrite NOPs */
 		if (jump_label_type(iter) == JUMP_LABEL_NOP)
 			arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
 
-		if (init_section_contains((void *)jump_entry_code(iter), 1))
-			jump_entry_set_init(iter);
+		in_init = init_section_contains((void *)jump_entry_code(iter), 1);
+		jump_entry_set_init(iter, in_init);
 
 		iterk = jump_entry_key(iter);
 		if (iterk == key)
@@ -634,9 +635,10 @@ static int jump_label_add_module(struct module *mod)
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
 		struct static_key *iterk;
+		bool in_init;
 
-		if (within_module_init(jump_entry_code(iter), mod))
-			jump_entry_set_init(iter);
+		in_init = within_module_init(jump_entry_code(iter), mod);
+		jump_entry_set_init(iter, in_init);
 
 		iterk = jump_entry_key(iter);
 		if (iterk == key)
-- 
cgit v1.2.3


From cc00c1988801dc71f63bb7bad019e85046865095 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 12 May 2021 19:51:31 +0200
Subject: sched: Fix leftover comment typos

A few more snuck in. Also capitalize 'CPU' while at it.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched_clock.h | 2 +-
 kernel/sched/core.c         | 4 ++--
 kernel/sched/fair.c         | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 528718e4ed52..835ee87ed792 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -14,7 +14,7 @@
  * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
  *			clocks.
  * @read_sched_clock:	Current clock source (or dummy source when suspended).
- * @mult:		Multipler for scaled math conversion.
+ * @mult:		Multiplier for scaled math conversion.
  * @shift:		Shift value for scaled math conversion.
  *
  * Care must be taken when updating this structure; it is read by
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9d00f4958bde..ac8882da5daf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5506,7 +5506,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	}
 
 	/*
-	 * Try and select tasks for each sibling in decending sched_class
+	 * Try and select tasks for each sibling in descending sched_class
 	 * order.
 	 */
 	for_each_class(class) {
@@ -5520,7 +5520,7 @@ again:
 
 			/*
 			 * If this sibling doesn't yet have a suitable task to
-			 * run; ask for the most elegible task, given the
+			 * run; ask for the most eligible task, given the
 			 * highest priority task already selected for this
 			 * core.
 			 */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2635e1048421..161b92aa1c79 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10808,11 +10808,11 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
 	 * sched_slice() considers only this active rq and it gets the
 	 * whole slice. But during force idle, we have siblings acting
 	 * like a single runqueue and hence we need to consider runnable
-	 * tasks on this cpu and the forced idle cpu. Ideally, we should
+	 * tasks on this CPU and the forced idle CPU. Ideally, we should
 	 * go through the forced idle rq, but that would be a perf hit.
-	 * We can assume that the forced idle cpu has atleast
+	 * We can assume that the forced idle CPU has at least
 	 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
-	 * if we need to give up the cpu.
+	 * if we need to give up the CPU.
 	 */
 	if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
 	    __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
-- 
cgit v1.2.3


From 93d0955e6cf562d02aae37f5f8d98d9d9d16e0d4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 12 May 2021 20:04:28 +0200
Subject: locking: Fix comment typos

A few snuck through.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep_types.h |  2 +-
 kernel/futex.c                | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 2ec9ff5a7fff..3e726ace5c62 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -52,7 +52,7 @@ enum lockdep_lock_type {
  * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
  * cached in the instance of lockdep_map
  *
- * Currently main class (subclass == 0) and signle depth subclass
+ * Currently main class (subclass == 0) and single depth subclass
  * are cached in lockdep_map. This optimization is mainly targeting
  * on rq->lock. double_rq_lock() acquires this highly competitive with
  * single depth.
diff --git a/kernel/futex.c b/kernel/futex.c
index 4938a00bc785..2f386f012900 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1874,7 +1874,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 	 * If the caller intends to requeue more than 1 waiter to pifutex,
 	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
 	 * as we have means to handle the possible fault.  If not, don't set
-	 * the bit unecessarily as it will force the subsequent unlock to enter
+	 * the bit unnecessarily as it will force the subsequent unlock to enter
 	 * the kernel.
 	 */
 	top_waiter = futex_top_waiter(hb1, key1);
@@ -2103,7 +2103,7 @@ retry_private:
 			continue;
 
 		/*
-		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+		 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
 		 * be paired with each other and no other futex ops.
 		 *
 		 * We should never be requeueing a futex_q with a pi_state,
@@ -2318,7 +2318,7 @@ retry:
 }
 
 /*
- * PI futexes can not be requeued and must remove themself from the
+ * PI futexes can not be requeued and must remove themselves from the
  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
  */
 static void unqueue_me_pi(struct futex_q *q)
@@ -2903,7 +2903,7 @@ no_block:
 	 */
 	res = fixup_owner(uaddr, &q, !ret);
 	/*
-	 * If fixup_owner() returned an error, proprogate that.  If it acquired
+	 * If fixup_owner() returned an error, propagate that.  If it acquired
 	 * the lock, clear our -ETIMEDOUT or -EINTR.
 	 */
 	if (res)
@@ -3280,7 +3280,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		 */
 		res = fixup_owner(uaddr2, &q, !ret);
 		/*
-		 * If fixup_owner() returned an error, proprogate that.  If it
+		 * If fixup_owner() returned an error, propagate that.  If it
 		 * acquired the lock, clear -ETIMEDOUT or -EINTR.
 		 */
 		if (res)
@@ -3678,7 +3678,7 @@ void futex_exec_release(struct task_struct *tsk)
 {
 	/*
 	 * The state handling is done for consistency, but in the case of
-	 * exec() there is no way to prevent futher damage as the PID stays
+	 * exec() there is no way to prevent further damage as the PID stays
 	 * the same. But for the unlikely and arguably buggy case that a
 	 * futex is held on exec(), this provides at least as much state
 	 * consistency protection which is possible.
-- 
cgit v1.2.3


From ca0760e7d79e2bb9c342e6b3f925b1ef01c6303e Mon Sep 17 00:00:00 2001
From: Wei Ming Chen <jj251510319013@gmail.com>
Date: Thu, 6 May 2021 20:30:51 +0800
Subject: Compiler Attributes: Add continue in comment

Add "continue;" for switch/case block according to Doc[1]

[1] https://www.kernel.org/doc/html/latest/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through

Signed-off-by: Wei Ming Chen <jj251510319013@gmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 include/linux/compiler_attributes.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index c043b8d2b17b..183ddd5fd072 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -199,6 +199,7 @@
  * must end with any of these keywords:
  *   break;
  *   fallthrough;
+ *   continue;
  *   goto <label>;
  *   return [expression];
  *
-- 
cgit v1.2.3


From 01aee8fd7fb23049e2b52abadbe1f7b5e94a52d2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Apr 2021 23:02:25 +0300
Subject: sched: Make nr_running() return 32-bit value

Creating 2**32 tasks is impossible due to futex pid limits and wasteful
anyway. Nobody has done it.

Bring nr_running() into 32-bit world to save on REX prefixes.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210422200228.1423391-1-adobriyan@gmail.com
---
 fs/proc/loadavg.c          | 2 +-
 fs/proc/stat.c             | 2 +-
 include/linux/sched/stat.h | 2 +-
 kernel/sched/core.c        | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 8468baee951d..f32878d9a39f 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -16,7 +16,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
 
 	get_avenrun(avnrun, FIXED_1/200, 0);
 
-	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
 		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
 		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index f25e8531fd27..941605de7f9a 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -200,7 +200,7 @@ static int show_stat(struct seq_file *p, void *v)
 		"\nctxt %llu\n"
 		"btime %llu\n"
 		"processes %lu\n"
-		"procs_running %lu\n"
+		"procs_running %u\n"
 		"procs_blocked %lu\n",
 		nr_context_switches(),
 		(unsigned long long)boottime.tv_sec,
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 939c3ec9e1b9..73606b3de394 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -17,7 +17,7 @@ extern unsigned long total_forks;
 extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
-extern unsigned long nr_running(void);
+extern unsigned int nr_running(void);
 extern bool single_task_running(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ac8882da5daf..2c6cdb059c64 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4692,9 +4692,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
  * externally visible scheduler statistics: current number of runnable
  * threads, total number of context switches performed since bootup.
  */
-unsigned long nr_running(void)
+unsigned int nr_running(void)
 {
-	unsigned long i, sum = 0;
+	unsigned int i, sum = 0;
 
 	for_each_online_cpu(i)
 		sum += cpu_rq(i)->nr_running;
-- 
cgit v1.2.3


From 9745516841a55c77163a5d549bce1374d776df54 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Apr 2021 23:02:26 +0300
Subject: sched: Make nr_iowait() return 32-bit value

Creating 2**32 tasks to wait in D-state is impossible and wasteful.

Return "unsigned int" and save on REX prefixes.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210422200228.1423391-2-adobriyan@gmail.com
---
 fs/proc/stat.c             | 2 +-
 include/linux/sched/stat.h | 2 +-
 kernel/sched/core.c        | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 941605de7f9a..6561a06ef905 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -201,7 +201,7 @@ static int show_stat(struct seq_file *p, void *v)
 		"btime %llu\n"
 		"processes %lu\n"
 		"procs_running %u\n"
-		"procs_blocked %lu\n",
+		"procs_blocked %u\n",
 		nr_context_switches(),
 		(unsigned long long)boottime.tv_sec,
 		total_forks,
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 73606b3de394..81d9b539e3b7 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -19,7 +19,7 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned int nr_running(void);
 extern bool single_task_running(void);
-extern unsigned long nr_iowait(void);
+extern unsigned int nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
 
 static inline int sched_info_on(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2c6cdb059c64..fadf2bf1e86f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4774,9 +4774,9 @@ unsigned long nr_iowait_cpu(int cpu)
  * Task CPU affinities can make all that even more 'interesting'.
  */
 
-unsigned long nr_iowait(void)
+unsigned int nr_iowait(void)
 {
-	unsigned long i, sum = 0;
+	unsigned int i, sum = 0;
 
 	for_each_possible_cpu(i)
 		sum += nr_iowait_cpu(i);
-- 
cgit v1.2.3


From 8fc2858e572ce761bffcade81a42ac72005e76f9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Apr 2021 23:02:27 +0300
Subject: sched: Make nr_iowait_cpu() return 32-bit value

Runqueue ->nr_iowait counters are 32-bit anyway.

Propagate 32-bitness into other code, but don't try too hard.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210422200228.1423391-3-adobriyan@gmail.com
---
 drivers/cpuidle/governors/menu.c | 6 +++---
 include/linux/sched/stat.h       | 2 +-
 kernel/sched/core.c              | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index c3aa8d6ccee3..2e5670446991 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -117,7 +117,7 @@ struct menu_device {
 	int		interval_ptr;
 };
 
-static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters)
+static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
 {
 	int bucket = 0;
 
@@ -150,7 +150,7 @@ static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters)
  * to be, the higher this multiplier, and thus the higher
  * the barrier to go to an expensive C state.
  */
-static inline int performance_multiplier(unsigned long nr_iowaiters)
+static inline int performance_multiplier(unsigned int nr_iowaiters)
 {
 	/* for IO wait tasks (per cpu!) we add 10x each */
 	return 1 + 10 * nr_iowaiters;
@@ -270,7 +270,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	unsigned int predicted_us;
 	u64 predicted_ns;
 	u64 interactivity_req;
-	unsigned long nr_iowaiters;
+	unsigned int nr_iowaiters;
 	ktime_t delta, delta_tick;
 	int i, idx;
 
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 81d9b539e3b7..0108a38bb64d 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -20,7 +20,7 @@ extern int nr_processes(void);
 extern unsigned int nr_running(void);
 extern bool single_task_running(void);
 extern unsigned int nr_iowait(void);
-extern unsigned long nr_iowait_cpu(int cpu);
+extern unsigned int nr_iowait_cpu(int cpu);
 
 static inline int sched_info_on(void)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fadf2bf1e86f..24fd795e4b8c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4739,7 +4739,7 @@ unsigned long long nr_context_switches(void)
  * it does become runnable.
  */
 
-unsigned long nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_cpu(int cpu)
 {
 	return atomic_read(&cpu_rq(cpu)->nr_iowait);
 }
-- 
cgit v1.2.3


From f105dfec0a951cd0d5bfbfe9dc067ea69f71ad5c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 13 May 2021 01:29:15 +0200
Subject: tick/nohz: Evaluate the CPU expression after the static key

When tick_nohz_full_cpu() is called with smp_processor_id(), the latter
is unconditionally evaluated whether the static key is on or off. It is
not necessary in the off-case though, so make sure the cpu expression
is executed at the last moment.

Illustrate with the following test function:

	int tick_nohz_test(void)
	{
		return tick_nohz_full_cpu(smp_processor_id());
	}

The resulting code before was:

	mov    %gs:0x7eea92d1(%rip),%eax   # smp_processor_id() fetch
	nopl   0x0(%rax,%rax,1)
	xor    %eax,%eax
	retq
	cmpb   $0x0,0x29d393a(%rip)        # <tick_nohz_full_running>
	je     tick_nohz_test+0x29         # jump to below eax clear
	mov    %eax,%eax
	bt     %rax,0x29d3936(%rip)        # <tick_nohz_full_mask>
	setb   %al
	movzbl %al,%eax
	retq
	xor    %eax,%eax
	retq

Now it becomes:

	nopl   0x0(%rax,%rax,1)
	xor    %eax,%eax
	retq
	cmpb   $0x0,0x29d3871(%rip)        # <tick_nohz_full_running>
	je     tick_nohz_test+0x29         # jump to below eax clear
	mov    %gs:0x7eea91f0(%rip),%eax   # smp_processor_id() fetch, after static key
	mov    %eax,%eax
	bt     %rax,0x29d3866(%rip)        # <tick_nohz_full_mask>
	setb   %al
	movzbl %al,%eax
	retq
	xor    %eax,%eax
	retq

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210512232924.150322-2-frederic@kernel.org
---
 include/linux/tick.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7340613c7eff..2258984a0e8a 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -185,13 +185,17 @@ static inline bool tick_nohz_full_enabled(void)
 	return tick_nohz_full_running;
 }
 
-static inline bool tick_nohz_full_cpu(int cpu)
-{
-	if (!tick_nohz_full_enabled())
-		return false;
-
-	return cpumask_test_cpu(cpu, tick_nohz_full_mask);
-}
+/*
+ * Check if a CPU is part of the nohz_full subset. Arrange for evaluating
+ * the cpu expression (typically smp_processor_id()) _after_ the static
+ * key.
+ */
+#define tick_nohz_full_cpu(_cpu) ({					\
+	bool __ret = false;						\
+	if (tick_nohz_full_enabled())					\
+		__ret = cpumask_test_cpu((_cpu), tick_nohz_full_mask);	\
+	__ret;								\
+})
 
 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
 {
-- 
cgit v1.2.3


From 1e4ca26d367ae71743e25068e5cd8750ef3f5f7d Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 13 May 2021 01:29:21 +0200
Subject: tick/nohz: Change signal tick dependency to wake up CPUs of member
 tasks

Rather than waking up all nohz_full CPUs on the system, only wake up
the target CPUs of member threads of the signal.

Reduces interruptions to nohz_full CPUs.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210512232924.150322-8-frederic@kernel.org
---
 include/linux/tick.h           |  8 ++++----
 kernel/time/posix-cpu-timers.c |  4 ++--
 kernel/time/tick-sched.c       | 15 +++++++++++++--
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 2258984a0e8a..0bb80a7f05b9 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -211,7 +211,7 @@ extern void tick_nohz_dep_set_task(struct task_struct *tsk,
 				   enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_task(struct task_struct *tsk,
 				     enum tick_dep_bits bit);
-extern void tick_nohz_dep_set_signal(struct signal_struct *signal,
+extern void tick_nohz_dep_set_signal(struct task_struct *tsk,
 				     enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
 				       enum tick_dep_bits bit);
@@ -256,11 +256,11 @@ static inline void tick_dep_clear_task(struct task_struct *tsk,
 	if (tick_nohz_full_enabled())
 		tick_nohz_dep_clear_task(tsk, bit);
 }
-static inline void tick_dep_set_signal(struct signal_struct *signal,
+static inline void tick_dep_set_signal(struct task_struct *tsk,
 				       enum tick_dep_bits bit)
 {
 	if (tick_nohz_full_enabled())
-		tick_nohz_dep_set_signal(signal, bit);
+		tick_nohz_dep_set_signal(tsk, bit);
 }
 static inline void tick_dep_clear_signal(struct signal_struct *signal,
 					 enum tick_dep_bits bit)
@@ -288,7 +288,7 @@ static inline void tick_dep_set_task(struct task_struct *tsk,
 				     enum tick_dep_bits bit) { }
 static inline void tick_dep_clear_task(struct task_struct *tsk,
 				       enum tick_dep_bits bit) { }
-static inline void tick_dep_set_signal(struct signal_struct *signal,
+static inline void tick_dep_set_signal(struct task_struct *tsk,
 				       enum tick_dep_bits bit) { }
 static inline void tick_dep_clear_signal(struct signal_struct *signal,
 					 enum tick_dep_bits bit) { }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3bb96a8b49c9..29a5e54e6e10 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -523,7 +523,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p)
 	if (CPUCLOCK_PERTHREAD(timer->it_clock))
 		tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
 	else
-		tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
+		tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 /*
@@ -1358,7 +1358,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
 	if (*newval < *nextevt)
 		*nextevt = *newval;
 
-	tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
+	tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b90ca6635ea4..acbe6722cf75 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -444,9 +444,20 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
  * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
  * per process timers.
  */
-void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+void tick_nohz_dep_set_signal(struct task_struct *tsk,
+			      enum tick_dep_bits bit)
 {
-	tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+	int prev;
+	struct signal_struct *sig = tsk->signal;
+
+	prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
+	if (!prev) {
+		struct task_struct *t;
+
+		lockdep_assert_held(&tsk->sighand->siglock);
+		__for_each_thread(sig, t)
+			tick_nohz_kick_task(t);
+	}
 }
 
 void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
-- 
cgit v1.2.3


From a1dfb6311c7739e21e160bc4c5575a1b21b48c87 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 13 May 2021 01:29:22 +0200
Subject: tick/nohz: Kick only _queued_ task whose tick dependency is updated

When the tick dependency of a task is updated, we want it to aknowledge
the new state and restart the tick if needed. If the task is not
running, we don't need to kick it because it will observe the new
dependency upon scheduling in. But if the task is running, we may need
to send an IPI to it so that it gets notified.

Unfortunately we don't have the means to check if a task is running
in a race free way. Checking p->on_cpu in a synchronized way against
p->tick_dep_mask would imply adding a full barrier between
prepare_task_switch() and tick_nohz_task_switch(), which we want to
avoid in this fast-path.

Therefore we blindly fire an IPI to the task's CPU.

Meanwhile we can check if the task is queued on the CPU rq because
p->on_rq is always set to TASK_ON_RQ_QUEUED _before_ schedule() and its
full barrier that precedes tick_nohz_task_switch(). And if the task is
queued on a nohz_full CPU, it also has fair chances to be running as the
isolation constraints prescribe running single tasks on full dynticks
CPUs.

So use this as a trick to check if we can spare an IPI toward a
non-running task.

NOTE: For the ordering to be correct, it is assumed that we never
deactivate a task while it is running, the only exception being the task
deactivating itself while scheduling out.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210512232924.150322-9-frederic@kernel.org
---
 include/linux/sched.h    |  2 ++
 kernel/sched/core.c      |  5 +++++
 kernel/time/tick-sched.c | 19 +++++++++++++++++--
 3 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c881384517..3341ae2e8231 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2011,6 +2011,8 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+extern bool sched_task_on_rq(struct task_struct *p);
+
 /*
  * In order to reduce various lock holder preemption latencies provide an
  * interface to see if a vCPU is currently running or not.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5226cc26a095..78e480f7881a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1590,6 +1590,11 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
+bool sched_task_on_rq(struct task_struct *p)
+{
+	return task_on_rq_queued(p);
+}
+
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (!(flags & ENQUEUE_NOCLOCK))
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index acbe6722cf75..197a3bd882ad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -324,14 +324,28 @@ void tick_nohz_full_kick_cpu(int cpu)
 
 static void tick_nohz_kick_task(struct task_struct *tsk)
 {
-	int cpu = task_cpu(tsk);
+	int cpu;
+
+	/*
+	 * If the task is not running, run_posix_cpu_timers()
+	 * has nothing to elapse, IPI can then be spared.
+	 *
+	 * activate_task()                      STORE p->tick_dep_mask
+	 *   STORE p->on_rq
+	 * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
+	 *   LOCK rq->lock                      LOAD p->on_rq
+	 *   smp_mb__after_spin_lock()
+	 *   tick_nohz_task_switch()
+	 *     LOAD p->tick_dep_mask
+	 */
+	if (!sched_task_on_rq(tsk))
+		return;
 
 	/*
 	 * If the task concurrently migrates to another CPU,
 	 * we guarantee it sees the new tick dependency upon
 	 * schedule.
 	 *
-	 *
 	 * set_task_cpu(p, cpu);
 	 *   STORE p->cpu = @cpu
 	 * __schedule() (switch to task 'p')
@@ -340,6 +354,7 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
 	 *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
 	 *      LOAD p->tick_dep_mask           LOAD p->cpu
 	 */
+	cpu = task_cpu(tsk);
 
 	preempt_disable();
 	if (cpu_online(cpu))
-- 
cgit v1.2.3


From cbbc07e1e892c373f30f4ba08fedecd49afca247 Mon Sep 17 00:00:00 2001
From: Peter Chen <peter.chen@nxp.com>
Date: Sat, 8 May 2021 13:33:57 +0800
Subject: usb: host: move EH SINGLE_STEP_SET_FEATURE implementation to core

It is needed at USB Certification test for Embedded Host 2.0, and
the detail is at CH6.4.1.1 of On-The-Go and Embedded Host Supplement
to the USB Revision 2.0 Specification. Since other USB 2.0 capable
host like XHCI also need it, so move it to HCD core.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Peter Chen <peter.chen@nxp.com>
Signed-off-by: Li Jun <jun.li@nxp.com>
Link: https://lore.kernel.org/r/1620452039-11694-1-git-send-email-jun.li@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c      | 134 ++++++++++++++++++++++++++++++++++++++++++
 drivers/usb/host/ehci-hcd.c |   4 ++
 drivers/usb/host/ehci-hub.c | 139 --------------------------------------------
 drivers/usb/host/ehci-q.c   |   2 +-
 include/linux/usb/hcd.h     |  13 ++++-
 5 files changed, 151 insertions(+), 141 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 6119fb41d736..d7eb9f179ca6 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -2110,6 +2110,140 @@ int usb_hcd_get_frame_number (struct usb_device *udev)
 	return hcd->driver->get_frame_number (hcd);
 }
 
+/*-------------------------------------------------------------------------*/
+#ifdef CONFIG_USB_HCD_TEST_MODE
+
+static void usb_ehset_completion(struct urb *urb)
+{
+	struct completion  *done = urb->context;
+
+	complete(done);
+}
+/*
+ * Allocate and initialize a control URB. This request will be used by the
+ * EHSET SINGLE_STEP_SET_FEATURE test in which the DATA and STATUS stages
+ * of the GetDescriptor request are sent 15 seconds after the SETUP stage.
+ * Return NULL if failed.
+ */
+static struct urb *request_single_step_set_feature_urb(
+	struct usb_device	*udev,
+	void			*dr,
+	void			*buf,
+	struct completion	*done)
+{
+	struct urb *urb;
+	struct usb_hcd *hcd = bus_to_hcd(udev->bus);
+	struct usb_host_endpoint *ep;
+
+	urb = usb_alloc_urb(0, GFP_KERNEL);
+	if (!urb)
+		return NULL;
+
+	urb->pipe = usb_rcvctrlpipe(udev, 0);
+	ep = (usb_pipein(urb->pipe) ? udev->ep_in : udev->ep_out)
+				[usb_pipeendpoint(urb->pipe)];
+	if (!ep) {
+		usb_free_urb(urb);
+		return NULL;
+	}
+
+	urb->ep = ep;
+	urb->dev = udev;
+	urb->setup_packet = (void *)dr;
+	urb->transfer_buffer = buf;
+	urb->transfer_buffer_length = USB_DT_DEVICE_SIZE;
+	urb->complete = usb_ehset_completion;
+	urb->status = -EINPROGRESS;
+	urb->actual_length = 0;
+	urb->transfer_flags = URB_DIR_IN;
+	usb_get_urb(urb);
+	atomic_inc(&urb->use_count);
+	atomic_inc(&urb->dev->urbnum);
+	urb->setup_dma = dma_map_single(
+			hcd->self.sysdev,
+			urb->setup_packet,
+			sizeof(struct usb_ctrlrequest),
+			DMA_TO_DEVICE);
+	urb->transfer_dma = dma_map_single(
+			hcd->self.sysdev,
+			urb->transfer_buffer,
+			urb->transfer_buffer_length,
+			DMA_FROM_DEVICE);
+	urb->context = done;
+	return urb;
+}
+
+int ehset_single_step_set_feature(struct usb_hcd *hcd, int port)
+{
+	int retval = -ENOMEM;
+	struct usb_ctrlrequest *dr;
+	struct urb *urb;
+	struct usb_device *udev;
+	struct usb_device_descriptor *buf;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	/* Obtain udev of the rhub's child port */
+	udev = usb_hub_find_child(hcd->self.root_hub, port);
+	if (!udev) {
+		dev_err(hcd->self.controller, "No device attached to the RootHub\n");
+		return -ENODEV;
+	}
+	buf = kmalloc(USB_DT_DEVICE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	dr = kmalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL);
+	if (!dr) {
+		kfree(buf);
+		return -ENOMEM;
+	}
+
+	/* Fill Setup packet for GetDescriptor */
+	dr->bRequestType = USB_DIR_IN;
+	dr->bRequest = USB_REQ_GET_DESCRIPTOR;
+	dr->wValue = cpu_to_le16(USB_DT_DEVICE << 8);
+	dr->wIndex = 0;
+	dr->wLength = cpu_to_le16(USB_DT_DEVICE_SIZE);
+	urb = request_single_step_set_feature_urb(udev, dr, buf, &done);
+	if (!urb)
+		goto cleanup;
+
+	/* Submit just the SETUP stage */
+	retval = hcd->driver->submit_single_step_set_feature(hcd, urb, 1);
+	if (retval)
+		goto out1;
+	if (!wait_for_completion_timeout(&done, msecs_to_jiffies(2000))) {
+		usb_kill_urb(urb);
+		retval = -ETIMEDOUT;
+		dev_err(hcd->self.controller,
+			"%s SETUP stage timed out on ep0\n", __func__);
+		goto out1;
+	}
+	msleep(15 * 1000);
+
+	/* Complete remaining DATA and STATUS stages using the same URB */
+	urb->status = -EINPROGRESS;
+	usb_get_urb(urb);
+	atomic_inc(&urb->use_count);
+	atomic_inc(&urb->dev->urbnum);
+	retval = hcd->driver->submit_single_step_set_feature(hcd, urb, 0);
+	if (!retval && !wait_for_completion_timeout(&done,
+						msecs_to_jiffies(2000))) {
+		usb_kill_urb(urb);
+		retval = -ETIMEDOUT;
+		dev_err(hcd->self.controller,
+			"%s IN stage timed out on ep0\n", __func__);
+	}
+out1:
+	usb_free_urb(urb);
+cleanup:
+	kfree(dr);
+	kfree(buf);
+	return retval;
+}
+EXPORT_SYMBOL_GPL(ehset_single_step_set_feature);
+#endif /* CONFIG_USB_HCD_TEST_MODE */
+
 /*-------------------------------------------------------------------------*/
 
 #ifdef	CONFIG_PM
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 94b5e64ae9a2..35eec0c0edcd 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -1238,6 +1238,10 @@ static const struct hc_driver ehci_hc_driver = {
 	 * device support
 	 */
 	.free_dev =		ehci_remove_device,
+#ifdef CONFIG_USB_HCD_TEST_MODE
+	/* EH SINGLE_STEP_SET_FEATURE test support */
+	.submit_single_step_set_feature	= ehci_submit_single_step_set_feature,
+#endif
 };
 
 void ehci_init_driver(struct hc_driver *drv,
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index 159cc27b1a36..c4f6a2559a98 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -726,145 +726,6 @@ ehci_hub_descriptor (
 	desc->wHubCharacteristics = cpu_to_le16(temp);
 }
 
-/*-------------------------------------------------------------------------*/
-#ifdef CONFIG_USB_HCD_TEST_MODE
-
-#define EHSET_TEST_SINGLE_STEP_SET_FEATURE 0x06
-
-static void usb_ehset_completion(struct urb *urb)
-{
-	struct completion  *done = urb->context;
-
-	complete(done);
-}
-static int submit_single_step_set_feature(
-	struct usb_hcd	*hcd,
-	struct urb	*urb,
-	int		is_setup
-);
-
-/*
- * Allocate and initialize a control URB. This request will be used by the
- * EHSET SINGLE_STEP_SET_FEATURE test in which the DATA and STATUS stages
- * of the GetDescriptor request are sent 15 seconds after the SETUP stage.
- * Return NULL if failed.
- */
-static struct urb *request_single_step_set_feature_urb(
-	struct usb_device	*udev,
-	void			*dr,
-	void			*buf,
-	struct completion	*done
-) {
-	struct urb *urb;
-	struct usb_hcd *hcd = bus_to_hcd(udev->bus);
-	struct usb_host_endpoint *ep;
-
-	urb = usb_alloc_urb(0, GFP_KERNEL);
-	if (!urb)
-		return NULL;
-
-	urb->pipe = usb_rcvctrlpipe(udev, 0);
-	ep = (usb_pipein(urb->pipe) ? udev->ep_in : udev->ep_out)
-				[usb_pipeendpoint(urb->pipe)];
-	if (!ep) {
-		usb_free_urb(urb);
-		return NULL;
-	}
-
-	urb->ep = ep;
-	urb->dev = udev;
-	urb->setup_packet = (void *)dr;
-	urb->transfer_buffer = buf;
-	urb->transfer_buffer_length = USB_DT_DEVICE_SIZE;
-	urb->complete = usb_ehset_completion;
-	urb->status = -EINPROGRESS;
-	urb->actual_length = 0;
-	urb->transfer_flags = URB_DIR_IN;
-	usb_get_urb(urb);
-	atomic_inc(&urb->use_count);
-	atomic_inc(&urb->dev->urbnum);
-	urb->setup_dma = dma_map_single(
-			hcd->self.sysdev,
-			urb->setup_packet,
-			sizeof(struct usb_ctrlrequest),
-			DMA_TO_DEVICE);
-	urb->transfer_dma = dma_map_single(
-			hcd->self.sysdev,
-			urb->transfer_buffer,
-			urb->transfer_buffer_length,
-			DMA_FROM_DEVICE);
-	urb->context = done;
-	return urb;
-}
-
-static int ehset_single_step_set_feature(struct usb_hcd *hcd, int port)
-{
-	int retval = -ENOMEM;
-	struct usb_ctrlrequest *dr;
-	struct urb *urb;
-	struct usb_device *udev;
-	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
-	struct usb_device_descriptor *buf;
-	DECLARE_COMPLETION_ONSTACK(done);
-
-	/* Obtain udev of the rhub's child port */
-	udev = usb_hub_find_child(hcd->self.root_hub, port);
-	if (!udev) {
-		ehci_err(ehci, "No device attached to the RootHub\n");
-		return -ENODEV;
-	}
-	buf = kmalloc(USB_DT_DEVICE_SIZE, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	dr = kmalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL);
-	if (!dr) {
-		kfree(buf);
-		return -ENOMEM;
-	}
-
-	/* Fill Setup packet for GetDescriptor */
-	dr->bRequestType = USB_DIR_IN;
-	dr->bRequest = USB_REQ_GET_DESCRIPTOR;
-	dr->wValue = cpu_to_le16(USB_DT_DEVICE << 8);
-	dr->wIndex = 0;
-	dr->wLength = cpu_to_le16(USB_DT_DEVICE_SIZE);
-	urb = request_single_step_set_feature_urb(udev, dr, buf, &done);
-	if (!urb)
-		goto cleanup;
-
-	/* Submit just the SETUP stage */
-	retval = submit_single_step_set_feature(hcd, urb, 1);
-	if (retval)
-		goto out1;
-	if (!wait_for_completion_timeout(&done, msecs_to_jiffies(2000))) {
-		usb_kill_urb(urb);
-		retval = -ETIMEDOUT;
-		ehci_err(ehci, "%s SETUP stage timed out on ep0\n", __func__);
-		goto out1;
-	}
-	msleep(15 * 1000);
-
-	/* Complete remaining DATA and STATUS stages using the same URB */
-	urb->status = -EINPROGRESS;
-	usb_get_urb(urb);
-	atomic_inc(&urb->use_count);
-	atomic_inc(&urb->dev->urbnum);
-	retval = submit_single_step_set_feature(hcd, urb, 0);
-	if (!retval && !wait_for_completion_timeout(&done,
-						msecs_to_jiffies(2000))) {
-		usb_kill_urb(urb);
-		retval = -ETIMEDOUT;
-		ehci_err(ehci, "%s IN stage timed out on ep0\n", __func__);
-	}
-out1:
-	usb_free_urb(urb);
-cleanup:
-	kfree(dr);
-	kfree(buf);
-	return retval;
-}
-#endif /* CONFIG_USB_HCD_TEST_MODE */
 /*-------------------------------------------------------------------------*/
 
 int ehci_hub_control(
diff --git a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c
index a826715ae9bd..2cbf4f85bff3 100644
--- a/drivers/usb/host/ehci-q.c
+++ b/drivers/usb/host/ehci-q.c
@@ -1165,7 +1165,7 @@ submit_async (
  * performed; TRUE - SETUP and FALSE - IN+STATUS
  * Returns 0 if success
  */
-static int submit_single_step_set_feature(
+static int ehci_submit_single_step_set_feature(
 	struct usb_hcd  *hcd,
 	struct urb      *urb,
 	int             is_setup
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 96281cd50ff6..22c5d1c0acf3 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -409,7 +409,10 @@ struct hc_driver {
 	int	(*find_raw_port_number)(struct usb_hcd *, int);
 	/* Call for power on/off the port if necessary */
 	int	(*port_power)(struct usb_hcd *hcd, int portnum, bool enable);
-
+	/* Call for SINGLE_STEP_SET_FEATURE Test for USB2 EH certification */
+#define EHSET_TEST_SINGLE_STEP_SET_FEATURE 0x06
+	int	(*submit_single_step_set_feature)(struct usb_hcd *,
+			struct urb *, int);
 };
 
 static inline int hcd_giveback_urb_in_bh(struct usb_hcd *hcd)
@@ -474,6 +477,14 @@ int usb_hcd_setup_local_mem(struct usb_hcd *hcd, phys_addr_t phys_addr,
 
 struct platform_device;
 extern void usb_hcd_platform_shutdown(struct platform_device *dev);
+#ifdef CONFIG_USB_HCD_TEST_MODE
+extern int ehset_single_step_set_feature(struct usb_hcd *hcd, int port);
+#else
+static inline int ehset_single_step_set_feature(struct usb_hcd *hcd, int port)
+{
+	return 0;
+}
+#endif /* CONFIG_USB_HCD_TEST_MODE */
 
 #ifdef CONFIG_USB_PCI
 struct pci_dev;
-- 
cgit v1.2.3


From 0733d83905326baef3c25d8bd9a96fdc9eb71b86 Mon Sep 17 00:00:00 2001
From: Shawn Guo <shawn.guo@linaro.org>
Date: Sun, 25 Apr 2021 10:00:24 +0800
Subject: firmware: replace HOTPLUG with UEVENT in FW_ACTION defines

With commit 312c004d36ce ("[PATCH] driver core: replace "hotplug" by
"uevent"") already in the tree over a decade, update the name of
FW_ACTION defines to follow semantics, and reflect what the defines are
really meant for, i.e. whether or not generate user space event.

Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
Link: https://lore.kernel.org/r/20210425020024.28057-1-shawn.guo@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/dma/imx-sdma.c                      |  2 +-
 drivers/media/platform/exynos4-is/fimc-is.c |  2 +-
 drivers/mfd/iqs62x.c                        |  2 +-
 drivers/misc/lattice-ecp3-config.c          |  2 +-
 drivers/net/wireless/ti/wlcore/main.c       |  2 +-
 drivers/platform/x86/dell/dell_rbu.c        |  2 +-
 drivers/remoteproc/remoteproc_core.c        |  2 +-
 drivers/scsi/lpfc/lpfc_init.c               |  2 +-
 drivers/tty/serial/ucc_uart.c               |  2 +-
 include/linux/firmware.h                    |  4 ++--
 lib/test_firmware.c                         | 10 +++++-----
 sound/soc/codecs/wm8958-dsp2.c              |  6 +++---
 12 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index d5590c08db51..e2b559945c11 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -1829,7 +1829,7 @@ static int sdma_get_firmware(struct sdma_engine *sdma,
 	int ret;
 
 	ret = request_firmware_nowait(THIS_MODULE,
-			FW_ACTION_HOTPLUG, fw_name, sdma->dev,
+			FW_ACTION_UEVENT, fw_name, sdma->dev,
 			GFP_KERNEL, sdma, sdma_load_firmware);
 
 	return ret;
diff --git a/drivers/media/platform/exynos4-is/fimc-is.c b/drivers/media/platform/exynos4-is/fimc-is.c
index 972d9601d236..8f3ba74f19f2 100644
--- a/drivers/media/platform/exynos4-is/fimc-is.c
+++ b/drivers/media/platform/exynos4-is/fimc-is.c
@@ -436,7 +436,7 @@ done:
 static int fimc_is_request_firmware(struct fimc_is *is, const char *fw_name)
 {
 	return request_firmware_nowait(THIS_MODULE,
-				FW_ACTION_HOTPLUG, fw_name, &is->pdev->dev,
+				FW_ACTION_UEVENT, fw_name, &is->pdev->dev,
 				GFP_KERNEL, is, fimc_is_load_firmware);
 }
 
diff --git a/drivers/mfd/iqs62x.c b/drivers/mfd/iqs62x.c
index d1fc38a78acb..9805cf191245 100644
--- a/drivers/mfd/iqs62x.c
+++ b/drivers/mfd/iqs62x.c
@@ -998,7 +998,7 @@ static int iqs62x_probe(struct i2c_client *client)
 
 	device_property_read_string(&client->dev, "firmware-name", &fw_name);
 
-	ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+	ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT,
 				      fw_name ? : iqs62x->dev_desc->fw_name,
 				      &client->dev, GFP_KERNEL, iqs62x,
 				      iqs62x_firmware_load);
diff --git a/drivers/misc/lattice-ecp3-config.c b/drivers/misc/lattice-ecp3-config.c
index 5eaf74447ca1..0f54730c7ed5 100644
--- a/drivers/misc/lattice-ecp3-config.c
+++ b/drivers/misc/lattice-ecp3-config.c
@@ -198,7 +198,7 @@ static int lattice_ecp3_probe(struct spi_device *spi)
 	spi_set_drvdata(spi, data);
 
 	init_completion(&data->fw_loaded);
-	err = request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+	err = request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT,
 				      FIRMWARE_NAME, &spi->dev,
 				      GFP_KERNEL, spi, firmware_load);
 	if (err) {
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index 8509b989940c..5bf15355c2b3 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -6784,7 +6784,7 @@ int wlcore_probe(struct wl1271 *wl, struct platform_device *pdev)
 
 	if (pdev_data->family && pdev_data->family->nvs_name) {
 		nvs_name = pdev_data->family->nvs_name;
-		ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+		ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT,
 					      nvs_name, &pdev->dev, GFP_KERNEL,
 					      wl, wlcore_nvs_cb);
 		if (ret < 0) {
diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c
index 085ad0a0d22e..e9f4b30dcafa 100644
--- a/drivers/platform/x86/dell/dell_rbu.c
+++ b/drivers/platform/x86/dell/dell_rbu.c
@@ -573,7 +573,7 @@ static ssize_t image_type_write(struct file *filp, struct kobject *kobj,
 		if (!rbu_data.entry_created) {
 			spin_unlock(&rbu_data.lock);
 			req_firm_rc = request_firmware_nowait(THIS_MODULE,
-				FW_ACTION_NOHOTPLUG, "dell_rbu",
+				FW_ACTION_NOUEVENT, "dell_rbu",
 				&rbu_device->dev, GFP_KERNEL, &context,
 				callbackfn_rbu);
 			if (req_firm_rc) {
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 626a6b90fba2..6069f8db37d5 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1788,7 +1788,7 @@ static int rproc_trigger_auto_boot(struct rproc *rproc)
 	 * We're initiating an asynchronous firmware loading, so we can
 	 * be built-in kernel code, without hanging the boot process.
 	 */
-	ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+	ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT,
 				      rproc->firmware, &rproc->dev, GFP_KERNEL,
 				      rproc, rproc_auto_boot_callback);
 	if (ret < 0)
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 5f018d02bf56..cb4196b0f7e7 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -13051,7 +13051,7 @@ lpfc_sli4_request_firmware_update(struct lpfc_hba *phba, uint8_t fw_upgrade)
 	snprintf(file_name, ELX_MODEL_NAME_SIZE, "%s.grp", phba->ModelName);
 
 	if (fw_upgrade == INT_FW_UPGRADE) {
-		ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+		ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT,
 					file_name, &phba->pcidev->dev,
 					GFP_KERNEL, (void *)phba,
 					lpfc_write_firmware);
diff --git a/drivers/tty/serial/ucc_uart.c b/drivers/tty/serial/ucc_uart.c
index f81261cb52b8..6000853973c1 100644
--- a/drivers/tty/serial/ucc_uart.c
+++ b/drivers/tty/serial/ucc_uart.c
@@ -1227,7 +1227,7 @@ static int soft_uart_init(struct platform_device *ofdev)
 		 * kernel, then we use it.
 		 */
 		ret = request_firmware_nowait(THIS_MODULE,
-					      FW_ACTION_HOTPLUG, filename, &ofdev->dev,
+					      FW_ACTION_UEVENT, filename, &ofdev->dev,
 					      GFP_KERNEL, &ofdev->dev, uart_firmware_cont);
 		if (ret) {
 			dev_err(&ofdev->dev,
diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index 84e346ae766e..25109192cebe 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -6,8 +6,8 @@
 #include <linux/compiler.h>
 #include <linux/gfp.h>
 
-#define FW_ACTION_NOHOTPLUG 0
-#define FW_ACTION_HOTPLUG 1
+#define FW_ACTION_NOUEVENT 0
+#define FW_ACTION_UEVENT 1
 
 struct firmware {
 	size_t size;
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index b6fe89add9fe..1bccd6cd5f48 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -260,8 +260,8 @@ static ssize_t config_show(struct device *dev,
 	len += scnprintf(buf + len, PAGE_SIZE - len,
 			"send_uevent:\t\t%s\n",
 			test_fw_config->send_uevent ?
-			"FW_ACTION_HOTPLUG" :
-			"FW_ACTION_NOHOTPLUG");
+			"FW_ACTION_UEVENT" :
+			"FW_ACTION_NOUEVENT");
 	len += scnprintf(buf + len, PAGE_SIZE - len,
 			"into_buf:\t\t%s\n",
 			test_fw_config->into_buf ? "true" : "false");
@@ -729,7 +729,7 @@ static ssize_t trigger_custom_fallback_store(struct device *dev,
 	mutex_lock(&test_fw_mutex);
 	release_firmware(test_firmware);
 	test_firmware = NULL;
-	rc = request_firmware_nowait(THIS_MODULE, FW_ACTION_NOHOTPLUG, name,
+	rc = request_firmware_nowait(THIS_MODULE, FW_ACTION_NOUEVENT, name,
 				     dev, GFP_KERNEL, NULL,
 				     trigger_async_request_cb);
 	if (rc) {
@@ -938,8 +938,8 @@ ssize_t trigger_batched_requests_async_store(struct device *dev,
 	pr_info("batched loading '%s' custom fallback mechanism %u times\n",
 		test_fw_config->name, test_fw_config->num_requests);
 
-	send_uevent = test_fw_config->send_uevent ? FW_ACTION_HOTPLUG :
-		FW_ACTION_NOHOTPLUG;
+	send_uevent = test_fw_config->send_uevent ? FW_ACTION_UEVENT :
+		FW_ACTION_NOUEVENT;
 
 	for (i = 0; i < test_fw_config->num_requests; i++) {
 		req = &test_fw_config->reqs[i];
diff --git a/sound/soc/codecs/wm8958-dsp2.c b/sound/soc/codecs/wm8958-dsp2.c
index 536339e43dc7..e4018ba3b19a 100644
--- a/sound/soc/codecs/wm8958-dsp2.c
+++ b/sound/soc/codecs/wm8958-dsp2.c
@@ -912,13 +912,13 @@ void wm8958_dsp2_init(struct snd_soc_component *component)
 
 
 	/* We don't *require* firmware and don't want to delay boot */
-	request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+	request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT,
 				"wm8958_mbc.wfw", component->dev, GFP_KERNEL,
 				component, wm8958_mbc_loaded);
-	request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+	request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT,
 				"wm8958_mbc_vss.wfw", component->dev, GFP_KERNEL,
 				component, wm8958_mbc_vss_loaded);
-	request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+	request_firmware_nowait(THIS_MODULE, FW_ACTION_UEVENT,
 				"wm8958_enh_eq.wfw", component->dev, GFP_KERNEL,
 				component, wm8958_enh_eq_loaded);
 
-- 
cgit v1.2.3


From ed5aecd3da2eabd8a6c9f5593df2c4f00985fca2 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:18:54 +0200
Subject: tty: remove broken r3964 line discipline

Noone stepped up in the past two years since it was marked as BROKEN by
commit c7084edc3f6d (tty: mark Siemens R3964 line discipline as BROKEN).
Remove the line discipline for good.

Three remarks:
* we remove also the uapi header (as noone is able to use that interface
  anyway)
* we do *not* remove the N_R3964 constant definition from tty.h, so it
  remains reserved.
* in_interrupt() check is now removed from vt's con_put_char. Noone else
  calls tty_operations::put_char from interrupt context.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210505091928.22010-2-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/powerpc/configs/ppc6xx_defconfig |    1 -
 drivers/char/Kconfig                  |   13 -
 drivers/tty/Makefile                  |    1 -
 drivers/tty/n_r3964.c                 | 1283 ---------------------------------
 drivers/tty/vt/vt.c                   |    2 -
 include/linux/n_r3964.h               |  175 -----
 include/uapi/linux/n_r3964.h          |   99 ---
 7 files changed, 1574 deletions(-)
 delete mode 100644 drivers/tty/n_r3964.c
 delete mode 100644 include/linux/n_r3964.h
 delete mode 100644 include/uapi/linux/n_r3964.h

(limited to 'include/linux')

diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index 1fd9d1260f9e..ee09f7bb6ea9 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -621,7 +621,6 @@ CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_NVRAM=y
 CONFIG_DTLK=m
-CONFIG_R3964=m
 CONFIG_CARDMAN_4000=m
 CONFIG_CARDMAN_4040=m
 CONFIG_IPWIRELESS=m
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index b151e0fcdeb5..52d0dd49a683 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -218,19 +218,6 @@ config XILINX_HWICAP
 
 	  If unsure, say N.
 
-config R3964
-	tristate "Siemens R3964 line discipline"
-	depends on TTY && BROKEN
-	help
-	  This driver allows synchronous communication with devices using the
-	  Siemens R3964 packet protocol. Unless you are dealing with special
-	  hardware like PLCs, you are unlikely to need this.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called n_r3964.
-
-	  If unsure, say N.
-
 config APPLICOM
 	tristate "Applicom intelligent fieldbus card support"
 	depends on PCI
diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index c7054f5117c3..a2bd75fbaaa4 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile
@@ -9,7 +9,6 @@ obj-$(CONFIG_AUDIT)		+= tty_audit.o
 obj-$(CONFIG_MAGIC_SYSRQ)	+= sysrq.o
 obj-$(CONFIG_N_HDLC)		+= n_hdlc.o
 obj-$(CONFIG_N_GSM)		+= n_gsm.o
-obj-$(CONFIG_R3964)		+= n_r3964.o
 
 obj-y				+= vt/
 obj-$(CONFIG_HVC_DRIVER)	+= hvc/
diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c
deleted file mode 100644
index 2eb76ea1d88d..000000000000
--- a/drivers/tty/n_r3964.c
+++ /dev/null
@@ -1,1283 +0,0 @@
-// SPDX-License-Identifier: GPL-1.0+
-/* r3964 linediscipline for linux
- *
- * -----------------------------------------------------------
- * Copyright by 
- * Philips Automation Projects
- * Kassel (Germany)
- * -----------------------------------------------------------
- * Author:
- * L. Haag
- *
- * $Log: n_r3964.c,v $
- * Revision 1.10  2001/03/18 13:02:24  dwmw2
- * Fix timer usage, use spinlocks properly.
- *
- * Revision 1.9  2001/03/18 12:52:14  dwmw2
- * Merge changes in 2.4.2
- *
- * Revision 1.8  2000/03/23 14:14:54  dwmw2
- * Fix race in sleeping in r3964_read()
- *
- * Revision 1.7  1999/28/08 11:41:50  dwmw2
- * Port to 2.3 kernel
- *
- * Revision 1.6  1998/09/30 00:40:40  dwmw2
- * Fixed compilation on 2.0.x kernels
- * Updated to newly registered tty-ldisc number 9
- *
- * Revision 1.5  1998/09/04 21:57:36  dwmw2
- * Signal handling bug fixes, port to 2.1.x.
- *
- * Revision 1.4  1998/04/02 20:26:59  lhaag
- * select, blocking, ...
- *
- * Revision 1.3  1998/02/12 18:58:43  root
- * fixed some memory leaks
- * calculation of checksum characters
- *
- * Revision 1.2  1998/02/07 13:03:34  root
- * ioctl read_telegram
- *
- * Revision 1.1  1998/02/06 19:21:03  root
- * Initial revision
- *
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/interrupt.h>
-#include <linux/ptrace.h>
-#include <linux/ioport.h>
-#include <linux/in.h>
-#include <linux/slab.h>
-#include <linux/tty.h>
-#include <linux/errno.h>
-#include <linux/string.h>	/* used in new tty drivers */
-#include <linux/signal.h>	/* used in new tty drivers */
-#include <linux/ioctl.h>
-#include <linux/n_r3964.h>
-#include <linux/poll.h>
-#include <linux/init.h>
-#include <linux/uaccess.h>
-
-/*#define DEBUG_QUEUE*/
-
-/* Log successful handshake and protocol operations  */
-/*#define DEBUG_PROTO_S*/
-
-/* Log handshake and protocol errors: */
-/*#define DEBUG_PROTO_E*/
-
-/* Log Linediscipline operations (open, close, read, write...): */
-/*#define DEBUG_LDISC*/
-
-/* Log module and memory operations (init, cleanup; kmalloc, kfree): */
-/*#define DEBUG_MODUL*/
-
-/* Macro helpers for debug output: */
-#define TRACE(format, args...) printk("r3964: " format "\n" , ## args)
-
-#ifdef DEBUG_MODUL
-#define TRACE_M(format, args...) printk("r3964: " format "\n" , ## args)
-#else
-#define TRACE_M(fmt, arg...) do {} while (0)
-#endif
-#ifdef DEBUG_PROTO_S
-#define TRACE_PS(format, args...) printk("r3964: " format "\n" , ## args)
-#else
-#define TRACE_PS(fmt, arg...) do {} while (0)
-#endif
-#ifdef DEBUG_PROTO_E
-#define TRACE_PE(format, args...) printk("r3964: " format "\n" , ## args)
-#else
-#define TRACE_PE(fmt, arg...) do {} while (0)
-#endif
-#ifdef DEBUG_LDISC
-#define TRACE_L(format, args...) printk("r3964: " format "\n" , ## args)
-#else
-#define TRACE_L(fmt, arg...) do {} while (0)
-#endif
-#ifdef DEBUG_QUEUE
-#define TRACE_Q(format, args...) printk("r3964: " format "\n" , ## args)
-#else
-#define TRACE_Q(fmt, arg...) do {} while (0)
-#endif
-static void add_tx_queue(struct r3964_info *, struct r3964_block_header *);
-static void remove_from_tx_queue(struct r3964_info *pInfo, int error_code);
-static void put_char(struct r3964_info *pInfo, unsigned char ch);
-static void trigger_transmit(struct r3964_info *pInfo);
-static void retry_transmit(struct r3964_info *pInfo);
-static void transmit_block(struct r3964_info *pInfo);
-static void receive_char(struct r3964_info *pInfo, const unsigned char c);
-static void receive_error(struct r3964_info *pInfo, const char flag);
-static void on_timeout(struct timer_list *t);
-static int enable_signals(struct r3964_info *pInfo, struct pid *pid, int arg);
-static int read_telegram(struct r3964_info *pInfo, struct pid *pid,
-		unsigned char __user * buf);
-static void add_msg(struct r3964_client_info *pClient, int msg_id, int arg,
-		int error_code, struct r3964_block_header *pBlock);
-static struct r3964_message *remove_msg(struct r3964_info *pInfo,
-		struct r3964_client_info *pClient);
-static void remove_client_block(struct r3964_info *pInfo,
-		struct r3964_client_info *pClient);
-
-static int r3964_open(struct tty_struct *tty);
-static void r3964_close(struct tty_struct *tty);
-static ssize_t r3964_read(struct tty_struct *tty, struct file *file,
-		void *cookie, unsigned char *buf, size_t nr);
-static ssize_t r3964_write(struct tty_struct *tty, struct file *file,
-		const unsigned char *buf, size_t nr);
-static int r3964_ioctl(struct tty_struct *tty, struct file *file,
-		unsigned int cmd, unsigned long arg);
-#ifdef CONFIG_COMPAT
-static int r3964_compat_ioctl(struct tty_struct *tty, struct file *file,
-		unsigned int cmd, unsigned long arg);
-#endif
-static void r3964_set_termios(struct tty_struct *tty, struct ktermios *old);
-static __poll_t r3964_poll(struct tty_struct *tty, struct file *file,
-		struct poll_table_struct *wait);
-static void r3964_receive_buf(struct tty_struct *tty, const unsigned char *cp,
-		char *fp, int count);
-
-static struct tty_ldisc_ops tty_ldisc_N_R3964 = {
-	.owner = THIS_MODULE,
-	.name = "R3964",
-	.open = r3964_open,
-	.close = r3964_close,
-	.read = r3964_read,
-	.write = r3964_write,
-	.ioctl = r3964_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl = r3964_compat_ioctl,
-#endif
-	.set_termios = r3964_set_termios,
-	.poll = r3964_poll,
-	.receive_buf = r3964_receive_buf,
-};
-
-static void dump_block(const unsigned char *block, unsigned int length)
-{
-	unsigned int i, j;
-	char linebuf[16 * 3 + 1];
-
-	for (i = 0; i < length; i += 16) {
-		for (j = 0; (j < 16) && (j + i < length); j++) {
-			sprintf(linebuf + 3 * j, "%02x ", block[i + j]);
-		}
-		linebuf[3 * j] = '\0';
-		TRACE_PS("%s", linebuf);
-	}
-}
-
-/*************************************************************
- * Driver initialisation
- *************************************************************/
-
-/*************************************************************
- * Module support routines
- *************************************************************/
-
-static void __exit r3964_exit(void)
-{
-	int status;
-
-	TRACE_M("cleanup_module()");
-
-	status = tty_unregister_ldisc(N_R3964);
-
-	if (status != 0) {
-		printk(KERN_ERR "r3964: error unregistering linediscipline: "
-				"%d\n", status);
-	} else {
-		TRACE_L("linediscipline successfully unregistered");
-	}
-}
-
-static int __init r3964_init(void)
-{
-	int status;
-
-	printk("r3964: Philips r3964 Driver $Revision: 1.10 $\n");
-
-	/*
-	 * Register the tty line discipline
-	 */
-
-	status = tty_register_ldisc(N_R3964, &tty_ldisc_N_R3964);
-	if (status == 0) {
-		TRACE_L("line discipline %d registered", N_R3964);
-		TRACE_L("flags=%x num=%x", tty_ldisc_N_R3964.flags,
-			tty_ldisc_N_R3964.num);
-		TRACE_L("open=%p", tty_ldisc_N_R3964.open);
-		TRACE_L("tty_ldisc_N_R3964 = %p", &tty_ldisc_N_R3964);
-	} else {
-		printk(KERN_ERR "r3964: error registering line discipline: "
-				"%d\n", status);
-	}
-	return status;
-}
-
-module_init(r3964_init);
-module_exit(r3964_exit);
-
-/*************************************************************
- * Protocol implementation routines
- *************************************************************/
-
-static void add_tx_queue(struct r3964_info *pInfo,
-			 struct r3964_block_header *pHeader)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&pInfo->lock, flags);
-
-	pHeader->next = NULL;
-
-	if (pInfo->tx_last == NULL) {
-		pInfo->tx_first = pInfo->tx_last = pHeader;
-	} else {
-		pInfo->tx_last->next = pHeader;
-		pInfo->tx_last = pHeader;
-	}
-
-	spin_unlock_irqrestore(&pInfo->lock, flags);
-
-	TRACE_Q("add_tx_queue %p, length %d, tx_first = %p",
-		pHeader, pHeader->length, pInfo->tx_first);
-}
-
-static void remove_from_tx_queue(struct r3964_info *pInfo, int error_code)
-{
-	struct r3964_block_header *pHeader;
-	unsigned long flags;
-#ifdef DEBUG_QUEUE
-	struct r3964_block_header *pDump;
-#endif
-
-	pHeader = pInfo->tx_first;
-
-	if (pHeader == NULL)
-		return;
-
-#ifdef DEBUG_QUEUE
-	printk("r3964: remove_from_tx_queue: %p, length %u - ",
-		pHeader, pHeader->length);
-	for (pDump = pHeader; pDump; pDump = pDump->next)
-		printk("%p ", pDump);
-	printk("\n");
-#endif
-
-	if (pHeader->owner) {
-		if (error_code) {
-			add_msg(pHeader->owner, R3964_MSG_ACK, 0,
-				error_code, NULL);
-		} else {
-			add_msg(pHeader->owner, R3964_MSG_ACK, pHeader->length,
-				error_code, NULL);
-		}
-		wake_up_interruptible(&pInfo->tty->read_wait);
-	}
-
-	spin_lock_irqsave(&pInfo->lock, flags);
-
-	pInfo->tx_first = pHeader->next;
-	if (pInfo->tx_first == NULL) {
-		pInfo->tx_last = NULL;
-	}
-
-	spin_unlock_irqrestore(&pInfo->lock, flags);
-
-	kfree(pHeader);
-	TRACE_M("remove_from_tx_queue - kfree %p", pHeader);
-
-	TRACE_Q("remove_from_tx_queue: tx_first = %p, tx_last = %p",
-		pInfo->tx_first, pInfo->tx_last);
-}
-
-static void add_rx_queue(struct r3964_info *pInfo,
-			 struct r3964_block_header *pHeader)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&pInfo->lock, flags);
-
-	pHeader->next = NULL;
-
-	if (pInfo->rx_last == NULL) {
-		pInfo->rx_first = pInfo->rx_last = pHeader;
-	} else {
-		pInfo->rx_last->next = pHeader;
-		pInfo->rx_last = pHeader;
-	}
-	pInfo->blocks_in_rx_queue++;
-
-	spin_unlock_irqrestore(&pInfo->lock, flags);
-
-	TRACE_Q("add_rx_queue: %p, length = %d, rx_first = %p, count = %d",
-		pHeader, pHeader->length,
-		pInfo->rx_first, pInfo->blocks_in_rx_queue);
-}
-
-static void remove_from_rx_queue(struct r3964_info *pInfo,
-				 struct r3964_block_header *pHeader)
-{
-	unsigned long flags;
-	struct r3964_block_header *pFind;
-
-	if (pHeader == NULL)
-		return;
-
-	TRACE_Q("remove_from_rx_queue: rx_first = %p, rx_last = %p, count = %d",
-		pInfo->rx_first, pInfo->rx_last, pInfo->blocks_in_rx_queue);
-	TRACE_Q("remove_from_rx_queue: %p, length %u",
-		pHeader, pHeader->length);
-
-	spin_lock_irqsave(&pInfo->lock, flags);
-
-	if (pInfo->rx_first == pHeader) {
-		/* Remove the first block in the linked list: */
-		pInfo->rx_first = pHeader->next;
-
-		if (pInfo->rx_first == NULL) {
-			pInfo->rx_last = NULL;
-		}
-		pInfo->blocks_in_rx_queue--;
-	} else {
-		/* Find block to remove: */
-		for (pFind = pInfo->rx_first; pFind; pFind = pFind->next) {
-			if (pFind->next == pHeader) {
-				/* Got it. */
-				pFind->next = pHeader->next;
-				pInfo->blocks_in_rx_queue--;
-				if (pFind->next == NULL) {
-					/* Oh, removed the last one! */
-					pInfo->rx_last = pFind;
-				}
-				break;
-			}
-		}
-	}
-
-	spin_unlock_irqrestore(&pInfo->lock, flags);
-
-	kfree(pHeader);
-	TRACE_M("remove_from_rx_queue - kfree %p", pHeader);
-
-	TRACE_Q("remove_from_rx_queue: rx_first = %p, rx_last = %p, count = %d",
-		pInfo->rx_first, pInfo->rx_last, pInfo->blocks_in_rx_queue);
-}
-
-static void put_char(struct r3964_info *pInfo, unsigned char ch)
-{
-	struct tty_struct *tty = pInfo->tty;
-	/* FIXME: put_char should not be called from an IRQ */
-	tty_put_char(tty, ch);
-	pInfo->bcc ^= ch;
-}
-
-static void flush(struct r3964_info *pInfo)
-{
-	struct tty_struct *tty = pInfo->tty;
-
-	if (tty == NULL || tty->ops->flush_chars == NULL)
-		return;
-	tty->ops->flush_chars(tty);
-}
-
-static void trigger_transmit(struct r3964_info *pInfo)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&pInfo->lock, flags);
-
-	if ((pInfo->state == R3964_IDLE) && (pInfo->tx_first != NULL)) {
-		pInfo->state = R3964_TX_REQUEST;
-		pInfo->nRetry = 0;
-		pInfo->flags &= ~R3964_ERROR;
-		mod_timer(&pInfo->tmr, jiffies + R3964_TO_QVZ);
-
-		spin_unlock_irqrestore(&pInfo->lock, flags);
-
-		TRACE_PS("trigger_transmit - sent STX");
-
-		put_char(pInfo, STX);
-		flush(pInfo);
-
-		pInfo->bcc = 0;
-	} else {
-		spin_unlock_irqrestore(&pInfo->lock, flags);
-	}
-}
-
-static void retry_transmit(struct r3964_info *pInfo)
-{
-	if (pInfo->nRetry < R3964_MAX_RETRIES) {
-		TRACE_PE("transmission failed. Retry #%d", pInfo->nRetry);
-		pInfo->bcc = 0;
-		put_char(pInfo, STX);
-		flush(pInfo);
-		pInfo->state = R3964_TX_REQUEST;
-		pInfo->nRetry++;
-		mod_timer(&pInfo->tmr, jiffies + R3964_TO_QVZ);
-	} else {
-		TRACE_PE("transmission failed after %d retries",
-			 R3964_MAX_RETRIES);
-
-		remove_from_tx_queue(pInfo, R3964_TX_FAIL);
-
-		put_char(pInfo, NAK);
-		flush(pInfo);
-		pInfo->state = R3964_IDLE;
-
-		trigger_transmit(pInfo);
-	}
-}
-
-static void transmit_block(struct r3964_info *pInfo)
-{
-	struct tty_struct *tty = pInfo->tty;
-	struct r3964_block_header *pBlock = pInfo->tx_first;
-	int room = 0;
-
-	if (tty == NULL || pBlock == NULL) {
-		return;
-	}
-
-	room = tty_write_room(tty);
-
-	TRACE_PS("transmit_block %p, room %d, length %d",
-		 pBlock, room, pBlock->length);
-
-	while (pInfo->tx_position < pBlock->length) {
-		if (room < 2)
-			break;
-
-		if (pBlock->data[pInfo->tx_position] == DLE) {
-			/* send additional DLE char: */
-			put_char(pInfo, DLE);
-		}
-		put_char(pInfo, pBlock->data[pInfo->tx_position++]);
-
-		room--;
-	}
-
-	if ((pInfo->tx_position == pBlock->length) && (room >= 3)) {
-		put_char(pInfo, DLE);
-		put_char(pInfo, ETX);
-		if (pInfo->flags & R3964_BCC) {
-			put_char(pInfo, pInfo->bcc);
-		}
-		pInfo->state = R3964_WAIT_FOR_TX_ACK;
-		mod_timer(&pInfo->tmr, jiffies + R3964_TO_QVZ);
-	}
-	flush(pInfo);
-}
-
-static void on_receive_block(struct r3964_info *pInfo)
-{
-	unsigned int length;
-	struct r3964_client_info *pClient;
-	struct r3964_block_header *pBlock;
-
-	length = pInfo->rx_position;
-
-	/* compare byte checksum characters: */
-	if (pInfo->flags & R3964_BCC) {
-		if (pInfo->bcc != pInfo->last_rx) {
-			TRACE_PE("checksum error - got %x but expected %x",
-				 pInfo->last_rx, pInfo->bcc);
-			pInfo->flags |= R3964_CHECKSUM;
-		}
-	}
-
-	/* check for errors (parity, overrun,...): */
-	if (pInfo->flags & R3964_ERROR) {
-		TRACE_PE("on_receive_block - transmission failed error %x",
-			 pInfo->flags & R3964_ERROR);
-
-		put_char(pInfo, NAK);
-		flush(pInfo);
-		if (pInfo->nRetry < R3964_MAX_RETRIES) {
-			pInfo->state = R3964_WAIT_FOR_RX_REPEAT;
-			pInfo->nRetry++;
-			mod_timer(&pInfo->tmr, jiffies + R3964_TO_RX_PANIC);
-		} else {
-			TRACE_PE("on_receive_block - failed after max retries");
-			pInfo->state = R3964_IDLE;
-		}
-		return;
-	}
-
-	/* received block; submit DLE: */
-	put_char(pInfo, DLE);
-	flush(pInfo);
-	del_timer_sync(&pInfo->tmr);
-	TRACE_PS(" rx success: got %d chars", length);
-
-	/* prepare struct r3964_block_header: */
-	pBlock = kmalloc(length + sizeof(struct r3964_block_header),
-			GFP_KERNEL);
-	TRACE_M("on_receive_block - kmalloc %p", pBlock);
-
-	if (pBlock == NULL)
-		return;
-
-	pBlock->length = length;
-	pBlock->data = ((unsigned char *)pBlock) +
-			sizeof(struct r3964_block_header);
-	pBlock->locks = 0;
-	pBlock->next = NULL;
-	pBlock->owner = NULL;
-
-	memcpy(pBlock->data, pInfo->rx_buf, length);
-
-	/* queue block into rx_queue: */
-	add_rx_queue(pInfo, pBlock);
-
-	/* notify attached client processes: */
-	for (pClient = pInfo->firstClient; pClient; pClient = pClient->next) {
-		if (pClient->sig_flags & R3964_SIG_DATA) {
-			add_msg(pClient, R3964_MSG_DATA, length, R3964_OK,
-				pBlock);
-		}
-	}
-	wake_up_interruptible(&pInfo->tty->read_wait);
-
-	pInfo->state = R3964_IDLE;
-
-	trigger_transmit(pInfo);
-}
-
-static void receive_char(struct r3964_info *pInfo, const unsigned char c)
-{
-	switch (pInfo->state) {
-	case R3964_TX_REQUEST:
-		if (c == DLE) {
-			TRACE_PS("TX_REQUEST - got DLE");
-
-			pInfo->state = R3964_TRANSMITTING;
-			pInfo->tx_position = 0;
-
-			transmit_block(pInfo);
-		} else if (c == STX) {
-			if (pInfo->nRetry == 0) {
-				TRACE_PE("TX_REQUEST - init conflict");
-				if (pInfo->priority == R3964_SLAVE) {
-					goto start_receiving;
-				}
-			} else {
-				TRACE_PE("TX_REQUEST - secondary init "
-					"conflict!? Switching to SLAVE mode "
-					"for next rx.");
-				goto start_receiving;
-			}
-		} else {
-			TRACE_PE("TX_REQUEST - char != DLE: %x", c);
-			retry_transmit(pInfo);
-		}
-		break;
-	case R3964_TRANSMITTING:
-		if (c == NAK) {
-			TRACE_PE("TRANSMITTING - got NAK");
-			retry_transmit(pInfo);
-		} else {
-			TRACE_PE("TRANSMITTING - got invalid char");
-
-			pInfo->state = R3964_WAIT_ZVZ_BEFORE_TX_RETRY;
-			mod_timer(&pInfo->tmr, jiffies + R3964_TO_ZVZ);
-		}
-		break;
-	case R3964_WAIT_FOR_TX_ACK:
-		if (c == DLE) {
-			TRACE_PS("WAIT_FOR_TX_ACK - got DLE");
-			remove_from_tx_queue(pInfo, R3964_OK);
-
-			pInfo->state = R3964_IDLE;
-			trigger_transmit(pInfo);
-		} else {
-			retry_transmit(pInfo);
-		}
-		break;
-	case R3964_WAIT_FOR_RX_REPEAT:
-	case R3964_IDLE:
-		if (c == STX) {
-			/* Prevent rx_queue from overflow: */
-			if (pInfo->blocks_in_rx_queue >=
-			    R3964_MAX_BLOCKS_IN_RX_QUEUE) {
-				TRACE_PE("IDLE - got STX but no space in "
-						"rx_queue!");
-				pInfo->state = R3964_WAIT_FOR_RX_BUF;
-				mod_timer(&pInfo->tmr,
-					  jiffies + R3964_TO_NO_BUF);
-				break;
-			}
-start_receiving:
-			/* Ok, start receiving: */
-			TRACE_PS("IDLE - got STX");
-			pInfo->rx_position = 0;
-			pInfo->last_rx = 0;
-			pInfo->flags &= ~R3964_ERROR;
-			pInfo->state = R3964_RECEIVING;
-			mod_timer(&pInfo->tmr, jiffies + R3964_TO_ZVZ);
-			pInfo->nRetry = 0;
-			put_char(pInfo, DLE);
-			flush(pInfo);
-			pInfo->bcc = 0;
-		}
-		break;
-	case R3964_RECEIVING:
-		if (pInfo->rx_position < RX_BUF_SIZE) {
-			pInfo->bcc ^= c;
-
-			if (c == DLE) {
-				if (pInfo->last_rx == DLE) {
-					pInfo->last_rx = 0;
-					goto char_to_buf;
-				}
-				pInfo->last_rx = DLE;
-				break;
-			} else if ((c == ETX) && (pInfo->last_rx == DLE)) {
-				if (pInfo->flags & R3964_BCC) {
-					pInfo->state = R3964_WAIT_FOR_BCC;
-					mod_timer(&pInfo->tmr,
-						  jiffies + R3964_TO_ZVZ);
-				} else {
-					on_receive_block(pInfo);
-				}
-			} else {
-				pInfo->last_rx = c;
-char_to_buf:
-				pInfo->rx_buf[pInfo->rx_position++] = c;
-				mod_timer(&pInfo->tmr, jiffies + R3964_TO_ZVZ);
-			}
-		}
-		/* else: overflow-msg? BUF_SIZE>MTU; should not happen? */
-		break;
-	case R3964_WAIT_FOR_BCC:
-		pInfo->last_rx = c;
-		on_receive_block(pInfo);
-		break;
-	}
-}
-
-static void receive_error(struct r3964_info *pInfo, const char flag)
-{
-	switch (flag) {
-	case TTY_NORMAL:
-		break;
-	case TTY_BREAK:
-		TRACE_PE("received break");
-		pInfo->flags |= R3964_BREAK;
-		break;
-	case TTY_PARITY:
-		TRACE_PE("parity error");
-		pInfo->flags |= R3964_PARITY;
-		break;
-	case TTY_FRAME:
-		TRACE_PE("frame error");
-		pInfo->flags |= R3964_FRAME;
-		break;
-	case TTY_OVERRUN:
-		TRACE_PE("frame overrun");
-		pInfo->flags |= R3964_OVERRUN;
-		break;
-	default:
-		TRACE_PE("receive_error - unknown flag %d", flag);
-		pInfo->flags |= R3964_UNKNOWN;
-		break;
-	}
-}
-
-static void on_timeout(struct timer_list *t)
-{
-	struct r3964_info *pInfo = from_timer(pInfo, t, tmr);
-
-	switch (pInfo->state) {
-	case R3964_TX_REQUEST:
-		TRACE_PE("TX_REQUEST - timeout");
-		retry_transmit(pInfo);
-		break;
-	case R3964_WAIT_ZVZ_BEFORE_TX_RETRY:
-		put_char(pInfo, NAK);
-		flush(pInfo);
-		retry_transmit(pInfo);
-		break;
-	case R3964_WAIT_FOR_TX_ACK:
-		TRACE_PE("WAIT_FOR_TX_ACK - timeout");
-		retry_transmit(pInfo);
-		break;
-	case R3964_WAIT_FOR_RX_BUF:
-		TRACE_PE("WAIT_FOR_RX_BUF - timeout");
-		put_char(pInfo, NAK);
-		flush(pInfo);
-		pInfo->state = R3964_IDLE;
-		break;
-	case R3964_RECEIVING:
-		TRACE_PE("RECEIVING - timeout after %d chars",
-			 pInfo->rx_position);
-		put_char(pInfo, NAK);
-		flush(pInfo);
-		pInfo->state = R3964_IDLE;
-		break;
-	case R3964_WAIT_FOR_RX_REPEAT:
-		TRACE_PE("WAIT_FOR_RX_REPEAT - timeout");
-		pInfo->state = R3964_IDLE;
-		break;
-	case R3964_WAIT_FOR_BCC:
-		TRACE_PE("WAIT_FOR_BCC - timeout");
-		put_char(pInfo, NAK);
-		flush(pInfo);
-		pInfo->state = R3964_IDLE;
-		break;
-	}
-}
-
-static struct r3964_client_info *findClient(struct r3964_info *pInfo,
-		struct pid *pid)
-{
-	struct r3964_client_info *pClient;
-
-	for (pClient = pInfo->firstClient; pClient; pClient = pClient->next) {
-		if (pClient->pid == pid) {
-			return pClient;
-		}
-	}
-	return NULL;
-}
-
-static int enable_signals(struct r3964_info *pInfo, struct pid *pid, int arg)
-{
-	struct r3964_client_info *pClient;
-	struct r3964_client_info **ppClient;
-	struct r3964_message *pMsg;
-
-	if ((arg & R3964_SIG_ALL) == 0) {
-		/* Remove client from client list */
-		for (ppClient = &pInfo->firstClient; *ppClient;
-		     ppClient = &(*ppClient)->next) {
-			pClient = *ppClient;
-
-			if (pClient->pid == pid) {
-				TRACE_PS("removing client %d from client list",
-					 pid_nr(pid));
-				*ppClient = pClient->next;
-				while (pClient->msg_count) {
-					pMsg = remove_msg(pInfo, pClient);
-					if (pMsg) {
-						kfree(pMsg);
-						TRACE_M("enable_signals - msg "
-							"kfree %p", pMsg);
-					}
-				}
-				put_pid(pClient->pid);
-				kfree(pClient);
-				TRACE_M("enable_signals - kfree %p", pClient);
-				return 0;
-			}
-		}
-		return -EINVAL;
-	} else {
-		pClient = findClient(pInfo, pid);
-		if (pClient) {
-			/* update signal options */
-			pClient->sig_flags = arg;
-		} else {
-			/* add client to client list */
-			pClient = kmalloc(sizeof(struct r3964_client_info),
-					GFP_KERNEL);
-			TRACE_M("enable_signals - kmalloc %p", pClient);
-			if (pClient == NULL)
-				return -ENOMEM;
-
-			TRACE_PS("add client %d to client list", pid_nr(pid));
-			spin_lock_init(&pClient->lock);
-			pClient->sig_flags = arg;
-			pClient->pid = get_pid(pid);
-			pClient->next = pInfo->firstClient;
-			pClient->first_msg = NULL;
-			pClient->last_msg = NULL;
-			pClient->next_block_to_read = NULL;
-			pClient->msg_count = 0;
-			pInfo->firstClient = pClient;
-		}
-	}
-
-	return 0;
-}
-
-static int read_telegram(struct r3964_info *pInfo, struct pid *pid,
-			 unsigned char __user * buf)
-{
-	struct r3964_client_info *pClient;
-	struct r3964_block_header *block;
-
-	if (!buf) {
-		return -EINVAL;
-	}
-
-	pClient = findClient(pInfo, pid);
-	if (pClient == NULL) {
-		return -EINVAL;
-	}
-
-	block = pClient->next_block_to_read;
-	if (!block) {
-		return 0;
-	} else {
-		if (copy_to_user(buf, block->data, block->length))
-			return -EFAULT;
-
-		remove_client_block(pInfo, pClient);
-		return block->length;
-	}
-
-	return -EINVAL;
-}
-
-static void add_msg(struct r3964_client_info *pClient, int msg_id, int arg,
-		int error_code, struct r3964_block_header *pBlock)
-{
-	struct r3964_message *pMsg;
-	unsigned long flags;
-
-	if (pClient->msg_count < R3964_MAX_MSG_COUNT - 1) {
-queue_the_message:
-
-		pMsg = kmalloc(sizeof(struct r3964_message),
-				error_code ? GFP_ATOMIC : GFP_KERNEL);
-		TRACE_M("add_msg - kmalloc %p", pMsg);
-		if (pMsg == NULL) {
-			return;
-		}
-
-		spin_lock_irqsave(&pClient->lock, flags);
-
-		pMsg->msg_id = msg_id;
-		pMsg->arg = arg;
-		pMsg->error_code = error_code;
-		pMsg->block = pBlock;
-		pMsg->next = NULL;
-
-		if (pClient->last_msg == NULL) {
-			pClient->first_msg = pClient->last_msg = pMsg;
-		} else {
-			pClient->last_msg->next = pMsg;
-			pClient->last_msg = pMsg;
-		}
-
-		pClient->msg_count++;
-
-		if (pBlock != NULL) {
-			pBlock->locks++;
-		}
-		spin_unlock_irqrestore(&pClient->lock, flags);
-	} else {
-		if ((pClient->last_msg->msg_id == R3964_MSG_ACK)
-		    && (pClient->last_msg->error_code == R3964_OVERFLOW)) {
-			pClient->last_msg->arg++;
-			TRACE_PE("add_msg - inc prev OVERFLOW-msg");
-		} else {
-			msg_id = R3964_MSG_ACK;
-			arg = 0;
-			error_code = R3964_OVERFLOW;
-			pBlock = NULL;
-			TRACE_PE("add_msg - queue OVERFLOW-msg");
-			goto queue_the_message;
-		}
-	}
-	/* Send SIGIO signal to client process: */
-	if (pClient->sig_flags & R3964_USE_SIGIO) {
-		kill_pid(pClient->pid, SIGIO, 1);
-	}
-}
-
-static struct r3964_message *remove_msg(struct r3964_info *pInfo,
-					struct r3964_client_info *pClient)
-{
-	struct r3964_message *pMsg = NULL;
-	unsigned long flags;
-
-	if (pClient->first_msg) {
-		spin_lock_irqsave(&pClient->lock, flags);
-
-		pMsg = pClient->first_msg;
-		pClient->first_msg = pMsg->next;
-		if (pClient->first_msg == NULL) {
-			pClient->last_msg = NULL;
-		}
-
-		pClient->msg_count--;
-		if (pMsg->block) {
-			remove_client_block(pInfo, pClient);
-			pClient->next_block_to_read = pMsg->block;
-		}
-		spin_unlock_irqrestore(&pClient->lock, flags);
-	}
-	return pMsg;
-}
-
-static void remove_client_block(struct r3964_info *pInfo,
-				struct r3964_client_info *pClient)
-{
-	struct r3964_block_header *block;
-
-	TRACE_PS("remove_client_block PID %d", pid_nr(pClient->pid));
-
-	block = pClient->next_block_to_read;
-	if (block) {
-		block->locks--;
-		if (block->locks == 0) {
-			remove_from_rx_queue(pInfo, block);
-		}
-	}
-	pClient->next_block_to_read = NULL;
-}
-
-/*************************************************************
- * Line discipline routines
- *************************************************************/
-
-static int r3964_open(struct tty_struct *tty)
-{
-	struct r3964_info *pInfo;
-
-	TRACE_L("open");
-	TRACE_L("tty=%p, PID=%d, disc_data=%p",
-		tty, current->pid, tty->disc_data);
-
-	pInfo = kmalloc(sizeof(struct r3964_info), GFP_KERNEL);
-	TRACE_M("r3964_open - info kmalloc %p", pInfo);
-
-	if (!pInfo) {
-		printk(KERN_ERR "r3964: failed to alloc info structure\n");
-		return -ENOMEM;
-	}
-
-	pInfo->rx_buf = kmalloc(RX_BUF_SIZE, GFP_KERNEL);
-	TRACE_M("r3964_open - rx_buf kmalloc %p", pInfo->rx_buf);
-
-	if (!pInfo->rx_buf) {
-		printk(KERN_ERR "r3964: failed to alloc receive buffer\n");
-		kfree(pInfo);
-		TRACE_M("r3964_open - info kfree %p", pInfo);
-		return -ENOMEM;
-	}
-
-	pInfo->tx_buf = kmalloc(TX_BUF_SIZE, GFP_KERNEL);
-	TRACE_M("r3964_open - tx_buf kmalloc %p", pInfo->tx_buf);
-
-	if (!pInfo->tx_buf) {
-		printk(KERN_ERR "r3964: failed to alloc transmit buffer\n");
-		kfree(pInfo->rx_buf);
-		TRACE_M("r3964_open - rx_buf kfree %p", pInfo->rx_buf);
-		kfree(pInfo);
-		TRACE_M("r3964_open - info kfree %p", pInfo);
-		return -ENOMEM;
-	}
-
-	spin_lock_init(&pInfo->lock);
-	mutex_init(&pInfo->read_lock);
-	pInfo->tty = tty;
-	pInfo->priority = R3964_MASTER;
-	pInfo->rx_first = pInfo->rx_last = NULL;
-	pInfo->tx_first = pInfo->tx_last = NULL;
-	pInfo->rx_position = 0;
-	pInfo->tx_position = 0;
-	pInfo->last_rx = 0;
-	pInfo->blocks_in_rx_queue = 0;
-	pInfo->firstClient = NULL;
-	pInfo->state = R3964_IDLE;
-	pInfo->flags = R3964_DEBUG;
-	pInfo->nRetry = 0;
-
-	tty->disc_data = pInfo;
-	tty->receive_room = 65536;
-
-	timer_setup(&pInfo->tmr, on_timeout, 0);
-
-	return 0;
-}
-
-static void r3964_close(struct tty_struct *tty)
-{
-	struct r3964_info *pInfo = tty->disc_data;
-	struct r3964_client_info *pClient, *pNext;
-	struct r3964_message *pMsg;
-	struct r3964_block_header *pHeader, *pNextHeader;
-	unsigned long flags;
-
-	TRACE_L("close");
-
-	/*
-	 * Make sure that our task queue isn't activated.  If it
-	 * is, take it out of the linked list.
-	 */
-	del_timer_sync(&pInfo->tmr);
-
-	/* Remove client-structs and message queues: */
-	pClient = pInfo->firstClient;
-	while (pClient) {
-		pNext = pClient->next;
-		while (pClient->msg_count) {
-			pMsg = remove_msg(pInfo, pClient);
-			if (pMsg) {
-				kfree(pMsg);
-				TRACE_M("r3964_close - msg kfree %p", pMsg);
-			}
-		}
-		put_pid(pClient->pid);
-		kfree(pClient);
-		TRACE_M("r3964_close - client kfree %p", pClient);
-		pClient = pNext;
-	}
-	/* Remove jobs from tx_queue: */
-	spin_lock_irqsave(&pInfo->lock, flags);
-	pHeader = pInfo->tx_first;
-	pInfo->tx_first = pInfo->tx_last = NULL;
-	spin_unlock_irqrestore(&pInfo->lock, flags);
-
-	while (pHeader) {
-		pNextHeader = pHeader->next;
-		kfree(pHeader);
-		pHeader = pNextHeader;
-	}
-
-	/* Free buffers: */
-	kfree(pInfo->rx_buf);
-	TRACE_M("r3964_close - rx_buf kfree %p", pInfo->rx_buf);
-	kfree(pInfo->tx_buf);
-	TRACE_M("r3964_close - tx_buf kfree %p", pInfo->tx_buf);
-	kfree(pInfo);
-	TRACE_M("r3964_close - info kfree %p", pInfo);
-}
-
-static ssize_t r3964_read(struct tty_struct *tty, struct file *file,
-			  unsigned char *kbuf, size_t nr,
-			  void **cookie, unsigned long offset)
-{
-	struct r3964_info *pInfo = tty->disc_data;
-	struct r3964_client_info *pClient;
-	struct r3964_message *pMsg;
-	struct r3964_client_message theMsg;
-	int ret;
-
-	TRACE_L("read()");
-
-	/*
-	 *	Internal serialization of reads.
-	 */
-	if (file->f_flags & O_NONBLOCK) {
-		if (!mutex_trylock(&pInfo->read_lock))
-			return -EAGAIN;
-	} else {
-		if (mutex_lock_interruptible(&pInfo->read_lock))
-			return -ERESTARTSYS;
-	}
-
-	pClient = findClient(pInfo, task_pid(current));
-	if (pClient) {
-		pMsg = remove_msg(pInfo, pClient);
-		if (pMsg == NULL) {
-			/* no messages available. */
-			if (tty_io_nonblock(tty, file)) {
-				ret = -EAGAIN;
-				goto unlock;
-			}
-			/* block until there is a message: */
-			wait_event_interruptible(tty->read_wait,
-					(pMsg = remove_msg(pInfo, pClient)));
-		}
-
-		/* If we still haven't got a message, we must have been signalled */
-
-		if (!pMsg) {
-			ret = -EINTR;
-			goto unlock;
-		}
-
-		/* deliver msg to client process: */
-		theMsg.msg_id = pMsg->msg_id;
-		theMsg.arg = pMsg->arg;
-		theMsg.error_code = pMsg->error_code;
-		ret = sizeof(struct r3964_client_message);
-
-		kfree(pMsg);
-		TRACE_M("r3964_read - msg kfree %p", pMsg);
-
-		memcpy(kbuf, &theMsg, ret);
-
-		TRACE_PS("read - return %d", ret);
-		goto unlock;
-	}
-	ret = -EPERM;
-unlock:
-	mutex_unlock(&pInfo->read_lock);
-	return ret;
-}
-
-static ssize_t r3964_write(struct tty_struct *tty, struct file *file,
-			   const unsigned char *data, size_t count)
-{
-	struct r3964_info *pInfo = tty->disc_data;
-	struct r3964_block_header *pHeader;
-	struct r3964_client_info *pClient;
-	unsigned char *new_data;
-
-	TRACE_L("write request, %d characters", count);
-/* 
- * Verify the pointers 
- */
-
-	if (!pInfo)
-		return -EIO;
-
-/*
- * Ensure that the caller does not wish to send too much.
- */
-	if (count > R3964_MTU) {
-		if (pInfo->flags & R3964_DEBUG) {
-			TRACE_L(KERN_WARNING "r3964_write: truncating user "
-				"packet from %u to mtu %d", count, R3964_MTU);
-		}
-		count = R3964_MTU;
-	}
-/*
- * Allocate a buffer for the data and copy it from the buffer with header prepended
- */
-	new_data = kmalloc(count + sizeof(struct r3964_block_header),
-			GFP_KERNEL);
-	TRACE_M("r3964_write - kmalloc %p", new_data);
-	if (new_data == NULL) {
-		if (pInfo->flags & R3964_DEBUG) {
-			printk(KERN_ERR "r3964_write: no memory\n");
-		}
-		return -ENOSPC;
-	}
-
-	pHeader = (struct r3964_block_header *)new_data;
-	pHeader->data = new_data + sizeof(struct r3964_block_header);
-	pHeader->length = count;
-	pHeader->locks = 0;
-	pHeader->owner = NULL;
-
-	pClient = findClient(pInfo, task_pid(current));
-	if (pClient) {
-		pHeader->owner = pClient;
-	}
-
-	memcpy(pHeader->data, data, count);	/* We already verified this */
-
-	if (pInfo->flags & R3964_DEBUG) {
-		dump_block(pHeader->data, count);
-	}
-
-/*
- * Add buffer to transmit-queue:
- */
-	add_tx_queue(pInfo, pHeader);
-	trigger_transmit(pInfo);
-
-	return 0;
-}
-
-static int r3964_ioctl(struct tty_struct *tty, struct file *file,
-		unsigned int cmd, unsigned long arg)
-{
-	struct r3964_info *pInfo = tty->disc_data;
-	if (pInfo == NULL)
-		return -EINVAL;
-	switch (cmd) {
-	case R3964_ENABLE_SIGNALS:
-		return enable_signals(pInfo, task_pid(current), arg);
-	case R3964_SETPRIORITY:
-		if (arg < R3964_MASTER || arg > R3964_SLAVE)
-			return -EINVAL;
-		pInfo->priority = arg & 0xff;
-		return 0;
-	case R3964_USE_BCC:
-		if (arg)
-			pInfo->flags |= R3964_BCC;
-		else
-			pInfo->flags &= ~R3964_BCC;
-		return 0;
-	case R3964_READ_TELEGRAM:
-		return read_telegram(pInfo, task_pid(current),
-				(unsigned char __user *)arg);
-	default:
-		return -ENOIOCTLCMD;
-	}
-}
-
-#ifdef CONFIG_COMPAT
-static int r3964_compat_ioctl(struct tty_struct *tty, struct file *file,
-		unsigned int cmd, unsigned long arg)
-{
-	switch (cmd) {
-	case R3964_ENABLE_SIGNALS:
-	case R3964_SETPRIORITY:
-	case R3964_USE_BCC:
-		return r3964_ioctl(tty, file, cmd, arg);
-	default:
-		return -ENOIOCTLCMD;
-	}
-}
-#endif
-
-static void r3964_set_termios(struct tty_struct *tty, struct ktermios *old)
-{
-	TRACE_L("set_termios");
-}
-
-/* Called without the kernel lock held - fine */
-static __poll_t r3964_poll(struct tty_struct *tty, struct file *file,
-			struct poll_table_struct *wait)
-{
-	struct r3964_info *pInfo = tty->disc_data;
-	struct r3964_client_info *pClient;
-	struct r3964_message *pMsg = NULL;
-	unsigned long flags;
-	__poll_t result = EPOLLOUT;
-
-	TRACE_L("POLL");
-
-	pClient = findClient(pInfo, task_pid(current));
-	if (pClient) {
-		poll_wait(file, &tty->read_wait, wait);
-		spin_lock_irqsave(&pInfo->lock, flags);
-		pMsg = pClient->first_msg;
-		spin_unlock_irqrestore(&pInfo->lock, flags);
-		if (pMsg)
-			result |= EPOLLIN | EPOLLRDNORM;
-	} else {
-		result = -EINVAL;
-	}
-	return result;
-}
-
-static void r3964_receive_buf(struct tty_struct *tty, const unsigned char *cp,
-			char *fp, int count)
-{
-	struct r3964_info *pInfo = tty->disc_data;
-	const unsigned char *p;
-	char *f, flags = TTY_NORMAL;
-	int i;
-
-	for (i = count, p = cp, f = fp; i; i--, p++) {
-		if (f)
-			flags = *f++;
-		if (flags == TTY_NORMAL) {
-			receive_char(pInfo, *p);
-		} else {
-			receive_error(pInfo, flags);
-		}
-
-	}
-}
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_LDISC(N_R3964);
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 01645e87b3d5..e5040bdadcd6 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -3260,8 +3260,6 @@ static int con_write(struct tty_struct *tty, const unsigned char *buf, int count
 
 static int con_put_char(struct tty_struct *tty, unsigned char ch)
 {
-	if (in_interrupt())
-		return 0;	/* n_r3964 calls put_char() from interrupt context */
 	return do_con_write(tty, &ch, 1);
 }
 
diff --git a/include/linux/n_r3964.h b/include/linux/n_r3964.h
deleted file mode 100644
index 90a803aa42e8..000000000000
--- a/include/linux/n_r3964.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/* r3964 linediscipline for linux
- *
- * -----------------------------------------------------------
- * Copyright by
- * Philips Automation Projects
- * Kassel (Germany)
- * -----------------------------------------------------------
- * This software may be used and distributed according to the terms of
- * the GNU General Public License, incorporated herein by reference.
- *
- * Author:
- * L. Haag
- *
- * $Log: r3964.h,v $
- * Revision 1.4  2005/12/21 19:54:24  Kurt Huwig <kurt huwig de>
- * Fixed HZ usage on 2.6 kernels
- * Removed unnecessary include
- *
- * Revision 1.3  2001/03/18 13:02:24  dwmw2
- * Fix timer usage, use spinlocks properly.
- *
- * Revision 1.2  2001/03/18 12:53:15  dwmw2
- * Merge changes in 2.4.2
- *
- * Revision 1.1.1.1  1998/10/13 16:43:14  dwmw2
- * This'll screw the version control
- *
- * Revision 1.6  1998/09/30 00:40:38  dwmw2
- * Updated to use kernel's N_R3964 if available
- *
- * Revision 1.4  1998/04/02 20:29:44  lhaag
- * select, blocking, ...
- *
- * Revision 1.3  1998/02/12 18:58:43  root
- * fixed some memory leaks
- * calculation of checksum characters
- *
- * Revision 1.2  1998/02/07 13:03:17  root
- * ioctl read_telegram
- *
- * Revision 1.1  1998/02/06 19:19:43  root
- * Initial revision
- *
- *
- */
-#ifndef __LINUX_N_R3964_H__
-#define __LINUX_N_R3964_H__
-
-
-#include <linux/param.h>
-#include <uapi/linux/n_r3964.h>
-
-/*
- * Common ascii handshake characters:
- */
-
-#define STX 0x02
-#define ETX 0x03
-#define DLE 0x10
-#define NAK 0x15
-
-/*
- * Timeouts (from milliseconds to jiffies)
- */
-
-#define R3964_TO_QVZ ((550)*HZ/1000)
-#define R3964_TO_ZVZ ((220)*HZ/1000)
-#define R3964_TO_NO_BUF ((400)*HZ/1000)
-#define R3964_NO_TX_ROOM ((100)*HZ/1000)
-#define R3964_TO_RX_PANIC ((4000)*HZ/1000)
-#define R3964_MAX_RETRIES 5
-
-
-enum { R3964_IDLE, 
-	   R3964_TX_REQUEST, R3964_TRANSMITTING, 
-	   R3964_WAIT_ZVZ_BEFORE_TX_RETRY, R3964_WAIT_FOR_TX_ACK,
-	   R3964_WAIT_FOR_RX_BUF,
-	   R3964_RECEIVING, R3964_WAIT_FOR_BCC, R3964_WAIT_FOR_RX_REPEAT
-	   };
-
-/*
- * All open file-handles are 'clients' and are stored in a linked list:
- */
-
-struct r3964_message;
-
-struct r3964_client_info {
-	spinlock_t     lock;
-	struct pid    *pid;
-	unsigned int   sig_flags;
-
-	struct r3964_client_info *next;
-
-	struct r3964_message *first_msg;
-	struct r3964_message *last_msg;
-	struct r3964_block_header *next_block_to_read;
-	int            msg_count;
-};
-
-
-
-struct r3964_block_header;
-
-/* internal version of client_message: */
-struct r3964_message {
-	  int     msg_id;
-	  int     arg;
-	  int     error_code;
-	  struct r3964_block_header *block;
-	  struct r3964_message *next;
-};
-
-/*
- * Header of received block in rx_buf/tx_buf:
- */
-
-struct r3964_block_header 
-{
-	unsigned int length;             /* length in chars without header */
-	unsigned char *data;             /* usually data is located 
-                                        immediately behind this struct */
-	unsigned int locks;              /* only used in rx_buffer */
-	  
-    struct r3964_block_header *next;
-	struct r3964_client_info *owner;  /* =NULL in rx_buffer */
-};
-
-/*
- * If rx_buf hasn't enough space to store R3964_MTU chars,
- * we will reject all incoming STX-requests by sending NAK.
- */
-
-#define RX_BUF_SIZE    4000
-#define TX_BUF_SIZE    4000
-#define R3964_MAX_BLOCKS_IN_RX_QUEUE 100
-
-#define R3964_PARITY 0x0001
-#define R3964_FRAME  0x0002
-#define R3964_OVERRUN 0x0004
-#define R3964_UNKNOWN 0x0008
-#define R3964_BREAK   0x0010
-#define R3964_CHECKSUM 0x0020
-#define R3964_ERROR  0x003f
-#define R3964_BCC   0x4000
-#define R3964_DEBUG 0x8000
-
-
-struct r3964_info {
-	spinlock_t     lock;
-	struct tty_struct *tty;
-	unsigned char priority;
-	unsigned char *rx_buf;            /* ring buffer */
-	unsigned char *tx_buf;
-
-	struct r3964_block_header *rx_first;
-	struct r3964_block_header *rx_last;
-	struct r3964_block_header *tx_first;
-	struct r3964_block_header *tx_last;
-	unsigned int tx_position;
-        unsigned int rx_position;
-	unsigned char last_rx;
-	unsigned char bcc;
-        unsigned int  blocks_in_rx_queue;
-
-	struct mutex read_lock;		/* serialize r3964_read */
-
-	struct r3964_client_info *firstClient;
-	unsigned int state;
-	unsigned int flags;
-
-	struct timer_list tmr;
-	int nRetry;
-};
-
-#endif
diff --git a/include/uapi/linux/n_r3964.h b/include/uapi/linux/n_r3964.h
deleted file mode 100644
index 6bbd18520f30..000000000000
--- a/include/uapi/linux/n_r3964.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */
-/* r3964 linediscipline for linux
- *
- * -----------------------------------------------------------
- * Copyright by
- * Philips Automation Projects
- * Kassel (Germany)
- * -----------------------------------------------------------
- * This software may be used and distributed according to the terms of
- * the GNU General Public License, incorporated herein by reference.
- *
- * Author:
- * L. Haag
- *
- * $Log: r3964.h,v $
- * Revision 1.4  2005/12/21 19:54:24  Kurt Huwig <kurt huwig de>
- * Fixed HZ usage on 2.6 kernels
- * Removed unnecessary include
- *
- * Revision 1.3  2001/03/18 13:02:24  dwmw2
- * Fix timer usage, use spinlocks properly.
- *
- * Revision 1.2  2001/03/18 12:53:15  dwmw2
- * Merge changes in 2.4.2
- *
- * Revision 1.1.1.1  1998/10/13 16:43:14  dwmw2
- * This'll screw the version control
- *
- * Revision 1.6  1998/09/30 00:40:38  dwmw2
- * Updated to use kernel's N_R3964 if available
- *
- * Revision 1.4  1998/04/02 20:29:44  lhaag
- * select, blocking, ...
- *
- * Revision 1.3  1998/02/12 18:58:43  root
- * fixed some memory leaks
- * calculation of checksum characters
- *
- * Revision 1.2  1998/02/07 13:03:17  root
- * ioctl read_telegram
- *
- * Revision 1.1  1998/02/06 19:19:43  root
- * Initial revision
- *
- *
- */
-
-#ifndef _UAPI__LINUX_N_R3964_H__
-#define _UAPI__LINUX_N_R3964_H__
-
-/* line disciplines for r3964 protocol */
-
-
-/*
- * Ioctl-commands
- */
-
-#define R3964_ENABLE_SIGNALS      0x5301
-#define R3964_SETPRIORITY         0x5302
-#define R3964_USE_BCC             0x5303
-#define R3964_READ_TELEGRAM       0x5304
-
-/* Options for R3964_SETPRIORITY */
-#define R3964_MASTER   0
-#define R3964_SLAVE    1
-
-/* Options for R3964_ENABLE_SIGNALS */
-#define R3964_SIG_ACK   0x0001
-#define R3964_SIG_DATA  0x0002
-#define R3964_SIG_ALL   0x000f
-#define R3964_SIG_NONE  0x0000
-#define R3964_USE_SIGIO 0x1000
-
-/*
- * r3964 operation states:
- */
-
-/* types for msg_id: */
-enum {R3964_MSG_ACK=1, R3964_MSG_DATA };
-
-#define R3964_MAX_MSG_COUNT 32
-
-/* error codes for client messages */
-#define R3964_OK 0        /* no error. */
-#define R3964_TX_FAIL -1  /* transmission error, block NOT sent */
-#define R3964_OVERFLOW -2 /* msg queue overflow */
-
-/* the client gets this struct when calling read(fd,...): */
-struct r3964_client_message {
-	  int     msg_id;
-	  int     arg;
-	  int     error_code;
-};
-
-#define R3964_MTU      256
-
-
-
-#endif /* _UAPI__LINUX_N_R3964_H__ */
-- 
cgit v1.2.3


From 0f3dcf3b5d76669123bf99fec812b8b0acd60375 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:04 +0200
Subject: tty: make fp of tty_ldisc_ops::receive_buf{,2} const

Char pointer (cp) passed to tty_ldisc_ops::receive_buf{,2} is const.
There is no reason for flag pointer (fp) not to be too. So switch it in
the definition and all uses.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: William Hubbs <w.d.hubbs@gmail.com>
Cc: Chris Brannon <chris@the-brannons.com>
Cc: Kirk Reiser <kirk@reisers.ca>
Cc: Samuel Thibault <samuel.thibault@ens-lyon.org>
Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Cc: Andreas Koensgen <ajk@comnets.uni-bremen.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Liam Girdwood <lgirdwood@gmail.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Takashi Iwai <tiwai@suse.com>
Cc: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Link: https://lore.kernel.org/r/20210505091928.22010-12-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/accessibility/speakup/spk_ttyio.c |  3 ++-
 drivers/bluetooth/hci_ldisc.c             |  2 +-
 drivers/input/serio/serport.c             |  3 ++-
 drivers/misc/ti-st/st_core.c              |  2 +-
 drivers/net/caif/caif_serial.c            |  2 +-
 drivers/net/can/slcan.c                   |  3 ++-
 drivers/net/hamradio/6pack.c              |  2 +-
 drivers/net/hamradio/mkiss.c              |  2 +-
 drivers/net/ppp/ppp_async.c               |  6 +++---
 drivers/net/ppp/ppp_synctty.c             |  6 +++---
 drivers/net/slip/slip.c                   |  2 +-
 drivers/tty/n_gsm.c                       |  2 +-
 drivers/tty/n_hdlc.c                      |  2 +-
 drivers/tty/n_null.c                      |  2 +-
 drivers/tty/n_tty.c                       | 16 ++++++++--------
 drivers/tty/tty_buffer.c                  |  4 ++--
 include/linux/tty.h                       |  2 +-
 include/linux/tty_ldisc.h                 |  4 ++--
 net/nfc/nci/uart.c                        |  2 +-
 sound/soc/codecs/cx20442.c                |  4 ++--
 sound/soc/ti/ams-delta.c                  |  4 ++--
 21 files changed, 39 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/accessibility/speakup/spk_ttyio.c b/drivers/accessibility/speakup/spk_ttyio.c
index 2e39fcf492d8..b7fd094700d9 100644
--- a/drivers/accessibility/speakup/spk_ttyio.c
+++ b/drivers/accessibility/speakup/spk_ttyio.c
@@ -72,7 +72,8 @@ static void spk_ttyio_ldisc_close(struct tty_struct *tty)
 }
 
 static int spk_ttyio_receive_buf2(struct tty_struct *tty,
-				  const unsigned char *cp, char *fp, int count)
+				  const unsigned char *cp,
+				  const char *fp, int count)
 {
 	struct spk_ldisc_data *ldisc_data = tty->disc_data;
 	struct spk_synth *synth = ldisc_data->synth;
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 71a4ca505e09..e785851a92c1 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -593,7 +593,7 @@ static void hci_uart_tty_wakeup(struct tty_struct *tty)
  * Return Value:    None
  */
 static void hci_uart_tty_receive(struct tty_struct *tty, const u8 *data,
-				 char *flags, int count)
+				 const char *flags, int count)
 {
 	struct hci_uart *hu = tty->disc_data;
 
diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index 33e9d9bfd036..ff3715315592 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -114,7 +114,8 @@ static void serport_ldisc_close(struct tty_struct *tty)
  * 'interrupt' routine.
  */
 
-static void serport_ldisc_receive(struct tty_struct *tty, const unsigned char *cp, char *fp, int count)
+static void serport_ldisc_receive(struct tty_struct *tty,
+		const unsigned char *cp, const char *fp, int count)
 {
 	struct serport *serport = (struct serport*) tty->disc_data;
 	unsigned long flags;
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index 071844b58073..239a75502cd6 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -798,7 +798,7 @@ static void st_tty_close(struct tty_struct *tty)
 }
 
 static void st_tty_receive(struct tty_struct *tty, const unsigned char *data,
-			   char *tty_flags, int count)
+			   const char *tty_flags, int count)
 {
 #ifdef VERBOSE
 	print_hex_dump(KERN_DEBUG, ">in>", DUMP_PREFIX_NONE,
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index da6fffb4d5a8..fb3ca4e11fe5 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -159,7 +159,7 @@ static inline void debugfs_tx(struct ser_device *ser, const u8 *data, int size)
 #endif
 
 static void ldisc_receive(struct tty_struct *tty, const u8 *data,
-			char *flags, int count)
+			const char *flags, int count)
 {
 	struct sk_buff *skb = NULL;
 	struct ser_device *ser;
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index 31ba6664503d..7dc3e79cb5c4 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -467,7 +467,8 @@ static void slc_setup(struct net_device *dev)
  */
 
 static void slcan_receive_buf(struct tty_struct *tty,
-			      const unsigned char *cp, char *fp, int count)
+			      const unsigned char *cp, const char *fp,
+			      int count)
 {
 	struct slcan *sl = (struct slcan *) tty->disc_data;
 
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 80f41945709f..c0a80f04dd8e 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -428,7 +428,7 @@ out:
  * and sent on to some IP layer for further processing.
  */
 static void sixpack_receive_buf(struct tty_struct *tty,
-	const unsigned char *cp, char *fp, int count)
+	const unsigned char *cp, const char *fp, int count)
 {
 	struct sixpack *sp;
 	int count1;
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index 65154224d5b8..fc05ded48178 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -871,7 +871,7 @@ static int mkiss_ioctl(struct tty_struct *tty, struct file *file,
  * and sent on to the AX.25 layer for further processing.
  */
 static void mkiss_receive_buf(struct tty_struct *tty, const unsigned char *cp,
-	char *fp, int count)
+	const char *fp, int count)
 {
 	struct mkiss *ax = mkiss_get(tty);
 
diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c
index 8b41aa3fb64e..9f08bd19551f 100644
--- a/drivers/net/ppp/ppp_async.c
+++ b/drivers/net/ppp/ppp_async.c
@@ -98,7 +98,7 @@ static int ppp_async_send(struct ppp_channel *chan, struct sk_buff *skb);
 static int ppp_async_push(struct asyncppp *ap);
 static void ppp_async_flush_output(struct asyncppp *ap);
 static void ppp_async_input(struct asyncppp *ap, const unsigned char *buf,
-			    char *flags, int count);
+			    const char *flags, int count);
 static int ppp_async_ioctl(struct ppp_channel *chan, unsigned int cmd,
 			   unsigned long arg);
 static void ppp_async_process(struct tasklet_struct *t);
@@ -340,7 +340,7 @@ ppp_asynctty_poll(struct tty_struct *tty, struct file *file, poll_table *wait)
 /* May sleep, don't call from interrupt level or with interrupts disabled */
 static void
 ppp_asynctty_receive(struct tty_struct *tty, const unsigned char *buf,
-		  char *cflags, int count)
+		  const char *cflags, int count)
 {
 	struct asyncppp *ap = ap_get(tty);
 	unsigned long flags;
@@ -829,7 +829,7 @@ process_input_packet(struct asyncppp *ap)
 
 static void
 ppp_async_input(struct asyncppp *ap, const unsigned char *buf,
-		char *flags, int count)
+		const char *flags, int count)
 {
 	struct sk_buff *skb;
 	int c, i, j, n, s, f;
diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c
index 576b6a93bf23..c82c4e6df4cb 100644
--- a/drivers/net/ppp/ppp_synctty.c
+++ b/drivers/net/ppp/ppp_synctty.c
@@ -94,7 +94,7 @@ static void ppp_sync_process(struct tasklet_struct *t);
 static int ppp_sync_push(struct syncppp *ap);
 static void ppp_sync_flush_output(struct syncppp *ap);
 static void ppp_sync_input(struct syncppp *ap, const unsigned char *buf,
-			   char *flags, int count);
+			   const char *flags, int count);
 
 static const struct ppp_channel_ops sync_ops = {
 	.start_xmit = ppp_sync_send,
@@ -333,7 +333,7 @@ ppp_sync_poll(struct tty_struct *tty, struct file *file, poll_table *wait)
 /* May sleep, don't call from interrupt level or with interrupts disabled */
 static void
 ppp_sync_receive(struct tty_struct *tty, const unsigned char *buf,
-		  char *cflags, int count)
+		  const char *cflags, int count)
 {
 	struct syncppp *ap = sp_get(tty);
 	unsigned long flags;
@@ -665,7 +665,7 @@ ppp_sync_flush_output(struct syncppp *ap)
  */
 static void
 ppp_sync_input(struct syncppp *ap, const unsigned char *buf,
-		char *flags, int count)
+		const char *flags, int count)
 {
 	struct sk_buff *skb;
 	unsigned char *p;
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 1ab124eba8eb..4dda49e61745 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -685,7 +685,7 @@ static void sl_setup(struct net_device *dev)
  */
 
 static void slip_receive_buf(struct tty_struct *tty, const unsigned char *cp,
-							char *fp, int count)
+		const char *fp, int count)
 {
 	struct slip *sl = tty->disc_data;
 
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 5fea02cfb0cc..477403ecc445 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2424,7 +2424,7 @@ static void gsmld_detach_gsm(struct tty_struct *tty, struct gsm_mux *gsm)
 }
 
 static void gsmld_receive_buf(struct tty_struct *tty, const unsigned char *cp,
-			      char *fp, int count)
+			      const char *fp, int count)
 {
 	struct gsm_mux *gsm = tty->disc_data;
 	char flags = TTY_NORMAL;
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index b0f33e8ac819..62b1d1a6e0f1 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -358,7 +358,7 @@ static void n_hdlc_tty_wakeup(struct tty_struct *tty)
  * interpreted as one HDLC frame.
  */
 static void n_hdlc_tty_receive(struct tty_struct *tty, const __u8 *data,
-			       char *flags, int count)
+			       const char *flags, int count)
 {
 	register struct n_hdlc *n_hdlc = tty->disc_data;
 	register struct n_hdlc_buf *buf;
diff --git a/drivers/tty/n_null.c b/drivers/tty/n_null.c
index b8f67b5f1ef8..2ff373d2f98d 100644
--- a/drivers/tty/n_null.c
+++ b/drivers/tty/n_null.c
@@ -33,7 +33,7 @@ static ssize_t n_null_write(struct tty_struct *tty, struct file *file,
 }
 
 static void n_null_receivebuf(struct tty_struct *tty,
-				 const unsigned char *cp, char *fp,
+				 const unsigned char *cp, const char *fp,
 				 int cnt)
 {
 }
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 0d93be26c678..c32318da5190 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1474,7 +1474,7 @@ n_tty_receive_char_lnext(struct tty_struct *tty, unsigned char c, char flag)
 
 static void
 n_tty_receive_buf_real_raw(struct tty_struct *tty, const unsigned char *cp,
-			   char *fp, int count)
+			   const char *fp, int count)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	size_t n, head;
@@ -1494,7 +1494,7 @@ n_tty_receive_buf_real_raw(struct tty_struct *tty, const unsigned char *cp,
 
 static void
 n_tty_receive_buf_raw(struct tty_struct *tty, const unsigned char *cp,
-		      char *fp, int count)
+		      const char *fp, int count)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	char flag = TTY_NORMAL;
@@ -1511,7 +1511,7 @@ n_tty_receive_buf_raw(struct tty_struct *tty, const unsigned char *cp,
 
 static void
 n_tty_receive_buf_closing(struct tty_struct *tty, const unsigned char *cp,
-			  char *fp, int count)
+			  const char *fp, int count)
 {
 	char flag = TTY_NORMAL;
 
@@ -1524,7 +1524,7 @@ n_tty_receive_buf_closing(struct tty_struct *tty, const unsigned char *cp,
 }
 
 static void n_tty_receive_buf_standard(struct tty_struct *tty,
-		const unsigned char *cp, char *fp, int count)
+		const unsigned char *cp, const char *fp, int count)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	char flag = TTY_NORMAL;
@@ -1562,7 +1562,7 @@ static void n_tty_receive_buf_standard(struct tty_struct *tty,
 }
 
 static void __receive_buf(struct tty_struct *tty, const unsigned char *cp,
-			  char *fp, int count)
+			  const char *fp, int count)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	bool preops = I_ISTRIP(tty) || (I_IUCLC(tty) && L_IEXTEN(tty));
@@ -1629,7 +1629,7 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp,
  */
 static int
 n_tty_receive_buf_common(struct tty_struct *tty, const unsigned char *cp,
-			 char *fp, int count, int flow)
+			 const char *fp, int count, int flow)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	int room, n, rcvd = 0, overflow;
@@ -1698,13 +1698,13 @@ n_tty_receive_buf_common(struct tty_struct *tty, const unsigned char *cp,
 }
 
 static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
-			      char *fp, int count)
+			      const char *fp, int count)
 {
 	n_tty_receive_buf_common(tty, cp, fp, count, 0);
 }
 
 static int n_tty_receive_buf2(struct tty_struct *tty, const unsigned char *cp,
-			      char *fp, int count)
+			      const char *fp, int count)
 {
 	return n_tty_receive_buf_common(tty, cp, fp, count, 1);
 }
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index 9733469a14b2..55b1f1711341 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -455,7 +455,7 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string);
  *	Returns the number of bytes processed
  */
 int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
-			  char *f, int count)
+			  const char *f, int count)
 {
 	if (ld->ops->receive_buf2)
 		count = ld->ops->receive_buf2(ld->tty, p, f, count);
@@ -472,7 +472,7 @@ static int
 receive_buf(struct tty_port *port, struct tty_buffer *head, int count)
 {
 	unsigned char *p = char_buf_ptr(head, head->read);
-	char	      *f = NULL;
+	const char *f = NULL;
 	int n;
 
 	if (~head->flags & TTYB_NORMAL)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index e5d6b1f28823..5aad2220266c 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -628,7 +628,7 @@ extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
 extern int tty_set_ldisc(struct tty_struct *tty, int disc);
 extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
-				 char *f, int count);
+				 const char *f, int count);
 
 /* n_tty.c */
 extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops);
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 31284b55bd4f..c20ca6a75b4c 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -201,11 +201,11 @@ struct tty_ldisc_ops {
 	 * The following routines are called from below.
 	 */
 	void	(*receive_buf)(struct tty_struct *, const unsigned char *cp,
-			       char *fp, int count);
+			       const char *fp, int count);
 	void	(*write_wakeup)(struct tty_struct *);
 	void	(*dcd_change)(struct tty_struct *, unsigned int);
 	int	(*receive_buf2)(struct tty_struct *, const unsigned char *cp,
-				char *fp, int count);
+				const char *fp, int count);
 
 	struct  module *owner;
 
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 1248faf4d6df..98102ef64004 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -308,7 +308,7 @@ static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data,
  * Return Value:    None
  */
 static void nci_uart_tty_receive(struct tty_struct *tty, const u8 *data,
-				 char *flags, int count)
+				 const char *flags, int count)
 {
 	struct nci_uart *nu = (void *)tty->disc_data;
 
diff --git a/sound/soc/codecs/cx20442.c b/sound/soc/codecs/cx20442.c
index 61dfa86d444d..ec8d6e74b467 100644
--- a/sound/soc/codecs/cx20442.c
+++ b/sound/soc/codecs/cx20442.c
@@ -259,8 +259,8 @@ static int v253_hangup(struct tty_struct *tty)
 }
 
 /* Line discipline .receive_buf() */
-static void v253_receive(struct tty_struct *tty,
-				const unsigned char *cp, char *fp, int count)
+static void v253_receive(struct tty_struct *tty, const unsigned char *cp,
+		const char *fp, int count)
 {
 	struct snd_soc_component *component = tty->disc_data;
 	struct cx20442_priv *cx20442;
diff --git a/sound/soc/ti/ams-delta.c b/sound/soc/ti/ams-delta.c
index aba0017e870b..55a6736a378e 100644
--- a/sound/soc/ti/ams-delta.c
+++ b/sound/soc/ti/ams-delta.c
@@ -337,8 +337,8 @@ static int cx81801_hangup(struct tty_struct *tty)
 }
 
 /* Line discipline .receive_buf() */
-static void cx81801_receive(struct tty_struct *tty,
-				const unsigned char *cp, char *fp, int count)
+static void cx81801_receive(struct tty_struct *tty, const unsigned char *cp,
+		const char *fp, int count)
 {
 	struct snd_soc_component *component = tty->disc_data;
 	const unsigned char *c;
-- 
cgit v1.2.3


From 6e94dbc7a4e49a028b81302d755bba1a518f973b Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:05 +0200
Subject: tty: cumulate and document tty_struct::flow* members

Group the flow flags under a single struct called flow. The new struct
contains 'stopped' and 'tco_stopped' bools which used to be bits in a
bitfield. The struct also contains the lock protecting them to
potentially share the same cache line.

Note that commit c545b66c6922b (tty: Serialize tcflow() with other tty
flow control changes) added a padding to the original bitfield. It was
for the bitfield to occupy a whole 64b word to avoid interferring stores
on Alpha (cannot we evaporate this arch with weird implications to C
code yet?). But it doesn't work as expected as the padding
(tty_struct::unused) is aligned to a 8B boundary too and occupies some
bytes from the next word.

So make it reliable by:
1) setting __aligned of the struct -- that aligns the start, and
2) making 'unsigned long unused[0]' as the last member of the struct --
   pads the end.

This is also the perfect time to start the documentation of tty_struct
where all this lives. So we start by documenting what these bools
actually serve for. And why we do all the alignment dances. Only the few
up-to-date information from the Theodore's comment made it into this new
Kerneldoc comment.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Sascha Hauer <s.hauer@pengutronix.de>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: "Maciej W. Rozycki" <macro@orcam.me.uk>
Link: https://lore.kernel.org/r/20210505091928.22010-13-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/networking/caif/caif.rst |  4 ++--
 drivers/char/pcmcia/synclink_cs.c      |  8 +++----
 drivers/mmc/core/sdio_uart.c           |  2 +-
 drivers/net/caif/caif_serial.c         |  4 ++--
 drivers/s390/char/tty3270.c            |  2 +-
 drivers/staging/fwserial/fwserial.c    |  2 +-
 drivers/tty/amiserial.c                |  8 +++----
 drivers/tty/moxa.c                     |  4 ++--
 drivers/tty/mxser.c                    | 12 +++++------
 drivers/tty/n_tty.c                    |  8 +++----
 drivers/tty/pty.c                      |  4 ++--
 drivers/tty/serial/arc_uart.c          |  2 +-
 drivers/tty/serial/dz.c                |  2 +-
 drivers/tty/synclink_gt.c              |  6 +++---
 drivers/tty/tty_io.c                   | 24 ++++++++++-----------
 drivers/tty/tty_ioctl.c                | 16 +++++++-------
 drivers/tty/tty_port.c                 |  2 +-
 drivers/tty/vt/keyboard.c              |  2 +-
 drivers/tty/vt/vt.c                    |  4 ++--
 include/linux/serial_core.h            |  2 +-
 include/linux/tty.h                    | 38 ++++++++++++++++++++--------------
 include/linux/tty_driver.h             |  4 ++--
 22 files changed, 83 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/caif/caif.rst b/Documentation/networking/caif/caif.rst
index 81a14373d780..d922d419c513 100644
--- a/Documentation/networking/caif/caif.rst
+++ b/Documentation/networking/caif/caif.rst
@@ -69,9 +69,9 @@ There are debugfs parameters provided for serial communication.
 
   - 0x01 - tty->warned is on.
   - 0x04 - tty->packed is on.
-  - 0x08 - tty->flow_stopped is on.
+  - 0x08 - tty->flow.tco_stopped is on.
   - 0x10 - tty->hw_stopped is on.
-  - 0x20 - tty->stopped is on.
+  - 0x20 - tty->flow.stopped is on.
 
 * last_tx_msg: Binary blob Prints the last transmitted frame.
 
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 3287a7627ed0..b4707bc3aee8 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -985,7 +985,7 @@ static void tx_done(MGSLPC_INFO *info, struct tty_struct *tty)
 	else
 #endif
 	{
-		if (tty && (tty->stopped || tty->hw_stopped)) {
+		if (tty && (tty->flow.stopped || tty->hw_stopped)) {
 			tx_stop(info);
 			return;
 		}
@@ -1005,7 +1005,7 @@ static void tx_ready(MGSLPC_INFO *info, struct tty_struct *tty)
 		if (!info->tx_active)
 			return;
 	} else {
-		if (tty && (tty->stopped || tty->hw_stopped)) {
+		if (tty && (tty->flow.stopped || tty->hw_stopped)) {
 			tx_stop(info);
 			return;
 		}
@@ -1525,7 +1525,7 @@ static void mgslpc_flush_chars(struct tty_struct *tty)
 	if (mgslpc_paranoia_check(info, tty->name, "mgslpc_flush_chars"))
 		return;
 
-	if (info->tx_count <= 0 || tty->stopped ||
+	if (info->tx_count <= 0 || tty->flow.stopped ||
 	    tty->hw_stopped || !info->tx_buf)
 		return;
 
@@ -1594,7 +1594,7 @@ static int mgslpc_write(struct tty_struct * tty,
 		ret += c;
 	}
 start:
-	if (info->tx_count && !tty->stopped && !tty->hw_stopped) {
+	if (info->tx_count && !tty->flow.stopped && !tty->hw_stopped) {
 		spin_lock_irqsave(&info->lock, flags);
 		if (!info->tx_active)
 			tx_start(info, tty);
diff --git a/drivers/mmc/core/sdio_uart.c b/drivers/mmc/core/sdio_uart.c
index be4067281caa..dbcac2b7f2fe 100644
--- a/drivers/mmc/core/sdio_uart.c
+++ b/drivers/mmc/core/sdio_uart.c
@@ -439,7 +439,7 @@ static void sdio_uart_transmit_chars(struct sdio_uart_port *port)
 	tty = tty_port_tty_get(&port->port);
 
 	if (tty == NULL || !kfifo_len(xmit) ||
-				tty->stopped || tty->hw_stopped) {
+				tty->flow.stopped || tty->hw_stopped) {
 		sdio_uart_stop_tx(port);
 		tty_kref_put(tty);
 		return;
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index fb3ca4e11fe5..3996cf7dc9ba 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -87,8 +87,8 @@ static void ldisc_tx_wakeup(struct tty_struct *tty);
 static inline void update_tty_status(struct ser_device *ser)
 {
 	ser->tty_status =
-		ser->tty->stopped << 5 |
-		ser->tty->flow_stopped << 3 |
+		ser->tty->flow.stopped << 5 |
+		ser->tty->flow.tco_stopped << 3 |
 		ser->tty->packet << 2;
 }
 static inline void debugfs_init(struct ser_device *ser, struct tty_struct *tty)
diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c
index 307a80f85c07..1b68564799fa 100644
--- a/drivers/s390/char/tty3270.c
+++ b/drivers/s390/char/tty3270.c
@@ -1640,7 +1640,7 @@ tty3270_do_write(struct tty3270 *tp, struct tty_struct *tty,
 	int i_msg, i;
 
 	spin_lock_bh(&tp->view.lock);
-	for (i_msg = 0; !tty->stopped && i_msg < count; i_msg++) {
+	for (i_msg = 0; !tty->flow.stopped && i_msg < count; i_msg++) {
 		if (tp->esc_state != 0) {
 			/* Continue escape sequence. */
 			tty3270_escape_sequence(tp, buf[i_msg]);
diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c
index 1ee6382cafc4..4245532d2fe0 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -722,7 +722,7 @@ static int fwtty_tx(struct fwtty_port *port, bool drain)
 
 	/* try to write as many dma transactions out as possible */
 	n = -EAGAIN;
-	while (!tty->stopped && !tty->hw_stopped &&
+	while (!tty->flow.stopped && !tty->hw_stopped &&
 	       !test_bit(STOP_TX, &port->flags)) {
 		txn = kmem_cache_alloc(fwtty_txn_cache, GFP_ATOMIC);
 		if (!txn) {
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index ca48ce5a0862..a4b8876091d2 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -148,7 +148,7 @@ static __inline__ void rtsdtr_ctrl(int bits)
  * ------------------------------------------------------------
  * rs_stop() and rs_start()
  *
- * This routines are called before setting or resetting tty->stopped.
+ * This routines are called before setting or resetting tty->flow.stopped.
  * They enable or disable transmitter interrupts, as necessary.
  * ------------------------------------------------------------
  */
@@ -309,7 +309,7 @@ static void transmit_chars(struct serial_state *info)
 		return;
 	}
 	if (info->xmit.head == info->xmit.tail
-	    || info->tport.tty->stopped
+	    || info->tport.tty->flow.stopped
 	    || info->tport.tty->hw_stopped) {
 		info->IER &= ~UART_IER_THRI;
 	        custom.intena = IF_TBE;
@@ -768,7 +768,7 @@ static void rs_flush_chars(struct tty_struct *tty)
 	unsigned long flags;
 
 	if (info->xmit.head == info->xmit.tail
-	    || tty->stopped
+	    || tty->flow.stopped
 	    || tty->hw_stopped
 	    || !info->xmit.buf)
 		return;
@@ -812,7 +812,7 @@ static int rs_write(struct tty_struct * tty, const unsigned char *buf, int count
 	local_irq_restore(flags);
 
 	if (info->xmit.head != info->xmit.tail
-	    && !tty->stopped
+	    && !tty->flow.stopped
 	    && !tty->hw_stopped
 	    && !(info->IER & UART_IER_THRI)) {
 		info->IER |= UART_IER_THRI;
diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c
index 4d4f15b5cd29..847ad3dac107 100644
--- a/drivers/tty/moxa.c
+++ b/drivers/tty/moxa.c
@@ -1221,7 +1221,7 @@ static int moxa_write_room(struct tty_struct *tty)
 {
 	struct moxa_port *ch;
 
-	if (tty->stopped)
+	if (tty->flow.stopped)
 		return 0;
 	ch = tty->driver_data;
 	if (ch == NULL)
@@ -1374,7 +1374,7 @@ static int moxa_poll_port(struct moxa_port *p, unsigned int handle,
 			clear_bit(EMPTYWAIT, &p->statusflags);
 			tty_wakeup(tty);
 		}
-		if (test_bit(LOWWAIT, &p->statusflags) && !tty->stopped &&
+		if (test_bit(LOWWAIT, &p->statusflags) && !tty->flow.stopped &&
 				MoxaPortTxQueue(p) <= WAKEUP_CHARS) {
 			clear_bit(LOWWAIT, &p->statusflags);
 			tty_wakeup(tty);
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index 16a852ecbe8a..85271e109014 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -1118,7 +1118,7 @@ static int mxser_write(struct tty_struct *tty, const unsigned char *buf, int cou
 		total += c;
 	}
 
-	if (info->xmit_cnt && !tty->stopped) {
+	if (info->xmit_cnt && !tty->flow.stopped) {
 		if (!tty->hw_stopped ||
 				(info->type == PORT_16550A) ||
 				(info->board->chip_flag)) {
@@ -1149,7 +1149,7 @@ static int mxser_put_char(struct tty_struct *tty, unsigned char ch)
 	info->xmit_head &= SERIAL_XMIT_SIZE - 1;
 	info->xmit_cnt++;
 	spin_unlock_irqrestore(&info->slock, flags);
-	if (!tty->stopped) {
+	if (!tty->flow.stopped) {
 		if (!tty->hw_stopped ||
 				(info->type == PORT_16550A) ||
 				info->board->chip_flag) {
@@ -1169,7 +1169,7 @@ static void mxser_flush_chars(struct tty_struct *tty)
 	struct mxser_port *info = tty->driver_data;
 	unsigned long flags;
 
-	if (info->xmit_cnt <= 0 || tty->stopped || !info->port.xmit_buf ||
+	if (info->xmit_cnt <= 0 || tty->flow.stopped || !info->port.xmit_buf ||
 			(tty->hw_stopped && info->type != PORT_16550A &&
 			 !info->board->chip_flag))
 		return;
@@ -1917,7 +1917,7 @@ static void mxser_unthrottle(struct tty_struct *tty)
 /*
  * mxser_stop() and mxser_start()
  *
- * This routines are called before setting or resetting tty->stopped.
+ * This routines are called before setting or resetting tty->flow.stopped.
  * They enable or disable transmitter interrupts, as necessary.
  */
 static void mxser_stop(struct tty_struct *tty)
@@ -1963,7 +1963,7 @@ static void mxser_set_termios(struct tty_struct *tty, struct ktermios *old_termi
 
 	/* Handle sw stopped */
 	if ((old_termios->c_iflag & IXON) && !I_IXON(tty)) {
-		tty->stopped = 0;
+		tty->flow.stopped = 0;
 
 		if (info->board->chip_flag) {
 			spin_lock_irqsave(&info->slock, flags);
@@ -2175,7 +2175,7 @@ static void mxser_transmit_chars(struct tty_struct *tty, struct mxser_port *port
 	if (port->port.xmit_buf == NULL)
 		return;
 
-	if (port->xmit_cnt <= 0 || tty->stopped ||
+	if (port->xmit_cnt <= 0 || tty->flow.stopped ||
 			(tty->hw_stopped &&
 			(port->type != PORT_16550A) &&
 			(!port->board->chip_flag))) {
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index c32318da5190..3566bb577eb0 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1289,7 +1289,7 @@ static void n_tty_receive_char_special(struct tty_struct *tty, unsigned char c)
 		}
 	}
 
-	if (tty->stopped && !tty->flow_stopped && I_IXON(tty) && I_IXANY(tty)) {
+	if (tty->flow.stopped && !tty->flow.tco_stopped && I_IXON(tty) && I_IXANY(tty)) {
 		start_tty(tty);
 		process_echoes(tty);
 	}
@@ -1398,7 +1398,7 @@ static void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 
-	if (tty->stopped && !tty->flow_stopped && I_IXON(tty) && I_IXANY(tty)) {
+	if (tty->flow.stopped && !tty->flow.tco_stopped && I_IXON(tty) && I_IXANY(tty)) {
 		start_tty(tty);
 		process_echoes(tty);
 	}
@@ -1427,7 +1427,7 @@ static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c)
 		if (c == STOP_CHAR(tty))
 			stop_tty(tty);
 		else if (c == START_CHAR(tty) ||
-			 (tty->stopped && !tty->flow_stopped && I_IXANY(tty) &&
+			 (tty->flow.stopped && !tty->flow.tco_stopped && I_IXANY(tty) &&
 			  c != INTR_CHAR(tty) && c != QUIT_CHAR(tty) &&
 			  c != SUSP_CHAR(tty))) {
 			start_tty(tty);
@@ -1797,7 +1797,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
 	 * Fix tty hang when I_IXON(tty) is cleared, but the tty
 	 * been stopped by STOP_CHAR(tty) before it.
 	 */
-	if (!I_IXON(tty) && old && (old->c_iflag & IXON) && !tty->flow_stopped) {
+	if (!I_IXON(tty) && old && (old->c_iflag & IXON) && !tty->flow.tco_stopped) {
 		start_tty(tty);
 		process_echoes(tty);
 	}
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 9b5d4ae5d8f2..017f28150a32 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -113,7 +113,7 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf, int c)
 	struct tty_struct *to = tty->link;
 	unsigned long flags;
 
-	if (tty->stopped)
+	if (tty->flow.stopped)
 		return 0;
 
 	if (c > 0) {
@@ -138,7 +138,7 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf, int c)
 
 static int pty_write_room(struct tty_struct *tty)
 {
-	if (tty->stopped)
+	if (tty->flow.stopped)
 		return 0;
 	return tty_buffer_space_avail(tty->link->port);
 }
diff --git a/drivers/tty/serial/arc_uart.c b/drivers/tty/serial/arc_uart.c
index 1a9444b6b57e..596217d10d5c 100644
--- a/drivers/tty/serial/arc_uart.c
+++ b/drivers/tty/serial/arc_uart.c
@@ -149,7 +149,7 @@ static unsigned int arc_serial_tx_empty(struct uart_port *port)
 /*
  * Driver internal routine, used by both tty(serial core) as well as tx-isr
  *  -Called under spinlock in either cases
- *  -also tty->stopped has already been checked
+ *  -also tty->flow.stopped has already been checked
  *     = by uart_start( ) before calling us
  *     = tx_ist checks that too before calling
  */
diff --git a/drivers/tty/serial/dz.c b/drivers/tty/serial/dz.c
index 4552742c3859..6d91e9b6284d 100644
--- a/drivers/tty/serial/dz.c
+++ b/drivers/tty/serial/dz.c
@@ -115,7 +115,7 @@ static void dz_out(struct dz_port *dport, unsigned offset, u16 value)
  * rs_stop () and rs_start ()
  *
  * These routines are called before setting or resetting
- * tty->stopped. They enable or disable transmitter interrupts,
+ * tty->flow.stopped. They enable or disable transmitter interrupts,
  * as necessary.
  * ------------------------------------------------------------
  */
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 5523cf7bd1c2..1555dccc28af 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -768,7 +768,7 @@ static int write(struct tty_struct *tty,
 	if (!info->tx_buf || (count > info->max_frame_size))
 		return -EIO;
 
-	if (!count || tty->stopped || tty->hw_stopped)
+	if (!count || tty->flow.stopped || tty->hw_stopped)
 		return 0;
 
 	spin_lock_irqsave(&info->lock, flags);
@@ -889,7 +889,7 @@ static void flush_chars(struct tty_struct *tty)
 		return;
 	DBGINFO(("%s flush_chars entry tx_count=%d\n", info->device_name, info->tx_count));
 
-	if (info->tx_count <= 0 || tty->stopped ||
+	if (info->tx_count <= 0 || tty->flow.stopped ||
 	    tty->hw_stopped || !info->tx_buf)
 		return;
 
@@ -2241,7 +2241,7 @@ static void isr_txeom(struct slgt_info *info, unsigned short status)
 		else
 #endif
 		{
-			if (info->port.tty && (info->port.tty->stopped || info->port.tty->hw_stopped)) {
+			if (info->port.tty && (info->port.tty->flow.stopped || info->port.tty->hw_stopped)) {
 				tx_stop(info);
 				return;
 			}
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 5b5e99604989..9734be2eb00e 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -778,14 +778,14 @@ EXPORT_SYMBOL(tty_hung_up_p);
  *	but not always.
  *
  *	Locking:
- *		flow_lock
+ *		flow.lock
  */
 
 void __stop_tty(struct tty_struct *tty)
 {
-	if (tty->stopped)
+	if (tty->flow.stopped)
 		return;
-	tty->stopped = 1;
+	tty->flow.stopped = true;
 	if (tty->ops->stop)
 		tty->ops->stop(tty);
 }
@@ -794,9 +794,9 @@ void stop_tty(struct tty_struct *tty)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&tty->flow_lock, flags);
+	spin_lock_irqsave(&tty->flow.lock, flags);
 	__stop_tty(tty);
-	spin_unlock_irqrestore(&tty->flow_lock, flags);
+	spin_unlock_irqrestore(&tty->flow.lock, flags);
 }
 EXPORT_SYMBOL(stop_tty);
 
@@ -809,14 +809,14 @@ EXPORT_SYMBOL(stop_tty);
  *	start method is invoked and the line discipline woken.
  *
  *	Locking:
- *		flow_lock
+ *		flow.lock
  */
 
 void __start_tty(struct tty_struct *tty)
 {
-	if (!tty->stopped || tty->flow_stopped)
+	if (!tty->flow.stopped || tty->flow.tco_stopped)
 		return;
-	tty->stopped = 0;
+	tty->flow.stopped = false;
 	if (tty->ops->start)
 		tty->ops->start(tty);
 	tty_wakeup(tty);
@@ -826,9 +826,9 @@ void start_tty(struct tty_struct *tty)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&tty->flow_lock, flags);
+	spin_lock_irqsave(&tty->flow.lock, flags);
 	__start_tty(tty);
-	spin_unlock_irqrestore(&tty->flow_lock, flags);
+	spin_unlock_irqrestore(&tty->flow.lock, flags);
 }
 EXPORT_SYMBOL(start_tty);
 
@@ -1172,7 +1172,7 @@ ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter)
 
 int tty_send_xchar(struct tty_struct *tty, char ch)
 {
-	int	was_stopped = tty->stopped;
+	bool was_stopped = tty->flow.stopped;
 
 	if (tty->ops->send_xchar) {
 		down_read(&tty->termios_rwsem);
@@ -3141,7 +3141,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
 	INIT_WORK(&tty->hangup_work, do_tty_hangup);
 	mutex_init(&tty->atomic_write_lock);
 	spin_lock_init(&tty->ctrl_lock);
-	spin_lock_init(&tty->flow_lock);
+	spin_lock_init(&tty->flow.lock);
 	spin_lock_init(&tty->files_lock);
 	INIT_LIST_HEAD(&tty->tty_files);
 	INIT_WORK(&tty->SAK_work, do_SAK_work);
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 41f7449d0464..07c88ccfb17a 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -846,20 +846,20 @@ int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file,
 			return retval;
 		switch (arg) {
 		case TCOOFF:
-			spin_lock_irq(&tty->flow_lock);
-			if (!tty->flow_stopped) {
-				tty->flow_stopped = 1;
+			spin_lock_irq(&tty->flow.lock);
+			if (!tty->flow.tco_stopped) {
+				tty->flow.tco_stopped = true;
 				__stop_tty(tty);
 			}
-			spin_unlock_irq(&tty->flow_lock);
+			spin_unlock_irq(&tty->flow.lock);
 			break;
 		case TCOON:
-			spin_lock_irq(&tty->flow_lock);
-			if (tty->flow_stopped) {
-				tty->flow_stopped = 0;
+			spin_lock_irq(&tty->flow.lock);
+			if (tty->flow.tco_stopped) {
+				tty->flow.tco_stopped = false;
 				__start_tty(tty);
 			}
-			spin_unlock_irq(&tty->flow_lock);
+			spin_unlock_irq(&tty->flow.lock);
 			break;
 		case TCIOFF:
 			if (STOP_CHAR(tty) != __DISABLED_CHAR)
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 303c198fbf5c..0eb523207828 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -587,7 +587,7 @@ int tty_port_close_start(struct tty_port *port,
 
 	if (tty_port_initialized(port)) {
 		/* Don't block on a stalled port, just pull the chain */
-		if (tty->flow_stopped)
+		if (tty->flow.tco_stopped)
 			tty_driver_flush_buffer(tty);
 		if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE)
 			tty_wait_until_sent(tty, port->closing_wait);
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 5d2309742718..4b0d69042ceb 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -515,7 +515,7 @@ static void fn_hold(struct vc_data *vc)
 	 * these routines are also activated by ^S/^Q.
 	 * (And SCROLLOCK can also be set by the ioctl KDSKBLED.)
 	 */
-	if (tty->stopped)
+	if (tty->flow.stopped)
 		start_tty(tty);
 	else
 		stop_tty(tty);
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index e5040bdadcd6..38c677fb6505 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -2888,7 +2888,7 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
 
 	param.vc = vc;
 
-	while (!tty->stopped && count) {
+	while (!tty->flow.stopped && count) {
 		int orig = *buf;
 		buf++;
 		n++;
@@ -3265,7 +3265,7 @@ static int con_put_char(struct tty_struct *tty, unsigned char ch)
 
 static int con_write_room(struct tty_struct *tty)
 {
-	if (tty->stopped)
+	if (tty->flow.stopped)
 		return 0;
 	return 32768;		/* No limit, really; we're not buffering */
 }
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index d7ed00f1594e..7445c8fd88c0 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -428,7 +428,7 @@ int uart_resume_port(struct uart_driver *reg, struct uart_port *port);
 static inline int uart_tx_stopped(struct uart_port *port)
 {
 	struct tty_struct *tty = port->state->port.tty;
-	if ((tty && tty->stopped) || port->hw_stopped)
+	if ((tty && tty->flow.stopped) || port->hw_stopped)
 		return 1;
 	return 0;
 }
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 5aad2220266c..df3a69b2e1ea 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -243,20 +243,22 @@ struct tty_port {
 #define TTY_PORT_KOPENED	5	/* device exclusively opened by
 					   kernel */
 
-/*
- * Where all of the state associated with a tty is kept while the tty
- * is open.  Since the termios state should be kept even if the tty
- * has been closed --- for things like the baud rate, etc --- it is
- * not stored here, but rather a pointer to the real state is stored
- * here.  Possible the winsize structure should have the same
- * treatment, but (1) the default 80x24 is usually right and (2) it's
- * most often used by a windowing system, which will set the correct
- * size each time the window is created or resized anyway.
- * 						- TYT, 9/14/92
- */
-
 struct tty_operations;
 
+/**
+ * struct tty_struct - state associated with a tty while open
+ *
+ * @flow.lock: lock for flow members
+ * @flow.stopped: tty stopped/started by tty_stop/tty_start
+ * @flow.tco_stopped: tty stopped/started by TCOOFF/TCOON ioctls (it has
+ *		      precedense over @flow.stopped)
+ * @flow.unused: alignment for Alpha, so that no members other than @flow.* are
+ *		 modified by the same 64b word store. The @flow's __aligned is
+ *		 there for the very same reason.
+ *
+ * All of the state associated with a tty while the tty is open. Persistent
+ * storage for tty devices is referenced here as @port in struct tty_port.
+ */
 struct tty_struct {
 	int	magic;
 	struct kref kref;
@@ -275,7 +277,6 @@ struct tty_struct {
 	struct rw_semaphore termios_rwsem;
 	struct mutex winsize_mutex;
 	spinlock_t ctrl_lock;
-	spinlock_t flow_lock;
 	/* Termios values are protected by the termios rwsem */
 	struct ktermios termios, termios_locked;
 	char name[64];
@@ -288,9 +289,14 @@ struct tty_struct {
 	unsigned long flags;
 	int count;
 	struct winsize winsize;		/* winsize_mutex */
-	unsigned long stopped:1,	/* flow_lock */
-		      flow_stopped:1,
-		      unused:BITS_PER_LONG - 2;
+
+	struct {
+		spinlock_t lock;
+		bool stopped;
+		bool tco_stopped;
+		unsigned long unused[0];
+	} __aligned(sizeof(unsigned long)) flow;
+
 	int hw_stopped;
 	unsigned long ctrl_status:8,	/* ctrl_lock */
 		      packet:1,
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 2f719b471d52..653fa5af3a22 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -153,7 +153,7 @@
  * 	This routine notifies the tty driver that it should stop
  * 	outputting characters to the tty device.  
  *
- *	Called with ->flow_lock held. Serialized with start() method.
+ *	Called with ->flow.lock held. Serialized with start() method.
  *
  *	Optional:
  *
@@ -164,7 +164,7 @@
  * 	This routine notifies the tty driver that it resume sending
  *	characters to the tty device.
  *
- *	Called with ->flow_lock held. Serialized with stop() method.
+ *	Called with ->flow.lock held. Serialized with stop() method.
  *
  *	Optional:
  *
-- 
cgit v1.2.3


From 64d608db38ffc0c7a25455387096e0aad9410397 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:06 +0200
Subject: tty: cumulate and document tty_struct::ctrl* members

Group the ctrl members under a single struct called ctrl. The new struct
contains 'pgrp', 'session', 'pktstatus', and 'packet'. 'pktstatus' and
'packet' used to be bits in a bitfield. The struct also contains the
lock protecting them to share the same cache line.

Note that commit c545b66c6922b (tty: Serialize tcflow() with other tty
flow control changes) added a padding to the original bitfield. It was
for the bitfield to occupy a whole 64b word to avoid interferring stores
on Alpha (cannot we evaporate this arch with weird implications to C
code yet?). But it doesn't work as expected as the padding
(tty_struct::ctrl_unused) is aligned to a 8B boundary too and occupies
some bytes from the next word.

So make it reliable by:
1) setting __aligned of the struct -- that aligns the start, and
2) making 'unsigned long unused[0]' as the last member of the struct --
   pads the end.

Add a kerneldoc comment for this grouped members.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org
Link: https://lore.kernel.org/r/20210505091928.22010-14-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/net/caif/caif_serial.c |  2 +-
 drivers/tty/n_tty.c            | 30 +++++++--------
 drivers/tty/pty.c              | 62 +++++++++++++++----------------
 drivers/tty/tty_io.c           | 44 +++++++++++-----------
 drivers/tty/tty_jobctrl.c      | 84 +++++++++++++++++++++---------------------
 drivers/tty/vt/vt.c            |  4 +-
 include/linux/tty.h            | 26 ++++++++-----
 7 files changed, 130 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index 3996cf7dc9ba..b0566588ce33 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -89,7 +89,7 @@ static inline void update_tty_status(struct ser_device *ser)
 	ser->tty_status =
 		ser->tty->flow.stopped << 5 |
 		ser->tty->flow.tco_stopped << 3 |
-		ser->tty->packet << 2;
+		ser->tty->ctrl.packet << 2;
 }
 static inline void debugfs_init(struct ser_device *ser, struct tty_struct *tty)
 {
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 3566bb577eb0..8ce712eec026 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -342,10 +342,10 @@ static void n_tty_packet_mode_flush(struct tty_struct *tty)
 {
 	unsigned long flags;
 
-	if (tty->link->packet) {
-		spin_lock_irqsave(&tty->ctrl_lock, flags);
-		tty->ctrl_status |= TIOCPKT_FLUSHREAD;
-		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+	if (tty->link->ctrl.packet) {
+		spin_lock_irqsave(&tty->ctrl.lock, flags);
+		tty->ctrl.pktstatus |= TIOCPKT_FLUSHREAD;
+		spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 		wake_up_interruptible(&tty->link->read_wait);
 	}
 }
@@ -361,7 +361,7 @@ static void n_tty_packet_mode_flush(struct tty_struct *tty)
  *	Holds termios_rwsem to exclude producer/consumer while
  *	buffer indices are reset.
  *
- *	Locking: ctrl_lock, exclusive termios_rwsem
+ *	Locking: ctrl.lock, exclusive termios_rwsem
  */
 
 static void n_tty_flush_buffer(struct tty_struct *tty)
@@ -1103,7 +1103,7 @@ static void eraser(unsigned char c, struct tty_struct *tty)
  *	buffer is 'output'. The signal is processed first to alert any current
  *	readers or writers to discontinue and exit their i/o loops.
  *
- *	Locking: ctrl_lock
+ *	Locking: ctrl.lock
  */
 
 static void __isig(int sig, struct tty_struct *tty)
@@ -2025,7 +2025,7 @@ static bool canon_copy_from_read_buf(struct tty_struct *tty,
  *
  *	Locking: redirected write test is safe
  *		 current->signal->tty check is safe
- *		 ctrl_lock to safely reference tty->pgrp
+ *		 ctrl.lock to safely reference tty->ctrl.pgrp
  */
 
 static int job_control(struct tty_struct *tty, struct file *file)
@@ -2072,7 +2072,7 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 	int minimum, time;
 	ssize_t retval = 0;
 	long timeout;
-	int packet;
+	bool packet;
 	size_t tail;
 
 	/*
@@ -2128,20 +2128,20 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 		}
 	}
 
-	packet = tty->packet;
+	packet = tty->ctrl.packet;
 	tail = ldata->read_tail;
 
 	add_wait_queue(&tty->read_wait, &wait);
 	while (nr) {
 		/* First test for status change. */
-		if (packet && tty->link->ctrl_status) {
+		if (packet && tty->link->ctrl.pktstatus) {
 			unsigned char cs;
 			if (kb != kbuf)
 				break;
-			spin_lock_irq(&tty->link->ctrl_lock);
-			cs = tty->link->ctrl_status;
-			tty->link->ctrl_status = 0;
-			spin_unlock_irq(&tty->link->ctrl_lock);
+			spin_lock_irq(&tty->link->ctrl.lock);
+			cs = tty->link->ctrl.pktstatus;
+			tty->link->ctrl.pktstatus = 0;
+			spin_unlock_irq(&tty->link->ctrl.lock);
 			*kb++ = cs;
 			nr--;
 			break;
@@ -2368,7 +2368,7 @@ static __poll_t n_tty_poll(struct tty_struct *tty, struct file *file,
 		if (input_available_p(tty, 1))
 			mask |= EPOLLIN | EPOLLRDNORM;
 	}
-	if (tty->packet && tty->link->ctrl_status)
+	if (tty->ctrl.packet && tty->link->ctrl.pktstatus)
 		mask |= EPOLLPRI | EPOLLIN | EPOLLRDNORM;
 	if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
 		mask |= EPOLLHUP;
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 017f28150a32..3e7b5c811f9b 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -57,9 +57,9 @@ static void pty_close(struct tty_struct *tty, struct file *filp)
 	set_bit(TTY_IO_ERROR, &tty->flags);
 	wake_up_interruptible(&tty->read_wait);
 	wake_up_interruptible(&tty->write_wait);
-	spin_lock_irq(&tty->ctrl_lock);
-	tty->packet = 0;
-	spin_unlock_irq(&tty->ctrl_lock);
+	spin_lock_irq(&tty->ctrl.lock);
+	tty->ctrl.packet = false;
+	spin_unlock_irq(&tty->ctrl.lock);
 	/* Review - krefs on tty_link ?? */
 	if (!tty->link)
 		return;
@@ -185,16 +185,16 @@ static int pty_set_pktmode(struct tty_struct *tty, int __user *arg)
 	if (get_user(pktmode, arg))
 		return -EFAULT;
 
-	spin_lock_irq(&tty->ctrl_lock);
+	spin_lock_irq(&tty->ctrl.lock);
 	if (pktmode) {
-		if (!tty->packet) {
-			tty->link->ctrl_status = 0;
+		if (!tty->ctrl.packet) {
+			tty->link->ctrl.pktstatus = 0;
 			smp_mb();
-			tty->packet = 1;
+			tty->ctrl.packet = true;
 		}
 	} else
-		tty->packet = 0;
-	spin_unlock_irq(&tty->ctrl_lock);
+		tty->ctrl.packet = false;
+	spin_unlock_irq(&tty->ctrl.lock);
 
 	return 0;
 }
@@ -202,7 +202,7 @@ static int pty_set_pktmode(struct tty_struct *tty, int __user *arg)
 /* Get the packet mode of a pty */
 static int pty_get_pktmode(struct tty_struct *tty, int __user *arg)
 {
-	int pktmode = tty->packet;
+	int pktmode = tty->ctrl.packet;
 
 	return put_user(pktmode, arg);
 }
@@ -232,11 +232,11 @@ static void pty_flush_buffer(struct tty_struct *tty)
 		return;
 
 	tty_buffer_flush(to, NULL);
-	if (to->packet) {
-		spin_lock_irq(&tty->ctrl_lock);
-		tty->ctrl_status |= TIOCPKT_FLUSHWRITE;
+	if (to->ctrl.packet) {
+		spin_lock_irq(&tty->ctrl.lock);
+		tty->ctrl.pktstatus |= TIOCPKT_FLUSHWRITE;
 		wake_up_interruptible(&to->read_wait);
-		spin_unlock_irq(&tty->ctrl_lock);
+		spin_unlock_irq(&tty->ctrl.lock);
 	}
 }
 
@@ -266,7 +266,7 @@ static void pty_set_termios(struct tty_struct *tty,
 					struct ktermios *old_termios)
 {
 	/* See if packet mode change of state. */
-	if (tty->link && tty->link->packet) {
+	if (tty->link && tty->link->ctrl.packet) {
 		int extproc = (old_termios->c_lflag & EXTPROC) | L_EXTPROC(tty);
 		int old_flow = ((old_termios->c_iflag & IXON) &&
 				(old_termios->c_cc[VSTOP] == '\023') &&
@@ -275,17 +275,17 @@ static void pty_set_termios(struct tty_struct *tty,
 				STOP_CHAR(tty) == '\023' &&
 				START_CHAR(tty) == '\021');
 		if ((old_flow != new_flow) || extproc) {
-			spin_lock_irq(&tty->ctrl_lock);
+			spin_lock_irq(&tty->ctrl.lock);
 			if (old_flow != new_flow) {
-				tty->ctrl_status &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP);
+				tty->ctrl.pktstatus &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP);
 				if (new_flow)
-					tty->ctrl_status |= TIOCPKT_DOSTOP;
+					tty->ctrl.pktstatus |= TIOCPKT_DOSTOP;
 				else
-					tty->ctrl_status |= TIOCPKT_NOSTOP;
+					tty->ctrl.pktstatus |= TIOCPKT_NOSTOP;
 			}
 			if (extproc)
-				tty->ctrl_status |= TIOCPKT_IOCTL;
-			spin_unlock_irq(&tty->ctrl_lock);
+				tty->ctrl.pktstatus |= TIOCPKT_IOCTL;
+			spin_unlock_irq(&tty->ctrl.lock);
 			wake_up_interruptible(&tty->link->read_wait);
 		}
 	}
@@ -346,11 +346,11 @@ static void pty_start(struct tty_struct *tty)
 {
 	unsigned long flags;
 
-	if (tty->link && tty->link->packet) {
-		spin_lock_irqsave(&tty->ctrl_lock, flags);
-		tty->ctrl_status &= ~TIOCPKT_STOP;
-		tty->ctrl_status |= TIOCPKT_START;
-		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+	if (tty->link && tty->link->ctrl.packet) {
+		spin_lock_irqsave(&tty->ctrl.lock, flags);
+		tty->ctrl.pktstatus &= ~TIOCPKT_STOP;
+		tty->ctrl.pktstatus |= TIOCPKT_START;
+		spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 		wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN);
 	}
 }
@@ -359,11 +359,11 @@ static void pty_stop(struct tty_struct *tty)
 {
 	unsigned long flags;
 
-	if (tty->link && tty->link->packet) {
-		spin_lock_irqsave(&tty->ctrl_lock, flags);
-		tty->ctrl_status &= ~TIOCPKT_START;
-		tty->ctrl_status |= TIOCPKT_STOP;
-		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+	if (tty->link && tty->link->ctrl.packet) {
+		spin_lock_irqsave(&tty->ctrl.lock, flags);
+		tty->ctrl.pktstatus &= ~TIOCPKT_START;
+		tty->ctrl.pktstatus |= TIOCPKT_STOP;
+		spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 		wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN);
 	}
 }
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 9734be2eb00e..362845dc1c19 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -638,15 +638,15 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session)
 
 	tty_ldisc_hangup(tty, cons_filp != NULL);
 
-	spin_lock_irq(&tty->ctrl_lock);
+	spin_lock_irq(&tty->ctrl.lock);
 	clear_bit(TTY_THROTTLED, &tty->flags);
 	clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
-	put_pid(tty->session);
-	put_pid(tty->pgrp);
-	tty->session = NULL;
-	tty->pgrp = NULL;
-	tty->ctrl_status = 0;
-	spin_unlock_irq(&tty->ctrl_lock);
+	put_pid(tty->ctrl.session);
+	put_pid(tty->ctrl.pgrp);
+	tty->ctrl.session = NULL;
+	tty->ctrl.pgrp = NULL;
+	tty->ctrl.pktstatus = 0;
+	spin_unlock_irq(&tty->ctrl.lock);
 
 	/*
 	 * If one of the devices matches a console pointer, we
@@ -1559,8 +1559,8 @@ static void release_one_tty(struct work_struct *work)
 	list_del_init(&tty->tty_files);
 	spin_unlock(&tty->files_lock);
 
-	put_pid(tty->pgrp);
-	put_pid(tty->session);
+	put_pid(tty->ctrl.pgrp);
+	put_pid(tty->ctrl.session);
 	free_tty_struct(tty);
 }
 
@@ -1861,9 +1861,9 @@ int tty_release(struct inode *inode, struct file *filp)
 	 */
 	if (!tty->count) {
 		read_lock(&tasklist_lock);
-		session_clear_tty(tty->session);
+		session_clear_tty(tty->ctrl.session);
 		if (o_tty)
-			session_clear_tty(o_tty->session);
+			session_clear_tty(o_tty->ctrl.session);
 		read_unlock(&tasklist_lock);
 	}
 
@@ -2250,16 +2250,16 @@ static int __tty_fasync(int fd, struct file *filp, int on)
 		enum pid_type type;
 		struct pid *pid;
 
-		spin_lock_irqsave(&tty->ctrl_lock, flags);
-		if (tty->pgrp) {
-			pid = tty->pgrp;
+		spin_lock_irqsave(&tty->ctrl.lock, flags);
+		if (tty->ctrl.pgrp) {
+			pid = tty->ctrl.pgrp;
 			type = PIDTYPE_PGID;
 		} else {
 			pid = task_pid(current);
 			type = PIDTYPE_TGID;
 		}
 		get_pid(pid);
-		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+		spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 		__f_setown(filp, pid, type, 0);
 		put_pid(pid);
 		retval = 0;
@@ -2381,7 +2381,7 @@ EXPORT_SYMBOL(tty_do_resize);
  *
  *	Locking:
  *		Driver dependent. The default do_resize method takes the
- *	tty termios mutex and ctrl_lock. The console takes its own lock
+ *	tty termios mutex and ctrl.lock. The console takes its own lock
  *	then calls into the default method.
  */
 
@@ -3039,9 +3039,9 @@ void __do_SAK(struct tty_struct *tty)
 	if (!tty)
 		return;
 
-	spin_lock_irqsave(&tty->ctrl_lock, flags);
-	session = get_pid(tty->session);
-	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+	spin_lock_irqsave(&tty->ctrl.lock, flags);
+	session = get_pid(tty->ctrl.session);
+	spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 
 	tty_ldisc_flush(tty);
 
@@ -3129,8 +3129,8 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
 		kfree(tty);
 		return NULL;
 	}
-	tty->session = NULL;
-	tty->pgrp = NULL;
+	tty->ctrl.session = NULL;
+	tty->ctrl.pgrp = NULL;
 	mutex_init(&tty->legacy_mutex);
 	mutex_init(&tty->throttle_mutex);
 	init_rwsem(&tty->termios_rwsem);
@@ -3140,7 +3140,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
 	init_waitqueue_head(&tty->read_wait);
 	INIT_WORK(&tty->hangup_work, do_tty_hangup);
 	mutex_init(&tty->atomic_write_lock);
-	spin_lock_init(&tty->ctrl_lock);
+	spin_lock_init(&tty->ctrl.lock);
 	spin_lock_init(&tty->flow.lock);
 	spin_lock_init(&tty->files_lock);
 	INIT_LIST_HEAD(&tty->tty_files);
diff --git a/drivers/tty/tty_jobctrl.c b/drivers/tty/tty_jobctrl.c
index 7813dc910a19..6119b5e48610 100644
--- a/drivers/tty/tty_jobctrl.c
+++ b/drivers/tty/tty_jobctrl.c
@@ -28,7 +28,7 @@ static int is_ignored(int sig)
  *	not in the foreground, send a SIGTTOU.  If the signal is blocked or
  *	ignored, go ahead and perform the operation.  (POSIX 7.2)
  *
- *	Locking: ctrl_lock
+ *	Locking: ctrl.lock
  */
 int __tty_check_change(struct tty_struct *tty, int sig)
 {
@@ -42,9 +42,9 @@ int __tty_check_change(struct tty_struct *tty, int sig)
 	rcu_read_lock();
 	pgrp = task_pgrp(current);
 
-	spin_lock_irqsave(&tty->ctrl_lock, flags);
-	tty_pgrp = tty->pgrp;
-	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+	spin_lock_irqsave(&tty->ctrl.lock, flags);
+	tty_pgrp = tty->ctrl.pgrp;
+	spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 
 	if (tty_pgrp && pgrp != tty_pgrp) {
 		if (is_ignored(sig)) {
@@ -99,16 +99,16 @@ static void __proc_set_tty(struct tty_struct *tty)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&tty->ctrl_lock, flags);
+	spin_lock_irqsave(&tty->ctrl.lock, flags);
 	/*
 	 * The session and fg pgrp references will be non-NULL if
 	 * tiocsctty() is stealing the controlling tty
 	 */
-	put_pid(tty->session);
-	put_pid(tty->pgrp);
-	tty->pgrp = get_pid(task_pgrp(current));
-	tty->session = get_pid(task_session(current));
-	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+	put_pid(tty->ctrl.session);
+	put_pid(tty->ctrl.pgrp);
+	tty->ctrl.pgrp = get_pid(task_pgrp(current));
+	tty->ctrl.session = get_pid(task_session(current));
+	spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 	if (current->signal->tty) {
 		tty_debug(tty, "current tty %s not NULL!!\n",
 			  current->signal->tty->name);
@@ -135,7 +135,7 @@ void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty)
 	spin_lock_irq(&current->sighand->siglock);
 	if (current->signal->leader &&
 	    !current->signal->tty &&
-	    tty->session == NULL) {
+	    tty->ctrl.session == NULL) {
 		/*
 		 * Don't let a process that only has write access to the tty
 		 * obtain the privileges associated with having a tty as
@@ -200,8 +200,8 @@ int tty_signal_session_leader(struct tty_struct *tty, int exit_session)
 	struct pid *tty_pgrp = NULL;
 
 	read_lock(&tasklist_lock);
-	if (tty->session) {
-		do_each_pid_task(tty->session, PIDTYPE_SID, p) {
+	if (tty->ctrl.session) {
+		do_each_pid_task(tty->ctrl.session, PIDTYPE_SID, p) {
 			spin_lock_irq(&p->sighand->siglock);
 			if (p->signal->tty == tty) {
 				p->signal->tty = NULL;
@@ -218,13 +218,14 @@ int tty_signal_session_leader(struct tty_struct *tty, int exit_session)
 			__group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p);
 			__group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p);
 			put_pid(p->signal->tty_old_pgrp);  /* A noop */
-			spin_lock(&tty->ctrl_lock);
-			tty_pgrp = get_pid(tty->pgrp);
-			if (tty->pgrp)
-				p->signal->tty_old_pgrp = get_pid(tty->pgrp);
-			spin_unlock(&tty->ctrl_lock);
+			spin_lock(&tty->ctrl.lock);
+			tty_pgrp = get_pid(tty->ctrl.pgrp);
+			if (tty->ctrl.pgrp)
+				p->signal->tty_old_pgrp =
+					get_pid(tty->ctrl.pgrp);
+			spin_unlock(&tty->ctrl.lock);
 			spin_unlock_irq(&p->sighand->siglock);
-		} while_each_pid_task(tty->session, PIDTYPE_SID, p);
+		} while_each_pid_task(tty->ctrl.session, PIDTYPE_SID, p);
 	}
 	read_unlock(&tasklist_lock);
 
@@ -309,12 +310,12 @@ void disassociate_ctty(int on_exit)
 		unsigned long flags;
 
 		tty_lock(tty);
-		spin_lock_irqsave(&tty->ctrl_lock, flags);
-		put_pid(tty->session);
-		put_pid(tty->pgrp);
-		tty->session = NULL;
-		tty->pgrp = NULL;
-		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+		spin_lock_irqsave(&tty->ctrl.lock, flags);
+		put_pid(tty->ctrl.session);
+		put_pid(tty->ctrl.pgrp);
+		tty->ctrl.session = NULL;
+		tty->ctrl.pgrp = NULL;
+		spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 		tty_unlock(tty);
 		tty_kref_put(tty);
 	}
@@ -363,7 +364,8 @@ static int tiocsctty(struct tty_struct *tty, struct file *file, int arg)
 	tty_lock(tty);
 	read_lock(&tasklist_lock);
 
-	if (current->signal->leader && (task_session(current) == tty->session))
+	if (current->signal->leader &&
+			task_session(current) == tty->ctrl.session)
 		goto unlock;
 
 	/*
@@ -375,7 +377,7 @@ static int tiocsctty(struct tty_struct *tty, struct file *file, int arg)
 		goto unlock;
 	}
 
-	if (tty->session) {
+	if (tty->ctrl.session) {
 		/*
 		 * This tty is already the controlling
 		 * tty for another session group!
@@ -384,7 +386,7 @@ static int tiocsctty(struct tty_struct *tty, struct file *file, int arg)
 			/*
 			 * Steal it away
 			 */
-			session_clear_tty(tty->session);
+			session_clear_tty(tty->ctrl.session);
 		} else {
 			ret = -EPERM;
 			goto unlock;
@@ -416,9 +418,9 @@ struct pid *tty_get_pgrp(struct tty_struct *tty)
 	unsigned long flags;
 	struct pid *pgrp;
 
-	spin_lock_irqsave(&tty->ctrl_lock, flags);
-	pgrp = get_pid(tty->pgrp);
-	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+	spin_lock_irqsave(&tty->ctrl.lock, flags);
+	pgrp = get_pid(tty->ctrl.pgrp);
+	spin_unlock_irqrestore(&tty->ctrl.lock, flags);
 
 	return pgrp;
 }
@@ -499,10 +501,10 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 	if (pgrp_nr < 0)
 		return -EINVAL;
 
-	spin_lock_irq(&real_tty->ctrl_lock);
+	spin_lock_irq(&real_tty->ctrl.lock);
 	if (!current->signal->tty ||
 	    (current->signal->tty != real_tty) ||
-	    (real_tty->session != task_session(current))) {
+	    (real_tty->ctrl.session != task_session(current))) {
 		retval = -ENOTTY;
 		goto out_unlock_ctrl;
 	}
@@ -515,12 +517,12 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 	if (session_of_pgrp(pgrp) != task_session(current))
 		goto out_unlock;
 	retval = 0;
-	put_pid(real_tty->pgrp);
-	real_tty->pgrp = get_pid(pgrp);
+	put_pid(real_tty->ctrl.pgrp);
+	real_tty->ctrl.pgrp = get_pid(pgrp);
 out_unlock:
 	rcu_read_unlock();
 out_unlock_ctrl:
-	spin_unlock_irq(&real_tty->ctrl_lock);
+	spin_unlock_irq(&real_tty->ctrl.lock);
 	return retval;
 }
 
@@ -545,16 +547,16 @@ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t _
 	if (tty == real_tty && current->signal->tty != real_tty)
 		return -ENOTTY;
 
-	spin_lock_irqsave(&real_tty->ctrl_lock, flags);
-	if (!real_tty->session)
+	spin_lock_irqsave(&real_tty->ctrl.lock, flags);
+	if (!real_tty->ctrl.session)
 		goto err;
-	sid = pid_vnr(real_tty->session);
-	spin_unlock_irqrestore(&real_tty->ctrl_lock, flags);
+	sid = pid_vnr(real_tty->ctrl.session);
+	spin_unlock_irqrestore(&real_tty->ctrl.lock, flags);
 
 	return put_user(sid, p);
 
 err:
-	spin_unlock_irqrestore(&real_tty->ctrl_lock, flags);
+	spin_unlock_irqrestore(&real_tty->ctrl.lock, flags);
 	return -ENOTTY;
 }
 
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 38c677fb6505..706f066eb711 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -1189,7 +1189,7 @@ static inline int resize_screen(struct vc_data *vc, int width, int height,
  *	information and perform any necessary signal handling.
  *
  *	Caller must hold the console semaphore. Takes the termios rwsem and
- *	ctrl_lock of the tty IFF a tty is passed.
+ *	ctrl.lock of the tty IFF a tty is passed.
  */
 
 static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc,
@@ -1355,7 +1355,7 @@ int vc_resize(struct vc_data *vc, unsigned int cols, unsigned int rows)
  *	the actual work.
  *
  *	Takes the console sem and the called methods then take the tty
- *	termios_rwsem and the tty ctrl_lock in that order.
+ *	termios_rwsem and the tty ctrl.lock in that order.
  */
 static int vt_resize(struct tty_struct *tty, struct winsize *ws)
 {
diff --git a/include/linux/tty.h b/include/linux/tty.h
index df3a69b2e1ea..283ac5f29052 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -255,6 +255,13 @@ struct tty_operations;
  * @flow.unused: alignment for Alpha, so that no members other than @flow.* are
  *		 modified by the same 64b word store. The @flow's __aligned is
  *		 there for the very same reason.
+ * @ctrl.lock: lock for ctrl members
+ * @ctrl.pgrp: process group of this tty (setpgrp(2))
+ * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both
+ *		  @ctrl.lock and legacy mutex, readers must use at least one of
+ *		  them.
+ * @ctrl.pktstatus: packet mode status (bitwise OR of TIOCPKT_* constants)
+ * @ctrl.packet: packet mode enabled
  *
  * All of the state associated with a tty while the tty is open. Persistent
  * storage for tty devices is referenced here as @port in struct tty_port.
@@ -276,16 +283,9 @@ struct tty_struct {
 	struct mutex throttle_mutex;
 	struct rw_semaphore termios_rwsem;
 	struct mutex winsize_mutex;
-	spinlock_t ctrl_lock;
 	/* Termios values are protected by the termios rwsem */
 	struct ktermios termios, termios_locked;
 	char name[64];
-	struct pid *pgrp;		/* Protected by ctrl lock */
-	/*
-	 * Writes protected by both ctrl lock and legacy mutex, readers must use
-	 * at least one of them.
-	 */
-	struct pid *session;
 	unsigned long flags;
 	int count;
 	struct winsize winsize;		/* winsize_mutex */
@@ -297,10 +297,16 @@ struct tty_struct {
 		unsigned long unused[0];
 	} __aligned(sizeof(unsigned long)) flow;
 
+	struct {
+		spinlock_t lock;
+		struct pid *pgrp;
+		struct pid *session;
+		unsigned char pktstatus;
+		bool packet;
+		unsigned long unused[0];
+	} __aligned(sizeof(unsigned long)) ctrl;
+
 	int hw_stopped;
-	unsigned long ctrl_status:8,	/* ctrl_lock */
-		      packet:1,
-		      unused_ctrl:BITS_PER_LONG - 9;
 	unsigned int receive_room;	/* Bytes free for queue */
 	int flow_change;
 
-- 
cgit v1.2.3


From fbadf70a8053b3dce78a45997ae55651693a2a81 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:07 +0200
Subject: tty: set tty_ldisc_ops::num statically

There is no reason to pass the ldisc number to tty_register_ldisc
separately. Just set it in the already defined tty_ldisc_ops in all the
ldiscs.

This simplifies tty_register_ldisc a bit too (no need to set the num
member there).

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: William Hubbs <w.d.hubbs@gmail.com>
Cc: Chris Brannon <chris@the-brannons.com>
Cc: Kirk Reiser <kirk@reisers.ca>
Cc: Samuel Thibault <samuel.thibault@ens-lyon.org>
Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Cc: Andreas Koensgen <ajk@comnets.uni-bremen.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rodolfo Giometti <giometti@enneenne.com>
Cc: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Cc: Liam Girdwood <lgirdwood@gmail.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Takashi Iwai <tiwai@suse.com>
Link: https://lore.kernel.org/r/20210505091928.22010-15-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/accessibility/speakup/spk_ttyio.c | 3 ++-
 drivers/bluetooth/hci_ldisc.c             | 3 ++-
 drivers/input/serio/serport.c             | 3 ++-
 drivers/misc/ti-st/st_core.c              | 3 ++-
 drivers/net/caif/caif_serial.c            | 3 ++-
 drivers/net/can/slcan.c                   | 3 ++-
 drivers/net/hamradio/6pack.c              | 4 +++-
 drivers/net/hamradio/mkiss.c              | 3 ++-
 drivers/net/ppp/ppp_async.c               | 3 ++-
 drivers/net/ppp/ppp_synctty.c             | 3 ++-
 drivers/net/slip/slip.c                   | 3 ++-
 drivers/pps/clients/pps-ldisc.c           | 3 ++-
 drivers/tty/n_gsm.c                       | 3 ++-
 drivers/tty/n_hdlc.c                      | 3 ++-
 drivers/tty/n_null.c                      | 3 ++-
 drivers/tty/n_tty.c                       | 3 ++-
 drivers/tty/tty_ldisc.c                   | 7 +++----
 include/linux/tty.h                       | 2 +-
 net/nfc/nci/uart.c                        | 3 ++-
 sound/soc/ti/ams-delta.c                  | 3 ++-
 20 files changed, 41 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/accessibility/speakup/spk_ttyio.c b/drivers/accessibility/speakup/spk_ttyio.c
index b7fd094700d9..8faa27bae6bf 100644
--- a/drivers/accessibility/speakup/spk_ttyio.c
+++ b/drivers/accessibility/speakup/spk_ttyio.c
@@ -105,6 +105,7 @@ static int spk_ttyio_receive_buf2(struct tty_struct *tty,
 
 static struct tty_ldisc_ops spk_ttyio_ldisc_ops = {
 	.owner          = THIS_MODULE,
+	.num		= N_SPEAKUP,
 	.name           = "speakup_ldisc",
 	.open           = spk_ttyio_ldisc_open,
 	.close          = spk_ttyio_ldisc_close,
@@ -212,7 +213,7 @@ static int spk_ttyio_initialise_ldisc(struct spk_synth *synth)
 
 void spk_ttyio_register_ldisc(void)
 {
-	if (tty_register_ldisc(N_SPEAKUP, &spk_ttyio_ldisc_ops))
+	if (tty_register_ldisc(&spk_ttyio_ldisc_ops))
 		pr_warn("speakup: Error registering line discipline. Most synths won't work.\n");
 }
 
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index e785851a92c1..ee32006e8fc9 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -821,6 +821,7 @@ static __poll_t hci_uart_tty_poll(struct tty_struct *tty,
 
 static struct tty_ldisc_ops hci_uart_ldisc = {
 	.owner		= THIS_MODULE,
+	.num		= N_HCI,
 	.name		= "n_hci",
 	.open		= hci_uart_tty_open,
 	.close		= hci_uart_tty_close,
@@ -840,7 +841,7 @@ static int __init hci_uart_init(void)
 	BT_INFO("HCI UART driver ver %s", VERSION);
 
 	/* Register the tty discipline */
-	err = tty_register_ldisc(N_HCI, &hci_uart_ldisc);
+	err = tty_register_ldisc(&hci_uart_ldisc);
 	if (err) {
 		BT_ERR("HCI line discipline registration failed. (%d)", err);
 		return err;
diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index ff3715315592..870b1d2606fc 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -274,6 +274,7 @@ static void serport_ldisc_write_wakeup(struct tty_struct * tty)
 
 static struct tty_ldisc_ops serport_ldisc = {
 	.owner =	THIS_MODULE,
+	.num =		N_MOUSE,
 	.name =		"input",
 	.open =		serport_ldisc_open,
 	.close =	serport_ldisc_close,
@@ -294,7 +295,7 @@ static struct tty_ldisc_ops serport_ldisc = {
 static int __init serport_init(void)
 {
 	int retval;
-	retval = tty_register_ldisc(N_MOUSE, &serport_ldisc);
+	retval = tty_register_ldisc(&serport_ldisc);
 	if (retval)
 		printk(KERN_ERR "serport.c: Error registering line discipline.\n");
 
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index 239a75502cd6..a4f5d02940c7 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -845,6 +845,7 @@ static void st_tty_flush_buffer(struct tty_struct *tty)
 }
 
 static struct tty_ldisc_ops st_ldisc_ops = {
+	.num = N_TI_WL,
 	.name = "n_st",
 	.open = st_tty_open,
 	.close = st_tty_close,
@@ -860,7 +861,7 @@ int st_core_init(struct st_data_s **core_data)
 	struct st_data_s *st_gdata;
 	long err;
 
-	err = tty_register_ldisc(N_TI_WL, &st_ldisc_ops);
+	err = tty_register_ldisc(&st_ldisc_ops);
 	if (err) {
 		pr_err("error registering %d line discipline %ld",
 			   N_TI_WL, err);
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index b0566588ce33..2407a0f6656d 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -382,6 +382,7 @@ static void ldisc_close(struct tty_struct *tty)
 /* The line discipline structure. */
 static struct tty_ldisc_ops caif_ldisc = {
 	.owner =	THIS_MODULE,
+	.num =		N_CAIF,
 	.name =		"n_caif",
 	.open =		ldisc_open,
 	.close =	ldisc_close,
@@ -431,7 +432,7 @@ static int __init caif_ser_init(void)
 {
 	int ret;
 
-	ret = tty_register_ldisc(N_CAIF, &caif_ldisc);
+	ret = tty_register_ldisc(&caif_ldisc);
 	if (ret < 0)
 		pr_err("cannot register CAIF ldisc=%d err=%d\n", N_CAIF, ret);
 
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index 7dc3e79cb5c4..e3f528c82242 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -698,6 +698,7 @@ static int slcan_ioctl(struct tty_struct *tty, struct file *file,
 
 static struct tty_ldisc_ops slc_ldisc = {
 	.owner		= THIS_MODULE,
+	.num		= N_SLCAN,
 	.name		= "slcan",
 	.open		= slcan_open,
 	.close		= slcan_close,
@@ -722,7 +723,7 @@ static int __init slcan_init(void)
 		return -ENOMEM;
 
 	/* Fill in our line protocol discipline, and register it */
-	status = tty_register_ldisc(N_SLCAN, &slc_ldisc);
+	status = tty_register_ldisc(&slc_ldisc);
 	if (status)  {
 		printk(KERN_ERR "slcan: can't register line discipline\n");
 		kfree(slcan_devs);
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index c0a80f04dd8e..4db1d3c4d771 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -744,6 +744,7 @@ static int sixpack_ioctl(struct tty_struct *tty, struct file *file,
 
 static struct tty_ldisc_ops sp_ldisc = {
 	.owner		= THIS_MODULE,
+	.num		= N_6PACK,
 	.name		= "6pack",
 	.open		= sixpack_open,
 	.close		= sixpack_close,
@@ -766,7 +767,8 @@ static int __init sixpack_init_driver(void)
 	printk(msg_banner);
 
 	/* Register the provided line protocol discipline */
-	if ((status = tty_register_ldisc(N_6PACK, &sp_ldisc)) != 0)
+	status = tty_register_ldisc(&sp_ldisc);
+	if (status)
 		printk(msg_regfail, status);
 
 	return status;
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index fc05ded48178..1eb87a5a9394 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -933,6 +933,7 @@ out:
 
 static struct tty_ldisc_ops ax_ldisc = {
 	.owner		= THIS_MODULE,
+	.num		= N_AX25,
 	.name		= "mkiss",
 	.open		= mkiss_open,
 	.close		= mkiss_close,
@@ -952,7 +953,7 @@ static int __init mkiss_init_driver(void)
 
 	printk(banner);
 
-	status = tty_register_ldisc(N_AX25, &ax_ldisc);
+	status = tty_register_ldisc(&ax_ldisc);
 	if (status != 0)
 		printk(msg_regfail, status);
 
diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c
index 9f08bd19551f..4bfb66c40c86 100644
--- a/drivers/net/ppp/ppp_async.c
+++ b/drivers/net/ppp/ppp_async.c
@@ -372,6 +372,7 @@ ppp_asynctty_wakeup(struct tty_struct *tty)
 
 static struct tty_ldisc_ops ppp_ldisc = {
 	.owner  = THIS_MODULE,
+	.num	= N_PPP,
 	.name	= "ppp",
 	.open	= ppp_asynctty_open,
 	.close	= ppp_asynctty_close,
@@ -389,7 +390,7 @@ ppp_async_init(void)
 {
 	int err;
 
-	err = tty_register_ldisc(N_PPP, &ppp_ldisc);
+	err = tty_register_ldisc(&ppp_ldisc);
 	if (err != 0)
 		printk(KERN_ERR "PPP_async: error %d registering line disc.\n",
 		       err);
diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c
index c82c4e6df4cb..0942d3ee48e0 100644
--- a/drivers/net/ppp/ppp_synctty.c
+++ b/drivers/net/ppp/ppp_synctty.c
@@ -365,6 +365,7 @@ ppp_sync_wakeup(struct tty_struct *tty)
 
 static struct tty_ldisc_ops ppp_sync_ldisc = {
 	.owner	= THIS_MODULE,
+	.num	= N_SYNC_PPP,
 	.name	= "pppsync",
 	.open	= ppp_sync_open,
 	.close	= ppp_sync_close,
@@ -382,7 +383,7 @@ ppp_sync_init(void)
 {
 	int err;
 
-	err = tty_register_ldisc(N_SYNC_PPP, &ppp_sync_ldisc);
+	err = tty_register_ldisc(&ppp_sync_ldisc);
 	if (err != 0)
 		printk(KERN_ERR "PPP_sync: error %d registering line disc.\n",
 		       err);
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 4dda49e61745..938ac0ec0305 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -1263,6 +1263,7 @@ static int sl_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 
 static struct tty_ldisc_ops sl_ldisc = {
 	.owner 		= THIS_MODULE,
+	.num		= N_SLIP,
 	.name 		= "slip",
 	.open 		= slip_open,
 	.close	 	= slip_close,
@@ -1298,7 +1299,7 @@ static int __init slip_init(void)
 		return -ENOMEM;
 
 	/* Fill in our line protocol discipline, and register it */
-	status = tty_register_ldisc(N_SLIP, &sl_ldisc);
+	status = tty_register_ldisc(&sl_ldisc);
 	if (status != 0) {
 		printk(KERN_ERR "SLIP: can't register line discipline (err = %d)\n", status);
 		kfree(slip_devs);
diff --git a/drivers/pps/clients/pps-ldisc.c b/drivers/pps/clients/pps-ldisc.c
index bf26cc56b863..2a88e678d0b8 100644
--- a/drivers/pps/clients/pps-ldisc.c
+++ b/drivers/pps/clients/pps-ldisc.c
@@ -112,12 +112,13 @@ static int __init pps_tty_init(void)
 
 	/* Init PPS_TTY data */
 	pps_ldisc_ops.owner = THIS_MODULE;
+	pps_ldisc_ops.num = N_PPS;
 	pps_ldisc_ops.name = "pps_tty";
 	pps_ldisc_ops.dcd_change = pps_tty_dcd_change;
 	pps_ldisc_ops.open = pps_tty_open;
 	pps_ldisc_ops.close = pps_tty_close;
 
-	err = tty_register_ldisc(N_PPS, &pps_ldisc_ops);
+	err = tty_register_ldisc(&pps_ldisc_ops);
 	if (err)
 		pr_err("can't register PPS line discipline\n");
 	else
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 477403ecc445..654e439ff6c8 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2857,6 +2857,7 @@ static int gsm_create_network(struct gsm_dlci *dlci, struct gsm_netconfig *nc)
 /* Line discipline for real tty */
 static struct tty_ldisc_ops tty_ldisc_packet = {
 	.owner		 = THIS_MODULE,
+	.num		 = N_GSM0710,
 	.name            = "n_gsm",
 	.open            = gsmld_open,
 	.close           = gsmld_close,
@@ -3242,7 +3243,7 @@ static const struct tty_operations gsmtty_ops = {
 static int __init gsm_init(void)
 {
 	/* Fill in our line protocol discipline, and register it */
-	int status = tty_register_ldisc(N_GSM0710, &tty_ldisc_packet);
+	int status = tty_register_ldisc(&tty_ldisc_packet);
 	if (status != 0) {
 		pr_err("n_gsm: can't register line discipline (err = %d)\n",
 								status);
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index 62b1d1a6e0f1..2256039911f5 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -788,6 +788,7 @@ static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *buf_list)
 
 static struct tty_ldisc_ops n_hdlc_ldisc = {
 	.owner		= THIS_MODULE,
+	.num		= N_HDLC,
 	.name		= "hdlc",
 	.open		= n_hdlc_tty_open,
 	.close		= n_hdlc_tty_close,
@@ -807,7 +808,7 @@ static int __init n_hdlc_init(void)
 	/* range check maxframe arg */
 	maxframe = clamp(maxframe, 4096, MAX_HDLC_FRAME_SIZE);
 
-	status = tty_register_ldisc(N_HDLC, &n_hdlc_ldisc);
+	status = tty_register_ldisc(&n_hdlc_ldisc);
 	if (!status)
 		pr_info("N_HDLC line discipline registered with maxframe=%d\n",
 				maxframe);
diff --git a/drivers/tty/n_null.c b/drivers/tty/n_null.c
index 2ff373d2f98d..ee229c812dce 100644
--- a/drivers/tty/n_null.c
+++ b/drivers/tty/n_null.c
@@ -40,6 +40,7 @@ static void n_null_receivebuf(struct tty_struct *tty,
 
 static struct tty_ldisc_ops null_ldisc = {
 	.owner		=	THIS_MODULE,
+	.num		=	N_NULL,
 	.name		=	"n_null",
 	.open		=	n_null_open,
 	.close		=	n_null_close,
@@ -50,7 +51,7 @@ static struct tty_ldisc_ops null_ldisc = {
 
 static int __init n_null_init(void)
 {
-	BUG_ON(tty_register_ldisc(N_NULL, &null_ldisc));
+	BUG_ON(tty_register_ldisc(&null_ldisc));
 	return 0;
 }
 
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 8ce712eec026..2fe27905c398 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -2424,6 +2424,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
 
 static struct tty_ldisc_ops n_tty_ops = {
 	.owner		 = THIS_MODULE,
+	.num		 = N_TTY,
 	.name            = "n_tty",
 	.open            = n_tty_open,
 	.close           = n_tty_close,
@@ -2455,5 +2456,5 @@ EXPORT_SYMBOL_GPL(n_tty_inherit_ops);
 
 void __init n_tty_init(void)
 {
-	tty_register_ldisc(N_TTY, &n_tty_ops);
+	tty_register_ldisc(&n_tty_ops);
 }
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index 03f414172f34..9aff04bee4cd 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -59,17 +59,16 @@ static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS];
  *		takes tty_ldiscs_lock to guard against ldisc races
  */
 
-int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc)
+int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc)
 {
 	unsigned long flags;
 	int ret = 0;
 
-	if (disc < N_TTY || disc >= NR_LDISCS)
+	if (new_ldisc->num < N_TTY || new_ldisc->num >= NR_LDISCS)
 		return -EINVAL;
 
 	raw_spin_lock_irqsave(&tty_ldiscs_lock, flags);
-	tty_ldiscs[disc] = new_ldisc;
-	new_ldisc->num = disc;
+	tty_ldiscs[new_ldisc->num] = new_ldisc;
 	new_ldisc->refcount = 0;
 	raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags);
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 283ac5f29052..95c632299fb4 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -636,7 +636,7 @@ static inline int tty_port_users(struct tty_port *port)
 	return port->count + port->blocked_open;
 }
 
-extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
+extern int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
 extern int tty_set_ldisc(struct tty_struct *tty, int disc);
 extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 98102ef64004..648c26b4bad8 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -442,6 +442,7 @@ EXPORT_SYMBOL_GPL(nci_uart_set_config);
 
 static struct tty_ldisc_ops nci_uart_ldisc = {
 	.owner		= THIS_MODULE,
+	.num		= N_NCI,
 	.name		= "n_nci",
 	.open		= nci_uart_tty_open,
 	.close		= nci_uart_tty_close,
@@ -456,7 +457,7 @@ static struct tty_ldisc_ops nci_uart_ldisc = {
 
 static int __init nci_uart_init(void)
 {
-	return tty_register_ldisc(N_NCI, &nci_uart_ldisc);
+	return tty_register_ldisc(&nci_uart_ldisc);
 }
 
 static void __exit nci_uart_exit(void)
diff --git a/sound/soc/ti/ams-delta.c b/sound/soc/ti/ams-delta.c
index 55a6736a378e..fcbb791bf27a 100644
--- a/sound/soc/ti/ams-delta.c
+++ b/sound/soc/ti/ams-delta.c
@@ -396,6 +396,7 @@ static void cx81801_wakeup(struct tty_struct *tty)
 
 static struct tty_ldisc_ops cx81801_ops = {
 	.name = "cx81801",
+	.num = N_V253,
 	.owner = THIS_MODULE,
 	.open = cx81801_open,
 	.close = cx81801_close,
@@ -503,7 +504,7 @@ static int ams_delta_cx20442_init(struct snd_soc_pcm_runtime *rtd)
 	}
 
 	/* Register optional line discipline for over the modem control */
-	ret = tty_register_ldisc(N_V253, &cx81801_ops);
+	ret = tty_register_ldisc(&cx81801_ops);
 	if (ret) {
 		dev_warn(card->dev,
 				"Failed to register line discipline, "
-- 
cgit v1.2.3


From f81ee8b8b8421dc06d13f197bb53191559cc51da Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:09 +0200
Subject: tty: make tty_ldisc_ops a param in tty_unregister_ldisc

Make tty_unregister_ldisc symmetric to tty_register_ldisc by accepting
struct tty_ldisc_ops as a parameter instead of ldisc number. This avoids
checking of the ldisc number bounds in tty_unregister_ldisc.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: William Hubbs <w.d.hubbs@gmail.com>
Cc: Chris Brannon <chris@the-brannons.com>
Cc: Kirk Reiser <kirk@reisers.ca>
Cc: Samuel Thibault <samuel.thibault@ens-lyon.org>
Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Marc Kleine-Budde <mkl@pengutronix.de>
Cc: Andreas Koensgen <ajk@comnets.uni-bremen.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rodolfo Giometti <giometti@enneenne.com>
Cc: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Cc: Liam Girdwood <lgirdwood@gmail.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Takashi Iwai <tiwai@suse.com>
Link: https://lore.kernel.org/r/20210505091928.22010-17-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/accessibility/speakup/spk_ttyio.c | 2 +-
 drivers/bluetooth/hci_ldisc.c             | 2 +-
 drivers/input/serio/serport.c             | 2 +-
 drivers/misc/ti-st/st_core.c              | 6 +++---
 drivers/net/caif/caif_serial.c            | 2 +-
 drivers/net/can/slcan.c                   | 2 +-
 drivers/net/hamradio/6pack.c              | 2 +-
 drivers/net/hamradio/mkiss.c              | 2 +-
 drivers/net/ppp/ppp_async.c               | 2 +-
 drivers/net/ppp/ppp_synctty.c             | 2 +-
 drivers/net/slip/slip.c                   | 2 +-
 drivers/pps/clients/pps-ldisc.c           | 2 +-
 drivers/tty/n_gsm.c                       | 4 ++--
 drivers/tty/n_hdlc.c                      | 3 +--
 drivers/tty/n_null.c                      | 2 +-
 drivers/tty/tty_ldisc.c                   | 9 +++------
 include/linux/tty.h                       | 2 +-
 net/nfc/nci/uart.c                        | 2 +-
 sound/soc/ti/ams-delta.c                  | 2 +-
 19 files changed, 24 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/accessibility/speakup/spk_ttyio.c b/drivers/accessibility/speakup/spk_ttyio.c
index 8faa27bae6bf..0b7f1a87eaeb 100644
--- a/drivers/accessibility/speakup/spk_ttyio.c
+++ b/drivers/accessibility/speakup/spk_ttyio.c
@@ -219,7 +219,7 @@ void spk_ttyio_register_ldisc(void)
 
 void spk_ttyio_unregister_ldisc(void)
 {
-	if (tty_unregister_ldisc(N_SPEAKUP))
+	if (tty_unregister_ldisc(&spk_ttyio_ldisc_ops))
 		pr_warn("speakup: Couldn't unregister ldisc\n");
 }
 
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index ee32006e8fc9..31e0c804363c 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -917,7 +917,7 @@ static void __exit hci_uart_exit(void)
 #endif
 
 	/* Release tty registration of line discipline */
-	err = tty_unregister_ldisc(N_HCI);
+	err = tty_unregister_ldisc(&hci_uart_ldisc);
 	if (err)
 		BT_ERR("Can't unregister HCI line discipline (%d)", err);
 }
diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index 870b1d2606fc..7fbbe00e3553 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -304,7 +304,7 @@ static int __init serport_init(void)
 
 static void __exit serport_exit(void)
 {
-	tty_unregister_ldisc(N_MOUSE);
+	tty_unregister_ldisc(&serport_ldisc);
 }
 
 module_init(serport_init);
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index a4f5d02940c7..174ae8e52805 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -872,7 +872,7 @@ int st_core_init(struct st_data_s **core_data)
 	st_gdata = kzalloc(sizeof(struct st_data_s), GFP_KERNEL);
 	if (!st_gdata) {
 		pr_err("memory allocation failed");
-		err = tty_unregister_ldisc(N_TI_WL);
+		err = tty_unregister_ldisc(&st_ldisc_ops);
 		if (err)
 			pr_err("unable to un-register ldisc %ld", err);
 		err = -ENOMEM;
@@ -892,7 +892,7 @@ int st_core_init(struct st_data_s **core_data)
 	if (err) {
 		pr_err("error during st_ll initialization(%ld)", err);
 		kfree(st_gdata);
-		err = tty_unregister_ldisc(N_TI_WL);
+		err = tty_unregister_ldisc(&st_ldisc_ops);
 		if (err)
 			pr_err("unable to un-register ldisc");
 		return err;
@@ -919,7 +919,7 @@ void st_core_exit(struct st_data_s *st_gdata)
 		kfree_skb(st_gdata->rx_skb);
 		kfree_skb(st_gdata->tx_skb);
 		/* TTY ldisc cleanup */
-		err = tty_unregister_ldisc(N_TI_WL);
+		err = tty_unregister_ldisc(&st_ldisc_ops);
 		if (err)
 			pr_err("unable to un-register ldisc %ld", err);
 		/* free the global data pointer */
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index 2407a0f6656d..d0c24dba4a86 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -447,7 +447,7 @@ static void __exit caif_ser_exit(void)
 	spin_unlock(&ser_lock);
 	ser_release(NULL);
 	cancel_work_sync(&ser_release_work);
-	tty_unregister_ldisc(N_CAIF);
+	tty_unregister_ldisc(&caif_ldisc);
 	debugfs_remove_recursive(debugfsdir);
 }
 
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index e3f528c82242..03a2dbd3c367 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -784,7 +784,7 @@ static void __exit slcan_exit(void)
 	kfree(slcan_devs);
 	slcan_devs = NULL;
 
-	i = tty_unregister_ldisc(N_SLCAN);
+	i = tty_unregister_ldisc(&slc_ldisc);
 	if (i)
 		printk(KERN_ERR "slcan: can't unregister ldisc (err %d)\n", i);
 }
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 4db1d3c4d771..aac24f9caceb 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -781,7 +781,7 @@ static void __exit sixpack_exit_driver(void)
 {
 	int ret;
 
-	if ((ret = tty_unregister_ldisc(N_6PACK)))
+	if ((ret = tty_unregister_ldisc(&sp_ldisc)))
 		printk(msg_unregfail, ret);
 }
 
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index 1eb87a5a9394..750c6afc9302 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -967,7 +967,7 @@ static void __exit mkiss_exit_driver(void)
 {
 	int ret;
 
-	if ((ret = tty_unregister_ldisc(N_AX25)))
+	if ((ret = tty_unregister_ldisc(&ax_ldisc)))
 		printk(msg_unregfail, ret);
 }
 
diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c
index 4bfb66c40c86..b2d454d01eba 100644
--- a/drivers/net/ppp/ppp_async.c
+++ b/drivers/net/ppp/ppp_async.c
@@ -1016,7 +1016,7 @@ static void async_lcp_peek(struct asyncppp *ap, unsigned char *data,
 
 static void __exit ppp_async_cleanup(void)
 {
-	if (tty_unregister_ldisc(N_PPP) != 0)
+	if (tty_unregister_ldisc(&ppp_ldisc) != 0)
 		printk(KERN_ERR "failed to unregister PPP line discipline\n");
 }
 
diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c
index 0942d3ee48e0..ca403bde56fb 100644
--- a/drivers/net/ppp/ppp_synctty.c
+++ b/drivers/net/ppp/ppp_synctty.c
@@ -727,7 +727,7 @@ err:
 static void __exit
 ppp_sync_cleanup(void)
 {
-	if (tty_unregister_ldisc(N_SYNC_PPP) != 0)
+	if (tty_unregister_ldisc(&ppp_sync_ldisc) != 0)
 		printk(KERN_ERR "failed to unregister Sync PPP line discipline\n");
 }
 
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 938ac0ec0305..867efff40a0e 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -1360,7 +1360,7 @@ static void __exit slip_exit(void)
 	kfree(slip_devs);
 	slip_devs = NULL;
 
-	i = tty_unregister_ldisc(N_SLIP);
+	i = tty_unregister_ldisc(&sl_ldisc);
 	if (i != 0)
 		printk(KERN_ERR "SLIP: can't unregister line discipline (err = %d)\n", i);
 }
diff --git a/drivers/pps/clients/pps-ldisc.c b/drivers/pps/clients/pps-ldisc.c
index 2a88e678d0b8..91b947f37774 100644
--- a/drivers/pps/clients/pps-ldisc.c
+++ b/drivers/pps/clients/pps-ldisc.c
@@ -131,7 +131,7 @@ static void __exit pps_tty_cleanup(void)
 {
 	int err;
 
-	err = tty_unregister_ldisc(N_PPS);
+	err = tty_unregister_ldisc(&pps_ldisc_ops);
 	if (err)
 		pr_err("can't unregister PPS line discipline\n");
 	else
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index cce2ef04caeb..db7d4a30af91 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -3280,13 +3280,13 @@ static int __init gsm_init(void)
 err_put_driver:
 	put_tty_driver(gsm_tty_driver);
 err_unreg_ldisc:
-	tty_unregister_ldisc(N_GSM0710);
+	tty_unregister_ldisc(&tty_ldisc_packet);
 	return status;
 }
 
 static void __exit gsm_exit(void)
 {
-	int status = tty_unregister_ldisc(N_GSM0710);
+	int status = tty_unregister_ldisc(&tty_ldisc_packet);
 	if (status != 0)
 		pr_err("n_gsm: can't unregister line discipline (err = %d)\n",
 								status);
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index 2256039911f5..c2afbfe0a1d5 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -822,8 +822,7 @@ static int __init n_hdlc_init(void)
 
 static void __exit n_hdlc_exit(void)
 {
-	/* Release tty registration of line discipline */
-	int status = tty_unregister_ldisc(N_HDLC);
+	int status = tty_unregister_ldisc(&n_hdlc_ldisc);
 
 	if (status)
 		pr_err("N_HDLC: can't unregister line discipline (err = %d)\n",
diff --git a/drivers/tty/n_null.c b/drivers/tty/n_null.c
index ee229c812dce..f913b665af72 100644
--- a/drivers/tty/n_null.c
+++ b/drivers/tty/n_null.c
@@ -57,7 +57,7 @@ static int __init n_null_init(void)
 
 static void __exit n_null_exit(void)
 {
-	tty_unregister_ldisc(N_NULL);
+	tty_unregister_ldisc(&null_ldisc);
 }
 
 module_init(n_null_init);
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index 9aff04bee4cd..d02deeb5e584 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -87,19 +87,16 @@ EXPORT_SYMBOL(tty_register_ldisc);
  *		takes tty_ldiscs_lock to guard against ldisc races
  */
 
-int tty_unregister_ldisc(int disc)
+int tty_unregister_ldisc(struct tty_ldisc_ops *ldisc)
 {
 	unsigned long flags;
 	int ret = 0;
 
-	if (disc < N_TTY || disc >= NR_LDISCS)
-		return -EINVAL;
-
 	raw_spin_lock_irqsave(&tty_ldiscs_lock, flags);
-	if (tty_ldiscs[disc]->refcount)
+	if (tty_ldiscs[ldisc->num]->refcount)
 		ret = -EBUSY;
 	else
-		tty_ldiscs[disc] = NULL;
+		tty_ldiscs[ldisc->num] = NULL;
 	raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags);
 
 	return ret;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 95c632299fb4..6a72d0ff6391 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -637,7 +637,7 @@ static inline int tty_port_users(struct tty_port *port)
 }
 
 extern int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc);
-extern int tty_unregister_ldisc(int disc);
+extern int tty_unregister_ldisc(struct tty_ldisc_ops *ldisc);
 extern int tty_set_ldisc(struct tty_struct *tty, int disc);
 extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 				 const char *f, int count);
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 648c26b4bad8..502e7a3f8948 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -462,7 +462,7 @@ static int __init nci_uart_init(void)
 
 static void __exit nci_uart_exit(void)
 {
-	tty_unregister_ldisc(N_NCI);
+	tty_unregister_ldisc(&nci_uart_ldisc);
 }
 
 module_init(nci_uart_init);
diff --git a/sound/soc/ti/ams-delta.c b/sound/soc/ti/ams-delta.c
index fcbb791bf27a..e89548e48364 100644
--- a/sound/soc/ti/ams-delta.c
+++ b/sound/soc/ti/ams-delta.c
@@ -583,7 +583,7 @@ static int ams_delta_remove(struct platform_device *pdev)
 {
 	struct snd_soc_card *card = platform_get_drvdata(pdev);
 
-	if (tty_unregister_ldisc(N_V253) != 0)
+	if (tty_unregister_ldisc(&cx81801_ops) != 0)
 		dev_warn(&pdev->dev,
 			"failed to unregister V253 line discipline\n");
 
-- 
cgit v1.2.3


From 19475209331168cdb8070a011650535f1c54a730 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:10 +0200
Subject: tty: drop tty_ldisc_ops::refcount

The refcount is checked only in tty_unregister_ldisc and EBUSY returned
if it is nonzero. But none of the tty_unregister_ldisc callers act
anyhow if this (or any other) error is returned. So remove
tty_ldisc_ops::refcount completely and make tty_unregister_ldisc return
'void' in the next patches. That means we assume tty_unregister_ldisc is
not called while the ldisc might be in use. That relies on
try_module_get in get_ldops and module_put in put_ldops.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210505091928.22010-18-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_tty.c       |  2 +-
 drivers/tty/tty_ldisc.c   | 14 +++-----------
 include/linux/tty_ldisc.h |  2 --
 3 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 2fe27905c398..0ec93f1a61f5 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -2450,7 +2450,7 @@ void n_tty_inherit_ops(struct tty_ldisc_ops *ops)
 {
 	*ops = n_tty_ops;
 	ops->owner = NULL;
-	ops->refcount = ops->flags = 0;
+	ops->flags = 0;
 }
 EXPORT_SYMBOL_GPL(n_tty_inherit_ops);
 
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index d02deeb5e584..98e8316fd28a 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -69,7 +69,6 @@ int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc)
 
 	raw_spin_lock_irqsave(&tty_ldiscs_lock, flags);
 	tty_ldiscs[new_ldisc->num] = new_ldisc;
-	new_ldisc->refcount = 0;
 	raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags);
 
 	return ret;
@@ -90,16 +89,12 @@ EXPORT_SYMBOL(tty_register_ldisc);
 int tty_unregister_ldisc(struct tty_ldisc_ops *ldisc)
 {
 	unsigned long flags;
-	int ret = 0;
 
 	raw_spin_lock_irqsave(&tty_ldiscs_lock, flags);
-	if (tty_ldiscs[ldisc->num]->refcount)
-		ret = -EBUSY;
-	else
-		tty_ldiscs[ldisc->num] = NULL;
+	tty_ldiscs[ldisc->num] = NULL;
 	raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags);
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(tty_unregister_ldisc);
 
@@ -113,10 +108,8 @@ static struct tty_ldisc_ops *get_ldops(int disc)
 	ldops = tty_ldiscs[disc];
 	if (ldops) {
 		ret = ERR_PTR(-EAGAIN);
-		if (try_module_get(ldops->owner)) {
-			ldops->refcount++;
+		if (try_module_get(ldops->owner))
 			ret = ldops;
-		}
 	}
 	raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags);
 	return ret;
@@ -127,7 +120,6 @@ static void put_ldops(struct tty_ldisc_ops *ldops)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&tty_ldiscs_lock, flags);
-	ldops->refcount--;
 	module_put(ldops->owner);
 	raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags);
 }
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index c20ca6a75b4c..fbe9de278629 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -208,8 +208,6 @@ struct tty_ldisc_ops {
 				const char *fp, int count);
 
 	struct  module *owner;
-
-	int refcount;
 };
 
 struct tty_ldisc {
-- 
cgit v1.2.3


From f6f19595a7efdaa0c196d7fa2b343b5588f94470 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:12 +0200
Subject: tty: return void from tty_unregister_ldisc

Now that noone checks the return value of tty_unregister_ldisc, make the
function return 'void'.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210505091928.22010-20-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_ldisc.c | 4 +---
 include/linux/tty.h     | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index 98e8316fd28a..8edd73ab9148 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -86,15 +86,13 @@ EXPORT_SYMBOL(tty_register_ldisc);
  *		takes tty_ldiscs_lock to guard against ldisc races
  */
 
-int tty_unregister_ldisc(struct tty_ldisc_ops *ldisc)
+void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc)
 {
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&tty_ldiscs_lock, flags);
 	tty_ldiscs[ldisc->num] = NULL;
 	raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags);
-
-	return 0;
 }
 EXPORT_SYMBOL(tty_unregister_ldisc);
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 6a72d0ff6391..e18a4f1ac39d 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -637,7 +637,7 @@ static inline int tty_port_users(struct tty_port *port)
 }
 
 extern int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc);
-extern int tty_unregister_ldisc(struct tty_ldisc_ops *ldisc);
+extern void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc);
 extern int tty_set_ldisc(struct tty_struct *tty, int disc);
 extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 				 const char *f, int count);
-- 
cgit v1.2.3


From 03b3b1a2405ccd71570cd5ec1fe4abd7bb4891cb Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:15 +0200
Subject: tty: make tty_operations::write_room return uint

Line disciplines expect a positive value or zero returned from
tty->ops->write_room (invoked by tty_write_room). So make this
assumption explicit by using unsigned int as a return value. Both of
tty->ops->write_room and tty_write_room.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Acked-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Acked-by: Alex Elder <elder@linaro.org>
Acked-by: Max Filippov <jcmvbkbc@gmail.com> # xtensa
Acked-by: David Sterba <dsterba@suse.com>
Acked-By: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Chris Zankel <chris@zankel.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Cc: Jens Taprogge <jens.taprogge@taprogge.org>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: Scott Branden <scott.branden@broadcom.com>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: David Lin <dtwlin@gmail.com>
Cc: Johan Hovold <johan@kernel.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Sascha Hauer <s.hauer@pengutronix.de>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Felipe Balbi <balbi@kernel.org>
Cc: Mathias Nyman <mathias.nyman@intel.com>
Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Link: https://lore.kernel.org/r/20210505091928.22010-23-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/alpha/kernel/srmcons.c            | 2 +-
 arch/m68k/emu/nfcon.c                  | 2 +-
 arch/parisc/kernel/pdc_cons.c          | 2 +-
 arch/um/drivers/line.c                 | 6 +++---
 arch/um/drivers/line.h                 | 2 +-
 arch/xtensa/platforms/iss/console.c    | 2 +-
 drivers/char/pcmcia/synclink_cs.c      | 2 +-
 drivers/char/ttyprintk.c               | 2 +-
 drivers/ipack/devices/ipoctal.c        | 2 +-
 drivers/isdn/capi/capi.c               | 6 +++---
 drivers/misc/bcm-vk/bcm_vk_tty.c       | 2 +-
 drivers/mmc/core/sdio_uart.c           | 2 +-
 drivers/net/usb/hso.c                  | 4 ++--
 drivers/s390/char/con3215.c            | 2 +-
 drivers/s390/char/sclp_tty.c           | 4 ++--
 drivers/s390/char/sclp_vt220.c         | 4 ++--
 drivers/s390/char/tty3270.c            | 2 +-
 drivers/staging/fwserial/fwserial.c    | 6 +++---
 drivers/staging/gdm724x/gdm_tty.c      | 2 +-
 drivers/staging/greybus/uart.c         | 2 +-
 drivers/tty/amiserial.c                | 2 +-
 drivers/tty/ehv_bytechan.c             | 4 ++--
 drivers/tty/goldfish.c                 | 2 +-
 drivers/tty/hvc/hvc_console.c          | 2 +-
 drivers/tty/hvc/hvcs.c                 | 2 +-
 drivers/tty/hvc/hvsi.c                 | 4 ++--
 drivers/tty/ipwireless/tty.c           | 2 +-
 drivers/tty/mips_ejtag_fdc.c           | 4 ++--
 drivers/tty/moxa.c                     | 8 ++++----
 drivers/tty/mxser.c                    | 2 +-
 drivers/tty/n_gsm.c                    | 2 +-
 drivers/tty/nozomi.c                   | 4 ++--
 drivers/tty/pty.c                      | 2 +-
 drivers/tty/serial/kgdb_nmi.c          | 2 +-
 drivers/tty/serial/serial_core.c       | 4 ++--
 drivers/tty/synclink_gt.c              | 6 +++---
 drivers/tty/tty_ioctl.c                | 2 +-
 drivers/tty/ttynull.c                  | 2 +-
 drivers/tty/vcc.c                      | 4 ++--
 drivers/tty/vt/vt.c                    | 2 +-
 drivers/usb/class/cdc-acm.c            | 2 +-
 drivers/usb/gadget/function/u_serial.c | 6 +++---
 drivers/usb/host/xhci-dbgtty.c         | 4 ++--
 drivers/usb/serial/usb-serial.c        | 2 +-
 include/linux/tty.h                    | 2 +-
 include/linux/tty_driver.h             | 4 ++--
 net/bluetooth/rfcomm/tty.c             | 2 +-
 47 files changed, 71 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c
index 438b10c44d73..2110b7e7f988 100644
--- a/arch/alpha/kernel/srmcons.c
+++ b/arch/alpha/kernel/srmcons.c
@@ -142,7 +142,7 @@ srmcons_write(struct tty_struct *tty,
 	return count;
 }
 
-static int
+static unsigned int
 srmcons_write_room(struct tty_struct *tty)
 {
 	return 512;
diff --git a/arch/m68k/emu/nfcon.c b/arch/m68k/emu/nfcon.c
index 57e8c8fb5eba..92636c89d65b 100644
--- a/arch/m68k/emu/nfcon.c
+++ b/arch/m68k/emu/nfcon.c
@@ -85,7 +85,7 @@ static int nfcon_tty_put_char(struct tty_struct *tty, unsigned char ch)
 	return 1;
 }
 
-static int nfcon_tty_write_room(struct tty_struct *tty)
+static unsigned int nfcon_tty_write_room(struct tty_struct *tty)
 {
 	return 64;
 }
diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c
index 7ed404c60a9e..fe2ed0bbd07e 100644
--- a/arch/parisc/kernel/pdc_cons.c
+++ b/arch/parisc/kernel/pdc_cons.c
@@ -103,7 +103,7 @@ static int pdc_console_tty_write(struct tty_struct *tty, const unsigned char *bu
 	return count;
 }
 
-static int pdc_console_tty_write_room(struct tty_struct *tty)
+static unsigned int pdc_console_tty_write_room(struct tty_struct *tty)
 {
 	return 32768; /* no limit, no buffer used */
 }
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 1c70a31e7c5b..2b8810ba5470 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -32,7 +32,7 @@ static irqreturn_t line_interrupt(int irq, void *data)
  *
  * Should be called while holding line->lock (this does not modify data).
  */
-static int write_room(struct line *line)
+static unsigned int write_room(struct line *line)
 {
 	int n;
 
@@ -47,11 +47,11 @@ static int write_room(struct line *line)
 	return n - 1;
 }
 
-int line_write_room(struct tty_struct *tty)
+unsigned int line_write_room(struct tty_struct *tty)
 {
 	struct line *line = tty->driver_data;
 	unsigned long flags;
-	int room;
+	unsigned int room;
 
 	spin_lock_irqsave(&line->lock, flags);
 	room = write_room(line);
diff --git a/arch/um/drivers/line.h b/arch/um/drivers/line.h
index 01d21e76144f..861edf329569 100644
--- a/arch/um/drivers/line.h
+++ b/arch/um/drivers/line.h
@@ -70,7 +70,7 @@ extern void line_set_termios(struct tty_struct *tty, struct ktermios * old);
 extern int line_chars_in_buffer(struct tty_struct *tty);
 extern void line_flush_buffer(struct tty_struct *tty);
 extern void line_flush_chars(struct tty_struct *tty);
-extern int line_write_room(struct tty_struct *tty);
+extern unsigned int line_write_room(struct tty_struct *tty);
 extern void line_throttle(struct tty_struct *tty);
 extern void line_unthrottle(struct tty_struct *tty);
 
diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c
index a3dda25a4e45..98ac3a7fdb0a 100644
--- a/arch/xtensa/platforms/iss/console.c
+++ b/arch/xtensa/platforms/iss/console.c
@@ -100,7 +100,7 @@ static void rs_flush_chars(struct tty_struct *tty)
 {
 }
 
-static int rs_write_room(struct tty_struct *tty)
+static unsigned int rs_write_room(struct tty_struct *tty)
 {
 	/* Let's say iss can always accept 2K characters.. */
 	return 2 * 1024;
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index b4707bc3aee8..e4b2c68f44f5 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -1609,7 +1609,7 @@ cleanup:
 
 /* Return the count of free bytes in transmit buffer
  */
-static int mgslpc_write_room(struct tty_struct *tty)
+static unsigned int mgslpc_write_room(struct tty_struct *tty)
 {
 	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
 	int ret;
diff --git a/drivers/char/ttyprintk.c b/drivers/char/ttyprintk.c
index 93f5d11c830b..e93b0af92339 100644
--- a/drivers/char/ttyprintk.c
+++ b/drivers/char/ttyprintk.c
@@ -132,7 +132,7 @@ static int tpk_write(struct tty_struct *tty,
 /*
  * TTY operations write_room function.
  */
-static int tpk_write_room(struct tty_struct *tty)
+static unsigned int tpk_write_room(struct tty_struct *tty)
 {
 	return TPK_MAX_ROOM;
 }
diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c
index 3940714e4397..ea0f1aeaaa06 100644
--- a/drivers/ipack/devices/ipoctal.c
+++ b/drivers/ipack/devices/ipoctal.c
@@ -460,7 +460,7 @@ static int ipoctal_write_tty(struct tty_struct *tty,
 	return char_copied;
 }
 
-static int ipoctal_write_room(struct tty_struct *tty)
+static unsigned int ipoctal_write_room(struct tty_struct *tty)
 {
 	struct ipoctal_channel *channel = tty->driver_data;
 
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index fdf87acccd06..c50c454006b3 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -1175,14 +1175,14 @@ static void capinc_tty_flush_chars(struct tty_struct *tty)
 	handle_minor_recv(mp);
 }
 
-static int capinc_tty_write_room(struct tty_struct *tty)
+static unsigned int capinc_tty_write_room(struct tty_struct *tty)
 {
 	struct capiminor *mp = tty->driver_data;
-	int room;
+	unsigned int room;
 
 	room = CAPINC_MAX_SENDQUEUE-skb_queue_len(&mp->outqueue);
 	room *= CAPI_MAX_BLKSIZE;
-	pr_debug("capinc_tty_write_room = %d\n", room);
+	pr_debug("capinc_tty_write_room = %u\n", room);
 	return room;
 }
 
diff --git a/drivers/misc/bcm-vk/bcm_vk_tty.c b/drivers/misc/bcm-vk/bcm_vk_tty.c
index 4d02692ecfc7..dae9eeed84a2 100644
--- a/drivers/misc/bcm-vk/bcm_vk_tty.c
+++ b/drivers/misc/bcm-vk/bcm_vk_tty.c
@@ -214,7 +214,7 @@ static int bcm_vk_tty_write(struct tty_struct *tty,
 	return count;
 }
 
-static int bcm_vk_tty_write_room(struct tty_struct *tty)
+static unsigned int bcm_vk_tty_write_room(struct tty_struct *tty)
 {
 	struct bcm_vk *vk = dev_get_drvdata(tty->dev);
 
diff --git a/drivers/mmc/core/sdio_uart.c b/drivers/mmc/core/sdio_uart.c
index dbcac2b7f2fe..c8f4eca7aad4 100644
--- a/drivers/mmc/core/sdio_uart.c
+++ b/drivers/mmc/core/sdio_uart.c
@@ -797,7 +797,7 @@ static int sdio_uart_write(struct tty_struct *tty, const unsigned char *buf,
 	return ret;
 }
 
-static int sdio_uart_write_room(struct tty_struct *tty)
+static unsigned int sdio_uart_write_room(struct tty_struct *tty)
 {
 	struct sdio_uart_port *port = tty->driver_data;
 	return FIFO_SIZE - kfifo_len(&port->xmit_fifo);
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c
index 3ef4b2841402..bb8bb85308ab 100644
--- a/drivers/net/usb/hso.c
+++ b/drivers/net/usb/hso.c
@@ -1357,10 +1357,10 @@ out:
 }
 
 /* how much room is there for writing */
-static int hso_serial_write_room(struct tty_struct *tty)
+static unsigned int hso_serial_write_room(struct tty_struct *tty)
 {
 	struct hso_serial *serial = tty->driver_data;
-	int room;
+	unsigned int room;
 	unsigned long flags;
 
 	spin_lock_irqsave(&serial->serial_lock, flags);
diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index 1fd5bca9fa20..c9fd4a05931a 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -924,7 +924,7 @@ static void tty3215_close(struct tty_struct *tty, struct file * filp)
 /*
  * Returns the amount of free space in the output buffer.
  */
-static int tty3215_write_room(struct tty_struct *tty)
+static unsigned int tty3215_write_room(struct tty_struct *tty)
 {
 	struct raw3215_info *raw = tty->driver_data;
 
diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c
index 4456ceb23bd2..ea1e43fd16bc 100644
--- a/drivers/s390/char/sclp_tty.c
+++ b/drivers/s390/char/sclp_tty.c
@@ -86,12 +86,12 @@ sclp_tty_close(struct tty_struct *tty, struct file *filp)
  * a string of newlines. Every newline creates a new message which
  * needs 82 bytes.
  */
-static int
+static unsigned int
 sclp_tty_write_room (struct tty_struct *tty)
 {
 	unsigned long flags;
 	struct list_head *l;
-	int count;
+	unsigned int count;
 
 	spin_lock_irqsave(&sclp_tty_lock, flags);
 	count = 0;
diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
index 7f4445b0f819..b621adee35f0 100644
--- a/drivers/s390/char/sclp_vt220.c
+++ b/drivers/s390/char/sclp_vt220.c
@@ -609,12 +609,12 @@ sclp_vt220_flush_chars(struct tty_struct *tty)
  * to change as output buffers get emptied, or if the output flow
  * control is acted.
  */
-static int
+static unsigned int
 sclp_vt220_write_room(struct tty_struct *tty)
 {
 	unsigned long flags;
 	struct list_head *l;
-	int count;
+	unsigned int count;
 
 	spin_lock_irqsave(&sclp_vt220_lock, flags);
 	count = 0;
diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c
index 1b68564799fa..82d4c961ed06 100644
--- a/drivers/s390/char/tty3270.c
+++ b/drivers/s390/char/tty3270.c
@@ -1071,7 +1071,7 @@ static void tty3270_cleanup(struct tty_struct *tty)
 /*
  * We always have room.
  */
-static int
+static unsigned int
 tty3270_write_room(struct tty_struct *tty)
 {
 	return INT_MAX;
diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c
index 4245532d2fe0..a151cd76d24e 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -1113,16 +1113,16 @@ static int fwtty_write(struct tty_struct *tty, const unsigned char *buf, int c)
 	return (n < 0) ? 0 : n;
 }
 
-static int fwtty_write_room(struct tty_struct *tty)
+static unsigned int fwtty_write_room(struct tty_struct *tty)
 {
 	struct fwtty_port *port = tty->driver_data;
-	int n;
+	unsigned int n;
 
 	spin_lock_bh(&port->lock);
 	n = dma_fifo_avail(&port->tx_fifo);
 	spin_unlock_bh(&port->lock);
 
-	fwtty_dbg(port, "%d\n", n);
+	fwtty_dbg(port, "%u\n", n);
 
 	return n;
 }
diff --git a/drivers/staging/gdm724x/gdm_tty.c b/drivers/staging/gdm724x/gdm_tty.c
index 0ccc8c24e754..279de2cd9c4a 100644
--- a/drivers/staging/gdm724x/gdm_tty.c
+++ b/drivers/staging/gdm724x/gdm_tty.c
@@ -183,7 +183,7 @@ static int gdm_tty_write(struct tty_struct *tty, const unsigned char *buf,
 	return len;
 }
 
-static int gdm_tty_write_room(struct tty_struct *tty)
+static unsigned int gdm_tty_write_room(struct tty_struct *tty)
 {
 	struct gdm *gdm = tty->driver_data;
 
diff --git a/drivers/staging/greybus/uart.c b/drivers/staging/greybus/uart.c
index b1e63f7798b0..529eccb99b6c 100644
--- a/drivers/staging/greybus/uart.c
+++ b/drivers/staging/greybus/uart.c
@@ -440,7 +440,7 @@ static int gb_tty_write(struct tty_struct *tty, const unsigned char *buf,
 	return count;
 }
 
-static int gb_tty_write_room(struct tty_struct *tty)
+static unsigned int gb_tty_write_room(struct tty_struct *tty)
 {
 	struct gb_tty *gb_tty = tty->driver_data;
 	unsigned long flags;
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index a4b8876091d2..ee1f4d72cd5e 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -827,7 +827,7 @@ static int rs_write(struct tty_struct * tty, const unsigned char *buf, int count
 	return ret;
 }
 
-static int rs_write_room(struct tty_struct *tty)
+static unsigned int rs_write_room(struct tty_struct *tty)
 {
 	struct serial_state *info = tty->driver_data;
 
diff --git a/drivers/tty/ehv_bytechan.c b/drivers/tty/ehv_bytechan.c
index 3c6dd06ec5fb..445e5ff9b36d 100644
--- a/drivers/tty/ehv_bytechan.c
+++ b/drivers/tty/ehv_bytechan.c
@@ -536,11 +536,11 @@ static void ehv_bc_tty_close(struct tty_struct *ttys, struct file *filp)
  * how much write room the driver can guarantee will be sent OR BUFFERED.  This
  * driver MUST honor the return value.
  */
-static int ehv_bc_tty_write_room(struct tty_struct *ttys)
+static unsigned int ehv_bc_tty_write_room(struct tty_struct *ttys)
 {
 	struct ehv_bc_data *bc = ttys->driver_data;
 	unsigned long flags;
-	int count;
+	unsigned int count;
 
 	spin_lock_irqsave(&bc->lock, flags);
 	count = CIRC_SPACE(bc->head, bc->tail, BUF_SIZE);
diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c
index cd23a4b05c8f..e4f9a60dcc18 100644
--- a/drivers/tty/goldfish.c
+++ b/drivers/tty/goldfish.c
@@ -193,7 +193,7 @@ static int goldfish_tty_write(struct tty_struct *tty, const unsigned char *buf,
 	return count;
 }
 
-static int goldfish_tty_write_room(struct tty_struct *tty)
+static unsigned int goldfish_tty_write_room(struct tty_struct *tty)
 {
 	return 0x10000;
 }
diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index cdcc64ea2554..a3725eb69cd3 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -586,7 +586,7 @@ static void hvc_set_winsz(struct work_struct *work)
  * how much write room the driver can guarantee will be sent OR BUFFERED.  This
  * driver MUST honor the return value.
  */
-static int hvc_write_room(struct tty_struct *tty)
+static unsigned int hvc_write_room(struct tty_struct *tty)
 {
 	struct hvc_struct *hp = tty->driver_data;
 
diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c
index 197988c55e0c..f43f2f94d8bd 100644
--- a/drivers/tty/hvc/hvcs.c
+++ b/drivers/tty/hvc/hvcs.c
@@ -1376,7 +1376,7 @@ static int hvcs_write(struct tty_struct *tty,
  * absolutely WILL BUFFER if we can't send it.  This driver MUST honor the
  * return value, hence the reason for hvcs_struct buffering.
  */
-static int hvcs_write_room(struct tty_struct *tty)
+static unsigned int hvcs_write_room(struct tty_struct *tty)
 {
 	struct hvcs_struct *hvcsd = tty->driver_data;
 
diff --git a/drivers/tty/hvc/hvsi.c b/drivers/tty/hvc/hvsi.c
index e8c58f9bd263..0a56f44e6b12 100644
--- a/drivers/tty/hvc/hvsi.c
+++ b/drivers/tty/hvc/hvsi.c
@@ -890,7 +890,7 @@ out:
 	spin_unlock_irqrestore(&hp->lock, flags);
 }
 
-static int hvsi_write_room(struct tty_struct *tty)
+static unsigned int hvsi_write_room(struct tty_struct *tty)
 {
 	struct hvsi_struct *hp = tty->driver_data;
 
@@ -929,7 +929,7 @@ static int hvsi_write(struct tty_struct *tty,
 	 * will see there is no room in outbuf and return.
 	 */
 	while ((count > 0) && (hvsi_write_room(tty) > 0)) {
-		int chunksize = min(count, hvsi_write_room(tty));
+		int chunksize = min_t(int, count, hvsi_write_room(tty));
 
 		BUG_ON(hp->n_outbuf < 0);
 		memcpy(hp->outbuf + hp->n_outbuf, source, chunksize);
diff --git a/drivers/tty/ipwireless/tty.c b/drivers/tty/ipwireless/tty.c
index 99bb2f149ff5..ab562838313b 100644
--- a/drivers/tty/ipwireless/tty.c
+++ b/drivers/tty/ipwireless/tty.c
@@ -228,7 +228,7 @@ static int ipw_write(struct tty_struct *linux_tty,
 	return count;
 }
 
-static int ipw_write_room(struct tty_struct *linux_tty)
+static unsigned int ipw_write_room(struct tty_struct *linux_tty)
 {
 	struct ipw_tty *tty = linux_tty->driver_data;
 	int room;
diff --git a/drivers/tty/mips_ejtag_fdc.c b/drivers/tty/mips_ejtag_fdc.c
index a8e19b4833bf..f427e8e154d7 100644
--- a/drivers/tty/mips_ejtag_fdc.c
+++ b/drivers/tty/mips_ejtag_fdc.c
@@ -840,11 +840,11 @@ static int mips_ejtag_fdc_tty_write(struct tty_struct *tty,
 	return total;
 }
 
-static int mips_ejtag_fdc_tty_write_room(struct tty_struct *tty)
+static unsigned int mips_ejtag_fdc_tty_write_room(struct tty_struct *tty)
 {
 	struct mips_ejtag_fdc_tty_port *dport = tty->driver_data;
 	struct mips_ejtag_fdc_tty *priv = dport->driver;
-	int room;
+	unsigned int room;
 
 	/* Report the space in the xmit buffer */
 	spin_lock(&dport->xmit_lock);
diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c
index 847ad3dac107..e4fe9315de29 100644
--- a/drivers/tty/moxa.c
+++ b/drivers/tty/moxa.c
@@ -188,7 +188,7 @@ module_param(ttymajor, int, 0);
 static int moxa_open(struct tty_struct *, struct file *);
 static void moxa_close(struct tty_struct *, struct file *);
 static int moxa_write(struct tty_struct *, const unsigned char *, int);
-static int moxa_write_room(struct tty_struct *);
+static unsigned int moxa_write_room(struct tty_struct *);
 static void moxa_flush_buffer(struct tty_struct *);
 static int moxa_chars_in_buffer(struct tty_struct *);
 static void moxa_set_termios(struct tty_struct *, struct ktermios *);
@@ -218,7 +218,7 @@ static int MoxaPortWriteData(struct tty_struct *, const unsigned char *, int);
 static int MoxaPortReadData(struct moxa_port *);
 static int MoxaPortTxQueue(struct moxa_port *);
 static int MoxaPortRxQueue(struct moxa_port *);
-static int MoxaPortTxFree(struct moxa_port *);
+static unsigned int MoxaPortTxFree(struct moxa_port *);
 static void MoxaPortTxDisable(struct moxa_port *);
 static void MoxaPortTxEnable(struct moxa_port *);
 static int moxa_get_serial_info(struct tty_struct *, struct serial_struct *);
@@ -1217,7 +1217,7 @@ static int moxa_write(struct tty_struct *tty,
 	return len;
 }
 
-static int moxa_write_room(struct tty_struct *tty)
+static unsigned int moxa_write_room(struct tty_struct *tty)
 {
 	struct moxa_port *ch;
 
@@ -1992,7 +1992,7 @@ static int MoxaPortTxQueue(struct moxa_port *port)
 	return (wptr - rptr) & mask;
 }
 
-static int MoxaPortTxFree(struct moxa_port *port)
+static unsigned int MoxaPortTxFree(struct moxa_port *port)
 {
 	void __iomem *ofsAddr = port->tableAddr;
 	u16 rptr, wptr, mask;
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index 85271e109014..5851a45d828c 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -1183,7 +1183,7 @@ static void mxser_flush_chars(struct tty_struct *tty)
 	spin_unlock_irqrestore(&info->slock, flags);
 }
 
-static int mxser_write_room(struct tty_struct *tty)
+static unsigned int mxser_write_room(struct tty_struct *tty)
 {
 	struct mxser_port *info = tty->driver_data;
 	int ret;
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 157b26ef6259..06f0c6d39620 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -3056,7 +3056,7 @@ static int gsmtty_write(struct tty_struct *tty, const unsigned char *buf,
 	return sent;
 }
 
-static int gsmtty_write_room(struct tty_struct *tty)
+static unsigned int gsmtty_write_room(struct tty_struct *tty)
 {
 	struct gsm_dlci *dlci = tty->driver_data;
 	if (dlci->state == DLCI_CLOSED)
diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c
index 9a2d78ace49b..c55475a9a184 100644
--- a/drivers/tty/nozomi.c
+++ b/drivers/tty/nozomi.c
@@ -1636,10 +1636,10 @@ static int ntty_write(struct tty_struct *tty, const unsigned char *buffer,
  * If the port is unplugged report lots of room and let the bits
  * dribble away so we don't block anything.
  */
-static int ntty_write_room(struct tty_struct *tty)
+static unsigned int ntty_write_room(struct tty_struct *tty)
 {
 	struct port *port = tty->driver_data;
-	int room = 4096;
+	unsigned int room = 4096;
 	const struct nozomi *dc = get_dc_by_tty(tty);
 
 	if (dc)
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 3e7b5c811f9b..eb8556b19592 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -136,7 +136,7 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf, int c)
  *	the other device.
  */
 
-static int pty_write_room(struct tty_struct *tty)
+static unsigned int pty_write_room(struct tty_struct *tty)
 {
 	if (tty->flow.stopped)
 		return 0;
diff --git a/drivers/tty/serial/kgdb_nmi.c b/drivers/tty/serial/kgdb_nmi.c
index db059b66438e..b193bbc666d4 100644
--- a/drivers/tty/serial/kgdb_nmi.c
+++ b/drivers/tty/serial/kgdb_nmi.c
@@ -298,7 +298,7 @@ static void kgdb_nmi_tty_hangup(struct tty_struct *tty)
 	tty_port_hangup(&priv->port);
 }
 
-static int kgdb_nmi_tty_write_room(struct tty_struct *tty)
+static unsigned int kgdb_nmi_tty_write_room(struct tty_struct *tty)
 {
 	/* Actually, we can handle any amount as we use polled writes. */
 	return 2048;
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 87f7127b57e6..cb46a65a5dd8 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -616,12 +616,12 @@ static int uart_write(struct tty_struct *tty,
 	return ret;
 }
 
-static int uart_write_room(struct tty_struct *tty)
+static unsigned int uart_write_room(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port;
 	unsigned long flags;
-	int ret;
+	unsigned int ret;
 
 	port = uart_port_lock(state, flags);
 	ret = uart_circ_chars_free(&state->xmit);
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 1555dccc28af..583aa8342112 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -868,15 +868,15 @@ exit:
 	DBGINFO(("%s wait_until_sent exit\n", info->device_name));
 }
 
-static int write_room(struct tty_struct *tty)
+static unsigned int write_room(struct tty_struct *tty)
 {
 	struct slgt_info *info = tty->driver_data;
-	int ret;
+	unsigned int ret;
 
 	if (sanity_check(info, tty->name, "write_room"))
 		return 0;
 	ret = (info->tx_active) ? 0 : HDLC_MAX_FRAME_SIZE;
-	DBGINFO(("%s write_room=%d\n", info->device_name, ret));
+	DBGINFO(("%s write_room=%u\n", info->device_name, ret));
 	return ret;
 }
 
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 07c88ccfb17a..d8834784b586 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -73,7 +73,7 @@ EXPORT_SYMBOL(tty_chars_in_buffer);
  *	returned and data may be lost as there will be no flow control.
  */
  
-int tty_write_room(struct tty_struct *tty)
+unsigned int tty_write_room(struct tty_struct *tty)
 {
 	if (tty->ops->write_room)
 		return tty->ops->write_room(tty);
diff --git a/drivers/tty/ttynull.c b/drivers/tty/ttynull.c
index 17f05b7eb6d3..af3311a24917 100644
--- a/drivers/tty/ttynull.c
+++ b/drivers/tty/ttynull.c
@@ -35,7 +35,7 @@ static int ttynull_write(struct tty_struct *tty, const unsigned char *buf,
 	return count;
 }
 
-static int ttynull_write_room(struct tty_struct *tty)
+static unsigned int ttynull_write_room(struct tty_struct *tty)
 {
 	return 65536;
 }
diff --git a/drivers/tty/vcc.c b/drivers/tty/vcc.c
index 0a3a71e14df4..d82ce3bb82c3 100644
--- a/drivers/tty/vcc.c
+++ b/drivers/tty/vcc.c
@@ -870,10 +870,10 @@ static int vcc_write(struct tty_struct *tty, const unsigned char *buf,
 	return total_sent ? total_sent : rv;
 }
 
-static int vcc_write_room(struct tty_struct *tty)
+static unsigned int vcc_write_room(struct tty_struct *tty)
 {
 	struct vcc_port *port;
-	u64 num;
+	unsigned int num;
 
 	port = vcc_get_ne(tty->index);
 	if (unlikely(!port)) {
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 706f066eb711..96c130714930 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -3263,7 +3263,7 @@ static int con_put_char(struct tty_struct *tty, unsigned char ch)
 	return do_con_write(tty, &ch, 1);
 }
 
-static int con_write_room(struct tty_struct *tty)
+static unsigned int con_write_room(struct tty_struct *tty)
 {
 	if (tty->flow.stopped)
 		return 0;
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index ca7a61190dd9..76b7fd234238 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -838,7 +838,7 @@ static int acm_tty_write(struct tty_struct *tty,
 	return count;
 }
 
-static int acm_tty_write_room(struct tty_struct *tty)
+static unsigned int acm_tty_write_room(struct tty_struct *tty)
 {
 	struct acm *acm = tty->driver_data;
 	/*
diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c
index 1e59204ec7aa..676a920d9d6b 100644
--- a/drivers/usb/gadget/function/u_serial.c
+++ b/drivers/usb/gadget/function/u_serial.c
@@ -774,18 +774,18 @@ static void gs_flush_chars(struct tty_struct *tty)
 	spin_unlock_irqrestore(&port->port_lock, flags);
 }
 
-static int gs_write_room(struct tty_struct *tty)
+static unsigned int gs_write_room(struct tty_struct *tty)
 {
 	struct gs_port	*port = tty->driver_data;
 	unsigned long	flags;
-	int		room = 0;
+	unsigned int room = 0;
 
 	spin_lock_irqsave(&port->port_lock, flags);
 	if (port->port_usb)
 		room = kfifo_avail(&port->port_write_buf);
 	spin_unlock_irqrestore(&port->port_lock, flags);
 
-	pr_vdebug("gs_write_room: (%d,%p) room=%d\n",
+	pr_vdebug("gs_write_room: (%d,%p) room=%u\n",
 		port->port_num, tty, room);
 
 	return room;
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index ae4e4ab638b5..cd3ab35dd689 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -240,11 +240,11 @@ static void dbc_tty_flush_chars(struct tty_struct *tty)
 	spin_unlock_irqrestore(&port->port_lock, flags);
 }
 
-static int dbc_tty_write_room(struct tty_struct *tty)
+static unsigned int dbc_tty_write_room(struct tty_struct *tty)
 {
 	struct dbc_port		*port = tty->driver_data;
 	unsigned long		flags;
-	int			room = 0;
+	unsigned int		room;
 
 	spin_lock_irqsave(&port->port_lock, flags);
 	room = kfifo_avail(&port->write_fifo);
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 98b33b1b5357..055096831daf 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -376,7 +376,7 @@ exit:
 	return retval;
 }
 
-static int serial_write_room(struct tty_struct *tty)
+static unsigned int serial_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index e18a4f1ac39d..d18fc34d3054 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -459,7 +459,7 @@ extern void tty_write_message(struct tty_struct *tty, char *msg);
 extern int tty_send_xchar(struct tty_struct *tty, char ch);
 extern int tty_put_char(struct tty_struct *tty, unsigned char c);
 extern int tty_chars_in_buffer(struct tty_struct *tty);
-extern int tty_write_room(struct tty_struct *tty);
+extern unsigned int tty_write_room(struct tty_struct *tty);
 extern void tty_driver_flush_buffer(struct tty_struct *tty);
 extern void tty_throttle(struct tty_struct *tty);
 extern void tty_unthrottle(struct tty_struct *tty);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 653fa5af3a22..ea5b15c72764 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -89,7 +89,7 @@
  *
  *	Note: Do not call this function directly, call tty_driver_flush_chars
  * 
- * int  (*write_room)(struct tty_struct *tty);
+ * unsigned int  (*write_room)(struct tty_struct *tty);
  *
  * 	This routine returns the numbers of characters the tty driver
  * 	will accept for queuing to be written.  This number is subject
@@ -256,7 +256,7 @@ struct tty_operations {
 		      const unsigned char *buf, int count);
 	int  (*put_char)(struct tty_struct *tty, unsigned char ch);
 	void (*flush_chars)(struct tty_struct *tty);
-	int  (*write_room)(struct tty_struct *tty);
+	unsigned int (*write_room)(struct tty_struct *tty);
 	int  (*chars_in_buffer)(struct tty_struct *tty);
 	int  (*ioctl)(struct tty_struct *tty,
 		    unsigned int cmd, unsigned long arg);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index a58584949a95..a5e3d957f20f 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -807,7 +807,7 @@ static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, in
 	return sent;
 }
 
-static int rfcomm_tty_write_room(struct tty_struct *tty)
+static unsigned int rfcomm_tty_write_room(struct tty_struct *tty)
 {
 	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
 	int room = 0;
-- 
cgit v1.2.3


From 76c8eaafe4f061f3790112842a2fbb297e4bea88 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 21 Apr 2021 14:30:54 -0700
Subject: rcu: Create an unrcu_pointer() to remove __rcu from a pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The xchg() and cmpxchg() functions are sometimes used to carry out RCU
updates.  Unfortunately, this can result in sparse warnings for both
the old-value and new-value arguments, as well as for the return value.
The arguments can be dealt with using RCU_INITIALIZER():

	old_p = xchg(&p, RCU_INITIALIZER(new_p));

But a sparse warning still remains due to assigning the __rcu pointer
returned from xchg to the (most likely) non-__rcu pointer old_p.

This commit therefore provides an unrcu_pointer() macro that strips
the __rcu.  This macro can be used as follows:

	old_p = unrcu_pointer(xchg(&p, RCU_INITIALIZER(new_p)));

Reported-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 1199ffd305d1..b071d02a028a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -363,6 +363,20 @@ static inline void rcu_preempt_sleep_check(void) { }
 #define rcu_check_sparse(p, space)
 #endif /* #else #ifdef __CHECKER__ */
 
+/**
+ * unrcu_pointer - mark a pointer as not being RCU protected
+ * @p: pointer needing to lose its __rcu property
+ *
+ * Converts @p from an __rcu pointer to a __kernel pointer.
+ * This allows an __rcu pointer to be used with xchg() and friends.
+ */
+#define unrcu_pointer(p)						\
+({									\
+	typeof(*p) *_________p1 = (typeof(*p) *__force)(p);		\
+	rcu_check_sparse(p, __rcu);					\
+	((typeof(*p) __force __kernel *)(_________p1)); 		\
+})
+
 #define __rcu_access_pointer(p, space) \
 ({ \
 	typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
-- 
cgit v1.2.3


From 1893afd63409111c6edcee9d6e1196fc06cf4fd7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 29 Apr 2021 11:18:01 -0700
Subject: rcu: Improve comments describing RCU read-side critical sections

There are a number of places that call out the fact that preempt-disable
regions of code now act as RCU read-side critical sections, where
preempt-disable regions of code include irq-disable regions of code,
bh-disable regions of code, hardirq handlers, and NMI handlers.  However,
someone relying solely on (for example) the call_rcu() header comment
might well have no idea that preempt-disable regions of code have RCU
semantics.

This commit therefore updates the header comments for
call_rcu(), synchronize_rcu(), rcu_dereference_bh_check(), and
rcu_dereference_sched_check() to call out these new(ish) forms of RCU
readers.

Reported-by: Michel Lespinasse <michel@lespinasse.org>
[ paulmck: Apply Matthew Wilcox and Michel Lespinasse feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 35 ++++++++++++++++++++++++++++-------
 kernel/rcu/tree.c        | 24 ++++++++++++++----------
 2 files changed, 42 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b071d02a028a..f0eecb9e49c8 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -532,7 +532,12 @@ do {									      \
  * @p: The pointer to read, prior to dereferencing
  * @c: The conditions under which the dereference will take place
  *
- * This is the RCU-bh counterpart to rcu_dereference_check().
+ * This is the RCU-bh counterpart to rcu_dereference_check().  However,
+ * please note that starting in v5.0 kernels, vanilla RCU grace periods
+ * wait for local_bh_disable() regions of code in addition to regions of
+ * code demarked by rcu_read_lock() and rcu_read_unlock().  This means
+ * that synchronize_rcu(), call_rcu, and friends all take not only
+ * rcu_read_lock() but also rcu_read_lock_bh() into account.
  */
 #define rcu_dereference_bh_check(p, c) \
 	__rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
@@ -543,6 +548,11 @@ do {									      \
  * @c: The conditions under which the dereference will take place
  *
  * This is the RCU-sched counterpart to rcu_dereference_check().
+ * However, please note that starting in v5.0 kernels, vanilla RCU grace
+ * periods wait for preempt_disable() regions of code in addition to
+ * regions of code demarked by rcu_read_lock() and rcu_read_unlock().
+ * This means that synchronize_rcu(), call_rcu, and friends all take not
+ * only rcu_read_lock() but also rcu_read_lock_sched() into account.
  */
 #define rcu_dereference_sched_check(p, c) \
 	__rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
@@ -634,6 +644,12 @@ do {									      \
  * sections, invocation of the corresponding RCU callback is deferred
  * until after the all the other CPUs exit their critical sections.
  *
+ * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also
+ * wait for regions of code with preemption disabled, including regions of
+ * code with interrupts or softirqs disabled.  In pre-v5.0 kernels, which
+ * define synchronize_sched(), only code enclosed within rcu_read_lock()
+ * and rcu_read_unlock() are guaranteed to be waited for.
+ *
  * Note, however, that RCU callbacks are permitted to run concurrently
  * with new RCU read-side critical sections.  One way that this can happen
  * is via the following sequence of events: (1) CPU 0 enters an RCU
@@ -728,9 +744,11 @@ static inline void rcu_read_unlock(void)
 /**
  * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
  *
- * This is equivalent of rcu_read_lock(), but also disables softirqs.
- * Note that anything else that disables softirqs can also serve as
- * an RCU read-side critical section.
+ * This is equivalent to rcu_read_lock(), but also disables softirqs.
+ * Note that anything else that disables softirqs can also serve as an RCU
+ * read-side critical section.  However, please note that this equivalence
+ * applies only to v5.0 and later.  Before v5.0, rcu_read_lock() and
+ * rcu_read_lock_bh() were unrelated.
  *
  * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
  * must occur in the same context, for example, it is illegal to invoke
@@ -763,9 +781,12 @@ static inline void rcu_read_unlock_bh(void)
 /**
  * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
  *
- * This is equivalent of rcu_read_lock(), but disables preemption.
- * Read-side critical sections can also be introduced by anything else
- * that disables preemption, including local_irq_disable() and friends.
+ * This is equivalent to rcu_read_lock(), but also disables preemption.
+ * Read-side critical sections can also be introduced by anything else that
+ * disables preemption, including local_irq_disable() and friends.  However,
+ * please note that the equivalence to rcu_read_lock() applies only to
+ * v5.0 and later.  Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
+ * were unrelated.
  *
  * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
  * must occur in the same context, for example, it is illegal to invoke
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2437960a2795..4b00e4fbfa10 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3059,12 +3059,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
  * period elapses, in other words after all pre-existing RCU read-side
  * critical sections have completed.  However, the callback function
  * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
- * may be nested.  In addition, regions of code across which interrupts,
- * preemption, or softirqs have been disabled also serve as RCU read-side
- * critical sections.  This includes hardware interrupt handlers, softirq
- * handlers, and NMI handlers.
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested.  In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections.  This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
  *
  * Note that all CPUs must agree that the grace period extended beyond
  * all pre-existing RCU read-side critical section.  On systems with more
@@ -3730,10 +3732,12 @@ static int rcu_blocking_is_gp(void)
  * read-side critical sections have completed.  Note, however, that
  * upon return from synchronize_rcu(), the caller might well be executing
  * concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting.  RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
+ * synchronize_rcu() was waiting.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested.  In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
  * sections.  This includes hardware interrupt handlers, softirq handlers,
  * and NMI handlers.
  *
-- 
cgit v1.2.3


From 0223846010750e28e4330f1beefb5564ba406ef7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 29 Apr 2021 11:30:49 -0700
Subject: rcu: Remove obsolete rcu_read_unlock() deadlock commentary

The deferred quiescent states resulting from the consolidation of RCU-bh
and RCU-sched into RCU means that rcu_read_unlock() will no longer attempt
to acquire scheduler locks if interrupts were disabled across that call
to rcu_read_unlock().  The cautions in the rcu_read_unlock() header
comment are therefore obsolete.  This commit therefore removes them.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 33 ++++++---------------------------
 1 file changed, 6 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f0eecb9e49c8..d9680b798b21 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -702,33 +702,12 @@ static __always_inline void rcu_read_lock(void)
 /**
  * rcu_read_unlock() - marks the end of an RCU read-side critical section.
  *
- * In most situations, rcu_read_unlock() is immune from deadlock.
- * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
- * is responsible for deboosting, which it does via rt_mutex_unlock().
- * Unfortunately, this function acquires the scheduler's runqueue and
- * priority-inheritance spinlocks.  This means that deadlock could result
- * if the caller of rcu_read_unlock() already holds one of these locks or
- * any lock that is ever acquired while holding them.
- *
- * That said, RCU readers are never priority boosted unless they were
- * preempted.  Therefore, one way to avoid deadlock is to make sure
- * that preemption never happens within any RCU read-side critical
- * section whose outermost rcu_read_unlock() is called with one of
- * rt_mutex_unlock()'s locks held.  Such preemption can be avoided in
- * a number of ways, for example, by invoking preempt_disable() before
- * critical section's outermost rcu_read_lock().
- *
- * Given that the set of locks acquired by rt_mutex_unlock() might change
- * at any time, a somewhat more future-proofed approach is to make sure
- * that that preemption never happens within any RCU read-side critical
- * section whose outermost rcu_read_unlock() is called with irqs disabled.
- * This approach relies on the fact that rt_mutex_unlock() currently only
- * acquires irq-disabled locks.
- *
- * The second of these two approaches is best in most situations,
- * however, the first approach can also be useful, at least to those
- * developers willing to keep abreast of the set of locks acquired by
- * rt_mutex_unlock().
+ * In almost all situations, rcu_read_unlock() is immune from deadlock.
+ * In recent kernels that have consolidated synchronize_sched() and
+ * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity
+ * also extends to the scheduler's runqueue and priority-inheritance
+ * spinlocks, courtesy of the quiescent-state deferral that is carried
+ * out when rcu_read_unlock() is invoked with interrupts disabled.
  *
  * See rcu_read_lock() for more information.
  */
-- 
cgit v1.2.3


From 9a33fbf9d23034d7e89849c587b0aed0e4cf794d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:17 +0200
Subject: tty: make tty_buffer_space_avail return uint

tty_buffer_space_avail returns values >= 0, so make it clear by the
return type.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20210505091928.22010-25-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_aspeed_vuart.c | 4 ++--
 drivers/tty/tty_buffer.c                    | 2 +-
 include/linux/tty_flip.h                    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c
index 2bf1d8582d9a..a28a394ba32a 100644
--- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c
@@ -329,7 +329,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned int iir, lsr;
-	int space, count;
+	unsigned int space, count;
 
 	iir = serial_port_in(port, UART_IIR);
 
@@ -353,7 +353,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port)
 					  jiffies + unthrottle_timeout);
 
 		} else {
-			count = min(space, 256);
+			count = min(space, 256U);
 
 			do {
 				serial8250_read_char(up, lsr);
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index 55b1f1711341..585a19f65284 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -88,7 +88,7 @@ EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive);
  *	pre-allocate if memory guarantee is required).
  */
 
-int tty_buffer_space_avail(struct tty_port *port)
+unsigned int tty_buffer_space_avail(struct tty_port *port)
 {
 	int space = port->buf.mem_limit - atomic_read(&port->buf.mem_used);
 	return max(space, 0);
diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h
index 767f62086bd9..d6729281ec50 100644
--- a/include/linux/tty_flip.h
+++ b/include/linux/tty_flip.h
@@ -3,7 +3,7 @@
 #define _LINUX_TTY_FLIP_H
 
 extern int tty_buffer_set_limit(struct tty_port *port, int limit);
-extern int tty_buffer_space_avail(struct tty_port *port);
+extern unsigned int tty_buffer_space_avail(struct tty_port *port);
 extern int tty_buffer_request_room(struct tty_port *port, size_t size);
 extern int tty_insert_flip_string_flags(struct tty_port *port,
 		const unsigned char *chars, const char *flags, size_t size);
-- 
cgit v1.2.3


From fff4ef17a9400fcd276b5c3a00ce5793f6c465e6 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:19 +0200
Subject: tty: make tty_operations::chars_in_buffer return uint

tty_operations::chars_in_buffer is another hook which is expected to
return values >= 0. So make it explicit by the return type too -- use
unsigned int.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Acked-By: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Acked-by: David Sterba <dsterba@suse.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Cc: Jens Taprogge <jens.taprogge@taprogge.org>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: David Lin <dtwlin@gmail.com>
Cc: Johan Hovold <johan@kernel.org>
Cc: Alex Elder <elder@kernel.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Sascha Hauer <s.hauer@pengutronix.de>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Felipe Balbi <balbi@kernel.org>
Cc: Mathias Nyman <mathias.nyman@intel.com>
Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Link: https://lore.kernel.org/r/20210505091928.22010-27-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/um/drivers/line.c                 |  4 ++--
 arch/um/drivers/line.h                 |  2 +-
 drivers/char/pcmcia/synclink_cs.c      |  6 +++---
 drivers/ipack/devices/ipoctal.c        |  2 +-
 drivers/isdn/capi/capi.c               |  2 +-
 drivers/mmc/core/sdio_uart.c           |  2 +-
 drivers/net/usb/hso.c                  |  4 ++--
 drivers/s390/char/con3215.c            |  2 +-
 drivers/s390/char/sclp_rw.c            |  4 ++--
 drivers/s390/char/sclp_rw.h            |  2 +-
 drivers/s390/char/sclp_tty.c           |  5 ++---
 drivers/s390/char/sclp_vt220.c         |  5 ++---
 drivers/staging/fwserial/fwserial.c    |  6 +++---
 drivers/staging/greybus/uart.c         |  4 ++--
 drivers/tty/amiserial.c                |  2 +-
 drivers/tty/goldfish.c                 |  2 +-
 drivers/tty/hvc/hvc_console.c          |  2 +-
 drivers/tty/hvc/hvcs.c                 |  2 +-
 drivers/tty/hvc/hvsi.c                 |  2 +-
 drivers/tty/ipwireless/tty.c           |  2 +-
 drivers/tty/mips_ejtag_fdc.c           |  4 ++--
 drivers/tty/moxa.c                     | 10 +++++-----
 drivers/tty/mxser.c                    |  2 +-
 drivers/tty/n_gsm.c                    |  2 +-
 drivers/tty/nozomi.c                   |  2 +-
 drivers/tty/serial/serial_core.c       |  4 ++--
 drivers/tty/synclink_gt.c              |  6 +++---
 drivers/tty/tty_ioctl.c                |  2 +-
 drivers/tty/vcc.c                      |  4 ++--
 drivers/usb/class/cdc-acm.c            |  2 +-
 drivers/usb/gadget/function/u_serial.c |  6 +++---
 drivers/usb/host/xhci-dbgtty.c         |  4 ++--
 drivers/usb/serial/usb-serial.c        |  2 +-
 include/linux/tty.h                    |  2 +-
 include/linux/tty_driver.h             |  2 +-
 net/bluetooth/rfcomm/tty.c             |  2 +-
 36 files changed, 58 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 2b8810ba5470..159434851417 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -60,11 +60,11 @@ unsigned int line_write_room(struct tty_struct *tty)
 	return room;
 }
 
-int line_chars_in_buffer(struct tty_struct *tty)
+unsigned int line_chars_in_buffer(struct tty_struct *tty)
 {
 	struct line *line = tty->driver_data;
 	unsigned long flags;
-	int ret;
+	unsigned int ret;
 
 	spin_lock_irqsave(&line->lock, flags);
 	/* write_room subtracts 1 for the needed NULL, so we readd it.*/
diff --git a/arch/um/drivers/line.h b/arch/um/drivers/line.h
index 861edf329569..3325e2bc64e4 100644
--- a/arch/um/drivers/line.h
+++ b/arch/um/drivers/line.h
@@ -67,7 +67,7 @@ extern int line_setup(char **conf, unsigned nlines, char **def,
 extern int line_write(struct tty_struct *tty, const unsigned char *buf,
 		      int len);
 extern void line_set_termios(struct tty_struct *tty, struct ktermios * old);
-extern int line_chars_in_buffer(struct tty_struct *tty);
+extern unsigned int line_chars_in_buffer(struct tty_struct *tty);
 extern void line_flush_buffer(struct tty_struct *tty);
 extern void line_flush_chars(struct tty_struct *tty);
 extern unsigned int line_write_room(struct tty_struct *tty);
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index e4b2c68f44f5..9f7420bc5026 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -1637,10 +1637,10 @@ static unsigned int mgslpc_write_room(struct tty_struct *tty)
 
 /* Return the count of bytes in transmit buffer
  */
-static int mgslpc_chars_in_buffer(struct tty_struct *tty)
+static unsigned int mgslpc_chars_in_buffer(struct tty_struct *tty)
 {
 	MGSLPC_INFO *info = (MGSLPC_INFO *)tty->driver_data;
-	int rc;
+	unsigned int rc;
 
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):mgslpc_chars_in_buffer(%s)\n",
@@ -1655,7 +1655,7 @@ static int mgslpc_chars_in_buffer(struct tty_struct *tty)
 		rc = info->tx_count;
 
 	if (debug_level >= DEBUG_LEVEL_INFO)
-		printk("%s(%d):mgslpc_chars_in_buffer(%s)=%d\n",
+		printk("%s(%d):mgslpc_chars_in_buffer(%s)=%u\n",
 			 __FILE__, __LINE__, info->device_name, rc);
 
 	return rc;
diff --git a/drivers/ipack/devices/ipoctal.c b/drivers/ipack/devices/ipoctal.c
index ea0f1aeaaa06..0a3b89c17d08 100644
--- a/drivers/ipack/devices/ipoctal.c
+++ b/drivers/ipack/devices/ipoctal.c
@@ -467,7 +467,7 @@ static unsigned int ipoctal_write_room(struct tty_struct *tty)
 	return PAGE_SIZE - channel->nb_bytes;
 }
 
-static int ipoctal_chars_in_buffer(struct tty_struct *tty)
+static unsigned int ipoctal_chars_in_buffer(struct tty_struct *tty)
 {
 	struct ipoctal_channel *channel = tty->driver_data;
 
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index c50c454006b3..dae80197ad9c 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -1186,7 +1186,7 @@ static unsigned int capinc_tty_write_room(struct tty_struct *tty)
 	return room;
 }
 
-static int capinc_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int capinc_tty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct capiminor *mp = tty->driver_data;
 
diff --git a/drivers/mmc/core/sdio_uart.c b/drivers/mmc/core/sdio_uart.c
index c8f4eca7aad4..c36242b86b1d 100644
--- a/drivers/mmc/core/sdio_uart.c
+++ b/drivers/mmc/core/sdio_uart.c
@@ -803,7 +803,7 @@ static unsigned int sdio_uart_write_room(struct tty_struct *tty)
 	return FIFO_SIZE - kfifo_len(&port->xmit_fifo);
 }
 
-static int sdio_uart_chars_in_buffer(struct tty_struct *tty)
+static unsigned int sdio_uart_chars_in_buffer(struct tty_struct *tty)
 {
 	struct sdio_uart_port *port = tty->driver_data;
 	return kfifo_len(&port->xmit_fifo);
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c
index bb8bb85308ab..c7563ed3ac31 100644
--- a/drivers/net/usb/hso.c
+++ b/drivers/net/usb/hso.c
@@ -1404,11 +1404,11 @@ static void hso_serial_set_termios(struct tty_struct *tty, struct ktermios *old)
 }
 
 /* how many characters in the buffer */
-static int hso_serial_chars_in_buffer(struct tty_struct *tty)
+static unsigned int hso_serial_chars_in_buffer(struct tty_struct *tty)
 {
 	struct hso_serial *serial = tty->driver_data;
-	int chars;
 	unsigned long flags;
+	unsigned int chars;
 
 	/* sanity check */
 	if (serial == NULL)
diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index c9fd4a05931a..8821927ef875 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -980,7 +980,7 @@ static void tty3215_flush_chars(struct tty_struct *tty)
 /*
  * Returns the number of characters in the output buffer
  */
-static int tty3215_chars_in_buffer(struct tty_struct *tty)
+static unsigned int tty3215_chars_in_buffer(struct tty_struct *tty)
 {
 	struct raw3215_info *raw = tty->driver_data;
 
diff --git a/drivers/s390/char/sclp_rw.c b/drivers/s390/char/sclp_rw.c
index d6c84e354df5..5a1bf6eaa9d9 100644
--- a/drivers/s390/char/sclp_rw.c
+++ b/drivers/s390/char/sclp_rw.c
@@ -325,10 +325,10 @@ sclp_buffer_space(struct sclp_buffer *buffer)
 /*
  * Return number of characters in buffer
  */
-int
+unsigned int
 sclp_chars_in_buffer(struct sclp_buffer *buffer)
 {
-	int count;
+	unsigned int count;
 
 	count = buffer->char_sum;
 	if (buffer->current_line != NULL)
diff --git a/drivers/s390/char/sclp_rw.h b/drivers/s390/char/sclp_rw.h
index 93d706e4935c..b4506be79246 100644
--- a/drivers/s390/char/sclp_rw.h
+++ b/drivers/s390/char/sclp_rw.h
@@ -86,7 +86,7 @@ void *sclp_unmake_buffer(struct sclp_buffer *);
 int sclp_buffer_space(struct sclp_buffer *);
 int sclp_write(struct sclp_buffer *buffer, const unsigned char *, int);
 int sclp_emit_buffer(struct sclp_buffer *,void (*)(struct sclp_buffer *,int));
-int sclp_chars_in_buffer(struct sclp_buffer *);
+unsigned int sclp_chars_in_buffer(struct sclp_buffer *);
 
 #ifdef CONFIG_SCLP_CONSOLE
 void sclp_console_pm_event(enum sclp_pm_event sclp_pm_event);
diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c
index ea1e43fd16bc..162127ff7845 100644
--- a/drivers/s390/char/sclp_tty.c
+++ b/drivers/s390/char/sclp_tty.c
@@ -280,16 +280,15 @@ sclp_tty_flush_chars(struct tty_struct *tty)
  * characters in the write buffer (will not be written as long as there is a
  * final line feed missing).
  */
-static int
+static unsigned int
 sclp_tty_chars_in_buffer(struct tty_struct *tty)
 {
 	unsigned long flags;
 	struct list_head *l;
 	struct sclp_buffer *t;
-	int count;
+	unsigned int count = 0;
 
 	spin_lock_irqsave(&sclp_tty_lock, flags);
-	count = 0;
 	if (sclp_ttybuf != NULL)
 		count = sclp_chars_in_buffer(sclp_ttybuf);
 	list_for_each(l, &sclp_tty_outqueue) {
diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
index b621adee35f0..24eb3a0b0a9a 100644
--- a/drivers/s390/char/sclp_vt220.c
+++ b/drivers/s390/char/sclp_vt220.c
@@ -629,16 +629,15 @@ sclp_vt220_write_room(struct tty_struct *tty)
 /*
  * Return number of buffered chars.
  */
-static int
+static unsigned int
 sclp_vt220_chars_in_buffer(struct tty_struct *tty)
 {
 	unsigned long flags;
 	struct list_head *l;
 	struct sclp_vt220_request *r;
-	int count;
+	unsigned int count = 0;
 
 	spin_lock_irqsave(&sclp_vt220_lock, flags);
-	count = 0;
 	if (sclp_vt220_current_request != NULL)
 		count = sclp_vt220_chars_stored(sclp_vt220_current_request);
 	list_for_each(l, &sclp_vt220_outqueue) {
diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c
index a151cd76d24e..d2b286ea27c5 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -1127,16 +1127,16 @@ static unsigned int fwtty_write_room(struct tty_struct *tty)
 	return n;
 }
 
-static int fwtty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int fwtty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct fwtty_port *port = tty->driver_data;
-	int n;
+	unsigned int n;
 
 	spin_lock_bh(&port->lock);
 	n = dma_fifo_level(&port->tx_fifo);
 	spin_unlock_bh(&port->lock);
 
-	fwtty_dbg(port, "%d\n", n);
+	fwtty_dbg(port, "%u\n", n);
 
 	return n;
 }
diff --git a/drivers/staging/greybus/uart.c b/drivers/staging/greybus/uart.c
index 529eccb99b6c..ccfaa0f21b9c 100644
--- a/drivers/staging/greybus/uart.c
+++ b/drivers/staging/greybus/uart.c
@@ -457,11 +457,11 @@ static unsigned int gb_tty_write_room(struct tty_struct *tty)
 	return room;
 }
 
-static int gb_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int gb_tty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct gb_tty *gb_tty = tty->driver_data;
 	unsigned long flags;
-	int chars;
+	unsigned int chars;
 
 	spin_lock_irqsave(&gb_tty->write_lock, flags);
 	chars = kfifo_len(&gb_tty->write_fifo);
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index ee1f4d72cd5e..5ec19c48fb7a 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -834,7 +834,7 @@ static unsigned int rs_write_room(struct tty_struct *tty)
 	return CIRC_SPACE(info->xmit.head, info->xmit.tail, SERIAL_XMIT_SIZE);
 }
 
-static int rs_chars_in_buffer(struct tty_struct *tty)
+static unsigned int rs_chars_in_buffer(struct tty_struct *tty)
 {
 	struct serial_state *info = tty->driver_data;
 
diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c
index e4f9a60dcc18..ccb683a6e6f5 100644
--- a/drivers/tty/goldfish.c
+++ b/drivers/tty/goldfish.c
@@ -198,7 +198,7 @@ static unsigned int goldfish_tty_write_room(struct tty_struct *tty)
 	return 0x10000;
 }
 
-static int goldfish_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int goldfish_tty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct goldfish_tty *qtty = &goldfish_ttys[tty->index];
 	void __iomem *base = qtty->base;
diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index a3725eb69cd3..d0f0253fb93e 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -596,7 +596,7 @@ static unsigned int hvc_write_room(struct tty_struct *tty)
 	return hp->outbuf_size - hp->n_outbuf;
 }
 
-static int hvc_chars_in_buffer(struct tty_struct *tty)
+static unsigned int hvc_chars_in_buffer(struct tty_struct *tty)
 {
 	struct hvc_struct *hp = tty->driver_data;
 
diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c
index f43f2f94d8bd..fe5e6b4f43de 100644
--- a/drivers/tty/hvc/hvcs.c
+++ b/drivers/tty/hvc/hvcs.c
@@ -1386,7 +1386,7 @@ static unsigned int hvcs_write_room(struct tty_struct *tty)
 	return HVCS_BUFF_LEN - hvcsd->chars_in_buffer;
 }
 
-static int hvcs_chars_in_buffer(struct tty_struct *tty)
+static unsigned int hvcs_chars_in_buffer(struct tty_struct *tty)
 {
 	struct hvcs_struct *hvcsd = tty->driver_data;
 
diff --git a/drivers/tty/hvc/hvsi.c b/drivers/tty/hvc/hvsi.c
index 0a56f44e6b12..bfc15279d5bc 100644
--- a/drivers/tty/hvc/hvsi.c
+++ b/drivers/tty/hvc/hvsi.c
@@ -897,7 +897,7 @@ static unsigned int hvsi_write_room(struct tty_struct *tty)
 	return N_OUTBUF - hp->n_outbuf;
 }
 
-static int hvsi_chars_in_buffer(struct tty_struct *tty)
+static unsigned int hvsi_chars_in_buffer(struct tty_struct *tty)
 {
 	struct hvsi_struct *hp = tty->driver_data;
 
diff --git a/drivers/tty/ipwireless/tty.c b/drivers/tty/ipwireless/tty.c
index ab562838313b..e01ca68f24f4 100644
--- a/drivers/tty/ipwireless/tty.c
+++ b/drivers/tty/ipwireless/tty.c
@@ -270,7 +270,7 @@ static int ipwireless_set_serial_info(struct tty_struct *linux_tty,
 	return 0;	/* Keeps the PCMCIA scripts happy. */
 }
 
-static int ipw_chars_in_buffer(struct tty_struct *linux_tty)
+static unsigned int ipw_chars_in_buffer(struct tty_struct *linux_tty)
 {
 	struct ipw_tty *tty = linux_tty->driver_data;
 
diff --git a/drivers/tty/mips_ejtag_fdc.c b/drivers/tty/mips_ejtag_fdc.c
index f427e8e154d7..3b5915b94fac 100644
--- a/drivers/tty/mips_ejtag_fdc.c
+++ b/drivers/tty/mips_ejtag_fdc.c
@@ -854,10 +854,10 @@ static unsigned int mips_ejtag_fdc_tty_write_room(struct tty_struct *tty)
 	return room;
 }
 
-static int mips_ejtag_fdc_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int mips_ejtag_fdc_tty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct mips_ejtag_fdc_tty_port *dport = tty->driver_data;
-	int chars;
+	unsigned int chars;
 
 	/* Report the number of bytes in the xmit buffer */
 	spin_lock(&dport->xmit_lock);
diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c
index e4fe9315de29..64b18177c790 100644
--- a/drivers/tty/moxa.c
+++ b/drivers/tty/moxa.c
@@ -190,7 +190,7 @@ static void moxa_close(struct tty_struct *, struct file *);
 static int moxa_write(struct tty_struct *, const unsigned char *, int);
 static unsigned int moxa_write_room(struct tty_struct *);
 static void moxa_flush_buffer(struct tty_struct *);
-static int moxa_chars_in_buffer(struct tty_struct *);
+static unsigned int moxa_chars_in_buffer(struct tty_struct *);
 static void moxa_set_termios(struct tty_struct *, struct ktermios *);
 static void moxa_stop(struct tty_struct *);
 static void moxa_start(struct tty_struct *);
@@ -216,7 +216,7 @@ static int MoxaPortLineStatus(struct moxa_port *);
 static void MoxaPortFlushData(struct moxa_port *, int);
 static int MoxaPortWriteData(struct tty_struct *, const unsigned char *, int);
 static int MoxaPortReadData(struct moxa_port *);
-static int MoxaPortTxQueue(struct moxa_port *);
+static unsigned int MoxaPortTxQueue(struct moxa_port *);
 static int MoxaPortRxQueue(struct moxa_port *);
 static unsigned int MoxaPortTxFree(struct moxa_port *);
 static void MoxaPortTxDisable(struct moxa_port *);
@@ -1239,10 +1239,10 @@ static void moxa_flush_buffer(struct tty_struct *tty)
 	tty_wakeup(tty);
 }
 
-static int moxa_chars_in_buffer(struct tty_struct *tty)
+static unsigned int moxa_chars_in_buffer(struct tty_struct *tty)
 {
 	struct moxa_port *ch = tty->driver_data;
-	int chars;
+	unsigned int chars;
 
 	chars = MoxaPortTxQueue(ch);
 	if (chars)
@@ -1981,7 +1981,7 @@ static int MoxaPortReadData(struct moxa_port *port)
 }
 
 
-static int MoxaPortTxQueue(struct moxa_port *port)
+static unsigned int MoxaPortTxQueue(struct moxa_port *port)
 {
 	void __iomem *ofsAddr = port->tableAddr;
 	u16 rptr, wptr, mask;
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index 5851a45d828c..a74e6146a748 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -1192,7 +1192,7 @@ static unsigned int mxser_write_room(struct tty_struct *tty)
 	return ret < 0 ? 0 : ret;
 }
 
-static int mxser_chars_in_buffer(struct tty_struct *tty)
+static unsigned int mxser_chars_in_buffer(struct tty_struct *tty)
 {
 	struct mxser_port *info = tty->driver_data;
 	return info->xmit_cnt;
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 06f0c6d39620..bd24dc0d7e96 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -3064,7 +3064,7 @@ static unsigned int gsmtty_write_room(struct tty_struct *tty)
 	return TX_SIZE - kfifo_len(&dlci->fifo);
 }
 
-static int gsmtty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int gsmtty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct gsm_dlci *dlci = tty->driver_data;
 	if (dlci->state == DLCI_CLOSED)
diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c
index c55475a9a184..62c16731ccd8 100644
--- a/drivers/tty/nozomi.c
+++ b/drivers/tty/nozomi.c
@@ -1776,7 +1776,7 @@ static void ntty_throttle(struct tty_struct *tty)
 }
 
 /* Returns number of chars in buffer, called by tty layer */
-static s32 ntty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int ntty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct port *port = tty->driver_data;
 	struct nozomi *dc = get_dc_by_tty(tty);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index cb46a65a5dd8..d29329eb52f4 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -629,12 +629,12 @@ static unsigned int uart_write_room(struct tty_struct *tty)
 	return ret;
 }
 
-static int uart_chars_in_buffer(struct tty_struct *tty)
+static unsigned int uart_chars_in_buffer(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port;
 	unsigned long flags;
-	int ret;
+	unsigned int ret;
 
 	port = uart_port_lock(state, flags);
 	ret = uart_circ_chars_pending(&state->xmit);
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 583aa8342112..cf87dc66087b 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -1254,14 +1254,14 @@ static int synclink_gt_proc_show(struct seq_file *m, void *v)
 /*
  * return count of bytes in transmit buffer
  */
-static int chars_in_buffer(struct tty_struct *tty)
+static unsigned int chars_in_buffer(struct tty_struct *tty)
 {
 	struct slgt_info *info = tty->driver_data;
-	int count;
+	unsigned int count;
 	if (sanity_check(info, tty->name, "chars_in_buffer"))
 		return 0;
 	count = tbuf_bytes(info);
-	DBGINFO(("%s chars_in_buffer()=%d\n", info->device_name, count));
+	DBGINFO(("%s chars_in_buffer()=%u\n", info->device_name, count));
 	return count;
 }
 
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index d8834784b586..aa9ecc8be990 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -54,7 +54,7 @@
  *	to be no queue on the device.
  */
 
-int tty_chars_in_buffer(struct tty_struct *tty)
+unsigned int tty_chars_in_buffer(struct tty_struct *tty)
 {
 	if (tty->ops->chars_in_buffer)
 		return tty->ops->chars_in_buffer(tty);
diff --git a/drivers/tty/vcc.c b/drivers/tty/vcc.c
index d82ce3bb82c3..e883b8f12099 100644
--- a/drivers/tty/vcc.c
+++ b/drivers/tty/vcc.c
@@ -888,10 +888,10 @@ static unsigned int vcc_write_room(struct tty_struct *tty)
 	return num;
 }
 
-static int vcc_chars_in_buffer(struct tty_struct *tty)
+static unsigned int vcc_chars_in_buffer(struct tty_struct *tty)
 {
 	struct vcc_port *port;
-	u64 num;
+	unsigned int num;
 
 	port = vcc_get_ne(tty->index);
 	if (unlikely(!port)) {
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 76b7fd234238..81199efe0312 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -848,7 +848,7 @@ static unsigned int acm_tty_write_room(struct tty_struct *tty)
 	return acm_wb_is_avail(acm) ? acm->writesize : 0;
 }
 
-static int acm_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int acm_tty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct acm *acm = tty->driver_data;
 	/*
diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c
index 676a920d9d6b..bffef8e47dac 100644
--- a/drivers/usb/gadget/function/u_serial.c
+++ b/drivers/usb/gadget/function/u_serial.c
@@ -791,17 +791,17 @@ static unsigned int gs_write_room(struct tty_struct *tty)
 	return room;
 }
 
-static int gs_chars_in_buffer(struct tty_struct *tty)
+static unsigned int gs_chars_in_buffer(struct tty_struct *tty)
 {
 	struct gs_port	*port = tty->driver_data;
 	unsigned long	flags;
-	int		chars = 0;
+	unsigned int	chars;
 
 	spin_lock_irqsave(&port->port_lock, flags);
 	chars = kfifo_len(&port->port_write_buf);
 	spin_unlock_irqrestore(&port->port_lock, flags);
 
-	pr_vdebug("gs_chars_in_buffer: (%d,%p) chars=%d\n",
+	pr_vdebug("gs_chars_in_buffer: (%d,%p) chars=%u\n",
 		port->port_num, tty, chars);
 
 	return chars;
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index cd3ab35dd689..bef104511352 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -253,11 +253,11 @@ static unsigned int dbc_tty_write_room(struct tty_struct *tty)
 	return room;
 }
 
-static int dbc_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int dbc_tty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct dbc_port		*port = tty->driver_data;
 	unsigned long		flags;
-	int			chars = 0;
+	unsigned int		chars;
 
 	spin_lock_irqsave(&port->port_lock, flags);
 	chars = kfifo_len(&port->write_fifo);
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 055096831daf..eeb441c77207 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -385,7 +385,7 @@ static unsigned int serial_write_room(struct tty_struct *tty)
 	return port->serial->type->write_room(tty);
 }
 
-static int serial_chars_in_buffer(struct tty_struct *tty)
+static unsigned int serial_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct usb_serial *serial = port->serial;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index d18fc34d3054..5cf6b2e7331b 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -458,7 +458,7 @@ extern void tty_unregister_device(struct tty_driver *driver, unsigned index);
 extern void tty_write_message(struct tty_struct *tty, char *msg);
 extern int tty_send_xchar(struct tty_struct *tty, char ch);
 extern int tty_put_char(struct tty_struct *tty, unsigned char c);
-extern int tty_chars_in_buffer(struct tty_struct *tty);
+extern unsigned int tty_chars_in_buffer(struct tty_struct *tty);
 extern unsigned int tty_write_room(struct tty_struct *tty);
 extern void tty_driver_flush_buffer(struct tty_struct *tty);
 extern void tty_throttle(struct tty_struct *tty);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index ea5b15c72764..a4694bb125cc 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -257,7 +257,7 @@ struct tty_operations {
 	int  (*put_char)(struct tty_struct *tty, unsigned char ch);
 	void (*flush_chars)(struct tty_struct *tty);
 	unsigned int (*write_room)(struct tty_struct *tty);
-	int  (*chars_in_buffer)(struct tty_struct *tty);
+	unsigned int (*chars_in_buffer)(struct tty_struct *tty);
 	int  (*ioctl)(struct tty_struct *tty,
 		    unsigned int cmd, unsigned long arg);
 	long (*compat_ioctl)(struct tty_struct *tty,
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index a5e3d957f20f..c76dcc0f679b 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -1010,7 +1010,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty)
 	rfcomm_dlc_unthrottle(dev->dlc);
 }
 
-static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
 {
 	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
 
-- 
cgit v1.2.3


From 76af233d9b0c0b749e97b8f90fd0ff0e417ce3e3 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Mon, 10 May 2021 08:59:22 +0200
Subject: tty: remove unused tty_throttle

The last user was removed in commit e91e52e42814 (n_tty: Fix stuck
throttled driver) in 2013. So remove exported tty_throttle completely.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210510065923.5112-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_ioctl.c    | 31 +++++--------------------------
 include/linux/tty.h        |  1 -
 include/linux/tty_driver.h |  2 +-
 3 files changed, 6 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index aa9ecc8be990..75885d502749 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -96,28 +96,6 @@ void tty_driver_flush_buffer(struct tty_struct *tty)
 }
 EXPORT_SYMBOL(tty_driver_flush_buffer);
 
-/**
- *	tty_throttle		-	flow control
- *	@tty: terminal
- *
- *	Indicate that a tty should stop transmitting data down the stack.
- *	Takes the termios rwsem to protect against parallel throttle/unthrottle
- *	and also to ensure the driver can consistently reference its own
- *	termios data at this point when implementing software flow control.
- */
-
-void tty_throttle(struct tty_struct *tty)
-{
-	down_write(&tty->termios_rwsem);
-	/* check TTY_THROTTLED first so it indicates our state */
-	if (!test_and_set_bit(TTY_THROTTLED, &tty->flags) &&
-	    tty->ops->throttle)
-		tty->ops->throttle(tty);
-	tty->flow_change = 0;
-	up_write(&tty->termios_rwsem);
-}
-EXPORT_SYMBOL(tty_throttle);
-
 /**
  *	tty_unthrottle		-	flow control
  *	@tty: terminal
@@ -146,10 +124,11 @@ EXPORT_SYMBOL(tty_unthrottle);
  *	tty_throttle_safe	-	flow control
  *	@tty: terminal
  *
- *	Similar to tty_throttle() but will only attempt throttle
- *	if tty->flow_change is TTY_THROTTLE_SAFE. Prevents an accidental
- *	throttle due to race conditions when throttling is conditional
- *	on factors evaluated prior to throttling.
+ *	Indicate that a tty should stop transmitting data down the stack.
+ *	tty_throttle_safe will only attempt throttle if tty->flow_change is
+ *	TTY_THROTTLE_SAFE. Prevents an accidental throttle due to race
+ *	conditions when throttling is conditional on factors evaluated prior to
+ *	throttling.
  *
  *	Returns 0 if tty is throttled (or was already throttled)
  */
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 5cf6b2e7331b..4c0c7ca1d9a4 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -461,7 +461,6 @@ extern int tty_put_char(struct tty_struct *tty, unsigned char c);
 extern unsigned int tty_chars_in_buffer(struct tty_struct *tty);
 extern unsigned int tty_write_room(struct tty_struct *tty);
 extern void tty_driver_flush_buffer(struct tty_struct *tty);
-extern void tty_throttle(struct tty_struct *tty);
 extern void tty_unthrottle(struct tty_struct *tty);
 extern int tty_throttle_safe(struct tty_struct *tty);
 extern int tty_unthrottle_safe(struct tty_struct *tty);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index a4694bb125cc..448f8ee6db6e 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -136,7 +136,7 @@
  * 	the line discipline are close to full, and it should somehow
  * 	signal that no more characters should be sent to the tty.
  *
- *	Optional: Always invoke via tty_throttle(), called under the
+ *	Optional: Always invoke via tty_throttle_safe(), called under the
  *	termios lock.
  * 
  * void (*unthrottle)(struct tty_struct * tty);
-- 
cgit v1.2.3


From 3b85f9ba3480c1bcbebb2bb490822bec0e7a1201 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Thu, 13 May 2021 15:20:53 +0200
Subject: net: bridge: mcast: export multicast router presence adjacent to a
 port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To properly support routable multicast addresses in batman-adv in a
group-aware way, a batman-adv node needs to know if it serves multicast
routers.

This adds a function to the bridge to export this so that batman-adv
can then make full use of the Multicast Router Discovery capability of
the bridge.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |  8 +++++++
 net/bridge/br_multicast.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 2cc35038a8ca..12e9a32dbca0 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -67,6 +67,7 @@ int br_multicast_list_adjacent(struct net_device *dev,
 			       struct list_head *br_ip_list);
 bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto);
 bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto);
+bool br_multicast_has_router_adjacent(struct net_device *dev, int proto);
 bool br_multicast_enabled(const struct net_device *dev);
 bool br_multicast_router(const struct net_device *dev);
 int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
@@ -87,6 +88,13 @@ static inline bool br_multicast_has_querier_adjacent(struct net_device *dev,
 {
 	return false;
 }
+
+static inline bool br_multicast_has_router_adjacent(struct net_device *dev,
+						    int proto)
+{
+	return true;
+}
+
 static inline bool br_multicast_enabled(const struct net_device *dev)
 {
 	return false;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f234c48036c8..0703725527b3 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4054,6 +4054,61 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent);
 
+/**
+ * br_multicast_has_router_adjacent - Checks for a router behind a bridge port
+ * @dev: The bridge port adjacent to which to check for a multicast router
+ * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
+ *
+ * Checks whether the given interface has a bridge on top and if so returns
+ * true if a multicast router is behind one of the other ports of this
+ * bridge. Otherwise returns false.
+ */
+bool br_multicast_has_router_adjacent(struct net_device *dev, int proto)
+{
+	struct net_bridge_port *port, *p;
+	bool ret = false;
+
+	rcu_read_lock();
+	port = br_port_get_check_rcu(dev);
+	if (!port)
+		goto unlock;
+
+	switch (proto) {
+	case ETH_P_IP:
+		hlist_for_each_entry_rcu(p, &port->br->ip4_mc_router_list,
+					 ip4_rlist) {
+			if (p == port)
+				continue;
+
+			ret = true;
+			goto unlock;
+		}
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case ETH_P_IPV6:
+		hlist_for_each_entry_rcu(p, &port->br->ip6_mc_router_list,
+					 ip6_rlist) {
+			if (p == port)
+				continue;
+
+			ret = true;
+			goto unlock;
+		}
+		break;
+#endif
+	default:
+		/* when compiled without IPv6 support, be conservative and
+		 * always assume presence of an IPv6 multicast router
+		 */
+		ret = true;
+	}
+
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent);
+
 static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats,
 			       const struct sk_buff *skb, u8 type, u8 dir)
 {
-- 
cgit v1.2.3


From 14374fbb3f06ddaba186d608a58c07f3d48d08df Mon Sep 17 00:00:00 2001
From: Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
Date: Tue, 11 May 2021 23:07:25 +0200
Subject: misc: eeprom_93xx46: Add new 93c56 and 93c66 compatible strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These two devices have respectively 2048 and 4096 bits of storage,
compared to 1024 for the 93c46.

Reviewed-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Signed-off-by: Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
Link: https://lore.kernel.org/r/20210511210727.24895-3-linkmauve@linkmauve.fr
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/eeprom/eeprom_93xx46.c | 35 ++++++++++++++++++++++++++++++++---
 include/linux/eeprom_93xx46.h       |  3 +++
 2 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/eeprom/eeprom_93xx46.c b/drivers/misc/eeprom/eeprom_93xx46.c
index ffdb8e5a26e0..29d8971ec558 100644
--- a/drivers/misc/eeprom/eeprom_93xx46.c
+++ b/drivers/misc/eeprom/eeprom_93xx46.c
@@ -29,14 +29,29 @@
 
 struct eeprom_93xx46_devtype_data {
 	unsigned int quirks;
+	unsigned char flags;
+};
+
+static const struct eeprom_93xx46_devtype_data at93c46_data = {
+	.flags = EE_SIZE1K,
+};
+
+static const struct eeprom_93xx46_devtype_data at93c56_data = {
+	.flags = EE_SIZE2K,
+};
+
+static const struct eeprom_93xx46_devtype_data at93c66_data = {
+	.flags = EE_SIZE4K,
 };
 
 static const struct eeprom_93xx46_devtype_data atmel_at93c46d_data = {
+	.flags = EE_SIZE1K,
 	.quirks = EEPROM_93XX46_QUIRK_SINGLE_WORD_READ |
 		  EEPROM_93XX46_QUIRK_INSTRUCTION_LENGTH,
 };
 
 static const struct eeprom_93xx46_devtype_data microchip_93lc46b_data = {
+	.flags = EE_SIZE1K,
 	.quirks = EEPROM_93XX46_QUIRK_EXTRA_READ_CYCLE,
 };
 
@@ -381,8 +396,11 @@ static void select_deassert(void *context)
 }
 
 static const struct of_device_id eeprom_93xx46_of_table[] = {
-	{ .compatible = "eeprom-93xx46", },
+	{ .compatible = "eeprom-93xx46", .data = &at93c46_data, },
+	{ .compatible = "atmel,at93c46", .data = &at93c46_data, },
 	{ .compatible = "atmel,at93c46d", .data = &atmel_at93c46d_data, },
+	{ .compatible = "atmel,at93c56", .data = &at93c56_data, },
+	{ .compatible = "atmel,at93c66", .data = &at93c66_data, },
 	{ .compatible = "microchip,93lc46b", .data = &microchip_93lc46b_data, },
 	{}
 };
@@ -432,6 +450,7 @@ static int eeprom_93xx46_probe_dt(struct spi_device *spi)
 		const struct eeprom_93xx46_devtype_data *data = of_id->data;
 
 		pd->quirks = data->quirks;
+		pd->flags |= data->flags;
 	}
 
 	spi->dev.platform_data = pd;
@@ -461,7 +480,16 @@ static int eeprom_93xx46_probe(struct spi_device *spi)
 	if (!edev)
 		return -ENOMEM;
 
-	edev->size = 128;
+	if (pd->flags & EE_SIZE1K)
+		edev->size = 128;
+	else if (pd->flags & EE_SIZE2K)
+		edev->size = 256;
+	else if (pd->flags & EE_SIZE4K)
+		edev->size = 512;
+	else {
+		dev_err(&spi->dev, "unspecified size\n");
+		return -EINVAL;
+	}
 
 	if (pd->flags & EE_ADDR8)
 		edev->addrlen = ilog2(edev->size);
@@ -496,8 +524,9 @@ static int eeprom_93xx46_probe(struct spi_device *spi)
 	if (IS_ERR(edev->nvmem))
 		return PTR_ERR(edev->nvmem);
 
-	dev_info(&spi->dev, "%d-bit eeprom %s\n",
+	dev_info(&spi->dev, "%d-bit eeprom containing %d bytes %s\n",
 		(pd->flags & EE_ADDR8) ? 8 : 16,
+		edev->size,
 		(pd->flags & EE_READONLY) ? "(readonly)" : "");
 
 	if (!(pd->flags & EE_READONLY)) {
diff --git a/include/linux/eeprom_93xx46.h b/include/linux/eeprom_93xx46.h
index 99580c22f91a..34c2175e6a1e 100644
--- a/include/linux/eeprom_93xx46.h
+++ b/include/linux/eeprom_93xx46.h
@@ -10,6 +10,9 @@ struct eeprom_93xx46_platform_data {
 #define EE_ADDR8	0x01		/*  8 bit addr. cfg */
 #define EE_ADDR16	0x02		/* 16 bit addr. cfg */
 #define EE_READONLY	0x08		/* forbid writing */
+#define EE_SIZE1K	0x10		/* 1 kb of data, that is a 93xx46 */
+#define EE_SIZE2K	0x20		/* 2 kb of data, that is a 93xx56 */
+#define EE_SIZE4K	0x40		/* 4 kb of data, that is a 93xx66 */
 
 	unsigned int	quirks;
 /* Single word read transfers only; no sequential read. */
-- 
cgit v1.2.3


From ea030ca688193462b8d612c1628c37129aa30072 Mon Sep 17 00:00:00 2001
From: Lucas Tanure <tanureal@opensource.cirrus.com>
Date: Wed, 12 May 2021 14:52:22 +0100
Subject: regmap-i2c: Set regmap max raw r/w from quirks

Set regmap raw read/write from i2c quirks max read/write
so regmap_raw_read/write can split the access into chunks

Signed-off-by: Lucas Tanure <tanureal@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20210512135222.223203-1-tanureal@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-i2c.c | 45 +++++++++++++++++++++++++++++++++-------
 drivers/base/regmap/regmap.c     |  2 ++
 include/linux/regmap.h           |  2 ++
 3 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap-i2c.c b/drivers/base/regmap/regmap-i2c.c
index 62b95a9212ae..980e5ce6a3a3 100644
--- a/drivers/base/regmap/regmap-i2c.c
+++ b/drivers/base/regmap/regmap-i2c.c
@@ -306,33 +306,64 @@ static const struct regmap_bus regmap_i2c_smbus_i2c_block_reg16 = {
 static const struct regmap_bus *regmap_get_i2c_bus(struct i2c_client *i2c,
 					const struct regmap_config *config)
 {
+	const struct i2c_adapter_quirks *quirks;
+	const struct regmap_bus *bus = NULL;
+	struct regmap_bus *ret_bus;
+	u16 max_read = 0, max_write = 0;
+
 	if (i2c_check_functionality(i2c->adapter, I2C_FUNC_I2C))
-		return &regmap_i2c;
+		bus = &regmap_i2c;
 	else if (config->val_bits == 8 && config->reg_bits == 8 &&
 		 i2c_check_functionality(i2c->adapter,
 					 I2C_FUNC_SMBUS_I2C_BLOCK))
-		return &regmap_i2c_smbus_i2c_block;
+		bus = &regmap_i2c_smbus_i2c_block;
 	else if (config->val_bits == 8 && config->reg_bits == 16 &&
 		i2c_check_functionality(i2c->adapter,
 					I2C_FUNC_SMBUS_I2C_BLOCK))
-		return &regmap_i2c_smbus_i2c_block_reg16;
+		bus = &regmap_i2c_smbus_i2c_block_reg16;
 	else if (config->val_bits == 16 && config->reg_bits == 8 &&
 		 i2c_check_functionality(i2c->adapter,
 					 I2C_FUNC_SMBUS_WORD_DATA))
 		switch (regmap_get_val_endian(&i2c->dev, NULL, config)) {
 		case REGMAP_ENDIAN_LITTLE:
-			return &regmap_smbus_word;
+			bus = &regmap_smbus_word;
+			break;
 		case REGMAP_ENDIAN_BIG:
-			return &regmap_smbus_word_swapped;
+			bus = &regmap_smbus_word_swapped;
+			break;
 		default:		/* everything else is not supported */
 			break;
 		}
 	else if (config->val_bits == 8 && config->reg_bits == 8 &&
 		 i2c_check_functionality(i2c->adapter,
 					 I2C_FUNC_SMBUS_BYTE_DATA))
-		return &regmap_smbus_byte;
+		bus = &regmap_smbus_byte;
+
+	if (!bus)
+		return ERR_PTR(-ENOTSUPP);
+
+	quirks = i2c->adapter->quirks;
+	if (quirks) {
+		if (quirks->max_read_len &&
+		    (bus->max_raw_read == 0 || bus->max_raw_read > quirks->max_read_len))
+			max_read = quirks->max_read_len;
+
+		if (quirks->max_write_len &&
+		    (bus->max_raw_write == 0 || bus->max_raw_write > quirks->max_write_len))
+			max_write = quirks->max_write_len;
+
+		if (max_read || max_write) {
+			ret_bus = kmemdup(bus, sizeof(*bus), GFP_KERNEL);
+			if (!ret_bus)
+				return ERR_PTR(-ENOMEM);
+			ret_bus->free_on_exit = true;
+			ret_bus->max_raw_read = max_read;
+			ret_bus->max_raw_write = max_write;
+			bus = ret_bus;
+		}
+	}
 
-	return ERR_PTR(-ENOTSUPP);
+	return bus;
 }
 
 struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 297e95be25b3..0d185ec018a5 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1496,6 +1496,8 @@ void regmap_exit(struct regmap *map)
 		mutex_destroy(&map->mutex);
 	kfree_const(map->name);
 	kfree(map->patch);
+	if (map->bus && map->bus->free_on_exit)
+		kfree(map->bus);
 	kfree(map);
 }
 EXPORT_SYMBOL_GPL(regmap_exit);
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index f87a11a5cc4a..8c16e6fa0f66 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -502,6 +502,7 @@ typedef void (*regmap_hw_free_context)(void *context);
  *     DEFAULT, BIG is assumed.
  * @max_raw_read: Max raw read size that can be used on the bus.
  * @max_raw_write: Max raw write size that can be used on the bus.
+ * @free_on_exit: kfree this on exit of regmap
  */
 struct regmap_bus {
 	bool fast_io;
@@ -519,6 +520,7 @@ struct regmap_bus {
 	enum regmap_endian val_format_endian_default;
 	size_t max_raw_read;
 	size_t max_raw_write;
+	bool free_on_exit;
 };
 
 /*
-- 
cgit v1.2.3


From adae1e931acd8b430d31141a283ea06d4b705417 Mon Sep 17 00:00:00 2001
From: Andres Beltran <lkmlabelt@gmail.com>
Date: Thu, 8 Apr 2021 18:14:39 +0200
Subject: Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring
 buffer

Pointers to ring-buffer packets sent by Hyper-V are used within the
guest VM. Hyper-V can send packets with erroneous values or modify
packet fields after they are processed by the guest. To defend
against these scenarios, return a copy of the incoming VMBus packet
after validating its length and offset fields in hv_pkt_iter_first().
In this way, the packet can no longer be modified by the host.

Signed-off-by: Andres Beltran <lkmlabelt@gmail.com>
Co-developed-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20210408161439.341988-1-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel.c              |  9 +++--
 drivers/hv/hv_fcopy.c             |  1 +
 drivers/hv/hv_kvp.c               |  1 +
 drivers/hv/hyperv_vmbus.h         |  2 +-
 drivers/hv/ring_buffer.c          | 82 +++++++++++++++++++++++++++++++++------
 drivers/net/hyperv/hyperv_net.h   |  7 ++++
 drivers/net/hyperv/netvsc.c       |  2 +
 drivers/net/hyperv/rndis_filter.c |  2 +
 drivers/scsi/storvsc_drv.c        | 10 +++++
 include/linux/hyperv.h            | 48 +++++++++++++++++++----
 net/vmw_vsock/hyperv_transport.c  |  4 +-
 11 files changed, 143 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index c2635e913a92..bfbca4eeb773 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -662,12 +662,15 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
 	newchannel->onchannel_callback = onchannelcallback;
 	newchannel->channel_callback_context = context;
 
-	err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages);
+	if (!newchannel->max_pkt_size)
+		newchannel->max_pkt_size = VMBUS_DEFAULT_MAX_PKT_SIZE;
+
+	err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages, 0);
 	if (err)
 		goto error_clean_ring;
 
-	err = hv_ringbuffer_init(&newchannel->inbound,
-				 &page[send_pages], recv_pages);
+	err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages],
+				 recv_pages, newchannel->max_pkt_size);
 	if (err)
 		goto error_clean_ring;
 
diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index 59ce85e00a02..660036da7449 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -349,6 +349,7 @@ int hv_fcopy_init(struct hv_util_service *srv)
 {
 	recv_buffer = srv->recv_buffer;
 	fcopy_transaction.recv_channel = srv->channel;
+	fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2;
 
 	/*
 	 * When this driver loads, the user level daemon that
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index b49962d312ce..c698592b83e4 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -757,6 +757,7 @@ hv_kvp_init(struct hv_util_service *srv)
 {
 	recv_buffer = srv->recv_buffer;
 	kvp_transaction.recv_channel = srv->channel;
+	kvp_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 4;
 
 	/*
 	 * When this driver loads, the user level daemon that
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 9416e09ebd58..42f3d9d123a1 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -174,7 +174,7 @@ extern int hv_synic_cleanup(unsigned int cpu);
 void hv_ringbuffer_pre_init(struct vmbus_channel *channel);
 
 int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
-		       struct page *pages, u32 pagecnt);
+		       struct page *pages, u32 pagecnt, u32 max_pkt_size);
 
 void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
 
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 374f8afbf8a5..e621f8d9b436 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -181,7 +181,7 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
 
 /* Initialize the ring buffer. */
 int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
-		       struct page *pages, u32 page_cnt)
+		       struct page *pages, u32 page_cnt, u32 max_pkt_size)
 {
 	int i;
 	struct page **pages_wraparound;
@@ -223,6 +223,14 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
 		sizeof(struct hv_ring_buffer);
 	ring_info->priv_read_index = 0;
 
+	/* Initialize buffer that holds copies of incoming packets */
+	if (max_pkt_size) {
+		ring_info->pkt_buffer = kzalloc(max_pkt_size, GFP_KERNEL);
+		if (!ring_info->pkt_buffer)
+			return -ENOMEM;
+		ring_info->pkt_buffer_size = max_pkt_size;
+	}
+
 	spin_lock_init(&ring_info->ring_lock);
 
 	return 0;
@@ -235,6 +243,9 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
 	vunmap(ring_info->ring_buffer);
 	ring_info->ring_buffer = NULL;
 	mutex_unlock(&ring_info->ring_buffer_mutex);
+
+	kfree(ring_info->pkt_buffer);
+	ring_info->pkt_buffer_size = 0;
 }
 
 /* Write to the ring buffer. */
@@ -375,7 +386,7 @@ int hv_ringbuffer_read(struct vmbus_channel *channel,
 	memcpy(buffer, (const char *)desc + offset, packetlen);
 
 	/* Advance ring index to next packet descriptor */
-	__hv_pkt_iter_next(channel, desc);
+	__hv_pkt_iter_next(channel, desc, true);
 
 	/* Notify host of update */
 	hv_pkt_iter_close(channel);
@@ -401,6 +412,22 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
 		return (rbi->ring_datasize - priv_read_loc) + write_loc;
 }
 
+/*
+ * Get first vmbus packet without copying it out of the ring buffer
+ */
+struct vmpacket_descriptor *hv_pkt_iter_first_raw(struct vmbus_channel *channel)
+{
+	struct hv_ring_buffer_info *rbi = &channel->inbound;
+
+	hv_debug_delay_test(channel, MESSAGE_DELAY);
+
+	if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
+		return NULL;
+
+	return (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index);
+}
+EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw);
+
 /*
  * Get first vmbus packet from ring buffer after read_index
  *
@@ -409,17 +436,49 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
 struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel)
 {
 	struct hv_ring_buffer_info *rbi = &channel->inbound;
-	struct vmpacket_descriptor *desc;
+	struct vmpacket_descriptor *desc, *desc_copy;
+	u32 bytes_avail, pkt_len, pkt_offset;
 
-	hv_debug_delay_test(channel, MESSAGE_DELAY);
-	if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
+	desc = hv_pkt_iter_first_raw(channel);
+	if (!desc)
 		return NULL;
 
-	desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index;
-	if (desc)
-		prefetch((char *)desc + (desc->len8 << 3));
+	bytes_avail = min(rbi->pkt_buffer_size, hv_pkt_iter_avail(rbi));
+
+	/*
+	 * Ensure the compiler does not use references to incoming Hyper-V values (which
+	 * could change at any moment) when reading local variables later in the code
+	 */
+	pkt_len = READ_ONCE(desc->len8) << 3;
+	pkt_offset = READ_ONCE(desc->offset8) << 3;
+
+	/*
+	 * If pkt_len is invalid, set it to the smaller of hv_pkt_iter_avail() and
+	 * rbi->pkt_buffer_size
+	 */
+	if (pkt_len < sizeof(struct vmpacket_descriptor) || pkt_len > bytes_avail)
+		pkt_len = bytes_avail;
+
+	/*
+	 * If pkt_offset is invalid, arbitrarily set it to
+	 * the size of vmpacket_descriptor
+	 */
+	if (pkt_offset < sizeof(struct vmpacket_descriptor) || pkt_offset > pkt_len)
+		pkt_offset = sizeof(struct vmpacket_descriptor);
+
+	/* Copy the Hyper-V packet out of the ring buffer */
+	desc_copy = (struct vmpacket_descriptor *)rbi->pkt_buffer;
+	memcpy(desc_copy, desc, pkt_len);
+
+	/*
+	 * Hyper-V could still change len8 and offset8 after the earlier read.
+	 * Ensure that desc_copy has legal values for len8 and offset8 that
+	 * are consistent with the copy we just made
+	 */
+	desc_copy->len8 = pkt_len >> 3;
+	desc_copy->offset8 = pkt_offset >> 3;
 
-	return desc;
+	return desc_copy;
 }
 EXPORT_SYMBOL_GPL(hv_pkt_iter_first);
 
@@ -431,7 +490,8 @@ EXPORT_SYMBOL_GPL(hv_pkt_iter_first);
  */
 struct vmpacket_descriptor *
 __hv_pkt_iter_next(struct vmbus_channel *channel,
-		   const struct vmpacket_descriptor *desc)
+		   const struct vmpacket_descriptor *desc,
+		   bool copy)
 {
 	struct hv_ring_buffer_info *rbi = &channel->inbound;
 	u32 packetlen = desc->len8 << 3;
@@ -444,7 +504,7 @@ __hv_pkt_iter_next(struct vmbus_channel *channel,
 		rbi->priv_read_index -= dsize;
 
 	/* more data? */
-	return hv_pkt_iter_first(channel);
+	return copy ? hv_pkt_iter_first(channel) : hv_pkt_iter_first_raw(channel);
 }
 EXPORT_SYMBOL_GPL(__hv_pkt_iter_next);
 
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 442c520ab8f3..b11aa68b44ec 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -895,9 +895,16 @@ static inline u32 netvsc_rqstor_size(unsigned long ringbytes)
 		ringbytes / NETVSC_MIN_IN_MSG_SIZE;
 }
 
+/* XFER PAGE packets can specify a maximum of 375 ranges for NDIS >= 6.0
+ * and a maximum of 64 ranges for NDIS < 6.0 with no RSC; with RSC, this
+ * limit is raised to 562 (= NVSP_RSC_MAX).
+ */
+#define NETVSC_MAX_XFER_PAGE_RANGES NVSP_RSC_MAX
 #define NETVSC_XFER_HEADER_SIZE(rng_cnt) \
 		(offsetof(struct vmtransfer_page_packet_header, ranges) + \
 		(rng_cnt) * sizeof(struct vmtransfer_page_range))
+#define NETVSC_MAX_PKT_SIZE (NETVSC_XFER_HEADER_SIZE(NETVSC_MAX_XFER_PAGE_RANGES) + \
+		sizeof(struct nvsp_message) + (sizeof(u32) * VRSS_SEND_TAB_SIZE))
 
 struct multi_send_data {
 	struct sk_buff *skb; /* skb containing the pkt */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 9d07c9ce4be2..067077138e52 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1650,6 +1650,8 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 
 	/* Open the channel */
 	device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
+	device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE;
+
 	ret = vmbus_open(device->channel, netvsc_ring_bytes,
 			 netvsc_ring_bytes,  NULL, 0,
 			 netvsc_channel_cb, net_device->chan_table);
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index c0e89e107d57..d7ff9ddcbae2 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1260,6 +1260,8 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 	nvchan->channel = new_sc;
 
 	new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
+	new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE;
+
 	ret = vmbus_open(new_sc, netvsc_ring_bytes,
 			 netvsc_ring_bytes, NULL, 0,
 			 netvsc_channel_cb, nvchan);
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index e6718a74e5da..07149fa72b68 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -406,6 +406,14 @@ static void storvsc_on_channel_callback(void *context);
 #define STORVSC_IDE_MAX_TARGETS				1
 #define STORVSC_IDE_MAX_CHANNELS			1
 
+/*
+ * Upper bound on the size of a storvsc packet. vmscsi_size_delta is not
+ * included in the calculation because it is set after STORVSC_MAX_PKT_SIZE
+ * is used in storvsc_connect_to_vsp
+ */
+#define STORVSC_MAX_PKT_SIZE (sizeof(struct vmpacket_descriptor) +\
+			      sizeof(struct vstor_packet))
+
 struct storvsc_cmd_request {
 	struct scsi_cmnd *cmd;
 
@@ -701,6 +709,7 @@ static void handle_sc_creation(struct vmbus_channel *new_sc)
 		return;
 
 	memset(&props, 0, sizeof(struct vmstorage_channel_properties));
+	new_sc->max_pkt_size = STORVSC_MAX_PKT_SIZE;
 
 	/*
 	 * The size of vmbus_requestor is an upper bound on the number of requests
@@ -1294,6 +1303,7 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size,
 
 	memset(&props, 0, sizeof(struct vmstorage_channel_properties));
 
+	device->channel->max_pkt_size = STORVSC_MAX_PKT_SIZE;
 	/*
 	 * The size of vmbus_requestor is an upper bound on the number of requests
 	 * that can be in-progress at any one time across all channels.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index d1e59dbef1dd..3932446f215f 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -181,6 +181,10 @@ struct hv_ring_buffer_info {
 	 * being freed while the ring buffer is being accessed.
 	 */
 	struct mutex ring_buffer_mutex;
+
+	/* Buffer that holds a copy of an incoming host packet */
+	void *pkt_buffer;
+	u32 pkt_buffer_size;
 };
 
 
@@ -799,6 +803,8 @@ struct vmbus_device {
 	bool allowed_in_isolated;
 };
 
+#define VMBUS_DEFAULT_MAX_PKT_SIZE 4096
+
 struct vmbus_channel {
 	struct list_head listentry;
 
@@ -1021,6 +1027,9 @@ struct vmbus_channel {
 	/* request/transaction ids for VMBus */
 	struct vmbus_requestor requestor;
 	u32 rqstor_size;
+
+	/* The max size of a packet on this channel */
+	u32 max_pkt_size;
 };
 
 u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr);
@@ -1662,32 +1671,55 @@ static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc)
 }
 
 
+struct vmpacket_descriptor *
+hv_pkt_iter_first_raw(struct vmbus_channel *channel);
+
 struct vmpacket_descriptor *
 hv_pkt_iter_first(struct vmbus_channel *channel);
 
 struct vmpacket_descriptor *
 __hv_pkt_iter_next(struct vmbus_channel *channel,
-		   const struct vmpacket_descriptor *pkt);
+		   const struct vmpacket_descriptor *pkt,
+		   bool copy);
 
 void hv_pkt_iter_close(struct vmbus_channel *channel);
 
-/*
- * Get next packet descriptor from iterator
- * If at end of list, return NULL and update host.
- */
 static inline struct vmpacket_descriptor *
-hv_pkt_iter_next(struct vmbus_channel *channel,
-		 const struct vmpacket_descriptor *pkt)
+hv_pkt_iter_next_pkt(struct vmbus_channel *channel,
+		     const struct vmpacket_descriptor *pkt,
+		     bool copy)
 {
 	struct vmpacket_descriptor *nxt;
 
-	nxt = __hv_pkt_iter_next(channel, pkt);
+	nxt = __hv_pkt_iter_next(channel, pkt, copy);
 	if (!nxt)
 		hv_pkt_iter_close(channel);
 
 	return nxt;
 }
 
+/*
+ * Get next packet descriptor without copying it out of the ring buffer
+ * If at end of list, return NULL and update host.
+ */
+static inline struct vmpacket_descriptor *
+hv_pkt_iter_next_raw(struct vmbus_channel *channel,
+		     const struct vmpacket_descriptor *pkt)
+{
+	return hv_pkt_iter_next_pkt(channel, pkt, false);
+}
+
+/*
+ * Get next packet descriptor from iterator
+ * If at end of list, return NULL and update host.
+ */
+static inline struct vmpacket_descriptor *
+hv_pkt_iter_next(struct vmbus_channel *channel,
+		 const struct vmpacket_descriptor *pkt)
+{
+	return hv_pkt_iter_next_pkt(channel, pkt, true);
+}
+
 #define foreach_vmbus_pkt(pkt, channel) \
 	for (pkt = hv_pkt_iter_first(channel); pkt; \
 	    pkt = hv_pkt_iter_next(channel, pkt))
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index cc3bae2659e7..19189cf30a72 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -596,7 +596,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
 		return -EOPNOTSUPP;
 
 	if (need_refill) {
-		hvs->recv_desc = hv_pkt_iter_first(hvs->chan);
+		hvs->recv_desc = hv_pkt_iter_first_raw(hvs->chan);
 		ret = hvs_update_recv_data(hvs);
 		if (ret)
 			return ret;
@@ -610,7 +610,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
 
 	hvs->recv_data_len -= to_read;
 	if (hvs->recv_data_len == 0) {
-		hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc);
+		hvs->recv_desc = hv_pkt_iter_next_raw(hvs->chan, hvs->recv_desc);
 		if (hvs->recv_desc) {
 			ret = hvs_update_recv_data(hvs);
 			if (ret)
-- 
cgit v1.2.3


From bf5fd8cae3c8f0d1e6f71a076e0ce2bd17645d0b Mon Sep 17 00:00:00 2001
From: "Andrea Parri (Microsoft)" <parri.andrea@gmail.com>
Date: Mon, 10 May 2021 23:08:41 +0200
Subject: scsi: storvsc: Use blk_mq_unique_tag() to generate requestIDs

Use blk_mq_unique_tag() to generate requestIDs for StorVSC, avoiding
all issues with allocating enough entries in the VMbus requestor.

Suggested-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20210510210841.370472-1-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel.c              | 14 +++---
 drivers/hv/ring_buffer.c          | 13 +++---
 drivers/net/hyperv/netvsc.c       |  8 ++--
 drivers/net/hyperv/rndis_filter.c |  2 +
 drivers/scsi/storvsc_drv.c        | 94 ++++++++++++++++++++++++++-------------
 include/linux/hyperv.h            | 13 +++++-
 6 files changed, 95 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index bfbca4eeb773..f3761c73b074 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -1189,15 +1189,14 @@ EXPORT_SYMBOL_GPL(vmbus_recvpacket_raw);
  * vmbus_next_request_id - Returns a new request id. It is also
  * the index at which the guest memory address is stored.
  * Uses a spin lock to avoid race conditions.
- * @rqstor: Pointer to the requestor struct
+ * @channel: Pointer to the VMbus channel struct
  * @rqst_add: Guest memory address to be stored in the array
  */
-u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr)
+u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr)
 {
+	struct vmbus_requestor *rqstor = &channel->requestor;
 	unsigned long flags;
 	u64 current_id;
-	const struct vmbus_channel *channel =
-		container_of(rqstor, const struct vmbus_channel, requestor);
 
 	/* Check rqstor has been initialized */
 	if (!channel->rqstor_size)
@@ -1231,16 +1230,15 @@ EXPORT_SYMBOL_GPL(vmbus_next_request_id);
 /*
  * vmbus_request_addr - Returns the memory address stored at @trans_id
  * in @rqstor. Uses a spin lock to avoid race conditions.
- * @rqstor: Pointer to the requestor struct
+ * @channel: Pointer to the VMbus channel struct
  * @trans_id: Request id sent back from Hyper-V. Becomes the requestor's
  * next request id.
  */
-u64 vmbus_request_addr(struct vmbus_requestor *rqstor, u64 trans_id)
+u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id)
 {
+	struct vmbus_requestor *rqstor = &channel->requestor;
 	unsigned long flags;
 	u64 req_addr;
-	const struct vmbus_channel *channel =
-		container_of(rqstor, const struct vmbus_channel, requestor);
 
 	/* Check rqstor has been initialized */
 	if (!channel->rqstor_size)
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index e621f8d9b436..2aee356840a2 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -312,10 +312,12 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
 	 */
 
 	if (desc->flags == VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED) {
-		rqst_id = vmbus_next_request_id(&channel->requestor, requestid);
-		if (rqst_id == VMBUS_RQST_ERROR) {
-			spin_unlock_irqrestore(&outring_info->ring_lock, flags);
-			return -EAGAIN;
+		if (channel->next_request_id_callback != NULL) {
+			rqst_id = channel->next_request_id_callback(channel, requestid);
+			if (rqst_id == VMBUS_RQST_ERROR) {
+				spin_unlock_irqrestore(&outring_info->ring_lock, flags);
+				return -EAGAIN;
+			}
 		}
 	}
 	desc = hv_get_ring_buffer(outring_info) + old_write;
@@ -343,7 +345,8 @@ int hv_ringbuffer_write(struct vmbus_channel *channel,
 	if (channel->rescind) {
 		if (rqst_id != VMBUS_NO_RQSTOR) {
 			/* Reclaim request ID to avoid leak of IDs */
-			vmbus_request_addr(&channel->requestor, rqst_id);
+			if (channel->request_addr_callback != NULL)
+				channel->request_addr_callback(channel, rqst_id);
 		}
 		return -ENODEV;
 	}
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 067077138e52..7bd935412853 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -757,7 +757,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
 	int queue_sends;
 	u64 cmd_rqst;
 
-	cmd_rqst = vmbus_request_addr(&channel->requestor, (u64)desc->trans_id);
+	cmd_rqst = channel->request_addr_callback(channel, (u64)desc->trans_id);
 	if (cmd_rqst == VMBUS_RQST_ERROR) {
 		netdev_err(ndev, "Incorrect transaction id\n");
 		return;
@@ -817,8 +817,8 @@ static void netvsc_send_completion(struct net_device *ndev,
 
 	/* First check if this is a VMBUS completion without data payload */
 	if (!msglen) {
-		cmd_rqst = vmbus_request_addr(&incoming_channel->requestor,
-					      (u64)desc->trans_id);
+		cmd_rqst = incoming_channel->request_addr_callback(incoming_channel,
+								   (u64)desc->trans_id);
 		if (cmd_rqst == VMBUS_RQST_ERROR) {
 			netdev_err(ndev, "Invalid transaction id\n");
 			return;
@@ -1649,6 +1649,8 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 		       netvsc_poll, NAPI_POLL_WEIGHT);
 
 	/* Open the channel */
+	device->channel->next_request_id_callback = vmbus_next_request_id;
+	device->channel->request_addr_callback = vmbus_request_addr;
 	device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
 	device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE;
 
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index d7ff9ddcbae2..983bf362466a 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1259,6 +1259,8 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 	/* Set the channel before opening.*/
 	nvchan->channel = new_sc;
 
+	new_sc->next_request_id_callback = vmbus_next_request_id;
+	new_sc->request_addr_callback = vmbus_request_addr;
 	new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
 	new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE;
 
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 07149fa72b68..403753929320 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -696,6 +696,23 @@ old_is_alloced:
 	spin_unlock_irqrestore(&stor_device->lock, flags);
 }
 
+static u64 storvsc_next_request_id(struct vmbus_channel *channel, u64 rqst_addr)
+{
+	struct storvsc_cmd_request *request =
+		(struct storvsc_cmd_request *)(unsigned long)rqst_addr;
+
+	if (rqst_addr == VMBUS_RQST_INIT)
+		return VMBUS_RQST_INIT;
+	if (rqst_addr == VMBUS_RQST_RESET)
+		return VMBUS_RQST_RESET;
+
+	/*
+	 * Cannot return an ID of 0, which is reserved for an unsolicited
+	 * message from Hyper-V.
+	 */
+	return (u64)blk_mq_unique_tag(request->cmd->request) + 1;
+}
+
 static void handle_sc_creation(struct vmbus_channel *new_sc)
 {
 	struct hv_device *device = new_sc->primary_channel->device_obj;
@@ -711,11 +728,7 @@ static void handle_sc_creation(struct vmbus_channel *new_sc)
 	memset(&props, 0, sizeof(struct vmstorage_channel_properties));
 	new_sc->max_pkt_size = STORVSC_MAX_PKT_SIZE;
 
-	/*
-	 * The size of vmbus_requestor is an upper bound on the number of requests
-	 * that can be in-progress at any one time across all channels.
-	 */
-	new_sc->rqstor_size = scsi_driver.can_queue;
+	new_sc->next_request_id_callback = storvsc_next_request_id;
 
 	ret = vmbus_open(new_sc,
 			 storvsc_ringbuffer_size,
@@ -782,7 +795,7 @@ static void  handle_multichannel_storage(struct hv_device *device, int max_chns)
 	ret = vmbus_sendpacket(device->channel, vstor_packet,
 			       (sizeof(struct vstor_packet) -
 			       stor_device->vmscsi_size_delta),
-			       (unsigned long)request,
+			       VMBUS_RQST_INIT,
 			       VM_PKT_DATA_INBAND,
 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 
@@ -851,7 +864,7 @@ static int storvsc_execute_vstor_op(struct hv_device *device,
 	ret = vmbus_sendpacket(device->channel, vstor_packet,
 			       (sizeof(struct vstor_packet) -
 			       stor_device->vmscsi_size_delta),
-			       (unsigned long)request,
+			       VMBUS_RQST_INIT,
 			       VM_PKT_DATA_INBAND,
 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	if (ret != 0)
@@ -1253,6 +1266,7 @@ static void storvsc_on_channel_callback(void *context)
 	const struct vmpacket_descriptor *desc;
 	struct hv_device *device;
 	struct storvsc_device *stor_device;
+	struct Scsi_Host *shost;
 
 	if (channel->primary_channel != NULL)
 		device = channel->primary_channel->device_obj;
@@ -1263,20 +1277,12 @@ static void storvsc_on_channel_callback(void *context)
 	if (!stor_device)
 		return;
 
-	foreach_vmbus_pkt(desc, channel) {
-		void *packet = hv_pkt_data(desc);
-		struct storvsc_cmd_request *request;
-		u64 cmd_rqst;
-
-		cmd_rqst = vmbus_request_addr(&channel->requestor,
-					      desc->trans_id);
-		if (cmd_rqst == VMBUS_RQST_ERROR) {
-			dev_err(&device->device,
-				"Incorrect transaction id\n");
-			continue;
-		}
+	shost = stor_device->host;
 
-		request = (struct storvsc_cmd_request *)(unsigned long)cmd_rqst;
+	foreach_vmbus_pkt(desc, channel) {
+		struct vstor_packet *packet = hv_pkt_data(desc);
+		struct storvsc_cmd_request *request = NULL;
+		u64 rqst_id = desc->trans_id;
 
 		if (hv_pkt_datalen(desc) < sizeof(struct vstor_packet) -
 				stor_device->vmscsi_size_delta) {
@@ -1284,14 +1290,44 @@ static void storvsc_on_channel_callback(void *context)
 			continue;
 		}
 
-		if (request == &stor_device->init_request ||
-		    request == &stor_device->reset_request) {
-			memcpy(&request->vstor_packet, packet,
-			       (sizeof(struct vstor_packet) - stor_device->vmscsi_size_delta));
-			complete(&request->wait_event);
+		if (rqst_id == VMBUS_RQST_INIT) {
+			request = &stor_device->init_request;
+		} else if (rqst_id == VMBUS_RQST_RESET) {
+			request = &stor_device->reset_request;
 		} else {
+			/* Hyper-V can send an unsolicited message with ID of 0 */
+			if (rqst_id == 0) {
+				/*
+				 * storvsc_on_receive() looks at the vstor_packet in the message
+				 * from the ring buffer.  If the operation in the vstor_packet is
+				 * COMPLETE_IO, then we call storvsc_on_io_completion(), and
+				 * dereference the guest memory address.  Make sure we don't call
+				 * storvsc_on_io_completion() with a guest memory address that is
+				 * zero if Hyper-V were to construct and send such a bogus packet.
+				 */
+				if (packet->operation == VSTOR_OPERATION_COMPLETE_IO) {
+					dev_err(&device->device, "Invalid packet with ID of 0\n");
+					continue;
+				}
+			} else {
+				struct scsi_cmnd *scmnd;
+
+				/* Transaction 'rqst_id' corresponds to tag 'rqst_id - 1' */
+				scmnd = scsi_host_find_tag(shost, rqst_id - 1);
+				if (scmnd == NULL) {
+					dev_err(&device->device, "Incorrect transaction ID\n");
+					continue;
+				}
+				request = (struct storvsc_cmd_request *)scsi_cmd_priv(scmnd);
+			}
+
 			storvsc_on_receive(stor_device, packet, request);
+			continue;
 		}
+
+		memcpy(&request->vstor_packet, packet,
+		       (sizeof(struct vstor_packet) - stor_device->vmscsi_size_delta));
+		complete(&request->wait_event);
 	}
 }
 
@@ -1304,11 +1340,7 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size,
 	memset(&props, 0, sizeof(struct vmstorage_channel_properties));
 
 	device->channel->max_pkt_size = STORVSC_MAX_PKT_SIZE;
-	/*
-	 * The size of vmbus_requestor is an upper bound on the number of requests
-	 * that can be in-progress at any one time across all channels.
-	 */
-	device->channel->rqstor_size = scsi_driver.can_queue;
+	device->channel->next_request_id_callback = storvsc_next_request_id;
 
 	ret = vmbus_open(device->channel,
 			 ring_size,
@@ -1634,7 +1666,7 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd)
 	ret = vmbus_sendpacket(device->channel, vstor_packet,
 			       (sizeof(struct vstor_packet) -
 				stor_device->vmscsi_size_delta),
-			       (unsigned long)&stor_device->reset_request,
+			       VMBUS_RQST_RESET,
 			       VM_PKT_DATA_INBAND,
 			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 	if (ret != 0)
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 3932446f215f..2e859d2f9609 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -794,7 +794,11 @@ struct vmbus_requestor {
 
 #define VMBUS_NO_RQSTOR U64_MAX
 #define VMBUS_RQST_ERROR (U64_MAX - 1)
+/* NetVSC-specific */
 #define VMBUS_RQST_ID_NO_RESPONSE (U64_MAX - 2)
+/* StorVSC-specific */
+#define VMBUS_RQST_INIT (U64_MAX - 2)
+#define VMBUS_RQST_RESET (U64_MAX - 3)
 
 struct vmbus_device {
 	u16  dev_type;
@@ -1024,6 +1028,11 @@ struct vmbus_channel {
 	u32 fuzz_testing_interrupt_delay;
 	u32 fuzz_testing_message_delay;
 
+	/* callback to generate a request ID from a request address */
+	u64 (*next_request_id_callback)(struct vmbus_channel *channel, u64 rqst_addr);
+	/* callback to retrieve a request address from a request ID */
+	u64 (*request_addr_callback)(struct vmbus_channel *channel, u64 rqst_id);
+
 	/* request/transaction ids for VMBus */
 	struct vmbus_requestor requestor;
 	u32 rqstor_size;
@@ -1032,8 +1041,8 @@ struct vmbus_channel {
 	u32 max_pkt_size;
 };
 
-u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr);
-u64 vmbus_request_addr(struct vmbus_requestor *rqstor, u64 trans_id);
+u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr);
+u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id);
 
 static inline bool is_hvsock_channel(const struct vmbus_channel *c)
 {
-- 
cgit v1.2.3


From 869cbeef18e5c4370157e733b947d44f37441ea9 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Wed, 12 May 2021 16:32:10 +0200
Subject: lsm_audit,selinux: pass IB device name by reference

While trying to address a Coverity warning that the dev_name string
might end up unterminated when strcpy'ing it in
selinux_ib_endport_manage_subnet(), I realized that it is possible (and
simpler) to just pass the dev_name pointer directly, rather than copying
the string to a buffer.

The ibendport variable goes out of scope at the end of the function
anyway, so the lifetime of the dev_name pointer will never be shorter
than that of ibendport, thus we can safely just pass the dev_name
pointer and be done with it.

Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Acked-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_audit.h | 8 ++++----
 security/selinux/hooks.c  | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index cd23355d2271..17d02eda9538 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -48,13 +48,13 @@ struct lsm_ioctlop_audit {
 };
 
 struct lsm_ibpkey_audit {
-	u64	subnet_prefix;
-	u16	pkey;
+	u64 subnet_prefix;
+	u16 pkey;
 };
 
 struct lsm_ibendport_audit {
-	char	dev_name[IB_DEVICE_NAME_MAX];
-	u8	port;
+	const char *dev_name;
+	u8 port;
 };
 
 /* Auxiliary data to use in generating the audit record. */
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index eaea837d89d1..fc6a3ab7e179 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6850,7 +6850,7 @@ static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name,
 		return err;
 
 	ad.type = LSM_AUDIT_DATA_IBENDPORT;
-	strncpy(ibendport.dev_name, dev_name, sizeof(ibendport.dev_name));
+	ibendport.dev_name = dev_name;
 	ibendport.port = port_num;
 	ad.u.ibendport = &ibendport;
 	return avc_has_perm(&selinux_state,
-- 
cgit v1.2.3


From 0cfe5a6e758fb20be8ad3e8f10cb087cc8033eeb Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 1 Apr 2021 17:41:04 +0200
Subject: gpu: host1x: Split up client initalization and registration

In some cases we may need to initialize the host1x client first before
registering it. This commit adds a new helper that will do nothing but
the initialization of the data structure.

At the same time, the initialization is removed from the registration
function. Note, however, that for simplicity we explicitly initialize
the client when the host1x_client_register() function is called, as
opposed to the low-level __host1x_client_register() function. This
allows existing callers to remain unchanged.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/bus.c | 30 ++++++++++++++++++++++++------
 include/linux/host1x.h   | 30 ++++++++++++++++++++++++------
 2 files changed, 48 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index 46f69c532b6b..218e3718fd68 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -735,6 +735,29 @@ void host1x_driver_unregister(struct host1x_driver *driver)
 }
 EXPORT_SYMBOL(host1x_driver_unregister);
 
+/**
+ * __host1x_client_init() - initialize a host1x client
+ * @client: host1x client
+ * @key: lock class key for the client-specific mutex
+ */
+void __host1x_client_init(struct host1x_client *client, struct lock_class_key *key)
+{
+	INIT_LIST_HEAD(&client->list);
+	__mutex_init(&client->lock, "host1x client lock", key);
+	client->usecount = 0;
+}
+EXPORT_SYMBOL(__host1x_client_init);
+
+/**
+ * host1x_client_exit() - uninitialize a host1x client
+ * @client: host1x client
+ */
+void host1x_client_exit(struct host1x_client *client)
+{
+	mutex_destroy(&client->lock);
+}
+EXPORT_SYMBOL(host1x_client_exit);
+
 /**
  * __host1x_client_register() - register a host1x client
  * @client: host1x client
@@ -747,16 +770,11 @@ EXPORT_SYMBOL(host1x_driver_unregister);
  * device and call host1x_device_init(), which will in turn call each client's
  * &host1x_client_ops.init implementation.
  */
-int __host1x_client_register(struct host1x_client *client,
-			     struct lock_class_key *key)
+int __host1x_client_register(struct host1x_client *client)
 {
 	struct host1x *host1x;
 	int err;
 
-	INIT_LIST_HEAD(&client->list);
-	__mutex_init(&client->lock, "host1x client lock", key);
-	client->usecount = 0;
-
 	mutex_lock(&devices_lock);
 
 	list_for_each_entry(host1x, &devices, list) {
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 232e1bd507a7..9b0487c88571 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -332,12 +332,30 @@ static inline struct host1x_device *to_host1x_device(struct device *dev)
 int host1x_device_init(struct host1x_device *device);
 int host1x_device_exit(struct host1x_device *device);
 
-int __host1x_client_register(struct host1x_client *client,
-			     struct lock_class_key *key);
-#define host1x_client_register(class) \
-	({ \
-		static struct lock_class_key __key; \
-		__host1x_client_register(class, &__key); \
+void __host1x_client_init(struct host1x_client *client, struct lock_class_key *key);
+void host1x_client_exit(struct host1x_client *client);
+
+#define host1x_client_init(client)			\
+	({						\
+		static struct lock_class_key __key;	\
+		__host1x_client_init(client, &__key);	\
+	})
+
+int __host1x_client_register(struct host1x_client *client);
+
+/*
+ * Note that this wrapper calls __host1x_client_init() for compatibility
+ * with existing callers. Callers that want to separately initialize and
+ * register a host1x client must first initialize using either of the
+ * __host1x_client_init() or host1x_client_init() functions and then use
+ * the low-level __host1x_client_register() function to avoid the client
+ * getting reinitialized.
+ */
+#define host1x_client_register(client)			\
+	({						\
+		static struct lock_class_key __key;	\
+		__host1x_client_init(client, &__key);	\
+		__host1x_client_register(client);	\
 	})
 
 int host1x_client_unregister(struct host1x_client *client);
-- 
cgit v1.2.3


From 803f4e1eab7a8938ba3a3c30dd4eb5e9eeef5e63 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 8 May 2021 00:07:57 +0200
Subject: asm-generic: simplify asm/unaligned.h

The get_unaligned()/put_unaligned() implementations are much more complex
than necessary, now that all architectures use the same code.

Move everything into one file and use a much more compact way to express
the same logic.

I've compared the binary output using gcc-11 across defconfig builds for
all architectures and found this patch to make no difference, except for
a single function on powerpc that needs two additional register moves
because of random differences in register allocation.

There are a handful of callers of the low-level __get_unaligned_cpu32,
so leave that in place for the time being even though the common code
no longer uses it.

This adds a warning for any caller of get_unaligned()/put_unaligned()
that passes in a single-byte pointer, but I've sent patches for all
instances that show up in x86 and randconfig builds. It would be nice
to change the arguments of the endian-specific accessors to take the
matching __be16/__be32/__be64/__le16/__le32/__le64 arguments instead of
a void pointer, but that requires more changes to the rest of the kernel.

This new version does allow aggregate types into get_unaligned(), which
was not the original goal but might come in handy.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/unaligned.h     | 130 ++++++++++++++++++++++++++++++++----
 include/linux/unaligned/be_struct.h |  67 -------------------
 include/linux/unaligned/generic.h   | 115 -------------------------------
 include/linux/unaligned/le_struct.h |  67 -------------------
 4 files changed, 117 insertions(+), 262 deletions(-)
 delete mode 100644 include/linux/unaligned/be_struct.h
 delete mode 100644 include/linux/unaligned/generic.h
 delete mode 100644 include/linux/unaligned/le_struct.h

(limited to 'include/linux')

diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h
index 36bf03aaa674..1c4242416c9f 100644
--- a/include/asm-generic/unaligned.h
+++ b/include/asm-generic/unaligned.h
@@ -6,20 +6,124 @@
  * This is the most generic implementation of unaligned accesses
  * and should work almost anywhere.
  */
+#include <linux/unaligned/packed_struct.h>
 #include <asm/byteorder.h>
 
-#if defined(__LITTLE_ENDIAN)
-# include <linux/unaligned/le_struct.h>
-# include <linux/unaligned/generic.h>
-# define get_unaligned	__get_unaligned_le
-# define put_unaligned	__put_unaligned_le
-#elif defined(__BIG_ENDIAN)
-# include <linux/unaligned/be_struct.h>
-# include <linux/unaligned/generic.h>
-# define get_unaligned	__get_unaligned_be
-# define put_unaligned	__put_unaligned_be
-#else
-# error need to define endianess
-#endif
+#define __get_unaligned_t(type, ptr) ({						\
+	const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr);	\
+	__pptr->x;								\
+})
+
+#define __put_unaligned_t(type, val, ptr) do {					\
+	struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr);		\
+	__pptr->x = (val);							\
+} while (0)
+
+#define get_unaligned(ptr)	__get_unaligned_t(typeof(*(ptr)), (ptr))
+#define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr))
+
+static inline u16 get_unaligned_le16(const void *p)
+{
+	return le16_to_cpu(__get_unaligned_t(__le16, p));
+}
+
+static inline u32 get_unaligned_le32(const void *p)
+{
+	return le32_to_cpu(__get_unaligned_t(__le32, p));
+}
+
+static inline u64 get_unaligned_le64(const void *p)
+{
+	return le64_to_cpu(__get_unaligned_t(__le64, p));
+}
+
+static inline void put_unaligned_le16(u16 val, void *p)
+{
+	__put_unaligned_t(__le16, cpu_to_le16(val), p);
+}
+
+static inline void put_unaligned_le32(u32 val, void *p)
+{
+	__put_unaligned_t(__le32, cpu_to_le32(val), p);
+}
+
+static inline void put_unaligned_le64(u64 val, void *p)
+{
+	__put_unaligned_t(__le64, cpu_to_le64(val), p);
+}
+
+static inline u16 get_unaligned_be16(const void *p)
+{
+	return be16_to_cpu(__get_unaligned_t(__be16, p));
+}
+
+static inline u32 get_unaligned_be32(const void *p)
+{
+	return be32_to_cpu(__get_unaligned_t(__be32, p));
+}
+
+static inline u64 get_unaligned_be64(const void *p)
+{
+	return be64_to_cpu(__get_unaligned_t(__be64, p));
+}
+
+static inline void put_unaligned_be16(u16 val, void *p)
+{
+	__put_unaligned_t(__be16, cpu_to_be16(val), p);
+}
+
+static inline void put_unaligned_be32(u32 val, void *p)
+{
+	__put_unaligned_t(__be32, cpu_to_be32(val), p);
+}
+
+static inline void put_unaligned_be64(u64 val, void *p)
+{
+	__put_unaligned_t(__be64, cpu_to_be64(val), p);
+}
+
+static inline u32 __get_unaligned_be24(const u8 *p)
+{
+	return p[0] << 16 | p[1] << 8 | p[2];
+}
+
+static inline u32 get_unaligned_be24(const void *p)
+{
+	return __get_unaligned_be24(p);
+}
+
+static inline u32 __get_unaligned_le24(const u8 *p)
+{
+	return p[0] | p[1] << 8 | p[2] << 16;
+}
+
+static inline u32 get_unaligned_le24(const void *p)
+{
+	return __get_unaligned_le24(p);
+}
+
+static inline void __put_unaligned_be24(const u32 val, u8 *p)
+{
+	*p++ = val >> 16;
+	*p++ = val >> 8;
+	*p++ = val;
+}
+
+static inline void put_unaligned_be24(const u32 val, void *p)
+{
+	__put_unaligned_be24(val, p);
+}
+
+static inline void __put_unaligned_le24(const u32 val, u8 *p)
+{
+	*p++ = val;
+	*p++ = val >> 8;
+	*p++ = val >> 16;
+}
+
+static inline void put_unaligned_le24(const u32 val, void *p)
+{
+	__put_unaligned_le24(val, p);
+}
 
 #endif /* __ASM_GENERIC_UNALIGNED_H */
diff --git a/include/linux/unaligned/be_struct.h b/include/linux/unaligned/be_struct.h
deleted file mode 100644
index 76d9fe297c33..000000000000
--- a/include/linux/unaligned/be_struct.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_BE_STRUCT_H
-#define _LINUX_UNALIGNED_BE_STRUCT_H
-
-#include <linux/unaligned/packed_struct.h>
-
-static inline u16 get_unaligned_be16(const void *p)
-{
-	return __get_unaligned_cpu16((const u8 *)p);
-}
-
-static inline u32 get_unaligned_be32(const void *p)
-{
-	return __get_unaligned_cpu32((const u8 *)p);
-}
-
-static inline u64 get_unaligned_be64(const void *p)
-{
-	return __get_unaligned_cpu64((const u8 *)p);
-}
-
-static inline void put_unaligned_be16(u16 val, void *p)
-{
-	__put_unaligned_cpu16(val, p);
-}
-
-static inline void put_unaligned_be32(u32 val, void *p)
-{
-	__put_unaligned_cpu32(val, p);
-}
-
-static inline void put_unaligned_be64(u64 val, void *p)
-{
-	__put_unaligned_cpu64(val, p);
-}
-
-static inline u16 get_unaligned_le16(const void *p)
-{
-	return swab16(__get_unaligned_cpu16((const u8 *)p));
-}
-
-static inline u32 get_unaligned_le32(const void *p)
-{
-	return swab32(__get_unaligned_cpu32((const u8 *)p));
-}
-
-static inline u64 get_unaligned_le64(const void *p)
-{
-	return swab64(__get_unaligned_cpu64((const u8 *)p));
-}
-
-static inline void put_unaligned_le16(u16 val, void *p)
-{
-	__put_unaligned_cpu16(swab16(val), p);
-}
-
-static inline void put_unaligned_le32(u32 val, void *p)
-{
-	__put_unaligned_cpu32(swab32(val), p);
-}
-
-static inline void put_unaligned_le64(u64 val, void *p)
-{
-	__put_unaligned_cpu64(swab64(val), p);
-}
-
-#endif /* _LINUX_UNALIGNED_BE_STRUCT_H */
diff --git a/include/linux/unaligned/generic.h b/include/linux/unaligned/generic.h
deleted file mode 100644
index 303289492859..000000000000
--- a/include/linux/unaligned/generic.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_GENERIC_H
-#define _LINUX_UNALIGNED_GENERIC_H
-
-#include <linux/types.h>
-
-/*
- * Cause a link-time error if we try an unaligned access other than
- * 1,2,4 or 8 bytes long
- */
-extern void __bad_unaligned_access_size(void);
-
-#define __get_unaligned_le(ptr) ((__force typeof(*(ptr)))({			\
-	__builtin_choose_expr(sizeof(*(ptr)) == 1, *(ptr),			\
-	__builtin_choose_expr(sizeof(*(ptr)) == 2, get_unaligned_le16((ptr)),	\
-	__builtin_choose_expr(sizeof(*(ptr)) == 4, get_unaligned_le32((ptr)),	\
-	__builtin_choose_expr(sizeof(*(ptr)) == 8, get_unaligned_le64((ptr)),	\
-	__bad_unaligned_access_size()))));					\
-	}))
-
-#define __get_unaligned_be(ptr) ((__force typeof(*(ptr)))({			\
-	__builtin_choose_expr(sizeof(*(ptr)) == 1, *(ptr),			\
-	__builtin_choose_expr(sizeof(*(ptr)) == 2, get_unaligned_be16((ptr)),	\
-	__builtin_choose_expr(sizeof(*(ptr)) == 4, get_unaligned_be32((ptr)),	\
-	__builtin_choose_expr(sizeof(*(ptr)) == 8, get_unaligned_be64((ptr)),	\
-	__bad_unaligned_access_size()))));					\
-	}))
-
-#define __put_unaligned_le(val, ptr) ({					\
-	void *__gu_p = (ptr);						\
-	switch (sizeof(*(ptr))) {					\
-	case 1:								\
-		*(u8 *)__gu_p = (__force u8)(val);			\
-		break;							\
-	case 2:								\
-		put_unaligned_le16((__force u16)(val), __gu_p);		\
-		break;							\
-	case 4:								\
-		put_unaligned_le32((__force u32)(val), __gu_p);		\
-		break;							\
-	case 8:								\
-		put_unaligned_le64((__force u64)(val), __gu_p);		\
-		break;							\
-	default:							\
-		__bad_unaligned_access_size();				\
-		break;							\
-	}								\
-	(void)0; })
-
-#define __put_unaligned_be(val, ptr) ({					\
-	void *__gu_p = (ptr);						\
-	switch (sizeof(*(ptr))) {					\
-	case 1:								\
-		*(u8 *)__gu_p = (__force u8)(val);			\
-		break;							\
-	case 2:								\
-		put_unaligned_be16((__force u16)(val), __gu_p);		\
-		break;							\
-	case 4:								\
-		put_unaligned_be32((__force u32)(val), __gu_p);		\
-		break;							\
-	case 8:								\
-		put_unaligned_be64((__force u64)(val), __gu_p);		\
-		break;							\
-	default:							\
-		__bad_unaligned_access_size();				\
-		break;							\
-	}								\
-	(void)0; })
-
-static inline u32 __get_unaligned_be24(const u8 *p)
-{
-	return p[0] << 16 | p[1] << 8 | p[2];
-}
-
-static inline u32 get_unaligned_be24(const void *p)
-{
-	return __get_unaligned_be24(p);
-}
-
-static inline u32 __get_unaligned_le24(const u8 *p)
-{
-	return p[0] | p[1] << 8 | p[2] << 16;
-}
-
-static inline u32 get_unaligned_le24(const void *p)
-{
-	return __get_unaligned_le24(p);
-}
-
-static inline void __put_unaligned_be24(const u32 val, u8 *p)
-{
-	*p++ = val >> 16;
-	*p++ = val >> 8;
-	*p++ = val;
-}
-
-static inline void put_unaligned_be24(const u32 val, void *p)
-{
-	__put_unaligned_be24(val, p);
-}
-
-static inline void __put_unaligned_le24(const u32 val, u8 *p)
-{
-	*p++ = val;
-	*p++ = val >> 8;
-	*p++ = val >> 16;
-}
-
-static inline void put_unaligned_le24(const u32 val, void *p)
-{
-	__put_unaligned_le24(val, p);
-}
-
-#endif /* _LINUX_UNALIGNED_GENERIC_H */
diff --git a/include/linux/unaligned/le_struct.h b/include/linux/unaligned/le_struct.h
deleted file mode 100644
index 22f90a4afaa5..000000000000
--- a/include/linux/unaligned/le_struct.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_UNALIGNED_LE_STRUCT_H
-#define _LINUX_UNALIGNED_LE_STRUCT_H
-
-#include <linux/unaligned/packed_struct.h>
-
-static inline u16 get_unaligned_le16(const void *p)
-{
-	return __get_unaligned_cpu16((const u8 *)p);
-}
-
-static inline u32 get_unaligned_le32(const void *p)
-{
-	return __get_unaligned_cpu32((const u8 *)p);
-}
-
-static inline u64 get_unaligned_le64(const void *p)
-{
-	return __get_unaligned_cpu64((const u8 *)p);
-}
-
-static inline void put_unaligned_le16(u16 val, void *p)
-{
-	__put_unaligned_cpu16(val, p);
-}
-
-static inline void put_unaligned_le32(u32 val, void *p)
-{
-	__put_unaligned_cpu32(val, p);
-}
-
-static inline void put_unaligned_le64(u64 val, void *p)
-{
-	__put_unaligned_cpu64(val, p);
-}
-
-static inline u16 get_unaligned_be16(const void *p)
-{
-	return swab16(__get_unaligned_cpu16((const u8 *)p));
-}
-
-static inline u32 get_unaligned_be32(const void *p)
-{
-	return swab32(__get_unaligned_cpu32((const u8 *)p));
-}
-
-static inline u64 get_unaligned_be64(const void *p)
-{
-	return swab64(__get_unaligned_cpu64((const u8 *)p));
-}
-
-static inline void put_unaligned_be16(u16 val, void *p)
-{
-	__put_unaligned_cpu16(swab16(val), p);
-}
-
-static inline void put_unaligned_be32(u32 val, void *p)
-{
-	__put_unaligned_cpu32(swab32(val), p);
-}
-
-static inline void put_unaligned_be64(u64 val, void *p)
-{
-	__put_unaligned_cpu64(swab64(val), p);
-}
-
-#endif /* _LINUX_UNALIGNED_LE_STRUCT_H */
-- 
cgit v1.2.3


From dbf20809d6e0072ad189c937761d58bf98a47b43 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Tue, 27 Apr 2021 10:54:52 +0200
Subject: iio: adis: add burst_max_speed_hz variable

Typically, in burst mode, the device cannot operate at it's full spi
speed. Hence, the spi transfers for burst mode have to take this into
account. With this change we avoid a potential race with the spi core as
drivers were 'hacking' the device 'max_speed_hz' directly in the
trigger handler.

Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20210427085454.30616-5-nuno.sa@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/imu/adis_buffer.c | 4 ++++
 include/linux/iio/imu/adis.h  | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/imu/adis_buffer.c b/drivers/iio/imu/adis_buffer.c
index f6dbfbd17d41..351c303c8a8c 100644
--- a/drivers/iio/imu/adis_buffer.c
+++ b/drivers/iio/imu/adis_buffer.c
@@ -51,9 +51,13 @@ static int adis_update_scan_mode_burst(struct iio_dev *indio_dev,
 	adis->xfer[0].tx_buf = tx;
 	adis->xfer[0].bits_per_word = 8;
 	adis->xfer[0].len = 2;
+	if (adis->data->burst_max_speed_hz)
+		adis->xfer[0].speed_hz = adis->data->burst_max_speed_hz;
 	adis->xfer[1].rx_buf = adis->buffer;
 	adis->xfer[1].bits_per_word = 8;
 	adis->xfer[1].len = burst_length;
+	if (adis->data->burst_max_speed_hz)
+		adis->xfer[1].speed_hz = adis->data->burst_max_speed_hz;
 
 	spi_message_init(&adis->msg);
 	spi_message_add_tail(&adis->xfer[0], &adis->msg);
diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h
index f9b728d490b1..cf49997d5903 100644
--- a/include/linux/iio/imu/adis.h
+++ b/include/linux/iio/imu/adis.h
@@ -55,6 +55,7 @@ struct adis_timeout {
  *			this should be the minimum size supported by the device.
  * @burst_max_len:	Holds the maximum burst size when the device supports
  *			more than one burst mode with different sizes
+ * @burst_max_speed_hz:	Maximum spi speed that can be used in burst mode
  */
 struct adis_data {
 	unsigned int read_delay;
@@ -83,6 +84,7 @@ struct adis_data {
 	unsigned int burst_reg_cmd;
 	unsigned int burst_len;
 	unsigned int burst_max_len;
+	unsigned int burst_max_speed_hz;
 };
 
 /**
-- 
cgit v1.2.3


From 15ea2878bfb255099092634d28f31177f237ccd7 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:03 +0100
Subject: iio: core: move @id from struct iio_dev to struct iio_dev_opaque

Continuing from Alexandru Ardelean's introduction of the split between
driver modifiable fields and those that should only be set by the core.

This could have been done in two steps to make the actual move after
introducing iio_device_id() but there seemed limited point to that
given how mechanical the majority of the patch is.

Includes fixup from Alex for missing mxs-lradc-adc conversion.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-2-jic23@kernel.org
---
 drivers/iio/accel/adxl372.c                        |  4 ++--
 drivers/iio/accel/bma180.c                         |  2 +-
 drivers/iio/accel/bmc150-accel-core.c              |  4 ++--
 drivers/iio/accel/kxcjk-1013.c                     |  4 ++--
 drivers/iio/accel/mma8452.c                        |  2 +-
 drivers/iio/accel/mxc4005.c                        |  2 +-
 drivers/iio/accel/stk8312.c                        |  2 +-
 drivers/iio/accel/stk8ba50.c                       |  2 +-
 drivers/iio/adc/ad7606.c                           |  3 ++-
 drivers/iio/adc/ad7766.c                           |  3 ++-
 drivers/iio/adc/ad7768-1.c                         |  3 ++-
 drivers/iio/adc/ad_sigma_delta.c                   |  2 +-
 drivers/iio/adc/at91-sama5d2_adc.c                 |  2 +-
 drivers/iio/adc/at91_adc.c                         |  4 ++--
 drivers/iio/adc/dln2-adc.c                         |  3 ++-
 drivers/iio/adc/ina2xx-adc.c                       |  3 ++-
 drivers/iio/adc/mxs-lradc-adc.c                    |  2 +-
 drivers/iio/adc/ti-ads131e08.c                     |  2 +-
 drivers/iio/adc/xilinx-xadc-core.c                 |  2 +-
 drivers/iio/buffer/industrialio-triggered-buffer.c |  2 +-
 drivers/iio/chemical/atlas-sensor.c                |  2 +-
 drivers/iio/chemical/ccs811.c                      |  2 +-
 drivers/iio/chemical/scd30_core.c                  |  3 ++-
 .../iio/common/hid-sensors/hid-sensor-trigger.c    |  2 +-
 drivers/iio/gyro/adxrs290.c                        |  2 +-
 drivers/iio/gyro/bmg160_core.c                     |  4 ++--
 drivers/iio/gyro/fxas21002c_core.c                 |  2 +-
 drivers/iio/gyro/itg3200_buffer.c                  |  2 +-
 drivers/iio/gyro/mpu3050-core.c                    |  2 +-
 drivers/iio/health/afe4403.c                       |  2 +-
 drivers/iio/health/afe4404.c                       |  2 +-
 drivers/iio/imu/adis_trigger.c                     |  3 ++-
 drivers/iio/imu/bmi160/bmi160_core.c               |  3 ++-
 drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c      |  2 +-
 drivers/iio/imu/kmx61.c                            |  2 +-
 drivers/iio/industrialio-core.c                    | 24 +++++++++++++++++-----
 drivers/iio/industrialio-triggered-event.c         |  2 +-
 drivers/iio/light/acpi-als.c                       |  3 ++-
 drivers/iio/light/rpr0521.c                        |  2 +-
 drivers/iio/light/si1145.c                         |  2 +-
 drivers/iio/light/vcnl4000.c                       |  3 ++-
 drivers/iio/light/vcnl4035.c                       |  2 +-
 drivers/iio/magnetometer/bmc150_magn.c             |  2 +-
 drivers/iio/magnetometer/rm3100-core.c             |  2 +-
 drivers/iio/potentiostat/lmp91000.c                |  3 ++-
 drivers/iio/pressure/zpa2326.c                     |  3 ++-
 drivers/iio/proximity/as3935.c                     |  3 ++-
 drivers/iio/proximity/sx9310.c                     |  2 +-
 drivers/iio/proximity/sx9500.c                     |  2 +-
 include/linux/iio/iio-opaque.h                     |  2 ++
 include/linux/iio/iio.h                            |  4 ++--
 51 files changed, 89 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/adxl372.c b/drivers/iio/accel/adxl372.c
index 9c9a896a872a..fc9592407717 100644
--- a/drivers/iio/accel/adxl372.c
+++ b/drivers/iio/accel/adxl372.c
@@ -1223,14 +1223,14 @@ int adxl372_probe(struct device *dev, struct regmap *regmap,
 		st->dready_trig = devm_iio_trigger_alloc(dev,
 							 "%s-dev%d",
 							 indio_dev->name,
-							 indio_dev->id);
+							 iio_device_id(indio_dev));
 		if (st->dready_trig == NULL)
 			return -ENOMEM;
 
 		st->peak_datardy_trig = devm_iio_trigger_alloc(dev,
 							       "%s-dev%d-peak",
 							       indio_dev->name,
-							       indio_dev->id);
+							       iio_device_id(indio_dev));
 		if (!st->peak_datardy_trig)
 			return -ENOMEM;
 
diff --git a/drivers/iio/accel/bma180.c b/drivers/iio/accel/bma180.c
index b8a7469cdae4..68d91a70de03 100644
--- a/drivers/iio/accel/bma180.c
+++ b/drivers/iio/accel/bma180.c
@@ -1045,7 +1045,7 @@ static int bma180_probe(struct i2c_client *client,
 
 	if (client->irq > 0) {
 		data->trig = iio_trigger_alloc(dev, "%s-dev%d", indio_dev->name,
-			indio_dev->id);
+					       iio_device_id(indio_dev));
 		if (!data->trig) {
 			ret = -ENOMEM;
 			goto err_chip_disable;
diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c
index 04d85ce34e9f..62a164a7b852 100644
--- a/drivers/iio/accel/bmc150-accel-core.c
+++ b/drivers/iio/accel/bmc150-accel-core.c
@@ -1470,9 +1470,9 @@ static int bmc150_accel_triggers_setup(struct iio_dev *indio_dev,
 		struct bmc150_accel_trigger *t = &data->triggers[i];
 
 		t->indio_trig = devm_iio_trigger_alloc(dev,
-					bmc150_accel_triggers[i].name,
+						       bmc150_accel_triggers[i].name,
 						       indio_dev->name,
-						       indio_dev->id);
+						       iio_device_id(indio_dev));
 		if (!t->indio_trig) {
 			ret = -ENOMEM;
 			break;
diff --git a/drivers/iio/accel/kxcjk-1013.c b/drivers/iio/accel/kxcjk-1013.c
index ff724bc17a45..283e6a3feffc 100644
--- a/drivers/iio/accel/kxcjk-1013.c
+++ b/drivers/iio/accel/kxcjk-1013.c
@@ -1404,7 +1404,7 @@ static int kxcjk1013_probe(struct i2c_client *client,
 		data->dready_trig = devm_iio_trigger_alloc(&client->dev,
 							   "%s-dev%d",
 							   indio_dev->name,
-							   indio_dev->id);
+							   iio_device_id(indio_dev));
 		if (!data->dready_trig) {
 			ret = -ENOMEM;
 			goto err_poweroff;
@@ -1413,7 +1413,7 @@ static int kxcjk1013_probe(struct i2c_client *client,
 		data->motion_trig = devm_iio_trigger_alloc(&client->dev,
 							  "%s-any-motion-dev%d",
 							  indio_dev->name,
-							  indio_dev->id);
+							  iio_device_id(indio_dev));
 		if (!data->motion_trig) {
 			ret = -ENOMEM;
 			goto err_poweroff;
diff --git a/drivers/iio/accel/mma8452.c b/drivers/iio/accel/mma8452.c
index 4d307dfb9169..464a6bfe6746 100644
--- a/drivers/iio/accel/mma8452.c
+++ b/drivers/iio/accel/mma8452.c
@@ -1461,7 +1461,7 @@ static int mma8452_trigger_setup(struct iio_dev *indio_dev)
 
 	trig = devm_iio_trigger_alloc(&data->client->dev, "%s-dev%d",
 				      indio_dev->name,
-				      indio_dev->id);
+				      iio_device_id(indio_dev));
 	if (!trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/accel/mxc4005.c b/drivers/iio/accel/mxc4005.c
index fb3cbaa62bd8..98c7f5f59011 100644
--- a/drivers/iio/accel/mxc4005.c
+++ b/drivers/iio/accel/mxc4005.c
@@ -433,7 +433,7 @@ static int mxc4005_probe(struct i2c_client *client,
 		data->dready_trig = devm_iio_trigger_alloc(&client->dev,
 							   "%s-dev%d",
 							   indio_dev->name,
-							   indio_dev->id);
+							   iio_device_id(indio_dev));
 		if (!data->dready_trig)
 			return -ENOMEM;
 
diff --git a/drivers/iio/accel/stk8312.c b/drivers/iio/accel/stk8312.c
index 60aecfa9fd92..aeab108c457d 100644
--- a/drivers/iio/accel/stk8312.c
+++ b/drivers/iio/accel/stk8312.c
@@ -552,7 +552,7 @@ static int stk8312_probe(struct i2c_client *client,
 		data->dready_trig = devm_iio_trigger_alloc(&client->dev,
 							   "%s-dev%d",
 							   indio_dev->name,
-							   indio_dev->id);
+							   iio_device_id(indio_dev));
 		if (!data->dready_trig) {
 			ret = -ENOMEM;
 			goto err_power_off;
diff --git a/drivers/iio/accel/stk8ba50.c b/drivers/iio/accel/stk8ba50.c
index 7cf9cb7e8666..3e7cf23be7e1 100644
--- a/drivers/iio/accel/stk8ba50.c
+++ b/drivers/iio/accel/stk8ba50.c
@@ -448,7 +448,7 @@ static int stk8ba50_probe(struct i2c_client *client,
 		data->dready_trig = devm_iio_trigger_alloc(&client->dev,
 							   "%s-dev%d",
 							   indio_dev->name,
-							   indio_dev->id);
+							   iio_device_id(indio_dev));
 		if (!data->dready_trig) {
 			ret = -ENOMEM;
 			goto err_power_off;
diff --git a/drivers/iio/adc/ad7606.c b/drivers/iio/adc/ad7606.c
index 0af0bb4d5a7f..0a60ecc69d38 100644
--- a/drivers/iio/adc/ad7606.c
+++ b/drivers/iio/adc/ad7606.c
@@ -663,7 +663,8 @@ int ad7606_probe(struct device *dev, int irq, void __iomem *base_address,
 	}
 
 	st->trig = devm_iio_trigger_alloc(dev, "%s-dev%d",
-					  indio_dev->name, indio_dev->id);
+					  indio_dev->name,
+					  iio_device_id(indio_dev));
 	if (!st->trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/adc/ad7766.c b/drivers/iio/adc/ad7766.c
index 1e41759f3ee5..236a455aaa18 100644
--- a/drivers/iio/adc/ad7766.c
+++ b/drivers/iio/adc/ad7766.c
@@ -248,7 +248,8 @@ static int ad7766_probe(struct spi_device *spi)
 
 	if (spi->irq > 0) {
 		ad7766->trig = devm_iio_trigger_alloc(&spi->dev, "%s-dev%d",
-			indio_dev->name, indio_dev->id);
+						      indio_dev->name,
+						      iio_device_id(indio_dev));
 		if (!ad7766->trig)
 			return -ENOMEM;
 
diff --git a/drivers/iio/adc/ad7768-1.c b/drivers/iio/adc/ad7768-1.c
index c945f1349623..41752777e96c 100644
--- a/drivers/iio/adc/ad7768-1.c
+++ b/drivers/iio/adc/ad7768-1.c
@@ -626,7 +626,8 @@ static int ad7768_probe(struct spi_device *spi)
 	}
 
 	st->trig = devm_iio_trigger_alloc(&spi->dev, "%s-dev%d",
-					  indio_dev->name, indio_dev->id);
+					  indio_dev->name,
+					  iio_device_id(indio_dev));
 	if (!st->trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index e777ec718973..69b979331ccd 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -477,7 +477,7 @@ static int ad_sd_probe_trigger(struct iio_dev *indio_dev)
 
 	sigma_delta->trig = iio_trigger_alloc(&sigma_delta->spi->dev,
 					      "%s-dev%d", indio_dev->name,
-					      indio_dev->id);
+					      iio_device_id(indio_dev));
 	if (sigma_delta->trig == NULL) {
 		ret = -ENOMEM;
 		goto error_ret;
diff --git a/drivers/iio/adc/at91-sama5d2_adc.c b/drivers/iio/adc/at91-sama5d2_adc.c
index a7826f097b95..6e8c28675947 100644
--- a/drivers/iio/adc/at91-sama5d2_adc.c
+++ b/drivers/iio/adc/at91-sama5d2_adc.c
@@ -997,7 +997,7 @@ static struct iio_trigger *at91_adc_allocate_trigger(struct iio_dev *indio,
 	int ret;
 
 	trig = devm_iio_trigger_alloc(&indio->dev, "%s-dev%d-%s", indio->name,
-				      indio->id, trigger_name);
+				iio_device_id(indio), trigger_name);
 	if (!trig)
 		return NULL;
 
diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c
index 0b5f0c91d0d7..5a7d3a3a5fa8 100644
--- a/drivers/iio/adc/at91_adc.c
+++ b/drivers/iio/adc/at91_adc.c
@@ -547,7 +547,7 @@ static int at91_adc_get_trigger_value_by_name(struct iio_dev *idev,
 		char *name = kasprintf(GFP_KERNEL,
 				"%s-dev%d-%s",
 				idev->name,
-				idev->id,
+				iio_device_id(idev),
 				triggers[i].name);
 		if (!name)
 			return -ENOMEM;
@@ -626,7 +626,7 @@ static struct iio_trigger *at91_adc_allocate_trigger(struct iio_dev *idev,
 	int ret;
 
 	trig = iio_trigger_alloc(idev->dev.parent, "%s-dev%d-%s", idev->name,
-				 idev->id, trigger->name);
+				 iio_device_id(idev), trigger->name);
 	if (trig == NULL)
 		return NULL;
 
diff --git a/drivers/iio/adc/dln2-adc.c b/drivers/iio/adc/dln2-adc.c
index 0d53ef18e045..16407664182c 100644
--- a/drivers/iio/adc/dln2-adc.c
+++ b/drivers/iio/adc/dln2-adc.c
@@ -649,7 +649,8 @@ static int dln2_adc_probe(struct platform_device *pdev)
 	indio_dev->setup_ops = &dln2_adc_buffer_setup_ops;
 
 	dln2->trig = devm_iio_trigger_alloc(dev, "%s-dev%d",
-					    indio_dev->name, indio_dev->id);
+					    indio_dev->name,
+					    iio_device_id(indio_dev));
 	if (!dln2->trig) {
 		dev_err(dev, "failed to allocate trigger\n");
 		return -ENOMEM;
diff --git a/drivers/iio/adc/ina2xx-adc.c b/drivers/iio/adc/ina2xx-adc.c
index 2ae54258b221..a4b2ff9e0dd5 100644
--- a/drivers/iio/adc/ina2xx-adc.c
+++ b/drivers/iio/adc/ina2xx-adc.c
@@ -843,7 +843,8 @@ static int ina2xx_buffer_enable(struct iio_dev *indio_dev)
 		chip->allow_async_readout);
 
 	task = kthread_create(ina2xx_capture_thread, (void *)indio_dev,
-			      "%s:%d-%uus", indio_dev->name, indio_dev->id,
+			      "%s:%d-%uus", indio_dev->name,
+			      iio_device_id(indio_dev),
 			      sampling_us);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
diff --git a/drivers/iio/adc/mxs-lradc-adc.c b/drivers/iio/adc/mxs-lradc-adc.c
index 30e29f44ebd2..1d99170d3328 100644
--- a/drivers/iio/adc/mxs-lradc-adc.c
+++ b/drivers/iio/adc/mxs-lradc-adc.c
@@ -455,7 +455,7 @@ static int mxs_lradc_adc_trigger_init(struct iio_dev *iio)
 	struct mxs_lradc_adc *adc = iio_priv(iio);
 
 	trig = devm_iio_trigger_alloc(&iio->dev, "%s-dev%i", iio->name,
-				      iio->id);
+				      iio_device_id(iio));
 	if (!trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/adc/ti-ads131e08.c b/drivers/iio/adc/ti-ads131e08.c
index 2c20dbed8ada..0c2025a22575 100644
--- a/drivers/iio/adc/ti-ads131e08.c
+++ b/drivers/iio/adc/ti-ads131e08.c
@@ -849,7 +849,7 @@ static int ads131e08_probe(struct spi_device *spi)
 	}
 
 	st->trig = devm_iio_trigger_alloc(&spi->dev, "%s-dev%d",
-		indio_dev->name, indio_dev->id);
+		indio_dev->name, iio_device_id(indio_dev));
 	if (!st->trig) {
 		dev_err(&spi->dev, "failed to allocate IIO trigger\n");
 		return -ENOMEM;
diff --git a/drivers/iio/adc/xilinx-xadc-core.c b/drivers/iio/adc/xilinx-xadc-core.c
index 6914c1900ed0..198d2916266d 100644
--- a/drivers/iio/adc/xilinx-xadc-core.c
+++ b/drivers/iio/adc/xilinx-xadc-core.c
@@ -743,7 +743,7 @@ static struct iio_trigger *xadc_alloc_trigger(struct iio_dev *indio_dev,
 	int ret;
 
 	trig = devm_iio_trigger_alloc(dev, "%s%d-%s", indio_dev->name,
-				      indio_dev->id, name);
+				      iio_device_id(indio_dev), name);
 	if (trig == NULL)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/iio/buffer/industrialio-triggered-buffer.c b/drivers/iio/buffer/industrialio-triggered-buffer.c
index ebb4520ac291..f77c4538141e 100644
--- a/drivers/iio/buffer/industrialio-triggered-buffer.c
+++ b/drivers/iio/buffer/industrialio-triggered-buffer.c
@@ -56,7 +56,7 @@ int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev,
 						 indio_dev,
 						 "%s_consumer%d",
 						 indio_dev->name,
-						 indio_dev->id);
+						 iio_device_id(indio_dev));
 	if (indio_dev->pollfunc == NULL) {
 		ret = -ENOMEM;
 		goto error_kfifo_free;
diff --git a/drivers/iio/chemical/atlas-sensor.c b/drivers/iio/chemical/atlas-sensor.c
index 56ba6c82b501..d10f921b233a 100644
--- a/drivers/iio/chemical/atlas-sensor.c
+++ b/drivers/iio/chemical/atlas-sensor.c
@@ -640,7 +640,7 @@ static int atlas_probe(struct i2c_client *client,
 	indio_dev->modes = INDIO_BUFFER_SOFTWARE | INDIO_DIRECT_MODE;
 
 	trig = devm_iio_trigger_alloc(&client->dev, "%s-dev%d",
-				      indio_dev->name, indio_dev->id);
+				      indio_dev->name, iio_device_id(indio_dev));
 
 	if (!trig)
 		return -ENOMEM;
diff --git a/drivers/iio/chemical/ccs811.c b/drivers/iio/chemical/ccs811.c
index 886e96496dbf..847194fa1e46 100644
--- a/drivers/iio/chemical/ccs811.c
+++ b/drivers/iio/chemical/ccs811.c
@@ -491,7 +491,7 @@ static int ccs811_probe(struct i2c_client *client,
 		data->drdy_trig = devm_iio_trigger_alloc(&client->dev,
 							 "%s-dev%d",
 							 indio_dev->name,
-							 indio_dev->id);
+							 iio_device_id(indio_dev));
 		if (!data->drdy_trig) {
 			ret = -ENOMEM;
 			goto err_poweroff;
diff --git a/drivers/iio/chemical/scd30_core.c b/drivers/iio/chemical/scd30_core.c
index d89f117dd0ef..9fe6bbe9ee04 100644
--- a/drivers/iio/chemical/scd30_core.c
+++ b/drivers/iio/chemical/scd30_core.c
@@ -640,7 +640,8 @@ static int scd30_setup_trigger(struct iio_dev *indio_dev)
 	struct iio_trigger *trig;
 	int ret;
 
-	trig = devm_iio_trigger_alloc(dev, "%s-dev%d", indio_dev->name, indio_dev->id);
+	trig = devm_iio_trigger_alloc(dev, "%s-dev%d", indio_dev->name,
+				      iio_device_id(indio_dev));
 	if (!trig) {
 		dev_err(dev, "failed to allocate trigger\n");
 		return -ENOMEM;
diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
index 95ddccb44f1c..5a7b3e253e58 100644
--- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
+++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
@@ -256,7 +256,7 @@ int hid_sensor_setup_trigger(struct iio_dev *indio_dev, const char *name,
 	}
 
 	trig = iio_trigger_alloc(indio_dev->dev.parent,
-				 "%s-dev%d", name, indio_dev->id);
+				 "%s-dev%d", name, iio_device_id(indio_dev));
 	if (trig == NULL) {
 		dev_err(&indio_dev->dev, "Trigger Allocate Failed\n");
 		ret = -ENOMEM;
diff --git a/drivers/iio/gyro/adxrs290.c b/drivers/iio/gyro/adxrs290.c
index cec5e1f17c22..3e0734ddafe3 100644
--- a/drivers/iio/gyro/adxrs290.c
+++ b/drivers/iio/gyro/adxrs290.c
@@ -589,7 +589,7 @@ static int adxrs290_probe_trigger(struct iio_dev *indio_dev)
 
 	st->dready_trig = devm_iio_trigger_alloc(&st->spi->dev, "%s-dev%d",
 						 indio_dev->name,
-						 indio_dev->id);
+						 iio_device_id(indio_dev));
 	if (!st->dready_trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/gyro/bmg160_core.c b/drivers/iio/gyro/bmg160_core.c
index b11ebd9bb7a4..26a9ed5770c6 100644
--- a/drivers/iio/gyro/bmg160_core.c
+++ b/drivers/iio/gyro/bmg160_core.c
@@ -1137,14 +1137,14 @@ int bmg160_core_probe(struct device *dev, struct regmap *regmap, int irq,
 		data->dready_trig = devm_iio_trigger_alloc(dev,
 							   "%s-dev%d",
 							   indio_dev->name,
-							   indio_dev->id);
+							   iio_device_id(indio_dev));
 		if (!data->dready_trig)
 			return -ENOMEM;
 
 		data->motion_trig = devm_iio_trigger_alloc(dev,
 							  "%s-any-motion-dev%d",
 							  indio_dev->name,
-							  indio_dev->id);
+							  iio_device_id(indio_dev));
 		if (!data->motion_trig)
 			return -ENOMEM;
 
diff --git a/drivers/iio/gyro/fxas21002c_core.c b/drivers/iio/gyro/fxas21002c_core.c
index 1a20c6b88e7d..5af7b48ff01a 100644
--- a/drivers/iio/gyro/fxas21002c_core.c
+++ b/drivers/iio/gyro/fxas21002c_core.c
@@ -852,7 +852,7 @@ static int fxas21002c_trigger_probe(struct fxas21002c_data *data)
 
 	data->dready_trig = devm_iio_trigger_alloc(dev, "%s-dev%d",
 						   indio_dev->name,
-						   indio_dev->id);
+						   iio_device_id(indio_dev));
 	if (!data->dready_trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/gyro/itg3200_buffer.c b/drivers/iio/gyro/itg3200_buffer.c
index af0aaa146f0c..04dd6a7969ea 100644
--- a/drivers/iio/gyro/itg3200_buffer.c
+++ b/drivers/iio/gyro/itg3200_buffer.c
@@ -114,7 +114,7 @@ int itg3200_probe_trigger(struct iio_dev *indio_dev)
 	struct itg3200 *st = iio_priv(indio_dev);
 
 	st->trig = iio_trigger_alloc(&st->i2c->dev, "%s-dev%d", indio_dev->name,
-				     indio_dev->id);
+				     iio_device_id(indio_dev));
 	if (!st->trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/gyro/mpu3050-core.c b/drivers/iio/gyro/mpu3050-core.c
index f17a93519535..2b930c7f4d86 100644
--- a/drivers/iio/gyro/mpu3050-core.c
+++ b/drivers/iio/gyro/mpu3050-core.c
@@ -1058,7 +1058,7 @@ static int mpu3050_trigger_probe(struct iio_dev *indio_dev, int irq)
 	mpu3050->trig = devm_iio_trigger_alloc(&indio_dev->dev,
 					       "%s-dev%d",
 					       indio_dev->name,
-					       indio_dev->id);
+					       iio_device_id(indio_dev));
 	if (!mpu3050->trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/health/afe4403.c b/drivers/iio/health/afe4403.c
index 1fa8d51d5080..d4921385aaf7 100644
--- a/drivers/iio/health/afe4403.c
+++ b/drivers/iio/health/afe4403.c
@@ -521,7 +521,7 @@ static int afe4403_probe(struct spi_device *spi)
 		afe->trig = devm_iio_trigger_alloc(afe->dev,
 						   "%s-dev%d",
 						   indio_dev->name,
-						   indio_dev->id);
+						   iio_device_id(indio_dev));
 		if (!afe->trig) {
 			dev_err(afe->dev, "Unable to allocate IIO trigger\n");
 			ret = -ENOMEM;
diff --git a/drivers/iio/health/afe4404.c b/drivers/iio/health/afe4404.c
index e1476bf79fe2..d8a27dfe074a 100644
--- a/drivers/iio/health/afe4404.c
+++ b/drivers/iio/health/afe4404.c
@@ -528,7 +528,7 @@ static int afe4404_probe(struct i2c_client *client,
 		afe->trig = devm_iio_trigger_alloc(afe->dev,
 						   "%s-dev%d",
 						   indio_dev->name,
-						   indio_dev->id);
+						   iio_device_id(indio_dev));
 		if (!afe->trig) {
 			dev_err(afe->dev, "Unable to allocate IIO trigger\n");
 			ret = -ENOMEM;
diff --git a/drivers/iio/imu/adis_trigger.c b/drivers/iio/imu/adis_trigger.c
index fa5540fabacc..48eedc29b28a 100644
--- a/drivers/iio/imu/adis_trigger.c
+++ b/drivers/iio/imu/adis_trigger.c
@@ -62,7 +62,8 @@ int devm_adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev)
 	int ret;
 
 	adis->trig = devm_iio_trigger_alloc(&adis->spi->dev, "%s-dev%d",
-					    indio_dev->name, indio_dev->id);
+					    indio_dev->name,
+					    iio_device_id(indio_dev));
 	if (!adis->trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/imu/bmi160/bmi160_core.c b/drivers/iio/imu/bmi160/bmi160_core.c
index 290b5ef83f77..b63bd7e5e5e5 100644
--- a/drivers/iio/imu/bmi160/bmi160_core.c
+++ b/drivers/iio/imu/bmi160/bmi160_core.c
@@ -785,7 +785,8 @@ int bmi160_probe_trigger(struct iio_dev *indio_dev, int irq, u32 irq_type)
 	int ret;
 
 	data->trig = devm_iio_trigger_alloc(&indio_dev->dev, "%s-dev%d",
-					    indio_dev->name, indio_dev->id);
+					    indio_dev->name,
+					    iio_device_id(indio_dev));
 
 	if (data->trig == NULL)
 		return -ENOMEM;
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c
index de8ed1446d60..e21ba778595a 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c
@@ -238,7 +238,7 @@ int inv_mpu6050_probe_trigger(struct iio_dev *indio_dev, int irq_type)
 	st->trig = devm_iio_trigger_alloc(&indio_dev->dev,
 					  "%s-dev%d",
 					  indio_dev->name,
-					  indio_dev->id);
+					  iio_device_id(indio_dev));
 	if (!st->trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/imu/kmx61.c b/drivers/iio/imu/kmx61.c
index fc5a60fcfec0..d3e06ce99c1e 100644
--- a/drivers/iio/imu/kmx61.c
+++ b/drivers/iio/imu/kmx61.c
@@ -1264,7 +1264,7 @@ static struct iio_trigger *kmx61_trigger_setup(struct kmx61_data *data,
 				      "%s-%s-dev%d",
 				      indio_dev->name,
 				      tag,
-				      indio_dev->id);
+				      iio_device_id(indio_dev));
 	if (!trig)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 3fdcf2d4997a..ec21341c6322 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -169,6 +169,20 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_THERMOCOUPLE_TYPE] = "thermocouple_type",
 	[IIO_CHAN_INFO_CALIBAMBIENT] = "calibambient",
 };
+/**
+ * iio_device_id() - query the unique ID for the device
+ * @indio_dev:		Device structure whose ID is being queried
+ *
+ * The IIO device ID is a unique index used for example for the naming
+ * of the character device /dev/iio\:device[ID]
+ */
+int iio_device_id(struct iio_dev *indio_dev)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+
+	return iio_dev_opaque->id;
+}
+EXPORT_SYMBOL_GPL(iio_device_id);
 
 /**
  * iio_sysfs_match_string_with_gaps - matches given string in an array with gaps
@@ -1588,7 +1602,7 @@ static void iio_dev_release(struct device *device)
 
 	iio_device_detach_buffers(indio_dev);
 
-	ida_simple_remove(&iio_ida, indio_dev->id);
+	ida_simple_remove(&iio_ida, iio_dev_opaque->id);
 	kfree(iio_dev_opaque);
 }
 
@@ -1631,14 +1645,14 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv)
 	mutex_init(&indio_dev->info_exist_lock);
 	INIT_LIST_HEAD(&iio_dev_opaque->channel_attr_list);
 
-	indio_dev->id = ida_simple_get(&iio_ida, 0, 0, GFP_KERNEL);
-	if (indio_dev->id < 0) {
+	iio_dev_opaque->id = ida_simple_get(&iio_ida, 0, 0, GFP_KERNEL);
+	if (iio_dev_opaque->id < 0) {
 		/* cannot use a dev_err as the name isn't available */
 		pr_err("failed to get device id\n");
 		kfree(iio_dev_opaque);
 		return NULL;
 	}
-	dev_set_name(&indio_dev->dev, "iio:device%d", indio_dev->id);
+	dev_set_name(&indio_dev->dev, "iio:device%d", iio_dev_opaque->id);
 	INIT_LIST_HEAD(&iio_dev_opaque->buffer_list);
 	INIT_LIST_HEAD(&iio_dev_opaque->ioctl_handlers);
 
@@ -1891,7 +1905,7 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod)
 		cdev_init(&indio_dev->chrdev, &iio_event_fileops);
 
 	if (iio_dev_opaque->attached_buffers_cnt || iio_dev_opaque->event_interface) {
-		indio_dev->dev.devt = MKDEV(MAJOR(iio_devt), indio_dev->id);
+		indio_dev->dev.devt = MKDEV(MAJOR(iio_devt), iio_dev_opaque->id);
 		indio_dev->chrdev.owner = this_mod;
 	}
 
diff --git a/drivers/iio/industrialio-triggered-event.c b/drivers/iio/industrialio-triggered-event.c
index 53da9ab17a62..4bedc65c9fe3 100644
--- a/drivers/iio/industrialio-triggered-event.c
+++ b/drivers/iio/industrialio-triggered-event.c
@@ -37,7 +37,7 @@ int iio_triggered_event_setup(struct iio_dev *indio_dev,
 						       indio_dev,
 						       "%s_consumer%d",
 						       indio_dev->name,
-						       indio_dev->id);
+						       iio_device_id(indio_dev));
 	if (indio_dev->pollfunc_event == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/iio/light/acpi-als.c b/drivers/iio/light/acpi-als.c
index 0a6ab5761eec..e1ff6f524f4b 100644
--- a/drivers/iio/light/acpi-als.c
+++ b/drivers/iio/light/acpi-als.c
@@ -204,7 +204,8 @@ static int acpi_als_add(struct acpi_device *device)
 	indio_dev->channels = acpi_als_channels;
 	indio_dev->num_channels = ARRAY_SIZE(acpi_als_channels);
 
-	als->trig = devm_iio_trigger_alloc(dev, "%s-dev%d", indio_dev->name, indio_dev->id);
+	als->trig = devm_iio_trigger_alloc(dev, "%s-dev%d", indio_dev->name,
+					   iio_device_id(indio_dev));
 	if (!als->trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/light/rpr0521.c b/drivers/iio/light/rpr0521.c
index 033578f444e4..7e332de0e6a5 100644
--- a/drivers/iio/light/rpr0521.c
+++ b/drivers/iio/light/rpr0521.c
@@ -985,7 +985,7 @@ static int rpr0521_probe(struct i2c_client *client,
 		/* Trigger0 producer setup */
 		data->drdy_trigger0 = devm_iio_trigger_alloc(
 			indio_dev->dev.parent,
-			"%s-dev%d", indio_dev->name, indio_dev->id);
+			"%s-dev%d", indio_dev->name, iio_device_id(indio_dev));
 		if (!data->drdy_trigger0) {
 			ret = -ENOMEM;
 			goto err_pm_disable;
diff --git a/drivers/iio/light/si1145.c b/drivers/iio/light/si1145.c
index 9b5c99823943..3fb52402fcc3 100644
--- a/drivers/iio/light/si1145.c
+++ b/drivers/iio/light/si1145.c
@@ -1243,7 +1243,7 @@ static int si1145_probe_trigger(struct iio_dev *indio_dev)
 	int ret;
 
 	trig = devm_iio_trigger_alloc(&client->dev,
-			"%s-dev%d", indio_dev->name, indio_dev->id);
+			"%s-dev%d", indio_dev->name, iio_device_id(indio_dev));
 	if (!trig)
 		return -ENOMEM;
 
diff --git a/drivers/iio/light/vcnl4000.c b/drivers/iio/light/vcnl4000.c
index 2f7916f95689..4a61865d01cd 100644
--- a/drivers/iio/light/vcnl4000.c
+++ b/drivers/iio/light/vcnl4000.c
@@ -998,7 +998,8 @@ static int vcnl4010_probe_trigger(struct iio_dev *indio_dev)
 	struct iio_trigger *trigger;
 
 	trigger = devm_iio_trigger_alloc(&client->dev, "%s-dev%d",
-					 indio_dev->name, indio_dev->id);
+					 indio_dev->name,
+					 iio_device_id(indio_dev));
 	if (!trigger)
 		return -ENOMEM;
 
diff --git a/drivers/iio/light/vcnl4035.c b/drivers/iio/light/vcnl4035.c
index ae87740d9cef..691a54b763e1 100644
--- a/drivers/iio/light/vcnl4035.c
+++ b/drivers/iio/light/vcnl4035.c
@@ -507,7 +507,7 @@ static int vcnl4035_probe_trigger(struct iio_dev *indio_dev)
 
 	data->drdy_trigger0 = devm_iio_trigger_alloc(
 			indio_dev->dev.parent,
-			"%s-dev%d", indio_dev->name, indio_dev->id);
+			"%s-dev%d", indio_dev->name, iio_device_id(indio_dev));
 	if (!data->drdy_trigger0)
 		return -ENOMEM;
 
diff --git a/drivers/iio/magnetometer/bmc150_magn.c b/drivers/iio/magnetometer/bmc150_magn.c
index 00f9766bad5c..d75b437a43f2 100644
--- a/drivers/iio/magnetometer/bmc150_magn.c
+++ b/drivers/iio/magnetometer/bmc150_magn.c
@@ -915,7 +915,7 @@ int bmc150_magn_probe(struct device *dev, struct regmap *regmap,
 		data->dready_trig = devm_iio_trigger_alloc(dev,
 							   "%s-dev%d",
 							   indio_dev->name,
-							   indio_dev->id);
+							   iio_device_id(indio_dev));
 		if (!data->dready_trig) {
 			ret = -ENOMEM;
 			dev_err(dev, "iio trigger alloc failed\n");
diff --git a/drivers/iio/magnetometer/rm3100-core.c b/drivers/iio/magnetometer/rm3100-core.c
index dd811da9cb6d..4df5887fd04c 100644
--- a/drivers/iio/magnetometer/rm3100-core.c
+++ b/drivers/iio/magnetometer/rm3100-core.c
@@ -575,7 +575,7 @@ int rm3100_common_probe(struct device *dev, struct regmap *regmap, int irq)
 
 		data->drdy_trig = devm_iio_trigger_alloc(dev, "%s-drdy%d",
 							 indio_dev->name,
-							 indio_dev->id);
+							 iio_device_id(indio_dev));
 		if (!data->drdy_trig)
 			return -ENOMEM;
 
diff --git a/drivers/iio/potentiostat/lmp91000.c b/drivers/iio/potentiostat/lmp91000.c
index 8a9c576616ee..1948e2d22c27 100644
--- a/drivers/iio/potentiostat/lmp91000.c
+++ b/drivers/iio/potentiostat/lmp91000.c
@@ -323,7 +323,8 @@ static int lmp91000_probe(struct i2c_client *client,
 	}
 
 	data->trig = devm_iio_trigger_alloc(dev, "%s-mux%d",
-					    indio_dev->name, indio_dev->id);
+					    indio_dev->name,
+					    iio_device_id(indio_dev));
 	if (!data->trig) {
 		dev_err(dev, "cannot allocate iio trigger.\n");
 		return -ENOMEM;
diff --git a/drivers/iio/pressure/zpa2326.c b/drivers/iio/pressure/zpa2326.c
index a93411216aee..89295c90f801 100644
--- a/drivers/iio/pressure/zpa2326.c
+++ b/drivers/iio/pressure/zpa2326.c
@@ -1408,7 +1408,8 @@ static int zpa2326_init_managed_trigger(struct device          *parent,
 		return 0;
 
 	trigger = devm_iio_trigger_alloc(parent, "%s-dev%d",
-					 indio_dev->name, indio_dev->id);
+					 indio_dev->name,
+					 iio_device_id(indio_dev));
 	if (!trigger)
 		return -ENOMEM;
 
diff --git a/drivers/iio/proximity/as3935.c b/drivers/iio/proximity/as3935.c
index edc4a35ae66d..dc20fe81232c 100644
--- a/drivers/iio/proximity/as3935.c
+++ b/drivers/iio/proximity/as3935.c
@@ -404,7 +404,8 @@ static int as3935_probe(struct spi_device *spi)
 	indio_dev->info = &as3935_info;
 
 	trig = devm_iio_trigger_alloc(dev, "%s-dev%d",
-				      indio_dev->name, indio_dev->id);
+				      indio_dev->name,
+				      iio_device_id(indio_dev));
 
 	if (!trig)
 		return -ENOMEM;
diff --git a/drivers/iio/proximity/sx9310.c b/drivers/iio/proximity/sx9310.c
index 327ebb7ddbb9..175f3b7c61d7 100644
--- a/drivers/iio/proximity/sx9310.c
+++ b/drivers/iio/proximity/sx9310.c
@@ -1473,7 +1473,7 @@ static int sx9310_probe(struct i2c_client *client)
 
 		data->trig = devm_iio_trigger_alloc(dev, "%s-dev%d",
 						    indio_dev->name,
-						    indio_dev->id);
+						    iio_device_id(indio_dev));
 		if (!data->trig)
 			return -ENOMEM;
 
diff --git a/drivers/iio/proximity/sx9500.c b/drivers/iio/proximity/sx9500.c
index a87f4a8e4327..3e4ddb2e8c2b 100644
--- a/drivers/iio/proximity/sx9500.c
+++ b/drivers/iio/proximity/sx9500.c
@@ -946,7 +946,7 @@ static int sx9500_probe(struct i2c_client *client,
 			return ret;
 
 		data->trig = devm_iio_trigger_alloc(&client->dev,
-				"%s-dev%d", indio_dev->name, indio_dev->id);
+				"%s-dev%d", indio_dev->name, iio_device_id(indio_dev));
 		if (!data->trig)
 			return -ENOMEM;
 
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 32addd5e790e..e66b029d99de 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -6,6 +6,7 @@
 /**
  * struct iio_dev_opaque - industrial I/O device opaque information
  * @indio_dev:			public industrial I/O device information
+ * @id:			used to identify device internally
  * @event_interface:		event chrdevs associated with interrupt lines
  * @attached_buffers:		array of buffers statically attached by the driver
  * @attached_buffers_cnt:	number of buffers in the array of statically attached buffers
@@ -26,6 +27,7 @@
  */
 struct iio_dev_opaque {
 	struct iio_dev			indio_dev;
+	int				id;
 	struct iio_event_interface	*event_interface;
 	struct iio_buffer		**attached_buffers;
 	unsigned int			attached_buffers_cnt;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index f2d65e2e88b6..569861d5887a 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -488,7 +488,6 @@ struct iio_buffer_setup_ops {
 
 /**
  * struct iio_dev - industrial I/O device
- * @id:			[INTERN] used to identify device internally
  * @driver_module:	[INTERN] used to make it harder to undercut users
  * @modes:		[DRIVER] operating modes supported by device
  * @currentmode:	[DRIVER] current operating mode
@@ -523,7 +522,6 @@ struct iio_buffer_setup_ops {
  *			**MUST** be accessed **ONLY** via iio_priv() helper
  */
 struct iio_dev {
-	int				id;
 	struct module			*driver_module;
 
 	int				modes;
@@ -559,6 +557,8 @@ struct iio_dev {
 	void				*priv;
 };
 
+int iio_device_id(struct iio_dev *indio_dev);
+
 const struct iio_chan_spec
 *iio_find_channel_from_si(struct iio_dev *indio_dev, int si);
 /**
-- 
cgit v1.2.3


From e5333ed09e0f8ece3cbb37912c17cf9880ee3fb0 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:04 +0100
Subject: iio: avoid shadowing of variable name in to_iio_dev_opaque()

indio_dev was both the macro input parameter and the field name
in this macro.  That causes trouble if the instance of
struct iio_dev passed in is not called indio_dev.

Whilst a fix of sorts, no need to backport as it seems we never
hit this previously due to some very consistent naming in IIO.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-3-jic23@kernel.org
---
 include/linux/iio/iio-opaque.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index e66b029d99de..f876e3aede2c 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -48,7 +48,7 @@ struct iio_dev_opaque {
 #endif
 };
 
-#define to_iio_dev_opaque(indio_dev)		\
-	container_of(indio_dev, struct iio_dev_opaque, indio_dev)
+#define to_iio_dev_opaque(_indio_dev)		\
+	container_of((_indio_dev), struct iio_dev_opaque, indio_dev)
 
 #endif
-- 
cgit v1.2.3


From 6eaf9f6a2738789dedb1e962096f61aaddd81464 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:05 +0100
Subject: iio: core: move @driver_module from struct iio_dev to struct
 iio_dev_opaque

Continuing move to hide internal elements from drivers, move this structure
element over.  It's only accessed from iio core files so this one was
straight forward and no accessor functions are needed.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-4-jic23@kernel.org
---
 drivers/iio/industrialio-core.c    | 2 +-
 drivers/iio/industrialio-trigger.c | 9 ++++++---
 include/linux/iio/iio-opaque.h     | 2 ++
 include/linux/iio/iio.h            | 3 ---
 4 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index ec21341c6322..58b35db00a81 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1858,7 +1858,7 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod)
 	if (!indio_dev->info)
 		return -EINVAL;
 
-	indio_dev->driver_module = this_mod;
+	iio_dev_opaque->driver_module = this_mod;
 	/* If the calling driver did not initialize of_node, do it here */
 	if (!indio_dev->dev.of_node && indio_dev->dev.parent)
 		indio_dev->dev.of_node = indio_dev->dev.parent->of_node;
diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c
index 3236647b2c37..b489eeeb0004 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 
 #include <linux/iio/iio.h>
+#include <linux/iio/iio-opaque.h>
 #include <linux/iio/trigger.h>
 #include "iio_core.h"
 #include "iio_core_trigger.h"
@@ -240,12 +241,13 @@ static void iio_trigger_put_irq(struct iio_trigger *trig, int irq)
 int iio_trigger_attach_poll_func(struct iio_trigger *trig,
 				 struct iio_poll_func *pf)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(pf->indio_dev);
 	bool notinuse =
 		bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER);
 	int ret = 0;
 
 	/* Prevent the module from being removed whilst attached to a trigger */
-	__module_get(pf->indio_dev->driver_module);
+	__module_get(iio_dev_opaque->driver_module);
 
 	/* Get irq number */
 	pf->irq = iio_trigger_get_irq(trig);
@@ -284,13 +286,14 @@ out_free_irq:
 out_put_irq:
 	iio_trigger_put_irq(trig, pf->irq);
 out_put_module:
-	module_put(pf->indio_dev->driver_module);
+	module_put(iio_dev_opaque->driver_module);
 	return ret;
 }
 
 int iio_trigger_detach_poll_func(struct iio_trigger *trig,
 				 struct iio_poll_func *pf)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(pf->indio_dev);
 	bool no_other_users =
 		bitmap_weight(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER) == 1;
 	int ret = 0;
@@ -304,7 +307,7 @@ int iio_trigger_detach_poll_func(struct iio_trigger *trig,
 		trig->attached_own_device = false;
 	iio_trigger_put_irq(trig, pf->irq);
 	free_irq(pf->irq, pf);
-	module_put(pf->indio_dev->driver_module);
+	module_put(iio_dev_opaque->driver_module);
 
 	return ret;
 }
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index f876e3aede2c..96dd265103d0 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -7,6 +7,7 @@
  * struct iio_dev_opaque - industrial I/O device opaque information
  * @indio_dev:			public industrial I/O device information
  * @id:			used to identify device internally
+ * @driver_module:		used to make it harder to undercut users
  * @event_interface:		event chrdevs associated with interrupt lines
  * @attached_buffers:		array of buffers statically attached by the driver
  * @attached_buffers_cnt:	number of buffers in the array of statically attached buffers
@@ -28,6 +29,7 @@
 struct iio_dev_opaque {
 	struct iio_dev			indio_dev;
 	int				id;
+	struct module			*driver_module;
 	struct iio_event_interface	*event_interface;
 	struct iio_buffer		**attached_buffers;
 	unsigned int			attached_buffers_cnt;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 569861d5887a..9e8e1358a032 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -488,7 +488,6 @@ struct iio_buffer_setup_ops {
 
 /**
  * struct iio_dev - industrial I/O device
- * @driver_module:	[INTERN] used to make it harder to undercut users
  * @modes:		[DRIVER] operating modes supported by device
  * @currentmode:	[DRIVER] current operating mode
  * @dev:		[DRIVER] device structure, should be assigned a parent
@@ -522,8 +521,6 @@ struct iio_buffer_setup_ops {
  *			**MUST** be accessed **ONLY** via iio_priv() helper
  */
 struct iio_dev {
-	struct module			*driver_module;
-
 	int				modes;
 	int				currentmode;
 	struct device			dev;
-- 
cgit v1.2.3


From 3028e0c2af95dd476ccd71f4fc025990385168c2 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:06 +0100
Subject: iio: core: move @trig_readonly from struct iio_dev to struct
 iio_dev_opaque

This is only set via the iio_trig_set_immutable() call and later used
by the IIO core so there is no benefit in drivers being able to access
it.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-5-jic23@kernel.org
---
 drivers/iio/industrialio-trigger.c | 10 +++++++---
 include/linux/iio/iio-opaque.h     |  2 ++
 include/linux/iio/iio.h            |  2 --
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c
index b489eeeb0004..b23caa2f2aa1 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -117,14 +117,17 @@ EXPORT_SYMBOL(iio_trigger_unregister);
 
 int iio_trigger_set_immutable(struct iio_dev *indio_dev, struct iio_trigger *trig)
 {
+	struct iio_dev_opaque *iio_dev_opaque;
+
 	if (!indio_dev || !trig)
 		return -EINVAL;
 
+	iio_dev_opaque = to_iio_dev_opaque(indio_dev);
 	mutex_lock(&indio_dev->mlock);
-	WARN_ON(indio_dev->trig_readonly);
+	WARN_ON(iio_dev_opaque->trig_readonly);
 
 	indio_dev->trig = iio_trigger_get(trig);
-	indio_dev->trig_readonly = true;
+	iio_dev_opaque->trig_readonly = true;
 	mutex_unlock(&indio_dev->mlock);
 
 	return 0;
@@ -402,6 +405,7 @@ static ssize_t iio_trigger_write_current(struct device *dev,
 					 size_t len)
 {
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
 	struct iio_trigger *oldtrig = indio_dev->trig;
 	struct iio_trigger *trig;
 	int ret;
@@ -411,7 +415,7 @@ static ssize_t iio_trigger_write_current(struct device *dev,
 		mutex_unlock(&indio_dev->mlock);
 		return -EBUSY;
 	}
-	if (indio_dev->trig_readonly) {
+	if (iio_dev_opaque->trig_readonly) {
 		mutex_unlock(&indio_dev->mlock);
 		return -EPERM;
 	}
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 96dd265103d0..10aa97239117 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -8,6 +8,7 @@
  * @indio_dev:			public industrial I/O device information
  * @id:			used to identify device internally
  * @driver_module:		used to make it harder to undercut users
+ * @trig_readonly:		mark the current trigger immutable
  * @event_interface:		event chrdevs associated with interrupt lines
  * @attached_buffers:		array of buffers statically attached by the driver
  * @attached_buffers_cnt:	number of buffers in the array of statically attached buffers
@@ -30,6 +31,7 @@ struct iio_dev_opaque {
 	struct iio_dev			indio_dev;
 	int				id;
 	struct module			*driver_module;
+	bool				trig_readonly;
 	struct iio_event_interface	*event_interface;
 	struct iio_buffer		**attached_buffers;
 	unsigned int			attached_buffers_cnt;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 9e8e1358a032..672f141f74c5 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -503,7 +503,6 @@ struct iio_buffer_setup_ops {
  * @scan_timestamp:	[INTERN] set if any buffers have requested timestamp
  * @scan_index_timestamp:[INTERN] cache of the index to the timestamp
  * @trig:		[INTERN] current device trigger (buffer modes)
- * @trig_readonly:	[INTERN] mark the current trigger immutable
  * @pollfunc:		[DRIVER] function run on trigger being received
  * @pollfunc_event:	[DRIVER] function run on events trigger being received
  * @channels:		[DRIVER] channel specification structure table
@@ -535,7 +534,6 @@ struct iio_dev {
 	bool				scan_timestamp;
 	unsigned			scan_index_timestamp;
 	struct iio_trigger		*trig;
-	bool				trig_readonly;
 	struct iio_poll_func		*pollfunc;
 	struct iio_poll_func		*pollfunc_event;
 
-- 
cgit v1.2.3


From 62f4f36cdfcdbb961bbbeab15e6595dd391d2205 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:07 +0100
Subject: iio: core: move @scan_index_timestamp to struct iio_dev_opaque

No reason for this cached value to be exposed to drivers so move it
to the opaque structure.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-6-jic23@kernel.org
---
 drivers/iio/industrialio-buffer.c | 7 +++++--
 include/linux/iio/iio-opaque.h    | 4 ++++
 include/linux/iio/iio.h           | 2 --
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 9a8e16c7e9af..9ecb3c58d94c 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -601,8 +601,10 @@ static unsigned int iio_storage_bytes_for_si(struct iio_dev *indio_dev,
 
 static unsigned int iio_storage_bytes_for_timestamp(struct iio_dev *indio_dev)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+
 	return iio_storage_bytes_for_si(indio_dev,
-					indio_dev->scan_index_timestamp);
+					iio_dev_opaque->scan_index_timestamp);
 }
 
 static int iio_compute_scan_bytes(struct iio_dev *indio_dev,
@@ -1469,6 +1471,7 @@ static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 					     struct iio_dev *indio_dev,
 					     int index)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
 	struct iio_dev_attr *p;
 	struct attribute **attr;
 	int ret, i, attrn, scan_el_attrcount, buffer_attrcount;
@@ -1495,7 +1498,7 @@ static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 				goto error_cleanup_dynamic;
 			scan_el_attrcount += ret;
 			if (channels[i].type == IIO_TIMESTAMP)
-				indio_dev->scan_index_timestamp =
+				iio_dev_opaque->scan_index_timestamp =
 					channels[i].scan_index;
 		}
 		if (indio_dev->masklength && buffer->scan_mask == NULL) {
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 10aa97239117..02038fb2d291 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -22,6 +22,7 @@
  * @groupcounter:		index of next attribute group
  * @legacy_scan_el_group:	attribute group for legacy scan elements attribute group
  * @legacy_buffer_group:	attribute group for legacy buffer attributes group
+ * @scan_index_timestamp:	cache of the index to the timestamp
  * @debugfs_dentry:		device specific debugfs dentry
  * @cached_reg_addr:		cached register address for debugfs reads
  * @read_buf:			read buffer to be used for the initial reg read
@@ -44,6 +45,9 @@ struct iio_dev_opaque {
 	int				groupcounter;
 	struct attribute_group		legacy_scan_el_group;
 	struct attribute_group		legacy_buffer_group;
+
+	unsigned int			scan_index_timestamp;
+
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry			*debugfs_dentry;
 	unsigned			cached_reg_addr;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 672f141f74c5..cbc9e9ece0a6 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -501,7 +501,6 @@ struct iio_buffer_setup_ops {
  *			channels
  * @active_scan_mask:	[INTERN] union of all scan masks requested by buffers
  * @scan_timestamp:	[INTERN] set if any buffers have requested timestamp
- * @scan_index_timestamp:[INTERN] cache of the index to the timestamp
  * @trig:		[INTERN] current device trigger (buffer modes)
  * @pollfunc:		[DRIVER] function run on trigger being received
  * @pollfunc_event:	[DRIVER] function run on events trigger being received
@@ -532,7 +531,6 @@ struct iio_dev {
 	unsigned			masklength;
 	const unsigned long		*active_scan_mask;
 	bool				scan_timestamp;
-	unsigned			scan_index_timestamp;
 	struct iio_trigger		*trig;
 	struct iio_poll_func		*pollfunc;
 	struct iio_poll_func		*pollfunc_event;
-- 
cgit v1.2.3


From b804e2b76ac6d5559b99588e0190ac97b5597497 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:08 +0100
Subject: iio: core: move @info_exist_lock to struct iio_dev_opaque

This lock is only of interest to the IIO core, so make it only
visible there.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-7-jic23@kernel.org
---
 drivers/iio/industrialio-buffer.c |  5 +++--
 drivers/iio/industrialio-core.c   | 10 ++++-----
 drivers/iio/inkern.c              | 46 ++++++++++++++++++++++++---------------
 include/linux/iio/iio-opaque.h    |  2 ++
 include/linux/iio/iio.h           |  2 --
 5 files changed, 38 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 9ecb3c58d94c..10923b40c76d 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -1150,12 +1150,13 @@ int iio_update_buffers(struct iio_dev *indio_dev,
 		       struct iio_buffer *insert_buffer,
 		       struct iio_buffer *remove_buffer)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
 	int ret;
 
 	if (insert_buffer == remove_buffer)
 		return 0;
 
-	mutex_lock(&indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	mutex_lock(&indio_dev->mlock);
 
 	if (insert_buffer && iio_buffer_is_active(insert_buffer))
@@ -1178,7 +1179,7 @@ int iio_update_buffers(struct iio_dev *indio_dev,
 
 out_unlock:
 	mutex_unlock(&indio_dev->mlock);
-	mutex_unlock(&indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 58b35db00a81..74bb977e2357 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1642,7 +1642,7 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv)
 	device_initialize(&indio_dev->dev);
 	iio_device_set_drvdata(indio_dev, (void *)indio_dev);
 	mutex_init(&indio_dev->mlock);
-	mutex_init(&indio_dev->info_exist_lock);
+	mutex_init(&iio_dev_opaque->info_exist_lock);
 	INIT_LIST_HEAD(&iio_dev_opaque->channel_attr_list);
 
 	iio_dev_opaque->id = ida_simple_get(&iio_ida, 0, 0, GFP_KERNEL);
@@ -1779,7 +1779,7 @@ static long iio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	struct iio_ioctl_handler *h;
 	int ret = -ENODEV;
 
-	mutex_lock(&indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 
 	/**
 	 * The NULL check here is required to prevent crashing when a device
@@ -1799,7 +1799,7 @@ static long iio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		ret = -ENODEV;
 
 out_unlock:
-	mutex_unlock(&indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -1938,7 +1938,7 @@ void iio_device_unregister(struct iio_dev *indio_dev)
 {
 	cdev_device_del(&indio_dev->chrdev, &indio_dev->dev);
 
-	mutex_lock(&indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 
 	iio_device_unregister_debugfs(indio_dev);
 
@@ -1949,7 +1949,7 @@ void iio_device_unregister(struct iio_dev *indio_dev)
 	iio_device_wakeup_eventset(indio_dev);
 	iio_buffer_wakeup_poll(indio_dev);
 
-	mutex_unlock(&indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	iio_buffers_free_sysfs_and_mask(indio_dev);
 }
diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index 5aa740cea661..391a3380a1d1 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -10,6 +10,7 @@
 #include <linux/of.h>
 
 #include <linux/iio/iio.h>
+#include <linux/iio/iio-opaque.h>
 #include "iio_core.h"
 #include <linux/iio/machine.h>
 #include <linux/iio/driver.h>
@@ -538,9 +539,10 @@ static int iio_channel_read(struct iio_channel *chan, int *val, int *val2,
 
 int iio_read_channel_raw(struct iio_channel *chan, int *val)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret;
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (chan->indio_dev->info == NULL) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -548,7 +550,7 @@ int iio_read_channel_raw(struct iio_channel *chan, int *val)
 
 	ret = iio_channel_read(chan, val, NULL, IIO_CHAN_INFO_RAW);
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -556,9 +558,10 @@ EXPORT_SYMBOL_GPL(iio_read_channel_raw);
 
 int iio_read_channel_average_raw(struct iio_channel *chan, int *val)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret;
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (chan->indio_dev->info == NULL) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -566,7 +569,7 @@ int iio_read_channel_average_raw(struct iio_channel *chan, int *val)
 
 	ret = iio_channel_read(chan, val, NULL, IIO_CHAN_INFO_AVERAGE_RAW);
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -631,9 +634,10 @@ static int iio_convert_raw_to_processed_unlocked(struct iio_channel *chan,
 int iio_convert_raw_to_processed(struct iio_channel *chan, int raw,
 	int *processed, unsigned int scale)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret;
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (chan->indio_dev->info == NULL) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -642,7 +646,7 @@ int iio_convert_raw_to_processed(struct iio_channel *chan, int raw,
 	ret = iio_convert_raw_to_processed_unlocked(chan, raw, processed,
 							scale);
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -651,9 +655,10 @@ EXPORT_SYMBOL_GPL(iio_convert_raw_to_processed);
 int iio_read_channel_attribute(struct iio_channel *chan, int *val, int *val2,
 			       enum iio_chan_info_enum attribute)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret;
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (chan->indio_dev->info == NULL) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -661,7 +666,7 @@ int iio_read_channel_attribute(struct iio_channel *chan, int *val, int *val2,
 
 	ret = iio_channel_read(chan, val, val2, attribute);
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -676,9 +681,10 @@ EXPORT_SYMBOL_GPL(iio_read_channel_offset);
 int iio_read_channel_processed_scale(struct iio_channel *chan, int *val,
 				     unsigned int scale)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret;
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (chan->indio_dev->info == NULL) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -699,7 +705,7 @@ int iio_read_channel_processed_scale(struct iio_channel *chan, int *val,
 	}
 
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -733,9 +739,10 @@ int iio_read_avail_channel_attribute(struct iio_channel *chan,
 				     const int **vals, int *type, int *length,
 				     enum iio_chan_info_enum attribute)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret;
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (!chan->indio_dev->info) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -743,7 +750,7 @@ int iio_read_avail_channel_attribute(struct iio_channel *chan,
 
 	ret = iio_channel_read_avail(chan, vals, type, length, attribute);
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -815,10 +822,11 @@ static int iio_channel_read_max(struct iio_channel *chan,
 
 int iio_read_max_channel_raw(struct iio_channel *chan, int *val)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret;
 	int type;
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (!chan->indio_dev->info) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -826,7 +834,7 @@ int iio_read_max_channel_raw(struct iio_channel *chan, int *val)
 
 	ret = iio_channel_read_max(chan, val, NULL, &type, IIO_CHAN_INFO_RAW);
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -834,10 +842,11 @@ EXPORT_SYMBOL_GPL(iio_read_max_channel_raw);
 
 int iio_get_channel_type(struct iio_channel *chan, enum iio_chan_type *type)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret = 0;
 	/* Need to verify underlying driver has not gone away */
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (chan->indio_dev->info == NULL) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -845,7 +854,7 @@ int iio_get_channel_type(struct iio_channel *chan, enum iio_chan_type *type)
 
 	*type = chan->channel->type;
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
@@ -861,9 +870,10 @@ static int iio_channel_write(struct iio_channel *chan, int val, int val2,
 int iio_write_channel_attribute(struct iio_channel *chan, int val, int val2,
 				enum iio_chan_info_enum attribute)
 {
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(chan->indio_dev);
 	int ret;
 
-	mutex_lock(&chan->indio_dev->info_exist_lock);
+	mutex_lock(&iio_dev_opaque->info_exist_lock);
 	if (chan->indio_dev->info == NULL) {
 		ret = -ENODEV;
 		goto err_unlock;
@@ -871,7 +881,7 @@ int iio_write_channel_attribute(struct iio_channel *chan, int val, int val2,
 
 	ret = iio_channel_write(chan, val, val2, attribute);
 err_unlock:
-	mutex_unlock(&chan->indio_dev->info_exist_lock);
+	mutex_unlock(&iio_dev_opaque->info_exist_lock);
 
 	return ret;
 }
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 02038fb2d291..538b4b5ef1a9 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -8,6 +8,7 @@
  * @indio_dev:			public industrial I/O device information
  * @id:			used to identify device internally
  * @driver_module:		used to make it harder to undercut users
+ * @info_exist_lock:		lock to prevent use during removal
  * @trig_readonly:		mark the current trigger immutable
  * @event_interface:		event chrdevs associated with interrupt lines
  * @attached_buffers:		array of buffers statically attached by the driver
@@ -32,6 +33,7 @@ struct iio_dev_opaque {
 	struct iio_dev			indio_dev;
 	int				id;
 	struct module			*driver_module;
+	struct mutex			info_exist_lock;
 	bool				trig_readonly;
 	struct iio_event_interface	*event_interface;
 	struct iio_buffer		**attached_buffers;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index cbc9e9ece0a6..a12bbd8b1e74 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -510,7 +510,6 @@ struct iio_buffer_setup_ops {
  * @label:              [DRIVER] unique name to identify which device this is
  * @info:		[DRIVER] callbacks and constant info from driver
  * @clock_id:		[INTERN] timestamping clock posix identifier
- * @info_exist_lock:	[INTERN] lock to prevent use during removal
  * @setup_ops:		[DRIVER] callbacks to call before and after buffer
  *			enable/disable
  * @chrdev:		[INTERN] associated character device
@@ -542,7 +541,6 @@ struct iio_dev {
 	const char			*label;
 	const struct iio_info		*info;
 	clockid_t			clock_id;
-	struct mutex			info_exist_lock;
 	const struct iio_buffer_setup_ops	*setup_ops;
 	struct cdev			chrdev;
 
-- 
cgit v1.2.3


From 396f7234856956eb29f009da6e5d846f29f87ebd Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:09 +0100
Subject: iio: core: move @chrdev from struct iio_dev to struct iio_dev_opaque

No reason for this to be exposed to the drivers, so lets move it to the
opaque structure.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-8-jic23@kernel.org
---
 drivers/iio/industrialio-core.c | 22 +++++++++++++---------
 include/linux/iio/iio-opaque.h  |  2 ++
 include/linux/iio/iio.h         |  2 --
 3 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 74bb977e2357..0aba0a0085eb 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1715,8 +1715,9 @@ EXPORT_SYMBOL_GPL(devm_iio_device_alloc);
  **/
 static int iio_chrdev_open(struct inode *inode, struct file *filp)
 {
-	struct iio_dev *indio_dev = container_of(inode->i_cdev,
-						struct iio_dev, chrdev);
+	struct iio_dev_opaque *iio_dev_opaque =
+		container_of(inode->i_cdev, struct iio_dev_opaque, chrdev);
+	struct iio_dev *indio_dev = &iio_dev_opaque->indio_dev;
 	struct iio_dev_buffer_pair *ib;
 
 	if (test_and_set_bit(IIO_BUSY_BIT_POS, &indio_dev->flags))
@@ -1749,8 +1750,9 @@ static int iio_chrdev_open(struct inode *inode, struct file *filp)
 static int iio_chrdev_release(struct inode *inode, struct file *filp)
 {
 	struct iio_dev_buffer_pair *ib = filp->private_data;
-	struct iio_dev *indio_dev = container_of(inode->i_cdev,
-						struct iio_dev, chrdev);
+	struct iio_dev_opaque *iio_dev_opaque =
+		container_of(inode->i_cdev, struct iio_dev_opaque, chrdev);
+	struct iio_dev *indio_dev = &iio_dev_opaque->indio_dev;
 	kfree(ib);
 	clear_bit(IIO_BUSY_BIT_POS, &indio_dev->flags);
 	iio_device_put(indio_dev);
@@ -1900,19 +1902,19 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod)
 		indio_dev->setup_ops = &noop_ring_setup_ops;
 
 	if (iio_dev_opaque->attached_buffers_cnt)
-		cdev_init(&indio_dev->chrdev, &iio_buffer_fileops);
+		cdev_init(&iio_dev_opaque->chrdev, &iio_buffer_fileops);
 	else if (iio_dev_opaque->event_interface)
-		cdev_init(&indio_dev->chrdev, &iio_event_fileops);
+		cdev_init(&iio_dev_opaque->chrdev, &iio_event_fileops);
 
 	if (iio_dev_opaque->attached_buffers_cnt || iio_dev_opaque->event_interface) {
 		indio_dev->dev.devt = MKDEV(MAJOR(iio_devt), iio_dev_opaque->id);
-		indio_dev->chrdev.owner = this_mod;
+		iio_dev_opaque->chrdev.owner = this_mod;
 	}
 
 	/* assign device groups now; they should be all registered now */
 	indio_dev->dev.groups = iio_dev_opaque->groups;
 
-	ret = cdev_device_add(&indio_dev->chrdev, &indio_dev->dev);
+	ret = cdev_device_add(&iio_dev_opaque->chrdev, &indio_dev->dev);
 	if (ret < 0)
 		goto error_unreg_eventset;
 
@@ -1936,7 +1938,9 @@ EXPORT_SYMBOL(__iio_device_register);
  **/
 void iio_device_unregister(struct iio_dev *indio_dev)
 {
-	cdev_device_del(&indio_dev->chrdev, &indio_dev->dev);
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+
+	cdev_device_del(&iio_dev_opaque->chrdev, &indio_dev->dev);
 
 	mutex_lock(&iio_dev_opaque->info_exist_lock);
 
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 538b4b5ef1a9..2f8ef5d15a66 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -24,6 +24,7 @@
  * @legacy_scan_el_group:	attribute group for legacy scan elements attribute group
  * @legacy_buffer_group:	attribute group for legacy buffer attributes group
  * @scan_index_timestamp:	cache of the index to the timestamp
+ * @chrdev:			associated character device
  * @debugfs_dentry:		device specific debugfs dentry
  * @cached_reg_addr:		cached register address for debugfs reads
  * @read_buf:			read buffer to be used for the initial reg read
@@ -49,6 +50,7 @@ struct iio_dev_opaque {
 	struct attribute_group		legacy_buffer_group;
 
 	unsigned int			scan_index_timestamp;
+	struct cdev			chrdev;
 
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry			*debugfs_dentry;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index a12bbd8b1e74..586e2dc4fbf3 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -512,7 +512,6 @@ struct iio_buffer_setup_ops {
  * @clock_id:		[INTERN] timestamping clock posix identifier
  * @setup_ops:		[DRIVER] callbacks to call before and after buffer
  *			enable/disable
- * @chrdev:		[INTERN] associated character device
  * @flags:		[INTERN] file ops related flags including busy flag.
  * @priv:		[DRIVER] reference to driver's private information
  *			**MUST** be accessed **ONLY** via iio_priv() helper
@@ -542,7 +541,6 @@ struct iio_dev {
 	const struct iio_info		*info;
 	clockid_t			clock_id;
 	const struct iio_buffer_setup_ops	*setup_ops;
-	struct cdev			chrdev;
 
 	unsigned long			flags;
 	void				*priv;
-- 
cgit v1.2.3


From 8b1c82cb849f8f7c758891099f2128b8fbc05744 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:10 +0100
Subject: iio: core: move @flags from struct iio_dev to struct iio_dev_opaque

No reason any driver should ever need access to this field, so hide it.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-9-jic23@kernel.org
---
 drivers/iio/industrialio-core.c | 6 +++---
 include/linux/iio/iio-opaque.h  | 2 ++
 include/linux/iio/iio.h         | 2 --
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 0aba0a0085eb..29ff7668297e 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1720,7 +1720,7 @@ static int iio_chrdev_open(struct inode *inode, struct file *filp)
 	struct iio_dev *indio_dev = &iio_dev_opaque->indio_dev;
 	struct iio_dev_buffer_pair *ib;
 
-	if (test_and_set_bit(IIO_BUSY_BIT_POS, &indio_dev->flags))
+	if (test_and_set_bit(IIO_BUSY_BIT_POS, &iio_dev_opaque->flags))
 		return -EBUSY;
 
 	iio_device_get(indio_dev);
@@ -1728,7 +1728,7 @@ static int iio_chrdev_open(struct inode *inode, struct file *filp)
 	ib = kmalloc(sizeof(*ib), GFP_KERNEL);
 	if (!ib) {
 		iio_device_put(indio_dev);
-		clear_bit(IIO_BUSY_BIT_POS, &indio_dev->flags);
+		clear_bit(IIO_BUSY_BIT_POS, &iio_dev_opaque->flags);
 		return -ENOMEM;
 	}
 
@@ -1754,7 +1754,7 @@ static int iio_chrdev_release(struct inode *inode, struct file *filp)
 		container_of(inode->i_cdev, struct iio_dev_opaque, chrdev);
 	struct iio_dev *indio_dev = &iio_dev_opaque->indio_dev;
 	kfree(ib);
-	clear_bit(IIO_BUSY_BIT_POS, &indio_dev->flags);
+	clear_bit(IIO_BUSY_BIT_POS, &iio_dev_opaque->flags);
 	iio_device_put(indio_dev);
 
 	return 0;
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 2f8ef5d15a66..d7c3036861ac 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -25,6 +25,7 @@
  * @legacy_buffer_group:	attribute group for legacy buffer attributes group
  * @scan_index_timestamp:	cache of the index to the timestamp
  * @chrdev:			associated character device
+ * @flags:			file ops related flags including busy flag.
  * @debugfs_dentry:		device specific debugfs dentry
  * @cached_reg_addr:		cached register address for debugfs reads
  * @read_buf:			read buffer to be used for the initial reg read
@@ -51,6 +52,7 @@ struct iio_dev_opaque {
 
 	unsigned int			scan_index_timestamp;
 	struct cdev			chrdev;
+	unsigned long			flags;
 
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry			*debugfs_dentry;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 586e2dc4fbf3..ed0537015eee 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -512,7 +512,6 @@ struct iio_buffer_setup_ops {
  * @clock_id:		[INTERN] timestamping clock posix identifier
  * @setup_ops:		[DRIVER] callbacks to call before and after buffer
  *			enable/disable
- * @flags:		[INTERN] file ops related flags including busy flag.
  * @priv:		[DRIVER] reference to driver's private information
  *			**MUST** be accessed **ONLY** via iio_priv() helper
  */
@@ -542,7 +541,6 @@ struct iio_dev {
 	clockid_t			clock_id;
 	const struct iio_buffer_setup_ops	*setup_ops;
 
-	unsigned long			flags;
 	void				*priv;
 };
 
-- 
cgit v1.2.3


From 62a486c46d61bc684967fc3f83eed15dde49cf9b Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Mon, 26 Apr 2021 18:49:11 +0100
Subject: iio: core: move @clock_id from struct iio_dev to struct
 iio_dev_opaque

There is already an acessor function used to access it, making this
move straight forward.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Link: https://lore.kernel.org/r/20210426174911.397061-10-jic23@kernel.org
---
 drivers/iio/industrialio-core.c | 14 +++++++++++++-
 include/linux/iio/iio-opaque.h  |  2 ++
 include/linux/iio/iio.h         | 12 +-----------
 3 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 29ff7668297e..bfa20a346f71 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -271,13 +271,25 @@ int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id)
 		mutex_unlock(&indio_dev->mlock);
 		return -EBUSY;
 	}
-	indio_dev->clock_id = clock_id;
+	iio_dev_opaque->clock_id = clock_id;
 	mutex_unlock(&indio_dev->mlock);
 
 	return 0;
 }
 EXPORT_SYMBOL(iio_device_set_clock);
 
+/**
+ * iio_device_get_clock() - Retrieve current timestamping clock for the device
+ * @indio_dev: IIO device structure containing the device
+ */
+clockid_t iio_device_get_clock(const struct iio_dev *indio_dev)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+
+	return iio_dev_opaque->clock_id;
+}
+EXPORT_SYMBOL(iio_device_get_clock);
+
 /**
  * iio_get_time_ns() - utility function to get a time stamp for events etc
  * @indio_dev: device
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index d7c3036861ac..c9504e9da571 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -24,6 +24,7 @@
  * @legacy_scan_el_group:	attribute group for legacy scan elements attribute group
  * @legacy_buffer_group:	attribute group for legacy buffer attributes group
  * @scan_index_timestamp:	cache of the index to the timestamp
+ * @clock_id:			timestamping clock posix identifier
  * @chrdev:			associated character device
  * @flags:			file ops related flags including busy flag.
  * @debugfs_dentry:		device specific debugfs dentry
@@ -51,6 +52,7 @@ struct iio_dev_opaque {
 	struct attribute_group		legacy_buffer_group;
 
 	unsigned int			scan_index_timestamp;
+	clockid_t			clock_id;
 	struct cdev			chrdev;
 	unsigned long			flags;
 
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index ed0537015eee..5606a3f4c4cb 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -509,7 +509,6 @@ struct iio_buffer_setup_ops {
  * @name:		[DRIVER] name of the device.
  * @label:              [DRIVER] unique name to identify which device this is
  * @info:		[DRIVER] callbacks and constant info from driver
- * @clock_id:		[INTERN] timestamping clock posix identifier
  * @setup_ops:		[DRIVER] callbacks to call before and after buffer
  *			enable/disable
  * @priv:		[DRIVER] reference to driver's private information
@@ -538,7 +537,6 @@ struct iio_dev {
 	const char			*name;
 	const char			*label;
 	const struct iio_info		*info;
-	clockid_t			clock_id;
 	const struct iio_buffer_setup_ops	*setup_ops;
 
 	void				*priv;
@@ -589,15 +587,7 @@ static inline void iio_device_put(struct iio_dev *indio_dev)
 		put_device(&indio_dev->dev);
 }
 
-/**
- * iio_device_get_clock() - Retrieve current timestamping clock for the device
- * @indio_dev: IIO device structure containing the device
- */
-static inline clockid_t iio_device_get_clock(const struct iio_dev *indio_dev)
-{
-	return indio_dev->clock_id;
-}
-
+clockid_t iio_device_get_clock(const struct iio_dev *indio_dev);
 int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id);
 
 /**
-- 
cgit v1.2.3


From 38934daf7b5c1b35a01748cb7d4272282cc3a890 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 14 Apr 2021 22:54:50 +0300
Subject: iio: magnetometer: st_magn: Provide default platform data

Provide default platform data for magnetometer in case it supports DRDY.

One case is LSM9DS0 IMU, on which it is the case. Since accelerometer
is using INT1, default magnetometer to INT2.

While at it, update description of the drdy_int_pin field.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210414195454.84183-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/magnetometer/st_magn_core.c        | 11 ++++++++++-
 include/linux/platform_data/st_sensors_pdata.h |  3 ++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c
index 71faebd07feb..55357f266b8b 100644
--- a/drivers/iio/magnetometer/st_magn_core.c
+++ b/drivers/iio/magnetometer/st_magn_core.c
@@ -383,6 +383,11 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = {
 	},
 };
 
+/* Default magn DRDY is available on INT2 pin */
+static const struct st_sensors_platform_data default_magn_pdata = {
+	.drdy_int_pin = 2,
+};
+
 static int st_magn_read_raw(struct iio_dev *indio_dev,
 			struct iio_chan_spec const *ch, int *val,
 							int *val2, long mask)
@@ -490,6 +495,7 @@ EXPORT_SYMBOL(st_magn_get_settings);
 int st_magn_common_probe(struct iio_dev *indio_dev)
 {
 	struct st_sensor_data *mdata = iio_priv(indio_dev);
+	struct st_sensors_platform_data *pdata = dev_get_platdata(mdata->dev);
 	int err;
 
 	indio_dev->modes = INDIO_DIRECT_MODE;
@@ -510,7 +516,10 @@ int st_magn_common_probe(struct iio_dev *indio_dev)
 	mdata->current_fullscale = &mdata->sensor_settings->fs.fs_avl[0];
 	mdata->odr = mdata->sensor_settings->odr.odr_avl[0].hz;
 
-	err = st_sensors_init_sensor(indio_dev, NULL);
+	if (!pdata)
+		pdata = (struct st_sensors_platform_data *)&default_magn_pdata;
+
+	err = st_sensors_init_sensor(indio_dev, pdata);
 	if (err < 0)
 		goto st_magn_power_off;
 
diff --git a/include/linux/platform_data/st_sensors_pdata.h b/include/linux/platform_data/st_sensors_pdata.h
index e40b28ca892e..897051e51b78 100644
--- a/include/linux/platform_data/st_sensors_pdata.h
+++ b/include/linux/platform_data/st_sensors_pdata.h
@@ -13,8 +13,9 @@
 /**
  * struct st_sensors_platform_data - Platform data for the ST sensors
  * @drdy_int_pin: Redirect DRDY on pin 1 (1) or pin 2 (2).
- *	Available only for accelerometer and pressure sensors.
+ *	Available only for accelerometer, magnetometer and pressure sensors.
  *	Accelerometer DRDY on LSM330 available only on pin 1 (see datasheet).
+ *	Magnetometer DRDY is supported only on LSM9DS0.
  * @open_drain: set the interrupt line to be open drain if possible.
  * @spi_3wire: enable spi-3wire mode.
  * @pullups: enable/disable i2c controller pullup resistors.
-- 
cgit v1.2.3


From d61881ef7f08aef02d9bfc8c66f4c89c59cdf112 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 14 Apr 2021 22:54:52 +0300
Subject: iio: st_sensors: Make accel, gyro, magn and pressure probe shared

Some IMUs may utilize existing library code for STMicro accelerometer,
gyroscope, magnetometer and pressure. Let's share them via st_sensors.h.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210414195454.84183-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/st_accel.h          |  4 ----
 drivers/iio/gyro/st_gyro.h            |  4 ----
 drivers/iio/magnetometer/st_magn.h    |  4 ----
 drivers/iio/pressure/st_pressure.h    |  4 ----
 include/linux/iio/common/st_sensors.h | 20 ++++++++++++++++++++
 5 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/st_accel.h b/drivers/iio/accel/st_accel.h
index 181ebe79c4eb..f5b0b8bbaff7 100644
--- a/drivers/iio/accel/st_accel.h
+++ b/drivers/iio/accel/st_accel.h
@@ -62,10 +62,6 @@ enum st_accel_type {
 #define LIS2DE12_ACCEL_DEV_NAME		"lis2de12"
 #define LIS2HH12_ACCEL_DEV_NAME		"lis2hh12"
 
-const struct st_sensor_settings *st_accel_get_settings(const char *name);
-int st_accel_common_probe(struct iio_dev *indio_dev);
-void st_accel_common_remove(struct iio_dev *indio_dev);
-
 #ifdef CONFIG_IIO_BUFFER
 int st_accel_allocate_ring(struct iio_dev *indio_dev);
 void st_accel_deallocate_ring(struct iio_dev *indio_dev);
diff --git a/drivers/iio/gyro/st_gyro.h b/drivers/iio/gyro/st_gyro.h
index b385fe664dcc..6537f5cb8320 100644
--- a/drivers/iio/gyro/st_gyro.h
+++ b/drivers/iio/gyro/st_gyro.h
@@ -24,10 +24,6 @@
 #define LSM330_GYRO_DEV_NAME		"lsm330_gyro"
 #define LSM9DS0_GYRO_DEV_NAME		"lsm9ds0_gyro"
 
-const struct st_sensor_settings *st_gyro_get_settings(const char *name);
-int st_gyro_common_probe(struct iio_dev *indio_dev);
-void st_gyro_common_remove(struct iio_dev *indio_dev);
-
 #ifdef CONFIG_IIO_BUFFER
 int st_gyro_allocate_ring(struct iio_dev *indio_dev);
 void st_gyro_deallocate_ring(struct iio_dev *indio_dev);
diff --git a/drivers/iio/magnetometer/st_magn.h b/drivers/iio/magnetometer/st_magn.h
index 7ba6a6ba5c58..fb6c906c4c0c 100644
--- a/drivers/iio/magnetometer/st_magn.h
+++ b/drivers/iio/magnetometer/st_magn.h
@@ -23,10 +23,6 @@
 #define LSM9DS1_MAGN_DEV_NAME		"lsm9ds1_magn"
 #define IIS2MDC_MAGN_DEV_NAME		"iis2mdc"
 
-const struct st_sensor_settings *st_magn_get_settings(const char *name);
-int st_magn_common_probe(struct iio_dev *indio_dev);
-void st_magn_common_remove(struct iio_dev *indio_dev);
-
 #ifdef CONFIG_IIO_BUFFER
 int st_magn_allocate_ring(struct iio_dev *indio_dev);
 void st_magn_deallocate_ring(struct iio_dev *indio_dev);
diff --git a/drivers/iio/pressure/st_pressure.h b/drivers/iio/pressure/st_pressure.h
index 5c746ff6087e..9417b3bd7513 100644
--- a/drivers/iio/pressure/st_pressure.h
+++ b/drivers/iio/pressure/st_pressure.h
@@ -41,10 +41,6 @@ static __maybe_unused const struct st_sensors_platform_data default_press_pdata
 	.drdy_int_pin = 1,
 };
 
-const struct st_sensor_settings *st_press_get_settings(const char *name);
-int st_press_common_probe(struct iio_dev *indio_dev);
-void st_press_common_remove(struct iio_dev *indio_dev);
-
 #ifdef CONFIG_IIO_BUFFER
 int st_press_allocate_ring(struct iio_dev *indio_dev);
 void st_press_deallocate_ring(struct iio_dev *indio_dev);
diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index 33e939977444..aa017b90fb06 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -317,4 +317,24 @@ ssize_t st_sensors_sysfs_scale_avail(struct device *dev,
 
 void st_sensors_dev_name_probe(struct device *dev, char *name, int len);
 
+/* Accelerometer */
+const struct st_sensor_settings *st_accel_get_settings(const char *name);
+int st_accel_common_probe(struct iio_dev *indio_dev);
+void st_accel_common_remove(struct iio_dev *indio_dev);
+
+/* Gyroscope */
+const struct st_sensor_settings *st_gyro_get_settings(const char *name);
+int st_gyro_common_probe(struct iio_dev *indio_dev);
+void st_gyro_common_remove(struct iio_dev *indio_dev);
+
+/* Magnetometer */
+const struct st_sensor_settings *st_magn_get_settings(const char *name);
+int st_magn_common_probe(struct iio_dev *indio_dev);
+void st_magn_common_remove(struct iio_dev *indio_dev);
+
+/* Pressure */
+const struct st_sensor_settings *st_press_get_settings(const char *name);
+int st_press_common_probe(struct iio_dev *indio_dev);
+void st_press_common_remove(struct iio_dev *indio_dev);
+
 #endif /* ST_SENSORS_H */
-- 
cgit v1.2.3


From 6731ca3999ffa4c878a661b980759300dfb0237e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 14 Apr 2021 22:54:53 +0300
Subject: iio: st_sensors: Add lsm9ds0 IMU support

We can utilize separate drivers for accelerometer and magnetometer,
so here is the glue driver to enable LSM9DS0 IMU support.

The idea was suggested by Crestez Dan Leonard in [1]. The proposed change
was sent as RFC due to race condition concerns, which are indeed possible.

In order to amend the initial change, I went further by providing a specific
multi-instantiate probe driver that reuses existing accelerometer and
magnetometer.

[1]: https://lore.kernel.org/patchwork/patch/670353/

Suggested-by: Crestez Dan Leonard <leonard.crestez@intel.com>
Cc: mr.lahorde@laposte.net
Cc: Matija Podravec <matija_podravec@fastmail.fm>
Cc: Sergey Borishchenko <borischenko.sergey@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210414195454.84183-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/st_accel_core.c            |  89 ++++++++++++++-
 drivers/iio/imu/Kconfig                      |   1 +
 drivers/iio/imu/Makefile                     |   1 +
 drivers/iio/imu/st_lsm9ds0/Kconfig           |  28 +++++
 drivers/iio/imu/st_lsm9ds0/Makefile          |   5 +
 drivers/iio/imu/st_lsm9ds0/st_lsm9ds0.h      |  23 ++++
 drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_core.c | 163 +++++++++++++++++++++++++++
 drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_i2c.c  |  84 ++++++++++++++
 drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_spi.c  |  83 ++++++++++++++
 drivers/iio/magnetometer/st_magn_core.c      |  98 ++++++++++++++++
 include/linux/iio/common/st_sensors.h        |   2 +
 11 files changed, 576 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iio/imu/st_lsm9ds0/Kconfig
 create mode 100644 drivers/iio/imu/st_lsm9ds0/Makefile
 create mode 100644 drivers/iio/imu/st_lsm9ds0/st_lsm9ds0.h
 create mode 100644 drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_core.c
 create mode 100644 drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_i2c.c
 create mode 100644 drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_spi.c

(limited to 'include/linux')

diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c
index 5c258c1ca62d..dc32ebefe3fc 100644
--- a/drivers/iio/accel/st_accel_core.c
+++ b/drivers/iio/accel/st_accel_core.c
@@ -980,7 +980,94 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = {
 		.multi_read_bit = true,
 		.bootime = 2,
 	},
-
+	{
+		.wai = 0x49,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
+		.sensors_supported = {
+			[0] = LSM9DS0_IMU_DEV_NAME,
+		},
+		.ch = (struct iio_chan_spec *)st_accel_16bit_channels,
+		.odr = {
+			.addr = 0x20,
+			.mask = GENMASK(7, 4),
+			.odr_avl = {
+				{ 3, 0x01, },
+				{ 6, 0x02, },
+				{ 12, 0x03, },
+				{ 25, 0x04, },
+				{ 50, 0x05, },
+				{ 100, 0x06, },
+				{ 200, 0x07, },
+				{ 400, 0x08, },
+				{ 800, 0x09, },
+				{ 1600, 0x0a, },
+			},
+		},
+		.pw = {
+			.addr = 0x20,
+			.mask = GENMASK(7, 4),
+			.value_off = ST_SENSORS_DEFAULT_POWER_OFF_VALUE,
+		},
+		.enable_axis = {
+			.addr = ST_SENSORS_DEFAULT_AXIS_ADDR,
+			.mask = ST_SENSORS_DEFAULT_AXIS_MASK,
+		},
+		.fs = {
+			.addr = 0x21,
+			.mask = GENMASK(5, 3),
+			.fs_avl = {
+				[0] = {
+					.num = ST_ACCEL_FS_AVL_2G,
+					.value = 0x00,
+					.gain = IIO_G_TO_M_S_2(61),
+				},
+				[1] = {
+					.num = ST_ACCEL_FS_AVL_4G,
+					.value = 0x01,
+					.gain = IIO_G_TO_M_S_2(122),
+				},
+				[2] = {
+					.num = ST_ACCEL_FS_AVL_6G,
+					.value = 0x02,
+					.gain = IIO_G_TO_M_S_2(183),
+				},
+				[3] = {
+					.num = ST_ACCEL_FS_AVL_8G,
+					.value = 0x03,
+					.gain = IIO_G_TO_M_S_2(244),
+				},
+				[4] = {
+					.num = ST_ACCEL_FS_AVL_16G,
+					.value = 0x04,
+					.gain = IIO_G_TO_M_S_2(732),
+				},
+			},
+		},
+		.bdu = {
+			.addr = 0x20,
+			.mask = BIT(3),
+		},
+		.drdy_irq = {
+			.int1 = {
+				.addr = 0x22,
+				.mask = BIT(2),
+			},
+			.int2 = {
+				.addr = 0x23,
+				.mask = BIT(3),
+			},
+			.stat_drdy = {
+				.addr = ST_SENSORS_DEFAULT_STAT_ADDR,
+				.mask = GENMASK(2, 0),
+			},
+		},
+		.sim = {
+			.addr = 0x21,
+			.value = BIT(0),
+		},
+		.multi_read_bit = true,
+		.bootime = 2,
+	},
 };
 
 /* Default accel DRDY is available on INT1 pin */
diff --git a/drivers/iio/imu/Kconfig b/drivers/iio/imu/Kconfig
index f02883b08480..001ca2c3ff95 100644
--- a/drivers/iio/imu/Kconfig
+++ b/drivers/iio/imu/Kconfig
@@ -94,6 +94,7 @@ config KMX61
 source "drivers/iio/imu/inv_icm42600/Kconfig"
 source "drivers/iio/imu/inv_mpu6050/Kconfig"
 source "drivers/iio/imu/st_lsm6dsx/Kconfig"
+source "drivers/iio/imu/st_lsm9ds0/Kconfig"
 
 endmenu
 
diff --git a/drivers/iio/imu/Makefile b/drivers/iio/imu/Makefile
index 13e9ff442b11..c82748096c77 100644
--- a/drivers/iio/imu/Makefile
+++ b/drivers/iio/imu/Makefile
@@ -26,3 +26,4 @@ obj-y += inv_mpu6050/
 obj-$(CONFIG_KMX61) += kmx61.o
 
 obj-y += st_lsm6dsx/
+obj-y += st_lsm9ds0/
diff --git a/drivers/iio/imu/st_lsm9ds0/Kconfig b/drivers/iio/imu/st_lsm9ds0/Kconfig
new file mode 100644
index 000000000000..53b7017014f8
--- /dev/null
+++ b/drivers/iio/imu/st_lsm9ds0/Kconfig
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config IIO_ST_LSM9DS0
+	tristate "STMicroelectronics LSM9DS0 IMU driver"
+	depends on (I2C || SPI_MASTER) && SYSFS
+	depends on !SENSORS_LIS3_I2C
+	depends on !SENSORS_LIS3_SPI
+	select IIO_ST_LSM9DS0_I2C if I2C
+	select IIO_ST_LSM9DS0_SPI if SPI_MASTER
+	select IIO_ST_ACCEL_3AXIS
+	select IIO_ST_MAGN_3AXIS
+
+	help
+	  Say yes here to build support for STMicroelectronics LSM9DS0 IMU
+	  sensor. Supported devices: accelerometer/magnetometer of lsm9ds0.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called st_lsm9ds0.
+
+config IIO_ST_LSM9DS0_I2C
+	tristate
+	depends on IIO_ST_LSM9DS0
+	select REGMAP_I2C
+
+config IIO_ST_LSM9DS0_SPI
+	tristate
+	depends on IIO_ST_LSM9DS0
+	select REGMAP_SPI
diff --git a/drivers/iio/imu/st_lsm9ds0/Makefile b/drivers/iio/imu/st_lsm9ds0/Makefile
new file mode 100644
index 000000000000..488af523f648
--- /dev/null
+++ b/drivers/iio/imu/st_lsm9ds0/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_IIO_ST_LSM9DS0) += st_lsm9ds0.o
+st_lsm9ds0-y := st_lsm9ds0_core.o
+obj-$(CONFIG_IIO_ST_LSM9DS0_I2C) += st_lsm9ds0_i2c.o
+obj-$(CONFIG_IIO_ST_LSM9DS0_SPI) += st_lsm9ds0_spi.o
diff --git a/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0.h b/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0.h
new file mode 100644
index 000000000000..146393afd9a7
--- /dev/null
+++ b/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+// STMicroelectronics LSM9DS0 IMU driver
+
+#ifndef ST_LSM9DS0_H
+#define ST_LSM9DS0_H
+
+struct iio_dev;
+struct regulator;
+
+struct st_lsm9ds0 {
+	struct device *dev;
+	const char *name;
+	int irq;
+	struct iio_dev *accel;
+	struct iio_dev *magn;
+	struct regulator *vdd;
+	struct regulator *vdd_io;
+};
+
+int st_lsm9ds0_probe(struct st_lsm9ds0 *lsm9ds0, struct regmap *regmap);
+int st_lsm9ds0_remove(struct st_lsm9ds0 *lsm9ds0);
+
+#endif /* ST_LSM9DS0_H */
diff --git a/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_core.c b/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_core.c
new file mode 100644
index 000000000000..8204f7303fd7
--- /dev/null
+++ b/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_core.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * STMicroelectronics LSM9DS0 IMU driver
+ *
+ * Copyright (C) 2021, Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/regulator/consumer.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include <linux/iio/iio.h>
+
+#include "st_lsm9ds0.h"
+
+static int st_lsm9ds0_power_enable(struct device *dev, struct st_lsm9ds0 *lsm9ds0)
+{
+	int ret;
+
+	/* Regulators not mandatory, but if requested we should enable them. */
+	lsm9ds0->vdd = devm_regulator_get(dev, "vdd");
+	if (IS_ERR(lsm9ds0->vdd)) {
+		dev_err(dev, "unable to get Vdd supply\n");
+		return PTR_ERR(lsm9ds0->vdd);
+	}
+	ret = regulator_enable(lsm9ds0->vdd);
+	if (ret) {
+		dev_warn(dev, "Failed to enable specified Vdd supply\n");
+		return ret;
+	}
+
+	lsm9ds0->vdd_io = devm_regulator_get(dev, "vddio");
+	if (IS_ERR(lsm9ds0->vdd_io)) {
+		dev_err(dev, "unable to get Vdd_IO supply\n");
+		regulator_disable(lsm9ds0->vdd);
+		return PTR_ERR(lsm9ds0->vdd_io);
+	}
+	ret = regulator_enable(lsm9ds0->vdd_io);
+	if (ret) {
+		dev_warn(dev, "Failed to enable specified Vdd_IO supply\n");
+		regulator_disable(lsm9ds0->vdd);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void st_lsm9ds0_power_disable(void *data)
+{
+	struct st_lsm9ds0 *lsm9ds0 = data;
+
+	regulator_disable(lsm9ds0->vdd_io);
+	regulator_disable(lsm9ds0->vdd);
+}
+
+static int devm_st_lsm9ds0_power_enable(struct st_lsm9ds0 *lsm9ds0)
+{
+	struct device *dev = lsm9ds0->dev;
+	int ret;
+
+	ret = st_lsm9ds0_power_enable(dev, lsm9ds0);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(dev, st_lsm9ds0_power_disable, lsm9ds0);
+}
+
+static int st_lsm9ds0_probe_accel(struct st_lsm9ds0 *lsm9ds0, struct regmap *regmap)
+{
+	const struct st_sensor_settings *settings;
+	struct device *dev = lsm9ds0->dev;
+	struct st_sensor_data *data;
+
+	settings = st_accel_get_settings(lsm9ds0->name);
+	if (!settings) {
+		dev_err(dev, "device name %s not recognized.\n", lsm9ds0->name);
+		return -ENODEV;
+	}
+
+	lsm9ds0->accel = devm_iio_device_alloc(dev, sizeof(*data));
+	if (!lsm9ds0->accel)
+		return -ENOMEM;
+
+	lsm9ds0->accel->name = lsm9ds0->name;
+
+	data = iio_priv(lsm9ds0->accel);
+	data->sensor_settings = (struct st_sensor_settings *)settings;
+	data->dev = dev;
+	data->irq = lsm9ds0->irq;
+	data->regmap = regmap;
+	data->vdd = lsm9ds0->vdd;
+	data->vdd_io = lsm9ds0->vdd_io;
+
+	return st_accel_common_probe(lsm9ds0->accel);
+}
+
+static int st_lsm9ds0_probe_magn(struct st_lsm9ds0 *lsm9ds0, struct regmap *regmap)
+{
+	const struct st_sensor_settings *settings;
+	struct device *dev = lsm9ds0->dev;
+	struct st_sensor_data *data;
+
+	settings = st_magn_get_settings(lsm9ds0->name);
+	if (!settings) {
+		dev_err(dev, "device name %s not recognized.\n", lsm9ds0->name);
+		return -ENODEV;
+	}
+
+	lsm9ds0->magn = devm_iio_device_alloc(dev, sizeof(*data));
+	if (!lsm9ds0->magn)
+		return -ENOMEM;
+
+	lsm9ds0->magn->name = lsm9ds0->name;
+
+	data = iio_priv(lsm9ds0->magn);
+	data->sensor_settings = (struct st_sensor_settings *)settings;
+	data->dev = dev;
+	data->irq = lsm9ds0->irq;
+	data->regmap = regmap;
+	data->vdd = lsm9ds0->vdd;
+	data->vdd_io = lsm9ds0->vdd_io;
+
+	return st_magn_common_probe(lsm9ds0->magn);
+}
+
+int st_lsm9ds0_probe(struct st_lsm9ds0 *lsm9ds0, struct regmap *regmap)
+{
+	int ret;
+
+	ret = devm_st_lsm9ds0_power_enable(lsm9ds0);
+	if (ret)
+		return ret;
+
+	/* Setup accelerometer device */
+	ret = st_lsm9ds0_probe_accel(lsm9ds0, regmap);
+	if (ret)
+		return ret;
+
+	/* Setup magnetometer device */
+	ret = st_lsm9ds0_probe_magn(lsm9ds0, regmap);
+	if (ret)
+		st_accel_common_remove(lsm9ds0->accel);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(st_lsm9ds0_probe);
+
+int st_lsm9ds0_remove(struct st_lsm9ds0 *lsm9ds0)
+{
+	st_magn_common_remove(lsm9ds0->magn);
+	st_accel_common_remove(lsm9ds0->accel);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(st_lsm9ds0_remove);
+
+MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>");
+MODULE_DESCRIPTION("STMicroelectronics LSM9DS0 IMU core driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_i2c.c b/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_i2c.c
new file mode 100644
index 000000000000..50a36ab53bc3
--- /dev/null
+++ b/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_i2c.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * STMicroelectronics LSM9DS0 IMU driver
+ *
+ * Copyright (C) 2021, Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ */
+
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/iio/common/st_sensors_i2c.h>
+
+#include "st_lsm9ds0.h"
+
+static const struct of_device_id st_lsm9ds0_of_match[] = {
+	{
+		.compatible = "st,lsm9ds0-imu",
+		.data = LSM9DS0_IMU_DEV_NAME,
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(of, st_lsm9ds0_of_match);
+
+static const struct i2c_device_id st_lsm9ds0_id_table[] = {
+	{ LSM9DS0_IMU_DEV_NAME },
+	{}
+};
+MODULE_DEVICE_TABLE(i2c, st_lsm9ds0_id_table);
+
+static const struct regmap_config st_lsm9ds0_regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.read_flag_mask	= 0x80,
+};
+
+static int st_lsm9ds0_i2c_probe(struct i2c_client *client)
+{
+	const struct regmap_config *config = &st_lsm9ds0_regmap_config;
+	struct device *dev = &client->dev;
+	struct st_lsm9ds0 *lsm9ds0;
+	struct regmap *regmap;
+
+	st_sensors_dev_name_probe(dev, client->name, sizeof(client->name));
+
+	lsm9ds0 = devm_kzalloc(dev, sizeof(*lsm9ds0), GFP_KERNEL);
+	if (!lsm9ds0)
+		return -ENOMEM;
+
+	lsm9ds0->dev = dev;
+	lsm9ds0->name = client->name;
+	lsm9ds0->irq = client->irq;
+
+	regmap = devm_regmap_init_i2c(client, config);
+	if (IS_ERR(regmap))
+		return PTR_ERR(regmap);
+
+	i2c_set_clientdata(client, lsm9ds0);
+
+	return st_lsm9ds0_probe(lsm9ds0, regmap);
+}
+
+static int st_lsm9ds0_i2c_remove(struct i2c_client *client)
+{
+	return st_lsm9ds0_remove(i2c_get_clientdata(client));
+}
+
+static struct i2c_driver st_lsm9ds0_driver = {
+	.driver = {
+		.name = "st-lsm9ds0-i2c",
+		.of_match_table = st_lsm9ds0_of_match,
+	},
+	.probe_new = st_lsm9ds0_i2c_probe,
+	.remove = st_lsm9ds0_i2c_remove,
+	.id_table = st_lsm9ds0_id_table,
+};
+module_i2c_driver(st_lsm9ds0_driver);
+
+MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>");
+MODULE_DESCRIPTION("STMicroelectronics LSM9DS0 IMU I2C driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_spi.c b/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_spi.c
new file mode 100644
index 000000000000..272c88990dd0
--- /dev/null
+++ b/drivers/iio/imu/st_lsm9ds0/st_lsm9ds0_spi.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * STMicroelectronics LSM9DS0 IMU driver
+ *
+ * Copyright (C) 2021, Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+
+#include <linux/iio/common/st_sensors_spi.h>
+
+#include "st_lsm9ds0.h"
+
+static const struct of_device_id st_lsm9ds0_of_match[] = {
+	{
+		.compatible = "st,lsm9ds0-imu",
+		.data = LSM9DS0_IMU_DEV_NAME,
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(of, st_lsm9ds0_of_match);
+
+static const struct spi_device_id st_lsm9ds0_id_table[] = {
+	{ LSM9DS0_IMU_DEV_NAME },
+	{}
+};
+MODULE_DEVICE_TABLE(spi, st_lsm9ds0_id_table);
+
+static const struct regmap_config st_lsm9ds0_regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.read_flag_mask	= 0xc0,
+};
+
+static int st_lsm9ds0_spi_probe(struct spi_device *spi)
+{
+	struct device *dev = &spi->dev;
+	struct st_lsm9ds0 *lsm9ds0;
+	struct regmap *regmap;
+
+	st_sensors_dev_name_probe(dev, spi->modalias, sizeof(spi->modalias));
+
+	lsm9ds0 = devm_kzalloc(dev, sizeof(*lsm9ds0), GFP_KERNEL);
+	if (!lsm9ds0)
+		return -ENOMEM;
+
+	lsm9ds0->dev = dev;
+	lsm9ds0->name = spi->modalias;
+	lsm9ds0->irq = spi->irq;
+
+	regmap = devm_regmap_init_spi(spi, &st_lsm9ds0_regmap_config);
+	if (IS_ERR(regmap))
+		return PTR_ERR(regmap);
+
+	spi_set_drvdata(spi, lsm9ds0);
+
+	return st_lsm9ds0_probe(lsm9ds0, regmap);
+}
+
+static int st_lsm9ds0_spi_remove(struct spi_device *spi)
+{
+	return st_lsm9ds0_remove(spi_get_drvdata(spi));
+}
+
+static struct spi_driver st_lsm9ds0_driver = {
+	.driver = {
+		.name = "st-lsm9ds0-spi",
+		.of_match_table = st_lsm9ds0_of_match,
+	},
+	.probe = st_lsm9ds0_spi_probe,
+	.remove = st_lsm9ds0_spi_remove,
+	.id_table = st_lsm9ds0_id_table,
+};
+module_spi_driver(st_lsm9ds0_driver);
+
+MODULE_AUTHOR("Andy Shevchenko <andriy.shevchenko@linux.intel.com>");
+MODULE_DESCRIPTION("STMicroelectronics LSM9DS0 IMU SPI driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c
index 58da48434a25..1596faa74da9 100644
--- a/drivers/iio/magnetometer/st_magn_core.c
+++ b/drivers/iio/magnetometer/st_magn_core.c
@@ -33,6 +33,7 @@
 /* FULLSCALE */
 #define ST_MAGN_FS_AVL_1300MG			1300
 #define ST_MAGN_FS_AVL_1900MG			1900
+#define ST_MAGN_FS_AVL_2000MG			2000
 #define ST_MAGN_FS_AVL_2500MG			2500
 #define ST_MAGN_FS_AVL_4000MG			4000
 #define ST_MAGN_FS_AVL_4700MG			4700
@@ -53,6 +54,11 @@
 #define ST_MAGN_3_OUT_Y_L_ADDR			0x6a
 #define ST_MAGN_3_OUT_Z_L_ADDR			0x6c
 
+/* Special L addresses for sensor 4 */
+#define ST_MAGN_4_OUT_X_L_ADDR			0x08
+#define ST_MAGN_4_OUT_Y_L_ADDR			0x0a
+#define ST_MAGN_4_OUT_Z_L_ADDR			0x0c
+
 static const struct iio_chan_spec st_magn_16bit_channels[] = {
 	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
@@ -101,6 +107,22 @@ static const struct iio_chan_spec st_magn_3_16bit_channels[] = {
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
+static const struct iio_chan_spec st_magn_4_16bit_channels[] = {
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+			ST_MAGN_4_OUT_X_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+			ST_MAGN_4_OUT_Y_L_ADDR),
+	ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+			ST_MAGN_4_OUT_Z_L_ADDR),
+	IIO_CHAN_SOFT_TIMESTAMP(3)
+};
+
 static const struct st_sensor_settings st_magn_sensors_settings[] = {
 	{
 		.wai = 0, /* This sensor has no valid WhoAmI report 0 */
@@ -381,6 +403,82 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = {
 		.multi_read_bit = false,
 		.bootime = 2,
 	},
+	{
+		.wai = 0x49,
+		.wai_addr = ST_SENSORS_DEFAULT_WAI_ADDRESS,
+		.sensors_supported = {
+			[0] = LSM9DS0_IMU_DEV_NAME,
+		},
+		.ch = (struct iio_chan_spec *)st_magn_4_16bit_channels,
+		.odr = {
+			.addr = 0x24,
+			.mask = GENMASK(4, 2),
+			.odr_avl = {
+				{ 3, 0x00, },
+				{ 6, 0x01, },
+				{ 12, 0x02, },
+				{ 25, 0x03, },
+				{ 50, 0x04, },
+				{ 100, 0x05, },
+			},
+		},
+		.pw = {
+			.addr = 0x26,
+			.mask = GENMASK(1, 0),
+			.value_on = 0x00,
+			.value_off = 0x03,
+		},
+		.fs = {
+			.addr = 0x25,
+			.mask = GENMASK(6, 5),
+			.fs_avl = {
+				[0] = {
+					.num = ST_MAGN_FS_AVL_2000MG,
+					.value = 0x00,
+					.gain = 73,
+				},
+				[1] = {
+					.num = ST_MAGN_FS_AVL_4000MG,
+					.value = 0x01,
+					.gain = 146,
+				},
+				[2] = {
+					.num = ST_MAGN_FS_AVL_8000MG,
+					.value = 0x02,
+					.gain = 292,
+				},
+				[3] = {
+					.num = ST_MAGN_FS_AVL_12000MG,
+					.value = 0x03,
+					.gain = 438,
+				},
+			},
+		},
+		.bdu = {
+			.addr = 0x20,
+			.mask = BIT(3),
+		},
+		.drdy_irq = {
+			.int1 = {
+				.addr = 0x22,
+				.mask = BIT(1),
+			},
+			.int2 = {
+				.addr = 0x23,
+				.mask = BIT(2),
+			},
+			.stat_drdy = {
+				.addr = 0x07,
+				.mask = GENMASK(2, 0),
+			},
+		},
+		.sim = {
+			.addr = 0x21,
+			.value = BIT(0),
+		},
+		.multi_read_bit = true,
+		.bootime = 2,
+	},
 };
 
 /* Default magn DRDY is available on INT2 pin */
diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index aa017b90fb06..0b9aeb479f48 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -20,6 +20,8 @@
 
 #include <linux/platform_data/st_sensors_pdata.h>
 
+#define LSM9DS0_IMU_DEV_NAME		"lsm9ds0"
+
 /*
  * Buffer size max case: 2bytes per channel, 3 channels in total +
  *			 8bytes timestamp channel (s64)
-- 
cgit v1.2.3


From 8dea228b174ac9637b567e5ef54f4c40db4b3c41 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sat, 1 May 2021 18:13:47 +0100
Subject: iio: cros_ec_sensors: Fix alignment of buffer in
 iio_push_to_buffers_with_timestamp()

The samples buffer is passed to iio_push_to_buffers_with_timestamp()
which requires a buffer aligned to 8 bytes as it is assumed that
the timestamp will be naturally aligned if present.

Fixes tag is inaccurate but prior to that likely manual backporting needed
(for anything before 4.18) Earlier than that the include file to fix is
drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.h:
commit 974e6f02e27 ("iio: cros_ec_sensors_core: Add common functions
for the ChromeOS EC Sensor Hub.") present since kernel stable 4.10.
(Thanks to Gwendal for tracking this down)

Fixes: 5a0b8cb46624c ("iio: cros_ec: Move cros_ec_sensors_core.h in /include")
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Gwendal Grignou <gwendal@chromium.org
Link: https://lore.kernel.org/r/20210501171352.512953-7-jic23@kernel.org
---
 include/linux/iio/common/cros_ec_sensors_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h
index 7ce8a8adad58..c582e1a14232 100644
--- a/include/linux/iio/common/cros_ec_sensors_core.h
+++ b/include/linux/iio/common/cros_ec_sensors_core.h
@@ -77,7 +77,7 @@ struct cros_ec_sensors_core_state {
 		u16 scale;
 	} calib[CROS_EC_SENSOR_MAX_AXIS];
 	s8 sign[CROS_EC_SENSOR_MAX_AXIS];
-	u8 samples[CROS_EC_SAMPLE_SIZE];
+	u8 samples[CROS_EC_SAMPLE_SIZE] __aligned(8);
 
 	int (*read_ec_sensors_data)(struct iio_dev *indio_dev,
 				    unsigned long scan_mask, s16 *data);
-- 
cgit v1.2.3


From dbc557fa5ff866f46c7e29c790f3a9b64e49ef3f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 9 Apr 2021 18:34:56 +0300
Subject: ata: Replace inclusion of kernel.h by bits.h in the header

ata.h uses BIT() macro, hence bits.h must be included. Otherwise
there is no need to have kernel.h included, I do not see any
direct users of it in ata.h. Hence replace inclusion of kernel.h.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210409153456.87798-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 6e67aded28f8..1b44f40c7700 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -13,7 +13,7 @@
 #ifndef __LINUX_ATA_H__
 #define __LINUX_ATA_H__
 
-#include <linux/kernel.h>
+#include <linux/bits.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <asm/byteorder.h>
-- 
cgit v1.2.3


From c2b1063e8feb2115537addce10f36c0c82d11d9b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 2 Apr 2021 08:23:25 +0200
Subject: genirq: Add a IRQF_NO_DEBUG flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The whole call to note_interrupt() can be avoided or return early when
interrupts would be marked accordingly. For IPI handlers which always
return HANDLED the whole procedure is pretty pointless to begin with.

Add a IRQF_NO_DEBUG flag and mark the interrupt accordingly if supplied
when the interrupt is requested.

When noirqdebug is set on the kernel commandline, then the interrupt is
marked unconditionally so that there is only one condition in the hotpath
to evaluate.

 [ clg: Add changelog ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/7a8ad02f-63a8-c1aa-fdd1-39d973593d02@kaod.org
---
 include/linux/interrupt.h |  3 +++
 include/linux/irq.h       |  2 ++
 kernel/irq/chip.c         |  2 +-
 kernel/irq/handle.c       |  2 +-
 kernel/irq/manage.c       |  5 +++++
 kernel/irq/settings.h     | 12 ++++++++++++
 6 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 4777850a6dc7..a52109c3f3a4 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -64,6 +64,8 @@
  * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it.
  *                Users will enable it explicitly by enable_irq() or enable_nmi()
  *                later.
+ * IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers,
+ *		   depends on IRQF_PERCPU.
  */
 #define IRQF_SHARED		0x00000080
 #define IRQF_PROBE_SHARED	0x00000100
@@ -78,6 +80,7 @@
 #define IRQF_EARLY_RESUME	0x00020000
 #define IRQF_COND_SUSPEND	0x00040000
 #define IRQF_NO_AUTOEN		0x00080000
+#define IRQF_NO_DEBUG		0x00100000
 
 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 31b347c9f8dd..8e9a9ae471a6 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -72,6 +72,7 @@ enum irqchip_irq_state;
  *				  mechanism and from core side polling.
  * IRQ_DISABLE_UNLAZY		- Disable lazy irq disable
  * IRQ_HIDDEN			- Don't show up in /proc/interrupts
+ * IRQ_NO_DEBUG			- Exclude from note_interrupt() debugging
  */
 enum {
 	IRQ_TYPE_NONE		= 0x00000000,
@@ -99,6 +100,7 @@ enum {
 	IRQ_IS_POLLED		= (1 << 18),
 	IRQ_DISABLE_UNLAZY	= (1 << 19),
 	IRQ_HIDDEN		= (1 << 20),
+	IRQ_NO_DEBUG		= (1 << 21),
 };
 
 #define IRQF_MODIFY_MASK	\
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 8cc8e5713287..7f04c7d8296e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -481,7 +481,7 @@ void handle_nested_irq(unsigned int irq)
 	for_each_action_of_desc(desc, action)
 		action_ret |= action->thread_fn(action->irq, action->dev_id);
 
-	if (!noirqdebug)
+	if (!irq_settings_no_debug(desc))
 		note_interrupt(desc, action_ret);
 
 	raw_spin_lock_irq(&desc->lock);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 762a928e18f9..221d80c31e94 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -197,7 +197,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
 
 	add_interrupt_randomness(desc->irq_data.irq, flags);
 
-	if (!noirqdebug)
+	if (!irq_settings_no_debug(desc))
 		note_interrupt(desc, retval);
 	return retval;
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c14356543d9..7bdd09e7d5f0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1686,8 +1686,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		if (new->flags & IRQF_PERCPU) {
 			irqd_set(&desc->irq_data, IRQD_PER_CPU);
 			irq_settings_set_per_cpu(desc);
+			if (new->flags & IRQF_NO_DEBUG)
+				irq_settings_set_no_debug(desc);
 		}
 
+		if (noirqdebug)
+			irq_settings_set_no_debug(desc);
+
 		if (new->flags & IRQF_ONESHOT)
 			desc->istate |= IRQS_ONESHOT;
 
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 403378b9947b..7b7efb1a114b 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -18,6 +18,7 @@ enum {
 	_IRQ_IS_POLLED		= IRQ_IS_POLLED,
 	_IRQ_DISABLE_UNLAZY	= IRQ_DISABLE_UNLAZY,
 	_IRQ_HIDDEN		= IRQ_HIDDEN,
+	_IRQ_NO_DEBUG		= IRQ_NO_DEBUG,
 	_IRQF_MODIFY_MASK	= IRQF_MODIFY_MASK,
 };
 
@@ -33,6 +34,7 @@ enum {
 #define IRQ_IS_POLLED		GOT_YOU_MORON
 #define IRQ_DISABLE_UNLAZY	GOT_YOU_MORON
 #define IRQ_HIDDEN		GOT_YOU_MORON
+#define IRQ_NO_DEBUG		GOT_YOU_MORON
 #undef IRQF_MODIFY_MASK
 #define IRQF_MODIFY_MASK	GOT_YOU_MORON
 
@@ -174,3 +176,13 @@ static inline bool irq_settings_is_hidden(struct irq_desc *desc)
 {
 	return desc->status_use_accessors & _IRQ_HIDDEN;
 }
+
+static inline void irq_settings_set_no_debug(struct irq_desc *desc)
+{
+	desc->status_use_accessors |= _IRQ_NO_DEBUG;
+}
+
+static inline bool irq_settings_no_debug(struct irq_desc *desc)
+{
+	return desc->status_use_accessors & _IRQ_NO_DEBUG;
+}
-- 
cgit v1.2.3


From 7617af3d1a5e0938eb1fd2742f19bcea772c7f8d Mon Sep 17 00:00:00 2001
From: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Date: Mon, 17 May 2021 17:43:31 +0800
Subject: net: pcs: Introducing support for DWC xpcs Energy Efficient Ethernet

Add DWC xpcs EEE support callbacks.The callback function is used to
set EEE registers on xpcs.

xpcs transparent mode is enabled to allow PHY to detect MAC EEE status.

Signed-off-by: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/pcs-xpcs.c   | 51 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pcs/pcs-xpcs.h |  2 ++
 2 files changed, 53 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 944ba105cac1..aa985a5aae8d 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -63,6 +63,9 @@
 #define DW_VR_MII_DIG_CTRL1		0x8000
 #define DW_VR_MII_AN_CTRL		0x8001
 #define DW_VR_MII_AN_INTR_STS		0x8002
+/* EEE Mode Control Register */
+#define DW_VR_MII_EEE_MCTRL0		0x8006
+#define DW_VR_MII_EEE_MCTRL1		0x800b
 
 /* VR_MII_DIG_CTRL1 */
 #define DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW		BIT(9)
@@ -86,6 +89,20 @@
 #define DW_VR_MII_C37_ANSGM_SP_1000		0x2
 #define DW_VR_MII_C37_ANSGM_SP_LNKSTS		BIT(4)
 
+/* VR MII EEE Control 0 defines */
+#define DW_VR_MII_EEE_LTX_EN		BIT(0)  /* LPI Tx Enable */
+#define DW_VR_MII_EEE_LRX_EN		BIT(1)  /* LPI Rx Enable */
+#define DW_VR_MII_EEE_TX_QUIET_EN		BIT(2)  /* Tx Quiet Enable */
+#define DW_VR_MII_EEE_RX_QUIET_EN		BIT(3)  /* Rx Quiet Enable */
+#define DW_VR_MII_EEE_TX_EN_CTRL		BIT(4)  /* Tx Control Enable */
+#define DW_VR_MII_EEE_RX_EN_CTRL		BIT(7)  /* Rx Control Enable */
+
+#define DW_VR_MII_EEE_MULT_FACT_100NS_SHIFT	8
+#define DW_VR_MII_EEE_MULT_FACT_100NS		GENMASK(11, 8)
+
+/* VR MII EEE Control 1 defines */
+#define DW_VR_MII_EEE_TRN_LPI		BIT(0)	/* Transparent Mode Enable */
+
 static const int xpcs_usxgmii_features[] = {
 	ETHTOOL_LINK_MODE_Pause_BIT,
 	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
@@ -650,6 +667,39 @@ static int xpcs_validate(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
+static int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
+			   int enable)
+{
+	int ret;
+
+	if (enable) {
+	/* Enable EEE */
+		ret = DW_VR_MII_EEE_LTX_EN | DW_VR_MII_EEE_LRX_EN |
+		      DW_VR_MII_EEE_TX_QUIET_EN | DW_VR_MII_EEE_RX_QUIET_EN |
+		      DW_VR_MII_EEE_TX_EN_CTRL | DW_VR_MII_EEE_RX_EN_CTRL |
+		      mult_fact_100ns << DW_VR_MII_EEE_MULT_FACT_100NS_SHIFT;
+	} else {
+		ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL0);
+		if (ret < 0)
+			return ret;
+		ret &= ~(DW_VR_MII_EEE_LTX_EN | DW_VR_MII_EEE_LRX_EN |
+		       DW_VR_MII_EEE_TX_QUIET_EN | DW_VR_MII_EEE_RX_QUIET_EN |
+		       DW_VR_MII_EEE_TX_EN_CTRL | DW_VR_MII_EEE_RX_EN_CTRL |
+		       DW_VR_MII_EEE_MULT_FACT_100NS);
+	}
+
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL0, ret);
+	if (ret < 0)
+		return ret;
+
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL1);
+	if (ret < 0)
+		return ret;
+
+	ret |= DW_VR_MII_EEE_TRN_LPI;
+	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL1, ret);
+}
+
 static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs)
 {
 	int ret;
@@ -908,6 +958,7 @@ static struct mdio_xpcs_ops xpcs_ops = {
 	.get_state = xpcs_get_state,
 	.link_up = xpcs_link_up,
 	.probe = xpcs_probe,
+	.config_eee = xpcs_config_eee,
 };
 
 struct mdio_xpcs_ops *mdio_xpcs_get_ops(void)
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 2cb5188a7ef1..5938ced805f4 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -32,6 +32,8 @@ struct mdio_xpcs_ops {
 	int (*link_up)(struct mdio_xpcs_args *xpcs, int speed,
 		       phy_interface_t interface);
 	int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
+	int (*config_eee)(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
+			  int enable);
 };
 
 #if IS_ENABLED(CONFIG_PCS_XPCS)
-- 
cgit v1.2.3


From e80fe71b3ffe1ec31c4a9be60170f897bbdf1b92 Mon Sep 17 00:00:00 2001
From: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Date: Mon, 17 May 2021 17:43:32 +0800
Subject: net: stmmac: Add callbacks for DWC xpcs Energy Efficient Ethernet

Link xpcs callback functions for MAC to configure the xpcs EEE feature.

The clk_eee frequency is used to calculate the MULT_FACT_100NS. This is
to adjust the clock tic closer to 100ns.

Signed-off-by: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c    | 11 +++++++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h           |  2 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c |  6 ++++++
 include/linux/stmmac.h                               |  1 +
 4 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 80728a4c0e3f..e36a8cc59ad0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -429,6 +429,17 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	plat->force_sf_dma_mode = 0;
 	plat->tso_en = 1;
 
+	/* Multiplying factor to the clk_eee_i clock time
+	 * period to make it closer to 100 ns. This value
+	 * should be programmed such that the clk_eee_time_period *
+	 * (MULT_FACT_100NS + 1) should be within 80 ns to 120 ns
+	 * clk_eee frequency is 19.2Mhz
+	 * clk_eee_time_period is 52ns
+	 * 52ns * (1 + 1) = 104ns
+	 * MULT_FACT_100NS = 1
+	 */
+	plat->mult_fact_100ns = 1;
+
 	plat->rx_sched_algorithm = MTL_RX_ALGORITHM_SP;
 
 	for (i = 0; i < plat->rx_queues_to_use; i++) {
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 6d5e0f2b03ce..75a8b90c202a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -623,6 +623,8 @@ struct stmmac_mmc_ops {
 	stmmac_do_callback(__priv, xpcs, link_up, __args)
 #define stmmac_xpcs_probe(__priv, __args...) \
 	stmmac_do_callback(__priv, xpcs, probe, __args)
+#define stmmac_xpcs_config_eee(__priv, __args...) \
+	stmmac_do_callback(__priv, xpcs, config_eee, __args)
 
 struct stmmac_regs_off {
 	u32 ptp_off;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 61b11639ee0c..1f6d749fd9a3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -720,6 +720,12 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev,
 		netdev_warn(priv->dev,
 			    "Setting EEE tx-lpi is not supported\n");
 
+	ret = stmmac_xpcs_config_eee(priv, &priv->hw->xpcs_args,
+				     priv->plat->mult_fact_100ns,
+				     edata->eee_enabled);
+	if (ret)
+		return ret;
+
 	if (!edata->eee_enabled)
 		stmmac_disable_eee_mode(priv);
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 0db36360ef21..e14a12df381b 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -223,6 +223,7 @@ struct plat_stmmacenet_data {
 	struct clk *clk_ptp_ref;
 	unsigned int clk_ptp_rate;
 	unsigned int clk_ref_rate;
+	unsigned int mult_fact_100ns;
 	s32 ptp_max_adj;
 	struct reset_control *stmmac_rst;
 	struct stmmac_axi *axi;
-- 
cgit v1.2.3


From 4d7b324e231366ea772ab10df46be31273ca39af Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Tue, 18 May 2021 09:47:23 +0300
Subject: bus: ti-sysc: Fix am335x resume hang for usb otg module

On am335x, suspend and resume only works once, and the system hangs if
suspend is attempted again. However, turns out suspend and resume works
fine multiple times if the USB OTG driver for musb controller is loaded.

The issue is caused my the interconnect target module losing context
during suspend, and it needs a restore on resume to be reconfigure again
as debugged earlier by Dave Gerlach <d-gerlach@ti.com>.

There are also other modules that need a restore on resume, like gpmc as
noted by Dave. So let's add a common way to restore an interconnect
target module based on a quirk flag. For now, let's enable the quirk for
am335x otg only to fix the suspend and resume issue.

As gpmc is not causing hangs based on tests with BeagleBone, let's patch
gpmc separately. For gpmc, we also need a hardware reset done before
restore according to Dave.

To reinit the modules, we decouple system suspend from PM runtime. We
replace calls to pm_runtime_force_suspend() and pm_runtime_force_resume()
with direct calls to internal functions and rely on the driver internal
state. There no point trying to handle complex system suspend and resume
quirks via PM runtime.

This is issue should have already been noticed with commit 1819ef2e2d12
("bus: ti-sysc: Use swsup quirks also for am335x musb") when quirk
handling was added for am335x otg for swsup. But the issue went unnoticed
as having musb driver loaded hides the issue, and suspend and resume works
once without the driver loaded.

Fixes: 1819ef2e2d12 ("bus: ti-sysc: Use swsup quirks also for am335x musb")
Suggested-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 drivers/bus/ti-sysc.c                 | 53 +++++++++++++++++++++++++++++++++--
 include/linux/platform_data/ti-sysc.h |  1 +
 2 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index 50a9f34b9e6c..4ff319863be2 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -1334,6 +1334,34 @@ err_allow_idle:
 	return error;
 }
 
+static int sysc_reinit_module(struct sysc *ddata, bool leave_enabled)
+{
+	struct device *dev = ddata->dev;
+	int error;
+
+	/* Disable target module if it is enabled */
+	if (ddata->enabled) {
+		error = sysc_runtime_suspend(dev);
+		if (error)
+			dev_warn(dev, "reinit suspend failed: %i\n", error);
+	}
+
+	/* Enable target module */
+	error = sysc_runtime_resume(dev);
+	if (error)
+		dev_warn(dev, "reinit resume failed: %i\n", error);
+
+	if (leave_enabled)
+		return error;
+
+	/* Disable target module if no leave_enabled was set */
+	error = sysc_runtime_suspend(dev);
+	if (error)
+		dev_warn(dev, "reinit suspend failed: %i\n", error);
+
+	return error;
+}
+
 static int __maybe_unused sysc_noirq_suspend(struct device *dev)
 {
 	struct sysc *ddata;
@@ -1344,12 +1372,18 @@ static int __maybe_unused sysc_noirq_suspend(struct device *dev)
 	    (SYSC_QUIRK_LEGACY_IDLE | SYSC_QUIRK_NO_IDLE))
 		return 0;
 
-	return pm_runtime_force_suspend(dev);
+	if (!ddata->enabled)
+		return 0;
+
+	ddata->needs_resume = 1;
+
+	return sysc_runtime_suspend(dev);
 }
 
 static int __maybe_unused sysc_noirq_resume(struct device *dev)
 {
 	struct sysc *ddata;
+	int error = 0;
 
 	ddata = dev_get_drvdata(dev);
 
@@ -1357,7 +1391,19 @@ static int __maybe_unused sysc_noirq_resume(struct device *dev)
 	    (SYSC_QUIRK_LEGACY_IDLE | SYSC_QUIRK_NO_IDLE))
 		return 0;
 
-	return pm_runtime_force_resume(dev);
+	if (ddata->cfg.quirks & SYSC_QUIRK_REINIT_ON_RESUME) {
+		error = sysc_reinit_module(ddata, ddata->needs_resume);
+		if (error)
+			dev_warn(dev, "noirq_resume failed: %i\n", error);
+	} else if (ddata->needs_resume) {
+		error = sysc_runtime_resume(dev);
+		if (error)
+			dev_warn(dev, "noirq_resume failed: %i\n", error);
+	}
+
+	ddata->needs_resume = 0;
+
+	return error;
 }
 
 static const struct dev_pm_ops sysc_pm_ops = {
@@ -1468,7 +1514,8 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = {
 	SYSC_QUIRK("usb_otg_hs", 0, 0x400, 0x404, 0x408, 0x00000050,
 		   0xffffffff, SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_SWSUP_MSTANDBY),
 	SYSC_QUIRK("usb_otg_hs", 0, 0, 0x10, -ENODEV, 0x4ea2080d, 0xffffffff,
-		   SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_SWSUP_MSTANDBY),
+		   SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_SWSUP_MSTANDBY |
+		   SYSC_QUIRK_REINIT_ON_RESUME),
 	SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0,
 		   SYSC_MODULE_QUIRK_WDT),
 	/* PRUSS on am3, am4 and am5 */
diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index fafc1beea504..9837fb011f2f 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -50,6 +50,7 @@ struct sysc_regbits {
 	s8 emufree_shift;
 };
 
+#define SYSC_QUIRK_REINIT_ON_RESUME	BIT(27)
 #define SYSC_QUIRK_GPMC_DEBUG		BIT(26)
 #define SYSC_MODULE_QUIRK_ENA_RESETDONE	BIT(25)
 #define SYSC_MODULE_QUIRK_PRUSS		BIT(24)
-- 
cgit v1.2.3


From 00b89fe0197f0c55a045775c11553c0cdb7082fe Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Mon, 10 May 2021 16:10:23 +0100
Subject: sched: Make the idle task quack like a per-CPU kthread

For all intents and purposes, the idle task is a per-CPU kthread. It isn't
created via the same route as other pcpu kthreads however, and as a result
it is missing a few bells and whistles: it fails kthread_is_per_cpu() and
it doesn't have PF_NO_SETAFFINITY set.

Fix the former by giving the idle task a kthread struct along with the
KTHREAD_IS_PER_CPU flag. This requires some extra iffery as init_idle()
call be called more than once on the same idle task.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210510151024.2448573-2-valentin.schneider@arm.com
---
 include/linux/kthread.h |  2 ++
 kernel/kthread.c        | 30 ++++++++++++++++++------------
 kernel/sched/core.c     | 21 +++++++++++++++------
 3 files changed, 35 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2484ed97e72f..d9133d6db308 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,6 +33,8 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  unsigned int cpu,
 					  const char *namefmt);
 
+void set_kthread_struct(struct task_struct *p);
+
 void kthread_set_per_cpu(struct task_struct *k, int cpu);
 bool kthread_is_per_cpu(struct task_struct *k);
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fe3f2a40d61e..3d326833092b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -68,16 +68,6 @@ enum KTHREAD_BITS {
 	KTHREAD_SHOULD_PARK,
 };
 
-static inline void set_kthread_struct(void *kthread)
-{
-	/*
-	 * We abuse ->set_child_tid to avoid the new member and because it
-	 * can't be wrongly copied by copy_process(). We also rely on fact
-	 * that the caller can't exec, so PF_KTHREAD can't be cleared.
-	 */
-	current->set_child_tid = (__force void __user *)kthread;
-}
-
 static inline struct kthread *to_kthread(struct task_struct *k)
 {
 	WARN_ON(!(k->flags & PF_KTHREAD));
@@ -103,6 +93,22 @@ static inline struct kthread *__to_kthread(struct task_struct *p)
 	return kthread;
 }
 
+void set_kthread_struct(struct task_struct *p)
+{
+	struct kthread *kthread;
+
+	if (__to_kthread(p))
+		return;
+
+	kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
+	/*
+	 * We abuse ->set_child_tid to avoid the new member and because it
+	 * can't be wrongly copied by copy_process(). We also rely on fact
+	 * that the caller can't exec, so PF_KTHREAD can't be cleared.
+	 */
+	p->set_child_tid = (__force void __user *)kthread;
+}
+
 void free_kthread_struct(struct task_struct *k)
 {
 	struct kthread *kthread;
@@ -272,8 +278,8 @@ static int kthread(void *_create)
 	struct kthread *self;
 	int ret;
 
-	self = kzalloc(sizeof(*self), GFP_KERNEL);
-	set_kthread_struct(self);
+	set_kthread_struct(current);
+	self = to_kthread(current);
 
 	/* If user was SIGKILLed, I release the structure. */
 	done = xchg(&create->done, NULL);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 24fd795e4b8c..6a5124c4d54f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8234,12 +8234,25 @@ void __init init_idle(struct task_struct *idle, int cpu)
 
 	__sched_fork(0, idle);
 
+	/*
+	 * The idle task doesn't need the kthread struct to function, but it
+	 * is dressed up as a per-CPU kthread and thus needs to play the part
+	 * if we want to avoid special-casing it in code that deals with per-CPU
+	 * kthreads.
+	 */
+	set_kthread_struct(idle);
+
 	raw_spin_lock_irqsave(&idle->pi_lock, flags);
 	raw_spin_rq_lock(rq);
 
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
-	idle->flags |= PF_IDLE;
+	/*
+	 * PF_KTHREAD should already be set at this point; regardless, make it
+	 * look like a proper per-CPU kthread.
+	 */
+	idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+	kthread_set_per_cpu(idle, cpu);
 
 	scs_task_reset(idle);
 	kasan_unpoison_task_stack(idle);
@@ -8456,12 +8469,8 @@ static void balance_push(struct rq *rq)
 	/*
 	 * Both the cpu-hotplug and stop task are in this case and are
 	 * required to complete the hotplug process.
-	 *
-	 * XXX: the idle task does not match kthread_is_per_cpu() due to
-	 * histerical raisins.
 	 */
-	if (rq->idle == push_task ||
-	    kthread_is_per_cpu(push_task) ||
+	if (kthread_is_per_cpu(push_task) ||
 	    is_migration_disabled(push_task)) {
 
 		/*
-- 
cgit v1.2.3


From 8083d6b812cac5e38db9c707b41cd478beed4a0c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 17 May 2021 17:03:49 +0300
Subject: spi: pxa2xx: Fix style of and typos in the comments and messages

Fix style of the comments and messages along with typos in them.

While at it, update Intel Copyright year.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210517140351.901-8-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-pxa2xx-dma.c   |  4 +--
 drivers/spi/spi-pxa2xx-pci.c   |  5 +--
 drivers/spi/spi-pxa2xx.c       | 71 ++++++++++++++++++++++--------------------
 drivers/spi/spi-pxa2xx.h       |  2 +-
 include/linux/pxa2xx_ssp.h     |  9 +++---
 include/linux/spi/pxa2xx_spi.h | 12 +++++--
 6 files changed, 58 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c
index f022d82dcb1b..be563f0dd03a 100644
--- a/drivers/spi/spi-pxa2xx-dma.c
+++ b/drivers/spi/spi-pxa2xx-dma.c
@@ -2,7 +2,7 @@
 /*
  * PXA2xx SPI DMA engine support.
  *
- * Copyright (C) 2013, Intel Corporation
+ * Copyright (C) 2013, 2021 Intel Corporation
  * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
  */
 
@@ -26,7 +26,7 @@ static void pxa2xx_spi_dma_transfer_complete(struct driver_data *drv_data,
 	 * It is possible that one CPU is handling ROR interrupt and other
 	 * just gets DMA completion. Calling pump_transfers() twice for the
 	 * same transfer leads to problems thus we prevent concurrent calls
-	 * by using ->dma_running.
+	 * by using dma_running.
 	 */
 	if (atomic_dec_and_test(&drv_data->dma_running)) {
 		/*
diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c
index dce9ade9a4df..9c9992d4f547 100644
--- a/drivers/spi/spi-pxa2xx-pci.c
+++ b/drivers/spi/spi-pxa2xx-pci.c
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * CE4100's SPI device is more or less the same one as found on PXA
+ * PCI glue driver for SPI PXA2xx compatible controllers.
+ * CE4100's SPI device is more or less the same one as found on PXA.
  *
- * Copyright (C) 2016, Intel Corporation
+ * Copyright (C) 2016, 2021 Intel Corporation
  */
 #include <linux/clk-provider.h>
 #include <linux/module.h>
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index f8264771b360..94b1585de203 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (C) 2005 Stephen Street / StreetFire Sound Labs
- * Copyright (C) 2013, Intel Corporation
+ * Copyright (C) 2013, 2021 Intel Corporation
  */
 
 #include <linux/acpi.h>
@@ -40,11 +40,11 @@ MODULE_ALIAS("platform:pxa2xx-spi");
 #define TIMOUT_DFLT		1000
 
 /*
- * for testing SSCR1 changes that require SSP restart, basically
- * everything except the service and interrupt enables, the pxa270 developer
+ * For testing SSCR1 changes that require SSP restart, basically
+ * everything except the service and interrupt enables, the PXA270 developer
  * manual says only SSCR1_SCFR, SSCR1_SPH, SSCR1_SPO need to be in this
- * list, but the PXA255 dev man says all bits without really meaning the
- * service and interrupt enables
+ * list, but the PXA255 developer manual says all bits without really meaning
+ * the service and interrupt enables.
  */
 #define SSCR1_CHANGE_MASK (SSCR1_TTELP | SSCR1_TTE | SSCR1_SCFR \
 				| SSCR1_ECRA | SSCR1_ECRB | SSCR1_SCLKDIR \
@@ -653,12 +653,12 @@ static irqreturn_t interrupt_transfer(struct driver_data *drv_data)
 		irq_status &= ~SSSR_TFS;
 
 	if (irq_status & SSSR_ROR) {
-		int_error_stop(drv_data, "interrupt_transfer: fifo overrun", -EIO);
+		int_error_stop(drv_data, "interrupt_transfer: FIFO overrun", -EIO);
 		return IRQ_HANDLED;
 	}
 
 	if (irq_status & SSSR_TUR) {
-		int_error_stop(drv_data, "interrupt_transfer: fifo underrun", -EIO);
+		int_error_stop(drv_data, "interrupt_transfer: FIFO underrun", -EIO);
 		return IRQ_HANDLED;
 	}
 
@@ -670,7 +670,7 @@ static irqreturn_t interrupt_transfer(struct driver_data *drv_data)
 		}
 	}
 
-	/* Drain rx fifo, Fill tx fifo and prevent overruns */
+	/* Drain Rx FIFO, Fill Tx FIFO and prevent overruns */
 	do {
 		if (drv_data->read(drv_data)) {
 			int_transfer_complete(drv_data);
@@ -691,8 +691,8 @@ static irqreturn_t interrupt_transfer(struct driver_data *drv_data)
 		sccr1_reg &= ~SSCR1_TIE;
 
 		/*
-		 * PXA25x_SSP has no timeout, set up rx threshould for the
-		 * remaining RX bytes.
+		 * PXA25x_SSP has no timeout, set up Rx threshold for
+		 * the remaining Rx bytes.
 		 */
 		if (pxa25x_ssp_comp(drv_data)) {
 			u32 rx_thre;
@@ -914,7 +914,7 @@ static unsigned int ssp_get_clk_div(struct driver_data *drv_data, int rate)
 
 	/*
 	 * Calculate the divisor for the SCR (Serial Clock Rate), avoiding
-	 * that the SSP transmission rate can be greater than the device rate
+	 * that the SSP transmission rate can be greater than the device rate.
 	 */
 	if (ssp->type == PXA25x_SSP || ssp->type == CE4100_SSP)
 		return (DIV_ROUND_UP(ssp_clk, 2 * rate) - 1) & 0xff;
@@ -972,7 +972,7 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 	/* Check if we can DMA this transfer */
 	if (transfer->len > MAX_DMA_LEN && chip->enable_dma) {
 
-		/* reject already-mapped transfers; PIO won't always work */
+		/* Reject already-mapped transfers; PIO won't always work */
 		if (message->is_dma_mapped
 				|| transfer->rx_dma || transfer->tx_dma) {
 			dev_err(&spi->dev,
@@ -981,7 +981,7 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 			return -EINVAL;
 		}
 
-		/* warn ... we force this to PIO mode */
+		/* Warn ... we force this to PIO mode */
 		dev_warn_ratelimited(&spi->dev,
 				     "DMA disabled for transfer length %u greater than %d\n",
 				     transfer->len, MAX_DMA_LEN);
@@ -1026,8 +1026,8 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 					u32_writer : null_writer;
 	}
 	/*
-	 * if bits/word is changed in dma mode, then must check the
-	 * thresholds and burst also
+	 * If bits per word is changed in DMA mode, then must check
+	 * the thresholds and burst also.
 	 */
 	if (chip->enable_dma) {
 		if (pxa2xx_spi_set_dma_burst_and_threshold(chip,
@@ -1101,10 +1101,10 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 	if (!pxa25x_ssp_comp(drv_data))
 		pxa2xx_spi_write(drv_data, SSTO, chip->timeout);
 
-	/* first set CR1 without interrupt and service enables */
+	/* First set CR1 without interrupt and service enables */
 	pxa2xx_spi_update(drv_data, SSCR1, change_mask, cr1);
 
-	/* see if we need to reload the config registers */
+	/* See if we need to reload the configuration registers */
 	pxa2xx_spi_update(drv_data, SSCR0, GENMASK(31, 0), cr0);
 
 	/* Restart the SSP */
@@ -1114,7 +1114,7 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 		u8 tx_level = read_SSSR_bits(drv_data, SSSR_TFL_MASK) >> 8;
 
 		if (tx_level) {
-			/* On MMP2, flipping SSE doesn't to empty TXFIFO. */
+			/* On MMP2, flipping SSE doesn't to empty Tx FIFO. */
 			dev_warn(&spi->dev, "%u bytes of garbage in Tx FIFO!\n", tx_level);
 			if (tx_level > transfer->len)
 				tx_level = transfer->len;
@@ -1134,7 +1134,7 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 
 	/*
 	 * Release the data by enabling service requests and interrupts,
-	 * without changing any mode bits
+	 * without changing any mode bits.
 	 */
 	pxa2xx_spi_write(drv_data, SSCR1, cr1);
 
@@ -1207,12 +1207,13 @@ static int setup_cs(struct spi_device *spi, struct chip_data *chip,
 	if (drv_data->ssp_type == CE4100_SSP)
 		return 0;
 
-	/* NOTE: setup() can be called multiple times, possibly with
-	 * different chip_info, release previously requested GPIO
+	/*
+	 * NOTE: setup() can be called multiple times, possibly with
+	 * different chip_info, release previously requested GPIO.
 	 */
 	cleanup_cs(spi);
 
-	/* If (*cs_control) is provided, ignore GPIO chip select */
+	/* If ->cs_control() is provided, ignore GPIO chip select */
 	if (chip_info->cs_control) {
 		chip->cs_control = chip_info->cs_control;
 		return 0;
@@ -1288,7 +1289,7 @@ static int setup(struct spi_device *spi)
 		break;
 	}
 
-	/* Only alloc on first setup */
+	/* Only allocate on the first setup */
 	chip = spi_get_ctldata(spi);
 	if (!chip) {
 		chip = kzalloc(sizeof(struct chip_data), GFP_KERNEL);
@@ -1307,8 +1308,10 @@ static int setup(struct spi_device *spi)
 		chip->timeout = TIMOUT_DFLT;
 	}
 
-	/* protocol drivers may change the chip settings, so...
-	 * if chip_info exists, use it */
+	/*
+	 * Protocol drivers may change the chip settings, so...
+	 * if chip_info exists, use it.
+	 */
 	chip_info = spi->controller_data;
 
 	/* chip_info isn't always needed */
@@ -1344,11 +1347,13 @@ static int setup(struct spi_device *spi)
 		chip->lpss_tx_threshold = tx_thres;
 	}
 
-	/* set dma burst and threshold outside of chip_info path so that if
-	 * chip_info goes away after setting chip->enable_dma, the
-	 * burst and threshold can still respond to changes in bits_per_word */
+	/*
+	 * Set DMA burst and threshold outside of chip_info path so that if
+	 * chip_info goes away after setting chip->enable_dma, the burst and
+	 * threshold can still respond to changes in bits_per_word.
+	 */
 	if (chip->enable_dma) {
-		/* set up legal burst and threshold for dma */
+		/* Set up legal burst and threshold for DMA */
 		if (pxa2xx_spi_set_dma_burst_and_threshold(chip, spi,
 						spi->bits_per_word,
 						&chip->dma_burst_size,
@@ -1677,7 +1682,7 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
 		ssp = &platform_info->ssp;
 
 	if (!ssp->mmio_base) {
-		dev_err(&pdev->dev, "failed to get ssp\n");
+		dev_err(&pdev->dev, "failed to get SSP\n");
 		return -ENODEV;
 	}
 
@@ -1699,7 +1704,7 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
 	controller->dev.of_node = dev->of_node;
 	controller->dev.fwnode = dev->fwnode;
 
-	/* the spi->mode bits understood by this driver: */
+	/* The spi->mode bits understood by this driver: */
 	controller->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LOOP;
 
 	controller->bus_num = ssp->port_id;
@@ -1787,7 +1792,7 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
 		      QUARK_X1000_SSCR1_TxTresh(TX_THRESH_QUARK_X1000_DFLT);
 		pxa2xx_spi_write(drv_data, SSCR1, tmp);
 
-		/* using the Motorola SPI protocol and use 8 bit frame */
+		/* Using the Motorola SPI protocol and use 8 bit frame */
 		tmp = QUARK_X1000_SSCR0_Motorola | QUARK_X1000_SSCR0_DataSize(8);
 		pxa2xx_spi_write(drv_data, SSCR0, tmp);
 		break;
@@ -1859,7 +1864,7 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, drv_data);
 	status = spi_register_controller(controller);
 	if (status) {
-		dev_err(&pdev->dev, "problem registering spi controller\n");
+		dev_err(&pdev->dev, "problem registering SPI controller\n");
 		goto out_error_pm_runtime_enabled;
 	}
 
diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h
index db9de46110ad..9a20fb88e50f 100644
--- a/drivers/spi/spi-pxa2xx.h
+++ b/drivers/spi/spi-pxa2xx.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2005 Stephen Street / StreetFire Sound Labs
- * Copyright (C) 2013, Intel Corporation
+ * Copyright (C) 2013, 2021 Intel Corporation
  */
 
 #ifndef SPI_PXA2XX_H
diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 2b21bc1f3c73..a3fec2de512f 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- *  Copyright (C) 2003 Russell King, All Rights Reserved.
+ * Copyright (C) 2003 Russell King, All Rights Reserved.
  *
  * This driver supports the following PXA CPU/SSP ports:-
  *
@@ -59,7 +59,7 @@ struct device_node;
 /* PXA27x, PXA3xx */
 #define SSCR0_EDSS	BIT(20)		/* Extended data size select */
 #define SSCR0_NCS	BIT(21)		/* Network clock select */
-#define SSCR0_RIM	BIT(22)		/* Receive FIFO overrrun interrupt mask */
+#define SSCR0_RIM	BIT(22)		/* Receive FIFO overrun interrupt mask */
 #define SSCR0_TUM	BIT(23)		/* Transmit FIFO underrun interrupt mask */
 #define SSCR0_FRDC	GENMASK(26, 24)	/* Frame rate divider control (mask) */
 #define SSCR0_SlotsPerFrm(x) (((x) - 1) << 24)	/* Time slots per frame [1..8] */
@@ -126,7 +126,7 @@ struct device_node;
 #define QUARK_X1000_SSCR1_EFWR	BIT(16)		/* Enable FIFO Write/Read */
 #define QUARK_X1000_SSCR1_STRF	BIT(17)		/* Select FIFO or EFWR */
 
-/* extra bits in PXA255, PXA26x and PXA27x SSP ports */
+/* Extra bits in PXA255, PXA26x and PXA27x SSP ports */
 #define SSCR0_TISSP		(1 << 4)	/* TI Sync Serial Protocol */
 #define SSCR0_PSP		(3 << 4)	/* PSP - Programmable Serial Protocol */
 
@@ -222,7 +222,8 @@ enum pxa_ssp_type {
 	CE4100_SSP,
 	MRFLD_SSP,
 	QUARK_X1000_SSP,
-	LPSS_LPT_SSP, /* Keep LPSS types sorted with lpss_platforms[] */
+	/* Keep LPSS types sorted with lpss_platforms[] */
+	LPSS_LPT_SSP,
 	LPSS_BYT_SSP,
 	LPSS_BSW_SSP,
 	LPSS_SPT_SSP,
diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h
index 12ef04d0896d..eaab121ee575 100644
--- a/include/linux/spi/pxa2xx_spi.h
+++ b/include/linux/spi/pxa2xx_spi.h
@@ -14,7 +14,10 @@
 
 struct dma_chan;
 
-/* device.platform_data for SSP controller devices */
+/*
+ * The platform data for SSP controller devices
+ * (resides in device.platform_data).
+ */
 struct pxa2xx_spi_controller {
 	u16 num_chipselect;
 	u8 enable_dma;
@@ -30,8 +33,11 @@ struct pxa2xx_spi_controller {
 	struct ssp_device ssp;
 };
 
-/* spi_board_info.controller_data for SPI slave devices,
- * copied to spi_device.platform_data ... mostly for dma tuning
+/*
+ * The controller specific data for SPI slave devices
+ * (resides in spi_board_info.controller_data),
+ * copied to spi_device.platform_data ... mostly for
+ * DMA tuning.
  */
 struct pxa2xx_spi_chip {
 	u8 tx_threshold;
-- 
cgit v1.2.3


From c49661aa6f7097047b7e86ad37b1cf308a7a8d4f Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Sun, 16 May 2021 19:23:48 -0700
Subject: skmsg: Remove unused parameters of sk_msg_wait_data()

'err' and 'flags' are not used, we can just get rid of them.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <song@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210517022348.50555-1-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 3 +--
 net/core/skmsg.c      | 3 +--
 net/ipv4/tcp_bpf.c    | 9 ++-------
 net/ipv4/udp_bpf.c    | 8 ++------
 4 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index aba0f0f429be..fcaa9a7996c8 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -126,8 +126,7 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 			      struct sk_msg *msg, u32 bytes);
 int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 			     struct sk_msg *msg, u32 bytes);
-int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
-		     long timeo, int *err);
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo);
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		   int len, int flags);
 
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 43ce17a6a585..f0b9decdf279 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,8 +399,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
 
-int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
-		     long timeo, int *err)
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	int ret = 0;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ad9d17923fc5..a80de92ea3b6 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -184,11 +184,11 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 msg_bytes_ready:
 	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
-		int data, err = 0;
 		long timeo;
+		int data;
 
 		timeo = sock_rcvtimeo(sk, nonblock);
-		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+		data = sk_msg_wait_data(sk, psock, timeo);
 		if (data) {
 			if (!sk_psock_queue_empty(psock))
 				goto msg_bytes_ready;
@@ -196,14 +196,9 @@ msg_bytes_ready:
 			sk_psock_put(sk, psock);
 			return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 		}
-		if (err) {
-			ret = err;
-			goto out;
-		}
 		copied = -EAGAIN;
 	}
 	ret = copied;
-out:
 	release_sock(sk);
 	sk_psock_put(sk, psock);
 	return ret;
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 954c4591a6fd..b07e4b6dda25 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -43,21 +43,17 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 msg_bytes_ready:
 	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
-		int data, err = 0;
 		long timeo;
+		int data;
 
 		timeo = sock_rcvtimeo(sk, nonblock);
-		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+		data = sk_msg_wait_data(sk, psock, timeo);
 		if (data) {
 			if (!sk_psock_queue_empty(psock))
 				goto msg_bytes_ready;
 			ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 			goto out;
 		}
-		if (err) {
-			ret = err;
-			goto out;
-		}
 		copied = -EAGAIN;
 	}
 	ret = copied;
-- 
cgit v1.2.3


From 86544c3de6a2185409c5a3d02f674ea223a14217 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Tue, 18 May 2021 20:49:24 +0300
Subject: net: mdio: provide shim implementation of devm_of_mdiobus_register

Similar to the way in which of_mdiobus_register() has a fallback to the
non-DT based mdiobus_register() when CONFIG_OF is not set, we can create
a shim for the device-managed devm_of_mdiobus_register() which calls
devm_mdiobus_register() and discards the struct device_node *.

In particular, this solves a build issue with the qca8k DSA driver which
uses devm_of_mdiobus_register and can be compiled without CONFIG_OF.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/of_mdio.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h
index 2b05e7f7c238..da633d34ab86 100644
--- a/include/linux/of_mdio.h
+++ b/include/linux/of_mdio.h
@@ -72,6 +72,13 @@ static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node *
 	return mdiobus_register(mdio);
 }
 
+static inline int devm_of_mdiobus_register(struct device *dev,
+					   struct mii_bus *mdio,
+					   struct device_node *np)
+{
+	return devm_mdiobus_register(dev, mdio);
+}
+
 static inline struct mdio_device *of_mdio_find_device(struct device_node *np)
 {
 	return NULL;
-- 
cgit v1.2.3


From 79a7f8bdb159d9914b58740f3d31d602a6e4aca8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 May 2021 17:36:03 -0700
Subject: bpf: Introduce bpf_sys_bpf() helper and program type.

Add placeholders for bpf_sys_bpf() helper and new program type.
Make sure to check that expected_attach_type is zero for future extensibility.
Allow tracing helper functions to be used in this program type, since they will
only execute from user context via bpf_prog_test_run.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210514003623.28033-2-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h            | 10 ++++++++
 include/linux/bpf_types.h      |  2 ++
 include/uapi/linux/bpf.h       |  8 +++++++
 kernel/bpf/syscall.c           | 53 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          |  8 +++++++
 net/bpf/test_run.c             | 43 ++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  8 +++++++
 7 files changed, 132 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 02b02cb29ce2..04a2bf41ae72 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1826,6 +1826,9 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
 
 struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
 void bpf_map_offload_map_free(struct bpf_map *map);
+int bpf_prog_test_run_syscall(struct bpf_prog *prog,
+			      const union bpf_attr *kattr,
+			      union bpf_attr __user *uattr);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
 					union bpf_attr *attr)
@@ -1851,6 +1854,13 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
 static inline void bpf_map_offload_map_free(struct bpf_map *map)
 {
 }
+
+static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog,
+					    const union bpf_attr *kattr,
+					    union bpf_attr __user *uattr)
+{
+	return -ENOTSUPP;
+}
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index f883f01a5061..a9db1eae6796 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -77,6 +77,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
 	       void *, void *)
 #endif /* CONFIG_BPF_LSM */
 #endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
+	      void *, void *)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ec6d85a81744..c92648f38144 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -937,6 +937,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_EXT,
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
+	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 };
 
 enum bpf_attach_type {
@@ -4735,6 +4736,12 @@ union bpf_attr {
  *		be zero-terminated except when **str_size** is 0.
  *
  *		Or **-EBUSY** if the per-CPU memory copy buffer is busy.
+ *
+ * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size)
+ * 	Description
+ * 		Execute bpf syscall with given arguments.
+ * 	Return
+ * 		A syscall result.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4903,6 +4910,7 @@ union bpf_attr {
 	FN(check_mtu),			\
 	FN(for_each_map_elem),		\
 	FN(snprintf),			\
+	FN(sys_bpf),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 941ca06d9dfa..b1e7352919cb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2014,6 +2014,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		if (expected_attach_type == BPF_SK_LOOKUP)
 			return 0;
 		return -EINVAL;
+	case BPF_PROG_TYPE_SYSCALL:
 	case BPF_PROG_TYPE_EXT:
 		if (expected_attach_type)
 			return -EINVAL;
@@ -4508,3 +4509,55 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 
 	return err;
 }
+
+static bool syscall_prog_is_valid_access(int off, int size,
+					 enum bpf_access_type type,
+					 const struct bpf_prog *prog,
+					 struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= U16_MAX)
+		return false;
+	if (off % size != 0)
+		return false;
+	return true;
+}
+
+BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+{
+	return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_sys_bpf_proto = {
+	.func		= bpf_sys_bpf,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+const struct bpf_func_proto * __weak
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id);
+}
+
+static const struct bpf_func_proto *
+syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_sys_bpf:
+		return &bpf_sys_bpf_proto;
+	default:
+		return tracing_prog_func_proto(func_id, prog);
+	}
+}
+
+const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
+	.get_func_proto  = syscall_prog_func_proto,
+	.is_valid_access = syscall_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops bpf_syscall_prog_ops = {
+	.test_run = bpf_prog_test_run_syscall,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bdfdb54676ea..37407d8fbca4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13196,6 +13196,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	int ret;
 	u64 key;
 
+	if (prog->type == BPF_PROG_TYPE_SYSCALL) {
+		if (prog->aux->sleepable)
+			/* attach_btf_id checked to be zero already */
+			return 0;
+		verbose(env, "Syscall programs can only be sleepable\n");
+		return -EINVAL;
+	}
+
 	if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
 	    prog->type != BPF_PROG_TYPE_LSM) {
 		verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index a5d72c48fb66..a6972d7ddf80 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -918,3 +918,46 @@ out:
 	kfree(user_ctx);
 	return ret;
 }
+
+int bpf_prog_test_run_syscall(struct bpf_prog *prog,
+			      const union bpf_attr *kattr,
+			      union bpf_attr __user *uattr)
+{
+	void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+	__u32 ctx_size_in = kattr->test.ctx_size_in;
+	void *ctx = NULL;
+	u32 retval;
+	int err = 0;
+
+	/* doesn't support data_in/out, ctx_out, duration, or repeat or flags */
+	if (kattr->test.data_in || kattr->test.data_out ||
+	    kattr->test.ctx_out || kattr->test.duration ||
+	    kattr->test.repeat || kattr->test.flags)
+		return -EINVAL;
+
+	if (ctx_size_in < prog->aux->max_ctx_offset ||
+	    ctx_size_in > U16_MAX)
+		return -EINVAL;
+
+	if (ctx_size_in) {
+		ctx = kzalloc(ctx_size_in, GFP_USER);
+		if (!ctx)
+			return -ENOMEM;
+		if (copy_from_user(ctx, ctx_in, ctx_size_in)) {
+			err = -EFAULT;
+			goto out;
+		}
+	}
+	retval = bpf_prog_run_pin_on_cpu(prog, ctx);
+
+	if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) {
+		err = -EFAULT;
+		goto out;
+	}
+	if (ctx_size_in)
+		if (copy_to_user(ctx_in, ctx, ctx_size_in))
+			err = -EFAULT;
+out:
+	kfree(ctx);
+	return err;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ec6d85a81744..c92648f38144 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -937,6 +937,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_EXT,
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
+	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 };
 
 enum bpf_attach_type {
@@ -4735,6 +4736,12 @@ union bpf_attr {
  *		be zero-terminated except when **str_size** is 0.
  *
  *		Or **-EBUSY** if the per-CPU memory copy buffer is busy.
+ *
+ * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size)
+ * 	Description
+ * 		Execute bpf syscall with given arguments.
+ * 	Return
+ * 		A syscall result.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4903,6 +4910,7 @@ union bpf_attr {
 	FN(check_mtu),			\
 	FN(for_each_map_elem),		\
 	FN(snprintf),			\
+	FN(sys_bpf),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From cdf7fb0a9f3d36b279590ac41e61c6b655db0d4a Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 May 2021 17:36:04 -0700
Subject: bpf: Introduce bpfptr_t user/kernel pointer.

Similar to sockptr_t introduce bpfptr_t with few additions:
make_bpfptr() creates new user/kernel pointer in the same address space as
existing user/kernel pointer.
bpfptr_add() advances the user/kernel pointer.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210514003623.28033-3-alexei.starovoitov@gmail.com
---
 include/linux/bpfptr.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 include/linux/bpfptr.h

(limited to 'include/linux')

diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
new file mode 100644
index 000000000000..5cdeab497cb3
--- /dev/null
+++ b/include/linux/bpfptr.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* A pointer that can point to either kernel or userspace memory. */
+#ifndef _LINUX_BPFPTR_H
+#define _LINUX_BPFPTR_H
+
+#include <linux/sockptr.h>
+
+typedef sockptr_t bpfptr_t;
+
+static inline bool bpfptr_is_kernel(bpfptr_t bpfptr)
+{
+	return bpfptr.is_kernel;
+}
+
+static inline bpfptr_t KERNEL_BPFPTR(void *p)
+{
+	return (bpfptr_t) { .kernel = p, .is_kernel = true };
+}
+
+static inline bpfptr_t USER_BPFPTR(void __user *p)
+{
+	return (bpfptr_t) { .user = p };
+}
+
+static inline bpfptr_t make_bpfptr(u64 addr, bool is_kernel)
+{
+	if (is_kernel)
+		return KERNEL_BPFPTR((void*) (uintptr_t) addr);
+	else
+		return USER_BPFPTR(u64_to_user_ptr(addr));
+}
+
+static inline bool bpfptr_is_null(bpfptr_t bpfptr)
+{
+	if (bpfptr_is_kernel(bpfptr))
+		return !bpfptr.kernel;
+	return !bpfptr.user;
+}
+
+static inline void bpfptr_add(bpfptr_t *bpfptr, size_t val)
+{
+	if (bpfptr_is_kernel(*bpfptr))
+		bpfptr->kernel += val;
+	else
+		bpfptr->user += val;
+}
+
+static inline int copy_from_bpfptr_offset(void *dst, bpfptr_t src,
+					  size_t offset, size_t size)
+{
+	return copy_from_sockptr_offset(dst, (sockptr_t) src, offset, size);
+}
+
+static inline int copy_from_bpfptr(void *dst, bpfptr_t src, size_t size)
+{
+	return copy_from_bpfptr_offset(dst, src, 0, size);
+}
+
+static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
+					const void *src, size_t size)
+{
+	return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size);
+}
+
+static inline void *memdup_bpfptr(bpfptr_t src, size_t len)
+{
+	return memdup_sockptr((sockptr_t) src, len);
+}
+
+static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
+{
+	return strncpy_from_sockptr(dst, (sockptr_t) src, count);
+}
+
+#endif /* _LINUX_BPFPTR_H */
-- 
cgit v1.2.3


From af2ac3e13e45752af03c8a933f9b6e18841b128b Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 May 2021 17:36:05 -0700
Subject: bpf: Prepare bpf syscall to be used from kernel and user space.

With the help from bpfptr_t prepare relevant bpf syscall commands
to be used from kernel and user space.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210514003623.28033-4-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h   |   8 ++--
 kernel/bpf/bpf_iter.c |  13 +++---
 kernel/bpf/syscall.c  | 113 +++++++++++++++++++++++++++++++++-----------------
 kernel/bpf/verifier.c |  34 ++++++++-------
 net/bpf/test_run.c    |   2 +-
 5 files changed, 104 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 04a2bf41ae72..7fd53380c981 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,6 +22,7 @@
 #include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
+#include <linux/bpfptr.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1428,7 +1429,7 @@ struct bpf_iter__bpf_map_elem {
 int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info);
 void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info);
 bool bpf_iter_prog_supported(struct bpf_prog *prog);
-int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog);
 int bpf_iter_new_fd(struct bpf_link *link);
 bool bpf_link_is_iter(struct bpf_link *link);
 struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop);
@@ -1459,7 +1460,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
 int bpf_get_file_flag(int flags);
-int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size,
+int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
 			     size_t actual_size);
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
@@ -1479,8 +1480,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 }
 
 /* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr,
-	      union bpf_attr __user *uattr);
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr);
 
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 931870f9cf56..2d4fbdbb194e 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -473,15 +473,16 @@ bool bpf_link_is_iter(struct bpf_link *link)
 	return link->ops == &bpf_iter_link_lops;
 }
 
-int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+			 struct bpf_prog *prog)
 {
-	union bpf_iter_link_info __user *ulinfo;
 	struct bpf_link_primer link_primer;
 	struct bpf_iter_target_info *tinfo;
 	union bpf_iter_link_info linfo;
 	struct bpf_iter_link *link;
 	u32 prog_btf_id, linfo_len;
 	bool existed = false;
+	bpfptr_t ulinfo;
 	int err;
 
 	if (attr->link_create.target_fd || attr->link_create.flags)
@@ -489,18 +490,18 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 
 	memset(&linfo, 0, sizeof(union bpf_iter_link_info));
 
-	ulinfo = u64_to_user_ptr(attr->link_create.iter_info);
+	ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
 	linfo_len = attr->link_create.iter_info_len;
-	if (!ulinfo ^ !linfo_len)
+	if (bpfptr_is_null(ulinfo) ^ !linfo_len)
 		return -EINVAL;
 
-	if (ulinfo) {
+	if (!bpfptr_is_null(ulinfo)) {
 		err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
 					       linfo_len);
 		if (err)
 			return err;
 		linfo_len = min_t(u32, linfo_len, sizeof(linfo));
-		if (copy_from_user(&linfo, ulinfo, linfo_len))
+		if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
 			return -EFAULT;
 	}
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b1e7352919cb..28387fe149ba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,11 +72,10 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
  * copy_from_user() call. However, this is not a concern since this function is
  * meant to be a future-proofing of bits.
  */
-int bpf_check_uarg_tail_zero(void __user *uaddr,
+int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
 			     size_t expected_size,
 			     size_t actual_size)
 {
-	unsigned char __user *addr = uaddr + expected_size;
 	int res;
 
 	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
@@ -85,7 +84,12 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
 	if (actual_size <= expected_size)
 		return 0;
 
-	res = check_zeroed_user(addr, actual_size - expected_size);
+	if (uaddr.is_kernel)
+		res = memchr_inv(uaddr.kernel + expected_size, 0,
+				 actual_size - expected_size) == NULL;
+	else
+		res = check_zeroed_user(uaddr.user + expected_size,
+					actual_size - expected_size);
 	if (res < 0)
 		return res;
 	return res ? 0 : -E2BIG;
@@ -1004,6 +1008,17 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
 	return NULL;
 }
 
+static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
+{
+	if (key_size)
+		return memdup_bpfptr(ukey, key_size);
+
+	if (!bpfptr_is_null(ukey))
+		return ERR_PTR(-EINVAL);
+
+	return NULL;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
 
@@ -1074,10 +1089,10 @@ err_put:
 
 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
-static int map_update_elem(union bpf_attr *attr)
+static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 {
-	void __user *ukey = u64_to_user_ptr(attr->key);
-	void __user *uvalue = u64_to_user_ptr(attr->value);
+	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
+	bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
 	int ufd = attr->map_fd;
 	struct bpf_map *map;
 	void *key, *value;
@@ -1103,7 +1118,7 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	key = __bpf_copy_key(ukey, map->key_size);
+	key = ___bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
@@ -1123,7 +1138,7 @@ static int map_update_elem(union bpf_attr *attr)
 		goto free_key;
 
 	err = -EFAULT;
-	if (copy_from_user(value, uvalue, value_size) != 0)
+	if (copy_from_bpfptr(value, uvalue, value_size) != 0)
 		goto free_value;
 
 	err = bpf_map_update_value(map, f, key, value, attr->flags);
@@ -2076,7 +2091,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 /* last field in 'union bpf_attr' used by this command */
 #define	BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
 
-static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
@@ -2101,8 +2116,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 		return -EPERM;
 
 	/* copy eBPF program license from user space */
-	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
-			      sizeof(license) - 1) < 0)
+	if (strncpy_from_bpfptr(license,
+				make_bpfptr(attr->license, uattr.is_kernel),
+				sizeof(license) - 1) < 0)
 		return -EFAULT;
 	license[sizeof(license) - 1] = 0;
 
@@ -2186,8 +2202,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	prog->len = attr->insn_cnt;
 
 	err = -EFAULT;
-	if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
-			   bpf_prog_insn_size(prog)) != 0)
+	if (copy_from_bpfptr(prog->insns,
+			     make_bpfptr(attr->insns, uattr.is_kernel),
+			     bpf_prog_insn_size(prog)) != 0)
 		goto free_prog_sec;
 
 	prog->orig_prog = NULL;
@@ -3423,7 +3440,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
 	u32 ulen;
 	int err;
 
-	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -3702,7 +3719,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
 	u32 info_len = attr->info.info_len;
 	int err;
 
-	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -3745,7 +3762,7 @@ static int bpf_btf_get_info_by_fd(struct file *file,
 	u32 info_len = attr->info.info_len;
 	int err;
 
-	err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
 	if (err)
 		return err;
 
@@ -3762,7 +3779,7 @@ static int bpf_link_get_info_by_fd(struct file *file,
 	u32 info_len = attr->info.info_len;
 	int err;
 
-	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -4023,13 +4040,14 @@ err_put:
 	return err;
 }
 
-static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+				   struct bpf_prog *prog)
 {
 	if (attr->link_create.attach_type != prog->expected_attach_type)
 		return -EINVAL;
 
 	if (prog->expected_attach_type == BPF_TRACE_ITER)
-		return bpf_iter_link_attach(attr, prog);
+		return bpf_iter_link_attach(attr, uattr, prog);
 	else if (prog->type == BPF_PROG_TYPE_EXT)
 		return bpf_tracing_prog_attach(prog,
 					       attr->link_create.target_fd,
@@ -4038,7 +4056,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *
 }
 
 #define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
-static int link_create(union bpf_attr *attr)
+static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 {
 	enum bpf_prog_type ptype;
 	struct bpf_prog *prog;
@@ -4057,7 +4075,7 @@ static int link_create(union bpf_attr *attr)
 		goto out;
 
 	if (prog->type == BPF_PROG_TYPE_EXT) {
-		ret = tracing_bpf_link_attach(attr, prog);
+		ret = tracing_bpf_link_attach(attr, uattr, prog);
 		goto out;
 	}
 
@@ -4078,7 +4096,7 @@ static int link_create(union bpf_attr *attr)
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_TRACING:
-		ret = tracing_bpf_link_attach(attr, prog);
+		ret = tracing_bpf_link_attach(attr, uattr, prog);
 		break;
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_SK_LOOKUP:
@@ -4366,7 +4384,7 @@ out_prog_put:
 	return ret;
 }
 
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
 	int err;
@@ -4381,7 +4399,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 
 	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
 	memset(&attr, 0, sizeof(attr));
-	if (copy_from_user(&attr, uattr, size) != 0)
+	if (copy_from_bpfptr(&attr, uattr, size) != 0)
 		return -EFAULT;
 
 	err = security_bpf(cmd, &attr, size);
@@ -4396,7 +4414,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = map_lookup_elem(&attr);
 		break;
 	case BPF_MAP_UPDATE_ELEM:
-		err = map_update_elem(&attr);
+		err = map_update_elem(&attr, uattr);
 		break;
 	case BPF_MAP_DELETE_ELEM:
 		err = map_delete_elem(&attr);
@@ -4423,21 +4441,21 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = bpf_prog_detach(&attr);
 		break;
 	case BPF_PROG_QUERY:
-		err = bpf_prog_query(&attr, uattr);
+		err = bpf_prog_query(&attr, uattr.user);
 		break;
 	case BPF_PROG_TEST_RUN:
-		err = bpf_prog_test_run(&attr, uattr);
+		err = bpf_prog_test_run(&attr, uattr.user);
 		break;
 	case BPF_PROG_GET_NEXT_ID:
-		err = bpf_obj_get_next_id(&attr, uattr,
+		err = bpf_obj_get_next_id(&attr, uattr.user,
 					  &prog_idr, &prog_idr_lock);
 		break;
 	case BPF_MAP_GET_NEXT_ID:
-		err = bpf_obj_get_next_id(&attr, uattr,
+		err = bpf_obj_get_next_id(&attr, uattr.user,
 					  &map_idr, &map_idr_lock);
 		break;
 	case BPF_BTF_GET_NEXT_ID:
-		err = bpf_obj_get_next_id(&attr, uattr,
+		err = bpf_obj_get_next_id(&attr, uattr.user,
 					  &btf_idr, &btf_idr_lock);
 		break;
 	case BPF_PROG_GET_FD_BY_ID:
@@ -4447,7 +4465,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = bpf_map_get_fd_by_id(&attr);
 		break;
 	case BPF_OBJ_GET_INFO_BY_FD:
-		err = bpf_obj_get_info_by_fd(&attr, uattr);
+		err = bpf_obj_get_info_by_fd(&attr, uattr.user);
 		break;
 	case BPF_RAW_TRACEPOINT_OPEN:
 		err = bpf_raw_tracepoint_open(&attr);
@@ -4459,26 +4477,26 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = bpf_btf_get_fd_by_id(&attr);
 		break;
 	case BPF_TASK_FD_QUERY:
-		err = bpf_task_fd_query(&attr, uattr);
+		err = bpf_task_fd_query(&attr, uattr.user);
 		break;
 	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
 		err = map_lookup_and_delete_elem(&attr);
 		break;
 	case BPF_MAP_LOOKUP_BATCH:
-		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
 		break;
 	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
-		err = bpf_map_do_batch(&attr, uattr,
+		err = bpf_map_do_batch(&attr, uattr.user,
 				       BPF_MAP_LOOKUP_AND_DELETE_BATCH);
 		break;
 	case BPF_MAP_UPDATE_BATCH:
-		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
 		break;
 	case BPF_MAP_DELETE_BATCH:
-		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
 		break;
 	case BPF_LINK_CREATE:
-		err = link_create(&attr);
+		err = link_create(&attr, uattr);
 		break;
 	case BPF_LINK_UPDATE:
 		err = link_update(&attr);
@@ -4487,7 +4505,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = bpf_link_get_fd_by_id(&attr);
 		break;
 	case BPF_LINK_GET_NEXT_ID:
-		err = bpf_obj_get_next_id(&attr, uattr,
+		err = bpf_obj_get_next_id(&attr, uattr.user,
 					  &link_idr, &link_idr_lock);
 		break;
 	case BPF_ENABLE_STATS:
@@ -4510,6 +4528,11 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	return err;
 }
 
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+	return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
+}
+
 static bool syscall_prog_is_valid_access(int off, int size,
 					 enum bpf_access_type type,
 					 const struct bpf_prog *prog,
@@ -4524,7 +4547,19 @@ static bool syscall_prog_is_valid_access(int off, int size,
 
 BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
 {
-	return -EINVAL;
+	switch (cmd) {
+	case BPF_MAP_CREATE:
+	case BPF_MAP_UPDATE_ELEM:
+	case BPF_MAP_FREEZE:
+	case BPF_PROG_LOAD:
+		break;
+	/* case BPF_PROG_TEST_RUN:
+	 * is not part of this list to prevent recursive test_run
+	 */
+	default:
+		return -EINVAL;
+	}
+	return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
 }
 
 const struct bpf_func_proto bpf_sys_bpf_proto = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 37407d8fbca4..e63c7d60e00d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9436,7 +9436,7 @@ static int check_abnormal_return(struct bpf_verifier_env *env)
 
 static int check_btf_func(struct bpf_verifier_env *env,
 			  const union bpf_attr *attr,
-			  union bpf_attr __user *uattr)
+			  bpfptr_t uattr)
 {
 	const struct btf_type *type, *func_proto, *ret_type;
 	u32 i, nfuncs, urec_size, min_size;
@@ -9445,7 +9445,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
 	struct bpf_func_info_aux *info_aux = NULL;
 	struct bpf_prog *prog;
 	const struct btf *btf;
-	void __user *urecord;
+	bpfptr_t urecord;
 	u32 prev_offset = 0;
 	bool scalar_return;
 	int ret = -ENOMEM;
@@ -9473,7 +9473,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
 	prog = env->prog;
 	btf = prog->aux->btf;
 
-	urecord = u64_to_user_ptr(attr->func_info);
+	urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
 	min_size = min_t(u32, krec_size, urec_size);
 
 	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
@@ -9491,13 +9491,15 @@ static int check_btf_func(struct bpf_verifier_env *env,
 				/* set the size kernel expects so loader can zero
 				 * out the rest of the record.
 				 */
-				if (put_user(min_size, &uattr->func_info_rec_size))
+				if (copy_to_bpfptr_offset(uattr,
+							  offsetof(union bpf_attr, func_info_rec_size),
+							  &min_size, sizeof(min_size)))
 					ret = -EFAULT;
 			}
 			goto err_free;
 		}
 
-		if (copy_from_user(&krecord[i], urecord, min_size)) {
+		if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
 			ret = -EFAULT;
 			goto err_free;
 		}
@@ -9549,7 +9551,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
 		}
 
 		prev_offset = krecord[i].insn_off;
-		urecord += urec_size;
+		bpfptr_add(&urecord, urec_size);
 	}
 
 	prog->aux->func_info = krecord;
@@ -9581,14 +9583,14 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 
 static int check_btf_line(struct bpf_verifier_env *env,
 			  const union bpf_attr *attr,
-			  union bpf_attr __user *uattr)
+			  bpfptr_t uattr)
 {
 	u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
 	struct bpf_subprog_info *sub;
 	struct bpf_line_info *linfo;
 	struct bpf_prog *prog;
 	const struct btf *btf;
-	void __user *ulinfo;
+	bpfptr_t ulinfo;
 	int err;
 
 	nr_linfo = attr->line_info_cnt;
@@ -9614,7 +9616,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
 
 	s = 0;
 	sub = env->subprog_info;
-	ulinfo = u64_to_user_ptr(attr->line_info);
+	ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
 	expected_size = sizeof(struct bpf_line_info);
 	ncopy = min_t(u32, expected_size, rec_size);
 	for (i = 0; i < nr_linfo; i++) {
@@ -9622,14 +9624,15 @@ static int check_btf_line(struct bpf_verifier_env *env,
 		if (err) {
 			if (err == -E2BIG) {
 				verbose(env, "nonzero tailing record in line_info");
-				if (put_user(expected_size,
-					     &uattr->line_info_rec_size))
+				if (copy_to_bpfptr_offset(uattr,
+							  offsetof(union bpf_attr, line_info_rec_size),
+							  &expected_size, sizeof(expected_size)))
 					err = -EFAULT;
 			}
 			goto err_free;
 		}
 
-		if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
+		if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
 			err = -EFAULT;
 			goto err_free;
 		}
@@ -9681,7 +9684,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
 		}
 
 		prev_offset = linfo[i].insn_off;
-		ulinfo += rec_size;
+		bpfptr_add(&ulinfo, rec_size);
 	}
 
 	if (s != env->subprog_cnt) {
@@ -9703,7 +9706,7 @@ err_free:
 
 static int check_btf_info(struct bpf_verifier_env *env,
 			  const union bpf_attr *attr,
-			  union bpf_attr __user *uattr)
+			  bpfptr_t uattr)
 {
 	struct btf *btf;
 	int err;
@@ -13275,8 +13278,7 @@ struct btf *bpf_get_btf_vmlinux(void)
 	return btf_vmlinux;
 }
 
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
-	      union bpf_attr __user *uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 {
 	u64 start_time = ktime_get_ns();
 	struct bpf_verifier_env *env;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index a6972d7ddf80..aa47af349ba8 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -409,7 +409,7 @@ static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size)
 		return ERR_PTR(-ENOMEM);
 
 	if (data_in) {
-		err = bpf_check_uarg_tail_zero(data_in, max_size, size);
+		err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size);
 		if (err) {
 			kfree(data);
 			return ERR_PTR(err);
-- 
cgit v1.2.3


From c571bd752e91602f092823b2f1ee685a74d2726c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 May 2021 17:36:08 -0700
Subject: bpf: Make btf_load command to be bpfptr_t compatible.

Similar to prog_load make btf_load command to be availble to
bpf_prog_type_syscall program.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210514003623.28033-7-alexei.starovoitov@gmail.com
---
 include/linux/btf.h  | 2 +-
 kernel/bpf/btf.c     | 8 ++++----
 kernel/bpf/syscall.c | 7 ++++---
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 3bac66e0183a..94a0c976c90f 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -21,7 +21,7 @@ extern const struct file_operations btf_fops;
 
 void btf_get(struct btf *btf);
 void btf_put(struct btf *btf);
-int btf_new_fd(const union bpf_attr *attr);
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr);
 struct btf *btf_get_by_fd(int fd);
 int btf_get_info_by_fd(const struct btf *btf,
 		       const union bpf_attr *attr,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0600ed325fa0..fbf6c06a9d62 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4257,7 +4257,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 	return 0;
 }
 
-static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 			     u32 log_level, char __user *log_ubuf, u32 log_size)
 {
 	struct btf_verifier_env *env = NULL;
@@ -4306,7 +4306,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 	btf->data = data;
 	btf->data_size = btf_data_size;
 
-	if (copy_from_user(data, btf_data, btf_data_size)) {
+	if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
 		err = -EFAULT;
 		goto errout;
 	}
@@ -5780,12 +5780,12 @@ static int __btf_new_fd(struct btf *btf)
 	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
 }
 
-int btf_new_fd(const union bpf_attr *attr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
 {
 	struct btf *btf;
 	int ret;
 
-	btf = btf_parse(u64_to_user_ptr(attr->btf),
+	btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
 			attr->btf_size, attr->btf_log_level,
 			u64_to_user_ptr(attr->btf_log_buf),
 			attr->btf_log_size);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 28387fe149ba..415865c49dd4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3842,7 +3842,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 
 #define BPF_BTF_LOAD_LAST_FIELD btf_log_level
 
-static int bpf_btf_load(const union bpf_attr *attr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
 {
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
@@ -3850,7 +3850,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
 	if (!bpf_capable())
 		return -EPERM;
 
-	return btf_new_fd(attr);
+	return btf_new_fd(attr, uattr);
 }
 
 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -4471,7 +4471,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
 	case BPF_BTF_LOAD:
-		err = bpf_btf_load(&attr);
+		err = bpf_btf_load(&attr, uattr);
 		break;
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
@@ -4552,6 +4552,7 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
 	case BPF_MAP_UPDATE_ELEM:
 	case BPF_MAP_FREEZE:
 	case BPF_PROG_LOAD:
+	case BPF_BTF_LOAD:
 		break;
 	/* case BPF_PROG_TEST_RUN:
 	 * is not part of this list to prevent recursive test_run
-- 
cgit v1.2.3


From 387544bfa291a22383d60b40f887360e2b931ec6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 May 2021 17:36:10 -0700
Subject: bpf: Introduce fd_idx

Typical program loading sequence involves creating bpf maps and applying
map FDs into bpf instructions in various places in the bpf program.
This job is done by libbpf that is using compiler generated ELF relocations
to patch certain instruction after maps are created and BTFs are loaded.
The goal of fd_idx is to allow bpf instructions to stay immutable
after compilation. At load time the libbpf would still create maps as usual,
but it wouldn't need to patch instructions. It would store map_fds into
__u32 fd_array[] and would pass that pointer to sys_bpf(BPF_PROG_LOAD).

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210514003623.28033-9-alexei.starovoitov@gmail.com
---
 include/linux/bpf_verifier.h   |  1 +
 include/uapi/linux/bpf.h       | 16 +++++++++-----
 kernel/bpf/syscall.c           |  2 +-
 kernel/bpf/verifier.c          | 47 +++++++++++++++++++++++++++++++++---------
 tools/include/uapi/linux/bpf.h | 16 +++++++++-----
 5 files changed, 61 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d4632aa3ca50..e774ecc1cd1f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -450,6 +450,7 @@ struct bpf_verifier_env {
 	u32 peak_states;
 	/* longest register parentage chain walked for liveness marking */
 	u32 longest_mark_read_walk;
+	bpfptr_t fd_array;
 };
 
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c92648f38144..de58a714ed36 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1098,8 +1098,8 @@ enum bpf_link_type {
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * the following extensions:
  *
- * insn[0].src_reg:  BPF_PSEUDO_MAP_FD
- * insn[0].imm:      map fd
+ * insn[0].src_reg:  BPF_PSEUDO_MAP_[FD|IDX]
+ * insn[0].imm:      map fd or fd_idx
  * insn[1].imm:      0
  * insn[0].off:      0
  * insn[1].off:      0
@@ -1107,15 +1107,19 @@ enum bpf_link_type {
  * verifier type:    CONST_PTR_TO_MAP
  */
 #define BPF_PSEUDO_MAP_FD	1
-/* insn[0].src_reg:  BPF_PSEUDO_MAP_VALUE
- * insn[0].imm:      map fd
+#define BPF_PSEUDO_MAP_IDX	5
+
+/* insn[0].src_reg:  BPF_PSEUDO_MAP_[IDX_]VALUE
+ * insn[0].imm:      map fd or fd_idx
  * insn[1].imm:      offset into value
  * insn[0].off:      0
  * insn[1].off:      0
  * ldimm64 rewrite:  address of map[0]+offset
  * verifier type:    PTR_TO_MAP_VALUE
  */
-#define BPF_PSEUDO_MAP_VALUE	2
+#define BPF_PSEUDO_MAP_VALUE		2
+#define BPF_PSEUDO_MAP_IDX_VALUE	6
+
 /* insn[0].src_reg:  BPF_PSEUDO_BTF_ID
  * insn[0].imm:      kernel btd id of VAR
  * insn[1].imm:      0
@@ -1315,6 +1319,8 @@ union bpf_attr {
 			/* or valid module BTF object fd or 0 to attach to vmlinux */
 			__u32		attach_btf_obj_fd;
 		};
+		__u32		:32;		/* pad */
+		__aligned_u64	fd_array;	/* array of FDs */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 415865c49dd4..da7dc2406470 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2089,7 +2089,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
+#define	BPF_PROG_LOAD_LAST_FIELD fd_array
 
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e63c7d60e00d..9189eecb26dd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8915,12 +8915,14 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	mark_reg_known_zero(env, regs, insn->dst_reg);
 	dst_reg->map_ptr = map;
 
-	if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
+	if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
+	    insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
 		dst_reg->type = PTR_TO_MAP_VALUE;
 		dst_reg->off = aux->map_off;
 		if (map_value_has_spin_lock(map))
 			dst_reg->id = ++env->id_gen;
-	} else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+	} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
+		   insn->src_reg == BPF_PSEUDO_MAP_IDX) {
 		dst_reg->type = CONST_PTR_TO_MAP;
 	} else {
 		verbose(env, "bpf verifier is misconfigured\n");
@@ -11173,6 +11175,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 			struct bpf_map *map;
 			struct fd f;
 			u64 addr;
+			u32 fd;
 
 			if (i == insn_cnt - 1 || insn[1].code != 0 ||
 			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -11202,16 +11205,38 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 			/* In final convert_pseudo_ld_imm64() step, this is
 			 * converted into regular 64-bit imm load insn.
 			 */
-			if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
-			     insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
-			    (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
-			     insn[1].imm != 0)) {
-				verbose(env,
-					"unrecognized bpf_ld_imm64 insn\n");
+			switch (insn[0].src_reg) {
+			case BPF_PSEUDO_MAP_VALUE:
+			case BPF_PSEUDO_MAP_IDX_VALUE:
+				break;
+			case BPF_PSEUDO_MAP_FD:
+			case BPF_PSEUDO_MAP_IDX:
+				if (insn[1].imm == 0)
+					break;
+				fallthrough;
+			default:
+				verbose(env, "unrecognized bpf_ld_imm64 insn\n");
 				return -EINVAL;
 			}
 
-			f = fdget(insn[0].imm);
+			switch (insn[0].src_reg) {
+			case BPF_PSEUDO_MAP_IDX_VALUE:
+			case BPF_PSEUDO_MAP_IDX:
+				if (bpfptr_is_null(env->fd_array)) {
+					verbose(env, "fd_idx without fd_array is invalid\n");
+					return -EPROTO;
+				}
+				if (copy_from_bpfptr_offset(&fd, env->fd_array,
+							    insn[0].imm * sizeof(fd),
+							    sizeof(fd)))
+					return -EFAULT;
+				break;
+			default:
+				fd = insn[0].imm;
+				break;
+			}
+
+			f = fdget(fd);
 			map = __bpf_map_get(f);
 			if (IS_ERR(map)) {
 				verbose(env, "fd %d is not pointing to valid bpf_map\n",
@@ -11226,7 +11251,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 			}
 
 			aux = &env->insn_aux_data[i];
-			if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+			if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
+			    insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
 				addr = (unsigned long)map;
 			} else {
 				u32 off = insn[1].imm;
@@ -13308,6 +13334,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 		env->insn_aux_data[i].orig_idx = i;
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
+	env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
 	is_priv = bpf_capable();
 
 	bpf_get_btf_vmlinux();
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c92648f38144..de58a714ed36 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1098,8 +1098,8 @@ enum bpf_link_type {
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * the following extensions:
  *
- * insn[0].src_reg:  BPF_PSEUDO_MAP_FD
- * insn[0].imm:      map fd
+ * insn[0].src_reg:  BPF_PSEUDO_MAP_[FD|IDX]
+ * insn[0].imm:      map fd or fd_idx
  * insn[1].imm:      0
  * insn[0].off:      0
  * insn[1].off:      0
@@ -1107,15 +1107,19 @@ enum bpf_link_type {
  * verifier type:    CONST_PTR_TO_MAP
  */
 #define BPF_PSEUDO_MAP_FD	1
-/* insn[0].src_reg:  BPF_PSEUDO_MAP_VALUE
- * insn[0].imm:      map fd
+#define BPF_PSEUDO_MAP_IDX	5
+
+/* insn[0].src_reg:  BPF_PSEUDO_MAP_[IDX_]VALUE
+ * insn[0].imm:      map fd or fd_idx
  * insn[1].imm:      offset into value
  * insn[0].off:      0
  * insn[1].off:      0
  * ldimm64 rewrite:  address of map[0]+offset
  * verifier type:    PTR_TO_MAP_VALUE
  */
-#define BPF_PSEUDO_MAP_VALUE	2
+#define BPF_PSEUDO_MAP_VALUE		2
+#define BPF_PSEUDO_MAP_IDX_VALUE	6
+
 /* insn[0].src_reg:  BPF_PSEUDO_BTF_ID
  * insn[0].imm:      kernel btd id of VAR
  * insn[1].imm:      0
@@ -1315,6 +1319,8 @@ union bpf_attr {
 			/* or valid module BTF object fd or 0 to attach to vmlinux */
 			__u32		attach_btf_obj_fd;
 		};
+		__u32		:32;		/* pad */
+		__aligned_u64	fd_array;	/* array of FDs */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
-- 
cgit v1.2.3


From 3d78417b60fba249cc555468cb72d96f5cde2964 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 13 May 2021 17:36:11 -0700
Subject: bpf: Add bpf_btf_find_by_name_kind() helper.

Add new helper:
long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags)
Description
	Find BTF type with given name and kind in vmlinux BTF or in module's BTFs.
Return
	Returns btf_id and btf_obj_fd in lower and upper 32 bits.

It will be used by loader program to find btf_id to attach the program to
and to find btf_ids of ksyms.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210514003623.28033-10-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       |  7 +++++
 kernel/bpf/btf.c               | 62 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  2 ++
 tools/include/uapi/linux/bpf.h |  7 +++++
 5 files changed, 79 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7fd53380c981..9dc44ba97584 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1974,6 +1974,7 @@ extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
 extern const struct bpf_func_proto bpf_task_storage_get_proto;
 extern const struct bpf_func_proto bpf_task_storage_delete_proto;
 extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
+extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index de58a714ed36..3cc07351c1cf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4748,6 +4748,12 @@ union bpf_attr {
  * 		Execute bpf syscall with given arguments.
  * 	Return
  * 		A syscall result.
+ *
+ * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags)
+ * 	Description
+ * 		Find BTF type with given name and kind in vmlinux BTF or in module's BTFs.
+ * 	Return
+ * 		Returns btf_id and btf_obj_fd in lower and upper 32 bits.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4917,6 +4923,7 @@ union bpf_attr {
 	FN(for_each_map_elem),		\
 	FN(snprintf),			\
 	FN(sys_bpf),			\
+	FN(btf_find_by_name_kind),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index fbf6c06a9d62..85716327c375 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6085,3 +6085,65 @@ struct module *btf_try_get_module(const struct btf *btf)
 
 	return res;
 }
+
+BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
+{
+	struct btf *btf;
+	long ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (name_sz <= 1 || name[name_sz - 1])
+		return -EINVAL;
+
+	btf = bpf_get_btf_vmlinux();
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	ret = btf_find_by_name_kind(btf, name, kind);
+	/* ret is never zero, since btf_find_by_name_kind returns
+	 * positive btf_id or negative error.
+	 */
+	if (ret < 0) {
+		struct btf *mod_btf;
+		int id;
+
+		/* If name is not found in vmlinux's BTF then search in module's BTFs */
+		spin_lock_bh(&btf_idr_lock);
+		idr_for_each_entry(&btf_idr, mod_btf, id) {
+			if (!btf_is_module(mod_btf))
+				continue;
+			/* linear search could be slow hence unlock/lock
+			 * the IDR to avoiding holding it for too long
+			 */
+			btf_get(mod_btf);
+			spin_unlock_bh(&btf_idr_lock);
+			ret = btf_find_by_name_kind(mod_btf, name, kind);
+			if (ret > 0) {
+				int btf_obj_fd;
+
+				btf_obj_fd = __btf_new_fd(mod_btf);
+				if (btf_obj_fd < 0) {
+					btf_put(mod_btf);
+					return btf_obj_fd;
+				}
+				return ret | (((u64)btf_obj_fd) << 32);
+			}
+			spin_lock_bh(&btf_idr_lock);
+			btf_put(mod_btf);
+		}
+		spin_unlock_bh(&btf_idr_lock);
+	}
+	return ret;
+}
+
+const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
+	.func		= bpf_btf_find_by_name_kind,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index da7dc2406470..f93ff2ebf96d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4584,6 +4584,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_sys_bpf:
 		return &bpf_sys_bpf_proto;
+	case BPF_FUNC_btf_find_by_name_kind:
+		return &bpf_btf_find_by_name_kind_proto;
 	default:
 		return tracing_prog_func_proto(func_id, prog);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index de58a714ed36..3cc07351c1cf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4748,6 +4748,12 @@ union bpf_attr {
  * 		Execute bpf syscall with given arguments.
  * 	Return
  * 		A syscall result.
+ *
+ * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags)
+ * 	Description
+ * 		Find BTF type with given name and kind in vmlinux BTF or in module's BTFs.
+ * 	Return
+ * 		Returns btf_id and btf_obj_fd in lower and upper 32 bits.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4917,6 +4923,7 @@ union bpf_attr {
 	FN(for_each_map_elem),		\
 	FN(snprintf),			\
 	FN(sys_bpf),			\
+	FN(btf_find_by_name_kind),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From 3410fbcd47dc6479af4309febf760ccaa5efb472 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Wed, 12 May 2021 13:52:27 +0300
Subject: {net, RDMA}/mlx5: Fix override of log_max_qp by other device

mlx5_core_dev holds pointer to static profile, hence when the
log_max_qp of the profile is override by some device, then it
effect all other mlx5 devices that share the same profile.
Fix it by having a profile instance for every mlx5 device.

Fixes: 883371c453b9 ("net/mlx5: Check FW limitations on log_max_qp before setting it")
Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mr.c                |  4 +--
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 11 +++----
 include/linux/mlx5/driver.h                    | 44 +++++++++++++-------------
 3 files changed, 29 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 4388afeff251..9662cd39c7ff 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -743,10 +743,10 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
 			   MLX5_IB_UMR_OCTOWORD;
 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
-		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
+		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
 		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
 		    mlx5_ib_can_load_pas_with_umr(dev, 0))
-			ent->limit = dev->mdev->profile->mr_cache[i].limit;
+			ent->limit = dev->mdev->profile.mr_cache[i].limit;
 		else
 			ent->limit = 0;
 		spin_lock_irq(&ent->lock);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index c114365eb126..a1d67bd7fb43 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -503,7 +503,7 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx)
 
 static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 {
-	struct mlx5_profile *prof = dev->profile;
+	struct mlx5_profile *prof = &dev->profile;
 	void *set_hca_cap;
 	int err;
 
@@ -524,11 +524,11 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 		 to_fw_pkey_sz(dev, 128));
 
 	/* Check log_max_qp from HCA caps to set in current profile */
-	if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < profile[prof_sel].log_max_qp) {
+	if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < prof->log_max_qp) {
 		mlx5_core_warn(dev, "log_max_qp value in current profile is %d, changing it to HCA capability limit (%d)\n",
-			       profile[prof_sel].log_max_qp,
+			       prof->log_max_qp,
 			       MLX5_CAP_GEN_MAX(dev, log_max_qp));
-		profile[prof_sel].log_max_qp = MLX5_CAP_GEN_MAX(dev, log_max_qp);
+		prof->log_max_qp = MLX5_CAP_GEN_MAX(dev, log_max_qp);
 	}
 	if (prof->mask & MLX5_PROF_MASK_QP_SIZE)
 		MLX5_SET(cmd_hca_cap, set_hca_cap, log_max_qp,
@@ -1381,8 +1381,7 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
 	struct mlx5_priv *priv = &dev->priv;
 	int err;
 
-	dev->profile = &profile[profile_idx];
-
+	memcpy(&dev->profile, &profile[profile_idx], sizeof(dev->profile));
 	INIT_LIST_HEAD(&priv->ctx_list);
 	spin_lock_init(&priv->ctx_lock);
 	mutex_init(&dev->intf_state_mutex);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f8e8d7e90616..020a8f7fdbdd 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -703,6 +703,27 @@ struct mlx5_hv_vhca;
 #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
 #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
 
+enum {
+	MLX5_PROF_MASK_QP_SIZE		= (u64)1 << 0,
+	MLX5_PROF_MASK_MR_CACHE		= (u64)1 << 1,
+};
+
+enum {
+	MR_CACHE_LAST_STD_ENTRY = 20,
+	MLX5_IMR_MTT_CACHE_ENTRY,
+	MLX5_IMR_KSM_CACHE_ENTRY,
+	MAX_MR_CACHE_ENTRIES
+};
+
+struct mlx5_profile {
+	u64	mask;
+	u8	log_max_qp;
+	struct {
+		int	size;
+		int	limit;
+	} mr_cache[MAX_MR_CACHE_ENTRIES];
+};
+
 struct mlx5_core_dev {
 	struct device *device;
 	enum mlx5_coredev_type coredev_type;
@@ -731,7 +752,7 @@ struct mlx5_core_dev {
 	struct mutex		intf_state_mutex;
 	unsigned long		intf_state;
 	struct mlx5_priv	priv;
-	struct mlx5_profile	*profile;
+	struct mlx5_profile	profile;
 	u32			issi;
 	struct mlx5e_resources  mlx5e_res;
 	struct mlx5_dm          *dm;
@@ -1083,18 +1104,6 @@ static inline u8 mlx5_mkey_variant(u32 mkey)
 	return mkey & 0xff;
 }
 
-enum {
-	MLX5_PROF_MASK_QP_SIZE		= (u64)1 << 0,
-	MLX5_PROF_MASK_MR_CACHE		= (u64)1 << 1,
-};
-
-enum {
-	MR_CACHE_LAST_STD_ENTRY = 20,
-	MLX5_IMR_MTT_CACHE_ENTRY,
-	MLX5_IMR_KSM_CACHE_ENTRY,
-	MAX_MR_CACHE_ENTRIES
-};
-
 /* Async-atomic event notifier used by mlx5 core to forward FW
  * evetns recived from event queue to mlx5 consumers.
  * Optimise event queue dipatching.
@@ -1148,15 +1157,6 @@ int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
 			    struct ib_device *device,
 			    struct rdma_netdev_alloc_params *params);
 
-struct mlx5_profile {
-	u64	mask;
-	u8	log_max_qp;
-	struct {
-		int	size;
-		int	limit;
-	} mr_cache[MAX_MR_CACHE_ENTRIES];
-};
-
 enum {
 	MLX5_PCI_DEV_IS_VF		= 1 << 0,
 };
-- 
cgit v1.2.3


From 7c9f131f366ab414691907fa0407124ea2b2f3bc Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Thu, 22 Apr 2021 15:48:10 +0300
Subject: {net,vdpa}/mlx5: Configure interface MAC into mpfs L2 table

net/mlx5: Expose MPFS configuration API

MPFS is the multi physical function switch that bridges traffic between
the physical port and any physical functions associated with it. The
driver is required to add or remove MAC entries to properly forward
incoming traffic to the correct physical function.

We export the API to control MPFS so that other drivers, such as
mlx5_vdpa are able to add MAC addresses of their network interfaces.

The MAC address of the vdpa interface must be configured into the MPFS L2
address. Failing to do so could cause, in some NIC configurations, failure
to forward packets to the vdpa network device instance.

Fix this by adding calls to update the MPFS table.

CC: <mst@redhat.com>
CC: <jasowang@redhat.com>
CC: <virtualization@lists.linux-foundation.org>
Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen <elic@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c |  3 +++
 drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h |  5 +----
 drivers/vdpa/mlx5/net/mlx5_vnet.c                  | 19 ++++++++++++++++++-
 include/linux/mlx5/mpfs.h                          | 18 ++++++++++++++++++
 6 files changed, 42 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/mlx5/mpfs.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 0d571a0c76d9..0b75fab41ae8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -35,6 +35,7 @@
 #include <linux/ipv6.h>
 #include <linux/tcp.h>
 #include <linux/mlx5/fs.h>
+#include <linux/mlx5/mpfs.h>
 #include "en.h"
 #include "en_rep.h"
 #include "lib/mpfs.h"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 570f2280823c..b88705a3a1a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -35,6 +35,7 @@
 #include <linux/mlx5/mlx5_ifc.h>
 #include <linux/mlx5/vport.h>
 #include <linux/mlx5/fs.h>
+#include <linux/mlx5/mpfs.h>
 #include "esw/acl/lgcy.h"
 #include "esw/legacy.h"
 #include "mlx5_core.h"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
index fd8449ff9e17..839a01da110f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
@@ -33,6 +33,7 @@
 #include <linux/etherdevice.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/mlx5_ifc.h>
+#include <linux/mlx5/mpfs.h>
 #include <linux/mlx5/eswitch.h>
 #include "mlx5_core.h"
 #include "lib/mpfs.h"
@@ -175,6 +176,7 @@ out:
 	mutex_unlock(&mpfs->lock);
 	return err;
 }
+EXPORT_SYMBOL(mlx5_mpfs_add_mac);
 
 int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac)
 {
@@ -206,3 +208,4 @@ unlock:
 	mutex_unlock(&mpfs->lock);
 	return err;
 }
+EXPORT_SYMBOL(mlx5_mpfs_del_mac);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h
index 4a7b2c3203a7..4a293542a7aa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h
@@ -84,12 +84,9 @@ struct l2addr_node {
 #ifdef CONFIG_MLX5_MPFS
 int  mlx5_mpfs_init(struct mlx5_core_dev *dev);
 void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev);
-int  mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac);
-int  mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac);
 #else /* #ifndef CONFIG_MLX5_MPFS */
 static inline int  mlx5_mpfs_init(struct mlx5_core_dev *dev) { return 0; }
 static inline void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) {}
-static inline int  mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; }
-static inline int  mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; }
 #endif
+
 #endif
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 189e4385df40..dda5dc6f7737 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -15,6 +15,7 @@
 #include <linux/mlx5/vport.h>
 #include <linux/mlx5/fs.h>
 #include <linux/mlx5/mlx5_ifc_vdpa.h>
+#include <linux/mlx5/mpfs.h>
 #include "mlx5_vdpa.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
@@ -1859,11 +1860,16 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb
 static void mlx5_vdpa_free(struct vdpa_device *vdev)
 {
 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_core_dev *pfmdev;
 	struct mlx5_vdpa_net *ndev;
 
 	ndev = to_mlx5_vdpa_ndev(mvdev);
 
 	free_resources(ndev);
+	if (!is_zero_ether_addr(ndev->config.mac)) {
+		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
+		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
+	}
 	mlx5_vdpa_free_resources(&ndev->mvdev);
 	mutex_destroy(&ndev->reslock);
 }
@@ -1990,6 +1996,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 {
 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
 	struct virtio_net_config *config;
+	struct mlx5_core_dev *pfmdev;
 	struct mlx5_vdpa_dev *mvdev;
 	struct mlx5_vdpa_net *ndev;
 	struct mlx5_core_dev *mdev;
@@ -2023,10 +2030,17 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 	if (err)
 		goto err_mtu;
 
+	if (!is_zero_ether_addr(config->mac)) {
+		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
+		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
+		if (err)
+			goto err_mtu;
+	}
+
 	mvdev->vdev.dma_dev = mdev->device;
 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
 	if (err)
-		goto err_mtu;
+		goto err_mpfs;
 
 	err = alloc_resources(ndev);
 	if (err)
@@ -2044,6 +2058,9 @@ err_reg:
 	free_resources(ndev);
 err_res:
 	mlx5_vdpa_free_resources(&ndev->mvdev);
+err_mpfs:
+	if (!is_zero_ether_addr(config->mac))
+		mlx5_mpfs_del_mac(pfmdev, config->mac);
 err_mtu:
 	mutex_destroy(&ndev->reslock);
 	put_device(&mvdev->vdev.dev);
diff --git a/include/linux/mlx5/mpfs.h b/include/linux/mlx5/mpfs.h
new file mode 100644
index 000000000000..bf700c8d5516
--- /dev/null
+++ b/include/linux/mlx5/mpfs.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+ * Copyright (c) 2021 Mellanox Technologies Ltd.
+ */
+
+#ifndef _MLX5_MPFS_
+#define _MLX5_MPFS_
+
+struct mlx5_core_dev;
+
+#ifdef CONFIG_MLX5_MPFS
+int  mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac);
+int  mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac);
+#else /* #ifndef CONFIG_MLX5_MPFS */
+static inline int  mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; }
+static inline int  mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; }
+#endif
+
+#endif
-- 
cgit v1.2.3


From 94cc7aeaf6c0cff0b8aeb7cb3579cee46b923560 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:16 +0200
Subject: USB: serial: make usb_serial_driver::write_room return uint

Line disciplines expect a positive value or zero returned from
tty->ops->write_room (invoked by tty_write_room). Both of them are being
updated to return an unsigned int. Switch also
usb_serial_driver::write_room and all its users.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
[ johan: amend commit message, drop unrelated comment change ]
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/cyberjack.c        | 4 ++--
 drivers/usb/serial/cypress_m8.c       | 8 ++++----
 drivers/usb/serial/digi_acceleport.c  | 8 ++++----
 drivers/usb/serial/garmin_gps.c       | 2 +-
 drivers/usb/serial/generic.c          | 6 +++---
 drivers/usb/serial/io_edgeport.c      | 6 +++---
 drivers/usb/serial/io_ti.c            | 6 +++---
 drivers/usb/serial/ir-usb.c           | 6 +++---
 drivers/usb/serial/keyspan.c          | 4 ++--
 drivers/usb/serial/kobil_sct.c        | 4 ++--
 drivers/usb/serial/mos7720.c          | 6 +++---
 drivers/usb/serial/mos7840.c          | 6 +++---
 drivers/usb/serial/opticon.c          | 2 +-
 drivers/usb/serial/oti6858.c          | 6 +++---
 drivers/usb/serial/quatech2.c         | 4 ++--
 drivers/usb/serial/sierra.c           | 2 +-
 drivers/usb/serial/ti_usb_3410_5052.c | 8 ++++----
 drivers/usb/serial/usb-wwan.h         | 2 +-
 drivers/usb/serial/usb_wwan.c         | 6 +++---
 include/linux/usb/serial.h            | 4 ++--
 20 files changed, 50 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c
index cf389224d528..51e5aac3bf4c 100644
--- a/drivers/usb/serial/cyberjack.c
+++ b/drivers/usb/serial/cyberjack.c
@@ -53,7 +53,7 @@ static int  cyberjack_open(struct tty_struct *tty,
 static void cyberjack_close(struct usb_serial_port *port);
 static int cyberjack_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
-static int cyberjack_write_room(struct tty_struct *tty);
+static unsigned int cyberjack_write_room(struct tty_struct *tty);
 static void cyberjack_read_int_callback(struct urb *urb);
 static void cyberjack_read_bulk_callback(struct urb *urb);
 static void cyberjack_write_bulk_callback(struct urb *urb);
@@ -240,7 +240,7 @@ static int cyberjack_write(struct tty_struct *tty,
 	return count;
 }
 
-static int cyberjack_write_room(struct tty_struct *tty)
+static unsigned int cyberjack_write_room(struct tty_struct *tty)
 {
 	/* FIXME: .... */
 	return CYBERJACK_LOCAL_BUF_SIZE;
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index 166ee2286fda..5e7d2f9fa0c2 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -122,7 +122,7 @@ static void cypress_dtr_rts(struct usb_serial_port *port, int on);
 static int  cypress_write(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 static void cypress_send(struct usb_serial_port *port);
-static int  cypress_write_room(struct tty_struct *tty);
+static unsigned int cypress_write_room(struct tty_struct *tty);
 static void cypress_earthmate_init_termios(struct tty_struct *tty);
 static void cypress_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
@@ -789,18 +789,18 @@ send:
 
 
 /* returns how much space is available in the soft buffer */
-static int cypress_write_room(struct tty_struct *tty)
+static unsigned int cypress_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct cypress_private *priv = usb_get_serial_port_data(port);
-	int room = 0;
+	unsigned int room;
 	unsigned long flags;
 
 	spin_lock_irqsave(&priv->lock, flags);
 	room = kfifo_avail(&priv->write_fifo);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, room);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, room);
 	return room;
 }
 
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 8b2f06539f2c..5deb900450ee 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -223,7 +223,7 @@ static int digi_tiocmset(struct tty_struct *tty, unsigned int set,
 static int digi_write(struct tty_struct *tty, struct usb_serial_port *port,
 		const unsigned char *buf, int count);
 static void digi_write_bulk_callback(struct urb *urb);
-static int digi_write_room(struct tty_struct *tty);
+static unsigned int digi_write_room(struct tty_struct *tty);
 static int digi_chars_in_buffer(struct tty_struct *tty);
 static int digi_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void digi_close(struct usb_serial_port *port);
@@ -1020,11 +1020,11 @@ static void digi_write_bulk_callback(struct urb *urb)
 		tty_port_tty_wakeup(&port->port);
 }
 
-static int digi_write_room(struct tty_struct *tty)
+static unsigned int digi_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct digi_port *priv = usb_get_serial_port_data(port);
-	int room;
+	unsigned int room;
 	unsigned long flags = 0;
 
 	spin_lock_irqsave(&priv->dp_port_lock, flags);
@@ -1035,7 +1035,7 @@ static int digi_write_room(struct tty_struct *tty)
 		room = port->bulk_out_size - 2 - priv->dp_out_buf_len;
 
 	spin_unlock_irqrestore(&priv->dp_port_lock, flags);
-	dev_dbg(&port->dev, "digi_write_room: port=%d, room=%d\n", priv->dp_port_num, room);
+	dev_dbg(&port->dev, "digi_write_room: port=%d, room=%u\n", priv->dp_port_num, room);
 	return room;
 
 }
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index 50e8bdc77e71..756d1ac7e96f 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -1113,7 +1113,7 @@ static int garmin_write(struct tty_struct *tty, struct usb_serial_port *port,
 }
 
 
-static int garmin_write_room(struct tty_struct *tty)
+static unsigned int garmin_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	/*
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index d10aa3d2ee49..bc3cf66af0de 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -230,11 +230,11 @@ int usb_serial_generic_write(struct tty_struct *tty,
 }
 EXPORT_SYMBOL_GPL(usb_serial_generic_write);
 
-int usb_serial_generic_write_room(struct tty_struct *tty)
+unsigned int usb_serial_generic_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	unsigned long flags;
-	int room;
+	unsigned int room;
 
 	if (!port->bulk_out_size)
 		return 0;
@@ -243,7 +243,7 @@ int usb_serial_generic_write_room(struct tty_struct *tty)
 	room = kfifo_avail(&port->write_fifo);
 	spin_unlock_irqrestore(&port->lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, room);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, room);
 	return room;
 }
 
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index e6fe3882bf69..f6cedc87d3e4 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -1355,11 +1355,11 @@ exit_send:
  *	we return the amount of room that we have for this port	(the txCredits)
  *	otherwise we return a negative error number.
  *****************************************************************************/
-static int edge_write_room(struct tty_struct *tty)
+static unsigned int edge_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct edgeport_port *edge_port = usb_get_serial_port_data(port);
-	int room;
+	unsigned int room;
 	unsigned long flags;
 
 	if (edge_port == NULL)
@@ -1377,7 +1377,7 @@ static int edge_write_room(struct tty_struct *tty)
 	room = edge_port->txCredits - edge_port->txfifo.count;
 	spin_unlock_irqrestore(&edge_port->ep_lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, room);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, room);
 	return room;
 }
 
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 39503fdccebf..94c82c33e629 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -2067,11 +2067,11 @@ static void edge_send(struct usb_serial_port *port, struct tty_struct *tty)
 		tty_wakeup(tty);
 }
 
-static int edge_write_room(struct tty_struct *tty)
+static unsigned int edge_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct edgeport_port *edge_port = usb_get_serial_port_data(port);
-	int room = 0;
+	unsigned int room;
 	unsigned long flags;
 
 	if (edge_port == NULL)
@@ -2083,7 +2083,7 @@ static int edge_write_room(struct tty_struct *tty)
 	room = kfifo_avail(&port->write_fifo);
 	spin_unlock_irqrestore(&edge_port->ep_lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, room);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, room);
 	return room;
 }
 
diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c
index 172261a908d8..7b44dbea95cd 100644
--- a/drivers/usb/serial/ir-usb.c
+++ b/drivers/usb/serial/ir-usb.c
@@ -47,7 +47,7 @@ static int xbof = -1;
 static int  ir_startup (struct usb_serial *serial);
 static int ir_write(struct tty_struct *tty, struct usb_serial_port *port,
 		const unsigned char *buf, int count);
-static int ir_write_room(struct tty_struct *tty);
+static unsigned int ir_write_room(struct tty_struct *tty);
 static void ir_write_bulk_callback(struct urb *urb);
 static void ir_process_read_urb(struct urb *urb);
 static void ir_set_termios(struct tty_struct *tty,
@@ -339,10 +339,10 @@ static void ir_write_bulk_callback(struct urb *urb)
 	usb_serial_port_softint(port);
 }
 
-static int ir_write_room(struct tty_struct *tty)
+static unsigned int ir_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
-	int count = 0;
+	unsigned int count = 0;
 
 	if (port->bulk_out_size == 0)
 		return 0;
diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c
index b04a029e3657..87b89c99d517 100644
--- a/drivers/usb/serial/keyspan.c
+++ b/drivers/usb/serial/keyspan.c
@@ -1453,13 +1453,13 @@ static void usa67_glocont_callback(struct urb *urb)
 	}
 }
 
-static int keyspan_write_room(struct tty_struct *tty)
+static unsigned int keyspan_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct keyspan_port_private	*p_priv;
 	const struct keyspan_device_details	*d_details;
 	int				flip;
-	int				data_len;
+	unsigned int			data_len;
 	struct urb			*this_urb;
 
 	p_priv = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index a9bc546626ab..4ed8b8b0a361 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -53,7 +53,7 @@ static int  kobil_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void kobil_close(struct usb_serial_port *port);
 static int  kobil_write(struct tty_struct *tty, struct usb_serial_port *port,
 			 const unsigned char *buf, int count);
-static int  kobil_write_room(struct tty_struct *tty);
+static unsigned int kobil_write_room(struct tty_struct *tty);
 static int  kobil_ioctl(struct tty_struct *tty,
 			unsigned int cmd, unsigned long arg);
 static int  kobil_tiocmget(struct tty_struct *tty);
@@ -358,7 +358,7 @@ static int kobil_write(struct tty_struct *tty, struct usb_serial_port *port,
 }
 
 
-static int kobil_write_room(struct tty_struct *tty)
+static unsigned int kobil_write_room(struct tty_struct *tty)
 {
 	/* FIXME */
 	return 8;
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 6ee83886e2c9..d9cc7f840d48 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -1033,11 +1033,11 @@ static void mos7720_break(struct tty_struct *tty, int break_state)
  *	If successful, we return the amount of room that we have for this port
  *	Otherwise we return a negative error number.
  */
-static int mos7720_write_room(struct tty_struct *tty)
+static unsigned int mos7720_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct moschip_port *mos7720_port;
-	int room = 0;
+	unsigned int room = 0;
 	int i;
 
 	mos7720_port = usb_get_serial_port_data(port);
@@ -1051,7 +1051,7 @@ static int mos7720_write_room(struct tty_struct *tty)
 			room += URB_TRANSFER_BUFFER_SIZE;
 	}
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, room);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, room);
 	return room;
 }
 
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 28e4093794e0..609799aaba72 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -818,12 +818,12 @@ static void mos7840_break(struct tty_struct *tty, int break_state)
  *	Otherwise we return a negative error number.
  *****************************************************************************/
 
-static int mos7840_write_room(struct tty_struct *tty)
+static unsigned int mos7840_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct moschip_port *mos7840_port = usb_get_serial_port_data(port);
 	int i;
-	int room = 0;
+	unsigned int room = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&mos7840_port->pool_lock, flags);
@@ -834,7 +834,7 @@ static int mos7840_write_room(struct tty_struct *tty)
 	spin_unlock_irqrestore(&mos7840_port->pool_lock, flags);
 
 	room = (room == 0) ? 0 : room - URB_TRANSFER_BUFFER_SIZE + 1;
-	dev_dbg(&mos7840_port->port->dev, "%s - returns %d\n", __func__, room);
+	dev_dbg(&mos7840_port->port->dev, "%s - returns %u\n", __func__, room);
 	return room;
 
 }
diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c
index 40c713fae0c3..37b51947bd0b 100644
--- a/drivers/usb/serial/opticon.c
+++ b/drivers/usb/serial/opticon.c
@@ -267,7 +267,7 @@ error_no_buffer:
 	return ret;
 }
 
-static int opticon_write_room(struct tty_struct *tty)
+static unsigned int opticon_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct opticon_private *priv = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c
index 65cd0341fa78..4ab9f335dd0e 100644
--- a/drivers/usb/serial/oti6858.c
+++ b/drivers/usb/serial/oti6858.c
@@ -126,7 +126,7 @@ static void oti6858_read_bulk_callback(struct urb *urb);
 static void oti6858_write_bulk_callback(struct urb *urb);
 static int oti6858_write(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
-static int oti6858_write_room(struct tty_struct *tty);
+static unsigned int oti6858_write_room(struct tty_struct *tty);
 static int oti6858_chars_in_buffer(struct tty_struct *tty);
 static int oti6858_tiocmget(struct tty_struct *tty);
 static int oti6858_tiocmset(struct tty_struct *tty,
@@ -363,10 +363,10 @@ static int oti6858_write(struct tty_struct *tty, struct usb_serial_port *port,
 	return count;
 }
 
-static int oti6858_write_room(struct tty_struct *tty)
+static unsigned int oti6858_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
-	int room = 0;
+	unsigned int room;
 	unsigned long flags;
 
 	spin_lock_irqsave(&port->lock, flags);
diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c
index 5f2e7f668e68..3b5f2032ecdb 100644
--- a/drivers/usb/serial/quatech2.c
+++ b/drivers/usb/serial/quatech2.c
@@ -870,12 +870,12 @@ static void qt2_update_lsr(struct usb_serial_port *port, unsigned char *ch)
 
 }
 
-static int qt2_write_room(struct tty_struct *tty)
+static unsigned int qt2_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct qt2_port_private *port_priv;
 	unsigned long flags = 0;
-	int r;
+	unsigned int r;
 
 	port_priv = usb_get_serial_port_data(port);
 
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 54e16ffc30a0..753cee5d17a1 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -613,7 +613,7 @@ static void sierra_instat_callback(struct urb *urb)
 	}
 }
 
-static int sierra_write_room(struct tty_struct *tty)
+static unsigned int sierra_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct sierra_port_private *portdata = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index caa46ac23db9..2c543c175296 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -307,7 +307,7 @@ static int ti_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void ti_close(struct usb_serial_port *port);
 static int ti_write(struct tty_struct *tty, struct usb_serial_port *port,
 		const unsigned char *data, int count);
-static int ti_write_room(struct tty_struct *tty);
+static unsigned int ti_write_room(struct tty_struct *tty);
 static int ti_chars_in_buffer(struct tty_struct *tty);
 static bool ti_tx_empty(struct usb_serial_port *port);
 static void ti_throttle(struct tty_struct *tty);
@@ -810,18 +810,18 @@ static int ti_write(struct tty_struct *tty, struct usb_serial_port *port,
 }
 
 
-static int ti_write_room(struct tty_struct *tty)
+static unsigned int ti_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct ti_port *tport = usb_get_serial_port_data(port);
-	int room = 0;
+	unsigned int room;
 	unsigned long flags;
 
 	spin_lock_irqsave(&tport->tp_lock, flags);
 	room = kfifo_avail(&port->write_fifo);
 	spin_unlock_irqrestore(&tport->tp_lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, room);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, room);
 	return room;
 }
 
diff --git a/drivers/usb/serial/usb-wwan.h b/drivers/usb/serial/usb-wwan.h
index b5331d03092f..6c7c9f3309b0 100644
--- a/drivers/usb/serial/usb-wwan.h
+++ b/drivers/usb/serial/usb-wwan.h
@@ -11,7 +11,7 @@ extern int usb_wwan_open(struct tty_struct *tty, struct usb_serial_port *port);
 extern void usb_wwan_close(struct usb_serial_port *port);
 extern int usb_wwan_port_probe(struct usb_serial_port *port);
 extern void usb_wwan_port_remove(struct usb_serial_port *port);
-extern int usb_wwan_write_room(struct tty_struct *tty);
+extern unsigned int usb_wwan_write_room(struct tty_struct *tty);
 extern int usb_wwan_tiocmget(struct tty_struct *tty);
 extern int usb_wwan_tiocmset(struct tty_struct *tty,
 			     unsigned int set, unsigned int clear);
diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c
index 3eb72c59ede6..06212f08d9f9 100644
--- a/drivers/usb/serial/usb_wwan.c
+++ b/drivers/usb/serial/usb_wwan.c
@@ -278,12 +278,12 @@ static void usb_wwan_outdat_callback(struct urb *urb)
 	}
 }
 
-int usb_wwan_write_room(struct tty_struct *tty)
+unsigned int usb_wwan_write_room(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct usb_wwan_port_private *portdata;
 	int i;
-	int data_len = 0;
+	unsigned int data_len = 0;
 	struct urb *this_urb;
 
 	portdata = usb_get_serial_port_data(port);
@@ -294,7 +294,7 @@ int usb_wwan_write_room(struct tty_struct *tty)
 			data_len += OUT_BUFLEN;
 	}
 
-	dev_dbg(&port->dev, "%s: %d\n", __func__, data_len);
+	dev_dbg(&port->dev, "%s: %u\n", __func__, data_len);
 	return data_len;
 }
 EXPORT_SYMBOL(usb_wwan_write_room);
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 8c63fa9bfc74..6472d1f7b028 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -276,7 +276,7 @@ struct usb_serial_driver {
 	int  (*write)(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 	/* Called only by the tty layer */
-	int  (*write_room)(struct tty_struct *tty);
+	unsigned int (*write_room)(struct tty_struct *tty);
 	int  (*ioctl)(struct tty_struct *tty,
 		      unsigned int cmd, unsigned long arg);
 	void (*get_serial)(struct tty_struct *tty, struct serial_struct *ss);
@@ -347,7 +347,7 @@ int usb_serial_generic_write(struct tty_struct *tty, struct usb_serial_port *por
 		const unsigned char *buf, int count);
 void usb_serial_generic_close(struct usb_serial_port *port);
 int usb_serial_generic_resume(struct usb_serial *serial);
-int usb_serial_generic_write_room(struct tty_struct *tty);
+unsigned int usb_serial_generic_write_room(struct tty_struct *tty);
 int usb_serial_generic_chars_in_buffer(struct tty_struct *tty);
 void usb_serial_generic_wait_until_sent(struct tty_struct *tty, long timeout);
 void usb_serial_generic_read_bulk_callback(struct urb *urb);
-- 
cgit v1.2.3


From 155591d3ceeec2cd6a50b40278e2014c45f6b5f6 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 5 May 2021 11:19:20 +0200
Subject: USB: serial: make usb_serial_driver::chars_in_buffer return uint

tty_operations::chars_in_buffer is being switched to return uint. Do the
same for usb_serial_driver's chars_in_buffer.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
[ johan: amend commit message ]
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/cypress_m8.c       | 8 ++++----
 drivers/usb/serial/digi_acceleport.c  | 4 ++--
 drivers/usb/serial/generic.c          | 6 +++---
 drivers/usb/serial/io_edgeport.c      | 6 +++---
 drivers/usb/serial/io_ti.c            | 6 +++---
 drivers/usb/serial/mos7720.c          | 6 +++---
 drivers/usb/serial/mos7840.c          | 6 +++---
 drivers/usb/serial/opticon.c          | 4 ++--
 drivers/usb/serial/oti6858.c          | 6 +++---
 drivers/usb/serial/sierra.c           | 6 +++---
 drivers/usb/serial/ti_usb_3410_5052.c | 8 ++++----
 drivers/usb/serial/usb-wwan.h         | 2 +-
 drivers/usb/serial/usb_wwan.c         | 6 +++---
 include/linux/usb/serial.h            | 4 ++--
 14 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index 5e7d2f9fa0c2..1dca04e1519d 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -129,7 +129,7 @@ static void cypress_set_termios(struct tty_struct *tty,
 static int  cypress_tiocmget(struct tty_struct *tty);
 static int  cypress_tiocmset(struct tty_struct *tty,
 			unsigned int set, unsigned int clear);
-static int  cypress_chars_in_buffer(struct tty_struct *tty);
+static unsigned int cypress_chars_in_buffer(struct tty_struct *tty);
 static void cypress_throttle(struct tty_struct *tty);
 static void cypress_unthrottle(struct tty_struct *tty);
 static void cypress_set_dead(struct usb_serial_port *port);
@@ -970,18 +970,18 @@ static void cypress_set_termios(struct tty_struct *tty,
 
 
 /* returns amount of data still left in soft buffer */
-static int cypress_chars_in_buffer(struct tty_struct *tty)
+static unsigned int cypress_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct cypress_private *priv = usb_get_serial_port_data(port);
-	int chars = 0;
+	unsigned int chars;
 	unsigned long flags;
 
 	spin_lock_irqsave(&priv->lock, flags);
 	chars = kfifo_len(&priv->write_fifo);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, chars);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars);
 	return chars;
 }
 
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 5deb900450ee..19ee8191647c 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -224,7 +224,7 @@ static int digi_write(struct tty_struct *tty, struct usb_serial_port *port,
 		const unsigned char *buf, int count);
 static void digi_write_bulk_callback(struct urb *urb);
 static unsigned int digi_write_room(struct tty_struct *tty);
-static int digi_chars_in_buffer(struct tty_struct *tty);
+static unsigned int digi_chars_in_buffer(struct tty_struct *tty);
 static int digi_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void digi_close(struct usb_serial_port *port);
 static void digi_dtr_rts(struct usb_serial_port *port, int on);
@@ -1040,7 +1040,7 @@ static unsigned int digi_write_room(struct tty_struct *tty)
 
 }
 
-static int digi_chars_in_buffer(struct tty_struct *tty)
+static unsigned int digi_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct digi_port *priv = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index bc3cf66af0de..15b6dee3a8e5 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -247,11 +247,11 @@ unsigned int usb_serial_generic_write_room(struct tty_struct *tty)
 	return room;
 }
 
-int usb_serial_generic_chars_in_buffer(struct tty_struct *tty)
+unsigned int usb_serial_generic_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	unsigned long flags;
-	int chars;
+	unsigned int chars;
 
 	if (!port->bulk_out_size)
 		return 0;
@@ -260,7 +260,7 @@ int usb_serial_generic_chars_in_buffer(struct tty_struct *tty)
 	chars = kfifo_len(&port->write_fifo) + port->tx_bytes;
 	spin_unlock_irqrestore(&port->lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, chars);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars);
 	return chars;
 }
 EXPORT_SYMBOL_GPL(usb_serial_generic_chars_in_buffer);
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index f6cedc87d3e4..1f8dff4390c7 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -1391,11 +1391,11 @@ static unsigned int edge_write_room(struct tty_struct *tty)
  *	system,
  *	Otherwise we return a negative error number.
  *****************************************************************************/
-static int edge_chars_in_buffer(struct tty_struct *tty)
+static unsigned int edge_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct edgeport_port *edge_port = usb_get_serial_port_data(port);
-	int num_chars;
+	unsigned int num_chars;
 	unsigned long flags;
 
 	if (edge_port == NULL)
@@ -1413,7 +1413,7 @@ static int edge_chars_in_buffer(struct tty_struct *tty)
 						edge_port->txfifo.count;
 	spin_unlock_irqrestore(&edge_port->ep_lock, flags);
 	if (num_chars) {
-		dev_dbg(&port->dev, "%s - returns %d\n", __func__, num_chars);
+		dev_dbg(&port->dev, "%s - returns %u\n", __func__, num_chars);
 	}
 
 	return num_chars;
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 94c82c33e629..84b848d2794e 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -2087,11 +2087,11 @@ static unsigned int edge_write_room(struct tty_struct *tty)
 	return room;
 }
 
-static int edge_chars_in_buffer(struct tty_struct *tty)
+static unsigned int edge_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct edgeport_port *edge_port = usb_get_serial_port_data(port);
-	int chars = 0;
+	unsigned int chars;
 	unsigned long flags;
 	if (edge_port == NULL)
 		return 0;
@@ -2100,7 +2100,7 @@ static int edge_chars_in_buffer(struct tty_struct *tty)
 	chars = kfifo_len(&port->write_fifo);
 	spin_unlock_irqrestore(&edge_port->ep_lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, chars);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars);
 	return chars;
 }
 
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index d9cc7f840d48..ce41009756f3 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -949,11 +949,11 @@ static int mos7720_open(struct tty_struct *tty, struct usb_serial_port *port)
  *	system,
  *	Otherwise we return a negative error number.
  */
-static int mos7720_chars_in_buffer(struct tty_struct *tty)
+static unsigned int mos7720_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	int i;
-	int chars = 0;
+	unsigned int chars = 0;
 	struct moschip_port *mos7720_port;
 
 	mos7720_port = usb_get_serial_port_data(port);
@@ -965,7 +965,7 @@ static int mos7720_chars_in_buffer(struct tty_struct *tty)
 		    mos7720_port->write_urb_pool[i]->status == -EINPROGRESS)
 			chars += URB_TRANSFER_BUFFER_SIZE;
 	}
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, chars);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars);
 	return chars;
 }
 
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 609799aaba72..b22ccbd98998 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -735,12 +735,12 @@ err:
  *	Otherwise we return zero.
  *****************************************************************************/
 
-static int mos7840_chars_in_buffer(struct tty_struct *tty)
+static unsigned int mos7840_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct moschip_port *mos7840_port = usb_get_serial_port_data(port);
 	int i;
-	int chars = 0;
+	unsigned int chars = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&mos7840_port->pool_lock, flags);
@@ -751,7 +751,7 @@ static int mos7840_chars_in_buffer(struct tty_struct *tty)
 		}
 	}
 	spin_unlock_irqrestore(&mos7840_port->pool_lock, flags);
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, chars);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars);
 	return chars;
 
 }
diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c
index 37b51947bd0b..aed28c35caff 100644
--- a/drivers/usb/serial/opticon.c
+++ b/drivers/usb/serial/opticon.c
@@ -289,12 +289,12 @@ static unsigned int opticon_write_room(struct tty_struct *tty)
 	return 2048;
 }
 
-static int opticon_chars_in_buffer(struct tty_struct *tty)
+static unsigned int opticon_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct opticon_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	int count;
+	unsigned int count;
 
 	spin_lock_irqsave(&priv->lock, flags);
 	count = priv->outstanding_bytes;
diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c
index 4ab9f335dd0e..a5caedbe72e2 100644
--- a/drivers/usb/serial/oti6858.c
+++ b/drivers/usb/serial/oti6858.c
@@ -127,7 +127,7 @@ static void oti6858_write_bulk_callback(struct urb *urb);
 static int oti6858_write(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 static unsigned int oti6858_write_room(struct tty_struct *tty);
-static int oti6858_chars_in_buffer(struct tty_struct *tty);
+static unsigned int oti6858_chars_in_buffer(struct tty_struct *tty);
 static int oti6858_tiocmget(struct tty_struct *tty);
 static int oti6858_tiocmset(struct tty_struct *tty,
 				unsigned int set, unsigned int clear);
@@ -376,10 +376,10 @@ static unsigned int oti6858_write_room(struct tty_struct *tty)
 	return room;
 }
 
-static int oti6858_chars_in_buffer(struct tty_struct *tty)
+static unsigned int oti6858_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
-	int chars = 0;
+	unsigned int chars;
 	unsigned long flags;
 
 	spin_lock_irqsave(&port->lock, flags);
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 753cee5d17a1..4b49b7c33364 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -632,19 +632,19 @@ static unsigned int sierra_write_room(struct tty_struct *tty)
 	return 2048;
 }
 
-static int sierra_chars_in_buffer(struct tty_struct *tty)
+static unsigned int sierra_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct sierra_port_private *portdata = usb_get_serial_port_data(port);
 	unsigned long flags;
-	int chars;
+	unsigned int chars;
 
 	/* NOTE: This overcounts somewhat. */
 	spin_lock_irqsave(&portdata->lock, flags);
 	chars = portdata->outstanding_urbs * MAX_TRANSFER;
 	spin_unlock_irqrestore(&portdata->lock, flags);
 
-	dev_dbg(&port->dev, "%s - %d\n", __func__, chars);
+	dev_dbg(&port->dev, "%s - %u\n", __func__, chars);
 
 	return chars;
 }
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index 2c543c175296..34f8ed224abe 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -308,7 +308,7 @@ static void ti_close(struct usb_serial_port *port);
 static int ti_write(struct tty_struct *tty, struct usb_serial_port *port,
 		const unsigned char *data, int count);
 static unsigned int ti_write_room(struct tty_struct *tty);
-static int ti_chars_in_buffer(struct tty_struct *tty);
+static unsigned int ti_chars_in_buffer(struct tty_struct *tty);
 static bool ti_tx_empty(struct usb_serial_port *port);
 static void ti_throttle(struct tty_struct *tty);
 static void ti_unthrottle(struct tty_struct *tty);
@@ -826,18 +826,18 @@ static unsigned int ti_write_room(struct tty_struct *tty)
 }
 
 
-static int ti_chars_in_buffer(struct tty_struct *tty)
+static unsigned int ti_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct ti_port *tport = usb_get_serial_port_data(port);
-	int chars = 0;
+	unsigned int chars;
 	unsigned long flags;
 
 	spin_lock_irqsave(&tport->tp_lock, flags);
 	chars = kfifo_len(&port->write_fifo);
 	spin_unlock_irqrestore(&tport->tp_lock, flags);
 
-	dev_dbg(&port->dev, "%s - returns %d\n", __func__, chars);
+	dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars);
 	return chars;
 }
 
diff --git a/drivers/usb/serial/usb-wwan.h b/drivers/usb/serial/usb-wwan.h
index 6c7c9f3309b0..519101945769 100644
--- a/drivers/usb/serial/usb-wwan.h
+++ b/drivers/usb/serial/usb-wwan.h
@@ -17,7 +17,7 @@ extern int usb_wwan_tiocmset(struct tty_struct *tty,
 			     unsigned int set, unsigned int clear);
 extern int usb_wwan_write(struct tty_struct *tty, struct usb_serial_port *port,
 			  const unsigned char *buf, int count);
-extern int usb_wwan_chars_in_buffer(struct tty_struct *tty);
+extern unsigned int usb_wwan_chars_in_buffer(struct tty_struct *tty);
 #ifdef CONFIG_PM
 extern int usb_wwan_suspend(struct usb_serial *serial, pm_message_t message);
 extern int usb_wwan_resume(struct usb_serial *serial);
diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c
index 06212f08d9f9..cb01283d4d15 100644
--- a/drivers/usb/serial/usb_wwan.c
+++ b/drivers/usb/serial/usb_wwan.c
@@ -299,12 +299,12 @@ unsigned int usb_wwan_write_room(struct tty_struct *tty)
 }
 EXPORT_SYMBOL(usb_wwan_write_room);
 
-int usb_wwan_chars_in_buffer(struct tty_struct *tty)
+unsigned int usb_wwan_chars_in_buffer(struct tty_struct *tty)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct usb_wwan_port_private *portdata;
 	int i;
-	int data_len = 0;
+	unsigned int data_len = 0;
 	struct urb *this_urb;
 
 	portdata = usb_get_serial_port_data(port);
@@ -316,7 +316,7 @@ int usb_wwan_chars_in_buffer(struct tty_struct *tty)
 		if (this_urb && test_bit(i, &portdata->out_busy))
 			data_len += this_urb->transfer_buffer_length;
 	}
-	dev_dbg(&port->dev, "%s: %d\n", __func__, data_len);
+	dev_dbg(&port->dev, "%s: %u\n", __func__, data_len);
 	return data_len;
 }
 EXPORT_SYMBOL(usb_wwan_chars_in_buffer);
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 6472d1f7b028..95c729446e27 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -284,7 +284,7 @@ struct usb_serial_driver {
 	void (*set_termios)(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
 	void (*break_ctl)(struct tty_struct *tty, int break_state);
-	int  (*chars_in_buffer)(struct tty_struct *tty);
+	unsigned int (*chars_in_buffer)(struct tty_struct *tty);
 	void (*wait_until_sent)(struct tty_struct *tty, long timeout);
 	bool (*tx_empty)(struct usb_serial_port *port);
 	void (*throttle)(struct tty_struct *tty);
@@ -348,7 +348,7 @@ int usb_serial_generic_write(struct tty_struct *tty, struct usb_serial_port *por
 void usb_serial_generic_close(struct usb_serial_port *port);
 int usb_serial_generic_resume(struct usb_serial *serial);
 unsigned int usb_serial_generic_write_room(struct tty_struct *tty);
-int usb_serial_generic_chars_in_buffer(struct tty_struct *tty);
+unsigned int usb_serial_generic_chars_in_buffer(struct tty_struct *tty);
 void usb_serial_generic_wait_until_sent(struct tty_struct *tty, long timeout);
 void usb_serial_generic_read_bulk_callback(struct urb *urb);
 void usb_serial_generic_write_bulk_callback(struct urb *urb);
-- 
cgit v1.2.3


From 4d80d6ca5d77fde9880da8466e5b64f250e5bf82 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 18 May 2021 11:17:26 +0200
Subject: genirq: Export affinity setter for modules

Perf modules abuse irq_set_affinity_hint() to set the affinity of system
PMU interrupts just because irq_set_affinity() was not exported.

The fact that irq_set_affinity_hint() actually sets the affinity is a
non-documented side effect and the name is clearly saying it's a hint.

To clean this up, export the real affinity setter.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20210518093117.968251441@linutronix.de
---
 include/linux/interrupt.h | 35 ++---------------------------------
 kernel/irq/manage.c       | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 4777850a6dc7..35a374241515 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -319,39 +319,8 @@ struct irq_affinity_desc {
 
 extern cpumask_var_t irq_default_affinity;
 
-/* Internal implementation. Use the helpers below */
-extern int __irq_set_affinity(unsigned int irq, const struct cpumask *cpumask,
-			      bool force);
-
-/**
- * irq_set_affinity - Set the irq affinity of a given irq
- * @irq:	Interrupt to set affinity
- * @cpumask:	cpumask
- *
- * Fails if cpumask does not contain an online CPU
- */
-static inline int
-irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
-{
-	return __irq_set_affinity(irq, cpumask, false);
-}
-
-/**
- * irq_force_affinity - Force the irq affinity of a given irq
- * @irq:	Interrupt to set affinity
- * @cpumask:	cpumask
- *
- * Same as irq_set_affinity, but without checking the mask against
- * online cpus.
- *
- * Solely for low level cpu hotplug code, where we need to make per
- * cpu interrupts affine before the cpu becomes online.
- */
-static inline int
-irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
-{
-	return __irq_set_affinity(irq, cpumask, true);
-}
+extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
+extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask);
 
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c14356543d9..a847dd2044c8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -441,7 +441,8 @@ out_unlock:
 	return ret;
 }
 
-int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
+static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask,
+			      bool force)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -456,6 +457,36 @@ int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
 	return ret;
 }
 
+/**
+ * irq_set_affinity - Set the irq affinity of a given irq
+ * @irq:	Interrupt to set affinity
+ * @cpumask:	cpumask
+ *
+ * Fails if cpumask does not contain an online CPU
+ */
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+	return __irq_set_affinity(irq, cpumask, false);
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity);
+
+/**
+ * irq_force_affinity - Force the irq affinity of a given irq
+ * @irq:	Interrupt to set affinity
+ * @cpumask:	cpumask
+ *
+ * Same as irq_set_affinity, but without checking the mask against
+ * online cpus.
+ *
+ * Solely for low level cpu hotplug code, where we need to make per
+ * cpu interrupts affine before the cpu becomes online.
+ */
+int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+	return __irq_set_affinity(irq, cpumask, true);
+}
+EXPORT_SYMBOL_GPL(irq_force_affinity);
+
 int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 {
 	unsigned long flags;
-- 
cgit v1.2.3


From 2beb4a53fc3f1081cedc1c1a198c7f56cc4fc60c Mon Sep 17 00:00:00 2001
From: "Chang S. Bae" <chang.seok.bae@intel.com>
Date: Tue, 18 May 2021 13:03:19 -0700
Subject: x86/signal: Detect and prevent an alternate signal stack overflow

The kernel pushes context on to the userspace stack to prepare for the
user's signal handler. When the user has supplied an alternate signal
stack, via sigaltstack(2), it is easy for the kernel to verify that the
stack size is sufficient for the current hardware context.

Check if writing the hardware context to the alternate stack will exceed
it's size. If yes, then instead of corrupting user-data and proceeding with
the original signal handler, an immediate SIGSEGV signal is delivered.

Refactor the stack pointer check code from on_sig_stack() and use the new
helper.

While the kernel allows new source code to discover and use a sufficient
alternate signal stack size, this check is still necessary to protect
binaries with insufficient alternate signal stack size from data
corruption.

Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch")
Reported-by: Florian Weimer <fweimer@redhat.com>
Suggested-by: Jann Horn <jannh@google.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Len Brown <len.brown@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20210518200320.17239-6-chang.seok.bae@intel.com
Link: https://bugzilla.kernel.org/show_bug.cgi?id=153531
---
 arch/x86/kernel/signal.c     | 24 ++++++++++++++++++++----
 include/linux/sched/signal.h | 19 ++++++++++++-------
 2 files changed, 32 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 86e53868159a..2ddcf2165bcb 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -239,10 +239,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 	     void __user **fpstate)
 {
 	/* Default to using normal stack */
+	bool nested_altstack = on_sig_stack(regs->sp);
+	bool entering_altstack = false;
 	unsigned long math_size = 0;
 	unsigned long sp = regs->sp;
 	unsigned long buf_fx = 0;
-	int onsigstack = on_sig_stack(sp);
 	int ret;
 
 	/* redzone */
@@ -251,15 +252,23 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 
 	/* This is the X/Open sanctioned signal stack switching.  */
 	if (ka->sa.sa_flags & SA_ONSTACK) {
-		if (sas_ss_flags(sp) == 0)
+		/*
+		 * This checks nested_altstack via sas_ss_flags(). Sensible
+		 * programs use SS_AUTODISARM, which disables that check, and
+		 * programs that don't use SS_AUTODISARM get compatible.
+		 */
+		if (sas_ss_flags(sp) == 0) {
 			sp = current->sas_ss_sp + current->sas_ss_size;
+			entering_altstack = true;
+		}
 	} else if (IS_ENABLED(CONFIG_X86_32) &&
-		   !onsigstack &&
+		   !nested_altstack &&
 		   regs->ss != __USER_DS &&
 		   !(ka->sa.sa_flags & SA_RESTORER) &&
 		   ka->sa.sa_restorer) {
 		/* This is the legacy signal stack switching. */
 		sp = (unsigned long) ka->sa.sa_restorer;
+		entering_altstack = true;
 	}
 
 	sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
@@ -272,8 +281,15 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 	 * If we are on the alternate signal stack and would overflow it, don't.
 	 * Return an always-bogus address instead so we will die with SIGSEGV.
 	 */
-	if (onsigstack && !likely(on_sig_stack(sp)))
+	if (unlikely((nested_altstack || entering_altstack) &&
+		     !__on_sig_stack(sp))) {
+
+		if (show_unhandled_signals && printk_ratelimit())
+			pr_info("%s[%d] overflowed sigaltstack\n",
+				current->comm, task_pid_nr(current));
+
 		return (void __user *)-1L;
+	}
 
 	/* save i387 and extended state */
 	ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size);
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3f6a0fcaa10c..ae60f838ebb9 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -537,6 +537,17 @@ static inline int kill_cad_pid(int sig, int priv)
 #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
 #define SEND_SIG_PRIV	((struct kernel_siginfo *) 1)
 
+static inline int __on_sig_stack(unsigned long sp)
+{
+#ifdef CONFIG_STACK_GROWSUP
+	return sp >= current->sas_ss_sp &&
+		sp - current->sas_ss_sp < current->sas_ss_size;
+#else
+	return sp > current->sas_ss_sp &&
+		sp - current->sas_ss_sp <= current->sas_ss_size;
+#endif
+}
+
 /*
  * True if we are on the alternate signal stack.
  */
@@ -554,13 +565,7 @@ static inline int on_sig_stack(unsigned long sp)
 	if (current->sas_ss_flags & SS_AUTODISARM)
 		return 0;
 
-#ifdef CONFIG_STACK_GROWSUP
-	return sp >= current->sas_ss_sp &&
-		sp - current->sas_ss_sp < current->sas_ss_size;
-#else
-	return sp > current->sas_ss_sp &&
-		sp - current->sas_ss_sp <= current->sas_ss_size;
-#endif
+	return __on_sig_stack(sp);
 }
 
 static inline int sas_ss_flags(unsigned long sp)
-- 
cgit v1.2.3


From c06a40e9513d246bdeacd290f2357bb99251dc9a Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca@lucaceresoli.net>
Date: Fri, 19 Feb 2021 23:39:08 +0100
Subject: mfd: lp87565: Fix typo in define names

"GOIO" should be "GPIO" here.

Signed-off-by: Luca Ceresoli <luca@lucaceresoli.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/gpio/gpio-lp87565.c |  6 +++---
 include/linux/mfd/lp87565.h | 28 ++++++++++++++--------------
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-lp87565.c b/drivers/gpio/gpio-lp87565.c
index e1244520cf7d..fcde6708b5df 100644
--- a/drivers/gpio/gpio-lp87565.c
+++ b/drivers/gpio/gpio-lp87565.c
@@ -123,14 +123,14 @@ static int lp87565_gpio_set_config(struct gpio_chip *gc, unsigned int offset,
 		return regmap_update_bits(gpio->map,
 					  LP87565_REG_GPIO_CONFIG,
 					  BIT(offset +
-					      __ffs(LP87565_GOIO1_OD)),
+					      __ffs(LP87565_GPIO1_OD)),
 					  BIT(offset +
-					      __ffs(LP87565_GOIO1_OD)));
+					      __ffs(LP87565_GPIO1_OD)));
 	case PIN_CONFIG_DRIVE_PUSH_PULL:
 		return regmap_update_bits(gpio->map,
 					  LP87565_REG_GPIO_CONFIG,
 					  BIT(offset +
-					      __ffs(LP87565_GOIO1_OD)), 0);
+					      __ffs(LP87565_GPIO1_OD)), 0);
 	default:
 		return -ENOTSUPP;
 	}
diff --git a/include/linux/mfd/lp87565.h b/include/linux/mfd/lp87565.h
index 5640e6088fe6..a8799ae50dcf 100644
--- a/include/linux/mfd/lp87565.h
+++ b/include/linux/mfd/lp87565.h
@@ -222,20 +222,20 @@ enum lp87565_device_type {
 #define LP87565_GPIO2_SEL			BIT(1)
 #define LP87565_GPIO1_SEL			BIT(0)
 
-#define LP87565_GOIO3_OD			BIT(6)
-#define LP87565_GOIO2_OD			BIT(5)
-#define LP87565_GOIO1_OD			BIT(4)
-#define LP87565_GOIO3_DIR			BIT(2)
-#define LP87565_GOIO2_DIR			BIT(1)
-#define LP87565_GOIO1_DIR			BIT(0)
-
-#define LP87565_GOIO3_IN			BIT(2)
-#define LP87565_GOIO2_IN			BIT(1)
-#define LP87565_GOIO1_IN			BIT(0)
-
-#define LP87565_GOIO3_OUT			BIT(2)
-#define LP87565_GOIO2_OUT			BIT(1)
-#define LP87565_GOIO1_OUT			BIT(0)
+#define LP87565_GPIO3_OD			BIT(6)
+#define LP87565_GPIO2_OD			BIT(5)
+#define LP87565_GPIO1_OD			BIT(4)
+#define LP87565_GPIO3_DIR			BIT(2)
+#define LP87565_GPIO2_DIR			BIT(1)
+#define LP87565_GPIO1_DIR			BIT(0)
+
+#define LP87565_GPIO3_IN			BIT(2)
+#define LP87565_GPIO2_IN			BIT(1)
+#define LP87565_GPIO1_IN			BIT(0)
+
+#define LP87565_GPIO3_OUT			BIT(2)
+#define LP87565_GPIO2_OUT			BIT(1)
+#define LP87565_GPIO1_OUT			BIT(0)
 
 enum LP87565_regulator_id {
 	/* BUCK's */
-- 
cgit v1.2.3


From 5258f7eed42f4565d065726fd82d3430dd618a68 Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca@lucaceresoli.net>
Date: Fri, 19 Feb 2021 23:39:10 +0100
Subject: mfd: lp87565: Move LP87565_regulator_id to .c file

This enum is used only internally to the regulator driver for buck indexes.

Signed-off-by: Luca Ceresoli <luca@lucaceresoli.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/lp87565-regulator.c | 11 +++++++++++
 include/linux/mfd/lp87565.h           | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/lp87565-regulator.c b/drivers/regulator/lp87565-regulator.c
index eeab9d3c824b..d059ae85047a 100644
--- a/drivers/regulator/lp87565-regulator.c
+++ b/drivers/regulator/lp87565-regulator.c
@@ -11,6 +11,17 @@
 
 #include <linux/mfd/lp87565.h>
 
+enum LP87565_regulator_id {
+	/* BUCK's */
+	LP87565_BUCK_0,
+	LP87565_BUCK_1,
+	LP87565_BUCK_2,
+	LP87565_BUCK_3,
+	LP87565_BUCK_10,
+	LP87565_BUCK_23,
+	LP87565_BUCK_3210,
+};
+
 #define LP87565_REGULATOR(_name, _id, _of, _ops, _n, _vr, _vm,		\
 			  _er, _em, _ev, _delay, _lr, _cr)		\
 	[_id] = {							\
diff --git a/include/linux/mfd/lp87565.h b/include/linux/mfd/lp87565.h
index a8799ae50dcf..94cb581af34b 100644
--- a/include/linux/mfd/lp87565.h
+++ b/include/linux/mfd/lp87565.h
@@ -237,17 +237,6 @@ enum lp87565_device_type {
 #define LP87565_GPIO2_OUT			BIT(1)
 #define LP87565_GPIO1_OUT			BIT(0)
 
-enum LP87565_regulator_id {
-	/* BUCK's */
-	LP87565_BUCK_0,
-	LP87565_BUCK_1,
-	LP87565_BUCK_2,
-	LP87565_BUCK_3,
-	LP87565_BUCK_10,
-	LP87565_BUCK_23,
-	LP87565_BUCK_3210,
-};
-
 /**
  * struct LP87565 - state holder for the LP87565 driver
  * @dev: struct device pointer for MFD device
-- 
cgit v1.2.3


From 1f89d2fe16072a74b34bdb895160910091427891 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Mon, 17 May 2021 21:28:03 +0200
Subject: regmap: Add MDIO bus support

Basic support for MDIO bus access. Support only includes clause-22
register access, with 5-bit addresses, and 16-bit wide registers.

Signed-off-by: Sander Vanheule <sander@svanheule.net>
Link: https://lore.kernel.org/r/63b99a2fec2c4ea3c461d59d451af8d675ecf312.1621279162.git.sander@svanheule.net
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/Kconfig       |  6 ++++-
 drivers/base/regmap/Makefile      |  1 +
 drivers/base/regmap/regmap-mdio.c | 57 +++++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h            | 36 +++++++++++++++++++++++++
 4 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 drivers/base/regmap/regmap-mdio.c

(limited to 'include/linux')

diff --git a/drivers/base/regmap/Kconfig b/drivers/base/regmap/Kconfig
index 50b1e2d06a25..159bac6c5046 100644
--- a/drivers/base/regmap/Kconfig
+++ b/drivers/base/regmap/Kconfig
@@ -4,8 +4,9 @@
 # subsystems should select the appropriate symbols.
 
 config REGMAP
-	default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ || REGMAP_SOUNDWIRE || REGMAP_SOUNDWIRE_MBQ || REGMAP_SCCB || REGMAP_I3C || REGMAP_SPI_AVMM)
+	default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ || REGMAP_SOUNDWIRE || REGMAP_SOUNDWIRE_MBQ || REGMAP_SCCB || REGMAP_I3C || REGMAP_SPI_AVMM || REGMAP_MDIO)
 	select IRQ_DOMAIN if REGMAP_IRQ
+	select MDIO_BUS if REGMAP_MDIO
 	bool
 
 config REGCACHE_COMPRESSED
@@ -36,6 +37,9 @@ config REGMAP_W1
 	tristate
 	depends on W1
 
+config REGMAP_MDIO
+	tristate
+
 config REGMAP_MMIO
 	tristate
 
diff --git a/drivers/base/regmap/Makefile b/drivers/base/regmap/Makefile
index 33f63adb5b3d..11facb32a027 100644
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_REGMAP_SOUNDWIRE_MBQ) += regmap-sdw-mbq.o
 obj-$(CONFIG_REGMAP_SCCB) += regmap-sccb.o
 obj-$(CONFIG_REGMAP_I3C) += regmap-i3c.o
 obj-$(CONFIG_REGMAP_SPI_AVMM) += regmap-spi-avmm.o
+obj-$(CONFIG_REGMAP_MDIO) += regmap-mdio.o
diff --git a/drivers/base/regmap/regmap-mdio.c b/drivers/base/regmap/regmap-mdio.c
new file mode 100644
index 000000000000..5f18fe409f56
--- /dev/null
+++ b/drivers/base/regmap/regmap-mdio.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/errno.h>
+#include <linux/mdio.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+
+static int regmap_mdio_read(void *context, unsigned int reg, unsigned int *val)
+{
+	struct mdio_device *mdio_dev = context;
+	int ret;
+
+	ret = mdiobus_read(mdio_dev->bus, mdio_dev->addr, reg);
+	*val = ret & 0xffff;
+
+	return ret < 0 ? ret : 0;
+}
+
+static int regmap_mdio_write(void *context, unsigned int reg, unsigned int val)
+{
+	struct mdio_device *mdio_dev = context;
+
+	return mdiobus_write(mdio_dev->bus, mdio_dev->addr, reg, val);
+}
+
+static const struct regmap_bus regmap_mdio_bus = {
+	.reg_write = regmap_mdio_write,
+	.reg_read = regmap_mdio_read,
+};
+
+struct regmap *__regmap_init_mdio(struct mdio_device *mdio_dev,
+	const struct regmap_config *config, struct lock_class_key *lock_key,
+	const char *lock_name)
+{
+	if (config->reg_bits != 5 || config->val_bits != 16)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	return __regmap_init(&mdio_dev->dev, &regmap_mdio_bus, mdio_dev, config,
+		lock_key, lock_name);
+}
+EXPORT_SYMBOL_GPL(__regmap_init_mdio);
+
+struct regmap *__devm_regmap_init_mdio(struct mdio_device *mdio_dev,
+	const struct regmap_config *config, struct lock_class_key *lock_key,
+	const char *lock_name)
+{
+	if (config->reg_bits != 5 || config->val_bits != 16)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	return __devm_regmap_init(&mdio_dev->dev, &regmap_mdio_bus, mdio_dev,
+		config, lock_key, lock_name);
+}
+EXPORT_SYMBOL_GPL(__devm_regmap_init_mdio);
+
+MODULE_AUTHOR("Sander Vanheule <sander@svanheule.net>");
+MODULE_DESCRIPTION("Regmap MDIO Module");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index f87a11a5cc4a..e97dd05f7cdb 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -27,6 +27,7 @@ struct device_node;
 struct i2c_client;
 struct i3c_device;
 struct irq_domain;
+struct mdio_device;
 struct slim_device;
 struct spi_device;
 struct spmi_device;
@@ -538,6 +539,10 @@ struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
 				 const struct regmap_config *config,
 				 struct lock_class_key *lock_key,
 				 const char *lock_name);
+struct regmap *__regmap_init_mdio(struct mdio_device *mdio_dev,
+				 const struct regmap_config *config,
+				 struct lock_class_key *lock_key,
+				 const char *lock_name);
 struct regmap *__regmap_init_sccb(struct i2c_client *i2c,
 				  const struct regmap_config *config,
 				  struct lock_class_key *lock_key,
@@ -594,6 +599,10 @@ struct regmap *__devm_regmap_init_i2c(struct i2c_client *i2c,
 				      const struct regmap_config *config,
 				      struct lock_class_key *lock_key,
 				      const char *lock_name);
+struct regmap *__devm_regmap_init_mdio(struct mdio_device *mdio_dev,
+				      const struct regmap_config *config,
+				      struct lock_class_key *lock_key,
+				      const char *lock_name);
 struct regmap *__devm_regmap_init_sccb(struct i2c_client *i2c,
 				       const struct regmap_config *config,
 				       struct lock_class_key *lock_key,
@@ -697,6 +706,19 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
 	__regmap_lockdep_wrapper(__regmap_init_i2c, #config,		\
 				i2c, config)
 
+/**
+ * regmap_init_mdio() - Initialise register map
+ *
+ * @mdio_dev: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_mdio(mdio_dev, config)				\
+	__regmap_lockdep_wrapper(__regmap_init_mdio, #config,		\
+				mdio_dev, config)
+
 /**
  * regmap_init_sccb() - Initialise register map
  *
@@ -888,6 +910,20 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
 	__regmap_lockdep_wrapper(__devm_regmap_init_i2c, #config,	\
 				i2c, config)
 
+/**
+ * devm_regmap_init_mdio() - Initialise managed register map
+ *
+ * @mdio_dev: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_mdio(mdio_dev, config)				\
+	__regmap_lockdep_wrapper(__devm_regmap_init_mdio, #config,	\
+				mdio_dev, config)
+
 /**
  * devm_regmap_init_sccb() - Initialise managed register map
  *
-- 
cgit v1.2.3


From f1069a8756b9e9f6c055e709740d2d66650f0fb0 Mon Sep 17 00:00:00 2001
From: Vasily Gorbik <gor@linux.ibm.com>
Date: Wed, 19 May 2021 15:03:08 +0200
Subject: compiler.h: Avoid using inline asm operand modifiers

The expansion of annotate_reachable/annotate_unreachable on s390 will
result in a compiler error if the __COUNTER__ value is high enough.
For example with "i" (154) the "%c0" operand of annotate_reachable
will be expanded to -102:

        -102:
        .pushsection .discard.reachable
        .long -102b - .
        .popsection

This is a quirk of the gcc backend for s390, it interprets the %c0
as a signed byte value. Avoid using operand modifiers in this case
by simply converting __COUNTER__ to string, with the same result,
but in an arch assembler independent way.

Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/patch-1.thread-1a26be.git-930d1b44844a.your-ad-here.call-01621428935-ext-2104@work.hours
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Borislav Petkov <bp@suse.de>
Cc: linux-kernel@vger.kernel.org
---
 include/linux/compiler.h | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index df5b405e6305..77047904cf70 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -115,18 +115,24 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
  * The __COUNTER__ based labels are a hack to make each instance of the macros
  * unique, to convince GCC not to merge duplicate inline asm statements.
  */
-#define annotate_reachable() ({						\
-	asm volatile("%c0:\n\t"						\
+#define __stringify_label(n) #n
+
+#define __annotate_reachable(c) ({					\
+	asm volatile(__stringify_label(c) ":\n\t"			\
 		     ".pushsection .discard.reachable\n\t"		\
-		     ".long %c0b - .\n\t"				\
-		     ".popsection\n\t" : : "i" (__COUNTER__));		\
+		     ".long " __stringify_label(c) "b - .\n\t"		\
+		     ".popsection\n\t");				\
 })
-#define annotate_unreachable() ({					\
-	asm volatile("%c0:\n\t"						\
+#define annotate_reachable() __annotate_reachable(__COUNTER__)
+
+#define __annotate_unreachable(c) ({					\
+	asm volatile(__stringify_label(c) ":\n\t"			\
 		     ".pushsection .discard.unreachable\n\t"		\
-		     ".long %c0b - .\n\t"				\
-		     ".popsection\n\t" : : "i" (__COUNTER__));		\
+		     ".long " __stringify_label(c) "b - .\n\t"		\
+		     ".popsection\n\t");				\
 })
+#define annotate_unreachable() __annotate_unreachable(__COUNTER__)
+
 #define ASM_UNREACHABLE							\
 	"999:\n\t"							\
 	".pushsection .discard.unreachable\n\t"				\
-- 
cgit v1.2.3


From c199f64ff93c48a45add92eee4456ffcabfc838e Mon Sep 17 00:00:00 2001
From: Vasily Gorbik <gor@linux.ibm.com>
Date: Wed, 19 May 2021 15:03:13 +0200
Subject: instrumentation.h: Avoid using inline asm operand modifiers

The expansion of instrumentation_begin/instrumentation_end on s390 will
result in a compiler error if the __COUNTER__ value is high enough.
For example with "i" (154) the "%c0" operand of annotate_reachable
will be expanded to -102:

        -102:
        .pushsection .discard.instr_begin
        .long -102b - .
        .popsection

This is a quirk of the gcc backend for s390, it interprets the %c0
as a signed byte value. Avoid using operand modifiers in this case
by simply converting __COUNTER__ to string, with the same result,
but in an arch assembler independent way.

Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/patch-2.thread-1a26be.git-1a26be80cb18.your-ad-here.call-01621428935-ext-2104@work.hours
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Borislav Petkov <bp@suse.de>
Cc: linux-kernel@vger.kernel.org
---
 include/linux/instrumentation.h | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h
index 93e2ad67fc10..fa2cd8c63dcc 100644
--- a/include/linux/instrumentation.h
+++ b/include/linux/instrumentation.h
@@ -4,13 +4,16 @@
 
 #if defined(CONFIG_DEBUG_ENTRY) && defined(CONFIG_STACK_VALIDATION)
 
+#include <linux/stringify.h>
+
 /* Begin/end of an instrumentation safe region */
-#define instrumentation_begin() ({					\
-	asm volatile("%c0: nop\n\t"						\
+#define __instrumentation_begin(c) ({					\
+	asm volatile(__stringify(c) ": nop\n\t"				\
 		     ".pushsection .discard.instr_begin\n\t"		\
-		     ".long %c0b - .\n\t"				\
-		     ".popsection\n\t" : : "i" (__COUNTER__));		\
+		     ".long " __stringify(c) "b - .\n\t"		\
+		     ".popsection\n\t");				\
 })
+#define instrumentation_begin() __instrumentation_begin(__COUNTER__)
 
 /*
  * Because instrumentation_{begin,end}() can nest, objtool validation considers
@@ -43,12 +46,13 @@
  * To avoid this, have _end() be a NOP instruction, this ensures it will be
  * part of the condition block and does not escape.
  */
-#define instrumentation_end() ({					\
-	asm volatile("%c0: nop\n\t"					\
+#define __instrumentation_end(c) ({					\
+	asm volatile(__stringify(c) ": nop\n\t"				\
 		     ".pushsection .discard.instr_end\n\t"		\
-		     ".long %c0b - .\n\t"				\
-		     ".popsection\n\t" : : "i" (__COUNTER__));		\
+		     ".long " __stringify(c) "b - .\n\t"		\
+		     ".popsection\n\t");				\
 })
+#define instrumentation_end() __instrumentation_end(__COUNTER__)
 #else
 # define instrumentation_begin()	do { } while(0)
 # define instrumentation_end()		do { } while(0)
-- 
cgit v1.2.3


From 57b55eeb755201832c2fc2df58818f64fc023fdb Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 10 May 2021 22:47:17 +0300
Subject: pinctrl: Keep enum pin_config_param ordered by name (part 2)

It seems the ordering is by name. Keep it that way.
Here updating the entire list (there were two more options not in order).

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210510194717.12255-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf-generic.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index e18ab3d5908f..98ed5959ca9a 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -81,28 +81,28 @@ struct pinctrl_map;
  *	passed in the argument on a custom form, else just use argument 1
  *	to indicate low power mode, argument 0 turns low power mode off.
  * @PIN_CONFIG_MODE_PWM: this will configure the pin for PWM
+ * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a
+ * 	value on the line. Use argument 1 to indicate high level, argument 0 to
+ *	indicate low level. (Please see Documentation/driver-api/pinctl.rst,
+ *	section "GPIO mode pitfalls" for a discussion around this parameter.)
  * @PIN_CONFIG_OUTPUT_ENABLE: this will enable the pin's output mode
  * 	without driving a value there. For most platforms this reduces to
  * 	enable the output buffers and then let the pin controller current
  * 	configuration (eg. the currently selected mux function) drive values on
  * 	the line. Use argument 1 to enable output mode, argument 0 to disable
  * 	it.
- * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a
- * 	value on the line. Use argument 1 to indicate high level, argument 0 to
- *	indicate low level. (Please see Documentation/driver-api/pinctl.rst,
- *	section "GPIO mode pitfalls" for a discussion around this parameter.)
  * @PIN_CONFIG_PERSIST_STATE: retain pin state across sleep or controller reset
  * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power
  *	supplies, the argument to this parameter (on a custom format) tells
  *	the driver which alternative power source to use.
- * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state.
- * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to
- *	this parameter (on a custom format) tells the driver which alternative
- *	slew rate to use.
  * @PIN_CONFIG_SKEW_DELAY: if the pin has programmable skew rate (on inputs)
  *	or latch delay (on outputs) this parameter (in a custom format)
  *	specifies the clock skew or latch delay. It typically controls how
  *	many double inverters are put in front of the line.
+ * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state.
+ * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to
+ *	this parameter (on a custom format) tells the driver which alternative
+ *	slew rate to use.
  * @PIN_CONFIG_END: this is the last enumerator for pin configurations, if
  *	you need to pass in custom configurations to the pin controller, use
  *	PIN_CONFIG_END+1 as the base offset.
@@ -127,13 +127,13 @@ enum pin_config_param {
 	PIN_CONFIG_INPUT_SCHMITT_ENABLE,
 	PIN_CONFIG_MODE_LOW_POWER,
 	PIN_CONFIG_MODE_PWM,
-	PIN_CONFIG_OUTPUT_ENABLE,
 	PIN_CONFIG_OUTPUT,
+	PIN_CONFIG_OUTPUT_ENABLE,
 	PIN_CONFIG_PERSIST_STATE,
 	PIN_CONFIG_POWER_SOURCE,
+	PIN_CONFIG_SKEW_DELAY,
 	PIN_CONFIG_SLEEP_HARDWARE_STATE,
 	PIN_CONFIG_SLEW_RATE,
-	PIN_CONFIG_SKEW_DELAY,
 	PIN_CONFIG_END = 0x7F,
 	PIN_CONFIG_MAX = 0xFF,
 };
-- 
cgit v1.2.3


From b8be5db573b822920b0f6230498d900752bede17 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 19 May 2021 09:21:50 +0200
Subject: tty/serial: clean up uart_match_port

* make parameters const (as they are only read)
* return bool (as comparison results are returned)
* add \n before final return

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210519072153.3859-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/serial_core.c | 16 +++++++++-------
 include/linux/serial_core.h      |  3 ++-
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 2793b1cf2d24..b5beb215f062 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -3027,26 +3027,28 @@ out:
 /*
  *	Are the two ports equivalent?
  */
-int uart_match_port(struct uart_port *port1, struct uart_port *port2)
+bool uart_match_port(const struct uart_port *port1,
+		const struct uart_port *port2)
 {
 	if (port1->iotype != port2->iotype)
-		return 0;
+		return false;
 
 	switch (port1->iotype) {
 	case UPIO_PORT:
-		return (port1->iobase == port2->iobase);
+		return port1->iobase == port2->iobase;
 	case UPIO_HUB6:
-		return (port1->iobase == port2->iobase) &&
-		       (port1->hub6   == port2->hub6);
+		return port1->iobase == port2->iobase &&
+		       port1->hub6   == port2->hub6;
 	case UPIO_MEM:
 	case UPIO_MEM16:
 	case UPIO_MEM32:
 	case UPIO_MEM32BE:
 	case UPIO_AU:
 	case UPIO_TSI:
-		return (port1->mapbase == port2->mapbase);
+		return port1->mapbase == port2->mapbase;
 	}
-	return 0;
+
+	return false;
 }
 EXPORT_SYMBOL(uart_match_port);
 
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 7445c8fd88c0..52d7fb92a69d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -408,7 +408,8 @@ int uart_register_driver(struct uart_driver *uart);
 void uart_unregister_driver(struct uart_driver *uart);
 int uart_add_one_port(struct uart_driver *reg, struct uart_port *port);
 int uart_remove_one_port(struct uart_driver *reg, struct uart_port *port);
-int uart_match_port(struct uart_port *port1, struct uart_port *port2);
+bool uart_match_port(const struct uart_port *port1,
+		const struct uart_port *port2);
 
 /*
  * Power Management
-- 
cgit v1.2.3


From cd256b068f80e8b4a1eccd73527b67b3eb50f7ad Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 19 May 2021 09:21:51 +0200
Subject: tty/serial: make port of serial8250_register_8250_port const

After the previous patch, we can make port passed to
serial8250_find_match_or_unused const. And then we can make const also
port of serial8250_register_8250_port.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210519072153.3859-2-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_core.c | 4 ++--
 include/linux/serial_8250.h         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 1082e76c4d37..1ce193daea7f 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -906,7 +906,7 @@ static struct platform_device *serial8250_isa_devs;
  */
 static DEFINE_MUTEX(serial_mutex);
 
-static struct uart_8250_port *serial8250_find_match_or_unused(struct uart_port *port)
+static struct uart_8250_port *serial8250_find_match_or_unused(const struct uart_port *port)
 {
 	int i;
 
@@ -971,7 +971,7 @@ static void serial_8250_overrun_backoff_work(struct work_struct *work)
  *
  *	On success the port is ready to use and the line number is returned.
  */
-int serial8250_register_8250_port(struct uart_8250_port *up)
+int serial8250_register_8250_port(const struct uart_8250_port *up)
 {
 	struct uart_8250_port *uart;
 	int ret = -ENOSPC;
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 9e655055112d..5db211f43b29 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -146,7 +146,7 @@ static inline struct uart_8250_port *up_to_u8250p(struct uart_port *up)
 	return container_of(up, struct uart_8250_port, port);
 }
 
-int serial8250_register_8250_port(struct uart_8250_port *);
+int serial8250_register_8250_port(const struct uart_8250_port *);
 void serial8250_unregister_port(int line);
 void serial8250_suspend_port(int line);
 void serial8250_resume_port(int line);
-- 
cgit v1.2.3


From ee62c89cd45999ba4e09938bd01ec6d1a83ca6d6 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed, 19 May 2021 10:51:38 +0200
Subject: docs: update sysfs-platform_profile.rst reference

The file name: Documentation/ABI/testing/sysfs-platform_profile.rst
should be, instead: Documentation/userspace-api/sysfs-platform_profile.rst.

Update its cross-reference accordingly.

Fixes: a2ff95e018f1 ("ACPI: platform: Add platform profile support")
Fixes: 8e0cbf356377 ("Documentation: Add documentation for new platform_profile sysfs attribute")
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Mark Pearson <markpearson@lenovo.com>
Link: https://lore.kernel.org/r/295089effd8353578b9725c61c0453d920978d72.1621413933.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/platform_profile.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index a6329003aee7..e5cbb6841f3a 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -2,7 +2,7 @@
 /*
  * Platform profile sysfs interface
  *
- * See Documentation/ABI/testing/sysfs-platform_profile.rst for more
+ * See Documentation/userspace-api/sysfs-platform_profile.rst for more
  * information.
  */
 
-- 
cgit v1.2.3


From 4b0c9948a4c2f446a11bd592bd7d23f06ad75d8e Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed, 19 May 2021 10:51:42 +0200
Subject: docs: update pin-control.rst references

Changeset 5513b411ea5b ("Documentation: rename pinctl to pin-control")
renamed: Documentation/driver-api/pinctl.rst
to: Documentation/driver-api/pin-control.rst.

Update the cross-references accordingly.

Fixes: 5513b411ea5b ("Documentation: rename pinctl to pin-control")
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Acked-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/46ac2e918c7c4a4b701d54870f167b78466ec578.1621413933.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/device.h                  | 2 +-
 include/linux/mfd/madera/pdata.h        | 2 +-
 include/linux/pinctrl/pinconf-generic.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 38a2071cf776..d1183cfdc8fb 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -399,7 +399,7 @@ struct dev_links_info {
  * 		along with subsystem-level and driver-level callbacks.
  * @em_pd:	device's energy model performance domain
  * @pins:	For device pin management.
- *		See Documentation/driver-api/pinctl.rst for details.
+ *		See Documentation/driver-api/pin-control.rst for details.
  * @msi_list:	Hosts MSI descriptors
  * @msi_domain: The generic MSI domain this device is using.
  * @numa_node:	NUMA node this device is close to.
diff --git a/include/linux/mfd/madera/pdata.h b/include/linux/mfd/madera/pdata.h
index 601cbbc10370..32e3470708ed 100644
--- a/include/linux/mfd/madera/pdata.h
+++ b/include/linux/mfd/madera/pdata.h
@@ -31,7 +31,7 @@ struct pinctrl_map;
  * @irq_flags:	    Mode for primary IRQ (defaults to active low)
  * @gpio_base:	    Base GPIO number
  * @gpio_configs:   Array of GPIO configurations (See
- *		    Documentation/driver-api/pinctl.rst)
+ *		    Documentation/driver-api/pin-control.rst)
  * @n_gpio_configs: Number of entries in gpio_configs
  * @gpsw:	    General purpose switch mode setting. Depends on the external
  *		    hardware connected to the switch. (See the SW1_MODE field
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index e18ab3d5908f..5a96602a3316 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -89,7 +89,7 @@ struct pinctrl_map;
  * 	it.
  * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a
  * 	value on the line. Use argument 1 to indicate high level, argument 0 to
- *	indicate low level. (Please see Documentation/driver-api/pinctl.rst,
+ *	indicate low level. (Please see Documentation/driver-api/pin-control.rst,
  *	section "GPIO mode pitfalls" for a discussion around this parameter.)
  * @PIN_CONFIG_PERSIST_STATE: retain pin state across sleep or controller reset
  * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power
-- 
cgit v1.2.3


From 2ade8fc65076095460e3ea1ca65a8f619d7d9a3a Mon Sep 17 00:00:00 2001
From: David Bartley <andareed@gmail.com>
Date: Thu, 20 May 2021 10:41:30 -0700
Subject: x86/amd_nb: Add AMD family 19h model 50h PCI ids

This is required to support Zen3 APUs in k10temp.

Signed-off-by: David Bartley <andareed@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Wei Huang <wei.huang2@amd.com>
Link: https://lkml.kernel.org/r/20210520174130.94954-1-andareed@gmail.com
---
 arch/x86/kernel/amd_nb.c | 3 +++
 include/linux/pci_ids.h  | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 09083094eb57..23dda362dc0f 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -25,6 +25,7 @@
 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
 #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
 #define PCI_DEVICE_ID_AMD_19H_DF_F4	0x1654
+#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
 
 /* Protect the PCI config register pairs used for SMN and DF indirect access. */
 static DEFINE_MUTEX(smn_mutex);
@@ -57,6 +58,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
 	{}
 };
 
@@ -72,6 +74,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
 	{}
 };
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4c3fa5293d76..5356ccf1c275 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -555,6 +555,7 @@
 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b
 #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443
 #define PCI_DEVICE_ID_AMD_19H_DF_F3	0x1653
+#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
cgit v1.2.3


From e1327a127703f94b8838d756cf6eaac506b329a7 Mon Sep 17 00:00:00 2001
From: Quentin Perret <qperret@google.com>
Date: Thu, 8 Apr 2021 18:01:05 +0000
Subject: export: Make CRCs robust to symbol trimming

The CRC calculation done by genksyms is triggered when the parser hits
EXPORT_SYMBOL*() macros. At this point, genksyms recursively expands the
types, and uses that as the input for the CRC calculation. In the case
of forward-declared structs, the type expands to 'UNKNOWN'. Next, the
result of the expansion of each type is cached, and is re-used when/if
the same type is seen again for another exported symbol in the file.

Unfortunately, this can cause CRC 'stability' issues when a struct
definition becomes visible in the middle of a C file. For example, let's
assume code with the following pattern:

    struct foo;

    int bar(struct foo *arg)
    {
	/* Do work ... */
    }
    EXPORT_SYMBOL_GPL(bar);

    /* This contains struct foo's definition */
    #include "foo.h"

    int baz(struct foo *arg)
    {
	/* Do more work ... */
    }
    EXPORT_SYMBOL_GPL(baz);

Here, baz's CRC will be computed using the expansion of struct foo that
was cached after bar's CRC calculation ('UNKOWN' here). But if
EXPORT_SYMBOL_GPL(bar) is removed from the file (because of e.g. symbol
trimming using CONFIG_TRIM_UNUSED_KSYMS), struct foo will be expanded
late, during baz's CRC calculation, which now has visibility over the
full struct definition, hence resulting in a different CRC for baz.

This can cause annoying issues for distro kernel (such as the Android
Generic Kernel Image) which use CONFIG_UNUSED_KSYMS_WHITELIST. Indeed,
as per the above, adding a symbol to the whitelist can change the CRC of
symbols that are already kept exported. As such, modules built against a
kernel with a trimmed ABI may not load against the same kernel built
with an extended whitelist, even though they are still strictly binary
compatible. While rebuilding the modules would obviously solve the
issue, I believe this classifies as an odd genksyms corner case, and it
gets in the way of kernel updates in the GKI context.

To work around the issue, make sure to keep issuing the
__GENKSYMS_EXPORT_SYMBOL macros for all trimmed symbols, hence making
the genksyms parsing insensitive to symbol trimming.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20210408180105.2496212-1-qperret@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/export.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/export.h b/include/linux/export.h
index 6271a5d9c988..27d848712b90 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -140,7 +140,12 @@ struct kernel_symbol {
 #define ___cond_export_sym(sym, sec, ns, enabled)			\
 	__cond_export_sym_##enabled(sym, sec, ns)
 #define __cond_export_sym_1(sym, sec, ns) ___EXPORT_SYMBOL(sym, sec, ns)
+
+#ifdef __GENKSYMS__
+#define __cond_export_sym_0(sym, sec, ns) __GENKSYMS_EXPORT_SYMBOL(sym)
+#else
 #define __cond_export_sym_0(sym, sec, ns) /* nothing */
+#endif
 
 #else
 
-- 
cgit v1.2.3


From e3ccfe1ad7d895487977ef64eda3441d16c9851a Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Fri, 14 May 2021 17:27:45 +0200
Subject: evm: Introduce evm_revalidate_status()

When EVM_ALLOW_METADATA_WRITES is set, EVM allows any operation on
metadata. Its main purpose is to allow users to freely set metadata when it
is protected by a portable signature, until an HMAC key is loaded.

However, callers of evm_verifyxattr() are not notified about metadata
changes and continue to rely on the last status returned by the function.
For example IMA, since it caches the appraisal result, will not call again
evm_verifyxattr() until the appraisal flags are cleared, and will grant
access to the file even if there was a metadata operation that made the
portable signature invalid.

This patch introduces evm_revalidate_status(), which callers of
evm_verifyxattr() can use in their xattr hooks to determine whether
re-validation is necessary and to do the proper actions. IMA calls it in
its xattr hooks to reset the appraisal flags, so that the EVM status is
re-evaluated after a metadata operation.

Lastly, this patch also adds a call to evm_reset_status() in
evm_inode_post_setattr() to invalidate the cached EVM status after a
setattr operation.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/evm.h                   |  6 ++++++
 security/integrity/evm/evm_main.c     | 40 +++++++++++++++++++++++++++++++----
 security/integrity/ima/ima_appraise.c | 15 ++++++++-----
 3 files changed, 52 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/evm.h b/include/linux/evm.h
index 8302bc29bb35..39bb17a8236b 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -35,6 +35,7 @@ extern void evm_inode_post_removexattr(struct dentry *dentry,
 extern int evm_inode_init_security(struct inode *inode,
 				   const struct xattr *xattr_array,
 				   struct xattr *evm);
+extern bool evm_revalidate_status(const char *xattr_name);
 #ifdef CONFIG_FS_POSIX_ACL
 extern int posix_xattr_acl(const char *xattrname);
 #else
@@ -104,5 +105,10 @@ static inline int evm_inode_init_security(struct inode *inode,
 	return 0;
 }
 
+static inline bool evm_revalidate_status(const char *xattr_name)
+{
+	return false;
+}
+
 #endif /* CONFIG_EVM */
 #endif /* LINUX_EVM_H */
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 7ac5204c8d1f..782915117175 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -425,6 +425,31 @@ static void evm_reset_status(struct inode *inode)
 		iint->evm_status = INTEGRITY_UNKNOWN;
 }
 
+/**
+ * evm_revalidate_status - report whether EVM status re-validation is necessary
+ * @xattr_name: pointer to the affected extended attribute name
+ *
+ * Report whether callers of evm_verifyxattr() should re-validate the
+ * EVM status.
+ *
+ * Return true if re-validation is necessary, false otherwise.
+ */
+bool evm_revalidate_status(const char *xattr_name)
+{
+	if (!evm_key_loaded())
+		return false;
+
+	/* evm_inode_post_setattr() passes NULL */
+	if (!xattr_name)
+		return true;
+
+	if (!evm_protected_xattr(xattr_name) && !posix_xattr_acl(xattr_name) &&
+	    strcmp(xattr_name, XATTR_NAME_EVM))
+		return false;
+
+	return true;
+}
+
 /**
  * evm_inode_post_setxattr - update 'security.evm' to reflect the changes
  * @dentry: pointer to the affected dentry
@@ -441,12 +466,14 @@ static void evm_reset_status(struct inode *inode)
 void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name,
 			     const void *xattr_value, size_t xattr_value_len)
 {
-	if (!evm_key_loaded() || (!evm_protected_xattr(xattr_name)
-				  && !posix_xattr_acl(xattr_name)))
+	if (!evm_revalidate_status(xattr_name))
 		return;
 
 	evm_reset_status(dentry->d_inode);
 
+	if (!strcmp(xattr_name, XATTR_NAME_EVM))
+		return;
+
 	evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len);
 }
 
@@ -462,11 +489,14 @@ void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name,
  */
 void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name)
 {
-	if (!evm_key_loaded() || !evm_protected_xattr(xattr_name))
+	if (!evm_revalidate_status(xattr_name))
 		return;
 
 	evm_reset_status(dentry->d_inode);
 
+	if (!strcmp(xattr_name, XATTR_NAME_EVM))
+		return;
+
 	evm_update_evmxattr(dentry, xattr_name, NULL, 0);
 }
 
@@ -513,9 +543,11 @@ int evm_inode_setattr(struct dentry *dentry, struct iattr *attr)
  */
 void evm_inode_post_setattr(struct dentry *dentry, int ia_valid)
 {
-	if (!evm_key_loaded())
+	if (!evm_revalidate_status(NULL))
 		return;
 
+	evm_reset_status(dentry->d_inode);
+
 	if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
 		evm_update_evmxattr(dentry, NULL, NULL, 0);
 }
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 4e5eb0236278..03894769dffa 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -570,6 +570,7 @@ int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
 		       const void *xattr_value, size_t xattr_value_len)
 {
 	const struct evm_ima_xattr_data *xvalue = xattr_value;
+	int digsig = 0;
 	int result;
 
 	result = ima_protect_xattr(dentry, xattr_name, xattr_value,
@@ -577,9 +578,12 @@ int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
 	if (result == 1) {
 		if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST))
 			return -EINVAL;
-		ima_reset_appraise_flags(d_backing_inode(dentry),
-			xvalue->type == EVM_IMA_XATTR_DIGSIG);
-		result = 0;
+		digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG);
+	}
+	if (result == 1 || evm_revalidate_status(xattr_name)) {
+		ima_reset_appraise_flags(d_backing_inode(dentry), digsig);
+		if (result == 1)
+			result = 0;
 	}
 	return result;
 }
@@ -589,9 +593,10 @@ int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name)
 	int result;
 
 	result = ima_protect_xattr(dentry, xattr_name, NULL, 0);
-	if (result == 1) {
+	if (result == 1 || evm_revalidate_status(xattr_name)) {
 		ima_reset_appraise_flags(d_backing_inode(dentry), 0);
-		result = 0;
+		if (result == 1)
+			result = 0;
 	}
 	return result;
 }
-- 
cgit v1.2.3


From cdef685be5b4ae55c3959289e72d520402839c29 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Fri, 14 May 2021 17:27:47 +0200
Subject: evm: Allow xattr/attr operations for portable signatures

If files with portable signatures are copied from one location to another
or are extracted from an archive, verification can temporarily fail until
all xattrs/attrs are set in the destination. Only portable signatures may
be moved or copied from one file to another, as they don't depend on
system-specific information such as the inode generation. Instead portable
signatures must include security.ima.

Unlike other security.evm types, EVM portable signatures are also
immutable. Thus, it wouldn't be a problem to allow xattr/attr operations
when verification fails, as portable signatures will never be replaced with
the HMAC on possibly corrupted xattrs/attrs.

This patch first introduces a new integrity status called
INTEGRITY_FAIL_IMMUTABLE, that allows callers of
evm_verify_current_integrity() to detect that a portable signature didn't
pass verification and then adds an exception in evm_protect_xattr() and
evm_inode_setattr() for this status and returns 0 instead of -EPERM.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/integrity.h             |  1 +
 security/integrity/evm/evm_main.c     | 33 +++++++++++++++++++++++++++------
 security/integrity/ima/ima_appraise.c |  2 ++
 3 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/integrity.h b/include/linux/integrity.h
index 2271939c5c31..2ea0f2f65ab6 100644
--- a/include/linux/integrity.h
+++ b/include/linux/integrity.h
@@ -13,6 +13,7 @@ enum integrity_status {
 	INTEGRITY_PASS = 0,
 	INTEGRITY_PASS_IMMUTABLE,
 	INTEGRITY_FAIL,
+	INTEGRITY_FAIL_IMMUTABLE,
 	INTEGRITY_NOLABEL,
 	INTEGRITY_NOXATTRS,
 	INTEGRITY_UNKNOWN,
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 4206c7e492ae..333524e879b5 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -27,7 +27,8 @@
 int evm_initialized;
 
 static const char * const integrity_status_msg[] = {
-	"pass", "pass_immutable", "fail", "no_label", "no_xattrs", "unknown"
+	"pass", "pass_immutable", "fail", "fail_immutable", "no_label",
+	"no_xattrs", "unknown"
 };
 int evm_hmac_attrs;
 
@@ -155,7 +156,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
 	enum integrity_status evm_status = INTEGRITY_PASS;
 	struct evm_digest digest;
 	struct inode *inode;
-	int rc, xattr_len;
+	int rc, xattr_len, evm_immutable = 0;
 
 	if (iint && (iint->evm_status == INTEGRITY_PASS ||
 		     iint->evm_status == INTEGRITY_PASS_IMMUTABLE))
@@ -200,8 +201,10 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
 		if (rc)
 			rc = -EINVAL;
 		break;
-	case EVM_IMA_XATTR_DIGSIG:
 	case EVM_XATTR_PORTABLE_DIGSIG:
+		evm_immutable = 1;
+		fallthrough;
+	case EVM_IMA_XATTR_DIGSIG:
 		/* accept xattr with non-empty signature field */
 		if (xattr_len <= sizeof(struct signature_v2_hdr)) {
 			evm_status = INTEGRITY_FAIL;
@@ -238,9 +241,14 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
 		break;
 	}
 
-	if (rc)
-		evm_status = (rc == -ENODATA) ?
-				INTEGRITY_NOXATTRS : INTEGRITY_FAIL;
+	if (rc) {
+		if (rc == -ENODATA)
+			evm_status = INTEGRITY_NOXATTRS;
+		else if (evm_immutable)
+			evm_status = INTEGRITY_FAIL_IMMUTABLE;
+		else
+			evm_status = INTEGRITY_FAIL;
+	}
 out:
 	if (iint)
 		iint->evm_status = evm_status;
@@ -380,6 +388,14 @@ out:
 	if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL ||
 	    evm_status == INTEGRITY_UNKNOWN))
 		return 0;
+
+	/*
+	 * Writing other xattrs is safe for portable signatures, as portable
+	 * signatures are immutable and can never be updated.
+	 */
+	if (evm_status == INTEGRITY_FAIL_IMMUTABLE)
+		return 0;
+
 	if (evm_status != INTEGRITY_PASS)
 		integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
 				    dentry->d_name.name, "appraise_metadata",
@@ -553,8 +569,13 @@ int evm_inode_setattr(struct dentry *dentry, struct iattr *attr)
 	if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
 		return 0;
 	evm_status = evm_verify_current_integrity(dentry);
+	/*
+	 * Writing attrs is safe for portable signatures, as portable signatures
+	 * are immutable and can never be updated.
+	 */
 	if ((evm_status == INTEGRITY_PASS) ||
 	    (evm_status == INTEGRITY_NOXATTRS) ||
+	    (evm_status == INTEGRITY_FAIL_IMMUTABLE) ||
 	    (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL ||
 	     evm_status == INTEGRITY_UNKNOWN)))
 		return 0;
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 03894769dffa..9bb351b933fb 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -416,6 +416,8 @@ int ima_appraise_measurement(enum ima_hooks func,
 	case INTEGRITY_NOLABEL:		/* No security.evm xattr. */
 		cause = "missing-HMAC";
 		goto out;
+	case INTEGRITY_FAIL_IMMUTABLE:
+		fallthrough;
 	case INTEGRITY_FAIL:		/* Invalid HMAC/signature. */
 		cause = "invalid-HMAC";
 		goto out;
-- 
cgit v1.2.3


From 7e135dc725417ecc0629afb4b3b24457d2a4869d Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Fri, 14 May 2021 17:27:48 +0200
Subject: evm: Pass user namespace to set/remove xattr hooks

In preparation for 'evm: Allow setxattr() and setattr() for unmodified
metadata', this patch passes mnt_userns to the inode set/remove xattr hooks
so that the GID of the inode on an idmapped mount is correctly determined
by posix_acl_update_mode().

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/evm.h               | 12 ++++++++----
 security/integrity/evm/evm_main.c | 17 +++++++++++------
 security/security.c               |  4 ++--
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/evm.h b/include/linux/evm.h
index 39bb17a8236b..31ef1dbbb3ac 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -23,13 +23,15 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry,
 					     struct integrity_iint_cache *iint);
 extern int evm_inode_setattr(struct dentry *dentry, struct iattr *attr);
 extern void evm_inode_post_setattr(struct dentry *dentry, int ia_valid);
-extern int evm_inode_setxattr(struct dentry *dentry, const char *name,
+extern int evm_inode_setxattr(struct user_namespace *mnt_userns,
+			      struct dentry *dentry, const char *name,
 			      const void *value, size_t size);
 extern void evm_inode_post_setxattr(struct dentry *dentry,
 				    const char *xattr_name,
 				    const void *xattr_value,
 				    size_t xattr_value_len);
-extern int evm_inode_removexattr(struct dentry *dentry, const char *xattr_name);
+extern int evm_inode_removexattr(struct user_namespace *mnt_userns,
+				 struct dentry *dentry, const char *xattr_name);
 extern void evm_inode_post_removexattr(struct dentry *dentry,
 				       const char *xattr_name);
 extern int evm_inode_init_security(struct inode *inode,
@@ -72,7 +74,8 @@ static inline void evm_inode_post_setattr(struct dentry *dentry, int ia_valid)
 	return;
 }
 
-static inline int evm_inode_setxattr(struct dentry *dentry, const char *name,
+static inline int evm_inode_setxattr(struct user_namespace *mnt_userns,
+				     struct dentry *dentry, const char *name,
 				     const void *value, size_t size)
 {
 	return 0;
@@ -86,7 +89,8 @@ static inline void evm_inode_post_setxattr(struct dentry *dentry,
 	return;
 }
 
-static inline int evm_inode_removexattr(struct dentry *dentry,
+static inline int evm_inode_removexattr(struct user_namespace *mnt_userns,
+					struct dentry *dentry,
 					const char *xattr_name)
 {
 	return 0;
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 333524e879b5..300df6906e05 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -342,7 +342,8 @@ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry)
  * For posix xattr acls only, permit security.evm, even if it currently
  * doesn't exist, to be updated unless the EVM signature is immutable.
  */
-static int evm_protect_xattr(struct dentry *dentry, const char *xattr_name,
+static int evm_protect_xattr(struct user_namespace *mnt_userns,
+			     struct dentry *dentry, const char *xattr_name,
 			     const void *xattr_value, size_t xattr_value_len)
 {
 	enum integrity_status evm_status;
@@ -406,6 +407,7 @@ out:
 
 /**
  * evm_inode_setxattr - protect the EVM extended attribute
+ * @mnt_userns: user namespace of the idmapped mount
  * @dentry: pointer to the affected dentry
  * @xattr_name: pointer to the affected extended attribute name
  * @xattr_value: pointer to the new extended attribute value
@@ -417,8 +419,9 @@ out:
  * userspace from writing HMAC value.  Writing 'security.evm' requires
  * requires CAP_SYS_ADMIN privileges.
  */
-int evm_inode_setxattr(struct dentry *dentry, const char *xattr_name,
-		       const void *xattr_value, size_t xattr_value_len)
+int evm_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       const char *xattr_name, const void *xattr_value,
+		       size_t xattr_value_len)
 {
 	const struct evm_ima_xattr_data *xattr_data = xattr_value;
 
@@ -435,19 +438,21 @@ int evm_inode_setxattr(struct dentry *dentry, const char *xattr_name,
 		    xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG)
 			return -EPERM;
 	}
-	return evm_protect_xattr(dentry, xattr_name, xattr_value,
+	return evm_protect_xattr(mnt_userns, dentry, xattr_name, xattr_value,
 				 xattr_value_len);
 }
 
 /**
  * evm_inode_removexattr - protect the EVM extended attribute
+ * @mnt_userns: user namespace of the idmapped mount
  * @dentry: pointer to the affected dentry
  * @xattr_name: pointer to the affected extended attribute name
  *
  * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that
  * the current value is valid.
  */
-int evm_inode_removexattr(struct dentry *dentry, const char *xattr_name)
+int evm_inode_removexattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, const char *xattr_name)
 {
 	/* Policy permits modification of the protected xattrs even though
 	 * there's no HMAC key loaded
@@ -455,7 +460,7 @@ int evm_inode_removexattr(struct dentry *dentry, const char *xattr_name)
 	if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
 		return 0;
 
-	return evm_protect_xattr(dentry, xattr_name, NULL, 0);
+	return evm_protect_xattr(mnt_userns, dentry, xattr_name, NULL, 0);
 }
 
 static void evm_reset_status(struct inode *inode)
diff --git a/security/security.c b/security/security.c
index b38155b2de83..e9f8010a2341 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1354,7 +1354,7 @@ int security_inode_setxattr(struct user_namespace *mnt_userns,
 	ret = ima_inode_setxattr(dentry, name, value, size);
 	if (ret)
 		return ret;
-	return evm_inode_setxattr(dentry, name, value, size);
+	return evm_inode_setxattr(mnt_userns, dentry, name, value, size);
 }
 
 void security_inode_post_setxattr(struct dentry *dentry, const char *name,
@@ -1399,7 +1399,7 @@ int security_inode_removexattr(struct user_namespace *mnt_userns,
 	ret = ima_inode_removexattr(dentry, name);
 	if (ret)
 		return ret;
-	return evm_inode_removexattr(dentry, name);
+	return evm_inode_removexattr(mnt_userns, dentry, name);
 }
 
 int security_inode_need_killpriv(struct dentry *dentry)
-- 
cgit v1.2.3


From 24bb0076d7bc0ea4caf0af55bd0273a1c343748a Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Mon, 17 May 2021 17:40:20 +0800
Subject: usb: fix spelling mistakes in header files

Fix some spelling mistakes in comments:
trasfer ==> transfer
consumtion ==> consumption
endoint ==> endpoint
sharable ==> shareable
contraints ==> constraints
Auxilary ==> Auxiliary
correspondig ==> corresponding
interupt ==> interrupt
inifinite ==> infinite
assignement ==> assignment

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Link: https://lore.kernel.org/r/20210517094020.7310-1-thunder.leizhen@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h           | 2 +-
 include/linux/usb/composite.h | 2 +-
 include/linux/usb/gadget.h    | 2 +-
 include/linux/usb/hcd.h       | 4 ++--
 include/linux/usb/otg-fsm.h   | 6 +++---
 include/linux/usb/otg.h       | 2 +-
 include/linux/usb/quirks.h    | 2 +-
 include/linux/usb/serial.h    | 2 +-
 include/linux/usb/typec_dp.h  | 2 +-
 9 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index eaae24217e8a..4db6b824af5c 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1485,7 +1485,7 @@ typedef void (*usb_complete_t)(struct urb *);
  *
  * Note that transfer_buffer must still be set if the controller
  * does not support DMA (as indicated by hcd_uses_dma()) and when talking
- * to root hub. If you have to trasfer between highmem zone and the device
+ * to root hub. If you have to transfer between highmem zone and the device
  * on such controller, create a bounce buffer or bail out with an error.
  * If transfer_buffer cannot be set (is in highmem) and the controller is DMA
  * capable, assign NULL to it, so that usbmon knows not to use the value.
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index c71150f2c639..9d2762279286 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -271,7 +271,7 @@ int config_ep_by_speed(struct usb_gadget *g, struct usb_function *f,
  * @bConfigurationValue: Copied into configuration descriptor.
  * @iConfiguration: Copied into configuration descriptor.
  * @bmAttributes: Copied into configuration descriptor.
- * @MaxPower: Power consumtion in mA. Used to compute bMaxPower in the
+ * @MaxPower: Power consumption in mA. Used to compute bMaxPower in the
  *	configuration descriptor after considering the bus speed.
  * @cdev: assigned by @usb_add_config() before calling @bind(); this is
  *	the device associated with this configuration.
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index ee04ef214ce8..8811eb96e5cc 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -197,7 +197,7 @@ struct usb_ep_caps {
  * @name:identifier for the endpoint, such as "ep-a" or "ep9in-bulk"
  * @ops: Function pointers used to access hardware-specific operations.
  * @ep_list:the gadget's ep_list holds all of its endpoints
- * @caps:The structure describing types and directions supported by endoint.
+ * @caps:The structure describing types and directions supported by endpoint.
  * @enabled: The current endpoint enabled/disabled state.
  * @claimed: True if this endpoint is claimed by a function.
  * @maxpacket:The maximum packet size used on this endpoint.  The initial
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 22c5d1c0acf3..548a028f2dab 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -59,7 +59,7 @@
  * USB Host Controller Driver (usb_hcd) framework
  *
  * Since "struct usb_bus" is so thin, you can't share much code in it.
- * This framework is a layer over that, and should be more sharable.
+ * This framework is a layer over that, and should be more shareable.
  */
 
 /*-------------------------------------------------------------------------*/
@@ -299,7 +299,7 @@ struct hc_driver {
 	 * (optional) these hooks allow an HCD to override the default DMA
 	 * mapping and unmapping routines.  In general, they shouldn't be
 	 * necessary unless the host controller has special DMA requirements,
-	 * such as alignment contraints.  If these are not specified, the
+	 * such as alignment constraints.  If these are not specified, the
 	 * general usb_hcd_(un)?map_urb_for_dma functions will be used instead
 	 * (and it may be a good idea to call these functions in your HCD
 	 * implementation)
diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h
index e78eb577d0fa..3aee78dda16d 100644
--- a/include/linux/usb/otg-fsm.h
+++ b/include/linux/usb/otg-fsm.h
@@ -98,7 +98,7 @@ enum otg_fsm_timer {
  * @b_bus_req:	TRUE during the time that the Application running on the
  *		B-device wants to use the bus
  *
- *	Auxilary inputs (OTG v1.3 only. Obsolete now.)
+ *	Auxiliary inputs (OTG v1.3 only. Obsolete now.)
  * @a_sess_vld:	TRUE if the A-device detects that VBUS is above VA_SESS_VLD
  * @b_bus_suspend: TRUE when the A-device detects that the B-device has put
  *		the bus into suspend
@@ -153,7 +153,7 @@ struct otg_fsm {
 	int a_bus_req;
 	int b_bus_req;
 
-	/* Auxilary inputs */
+	/* Auxiliary inputs */
 	int a_sess_vld;
 	int b_bus_resume;
 	int b_bus_suspend;
@@ -177,7 +177,7 @@ struct otg_fsm {
 	int a_bus_req_inf;
 	int a_clr_err_inf;
 	int b_bus_req_inf;
-	/* Auxilary informative variables */
+	/* Auxiliary informative variables */
 	int a_suspend_req_inf;
 
 	/* Timeout indicator for timers */
diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h
index 69f1b6328532..7ceeecbb9e02 100644
--- a/include/linux/usb/otg.h
+++ b/include/linux/usb/otg.h
@@ -125,7 +125,7 @@ enum usb_dr_mode {
  * @dev: Pointer to the given device
  *
  * The function gets phy interface string from property 'dr_mode',
- * and returns the correspondig enum usb_dr_mode
+ * and returns the corresponding enum usb_dr_mode
  */
 extern enum usb_dr_mode usb_get_dr_mode(struct device *dev);
 
diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index 5e4c497f54d6..eeb7c2157c72 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h
@@ -32,7 +32,7 @@
 #define USB_QUIRK_DELAY_INIT			BIT(6)
 
 /*
- * For high speed and super speed interupt endpoints, the USB 2.0 and
+ * For high speed and super speed interrupt endpoints, the USB 2.0 and
  * USB 3.0 spec require the interval in microframes
  * (1 microframe = 125 microseconds) to be calculated as
  * interval = 2 ^ (bInterval-1).
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 8c63fa9bfc74..b81eb604e092 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -395,7 +395,7 @@ static inline void usb_serial_debug_data(struct device *dev,
 }
 
 /*
- * Macro for reporting errors in write path to avoid inifinite loop
+ * Macro for reporting errors in write path to avoid infinite loop
  * when port is used as a console.
  */
 #define dev_err_console(usport, fmt, ...)				\
diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h
index fc4c7edb2e8a..cfb916cccd31 100644
--- a/include/linux/usb/typec_dp.h
+++ b/include/linux/usb/typec_dp.h
@@ -97,7 +97,7 @@ enum {
 #define DP_CONF_PIN_ASSIGNEMENT_SHIFT	8
 #define DP_CONF_PIN_ASSIGNEMENT_MASK	GENMASK(15, 8)
 
-/* Helper for setting/getting the pin assignement value to the configuration */
+/* Helper for setting/getting the pin assignment value to the configuration */
 #define DP_CONF_SET_PIN_ASSIGN(_a_)	((_a_) << 8)
 #define DP_CONF_GET_PIN_ASSIGN(_conf_)	(((_conf_) & GENMASK(15, 8)) >> 8)
 
-- 
cgit v1.2.3


From f9a88370e6751c68a8f0d1c3f23100ca20596249 Mon Sep 17 00:00:00 2001
From: Rui Miguel Silva <rui.silva@linaro.org>
Date: Thu, 13 May 2021 09:47:12 +0100
Subject: usb: isp1760: remove platform data struct and code

Since the removal of the Blackfin port with:
commit 4ba66a976072 ("arch: remove blackfin port")

No one is using or referencing this header and platform data struct.
Remove them.

Signed-off-by: Rui Miguel Silva <rui.silva@linaro.org>
Link: https://lore.kernel.org/r/20210513084717.2487366-5-rui.silva@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/isp1760/isp1760-if.c | 20 +++-----------------
 include/linux/usb/isp1760.h      | 19 -------------------
 2 files changed, 3 insertions(+), 36 deletions(-)
 delete mode 100644 include/linux/usb/isp1760.h

(limited to 'include/linux')

diff --git a/drivers/usb/isp1760/isp1760-if.c b/drivers/usb/isp1760/isp1760-if.c
index abfba9f5ec23..fb6701608cd8 100644
--- a/drivers/usb/isp1760/isp1760-if.c
+++ b/drivers/usb/isp1760/isp1760-if.c
@@ -16,7 +16,6 @@
 #include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
-#include <linux/usb/isp1760.h>
 #include <linux/usb/hcd.h>
 
 #include "isp1760-core.h"
@@ -225,22 +224,9 @@ static int isp1760_plat_probe(struct platform_device *pdev)
 
 		if (of_property_read_bool(dp, "dreq-polarity"))
 			devflags |= ISP1760_FLAG_DREQ_POL_HIGH;
-	} else if (dev_get_platdata(&pdev->dev)) {
-		struct isp1760_platform_data *pdata =
-			dev_get_platdata(&pdev->dev);
-
-		if (pdata->is_isp1761)
-			devflags |= ISP1760_FLAG_ISP1761;
-		if (pdata->bus_width_16)
-			devflags |= ISP1760_FLAG_BUS_WIDTH_16;
-		if (pdata->port1_otg)
-			devflags |= ISP1760_FLAG_OTG_EN;
-		if (pdata->analog_oc)
-			devflags |= ISP1760_FLAG_ANALOG_OC;
-		if (pdata->dack_polarity_high)
-			devflags |= ISP1760_FLAG_DACK_POL_HIGH;
-		if (pdata->dreq_polarity_high)
-			devflags |= ISP1760_FLAG_DREQ_POL_HIGH;
+	} else {
+		pr_err("isp1760: no platform data\n");
+		return -ENXIO;
 	}
 
 	ret = isp1760_register(mem_res, irq_res->start, irqflags, &pdev->dev,
diff --git a/include/linux/usb/isp1760.h b/include/linux/usb/isp1760.h
deleted file mode 100644
index b75ded28db81..000000000000
--- a/include/linux/usb/isp1760.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * board initialization should put one of these into dev->platform_data
- * and place the isp1760 onto platform_bus named "isp1760-hcd".
- */
-
-#ifndef __LINUX_USB_ISP1760_H
-#define __LINUX_USB_ISP1760_H
-
-struct isp1760_platform_data {
-	unsigned is_isp1761:1;			/* Chip is ISP1761 */
-	unsigned bus_width_16:1;		/* 16/32-bit data bus width */
-	unsigned port1_otg:1;			/* Port 1 supports OTG */
-	unsigned analog_oc:1;			/* Analog overcurrent */
-	unsigned dack_polarity_high:1;		/* DACK active high */
-	unsigned dreq_polarity_high:1;		/* DREQ active high */
-};
-
-#endif /* __LINUX_USB_ISP1760_H */
-- 
cgit v1.2.3


From 59d4d06c8ab0375dcc4bab329e6ecd44dd46373e Mon Sep 17 00:00:00 2001
From: Badhri Jagan Sridharan <badhri@google.com>
Date: Mon, 17 May 2021 12:21:11 -0700
Subject: usb: typec: tcpm: Move TCPC to APPLY_RC state during PR_SWAP

When vbus auto discharge is enabled, TCPCI based TCPC transitions
into Attached.SNK/Attached.SRC state. During PR_SWAP, TCPCI based
TCPC would disconnect when partner changes power roles. TCPC has
to be moved APPLY RC state during PR_SWAP. This is done by
ROLE_CONTROL.CC1 != ROLE_CONTROL.CC2 and
POWER_CONTROL.AutodischargeDisconnect is 0. Once the swap sequence
is done, AutoDischargeDisconnect is re-enabled.

Fixes: f321a02caebd ("usb: typec: tcpm: Implement enabling Auto Discharge disconnect support")
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Badhri Jagan Sridharan <badhri@google.com>
Link: https://lore.kernel.org/r/20210517192112.40934-3-badhri@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c | 16 ++++++++++++++++
 include/linux/usb/tcpm.h      |  4 ++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 3551e3304c7e..3feaf5d6419e 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -789,6 +789,19 @@ static int tcpm_enable_auto_vbus_discharge(struct tcpm_port *port, bool enable)
 	return ret;
 }
 
+static void tcpm_apply_rc(struct tcpm_port *port)
+{
+	/*
+	 * TCPCI: Move to APPLY_RC state to prevent disconnect during PR_SWAP
+	 * when Vbus auto discharge on disconnect is enabled.
+	 */
+	if (port->tcpc->enable_auto_vbus_discharge && port->tcpc->apply_rc) {
+		tcpm_log(port, "Apply_RC");
+		port->tcpc->apply_rc(port->tcpc, port->cc_req, port->polarity);
+		tcpm_enable_auto_vbus_discharge(port, false);
+	}
+}
+
 /*
  * Determine RP value to set based on maximum current supported
  * by a port if configured as source.
@@ -4469,6 +4482,7 @@ static void run_state_machine(struct tcpm_port *port)
 		tcpm_set_state(port, ready_state(port), 0);
 		break;
 	case PR_SWAP_START:
+		tcpm_apply_rc(port);
 		if (port->pwr_role == TYPEC_SOURCE)
 			tcpm_set_state(port, PR_SWAP_SRC_SNK_TRANSITION_OFF,
 				       PD_T_SRC_TRANSITION);
@@ -4508,6 +4522,7 @@ static void run_state_machine(struct tcpm_port *port)
 		tcpm_set_state(port, ERROR_RECOVERY, PD_T_PS_SOURCE_ON_PRS);
 		break;
 	case PR_SWAP_SRC_SNK_SINK_ON:
+		tcpm_enable_auto_vbus_discharge(port, true);
 		/* Set the vbus disconnect threshold for implicit contract */
 		tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, false, VSAFE5V);
 		tcpm_set_state(port, SNK_STARTUP, 0);
@@ -4524,6 +4539,7 @@ static void run_state_machine(struct tcpm_port *port)
 			       PD_T_PS_SOURCE_OFF);
 		break;
 	case PR_SWAP_SNK_SRC_SOURCE_ON:
+		tcpm_enable_auto_vbus_discharge(port, true);
 		tcpm_set_cc(port, tcpm_rp_cc(port));
 		tcpm_set_vbus(port, true);
 		/*
diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h
index 42fcfbe10590..bffc8d3e14ad 100644
--- a/include/linux/usb/tcpm.h
+++ b/include/linux/usb/tcpm.h
@@ -66,6 +66,8 @@ enum tcpm_transmit_type {
  *		For example, some tcpcs may include BC1.2 charger detection
  *		and use that in this case.
  * @set_cc:	Called to set value of CC pins
+ * @apply_rc:	Optional; Needed to move TCPCI based chipset to APPLY_RC state
+ *		as stated by the TCPCI specification.
  * @get_cc:	Called to read current CC pin values
  * @set_polarity:
  *		Called to set polarity
@@ -120,6 +122,8 @@ struct tcpc_dev {
 	int (*get_vbus)(struct tcpc_dev *dev);
 	int (*get_current_limit)(struct tcpc_dev *dev);
 	int (*set_cc)(struct tcpc_dev *dev, enum typec_cc_status cc);
+	int (*apply_rc)(struct tcpc_dev *dev, enum typec_cc_status cc,
+			enum typec_cc_polarity polarity);
 	int (*get_cc)(struct tcpc_dev *dev, enum typec_cc_status *cc1,
 		      enum typec_cc_status *cc2);
 	int (*set_polarity)(struct tcpc_dev *dev,
-- 
cgit v1.2.3


From 393b06383fb77a006a29eb1574474d468e8c868b Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 21 May 2021 20:45:19 +0200
Subject: debugfs: remove return value of debugfs_create_bool()

No one checks the return value of debugfs_create_bool(), as it's not
needed, so make the return value void, so that no one tries to do so in
the future.

Link: https://lore.kernel.org/r/20210521184519.1356639-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/filesystems/debugfs.rst |  4 ++--
 fs/debugfs/file.c                     | 15 +++------------
 include/linux/debugfs.h               | 12 ++++--------
 3 files changed, 9 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/debugfs.rst b/Documentation/filesystems/debugfs.rst
index 0f2292e367e6..71b1fee56d2a 100644
--- a/Documentation/filesystems/debugfs.rst
+++ b/Documentation/filesystems/debugfs.rst
@@ -120,8 +120,8 @@ and hexadecimal::
 
 Boolean values can be placed in debugfs with::
 
-    struct dentry *debugfs_create_bool(const char *name, umode_t mode,
-				       struct dentry *parent, bool *value);
+    void debugfs_create_bool(const char *name, umode_t mode,
+                             struct dentry *parent, bool *value);
 
 A read on the resulting file will yield either Y (for non-zero values) or
 N, followed by a newline.  If written to, it will accept either upper- or
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 54f827339c38..d4ab31e39027 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -846,20 +846,11 @@ static const struct file_operations fops_bool_wo = {
  * This function creates a file in debugfs with the given name that
  * contains the value of the variable @value.  If the @mode variable is so
  * set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds.  This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
  */
-struct dentry *debugfs_create_bool(const char *name, umode_t mode,
-				   struct dentry *parent, bool *value)
+void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent,
+			 bool *value)
 {
-	return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool,
+	debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool,
 				   &fops_bool_ro, &fops_bool_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_bool);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 1fdb4343af9c..53150803eb7c 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -126,8 +126,8 @@ void debugfs_create_size_t(const char *name, umode_t mode,
 			   struct dentry *parent, size_t *value);
 void debugfs_create_atomic_t(const char *name, umode_t mode,
 			     struct dentry *parent, atomic_t *value);
-struct dentry *debugfs_create_bool(const char *name, umode_t mode,
-				  struct dentry *parent, bool *value);
+void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent,
+			 bool *value);
 void debugfs_create_str(const char *name, umode_t mode,
 			struct dentry *parent, char **value);
 
@@ -295,12 +295,8 @@ static inline void debugfs_create_atomic_t(const char *name, umode_t mode,
 					   atomic_t *value)
 { }
 
-static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
-						 struct dentry *parent,
-						 bool *value)
-{
-	return ERR_PTR(-ENODEV);
-}
+static inline void debugfs_create_bool(const char *name, umode_t mode,
+				       struct dentry *parent, bool *value) { }
 
 static inline void debugfs_create_str(const char *name, umode_t mode,
 				      struct dentry *parent,
-- 
cgit v1.2.3


From fb05b14c5b99a7a462d6e733155e4b2e80e28646 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 21 May 2021 20:43:40 +0200
Subject: debugfs: remove return value of debugfs_create_ulong()

No one checks the return value of debugfs_create_ulong(), as it's not
needed, so make the return value void, so that no one tries to do so in
the future.

Link: https://lore.kernel.org/r/20210521184340.1348539-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c       | 18 ++++--------------
 include/linux/debugfs.h | 14 +++++---------
 2 files changed, 9 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index d4ab31e39027..fb0c102847d5 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -582,22 +582,12 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
  * This function creates a file in debugfs with the given name that
  * contains the value of the variable @value.  If the @mode variable is so
  * set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds.  This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
  */
-struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
-				    struct dentry *parent, unsigned long *value)
+void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent,
+			  unsigned long *value)
 {
-	return debugfs_create_mode_unsafe(name, mode, parent, value,
-					&fops_ulong, &fops_ulong_ro,
-					&fops_ulong_wo);
+	debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong,
+				   &fops_ulong_ro, &fops_ulong_wo);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_ulong);
 
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 53150803eb7c..c869f1e73d75 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -112,8 +112,8 @@ void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent,
 			u32 *value);
 void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent,
 			u64 *value);
-struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
-				    struct dentry *parent, unsigned long *value);
+void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent,
+			  unsigned long *value);
 void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent,
 		       u8 *value);
 void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent,
@@ -266,13 +266,9 @@ static inline void debugfs_create_u32(const char *name, umode_t mode,
 static inline void debugfs_create_u64(const char *name, umode_t mode,
 				      struct dentry *parent, u64 *value) { }
 
-static inline struct dentry *debugfs_create_ulong(const char *name,
-						umode_t mode,
-						struct dentry *parent,
-						unsigned long *value)
-{
-	return ERR_PTR(-ENODEV);
-}
+static inline void debugfs_create_ulong(const char *name, umode_t mode,
+					struct dentry *parent,
+					unsigned long *value) { }
 
 static inline void debugfs_create_x8(const char *name, umode_t mode,
 				     struct dentry *parent, u8 *value) { }
-- 
cgit v1.2.3


From 80dd33cf72d1ab4f0af303f1fa242c6d6c8d328f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 14 May 2021 14:10:15 +0200
Subject: drivers: base: Fix device link removal

When device_link_free() drops references to the supplier and
consumer devices of the device link going away and the reference
being dropped turns out to be the last one for any of those
device objects, its ->release callback will be invoked and it
may sleep which goes against the SRCU callback execution
requirements.

To address this issue, make the device link removal code carry out
the device_link_free() actions preceded by SRCU synchronization from
a separate work item (the "long" workqueue is used for that, because
it does not matter when the device link memory is released and it may
take time to get to that point) instead of using SRCU callbacks.

While at it, make the code work analogously when SRCU is not enabled
to reduce the differences between the SRCU and non-SRCU cases.

Fixes: 843e600b8a2b ("driver core: Fix sleeping in invalid context during device link deletion")
Cc: stable <stable@vger.kernel.org>
Reported-by: chenxiang (M) <chenxiang66@hisilicon.com>
Tested-by: chenxiang (M) <chenxiang66@hisilicon.com>
Reviewed-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/5722787.lOV4Wx5bFT@kreacher
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 37 +++++++++++++++++++++++--------------
 include/linux/device.h |  6 ++----
 2 files changed, 25 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 628e33939aca..61c19641e1d0 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -194,6 +194,11 @@ int device_links_read_lock_held(void)
 {
 	return srcu_read_lock_held(&device_links_srcu);
 }
+
+static void device_link_synchronize_removal(void)
+{
+	synchronize_srcu(&device_links_srcu);
+}
 #else /* !CONFIG_SRCU */
 static DECLARE_RWSEM(device_links_lock);
 
@@ -224,6 +229,10 @@ int device_links_read_lock_held(void)
 	return lockdep_is_held(&device_links_lock);
 }
 #endif
+
+static inline void device_link_synchronize_removal(void)
+{
+}
 #endif /* !CONFIG_SRCU */
 
 static bool device_is_ancestor(struct device *dev, struct device *target)
@@ -445,8 +454,13 @@ static struct attribute *devlink_attrs[] = {
 };
 ATTRIBUTE_GROUPS(devlink);
 
-static void device_link_free(struct device_link *link)
+static void device_link_release_fn(struct work_struct *work)
 {
+	struct device_link *link = container_of(work, struct device_link, rm_work);
+
+	/* Ensure that all references to the link object have been dropped. */
+	device_link_synchronize_removal();
+
 	while (refcount_dec_not_one(&link->rpm_active))
 		pm_runtime_put(link->supplier);
 
@@ -455,24 +469,19 @@ static void device_link_free(struct device_link *link)
 	kfree(link);
 }
 
-#ifdef CONFIG_SRCU
-static void __device_link_free_srcu(struct rcu_head *rhead)
-{
-	device_link_free(container_of(rhead, struct device_link, rcu_head));
-}
-
 static void devlink_dev_release(struct device *dev)
 {
 	struct device_link *link = to_devlink(dev);
 
-	call_srcu(&device_links_srcu, &link->rcu_head, __device_link_free_srcu);
-}
-#else
-static void devlink_dev_release(struct device *dev)
-{
-	device_link_free(to_devlink(dev));
+	INIT_WORK(&link->rm_work, device_link_release_fn);
+	/*
+	 * It may take a while to complete this work because of the SRCU
+	 * synchronization in device_link_release_fn() and if the consumer or
+	 * supplier devices get deleted when it runs, so put it into the "long"
+	 * workqueue.
+	 */
+	queue_work(system_long_wq, &link->rm_work);
 }
-#endif
 
 static struct class devlink_class = {
 	.name = "devlink",
diff --git a/include/linux/device.h b/include/linux/device.h
index 38a2071cf776..f1a00040fa53 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -570,7 +570,7 @@ struct device {
  * @flags: Link flags.
  * @rpm_active: Whether or not the consumer device is runtime-PM-active.
  * @kref: Count repeated addition of the same link.
- * @rcu_head: An RCU head to use for deferred execution of SRCU callbacks.
+ * @rm_work: Work structure used for removing the link.
  * @supplier_preactivated: Supplier has been made active before consumer probe.
  */
 struct device_link {
@@ -583,9 +583,7 @@ struct device_link {
 	u32 flags;
 	refcount_t rpm_active;
 	struct kref kref;
-#ifdef CONFIG_SRCU
-	struct rcu_head rcu_head;
-#endif
+	struct work_struct rm_work;
 	bool supplier_preactivated; /* Owned by consumer probe. */
 };
 
-- 
cgit v1.2.3


From 46ad057245912fc8a49e18f6f8b57f80ab8d4dc1 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 19 May 2021 18:33:14 +0200
Subject: sysfs: Add helper BIN_ATTRIBUTE_GROUPS

New helper BIN_ATTRIBUTE_GROUPS() does the same as ATTRIBUTE_GROUPS(),
just for binary attributes.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/e20db248-ed30-cf5d-a37c-b538dceaa5b2@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index d76a1ddf83a3..a12556a4b93a 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -162,6 +162,12 @@ static const struct attribute_group _name##_group = {		\
 };								\
 __ATTRIBUTE_GROUPS(_name)
 
+#define BIN_ATTRIBUTE_GROUPS(_name)				\
+static const struct attribute_group _name##_group = {		\
+	.bin_attrs = _name##_attrs,				\
+};								\
+__ATTRIBUTE_GROUPS(_name)
+
 struct file;
 struct vm_area_struct;
 struct address_space;
-- 
cgit v1.2.3


From 7ee3e97e00a3893e354c3993c3f7d9dc127e9c5e Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Thu, 13 May 2021 09:07:51 +0000
Subject: kprobes: Allow architectures to override optinsn page allocation

Some architectures like powerpc require a non standard
allocation of optinsn page, because module pages are
too far from the kernel for direct branches.

Define weak alloc_optinsn_page() and free_optinsn_page(), that
fall back on alloc_insn_page() and free_insn_page() when not
overridden by the architecture.

Suggested-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/40a43d6df1fdf41ade36e9a46e60a4df774ca9f6.1620896780.git.christophe.leroy@csgroup.eu
---
 include/linux/kprobes.h |  3 +++
 kernel/kprobes.c        | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 1883a4a9f16a..02d4020615a7 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -409,6 +409,9 @@ void dump_kprobe(struct kprobe *kp);
 void *alloc_insn_page(void);
 void free_insn_page(void *page);
 
+void *alloc_optinsn_page(void);
+void free_optinsn_page(void *page);
+
 int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		       char *sym);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 745f08fdd7a6..8c0a6fdef771 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -321,11 +321,21 @@ int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
 }
 
 #ifdef CONFIG_OPTPROBES
+void __weak *alloc_optinsn_page(void)
+{
+	return alloc_insn_page();
+}
+
+void __weak free_optinsn_page(void *page)
+{
+	free_insn_page(page);
+}
+
 /* For optimized_kprobe buffer */
 struct kprobe_insn_cache kprobe_optinsn_slots = {
 	.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
-	.alloc = alloc_insn_page,
-	.free = free_insn_page,
+	.alloc = alloc_optinsn_page,
+	.free = free_optinsn_page,
 	.sym = KPROBE_OPTINSN_PAGE_SYM,
 	.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
 	/* .insn_size is initialized later */
-- 
cgit v1.2.3


From 0514582a1a5b4ac1a3fd64792826d392d7ae9ddc Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Sun, 23 May 2021 15:10:44 +0800
Subject: regulator: bd70528: Fix off-by-one for buck123 .n_voltages setting

The valid selectors for bd70528 bucks are 0 ~ 0xf, so the .n_voltages
should be 16 (0x10). Use 0x10 to make it consistent with BD70528_LDO_VOLTS.
Also remove redundant defines for BD70528_BUCK_VOLTS.

Signed-off-by: Axel Lin <axel.lin@ingics.com>
Acked-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/20210523071045.2168904-1-axel.lin@ingics.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/rohm-bd70528.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-bd70528.h b/include/linux/mfd/rohm-bd70528.h
index a57af878fd0c..4a5966475a35 100644
--- a/include/linux/mfd/rohm-bd70528.h
+++ b/include/linux/mfd/rohm-bd70528.h
@@ -26,9 +26,7 @@ struct bd70528_data {
 	struct mutex rtc_timer_lock;
 };
 
-#define BD70528_BUCK_VOLTS 17
-#define BD70528_BUCK_VOLTS 17
-#define BD70528_BUCK_VOLTS 17
+#define BD70528_BUCK_VOLTS 0x10
 #define BD70528_LDO_VOLTS 0x20
 
 #define BD70528_REG_BUCK1_EN	0x0F
-- 
cgit v1.2.3


From 4c668630bf8ea90a041fc69c9984486e0f56682d Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Sun, 23 May 2021 15:10:45 +0800
Subject: regulator: bd71828: Fix .n_voltages settings

Current .n_voltages settings do not cover the latest 2 valid selectors,
so it fails to set voltage for the hightest voltage support.
The latest linear range has step_uV = 0, so it does not matter if we
count the .n_voltages to maximum selector + 1 or the first selector of
latest linear range + 1.
To simplify calculating the n_voltages, let's just set the
.n_voltages to maximum selector + 1.

Fixes: 522498f8cb8c ("regulator: bd71828: Basic support for ROHM bd71828 PMIC regulators")
Signed-off-by: Axel Lin <axel.lin@ingics.com>
Reviewed-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/20210523071045.2168904-2-axel.lin@ingics.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/rohm-bd71828.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h
index 017a4c01cb31..61f0974c33d7 100644
--- a/include/linux/mfd/rohm-bd71828.h
+++ b/include/linux/mfd/rohm-bd71828.h
@@ -26,11 +26,11 @@ enum {
 	BD71828_REGULATOR_AMOUNT,
 };
 
-#define BD71828_BUCK1267_VOLTS		0xEF
-#define BD71828_BUCK3_VOLTS		0x10
-#define BD71828_BUCK4_VOLTS		0x20
-#define BD71828_BUCK5_VOLTS		0x10
-#define BD71828_LDO_VOLTS		0x32
+#define BD71828_BUCK1267_VOLTS		0x100
+#define BD71828_BUCK3_VOLTS		0x20
+#define BD71828_BUCK4_VOLTS		0x40
+#define BD71828_BUCK5_VOLTS		0x20
+#define BD71828_LDO_VOLTS		0x40
 /* LDO6 is fixed 1.8V voltage */
 #define BD71828_LDO_6_VOLTAGE		1800000
 
-- 
cgit v1.2.3


From 42a7dfa26fc6df1624d7c2955200e5053dd0b818 Mon Sep 17 00:00:00 2001
From: David Bauer <mail@david-bauer.net>
Date: Sat, 22 May 2021 09:44:52 +0200
Subject: spi: ath79: drop platform data

The ath79 platform has been converted to pure OF. The platform data is
not needed anymore because of this.

Signed-off-by: David Bauer <mail@david-bauer.net>
Link: https://lore.kernel.org/r/20210522074453.39299-1-mail@david-bauer.net
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-ath79.c                 |  8 --------
 include/linux/platform_data/spi-ath79.h | 16 ----------------
 2 files changed, 24 deletions(-)
 delete mode 100644 include/linux/platform_data/spi-ath79.h

(limited to 'include/linux')

diff --git a/drivers/spi/spi-ath79.c b/drivers/spi/spi-ath79.c
index 98ace748cd98..497d5c028496 100644
--- a/drivers/spi/spi-ath79.c
+++ b/drivers/spi/spi-ath79.c
@@ -19,7 +19,6 @@
 #include <linux/bitops.h>
 #include <linux/clk.h>
 #include <linux/err.h>
-#include <linux/platform_data/spi-ath79.h>
 
 #define DRV_NAME	"ath79-spi"
 
@@ -138,7 +137,6 @@ static int ath79_spi_probe(struct platform_device *pdev)
 {
 	struct spi_master *master;
 	struct ath79_spi *sp;
-	struct ath79_spi_platform_data *pdata;
 	unsigned long rate;
 	int ret;
 
@@ -152,15 +150,9 @@ static int ath79_spi_probe(struct platform_device *pdev)
 	master->dev.of_node = pdev->dev.of_node;
 	platform_set_drvdata(pdev, sp);
 
-	pdata = dev_get_platdata(&pdev->dev);
-
 	master->use_gpio_descriptors = true;
 	master->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32);
 	master->flags = SPI_MASTER_GPIO_SS;
-	if (pdata) {
-		master->bus_num = pdata->bus_num;
-		master->num_chipselect = pdata->num_chipselect;
-	}
 
 	sp->bitbang.master = master;
 	sp->bitbang.chipselect = ath79_spi_chipselect;
diff --git a/include/linux/platform_data/spi-ath79.h b/include/linux/platform_data/spi-ath79.h
deleted file mode 100644
index 81a388ff58cc..000000000000
--- a/include/linux/platform_data/spi-ath79.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- *  Platform data definition for Atheros AR71XX/AR724X/AR913X SPI controller
- *
- *  Copyright (C) 2008-2010 Gabor Juhos <juhosg@openwrt.org>
- */
-
-#ifndef _ATH79_SPI_PLATFORM_H
-#define _ATH79_SPI_PLATFORM_H
-
-struct ath79_spi_platform_data {
-	unsigned	bus_num;
-	unsigned	num_chipselect;
-};
-
-#endif /* _ATH79_SPI_PLATFORM_H */
-- 
cgit v1.2.3


From 3af3d772f7216cf23081bb4176e86f1219d32ebc Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Sat, 13 Mar 2021 11:01:45 +0800
Subject: block_dump: remove block_dump feature

We have already delete block_dump feature in mark_inode_dirty() because
it can be replaced by tracepoints, now we also remove the part in
submit_bio() for the same reason. The part of block dump feature in
submit_bio() dump the write process, write region and sectors on the
target disk into kernel message. it can be replaced by
block_bio_queue tracepoint in submit_bio_checks(), so we do not need
block_dump anymore, remove the whole block_dump feature.

Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210313030146.2882027-3-yi.zhang@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c          | 9 ---------
 include/linux/writeback.h | 1 -
 kernel/sysctl.c           | 8 --------
 mm/page-writeback.c       | 5 -----
 4 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 9bcdae93f6d4..689aac2625d2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1086,15 +1086,6 @@ blk_qc_t submit_bio(struct bio *bio)
 			task_io_account_read(bio->bi_iter.bi_size);
 			count_vm_events(PGPGIN, count);
 		}
-
-		if (unlikely(block_dump)) {
-			char b[BDEVNAME_SIZE];
-			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
-			current->comm, task_pid_nr(current),
-				op_is_write(bio_op(bio)) ? "WRITE" : "READ",
-				(unsigned long long)bio->bi_iter.bi_sector,
-				bio_devname(bio, b), count);
-		}
 	}
 
 	/*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8e5c5bb16e2d..9ef50176f3a1 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -360,7 +360,6 @@ extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
 extern unsigned int dirtytime_expire_interval;
 extern int vm_highmem_is_dirtyable;
-extern int block_dump;
 extern int laptop_mode;
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 14edf84cc571..08e52b1090e9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2931,14 +2931,6 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	{
-		.procname	= "block_dump",
-		.data		= &block_dump,
-		.maxlen		= sizeof(block_dump),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-	},
 	{
 		.procname	= "vfs_cache_pressure",
 		.data		= &sysctl_vfs_cache_pressure,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0062d5c57d41..fe72d5f65688 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -108,11 +108,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval);
  */
 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
 
-/*
- * Flag that makes the machine dump writes/reads and block dirtyings.
- */
-int block_dump;
-
 /*
  * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
  * a full sync is triggered after this time elapses without any disk activity.
-- 
cgit v1.2.3


From d97e594c51660bea510a387731637b894651e4b5 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 13 May 2021 20:00:58 +0800
Subject: blk-mq: Use request queue-wide tags for tagset-wide sbitmap

The tags used for an IO scheduler are currently per hctx.

As such, when q->nr_hw_queues grows, so does the request queue total IO
scheduler tag depth.

This may cause problems for SCSI MQ HBAs whose total driver depth is
fixed.

Ming and Yanhui report higher CPU usage and lower throughput in scenarios
where the fixed total driver tag depth is appreciably lower than the total
scheduler tag depth:
https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@huawei.com/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b

In that scenario, since the scheduler tag is got first, much contention
is introduced since a driver tag may not be available after we have got
the sched tag.

Improve this scenario by introducing request queue-wide tags for when
a tagset-wide sbitmap is used. The static sched requests are still
allocated per hctx, as requests are initialised per hctx, as in
blk_mq_init_request(..., hctx_idx, ...) ->
set->ops->init_request(.., hctx_idx, ...).

For simplicity of resizing the request queue sbitmap when updating the
request queue depth, just init at the max possible size, so we don't need
to deal with the possibly with swapping out a new sbitmap for old if
we need to grow.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/1620907258-30910-3-git-send-email-john.garry@huawei.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.c   | 67 ++++++++++++++++++++++++++++++++++++++++----------
 block/blk-mq-sched.h   |  2 ++
 block/blk-mq-tag.c     | 11 ++++-----
 block/blk-mq.c         | 13 ++++++++--
 include/linux/blkdev.h |  4 +++
 5 files changed, 76 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 996a4b2f73aa..045b6878b8c5 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -509,11 +509,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
 				   struct blk_mq_hw_ctx *hctx,
 				   unsigned int hctx_idx)
 {
-	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
-
 	if (hctx->sched_tags) {
 		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
-		blk_mq_free_rq_map(hctx->sched_tags, flags);
+		blk_mq_free_rq_map(hctx->sched_tags, set->flags);
 		hctx->sched_tags = NULL;
 	}
 }
@@ -523,12 +521,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
 				   unsigned int hctx_idx)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
-	/* Clear HCTX_SHARED so tags are init'ed */
-	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
 	int ret;
 
 	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
-					       set->reserved_tags, flags);
+					       set->reserved_tags, set->flags);
 	if (!hctx->sched_tags)
 		return -ENOMEM;
 
@@ -546,16 +542,50 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		/* Clear HCTX_SHARED so tags are freed */
-		unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
-
 		if (hctx->sched_tags) {
-			blk_mq_free_rq_map(hctx->sched_tags, flags);
+			blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
 			hctx->sched_tags = NULL;
 		}
 	}
 }
 
+static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
+{
+	struct blk_mq_tag_set *set = queue->tag_set;
+	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
+	struct blk_mq_hw_ctx *hctx;
+	int ret, i;
+
+	/*
+	 * Set initial depth at max so that we don't need to reallocate for
+	 * updating nr_requests.
+	 */
+	ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
+				  &queue->sched_breserved_tags,
+				  MAX_SCHED_RQ, set->reserved_tags,
+				  set->numa_node, alloc_policy);
+	if (ret)
+		return ret;
+
+	queue_for_each_hw_ctx(queue, hctx, i) {
+		hctx->sched_tags->bitmap_tags =
+					&queue->sched_bitmap_tags;
+		hctx->sched_tags->breserved_tags =
+					&queue->sched_breserved_tags;
+	}
+
+	sbitmap_queue_resize(&queue->sched_bitmap_tags,
+			     queue->nr_requests - set->reserved_tags);
+
+	return 0;
+}
+
+static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
+{
+	sbitmap_queue_free(&queue->sched_bitmap_tags);
+	sbitmap_queue_free(&queue->sched_breserved_tags);
+}
+
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 {
 	struct blk_mq_hw_ctx *hctx;
@@ -580,12 +610,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_sched_alloc_tags(q, hctx, i);
 		if (ret)
-			goto err;
+			goto err_free_tags;
+	}
+
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
+		ret = blk_mq_init_sched_shared_sbitmap(q);
+		if (ret)
+			goto err_free_tags;
 	}
 
 	ret = e->ops.init_sched(q, e);
 	if (ret)
-		goto err;
+		goto err_free_sbitmap;
 
 	blk_mq_debugfs_register_sched(q);
 
@@ -605,7 +641,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 
 	return 0;
 
-err:
+err_free_sbitmap:
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
+		blk_mq_exit_sched_shared_sbitmap(q);
+err_free_tags:
 	blk_mq_sched_free_requests(q);
 	blk_mq_sched_tags_teardown(q);
 	q->elevator = NULL;
@@ -643,5 +682,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 	if (e->type->ops.exit_sched)
 		e->type->ops.exit_sched(e);
 	blk_mq_sched_tags_teardown(q);
+	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
+		blk_mq_exit_sched_shared_sbitmap(q);
 	q->elevator = NULL;
 }
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 5b18ab915c65..aff037cfd8e7 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -5,6 +5,8 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
+#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
+
 void blk_mq_sched_assign_ioc(struct request *rq);
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index f597d40de10b..86f87346232a 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
 
 /*
@@ -590,8 +591,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 	 */
 	if (tdepth > tags->nr_tags) {
 		struct blk_mq_tag_set *set = hctx->queue->tag_set;
-		/* Only sched tags can grow, so clear HCTX_SHARED flag  */
-		unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
 		struct blk_mq_tags *new;
 		bool ret;
 
@@ -602,21 +601,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 		 * We need some sort of upper limit, set it high enough that
 		 * no valid use cases should require more.
 		 */
-		if (tdepth > 16 * BLKDEV_MAX_RQ)
+		if (tdepth > MAX_SCHED_RQ)
 			return -EINVAL;
 
 		new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
-				tags->nr_reserved_tags, flags);
+				tags->nr_reserved_tags, set->flags);
 		if (!new)
 			return -ENOMEM;
 		ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
 		if (ret) {
-			blk_mq_free_rq_map(new, flags);
+			blk_mq_free_rq_map(new, set->flags);
 			return -ENOMEM;
 		}
 
 		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
-		blk_mq_free_rq_map(*tagsptr, flags);
+		blk_mq_free_rq_map(*tagsptr, set->flags);
 		*tagsptr = new;
 	} else {
 		/*
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 001e196bdebd..f11d4018ce2e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3640,15 +3640,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 		} else {
 			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
 							nr, true);
+			if (blk_mq_is_sbitmap_shared(set->flags)) {
+				hctx->sched_tags->bitmap_tags =
+					&q->sched_bitmap_tags;
+				hctx->sched_tags->breserved_tags =
+					&q->sched_breserved_tags;
+			}
 		}
 		if (ret)
 			break;
 		if (q->elevator && q->elevator->type->ops.depth_updated)
 			q->elevator->type->ops.depth_updated(hctx);
 	}
-
-	if (!ret)
+	if (!ret) {
 		q->nr_requests = nr;
+		if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
+			sbitmap_queue_resize(&q->sched_bitmap_tags,
+					     nr - set->reserved_tags);
+	}
 
 	blk_mq_unquiesce_queue(q);
 	blk_mq_unfreeze_queue(q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f69c75bd6d27..2c28577b50f4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -25,6 +25,7 @@
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
 #include <linux/pm.h>
+#include <linux/sbitmap.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -493,6 +494,9 @@ struct request_queue {
 
 	atomic_t		nr_active_requests_shared_sbitmap;
 
+	struct sbitmap_queue	sched_bitmap_tags;
+	struct sbitmap_queue	sched_breserved_tags;
+
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
 	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
-- 
cgit v1.2.3


From 08b2b6fdf6b26032f025084ce2893924a0cdb4a2 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Mon, 24 May 2021 16:29:43 +0800
Subject: cgroup: fix spelling mistakes

Fix some spelling mistakes in comments:
hierarhcy ==> hierarchy
automtically ==> automatically
overriden ==> overridden
In absense of .. or ==> In absence of .. and
assocaited ==> associated
taget ==> target
initate ==> initiate
succeded ==> succeeded
curremt ==> current
udpated ==> updated

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 6 +++---
 include/linux/cgroup.h      | 2 +-
 kernel/cgroup/cgroup-v1.c   | 2 +-
 kernel/cgroup/cgroup.c      | 8 ++++----
 kernel/cgroup/cpuset.c      | 2 +-
 kernel/cgroup/rdma.c        | 2 +-
 kernel/cgroup/rstat.c       | 2 +-
 7 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 559ee05f86b2..fb8f6d2cd104 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -232,7 +232,7 @@ struct css_set {
 	struct list_head task_iters;
 
 	/*
-	 * On the default hierarhcy, ->subsys[ssid] may point to a css
+	 * On the default hierarchy, ->subsys[ssid] may point to a css
 	 * attached to an ancestor instead of the cgroup this css_set is
 	 * associated with.  The following node is anchored at
 	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
@@ -668,7 +668,7 @@ struct cgroup_subsys {
 	 */
 	bool threaded:1;
 
-	/* the following two fields are initialized automtically during boot */
+	/* the following two fields are initialized automatically during boot */
 	int id;
 	const char *name;
 
@@ -757,7 +757,7 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
  * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
  * On boot, sock_cgroup_data records the cgroup that the sock was created
  * in so that cgroup2 matches can be made; however, once either net_prio or
- * net_cls starts being used, the area is overriden to carry prioidx and/or
+ * net_cls starts being used, the area is overridden to carry prioidx and/or
  * classid.  The two modes are distinguished by whether the lowest bit is
  * set.  Clear bit indicates cgroup pointer while set bit prioidx and
  * classid.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4f2f79de083e..6bc9c76680b2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -32,7 +32,7 @@ struct kernel_clone_args;
 #ifdef CONFIG_CGROUPS
 
 /*
- * All weight knobs on the default hierarhcy should use the following min,
+ * All weight knobs on the default hierarchy should use the following min,
  * default and max values.  The default value is the logarithmic center of
  * MIN and MAX and allows 100x to be expressed in both directions.
  */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 391aa570369b..8190b6bfc978 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1001,7 +1001,7 @@ static int check_cgroupfs_options(struct fs_context *fc)
 	ctx->subsys_mask &= enabled;
 
 	/*
-	 * In absense of 'none', 'name=' or subsystem name options,
+	 * In absence of 'none', 'name=' and subsystem name options,
 	 * let's default to 'all'.
 	 */
 	if (!ctx->subsys_mask && !ctx->none && !ctx->name)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e7a9a2998245..21ecc6ee6a6d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -468,7 +468,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest
  *
- * Find and get @cgrp's css assocaited with @ss.  If the css doesn't exist
+ * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
  * or is offline, %NULL is returned.
  */
 static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
@@ -1633,7 +1633,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 
 /**
  * css_clear_dir - remove subsys files in a cgroup directory
- * @css: taget css
+ * @css: target css
  */
 static void css_clear_dir(struct cgroup_subsys_state *css)
 {
@@ -5350,7 +5350,7 @@ out_unlock:
 /*
  * This is called when the refcnt of a css is confirmed to be killed.
  * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
- * initate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css().
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
@@ -6052,7 +6052,7 @@ out_revert:
  * @kargs: the arguments passed to create the child process
  *
  * This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeded and cleans up references we took to
+ * cgroup_can_fork() succeeded and cleans up references we took to
  * prepare a new css_set for the child process in cgroup_can_fork().
  */
 void cgroup_cancel_fork(struct task_struct *child,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a945504c0ae7..adb5190c4429 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3376,7 +3376,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 }
 
 /**
- * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
  * @nodemask: the nodemask to be checked
  *
  * Are any of the nodes in the nodemask allowed in current->mems_allowed?
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index ae042c347c64..3135406608c7 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -244,7 +244,7 @@ EXPORT_SYMBOL(rdmacg_uncharge);
  * This function follows charging resource in hierarchical way.
  * It will fail if the charge would cause the new value to exceed the
  * hierarchical limit.
- * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
  * Returns pointer to rdmacg for this resource when charging is successful.
  *
  * Charger needs to account resources on two criteria.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 3a3fd2993a65..cee265cb535c 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -75,7 +75,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
  * @root: root of the tree to traversal
  * @cpu: target cpu
  *
- * Walks the udpated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
+ * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
  * the traversal and %NULL return indicates the end.  During traversal,
  * each returned cgroup is unlinked from the tree.  Must be called with the
  * matching cgroup_rstat_cpu_lock held.
-- 
cgit v1.2.3


From 3e87f192b405960c0fe83e0925bd0dadf4f8cf43 Mon Sep 17 00:00:00 2001
From: Denis Salopek <denis.salopek@sartura.hr>
Date: Tue, 11 May 2021 23:00:04 +0200
Subject: bpf: Add lookup_and_delete_elem support to hashtab

Extend the existing bpf_map_lookup_and_delete_elem() functionality to
hashtab map types, in addition to stacks and queues.
Create a new hashtab bpf_map_ops function that does lookup and deletion
of the element under the same bucket lock and add the created map_ops to
bpf.h.

Signed-off-by: Denis Salopek <denis.salopek@sartura.hr>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/4d18480a3e990ffbf14751ddef0325eed3be2966.1620763117.git.denis.salopek@sartura.hr
---
 include/linux/bpf.h            |  2 +
 include/uapi/linux/bpf.h       | 13 ++++++
 kernel/bpf/hashtab.c           | 98 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           | 34 +++++++++++++--
 tools/include/uapi/linux/bpf.h | 13 ++++++
 5 files changed, 156 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9dc44ba97584..1e9a0ff3217b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -70,6 +70,8 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
 	int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
 				union bpf_attr __user *uattr);
+	int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key,
+					  void *value, u64 flags);
 	int (*map_lookup_and_delete_batch)(struct bpf_map *map,
 					   const union bpf_attr *attr,
 					   union bpf_attr __user *uattr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 418b9b813d65..562adeac1d67 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -527,6 +527,15 @@ union bpf_iter_link_info {
  *		Look up an element with the given *key* in the map referred to
  *		by the file descriptor *fd*, and if found, delete the element.
  *
+ *		For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
+ *		types, the *flags* argument needs to be set to 0, but for other
+ *		map types, it may be specified as:
+ *
+ *		**BPF_F_LOCK**
+ *			Look up and delete the value of a spin-locked map
+ *			without returning the lock. This must be specified if
+ *			the elements contain a spinlock.
+ *
  *		The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
  *		implement this command as a "pop" operation, deleting the top
  *		element rather than one corresponding to *key*.
@@ -536,6 +545,10 @@ union bpf_iter_link_info {
  *		This command is only valid for the following map types:
  *		* **BPF_MAP_TYPE_QUEUE**
  *		* **BPF_MAP_TYPE_STACK**
+ *		* **BPF_MAP_TYPE_HASH**
+ *		* **BPF_MAP_TYPE_PERCPU_HASH**
+ *		* **BPF_MAP_TYPE_LRU_HASH**
+ *		* **BPF_MAP_TYPE_LRU_PERCPU_HASH**
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d7ebb12ffffc..9da0a0413a53 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1401,6 +1401,100 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+					     void *value, bool is_lru_map,
+					     bool is_percpu, u64 flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_nulls_head *head;
+	unsigned long bflags;
+	struct htab_elem *l;
+	u32 hash, key_size;
+	struct bucket *b;
+	int ret;
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	ret = htab_lock_bucket(htab, b, hash, &bflags);
+	if (ret)
+		return ret;
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+	if (!l) {
+		ret = -ENOENT;
+	} else {
+		if (is_percpu) {
+			u32 roundup_value_size = round_up(map->value_size, 8);
+			void __percpu *pptr;
+			int off = 0, cpu;
+
+			pptr = htab_elem_get_ptr(l, key_size);
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(value + off,
+						per_cpu_ptr(pptr, cpu),
+						roundup_value_size);
+				off += roundup_value_size;
+			}
+		} else {
+			u32 roundup_key_size = round_up(map->key_size, 8);
+
+			if (flags & BPF_F_LOCK)
+				copy_map_value_locked(map, value, l->key +
+						      roundup_key_size,
+						      true);
+			else
+				copy_map_value(map, value, l->key +
+					       roundup_key_size);
+			check_and_init_map_lock(map, value);
+		}
+
+		hlist_nulls_del_rcu(&l->hash_node);
+		if (!is_lru_map)
+			free_htab_elem(htab, l);
+	}
+
+	htab_unlock_bucket(htab, b, hash, bflags);
+
+	if (is_lru_map && l)
+		bpf_lru_push_free(&htab->lru, &l->lru_node);
+
+	return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+					   void *value, u64 flags)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
+						 flags);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+						  void *key, void *value,
+						  u64 flags)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
+						 flags);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+					       void *value, u64 flags)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
+						 flags);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+						      void *key, void *value,
+						      u64 flags)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
+						 flags);
+}
+
 static int
 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 				   const union bpf_attr *attr,
@@ -1934,6 +2028,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
 	.map_update_elem = htab_map_update_elem,
 	.map_delete_elem = htab_map_delete_elem,
 	.map_gen_lookup = htab_map_gen_lookup,
@@ -1954,6 +2049,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_lru_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
 	.map_update_elem = htab_lru_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
@@ -2077,6 +2173,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_percpu_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
 	.map_update_elem = htab_percpu_map_update_elem,
 	.map_delete_elem = htab_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
@@ -2096,6 +2193,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
 	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
 	.map_update_elem = htab_lru_percpu_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1d1cd80a6e67..50457019da27 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1483,7 +1483,7 @@ free_buf:
 	return err;
 }
 
-#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
 
 static int map_lookup_and_delete_elem(union bpf_attr *attr)
 {
@@ -1499,6 +1499,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
 		return -EINVAL;
 
+	if (attr->flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
 	f = fdget(ufd);
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
@@ -1509,24 +1512,47 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if (attr->flags &&
+	    (map->map_type == BPF_MAP_TYPE_QUEUE ||
+	     map->map_type == BPF_MAP_TYPE_STACK)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
+	if ((attr->flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
 	}
 
-	value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
 
 	err = -ENOMEM;
 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
+	err = -ENOTSUPP;
 	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
 	    map->map_type == BPF_MAP_TYPE_STACK) {
 		err = map->ops->map_pop_elem(map, value);
-	} else {
-		err = -ENOTSUPP;
+	} else if (map->map_type == BPF_MAP_TYPE_HASH ||
+		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		if (!bpf_map_is_dev_bound(map)) {
+			bpf_disable_instrumentation();
+			rcu_read_lock();
+			err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
+			rcu_read_unlock();
+			bpf_enable_instrumentation();
+		}
 	}
 
 	if (err)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 418b9b813d65..562adeac1d67 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -527,6 +527,15 @@ union bpf_iter_link_info {
  *		Look up an element with the given *key* in the map referred to
  *		by the file descriptor *fd*, and if found, delete the element.
  *
+ *		For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
+ *		types, the *flags* argument needs to be set to 0, but for other
+ *		map types, it may be specified as:
+ *
+ *		**BPF_F_LOCK**
+ *			Look up and delete the value of a spin-locked map
+ *			without returning the lock. This must be specified if
+ *			the elements contain a spinlock.
+ *
  *		The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
  *		implement this command as a "pop" operation, deleting the top
  *		element rather than one corresponding to *key*.
@@ -536,6 +545,10 @@ union bpf_iter_link_info {
  *		This command is only valid for the following map types:
  *		* **BPF_MAP_TYPE_QUEUE**
  *		* **BPF_MAP_TYPE_STACK**
+ *		* **BPF_MAP_TYPE_HASH**
+ *		* **BPF_MAP_TYPE_PERCPU_HASH**
+ *		* **BPF_MAP_TYPE_LRU_HASH**
+ *		* **BPF_MAP_TYPE_LRU_PERCPU_HASH**
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
-- 
cgit v1.2.3


From 1cb61759d40716643281b8e0f8c7afebc8699249 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 21 May 2021 09:26:10 +0200
Subject: init: verify that function is initcall_t at compile-time

In the spirit of making it hard to misuse an interface, add a
compile-time assertion in the CONFIG_HAVE_ARCH_PREL32_RELOCATIONS case
to verify the initcall function matches initcall_t, because the inline
asm bypasses any type-checking the compiler would otherwise do. This
will help developers catch incorrect API use in all configurations.

A recent example of this is:
https://lkml.kernel.org/r/20210514140015.2944744-1-arnd@kernel.org

Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Sami Tolvanen <samitolvanen@google.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210521072610.2880286-1-elver@google.com
---
 include/linux/init.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 045ad1650ed1..d82b4b2e1d25 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -242,7 +242,8 @@ extern bool initcall_debug;
 	asm(".section	\"" __sec "\", \"a\"		\n"	\
 	    __stringify(__name) ":			\n"	\
 	    ".long	" __stringify(__stub) " - .	\n"	\
-	    ".previous					\n");
+	    ".previous					\n");	\
+	static_assert(__same_type(initcall_t, &fn));
 #else
 #define ____define_initcall(fn, __unused, __name, __sec)	\
 	static initcall_t __name __used 			\
-- 
cgit v1.2.3


From 8fb33b6055300a23f26868680c22a5726834785e Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Tue, 25 May 2021 10:56:59 +0800
Subject: bpf: Fix spelling mistakes

Fix some spelling mistakes in comments:
aother ==> another
Netiher ==> Neither
desribe ==> describe
intializing ==> initializing
funciton ==> function
wont ==> won't and move the word 'the' at the end to the next line
accross ==> across
pathes ==> paths
triggerred ==> triggered
excute ==> execute
ether ==> either
conervative ==> conservative
convetion ==> convention
markes ==> marks
interpeter ==> interpreter

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210525025659.8898-2-thunder.leizhen@huawei.com
---
 include/linux/bpf_local_storage.h |  4 ++--
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/btf.c                  |  6 +++---
 kernel/bpf/devmap.c               |  4 ++--
 kernel/bpf/hashtab.c              |  4 ++--
 kernel/bpf/reuseport_array.c      |  2 +-
 kernel/bpf/trampoline.c           |  2 +-
 kernel/bpf/verifier.c             | 12 ++++++------
 8 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index b902c580c48d..24496bc28e7b 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -58,7 +58,7 @@ struct bpf_local_storage_data {
 	 * from the object's bpf_local_storage.
 	 *
 	 * Put it in the same cacheline as the data to minimize
-	 * the number of cachelines access during the cache hit case.
+	 * the number of cachelines accessed during the cache hit case.
 	 */
 	struct bpf_local_storage_map __rcu *smap;
 	u8 data[] __aligned(8);
@@ -71,7 +71,7 @@ struct bpf_local_storage_elem {
 	struct bpf_local_storage __rcu *local_storage;
 	struct rcu_head rcu;
 	/* 8 bytes hole */
-	/* The data is stored in aother cacheline to minimize
+	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
 	 */
 	struct bpf_local_storage_data sdata ____cacheline_aligned;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 2921ca39a93e..96ceed0e0fb5 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -72,7 +72,7 @@ void bpf_inode_storage_free(struct inode *inode)
 		return;
 	}
 
-	/* Netiher the bpf_prog nor the bpf-map's syscall
+	/* Neither the bpf_prog nor the bpf-map's syscall
 	 * could be modifying the local_storage->list now.
 	 * Thus, no elem can be added-to or deleted-from the
 	 * local_storage->list by the bpf_prog or by the bpf-map's syscall.
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 85716327c375..a6e39c5ea0bf 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -51,7 +51,7 @@
  * The BTF type section contains a list of 'struct btf_type' objects.
  * Each one describes a C type.  Recall from the above section
  * that a 'struct btf_type' object could be immediately followed by extra
- * data in order to desribe some particular C types.
+ * data in order to describe some particular C types.
  *
  * type_id:
  * ~~~~~~~
@@ -1143,7 +1143,7 @@ static void *btf_show_obj_safe(struct btf_show *show,
 
 	/*
 	 * We need a new copy to our safe object, either because we haven't
-	 * yet copied and are intializing safe data, or because the data
+	 * yet copied and are initializing safe data, or because the data
 	 * we want falls outside the boundaries of the safe object.
 	 */
 	if (!safe) {
@@ -3417,7 +3417,7 @@ static struct btf_kind_operations func_proto_ops = {
 	 * BTF_KIND_FUNC_PROTO cannot be directly referred by
 	 * a struct's member.
 	 *
-	 * It should be a funciton pointer instead.
+	 * It should be a function pointer instead.
 	 * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
 	 *
 	 * Hence, there is no btf_func_check_member().
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index aa516472ce46..d60d617ec0d7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -382,8 +382,8 @@ void __dev_flush(void)
 }
 
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put wont happen until after reading the
- * ifindex.
+ * update happens in parallel here a dev_put won't happen until after reading
+ * the ifindex.
  */
 static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9da0a0413a53..6f6681b07364 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -46,12 +46,12 @@
  * events, kprobes and tracing to be invoked before the prior invocation
  * from one of these contexts completed. sys_bpf() uses the same mechanism
  * by pinning the task to the current CPU and incrementing the recursion
- * protection accross the map operation.
+ * protection across the map operation.
  *
  * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
  * operations like memory allocations (even with GFP_ATOMIC) from atomic
  * contexts. This is required because even with GFP_ATOMIC the memory
- * allocator calls into code pathes which acquire locks with long held lock
+ * allocator calls into code paths which acquire locks with long held lock
  * sections. To ensure the deterministic behaviour these locks are regular
  * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
  * true atomic contexts on an RT kernel are the low level hardware
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 4838922f723d..93a55391791a 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -102,7 +102,7 @@ static void reuseport_array_free(struct bpf_map *map)
 	/*
 	 * ops->map_*_elem() will not be able to access this
 	 * array now. Hence, this function only races with
-	 * bpf_sk_reuseport_detach() which was triggerred by
+	 * bpf_sk_reuseport_detach() which was triggered by
 	 * close() or disconnect().
 	 *
 	 * This function and bpf_sk_reuseport_detach() are
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 2d44b5aa0057..28a3630c48ee 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -552,7 +552,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
  * __bpf_prog_enter returns:
  * 0 - skip execution of the bpf prog
  * 1 - execute bpf prog
- * [2..MAX_U64] - excute bpf prog and record execution time.
+ * [2..MAX_U64] - execute bpf prog and record execution time.
  *     This is start time.
  */
 u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9189eecb26dd..1de4b8c6ee42 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -47,7 +47,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * - unreachable insns exist (shouldn't be a forest. program = one function)
  * - out of bounds or malformed jumps
  * The second pass is all possible path descent from the 1st insn.
- * Since it's analyzing all pathes through the program, the length of the
+ * Since it's analyzing all paths through the program, the length of the
  * analysis is limited to 64k insn, which may be hit even if total number of
  * insn is less then 4K, but there are too many branches that change stack/regs.
  * Number of 'branches to be analyzed' is limited to 1k
@@ -132,7 +132,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * If it's ok, then verifier allows this BPF_CALL insn and looks at
  * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
  * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
- * returns ether pointer to map value or NULL.
+ * returns either pointer to map value or NULL.
  *
  * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
  * insn, the register holding that pointer in the true branch changes state to
@@ -2616,7 +2616,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		if (dst_reg != BPF_REG_FP) {
 			/* The backtracking logic can only recognize explicit
 			 * stack slot address like [fp - 8]. Other spill of
-			 * scalar via different register has to be conervative.
+			 * scalar via different register has to be conservative.
 			 * Backtrack from here and mark all registers as precise
 			 * that contributed into 'reg' being a constant.
 			 */
@@ -9053,7 +9053,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	    !prog->aux->attach_func_proto->type)
 		return 0;
 
-	/* eBPF calling convetion is such that R0 is used
+	/* eBPF calling convention is such that R0 is used
 	 * to return the value from eBPF program.
 	 * Make sure that it's readable at this time
 	 * of bpf_exit, which means that program wrote
@@ -9844,7 +9844,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
  * Since the verifier pushes the branch states as it sees them while exploring
  * the program the condition of walking the branch instruction for the second
  * time means that all states below this branch were already explored and
- * their final liveness markes are already propagated.
+ * their final liveness marks are already propagated.
  * Hence when the verifier completes the search of state list in is_state_visited()
  * we can call this clean_live_states() function to mark all liveness states
  * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
@@ -12464,7 +12464,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			prog->aux->max_pkt_offset = MAX_PACKET_OFF;
 
 			/* mark bpf_tail_call as different opcode to avoid
-			 * conditional branch in the interpeter for every normal
+			 * conditional branch in the interpreter for every normal
 			 * call and to prevent accidental JITing by JIT compiler
 			 * that doesn't support bpf_tail_call yet
 			 */
-- 
cgit v1.2.3


From a8b98c808eab3ec8f1b5a64be967b0f4af4cae43 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 24 May 2021 16:53:21 +0300
Subject: fanotify: fix permission model of unprivileged group

Reporting event->pid should depend on the privileges of the user that
initialized the group, not the privileges of the user reading the
events.

Use an internal group flag FANOTIFY_UNPRIV to record the fact that the
group was initialized by an unprivileged user.

To be on the safe side, the premissions to setup filesystem and mount
marks now require that both the user that initialized the group and
the user setting up the mark have CAP_SYS_ADMIN.

Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiA77_P5vtv7e83g0+9d7B5W9ZTE4GfQEYbWmfT1rA=VA@mail.gmail.com/
Fixes: 7cea2a3c505e ("fanotify: support limited functionality for unprivileged users")
Cc: <Stable@vger.kernel.org> # v5.12+
Link: https://lore.kernel.org/r/20210524135321.2190062-1-amir73il@gmail.com
Reviewed-by: Matthew Bobrowski <repnop@google.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify_user.c | 30 ++++++++++++++++++++++++------
 fs/notify/fdinfo.c                 |  2 +-
 include/linux/fanotify.h           |  4 ++++
 3 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 71fefb30e015..be5b6d2c01e7 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -424,11 +424,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	 * events generated by the listener process itself, without disclosing
 	 * the pids of other processes.
 	 */
-	if (!capable(CAP_SYS_ADMIN) &&
+	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
 	    task_tgid(current) != event->pid)
 		metadata.pid = 0;
 
-	if (path && path->mnt && path->dentry) {
+	/*
+	 * For now, fid mode is required for an unprivileged listener and
+	 * fid mode does not report fd in events.  Keep this check anyway
+	 * for safety in case fid mode requirement is relaxed in the future
+	 * to allow unprivileged listener to get events with no fd and no fid.
+	 */
+	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
+	    path && path->mnt && path->dentry) {
 		fd = create_fd(group, path, &f);
 		if (fd < 0)
 			return fd;
@@ -1040,6 +1047,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	int f_flags, fd;
 	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
 	unsigned int class = flags & FANOTIFY_CLASS_BITS;
+	unsigned int internal_flags = 0;
 
 	pr_debug("%s: flags=%x event_f_flags=%x\n",
 		 __func__, flags, event_f_flags);
@@ -1053,6 +1061,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		 */
 		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
 			return -EPERM;
+
+		/*
+		 * Setting the internal flag FANOTIFY_UNPRIV on the group
+		 * prevents setting mount/filesystem marks on this group and
+		 * prevents reporting pid and open fd in events.
+		 */
+		internal_flags |= FANOTIFY_UNPRIV;
 	}
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -1105,7 +1120,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		goto out_destroy_group;
 	}
 
-	group->fanotify_data.flags = flags;
+	group->fanotify_data.flags = flags | internal_flags;
 	group->memcg = get_mem_cgroup_from_mm(current->mm);
 
 	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
@@ -1305,11 +1320,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	group = f.file->private_data;
 
 	/*
-	 * An unprivileged user is not allowed to watch a mount point nor
-	 * a filesystem.
+	 * An unprivileged user is not allowed to setup mount nor filesystem
+	 * marks.  This also includes setting up such marks by a group that
+	 * was initialized by an unprivileged user.
 	 */
 	ret = -EPERM;
-	if (!capable(CAP_SYS_ADMIN) &&
+	if ((!capable(CAP_SYS_ADMIN) ||
+	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
 	    mark_type != FAN_MARK_INODE)
 		goto fput_and_out;
 
@@ -1460,6 +1477,7 @@ static int __init fanotify_user_setup(void)
 	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
 
+	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
 
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index a712b2aaa9ac..57f0d5d9f934 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -144,7 +144,7 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
 	struct fsnotify_group *group = f->private_data;
 
 	seq_printf(m, "fanotify flags:%x event-flags:%x\n",
-		   group->fanotify_data.flags,
+		   group->fanotify_data.flags & FANOTIFY_INIT_FLAGS,
 		   group->fanotify_data.f_flags);
 
 	show_fdinfo(m, f, fanotify_fdinfo);
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index bad41bcb25df..a16dbeced152 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -51,6 +51,10 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
 #define FANOTIFY_INIT_FLAGS	(FANOTIFY_ADMIN_INIT_FLAGS | \
 				 FANOTIFY_USER_INIT_FLAGS)
 
+/* Internal group flags */
+#define FANOTIFY_UNPRIV		0x80000000
+#define FANOTIFY_INTERNAL_GROUP_FLAGS	(FANOTIFY_UNPRIV)
+
 #define FANOTIFY_MARK_TYPE_BITS	(FAN_MARK_INODE | FAN_MARK_MOUNT | \
 				 FAN_MARK_FILESYSTEM)
 
-- 
cgit v1.2.3


From f4e44b393389c77958f7c58bf4415032b4cda15b Mon Sep 17 00:00:00 2001
From: Dai Ngo <dai.ngo@oracle.com>
Date: Fri, 21 May 2021 15:09:37 -0400
Subject: NFSD: delay unmount source's export after inter-server copy
 completed.

Currently the source's export is mounted and unmounted on every
inter-server copy operation. This patch is an enhancement to delay
the unmount of the source export for a certain period of time to
eliminate the mount and unmount overhead on subsequent copy operations.

After a copy operation completes, a work entry is added to the
delayed unmount list with an expiration time. This list is serviced
by the laundromat thread to unmount the export of the expired entries.
Each time the export is being used again, its expiration time is
extended and the entry is re-inserted to the tail of the list.

The unmount task and the mount operation of the copy request are
synced to make sure the export is not unmounted while it's being
used.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/netns.h         |   6 +++
 fs/nfsd/nfs4proc.c      | 135 ++++++++++++++++++++++++++++++++++++++++++++++--
 fs/nfsd/nfs4state.c     |  71 +++++++++++++++++++++++++
 fs/nfsd/nfsd.h          |   4 ++
 fs/nfsd/nfssvc.c        |   3 ++
 include/linux/nfs_ssc.h |  14 +++++
 6 files changed, 229 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index a75abeb1e698..935c1028c217 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -176,6 +176,12 @@ struct nfsd_net {
 	unsigned int             longest_chain_cachesize;
 
 	struct shrinker		nfsd_reply_cache_shrinker;
+
+	/* tracking server-to-server copy mounts */
+	spinlock_t              nfsd_ssc_lock;
+	struct list_head        nfsd_ssc_mount_list;
+	wait_queue_head_t       nfsd_ssc_waitq;
+
 	/* utsname taken from the process that starts the server */
 	char			nfsd_name[UNX_MAXNODENAME+1];
 };
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 48c0992cb65c..0bd71c6da81d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -55,6 +55,13 @@ module_param(inter_copy_offload_enable, bool, 0644);
 MODULE_PARM_DESC(inter_copy_offload_enable,
 		 "Enable inter server to server copy offload. Default: false");
 
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+static int nfsd4_ssc_umount_timeout = 900000;		/* default to 15 mins */
+module_param(nfsd4_ssc_umount_timeout, int, 0644);
+MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
+		"idle msecs before unmount export from source server");
+#endif
+
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
 
@@ -1165,6 +1172,81 @@ extern void nfs_sb_deactive(struct super_block *sb);
 
 #define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys"
 
+/*
+ * setup a work entry in the ssc delayed unmount list.
+ */
+static int nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
+		struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt)
+{
+	struct nfsd4_ssc_umount_item *ni = 0;
+	struct nfsd4_ssc_umount_item *work = NULL;
+	struct nfsd4_ssc_umount_item *tmp;
+	DEFINE_WAIT(wait);
+
+	*ss_mnt = NULL;
+	*retwork = NULL;
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+try_again:
+	spin_lock(&nn->nfsd_ssc_lock);
+	list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
+		if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr)))
+			continue;
+		/* found a match */
+		if (ni->nsui_busy) {
+			/*  wait - and try again */
+			prepare_to_wait(&nn->nfsd_ssc_waitq, &wait,
+				TASK_INTERRUPTIBLE);
+			spin_unlock(&nn->nfsd_ssc_lock);
+
+			/* allow 20secs for mount/unmount for now - revisit */
+			if (signal_pending(current) ||
+					(schedule_timeout(20*HZ) == 0)) {
+				kfree(work);
+				return nfserr_eagain;
+			}
+			finish_wait(&nn->nfsd_ssc_waitq, &wait);
+			goto try_again;
+		}
+		*ss_mnt = ni->nsui_vfsmount;
+		refcount_inc(&ni->nsui_refcnt);
+		spin_unlock(&nn->nfsd_ssc_lock);
+		kfree(work);
+
+		/* return vfsmount in ss_mnt */
+		return 0;
+	}
+	if (work) {
+		strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
+		refcount_set(&work->nsui_refcnt, 2);
+		work->nsui_busy = true;
+		list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
+		*retwork = work;
+	}
+	spin_unlock(&nn->nfsd_ssc_lock);
+	return 0;
+}
+
+static void nfsd4_ssc_update_dul_work(struct nfsd_net *nn,
+		struct nfsd4_ssc_umount_item *work, struct vfsmount *ss_mnt)
+{
+	/* set nsui_vfsmount, clear busy flag and wakeup waiters */
+	spin_lock(&nn->nfsd_ssc_lock);
+	work->nsui_vfsmount = ss_mnt;
+	work->nsui_busy = false;
+	wake_up_all(&nn->nfsd_ssc_waitq);
+	spin_unlock(&nn->nfsd_ssc_lock);
+}
+
+static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn,
+		struct nfsd4_ssc_umount_item *work)
+{
+	spin_lock(&nn->nfsd_ssc_lock);
+	list_del(&work->nsui_list);
+	wake_up_all(&nn->nfsd_ssc_waitq);
+	spin_unlock(&nn->nfsd_ssc_lock);
+	kfree(work);
+}
+
 /*
  * Support one copy source server for now.
  */
@@ -1181,6 +1263,8 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
 	char *ipaddr, *dev_name, *raw_data;
 	int len, raw_len;
 	__be32 status = nfserr_inval;
+	struct nfsd4_ssc_umount_item *work = NULL;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	naddr = &nss->u.nl4_addr;
 	tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
@@ -1229,12 +1313,23 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
 		goto out_free_rawdata;
 	snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
 
+	status = nfsd4_ssc_setup_dul(nn, ipaddr, &work, &ss_mnt);
+	if (status)
+		goto out_free_devname;
+	if (ss_mnt)
+		goto out_done;
+
 	/* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
 	ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
 	module_put(type->owner);
-	if (IS_ERR(ss_mnt))
+	if (IS_ERR(ss_mnt)) {
+		if (work)
+			nfsd4_ssc_cancel_dul_work(nn, work);
 		goto out_free_devname;
-
+	}
+	if (work)
+		nfsd4_ssc_update_dul_work(nn, work, ss_mnt);
+out_done:
 	status = 0;
 	*mount = ss_mnt;
 
@@ -1301,10 +1396,42 @@ static void
 nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
 			struct nfsd_file *dst)
 {
+	bool found = false;
+	long timeout;
+	struct nfsd4_ssc_umount_item *tmp;
+	struct nfsd4_ssc_umount_item *ni = 0;
+	struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id);
+
 	nfs42_ssc_close(src->nf_file);
-	fput(src->nf_file);
 	nfsd_file_put(dst);
-	mntput(ss_mnt);
+	fput(src->nf_file);
+
+	if (!nn) {
+		mntput(ss_mnt);
+		return;
+	}
+	spin_lock(&nn->nfsd_ssc_lock);
+	timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
+	list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
+		if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) {
+			list_del(&ni->nsui_list);
+			/*
+			 * vfsmount can be shared by multiple exports,
+			 * decrement refcnt. If the count drops to 1 it
+			 * will be unmounted when nsui_expire expires.
+			 */
+			refcount_dec(&ni->nsui_refcnt);
+			ni->nsui_expire = jiffies + timeout;
+			list_add_tail(&ni->nsui_list, &nn->nfsd_ssc_mount_list);
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&nn->nfsd_ssc_lock);
+	if (!found) {
+		mntput(ss_mnt);
+		return;
+	}
 }
 
 #else /* CONFIG_NFSD_V4_2_INTER_SSC */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b2573d3ecd3c..99dfc668f605 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,6 +44,7 @@
 #include <linux/jhash.h>
 #include <linux/string_helpers.h>
 #include <linux/fsnotify.h>
+#include <linux/nfs_ssc.h>
 #include "xdr4.h"
 #include "xdr4cb.h"
 #include "vfs.h"
@@ -5480,6 +5481,69 @@ static bool state_expired(struct laundry_time *lt, time64_t last_refresh)
 	return false;
 }
 
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+void nfsd4_ssc_init_umount_work(struct nfsd_net *nn)
+{
+	spin_lock_init(&nn->nfsd_ssc_lock);
+	INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list);
+	init_waitqueue_head(&nn->nfsd_ssc_waitq);
+}
+EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
+
+/*
+ * This is called when nfsd is being shutdown, after all inter_ssc
+ * cleanup were done, to destroy the ssc delayed unmount list.
+ */
+static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn)
+{
+	struct nfsd4_ssc_umount_item *ni = 0;
+	struct nfsd4_ssc_umount_item *tmp;
+
+	spin_lock(&nn->nfsd_ssc_lock);
+	list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
+		list_del(&ni->nsui_list);
+		spin_unlock(&nn->nfsd_ssc_lock);
+		mntput(ni->nsui_vfsmount);
+		kfree(ni);
+		spin_lock(&nn->nfsd_ssc_lock);
+	}
+	spin_unlock(&nn->nfsd_ssc_lock);
+}
+
+static void nfsd4_ssc_expire_umount(struct nfsd_net *nn)
+{
+	bool do_wakeup = false;
+	struct nfsd4_ssc_umount_item *ni = 0;
+	struct nfsd4_ssc_umount_item *tmp;
+
+	spin_lock(&nn->nfsd_ssc_lock);
+	list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
+		if (time_after(jiffies, ni->nsui_expire)) {
+			if (refcount_read(&ni->nsui_refcnt) > 1)
+				continue;
+
+			/* mark being unmount */
+			ni->nsui_busy = true;
+			spin_unlock(&nn->nfsd_ssc_lock);
+			mntput(ni->nsui_vfsmount);
+			spin_lock(&nn->nfsd_ssc_lock);
+
+			/* waiters need to start from begin of list */
+			list_del(&ni->nsui_list);
+			kfree(ni);
+
+			/* wakeup ssc_connect waiters */
+			do_wakeup = true;
+			continue;
+		}
+		break;
+	}
+	if (do_wakeup)
+		wake_up_all(&nn->nfsd_ssc_waitq);
+	spin_unlock(&nn->nfsd_ssc_lock);
+}
+#endif
+
 static time64_t
 nfs4_laundromat(struct nfsd_net *nn)
 {
@@ -5589,6 +5653,10 @@ nfs4_laundromat(struct nfsd_net *nn)
 		list_del_init(&nbl->nbl_lru);
 		free_blocked_lock(nbl);
 	}
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+	/* service the server-to-server copy delayed unmount list */
+	nfsd4_ssc_expire_umount(nn);
+#endif
 out:
 	return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
 }
@@ -7506,6 +7574,9 @@ nfs4_state_shutdown_net(struct net *net)
 
 	nfsd4_client_tracking_exit(net);
 	nfs4_state_destroy_net(net);
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+	nfsd4_ssc_shutdown_umount(nn);
+#endif
 }
 
 void
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 14dbfa75059d..9664303afdaf 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -484,6 +484,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
 extern int nfsd4_is_junction(struct dentry *dentry);
 extern int register_cld_notifier(void);
 extern void unregister_cld_notifier(void);
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
+#endif
+
 #else /* CONFIG_NFSD_V4 */
 static inline int nfsd4_is_junction(struct dentry *dentry)
 {
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index dd5d69921676..ccb59e91011b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -403,6 +403,9 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
 	if (ret)
 		goto out_filecache;
 
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+	nfsd4_ssc_init_umount_work(nn);
+#endif
 	nn->nfsd_net_up = true;
 	return 0;
 
diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
index f5ba0fbff72f..222ae8883e85 100644
--- a/include/linux/nfs_ssc.h
+++ b/include/linux/nfs_ssc.h
@@ -8,6 +8,7 @@
  */
 
 #include <linux/nfs_fs.h>
+#include <linux/sunrpc/svc.h>
 
 extern struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
 
@@ -52,6 +53,19 @@ static inline void nfs42_ssc_close(struct file *filep)
 	if (nfs_ssc_client_tbl.ssc_nfs4_ops)
 		(*nfs_ssc_client_tbl.ssc_nfs4_ops->sco_close)(filep);
 }
+
+struct nfsd4_ssc_umount_item {
+	struct list_head nsui_list;
+	bool nsui_busy;
+	/*
+	 * nsui_refcnt inited to 2, 1 on list and 1 for consumer. Entry
+	 * is removed when refcnt drops to 1 and nsui_expire expires.
+	 */
+	refcount_t nsui_refcnt;
+	unsigned long nsui_expire;
+	struct vfsmount *nsui_vfsmount;
+	char nsui_ipaddr[RPC_MAX_ADDRBUFLEN];
+};
 #endif
 
 /*
-- 
cgit v1.2.3


From 85aabbd7b315c65673084b6227bee92c00405239 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Mon, 10 May 2021 19:31:30 +0200
Subject: PCI/MSI: Fix MSIs for generic hosts that use device-tree's "msi-map"

Since commit 9ec37efb8783 ("PCI/MSI: Make pci_host_common_probe() declare
its reliance on MSI domains"), platforms that rely on the "msi-map"
device-tree property don't get MSIs anymore.

On the Arm Fast Model for example [1], the host bridge doesn't have a
"msi-parent" property since it doesn't itself generate MSIs, and so doesn't
get a MSI domain. It has an "msi-map" property instead to describe MSI
controllers of child devices. As a result, due to the new msi_domain check
in pci_register_host_bridge(), the whole bus gets PCI_BUS_FLAGS_NO_MSI.

Check whether the root complex has an "msi-map" property before giving
up on MSIs.

[1] arch/arm64/boot/dts/arm/fvp-base-revc.dts

Fixes: 9ec37efb8783 ("PCI/MSI: Make pci_host_common_probe() declare its reliance on MSI domains")
Link: https://lore.kernel.org/r/20210510173129.750496-1-jean-philippe@linaro.org
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Marc Zyngier <maz@kernel.org>
---
 drivers/pci/of.c    | 7 +++++++
 drivers/pci/probe.c | 3 ++-
 include/linux/pci.h | 2 ++
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index da5b414d585a..85dcb7097da4 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -103,6 +103,13 @@ struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus)
 #endif
 }
 
+bool pci_host_of_has_msi_map(struct device *dev)
+{
+	if (dev && dev->of_node)
+		return of_get_property(dev->of_node, "msi-map", NULL);
+	return false;
+}
+
 static inline int __of_pci_pci_compare(struct device_node *node,
 				       unsigned int data)
 {
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 3a62d09b8869..275204646c68 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -925,7 +925,8 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
 	device_enable_async_suspend(bus->bridge);
 	pci_set_bus_of_node(bus);
 	pci_set_bus_msi_domain(bus);
-	if (bridge->msi_domain && !dev_get_msi_domain(&bus->dev))
+	if (bridge->msi_domain && !dev_get_msi_domain(&bus->dev) &&
+	    !pci_host_of_has_msi_map(parent))
 		bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
 
 	if (!parent)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c20211e59a57..24306504226a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2344,6 +2344,7 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off,
 struct device_node;
 struct irq_domain;
 struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus);
+bool pci_host_of_has_msi_map(struct device *dev);
 
 /* Arch may override this (weak) */
 struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus);
@@ -2351,6 +2352,7 @@ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus);
 #else	/* CONFIG_OF */
 static inline struct irq_domain *
 pci_host_bridge_of_msi_domain(struct pci_bus *bus) { return NULL; }
+static inline bool pci_host_of_has_msi_map(struct device *dev) { return false; }
 #endif  /* CONFIG_OF */
 
 static inline struct device_node *
-- 
cgit v1.2.3


From e624d4ed4aa8cc3c69d1359b0aaea539203ed266 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 19 May 2021 17:07:45 +0800
Subject: xdp: Extend xdp_redirect_map with broadcast support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to
extend xdp_redirect_map for broadcast support.

With BPF_F_BROADCAST the packet will be broadcasted to all the interfaces
in the map. with BPF_F_EXCLUDE_INGRESS the ingress interface will be
excluded when do broadcasting.

When getting the devices in dev hash map via dev_map_hash_get_next_key(),
there is a possibility that we fall back to the first key when a device
was removed. This will duplicate packets on some interfaces. So just walk
the whole buckets to avoid this issue. For dev array map, we also walk the
whole map to find valid interfaces.

Function bpf_clear_redirect_map() was removed in
commit ee75aef23afe ("bpf, xdp: Restructure redirect actions").
Add it back as we need to use ri->map again.

With test topology:
  +-------------------+             +-------------------+
  | Host A (i40e 10G) |  ---------- | eno1(i40e 10G)    |
  +-------------------+             |                   |
                                    |   Host B          |
  +-------------------+             |                   |
  | Host C (i40e 10G) |  ---------- | eno2(i40e 10G)    |
  +-------------------+             |                   |
                                    |          +------+ |
                                    | veth0 -- | Peer | |
                                    | veth1 -- |      | |
                                    | veth2 -- |  NS  | |
                                    |          +------+ |
                                    +-------------------+

On Host A:
 # pktgen/pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -s 64

On Host B(Intel(R) Xeon(R) CPU E5-2690 v3 @ 2.60GHz, 128G Memory):
Use xdp_redirect_map and xdp_redirect_map_multi in samples/bpf for testing.
All the veth peers in the NS have a XDP_DROP program loaded. The
forward_map max_entries in xdp_redirect_map_multi is modify to 4.

Testing the performance impact on the regular xdp_redirect path with and
without patch (to check impact of additional check for broadcast mode):

5.12 rc4         | redirect_map        i40e->i40e      |    2.0M |  9.7M
5.12 rc4         | redirect_map        i40e->veth      |    1.7M | 11.8M
5.12 rc4 + patch | redirect_map        i40e->i40e      |    2.0M |  9.6M
5.12 rc4 + patch | redirect_map        i40e->veth      |    1.7M | 11.7M

Testing the performance when cloning packets with the redirect_map_multi
test, using a redirect map size of 4, filled with 1-3 devices:

5.12 rc4 + patch | redirect_map multi  i40e->veth (x1) |    1.7M | 11.4M
5.12 rc4 + patch | redirect_map multi  i40e->veth (x2) |    1.1M |  4.3M
5.12 rc4 + patch | redirect_map multi  i40e->veth (x3) |    0.8M |  2.6M

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/20210519090747.1655268-3-liuhangbin@gmail.com
---
 include/linux/bpf.h            |  20 +++++
 include/linux/filter.h         |  19 ++++-
 include/net/xdp.h              |   1 +
 include/trace/events/xdp.h     |   6 +-
 include/uapi/linux/bpf.h       |  14 +++-
 kernel/bpf/cpumap.c            |   3 +-
 kernel/bpf/devmap.c            | 183 ++++++++++++++++++++++++++++++++++++++++-
 net/core/filter.c              |  37 ++++++++-
 net/core/xdp.c                 |  28 +++++++
 net/xdp/xskmap.c               |   3 +-
 tools/include/uapi/linux/bpf.h |  14 +++-
 11 files changed, 313 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1e9a0ff3217b..86dec5001ae2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1501,8 +1501,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress);
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog);
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+			   struct bpf_prog *xdp_prog, struct bpf_map *map,
+			   bool exclude_ingress);
 bool dev_map_can_have_prog(struct bpf_map *map);
 
 void __cpu_map_flush(void);
@@ -1670,6 +1675,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 	return 0;
 }
 
+static inline
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress)
+{
+	return 0;
+}
+
 struct sk_buff;
 
 static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
@@ -1679,6 +1691,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
 	return 0;
 }
 
+static inline
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+			   struct bpf_prog *xdp_prog, struct bpf_map *map,
+			   bool exclude_ingress)
+{
+	return 0;
+}
+
 static inline void __cpu_map_flush(void)
 {
 }
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9a09547bc7ba..c5ad7df029ed 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -646,6 +646,7 @@ struct bpf_redirect_info {
 	u32 flags;
 	u32 tgt_index;
 	void *tgt_value;
+	struct bpf_map *map;
 	u32 map_id;
 	enum bpf_map_type map_type;
 	u32 kern_flags;
@@ -1464,17 +1465,19 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
+static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex,
+						  u64 flags, const u64 flag_mask,
 						  void *lookup_elem(struct bpf_map *map, u32 key))
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
 
 	/* Lower bits of the flags are used as return code on lookup failure */
-	if (unlikely(flags > XDP_TX))
+	if (unlikely(flags & ~(action_mask | flag_mask)))
 		return XDP_ABORTED;
 
 	ri->tgt_value = lookup_elem(map, ifindex);
-	if (unlikely(!ri->tgt_value)) {
+	if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
 		/* If the lookup fails we want to clear out the state in the
 		 * redirect_info struct completely, so that if an eBPF program
 		 * performs multiple lookups, the last one always takes
@@ -1482,13 +1485,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
 		 */
 		ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
 		ri->map_type = BPF_MAP_TYPE_UNSPEC;
-		return flags;
+		return flags & action_mask;
 	}
 
 	ri->tgt_index = ifindex;
 	ri->map_id = map->id;
 	ri->map_type = map->map_type;
 
+	if (flags & BPF_F_BROADCAST) {
+		WRITE_ONCE(ri->map, map);
+		ri->flags = flags;
+	} else {
+		WRITE_ONCE(ri->map, NULL);
+		ri->flags = 0;
+	}
+
 	return XDP_REDIRECT;
 }
 
diff --git a/include/net/xdp.h b/include/net/xdp.h
index a5bc214a49d9..5533f0ab2afc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 					 struct net_device *dev);
 int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
 
 static inline
 void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index fcad3645a70b..c40fc97f9417 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -110,7 +110,11 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
 		u32 ifindex = 0, map_index = index;
 
 		if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
-			ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
+			/* Just leave to_ifindex to 0 if do broadcast redirect,
+			 * as tgt will be NULL.
+			 */
+			if (tgt)
+				ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
 		} else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
 			ifindex = index;
 			map_index = 0;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 562adeac1d67..2c1ba70abbf1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2555,8 +2555,12 @@ union bpf_attr {
  * 		The lower two bits of *flags* are used as the return code if
  * 		the map lookup fails. This is so that the return value can be
  * 		one of the XDP program return codes up to **XDP_TX**, as chosen
- * 		by the caller. Any higher bits in the *flags* argument must be
- * 		unset.
+ * 		by the caller. The higher bits of *flags* can be set to
+ * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * 		With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * 		interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+ * 		interface will be excluded when do broadcasting.
  *
  * 		See also **bpf_redirect**\ (), which only supports redirecting
  * 		to an ifindex, but doesn't require a map to do so.
@@ -5122,6 +5126,12 @@ enum {
 	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
 };
 
+/* Flags for bpf_redirect_map helper */
+enum {
+	BPF_F_BROADCAST		= (1ULL << 3),
+	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
+};
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5dd3e866599a..a1a0c4e791c6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-	return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+	return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+				      __cpu_map_lookup_elem);
 }
 
 static int cpu_map_btf_id;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 642264e32abd..f9148daab0e3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -198,6 +198,7 @@ static void dev_map_free(struct bpf_map *map)
 	list_del_rcu(&dtab->list);
 	spin_unlock(&dev_map_lock);
 
+	bpf_clear_redirect_map(map);
 	synchronize_rcu();
 
 	/* Make sure prior __dev_map_entry_free() have completed. */
@@ -515,6 +516,99 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
 }
 
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
+			 int exclude_ifindex)
+{
+	if (!obj || obj->dev->ifindex == exclude_ifindex ||
+	    !obj->dev->netdev_ops->ndo_xdp_xmit)
+		return false;
+
+	if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+		return false;
+
+	return true;
+}
+
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+				 struct net_device *dev_rx,
+				 struct xdp_frame *xdpf)
+{
+	struct xdp_frame *nxdpf;
+
+	nxdpf = xdpf_clone(xdpf);
+	if (!nxdpf)
+		return -ENOMEM;
+
+	bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+	return 0;
+}
+
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
+	struct bpf_dtab_netdev *dst, *last_dst = NULL;
+	struct hlist_head *head;
+	struct xdp_frame *xdpf;
+	unsigned int i;
+	int err;
+
+	xdpf = xdp_convert_buff_to_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		for (i = 0; i < map->max_entries; i++) {
+			dst = READ_ONCE(dtab->netdev_map[i]);
+			if (!is_valid_dst(dst, xdp, exclude_ifindex))
+				continue;
+
+			/* we only need n-1 clones; last_dst enqueued below */
+			if (!last_dst) {
+				last_dst = dst;
+				continue;
+			}
+
+			err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+			if (err)
+				return err;
+
+			last_dst = dst;
+		}
+	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+		for (i = 0; i < dtab->n_buckets; i++) {
+			head = dev_map_index_hash(dtab, i);
+			hlist_for_each_entry_rcu(dst, head, index_hlist,
+						 lockdep_is_held(&dtab->index_lock)) {
+				if (!is_valid_dst(dst, xdp, exclude_ifindex))
+					continue;
+
+				/* we only need n-1 clones; last_dst enqueued below */
+				if (!last_dst) {
+					last_dst = dst;
+					continue;
+				}
+
+				err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+				if (err)
+					return err;
+
+				last_dst = dst;
+			}
+		}
+	}
+
+	/* consume the last copy of the frame */
+	if (last_dst)
+		bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+	else
+		xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+	return 0;
+}
+
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog)
 {
@@ -529,6 +623,87 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 	return 0;
 }
 
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+				  struct sk_buff *skb,
+				  struct bpf_prog *xdp_prog)
+{
+	struct sk_buff *nskb;
+	int err;
+
+	nskb = skb_clone(skb, GFP_ATOMIC);
+	if (!nskb)
+		return -ENOMEM;
+
+	err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+	if (unlikely(err)) {
+		consume_skb(nskb);
+		return err;
+	}
+
+	return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+			   struct bpf_prog *xdp_prog, struct bpf_map *map,
+			   bool exclude_ingress)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
+	struct bpf_dtab_netdev *dst, *last_dst = NULL;
+	struct hlist_head *head;
+	struct hlist_node *next;
+	unsigned int i;
+	int err;
+
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		for (i = 0; i < map->max_entries; i++) {
+			dst = READ_ONCE(dtab->netdev_map[i]);
+			if (!dst || dst->dev->ifindex == exclude_ifindex)
+				continue;
+
+			/* we only need n-1 clones; last_dst enqueued below */
+			if (!last_dst) {
+				last_dst = dst;
+				continue;
+			}
+
+			err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+			if (err)
+				return err;
+
+			last_dst = dst;
+		}
+	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+		for (i = 0; i < dtab->n_buckets; i++) {
+			head = dev_map_index_hash(dtab, i);
+			hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+				if (!dst || dst->dev->ifindex == exclude_ifindex)
+					continue;
+
+				/* we only need n-1 clones; last_dst enqueued below */
+				if (!last_dst) {
+					last_dst = dst;
+					continue;
+				}
+
+				err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+				if (err)
+					return err;
+
+				last_dst = dst;
+			}
+		}
+	}
+
+	/* consume the first skb and return */
+	if (last_dst)
+		return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+	/* dtab is empty */
+	consume_skb(skb);
+	return 0;
+}
+
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
@@ -755,12 +930,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
 
 static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-	return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+	return __bpf_xdp_redirect_map(map, ifindex, flags,
+				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+				      __dev_map_lookup_elem);
 }
 
 static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-	return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+	return __bpf_xdp_redirect_map(map, ifindex, flags,
+				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+				      __dev_map_hash_lookup_elem);
 }
 
 static int dev_map_btf_id;
diff --git a/net/core/filter.c b/net/core/filter.c
index 582ac196fd94..caa88955562e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3930,6 +3930,23 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
+void bpf_clear_redirect_map(struct bpf_map *map)
+{
+	struct bpf_redirect_info *ri;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+		/* Avoid polluting remote cacheline due to writes if
+		 * not needed. Once we pass this test, we need the
+		 * cmpxchg() to make sure it hasn't been changed in
+		 * the meantime by remote CPU.
+		 */
+		if (unlikely(READ_ONCE(ri->map) == map))
+			cmpxchg(&ri->map, map, NULL);
+	}
+}
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
@@ -3937,6 +3954,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 	enum bpf_map_type map_type = ri->map_type;
 	void *fwd = ri->tgt_value;
 	u32 map_id = ri->map_id;
+	struct bpf_map *map;
 	int err;
 
 	ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
@@ -3946,7 +3964,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 	case BPF_MAP_TYPE_DEVMAP:
 		fallthrough;
 	case BPF_MAP_TYPE_DEVMAP_HASH:
-		err = dev_map_enqueue(fwd, xdp, dev);
+		map = READ_ONCE(ri->map);
+		if (unlikely(map)) {
+			WRITE_ONCE(ri->map, NULL);
+			err = dev_map_enqueue_multi(xdp, dev, map,
+						    ri->flags & BPF_F_EXCLUDE_INGRESS);
+		} else {
+			err = dev_map_enqueue(fwd, xdp, dev);
+		}
 		break;
 	case BPF_MAP_TYPE_CPUMAP:
 		err = cpu_map_enqueue(fwd, xdp, dev);
@@ -3988,13 +4013,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       enum bpf_map_type map_type, u32 map_id)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	struct bpf_map *map;
 	int err;
 
 	switch (map_type) {
 	case BPF_MAP_TYPE_DEVMAP:
 		fallthrough;
 	case BPF_MAP_TYPE_DEVMAP_HASH:
-		err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+		map = READ_ONCE(ri->map);
+		if (unlikely(map)) {
+			WRITE_ONCE(ri->map, NULL);
+			err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+						     ri->flags & BPF_F_EXCLUDE_INGRESS);
+		} else {
+			err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+		}
 		if (unlikely(err))
 			goto err;
 		break;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 858276e72c68..725d20f1b100 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -584,3 +584,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 	return __xdp_build_skb_from_frame(xdpf, skb, dev);
 }
 EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+	unsigned int headroom, totalsize;
+	struct xdp_frame *nxdpf;
+	struct page *page;
+	void *addr;
+
+	headroom = xdpf->headroom + sizeof(*xdpf);
+	totalsize = headroom + xdpf->len;
+
+	if (unlikely(totalsize > PAGE_SIZE))
+		return NULL;
+	page = dev_alloc_page();
+	if (!page)
+		return NULL;
+	addr = page_to_virt(page);
+
+	memcpy(addr, xdpf, totalsize);
+
+	nxdpf = addr;
+	nxdpf->data = addr + headroom;
+	nxdpf->frame_sz = PAGE_SIZE;
+	nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+	nxdpf->mem.id = 0;
+
+	return nxdpf;
+}
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 67b4ce504852..9df75ea4a567 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -226,7 +226,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 
 static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-	return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem);
+	return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+				      __xsk_map_lookup_elem);
 }
 
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 562adeac1d67..2c1ba70abbf1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2555,8 +2555,12 @@ union bpf_attr {
  * 		The lower two bits of *flags* are used as the return code if
  * 		the map lookup fails. This is so that the return value can be
  * 		one of the XDP program return codes up to **XDP_TX**, as chosen
- * 		by the caller. Any higher bits in the *flags* argument must be
- * 		unset.
+ * 		by the caller. The higher bits of *flags* can be set to
+ * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * 		With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * 		interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+ * 		interface will be excluded when do broadcasting.
  *
  * 		See also **bpf_redirect**\ (), which only supports redirecting
  * 		to an ifindex, but doesn't require a map to do so.
@@ -5122,6 +5126,12 @@ enum {
 	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
 };
 
+/* Flags for bpf_redirect_map helper */
+enum {
+	BPF_F_BROADCAST		= (1ULL << 3),
+	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
+};
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
-- 
cgit v1.2.3


From 961965c45c706175b24227868b1c12d72775e446 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:29 +0200
Subject: mtd: rawnand: Add a helper to clarify the interface configuration

Name it nand_interface_is_sdr() which will make even more sense when
nand_interface_is_nvddr() will be introduced.

Use it when relevant.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-2-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/atmel/nand-controller.c |  2 +-
 include/linux/mtd/rawnand.h                  | 11 ++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/atmel/nand-controller.c b/drivers/mtd/nand/raw/atmel/nand-controller.c
index 8aab1017b460..53b3a11b9952 100644
--- a/drivers/mtd/nand/raw/atmel/nand-controller.c
+++ b/drivers/mtd/nand/raw/atmel/nand-controller.c
@@ -1246,7 +1246,7 @@ static int atmel_smc_nand_prepare_smcconf(struct atmel_nand *nand,
 	nc = to_nand_controller(nand->base.controller);
 
 	/* DDR interface not supported. */
-	if (conf->type != NAND_SDR_IFACE)
+	if (!nand_interface_is_sdr(conf))
 		return -ENOTSUPP;
 
 	/*
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 29df2f43dcb5..39b31f8e03b7 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -496,6 +496,15 @@ struct nand_interface_config {
 	} timings;
 };
 
+/**
+ * nand_interface_is_sdr - get the interface type
+ * @conf:	The data interface
+ */
+static bool nand_interface_is_sdr(const struct nand_interface_config *conf)
+{
+	return conf->type == NAND_SDR_IFACE;
+}
+
 /**
  * nand_get_sdr_timings - get SDR timing from data interface
  * @conf:	The data interface
@@ -503,7 +512,7 @@ struct nand_interface_config {
 static inline const struct nand_sdr_timings *
 nand_get_sdr_timings(const struct nand_interface_config *conf)
 {
-	if (conf->type != NAND_SDR_IFACE)
+	if (!nand_interface_is_sdr(conf))
 		return ERR_PTR(-EINVAL);
 
 	return &conf->timings.sdr;
-- 
cgit v1.2.3


From 64de50e38e6fced70d1cb9ad3112de0691d0ed2d Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:32 +0200
Subject: mtd: rawnand: onfi: Use the BIT() macro when possible

Update the onfi.h header to use the BIT() macro.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Alexander Dahl <ada@thorsis.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-5-miquel.raynal@bootlin.com
---
 include/linux/mtd/onfi.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h
index 339ac798568e..cf14474bc454 100644
--- a/include/linux/mtd/onfi.h
+++ b/include/linux/mtd/onfi.h
@@ -24,17 +24,17 @@
 #define ONFI_VERSION_4_0		BIT(9)
 
 /* ONFI features */
-#define ONFI_FEATURE_16_BIT_BUS		(1 << 0)
-#define ONFI_FEATURE_EXT_PARAM_PAGE	(1 << 7)
+#define ONFI_FEATURE_16_BIT_BUS		BIT(0)
+#define ONFI_FEATURE_EXT_PARAM_PAGE	BIT(7)
 
 /* ONFI timing mode, used in both asynchronous and synchronous mode */
-#define ONFI_TIMING_MODE_0		(1 << 0)
-#define ONFI_TIMING_MODE_1		(1 << 1)
-#define ONFI_TIMING_MODE_2		(1 << 2)
-#define ONFI_TIMING_MODE_3		(1 << 3)
-#define ONFI_TIMING_MODE_4		(1 << 4)
-#define ONFI_TIMING_MODE_5		(1 << 5)
-#define ONFI_TIMING_MODE_UNKNOWN	(1 << 6)
+#define ONFI_TIMING_MODE_0		BIT(0)
+#define ONFI_TIMING_MODE_1		BIT(1)
+#define ONFI_TIMING_MODE_2		BIT(2)
+#define ONFI_TIMING_MODE_3		BIT(3)
+#define ONFI_TIMING_MODE_4		BIT(4)
+#define ONFI_TIMING_MODE_5		BIT(5)
+#define ONFI_TIMING_MODE_UNKNOWN	BIT(6)
 
 /* ONFI feature number/address */
 #define ONFI_FEATURE_NUMBER		256
@@ -49,7 +49,7 @@
 #define ONFI_SUBFEATURE_PARAM_LEN	4
 
 /* ONFI optional commands SET/GET FEATURES supported? */
-#define ONFI_OPT_CMD_SET_GET_FEATURES	(1 << 2)
+#define ONFI_OPT_CMD_SET_GET_FEATURES	BIT(2)
 
 struct nand_onfi_params {
 	/* rev info and features block */
-- 
cgit v1.2.3


From b16e0d5d7d693fe93e75569ac1ec80b513902a92 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:33 +0200
Subject: mtd: rawnand: Update dead URL

The current link to the ONFI specification is broken, the onfi.org
website now points to materials on Micron's website. Update the URL
accordingly.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-6-miquel.raynal@bootlin.com
---
 include/linux/mtd/rawnand.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 39b31f8e03b7..24aee0af5421 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -385,8 +385,8 @@ struct nand_ecc_ctrl {
  * This struct defines the timing requirements of a SDR NAND chip.
  * These information can be found in every NAND datasheets and the timings
  * meaning are described in the ONFI specifications:
- * www.onfi.org/~/media/ONFI/specs/onfi_3_1_spec.pdf (chapter 4.15 Timing
- * Parameters)
+ * https://media-www.micron.com/-/media/client/onfi/specs/onfi_3_1_spec.pdf
+ * (chapter 4.15 Timing Parameters)
  *
  * All these timings are expressed in picoseconds.
  *
-- 
cgit v1.2.3


From dbb7b2e07564443c2d357398e83e27c2fa5a89ed Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:34 +0200
Subject: mtd: rawnand: Use more recent ONFI specification wording

In particular, first ONFI specifications referred to SDR modes as
asynchronous modes, which is not the term we usually have in mind. The
spec has then been updated, so do the same here in the NAND subsystem to
avoid any possible confusion.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-7-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/cadence-nand-controller.c | 6 +++---
 drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h     | 2 +-
 drivers/mtd/nand/raw/nand_base.c               | 2 +-
 drivers/mtd/nand/raw/nand_onfi.c               | 2 +-
 include/linux/mtd/onfi.h                       | 6 +++---
 5 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c
index b46786cd53e0..7eec60ea9056 100644
--- a/drivers/mtd/nand/raw/cadence-nand-controller.c
+++ b/drivers/mtd/nand/raw/cadence-nand-controller.c
@@ -2348,9 +2348,9 @@ cadence_nand_setup_interface(struct nand_chip *chip, int chipnr,
 	 * for tRP and tRH timings. If it is NOT possible to sample data
 	 * with optimal tRP/tRH settings, the parameters will be extended.
 	 * If clk_period is 50ns (the lowest value) this condition is met
-	 * for asynchronous timing modes 1, 2, 3, 4 and 5.
-	 * If clk_period is 20ns the condition is met only
-	 * for asynchronous timing mode 5.
+	 * for SDR timing modes 1, 2, 3, 4 and 5.
+	 * If clk_period is 20ns the condition is met only for SDR timing
+	 * mode 5.
 	 */
 	if (sdr->tRC_min <= clk_period &&
 	    sdr->tRP_min <= (clk_period / 2) &&
diff --git a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
index fdc5ed7de083..5e1c3ddae5f8 100644
--- a/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
+++ b/drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.h
@@ -79,7 +79,7 @@ enum gpmi_type {
 struct gpmi_devdata {
 	enum gpmi_type type;
 	int bch_max_ecc_strength;
-	int max_chain_delay; /* See the async EDO mode */
+	int max_chain_delay; /* See the SDR EDO mode */
 	const char * const *clks;
 	const int clks_count;
 };
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index e29e5b3d31bf..f4b287b706f8 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -936,7 +936,7 @@ int nand_choose_best_sdr_timings(struct nand_chip *chip,
 		/* Fallback to slower modes */
 		best_mode = iface->timings.mode;
 	} else if (chip->parameters.onfi) {
-		best_mode = fls(chip->parameters.onfi->async_timing_mode) - 1;
+		best_mode = fls(chip->parameters.onfi->sdr_timing_modes) - 1;
 	}
 
 	for (mode = best_mode; mode >= 0; mode--) {
diff --git a/drivers/mtd/nand/raw/nand_onfi.c b/drivers/mtd/nand/raw/nand_onfi.c
index 45649e03797d..02303455c34f 100644
--- a/drivers/mtd/nand/raw/nand_onfi.c
+++ b/drivers/mtd/nand/raw/nand_onfi.c
@@ -315,7 +315,7 @@ int nand_onfi_detect(struct nand_chip *chip)
 	onfi->tBERS = le16_to_cpu(p->t_bers);
 	onfi->tR = le16_to_cpu(p->t_r);
 	onfi->tCCS = le16_to_cpu(p->t_ccs);
-	onfi->async_timing_mode = le16_to_cpu(p->async_timing_mode);
+	onfi->sdr_timing_modes = le16_to_cpu(p->sdr_timing_modes);
 	onfi->vendor_revision = le16_to_cpu(p->vendor_revision);
 	memcpy(onfi->vendor, p->vendor, sizeof(p->vendor));
 	chip->parameters.onfi = onfi;
diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h
index cf14474bc454..2ade5632dc5b 100644
--- a/include/linux/mtd/onfi.h
+++ b/include/linux/mtd/onfi.h
@@ -93,7 +93,7 @@ struct nand_onfi_params {
 
 	/* electrical parameter block */
 	u8 io_pin_capacitance_max;
-	__le16 async_timing_mode;
+	__le16 sdr_timing_modes;
 	__le16 program_cache_timing_mode;
 	__le16 t_prog;
 	__le16 t_bers;
@@ -160,7 +160,7 @@ struct onfi_ext_param_page {
  * @tBERS: Block erase time
  * @tR: Page read time
  * @tCCS: Change column setup time
- * @async_timing_mode: Supported asynchronous timing mode
+ * @sdr_timing_modes: Supported asynchronous/SDR timing modes
  * @vendor_revision: Vendor specific revision number
  * @vendor: Vendor specific data
  */
@@ -170,7 +170,7 @@ struct onfi_params {
 	u16 tBERS;
 	u16 tR;
 	u16 tCCS;
-	u16 async_timing_mode;
+	u16 sdr_timing_modes;
 	u16 vendor_revision;
 	u8 vendor[88];
 };
-- 
cgit v1.2.3


From 7ce872d9f55f46ef54b60ed39c0144b24578d7c3 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:35 +0200
Subject: mtd: rawnand: Clarify the NV-DDR entries in the ONFI structure

Both src_sync_timing_mode and src_ssync_features entries of the ONFI
parameter page have been updated and now are named nvddr_timing_modes,
nvddr2_timing_modes and nvddr_nvddr2_features, which is much more
understandable for someone which do not know the history of the ONFI
specification. Update the relevant structure with regard to these
changes.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-8-miquel.raynal@bootlin.com
---
 include/linux/mtd/onfi.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h
index 2ade5632dc5b..319e1736851d 100644
--- a/include/linux/mtd/onfi.h
+++ b/include/linux/mtd/onfi.h
@@ -99,8 +99,9 @@ struct nand_onfi_params {
 	__le16 t_bers;
 	__le16 t_r;
 	__le16 t_ccs;
-	__le16 src_sync_timing_mode;
-	u8 src_ssync_features;
+	u8 nvddr_timing_modes;
+	u8 nvddr2_timing_modes;
+	u8 nvddr_nvddr2_features;
 	__le16 clk_pin_capacitance_typ;
 	__le16 io_pin_capacitance_typ;
 	__le16 input_pin_capacitance_typ;
-- 
cgit v1.2.3


From 1666b815ad1a5b6373e950da5002ac46521a9b28 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:36 +0200
Subject: mtd: rawnand: Add NV-DDR timings

Create the relevant ONFI NV-DDR timings structure and fill it with
default values from the ONFI specification.

Add the relevant structure entries and helpers.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-9-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/nand_timings.c | 255 ++++++++++++++++++++++++++++++++++++
 include/linux/mtd/rawnand.h         | 112 ++++++++++++++++
 2 files changed, 367 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_timings.c b/drivers/mtd/nand/raw/nand_timings.c
index 94d832646487..481b56d5f60d 100644
--- a/drivers/mtd/nand/raw/nand_timings.c
+++ b/drivers/mtd/nand/raw/nand_timings.c
@@ -292,6 +292,261 @@ static const struct nand_interface_config onfi_sdr_timings[] = {
 	},
 };
 
+static const struct nand_interface_config onfi_nvddr_timings[] = {
+	/* Mode 0 */
+	{
+		.type = NAND_NVDDR_IFACE,
+		.timings.mode = 0,
+		.timings.nvddr = {
+			.tCCS_min = 500000,
+			.tR_max = 200000000,
+			.tPROG_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tBERS_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tAC_min = 3000,
+			.tAC_max = 25000,
+			.tADL_min = 400000,
+			.tCAD_min = 45000,
+			.tCAH_min = 10000,
+			.tCALH_min = 10000,
+			.tCALS_min = 10000,
+			.tCAS_min = 10000,
+			.tCEH_min = 20000,
+			.tCH_min = 10000,
+			.tCK_min = 50000,
+			.tCS_min = 35000,
+			.tDH_min = 5000,
+			.tDQSCK_min = 3000,
+			.tDQSCK_max = 25000,
+			.tDQSD_min = 0,
+			.tDQSD_max = 18000,
+			.tDQSHZ_max = 20000,
+			.tDQSQ_max = 5000,
+			.tDS_min = 5000,
+			.tDSC_min = 50000,
+			.tFEAT_max = 1000000,
+			.tITC_max = 1000000,
+			.tQHS_max = 6000,
+			.tRHW_min = 100000,
+			.tRR_min = 20000,
+			.tRST_max = 500000000,
+			.tWB_max = 100000,
+			.tWHR_min = 80000,
+			.tWRCK_min = 20000,
+			.tWW_min = 100000,
+		},
+	},
+	/* Mode 1 */
+	{
+		.type = NAND_NVDDR_IFACE,
+		.timings.mode = 1,
+		.timings.nvddr = {
+			.tCCS_min = 500000,
+			.tR_max = 200000000,
+			.tPROG_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tBERS_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tAC_min = 3000,
+			.tAC_max = 25000,
+			.tADL_min = 400000,
+			.tCAD_min = 45000,
+			.tCAH_min = 5000,
+			.tCALH_min = 5000,
+			.tCALS_min = 5000,
+			.tCAS_min = 5000,
+			.tCEH_min = 20000,
+			.tCH_min = 5000,
+			.tCK_min = 30000,
+			.tCS_min = 25000,
+			.tDH_min = 2500,
+			.tDQSCK_min = 3000,
+			.tDQSCK_max = 25000,
+			.tDQSD_min = 0,
+			.tDQSD_max = 18000,
+			.tDQSHZ_max = 20000,
+			.tDQSQ_max = 2500,
+			.tDS_min = 3000,
+			.tDSC_min = 30000,
+			.tFEAT_max = 1000000,
+			.tITC_max = 1000000,
+			.tQHS_max = 3000,
+			.tRHW_min = 100000,
+			.tRR_min = 20000,
+			.tRST_max = 500000000,
+			.tWB_max = 100000,
+			.tWHR_min = 80000,
+			.tWRCK_min = 20000,
+			.tWW_min = 100000,
+		},
+	},
+	/* Mode 2 */
+	{
+		.type = NAND_NVDDR_IFACE,
+		.timings.mode = 2,
+		.timings.nvddr = {
+			.tCCS_min = 500000,
+			.tR_max = 200000000,
+			.tPROG_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tBERS_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tAC_min = 3000,
+			.tAC_max = 25000,
+			.tADL_min = 400000,
+			.tCAD_min = 45000,
+			.tCAH_min = 4000,
+			.tCALH_min = 4000,
+			.tCALS_min = 4000,
+			.tCAS_min = 4000,
+			.tCEH_min = 20000,
+			.tCH_min = 4000,
+			.tCK_min = 20000,
+			.tCS_min = 15000,
+			.tDH_min = 1700,
+			.tDQSCK_min = 3000,
+			.tDQSCK_max = 25000,
+			.tDQSD_min = 0,
+			.tDQSD_max = 18000,
+			.tDQSHZ_max = 20000,
+			.tDQSQ_max = 1700,
+			.tDS_min = 2000,
+			.tDSC_min = 20000,
+			.tFEAT_max = 1000000,
+			.tITC_max = 1000000,
+			.tQHS_max = 2000,
+			.tRHW_min = 100000,
+			.tRR_min = 20000,
+			.tRST_max = 500000000,
+			.tWB_max = 100000,
+			.tWHR_min = 80000,
+			.tWRCK_min = 20000,
+			.tWW_min = 100000,
+		},
+	},
+	/* Mode 3 */
+	{
+		.type = NAND_NVDDR_IFACE,
+		.timings.mode = 3,
+		.timings.nvddr = {
+			.tCCS_min = 500000,
+			.tR_max = 200000000,
+			.tPROG_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tBERS_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tAC_min = 3000,
+			.tAC_max = 25000,
+			.tADL_min = 400000,
+			.tCAD_min = 45000,
+			.tCAH_min = 3000,
+			.tCALH_min = 3000,
+			.tCALS_min = 3000,
+			.tCAS_min = 3000,
+			.tCEH_min = 20000,
+			.tCH_min = 3000,
+			.tCK_min = 15000,
+			.tCS_min = 15000,
+			.tDH_min = 1300,
+			.tDQSCK_min = 3000,
+			.tDQSCK_max = 25000,
+			.tDQSD_min = 0,
+			.tDQSD_max = 18000,
+			.tDQSHZ_max = 20000,
+			.tDQSQ_max = 1300,
+			.tDS_min = 1500,
+			.tDSC_min = 15000,
+			.tFEAT_max = 1000000,
+			.tITC_max = 1000000,
+			.tQHS_max = 1500,
+			.tRHW_min = 100000,
+			.tRR_min = 20000,
+			.tRST_max = 500000000,
+			.tWB_max = 100000,
+			.tWHR_min = 80000,
+			.tWRCK_min = 20000,
+			.tWW_min = 100000,
+		},
+	},
+	/* Mode 4 */
+	{
+		.type = NAND_NVDDR_IFACE,
+		.timings.mode = 4,
+		.timings.nvddr = {
+			.tCCS_min = 500000,
+			.tR_max = 200000000,
+			.tPROG_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tBERS_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tAC_min = 3000,
+			.tAC_max = 25000,
+			.tADL_min = 400000,
+			.tCAD_min = 45000,
+			.tCAH_min = 2500,
+			.tCALH_min = 2500,
+			.tCALS_min = 2500,
+			.tCAS_min = 2500,
+			.tCEH_min = 20000,
+			.tCH_min = 2500,
+			.tCK_min = 12000,
+			.tCS_min = 15000,
+			.tDH_min = 1100,
+			.tDQSCK_min = 3000,
+			.tDQSCK_max = 25000,
+			.tDQSD_min = 0,
+			.tDQSD_max = 18000,
+			.tDQSHZ_max = 20000,
+			.tDQSQ_max = 1000,
+			.tDS_min = 1100,
+			.tDSC_min = 12000,
+			.tFEAT_max = 1000000,
+			.tITC_max = 1000000,
+			.tQHS_max = 1200,
+			.tRHW_min = 100000,
+			.tRR_min = 20000,
+			.tRST_max = 500000000,
+			.tWB_max = 100000,
+			.tWHR_min = 80000,
+			.tWRCK_min = 20000,
+			.tWW_min = 100000,
+		},
+	},
+	/* Mode 5 */
+	{
+		.type = NAND_NVDDR_IFACE,
+		.timings.mode = 5,
+		.timings.nvddr = {
+			.tCCS_min = 500000,
+			.tR_max = 200000000,
+			.tPROG_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tBERS_max = 1000000ULL * ONFI_DYN_TIMING_MAX,
+			.tAC_min = 3000,
+			.tAC_max = 25000,
+			.tADL_min = 400000,
+			.tCAD_min = 45000,
+			.tCAH_min = 2000,
+			.tCALH_min = 2000,
+			.tCALS_min = 2000,
+			.tCAS_min = 2000,
+			.tCEH_min = 20000,
+			.tCH_min = 2000,
+			.tCK_min = 10000,
+			.tCS_min = 15000,
+			.tDH_min = 900,
+			.tDQSCK_min = 3000,
+			.tDQSCK_max = 25000,
+			.tDQSD_min = 0,
+			.tDQSD_max = 18000,
+			.tDQSHZ_max = 20000,
+			.tDQSQ_max = 850,
+			.tDS_min = 900,
+			.tDSC_min = 10000,
+			.tFEAT_max = 1000000,
+			.tITC_max = 1000000,
+			.tQHS_max = 1000,
+			.tRHW_min = 100000,
+			.tRR_min = 20000,
+			.tRST_max = 500000000,
+			.tWB_max = 100000,
+			.tWHR_min = 80000,
+			.tWRCK_min = 20000,
+			.tWW_min = 100000,
+		},
+	},
+};
+
 /* All NAND chips share the same reset data interface: SDR mode 0 */
 const struct nand_interface_config *nand_get_reset_interface_config(void)
 {
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 24aee0af5421..a53a1543d1d4 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -471,12 +471,100 @@ struct nand_sdr_timings {
 	u32 tWW_min;
 };
 
+/**
+ * struct nand_nvddr_timings - NV-DDR NAND chip timings
+ *
+ * This struct defines the timing requirements of a NV-DDR NAND data interface.
+ * These information can be found in every NAND datasheets and the timings
+ * meaning are described in the ONFI specifications:
+ * https://media-www.micron.com/-/media/client/onfi/specs/onfi_4_1_gold.pdf
+ * (chapter 4.18.2 NV-DDR)
+ *
+ * All these timings are expressed in picoseconds.
+ *
+ * @tBERS_max: Block erase time
+ * @tCCS_min: Change column setup time
+ * @tPROG_max: Page program time
+ * @tR_max: Page read time
+ * @tAC_min: Access window of DQ[7:0] from CLK
+ * @tAC_max: Access window of DQ[7:0] from CLK
+ * @tADL_min: ALE to data loading time
+ * @tCAD_min: Command, Address, Data delay
+ * @tCAH_min: Command/Address DQ hold time
+ * @tCALH_min: W/R_n, CLE and ALE hold time
+ * @tCALS_min: W/R_n, CLE and ALE setup time
+ * @tCAS_min: Command/address DQ setup time
+ * @tCEH_min: CE# high hold time
+ * @tCH_min:  CE# hold time
+ * @tCK_min: Average clock cycle time
+ * @tCS_min: CE# setup time
+ * @tDH_min: Data hold time
+ * @tDQSCK_min: Start of the access window of DQS from CLK
+ * @tDQSCK_max: End of the access window of DQS from CLK
+ * @tDQSD_min: Min W/R_n low to DQS/DQ driven by device
+ * @tDQSD_max: Max W/R_n low to DQS/DQ driven by device
+ * @tDQSHZ_max: W/R_n high to DQS/DQ tri-state by device
+ * @tDQSQ_max: DQS-DQ skew, DQS to last DQ valid, per access
+ * @tDS_min: Data setup time
+ * @tDSC_min: DQS cycle time
+ * @tFEAT_max: Busy time for Set Features and Get Features
+ * @tITC_max: Interface and Timing Mode Change time
+ * @tQHS_max: Data hold skew factor
+ * @tRHW_min: Data output cycle to command, address, or data input cycle
+ * @tRR_min: Ready to RE# low (data only)
+ * @tRST_max: Device reset time, measured from the falling edge of R/B# to the
+ *	      rising edge of R/B#.
+ * @tWB_max: WE# high to SR[6] low
+ * @tWHR_min: WE# high to RE# low
+ * @tWRCK_min: W/R_n low to data output cycle
+ * @tWW_min: WP# transition to WE# low
+ */
+struct nand_nvddr_timings {
+	u64 tBERS_max;
+	u32 tCCS_min;
+	u64 tPROG_max;
+	u64 tR_max;
+	u32 tAC_min;
+	u32 tAC_max;
+	u32 tADL_min;
+	u32 tCAD_min;
+	u32 tCAH_min;
+	u32 tCALH_min;
+	u32 tCALS_min;
+	u32 tCAS_min;
+	u32 tCEH_min;
+	u32 tCH_min;
+	u32 tCK_min;
+	u32 tCS_min;
+	u32 tDH_min;
+	u32 tDQSCK_min;
+	u32 tDQSCK_max;
+	u32 tDQSD_min;
+	u32 tDQSD_max;
+	u32 tDQSHZ_max;
+	u32 tDQSQ_max;
+	u32 tDS_min;
+	u32 tDSC_min;
+	u32 tFEAT_max;
+	u32 tITC_max;
+	u32 tQHS_max;
+	u32 tRHW_min;
+	u32 tRR_min;
+	u32 tRST_max;
+	u32 tWB_max;
+	u32 tWHR_min;
+	u32 tWRCK_min;
+	u32 tWW_min;
+};
+
 /**
  * enum nand_interface_type - NAND interface type
  * @NAND_SDR_IFACE:	Single Data Rate interface
+ * @NAND_NVDDR_IFACE:	Double Data Rate interface
  */
 enum nand_interface_type {
 	NAND_SDR_IFACE,
+	NAND_NVDDR_IFACE,
 };
 
 /**
@@ -485,6 +573,7 @@ enum nand_interface_type {
  * @timings:	 The timing information
  * @timings.mode: Timing mode as defined in the specification
  * @timings.sdr: Use it when @type is %NAND_SDR_IFACE.
+ * @timings.nvddr: Use it when @type is %NAND_NVDDR_IFACE.
  */
 struct nand_interface_config {
 	enum nand_interface_type type;
@@ -492,6 +581,7 @@ struct nand_interface_config {
 		unsigned int mode;
 		union {
 			struct nand_sdr_timings sdr;
+			struct nand_nvddr_timings nvddr;
 		};
 	} timings;
 };
@@ -505,6 +595,15 @@ static bool nand_interface_is_sdr(const struct nand_interface_config *conf)
 	return conf->type == NAND_SDR_IFACE;
 }
 
+/**
+ * nand_interface_is_nvddr - get the interface type
+ * @conf:	The data interface
+ */
+static bool nand_interface_is_nvddr(const struct nand_interface_config *conf)
+{
+	return conf->type == NAND_NVDDR_IFACE;
+}
+
 /**
  * nand_get_sdr_timings - get SDR timing from data interface
  * @conf:	The data interface
@@ -518,6 +617,19 @@ nand_get_sdr_timings(const struct nand_interface_config *conf)
 	return &conf->timings.sdr;
 }
 
+/**
+ * nand_get_nvddr_timings - get NV-DDR timing from data interface
+ * @conf:	The data interface
+ */
+static inline const struct nand_nvddr_timings *
+nand_get_nvddr_timings(const struct nand_interface_config *conf)
+{
+	if (!nand_interface_is_nvddr(conf))
+		return ERR_PTR(-EINVAL);
+
+	return &conf->timings.nvddr;
+}
+
 /**
  * struct nand_op_cmd_instr - Definition of a command instruction
  * @opcode: the command to issue in one cycle
-- 
cgit v1.2.3


From 9310668fb60a7ee76c4fdfd6388747a6f2beaf75 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:37 +0200
Subject: mtd: rawnand: Retrieve NV-DDR timing modes from the ONFI parameter
 page

When parsing the ONFI parameter page, save the available NV-DDR timing
modes in the core's dynamic ONFI structure. Once available to the rest
of the core out of the ONFI driver, these values will then be used to
derive the best timing mode.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-10-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/nand_onfi.c | 2 ++
 include/linux/mtd/onfi.h         | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_onfi.c b/drivers/mtd/nand/raw/nand_onfi.c
index 02303455c34f..f7a4a0573fe7 100644
--- a/drivers/mtd/nand/raw/nand_onfi.c
+++ b/drivers/mtd/nand/raw/nand_onfi.c
@@ -316,6 +316,8 @@ int nand_onfi_detect(struct nand_chip *chip)
 	onfi->tR = le16_to_cpu(p->t_r);
 	onfi->tCCS = le16_to_cpu(p->t_ccs);
 	onfi->sdr_timing_modes = le16_to_cpu(p->sdr_timing_modes);
+	if (p->features & ONFI_FEATURE_NV_DDR)
+		onfi->nvddr_timing_modes = p->nvddr_timing_modes;
 	onfi->vendor_revision = le16_to_cpu(p->vendor_revision);
 	memcpy(onfi->vendor, p->vendor, sizeof(p->vendor));
 	chip->parameters.onfi = onfi;
diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h
index 319e1736851d..14e66a49557e 100644
--- a/include/linux/mtd/onfi.h
+++ b/include/linux/mtd/onfi.h
@@ -25,6 +25,7 @@
 
 /* ONFI features */
 #define ONFI_FEATURE_16_BIT_BUS		BIT(0)
+#define ONFI_FEATURE_NV_DDR		BIT(5)
 #define ONFI_FEATURE_EXT_PARAM_PAGE	BIT(7)
 
 /* ONFI timing mode, used in both asynchronous and synchronous mode */
@@ -162,6 +163,7 @@ struct onfi_ext_param_page {
  * @tR: Page read time
  * @tCCS: Change column setup time
  * @sdr_timing_modes: Supported asynchronous/SDR timing modes
+ * @nvddr_timing_modes: Supported source synchronous/NV-DDR timing modes
  * @vendor_revision: Vendor specific revision number
  * @vendor: Vendor specific data
  */
@@ -172,6 +174,7 @@ struct onfi_params {
 	u16 tR;
 	u16 tCCS;
 	u16 sdr_timing_modes;
+	u16 nvddr_timing_modes;
 	u16 vendor_revision;
 	u8 vendor[88];
 };
-- 
cgit v1.2.3


From 45606518f961b9e7adddc017e7813fa9f92b43fb Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:39 +0200
Subject: mtd: rawnand: Add onfi_fill_nvddr_interface_config() helper

Same logic as for the SDR path, let's create a
onfi_fill_nvddr_interface_config() helper to fill an interface
configuration structure with NV-DDR timings, given a specific ONFI mode.

There is one additional thing to do compared to SDR mode: tCAD timing
can be fast or slow and this depends on an ONFI parameter page bit. By
default the slow value is declared in the timings structure definition,
but this helper can shrink it down if necessary.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-12-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/nand_onfi.c    |  1 +
 drivers/mtd/nand/raw/nand_timings.c | 41 +++++++++++++++++++++++++++++++++++++
 include/linux/mtd/onfi.h            |  2 ++
 3 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_onfi.c b/drivers/mtd/nand/raw/nand_onfi.c
index f7a4a0573fe7..8e4677f2ba76 100644
--- a/drivers/mtd/nand/raw/nand_onfi.c
+++ b/drivers/mtd/nand/raw/nand_onfi.c
@@ -315,6 +315,7 @@ int nand_onfi_detect(struct nand_chip *chip)
 	onfi->tBERS = le16_to_cpu(p->t_bers);
 	onfi->tR = le16_to_cpu(p->t_r);
 	onfi->tCCS = le16_to_cpu(p->t_ccs);
+	onfi->fast_tCAD = p->nvddr_nvddr2_features & BIT(0);
 	onfi->sdr_timing_modes = le16_to_cpu(p->sdr_timing_modes);
 	if (p->features & ONFI_FEATURE_NV_DDR)
 		onfi->nvddr_timing_modes = p->nvddr_timing_modes;
diff --git a/drivers/mtd/nand/raw/nand_timings.c b/drivers/mtd/nand/raw/nand_timings.c
index fb483e696cb9..8eaa132c3900 100644
--- a/drivers/mtd/nand/raw/nand_timings.c
+++ b/drivers/mtd/nand/raw/nand_timings.c
@@ -636,6 +636,45 @@ static void onfi_fill_sdr_interface_config(struct nand_chip *chip,
 	}
 }
 
+/**
+ * onfi_fill_nvddr_interface_config - Initialize a NVDDR interface config from a
+ *                                    given ONFI mode
+ * @chip: The NAND chip
+ * @iface: The interface configuration to fill
+ * @timing_mode: The ONFI timing mode
+ */
+static void onfi_fill_nvddr_interface_config(struct nand_chip *chip,
+					     struct nand_interface_config *iface,
+					     unsigned int timing_mode)
+{
+	struct onfi_params *onfi = chip->parameters.onfi;
+
+	if (WARN_ON(timing_mode >= ARRAY_SIZE(onfi_nvddr_timings)))
+		return;
+
+	*iface = onfi_nvddr_timings[timing_mode];
+
+	/*
+	 * Initialize timings that cannot be deduced from timing mode:
+	 * tPROG, tBERS, tR, tCCS and tCAD.
+	 * These information are part of the ONFI parameter page.
+	 */
+	if (onfi) {
+		struct nand_nvddr_timings *timings = &iface->timings.nvddr;
+
+		/* microseconds -> picoseconds */
+		timings->tPROG_max = 1000000ULL * onfi->tPROG;
+		timings->tBERS_max = 1000000ULL * onfi->tBERS;
+		timings->tR_max = 1000000ULL * onfi->tR;
+
+		/* nanoseconds -> picoseconds */
+		timings->tCCS_min = 1000UL * onfi->tCCS;
+
+		if (onfi->fast_tCAD)
+			timings->tCAD_min = 25000;
+	}
+}
+
 /**
  * onfi_fill_interface_config - Initialize an interface config from a given
  *                              ONFI mode
@@ -651,4 +690,6 @@ void onfi_fill_interface_config(struct nand_chip *chip,
 {
 	if (type == NAND_SDR_IFACE)
 		return onfi_fill_sdr_interface_config(chip, iface, timing_mode);
+	else
+		return onfi_fill_nvddr_interface_config(chip, iface, timing_mode);
 }
diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h
index 14e66a49557e..a9677bf1e47e 100644
--- a/include/linux/mtd/onfi.h
+++ b/include/linux/mtd/onfi.h
@@ -162,6 +162,7 @@ struct onfi_ext_param_page {
  * @tBERS: Block erase time
  * @tR: Page read time
  * @tCCS: Change column setup time
+ * @fast_tCAD: Command/Address/Data slow or fast delay (NV-DDR only)
  * @sdr_timing_modes: Supported asynchronous/SDR timing modes
  * @nvddr_timing_modes: Supported source synchronous/NV-DDR timing modes
  * @vendor_revision: Vendor specific revision number
@@ -173,6 +174,7 @@ struct onfi_params {
 	u16 tBERS;
 	u16 tR;
 	u16 tCCS;
+	bool fast_tCAD;
 	u16 sdr_timing_modes;
 	u16 nvddr_timing_modes;
 	u16 vendor_revision;
-- 
cgit v1.2.3


From d7a773e8812bcf7a5412e4baebc6eb1c11242551 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:41 +0200
Subject: mtd: rawnand: Access SDR and NV-DDR timings through a common macro

Most timings related to the bus timings are different between SDR and
NV-DDR. However, we identified 9 individual timings which are more
related to the NAND chip internals. These are common between the two
interface types. Fortunately, only these common timings are being shared
through the NAND core and its ->exec_op() interface, which allows the
writing of a simple macro checking the interface type and depending on
it, returning either the relevant SDR timing or the NV-DDR timing. This
is the purpose of the NAND_COMMON_TIMING_PS() macro.

As all this is evaluated at build time, one will immediately be notified
in case a non common timing is being accessed through this macro.

Two handy macros are also inserted at the same time, which use
PSEC_TO_NSEC or PSEC_TO_MSEC so that it is very easy to return timings
in milli-, nano- or pico-seconds, as usually requested by the internal
API.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-14-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/nand_base.c | 131 +++++++++++++++++++++------------------
 include/linux/mtd/rawnand.h      |  28 +++++++++
 2 files changed, 99 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index f4b287b706f8..cdb3ddb34cd6 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -647,7 +647,7 @@ static int nand_block_checkbad(struct nand_chip *chip, loff_t ofs, int allowbbt)
  */
 int nand_soft_waitrdy(struct nand_chip *chip, unsigned long timeout_ms)
 {
-	const struct nand_sdr_timings *timings;
+	const struct nand_interface_config *conf;
 	u8 status = 0;
 	int ret;
 
@@ -655,8 +655,8 @@ int nand_soft_waitrdy(struct nand_chip *chip, unsigned long timeout_ms)
 		return -ENOTSUPP;
 
 	/* Wait tWB before polling the STATUS reg. */
-	timings = nand_get_sdr_timings(nand_get_interface_config(chip));
-	ndelay(PSEC_TO_NSEC(timings->tWB_max));
+	conf = nand_get_interface_config(chip);
+	ndelay(NAND_COMMON_TIMING_NS(conf, tWB_max));
 
 	ret = nand_status_op(chip, NULL);
 	if (ret)
@@ -1047,15 +1047,15 @@ static int nand_sp_exec_read_page_op(struct nand_chip *chip, unsigned int page,
 				     unsigned int offset_in_page, void *buf,
 				     unsigned int len)
 {
-	const struct nand_sdr_timings *sdr =
-		nand_get_sdr_timings(nand_get_interface_config(chip));
+	const struct nand_interface_config *conf =
+		nand_get_interface_config(chip);
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	u8 addrs[4];
 	struct nand_op_instr instrs[] = {
 		NAND_OP_CMD(NAND_CMD_READ0, 0),
-		NAND_OP_ADDR(3, addrs, PSEC_TO_NSEC(sdr->tWB_max)),
-		NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tR_max),
-				 PSEC_TO_NSEC(sdr->tRR_min)),
+		NAND_OP_ADDR(3, addrs, NAND_COMMON_TIMING_NS(conf, tWB_max)),
+		NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tR_max),
+				 NAND_COMMON_TIMING_NS(conf, tRR_min)),
 		NAND_OP_DATA_IN(len, buf, 0),
 	};
 	struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
@@ -1090,15 +1090,15 @@ static int nand_lp_exec_read_page_op(struct nand_chip *chip, unsigned int page,
 				     unsigned int offset_in_page, void *buf,
 				     unsigned int len)
 {
-	const struct nand_sdr_timings *sdr =
-		nand_get_sdr_timings(nand_get_interface_config(chip));
+	const struct nand_interface_config *conf =
+		nand_get_interface_config(chip);
 	u8 addrs[5];
 	struct nand_op_instr instrs[] = {
 		NAND_OP_CMD(NAND_CMD_READ0, 0),
 		NAND_OP_ADDR(4, addrs, 0),
-		NAND_OP_CMD(NAND_CMD_READSTART, PSEC_TO_NSEC(sdr->tWB_max)),
-		NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tR_max),
-				 PSEC_TO_NSEC(sdr->tRR_min)),
+		NAND_OP_CMD(NAND_CMD_READSTART, NAND_COMMON_TIMING_NS(conf, tWB_max)),
+		NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tR_max),
+				 NAND_COMMON_TIMING_NS(conf, tRR_min)),
 		NAND_OP_DATA_IN(len, buf, 0),
 	};
 	struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
@@ -1187,13 +1187,14 @@ int nand_read_param_page_op(struct nand_chip *chip, u8 page, void *buf,
 		return -EINVAL;
 
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_PARAM, 0),
-			NAND_OP_ADDR(1, &page, PSEC_TO_NSEC(sdr->tWB_max)),
-			NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tR_max),
-					 PSEC_TO_NSEC(sdr->tRR_min)),
+			NAND_OP_ADDR(1, &page,
+				     NAND_COMMON_TIMING_NS(conf, tWB_max)),
+			NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tR_max),
+					 NAND_COMMON_TIMING_NS(conf, tRR_min)),
 			NAND_OP_8BIT_DATA_IN(len, buf, 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
@@ -1242,14 +1243,14 @@ int nand_change_read_column_op(struct nand_chip *chip,
 		return -ENOTSUPP;
 
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		u8 addrs[2] = {};
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_RNDOUT, 0),
 			NAND_OP_ADDR(2, addrs, 0),
 			NAND_OP_CMD(NAND_CMD_RNDOUTSTART,
-				    PSEC_TO_NSEC(sdr->tCCS_min)),
+				    NAND_COMMON_TIMING_NS(conf, tCCS_min)),
 			NAND_OP_DATA_IN(len, buf, 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
@@ -1317,8 +1318,8 @@ static int nand_exec_prog_page_op(struct nand_chip *chip, unsigned int page,
 				  unsigned int offset_in_page, const void *buf,
 				  unsigned int len, bool prog)
 {
-	const struct nand_sdr_timings *sdr =
-		nand_get_sdr_timings(nand_get_interface_config(chip));
+	const struct nand_interface_config *conf =
+		nand_get_interface_config(chip);
 	struct mtd_info *mtd = nand_to_mtd(chip);
 	u8 addrs[5] = {};
 	struct nand_op_instr instrs[] = {
@@ -1329,10 +1330,11 @@ static int nand_exec_prog_page_op(struct nand_chip *chip, unsigned int page,
 		 */
 		NAND_OP_CMD(NAND_CMD_READ0, 0),
 		NAND_OP_CMD(NAND_CMD_SEQIN, 0),
-		NAND_OP_ADDR(0, addrs, PSEC_TO_NSEC(sdr->tADL_min)),
+		NAND_OP_ADDR(0, addrs, NAND_COMMON_TIMING_NS(conf, tADL_min)),
 		NAND_OP_DATA_OUT(len, buf, 0),
-		NAND_OP_CMD(NAND_CMD_PAGEPROG, PSEC_TO_NSEC(sdr->tWB_max)),
-		NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tPROG_max), 0),
+		NAND_OP_CMD(NAND_CMD_PAGEPROG,
+			    NAND_COMMON_TIMING_NS(conf, tWB_max)),
+		NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tPROG_max), 0),
 	};
 	struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
 	int naddrs = nand_fill_column_cycles(chip, addrs, offset_in_page);
@@ -1431,12 +1433,13 @@ int nand_prog_page_end_op(struct nand_chip *chip)
 	u8 status;
 
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_PAGEPROG,
-				    PSEC_TO_NSEC(sdr->tWB_max)),
-			NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tPROG_max), 0),
+				    NAND_COMMON_TIMING_NS(conf, tWB_max)),
+			NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tPROG_max),
+					 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
 
@@ -1549,12 +1552,12 @@ int nand_change_write_column_op(struct nand_chip *chip,
 		return -ENOTSUPP;
 
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		u8 addrs[2];
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_RNDIN, 0),
-			NAND_OP_ADDR(2, addrs, PSEC_TO_NSEC(sdr->tCCS_min)),
+			NAND_OP_ADDR(2, addrs, NAND_COMMON_TIMING_NS(conf, tCCS_min)),
 			NAND_OP_DATA_OUT(len, buf, 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
@@ -1604,11 +1607,12 @@ int nand_readid_op(struct nand_chip *chip, u8 addr, void *buf,
 		return -EINVAL;
 
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_READID, 0),
-			NAND_OP_ADDR(1, &addr, PSEC_TO_NSEC(sdr->tADL_min)),
+			NAND_OP_ADDR(1, &addr,
+				     NAND_COMMON_TIMING_NS(conf, tADL_min)),
 			NAND_OP_8BIT_DATA_IN(len, buf, 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
@@ -1643,11 +1647,11 @@ EXPORT_SYMBOL_GPL(nand_readid_op);
 int nand_status_op(struct nand_chip *chip, u8 *status)
 {
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_STATUS,
-				    PSEC_TO_NSEC(sdr->tADL_min)),
+				    NAND_COMMON_TIMING_NS(conf, tADL_min)),
 			NAND_OP_8BIT_DATA_IN(1, status, 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
@@ -1712,15 +1716,16 @@ int nand_erase_op(struct nand_chip *chip, unsigned int eraseblock)
 	u8 status;
 
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		u8 addrs[3] = {	page, page >> 8, page >> 16 };
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_ERASE1, 0),
 			NAND_OP_ADDR(2, addrs, 0),
 			NAND_OP_CMD(NAND_CMD_ERASE2,
-				    PSEC_TO_MSEC(sdr->tWB_max)),
-			NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tBERS_max), 0),
+				    NAND_COMMON_TIMING_MS(conf, tWB_max)),
+			NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tBERS_max),
+					 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
 
@@ -1771,14 +1776,17 @@ static int nand_set_features_op(struct nand_chip *chip, u8 feature,
 	int i, ret;
 
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_SET_FEATURES, 0),
-			NAND_OP_ADDR(1, &feature, PSEC_TO_NSEC(sdr->tADL_min)),
+			NAND_OP_ADDR(1, &feature, NAND_COMMON_TIMING_NS(conf,
+									tADL_min)),
 			NAND_OP_8BIT_DATA_OUT(ONFI_SUBFEATURE_PARAM_LEN, data,
-					      PSEC_TO_NSEC(sdr->tWB_max)),
-			NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tFEAT_max), 0),
+					      NAND_COMMON_TIMING_NS(conf,
+								    tWB_max)),
+			NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tFEAT_max),
+					 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
 
@@ -1818,13 +1826,14 @@ static int nand_get_features_op(struct nand_chip *chip, u8 feature,
 	int i;
 
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		struct nand_op_instr instrs[] = {
 			NAND_OP_CMD(NAND_CMD_GET_FEATURES, 0),
-			NAND_OP_ADDR(1, &feature, PSEC_TO_NSEC(sdr->tWB_max)),
-			NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tFEAT_max),
-					 PSEC_TO_NSEC(sdr->tRR_min)),
+			NAND_OP_ADDR(1, &feature,
+				     NAND_COMMON_TIMING_NS(conf, tWB_max)),
+			NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tFEAT_max),
+					 NAND_COMMON_TIMING_NS(conf, tRR_min)),
 			NAND_OP_8BIT_DATA_IN(ONFI_SUBFEATURE_PARAM_LEN,
 					     data, 0),
 		};
@@ -1875,11 +1884,13 @@ static int nand_wait_rdy_op(struct nand_chip *chip, unsigned int timeout_ms,
 int nand_reset_op(struct nand_chip *chip)
 {
 	if (nand_has_exec_op(chip)) {
-		const struct nand_sdr_timings *sdr =
-			nand_get_sdr_timings(nand_get_interface_config(chip));
+		const struct nand_interface_config *conf =
+			nand_get_interface_config(chip);
 		struct nand_op_instr instrs[] = {
-			NAND_OP_CMD(NAND_CMD_RESET, PSEC_TO_NSEC(sdr->tWB_max)),
-			NAND_OP_WAIT_RDY(PSEC_TO_MSEC(sdr->tRST_max), 0),
+			NAND_OP_CMD(NAND_CMD_RESET,
+				    NAND_COMMON_TIMING_NS(conf, tWB_max)),
+			NAND_OP_WAIT_RDY(NAND_COMMON_TIMING_MS(conf, tRST_max),
+					 0),
 		};
 		struct nand_operation op = NAND_OPERATION(chip->cur_cs, instrs);
 
@@ -3137,13 +3148,13 @@ static int nand_setup_read_retry(struct nand_chip *chip, int retry_mode)
 
 static void nand_wait_readrdy(struct nand_chip *chip)
 {
-	const struct nand_sdr_timings *sdr;
+	const struct nand_interface_config *conf;
 
 	if (!(chip->options & NAND_NEED_READRDY))
 		return;
 
-	sdr = nand_get_sdr_timings(nand_get_interface_config(chip));
-	WARN_ON(nand_wait_rdy_op(chip, PSEC_TO_MSEC(sdr->tR_max), 0));
+	conf = nand_get_interface_config(chip);
+	WARN_ON(nand_wait_rdy_op(chip, NAND_COMMON_TIMING_MS(conf, tR_max), 0));
 }
 
 /**
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index a53a1543d1d4..89b9c52c7387 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -557,6 +557,34 @@ struct nand_nvddr_timings {
 	u32 tWW_min;
 };
 
+/*
+ * While timings related to the data interface itself are mostly different
+ * between SDR and NV-DDR, timings related to the internal chip behavior are
+ * common. IOW, the following entries which describe the internal delays have
+ * the same definition and are shared in both SDR and NV-DDR timing structures:
+ * - tADL_min
+ * - tBERS_max
+ * - tCCS_min
+ * - tFEAT_max
+ * - tPROG_max
+ * - tR_max
+ * - tRR_min
+ * - tRST_max
+ * - tWB_max
+ *
+ * The below macros return the value of a given timing, no matter the interface.
+ */
+#define NAND_COMMON_TIMING_PS(conf, timing_name)		\
+	nand_interface_is_sdr(conf) ?				\
+		nand_get_sdr_timings(conf)->timing_name :	\
+		nand_get_nvddr_timings(conf)->timing_name
+
+#define NAND_COMMON_TIMING_MS(conf, timing_name) \
+	PSEC_TO_MSEC(NAND_COMMON_TIMING_PS((conf), timing_name))
+
+#define NAND_COMMON_TIMING_NS(conf, timing_name) \
+	PSEC_TO_NSEC(NAND_COMMON_TIMING_PS((conf), timing_name))
+
 /**
  * enum nand_interface_type - NAND interface type
  * @NAND_SDR_IFACE:	Single Data Rate interface
-- 
cgit v1.2.3


From f3fe156ede6db96a060cc98ff1bce1ee6417a68b Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 5 May 2021 23:37:44 +0200
Subject: mtd: rawnand: Support enabling NV-DDR through SET_FEATURES

Until now the parameter of the ADDR_TIMING_MODE feature was just the
ONFI timing mode (from 0 to 5) because we were only supporting the SDR
data interface. In the same byte, bits 4 and 5 indicate which data
interface is being configured so use them to set the right mode and also
read them back to ensure the right timing has been setup on the chip's
side.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210505213750.257417-17-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/nand_base.c | 18 +++++++++++++-----
 include/linux/mtd/onfi.h         |  5 +++++
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index b8a515a74379..cfe8257bf175 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -832,7 +832,7 @@ static int nand_reset_interface(struct nand_chip *chip, int chipnr)
 static int nand_setup_interface(struct nand_chip *chip, int chipnr)
 {
 	const struct nand_controller_ops *ops = chip->controller->ops;
-	u8 tmode_param[ONFI_SUBFEATURE_PARAM_LEN] = { };
+	u8 tmode_param[ONFI_SUBFEATURE_PARAM_LEN] = { }, request;
 	int ret;
 
 	if (!nand_controller_can_setup_interface(chip))
@@ -848,7 +848,12 @@ static int nand_setup_interface(struct nand_chip *chip, int chipnr)
 	if (!chip->best_interface_config)
 		return 0;
 
-	tmode_param[0] = chip->best_interface_config->timings.mode;
+	request = chip->best_interface_config->timings.mode;
+	if (nand_interface_is_sdr(chip->best_interface_config))
+		request |= ONFI_DATA_INTERFACE_SDR;
+	else
+		request |= ONFI_DATA_INTERFACE_NVDDR;
+	tmode_param[0] = request;
 
 	/* Change the mode on the chip side (if supported by the NAND chip) */
 	if (nand_supports_set_features(chip, ONFI_FEATURE_ADDR_TIMING_MODE)) {
@@ -877,10 +882,13 @@ static int nand_setup_interface(struct nand_chip *chip, int chipnr)
 	if (ret)
 		goto err_reset_chip;
 
-	if (tmode_param[0] != chip->best_interface_config->timings.mode) {
-		pr_warn("timing mode %d not acknowledged by the NAND chip\n",
+	if (request != tmode_param[0]) {
+		pr_warn("%s timing mode %d not acknowledged by the NAND chip\n",
+			nand_interface_is_nvddr(chip->best_interface_config) ? "NV-DDR" : "SDR",
 			chip->best_interface_config->timings.mode);
-		ret = 0;
+		pr_debug("NAND chip would work in %s timing mode %d\n",
+			 tmode_param[0] & ONFI_DATA_INTERFACE_NVDDR ? "NV-DDR" : "SDR",
+			 (unsigned int)ONFI_TIMING_MODE_PARAM(tmode_param[0]));
 		goto err_reset_chip;
 	}
 
diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h
index a9677bf1e47e..a7376f9beddf 100644
--- a/include/linux/mtd/onfi.h
+++ b/include/linux/mtd/onfi.h
@@ -11,6 +11,7 @@
 #define __LINUX_MTD_ONFI_H
 
 #include <linux/types.h>
+#include <linux/bitfield.h>
 
 /* ONFI version bits */
 #define ONFI_VERSION_1_0		BIT(1)
@@ -29,6 +30,9 @@
 #define ONFI_FEATURE_EXT_PARAM_PAGE	BIT(7)
 
 /* ONFI timing mode, used in both asynchronous and synchronous mode */
+#define ONFI_DATA_INTERFACE_SDR		0
+#define ONFI_DATA_INTERFACE_NVDDR	BIT(4)
+#define ONFI_DATA_INTERFACE_NVDDR2	BIT(5)
 #define ONFI_TIMING_MODE_0		BIT(0)
 #define ONFI_TIMING_MODE_1		BIT(1)
 #define ONFI_TIMING_MODE_2		BIT(2)
@@ -36,6 +40,7 @@
 #define ONFI_TIMING_MODE_4		BIT(4)
 #define ONFI_TIMING_MODE_5		BIT(5)
 #define ONFI_TIMING_MODE_UNKNOWN	BIT(6)
+#define ONFI_TIMING_MODE_PARAM(x)	FIELD_GET(GENMASK(3, 0), (x))
 
 /* ONFI feature number/address */
 #define ONFI_FEATURE_NUMBER		256
-- 
cgit v1.2.3


From e86be3a04bc4aeaf12f93af35f08f8d4385bcd98 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 25 May 2021 18:43:38 -0400
Subject: SUNRPC: More fixes for backlog congestion

Ensure that we fix the XPRT_CONGESTED starvation issue for RDMA as well
as socket based transports.
Ensure we always initialise the request after waking up from the backlog
list.

Fixes: e877a88d1f06 ("SUNRPC in case of backlog, hand free slots directly to waiting task")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h     |  2 ++
 net/sunrpc/xprt.c               | 58 ++++++++++++++++++++---------------------
 net/sunrpc/xprtrdma/transport.c | 12 ++++-----
 net/sunrpc/xprtrdma/verbs.c     | 18 ++++++++++---
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 +
 5 files changed, 52 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index d81fe8b364d0..61b622e334ee 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -368,6 +368,8 @@ struct rpc_xprt *	xprt_alloc(struct net *net, size_t size,
 				unsigned int num_prealloc,
 				unsigned int max_req);
 void			xprt_free(struct rpc_xprt *);
+void			xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task);
+bool			xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req);
 
 static inline int
 xprt_enable_swap(struct rpc_xprt *xprt)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 5b3981fd3783..3509a7f139b9 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1607,11 +1607,18 @@ xprt_transmit(struct rpc_task *task)
 	spin_unlock(&xprt->queue_lock);
 }
 
-static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
+static void xprt_complete_request_init(struct rpc_task *task)
+{
+	if (task->tk_rqstp)
+		xprt_request_init(task);
+}
+
+void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	set_bit(XPRT_CONGESTED, &xprt->state);
-	rpc_sleep_on(&xprt->backlog, task, NULL);
+	rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init);
 }
+EXPORT_SYMBOL_GPL(xprt_add_backlog);
 
 static bool __xprt_set_rq(struct rpc_task *task, void *data)
 {
@@ -1619,14 +1626,13 @@ static bool __xprt_set_rq(struct rpc_task *task, void *data)
 
 	if (task->tk_rqstp == NULL) {
 		memset(req, 0, sizeof(*req));	/* mark unused */
-		task->tk_status = -EAGAIN;
 		task->tk_rqstp = req;
 		return true;
 	}
 	return false;
 }
 
-static bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
+bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
 {
 	if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) {
 		clear_bit(XPRT_CONGESTED, &xprt->state);
@@ -1634,6 +1640,7 @@ static bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req)
 	}
 	return true;
 }
+EXPORT_SYMBOL_GPL(xprt_wake_up_backlog);
 
 static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
 {
@@ -1643,7 +1650,7 @@ static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task
 		goto out;
 	spin_lock(&xprt->reserve_lock);
 	if (test_bit(XPRT_CONGESTED, &xprt->state)) {
-		rpc_sleep_on(&xprt->backlog, task, NULL);
+		xprt_add_backlog(xprt, task);
 		ret = true;
 	}
 	spin_unlock(&xprt->reserve_lock);
@@ -1812,10 +1819,6 @@ xprt_request_init(struct rpc_task *task)
 	struct rpc_xprt *xprt = task->tk_xprt;
 	struct rpc_rqst	*req = task->tk_rqstp;
 
-	if (req->rq_task)
-		/* Already initialized */
-		return;
-
 	req->rq_task	= task;
 	req->rq_xprt    = xprt;
 	req->rq_buffer  = NULL;
@@ -1876,10 +1879,8 @@ void xprt_retry_reserve(struct rpc_task *task)
 	struct rpc_xprt *xprt = task->tk_xprt;
 
 	task->tk_status = 0;
-	if (task->tk_rqstp != NULL) {
-		xprt_request_init(task);
+	if (task->tk_rqstp != NULL)
 		return;
-	}
 
 	task->tk_status = -EAGAIN;
 	xprt_do_reserve(xprt, task);
@@ -1904,24 +1905,21 @@ void xprt_release(struct rpc_task *task)
 	}
 
 	xprt = req->rq_xprt;
-	if (xprt) {
-		xprt_request_dequeue_xprt(task);
-		spin_lock(&xprt->transport_lock);
-		xprt->ops->release_xprt(xprt, task);
-		if (xprt->ops->release_request)
-			xprt->ops->release_request(task);
-		xprt_schedule_autodisconnect(xprt);
-		spin_unlock(&xprt->transport_lock);
-		if (req->rq_buffer)
-			xprt->ops->buf_free(task);
-		xdr_free_bvec(&req->rq_rcv_buf);
-		xdr_free_bvec(&req->rq_snd_buf);
-		if (req->rq_cred != NULL)
-			put_rpccred(req->rq_cred);
-		if (req->rq_release_snd_buf)
-			req->rq_release_snd_buf(req);
-	} else
-		xprt = task->tk_xprt;
+	xprt_request_dequeue_xprt(task);
+	spin_lock(&xprt->transport_lock);
+	xprt->ops->release_xprt(xprt, task);
+	if (xprt->ops->release_request)
+		xprt->ops->release_request(task);
+	xprt_schedule_autodisconnect(xprt);
+	spin_unlock(&xprt->transport_lock);
+	if (req->rq_buffer)
+		xprt->ops->buf_free(task);
+	xdr_free_bvec(&req->rq_rcv_buf);
+	xdr_free_bvec(&req->rq_snd_buf);
+	if (req->rq_cred != NULL)
+		put_rpccred(req->rq_cred);
+	if (req->rq_release_snd_buf)
+		req->rq_release_snd_buf(req);
 
 	task->tk_rqstp = NULL;
 	if (likely(!bc_prealloc(req)))
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 09953597d055..19a49d26b1e4 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -520,9 +520,8 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
 	return;
 
 out_sleep:
-	set_bit(XPRT_CONGESTED, &xprt->state);
-	rpc_sleep_on(&xprt->backlog, task, NULL);
 	task->tk_status = -EAGAIN;
+	xprt_add_backlog(xprt, task);
 }
 
 /**
@@ -537,10 +536,11 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
 	struct rpcrdma_xprt *r_xprt =
 		container_of(xprt, struct rpcrdma_xprt, rx_xprt);
 
-	memset(rqst, 0, sizeof(*rqst));
-	rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
-	if (unlikely(!rpc_wake_up_next(&xprt->backlog)))
-		clear_bit(XPRT_CONGESTED, &xprt->state);
+	rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+	if (!xprt_wake_up_backlog(xprt, rqst)) {
+		memset(rqst, 0, sizeof(*rqst));
+		rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+	}
 }
 
 static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 1e965a380896..649c23518ec0 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1200,6 +1200,20 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
 	return mr;
 }
 
+/**
+ * rpcrdma_reply_put - Put reply buffers back into pool
+ * @buffers: buffer pool
+ * @req: object to return
+ *
+ */
+void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
+{
+	if (req->rl_reply) {
+		rpcrdma_rep_put(buffers, req->rl_reply);
+		req->rl_reply = NULL;
+	}
+}
+
 /**
  * rpcrdma_buffer_get - Get a request buffer
  * @buffers: Buffer pool from which to obtain a buffer
@@ -1228,9 +1242,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
  */
 void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
 {
-	if (req->rl_reply)
-		rpcrdma_rep_put(buffers, req->rl_reply);
-	req->rl_reply = NULL;
+	rpcrdma_reply_put(buffers, req);
 
 	spin_lock(&buffers->rb_lock);
 	list_add(&req->rl_list, &buffers->rb_send_bufs);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 436ad7312614..5d231d94e944 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -479,6 +479,7 @@ struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
 void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
 			struct rpcrdma_req *req);
 void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep);
+void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req);
 
 bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size,
 			    gfp_t flags);
-- 
cgit v1.2.3


From 9be85de97786a75f62080de1c0c13656f65cba84 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 25 May 2021 15:02:00 +0100
Subject: locking/atomic: make ARCH_ATOMIC a Kconfig symbol

Subsequent patches will move architectures over to the ARCH_ATOMIC API,
after preparing the asm-generic atomic implementations to function with
or without ARCH_ATOMIC.

As some architectures use the asm-generic implementations exclusively
(and don't have a local atomic.h), and to avoid the risk that
ARCH_ATOMIC isn't defined in some cases we expect, let's make the
ARCH_ATOMIC macro a Kconfig symbol instead, so that we can guarantee it
is consistently available where needed.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210525140232.53872-2-mark.rutland@arm.com
---
 arch/Kconfig                    | 3 +++
 arch/arm64/Kconfig              | 1 +
 arch/arm64/include/asm/atomic.h | 2 --
 arch/s390/Kconfig               | 1 +
 arch/s390/include/asm/atomic.h  | 2 --
 arch/um/Kconfig                 | 1 +
 arch/x86/Kconfig                | 1 +
 arch/x86/include/asm/atomic.h   | 2 --
 include/linux/atomic.h          | 2 +-
 9 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index c45b770d3579..3fb3b12d4a95 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -11,6 +11,9 @@ source "arch/$(SRCARCH)/Kconfig"
 
 menu "General architecture-dependent options"
 
+config ARCH_ATOMIC
+	bool
+
 config CRASH_CORE
 	bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f1d8566bbf9..62ab429d1f42 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -9,6 +9,7 @@ config ARM64
 	select ACPI_MCFG if (ACPI && PCI)
 	select ACPI_SPCR_TABLE if ACPI
 	select ACPI_PPTT if ACPI
+	select ARCH_ATOMIC
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_BINFMT_ELF_STATE
 	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index b56a4b2bc248..c9979273d389 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -223,6 +223,4 @@ static __always_inline long arch_atomic64_dec_if_positive(atomic64_t *v)
 
 #define arch_atomic64_dec_if_positive		arch_atomic64_dec_if_positive
 
-#define ARCH_ATOMIC
-
 #endif /* __ASM_ATOMIC_H */
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b4c7c34069f8..85374a36c69e 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -58,6 +58,7 @@ config S390
 	# Note: keep this list sorted alphabetically
 	#
 	imply IMA_SECURE_AND_OR_TRUSTED_BOOT
+	select ARCH_ATOMIC
 	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_BINFMT_ELF_STATE
 	select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 7c93c6573524..7138d189cc42 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -147,6 +147,4 @@ ATOMIC64_OPS(xor)
 #define arch_atomic64_fetch_sub(_i, _v)  arch_atomic64_fetch_add(-(s64)(_i), _v)
 #define arch_atomic64_sub(_i, _v)	 arch_atomic64_add(-(s64)(_i), _v)
 
-#define ARCH_ATOMIC
-
 #endif /* __ARCH_S390_ATOMIC__  */
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 57cfd9a1c082..4370a9521ea4 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -5,6 +5,7 @@ menu "UML-specific options"
 config UML
 	bool
 	default y
+	select ARCH_ATOMIC
 	select ARCH_EPHEMERAL_INODES
 	select ARCH_HAS_KCOV
 	select ARCH_NO_PREEMPT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b44190..11a27563033d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -58,6 +58,7 @@ config X86
 	#
 	select ACPI_LEGACY_TABLES_LOOKUP	if ACPI
 	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
+	select ARCH_ATOMIC
 	select ARCH_32BIT_OFF_T			if X86_32
 	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index f732741ad7c7..5e754e895767 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -269,6 +269,4 @@ static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
 # include <asm/atomic64_64.h>
 #endif
 
-#define ARCH_ATOMIC
-
 #endif /* _ASM_X86_ATOMIC_H */
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 571a11008ab5..4f8d83f9e480 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -77,7 +77,7 @@
 	__ret;								\
 })
 
-#ifdef ARCH_ATOMIC
+#ifdef CONFIG_ARCH_ATOMIC
 #include <linux/atomic-arch-fallback.h>
 #include <asm-generic/atomic-instrumented.h>
 #else
-- 
cgit v1.2.3


From 3c1885187bc1faa0a1c52f7bd34550740a208169 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 25 May 2021 15:02:31 +0100
Subject: locking/atomic: delete !ARCH_ATOMIC remnants

Now that all architectures implement ARCH_ATOMIC, we can make it
mandatory, removing the Kconfig symbol and logic for !ARCH_ATOMIC.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210525140232.53872-33-mark.rutland@arm.com
---
 arch/Kconfig                    |    3 -
 arch/alpha/Kconfig              |    1 -
 arch/arc/Kconfig                |    1 -
 arch/arm/Kconfig                |    1 -
 arch/arm64/Kconfig              |    1 -
 arch/csky/Kconfig               |    1 -
 arch/h8300/Kconfig              |    1 -
 arch/hexagon/Kconfig            |    1 -
 arch/ia64/Kconfig               |    1 -
 arch/m68k/Kconfig               |    1 -
 arch/microblaze/Kconfig         |    1 -
 arch/mips/Kconfig               |    1 -
 arch/nds32/Kconfig              |    1 -
 arch/nios2/Kconfig              |    1 -
 arch/openrisc/Kconfig           |    1 -
 arch/parisc/Kconfig             |    1 -
 arch/powerpc/Kconfig            |    1 -
 arch/riscv/Kconfig              |    1 -
 arch/s390/Kconfig               |    1 -
 arch/sh/Kconfig                 |    1 -
 arch/sparc/Kconfig              |    1 -
 arch/um/Kconfig                 |    1 -
 arch/x86/Kconfig                |    1 -
 arch/xtensa/Kconfig             |    1 -
 include/asm-generic/atomic.h    |   44 +-
 include/asm-generic/atomic64.h  |   29 -
 include/asm-generic/cmpxchg.h   |   21 -
 include/linux/atomic-fallback.h | 2595 ---------------------------------------
 include/linux/atomic.h          |    4 -
 scripts/atomic/check-atomics.sh |    1 -
 scripts/atomic/gen-atomics.sh   |    1 -
 31 files changed, 3 insertions(+), 2718 deletions(-)
 delete mode 100644 include/linux/atomic-fallback.h

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 3fb3b12d4a95..c45b770d3579 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -11,9 +11,6 @@ source "arch/$(SRCARCH)/Kconfig"
 
 menu "General architecture-dependent options"
 
-config ARCH_ATOMIC
-	bool
-
 config CRASH_CORE
 	bool
 
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 7920fc2e2a2a..5998106faa60 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -2,7 +2,6 @@
 config ALPHA
 	bool
 	default y
-	select ARCH_ATOMIC
 	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 098ecc72d048..2d98501c0897 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -6,7 +6,6 @@
 config ARC
 	def_bool y
 	select ARC_TIMERS
-	select ARCH_ATOMIC
 	select ARCH_HAS_CACHE_LINE_SIZE
 	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DMA_PREP_COHERENT
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b7334a6643b9..24804f11302d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -3,7 +3,6 @@ config ARM
 	bool
 	default y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
 	select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 62ab429d1f42..9f1d8566bbf9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -9,7 +9,6 @@ config ARM64
 	select ACPI_MCFG if (ACPI && PCI)
 	select ACPI_SPCR_TABLE if ACPI
 	select ACPI_PPTT if ACPI
-	select ARCH_ATOMIC
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_BINFMT_ELF_STATE
 	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 3521f14bcd96..8de5b987edb9 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -2,7 +2,6 @@
 config CSKY
 	def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index bdf05ad3206a..3e3e0f16f7e0 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -2,7 +2,6 @@
 config H8300
         def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_BINFMT_FLAT
 	select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
 	select BINFMT_FLAT_OLD_ALWAYS_RAM
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 1368954ef679..44a409967af1 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -5,7 +5,6 @@ comment "Linux Kernel Configuration for Hexagon"
 config HEXAGON
 	def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_NO_PREEMPT
 	# Other pending projects/to-do items.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index c5414dcd5d0d..279252e3e0f7 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -8,7 +8,6 @@ menu "Processor type and features"
 
 config IA64
 	bool
-	select ARCH_ATOMIC
 	select ARCH_HAS_DMA_MARK_CLEAN
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index d1d91ac47f51..372e4e69c43a 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -3,7 +3,6 @@ config M68K
 	bool
 	default y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 5a52922dc225..0660f47012bc 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -2,7 +2,6 @@
 config MICROBLAZE
 	def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_NO_SWAP
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 55b4da96872f..ed51970c08e7 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -3,7 +3,6 @@ config MIPS
 	bool
 	default y
 	select ARCH_32BIT_OFF_T if !64BIT
-	select ARCH_ATOMIC
 	select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
 	select ARCH_HAS_DEBUG_VIRTUAL if !64BIT
 	select ARCH_HAS_FORTIFY_SOURCE
diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig
index 352913573aee..62313902d75d 100644
--- a/arch/nds32/Kconfig
+++ b/arch/nds32/Kconfig
@@ -7,7 +7,6 @@
 config NDS32
 	def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 67dae88c5b53..c24955c81c92 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -2,7 +2,6 @@
 config NIOS2
 	def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 8c50bc9674f5..591acc5990dc 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -7,7 +7,6 @@
 config OPENRISC
 	def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_DMA_SET_UNCACHED
 	select ARCH_HAS_DMA_CLEAR_UNCACHED
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index bfa120a4add1..bde9907bc5b2 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -2,7 +2,6 @@
 config PARISC
 	def_bool y
 	select ARCH_32BIT_OFF_T if !64BIT
-	select ARCH_ATOMIC
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select HAVE_IDE
 	select HAVE_FUNCTION_TRACER
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d143c2b616f0..088dd2afcfe4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -118,7 +118,6 @@ config PPC
 	# Please keep this list sorted alphabetically.
 	#
 	select ARCH_32BIT_OFF_T if PPC32
-	select ARCH_ATOMIC
 	select ARCH_ENABLE_MEMORY_HOTPLUG
 	select ARCH_ENABLE_MEMORY_HOTREMOVE
 	select ARCH_HAS_COPY_MC			if PPC64
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index c59b9f4a9d62..a8ad8eb76120 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -12,7 +12,6 @@ config 32BIT
 
 config RISCV
 	def_bool y
-	select ARCH_ATOMIC
 	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 85374a36c69e..b4c7c34069f8 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -58,7 +58,6 @@ config S390
 	# Note: keep this list sorted alphabetically
 	#
 	imply IMA_SECURE_AND_OR_TRUSTED_BOOT
-	select ARCH_ATOMIC
 	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_BINFMT_ELF_STATE
 	select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d2925cbb6fa4..68129537e350 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -2,7 +2,6 @@
 config SUPERH
 	def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU
 	select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU
 	select ARCH_HAVE_CUSTOM_GPIO_H
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 46790083e918..164a5254c91c 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -13,7 +13,6 @@ config 64BIT
 config SPARC
 	bool
 	default y
-	select ARCH_ATOMIC
 	select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select DMA_OPS
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 4370a9521ea4..57cfd9a1c082 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -5,7 +5,6 @@ menu "UML-specific options"
 config UML
 	bool
 	default y
-	select ARCH_ATOMIC
 	select ARCH_EPHEMERAL_INODES
 	select ARCH_HAS_KCOV
 	select ARCH_NO_PREEMPT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 11a27563033d..0045e1b44190 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -58,7 +58,6 @@ config X86
 	#
 	select ACPI_LEGACY_TABLES_LOOKUP	if ACPI
 	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
-	select ARCH_ATOMIC
 	select ARCH_32BIT_OFF_T			if X86_32
 	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 39bb9bdae6b1..2332b2156993 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -2,7 +2,6 @@
 config XTENSA
 	def_bool y
 	select ARCH_32BIT_OFF_T
-	select ARCH_ATOMIC
 	select ARCH_HAS_BINFMT_FLAT if !MMU
 	select ARCH_HAS_DMA_PREP_COHERENT if MMU
 	select ARCH_HAS_SYNC_DMA_FOR_CPU if MMU
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 649060fa0fe8..04b8be9f1a77 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -12,14 +12,6 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#ifdef CONFIG_ARCH_ATOMIC
-#define __ga_cmpxchg	arch_cmpxchg
-#define __ga_xchg	arch_xchg
-#else
-#define __ga_cmpxchg	cmpxchg
-#define __ga_xchg	xchg
-#endif
-
 #ifdef CONFIG_SMP
 
 /* we can build all atomic primitives from cmpxchg */
@@ -30,7 +22,7 @@ static inline void generic_atomic_##op(int i, atomic_t *v)		\
 	int c, old;							\
 									\
 	c = v->counter;							\
-	while ((old = __ga_cmpxchg(&v->counter, c, c c_op i)) != c)	\
+	while ((old = arch_cmpxchg(&v->counter, c, c c_op i)) != c)	\
 		c = old;						\
 }
 
@@ -40,7 +32,7 @@ static inline int generic_atomic_##op##_return(int i, atomic_t *v)	\
 	int c, old;							\
 									\
 	c = v->counter;							\
-	while ((old = __ga_cmpxchg(&v->counter, c, c c_op i)) != c)	\
+	while ((old = arch_cmpxchg(&v->counter, c, c c_op i)) != c)	\
 		c = old;						\
 									\
 	return c c_op i;						\
@@ -52,7 +44,7 @@ static inline int generic_atomic_fetch_##op(int i, atomic_t *v)		\
 	int c, old;							\
 									\
 	c = v->counter;							\
-	while ((old = __ga_cmpxchg(&v->counter, c, c c_op i)) != c)	\
+	while ((old = arch_cmpxchg(&v->counter, c, c c_op i)) != c)	\
 		c = old;						\
 									\
 	return c;							\
@@ -120,11 +112,6 @@ ATOMIC_OP(xor, ^)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-#undef __ga_cmpxchg
-#undef __ga_xchg
-
-#ifdef CONFIG_ARCH_ATOMIC
-
 #define arch_atomic_add_return			generic_atomic_add_return
 #define arch_atomic_sub_return			generic_atomic_sub_return
 
@@ -146,29 +133,4 @@ ATOMIC_OP(xor, ^)
 #define arch_atomic_xchg(ptr, v)		(arch_xchg(&(ptr)->counter, (v)))
 #define arch_atomic_cmpxchg(v, old, new)	(arch_cmpxchg(&((v)->counter), (old), (new)))
 
-#else /* CONFIG_ARCH_ATOMIC */
-
-#define atomic_add_return		generic_atomic_add_return
-#define atomic_sub_return		generic_atomic_sub_return
-
-#define atomic_fetch_add		generic_atomic_fetch_add
-#define atomic_fetch_sub		generic_atomic_fetch_sub
-#define atomic_fetch_and		generic_atomic_fetch_and
-#define atomic_fetch_or			generic_atomic_fetch_or
-#define atomic_fetch_xor		generic_atomic_fetch_xor
-
-#define atomic_add			generic_atomic_add
-#define atomic_sub			generic_atomic_sub
-#define atomic_and			generic_atomic_and
-#define atomic_or			generic_atomic_or
-#define atomic_xor			generic_atomic_xor
-
-#define atomic_read(v)			READ_ONCE((v)->counter)
-#define atomic_set(v, i)		WRITE_ONCE(((v)->counter), (i))
-
-#define atomic_xchg(ptr, v)		(xchg(&(ptr)->counter, (v)))
-#define atomic_cmpxchg(v, old, new)	(cmpxchg(&((v)->counter), (old), (new)))
-
-#endif /* CONFIG_ARCH_ATOMIC */
-
 #endif /* __ASM_GENERIC_ATOMIC_H */
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index c8c7d9fae820..100d24b02e52 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -49,8 +49,6 @@ extern s64 generic_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n);
 extern s64 generic_atomic64_xchg(atomic64_t *v, s64 new);
 extern s64 generic_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u);
 
-#ifdef CONFIG_ARCH_ATOMIC
-
 #define arch_atomic64_read		generic_atomic64_read
 #define arch_atomic64_set		generic_atomic64_set
 #define arch_atomic64_set_release	generic_atomic64_set
@@ -74,31 +72,4 @@ extern s64 generic_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u);
 #define arch_atomic64_xchg		generic_atomic64_xchg
 #define arch_atomic64_fetch_add_unless	generic_atomic64_fetch_add_unless
 
-#else /* CONFIG_ARCH_ATOMIC */
-
-#define atomic64_read			generic_atomic64_read
-#define atomic64_set			generic_atomic64_set
-#define atomic64_set_release		generic_atomic64_set
-
-#define atomic64_add			generic_atomic64_add
-#define atomic64_add_return		generic_atomic64_add_return
-#define atomic64_fetch_add		generic_atomic64_fetch_add
-#define atomic64_sub			generic_atomic64_sub
-#define atomic64_sub_return		generic_atomic64_sub_return
-#define atomic64_fetch_sub		generic_atomic64_fetch_sub
-
-#define atomic64_and			generic_atomic64_and
-#define atomic64_fetch_and		generic_atomic64_fetch_and
-#define atomic64_or			generic_atomic64_or
-#define atomic64_fetch_or		generic_atomic64_fetch_or
-#define atomic64_xor			generic_atomic64_xor
-#define atomic64_fetch_xor		generic_atomic64_fetch_xor
-
-#define atomic64_dec_if_positive	generic_atomic64_dec_if_positive
-#define atomic64_cmpxchg		generic_atomic64_cmpxchg
-#define atomic64_xchg			generic_atomic64_xchg
-#define atomic64_fetch_add_unless	generic_atomic64_fetch_add_unless
-
-#endif /* CONFIG_ARCH_ATOMIC */
-
 #endif  /*  _ASM_GENERIC_ATOMIC64_H  */
diff --git a/include/asm-generic/cmpxchg.h b/include/asm-generic/cmpxchg.h
index 98c931199089..dca4419922a9 100644
--- a/include/asm-generic/cmpxchg.h
+++ b/include/asm-generic/cmpxchg.h
@@ -97,8 +97,6 @@ unsigned long __generic_xchg(unsigned long x, volatile void *ptr, int size)
 	__generic_cmpxchg64_local((ptr), (o), (n))
 
 
-#ifdef CONFIG_ARCH_ATOMIC
-
 #ifndef arch_xchg
 #define arch_xchg		generic_xchg
 #endif
@@ -114,23 +112,4 @@ unsigned long __generic_xchg(unsigned long x, volatile void *ptr, int size)
 #define arch_cmpxchg		arch_cmpxchg_local
 #define arch_cmpxchg64		arch_cmpxchg64_local
 
-#else /* CONFIG_ARCH_ATOMIC */
-
-#ifndef xchg
-#define xchg			generic_xchg
-#endif
-
-#ifndef cmpxchg_local
-#define cmpxchg_local		generic_cmpxchg_local
-#endif
-
-#ifndef cmpxchg64_local
-#define cmpxchg64_local		generic_cmpxchg64_local
-#endif
-
-#define cmpxchg			cmpxchg_local
-#define cmpxchg64		cmpxchg64_local
-
-#endif /* CONFIG_ARCH_ATOMIC */
-
 #endif /* __ASM_GENERIC_CMPXCHG_H */
diff --git a/include/linux/atomic-fallback.h b/include/linux/atomic-fallback.h
deleted file mode 100644
index 2a3f55d98be9..000000000000
--- a/include/linux/atomic-fallback.h
+++ /dev/null
@@ -1,2595 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-// Generated by scripts/atomic/gen-atomic-fallback.sh
-// DO NOT MODIFY THIS FILE DIRECTLY
-
-#ifndef _LINUX_ATOMIC_FALLBACK_H
-#define _LINUX_ATOMIC_FALLBACK_H
-
-#include <linux/compiler.h>
-
-#ifndef xchg_relaxed
-#define xchg_acquire xchg
-#define xchg_release xchg
-#define xchg_relaxed xchg
-#else /* xchg_relaxed */
-
-#ifndef xchg_acquire
-#define xchg_acquire(...) \
-	__atomic_op_acquire(xchg, __VA_ARGS__)
-#endif
-
-#ifndef xchg_release
-#define xchg_release(...) \
-	__atomic_op_release(xchg, __VA_ARGS__)
-#endif
-
-#ifndef xchg
-#define xchg(...) \
-	__atomic_op_fence(xchg, __VA_ARGS__)
-#endif
-
-#endif /* xchg_relaxed */
-
-#ifndef cmpxchg_relaxed
-#define cmpxchg_acquire cmpxchg
-#define cmpxchg_release cmpxchg
-#define cmpxchg_relaxed cmpxchg
-#else /* cmpxchg_relaxed */
-
-#ifndef cmpxchg_acquire
-#define cmpxchg_acquire(...) \
-	__atomic_op_acquire(cmpxchg, __VA_ARGS__)
-#endif
-
-#ifndef cmpxchg_release
-#define cmpxchg_release(...) \
-	__atomic_op_release(cmpxchg, __VA_ARGS__)
-#endif
-
-#ifndef cmpxchg
-#define cmpxchg(...) \
-	__atomic_op_fence(cmpxchg, __VA_ARGS__)
-#endif
-
-#endif /* cmpxchg_relaxed */
-
-#ifndef cmpxchg64_relaxed
-#define cmpxchg64_acquire cmpxchg64
-#define cmpxchg64_release cmpxchg64
-#define cmpxchg64_relaxed cmpxchg64
-#else /* cmpxchg64_relaxed */
-
-#ifndef cmpxchg64_acquire
-#define cmpxchg64_acquire(...) \
-	__atomic_op_acquire(cmpxchg64, __VA_ARGS__)
-#endif
-
-#ifndef cmpxchg64_release
-#define cmpxchg64_release(...) \
-	__atomic_op_release(cmpxchg64, __VA_ARGS__)
-#endif
-
-#ifndef cmpxchg64
-#define cmpxchg64(...) \
-	__atomic_op_fence(cmpxchg64, __VA_ARGS__)
-#endif
-
-#endif /* cmpxchg64_relaxed */
-
-#ifndef try_cmpxchg_relaxed
-#ifdef try_cmpxchg
-#define try_cmpxchg_acquire try_cmpxchg
-#define try_cmpxchg_release try_cmpxchg
-#define try_cmpxchg_relaxed try_cmpxchg
-#endif /* try_cmpxchg */
-
-#ifndef try_cmpxchg
-#define try_cmpxchg(_ptr, _oldp, _new) \
-({ \
-	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
-	___r = cmpxchg((_ptr), ___o, (_new)); \
-	if (unlikely(___r != ___o)) \
-		*___op = ___r; \
-	likely(___r == ___o); \
-})
-#endif /* try_cmpxchg */
-
-#ifndef try_cmpxchg_acquire
-#define try_cmpxchg_acquire(_ptr, _oldp, _new) \
-({ \
-	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
-	___r = cmpxchg_acquire((_ptr), ___o, (_new)); \
-	if (unlikely(___r != ___o)) \
-		*___op = ___r; \
-	likely(___r == ___o); \
-})
-#endif /* try_cmpxchg_acquire */
-
-#ifndef try_cmpxchg_release
-#define try_cmpxchg_release(_ptr, _oldp, _new) \
-({ \
-	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
-	___r = cmpxchg_release((_ptr), ___o, (_new)); \
-	if (unlikely(___r != ___o)) \
-		*___op = ___r; \
-	likely(___r == ___o); \
-})
-#endif /* try_cmpxchg_release */
-
-#ifndef try_cmpxchg_relaxed
-#define try_cmpxchg_relaxed(_ptr, _oldp, _new) \
-({ \
-	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
-	___r = cmpxchg_relaxed((_ptr), ___o, (_new)); \
-	if (unlikely(___r != ___o)) \
-		*___op = ___r; \
-	likely(___r == ___o); \
-})
-#endif /* try_cmpxchg_relaxed */
-
-#else /* try_cmpxchg_relaxed */
-
-#ifndef try_cmpxchg_acquire
-#define try_cmpxchg_acquire(...) \
-	__atomic_op_acquire(try_cmpxchg, __VA_ARGS__)
-#endif
-
-#ifndef try_cmpxchg_release
-#define try_cmpxchg_release(...) \
-	__atomic_op_release(try_cmpxchg, __VA_ARGS__)
-#endif
-
-#ifndef try_cmpxchg
-#define try_cmpxchg(...) \
-	__atomic_op_fence(try_cmpxchg, __VA_ARGS__)
-#endif
-
-#endif /* try_cmpxchg_relaxed */
-
-#define arch_atomic_read atomic_read
-#define arch_atomic_read_acquire atomic_read_acquire
-
-#ifndef atomic_read_acquire
-static __always_inline int
-atomic_read_acquire(const atomic_t *v)
-{
-	return smp_load_acquire(&(v)->counter);
-}
-#define atomic_read_acquire atomic_read_acquire
-#endif
-
-#define arch_atomic_set atomic_set
-#define arch_atomic_set_release atomic_set_release
-
-#ifndef atomic_set_release
-static __always_inline void
-atomic_set_release(atomic_t *v, int i)
-{
-	smp_store_release(&(v)->counter, i);
-}
-#define atomic_set_release atomic_set_release
-#endif
-
-#define arch_atomic_add atomic_add
-
-#define arch_atomic_add_return atomic_add_return
-#define arch_atomic_add_return_acquire atomic_add_return_acquire
-#define arch_atomic_add_return_release atomic_add_return_release
-#define arch_atomic_add_return_relaxed atomic_add_return_relaxed
-
-#ifndef atomic_add_return_relaxed
-#define atomic_add_return_acquire atomic_add_return
-#define atomic_add_return_release atomic_add_return
-#define atomic_add_return_relaxed atomic_add_return
-#else /* atomic_add_return_relaxed */
-
-#ifndef atomic_add_return_acquire
-static __always_inline int
-atomic_add_return_acquire(int i, atomic_t *v)
-{
-	int ret = atomic_add_return_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_add_return_acquire atomic_add_return_acquire
-#endif
-
-#ifndef atomic_add_return_release
-static __always_inline int
-atomic_add_return_release(int i, atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_add_return_relaxed(i, v);
-}
-#define atomic_add_return_release atomic_add_return_release
-#endif
-
-#ifndef atomic_add_return
-static __always_inline int
-atomic_add_return(int i, atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_add_return_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_add_return atomic_add_return
-#endif
-
-#endif /* atomic_add_return_relaxed */
-
-#define arch_atomic_fetch_add atomic_fetch_add
-#define arch_atomic_fetch_add_acquire atomic_fetch_add_acquire
-#define arch_atomic_fetch_add_release atomic_fetch_add_release
-#define arch_atomic_fetch_add_relaxed atomic_fetch_add_relaxed
-
-#ifndef atomic_fetch_add_relaxed
-#define atomic_fetch_add_acquire atomic_fetch_add
-#define atomic_fetch_add_release atomic_fetch_add
-#define atomic_fetch_add_relaxed atomic_fetch_add
-#else /* atomic_fetch_add_relaxed */
-
-#ifndef atomic_fetch_add_acquire
-static __always_inline int
-atomic_fetch_add_acquire(int i, atomic_t *v)
-{
-	int ret = atomic_fetch_add_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_fetch_add_acquire atomic_fetch_add_acquire
-#endif
-
-#ifndef atomic_fetch_add_release
-static __always_inline int
-atomic_fetch_add_release(int i, atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_fetch_add_relaxed(i, v);
-}
-#define atomic_fetch_add_release atomic_fetch_add_release
-#endif
-
-#ifndef atomic_fetch_add
-static __always_inline int
-atomic_fetch_add(int i, atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_fetch_add_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_fetch_add atomic_fetch_add
-#endif
-
-#endif /* atomic_fetch_add_relaxed */
-
-#define arch_atomic_sub atomic_sub
-
-#define arch_atomic_sub_return atomic_sub_return
-#define arch_atomic_sub_return_acquire atomic_sub_return_acquire
-#define arch_atomic_sub_return_release atomic_sub_return_release
-#define arch_atomic_sub_return_relaxed atomic_sub_return_relaxed
-
-#ifndef atomic_sub_return_relaxed
-#define atomic_sub_return_acquire atomic_sub_return
-#define atomic_sub_return_release atomic_sub_return
-#define atomic_sub_return_relaxed atomic_sub_return
-#else /* atomic_sub_return_relaxed */
-
-#ifndef atomic_sub_return_acquire
-static __always_inline int
-atomic_sub_return_acquire(int i, atomic_t *v)
-{
-	int ret = atomic_sub_return_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_sub_return_acquire atomic_sub_return_acquire
-#endif
-
-#ifndef atomic_sub_return_release
-static __always_inline int
-atomic_sub_return_release(int i, atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_sub_return_relaxed(i, v);
-}
-#define atomic_sub_return_release atomic_sub_return_release
-#endif
-
-#ifndef atomic_sub_return
-static __always_inline int
-atomic_sub_return(int i, atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_sub_return_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_sub_return atomic_sub_return
-#endif
-
-#endif /* atomic_sub_return_relaxed */
-
-#define arch_atomic_fetch_sub atomic_fetch_sub
-#define arch_atomic_fetch_sub_acquire atomic_fetch_sub_acquire
-#define arch_atomic_fetch_sub_release atomic_fetch_sub_release
-#define arch_atomic_fetch_sub_relaxed atomic_fetch_sub_relaxed
-
-#ifndef atomic_fetch_sub_relaxed
-#define atomic_fetch_sub_acquire atomic_fetch_sub
-#define atomic_fetch_sub_release atomic_fetch_sub
-#define atomic_fetch_sub_relaxed atomic_fetch_sub
-#else /* atomic_fetch_sub_relaxed */
-
-#ifndef atomic_fetch_sub_acquire
-static __always_inline int
-atomic_fetch_sub_acquire(int i, atomic_t *v)
-{
-	int ret = atomic_fetch_sub_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_fetch_sub_acquire atomic_fetch_sub_acquire
-#endif
-
-#ifndef atomic_fetch_sub_release
-static __always_inline int
-atomic_fetch_sub_release(int i, atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_fetch_sub_relaxed(i, v);
-}
-#define atomic_fetch_sub_release atomic_fetch_sub_release
-#endif
-
-#ifndef atomic_fetch_sub
-static __always_inline int
-atomic_fetch_sub(int i, atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_fetch_sub_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_fetch_sub atomic_fetch_sub
-#endif
-
-#endif /* atomic_fetch_sub_relaxed */
-
-#define arch_atomic_inc atomic_inc
-
-#ifndef atomic_inc
-static __always_inline void
-atomic_inc(atomic_t *v)
-{
-	atomic_add(1, v);
-}
-#define atomic_inc atomic_inc
-#endif
-
-#define arch_atomic_inc_return atomic_inc_return
-#define arch_atomic_inc_return_acquire atomic_inc_return_acquire
-#define arch_atomic_inc_return_release atomic_inc_return_release
-#define arch_atomic_inc_return_relaxed atomic_inc_return_relaxed
-
-#ifndef atomic_inc_return_relaxed
-#ifdef atomic_inc_return
-#define atomic_inc_return_acquire atomic_inc_return
-#define atomic_inc_return_release atomic_inc_return
-#define atomic_inc_return_relaxed atomic_inc_return
-#endif /* atomic_inc_return */
-
-#ifndef atomic_inc_return
-static __always_inline int
-atomic_inc_return(atomic_t *v)
-{
-	return atomic_add_return(1, v);
-}
-#define atomic_inc_return atomic_inc_return
-#endif
-
-#ifndef atomic_inc_return_acquire
-static __always_inline int
-atomic_inc_return_acquire(atomic_t *v)
-{
-	return atomic_add_return_acquire(1, v);
-}
-#define atomic_inc_return_acquire atomic_inc_return_acquire
-#endif
-
-#ifndef atomic_inc_return_release
-static __always_inline int
-atomic_inc_return_release(atomic_t *v)
-{
-	return atomic_add_return_release(1, v);
-}
-#define atomic_inc_return_release atomic_inc_return_release
-#endif
-
-#ifndef atomic_inc_return_relaxed
-static __always_inline int
-atomic_inc_return_relaxed(atomic_t *v)
-{
-	return atomic_add_return_relaxed(1, v);
-}
-#define atomic_inc_return_relaxed atomic_inc_return_relaxed
-#endif
-
-#else /* atomic_inc_return_relaxed */
-
-#ifndef atomic_inc_return_acquire
-static __always_inline int
-atomic_inc_return_acquire(atomic_t *v)
-{
-	int ret = atomic_inc_return_relaxed(v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_inc_return_acquire atomic_inc_return_acquire
-#endif
-
-#ifndef atomic_inc_return_release
-static __always_inline int
-atomic_inc_return_release(atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_inc_return_relaxed(v);
-}
-#define atomic_inc_return_release atomic_inc_return_release
-#endif
-
-#ifndef atomic_inc_return
-static __always_inline int
-atomic_inc_return(atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_inc_return_relaxed(v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_inc_return atomic_inc_return
-#endif
-
-#endif /* atomic_inc_return_relaxed */
-
-#define arch_atomic_fetch_inc atomic_fetch_inc
-#define arch_atomic_fetch_inc_acquire atomic_fetch_inc_acquire
-#define arch_atomic_fetch_inc_release atomic_fetch_inc_release
-#define arch_atomic_fetch_inc_relaxed atomic_fetch_inc_relaxed
-
-#ifndef atomic_fetch_inc_relaxed
-#ifdef atomic_fetch_inc
-#define atomic_fetch_inc_acquire atomic_fetch_inc
-#define atomic_fetch_inc_release atomic_fetch_inc
-#define atomic_fetch_inc_relaxed atomic_fetch_inc
-#endif /* atomic_fetch_inc */
-
-#ifndef atomic_fetch_inc
-static __always_inline int
-atomic_fetch_inc(atomic_t *v)
-{
-	return atomic_fetch_add(1, v);
-}
-#define atomic_fetch_inc atomic_fetch_inc
-#endif
-
-#ifndef atomic_fetch_inc_acquire
-static __always_inline int
-atomic_fetch_inc_acquire(atomic_t *v)
-{
-	return atomic_fetch_add_acquire(1, v);
-}
-#define atomic_fetch_inc_acquire atomic_fetch_inc_acquire
-#endif
-
-#ifndef atomic_fetch_inc_release
-static __always_inline int
-atomic_fetch_inc_release(atomic_t *v)
-{
-	return atomic_fetch_add_release(1, v);
-}
-#define atomic_fetch_inc_release atomic_fetch_inc_release
-#endif
-
-#ifndef atomic_fetch_inc_relaxed
-static __always_inline int
-atomic_fetch_inc_relaxed(atomic_t *v)
-{
-	return atomic_fetch_add_relaxed(1, v);
-}
-#define atomic_fetch_inc_relaxed atomic_fetch_inc_relaxed
-#endif
-
-#else /* atomic_fetch_inc_relaxed */
-
-#ifndef atomic_fetch_inc_acquire
-static __always_inline int
-atomic_fetch_inc_acquire(atomic_t *v)
-{
-	int ret = atomic_fetch_inc_relaxed(v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_fetch_inc_acquire atomic_fetch_inc_acquire
-#endif
-
-#ifndef atomic_fetch_inc_release
-static __always_inline int
-atomic_fetch_inc_release(atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_fetch_inc_relaxed(v);
-}
-#define atomic_fetch_inc_release atomic_fetch_inc_release
-#endif
-
-#ifndef atomic_fetch_inc
-static __always_inline int
-atomic_fetch_inc(atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_fetch_inc_relaxed(v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_fetch_inc atomic_fetch_inc
-#endif
-
-#endif /* atomic_fetch_inc_relaxed */
-
-#define arch_atomic_dec atomic_dec
-
-#ifndef atomic_dec
-static __always_inline void
-atomic_dec(atomic_t *v)
-{
-	atomic_sub(1, v);
-}
-#define atomic_dec atomic_dec
-#endif
-
-#define arch_atomic_dec_return atomic_dec_return
-#define arch_atomic_dec_return_acquire atomic_dec_return_acquire
-#define arch_atomic_dec_return_release atomic_dec_return_release
-#define arch_atomic_dec_return_relaxed atomic_dec_return_relaxed
-
-#ifndef atomic_dec_return_relaxed
-#ifdef atomic_dec_return
-#define atomic_dec_return_acquire atomic_dec_return
-#define atomic_dec_return_release atomic_dec_return
-#define atomic_dec_return_relaxed atomic_dec_return
-#endif /* atomic_dec_return */
-
-#ifndef atomic_dec_return
-static __always_inline int
-atomic_dec_return(atomic_t *v)
-{
-	return atomic_sub_return(1, v);
-}
-#define atomic_dec_return atomic_dec_return
-#endif
-
-#ifndef atomic_dec_return_acquire
-static __always_inline int
-atomic_dec_return_acquire(atomic_t *v)
-{
-	return atomic_sub_return_acquire(1, v);
-}
-#define atomic_dec_return_acquire atomic_dec_return_acquire
-#endif
-
-#ifndef atomic_dec_return_release
-static __always_inline int
-atomic_dec_return_release(atomic_t *v)
-{
-	return atomic_sub_return_release(1, v);
-}
-#define atomic_dec_return_release atomic_dec_return_release
-#endif
-
-#ifndef atomic_dec_return_relaxed
-static __always_inline int
-atomic_dec_return_relaxed(atomic_t *v)
-{
-	return atomic_sub_return_relaxed(1, v);
-}
-#define atomic_dec_return_relaxed atomic_dec_return_relaxed
-#endif
-
-#else /* atomic_dec_return_relaxed */
-
-#ifndef atomic_dec_return_acquire
-static __always_inline int
-atomic_dec_return_acquire(atomic_t *v)
-{
-	int ret = atomic_dec_return_relaxed(v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_dec_return_acquire atomic_dec_return_acquire
-#endif
-
-#ifndef atomic_dec_return_release
-static __always_inline int
-atomic_dec_return_release(atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_dec_return_relaxed(v);
-}
-#define atomic_dec_return_release atomic_dec_return_release
-#endif
-
-#ifndef atomic_dec_return
-static __always_inline int
-atomic_dec_return(atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_dec_return_relaxed(v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_dec_return atomic_dec_return
-#endif
-
-#endif /* atomic_dec_return_relaxed */
-
-#define arch_atomic_fetch_dec atomic_fetch_dec
-#define arch_atomic_fetch_dec_acquire atomic_fetch_dec_acquire
-#define arch_atomic_fetch_dec_release atomic_fetch_dec_release
-#define arch_atomic_fetch_dec_relaxed atomic_fetch_dec_relaxed
-
-#ifndef atomic_fetch_dec_relaxed
-#ifdef atomic_fetch_dec
-#define atomic_fetch_dec_acquire atomic_fetch_dec
-#define atomic_fetch_dec_release atomic_fetch_dec
-#define atomic_fetch_dec_relaxed atomic_fetch_dec
-#endif /* atomic_fetch_dec */
-
-#ifndef atomic_fetch_dec
-static __always_inline int
-atomic_fetch_dec(atomic_t *v)
-{
-	return atomic_fetch_sub(1, v);
-}
-#define atomic_fetch_dec atomic_fetch_dec
-#endif
-
-#ifndef atomic_fetch_dec_acquire
-static __always_inline int
-atomic_fetch_dec_acquire(atomic_t *v)
-{
-	return atomic_fetch_sub_acquire(1, v);
-}
-#define atomic_fetch_dec_acquire atomic_fetch_dec_acquire
-#endif
-
-#ifndef atomic_fetch_dec_release
-static __always_inline int
-atomic_fetch_dec_release(atomic_t *v)
-{
-	return atomic_fetch_sub_release(1, v);
-}
-#define atomic_fetch_dec_release atomic_fetch_dec_release
-#endif
-
-#ifndef atomic_fetch_dec_relaxed
-static __always_inline int
-atomic_fetch_dec_relaxed(atomic_t *v)
-{
-	return atomic_fetch_sub_relaxed(1, v);
-}
-#define atomic_fetch_dec_relaxed atomic_fetch_dec_relaxed
-#endif
-
-#else /* atomic_fetch_dec_relaxed */
-
-#ifndef atomic_fetch_dec_acquire
-static __always_inline int
-atomic_fetch_dec_acquire(atomic_t *v)
-{
-	int ret = atomic_fetch_dec_relaxed(v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_fetch_dec_acquire atomic_fetch_dec_acquire
-#endif
-
-#ifndef atomic_fetch_dec_release
-static __always_inline int
-atomic_fetch_dec_release(atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_fetch_dec_relaxed(v);
-}
-#define atomic_fetch_dec_release atomic_fetch_dec_release
-#endif
-
-#ifndef atomic_fetch_dec
-static __always_inline int
-atomic_fetch_dec(atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_fetch_dec_relaxed(v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_fetch_dec atomic_fetch_dec
-#endif
-
-#endif /* atomic_fetch_dec_relaxed */
-
-#define arch_atomic_and atomic_and
-
-#define arch_atomic_fetch_and atomic_fetch_and
-#define arch_atomic_fetch_and_acquire atomic_fetch_and_acquire
-#define arch_atomic_fetch_and_release atomic_fetch_and_release
-#define arch_atomic_fetch_and_relaxed atomic_fetch_and_relaxed
-
-#ifndef atomic_fetch_and_relaxed
-#define atomic_fetch_and_acquire atomic_fetch_and
-#define atomic_fetch_and_release atomic_fetch_and
-#define atomic_fetch_and_relaxed atomic_fetch_and
-#else /* atomic_fetch_and_relaxed */
-
-#ifndef atomic_fetch_and_acquire
-static __always_inline int
-atomic_fetch_and_acquire(int i, atomic_t *v)
-{
-	int ret = atomic_fetch_and_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_fetch_and_acquire atomic_fetch_and_acquire
-#endif
-
-#ifndef atomic_fetch_and_release
-static __always_inline int
-atomic_fetch_and_release(int i, atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_fetch_and_relaxed(i, v);
-}
-#define atomic_fetch_and_release atomic_fetch_and_release
-#endif
-
-#ifndef atomic_fetch_and
-static __always_inline int
-atomic_fetch_and(int i, atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_fetch_and_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_fetch_and atomic_fetch_and
-#endif
-
-#endif /* atomic_fetch_and_relaxed */
-
-#define arch_atomic_andnot atomic_andnot
-
-#ifndef atomic_andnot
-static __always_inline void
-atomic_andnot(int i, atomic_t *v)
-{
-	atomic_and(~i, v);
-}
-#define atomic_andnot atomic_andnot
-#endif
-
-#define arch_atomic_fetch_andnot atomic_fetch_andnot
-#define arch_atomic_fetch_andnot_acquire atomic_fetch_andnot_acquire
-#define arch_atomic_fetch_andnot_release atomic_fetch_andnot_release
-#define arch_atomic_fetch_andnot_relaxed atomic_fetch_andnot_relaxed
-
-#ifndef atomic_fetch_andnot_relaxed
-#ifdef atomic_fetch_andnot
-#define atomic_fetch_andnot_acquire atomic_fetch_andnot
-#define atomic_fetch_andnot_release atomic_fetch_andnot
-#define atomic_fetch_andnot_relaxed atomic_fetch_andnot
-#endif /* atomic_fetch_andnot */
-
-#ifndef atomic_fetch_andnot
-static __always_inline int
-atomic_fetch_andnot(int i, atomic_t *v)
-{
-	return atomic_fetch_and(~i, v);
-}
-#define atomic_fetch_andnot atomic_fetch_andnot
-#endif
-
-#ifndef atomic_fetch_andnot_acquire
-static __always_inline int
-atomic_fetch_andnot_acquire(int i, atomic_t *v)
-{
-	return atomic_fetch_and_acquire(~i, v);
-}
-#define atomic_fetch_andnot_acquire atomic_fetch_andnot_acquire
-#endif
-
-#ifndef atomic_fetch_andnot_release
-static __always_inline int
-atomic_fetch_andnot_release(int i, atomic_t *v)
-{
-	return atomic_fetch_and_release(~i, v);
-}
-#define atomic_fetch_andnot_release atomic_fetch_andnot_release
-#endif
-
-#ifndef atomic_fetch_andnot_relaxed
-static __always_inline int
-atomic_fetch_andnot_relaxed(int i, atomic_t *v)
-{
-	return atomic_fetch_and_relaxed(~i, v);
-}
-#define atomic_fetch_andnot_relaxed atomic_fetch_andnot_relaxed
-#endif
-
-#else /* atomic_fetch_andnot_relaxed */
-
-#ifndef atomic_fetch_andnot_acquire
-static __always_inline int
-atomic_fetch_andnot_acquire(int i, atomic_t *v)
-{
-	int ret = atomic_fetch_andnot_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_fetch_andnot_acquire atomic_fetch_andnot_acquire
-#endif
-
-#ifndef atomic_fetch_andnot_release
-static __always_inline int
-atomic_fetch_andnot_release(int i, atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_fetch_andnot_relaxed(i, v);
-}
-#define atomic_fetch_andnot_release atomic_fetch_andnot_release
-#endif
-
-#ifndef atomic_fetch_andnot
-static __always_inline int
-atomic_fetch_andnot(int i, atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_fetch_andnot_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_fetch_andnot atomic_fetch_andnot
-#endif
-
-#endif /* atomic_fetch_andnot_relaxed */
-
-#define arch_atomic_or atomic_or
-
-#define arch_atomic_fetch_or atomic_fetch_or
-#define arch_atomic_fetch_or_acquire atomic_fetch_or_acquire
-#define arch_atomic_fetch_or_release atomic_fetch_or_release
-#define arch_atomic_fetch_or_relaxed atomic_fetch_or_relaxed
-
-#ifndef atomic_fetch_or_relaxed
-#define atomic_fetch_or_acquire atomic_fetch_or
-#define atomic_fetch_or_release atomic_fetch_or
-#define atomic_fetch_or_relaxed atomic_fetch_or
-#else /* atomic_fetch_or_relaxed */
-
-#ifndef atomic_fetch_or_acquire
-static __always_inline int
-atomic_fetch_or_acquire(int i, atomic_t *v)
-{
-	int ret = atomic_fetch_or_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_fetch_or_acquire atomic_fetch_or_acquire
-#endif
-
-#ifndef atomic_fetch_or_release
-static __always_inline int
-atomic_fetch_or_release(int i, atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_fetch_or_relaxed(i, v);
-}
-#define atomic_fetch_or_release atomic_fetch_or_release
-#endif
-
-#ifndef atomic_fetch_or
-static __always_inline int
-atomic_fetch_or(int i, atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_fetch_or_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_fetch_or atomic_fetch_or
-#endif
-
-#endif /* atomic_fetch_or_relaxed */
-
-#define arch_atomic_xor atomic_xor
-
-#define arch_atomic_fetch_xor atomic_fetch_xor
-#define arch_atomic_fetch_xor_acquire atomic_fetch_xor_acquire
-#define arch_atomic_fetch_xor_release atomic_fetch_xor_release
-#define arch_atomic_fetch_xor_relaxed atomic_fetch_xor_relaxed
-
-#ifndef atomic_fetch_xor_relaxed
-#define atomic_fetch_xor_acquire atomic_fetch_xor
-#define atomic_fetch_xor_release atomic_fetch_xor
-#define atomic_fetch_xor_relaxed atomic_fetch_xor
-#else /* atomic_fetch_xor_relaxed */
-
-#ifndef atomic_fetch_xor_acquire
-static __always_inline int
-atomic_fetch_xor_acquire(int i, atomic_t *v)
-{
-	int ret = atomic_fetch_xor_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_fetch_xor_acquire atomic_fetch_xor_acquire
-#endif
-
-#ifndef atomic_fetch_xor_release
-static __always_inline int
-atomic_fetch_xor_release(int i, atomic_t *v)
-{
-	__atomic_release_fence();
-	return atomic_fetch_xor_relaxed(i, v);
-}
-#define atomic_fetch_xor_release atomic_fetch_xor_release
-#endif
-
-#ifndef atomic_fetch_xor
-static __always_inline int
-atomic_fetch_xor(int i, atomic_t *v)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_fetch_xor_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_fetch_xor atomic_fetch_xor
-#endif
-
-#endif /* atomic_fetch_xor_relaxed */
-
-#define arch_atomic_xchg atomic_xchg
-#define arch_atomic_xchg_acquire atomic_xchg_acquire
-#define arch_atomic_xchg_release atomic_xchg_release
-#define arch_atomic_xchg_relaxed atomic_xchg_relaxed
-
-#ifndef atomic_xchg_relaxed
-#define atomic_xchg_acquire atomic_xchg
-#define atomic_xchg_release atomic_xchg
-#define atomic_xchg_relaxed atomic_xchg
-#else /* atomic_xchg_relaxed */
-
-#ifndef atomic_xchg_acquire
-static __always_inline int
-atomic_xchg_acquire(atomic_t *v, int i)
-{
-	int ret = atomic_xchg_relaxed(v, i);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_xchg_acquire atomic_xchg_acquire
-#endif
-
-#ifndef atomic_xchg_release
-static __always_inline int
-atomic_xchg_release(atomic_t *v, int i)
-{
-	__atomic_release_fence();
-	return atomic_xchg_relaxed(v, i);
-}
-#define atomic_xchg_release atomic_xchg_release
-#endif
-
-#ifndef atomic_xchg
-static __always_inline int
-atomic_xchg(atomic_t *v, int i)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_xchg_relaxed(v, i);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_xchg atomic_xchg
-#endif
-
-#endif /* atomic_xchg_relaxed */
-
-#define arch_atomic_cmpxchg atomic_cmpxchg
-#define arch_atomic_cmpxchg_acquire atomic_cmpxchg_acquire
-#define arch_atomic_cmpxchg_release atomic_cmpxchg_release
-#define arch_atomic_cmpxchg_relaxed atomic_cmpxchg_relaxed
-
-#ifndef atomic_cmpxchg_relaxed
-#define atomic_cmpxchg_acquire atomic_cmpxchg
-#define atomic_cmpxchg_release atomic_cmpxchg
-#define atomic_cmpxchg_relaxed atomic_cmpxchg
-#else /* atomic_cmpxchg_relaxed */
-
-#ifndef atomic_cmpxchg_acquire
-static __always_inline int
-atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
-{
-	int ret = atomic_cmpxchg_relaxed(v, old, new);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_cmpxchg_acquire atomic_cmpxchg_acquire
-#endif
-
-#ifndef atomic_cmpxchg_release
-static __always_inline int
-atomic_cmpxchg_release(atomic_t *v, int old, int new)
-{
-	__atomic_release_fence();
-	return atomic_cmpxchg_relaxed(v, old, new);
-}
-#define atomic_cmpxchg_release atomic_cmpxchg_release
-#endif
-
-#ifndef atomic_cmpxchg
-static __always_inline int
-atomic_cmpxchg(atomic_t *v, int old, int new)
-{
-	int ret;
-	__atomic_pre_full_fence();
-	ret = atomic_cmpxchg_relaxed(v, old, new);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_cmpxchg atomic_cmpxchg
-#endif
-
-#endif /* atomic_cmpxchg_relaxed */
-
-#define arch_atomic_try_cmpxchg atomic_try_cmpxchg
-#define arch_atomic_try_cmpxchg_acquire atomic_try_cmpxchg_acquire
-#define arch_atomic_try_cmpxchg_release atomic_try_cmpxchg_release
-#define arch_atomic_try_cmpxchg_relaxed atomic_try_cmpxchg_relaxed
-
-#ifndef atomic_try_cmpxchg_relaxed
-#ifdef atomic_try_cmpxchg
-#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg
-#define atomic_try_cmpxchg_release atomic_try_cmpxchg
-#define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg
-#endif /* atomic_try_cmpxchg */
-
-#ifndef atomic_try_cmpxchg
-static __always_inline bool
-atomic_try_cmpxchg(atomic_t *v, int *old, int new)
-{
-	int r, o = *old;
-	r = atomic_cmpxchg(v, o, new);
-	if (unlikely(r != o))
-		*old = r;
-	return likely(r == o);
-}
-#define atomic_try_cmpxchg atomic_try_cmpxchg
-#endif
-
-#ifndef atomic_try_cmpxchg_acquire
-static __always_inline bool
-atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
-{
-	int r, o = *old;
-	r = atomic_cmpxchg_acquire(v, o, new);
-	if (unlikely(r != o))
-		*old = r;
-	return likely(r == o);
-}
-#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg_acquire
-#endif
-
-#ifndef atomic_try_cmpxchg_release
-static __always_inline bool
-atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
-{
-	int r, o = *old;
-	r = atomic_cmpxchg_release(v, o, new);
-	if (unlikely(r != o))
-		*old = r;
-	return likely(r == o);
-}
-#define atomic_try_cmpxchg_release atomic_try_cmpxchg_release
-#endif
-
-#ifndef atomic_try_cmpxchg_relaxed
-static __always_inline bool
-atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
-{
-	int r, o = *old;
-	r = atomic_cmpxchg_relaxed(v, o, new);
-	if (unlikely(r != o))
-		*old = r;
-	return likely(r == o);
-}
-#define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg_relaxed
-#endif
-
-#else /* atomic_try_cmpxchg_relaxed */
-
-#ifndef atomic_try_cmpxchg_acquire
-static __always_inline bool
-atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
-{
-	bool ret = atomic_try_cmpxchg_relaxed(v, old, new);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg_acquire
-#endif
-
-#ifndef atomic_try_cmpxchg_release
-static __always_inline bool
-atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
-{
-	__atomic_release_fence();
-	return atomic_try_cmpxchg_relaxed(v, old, new);
-}
-#define atomic_try_cmpxchg_release atomic_try_cmpxchg_release
-#endif
-
-#ifndef atomic_try_cmpxchg
-static __always_inline bool
-atomic_try_cmpxchg(atomic_t *v, int *old, int new)
-{
-	bool ret;
-	__atomic_pre_full_fence();
-	ret = atomic_try_cmpxchg_relaxed(v, old, new);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic_try_cmpxchg atomic_try_cmpxchg
-#endif
-
-#endif /* atomic_try_cmpxchg_relaxed */
-
-#define arch_atomic_sub_and_test atomic_sub_and_test
-
-#ifndef atomic_sub_and_test
-/**
- * atomic_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static __always_inline bool
-atomic_sub_and_test(int i, atomic_t *v)
-{
-	return atomic_sub_return(i, v) == 0;
-}
-#define atomic_sub_and_test atomic_sub_and_test
-#endif
-
-#define arch_atomic_dec_and_test atomic_dec_and_test
-
-#ifndef atomic_dec_and_test
-/**
- * atomic_dec_and_test - decrement and test
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static __always_inline bool
-atomic_dec_and_test(atomic_t *v)
-{
-	return atomic_dec_return(v) == 0;
-}
-#define atomic_dec_and_test atomic_dec_and_test
-#endif
-
-#define arch_atomic_inc_and_test atomic_inc_and_test
-
-#ifndef atomic_inc_and_test
-/**
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static __always_inline bool
-atomic_inc_and_test(atomic_t *v)
-{
-	return atomic_inc_return(v) == 0;
-}
-#define atomic_inc_and_test atomic_inc_and_test
-#endif
-
-#define arch_atomic_add_negative atomic_add_negative
-
-#ifndef atomic_add_negative
-/**
- * atomic_add_negative - add and test if negative
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static __always_inline bool
-atomic_add_negative(int i, atomic_t *v)
-{
-	return atomic_add_return(i, v) < 0;
-}
-#define atomic_add_negative atomic_add_negative
-#endif
-
-#define arch_atomic_fetch_add_unless atomic_fetch_add_unless
-
-#ifndef atomic_fetch_add_unless
-/**
- * atomic_fetch_add_unless - add unless the number is already a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns original value of @v
- */
-static __always_inline int
-atomic_fetch_add_unless(atomic_t *v, int a, int u)
-{
-	int c = atomic_read(v);
-
-	do {
-		if (unlikely(c == u))
-			break;
-	} while (!atomic_try_cmpxchg(v, &c, c + a));
-
-	return c;
-}
-#define atomic_fetch_add_unless atomic_fetch_add_unless
-#endif
-
-#define arch_atomic_add_unless atomic_add_unless
-
-#ifndef atomic_add_unless
-/**
- * atomic_add_unless - add unless the number is already a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, if @v was not already @u.
- * Returns true if the addition was done.
- */
-static __always_inline bool
-atomic_add_unless(atomic_t *v, int a, int u)
-{
-	return atomic_fetch_add_unless(v, a, u) != u;
-}
-#define atomic_add_unless atomic_add_unless
-#endif
-
-#define arch_atomic_inc_not_zero atomic_inc_not_zero
-
-#ifndef atomic_inc_not_zero
-/**
- * atomic_inc_not_zero - increment unless the number is zero
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1, if @v is non-zero.
- * Returns true if the increment was done.
- */
-static __always_inline bool
-atomic_inc_not_zero(atomic_t *v)
-{
-	return atomic_add_unless(v, 1, 0);
-}
-#define atomic_inc_not_zero atomic_inc_not_zero
-#endif
-
-#define arch_atomic_inc_unless_negative atomic_inc_unless_negative
-
-#ifndef atomic_inc_unless_negative
-static __always_inline bool
-atomic_inc_unless_negative(atomic_t *v)
-{
-	int c = atomic_read(v);
-
-	do {
-		if (unlikely(c < 0))
-			return false;
-	} while (!atomic_try_cmpxchg(v, &c, c + 1));
-
-	return true;
-}
-#define atomic_inc_unless_negative atomic_inc_unless_negative
-#endif
-
-#define arch_atomic_dec_unless_positive atomic_dec_unless_positive
-
-#ifndef atomic_dec_unless_positive
-static __always_inline bool
-atomic_dec_unless_positive(atomic_t *v)
-{
-	int c = atomic_read(v);
-
-	do {
-		if (unlikely(c > 0))
-			return false;
-	} while (!atomic_try_cmpxchg(v, &c, c - 1));
-
-	return true;
-}
-#define atomic_dec_unless_positive atomic_dec_unless_positive
-#endif
-
-#define arch_atomic_dec_if_positive atomic_dec_if_positive
-
-#ifndef atomic_dec_if_positive
-static __always_inline int
-atomic_dec_if_positive(atomic_t *v)
-{
-	int dec, c = atomic_read(v);
-
-	do {
-		dec = c - 1;
-		if (unlikely(dec < 0))
-			break;
-	} while (!atomic_try_cmpxchg(v, &c, dec));
-
-	return dec;
-}
-#define atomic_dec_if_positive atomic_dec_if_positive
-#endif
-
-#ifdef CONFIG_GENERIC_ATOMIC64
-#include <asm-generic/atomic64.h>
-#endif
-
-#define arch_atomic64_read atomic64_read
-#define arch_atomic64_read_acquire atomic64_read_acquire
-
-#ifndef atomic64_read_acquire
-static __always_inline s64
-atomic64_read_acquire(const atomic64_t *v)
-{
-	return smp_load_acquire(&(v)->counter);
-}
-#define atomic64_read_acquire atomic64_read_acquire
-#endif
-
-#define arch_atomic64_set atomic64_set
-#define arch_atomic64_set_release atomic64_set_release
-
-#ifndef atomic64_set_release
-static __always_inline void
-atomic64_set_release(atomic64_t *v, s64 i)
-{
-	smp_store_release(&(v)->counter, i);
-}
-#define atomic64_set_release atomic64_set_release
-#endif
-
-#define arch_atomic64_add atomic64_add
-
-#define arch_atomic64_add_return atomic64_add_return
-#define arch_atomic64_add_return_acquire atomic64_add_return_acquire
-#define arch_atomic64_add_return_release atomic64_add_return_release
-#define arch_atomic64_add_return_relaxed atomic64_add_return_relaxed
-
-#ifndef atomic64_add_return_relaxed
-#define atomic64_add_return_acquire atomic64_add_return
-#define atomic64_add_return_release atomic64_add_return
-#define atomic64_add_return_relaxed atomic64_add_return
-#else /* atomic64_add_return_relaxed */
-
-#ifndef atomic64_add_return_acquire
-static __always_inline s64
-atomic64_add_return_acquire(s64 i, atomic64_t *v)
-{
-	s64 ret = atomic64_add_return_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_add_return_acquire atomic64_add_return_acquire
-#endif
-
-#ifndef atomic64_add_return_release
-static __always_inline s64
-atomic64_add_return_release(s64 i, atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_add_return_relaxed(i, v);
-}
-#define atomic64_add_return_release atomic64_add_return_release
-#endif
-
-#ifndef atomic64_add_return
-static __always_inline s64
-atomic64_add_return(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_add_return_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_add_return atomic64_add_return
-#endif
-
-#endif /* atomic64_add_return_relaxed */
-
-#define arch_atomic64_fetch_add atomic64_fetch_add
-#define arch_atomic64_fetch_add_acquire atomic64_fetch_add_acquire
-#define arch_atomic64_fetch_add_release atomic64_fetch_add_release
-#define arch_atomic64_fetch_add_relaxed atomic64_fetch_add_relaxed
-
-#ifndef atomic64_fetch_add_relaxed
-#define atomic64_fetch_add_acquire atomic64_fetch_add
-#define atomic64_fetch_add_release atomic64_fetch_add
-#define atomic64_fetch_add_relaxed atomic64_fetch_add
-#else /* atomic64_fetch_add_relaxed */
-
-#ifndef atomic64_fetch_add_acquire
-static __always_inline s64
-atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
-{
-	s64 ret = atomic64_fetch_add_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_fetch_add_acquire atomic64_fetch_add_acquire
-#endif
-
-#ifndef atomic64_fetch_add_release
-static __always_inline s64
-atomic64_fetch_add_release(s64 i, atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_fetch_add_relaxed(i, v);
-}
-#define atomic64_fetch_add_release atomic64_fetch_add_release
-#endif
-
-#ifndef atomic64_fetch_add
-static __always_inline s64
-atomic64_fetch_add(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_fetch_add_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_fetch_add atomic64_fetch_add
-#endif
-
-#endif /* atomic64_fetch_add_relaxed */
-
-#define arch_atomic64_sub atomic64_sub
-
-#define arch_atomic64_sub_return atomic64_sub_return
-#define arch_atomic64_sub_return_acquire atomic64_sub_return_acquire
-#define arch_atomic64_sub_return_release atomic64_sub_return_release
-#define arch_atomic64_sub_return_relaxed atomic64_sub_return_relaxed
-
-#ifndef atomic64_sub_return_relaxed
-#define atomic64_sub_return_acquire atomic64_sub_return
-#define atomic64_sub_return_release atomic64_sub_return
-#define atomic64_sub_return_relaxed atomic64_sub_return
-#else /* atomic64_sub_return_relaxed */
-
-#ifndef atomic64_sub_return_acquire
-static __always_inline s64
-atomic64_sub_return_acquire(s64 i, atomic64_t *v)
-{
-	s64 ret = atomic64_sub_return_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_sub_return_acquire atomic64_sub_return_acquire
-#endif
-
-#ifndef atomic64_sub_return_release
-static __always_inline s64
-atomic64_sub_return_release(s64 i, atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_sub_return_relaxed(i, v);
-}
-#define atomic64_sub_return_release atomic64_sub_return_release
-#endif
-
-#ifndef atomic64_sub_return
-static __always_inline s64
-atomic64_sub_return(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_sub_return_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_sub_return atomic64_sub_return
-#endif
-
-#endif /* atomic64_sub_return_relaxed */
-
-#define arch_atomic64_fetch_sub atomic64_fetch_sub
-#define arch_atomic64_fetch_sub_acquire atomic64_fetch_sub_acquire
-#define arch_atomic64_fetch_sub_release atomic64_fetch_sub_release
-#define arch_atomic64_fetch_sub_relaxed atomic64_fetch_sub_relaxed
-
-#ifndef atomic64_fetch_sub_relaxed
-#define atomic64_fetch_sub_acquire atomic64_fetch_sub
-#define atomic64_fetch_sub_release atomic64_fetch_sub
-#define atomic64_fetch_sub_relaxed atomic64_fetch_sub
-#else /* atomic64_fetch_sub_relaxed */
-
-#ifndef atomic64_fetch_sub_acquire
-static __always_inline s64
-atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
-{
-	s64 ret = atomic64_fetch_sub_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_fetch_sub_acquire atomic64_fetch_sub_acquire
-#endif
-
-#ifndef atomic64_fetch_sub_release
-static __always_inline s64
-atomic64_fetch_sub_release(s64 i, atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_fetch_sub_relaxed(i, v);
-}
-#define atomic64_fetch_sub_release atomic64_fetch_sub_release
-#endif
-
-#ifndef atomic64_fetch_sub
-static __always_inline s64
-atomic64_fetch_sub(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_fetch_sub_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_fetch_sub atomic64_fetch_sub
-#endif
-
-#endif /* atomic64_fetch_sub_relaxed */
-
-#define arch_atomic64_inc atomic64_inc
-
-#ifndef atomic64_inc
-static __always_inline void
-atomic64_inc(atomic64_t *v)
-{
-	atomic64_add(1, v);
-}
-#define atomic64_inc atomic64_inc
-#endif
-
-#define arch_atomic64_inc_return atomic64_inc_return
-#define arch_atomic64_inc_return_acquire atomic64_inc_return_acquire
-#define arch_atomic64_inc_return_release atomic64_inc_return_release
-#define arch_atomic64_inc_return_relaxed atomic64_inc_return_relaxed
-
-#ifndef atomic64_inc_return_relaxed
-#ifdef atomic64_inc_return
-#define atomic64_inc_return_acquire atomic64_inc_return
-#define atomic64_inc_return_release atomic64_inc_return
-#define atomic64_inc_return_relaxed atomic64_inc_return
-#endif /* atomic64_inc_return */
-
-#ifndef atomic64_inc_return
-static __always_inline s64
-atomic64_inc_return(atomic64_t *v)
-{
-	return atomic64_add_return(1, v);
-}
-#define atomic64_inc_return atomic64_inc_return
-#endif
-
-#ifndef atomic64_inc_return_acquire
-static __always_inline s64
-atomic64_inc_return_acquire(atomic64_t *v)
-{
-	return atomic64_add_return_acquire(1, v);
-}
-#define atomic64_inc_return_acquire atomic64_inc_return_acquire
-#endif
-
-#ifndef atomic64_inc_return_release
-static __always_inline s64
-atomic64_inc_return_release(atomic64_t *v)
-{
-	return atomic64_add_return_release(1, v);
-}
-#define atomic64_inc_return_release atomic64_inc_return_release
-#endif
-
-#ifndef atomic64_inc_return_relaxed
-static __always_inline s64
-atomic64_inc_return_relaxed(atomic64_t *v)
-{
-	return atomic64_add_return_relaxed(1, v);
-}
-#define atomic64_inc_return_relaxed atomic64_inc_return_relaxed
-#endif
-
-#else /* atomic64_inc_return_relaxed */
-
-#ifndef atomic64_inc_return_acquire
-static __always_inline s64
-atomic64_inc_return_acquire(atomic64_t *v)
-{
-	s64 ret = atomic64_inc_return_relaxed(v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_inc_return_acquire atomic64_inc_return_acquire
-#endif
-
-#ifndef atomic64_inc_return_release
-static __always_inline s64
-atomic64_inc_return_release(atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_inc_return_relaxed(v);
-}
-#define atomic64_inc_return_release atomic64_inc_return_release
-#endif
-
-#ifndef atomic64_inc_return
-static __always_inline s64
-atomic64_inc_return(atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_inc_return_relaxed(v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_inc_return atomic64_inc_return
-#endif
-
-#endif /* atomic64_inc_return_relaxed */
-
-#define arch_atomic64_fetch_inc atomic64_fetch_inc
-#define arch_atomic64_fetch_inc_acquire atomic64_fetch_inc_acquire
-#define arch_atomic64_fetch_inc_release atomic64_fetch_inc_release
-#define arch_atomic64_fetch_inc_relaxed atomic64_fetch_inc_relaxed
-
-#ifndef atomic64_fetch_inc_relaxed
-#ifdef atomic64_fetch_inc
-#define atomic64_fetch_inc_acquire atomic64_fetch_inc
-#define atomic64_fetch_inc_release atomic64_fetch_inc
-#define atomic64_fetch_inc_relaxed atomic64_fetch_inc
-#endif /* atomic64_fetch_inc */
-
-#ifndef atomic64_fetch_inc
-static __always_inline s64
-atomic64_fetch_inc(atomic64_t *v)
-{
-	return atomic64_fetch_add(1, v);
-}
-#define atomic64_fetch_inc atomic64_fetch_inc
-#endif
-
-#ifndef atomic64_fetch_inc_acquire
-static __always_inline s64
-atomic64_fetch_inc_acquire(atomic64_t *v)
-{
-	return atomic64_fetch_add_acquire(1, v);
-}
-#define atomic64_fetch_inc_acquire atomic64_fetch_inc_acquire
-#endif
-
-#ifndef atomic64_fetch_inc_release
-static __always_inline s64
-atomic64_fetch_inc_release(atomic64_t *v)
-{
-	return atomic64_fetch_add_release(1, v);
-}
-#define atomic64_fetch_inc_release atomic64_fetch_inc_release
-#endif
-
-#ifndef atomic64_fetch_inc_relaxed
-static __always_inline s64
-atomic64_fetch_inc_relaxed(atomic64_t *v)
-{
-	return atomic64_fetch_add_relaxed(1, v);
-}
-#define atomic64_fetch_inc_relaxed atomic64_fetch_inc_relaxed
-#endif
-
-#else /* atomic64_fetch_inc_relaxed */
-
-#ifndef atomic64_fetch_inc_acquire
-static __always_inline s64
-atomic64_fetch_inc_acquire(atomic64_t *v)
-{
-	s64 ret = atomic64_fetch_inc_relaxed(v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_fetch_inc_acquire atomic64_fetch_inc_acquire
-#endif
-
-#ifndef atomic64_fetch_inc_release
-static __always_inline s64
-atomic64_fetch_inc_release(atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_fetch_inc_relaxed(v);
-}
-#define atomic64_fetch_inc_release atomic64_fetch_inc_release
-#endif
-
-#ifndef atomic64_fetch_inc
-static __always_inline s64
-atomic64_fetch_inc(atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_fetch_inc_relaxed(v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_fetch_inc atomic64_fetch_inc
-#endif
-
-#endif /* atomic64_fetch_inc_relaxed */
-
-#define arch_atomic64_dec atomic64_dec
-
-#ifndef atomic64_dec
-static __always_inline void
-atomic64_dec(atomic64_t *v)
-{
-	atomic64_sub(1, v);
-}
-#define atomic64_dec atomic64_dec
-#endif
-
-#define arch_atomic64_dec_return atomic64_dec_return
-#define arch_atomic64_dec_return_acquire atomic64_dec_return_acquire
-#define arch_atomic64_dec_return_release atomic64_dec_return_release
-#define arch_atomic64_dec_return_relaxed atomic64_dec_return_relaxed
-
-#ifndef atomic64_dec_return_relaxed
-#ifdef atomic64_dec_return
-#define atomic64_dec_return_acquire atomic64_dec_return
-#define atomic64_dec_return_release atomic64_dec_return
-#define atomic64_dec_return_relaxed atomic64_dec_return
-#endif /* atomic64_dec_return */
-
-#ifndef atomic64_dec_return
-static __always_inline s64
-atomic64_dec_return(atomic64_t *v)
-{
-	return atomic64_sub_return(1, v);
-}
-#define atomic64_dec_return atomic64_dec_return
-#endif
-
-#ifndef atomic64_dec_return_acquire
-static __always_inline s64
-atomic64_dec_return_acquire(atomic64_t *v)
-{
-	return atomic64_sub_return_acquire(1, v);
-}
-#define atomic64_dec_return_acquire atomic64_dec_return_acquire
-#endif
-
-#ifndef atomic64_dec_return_release
-static __always_inline s64
-atomic64_dec_return_release(atomic64_t *v)
-{
-	return atomic64_sub_return_release(1, v);
-}
-#define atomic64_dec_return_release atomic64_dec_return_release
-#endif
-
-#ifndef atomic64_dec_return_relaxed
-static __always_inline s64
-atomic64_dec_return_relaxed(atomic64_t *v)
-{
-	return atomic64_sub_return_relaxed(1, v);
-}
-#define atomic64_dec_return_relaxed atomic64_dec_return_relaxed
-#endif
-
-#else /* atomic64_dec_return_relaxed */
-
-#ifndef atomic64_dec_return_acquire
-static __always_inline s64
-atomic64_dec_return_acquire(atomic64_t *v)
-{
-	s64 ret = atomic64_dec_return_relaxed(v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_dec_return_acquire atomic64_dec_return_acquire
-#endif
-
-#ifndef atomic64_dec_return_release
-static __always_inline s64
-atomic64_dec_return_release(atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_dec_return_relaxed(v);
-}
-#define atomic64_dec_return_release atomic64_dec_return_release
-#endif
-
-#ifndef atomic64_dec_return
-static __always_inline s64
-atomic64_dec_return(atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_dec_return_relaxed(v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_dec_return atomic64_dec_return
-#endif
-
-#endif /* atomic64_dec_return_relaxed */
-
-#define arch_atomic64_fetch_dec atomic64_fetch_dec
-#define arch_atomic64_fetch_dec_acquire atomic64_fetch_dec_acquire
-#define arch_atomic64_fetch_dec_release atomic64_fetch_dec_release
-#define arch_atomic64_fetch_dec_relaxed atomic64_fetch_dec_relaxed
-
-#ifndef atomic64_fetch_dec_relaxed
-#ifdef atomic64_fetch_dec
-#define atomic64_fetch_dec_acquire atomic64_fetch_dec
-#define atomic64_fetch_dec_release atomic64_fetch_dec
-#define atomic64_fetch_dec_relaxed atomic64_fetch_dec
-#endif /* atomic64_fetch_dec */
-
-#ifndef atomic64_fetch_dec
-static __always_inline s64
-atomic64_fetch_dec(atomic64_t *v)
-{
-	return atomic64_fetch_sub(1, v);
-}
-#define atomic64_fetch_dec atomic64_fetch_dec
-#endif
-
-#ifndef atomic64_fetch_dec_acquire
-static __always_inline s64
-atomic64_fetch_dec_acquire(atomic64_t *v)
-{
-	return atomic64_fetch_sub_acquire(1, v);
-}
-#define atomic64_fetch_dec_acquire atomic64_fetch_dec_acquire
-#endif
-
-#ifndef atomic64_fetch_dec_release
-static __always_inline s64
-atomic64_fetch_dec_release(atomic64_t *v)
-{
-	return atomic64_fetch_sub_release(1, v);
-}
-#define atomic64_fetch_dec_release atomic64_fetch_dec_release
-#endif
-
-#ifndef atomic64_fetch_dec_relaxed
-static __always_inline s64
-atomic64_fetch_dec_relaxed(atomic64_t *v)
-{
-	return atomic64_fetch_sub_relaxed(1, v);
-}
-#define atomic64_fetch_dec_relaxed atomic64_fetch_dec_relaxed
-#endif
-
-#else /* atomic64_fetch_dec_relaxed */
-
-#ifndef atomic64_fetch_dec_acquire
-static __always_inline s64
-atomic64_fetch_dec_acquire(atomic64_t *v)
-{
-	s64 ret = atomic64_fetch_dec_relaxed(v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_fetch_dec_acquire atomic64_fetch_dec_acquire
-#endif
-
-#ifndef atomic64_fetch_dec_release
-static __always_inline s64
-atomic64_fetch_dec_release(atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_fetch_dec_relaxed(v);
-}
-#define atomic64_fetch_dec_release atomic64_fetch_dec_release
-#endif
-
-#ifndef atomic64_fetch_dec
-static __always_inline s64
-atomic64_fetch_dec(atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_fetch_dec_relaxed(v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_fetch_dec atomic64_fetch_dec
-#endif
-
-#endif /* atomic64_fetch_dec_relaxed */
-
-#define arch_atomic64_and atomic64_and
-
-#define arch_atomic64_fetch_and atomic64_fetch_and
-#define arch_atomic64_fetch_and_acquire atomic64_fetch_and_acquire
-#define arch_atomic64_fetch_and_release atomic64_fetch_and_release
-#define arch_atomic64_fetch_and_relaxed atomic64_fetch_and_relaxed
-
-#ifndef atomic64_fetch_and_relaxed
-#define atomic64_fetch_and_acquire atomic64_fetch_and
-#define atomic64_fetch_and_release atomic64_fetch_and
-#define atomic64_fetch_and_relaxed atomic64_fetch_and
-#else /* atomic64_fetch_and_relaxed */
-
-#ifndef atomic64_fetch_and_acquire
-static __always_inline s64
-atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
-{
-	s64 ret = atomic64_fetch_and_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_fetch_and_acquire atomic64_fetch_and_acquire
-#endif
-
-#ifndef atomic64_fetch_and_release
-static __always_inline s64
-atomic64_fetch_and_release(s64 i, atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_fetch_and_relaxed(i, v);
-}
-#define atomic64_fetch_and_release atomic64_fetch_and_release
-#endif
-
-#ifndef atomic64_fetch_and
-static __always_inline s64
-atomic64_fetch_and(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_fetch_and_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_fetch_and atomic64_fetch_and
-#endif
-
-#endif /* atomic64_fetch_and_relaxed */
-
-#define arch_atomic64_andnot atomic64_andnot
-
-#ifndef atomic64_andnot
-static __always_inline void
-atomic64_andnot(s64 i, atomic64_t *v)
-{
-	atomic64_and(~i, v);
-}
-#define atomic64_andnot atomic64_andnot
-#endif
-
-#define arch_atomic64_fetch_andnot atomic64_fetch_andnot
-#define arch_atomic64_fetch_andnot_acquire atomic64_fetch_andnot_acquire
-#define arch_atomic64_fetch_andnot_release atomic64_fetch_andnot_release
-#define arch_atomic64_fetch_andnot_relaxed atomic64_fetch_andnot_relaxed
-
-#ifndef atomic64_fetch_andnot_relaxed
-#ifdef atomic64_fetch_andnot
-#define atomic64_fetch_andnot_acquire atomic64_fetch_andnot
-#define atomic64_fetch_andnot_release atomic64_fetch_andnot
-#define atomic64_fetch_andnot_relaxed atomic64_fetch_andnot
-#endif /* atomic64_fetch_andnot */
-
-#ifndef atomic64_fetch_andnot
-static __always_inline s64
-atomic64_fetch_andnot(s64 i, atomic64_t *v)
-{
-	return atomic64_fetch_and(~i, v);
-}
-#define atomic64_fetch_andnot atomic64_fetch_andnot
-#endif
-
-#ifndef atomic64_fetch_andnot_acquire
-static __always_inline s64
-atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
-{
-	return atomic64_fetch_and_acquire(~i, v);
-}
-#define atomic64_fetch_andnot_acquire atomic64_fetch_andnot_acquire
-#endif
-
-#ifndef atomic64_fetch_andnot_release
-static __always_inline s64
-atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
-{
-	return atomic64_fetch_and_release(~i, v);
-}
-#define atomic64_fetch_andnot_release atomic64_fetch_andnot_release
-#endif
-
-#ifndef atomic64_fetch_andnot_relaxed
-static __always_inline s64
-atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
-{
-	return atomic64_fetch_and_relaxed(~i, v);
-}
-#define atomic64_fetch_andnot_relaxed atomic64_fetch_andnot_relaxed
-#endif
-
-#else /* atomic64_fetch_andnot_relaxed */
-
-#ifndef atomic64_fetch_andnot_acquire
-static __always_inline s64
-atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
-{
-	s64 ret = atomic64_fetch_andnot_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_fetch_andnot_acquire atomic64_fetch_andnot_acquire
-#endif
-
-#ifndef atomic64_fetch_andnot_release
-static __always_inline s64
-atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_fetch_andnot_relaxed(i, v);
-}
-#define atomic64_fetch_andnot_release atomic64_fetch_andnot_release
-#endif
-
-#ifndef atomic64_fetch_andnot
-static __always_inline s64
-atomic64_fetch_andnot(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_fetch_andnot_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_fetch_andnot atomic64_fetch_andnot
-#endif
-
-#endif /* atomic64_fetch_andnot_relaxed */
-
-#define arch_atomic64_or atomic64_or
-
-#define arch_atomic64_fetch_or atomic64_fetch_or
-#define arch_atomic64_fetch_or_acquire atomic64_fetch_or_acquire
-#define arch_atomic64_fetch_or_release atomic64_fetch_or_release
-#define arch_atomic64_fetch_or_relaxed atomic64_fetch_or_relaxed
-
-#ifndef atomic64_fetch_or_relaxed
-#define atomic64_fetch_or_acquire atomic64_fetch_or
-#define atomic64_fetch_or_release atomic64_fetch_or
-#define atomic64_fetch_or_relaxed atomic64_fetch_or
-#else /* atomic64_fetch_or_relaxed */
-
-#ifndef atomic64_fetch_or_acquire
-static __always_inline s64
-atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
-{
-	s64 ret = atomic64_fetch_or_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_fetch_or_acquire atomic64_fetch_or_acquire
-#endif
-
-#ifndef atomic64_fetch_or_release
-static __always_inline s64
-atomic64_fetch_or_release(s64 i, atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_fetch_or_relaxed(i, v);
-}
-#define atomic64_fetch_or_release atomic64_fetch_or_release
-#endif
-
-#ifndef atomic64_fetch_or
-static __always_inline s64
-atomic64_fetch_or(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_fetch_or_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_fetch_or atomic64_fetch_or
-#endif
-
-#endif /* atomic64_fetch_or_relaxed */
-
-#define arch_atomic64_xor atomic64_xor
-
-#define arch_atomic64_fetch_xor atomic64_fetch_xor
-#define arch_atomic64_fetch_xor_acquire atomic64_fetch_xor_acquire
-#define arch_atomic64_fetch_xor_release atomic64_fetch_xor_release
-#define arch_atomic64_fetch_xor_relaxed atomic64_fetch_xor_relaxed
-
-#ifndef atomic64_fetch_xor_relaxed
-#define atomic64_fetch_xor_acquire atomic64_fetch_xor
-#define atomic64_fetch_xor_release atomic64_fetch_xor
-#define atomic64_fetch_xor_relaxed atomic64_fetch_xor
-#else /* atomic64_fetch_xor_relaxed */
-
-#ifndef atomic64_fetch_xor_acquire
-static __always_inline s64
-atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
-{
-	s64 ret = atomic64_fetch_xor_relaxed(i, v);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_fetch_xor_acquire atomic64_fetch_xor_acquire
-#endif
-
-#ifndef atomic64_fetch_xor_release
-static __always_inline s64
-atomic64_fetch_xor_release(s64 i, atomic64_t *v)
-{
-	__atomic_release_fence();
-	return atomic64_fetch_xor_relaxed(i, v);
-}
-#define atomic64_fetch_xor_release atomic64_fetch_xor_release
-#endif
-
-#ifndef atomic64_fetch_xor
-static __always_inline s64
-atomic64_fetch_xor(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_fetch_xor_relaxed(i, v);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_fetch_xor atomic64_fetch_xor
-#endif
-
-#endif /* atomic64_fetch_xor_relaxed */
-
-#define arch_atomic64_xchg atomic64_xchg
-#define arch_atomic64_xchg_acquire atomic64_xchg_acquire
-#define arch_atomic64_xchg_release atomic64_xchg_release
-#define arch_atomic64_xchg_relaxed atomic64_xchg_relaxed
-
-#ifndef atomic64_xchg_relaxed
-#define atomic64_xchg_acquire atomic64_xchg
-#define atomic64_xchg_release atomic64_xchg
-#define atomic64_xchg_relaxed atomic64_xchg
-#else /* atomic64_xchg_relaxed */
-
-#ifndef atomic64_xchg_acquire
-static __always_inline s64
-atomic64_xchg_acquire(atomic64_t *v, s64 i)
-{
-	s64 ret = atomic64_xchg_relaxed(v, i);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_xchg_acquire atomic64_xchg_acquire
-#endif
-
-#ifndef atomic64_xchg_release
-static __always_inline s64
-atomic64_xchg_release(atomic64_t *v, s64 i)
-{
-	__atomic_release_fence();
-	return atomic64_xchg_relaxed(v, i);
-}
-#define atomic64_xchg_release atomic64_xchg_release
-#endif
-
-#ifndef atomic64_xchg
-static __always_inline s64
-atomic64_xchg(atomic64_t *v, s64 i)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_xchg_relaxed(v, i);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_xchg atomic64_xchg
-#endif
-
-#endif /* atomic64_xchg_relaxed */
-
-#define arch_atomic64_cmpxchg atomic64_cmpxchg
-#define arch_atomic64_cmpxchg_acquire atomic64_cmpxchg_acquire
-#define arch_atomic64_cmpxchg_release atomic64_cmpxchg_release
-#define arch_atomic64_cmpxchg_relaxed atomic64_cmpxchg_relaxed
-
-#ifndef atomic64_cmpxchg_relaxed
-#define atomic64_cmpxchg_acquire atomic64_cmpxchg
-#define atomic64_cmpxchg_release atomic64_cmpxchg
-#define atomic64_cmpxchg_relaxed atomic64_cmpxchg
-#else /* atomic64_cmpxchg_relaxed */
-
-#ifndef atomic64_cmpxchg_acquire
-static __always_inline s64
-atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
-{
-	s64 ret = atomic64_cmpxchg_relaxed(v, old, new);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_cmpxchg_acquire atomic64_cmpxchg_acquire
-#endif
-
-#ifndef atomic64_cmpxchg_release
-static __always_inline s64
-atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
-{
-	__atomic_release_fence();
-	return atomic64_cmpxchg_relaxed(v, old, new);
-}
-#define atomic64_cmpxchg_release atomic64_cmpxchg_release
-#endif
-
-#ifndef atomic64_cmpxchg
-static __always_inline s64
-atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
-{
-	s64 ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_cmpxchg_relaxed(v, old, new);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_cmpxchg atomic64_cmpxchg
-#endif
-
-#endif /* atomic64_cmpxchg_relaxed */
-
-#define arch_atomic64_try_cmpxchg atomic64_try_cmpxchg
-#define arch_atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg_acquire
-#define arch_atomic64_try_cmpxchg_release atomic64_try_cmpxchg_release
-#define arch_atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg_relaxed
-
-#ifndef atomic64_try_cmpxchg_relaxed
-#ifdef atomic64_try_cmpxchg
-#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg
-#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg
-#define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg
-#endif /* atomic64_try_cmpxchg */
-
-#ifndef atomic64_try_cmpxchg
-static __always_inline bool
-atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
-{
-	s64 r, o = *old;
-	r = atomic64_cmpxchg(v, o, new);
-	if (unlikely(r != o))
-		*old = r;
-	return likely(r == o);
-}
-#define atomic64_try_cmpxchg atomic64_try_cmpxchg
-#endif
-
-#ifndef atomic64_try_cmpxchg_acquire
-static __always_inline bool
-atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
-{
-	s64 r, o = *old;
-	r = atomic64_cmpxchg_acquire(v, o, new);
-	if (unlikely(r != o))
-		*old = r;
-	return likely(r == o);
-}
-#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg_acquire
-#endif
-
-#ifndef atomic64_try_cmpxchg_release
-static __always_inline bool
-atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
-{
-	s64 r, o = *old;
-	r = atomic64_cmpxchg_release(v, o, new);
-	if (unlikely(r != o))
-		*old = r;
-	return likely(r == o);
-}
-#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg_release
-#endif
-
-#ifndef atomic64_try_cmpxchg_relaxed
-static __always_inline bool
-atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
-{
-	s64 r, o = *old;
-	r = atomic64_cmpxchg_relaxed(v, o, new);
-	if (unlikely(r != o))
-		*old = r;
-	return likely(r == o);
-}
-#define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg_relaxed
-#endif
-
-#else /* atomic64_try_cmpxchg_relaxed */
-
-#ifndef atomic64_try_cmpxchg_acquire
-static __always_inline bool
-atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
-{
-	bool ret = atomic64_try_cmpxchg_relaxed(v, old, new);
-	__atomic_acquire_fence();
-	return ret;
-}
-#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg_acquire
-#endif
-
-#ifndef atomic64_try_cmpxchg_release
-static __always_inline bool
-atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
-{
-	__atomic_release_fence();
-	return atomic64_try_cmpxchg_relaxed(v, old, new);
-}
-#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg_release
-#endif
-
-#ifndef atomic64_try_cmpxchg
-static __always_inline bool
-atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
-{
-	bool ret;
-	__atomic_pre_full_fence();
-	ret = atomic64_try_cmpxchg_relaxed(v, old, new);
-	__atomic_post_full_fence();
-	return ret;
-}
-#define atomic64_try_cmpxchg atomic64_try_cmpxchg
-#endif
-
-#endif /* atomic64_try_cmpxchg_relaxed */
-
-#define arch_atomic64_sub_and_test atomic64_sub_and_test
-
-#ifndef atomic64_sub_and_test
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic64_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static __always_inline bool
-atomic64_sub_and_test(s64 i, atomic64_t *v)
-{
-	return atomic64_sub_return(i, v) == 0;
-}
-#define atomic64_sub_and_test atomic64_sub_and_test
-#endif
-
-#define arch_atomic64_dec_and_test atomic64_dec_and_test
-
-#ifndef atomic64_dec_and_test
-/**
- * atomic64_dec_and_test - decrement and test
- * @v: pointer of type atomic64_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static __always_inline bool
-atomic64_dec_and_test(atomic64_t *v)
-{
-	return atomic64_dec_return(v) == 0;
-}
-#define atomic64_dec_and_test atomic64_dec_and_test
-#endif
-
-#define arch_atomic64_inc_and_test atomic64_inc_and_test
-
-#ifndef atomic64_inc_and_test
-/**
- * atomic64_inc_and_test - increment and test
- * @v: pointer of type atomic64_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static __always_inline bool
-atomic64_inc_and_test(atomic64_t *v)
-{
-	return atomic64_inc_return(v) == 0;
-}
-#define atomic64_inc_and_test atomic64_inc_and_test
-#endif
-
-#define arch_atomic64_add_negative atomic64_add_negative
-
-#ifndef atomic64_add_negative
-/**
- * atomic64_add_negative - add and test if negative
- * @i: integer value to add
- * @v: pointer of type atomic64_t
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static __always_inline bool
-atomic64_add_negative(s64 i, atomic64_t *v)
-{
-	return atomic64_add_return(i, v) < 0;
-}
-#define atomic64_add_negative atomic64_add_negative
-#endif
-
-#define arch_atomic64_fetch_add_unless atomic64_fetch_add_unless
-
-#ifndef atomic64_fetch_add_unless
-/**
- * atomic64_fetch_add_unless - add unless the number is already a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns original value of @v
- */
-static __always_inline s64
-atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
-{
-	s64 c = atomic64_read(v);
-
-	do {
-		if (unlikely(c == u))
-			break;
-	} while (!atomic64_try_cmpxchg(v, &c, c + a));
-
-	return c;
-}
-#define atomic64_fetch_add_unless atomic64_fetch_add_unless
-#endif
-
-#define arch_atomic64_add_unless atomic64_add_unless
-
-#ifndef atomic64_add_unless
-/**
- * atomic64_add_unless - add unless the number is already a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, if @v was not already @u.
- * Returns true if the addition was done.
- */
-static __always_inline bool
-atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
-{
-	return atomic64_fetch_add_unless(v, a, u) != u;
-}
-#define atomic64_add_unless atomic64_add_unless
-#endif
-
-#define arch_atomic64_inc_not_zero atomic64_inc_not_zero
-
-#ifndef atomic64_inc_not_zero
-/**
- * atomic64_inc_not_zero - increment unless the number is zero
- * @v: pointer of type atomic64_t
- *
- * Atomically increments @v by 1, if @v is non-zero.
- * Returns true if the increment was done.
- */
-static __always_inline bool
-atomic64_inc_not_zero(atomic64_t *v)
-{
-	return atomic64_add_unless(v, 1, 0);
-}
-#define atomic64_inc_not_zero atomic64_inc_not_zero
-#endif
-
-#define arch_atomic64_inc_unless_negative atomic64_inc_unless_negative
-
-#ifndef atomic64_inc_unless_negative
-static __always_inline bool
-atomic64_inc_unless_negative(atomic64_t *v)
-{
-	s64 c = atomic64_read(v);
-
-	do {
-		if (unlikely(c < 0))
-			return false;
-	} while (!atomic64_try_cmpxchg(v, &c, c + 1));
-
-	return true;
-}
-#define atomic64_inc_unless_negative atomic64_inc_unless_negative
-#endif
-
-#define arch_atomic64_dec_unless_positive atomic64_dec_unless_positive
-
-#ifndef atomic64_dec_unless_positive
-static __always_inline bool
-atomic64_dec_unless_positive(atomic64_t *v)
-{
-	s64 c = atomic64_read(v);
-
-	do {
-		if (unlikely(c > 0))
-			return false;
-	} while (!atomic64_try_cmpxchg(v, &c, c - 1));
-
-	return true;
-}
-#define atomic64_dec_unless_positive atomic64_dec_unless_positive
-#endif
-
-#define arch_atomic64_dec_if_positive atomic64_dec_if_positive
-
-#ifndef atomic64_dec_if_positive
-static __always_inline s64
-atomic64_dec_if_positive(atomic64_t *v)
-{
-	s64 dec, c = atomic64_read(v);
-
-	do {
-		dec = c - 1;
-		if (unlikely(dec < 0))
-			break;
-	} while (!atomic64_try_cmpxchg(v, &c, dec));
-
-	return dec;
-}
-#define atomic64_dec_if_positive atomic64_dec_if_positive
-#endif
-
-#endif /* _LINUX_ATOMIC_FALLBACK_H */
-// d78e6c293c661c15188f0ec05bce45188c8d5892
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 4f8d83f9e480..ed1d3ffd5b9d 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -77,12 +77,8 @@
 	__ret;								\
 })
 
-#ifdef CONFIG_ARCH_ATOMIC
 #include <linux/atomic-arch-fallback.h>
 #include <asm-generic/atomic-instrumented.h>
-#else
-#include <linux/atomic-fallback.h>
-#endif
 
 #include <asm-generic/atomic-long.h>
 
diff --git a/scripts/atomic/check-atomics.sh b/scripts/atomic/check-atomics.sh
index 82748d42ecc5..9c7fbd4bcbce 100755
--- a/scripts/atomic/check-atomics.sh
+++ b/scripts/atomic/check-atomics.sh
@@ -17,7 +17,6 @@ cat <<EOF |
 asm-generic/atomic-instrumented.h
 asm-generic/atomic-long.h
 linux/atomic-arch-fallback.h
-linux/atomic-fallback.h
 EOF
 while read header; do
 	OLDSUM="$(tail -n 1 ${LINUXDIR}/include/${header})"
diff --git a/scripts/atomic/gen-atomics.sh b/scripts/atomic/gen-atomics.sh
index d29e159ef489..f776a574224d 100755
--- a/scripts/atomic/gen-atomics.sh
+++ b/scripts/atomic/gen-atomics.sh
@@ -11,7 +11,6 @@ cat <<EOF |
 gen-atomic-instrumented.sh      asm-generic/atomic-instrumented.h
 gen-atomic-long.sh              asm-generic/atomic-long.h
 gen-atomic-fallback.sh          linux/atomic-arch-fallback.h		arch_
-gen-atomic-fallback.sh          linux/atomic-fallback.h
 EOF
 while read script header args; do
 	/bin/sh ${ATOMICDIR}/${script} ${ATOMICTBL} ${args} > ${LINUXDIR}/include/${header}
-- 
cgit v1.2.3


From 10e96f8b4e7521197a50b370ce0923ab6a8d0ca0 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 26 May 2021 11:32:39 +0200
Subject: mtd: rawnand: Move struct gpio_desc declaration to the top

The struct gpio_desc is declared in the middle of the rawnand.h header,
right before the first function using it (nand_gpio_waitrdy). Before
adding a new function and to make it clear: move the declaration to the
top of the file.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210526093242.183847-2-miquel.raynal@bootlin.com
---
 include/linux/mtd/rawnand.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 89b9c52c7387..d41d39360fff 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -24,6 +24,7 @@
 #include <linux/types.h>
 
 struct nand_chip;
+struct gpio_desc;
 
 /* The maximum number of NAND chips in an array */
 #define NAND_MAX_CHIPS		8
@@ -1562,7 +1563,6 @@ void nand_cleanup(struct nand_chip *chip);
  * instruction and have no physical pin to check it.
  */
 int nand_soft_waitrdy(struct nand_chip *chip, unsigned long timeout_ms);
-struct gpio_desc;
 int nand_gpio_waitrdy(struct nand_chip *chip, struct gpio_desc *gpiod,
 		      unsigned long timeout_ms);
 
-- 
cgit v1.2.3


From b85c943d181ac58e3a34a5f79c73d421f4da7b00 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 26 May 2021 11:32:40 +0200
Subject: mtd: rawnand: Add a helper to parse the gpio-cs DT property

New chips may feature a lot of CS because of their extended length. As
many controllers have been designed a decade ago, they usually only
feature just a couple. This does not mean that the entire range of
these chips cannot be accessed: it is just a matter of adding more
GPIO CS in the hardware design. A DT property has been added to
describe the CS array: cs-gpios.

Here is the code parsing it this new property, allocating what needs to
be, requesting the GPIOs and returning an array with the additional
available CS. The first entries of this array are left empty and are
reserved for native CS.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210526093242.183847-3-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/raw/nand_base.c | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/rawnand.h      |  4 ++++
 2 files changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 22974a53077f..57a583149cc0 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -42,6 +42,7 @@
 #include <linux/io.h>
 #include <linux/mtd/partitions.h>
 #include <linux/of.h>
+#include <linux/of_gpio.h>
 #include <linux/gpio/consumer.h>
 
 #include "internals.h"
@@ -5249,6 +5250,44 @@ static int of_get_nand_secure_regions(struct nand_chip *chip)
 	return 0;
 }
 
+/**
+ * rawnand_dt_parse_gpio_cs - Parse the gpio-cs property of a controller
+ * @dev: Device that will be parsed. Also used for managed allocations.
+ * @cs_array: Array of GPIO desc pointers allocated on success
+ * @ncs_array: Number of entries in @cs_array updated on success.
+ * @return 0 on success, an error otherwise.
+ */
+int rawnand_dt_parse_gpio_cs(struct device *dev, struct gpio_desc ***cs_array,
+			     unsigned int *ncs_array)
+{
+	struct device_node *np = dev->of_node;
+	struct gpio_desc **descs;
+	int ndescs, i;
+
+	ndescs = of_gpio_named_count(np, "cs-gpios");
+	if (ndescs < 0) {
+		dev_dbg(dev, "No valid cs-gpios property\n");
+		return 0;
+	}
+
+	descs = devm_kcalloc(dev, ndescs, sizeof(*descs), GFP_KERNEL);
+	if (!descs)
+		return -ENOMEM;
+
+	for (i = 0; i < ndescs; i++) {
+		descs[i] = gpiod_get_index_optional(dev, "cs", i,
+						    GPIOD_OUT_HIGH);
+		if (IS_ERR(descs[i]))
+			return PTR_ERR(descs[i]);
+	}
+
+	*ncs_array = ndescs;
+	*cs_array = descs;
+
+	return 0;
+}
+EXPORT_SYMBOL(rawnand_dt_parse_gpio_cs);
+
 static int rawnand_dt_init(struct nand_chip *chip)
 {
 	struct nand_device *nand = mtd_to_nanddev(nand_to_mtd(chip));
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index d41d39360fff..b2f9dd3cbd69 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1595,4 +1595,8 @@ static inline void *nand_get_data_buf(struct nand_chip *chip)
 	return chip->data_buf;
 }
 
+/* Parse the gpio-cs property */
+int rawnand_dt_parse_gpio_cs(struct device *dev, struct gpio_desc ***cs_array,
+			     unsigned int *ncs_array);
+
 #endif /* __LINUX_MTD_RAWNAND_H */
-- 
cgit v1.2.3


From 3fdc0cb59d97f87e2cc708d424f1538e31744286 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Tue, 18 May 2021 17:36:18 +0100
Subject: arm64: smccc: Add support for SMCCCv1.2 extended input/output
 registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SMCCC v1.2 allows x8-x17 to be used as parameter registers and x4—x17
to be used as result registers in SMC64/HVC64. Arm Firmware Framework
for Armv8-A specification makes use of x0-x7 as parameter and result
registers. There are other users like Hyper-V who intend to use beyond
x0-x7 as well.

Current SMCCC interface in the kernel just use x0-x7 as parameter and
x0-x3 as result registers as required by SMCCCv1.0. Let us add new
interface to support this extended set of input/output registers namely
x0-x17 as both parameter and result registers.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20210518163618.43950-1-sudeep.holla@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/asm-offsets.c |  9 +++++++
 arch/arm64/kernel/smccc-call.S  | 57 +++++++++++++++++++++++++++++++++++++++++
 include/linux/arm-smccc.h       | 55 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 0cb34ccb6e73..74321bc9a459 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -138,6 +138,15 @@ int main(void)
   DEFINE(ARM_SMCCC_RES_X2_OFFS,		offsetof(struct arm_smccc_res, a2));
   DEFINE(ARM_SMCCC_QUIRK_ID_OFFS,	offsetof(struct arm_smccc_quirk, id));
   DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS,	offsetof(struct arm_smccc_quirk, state));
+  DEFINE(ARM_SMCCC_1_2_REGS_X0_OFFS,	offsetof(struct arm_smccc_1_2_regs, a0));
+  DEFINE(ARM_SMCCC_1_2_REGS_X2_OFFS,	offsetof(struct arm_smccc_1_2_regs, a2));
+  DEFINE(ARM_SMCCC_1_2_REGS_X4_OFFS,	offsetof(struct arm_smccc_1_2_regs, a4));
+  DEFINE(ARM_SMCCC_1_2_REGS_X6_OFFS,	offsetof(struct arm_smccc_1_2_regs, a6));
+  DEFINE(ARM_SMCCC_1_2_REGS_X8_OFFS,	offsetof(struct arm_smccc_1_2_regs, a8));
+  DEFINE(ARM_SMCCC_1_2_REGS_X10_OFFS,	offsetof(struct arm_smccc_1_2_regs, a10));
+  DEFINE(ARM_SMCCC_1_2_REGS_X12_OFFS,	offsetof(struct arm_smccc_1_2_regs, a12));
+  DEFINE(ARM_SMCCC_1_2_REGS_X14_OFFS,	offsetof(struct arm_smccc_1_2_regs, a14));
+  DEFINE(ARM_SMCCC_1_2_REGS_X16_OFFS,	offsetof(struct arm_smccc_1_2_regs, a16));
   BLANK();
   DEFINE(HIBERN_PBE_ORIG,	offsetof(struct pbe, orig_address));
   DEFINE(HIBERN_PBE_ADDR,	offsetof(struct pbe, address));
diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
index d62447964ed9..2def9d0dd3dd 100644
--- a/arch/arm64/kernel/smccc-call.S
+++ b/arch/arm64/kernel/smccc-call.S
@@ -43,3 +43,60 @@ SYM_FUNC_START(__arm_smccc_hvc)
 	SMCCC	hvc
 SYM_FUNC_END(__arm_smccc_hvc)
 EXPORT_SYMBOL(__arm_smccc_hvc)
+
+	.macro SMCCC_1_2 instr
+	/* Save `res` and free a GPR that won't be clobbered */
+	stp     x1, x19, [sp, #-16]!
+
+	/* Ensure `args` won't be clobbered while loading regs in next step */
+	mov	x19, x0
+
+	/* Load the registers x0 - x17 from the struct arm_smccc_1_2_regs */
+	ldp	x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS]
+	ldp	x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS]
+	ldp	x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS]
+	ldp	x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS]
+	ldp	x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS]
+	ldp	x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS]
+	ldp	x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS]
+	ldp	x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS]
+	ldp	x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS]
+
+	\instr #0
+
+	/* Load the `res` from the stack */
+	ldr	x19, [sp]
+
+	/* Store the registers x0 - x17 into the result structure */
+	stp	x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS]
+	stp	x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS]
+	stp	x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS]
+	stp	x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS]
+	stp	x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS]
+	stp	x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS]
+	stp	x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS]
+	stp	x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS]
+	stp	x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS]
+
+	/* Restore original x19 */
+	ldp     xzr, x19, [sp], #16
+	ret
+.endm
+
+/*
+ * void arm_smccc_1_2_hvc(const struct arm_smccc_1_2_regs *args,
+ *			  struct arm_smccc_1_2_regs *res);
+ */
+SYM_FUNC_START(arm_smccc_1_2_hvc)
+	SMCCC_1_2 hvc
+SYM_FUNC_END(arm_smccc_1_2_hvc)
+EXPORT_SYMBOL(arm_smccc_1_2_hvc)
+
+/*
+ * void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args,
+ *			  struct arm_smccc_1_2_regs *res);
+ */
+SYM_FUNC_START(arm_smccc_1_2_smc)
+	SMCCC_1_2 smc
+SYM_FUNC_END(arm_smccc_1_2_smc)
+EXPORT_SYMBOL(arm_smccc_1_2_smc)
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 6861489a1890..5cef2b8b0479 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -227,6 +227,61 @@ struct arm_smccc_res {
 	unsigned long a3;
 };
 
+#ifdef CONFIG_ARM64
+/**
+ * struct arm_smccc_1_2_regs - Arguments for or Results from SMC/HVC call
+ * @a0-a17 argument values from registers 0 to 17
+ */
+struct arm_smccc_1_2_regs {
+	unsigned long a0;
+	unsigned long a1;
+	unsigned long a2;
+	unsigned long a3;
+	unsigned long a4;
+	unsigned long a5;
+	unsigned long a6;
+	unsigned long a7;
+	unsigned long a8;
+	unsigned long a9;
+	unsigned long a10;
+	unsigned long a11;
+	unsigned long a12;
+	unsigned long a13;
+	unsigned long a14;
+	unsigned long a15;
+	unsigned long a16;
+	unsigned long a17;
+};
+
+/**
+ * arm_smccc_1_2_hvc() - make HVC calls
+ * @args: arguments passed via struct arm_smccc_1_2_regs
+ * @res: result values via struct arm_smccc_1_2_regs
+ *
+ * This function is used to make HVC calls following SMC Calling Convention
+ * v1.2 or above. The content of the supplied param are copied from the
+ * structure to registers prior to the HVC instruction. The return values
+ * are updated with the content from registers on return from the HVC
+ * instruction.
+ */
+asmlinkage void arm_smccc_1_2_hvc(const struct arm_smccc_1_2_regs *args,
+				  struct arm_smccc_1_2_regs *res);
+
+/**
+ * arm_smccc_1_2_smc() - make SMC calls
+ * @args: arguments passed via struct arm_smccc_1_2_regs
+ * @res: result values via struct arm_smccc_1_2_regs
+ *
+ * This function is used to make SMC calls following SMC Calling Convention
+ * v1.2 or above. The content of the supplied param are copied from the
+ * structure to registers prior to the SMC instruction. The return values
+ * are updated with the content from registers on return from the SMC
+ * instruction.
+ */
+asmlinkage void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args,
+				  struct arm_smccc_1_2_regs *res);
+#endif
+
 /**
  * struct arm_smccc_quirk - Contains quirk information
  * @id: quirk identification
-- 
cgit v1.2.3


From 62f3415db237b8d2aa9a804ff84ce2efa87df179 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 26 May 2021 11:46:17 -0700
Subject: net: phy: Document phydev::dev_flags bits allocation

Document the phydev::dev_flags bit allocation to allow bits 15:0 to
define PHY driver specific behavior, bits 23:16 to be reserved for now,
and bits 31:24 to hold generic PHY driver flags.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/20210526184617.3105012-1-f.fainelli@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 60d2b26026a2..852743f07e3e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -496,6 +496,11 @@ struct macsec_ops;
  * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY
  * @state: State of the PHY for management purposes
  * @dev_flags: Device-specific flags used by the PHY driver.
+ *		Bits [15:0] are free to use by the PHY driver to communicate
+ *			    driver specific behavior.
+ *		Bits [23:16] are currently reserved for future use.
+ *		Bits [31:24] are reserved for defining generic
+ *			     PHY driver behavior.
  * @irq: IRQ number of the PHY's interrupt (-1 if none)
  * @phy_timer: The timer for handling the state machine
  * @phylink: Pointer to phylink instance for this PHY
-- 
cgit v1.2.3


From e781858488b918e30a6ff28e9eab6058b787e3b3 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Fri, 21 May 2021 16:10:29 +0100
Subject: firmware: arm_ffa: Add initial FFA bus support for device enumeration

The Arm FF for Armv8-A specification has concept of endpoints or
partitions. In the Normal world, a partition could be a VM when
the Virtualization extension is enabled or the kernel itself.

In order to handle multiple partitions, we can create a FFA device for
each such partition on a dedicated FFA bus. Similarly, different drivers
requiring FFA transport can be registered on the same bus. We can match
the device and drivers using UUID. This is mostly for the in-kernel
users with FFA drivers.

Link: https://lore.kernel.org/r/20210521151033.181846-2-sudeep.holla@arm.com
Tested-by: Jens Wiklander <jens.wiklander@linaro.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 MAINTAINERS                       |   7 ++
 drivers/firmware/Kconfig          |   1 +
 drivers/firmware/Makefile         |   1 +
 drivers/firmware/arm_ffa/Kconfig  |  16 +++
 drivers/firmware/arm_ffa/Makefile |   4 +
 drivers/firmware/arm_ffa/bus.c    | 207 ++++++++++++++++++++++++++++++++++++++
 include/linux/arm_ffa.h           |  91 +++++++++++++++++
 7 files changed, 327 insertions(+)
 create mode 100644 drivers/firmware/arm_ffa/Kconfig
 create mode 100644 drivers/firmware/arm_ffa/Makefile
 create mode 100644 drivers/firmware/arm_ffa/bus.c
 create mode 100644 include/linux/arm_ffa.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index bd7aff0c120f..3b8ab94cf697 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7069,6 +7069,13 @@ F:	include/linux/firewire.h
 F:	include/uapi/linux/firewire*.h
 F:	tools/firewire/
 
+FIRMWARE FRAMEWORK FOR ARMV8-A
+M:	Sudeep Holla <sudeep.holla@arm.com>
+L:	linux-arm-kernel@lists.infradead.org
+S:	Maintained
+F:	drivers/firmware/arm_ffa/
+F:	include/linux/arm_ffa.h
+
 FIRMWARE LOADER (request_firmware)
 M:	Luis Chamberlain <mcgrof@kernel.org>
 L:	linux-kernel@vger.kernel.org
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index db0ea2d2d75a..b53ac9fc7704 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -296,6 +296,7 @@ config TURRIS_MOX_RWTM
 	  other manufacturing data and also utilize the Entropy Bit Generator
 	  for hardware random number generation.
 
+source "drivers/firmware/arm_ffa/Kconfig"
 source "drivers/firmware/broadcom/Kconfig"
 source "drivers/firmware/google/Kconfig"
 source "drivers/firmware/efi/Kconfig"
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 5e013b6a3692..546ac8e7f6d0 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_TI_SCI_PROTOCOL)	+= ti_sci.o
 obj-$(CONFIG_TRUSTED_FOUNDATIONS) += trusted_foundations.o
 obj-$(CONFIG_TURRIS_MOX_RWTM)	+= turris-mox-rwtm.o
 
+obj-y				+= arm_ffa/
 obj-y				+= arm_scmi/
 obj-y				+= broadcom/
 obj-y				+= meson/
diff --git a/drivers/firmware/arm_ffa/Kconfig b/drivers/firmware/arm_ffa/Kconfig
new file mode 100644
index 000000000000..261a3660650a
--- /dev/null
+++ b/drivers/firmware/arm_ffa/Kconfig
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config ARM_FFA_TRANSPORT
+	tristate "Arm Firmware Framework for Armv8-A"
+	depends on OF
+	depends on ARM64
+	default n
+	help
+	  This Firmware Framework(FF) for Arm A-profile processors describes
+	  interfaces that standardize communication between the various
+	  software images which includes communication between images in
+	  the Secure world and Normal world. It also leverages the
+	  virtualization extension to isolate software images provided
+	  by an ecosystem of vendors from each other.
+
+	  This driver provides interface for all the client drivers making
+	  use of the features offered by ARM FF-A.
diff --git a/drivers/firmware/arm_ffa/Makefile b/drivers/firmware/arm_ffa/Makefile
new file mode 100644
index 000000000000..bfe4323a8784
--- /dev/null
+++ b/drivers/firmware/arm_ffa/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+ffa-bus-y = bus.o
+ffa-module-objs := $(ffa-bus-y)
+obj-$(CONFIG_ARM_FFA_TRANSPORT) = ffa-module.o
diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c
new file mode 100644
index 000000000000..a17874bed946
--- /dev/null
+++ b/drivers/firmware/arm_ffa/bus.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 ARM Ltd.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/arm_ffa.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+static int ffa_device_match(struct device *dev, struct device_driver *drv)
+{
+	const struct ffa_device_id *id_table;
+	struct ffa_device *ffa_dev;
+
+	id_table = to_ffa_driver(drv)->id_table;
+	ffa_dev = to_ffa_dev(dev);
+
+	while (!uuid_is_null(&id_table->uuid)) {
+		if (uuid_equal(&ffa_dev->uuid, &id_table->uuid))
+			return 1;
+		id_table++;
+	}
+
+	return 0;
+}
+
+static int ffa_device_probe(struct device *dev)
+{
+	struct ffa_driver *ffa_drv = to_ffa_driver(dev->driver);
+	struct ffa_device *ffa_dev = to_ffa_dev(dev);
+
+	if (!ffa_device_match(dev, dev->driver))
+		return -ENODEV;
+
+	return ffa_drv->probe(ffa_dev);
+}
+
+static int ffa_device_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct ffa_device *ffa_dev = to_ffa_dev(dev);
+
+	return add_uevent_var(env, "MODALIAS=arm_ffa:%04x:%pUb",
+			      ffa_dev->vm_id, &ffa_dev->uuid);
+}
+
+static ssize_t partition_id_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct ffa_device *ffa_dev = to_ffa_dev(dev);
+
+	return sprintf(buf, "0x%04x\n", ffa_dev->vm_id);
+}
+static DEVICE_ATTR_RO(partition_id);
+
+static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct ffa_device *ffa_dev = to_ffa_dev(dev);
+
+	return sprintf(buf, "%pUb\n", &ffa_dev->uuid);
+}
+static DEVICE_ATTR_RO(uuid);
+
+static struct attribute *ffa_device_attributes_attrs[] = {
+	&dev_attr_partition_id.attr,
+	&dev_attr_uuid.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(ffa_device_attributes);
+
+struct bus_type ffa_bus_type = {
+	.name		= "arm_ffa",
+	.match		= ffa_device_match,
+	.probe		= ffa_device_probe,
+	.uevent		= ffa_device_uevent,
+	.dev_groups	= ffa_device_attributes_groups,
+};
+EXPORT_SYMBOL_GPL(ffa_bus_type);
+
+int ffa_driver_register(struct ffa_driver *driver, struct module *owner,
+			const char *mod_name)
+{
+	int ret;
+
+	driver->driver.bus = &ffa_bus_type;
+	driver->driver.name = driver->name;
+	driver->driver.owner = owner;
+	driver->driver.mod_name = mod_name;
+
+	ret = driver_register(&driver->driver);
+	if (!ret)
+		pr_debug("registered new ffa driver %s\n", driver->name);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ffa_driver_register);
+
+void ffa_driver_unregister(struct ffa_driver *driver)
+{
+	driver_unregister(&driver->driver);
+}
+EXPORT_SYMBOL_GPL(ffa_driver_unregister);
+
+static void ffa_release_device(struct device *dev)
+{
+	struct ffa_device *ffa_dev = to_ffa_dev(dev);
+
+	kfree(ffa_dev);
+}
+
+static int __ffa_devices_unregister(struct device *dev, void *data)
+{
+	ffa_release_device(dev);
+
+	return 0;
+}
+
+static void ffa_devices_unregister(void)
+{
+	bus_for_each_dev(&ffa_bus_type, NULL, NULL,
+			 __ffa_devices_unregister);
+}
+
+bool ffa_device_is_valid(struct ffa_device *ffa_dev)
+{
+	bool valid = false;
+	struct device *dev = NULL;
+	struct ffa_device *tmp_dev;
+
+	do {
+		dev = bus_find_next_device(&ffa_bus_type, dev);
+		tmp_dev = to_ffa_dev(dev);
+		if (tmp_dev == ffa_dev) {
+			valid = true;
+			break;
+		}
+		put_device(dev);
+	} while (dev);
+
+	put_device(dev);
+
+	return valid;
+}
+
+struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id)
+{
+	int ret;
+	struct device *dev;
+	struct ffa_device *ffa_dev;
+
+	ffa_dev = kzalloc(sizeof(*ffa_dev), GFP_KERNEL);
+	if (!ffa_dev)
+		return NULL;
+
+	dev = &ffa_dev->dev;
+	dev->bus = &ffa_bus_type;
+	dev->release = ffa_release_device;
+	dev_set_name(&ffa_dev->dev, "arm-ffa-%04x", vm_id);
+
+	ffa_dev->vm_id = vm_id;
+	uuid_copy(&ffa_dev->uuid, uuid);
+
+	ret = device_register(&ffa_dev->dev);
+	if (ret) {
+		dev_err(dev, "unable to register device %s err=%d\n",
+			dev_name(dev), ret);
+		put_device(dev);
+		return NULL;
+	}
+
+	return ffa_dev;
+}
+EXPORT_SYMBOL_GPL(ffa_device_register);
+
+void ffa_device_unregister(struct ffa_device *ffa_dev)
+{
+	if (!ffa_dev)
+		return;
+
+	device_unregister(&ffa_dev->dev);
+}
+EXPORT_SYMBOL_GPL(ffa_device_unregister);
+
+static int __init arm_ffa_bus_init(void)
+{
+	return bus_register(&ffa_bus_type);
+}
+module_init(arm_ffa_bus_init);
+
+static void __exit arm_ffa_bus_exit(void)
+{
+	ffa_devices_unregister();
+	bus_unregister(&ffa_bus_type);
+}
+
+module_exit(arm_ffa_bus_exit);
+
+MODULE_ALIAS("arm-ffa-bus");
+MODULE_AUTHOR("Sudeep Holla <sudeep.holla@arm.com>");
+MODULE_DESCRIPTION("Arm FF-A bus driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
new file mode 100644
index 000000000000..331ff62c9873
--- /dev/null
+++ b/include/linux/arm_ffa.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 ARM Ltd.
+ */
+
+#ifndef _LINUX_ARM_FFA_H
+#define _LINUX_ARM_FFA_H
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/uuid.h>
+
+/* FFA Bus/Device/Driver related */
+struct ffa_device {
+	int vm_id;
+	uuid_t uuid;
+	struct device dev;
+};
+
+#define to_ffa_dev(d) container_of(d, struct ffa_device, dev)
+
+struct ffa_device_id {
+	uuid_t uuid;
+};
+
+struct ffa_driver {
+	const char *name;
+	int (*probe)(struct ffa_device *sdev);
+	void (*remove)(struct ffa_device *sdev);
+	const struct ffa_device_id *id_table;
+
+	struct device_driver driver;
+};
+
+#define to_ffa_driver(d) container_of(d, struct ffa_driver, driver)
+
+static inline void ffa_dev_set_drvdata(struct ffa_device *fdev, void *data)
+{
+	fdev->dev.driver_data = data;
+}
+
+#if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT)
+struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id);
+void ffa_device_unregister(struct ffa_device *ffa_dev);
+int ffa_driver_register(struct ffa_driver *driver, struct module *owner,
+			const char *mod_name);
+void ffa_driver_unregister(struct ffa_driver *driver);
+bool ffa_device_is_valid(struct ffa_device *ffa_dev);
+
+#else
+static inline
+struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id)
+{
+	return NULL;
+}
+
+static inline void ffa_device_unregister(struct ffa_device *dev) {}
+
+static inline int
+ffa_driver_register(struct ffa_driver *driver, struct module *owner,
+		    const char *mod_name)
+{
+	return -EINVAL;
+}
+
+static inline void ffa_driver_unregister(struct ffa_driver *driver) {}
+
+static inline
+bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; }
+
+#endif /* CONFIG_ARM_FFA_TRANSPORT */
+
+#define ffa_register(driver) \
+	ffa_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
+#define ffa_unregister(driver) \
+	ffa_driver_unregister(driver)
+
+/**
+ * module_ffa_driver() - Helper macro for registering a psa_ffa driver
+ * @__ffa_driver: ffa_driver structure
+ *
+ * Helper macro for psa_ffa drivers to set up proper module init / exit
+ * functions.  Replaces module_init() and module_exit() and keeps people from
+ * printing pointless things to the kernel log when their driver is loaded.
+ */
+#define module_ffa_driver(__ffa_driver)	\
+	module_driver(__ffa_driver, ffa_register, ffa_unregister)
+
+#endif /* _LINUX_ARM_FFA_H */
-- 
cgit v1.2.3


From d0c0bce831223b08e5bade2cefc93c3ddb790796 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Fri, 21 May 2021 16:10:32 +0100
Subject: firmware: arm_ffa: Setup in-kernel users of FFA partitions

Parse the FFA nodes from the device-tree and register all the partitions
whose services will be used in the kernel.

In order to also enable in-kernel users of FFA interface, let us add
simple set of operations for such devices.

The in-kernel users are registered without the character device interface.

Link: https://lore.kernel.org/r/20210521151033.181846-5-sudeep.holla@arm.com
Tested-by: Jens Wiklander <jens.wiklander@linaro.org>
Reviewed-by: Jens Wiklander <jens.wiklander@linaro.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/bus.c    |   9 ++
 drivers/firmware/arm_ffa/common.h |   3 +
 drivers/firmware/arm_ffa/driver.c | 221 ++++++++++++++++++++++++++++++++++++++
 include/linux/arm_ffa.h           |  39 ++++++-
 4 files changed, 271 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c
index ae1eada5dc0c..83166e02b191 100644
--- a/drivers/firmware/arm_ffa/bus.c
+++ b/drivers/firmware/arm_ffa/bus.c
@@ -24,6 +24,15 @@ static int ffa_device_match(struct device *dev, struct device_driver *drv)
 	ffa_dev = to_ffa_dev(dev);
 
 	while (!uuid_is_null(&id_table->uuid)) {
+		/*
+		 * FF-A v1.0 doesn't provide discovery of UUIDs, just the
+		 * partition IDs, so fetch the partitions IDs for this
+		 * id_table UUID and assign the UUID to the device if the
+		 * partition ID matches
+		 */
+		if (uuid_is_null(&ffa_dev->uuid))
+			ffa_device_match_uuid(ffa_dev, &id_table->uuid);
+
 		if (uuid_equal(&ffa_dev->uuid, &id_table->uuid))
 			return 1;
 		id_table++;
diff --git a/drivers/firmware/arm_ffa/common.h b/drivers/firmware/arm_ffa/common.h
index f24754a59f47..d6eccf1fd3f6 100644
--- a/drivers/firmware/arm_ffa/common.h
+++ b/drivers/firmware/arm_ffa/common.h
@@ -6,6 +6,7 @@
 #ifndef _FFA_COMMON_H
 #define _FFA_COMMON_H
 
+#include <linux/arm_ffa.h>
 #include <linux/arm-smccc.h>
 #include <linux/err.h>
 
@@ -15,6 +16,8 @@ typedef void (ffa_fn)(ffa_value_t, ffa_value_t *);
 
 int arm_ffa_bus_init(void);
 void arm_ffa_bus_exit(void);
+bool ffa_device_is_valid(struct ffa_device *ffa_dev);
+void ffa_device_match_uuid(struct ffa_device *ffa_dev, const uuid_t *uuid);
 
 #ifdef CONFIG_ARM_FFA_SMCCC
 int __init ffa_transport_init(ffa_fn **invoke_ffa_fn);
diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index 4846f079dca0..85b9bbdbeabe 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -24,9 +24,12 @@
 
 #include <linux/arm_ffa.h>
 #include <linux/bitfield.h>
+#include <linux/device.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/uuid.h>
 
 #include "common.h"
 
@@ -185,6 +188,22 @@ static int ffa_version_check(u32 *version)
 	return 0;
 }
 
+static int ffa_rx_release(void)
+{
+	ffa_value_t ret;
+
+	invoke_ffa_fn((ffa_value_t){
+		      .a0 = FFA_RX_RELEASE,
+		      }, &ret);
+
+	if (ret.a0 == FFA_ERROR)
+		return ffa_to_linux_errno((int)ret.a2);
+
+	/* check for ret.a0 == FFA_RX_RELEASE ? */
+
+	return 0;
+}
+
 static int ffa_rxtx_map(phys_addr_t tx_buf, phys_addr_t rx_buf, u32 pg_cnt)
 {
 	ffa_value_t ret;
@@ -214,6 +233,65 @@ static int ffa_rxtx_unmap(u16 vm_id)
 	return 0;
 }
 
+/* buffer must be sizeof(struct ffa_partition_info) * num_partitions */
+static int
+__ffa_partition_info_get(u32 uuid0, u32 uuid1, u32 uuid2, u32 uuid3,
+			 struct ffa_partition_info *buffer, int num_partitions)
+{
+	int count;
+	ffa_value_t partition_info;
+
+	mutex_lock(&drv_info->rx_lock);
+	invoke_ffa_fn((ffa_value_t){
+		      .a0 = FFA_PARTITION_INFO_GET,
+		      .a1 = uuid0, .a2 = uuid1, .a3 = uuid2, .a4 = uuid3,
+		      }, &partition_info);
+
+	if (partition_info.a0 == FFA_ERROR) {
+		mutex_unlock(&drv_info->rx_lock);
+		return ffa_to_linux_errno((int)partition_info.a2);
+	}
+
+	count = partition_info.a2;
+
+	if (buffer && count <= num_partitions)
+		memcpy(buffer, drv_info->rx_buffer, sizeof(*buffer) * count);
+
+	ffa_rx_release();
+
+	mutex_unlock(&drv_info->rx_lock);
+
+	return count;
+}
+
+/* buffer is allocated and caller must free the same if returned count > 0 */
+static int
+ffa_partition_probe(const uuid_t *uuid, struct ffa_partition_info **buffer)
+{
+	int count;
+	u32 uuid0_4[4];
+	struct ffa_partition_info *pbuf;
+
+	export_uuid((u8 *)uuid0_4, uuid);
+	count = __ffa_partition_info_get(uuid0_4[0], uuid0_4[1], uuid0_4[2],
+					 uuid0_4[3], NULL, 0);
+	if (count <= 0)
+		return count;
+
+	pbuf = kcalloc(count, sizeof(*pbuf), GFP_KERNEL);
+	if (!pbuf)
+		return -ENOMEM;
+
+	count = __ffa_partition_info_get(uuid0_4[0], uuid0_4[1], uuid0_4[2],
+					 uuid0_4[3], pbuf, count);
+	if (count <= 0)
+		kfree(pbuf);
+	else
+		*buffer = pbuf;
+
+	return count;
+}
+
 #define VM_ID_MASK	GENMASK(15, 0)
 static int ffa_id_get(u16 *vm_id)
 {
@@ -231,6 +309,147 @@ static int ffa_id_get(u16 *vm_id)
 	return 0;
 }
 
+static int ffa_msg_send_direct_req(u16 src_id, u16 dst_id, bool mode_32bit,
+				   struct ffa_send_direct_data *data)
+{
+	u32 req_id, resp_id, src_dst_ids = PACK_TARGET_INFO(src_id, dst_id);
+	ffa_value_t ret;
+
+	if (mode_32bit) {
+		req_id = FFA_MSG_SEND_DIRECT_REQ;
+		resp_id = FFA_MSG_SEND_DIRECT_RESP;
+	} else {
+		req_id = FFA_FN_NATIVE(MSG_SEND_DIRECT_REQ);
+		resp_id = FFA_FN_NATIVE(MSG_SEND_DIRECT_RESP);
+	}
+
+	invoke_ffa_fn((ffa_value_t){
+		      .a0 = req_id, .a1 = src_dst_ids, .a2 = 0,
+		      .a3 = data->data0, .a4 = data->data1, .a5 = data->data2,
+		      .a6 = data->data3, .a7 = data->data4,
+		      }, &ret);
+
+	while (ret.a0 == FFA_INTERRUPT)
+		invoke_ffa_fn((ffa_value_t){
+			      .a0 = FFA_RUN, .a1 = ret.a1,
+			      }, &ret);
+
+	if (ret.a0 == FFA_ERROR)
+		return ffa_to_linux_errno((int)ret.a2);
+
+	if (ret.a0 == resp_id) {
+		data->data0 = ret.a3;
+		data->data1 = ret.a4;
+		data->data2 = ret.a5;
+		data->data3 = ret.a6;
+		data->data4 = ret.a7;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static u32 ffa_api_version_get(void)
+{
+	return drv_info->version;
+}
+
+static int ffa_partition_info_get(const char *uuid_str,
+				  struct ffa_partition_info *buffer)
+{
+	int count;
+	uuid_t uuid;
+	struct ffa_partition_info *pbuf;
+
+	if (uuid_parse(uuid_str, &uuid)) {
+		pr_err("invalid uuid (%s)\n", uuid_str);
+		return -ENODEV;
+	}
+
+	count = ffa_partition_probe(&uuid_null, &pbuf);
+	if (count <= 0)
+		return -ENOENT;
+
+	memcpy(buffer, pbuf, sizeof(*pbuf) * count);
+	kfree(pbuf);
+	return 0;
+}
+
+static void ffa_mode_32bit_set(struct ffa_device *dev)
+{
+	dev->mode_32bit = true;
+}
+
+static int ffa_sync_send_receive(struct ffa_device *dev,
+				 struct ffa_send_direct_data *data)
+{
+	return ffa_msg_send_direct_req(drv_info->vm_id, dev->vm_id,
+				       dev->mode_32bit, data);
+}
+
+static const struct ffa_dev_ops ffa_ops = {
+	.api_version_get = ffa_api_version_get,
+	.partition_info_get = ffa_partition_info_get,
+	.mode_32bit_set = ffa_mode_32bit_set,
+	.sync_send_receive = ffa_sync_send_receive,
+};
+
+const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev)
+{
+	if (ffa_device_is_valid(dev))
+		return &ffa_ops;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ffa_dev_ops_get);
+
+void ffa_device_match_uuid(struct ffa_device *ffa_dev, const uuid_t *uuid)
+{
+	int count, idx;
+	struct ffa_partition_info *pbuf, *tpbuf;
+
+	count = ffa_partition_probe(uuid, &pbuf);
+	if (count <= 0)
+		return;
+
+	for (idx = 0, tpbuf = pbuf; idx < count; idx++, tpbuf++)
+		if (tpbuf->id == ffa_dev->vm_id)
+			uuid_copy(&ffa_dev->uuid, uuid);
+	kfree(pbuf);
+}
+
+static void ffa_setup_partitions(void)
+{
+	int count, idx;
+	struct ffa_device *ffa_dev;
+	struct ffa_partition_info *pbuf, *tpbuf;
+
+	count = ffa_partition_probe(&uuid_null, &pbuf);
+	if (count <= 0) {
+		pr_info("%s: No partitions found, error %d\n", __func__, count);
+		return;
+	}
+
+	for (idx = 0, tpbuf = pbuf; idx < count; idx++, tpbuf++) {
+		/* Note that the &uuid_null parameter will require
+		 * ffa_device_match() to find the UUID of this partition id
+		 * with help of ffa_device_match_uuid(). Once the FF-A spec
+		 * is updated to provide correct UUID here for each partition
+		 * as part of the discovery API, we need to pass the
+		 * discovered UUID here instead.
+		 */
+		ffa_dev = ffa_device_register(&uuid_null, tpbuf->id);
+		if (!ffa_dev) {
+			pr_err("%s: failed to register partition ID 0x%x\n",
+			       __func__, tpbuf->id);
+			continue;
+		}
+
+		ffa_dev_set_drvdata(ffa_dev, drv_info);
+	}
+	kfree(pbuf);
+}
+
 static int __init ffa_init(void)
 {
 	int ret;
@@ -282,6 +501,8 @@ static int __init ffa_init(void)
 	mutex_init(&drv_info->rx_lock);
 	mutex_init(&drv_info->tx_lock);
 
+	ffa_setup_partitions();
+
 	return 0;
 free_pages:
 	if (drv_info->tx_buffer)
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 331ff62c9873..d672673fc621 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -6,7 +6,6 @@
 #ifndef _LINUX_ARM_FFA_H
 #define _LINUX_ARM_FFA_H
 
-#include <linux/cdev.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -15,6 +14,7 @@
 /* FFA Bus/Device/Driver related */
 struct ffa_device {
 	int vm_id;
+	bool mode_32bit;
 	uuid_t uuid;
 	struct device dev;
 };
@@ -48,6 +48,7 @@ int ffa_driver_register(struct ffa_driver *driver, struct module *owner,
 			const char *mod_name);
 void ffa_driver_unregister(struct ffa_driver *driver);
 bool ffa_device_is_valid(struct ffa_device *ffa_dev);
+const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev);
 
 #else
 static inline
@@ -70,6 +71,11 @@ static inline void ffa_driver_unregister(struct ffa_driver *driver) {}
 static inline
 bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; }
 
+static inline
+const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev)
+{
+	return NULL;
+}
 #endif /* CONFIG_ARM_FFA_TRANSPORT */
 
 #define ffa_register(driver) \
@@ -88,4 +94,35 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; }
 #define module_ffa_driver(__ffa_driver)	\
 	module_driver(__ffa_driver, ffa_register, ffa_unregister)
 
+/* FFA transport related */
+struct ffa_partition_info {
+	u16 id;
+	u16 exec_ctxt;
+/* partition supports receipt of direct requests */
+#define FFA_PARTITION_DIRECT_RECV	BIT(0)
+/* partition can send direct requests. */
+#define FFA_PARTITION_DIRECT_SEND	BIT(1)
+/* partition can send and receive indirect messages. */
+#define FFA_PARTITION_INDIRECT_MSG	BIT(2)
+	u32 properties;
+};
+
+/* For use with FFA_MSG_SEND_DIRECT_{REQ,RESP} which pass data via registers */
+struct ffa_send_direct_data {
+	unsigned long data0; /* w3/x3 */
+	unsigned long data1; /* w4/x4 */
+	unsigned long data2; /* w5/x5 */
+	unsigned long data3; /* w6/x6 */
+	unsigned long data4; /* w7/x7 */
+};
+
+struct ffa_dev_ops {
+	u32 (*api_version_get)(void);
+	int (*partition_info_get)(const char *uuid_str,
+				  struct ffa_partition_info *buffer);
+	void (*mode_32bit_set)(struct ffa_device *dev);
+	int (*sync_send_receive)(struct ffa_device *dev,
+				 struct ffa_send_direct_data *data);
+};
+
 #endif /* _LINUX_ARM_FFA_H */
-- 
cgit v1.2.3


From cc2195fe536c28e192df5d07e6dd277af36814b4 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Fri, 21 May 2021 16:10:33 +0100
Subject: firmware: arm_ffa: Add support for MEM_* interfaces

Most of the MEM_* APIs share the same parameters, so they can be
generalised. Currently only MEM_SHARE is implemented and the user space
interface for that is not added yet.

Link: https://lore.kernel.org/r/20210521151033.181846-6-sudeep.holla@arm.com
Tested-by: Jens Wiklander <jens.wiklander@linaro.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c | 199 ++++++++++++++++++++++++++++++++++++++
 include/linux/arm_ffa.h           | 139 ++++++++++++++++++++++++++
 2 files changed, 338 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index 85b9bbdbeabe..b1edb4b2e94a 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -28,6 +28,8 @@
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
 #include <linux/slab.h>
 #include <linux/uuid.h>
 
@@ -349,6 +351,192 @@ static int ffa_msg_send_direct_req(u16 src_id, u16 dst_id, bool mode_32bit,
 	return -EINVAL;
 }
 
+static int ffa_mem_first_frag(u32 func_id, phys_addr_t buf, u32 buf_sz,
+			      u32 frag_len, u32 len, u64 *handle)
+{
+	ffa_value_t ret;
+
+	invoke_ffa_fn((ffa_value_t){
+		      .a0 = func_id, .a1 = len, .a2 = frag_len,
+		      .a3 = buf, .a4 = buf_sz,
+		      }, &ret);
+
+	while (ret.a0 == FFA_MEM_OP_PAUSE)
+		invoke_ffa_fn((ffa_value_t){
+			      .a0 = FFA_MEM_OP_RESUME,
+			      .a1 = ret.a1, .a2 = ret.a2,
+			      }, &ret);
+
+	if (ret.a0 == FFA_ERROR)
+		return ffa_to_linux_errno((int)ret.a2);
+
+	if (ret.a0 != FFA_SUCCESS)
+		return -EOPNOTSUPP;
+
+	if (handle)
+		*handle = PACK_HANDLE(ret.a2, ret.a3);
+
+	return frag_len;
+}
+
+static int ffa_mem_next_frag(u64 handle, u32 frag_len)
+{
+	ffa_value_t ret;
+
+	invoke_ffa_fn((ffa_value_t){
+		      .a0 = FFA_MEM_FRAG_TX,
+		      .a1 = HANDLE_LOW(handle), .a2 = HANDLE_HIGH(handle),
+		      .a3 = frag_len,
+		      }, &ret);
+
+	while (ret.a0 == FFA_MEM_OP_PAUSE)
+		invoke_ffa_fn((ffa_value_t){
+			      .a0 = FFA_MEM_OP_RESUME,
+			      .a1 = ret.a1, .a2 = ret.a2,
+			      }, &ret);
+
+	if (ret.a0 == FFA_ERROR)
+		return ffa_to_linux_errno((int)ret.a2);
+
+	if (ret.a0 != FFA_MEM_FRAG_RX)
+		return -EOPNOTSUPP;
+
+	return ret.a3;
+}
+
+static int
+ffa_transmit_fragment(u32 func_id, phys_addr_t buf, u32 buf_sz, u32 frag_len,
+		      u32 len, u64 *handle, bool first)
+{
+	if (!first)
+		return ffa_mem_next_frag(*handle, frag_len);
+
+	return ffa_mem_first_frag(func_id, buf, buf_sz, frag_len, len, handle);
+}
+
+static u32 ffa_get_num_pages_sg(struct scatterlist *sg)
+{
+	u32 num_pages = 0;
+
+	do {
+		num_pages += sg->length / FFA_PAGE_SIZE;
+	} while ((sg = sg_next(sg)));
+
+	return num_pages;
+}
+
+static int
+ffa_setup_and_transmit(u32 func_id, void *buffer, u32 max_fragsize,
+		       struct ffa_mem_ops_args *args)
+{
+	int rc = 0;
+	bool first = true;
+	phys_addr_t addr = 0;
+	struct ffa_composite_mem_region *composite;
+	struct ffa_mem_region_addr_range *constituents;
+	struct ffa_mem_region_attributes *ep_mem_access;
+	struct ffa_mem_region *mem_region = buffer;
+	u32 idx, frag_len, length, buf_sz = 0, num_entries = sg_nents(args->sg);
+
+	mem_region->tag = args->tag;
+	mem_region->flags = args->flags;
+	mem_region->sender_id = drv_info->vm_id;
+	mem_region->attributes = FFA_MEM_NORMAL | FFA_MEM_WRITE_BACK |
+				 FFA_MEM_INNER_SHAREABLE;
+	ep_mem_access = &mem_region->ep_mem_access[0];
+
+	for (idx = 0; idx < args->nattrs; idx++, ep_mem_access++) {
+		ep_mem_access->receiver = args->attrs[idx].receiver;
+		ep_mem_access->attrs = args->attrs[idx].attrs;
+		ep_mem_access->composite_off = COMPOSITE_OFFSET(args->nattrs);
+	}
+	mem_region->ep_count = args->nattrs;
+
+	composite = buffer + COMPOSITE_OFFSET(args->nattrs);
+	composite->total_pg_cnt = ffa_get_num_pages_sg(args->sg);
+	composite->addr_range_cnt = num_entries;
+
+	length = COMPOSITE_CONSTITUENTS_OFFSET(args->nattrs, num_entries);
+	frag_len = COMPOSITE_CONSTITUENTS_OFFSET(args->nattrs, 0);
+	if (frag_len > max_fragsize)
+		return -ENXIO;
+
+	if (!args->use_txbuf) {
+		addr = virt_to_phys(buffer);
+		buf_sz = max_fragsize / FFA_PAGE_SIZE;
+	}
+
+	constituents = buffer + frag_len;
+	idx = 0;
+	do {
+		if (frag_len == max_fragsize) {
+			rc = ffa_transmit_fragment(func_id, addr, buf_sz,
+						   frag_len, length,
+						   &args->g_handle, first);
+			if (rc < 0)
+				return -ENXIO;
+
+			first = false;
+			idx = 0;
+			frag_len = 0;
+			constituents = buffer;
+		}
+
+		if ((void *)constituents - buffer > max_fragsize) {
+			pr_err("Memory Region Fragment > Tx Buffer size\n");
+			return -EFAULT;
+		}
+
+		constituents->address = sg_phys(args->sg);
+		constituents->pg_cnt = args->sg->length / FFA_PAGE_SIZE;
+		constituents++;
+		frag_len += sizeof(struct ffa_mem_region_addr_range);
+	} while ((args->sg = sg_next(args->sg)));
+
+	return ffa_transmit_fragment(func_id, addr, buf_sz, frag_len,
+				     length, &args->g_handle, first);
+}
+
+static int ffa_memory_ops(u32 func_id, struct ffa_mem_ops_args *args)
+{
+	int ret;
+	void *buffer;
+
+	if (!args->use_txbuf) {
+		buffer = alloc_pages_exact(RXTX_BUFFER_SIZE, GFP_KERNEL);
+		if (!buffer)
+			return -ENOMEM;
+	} else {
+		buffer = drv_info->tx_buffer;
+		mutex_lock(&drv_info->tx_lock);
+	}
+
+	ret = ffa_setup_and_transmit(func_id, buffer, RXTX_BUFFER_SIZE, args);
+
+	if (args->use_txbuf)
+		mutex_unlock(&drv_info->tx_lock);
+	else
+		free_pages_exact(buffer, RXTX_BUFFER_SIZE);
+
+	return ret < 0 ? ret : 0;
+}
+
+static int ffa_memory_reclaim(u64 g_handle, u32 flags)
+{
+	ffa_value_t ret;
+
+	invoke_ffa_fn((ffa_value_t){
+		      .a0 = FFA_MEM_RECLAIM,
+		      .a1 = HANDLE_LOW(g_handle), .a2 = HANDLE_HIGH(g_handle),
+		      .a3 = flags,
+		      }, &ret);
+
+	if (ret.a0 == FFA_ERROR)
+		return ffa_to_linux_errno((int)ret.a2);
+
+	return 0;
+}
+
 static u32 ffa_api_version_get(void)
 {
 	return drv_info->version;
@@ -387,11 +575,22 @@ static int ffa_sync_send_receive(struct ffa_device *dev,
 				       dev->mode_32bit, data);
 }
 
+static int
+ffa_memory_share(struct ffa_device *dev, struct ffa_mem_ops_args *args)
+{
+	if (dev->mode_32bit)
+		return ffa_memory_ops(FFA_MEM_SHARE, args);
+
+	return ffa_memory_ops(FFA_FN_NATIVE(MEM_SHARE), args);
+}
+
 static const struct ffa_dev_ops ffa_ops = {
 	.api_version_get = ffa_api_version_get,
 	.partition_info_get = ffa_partition_info_get,
 	.mode_32bit_set = ffa_mode_32bit_set,
 	.sync_send_receive = ffa_sync_send_receive,
+	.memory_reclaim = ffa_memory_reclaim,
+	.memory_share = ffa_memory_share,
 };
 
 const struct ffa_dev_ops *ffa_dev_ops_get(struct ffa_device *dev)
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index d672673fc621..505c679b6a9b 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -116,6 +116,142 @@ struct ffa_send_direct_data {
 	unsigned long data4; /* w7/x7 */
 };
 
+struct ffa_mem_region_addr_range {
+	/* The base IPA of the constituent memory region, aligned to 4 kiB */
+	u64 address;
+	/* The number of 4 kiB pages in the constituent memory region. */
+	u32 pg_cnt;
+	u32 reserved;
+};
+
+struct ffa_composite_mem_region {
+	/*
+	 * The total number of 4 kiB pages included in this memory region. This
+	 * must be equal to the sum of page counts specified in each
+	 * `struct ffa_mem_region_addr_range`.
+	 */
+	u32 total_pg_cnt;
+	/* The number of constituents included in this memory region range */
+	u32 addr_range_cnt;
+	u64 reserved;
+	/** An array of `addr_range_cnt` memory region constituents. */
+	struct ffa_mem_region_addr_range constituents[];
+};
+
+struct ffa_mem_region_attributes {
+	/* The ID of the VM to which the memory is being given or shared. */
+	u16 receiver;
+	/*
+	 * The permissions with which the memory region should be mapped in the
+	 * receiver's page table.
+	 */
+#define FFA_MEM_EXEC		BIT(3)
+#define FFA_MEM_NO_EXEC		BIT(2)
+#define FFA_MEM_RW		BIT(1)
+#define FFA_MEM_RO		BIT(0)
+	u8 attrs;
+	/*
+	 * Flags used during FFA_MEM_RETRIEVE_REQ and FFA_MEM_RETRIEVE_RESP
+	 * for memory regions with multiple borrowers.
+	 */
+#define FFA_MEM_RETRIEVE_SELF_BORROWER	BIT(0)
+	u8 flag;
+	u32 composite_off;
+	/*
+	 * Offset in bytes from the start of the outer `ffa_memory_region` to
+	 * an `struct ffa_mem_region_addr_range`.
+	 */
+	u64 reserved;
+};
+
+struct ffa_mem_region {
+	/* The ID of the VM/owner which originally sent the memory region */
+	u16 sender_id;
+#define FFA_MEM_NORMAL		BIT(5)
+#define FFA_MEM_DEVICE		BIT(4)
+
+#define FFA_MEM_WRITE_BACK	(3 << 2)
+#define FFA_MEM_NON_CACHEABLE	(1 << 2)
+
+#define FFA_DEV_nGnRnE		(0 << 2)
+#define FFA_DEV_nGnRE		(1 << 2)
+#define FFA_DEV_nGRE		(2 << 2)
+#define FFA_DEV_GRE		(3 << 2)
+
+#define FFA_MEM_NON_SHAREABLE	(0)
+#define FFA_MEM_OUTER_SHAREABLE	(2)
+#define FFA_MEM_INNER_SHAREABLE	(3)
+	u8 attributes;
+	u8 reserved_0;
+/*
+ * Clear memory region contents after unmapping it from the sender and
+ * before mapping it for any receiver.
+ */
+#define FFA_MEM_CLEAR			BIT(0)
+/*
+ * Whether the hypervisor may time slice the memory sharing or retrieval
+ * operation.
+ */
+#define FFA_TIME_SLICE_ENABLE		BIT(1)
+
+#define FFA_MEM_RETRIEVE_TYPE_IN_RESP	(0 << 3)
+#define FFA_MEM_RETRIEVE_TYPE_SHARE	(1 << 3)
+#define FFA_MEM_RETRIEVE_TYPE_LEND	(2 << 3)
+#define FFA_MEM_RETRIEVE_TYPE_DONATE	(3 << 3)
+
+#define FFA_MEM_RETRIEVE_ADDR_ALIGN_HINT	BIT(9)
+#define FFA_MEM_RETRIEVE_ADDR_ALIGN(x)		((x) << 5)
+	/* Flags to control behaviour of the transaction. */
+	u32 flags;
+#define HANDLE_LOW_MASK		GENMASK_ULL(31, 0)
+#define HANDLE_HIGH_MASK	GENMASK_ULL(63, 32)
+#define HANDLE_LOW(x)		((u32)(FIELD_GET(HANDLE_LOW_MASK, (x))))
+#define	HANDLE_HIGH(x)		((u32)(FIELD_GET(HANDLE_HIGH_MASK, (x))))
+
+#define PACK_HANDLE(l, h)		\
+	(FIELD_PREP(HANDLE_LOW_MASK, (l)) | FIELD_PREP(HANDLE_HIGH_MASK, (h)))
+	/*
+	 * A globally-unique ID assigned by the hypervisor for a region
+	 * of memory being sent between VMs.
+	 */
+	u64 handle;
+	/*
+	 * An implementation defined value associated with the receiver and the
+	 * memory region.
+	 */
+	u64 tag;
+	u32 reserved_1;
+	/*
+	 * The number of `ffa_mem_region_attributes` entries included in this
+	 * transaction.
+	 */
+	u32 ep_count;
+	/*
+	 * An array of endpoint memory access descriptors.
+	 * Each one specifies a memory region offset, an endpoint and the
+	 * attributes with which this memory region should be mapped in that
+	 * endpoint's page table.
+	 */
+	struct ffa_mem_region_attributes ep_mem_access[];
+};
+
+#define	COMPOSITE_OFFSET(x)	\
+	(offsetof(struct ffa_mem_region, ep_mem_access[x]))
+#define CONSTITUENTS_OFFSET(x)	\
+	(offsetof(struct ffa_composite_mem_region, constituents[x]))
+#define COMPOSITE_CONSTITUENTS_OFFSET(x, y)	\
+	(COMPOSITE_OFFSET(x) + CONSTITUENTS_OFFSET(y))
+
+struct ffa_mem_ops_args {
+	bool use_txbuf;
+	u32 nattrs;
+	u32 flags;
+	u64 tag;
+	u64 g_handle;
+	struct scatterlist *sg;
+	struct ffa_mem_region_attributes *attrs;
+};
+
 struct ffa_dev_ops {
 	u32 (*api_version_get)(void);
 	int (*partition_info_get)(const char *uuid_str,
@@ -123,6 +259,9 @@ struct ffa_dev_ops {
 	void (*mode_32bit_set)(struct ffa_device *dev);
 	int (*sync_send_receive)(struct ffa_device *dev,
 				 struct ffa_send_direct_data *data);
+	int (*memory_reclaim)(u64 g_handle, u32 flags);
+	int (*memory_share)(struct ffa_device *dev,
+			    struct ffa_mem_ops_args *args);
 };
 
 #endif /* _LINUX_ARM_FFA_H */
-- 
cgit v1.2.3


From 125217e0967fc905be35a3b2c9ba4db9a8616b92 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Tue, 25 May 2021 18:00:38 -0500
Subject: i40e: Replace one-element array with flexible-array member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having a
dynamically sized set of trailing elements in a structure. Kernel code
should always use “flexible array members”[1] for these cases. The older
style of one-element or zero-length arrays should no longer be used[2].

Refactor the code according to the use of a flexible-array member in struct
i40e_qvlist_info instead of one-element array, and use the struct_size()
helper.

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/79
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Acked-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/infiniband/hw/i40iw/i40iw_main.c      | 5 ++---
 drivers/net/ethernet/intel/i40e/i40e_client.c | 2 +-
 include/linux/net/intel/i40e_client.h         | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index b496f30ce066..364f69cd620f 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1423,7 +1423,7 @@ static enum i40iw_status_code i40iw_save_msix_info(struct i40iw_device *iwdev,
 	struct i40e_qv_info *iw_qvinfo;
 	u32 ceq_idx;
 	u32 i;
-	u32 size;
+	size_t size;
 
 	if (!ldev->msix_count) {
 		i40iw_pr_err("No MSI-X vectors\n");
@@ -1433,8 +1433,7 @@ static enum i40iw_status_code i40iw_save_msix_info(struct i40iw_device *iwdev,
 	iwdev->msix_count = ldev->msix_count;
 
 	size = sizeof(struct i40iw_msix_vector) * iwdev->msix_count;
-	size += sizeof(struct i40e_qvlist_info);
-	size +=  sizeof(struct i40e_qv_info) * iwdev->msix_count - 1;
+	size += struct_size(iw_qvlist, qv_info, iwdev->msix_count);
 	iwdev->iw_msixtbl = kzalloc(size, GFP_KERNEL);
 
 	if (!iwdev->iw_msixtbl)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c
index 32f3facbed1a..63eab14a26df 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -579,7 +579,7 @@ static int i40e_client_setup_qvlist(struct i40e_info *ldev,
 	u32 v_idx, i, reg_idx, reg;
 
 	ldev->qvlist_info = kzalloc(struct_size(ldev->qvlist_info, qv_info,
-				    qvlist_info->num_vectors - 1), GFP_KERNEL);
+				    qvlist_info->num_vectors), GFP_KERNEL);
 	if (!ldev->qvlist_info)
 		return -ENOMEM;
 	ldev->qvlist_info->num_vectors = qvlist_info->num_vectors;
diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h
index f41387a8969f..fd7bc860a241 100644
--- a/include/linux/net/intel/i40e_client.h
+++ b/include/linux/net/intel/i40e_client.h
@@ -48,7 +48,7 @@ struct i40e_qv_info {
 
 struct i40e_qvlist_info {
 	u32 num_vectors;
-	struct i40e_qv_info qv_info[1];
+	struct i40e_qv_info qv_info[];
 };
 
 
-- 
cgit v1.2.3


From 73e33008e865e5a7b79282331c2d6e920d5d47f8 Mon Sep 17 00:00:00 2001
From: Chunfeng Yun <chunfeng.yun@mediatek.com>
Date: Tue, 25 May 2021 16:53:05 +0800
Subject: usb: roles: add helper usb_role_string()

Introduces usb_role_string() function, which returns a
human-readable name of provided usb role, it's useful to
make the log readable.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Chunfeng Yun <chunfeng.yun@mediatek.com>
Link: https://lore.kernel.org/r/1621932786-9335-1-git-send-email-chunfeng.yun@mediatek.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/roles/class.c | 9 +++++++++
 include/linux/usb/role.h  | 6 ++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c
index 33b637d0d8d9..dfaed7eee94f 100644
--- a/drivers/usb/roles/class.c
+++ b/drivers/usb/roles/class.c
@@ -214,6 +214,15 @@ static const char * const usb_roles[] = {
 	[USB_ROLE_DEVICE]	= "device",
 };
 
+const char *usb_role_string(enum usb_role role)
+{
+	if (role < 0 || role >= ARRAY_SIZE(usb_roles))
+		return "unknown";
+
+	return usb_roles[role];
+}
+EXPORT_SYMBOL_GPL(usb_role_string);
+
 static ssize_t
 role_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
diff --git a/include/linux/usb/role.h b/include/linux/usb/role.h
index 0164fed31b06..031f148ab373 100644
--- a/include/linux/usb/role.h
+++ b/include/linux/usb/role.h
@@ -65,6 +65,7 @@ void usb_role_switch_unregister(struct usb_role_switch *sw);
 
 void usb_role_switch_set_drvdata(struct usb_role_switch *sw, void *data);
 void *usb_role_switch_get_drvdata(struct usb_role_switch *sw);
+const char *usb_role_string(enum usb_role role);
 #else
 static inline int usb_role_switch_set_role(struct usb_role_switch *sw,
 		enum usb_role role)
@@ -109,6 +110,11 @@ static inline void *usb_role_switch_get_drvdata(struct usb_role_switch *sw)
 	return NULL;
 }
 
+static inline const char *usb_role_string(enum usb_role role)
+{
+	return "unknown";
+}
+
 #endif
 
 #endif /* __LINUX_USB_ROLE_H */
-- 
cgit v1.2.3


From 70f400d4d957c2453c8689552ff212bc59f88938 Mon Sep 17 00:00:00 2001
From: Rajat Jain <rajatja@google.com>
Date: Mon, 24 May 2021 10:18:11 -0700
Subject: driver core: Move the "removable" attribute from USB to core

Move the "removable" attribute from USB to core in order to allow it to be
supported by other subsystem / buses. Individual buses that want to support
this attribute can populate the removable property of the device while
enumerating it with the 3 possible values -
 - "unknown"
 - "fixed"
 - "removable"
Leaving the field unchanged (i.e. "not supported") would mean that the
attribute would not show up in sysfs for that device. The UAPI (location,
symantics etc) for the attribute remains unchanged.

Move the "removable" attribute from USB to the device core so it can be
used by other subsystems / buses.

By default, devices do not have a "removable" attribute in sysfs.

If a subsystem or bus driver wants to support a "removable" attribute, it
should call device_set_removable() before calling device_register() or
device_add(), e.g.:

    device_set_removable(dev, DEVICE_REMOVABLE);
    device_register(dev);

The possible values and the resulting sysfs attribute contents are:

    DEVICE_REMOVABLE_UNKNOWN  ->  "unknown"
    DEVICE_REMOVABLE          ->  "removable"
    DEVICE_FIXED              ->  "fixed"

Convert the USB "removable" attribute to use this new device core
functionality.  There should be no user-visible change in the location or
semantics of attribute for USB devices.

Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Rajat Jain <rajatja@google.com>
Link: https://lore.kernel.org/r/20210524171812.18095-1-rajatja@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-usb           | 11 -------
 Documentation/ABI/testing/sysfs-devices-removable | 17 +++++++++++
 drivers/base/core.c                               | 28 +++++++++++++++++
 drivers/usb/core/hub.c                            | 13 ++++----
 drivers/usb/core/sysfs.c                          | 24 ---------------
 include/linux/device.h                            | 37 +++++++++++++++++++++++
 include/linux/usb.h                               |  7 -----
 7 files changed, 89 insertions(+), 48 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-removable

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index bf2c1968525f..73eb23bc1f34 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -154,17 +154,6 @@ Description:
 		files hold a string value (enable or disable) indicating whether
 		or not USB3 hardware LPM U1 or U2 is enabled for the device.
 
-What:		/sys/bus/usb/devices/.../removable
-Date:		February 2012
-Contact:	Matthew Garrett <mjg@redhat.com>
-Description:
-		Some information about whether a given USB device is
-		physically fixed to the platform can be inferred from a
-		combination of hub descriptor bits and platform-specific data
-		such as ACPI. This file will read either "removable" or
-		"fixed" if the information is available, and "unknown"
-		otherwise.
-
 What:		/sys/bus/usb/devices/.../ltm_capable
 Date:		July 2012
 Contact:	Sarah Sharp <sarah.a.sharp@linux.intel.com>
diff --git a/Documentation/ABI/testing/sysfs-devices-removable b/Documentation/ABI/testing/sysfs-devices-removable
new file mode 100644
index 000000000000..acf7766e800b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-removable
@@ -0,0 +1,17 @@
+What:		/sys/devices/.../removable
+Date:		May 2021
+Contact:	Rajat Jain <rajatxjain@gmail.com>
+Description:
+		Information about whether a given device can be removed from the
+		platform by the	user. This is determined by its subsystem in a
+		bus / platform-specific way. This attribute is only present for
+		devices that can support determining such information:
+
+		"removable": device can be removed from the platform by the user
+		"fixed":     device is fixed to the platform / cannot be removed
+			     by the user.
+		"unknown":   The information is unavailable / cannot be deduced.
+
+		Currently this is only supported by USB (which infers the
+		information from a combination of hub descriptor bits and
+		platform-specific data such as ACPI).
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 628e33939aca..f0a2331c71a0 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2405,6 +2405,25 @@ static ssize_t online_store(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RW(online);
 
+static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	const char *loc;
+
+	switch (dev->removable) {
+	case DEVICE_REMOVABLE:
+		loc = "removable";
+		break;
+	case DEVICE_FIXED:
+		loc = "fixed";
+		break;
+	default:
+		loc = "unknown";
+	}
+	return sysfs_emit(buf, "%s\n", loc);
+}
+static DEVICE_ATTR_RO(removable);
+
 int device_add_groups(struct device *dev, const struct attribute_group **groups)
 {
 	return sysfs_create_groups(&dev->kobj, groups);
@@ -2582,8 +2601,16 @@ static int device_add_attrs(struct device *dev)
 			goto err_remove_dev_online;
 	}
 
+	if (dev_removable_is_valid(dev)) {
+		error = device_create_file(dev, &dev_attr_removable);
+		if (error)
+			goto err_remove_dev_waiting_for_supplier;
+	}
+
 	return 0;
 
+ err_remove_dev_waiting_for_supplier:
+	device_remove_file(dev, &dev_attr_waiting_for_supplier);
  err_remove_dev_online:
 	device_remove_file(dev, &dev_attr_online);
  err_remove_dev_groups:
@@ -2603,6 +2630,7 @@ static void device_remove_attrs(struct device *dev)
 	struct class *class = dev->class;
 	const struct device_type *type = dev->type;
 
+	device_remove_file(dev, &dev_attr_removable);
 	device_remove_file(dev, &dev_attr_waiting_for_supplier);
 	device_remove_file(dev, &dev_attr_online);
 	device_remove_groups(dev, dev->groups);
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index bbe7c17b49d1..3bd379d5d1be 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2432,6 +2432,8 @@ static void set_usb_port_removable(struct usb_device *udev)
 	u16 wHubCharacteristics;
 	bool removable = true;
 
+	dev_set_removable(&udev->dev, DEVICE_REMOVABLE_UNKNOWN);
+
 	if (!hdev)
 		return;
 
@@ -2443,11 +2445,11 @@ static void set_usb_port_removable(struct usb_device *udev)
 	 */
 	switch (hub->ports[udev->portnum - 1]->connect_type) {
 	case USB_PORT_CONNECT_TYPE_HOT_PLUG:
-		udev->removable = USB_DEVICE_REMOVABLE;
+		dev_set_removable(&udev->dev, DEVICE_REMOVABLE);
 		return;
 	case USB_PORT_CONNECT_TYPE_HARD_WIRED:
 	case USB_PORT_NOT_USED:
-		udev->removable = USB_DEVICE_FIXED;
+		dev_set_removable(&udev->dev, DEVICE_FIXED);
 		return;
 	default:
 		break;
@@ -2472,9 +2474,9 @@ static void set_usb_port_removable(struct usb_device *udev)
 	}
 
 	if (removable)
-		udev->removable = USB_DEVICE_REMOVABLE;
+		dev_set_removable(&udev->dev, DEVICE_REMOVABLE);
 	else
-		udev->removable = USB_DEVICE_FIXED;
+		dev_set_removable(&udev->dev, DEVICE_FIXED);
 
 }
 
@@ -2546,8 +2548,7 @@ int usb_new_device(struct usb_device *udev)
 	device_enable_async_suspend(&udev->dev);
 
 	/* check whether the hub or firmware marks this port as non-removable */
-	if (udev->parent)
-		set_usb_port_removable(udev);
+	set_usb_port_removable(udev);
 
 	/* Register the device.  The device driver is responsible
 	 * for configuring the device and invoking the add-device
diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index 5a168ba9fc51..fa2e49d432ff 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c
@@ -301,29 +301,6 @@ static ssize_t urbnum_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(urbnum);
 
-static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
-			      char *buf)
-{
-	struct usb_device *udev;
-	char *state;
-
-	udev = to_usb_device(dev);
-
-	switch (udev->removable) {
-	case USB_DEVICE_REMOVABLE:
-		state = "removable";
-		break;
-	case USB_DEVICE_FIXED:
-		state = "fixed";
-		break;
-	default:
-		state = "unknown";
-	}
-
-	return sprintf(buf, "%s\n", state);
-}
-static DEVICE_ATTR_RO(removable);
-
 static ssize_t ltm_capable_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
@@ -828,7 +805,6 @@ static struct attribute *dev_attrs[] = {
 	&dev_attr_avoid_reset_quirk.attr,
 	&dev_attr_authorized.attr,
 	&dev_attr_remove.attr,
-	&dev_attr_removable.attr,
 	&dev_attr_ltm_capable.attr,
 #ifdef CONFIG_OF
 	&dev_attr_devspec.attr,
diff --git a/include/linux/device.h b/include/linux/device.h
index 38a2071cf776..8566fa98b239 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -350,6 +350,22 @@ enum dl_dev_state {
 	DL_DEV_UNBINDING,
 };
 
+/**
+ * enum device_removable - Whether the device is removable. The criteria for a
+ * device to be classified as removable is determined by its subsystem or bus.
+ * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this
+ *				    device (default).
+ * @DEVICE_REMOVABLE_UNKNOWN:  Device location is Unknown.
+ * @DEVICE_FIXED: Device is not removable by the user.
+ * @DEVICE_REMOVABLE: Device is removable by the user.
+ */
+enum device_removable {
+	DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0 */
+	DEVICE_REMOVABLE_UNKNOWN,
+	DEVICE_FIXED,
+	DEVICE_REMOVABLE,
+};
+
 /**
  * struct dev_links_info - Device data related to device links.
  * @suppliers: List of links to supplier devices.
@@ -431,6 +447,9 @@ struct dev_links_info {
  * 		device (i.e. the bus driver that discovered the device).
  * @iommu_group: IOMMU group the device belongs to.
  * @iommu:	Per device generic IOMMU runtime data
+ * @removable:  Whether the device can be removed from the system. This
+ *              should be set by the subsystem / bus driver that discovered
+ *              the device.
  *
  * @offline_disabled: If set, the device is permanently online.
  * @offline:	Set after successful invocation of bus type's .offline().
@@ -544,6 +563,8 @@ struct device {
 	struct iommu_group	*iommu_group;
 	struct dev_iommu	*iommu;
 
+	enum device_removable	removable;
+
 	bool			offline_disabled:1;
 	bool			offline:1;
 	bool			of_node_reused:1;
@@ -782,6 +803,22 @@ static inline bool dev_has_sync_state(struct device *dev)
 	return false;
 }
 
+static inline void dev_set_removable(struct device *dev,
+				     enum device_removable removable)
+{
+	dev->removable = removable;
+}
+
+static inline bool dev_is_removable(struct device *dev)
+{
+	return dev->removable == DEVICE_REMOVABLE;
+}
+
+static inline bool dev_removable_is_valid(struct device *dev)
+{
+	return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED;
+}
+
 /*
  * High level routines for use by the bus drivers
  */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 4db6b824af5c..7ccaa76a9a96 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -473,12 +473,6 @@ struct usb_dev_state;
 
 struct usb_tt;
 
-enum usb_device_removable {
-	USB_DEVICE_REMOVABLE_UNKNOWN = 0,
-	USB_DEVICE_REMOVABLE,
-	USB_DEVICE_FIXED,
-};
-
 enum usb_port_connect_type {
 	USB_PORT_CONNECT_TYPE_UNKNOWN = 0,
 	USB_PORT_CONNECT_TYPE_HOT_PLUG,
@@ -703,7 +697,6 @@ struct usb_device {
 #endif
 	struct wusb_dev *wusb_dev;
 	int slot_id;
-	enum usb_device_removable removable;
 	struct usb2_lpm_parameters l1_params;
 	struct usb3_lpm_parameters u1_params;
 	struct usb3_lpm_parameters u2_params;
-- 
cgit v1.2.3


From 6bd5b743686243dae7351d5dcceeb7f171201bb4 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Tue, 18 May 2021 05:00:31 -0700
Subject: KVM: PPC: exit halt polling on need_resched()

This is inspired by commit 262de4102c7bb8 (kvm: exit halt polling on
need_resched() as well). Due to PPC implements an arch specific halt
polling logic, we have to the need_resched() check there as well. This
patch adds a helper function that can be shared between book3s and generic
halt-polling loops.

Reviewed-by: David Matlack <dmatlack@google.com>
Reviewed-by: Venkatesh Srinivas <venkateshs@chromium.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Venkatesh Srinivas <venkateshs@chromium.org>
Cc: Jim Mattson <jmattson@google.com>
Cc: David Matlack <dmatlack@google.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1621339235-11131-1-git-send-email-wanpengli@tencent.com>
[Make the function inline. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s_hv.c | 2 +-
 include/linux/kvm_host.h     | 6 ++++++
 virt/kvm/kvm_main.c          | 3 +--
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 28a80d240b76..7360350e66ff 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3936,7 +3936,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 				break;
 			}
 			cur = ktime_get();
-		} while (single_task_running() && ktime_before(cur, stop));
+		} while (kvm_vcpu_can_poll(cur, stop));
 
 		spin_lock(&vc->lock);
 		vc->vcore_state = VCORE_INACTIVE;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2f34487e21f2..5d4b96b36ec0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -10,6 +10,7 @@
 #include <linux/spinlock.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
+#include <linux/sched/stat.h>
 #include <linux/bug.h>
 #include <linux/minmax.h>
 #include <linux/mm.h>
@@ -265,6 +266,11 @@ static inline bool kvm_vcpu_mapped(struct kvm_host_map *map)
 	return !!map->hva;
 }
 
+static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop)
+{
+	return single_task_running() && !need_resched() && ktime_before(cur, stop);
+}
+
 /*
  * Sometimes a large or cross-page mmio needs to be broken up into separate
  * exits for userspace servicing.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6b4feb92dc79..5f40725144f5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2973,8 +2973,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 				goto out;
 			}
 			poll_end = cur = ktime_get();
-		} while (single_task_running() && !need_resched() &&
-			 ktime_before(cur, stop));
+		} while (kvm_vcpu_can_poll(cur, stop));
 	}
 
 	prepare_to_rcuwait(&vcpu->wait);
-- 
cgit v1.2.3


From 084071d5e9226add45a6031928bf10e6afc855fd Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 25 May 2021 10:41:17 -0300
Subject: KVM: rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK

KVM_REQ_UNBLOCK will be used to exit a vcpu from
its inner vcpu halt emulation loop.

Rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK, switch
PowerPC to arch specific request bit.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Message-Id: <20210525134321.303768132@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/vcpu-requests.rst | 8 +++++---
 arch/powerpc/include/asm/kvm_host.h      | 1 +
 arch/x86/kvm/lapic.c                     | 2 +-
 arch/x86/kvm/x86.c                       | 2 +-
 include/linux/kvm_host.h                 | 2 +-
 virt/kvm/kvm_main.c                      | 2 ++
 6 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst
index 5feb3706a7ae..af1b37441e0a 100644
--- a/Documentation/virt/kvm/vcpu-requests.rst
+++ b/Documentation/virt/kvm/vcpu-requests.rst
@@ -118,10 +118,12 @@ KVM_REQ_MMU_RELOAD
   necessary to inform each VCPU to completely refresh the tables.  This
   request is used for that.
 
-KVM_REQ_PENDING_TIMER
+KVM_REQ_UNBLOCK
 
-  This request may be made from a timer handler run on the host on behalf
-  of a VCPU.  It informs the VCPU thread to inject a timer interrupt.
+  This request informs the vCPU to exit kvm_vcpu_block.  It is used for
+  example from timer handlers that run on the host on behalf of a vCPU,
+  or in order to update the interrupt routing and ensure that assigned
+  devices will wake up the vCPU.
 
 KVM_REQ_UNHALT
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1e83359f286b..7f2e90db2050 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -51,6 +51,7 @@
 /* PPC-specific vcpu->requests bit members */
 #define KVM_REQ_WATCHDOG	KVM_ARCH_REQ(0)
 #define KVM_REQ_EPR_EXIT	KVM_ARCH_REQ(1)
+#define KVM_REQ_PENDING_TIMER	KVM_ARCH_REQ(2)
 
 #include <linux/mmu_notifier.h>
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 5d91f2367c31..8120e8614b92 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1669,7 +1669,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
 	}
 
 	atomic_inc(&apic->lapic_timer.pending);
-	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
 	if (from_timer_fn)
 		kvm_vcpu_kick(vcpu);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 98538b1cb453..fe464b66898f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9501,7 +9501,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 		if (r <= 0)
 			break;
 
-		kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
 		if (kvm_cpu_has_pending_timer(vcpu))
 			kvm_inject_pending_timer_irqs(vcpu);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5d4b96b36ec0..76102efbf079 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -147,7 +147,7 @@ static inline bool is_error_page(struct page *page)
  */
 #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_MMU_RELOAD        (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_PENDING_TIMER     2
+#define KVM_REQ_UNBLOCK           2
 #define KVM_REQ_UNHALT            3
 #define KVM_REQUEST_ARCH_BASE     8
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5f40725144f5..37a2d500a148 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2929,6 +2929,8 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 		goto out;
 	if (signal_pending(current))
 		goto out;
+	if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
+		goto out;
 
 	ret = 0;
 out:
-- 
cgit v1.2.3


From d327ea15a305024ef0085252fa3657bbb1ce25f5 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Tue, 25 May 2021 13:20:12 +0100
Subject: random32: Fix implicit truncation warning in prandom_seed_state()

sparse generates the following warning:

 include/linux/prandom.h:114:45: sparse: sparse: cast truncates bits from
 constant value

This is because the 64-bit seed value is manipulated and then placed in a
u32, causing an implicit cast and truncation. A forced cast to u32 doesn't
prevent this warning, which is reasonable because a typecast doesn't prove
that truncation was expected.

Logical-AND the value with 0xffffffff to make explicit that truncation to
32-bit is intended.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20210525122012.6336-3-rf@opensource.cirrus.com
---
 include/linux/prandom.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index bbf4b4ad61df..056d31317e49 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -111,7 +111,7 @@ static inline u32 __seed(u32 x, u32 m)
  */
 static inline void prandom_seed_state(struct rnd_state *state, u64 seed)
 {
-	u32 i = (seed >> 32) ^ (seed << 10) ^ seed;
+	u32 i = ((seed >> 32) ^ (seed << 10) ^ seed) & 0xffffffffUL;
 
 	state->s1 = __seed(i,   2U);
 	state->s2 = __seed(i,   8U);
-- 
cgit v1.2.3


From 39b27e89a76f3827ad93aed9213a6daf2b91f819 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 25 May 2021 12:37:11 +0200
Subject: driver core: Drop helper devm_platform_ioremap_resource_wc()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the macro was introduced in 2019 (commit bb6243b4f73d ("drivers:
platform: provide devm_platform_ioremap_resource_wc()") there is only a
single user which hardly justifies the function for the small task it
provides.

So drop the helper and open-code it in the only user. Adapt the non-wc
case accordingly.

For a all-mod-config build on amd64 this change introduces the following
changes according to bloat-o-meter:

add/remove: 0/1 grow/shrink: 1/0 up/down: 20/-252 (-232)
Function                                     old     new   delta
devm_platform_ioremap_resource_wc            252       -    -252
sram_probe                                   796     816     +20

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20210525103711.956438-1-u.kleine-koenig@pengutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/driver-model/devres.rst |  1 -
 drivers/base/platform.c                          | 20 --------------------
 drivers/misc/sram.c                              |  6 ++++--
 include/linux/platform_device.h                  |  3 ---
 4 files changed, 4 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index e0814d214048..0fe1fffa295e 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -314,7 +314,6 @@ IOMAP
   devm_ioremap_resource() : checks resource, requests memory region, ioremaps
   devm_ioremap_resource_wc()
   devm_platform_ioremap_resource() : calls devm_ioremap_resource() for platform device
-  devm_platform_ioremap_resource_wc()
   devm_platform_ioremap_resource_byname()
   devm_platform_get_and_ioremap_resource()
   devm_iounmap()
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 9cd34def2237..ae071e8ed665 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -124,26 +124,6 @@ void __iomem *devm_platform_ioremap_resource(struct platform_device *pdev,
 }
 EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource);
 
-/**
- * devm_platform_ioremap_resource_wc - write-combined variant of
- *                                     devm_platform_ioremap_resource()
- *
- * @pdev: platform device to use both for memory resource lookup as well as
- *        resource management
- * @index: resource index
- *
- * Return: a pointer to the remapped memory or an ERR_PTR() encoded error code
- * on failure.
- */
-void __iomem *devm_platform_ioremap_resource_wc(struct platform_device *pdev,
-						unsigned int index)
-{
-	struct resource *res;
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, index);
-	return devm_ioremap_resource_wc(&pdev->dev, res);
-}
-
 /**
  * devm_platform_ioremap_resource_byname - call devm_ioremap_resource for
  *					   a platform device, retrieve the
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 202bf951e909..93638ae2753a 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -341,6 +341,7 @@ static int sram_probe(struct platform_device *pdev)
 {
 	struct sram_dev *sram;
 	int ret;
+	struct resource *res;
 	int (*init_func)(void);
 
 	sram = devm_kzalloc(&pdev->dev, sizeof(*sram), GFP_KERNEL);
@@ -349,10 +350,11 @@ static int sram_probe(struct platform_device *pdev)
 
 	sram->dev = &pdev->dev;
 
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (of_property_read_bool(pdev->dev.of_node, "no-memory-wc"))
-		sram->virt_base = devm_platform_ioremap_resource(pdev, 0);
+		sram->virt_base = devm_ioremap_resource(&pdev->dev, res);
 	else
-		sram->virt_base = devm_platform_ioremap_resource_wc(pdev, 0);
+		sram->virt_base = devm_ioremap_resource_wc(&pdev->dev, res);
 	if (IS_ERR(sram->virt_base)) {
 		dev_err(&pdev->dev, "could not map SRAM registers\n");
 		return PTR_ERR(sram->virt_base);
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index cd81e060863c..ed42ea9f60ba 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -66,9 +66,6 @@ extern void __iomem *
 devm_platform_ioremap_resource(struct platform_device *pdev,
 			       unsigned int index);
 extern void __iomem *
-devm_platform_ioremap_resource_wc(struct platform_device *pdev,
-				  unsigned int index);
-extern void __iomem *
 devm_platform_ioremap_resource_byname(struct platform_device *pdev,
 				      const char *name);
 extern int platform_get_irq(struct platform_device *, unsigned int);
-- 
cgit v1.2.3


From 16b79a1e083371a38f72872345866e81abb7ca18 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Tue, 25 May 2021 14:47:16 -0400
Subject: soc: samsung: pmu: drop EXYNOS_CENTRAL_SEQ_OPTION defines

The defines for Exynos5 CENTRAL_SEQ_OPTION (e.g.
EXYNOS5_USE_STANDBYWFI_ARM_CORE1) are not used.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Reviewed-by: Alim Akhtar <alim.akhtar@samsung.com>
Link: https://lore.kernel.org/r/20210525184716.119663-1-krzysztof.kozlowski@canonical.com
---
 include/linux/soc/samsung/exynos-regs-pmu.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index fc9250fb3133..aa840ed043e1 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -611,12 +611,6 @@
 #define EXYNOS5420_FSYS2_OPTION					0x4168
 #define EXYNOS5420_PSGEN_OPTION					0x4188
 
-/* For EXYNOS_CENTRAL_SEQ_OPTION */
-#define EXYNOS5_USE_STANDBYWFI_ARM_CORE0			BIT(16)
-#define EXYNOS5_USE_STANDBYWFI_ARM_CORE1			BUT(17)
-#define EXYNOS5_USE_STANDBYWFE_ARM_CORE0			BIT(24)
-#define EXYNOS5_USE_STANDBYWFE_ARM_CORE1			BIT(25)
-
 #define EXYNOS5420_ARM_USE_STANDBY_WFI0				BIT(4)
 #define EXYNOS5420_ARM_USE_STANDBY_WFI1				BIT(5)
 #define EXYNOS5420_ARM_USE_STANDBY_WFI2				BIT(6)
-- 
cgit v1.2.3


From b973cf32453f78d8661a640d0a0167d1d41ea331 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@nvidia.com>
Date: Mon, 23 Nov 2020 14:48:22 -0600
Subject: net/mlx5e: TC: Reserved bit 31 of REG_C1 for IPsec offload

Currently ASAP features fully utilize all the bits of the CQE's flow tag
and ft_metadata field. The flow tag field cannot be used because the
flow table tagging in FTE does not allow partial write.

We agree to reserve bit 31 of CQE's ft_metadata for IPsec to avoid
ASAP CT from dropping IPsec offloaded packet

Here is the new bit layout of REG_C1. Tunnel option id is reduced to
11 bits:
< IPSEC MARKER (1) | ESW_TUN_ID(12) | ESW_TUN_OPTS(11) | ESW_ZONE_ID(8) >

Signed-off-by: Huy Nguyen <huyn@nvidia.com>
Signed-off-by: Raed Salem <raeds@nvidia.com>
Reviewed-by: Paul Blakey <paulb@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Paul Blakey <paulb@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h     |  2 +-
 include/linux/mlx5/eswitch.h                        | 17 ++++++++++-------
 3 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
index 6cdc52d50a48..8cef4e7cfa4b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
@@ -617,7 +617,7 @@ static bool mlx5e_restore_skb(struct sk_buff *skb, u32 chain, u32 reg_c1,
 			      struct mlx5e_tc_update_priv *tc_priv)
 {
 	struct mlx5e_priv *priv = netdev_priv(skb->dev);
-	u32 tunnel_id = reg_c1 >> ESW_TUN_OFFSET;
+	u32 tunnel_id = (reg_c1 >> ESW_TUN_OFFSET) & TUNNEL_ID_MASK;
 
 	if (chain) {
 		struct mlx5_rep_uplink_priv *uplink_priv;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index 3534d14d7d5c..721093b55acc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -129,7 +129,7 @@ struct tunnel_match_enc_opts {
  */
 #define TUNNEL_INFO_BITS 12
 #define TUNNEL_INFO_BITS_MASK GENMASK(TUNNEL_INFO_BITS - 1, 0)
-#define ENC_OPTS_BITS 12
+#define ENC_OPTS_BITS 11
 #define ENC_OPTS_BITS_MASK GENMASK(ENC_OPTS_BITS - 1, 0)
 #define TUNNEL_ID_BITS (TUNNEL_INFO_BITS + ENC_OPTS_BITS)
 #define TUNNEL_ID_MASK GENMASK(TUNNEL_ID_BITS - 1, 0)
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 17109b65c1ac..bc7db2e059eb 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -98,10 +98,11 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
 					    u16 vport_num);
 
 /* Reg C1 usage:
- * Reg C1 = < ESW_TUN_ID(12) | ESW_TUN_OPTS(12) | ESW_ZONE_ID(8) >
+ * Reg C1 = < Reserved(1) | ESW_TUN_ID(12) | ESW_TUN_OPTS(11) | ESW_ZONE_ID(8) >
  *
- * Highest 12 bits of reg c1 is the encapsulation tunnel id, next 12 bits is
- * encapsulation tunnel options, and the lowest 8 bits are used for zone id.
+ * Highest bit is reserved for other offloads as marker bit, next 12 bits of reg c1
+ * is the encapsulation tunnel id, next 11 bits is encapsulation tunnel options,
+ * and the lowest 8 bits are used for zone id.
  *
  * Zone id is used to restore CT flow when packet misses on chain.
  *
@@ -109,16 +110,18 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
  * on miss and to support inner header rewrite by means of implicit chain 0
  * flows.
  */
+#define ESW_RESERVED_BITS 1
 #define ESW_ZONE_ID_BITS 8
-#define ESW_TUN_OPTS_BITS 12
+#define ESW_TUN_OPTS_BITS 11
 #define ESW_TUN_ID_BITS 12
 #define ESW_TUN_OPTS_OFFSET ESW_ZONE_ID_BITS
 #define ESW_TUN_OFFSET ESW_TUN_OPTS_OFFSET
 #define ESW_ZONE_ID_MASK GENMASK(ESW_ZONE_ID_BITS - 1, 0)
-#define ESW_TUN_OPTS_MASK GENMASK(32 - ESW_TUN_ID_BITS - 1, ESW_TUN_OPTS_OFFSET)
-#define ESW_TUN_MASK GENMASK(31, ESW_TUN_OFFSET)
+#define ESW_TUN_OPTS_MASK GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, ESW_TUN_OPTS_OFFSET)
+#define ESW_TUN_MASK GENMASK(31 - ESW_RESERVED_BITS, ESW_TUN_OFFSET)
 #define ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT 0 /* 0 is not a valid tunnel id */
-#define ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT 0xFFF /* 0xFFF is a reserved mapping */
+/* 0x7FF is a reserved mapping */
+#define ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT GENMASK(ESW_TUN_OPTS_BITS - 1, 0)
 #define ESW_TUN_SLOW_TABLE_GOTO_VPORT ((ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT << ESW_TUN_OPTS_BITS) | \
 				       ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT)
 #define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK
-- 
cgit v1.2.3


From 4a98544d182761873381d46bb1a498703ca85bf0 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Mon, 8 Mar 2021 14:16:02 +0200
Subject: net/mlx5: Move chains ft pool to be used by all firmware steering

Firmware FT pool is per device, but the software tracking of this pool
only services fs_chains users, and if another layer takes a flow table,
the pool will not be updated, and fs_chains will fail creating a flow
table, with no recovery till the flow table is returned.

Move FT pool to be global per device, and stored at the cmd level,
so all layers can use it.

Signed-off-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   | 25 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 27 +++++--
 .../net/ethernet/mellanox/mlx5/core/fs_ft_pool.c   | 83 ++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/fs_ft_pool.h   | 21 +++++
 .../ethernet/mellanox/mlx5/core/lib/fs_chains.c    | 89 ++--------------------
 include/linux/mlx5/driver.h                        |  2 +
 7 files changed, 151 insertions(+), 98 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index a1223e904190..8dbdf1aef00f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_MLX5_CORE) += mlx5_core.o
 mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o alloc.o port.o mr.o pd.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
-		fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
+		fs_counters.o fs_ft_pool.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
 		lib/devcom.o lib/pci_vsc.o lib/dm.o diag/fs_tracepoint.o \
 		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \
 		fw_reset.o qos.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 94712a10ef9a..b7aae8b75760 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -36,6 +36,7 @@
 
 #include "fs_core.h"
 #include "fs_cmd.h"
+#include "fs_ft_pool.h"
 #include "mlx5_core.h"
 #include "eswitch.h"
 
@@ -192,18 +193,20 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {};
 	struct mlx5_core_dev *dev = ns->dev;
-	unsigned int log_size = 0;
 	int err;
 
-	log_size = size ? ilog2(roundup_pow_of_two(size)) : 0;
-	ft->max_fte = 1 << log_size;
+	if (size != POOL_NEXT_SIZE)
+		size = roundup_pow_of_two(size);
+	size = mlx5_ft_pool_get_avail_sz(dev, ft->type, size);
+	if (!size)
+		return -ENOSPC;
 
 	MLX5_SET(create_flow_table_in, in, opcode,
 		 MLX5_CMD_OP_CREATE_FLOW_TABLE);
 
 	MLX5_SET(create_flow_table_in, in, table_type, ft->type);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.level, ft->level);
-	MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, log_size);
+	MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, size ? ilog2(size) : 0);
 	MLX5_SET(create_flow_table_in, in, vport_number, ft->vport);
 	MLX5_SET(create_flow_table_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
@@ -240,9 +243,14 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 	}
 
 	err = mlx5_cmd_exec_inout(dev, create_flow_table, in, out);
-	if (!err)
+	if (!err) {
 		ft->id = MLX5_GET(create_flow_table_out, out,
 				  table_id);
+		ft->max_fte = size;
+	} else {
+		mlx5_ft_pool_put_sz(ns->dev, size);
+	}
+
 	return err;
 }
 
@@ -251,6 +259,7 @@ static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns,
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)] = {};
 	struct mlx5_core_dev *dev = ns->dev;
+	int err;
 
 	MLX5_SET(destroy_flow_table_in, in, opcode,
 		 MLX5_CMD_OP_DESTROY_FLOW_TABLE);
@@ -260,7 +269,11 @@ static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns,
 	MLX5_SET(destroy_flow_table_in, in, other_vport,
 		 !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT));
 
-	return mlx5_cmd_exec_in(dev, destroy_flow_table, in);
+	err = mlx5_cmd_exec_in(dev, destroy_flow_table, in);
+	if (!err)
+		mlx5_ft_pool_put_sz(ns->dev, ft->max_fte);
+
+	return err;
 }
 
 static int mlx5_cmd_modify_flow_table(struct mlx5_flow_root_namespace *ns,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 378990c933e5..6e20cbb4656a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -38,6 +38,7 @@
 #include "mlx5_core.h"
 #include "fs_core.h"
 #include "fs_cmd.h"
+#include "fs_ft_pool.h"
 #include "diag/fs_tracepoint.h"
 #include "accel/ipsec.h"
 #include "fpga/ipsec.h"
@@ -1166,6 +1167,8 @@ mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns,
 
 	ft_attr.level = level;
 	ft_attr.prio  = prio;
+	ft_attr.max_fte = 1;
+
 	return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0);
 }
 EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table);
@@ -1175,19 +1178,20 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 				    struct mlx5_flow_table_attr *ft_attr)
 {
 	int num_reserved_entries = ft_attr->autogroup.num_reserved_entries;
-	int autogroups_max_fte = ft_attr->max_fte - num_reserved_entries;
 	int max_num_groups = ft_attr->autogroup.max_num_groups;
 	struct mlx5_flow_table *ft;
-
-	if (max_num_groups > autogroups_max_fte)
-		return ERR_PTR(-EINVAL);
-	if (num_reserved_entries > ft_attr->max_fte)
-		return ERR_PTR(-EINVAL);
+	int autogroups_max_fte;
 
 	ft = mlx5_create_flow_table(ns, ft_attr);
 	if (IS_ERR(ft))
 		return ft;
 
+	autogroups_max_fte = ft->max_fte - num_reserved_entries;
+	if (max_num_groups > autogroups_max_fte)
+		goto err_validate;
+	if (num_reserved_entries > ft->max_fte)
+		goto err_validate;
+
 	ft->autogroup.active = true;
 	ft->autogroup.required_groups = max_num_groups;
 	ft->autogroup.max_fte = autogroups_max_fte;
@@ -1195,6 +1199,10 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 	ft->autogroup.group_size = autogroups_max_fte / (max_num_groups + 1);
 
 	return ft;
+
+err_validate:
+	mlx5_destroy_flow_table(ft);
+	return ERR_PTR(-ENOSPC);
 }
 EXPORT_SYMBOL(mlx5_create_auto_grouped_flow_table);
 
@@ -2588,6 +2596,7 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 	mlx5_cleanup_fc_stats(dev);
 	kmem_cache_destroy(steering->ftes_cache);
 	kmem_cache_destroy(steering->fgs_cache);
+	mlx5_ft_pool_destroy(dev);
 	kfree(steering);
 }
 
@@ -2938,9 +2947,13 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 	if (err)
 		return err;
 
+	err = mlx5_ft_pool_init(dev);
+	if (err)
+		return err;
+
 	steering = kzalloc(sizeof(*steering), GFP_KERNEL);
 	if (!steering)
-		return -ENOMEM;
+		goto err;
 	steering->dev = dev;
 	dev->priv.steering = steering;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.c
new file mode 100644
index 000000000000..526fbb669142
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2021 Mellanox Technologies. */
+
+#include "fs_ft_pool.h"
+
+/* Firmware currently has 4 pool of 4 sizes that it supports (FT_POOLS),
+ * and a virtual memory region of 16M (MLX5_FT_SIZE), this region is duplicated
+ * for each flow table pool. We can allocate up to 16M of each pool,
+ * and we keep track of how much we used via mlx5_ft_pool_get_avail_sz.
+ * Firmware doesn't report any of this for now.
+ * ESW_POOL is expected to be sorted from large to small and match firmware
+ * pools.
+ */
+#define FT_SIZE (16 * 1024 * 1024)
+static const unsigned int FT_POOLS[] = { 4 * 1024 * 1024,
+					 1 * 1024 * 1024,
+					 64 * 1024,
+					 128,
+					 1 /* size for termination tables */ };
+struct mlx5_ft_pool {
+	int ft_left[ARRAY_SIZE(FT_POOLS)];
+};
+
+int mlx5_ft_pool_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_ft_pool *ft_pool;
+	int i;
+
+	ft_pool = kzalloc(sizeof(*ft_pool), GFP_KERNEL);
+
+	for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--)
+		ft_pool->ft_left[i] = FT_SIZE / FT_POOLS[i];
+
+	dev->priv.ft_pool = ft_pool;
+	return 0;
+}
+
+void mlx5_ft_pool_destroy(struct mlx5_core_dev *dev)
+{
+	kfree(dev->priv.ft_pool);
+}
+
+int
+mlx5_ft_pool_get_avail_sz(struct mlx5_core_dev *dev, enum fs_flow_table_type table_type,
+			  int desired_size)
+{
+	u32 max_ft_size = 1 << MLX5_CAP_FLOWTABLE_TYPE(dev, log_max_ft_size, table_type);
+	int i, found_i = -1;
+
+	for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--) {
+		if (dev->priv.ft_pool->ft_left[i] && FT_POOLS[i] >= desired_size &&
+		    FT_POOLS[i] <= max_ft_size) {
+			found_i = i;
+			if (desired_size != POOL_NEXT_SIZE)
+				break;
+		}
+	}
+
+	if (found_i != -1) {
+		--dev->priv.ft_pool->ft_left[found_i];
+		return FT_POOLS[found_i];
+	}
+
+	return 0;
+}
+
+void
+mlx5_ft_pool_put_sz(struct mlx5_core_dev *dev, int sz)
+{
+	int i;
+
+	if (!sz)
+		return;
+
+	for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--) {
+		if (sz == FT_POOLS[i]) {
+			++dev->priv.ft_pool->ft_left[i];
+			return;
+		}
+	}
+
+	WARN_ONCE(1, "Couldn't find size %d in flow table size pool", sz);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.h
new file mode 100644
index 000000000000..25f4274b372b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021 Mellanox Technologies. */
+
+#ifndef __MLX5_FS_FT_POOL_H__
+#define __MLX5_FS_FT_POOL_H__
+
+#include <linux/mlx5/driver.h>
+#include "fs_core.h"
+
+#define POOL_NEXT_SIZE 0
+
+int mlx5_ft_pool_init(struct mlx5_core_dev *dev);
+void mlx5_ft_pool_destroy(struct mlx5_core_dev *dev);
+
+int
+mlx5_ft_pool_get_avail_sz(struct mlx5_core_dev *dev, enum fs_flow_table_type table_type,
+			  int desired_size);
+void
+mlx5_ft_pool_put_sz(struct mlx5_core_dev *dev, int sz);
+
+#endif /* __MLX5_FS_FT_POOL_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c
index 4c60c540bf9d..d0cfe7adb8a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c
@@ -6,6 +6,7 @@
 #include <linux/mlx5/fs.h>
 
 #include "lib/fs_chains.h"
+#include "fs_ft_pool.h"
 #include "en/mapping.h"
 #include "fs_core.h"
 #include "en_tc.h"
@@ -13,25 +14,10 @@
 #define chains_lock(chains) ((chains)->lock)
 #define chains_ht(chains) ((chains)->chains_ht)
 #define prios_ht(chains) ((chains)->prios_ht)
-#define ft_pool_left(chains) ((chains)->ft_left)
 #define tc_default_ft(chains) ((chains)->tc_default_ft)
 #define tc_end_ft(chains) ((chains)->tc_end_ft)
 #define ns_to_chains_fs_prio(ns) ((ns) == MLX5_FLOW_NAMESPACE_FDB ? \
 				  FDB_TC_OFFLOAD : MLX5E_TC_PRIO)
-
-/* Firmware currently has 4 pool of 4 sizes that it supports (FT_POOLS),
- * and a virtual memory region of 16M (MLX5_FT_SIZE), this region is duplicated
- * for each flow table pool. We can allocate up to 16M of each pool,
- * and we keep track of how much we used via get_next_avail_sz_from_pool.
- * Firmware doesn't report any of this for now.
- * ESW_POOL is expected to be sorted from large to small and match firmware
- * pools.
- */
-#define FT_SIZE (16 * 1024 * 1024)
-static const unsigned int FT_POOLS[] = { 4 * 1024 * 1024,
-					  1 * 1024 * 1024,
-					  64 * 1024,
-					  128 };
 #define FT_TBL_SZ (64 * 1024)
 
 struct mlx5_fs_chains {
@@ -49,8 +35,6 @@ struct mlx5_fs_chains {
 	enum mlx5_flow_namespace_type ns;
 	u32 group_num;
 	u32 flags;
-
-	int ft_left[ARRAY_SIZE(FT_POOLS)];
 };
 
 struct fs_chain {
@@ -160,54 +144,6 @@ mlx5_chains_set_end_ft(struct mlx5_fs_chains *chains,
 	tc_end_ft(chains) = ft;
 }
 
-#define POOL_NEXT_SIZE 0
-static int
-mlx5_chains_get_avail_sz_from_pool(struct mlx5_fs_chains *chains,
-				   int desired_size)
-{
-	int i, found_i = -1;
-
-	for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--) {
-		if (ft_pool_left(chains)[i] && FT_POOLS[i] > desired_size) {
-			found_i = i;
-			if (desired_size != POOL_NEXT_SIZE)
-				break;
-		}
-	}
-
-	if (found_i != -1) {
-		--ft_pool_left(chains)[found_i];
-		return FT_POOLS[found_i];
-	}
-
-	return 0;
-}
-
-static void
-mlx5_chains_put_sz_to_pool(struct mlx5_fs_chains *chains, int sz)
-{
-	int i;
-
-	for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--) {
-		if (sz == FT_POOLS[i]) {
-			++ft_pool_left(chains)[i];
-			return;
-		}
-	}
-
-	WARN_ONCE(1, "Couldn't find size %d in flow table size pool", sz);
-}
-
-static void
-mlx5_chains_init_sz_pool(struct mlx5_fs_chains *chains, u32 ft_max)
-{
-	int i;
-
-	for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--)
-		ft_pool_left(chains)[i] =
-			FT_POOLS[i] <= ft_max ? FT_SIZE / FT_POOLS[i] : 0;
-}
-
 static struct mlx5_flow_table *
 mlx5_chains_create_table(struct mlx5_fs_chains *chains,
 			 u32 chain, u32 prio, u32 level)
@@ -221,11 +157,7 @@ mlx5_chains_create_table(struct mlx5_fs_chains *chains,
 		ft_attr.flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
 				  MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 
-	sz = (chain == mlx5_chains_get_nf_ft_chain(chains)) ?
-	     mlx5_chains_get_avail_sz_from_pool(chains, FT_TBL_SZ) :
-	     mlx5_chains_get_avail_sz_from_pool(chains, POOL_NEXT_SIZE);
-	if (!sz)
-		return ERR_PTR(-ENOSPC);
+	sz = (chain == mlx5_chains_get_nf_ft_chain(chains)) ? FT_TBL_SZ : POOL_NEXT_SIZE;
 	ft_attr.max_fte = sz;
 
 	/* We use tc_default_ft(chains) as the table's next_ft till
@@ -266,21 +198,12 @@ mlx5_chains_create_table(struct mlx5_fs_chains *chains,
 	if (IS_ERR(ft)) {
 		mlx5_core_warn(chains->dev, "Failed to create chains table err %d (chain: %d, prio: %d, level: %d, size: %d)\n",
 			       (int)PTR_ERR(ft), chain, prio, level, sz);
-		mlx5_chains_put_sz_to_pool(chains, sz);
 		return ft;
 	}
 
 	return ft;
 }
 
-static void
-mlx5_chains_destroy_table(struct mlx5_fs_chains *chains,
-			  struct mlx5_flow_table *ft)
-{
-	mlx5_chains_put_sz_to_pool(chains, ft->max_fte);
-	mlx5_destroy_flow_table(ft);
-}
-
 static int
 create_chain_restore(struct fs_chain *chain)
 {
@@ -637,7 +560,7 @@ err_insert:
 err_miss_rule:
 	mlx5_destroy_flow_group(miss_group);
 err_group:
-	mlx5_chains_destroy_table(chains, ft);
+	mlx5_destroy_flow_table(ft);
 err_create:
 err_alloc:
 	kvfree(prio_s);
@@ -660,7 +583,7 @@ mlx5_chains_destroy_prio(struct mlx5_fs_chains *chains,
 			       prio_params);
 	mlx5_del_flow_rules(prio->miss_rule);
 	mlx5_destroy_flow_group(prio->miss_group);
-	mlx5_chains_destroy_table(chains, prio->ft);
+	mlx5_destroy_flow_table(prio->ft);
 	mlx5_chains_put_chain(chain);
 	kvfree(prio);
 }
@@ -785,7 +708,7 @@ void
 mlx5_chains_destroy_global_table(struct mlx5_fs_chains *chains,
 				 struct mlx5_flow_table *ft)
 {
-	mlx5_chains_destroy_table(chains, ft);
+	mlx5_destroy_flow_table(ft);
 }
 
 static struct mlx5_fs_chains *
@@ -817,8 +740,6 @@ mlx5_chains_init(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr)
 		       mlx5_chains_get_chain_range(chains_priv),
 		       mlx5_chains_get_prio_range(chains_priv));
 
-	mlx5_chains_init_sz_pool(chains_priv, attr->max_ft_sz);
-
 	err = rhashtable_init(&chains_ht(chains_priv), &chain_params);
 	if (err)
 		goto init_chains_ht_err;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f8e8d7e90616..6a7749c21b82 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -550,6 +550,7 @@ struct mlx5_adev {
 	int idx;
 };
 
+struct mlx5_ft_pool;
 struct mlx5_priv {
 	/* IRQ table valid only for real pci devices PF or VF */
 	struct mlx5_irq_table   *irq_table;
@@ -602,6 +603,7 @@ struct mlx5_priv {
 	struct mlx5_core_roce	roce;
 	struct mlx5_fc_stats		fc_stats;
 	struct mlx5_rl_table            rl_table;
+	struct mlx5_ft_pool		*ft_pool;
 
 	struct mlx5_bfreg_data		bfregs;
 	struct mlx5_uars_page	       *uar;
-- 
cgit v1.2.3


From f2867434002387c9739494041ac81c17a3808150 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 19 May 2021 13:03:04 -0500
Subject: remoteproc: Fix various kernel-doc warnings

Fix all the kernel-doc warnings in various remoteproc core files.
Some of them just needed a formatting cleanup change, while others
needed the Return statement to be added, or documenting the missed
structure elements.

Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Suman Anna <s-anna@ti.com>
Link: https://lore.kernel.org/r/20210519180304.23563-3-s-anna@ti.com
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c       | 44 +++++++++++++++++---------
 drivers/remoteproc/remoteproc_elf_loader.c | 12 ++++---
 drivers/remoteproc/remoteproc_virtio.c     |  6 ++--
 include/linux/remoteproc.h                 | 50 ++++++++++++++++--------------
 4 files changed, 69 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 8c279039b6a3..6348aaa42bbb 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -183,12 +183,12 @@ EXPORT_SYMBOL(rproc_va_to_pa);
  * translations on the internal remoteproc memory regions through a platform
  * implementation specific da_to_va ops, if present.
  *
- * The function returns a valid kernel address on success or NULL on failure.
- *
  * Note: phys_to_virt(iommu_iova_to_phys(rproc->domain, da)) will work too,
  * but only on kernel direct mapped RAM memory. Instead, we're just using
  * here the output of the DMA API for the carveouts, which should be more
  * correct.
+ *
+ * Return: a valid kernel address on success or NULL on failure
  */
 void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
@@ -509,7 +509,7 @@ static int copy_dma_range_map(struct device *to, struct device *from)
  * use RSC_DEVMEM resource entries to map their required @da to the physical
  * address of their base CMA region (ouch, hacky!).
  *
- * Returns 0 on success, or an appropriate error code otherwise
+ * Return: 0 on success, or an appropriate error code otherwise
  */
 static int rproc_handle_vdev(struct rproc *rproc, void *ptr,
 			     int offset, int avail)
@@ -644,7 +644,7 @@ void rproc_vdev_release(struct kref *ref)
  * support dynamically allocating this address using the generic
  * DMA API (but currently there isn't a use case for that).
  *
- * Returns 0 on success, or an appropriate error code otherwise
+ * Return: 0 on success, or an appropriate error code otherwise
  */
 static int rproc_handle_trace(struct rproc *rproc, void *ptr,
 			      int offset, int avail)
@@ -721,6 +721,8 @@ static int rproc_handle_trace(struct rproc *rproc, void *ptr,
  * tell us ranges of physical addresses the firmware is allowed to request,
  * and not allow firmwares to request access to physical addresses that
  * are outside those ranges.
+ *
+ * Return: 0 on success, or an appropriate error code otherwise
  */
 static int rproc_handle_devmem(struct rproc *rproc, void *ptr,
 			       int offset, int avail)
@@ -783,6 +785,8 @@ out:
  *
  * This function allocate specified memory entry @mem using
  * dma_alloc_coherent() as default allocator
+ *
+ * Return: 0 on success, or an appropriate error code otherwise
  */
 static int rproc_alloc_carveout(struct rproc *rproc,
 				struct rproc_mem_entry *mem)
@@ -889,6 +893,8 @@ dma_free:
  *
  * This function releases specified memory entry @mem allocated via
  * rproc_alloc_carveout() function by @rproc.
+ *
+ * Return: 0 on success, or an appropriate error code otherwise
  */
 static int rproc_release_carveout(struct rproc *rproc,
 				  struct rproc_mem_entry *mem)
@@ -918,6 +924,8 @@ static int rproc_release_carveout(struct rproc *rproc,
  * (e.g. CMA) more efficiently, and also minimizes the number of TLB entries
  * needed to map it (in case @rproc is using an IOMMU). Reducing the TLB
  * pressure is important; it may have a substantial impact on performance.
+ *
+ * Return: 0 on success, or an appropriate error code otherwise
  */
 static int rproc_handle_carveout(struct rproc *rproc,
 				 void *ptr, int offset, int avail)
@@ -1006,6 +1014,8 @@ EXPORT_SYMBOL(rproc_add_carveout);
  *
  * This function allocates a rproc_mem_entry struct and fill it with parameters
  * provided by client.
+ *
+ * Return: a valid pointer on success, or NULL on failure
  */
 __printf(8, 9)
 struct rproc_mem_entry *
@@ -1050,6 +1060,8 @@ EXPORT_SYMBOL(rproc_mem_entry_init);
  *
  * This function allocates a rproc_mem_entry struct and fill it with parameters
  * provided by client.
+ *
+ * Return: a valid pointer on success, or NULL on failure
  */
 __printf(5, 6)
 struct rproc_mem_entry *
@@ -1881,6 +1893,8 @@ static int __rproc_detach(struct rproc *rproc)
  * remoteproc functional again.
  *
  * This function can sleep, so it cannot be called from atomic context.
+ *
+ * Return: 0 on success or a negative value upon failure
  */
 int rproc_trigger_recovery(struct rproc *rproc)
 {
@@ -1965,7 +1979,7 @@ static void rproc_crash_handler_work(struct work_struct *work)
  * If the remote processor is already powered on, this function immediately
  * returns (successfully).
  *
- * Returns 0 on success, and an appropriate error value otherwise.
+ * Return: 0 on success, and an appropriate error value otherwise
  */
 int rproc_boot(struct rproc *rproc)
 {
@@ -2100,6 +2114,8 @@ EXPORT_SYMBOL(rproc_shutdown);
  * no longer available.  From there it should be possible to remove the
  * platform driver and even power cycle the application processor (if the HW
  * supports it) without needing to switch off the remote processor.
+ *
+ * Return: 0 on success, and an appropriate error value otherwise
  */
 int rproc_detach(struct rproc *rproc)
 {
@@ -2152,7 +2168,7 @@ EXPORT_SYMBOL(rproc_detach);
  * This function increments the remote processor's refcount, so always
  * use rproc_put() to decrement it back once rproc isn't needed anymore.
  *
- * Returns the rproc handle on success, and NULL on failure.
+ * Return: rproc handle on success, and NULL on failure
  */
 #ifdef CONFIG_OF
 struct rproc *rproc_get_by_phandle(phandle phandle)
@@ -2302,8 +2318,6 @@ static int rproc_validate(struct rproc *rproc)
  * This is called by the platform-specific rproc implementation, whenever
  * a new remote processor device is probed.
  *
- * Returns 0 on success and an appropriate error code otherwise.
- *
  * Note: this function initiates an asynchronous firmware loading
  * context, which will look for virtio devices supported by the rproc's
  * firmware.
@@ -2311,6 +2325,8 @@ static int rproc_validate(struct rproc *rproc)
  * If found, those virtio devices will be created and added, so as a result
  * of registering this remote processor, additional virtio drivers might be
  * probed.
+ *
+ * Return: 0 on success and an appropriate error code otherwise
  */
 int rproc_add(struct rproc *rproc)
 {
@@ -2364,7 +2380,7 @@ static void devm_rproc_remove(void *rproc)
  * This function performs like rproc_add() but the registered rproc device will
  * automatically be removed on driver detach.
  *
- * Returns: 0 on success, negative errno on failure
+ * Return: 0 on success, negative errno on failure
  */
 int devm_rproc_add(struct device *dev, struct rproc *rproc)
 {
@@ -2472,10 +2488,10 @@ static int rproc_alloc_ops(struct rproc *rproc, const struct rproc_ops *ops)
  * implementations should then call rproc_add() to complete
  * the registration of the remote processor.
  *
- * On success the new rproc is returned, and on failure, NULL.
- *
  * Note: _never_ directly deallocate @rproc, even if it was not registered
  * yet. Instead, when you need to unroll rproc_alloc(), use rproc_free().
+ *
+ * Return: new rproc pointer on success, and NULL on failure
  */
 struct rproc *rproc_alloc(struct device *dev, const char *name,
 			  const struct rproc_ops *ops,
@@ -2588,7 +2604,7 @@ EXPORT_SYMBOL(rproc_put);
  * of the outstanding reference created by rproc_alloc. To decrement that
  * one last refcount, one still needs to call rproc_free().
  *
- * Returns 0 on success and -EINVAL if @rproc isn't valid.
+ * Return: 0 on success and -EINVAL if @rproc isn't valid
  */
 int rproc_del(struct rproc *rproc)
 {
@@ -2635,7 +2651,7 @@ static void devm_rproc_free(struct device *dev, void *res)
  * This function performs like rproc_alloc() but the acquired rproc device will
  * automatically be released on driver detach.
  *
- * Returns: new rproc instance, or NULL on failure
+ * Return: new rproc instance, or NULL on failure
  */
 struct rproc *devm_rproc_alloc(struct device *dev, const char *name,
 			       const struct rproc_ops *ops,
@@ -2687,7 +2703,7 @@ EXPORT_SYMBOL(rproc_remove_subdev);
  * rproc_get_by_child() - acquire rproc handle of @dev's ancestor
  * @dev:	child device to find ancestor of
  *
- * Returns the ancestor rproc instance, or NULL if not found.
+ * Return: the ancestor rproc instance, or NULL if not found
  */
 struct rproc *rproc_get_by_child(struct device *dev)
 {
diff --git a/drivers/remoteproc/remoteproc_elf_loader.c b/drivers/remoteproc/remoteproc_elf_loader.c
index 11423588965a..469c52e62faf 100644
--- a/drivers/remoteproc/remoteproc_elf_loader.c
+++ b/drivers/remoteproc/remoteproc_elf_loader.c
@@ -31,6 +31,8 @@
  * @fw: the ELF firmware image
  *
  * Make sure this fw image is sane (ie a correct ELF32/ELF64 file).
+ *
+ * Return: 0 on success and -EINVAL upon any failure
  */
 int rproc_elf_sanity_check(struct rproc *rproc, const struct firmware *fw)
 {
@@ -117,11 +119,11 @@ EXPORT_SYMBOL(rproc_elf_sanity_check);
  * @rproc: the remote processor handle
  * @fw: the ELF firmware image
  *
- * This function returns the entry point address of the ELF
- * image.
- *
  * Note that the boot address is not a configurable property of all remote
  * processors. Some will always boot at a specific hard-coded address.
+ *
+ * Return: entry point address of the ELF image
+ *
  */
 u64 rproc_elf_get_boot_addr(struct rproc *rproc, const struct firmware *fw)
 {
@@ -152,6 +154,8 @@ EXPORT_SYMBOL(rproc_elf_get_boot_addr);
  * might be different: they might not have iommus, and would prefer to
  * directly allocate memory for every segment/resource. This is not yet
  * supported, though.
+ *
+ * Return: 0 on success and an appropriate error code otherwise
  */
 int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
 {
@@ -362,7 +366,7 @@ EXPORT_SYMBOL(rproc_elf_load_rsc_table);
  * This function finds the location of the loaded resource table. Don't
  * call this function if the table wasn't loaded yet - it's a bug if you do.
  *
- * Returns the pointer to the resource table if it is found or NULL otherwise.
+ * Return: pointer to the resource table if it is found or NULL otherwise.
  * If the table wasn't loaded yet the result is unspecified.
  */
 struct resource_table *rproc_elf_find_loaded_rsc_table(struct rproc *rproc,
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 0cc617f76068..cf4d54e98e6a 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -45,7 +45,7 @@ static bool rproc_virtio_notify(struct virtqueue *vq)
  * when the remote processor signals that a specific virtqueue has pending
  * messages available.
  *
- * Returns IRQ_NONE if no message was found in the @notifyid virtqueue,
+ * Return: IRQ_NONE if no message was found in the @notifyid virtqueue,
  * and otherwise returns IRQ_HANDLED.
  */
 irqreturn_t rproc_vq_interrupt(struct rproc *rproc, int notifyid)
@@ -325,7 +325,7 @@ static void rproc_virtio_dev_release(struct device *dev)
  * This function registers a virtio device. This vdev's partent is
  * the rproc device.
  *
- * Returns 0 on success or an appropriate error value otherwise.
+ * Return: 0 on success or an appropriate error value otherwise
  */
 int rproc_add_virtio_dev(struct rproc_vdev *rvdev, int id)
 {
@@ -432,6 +432,8 @@ out:
  * @data: must be null
  *
  * This function unregisters an existing virtio device.
+ *
+ * Return: 0
  */
 int rproc_remove_virtio_dev(struct device *dev, void *data)
 {
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 8b795b544f75..a5b37bc10865 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -243,7 +243,7 @@ struct fw_rsc_trace {
  * @da: device address
  * @align: the alignment between the consumer and producer parts of the vring
  * @num: num of buffers supported by this vring (must be power of two)
- * @notifyid is a unique rproc-wide notify index for this vring. This notify
+ * @notifyid: a unique rproc-wide notify index for this vring. This notify
  * index is used when kicking a remote processor, to let it know that this
  * vring is triggered.
  * @pa: physical address
@@ -266,18 +266,18 @@ struct fw_rsc_vdev_vring {
 /**
  * struct fw_rsc_vdev - virtio device header
  * @id: virtio device id (as in virtio_ids.h)
- * @notifyid is a unique rproc-wide notify index for this vdev. This notify
+ * @notifyid: a unique rproc-wide notify index for this vdev. This notify
  * index is used when kicking a remote processor, to let it know that the
  * status/features of this vdev have changes.
- * @dfeatures specifies the virtio device features supported by the firmware
- * @gfeatures is a place holder used by the host to write back the
+ * @dfeatures: specifies the virtio device features supported by the firmware
+ * @gfeatures: a place holder used by the host to write back the
  * negotiated features that are supported by both sides.
- * @config_len is the size of the virtio config space of this vdev. The config
+ * @config_len: the size of the virtio config space of this vdev. The config
  * space lies in the resource table immediate after this vdev header.
- * @status is a place holder where the host will indicate its virtio progress.
- * @num_of_vrings indicates how many vrings are described in this vdev header
+ * @status: a place holder where the host will indicate its virtio progress.
+ * @num_of_vrings: indicates how many vrings are described in this vdev header
  * @reserved: reserved (must be zero)
- * @vring is an array of @num_of_vrings entries of 'struct fw_rsc_vdev_vring'.
+ * @vring: an array of @num_of_vrings entries of 'struct fw_rsc_vdev_vring'.
  *
  * This resource is a virtio device header: it provides information about
  * the vdev, and is then used by the host and its peer remote processors
@@ -287,16 +287,17 @@ struct fw_rsc_vdev_vring {
  * to statically allocate a vdev upon registration of the rproc (dynamic vdev
  * allocation is not yet supported).
  *
- * Note: unlike virtualization systems, the term 'host' here means
- * the Linux side which is running remoteproc to control the remote
- * processors. We use the name 'gfeatures' to comply with virtio's terms,
- * though there isn't really any virtualized guest OS here: it's the host
- * which is responsible for negotiating the final features.
- * Yeah, it's a bit confusing.
- *
- * Note: immediately following this structure is the virtio config space for
- * this vdev (which is specific to the vdev; for more info, read the virtio
- * spec). the size of the config space is specified by @config_len.
+ * Note:
+ * 1. unlike virtualization systems, the term 'host' here means
+ *    the Linux side which is running remoteproc to control the remote
+ *    processors. We use the name 'gfeatures' to comply with virtio's terms,
+ *    though there isn't really any virtualized guest OS here: it's the host
+ *    which is responsible for negotiating the final features.
+ *    Yeah, it's a bit confusing.
+ *
+ * 2. immediately following this structure is the virtio config space for
+ *    this vdev (which is specific to the vdev; for more info, read the virtio
+ *    spec). The size of the config space is specified by @config_len.
  */
 struct fw_rsc_vdev {
 	u32 id;
@@ -440,7 +441,7 @@ enum rproc_state {
  * enum rproc_crash_type - remote processor crash types
  * @RPROC_MMUFAULT:	iommu fault
  * @RPROC_WATCHDOG:	watchdog bite
- * @RPROC_FATAL_ERROR	fatal error
+ * @RPROC_FATAL_ERROR:	fatal error
  *
  * Each element of the enum is used as an array index. So that, the value of
  * the elements should be always something sane.
@@ -457,9 +458,9 @@ enum rproc_crash_type {
  * enum rproc_dump_mechanism - Coredump options for core
  * @RPROC_COREDUMP_DISABLED:	Don't perform any dump
  * @RPROC_COREDUMP_ENABLED:	Copy dump to separate buffer and carry on with
-				recovery
+ *				recovery
  * @RPROC_COREDUMP_INLINE:	Read segments directly from device memory. Stall
-				recovery until all segments are read
+ *				recovery until all segments are read
  */
 enum rproc_dump_mechanism {
 	RPROC_COREDUMP_DISABLED,
@@ -475,6 +476,7 @@ enum rproc_dump_mechanism {
  * @priv:	private data associated with the dump_segment
  * @dump:	custom dump function to fill device memory segment associated
  *		with coredump
+ * @offset:	offset of the segment
  */
 struct rproc_dump_segment {
 	struct list_head node;
@@ -524,7 +526,9 @@ struct rproc_dump_segment {
  * @auto_boot: flag to indicate if remote processor should be auto-started
  * @dump_segments: list of segments in the firmware
  * @nb_vdev: number of vdev currently handled by rproc
- * @char_dev: character device of the rproc
+ * @elf_class: firmware ELF class
+ * @elf_machine: firmware ELF machine
+ * @cdev: character device of the rproc
  * @cdev_put_on_release: flag to indicate if remoteproc should be shutdown on @char_dev release
  */
 struct rproc {
@@ -613,10 +617,10 @@ struct rproc_vring {
  * struct rproc_vdev - remoteproc state for a supported virtio device
  * @refcount: reference counter for the vdev and vring allocations
  * @subdev: handle for registering the vdev as a rproc subdevice
+ * @dev: device struct used for reference count semantics
  * @id: virtio device id (as in virtio_ids.h)
  * @node: list node
  * @rproc: the rproc handle
- * @vdev: the virio device
  * @vring: the vrings for this vdev
  * @rsc_offset: offset of the vdev's resource entry
  * @index: vdev position versus other vdev declared in resource table
-- 
cgit v1.2.3


From 5c350aa11b441b32baf3bfe4018168cb8d10cef7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Fri, 28 May 2021 11:24:15 +0200
Subject: fcntl: remove unused VALID_UPGRADE_FLAGS

We currently do not maky use of this feature and should we implement
something like this in the future it's trivial to add it back.

Link: https://lore.kernel.org/r/20210528092417.3942079-2-brauner@kernel.org
Cc: Christoph Hellwig <hch@lst.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Suggested-by: Richard Guy Briggs <rgb@redhat.com>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fcntl.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 766fcd973beb..a332e79b3207 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -12,10 +12,6 @@
 	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
 	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
 
-/* List of all valid flags for the how->upgrade_mask argument: */
-#define VALID_UPGRADE_FLAGS \
-	(UPGRADE_NOWRITE | UPGRADE_NOREAD)
-
 /* List of all valid flags for the how->resolve argument: */
 #define VALID_RESOLVE_FLAGS \
 	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
-- 
cgit v1.2.3


From 5a7b95fb993ec399c8a685552aa6a8fc995c40bd Mon Sep 17 00:00:00 2001
From: Bibby Hsieh <bibby.hsieh@mediatek.com>
Date: Thu, 27 May 2021 15:55:53 +0800
Subject: i2c: core: support bus regulator controlling in adapter

Although in the most platforms, the bus power of i2c
are alway on, some platforms disable the i2c bus power
in order to meet low power request.

We can control bulk regulator if it is provided in i2c
adapter device.

Signed-off-by: Bibby Hsieh <bibby.hsieh@mediatek.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-core-base.c | 95 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c.h         |  2 +
 2 files changed, 97 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index ae1b1b610aca..2a222e495c70 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -465,12 +465,14 @@ static int i2c_smbus_host_notify_to_irq(const struct i2c_client *client)
 static int i2c_device_probe(struct device *dev)
 {
 	struct i2c_client	*client = i2c_verify_client(dev);
+	struct i2c_adapter	*adap;
 	struct i2c_driver	*driver;
 	int status;
 
 	if (!client)
 		return 0;
 
+	adap = client->adapter;
 	client->irq = client->init_irq;
 
 	if (!client->irq) {
@@ -536,6 +538,14 @@ static int i2c_device_probe(struct device *dev)
 
 	dev_dbg(dev, "probe\n");
 
+	if (adap->bus_regulator) {
+		status = regulator_enable(adap->bus_regulator);
+		if (status < 0) {
+			dev_err(&adap->dev, "Failed to enable bus regulator\n");
+			goto err_clear_wakeup_irq;
+		}
+	}
+
 	status = of_clk_set_defaults(dev->of_node, false);
 	if (status < 0)
 		goto err_clear_wakeup_irq;
@@ -593,8 +603,10 @@ put_sync_adapter:
 static int i2c_device_remove(struct device *dev)
 {
 	struct i2c_client	*client = to_i2c_client(dev);
+	struct i2c_adapter      *adap;
 	struct i2c_driver	*driver;
 
+	adap = client->adapter;
 	driver = to_i2c_driver(dev->driver);
 	if (driver->remove) {
 		int status;
@@ -609,6 +621,8 @@ static int i2c_device_remove(struct device *dev)
 	devres_release_group(&client->dev, client->devres_group_id);
 
 	dev_pm_domain_detach(&client->dev, true);
+	if (!pm_runtime_status_suspended(&client->dev) && adap->bus_regulator)
+		regulator_disable(adap->bus_regulator);
 
 	dev_pm_clear_wake_irq(&client->dev);
 	device_init_wakeup(&client->dev, false);
@@ -621,6 +635,86 @@ static int i2c_device_remove(struct device *dev)
 	return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int i2c_resume_early(struct device *dev)
+{
+	struct i2c_client *client = i2c_verify_client(dev);
+	int err;
+
+	if (!client)
+		return 0;
+
+	if (pm_runtime_status_suspended(&client->dev) &&
+		client->adapter->bus_regulator) {
+		err = regulator_enable(client->adapter->bus_regulator);
+		if (err)
+			return err;
+	}
+
+	return pm_generic_resume_early(&client->dev);
+}
+
+static int i2c_suspend_late(struct device *dev)
+{
+	struct i2c_client *client = i2c_verify_client(dev);
+	int err;
+
+	if (!client)
+		return 0;
+
+	err = pm_generic_suspend_late(&client->dev);
+	if (err)
+		return err;
+
+	if (!pm_runtime_status_suspended(&client->dev) &&
+		client->adapter->bus_regulator)
+		return regulator_disable(client->adapter->bus_regulator);
+
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PM
+static int i2c_runtime_resume(struct device *dev)
+{
+	struct i2c_client *client = i2c_verify_client(dev);
+	int err;
+
+	if (!client)
+		return 0;
+
+	if (client->adapter->bus_regulator) {
+		err = regulator_enable(client->adapter->bus_regulator);
+		if (err)
+			return err;
+	}
+
+	return pm_generic_runtime_resume(&client->dev);
+}
+
+static int i2c_runtime_suspend(struct device *dev)
+{
+	struct i2c_client *client = i2c_verify_client(dev);
+	int err;
+
+	if (!client)
+		return 0;
+
+	err = pm_generic_runtime_suspend(&client->dev);
+	if (err)
+		return err;
+
+	if (client->adapter->bus_regulator)
+		return regulator_disable(client->adapter->bus_regulator);
+	return 0;
+}
+#endif
+
+static const struct dev_pm_ops i2c_device_pm = {
+	SET_LATE_SYSTEM_SLEEP_PM_OPS(i2c_suspend_late, i2c_resume_early)
+	SET_RUNTIME_PM_OPS(i2c_runtime_suspend, i2c_runtime_resume, NULL)
+};
+
 static void i2c_device_shutdown(struct device *dev)
 {
 	struct i2c_client *client = i2c_verify_client(dev);
@@ -678,6 +772,7 @@ struct bus_type i2c_bus_type = {
 	.probe		= i2c_device_probe,
 	.remove		= i2c_device_remove,
 	.shutdown	= i2c_device_shutdown,
+	.pm		= &i2c_device_pm,
 };
 EXPORT_SYMBOL_GPL(i2c_bus_type);
 
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e8f2ac8c9c3d..953a4eecb88f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -15,6 +15,7 @@
 #include <linux/device.h>	/* for struct device */
 #include <linux/sched.h>	/* for completion */
 #include <linux/mutex.h>
+#include <linux/regulator/consumer.h>
 #include <linux/rtmutex.h>
 #include <linux/irqdomain.h>		/* for Host Notify IRQ */
 #include <linux/of.h>		/* for struct device_node */
@@ -729,6 +730,7 @@ struct i2c_adapter {
 	const struct i2c_adapter_quirks *quirks;
 
 	struct irq_domain *host_notify_domain;
+	struct regulator *bus_regulator;
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
 
-- 
cgit v1.2.3


From 586d5a8bcede47fda7bebf4b36be917c5010db16 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 28 May 2021 12:30:03 +0200
Subject: netfilter: x_tables: reduce xt_action_param by 8 byte

The fragment offset in ipv4/ipv6 is a 16bit field, so use
u16 instead of unsigned int.

On 64bit: 40 bytes to 32 bytes. By extension this also reduces
nft_pktinfo (56 to 48 byte).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 2 +-
 net/ipv6/netfilter/ip6_tables.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 07c6ad8f2a02..28d7027cd460 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -36,8 +36,8 @@ struct xt_action_param {
 		const void *matchinfo, *targinfo;
 	};
 	const struct nf_hook_state *state;
-	int fragoff;
 	unsigned int thoff;
+	u16 fragoff;
 	bool hotdrop;
 };
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e810a23baf99..de2cf3943b91 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -51,7 +51,7 @@ ip6_packet_match(const struct sk_buff *skb,
 		 const char *outdev,
 		 const struct ip6t_ip6 *ip6info,
 		 unsigned int *protoff,
-		 int *fragoff, bool *hotdrop)
+		 u16 *fragoff, bool *hotdrop)
 {
 	unsigned long ret;
 	const struct ipv6hdr *ipv6 = ipv6_hdr(skb);
-- 
cgit v1.2.3


From 6802db48fc27b8d7f601e96a85771f2205702941 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 28 May 2021 12:30:04 +0200
Subject: netfilter: reduce size of nf_hook_state on 32bit platforms

Reduce size from 28 to 24 bytes on 32bit platforms.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index f0f3a8354c3c..f161569fbe2f 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -65,8 +65,8 @@ struct nf_hook_ops;
 struct sock;
 
 struct nf_hook_state {
-	unsigned int hook;
-	u_int8_t pf;
+	u8 hook;
+	u8 pf;
 	struct net_device *in;
 	struct net_device *out;
 	struct sock *sk;
-- 
cgit v1.2.3


From e860fa9b69e1bf077ba4725ee4be7b9443a3682a Mon Sep 17 00:00:00 2001
From: Dave Ertman <david.m.ertman@intel.com>
Date: Thu, 20 May 2021 09:37:48 -0500
Subject: iidc: Introduce iidc.h

Introduce a shared header file used by the 'ice' Intel networking driver
providing RDMA support and the 'irdma' driver to provide a private
interface.

Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 MAINTAINERS                    |   1 +
 include/linux/net/intel/iidc.h | 100 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 include/linux/net/intel/iidc.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index bd7aff0c120f..34d2bf36b5ad 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9130,6 +9130,7 @@ F:	Documentation/networking/device_drivers/ethernet/intel/
 F:	drivers/net/ethernet/intel/
 F:	drivers/net/ethernet/intel/*/
 F:	include/linux/avf/virtchnl.h
+F:	include/linux/net/intel/iidc.h
 
 INTEL FRAMEBUFFER DRIVER (excluding 810 and 815)
 M:	Maik Broemme <mbroemme@libmpq.org>
diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h
new file mode 100644
index 000000000000..e32f6712aee0
--- /dev/null
+++ b/include/linux/net/intel/iidc.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021, Intel Corporation. */
+
+#ifndef _IIDC_H_
+#define _IIDC_H_
+
+#include <linux/auxiliary_bus.h>
+#include <linux/dcbnl.h>
+#include <linux/device.h>
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+enum iidc_event_type {
+	IIDC_EVENT_BEFORE_MTU_CHANGE,
+	IIDC_EVENT_AFTER_MTU_CHANGE,
+	IIDC_EVENT_BEFORE_TC_CHANGE,
+	IIDC_EVENT_AFTER_TC_CHANGE,
+	IIDC_EVENT_CRIT_ERR,
+	IIDC_EVENT_NBITS		/* must be last */
+};
+
+enum iidc_reset_type {
+	IIDC_PFR,
+	IIDC_CORER,
+	IIDC_GLOBR,
+};
+
+#define IIDC_MAX_USER_PRIORITY		8
+
+/* Struct to hold per RDMA Qset info */
+struct iidc_rdma_qset_params {
+	/* Qset TEID returned to the RDMA driver in
+	 * ice_add_rdma_qset and used by RDMA driver
+	 * for calls to ice_del_rdma_qset
+	 */
+	u32 teid;	/* Qset TEID */
+	u16 qs_handle; /* RDMA driver provides this */
+	u16 vport_id; /* VSI index */
+	u8 tc; /* TC branch the Qset should belong to */
+};
+
+struct iidc_qos_info {
+	u64 tc_ctx;
+	u8 rel_bw;
+	u8 prio_type;
+	u8 egress_virt_up;
+	u8 ingress_virt_up;
+};
+
+/* Struct to pass QoS info */
+struct iidc_qos_params {
+	struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS];
+	u8 up2tc[IIDC_MAX_USER_PRIORITY];
+	u8 vport_relative_bw;
+	u8 vport_priority_type;
+	u8 num_tc;
+};
+
+struct iidc_event {
+	DECLARE_BITMAP(type, IIDC_EVENT_NBITS);
+	u32 reg;
+};
+
+struct ice_pf;
+
+int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
+int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset);
+int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type);
+int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable);
+void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos);
+
+#define IIDC_RDMA_ROCE_NAME	"roce"
+
+/* Structure representing auxiliary driver tailored information about the core
+ * PCI dev, each auxiliary driver using the IIDC interface will have an
+ * instance of this struct dedicated to it.
+ */
+
+struct iidc_auxiliary_dev {
+	struct auxiliary_device adev;
+	struct ice_pf *pf;
+};
+
+/* structure representing the auxiliary driver. This struct is to be
+ * allocated and populated by the auxiliary driver's owner. The core PCI
+ * driver will access these ops by performing a container_of on the
+ * auxiliary_device->dev.driver.
+ */
+struct iidc_auxiliary_drv {
+	struct auxiliary_driver adrv;
+	/* This event_handler is meant to be a blocking call.  For instance,
+	 * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not
+	 * return until the auxiliary driver is ready for the MTU change to
+	 * happen.
+	 */
+	void (*event_handler)(struct ice_pf *pf, struct iidc_event *event);
+};
+
+#endif /* _IIDC_H_*/
-- 
cgit v1.2.3


From 9ed7533121219cb25408888cf7fbb929cedc033c Mon Sep 17 00:00:00 2001
From: Shiraz Saleem <shiraz.saleem@intel.com>
Date: Fri, 21 May 2021 10:10:59 -0700
Subject: i40e: Prep i40e header for aux bus conversion

Add the definitions to the i40e client header file in
preparation to convert i40e to use the new auxiliary bus
infrastructure. This header is shared between the 'i40e'
Intel networking driver providing RDMA support and the
'irdma' driver.

Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/net/intel/i40e_client.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h
index fd7bc860a241..41f24b5241ab 100644
--- a/include/linux/net/intel/i40e_client.h
+++ b/include/linux/net/intel/i40e_client.h
@@ -4,6 +4,8 @@
 #ifndef _I40E_CLIENT_H_
 #define _I40E_CLIENT_H_
 
+#include <linux/auxiliary_bus.h>
+
 #define I40E_CLIENT_STR_LENGTH 10
 
 /* Client interface version should be updated anytime there is a change in the
@@ -78,6 +80,7 @@ struct i40e_info {
 	u8 lanmac[6];
 	struct net_device *netdev;
 	struct pci_dev *pcidev;
+	struct auxiliary_device *aux_dev;
 	u8 __iomem *hw_addr;
 	u8 fid;	/* function id, PF id or VF id */
 #define I40E_CLIENT_FTYPE_PF 0
@@ -100,6 +103,11 @@ struct i40e_info {
 	u32 fw_build;                   /* firmware build number */
 };
 
+struct i40e_auxiliary_device {
+	struct auxiliary_device aux_dev;
+	struct i40e_info *ldev;
+};
+
 #define I40E_CLIENT_RESET_LEVEL_PF   1
 #define I40E_CLIENT_RESET_LEVEL_CORE 2
 #define I40E_CLIENT_VSI_FLAG_TCP_ENABLE  BIT(1)
@@ -187,6 +195,8 @@ static inline bool i40e_client_is_registered(struct i40e_client *client)
 	return test_bit(__I40E_CLIENT_REGISTERED, &client->state);
 }
 
+void i40e_client_device_register(struct i40e_info *ldev, struct i40e_client *client);
+void i40e_client_device_unregister(struct i40e_info *ldev);
 /* used by clients */
 int i40e_register_client(struct i40e_client *client);
 int i40e_unregister_client(struct i40e_client *client);
-- 
cgit v1.2.3


From acfbb1911dc907f5f2f1096e88feeaff433bba81 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 18 May 2021 13:43:22 +0300
Subject: dmaengine: Move kdoc description of struct dma_chan_percpu closer to
 it

We have split by unknown reason of kdoc and struct dma_chan_percpu definition.
Join them back. No functional change.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210518104323.37632-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 004736b6a9c8..93c3ca5fdafd 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -229,12 +229,6 @@ enum sum_check_flags {
  */
 typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t;
 
-/**
- * struct dma_chan_percpu - the per-CPU part of struct dma_chan
- * @memcpy_count: transaction counter
- * @bytes_transferred: byte counter
- */
-
 /**
  * enum dma_desc_metadata_mode - per descriptor metadata mode types supported
  * @DESC_METADATA_CLIENT - the metadata buffer is allocated/provided by the
@@ -291,6 +285,11 @@ enum dma_desc_metadata_mode {
 	DESC_METADATA_ENGINE = BIT(1),
 };
 
+/**
+ * struct dma_chan_percpu - the per-CPU part of struct dma_chan
+ * @memcpy_count: transaction counter
+ * @bytes_transferred: byte counter
+ */
 struct dma_chan_percpu {
 	/* stats */
 	unsigned long memcpy_count;
-- 
cgit v1.2.3


From f268c3737ecaefcfeecfb4cb5e44958a8976f067 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 27 May 2021 13:34:41 +0200
Subject: tick/nohz: Only check for RCU deferred wakeup on user/guest entry
 when needed

Checking for and processing RCU-nocb deferred wakeup upon user/guest
entry is only relevant when nohz_full runs on the local CPU, otherwise
the periodic tick should take care of it.

Make sure we don't needlessly pollute these fast-paths as a -3%
performance regression on a will-it-scale.per_process_ops has been
reported so far.

Fixes: 47b8ff194c1f (entry: Explicitly flush pending rcuog wakeup before last rescheduling point)
Fixes: 4ae7dc97f726 (entry/kvm: Explicitly flush pending rcuog wakeup before last rescheduling point)
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20210527113441.465489-1-frederic@kernel.org
---
 include/linux/entry-kvm.h | 3 ++-
 include/linux/tick.h      | 7 +++++++
 kernel/entry/common.c     | 5 +++--
 kernel/time/tick-sched.c  | 1 +
 4 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 8b2b1d68b954..136b8d97d8c0 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -3,6 +3,7 @@
 #define __LINUX_ENTRYKVM_H
 
 #include <linux/entry-common.h>
+#include <linux/tick.h>
 
 /* Transfer to guest mode work */
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
@@ -57,7 +58,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
 static inline void xfer_to_guest_mode_prepare(void)
 {
 	lockdep_assert_irqs_disabled();
-	rcu_nocb_flush_deferred_wakeup();
+	tick_nohz_user_enter_prepare();
 }
 
 /**
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7340613c7eff..1a0ff88fa107 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -11,6 +11,7 @@
 #include <linux/context_tracking_state.h>
 #include <linux/cpumask.h>
 #include <linux/sched.h>
+#include <linux/rcupdate.h>
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 extern void __init tick_init(void);
@@ -300,4 +301,10 @@ static inline void tick_nohz_task_switch(void)
 		__tick_nohz_task_switch();
 }
 
+static inline void tick_nohz_user_enter_prepare(void)
+{
+	if (tick_nohz_full_cpu(smp_processor_id()))
+		rcu_nocb_flush_deferred_wakeup();
+}
+
 #endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a0b3b04fb596..bf16395b9e13 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,6 +5,7 @@
 #include <linux/highmem.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>
+#include <linux/tick.h>
 
 #include "common.h"
 
@@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		local_irq_disable_exit_to_user();
 
 		/* Check if any of the above work has queued a deferred wakeup */
-		rcu_nocb_flush_deferred_wakeup();
+		tick_nohz_user_enter_prepare();
 
 		ti_work = READ_ONCE(current_thread_info()->flags);
 	}
@@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
 	lockdep_assert_irqs_disabled();
 
 	/* Flush pending rcuog wakeup before the last need_resched() check */
-	rcu_nocb_flush_deferred_wakeup();
+	tick_nohz_user_enter_prepare();
 
 	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
 		ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 828b091501ca..6784f27a3099 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -230,6 +230,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 
 #ifdef CONFIG_NO_HZ_FULL
 cpumask_var_t tick_nohz_full_mask;
+EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
 bool tick_nohz_full_running;
 EXPORT_SYMBOL_GPL(tick_nohz_full_running);
 static atomic_t tick_dep_mask;
-- 
cgit v1.2.3


From c58e7ed28b4534ed073371843d03c433d6a9fe34 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Wed, 26 May 2021 12:22:51 -0400
Subject: PM: runtime: document common mistake with pm_runtime_get_sync()

pm_runtime_get_sync(), contradictory to intuition, does not drop the
runtime PM usage counter on errors which lead to several wrong usages in
drivers (missing the put).  pm_runtime_resume_and_get() was added as a
better implementation so document the preference of using it, hoping it
will stop bad patterns.

Suggested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
[ rjw: Documentation change edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/runtime_pm.rst | 6 +++++-
 include/linux/pm_runtime.h         | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
index 18ae21bf7f92..b48cac5f9048 100644
--- a/Documentation/power/runtime_pm.rst
+++ b/Documentation/power/runtime_pm.rst
@@ -378,7 +378,11 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
 
   `int pm_runtime_get_sync(struct device *dev);`
     - increment the device's usage counter, run pm_runtime_resume(dev) and
-      return its result
+      return its result;
+      note that it does not drop the device's usage counter on errors, so
+      consider using pm_runtime_resume_and_get() instead of it, especially
+      if its return value is checked by the caller, as this is likely to
+      result in cleaner code.
 
   `int pm_runtime_get_if_in_use(struct device *dev);`
     - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 6c08a085367b..aab8b35e9f8a 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -380,6 +380,9 @@ static inline int pm_runtime_get(struct device *dev)
  * The possible return values of this function are the same as for
  * pm_runtime_resume() and the runtime PM usage counter of @dev remains
  * incremented in all cases, even if it returns an error code.
+ * Consider using pm_runtime_resume_and_get() instead of it, especially
+ * if its return value is checked by the caller, as this is likely to result
+ * in cleaner code.
  */
 static inline int pm_runtime_get_sync(struct device *dev)
 {
-- 
cgit v1.2.3


From 220a31b091fb77886eb224ce2d7a5d890e43de63 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Sat, 29 May 2021 19:03:03 +0800
Subject: kgdb: Fix spelling mistakes

Fix some spelling mistakes in comments:
initalization ==> initialization
detatch ==> detach
represntation ==> representation
hexidecimal ==> hexadecimal
delimeter ==> delimiter
architecure ==> architecture

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Link: https://lore.kernel.org/r/20210529110305.9446-3-thunder.leizhen@huawei.com
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---
 include/linux/kgdb.h           | 8 ++++----
 kernel/debug/debug_core.c      | 2 +-
 kernel/debug/kdb/kdb_main.c    | 8 ++++----
 kernel/debug/kdb/kdb_private.h | 2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 392a3670944c..258cdde8d356 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -105,9 +105,9 @@ extern int dbg_set_reg(int regno, void *mem, struct pt_regs *regs);
  */
 
 /**
- *	kgdb_arch_init - Perform any architecture specific initalization.
+ *	kgdb_arch_init - Perform any architecture specific initialization.
  *
- *	This function will handle the initalization of any architecture
+ *	This function will handle the initialization of any architecture
  *	specific callbacks.
  */
 extern int kgdb_arch_init(void);
@@ -229,9 +229,9 @@ extern int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt);
 extern int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt);
 
 /**
- *	kgdb_arch_late - Perform any architecture specific initalization.
+ *	kgdb_arch_late - Perform any architecture specific initialization.
  *
- *	This function will handle the late initalization of any
+ *	This function will handle the late initialization of any
  *	architecture specific callbacks.  This is an optional function for
  *	handling things like late initialization of hw breakpoints.  The
  *	default implementation does nothing.
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 4708aec492df..a1f26766eb90 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1032,7 +1032,7 @@ dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
 	/*
 	 * Take the following action on reboot notify depending on value:
 	 *    1 == Enter debugger
-	 *    0 == [the default] detatch debug client
+	 *    0 == [the default] detach debug client
 	 *   -1 == Do nothing... and use this until the board resets
 	 */
 	switch (kgdbreboot) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 1baa96a2ecb8..cbc10f0a37a7 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -253,7 +253,7 @@ static char *kdballocenv(size_t bytes)
  * Parameters:
  *	match	A character string representing a numeric value
  * Outputs:
- *	*value  the unsigned long represntation of the env variable 'match'
+ *	*value  the unsigned long representation of the env variable 'match'
  * Returns:
  *	Zero on success, a kdb diagnostic on failure.
  */
@@ -356,7 +356,7 @@ static void kdb_printenv(void)
  * Parameters:
  *	arg	A character string representing a numeric value
  * Outputs:
- *	*value  the unsigned long represntation of arg.
+ *	*value  the unsigned long representation of arg.
  * Returns:
  *	Zero on success, a kdb diagnostic on failure.
  */
@@ -470,7 +470,7 @@ static int kdb_check_regs(void)
  *	symbol name, and offset to the caller.
  *
  *	The argument may consist of a numeric value (decimal or
- *	hexidecimal), a symbol name, a register name (preceded by the
+ *	hexadecimal), a symbol name, a register name (preceded by the
  *	percent sign), an environment variable with a numeric value
  *	(preceded by a dollar sign) or a simple arithmetic expression
  *	consisting of a symbol name, +/-, and a numeric constant value
@@ -894,7 +894,7 @@ static void parse_grep(const char *str)
  *	Limited to 20 tokens.
  *
  *	Real rudimentary tokenization. Basically only whitespace
- *	is considered a token delimeter (but special consideration
+ *	is considered a token delimiter (but special consideration
  *	is taken of the '=' sign as used by the 'set' command).
  *
  *	The algorithm used to tokenize the input string relies on
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index ccbed9089808..170c69aedebb 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -64,7 +64,7 @@
 
 /*
  * KDB_MAXBPT describes the total number of breakpoints
- * supported by this architecure.
+ * supported by this architecture.
  */
 #define KDB_MAXBPT	16
 
-- 
cgit v1.2.3


From 0e5cb7770684b4c81bcc63f4675e488f9a0e31eb Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sat, 27 Feb 2021 10:23:45 +0000
Subject: irqchip/gic: Split vGIC probing information from the GIC code

The vGIC advertising code is unsurprisingly very much tied to
the GIC implementations. However, we are about to extend the
support to lesser implementations.

Let's dissociate the vgic registration from the GIC code and
move it into KVM, where it makes a bit more sense. This also
allows us to mark the gic_kvm_info structures as __initdata.

Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-init.c        | 18 ++++++++++++---
 drivers/irqchip/irq-gic-common.c       | 13 -----------
 drivers/irqchip/irq-gic-common.h       |  2 --
 drivers/irqchip/irq-gic-v3.c           |  6 ++---
 drivers/irqchip/irq-gic.c              |  6 ++---
 include/linux/irqchip/arm-gic-common.h | 25 +--------------------
 include/linux/irqchip/arm-vgic-info.h  | 41 ++++++++++++++++++++++++++++++++++
 7 files changed, 63 insertions(+), 48 deletions(-)
 create mode 100644 include/linux/irqchip/arm-vgic-info.h

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 58cbda00e56d..2fdb65529594 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -482,6 +482,16 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
+static struct gic_kvm_info *gic_kvm_info;
+
+void __init vgic_set_kvm_info(const struct gic_kvm_info *info)
+{
+	BUG_ON(gic_kvm_info != NULL);
+	gic_kvm_info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (gic_kvm_info)
+		*gic_kvm_info = *info;
+}
+
 /**
  * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
  *
@@ -509,10 +519,8 @@ void kvm_vgic_init_cpu_hardware(void)
  */
 int kvm_vgic_hyp_init(void)
 {
-	const struct gic_kvm_info *gic_kvm_info;
 	int ret;
 
-	gic_kvm_info = gic_get_kvm_info();
 	if (!gic_kvm_info)
 		return -ENODEV;
 
@@ -536,10 +544,14 @@ int kvm_vgic_hyp_init(void)
 		ret = -ENODEV;
 	}
 
+	kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+
+	kfree(gic_kvm_info);
+	gic_kvm_info = NULL;
+
 	if (ret)
 		return ret;
 
-	kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
 	ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
 				 vgic_maintenance_handler,
 				 "vgic", kvm_get_running_vcpus());
diff --git a/drivers/irqchip/irq-gic-common.c b/drivers/irqchip/irq-gic-common.c
index f47b41dfd023..a610821c8ff2 100644
--- a/drivers/irqchip/irq-gic-common.c
+++ b/drivers/irqchip/irq-gic-common.c
@@ -12,19 +12,6 @@
 
 static DEFINE_RAW_SPINLOCK(irq_controller_lock);
 
-static const struct gic_kvm_info *gic_kvm_info;
-
-const struct gic_kvm_info *gic_get_kvm_info(void)
-{
-	return gic_kvm_info;
-}
-
-void gic_set_kvm_info(const struct gic_kvm_info *info)
-{
-	BUG_ON(gic_kvm_info != NULL);
-	gic_kvm_info = info;
-}
-
 void gic_enable_of_quirks(const struct device_node *np,
 			  const struct gic_quirk *quirks, void *data)
 {
diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h
index ccba8b0fe0f5..27e3d4ed4f32 100644
--- a/drivers/irqchip/irq-gic-common.h
+++ b/drivers/irqchip/irq-gic-common.h
@@ -28,6 +28,4 @@ void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks,
 void gic_enable_of_quirks(const struct device_node *np,
 			  const struct gic_quirk *quirks, void *data);
 
-void gic_set_kvm_info(const struct gic_kvm_info *info);
-
 #endif /* _IRQ_GIC_COMMON_H */
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 37a23aa6de37..453fc425eede 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -103,7 +103,7 @@ EXPORT_SYMBOL(gic_nonsecure_priorities);
 /* ppi_nmi_refs[n] == number of cpus having ppi[n + 16] set as NMI */
 static refcount_t *ppi_nmi_refs;
 
-static struct gic_kvm_info gic_v3_kvm_info;
+static struct gic_kvm_info gic_v3_kvm_info __initdata;
 static DEFINE_PER_CPU(bool, has_rss);
 
 #define MPIDR_RS(mpidr)			(((mpidr) & 0xF0UL) >> 4)
@@ -1852,7 +1852,7 @@ static void __init gic_of_setup_kvm_info(struct device_node *node)
 
 	gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis;
 	gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid;
-	gic_set_kvm_info(&gic_v3_kvm_info);
+	vgic_set_kvm_info(&gic_v3_kvm_info);
 }
 
 static int __init gic_of_init(struct device_node *node, struct device_node *parent)
@@ -2168,7 +2168,7 @@ static void __init gic_acpi_setup_kvm_info(void)
 
 	gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis;
 	gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid;
-	gic_set_kvm_info(&gic_v3_kvm_info);
+	vgic_set_kvm_info(&gic_v3_kvm_info);
 }
 
 static int __init
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index b1d9c22caf2e..2de9ec8ece0c 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -119,7 +119,7 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
 
 static struct gic_chip_data gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly;
 
-static struct gic_kvm_info gic_v2_kvm_info;
+static struct gic_kvm_info gic_v2_kvm_info __initdata;
 
 static DEFINE_PER_CPU(u32, sgi_intid);
 
@@ -1451,7 +1451,7 @@ static void __init gic_of_setup_kvm_info(struct device_node *node)
 		return;
 
 	if (static_branch_likely(&supports_deactivate_key))
-		gic_set_kvm_info(&gic_v2_kvm_info);
+		vgic_set_kvm_info(&gic_v2_kvm_info);
 }
 
 int __init
@@ -1618,7 +1618,7 @@ static void __init gic_acpi_setup_kvm_info(void)
 
 	gic_v2_kvm_info.maint_irq = irq;
 
-	gic_set_kvm_info(&gic_v2_kvm_info);
+	vgic_set_kvm_info(&gic_v2_kvm_info);
 }
 
 static int __init gic_v2_acpi_init(union acpi_subtable_headers *header,
diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h
index fa8c0455c352..1177f3a1aed5 100644
--- a/include/linux/irqchip/arm-gic-common.h
+++ b/include/linux/irqchip/arm-gic-common.h
@@ -7,8 +7,7 @@
 #ifndef __LINUX_IRQCHIP_ARM_GIC_COMMON_H
 #define __LINUX_IRQCHIP_ARM_GIC_COMMON_H
 
-#include <linux/types.h>
-#include <linux/ioport.h>
+#include <linux/irqchip/arm-vgic-info.h>
 
 #define GICD_INT_DEF_PRI		0xa0
 #define GICD_INT_DEF_PRI_X4		((GICD_INT_DEF_PRI << 24) |\
@@ -16,28 +15,6 @@
 					(GICD_INT_DEF_PRI << 8) |\
 					GICD_INT_DEF_PRI)
 
-enum gic_type {
-	GIC_V2,
-	GIC_V3,
-};
-
-struct gic_kvm_info {
-	/* GIC type */
-	enum gic_type	type;
-	/* Virtual CPU interface */
-	struct resource vcpu;
-	/* Interrupt number */
-	unsigned int	maint_irq;
-	/* Virtual control interface */
-	struct resource vctrl;
-	/* vlpi support */
-	bool		has_v4;
-	/* rvpeid support */
-	bool		has_v4_1;
-};
-
-const struct gic_kvm_info *gic_get_kvm_info(void);
-
 struct irq_domain;
 struct fwnode_handle;
 int gicv2m_init(struct fwnode_handle *parent_handle,
diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h
new file mode 100644
index 000000000000..a25d4da5697d
--- /dev/null
+++ b/include/linux/irqchip/arm-vgic-info.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * include/linux/irqchip/arm-vgic-info.h
+ *
+ * Copyright (C) 2016 ARM Limited, All Rights Reserved.
+ */
+#ifndef __LINUX_IRQCHIP_ARM_VGIC_INFO_H
+#define __LINUX_IRQCHIP_ARM_VGIC_INFO_H
+
+#include <linux/types.h>
+#include <linux/ioport.h>
+
+enum gic_type {
+	/* Full GICv2 */
+	GIC_V2,
+	/* Full GICv3, optionally with v2 compat */
+	GIC_V3,
+};
+
+struct gic_kvm_info {
+	/* GIC type */
+	enum gic_type	type;
+	/* Virtual CPU interface */
+	struct resource vcpu;
+	/* Interrupt number */
+	unsigned int	maint_irq;
+	/* Virtual control interface */
+	struct resource vctrl;
+	/* vlpi support */
+	bool		has_v4;
+	/* rvpeid support */
+	bool		has_v4_1;
+};
+
+#ifdef CONFIG_KVM
+void vgic_set_kvm_info(const struct gic_kvm_info *info);
+#else
+static inline void vgic_set_kvm_info(const struct gic_kvm_info *info) {}
+#endif
+
+#endif
-- 
cgit v1.2.3


From 669062d2a1aa36661b490683fe17810aa24a9cfb Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sun, 28 Feb 2021 11:09:59 +0000
Subject: KVM: arm64: vgic: Be tolerant to the lack of maintenance interrupt
 masking

As it turns out, not all the interrupt controllers are able to
expose a vGIC maintenance interrupt that can be independently
enabled/disabled.

And to be fair, it doesn't really matter as all we require is
for the interrupt to kick us out of guest mode out way or another.

To that effect, add gic_kvm_info.no_maint_irq_mask for an interrupt
controller to advertise the lack of masking.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-init.c       | 8 +++++++-
 include/linux/irqchip/arm-vgic-info.h | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 2fdb65529594..6752d084934d 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -519,12 +519,15 @@ void kvm_vgic_init_cpu_hardware(void)
  */
 int kvm_vgic_hyp_init(void)
 {
+	bool has_mask;
 	int ret;
 
 	if (!gic_kvm_info)
 		return -ENODEV;
 
-	if (!gic_kvm_info->maint_irq) {
+	has_mask = !gic_kvm_info->no_maint_irq_mask;
+
+	if (has_mask && !gic_kvm_info->maint_irq) {
 		kvm_err("No vgic maintenance irq\n");
 		return -ENXIO;
 	}
@@ -552,6 +555,9 @@ int kvm_vgic_hyp_init(void)
 	if (ret)
 		return ret;
 
+	if (!has_mask)
+		return 0;
+
 	ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
 				 vgic_maintenance_handler,
 				 "vgic", kvm_get_running_vcpus());
diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h
index a25d4da5697d..7c0d08ebb82c 100644
--- a/include/linux/irqchip/arm-vgic-info.h
+++ b/include/linux/irqchip/arm-vgic-info.h
@@ -24,6 +24,8 @@ struct gic_kvm_info {
 	struct resource vcpu;
 	/* Interrupt number */
 	unsigned int	maint_irq;
+	/* No interrupt mask, no need to use the above field */
+	bool		no_maint_irq_mask;
 	/* Virtual control interface */
 	struct resource vctrl;
 	/* vlpi support */
-- 
cgit v1.2.3


From f6c3e24fb721dda247f6691c809d6e6c413f22c7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 15 Mar 2021 21:56:47 +0000
Subject: KVM: arm64: vgic: Let an interrupt controller advertise lack of HW
 deactivation

The vGIC, as architected by ARM, allows a virtual interrupt to
trigger the deactivation of a physical interrupt. This allows
the following interrupt to be delivered without requiring an exit.

However, some implementations have choosen not to implement this,
meaning that we will need some unsavoury workarounds to deal with this.

On detecting such a case, taint the kernel and spit a nastygram.
We'll deal with this in later patches.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-init.c       | 10 ++++++++++
 include/kvm/arm_vgic.h                |  3 +++
 include/linux/irqchip/arm-vgic-info.h |  2 ++
 3 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 6752d084934d..340c51d87677 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -532,6 +532,16 @@ int kvm_vgic_hyp_init(void)
 		return -ENXIO;
 	}
 
+	/*
+	 * If we get one of these oddball non-GICs, taint the kernel,
+	 * as we have no idea of how they *really* behave.
+	 */
+	if (gic_kvm_info->no_hw_deactivation) {
+		kvm_info("Non-architectural vgic, tainting kernel\n");
+		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+		kvm_vgic_global_state.no_hw_deactivation = true;
+	}
+
 	switch (gic_kvm_info->type) {
 	case GIC_V2:
 		ret = vgic_v2_probe(gic_kvm_info);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ec621180ef09..e45b26e8d479 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -72,6 +72,9 @@ struct vgic_global {
 	bool			has_gicv4;
 	bool			has_gicv4_1;
 
+	/* Pseudo GICv3 from outer space */
+	bool			no_hw_deactivation;
+
 	/* GIC system register CPU interface */
 	struct static_key_false gicv3_cpuif;
 
diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h
index 7c0d08ebb82c..a75b2c7de69d 100644
--- a/include/linux/irqchip/arm-vgic-info.h
+++ b/include/linux/irqchip/arm-vgic-info.h
@@ -32,6 +32,8 @@ struct gic_kvm_info {
 	bool		has_v4;
 	/* rvpeid support */
 	bool		has_v4_1;
+	/* Deactivation impared, subpar stuff */
+	bool		no_hw_deactivation;
 };
 
 #ifdef CONFIG_KVM
-- 
cgit v1.2.3


From 380d2b2d5a0491e47dfa250b40e3d849a922871d Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Fri, 28 May 2021 02:54:00 +0300
Subject: regulator: core: Add regulator_sync_voltage_rdev()

Some NVIDIA Tegra devices use a CPU soft-reset method for the reboot and
in this case we need to restore the coupled voltages to the state that is
suitable for hardware during boot. Add new regulator_sync_voltage_rdev()
helper which is needed by regulator drivers in order to sync voltage of
a coupled regulators.

Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/regulator/core.c         | 23 +++++++++++++++++++++++
 include/linux/regulator/driver.h |  1 +
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index f192bf19492e..ead0b6d2af45 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -4105,6 +4105,29 @@ int regulator_set_voltage_time_sel(struct regulator_dev *rdev,
 }
 EXPORT_SYMBOL_GPL(regulator_set_voltage_time_sel);
 
+int regulator_sync_voltage_rdev(struct regulator_dev *rdev)
+{
+	int ret;
+
+	regulator_lock(rdev);
+
+	if (!rdev->desc->ops->set_voltage &&
+	    !rdev->desc->ops->set_voltage_sel) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* balance only, if regulator is coupled */
+	if (rdev->coupling_desc.n_coupled > 1)
+		ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON);
+	else
+		ret = -EOPNOTSUPP;
+
+out:
+	regulator_unlock(rdev);
+	return ret;
+}
+
 /**
  * regulator_sync_voltage - re-apply last regulator output voltage
  * @regulator: regulator source
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 4ea520c248e9..35e5a611db81 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -540,6 +540,7 @@ int regulator_set_current_limit_regmap(struct regulator_dev *rdev,
 int regulator_get_current_limit_regmap(struct regulator_dev *rdev);
 void *regulator_get_init_drvdata(struct regulator_init_data *reg_init_data);
 int regulator_set_ramp_delay_regmap(struct regulator_dev *rdev, int ramp_delay);
+int regulator_sync_voltage_rdev(struct regulator_dev *rdev);
 
 /*
  * Helper functions intended to be used by regulator drivers prior registering
-- 
cgit v1.2.3


From e848edae31263d2119e7cde779d754439277fbba Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Tue, 1 Jun 2021 05:31:11 +0300
Subject: clk: tegra: Add stubs needed for compile-testing

Add stubs needed for compile-testing of Tegra memory drivers.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/soc/tegra/pmc.c   |   5 ---
 include/linux/clk/tegra.h | 100 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 79 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index 8e3b78bb2ac2..4a582eae82ef 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -743,11 +743,6 @@ out:
 	return err;
 }
 
-int __weak tegra210_clk_handle_mbist_war(unsigned int id)
-{
-	return 0;
-}
-
 static int tegra_powergate_power_up(struct tegra_powergate *pg,
 				    bool disable_clocks)
 {
diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index f7ff722a03dd..d128ad1570aa 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -123,20 +123,6 @@ static inline void tegra_cpu_clock_resume(void)
 }
 #endif
 
-extern int tegra210_plle_hw_sequence_start(void);
-extern bool tegra210_plle_hw_sequence_is_enabled(void);
-extern void tegra210_xusb_pll_hw_control_enable(void);
-extern void tegra210_xusb_pll_hw_sequence_start(void);
-extern void tegra210_sata_pll_hw_control_enable(void);
-extern void tegra210_sata_pll_hw_sequence_start(void);
-extern void tegra210_set_sata_pll_seq_sw(bool state);
-extern void tegra210_put_utmipll_in_iddq(void);
-extern void tegra210_put_utmipll_out_iddq(void);
-extern int tegra210_clk_handle_mbist_war(unsigned int id);
-extern void tegra210_clk_emc_dll_enable(bool flag);
-extern void tegra210_clk_emc_dll_update_setting(u32 emc_dll_src_value);
-extern void tegra210_clk_emc_update_setting(u32 emc_src_value);
-
 struct clk;
 struct tegra_emc;
 
@@ -144,17 +130,10 @@ typedef long (tegra20_clk_emc_round_cb)(unsigned long rate,
 					unsigned long min_rate,
 					unsigned long max_rate,
 					void *arg);
-
-void tegra20_clk_set_emc_round_callback(tegra20_clk_emc_round_cb *round_cb,
-					void *cb_arg);
-int tegra20_clk_prepare_emc_mc_same_freq(struct clk *emc_clk, bool same);
-
 typedef int (tegra124_emc_prepare_timing_change_cb)(struct tegra_emc *emc,
 						    unsigned long rate);
 typedef void (tegra124_emc_complete_timing_change_cb)(struct tegra_emc *emc,
 						      unsigned long rate);
-void tegra124_clk_set_emc_callbacks(tegra124_emc_prepare_timing_change_cb *prep_cb,
-				    tegra124_emc_complete_timing_change_cb *complete_cb);
 
 struct tegra210_clk_emc_config {
 	unsigned long rate;
@@ -176,8 +155,87 @@ struct tegra210_clk_emc_provider {
 			const struct tegra210_clk_emc_config *config);
 };
 
+#if defined(CONFIG_ARCH_TEGRA_2x_SOC) || defined(CONFIG_ARCH_TEGRA_3x_SOC)
+void tegra20_clk_set_emc_round_callback(tegra20_clk_emc_round_cb *round_cb,
+					void *cb_arg);
+int tegra20_clk_prepare_emc_mc_same_freq(struct clk *emc_clk, bool same);
+#else
+static inline void
+tegra20_clk_set_emc_round_callback(tegra20_clk_emc_round_cb *round_cb,
+				   void *cb_arg)
+{
+}
+
+static inline int
+tegra20_clk_prepare_emc_mc_same_freq(struct clk *emc_clk, bool same)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_TEGRA124_CLK_EMC
+void tegra124_clk_set_emc_callbacks(tegra124_emc_prepare_timing_change_cb *prep_cb,
+				    tegra124_emc_complete_timing_change_cb *complete_cb);
+#else
+static inline void
+tegra124_clk_set_emc_callbacks(tegra124_emc_prepare_timing_change_cb *prep_cb,
+			       tegra124_emc_complete_timing_change_cb *complete_cb)
+{
+}
+#endif
+
+#ifdef CONFIG_ARCH_TEGRA_210_SOC
+int tegra210_plle_hw_sequence_start(void);
+bool tegra210_plle_hw_sequence_is_enabled(void);
+void tegra210_xusb_pll_hw_control_enable(void);
+void tegra210_xusb_pll_hw_sequence_start(void);
+void tegra210_sata_pll_hw_control_enable(void);
+void tegra210_sata_pll_hw_sequence_start(void);
+void tegra210_set_sata_pll_seq_sw(bool state);
+void tegra210_put_utmipll_in_iddq(void);
+void tegra210_put_utmipll_out_iddq(void);
+int tegra210_clk_handle_mbist_war(unsigned int id);
+void tegra210_clk_emc_dll_enable(bool flag);
+void tegra210_clk_emc_dll_update_setting(u32 emc_dll_src_value);
+void tegra210_clk_emc_update_setting(u32 emc_src_value);
+
 int tegra210_clk_emc_attach(struct clk *clk,
 			    struct tegra210_clk_emc_provider *provider);
 void tegra210_clk_emc_detach(struct clk *clk);
+#else
+static inline int tegra210_plle_hw_sequence_start(void)
+{
+	return 0;
+}
+
+static inline bool tegra210_plle_hw_sequence_is_enabled(void)
+{
+	return false;
+}
+
+static inline int tegra210_clk_handle_mbist_war(unsigned int id)
+{
+	return 0;
+}
+
+static inline int
+tegra210_clk_emc_attach(struct clk *clk,
+			struct tegra210_clk_emc_provider *provider)
+{
+	return 0;
+}
+
+static inline void tegra210_xusb_pll_hw_control_enable(void) {}
+static inline void tegra210_xusb_pll_hw_sequence_start(void) {}
+static inline void tegra210_sata_pll_hw_control_enable(void) {}
+static inline void tegra210_sata_pll_hw_sequence_start(void) {}
+static inline void tegra210_set_sata_pll_seq_sw(bool state) {}
+static inline void tegra210_put_utmipll_in_iddq(void) {}
+static inline void tegra210_put_utmipll_out_iddq(void) {}
+static inline void tegra210_clk_emc_dll_enable(bool flag) {}
+static inline void tegra210_clk_emc_dll_update_setting(u32 emc_dll_src_value) {}
+static inline void tegra210_clk_emc_update_setting(u32 emc_src_value) {}
+static inline void tegra210_clk_emc_detach(struct clk *clk) {}
+#endif
 
 #endif /* __LINUX_CLK_TEGRA_H_ */
-- 
cgit v1.2.3


From 4a1c456a57c3366d736548ad4d09eb3aa0b9ddaf Mon Sep 17 00:00:00 2001
From: Chris Morgan <macromorgan@hotmail.com>
Date: Wed, 19 May 2021 15:37:51 -0500
Subject: mfd: Add Rockchip rk817 audio CODEC support

Add rk817 codec support cell to rk808 mfd driver.

Signed-off-by: Chris Morgan <macromorgan@hotmail.com>
Tested-by: Maciej Matuszczyk <maccraft123mc@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rk808.c       | 81 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/rk808.h | 81 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index ad923dd4e007..77ccd31ca1d9 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -65,6 +65,7 @@ static bool rk817_is_volatile_reg(struct device *dev, unsigned int reg)
 	switch (reg) {
 	case RK817_SECONDS_REG ... RK817_WEEKS_REG:
 	case RK817_RTC_STATUS_REG:
+	case RK817_CODEC_DTOP_LPT_SRST:
 	case RK817_INT_STS_REG0:
 	case RK817_INT_STS_REG1:
 	case RK817_INT_STS_REG2:
@@ -163,6 +164,7 @@ static const struct mfd_cell rk817s[] = {
 		.num_resources = ARRAY_SIZE(rk817_rtc_resources),
 		.resources = &rk817_rtc_resources[0],
 	},
+	{ .name = "rk817-codec",},
 };
 
 static const struct mfd_cell rk818s[] = {
@@ -201,6 +203,85 @@ static const struct rk808_reg_data rk808_pre_init_reg[] = {
 
 static const struct rk808_reg_data rk817_pre_init_reg[] = {
 	{RK817_RTC_CTRL_REG, RTC_STOP, RTC_STOP},
+	/* Codec specific registers */
+	{ RK817_CODEC_DTOP_VUCTL, MASK_ALL, 0x03 },
+	{ RK817_CODEC_DTOP_VUCTIME, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DTOP_LPT_SRST, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DTOP_DIGEN_CLKE, MASK_ALL, 0x00 },
+	/* from vendor driver, CODEC_AREF_RTCFG0 not defined in data sheet */
+	{ RK817_CODEC_AREF_RTCFG0, MASK_ALL, 0x00 },
+	{ RK817_CODEC_AREF_RTCFG1, MASK_ALL, 0x06 },
+	{ RK817_CODEC_AADC_CFG0, MASK_ALL, 0xc8 },
+	/* from vendor driver, CODEC_AADC_CFG1 not defined in data sheet */
+	{ RK817_CODEC_AADC_CFG1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DADC_VOLL, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DADC_VOLR, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DADC_SR_ACL0, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DADC_ALC1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DADC_ALC2, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DADC_NG, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DADC_HPF, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DADC_RVOLL, MASK_ALL, 0xff },
+	{ RK817_CODEC_DADC_RVOLR, MASK_ALL, 0xff },
+	{ RK817_CODEC_AMIC_CFG0, MASK_ALL, 0x70 },
+	{ RK817_CODEC_AMIC_CFG1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DMIC_PGA_GAIN, MASK_ALL, 0x66 },
+	{ RK817_CODEC_DMIC_LMT1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DMIC_LMT2, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DMIC_NG1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DMIC_NG2, MASK_ALL, 0x00 },
+	/* from vendor driver, CODEC_ADAC_CFG0 not defined in data sheet */
+	{ RK817_CODEC_ADAC_CFG0, MASK_ALL, 0x00 },
+	{ RK817_CODEC_ADAC_CFG1, MASK_ALL, 0x07 },
+	{ RK817_CODEC_DDAC_POPD_DACST, MASK_ALL, 0x82 },
+	{ RK817_CODEC_DDAC_VOLL, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_VOLR, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_SR_LMT0, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_LMT1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_LMT2, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_MUTE_MIXCTL, MASK_ALL, 0xa0 },
+	{ RK817_CODEC_DDAC_RVOLL, MASK_ALL, 0xff },
+	{ RK817_CODEC_DADC_RVOLR, MASK_ALL, 0xff },
+	{ RK817_CODEC_AMIC_CFG0, MASK_ALL, 0x70 },
+	{ RK817_CODEC_AMIC_CFG1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DMIC_PGA_GAIN, MASK_ALL, 0x66 },
+	{ RK817_CODEC_DMIC_LMT1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DMIC_LMT2, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DMIC_NG1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DMIC_NG2, MASK_ALL, 0x00 },
+	/* from vendor driver, CODEC_ADAC_CFG0 not defined in data sheet */
+	{ RK817_CODEC_ADAC_CFG0, MASK_ALL, 0x00 },
+	{ RK817_CODEC_ADAC_CFG1, MASK_ALL, 0x07 },
+	{ RK817_CODEC_DDAC_POPD_DACST, MASK_ALL, 0x82 },
+	{ RK817_CODEC_DDAC_VOLL, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_VOLR, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_SR_LMT0, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_LMT1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_LMT2, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DDAC_MUTE_MIXCTL, MASK_ALL, 0xa0 },
+	{ RK817_CODEC_DDAC_RVOLL, MASK_ALL, 0xff },
+	{ RK817_CODEC_DDAC_RVOLR, MASK_ALL, 0xff },
+	{ RK817_CODEC_AHP_ANTI0, MASK_ALL, 0x00 },
+	{ RK817_CODEC_AHP_ANTI1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_AHP_CFG0, MASK_ALL, 0xe0 },
+	{ RK817_CODEC_AHP_CFG1, MASK_ALL, 0x1f },
+	{ RK817_CODEC_AHP_CP, MASK_ALL, 0x09 },
+	{ RK817_CODEC_ACLASSD_CFG1, MASK_ALL, 0x69 },
+	{ RK817_CODEC_ACLASSD_CFG2, MASK_ALL, 0x44 },
+	{ RK817_CODEC_APLL_CFG0, MASK_ALL, 0x04 },
+	{ RK817_CODEC_APLL_CFG1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_APLL_CFG2, MASK_ALL, 0x30 },
+	{ RK817_CODEC_APLL_CFG3, MASK_ALL, 0x19 },
+	{ RK817_CODEC_APLL_CFG4, MASK_ALL, 0x65 },
+	{ RK817_CODEC_APLL_CFG5, MASK_ALL, 0x01 },
+	{ RK817_CODEC_DI2S_CKM, MASK_ALL, 0x01 },
+	{ RK817_CODEC_DI2S_RSD, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DI2S_RXCR1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DI2S_RXCR2, MASK_ALL, 0x17 },
+	{ RK817_CODEC_DI2S_RXCMD_TSD, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DI2S_TXCR1, MASK_ALL, 0x00 },
+	{ RK817_CODEC_DI2S_TXCR2, MASK_ALL, 0x17 },
+	{ RK817_CODEC_DI2S_TXCR3_TXCMD, MASK_ALL, 0x00 },
 	{RK817_GPIO_INT_CFG, RK817_INT_POL_MSK, RK817_INT_POL_L},
 	{RK817_SYS_CFG(1), RK817_HOTDIE_TEMP_MSK | RK817_TSD_TEMP_MSK,
 					   RK817_HOTDIE_105 | RK817_TSD_140},
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index e07f6e61cd38..a96e6d43ca06 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -437,6 +437,87 @@ enum rk809_reg_id {
 #define RK817_RTC_COMP_LSB_REG		0x10
 #define RK817_RTC_COMP_MSB_REG		0x11
 
+/* RK817 Codec Registers */
+#define RK817_CODEC_DTOP_VUCTL		0x12
+#define RK817_CODEC_DTOP_VUCTIME	0x13
+#define RK817_CODEC_DTOP_LPT_SRST	0x14
+#define RK817_CODEC_DTOP_DIGEN_CLKE	0x15
+#define RK817_CODEC_AREF_RTCFG0		0x16
+#define RK817_CODEC_AREF_RTCFG1		0x17
+#define RK817_CODEC_AADC_CFG0		0x18
+#define RK817_CODEC_AADC_CFG1		0x19
+#define RK817_CODEC_DADC_VOLL		0x1a
+#define RK817_CODEC_DADC_VOLR		0x1b
+#define RK817_CODEC_DADC_SR_ACL0	0x1e
+#define RK817_CODEC_DADC_ALC1		0x1f
+#define RK817_CODEC_DADC_ALC2		0x20
+#define RK817_CODEC_DADC_NG		0x21
+#define RK817_CODEC_DADC_HPF		0x22
+#define RK817_CODEC_DADC_RVOLL		0x23
+#define RK817_CODEC_DADC_RVOLR		0x24
+#define RK817_CODEC_AMIC_CFG0		0x27
+#define RK817_CODEC_AMIC_CFG1		0x28
+#define RK817_CODEC_DMIC_PGA_GAIN	0x29
+#define RK817_CODEC_DMIC_LMT1		0x2a
+#define RK817_CODEC_DMIC_LMT2		0x2b
+#define RK817_CODEC_DMIC_NG1		0x2c
+#define RK817_CODEC_DMIC_NG2		0x2d
+#define RK817_CODEC_ADAC_CFG0		0x2e
+#define RK817_CODEC_ADAC_CFG1		0x2f
+#define RK817_CODEC_DDAC_POPD_DACST	0x30
+#define RK817_CODEC_DDAC_VOLL		0x31
+#define RK817_CODEC_DDAC_VOLR		0x32
+#define RK817_CODEC_DDAC_SR_LMT0	0x35
+#define RK817_CODEC_DDAC_LMT1		0x36
+#define RK817_CODEC_DDAC_LMT2		0x37
+#define RK817_CODEC_DDAC_MUTE_MIXCTL	0x38
+#define RK817_CODEC_DDAC_RVOLL		0x39
+#define RK817_CODEC_DDAC_RVOLR		0x3a
+#define RK817_CODEC_AHP_ANTI0		0x3b
+#define RK817_CODEC_AHP_ANTI1		0x3c
+#define RK817_CODEC_AHP_CFG0		0x3d
+#define RK817_CODEC_AHP_CFG1		0x3e
+#define RK817_CODEC_AHP_CP		0x3f
+#define RK817_CODEC_ACLASSD_CFG1	0x40
+#define RK817_CODEC_ACLASSD_CFG2	0x41
+#define RK817_CODEC_APLL_CFG0		0x42
+#define RK817_CODEC_APLL_CFG1		0x43
+#define RK817_CODEC_APLL_CFG2		0x44
+#define RK817_CODEC_APLL_CFG3		0x45
+#define RK817_CODEC_APLL_CFG4		0x46
+#define RK817_CODEC_APLL_CFG5		0x47
+#define RK817_CODEC_DI2S_CKM		0x48
+#define RK817_CODEC_DI2S_RSD		0x49
+#define RK817_CODEC_DI2S_RXCR1		0x4a
+#define RK817_CODEC_DI2S_RXCR2		0x4b
+#define RK817_CODEC_DI2S_RXCMD_TSD	0x4c
+#define RK817_CODEC_DI2S_TXCR1		0x4d
+#define RK817_CODEC_DI2S_TXCR2		0x4e
+#define RK817_CODEC_DI2S_TXCR3_TXCMD	0x4f
+
+/* RK817_CODEC_DI2S_CKM */
+#define RK817_I2S_MODE_MASK		(0x1 << 0)
+#define RK817_I2S_MODE_MST		(0x1 << 0)
+#define RK817_I2S_MODE_SLV		(0x0 << 0)
+
+/* RK817_CODEC_DDAC_MUTE_MIXCTL */
+#define DACMT_MASK			(0x1 << 0)
+#define DACMT_ENABLE			(0x1 << 0)
+#define DACMT_DISABLE			(0x0 << 0)
+
+/* RK817_CODEC_DI2S_RXCR2 */
+#define VDW_RX_24BITS			(0x17)
+#define VDW_RX_16BITS			(0x0f)
+
+/* RK817_CODEC_DI2S_TXCR2 */
+#define VDW_TX_24BITS			(0x17)
+#define VDW_TX_16BITS			(0x0f)
+
+/* RK817_CODEC_AMIC_CFG0 */
+#define MIC_DIFF_MASK			(0x1 << 7)
+#define MIC_DIFF_DIS			(0x0 << 7)
+#define MIC_DIFF_EN			(0x1 << 7)
+
 #define RK817_POWER_EN_REG(i)		(0xb1 + (i))
 #define RK817_POWER_SLP_EN_REG(i)	(0xb5 + (i))
 
-- 
cgit v1.2.3


From 958229a7c55f219b1cff99f939dabbc1b6ba7161 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 May 2021 07:50:54 +0200
Subject: block: add a flag to make put_disk on partially initalized disks
 safer

Add a flag to indicate that __device_add_disk did grab a queue reference
so that disk_release only drops it if we actually had it.  This sort
out one of the major pitfals with partially initialized gendisk that
a lot of drivers did get wrong or still do.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20210521055116.1053587-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c         | 7 +++++--
 include/linux/genhd.h | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 9fa734cb9cbd..c826db33a73e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -539,7 +539,10 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
 	 * Take an extra ref on queue which will be put on disk_release()
 	 * so that it sticks around as long as @disk is there.
 	 */
-	WARN_ON_ONCE(!blk_get_queue(disk->queue));
+	if (blk_get_queue(disk->queue))
+		set_bit(GD_QUEUE_REF, &disk->state);
+	else
+		WARN_ON_ONCE(1);
 
 	disk_add_events(disk);
 	blk_integrity_add(disk);
@@ -1107,7 +1110,7 @@ static void disk_release(struct device *dev)
 	kfree(disk->random);
 	xa_destroy(&disk->part_tbl);
 	bdput(disk->part0);
-	if (disk->queue)
+	if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue)
 		blk_put_queue(disk->queue);
 	kfree(disk);
 }
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 6fc26f7bdf71..4d3ee8b6b297 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -153,6 +153,7 @@ struct gendisk {
 	unsigned long state;
 #define GD_NEED_PART_SCAN		0
 #define GD_READ_ONLY			1
+#define GD_QUEUE_REF			2
 	struct kobject *slave_dir;
 
 	struct timer_rand_state *random;
-- 
cgit v1.2.3


From f525464a8000f092c20b00eead3eaa9d849c599e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 May 2021 07:50:55 +0200
Subject: block: add blk_alloc_disk and blk_cleanup_disk APIs

Add two new APIs to allocate and free a gendisk including the
request_queue for use with BIO based drivers.  This is to avoid
boilerplate code in drivers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20210521055116.1053587-6-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c         | 35 +++++++++++++++++++++++++++++++++++
 include/linux/genhd.h | 22 ++++++++++++++++++++++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index c826db33a73e..efe0db4d62f0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1293,6 +1293,25 @@ out_free_disk:
 }
 EXPORT_SYMBOL(__alloc_disk_node);
 
+struct gendisk *__blk_alloc_disk(int node)
+{
+	struct request_queue *q;
+	struct gendisk *disk;
+
+	q = blk_alloc_queue(node);
+	if (!q)
+		return NULL;
+
+	disk = __alloc_disk_node(0, node);
+	if (!disk) {
+		blk_cleanup_queue(q);
+		return NULL;
+	}
+	disk->queue = q;
+	return disk;
+}
+EXPORT_SYMBOL(__blk_alloc_disk);
+
 /**
  * put_disk - decrements the gendisk refcount
  * @disk: the struct gendisk to decrement the refcount for
@@ -1310,6 +1329,22 @@ void put_disk(struct gendisk *disk)
 }
 EXPORT_SYMBOL(put_disk);
 
+/**
+ * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk
+ * @disk: gendisk to shutdown
+ *
+ * Mark the queue hanging off @disk DYING, drain all pending requests, then mark
+ * the queue DEAD, destroy and put it and the gendisk structure.
+ *
+ * Context: can sleep
+ */
+void blk_cleanup_disk(struct gendisk *disk)
+{
+	blk_cleanup_queue(disk->queue);
+	put_disk(disk);
+}
+EXPORT_SYMBOL(blk_cleanup_disk);
+
 static void set_disk_ro_uevent(struct gendisk *gd, int ro)
 {
 	char event[] = "DISK_RO=1";
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 4d3ee8b6b297..782f0171d104 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -278,6 +278,28 @@ extern void put_disk(struct gendisk *disk);
 
 #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE)
 
+/**
+ * blk_alloc_disk - allocate a gendisk structure
+ * @node_id: numa node to allocate on
+ *
+ * Allocate and pre-initialize a gendisk structure for use with BIO based
+ * drivers.
+ *
+ * Context: can sleep
+ */
+#define blk_alloc_disk(node_id)						\
+({									\
+	struct gendisk *__disk = __blk_alloc_disk(node_id);		\
+	static struct lock_class_key __key;				\
+									\
+	if (__disk)							\
+		lockdep_init_map(&__disk->lockdep_map,			\
+			"(bio completion)", &__key, 0);			\
+	__disk;								\
+})
+struct gendisk *__blk_alloc_disk(int node);
+void blk_cleanup_disk(struct gendisk *disk);
+
 int __register_blkdev(unsigned int major, const char *name,
 		void (*probe)(dev_t devt));
 #define register_blkdev(major, name) \
-- 
cgit v1.2.3


From da7ba72960ca2a9b968e47fcf414d16f3d4c0c42 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 May 2021 07:51:16 +0200
Subject: block: unexport blk_alloc_queue

blk_alloc_queue is just an internal helper now, unexport it and remove
it from the public header.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20210521055116.1053587-27-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 1 -
 block/blk.h            | 2 ++
 include/linux/blkdev.h | 1 -
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 689aac2625d2..3515a66022d7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -599,7 +599,6 @@ fail_q:
 	kmem_cache_free(blk_requestq_cachep, q);
 	return NULL;
 }
-EXPORT_SYMBOL(blk_alloc_queue);
 
 /**
  * blk_get_queue - increment the request_queue refcount
diff --git a/block/blk.h b/block/blk.h
index cba3a94aabfa..3440142f029b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -359,4 +359,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page);
 
+struct request_queue *blk_alloc_queue(int node_id);
+
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c28577b50f4..d66d0da72529 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1213,7 +1213,6 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 extern void blk_dump_rq_flags(struct request *, char *);
 
 bool __must_check blk_get_queue(struct request_queue *);
-struct request_queue *blk_alloc_queue(int node_id);
 extern void blk_put_queue(struct request_queue *);
 extern void blk_set_queue_dying(struct request_queue *);
 
-- 
cgit v1.2.3


From a8698707a1835be3abd12a3b28079a80999f8dee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 May 2021 08:12:56 +0200
Subject: block: move bd_mutex to struct gendisk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the per-block device bd_mutex with a per-gendisk open_mutex,
thus simplifying locking wherever we deal with partitions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Link: https://lore.kernel.org/r/20210525061301.2242282-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/filesystems/locking.rst |  2 +-
 block/genhd.c                         |  7 ++++---
 block/partitions/core.c               | 24 ++++++++++-------------
 drivers/block/loop.c                  | 14 ++++++-------
 drivers/block/xen-blkfront.c          |  8 ++++----
 drivers/block/zram/zram_drv.c         | 18 ++++++++---------
 drivers/block/zram/zram_drv.h         |  2 +-
 drivers/md/md.h                       |  6 +++---
 drivers/s390/block/dasd_genhd.c       |  8 ++++----
 drivers/scsi/sd.c                     |  4 ++--
 fs/block_dev.c                        | 37 ++++++++++++++---------------------
 fs/btrfs/volumes.c                    |  2 +-
 fs/super.c                            |  8 ++++----
 include/linux/blk_types.h             |  1 -
 include/linux/genhd.h                 |  3 +++
 15 files changed, 68 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 1e894480115b..2183fd8cc350 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -480,7 +480,7 @@ prototypes::
 locking rules:
 
 ======================= ===================
-ops			bd_mutex
+ops			open_mutex
 ======================= ===================
 open:			yes
 release:		yes
diff --git a/block/genhd.c b/block/genhd.c
index efe0db4d62f0..38d136a19484 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -591,10 +591,10 @@ void del_gendisk(struct gendisk *disk)
 	blk_integrity_del(disk);
 	disk_del_events(disk);
 
-	mutex_lock(&disk->part0->bd_mutex);
+	mutex_lock(&disk->open_mutex);
 	disk->flags &= ~GENHD_FL_UP;
 	blk_drop_partitions(disk);
-	mutex_unlock(&disk->part0->bd_mutex);
+	mutex_unlock(&disk->open_mutex);
 
 	fsync_bdev(disk->part0);
 	__invalidate_device(disk->part0, true);
@@ -1273,6 +1273,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
 		goto out_free_disk;
 
 	disk->node_id = node_id;
+	mutex_init(&disk->open_mutex);
 	xa_init(&disk->part_tbl);
 	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
 		goto out_destroy_part_tbl;
@@ -1525,7 +1526,7 @@ void disk_unblock_events(struct gendisk *disk)
  * doesn't clear the events from @disk->ev.
  *
  * CONTEXT:
- * If @mask is non-zero must be called with bdev->bd_mutex held.
+ * If @mask is non-zero must be called with disk->open_mutex held.
  */
 void disk_flush_events(struct gendisk *disk, unsigned int mask)
 {
diff --git a/block/partitions/core.c b/block/partitions/core.c
index ada3e1e66989..4fde8e0dd7cd 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -283,7 +283,7 @@ struct device_type part_type = {
 };
 
 /*
- * Must be called either with bd_mutex held, before a disk can be opened or
+ * Must be called either with open_mutex held, before a disk can be opened or
  * after all disk users are gone.
  */
 static void delete_partition(struct block_device *part)
@@ -312,7 +312,7 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
 
 /*
- * Must be called either with bd_mutex held, before a disk can be opened or
+ * Must be called either with open_mutex held, before a disk can be opened or
  * after all disk users are gone.
  */
 static struct block_device *add_partition(struct gendisk *disk, int partno,
@@ -453,15 +453,15 @@ int bdev_add_partition(struct block_device *bdev, int partno,
 {
 	struct block_device *part;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 	if (partition_overlaps(bdev->bd_disk, start, length, -1)) {
-		mutex_unlock(&bdev->bd_mutex);
+		mutex_unlock(&bdev->bd_disk->open_mutex);
 		return -EBUSY;
 	}
 
 	part = add_partition(bdev->bd_disk, partno, start, length,
 			ADDPART_FLAG_NONE, NULL);
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 	return PTR_ERR_OR_ZERO(part);
 }
 
@@ -474,8 +474,7 @@ int bdev_del_partition(struct block_device *bdev, int partno)
 	if (!part)
 		return -ENXIO;
 
-	mutex_lock(&part->bd_mutex);
-	mutex_lock_nested(&bdev->bd_mutex, 1);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 
 	ret = -EBUSY;
 	if (part->bd_openers)
@@ -484,8 +483,7 @@ int bdev_del_partition(struct block_device *bdev, int partno)
 	delete_partition(part);
 	ret = 0;
 out_unlock:
-	mutex_unlock(&bdev->bd_mutex);
-	mutex_unlock(&part->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 	bdput(part);
 	return ret;
 }
@@ -500,8 +498,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno,
 	if (!part)
 		return -ENXIO;
 
-	mutex_lock(&part->bd_mutex);
-	mutex_lock_nested(&bdev->bd_mutex, 1);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 	ret = -EINVAL;
 	if (start != part->bd_start_sect)
 		goto out_unlock;
@@ -514,8 +511,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno,
 
 	ret = 0;
 out_unlock:
-	mutex_unlock(&part->bd_mutex);
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 	bdput(part);
 	return ret;
 }
@@ -541,7 +537,7 @@ void blk_drop_partitions(struct gendisk *disk)
 	struct block_device *part;
 	unsigned long idx;
 
-	lockdep_assert_held(&disk->part0->bd_mutex);
+	lockdep_assert_held(&disk->open_mutex);
 
 	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
 		if (!bdgrab(part))
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d58d68f3c7cd..95c570f5923f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -652,9 +652,9 @@ static void loop_reread_partitions(struct loop_device *lo,
 {
 	int rc;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 	rc = bdev_disk_changed(bdev, false);
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 	if (rc)
 		pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
 			__func__, lo->lo_number, lo->lo_file_name, rc);
@@ -747,7 +747,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	mutex_unlock(&lo->lo_mutex);
 	/*
 	 * We must drop file reference outside of lo_mutex as dropping
-	 * the file ref can take bd_mutex which creates circular locking
+	 * the file ref can take open_mutex which creates circular locking
 	 * dependency.
 	 */
 	fput(old_file);
@@ -1260,7 +1260,7 @@ out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 	if (partscan) {
 		/*
-		 * bd_mutex has been held already in release path, so don't
+		 * open_mutex has been held already in release path, so don't
 		 * acquire it if this function is called in such case.
 		 *
 		 * If the reread partition isn't from release path, lo_refcnt
@@ -1268,10 +1268,10 @@ out_unlock:
 		 * current holder is released.
 		 */
 		if (!release)
-			mutex_lock(&bdev->bd_mutex);
+			mutex_lock(&bdev->bd_disk->open_mutex);
 		err = bdev_disk_changed(bdev, false);
 		if (!release)
-			mutex_unlock(&bdev->bd_mutex);
+			mutex_unlock(&bdev->bd_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
 				__func__, lo_number, err);
@@ -1298,7 +1298,7 @@ out_unlock:
 	/*
 	 * Need not hold lo_mutex to fput backing file. Calling fput holding
 	 * lo_mutex triggers a circular lock dependency possibility warning as
-	 * fput can take bd_mutex which is usually taken before lo_mutex.
+	 * fput can take open_mutex which is usually taken before lo_mutex.
 	 */
 	if (filp)
 		fput(filp);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 10df39a8b18d..f2c1aedcdf5a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2163,7 +2163,7 @@ static void blkfront_closing(struct blkfront_info *info)
 		return;
 	}
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 
 	if (bdev->bd_openers) {
 		xenbus_dev_error(xbdev, -EBUSY,
@@ -2174,7 +2174,7 @@ static void blkfront_closing(struct blkfront_info *info)
 		xenbus_frontend_closed(xbdev);
 	}
 
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 	bdput(bdev);
 }
 
@@ -2531,7 +2531,7 @@ static int blkfront_remove(struct xenbus_device *xbdev)
 	 * isn't closed yet, we let release take care of it.
 	 */
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&disk->open_mutex);
 	info = disk->private_data;
 
 	dev_warn(disk_to_dev(disk),
@@ -2546,7 +2546,7 @@ static int blkfront_remove(struct xenbus_device *xbdev)
 		mutex_unlock(&blkfront_mutex);
 	}
 
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&disk->open_mutex);
 	bdput(bdev);
 
 	return 0;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 006416cc4969..fcaf2750f68f 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1781,24 +1781,24 @@ static ssize_t reset_store(struct device *dev,
 	zram = dev_to_zram(dev);
 	bdev = zram->disk->part0;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 	/* Do not reset an active device or claimed device */
 	if (bdev->bd_openers || zram->claim) {
-		mutex_unlock(&bdev->bd_mutex);
+		mutex_unlock(&bdev->bd_disk->open_mutex);
 		return -EBUSY;
 	}
 
 	/* From now on, anyone can't open /dev/zram[0-9] */
 	zram->claim = true;
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 
 	/* Make sure all the pending I/O are finished */
 	fsync_bdev(bdev);
 	zram_reset_device(zram);
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 	zram->claim = false;
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 
 	return len;
 }
@@ -1808,7 +1808,7 @@ static int zram_open(struct block_device *bdev, fmode_t mode)
 	int ret = 0;
 	struct zram *zram;
 
-	WARN_ON(!mutex_is_locked(&bdev->bd_mutex));
+	WARN_ON(!mutex_is_locked(&bdev->bd_disk->open_mutex));
 
 	zram = bdev->bd_disk->private_data;
 	/* zram was claimed to reset so open request fails */
@@ -1972,14 +1972,14 @@ static int zram_remove(struct zram *zram)
 {
 	struct block_device *bdev = zram->disk->part0;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 	if (bdev->bd_openers || zram->claim) {
-		mutex_unlock(&bdev->bd_mutex);
+		mutex_unlock(&bdev->bd_disk->open_mutex);
 		return -EBUSY;
 	}
 
 	zram->claim = true;
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 
 	zram_debugfs_unregister(zram);
 
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 419a7e8281ee..74c411911b6e 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -112,7 +112,7 @@ struct zram {
 	/*
 	 * zram is claimed so open request will be failed
 	 */
-	bool claim; /* Protected by bdev->bd_mutex */
+	bool claim; /* Protected by disk->open_mutex */
 	struct file *backing_dev;
 #ifdef CONFIG_ZRAM_WRITEBACK
 	spinlock_t wb_limit_lock;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fb7eab58cfd5..a88086d4110c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -395,10 +395,10 @@ struct mddev {
 	 * that we are never stopping an array while it is open.
 	 * 'reconfig_mutex' protects all other reconfiguration.
 	 * These locks are separate due to conflicting interactions
-	 * with bdev->bd_mutex.
+	 * with disk->open_mutex.
 	 * Lock ordering is:
-	 *  reconfig_mutex -> bd_mutex
-	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
+	 *  reconfig_mutex -> disk->open_mutex
+	 *  disk->open_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
 	 */
 	struct mutex			open_mutex;
 	struct mutex			reconfig_mutex;
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 8d6587ec73e2..bf2082d461c7 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -109,9 +109,9 @@ int dasd_scan_partitions(struct dasd_block *block)
 		return -ENODEV;
 	}
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&block->gdp->open_mutex);
 	rc = bdev_disk_changed(bdev, false);
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&block->gdp->open_mutex);
 	if (rc)
 		DBF_DEV_EVENT(DBF_ERR, block->base,
 				"scan partitions error, rc %d", rc);
@@ -145,9 +145,9 @@ void dasd_destroy_partitions(struct dasd_block *block)
 	bdev = block->bdev;
 	block->bdev = NULL;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 	bdev_disk_changed(bdev, true);
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 
 	/* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */
 	blkdev_put(bdev, FMODE_READ);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index cb3c37d1e009..d3ff723af879 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1400,7 +1400,7 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt)
  *	In the latter case @inode and @filp carry an abridged amount
  *	of information as noted above.
  *
- *	Locking: called with bdev->bd_mutex held.
+ *	Locking: called with bdev->bd_disk->open_mutex held.
  **/
 static int sd_open(struct block_device *bdev, fmode_t mode)
 {
@@ -1476,7 +1476,7 @@ error_out:
  *	Note: may block (uninterruptible) if error recovery is underway
  *	on this disk.
  *
- *	Locking: called with bdev->bd_mutex held.
+ *	Locking: called with bdev->bd_disk->open_mutex held.
  **/
 static void sd_release(struct gendisk *disk, fmode_t mode)
 {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 41d2d9708bf8..e094806c3a0c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -895,7 +895,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
 
 	bdev = I_BDEV(inode);
-	mutex_init(&bdev->bd_mutex);
 	mutex_init(&bdev->bd_fsfreeze_mutex);
 	spin_lock_init(&bdev->bd_size_lock);
 	bdev->bd_disk = disk;
@@ -1154,7 +1153,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	struct bd_holder_disk *holder;
 	int ret = 0;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 
 	WARN_ON_ONCE(!bdev->bd_holder);
 
@@ -1199,7 +1198,7 @@ out_del:
 out_free:
 	kfree(holder);
 out_unlock:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
@@ -1218,7 +1217,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 {
 	struct bd_holder_disk *holder;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_disk->open_mutex);
 
 	holder = bd_find_holder_disk(bdev, disk);
 
@@ -1230,7 +1229,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 		kfree(holder);
 	}
 
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_disk->open_mutex);
 }
 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
 #endif
@@ -1242,7 +1241,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
 	struct gendisk *disk = bdev->bd_disk;
 	int ret = 0;
 
-	lockdep_assert_held(&bdev->bd_mutex);
+	lockdep_assert_held(&disk->open_mutex);
 
 	if (!(disk->flags & GENHD_FL_UP))
 		return -ENXIO;
@@ -1327,14 +1326,10 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode)
 		goto done;
 
 	whole = bdgrab(disk->part0);
-	mutex_lock_nested(&whole->bd_mutex, 1);
 	ret = blkdev_get_whole(whole, mode);
-	if (ret) {
-		mutex_unlock(&whole->bd_mutex);
+	if (ret)
 		goto out_put_whole;
-	}
 	whole->bd_part_count++;
-	mutex_unlock(&whole->bd_mutex);
 
 	ret = -ENXIO;
 	if (!bdev_nr_sectors(part))
@@ -1437,7 +1432,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
 
 	disk_block_events(disk);
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&disk->open_mutex);
 	ret = -ENXIO;
 	if (!(disk->flags & GENHD_FL_UP))
 		goto abort_claiming;
@@ -1463,7 +1458,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
 			unblock_events = false;
 		}
 	}
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&disk->open_mutex);
 
 	if (unblock_events)
 		disk_unblock_events(disk);
@@ -1472,7 +1467,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
 abort_claiming:
 	if (mode & FMODE_EXCL)
 		bd_abort_claiming(bdev, holder);
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&disk->open_mutex);
 	disk_unblock_events(disk);
 put_blkdev:
 	blkdev_put_no_open(bdev);
@@ -1552,7 +1547,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	struct gendisk *disk = bdev->bd_disk;
 	struct block_device *victim = NULL;
 
-	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (for_part)
 		bdev->bd_part_count--;
 
@@ -1567,7 +1561,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 
 	if (!bdev_is_partition(bdev) && disk->fops->release)
 		disk->fops->release(disk, mode);
-	mutex_unlock(&bdev->bd_mutex);
 	if (victim) {
 		__blkdev_put(victim, mode, 1);
 		bdput(victim);
@@ -1588,15 +1581,14 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
 	if (bdev->bd_openers == 1)
 		sync_blockdev(bdev);
 
-	mutex_lock(&bdev->bd_mutex);
-
+	mutex_lock(&disk->open_mutex);
 	if (mode & FMODE_EXCL) {
 		struct block_device *whole = bdev_whole(bdev);
 		bool bdev_free;
 
 		/*
 		 * Release a claim on the device.  The holder fields
-		 * are protected with bdev_lock.  bd_mutex is to
+		 * are protected with bdev_lock.  open_mutex is to
 		 * synchronize disk_holder unlinking.
 		 */
 		spin_lock(&bdev_lock);
@@ -1627,9 +1619,10 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
 	 * from userland - e.g. eject(1).
 	 */
 	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
-	mutex_unlock(&bdev->bd_mutex);
 
 	__blkdev_put(bdev, mode, 0);
+	mutex_unlock(&disk->open_mutex);
+
 	blkdev_put_no_open(bdev);
 }
 EXPORT_SYMBOL(blkdev_put);
@@ -1936,10 +1929,10 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
 		old_inode = inode;
 		bdev = I_BDEV(inode);
 
-		mutex_lock(&bdev->bd_mutex);
+		mutex_lock(&bdev->bd_disk->open_mutex);
 		if (bdev->bd_openers)
 			func(bdev, arg);
-		mutex_unlock(&bdev->bd_mutex);
+		mutex_unlock(&bdev->bd_disk->open_mutex);
 
 		spin_lock(&blockdev_superblock->s_inode_list_lock);
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 47d27059d064..f246eb2772e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1247,7 +1247,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	lockdep_assert_held(&uuid_mutex);
 	/*
 	 * The device_list_mutex cannot be taken here in case opening the
-	 * underlying device takes further locks like bd_mutex.
+	 * underlying device takes further locks like open_mutex.
 	 *
 	 * We also don't need the lock here as this is called during mount and
 	 * exclusion is provided by uuid_mutex
diff --git a/fs/super.c b/fs/super.c
index 11b7e7213fd1..91b7f156735b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1277,9 +1277,9 @@ int get_tree_bdev(struct fs_context *fc,
 		}
 
 		/*
-		 * s_umount nests inside bd_mutex during
+		 * s_umount nests inside open_mutex during
 		 * __invalidate_device().  blkdev_put() acquires
-		 * bd_mutex and can't be called under s_umount.  Drop
+		 * open_mutex and can't be called under s_umount.  Drop
 		 * s_umount temporarily.  This is safe as we're
 		 * holding an active reference.
 		 */
@@ -1352,9 +1352,9 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 		}
 
 		/*
-		 * s_umount nests inside bd_mutex during
+		 * s_umount nests inside open_mutex during
 		 * __invalidate_device().  blkdev_put() acquires
-		 * bd_mutex and can't be called under s_umount.  Drop
+		 * open_mutex and can't be called under s_umount.  Drop
 		 * s_umount temporarily.  This is safe as we're
 		 * holding an active reference.
 		 */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index db026b6ec15a..a09660671fa4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -29,7 +29,6 @@ struct block_device {
 	int			bd_openers;
 	struct inode *		bd_inode;	/* will die */
 	struct super_block *	bd_super;
-	struct mutex		bd_mutex;	/* open/close mutex */
 	void *			bd_claiming;
 	struct device		bd_device;
 	void *			bd_holder;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 782f0171d104..1fabb1559110 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -154,6 +154,9 @@ struct gendisk {
 #define GD_NEED_PART_SCAN		0
 #define GD_READ_ONLY			1
 #define GD_QUEUE_REF			2
+
+	struct mutex open_mutex;	/* open/close mutex */
+
 	struct kobject *slave_dir;
 
 	struct timer_rand_state *random;
-- 
cgit v1.2.3


From ab4b57057d744861f670b47b163209727b26418b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 May 2021 08:12:59 +0200
Subject: block: move bd_part_count to struct gendisk

The bd_part_count value only makes sense for whole devices, so move it
to struct gendisk and give it a more descriptive name.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210525061301.2242282-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/ioctl.c             | 2 +-
 fs/block_dev.c            | 6 +++---
 include/linux/blk_types.h | 3 ---
 include/linux/genhd.h     | 1 +
 4 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index 8ba1ed8defd0..24beec9ca9c9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -89,7 +89,7 @@ static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (bdev->bd_part_count)
+	if (bdev->bd_disk->open_partitions)
 		return -EBUSY;
 
 	/*
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cd45b54e86b4..ac9b3c158a77 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1253,7 +1253,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
 		return -ENXIO;
 
 rescan:
-	if (bdev->bd_part_count)
+	if (disk->open_partitions)
 		return -EBUSY;
 	sync_blockdev(bdev);
 	invalidate_bdev(bdev);
@@ -1348,7 +1348,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode)
 	if (!bdev_nr_sectors(part))
 		goto out_blkdev_put;
 
-	whole->bd_part_count++;
+	disk->open_partitions++;
 	set_init_blocksize(part);
 	if (part->bd_bdi == &noop_backing_dev_info)
 		part->bd_bdi = bdi_get(disk->queue->backing_dev_info);
@@ -1370,7 +1370,7 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode)
 	if (--part->bd_openers)
 		return;
 	blkdev_flush_mapping(part);
-	whole->bd_part_count--;
+	whole->bd_disk->open_partitions--;
 	blkdev_put_whole(whole, mode);
 	bdput(whole);
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a09660671fa4..fd3860d18d7e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -39,9 +39,6 @@ struct block_device {
 #endif
 	struct kobject		*bd_holder_dir;
 	u8			bd_partno;
-	/* number of times partitions within this device have been opened. */
-	unsigned		bd_part_count;
-
 	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
 	struct gendisk *	bd_disk;
 	struct backing_dev_info *bd_bdi;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 1fabb1559110..47d4605c0e7e 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -156,6 +156,7 @@ struct gendisk {
 #define GD_QUEUE_REF			2
 
 	struct mutex open_mutex;	/* open/close mutex */
+	unsigned open_partitions;	/* number of open partitions */
 
 	struct kobject *slave_dir;
 
-- 
cgit v1.2.3


From c97d93c31e5734a16bfe663085ec91b8c9fb20f9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 May 2021 08:13:00 +0200
Subject: block: factor out a part_devt helper

Add a helper to find the dev_t for a disk + partno tuple.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210525061301.2242282-8-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c         | 25 +++++++++++++++++--------
 include/linux/genhd.h |  1 +
 init/do_mounts.c      | 10 ++--------
 3 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 38d136a19484..3f7b1c92c7f3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1227,6 +1227,19 @@ static int __init proc_genhd_init(void)
 module_init(proc_genhd_init);
 #endif /* CONFIG_PROC_FS */
 
+dev_t part_devt(struct gendisk *disk, u8 partno)
+{
+	struct block_device *part = bdget_disk(disk, partno);
+	dev_t devt = 0;
+
+	if (part) {
+		devt = part->bd_dev;
+		bdput(part);
+	}
+
+	return devt;
+}
+
 dev_t blk_lookup_devt(const char *name, int partno)
 {
 	dev_t devt = MKDEV(0, 0);
@@ -1236,7 +1249,6 @@ dev_t blk_lookup_devt(const char *name, int partno)
 	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
 	while ((dev = class_dev_iter_next(&iter))) {
 		struct gendisk *disk = dev_to_disk(dev);
-		struct block_device *part;
 
 		if (strcmp(dev_name(dev), name))
 			continue;
@@ -1247,13 +1259,10 @@ dev_t blk_lookup_devt(const char *name, int partno)
 			 */
 			devt = MKDEV(MAJOR(dev->devt),
 				     MINOR(dev->devt) + partno);
-			break;
-		}
-		part = bdget_disk(disk, partno);
-		if (part) {
-			devt = part->bd_dev;
-			bdput(part);
-			break;
+		} else {
+			devt = part_devt(disk, partno);
+			if (devt)
+				break;
 		}
 	}
 	class_dev_iter_exit(&iter);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 47d4605c0e7e..64a8431202b7 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -333,6 +333,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev,
 }
 #endif /* CONFIG_SYSFS */
 
+dev_t part_devt(struct gendisk *disk, u8 partno);
 dev_t blk_lookup_devt(const char *name, int partno);
 void blk_request_module(dev_t devt);
 #ifdef CONFIG_BLOCK
diff --git a/init/do_mounts.c b/init/do_mounts.c
index a78e44ee6adb..74aede860de7 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -133,14 +133,8 @@ static dev_t devt_from_partuuid(const char *uuid_str)
 		 * Attempt to find the requested partition by adding an offset
 		 * to the partition number found by UUID.
 		 */
-		struct block_device *part;
-
-		part = bdget_disk(dev_to_disk(dev),
-				  dev_to_bdev(dev)->bd_partno + offset);
-		if (part) {
-			devt = part->bd_dev;
-			bdput(part);
-		}
+		devt = part_devt(dev_to_disk(dev),
+				 dev_to_bdev(dev)->bd_partno + offset);
 	} else {
 		devt = dev->devt;
 	}
-- 
cgit v1.2.3


From 0e0ccdecb3cff95a350b4364e7ebbaa754d0e47d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 May 2021 08:13:01 +0200
Subject: block: remove bdget_disk

Just opencode the xa_load in the callers, as none of them actually
needs a reference to the bdev.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210525061301.2242282-9-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c           | 35 +++++------------------------------
 block/partitions/core.c | 25 ++++++++++++-------------
 include/linux/genhd.h   |  1 -
 3 files changed, 17 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 3f7b1c92c7f3..5f5628216295 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -676,32 +676,6 @@ void blk_request_module(dev_t devt)
 		request_module("block-major-%d", MAJOR(devt));
 }
 
-/**
- * bdget_disk - do bdget() by gendisk and partition number
- * @disk: gendisk of interest
- * @partno: partition number
- *
- * Find partition @partno from @disk, do bdget() on it.
- *
- * CONTEXT:
- * Don't care.
- *
- * RETURNS:
- * Resulting block_device on success, NULL on failure.
- */
-struct block_device *bdget_disk(struct gendisk *disk, int partno)
-{
-	struct block_device *bdev = NULL;
-
-	rcu_read_lock();
-	bdev = xa_load(&disk->part_tbl, partno);
-	if (bdev && !bdgrab(bdev))
-		bdev = NULL;
-	rcu_read_unlock();
-
-	return bdev;
-}
-
 /*
  * print a full list of all partitions - intended for places where the root
  * filesystem can't be mounted and thus to give the victim some idea of what
@@ -1229,13 +1203,14 @@ module_init(proc_genhd_init);
 
 dev_t part_devt(struct gendisk *disk, u8 partno)
 {
-	struct block_device *part = bdget_disk(disk, partno);
+	struct block_device *part;
 	dev_t devt = 0;
 
-	if (part) {
+	rcu_read_lock();
+	part = xa_load(&disk->part_tbl, partno);
+	if (part)
 		devt = part->bd_dev;
-		bdput(part);
-	}
+	rcu_read_unlock();
 
 	return devt;
 }
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 4fde8e0dd7cd..186d4fbd9f09 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -326,6 +326,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	const char *dname;
 	int err;
 
+	lockdep_assert_held(&disk->open_mutex);
+
 	if (partno >= disk_max_parts(disk))
 		return ERR_PTR(-EINVAL);
 
@@ -467,14 +469,13 @@ int bdev_add_partition(struct block_device *bdev, int partno,
 
 int bdev_del_partition(struct block_device *bdev, int partno)
 {
-	struct block_device *part;
-	int ret;
-
-	part = bdget_disk(bdev->bd_disk, partno);
-	if (!part)
-		return -ENXIO;
+	struct block_device *part = NULL;
+	int ret = -ENXIO;
 
 	mutex_lock(&bdev->bd_disk->open_mutex);
+	part = xa_load(&bdev->bd_disk->part_tbl, partno);
+	if (!part)
+		goto out_unlock;
 
 	ret = -EBUSY;
 	if (part->bd_openers)
@@ -484,21 +485,20 @@ int bdev_del_partition(struct block_device *bdev, int partno)
 	ret = 0;
 out_unlock:
 	mutex_unlock(&bdev->bd_disk->open_mutex);
-	bdput(part);
 	return ret;
 }
 
 int bdev_resize_partition(struct block_device *bdev, int partno,
 		sector_t start, sector_t length)
 {
-	struct block_device *part;
-	int ret = 0;
+	struct block_device *part = NULL;
+	int ret = -ENXIO;
 
-	part = bdget_disk(bdev->bd_disk, partno);
+	mutex_lock(&bdev->bd_disk->open_mutex);
+	part = xa_load(&bdev->bd_disk->part_tbl, partno);
 	if (!part)
-		return -ENXIO;
+		goto out_unlock;
 
-	mutex_lock(&bdev->bd_disk->open_mutex);
 	ret = -EINVAL;
 	if (start != part->bd_start_sect)
 		goto out_unlock;
@@ -512,7 +512,6 @@ int bdev_resize_partition(struct block_device *bdev, int partno,
 	ret = 0;
 out_unlock:
 	mutex_unlock(&bdev->bd_disk->open_mutex);
-	bdput(part);
 	return ret;
 }
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 64a8431202b7..03d684f0498f 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -223,7 +223,6 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk)
 }
 
 extern void del_gendisk(struct gendisk *gp);
-extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
 
 void set_disk_ro(struct gendisk *disk, bool read_only);
 
-- 
cgit v1.2.3


From ec6aba3d2be1ed75b3f4c894bb64a36d40db1f55 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 May 2021 09:25:19 +0200
Subject: kprobes: Remove kprobe::fault_handler

The reason for kprobe::fault_handler(), as given by their comment:

 * We come here because instructions in the pre/post
 * handler caused the page_fault, this could happen
 * if handler tries to access user space by
 * copy_from_user(), get_user() etc. Let the
 * user-specified handler try to fix it first.

Is just plain bad. Those other handlers are ran from non-preemptible
context and had better use _nofault() functions. Also, there is no
upstream usage of this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20210525073213.561116662@infradead.org
---
 Documentation/trace/kprobes.rst    | 24 +++++-------------------
 arch/arc/kernel/kprobes.c          | 10 ----------
 arch/arm/probes/kprobes/core.c     |  9 ---------
 arch/arm64/kernel/probes/kprobes.c | 10 ----------
 arch/csky/kernel/probes/kprobes.c  | 10 ----------
 arch/ia64/kernel/kprobes.c         |  9 ---------
 arch/mips/kernel/kprobes.c         |  3 ---
 arch/powerpc/kernel/kprobes.c      | 10 ----------
 arch/riscv/kernel/probes/kprobes.c | 10 ----------
 arch/s390/kernel/kprobes.c         | 10 ----------
 arch/sh/kernel/kprobes.c           | 10 ----------
 arch/sparc/kernel/kprobes.c        | 10 ----------
 arch/x86/kernel/kprobes/core.c     | 10 ----------
 include/linux/kprobes.h            |  8 --------
 kernel/kprobes.c                   | 19 -------------------
 samples/kprobes/kprobe_example.c   | 15 ---------------
 16 files changed, 5 insertions(+), 172 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/trace/kprobes.rst b/Documentation/trace/kprobes.rst
index b757b6dfd3d4..998149ce2fd9 100644
--- a/Documentation/trace/kprobes.rst
+++ b/Documentation/trace/kprobes.rst
@@ -362,14 +362,11 @@ register_kprobe
 	#include <linux/kprobes.h>
 	int register_kprobe(struct kprobe *kp);
 
-Sets a breakpoint at the address kp->addr.  When the breakpoint is
-hit, Kprobes calls kp->pre_handler.  After the probed instruction
-is single-stepped, Kprobe calls kp->post_handler.  If a fault
-occurs during execution of kp->pre_handler or kp->post_handler,
-or during single-stepping of the probed instruction, Kprobes calls
-kp->fault_handler.  Any or all handlers can be NULL. If kp->flags
-is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled,
-so, its handlers aren't hit until calling enable_kprobe(kp).
+Sets a breakpoint at the address kp->addr.  When the breakpoint is hit, Kprobes
+calls kp->pre_handler.  After the probed instruction is single-stepped, Kprobe
+calls kp->post_handler.  Any or all handlers can be NULL. If kp->flags is set
+KPROBE_FLAG_DISABLED, that kp will be registered but disabled, so, its handlers
+aren't hit until calling enable_kprobe(kp).
 
 .. note::
 
@@ -415,17 +412,6 @@ User's post-handler (kp->post_handler)::
 p and regs are as described for the pre_handler.  flags always seems
 to be zero.
 
-User's fault-handler (kp->fault_handler)::
-
-	#include <linux/kprobes.h>
-	#include <linux/ptrace.h>
-	int fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr);
-
-p and regs are as described for the pre_handler.  trapnr is the
-architecture-specific trap number associated with the fault (e.g.,
-on i386, 13 for a general protection fault or 14 for a page fault).
-Returns 1 if it successfully handled the exception.
-
 register_kretprobe
 ------------------
 
diff --git a/arch/arc/kernel/kprobes.c b/arch/arc/kernel/kprobes.c
index cabef45f11df..9f5b39f38736 100644
--- a/arch/arc/kernel/kprobes.c
+++ b/arch/arc/kernel/kprobes.c
@@ -323,16 +323,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned long trapnr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-			return 1;
-
 		/*
 		 * In case the user-specified fault handler returned zero,
 		 * try to fix up.
diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c
index a9653117ca0d..7b9b9a5a409b 100644
--- a/arch/arm/probes/kprobes/core.c
+++ b/arch/arm/probes/kprobes/core.c
@@ -358,15 +358,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, fsr))
-			return 1;
 		break;
 
 	default:
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index d607c9912025..f6b088e9fa70 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -283,16 +283,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, fsr))
-			return 1;
-
 		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c
index 589f090f48b9..e0e973e49770 100644
--- a/arch/csky/kernel/probes/kprobes.c
+++ b/arch/csky/kernel/probes/kprobes.c
@@ -301,16 +301,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-			return 1;
-
 		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index fc1ff8a4d7de..6efed4ecff9e 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -850,15 +850,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-			return 1;
 		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c
index 54dfba8fa77c..75bff0f77319 100644
--- a/arch/mips/kernel/kprobes.c
+++ b/arch/mips/kernel/kprobes.c
@@ -403,9 +403,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
-	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-		return 1;
-
 	if (kcb->kprobe_status & KPROBE_HIT_SS) {
 		resume_execution(cur, regs, kcb);
 		regs->cp0_status |= kcb->kprobe_old_SR;
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 01ab2163659e..75b4e874269d 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -508,16 +508,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-			return 1;
-
 		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c
index 10b965c34536..923b5ea396ea 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -283,16 +283,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-			return 1;
-
 		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index aae24dc75df6..ad631e33df24 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -452,16 +452,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
 		 */
 		kprobes_inc_nmissed_count(p);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (p->fault_handler && p->fault_handler(p, regs, trapnr))
-			return 1;
-
 		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c
index 756100b01e84..58263420ad2a 100644
--- a/arch/sh/kernel/kprobes.c
+++ b/arch/sh/kernel/kprobes.c
@@ -389,16 +389,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-			return 1;
-
 		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c
index 217c21a6986a..db4e341b4b6e 100644
--- a/arch/sparc/kernel/kprobes.c
+++ b/arch/sparc/kernel/kprobes.c
@@ -352,16 +352,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 */
 		kprobes_inc_nmissed_count(cur);
 
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-			return 1;
-
 		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d3d65545cb8b..cfcdf4b8a306 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1110,16 +1110,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 		 * these specific fault cases.
 		 */
 		kprobes_inc_nmissed_count(cur);
-
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-			return 1;
 	}
 
 	return 0;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 1883a4a9f16a..523ffc7bc3a8 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -54,8 +54,6 @@ struct kretprobe_instance;
 typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *);
 typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *,
 				       unsigned long flags);
-typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *,
-				       int trapnr);
 typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
 				    struct pt_regs *);
 
@@ -83,12 +81,6 @@ struct kprobe {
 	/* Called after addr is executed, unless... */
 	kprobe_post_handler_t post_handler;
 
-	/*
-	 * ... called if executing addr causes a fault (eg. page fault).
-	 * Return 1 if it handled fault, otherwise kernel will see it.
-	 */
-	kprobe_fault_handler_t fault_handler;
-
 	/* Saved opcode (which has been replaced with breakpoint) */
 	kprobe_opcode_t opcode;
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 745f08fdd7a6..e41385afe79d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1183,23 +1183,6 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 }
 NOKPROBE_SYMBOL(aggr_post_handler);
 
-static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
-			      int trapnr)
-{
-	struct kprobe *cur = __this_cpu_read(kprobe_instance);
-
-	/*
-	 * if we faulted "during" the execution of a user specified
-	 * probe handler, invoke just that probe's fault handler
-	 */
-	if (cur && cur->fault_handler) {
-		if (cur->fault_handler(cur, regs, trapnr))
-			return 1;
-	}
-	return 0;
-}
-NOKPROBE_SYMBOL(aggr_fault_handler);
-
 /* Walks the list and increments nmissed count for multiprobe case */
 void kprobes_inc_nmissed_count(struct kprobe *p)
 {
@@ -1330,7 +1313,6 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 	ap->addr = p->addr;
 	ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
 	ap->pre_handler = aggr_pre_handler;
-	ap->fault_handler = aggr_fault_handler;
 	/* We don't care the kprobe which has gone. */
 	if (p->post_handler && !kprobe_gone(p))
 		ap->post_handler = aggr_post_handler;
@@ -2014,7 +1996,6 @@ int register_kretprobe(struct kretprobe *rp)
 
 	rp->kp.pre_handler = pre_handler_kretprobe;
 	rp->kp.post_handler = NULL;
-	rp->kp.fault_handler = NULL;
 
 	/* Pre-allocate memory for max kretprobe instances */
 	if (rp->maxactive <= 0) {
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c
index c495664c0a9b..4b2f31828951 100644
--- a/samples/kprobes/kprobe_example.c
+++ b/samples/kprobes/kprobe_example.c
@@ -94,26 +94,11 @@ static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs,
 #endif
 }
 
-/*
- * fault_handler: this is called if an exception is generated for any
- * instruction within the pre- or post-handler, or when Kprobes
- * single-steps the probed instruction.
- */
-static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
-{
-	pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr);
-	/* Return 0 because we don't handle the fault. */
-	return 0;
-}
-/* NOKPROBE_SYMBOL() is also available */
-NOKPROBE_SYMBOL(handler_fault);
-
 static int __init kprobe_init(void)
 {
 	int ret;
 	kp.pre_handler = handler_pre;
 	kp.post_handler = handler_post;
-	kp.fault_handler = handler_fault;
 
 	ret = register_kprobe(&kp);
 	if (ret < 0) {
-- 
cgit v1.2.3


From 0b78f8bcf4951af30b0ae83ea4fad27d641ab617 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Tue, 1 Jun 2021 15:30:30 +0100
Subject: Revert "fb_defio: Remove custom address_space_operations"

Commit ccf953d8f3d6 makes framebuffers which use deferred I/O stop
displaying updates after the first one.  This is because the pages
handled by fb_defio no longer have a page_mapping().  That prevents
page_mkclean() from marking the PTEs as clean, and so writes are only
noticed the first time.

Reported-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/YLZEhv0cpZp8uVE3@casper.infradead.org
---
 drivers/video/fbdev/core/fb_defio.c | 35 +++++++++++++++++++++++++++++++++++
 drivers/video/fbdev/core/fbmem.c    |  4 ++++
 include/linux/fb.h                  |  3 +++
 3 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c
index b292887a2481..a591d291b231 100644
--- a/drivers/video/fbdev/core/fb_defio.c
+++ b/drivers/video/fbdev/core/fb_defio.c
@@ -52,6 +52,13 @@ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	get_page(page);
+
+	if (vmf->vma->vm_file)
+		page->mapping = vmf->vma->vm_file->f_mapping;
+	else
+		printk(KERN_ERR "no mapping available\n");
+
+	BUG_ON(!page->mapping);
 	page->index = vmf->pgoff;
 
 	vmf->page = page;
@@ -144,6 +151,17 @@ static const struct vm_operations_struct fb_deferred_io_vm_ops = {
 	.page_mkwrite	= fb_deferred_io_mkwrite,
 };
 
+static int fb_deferred_io_set_page_dirty(struct page *page)
+{
+	if (!PageDirty(page))
+		SetPageDirty(page);
+	return 0;
+}
+
+static const struct address_space_operations fb_deferred_io_aops = {
+	.set_page_dirty = fb_deferred_io_set_page_dirty,
+};
+
 int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	vma->vm_ops = &fb_deferred_io_vm_ops;
@@ -194,12 +212,29 @@ void fb_deferred_io_init(struct fb_info *info)
 }
 EXPORT_SYMBOL_GPL(fb_deferred_io_init);
 
+void fb_deferred_io_open(struct fb_info *info,
+			 struct inode *inode,
+			 struct file *file)
+{
+	file->f_mapping->a_ops = &fb_deferred_io_aops;
+}
+EXPORT_SYMBOL_GPL(fb_deferred_io_open);
+
 void fb_deferred_io_cleanup(struct fb_info *info)
 {
 	struct fb_deferred_io *fbdefio = info->fbdefio;
+	struct page *page;
+	int i;
 
 	BUG_ON(!fbdefio);
 	cancel_delayed_work_sync(&info->deferred_work);
+
+	/* clear out the mapping that we setup */
+	for (i = 0 ; i < info->fix.smem_len; i += PAGE_SIZE) {
+		page = fb_deferred_io_page(info, i);
+		page->mapping = NULL;
+	}
+
 	mutex_destroy(&fbdefio->lock);
 }
 EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup);
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 072780b0e570..98f193078c05 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1415,6 +1415,10 @@ __releases(&info->lock)
 		if (res)
 			module_put(info->fbops->owner);
 	}
+#ifdef CONFIG_FB_DEFERRED_IO
+	if (info->fbdefio)
+		fb_deferred_io_open(info, inode, file);
+#endif
 out:
 	unlock_fb_info(info);
 	if (res)
diff --git a/include/linux/fb.h b/include/linux/fb.h
index a8dccd23c249..ecfbcc0553a5 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -659,6 +659,9 @@ static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch,
 /* drivers/video/fb_defio.c */
 int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma);
 extern void fb_deferred_io_init(struct fb_info *info);
+extern void fb_deferred_io_open(struct fb_info *info,
+				struct inode *inode,
+				struct file *file);
 extern void fb_deferred_io_cleanup(struct fb_info *info);
 extern int fb_deferred_io_fsync(struct file *file, loff_t start,
 				loff_t end, int datasync);
-- 
cgit v1.2.3


From d8570c182f56ca52c98734732fb9a331f7c23f9a Mon Sep 17 00:00:00 2001
From: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Date: Wed, 26 May 2021 14:52:00 +0800
Subject: mfd: mt6358: Refine interrupt code

This patch refines the interrupt related code to support new chips.

Signed-off-by: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mt6358-irq.c        | 65 ++++++++++++++++++++++++-----------------
 include/linux/mfd/mt6358/core.h |  8 ++---
 2 files changed, 41 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6358-irq.c b/drivers/mfd/mt6358-irq.c
index db734f2831ff..4b094e5e51cc 100644
--- a/drivers/mfd/mt6358-irq.c
+++ b/drivers/mfd/mt6358-irq.c
@@ -13,7 +13,9 @@
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 
-static struct irq_top_t mt6358_ints[] = {
+#define MTK_PMIC_REG_WIDTH 16
+
+static const struct irq_top_t mt6358_ints[] = {
 	MT6358_TOP_GEN(BUCK),
 	MT6358_TOP_GEN(LDO),
 	MT6358_TOP_GEN(PSC),
@@ -24,6 +26,13 @@ static struct irq_top_t mt6358_ints[] = {
 	MT6358_TOP_GEN(MISC),
 };
 
+static struct pmic_irq_data mt6358_irqd = {
+	.num_top = ARRAY_SIZE(mt6358_ints),
+	.num_pmic_irqs = MT6358_IRQ_NR,
+	.top_int_status_reg = MT6358_TOP_INT_STATUS0,
+	.pmic_ints = mt6358_ints,
+};
+
 static void pmic_irq_enable(struct irq_data *data)
 {
 	unsigned int hwirq = irqd_to_hwirq(data);
@@ -62,15 +71,15 @@ static void pmic_irq_sync_unlock(struct irq_data *data)
 		/* Find out the IRQ group */
 		top_gp = 0;
 		while ((top_gp + 1) < irqd->num_top &&
-		       i >= mt6358_ints[top_gp + 1].hwirq_base)
+		       i >= irqd->pmic_ints[top_gp + 1].hwirq_base)
 			top_gp++;
 
 		/* Find the IRQ registers */
-		gp_offset = i - mt6358_ints[top_gp].hwirq_base;
-		int_regs = gp_offset / MT6358_REG_WIDTH;
-		shift = gp_offset % MT6358_REG_WIDTH;
-		en_reg = mt6358_ints[top_gp].en_reg +
-			 (mt6358_ints[top_gp].en_reg_shift * int_regs);
+		gp_offset = i - irqd->pmic_ints[top_gp].hwirq_base;
+		int_regs = gp_offset / MTK_PMIC_REG_WIDTH;
+		shift = gp_offset % MTK_PMIC_REG_WIDTH;
+		en_reg = irqd->pmic_ints[top_gp].en_reg +
+			 (irqd->pmic_ints[top_gp].en_reg_shift * int_regs);
 
 		regmap_update_bits(chip->regmap, en_reg, BIT(shift),
 				   irqd->enable_hwirq[i] << shift);
@@ -95,10 +104,11 @@ static void mt6358_irq_sp_handler(struct mt6397_chip *chip,
 	unsigned int irq_status, sta_reg, status;
 	unsigned int hwirq, virq;
 	int i, j, ret;
+	struct pmic_irq_data *irqd = chip->irq_data;
 
-	for (i = 0; i < mt6358_ints[top_gp].num_int_regs; i++) {
-		sta_reg = mt6358_ints[top_gp].sta_reg +
-			mt6358_ints[top_gp].sta_reg_shift * i;
+	for (i = 0; i < irqd->pmic_ints[top_gp].num_int_regs; i++) {
+		sta_reg = irqd->pmic_ints[top_gp].sta_reg +
+			irqd->pmic_ints[top_gp].sta_reg_shift * i;
 
 		ret = regmap_read(chip->regmap, sta_reg, &irq_status);
 		if (ret) {
@@ -114,8 +124,8 @@ static void mt6358_irq_sp_handler(struct mt6397_chip *chip,
 		do {
 			j = __ffs(status);
 
-			hwirq = mt6358_ints[top_gp].hwirq_base +
-				MT6358_REG_WIDTH * i + j;
+			hwirq = irqd->pmic_ints[top_gp].hwirq_base +
+				MTK_PMIC_REG_WIDTH * i + j;
 
 			virq = irq_find_mapping(chip->irq_domain, hwirq);
 			if (virq)
@@ -131,12 +141,12 @@ static void mt6358_irq_sp_handler(struct mt6397_chip *chip,
 static irqreturn_t mt6358_irq_handler(int irq, void *data)
 {
 	struct mt6397_chip *chip = data;
-	struct pmic_irq_data *mt6358_irq_data = chip->irq_data;
+	struct pmic_irq_data *irqd = chip->irq_data;
 	unsigned int bit, i, top_irq_status = 0;
 	int ret;
 
 	ret = regmap_read(chip->regmap,
-			  mt6358_irq_data->top_int_status_reg,
+			  irqd->top_int_status_reg,
 			  &top_irq_status);
 	if (ret) {
 		dev_err(chip->dev,
@@ -144,8 +154,8 @@ static irqreturn_t mt6358_irq_handler(int irq, void *data)
 		return IRQ_NONE;
 	}
 
-	for (i = 0; i < mt6358_irq_data->num_top; i++) {
-		bit = BIT(mt6358_ints[i].top_offset);
+	for (i = 0; i < irqd->num_top; i++) {
+		bit = BIT(irqd->pmic_ints[i].top_offset);
 		if (top_irq_status & bit) {
 			mt6358_irq_sp_handler(chip, i);
 			top_irq_status &= ~bit;
@@ -180,17 +190,18 @@ int mt6358_irq_init(struct mt6397_chip *chip)
 	int i, j, ret;
 	struct pmic_irq_data *irqd;
 
-	irqd = devm_kzalloc(chip->dev, sizeof(*irqd), GFP_KERNEL);
-	if (!irqd)
-		return -ENOMEM;
+	switch (chip->chip_id) {
+	case MT6358_CHIP_ID:
+		chip->irq_data = &mt6358_irqd;
+		break;
 
-	chip->irq_data = irqd;
+	default:
+		dev_err(chip->dev, "unsupported chip: 0x%x\n", chip->chip_id);
+		return -ENODEV;
+	}
 
 	mutex_init(&chip->irqlock);
-	irqd->top_int_status_reg = MT6358_TOP_INT_STATUS0;
-	irqd->num_pmic_irqs = MT6358_IRQ_NR;
-	irqd->num_top = ARRAY_SIZE(mt6358_ints);
-
+	irqd = chip->irq_data;
 	irqd->enable_hwirq = devm_kcalloc(chip->dev,
 					  irqd->num_pmic_irqs,
 					  sizeof(*irqd->enable_hwirq),
@@ -207,10 +218,10 @@ int mt6358_irq_init(struct mt6397_chip *chip)
 
 	/* Disable all interrupts for initializing */
 	for (i = 0; i < irqd->num_top; i++) {
-		for (j = 0; j < mt6358_ints[i].num_int_regs; j++)
+		for (j = 0; j < irqd->pmic_ints[i].num_int_regs; j++)
 			regmap_write(chip->regmap,
-				     mt6358_ints[i].en_reg +
-				     mt6358_ints[i].en_reg_shift * j, 0);
+				     irqd->pmic_ints[i].en_reg +
+				     irqd->pmic_ints[i].en_reg_shift * j, 0);
 	}
 
 	chip->irq_domain = irq_domain_add_linear(chip->dev->of_node,
diff --git a/include/linux/mfd/mt6358/core.h b/include/linux/mfd/mt6358/core.h
index c5a11b7458d4..68578e2019b0 100644
--- a/include/linux/mfd/mt6358/core.h
+++ b/include/linux/mfd/mt6358/core.h
@@ -6,12 +6,9 @@
 #ifndef __MFD_MT6358_CORE_H__
 #define __MFD_MT6358_CORE_H__
 
-#define MT6358_REG_WIDTH 16
-
 struct irq_top_t {
 	int hwirq_base;
 	unsigned int num_int_regs;
-	unsigned int num_int_bits;
 	unsigned int en_reg;
 	unsigned int en_reg_shift;
 	unsigned int sta_reg;
@@ -25,6 +22,7 @@ struct pmic_irq_data {
 	unsigned short top_int_status_reg;
 	bool *enable_hwirq;
 	bool *cache_hwirq;
+	const struct irq_top_t *pmic_ints;
 };
 
 enum mt6358_irq_top_status_shift {
@@ -146,8 +144,8 @@ enum mt6358_irq_numbers {
 {	\
 	.hwirq_base = MT6358_IRQ_##sp##_BASE,	\
 	.num_int_regs =	\
-		((MT6358_IRQ_##sp##_BITS - 1) / MT6358_REG_WIDTH) + 1,	\
-	.num_int_bits = MT6358_IRQ_##sp##_BITS, \
+		((MT6358_IRQ_##sp##_BITS - 1) /	\
+		MTK_PMIC_REG_WIDTH) + 1,	\
 	.en_reg = MT6358_##sp##_TOP_INT_CON0,	\
 	.en_reg_shift = 0x6,	\
 	.sta_reg = MT6358_##sp##_TOP_INT_STATUS0,	\
-- 
cgit v1.2.3


From be60652f0260c2f371670ec90f1ac55e2671f793 Mon Sep 17 00:00:00 2001
From: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Date: Wed, 26 May 2021 14:52:01 +0800
Subject: rtc: mt6397: refine RTC_TC_MTH

This patch adds RTC_TC_MTH_MASK to support new chips.

Signed-off-by: Yuchen Huang <yuchen.huang@mediatek.com>
Signed-off-by: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/rtc/rtc-mt6397.c       | 2 +-
 include/linux/mfd/mt6397/rtc.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/rtc/rtc-mt6397.c b/drivers/rtc/rtc-mt6397.c
index 6655035e5164..80dc479a6ff0 100644
--- a/drivers/rtc/rtc-mt6397.c
+++ b/drivers/rtc/rtc-mt6397.c
@@ -75,7 +75,7 @@ static int __mtk_rtc_read_time(struct mt6397_rtc *rtc,
 	tm->tm_min = data[RTC_OFFSET_MIN];
 	tm->tm_hour = data[RTC_OFFSET_HOUR];
 	tm->tm_mday = data[RTC_OFFSET_DOM];
-	tm->tm_mon = data[RTC_OFFSET_MTH];
+	tm->tm_mon = data[RTC_OFFSET_MTH] & RTC_TC_MTH_MASK;
 	tm->tm_year = data[RTC_OFFSET_YEAR];
 
 	ret = regmap_read(rtc->regmap, rtc->addr_base + RTC_TC_SEC, sec);
diff --git a/include/linux/mfd/mt6397/rtc.h b/include/linux/mfd/mt6397/rtc.h
index c3748b53bf7d..068ae1c0f0e8 100644
--- a/include/linux/mfd/mt6397/rtc.h
+++ b/include/linux/mfd/mt6397/rtc.h
@@ -36,6 +36,7 @@
 #define RTC_AL_MASK_DOW                BIT(4)
 
 #define RTC_TC_SEC             0x000a
+#define RTC_TC_MTH_MASK        0x000f
 /* Min, Hour, Dom... register offset to RTC_TC_SEC */
 #define RTC_OFFSET_SEC         0
 #define RTC_OFFSET_MIN         1
-- 
cgit v1.2.3


From e545b8f380a96174df40db4203d09156e096ee89 Mon Sep 17 00:00:00 2001
From: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Date: Wed, 26 May 2021 14:52:04 +0800
Subject: mfd: Add support for the MediaTek MT6359 PMIC

This adds support for the MediaTek MT6359 PMIC. This is a
multifunction device with the following sub modules:

- Codec
- Interrupt
- Regulator
- RTC

It is interfaced to the host controller using SPI interface
by a proprietary hardware called PMIC wrapper or pwrap.
MT6359 MFD is a child device of the pwrap.

Signed-off-by: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mt6358-irq.c             |  24 ++
 drivers/mfd/mt6397-core.c            |  24 ++
 include/linux/mfd/mt6359/core.h      | 133 +++++++++
 include/linux/mfd/mt6359/registers.h | 529 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/mt6397/core.h      |   1 +
 5 files changed, 711 insertions(+)
 create mode 100644 include/linux/mfd/mt6359/core.h
 create mode 100644 include/linux/mfd/mt6359/registers.h

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6358-irq.c b/drivers/mfd/mt6358-irq.c
index 4b094e5e51cc..83f3ffbdbb4c 100644
--- a/drivers/mfd/mt6358-irq.c
+++ b/drivers/mfd/mt6358-irq.c
@@ -5,6 +5,8 @@
 #include <linux/interrupt.h>
 #include <linux/mfd/mt6358/core.h>
 #include <linux/mfd/mt6358/registers.h>
+#include <linux/mfd/mt6359/core.h>
+#include <linux/mfd/mt6359/registers.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/module.h>
 #include <linux/of.h>
@@ -26,6 +28,17 @@ static const struct irq_top_t mt6358_ints[] = {
 	MT6358_TOP_GEN(MISC),
 };
 
+static const struct irq_top_t mt6359_ints[] = {
+	MT6359_TOP_GEN(BUCK),
+	MT6359_TOP_GEN(LDO),
+	MT6359_TOP_GEN(PSC),
+	MT6359_TOP_GEN(SCK),
+	MT6359_TOP_GEN(BM),
+	MT6359_TOP_GEN(HK),
+	MT6359_TOP_GEN(AUD),
+	MT6359_TOP_GEN(MISC),
+};
+
 static struct pmic_irq_data mt6358_irqd = {
 	.num_top = ARRAY_SIZE(mt6358_ints),
 	.num_pmic_irqs = MT6358_IRQ_NR,
@@ -33,6 +46,13 @@ static struct pmic_irq_data mt6358_irqd = {
 	.pmic_ints = mt6358_ints,
 };
 
+static struct pmic_irq_data mt6359_irqd = {
+	.num_top = ARRAY_SIZE(mt6359_ints),
+	.num_pmic_irqs = MT6359_IRQ_NR,
+	.top_int_status_reg = MT6359_TOP_INT_STATUS0,
+	.pmic_ints = mt6359_ints,
+};
+
 static void pmic_irq_enable(struct irq_data *data)
 {
 	unsigned int hwirq = irqd_to_hwirq(data);
@@ -195,6 +215,10 @@ int mt6358_irq_init(struct mt6397_chip *chip)
 		chip->irq_data = &mt6358_irqd;
 		break;
 
+	case MT6359_CHIP_ID:
+		chip->irq_data = &mt6359_irqd;
+		break;
+
 	default:
 		dev_err(chip->dev, "unsupported chip: 0x%x\n", chip->chip_id);
 		return -ENODEV;
diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index 7518d74c3b4c..9a615f75fbde 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -13,9 +13,11 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/mt6323/core.h>
 #include <linux/mfd/mt6358/core.h>
+#include <linux/mfd/mt6359/core.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6323/registers.h>
 #include <linux/mfd/mt6358/registers.h>
+#include <linux/mfd/mt6359/registers.h>
 #include <linux/mfd/mt6397/registers.h>
 
 #define MT6323_RTC_BASE		0x8000
@@ -99,6 +101,17 @@ static const struct mfd_cell mt6358_devs[] = {
 	},
 };
 
+static const struct mfd_cell mt6359_devs[] = {
+	{ .name = "mt6359-regulator", },
+	{
+		.name = "mt6359-rtc",
+		.num_resources = ARRAY_SIZE(mt6358_rtc_resources),
+		.resources = mt6358_rtc_resources,
+		.of_compatible = "mediatek,mt6358-rtc",
+	},
+	{ .name = "mt6359-sound", },
+};
+
 static const struct mfd_cell mt6397_devs[] = {
 	{
 		.name = "mt6397-rtc",
@@ -149,6 +162,14 @@ static const struct chip_data mt6358_core = {
 	.irq_init = mt6358_irq_init,
 };
 
+static const struct chip_data mt6359_core = {
+	.cid_addr = MT6359_SWCID,
+	.cid_shift = 8,
+	.cells = mt6359_devs,
+	.cell_size = ARRAY_SIZE(mt6359_devs),
+	.irq_init = mt6358_irq_init,
+};
+
 static const struct chip_data mt6397_core = {
 	.cid_addr = MT6397_CID,
 	.cid_shift = 0,
@@ -218,6 +239,9 @@ static const struct of_device_id mt6397_of_match[] = {
 	}, {
 		.compatible = "mediatek,mt6358",
 		.data = &mt6358_core,
+	}, {
+		.compatible = "mediatek,mt6359",
+		.data = &mt6359_core,
 	}, {
 		.compatible = "mediatek,mt6397",
 		.data = &mt6397_core,
diff --git a/include/linux/mfd/mt6359/core.h b/include/linux/mfd/mt6359/core.h
new file mode 100644
index 000000000000..8d298868126d
--- /dev/null
+++ b/include/linux/mfd/mt6359/core.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 MediaTek Inc.
+ */
+
+#ifndef __MFD_MT6359_CORE_H__
+#define __MFD_MT6359_CORE_H__
+
+enum mt6359_irq_top_status_shift {
+	MT6359_BUCK_TOP = 0,
+	MT6359_LDO_TOP,
+	MT6359_PSC_TOP,
+	MT6359_SCK_TOP,
+	MT6359_BM_TOP,
+	MT6359_HK_TOP,
+	MT6359_AUD_TOP = 7,
+	MT6359_MISC_TOP,
+};
+
+enum mt6359_irq_numbers {
+	MT6359_IRQ_VCORE_OC = 1,
+	MT6359_IRQ_VGPU11_OC,
+	MT6359_IRQ_VGPU12_OC,
+	MT6359_IRQ_VMODEM_OC,
+	MT6359_IRQ_VPROC1_OC,
+	MT6359_IRQ_VPROC2_OC,
+	MT6359_IRQ_VS1_OC,
+	MT6359_IRQ_VS2_OC,
+	MT6359_IRQ_VPA_OC = 9,
+	MT6359_IRQ_VFE28_OC = 16,
+	MT6359_IRQ_VXO22_OC,
+	MT6359_IRQ_VRF18_OC,
+	MT6359_IRQ_VRF12_OC,
+	MT6359_IRQ_VEFUSE_OC,
+	MT6359_IRQ_VCN33_1_OC,
+	MT6359_IRQ_VCN33_2_OC,
+	MT6359_IRQ_VCN13_OC,
+	MT6359_IRQ_VCN18_OC,
+	MT6359_IRQ_VA09_OC,
+	MT6359_IRQ_VCAMIO_OC,
+	MT6359_IRQ_VA12_OC,
+	MT6359_IRQ_VAUX18_OC,
+	MT6359_IRQ_VAUD18_OC,
+	MT6359_IRQ_VIO18_OC,
+	MT6359_IRQ_VSRAM_PROC1_OC,
+	MT6359_IRQ_VSRAM_PROC2_OC,
+	MT6359_IRQ_VSRAM_OTHERS_OC,
+	MT6359_IRQ_VSRAM_MD_OC,
+	MT6359_IRQ_VEMC_OC,
+	MT6359_IRQ_VSIM1_OC,
+	MT6359_IRQ_VSIM2_OC,
+	MT6359_IRQ_VUSB_OC,
+	MT6359_IRQ_VRFCK_OC,
+	MT6359_IRQ_VBBCK_OC,
+	MT6359_IRQ_VBIF28_OC,
+	MT6359_IRQ_VIBR_OC,
+	MT6359_IRQ_VIO28_OC,
+	MT6359_IRQ_VM18_OC,
+	MT6359_IRQ_VUFS_OC = 45,
+	MT6359_IRQ_PWRKEY = 48,
+	MT6359_IRQ_HOMEKEY,
+	MT6359_IRQ_PWRKEY_R,
+	MT6359_IRQ_HOMEKEY_R,
+	MT6359_IRQ_NI_LBAT_INT,
+	MT6359_IRQ_CHRDET_EDGE = 53,
+	MT6359_IRQ_RTC = 64,
+	MT6359_IRQ_FG_BAT_H = 80,
+	MT6359_IRQ_FG_BAT_L,
+	MT6359_IRQ_FG_CUR_H,
+	MT6359_IRQ_FG_CUR_L,
+	MT6359_IRQ_FG_ZCV = 84,
+	MT6359_IRQ_FG_N_CHARGE_L = 87,
+	MT6359_IRQ_FG_IAVG_H,
+	MT6359_IRQ_FG_IAVG_L = 89,
+	MT6359_IRQ_FG_DISCHARGE = 91,
+	MT6359_IRQ_FG_CHARGE,
+	MT6359_IRQ_BATON_LV = 96,
+	MT6359_IRQ_BATON_BAT_IN = 98,
+	MT6359_IRQ_BATON_BAT_OU,
+	MT6359_IRQ_BIF = 100,
+	MT6359_IRQ_BAT_H = 112,
+	MT6359_IRQ_BAT_L,
+	MT6359_IRQ_BAT2_H,
+	MT6359_IRQ_BAT2_L,
+	MT6359_IRQ_BAT_TEMP_H,
+	MT6359_IRQ_BAT_TEMP_L,
+	MT6359_IRQ_THR_H,
+	MT6359_IRQ_THR_L,
+	MT6359_IRQ_AUXADC_IMP,
+	MT6359_IRQ_NAG_C_DLTV = 121,
+	MT6359_IRQ_AUDIO = 128,
+	MT6359_IRQ_ACCDET = 133,
+	MT6359_IRQ_ACCDET_EINT0,
+	MT6359_IRQ_ACCDET_EINT1,
+	MT6359_IRQ_SPI_CMD_ALERT = 144,
+	MT6359_IRQ_NR,
+};
+
+#define MT6359_IRQ_BUCK_BASE MT6359_IRQ_VCORE_OC
+#define MT6359_IRQ_LDO_BASE MT6359_IRQ_VFE28_OC
+#define MT6359_IRQ_PSC_BASE MT6359_IRQ_PWRKEY
+#define MT6359_IRQ_SCK_BASE MT6359_IRQ_RTC
+#define MT6359_IRQ_BM_BASE MT6359_IRQ_FG_BAT_H
+#define MT6359_IRQ_HK_BASE MT6359_IRQ_BAT_H
+#define MT6359_IRQ_AUD_BASE MT6359_IRQ_AUDIO
+#define MT6359_IRQ_MISC_BASE MT6359_IRQ_SPI_CMD_ALERT
+
+#define MT6359_IRQ_BUCK_BITS (MT6359_IRQ_VPA_OC - MT6359_IRQ_BUCK_BASE + 1)
+#define MT6359_IRQ_LDO_BITS (MT6359_IRQ_VUFS_OC - MT6359_IRQ_LDO_BASE + 1)
+#define MT6359_IRQ_PSC_BITS	\
+	(MT6359_IRQ_CHRDET_EDGE - MT6359_IRQ_PSC_BASE + 1)
+#define MT6359_IRQ_SCK_BITS (MT6359_IRQ_RTC - MT6359_IRQ_SCK_BASE + 1)
+#define MT6359_IRQ_BM_BITS (MT6359_IRQ_BIF - MT6359_IRQ_BM_BASE + 1)
+#define MT6359_IRQ_HK_BITS (MT6359_IRQ_NAG_C_DLTV - MT6359_IRQ_HK_BASE + 1)
+#define MT6359_IRQ_AUD_BITS	\
+	(MT6359_IRQ_ACCDET_EINT1 - MT6359_IRQ_AUD_BASE + 1)
+#define MT6359_IRQ_MISC_BITS	\
+	(MT6359_IRQ_SPI_CMD_ALERT - MT6359_IRQ_MISC_BASE + 1)
+
+#define MT6359_TOP_GEN(sp)	\
+{	\
+	.hwirq_base = MT6359_IRQ_##sp##_BASE,	\
+	.num_int_regs =	\
+		((MT6359_IRQ_##sp##_BITS - 1) /	\
+		MTK_PMIC_REG_WIDTH) + 1,	\
+	.en_reg = MT6359_##sp##_TOP_INT_CON0,	\
+	.en_reg_shift = 0x6,	\
+	.sta_reg = MT6359_##sp##_TOP_INT_STATUS0,	\
+	.sta_reg_shift = 0x2,	\
+	.top_offset = MT6359_##sp##_TOP,	\
+}
+
+#endif /* __MFD_MT6359_CORE_H__ */
diff --git a/include/linux/mfd/mt6359/registers.h b/include/linux/mfd/mt6359/registers.h
new file mode 100644
index 000000000000..2135c9695918
--- /dev/null
+++ b/include/linux/mfd/mt6359/registers.h
@@ -0,0 +1,529 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 MediaTek Inc.
+ */
+
+#ifndef __MFD_MT6359_REGISTERS_H__
+#define __MFD_MT6359_REGISTERS_H__
+
+/* PMIC Registers */
+#define MT6359_SWCID                         0xa
+#define MT6359_MISC_TOP_INT_CON0             0x188
+#define MT6359_MISC_TOP_INT_STATUS0          0x194
+#define MT6359_TOP_INT_STATUS0               0x19e
+#define MT6359_SCK_TOP_INT_CON0              0x528
+#define MT6359_SCK_TOP_INT_STATUS0           0x534
+#define MT6359_EOSC_CALI_CON0                0x53a
+#define MT6359_EOSC_CALI_CON1                0x53c
+#define MT6359_RTC_MIX_CON0                  0x53e
+#define MT6359_RTC_MIX_CON1                  0x540
+#define MT6359_RTC_MIX_CON2                  0x542
+#define MT6359_RTC_DSN_ID                    0x580
+#define MT6359_RTC_DSN_REV0                  0x582
+#define MT6359_RTC_DBI                       0x584
+#define MT6359_RTC_DXI                       0x586
+#define MT6359_RTC_BBPU                      0x588
+#define MT6359_RTC_IRQ_STA                   0x58a
+#define MT6359_RTC_IRQ_EN                    0x58c
+#define MT6359_RTC_CII_EN                    0x58e
+#define MT6359_RTC_AL_MASK                   0x590
+#define MT6359_RTC_TC_SEC                    0x592
+#define MT6359_RTC_TC_MIN                    0x594
+#define MT6359_RTC_TC_HOU                    0x596
+#define MT6359_RTC_TC_DOM                    0x598
+#define MT6359_RTC_TC_DOW                    0x59a
+#define MT6359_RTC_TC_MTH                    0x59c
+#define MT6359_RTC_TC_YEA                    0x59e
+#define MT6359_RTC_AL_SEC                    0x5a0
+#define MT6359_RTC_AL_MIN                    0x5a2
+#define MT6359_RTC_AL_HOU                    0x5a4
+#define MT6359_RTC_AL_DOM                    0x5a6
+#define MT6359_RTC_AL_DOW                    0x5a8
+#define MT6359_RTC_AL_MTH                    0x5aa
+#define MT6359_RTC_AL_YEA                    0x5ac
+#define MT6359_RTC_OSC32CON                  0x5ae
+#define MT6359_RTC_POWERKEY1                 0x5b0
+#define MT6359_RTC_POWERKEY2                 0x5b2
+#define MT6359_RTC_PDN1                      0x5b4
+#define MT6359_RTC_PDN2                      0x5b6
+#define MT6359_RTC_SPAR0                     0x5b8
+#define MT6359_RTC_SPAR1                     0x5ba
+#define MT6359_RTC_PROT                      0x5bc
+#define MT6359_RTC_DIFF                      0x5be
+#define MT6359_RTC_CALI                      0x5c0
+#define MT6359_RTC_WRTGR                     0x5c2
+#define MT6359_RTC_CON                       0x5c4
+#define MT6359_RTC_SEC_CTRL                  0x5c6
+#define MT6359_RTC_INT_CNT                   0x5c8
+#define MT6359_RTC_SEC_DAT0                  0x5ca
+#define MT6359_RTC_SEC_DAT1                  0x5cc
+#define MT6359_RTC_SEC_DAT2                  0x5ce
+#define MT6359_RTC_SEC_DSN_ID                0x600
+#define MT6359_RTC_SEC_DSN_REV0              0x602
+#define MT6359_RTC_SEC_DBI                   0x604
+#define MT6359_RTC_SEC_DXI                   0x606
+#define MT6359_RTC_TC_SEC_SEC                0x608
+#define MT6359_RTC_TC_MIN_SEC                0x60a
+#define MT6359_RTC_TC_HOU_SEC                0x60c
+#define MT6359_RTC_TC_DOM_SEC                0x60e
+#define MT6359_RTC_TC_DOW_SEC                0x610
+#define MT6359_RTC_TC_MTH_SEC                0x612
+#define MT6359_RTC_TC_YEA_SEC                0x614
+#define MT6359_RTC_SEC_CK_PDN                0x616
+#define MT6359_RTC_SEC_WRTGR                 0x618
+#define MT6359_PSC_TOP_INT_CON0              0x910
+#define MT6359_PSC_TOP_INT_STATUS0           0x91c
+#define MT6359_BM_TOP_INT_CON0               0xc32
+#define MT6359_BM_TOP_INT_CON1               0xc38
+#define MT6359_BM_TOP_INT_STATUS0            0xc4a
+#define MT6359_BM_TOP_INT_STATUS1            0xc4c
+#define MT6359_HK_TOP_INT_CON0               0xf92
+#define MT6359_HK_TOP_INT_STATUS0            0xf9e
+#define MT6359_BUCK_TOP_INT_CON0             0x1418
+#define MT6359_BUCK_TOP_INT_STATUS0          0x1424
+#define MT6359_BUCK_VPU_CON0                 0x1488
+#define MT6359_BUCK_VPU_DBG0                 0x14a6
+#define MT6359_BUCK_VPU_DBG1                 0x14a8
+#define MT6359_BUCK_VPU_ELR0                 0x14ac
+#define MT6359_BUCK_VCORE_CON0               0x1508
+#define MT6359_BUCK_VCORE_DBG0               0x1526
+#define MT6359_BUCK_VCORE_DBG1               0x1528
+#define MT6359_BUCK_VCORE_SSHUB_CON0         0x152a
+#define MT6359_BUCK_VCORE_ELR0               0x1534
+#define MT6359_BUCK_VGPU11_CON0              0x1588
+#define MT6359_BUCK_VGPU11_DBG0              0x15a6
+#define MT6359_BUCK_VGPU11_DBG1              0x15a8
+#define MT6359_BUCK_VGPU11_ELR0              0x15ac
+#define MT6359_BUCK_VMODEM_CON0              0x1688
+#define MT6359_BUCK_VMODEM_DBG0              0x16a6
+#define MT6359_BUCK_VMODEM_DBG1              0x16a8
+#define MT6359_BUCK_VMODEM_ELR0              0x16ae
+#define MT6359_BUCK_VPROC1_CON0              0x1708
+#define MT6359_BUCK_VPROC1_DBG0              0x1726
+#define MT6359_BUCK_VPROC1_DBG1              0x1728
+#define MT6359_BUCK_VPROC1_ELR0              0x172e
+#define MT6359_BUCK_VPROC2_CON0              0x1788
+#define MT6359_BUCK_VPROC2_DBG0              0x17a6
+#define MT6359_BUCK_VPROC2_DBG1              0x17a8
+#define MT6359_BUCK_VPROC2_ELR0              0x17b2
+#define MT6359_BUCK_VS1_CON0                 0x1808
+#define MT6359_BUCK_VS1_DBG0                 0x1826
+#define MT6359_BUCK_VS1_DBG1                 0x1828
+#define MT6359_BUCK_VS1_ELR0                 0x1834
+#define MT6359_BUCK_VS2_CON0                 0x1888
+#define MT6359_BUCK_VS2_DBG0                 0x18a6
+#define MT6359_BUCK_VS2_DBG1                 0x18a8
+#define MT6359_BUCK_VS2_ELR0                 0x18b4
+#define MT6359_BUCK_VPA_CON0                 0x1908
+#define MT6359_BUCK_VPA_CON1                 0x190e
+#define MT6359_BUCK_VPA_CFG0                 0x1910
+#define MT6359_BUCK_VPA_CFG1                 0x1912
+#define MT6359_BUCK_VPA_DBG0                 0x1914
+#define MT6359_BUCK_VPA_DBG1                 0x1916
+#define MT6359_VGPUVCORE_ANA_CON2            0x198e
+#define MT6359_VGPUVCORE_ANA_CON13           0x19a4
+#define MT6359_VPROC1_ANA_CON3               0x19b2
+#define MT6359_VPROC2_ANA_CON3               0x1a0e
+#define MT6359_VMODEM_ANA_CON3               0x1a1a
+#define MT6359_VPU_ANA_CON3                  0x1a26
+#define MT6359_VS1_ANA_CON0                  0x1a2c
+#define MT6359_VS2_ANA_CON0                  0x1a34
+#define MT6359_VPA_ANA_CON0                  0x1a3c
+#define MT6359_LDO_TOP_INT_CON0              0x1b14
+#define MT6359_LDO_TOP_INT_CON1              0x1b1a
+#define MT6359_LDO_TOP_INT_STATUS0           0x1b28
+#define MT6359_LDO_TOP_INT_STATUS1           0x1b2a
+#define MT6359_LDO_VSRAM_PROC1_ELR           0x1b40
+#define MT6359_LDO_VSRAM_PROC2_ELR           0x1b42
+#define MT6359_LDO_VSRAM_OTHERS_ELR          0x1b44
+#define MT6359_LDO_VSRAM_MD_ELR              0x1b46
+#define MT6359_LDO_VFE28_CON0                0x1b88
+#define MT6359_LDO_VFE28_MON                 0x1b8a
+#define MT6359_LDO_VXO22_CON0                0x1b98
+#define MT6359_LDO_VXO22_MON                 0x1b9a
+#define MT6359_LDO_VRF18_CON0                0x1ba8
+#define MT6359_LDO_VRF18_MON                 0x1baa
+#define MT6359_LDO_VRF12_CON0                0x1bb8
+#define MT6359_LDO_VRF12_MON                 0x1bba
+#define MT6359_LDO_VEFUSE_CON0               0x1bc8
+#define MT6359_LDO_VEFUSE_MON                0x1bca
+#define MT6359_LDO_VCN33_1_CON0              0x1bd8
+#define MT6359_LDO_VCN33_1_MON               0x1bda
+#define MT6359_LDO_VCN33_1_MULTI_SW          0x1be8
+#define MT6359_LDO_VCN33_2_CON0              0x1c08
+#define MT6359_LDO_VCN33_2_MON               0x1c0a
+#define MT6359_LDO_VCN33_2_MULTI_SW          0x1c18
+#define MT6359_LDO_VCN13_CON0                0x1c1a
+#define MT6359_LDO_VCN13_MON                 0x1c1c
+#define MT6359_LDO_VCN18_CON0                0x1c2a
+#define MT6359_LDO_VCN18_MON                 0x1c2c
+#define MT6359_LDO_VA09_CON0                 0x1c3a
+#define MT6359_LDO_VA09_MON                  0x1c3c
+#define MT6359_LDO_VCAMIO_CON0               0x1c4a
+#define MT6359_LDO_VCAMIO_MON                0x1c4c
+#define MT6359_LDO_VA12_CON0                 0x1c5a
+#define MT6359_LDO_VA12_MON                  0x1c5c
+#define MT6359_LDO_VAUX18_CON0               0x1c88
+#define MT6359_LDO_VAUX18_MON                0x1c8a
+#define MT6359_LDO_VAUD18_CON0               0x1c98
+#define MT6359_LDO_VAUD18_MON                0x1c9a
+#define MT6359_LDO_VIO18_CON0                0x1ca8
+#define MT6359_LDO_VIO18_MON                 0x1caa
+#define MT6359_LDO_VEMC_CON0                 0x1cb8
+#define MT6359_LDO_VEMC_MON                  0x1cba
+#define MT6359_LDO_VSIM1_CON0                0x1cc8
+#define MT6359_LDO_VSIM1_MON                 0x1cca
+#define MT6359_LDO_VSIM2_CON0                0x1cd8
+#define MT6359_LDO_VSIM2_MON                 0x1cda
+#define MT6359_LDO_VUSB_CON0                 0x1d08
+#define MT6359_LDO_VUSB_MON                  0x1d0a
+#define MT6359_LDO_VUSB_MULTI_SW             0x1d18
+#define MT6359_LDO_VRFCK_CON0                0x1d1a
+#define MT6359_LDO_VRFCK_MON                 0x1d1c
+#define MT6359_LDO_VBBCK_CON0                0x1d2a
+#define MT6359_LDO_VBBCK_MON                 0x1d2c
+#define MT6359_LDO_VBIF28_CON0               0x1d3a
+#define MT6359_LDO_VBIF28_MON                0x1d3c
+#define MT6359_LDO_VIBR_CON0                 0x1d4a
+#define MT6359_LDO_VIBR_MON                  0x1d4c
+#define MT6359_LDO_VIO28_CON0                0x1d5a
+#define MT6359_LDO_VIO28_MON                 0x1d5c
+#define MT6359_LDO_VM18_CON0                 0x1d88
+#define MT6359_LDO_VM18_MON                  0x1d8a
+#define MT6359_LDO_VUFS_CON0                 0x1d98
+#define MT6359_LDO_VUFS_MON                  0x1d9a
+#define MT6359_LDO_VSRAM_PROC1_CON0          0x1e88
+#define MT6359_LDO_VSRAM_PROC1_MON           0x1e8a
+#define MT6359_LDO_VSRAM_PROC1_VOSEL1        0x1e8e
+#define MT6359_LDO_VSRAM_PROC2_CON0          0x1ea6
+#define MT6359_LDO_VSRAM_PROC2_MON           0x1ea8
+#define MT6359_LDO_VSRAM_PROC2_VOSEL1        0x1eac
+#define MT6359_LDO_VSRAM_OTHERS_CON0         0x1f08
+#define MT6359_LDO_VSRAM_OTHERS_MON          0x1f0a
+#define MT6359_LDO_VSRAM_OTHERS_VOSEL1       0x1f0e
+#define MT6359_LDO_VSRAM_OTHERS_SSHUB        0x1f26
+#define MT6359_LDO_VSRAM_MD_CON0             0x1f2c
+#define MT6359_LDO_VSRAM_MD_MON              0x1f2e
+#define MT6359_LDO_VSRAM_MD_VOSEL1           0x1f32
+#define MT6359_VFE28_ANA_CON0                0x1f88
+#define MT6359_VAUX18_ANA_CON0               0x1f8c
+#define MT6359_VUSB_ANA_CON0                 0x1f90
+#define MT6359_VBIF28_ANA_CON0               0x1f94
+#define MT6359_VCN33_1_ANA_CON0              0x1f98
+#define MT6359_VCN33_2_ANA_CON0              0x1f9c
+#define MT6359_VEMC_ANA_CON0                 0x1fa0
+#define MT6359_VSIM1_ANA_CON0                0x1fa4
+#define MT6359_VSIM2_ANA_CON0                0x1fa8
+#define MT6359_VIO28_ANA_CON0                0x1fac
+#define MT6359_VIBR_ANA_CON0                 0x1fb0
+#define MT6359_VRF18_ANA_CON0                0x2008
+#define MT6359_VEFUSE_ANA_CON0               0x200c
+#define MT6359_VCN18_ANA_CON0                0x2010
+#define MT6359_VCAMIO_ANA_CON0               0x2014
+#define MT6359_VAUD18_ANA_CON0               0x2018
+#define MT6359_VIO18_ANA_CON0                0x201c
+#define MT6359_VM18_ANA_CON0                 0x2020
+#define MT6359_VUFS_ANA_CON0                 0x2024
+#define MT6359_VRF12_ANA_CON0                0x202a
+#define MT6359_VCN13_ANA_CON0                0x202e
+#define MT6359_VA09_ANA_CON0                 0x2032
+#define MT6359_VA12_ANA_CON0                 0x2036
+#define MT6359_VXO22_ANA_CON0                0x2088
+#define MT6359_VRFCK_ANA_CON0                0x208c
+#define MT6359_VBBCK_ANA_CON0                0x2094
+#define MT6359_AUD_TOP_INT_CON0              0x2328
+#define MT6359_AUD_TOP_INT_STATUS0           0x2334
+
+#define MT6359_RG_BUCK_VPU_EN_ADDR             MT6359_BUCK_VPU_CON0
+#define MT6359_RG_BUCK_VPU_LP_ADDR             MT6359_BUCK_VPU_CON0
+#define MT6359_RG_BUCK_VPU_LP_SHIFT            1
+#define MT6359_DA_VPU_VOSEL_ADDR               MT6359_BUCK_VPU_DBG0
+#define MT6359_DA_VPU_VOSEL_MASK               0x7F
+#define MT6359_DA_VPU_VOSEL_SHIFT              0
+#define MT6359_DA_VPU_EN_ADDR                  MT6359_BUCK_VPU_DBG1
+#define MT6359_RG_BUCK_VPU_VOSEL_ADDR          MT6359_BUCK_VPU_ELR0
+#define MT6359_RG_BUCK_VPU_VOSEL_MASK          0x7F
+#define MT6359_RG_BUCK_VPU_VOSEL_SHIFT         0
+#define MT6359_RG_BUCK_VCORE_EN_ADDR           MT6359_BUCK_VCORE_CON0
+#define MT6359_RG_BUCK_VCORE_LP_ADDR           MT6359_BUCK_VCORE_CON0
+#define MT6359_RG_BUCK_VCORE_LP_SHIFT          1
+#define MT6359_DA_VCORE_VOSEL_ADDR             MT6359_BUCK_VCORE_DBG0
+#define MT6359_DA_VCORE_VOSEL_MASK             0x7F
+#define MT6359_DA_VCORE_VOSEL_SHIFT            0
+#define MT6359_DA_VCORE_EN_ADDR                MT6359_BUCK_VCORE_DBG1
+#define MT6359_RG_BUCK_VCORE_SSHUB_EN_ADDR     MT6359_BUCK_VCORE_SSHUB_CON0
+#define MT6359_RG_BUCK_VCORE_SSHUB_VOSEL_ADDR  MT6359_BUCK_VCORE_SSHUB_CON0
+#define MT6359_RG_BUCK_VCORE_SSHUB_VOSEL_MASK  0x7F
+#define MT6359_RG_BUCK_VCORE_SSHUB_VOSEL_SHIFT 4
+#define MT6359_RG_BUCK_VCORE_VOSEL_ADDR        MT6359_BUCK_VCORE_ELR0
+#define MT6359_RG_BUCK_VCORE_VOSEL_MASK        0x7F
+#define MT6359_RG_BUCK_VCORE_VOSEL_SHIFT       0
+#define MT6359_RG_BUCK_VGPU11_EN_ADDR          MT6359_BUCK_VGPU11_CON0
+#define MT6359_RG_BUCK_VGPU11_LP_ADDR          MT6359_BUCK_VGPU11_CON0
+#define MT6359_RG_BUCK_VGPU11_LP_SHIFT         1
+#define MT6359_DA_VGPU11_VOSEL_ADDR            MT6359_BUCK_VGPU11_DBG0
+#define MT6359_DA_VGPU11_VOSEL_MASK            0x7F
+#define MT6359_DA_VGPU11_VOSEL_SHIFT           0
+#define MT6359_DA_VGPU11_EN_ADDR               MT6359_BUCK_VGPU11_DBG1
+#define MT6359_RG_BUCK_VGPU11_VOSEL_ADDR       MT6359_BUCK_VGPU11_ELR0
+#define MT6359_RG_BUCK_VGPU11_VOSEL_MASK       0x7F
+#define MT6359_RG_BUCK_VGPU11_VOSEL_SHIFT      0
+#define MT6359_RG_BUCK_VMODEM_EN_ADDR          MT6359_BUCK_VMODEM_CON0
+#define MT6359_RG_BUCK_VMODEM_LP_ADDR          MT6359_BUCK_VMODEM_CON0
+#define MT6359_RG_BUCK_VMODEM_LP_SHIFT         1
+#define MT6359_DA_VMODEM_VOSEL_ADDR            MT6359_BUCK_VMODEM_DBG0
+#define MT6359_DA_VMODEM_VOSEL_MASK            0x7F
+#define MT6359_DA_VMODEM_VOSEL_SHIFT           0
+#define MT6359_DA_VMODEM_EN_ADDR               MT6359_BUCK_VMODEM_DBG1
+#define MT6359_RG_BUCK_VMODEM_VOSEL_ADDR       MT6359_BUCK_VMODEM_ELR0
+#define MT6359_RG_BUCK_VMODEM_VOSEL_MASK       0x7F
+#define MT6359_RG_BUCK_VMODEM_VOSEL_SHIFT      0
+#define MT6359_RG_BUCK_VPROC1_EN_ADDR          MT6359_BUCK_VPROC1_CON0
+#define MT6359_RG_BUCK_VPROC1_LP_ADDR          MT6359_BUCK_VPROC1_CON0
+#define MT6359_RG_BUCK_VPROC1_LP_SHIFT         1
+#define MT6359_DA_VPROC1_VOSEL_ADDR            MT6359_BUCK_VPROC1_DBG0
+#define MT6359_DA_VPROC1_VOSEL_MASK            0x7F
+#define MT6359_DA_VPROC1_VOSEL_SHIFT           0
+#define MT6359_DA_VPROC1_EN_ADDR               MT6359_BUCK_VPROC1_DBG1
+#define MT6359_RG_BUCK_VPROC1_VOSEL_ADDR       MT6359_BUCK_VPROC1_ELR0
+#define MT6359_RG_BUCK_VPROC1_VOSEL_MASK       0x7F
+#define MT6359_RG_BUCK_VPROC1_VOSEL_SHIFT      0
+#define MT6359_RG_BUCK_VPROC2_EN_ADDR          MT6359_BUCK_VPROC2_CON0
+#define MT6359_RG_BUCK_VPROC2_LP_ADDR          MT6359_BUCK_VPROC2_CON0
+#define MT6359_RG_BUCK_VPROC2_LP_SHIFT         1
+#define MT6359_DA_VPROC2_VOSEL_ADDR            MT6359_BUCK_VPROC2_DBG0
+#define MT6359_DA_VPROC2_VOSEL_MASK            0x7F
+#define MT6359_DA_VPROC2_VOSEL_SHIFT           0
+#define MT6359_DA_VPROC2_EN_ADDR               MT6359_BUCK_VPROC2_DBG1
+#define MT6359_RG_BUCK_VPROC2_VOSEL_ADDR       MT6359_BUCK_VPROC2_ELR0
+#define MT6359_RG_BUCK_VPROC2_VOSEL_MASK       0x7F
+#define MT6359_RG_BUCK_VPROC2_VOSEL_SHIFT      0
+#define MT6359_RG_BUCK_VS1_EN_ADDR             MT6359_BUCK_VS1_CON0
+#define MT6359_RG_BUCK_VS1_LP_ADDR             MT6359_BUCK_VS1_CON0
+#define MT6359_RG_BUCK_VS1_LP_SHIFT            1
+#define MT6359_DA_VS1_VOSEL_ADDR               MT6359_BUCK_VS1_DBG0
+#define MT6359_DA_VS1_VOSEL_MASK               0x7F
+#define MT6359_DA_VS1_VOSEL_SHIFT              0
+#define MT6359_DA_VS1_EN_ADDR                  MT6359_BUCK_VS1_DBG1
+#define MT6359_RG_BUCK_VS1_VOSEL_ADDR          MT6359_BUCK_VS1_ELR0
+#define MT6359_RG_BUCK_VS1_VOSEL_MASK          0x7F
+#define MT6359_RG_BUCK_VS1_VOSEL_SHIFT         0
+#define MT6359_RG_BUCK_VS2_EN_ADDR             MT6359_BUCK_VS2_CON0
+#define MT6359_RG_BUCK_VS2_LP_ADDR             MT6359_BUCK_VS2_CON0
+#define MT6359_RG_BUCK_VS2_LP_SHIFT            1
+#define MT6359_DA_VS2_VOSEL_ADDR               MT6359_BUCK_VS2_DBG0
+#define MT6359_DA_VS2_VOSEL_MASK               0x7F
+#define MT6359_DA_VS2_VOSEL_SHIFT              0
+#define MT6359_DA_VS2_EN_ADDR                  MT6359_BUCK_VS2_DBG1
+#define MT6359_RG_BUCK_VS2_VOSEL_ADDR          MT6359_BUCK_VS2_ELR0
+#define MT6359_RG_BUCK_VS2_VOSEL_MASK          0x7F
+#define MT6359_RG_BUCK_VS2_VOSEL_SHIFT         0
+#define MT6359_RG_BUCK_VPA_EN_ADDR             MT6359_BUCK_VPA_CON0
+#define MT6359_RG_BUCK_VPA_LP_ADDR             MT6359_BUCK_VPA_CON0
+#define MT6359_RG_BUCK_VPA_LP_SHIFT            1
+#define MT6359_RG_BUCK_VPA_VOSEL_ADDR          MT6359_BUCK_VPA_CON1
+#define MT6359_RG_BUCK_VPA_VOSEL_MASK          0x3F
+#define MT6359_RG_BUCK_VPA_VOSEL_SHIFT         0
+#define MT6359_DA_VPA_VOSEL_ADDR               MT6359_BUCK_VPA_DBG0
+#define MT6359_DA_VPA_VOSEL_MASK               0x3F
+#define MT6359_DA_VPA_VOSEL_SHIFT              0
+#define MT6359_DA_VPA_EN_ADDR                  MT6359_BUCK_VPA_DBG1
+#define MT6359_RG_VGPU11_FCCM_ADDR             MT6359_VGPUVCORE_ANA_CON2
+#define MT6359_RG_VGPU11_FCCM_SHIFT            9
+#define MT6359_RG_VCORE_FCCM_ADDR              MT6359_VGPUVCORE_ANA_CON13
+#define MT6359_RG_VCORE_FCCM_SHIFT             5
+#define MT6359_RG_VPROC1_FCCM_ADDR             MT6359_VPROC1_ANA_CON3
+#define MT6359_RG_VPROC1_FCCM_SHIFT            1
+#define MT6359_RG_VPROC2_FCCM_ADDR             MT6359_VPROC2_ANA_CON3
+#define MT6359_RG_VPROC2_FCCM_SHIFT            1
+#define MT6359_RG_VMODEM_FCCM_ADDR             MT6359_VMODEM_ANA_CON3
+#define MT6359_RG_VMODEM_FCCM_SHIFT            1
+#define MT6359_RG_VPU_FCCM_ADDR                MT6359_VPU_ANA_CON3
+#define MT6359_RG_VPU_FCCM_SHIFT               1
+#define MT6359_RG_VS1_FPWM_ADDR                MT6359_VS1_ANA_CON0
+#define MT6359_RG_VS1_FPWM_SHIFT               3
+#define MT6359_RG_VS2_FPWM_ADDR                MT6359_VS2_ANA_CON0
+#define MT6359_RG_VS2_FPWM_SHIFT               3
+#define MT6359_RG_VPA_MODESET_ADDR             MT6359_VPA_ANA_CON0
+#define MT6359_RG_VPA_MODESET_SHIFT            1
+#define MT6359_RG_LDO_VSRAM_PROC1_VOSEL_ADDR   MT6359_LDO_VSRAM_PROC1_ELR
+#define MT6359_RG_LDO_VSRAM_PROC1_VOSEL_MASK   0x7F
+#define MT6359_RG_LDO_VSRAM_PROC1_VOSEL_SHIFT  0
+#define MT6359_RG_LDO_VSRAM_PROC2_VOSEL_ADDR   MT6359_LDO_VSRAM_PROC2_ELR
+#define MT6359_RG_LDO_VSRAM_PROC2_VOSEL_MASK   0x7F
+#define MT6359_RG_LDO_VSRAM_PROC2_VOSEL_SHIFT  0
+#define MT6359_RG_LDO_VSRAM_OTHERS_VOSEL_ADDR  MT6359_LDO_VSRAM_OTHERS_ELR
+#define MT6359_RG_LDO_VSRAM_OTHERS_VOSEL_MASK  0x7F
+#define MT6359_RG_LDO_VSRAM_OTHERS_VOSEL_SHIFT 0
+#define MT6359_RG_LDO_VSRAM_MD_VOSEL_ADDR      MT6359_LDO_VSRAM_MD_ELR
+#define MT6359_RG_LDO_VSRAM_MD_VOSEL_MASK      0x7F
+#define MT6359_RG_LDO_VSRAM_MD_VOSEL_SHIFT     0
+#define MT6359_RG_LDO_VFE28_EN_ADDR            MT6359_LDO_VFE28_CON0
+#define MT6359_DA_VFE28_B_EN_ADDR              MT6359_LDO_VFE28_MON
+#define MT6359_RG_LDO_VXO22_EN_ADDR            MT6359_LDO_VXO22_CON0
+#define MT6359_RG_LDO_VXO22_EN_SHIFT           0
+#define MT6359_DA_VXO22_B_EN_ADDR              MT6359_LDO_VXO22_MON
+#define MT6359_RG_LDO_VRF18_EN_ADDR            MT6359_LDO_VRF18_CON0
+#define MT6359_RG_LDO_VRF18_EN_SHIFT           0
+#define MT6359_DA_VRF18_B_EN_ADDR              MT6359_LDO_VRF18_MON
+#define MT6359_RG_LDO_VRF12_EN_ADDR            MT6359_LDO_VRF12_CON0
+#define MT6359_RG_LDO_VRF12_EN_SHIFT           0
+#define MT6359_DA_VRF12_B_EN_ADDR              MT6359_LDO_VRF12_MON
+#define MT6359_RG_LDO_VEFUSE_EN_ADDR           MT6359_LDO_VEFUSE_CON0
+#define MT6359_RG_LDO_VEFUSE_EN_SHIFT          0
+#define MT6359_DA_VEFUSE_B_EN_ADDR             MT6359_LDO_VEFUSE_MON
+#define MT6359_RG_LDO_VCN33_1_EN_0_ADDR        MT6359_LDO_VCN33_1_CON0
+#define MT6359_RG_LDO_VCN33_1_EN_0_MASK        0x1
+#define MT6359_RG_LDO_VCN33_1_EN_0_SHIFT       0
+#define MT6359_DA_VCN33_1_B_EN_ADDR            MT6359_LDO_VCN33_1_MON
+#define MT6359_RG_LDO_VCN33_1_EN_1_ADDR        MT6359_LDO_VCN33_1_MULTI_SW
+#define MT6359_RG_LDO_VCN33_1_EN_1_SHIFT       15
+#define MT6359_RG_LDO_VCN33_2_EN_0_ADDR        MT6359_LDO_VCN33_2_CON0
+#define MT6359_RG_LDO_VCN33_2_EN_0_SHIFT       0
+#define MT6359_DA_VCN33_2_B_EN_ADDR            MT6359_LDO_VCN33_2_MON
+#define MT6359_RG_LDO_VCN33_2_EN_1_ADDR        MT6359_LDO_VCN33_2_MULTI_SW
+#define MT6359_RG_LDO_VCN33_2_EN_1_MASK        0x1
+#define MT6359_RG_LDO_VCN33_2_EN_1_SHIFT       15
+#define MT6359_RG_LDO_VCN13_EN_ADDR            MT6359_LDO_VCN13_CON0
+#define MT6359_RG_LDO_VCN13_EN_SHIFT           0
+#define MT6359_DA_VCN13_B_EN_ADDR              MT6359_LDO_VCN13_MON
+#define MT6359_RG_LDO_VCN18_EN_ADDR            MT6359_LDO_VCN18_CON0
+#define MT6359_DA_VCN18_B_EN_ADDR              MT6359_LDO_VCN18_MON
+#define MT6359_RG_LDO_VA09_EN_ADDR             MT6359_LDO_VA09_CON0
+#define MT6359_RG_LDO_VA09_EN_SHIFT            0
+#define MT6359_DA_VA09_B_EN_ADDR               MT6359_LDO_VA09_MON
+#define MT6359_RG_LDO_VCAMIO_EN_ADDR           MT6359_LDO_VCAMIO_CON0
+#define MT6359_RG_LDO_VCAMIO_EN_SHIFT          0
+#define MT6359_DA_VCAMIO_B_EN_ADDR             MT6359_LDO_VCAMIO_MON
+#define MT6359_RG_LDO_VA12_EN_ADDR             MT6359_LDO_VA12_CON0
+#define MT6359_RG_LDO_VA12_EN_SHIFT            0
+#define MT6359_DA_VA12_B_EN_ADDR               MT6359_LDO_VA12_MON
+#define MT6359_RG_LDO_VAUX18_EN_ADDR           MT6359_LDO_VAUX18_CON0
+#define MT6359_DA_VAUX18_B_EN_ADDR             MT6359_LDO_VAUX18_MON
+#define MT6359_RG_LDO_VAUD18_EN_ADDR           MT6359_LDO_VAUD18_CON0
+#define MT6359_DA_VAUD18_B_EN_ADDR             MT6359_LDO_VAUD18_MON
+#define MT6359_RG_LDO_VIO18_EN_ADDR            MT6359_LDO_VIO18_CON0
+#define MT6359_RG_LDO_VIO18_EN_SHIFT           0
+#define MT6359_DA_VIO18_B_EN_ADDR              MT6359_LDO_VIO18_MON
+#define MT6359_RG_LDO_VEMC_EN_ADDR             MT6359_LDO_VEMC_CON0
+#define MT6359_RG_LDO_VEMC_EN_SHIFT            0
+#define MT6359_DA_VEMC_B_EN_ADDR               MT6359_LDO_VEMC_MON
+#define MT6359_RG_LDO_VSIM1_EN_ADDR            MT6359_LDO_VSIM1_CON0
+#define MT6359_RG_LDO_VSIM1_EN_SHIFT           0
+#define MT6359_DA_VSIM1_B_EN_ADDR              MT6359_LDO_VSIM1_MON
+#define MT6359_RG_LDO_VSIM2_EN_ADDR            MT6359_LDO_VSIM2_CON0
+#define MT6359_RG_LDO_VSIM2_EN_SHIFT           0
+#define MT6359_DA_VSIM2_B_EN_ADDR              MT6359_LDO_VSIM2_MON
+#define MT6359_RG_LDO_VUSB_EN_0_ADDR           MT6359_LDO_VUSB_CON0
+#define MT6359_RG_LDO_VUSB_EN_0_MASK           0x1
+#define MT6359_RG_LDO_VUSB_EN_0_SHIFT          0
+#define MT6359_DA_VUSB_B_EN_ADDR               MT6359_LDO_VUSB_MON
+#define MT6359_RG_LDO_VUSB_EN_1_ADDR           MT6359_LDO_VUSB_MULTI_SW
+#define MT6359_RG_LDO_VUSB_EN_1_MASK           0x1
+#define MT6359_RG_LDO_VUSB_EN_1_SHIFT          15
+#define MT6359_RG_LDO_VRFCK_EN_ADDR            MT6359_LDO_VRFCK_CON0
+#define MT6359_RG_LDO_VRFCK_EN_SHIFT           0
+#define MT6359_DA_VRFCK_B_EN_ADDR              MT6359_LDO_VRFCK_MON
+#define MT6359_RG_LDO_VBBCK_EN_ADDR            MT6359_LDO_VBBCK_CON0
+#define MT6359_RG_LDO_VBBCK_EN_SHIFT           0
+#define MT6359_DA_VBBCK_B_EN_ADDR              MT6359_LDO_VBBCK_MON
+#define MT6359_RG_LDO_VBIF28_EN_ADDR           MT6359_LDO_VBIF28_CON0
+#define MT6359_DA_VBIF28_B_EN_ADDR             MT6359_LDO_VBIF28_MON
+#define MT6359_RG_LDO_VIBR_EN_ADDR             MT6359_LDO_VIBR_CON0
+#define MT6359_RG_LDO_VIBR_EN_SHIFT            0
+#define MT6359_DA_VIBR_B_EN_ADDR               MT6359_LDO_VIBR_MON
+#define MT6359_RG_LDO_VIO28_EN_ADDR            MT6359_LDO_VIO28_CON0
+#define MT6359_RG_LDO_VIO28_EN_SHIFT           0
+#define MT6359_DA_VIO28_B_EN_ADDR              MT6359_LDO_VIO28_MON
+#define MT6359_RG_LDO_VM18_EN_ADDR             MT6359_LDO_VM18_CON0
+#define MT6359_RG_LDO_VM18_EN_SHIFT            0
+#define MT6359_DA_VM18_B_EN_ADDR               MT6359_LDO_VM18_MON
+#define MT6359_RG_LDO_VUFS_EN_ADDR             MT6359_LDO_VUFS_CON0
+#define MT6359_RG_LDO_VUFS_EN_SHIFT               0
+#define MT6359_DA_VUFS_B_EN_ADDR               MT6359_LDO_VUFS_MON
+#define MT6359_RG_LDO_VSRAM_PROC1_EN_ADDR      MT6359_LDO_VSRAM_PROC1_CON0
+#define MT6359_DA_VSRAM_PROC1_B_EN_ADDR        MT6359_LDO_VSRAM_PROC1_MON
+#define MT6359_DA_VSRAM_PROC1_VOSEL_ADDR       MT6359_LDO_VSRAM_PROC1_VOSEL1
+#define MT6359_DA_VSRAM_PROC1_VOSEL_MASK       0x7F
+#define MT6359_DA_VSRAM_PROC1_VOSEL_SHIFT      8
+#define MT6359_RG_LDO_VSRAM_PROC2_EN_ADDR      MT6359_LDO_VSRAM_PROC2_CON0
+#define MT6359_DA_VSRAM_PROC2_B_EN_ADDR        MT6359_LDO_VSRAM_PROC2_MON
+#define MT6359_DA_VSRAM_PROC2_VOSEL_ADDR       MT6359_LDO_VSRAM_PROC2_VOSEL1
+#define MT6359_DA_VSRAM_PROC2_VOSEL_MASK       0x7F
+#define MT6359_DA_VSRAM_PROC2_VOSEL_SHIFT      8
+#define MT6359_RG_LDO_VSRAM_OTHERS_EN_ADDR     MT6359_LDO_VSRAM_OTHERS_CON0
+#define MT6359_DA_VSRAM_OTHERS_B_EN_ADDR       MT6359_LDO_VSRAM_OTHERS_MON
+#define MT6359_DA_VSRAM_OTHERS_VOSEL_ADDR      MT6359_LDO_VSRAM_OTHERS_VOSEL1
+#define MT6359_DA_VSRAM_OTHERS_VOSEL_MASK      0x7F
+#define MT6359_DA_VSRAM_OTHERS_VOSEL_SHIFT     8
+#define MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_EN_ADDR     MT6359_LDO_VSRAM_OTHERS_SSHUB
+#define MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_ADDR  MT6359_LDO_VSRAM_OTHERS_SSHUB
+#define MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_MASK  0x7F
+#define MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_SHIFT 1
+#define MT6359_RG_LDO_VSRAM_MD_EN_ADDR         MT6359_LDO_VSRAM_MD_CON0
+#define MT6359_DA_VSRAM_MD_B_EN_ADDR           MT6359_LDO_VSRAM_MD_MON
+#define MT6359_DA_VSRAM_MD_VOSEL_ADDR          MT6359_LDO_VSRAM_MD_VOSEL1
+#define MT6359_DA_VSRAM_MD_VOSEL_MASK          0x7F
+#define MT6359_DA_VSRAM_MD_VOSEL_SHIFT         8
+#define MT6359_RG_VCN33_1_VOSEL_ADDR           MT6359_VCN33_1_ANA_CON0
+#define MT6359_RG_VCN33_1_VOSEL_MASK           0xF
+#define MT6359_RG_VCN33_1_VOSEL_SHIFT          8
+#define MT6359_RG_VCN33_2_VOSEL_ADDR           MT6359_VCN33_2_ANA_CON0
+#define MT6359_RG_VCN33_2_VOSEL_MASK           0xF
+#define MT6359_RG_VCN33_2_VOSEL_SHIFT          8
+#define MT6359_RG_VEMC_VOSEL_ADDR              MT6359_VEMC_ANA_CON0
+#define MT6359_RG_VEMC_VOSEL_MASK              0xF
+#define MT6359_RG_VEMC_VOSEL_SHIFT             8
+#define MT6359_RG_VSIM1_VOSEL_ADDR             MT6359_VSIM1_ANA_CON0
+#define MT6359_RG_VSIM1_VOSEL_MASK             0xF
+#define MT6359_RG_VSIM1_VOSEL_SHIFT            8
+#define MT6359_RG_VSIM2_VOSEL_ADDR             MT6359_VSIM2_ANA_CON0
+#define MT6359_RG_VSIM2_VOSEL_MASK             0xF
+#define MT6359_RG_VSIM2_VOSEL_SHIFT            8
+#define MT6359_RG_VIO28_VOSEL_ADDR             MT6359_VIO28_ANA_CON0
+#define MT6359_RG_VIO28_VOSEL_MASK             0xF
+#define MT6359_RG_VIO28_VOSEL_SHIFT            8
+#define MT6359_RG_VIBR_VOSEL_ADDR              MT6359_VIBR_ANA_CON0
+#define MT6359_RG_VIBR_VOSEL_MASK              0xF
+#define MT6359_RG_VIBR_VOSEL_SHIFT             8
+#define MT6359_RG_VRF18_VOSEL_ADDR             MT6359_VRF18_ANA_CON0
+#define MT6359_RG_VRF18_VOSEL_MASK             0xF
+#define MT6359_RG_VRF18_VOSEL_SHIFT            8
+#define MT6359_RG_VEFUSE_VOSEL_ADDR            MT6359_VEFUSE_ANA_CON0
+#define MT6359_RG_VEFUSE_VOSEL_MASK            0xF
+#define MT6359_RG_VEFUSE_VOSEL_SHIFT           8
+#define MT6359_RG_VCAMIO_VOSEL_ADDR            MT6359_VCAMIO_ANA_CON0
+#define MT6359_RG_VCAMIO_VOSEL_MASK            0xF
+#define MT6359_RG_VCAMIO_VOSEL_SHIFT           8
+#define MT6359_RG_VIO18_VOSEL_ADDR             MT6359_VIO18_ANA_CON0
+#define MT6359_RG_VIO18_VOSEL_MASK             0xF
+#define MT6359_RG_VIO18_VOSEL_SHIFT            8
+#define MT6359_RG_VM18_VOSEL_ADDR              MT6359_VM18_ANA_CON0
+#define MT6359_RG_VM18_VOSEL_MASK              0xF
+#define MT6359_RG_VM18_VOSEL_SHIFT             8
+#define MT6359_RG_VUFS_VOSEL_ADDR              MT6359_VUFS_ANA_CON0
+#define MT6359_RG_VUFS_VOSEL_MASK              0xF
+#define MT6359_RG_VUFS_VOSEL_SHIFT             8
+#define MT6359_RG_VRF12_VOSEL_ADDR             MT6359_VRF12_ANA_CON0
+#define MT6359_RG_VRF12_VOSEL_MASK             0xF
+#define MT6359_RG_VRF12_VOSEL_SHIFT            8
+#define MT6359_RG_VCN13_VOSEL_ADDR             MT6359_VCN13_ANA_CON0
+#define MT6359_RG_VCN13_VOSEL_MASK             0xF
+#define MT6359_RG_VCN13_VOSEL_SHIFT            8
+#define MT6359_RG_VA09_VOSEL_ADDR              MT6359_VA09_ANA_CON0
+#define MT6359_RG_VA09_VOSEL_MASK              0xF
+#define MT6359_RG_VA09_VOSEL_SHIFT             8
+#define MT6359_RG_VA12_VOSEL_ADDR              MT6359_VA12_ANA_CON0
+#define MT6359_RG_VA12_VOSEL_MASK              0xF
+#define MT6359_RG_VA12_VOSEL_SHIFT             8
+#define MT6359_RG_VXO22_VOSEL_ADDR             MT6359_VXO22_ANA_CON0
+#define MT6359_RG_VXO22_VOSEL_MASK             0xF
+#define MT6359_RG_VXO22_VOSEL_SHIFT            8
+#define MT6359_RG_VRFCK_VOSEL_ADDR             MT6359_VRFCK_ANA_CON0
+#define MT6359_RG_VRFCK_VOSEL_MASK             0xF
+#define MT6359_RG_VRFCK_VOSEL_SHIFT            8
+#define MT6359_RG_VBBCK_VOSEL_ADDR             MT6359_VBBCK_ANA_CON0
+#define MT6359_RG_VBBCK_VOSEL_MASK             0xF
+#define MT6359_RG_VBBCK_VOSEL_SHIFT            8
+
+#endif /* __MFD_MT6359_REGISTERS_H__ */
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h
index 949268581b36..56f210eebc54 100644
--- a/include/linux/mfd/mt6397/core.h
+++ b/include/linux/mfd/mt6397/core.h
@@ -13,6 +13,7 @@
 enum chip_id {
 	MT6323_CHIP_ID = 0x23,
 	MT6358_CHIP_ID = 0x58,
+	MT6359_CHIP_ID = 0x59,
 	MT6391_CHIP_ID = 0x91,
 	MT6397_CHIP_ID = 0x97,
 };
-- 
cgit v1.2.3


From d7a58decc7049e8ca9707b63fcc2556cde3d26c5 Mon Sep 17 00:00:00 2001
From: Wen Su <wen.su@mediatek.com>
Date: Wed, 26 May 2021 14:52:05 +0800
Subject: regulator: mt6359: Add support for MT6359 regulator

The MT6359 is a regulator found on boards based on MediaTek MT6779 and
probably other SoCs. It is a so called pmic and connects as a slave to
SoC using SPI, wrapped inside the pmic-wrapper.

Signed-off-by: Wen Su <wen.su@mediatek.com>
Signed-off-by: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/Kconfig                  |   9 +
 drivers/regulator/Makefile                 |   1 +
 drivers/regulator/mt6359-regulator.c       | 669 +++++++++++++++++++++++++++++
 include/linux/regulator/mt6359-regulator.h |  58 +++
 4 files changed, 737 insertions(+)
 create mode 100644 drivers/regulator/mt6359-regulator.c
 create mode 100644 include/linux/regulator/mt6359-regulator.h

(limited to 'include/linux')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 9d84d9245490..1ef47c9fb336 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -779,6 +779,15 @@ config REGULATOR_MT6358
 	  This driver supports the control of different power rails of device
 	  through regulator interface.
 
+config REGULATOR_MT6359
+	tristate "MediaTek MT6359 PMIC"
+	depends on MFD_MT6397
+	help
+	  Say y here to select this option to enable the power regulator of
+	  MediaTek MT6359 PMIC.
+	  This driver supports the control of different power rails of device
+	  through regulator interface.
+
 config REGULATOR_MT6360
 	tristate "MT6360 SubPMIC Regulator"
 	depends on MFD_MT6360
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index 580b015296ea..4f4d99622db8 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_REGULATOR_MT6311) += mt6311-regulator.o
 obj-$(CONFIG_REGULATOR_MT6315) += mt6315-regulator.o
 obj-$(CONFIG_REGULATOR_MT6323)	+= mt6323-regulator.o
 obj-$(CONFIG_REGULATOR_MT6358)	+= mt6358-regulator.o
+obj-$(CONFIG_REGULATOR_MT6359)	+= mt6359-regulator.o
 obj-$(CONFIG_REGULATOR_MT6360) += mt6360-regulator.o
 obj-$(CONFIG_REGULATOR_MT6380)	+= mt6380-regulator.o
 obj-$(CONFIG_REGULATOR_MT6397)	+= mt6397-regulator.o
diff --git a/drivers/regulator/mt6359-regulator.c b/drivers/regulator/mt6359-regulator.c
new file mode 100644
index 000000000000..994d3f67f73d
--- /dev/null
+++ b/drivers/regulator/mt6359-regulator.c
@@ -0,0 +1,669 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (c) 2021 MediaTek Inc.
+
+#include <linux/platform_device.h>
+#include <linux/mfd/mt6359/registers.h>
+#include <linux/mfd/mt6397/core.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
+#include <linux/regulator/mt6359-regulator.h>
+#include <linux/regulator/of_regulator.h>
+
+#define MT6359_BUCK_MODE_AUTO		0
+#define MT6359_BUCK_MODE_FORCE_PWM	1
+#define MT6359_BUCK_MODE_NORMAL		0
+#define MT6359_BUCK_MODE_LP		2
+
+/*
+ * MT6359 regulators' information
+ *
+ * @desc: standard fields of regulator description.
+ * @status_reg: for query status of regulators.
+ * @qi: Mask for query enable signal status of regulators.
+ * @modeset_reg: for operating AUTO/PWM mode register.
+ * @modeset_mask: MASK for operating modeset register.
+ * @modeset_shift: SHIFT for operating modeset register.
+ */
+struct mt6359_regulator_info {
+	struct regulator_desc desc;
+	u32 status_reg;
+	u32 qi;
+	u32 modeset_reg;
+	u32 modeset_mask;
+	u32 modeset_shift;
+	u32 lp_mode_reg;
+	u32 lp_mode_mask;
+	u32 lp_mode_shift;
+};
+
+#define MT6359_BUCK(match, _name, min, max, step, min_sel,	\
+	volt_ranges, _enable_reg, _status_reg,			\
+	_vsel_reg, _vsel_mask,					\
+	_lp_mode_reg, _lp_mode_shift,				\
+	_modeset_reg, _modeset_shift)				\
+[MT6359_ID_##_name] = {						\
+	.desc = {						\
+		.name = #_name,					\
+		.of_match = of_match_ptr(match),		\
+		.regulators_node = of_match_ptr("regulators"),	\
+		.ops = &mt6359_volt_range_ops,			\
+		.type = REGULATOR_VOLTAGE,			\
+		.id = MT6359_ID_##_name,			\
+		.owner = THIS_MODULE,				\
+		.uV_step = (step),				\
+		.linear_min_sel = (min_sel),			\
+		.n_voltages = ((max) - (min)) / (step) + 1,	\
+		.min_uV = (min),				\
+		.linear_ranges = volt_ranges,			\
+		.n_linear_ranges = ARRAY_SIZE(volt_ranges),	\
+		.vsel_reg = _vsel_reg,				\
+		.vsel_mask = _vsel_mask,			\
+		.enable_reg = _enable_reg,			\
+		.enable_mask = BIT(0),				\
+		.of_map_mode = mt6359_map_mode,			\
+	},							\
+	.status_reg = _status_reg,				\
+	.qi = BIT(0),						\
+	.lp_mode_reg = _lp_mode_reg,				\
+	.lp_mode_mask = BIT(_lp_mode_shift),			\
+	.lp_mode_shift = _lp_mode_shift,			\
+	.modeset_reg = _modeset_reg,				\
+	.modeset_mask = BIT(_modeset_shift),			\
+	.modeset_shift = _modeset_shift				\
+}
+
+#define MT6359_LDO_LINEAR(match, _name, min, max, step, min_sel,\
+	volt_ranges, _enable_reg, _status_reg,			\
+	_vsel_reg, _vsel_mask)					\
+[MT6359_ID_##_name] = {						\
+	.desc = {						\
+		.name = #_name,					\
+		.of_match = of_match_ptr(match),		\
+		.regulators_node = of_match_ptr("regulators"),	\
+		.ops = &mt6359_volt_range_ops,			\
+		.type = REGULATOR_VOLTAGE,			\
+		.id = MT6359_ID_##_name,			\
+		.owner = THIS_MODULE,				\
+		.uV_step = (step),				\
+		.linear_min_sel = (min_sel),			\
+		.n_voltages = ((max) - (min)) / (step) + 1,	\
+		.min_uV = (min),				\
+		.linear_ranges = volt_ranges,			\
+		.n_linear_ranges = ARRAY_SIZE(volt_ranges),	\
+		.vsel_reg = _vsel_reg,				\
+		.vsel_mask = _vsel_mask,			\
+		.enable_reg = _enable_reg,			\
+		.enable_mask = BIT(0),				\
+	},							\
+	.status_reg = _status_reg,				\
+	.qi = BIT(0),						\
+}
+
+#define MT6359_LDO(match, _name, _volt_table,			\
+	_enable_reg, _enable_mask, _status_reg,			\
+	_vsel_reg, _vsel_mask, _en_delay)			\
+[MT6359_ID_##_name] = {						\
+	.desc = {						\
+		.name = #_name,					\
+		.of_match = of_match_ptr(match),		\
+		.regulators_node = of_match_ptr("regulators"),	\
+		.ops = &mt6359_volt_table_ops,			\
+		.type = REGULATOR_VOLTAGE,			\
+		.id = MT6359_ID_##_name,			\
+		.owner = THIS_MODULE,				\
+		.n_voltages = ARRAY_SIZE(_volt_table),		\
+		.volt_table = _volt_table,			\
+		.vsel_reg = _vsel_reg,				\
+		.vsel_mask = _vsel_mask,			\
+		.enable_reg = _enable_reg,			\
+		.enable_mask = BIT(_enable_mask),		\
+		.enable_time = _en_delay,			\
+	},							\
+	.status_reg = _status_reg,				\
+	.qi = BIT(0),						\
+}
+
+#define MT6359_REG_FIXED(match, _name, _enable_reg,	\
+	_status_reg, _fixed_volt)			\
+[MT6359_ID_##_name] = {					\
+	.desc = {					\
+		.name = #_name,				\
+		.of_match = of_match_ptr(match),	\
+		.regulators_node = of_match_ptr("regulators"),	\
+		.ops = &mt6359_volt_fixed_ops,		\
+		.type = REGULATOR_VOLTAGE,		\
+		.id = MT6359_ID_##_name,		\
+		.owner = THIS_MODULE,			\
+		.n_voltages = 1,			\
+		.enable_reg = _enable_reg,		\
+		.enable_mask = BIT(0),			\
+		.fixed_uV = (_fixed_volt),		\
+	},						\
+	.status_reg = _status_reg,			\
+	.qi = BIT(0),					\
+}
+
+static const struct linear_range mt_volt_range1[] = {
+	REGULATOR_LINEAR_RANGE(800000, 0, 0x70, 12500),
+};
+
+static const struct linear_range mt_volt_range2[] = {
+	REGULATOR_LINEAR_RANGE(400000, 0, 0x7f, 6250),
+};
+
+static const struct linear_range mt_volt_range3[] = {
+	REGULATOR_LINEAR_RANGE(400000, 0, 0x70, 6250),
+};
+
+static const struct linear_range mt_volt_range4[] = {
+	REGULATOR_LINEAR_RANGE(800000, 0, 0x40, 12500),
+};
+
+static const struct linear_range mt_volt_range5[] = {
+	REGULATOR_LINEAR_RANGE(500000, 0, 0x3F, 50000),
+};
+
+static const struct linear_range mt_volt_range6[] = {
+	REGULATOR_LINEAR_RANGE(500000, 0, 0x7f, 6250),
+};
+
+static const struct linear_range mt_volt_range7[] = {
+	REGULATOR_LINEAR_RANGE(500000, 0, 0x7f, 6250),
+};
+
+static const u32 vsim1_voltages[] = {
+	0, 0, 0, 1700000, 1800000, 0, 0, 0, 2700000, 0, 0, 3000000, 3100000,
+};
+
+static const u32 vibr_voltages[] = {
+	1200000, 1300000, 1500000, 0, 1800000, 2000000, 0, 0, 2700000, 2800000,
+	0, 3000000, 0, 3300000,
+};
+
+static const u32 vrf12_voltages[] = {
+	0, 0, 1100000, 1200000,	1300000,
+};
+
+static const u32 volt18_voltages[] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1700000, 1800000, 1900000,
+};
+
+static const u32 vcn13_voltages[] = {
+	900000, 1000000, 0, 1200000, 1300000,
+};
+
+static const u32 vcn33_voltages[] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 2800000, 0, 0, 0, 3300000, 3400000, 3500000,
+};
+
+static const u32 vefuse_voltages[] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1700000, 1800000, 1900000, 2000000,
+};
+
+static const u32 vxo22_voltages[] = {
+	1800000, 0, 0, 0, 2200000,
+};
+
+static const u32 vrfck_voltages[] = {
+	0, 0, 1500000, 0, 0, 0, 0, 1600000, 0, 0, 0, 0, 1700000,
+};
+
+static const u32 vio28_voltages[] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 2800000, 2900000, 3000000, 3100000, 3300000,
+};
+
+static const u32 vemc_voltages[] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2900000, 3000000, 0, 3300000,
+};
+
+static const u32 va12_voltages[] = {
+	0, 0, 0, 0, 0, 0, 1200000, 1300000,
+};
+
+static const u32 va09_voltages[] = {
+	0, 0, 800000, 900000, 0, 0, 1200000,
+};
+
+static const u32 vrf18_voltages[] = {
+	0, 0, 0, 0, 0, 1700000, 1800000, 1810000,
+};
+
+static const u32 vbbck_voltages[] = {
+	0, 0, 0, 0, 1100000, 0, 0, 0, 1150000, 0, 0, 0, 1200000,
+};
+
+static const u32 vsim2_voltages[] = {
+	0, 0, 0, 1700000, 1800000, 0, 0, 0, 2700000, 0, 0, 3000000, 3100000,
+};
+
+static inline unsigned int mt6359_map_mode(unsigned int mode)
+{
+	switch (mode) {
+	case MT6359_BUCK_MODE_NORMAL:
+		return REGULATOR_MODE_NORMAL;
+	case MT6359_BUCK_MODE_FORCE_PWM:
+		return REGULATOR_MODE_FAST;
+	case MT6359_BUCK_MODE_LP:
+		return REGULATOR_MODE_IDLE;
+	default:
+		return REGULATOR_MODE_INVALID;
+	}
+}
+
+static int mt6359_get_status(struct regulator_dev *rdev)
+{
+	int ret;
+	u32 regval;
+	struct mt6359_regulator_info *info = rdev_get_drvdata(rdev);
+
+	ret = regmap_read(rdev->regmap, info->status_reg, &regval);
+	if (ret != 0) {
+		dev_err(&rdev->dev, "Failed to get enable reg: %d\n", ret);
+		return ret;
+	}
+
+	if (regval & info->qi)
+		return REGULATOR_STATUS_ON;
+	else
+		return REGULATOR_STATUS_OFF;
+}
+
+static unsigned int mt6359_regulator_get_mode(struct regulator_dev *rdev)
+{
+	struct mt6359_regulator_info *info = rdev_get_drvdata(rdev);
+	int ret, regval;
+
+	ret = regmap_read(rdev->regmap, info->modeset_reg, &regval);
+	if (ret != 0) {
+		dev_err(&rdev->dev,
+			"Failed to get mt6359 buck mode: %d\n", ret);
+		return ret;
+	}
+
+	if ((regval & info->modeset_mask) >> info->modeset_shift ==
+		MT6359_BUCK_MODE_FORCE_PWM)
+		return REGULATOR_MODE_FAST;
+
+	ret = regmap_read(rdev->regmap, info->lp_mode_reg, &regval);
+	if (ret != 0) {
+		dev_err(&rdev->dev,
+			"Failed to get mt6359 buck lp mode: %d\n", ret);
+		return ret;
+	}
+
+	if (regval & info->lp_mode_mask)
+		return REGULATOR_MODE_IDLE;
+	else
+		return REGULATOR_MODE_NORMAL;
+}
+
+static int mt6359_regulator_set_mode(struct regulator_dev *rdev,
+				     unsigned int mode)
+{
+	struct mt6359_regulator_info *info = rdev_get_drvdata(rdev);
+	int ret = 0, val;
+	int curr_mode;
+
+	curr_mode = mt6359_regulator_get_mode(rdev);
+	switch (mode) {
+	case REGULATOR_MODE_FAST:
+		val = MT6359_BUCK_MODE_FORCE_PWM;
+		val <<= info->modeset_shift;
+		ret = regmap_update_bits(rdev->regmap,
+					 info->modeset_reg,
+					 info->modeset_mask,
+					 val);
+		break;
+	case REGULATOR_MODE_NORMAL:
+		if (curr_mode == REGULATOR_MODE_FAST) {
+			val = MT6359_BUCK_MODE_AUTO;
+			val <<= info->modeset_shift;
+			ret = regmap_update_bits(rdev->regmap,
+						 info->modeset_reg,
+						 info->modeset_mask,
+						 val);
+		} else if (curr_mode == REGULATOR_MODE_IDLE) {
+			val = MT6359_BUCK_MODE_NORMAL;
+			val <<= info->lp_mode_shift;
+			ret = regmap_update_bits(rdev->regmap,
+						 info->lp_mode_reg,
+						 info->lp_mode_mask,
+						 val);
+			udelay(100);
+		}
+		break;
+	case REGULATOR_MODE_IDLE:
+		val = MT6359_BUCK_MODE_LP >> 1;
+		val <<= info->lp_mode_shift;
+		ret = regmap_update_bits(rdev->regmap,
+					 info->lp_mode_reg,
+					 info->lp_mode_mask,
+					 val);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (ret != 0) {
+		dev_err(&rdev->dev,
+			"Failed to set mt6359 buck mode: %d\n", ret);
+	}
+
+	return ret;
+}
+
+static const struct regulator_ops mt6359_volt_range_ops = {
+	.list_voltage = regulator_list_voltage_linear_range,
+	.map_voltage = regulator_map_voltage_linear_range,
+	.set_voltage_sel = regulator_set_voltage_sel_regmap,
+	.get_voltage_sel = regulator_get_voltage_sel_regmap,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.get_status = mt6359_get_status,
+	.set_mode = mt6359_regulator_set_mode,
+	.get_mode = mt6359_regulator_get_mode,
+};
+
+static const struct regulator_ops mt6359_volt_table_ops = {
+	.list_voltage = regulator_list_voltage_table,
+	.map_voltage = regulator_map_voltage_iterate,
+	.set_voltage_sel = regulator_set_voltage_sel_regmap,
+	.get_voltage_sel = regulator_get_voltage_sel_regmap,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.get_status = mt6359_get_status,
+};
+
+static const struct regulator_ops mt6359_volt_fixed_ops = {
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.get_status = mt6359_get_status,
+};
+
+/* The array is indexed by id(MT6359_ID_XXX) */
+static struct mt6359_regulator_info mt6359_regulators[] = {
+	MT6359_BUCK("buck_vs1", VS1, 800000, 2200000, 12500, 0,
+		    mt_volt_range1, MT6359_RG_BUCK_VS1_EN_ADDR,
+		    MT6359_DA_VS1_EN_ADDR, MT6359_RG_BUCK_VS1_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VS1_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VS1_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VS1_LP_ADDR, MT6359_RG_BUCK_VS1_LP_SHIFT,
+		    MT6359_RG_VS1_FPWM_ADDR, MT6359_RG_VS1_FPWM_SHIFT),
+	MT6359_BUCK("buck_vgpu11", VGPU11, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VGPU11_EN_ADDR,
+		    MT6359_DA_VGPU11_EN_ADDR, MT6359_RG_BUCK_VGPU11_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VGPU11_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VGPU11_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VGPU11_LP_ADDR,
+		    MT6359_RG_BUCK_VGPU11_LP_SHIFT,
+		    MT6359_RG_VGPU11_FCCM_ADDR, MT6359_RG_VGPU11_FCCM_SHIFT),
+	MT6359_BUCK("buck_vmodem", VMODEM, 400000, 1100000, 6250, 0,
+		    mt_volt_range3, MT6359_RG_BUCK_VMODEM_EN_ADDR,
+		    MT6359_DA_VMODEM_EN_ADDR, MT6359_RG_BUCK_VMODEM_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VMODEM_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VMODEM_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VMODEM_LP_ADDR,
+		    MT6359_RG_BUCK_VMODEM_LP_SHIFT,
+		    MT6359_RG_VMODEM_FCCM_ADDR, MT6359_RG_VMODEM_FCCM_SHIFT),
+	MT6359_BUCK("buck_vpu", VPU, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VPU_EN_ADDR,
+		    MT6359_DA_VPU_EN_ADDR, MT6359_RG_BUCK_VPU_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VPU_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VPU_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VPU_LP_ADDR, MT6359_RG_BUCK_VPU_LP_SHIFT,
+		    MT6359_RG_VPU_FCCM_ADDR, MT6359_RG_VPU_FCCM_SHIFT),
+	MT6359_BUCK("buck_vcore", VCORE, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VCORE_EN_ADDR,
+		    MT6359_DA_VCORE_EN_ADDR, MT6359_RG_BUCK_VCORE_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VCORE_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VCORE_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VCORE_LP_ADDR, MT6359_RG_BUCK_VCORE_LP_SHIFT,
+		    MT6359_RG_VCORE_FCCM_ADDR, MT6359_RG_VCORE_FCCM_SHIFT),
+	MT6359_BUCK("buck_vs2", VS2, 800000, 1600000, 12500, 0,
+		    mt_volt_range4, MT6359_RG_BUCK_VS2_EN_ADDR,
+		    MT6359_DA_VS2_EN_ADDR, MT6359_RG_BUCK_VS2_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VS2_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VS2_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VS2_LP_ADDR, MT6359_RG_BUCK_VS2_LP_SHIFT,
+		    MT6359_RG_VS2_FPWM_ADDR, MT6359_RG_VS2_FPWM_SHIFT),
+	MT6359_BUCK("buck_vpa", VPA, 500000, 3650000, 50000, 0,
+		    mt_volt_range5, MT6359_RG_BUCK_VPA_EN_ADDR,
+		    MT6359_DA_VPA_EN_ADDR, MT6359_RG_BUCK_VPA_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VPA_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VPA_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VPA_LP_ADDR, MT6359_RG_BUCK_VPA_LP_SHIFT,
+		    MT6359_RG_VPA_MODESET_ADDR, MT6359_RG_VPA_MODESET_SHIFT),
+	MT6359_BUCK("buck_vproc2", VPROC2, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VPROC2_EN_ADDR,
+		    MT6359_DA_VPROC2_EN_ADDR, MT6359_RG_BUCK_VPROC2_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VPROC2_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VPROC2_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VPROC2_LP_ADDR,
+		    MT6359_RG_BUCK_VPROC2_LP_SHIFT,
+		    MT6359_RG_VPROC2_FCCM_ADDR, MT6359_RG_VPROC2_FCCM_SHIFT),
+	MT6359_BUCK("buck_vproc1", VPROC1, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VPROC1_EN_ADDR,
+		    MT6359_DA_VPROC1_EN_ADDR, MT6359_RG_BUCK_VPROC1_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VPROC1_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VPROC1_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VPROC1_LP_ADDR,
+		    MT6359_RG_BUCK_VPROC1_LP_SHIFT,
+		    MT6359_RG_VPROC1_FCCM_ADDR, MT6359_RG_VPROC1_FCCM_SHIFT),
+	MT6359_BUCK("buck_vcore_sshub", VCORE_SSHUB, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VCORE_SSHUB_EN_ADDR,
+		    MT6359_DA_VCORE_EN_ADDR,
+		    MT6359_RG_BUCK_VCORE_SSHUB_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VCORE_SSHUB_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VCORE_SSHUB_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VCORE_LP_ADDR, MT6359_RG_BUCK_VCORE_LP_SHIFT,
+		    MT6359_RG_VCORE_FCCM_ADDR, MT6359_RG_VCORE_FCCM_SHIFT),
+	MT6359_REG_FIXED("ldo_vaud18", VAUD18, MT6359_RG_LDO_VAUD18_EN_ADDR,
+			 MT6359_DA_VAUD18_B_EN_ADDR, 1800000),
+	MT6359_LDO("ldo_vsim1", VSIM1, vsim1_voltages,
+		   MT6359_RG_LDO_VSIM1_EN_ADDR, MT6359_RG_LDO_VSIM1_EN_SHIFT,
+		   MT6359_DA_VSIM1_B_EN_ADDR, MT6359_RG_VSIM1_VOSEL_ADDR,
+		   MT6359_RG_VSIM1_VOSEL_MASK << MT6359_RG_VSIM1_VOSEL_SHIFT,
+		   480),
+	MT6359_LDO("ldo_vibr", VIBR, vibr_voltages,
+		   MT6359_RG_LDO_VIBR_EN_ADDR, MT6359_RG_LDO_VIBR_EN_SHIFT,
+		   MT6359_DA_VIBR_B_EN_ADDR, MT6359_RG_VIBR_VOSEL_ADDR,
+		   MT6359_RG_VIBR_VOSEL_MASK << MT6359_RG_VIBR_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vrf12", VRF12, vrf12_voltages,
+		   MT6359_RG_LDO_VRF12_EN_ADDR, MT6359_RG_LDO_VRF12_EN_SHIFT,
+		   MT6359_DA_VRF12_B_EN_ADDR, MT6359_RG_VRF12_VOSEL_ADDR,
+		   MT6359_RG_VRF12_VOSEL_MASK << MT6359_RG_VRF12_VOSEL_SHIFT,
+		   120),
+	MT6359_REG_FIXED("ldo_vusb", VUSB, MT6359_RG_LDO_VUSB_EN_0_ADDR,
+			 MT6359_DA_VUSB_B_EN_ADDR, 3000000),
+	MT6359_LDO_LINEAR("ldo_vsram_proc2", VSRAM_PROC2, 500000, 1293750, 6250,
+			  0, mt_volt_range6, MT6359_RG_LDO_VSRAM_PROC2_EN_ADDR,
+			  MT6359_DA_VSRAM_PROC2_B_EN_ADDR,
+			  MT6359_RG_LDO_VSRAM_PROC2_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_PROC2_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_PROC2_VOSEL_SHIFT),
+	MT6359_LDO("ldo_vio18", VIO18, volt18_voltages,
+		   MT6359_RG_LDO_VIO18_EN_ADDR, MT6359_RG_LDO_VIO18_EN_SHIFT,
+		   MT6359_DA_VIO18_B_EN_ADDR, MT6359_RG_VIO18_VOSEL_ADDR,
+		   MT6359_RG_VIO18_VOSEL_MASK << MT6359_RG_VIO18_VOSEL_SHIFT,
+		   960),
+	MT6359_LDO("ldo_vcamio", VCAMIO, volt18_voltages,
+		   MT6359_RG_LDO_VCAMIO_EN_ADDR, MT6359_RG_LDO_VCAMIO_EN_SHIFT,
+		   MT6359_DA_VCAMIO_B_EN_ADDR, MT6359_RG_VCAMIO_VOSEL_ADDR,
+		   MT6359_RG_VCAMIO_VOSEL_MASK << MT6359_RG_VCAMIO_VOSEL_SHIFT,
+		   1290),
+	MT6359_REG_FIXED("ldo_vcn18", VCN18, MT6359_RG_LDO_VCN18_EN_ADDR,
+			 MT6359_DA_VCN18_B_EN_ADDR, 1800000),
+	MT6359_REG_FIXED("ldo_vfe28", VFE28, MT6359_RG_LDO_VFE28_EN_ADDR,
+			 MT6359_DA_VFE28_B_EN_ADDR, 2800000),
+	MT6359_LDO("ldo_vcn13", VCN13, vcn13_voltages,
+		   MT6359_RG_LDO_VCN13_EN_ADDR, MT6359_RG_LDO_VCN13_EN_SHIFT,
+		   MT6359_DA_VCN13_B_EN_ADDR, MT6359_RG_VCN13_VOSEL_ADDR,
+		   MT6359_RG_VCN13_VOSEL_MASK << MT6359_RG_VCN13_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vcn33_1_bt", VCN33_1_BT, vcn33_voltages,
+		   MT6359_RG_LDO_VCN33_1_EN_0_ADDR,
+		   MT6359_RG_LDO_VCN33_1_EN_0_SHIFT,
+		   MT6359_DA_VCN33_1_B_EN_ADDR, MT6359_RG_VCN33_1_VOSEL_ADDR,
+		   MT6359_RG_VCN33_1_VOSEL_MASK <<
+		   MT6359_RG_VCN33_1_VOSEL_SHIFT, 240),
+	MT6359_LDO("ldo_vcn33_1_wifi", VCN33_1_WIFI, vcn33_voltages,
+		   MT6359_RG_LDO_VCN33_1_EN_1_ADDR,
+		   MT6359_RG_LDO_VCN33_1_EN_1_SHIFT,
+		   MT6359_DA_VCN33_1_B_EN_ADDR, MT6359_RG_VCN33_1_VOSEL_ADDR,
+		   MT6359_RG_VCN33_1_VOSEL_MASK <<
+		   MT6359_RG_VCN33_1_VOSEL_SHIFT, 240),
+	MT6359_REG_FIXED("ldo_vaux18", VAUX18, MT6359_RG_LDO_VAUX18_EN_ADDR,
+			 MT6359_DA_VAUX18_B_EN_ADDR, 1800000),
+	MT6359_LDO_LINEAR("ldo_vsram_others", VSRAM_OTHERS, 500000, 1293750,
+			  6250, 0, mt_volt_range6,
+			  MT6359_RG_LDO_VSRAM_OTHERS_EN_ADDR,
+			  MT6359_DA_VSRAM_OTHERS_B_EN_ADDR,
+			  MT6359_RG_LDO_VSRAM_OTHERS_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_OTHERS_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_OTHERS_VOSEL_SHIFT),
+	MT6359_LDO("ldo_vefuse", VEFUSE, vefuse_voltages,
+		   MT6359_RG_LDO_VEFUSE_EN_ADDR, MT6359_RG_LDO_VEFUSE_EN_SHIFT,
+		   MT6359_DA_VEFUSE_B_EN_ADDR, MT6359_RG_VEFUSE_VOSEL_ADDR,
+		   MT6359_RG_VEFUSE_VOSEL_MASK << MT6359_RG_VEFUSE_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vxo22", VXO22, vxo22_voltages,
+		   MT6359_RG_LDO_VXO22_EN_ADDR, MT6359_RG_LDO_VXO22_EN_SHIFT,
+		   MT6359_DA_VXO22_B_EN_ADDR, MT6359_RG_VXO22_VOSEL_ADDR,
+		   MT6359_RG_VXO22_VOSEL_MASK << MT6359_RG_VXO22_VOSEL_SHIFT,
+		   120),
+	MT6359_LDO("ldo_vrfck", VRFCK, vrfck_voltages,
+		   MT6359_RG_LDO_VRFCK_EN_ADDR, MT6359_RG_LDO_VRFCK_EN_SHIFT,
+		   MT6359_DA_VRFCK_B_EN_ADDR, MT6359_RG_VRFCK_VOSEL_ADDR,
+		   MT6359_RG_VRFCK_VOSEL_MASK << MT6359_RG_VRFCK_VOSEL_SHIFT,
+		   480),
+	MT6359_REG_FIXED("ldo_vbif28", VBIF28, MT6359_RG_LDO_VBIF28_EN_ADDR,
+			 MT6359_DA_VBIF28_B_EN_ADDR, 2800000),
+	MT6359_LDO("ldo_vio28", VIO28, vio28_voltages,
+		   MT6359_RG_LDO_VIO28_EN_ADDR, MT6359_RG_LDO_VIO28_EN_SHIFT,
+		   MT6359_DA_VIO28_B_EN_ADDR, MT6359_RG_VIO28_VOSEL_ADDR,
+		   MT6359_RG_VIO28_VOSEL_MASK << MT6359_RG_VIO28_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vemc", VEMC, vemc_voltages,
+		   MT6359_RG_LDO_VEMC_EN_ADDR, MT6359_RG_LDO_VEMC_EN_SHIFT,
+		   MT6359_DA_VEMC_B_EN_ADDR, MT6359_RG_VEMC_VOSEL_ADDR,
+		   MT6359_RG_VEMC_VOSEL_MASK << MT6359_RG_VEMC_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vcn33_2_bt", VCN33_2_BT, vcn33_voltages,
+		   MT6359_RG_LDO_VCN33_2_EN_0_ADDR,
+		   MT6359_RG_LDO_VCN33_2_EN_0_SHIFT,
+		   MT6359_DA_VCN33_2_B_EN_ADDR, MT6359_RG_VCN33_2_VOSEL_ADDR,
+		   MT6359_RG_VCN33_2_VOSEL_MASK <<
+		   MT6359_RG_VCN33_2_VOSEL_SHIFT, 240),
+	MT6359_LDO("ldo_vcn33_2_wifi", VCN33_2_WIFI, vcn33_voltages,
+		   MT6359_RG_LDO_VCN33_2_EN_1_ADDR,
+		   MT6359_RG_LDO_VCN33_2_EN_1_SHIFT,
+		   MT6359_DA_VCN33_2_B_EN_ADDR, MT6359_RG_VCN33_2_VOSEL_ADDR,
+		   MT6359_RG_VCN33_2_VOSEL_MASK <<
+		   MT6359_RG_VCN33_2_VOSEL_SHIFT, 240),
+	MT6359_LDO("ldo_va12", VA12, va12_voltages,
+		   MT6359_RG_LDO_VA12_EN_ADDR, MT6359_RG_LDO_VA12_EN_SHIFT,
+		   MT6359_DA_VA12_B_EN_ADDR, MT6359_RG_VA12_VOSEL_ADDR,
+		   MT6359_RG_VA12_VOSEL_MASK << MT6359_RG_VA12_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_va09", VA09, va09_voltages,
+		   MT6359_RG_LDO_VA09_EN_ADDR, MT6359_RG_LDO_VA09_EN_SHIFT,
+		   MT6359_DA_VA09_B_EN_ADDR, MT6359_RG_VA09_VOSEL_ADDR,
+		   MT6359_RG_VA09_VOSEL_MASK << MT6359_RG_VA09_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vrf18", VRF18, vrf18_voltages,
+		   MT6359_RG_LDO_VRF18_EN_ADDR, MT6359_RG_LDO_VRF18_EN_SHIFT,
+		   MT6359_DA_VRF18_B_EN_ADDR, MT6359_RG_VRF18_VOSEL_ADDR,
+		   MT6359_RG_VRF18_VOSEL_MASK << MT6359_RG_VRF18_VOSEL_SHIFT,
+		   120),
+	MT6359_LDO_LINEAR("ldo_vsram_md", VSRAM_MD, 500000, 1100000, 6250,
+			  0, mt_volt_range7, MT6359_RG_LDO_VSRAM_MD_EN_ADDR,
+			  MT6359_DA_VSRAM_MD_B_EN_ADDR,
+			  MT6359_RG_LDO_VSRAM_MD_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_MD_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_MD_VOSEL_SHIFT),
+	MT6359_LDO("ldo_vufs", VUFS, volt18_voltages,
+		   MT6359_RG_LDO_VUFS_EN_ADDR, MT6359_RG_LDO_VUFS_EN_SHIFT,
+		   MT6359_DA_VUFS_B_EN_ADDR, MT6359_RG_VUFS_VOSEL_ADDR,
+		   MT6359_RG_VUFS_VOSEL_MASK << MT6359_RG_VUFS_VOSEL_SHIFT,
+		   1920),
+	MT6359_LDO("ldo_vm18", VM18, volt18_voltages,
+		   MT6359_RG_LDO_VM18_EN_ADDR, MT6359_RG_LDO_VM18_EN_SHIFT,
+		   MT6359_DA_VM18_B_EN_ADDR, MT6359_RG_VM18_VOSEL_ADDR,
+		   MT6359_RG_VM18_VOSEL_MASK << MT6359_RG_VM18_VOSEL_SHIFT,
+		   1920),
+	MT6359_LDO("ldo_vbbck", VBBCK, vbbck_voltages,
+		   MT6359_RG_LDO_VBBCK_EN_ADDR, MT6359_RG_LDO_VBBCK_EN_SHIFT,
+		   MT6359_DA_VBBCK_B_EN_ADDR, MT6359_RG_VBBCK_VOSEL_ADDR,
+		   MT6359_RG_VBBCK_VOSEL_MASK << MT6359_RG_VBBCK_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO_LINEAR("ldo_vsram_proc1", VSRAM_PROC1, 500000, 1293750, 6250,
+			  0, mt_volt_range6, MT6359_RG_LDO_VSRAM_PROC1_EN_ADDR,
+			  MT6359_DA_VSRAM_PROC1_B_EN_ADDR,
+			  MT6359_RG_LDO_VSRAM_PROC1_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_PROC1_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_PROC1_VOSEL_SHIFT),
+	MT6359_LDO("ldo_vsim2", VSIM2, vsim2_voltages,
+		   MT6359_RG_LDO_VSIM2_EN_ADDR, MT6359_RG_LDO_VSIM2_EN_SHIFT,
+		   MT6359_DA_VSIM2_B_EN_ADDR, MT6359_RG_VSIM2_VOSEL_ADDR,
+		   MT6359_RG_VSIM2_VOSEL_MASK << MT6359_RG_VSIM2_VOSEL_SHIFT,
+		   480),
+	MT6359_LDO_LINEAR("ldo_vsram_others_sshub", VSRAM_OTHERS_SSHUB,
+			  500000, 1293750, 6250, 0, mt_volt_range6,
+			  MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_EN_ADDR,
+			  MT6359_DA_VSRAM_OTHERS_B_EN_ADDR,
+			  MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_SHIFT),
+};
+
+static int mt6359_regulator_probe(struct platform_device *pdev)
+{
+	struct mt6397_chip *mt6397 = dev_get_drvdata(pdev->dev.parent);
+	struct regulator_config config = {};
+	struct regulator_dev *rdev;
+	int i;
+
+	config.dev = mt6397->dev;
+	config.regmap = mt6397->regmap;
+	for (i = 0; i < MT6359_MAX_REGULATOR; i++) {
+		config.driver_data = &mt6359_regulators[i];
+		rdev = devm_regulator_register(&pdev->dev, &mt6359_regulators[i].desc, &config);
+		if (IS_ERR(rdev)) {
+			dev_err(&pdev->dev, "failed to register %s\n",
+				mt6359_regulators[i].desc.name);
+			return PTR_ERR(rdev);
+		}
+	}
+
+	return 0;
+}
+
+static const struct platform_device_id mt6359_platform_ids[] = {
+	{"mt6359-regulator", 0},
+	{ /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(platform, mt6359_platform_ids);
+
+static struct platform_driver mt6359_regulator_driver = {
+	.driver = {
+		.name = "mt6359-regulator",
+	},
+	.probe = mt6359_regulator_probe,
+	.id_table = mt6359_platform_ids,
+};
+
+module_platform_driver(mt6359_regulator_driver);
+
+MODULE_AUTHOR("Wen Su <wen.su@mediatek.com>");
+MODULE_DESCRIPTION("Regulator Driver for MediaTek MT6359 PMIC");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/regulator/mt6359-regulator.h b/include/linux/regulator/mt6359-regulator.h
new file mode 100644
index 000000000000..14c4b715613e
--- /dev/null
+++ b/include/linux/regulator/mt6359-regulator.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 MediaTek Inc.
+ */
+
+#ifndef __LINUX_REGULATOR_MT6359_H
+#define __LINUX_REGULATOR_MT6359_H
+
+enum {
+	MT6359_ID_VS1 = 0,
+	MT6359_ID_VGPU11,
+	MT6359_ID_VMODEM,
+	MT6359_ID_VPU,
+	MT6359_ID_VCORE,
+	MT6359_ID_VS2,
+	MT6359_ID_VPA,
+	MT6359_ID_VPROC2,
+	MT6359_ID_VPROC1,
+	MT6359_ID_VCORE_SSHUB,
+	MT6359_ID_VAUD18 = 10,
+	MT6359_ID_VSIM1,
+	MT6359_ID_VIBR,
+	MT6359_ID_VRF12,
+	MT6359_ID_VUSB,
+	MT6359_ID_VSRAM_PROC2,
+	MT6359_ID_VIO18,
+	MT6359_ID_VCAMIO,
+	MT6359_ID_VCN18,
+	MT6359_ID_VFE28,
+	MT6359_ID_VCN13,
+	MT6359_ID_VCN33_1_BT,
+	MT6359_ID_VCN33_1_WIFI,
+	MT6359_ID_VAUX18,
+	MT6359_ID_VSRAM_OTHERS,
+	MT6359_ID_VEFUSE,
+	MT6359_ID_VXO22,
+	MT6359_ID_VRFCK,
+	MT6359_ID_VBIF28,
+	MT6359_ID_VIO28,
+	MT6359_ID_VEMC,
+	MT6359_ID_VCN33_2_BT,
+	MT6359_ID_VCN33_2_WIFI,
+	MT6359_ID_VA12,
+	MT6359_ID_VA09,
+	MT6359_ID_VRF18,
+	MT6359_ID_VSRAM_MD,
+	MT6359_ID_VUFS,
+	MT6359_ID_VM18,
+	MT6359_ID_VBBCK,
+	MT6359_ID_VSRAM_PROC1,
+	MT6359_ID_VSIM2,
+	MT6359_ID_VSRAM_OTHERS_SSHUB,
+	MT6359_ID_RG_MAX,
+};
+
+#define MT6359_MAX_REGULATOR	MT6359_ID_RG_MAX
+
+#endif /* __LINUX_REGULATOR_MT6359_H */
-- 
cgit v1.2.3


From 4cfc965475124c4eed2b7b5d8b6fc5048a21ecfd Mon Sep 17 00:00:00 2001
From: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Date: Wed, 26 May 2021 14:52:06 +0800
Subject: regulator: mt6359: Add support for MT6359P regulator

The MT6359P is a eco version for MT6359 regulator.
We add support based on MT6359 regulator driver.

Signed-off-by: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/mt6359-regulator.c       | 379 ++++++++++++++++++++++++++++-
 include/linux/mfd/mt6359p/registers.h      | 249 +++++++++++++++++++
 include/linux/regulator/mt6359-regulator.h |   1 +
 3 files changed, 623 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/mfd/mt6359p/registers.h

(limited to 'include/linux')

diff --git a/drivers/regulator/mt6359-regulator.c b/drivers/regulator/mt6359-regulator.c
index 994d3f67f73d..4f517c9fd6c4 100644
--- a/drivers/regulator/mt6359-regulator.c
+++ b/drivers/regulator/mt6359-regulator.c
@@ -4,6 +4,7 @@
 
 #include <linux/platform_device.h>
 #include <linux/mfd/mt6359/registers.h>
+#include <linux/mfd/mt6359p/registers.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
@@ -147,6 +148,29 @@ struct mt6359_regulator_info {
 	.qi = BIT(0),					\
 }
 
+#define MT6359P_LDO1(match, _name, _ops, _volt_table,	\
+	_enable_reg, _enable_mask, _status_reg,		\
+	_vsel_reg, _vsel_mask)				\
+[MT6359_ID_##_name] = {					\
+	.desc = {					\
+		.name = #_name,				\
+		.of_match = of_match_ptr(match),	\
+		.regulators_node = of_match_ptr("regulators"),	\
+		.ops = &_ops,				\
+		.type = REGULATOR_VOLTAGE,		\
+		.id = MT6359_ID_##_name,		\
+		.owner = THIS_MODULE,			\
+		.n_voltages = ARRAY_SIZE(_volt_table),	\
+		.volt_table = _volt_table,		\
+		.vsel_reg = _vsel_reg,			\
+		.vsel_mask = _vsel_mask,		\
+		.enable_reg = _enable_reg,		\
+		.enable_mask = BIT(_enable_mask),	\
+	},						\
+	.status_reg = _status_reg,			\
+	.qi = BIT(0),					\
+}
+
 static const struct linear_range mt_volt_range1[] = {
 	REGULATOR_LINEAR_RANGE(800000, 0, 0x70, 12500),
 };
@@ -175,6 +199,10 @@ static const struct linear_range mt_volt_range7[] = {
 	REGULATOR_LINEAR_RANGE(500000, 0, 0x7f, 6250),
 };
 
+static const struct linear_range mt_volt_range8[] = {
+	REGULATOR_LINEAR_RANGE(506250, 0, 0x7f, 6250),
+};
+
 static const u32 vsim1_voltages[] = {
 	0, 0, 0, 1700000, 1800000, 0, 0, 0, 2700000, 0, 0, 3000000, 3100000,
 };
@@ -212,6 +240,10 @@ static const u32 vrfck_voltages[] = {
 	0, 0, 1500000, 0, 0, 0, 0, 1600000, 0, 0, 0, 0, 1700000,
 };
 
+static const u32 vrfck_voltages_1[] = {
+	1240000, 1600000,
+};
+
 static const u32 vio28_voltages[] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 2800000, 2900000, 3000000, 3100000, 3300000,
 };
@@ -220,6 +252,11 @@ static const u32 vemc_voltages[] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2900000, 3000000, 0, 3300000,
 };
 
+static const u32 vemc_voltages_1[] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 2500000, 2800000, 2900000, 3000000, 3100000,
+	3300000,
+};
+
 static const u32 va12_voltages[] = {
 	0, 0, 0, 0, 0, 0, 1200000, 1300000,
 };
@@ -356,6 +393,78 @@ static int mt6359_regulator_set_mode(struct regulator_dev *rdev,
 	return ret;
 }
 
+static int mt6359p_vemc_set_voltage_sel(struct regulator_dev *rdev,
+					u32 sel)
+{
+	struct mt6359_regulator_info *info = rdev_get_drvdata(rdev);
+	int ret;
+	u32 val = 0;
+
+	sel <<= ffs(info->desc.vsel_mask) - 1;
+	ret = regmap_write(rdev->regmap, MT6359P_TMA_KEY_ADDR, TMA_KEY);
+	if (ret)
+		return ret;
+
+	ret = regmap_read(rdev->regmap, MT6359P_VM_MODE_ADDR, &val);
+	if (ret)
+		return ret;
+
+	switch (val) {
+	case 0:
+		/* If HW trapping is 0, use VEMC_VOSEL_0 */
+		ret = regmap_update_bits(rdev->regmap,
+					 info->desc.vsel_reg,
+					 info->desc.vsel_mask, sel);
+		break;
+	case 1:
+		/* If HW trapping is 1, use VEMC_VOSEL_1 */
+		ret = regmap_update_bits(rdev->regmap,
+					 info->desc.vsel_reg + 0x2,
+					 info->desc.vsel_mask, sel);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (ret)
+		return ret;
+
+	ret = regmap_write(rdev->regmap, MT6359P_TMA_KEY_ADDR, 0);
+	return ret;
+}
+
+static int mt6359p_vemc_get_voltage_sel(struct regulator_dev *rdev)
+{
+	struct mt6359_regulator_info *info = rdev_get_drvdata(rdev);
+	int ret;
+	u32 val = 0;
+
+	ret = regmap_read(rdev->regmap, MT6359P_VM_MODE_ADDR, &val);
+	if (ret)
+		return ret;
+	switch (val) {
+	case 0:
+		/* If HW trapping is 0, use VEMC_VOSEL_0 */
+		ret = regmap_read(rdev->regmap,
+				  info->desc.vsel_reg, &val);
+		break;
+	case 1:
+		/* If HW trapping is 1, use VEMC_VOSEL_1 */
+		ret = regmap_read(rdev->regmap,
+				  info->desc.vsel_reg + 0x2, &val);
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (ret)
+		return ret;
+
+	val &= info->desc.vsel_mask;
+	val >>= ffs(info->desc.vsel_mask) - 1;
+
+	return val;
+}
+
 static const struct regulator_ops mt6359_volt_range_ops = {
 	.list_voltage = regulator_list_voltage_linear_range,
 	.map_voltage = regulator_map_voltage_linear_range,
@@ -389,6 +498,18 @@ static const struct regulator_ops mt6359_volt_fixed_ops = {
 	.get_status = mt6359_get_status,
 };
 
+static const struct regulator_ops mt6359p_vemc_ops = {
+	.list_voltage = regulator_list_voltage_table,
+	.map_voltage = regulator_map_voltage_iterate,
+	.set_voltage_sel = mt6359p_vemc_set_voltage_sel,
+	.get_voltage_sel = mt6359p_vemc_get_voltage_sel,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.get_status = mt6359_get_status,
+};
+
 /* The array is indexed by id(MT6359_ID_XXX) */
 static struct mt6359_regulator_info mt6359_regulators[] = {
 	MT6359_BUCK("buck_vs1", VS1, 800000, 2200000, 12500, 0,
@@ -626,21 +747,267 @@ static struct mt6359_regulator_info mt6359_regulators[] = {
 			  MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_SHIFT),
 };
 
+static struct mt6359_regulator_info mt6359p_regulators[] = {
+	MT6359_BUCK("buck_vs1", VS1, 800000, 2200000, 12500, 0,
+		    mt_volt_range1, MT6359_RG_BUCK_VS1_EN_ADDR,
+		    MT6359_DA_VS1_EN_ADDR, MT6359_RG_BUCK_VS1_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VS1_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VS1_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VS1_LP_ADDR, MT6359_RG_BUCK_VS1_LP_SHIFT,
+		    MT6359_RG_VS1_FPWM_ADDR, MT6359_RG_VS1_FPWM_SHIFT),
+	MT6359_BUCK("buck_vgpu11", VGPU11, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VGPU11_EN_ADDR,
+		    MT6359_DA_VGPU11_EN_ADDR, MT6359P_RG_BUCK_VGPU11_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VGPU11_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VGPU11_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VGPU11_LP_ADDR,
+		    MT6359_RG_BUCK_VGPU11_LP_SHIFT,
+		    MT6359_RG_VGPU11_FCCM_ADDR, MT6359_RG_VGPU11_FCCM_SHIFT),
+	MT6359_BUCK("buck_vmodem", VMODEM, 400000, 1100000, 6250, 0,
+		    mt_volt_range3, MT6359_RG_BUCK_VMODEM_EN_ADDR,
+		    MT6359_DA_VMODEM_EN_ADDR, MT6359_RG_BUCK_VMODEM_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VMODEM_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VMODEM_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VMODEM_LP_ADDR,
+		    MT6359_RG_BUCK_VMODEM_LP_SHIFT,
+		    MT6359_RG_VMODEM_FCCM_ADDR, MT6359_RG_VMODEM_FCCM_SHIFT),
+	MT6359_BUCK("buck_vpu", VPU, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VPU_EN_ADDR,
+		    MT6359_DA_VPU_EN_ADDR, MT6359_RG_BUCK_VPU_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VPU_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VPU_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VPU_LP_ADDR, MT6359_RG_BUCK_VPU_LP_SHIFT,
+		    MT6359_RG_VPU_FCCM_ADDR, MT6359_RG_VPU_FCCM_SHIFT),
+	MT6359_BUCK("buck_vcore", VCORE, 506250, 1300000, 6250, 0,
+		    mt_volt_range8, MT6359_RG_BUCK_VCORE_EN_ADDR,
+		    MT6359_DA_VCORE_EN_ADDR, MT6359P_RG_BUCK_VCORE_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VCORE_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VCORE_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VCORE_LP_ADDR, MT6359_RG_BUCK_VCORE_LP_SHIFT,
+		    MT6359_RG_VCORE_FCCM_ADDR, MT6359_RG_VCORE_FCCM_SHIFT),
+	MT6359_BUCK("buck_vs2", VS2, 800000, 1600000, 12500, 0,
+		    mt_volt_range4, MT6359_RG_BUCK_VS2_EN_ADDR,
+		    MT6359_DA_VS2_EN_ADDR, MT6359_RG_BUCK_VS2_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VS2_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VS2_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VS2_LP_ADDR, MT6359_RG_BUCK_VS2_LP_SHIFT,
+		    MT6359_RG_VS2_FPWM_ADDR, MT6359_RG_VS2_FPWM_SHIFT),
+	MT6359_BUCK("buck_vpa", VPA, 500000, 3650000, 50000, 0,
+		    mt_volt_range5, MT6359_RG_BUCK_VPA_EN_ADDR,
+		    MT6359_DA_VPA_EN_ADDR, MT6359_RG_BUCK_VPA_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VPA_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VPA_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VPA_LP_ADDR, MT6359_RG_BUCK_VPA_LP_SHIFT,
+		    MT6359_RG_VPA_MODESET_ADDR, MT6359_RG_VPA_MODESET_SHIFT),
+	MT6359_BUCK("buck_vproc2", VPROC2, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VPROC2_EN_ADDR,
+		    MT6359_DA_VPROC2_EN_ADDR, MT6359_RG_BUCK_VPROC2_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VPROC2_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VPROC2_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VPROC2_LP_ADDR,
+		    MT6359_RG_BUCK_VPROC2_LP_SHIFT,
+		    MT6359_RG_VPROC2_FCCM_ADDR, MT6359_RG_VPROC2_FCCM_SHIFT),
+	MT6359_BUCK("buck_vproc1", VPROC1, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359_RG_BUCK_VPROC1_EN_ADDR,
+		    MT6359_DA_VPROC1_EN_ADDR, MT6359_RG_BUCK_VPROC1_VOSEL_ADDR,
+		    MT6359_RG_BUCK_VPROC1_VOSEL_MASK <<
+		    MT6359_RG_BUCK_VPROC1_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VPROC1_LP_ADDR,
+		    MT6359_RG_BUCK_VPROC1_LP_SHIFT,
+		    MT6359_RG_VPROC1_FCCM_ADDR, MT6359_RG_VPROC1_FCCM_SHIFT),
+	MT6359_BUCK("buck_vgpu11_sshub", VGPU11_SSHUB, 400000, 1193750, 6250, 0,
+		    mt_volt_range2, MT6359P_RG_BUCK_VGPU11_SSHUB_EN_ADDR,
+		    MT6359_DA_VGPU11_EN_ADDR,
+		    MT6359P_RG_BUCK_VGPU11_SSHUB_VOSEL_ADDR,
+		    MT6359P_RG_BUCK_VGPU11_SSHUB_VOSEL_MASK <<
+		    MT6359P_RG_BUCK_VGPU11_SSHUB_VOSEL_SHIFT,
+		    MT6359_RG_BUCK_VGPU11_LP_ADDR,
+		    MT6359_RG_BUCK_VGPU11_LP_SHIFT,
+		    MT6359_RG_VGPU11_FCCM_ADDR, MT6359_RG_VGPU11_FCCM_SHIFT),
+	MT6359_REG_FIXED("ldo_vaud18", VAUD18, MT6359P_RG_LDO_VAUD18_EN_ADDR,
+			 MT6359P_DA_VAUD18_B_EN_ADDR, 1800000),
+	MT6359_LDO("ldo_vsim1", VSIM1, vsim1_voltages,
+		   MT6359P_RG_LDO_VSIM1_EN_ADDR, MT6359P_RG_LDO_VSIM1_EN_SHIFT,
+		   MT6359P_DA_VSIM1_B_EN_ADDR, MT6359P_RG_VSIM1_VOSEL_ADDR,
+		   MT6359_RG_VSIM1_VOSEL_MASK << MT6359_RG_VSIM1_VOSEL_SHIFT,
+		   480),
+	MT6359_LDO("ldo_vibr", VIBR, vibr_voltages,
+		   MT6359P_RG_LDO_VIBR_EN_ADDR, MT6359P_RG_LDO_VIBR_EN_SHIFT,
+		   MT6359P_DA_VIBR_B_EN_ADDR, MT6359P_RG_VIBR_VOSEL_ADDR,
+		   MT6359_RG_VIBR_VOSEL_MASK << MT6359_RG_VIBR_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vrf12", VRF12, vrf12_voltages,
+		   MT6359P_RG_LDO_VRF12_EN_ADDR, MT6359P_RG_LDO_VRF12_EN_SHIFT,
+		   MT6359P_DA_VRF12_B_EN_ADDR, MT6359P_RG_VRF12_VOSEL_ADDR,
+		   MT6359_RG_VRF12_VOSEL_MASK << MT6359_RG_VRF12_VOSEL_SHIFT,
+		   480),
+	MT6359_REG_FIXED("ldo_vusb", VUSB, MT6359P_RG_LDO_VUSB_EN_0_ADDR,
+			 MT6359P_DA_VUSB_B_EN_ADDR, 3000000),
+	MT6359_LDO_LINEAR("ldo_vsram_proc2", VSRAM_PROC2, 500000, 1293750, 6250,
+			  0, mt_volt_range6, MT6359P_RG_LDO_VSRAM_PROC2_EN_ADDR,
+			  MT6359P_DA_VSRAM_PROC2_B_EN_ADDR,
+			  MT6359P_RG_LDO_VSRAM_PROC2_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_PROC2_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_PROC2_VOSEL_SHIFT),
+	MT6359_LDO("ldo_vio18", VIO18, volt18_voltages,
+		   MT6359P_RG_LDO_VIO18_EN_ADDR, MT6359P_RG_LDO_VIO18_EN_SHIFT,
+		   MT6359P_DA_VIO18_B_EN_ADDR, MT6359P_RG_VIO18_VOSEL_ADDR,
+		   MT6359_RG_VIO18_VOSEL_MASK << MT6359_RG_VIO18_VOSEL_SHIFT,
+		   960),
+	MT6359_LDO("ldo_vcamio", VCAMIO, volt18_voltages,
+		   MT6359P_RG_LDO_VCAMIO_EN_ADDR,
+		   MT6359P_RG_LDO_VCAMIO_EN_SHIFT,
+		   MT6359P_DA_VCAMIO_B_EN_ADDR, MT6359P_RG_VCAMIO_VOSEL_ADDR,
+		   MT6359_RG_VCAMIO_VOSEL_MASK << MT6359_RG_VCAMIO_VOSEL_SHIFT,
+		   1290),
+	MT6359_REG_FIXED("ldo_vcn18", VCN18, MT6359P_RG_LDO_VCN18_EN_ADDR,
+			 MT6359P_DA_VCN18_B_EN_ADDR, 1800000),
+	MT6359_REG_FIXED("ldo_vfe28", VFE28, MT6359P_RG_LDO_VFE28_EN_ADDR,
+			 MT6359P_DA_VFE28_B_EN_ADDR, 2800000),
+	MT6359_LDO("ldo_vcn13", VCN13, vcn13_voltages,
+		   MT6359P_RG_LDO_VCN13_EN_ADDR, MT6359P_RG_LDO_VCN13_EN_SHIFT,
+		   MT6359P_DA_VCN13_B_EN_ADDR, MT6359P_RG_VCN13_VOSEL_ADDR,
+		   MT6359_RG_VCN13_VOSEL_MASK << MT6359_RG_VCN13_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vcn33_1_bt", VCN33_1_BT, vcn33_voltages,
+		   MT6359P_RG_LDO_VCN33_1_EN_0_ADDR,
+		   MT6359_RG_LDO_VCN33_1_EN_0_SHIFT,
+		   MT6359P_DA_VCN33_1_B_EN_ADDR, MT6359P_RG_VCN33_1_VOSEL_ADDR,
+		   MT6359_RG_VCN33_1_VOSEL_MASK <<
+		   MT6359_RG_VCN33_1_VOSEL_SHIFT, 240),
+	MT6359_LDO("ldo_vcn33_1_wifi", VCN33_1_WIFI, vcn33_voltages,
+		   MT6359P_RG_LDO_VCN33_1_EN_1_ADDR,
+		   MT6359P_RG_LDO_VCN33_1_EN_1_SHIFT,
+		   MT6359P_DA_VCN33_1_B_EN_ADDR, MT6359P_RG_VCN33_1_VOSEL_ADDR,
+		   MT6359_RG_VCN33_1_VOSEL_MASK <<
+		   MT6359_RG_VCN33_1_VOSEL_SHIFT, 240),
+	MT6359_REG_FIXED("ldo_vaux18", VAUX18, MT6359P_RG_LDO_VAUX18_EN_ADDR,
+			 MT6359P_DA_VAUX18_B_EN_ADDR, 1800000),
+	MT6359_LDO_LINEAR("ldo_vsram_others", VSRAM_OTHERS, 500000, 1293750,
+			  6250, 0, mt_volt_range6,
+			  MT6359P_RG_LDO_VSRAM_OTHERS_EN_ADDR,
+			  MT6359P_DA_VSRAM_OTHERS_B_EN_ADDR,
+			  MT6359P_RG_LDO_VSRAM_OTHERS_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_OTHERS_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_OTHERS_VOSEL_SHIFT),
+	MT6359_LDO("ldo_vefuse", VEFUSE, vefuse_voltages,
+		   MT6359P_RG_LDO_VEFUSE_EN_ADDR,
+		   MT6359P_RG_LDO_VEFUSE_EN_SHIFT,
+		   MT6359P_DA_VEFUSE_B_EN_ADDR, MT6359P_RG_VEFUSE_VOSEL_ADDR,
+		   MT6359_RG_VEFUSE_VOSEL_MASK << MT6359_RG_VEFUSE_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO("ldo_vxo22", VXO22, vxo22_voltages,
+		   MT6359P_RG_LDO_VXO22_EN_ADDR, MT6359P_RG_LDO_VXO22_EN_SHIFT,
+		   MT6359P_DA_VXO22_B_EN_ADDR, MT6359P_RG_VXO22_VOSEL_ADDR,
+		   MT6359_RG_VXO22_VOSEL_MASK << MT6359_RG_VXO22_VOSEL_SHIFT,
+		   480),
+	MT6359_LDO("ldo_vrfck_1", VRFCK, vrfck_voltages_1,
+		   MT6359P_RG_LDO_VRFCK_EN_ADDR, MT6359P_RG_LDO_VRFCK_EN_SHIFT,
+		   MT6359P_DA_VRFCK_B_EN_ADDR, MT6359P_RG_VRFCK_VOSEL_ADDR,
+		   MT6359_RG_VRFCK_VOSEL_MASK << MT6359_RG_VRFCK_VOSEL_SHIFT,
+		   480),
+	MT6359_REG_FIXED("ldo_vbif28", VBIF28, MT6359P_RG_LDO_VBIF28_EN_ADDR,
+			 MT6359P_DA_VBIF28_B_EN_ADDR, 2800000),
+	MT6359_LDO("ldo_vio28", VIO28, vio28_voltages,
+		   MT6359P_RG_LDO_VIO28_EN_ADDR, MT6359P_RG_LDO_VIO28_EN_SHIFT,
+		   MT6359P_DA_VIO28_B_EN_ADDR, MT6359P_RG_VIO28_VOSEL_ADDR,
+		   MT6359_RG_VIO28_VOSEL_MASK << MT6359_RG_VIO28_VOSEL_SHIFT,
+		   1920),
+	MT6359P_LDO1("ldo_vemc_1", VEMC, mt6359p_vemc_ops, vemc_voltages_1,
+		     MT6359P_RG_LDO_VEMC_EN_ADDR, MT6359P_RG_LDO_VEMC_EN_SHIFT,
+		     MT6359P_DA_VEMC_B_EN_ADDR,
+		     MT6359P_RG_LDO_VEMC_VOSEL_0_ADDR,
+		     MT6359P_RG_LDO_VEMC_VOSEL_0_MASK <<
+		     MT6359P_RG_LDO_VEMC_VOSEL_0_SHIFT),
+	MT6359_LDO("ldo_vcn33_2_bt", VCN33_2_BT, vcn33_voltages,
+		   MT6359P_RG_LDO_VCN33_2_EN_0_ADDR,
+		   MT6359P_RG_LDO_VCN33_2_EN_0_SHIFT,
+		   MT6359P_DA_VCN33_2_B_EN_ADDR, MT6359P_RG_VCN33_2_VOSEL_ADDR,
+		   MT6359_RG_VCN33_2_VOSEL_MASK <<
+		   MT6359_RG_VCN33_2_VOSEL_SHIFT, 240),
+	MT6359_LDO("ldo_vcn33_2_wifi", VCN33_2_WIFI, vcn33_voltages,
+		   MT6359P_RG_LDO_VCN33_2_EN_1_ADDR,
+		   MT6359_RG_LDO_VCN33_2_EN_1_SHIFT,
+		   MT6359P_DA_VCN33_2_B_EN_ADDR, MT6359P_RG_VCN33_2_VOSEL_ADDR,
+		   MT6359_RG_VCN33_2_VOSEL_MASK <<
+		   MT6359_RG_VCN33_2_VOSEL_SHIFT, 240),
+	MT6359_LDO("ldo_va12", VA12, va12_voltages,
+		   MT6359P_RG_LDO_VA12_EN_ADDR, MT6359P_RG_LDO_VA12_EN_SHIFT,
+		   MT6359P_DA_VA12_B_EN_ADDR, MT6359P_RG_VA12_VOSEL_ADDR,
+		   MT6359_RG_VA12_VOSEL_MASK << MT6359_RG_VA12_VOSEL_SHIFT,
+		   960),
+	MT6359_LDO("ldo_va09", VA09, va09_voltages,
+		   MT6359P_RG_LDO_VA09_EN_ADDR, MT6359P_RG_LDO_VA09_EN_SHIFT,
+		   MT6359P_DA_VA09_B_EN_ADDR, MT6359P_RG_VA09_VOSEL_ADDR,
+		   MT6359_RG_VA09_VOSEL_MASK << MT6359_RG_VA09_VOSEL_SHIFT,
+		   960),
+	MT6359_LDO("ldo_vrf18", VRF18, vrf18_voltages,
+		   MT6359P_RG_LDO_VRF18_EN_ADDR, MT6359P_RG_LDO_VRF18_EN_SHIFT,
+		   MT6359P_DA_VRF18_B_EN_ADDR, MT6359P_RG_VRF18_VOSEL_ADDR,
+		   MT6359_RG_VRF18_VOSEL_MASK << MT6359_RG_VRF18_VOSEL_SHIFT,
+		   240),
+	MT6359_LDO_LINEAR("ldo_vsram_md", VSRAM_MD, 500000, 1293750, 6250,
+			  0, mt_volt_range7, MT6359P_RG_LDO_VSRAM_MD_EN_ADDR,
+			  MT6359P_DA_VSRAM_MD_B_EN_ADDR,
+			  MT6359P_RG_LDO_VSRAM_MD_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_MD_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_MD_VOSEL_SHIFT),
+	MT6359_LDO("ldo_vufs", VUFS, volt18_voltages,
+		   MT6359P_RG_LDO_VUFS_EN_ADDR, MT6359P_RG_LDO_VUFS_EN_SHIFT,
+		   MT6359P_DA_VUFS_B_EN_ADDR, MT6359P_RG_VUFS_VOSEL_ADDR,
+		   MT6359_RG_VUFS_VOSEL_MASK << MT6359_RG_VUFS_VOSEL_SHIFT,
+		   1920),
+	MT6359_LDO("ldo_vm18", VM18, volt18_voltages,
+		   MT6359P_RG_LDO_VM18_EN_ADDR, MT6359P_RG_LDO_VM18_EN_SHIFT,
+		   MT6359P_DA_VM18_B_EN_ADDR, MT6359P_RG_VM18_VOSEL_ADDR,
+		   MT6359_RG_VM18_VOSEL_MASK << MT6359_RG_VM18_VOSEL_SHIFT,
+		   1920),
+	MT6359_LDO("ldo_vbbck", VBBCK, vbbck_voltages,
+		   MT6359P_RG_LDO_VBBCK_EN_ADDR, MT6359P_RG_LDO_VBBCK_EN_SHIFT,
+		   MT6359P_DA_VBBCK_B_EN_ADDR, MT6359P_RG_VBBCK_VOSEL_ADDR,
+		   MT6359P_RG_VBBCK_VOSEL_MASK << MT6359P_RG_VBBCK_VOSEL_SHIFT,
+		   480),
+	MT6359_LDO_LINEAR("ldo_vsram_proc1", VSRAM_PROC1, 500000, 1293750, 6250,
+			  0, mt_volt_range6, MT6359P_RG_LDO_VSRAM_PROC1_EN_ADDR,
+			  MT6359P_DA_VSRAM_PROC1_B_EN_ADDR,
+			  MT6359P_RG_LDO_VSRAM_PROC1_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_PROC1_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_PROC1_VOSEL_SHIFT),
+	MT6359_LDO("ldo_vsim2", VSIM2, vsim2_voltages,
+		   MT6359P_RG_LDO_VSIM2_EN_ADDR, MT6359P_RG_LDO_VSIM2_EN_SHIFT,
+		   MT6359P_DA_VSIM2_B_EN_ADDR, MT6359P_RG_VSIM2_VOSEL_ADDR,
+		   MT6359_RG_VSIM2_VOSEL_MASK << MT6359_RG_VSIM2_VOSEL_SHIFT,
+		   480),
+	MT6359_LDO_LINEAR("ldo_vsram_others_sshub", VSRAM_OTHERS_SSHUB,
+			  500000, 1293750, 6250, 0, mt_volt_range6,
+			  MT6359P_RG_LDO_VSRAM_OTHERS_SSHUB_EN_ADDR,
+			  MT6359P_DA_VSRAM_OTHERS_B_EN_ADDR,
+			  MT6359P_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_ADDR,
+			  MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_MASK <<
+			  MT6359_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_SHIFT),
+};
+
 static int mt6359_regulator_probe(struct platform_device *pdev)
 {
 	struct mt6397_chip *mt6397 = dev_get_drvdata(pdev->dev.parent);
 	struct regulator_config config = {};
 	struct regulator_dev *rdev;
-	int i;
+	struct mt6359_regulator_info *mt6359_info;
+	int i, hw_ver;
+
+	regmap_read(mt6397->regmap, MT6359P_HWCID, &hw_ver);
+	if (hw_ver >= MT6359P_CHIP_VER)
+		mt6359_info = mt6359p_regulators;
+	else
+		mt6359_info = mt6359_regulators;
 
 	config.dev = mt6397->dev;
 	config.regmap = mt6397->regmap;
-	for (i = 0; i < MT6359_MAX_REGULATOR; i++) {
-		config.driver_data = &mt6359_regulators[i];
-		rdev = devm_regulator_register(&pdev->dev, &mt6359_regulators[i].desc, &config);
+	for (i = 0; i < MT6359_MAX_REGULATOR; i++, mt6359_info++) {
+		config.driver_data = mt6359_info;
+		rdev = devm_regulator_register(&pdev->dev, &mt6359_info->desc, &config);
 		if (IS_ERR(rdev)) {
-			dev_err(&pdev->dev, "failed to register %s\n",
-				mt6359_regulators[i].desc.name);
+			dev_err(&pdev->dev, "failed to register %s\n", mt6359_info->desc.name);
 			return PTR_ERR(rdev);
 		}
 	}
diff --git a/include/linux/mfd/mt6359p/registers.h b/include/linux/mfd/mt6359p/registers.h
new file mode 100644
index 000000000000..3d97c1885171
--- /dev/null
+++ b/include/linux/mfd/mt6359p/registers.h
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 MediaTek Inc.
+ */
+
+#ifndef __MFD_MT6359P_REGISTERS_H__
+#define __MFD_MT6359P_REGISTERS_H__
+
+#define MT6359P_CHIP_VER 0x5930
+
+/* PMIC Registers */
+#define MT6359P_HWCID                         0x8
+#define MT6359P_TOP_TRAP                      0x50
+#define MT6359P_TOP_TMA_KEY                   0x3a8
+#define MT6359P_BUCK_VCORE_ELR_NUM            0x152a
+#define MT6359P_BUCK_VCORE_ELR0               0x152c
+#define MT6359P_BUCK_VGPU11_SSHUB_CON0        0x15aa
+#define MT6359P_BUCK_VGPU11_ELR0              0x15b4
+#define MT6359P_LDO_VSRAM_PROC1_ELR           0x1b44
+#define MT6359P_LDO_VSRAM_PROC2_ELR           0x1b46
+#define MT6359P_LDO_VSRAM_OTHERS_ELR          0x1b48
+#define MT6359P_LDO_VSRAM_MD_ELR              0x1b4a
+#define MT6359P_LDO_VEMC_ELR_0                0x1b4c
+#define MT6359P_LDO_VFE28_CON0                0x1b88
+#define MT6359P_LDO_VFE28_MON                 0x1b8c
+#define MT6359P_LDO_VXO22_CON0                0x1b9a
+#define MT6359P_LDO_VXO22_MON                 0x1b9e
+#define MT6359P_LDO_VRF18_CON0                0x1bac
+#define MT6359P_LDO_VRF18_MON                 0x1bb0
+#define MT6359P_LDO_VRF12_CON0                0x1bbe
+#define MT6359P_LDO_VRF12_MON                 0x1bc2
+#define MT6359P_LDO_VEFUSE_CON0               0x1bd0
+#define MT6359P_LDO_VEFUSE_MON                0x1bd4
+#define MT6359P_LDO_VCN33_1_CON0              0x1be2
+#define MT6359P_LDO_VCN33_1_MON               0x1be6
+#define MT6359P_LDO_VCN33_1_MULTI_SW          0x1bf4
+#define MT6359P_LDO_VCN33_2_CON0              0x1c08
+#define MT6359P_LDO_VCN33_2_MON               0x1c0c
+#define MT6359P_LDO_VCN33_2_MULTI_SW          0x1c1a
+#define MT6359P_LDO_VCN13_CON0                0x1c1c
+#define MT6359P_LDO_VCN13_MON                 0x1c20
+#define MT6359P_LDO_VCN18_CON0                0x1c2e
+#define MT6359P_LDO_VCN18_MON                 0x1c32
+#define MT6359P_LDO_VA09_CON0                 0x1c40
+#define MT6359P_LDO_VA09_MON                  0x1c44
+#define MT6359P_LDO_VCAMIO_CON0               0x1c52
+#define MT6359P_LDO_VCAMIO_MON                0x1c56
+#define MT6359P_LDO_VA12_CON0                 0x1c64
+#define MT6359P_LDO_VA12_MON                  0x1c68
+#define MT6359P_LDO_VAUX18_CON0               0x1c88
+#define MT6359P_LDO_VAUX18_MON                0x1c8c
+#define MT6359P_LDO_VAUD18_CON0               0x1c9a
+#define MT6359P_LDO_VAUD18_MON                0x1c9e
+#define MT6359P_LDO_VIO18_CON0                0x1cac
+#define MT6359P_LDO_VIO18_MON                 0x1cb0
+#define MT6359P_LDO_VEMC_CON0                 0x1cbe
+#define MT6359P_LDO_VEMC_MON                  0x1cc2
+#define MT6359P_LDO_VSIM1_CON0                0x1cd0
+#define MT6359P_LDO_VSIM1_MON                 0x1cd4
+#define MT6359P_LDO_VSIM2_CON0                0x1ce2
+#define MT6359P_LDO_VSIM2_MON                 0x1ce6
+#define MT6359P_LDO_VUSB_CON0                 0x1d08
+#define MT6359P_LDO_VUSB_MON                  0x1d0c
+#define MT6359P_LDO_VUSB_MULTI_SW             0x1d1a
+#define MT6359P_LDO_VRFCK_CON0                0x1d1c
+#define MT6359P_LDO_VRFCK_MON                 0x1d20
+#define MT6359P_LDO_VBBCK_CON0                0x1d2e
+#define MT6359P_LDO_VBBCK_MON                 0x1d32
+#define MT6359P_LDO_VBIF28_CON0               0x1d40
+#define MT6359P_LDO_VBIF28_MON                0x1d44
+#define MT6359P_LDO_VIBR_CON0                 0x1d52
+#define MT6359P_LDO_VIBR_MON                  0x1d56
+#define MT6359P_LDO_VIO28_CON0                0x1d64
+#define MT6359P_LDO_VIO28_MON                 0x1d68
+#define MT6359P_LDO_VM18_CON0                 0x1d88
+#define MT6359P_LDO_VM18_MON                  0x1d8c
+#define MT6359P_LDO_VUFS_CON0                 0x1d9a
+#define MT6359P_LDO_VUFS_MON                  0x1d9e
+#define MT6359P_LDO_VSRAM_PROC1_CON0          0x1e88
+#define MT6359P_LDO_VSRAM_PROC1_MON           0x1e8c
+#define MT6359P_LDO_VSRAM_PROC1_VOSEL1        0x1e90
+#define MT6359P_LDO_VSRAM_PROC2_CON0          0x1ea8
+#define MT6359P_LDO_VSRAM_PROC2_MON           0x1eac
+#define MT6359P_LDO_VSRAM_PROC2_VOSEL1        0x1eb0
+#define MT6359P_LDO_VSRAM_OTHERS_CON0         0x1f08
+#define MT6359P_LDO_VSRAM_OTHERS_MON          0x1f0c
+#define MT6359P_LDO_VSRAM_OTHERS_VOSEL1       0x1f10
+#define MT6359P_LDO_VSRAM_OTHERS_SSHUB        0x1f28
+#define MT6359P_LDO_VSRAM_MD_CON0             0x1f2e
+#define MT6359P_LDO_VSRAM_MD_MON              0x1f32
+#define MT6359P_LDO_VSRAM_MD_VOSEL1           0x1f36
+#define MT6359P_VFE28_ANA_CON0                0x1f88
+#define MT6359P_VAUX18_ANA_CON0               0x1f8c
+#define MT6359P_VUSB_ANA_CON0                 0x1f90
+#define MT6359P_VBIF28_ANA_CON0               0x1f94
+#define MT6359P_VCN33_1_ANA_CON0              0x1f98
+#define MT6359P_VCN33_2_ANA_CON0              0x1f9c
+#define MT6359P_VEMC_ANA_CON0                 0x1fa0
+#define MT6359P_VSIM1_ANA_CON0                0x1fa2
+#define MT6359P_VSIM2_ANA_CON0                0x1fa6
+#define MT6359P_VIO28_ANA_CON0                0x1faa
+#define MT6359P_VIBR_ANA_CON0                 0x1fae
+#define MT6359P_VFE28_ELR_4                   0x1fc0
+#define MT6359P_VRF18_ANA_CON0                0x2008
+#define MT6359P_VEFUSE_ANA_CON0               0x200c
+#define MT6359P_VCN18_ANA_CON0                0x2010
+#define MT6359P_VCAMIO_ANA_CON0               0x2014
+#define MT6359P_VAUD18_ANA_CON0               0x2018
+#define MT6359P_VIO18_ANA_CON0                0x201c
+#define MT6359P_VM18_ANA_CON0                 0x2020
+#define MT6359P_VUFS_ANA_CON0                 0x2024
+#define MT6359P_VRF12_ANA_CON0                0x202a
+#define MT6359P_VCN13_ANA_CON0                0x202e
+#define MT6359P_VA09_ANA_CON0                 0x2032
+#define MT6359P_VRF18_ELR_3                   0x204e
+#define MT6359P_VXO22_ANA_CON0                0x2088
+#define MT6359P_VRFCK_ANA_CON0                0x208c
+#define MT6359P_VBBCK_ANA_CON0                0x2096
+
+#define MT6359P_RG_BUCK_VCORE_VOSEL_ADDR         MT6359P_BUCK_VCORE_ELR0
+#define MT6359P_RG_BUCK_VGPU11_SSHUB_EN_ADDR     MT6359P_BUCK_VGPU11_SSHUB_CON0
+#define MT6359P_RG_BUCK_VGPU11_VOSEL_ADDR        MT6359P_BUCK_VGPU11_ELR0
+#define MT6359P_RG_BUCK_VGPU11_SSHUB_VOSEL_ADDR  MT6359P_BUCK_VGPU11_SSHUB_CON0
+#define MT6359P_RG_BUCK_VGPU11_SSHUB_VOSEL_MASK  0x7F
+#define MT6359P_RG_BUCK_VGPU11_SSHUB_VOSEL_SHIFT 4
+#define MT6359P_RG_LDO_VSRAM_PROC1_VOSEL_ADDR    MT6359P_LDO_VSRAM_PROC1_ELR
+#define MT6359P_RG_LDO_VSRAM_PROC2_VOSEL_ADDR    MT6359P_LDO_VSRAM_PROC2_ELR
+#define MT6359P_RG_LDO_VSRAM_OTHERS_VOSEL_ADDR   MT6359P_LDO_VSRAM_OTHERS_ELR
+#define MT6359P_RG_LDO_VSRAM_MD_VOSEL_ADDR       MT6359P_LDO_VSRAM_MD_ELR
+#define MT6359P_RG_LDO_VEMC_VOSEL_0_ADDR         MT6359P_LDO_VEMC_ELR_0
+#define MT6359P_RG_LDO_VEMC_VOSEL_0_MASK         0xF
+#define MT6359P_RG_LDO_VEMC_VOSEL_0_SHIFT        0
+#define MT6359P_RG_LDO_VFE28_EN_ADDR             MT6359P_LDO_VFE28_CON0
+#define MT6359P_DA_VFE28_B_EN_ADDR               MT6359P_LDO_VFE28_MON
+#define MT6359P_RG_LDO_VXO22_EN_ADDR             MT6359P_LDO_VXO22_CON0
+#define MT6359P_RG_LDO_VXO22_EN_SHIFT            0
+#define MT6359P_DA_VXO22_B_EN_ADDR               MT6359P_LDO_VXO22_MON
+#define MT6359P_RG_LDO_VRF18_EN_ADDR             MT6359P_LDO_VRF18_CON0
+#define MT6359P_RG_LDO_VRF18_EN_SHIFT            0
+#define MT6359P_DA_VRF18_B_EN_ADDR               MT6359P_LDO_VRF18_MON
+#define MT6359P_RG_LDO_VRF12_EN_ADDR             MT6359P_LDO_VRF12_CON0
+#define MT6359P_RG_LDO_VRF12_EN_SHIFT            0
+#define MT6359P_DA_VRF12_B_EN_ADDR               MT6359P_LDO_VRF12_MON
+#define MT6359P_RG_LDO_VEFUSE_EN_ADDR            MT6359P_LDO_VEFUSE_CON0
+#define MT6359P_RG_LDO_VEFUSE_EN_SHIFT           0
+#define MT6359P_DA_VEFUSE_B_EN_ADDR              MT6359P_LDO_VEFUSE_MON
+#define MT6359P_RG_LDO_VCN33_1_EN_0_ADDR         MT6359P_LDO_VCN33_1_CON0
+#define MT6359P_DA_VCN33_1_B_EN_ADDR             MT6359P_LDO_VCN33_1_MON
+#define MT6359P_RG_LDO_VCN33_1_EN_1_ADDR         MT6359P_LDO_VCN33_1_MULTI_SW
+#define MT6359P_RG_LDO_VCN33_1_EN_1_SHIFT        15
+#define MT6359P_RG_LDO_VCN33_2_EN_0_ADDR         MT6359P_LDO_VCN33_2_CON0
+#define MT6359P_RG_LDO_VCN33_2_EN_0_SHIFT        0
+#define MT6359P_DA_VCN33_2_B_EN_ADDR             MT6359P_LDO_VCN33_2_MON
+#define MT6359P_RG_LDO_VCN33_2_EN_1_ADDR         MT6359P_LDO_VCN33_2_MULTI_SW
+#define MT6359P_RG_LDO_VCN13_EN_ADDR             MT6359P_LDO_VCN13_CON0
+#define MT6359P_RG_LDO_VCN13_EN_SHIFT            0
+#define MT6359P_DA_VCN13_B_EN_ADDR               MT6359P_LDO_VCN13_MON
+#define MT6359P_RG_LDO_VCN18_EN_ADDR             MT6359P_LDO_VCN18_CON0
+#define MT6359P_DA_VCN18_B_EN_ADDR               MT6359P_LDO_VCN18_MON
+#define MT6359P_RG_LDO_VA09_EN_ADDR              MT6359P_LDO_VA09_CON0
+#define MT6359P_RG_LDO_VA09_EN_SHIFT             0
+#define MT6359P_DA_VA09_B_EN_ADDR                MT6359P_LDO_VA09_MON
+#define MT6359P_RG_LDO_VCAMIO_EN_ADDR            MT6359P_LDO_VCAMIO_CON0
+#define MT6359P_RG_LDO_VCAMIO_EN_SHIFT           0
+#define MT6359P_DA_VCAMIO_B_EN_ADDR              MT6359P_LDO_VCAMIO_MON
+#define MT6359P_RG_LDO_VA12_EN_ADDR              MT6359P_LDO_VA12_CON0
+#define MT6359P_RG_LDO_VA12_EN_SHIFT             0
+#define MT6359P_DA_VA12_B_EN_ADDR                MT6359P_LDO_VA12_MON
+#define MT6359P_RG_LDO_VAUX18_EN_ADDR            MT6359P_LDO_VAUX18_CON0
+#define MT6359P_DA_VAUX18_B_EN_ADDR              MT6359P_LDO_VAUX18_MON
+#define MT6359P_RG_LDO_VAUD18_EN_ADDR            MT6359P_LDO_VAUD18_CON0
+#define MT6359P_DA_VAUD18_B_EN_ADDR              MT6359P_LDO_VAUD18_MON
+#define MT6359P_RG_LDO_VIO18_EN_ADDR             MT6359P_LDO_VIO18_CON0
+#define MT6359P_RG_LDO_VIO18_EN_SHIFT            0
+#define MT6359P_DA_VIO18_B_EN_ADDR               MT6359P_LDO_VIO18_MON
+#define MT6359P_RG_LDO_VEMC_EN_ADDR              MT6359P_LDO_VEMC_CON0
+#define MT6359P_RG_LDO_VEMC_EN_SHIFT             0
+#define MT6359P_DA_VEMC_B_EN_ADDR                MT6359P_LDO_VEMC_MON
+#define MT6359P_RG_LDO_VSIM1_EN_ADDR             MT6359P_LDO_VSIM1_CON0
+#define MT6359P_RG_LDO_VSIM1_EN_SHIFT            0
+#define MT6359P_DA_VSIM1_B_EN_ADDR               MT6359P_LDO_VSIM1_MON
+#define MT6359P_RG_LDO_VSIM2_EN_ADDR             MT6359P_LDO_VSIM2_CON0
+#define MT6359P_RG_LDO_VSIM2_EN_SHIFT            0
+#define MT6359P_DA_VSIM2_B_EN_ADDR               MT6359P_LDO_VSIM2_MON
+#define MT6359P_RG_LDO_VUSB_EN_0_ADDR            MT6359P_LDO_VUSB_CON0
+#define MT6359P_DA_VUSB_B_EN_ADDR                MT6359P_LDO_VUSB_MON
+#define MT6359P_RG_LDO_VUSB_EN_1_ADDR            MT6359P_LDO_VUSB_MULTI_SW
+#define MT6359P_RG_LDO_VRFCK_EN_ADDR             MT6359P_LDO_VRFCK_CON0
+#define MT6359P_RG_LDO_VRFCK_EN_SHIFT            0
+#define MT6359P_DA_VRFCK_B_EN_ADDR               MT6359P_LDO_VRFCK_MON
+#define MT6359P_RG_LDO_VBBCK_EN_ADDR             MT6359P_LDO_VBBCK_CON0
+#define MT6359P_RG_LDO_VBBCK_EN_SHIFT            0
+#define MT6359P_DA_VBBCK_B_EN_ADDR               MT6359P_LDO_VBBCK_MON
+#define MT6359P_RG_LDO_VBIF28_EN_ADDR            MT6359P_LDO_VBIF28_CON0
+#define MT6359P_DA_VBIF28_B_EN_ADDR              MT6359P_LDO_VBIF28_MON
+#define MT6359P_RG_LDO_VIBR_EN_ADDR              MT6359P_LDO_VIBR_CON0
+#define MT6359P_RG_LDO_VIBR_EN_SHIFT             0
+#define MT6359P_DA_VIBR_B_EN_ADDR                MT6359P_LDO_VIBR_MON
+#define MT6359P_RG_LDO_VIO28_EN_ADDR             MT6359P_LDO_VIO28_CON0
+#define MT6359P_RG_LDO_VIO28_EN_SHIFT            0
+#define MT6359P_DA_VIO28_B_EN_ADDR               MT6359P_LDO_VIO28_MON
+#define MT6359P_RG_LDO_VM18_EN_ADDR              MT6359P_LDO_VM18_CON0
+#define MT6359P_RG_LDO_VM18_EN_SHIFT             0
+#define MT6359P_DA_VM18_B_EN_ADDR                MT6359P_LDO_VM18_MON
+#define MT6359P_RG_LDO_VUFS_EN_ADDR              MT6359P_LDO_VUFS_CON0
+#define MT6359P_RG_LDO_VUFS_EN_SHIFT             0
+#define MT6359P_DA_VUFS_B_EN_ADDR                MT6359P_LDO_VUFS_MON
+#define MT6359P_RG_LDO_VSRAM_PROC1_EN_ADDR       MT6359P_LDO_VSRAM_PROC1_CON0
+#define MT6359P_DA_VSRAM_PROC1_B_EN_ADDR         MT6359P_LDO_VSRAM_PROC1_MON
+#define MT6359P_DA_VSRAM_PROC1_VOSEL_ADDR        MT6359P_LDO_VSRAM_PROC1_VOSEL1
+#define MT6359P_RG_LDO_VSRAM_PROC2_EN_ADDR       MT6359P_LDO_VSRAM_PROC2_CON0
+#define MT6359P_DA_VSRAM_PROC2_B_EN_ADDR         MT6359P_LDO_VSRAM_PROC2_MON
+#define MT6359P_DA_VSRAM_PROC2_VOSEL_ADDR        MT6359P_LDO_VSRAM_PROC2_VOSEL1
+#define MT6359P_RG_LDO_VSRAM_OTHERS_EN_ADDR      MT6359P_LDO_VSRAM_OTHERS_CON0
+#define MT6359P_DA_VSRAM_OTHERS_B_EN_ADDR        MT6359P_LDO_VSRAM_OTHERS_MON
+#define MT6359P_DA_VSRAM_OTHERS_VOSEL_ADDR       MT6359P_LDO_VSRAM_OTHERS_VOSEL1
+#define MT6359P_RG_LDO_VSRAM_OTHERS_SSHUB_EN_ADDR    MT6359P_LDO_VSRAM_OTHERS_SSHUB
+#define MT6359P_RG_LDO_VSRAM_OTHERS_SSHUB_VOSEL_ADDR MT6359P_LDO_VSRAM_OTHERS_SSHUB
+#define MT6359P_RG_LDO_VSRAM_MD_EN_ADDR          MT6359P_LDO_VSRAM_MD_CON0
+#define MT6359P_DA_VSRAM_MD_B_EN_ADDR            MT6359P_LDO_VSRAM_MD_MON
+#define MT6359P_DA_VSRAM_MD_VOSEL_ADDR           MT6359P_LDO_VSRAM_MD_VOSEL1
+#define MT6359P_RG_VCN33_1_VOSEL_ADDR            MT6359P_VCN33_1_ANA_CON0
+#define MT6359P_RG_VCN33_2_VOSEL_ADDR            MT6359P_VCN33_2_ANA_CON0
+#define MT6359P_RG_VEMC_VOSEL_ADDR               MT6359P_VEMC_ANA_CON0
+#define MT6359P_RG_VSIM1_VOSEL_ADDR              MT6359P_VSIM1_ANA_CON0
+#define MT6359P_RG_VSIM2_VOSEL_ADDR              MT6359P_VSIM2_ANA_CON0
+#define MT6359P_RG_VIO28_VOSEL_ADDR              MT6359P_VIO28_ANA_CON0
+#define MT6359P_RG_VIBR_VOSEL_ADDR               MT6359P_VIBR_ANA_CON0
+#define MT6359P_RG_VRF18_VOSEL_ADDR              MT6359P_VRF18_ANA_CON0
+#define MT6359P_RG_VEFUSE_VOSEL_ADDR             MT6359P_VEFUSE_ANA_CON0
+#define MT6359P_RG_VCAMIO_VOSEL_ADDR             MT6359P_VCAMIO_ANA_CON0
+#define MT6359P_RG_VIO18_VOSEL_ADDR              MT6359P_VIO18_ANA_CON0
+#define MT6359P_RG_VM18_VOSEL_ADDR               MT6359P_VM18_ANA_CON0
+#define MT6359P_RG_VUFS_VOSEL_ADDR               MT6359P_VUFS_ANA_CON0
+#define MT6359P_RG_VRF12_VOSEL_ADDR              MT6359P_VRF12_ANA_CON0
+#define MT6359P_RG_VCN13_VOSEL_ADDR              MT6359P_VCN13_ANA_CON0
+#define MT6359P_RG_VA09_VOSEL_ADDR               MT6359P_VRF18_ELR_3
+#define MT6359P_RG_VA12_VOSEL_ADDR               MT6359P_VFE28_ELR_4
+#define MT6359P_RG_VXO22_VOSEL_ADDR              MT6359P_VXO22_ANA_CON0
+#define MT6359P_RG_VRFCK_VOSEL_ADDR              MT6359P_VRFCK_ANA_CON0
+#define MT6359P_RG_VBBCK_VOSEL_ADDR              MT6359P_VBBCK_ANA_CON0
+#define MT6359P_RG_VBBCK_VOSEL_MASK              0xF
+#define MT6359P_RG_VBBCK_VOSEL_SHIFT             4
+#define MT6359P_VM_MODE_ADDR                     MT6359P_TOP_TRAP
+#define MT6359P_TMA_KEY_ADDR                     MT6359P_TOP_TMA_KEY
+
+#define TMA_KEY 0x9CA6
+
+#endif /* __MFD_MT6359P_REGISTERS_H__ */
diff --git a/include/linux/regulator/mt6359-regulator.h b/include/linux/regulator/mt6359-regulator.h
index 14c4b715613e..6d6e5a58f482 100644
--- a/include/linux/regulator/mt6359-regulator.h
+++ b/include/linux/regulator/mt6359-regulator.h
@@ -17,6 +17,7 @@ enum {
 	MT6359_ID_VPROC2,
 	MT6359_ID_VPROC1,
 	MT6359_ID_VCORE_SSHUB,
+	MT6359_ID_VGPU11_SSHUB = MT6359_ID_VCORE_SSHUB,
 	MT6359_ID_VAUD18 = 10,
 	MT6359_ID_VSIM1,
 	MT6359_ID_VIBR,
-- 
cgit v1.2.3


From 8c7a703ec9787a1b45b024e9acd253328422dcbd Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Fri, 28 May 2021 09:38:09 +0200
Subject: evm: Verify portable signatures against all protected xattrs

Currently, the evm_config_default_xattrnames array contains xattr names
only related to LSMs which are enabled in the kernel configuration.
However, EVM portable signatures do not depend on local information and a
vendor might include in the signature calculation xattrs that are not
enabled in the target platform.

Just including all xattrs names in evm_config_default_xattrnames is not a
safe approach, because a target system might have already calculated
signatures or HMACs based only on the enabled xattrs. After applying this
patch, EVM would verify those signatures and HMACs with all xattrs instead.
The non-enabled ones, which could possibly exist, would cause a
verification error.

Thus, this patch adds a new field named enabled to the xattr_list
structure, which is set to true if the LSM associated to a given xattr name
is enabled in the kernel configuration. The non-enabled xattrs are taken
into account only in evm_calc_hmac_or_hash(), if the passed security.evm
type is EVM_XATTR_PORTABLE_DIGSIG.

The new function evm_protected_xattr_if_enabled() has been defined so that
IMA can include all protected xattrs and not only the enabled ones in the
measurement list, if the new template fields xattrnames, xattrlengths or
xattrvalues have been included in the template format.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/evm.h                 |  6 ++++
 security/integrity/evm/evm.h        |  1 +
 security/integrity/evm/evm_crypto.c |  7 +++++
 security/integrity/evm/evm_main.c   | 56 ++++++++++++++++++++++++++++++-------
 security/integrity/evm/evm_secfs.c  | 16 +++++++++--
 5 files changed, 74 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/evm.h b/include/linux/evm.h
index 31ef1dbbb3ac..5011a299c251 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -38,6 +38,7 @@ extern int evm_inode_init_security(struct inode *inode,
 				   const struct xattr *xattr_array,
 				   struct xattr *evm);
 extern bool evm_revalidate_status(const char *xattr_name);
+extern int evm_protected_xattr_if_enabled(const char *req_xattr_name);
 #ifdef CONFIG_FS_POSIX_ACL
 extern int posix_xattr_acl(const char *xattrname);
 #else
@@ -114,5 +115,10 @@ static inline bool evm_revalidate_status(const char *xattr_name)
 	return false;
 }
 
+static inline int evm_protected_xattr_if_enabled(const char *req_xattr_name)
+{
+	return false;
+}
+
 #endif /* CONFIG_EVM */
 #endif /* LINUX_EVM_H */
diff --git a/security/integrity/evm/evm.h b/security/integrity/evm/evm.h
index f2fef2b5ed51..0d44f41d16f8 100644
--- a/security/integrity/evm/evm.h
+++ b/security/integrity/evm/evm.h
@@ -29,6 +29,7 @@
 struct xattr_list {
 	struct list_head list;
 	char *name;
+	bool enabled;
 };
 
 extern int evm_initialized;
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index d76b006cbcc4..1628e2ca9862 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -216,6 +216,13 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
 		if (strcmp(xattr->name, XATTR_NAME_IMA) == 0)
 			is_ima = true;
 
+		/*
+		 * Skip non-enabled xattrs for locally calculated
+		 * signatures/HMACs.
+		 */
+		if (type != EVM_XATTR_PORTABLE_DIGSIG && !xattr->enabled)
+			continue;
+
 		if ((req_xattr_name && req_xattr_value)
 		    && !strcmp(xattr->name, req_xattr_name)) {
 			error = 0;
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 0196168aeb7d..ee4e17a790fb 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -34,24 +34,44 @@ static const char * const integrity_status_msg[] = {
 int evm_hmac_attrs;
 
 static struct xattr_list evm_config_default_xattrnames[] = {
+	{.name = XATTR_NAME_SELINUX,
 #ifdef CONFIG_SECURITY_SELINUX
-	{.name = XATTR_NAME_SELINUX},
+	 .enabled = true
 #endif
+	},
+	{.name = XATTR_NAME_SMACK,
 #ifdef CONFIG_SECURITY_SMACK
-	{.name = XATTR_NAME_SMACK},
+	 .enabled = true
+#endif
+	},
+	{.name = XATTR_NAME_SMACKEXEC,
+#ifdef CONFIG_EVM_EXTRA_SMACK_XATTRS
+	 .enabled = true
+#endif
+	},
+	{.name = XATTR_NAME_SMACKTRANSMUTE,
 #ifdef CONFIG_EVM_EXTRA_SMACK_XATTRS
-	{.name = XATTR_NAME_SMACKEXEC},
-	{.name = XATTR_NAME_SMACKTRANSMUTE},
-	{.name = XATTR_NAME_SMACKMMAP},
+	 .enabled = true
 #endif
+	},
+	{.name = XATTR_NAME_SMACKMMAP,
+#ifdef CONFIG_EVM_EXTRA_SMACK_XATTRS
+	 .enabled = true
 #endif
+	},
+	{.name = XATTR_NAME_APPARMOR,
 #ifdef CONFIG_SECURITY_APPARMOR
-	{.name = XATTR_NAME_APPARMOR},
+	 .enabled = true
 #endif
+	},
+	{.name = XATTR_NAME_IMA,
 #ifdef CONFIG_IMA_APPRAISE
-	{.name = XATTR_NAME_IMA},
+	 .enabled = true
 #endif
-	{.name = XATTR_NAME_CAPS},
+	},
+	{.name = XATTR_NAME_CAPS,
+	 .enabled = true
+	},
 };
 
 LIST_HEAD(evm_config_xattrnames);
@@ -76,7 +96,9 @@ static void __init evm_init_config(void)
 
 	pr_info("Initialising EVM extended attributes:\n");
 	for (i = 0; i < xattrs; i++) {
-		pr_info("%s\n", evm_config_default_xattrnames[i].name);
+		pr_info("%s%s\n", evm_config_default_xattrnames[i].name,
+			!evm_config_default_xattrnames[i].enabled ?
+			" (disabled)" : "");
 		list_add_tail(&evm_config_default_xattrnames[i].list,
 			      &evm_config_xattrnames);
 	}
@@ -257,7 +279,8 @@ out:
 	return evm_status;
 }
 
-static int evm_protected_xattr(const char *req_xattr_name)
+static int evm_protected_xattr_common(const char *req_xattr_name,
+				      bool all_xattrs)
 {
 	int namelen;
 	int found = 0;
@@ -265,6 +288,9 @@ static int evm_protected_xattr(const char *req_xattr_name)
 
 	namelen = strlen(req_xattr_name);
 	list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) {
+		if (!all_xattrs && !xattr->enabled)
+			continue;
+
 		if ((strlen(xattr->name) == namelen)
 		    && (strncmp(req_xattr_name, xattr->name, namelen) == 0)) {
 			found = 1;
@@ -281,6 +307,16 @@ static int evm_protected_xattr(const char *req_xattr_name)
 	return found;
 }
 
+static int evm_protected_xattr(const char *req_xattr_name)
+{
+	return evm_protected_xattr_common(req_xattr_name, false);
+}
+
+int evm_protected_xattr_if_enabled(const char *req_xattr_name)
+{
+	return evm_protected_xattr_common(req_xattr_name, true);
+}
+
 /**
  * evm_verifyxattr - verify the integrity of the requested xattr
  * @dentry: object of the verify xattr
diff --git a/security/integrity/evm/evm_secfs.c b/security/integrity/evm/evm_secfs.c
index 5f0da41bccd0..a99676eb7f41 100644
--- a/security/integrity/evm/evm_secfs.c
+++ b/security/integrity/evm/evm_secfs.c
@@ -139,8 +139,12 @@ static ssize_t evm_read_xattrs(struct file *filp, char __user *buf,
 	if (rc)
 		return -ERESTARTSYS;
 
-	list_for_each_entry(xattr, &evm_config_xattrnames, list)
+	list_for_each_entry(xattr, &evm_config_xattrnames, list) {
+		if (!xattr->enabled)
+			continue;
+
 		size += strlen(xattr->name) + 1;
+	}
 
 	temp = kmalloc(size + 1, GFP_KERNEL);
 	if (!temp) {
@@ -149,6 +153,9 @@ static ssize_t evm_read_xattrs(struct file *filp, char __user *buf,
 	}
 
 	list_for_each_entry(xattr, &evm_config_xattrnames, list) {
+		if (!xattr->enabled)
+			continue;
+
 		sprintf(temp + offset, "%s\n", xattr->name);
 		offset += strlen(xattr->name) + 1;
 	}
@@ -199,6 +206,7 @@ static ssize_t evm_write_xattrs(struct file *file, const char __user *buf,
 		goto out;
 	}
 
+	xattr->enabled = true;
 	xattr->name = memdup_user_nul(buf, count);
 	if (IS_ERR(xattr->name)) {
 		err = PTR_ERR(xattr->name);
@@ -245,6 +253,10 @@ static ssize_t evm_write_xattrs(struct file *file, const char __user *buf,
 	list_for_each_entry(tmp, &evm_config_xattrnames, list) {
 		if (strcmp(xattr->name, tmp->name) == 0) {
 			err = -EEXIST;
+			if (!tmp->enabled) {
+				tmp->enabled = true;
+				err = count;
+			}
 			mutex_unlock(&xattr_list_mutex);
 			goto out;
 		}
@@ -256,7 +268,7 @@ static ssize_t evm_write_xattrs(struct file *file, const char __user *buf,
 	audit_log_end(ab);
 	return count;
 out:
-	audit_log_format(ab, " res=%d", err);
+	audit_log_format(ab, " res=%d", (err < 0) ? err : 0);
 	audit_log_end(ab);
 	if (xattr) {
 		kfree(xattr->name);
-- 
cgit v1.2.3


From 5ac712dcdfefb1a783384db85e0507d161e87812 Mon Sep 17 00:00:00 2001
From: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Date: Tue, 1 Jun 2021 21:52:35 +0800
Subject: net: stmmac: enable platform specific safety features

On Intel platforms, not all safety features are enabled on the hardware.
The current implementation enable all safety features by default. This
will cause mass error and warning printouts after the module is loaded.

Introduce platform specific safety features flag to enable or disable
each safety features.

Signed-off-by: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c  | 26 +++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c       | 30 ++++++++++++++--------
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h       |  3 ++-
 .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c    |  4 ++-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  3 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  3 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c   | 16 ++++++++++++
 include/linux/stmmac.h                             | 13 ++++++++++
 8 files changed, 84 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index e36a8cc59ad0..2ecf93c84b9d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -568,6 +568,16 @@ static int ehl_common_data(struct pci_dev *pdev,
 	plat->tx_queues_to_use = 8;
 	plat->clk_ptp_rate = 200000000;
 
+	plat->safety_feat_cfg->tsoee = 1;
+	plat->safety_feat_cfg->mrxpee = 1;
+	plat->safety_feat_cfg->mestee = 1;
+	plat->safety_feat_cfg->mrxee = 1;
+	plat->safety_feat_cfg->mtxee = 1;
+	plat->safety_feat_cfg->epsi = 0;
+	plat->safety_feat_cfg->edpp = 0;
+	plat->safety_feat_cfg->prtyen = 0;
+	plat->safety_feat_cfg->tmouten = 0;
+
 	return intel_mgbe_common_data(pdev, plat);
 }
 
@@ -683,6 +693,16 @@ static int tgl_common_data(struct pci_dev *pdev,
 	plat->tx_queues_to_use = 4;
 	plat->clk_ptp_rate = 200000000;
 
+	plat->safety_feat_cfg->tsoee = 1;
+	plat->safety_feat_cfg->mrxpee = 0;
+	plat->safety_feat_cfg->mestee = 1;
+	plat->safety_feat_cfg->mrxee = 1;
+	plat->safety_feat_cfg->mtxee = 1;
+	plat->safety_feat_cfg->epsi = 0;
+	plat->safety_feat_cfg->edpp = 0;
+	plat->safety_feat_cfg->prtyen = 0;
+	plat->safety_feat_cfg->tmouten = 0;
+
 	return intel_mgbe_common_data(pdev, plat);
 }
 
@@ -959,6 +979,12 @@ static int intel_eth_pci_probe(struct pci_dev *pdev,
 	if (!plat->dma_cfg)
 		return -ENOMEM;
 
+	plat->safety_feat_cfg = devm_kzalloc(&pdev->dev,
+					     sizeof(*plat->safety_feat_cfg),
+					     GFP_KERNEL);
+	if (!plat->safety_feat_cfg)
+		return -ENOMEM;
+
 	/* Enable pci device */
 	ret = pcim_enable_device(pdev);
 	if (ret) {
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
index d8c6ff725237..9c2d40f853ed 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
@@ -183,7 +183,8 @@ static void dwmac5_handle_dma_err(struct net_device *ndev,
 			STAT_OFF(dma_errors), stats);
 }
 
-int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp)
+int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp,
+			      struct stmmac_safety_feature_cfg *safety_feat_cfg)
 {
 	u32 value;
 
@@ -193,11 +194,16 @@ int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp)
 	/* 1. Enable Safety Features */
 	value = readl(ioaddr + MTL_ECC_CONTROL);
 	value |= MEEAO; /* MTL ECC Error Addr Status Override */
-	value |= TSOEE; /* TSO ECC */
-	value |= MRXPEE; /* MTL RX Parser ECC */
-	value |= MESTEE; /* MTL EST ECC */
-	value |= MRXEE; /* MTL RX FIFO ECC */
-	value |= MTXEE; /* MTL TX FIFO ECC */
+	if (safety_feat_cfg->tsoee)
+		value |= TSOEE; /* TSO ECC */
+	if (safety_feat_cfg->mrxpee)
+		value |= MRXPEE; /* MTL RX Parser ECC */
+	if (safety_feat_cfg->mestee)
+		value |= MESTEE; /* MTL EST ECC */
+	if (safety_feat_cfg->mrxee)
+		value |= MRXEE; /* MTL RX FIFO ECC */
+	if (safety_feat_cfg->mtxee)
+		value |= MTXEE; /* MTL TX FIFO ECC */
 	writel(value, ioaddr + MTL_ECC_CONTROL);
 
 	/* 2. Enable MTL Safety Interrupts */
@@ -219,13 +225,16 @@ int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp)
 
 	/* 5. Enable Parity and Timeout for FSM */
 	value = readl(ioaddr + MAC_FSM_CONTROL);
-	value |= PRTYEN; /* FSM Parity Feature */
-	value |= TMOUTEN; /* FSM Timeout Feature */
+	if (safety_feat_cfg->prtyen)
+		value |= PRTYEN; /* FSM Parity Feature */
+	if (safety_feat_cfg->tmouten)
+		value |= TMOUTEN; /* FSM Timeout Feature */
 	writel(value, ioaddr + MAC_FSM_CONTROL);
 
 	/* 4. Enable Data Parity Protection */
 	value = readl(ioaddr + MTL_DPP_CONTROL);
-	value |= EDPP;
+	if (safety_feat_cfg->edpp)
+		value |= EDPP;
 	writel(value, ioaddr + MTL_DPP_CONTROL);
 
 	/*
@@ -235,7 +244,8 @@ int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp)
 	if (asp <= 0x2)
 		return 0;
 
-	value |= EPSI;
+	if (safety_feat_cfg->epsi)
+		value |= EPSI;
 	writel(value, ioaddr + MTL_DPP_CONTROL);
 	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
index 6b2fd37b29ad..53c138d0ff48 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
@@ -137,7 +137,8 @@
 
 #define GMAC_INT_FPE_EN			BIT(17)
 
-int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp);
+int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp,
+			      struct stmmac_safety_feature_cfg *safety_cfg);
 int dwmac5_safety_feat_irq_status(struct net_device *ndev,
 		void __iomem *ioaddr, unsigned int asp,
 		struct stmmac_safety_stats *stats);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index ad4df9bddcf3..c4d78fa93663 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -801,7 +801,9 @@ static void dwxgmac3_handle_dma_err(struct net_device *ndev,
 			   dwxgmac3_dma_errors, STAT_OFF(dma_errors), stats);
 }
 
-static int dwxgmac3_safety_feat_config(void __iomem *ioaddr, unsigned int asp)
+static int
+dwxgmac3_safety_feat_config(void __iomem *ioaddr, unsigned int asp,
+			    struct stmmac_safety_feature_cfg *safety_cfg)
 {
 	u32 value;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 75a8b90c202a..dbafedb24290 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -348,7 +348,8 @@ struct stmmac_ops {
 	void (*pcs_rane)(void __iomem *ioaddr, bool restart);
 	void (*pcs_get_adv_lp)(void __iomem *ioaddr, struct rgmii_adv *adv);
 	/* Safety Features */
-	int (*safety_feat_config)(void __iomem *ioaddr, unsigned int asp);
+	int (*safety_feat_config)(void __iomem *ioaddr, unsigned int asp,
+				  struct stmmac_safety_feature_cfg *safety_cfg);
 	int (*safety_feat_irq_status)(struct net_device *ndev,
 			void __iomem *ioaddr, unsigned int asp,
 			struct stmmac_safety_stats *stats);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 9962a1041d35..13720bf6f6ff 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3172,7 +3172,8 @@ static void stmmac_safety_feat_configuration(struct stmmac_priv *priv)
 {
 	if (priv->dma_cap.asp) {
 		netdev_info(priv->dev, "Enabling Safety Features\n");
-		stmmac_safety_feat_config(priv, priv->ioaddr, priv->dma_cap.asp);
+		stmmac_safety_feat_config(priv, priv->ioaddr, priv->dma_cap.asp,
+					  priv->plat->safety_feat_cfg);
 	} else {
 		netdev_info(priv->dev, "No Safety Features support found\n");
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index 95e0e4d6f74d..fcf17d8a0494 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -174,6 +174,12 @@ static int stmmac_pci_probe(struct pci_dev *pdev,
 	if (!plat->dma_cfg)
 		return -ENOMEM;
 
+	plat->safety_feat_cfg = devm_kzalloc(&pdev->dev,
+					     sizeof(*plat->safety_feat_cfg),
+					     GFP_KERNEL);
+	if (!plat->safety_feat_cfg)
+		return -ENOMEM;
+
 	/* Enable pci device */
 	ret = pci_enable_device(pdev);
 	if (ret) {
@@ -203,6 +209,16 @@ static int stmmac_pci_probe(struct pci_dev *pdev,
 	res.wol_irq = pdev->irq;
 	res.irq = pdev->irq;
 
+	plat->safety_feat_cfg->tsoee = 1;
+	plat->safety_feat_cfg->mrxpee = 1;
+	plat->safety_feat_cfg->mestee = 1;
+	plat->safety_feat_cfg->mrxee = 1;
+	plat->safety_feat_cfg->mtxee = 1;
+	plat->safety_feat_cfg->epsi = 1;
+	plat->safety_feat_cfg->edpp = 1;
+	plat->safety_feat_cfg->prtyen = 1;
+	plat->safety_feat_cfg->tmouten = 1;
+
 	return stmmac_dvr_probe(&pdev->dev, plat, &res);
 }
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index e14a12df381b..e55a4807e3ea 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -172,6 +172,18 @@ struct stmmac_fpe_cfg {
 	enum stmmac_fpe_state lo_fpe_state;	/* Local station FPE state */
 };
 
+struct stmmac_safety_feature_cfg {
+	u32 tsoee;
+	u32 mrxpee;
+	u32 mestee;
+	u32 mrxee;
+	u32 mtxee;
+	u32 epsi;
+	u32 edpp;
+	u32 prtyen;
+	u32 tmouten;
+};
+
 struct plat_stmmacenet_data {
 	int bus_id;
 	int phy_addr;
@@ -184,6 +196,7 @@ struct plat_stmmacenet_data {
 	struct stmmac_dma_cfg *dma_cfg;
 	struct stmmac_est *est;
 	struct stmmac_fpe_cfg *fpe_cfg;
+	struct stmmac_safety_feature_cfg *safety_feat_cfg;
 	int clk_csr;
 	int has_gmac;
 	int enh_desc;
-- 
cgit v1.2.3


From e1d9a90a9bfdb0735062d3adb16b07314b4b7b01 Mon Sep 17 00:00:00 2001
From: Sharath Chandra Vurukala <sharathv@codeaurora.org>
Date: Wed, 2 Jun 2021 00:58:35 +0530
Subject: net: ethernet: rmnet: Support for ingress MAPv5 checksum offload

Adding support for processing of MAPv5 downlink packets.
It involves parsing the Mapv5 packet and checking the csum header
to know whether the hardware has validated the checksum and is
valid or not.

Based on the checksum valid bit the corresponding stats are
incremented and skb->ip_summed is marked either CHECKSUM_UNNECESSARY
or left as CHEKSUM_NONE to let network stack revalidate the checksum
and update the respective snmp stats.

Current MAPV1 header has been modified, the reserved field in the
Mapv1 header is now used for next header indication.

Signed-off-by: Sharath Chandra Vurukala <sharathv@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c   | 17 ++++---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h    |  3 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   | 57 +++++++++++++++++++++-
 include/linux/if_rmnet.h                           | 30 ++++++++++--
 include/uapi/linux/if_link.h                       |  1 +
 5 files changed, 96 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index 0be5ac7ab261..706a225075a3 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, 2021, The Linux Foundation. All rights reserved.
  *
  * RMNET Data ingress/egress handler
  */
@@ -82,11 +82,16 @@ __rmnet_map_ingress_handler(struct sk_buff *skb,
 
 	skb->dev = ep->egress_dev;
 
-	/* Subtract MAP header */
-	skb_pull(skb, sizeof(struct rmnet_map_header));
-	rmnet_set_skb_proto(skb);
-
-	if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4) {
+	if ((port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV5) &&
+	    (map_header->flags & MAP_NEXT_HEADER_FLAG)) {
+		if (rmnet_map_process_next_hdr_packet(skb, len))
+			goto free_skb;
+		skb_pull(skb, sizeof(*map_header));
+		rmnet_set_skb_proto(skb);
+	} else if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4) {
+		/* Subtract MAP header */
+		skb_pull(skb, sizeof(*map_header));
+		rmnet_set_skb_proto(skb);
 		if (!rmnet_map_checksum_downlink_packet(skb, len + pad))
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 2aea153f4247..1a399bfa07d2 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, 2021, The Linux Foundation. All rights reserved.
  */
 
 #ifndef _RMNET_MAP_H_
@@ -48,5 +48,6 @@ void rmnet_map_command(struct sk_buff *skb, struct rmnet_port *port);
 int rmnet_map_checksum_downlink_packet(struct sk_buff *skb, u16 len);
 void rmnet_map_checksum_uplink_packet(struct sk_buff *skb,
 				      struct net_device *orig_dev);
+int rmnet_map_process_next_hdr_packet(struct sk_buff *skb, u16 len);
 
 #endif /* _RMNET_MAP_H_ */
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index 0ac2ff828320..5c018bd64689 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, 2021, The Linux Foundation. All rights reserved.
  *
  * RMNET Data MAP protocol
  */
@@ -8,6 +8,7 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <net/ip6_checksum.h>
+#include <linux/bitfield.h>
 #include "rmnet_config.h"
 #include "rmnet_map.h"
 #include "rmnet_private.h"
@@ -300,8 +301,11 @@ done:
 struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
 				      struct rmnet_port *port)
 {
+	struct rmnet_map_v5_csum_header *next_hdr = NULL;
 	struct rmnet_map_header *maph;
+	void *data = skb->data;
 	struct sk_buff *skbn;
+	u8 nexthdr_type;
 	u32 packet_len;
 
 	if (skb->len == 0)
@@ -310,8 +314,18 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
 	maph = (struct rmnet_map_header *)skb->data;
 	packet_len = ntohs(maph->pkt_len) + sizeof(*maph);
 
-	if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4)
+	if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4) {
 		packet_len += sizeof(struct rmnet_map_dl_csum_trailer);
+	} else if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV5) {
+		if (!(maph->flags & MAP_CMD_FLAG)) {
+			packet_len += sizeof(*next_hdr);
+			if (maph->flags & MAP_NEXT_HEADER_FLAG)
+				next_hdr = data + sizeof(*maph);
+			else
+				/* Mapv5 data pkt without csum hdr is invalid */
+				return NULL;
+		}
+	}
 
 	if (((int)skb->len - (int)packet_len) < 0)
 		return NULL;
@@ -320,6 +334,13 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
 	if (!maph->pkt_len)
 		return NULL;
 
+	if (next_hdr) {
+		nexthdr_type = u8_get_bits(next_hdr->header_info,
+					   MAPV5_HDRINFO_HDR_TYPE_FMASK);
+		if (nexthdr_type != RMNET_MAP_HEADER_TYPE_CSUM_OFFLOAD)
+			return NULL;
+	}
+
 	skbn = alloc_skb(packet_len + RMNET_MAP_DEAGGR_SPACING, GFP_ATOMIC);
 	if (!skbn)
 		return NULL;
@@ -414,3 +435,35 @@ sw_csum:
 
 	priv->stats.csum_sw++;
 }
+
+/* Process a MAPv5 packet header */
+int rmnet_map_process_next_hdr_packet(struct sk_buff *skb,
+				      u16 len)
+{
+	struct rmnet_priv *priv = netdev_priv(skb->dev);
+	struct rmnet_map_v5_csum_header *next_hdr;
+	u8 nexthdr_type;
+
+	next_hdr = (struct rmnet_map_v5_csum_header *)(skb->data +
+			sizeof(struct rmnet_map_header));
+
+	nexthdr_type = u8_get_bits(next_hdr->header_info,
+				   MAPV5_HDRINFO_HDR_TYPE_FMASK);
+
+	if (nexthdr_type != RMNET_MAP_HEADER_TYPE_CSUM_OFFLOAD)
+		return -EINVAL;
+
+	if (unlikely(!(skb->dev->features & NETIF_F_RXCSUM))) {
+		priv->stats.csum_sw++;
+	} else if (next_hdr->csum_info & MAPV5_CSUMINFO_VALID_FLAG) {
+		priv->stats.csum_ok++;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	} else {
+		priv->stats.csum_valid_unset++;
+	}
+
+	/* Pull csum v5 header */
+	skb_pull(skb, sizeof(*next_hdr));
+
+	return 0;
+}
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index 4efb537f57f3..be17610a981e 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only
- * Copyright (c) 2013-2019, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2013-2019, 2021 The Linux Foundation. All rights reserved.
  */
 
 #ifndef _LINUX_IF_RMNET_H_
@@ -12,10 +12,12 @@ struct rmnet_map_header {
 }  __aligned(1);
 
 /* rmnet_map_header flags field:
- *  PAD_LEN:	number of pad bytes following packet data
- *  CMD:	1 = packet contains a MAP command; 0 = packet contains data
+ *  PAD_LEN:	  number of pad bytes following packet data
+ *  CMD:	  1 = packet contains a MAP command; 0 = packet contains data
+ *  NEXT_HEADER: 1 = packet contains V5 CSUM header 0 = no V5 CSUM header
  */
 #define MAP_PAD_LEN_MASK		GENMASK(5, 0)
+#define MAP_NEXT_HEADER_FLAG		BIT(6)
 #define MAP_CMD_FLAG			BIT(7)
 
 struct rmnet_map_dl_csum_trailer {
@@ -45,4 +47,26 @@ struct rmnet_map_ul_csum_header {
 #define MAP_CSUM_UL_UDP_FLAG		BIT(14)
 #define MAP_CSUM_UL_ENABLED_FLAG	BIT(15)
 
+/* MAP CSUM headers */
+struct rmnet_map_v5_csum_header {
+	u8 header_info;
+	u8 csum_info;
+	__be16 reserved;
+} __aligned(1);
+
+/* v5 header_info field
+ * NEXT_HEADER: represents whether there is any next header
+ * HEADER_TYPE: represents the type of this header
+ *
+ * csum_info field
+ * CSUM_VALID_OR_REQ:
+ * 1 = for UL, checksum computation is requested.
+ * 1 = for DL, validated the checksum and has found it valid
+ */
+
+#define MAPV5_HDRINFO_NXT_HDR_FLAG	BIT(0)
+#define MAPV5_HDRINFO_HDR_TYPE_FMASK	GENMASK(7, 1)
+#define MAPV5_CSUMINFO_VALID_FLAG	BIT(7)
+
+#define RMNET_MAP_HEADER_TYPE_CSUM_OFFLOAD 2
 #endif /* !(_LINUX_IF_RMNET_H_) */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index cd5b382a4138..1f753dcd85e1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1236,6 +1236,7 @@ enum {
 #define RMNET_FLAGS_INGRESS_MAP_COMMANDS          (1U << 1)
 #define RMNET_FLAGS_INGRESS_MAP_CKSUMV4           (1U << 2)
 #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
+#define RMNET_FLAGS_INGRESS_MAP_CKSUMV5           (1U << 4)
 
 enum {
 	IFLA_RMNET_UNSPEC,
-- 
cgit v1.2.3


From 216214c64a8c1cb9078c2c0aec7bb4a2f8e75397 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Wed, 9 Dec 2020 16:40:38 +0200
Subject: net/mlx5: DR, Create multi-destination flow table with level less
 than 64

Flow table that contains flow pointing to multiple flow tables or multiple
TIRs must have a level lower than 64. In our case it applies to muli-
destination flow table.
Fix the level of the created table to comply with HW Spec definitions, and
still make sure that its level lower than SW-owned tables, so that it
would be possible to point from the multi-destination FW table to SW
tables.

Fixes: 34583beea4b7 ("net/mlx5: DR, Create multi-destination table for SW-steering use")
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c | 3 ++-
 include/linux/mlx5/mlx5_ifc.h                            | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c
index 1fbcd012bb85..7ccfd40586ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c
@@ -112,7 +112,8 @@ int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn,
 	int ret;
 
 	ft_attr.table_type = MLX5_FLOW_TABLE_TYPE_FDB;
-	ft_attr.level = dmn->info.caps.max_ft_level - 2;
+	ft_attr.level = min_t(int, dmn->info.caps.max_ft_level - 2,
+			      MLX5_FT_MAX_MULTIPATH_LEVEL);
 	ft_attr.reformat_en = reformat_req;
 	ft_attr.decap_en = reformat_req;
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6d16eed6850e..eb86e80e4643 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1289,6 +1289,8 @@ enum mlx5_fc_bulk_alloc_bitmask {
 
 #define MLX5_FC_BULK_NUM_FCS(fc_enum) (MLX5_FC_BULK_SIZE_FACTOR * (fc_enum))
 
+#define MLX5_FT_MAX_MULTIPATH_LEVEL 63
+
 enum {
 	MLX5_STEERING_FORMAT_CONNECTX_5   = 0,
 	MLX5_STEERING_FORMAT_CONNECTX_6DX = 1,
-- 
cgit v1.2.3


From d27ac0fba71cfd4da45f1ba6564f32ddd2914cc4 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Tue, 1 Jun 2021 16:37:54 -0700
Subject: Input: cyttsp - remove public header

There is nothing in include/linux/input/cyttsp.h that might be of interes
to the kernel at large, so let's move this information into the driver
code and remove the header.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210531052307.1433979-2-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 MAINTAINERS                             |  1 -
 drivers/input/touchscreen/cyttsp_core.c |  9 ++++++++-
 drivers/input/touchscreen/cyttsp_core.h |  1 -
 drivers/input/touchscreen/cyttsp_i2c.c  |  2 ++
 drivers/input/touchscreen/cyttsp_spi.c  |  2 ++
 include/linux/input/cyttsp.h            | 29 -----------------------------
 6 files changed, 12 insertions(+), 32 deletions(-)
 delete mode 100644 include/linux/input/cyttsp.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 9a43400480e2..920f33866799 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4918,7 +4918,6 @@ M:	Linus Walleij <linus.walleij@linaro.org>
 L:	linux-input@vger.kernel.org
 S:	Maintained
 F:	drivers/input/touchscreen/cyttsp*
-F:	include/linux/input/cyttsp.h
 
 D-LINK DIR-685 TOUCHKEYS DRIVER
 M:	Linus Walleij <linus.walleij@linaro.org>
diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c
index a6352d79e23d..1dbd849c9613 100644
--- a/drivers/input/touchscreen/cyttsp_core.c
+++ b/drivers/input/touchscreen/cyttsp_core.c
@@ -46,8 +46,15 @@
 #define CY_MAXZ				255
 #define CY_DELAY_DFLT			20 /* ms */
 #define CY_DELAY_MAX			500
-#define CY_ACT_DIST_DFLT		0xF8
+/* Active distance in pixels for a gesture to be reported */
+#define CY_ACT_DIST_DFLT		0xF8 /* pixels */
 #define CY_ACT_DIST_MASK		0x0F
+/* Active Power state scanning/processing refresh interval */
+#define CY_ACT_INTRVL_DFLT		0x00 /* ms */
+/* Low Power state scanning/processing refresh interval */
+#define CY_LP_INTRVL_DFLT		0x0A /* ms */
+/* touch timeout for the Active power */
+#define CY_TCH_TMOUT_DFLT		0xFF /* ms */
 #define CY_HNDSHK_BIT			0x80
 /* device mode bits */
 #define CY_OPERATE_MODE			0x00
diff --git a/drivers/input/touchscreen/cyttsp_core.h b/drivers/input/touchscreen/cyttsp_core.h
index 8eba9d8ba74a..075509e695a2 100644
--- a/drivers/input/touchscreen/cyttsp_core.h
+++ b/drivers/input/touchscreen/cyttsp_core.h
@@ -22,7 +22,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/device.h>
-#include <linux/input/cyttsp.h>
 #include <linux/regulator/consumer.h>
 
 #define CY_NUM_RETRY		16 /* max number of retries for read ops */
diff --git a/drivers/input/touchscreen/cyttsp_i2c.c b/drivers/input/touchscreen/cyttsp_i2c.c
index 0a09f07bc23a..4c8473d327ab 100644
--- a/drivers/input/touchscreen/cyttsp_i2c.c
+++ b/drivers/input/touchscreen/cyttsp_i2c.c
@@ -18,6 +18,8 @@
 #include <linux/i2c.h>
 #include <linux/input.h>
 
+#define CY_I2C_NAME		"cyttsp-i2c"
+
 #define CY_I2C_DATA_SIZE	128
 
 static const struct cyttsp_bus_ops cyttsp_i2c_bus_ops = {
diff --git a/drivers/input/touchscreen/cyttsp_spi.c b/drivers/input/touchscreen/cyttsp_spi.c
index 8715e5354d79..30c6fbf86a86 100644
--- a/drivers/input/touchscreen/cyttsp_spi.c
+++ b/drivers/input/touchscreen/cyttsp_spi.c
@@ -20,6 +20,8 @@
 #include <linux/input.h>
 #include <linux/spi/spi.h>
 
+#define CY_SPI_NAME		"cyttsp-spi"
+
 #define CY_SPI_WR_OP		0x00 /* r/~w */
 #define CY_SPI_RD_OP		0x01
 #define CY_SPI_CMD_BYTES	4
diff --git a/include/linux/input/cyttsp.h b/include/linux/input/cyttsp.h
deleted file mode 100644
index 118b9af6e01a..000000000000
--- a/include/linux/input/cyttsp.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Header file for:
- * Cypress TrueTouch(TM) Standard Product (TTSP) touchscreen drivers.
- * For use with Cypress Txx3xx parts.
- * Supported parts include:
- * CY8CTST341
- * CY8CTMA340
- *
- * Copyright (C) 2009, 2010, 2011 Cypress Semiconductor, Inc.
- * Copyright (C) 2012 Javier Martinez Canillas <javier@dowhile0.org>
- *
- * Contact Cypress Semiconductor at www.cypress.com (kev@cypress.com)
- */
-#ifndef _CYTTSP_H_
-#define _CYTTSP_H_
-
-#define CY_SPI_NAME "cyttsp-spi"
-#define CY_I2C_NAME "cyttsp-i2c"
-/* Active Power state scanning/processing refresh interval */
-#define CY_ACT_INTRVL_DFLT 0x00 /* ms */
-/* touch timeout for the Active power */
-#define CY_TCH_TMOUT_DFLT 0xFF /* ms */
-/* Low Power state scanning/processing refresh interval */
-#define CY_LP_INTRVL_DFLT 0x0A /* ms */
-/* Active distance in pixels for a gesture to be reported */
-#define CY_ACT_DIST_DFLT 0xF8 /* pixels */
-
-#endif /* _CYTTSP_H_ */
-- 
cgit v1.2.3


From 9a2601ebc2e909ec2260ca224d886936f56d41e7 Mon Sep 17 00:00:00 2001
From: Mattijs Korpershoek <mkorpershoek@baylibre.com>
Date: Thu, 6 May 2021 11:41:13 +0200
Subject: mfd: mt6397: Add MT6358 register definitions for power key

To support power/home key detection, add definitions for
two more MT6358 PMIC registers:

- TOPSTATUS: homekey and powerkey debounce status
- TOP_RST_MISC: controls homekey,powerkey long press reset time

Signed-off-by: Mattijs Korpershoek <mkorpershoek@baylibre.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/mt6358/registers.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/mt6358/registers.h b/include/linux/mfd/mt6358/registers.h
index 2ad0b312aa28..201139b12140 100644
--- a/include/linux/mfd/mt6358/registers.h
+++ b/include/linux/mfd/mt6358/registers.h
@@ -8,6 +8,8 @@
 
 /* PMIC Registers */
 #define MT6358_SWCID                          0xa
+#define MT6358_TOPSTATUS                      0x28
+#define MT6358_TOP_RST_MISC                   0x14c
 #define MT6358_MISC_TOP_INT_CON0              0x188
 #define MT6358_MISC_TOP_INT_STATUS0           0x194
 #define MT6358_TOP_INT_STATUS0                0x19e
-- 
cgit v1.2.3


From 50e4d7a2a667353321d4315fcc025e76c4fa2a89 Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca@lucaceresoli.net>
Date: Fri, 26 Feb 2021 15:28:52 +0100
Subject: mfd: lp87565: Handle optional reset pin

Optionally handle the NRST pin (active low reset) in order to start from a
known state during boot and to shut down the chip when rebooting.

Signed-off-by: Luca Ceresoli <luca@lucaceresoli.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/lp87565.c       | 27 +++++++++++++++++++++++++++
 include/linux/mfd/lp87565.h |  1 +
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/lp87565.c b/drivers/mfd/lp87565.c
index 9c21483d9653..a52ab76febb3 100644
--- a/drivers/mfd/lp87565.c
+++ b/drivers/mfd/lp87565.c
@@ -5,6 +5,7 @@
  * Author: Keerthy <j-keerthy@ti.com>
  */
 
+#include <linux/gpio/consumer.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/core.h>
 #include <linux/module.h>
@@ -64,6 +65,24 @@ static int lp87565_probe(struct i2c_client *client,
 		return ret;
 	}
 
+	lp87565->reset_gpio = devm_gpiod_get_optional(lp87565->dev, "reset",
+						      GPIOD_OUT_LOW);
+	if (IS_ERR(lp87565->reset_gpio)) {
+		ret = PTR_ERR(lp87565->reset_gpio);
+		if (ret == -EPROBE_DEFER)
+			return ret;
+	}
+
+	if (lp87565->reset_gpio) {
+		gpiod_set_value_cansleep(lp87565->reset_gpio, 1);
+		/* The minimum assertion time is undocumented, just guess */
+		usleep_range(2000, 4000);
+
+		gpiod_set_value_cansleep(lp87565->reset_gpio, 0);
+		/* Min 1.2 ms before first I2C transaction */
+		usleep_range(1500, 3000);
+	}
+
 	ret = regmap_read(lp87565->regmap, LP87565_REG_OTP_REV, &otpid);
 	if (ret) {
 		dev_err(lp87565->dev, "Failed to read OTP ID\n");
@@ -83,6 +102,13 @@ static int lp87565_probe(struct i2c_client *client,
 				    NULL, 0, NULL);
 }
 
+static void lp87565_shutdown(struct i2c_client *client)
+{
+	struct lp87565 *lp87565 = i2c_get_clientdata(client);
+
+	gpiod_set_value_cansleep(lp87565->reset_gpio, 1);
+}
+
 static const struct i2c_device_id lp87565_id_table[] = {
 	{ "lp87565-q1", 0 },
 	{ },
@@ -95,6 +121,7 @@ static struct i2c_driver lp87565_driver = {
 		.of_match_table = of_lp87565_match_table,
 	},
 	.probe = lp87565_probe,
+	.shutdown = lp87565_shutdown,
 	.id_table = lp87565_id_table,
 };
 module_i2c_driver(lp87565_driver);
diff --git a/include/linux/mfd/lp87565.h b/include/linux/mfd/lp87565.h
index 94cb581af34b..4c895072d91b 100644
--- a/include/linux/mfd/lp87565.h
+++ b/include/linux/mfd/lp87565.h
@@ -252,5 +252,6 @@ struct lp87565 {
 	u8 rev;
 	u8 dev_type;
 	struct regmap *regmap;
+	struct gpio_desc *reset_gpio;
 };
 #endif /* __LINUX_MFD_LP87565_H */
-- 
cgit v1.2.3


From 50e89312e39dff9e779e267fc191249a27294f39 Mon Sep 17 00:00:00 2001
From: Gene Chen <gene_chen@richtek.com>
Date: Tue, 18 May 2021 01:33:06 +0800
Subject: mfd: mt6360: Remove redundant brackets around raw numbers

Remove redundant brackets around raw numbers.

Signed-off-by: Gene Chen <gene_chen@richtek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mt6360-core.c  | 172 +++++++++----------
 include/linux/mfd/mt6360.h | 410 ++++++++++++++++++++++-----------------------
 2 files changed, 291 insertions(+), 291 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c
index de6cf8edff0c..c374aa6c4a1b 100644
--- a/drivers/mfd/mt6360-core.c
+++ b/drivers/mfd/mt6360-core.c
@@ -18,107 +18,107 @@
 #include <linux/mfd/mt6360.h>
 
 /* reg 0 -> 0 ~ 7 */
-#define MT6360_CHG_TREG_EVT		(4)
-#define MT6360_CHG_AICR_EVT		(5)
-#define MT6360_CHG_MIVR_EVT		(6)
-#define MT6360_PWR_RDY_EVT		(7)
+#define MT6360_CHG_TREG_EVT		4
+#define MT6360_CHG_AICR_EVT		5
+#define MT6360_CHG_MIVR_EVT		6
+#define MT6360_PWR_RDY_EVT		7
 /* REG 1 -> 8 ~ 15 */
-#define MT6360_CHG_BATSYSUV_EVT		(9)
-#define MT6360_FLED_CHG_VINOVP_EVT	(11)
-#define MT6360_CHG_VSYSUV_EVT		(12)
-#define MT6360_CHG_VSYSOV_EVT		(13)
-#define MT6360_CHG_VBATOV_EVT		(14)
-#define MT6360_CHG_VBUSOV_EVT		(15)
+#define MT6360_CHG_BATSYSUV_EVT		9
+#define MT6360_FLED_CHG_VINOVP_EVT	11
+#define MT6360_CHG_VSYSUV_EVT		12
+#define MT6360_CHG_VSYSOV_EVT		13
+#define MT6360_CHG_VBATOV_EVT		14
+#define MT6360_CHG_VBUSOV_EVT		15
 /* REG 2 -> 16 ~ 23 */
 /* REG 3 -> 24 ~ 31 */
-#define MT6360_WD_PMU_DET		(25)
-#define MT6360_WD_PMU_DONE		(26)
-#define MT6360_CHG_TMRI			(27)
-#define MT6360_CHG_ADPBADI		(29)
-#define MT6360_CHG_RVPI			(30)
-#define MT6360_OTPI			(31)
+#define MT6360_WD_PMU_DET		25
+#define MT6360_WD_PMU_DONE		26
+#define MT6360_CHG_TMRI			27
+#define MT6360_CHG_ADPBADI		29
+#define MT6360_CHG_RVPI			30
+#define MT6360_OTPI			31
 /* REG 4 -> 32 ~ 39 */
-#define MT6360_CHG_AICCMEASL		(32)
-#define MT6360_CHGDET_DONEI		(34)
-#define MT6360_WDTMRI			(35)
-#define MT6360_SSFINISHI		(36)
-#define MT6360_CHG_RECHGI		(37)
-#define MT6360_CHG_TERMI		(38)
-#define MT6360_CHG_IEOCI		(39)
+#define MT6360_CHG_AICCMEASL		32
+#define MT6360_CHGDET_DONEI		34
+#define MT6360_WDTMRI			35
+#define MT6360_SSFINISHI		36
+#define MT6360_CHG_RECHGI		37
+#define MT6360_CHG_TERMI		38
+#define MT6360_CHG_IEOCI		39
 /* REG 5 -> 40 ~ 47 */
-#define MT6360_PUMPX_DONEI		(40)
-#define MT6360_BAT_OVP_ADC_EVT		(41)
-#define MT6360_TYPEC_OTP_EVT		(42)
-#define MT6360_ADC_WAKEUP_EVT		(43)
-#define MT6360_ADC_DONEI		(44)
-#define MT6360_BST_BATUVI		(45)
-#define MT6360_BST_VBUSOVI		(46)
-#define MT6360_BST_OLPI			(47)
+#define MT6360_PUMPX_DONEI		40
+#define MT6360_BAT_OVP_ADC_EVT		41
+#define MT6360_TYPEC_OTP_EVT		42
+#define MT6360_ADC_WAKEUP_EVT		43
+#define MT6360_ADC_DONEI		44
+#define MT6360_BST_BATUVI		45
+#define MT6360_BST_VBUSOVI		46
+#define MT6360_BST_OLPI			47
 /* REG 6 -> 48 ~ 55 */
-#define MT6360_ATTACH_I			(48)
-#define MT6360_DETACH_I			(49)
-#define MT6360_QC30_STPDONE		(51)
-#define MT6360_QC_VBUSDET_DONE		(52)
-#define MT6360_HVDCP_DET		(53)
-#define MT6360_CHGDETI			(54)
-#define MT6360_DCDTI			(55)
+#define MT6360_ATTACH_I			48
+#define MT6360_DETACH_I			49
+#define MT6360_QC30_STPDONE		51
+#define MT6360_QC_VBUSDET_DONE		52
+#define MT6360_HVDCP_DET		53
+#define MT6360_CHGDETI			54
+#define MT6360_DCDTI			55
 /* REG 7 -> 56 ~ 63 */
-#define MT6360_FOD_DONE_EVT		(56)
-#define MT6360_FOD_OV_EVT		(57)
-#define MT6360_CHRDET_UVP_EVT		(58)
-#define MT6360_CHRDET_OVP_EVT		(59)
-#define MT6360_CHRDET_EXT_EVT		(60)
-#define MT6360_FOD_LR_EVT		(61)
-#define MT6360_FOD_HR_EVT		(62)
-#define MT6360_FOD_DISCHG_FAIL_EVT	(63)
+#define MT6360_FOD_DONE_EVT		56
+#define MT6360_FOD_OV_EVT		57
+#define MT6360_CHRDET_UVP_EVT		58
+#define MT6360_CHRDET_OVP_EVT		59
+#define MT6360_CHRDET_EXT_EVT		60
+#define MT6360_FOD_LR_EVT		61
+#define MT6360_FOD_HR_EVT		62
+#define MT6360_FOD_DISCHG_FAIL_EVT	63
 /* REG 8 -> 64 ~ 71 */
-#define MT6360_USBID_EVT		(64)
-#define MT6360_APWDTRST_EVT		(65)
-#define MT6360_EN_EVT			(66)
-#define MT6360_QONB_RST_EVT		(67)
-#define MT6360_MRSTB_EVT		(68)
-#define MT6360_OTP_EVT			(69)
-#define MT6360_VDDAOV_EVT		(70)
-#define MT6360_SYSUV_EVT		(71)
+#define MT6360_USBID_EVT		64
+#define MT6360_APWDTRST_EVT		65
+#define MT6360_EN_EVT			66
+#define MT6360_QONB_RST_EVT		67
+#define MT6360_MRSTB_EVT		68
+#define MT6360_OTP_EVT			69
+#define MT6360_VDDAOV_EVT		70
+#define MT6360_SYSUV_EVT		71
 /* REG 9 -> 72 ~ 79 */
-#define MT6360_FLED_STRBPIN_EVT		(72)
-#define MT6360_FLED_TORPIN_EVT		(73)
-#define MT6360_FLED_TX_EVT		(74)
-#define MT6360_FLED_LVF_EVT		(75)
-#define MT6360_FLED2_SHORT_EVT		(78)
-#define MT6360_FLED1_SHORT_EVT		(79)
+#define MT6360_FLED_STRBPIN_EVT		72
+#define MT6360_FLED_TORPIN_EVT		73
+#define MT6360_FLED_TX_EVT		74
+#define MT6360_FLED_LVF_EVT		75
+#define MT6360_FLED2_SHORT_EVT		78
+#define MT6360_FLED1_SHORT_EVT		79
 /* REG 10 -> 80 ~ 87 */
-#define MT6360_FLED2_STRB_EVT		(80)
-#define MT6360_FLED1_STRB_EVT		(81)
-#define MT6360_FLED2_STRB_TO_EVT	(82)
-#define MT6360_FLED1_STRB_TO_EVT	(83)
-#define MT6360_FLED2_TOR_EVT		(84)
-#define MT6360_FLED1_TOR_EVT		(85)
+#define MT6360_FLED2_STRB_EVT		80
+#define MT6360_FLED1_STRB_EVT		81
+#define MT6360_FLED2_STRB_TO_EVT	82
+#define MT6360_FLED1_STRB_TO_EVT	83
+#define MT6360_FLED2_TOR_EVT		84
+#define MT6360_FLED1_TOR_EVT		85
 /* REG 11 -> 88 ~ 95 */
 /* REG 12 -> 96 ~ 103 */
-#define MT6360_BUCK1_PGB_EVT		(96)
-#define MT6360_BUCK1_OC_EVT		(100)
-#define MT6360_BUCK1_OV_EVT		(101)
-#define MT6360_BUCK1_UV_EVT		(102)
+#define MT6360_BUCK1_PGB_EVT		96
+#define MT6360_BUCK1_OC_EVT		100
+#define MT6360_BUCK1_OV_EVT		101
+#define MT6360_BUCK1_UV_EVT		102
 /* REG 13 -> 104 ~ 111 */
-#define MT6360_BUCK2_PGB_EVT		(104)
-#define MT6360_BUCK2_OC_EVT		(108)
-#define MT6360_BUCK2_OV_EVT		(109)
-#define MT6360_BUCK2_UV_EVT		(110)
+#define MT6360_BUCK2_PGB_EVT		104
+#define MT6360_BUCK2_OC_EVT		108
+#define MT6360_BUCK2_OV_EVT		109
+#define MT6360_BUCK2_UV_EVT		110
 /* REG 14 -> 112 ~ 119 */
-#define MT6360_LDO1_OC_EVT		(113)
-#define MT6360_LDO2_OC_EVT		(114)
-#define MT6360_LDO3_OC_EVT		(115)
-#define MT6360_LDO5_OC_EVT		(117)
-#define MT6360_LDO6_OC_EVT		(118)
-#define MT6360_LDO7_OC_EVT		(119)
+#define MT6360_LDO1_OC_EVT		113
+#define MT6360_LDO2_OC_EVT		114
+#define MT6360_LDO3_OC_EVT		115
+#define MT6360_LDO5_OC_EVT		117
+#define MT6360_LDO6_OC_EVT		118
+#define MT6360_LDO7_OC_EVT		119
 /* REG 15 -> 120 ~ 127 */
-#define MT6360_LDO1_PGB_EVT		(121)
-#define MT6360_LDO2_PGB_EVT		(122)
-#define MT6360_LDO3_PGB_EVT		(123)
-#define MT6360_LDO5_PGB_EVT		(125)
-#define MT6360_LDO6_PGB_EVT		(126)
-#define MT6360_LDO7_PGB_EVT		(127)
+#define MT6360_LDO1_PGB_EVT		121
+#define MT6360_LDO2_PGB_EVT		122
+#define MT6360_LDO3_PGB_EVT		123
+#define MT6360_LDO5_PGB_EVT		125
+#define MT6360_LDO6_PGB_EVT		126
+#define MT6360_LDO7_PGB_EVT		127
 
 static const struct regmap_irq mt6360_pmu_irqs[] =  {
 	REGMAP_IRQ_REG_LINE(MT6360_CHG_TREG_EVT, 8),
diff --git a/include/linux/mfd/mt6360.h b/include/linux/mfd/mt6360.h
index ea1304035d4d..72edf1352229 100644
--- a/include/linux/mfd/mt6360.h
+++ b/include/linux/mfd/mt6360.h
@@ -16,10 +16,10 @@ enum {
 	MT6360_SLAVE_MAX,
 };
 
-#define MT6360_PMU_SLAVEID	(0x34)
-#define MT6360_PMIC_SLAVEID	(0x1A)
-#define MT6360_LDO_SLAVEID	(0x64)
-#define MT6360_TCPC_SLAVEID	(0x4E)
+#define MT6360_PMU_SLAVEID	0x34
+#define MT6360_PMIC_SLAVEID	0x1A
+#define MT6360_LDO_SLAVEID	0x64
+#define MT6360_TCPC_SLAVEID	0x4E
 
 struct mt6360_pmu_data {
 	struct i2c_client *i2c[MT6360_SLAVE_MAX];
@@ -30,211 +30,211 @@ struct mt6360_pmu_data {
 };
 
 /* PMU register defininition */
-#define MT6360_PMU_DEV_INFO			(0x00)
-#define MT6360_PMU_CORE_CTRL1			(0x01)
-#define MT6360_PMU_RST1				(0x02)
-#define MT6360_PMU_CRCEN			(0x03)
-#define MT6360_PMU_RST_PAS_CODE1		(0x04)
-#define MT6360_PMU_RST_PAS_CODE2		(0x05)
-#define MT6360_PMU_CORE_CTRL2			(0x06)
-#define MT6360_PMU_TM_PAS_CODE1			(0x07)
-#define MT6360_PMU_TM_PAS_CODE2			(0x08)
-#define MT6360_PMU_TM_PAS_CODE3			(0x09)
-#define MT6360_PMU_TM_PAS_CODE4			(0x0A)
-#define MT6360_PMU_IRQ_IND			(0x0B)
-#define MT6360_PMU_IRQ_MASK			(0x0C)
-#define MT6360_PMU_IRQ_SET			(0x0D)
-#define MT6360_PMU_SHDN_CTRL			(0x0E)
-#define MT6360_PMU_TM_INF			(0x0F)
-#define MT6360_PMU_I2C_CTRL			(0x10)
-#define MT6360_PMU_CHG_CTRL1			(0x11)
-#define MT6360_PMU_CHG_CTRL2			(0x12)
-#define MT6360_PMU_CHG_CTRL3			(0x13)
-#define MT6360_PMU_CHG_CTRL4			(0x14)
-#define MT6360_PMU_CHG_CTRL5			(0x15)
-#define MT6360_PMU_CHG_CTRL6			(0x16)
-#define MT6360_PMU_CHG_CTRL7			(0x17)
-#define MT6360_PMU_CHG_CTRL8			(0x18)
-#define MT6360_PMU_CHG_CTRL9			(0x19)
-#define MT6360_PMU_CHG_CTRL10			(0x1A)
-#define MT6360_PMU_CHG_CTRL11			(0x1B)
-#define MT6360_PMU_CHG_CTRL12			(0x1C)
-#define MT6360_PMU_CHG_CTRL13			(0x1D)
-#define MT6360_PMU_CHG_CTRL14			(0x1E)
-#define MT6360_PMU_CHG_CTRL15			(0x1F)
-#define MT6360_PMU_CHG_CTRL16			(0x20)
-#define MT6360_PMU_CHG_AICC_RESULT		(0x21)
-#define MT6360_PMU_DEVICE_TYPE			(0x22)
-#define MT6360_PMU_QC_CONTROL1			(0x23)
-#define MT6360_PMU_QC_CONTROL2			(0x24)
-#define MT6360_PMU_QC30_CONTROL1		(0x25)
-#define MT6360_PMU_QC30_CONTROL2		(0x26)
-#define MT6360_PMU_USB_STATUS1			(0x27)
-#define MT6360_PMU_QC_STATUS1			(0x28)
-#define MT6360_PMU_QC_STATUS2			(0x29)
-#define MT6360_PMU_CHG_PUMP			(0x2A)
-#define MT6360_PMU_CHG_CTRL17			(0x2B)
-#define MT6360_PMU_CHG_CTRL18			(0x2C)
-#define MT6360_PMU_CHRDET_CTRL1			(0x2D)
-#define MT6360_PMU_CHRDET_CTRL2			(0x2E)
-#define MT6360_PMU_DPDN_CTRL			(0x2F)
-#define MT6360_PMU_CHG_HIDDEN_CTRL1		(0x30)
-#define MT6360_PMU_CHG_HIDDEN_CTRL2		(0x31)
-#define MT6360_PMU_CHG_HIDDEN_CTRL3		(0x32)
-#define MT6360_PMU_CHG_HIDDEN_CTRL4		(0x33)
-#define MT6360_PMU_CHG_HIDDEN_CTRL5		(0x34)
-#define MT6360_PMU_CHG_HIDDEN_CTRL6		(0x35)
-#define MT6360_PMU_CHG_HIDDEN_CTRL7		(0x36)
-#define MT6360_PMU_CHG_HIDDEN_CTRL8		(0x37)
-#define MT6360_PMU_CHG_HIDDEN_CTRL9		(0x38)
-#define MT6360_PMU_CHG_HIDDEN_CTRL10		(0x39)
-#define MT6360_PMU_CHG_HIDDEN_CTRL11		(0x3A)
-#define MT6360_PMU_CHG_HIDDEN_CTRL12		(0x3B)
-#define MT6360_PMU_CHG_HIDDEN_CTRL13		(0x3C)
-#define MT6360_PMU_CHG_HIDDEN_CTRL14		(0x3D)
-#define MT6360_PMU_CHG_HIDDEN_CTRL15		(0x3E)
-#define MT6360_PMU_CHG_HIDDEN_CTRL16		(0x3F)
-#define MT6360_PMU_CHG_HIDDEN_CTRL17		(0x40)
-#define MT6360_PMU_CHG_HIDDEN_CTRL18		(0x41)
-#define MT6360_PMU_CHG_HIDDEN_CTRL19		(0x42)
-#define MT6360_PMU_CHG_HIDDEN_CTRL20		(0x43)
-#define MT6360_PMU_CHG_HIDDEN_CTRL21		(0x44)
-#define MT6360_PMU_CHG_HIDDEN_CTRL22		(0x45)
-#define MT6360_PMU_CHG_HIDDEN_CTRL23		(0x46)
-#define MT6360_PMU_CHG_HIDDEN_CTRL24		(0x47)
-#define MT6360_PMU_CHG_HIDDEN_CTRL25		(0x48)
-#define MT6360_PMU_BC12_CTRL			(0x49)
-#define MT6360_PMU_CHG_STAT			(0x4A)
-#define MT6360_PMU_RESV1			(0x4B)
-#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEH	(0x4E)
-#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEL	(0x4F)
-#define MT6360_PMU_TYPEC_OTP_HYST_TH		(0x50)
-#define MT6360_PMU_TYPEC_OTP_CTRL		(0x51)
-#define MT6360_PMU_ADC_BAT_DATA_H		(0x52)
-#define MT6360_PMU_ADC_BAT_DATA_L		(0x53)
-#define MT6360_PMU_IMID_BACKBST_ON		(0x54)
-#define MT6360_PMU_IMID_BACKBST_OFF		(0x55)
-#define MT6360_PMU_ADC_CONFIG			(0x56)
-#define MT6360_PMU_ADC_EN2			(0x57)
-#define MT6360_PMU_ADC_IDLE_T			(0x58)
-#define MT6360_PMU_ADC_RPT_1			(0x5A)
-#define MT6360_PMU_ADC_RPT_2			(0x5B)
-#define MT6360_PMU_ADC_RPT_3			(0x5C)
-#define MT6360_PMU_ADC_RPT_ORG1			(0x5D)
-#define MT6360_PMU_ADC_RPT_ORG2			(0x5E)
-#define MT6360_PMU_BAT_OVP_TH_SEL_CODEH		(0x5F)
-#define MT6360_PMU_BAT_OVP_TH_SEL_CODEL		(0x60)
-#define MT6360_PMU_CHG_CTRL19			(0x61)
-#define MT6360_PMU_VDDASUPPLY			(0x62)
-#define MT6360_PMU_BC12_MANUAL			(0x63)
-#define MT6360_PMU_CHGDET_FUNC			(0x64)
-#define MT6360_PMU_FOD_CTRL			(0x65)
-#define MT6360_PMU_CHG_CTRL20			(0x66)
-#define MT6360_PMU_CHG_HIDDEN_CTRL26		(0x67)
-#define MT6360_PMU_CHG_HIDDEN_CTRL27		(0x68)
-#define MT6360_PMU_RESV2			(0x69)
-#define MT6360_PMU_USBID_CTRL1			(0x6D)
-#define MT6360_PMU_USBID_CTRL2			(0x6E)
-#define MT6360_PMU_USBID_CTRL3			(0x6F)
-#define MT6360_PMU_FLED_CFG			(0x70)
-#define MT6360_PMU_RESV3			(0x71)
-#define MT6360_PMU_FLED1_CTRL			(0x72)
-#define MT6360_PMU_FLED_STRB_CTRL		(0x73)
-#define MT6360_PMU_FLED1_STRB_CTRL2		(0x74)
-#define MT6360_PMU_FLED1_TOR_CTRL		(0x75)
-#define MT6360_PMU_FLED2_CTRL			(0x76)
-#define MT6360_PMU_RESV4			(0x77)
-#define MT6360_PMU_FLED2_STRB_CTRL2		(0x78)
-#define MT6360_PMU_FLED2_TOR_CTRL		(0x79)
-#define MT6360_PMU_FLED_VMIDTRK_CTRL1		(0x7A)
-#define MT6360_PMU_FLED_VMID_RTM		(0x7B)
-#define MT6360_PMU_FLED_VMIDTRK_CTRL2		(0x7C)
-#define MT6360_PMU_FLED_PWSEL			(0x7D)
-#define MT6360_PMU_FLED_EN			(0x7E)
-#define MT6360_PMU_FLED_Hidden1			(0x7F)
-#define MT6360_PMU_RGB_EN			(0x80)
-#define MT6360_PMU_RGB1_ISNK			(0x81)
-#define MT6360_PMU_RGB2_ISNK			(0x82)
-#define MT6360_PMU_RGB3_ISNK			(0x83)
-#define MT6360_PMU_RGB_ML_ISNK			(0x84)
-#define MT6360_PMU_RGB1_DIM			(0x85)
-#define MT6360_PMU_RGB2_DIM			(0x86)
-#define MT6360_PMU_RGB3_DIM			(0x87)
-#define MT6360_PMU_RESV5			(0x88)
-#define MT6360_PMU_RGB12_Freq			(0x89)
-#define MT6360_PMU_RGB34_Freq			(0x8A)
-#define MT6360_PMU_RGB1_Tr			(0x8B)
-#define MT6360_PMU_RGB1_Tf			(0x8C)
-#define MT6360_PMU_RGB1_TON_TOFF		(0x8D)
-#define MT6360_PMU_RGB2_Tr			(0x8E)
-#define MT6360_PMU_RGB2_Tf			(0x8F)
-#define MT6360_PMU_RGB2_TON_TOFF		(0x90)
-#define MT6360_PMU_RGB3_Tr			(0x91)
-#define MT6360_PMU_RGB3_Tf			(0x92)
-#define MT6360_PMU_RGB3_TON_TOFF		(0x93)
-#define MT6360_PMU_RGB_Hidden_CTRL1		(0x94)
-#define MT6360_PMU_RGB_Hidden_CTRL2		(0x95)
-#define MT6360_PMU_RESV6			(0x97)
-#define MT6360_PMU_SPARE1			(0x9A)
-#define MT6360_PMU_SPARE2			(0xA0)
-#define MT6360_PMU_SPARE3			(0xB0)
-#define MT6360_PMU_SPARE4			(0xC0)
-#define MT6360_PMU_CHG_IRQ1			(0xD0)
-#define MT6360_PMU_CHG_IRQ2			(0xD1)
-#define MT6360_PMU_CHG_IRQ3			(0xD2)
-#define MT6360_PMU_CHG_IRQ4			(0xD3)
-#define MT6360_PMU_CHG_IRQ5			(0xD4)
-#define MT6360_PMU_CHG_IRQ6			(0xD5)
-#define MT6360_PMU_QC_IRQ			(0xD6)
-#define MT6360_PMU_FOD_IRQ			(0xD7)
-#define MT6360_PMU_BASE_IRQ			(0xD8)
-#define MT6360_PMU_FLED_IRQ1			(0xD9)
-#define MT6360_PMU_FLED_IRQ2			(0xDA)
-#define MT6360_PMU_RGB_IRQ			(0xDB)
-#define MT6360_PMU_BUCK1_IRQ			(0xDC)
-#define MT6360_PMU_BUCK2_IRQ			(0xDD)
-#define MT6360_PMU_LDO_IRQ1			(0xDE)
-#define MT6360_PMU_LDO_IRQ2			(0xDF)
-#define MT6360_PMU_CHG_STAT1			(0xE0)
-#define MT6360_PMU_CHG_STAT2			(0xE1)
-#define MT6360_PMU_CHG_STAT3			(0xE2)
-#define MT6360_PMU_CHG_STAT4			(0xE3)
-#define MT6360_PMU_CHG_STAT5			(0xE4)
-#define MT6360_PMU_CHG_STAT6			(0xE5)
-#define MT6360_PMU_QC_STAT			(0xE6)
-#define MT6360_PMU_FOD_STAT			(0xE7)
-#define MT6360_PMU_BASE_STAT			(0xE8)
-#define MT6360_PMU_FLED_STAT1			(0xE9)
-#define MT6360_PMU_FLED_STAT2			(0xEA)
-#define MT6360_PMU_RGB_STAT			(0xEB)
-#define MT6360_PMU_BUCK1_STAT			(0xEC)
-#define MT6360_PMU_BUCK2_STAT			(0xED)
-#define MT6360_PMU_LDO_STAT1			(0xEE)
-#define MT6360_PMU_LDO_STAT2			(0xEF)
-#define MT6360_PMU_CHG_MASK1			(0xF0)
-#define MT6360_PMU_CHG_MASK2			(0xF1)
-#define MT6360_PMU_CHG_MASK3			(0xF2)
-#define MT6360_PMU_CHG_MASK4			(0xF3)
-#define MT6360_PMU_CHG_MASK5			(0xF4)
-#define MT6360_PMU_CHG_MASK6			(0xF5)
-#define MT6360_PMU_QC_MASK			(0xF6)
-#define MT6360_PMU_FOD_MASK			(0xF7)
-#define MT6360_PMU_BASE_MASK			(0xF8)
-#define MT6360_PMU_FLED_MASK1			(0xF9)
-#define MT6360_PMU_FLED_MASK2			(0xFA)
-#define MT6360_PMU_FAULTB_MASK			(0xFB)
-#define MT6360_PMU_BUCK1_MASK			(0xFC)
-#define MT6360_PMU_BUCK2_MASK			(0xFD)
-#define MT6360_PMU_LDO_MASK1			(0xFE)
-#define MT6360_PMU_LDO_MASK2			(0xFF)
-#define MT6360_PMU_MAXREG			(MT6360_PMU_LDO_MASK2)
+#define MT6360_PMU_DEV_INFO			0x00
+#define MT6360_PMU_CORE_CTRL1			0x01
+#define MT6360_PMU_RST1				0x02
+#define MT6360_PMU_CRCEN			0x03
+#define MT6360_PMU_RST_PAS_CODE1		0x04
+#define MT6360_PMU_RST_PAS_CODE2		0x05
+#define MT6360_PMU_CORE_CTRL2			0x06
+#define MT6360_PMU_TM_PAS_CODE1			0x07
+#define MT6360_PMU_TM_PAS_CODE2			0x08
+#define MT6360_PMU_TM_PAS_CODE3			0x09
+#define MT6360_PMU_TM_PAS_CODE4			0x0A
+#define MT6360_PMU_IRQ_IND			0x0B
+#define MT6360_PMU_IRQ_MASK			0x0C
+#define MT6360_PMU_IRQ_SET			0x0D
+#define MT6360_PMU_SHDN_CTRL			0x0E
+#define MT6360_PMU_TM_INF			0x0F
+#define MT6360_PMU_I2C_CTRL			0x10
+#define MT6360_PMU_CHG_CTRL1			0x11
+#define MT6360_PMU_CHG_CTRL2			0x12
+#define MT6360_PMU_CHG_CTRL3			0x13
+#define MT6360_PMU_CHG_CTRL4			0x14
+#define MT6360_PMU_CHG_CTRL5			0x15
+#define MT6360_PMU_CHG_CTRL6			0x16
+#define MT6360_PMU_CHG_CTRL7			0x17
+#define MT6360_PMU_CHG_CTRL8			0x18
+#define MT6360_PMU_CHG_CTRL9			0x19
+#define MT6360_PMU_CHG_CTRL10			0x1A
+#define MT6360_PMU_CHG_CTRL11			0x1B
+#define MT6360_PMU_CHG_CTRL12			0x1C
+#define MT6360_PMU_CHG_CTRL13			0x1D
+#define MT6360_PMU_CHG_CTRL14			0x1E
+#define MT6360_PMU_CHG_CTRL15			0x1F
+#define MT6360_PMU_CHG_CTRL16			0x20
+#define MT6360_PMU_CHG_AICC_RESULT		0x21
+#define MT6360_PMU_DEVICE_TYPE			0x22
+#define MT6360_PMU_QC_CONTROL1			0x23
+#define MT6360_PMU_QC_CONTROL2			0x24
+#define MT6360_PMU_QC30_CONTROL1		0x25
+#define MT6360_PMU_QC30_CONTROL2		0x26
+#define MT6360_PMU_USB_STATUS1			0x27
+#define MT6360_PMU_QC_STATUS1			0x28
+#define MT6360_PMU_QC_STATUS2			0x29
+#define MT6360_PMU_CHG_PUMP			0x2A
+#define MT6360_PMU_CHG_CTRL17			0x2B
+#define MT6360_PMU_CHG_CTRL18			0x2C
+#define MT6360_PMU_CHRDET_CTRL1			0x2D
+#define MT6360_PMU_CHRDET_CTRL2			0x2E
+#define MT6360_PMU_DPDN_CTRL			0x2F
+#define MT6360_PMU_CHG_HIDDEN_CTRL1		0x30
+#define MT6360_PMU_CHG_HIDDEN_CTRL2		0x31
+#define MT6360_PMU_CHG_HIDDEN_CTRL3		0x32
+#define MT6360_PMU_CHG_HIDDEN_CTRL4		0x33
+#define MT6360_PMU_CHG_HIDDEN_CTRL5		0x34
+#define MT6360_PMU_CHG_HIDDEN_CTRL6		0x35
+#define MT6360_PMU_CHG_HIDDEN_CTRL7		0x36
+#define MT6360_PMU_CHG_HIDDEN_CTRL8		0x37
+#define MT6360_PMU_CHG_HIDDEN_CTRL9		0x38
+#define MT6360_PMU_CHG_HIDDEN_CTRL10		0x39
+#define MT6360_PMU_CHG_HIDDEN_CTRL11		0x3A
+#define MT6360_PMU_CHG_HIDDEN_CTRL12		0x3B
+#define MT6360_PMU_CHG_HIDDEN_CTRL13		0x3C
+#define MT6360_PMU_CHG_HIDDEN_CTRL14		0x3D
+#define MT6360_PMU_CHG_HIDDEN_CTRL15		0x3E
+#define MT6360_PMU_CHG_HIDDEN_CTRL16		0x3F
+#define MT6360_PMU_CHG_HIDDEN_CTRL17		0x40
+#define MT6360_PMU_CHG_HIDDEN_CTRL18		0x41
+#define MT6360_PMU_CHG_HIDDEN_CTRL19		0x42
+#define MT6360_PMU_CHG_HIDDEN_CTRL20		0x43
+#define MT6360_PMU_CHG_HIDDEN_CTRL21		0x44
+#define MT6360_PMU_CHG_HIDDEN_CTRL22		0x45
+#define MT6360_PMU_CHG_HIDDEN_CTRL23		0x46
+#define MT6360_PMU_CHG_HIDDEN_CTRL24		0x47
+#define MT6360_PMU_CHG_HIDDEN_CTRL25		0x48
+#define MT6360_PMU_BC12_CTRL			0x49
+#define MT6360_PMU_CHG_STAT			0x4A
+#define MT6360_PMU_RESV1			0x4B
+#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEH	0x4E
+#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEL	0x4F
+#define MT6360_PMU_TYPEC_OTP_HYST_TH		0x50
+#define MT6360_PMU_TYPEC_OTP_CTRL		0x51
+#define MT6360_PMU_ADC_BAT_DATA_H		0x52
+#define MT6360_PMU_ADC_BAT_DATA_L		0x53
+#define MT6360_PMU_IMID_BACKBST_ON		0x54
+#define MT6360_PMU_IMID_BACKBST_OFF		0x55
+#define MT6360_PMU_ADC_CONFIG			0x56
+#define MT6360_PMU_ADC_EN2			0x57
+#define MT6360_PMU_ADC_IDLE_T			0x58
+#define MT6360_PMU_ADC_RPT_1			0x5A
+#define MT6360_PMU_ADC_RPT_2			0x5B
+#define MT6360_PMU_ADC_RPT_3			0x5C
+#define MT6360_PMU_ADC_RPT_ORG1			0x5D
+#define MT6360_PMU_ADC_RPT_ORG2			0x5E
+#define MT6360_PMU_BAT_OVP_TH_SEL_CODEH		0x5F
+#define MT6360_PMU_BAT_OVP_TH_SEL_CODEL		0x60
+#define MT6360_PMU_CHG_CTRL19			0x61
+#define MT6360_PMU_VDDASUPPLY			0x62
+#define MT6360_PMU_BC12_MANUAL			0x63
+#define MT6360_PMU_CHGDET_FUNC			0x64
+#define MT6360_PMU_FOD_CTRL			0x65
+#define MT6360_PMU_CHG_CTRL20			0x66
+#define MT6360_PMU_CHG_HIDDEN_CTRL26		0x67
+#define MT6360_PMU_CHG_HIDDEN_CTRL27		0x68
+#define MT6360_PMU_RESV2			0x69
+#define MT6360_PMU_USBID_CTRL1			0x6D
+#define MT6360_PMU_USBID_CTRL2			0x6E
+#define MT6360_PMU_USBID_CTRL3			0x6F
+#define MT6360_PMU_FLED_CFG			0x70
+#define MT6360_PMU_RESV3			0x71
+#define MT6360_PMU_FLED1_CTRL			0x72
+#define MT6360_PMU_FLED_STRB_CTRL		0x73
+#define MT6360_PMU_FLED1_STRB_CTRL2		0x74
+#define MT6360_PMU_FLED1_TOR_CTRL		0x75
+#define MT6360_PMU_FLED2_CTRL			0x76
+#define MT6360_PMU_RESV4			0x77
+#define MT6360_PMU_FLED2_STRB_CTRL2		0x78
+#define MT6360_PMU_FLED2_TOR_CTRL		0x79
+#define MT6360_PMU_FLED_VMIDTRK_CTRL1		0x7A
+#define MT6360_PMU_FLED_VMID_RTM		0x7B
+#define MT6360_PMU_FLED_VMIDTRK_CTRL2		0x7C
+#define MT6360_PMU_FLED_PWSEL			0x7D
+#define MT6360_PMU_FLED_EN			0x7E
+#define MT6360_PMU_FLED_Hidden1			0x7F
+#define MT6360_PMU_RGB_EN			0x80
+#define MT6360_PMU_RGB1_ISNK			0x81
+#define MT6360_PMU_RGB2_ISNK			0x82
+#define MT6360_PMU_RGB3_ISNK			0x83
+#define MT6360_PMU_RGB_ML_ISNK			0x84
+#define MT6360_PMU_RGB1_DIM			0x85
+#define MT6360_PMU_RGB2_DIM			0x86
+#define MT6360_PMU_RGB3_DIM			0x87
+#define MT6360_PMU_RESV5			0x88
+#define MT6360_PMU_RGB12_Freq			0x89
+#define MT6360_PMU_RGB34_Freq			0x8A
+#define MT6360_PMU_RGB1_Tr			0x8B
+#define MT6360_PMU_RGB1_Tf			0x8C
+#define MT6360_PMU_RGB1_TON_TOFF		0x8D
+#define MT6360_PMU_RGB2_Tr			0x8E
+#define MT6360_PMU_RGB2_Tf			0x8F
+#define MT6360_PMU_RGB2_TON_TOFF		0x90
+#define MT6360_PMU_RGB3_Tr			0x91
+#define MT6360_PMU_RGB3_Tf			0x92
+#define MT6360_PMU_RGB3_TON_TOFF		0x93
+#define MT6360_PMU_RGB_Hidden_CTRL1		0x94
+#define MT6360_PMU_RGB_Hidden_CTRL2		0x95
+#define MT6360_PMU_RESV6			0x97
+#define MT6360_PMU_SPARE1			0x9A
+#define MT6360_PMU_SPARE2			0xA0
+#define MT6360_PMU_SPARE3			0xB0
+#define MT6360_PMU_SPARE4			0xC0
+#define MT6360_PMU_CHG_IRQ1			0xD0
+#define MT6360_PMU_CHG_IRQ2			0xD1
+#define MT6360_PMU_CHG_IRQ3			0xD2
+#define MT6360_PMU_CHG_IRQ4			0xD3
+#define MT6360_PMU_CHG_IRQ5			0xD4
+#define MT6360_PMU_CHG_IRQ6			0xD5
+#define MT6360_PMU_QC_IRQ			0xD6
+#define MT6360_PMU_FOD_IRQ			0xD7
+#define MT6360_PMU_BASE_IRQ			0xD8
+#define MT6360_PMU_FLED_IRQ1			0xD9
+#define MT6360_PMU_FLED_IRQ2			0xDA
+#define MT6360_PMU_RGB_IRQ			0xDB
+#define MT6360_PMU_BUCK1_IRQ			0xDC
+#define MT6360_PMU_BUCK2_IRQ			0xDD
+#define MT6360_PMU_LDO_IRQ1			0xDE
+#define MT6360_PMU_LDO_IRQ2			0xDF
+#define MT6360_PMU_CHG_STAT1			0xE0
+#define MT6360_PMU_CHG_STAT2			0xE1
+#define MT6360_PMU_CHG_STAT3			0xE2
+#define MT6360_PMU_CHG_STAT4			0xE3
+#define MT6360_PMU_CHG_STAT5			0xE4
+#define MT6360_PMU_CHG_STAT6			0xE5
+#define MT6360_PMU_QC_STAT			0xE6
+#define MT6360_PMU_FOD_STAT			0xE7
+#define MT6360_PMU_BASE_STAT			0xE8
+#define MT6360_PMU_FLED_STAT1			0xE9
+#define MT6360_PMU_FLED_STAT2			0xEA
+#define MT6360_PMU_RGB_STAT			0xEB
+#define MT6360_PMU_BUCK1_STAT			0xEC
+#define MT6360_PMU_BUCK2_STAT			0xED
+#define MT6360_PMU_LDO_STAT1			0xEE
+#define MT6360_PMU_LDO_STAT2			0xEF
+#define MT6360_PMU_CHG_MASK1			0xF0
+#define MT6360_PMU_CHG_MASK2			0xF1
+#define MT6360_PMU_CHG_MASK3			0xF2
+#define MT6360_PMU_CHG_MASK4			0xF3
+#define MT6360_PMU_CHG_MASK5			0xF4
+#define MT6360_PMU_CHG_MASK6			0xF5
+#define MT6360_PMU_QC_MASK			0xF6
+#define MT6360_PMU_FOD_MASK			0xF7
+#define MT6360_PMU_BASE_MASK			0xF8
+#define MT6360_PMU_FLED_MASK1			0xF9
+#define MT6360_PMU_FLED_MASK2			0xFA
+#define MT6360_PMU_FAULTB_MASK			0xFB
+#define MT6360_PMU_BUCK1_MASK			0xFC
+#define MT6360_PMU_BUCK2_MASK			0xFD
+#define MT6360_PMU_LDO_MASK1			0xFE
+#define MT6360_PMU_LDO_MASK2			0xFF
+#define MT6360_PMU_MAXREG			MT6360_PMU_LDO_MASK2
 
 /* MT6360_PMU_IRQ_SET */
 #define MT6360_PMU_IRQ_REGNUM	(MT6360_PMU_LDO_IRQ2 - MT6360_PMU_CHG_IRQ1 + 1)
 #define MT6360_IRQ_RETRIG	BIT(2)
 
-#define CHIP_VEN_MASK				(0xF0)
-#define CHIP_VEN_MT6360				(0x50)
-#define CHIP_REV_MASK				(0x0F)
+#define CHIP_VEN_MASK				0xF0
+#define CHIP_VEN_MT6360				0x50
+#define CHIP_REV_MASK				0x0F
 
 #endif /* __MT6360_H__ */
-- 
cgit v1.2.3


From e63ce9a5b3edad84e5f1b3ffb081da7c9847c641 Mon Sep 17 00:00:00 2001
From: Gene Chen <gene_chen@richtek.com>
Date: Tue, 18 May 2021 01:33:09 +0800
Subject: mfd: mt6360: Rename mt6360_pmu_data by mt6360_ddata

Rename mt6360_pmu_data by mt6360_ddata because of including
not only PMU part, but also entire MT6360 IC.

Signed-off-by: Gene Chen <gene_chen@richtek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mt6360-core.c  | 44 ++++++++++++++++++++++----------------------
 include/linux/mfd/mt6360.h |  2 +-
 2 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c
index 3553c7045829..6428160ef19d 100644
--- a/drivers/mfd/mt6360-core.c
+++ b/drivers/mfd/mt6360-core.c
@@ -211,9 +211,9 @@ static const struct regmap_irq mt6360_pmu_irqs[] =  {
 
 static int mt6360_pmu_handle_post_irq(void *irq_drv_data)
 {
-	struct mt6360_pmu_data *mpd = irq_drv_data;
+	struct mt6360_ddata *ddata = irq_drv_data;
 
-	return regmap_update_bits(mpd->regmap,
+	return regmap_update_bits(ddata->regmap,
 		MT6360_PMU_IRQ_SET, MT6360_IRQ_RETRIG, MT6360_IRQ_RETRIG);
 }
 
@@ -310,61 +310,61 @@ static const unsigned short mt6360_slave_addr[MT6360_SLAVE_MAX] = {
 
 static int mt6360_pmu_probe(struct i2c_client *client)
 {
-	struct mt6360_pmu_data *mpd;
+	struct mt6360_ddata *ddata;
 	unsigned int reg_data;
 	int i, ret;
 
-	mpd = devm_kzalloc(&client->dev, sizeof(*mpd), GFP_KERNEL);
-	if (!mpd)
+	ddata = devm_kzalloc(&client->dev, sizeof(*ddata), GFP_KERNEL);
+	if (!ddata)
 		return -ENOMEM;
 
-	mpd->dev = &client->dev;
-	i2c_set_clientdata(client, mpd);
+	ddata->dev = &client->dev;
+	i2c_set_clientdata(client, ddata);
 
-	mpd->regmap = devm_regmap_init_i2c(client, &mt6360_pmu_regmap_config);
-	if (IS_ERR(mpd->regmap)) {
+	ddata->regmap = devm_regmap_init_i2c(client, &mt6360_pmu_regmap_config);
+	if (IS_ERR(ddata->regmap)) {
 		dev_err(&client->dev, "Failed to register regmap\n");
-		return PTR_ERR(mpd->regmap);
+		return PTR_ERR(ddata->regmap);
 	}
 
-	ret = regmap_read(mpd->regmap, MT6360_PMU_DEV_INFO, &reg_data);
+	ret = regmap_read(ddata->regmap, MT6360_PMU_DEV_INFO, &reg_data);
 	if (ret) {
 		dev_err(&client->dev, "Device not found\n");
 		return ret;
 	}
 
-	mpd->chip_rev = reg_data & CHIP_REV_MASK;
-	if (mpd->chip_rev != CHIP_VEN_MT6360) {
+	ddata->chip_rev = reg_data & CHIP_REV_MASK;
+	if (ddata->chip_rev != CHIP_VEN_MT6360) {
 		dev_err(&client->dev, "Device not supported\n");
 		return -ENODEV;
 	}
 
-	mt6360_pmu_irq_chip.irq_drv_data = mpd;
-	ret = devm_regmap_add_irq_chip(&client->dev, mpd->regmap, client->irq,
+	mt6360_pmu_irq_chip.irq_drv_data = ddata;
+	ret = devm_regmap_add_irq_chip(&client->dev, ddata->regmap, client->irq,
 				       IRQF_TRIGGER_FALLING, 0,
-				       &mt6360_pmu_irq_chip, &mpd->irq_data);
+				       &mt6360_pmu_irq_chip, &ddata->irq_data);
 	if (ret) {
 		dev_err(&client->dev, "Failed to add Regmap IRQ Chip\n");
 		return ret;
 	}
 
-	mpd->i2c[0] = client;
+	ddata->i2c[0] = client;
 	for (i = 1; i < MT6360_SLAVE_MAX; i++) {
-		mpd->i2c[i] = devm_i2c_new_dummy_device(&client->dev,
+		ddata->i2c[i] = devm_i2c_new_dummy_device(&client->dev,
 							client->adapter,
 							mt6360_slave_addr[i]);
-		if (IS_ERR(mpd->i2c[i])) {
+		if (IS_ERR(ddata->i2c[i])) {
 			dev_err(&client->dev,
 				"Failed to get new dummy I2C device for address 0x%x",
 				mt6360_slave_addr[i]);
-			return PTR_ERR(mpd->i2c[i]);
+			return PTR_ERR(ddata->i2c[i]);
 		}
-		i2c_set_clientdata(mpd->i2c[i], mpd);
+		i2c_set_clientdata(ddata->i2c[i], ddata);
 	}
 
 	ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_AUTO,
 				   mt6360_devs, ARRAY_SIZE(mt6360_devs), NULL,
-				   0, regmap_irq_get_domain(mpd->irq_data));
+				   0, regmap_irq_get_domain(ddata->irq_data));
 	if (ret) {
 		dev_err(&client->dev,
 			"Failed to register subordinate devices\n");
diff --git a/include/linux/mfd/mt6360.h b/include/linux/mfd/mt6360.h
index 72edf1352229..81bca7c2ad4d 100644
--- a/include/linux/mfd/mt6360.h
+++ b/include/linux/mfd/mt6360.h
@@ -21,7 +21,7 @@ enum {
 #define MT6360_LDO_SLAVEID	0x64
 #define MT6360_TCPC_SLAVEID	0x4E
 
-struct mt6360_pmu_data {
+struct mt6360_ddata {
 	struct i2c_client *i2c[MT6360_SLAVE_MAX];
 	struct device *dev;
 	struct regmap *regmap;
-- 
cgit v1.2.3


From a75a2d56dc2f1a95a0b481eec74f60ff81a1b291 Mon Sep 17 00:00:00 2001
From: Gene Chen <gene_chen@richtek.com>
Date: Tue, 18 May 2021 01:33:11 +0800
Subject: mfd: mt6360: Remove handle_post_irq callback function

Remove handle_post_irq which is used to retrigger IRQ.
Set IRQ level low trigger in dtsi to keep IRQ always be handled.

Signed-off-by: Gene Chen <gene_chen@richtek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mt6360-core.c  | 17 ++++-------------
 include/linux/mfd/mt6360.h |  2 +-
 2 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c
index a1d74bb302cd..50a170065a98 100644
--- a/drivers/mfd/mt6360-core.c
+++ b/drivers/mfd/mt6360-core.c
@@ -209,15 +209,8 @@ static const struct regmap_irq mt6360_irqs[] =  {
 	REGMAP_IRQ_REG_LINE(MT6360_LDO7_PGB_EVT, 8),
 };
 
-static int mt6360_pmu_handle_post_irq(void *irq_drv_data)
-{
-	struct mt6360_ddata *ddata = irq_drv_data;
-
-	return regmap_update_bits(ddata->regmap,
-		MT6360_PMU_IRQ_SET, MT6360_IRQ_RETRIG, MT6360_IRQ_RETRIG);
-}
-
-static struct regmap_irq_chip mt6360_irq_chip = {
+static const struct regmap_irq_chip mt6360_irq_chip = {
+	.name = "mt6360_irqs",
 	.irqs = mt6360_irqs,
 	.num_irqs = ARRAY_SIZE(mt6360_irqs),
 	.num_regs = MT6360_PMU_IRQ_REGNUM,
@@ -226,7 +219,6 @@ static struct regmap_irq_chip mt6360_irq_chip = {
 	.ack_base = MT6360_PMU_CHG_IRQ1,
 	.init_ack_masked = true,
 	.use_ack = true,
-	.handle_post_irq = mt6360_pmu_handle_post_irq,
 };
 
 static const struct regmap_config mt6360_pmu_regmap_config = {
@@ -339,10 +331,9 @@ static int mt6360_probe(struct i2c_client *client)
 		return -ENODEV;
 	}
 
-	mt6360_irq_chip.irq_drv_data = ddata;
 	ret = devm_regmap_add_irq_chip(&client->dev, ddata->regmap, client->irq,
-				       IRQF_TRIGGER_FALLING, 0,
-				       &mt6360_irq_chip, &ddata->irq_data);
+				       0, 0, &mt6360_irq_chip,
+				       &ddata->irq_data);
 	if (ret) {
 		dev_err(&client->dev, "Failed to add Regmap IRQ Chip\n");
 		return ret;
diff --git a/include/linux/mfd/mt6360.h b/include/linux/mfd/mt6360.h
index 81bca7c2ad4d..ef8257dffe3f 100644
--- a/include/linux/mfd/mt6360.h
+++ b/include/linux/mfd/mt6360.h
@@ -230,7 +230,7 @@ struct mt6360_ddata {
 #define MT6360_PMU_MAXREG			MT6360_PMU_LDO_MASK2
 
 /* MT6360_PMU_IRQ_SET */
-#define MT6360_PMU_IRQ_REGNUM	(MT6360_PMU_LDO_IRQ2 - MT6360_PMU_CHG_IRQ1 + 1)
+#define MT6360_PMU_IRQ_REGNUM	16
 #define MT6360_IRQ_RETRIG	BIT(2)
 
 #define CHIP_VEN_MASK				0xF0
-- 
cgit v1.2.3


From b042c085de7aa89eedfe8df8388b19a0e6679a39 Mon Sep 17 00:00:00 2001
From: Gene Chen <gene_chen@richtek.com>
Date: Tue, 18 May 2021 01:33:13 +0800
Subject: mfd: mt6360: Merge header file into driver and remove unuse register
 define

Merge header file into driver and remove unuse register define

Signed-off-by: Gene Chen <gene_chen@richtek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mt6360-core.c  |  48 ++++++++-
 include/linux/mfd/mt6360.h | 240 ---------------------------------------------
 2 files changed, 47 insertions(+), 241 deletions(-)
 delete mode 100644 include/linux/mfd/mt6360.h

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c
index cdd99770f94a..a4c705c8dd68 100644
--- a/drivers/mfd/mt6360-core.c
+++ b/drivers/mfd/mt6360-core.c
@@ -15,7 +15,53 @@
 #include <linux/regmap.h>
 #include <linux/slab.h>
 
-#include <linux/mfd/mt6360.h>
+enum {
+	MT6360_SLAVE_TCPC = 0,
+	MT6360_SLAVE_PMIC,
+	MT6360_SLAVE_LDO,
+	MT6360_SLAVE_PMU,
+	MT6360_SLAVE_MAX,
+};
+
+struct mt6360_ddata {
+	struct i2c_client *i2c[MT6360_SLAVE_MAX];
+	struct device *dev;
+	struct regmap *regmap;
+	struct regmap_irq_chip_data *irq_data;
+	unsigned int chip_rev;
+	u8 crc8_tbl[CRC8_TABLE_SIZE];
+};
+
+#define MT6360_TCPC_SLAVEID		0x4E
+#define MT6360_PMIC_SLAVEID		0x1A
+#define MT6360_LDO_SLAVEID		0x64
+#define MT6360_PMU_SLAVEID		0x34
+
+#define MT6360_REG_TCPCSTART		0x00
+#define MT6360_REG_TCPCEND		0xFF
+#define MT6360_REG_PMICSTART		0x100
+#define MT6360_REG_PMICEND		0x13B
+#define MT6360_REG_LDOSTART		0x200
+#define MT6360_REG_LDOEND		0x21C
+#define MT6360_REG_PMUSTART		0x300
+#define MT6360_PMU_DEV_INFO		0x300
+#define MT6360_PMU_CHG_IRQ1		0x3D0
+#define MT6360_PMU_CHG_MASK1		0x3F0
+#define MT6360_REG_PMUEND		0x3FF
+
+#define MT6360_PMU_IRQ_REGNUM		16
+
+#define CHIP_VEN_MASK			0xF0
+#define CHIP_VEN_MT6360			0x50
+#define CHIP_REV_MASK			0x0F
+
+#define MT6360_ADDRESS_MASK		0x3F
+#define MT6360_DATA_SIZE_1_BYTE		0x00
+#define MT6360_DATA_SIZE_2_BYTES	0x40
+#define MT6360_DATA_SIZE_3_BYTES	0x80
+#define MT6360_DATA_SIZE_4_BYTES	0xC0
+
+#define MT6360_CRC8_POLYNOMIAL		0x7
 
 /* reg 0 -> 0 ~ 7 */
 #define MT6360_CHG_TREG_EVT		4
diff --git a/include/linux/mfd/mt6360.h b/include/linux/mfd/mt6360.h
deleted file mode 100644
index ef8257dffe3f..000000000000
--- a/include/linux/mfd/mt6360.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2020 MediaTek Inc.
- */
-
-#ifndef __MT6360_H__
-#define __MT6360_H__
-
-#include <linux/regmap.h>
-
-enum {
-	MT6360_SLAVE_PMU = 0,
-	MT6360_SLAVE_PMIC,
-	MT6360_SLAVE_LDO,
-	MT6360_SLAVE_TCPC,
-	MT6360_SLAVE_MAX,
-};
-
-#define MT6360_PMU_SLAVEID	0x34
-#define MT6360_PMIC_SLAVEID	0x1A
-#define MT6360_LDO_SLAVEID	0x64
-#define MT6360_TCPC_SLAVEID	0x4E
-
-struct mt6360_ddata {
-	struct i2c_client *i2c[MT6360_SLAVE_MAX];
-	struct device *dev;
-	struct regmap *regmap;
-	struct regmap_irq_chip_data *irq_data;
-	unsigned int chip_rev;
-};
-
-/* PMU register defininition */
-#define MT6360_PMU_DEV_INFO			0x00
-#define MT6360_PMU_CORE_CTRL1			0x01
-#define MT6360_PMU_RST1				0x02
-#define MT6360_PMU_CRCEN			0x03
-#define MT6360_PMU_RST_PAS_CODE1		0x04
-#define MT6360_PMU_RST_PAS_CODE2		0x05
-#define MT6360_PMU_CORE_CTRL2			0x06
-#define MT6360_PMU_TM_PAS_CODE1			0x07
-#define MT6360_PMU_TM_PAS_CODE2			0x08
-#define MT6360_PMU_TM_PAS_CODE3			0x09
-#define MT6360_PMU_TM_PAS_CODE4			0x0A
-#define MT6360_PMU_IRQ_IND			0x0B
-#define MT6360_PMU_IRQ_MASK			0x0C
-#define MT6360_PMU_IRQ_SET			0x0D
-#define MT6360_PMU_SHDN_CTRL			0x0E
-#define MT6360_PMU_TM_INF			0x0F
-#define MT6360_PMU_I2C_CTRL			0x10
-#define MT6360_PMU_CHG_CTRL1			0x11
-#define MT6360_PMU_CHG_CTRL2			0x12
-#define MT6360_PMU_CHG_CTRL3			0x13
-#define MT6360_PMU_CHG_CTRL4			0x14
-#define MT6360_PMU_CHG_CTRL5			0x15
-#define MT6360_PMU_CHG_CTRL6			0x16
-#define MT6360_PMU_CHG_CTRL7			0x17
-#define MT6360_PMU_CHG_CTRL8			0x18
-#define MT6360_PMU_CHG_CTRL9			0x19
-#define MT6360_PMU_CHG_CTRL10			0x1A
-#define MT6360_PMU_CHG_CTRL11			0x1B
-#define MT6360_PMU_CHG_CTRL12			0x1C
-#define MT6360_PMU_CHG_CTRL13			0x1D
-#define MT6360_PMU_CHG_CTRL14			0x1E
-#define MT6360_PMU_CHG_CTRL15			0x1F
-#define MT6360_PMU_CHG_CTRL16			0x20
-#define MT6360_PMU_CHG_AICC_RESULT		0x21
-#define MT6360_PMU_DEVICE_TYPE			0x22
-#define MT6360_PMU_QC_CONTROL1			0x23
-#define MT6360_PMU_QC_CONTROL2			0x24
-#define MT6360_PMU_QC30_CONTROL1		0x25
-#define MT6360_PMU_QC30_CONTROL2		0x26
-#define MT6360_PMU_USB_STATUS1			0x27
-#define MT6360_PMU_QC_STATUS1			0x28
-#define MT6360_PMU_QC_STATUS2			0x29
-#define MT6360_PMU_CHG_PUMP			0x2A
-#define MT6360_PMU_CHG_CTRL17			0x2B
-#define MT6360_PMU_CHG_CTRL18			0x2C
-#define MT6360_PMU_CHRDET_CTRL1			0x2D
-#define MT6360_PMU_CHRDET_CTRL2			0x2E
-#define MT6360_PMU_DPDN_CTRL			0x2F
-#define MT6360_PMU_CHG_HIDDEN_CTRL1		0x30
-#define MT6360_PMU_CHG_HIDDEN_CTRL2		0x31
-#define MT6360_PMU_CHG_HIDDEN_CTRL3		0x32
-#define MT6360_PMU_CHG_HIDDEN_CTRL4		0x33
-#define MT6360_PMU_CHG_HIDDEN_CTRL5		0x34
-#define MT6360_PMU_CHG_HIDDEN_CTRL6		0x35
-#define MT6360_PMU_CHG_HIDDEN_CTRL7		0x36
-#define MT6360_PMU_CHG_HIDDEN_CTRL8		0x37
-#define MT6360_PMU_CHG_HIDDEN_CTRL9		0x38
-#define MT6360_PMU_CHG_HIDDEN_CTRL10		0x39
-#define MT6360_PMU_CHG_HIDDEN_CTRL11		0x3A
-#define MT6360_PMU_CHG_HIDDEN_CTRL12		0x3B
-#define MT6360_PMU_CHG_HIDDEN_CTRL13		0x3C
-#define MT6360_PMU_CHG_HIDDEN_CTRL14		0x3D
-#define MT6360_PMU_CHG_HIDDEN_CTRL15		0x3E
-#define MT6360_PMU_CHG_HIDDEN_CTRL16		0x3F
-#define MT6360_PMU_CHG_HIDDEN_CTRL17		0x40
-#define MT6360_PMU_CHG_HIDDEN_CTRL18		0x41
-#define MT6360_PMU_CHG_HIDDEN_CTRL19		0x42
-#define MT6360_PMU_CHG_HIDDEN_CTRL20		0x43
-#define MT6360_PMU_CHG_HIDDEN_CTRL21		0x44
-#define MT6360_PMU_CHG_HIDDEN_CTRL22		0x45
-#define MT6360_PMU_CHG_HIDDEN_CTRL23		0x46
-#define MT6360_PMU_CHG_HIDDEN_CTRL24		0x47
-#define MT6360_PMU_CHG_HIDDEN_CTRL25		0x48
-#define MT6360_PMU_BC12_CTRL			0x49
-#define MT6360_PMU_CHG_STAT			0x4A
-#define MT6360_PMU_RESV1			0x4B
-#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEH	0x4E
-#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEL	0x4F
-#define MT6360_PMU_TYPEC_OTP_HYST_TH		0x50
-#define MT6360_PMU_TYPEC_OTP_CTRL		0x51
-#define MT6360_PMU_ADC_BAT_DATA_H		0x52
-#define MT6360_PMU_ADC_BAT_DATA_L		0x53
-#define MT6360_PMU_IMID_BACKBST_ON		0x54
-#define MT6360_PMU_IMID_BACKBST_OFF		0x55
-#define MT6360_PMU_ADC_CONFIG			0x56
-#define MT6360_PMU_ADC_EN2			0x57
-#define MT6360_PMU_ADC_IDLE_T			0x58
-#define MT6360_PMU_ADC_RPT_1			0x5A
-#define MT6360_PMU_ADC_RPT_2			0x5B
-#define MT6360_PMU_ADC_RPT_3			0x5C
-#define MT6360_PMU_ADC_RPT_ORG1			0x5D
-#define MT6360_PMU_ADC_RPT_ORG2			0x5E
-#define MT6360_PMU_BAT_OVP_TH_SEL_CODEH		0x5F
-#define MT6360_PMU_BAT_OVP_TH_SEL_CODEL		0x60
-#define MT6360_PMU_CHG_CTRL19			0x61
-#define MT6360_PMU_VDDASUPPLY			0x62
-#define MT6360_PMU_BC12_MANUAL			0x63
-#define MT6360_PMU_CHGDET_FUNC			0x64
-#define MT6360_PMU_FOD_CTRL			0x65
-#define MT6360_PMU_CHG_CTRL20			0x66
-#define MT6360_PMU_CHG_HIDDEN_CTRL26		0x67
-#define MT6360_PMU_CHG_HIDDEN_CTRL27		0x68
-#define MT6360_PMU_RESV2			0x69
-#define MT6360_PMU_USBID_CTRL1			0x6D
-#define MT6360_PMU_USBID_CTRL2			0x6E
-#define MT6360_PMU_USBID_CTRL3			0x6F
-#define MT6360_PMU_FLED_CFG			0x70
-#define MT6360_PMU_RESV3			0x71
-#define MT6360_PMU_FLED1_CTRL			0x72
-#define MT6360_PMU_FLED_STRB_CTRL		0x73
-#define MT6360_PMU_FLED1_STRB_CTRL2		0x74
-#define MT6360_PMU_FLED1_TOR_CTRL		0x75
-#define MT6360_PMU_FLED2_CTRL			0x76
-#define MT6360_PMU_RESV4			0x77
-#define MT6360_PMU_FLED2_STRB_CTRL2		0x78
-#define MT6360_PMU_FLED2_TOR_CTRL		0x79
-#define MT6360_PMU_FLED_VMIDTRK_CTRL1		0x7A
-#define MT6360_PMU_FLED_VMID_RTM		0x7B
-#define MT6360_PMU_FLED_VMIDTRK_CTRL2		0x7C
-#define MT6360_PMU_FLED_PWSEL			0x7D
-#define MT6360_PMU_FLED_EN			0x7E
-#define MT6360_PMU_FLED_Hidden1			0x7F
-#define MT6360_PMU_RGB_EN			0x80
-#define MT6360_PMU_RGB1_ISNK			0x81
-#define MT6360_PMU_RGB2_ISNK			0x82
-#define MT6360_PMU_RGB3_ISNK			0x83
-#define MT6360_PMU_RGB_ML_ISNK			0x84
-#define MT6360_PMU_RGB1_DIM			0x85
-#define MT6360_PMU_RGB2_DIM			0x86
-#define MT6360_PMU_RGB3_DIM			0x87
-#define MT6360_PMU_RESV5			0x88
-#define MT6360_PMU_RGB12_Freq			0x89
-#define MT6360_PMU_RGB34_Freq			0x8A
-#define MT6360_PMU_RGB1_Tr			0x8B
-#define MT6360_PMU_RGB1_Tf			0x8C
-#define MT6360_PMU_RGB1_TON_TOFF		0x8D
-#define MT6360_PMU_RGB2_Tr			0x8E
-#define MT6360_PMU_RGB2_Tf			0x8F
-#define MT6360_PMU_RGB2_TON_TOFF		0x90
-#define MT6360_PMU_RGB3_Tr			0x91
-#define MT6360_PMU_RGB3_Tf			0x92
-#define MT6360_PMU_RGB3_TON_TOFF		0x93
-#define MT6360_PMU_RGB_Hidden_CTRL1		0x94
-#define MT6360_PMU_RGB_Hidden_CTRL2		0x95
-#define MT6360_PMU_RESV6			0x97
-#define MT6360_PMU_SPARE1			0x9A
-#define MT6360_PMU_SPARE2			0xA0
-#define MT6360_PMU_SPARE3			0xB0
-#define MT6360_PMU_SPARE4			0xC0
-#define MT6360_PMU_CHG_IRQ1			0xD0
-#define MT6360_PMU_CHG_IRQ2			0xD1
-#define MT6360_PMU_CHG_IRQ3			0xD2
-#define MT6360_PMU_CHG_IRQ4			0xD3
-#define MT6360_PMU_CHG_IRQ5			0xD4
-#define MT6360_PMU_CHG_IRQ6			0xD5
-#define MT6360_PMU_QC_IRQ			0xD6
-#define MT6360_PMU_FOD_IRQ			0xD7
-#define MT6360_PMU_BASE_IRQ			0xD8
-#define MT6360_PMU_FLED_IRQ1			0xD9
-#define MT6360_PMU_FLED_IRQ2			0xDA
-#define MT6360_PMU_RGB_IRQ			0xDB
-#define MT6360_PMU_BUCK1_IRQ			0xDC
-#define MT6360_PMU_BUCK2_IRQ			0xDD
-#define MT6360_PMU_LDO_IRQ1			0xDE
-#define MT6360_PMU_LDO_IRQ2			0xDF
-#define MT6360_PMU_CHG_STAT1			0xE0
-#define MT6360_PMU_CHG_STAT2			0xE1
-#define MT6360_PMU_CHG_STAT3			0xE2
-#define MT6360_PMU_CHG_STAT4			0xE3
-#define MT6360_PMU_CHG_STAT5			0xE4
-#define MT6360_PMU_CHG_STAT6			0xE5
-#define MT6360_PMU_QC_STAT			0xE6
-#define MT6360_PMU_FOD_STAT			0xE7
-#define MT6360_PMU_BASE_STAT			0xE8
-#define MT6360_PMU_FLED_STAT1			0xE9
-#define MT6360_PMU_FLED_STAT2			0xEA
-#define MT6360_PMU_RGB_STAT			0xEB
-#define MT6360_PMU_BUCK1_STAT			0xEC
-#define MT6360_PMU_BUCK2_STAT			0xED
-#define MT6360_PMU_LDO_STAT1			0xEE
-#define MT6360_PMU_LDO_STAT2			0xEF
-#define MT6360_PMU_CHG_MASK1			0xF0
-#define MT6360_PMU_CHG_MASK2			0xF1
-#define MT6360_PMU_CHG_MASK3			0xF2
-#define MT6360_PMU_CHG_MASK4			0xF3
-#define MT6360_PMU_CHG_MASK5			0xF4
-#define MT6360_PMU_CHG_MASK6			0xF5
-#define MT6360_PMU_QC_MASK			0xF6
-#define MT6360_PMU_FOD_MASK			0xF7
-#define MT6360_PMU_BASE_MASK			0xF8
-#define MT6360_PMU_FLED_MASK1			0xF9
-#define MT6360_PMU_FLED_MASK2			0xFA
-#define MT6360_PMU_FAULTB_MASK			0xFB
-#define MT6360_PMU_BUCK1_MASK			0xFC
-#define MT6360_PMU_BUCK2_MASK			0xFD
-#define MT6360_PMU_LDO_MASK1			0xFE
-#define MT6360_PMU_LDO_MASK2			0xFF
-#define MT6360_PMU_MAXREG			MT6360_PMU_LDO_MASK2
-
-/* MT6360_PMU_IRQ_SET */
-#define MT6360_PMU_IRQ_REGNUM	16
-#define MT6360_IRQ_RETRIG	BIT(2)
-
-#define CHIP_VEN_MASK				0xF0
-#define CHIP_VEN_MT6360				0x50
-#define CHIP_REV_MASK				0x0F
-
-#endif /* __MT6360_H__ */
-- 
cgit v1.2.3


From 07a0b7d6f1543b45068ba427a1f0ee5375a259c9 Mon Sep 17 00:00:00 2001
From: Hao Fang <fanghao11@huawei.com>
Date: Sat, 22 May 2021 18:25:15 +0800
Subject: mfd: hisilicon: Use the correct HiSilicon copyright

s/Hisilicon/HiSilicon/

It should use capital S, according to the official website
https://www.hisilicon.com/en.

Signed-off-by: Hao Fang <fanghao11@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/hi655x-pmic.c       | 2 +-
 include/linux/mfd/hi655x-pmic.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/hi655x-pmic.c b/drivers/mfd/hi655x-pmic.c
index d3c86a7a3805..6909d075d017 100644
--- a/drivers/mfd/hi655x-pmic.c
+++ b/drivers/mfd/hi655x-pmic.c
@@ -2,7 +2,7 @@
 /*
  * Device driver for MFD hi655x PMIC
  *
- * Copyright (c) 2016 Hisilicon.
+ * Copyright (c) 2016 HiSilicon Ltd.
  *
  * Authors:
  * Chen Feng <puck.chen@hisilicon.com>
diff --git a/include/linux/mfd/hi655x-pmic.h b/include/linux/mfd/hi655x-pmic.h
index b06171322178..af5d97239c0d 100644
--- a/include/linux/mfd/hi655x-pmic.h
+++ b/include/linux/mfd/hi655x-pmic.h
@@ -2,7 +2,7 @@
 /*
  * Device driver for regulators in hi655x IC
  *
- * Copyright (c) 2016 Hisilicon.
+ * Copyright (c) 2016 HiSilicon Ltd.
  *
  * Authors:
  * Chen Feng <puck.chen@hisilicon.com>
-- 
cgit v1.2.3


From 6f1b660731d841aa429a836c4ba04af551628050 Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Sun, 23 May 2021 15:10:45 +0800
Subject: mfd: bd71828: Fix .n_voltages settings

Current .n_voltages settings do not cover the latest 2 valid selectors,
so it fails to set voltage for the hightest voltage support.
The latest linear range has step_uV = 0, so it does not matter if we
count the .n_voltages to maximum selector + 1 or the first selector of
latest linear range + 1.
To simplify calculating the n_voltages, let's just set the
.n_voltages to maximum selector + 1.

Fixes: 522498f8cb8c ("regulator: bd71828: Basic support for ROHM bd71828 PMIC regulators")
Signed-off-by: Axel Lin <axel.lin@ingics.com>
Reviewed-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/rohm-bd71828.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h
index c7ab69c87ee8..3b5f3a7db4bd 100644
--- a/include/linux/mfd/rohm-bd71828.h
+++ b/include/linux/mfd/rohm-bd71828.h
@@ -26,11 +26,11 @@ enum {
 	BD71828_REGULATOR_AMOUNT,
 };
 
-#define BD71828_BUCK1267_VOLTS		0xEF
-#define BD71828_BUCK3_VOLTS		0x10
-#define BD71828_BUCK4_VOLTS		0x20
-#define BD71828_BUCK5_VOLTS		0x10
-#define BD71828_LDO_VOLTS		0x32
+#define BD71828_BUCK1267_VOLTS		0x100
+#define BD71828_BUCK3_VOLTS		0x20
+#define BD71828_BUCK4_VOLTS		0x40
+#define BD71828_BUCK5_VOLTS		0x20
+#define BD71828_LDO_VOLTS		0x40
 /* LDO6 is fixed 1.8V voltage */
 #define BD71828_LDO_6_VOLTAGE		1800000
 
-- 
cgit v1.2.3


From 12e1a41952c08fda89f6b14188ec6cdf31462907 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Wed, 26 May 2021 08:47:07 -0400
Subject: mfd: sec: Remove unused cfg_pmic_irq in platform data

The 'cfg_pmic_irq' field of platform data structure is not used and can
be safely dropped.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/sec-core.c           | 3 ---
 include/linux/mfd/samsung/core.h | 1 -
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/sec-core.c b/drivers/mfd/sec-core.c
index c61c1fc62165..653d02b98d53 100644
--- a/drivers/mfd/sec-core.c
+++ b/drivers/mfd/sec-core.c
@@ -382,9 +382,6 @@ static int sec_pmic_probe(struct i2c_client *i2c,
 		return ret;
 	}
 
-	if (pdata->cfg_pmic_irq)
-		pdata->cfg_pmic_irq();
-
 	sec_irq_init(sec_pmic);
 
 	pm_runtime_set_active(sec_pmic->dev);
diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index f1631a39acfc..68afc2b97a41 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -85,7 +85,6 @@ struct sec_platform_data {
 	int				num_regulators;
 
 	int				irq_base;
-	int				(*cfg_pmic_irq)(void);
 
 	bool				wakeup;
 	bool				buck_voltage_lock;
-- 
cgit v1.2.3


From 294fb2ce2de246e126f9f3a4568bfd8e568a2b5b Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Wed, 26 May 2021 08:47:08 -0400
Subject: mfd: sec: Remove unused device_type in platform data

The 'device_type' field of platform data structure is not used and can
be safely dropped.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/sec-core.c           | 5 +----
 include/linux/mfd/samsung/core.h | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/sec-core.c b/drivers/mfd/sec-core.c
index 653d02b98d53..4c4db3171ce6 100644
--- a/drivers/mfd/sec-core.c
+++ b/drivers/mfd/sec-core.c
@@ -318,7 +318,6 @@ static int sec_pmic_probe(struct i2c_client *i2c,
 	struct sec_platform_data *pdata;
 	const struct mfd_cell *sec_devs;
 	struct sec_pmic_dev *sec_pmic;
-	unsigned long device_type;
 	int ret, num_sec_devs;
 
 	sec_pmic = devm_kzalloc(&i2c->dev, sizeof(struct sec_pmic_dev),
@@ -330,16 +329,14 @@ static int sec_pmic_probe(struct i2c_client *i2c,
 	sec_pmic->dev = &i2c->dev;
 	sec_pmic->i2c = i2c;
 	sec_pmic->irq = i2c->irq;
-	device_type = (unsigned long)of_device_get_match_data(sec_pmic->dev);
 
 	pdata = sec_pmic_i2c_parse_dt_pdata(sec_pmic->dev);
 	if (IS_ERR(pdata)) {
 		ret = PTR_ERR(pdata);
 		return ret;
 	}
-	pdata->device_type = device_type;
 
-	sec_pmic->device_type = pdata->device_type;
+	sec_pmic->device_type = (unsigned long)of_device_get_match_data(sec_pmic->dev);
 	sec_pmic->irq_base = pdata->irq_base;
 	sec_pmic->wakeup = pdata->wakeup;
 	sec_pmic->pdata = pdata;
diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index 68afc2b97a41..bfde1b7c6303 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -81,7 +81,6 @@ int sec_irq_resume(struct sec_pmic_dev *sec_pmic);
 struct sec_platform_data {
 	struct sec_regulator_data	*regulators;
 	struct sec_opmode_data		*opmode;
-	int				device_type;
 	int				num_regulators;
 
 	int				irq_base;
-- 
cgit v1.2.3


From c1d3ab31e7356cb54de35991ac70176379d4caed Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Wed, 26 May 2021 08:47:09 -0400
Subject: mfd: sec: Remove unused irq_base in platform data

The 'irq_base' field of platform data structure is not assigned,
therefore its default value of 0 has no impact and can be safely
dropped.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/sec-core.c           | 1 -
 drivers/mfd/sec-irq.c            | 4 +---
 include/linux/mfd/samsung/core.h | 3 ---
 3 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/sec-core.c b/drivers/mfd/sec-core.c
index 4c4db3171ce6..a9d4fbc51997 100644
--- a/drivers/mfd/sec-core.c
+++ b/drivers/mfd/sec-core.c
@@ -337,7 +337,6 @@ static int sec_pmic_probe(struct i2c_client *i2c,
 	}
 
 	sec_pmic->device_type = (unsigned long)of_device_get_match_data(sec_pmic->dev);
-	sec_pmic->irq_base = pdata->irq_base;
 	sec_pmic->wakeup = pdata->wakeup;
 	sec_pmic->pdata = pdata;
 
diff --git a/drivers/mfd/sec-irq.c b/drivers/mfd/sec-irq.c
index a98c5d165039..e473c2fb42d5 100644
--- a/drivers/mfd/sec-irq.c
+++ b/drivers/mfd/sec-irq.c
@@ -444,7 +444,6 @@ int sec_irq_init(struct sec_pmic_dev *sec_pmic)
 	if (!sec_pmic->irq) {
 		dev_warn(sec_pmic->dev,
 			 "No interrupt specified, no interrupts\n");
-		sec_pmic->irq_base = 0;
 		return 0;
 	}
 
@@ -482,8 +481,7 @@ int sec_irq_init(struct sec_pmic_dev *sec_pmic)
 	ret = devm_regmap_add_irq_chip(sec_pmic->dev, sec_pmic->regmap_pmic,
 				       sec_pmic->irq,
 				       IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
-				       sec_pmic->irq_base, sec_irq_chip,
-				       &sec_pmic->irq_data);
+				       0, sec_irq_chip, &sec_pmic->irq_data);
 	if (ret != 0) {
 		dev_err(sec_pmic->dev, "Failed to register IRQ chip: %d\n", ret);
 		return ret;
diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index bfde1b7c6303..9864f13b7814 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -67,7 +67,6 @@ struct sec_pmic_dev {
 	struct i2c_client *i2c;
 
 	unsigned long device_type;
-	int irq_base;
 	int irq;
 	struct regmap_irq_chip_data *irq_data;
 
@@ -83,8 +82,6 @@ struct sec_platform_data {
 	struct sec_opmode_data		*opmode;
 	int				num_regulators;
 
-	int				irq_base;
-
 	bool				wakeup;
 	bool				buck_voltage_lock;
 
-- 
cgit v1.2.3


From 2056f024c89cf2c7cd5eeab47592e3c397efb468 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Wed, 26 May 2021 08:47:10 -0400
Subject: mfd: sec: Enable wakeup from suspend via devicetree property

Set device wakeup capability from devicetree property (done by drivers
core), instead of always setting it to 0 (because value in platform data
is not assigned).

This should not have visible effect on actual resuming from suspend
because the child device - S5M RTC driver - is responsible for waking
up and sets device wakeup unconditionally.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/sec-core.c           | 8 --------
 include/linux/mfd/samsung/core.h | 3 ---
 2 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/sec-core.c b/drivers/mfd/sec-core.c
index a9d4fbc51997..1fb29c45f5cf 100644
--- a/drivers/mfd/sec-core.c
+++ b/drivers/mfd/sec-core.c
@@ -298,12 +298,6 @@ sec_pmic_i2c_parse_dt_pdata(struct device *dev)
 	if (!pd)
 		return ERR_PTR(-ENOMEM);
 
-	/*
-	 * ToDo: the 'wakeup' member in the platform data is more of a linux
-	 * specfic information. Hence, there is no binding for that yet and
-	 * not parsed here.
-	 */
-
 	pd->manual_poweroff = of_property_read_bool(dev->of_node,
 						"samsung,s2mps11-acokb-ground");
 	pd->disable_wrstbi = of_property_read_bool(dev->of_node,
@@ -337,7 +331,6 @@ static int sec_pmic_probe(struct i2c_client *i2c,
 	}
 
 	sec_pmic->device_type = (unsigned long)of_device_get_match_data(sec_pmic->dev);
-	sec_pmic->wakeup = pdata->wakeup;
 	sec_pmic->pdata = pdata;
 
 	switch (sec_pmic->device_type) {
@@ -429,7 +422,6 @@ static int sec_pmic_probe(struct i2c_client *i2c,
 	if (ret)
 		return ret;
 
-	device_init_wakeup(sec_pmic->dev, sec_pmic->wakeup);
 	sec_pmic_configure(sec_pmic);
 	sec_pmic_dump_rev(sec_pmic);
 
diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index 9864f13b7814..b0d049a56d16 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -69,8 +69,6 @@ struct sec_pmic_dev {
 	unsigned long device_type;
 	int irq;
 	struct regmap_irq_chip_data *irq_data;
-
-	bool wakeup;
 };
 
 int sec_irq_init(struct sec_pmic_dev *sec_pmic);
@@ -82,7 +80,6 @@ struct sec_platform_data {
 	struct sec_opmode_data		*opmode;
 	int				num_regulators;
 
-	bool				wakeup;
 	bool				buck_voltage_lock;
 
 	int				buck_gpios[3];
-- 
cgit v1.2.3


From 39cdbe8d2bc61d70efa22b06b14b129ebd9d0bc5 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Wed, 26 May 2021 08:47:11 -0400
Subject: mfd: sec: Remove unused platform data members

The Samsung PMIC drivers for early chipsets like S5M8767 stored quite a
lot in platform data (struct sec_platform_data).  The s5m8767 regulator
driver currently references only some of its fields.  Newer regulator
drivers (e.g. s2mps11) use even less platform data fields.

Clean up the structure to reduce memory footprint and source code size.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/samsung/core.h | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h
index b0d049a56d16..f92fe090473d 100644
--- a/include/linux/mfd/samsung/core.h
+++ b/include/linux/mfd/samsung/core.h
@@ -80,8 +80,6 @@ struct sec_platform_data {
 	struct sec_opmode_data		*opmode;
 	int				num_regulators;
 
-	bool				buck_voltage_lock;
-
 	int				buck_gpios[3];
 	int				buck_ds[3];
 	unsigned int			buck2_voltage[8];
@@ -91,35 +89,12 @@ struct sec_platform_data {
 	unsigned int			buck4_voltage[8];
 	bool				buck4_gpiodvs;
 
-	int				buck_set1;
-	int				buck_set2;
-	int				buck_set3;
-	int				buck2_enable;
-	int				buck3_enable;
-	int				buck4_enable;
 	int				buck_default_idx;
-	int				buck2_default_idx;
-	int				buck3_default_idx;
-	int				buck4_default_idx;
-
 	int				buck_ramp_delay;
 
-	int				buck2_ramp_delay;
-	int				buck34_ramp_delay;
-	int				buck5_ramp_delay;
-	int				buck16_ramp_delay;
-	int				buck7810_ramp_delay;
-	int				buck9_ramp_delay;
-	int				buck24_ramp_delay;
-	int				buck3_ramp_delay;
-	int				buck7_ramp_delay;
-	int				buck8910_ramp_delay;
-
-	bool				buck1_ramp_enable;
 	bool				buck2_ramp_enable;
 	bool				buck3_ramp_enable;
 	bool				buck4_ramp_enable;
-	bool				buck6_ramp_enable;
 
 	int				buck2_init;
 	int				buck3_init;
-- 
cgit v1.2.3


From 6490fa565534fa83593278267785a694fd378a2b Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Fri, 28 May 2021 16:16:13 +0800
Subject: usb: pd: Set PD_T_SINK_WAIT_CAP to 310ms

Current timer PD_T_SINK_WAIT_CAP is set to 240ms which will violate the
SinkWaitCapTimer (tTypeCSinkWaitCap 310 - 620 ms) defined in the PD
Spec if the port is faster enough when running the state machine. Set it
to the lower bound 310ms to ensure the timeout is in Spec.

Fixes: f0690a25a140 ("staging: typec: USB Type-C Port Manager (tcpm)")
Cc: stable <stable@vger.kernel.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210528081613.730661-1-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/pd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index bf00259493e0..96b7ff66f074 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -460,7 +460,7 @@ static inline unsigned int rdo_max_power(u32 rdo)
 #define PD_T_RECEIVER_RESPONSE	15	/* 15ms max */
 #define PD_T_SOURCE_ACTIVITY	45
 #define PD_T_SINK_ACTIVITY	135
-#define PD_T_SINK_WAIT_CAP	240
+#define PD_T_SINK_WAIT_CAP	310	/* 310 - 620 ms */
 #define PD_T_PS_TRANSITION	500
 #define PD_T_SRC_TRANSITION	35
 #define PD_T_DRP_SNK		40
-- 
cgit v1.2.3


From 8314b6732ae4e600bb933e108f96ce0176acb09c Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 1 Jun 2021 10:23:38 +0200
Subject: ima: Define new template fields xattrnames, xattrlengths and
 xattrvalues

This patch defines the new template fields xattrnames, xattrlengths and
xattrvalues, which contain respectively a list of xattr names (strings,
separated by |), lengths (u32, hex) and values (hex). If an xattr is not
present, the name and length are not displayed in the measurement list.

Reported-by: kernel test robot <lkp@intel.com> (Missing prototype def)
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 Documentation/security/IMA-templates.rst  |  4 ++
 include/linux/evm.h                       | 10 +++++
 security/integrity/evm/evm_main.c         | 69 +++++++++++++++++++++++++++++++
 security/integrity/ima/ima_template.c     |  9 ++++
 security/integrity/ima/ima_template_lib.c | 64 ++++++++++++++++++++++++++++
 security/integrity/ima/ima_template_lib.h |  6 +++
 6 files changed, 162 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/security/IMA-templates.rst b/Documentation/security/IMA-templates.rst
index 65c1ce451d08..6a58760a0a35 100644
--- a/Documentation/security/IMA-templates.rst
+++ b/Documentation/security/IMA-templates.rst
@@ -78,6 +78,10 @@ descriptors by adding their identifier to the format string
  - 'iuid': the inode UID;
  - 'igid': the inode GID;
  - 'imode': the inode mode;
+ - 'xattrnames': a list of xattr names (separated by |), only if the xattr is
+    present;
+ - 'xattrlengths': a list of xattr lengths (u32), only if the xattr is present;
+ - 'xattrvalues': a list of xattr values;
 
 
 Below, there is the list of defined template descriptors:
diff --git a/include/linux/evm.h b/include/linux/evm.h
index 5011a299c251..4c374be70247 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -39,6 +39,9 @@ extern int evm_inode_init_security(struct inode *inode,
 				   struct xattr *evm);
 extern bool evm_revalidate_status(const char *xattr_name);
 extern int evm_protected_xattr_if_enabled(const char *req_xattr_name);
+extern int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer,
+				     int buffer_size, char type,
+				     bool canonical_fmt);
 #ifdef CONFIG_FS_POSIX_ACL
 extern int posix_xattr_acl(const char *xattrname);
 #else
@@ -120,5 +123,12 @@ static inline int evm_protected_xattr_if_enabled(const char *req_xattr_name)
 	return false;
 }
 
+static inline int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer,
+					    int buffer_size, char type,
+					    bool canonical_fmt)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_EVM */
 #endif /* LINUX_EVM_H */
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index ee4e17a790fb..2c226e634ae9 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -317,6 +317,75 @@ int evm_protected_xattr_if_enabled(const char *req_xattr_name)
 	return evm_protected_xattr_common(req_xattr_name, true);
 }
 
+/**
+ * evm_read_protected_xattrs - read EVM protected xattr names, lengths, values
+ * @dentry: dentry of the read xattrs
+ * @inode: inode of the read xattrs
+ * @buffer: buffer xattr names, lengths or values are copied to
+ * @buffer_size: size of buffer
+ * @type: n: names, l: lengths, v: values
+ * @canonical_fmt: data format (true: little endian, false: native format)
+ *
+ * Read protected xattr names (separated by |), lengths (u32) or values for a
+ * given dentry and return the total size of copied data. If buffer is NULL,
+ * just return the total size.
+ *
+ * Returns the total size on success, a negative value on error.
+ */
+int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer,
+			      int buffer_size, char type, bool canonical_fmt)
+{
+	struct xattr_list *xattr;
+	int rc, size, total_size = 0;
+
+	list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) {
+		rc = __vfs_getxattr(dentry, d_backing_inode(dentry),
+				    xattr->name, NULL, 0);
+		if (rc < 0 && rc == -ENODATA)
+			continue;
+		else if (rc < 0)
+			return rc;
+
+		switch (type) {
+		case 'n':
+			size = strlen(xattr->name) + 1;
+			if (buffer) {
+				if (total_size)
+					*(buffer + total_size - 1) = '|';
+
+				memcpy(buffer + total_size, xattr->name, size);
+			}
+			break;
+		case 'l':
+			size = sizeof(u32);
+			if (buffer) {
+				if (canonical_fmt)
+					rc = cpu_to_le32(rc);
+
+				*(u32 *)(buffer + total_size) = rc;
+			}
+			break;
+		case 'v':
+			size = rc;
+			if (buffer) {
+				rc = __vfs_getxattr(dentry,
+					d_backing_inode(dentry), xattr->name,
+					buffer + total_size,
+					buffer_size - total_size);
+				if (rc < 0)
+					return rc;
+			}
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		total_size += size;
+	}
+
+	return total_size;
+}
+
 /**
  * evm_verifyxattr - verify the integrity of the requested xattr
  * @dentry: object of the verify xattr
diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c
index 43784f2bf8bd..159a31d2fcdf 100644
--- a/security/integrity/ima/ima_template.c
+++ b/security/integrity/ima/ima_template.c
@@ -53,6 +53,15 @@ static const struct ima_template_field supported_fields[] = {
 	 .field_show = ima_show_template_uint},
 	{.field_id = "imode", .field_init = ima_eventinodemode_init,
 	 .field_show = ima_show_template_uint},
+	{.field_id = "xattrnames",
+	 .field_init = ima_eventinodexattrnames_init,
+	 .field_show = ima_show_template_string},
+	{.field_id = "xattrlengths",
+	 .field_init = ima_eventinodexattrlengths_init,
+	 .field_show = ima_show_template_sig},
+	{.field_id = "xattrvalues",
+	 .field_init = ima_eventinodexattrvalues_init,
+	 .field_show = ima_show_template_sig},
 };
 
 /*
diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c
index 3156fb34b1af..518fd50ea48a 100644
--- a/security/integrity/ima/ima_template_lib.c
+++ b/security/integrity/ima/ima_template_lib.c
@@ -11,6 +11,7 @@
 
 #include "ima_template_lib.h"
 #include <linux/xattr.h>
+#include <linux/evm.h>
 
 static bool ima_template_hash_algo_allowed(u8 algo)
 {
@@ -618,3 +619,66 @@ int ima_eventinodemode_init(struct ima_event_data *event_data,
 	return ima_write_template_field_data((char *)&mode, sizeof(mode),
 					     DATA_FMT_UINT, field_data);
 }
+
+static int ima_eventinodexattrs_init_common(struct ima_event_data *event_data,
+					    struct ima_field_data *field_data,
+					    char type)
+{
+	u8 *buffer = NULL;
+	int rc;
+
+	if (!event_data->file)
+		return 0;
+
+	rc = evm_read_protected_xattrs(file_dentry(event_data->file), NULL, 0,
+				       type, ima_canonical_fmt);
+	if (rc < 0)
+		return 0;
+
+	buffer = kmalloc(rc, GFP_KERNEL);
+	if (!buffer)
+		return 0;
+
+	rc = evm_read_protected_xattrs(file_dentry(event_data->file), buffer,
+				       rc, type, ima_canonical_fmt);
+	if (rc < 0) {
+		rc = 0;
+		goto out;
+	}
+
+	rc = ima_write_template_field_data((char *)buffer, rc, DATA_FMT_HEX,
+					   field_data);
+out:
+	kfree(buffer);
+	return rc;
+}
+
+/*
+ *  ima_eventinodexattrnames_init - include a list of xattr names as part of the
+ *  template data
+ */
+int ima_eventinodexattrnames_init(struct ima_event_data *event_data,
+				  struct ima_field_data *field_data)
+{
+	return ima_eventinodexattrs_init_common(event_data, field_data, 'n');
+}
+
+/*
+ *  ima_eventinodexattrlengths_init - include a list of xattr lengths as part of
+ *  the template data
+ */
+int ima_eventinodexattrlengths_init(struct ima_event_data *event_data,
+				    struct ima_field_data *field_data)
+{
+	return ima_eventinodexattrs_init_common(event_data, field_data, 'l');
+}
+
+/*
+ *  ima_eventinodexattrvalues_init - include a list of xattr values as part of
+ *  the template data
+ */
+int ima_eventinodexattrvalues_init(struct ima_event_data *event_data,
+				   struct ima_field_data *field_data)
+{
+	return ima_eventinodexattrs_init_common(event_data, field_data, 'v');
+}
diff --git a/security/integrity/ima/ima_template_lib.h b/security/integrity/ima/ima_template_lib.h
index 6509af4a97ee..c71f1de95753 100644
--- a/security/integrity/ima/ima_template_lib.h
+++ b/security/integrity/ima/ima_template_lib.h
@@ -56,4 +56,10 @@ int ima_eventinodegid_init(struct ima_event_data *event_data,
 			   struct ima_field_data *field_data);
 int ima_eventinodemode_init(struct ima_event_data *event_data,
 			    struct ima_field_data *field_data);
+int ima_eventinodexattrnames_init(struct ima_event_data *event_data,
+				  struct ima_field_data *field_data);
+int ima_eventinodexattrlengths_init(struct ima_event_data *event_data,
+				    struct ima_field_data *field_data);
+int ima_eventinodexattrvalues_init(struct ima_event_data *event_data,
+				   struct ima_field_data *field_data);
 #endif /* __LINUX_IMA_TEMPLATE_LIB_H */
-- 
cgit v1.2.3


From fa0cf568fd76550c1ddb806c03a65a1a4a1ea909 Mon Sep 17 00:00:00 2001
From: Shiraz Saleem <shiraz.saleem@intel.com>
Date: Wed, 2 Jun 2021 15:51:37 -0500
Subject: RDMA/irdma: Add irdma Kconfig/Makefile and remove i40iw

Add Kconfig and Makefile to build irdma driver.

Remove i40iw driver and add an alias in irdma.

Remove legacy exported symbols i40e_register_client
and i40e_unregister_client from i40e as they are no
longer used.

irdma is the replacement driver that supports X722.

Link: https://lore.kernel.org/r/20210602205138.889-16-shiraz.saleem@intel.com
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 Documentation/ABI/stable/sysfs-class-infiniband |   20 -
 MAINTAINERS                                     |    8 -
 drivers/infiniband/Kconfig                      |    2 +-
 drivers/infiniband/hw/Makefile                  |    2 +-
 drivers/infiniband/hw/i40iw/Kconfig             |    9 -
 drivers/infiniband/hw/i40iw/Makefile            |    9 -
 drivers/infiniband/hw/i40iw/i40iw.h             |  602 ---
 drivers/infiniband/hw/i40iw/i40iw_cm.c          | 4419 -------------------
 drivers/infiniband/hw/i40iw/i40iw_cm.h          |  462 --
 drivers/infiniband/hw/i40iw/i40iw_ctrl.c        | 5243 -----------------------
 drivers/infiniband/hw/i40iw/i40iw_d.h           | 1746 --------
 drivers/infiniband/hw/i40iw/i40iw_hmc.c         |  821 ----
 drivers/infiniband/hw/i40iw/i40iw_hmc.h         |  241 --
 drivers/infiniband/hw/i40iw/i40iw_hw.c          |  851 ----
 drivers/infiniband/hw/i40iw/i40iw_main.c        | 2064 ---------
 drivers/infiniband/hw/i40iw/i40iw_osdep.h       |  195 -
 drivers/infiniband/hw/i40iw/i40iw_p.h           |  129 -
 drivers/infiniband/hw/i40iw/i40iw_pble.c        |  611 ---
 drivers/infiniband/hw/i40iw/i40iw_pble.h        |  131 -
 drivers/infiniband/hw/i40iw/i40iw_puda.c        | 1496 -------
 drivers/infiniband/hw/i40iw/i40iw_puda.h        |  188 -
 drivers/infiniband/hw/i40iw/i40iw_register.h    | 1030 -----
 drivers/infiniband/hw/i40iw/i40iw_status.h      |  101 -
 drivers/infiniband/hw/i40iw/i40iw_type.h        | 1358 ------
 drivers/infiniband/hw/i40iw/i40iw_uk.c          | 1200 ------
 drivers/infiniband/hw/i40iw/i40iw_user.h        |  422 --
 drivers/infiniband/hw/i40iw/i40iw_utils.c       | 1518 -------
 drivers/infiniband/hw/i40iw/i40iw_verbs.c       | 2652 ------------
 drivers/infiniband/hw/i40iw/i40iw_verbs.h       |  179 -
 drivers/infiniband/hw/i40iw/i40iw_vf.c          |   85 -
 drivers/infiniband/hw/i40iw/i40iw_vf.h          |   62 -
 drivers/infiniband/hw/i40iw/i40iw_virtchnl.c    |  759 ----
 drivers/infiniband/hw/i40iw/i40iw_virtchnl.h    |  124 -
 drivers/infiniband/hw/irdma/Kconfig             |   12 +
 drivers/infiniband/hw/irdma/Makefile            |   27 +
 drivers/net/ethernet/intel/i40e/i40e_client.c   |  149 -
 include/linux/net/intel/i40e_client.h           |    3 -
 include/uapi/rdma/i40iw-abi.h                   |  107 -
 38 files changed, 41 insertions(+), 28996 deletions(-)
 delete mode 100644 drivers/infiniband/hw/i40iw/Kconfig
 delete mode 100644 drivers/infiniband/hw/i40iw/Makefile
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_cm.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_cm.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_ctrl.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_d.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_hmc.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_hmc.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_hw.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_main.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_osdep.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_p.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_pble.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_pble.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_puda.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_puda.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_register.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_status.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_type.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_uk.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_user.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_utils.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_verbs.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_verbs.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_vf.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_vf.h
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
 delete mode 100644 drivers/infiniband/hw/i40iw/i40iw_virtchnl.h
 create mode 100644 drivers/infiniband/hw/irdma/Kconfig
 create mode 100644 drivers/infiniband/hw/irdma/Makefile
 delete mode 100644 include/uapi/rdma/i40iw-abi.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/stable/sysfs-class-infiniband b/Documentation/ABI/stable/sysfs-class-infiniband
index 348c4ac803ad..9b1bdfa43354 100644
--- a/Documentation/ABI/stable/sysfs-class-infiniband
+++ b/Documentation/ABI/stable/sysfs-class-infiniband
@@ -731,26 +731,6 @@ Description:
 		is the irq number of "sdma3", and M is irq number of "sdma4" in
 		the /proc/interrupts file.
 
-
-sysfs interface for Intel(R) X722 iWARP i40iw driver
-----------------------------------------------------
-
-What:		/sys/class/infiniband/i40iwX/hw_rev
-What:		/sys/class/infiniband/i40iwX/hca_type
-What:		/sys/class/infiniband/i40iwX/board_id
-Date:		Jan, 2016
-KernelVersion:	v4.10
-Contact:	linux-rdma@vger.kernel.org
-Description:
-		=============== ==== ========================
-		hw_rev:		(RO) Hardware revision number
-
-		hca_type:	(RO) Show HCA type (I40IW)
-
-		board_id:	(RO) I40IW board ID
-		=============== ==== ========================
-
-
 sysfs interface for QLogic qedr NIC Driver
 ------------------------------------------
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 34d2bf36b5ad..05cab3b1c955 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9365,14 +9365,6 @@ L:	linux-pm@vger.kernel.org
 S:	Supported
 F:	drivers/cpufreq/intel_pstate.c
 
-INTEL RDMA RNIC DRIVER
-M:	Faisal Latif <faisal.latif@intel.com>
-M:	Shiraz Saleem <shiraz.saleem@intel.com>
-L:	linux-rdma@vger.kernel.org
-S:	Supported
-F:	drivers/infiniband/hw/i40iw/
-F:	include/uapi/rdma/i40iw-abi.h
-
 INTEL SCU DRIVERS
 M:	Mika Westerberg <mika.westerberg@linux.intel.com>
 S:	Maintained
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 04a78d9f8fe3..33d3ce9c888e 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -82,7 +82,7 @@ source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/efa/Kconfig"
-source "drivers/infiniband/hw/i40iw/Kconfig"
+source "drivers/infiniband/hw/irdma/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
 source "drivers/infiniband/hw/ocrdma/Kconfig"
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index 0aeccd984889..fba0b3be903e 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_INFINIBAND_MTHCA)		+= mthca/
 obj-$(CONFIG_INFINIBAND_QIB)		+= qib/
 obj-$(CONFIG_INFINIBAND_CXGB4)		+= cxgb4/
 obj-$(CONFIG_INFINIBAND_EFA)		+= efa/
-obj-$(CONFIG_INFINIBAND_I40IW)		+= i40iw/
+obj-$(CONFIG_INFINIBAND_IRDMA)		+= irdma/
 obj-$(CONFIG_MLX4_INFINIBAND)		+= mlx4/
 obj-$(CONFIG_MLX5_INFINIBAND)		+= mlx5/
 obj-$(CONFIG_INFINIBAND_OCRDMA)		+= ocrdma/
diff --git a/drivers/infiniband/hw/i40iw/Kconfig b/drivers/infiniband/hw/i40iw/Kconfig
deleted file mode 100644
index 7476f3b14c39..000000000000
--- a/drivers/infiniband/hw/i40iw/Kconfig
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config INFINIBAND_I40IW
-	tristate "Intel(R) Ethernet X722 iWARP Driver"
-	depends on INET && I40E
-	depends on IPV6 || !IPV6
-	depends on PCI
-	select GENERIC_ALLOCATOR
-	help
-	Intel(R) Ethernet X722 iWARP Driver
diff --git a/drivers/infiniband/hw/i40iw/Makefile b/drivers/infiniband/hw/i40iw/Makefile
deleted file mode 100644
index 34da9eba8a7c..000000000000
--- a/drivers/infiniband/hw/i40iw/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-obj-$(CONFIG_INFINIBAND_I40IW) += i40iw.o
-
-i40iw-objs :=\
-               i40iw_cm.o i40iw_ctrl.o \
-               i40iw_hmc.o i40iw_hw.o i40iw_main.o  \
-               i40iw_pble.o i40iw_puda.o i40iw_uk.o i40iw_utils.o \
-               i40iw_verbs.o i40iw_virtchnl.o i40iw_vf.o
diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h
deleted file mode 100644
index be4094ac4fac..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw.h
+++ /dev/null
@@ -1,602 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_IW_H
-#define I40IW_IW_H
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-#include <linux/crc32c.h>
-#include <linux/net/intel/i40e_client.h>
-#include <rdma/ib_smi.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_pack.h>
-#include <rdma/rdma_cm.h>
-#include <rdma/iw_cm.h>
-#include <crypto/hash.h>
-
-#include "i40iw_status.h"
-#include "i40iw_osdep.h"
-#include "i40iw_d.h"
-#include "i40iw_hmc.h"
-
-#include "i40iw_type.h"
-#include "i40iw_p.h"
-#include <rdma/i40iw-abi.h>
-#include "i40iw_pble.h"
-#include "i40iw_verbs.h"
-#include "i40iw_cm.h"
-#include "i40iw_user.h"
-#include "i40iw_puda.h"
-
-#define I40IW_FW_VER_DEFAULT 2
-#define I40IW_HW_VERSION  2
-
-#define I40IW_ARP_ADD     1
-#define I40IW_ARP_DELETE  2
-#define I40IW_ARP_RESOLVE 3
-
-#define I40IW_MACIP_ADD     1
-#define I40IW_MACIP_DELETE  2
-
-#define IW_CCQ_SIZE         (I40IW_CQP_SW_SQSIZE_2048 + 1)
-#define IW_CEQ_SIZE         2048
-#define IW_AEQ_SIZE         2048
-
-#define RX_BUF_SIZE            (1536 + 8)
-#define IW_REG0_SIZE           (4 * 1024)
-#define IW_TX_TIMEOUT          (6 * HZ)
-#define IW_FIRST_QPN           1
-#define IW_SW_CONTEXT_ALIGN    1024
-
-#define MAX_DPC_ITERATIONS		128
-
-#define I40IW_EVENT_TIMEOUT		100000
-#define I40IW_VCHNL_EVENT_TIMEOUT	100000
-
-#define	I40IW_NO_VLAN			0xffff
-#define	I40IW_NO_QSET			0xffff
-
-/* access to mcast filter list */
-#define IW_ADD_MCAST false
-#define IW_DEL_MCAST true
-
-#define I40IW_DRV_OPT_ENABLE_MPA_VER_0     0x00000001
-#define I40IW_DRV_OPT_DISABLE_MPA_CRC      0x00000002
-#define I40IW_DRV_OPT_DISABLE_FIRST_WRITE  0x00000004
-#define I40IW_DRV_OPT_DISABLE_INTF         0x00000008
-#define I40IW_DRV_OPT_ENABLE_MSI           0x00000010
-#define I40IW_DRV_OPT_DUAL_LOGICAL_PORT    0x00000020
-#define I40IW_DRV_OPT_NO_INLINE_DATA       0x00000080
-#define I40IW_DRV_OPT_DISABLE_INT_MOD      0x00000100
-#define I40IW_DRV_OPT_DISABLE_VIRT_WQ      0x00000200
-#define I40IW_DRV_OPT_ENABLE_PAU           0x00000400
-#define I40IW_DRV_OPT_MCAST_LOGPORT_MAP    0x00000800
-
-#define IW_HMC_OBJ_TYPE_NUM ARRAY_SIZE(iw_hmc_obj_types)
-#define IW_CFG_FPM_QP_COUNT               32768
-#define I40IW_MAX_PAGES_PER_FMR           512
-#define I40IW_MIN_PAGES_PER_FMR           1
-#define I40IW_CQP_COMPL_RQ_WQE_FLUSHED    2
-#define I40IW_CQP_COMPL_SQ_WQE_FLUSHED    3
-#define I40IW_CQP_COMPL_RQ_SQ_WQE_FLUSHED 4
-
-struct i40iw_cqp_compl_info {
-	u32 op_ret_val;
-	u16 maj_err_code;
-	u16 min_err_code;
-	bool error;
-	u8 op_code;
-};
-
-#define i40iw_pr_err(fmt, args ...) pr_err("%s: "fmt, __func__, ## args)
-
-#define i40iw_pr_info(fmt, args ...) pr_info("%s: " fmt, __func__, ## args)
-
-#define i40iw_pr_warn(fmt, args ...) pr_warn("%s: " fmt, __func__, ## args)
-
-struct i40iw_cqp_request {
-	struct cqp_commands_info info;
-	wait_queue_head_t waitq;
-	struct list_head list;
-	atomic_t refcount;
-	void (*callback_fcn)(struct i40iw_cqp_request*, u32);
-	void *param;
-	struct i40iw_cqp_compl_info compl_info;
-	bool waiting;
-	bool request_done;
-	bool dynamic;
-};
-
-struct i40iw_cqp {
-	struct i40iw_sc_cqp sc_cqp;
-	spinlock_t req_lock; /*cqp request list */
-	wait_queue_head_t waitq;
-	struct i40iw_dma_mem sq;
-	struct i40iw_dma_mem host_ctx;
-	u64 *scratch_array;
-	struct i40iw_cqp_request *cqp_requests;
-	struct list_head cqp_avail_reqs;
-	struct list_head cqp_pending_reqs;
-};
-
-struct i40iw_device;
-
-struct i40iw_ccq {
-	struct i40iw_sc_cq sc_cq;
-	spinlock_t lock; /* ccq control */
-	wait_queue_head_t waitq;
-	struct i40iw_dma_mem mem_cq;
-	struct i40iw_dma_mem shadow_area;
-};
-
-struct i40iw_ceq {
-	struct i40iw_sc_ceq sc_ceq;
-	struct i40iw_dma_mem mem;
-	u32 irq;
-	u32 msix_idx;
-	struct i40iw_device *iwdev;
-	struct tasklet_struct dpc_tasklet;
-};
-
-struct i40iw_aeq {
-	struct i40iw_sc_aeq sc_aeq;
-	struct i40iw_dma_mem mem;
-};
-
-struct i40iw_arp_entry {
-	u32 ip_addr[4];
-	u8 mac_addr[ETH_ALEN];
-};
-
-enum init_completion_state {
-	INVALID_STATE = 0,
-	INITIAL_STATE,
-	CQP_CREATED,
-	HMC_OBJS_CREATED,
-	PBLE_CHUNK_MEM,
-	CCQ_CREATED,
-	AEQ_CREATED,
-	CEQ_CREATED,
-	ILQ_CREATED,
-	IEQ_CREATED,
-	IP_ADDR_REGISTERED,
-	RDMA_DEV_REGISTERED
-};
-
-struct i40iw_msix_vector {
-	u32 idx;
-	u32 irq;
-	u32 cpu_affinity;
-	u32 ceq_id;
-	cpumask_t mask;
-};
-
-struct l2params_work {
-	struct work_struct work;
-	struct i40iw_device *iwdev;
-	struct i40iw_l2params l2params;
-};
-
-#define I40IW_MSIX_TABLE_SIZE   65
-
-struct virtchnl_work {
-	struct work_struct work;
-	union {
-		struct i40iw_cqp_request *cqp_request;
-		struct i40iw_virtchnl_work_info work_info;
-	};
-};
-
-struct i40e_qvlist_info;
-
-struct i40iw_device {
-	struct i40iw_ib_device *iwibdev;
-	struct net_device *netdev;
-	wait_queue_head_t vchnl_waitq;
-	struct i40iw_sc_dev sc_dev;
-	struct i40iw_sc_vsi vsi;
-	struct i40iw_handler *hdl;
-	struct i40e_info *ldev;
-	struct i40e_client *client;
-	struct i40iw_hw hw;
-	struct i40iw_cm_core cm_core;
-	u8 *mem_resources;
-	unsigned long *allocated_qps;
-	unsigned long *allocated_cqs;
-	unsigned long *allocated_mrs;
-	unsigned long *allocated_pds;
-	unsigned long *allocated_arps;
-	struct i40iw_qp **qp_table;
-	bool msix_shared;
-	u32 msix_count;
-	struct i40iw_msix_vector *iw_msixtbl;
-	struct i40e_qvlist_info *iw_qvlist;
-
-	struct i40iw_hmc_pble_rsrc *pble_rsrc;
-	struct i40iw_arp_entry *arp_table;
-	struct i40iw_cqp cqp;
-	struct i40iw_ccq ccq;
-	u32 ceqs_count;
-	struct i40iw_ceq *ceqlist;
-	struct i40iw_aeq aeq;
-	u32 arp_table_size;
-	u32 next_arp_index;
-	spinlock_t resource_lock; /* hw resource access */
-	spinlock_t qptable_lock;
-	u32 vendor_id;
-	u32 vendor_part_id;
-	u32 of_device_registered;
-
-	u32 device_cap_flags;
-	unsigned long db_start;
-	u8 resource_profile;
-	u8 max_rdma_vfs;
-	u8 max_enabled_vfs;
-	u8 max_sge;
-	u8 iw_status;
-	u8 send_term_ok;
-
-	/* x710 specific */
-	struct mutex pbl_mutex;
-	struct tasklet_struct dpc_tasklet;
-	struct workqueue_struct *virtchnl_wq;
-	struct virtchnl_work virtchnl_w[I40IW_MAX_PE_ENABLED_VF_COUNT];
-	struct i40iw_dma_mem obj_mem;
-	struct i40iw_dma_mem obj_next;
-	u8 *hmc_info_mem;
-	u32 sd_type;
-	struct workqueue_struct *param_wq;
-	atomic_t params_busy;
-	enum init_completion_state init_state;
-	u16 mac_ip_table_idx;
-	atomic_t vchnl_msgs;
-	u32 max_mr;
-	u32 max_qp;
-	u32 max_cq;
-	u32 max_pd;
-	u32 next_qp;
-	u32 next_cq;
-	u32 next_pd;
-	u32 max_mr_size;
-	u32 max_qp_wr;
-	u32 max_cqe;
-	u32 mr_stagmask;
-	u32 mpa_version;
-	bool dcb;
-	bool closing;
-	bool reset;
-	u32 used_pds;
-	u32 used_cqs;
-	u32 used_mrs;
-	u32 used_qps;
-	wait_queue_head_t close_wq;
-	atomic64_t use_count;
-};
-
-struct i40iw_ib_device {
-	struct ib_device ibdev;
-	struct i40iw_device *iwdev;
-};
-
-struct i40iw_handler {
-	struct list_head list;
-	struct i40e_client *client;
-	struct i40iw_device device;
-	struct i40e_info ldev;
-};
-
-/**
- * i40iw_fw_major_ver - get firmware major version
- * @dev: iwarp device
- **/
-static inline u64 i40iw_fw_major_ver(struct i40iw_sc_dev *dev)
-{
-	return RS_64(dev->feature_info[I40IW_FEATURE_FW_INFO],
-		     I40IW_FW_VER_MAJOR);
-}
-
-/**
- * i40iw_fw_minor_ver - get firmware minor version
- * @dev: iwarp device
- **/
-static inline u64 i40iw_fw_minor_ver(struct i40iw_sc_dev *dev)
-{
-	return RS_64(dev->feature_info[I40IW_FEATURE_FW_INFO],
-		     I40IW_FW_VER_MINOR);
-}
-
-/**
- * to_iwdev - get device
- * @ibdev: ib device
- **/
-static inline struct i40iw_device *to_iwdev(struct ib_device *ibdev)
-{
-	return container_of(ibdev, struct i40iw_ib_device, ibdev)->iwdev;
-}
-
-/**
- * to_ucontext - get user context
- * @ibucontext: ib user context
- **/
-static inline struct i40iw_ucontext *to_ucontext(struct ib_ucontext *ibucontext)
-{
-	return container_of(ibucontext, struct i40iw_ucontext, ibucontext);
-}
-
-/**
- * to_iwpd - get protection domain
- * @ibpd: ib pd
- **/
-static inline struct i40iw_pd *to_iwpd(struct ib_pd *ibpd)
-{
-	return container_of(ibpd, struct i40iw_pd, ibpd);
-}
-
-/**
- * to_iwmr - get device memory region
- * @ibdev: ib memory region
- **/
-static inline struct i40iw_mr *to_iwmr(struct ib_mr *ibmr)
-{
-	return container_of(ibmr, struct i40iw_mr, ibmr);
-}
-
-/**
- * to_iwmw - get device memory window
- * @ibmw: ib memory window
- **/
-static inline struct i40iw_mr *to_iwmw(struct ib_mw *ibmw)
-{
-	return container_of(ibmw, struct i40iw_mr, ibmw);
-}
-
-/**
- * to_iwcq - get completion queue
- * @ibcq: ib cqdevice
- **/
-static inline struct i40iw_cq *to_iwcq(struct ib_cq *ibcq)
-{
-	return container_of(ibcq, struct i40iw_cq, ibcq);
-}
-
-/**
- * to_iwqp - get device qp
- * @ibqp: ib qp
- **/
-static inline struct i40iw_qp *to_iwqp(struct ib_qp *ibqp)
-{
-	return container_of(ibqp, struct i40iw_qp, ibqp);
-}
-
-/* i40iw.c */
-void i40iw_qp_add_ref(struct ib_qp *ibqp);
-void i40iw_qp_rem_ref(struct ib_qp *ibqp);
-struct ib_qp *i40iw_get_qp(struct ib_device *, int);
-
-void i40iw_flush_wqes(struct i40iw_device *iwdev,
-		      struct i40iw_qp *qp);
-
-void i40iw_manage_arp_cache(struct i40iw_device *iwdev,
-			    unsigned char *mac_addr,
-			    u32 *ip_addr,
-			    bool ipv4,
-			    u32 action);
-
-int i40iw_manage_apbvt(struct i40iw_device *iwdev,
-		       u16 accel_local_port,
-		       bool add_port);
-
-struct i40iw_cqp_request *i40iw_get_cqp_request(struct i40iw_cqp *cqp, bool wait);
-void i40iw_free_cqp_request(struct i40iw_cqp *cqp, struct i40iw_cqp_request *cqp_request);
-void i40iw_put_cqp_request(struct i40iw_cqp *cqp, struct i40iw_cqp_request *cqp_request);
-
-/**
- * i40iw_alloc_resource - allocate a resource
- * @iwdev: device pointer
- * @resource_array: resource bit array:
- * @max_resources: maximum resource number
- * @req_resources_num: Allocated resource number
- * @next: next free id
- **/
-static inline int i40iw_alloc_resource(struct i40iw_device *iwdev,
-				       unsigned long *resource_array,
-				       u32 max_resources,
-				       u32 *req_resource_num,
-				       u32 *next)
-{
-	u32 resource_num;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iwdev->resource_lock, flags);
-	resource_num = find_next_zero_bit(resource_array, max_resources, *next);
-	if (resource_num >= max_resources) {
-		resource_num = find_first_zero_bit(resource_array, max_resources);
-		if (resource_num >= max_resources) {
-			spin_unlock_irqrestore(&iwdev->resource_lock, flags);
-			return -EOVERFLOW;
-		}
-	}
-	set_bit(resource_num, resource_array);
-	*next = resource_num + 1;
-	if (*next == max_resources)
-		*next = 0;
-	*req_resource_num = resource_num;
-	spin_unlock_irqrestore(&iwdev->resource_lock, flags);
-
-	return 0;
-}
-
-/**
- * i40iw_is_resource_allocated - detrmine if resource is
- * allocated
- * @iwdev: device pointer
- * @resource_array: resource array for the resource_num
- * @resource_num: resource number to check
- **/
-static inline bool i40iw_is_resource_allocated(struct i40iw_device *iwdev,
-					       unsigned long *resource_array,
-					       u32 resource_num)
-{
-	bool bit_is_set;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iwdev->resource_lock, flags);
-
-	bit_is_set = test_bit(resource_num, resource_array);
-	spin_unlock_irqrestore(&iwdev->resource_lock, flags);
-
-	return bit_is_set;
-}
-
-/**
- * i40iw_free_resource - free a resource
- * @iwdev: device pointer
- * @resource_array: resource array for the resource_num
- * @resource_num: resource number to free
- **/
-static inline void i40iw_free_resource(struct i40iw_device *iwdev,
-				       unsigned long *resource_array,
-				       u32 resource_num)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&iwdev->resource_lock, flags);
-	clear_bit(resource_num, resource_array);
-	spin_unlock_irqrestore(&iwdev->resource_lock, flags);
-}
-
-struct i40iw_handler *i40iw_find_netdev(struct net_device *netdev);
-
-/**
- * iw_init_resources -
- */
-u32 i40iw_initialize_hw_resources(struct i40iw_device *iwdev);
-
-int i40iw_register_rdma_device(struct i40iw_device *iwdev);
-void i40iw_port_ibevent(struct i40iw_device *iwdev);
-void i40iw_cm_disconn(struct i40iw_qp *iwqp);
-void i40iw_cm_disconn_worker(void *);
-int mini_cm_recv_pkt(struct i40iw_cm_core *, struct i40iw_device *,
-		     struct sk_buff *);
-
-enum i40iw_status_code i40iw_handle_cqp_op(struct i40iw_device *iwdev,
-					   struct i40iw_cqp_request *cqp_request);
-enum i40iw_status_code i40iw_add_mac_addr(struct i40iw_device *iwdev,
-					  u8 *mac_addr, u8 *mac_index);
-int i40iw_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *);
-void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq);
-
-void i40iw_cleanup_pending_cqp_op(struct i40iw_device *iwdev);
-void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev);
-void i40iw_add_pdusecount(struct i40iw_pd *iwpd);
-void i40iw_rem_devusecount(struct i40iw_device *iwdev);
-void i40iw_add_devusecount(struct i40iw_device *iwdev);
-void i40iw_hw_modify_qp(struct i40iw_device *iwdev, struct i40iw_qp *iwqp,
-			struct i40iw_modify_qp_info *info, bool wait);
-
-void i40iw_qp_suspend_resume(struct i40iw_sc_dev *dev,
-			     struct i40iw_sc_qp *qp,
-			     bool suspend);
-enum i40iw_status_code i40iw_manage_qhash(struct i40iw_device *iwdev,
-					  struct i40iw_cm_info *cminfo,
-					  enum i40iw_quad_entry_type etype,
-					  enum i40iw_quad_hash_manage_type mtype,
-					  void *cmnode,
-					  bool wait);
-void i40iw_receive_ilq(struct i40iw_sc_vsi *vsi, struct i40iw_puda_buf *rbuf);
-void i40iw_free_sqbuf(struct i40iw_sc_vsi *vsi, void *bufp);
-void i40iw_free_qp_resources(struct i40iw_qp *iwqp);
-
-enum i40iw_status_code i40iw_obj_aligned_mem(struct i40iw_device *iwdev,
-					     struct i40iw_dma_mem *memptr,
-					     u32 size, u32 mask);
-
-void i40iw_request_reset(struct i40iw_device *iwdev);
-void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev);
-int i40iw_setup_cm_core(struct i40iw_device *iwdev);
-void i40iw_cleanup_cm_core(struct i40iw_cm_core *cm_core);
-void i40iw_process_ceq(struct i40iw_device *, struct i40iw_ceq *iwceq);
-void i40iw_process_aeq(struct i40iw_device *);
-void i40iw_next_iw_state(struct i40iw_qp *iwqp,
-			 u8 state, u8 del_hash,
-			 u8 term, u8 term_len);
-int i40iw_send_syn(struct i40iw_cm_node *cm_node, u32 sendack);
-int i40iw_send_reset(struct i40iw_cm_node *cm_node);
-struct i40iw_cm_node *i40iw_find_node(struct i40iw_cm_core *cm_core,
-				      u16 rem_port,
-				      u32 *rem_addr,
-				      u16 loc_port,
-				      u32 *loc_addr,
-				      bool add_refcnt,
-				      bool accelerated_list);
-
-enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev,
-					   struct i40iw_sc_qp *qp,
-					   struct i40iw_qp_flush_info *info,
-					   bool wait);
-
-void i40iw_gen_ae(struct i40iw_device *iwdev,
-		  struct i40iw_sc_qp *qp,
-		  struct i40iw_gen_ae_info *info,
-		  bool wait);
-
-void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src);
-struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *ib_pd,
-				u64 addr,
-				u64 size,
-				int acc,
-				u64 *iova_start);
-
-int i40iw_inetaddr_event(struct notifier_block *notifier,
-			 unsigned long event,
-			 void *ptr);
-int i40iw_inet6addr_event(struct notifier_block *notifier,
-			  unsigned long event,
-			  void *ptr);
-int i40iw_net_event(struct notifier_block *notifier,
-		    unsigned long event,
-		    void *ptr);
-int i40iw_netdevice_event(struct notifier_block *notifier,
-			  unsigned long event,
-			  void *ptr);
-
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
deleted file mode 100644
index 2450b7dd51f6..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ /dev/null
@@ -1,4419 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include <linux/atomic.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/if_arp.h>
-#include <linux/if_vlan.h>
-#include <linux/notifier.h>
-#include <linux/net.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/etherdevice.h>
-#include <linux/netdevice.h>
-#include <linux/random.h>
-#include <linux/list.h>
-#include <linux/threads.h>
-#include <linux/highmem.h>
-#include <net/arp.h>
-#include <net/ndisc.h>
-#include <net/neighbour.h>
-#include <net/route.h>
-#include <net/addrconf.h>
-#include <net/ip6_route.h>
-#include <net/ip_fib.h>
-#include <net/secure_seq.h>
-#include <net/tcp.h>
-#include <asm/checksum.h>
-
-#include "i40iw.h"
-
-static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *);
-static void i40iw_cm_post_event(struct i40iw_cm_event *event);
-static void i40iw_disconnect_worker(struct work_struct *work);
-
-/**
- * i40iw_free_sqbuf - put back puda buffer if refcount = 0
- * @vsi: pointer to vsi structure
- * @bufp: puda buffer to free
- */
-void i40iw_free_sqbuf(struct i40iw_sc_vsi *vsi, void *bufp)
-{
-	struct i40iw_puda_buf *buf = (struct i40iw_puda_buf *)bufp;
-	struct i40iw_puda_rsrc *ilq = vsi->ilq;
-
-	if (!atomic_dec_return(&buf->refcount))
-		i40iw_puda_ret_bufpool(ilq, buf);
-}
-
-/**
- * i40iw_derive_hw_ird_setting - Calculate IRD
- *
- * @cm_ird: IRD of connection's node
- *
- * The ird from the connection is rounded to a supported HW
- * setting (2,8,32,64) and then encoded for ird_size field of
- * qp_ctx
- */
-static u8 i40iw_derive_hw_ird_setting(u16 cm_ird)
-{
-	u8 encoded_ird_size;
-
-	/* ird_size field is encoded in qp_ctx */
-	switch (cm_ird ? roundup_pow_of_two(cm_ird) : 0) {
-	case I40IW_HW_IRD_SETTING_64:
-		encoded_ird_size = 3;
-		break;
-	case I40IW_HW_IRD_SETTING_32:
-	case I40IW_HW_IRD_SETTING_16:
-		encoded_ird_size = 2;
-		break;
-	case I40IW_HW_IRD_SETTING_8:
-	case I40IW_HW_IRD_SETTING_4:
-		encoded_ird_size = 1;
-		break;
-	case I40IW_HW_IRD_SETTING_2:
-	default:
-		encoded_ird_size = 0;
-		break;
-	}
-	return encoded_ird_size;
-}
-
-/**
- * i40iw_record_ird_ord - Record IRD/ORD passed in
- * @cm_node: connection's node
- * @conn_ird: connection IRD
- * @conn_ord: connection ORD
- */
-static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u32 conn_ird,
-				 u32 conn_ord)
-{
-	if (conn_ird > I40IW_MAX_IRD_SIZE)
-		conn_ird = I40IW_MAX_IRD_SIZE;
-
-	if (conn_ord > I40IW_MAX_ORD_SIZE)
-		conn_ord = I40IW_MAX_ORD_SIZE;
-	else if (!conn_ord && cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO)
-		conn_ord = 1;
-
-	cm_node->ird_size = conn_ird;
-	cm_node->ord_size = conn_ord;
-}
-
-/**
- * i40iw_copy_ip_ntohl - change network to host ip
- * @dst: host ip
- * @src: big endian
- */
-void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src)
-{
-	*dst++ = ntohl(*src++);
-	*dst++ = ntohl(*src++);
-	*dst++ = ntohl(*src++);
-	*dst = ntohl(*src);
-}
-
-/**
- * i40iw_copy_ip_htonl - change host addr to network ip
- * @dst: host ip
- * @src: little endian
- */
-static inline void i40iw_copy_ip_htonl(__be32 *dst, u32 *src)
-{
-	*dst++ = htonl(*src++);
-	*dst++ = htonl(*src++);
-	*dst++ = htonl(*src++);
-	*dst = htonl(*src);
-}
-
-/**
- * i40iw_fill_sockaddr4 - get addr info for passive connection
- * @cm_node: connection's node
- * @event: upper layer's cm event
- */
-static inline void i40iw_fill_sockaddr4(struct i40iw_cm_node *cm_node,
-					struct iw_cm_event *event)
-{
-	struct sockaddr_in *laddr = (struct sockaddr_in *)&event->local_addr;
-	struct sockaddr_in *raddr = (struct sockaddr_in *)&event->remote_addr;
-
-	laddr->sin_family = AF_INET;
-	raddr->sin_family = AF_INET;
-
-	laddr->sin_port = htons(cm_node->loc_port);
-	raddr->sin_port = htons(cm_node->rem_port);
-
-	laddr->sin_addr.s_addr = htonl(cm_node->loc_addr[0]);
-	raddr->sin_addr.s_addr = htonl(cm_node->rem_addr[0]);
-}
-
-/**
- * i40iw_fill_sockaddr6 - get ipv6 addr info for passive side
- * @cm_node: connection's node
- * @event: upper layer's cm event
- */
-static inline void i40iw_fill_sockaddr6(struct i40iw_cm_node *cm_node,
-					struct iw_cm_event *event)
-{
-	struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *)&event->local_addr;
-	struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *)&event->remote_addr;
-
-	laddr6->sin6_family = AF_INET6;
-	raddr6->sin6_family = AF_INET6;
-
-	laddr6->sin6_port = htons(cm_node->loc_port);
-	raddr6->sin6_port = htons(cm_node->rem_port);
-
-	i40iw_copy_ip_htonl(laddr6->sin6_addr.in6_u.u6_addr32,
-			    cm_node->loc_addr);
-	i40iw_copy_ip_htonl(raddr6->sin6_addr.in6_u.u6_addr32,
-			    cm_node->rem_addr);
-}
-
-/**
- * i40iw_get_addr_info
- * @cm_node: contains ip/tcp info
- * @cm_info: to get a copy of the cm_node ip/tcp info
-*/
-static void i40iw_get_addr_info(struct i40iw_cm_node *cm_node,
-				struct i40iw_cm_info *cm_info)
-{
-	cm_info->ipv4 = cm_node->ipv4;
-	cm_info->vlan_id = cm_node->vlan_id;
-	memcpy(cm_info->loc_addr, cm_node->loc_addr, sizeof(cm_info->loc_addr));
-	memcpy(cm_info->rem_addr, cm_node->rem_addr, sizeof(cm_info->rem_addr));
-	cm_info->loc_port = cm_node->loc_port;
-	cm_info->rem_port = cm_node->rem_port;
-	cm_info->user_pri = cm_node->user_pri;
-}
-
-/**
- * i40iw_get_cmevent_info - for cm event upcall
- * @cm_node: connection's node
- * @cm_id: upper layers cm struct for the event
- * @event: upper layer's cm event
- */
-static inline void i40iw_get_cmevent_info(struct i40iw_cm_node *cm_node,
-					  struct iw_cm_id *cm_id,
-					  struct iw_cm_event *event)
-{
-	memcpy(&event->local_addr, &cm_id->m_local_addr,
-	       sizeof(event->local_addr));
-	memcpy(&event->remote_addr, &cm_id->m_remote_addr,
-	       sizeof(event->remote_addr));
-	if (cm_node) {
-		event->private_data = (void *)cm_node->pdata_buf;
-		event->private_data_len = (u8)cm_node->pdata.size;
-		event->ird = cm_node->ird_size;
-		event->ord = cm_node->ord_size;
-	}
-}
-
-/**
- * i40iw_send_cm_event - upcall cm's event handler
- * @cm_node: connection's node
- * @cm_id: upper layer's cm info struct
- * @type: Event type to indicate
- * @status: status for the event type
- */
-static int i40iw_send_cm_event(struct i40iw_cm_node *cm_node,
-			       struct iw_cm_id *cm_id,
-			       enum iw_cm_event_type type,
-			       int status)
-{
-	struct iw_cm_event event;
-
-	memset(&event, 0, sizeof(event));
-	event.event = type;
-	event.status = status;
-	switch (type) {
-	case IW_CM_EVENT_CONNECT_REQUEST:
-		if (cm_node->ipv4)
-			i40iw_fill_sockaddr4(cm_node, &event);
-		else
-			i40iw_fill_sockaddr6(cm_node, &event);
-		event.provider_data = (void *)cm_node;
-		event.private_data = (void *)cm_node->pdata_buf;
-		event.private_data_len = (u8)cm_node->pdata.size;
-		event.ird = cm_node->ird_size;
-		break;
-	case IW_CM_EVENT_CONNECT_REPLY:
-		i40iw_get_cmevent_info(cm_node, cm_id, &event);
-		break;
-	case IW_CM_EVENT_ESTABLISHED:
-		event.ird = cm_node->ird_size;
-		event.ord = cm_node->ord_size;
-		break;
-	case IW_CM_EVENT_DISCONNECT:
-		break;
-	case IW_CM_EVENT_CLOSE:
-		break;
-	default:
-		i40iw_pr_err("event type received type = %d\n", type);
-		return -1;
-	}
-	return cm_id->event_handler(cm_id, &event);
-}
-
-/**
- * i40iw_create_event - create cm event
- * @cm_node: connection's node
- * @type: Event type to generate
- */
-static struct i40iw_cm_event *i40iw_create_event(struct i40iw_cm_node *cm_node,
-						 enum i40iw_cm_event_type type)
-{
-	struct i40iw_cm_event *event;
-
-	if (!cm_node->cm_id)
-		return NULL;
-
-	event = kzalloc(sizeof(*event), GFP_ATOMIC);
-
-	if (!event)
-		return NULL;
-
-	event->type = type;
-	event->cm_node = cm_node;
-	memcpy(event->cm_info.rem_addr, cm_node->rem_addr, sizeof(event->cm_info.rem_addr));
-	memcpy(event->cm_info.loc_addr, cm_node->loc_addr, sizeof(event->cm_info.loc_addr));
-	event->cm_info.rem_port = cm_node->rem_port;
-	event->cm_info.loc_port = cm_node->loc_port;
-	event->cm_info.cm_id = cm_node->cm_id;
-
-	i40iw_debug(cm_node->dev,
-		    I40IW_DEBUG_CM,
-		    "node=%p event=%p type=%u dst=%pI4 src=%pI4\n",
-		    cm_node,
-		    event,
-		    type,
-		    event->cm_info.loc_addr,
-		    event->cm_info.rem_addr);
-
-	i40iw_cm_post_event(event);
-	return event;
-}
-
-/**
- * i40iw_free_retrans_entry - free send entry
- * @cm_node: connection's node
- */
-static void i40iw_free_retrans_entry(struct i40iw_cm_node *cm_node)
-{
-	struct i40iw_device *iwdev = cm_node->iwdev;
-	struct i40iw_timer_entry *send_entry;
-
-	send_entry = cm_node->send_entry;
-	if (send_entry) {
-		cm_node->send_entry = NULL;
-		i40iw_free_sqbuf(&iwdev->vsi, (void *)send_entry->sqbuf);
-		kfree(send_entry);
-		atomic_dec(&cm_node->ref_count);
-	}
-}
-
-/**
- * i40iw_cleanup_retrans_entry - free send entry with lock
- * @cm_node: connection's node
- */
-static void i40iw_cleanup_retrans_entry(struct i40iw_cm_node *cm_node)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-	i40iw_free_retrans_entry(cm_node);
-	spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-}
-
-/**
- * i40iw_form_cm_frame - get a free packet and build frame
- * @cm_node: connection's node ionfo to use in frame
- * @options: pointer to options info
- * @hdr: pointer mpa header
- * @pdata: pointer to private data
- * @flags:  indicates FIN or ACK
- */
-static struct i40iw_puda_buf *i40iw_form_cm_frame(struct i40iw_cm_node *cm_node,
-						  struct i40iw_kmem_info *options,
-						  struct i40iw_kmem_info *hdr,
-						  struct i40iw_kmem_info *pdata,
-						  u8 flags)
-{
-	struct i40iw_puda_buf *sqbuf;
-	struct i40iw_sc_vsi *vsi = &cm_node->iwdev->vsi;
-	u8 *buf;
-
-	struct tcphdr *tcph;
-	struct iphdr *iph;
-	struct ipv6hdr *ip6h;
-	struct ethhdr *ethh;
-	u16 packetsize;
-	u16 eth_hlen = ETH_HLEN;
-	u32 opts_len = 0;
-	u32 pd_len = 0;
-	u32 hdr_len = 0;
-	u16 vtag;
-
-	sqbuf = i40iw_puda_get_bufpool(vsi->ilq);
-	if (!sqbuf)
-		return NULL;
-	buf = sqbuf->mem.va;
-
-	if (options)
-		opts_len = (u32)options->size;
-
-	if (hdr)
-		hdr_len = hdr->size;
-
-	if (pdata)
-		pd_len = pdata->size;
-
-	if (cm_node->vlan_id <= VLAN_VID_MASK)
-		eth_hlen += 4;
-
-	if (cm_node->ipv4)
-		packetsize = sizeof(*iph) + sizeof(*tcph);
-	else
-		packetsize = sizeof(*ip6h) + sizeof(*tcph);
-	packetsize += opts_len + hdr_len + pd_len;
-
-	memset(buf, 0x00, eth_hlen + packetsize);
-
-	sqbuf->totallen = packetsize + eth_hlen;
-	sqbuf->maclen = eth_hlen;
-	sqbuf->tcphlen = sizeof(*tcph) + opts_len;
-	sqbuf->scratch = (void *)cm_node;
-
-	ethh = (struct ethhdr *)buf;
-	buf += eth_hlen;
-
-	if (cm_node->ipv4) {
-		sqbuf->ipv4 = true;
-
-		iph = (struct iphdr *)buf;
-		buf += sizeof(*iph);
-		tcph = (struct tcphdr *)buf;
-		buf += sizeof(*tcph);
-
-		ether_addr_copy(ethh->h_dest, cm_node->rem_mac);
-		ether_addr_copy(ethh->h_source, cm_node->loc_mac);
-		if (cm_node->vlan_id <= VLAN_VID_MASK) {
-			((struct vlan_ethhdr *)ethh)->h_vlan_proto = htons(ETH_P_8021Q);
-			vtag = (cm_node->user_pri << VLAN_PRIO_SHIFT) | cm_node->vlan_id;
-			((struct vlan_ethhdr *)ethh)->h_vlan_TCI = htons(vtag);
-
-			((struct vlan_ethhdr *)ethh)->h_vlan_encapsulated_proto = htons(ETH_P_IP);
-		} else {
-			ethh->h_proto = htons(ETH_P_IP);
-		}
-
-		iph->version = IPVERSION;
-		iph->ihl = 5;	/* 5 * 4Byte words, IP headr len */
-		iph->tos = cm_node->tos;
-		iph->tot_len = htons(packetsize);
-		iph->id = htons(++cm_node->tcp_cntxt.loc_id);
-
-		iph->frag_off = htons(0x4000);
-		iph->ttl = 0x40;
-		iph->protocol = IPPROTO_TCP;
-		iph->saddr = htonl(cm_node->loc_addr[0]);
-		iph->daddr = htonl(cm_node->rem_addr[0]);
-	} else {
-		sqbuf->ipv4 = false;
-		ip6h = (struct ipv6hdr *)buf;
-		buf += sizeof(*ip6h);
-		tcph = (struct tcphdr *)buf;
-		buf += sizeof(*tcph);
-
-		ether_addr_copy(ethh->h_dest, cm_node->rem_mac);
-		ether_addr_copy(ethh->h_source, cm_node->loc_mac);
-		if (cm_node->vlan_id <= VLAN_VID_MASK) {
-			((struct vlan_ethhdr *)ethh)->h_vlan_proto = htons(ETH_P_8021Q);
-			vtag = (cm_node->user_pri << VLAN_PRIO_SHIFT) | cm_node->vlan_id;
-			((struct vlan_ethhdr *)ethh)->h_vlan_TCI = htons(vtag);
-			((struct vlan_ethhdr *)ethh)->h_vlan_encapsulated_proto = htons(ETH_P_IPV6);
-		} else {
-			ethh->h_proto = htons(ETH_P_IPV6);
-		}
-		ip6h->version = 6;
-		ip6h->priority = cm_node->tos >> 4;
-		ip6h->flow_lbl[0] = cm_node->tos << 4;
-		ip6h->flow_lbl[1] = 0;
-		ip6h->flow_lbl[2] = 0;
-		ip6h->payload_len = htons(packetsize - sizeof(*ip6h));
-		ip6h->nexthdr = 6;
-		ip6h->hop_limit = 128;
-		i40iw_copy_ip_htonl(ip6h->saddr.in6_u.u6_addr32,
-				    cm_node->loc_addr);
-		i40iw_copy_ip_htonl(ip6h->daddr.in6_u.u6_addr32,
-				    cm_node->rem_addr);
-	}
-
-	tcph->source = htons(cm_node->loc_port);
-	tcph->dest = htons(cm_node->rem_port);
-
-	tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num);
-
-	if (flags & SET_ACK) {
-		cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt;
-		tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num);
-		tcph->ack = 1;
-	} else {
-		tcph->ack_seq = 0;
-	}
-
-	if (flags & SET_SYN) {
-		cm_node->tcp_cntxt.loc_seq_num++;
-		tcph->syn = 1;
-	} else {
-		cm_node->tcp_cntxt.loc_seq_num += hdr_len + pd_len;
-	}
-
-	if (flags & SET_FIN) {
-		cm_node->tcp_cntxt.loc_seq_num++;
-		tcph->fin = 1;
-	}
-
-	if (flags & SET_RST)
-		tcph->rst = 1;
-
-	tcph->doff = (u16)((sizeof(*tcph) + opts_len + 3) >> 2);
-	sqbuf->tcphlen = tcph->doff << 2;
-	tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd);
-	tcph->urg_ptr = 0;
-
-	if (opts_len) {
-		memcpy(buf, options->addr, opts_len);
-		buf += opts_len;
-	}
-
-	if (hdr_len) {
-		memcpy(buf, hdr->addr, hdr_len);
-		buf += hdr_len;
-	}
-
-	if (pdata && pdata->addr)
-		memcpy(buf, pdata->addr, pdata->size);
-
-	atomic_set(&sqbuf->refcount, 1);
-
-	return sqbuf;
-}
-
-/**
- * i40iw_send_reset - Send RST packet
- * @cm_node: connection's node
- */
-int i40iw_send_reset(struct i40iw_cm_node *cm_node)
-{
-	struct i40iw_puda_buf *sqbuf;
-	int flags = SET_RST | SET_ACK;
-
-	sqbuf = i40iw_form_cm_frame(cm_node, NULL, NULL, NULL, flags);
-	if (!sqbuf) {
-		i40iw_pr_err("no sqbuf\n");
-		return -1;
-	}
-
-	return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 0, 1);
-}
-
-/**
- * i40iw_active_open_err - send event for active side cm error
- * @cm_node: connection's node
- * @reset: Flag to send reset or not
- */
-static void i40iw_active_open_err(struct i40iw_cm_node *cm_node, bool reset)
-{
-	i40iw_cleanup_retrans_entry(cm_node);
-	cm_node->cm_core->stats_connect_errs++;
-	if (reset) {
-		i40iw_debug(cm_node->dev,
-			    I40IW_DEBUG_CM,
-			    "%s cm_node=%p state=%d\n",
-			    __func__,
-			    cm_node,
-			    cm_node->state);
-		atomic_inc(&cm_node->ref_count);
-		i40iw_send_reset(cm_node);
-	}
-
-	cm_node->state = I40IW_CM_STATE_CLOSED;
-	i40iw_create_event(cm_node, I40IW_CM_EVENT_ABORTED);
-}
-
-/**
- * i40iw_passive_open_err - handle passive side cm error
- * @cm_node: connection's node
- * @reset: send reset or just free cm_node
- */
-static void i40iw_passive_open_err(struct i40iw_cm_node *cm_node, bool reset)
-{
-	i40iw_cleanup_retrans_entry(cm_node);
-	cm_node->cm_core->stats_passive_errs++;
-	cm_node->state = I40IW_CM_STATE_CLOSED;
-	i40iw_debug(cm_node->dev,
-		    I40IW_DEBUG_CM,
-		    "%s cm_node=%p state =%d\n",
-		    __func__,
-		    cm_node,
-		    cm_node->state);
-	if (reset)
-		i40iw_send_reset(cm_node);
-	else
-		i40iw_rem_ref_cm_node(cm_node);
-}
-
-/**
- * i40iw_event_connect_error - to create connect error event
- * @event: cm information for connect event
- */
-static void i40iw_event_connect_error(struct i40iw_cm_event *event)
-{
-	struct i40iw_qp *iwqp;
-	struct iw_cm_id *cm_id;
-
-	cm_id = event->cm_node->cm_id;
-	if (!cm_id)
-		return;
-
-	iwqp = cm_id->provider_data;
-
-	if (!iwqp || !iwqp->iwdev)
-		return;
-
-	iwqp->cm_id = NULL;
-	cm_id->provider_data = NULL;
-	i40iw_send_cm_event(event->cm_node, cm_id,
-			    IW_CM_EVENT_CONNECT_REPLY,
-			    -ECONNRESET);
-	cm_id->rem_ref(cm_id);
-	i40iw_rem_ref_cm_node(event->cm_node);
-}
-
-/**
- * i40iw_process_options
- * @cm_node: connection's node
- * @optionsloc: point to start of options
- * @optionsize: size of all options
- * @syn_packet: flag if syn packet
- */
-static int i40iw_process_options(struct i40iw_cm_node *cm_node,
-				 u8 *optionsloc,
-				 u32 optionsize,
-				 u32 syn_packet)
-{
-	u32 tmp;
-	u32 offset = 0;
-	union all_known_options *all_options;
-	char got_mss_option = 0;
-
-	while (offset < optionsize) {
-		all_options = (union all_known_options *)(optionsloc + offset);
-		switch (all_options->as_base.optionnum) {
-		case OPTION_NUMBER_END:
-			offset = optionsize;
-			break;
-		case OPTION_NUMBER_NONE:
-			offset += 1;
-			continue;
-		case OPTION_NUMBER_MSS:
-			i40iw_debug(cm_node->dev,
-				    I40IW_DEBUG_CM,
-				    "%s: MSS Length: %d Offset: %d Size: %d\n",
-				    __func__,
-				    all_options->as_mss.length,
-				    offset,
-				    optionsize);
-			got_mss_option = 1;
-			if (all_options->as_mss.length != 4)
-				return -1;
-			tmp = ntohs(all_options->as_mss.mss);
-			if (tmp > 0 && tmp < cm_node->tcp_cntxt.mss)
-				cm_node->tcp_cntxt.mss = tmp;
-			break;
-		case OPTION_NUMBER_WINDOW_SCALE:
-			cm_node->tcp_cntxt.snd_wscale =
-			    all_options->as_windowscale.shiftcount;
-			break;
-		default:
-			i40iw_debug(cm_node->dev,
-				    I40IW_DEBUG_CM,
-				    "TCP Option not understood: %x\n",
-				    all_options->as_base.optionnum);
-			break;
-		}
-		offset += all_options->as_base.length;
-	}
-	if (!got_mss_option && syn_packet)
-		cm_node->tcp_cntxt.mss = I40IW_CM_DEFAULT_MSS;
-	return 0;
-}
-
-/**
- * i40iw_handle_tcp_options -
- * @cm_node: connection's node
- * @tcph: pointer tcp header
- * @optionsize: size of options rcvd
- * @passive: active or passive flag
- */
-static int i40iw_handle_tcp_options(struct i40iw_cm_node *cm_node,
-				    struct tcphdr *tcph,
-				    int optionsize,
-				    int passive)
-{
-	u8 *optionsloc = (u8 *)&tcph[1];
-
-	if (optionsize) {
-		if (i40iw_process_options(cm_node,
-					  optionsloc,
-					  optionsize,
-					  (u32)tcph->syn)) {
-			i40iw_debug(cm_node->dev,
-				    I40IW_DEBUG_CM,
-				    "%s: Node %p, Sending RESET\n",
-				    __func__,
-				    cm_node);
-			if (passive)
-				i40iw_passive_open_err(cm_node, true);
-			else
-				i40iw_active_open_err(cm_node, true);
-			return -1;
-		}
-	}
-
-	cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) <<
-	    cm_node->tcp_cntxt.snd_wscale;
-
-	if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd)
-		cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd;
-	return 0;
-}
-
-/**
- * i40iw_build_mpa_v1 - build a MPA V1 frame
- * @cm_node: connection's node
- * @start_addr: MPA frame start address
- * @mpa_key: to do read0 or write0
- */
-static void i40iw_build_mpa_v1(struct i40iw_cm_node *cm_node,
-			       void *start_addr,
-			       u8 mpa_key)
-{
-	struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr;
-
-	switch (mpa_key) {
-	case MPA_KEY_REQUEST:
-		memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE);
-		break;
-	case MPA_KEY_REPLY:
-		memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE);
-		break;
-	default:
-		break;
-	}
-	mpa_frame->flags = IETF_MPA_FLAGS_CRC;
-	mpa_frame->rev = cm_node->mpa_frame_rev;
-	mpa_frame->priv_data_len = htons(cm_node->pdata.size);
-}
-
-/**
- * i40iw_build_mpa_v2 - build a MPA V2 frame
- * @cm_node: connection's node
- * @start_addr: buffer start address
- * @mpa_key: to do read0 or write0
- */
-static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node,
-			       void *start_addr,
-			       u8 mpa_key)
-{
-	struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr;
-	struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg;
-	u16 ctrl_ird, ctrl_ord;
-
-	/* initialize the upper 5 bytes of the frame */
-	i40iw_build_mpa_v1(cm_node, start_addr, mpa_key);
-	mpa_frame->flags |= IETF_MPA_V2_FLAG;
-	mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE);
-
-	/* initialize RTR msg */
-	if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
-		ctrl_ird = IETF_NO_IRD_ORD;
-		ctrl_ord = IETF_NO_IRD_ORD;
-	} else {
-		ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ?
-			IETF_NO_IRD_ORD : cm_node->ird_size;
-		ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ?
-			IETF_NO_IRD_ORD : cm_node->ord_size;
-	}
-
-	ctrl_ird |= IETF_PEER_TO_PEER;
-
-	switch (mpa_key) {
-	case MPA_KEY_REQUEST:
-		ctrl_ord |= IETF_RDMA0_WRITE;
-		ctrl_ord |= IETF_RDMA0_READ;
-		break;
-	case MPA_KEY_REPLY:
-		switch (cm_node->send_rdma0_op) {
-		case SEND_RDMA_WRITE_ZERO:
-			ctrl_ord |= IETF_RDMA0_WRITE;
-			break;
-		case SEND_RDMA_READ_ZERO:
-			ctrl_ord |= IETF_RDMA0_READ;
-			break;
-		}
-		break;
-	default:
-		break;
-	}
-	rtr_msg->ctrl_ird = htons(ctrl_ird);
-	rtr_msg->ctrl_ord = htons(ctrl_ord);
-}
-
-/**
- * i40iw_cm_build_mpa_frame - build mpa frame for mpa version 1 or version 2
- * @cm_node: connection's node
- * @mpa: mpa: data buffer
- * @mpa_key: to do read0 or write0
- */
-static int i40iw_cm_build_mpa_frame(struct i40iw_cm_node *cm_node,
-				    struct i40iw_kmem_info *mpa,
-				    u8 mpa_key)
-{
-	int hdr_len = 0;
-
-	switch (cm_node->mpa_frame_rev) {
-	case IETF_MPA_V1:
-		hdr_len = sizeof(struct ietf_mpa_v1);
-		i40iw_build_mpa_v1(cm_node, mpa->addr, mpa_key);
-		break;
-	case IETF_MPA_V2:
-		hdr_len = sizeof(struct ietf_mpa_v2);
-		i40iw_build_mpa_v2(cm_node, mpa->addr, mpa_key);
-		break;
-	default:
-		break;
-	}
-
-	return hdr_len;
-}
-
-/**
- * i40iw_send_mpa_request - active node send mpa request to passive node
- * @cm_node: connection's node
- */
-static int i40iw_send_mpa_request(struct i40iw_cm_node *cm_node)
-{
-	struct i40iw_puda_buf *sqbuf;
-
-	if (!cm_node) {
-		i40iw_pr_err("cm_node == NULL\n");
-		return -1;
-	}
-
-	cm_node->mpa_hdr.addr = &cm_node->mpa_frame;
-	cm_node->mpa_hdr.size = i40iw_cm_build_mpa_frame(cm_node,
-							 &cm_node->mpa_hdr,
-							 MPA_KEY_REQUEST);
-	if (!cm_node->mpa_hdr.size) {
-		i40iw_pr_err("mpa size = %d\n", cm_node->mpa_hdr.size);
-		return -1;
-	}
-
-	sqbuf = i40iw_form_cm_frame(cm_node,
-				    NULL,
-				    &cm_node->mpa_hdr,
-				    &cm_node->pdata,
-				    SET_ACK);
-	if (!sqbuf) {
-		i40iw_pr_err("sq_buf == NULL\n");
-		return -1;
-	}
-	return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 1, 0);
-}
-
-/**
- * i40iw_send_mpa_reject -
- * @cm_node: connection's node
- * @pdata: reject data for connection
- * @plen: length of reject data
- */
-static int i40iw_send_mpa_reject(struct i40iw_cm_node *cm_node,
-				 const void *pdata,
-				 u8 plen)
-{
-	struct i40iw_puda_buf *sqbuf;
-	struct i40iw_kmem_info priv_info;
-
-	cm_node->mpa_hdr.addr = &cm_node->mpa_frame;
-	cm_node->mpa_hdr.size = i40iw_cm_build_mpa_frame(cm_node,
-							 &cm_node->mpa_hdr,
-							 MPA_KEY_REPLY);
-
-	cm_node->mpa_frame.flags |= IETF_MPA_FLAGS_REJECT;
-	priv_info.addr = (void *)pdata;
-	priv_info.size = plen;
-
-	sqbuf = i40iw_form_cm_frame(cm_node,
-				    NULL,
-				    &cm_node->mpa_hdr,
-				    &priv_info,
-				    SET_ACK | SET_FIN);
-	if (!sqbuf) {
-		i40iw_pr_err("no sqbuf\n");
-		return -ENOMEM;
-	}
-	cm_node->state = I40IW_CM_STATE_FIN_WAIT1;
-	return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 1, 0);
-}
-
-/**
- * i40iw_parse_mpa - process an IETF MPA frame
- * @cm_node: connection's node
- * @buffer: Data pointer
- * @type: to return accept or reject
- * @len: Len of mpa buffer
- */
-static int i40iw_parse_mpa(struct i40iw_cm_node *cm_node, u8 *buffer, u32 *type, u32 len)
-{
-	struct ietf_mpa_v1 *mpa_frame;
-	struct ietf_mpa_v2 *mpa_v2_frame;
-	struct ietf_rtr_msg *rtr_msg;
-	int mpa_hdr_len;
-	int priv_data_len;
-
-	*type = I40IW_MPA_REQUEST_ACCEPT;
-
-	if (len < sizeof(struct ietf_mpa_v1)) {
-		i40iw_pr_err("ietf buffer small (%x)\n", len);
-		return -1;
-	}
-
-	mpa_frame = (struct ietf_mpa_v1 *)buffer;
-	mpa_hdr_len = sizeof(struct ietf_mpa_v1);
-	priv_data_len = ntohs(mpa_frame->priv_data_len);
-
-	if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) {
-		i40iw_pr_err("large pri_data %d\n", priv_data_len);
-		return -1;
-	}
-	if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) {
-		i40iw_pr_err("unsupported mpa rev = %d\n", mpa_frame->rev);
-		return -1;
-	}
-	if (mpa_frame->rev > cm_node->mpa_frame_rev) {
-		i40iw_pr_err("rev %d\n", mpa_frame->rev);
-		return -1;
-	}
-	cm_node->mpa_frame_rev = mpa_frame->rev;
-
-	if (cm_node->state != I40IW_CM_STATE_MPAREQ_SENT) {
-		if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) {
-			i40iw_pr_err("Unexpected MPA Key received\n");
-			return -1;
-		}
-	} else {
-		if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) {
-			i40iw_pr_err("Unexpected MPA Key received\n");
-			return -1;
-		}
-	}
-
-	if (priv_data_len + mpa_hdr_len > len) {
-		i40iw_pr_err("ietf buffer len(%x + %x != %x)\n",
-			     priv_data_len, mpa_hdr_len, len);
-		return -1;
-	}
-	if (len > MAX_CM_BUFFER) {
-		i40iw_pr_err("ietf buffer large len = %d\n", len);
-		return -1;
-	}
-
-	switch (mpa_frame->rev) {
-	case IETF_MPA_V2:{
-			u16 ird_size;
-			u16 ord_size;
-			u16 ctrl_ord;
-			u16 ctrl_ird;
-
-			mpa_v2_frame = (struct ietf_mpa_v2 *)buffer;
-			mpa_hdr_len += IETF_RTR_MSG_SIZE;
-			rtr_msg = &mpa_v2_frame->rtr_msg;
-
-			/* parse rtr message */
-			ctrl_ord = ntohs(rtr_msg->ctrl_ord);
-			ctrl_ird = ntohs(rtr_msg->ctrl_ird);
-			ird_size = ctrl_ird & IETF_NO_IRD_ORD;
-			ord_size = ctrl_ord & IETF_NO_IRD_ORD;
-
-			if (!(ctrl_ird & IETF_PEER_TO_PEER))
-				return -1;
-
-			if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD) {
-				cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD;
-				goto negotiate_done;
-			}
-
-			if (cm_node->state != I40IW_CM_STATE_MPAREQ_SENT) {
-				/* responder */
-				if (!ord_size && (ctrl_ord & IETF_RDMA0_READ))
-					cm_node->ird_size = 1;
-				if (cm_node->ord_size > ird_size)
-					cm_node->ord_size = ird_size;
-			} else {
-				/* initiator */
-				if (!ird_size && (ctrl_ord & IETF_RDMA0_READ))
-					return -1;
-				if (cm_node->ord_size > ird_size)
-					cm_node->ord_size = ird_size;
-
-				if (cm_node->ird_size < ord_size)
-					/* no resources available */
-					return -1;
-			}
-
-negotiate_done:
-			if (ctrl_ord & IETF_RDMA0_READ)
-				cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
-			else if (ctrl_ord & IETF_RDMA0_WRITE)
-				cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO;
-			else	/* Not supported RDMA0 operation */
-				return -1;
-			i40iw_debug(cm_node->dev, I40IW_DEBUG_CM,
-				    "MPAV2: Negotiated ORD: %d, IRD: %d\n",
-				    cm_node->ord_size, cm_node->ird_size);
-			break;
-		}
-		break;
-	case IETF_MPA_V1:
-	default:
-		break;
-	}
-
-	memcpy(cm_node->pdata_buf, buffer + mpa_hdr_len, priv_data_len);
-	cm_node->pdata.size = priv_data_len;
-
-	if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT)
-		*type = I40IW_MPA_REQUEST_REJECT;
-
-	if (mpa_frame->flags & IETF_MPA_FLAGS_MARKERS)
-		cm_node->snd_mark_en = true;
-
-	return 0;
-}
-
-/**
- * i40iw_schedule_cm_timer
- * @cm_node: connection's node
- * @sqbuf: buffer to send
- * @type: if it is send or close
- * @send_retrans: if rexmits to be done
- * @close_when_complete: is cm_node to be removed
- *
- * note - cm_node needs to be protected before calling this. Encase in:
- *		i40iw_rem_ref_cm_node(cm_core, cm_node);
- *		i40iw_schedule_cm_timer(...)
- *		atomic_inc(&cm_node->ref_count);
- */
-int i40iw_schedule_cm_timer(struct i40iw_cm_node *cm_node,
-			    struct i40iw_puda_buf *sqbuf,
-			    enum i40iw_timer_type type,
-			    int send_retrans,
-			    int close_when_complete)
-{
-	struct i40iw_sc_vsi *vsi = &cm_node->iwdev->vsi;
-	struct i40iw_cm_core *cm_core = cm_node->cm_core;
-	struct i40iw_timer_entry *new_send;
-	int ret = 0;
-	u32 was_timer_set;
-	unsigned long flags;
-
-	new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC);
-	if (!new_send) {
-		if (type != I40IW_TIMER_TYPE_CLOSE)
-			i40iw_free_sqbuf(vsi, (void *)sqbuf);
-		return -ENOMEM;
-	}
-	new_send->retrycount = I40IW_DEFAULT_RETRYS;
-	new_send->retranscount = I40IW_DEFAULT_RETRANS;
-	new_send->sqbuf = sqbuf;
-	new_send->timetosend = jiffies;
-	new_send->type = type;
-	new_send->send_retrans = send_retrans;
-	new_send->close_when_complete = close_when_complete;
-
-	if (type == I40IW_TIMER_TYPE_CLOSE) {
-		new_send->timetosend += (HZ / 10);
-		if (cm_node->close_entry) {
-			kfree(new_send);
-			i40iw_pr_err("already close entry\n");
-			return -EINVAL;
-		}
-		cm_node->close_entry = new_send;
-	}
-
-	if (type == I40IW_TIMER_TYPE_SEND) {
-		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-		cm_node->send_entry = new_send;
-		atomic_inc(&cm_node->ref_count);
-		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-		new_send->timetosend = jiffies + I40IW_RETRY_TIMEOUT;
-
-		atomic_inc(&sqbuf->refcount);
-		i40iw_puda_send_buf(vsi->ilq, sqbuf);
-		if (!send_retrans) {
-			i40iw_cleanup_retrans_entry(cm_node);
-			if (close_when_complete)
-				i40iw_rem_ref_cm_node(cm_node);
-			return ret;
-		}
-	}
-
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	was_timer_set = timer_pending(&cm_core->tcp_timer);
-
-	if (!was_timer_set) {
-		cm_core->tcp_timer.expires = new_send->timetosend;
-		add_timer(&cm_core->tcp_timer);
-	}
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	return ret;
-}
-
-/**
- * i40iw_retrans_expired - Could not rexmit the packet
- * @cm_node: connection's node
- */
-static void i40iw_retrans_expired(struct i40iw_cm_node *cm_node)
-{
-	struct iw_cm_id *cm_id = cm_node->cm_id;
-	enum i40iw_cm_node_state state = cm_node->state;
-
-	cm_node->state = I40IW_CM_STATE_CLOSED;
-	switch (state) {
-	case I40IW_CM_STATE_SYN_RCVD:
-	case I40IW_CM_STATE_CLOSING:
-		i40iw_rem_ref_cm_node(cm_node);
-		break;
-	case I40IW_CM_STATE_FIN_WAIT1:
-	case I40IW_CM_STATE_LAST_ACK:
-		if (cm_node->cm_id)
-			cm_id->rem_ref(cm_id);
-		i40iw_send_reset(cm_node);
-		break;
-	default:
-		atomic_inc(&cm_node->ref_count);
-		i40iw_send_reset(cm_node);
-		i40iw_create_event(cm_node, I40IW_CM_EVENT_ABORTED);
-		break;
-	}
-}
-
-/**
- * i40iw_handle_close_entry - for handling retry/timeouts
- * @cm_node: connection's node
- * @rem_node: flag for remove cm_node
- */
-static void i40iw_handle_close_entry(struct i40iw_cm_node *cm_node, u32 rem_node)
-{
-	struct i40iw_timer_entry *close_entry = cm_node->close_entry;
-	struct iw_cm_id *cm_id = cm_node->cm_id;
-	struct i40iw_qp *iwqp;
-	unsigned long flags;
-
-	if (!close_entry)
-		return;
-	iwqp = (struct i40iw_qp *)close_entry->sqbuf;
-	if (iwqp) {
-		spin_lock_irqsave(&iwqp->lock, flags);
-		if (iwqp->cm_id) {
-			iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSED;
-			iwqp->hw_iwarp_state = I40IW_QP_STATE_ERROR;
-			iwqp->last_aeq = I40IW_AE_RESET_SENT;
-			iwqp->ibqp_state = IB_QPS_ERR;
-			spin_unlock_irqrestore(&iwqp->lock, flags);
-			i40iw_cm_disconn(iwqp);
-		} else {
-			spin_unlock_irqrestore(&iwqp->lock, flags);
-		}
-	} else if (rem_node) {
-		/* TIME_WAIT state */
-		i40iw_rem_ref_cm_node(cm_node);
-	}
-	if (cm_id)
-		cm_id->rem_ref(cm_id);
-	kfree(close_entry);
-	cm_node->close_entry = NULL;
-}
-
-/**
- * i40iw_build_timer_list - Add cm_nodes to timer list
- * @timer_list: ptr to timer list
- * @hte: ptr to accelerated or non-accelerated list
- */
-static void i40iw_build_timer_list(struct list_head *timer_list,
-				   struct list_head *hte)
-{
-	struct i40iw_cm_node *cm_node;
-	struct list_head *list_core_temp, *list_node;
-
-	list_for_each_safe(list_node, list_core_temp, hte) {
-		cm_node = container_of(list_node, struct i40iw_cm_node, list);
-		if (cm_node->close_entry || cm_node->send_entry) {
-			atomic_inc(&cm_node->ref_count);
-			list_add(&cm_node->timer_entry, timer_list);
-		}
-	}
-}
-
-/**
- * i40iw_cm_timer_tick - system's timer expired callback
- * @t: Timer instance to fetch the cm_core pointer from
- */
-static void i40iw_cm_timer_tick(struct timer_list *t)
-{
-	unsigned long nexttimeout = jiffies + I40IW_LONG_TIME;
-	struct i40iw_cm_node *cm_node;
-	struct i40iw_timer_entry *send_entry, *close_entry;
-	struct list_head *list_core_temp;
-	struct i40iw_sc_vsi *vsi;
-	struct list_head *list_node;
-	struct i40iw_cm_core *cm_core = from_timer(cm_core, t, tcp_timer);
-	u32 settimer = 0;
-	unsigned long timetosend;
-	unsigned long flags;
-
-	struct list_head timer_list;
-
-	INIT_LIST_HEAD(&timer_list);
-
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	i40iw_build_timer_list(&timer_list, &cm_core->non_accelerated_list);
-	i40iw_build_timer_list(&timer_list, &cm_core->accelerated_list);
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	list_for_each_safe(list_node, list_core_temp, &timer_list) {
-		cm_node = container_of(list_node,
-				       struct i40iw_cm_node,
-				       timer_entry);
-		close_entry = cm_node->close_entry;
-
-		if (close_entry) {
-			if (time_after(close_entry->timetosend, jiffies)) {
-				if (nexttimeout > close_entry->timetosend ||
-				    !settimer) {
-					nexttimeout = close_entry->timetosend;
-					settimer = 1;
-				}
-			} else {
-				i40iw_handle_close_entry(cm_node, 1);
-			}
-		}
-
-		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-
-		send_entry = cm_node->send_entry;
-		if (!send_entry)
-			goto done;
-		if (time_after(send_entry->timetosend, jiffies)) {
-			if (cm_node->state != I40IW_CM_STATE_OFFLOADED) {
-				if ((nexttimeout > send_entry->timetosend) ||
-				    !settimer) {
-					nexttimeout = send_entry->timetosend;
-					settimer = 1;
-				}
-			} else {
-				i40iw_free_retrans_entry(cm_node);
-			}
-			goto done;
-		}
-
-		if ((cm_node->state == I40IW_CM_STATE_OFFLOADED) ||
-		    (cm_node->state == I40IW_CM_STATE_CLOSED)) {
-			i40iw_free_retrans_entry(cm_node);
-			goto done;
-		}
-
-		if (!send_entry->retranscount || !send_entry->retrycount) {
-			i40iw_free_retrans_entry(cm_node);
-
-			spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-			i40iw_retrans_expired(cm_node);
-			cm_node->state = I40IW_CM_STATE_CLOSED;
-			spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-			goto done;
-		}
-		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-
-		vsi = &cm_node->iwdev->vsi;
-
-		if (!cm_node->ack_rcvd) {
-			atomic_inc(&send_entry->sqbuf->refcount);
-			i40iw_puda_send_buf(vsi->ilq, send_entry->sqbuf);
-			cm_node->cm_core->stats_pkt_retrans++;
-		}
-		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-		if (send_entry->send_retrans) {
-			send_entry->retranscount--;
-			timetosend = (I40IW_RETRY_TIMEOUT <<
-				      (I40IW_DEFAULT_RETRANS -
-				       send_entry->retranscount));
-
-			send_entry->timetosend = jiffies +
-			    min(timetosend, I40IW_MAX_TIMEOUT);
-			if (nexttimeout > send_entry->timetosend || !settimer) {
-				nexttimeout = send_entry->timetosend;
-				settimer = 1;
-			}
-		} else {
-			int close_when_complete;
-
-			close_when_complete = send_entry->close_when_complete;
-			i40iw_debug(cm_node->dev,
-				    I40IW_DEBUG_CM,
-				    "cm_node=%p state=%d\n",
-				    cm_node,
-				    cm_node->state);
-			i40iw_free_retrans_entry(cm_node);
-			if (close_when_complete)
-				i40iw_rem_ref_cm_node(cm_node);
-		}
-done:
-		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-		i40iw_rem_ref_cm_node(cm_node);
-	}
-
-	if (settimer) {
-		spin_lock_irqsave(&cm_core->ht_lock, flags);
-		if (!timer_pending(&cm_core->tcp_timer)) {
-			cm_core->tcp_timer.expires = nexttimeout;
-			add_timer(&cm_core->tcp_timer);
-		}
-		spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-	}
-}
-
-/**
- * i40iw_send_syn - send SYN packet
- * @cm_node: connection's node
- * @sendack: flag to set ACK bit or not
- */
-int i40iw_send_syn(struct i40iw_cm_node *cm_node, u32 sendack)
-{
-	struct i40iw_puda_buf *sqbuf;
-	int flags = SET_SYN;
-	char optionsbuffer[sizeof(struct option_mss) +
-			   sizeof(struct option_windowscale) +
-			   sizeof(struct option_base) + TCP_OPTIONS_PADDING];
-	struct i40iw_kmem_info opts;
-
-	int optionssize = 0;
-	/* Sending MSS option */
-	union all_known_options *options;
-
-	opts.addr = optionsbuffer;
-	if (!cm_node) {
-		i40iw_pr_err("no cm_node\n");
-		return -EINVAL;
-	}
-
-	options = (union all_known_options *)&optionsbuffer[optionssize];
-	options->as_mss.optionnum = OPTION_NUMBER_MSS;
-	options->as_mss.length = sizeof(struct option_mss);
-	options->as_mss.mss = htons(cm_node->tcp_cntxt.mss);
-	optionssize += sizeof(struct option_mss);
-
-	options = (union all_known_options *)&optionsbuffer[optionssize];
-	options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE;
-	options->as_windowscale.length = sizeof(struct option_windowscale);
-	options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale;
-	optionssize += sizeof(struct option_windowscale);
-	options = (union all_known_options *)&optionsbuffer[optionssize];
-	options->as_end = OPTION_NUMBER_END;
-	optionssize += 1;
-
-	if (sendack)
-		flags |= SET_ACK;
-
-	opts.size = optionssize;
-
-	sqbuf = i40iw_form_cm_frame(cm_node, &opts, NULL, NULL, flags);
-	if (!sqbuf) {
-		i40iw_pr_err("no sqbuf\n");
-		return -1;
-	}
-	return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 1, 0);
-}
-
-/**
- * i40iw_send_ack - Send ACK packet
- * @cm_node: connection's node
- */
-static void i40iw_send_ack(struct i40iw_cm_node *cm_node)
-{
-	struct i40iw_puda_buf *sqbuf;
-	struct i40iw_sc_vsi *vsi = &cm_node->iwdev->vsi;
-
-	sqbuf = i40iw_form_cm_frame(cm_node, NULL, NULL, NULL, SET_ACK);
-	if (sqbuf)
-		i40iw_puda_send_buf(vsi->ilq, sqbuf);
-	else
-		i40iw_pr_err("no sqbuf\n");
-}
-
-/**
- * i40iw_send_fin - Send FIN pkt
- * @cm_node: connection's node
- */
-static int i40iw_send_fin(struct i40iw_cm_node *cm_node)
-{
-	struct i40iw_puda_buf *sqbuf;
-
-	sqbuf = i40iw_form_cm_frame(cm_node, NULL, NULL, NULL, SET_ACK | SET_FIN);
-	if (!sqbuf) {
-		i40iw_pr_err("no sqbuf\n");
-		return -1;
-	}
-	return i40iw_schedule_cm_timer(cm_node, sqbuf, I40IW_TIMER_TYPE_SEND, 1, 0);
-}
-
-/**
- * i40iw_find_node - find a cm node that matches the reference cm node
- * @cm_core: cm's core
- * @rem_port: remote tcp port num
- * @rem_addr: remote ip addr
- * @loc_port: local tcp port num
- * @loc_addr: loc ip addr
- * @add_refcnt: flag to increment refcount of cm_node
- * @accelerated_list: flag for accelerated vs non-accelerated list to search
- */
-struct i40iw_cm_node *i40iw_find_node(struct i40iw_cm_core *cm_core,
-				      u16 rem_port,
-				      u32 *rem_addr,
-				      u16 loc_port,
-				      u32 *loc_addr,
-				      bool add_refcnt,
-				      bool accelerated_list)
-{
-	struct list_head *hte;
-	struct i40iw_cm_node *cm_node;
-	unsigned long flags;
-
-	hte = accelerated_list ?
-	      &cm_core->accelerated_list : &cm_core->non_accelerated_list;
-
-	/* walk list and find cm_node associated with this session ID */
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	list_for_each_entry(cm_node, hte, list) {
-		if (!memcmp(cm_node->loc_addr, loc_addr, sizeof(cm_node->loc_addr)) &&
-		    (cm_node->loc_port == loc_port) &&
-		    !memcmp(cm_node->rem_addr, rem_addr, sizeof(cm_node->rem_addr)) &&
-		    (cm_node->rem_port == rem_port)) {
-			if (add_refcnt)
-				atomic_inc(&cm_node->ref_count);
-			spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-			return cm_node;
-		}
-	}
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	/* no owner node */
-	return NULL;
-}
-
-/**
- * i40iw_find_listener - find a cm node listening on this addr-port pair
- * @cm_core: cm's core
- * @dst_port: listener tcp port num
- * @dst_addr: listener ip addr
- * @vlan_id: vlan id for the given address
- * @listener_state: state to match with listen node's
- */
-static struct i40iw_cm_listener *i40iw_find_listener(
-						     struct i40iw_cm_core *cm_core,
-						     u32 *dst_addr,
-						     u16 dst_port,
-						     u16 vlan_id,
-						     enum i40iw_cm_listener_state
-						     listener_state)
-{
-	struct i40iw_cm_listener *listen_node;
-	static const u32 ip_zero[4] = { 0, 0, 0, 0 };
-	u32 listen_addr[4];
-	u16 listen_port;
-	unsigned long flags;
-
-	/* walk list and find cm_node associated with this session ID */
-	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-	list_for_each_entry(listen_node, &cm_core->listen_nodes, list) {
-		memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr));
-		listen_port = listen_node->loc_port;
-		/* compare node pair, return node handle if a match */
-		if ((!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) ||
-		     !memcmp(listen_addr, ip_zero, sizeof(listen_addr))) &&
-		     (listen_port == dst_port) &&
-		     (listener_state & listen_node->listener_state)) {
-			atomic_inc(&listen_node->ref_count);
-			spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-			return listen_node;
-		}
-	}
-	spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-	return NULL;
-}
-
-/**
- * i40iw_add_hte_node - add a cm node to the hash table
- * @cm_core: cm's core
- * @cm_node: connection's node
- */
-static void i40iw_add_hte_node(struct i40iw_cm_core *cm_core,
-			       struct i40iw_cm_node *cm_node)
-{
-	unsigned long flags;
-
-	if (!cm_node || !cm_core) {
-		i40iw_pr_err("cm_node or cm_core == NULL\n");
-		return;
-	}
-
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	list_add_tail(&cm_node->list, &cm_core->non_accelerated_list);
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-}
-
-/**
- * i40iw_find_port - find port that matches reference port
- * @hte: ptr to accelerated or non-accelerated list
- * @port: port number to locate
- */
-static bool i40iw_find_port(struct list_head *hte, u16 port)
-{
-	struct i40iw_cm_node *cm_node;
-
-	list_for_each_entry(cm_node, hte, list) {
-		if (cm_node->loc_port == port)
-			return true;
-	}
-	return false;
-}
-
-/**
- * i40iw_port_in_use - determine if port is in use
- * @cm_core: cm's core
- * @port: port number
- */
-bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port)
-{
-	struct i40iw_cm_listener *listen_node;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	if (i40iw_find_port(&cm_core->accelerated_list, port) ||
-	    i40iw_find_port(&cm_core->non_accelerated_list, port)) {
-		spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-		return true;
-	}
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-	list_for_each_entry(listen_node, &cm_core->listen_nodes, list) {
-		if (listen_node->loc_port == port) {
-			spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-			return true;
-		}
-	}
-	spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-
-	return false;
-}
-
-/**
- * i40iw_del_multiple_qhash - Remove qhash and child listens
- * @iwdev: iWarp device
- * @cm_info: CM info for parent listen node
- * @cm_parent_listen_node: The parent listen node
- */
-static enum i40iw_status_code i40iw_del_multiple_qhash(
-						       struct i40iw_device *iwdev,
-						       struct i40iw_cm_info *cm_info,
-						       struct i40iw_cm_listener *cm_parent_listen_node)
-{
-	struct i40iw_cm_listener *child_listen_node;
-	enum i40iw_status_code ret = I40IW_ERR_CONFIG;
-	struct list_head *pos, *tpos;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iwdev->cm_core.listen_list_lock, flags);
-	list_for_each_safe(pos, tpos, &cm_parent_listen_node->child_listen_list) {
-		child_listen_node = list_entry(pos, struct i40iw_cm_listener, child_listen_list);
-		if (child_listen_node->ipv4)
-			i40iw_debug(&iwdev->sc_dev,
-				    I40IW_DEBUG_CM,
-				    "removing child listen for IP=%pI4, port=%d, vlan=%d\n",
-				    child_listen_node->loc_addr,
-				    child_listen_node->loc_port,
-				    child_listen_node->vlan_id);
-		else
-			i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM,
-				    "removing child listen for IP=%pI6, port=%d, vlan=%d\n",
-				    child_listen_node->loc_addr,
-				    child_listen_node->loc_port,
-				    child_listen_node->vlan_id);
-		list_del(pos);
-		memcpy(cm_info->loc_addr, child_listen_node->loc_addr,
-		       sizeof(cm_info->loc_addr));
-		cm_info->vlan_id = child_listen_node->vlan_id;
-		if (child_listen_node->qhash_set) {
-			ret = i40iw_manage_qhash(iwdev, cm_info,
-						 I40IW_QHASH_TYPE_TCP_SYN,
-						 I40IW_QHASH_MANAGE_TYPE_DELETE,
-						 NULL, false);
-			child_listen_node->qhash_set = false;
-		} else {
-			ret = I40IW_SUCCESS;
-		}
-		i40iw_debug(&iwdev->sc_dev,
-			    I40IW_DEBUG_CM,
-			    "freed pointer = %p\n",
-			    child_listen_node);
-		kfree(child_listen_node);
-		cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++;
-	}
-	spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags);
-
-	return ret;
-}
-
-/**
- * i40iw_netdev_vlan_ipv6 - Gets the netdev and vlan
- * @addr: local IPv6 address
- * @vlan_id: vlan id for the given IPv6 address
- *
- * Returns the net_device of the IPv6 address and also sets the
- * vlan id for that address.
- */
-static struct net_device *i40iw_netdev_vlan_ipv6(u32 *addr, u16 *vlan_id)
-{
-	struct net_device *ip_dev = NULL;
-	struct in6_addr laddr6;
-
-	if (!IS_ENABLED(CONFIG_IPV6))
-		return NULL;
-	i40iw_copy_ip_htonl(laddr6.in6_u.u6_addr32, addr);
-	if (vlan_id)
-		*vlan_id = I40IW_NO_VLAN;
-	rcu_read_lock();
-	for_each_netdev_rcu(&init_net, ip_dev) {
-		if (ipv6_chk_addr(&init_net, &laddr6, ip_dev, 1)) {
-			if (vlan_id)
-				*vlan_id = rdma_vlan_dev_vlan_id(ip_dev);
-			break;
-		}
-	}
-	rcu_read_unlock();
-	return ip_dev;
-}
-
-/**
- * i40iw_get_vlan_ipv4 - Returns the vlan_id for IPv4 address
- * @addr: local IPv4 address
- */
-static u16 i40iw_get_vlan_ipv4(u32 *addr)
-{
-	struct net_device *netdev;
-	u16 vlan_id = I40IW_NO_VLAN;
-
-	netdev = ip_dev_find(&init_net, htonl(addr[0]));
-	if (netdev) {
-		vlan_id = rdma_vlan_dev_vlan_id(netdev);
-		dev_put(netdev);
-	}
-	return vlan_id;
-}
-
-/**
- * i40iw_add_mqh_6 - Adds multiple qhashes for IPv6
- * @iwdev: iWarp device
- * @cm_info: CM info for parent listen node
- * @cm_parent_listen_node: The parent listen node
- *
- * Adds a qhash and a child listen node for every IPv6 address
- * on the adapter and adds the associated qhash filter
- */
-static enum i40iw_status_code i40iw_add_mqh_6(struct i40iw_device *iwdev,
-					      struct i40iw_cm_info *cm_info,
-					      struct i40iw_cm_listener *cm_parent_listen_node)
-{
-	struct net_device *ip_dev;
-	struct inet6_dev *idev;
-	struct inet6_ifaddr *ifp, *tmp;
-	enum i40iw_status_code ret = 0;
-	struct i40iw_cm_listener *child_listen_node;
-	unsigned long flags;
-
-	rtnl_lock();
-	for_each_netdev(&init_net, ip_dev) {
-		if ((((rdma_vlan_dev_vlan_id(ip_dev) < I40IW_NO_VLAN) &&
-		      (rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev)) ||
-		     (ip_dev == iwdev->netdev)) && (ip_dev->flags & IFF_UP)) {
-			idev = __in6_dev_get(ip_dev);
-			if (!idev) {
-				i40iw_pr_err("idev == NULL\n");
-				break;
-			}
-			list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
-				i40iw_debug(&iwdev->sc_dev,
-					    I40IW_DEBUG_CM,
-					    "IP=%pI6, vlan_id=%d, MAC=%pM\n",
-					    &ifp->addr,
-					    rdma_vlan_dev_vlan_id(ip_dev),
-					    ip_dev->dev_addr);
-				child_listen_node =
-					kzalloc(sizeof(*child_listen_node), GFP_ATOMIC);
-				i40iw_debug(&iwdev->sc_dev,
-					    I40IW_DEBUG_CM,
-					    "Allocating child listener %p\n",
-					    child_listen_node);
-				if (!child_listen_node) {
-					ret = I40IW_ERR_NO_MEMORY;
-					goto exit;
-				}
-				cm_info->vlan_id = rdma_vlan_dev_vlan_id(ip_dev);
-				cm_parent_listen_node->vlan_id = cm_info->vlan_id;
-
-				memcpy(child_listen_node, cm_parent_listen_node,
-				       sizeof(*child_listen_node));
-
-				i40iw_copy_ip_ntohl(child_listen_node->loc_addr,
-						    ifp->addr.in6_u.u6_addr32);
-				memcpy(cm_info->loc_addr, child_listen_node->loc_addr,
-				       sizeof(cm_info->loc_addr));
-
-				ret = i40iw_manage_qhash(iwdev, cm_info,
-							 I40IW_QHASH_TYPE_TCP_SYN,
-							 I40IW_QHASH_MANAGE_TYPE_ADD,
-							 NULL, true);
-				if (!ret) {
-					child_listen_node->qhash_set = true;
-					spin_lock_irqsave(&iwdev->cm_core.listen_list_lock, flags);
-					list_add(&child_listen_node->child_listen_list,
-						 &cm_parent_listen_node->child_listen_list);
-					spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags);
-					cm_parent_listen_node->cm_core->stats_listen_nodes_created++;
-				} else {
-					kfree(child_listen_node);
-				}
-			}
-		}
-	}
-exit:
-	rtnl_unlock();
-	return ret;
-}
-
-/**
- * i40iw_add_mqh_4 - Adds multiple qhashes for IPv4
- * @iwdev: iWarp device
- * @cm_info: CM info for parent listen node
- * @cm_parent_listen_node: The parent listen node
- *
- * Adds a qhash and a child listen node for every IPv4 address
- * on the adapter and adds the associated qhash filter
- */
-static enum i40iw_status_code i40iw_add_mqh_4(
-				struct i40iw_device *iwdev,
-				struct i40iw_cm_info *cm_info,
-				struct i40iw_cm_listener *cm_parent_listen_node)
-{
-	struct net_device *dev;
-	struct in_device *idev;
-	struct i40iw_cm_listener *child_listen_node;
-	enum i40iw_status_code ret = 0;
-	unsigned long flags;
-
-	rtnl_lock();
-	for_each_netdev(&init_net, dev) {
-		if ((((rdma_vlan_dev_vlan_id(dev) < I40IW_NO_VLAN) &&
-		      (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
-		    (dev == iwdev->netdev)) && (dev->flags & IFF_UP)) {
-			const struct in_ifaddr *ifa;
-
-			idev = in_dev_get(dev);
-
-			in_dev_for_each_ifa_rtnl(ifa, idev) {
-				i40iw_debug(&iwdev->sc_dev,
-					    I40IW_DEBUG_CM,
-					    "Allocating child CM Listener forIP=%pI4, vlan_id=%d, MAC=%pM\n",
-					    &ifa->ifa_address,
-					    rdma_vlan_dev_vlan_id(dev),
-					    dev->dev_addr);
-				child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_KERNEL);
-				cm_parent_listen_node->cm_core->stats_listen_nodes_created++;
-				i40iw_debug(&iwdev->sc_dev,
-					    I40IW_DEBUG_CM,
-					    "Allocating child listener %p\n",
-					    child_listen_node);
-				if (!child_listen_node) {
-					in_dev_put(idev);
-					ret = I40IW_ERR_NO_MEMORY;
-					goto exit;
-				}
-				cm_info->vlan_id = rdma_vlan_dev_vlan_id(dev);
-				cm_parent_listen_node->vlan_id = cm_info->vlan_id;
-				memcpy(child_listen_node,
-				       cm_parent_listen_node,
-				       sizeof(*child_listen_node));
-
-				child_listen_node->loc_addr[0] = ntohl(ifa->ifa_address);
-				memcpy(cm_info->loc_addr, child_listen_node->loc_addr,
-				       sizeof(cm_info->loc_addr));
-
-				ret = i40iw_manage_qhash(iwdev,
-							 cm_info,
-							 I40IW_QHASH_TYPE_TCP_SYN,
-							 I40IW_QHASH_MANAGE_TYPE_ADD,
-							 NULL,
-							 true);
-				if (!ret) {
-					child_listen_node->qhash_set = true;
-					spin_lock_irqsave(&iwdev->cm_core.listen_list_lock, flags);
-					list_add(&child_listen_node->child_listen_list,
-						 &cm_parent_listen_node->child_listen_list);
-					spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags);
-				} else {
-					kfree(child_listen_node);
-					cm_parent_listen_node->cm_core->stats_listen_nodes_created--;
-				}
-			}
-
-			in_dev_put(idev);
-		}
-	}
-exit:
-	rtnl_unlock();
-	return ret;
-}
-
-/**
- * i40iw_dec_refcnt_listen - delete listener and associated cm nodes
- * @cm_core: cm's core
- * @listener: passive connection's listener
- * @free_hanging_nodes: to free associated cm_nodes
- * @apbvt_del: flag to delete the apbvt
- */
-static int i40iw_dec_refcnt_listen(struct i40iw_cm_core *cm_core,
-				   struct i40iw_cm_listener *listener,
-				   int free_hanging_nodes, bool apbvt_del)
-{
-	int ret = -EINVAL;
-	int err = 0;
-	struct list_head *list_pos;
-	struct list_head *list_temp;
-	struct i40iw_cm_node *cm_node;
-	struct list_head reset_list;
-	struct i40iw_cm_info nfo;
-	struct i40iw_cm_node *loopback;
-	enum i40iw_cm_node_state old_state;
-	unsigned long flags;
-
-	/* free non-accelerated child nodes for this listener */
-	INIT_LIST_HEAD(&reset_list);
-	if (free_hanging_nodes) {
-		spin_lock_irqsave(&cm_core->ht_lock, flags);
-		list_for_each_safe(list_pos,
-				   list_temp, &cm_core->non_accelerated_list) {
-			cm_node = container_of(list_pos, struct i40iw_cm_node, list);
-			if ((cm_node->listener == listener) &&
-			    !cm_node->accelerated) {
-				atomic_inc(&cm_node->ref_count);
-				list_add(&cm_node->reset_entry, &reset_list);
-			}
-		}
-		spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-	}
-
-	list_for_each_safe(list_pos, list_temp, &reset_list) {
-		cm_node = container_of(list_pos, struct i40iw_cm_node, reset_entry);
-		loopback = cm_node->loopbackpartner;
-		if (cm_node->state >= I40IW_CM_STATE_FIN_WAIT1) {
-			i40iw_rem_ref_cm_node(cm_node);
-		} else {
-			if (!loopback) {
-				i40iw_cleanup_retrans_entry(cm_node);
-				err = i40iw_send_reset(cm_node);
-				if (err) {
-					cm_node->state = I40IW_CM_STATE_CLOSED;
-					i40iw_pr_err("send reset\n");
-				} else {
-					old_state = cm_node->state;
-					cm_node->state = I40IW_CM_STATE_LISTENER_DESTROYED;
-					if (old_state != I40IW_CM_STATE_MPAREQ_RCVD)
-						i40iw_rem_ref_cm_node(cm_node);
-				}
-			} else {
-				struct i40iw_cm_event event;
-
-				event.cm_node = loopback;
-				memcpy(event.cm_info.rem_addr,
-				       loopback->rem_addr, sizeof(event.cm_info.rem_addr));
-				memcpy(event.cm_info.loc_addr,
-				       loopback->loc_addr, sizeof(event.cm_info.loc_addr));
-				event.cm_info.rem_port = loopback->rem_port;
-				event.cm_info.loc_port = loopback->loc_port;
-				event.cm_info.cm_id = loopback->cm_id;
-				event.cm_info.ipv4 = loopback->ipv4;
-				atomic_inc(&loopback->ref_count);
-				loopback->state = I40IW_CM_STATE_CLOSED;
-				i40iw_event_connect_error(&event);
-				cm_node->state = I40IW_CM_STATE_LISTENER_DESTROYED;
-				i40iw_rem_ref_cm_node(cm_node);
-			}
-		}
-	}
-
-	if (!atomic_dec_return(&listener->ref_count)) {
-		spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-		list_del(&listener->list);
-		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-
-		if (listener->iwdev) {
-			if (apbvt_del)
-				i40iw_manage_apbvt(listener->iwdev,
-						   listener->loc_port,
-						   I40IW_MANAGE_APBVT_DEL);
-
-			memcpy(nfo.loc_addr, listener->loc_addr, sizeof(nfo.loc_addr));
-			nfo.loc_port = listener->loc_port;
-			nfo.ipv4 = listener->ipv4;
-			nfo.vlan_id = listener->vlan_id;
-			nfo.user_pri = listener->user_pri;
-
-			if (!list_empty(&listener->child_listen_list)) {
-				i40iw_del_multiple_qhash(listener->iwdev, &nfo, listener);
-			} else {
-				if (listener->qhash_set)
-					i40iw_manage_qhash(listener->iwdev,
-							   &nfo,
-							   I40IW_QHASH_TYPE_TCP_SYN,
-							   I40IW_QHASH_MANAGE_TYPE_DELETE,
-							   NULL,
-							   false);
-			}
-		}
-
-		cm_core->stats_listen_destroyed++;
-		kfree(listener);
-		cm_core->stats_listen_nodes_destroyed++;
-		listener = NULL;
-		ret = 0;
-	}
-
-	if (listener) {
-		if (atomic_read(&listener->pend_accepts_cnt) > 0)
-			i40iw_debug(cm_core->dev,
-				    I40IW_DEBUG_CM,
-				    "%s: listener (%p) pending accepts=%u\n",
-				    __func__,
-				    listener,
-				    atomic_read(&listener->pend_accepts_cnt));
-	}
-
-	return ret;
-}
-
-/**
- * i40iw_cm_del_listen - delete a linstener
- * @cm_core: cm's core
-  * @listener: passive connection's listener
- * @apbvt_del: flag to delete apbvt
- */
-static int i40iw_cm_del_listen(struct i40iw_cm_core *cm_core,
-			       struct i40iw_cm_listener *listener,
-			       bool apbvt_del)
-{
-	listener->listener_state = I40IW_CM_LISTENER_PASSIVE_STATE;
-	listener->cm_id = NULL;	/* going to be destroyed pretty soon */
-	return i40iw_dec_refcnt_listen(cm_core, listener, 1, apbvt_del);
-}
-
-/**
- * i40iw_addr_resolve_neigh - resolve neighbor address
- * @iwdev: iwarp device structure
- * @src_ip: local ip address
- * @dst_ip: remote ip address
- * @arpindex: if there is an arp entry
- */
-static int i40iw_addr_resolve_neigh(struct i40iw_device *iwdev,
-				    u32 src_ip,
-				    u32 dst_ip,
-				    int arpindex)
-{
-	struct rtable *rt;
-	struct neighbour *neigh;
-	int rc = arpindex;
-	__be32 dst_ipaddr = htonl(dst_ip);
-	__be32 src_ipaddr = htonl(src_ip);
-
-	rt = ip_route_output(&init_net, dst_ipaddr, src_ipaddr, 0, 0);
-	if (IS_ERR(rt)) {
-		i40iw_pr_err("ip_route_output\n");
-		return rc;
-	}
-
-	neigh = dst_neigh_lookup(&rt->dst, &dst_ipaddr);
-
-	rcu_read_lock();
-	if (neigh) {
-		if (neigh->nud_state & NUD_VALID) {
-			if (arpindex >= 0) {
-				if (ether_addr_equal(iwdev->arp_table[arpindex].mac_addr,
-						     neigh->ha))
-					/* Mac address same as arp table */
-					goto resolve_neigh_exit;
-				i40iw_manage_arp_cache(iwdev,
-						       iwdev->arp_table[arpindex].mac_addr,
-						       &dst_ip,
-						       true,
-						       I40IW_ARP_DELETE);
-			}
-
-			i40iw_manage_arp_cache(iwdev, neigh->ha, &dst_ip, true, I40IW_ARP_ADD);
-			rc = i40iw_arp_table(iwdev, &dst_ip, true, NULL, I40IW_ARP_RESOLVE);
-		} else {
-			neigh_event_send(neigh, NULL);
-		}
-	}
- resolve_neigh_exit:
-
-	rcu_read_unlock();
-	if (neigh)
-		neigh_release(neigh);
-
-	ip_rt_put(rt);
-	return rc;
-}
-
-/*
- * i40iw_get_dst_ipv6
- */
-static struct dst_entry *i40iw_get_dst_ipv6(struct sockaddr_in6 *src_addr,
-					    struct sockaddr_in6 *dst_addr)
-{
-	struct dst_entry *dst;
-	struct flowi6 fl6;
-
-	memset(&fl6, 0, sizeof(fl6));
-	fl6.daddr = dst_addr->sin6_addr;
-	fl6.saddr = src_addr->sin6_addr;
-	if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL)
-		fl6.flowi6_oif = dst_addr->sin6_scope_id;
-
-	dst = ip6_route_output(&init_net, NULL, &fl6);
-	return dst;
-}
-
-/**
- * i40iw_addr_resolve_neigh_ipv6 - resolve neighbor ipv6 address
- * @iwdev: iwarp device structure
- * @src: source ip address
- * @dest: remote ip address
- * @arpindex: if there is an arp entry
- */
-static int i40iw_addr_resolve_neigh_ipv6(struct i40iw_device *iwdev,
-					 u32 *src,
-					 u32 *dest,
-					 int arpindex)
-{
-	struct neighbour *neigh;
-	int rc = arpindex;
-	struct dst_entry *dst;
-	struct sockaddr_in6 dst_addr;
-	struct sockaddr_in6 src_addr;
-
-	memset(&dst_addr, 0, sizeof(dst_addr));
-	dst_addr.sin6_family = AF_INET6;
-	i40iw_copy_ip_htonl(dst_addr.sin6_addr.in6_u.u6_addr32, dest);
-	memset(&src_addr, 0, sizeof(src_addr));
-	src_addr.sin6_family = AF_INET6;
-	i40iw_copy_ip_htonl(src_addr.sin6_addr.in6_u.u6_addr32, src);
-	dst = i40iw_get_dst_ipv6(&src_addr, &dst_addr);
-	if (!dst || dst->error) {
-		if (dst) {
-			i40iw_pr_err("ip6_route_output returned dst->error = %d\n",
-				     dst->error);
-			dst_release(dst);
-		}
-		return rc;
-	}
-
-	neigh = dst_neigh_lookup(dst, dst_addr.sin6_addr.in6_u.u6_addr32);
-
-	rcu_read_lock();
-	if (neigh) {
-		i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, "dst_neigh_lookup MAC=%pM\n", neigh->ha);
-		if (neigh->nud_state & NUD_VALID) {
-			if (arpindex >= 0) {
-				if (ether_addr_equal
-				    (iwdev->arp_table[arpindex].mac_addr,
-				     neigh->ha)) {
-					/* Mac address same as in arp table */
-					goto resolve_neigh_exit6;
-				}
-				i40iw_manage_arp_cache(iwdev,
-						       iwdev->arp_table[arpindex].mac_addr,
-						       dest,
-						       false,
-						       I40IW_ARP_DELETE);
-			}
-			i40iw_manage_arp_cache(iwdev,
-					       neigh->ha,
-					       dest,
-					       false,
-					       I40IW_ARP_ADD);
-			rc = i40iw_arp_table(iwdev,
-					     dest,
-					     false,
-					     NULL,
-					     I40IW_ARP_RESOLVE);
-		} else {
-			neigh_event_send(neigh, NULL);
-		}
-	}
-
- resolve_neigh_exit6:
-	rcu_read_unlock();
-	if (neigh)
-		neigh_release(neigh);
-	dst_release(dst);
-	return rc;
-}
-
-/**
- * i40iw_ipv4_is_loopback - check if loopback
- * @loc_addr: local addr to compare
- * @rem_addr: remote address
- */
-static bool i40iw_ipv4_is_loopback(u32 loc_addr, u32 rem_addr)
-{
-	return ipv4_is_loopback(htonl(rem_addr)) || (loc_addr == rem_addr);
-}
-
-/**
- * i40iw_ipv6_is_loopback - check if loopback
- * @loc_addr: local addr to compare
- * @rem_addr: remote address
- */
-static bool i40iw_ipv6_is_loopback(u32 *loc_addr, u32 *rem_addr)
-{
-	struct in6_addr raddr6;
-
-	i40iw_copy_ip_htonl(raddr6.in6_u.u6_addr32, rem_addr);
-	return !memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6);
-}
-
-/**
- * i40iw_make_cm_node - create a new instance of a cm node
- * @cm_core: cm's core
- * @iwdev: iwarp device structure
- * @cm_info: quad info for connection
- * @listener: passive connection's listener
- */
-static struct i40iw_cm_node *i40iw_make_cm_node(
-				   struct i40iw_cm_core *cm_core,
-				   struct i40iw_device *iwdev,
-				   struct i40iw_cm_info *cm_info,
-				   struct i40iw_cm_listener *listener)
-{
-	struct i40iw_cm_node *cm_node;
-	int oldarpindex;
-	int arpindex;
-	struct net_device *netdev = iwdev->netdev;
-
-	/* create an hte and cm_node for this instance */
-	cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC);
-	if (!cm_node)
-		return NULL;
-
-	/* set our node specific transport info */
-	cm_node->ipv4 = cm_info->ipv4;
-	cm_node->vlan_id = cm_info->vlan_id;
-	if ((cm_node->vlan_id == I40IW_NO_VLAN) && iwdev->dcb)
-		cm_node->vlan_id = 0;
-	cm_node->tos = cm_info->tos;
-	cm_node->user_pri = cm_info->user_pri;
-	if (listener) {
-		if (listener->tos != cm_info->tos)
-			i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB,
-				    "application TOS[%d] and remote client TOS[%d] mismatch\n",
-				     listener->tos, cm_info->tos);
-		cm_node->tos = max(listener->tos, cm_info->tos);
-		cm_node->user_pri = rt_tos2priority(cm_node->tos);
-		i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB, "listener: TOS:[%d] UP:[%d]\n",
-			    cm_node->tos, cm_node->user_pri);
-	}
-	memcpy(cm_node->loc_addr, cm_info->loc_addr, sizeof(cm_node->loc_addr));
-	memcpy(cm_node->rem_addr, cm_info->rem_addr, sizeof(cm_node->rem_addr));
-	cm_node->loc_port = cm_info->loc_port;
-	cm_node->rem_port = cm_info->rem_port;
-
-	cm_node->mpa_frame_rev = iwdev->mpa_version;
-	cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
-	cm_node->ird_size = I40IW_MAX_IRD_SIZE;
-	cm_node->ord_size = I40IW_MAX_ORD_SIZE;
-
-	cm_node->listener = listener;
-	cm_node->cm_id = cm_info->cm_id;
-	ether_addr_copy(cm_node->loc_mac, netdev->dev_addr);
-	spin_lock_init(&cm_node->retrans_list_lock);
-	cm_node->ack_rcvd = false;
-
-	atomic_set(&cm_node->ref_count, 1);
-	/* associate our parent CM core */
-	cm_node->cm_core = cm_core;
-	cm_node->tcp_cntxt.loc_id = I40IW_CM_DEF_LOCAL_ID;
-	cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE;
-	cm_node->tcp_cntxt.rcv_wnd =
-			I40IW_CM_DEFAULT_RCV_WND_SCALED >> I40IW_CM_DEFAULT_RCV_WND_SCALE;
-	if (cm_node->ipv4) {
-		cm_node->tcp_cntxt.loc_seq_num = secure_tcp_seq(htonl(cm_node->loc_addr[0]),
-							htonl(cm_node->rem_addr[0]),
-							htons(cm_node->loc_port),
-							htons(cm_node->rem_port));
-		cm_node->tcp_cntxt.mss = iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV4;
-	} else if (IS_ENABLED(CONFIG_IPV6)) {
-		__be32 loc[4] = {
-			htonl(cm_node->loc_addr[0]), htonl(cm_node->loc_addr[1]),
-			htonl(cm_node->loc_addr[2]), htonl(cm_node->loc_addr[3])
-		};
-		__be32 rem[4] = {
-			htonl(cm_node->rem_addr[0]), htonl(cm_node->rem_addr[1]),
-			htonl(cm_node->rem_addr[2]), htonl(cm_node->rem_addr[3])
-		};
-		cm_node->tcp_cntxt.loc_seq_num = secure_tcpv6_seq(loc, rem,
-							htons(cm_node->loc_port),
-							htons(cm_node->rem_port));
-		cm_node->tcp_cntxt.mss = iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV6;
-	}
-
-	cm_node->iwdev = iwdev;
-	cm_node->dev = &iwdev->sc_dev;
-
-	if ((cm_node->ipv4 &&
-	     i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) ||
-	     (!cm_node->ipv4 && i40iw_ipv6_is_loopback(cm_node->loc_addr,
-						       cm_node->rem_addr))) {
-		arpindex = i40iw_arp_table(iwdev,
-					   cm_node->rem_addr,
-					   false,
-					   NULL,
-					   I40IW_ARP_RESOLVE);
-	} else {
-		oldarpindex = i40iw_arp_table(iwdev,
-					      cm_node->rem_addr,
-					      false,
-					      NULL,
-					      I40IW_ARP_RESOLVE);
-		if (cm_node->ipv4)
-			arpindex = i40iw_addr_resolve_neigh(iwdev,
-							    cm_info->loc_addr[0],
-							    cm_info->rem_addr[0],
-							    oldarpindex);
-		else if (IS_ENABLED(CONFIG_IPV6))
-			arpindex = i40iw_addr_resolve_neigh_ipv6(iwdev,
-								 cm_info->loc_addr,
-								 cm_info->rem_addr,
-								 oldarpindex);
-		else
-			arpindex = -EINVAL;
-	}
-	if (arpindex < 0) {
-		i40iw_pr_err("cm_node arpindex\n");
-		kfree(cm_node);
-		return NULL;
-	}
-	ether_addr_copy(cm_node->rem_mac, iwdev->arp_table[arpindex].mac_addr);
-	i40iw_add_hte_node(cm_core, cm_node);
-	cm_core->stats_nodes_created++;
-	return cm_node;
-}
-
-/**
- * i40iw_rem_ref_cm_node - destroy an instance of a cm node
- * @cm_node: connection's node
- */
-static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node)
-{
-	struct i40iw_cm_core *cm_core = cm_node->cm_core;
-	struct i40iw_qp *iwqp;
-	struct i40iw_cm_info nfo;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
-	if (atomic_dec_return(&cm_node->ref_count)) {
-		spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
-		return;
-	}
-	list_del(&cm_node->list);
-	spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
-
-	/* if the node is destroyed before connection was accelerated */
-	if (!cm_node->accelerated && cm_node->accept_pend) {
-		pr_err("node destroyed before established\n");
-		atomic_dec(&cm_node->listener->pend_accepts_cnt);
-	}
-	if (cm_node->close_entry)
-		i40iw_handle_close_entry(cm_node, 0);
-	if (cm_node->listener) {
-		i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true);
-	} else {
-		if (cm_node->apbvt_set) {
-			i40iw_manage_apbvt(cm_node->iwdev,
-					   cm_node->loc_port,
-					   I40IW_MANAGE_APBVT_DEL);
-			cm_node->apbvt_set = 0;
-		}
-		i40iw_get_addr_info(cm_node, &nfo);
-		if (cm_node->qhash_set) {
-			i40iw_manage_qhash(cm_node->iwdev,
-					   &nfo,
-					   I40IW_QHASH_TYPE_TCP_ESTABLISHED,
-					   I40IW_QHASH_MANAGE_TYPE_DELETE,
-					   NULL,
-					   false);
-			cm_node->qhash_set = 0;
-		}
-	}
-
-	iwqp = cm_node->iwqp;
-	if (iwqp) {
-		iwqp->cm_node = NULL;
-		i40iw_qp_rem_ref(&iwqp->ibqp);
-		cm_node->iwqp = NULL;
-	} else if (cm_node->qhash_set) {
-		i40iw_get_addr_info(cm_node, &nfo);
-		i40iw_manage_qhash(cm_node->iwdev,
-				   &nfo,
-				   I40IW_QHASH_TYPE_TCP_ESTABLISHED,
-				   I40IW_QHASH_MANAGE_TYPE_DELETE,
-				   NULL,
-				   false);
-		cm_node->qhash_set = 0;
-	}
-
-	cm_node->cm_core->stats_nodes_destroyed++;
-	kfree(cm_node);
-}
-
-/**
- * i40iw_handle_fin_pkt - FIN packet received
- * @cm_node: connection's node
- */
-static void i40iw_handle_fin_pkt(struct i40iw_cm_node *cm_node)
-{
-	u32 ret;
-
-	switch (cm_node->state) {
-	case I40IW_CM_STATE_SYN_RCVD:
-	case I40IW_CM_STATE_SYN_SENT:
-	case I40IW_CM_STATE_ESTABLISHED:
-	case I40IW_CM_STATE_MPAREJ_RCVD:
-		cm_node->tcp_cntxt.rcv_nxt++;
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_LAST_ACK;
-		i40iw_send_fin(cm_node);
-		break;
-	case I40IW_CM_STATE_MPAREQ_SENT:
-		i40iw_create_event(cm_node, I40IW_CM_EVENT_ABORTED);
-		cm_node->tcp_cntxt.rcv_nxt++;
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_CLOSED;
-		atomic_inc(&cm_node->ref_count);
-		i40iw_send_reset(cm_node);
-		break;
-	case I40IW_CM_STATE_FIN_WAIT1:
-		cm_node->tcp_cntxt.rcv_nxt++;
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_CLOSING;
-		i40iw_send_ack(cm_node);
-		/*
-		 * Wait for ACK as this is simultaneous close.
-		 * After we receive ACK, do not send anything.
-		 * Just rm the node.
-		 */
-		break;
-	case I40IW_CM_STATE_FIN_WAIT2:
-		cm_node->tcp_cntxt.rcv_nxt++;
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_TIME_WAIT;
-		i40iw_send_ack(cm_node);
-		ret =
-		    i40iw_schedule_cm_timer(cm_node, NULL, I40IW_TIMER_TYPE_CLOSE, 1, 0);
-		if (ret)
-			i40iw_pr_err("node %p state = %d\n", cm_node, cm_node->state);
-		break;
-	case I40IW_CM_STATE_TIME_WAIT:
-		cm_node->tcp_cntxt.rcv_nxt++;
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_CLOSED;
-		i40iw_rem_ref_cm_node(cm_node);
-		break;
-	case I40IW_CM_STATE_OFFLOADED:
-	default:
-		i40iw_pr_err("bad state node %p state = %d\n", cm_node, cm_node->state);
-		break;
-	}
-}
-
-/**
- * i40iw_handle_rst_pkt - process received RST packet
- * @cm_node: connection's node
- * @rbuf: receive buffer
- */
-static void i40iw_handle_rst_pkt(struct i40iw_cm_node *cm_node,
-				 struct i40iw_puda_buf *rbuf)
-{
-	i40iw_cleanup_retrans_entry(cm_node);
-	switch (cm_node->state) {
-	case I40IW_CM_STATE_SYN_SENT:
-	case I40IW_CM_STATE_MPAREQ_SENT:
-		switch (cm_node->mpa_frame_rev) {
-		case IETF_MPA_V2:
-			cm_node->mpa_frame_rev = IETF_MPA_V1;
-			/* send a syn and goto syn sent state */
-			cm_node->state = I40IW_CM_STATE_SYN_SENT;
-			if (i40iw_send_syn(cm_node, 0))
-				i40iw_active_open_err(cm_node, false);
-			break;
-		case IETF_MPA_V1:
-		default:
-			i40iw_active_open_err(cm_node, false);
-			break;
-		}
-		break;
-	case I40IW_CM_STATE_MPAREQ_RCVD:
-		atomic_inc(&cm_node->passive_state);
-		break;
-	case I40IW_CM_STATE_ESTABLISHED:
-	case I40IW_CM_STATE_SYN_RCVD:
-	case I40IW_CM_STATE_LISTENING:
-		i40iw_pr_err("Bad state state = %d\n", cm_node->state);
-		i40iw_passive_open_err(cm_node, false);
-		break;
-	case I40IW_CM_STATE_OFFLOADED:
-		i40iw_active_open_err(cm_node, false);
-		break;
-	case I40IW_CM_STATE_CLOSED:
-		break;
-	case I40IW_CM_STATE_FIN_WAIT2:
-	case I40IW_CM_STATE_FIN_WAIT1:
-	case I40IW_CM_STATE_LAST_ACK:
-		cm_node->cm_id->rem_ref(cm_node->cm_id);
-		fallthrough;
-	case I40IW_CM_STATE_TIME_WAIT:
-		cm_node->state = I40IW_CM_STATE_CLOSED;
-		i40iw_rem_ref_cm_node(cm_node);
-		break;
-	default:
-		break;
-	}
-}
-
-/**
- * i40iw_handle_rcv_mpa - Process a recv'd mpa buffer
- * @cm_node: connection's node
- * @rbuf: receive buffer
- */
-static void i40iw_handle_rcv_mpa(struct i40iw_cm_node *cm_node,
-				 struct i40iw_puda_buf *rbuf)
-{
-	int ret;
-	int datasize = rbuf->datalen;
-	u8 *dataloc = rbuf->data;
-
-	enum i40iw_cm_event_type type = I40IW_CM_EVENT_UNKNOWN;
-	u32 res_type;
-
-	ret = i40iw_parse_mpa(cm_node, dataloc, &res_type, datasize);
-	if (ret) {
-		if (cm_node->state == I40IW_CM_STATE_MPAREQ_SENT)
-			i40iw_active_open_err(cm_node, true);
-		else
-			i40iw_passive_open_err(cm_node, true);
-		return;
-	}
-
-	switch (cm_node->state) {
-	case I40IW_CM_STATE_ESTABLISHED:
-		if (res_type == I40IW_MPA_REQUEST_REJECT)
-			i40iw_pr_err("state for reject\n");
-		cm_node->state = I40IW_CM_STATE_MPAREQ_RCVD;
-		type = I40IW_CM_EVENT_MPA_REQ;
-		i40iw_send_ack(cm_node);	/* ACK received MPA request */
-		atomic_set(&cm_node->passive_state,
-			   I40IW_PASSIVE_STATE_INDICATED);
-		break;
-	case I40IW_CM_STATE_MPAREQ_SENT:
-		i40iw_cleanup_retrans_entry(cm_node);
-		if (res_type == I40IW_MPA_REQUEST_REJECT) {
-			type = I40IW_CM_EVENT_MPA_REJECT;
-			cm_node->state = I40IW_CM_STATE_MPAREJ_RCVD;
-		} else {
-			type = I40IW_CM_EVENT_CONNECTED;
-			cm_node->state = I40IW_CM_STATE_OFFLOADED;
-		}
-		i40iw_send_ack(cm_node);
-		break;
-	default:
-		pr_err("%s wrong cm_node state =%d\n", __func__, cm_node->state);
-		break;
-	}
-	i40iw_create_event(cm_node, type);
-}
-
-/**
- * i40iw_indicate_pkt_err - Send up err event to cm
- * @cm_node: connection's node
- */
-static void i40iw_indicate_pkt_err(struct i40iw_cm_node *cm_node)
-{
-	switch (cm_node->state) {
-	case I40IW_CM_STATE_SYN_SENT:
-	case I40IW_CM_STATE_MPAREQ_SENT:
-		i40iw_active_open_err(cm_node, true);
-		break;
-	case I40IW_CM_STATE_ESTABLISHED:
-	case I40IW_CM_STATE_SYN_RCVD:
-		i40iw_passive_open_err(cm_node, true);
-		break;
-	case I40IW_CM_STATE_OFFLOADED:
-	default:
-		break;
-	}
-}
-
-/**
- * i40iw_check_syn - Check for error on received syn ack
- * @cm_node: connection's node
- * @tcph: pointer tcp header
- */
-static int i40iw_check_syn(struct i40iw_cm_node *cm_node, struct tcphdr *tcph)
-{
-	int err = 0;
-
-	if (ntohl(tcph->ack_seq) != cm_node->tcp_cntxt.loc_seq_num) {
-		err = 1;
-		i40iw_active_open_err(cm_node, true);
-	}
-	return err;
-}
-
-/**
- * i40iw_check_seq - check seq numbers if OK
- * @cm_node: connection's node
- * @tcph: pointer tcp header
- */
-static int i40iw_check_seq(struct i40iw_cm_node *cm_node, struct tcphdr *tcph)
-{
-	int err = 0;
-	u32 seq;
-	u32 ack_seq;
-	u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num;
-	u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt;
-	u32 rcv_wnd;
-
-	seq = ntohl(tcph->seq);
-	ack_seq = ntohl(tcph->ack_seq);
-	rcv_wnd = cm_node->tcp_cntxt.rcv_wnd;
-	if (ack_seq != loc_seq_num)
-		err = -1;
-	else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd)))
-		err = -1;
-	if (err) {
-		i40iw_pr_err("seq number\n");
-		i40iw_indicate_pkt_err(cm_node);
-	}
-	return err;
-}
-
-/**
- * i40iw_handle_syn_pkt - is for Passive node
- * @cm_node: connection's node
- * @rbuf: receive buffer
- */
-static void i40iw_handle_syn_pkt(struct i40iw_cm_node *cm_node,
-				 struct i40iw_puda_buf *rbuf)
-{
-	struct tcphdr *tcph = (struct tcphdr *)rbuf->tcph;
-	int ret;
-	u32 inc_sequence;
-	int optionsize;
-	struct i40iw_cm_info nfo;
-
-	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-	inc_sequence = ntohl(tcph->seq);
-
-	switch (cm_node->state) {
-	case I40IW_CM_STATE_SYN_SENT:
-	case I40IW_CM_STATE_MPAREQ_SENT:
-		/* Rcvd syn on active open connection */
-		i40iw_active_open_err(cm_node, 1);
-		break;
-	case I40IW_CM_STATE_LISTENING:
-		/* Passive OPEN */
-		if (atomic_read(&cm_node->listener->pend_accepts_cnt) >
-		    cm_node->listener->backlog) {
-			cm_node->cm_core->stats_backlog_drops++;
-			i40iw_passive_open_err(cm_node, false);
-			break;
-		}
-		ret = i40iw_handle_tcp_options(cm_node, tcph, optionsize, 1);
-		if (ret) {
-			i40iw_passive_open_err(cm_node, false);
-			/* drop pkt */
-			break;
-		}
-		cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
-		cm_node->accept_pend = 1;
-		atomic_inc(&cm_node->listener->pend_accepts_cnt);
-
-		cm_node->state = I40IW_CM_STATE_SYN_RCVD;
-		i40iw_get_addr_info(cm_node, &nfo);
-		ret = i40iw_manage_qhash(cm_node->iwdev,
-					 &nfo,
-					 I40IW_QHASH_TYPE_TCP_ESTABLISHED,
-					 I40IW_QHASH_MANAGE_TYPE_ADD,
-					 (void *)cm_node,
-					 false);
-		cm_node->qhash_set = true;
-		break;
-	case I40IW_CM_STATE_CLOSED:
-		i40iw_cleanup_retrans_entry(cm_node);
-		atomic_inc(&cm_node->ref_count);
-		i40iw_send_reset(cm_node);
-		break;
-	case I40IW_CM_STATE_OFFLOADED:
-	case I40IW_CM_STATE_ESTABLISHED:
-	case I40IW_CM_STATE_FIN_WAIT1:
-	case I40IW_CM_STATE_FIN_WAIT2:
-	case I40IW_CM_STATE_MPAREQ_RCVD:
-	case I40IW_CM_STATE_LAST_ACK:
-	case I40IW_CM_STATE_CLOSING:
-	case I40IW_CM_STATE_UNKNOWN:
-	default:
-		break;
-	}
-}
-
-/**
- * i40iw_handle_synack_pkt - Process SYN+ACK packet (active side)
- * @cm_node: connection's node
- * @rbuf: receive buffer
- */
-static void i40iw_handle_synack_pkt(struct i40iw_cm_node *cm_node,
-				    struct i40iw_puda_buf *rbuf)
-{
-	struct tcphdr *tcph = (struct tcphdr *)rbuf->tcph;
-	int ret;
-	u32 inc_sequence;
-	int optionsize;
-
-	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-	inc_sequence = ntohl(tcph->seq);
-	switch (cm_node->state) {
-	case I40IW_CM_STATE_SYN_SENT:
-		i40iw_cleanup_retrans_entry(cm_node);
-		/* active open */
-		if (i40iw_check_syn(cm_node, tcph)) {
-			i40iw_pr_err("check syn fail\n");
-			return;
-		}
-		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-		/* setup options */
-		ret = i40iw_handle_tcp_options(cm_node, tcph, optionsize, 0);
-		if (ret) {
-			i40iw_debug(cm_node->dev,
-				    I40IW_DEBUG_CM,
-				    "cm_node=%p tcp_options failed\n",
-				    cm_node);
-			break;
-		}
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
-		i40iw_send_ack(cm_node);	/* ACK  for the syn_ack */
-		ret = i40iw_send_mpa_request(cm_node);
-		if (ret) {
-			i40iw_debug(cm_node->dev,
-				    I40IW_DEBUG_CM,
-				    "cm_node=%p i40iw_send_mpa_request failed\n",
-				    cm_node);
-			break;
-		}
-		cm_node->state = I40IW_CM_STATE_MPAREQ_SENT;
-		break;
-	case I40IW_CM_STATE_MPAREQ_RCVD:
-		i40iw_passive_open_err(cm_node, true);
-		break;
-	case I40IW_CM_STATE_LISTENING:
-		cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_CLOSED;
-		i40iw_send_reset(cm_node);
-		break;
-	case I40IW_CM_STATE_CLOSED:
-		cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
-		i40iw_cleanup_retrans_entry(cm_node);
-		atomic_inc(&cm_node->ref_count);
-		i40iw_send_reset(cm_node);
-		break;
-	case I40IW_CM_STATE_ESTABLISHED:
-	case I40IW_CM_STATE_FIN_WAIT1:
-	case I40IW_CM_STATE_FIN_WAIT2:
-	case I40IW_CM_STATE_LAST_ACK:
-	case I40IW_CM_STATE_OFFLOADED:
-	case I40IW_CM_STATE_CLOSING:
-	case I40IW_CM_STATE_UNKNOWN:
-	case I40IW_CM_STATE_MPAREQ_SENT:
-	default:
-		break;
-	}
-}
-
-/**
- * i40iw_handle_ack_pkt - process packet with ACK
- * @cm_node: connection's node
- * @rbuf: receive buffer
- */
-static int i40iw_handle_ack_pkt(struct i40iw_cm_node *cm_node,
-				struct i40iw_puda_buf *rbuf)
-{
-	struct tcphdr *tcph = (struct tcphdr *)rbuf->tcph;
-	u32 inc_sequence;
-	int ret = 0;
-	int optionsize;
-	u32 datasize = rbuf->datalen;
-
-	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-
-	if (i40iw_check_seq(cm_node, tcph))
-		return -EINVAL;
-
-	inc_sequence = ntohl(tcph->seq);
-	switch (cm_node->state) {
-	case I40IW_CM_STATE_SYN_RCVD:
-		i40iw_cleanup_retrans_entry(cm_node);
-		ret = i40iw_handle_tcp_options(cm_node, tcph, optionsize, 1);
-		if (ret)
-			break;
-		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-		cm_node->state = I40IW_CM_STATE_ESTABLISHED;
-		if (datasize) {
-			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-			i40iw_handle_rcv_mpa(cm_node, rbuf);
-		}
-		break;
-	case I40IW_CM_STATE_ESTABLISHED:
-		i40iw_cleanup_retrans_entry(cm_node);
-		if (datasize) {
-			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-			i40iw_handle_rcv_mpa(cm_node, rbuf);
-		}
-		break;
-	case I40IW_CM_STATE_MPAREQ_SENT:
-		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-		if (datasize) {
-			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-			cm_node->ack_rcvd = false;
-			i40iw_handle_rcv_mpa(cm_node, rbuf);
-		} else {
-			cm_node->ack_rcvd = true;
-		}
-		break;
-	case I40IW_CM_STATE_LISTENING:
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_CLOSED;
-		i40iw_send_reset(cm_node);
-		break;
-	case I40IW_CM_STATE_CLOSED:
-		i40iw_cleanup_retrans_entry(cm_node);
-		atomic_inc(&cm_node->ref_count);
-		i40iw_send_reset(cm_node);
-		break;
-	case I40IW_CM_STATE_LAST_ACK:
-	case I40IW_CM_STATE_CLOSING:
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_CLOSED;
-		if (!cm_node->accept_pend)
-			cm_node->cm_id->rem_ref(cm_node->cm_id);
-		i40iw_rem_ref_cm_node(cm_node);
-		break;
-	case I40IW_CM_STATE_FIN_WAIT1:
-		i40iw_cleanup_retrans_entry(cm_node);
-		cm_node->state = I40IW_CM_STATE_FIN_WAIT2;
-		break;
-	case I40IW_CM_STATE_SYN_SENT:
-	case I40IW_CM_STATE_FIN_WAIT2:
-	case I40IW_CM_STATE_OFFLOADED:
-	case I40IW_CM_STATE_MPAREQ_RCVD:
-	case I40IW_CM_STATE_UNKNOWN:
-	default:
-		i40iw_cleanup_retrans_entry(cm_node);
-		break;
-	}
-	return ret;
-}
-
-/**
- * i40iw_process_packet - process cm packet
- * @cm_node: connection's node
- * @rbuf: receive buffer
- */
-static void i40iw_process_packet(struct i40iw_cm_node *cm_node,
-				 struct i40iw_puda_buf *rbuf)
-{
-	enum i40iw_tcpip_pkt_type pkt_type = I40IW_PKT_TYPE_UNKNOWN;
-	struct tcphdr *tcph = (struct tcphdr *)rbuf->tcph;
-	u32 fin_set = 0;
-	int ret;
-
-	if (tcph->rst) {
-		pkt_type = I40IW_PKT_TYPE_RST;
-	} else if (tcph->syn) {
-		pkt_type = I40IW_PKT_TYPE_SYN;
-		if (tcph->ack)
-			pkt_type = I40IW_PKT_TYPE_SYNACK;
-	} else if (tcph->ack) {
-		pkt_type = I40IW_PKT_TYPE_ACK;
-	}
-	if (tcph->fin)
-		fin_set = 1;
-
-	switch (pkt_type) {
-	case I40IW_PKT_TYPE_SYN:
-		i40iw_handle_syn_pkt(cm_node, rbuf);
-		break;
-	case I40IW_PKT_TYPE_SYNACK:
-		i40iw_handle_synack_pkt(cm_node, rbuf);
-		break;
-	case I40IW_PKT_TYPE_ACK:
-		ret = i40iw_handle_ack_pkt(cm_node, rbuf);
-		if (fin_set && !ret)
-			i40iw_handle_fin_pkt(cm_node);
-		break;
-	case I40IW_PKT_TYPE_RST:
-		i40iw_handle_rst_pkt(cm_node, rbuf);
-		break;
-	default:
-		if (fin_set &&
-		    (!i40iw_check_seq(cm_node, (struct tcphdr *)rbuf->tcph)))
-			i40iw_handle_fin_pkt(cm_node);
-		break;
-	}
-}
-
-/**
- * i40iw_make_listen_node - create a listen node with params
- * @cm_core: cm's core
- * @iwdev: iwarp device structure
- * @cm_info: quad info for connection
- */
-static struct i40iw_cm_listener *i40iw_make_listen_node(
-					struct i40iw_cm_core *cm_core,
-					struct i40iw_device *iwdev,
-					struct i40iw_cm_info *cm_info)
-{
-	struct i40iw_cm_listener *listener;
-	unsigned long flags;
-
-	/* cannot have multiple matching listeners */
-	listener = i40iw_find_listener(cm_core, cm_info->loc_addr,
-				       cm_info->loc_port,
-				       cm_info->vlan_id,
-				       I40IW_CM_LISTENER_EITHER_STATE);
-	if (listener &&
-	    (listener->listener_state == I40IW_CM_LISTENER_ACTIVE_STATE)) {
-		atomic_dec(&listener->ref_count);
-		i40iw_debug(cm_core->dev,
-			    I40IW_DEBUG_CM,
-			    "Not creating listener since it already exists\n");
-		return NULL;
-	}
-
-	if (!listener) {
-		/* create a CM listen node (1/2 node to compare incoming traffic to) */
-		listener = kzalloc(sizeof(*listener), GFP_KERNEL);
-		if (!listener)
-			return NULL;
-		cm_core->stats_listen_nodes_created++;
-		memcpy(listener->loc_addr, cm_info->loc_addr, sizeof(listener->loc_addr));
-		listener->loc_port = cm_info->loc_port;
-
-		INIT_LIST_HEAD(&listener->child_listen_list);
-
-		atomic_set(&listener->ref_count, 1);
-	} else {
-		listener->reused_node = 1;
-	}
-
-	listener->cm_id = cm_info->cm_id;
-	listener->ipv4 = cm_info->ipv4;
-	listener->vlan_id = cm_info->vlan_id;
-	atomic_set(&listener->pend_accepts_cnt, 0);
-	listener->cm_core = cm_core;
-	listener->iwdev = iwdev;
-
-	listener->backlog = cm_info->backlog;
-	listener->listener_state = I40IW_CM_LISTENER_ACTIVE_STATE;
-
-	if (!listener->reused_node) {
-		spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-		list_add(&listener->list, &cm_core->listen_nodes);
-		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-	}
-
-	return listener;
-}
-
-/**
- * i40iw_create_cm_node - make a connection node with params
- * @cm_core: cm's core
- * @iwdev: iwarp device structure
- * @conn_param: upper layer connection parameters
- * @cm_info: quad info for connection
- */
-static struct i40iw_cm_node *i40iw_create_cm_node(
-					struct i40iw_cm_core *cm_core,
-					struct i40iw_device *iwdev,
-					struct iw_cm_conn_param *conn_param,
-					struct i40iw_cm_info *cm_info)
-{
-	struct i40iw_cm_node *cm_node;
-	struct i40iw_cm_listener *loopback_remotelistener;
-	struct i40iw_cm_node *loopback_remotenode;
-	struct i40iw_cm_info loopback_cm_info;
-
-	u16 private_data_len = conn_param->private_data_len;
-	const void *private_data = conn_param->private_data;
-
-	/* create a CM connection node */
-	cm_node = i40iw_make_cm_node(cm_core, iwdev, cm_info, NULL);
-	if (!cm_node)
-		return ERR_PTR(-ENOMEM);
-	/* set our node side to client (active) side */
-	cm_node->tcp_cntxt.client = 1;
-	cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE;
-
-	i40iw_record_ird_ord(cm_node, conn_param->ird, conn_param->ord);
-
-	if (!memcmp(cm_info->loc_addr, cm_info->rem_addr, sizeof(cm_info->loc_addr))) {
-		loopback_remotelistener = i40iw_find_listener(
-						cm_core,
-						cm_info->rem_addr,
-						cm_node->rem_port,
-						cm_node->vlan_id,
-						I40IW_CM_LISTENER_ACTIVE_STATE);
-		if (!loopback_remotelistener) {
-			i40iw_rem_ref_cm_node(cm_node);
-			return ERR_PTR(-ECONNREFUSED);
-		} else {
-			loopback_cm_info = *cm_info;
-			loopback_cm_info.loc_port = cm_info->rem_port;
-			loopback_cm_info.rem_port = cm_info->loc_port;
-			loopback_cm_info.cm_id = loopback_remotelistener->cm_id;
-			loopback_cm_info.ipv4 = cm_info->ipv4;
-			loopback_remotenode = i40iw_make_cm_node(cm_core,
-								 iwdev,
-								 &loopback_cm_info,
-								 loopback_remotelistener);
-			if (!loopback_remotenode) {
-				i40iw_rem_ref_cm_node(cm_node);
-				return ERR_PTR(-ENOMEM);
-			}
-			cm_core->stats_loopbacks++;
-			loopback_remotenode->loopbackpartner = cm_node;
-			loopback_remotenode->tcp_cntxt.rcv_wscale =
-				I40IW_CM_DEFAULT_RCV_WND_SCALE;
-			cm_node->loopbackpartner = loopback_remotenode;
-			memcpy(loopback_remotenode->pdata_buf, private_data,
-			       private_data_len);
-			loopback_remotenode->pdata.size = private_data_len;
-
-			if (loopback_remotenode->ord_size > cm_node->ird_size)
-				loopback_remotenode->ord_size =
-					cm_node->ird_size;
-
-			cm_node->state = I40IW_CM_STATE_OFFLOADED;
-			cm_node->tcp_cntxt.rcv_nxt =
-				loopback_remotenode->tcp_cntxt.loc_seq_num;
-			loopback_remotenode->tcp_cntxt.rcv_nxt =
-				cm_node->tcp_cntxt.loc_seq_num;
-			cm_node->tcp_cntxt.max_snd_wnd =
-				loopback_remotenode->tcp_cntxt.rcv_wnd;
-			loopback_remotenode->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.rcv_wnd;
-			cm_node->tcp_cntxt.snd_wnd = loopback_remotenode->tcp_cntxt.rcv_wnd;
-			loopback_remotenode->tcp_cntxt.snd_wnd = cm_node->tcp_cntxt.rcv_wnd;
-			cm_node->tcp_cntxt.snd_wscale = loopback_remotenode->tcp_cntxt.rcv_wscale;
-			loopback_remotenode->tcp_cntxt.snd_wscale = cm_node->tcp_cntxt.rcv_wscale;
-		}
-		return cm_node;
-	}
-
-	cm_node->pdata.size = private_data_len;
-	cm_node->pdata.addr = cm_node->pdata_buf;
-
-	memcpy(cm_node->pdata_buf, private_data, private_data_len);
-
-	cm_node->state = I40IW_CM_STATE_SYN_SENT;
-	return cm_node;
-}
-
-/**
- * i40iw_cm_reject - reject and teardown a connection
- * @cm_node: connection's node
- * @pdata: ptr to private data for reject
- * @plen: size of private data
- */
-static int i40iw_cm_reject(struct i40iw_cm_node *cm_node, const void *pdata, u8 plen)
-{
-	int ret = 0;
-	int err;
-	int passive_state;
-	struct iw_cm_id *cm_id = cm_node->cm_id;
-	struct i40iw_cm_node *loopback = cm_node->loopbackpartner;
-
-	if (cm_node->tcp_cntxt.client)
-		return ret;
-	i40iw_cleanup_retrans_entry(cm_node);
-
-	if (!loopback) {
-		passive_state = atomic_inc_return(&cm_node->passive_state);
-		if (passive_state == I40IW_SEND_RESET_EVENT) {
-			cm_node->state = I40IW_CM_STATE_CLOSED;
-			i40iw_rem_ref_cm_node(cm_node);
-		} else {
-			if (cm_node->state == I40IW_CM_STATE_LISTENER_DESTROYED) {
-				i40iw_rem_ref_cm_node(cm_node);
-			} else {
-				ret = i40iw_send_mpa_reject(cm_node, pdata, plen);
-				if (ret) {
-					cm_node->state = I40IW_CM_STATE_CLOSED;
-					err = i40iw_send_reset(cm_node);
-					if (err)
-						i40iw_pr_err("send reset failed\n");
-				} else {
-					cm_id->add_ref(cm_id);
-				}
-			}
-		}
-	} else {
-		cm_node->cm_id = NULL;
-		if (cm_node->state == I40IW_CM_STATE_LISTENER_DESTROYED) {
-			i40iw_rem_ref_cm_node(cm_node);
-			i40iw_rem_ref_cm_node(loopback);
-		} else {
-			ret = i40iw_send_cm_event(loopback,
-						  loopback->cm_id,
-						  IW_CM_EVENT_CONNECT_REPLY,
-						  -ECONNREFUSED);
-			i40iw_rem_ref_cm_node(cm_node);
-			loopback->state = I40IW_CM_STATE_CLOSING;
-
-			cm_id = loopback->cm_id;
-			i40iw_rem_ref_cm_node(loopback);
-			cm_id->rem_ref(cm_id);
-		}
-	}
-
-	return ret;
-}
-
-/**
- * i40iw_cm_close - close of cm connection
- * @cm_node: connection's node
- */
-static int i40iw_cm_close(struct i40iw_cm_node *cm_node)
-{
-	int ret = 0;
-
-	if (!cm_node)
-		return -EINVAL;
-
-	switch (cm_node->state) {
-	case I40IW_CM_STATE_SYN_RCVD:
-	case I40IW_CM_STATE_SYN_SENT:
-	case I40IW_CM_STATE_ONE_SIDE_ESTABLISHED:
-	case I40IW_CM_STATE_ESTABLISHED:
-	case I40IW_CM_STATE_ACCEPTING:
-	case I40IW_CM_STATE_MPAREQ_SENT:
-	case I40IW_CM_STATE_MPAREQ_RCVD:
-		i40iw_cleanup_retrans_entry(cm_node);
-		i40iw_send_reset(cm_node);
-		break;
-	case I40IW_CM_STATE_CLOSE_WAIT:
-		cm_node->state = I40IW_CM_STATE_LAST_ACK;
-		i40iw_send_fin(cm_node);
-		break;
-	case I40IW_CM_STATE_FIN_WAIT1:
-	case I40IW_CM_STATE_FIN_WAIT2:
-	case I40IW_CM_STATE_LAST_ACK:
-	case I40IW_CM_STATE_TIME_WAIT:
-	case I40IW_CM_STATE_CLOSING:
-		ret = -1;
-		break;
-	case I40IW_CM_STATE_LISTENING:
-		i40iw_cleanup_retrans_entry(cm_node);
-		i40iw_send_reset(cm_node);
-		break;
-	case I40IW_CM_STATE_MPAREJ_RCVD:
-	case I40IW_CM_STATE_UNKNOWN:
-	case I40IW_CM_STATE_INITED:
-	case I40IW_CM_STATE_CLOSED:
-	case I40IW_CM_STATE_LISTENER_DESTROYED:
-		i40iw_rem_ref_cm_node(cm_node);
-		break;
-	case I40IW_CM_STATE_OFFLOADED:
-		if (cm_node->send_entry)
-			i40iw_pr_err("send_entry\n");
-		i40iw_rem_ref_cm_node(cm_node);
-		break;
-	}
-	return ret;
-}
-
-/**
- * i40iw_receive_ilq - recv an ETHERNET packet, and process it
- * through CM
- * @vsi: pointer to the vsi structure
- * @rbuf: receive buffer
- */
-void i40iw_receive_ilq(struct i40iw_sc_vsi *vsi, struct i40iw_puda_buf *rbuf)
-{
-	struct i40iw_cm_node *cm_node;
-	struct i40iw_cm_listener *listener;
-	struct iphdr *iph;
-	struct ipv6hdr *ip6h;
-	struct tcphdr *tcph;
-	struct i40iw_cm_info cm_info;
-	struct i40iw_sc_dev *dev = vsi->dev;
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
-	struct vlan_ethhdr *ethh;
-	u16 vtag;
-
-	/* if vlan, then maclen = 18 else 14 */
-	iph = (struct iphdr *)rbuf->iph;
-	memset(&cm_info, 0, sizeof(cm_info));
-
-	i40iw_debug_buf(dev,
-			I40IW_DEBUG_ILQ,
-			"RECEIVE ILQ BUFFER",
-			rbuf->mem.va,
-			rbuf->totallen);
-	ethh = (struct vlan_ethhdr *)rbuf->mem.va;
-
-	if (ethh->h_vlan_proto == htons(ETH_P_8021Q)) {
-		vtag = ntohs(ethh->h_vlan_TCI);
-		cm_info.user_pri = (vtag & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
-		cm_info.vlan_id = vtag & VLAN_VID_MASK;
-		i40iw_debug(cm_core->dev,
-			    I40IW_DEBUG_CM,
-			    "%s vlan_id=%d\n",
-			    __func__,
-			    cm_info.vlan_id);
-	} else {
-		cm_info.vlan_id = I40IW_NO_VLAN;
-	}
-	tcph = (struct tcphdr *)rbuf->tcph;
-
-	if (rbuf->ipv4) {
-		cm_info.loc_addr[0] = ntohl(iph->daddr);
-		cm_info.rem_addr[0] = ntohl(iph->saddr);
-		cm_info.ipv4 = true;
-		cm_info.tos = iph->tos;
-	} else {
-		ip6h = (struct ipv6hdr *)rbuf->iph;
-		i40iw_copy_ip_ntohl(cm_info.loc_addr,
-				    ip6h->daddr.in6_u.u6_addr32);
-		i40iw_copy_ip_ntohl(cm_info.rem_addr,
-				    ip6h->saddr.in6_u.u6_addr32);
-		cm_info.ipv4 = false;
-		cm_info.tos = (ip6h->priority << 4) | (ip6h->flow_lbl[0] >> 4);
-	}
-	cm_info.loc_port = ntohs(tcph->dest);
-	cm_info.rem_port = ntohs(tcph->source);
-	cm_node = i40iw_find_node(cm_core,
-				  cm_info.rem_port,
-				  cm_info.rem_addr,
-				  cm_info.loc_port,
-				  cm_info.loc_addr,
-				  true,
-				  false);
-
-	if (!cm_node) {
-		/* Only type of packet accepted are for */
-		/* the PASSIVE open (syn only) */
-		if (!tcph->syn || tcph->ack)
-			return;
-		listener =
-		    i40iw_find_listener(cm_core,
-					cm_info.loc_addr,
-					cm_info.loc_port,
-					cm_info.vlan_id,
-					I40IW_CM_LISTENER_ACTIVE_STATE);
-		if (!listener) {
-			cm_info.cm_id = NULL;
-			i40iw_debug(cm_core->dev,
-				    I40IW_DEBUG_CM,
-				    "%s no listener found\n",
-				    __func__);
-			return;
-		}
-		cm_info.cm_id = listener->cm_id;
-		cm_node = i40iw_make_cm_node(cm_core, iwdev, &cm_info, listener);
-		if (!cm_node) {
-			i40iw_debug(cm_core->dev,
-				    I40IW_DEBUG_CM,
-				    "%s allocate node failed\n",
-				    __func__);
-			atomic_dec(&listener->ref_count);
-			return;
-		}
-		if (!tcph->rst && !tcph->fin) {
-			cm_node->state = I40IW_CM_STATE_LISTENING;
-		} else {
-			i40iw_rem_ref_cm_node(cm_node);
-			return;
-		}
-		atomic_inc(&cm_node->ref_count);
-	} else if (cm_node->state == I40IW_CM_STATE_OFFLOADED) {
-		i40iw_rem_ref_cm_node(cm_node);
-		return;
-	}
-	i40iw_process_packet(cm_node, rbuf);
-	i40iw_rem_ref_cm_node(cm_node);
-}
-
-/**
- * i40iw_setup_cm_core - allocate a top level instance of a cm
- * core
- * @iwdev: iwarp device structure
- */
-int i40iw_setup_cm_core(struct i40iw_device *iwdev)
-{
-	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
-
-	cm_core->iwdev = iwdev;
-	cm_core->dev = &iwdev->sc_dev;
-
-	INIT_LIST_HEAD(&cm_core->accelerated_list);
-	INIT_LIST_HEAD(&cm_core->non_accelerated_list);
-	INIT_LIST_HEAD(&cm_core->listen_nodes);
-
-	timer_setup(&cm_core->tcp_timer, i40iw_cm_timer_tick, 0);
-
-	spin_lock_init(&cm_core->ht_lock);
-	spin_lock_init(&cm_core->listen_list_lock);
-	spin_lock_init(&cm_core->apbvt_lock);
-
-	cm_core->event_wq = alloc_ordered_workqueue("iwewq",
-						    WQ_MEM_RECLAIM);
-	if (!cm_core->event_wq)
-		goto error;
-
-	cm_core->disconn_wq = alloc_ordered_workqueue("iwdwq",
-						      WQ_MEM_RECLAIM);
-	if (!cm_core->disconn_wq)
-		goto error;
-
-	return 0;
-error:
-	i40iw_cleanup_cm_core(&iwdev->cm_core);
-
-	return -ENOMEM;
-}
-
-/**
- * i40iw_cleanup_cm_core - deallocate a top level instance of a
- * cm core
- * @cm_core: cm's core
- */
-void i40iw_cleanup_cm_core(struct i40iw_cm_core *cm_core)
-{
-	unsigned long flags;
-
-	if (!cm_core)
-		return;
-
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	if (timer_pending(&cm_core->tcp_timer))
-		del_timer_sync(&cm_core->tcp_timer);
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	if (cm_core->event_wq)
-		destroy_workqueue(cm_core->event_wq);
-	if (cm_core->disconn_wq)
-		destroy_workqueue(cm_core->disconn_wq);
-}
-
-/**
- * i40iw_init_tcp_ctx - setup qp context
- * @cm_node: connection's node
- * @tcp_info: offload info for tcp
- * @iwqp: associate qp for the connection
- */
-static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node,
-			       struct i40iw_tcp_offload_info *tcp_info,
-			       struct i40iw_qp *iwqp)
-{
-	tcp_info->ipv4 = cm_node->ipv4;
-	tcp_info->drop_ooo_seg = true;
-	tcp_info->wscale = true;
-	tcp_info->ignore_tcp_opt = true;
-	tcp_info->ignore_tcp_uns_opt = true;
-	tcp_info->no_nagle = false;
-
-	tcp_info->ttl = I40IW_DEFAULT_TTL;
-	tcp_info->rtt_var = cpu_to_le32(I40IW_DEFAULT_RTT_VAR);
-	tcp_info->ss_thresh = cpu_to_le32(I40IW_DEFAULT_SS_THRESH);
-	tcp_info->rexmit_thresh = I40IW_DEFAULT_REXMIT_THRESH;
-
-	tcp_info->tcp_state = I40IW_TCP_STATE_ESTABLISHED;
-	tcp_info->snd_wscale = cm_node->tcp_cntxt.snd_wscale;
-	tcp_info->rcv_wscale = cm_node->tcp_cntxt.rcv_wscale;
-
-	tcp_info->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-	tcp_info->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd);
-	tcp_info->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
-	tcp_info->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-
-	tcp_info->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-	tcp_info->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss);
-	tcp_info->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
-	tcp_info->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-	tcp_info->max_snd_window = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd);
-	tcp_info->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd <<
-					cm_node->tcp_cntxt.rcv_wscale);
-
-	tcp_info->flow_label = 0;
-	tcp_info->snd_mss = cpu_to_le32(((u32)cm_node->tcp_cntxt.mss));
-	if (cm_node->vlan_id <= VLAN_VID_MASK) {
-		tcp_info->insert_vlan_tag = true;
-		tcp_info->vlan_tag = cpu_to_le16(((u16)cm_node->user_pri << I40IW_VLAN_PRIO_SHIFT) |
-						  cm_node->vlan_id);
-	}
-	if (cm_node->ipv4) {
-		tcp_info->src_port = cpu_to_le16(cm_node->loc_port);
-		tcp_info->dst_port = cpu_to_le16(cm_node->rem_port);
-
-		tcp_info->dest_ip_addr3 = cpu_to_le32(cm_node->rem_addr[0]);
-		tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[0]);
-		tcp_info->arp_idx =
-			cpu_to_le16((u16)i40iw_arp_table(
-							 iwqp->iwdev,
-							 &tcp_info->dest_ip_addr3,
-							 true,
-							 NULL,
-							 I40IW_ARP_RESOLVE));
-	} else {
-		tcp_info->src_port = cpu_to_le16(cm_node->loc_port);
-		tcp_info->dst_port = cpu_to_le16(cm_node->rem_port);
-		tcp_info->dest_ip_addr0 = cpu_to_le32(cm_node->rem_addr[0]);
-		tcp_info->dest_ip_addr1 = cpu_to_le32(cm_node->rem_addr[1]);
-		tcp_info->dest_ip_addr2 = cpu_to_le32(cm_node->rem_addr[2]);
-		tcp_info->dest_ip_addr3 = cpu_to_le32(cm_node->rem_addr[3]);
-		tcp_info->local_ipaddr0 = cpu_to_le32(cm_node->loc_addr[0]);
-		tcp_info->local_ipaddr1 = cpu_to_le32(cm_node->loc_addr[1]);
-		tcp_info->local_ipaddr2 = cpu_to_le32(cm_node->loc_addr[2]);
-		tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[3]);
-		tcp_info->arp_idx =
-			cpu_to_le16((u16)i40iw_arp_table(
-							 iwqp->iwdev,
-							 &tcp_info->dest_ip_addr0,
-							 false,
-							 NULL,
-							 I40IW_ARP_RESOLVE));
-	}
-}
-
-/**
- * i40iw_cm_init_tsa_conn - setup qp for RTS
- * @iwqp: associate qp for the connection
- * @cm_node: connection's node
- */
-static void i40iw_cm_init_tsa_conn(struct i40iw_qp *iwqp,
-				   struct i40iw_cm_node *cm_node)
-{
-	struct i40iw_tcp_offload_info tcp_info;
-	struct i40iwarp_offload_info *iwarp_info;
-	struct i40iw_qp_host_ctx_info *ctx_info;
-	struct i40iw_device *iwdev = iwqp->iwdev;
-	struct i40iw_sc_dev *dev = &iwqp->iwdev->sc_dev;
-
-	memset(&tcp_info, 0x00, sizeof(struct i40iw_tcp_offload_info));
-	iwarp_info = &iwqp->iwarp_info;
-	ctx_info = &iwqp->ctx_info;
-
-	ctx_info->tcp_info = &tcp_info;
-	ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id;
-	ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id;
-
-	iwarp_info->ord_size = cm_node->ord_size;
-	iwarp_info->ird_size = i40iw_derive_hw_ird_setting(cm_node->ird_size);
-
-	if (iwarp_info->ord_size == 1)
-		iwarp_info->ord_size = 2;
-
-	iwarp_info->rd_enable = true;
-	iwarp_info->rdmap_ver = 1;
-	iwarp_info->ddp_ver = 1;
-
-	iwarp_info->pd_id = iwqp->iwpd->sc_pd.pd_id;
-
-	ctx_info->tcp_info_valid = true;
-	ctx_info->iwarp_info_valid = true;
-	ctx_info->add_to_qoslist = true;
-	ctx_info->user_pri = cm_node->user_pri;
-
-	i40iw_init_tcp_ctx(cm_node, &tcp_info, iwqp);
-	if (cm_node->snd_mark_en) {
-		iwarp_info->snd_mark_en = true;
-		iwarp_info->snd_mark_offset = (tcp_info.snd_nxt &
-				SNDMARKER_SEQNMASK) + cm_node->lsmm_size;
-	}
-
-	cm_node->state = I40IW_CM_STATE_OFFLOADED;
-	tcp_info.tcp_state = I40IW_TCP_STATE_ESTABLISHED;
-	tcp_info.src_mac_addr_idx = iwdev->mac_ip_table_idx;
-	tcp_info.tos = cm_node->tos;
-
-	dev->iw_priv_qp_ops->qp_setctx(&iwqp->sc_qp, (u64 *)(iwqp->host_ctx.va), ctx_info);
-
-	/* once tcp_info is set, no need to do it again */
-	ctx_info->tcp_info_valid = false;
-	ctx_info->iwarp_info_valid = false;
-	ctx_info->add_to_qoslist = false;
-}
-
-/**
- * i40iw_cm_disconn - when a connection is being closed
- * @iwqp: associate qp for the connection
- */
-void i40iw_cm_disconn(struct i40iw_qp *iwqp)
-{
-	struct disconn_work *work;
-	struct i40iw_device *iwdev = iwqp->iwdev;
-	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
-	unsigned long flags;
-
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
-	if (!work)
-		return;	/* Timer will clean up */
-
-	spin_lock_irqsave(&iwdev->qptable_lock, flags);
-	if (!iwdev->qp_table[iwqp->ibqp.qp_num]) {
-		spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
-		i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM,
-			    "%s qp_id %d is already freed\n",
-			     __func__, iwqp->ibqp.qp_num);
-		kfree(work);
-		return;
-	}
-	i40iw_qp_add_ref(&iwqp->ibqp);
-	spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
-
-	work->iwqp = iwqp;
-	INIT_WORK(&work->work, i40iw_disconnect_worker);
-	queue_work(cm_core->disconn_wq, &work->work);
-	return;
-}
-
-/**
- * i40iw_qp_disconnect - free qp and close cm
- * @iwqp: associate qp for the connection
- */
-static void i40iw_qp_disconnect(struct i40iw_qp *iwqp)
-{
-	struct i40iw_device *iwdev;
-	struct i40iw_ib_device *iwibdev;
-
-	iwdev = to_iwdev(iwqp->ibqp.device);
-	if (!iwdev) {
-		i40iw_pr_err("iwdev == NULL\n");
-		return;
-	}
-
-	iwibdev = iwdev->iwibdev;
-
-	if (iwqp->active_conn) {
-		/* indicate this connection is NOT active */
-		iwqp->active_conn = 0;
-	} else {
-		/* Need to free the Last Streaming Mode Message */
-		if (iwqp->ietf_mem.va) {
-			if (iwqp->lsmm_mr)
-				iwibdev->ibdev.ops.dereg_mr(iwqp->lsmm_mr,
-							    NULL);
-			i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->ietf_mem);
-		}
-	}
-
-	/* close the CM node down if it is still active */
-	if (iwqp->cm_node) {
-		i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, "%s Call close API\n", __func__);
-		i40iw_cm_close(iwqp->cm_node);
-	}
-}
-
-/**
- * i40iw_cm_disconn_true - called by worker thread to disconnect qp
- * @iwqp: associate qp for the connection
- */
-static void i40iw_cm_disconn_true(struct i40iw_qp *iwqp)
-{
-	struct iw_cm_id *cm_id;
-	struct i40iw_device *iwdev;
-	struct i40iw_sc_qp *qp = &iwqp->sc_qp;
-	u16 last_ae;
-	u8 original_hw_tcp_state;
-	u8 original_ibqp_state;
-	int disconn_status = 0;
-	int issue_disconn = 0;
-	int issue_close = 0;
-	int issue_flush = 0;
-	struct ib_event ibevent;
-	unsigned long flags;
-	int ret;
-
-	if (!iwqp) {
-		i40iw_pr_err("iwqp == NULL\n");
-		return;
-	}
-
-	spin_lock_irqsave(&iwqp->lock, flags);
-	cm_id = iwqp->cm_id;
-	/* make sure we havent already closed this connection */
-	if (!cm_id) {
-		spin_unlock_irqrestore(&iwqp->lock, flags);
-		return;
-	}
-
-	iwdev = to_iwdev(iwqp->ibqp.device);
-
-	original_hw_tcp_state = iwqp->hw_tcp_state;
-	original_ibqp_state = iwqp->ibqp_state;
-	last_ae = iwqp->last_aeq;
-
-	if (qp->term_flags) {
-		issue_disconn = 1;
-		issue_close = 1;
-		iwqp->cm_id = NULL;
-		/*When term timer expires after cm_timer, don't want
-		 *terminate-handler to issue cm_disconn which can re-free
-		 *a QP even after its refcnt=0.
-		 */
-		i40iw_terminate_del_timer(qp);
-		if (!iwqp->flush_issued) {
-			iwqp->flush_issued = 1;
-			issue_flush = 1;
-		}
-	} else if ((original_hw_tcp_state == I40IW_TCP_STATE_CLOSE_WAIT) ||
-		   ((original_ibqp_state == IB_QPS_RTS) &&
-		    (last_ae == I40IW_AE_LLP_CONNECTION_RESET))) {
-		issue_disconn = 1;
-		if (last_ae == I40IW_AE_LLP_CONNECTION_RESET)
-			disconn_status = -ECONNRESET;
-	}
-
-	if (((original_hw_tcp_state == I40IW_TCP_STATE_CLOSED) ||
-	     (original_hw_tcp_state == I40IW_TCP_STATE_TIME_WAIT) ||
-	     (last_ae == I40IW_AE_RDMAP_ROE_BAD_LLP_CLOSE) ||
-	     (last_ae == I40IW_AE_LLP_CONNECTION_RESET) ||
-	      iwdev->reset)) {
-		issue_close = 1;
-		iwqp->cm_id = NULL;
-		if (!iwqp->flush_issued) {
-			iwqp->flush_issued = 1;
-			issue_flush = 1;
-		}
-	}
-
-	spin_unlock_irqrestore(&iwqp->lock, flags);
-	if (issue_flush && !iwqp->destroyed) {
-		/* Flush the queues */
-		i40iw_flush_wqes(iwdev, iwqp);
-
-		if (qp->term_flags && iwqp->ibqp.event_handler) {
-			ibevent.device = iwqp->ibqp.device;
-			ibevent.event = (qp->eventtype == TERM_EVENT_QP_FATAL) ?
-					IB_EVENT_QP_FATAL : IB_EVENT_QP_ACCESS_ERR;
-			ibevent.element.qp = &iwqp->ibqp;
-			iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context);
-		}
-	}
-
-	if (cm_id && cm_id->event_handler) {
-		if (issue_disconn) {
-			ret = i40iw_send_cm_event(NULL,
-						  cm_id,
-						  IW_CM_EVENT_DISCONNECT,
-						  disconn_status);
-
-			if (ret)
-				i40iw_debug(&iwdev->sc_dev,
-					    I40IW_DEBUG_CM,
-					    "disconnect event failed %s: - cm_id = %p\n",
-					    __func__, cm_id);
-		}
-		if (issue_close) {
-			i40iw_qp_disconnect(iwqp);
-			cm_id->provider_data = iwqp;
-			ret = i40iw_send_cm_event(NULL, cm_id, IW_CM_EVENT_CLOSE, 0);
-			if (ret)
-				i40iw_debug(&iwdev->sc_dev,
-					    I40IW_DEBUG_CM,
-					    "close event failed %s: - cm_id = %p\n",
-					    __func__, cm_id);
-			cm_id->rem_ref(cm_id);
-		}
-	}
-}
-
-/**
- * i40iw_disconnect_worker - worker for connection close
- * @work: points or disconn structure
- */
-static void i40iw_disconnect_worker(struct work_struct *work)
-{
-	struct disconn_work *dwork = container_of(work, struct disconn_work, work);
-	struct i40iw_qp *iwqp = dwork->iwqp;
-
-	kfree(dwork);
-	i40iw_cm_disconn_true(iwqp);
-	i40iw_qp_rem_ref(&iwqp->ibqp);
-}
-
-/**
- * i40iw_accept - registered call for connection to be accepted
- * @cm_id: cm information for passive connection
- * @conn_param: accpet parameters
- */
-int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
-{
-	struct ib_qp *ibqp;
-	struct i40iw_qp *iwqp;
-	struct i40iw_device *iwdev;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_cm_core *cm_core;
-	struct i40iw_cm_node *cm_node;
-	struct ib_qp_attr attr;
-	int passive_state;
-	struct ib_mr *ibmr;
-	struct i40iw_pd *iwpd;
-	u16 buf_len = 0;
-	struct i40iw_kmem_info accept;
-	enum i40iw_status_code status;
-	u64 tagged_offset;
-	unsigned long flags;
-
-	memset(&attr, 0, sizeof(attr));
-	ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn);
-	if (!ibqp)
-		return -EINVAL;
-
-	iwqp = to_iwqp(ibqp);
-	iwdev = iwqp->iwdev;
-	dev = &iwdev->sc_dev;
-	cm_core = &iwdev->cm_core;
-	cm_node = (struct i40iw_cm_node *)cm_id->provider_data;
-
-	if (((struct sockaddr_in *)&cm_id->local_addr)->sin_family == AF_INET) {
-		cm_node->ipv4 = true;
-		cm_node->vlan_id = i40iw_get_vlan_ipv4(cm_node->loc_addr);
-	} else {
-		cm_node->ipv4 = false;
-		i40iw_netdev_vlan_ipv6(cm_node->loc_addr, &cm_node->vlan_id);
-	}
-	i40iw_debug(cm_node->dev,
-		    I40IW_DEBUG_CM,
-		    "Accept vlan_id=%d\n",
-		    cm_node->vlan_id);
-	if (cm_node->state == I40IW_CM_STATE_LISTENER_DESTROYED) {
-		if (cm_node->loopbackpartner)
-			i40iw_rem_ref_cm_node(cm_node->loopbackpartner);
-		i40iw_rem_ref_cm_node(cm_node);
-		return -EINVAL;
-	}
-
-	passive_state = atomic_inc_return(&cm_node->passive_state);
-	if (passive_state == I40IW_SEND_RESET_EVENT) {
-		i40iw_rem_ref_cm_node(cm_node);
-		return -ECONNRESET;
-	}
-
-	cm_node->cm_core->stats_accepts++;
-	iwqp->cm_node = (void *)cm_node;
-	cm_node->iwqp = iwqp;
-
-	buf_len = conn_param->private_data_len + I40IW_MAX_IETF_SIZE;
-
-	status = i40iw_allocate_dma_mem(dev->hw, &iwqp->ietf_mem, buf_len, 1);
-
-	if (status)
-		return -ENOMEM;
-	cm_node->pdata.size = conn_param->private_data_len;
-	accept.addr = iwqp->ietf_mem.va;
-	accept.size = i40iw_cm_build_mpa_frame(cm_node, &accept, MPA_KEY_REPLY);
-	memcpy(accept.addr + accept.size, conn_param->private_data,
-	       conn_param->private_data_len);
-
-	/* setup our first outgoing iWarp send WQE (the IETF frame response) */
-	if ((cm_node->ipv4 &&
-	     !i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) ||
-	    (!cm_node->ipv4 &&
-	     !i40iw_ipv6_is_loopback(cm_node->loc_addr, cm_node->rem_addr))) {
-		iwpd = iwqp->iwpd;
-		tagged_offset = (uintptr_t)iwqp->ietf_mem.va;
-		ibmr = i40iw_reg_phys_mr(&iwpd->ibpd,
-					 iwqp->ietf_mem.pa,
-					 buf_len,
-					 IB_ACCESS_LOCAL_WRITE,
-					 &tagged_offset);
-		if (IS_ERR(ibmr)) {
-			i40iw_free_dma_mem(dev->hw, &iwqp->ietf_mem);
-			return -ENOMEM;
-		}
-
-		ibmr->pd = &iwpd->ibpd;
-		ibmr->device = iwpd->ibpd.device;
-		iwqp->lsmm_mr = ibmr;
-		if (iwqp->page)
-			iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page);
-		dev->iw_priv_qp_ops->qp_send_lsmm(&iwqp->sc_qp,
-							iwqp->ietf_mem.va,
-							(accept.size + conn_param->private_data_len),
-							ibmr->lkey);
-
-	} else {
-		if (iwqp->page)
-			iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page);
-		dev->iw_priv_qp_ops->qp_send_lsmm(&iwqp->sc_qp, NULL, 0, 0);
-	}
-
-	if (iwqp->page)
-		kunmap(iwqp->page);
-
-	iwqp->cm_id = cm_id;
-	cm_node->cm_id = cm_id;
-
-	cm_id->provider_data = (void *)iwqp;
-	iwqp->active_conn = 0;
-
-	cm_node->lsmm_size = accept.size + conn_param->private_data_len;
-	i40iw_cm_init_tsa_conn(iwqp, cm_node);
-	cm_id->add_ref(cm_id);
-	i40iw_qp_add_ref(&iwqp->ibqp);
-
-	attr.qp_state = IB_QPS_RTS;
-	cm_node->qhash_set = false;
-	i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL);
-
-	cm_node->accelerated = true;
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	list_move_tail(&cm_node->list, &cm_core->accelerated_list);
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	status =
-		i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_ESTABLISHED, 0);
-	if (status)
-		i40iw_debug(dev, I40IW_DEBUG_CM, "error sending cm event - ESTABLISHED\n");
-
-	if (cm_node->loopbackpartner) {
-		cm_node->loopbackpartner->pdata.size = conn_param->private_data_len;
-
-		/* copy entire MPA frame to our cm_node's frame */
-		memcpy(cm_node->loopbackpartner->pdata_buf,
-		       conn_param->private_data,
-		       conn_param->private_data_len);
-		i40iw_create_event(cm_node->loopbackpartner, I40IW_CM_EVENT_CONNECTED);
-	}
-
-	if (cm_node->accept_pend) {
-		atomic_dec(&cm_node->listener->pend_accepts_cnt);
-		cm_node->accept_pend = 0;
-	}
-	return 0;
-}
-
-/**
- * i40iw_reject - registered call for connection to be rejected
- * @cm_id: cm information for passive connection
- * @pdata: private data to be sent
- * @pdata_len: private data length
- */
-int i40iw_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-	struct i40iw_device *iwdev;
-	struct i40iw_cm_node *cm_node;
-	struct i40iw_cm_node *loopback;
-
-	cm_node = (struct i40iw_cm_node *)cm_id->provider_data;
-	loopback = cm_node->loopbackpartner;
-	cm_node->cm_id = cm_id;
-	cm_node->pdata.size = pdata_len;
-
-	iwdev = to_iwdev(cm_id->device);
-	if (!iwdev)
-		return -EINVAL;
-	cm_node->cm_core->stats_rejects++;
-
-	if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER)
-		return -EINVAL;
-
-	if (loopback) {
-		memcpy(&loopback->pdata_buf, pdata, pdata_len);
-		loopback->pdata.size = pdata_len;
-	}
-
-	return i40iw_cm_reject(cm_node, pdata, pdata_len);
-}
-
-/**
- * i40iw_connect - registered call for connection to be established
- * @cm_id: cm information for passive connection
- * @conn_param: Information about the connection
- */
-int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
-{
-	struct ib_qp *ibqp;
-	struct i40iw_qp *iwqp;
-	struct i40iw_device *iwdev;
-	struct i40iw_cm_node *cm_node;
-	struct i40iw_cm_info cm_info;
-	struct sockaddr_in *laddr;
-	struct sockaddr_in *raddr;
-	struct sockaddr_in6 *laddr6;
-	struct sockaddr_in6 *raddr6;
-	int ret = 0;
-
-	ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn);
-	if (!ibqp)
-		return -EINVAL;
-	iwqp = to_iwqp(ibqp);
-	if (!iwqp)
-		return -EINVAL;
-	iwdev = to_iwdev(iwqp->ibqp.device);
-	if (!iwdev)
-		return -EINVAL;
-
-	laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-	raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-	laddr6 = (struct sockaddr_in6 *)&cm_id->m_local_addr;
-	raddr6 = (struct sockaddr_in6 *)&cm_id->m_remote_addr;
-
-	if (!(laddr->sin_port) || !(raddr->sin_port))
-		return -EINVAL;
-
-	iwqp->active_conn = 1;
-	iwqp->cm_id = NULL;
-	cm_id->provider_data = iwqp;
-
-	/* set up the connection params for the node */
-	if (cm_id->remote_addr.ss_family == AF_INET) {
-		cm_info.ipv4 = true;
-		memset(cm_info.loc_addr, 0, sizeof(cm_info.loc_addr));
-		memset(cm_info.rem_addr, 0, sizeof(cm_info.rem_addr));
-		cm_info.loc_addr[0] = ntohl(laddr->sin_addr.s_addr);
-		cm_info.rem_addr[0] = ntohl(raddr->sin_addr.s_addr);
-		cm_info.loc_port = ntohs(laddr->sin_port);
-		cm_info.rem_port = ntohs(raddr->sin_port);
-		cm_info.vlan_id = i40iw_get_vlan_ipv4(cm_info.loc_addr);
-	} else {
-		cm_info.ipv4 = false;
-		i40iw_copy_ip_ntohl(cm_info.loc_addr,
-				    laddr6->sin6_addr.in6_u.u6_addr32);
-		i40iw_copy_ip_ntohl(cm_info.rem_addr,
-				    raddr6->sin6_addr.in6_u.u6_addr32);
-		cm_info.loc_port = ntohs(laddr6->sin6_port);
-		cm_info.rem_port = ntohs(raddr6->sin6_port);
-		i40iw_netdev_vlan_ipv6(cm_info.loc_addr, &cm_info.vlan_id);
-	}
-	cm_info.cm_id = cm_id;
-	cm_info.tos = cm_id->tos;
-	cm_info.user_pri = rt_tos2priority(cm_id->tos);
-	i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB, "%s TOS:[%d] UP:[%d]\n",
-		    __func__, cm_id->tos, cm_info.user_pri);
-	cm_id->add_ref(cm_id);
-	cm_node = i40iw_create_cm_node(&iwdev->cm_core, iwdev,
-				       conn_param, &cm_info);
-
-	if (IS_ERR(cm_node)) {
-		ret = PTR_ERR(cm_node);
-		cm_id->rem_ref(cm_id);
-		return ret;
-	}
-
-	if ((cm_info.ipv4 && (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr)) ||
-	    (!cm_info.ipv4 && memcmp(laddr6->sin6_addr.in6_u.u6_addr32,
-				     raddr6->sin6_addr.in6_u.u6_addr32,
-				     sizeof(laddr6->sin6_addr.in6_u.u6_addr32)))) {
-		if (i40iw_manage_qhash(iwdev, &cm_info, I40IW_QHASH_TYPE_TCP_ESTABLISHED,
-				       I40IW_QHASH_MANAGE_TYPE_ADD, NULL, true)) {
-			ret = -EINVAL;
-			goto err;
-		}
-		cm_node->qhash_set = true;
-	}
-
-	if (i40iw_manage_apbvt(iwdev, cm_info.loc_port,
-			       I40IW_MANAGE_APBVT_ADD)) {
-		ret =  -EINVAL;
-		goto err;
-	}
-
-	cm_node->apbvt_set = true;
-	iwqp->cm_node = cm_node;
-	cm_node->iwqp = iwqp;
-	iwqp->cm_id = cm_id;
-	i40iw_qp_add_ref(&iwqp->ibqp);
-
-	if (cm_node->state != I40IW_CM_STATE_OFFLOADED) {
-		cm_node->state = I40IW_CM_STATE_SYN_SENT;
-		ret = i40iw_send_syn(cm_node, 0);
-		if (ret)
-			goto err;
-	}
-
-	if (cm_node->loopbackpartner) {
-		cm_node->loopbackpartner->state = I40IW_CM_STATE_MPAREQ_RCVD;
-		i40iw_create_event(cm_node->loopbackpartner,
-				   I40IW_CM_EVENT_MPA_REQ);
-	}
-
-	i40iw_debug(cm_node->dev,
-		    I40IW_DEBUG_CM,
-		    "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n",
-		    cm_node->rem_port,
-		    cm_node,
-		    cm_node->cm_id);
-
-	return 0;
-
-err:
-	if (cm_info.ipv4)
-		i40iw_debug(&iwdev->sc_dev,
-			    I40IW_DEBUG_CM,
-			    "Api - connect() FAILED: dest addr=%pI4",
-			    cm_info.rem_addr);
-	else
-		i40iw_debug(&iwdev->sc_dev,
-			    I40IW_DEBUG_CM,
-			    "Api - connect() FAILED: dest addr=%pI6",
-			    cm_info.rem_addr);
-
-	i40iw_rem_ref_cm_node(cm_node);
-	cm_id->rem_ref(cm_id);
-	iwdev->cm_core.stats_connect_errs++;
-	return ret;
-}
-
-/**
- * i40iw_create_listen - registered call creating listener
- * @cm_id: cm information for passive connection
- * @backlog: to max accept pending count
- */
-int i40iw_create_listen(struct iw_cm_id *cm_id, int backlog)
-{
-	struct i40iw_device *iwdev;
-	struct i40iw_cm_listener *cm_listen_node;
-	struct i40iw_cm_info cm_info;
-	enum i40iw_status_code ret;
-	struct sockaddr_in *laddr;
-	struct sockaddr_in6 *laddr6;
-	bool wildcard = false;
-
-	iwdev = to_iwdev(cm_id->device);
-	if (!iwdev)
-		return -EINVAL;
-
-	laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-	laddr6 = (struct sockaddr_in6 *)&cm_id->m_local_addr;
-	memset(&cm_info, 0, sizeof(cm_info));
-	if (laddr->sin_family == AF_INET) {
-		cm_info.ipv4 = true;
-		cm_info.loc_addr[0] = ntohl(laddr->sin_addr.s_addr);
-		cm_info.loc_port = ntohs(laddr->sin_port);
-
-		if (laddr->sin_addr.s_addr != INADDR_ANY)
-			cm_info.vlan_id = i40iw_get_vlan_ipv4(cm_info.loc_addr);
-		else
-			wildcard = true;
-
-	} else {
-		cm_info.ipv4 = false;
-		i40iw_copy_ip_ntohl(cm_info.loc_addr,
-				    laddr6->sin6_addr.in6_u.u6_addr32);
-		cm_info.loc_port = ntohs(laddr6->sin6_port);
-		if (ipv6_addr_type(&laddr6->sin6_addr) != IPV6_ADDR_ANY)
-			i40iw_netdev_vlan_ipv6(cm_info.loc_addr,
-					       &cm_info.vlan_id);
-		else
-			wildcard = true;
-	}
-	cm_info.backlog = backlog;
-	cm_info.cm_id = cm_id;
-
-	cm_listen_node = i40iw_make_listen_node(&iwdev->cm_core, iwdev, &cm_info);
-	if (!cm_listen_node) {
-		i40iw_pr_err("cm_listen_node == NULL\n");
-		return -ENOMEM;
-	}
-
-	cm_id->provider_data = cm_listen_node;
-
-	cm_listen_node->tos = cm_id->tos;
-	cm_listen_node->user_pri = rt_tos2priority(cm_id->tos);
-	cm_info.user_pri = cm_listen_node->user_pri;
-
-	if (!cm_listen_node->reused_node) {
-		if (wildcard) {
-			if (cm_info.ipv4)
-				ret = i40iw_add_mqh_4(iwdev,
-						      &cm_info,
-						      cm_listen_node);
-			else
-				ret = i40iw_add_mqh_6(iwdev,
-						      &cm_info,
-						      cm_listen_node);
-			if (ret)
-				goto error;
-
-			ret = i40iw_manage_apbvt(iwdev,
-						 cm_info.loc_port,
-						 I40IW_MANAGE_APBVT_ADD);
-
-			if (ret)
-				goto error;
-		} else {
-			ret = i40iw_manage_qhash(iwdev,
-						 &cm_info,
-						 I40IW_QHASH_TYPE_TCP_SYN,
-						 I40IW_QHASH_MANAGE_TYPE_ADD,
-						 NULL,
-						 true);
-			if (ret)
-				goto error;
-			cm_listen_node->qhash_set = true;
-			ret = i40iw_manage_apbvt(iwdev,
-						 cm_info.loc_port,
-						 I40IW_MANAGE_APBVT_ADD);
-			if (ret)
-				goto error;
-		}
-	}
-	cm_id->add_ref(cm_id);
-	cm_listen_node->cm_core->stats_listen_created++;
-	return 0;
- error:
-	i40iw_cm_del_listen(&iwdev->cm_core, (void *)cm_listen_node, false);
-	return -EINVAL;
-}
-
-/**
- * i40iw_destroy_listen - registered call to destroy listener
- * @cm_id: cm information for passive connection
- */
-int i40iw_destroy_listen(struct iw_cm_id *cm_id)
-{
-	struct i40iw_device *iwdev;
-
-	iwdev = to_iwdev(cm_id->device);
-	if (cm_id->provider_data)
-		i40iw_cm_del_listen(&iwdev->cm_core, cm_id->provider_data, true);
-	else
-		i40iw_pr_err("cm_id->provider_data was NULL\n");
-
-	cm_id->rem_ref(cm_id);
-
-	return 0;
-}
-
-/**
- * i40iw_cm_event_connected - handle connected active node
- * @event: the info for cm_node of connection
- */
-static void i40iw_cm_event_connected(struct i40iw_cm_event *event)
-{
-	struct i40iw_qp *iwqp;
-	struct i40iw_device *iwdev;
-	struct i40iw_cm_core *cm_core;
-	struct i40iw_cm_node *cm_node;
-	struct i40iw_sc_dev *dev;
-	struct ib_qp_attr attr;
-	struct iw_cm_id *cm_id;
-	unsigned long flags;
-	int status;
-	bool read0;
-
-	cm_node = event->cm_node;
-	cm_id = cm_node->cm_id;
-	iwqp = (struct i40iw_qp *)cm_id->provider_data;
-	iwdev = to_iwdev(iwqp->ibqp.device);
-	dev = &iwdev->sc_dev;
-	cm_core = &iwdev->cm_core;
-
-	if (iwqp->destroyed) {
-		status = -ETIMEDOUT;
-		goto error;
-	}
-	i40iw_cm_init_tsa_conn(iwqp, cm_node);
-	read0 = (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO);
-	if (iwqp->page)
-		iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page);
-	dev->iw_priv_qp_ops->qp_send_rtt(&iwqp->sc_qp, read0);
-	if (iwqp->page)
-		kunmap(iwqp->page);
-
-	memset(&attr, 0, sizeof(attr));
-	attr.qp_state = IB_QPS_RTS;
-	cm_node->qhash_set = false;
-	i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL);
-
-	cm_node->accelerated = true;
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	list_move_tail(&cm_node->list, &cm_core->accelerated_list);
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-	status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_CONNECT_REPLY,
-				     0);
-	if (status)
-		i40iw_debug(dev, I40IW_DEBUG_CM, "error sending cm event - CONNECT_REPLY\n");
-
-	return;
-
-error:
-	iwqp->cm_id = NULL;
-	cm_id->provider_data = NULL;
-	i40iw_send_cm_event(event->cm_node,
-			    cm_id,
-			    IW_CM_EVENT_CONNECT_REPLY,
-			    status);
-	cm_id->rem_ref(cm_id);
-	i40iw_rem_ref_cm_node(event->cm_node);
-}
-
-/**
- * i40iw_cm_event_reset - handle reset
- * @event: the info for cm_node of connection
- */
-static void i40iw_cm_event_reset(struct i40iw_cm_event *event)
-{
-	struct i40iw_cm_node *cm_node = event->cm_node;
-	struct iw_cm_id   *cm_id = cm_node->cm_id;
-	struct i40iw_qp *iwqp;
-
-	if (!cm_id)
-		return;
-
-	iwqp = cm_id->provider_data;
-	if (!iwqp)
-		return;
-
-	i40iw_debug(cm_node->dev,
-		    I40IW_DEBUG_CM,
-		    "reset event %p - cm_id = %p\n",
-		     event->cm_node, cm_id);
-	iwqp->cm_id = NULL;
-
-	i40iw_send_cm_event(cm_node, cm_node->cm_id, IW_CM_EVENT_DISCONNECT, -ECONNRESET);
-	i40iw_send_cm_event(cm_node, cm_node->cm_id, IW_CM_EVENT_CLOSE, 0);
-}
-
-/**
- * i40iw_cm_event_handler - worker thread callback to send event to cm upper layer
- * @work: pointer of cm event info.
- */
-static void i40iw_cm_event_handler(struct work_struct *work)
-{
-	struct i40iw_cm_event *event = container_of(work,
-						    struct i40iw_cm_event,
-						    event_work);
-	struct i40iw_cm_node *cm_node;
-
-	if (!event || !event->cm_node || !event->cm_node->cm_core)
-		return;
-
-	cm_node = event->cm_node;
-
-	switch (event->type) {
-	case I40IW_CM_EVENT_MPA_REQ:
-		i40iw_send_cm_event(cm_node,
-				    cm_node->cm_id,
-				    IW_CM_EVENT_CONNECT_REQUEST,
-				    0);
-		break;
-	case I40IW_CM_EVENT_RESET:
-		i40iw_cm_event_reset(event);
-		break;
-	case I40IW_CM_EVENT_CONNECTED:
-		if (!event->cm_node->cm_id ||
-		    (event->cm_node->state != I40IW_CM_STATE_OFFLOADED))
-			break;
-		i40iw_cm_event_connected(event);
-		break;
-	case I40IW_CM_EVENT_MPA_REJECT:
-		if (!event->cm_node->cm_id ||
-		    (cm_node->state == I40IW_CM_STATE_OFFLOADED))
-			break;
-		i40iw_send_cm_event(cm_node,
-				    cm_node->cm_id,
-				    IW_CM_EVENT_CONNECT_REPLY,
-				    -ECONNREFUSED);
-		break;
-	case I40IW_CM_EVENT_ABORTED:
-		if (!event->cm_node->cm_id ||
-		    (event->cm_node->state == I40IW_CM_STATE_OFFLOADED))
-			break;
-		i40iw_event_connect_error(event);
-		break;
-	default:
-		i40iw_pr_err("event type = %d\n", event->type);
-		break;
-	}
-
-	event->cm_info.cm_id->rem_ref(event->cm_info.cm_id);
-	i40iw_rem_ref_cm_node(event->cm_node);
-	kfree(event);
-}
-
-/**
- * i40iw_cm_post_event - queue event request for worker thread
- * @event: cm node's info for up event call
- */
-static void i40iw_cm_post_event(struct i40iw_cm_event *event)
-{
-	atomic_inc(&event->cm_node->ref_count);
-	event->cm_info.cm_id->add_ref(event->cm_info.cm_id);
-	INIT_WORK(&event->event_work, i40iw_cm_event_handler);
-
-	queue_work(event->cm_node->cm_core->event_wq, &event->event_work);
-}
-
-/**
- * i40iw_qhash_ctrl - enable/disable qhash for list
- * @iwdev: device pointer
- * @parent_listen_node: parent listen node
- * @nfo: cm info node
- * @ipaddr: Pointer to IPv4 or IPv6 address
- * @ipv4: flag indicating IPv4 when true
- * @ifup: flag indicating interface up when true
- *
- * Enables or disables the qhash for the node in the child
- * listen list that matches ipaddr. If no matching IP was found
- * it will allocate and add a new child listen node to the
- * parent listen node. The listen_list_lock is assumed to be
- * held when called.
- */
-static void i40iw_qhash_ctrl(struct i40iw_device *iwdev,
-			     struct i40iw_cm_listener *parent_listen_node,
-			     struct i40iw_cm_info *nfo,
-			     u32 *ipaddr, bool ipv4, bool ifup)
-{
-	struct list_head *child_listen_list = &parent_listen_node->child_listen_list;
-	struct i40iw_cm_listener *child_listen_node;
-	struct list_head *pos, *tpos;
-	enum i40iw_status_code ret;
-	bool node_allocated = false;
-	enum i40iw_quad_hash_manage_type op =
-		ifup ? I40IW_QHASH_MANAGE_TYPE_ADD : I40IW_QHASH_MANAGE_TYPE_DELETE;
-
-	list_for_each_safe(pos, tpos, child_listen_list) {
-		child_listen_node =
-			list_entry(pos,
-				   struct i40iw_cm_listener,
-				   child_listen_list);
-		if (!memcmp(child_listen_node->loc_addr, ipaddr, ipv4 ? 4 : 16))
-			goto set_qhash;
-	}
-
-	/* if not found then add a child listener if interface is going up */
-	if (!ifup)
-		return;
-	child_listen_node = kmemdup(parent_listen_node,
-			sizeof(*child_listen_node), GFP_ATOMIC);
-	if (!child_listen_node)
-		return;
-	node_allocated = true;
-
-	memcpy(child_listen_node->loc_addr, ipaddr,  ipv4 ? 4 : 16);
-
-set_qhash:
-	memcpy(nfo->loc_addr,
-	       child_listen_node->loc_addr,
-	       sizeof(nfo->loc_addr));
-	nfo->vlan_id = child_listen_node->vlan_id;
-	ret = i40iw_manage_qhash(iwdev, nfo,
-				 I40IW_QHASH_TYPE_TCP_SYN,
-				 op,
-				 NULL, false);
-	if (!ret) {
-		child_listen_node->qhash_set = ifup;
-		if (node_allocated)
-			list_add(&child_listen_node->child_listen_list,
-				 &parent_listen_node->child_listen_list);
-	} else if (node_allocated) {
-		kfree(child_listen_node);
-	}
-}
-
-/**
- * i40iw_cm_teardown_connections - teardown QPs
- * @iwdev: device pointer
- * @ipaddr: Pointer to IPv4 or IPv6 address
- * @nfo: cm info node
- * @disconnect_all: flag indicating disconnect all QPs
- * teardown QPs where source or destination addr matches ip addr
- */
-void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr,
-				   struct i40iw_cm_info *nfo,
-				   bool disconnect_all)
-{
-	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
-	struct list_head *list_core_temp;
-	struct list_head *list_node;
-	struct i40iw_cm_node *cm_node;
-	unsigned long flags;
-	struct list_head teardown_list;
-	struct ib_qp_attr attr;
-
-	INIT_LIST_HEAD(&teardown_list);
-	spin_lock_irqsave(&cm_core->ht_lock, flags);
-	list_for_each_safe(list_node, list_core_temp,
-			   &cm_core->accelerated_list) {
-		cm_node = container_of(list_node, struct i40iw_cm_node, list);
-		if (disconnect_all ||
-		    (nfo->vlan_id == cm_node->vlan_id &&
-		    (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) ||
-		     !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 4 : 16)))) {
-			atomic_inc(&cm_node->ref_count);
-			list_add(&cm_node->teardown_entry, &teardown_list);
-		}
-	}
-	list_for_each_safe(list_node, list_core_temp,
-			   &cm_core->non_accelerated_list) {
-		cm_node = container_of(list_node, struct i40iw_cm_node, list);
-		if (disconnect_all ||
-		    (nfo->vlan_id == cm_node->vlan_id &&
-		    (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) ||
-		     !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 4 : 16)))) {
-			atomic_inc(&cm_node->ref_count);
-			list_add(&cm_node->teardown_entry, &teardown_list);
-		}
-	}
-	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-	list_for_each_safe(list_node, list_core_temp, &teardown_list) {
-		cm_node = container_of(list_node, struct i40iw_cm_node,
-				       teardown_entry);
-		attr.qp_state = IB_QPS_ERR;
-		i40iw_modify_qp(&cm_node->iwqp->ibqp, &attr, IB_QP_STATE, NULL);
-		if (iwdev->reset)
-			i40iw_cm_disconn(cm_node->iwqp);
-		i40iw_rem_ref_cm_node(cm_node);
-	}
-}
-
-/**
- * i40iw_if_notify - process an ifdown on an interface
- * @iwdev: device pointer
- * @netdev: network interface device structure
- * @ipaddr: Pointer to IPv4 or IPv6 address
- * @ipv4: flag indicating IPv4 when true
- * @ifup: flag indicating interface up when true
- */
-void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev,
-		     u32 *ipaddr, bool ipv4, bool ifup)
-{
-	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
-	unsigned long flags;
-	struct i40iw_cm_listener *listen_node;
-	static const u32 ip_zero[4] = { 0, 0, 0, 0 };
-	struct i40iw_cm_info nfo;
-	u16 vlan_id = rdma_vlan_dev_vlan_id(netdev);
-	enum i40iw_status_code ret;
-	enum i40iw_quad_hash_manage_type op =
-		ifup ? I40IW_QHASH_MANAGE_TYPE_ADD : I40IW_QHASH_MANAGE_TYPE_DELETE;
-
-	nfo.vlan_id = vlan_id;
-	nfo.ipv4 = ipv4;
-
-	/* Disable or enable qhash for listeners */
-	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-	list_for_each_entry(listen_node, &cm_core->listen_nodes, list) {
-		if (vlan_id == listen_node->vlan_id &&
-		    (!memcmp(listen_node->loc_addr, ipaddr, ipv4 ? 4 : 16) ||
-		    !memcmp(listen_node->loc_addr, ip_zero, ipv4 ? 4 : 16))) {
-			memcpy(nfo.loc_addr, listen_node->loc_addr,
-			       sizeof(nfo.loc_addr));
-			nfo.loc_port = listen_node->loc_port;
-			nfo.user_pri = listen_node->user_pri;
-			if (!list_empty(&listen_node->child_listen_list)) {
-				i40iw_qhash_ctrl(iwdev,
-						 listen_node,
-						 &nfo,
-						 ipaddr, ipv4, ifup);
-			} else if (memcmp(listen_node->loc_addr, ip_zero,
-					  ipv4 ? 4 : 16)) {
-				ret = i40iw_manage_qhash(iwdev,
-							 &nfo,
-							 I40IW_QHASH_TYPE_TCP_SYN,
-							 op,
-							 NULL,
-							 false);
-				if (!ret)
-					listen_node->qhash_set = ifup;
-			}
-		}
-	}
-	spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-
-	/* teardown connected qp's on ifdown */
-	if (!ifup)
-		i40iw_cm_teardown_connections(iwdev, ipaddr, &nfo, false);
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h
deleted file mode 100644
index 6e43e4d730f4..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.h
+++ /dev/null
@@ -1,462 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_CM_H
-#define I40IW_CM_H
-
-#define QUEUE_EVENTS
-
-#define I40IW_MANAGE_APBVT_DEL 0
-#define I40IW_MANAGE_APBVT_ADD 1
-
-#define I40IW_MPA_REQUEST_ACCEPT  1
-#define I40IW_MPA_REQUEST_REJECT  2
-
-/* IETF MPA -- defines, enums, structs */
-#define IEFT_MPA_KEY_REQ  "MPA ID Req Frame"
-#define IEFT_MPA_KEY_REP  "MPA ID Rep Frame"
-#define IETF_MPA_KEY_SIZE 16
-#define IETF_MPA_VERSION  1
-#define IETF_MAX_PRIV_DATA_LEN 512
-#define IETF_MPA_FRAME_SIZE    20
-#define IETF_RTR_MSG_SIZE      4
-#define IETF_MPA_V2_FLAG       0x10
-#define SNDMARKER_SEQNMASK     0x000001FF
-
-#define I40IW_MAX_IETF_SIZE      32
-
-/* IETF RTR MSG Fields               */
-#define IETF_PEER_TO_PEER       0x8000
-#define IETF_FLPDU_ZERO_LEN     0x4000
-#define IETF_RDMA0_WRITE        0x8000
-#define IETF_RDMA0_READ         0x4000
-#define IETF_NO_IRD_ORD         0x3FFF
-
-/* HW-supported IRD sizes*/
-#define	I40IW_HW_IRD_SETTING_2	2
-#define	I40IW_HW_IRD_SETTING_4	4
-#define	I40IW_HW_IRD_SETTING_8	8
-#define	I40IW_HW_IRD_SETTING_16	16
-#define	I40IW_HW_IRD_SETTING_32	32
-#define	I40IW_HW_IRD_SETTING_64	64
-
-#define MAX_PORTS		65536
-#define I40IW_VLAN_PRIO_SHIFT   13
-
-enum ietf_mpa_flags {
-	IETF_MPA_FLAGS_MARKERS = 0x80,	/* receive Markers */
-	IETF_MPA_FLAGS_CRC = 0x40,	/* receive Markers */
-	IETF_MPA_FLAGS_REJECT = 0x20,	/* Reject */
-};
-
-struct ietf_mpa_v1 {
-	u8 key[IETF_MPA_KEY_SIZE];
-	u8 flags;
-	u8 rev;
-	__be16 priv_data_len;
-	u8 priv_data[];
-};
-
-#define ietf_mpa_req_resp_frame ietf_mpa_frame
-
-struct ietf_rtr_msg {
-	__be16 ctrl_ird;
-	__be16 ctrl_ord;
-};
-
-struct ietf_mpa_v2 {
-	u8 key[IETF_MPA_KEY_SIZE];
-	u8 flags;
-	u8 rev;
-	__be16 priv_data_len;
-	struct ietf_rtr_msg rtr_msg;
-	u8 priv_data[];
-};
-
-struct i40iw_cm_node;
-enum i40iw_timer_type {
-	I40IW_TIMER_TYPE_SEND,
-	I40IW_TIMER_TYPE_RECV,
-	I40IW_TIMER_NODE_CLEANUP,
-	I40IW_TIMER_TYPE_CLOSE,
-};
-
-#define I40IW_PASSIVE_STATE_INDICATED    0
-#define I40IW_DO_NOT_SEND_RESET_EVENT    1
-#define I40IW_SEND_RESET_EVENT           2
-
-#define MAX_I40IW_IFS 4
-
-#define SET_ACK 0x1
-#define SET_SYN 0x2
-#define SET_FIN 0x4
-#define SET_RST 0x8
-
-#define TCP_OPTIONS_PADDING     3
-
-struct option_base {
-	u8 optionnum;
-	u8 length;
-};
-
-enum option_numbers {
-	OPTION_NUMBER_END,
-	OPTION_NUMBER_NONE,
-	OPTION_NUMBER_MSS,
-	OPTION_NUMBER_WINDOW_SCALE,
-	OPTION_NUMBER_SACK_PERM,
-	OPTION_NUMBER_SACK,
-	OPTION_NUMBER_WRITE0 = 0xbc
-};
-
-struct option_mss {
-	u8 optionnum;
-	u8 length;
-	__be16 mss;
-};
-
-struct option_windowscale {
-	u8 optionnum;
-	u8 length;
-	u8 shiftcount;
-};
-
-union all_known_options {
-	char as_end;
-	struct option_base as_base;
-	struct option_mss as_mss;
-	struct option_windowscale as_windowscale;
-};
-
-struct i40iw_timer_entry {
-	struct list_head list;
-	unsigned long timetosend;	/* jiffies */
-	struct i40iw_puda_buf *sqbuf;
-	u32 type;
-	u32 retrycount;
-	u32 retranscount;
-	u32 context;
-	u32 send_retrans;
-	int close_when_complete;
-};
-
-#define I40IW_DEFAULT_RETRYS	64
-#define I40IW_DEFAULT_RETRANS	8
-#define I40IW_DEFAULT_TTL	0x40
-#define I40IW_DEFAULT_RTT_VAR	0x6
-#define I40IW_DEFAULT_SS_THRESH 0x3FFFFFFF
-#define I40IW_DEFAULT_REXMIT_THRESH 8
-
-#define I40IW_RETRY_TIMEOUT   HZ
-#define I40IW_SHORT_TIME      10
-#define I40IW_LONG_TIME       (2 * HZ)
-#define I40IW_MAX_TIMEOUT     ((unsigned long)(12 * HZ))
-
-#define I40IW_CM_HASHTABLE_SIZE         1024
-#define I40IW_CM_TCP_TIMER_INTERVAL     3000
-#define I40IW_CM_DEFAULT_MTU            1540
-#define I40IW_CM_DEFAULT_FRAME_CNT      10
-#define I40IW_CM_THREAD_STACK_SIZE      256
-#define I40IW_CM_DEFAULT_RCV_WND        64240
-#define I40IW_CM_DEFAULT_RCV_WND_SCALED 0x3fffc
-#define I40IW_CM_DEFAULT_RCV_WND_SCALE  2
-#define I40IW_CM_DEFAULT_FREE_PKTS      0x000A
-#define I40IW_CM_FREE_PKT_LO_WATERMARK  2
-
-#define I40IW_CM_DEFAULT_MSS   536
-
-#define I40IW_CM_DEF_SEQ       0x159bf75f
-#define I40IW_CM_DEF_LOCAL_ID  0x3b47
-
-#define I40IW_CM_DEF_SEQ2      0x18ed5740
-#define I40IW_CM_DEF_LOCAL_ID2 0xb807
-#define MAX_CM_BUFFER   (I40IW_MAX_IETF_SIZE + IETF_MAX_PRIV_DATA_LEN)
-
-typedef u32 i40iw_addr_t;
-
-#define i40iw_cm_tsa_context i40iw_qp_context
-
-struct i40iw_qp;
-
-/* cm node transition states */
-enum i40iw_cm_node_state {
-	I40IW_CM_STATE_UNKNOWN,
-	I40IW_CM_STATE_INITED,
-	I40IW_CM_STATE_LISTENING,
-	I40IW_CM_STATE_SYN_RCVD,
-	I40IW_CM_STATE_SYN_SENT,
-	I40IW_CM_STATE_ONE_SIDE_ESTABLISHED,
-	I40IW_CM_STATE_ESTABLISHED,
-	I40IW_CM_STATE_ACCEPTING,
-	I40IW_CM_STATE_MPAREQ_SENT,
-	I40IW_CM_STATE_MPAREQ_RCVD,
-	I40IW_CM_STATE_MPAREJ_RCVD,
-	I40IW_CM_STATE_OFFLOADED,
-	I40IW_CM_STATE_FIN_WAIT1,
-	I40IW_CM_STATE_FIN_WAIT2,
-	I40IW_CM_STATE_CLOSE_WAIT,
-	I40IW_CM_STATE_TIME_WAIT,
-	I40IW_CM_STATE_LAST_ACK,
-	I40IW_CM_STATE_CLOSING,
-	I40IW_CM_STATE_LISTENER_DESTROYED,
-	I40IW_CM_STATE_CLOSED
-};
-
-enum mpa_frame_version {
-	IETF_MPA_V1 = 1,
-	IETF_MPA_V2 = 2
-};
-
-enum mpa_frame_key {
-	MPA_KEY_REQUEST,
-	MPA_KEY_REPLY
-};
-
-enum send_rdma0 {
-	SEND_RDMA_READ_ZERO = 1,
-	SEND_RDMA_WRITE_ZERO = 2
-};
-
-enum i40iw_tcpip_pkt_type {
-	I40IW_PKT_TYPE_UNKNOWN,
-	I40IW_PKT_TYPE_SYN,
-	I40IW_PKT_TYPE_SYNACK,
-	I40IW_PKT_TYPE_ACK,
-	I40IW_PKT_TYPE_FIN,
-	I40IW_PKT_TYPE_RST
-};
-
-/* CM context params */
-struct i40iw_cm_tcp_context {
-	u8 client;
-
-	u32 loc_seq_num;
-	u32 loc_ack_num;
-	u32 rem_ack_num;
-	u32 rcv_nxt;
-
-	u32 loc_id;
-	u32 rem_id;
-
-	u32 snd_wnd;
-	u32 max_snd_wnd;
-
-	u32 rcv_wnd;
-	u32 mss;
-	u8 snd_wscale;
-	u8 rcv_wscale;
-};
-
-enum i40iw_cm_listener_state {
-	I40IW_CM_LISTENER_PASSIVE_STATE = 1,
-	I40IW_CM_LISTENER_ACTIVE_STATE = 2,
-	I40IW_CM_LISTENER_EITHER_STATE = 3
-};
-
-struct i40iw_cm_listener {
-	struct list_head list;
-	struct i40iw_cm_core *cm_core;
-	u8 loc_mac[ETH_ALEN];
-	u32 loc_addr[4];
-	u16 loc_port;
-	struct iw_cm_id *cm_id;
-	atomic_t ref_count;
-	struct i40iw_device *iwdev;
-	atomic_t pend_accepts_cnt;
-	int backlog;
-	enum i40iw_cm_listener_state listener_state;
-	u32 reused_node;
-	u8 user_pri;
-	u8 tos;
-	u16 vlan_id;
-	bool qhash_set;
-	bool ipv4;
-	struct list_head child_listen_list;
-
-};
-
-struct i40iw_kmem_info {
-	void *addr;
-	u32 size;
-};
-
-/* per connection node and node state information */
-struct i40iw_cm_node {
-	u32 loc_addr[4], rem_addr[4];
-	u16 loc_port, rem_port;
-	u16 vlan_id;
-	enum i40iw_cm_node_state state;
-	u8 loc_mac[ETH_ALEN];
-	u8 rem_mac[ETH_ALEN];
-	atomic_t ref_count;
-	struct i40iw_qp *iwqp;
-	struct i40iw_device *iwdev;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_cm_tcp_context tcp_cntxt;
-	struct i40iw_cm_core *cm_core;
-	struct i40iw_cm_node *loopbackpartner;
-	struct i40iw_timer_entry *send_entry;
-	struct i40iw_timer_entry *close_entry;
-	spinlock_t retrans_list_lock; /* cm transmit packet */
-	enum send_rdma0 send_rdma0_op;
-	u16 ird_size;
-	u16 ord_size;
-	u16     mpav2_ird_ord;
-	struct iw_cm_id *cm_id;
-	struct list_head list;
-	bool accelerated;
-	struct i40iw_cm_listener *listener;
-	int apbvt_set;
-	int accept_pend;
-	struct list_head timer_entry;
-	struct list_head reset_entry;
-	struct list_head teardown_entry;
-	atomic_t passive_state;
-	bool qhash_set;
-	u8 user_pri;
-	u8 tos;
-	bool ipv4;
-	bool snd_mark_en;
-	u16 lsmm_size;
-	enum mpa_frame_version mpa_frame_rev;
-	struct i40iw_kmem_info pdata;
-	union {
-		struct ietf_mpa_v1 mpa_frame;
-		struct ietf_mpa_v2 mpa_v2_frame;
-	};
-
-	u8 pdata_buf[IETF_MAX_PRIV_DATA_LEN];
-	struct i40iw_kmem_info mpa_hdr;
-	bool ack_rcvd;
-};
-
-/* structure for client or CM to fill when making CM api calls. */
-/*	- only need to set relevant data, based on op. */
-struct i40iw_cm_info {
-	struct iw_cm_id *cm_id;
-	u16 loc_port;
-	u16 rem_port;
-	u32 loc_addr[4];
-	u32 rem_addr[4];
-	u16 vlan_id;
-	int backlog;
-	u8 user_pri;
-	u8 tos;
-	bool ipv4;
-};
-
-/* CM event codes */
-enum i40iw_cm_event_type {
-	I40IW_CM_EVENT_UNKNOWN,
-	I40IW_CM_EVENT_ESTABLISHED,
-	I40IW_CM_EVENT_MPA_REQ,
-	I40IW_CM_EVENT_MPA_CONNECT,
-	I40IW_CM_EVENT_MPA_ACCEPT,
-	I40IW_CM_EVENT_MPA_REJECT,
-	I40IW_CM_EVENT_MPA_ESTABLISHED,
-	I40IW_CM_EVENT_CONNECTED,
-	I40IW_CM_EVENT_RESET,
-	I40IW_CM_EVENT_ABORTED
-};
-
-/* event to post to CM event handler */
-struct i40iw_cm_event {
-	enum i40iw_cm_event_type type;
-	struct i40iw_cm_info cm_info;
-	struct work_struct event_work;
-	struct i40iw_cm_node *cm_node;
-};
-
-struct i40iw_cm_core {
-	struct i40iw_device *iwdev;
-	struct i40iw_sc_dev *dev;
-
-	struct list_head listen_nodes;
-	struct list_head accelerated_list;
-	struct list_head non_accelerated_list;
-
-	struct timer_list tcp_timer;
-
-	struct workqueue_struct *event_wq;
-	struct workqueue_struct *disconn_wq;
-
-	spinlock_t ht_lock; /* manage hash table */
-	spinlock_t listen_list_lock; /* listen list */
-	spinlock_t apbvt_lock; /*manage apbvt entries*/
-
-	unsigned long ports_in_use[BITS_TO_LONGS(MAX_PORTS)];
-
-	u64	stats_nodes_created;
-	u64	stats_nodes_destroyed;
-	u64	stats_listen_created;
-	u64	stats_listen_destroyed;
-	u64	stats_listen_nodes_created;
-	u64	stats_listen_nodes_destroyed;
-	u64	stats_loopbacks;
-	u64	stats_accepts;
-	u64	stats_rejects;
-	u64	stats_connect_errs;
-	u64	stats_passive_errs;
-	u64	stats_pkt_retrans;
-	u64	stats_backlog_drops;
-};
-
-int i40iw_schedule_cm_timer(struct i40iw_cm_node *cm_node,
-			    struct i40iw_puda_buf *sqbuf,
-			    enum i40iw_timer_type type,
-			    int send_retrans,
-			    int close_when_complete);
-
-int i40iw_accept(struct iw_cm_id *, struct iw_cm_conn_param *);
-int i40iw_reject(struct iw_cm_id *, const void *, u8);
-int i40iw_connect(struct iw_cm_id *, struct iw_cm_conn_param *);
-int i40iw_create_listen(struct iw_cm_id *, int);
-int i40iw_destroy_listen(struct iw_cm_id *);
-
-int i40iw_cm_start(struct i40iw_device *);
-int i40iw_cm_stop(struct i40iw_device *);
-
-int i40iw_arp_table(struct i40iw_device *iwdev,
-		    u32 *ip_addr,
-		    bool ipv4,
-		    u8 *mac_addr,
-		    u32 action);
-
-void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev,
-		     u32 *ipaddr, bool ipv4, bool ifup);
-void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr,
-				   struct i40iw_cm_info *nfo,
-				   bool disconnect_all);
-bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port);
-#endif /* I40IW_CM_H */
diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
deleted file mode 100644
index eaea5d545eb8..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
+++ /dev/null
@@ -1,5243 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include "i40iw_osdep.h"
-#include "i40iw_register.h"
-#include "i40iw_status.h"
-#include "i40iw_hmc.h"
-
-#include "i40iw_d.h"
-#include "i40iw_type.h"
-#include "i40iw_p.h"
-#include "i40iw_vf.h"
-#include "i40iw_virtchnl.h"
-
-/**
- * i40iw_insert_wqe_hdr - write wqe header
- * @wqe: cqp wqe for header
- * @header: header for the cqp wqe
- */
-void i40iw_insert_wqe_hdr(u64 *wqe, u64 header)
-{
-	wmb();            /* make sure WQE is populated before polarity is set */
-	set_64bit_val(wqe, 24, header);
-}
-
-void i40iw_check_cqp_progress(struct i40iw_cqp_timeout *cqp_timeout, struct i40iw_sc_dev *dev)
-{
-	if (cqp_timeout->compl_cqp_cmds != dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]) {
-		cqp_timeout->compl_cqp_cmds = dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS];
-		cqp_timeout->count = 0;
-	} else {
-		if (dev->cqp_cmd_stats[OP_REQUESTED_COMMANDS] != cqp_timeout->compl_cqp_cmds)
-			cqp_timeout->count++;
-	}
-}
-
-/**
- * i40iw_get_cqp_reg_info - get head and tail for cqp using registers
- * @cqp: struct for cqp hw
- * @val: cqp tail register value
- * @tail:wqtail register value
- * @error: cqp processing err
- */
-static inline void i40iw_get_cqp_reg_info(struct i40iw_sc_cqp *cqp,
-					  u32 *val,
-					  u32 *tail,
-					  u32 *error)
-{
-	if (cqp->dev->is_pf) {
-		*val = i40iw_rd32(cqp->dev->hw, I40E_PFPE_CQPTAIL);
-		*tail = RS_32(*val, I40E_PFPE_CQPTAIL_WQTAIL);
-		*error = RS_32(*val, I40E_PFPE_CQPTAIL_CQP_OP_ERR);
-	} else {
-		*val = i40iw_rd32(cqp->dev->hw, I40E_VFPE_CQPTAIL1);
-		*tail = RS_32(*val, I40E_VFPE_CQPTAIL_WQTAIL);
-		*error = RS_32(*val, I40E_VFPE_CQPTAIL_CQP_OP_ERR);
-	}
-}
-
-/**
- * i40iw_cqp_poll_registers - poll cqp registers
- * @cqp: struct for cqp hw
- * @tail:wqtail register value
- * @count: how many times to try for completion
- */
-static enum i40iw_status_code i40iw_cqp_poll_registers(
-						struct i40iw_sc_cqp *cqp,
-						u32 tail,
-						u32 count)
-{
-	u32 i = 0;
-	u32 newtail, error, val;
-
-	while (i < count) {
-		i++;
-		i40iw_get_cqp_reg_info(cqp, &val, &newtail, &error);
-		if (error) {
-			error = (cqp->dev->is_pf) ?
-				 i40iw_rd32(cqp->dev->hw, I40E_PFPE_CQPERRCODES) :
-				 i40iw_rd32(cqp->dev->hw, I40E_VFPE_CQPERRCODES1);
-			return I40IW_ERR_CQP_COMPL_ERROR;
-		}
-		if (newtail != tail) {
-			/* SUCCESS */
-			I40IW_RING_MOVE_TAIL(cqp->sq_ring);
-			cqp->dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]++;
-			return 0;
-		}
-		udelay(I40IW_SLEEP_COUNT);
-	}
-	return I40IW_ERR_TIMEOUT;
-}
-
-/**
- * i40iw_sc_parse_fpm_commit_buf - parse fpm commit buffer
- * @buf: ptr to fpm commit buffer
- * @info: ptr to i40iw_hmc_obj_info struct
- * @sd: number of SDs for HMC objects
- *
- * parses fpm commit info and copy base value
- * of hmc objects in hmc_info
- */
-static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf(
-				u64 *buf,
-				struct i40iw_hmc_obj_info *info,
-				u32 *sd)
-{
-	u64 temp;
-	u64 size;
-	u64 base = 0;
-	u32 i, j;
-	u32 k = 0;
-
-	/* copy base values in obj_info */
-	for (i = I40IW_HMC_IW_QP, j = 0; i <= I40IW_HMC_IW_PBLE; i++, j += 8) {
-		if ((i == I40IW_HMC_IW_SRQ) ||
-			(i == I40IW_HMC_IW_FSIMC) ||
-			(i == I40IW_HMC_IW_FSIAV)) {
-			info[i].base = 0;
-			info[i].cnt = 0;
-			continue;
-		}
-		get_64bit_val(buf, j, &temp);
-		info[i].base = RS_64_1(temp, 32) * 512;
-		if (info[i].base > base) {
-			base = info[i].base;
-			k = i;
-		}
-		if (i == I40IW_HMC_IW_APBVT_ENTRY) {
-			info[i].cnt = 1;
-			continue;
-		}
-		if (i == I40IW_HMC_IW_QP)
-			info[i].cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_QPS);
-		else if (i == I40IW_HMC_IW_CQ)
-			info[i].cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_CQS);
-		else
-			info[i].cnt = (u32)(temp);
-	}
-	size = info[k].cnt * info[k].size + info[k].base;
-	if (size & 0x1FFFFF)
-		*sd = (u32)((size >> 21) + 1); /* add 1 for remainder */
-	else
-		*sd = (u32)(size >> 21);
-
-	return 0;
-}
-
-/**
- * i40iw_sc_decode_fpm_query() - Decode a 64 bit value into max count and size
- * @buf: ptr to fpm query buffer
- * @buf_idx: index into buf
- * @obj_info: ptr to i40iw_hmc_obj_info struct
- * @rsrc_idx: resource index into info
- *
- * Decode a 64 bit value from fpm query buffer into max count and size
- */
-static u64 i40iw_sc_decode_fpm_query(u64 *buf,
-					    u32 buf_idx,
-					    struct i40iw_hmc_obj_info *obj_info,
-					    u32 rsrc_idx)
-{
-	u64 temp;
-	u32 size;
-
-	get_64bit_val(buf, buf_idx, &temp);
-	obj_info[rsrc_idx].max_cnt = (u32)temp;
-	size = (u32)RS_64_1(temp, 32);
-	obj_info[rsrc_idx].size = LS_64_1(1, size);
-
-	return temp;
-}
-
-/**
- * i40iw_sc_parse_fpm_query_buf() - parses fpm query buffer
- * @buf: ptr to fpm query buffer
- * @hmc_info: ptr to i40iw_hmc_obj_info struct
- * @hmc_fpm_misc: ptr to fpm data
- *
- * parses fpm query buffer and copy max_cnt and
- * size value of hmc objects in hmc_info
- */
-static enum i40iw_status_code i40iw_sc_parse_fpm_query_buf(
-				u64 *buf,
-				struct i40iw_hmc_info *hmc_info,
-				struct i40iw_hmc_fpm_misc *hmc_fpm_misc)
-{
-	struct i40iw_hmc_obj_info *obj_info;
-	u64 temp;
-	u32 size;
-	u16 max_pe_sds;
-
-	obj_info = hmc_info->hmc_obj;
-
-	get_64bit_val(buf, 0, &temp);
-	hmc_info->first_sd_index = (u16)RS_64(temp, I40IW_QUERY_FPM_FIRST_PE_SD_INDEX);
-	max_pe_sds = (u16)RS_64(temp, I40IW_QUERY_FPM_MAX_PE_SDS);
-
-	/* Reduce SD count for VFs by 1 to account for PBLE backing page rounding */
-	if (hmc_info->hmc_fn_id >= I40IW_FIRST_VF_FPM_ID)
-		max_pe_sds--;
-	hmc_fpm_misc->max_sds = max_pe_sds;
-	hmc_info->sd_table.sd_cnt = max_pe_sds + hmc_info->first_sd_index;
-
-	get_64bit_val(buf, 8, &temp);
-	obj_info[I40IW_HMC_IW_QP].max_cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_QPS);
-	size = (u32)RS_64_1(temp, 32);
-	obj_info[I40IW_HMC_IW_QP].size = LS_64_1(1, size);
-
-	get_64bit_val(buf, 16, &temp);
-	obj_info[I40IW_HMC_IW_CQ].max_cnt = (u32)RS_64(temp, I40IW_QUERY_FPM_MAX_CQS);
-	size = (u32)RS_64_1(temp, 32);
-	obj_info[I40IW_HMC_IW_CQ].size = LS_64_1(1, size);
-
-	i40iw_sc_decode_fpm_query(buf, 32, obj_info, I40IW_HMC_IW_HTE);
-	i40iw_sc_decode_fpm_query(buf, 40, obj_info, I40IW_HMC_IW_ARP);
-
-	obj_info[I40IW_HMC_IW_APBVT_ENTRY].size = 8192;
-	obj_info[I40IW_HMC_IW_APBVT_ENTRY].max_cnt = 1;
-
-	i40iw_sc_decode_fpm_query(buf, 48, obj_info, I40IW_HMC_IW_MR);
-	i40iw_sc_decode_fpm_query(buf, 56, obj_info, I40IW_HMC_IW_XF);
-
-	get_64bit_val(buf, 64, &temp);
-	obj_info[I40IW_HMC_IW_XFFL].max_cnt = (u32)temp;
-	obj_info[I40IW_HMC_IW_XFFL].size = 4;
-	hmc_fpm_misc->xf_block_size = RS_64(temp, I40IW_QUERY_FPM_XFBLOCKSIZE);
-	if (!hmc_fpm_misc->xf_block_size)
-		return I40IW_ERR_INVALID_SIZE;
-
-	i40iw_sc_decode_fpm_query(buf, 72, obj_info, I40IW_HMC_IW_Q1);
-
-	get_64bit_val(buf, 80, &temp);
-	obj_info[I40IW_HMC_IW_Q1FL].max_cnt = (u32)temp;
-	obj_info[I40IW_HMC_IW_Q1FL].size = 4;
-	hmc_fpm_misc->q1_block_size = RS_64(temp, I40IW_QUERY_FPM_Q1BLOCKSIZE);
-	if (!hmc_fpm_misc->q1_block_size)
-		return I40IW_ERR_INVALID_SIZE;
-
-	i40iw_sc_decode_fpm_query(buf, 88, obj_info, I40IW_HMC_IW_TIMER);
-
-	get_64bit_val(buf, 112, &temp);
-	obj_info[I40IW_HMC_IW_PBLE].max_cnt = (u32)temp;
-	obj_info[I40IW_HMC_IW_PBLE].size = 8;
-
-	get_64bit_val(buf, 120, &temp);
-	hmc_fpm_misc->max_ceqs = (u8)RS_64(temp, I40IW_QUERY_FPM_MAX_CEQS);
-	hmc_fpm_misc->ht_multiplier = RS_64(temp, I40IW_QUERY_FPM_HTMULTIPLIER);
-	hmc_fpm_misc->timer_bucket = RS_64(temp, I40IW_QUERY_FPM_TIMERBUCKET);
-
-	return 0;
-}
-
-/**
- * i40iw_fill_qos_list - Change all unknown qs handles to available ones
- * @qs_list: list of qs_handles to be fixed with valid qs_handles
- */
-static void i40iw_fill_qos_list(u16 *qs_list)
-{
-	u16 qshandle = qs_list[0];
-	int i;
-
-	for (i = 0; i < I40IW_MAX_USER_PRIORITY; i++) {
-		if (qs_list[i] == QS_HANDLE_UNKNOWN)
-			qs_list[i] = qshandle;
-		else
-			qshandle = qs_list[i];
-	}
-}
-
-/**
- * i40iw_qp_from_entry - Given entry, get to the qp structure
- * @entry: Points to list of qp structure
- */
-static struct i40iw_sc_qp *i40iw_qp_from_entry(struct list_head *entry)
-{
-	if (!entry)
-		return NULL;
-
-	return (struct i40iw_sc_qp *)((char *)entry - offsetof(struct i40iw_sc_qp, list));
-}
-
-/**
- * i40iw_get_qp - get the next qp from the list given current qp
- * @head: Listhead of qp's
- * @qp: current qp
- */
-static struct i40iw_sc_qp *i40iw_get_qp(struct list_head *head, struct i40iw_sc_qp *qp)
-{
-	struct list_head *entry = NULL;
-	struct list_head *lastentry;
-
-	if (list_empty(head))
-		return NULL;
-
-	if (!qp) {
-		entry = head->next;
-	} else {
-		lastentry = &qp->list;
-		entry = (lastentry != head) ? lastentry->next : NULL;
-	}
-
-	return i40iw_qp_from_entry(entry);
-}
-
-/**
- * i40iw_change_l2params - given the new l2 parameters, change all qp
- * @vsi: pointer to the vsi structure
- * @l2params: New paramaters from l2
- */
-void i40iw_change_l2params(struct i40iw_sc_vsi *vsi, struct i40iw_l2params *l2params)
-{
-	struct i40iw_sc_dev *dev = vsi->dev;
-	struct i40iw_sc_qp *qp = NULL;
-	bool qs_handle_change = false;
-	unsigned long flags;
-	u16 qs_handle;
-	int i;
-
-	if (vsi->mtu != l2params->mtu) {
-		vsi->mtu = l2params->mtu;
-		i40iw_reinitialize_ieq(dev);
-	}
-
-	i40iw_fill_qos_list(l2params->qs_handle_list);
-	for (i = 0; i < I40IW_MAX_USER_PRIORITY; i++) {
-		qs_handle = l2params->qs_handle_list[i];
-		if (vsi->qos[i].qs_handle != qs_handle)
-			qs_handle_change = true;
-		spin_lock_irqsave(&vsi->qos[i].lock, flags);
-		qp = i40iw_get_qp(&vsi->qos[i].qplist, qp);
-		while (qp) {
-			if (qs_handle_change) {
-				qp->qs_handle = qs_handle;
-				/* issue cqp suspend command */
-				i40iw_qp_suspend_resume(dev, qp, true);
-			}
-			qp = i40iw_get_qp(&vsi->qos[i].qplist, qp);
-		}
-		spin_unlock_irqrestore(&vsi->qos[i].lock, flags);
-		vsi->qos[i].qs_handle = qs_handle;
-	}
-}
-
-/**
- * i40iw_qp_rem_qos - remove qp from qos lists during destroy qp
- * @qp: qp to be removed from qos
- */
-void i40iw_qp_rem_qos(struct i40iw_sc_qp *qp)
-{
-	struct i40iw_sc_vsi *vsi = qp->vsi;
-	unsigned long flags;
-
-	if (!qp->on_qoslist)
-		return;
-	spin_lock_irqsave(&vsi->qos[qp->user_pri].lock, flags);
-	list_del(&qp->list);
-	spin_unlock_irqrestore(&vsi->qos[qp->user_pri].lock, flags);
-}
-
-/**
- * i40iw_qp_add_qos - called during setctx fot qp to be added to qos
- * @qp: qp to be added to qos
- */
-void i40iw_qp_add_qos(struct i40iw_sc_qp *qp)
-{
-	struct i40iw_sc_vsi *vsi = qp->vsi;
-	unsigned long flags;
-
-	if (qp->on_qoslist)
-		return;
-	spin_lock_irqsave(&vsi->qos[qp->user_pri].lock, flags);
-	qp->qs_handle = vsi->qos[qp->user_pri].qs_handle;
-	list_add(&qp->list, &vsi->qos[qp->user_pri].qplist);
-	qp->on_qoslist = true;
-	spin_unlock_irqrestore(&vsi->qos[qp->user_pri].lock, flags);
-}
-
-/**
- * i40iw_sc_pd_init - initialize sc pd struct
- * @dev: sc device struct
- * @pd: sc pd ptr
- * @pd_id: pd_id for allocated pd
- * @abi_ver: ABI version from user context, -1 if not valid
- */
-static void i40iw_sc_pd_init(struct i40iw_sc_dev *dev,
-			     struct i40iw_sc_pd *pd,
-			     u16 pd_id,
-			     int abi_ver)
-{
-	pd->size = sizeof(*pd);
-	pd->pd_id = pd_id;
-	pd->abi_ver = abi_ver;
-	pd->dev = dev;
-}
-
-/**
- * i40iw_get_encoded_wqe_size - given wq size, returns hardware encoded size
- * @wqsize: size of the wq (sq, rq, srq) to encoded_size
- * @cqpsq: encoded size for sq for cqp as its encoded size is 1+ other wq's
- */
-u8 i40iw_get_encoded_wqe_size(u32 wqsize, bool cqpsq)
-{
-	u8 encoded_size = 0;
-
-	/* cqp sq's hw coded value starts from 1 for size of 4
-	 * while it starts from 0 for qp' wq's.
-	 */
-	if (cqpsq)
-		encoded_size = 1;
-	wqsize >>= 2;
-	while (wqsize >>= 1)
-		encoded_size++;
-	return encoded_size;
-}
-
-/**
- * i40iw_sc_cqp_init - Initialize buffers for a control Queue Pair
- * @cqp: IWARP control queue pair pointer
- * @info: IWARP control queue pair init info pointer
- *
- * Initializes the object and context buffers for a control Queue Pair.
- */
-static enum i40iw_status_code i40iw_sc_cqp_init(struct i40iw_sc_cqp *cqp,
-						struct i40iw_cqp_init_info *info)
-{
-	u8 hw_sq_size;
-
-	if ((info->sq_size > I40IW_CQP_SW_SQSIZE_2048) ||
-	    (info->sq_size < I40IW_CQP_SW_SQSIZE_4) ||
-	    ((info->sq_size & (info->sq_size - 1))))
-		return I40IW_ERR_INVALID_SIZE;
-
-	hw_sq_size = i40iw_get_encoded_wqe_size(info->sq_size, true);
-	cqp->size = sizeof(*cqp);
-	cqp->sq_size = info->sq_size;
-	cqp->hw_sq_size = hw_sq_size;
-	cqp->sq_base = info->sq;
-	cqp->host_ctx = info->host_ctx;
-	cqp->sq_pa = info->sq_pa;
-	cqp->host_ctx_pa = info->host_ctx_pa;
-	cqp->dev = info->dev;
-	cqp->struct_ver = info->struct_ver;
-	cqp->scratch_array = info->scratch_array;
-	cqp->polarity = 0;
-	cqp->en_datacenter_tcp = info->en_datacenter_tcp;
-	cqp->enabled_vf_count = info->enabled_vf_count;
-	cqp->hmc_profile = info->hmc_profile;
-	info->dev->cqp = cqp;
-
-	I40IW_RING_INIT(cqp->sq_ring, cqp->sq_size);
-	cqp->dev->cqp_cmd_stats[OP_REQUESTED_COMMANDS] = 0;
-	cqp->dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS] = 0;
-	INIT_LIST_HEAD(&cqp->dev->cqp_cmd_head);               /* for the cqp commands backlog. */
-
-	i40iw_wr32(cqp->dev->hw, I40E_PFPE_CQPTAIL, 0);
-	i40iw_wr32(cqp->dev->hw, I40E_PFPE_CQPDB, 0);
-
-	i40iw_debug(cqp->dev, I40IW_DEBUG_WQE,
-		    "%s: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%llxh] cqp[%p] polarity[x%04X]\n",
-		    __func__, cqp->sq_size, cqp->hw_sq_size,
-		    cqp->sq_base, cqp->sq_pa, cqp, cqp->polarity);
-	return 0;
-}
-
-/**
- * i40iw_sc_cqp_create - create cqp during bringup
- * @cqp: struct for cqp hw
- * @maj_err: If error, major err number
- * @min_err: If error, minor err number
- */
-static enum i40iw_status_code i40iw_sc_cqp_create(struct i40iw_sc_cqp *cqp,
-						  u16 *maj_err,
-						  u16 *min_err)
-{
-	u64 temp;
-	u32 cnt = 0, p1, p2, val = 0, err_code;
-	enum i40iw_status_code ret_code;
-
-	*maj_err = 0;
-	*min_err = 0;
-
-	ret_code = i40iw_allocate_dma_mem(cqp->dev->hw,
-					  &cqp->sdbuf,
-					  I40IW_UPDATE_SD_BUF_SIZE * cqp->sq_size,
-					  I40IW_SD_BUF_ALIGNMENT);
-
-	if (ret_code)
-		goto exit;
-
-	temp = LS_64(cqp->hw_sq_size, I40IW_CQPHC_SQSIZE) |
-	       LS_64(cqp->struct_ver, I40IW_CQPHC_SVER);
-
-	set_64bit_val(cqp->host_ctx, 0, temp);
-	set_64bit_val(cqp->host_ctx, 8, cqp->sq_pa);
-	temp = LS_64(cqp->enabled_vf_count, I40IW_CQPHC_ENABLED_VFS) |
-	       LS_64(cqp->hmc_profile, I40IW_CQPHC_HMC_PROFILE);
-	set_64bit_val(cqp->host_ctx, 16, temp);
-	set_64bit_val(cqp->host_ctx, 24, (uintptr_t)cqp);
-	set_64bit_val(cqp->host_ctx, 32, 0);
-	set_64bit_val(cqp->host_ctx, 40, 0);
-	set_64bit_val(cqp->host_ctx, 48, 0);
-	set_64bit_val(cqp->host_ctx, 56, 0);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CQP_HOST_CTX",
-			cqp->host_ctx, I40IW_CQP_CTX_SIZE * 8);
-
-	p1 = RS_32_1(cqp->host_ctx_pa, 32);
-	p2 = (u32)cqp->host_ctx_pa;
-
-	if (cqp->dev->is_pf) {
-		i40iw_wr32(cqp->dev->hw, I40E_PFPE_CCQPHIGH, p1);
-		i40iw_wr32(cqp->dev->hw, I40E_PFPE_CCQPLOW, p2);
-	} else {
-		i40iw_wr32(cqp->dev->hw, I40E_VFPE_CCQPHIGH1, p1);
-		i40iw_wr32(cqp->dev->hw, I40E_VFPE_CCQPLOW1, p2);
-	}
-	do {
-		if (cnt++ > I40IW_DONE_COUNT) {
-			i40iw_free_dma_mem(cqp->dev->hw, &cqp->sdbuf);
-			ret_code = I40IW_ERR_TIMEOUT;
-			/*
-			 * read PFPE_CQPERRORCODES register to get the minor
-			 * and major error code
-			 */
-			if (cqp->dev->is_pf)
-				err_code = i40iw_rd32(cqp->dev->hw, I40E_PFPE_CQPERRCODES);
-			else
-				err_code = i40iw_rd32(cqp->dev->hw, I40E_VFPE_CQPERRCODES1);
-			*min_err = RS_32(err_code, I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE);
-			*maj_err = RS_32(err_code, I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE);
-			goto exit;
-		}
-		udelay(I40IW_SLEEP_COUNT);
-		if (cqp->dev->is_pf)
-			val = i40iw_rd32(cqp->dev->hw, I40E_PFPE_CCQPSTATUS);
-		else
-			val = i40iw_rd32(cqp->dev->hw, I40E_VFPE_CCQPSTATUS1);
-	} while (!val);
-
-exit:
-	if (!ret_code)
-		cqp->process_cqp_sds = i40iw_update_sds_noccq;
-	return ret_code;
-}
-
-/**
- * i40iw_sc_cqp_post_sq - post of cqp's sq
- * @cqp: struct for cqp hw
- */
-void i40iw_sc_cqp_post_sq(struct i40iw_sc_cqp *cqp)
-{
-	if (cqp->dev->is_pf)
-		i40iw_wr32(cqp->dev->hw, I40E_PFPE_CQPDB, I40IW_RING_GETCURRENT_HEAD(cqp->sq_ring));
-	else
-		i40iw_wr32(cqp->dev->hw, I40E_VFPE_CQPDB1, I40IW_RING_GETCURRENT_HEAD(cqp->sq_ring));
-
-	i40iw_debug(cqp->dev,
-		    I40IW_DEBUG_WQE,
-		    "%s: HEAD_TAIL[%04d,%04d,%04d]\n",
-		    __func__,
-		    cqp->sq_ring.head,
-		    cqp->sq_ring.tail,
-		    cqp->sq_ring.size);
-}
-
-/**
- * i40iw_sc_cqp_get_next_send_wqe_idx - get next WQE on CQP SQ and pass back the index
- * @cqp: pointer to CQP structure
- * @scratch: private data for CQP WQE
- * @wqe_idx: WQE index for next WQE on CQP SQ
- */
-static u64 *i40iw_sc_cqp_get_next_send_wqe_idx(struct i40iw_sc_cqp *cqp,
-					       u64 scratch, u32 *wqe_idx)
-{
-	u64 *wqe = NULL;
-	enum i40iw_status_code ret_code;
-
-	if (I40IW_RING_FULL_ERR(cqp->sq_ring)) {
-		i40iw_debug(cqp->dev,
-			    I40IW_DEBUG_WQE,
-			    "%s: ring is full head %x tail %x size %x\n",
-			    __func__,
-			    cqp->sq_ring.head,
-			    cqp->sq_ring.tail,
-			    cqp->sq_ring.size);
-		return NULL;
-	}
-	I40IW_ATOMIC_RING_MOVE_HEAD(cqp->sq_ring, *wqe_idx, ret_code);
-	cqp->dev->cqp_cmd_stats[OP_REQUESTED_COMMANDS]++;
-	if (ret_code)
-		return NULL;
-	if (!*wqe_idx)
-		cqp->polarity = !cqp->polarity;
-
-	wqe = cqp->sq_base[*wqe_idx].elem;
-	cqp->scratch_array[*wqe_idx] = scratch;
-	I40IW_CQP_INIT_WQE(wqe);
-
-	return wqe;
-}
-
-/**
- * i40iw_sc_cqp_get_next_send_wqe - get next wqe on cqp sq
- * @cqp: struct for cqp hw
- * @scratch: private data for CQP WQE
- */
-u64 *i40iw_sc_cqp_get_next_send_wqe(struct i40iw_sc_cqp *cqp, u64 scratch)
-{
-	u32 wqe_idx;
-
-	return i40iw_sc_cqp_get_next_send_wqe_idx(cqp, scratch, &wqe_idx);
-}
-
-/**
- * i40iw_sc_cqp_destroy - destroy cqp during close
- * @cqp: struct for cqp hw
- */
-static enum i40iw_status_code i40iw_sc_cqp_destroy(struct i40iw_sc_cqp *cqp)
-{
-	u32 cnt = 0, val = 1;
-	enum i40iw_status_code ret_code = 0;
-	u32 cqpstat_addr;
-
-	if (cqp->dev->is_pf) {
-		i40iw_wr32(cqp->dev->hw, I40E_PFPE_CCQPHIGH, 0);
-		i40iw_wr32(cqp->dev->hw, I40E_PFPE_CCQPLOW, 0);
-		cqpstat_addr = I40E_PFPE_CCQPSTATUS;
-	} else {
-		i40iw_wr32(cqp->dev->hw, I40E_VFPE_CCQPHIGH1, 0);
-		i40iw_wr32(cqp->dev->hw, I40E_VFPE_CCQPLOW1, 0);
-		cqpstat_addr = I40E_VFPE_CCQPSTATUS1;
-	}
-	do {
-		if (cnt++ > I40IW_DONE_COUNT) {
-			ret_code = I40IW_ERR_TIMEOUT;
-			break;
-		}
-		udelay(I40IW_SLEEP_COUNT);
-		val = i40iw_rd32(cqp->dev->hw, cqpstat_addr);
-	} while (val);
-
-	i40iw_free_dma_mem(cqp->dev->hw, &cqp->sdbuf);
-	return ret_code;
-}
-
-/**
- * i40iw_sc_ccq_arm - enable intr for control cq
- * @ccq: ccq sc struct
- */
-static void i40iw_sc_ccq_arm(struct i40iw_sc_cq *ccq)
-{
-	u64 temp_val;
-	u16 sw_cq_sel;
-	u8 arm_next_se;
-	u8 arm_seq_num;
-
-	/* write to cq doorbell shadow area */
-	/* arm next se should always be zero */
-	get_64bit_val(ccq->cq_uk.shadow_area, 32, &temp_val);
-
-	sw_cq_sel = (u16)RS_64(temp_val, I40IW_CQ_DBSA_SW_CQ_SELECT);
-	arm_next_se = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_NEXT_SE);
-
-	arm_seq_num = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_SEQ_NUM);
-	arm_seq_num++;
-
-	temp_val = LS_64(arm_seq_num, I40IW_CQ_DBSA_ARM_SEQ_NUM) |
-		   LS_64(sw_cq_sel, I40IW_CQ_DBSA_SW_CQ_SELECT) |
-		   LS_64(arm_next_se, I40IW_CQ_DBSA_ARM_NEXT_SE) |
-		   LS_64(1, I40IW_CQ_DBSA_ARM_NEXT);
-
-	set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val);
-
-	wmb();       /* make sure shadow area is updated before arming */
-
-	if (ccq->dev->is_pf)
-		i40iw_wr32(ccq->dev->hw, I40E_PFPE_CQARM, ccq->cq_uk.cq_id);
-	else
-		i40iw_wr32(ccq->dev->hw, I40E_VFPE_CQARM1, ccq->cq_uk.cq_id);
-}
-
-/**
- * i40iw_sc_ccq_get_cqe_info - get ccq's cq entry
- * @ccq: ccq sc struct
- * @info: completion q entry to return
- */
-static enum i40iw_status_code i40iw_sc_ccq_get_cqe_info(
-					struct i40iw_sc_cq *ccq,
-					struct i40iw_ccq_cqe_info *info)
-{
-	u64 qp_ctx, temp, temp1;
-	u64 *cqe;
-	struct i40iw_sc_cqp *cqp;
-	u32 wqe_idx;
-	u8 polarity;
-	enum i40iw_status_code ret_code = 0;
-
-	if (ccq->cq_uk.avoid_mem_cflct)
-		cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(&ccq->cq_uk);
-	else
-		cqe = (u64 *)I40IW_GET_CURRENT_CQ_ELEMENT(&ccq->cq_uk);
-
-	get_64bit_val(cqe, 24, &temp);
-	polarity = (u8)RS_64(temp, I40IW_CQ_VALID);
-	if (polarity != ccq->cq_uk.polarity)
-		return I40IW_ERR_QUEUE_EMPTY;
-
-	get_64bit_val(cqe, 8, &qp_ctx);
-	cqp = (struct i40iw_sc_cqp *)(unsigned long)qp_ctx;
-	info->error = (bool)RS_64(temp, I40IW_CQ_ERROR);
-	info->min_err_code = (u16)RS_64(temp, I40IW_CQ_MINERR);
-	if (info->error) {
-		info->maj_err_code = (u16)RS_64(temp, I40IW_CQ_MAJERR);
-		info->min_err_code = (u16)RS_64(temp, I40IW_CQ_MINERR);
-	}
-	wqe_idx = (u32)RS_64(temp, I40IW_CQ_WQEIDX);
-	info->scratch = cqp->scratch_array[wqe_idx];
-
-	get_64bit_val(cqe, 16, &temp1);
-	info->op_ret_val = (u32)RS_64(temp1, I40IW_CCQ_OPRETVAL);
-	get_64bit_val(cqp->sq_base[wqe_idx].elem, 24, &temp1);
-	info->op_code = (u8)RS_64(temp1, I40IW_CQPSQ_OPCODE);
-	info->cqp = cqp;
-
-	/*  move the head for cq */
-	I40IW_RING_MOVE_HEAD(ccq->cq_uk.cq_ring, ret_code);
-	if (I40IW_RING_GETCURRENT_HEAD(ccq->cq_uk.cq_ring) == 0)
-		ccq->cq_uk.polarity ^= 1;
-
-	/* update cq tail in cq shadow memory also */
-	I40IW_RING_MOVE_TAIL(ccq->cq_uk.cq_ring);
-	set_64bit_val(ccq->cq_uk.shadow_area,
-		      0,
-		      I40IW_RING_GETCURRENT_HEAD(ccq->cq_uk.cq_ring));
-	wmb(); /* write shadow area before tail */
-	I40IW_RING_MOVE_TAIL(cqp->sq_ring);
-	ccq->dev->cqp_cmd_stats[OP_COMPLETED_COMMANDS]++;
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_poll_for_cqp_op_done - Waits for last write to complete in CQP SQ
- * @cqp: struct for cqp hw
- * @op_code: cqp opcode for completion
- * @compl_info: completion q entry to return
- */
-static enum i40iw_status_code i40iw_sc_poll_for_cqp_op_done(
-					struct i40iw_sc_cqp *cqp,
-					u8 op_code,
-					struct i40iw_ccq_cqe_info *compl_info)
-{
-	struct i40iw_ccq_cqe_info info;
-	struct i40iw_sc_cq *ccq;
-	enum i40iw_status_code ret_code = 0;
-	u32 cnt = 0;
-
-	memset(&info, 0, sizeof(info));
-	ccq = cqp->dev->ccq;
-	while (1) {
-		if (cnt++ > I40IW_DONE_COUNT)
-			return I40IW_ERR_TIMEOUT;
-
-		if (i40iw_sc_ccq_get_cqe_info(ccq, &info)) {
-			udelay(I40IW_SLEEP_COUNT);
-			continue;
-		}
-
-		if (info.error) {
-			ret_code = I40IW_ERR_CQP_COMPL_ERROR;
-			break;
-		}
-		/* check if opcode is cq create */
-		if (op_code != info.op_code) {
-			i40iw_debug(cqp->dev, I40IW_DEBUG_WQE,
-				    "%s: opcode mismatch for my op code 0x%x, returned opcode %x\n",
-				    __func__, op_code, info.op_code);
-		}
-		/* success, exit out of the loop */
-		if (op_code == info.op_code)
-			break;
-	}
-
-	if (compl_info)
-		memcpy(compl_info, &info, sizeof(*compl_info));
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_manage_hmc_pm_func_table - manage of function table
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @vf_index: vf index for cqp
- * @free_pm_fcn: function number
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_manage_hmc_pm_func_table(
-				struct i40iw_sc_cqp *cqp,
-				u64 scratch,
-				u8 vf_index,
-				bool free_pm_fcn,
-				bool post_sq)
-{
-	u64 *wqe;
-	u64 header;
-
-	if (vf_index >= I40IW_MAX_VF_PER_PF)
-		return I40IW_ERR_INVALID_VF_ID;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	header = LS_64(vf_index, I40IW_CQPSQ_MHMC_VFIDX) |
-		 LS_64(I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE, I40IW_CQPSQ_OPCODE) |
-		 LS_64(free_pm_fcn, I40IW_CQPSQ_MHMC_FREEPMFN) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_HMC_PM_FUNC_TABLE WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_set_hmc_resource_profile - cqp wqe for hmc profile
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @hmc_profile_type: type of profile to set
- * @vf_num: vf number for profile
- * @post_sq: flag for cqp db to ring
- * @poll_registers: flag to poll register for cqp completion
- */
-static enum i40iw_status_code i40iw_sc_set_hmc_resource_profile(
-				struct i40iw_sc_cqp *cqp,
-				u64 scratch,
-				u8 hmc_profile_type,
-				u8 vf_num, bool post_sq,
-				bool poll_registers)
-{
-	u64 *wqe;
-	u64 header;
-	u32 val, tail, error;
-	enum i40iw_status_code ret_code = 0;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 16,
-		      (LS_64(hmc_profile_type, I40IW_CQPSQ_SHMCRP_HMC_PROFILE) |
-				LS_64(vf_num, I40IW_CQPSQ_SHMCRP_VFNUM)));
-
-	header = LS_64(I40IW_CQP_OP_SET_HMC_RESOURCE_PROFILE, I40IW_CQPSQ_OPCODE) |
-		       LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_HMC_PM_FUNC_TABLE WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	i40iw_get_cqp_reg_info(cqp, &val, &tail, &error);
-	if (error)
-		return I40IW_ERR_CQP_COMPL_ERROR;
-
-	if (post_sq) {
-		i40iw_sc_cqp_post_sq(cqp);
-		if (poll_registers)
-			ret_code = i40iw_cqp_poll_registers(cqp, tail, 1000000);
-		else
-			ret_code = i40iw_sc_poll_for_cqp_op_done(cqp,
-								 I40IW_CQP_OP_SHMC_PAGES_ALLOCATED,
-								 NULL);
-	}
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_manage_hmc_pm_func_table_done - wait for cqp wqe completion for function table
- * @cqp: struct for cqp hw
- */
-static enum i40iw_status_code i40iw_sc_manage_hmc_pm_func_table_done(struct i40iw_sc_cqp *cqp)
-{
-	return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE, NULL);
-}
-
-/**
- * i40iw_sc_commit_fpm_values_done - wait for cqp eqe completion for fpm commit
- * @cqp: struct for cqp hw
- */
-static enum i40iw_status_code i40iw_sc_commit_fpm_values_done(struct i40iw_sc_cqp *cqp)
-{
-	return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_COMMIT_FPM_VALUES, NULL);
-}
-
-/**
- * i40iw_sc_commit_fpm_values - cqp wqe for commit fpm values
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @hmc_fn_id: hmc function id
- * @commit_fpm_mem: Memory for fpm values
- * @post_sq: flag for cqp db to ring
- * @wait_type: poll ccq or cqp registers for cqp completion
- */
-static enum i40iw_status_code i40iw_sc_commit_fpm_values(
-					struct i40iw_sc_cqp *cqp,
-					u64 scratch,
-					u8 hmc_fn_id,
-					struct i40iw_dma_mem *commit_fpm_mem,
-					bool post_sq,
-					u8 wait_type)
-{
-	u64 *wqe;
-	u64 header;
-	u32 tail, val, error;
-	enum i40iw_status_code ret_code = 0;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 16, hmc_fn_id);
-	set_64bit_val(wqe, 32, commit_fpm_mem->pa);
-
-	header = LS_64(I40IW_CQP_OP_COMMIT_FPM_VALUES, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "COMMIT_FPM_VALUES WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	i40iw_get_cqp_reg_info(cqp, &val, &tail, &error);
-	if (error)
-		return I40IW_ERR_CQP_COMPL_ERROR;
-
-	if (post_sq) {
-		i40iw_sc_cqp_post_sq(cqp);
-
-		if (wait_type == I40IW_CQP_WAIT_POLL_REGS)
-			ret_code = i40iw_cqp_poll_registers(cqp, tail, I40IW_DONE_COUNT);
-		else if (wait_type == I40IW_CQP_WAIT_POLL_CQ)
-			ret_code = i40iw_sc_commit_fpm_values_done(cqp);
-	}
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_query_rdma_features_done - poll cqp for query features done
- * @cqp: struct for cqp hw
- */
-static enum i40iw_status_code
-i40iw_sc_query_rdma_features_done(struct i40iw_sc_cqp *cqp)
-{
-	return i40iw_sc_poll_for_cqp_op_done(
-		cqp, I40IW_CQP_OP_QUERY_RDMA_FEATURES, NULL);
-}
-
-/**
- * i40iw_sc_query_rdma_features - query rdma features
- * @cqp: struct for cqp hw
- * @feat_mem: holds PA for HW to use
- * @scratch: u64 saved to be used during cqp completion
- */
-static enum i40iw_status_code
-i40iw_sc_query_rdma_features(struct i40iw_sc_cqp *cqp,
-			     struct i40iw_dma_mem *feat_mem, u64 scratch)
-{
-	u64 *wqe;
-	u64 header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 32, feat_mem->pa);
-
-	header = LS_64(I40IW_CQP_OP_QUERY_RDMA_FEATURES, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID) | feat_mem->size;
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QUERY RDMA FEATURES WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	i40iw_sc_cqp_post_sq(cqp);
-
-	return 0;
-}
-
-/**
- * i40iw_get_rdma_features - get RDMA features
- * @dev: sc device struct
- */
-enum i40iw_status_code i40iw_get_rdma_features(struct i40iw_sc_dev *dev)
-{
-	enum i40iw_status_code ret_code;
-	struct i40iw_dma_mem feat_buf;
-	u64 temp;
-	u16 byte_idx, feat_type, feat_cnt;
-
-	ret_code = i40iw_allocate_dma_mem(dev->hw, &feat_buf,
-					  I40IW_FEATURE_BUF_SIZE,
-					  I40IW_FEATURE_BUF_ALIGNMENT);
-
-	if (ret_code)
-		return I40IW_ERR_NO_MEMORY;
-
-	ret_code = i40iw_sc_query_rdma_features(dev->cqp, &feat_buf, 0);
-	if (!ret_code)
-		ret_code = i40iw_sc_query_rdma_features_done(dev->cqp);
-
-	if (ret_code)
-		goto exit;
-
-	get_64bit_val(feat_buf.va, 0, &temp);
-	feat_cnt = RS_64(temp, I40IW_FEATURE_CNT);
-	if (feat_cnt < I40IW_MAX_FEATURES) {
-		ret_code = I40IW_ERR_INVALID_FEAT_CNT;
-		goto exit;
-	} else if (feat_cnt > I40IW_MAX_FEATURES) {
-		i40iw_debug(dev, I40IW_DEBUG_CQP,
-			    "features buf size insufficient\n");
-	}
-
-	for (byte_idx = 0, feat_type = 0; feat_type < I40IW_MAX_FEATURES;
-	     feat_type++, byte_idx += 8) {
-		get_64bit_val((u64 *)feat_buf.va, byte_idx, &temp);
-		dev->feature_info[feat_type] = RS_64(temp, I40IW_FEATURE_INFO);
-	}
-exit:
-	i40iw_free_dma_mem(dev->hw, &feat_buf);
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_query_fpm_values_done - poll for cqp wqe completion for query fpm
- * @cqp: struct for cqp hw
- */
-static enum i40iw_status_code i40iw_sc_query_fpm_values_done(struct i40iw_sc_cqp *cqp)
-{
-	return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_QUERY_FPM_VALUES, NULL);
-}
-
-/**
- * i40iw_sc_query_fpm_values - cqp wqe query fpm values
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @hmc_fn_id: hmc function id
- * @query_fpm_mem: memory for return fpm values
- * @post_sq: flag for cqp db to ring
- * @wait_type: poll ccq or cqp registers for cqp completion
- */
-static enum i40iw_status_code i40iw_sc_query_fpm_values(
-					struct i40iw_sc_cqp *cqp,
-					u64 scratch,
-					u8 hmc_fn_id,
-					struct i40iw_dma_mem *query_fpm_mem,
-					bool post_sq,
-					u8 wait_type)
-{
-	u64 *wqe;
-	u64 header;
-	u32 tail, val, error;
-	enum i40iw_status_code ret_code = 0;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 16, hmc_fn_id);
-	set_64bit_val(wqe, 32, query_fpm_mem->pa);
-
-	header = LS_64(I40IW_CQP_OP_QUERY_FPM_VALUES, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QUERY_FPM WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	/* read the tail from CQP_TAIL register */
-	i40iw_get_cqp_reg_info(cqp, &val, &tail, &error);
-
-	if (error)
-		return I40IW_ERR_CQP_COMPL_ERROR;
-
-	if (post_sq) {
-		i40iw_sc_cqp_post_sq(cqp);
-		if (wait_type == I40IW_CQP_WAIT_POLL_REGS)
-			ret_code = i40iw_cqp_poll_registers(cqp, tail, I40IW_DONE_COUNT);
-		else if (wait_type == I40IW_CQP_WAIT_POLL_CQ)
-			ret_code = i40iw_sc_query_fpm_values_done(cqp);
-	}
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_add_arp_cache_entry - cqp wqe add arp cache entry
- * @cqp: struct for cqp hw
- * @info: arp entry information
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_add_arp_cache_entry(
-				struct i40iw_sc_cqp *cqp,
-				struct i40iw_add_arp_cache_entry_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	u64 *wqe;
-	u64 temp, header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 8, info->reach_max);
-
-	temp = info->mac_addr[5] |
-	       LS_64_1(info->mac_addr[4], 8) |
-	       LS_64_1(info->mac_addr[3], 16) |
-	       LS_64_1(info->mac_addr[2], 24) |
-	       LS_64_1(info->mac_addr[1], 32) |
-	       LS_64_1(info->mac_addr[0], 40);
-
-	set_64bit_val(wqe, 16, temp);
-
-	header = info->arp_index |
-		 LS_64(I40IW_CQP_OP_MANAGE_ARP, I40IW_CQPSQ_OPCODE) |
-		 LS_64((info->permanent ? 1 : 0), I40IW_CQPSQ_MAT_PERMANENT) |
-		 LS_64(1, I40IW_CQPSQ_MAT_ENTRYVALID) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "ARP_CACHE_ENTRY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_del_arp_cache_entry - dele arp cache entry
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @arp_index: arp index to delete arp entry
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_del_arp_cache_entry(
-					struct i40iw_sc_cqp *cqp,
-					u64 scratch,
-					u16 arp_index,
-					bool post_sq)
-{
-	u64 *wqe;
-	u64 header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	header = arp_index |
-		 LS_64(I40IW_CQP_OP_MANAGE_ARP, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "ARP_CACHE_DEL_ENTRY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_query_arp_cache_entry - cqp wqe to query arp and arp index
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @arp_index: arp index to delete arp entry
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_query_arp_cache_entry(
-				struct i40iw_sc_cqp *cqp,
-				u64 scratch,
-				u16 arp_index,
-				bool post_sq)
-{
-	u64 *wqe;
-	u64 header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	header = arp_index |
-		 LS_64(I40IW_CQP_OP_MANAGE_ARP, I40IW_CQPSQ_OPCODE) |
-		 LS_64(1, I40IW_CQPSQ_MAT_QUERY) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QUERY_ARP_CACHE_ENTRY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_manage_apbvt_entry - for adding and deleting apbvt entries
- * @cqp: struct for cqp hw
- * @info: info for apbvt entry to add or delete
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_manage_apbvt_entry(
-				struct i40iw_sc_cqp *cqp,
-				struct i40iw_apbvt_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	u64 *wqe;
-	u64 header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 16, info->port);
-
-	header = LS_64(I40IW_CQP_OP_MANAGE_APBVT, I40IW_CQPSQ_OPCODE) |
-		 LS_64(info->add, I40IW_CQPSQ_MAPT_ADDPORT) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_APBVT WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_manage_qhash_table_entry - manage quad hash entries
- * @cqp: struct for cqp hw
- * @info: info for quad hash to manage
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- *
- * This is called before connection establishment is started. For passive connections, when
- * listener is created, it will call with entry type of  I40IW_QHASH_TYPE_TCP_SYN with local
- * ip address and tcp port. When SYN is received (passive connections) or
- * sent (active connections), this routine is called with entry type of
- * I40IW_QHASH_TYPE_TCP_ESTABLISHED and quad is passed in info.
- *
- * When iwarp connection is done and its state moves to RTS, the quad hash entry in
- * the hardware will point to iwarp's qp number and requires no calls from the driver.
- */
-static enum i40iw_status_code i40iw_sc_manage_qhash_table_entry(
-					struct i40iw_sc_cqp *cqp,
-					struct i40iw_qhash_table_info *info,
-					u64 scratch,
-					bool post_sq)
-{
-	u64 *wqe;
-	u64 qw1 = 0;
-	u64 qw2 = 0;
-	u64 temp;
-	struct i40iw_sc_vsi *vsi = info->vsi;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	temp = info->mac_addr[5] |
-		LS_64_1(info->mac_addr[4], 8) |
-		LS_64_1(info->mac_addr[3], 16) |
-		LS_64_1(info->mac_addr[2], 24) |
-		LS_64_1(info->mac_addr[1], 32) |
-		LS_64_1(info->mac_addr[0], 40);
-
-	set_64bit_val(wqe, 0, temp);
-
-	qw1 = LS_64(info->qp_num, I40IW_CQPSQ_QHASH_QPN) |
-	      LS_64(info->dest_port, I40IW_CQPSQ_QHASH_DEST_PORT);
-	if (info->ipv4_valid) {
-		set_64bit_val(wqe,
-			      48,
-			      LS_64(info->dest_ip[0], I40IW_CQPSQ_QHASH_ADDR3));
-	} else {
-		set_64bit_val(wqe,
-			      56,
-			      LS_64(info->dest_ip[0], I40IW_CQPSQ_QHASH_ADDR0) |
-			      LS_64(info->dest_ip[1], I40IW_CQPSQ_QHASH_ADDR1));
-
-		set_64bit_val(wqe,
-			      48,
-			      LS_64(info->dest_ip[2], I40IW_CQPSQ_QHASH_ADDR2) |
-			      LS_64(info->dest_ip[3], I40IW_CQPSQ_QHASH_ADDR3));
-	}
-	qw2 = LS_64(vsi->qos[info->user_pri].qs_handle, I40IW_CQPSQ_QHASH_QS_HANDLE);
-	if (info->vlan_valid)
-		qw2 |= LS_64(info->vlan_id, I40IW_CQPSQ_QHASH_VLANID);
-	set_64bit_val(wqe, 16, qw2);
-	if (info->entry_type == I40IW_QHASH_TYPE_TCP_ESTABLISHED) {
-		qw1 |= LS_64(info->src_port, I40IW_CQPSQ_QHASH_SRC_PORT);
-		if (!info->ipv4_valid) {
-			set_64bit_val(wqe,
-				      40,
-				      LS_64(info->src_ip[0], I40IW_CQPSQ_QHASH_ADDR0) |
-				      LS_64(info->src_ip[1], I40IW_CQPSQ_QHASH_ADDR1));
-			set_64bit_val(wqe,
-				      32,
-				      LS_64(info->src_ip[2], I40IW_CQPSQ_QHASH_ADDR2) |
-				      LS_64(info->src_ip[3], I40IW_CQPSQ_QHASH_ADDR3));
-		} else {
-			set_64bit_val(wqe,
-				      32,
-				      LS_64(info->src_ip[0], I40IW_CQPSQ_QHASH_ADDR3));
-		}
-	}
-
-	set_64bit_val(wqe, 8, qw1);
-	temp = LS_64(cqp->polarity, I40IW_CQPSQ_QHASH_WQEVALID) |
-	       LS_64(I40IW_CQP_OP_MANAGE_QUAD_HASH_TABLE_ENTRY, I40IW_CQPSQ_QHASH_OPCODE) |
-	       LS_64(info->manage, I40IW_CQPSQ_QHASH_MANAGE) |
-	       LS_64(info->ipv4_valid, I40IW_CQPSQ_QHASH_IPV4VALID) |
-	       LS_64(info->vlan_valid, I40IW_CQPSQ_QHASH_VLANVALID) |
-	       LS_64(info->entry_type, I40IW_CQPSQ_QHASH_ENTRYTYPE);
-
-	i40iw_insert_wqe_hdr(wqe, temp);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE_QHASH WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_alloc_local_mac_ipaddr_entry - cqp wqe for loc mac entry
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_alloc_local_mac_ipaddr_entry(
-					struct i40iw_sc_cqp *cqp,
-					u64 scratch,
-					bool post_sq)
-{
-	u64 *wqe;
-	u64 header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	header = LS_64(I40IW_CQP_OP_ALLOCATE_LOC_MAC_IP_TABLE_ENTRY, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "ALLOCATE_LOCAL_MAC_IPADDR WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_add_local_mac_ipaddr_entry - add mac enry
- * @cqp: struct for cqp hw
- * @info:mac addr info
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_add_local_mac_ipaddr_entry(
-				struct i40iw_sc_cqp *cqp,
-				struct i40iw_local_mac_ipaddr_entry_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	u64 *wqe;
-	u64 temp, header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	temp = info->mac_addr[5] |
-		LS_64_1(info->mac_addr[4], 8) |
-		LS_64_1(info->mac_addr[3], 16) |
-		LS_64_1(info->mac_addr[2], 24) |
-		LS_64_1(info->mac_addr[1], 32) |
-		LS_64_1(info->mac_addr[0], 40);
-
-	set_64bit_val(wqe, 32, temp);
-
-	header = LS_64(info->entry_idx, I40IW_CQPSQ_MLIPA_IPTABLEIDX) |
-		 LS_64(I40IW_CQP_OP_MANAGE_LOC_MAC_IP_TABLE, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "ADD_LOCAL_MAC_IPADDR WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_del_local_mac_ipaddr_entry - cqp wqe to dele local mac
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @entry_idx: index of mac entry
- * @ignore_ref_count: to force mac adde delete
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_del_local_mac_ipaddr_entry(
-				struct i40iw_sc_cqp *cqp,
-				u64 scratch,
-				u8 entry_idx,
-				u8 ignore_ref_count,
-				bool post_sq)
-{
-	u64 *wqe;
-	u64 header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	header = LS_64(entry_idx, I40IW_CQPSQ_MLIPA_IPTABLEIDX) |
-		 LS_64(I40IW_CQP_OP_MANAGE_LOC_MAC_IP_TABLE, I40IW_CQPSQ_OPCODE) |
-		 LS_64(1, I40IW_CQPSQ_MLIPA_FREEENTRY) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID) |
-		 LS_64(ignore_ref_count, I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "DEL_LOCAL_MAC_IPADDR WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_cqp_nop - send a nop wqe
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_cqp_nop(struct i40iw_sc_cqp *cqp,
-					       u64 scratch,
-					       bool post_sq)
-{
-	u64 *wqe;
-	u64 header;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	header = LS_64(I40IW_CQP_OP_NOP, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-	i40iw_insert_wqe_hdr(wqe, header);
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "NOP WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_ceq_init - initialize ceq
- * @ceq: ceq sc structure
- * @info: ceq initialization info
- */
-static enum i40iw_status_code i40iw_sc_ceq_init(struct i40iw_sc_ceq *ceq,
-						struct i40iw_ceq_init_info *info)
-{
-	u32 pble_obj_cnt;
-
-	if ((info->elem_cnt < I40IW_MIN_CEQ_ENTRIES) ||
-	    (info->elem_cnt > I40IW_MAX_CEQ_ENTRIES))
-		return I40IW_ERR_INVALID_SIZE;
-
-	if (info->ceq_id >= I40IW_MAX_CEQID)
-		return I40IW_ERR_INVALID_CEQ_ID;
-
-	pble_obj_cnt = info->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt;
-
-	if (info->virtual_map && (info->first_pm_pbl_idx >= pble_obj_cnt))
-		return I40IW_ERR_INVALID_PBLE_INDEX;
-
-	ceq->size = sizeof(*ceq);
-	ceq->ceqe_base = (struct i40iw_ceqe *)info->ceqe_base;
-	ceq->ceq_id = info->ceq_id;
-	ceq->dev = info->dev;
-	ceq->elem_cnt = info->elem_cnt;
-	ceq->ceq_elem_pa = info->ceqe_pa;
-	ceq->virtual_map = info->virtual_map;
-
-	ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0);
-	ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0);
-	ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL);
-
-	ceq->tph_en = info->tph_en;
-	ceq->tph_val = info->tph_val;
-	ceq->polarity = 1;
-	I40IW_RING_INIT(ceq->ceq_ring, ceq->elem_cnt);
-	ceq->dev->ceq[info->ceq_id] = ceq;
-
-	return 0;
-}
-
-/**
- * i40iw_sc_ceq_create - create ceq wqe
- * @ceq: ceq sc structure
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_ceq_create(struct i40iw_sc_ceq *ceq,
-						  u64 scratch,
-						  bool post_sq)
-{
-	struct i40iw_sc_cqp *cqp;
-	u64 *wqe;
-	u64 header;
-
-	cqp = ceq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 16, ceq->elem_cnt);
-	set_64bit_val(wqe, 32, (ceq->virtual_map ? 0 : ceq->ceq_elem_pa));
-	set_64bit_val(wqe, 48, (ceq->virtual_map ? ceq->first_pm_pbl_idx : 0));
-	set_64bit_val(wqe, 56, LS_64(ceq->tph_val, I40IW_CQPSQ_TPHVAL));
-
-	header = ceq->ceq_id |
-		 LS_64(I40IW_CQP_OP_CREATE_CEQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(ceq->pbl_chunk_size, I40IW_CQPSQ_CEQ_LPBLSIZE) |
-		 LS_64(ceq->virtual_map, I40IW_CQPSQ_CEQ_VMAP) |
-		 LS_64(ceq->tph_en, I40IW_CQPSQ_TPHEN) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CEQ_CREATE WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_cceq_create_done - poll for control ceq wqe to complete
- * @ceq: ceq sc structure
- */
-static enum i40iw_status_code i40iw_sc_cceq_create_done(struct i40iw_sc_ceq *ceq)
-{
-	struct i40iw_sc_cqp *cqp;
-
-	cqp = ceq->dev->cqp;
-	return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_CREATE_CEQ, NULL);
-}
-
-/**
- * i40iw_sc_cceq_destroy_done - poll for destroy cceq to complete
- * @ceq: ceq sc structure
- */
-static enum i40iw_status_code i40iw_sc_cceq_destroy_done(struct i40iw_sc_ceq *ceq)
-{
-	struct i40iw_sc_cqp *cqp;
-
-	cqp = ceq->dev->cqp;
-	cqp->process_cqp_sds = i40iw_update_sds_noccq;
-	return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_DESTROY_CEQ, NULL);
-}
-
-/**
- * i40iw_sc_cceq_create - create cceq
- * @ceq: ceq sc structure
- * @scratch: u64 saved to be used during cqp completion
- */
-static enum i40iw_status_code i40iw_sc_cceq_create(struct i40iw_sc_ceq *ceq, u64 scratch)
-{
-	enum i40iw_status_code ret_code;
-
-	ret_code = i40iw_sc_ceq_create(ceq, scratch, true);
-	if (!ret_code)
-		ret_code = i40iw_sc_cceq_create_done(ceq);
-	return ret_code;
-}
-
-/**
- * i40iw_sc_ceq_destroy - destroy ceq
- * @ceq: ceq sc structure
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_ceq_destroy(struct i40iw_sc_ceq *ceq,
-						   u64 scratch,
-						   bool post_sq)
-{
-	struct i40iw_sc_cqp *cqp;
-	u64 *wqe;
-	u64 header;
-
-	cqp = ceq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 16, ceq->elem_cnt);
-	set_64bit_val(wqe, 48, ceq->first_pm_pbl_idx);
-	header = ceq->ceq_id |
-		 LS_64(I40IW_CQP_OP_DESTROY_CEQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(ceq->pbl_chunk_size, I40IW_CQPSQ_CEQ_LPBLSIZE) |
-		 LS_64(ceq->virtual_map, I40IW_CQPSQ_CEQ_VMAP) |
-		 LS_64(ceq->tph_en, I40IW_CQPSQ_TPHEN) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-	i40iw_insert_wqe_hdr(wqe, header);
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CEQ_DESTROY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_process_ceq - process ceq
- * @dev: sc device struct
- * @ceq: ceq sc structure
- */
-static void *i40iw_sc_process_ceq(struct i40iw_sc_dev *dev, struct i40iw_sc_ceq *ceq)
-{
-	u64 temp;
-	u64 *ceqe;
-	struct i40iw_sc_cq *cq = NULL;
-	u8 polarity;
-
-	ceqe = (u64 *)I40IW_GET_CURRENT_CEQ_ELEMENT(ceq);
-	get_64bit_val(ceqe, 0, &temp);
-	polarity = (u8)RS_64(temp, I40IW_CEQE_VALID);
-	if (polarity != ceq->polarity)
-		return cq;
-
-	cq = (struct i40iw_sc_cq *)(unsigned long)LS_64_1(temp, 1);
-
-	I40IW_RING_MOVE_TAIL(ceq->ceq_ring);
-	if (I40IW_RING_GETCURRENT_TAIL(ceq->ceq_ring) == 0)
-		ceq->polarity ^= 1;
-
-	if (dev->is_pf)
-		i40iw_wr32(dev->hw, I40E_PFPE_CQACK, cq->cq_uk.cq_id);
-	else
-		i40iw_wr32(dev->hw, I40E_VFPE_CQACK1, cq->cq_uk.cq_id);
-
-	return cq;
-}
-
-/**
- * i40iw_sc_aeq_init - initialize aeq
- * @aeq: aeq structure ptr
- * @info: aeq initialization info
- */
-static enum i40iw_status_code i40iw_sc_aeq_init(struct i40iw_sc_aeq *aeq,
-						struct i40iw_aeq_init_info *info)
-{
-	u32 pble_obj_cnt;
-
-	if ((info->elem_cnt < I40IW_MIN_AEQ_ENTRIES) ||
-	    (info->elem_cnt > I40IW_MAX_AEQ_ENTRIES))
-		return I40IW_ERR_INVALID_SIZE;
-	pble_obj_cnt = info->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt;
-
-	if (info->virtual_map && (info->first_pm_pbl_idx >= pble_obj_cnt))
-		return I40IW_ERR_INVALID_PBLE_INDEX;
-
-	aeq->size = sizeof(*aeq);
-	aeq->polarity = 1;
-	aeq->aeqe_base = (struct i40iw_sc_aeqe *)info->aeqe_base;
-	aeq->dev = info->dev;
-	aeq->elem_cnt = info->elem_cnt;
-
-	aeq->aeq_elem_pa = info->aeq_elem_pa;
-	I40IW_RING_INIT(aeq->aeq_ring, aeq->elem_cnt);
-	info->dev->aeq = aeq;
-
-	aeq->virtual_map = info->virtual_map;
-	aeq->pbl_list = (aeq->virtual_map ? info->pbl_list : NULL);
-	aeq->pbl_chunk_size = (aeq->virtual_map ? info->pbl_chunk_size : 0);
-	aeq->first_pm_pbl_idx = (aeq->virtual_map ? info->first_pm_pbl_idx : 0);
-	info->dev->aeq = aeq;
-	return 0;
-}
-
-/**
- * i40iw_sc_aeq_create - create aeq
- * @aeq: aeq structure ptr
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_aeq_create(struct i40iw_sc_aeq *aeq,
-						  u64 scratch,
-						  bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-
-	cqp = aeq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 16, aeq->elem_cnt);
-	set_64bit_val(wqe, 32,
-		      (aeq->virtual_map ? 0 : aeq->aeq_elem_pa));
-	set_64bit_val(wqe, 48,
-		      (aeq->virtual_map ? aeq->first_pm_pbl_idx : 0));
-
-	header = LS_64(I40IW_CQP_OP_CREATE_AEQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(aeq->pbl_chunk_size, I40IW_CQPSQ_AEQ_LPBLSIZE) |
-		 LS_64(aeq->virtual_map, I40IW_CQPSQ_AEQ_VMAP) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "AEQ_CREATE WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_aeq_destroy - destroy aeq during close
- * @aeq: aeq structure ptr
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_aeq_destroy(struct i40iw_sc_aeq *aeq,
-						   u64 scratch,
-						   bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-
-	cqp = aeq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 16, aeq->elem_cnt);
-	set_64bit_val(wqe, 48, aeq->first_pm_pbl_idx);
-	header = LS_64(I40IW_CQP_OP_DESTROY_AEQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(aeq->pbl_chunk_size, I40IW_CQPSQ_AEQ_LPBLSIZE) |
-		 LS_64(aeq->virtual_map, I40IW_CQPSQ_AEQ_VMAP) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "AEQ_DESTROY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_get_next_aeqe - get next aeq entry
- * @aeq: aeq structure ptr
- * @info: aeqe info to be returned
- */
-static enum i40iw_status_code i40iw_sc_get_next_aeqe(struct i40iw_sc_aeq *aeq,
-						     struct i40iw_aeqe_info *info)
-{
-	u64 temp, compl_ctx;
-	u64 *aeqe;
-	u16 wqe_idx;
-	u8 ae_src;
-	u8 polarity;
-
-	aeqe = (u64 *)I40IW_GET_CURRENT_AEQ_ELEMENT(aeq);
-	get_64bit_val(aeqe, 0, &compl_ctx);
-	get_64bit_val(aeqe, 8, &temp);
-	polarity = (u8)RS_64(temp, I40IW_AEQE_VALID);
-
-	if (aeq->polarity != polarity)
-		return I40IW_ERR_QUEUE_EMPTY;
-
-	i40iw_debug_buf(aeq->dev, I40IW_DEBUG_WQE, "AEQ_ENTRY", aeqe, 16);
-
-	ae_src = (u8)RS_64(temp, I40IW_AEQE_AESRC);
-	wqe_idx = (u16)RS_64(temp, I40IW_AEQE_WQDESCIDX);
-	info->qp_cq_id = (u32)RS_64(temp, I40IW_AEQE_QPCQID);
-	info->ae_id = (u16)RS_64(temp, I40IW_AEQE_AECODE);
-	info->tcp_state = (u8)RS_64(temp, I40IW_AEQE_TCPSTATE);
-	info->iwarp_state = (u8)RS_64(temp, I40IW_AEQE_IWSTATE);
-	info->q2_data_written = (u8)RS_64(temp, I40IW_AEQE_Q2DATA);
-	info->aeqe_overflow = (bool)RS_64(temp, I40IW_AEQE_OVERFLOW);
-
-	switch (info->ae_id) {
-	case I40IW_AE_PRIV_OPERATION_DENIED:
-	case I40IW_AE_UDA_XMIT_DGRAM_TOO_LONG:
-	case I40IW_AE_UDA_XMIT_DGRAM_TOO_SHORT:
-	case I40IW_AE_BAD_CLOSE:
-	case I40IW_AE_RDMAP_ROE_BAD_LLP_CLOSE:
-	case I40IW_AE_RDMA_READ_WHILE_ORD_ZERO:
-	case I40IW_AE_STAG_ZERO_INVALID:
-	case I40IW_AE_IB_RREQ_AND_Q1_FULL:
-	case I40IW_AE_WQE_UNEXPECTED_OPCODE:
-	case I40IW_AE_DDP_UBE_INVALID_DDP_VERSION:
-	case I40IW_AE_DDP_UBE_INVALID_MO:
-	case I40IW_AE_DDP_UBE_INVALID_QN:
-	case I40IW_AE_DDP_NO_L_BIT:
-	case I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION:
-	case I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE:
-	case I40IW_AE_ROE_INVALID_RDMA_READ_REQUEST:
-	case I40IW_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
-	case I40IW_AE_INVALID_ARP_ENTRY:
-	case I40IW_AE_INVALID_TCP_OPTION_RCVD:
-	case I40IW_AE_STALE_ARP_ENTRY:
-	case I40IW_AE_LLP_CLOSE_COMPLETE:
-	case I40IW_AE_LLP_CONNECTION_RESET:
-	case I40IW_AE_LLP_FIN_RECEIVED:
-	case I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR:
-	case I40IW_AE_LLP_SEGMENT_TOO_SMALL:
-	case I40IW_AE_LLP_SYN_RECEIVED:
-	case I40IW_AE_LLP_TERMINATE_RECEIVED:
-	case I40IW_AE_LLP_TOO_MANY_RETRIES:
-	case I40IW_AE_LLP_DOUBT_REACHABILITY:
-	case I40IW_AE_RESET_SENT:
-	case I40IW_AE_TERMINATE_SENT:
-	case I40IW_AE_RESET_NOT_SENT:
-	case I40IW_AE_LCE_QP_CATASTROPHIC:
-	case I40IW_AE_QP_SUSPEND_COMPLETE:
-		info->qp = true;
-		info->compl_ctx = compl_ctx;
-		ae_src = I40IW_AE_SOURCE_RSVD;
-		break;
-	case I40IW_AE_LCE_CQ_CATASTROPHIC:
-		info->cq = true;
-		info->compl_ctx = LS_64_1(compl_ctx, 1);
-		ae_src = I40IW_AE_SOURCE_RSVD;
-		break;
-	}
-
-	switch (ae_src) {
-	case I40IW_AE_SOURCE_RQ:
-	case I40IW_AE_SOURCE_RQ_0011:
-		info->qp = true;
-		info->wqe_idx = wqe_idx;
-		info->compl_ctx = compl_ctx;
-		break;
-	case I40IW_AE_SOURCE_CQ:
-	case I40IW_AE_SOURCE_CQ_0110:
-	case I40IW_AE_SOURCE_CQ_1010:
-	case I40IW_AE_SOURCE_CQ_1110:
-		info->cq = true;
-		info->compl_ctx = LS_64_1(compl_ctx, 1);
-		break;
-	case I40IW_AE_SOURCE_SQ:
-	case I40IW_AE_SOURCE_SQ_0111:
-		info->qp = true;
-		info->sq = true;
-		info->wqe_idx = wqe_idx;
-		info->compl_ctx = compl_ctx;
-		break;
-	case I40IW_AE_SOURCE_IN_RR_WR:
-	case I40IW_AE_SOURCE_IN_RR_WR_1011:
-		info->qp = true;
-		info->compl_ctx = compl_ctx;
-		info->in_rdrsp_wr = true;
-		break;
-	case I40IW_AE_SOURCE_OUT_RR:
-	case I40IW_AE_SOURCE_OUT_RR_1111:
-		info->qp = true;
-		info->compl_ctx = compl_ctx;
-		info->out_rdrsp = true;
-		break;
-	case I40IW_AE_SOURCE_RSVD:
-	default:
-		break;
-	}
-	I40IW_RING_MOVE_TAIL(aeq->aeq_ring);
-	if (I40IW_RING_GETCURRENT_TAIL(aeq->aeq_ring) == 0)
-		aeq->polarity ^= 1;
-	return 0;
-}
-
-/**
- * i40iw_sc_repost_aeq_entries - repost completed aeq entries
- * @dev: sc device struct
- * @count: allocate count
- */
-static enum i40iw_status_code i40iw_sc_repost_aeq_entries(struct i40iw_sc_dev *dev,
-							  u32 count)
-{
-
-	if (dev->is_pf)
-		i40iw_wr32(dev->hw, I40E_PFPE_AEQALLOC, count);
-	else
-		i40iw_wr32(dev->hw, I40E_VFPE_AEQALLOC1, count);
-
-	return 0;
-}
-
-/**
- * i40iw_sc_aeq_create_done - create aeq
- * @aeq: aeq structure ptr
- */
-static enum i40iw_status_code i40iw_sc_aeq_create_done(struct i40iw_sc_aeq *aeq)
-{
-	struct i40iw_sc_cqp *cqp;
-
-	cqp = aeq->dev->cqp;
-	return i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_CREATE_AEQ, NULL);
-}
-
-/**
- * i40iw_sc_aeq_destroy_done - destroy of aeq during close
- * @aeq: aeq structure ptr
- */
-static enum i40iw_status_code i40iw_sc_aeq_destroy_done(struct i40iw_sc_aeq *aeq)
-{
-	struct i40iw_sc_cqp *cqp;
-
-	cqp = aeq->dev->cqp;
-	return  i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_DESTROY_AEQ, NULL);
-}
-
-/**
- * i40iw_sc_ccq_init - initialize control cq
- * @cq: sc's cq ctruct
- * @info: info for control cq initialization
- */
-static enum i40iw_status_code i40iw_sc_ccq_init(struct i40iw_sc_cq *cq,
-						struct i40iw_ccq_init_info *info)
-{
-	u32 pble_obj_cnt;
-
-	if (info->num_elem < I40IW_MIN_CQ_SIZE || info->num_elem > I40IW_MAX_CQ_SIZE)
-		return I40IW_ERR_INVALID_SIZE;
-
-	if (info->ceq_id > I40IW_MAX_CEQID)
-		return I40IW_ERR_INVALID_CEQ_ID;
-
-	pble_obj_cnt = info->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt;
-
-	if (info->virtual_map && (info->first_pm_pbl_idx >= pble_obj_cnt))
-		return I40IW_ERR_INVALID_PBLE_INDEX;
-
-	cq->cq_pa = info->cq_pa;
-	cq->cq_uk.cq_base = info->cq_base;
-	cq->shadow_area_pa = info->shadow_area_pa;
-	cq->cq_uk.shadow_area = info->shadow_area;
-	cq->shadow_read_threshold = info->shadow_read_threshold;
-	cq->dev = info->dev;
-	cq->ceq_id = info->ceq_id;
-	cq->cq_uk.cq_size = info->num_elem;
-	cq->cq_type = I40IW_CQ_TYPE_CQP;
-	cq->ceqe_mask = info->ceqe_mask;
-	I40IW_RING_INIT(cq->cq_uk.cq_ring, info->num_elem);
-
-	cq->cq_uk.cq_id = 0;    /* control cq is id 0 always */
-	cq->ceq_id_valid = info->ceq_id_valid;
-	cq->tph_en = info->tph_en;
-	cq->tph_val = info->tph_val;
-	cq->cq_uk.avoid_mem_cflct = info->avoid_mem_cflct;
-
-	cq->pbl_list = info->pbl_list;
-	cq->virtual_map = info->virtual_map;
-	cq->pbl_chunk_size = info->pbl_chunk_size;
-	cq->first_pm_pbl_idx = info->first_pm_pbl_idx;
-	cq->cq_uk.polarity = true;
-
-	/* following are only for iw cqs so initialize them to zero */
-	cq->cq_uk.cqe_alloc_reg = NULL;
-	info->dev->ccq = cq;
-	return 0;
-}
-
-/**
- * i40iw_sc_ccq_create_done - poll cqp for ccq create
- * @ccq: ccq sc struct
- */
-static enum i40iw_status_code i40iw_sc_ccq_create_done(struct i40iw_sc_cq *ccq)
-{
-	struct i40iw_sc_cqp *cqp;
-
-	cqp = ccq->dev->cqp;
-	return	i40iw_sc_poll_for_cqp_op_done(cqp, I40IW_CQP_OP_CREATE_CQ, NULL);
-}
-
-/**
- * i40iw_sc_ccq_create - create control cq
- * @ccq: ccq sc struct
- * @scratch: u64 saved to be used during cqp completion
- * @check_overflow: overlow flag for ccq
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_ccq_create(struct i40iw_sc_cq *ccq,
-						  u64 scratch,
-						  bool check_overflow,
-						  bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-	enum i40iw_status_code ret_code;
-
-	cqp = ccq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 0, ccq->cq_uk.cq_size);
-	set_64bit_val(wqe, 8, RS_64_1(ccq, 1));
-	set_64bit_val(wqe, 16,
-		      LS_64(ccq->shadow_read_threshold, I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD));
-	set_64bit_val(wqe, 32, (ccq->virtual_map ? 0 : ccq->cq_pa));
-	set_64bit_val(wqe, 40, ccq->shadow_area_pa);
-	set_64bit_val(wqe, 48,
-		      (ccq->virtual_map ? ccq->first_pm_pbl_idx : 0));
-	set_64bit_val(wqe, 56,
-		      LS_64(ccq->tph_val, I40IW_CQPSQ_TPHVAL));
-
-	header = ccq->cq_uk.cq_id |
-		 LS_64((ccq->ceq_id_valid ? ccq->ceq_id : 0), I40IW_CQPSQ_CQ_CEQID) |
-		 LS_64(I40IW_CQP_OP_CREATE_CQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(ccq->pbl_chunk_size, I40IW_CQPSQ_CQ_LPBLSIZE) |
-		 LS_64(check_overflow, I40IW_CQPSQ_CQ_CHKOVERFLOW) |
-		 LS_64(ccq->virtual_map, I40IW_CQPSQ_CQ_VIRTMAP) |
-		 LS_64(ccq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) |
-		 LS_64(ccq->ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) |
-		 LS_64(ccq->tph_en, I40IW_CQPSQ_TPHEN) |
-		 LS_64(ccq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CCQ_CREATE WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq) {
-		i40iw_sc_cqp_post_sq(cqp);
-		ret_code = i40iw_sc_ccq_create_done(ccq);
-		if (ret_code)
-			return ret_code;
-	}
-	cqp->process_cqp_sds = i40iw_cqp_sds_cmd;
-
-	return 0;
-}
-
-/**
- * i40iw_sc_ccq_destroy - destroy ccq during close
- * @ccq: ccq sc struct
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_ccq_destroy(struct i40iw_sc_cq *ccq,
-						   u64 scratch,
-						   bool post_sq)
-{
-	struct i40iw_sc_cqp *cqp;
-	u64 *wqe;
-	u64 header;
-	enum i40iw_status_code ret_code = 0;
-	u32 tail, val, error;
-
-	cqp = ccq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 0, ccq->cq_uk.cq_size);
-	set_64bit_val(wqe, 8, RS_64_1(ccq, 1));
-	set_64bit_val(wqe, 40, ccq->shadow_area_pa);
-
-	header = ccq->cq_uk.cq_id |
-		 LS_64((ccq->ceq_id_valid ? ccq->ceq_id : 0), I40IW_CQPSQ_CQ_CEQID) |
-		 LS_64(I40IW_CQP_OP_DESTROY_CQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(ccq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) |
-		 LS_64(ccq->ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) |
-		 LS_64(ccq->tph_en, I40IW_CQPSQ_TPHEN) |
-		 LS_64(ccq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CCQ_DESTROY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	i40iw_get_cqp_reg_info(cqp, &val, &tail, &error);
-	if (error)
-		return I40IW_ERR_CQP_COMPL_ERROR;
-
-	if (post_sq) {
-		i40iw_sc_cqp_post_sq(cqp);
-		ret_code = i40iw_cqp_poll_registers(cqp, tail, 1000);
-	}
-
-	cqp->process_cqp_sds = i40iw_update_sds_noccq;
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_cq_init - initialize completion q
- * @cq: cq struct
- * @info: cq initialization info
- */
-static enum i40iw_status_code i40iw_sc_cq_init(struct i40iw_sc_cq *cq,
-					       struct i40iw_cq_init_info *info)
-{
-	u32 __iomem *cqe_alloc_reg = NULL;
-	enum i40iw_status_code ret_code;
-	u32 pble_obj_cnt;
-	u32 arm_offset;
-
-	pble_obj_cnt = info->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt;
-
-	if (info->virtual_map && (info->first_pm_pbl_idx >= pble_obj_cnt))
-		return I40IW_ERR_INVALID_PBLE_INDEX;
-
-	cq->cq_pa = info->cq_base_pa;
-	cq->dev = info->dev;
-	cq->ceq_id = info->ceq_id;
-	arm_offset = (info->dev->is_pf) ? I40E_PFPE_CQARM : I40E_VFPE_CQARM1;
-	if (i40iw_get_hw_addr(cq->dev))
-		cqe_alloc_reg = (u32 __iomem *)(i40iw_get_hw_addr(cq->dev) +
-					      arm_offset);
-	info->cq_uk_init_info.cqe_alloc_reg = cqe_alloc_reg;
-	ret_code = i40iw_cq_uk_init(&cq->cq_uk, &info->cq_uk_init_info);
-	if (ret_code)
-		return ret_code;
-	cq->virtual_map = info->virtual_map;
-	cq->pbl_chunk_size = info->pbl_chunk_size;
-	cq->ceqe_mask = info->ceqe_mask;
-	cq->cq_type = (info->type) ? info->type : I40IW_CQ_TYPE_IWARP;
-
-	cq->shadow_area_pa = info->shadow_area_pa;
-	cq->shadow_read_threshold = info->shadow_read_threshold;
-
-	cq->ceq_id_valid = info->ceq_id_valid;
-	cq->tph_en = info->tph_en;
-	cq->tph_val = info->tph_val;
-
-	cq->first_pm_pbl_idx = info->first_pm_pbl_idx;
-
-	return 0;
-}
-
-/**
- * i40iw_sc_cq_create - create completion q
- * @cq: cq struct
- * @scratch: u64 saved to be used during cqp completion
- * @check_overflow: flag for overflow check
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_cq_create(struct i40iw_sc_cq *cq,
-						 u64 scratch,
-						 bool check_overflow,
-						 bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-
-	if (cq->cq_uk.cq_id > I40IW_MAX_CQID)
-		return I40IW_ERR_INVALID_CQ_ID;
-
-	if (cq->ceq_id > I40IW_MAX_CEQID)
-		return I40IW_ERR_INVALID_CEQ_ID;
-
-	cqp = cq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
-	set_64bit_val(wqe, 8, RS_64_1(cq, 1));
-	set_64bit_val(wqe,
-		      16,
-		      LS_64(cq->shadow_read_threshold, I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD));
-
-	set_64bit_val(wqe, 32, (cq->virtual_map ? 0 : cq->cq_pa));
-
-	set_64bit_val(wqe, 40, cq->shadow_area_pa);
-	set_64bit_val(wqe, 48, (cq->virtual_map ? cq->first_pm_pbl_idx : 0));
-	set_64bit_val(wqe, 56, LS_64(cq->tph_val, I40IW_CQPSQ_TPHVAL));
-
-	header = cq->cq_uk.cq_id |
-		 LS_64((cq->ceq_id_valid ? cq->ceq_id : 0), I40IW_CQPSQ_CQ_CEQID) |
-		 LS_64(I40IW_CQP_OP_CREATE_CQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cq->pbl_chunk_size, I40IW_CQPSQ_CQ_LPBLSIZE) |
-		 LS_64(check_overflow, I40IW_CQPSQ_CQ_CHKOVERFLOW) |
-		 LS_64(cq->virtual_map, I40IW_CQPSQ_CQ_VIRTMAP) |
-		 LS_64(cq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) |
-		 LS_64(cq->ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) |
-		 LS_64(cq->tph_en, I40IW_CQPSQ_TPHEN) |
-		 LS_64(cq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CQ_CREATE WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_cq_destroy - destroy completion q
- * @cq: cq struct
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_cq_destroy(struct i40iw_sc_cq *cq,
-						  u64 scratch,
-						  bool post_sq)
-{
-	struct i40iw_sc_cqp *cqp;
-	u64 *wqe;
-	u64 header;
-
-	cqp = cq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
-	set_64bit_val(wqe, 8, RS_64_1(cq, 1));
-	set_64bit_val(wqe, 40, cq->shadow_area_pa);
-	set_64bit_val(wqe, 48, (cq->virtual_map ? cq->first_pm_pbl_idx : 0));
-
-	header = cq->cq_uk.cq_id |
-		 LS_64((cq->ceq_id_valid ? cq->ceq_id : 0), I40IW_CQPSQ_CQ_CEQID) |
-		 LS_64(I40IW_CQP_OP_DESTROY_CQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cq->pbl_chunk_size, I40IW_CQPSQ_CQ_LPBLSIZE) |
-		 LS_64(cq->virtual_map, I40IW_CQPSQ_CQ_VIRTMAP) |
-		 LS_64(cq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) |
-		 LS_64(cq->ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) |
-		 LS_64(cq->tph_en, I40IW_CQPSQ_TPHEN) |
-		 LS_64(cq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CQ_DESTROY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_cq_modify - modify a Completion Queue
- * @cq: cq struct
- * @info: modification info struct
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag to post to sq
- */
-static enum i40iw_status_code i40iw_sc_cq_modify(struct i40iw_sc_cq *cq,
-						 struct i40iw_modify_cq_info *info,
-						 u64 scratch,
-						 bool post_sq)
-{
-	struct i40iw_sc_cqp *cqp;
-	u64 *wqe;
-	u64 header;
-	u32 cq_size, ceq_id, first_pm_pbl_idx;
-	u8 pbl_chunk_size;
-	bool virtual_map, ceq_id_valid, check_overflow;
-	u32 pble_obj_cnt;
-
-	if (info->ceq_valid && (info->ceq_id > I40IW_MAX_CEQID))
-		return I40IW_ERR_INVALID_CEQ_ID;
-
-	pble_obj_cnt = cq->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt;
-
-	if (info->cq_resize && info->virtual_map &&
-	    (info->first_pm_pbl_idx >= pble_obj_cnt))
-		return I40IW_ERR_INVALID_PBLE_INDEX;
-
-	cqp = cq->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	cq->pbl_list = info->pbl_list;
-	cq->cq_pa = info->cq_pa;
-	cq->first_pm_pbl_idx = info->first_pm_pbl_idx;
-
-	cq_size = info->cq_resize ? info->cq_size : cq->cq_uk.cq_size;
-	if (info->ceq_change) {
-		ceq_id_valid = true;
-		ceq_id = info->ceq_id;
-	} else {
-		ceq_id_valid = cq->ceq_id_valid;
-		ceq_id = ceq_id_valid ? cq->ceq_id : 0;
-	}
-	virtual_map = info->cq_resize ? info->virtual_map : cq->virtual_map;
-	first_pm_pbl_idx = (info->cq_resize ?
-			    (info->virtual_map ? info->first_pm_pbl_idx : 0) :
-			    (cq->virtual_map ? cq->first_pm_pbl_idx : 0));
-	pbl_chunk_size = (info->cq_resize ?
-			  (info->virtual_map ? info->pbl_chunk_size : 0) :
-			  (cq->virtual_map ? cq->pbl_chunk_size : 0));
-	check_overflow = info->check_overflow_change ? info->check_overflow :
-			 cq->check_overflow;
-	cq->cq_uk.cq_size = cq_size;
-	cq->ceq_id_valid = ceq_id_valid;
-	cq->ceq_id = ceq_id;
-	cq->virtual_map = virtual_map;
-	cq->first_pm_pbl_idx = first_pm_pbl_idx;
-	cq->pbl_chunk_size = pbl_chunk_size;
-	cq->check_overflow = check_overflow;
-
-	set_64bit_val(wqe, 0, cq_size);
-	set_64bit_val(wqe, 8, RS_64_1(cq, 1));
-	set_64bit_val(wqe, 16,
-		      LS_64(info->shadow_read_threshold, I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD));
-	set_64bit_val(wqe, 32, (cq->virtual_map ? 0 : cq->cq_pa));
-	set_64bit_val(wqe, 40, cq->shadow_area_pa);
-	set_64bit_val(wqe, 48, (cq->virtual_map ? first_pm_pbl_idx : 0));
-	set_64bit_val(wqe, 56, LS_64(cq->tph_val, I40IW_CQPSQ_TPHVAL));
-
-	header = cq->cq_uk.cq_id |
-		 LS_64(ceq_id, I40IW_CQPSQ_CQ_CEQID) |
-		 LS_64(I40IW_CQP_OP_MODIFY_CQ, I40IW_CQPSQ_OPCODE) |
-		 LS_64(info->cq_resize, I40IW_CQPSQ_CQ_CQRESIZE) |
-		 LS_64(pbl_chunk_size, I40IW_CQPSQ_CQ_LPBLSIZE) |
-		 LS_64(check_overflow, I40IW_CQPSQ_CQ_CHKOVERFLOW) |
-		 LS_64(virtual_map, I40IW_CQPSQ_CQ_VIRTMAP) |
-		 LS_64(cq->ceqe_mask, I40IW_CQPSQ_CQ_ENCEQEMASK) |
-		 LS_64(ceq_id_valid, I40IW_CQPSQ_CQ_CEQIDVALID) |
-		 LS_64(cq->tph_en, I40IW_CQPSQ_TPHEN) |
-		 LS_64(cq->cq_uk.avoid_mem_cflct, I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "CQ_MODIFY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_qp_init - initialize qp
- * @qp: sc qp
- * @info: initialization qp info
- */
-static enum i40iw_status_code i40iw_sc_qp_init(struct i40iw_sc_qp *qp,
-					       struct i40iw_qp_init_info *info)
-{
-	u32 __iomem *wqe_alloc_reg = NULL;
-	enum i40iw_status_code ret_code;
-	u32 pble_obj_cnt;
-	u8 wqe_size;
-	u32 offset;
-
-	qp->dev = info->pd->dev;
-	qp->vsi = info->vsi;
-	qp->sq_pa = info->sq_pa;
-	qp->rq_pa = info->rq_pa;
-	qp->hw_host_ctx_pa = info->host_ctx_pa;
-	qp->q2_pa = info->q2_pa;
-	qp->shadow_area_pa = info->shadow_area_pa;
-
-	qp->q2_buf = info->q2;
-	qp->pd = info->pd;
-	qp->hw_host_ctx = info->host_ctx;
-	offset = (qp->pd->dev->is_pf) ? I40E_PFPE_WQEALLOC : I40E_VFPE_WQEALLOC1;
-	if (i40iw_get_hw_addr(qp->pd->dev))
-		wqe_alloc_reg = (u32 __iomem *)(i40iw_get_hw_addr(qp->pd->dev) +
-					      offset);
-
-	info->qp_uk_init_info.wqe_alloc_reg = wqe_alloc_reg;
-	info->qp_uk_init_info.abi_ver = qp->pd->abi_ver;
-	ret_code = i40iw_qp_uk_init(&qp->qp_uk, &info->qp_uk_init_info);
-	if (ret_code)
-		return ret_code;
-	qp->virtual_map = info->virtual_map;
-
-	pble_obj_cnt = info->pd->dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt;
-
-	if ((info->virtual_map && (info->sq_pa >= pble_obj_cnt)) ||
-	    (info->virtual_map && (info->rq_pa >= pble_obj_cnt)))
-		return I40IW_ERR_INVALID_PBLE_INDEX;
-
-	qp->llp_stream_handle = (void *)(-1);
-	qp->qp_type = (info->type) ? info->type : I40IW_QP_TYPE_IWARP;
-
-	qp->hw_sq_size = i40iw_get_encoded_wqe_size(qp->qp_uk.sq_ring.size,
-						    false);
-	i40iw_debug(qp->dev, I40IW_DEBUG_WQE, "%s: hw_sq_size[%04d] sq_ring.size[%04d]\n",
-		    __func__, qp->hw_sq_size, qp->qp_uk.sq_ring.size);
-
-	switch (qp->pd->abi_ver) {
-	case 4:
-		ret_code = i40iw_fragcnt_to_wqesize_rq(qp->qp_uk.max_rq_frag_cnt,
-						       &wqe_size);
-		if (ret_code)
-			return ret_code;
-		break;
-	case 5: /* fallthrough until next ABI version */
-	default:
-		if (qp->qp_uk.max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT)
-			return I40IW_ERR_INVALID_FRAG_COUNT;
-		wqe_size = I40IW_MAX_WQE_SIZE_RQ;
-		break;
-	}
-	qp->hw_rq_size = i40iw_get_encoded_wqe_size(qp->qp_uk.rq_size *
-				(wqe_size / I40IW_QP_WQE_MIN_SIZE), false);
-	i40iw_debug(qp->dev, I40IW_DEBUG_WQE,
-		    "%s: hw_rq_size[%04d] qp_uk.rq_size[%04d] wqe_size[%04d]\n",
-		    __func__, qp->hw_rq_size, qp->qp_uk.rq_size, wqe_size);
-	qp->sq_tph_val = info->sq_tph_val;
-	qp->rq_tph_val = info->rq_tph_val;
-	qp->sq_tph_en = info->sq_tph_en;
-	qp->rq_tph_en = info->rq_tph_en;
-	qp->rcv_tph_en = info->rcv_tph_en;
-	qp->xmit_tph_en = info->xmit_tph_en;
-	qp->qs_handle = qp->vsi->qos[qp->user_pri].qs_handle;
-
-	return 0;
-}
-
-/**
- * i40iw_sc_qp_create - create qp
- * @qp: sc qp
- * @info: qp create info
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_qp_create(
-				struct i40iw_sc_qp *qp,
-				struct i40iw_create_qp_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	struct i40iw_sc_cqp *cqp;
-	u64 *wqe;
-	u64 header;
-
-	if ((qp->qp_uk.qp_id < I40IW_MIN_IW_QP_ID) ||
-	    (qp->qp_uk.qp_id > I40IW_MAX_IW_QP_ID))
-		return I40IW_ERR_INVALID_QP_ID;
-
-	cqp = qp->pd->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 16, qp->hw_host_ctx_pa);
-
-	set_64bit_val(wqe, 40, qp->shadow_area_pa);
-
-	header = qp->qp_uk.qp_id |
-		 LS_64(I40IW_CQP_OP_CREATE_QP, I40IW_CQPSQ_OPCODE) |
-		 LS_64((info->ord_valid ? 1 : 0), I40IW_CQPSQ_QP_ORDVALID) |
-		 LS_64(info->tcp_ctx_valid, I40IW_CQPSQ_QP_TOECTXVALID) |
-		 LS_64(qp->qp_type, I40IW_CQPSQ_QP_QPTYPE) |
-		 LS_64(qp->virtual_map, I40IW_CQPSQ_QP_VQ) |
-		 LS_64(info->cq_num_valid, I40IW_CQPSQ_QP_CQNUMVALID) |
-		 LS_64(info->arp_cache_idx_valid, I40IW_CQPSQ_QP_ARPTABIDXVALID) |
-		 LS_64(info->next_iwarp_state, I40IW_CQPSQ_QP_NEXTIWSTATE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QP_CREATE WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_qp_modify - modify qp cqp wqe
- * @qp: sc qp
- * @info: modify qp info
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_qp_modify(
-				struct i40iw_sc_qp *qp,
-				struct i40iw_modify_qp_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-	u8 term_actions = 0;
-	u8 term_len = 0;
-
-	cqp = qp->pd->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	if (info->next_iwarp_state == I40IW_QP_STATE_TERMINATE) {
-		if (info->dont_send_fin)
-			term_actions += I40IWQP_TERM_SEND_TERM_ONLY;
-		if (info->dont_send_term)
-			term_actions += I40IWQP_TERM_SEND_FIN_ONLY;
-		if ((term_actions == I40IWQP_TERM_SEND_TERM_AND_FIN) ||
-		    (term_actions == I40IWQP_TERM_SEND_TERM_ONLY))
-			term_len = info->termlen;
-	}
-
-	set_64bit_val(wqe,
-		      8,
-		      LS_64(term_len, I40IW_CQPSQ_QP_TERMLEN));
-
-	set_64bit_val(wqe, 16, qp->hw_host_ctx_pa);
-	set_64bit_val(wqe, 40, qp->shadow_area_pa);
-
-	header = qp->qp_uk.qp_id |
-		 LS_64(I40IW_CQP_OP_MODIFY_QP, I40IW_CQPSQ_OPCODE) |
-		 LS_64(info->ord_valid, I40IW_CQPSQ_QP_ORDVALID) |
-		 LS_64(info->tcp_ctx_valid, I40IW_CQPSQ_QP_TOECTXVALID) |
-		 LS_64(info->cached_var_valid, I40IW_CQPSQ_QP_CACHEDVARVALID) |
-		 LS_64(qp->virtual_map, I40IW_CQPSQ_QP_VQ) |
-		 LS_64(info->cq_num_valid, I40IW_CQPSQ_QP_CQNUMVALID) |
-		 LS_64(info->force_loopback, I40IW_CQPSQ_QP_FORCELOOPBACK) |
-		 LS_64(qp->qp_type, I40IW_CQPSQ_QP_QPTYPE) |
-		 LS_64(info->remove_hash_idx, I40IW_CQPSQ_QP_REMOVEHASHENTRY) |
-		 LS_64(term_actions, I40IW_CQPSQ_QP_TERMACT) |
-		 LS_64(info->reset_tcp_conn, I40IW_CQPSQ_QP_RESETCON) |
-		 LS_64(info->arp_cache_idx_valid, I40IW_CQPSQ_QP_ARPTABIDXVALID) |
-		 LS_64(info->next_iwarp_state, I40IW_CQPSQ_QP_NEXTIWSTATE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QP_MODIFY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_qp_destroy - cqp destroy qp
- * @qp: sc qp
- * @scratch: u64 saved to be used during cqp completion
- * @remove_hash_idx: flag if to remove hash idx
- * @ignore_mw_bnd: memory window bind flag
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_qp_destroy(
-					struct i40iw_sc_qp *qp,
-					u64 scratch,
-					bool remove_hash_idx,
-					bool ignore_mw_bnd,
-					bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-
-	i40iw_qp_rem_qos(qp);
-	cqp = qp->pd->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 16, qp->hw_host_ctx_pa);
-	set_64bit_val(wqe, 40, qp->shadow_area_pa);
-
-	header = qp->qp_uk.qp_id |
-		 LS_64(I40IW_CQP_OP_DESTROY_QP, I40IW_CQPSQ_OPCODE) |
-		 LS_64(qp->qp_type, I40IW_CQPSQ_QP_QPTYPE) |
-		 LS_64(ignore_mw_bnd, I40IW_CQPSQ_QP_IGNOREMWBOUND) |
-		 LS_64(remove_hash_idx, I40IW_CQPSQ_QP_REMOVEHASHENTRY) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QP_DESTROY WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_qp_flush_wqes - flush qp's wqe
- * @qp: sc qp
- * @info: dlush information
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_qp_flush_wqes(
-				struct i40iw_sc_qp *qp,
-				struct i40iw_qp_flush_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	u64 temp = 0;
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-	bool flush_sq = false, flush_rq = false;
-
-	if (info->rq && !qp->flush_rq)
-		flush_rq = true;
-
-	if (info->sq && !qp->flush_sq)
-		flush_sq = true;
-
-	qp->flush_sq |= flush_sq;
-	qp->flush_rq |= flush_rq;
-	if (!flush_sq && !flush_rq)
-		return 0;
-
-	cqp = qp->pd->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	if (info->userflushcode) {
-		if (flush_rq) {
-			temp |= LS_64(info->rq_minor_code, I40IW_CQPSQ_FWQE_RQMNERR) |
-				LS_64(info->rq_major_code, I40IW_CQPSQ_FWQE_RQMJERR);
-		}
-		if (flush_sq) {
-			temp |= LS_64(info->sq_minor_code, I40IW_CQPSQ_FWQE_SQMNERR) |
-				LS_64(info->sq_major_code, I40IW_CQPSQ_FWQE_SQMJERR);
-		}
-	}
-	set_64bit_val(wqe, 16, temp);
-
-	temp = (info->generate_ae) ?
-		info->ae_code | LS_64(info->ae_source, I40IW_CQPSQ_FWQE_AESOURCE) : 0;
-
-	set_64bit_val(wqe, 8, temp);
-
-	header = qp->qp_uk.qp_id |
-		 LS_64(I40IW_CQP_OP_FLUSH_WQES, I40IW_CQPSQ_OPCODE) |
-		 LS_64(info->generate_ae, I40IW_CQPSQ_FWQE_GENERATE_AE) |
-		 LS_64(info->userflushcode, I40IW_CQPSQ_FWQE_USERFLCODE) |
-		 LS_64(flush_sq, I40IW_CQPSQ_FWQE_FLUSHSQ) |
-		 LS_64(flush_rq, I40IW_CQPSQ_FWQE_FLUSHRQ) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QP_FLUSH WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_gen_ae - generate AE, currently uses flush WQE CQP OP
- * @qp: sc qp
- * @info: gen ae information
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_gen_ae(
-				struct i40iw_sc_qp *qp,
-				struct i40iw_gen_ae_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	u64 temp;
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-
-	cqp = qp->pd->dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	temp = info->ae_code |
-	       LS_64(info->ae_source, I40IW_CQPSQ_FWQE_AESOURCE);
-
-	set_64bit_val(wqe, 8, temp);
-
-	header = qp->qp_uk.qp_id |
-		 LS_64(I40IW_CQP_OP_GEN_AE, I40IW_CQPSQ_OPCODE) |
-		 LS_64(1, I40IW_CQPSQ_FWQE_GENERATE_AE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "GEN_AE WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_qp_upload_context - upload qp's context
- * @dev: sc device struct
- * @info: upload context info ptr for return
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_qp_upload_context(
-					struct i40iw_sc_dev *dev,
-					struct i40iw_upload_context_info *info,
-					u64 scratch,
-					bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 16, info->buf_pa);
-
-	header = LS_64(info->qp_id, I40IW_CQPSQ_UCTX_QPID) |
-		 LS_64(I40IW_CQP_OP_UPLOAD_CONTEXT, I40IW_CQPSQ_OPCODE) |
-		 LS_64(info->qp_type, I40IW_CQPSQ_UCTX_QPTYPE) |
-		 LS_64(info->raw_format, I40IW_CQPSQ_UCTX_RAWFORMAT) |
-		 LS_64(info->freeze_qp, I40IW_CQPSQ_UCTX_FREEZEQP) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "QP_UPLOAD_CTX WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_qp_setctx - set qp's context
- * @qp: sc qp
- * @qp_ctx: context ptr
- * @info: ctx info
- */
-static enum i40iw_status_code i40iw_sc_qp_setctx(
-				struct i40iw_sc_qp *qp,
-				u64 *qp_ctx,
-				struct i40iw_qp_host_ctx_info *info)
-{
-	struct i40iwarp_offload_info *iw;
-	struct i40iw_tcp_offload_info *tcp;
-	struct i40iw_sc_vsi *vsi;
-	struct i40iw_sc_dev *dev;
-	u64 qw0, qw3, qw7 = 0;
-
-	iw = info->iwarp_info;
-	tcp = info->tcp_info;
-	vsi = qp->vsi;
-	dev = qp->dev;
-	if (info->add_to_qoslist) {
-		qp->user_pri = info->user_pri;
-		i40iw_qp_add_qos(qp);
-		i40iw_debug(qp->dev, I40IW_DEBUG_DCB, "%s qp[%d] UP[%d] qset[%d]\n",
-			    __func__, qp->qp_uk.qp_id, qp->user_pri, qp->qs_handle);
-	}
-	qw0 = LS_64(qp->qp_uk.rq_wqe_size, I40IWQPC_RQWQESIZE) |
-	      LS_64(info->err_rq_idx_valid, I40IWQPC_ERR_RQ_IDX_VALID) |
-	      LS_64(qp->rcv_tph_en, I40IWQPC_RCVTPHEN) |
-	      LS_64(qp->xmit_tph_en, I40IWQPC_XMITTPHEN) |
-	      LS_64(qp->rq_tph_en, I40IWQPC_RQTPHEN) |
-	      LS_64(qp->sq_tph_en, I40IWQPC_SQTPHEN);
-
-	set_64bit_val(qp_ctx, 8, qp->sq_pa);
-	set_64bit_val(qp_ctx, 16, qp->rq_pa);
-
-	qw3 = LS_64(qp->src_mac_addr_idx, I40IWQPC_SRCMACADDRIDX) |
-	      LS_64(qp->hw_rq_size, I40IWQPC_RQSIZE) |
-	      LS_64(qp->hw_sq_size, I40IWQPC_SQSIZE);
-
-	set_64bit_val(qp_ctx,
-		      128,
-		      LS_64(info->err_rq_idx, I40IWQPC_ERR_RQ_IDX));
-
-	set_64bit_val(qp_ctx,
-		      136,
-		      LS_64(info->send_cq_num, I40IWQPC_TXCQNUM) |
-		      LS_64(info->rcv_cq_num, I40IWQPC_RXCQNUM));
-
-	set_64bit_val(qp_ctx,
-		      168,
-		      LS_64(info->qp_compl_ctx, I40IWQPC_QPCOMPCTX));
-	set_64bit_val(qp_ctx,
-		      176,
-		      LS_64(qp->sq_tph_val, I40IWQPC_SQTPHVAL) |
-		      LS_64(qp->rq_tph_val, I40IWQPC_RQTPHVAL) |
-		      LS_64(qp->qs_handle, I40IWQPC_QSHANDLE) |
-		      LS_64(vsi->exception_lan_queue, I40IWQPC_EXCEPTION_LAN_QUEUE));
-
-	if (info->iwarp_info_valid) {
-		qw0 |= LS_64(iw->ddp_ver, I40IWQPC_DDP_VER) |
-		       LS_64(iw->rdmap_ver, I40IWQPC_RDMAP_VER);
-
-		qw7 |= LS_64(iw->pd_id, I40IWQPC_PDIDX);
-		set_64bit_val(qp_ctx,
-			      144,
-			      LS_64(qp->q2_pa, I40IWQPC_Q2ADDR) |
-			      LS_64(vsi->fcn_id, I40IWQPC_STAT_INDEX));
-		set_64bit_val(qp_ctx,
-			      152,
-			      LS_64(iw->last_byte_sent, I40IWQPC_LASTBYTESENT));
-
-		set_64bit_val(qp_ctx,
-			      160,
-			      LS_64(iw->ord_size, I40IWQPC_ORDSIZE) |
-			      LS_64(iw->ird_size, I40IWQPC_IRDSIZE) |
-			      LS_64(iw->wr_rdresp_en, I40IWQPC_WRRDRSPOK) |
-			      LS_64(iw->rd_enable, I40IWQPC_RDOK) |
-			      LS_64(iw->snd_mark_en, I40IWQPC_SNDMARKERS) |
-			      LS_64(iw->bind_en, I40IWQPC_BINDEN) |
-			      LS_64(iw->fast_reg_en, I40IWQPC_FASTREGEN) |
-			      LS_64(iw->priv_mode_en, I40IWQPC_PRIVEN) |
-			      LS_64((((vsi->stats_fcn_id_alloc) &&
-				      (dev->is_pf) && (vsi->fcn_id >= I40IW_FIRST_NON_PF_STAT)) ? 1 : 0),
-				    I40IWQPC_USESTATSINSTANCE) |
-			      LS_64(1, I40IWQPC_IWARPMODE) |
-			      LS_64(iw->rcv_mark_en, I40IWQPC_RCVMARKERS) |
-			      LS_64(iw->align_hdrs, I40IWQPC_ALIGNHDRS) |
-			      LS_64(iw->rcv_no_mpa_crc, I40IWQPC_RCVNOMPACRC) |
-			      LS_64(iw->rcv_mark_offset, I40IWQPC_RCVMARKOFFSET) |
-			      LS_64(iw->snd_mark_offset, I40IWQPC_SNDMARKOFFSET));
-	}
-	if (info->tcp_info_valid) {
-		qw0 |= LS_64(tcp->ipv4, I40IWQPC_IPV4) |
-		       LS_64(tcp->no_nagle, I40IWQPC_NONAGLE) |
-		       LS_64(tcp->insert_vlan_tag, I40IWQPC_INSERTVLANTAG) |
-		       LS_64(tcp->time_stamp, I40IWQPC_TIMESTAMP) |
-		       LS_64(tcp->cwnd_inc_limit, I40IWQPC_LIMIT) |
-		       LS_64(tcp->drop_ooo_seg, I40IWQPC_DROPOOOSEG) |
-		       LS_64(tcp->dup_ack_thresh, I40IWQPC_DUPACK_THRESH);
-
-		qw3 |= LS_64(tcp->ttl, I40IWQPC_TTL) |
-		       LS_64(tcp->src_mac_addr_idx, I40IWQPC_SRCMACADDRIDX) |
-		       LS_64(tcp->avoid_stretch_ack, I40IWQPC_AVOIDSTRETCHACK) |
-		       LS_64(tcp->tos, I40IWQPC_TOS) |
-		       LS_64(tcp->src_port, I40IWQPC_SRCPORTNUM) |
-		       LS_64(tcp->dst_port, I40IWQPC_DESTPORTNUM);
-
-		qp->src_mac_addr_idx = tcp->src_mac_addr_idx;
-		set_64bit_val(qp_ctx,
-			      32,
-			      LS_64(tcp->dest_ip_addr2, I40IWQPC_DESTIPADDR2) |
-			      LS_64(tcp->dest_ip_addr3, I40IWQPC_DESTIPADDR3));
-
-		set_64bit_val(qp_ctx,
-			      40,
-			      LS_64(tcp->dest_ip_addr0, I40IWQPC_DESTIPADDR0) |
-			      LS_64(tcp->dest_ip_addr1, I40IWQPC_DESTIPADDR1));
-
-		set_64bit_val(qp_ctx,
-			      48,
-			      LS_64(tcp->snd_mss, I40IWQPC_SNDMSS) |
-				LS_64(tcp->vlan_tag, I40IWQPC_VLANTAG) |
-				LS_64(tcp->arp_idx, I40IWQPC_ARPIDX));
-
-		qw7 |= LS_64(tcp->flow_label, I40IWQPC_FLOWLABEL) |
-		       LS_64(tcp->wscale, I40IWQPC_WSCALE) |
-		       LS_64(tcp->ignore_tcp_opt, I40IWQPC_IGNORE_TCP_OPT) |
-		       LS_64(tcp->ignore_tcp_uns_opt, I40IWQPC_IGNORE_TCP_UNS_OPT) |
-		       LS_64(tcp->tcp_state, I40IWQPC_TCPSTATE) |
-		       LS_64(tcp->rcv_wscale, I40IWQPC_RCVSCALE) |
-		       LS_64(tcp->snd_wscale, I40IWQPC_SNDSCALE);
-
-		set_64bit_val(qp_ctx,
-			      72,
-			      LS_64(tcp->time_stamp_recent, I40IWQPC_TIMESTAMP_RECENT) |
-			      LS_64(tcp->time_stamp_age, I40IWQPC_TIMESTAMP_AGE));
-		set_64bit_val(qp_ctx,
-			      80,
-			      LS_64(tcp->snd_nxt, I40IWQPC_SNDNXT) |
-			      LS_64(tcp->snd_wnd, I40IWQPC_SNDWND));
-
-		set_64bit_val(qp_ctx,
-			      88,
-			      LS_64(tcp->rcv_nxt, I40IWQPC_RCVNXT) |
-			      LS_64(tcp->rcv_wnd, I40IWQPC_RCVWND));
-		set_64bit_val(qp_ctx,
-			      96,
-			      LS_64(tcp->snd_max, I40IWQPC_SNDMAX) |
-			      LS_64(tcp->snd_una, I40IWQPC_SNDUNA));
-		set_64bit_val(qp_ctx,
-			      104,
-			      LS_64(tcp->srtt, I40IWQPC_SRTT) |
-			      LS_64(tcp->rtt_var, I40IWQPC_RTTVAR));
-		set_64bit_val(qp_ctx,
-			      112,
-			      LS_64(tcp->ss_thresh, I40IWQPC_SSTHRESH) |
-			      LS_64(tcp->cwnd, I40IWQPC_CWND));
-		set_64bit_val(qp_ctx,
-			      120,
-			      LS_64(tcp->snd_wl1, I40IWQPC_SNDWL1) |
-			      LS_64(tcp->snd_wl2, I40IWQPC_SNDWL2));
-		set_64bit_val(qp_ctx,
-			      128,
-			      LS_64(tcp->max_snd_window, I40IWQPC_MAXSNDWND) |
-			      LS_64(tcp->rexmit_thresh, I40IWQPC_REXMIT_THRESH));
-		set_64bit_val(qp_ctx,
-			      184,
-			      LS_64(tcp->local_ipaddr3, I40IWQPC_LOCAL_IPADDR3) |
-			      LS_64(tcp->local_ipaddr2, I40IWQPC_LOCAL_IPADDR2));
-		set_64bit_val(qp_ctx,
-			      192,
-			      LS_64(tcp->local_ipaddr1, I40IWQPC_LOCAL_IPADDR1) |
-			      LS_64(tcp->local_ipaddr0, I40IWQPC_LOCAL_IPADDR0));
-	}
-
-	set_64bit_val(qp_ctx, 0, qw0);
-	set_64bit_val(qp_ctx, 24, qw3);
-	set_64bit_val(qp_ctx, 56, qw7);
-
-	i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "QP_HOST)CTX WQE",
-			qp_ctx, I40IW_QP_CTX_SIZE);
-	return 0;
-}
-
-/**
- * i40iw_sc_alloc_stag - mr stag alloc
- * @dev: sc device struct
- * @info: stag info
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_alloc_stag(
-				struct i40iw_sc_dev *dev,
-				struct i40iw_allocate_stag_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-	enum i40iw_page_size page_size;
-
-	page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K;
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe,
-		      8,
-		      LS_64(info->pd_id, I40IW_CQPSQ_STAG_PDID) |
-		      LS_64(info->total_len, I40IW_CQPSQ_STAG_STAGLEN));
-	set_64bit_val(wqe,
-		      16,
-		      LS_64(info->stag_idx, I40IW_CQPSQ_STAG_IDX));
-	set_64bit_val(wqe,
-		      40,
-		      LS_64(info->hmc_fcn_index, I40IW_CQPSQ_STAG_HMCFNIDX));
-
-	header = LS_64(I40IW_CQP_OP_ALLOC_STAG, I40IW_CQPSQ_OPCODE) |
-		 LS_64(1, I40IW_CQPSQ_STAG_MR) |
-		 LS_64(info->access_rights, I40IW_CQPSQ_STAG_ARIGHTS) |
-		 LS_64(info->chunk_size, I40IW_CQPSQ_STAG_LPBLSIZE) |
-		 LS_64(page_size, I40IW_CQPSQ_STAG_HPAGESIZE) |
-		 LS_64(info->remote_access, I40IW_CQPSQ_STAG_REMACCENABLED) |
-		 LS_64(info->use_hmc_fcn_index, I40IW_CQPSQ_STAG_USEHMCFNIDX) |
-		 LS_64(info->use_pf_rid, I40IW_CQPSQ_STAG_USEPFRID) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "ALLOC_STAG WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_mr_reg_non_shared - non-shared mr registration
- * @dev: sc device struct
- * @info: mr info
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_mr_reg_non_shared(
-				struct i40iw_sc_dev *dev,
-				struct i40iw_reg_ns_stag_info *info,
-				u64 scratch,
-				bool post_sq)
-{
-	u64 *wqe;
-	u64 temp;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-	u32 pble_obj_cnt;
-	bool remote_access;
-	u8 addr_type;
-	enum i40iw_page_size page_size;
-
-	page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K;
-	if (info->access_rights & (I40IW_ACCESS_FLAGS_REMOTEREAD_ONLY |
-				   I40IW_ACCESS_FLAGS_REMOTEWRITE_ONLY))
-		remote_access = true;
-	else
-		remote_access = false;
-
-	pble_obj_cnt = dev->hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt;
-
-	if (info->chunk_size && (info->first_pm_pbl_index >= pble_obj_cnt))
-		return I40IW_ERR_INVALID_PBLE_INDEX;
-
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	temp = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? (uintptr_t)info->va : info->fbo;
-	set_64bit_val(wqe, 0, temp);
-
-	set_64bit_val(wqe,
-		      8,
-		      LS_64(info->total_len, I40IW_CQPSQ_STAG_STAGLEN) |
-		      LS_64(info->pd_id, I40IW_CQPSQ_STAG_PDID));
-
-	set_64bit_val(wqe,
-		      16,
-		      LS_64(info->stag_key, I40IW_CQPSQ_STAG_KEY) |
-		      LS_64(info->stag_idx, I40IW_CQPSQ_STAG_IDX));
-	if (!info->chunk_size) {
-		set_64bit_val(wqe, 32, info->reg_addr_pa);
-		set_64bit_val(wqe, 48, 0);
-	} else {
-		set_64bit_val(wqe, 32, 0);
-		set_64bit_val(wqe, 48, info->first_pm_pbl_index);
-	}
-	set_64bit_val(wqe, 40, info->hmc_fcn_index);
-	set_64bit_val(wqe, 56, 0);
-
-	addr_type = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? 1 : 0;
-	header = LS_64(I40IW_CQP_OP_REG_MR, I40IW_CQPSQ_OPCODE) |
-		 LS_64(1, I40IW_CQPSQ_STAG_MR) |
-		 LS_64(info->chunk_size, I40IW_CQPSQ_STAG_LPBLSIZE) |
-		 LS_64(page_size, I40IW_CQPSQ_STAG_HPAGESIZE) |
-		 LS_64(info->access_rights, I40IW_CQPSQ_STAG_ARIGHTS) |
-		 LS_64(remote_access, I40IW_CQPSQ_STAG_REMACCENABLED) |
-		 LS_64(addr_type, I40IW_CQPSQ_STAG_VABASEDTO) |
-		 LS_64(info->use_hmc_fcn_index, I40IW_CQPSQ_STAG_USEHMCFNIDX) |
-		 LS_64(info->use_pf_rid, I40IW_CQPSQ_STAG_USEPFRID) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "MR_REG_NS WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_mr_reg_shared - registered shared memory region
- * @dev: sc device struct
- * @info: info for shared memory registeration
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_mr_reg_shared(
-					struct i40iw_sc_dev *dev,
-					struct i40iw_register_shared_stag *info,
-					u64 scratch,
-					bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 temp, va64, fbo, header;
-	u32 va32;
-	bool remote_access;
-	u8 addr_type;
-
-	if (info->access_rights & (I40IW_ACCESS_FLAGS_REMOTEREAD_ONLY |
-				   I40IW_ACCESS_FLAGS_REMOTEWRITE_ONLY))
-		remote_access = true;
-	else
-		remote_access = false;
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	va64 = (uintptr_t)(info->va);
-	va32 = (u32)(va64 & 0x00000000FFFFFFFF);
-	fbo = (u64)(va32 & (4096 - 1));
-
-	set_64bit_val(wqe,
-		      0,
-		      (info->addr_type == I40IW_ADDR_TYPE_VA_BASED ? (uintptr_t)info->va : fbo));
-
-	set_64bit_val(wqe,
-		      8,
-		      LS_64(info->pd_id, I40IW_CQPSQ_STAG_PDID));
-	temp = LS_64(info->new_stag_key, I40IW_CQPSQ_STAG_KEY) |
-	       LS_64(info->new_stag_idx, I40IW_CQPSQ_STAG_IDX) |
-	       LS_64(info->parent_stag_idx, I40IW_CQPSQ_STAG_PARENTSTAGIDX);
-	set_64bit_val(wqe, 16, temp);
-
-	addr_type = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? 1 : 0;
-	header = LS_64(I40IW_CQP_OP_REG_SMR, I40IW_CQPSQ_OPCODE) |
-		 LS_64(1, I40IW_CQPSQ_STAG_MR) |
-		 LS_64(info->access_rights, I40IW_CQPSQ_STAG_ARIGHTS) |
-		 LS_64(remote_access, I40IW_CQPSQ_STAG_REMACCENABLED) |
-		 LS_64(addr_type, I40IW_CQPSQ_STAG_VABASEDTO) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "MR_REG_SHARED WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_dealloc_stag - deallocate stag
- * @dev: sc device struct
- * @info: dealloc stag info
- * @scratch: u64 saved to be used during cqp completion
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_dealloc_stag(
-					struct i40iw_sc_dev *dev,
-					struct i40iw_dealloc_stag_info *info,
-					u64 scratch,
-					bool post_sq)
-{
-	u64 header;
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe,
-		      8,
-		      LS_64(info->pd_id, I40IW_CQPSQ_STAG_PDID));
-	set_64bit_val(wqe,
-		      16,
-		      LS_64(info->stag_idx, I40IW_CQPSQ_STAG_IDX));
-
-	header = LS_64(I40IW_CQP_OP_DEALLOC_STAG, I40IW_CQPSQ_OPCODE) |
-		 LS_64(info->mr, I40IW_CQPSQ_STAG_MR) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "DEALLOC_STAG WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_query_stag - query hardware for stag
- * @dev: sc device struct
- * @scratch: u64 saved to be used during cqp completion
- * @stag_index: stag index for query
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_query_stag(struct i40iw_sc_dev *dev,
-						  u64 scratch,
-						  u32 stag_index,
-						  bool post_sq)
-{
-	u64 header;
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe,
-		      16,
-		      LS_64(stag_index, I40IW_CQPSQ_QUERYSTAG_IDX));
-
-	header = LS_64(I40IW_CQP_OP_QUERY_STAG, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "QUERY_STAG WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_mw_alloc - mw allocate
- * @dev: sc device struct
- * @scratch: u64 saved to be used during cqp completion
- * @mw_stag_index:stag index
- * @pd_id: pd is for this mw
- * @post_sq: flag for cqp db to ring
- */
-static enum i40iw_status_code i40iw_sc_mw_alloc(
-					struct i40iw_sc_dev *dev,
-					u64 scratch,
-					u32 mw_stag_index,
-					u16 pd_id,
-					bool post_sq)
-{
-	u64 header;
-	struct i40iw_sc_cqp *cqp;
-	u64 *wqe;
-
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe, 8, LS_64(pd_id, I40IW_CQPSQ_STAG_PDID));
-	set_64bit_val(wqe,
-		      16,
-		      LS_64(mw_stag_index, I40IW_CQPSQ_STAG_IDX));
-
-	header = LS_64(I40IW_CQP_OP_ALLOC_STAG, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_WQE, "MW_ALLOC WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_mr_fast_register - Posts RDMA fast register mr WR to iwarp qp
- * @qp: sc qp struct
- * @info: fast mr info
- * @post_sq: flag for cqp db to ring
- */
-enum i40iw_status_code i40iw_sc_mr_fast_register(
-				struct i40iw_sc_qp *qp,
-				struct i40iw_fast_reg_stag_info *info,
-				bool post_sq)
-{
-	u64 temp, header;
-	u64 *wqe;
-	u32 wqe_idx;
-	enum i40iw_page_size page_size;
-
-	page_size = (info->page_size == 0x200000) ? I40IW_PAGE_SIZE_2M : I40IW_PAGE_SIZE_4K;
-	wqe = i40iw_qp_get_next_send_wqe(&qp->qp_uk, &wqe_idx, I40IW_QP_WQE_MIN_SIZE,
-					 0, info->wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-	i40iw_debug(qp->dev, I40IW_DEBUG_MR, "%s: wr_id[%llxh] wqe_idx[%04d] location[%p]\n",
-		    __func__, info->wr_id, wqe_idx,
-		    &qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid);
-	temp = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? (uintptr_t)info->va : info->fbo;
-	set_64bit_val(wqe, 0, temp);
-
-	temp = RS_64(info->first_pm_pbl_index >> 16, I40IWQPSQ_FIRSTPMPBLIDXHI);
-	set_64bit_val(wqe,
-		      8,
-		      LS_64(temp, I40IWQPSQ_FIRSTPMPBLIDXHI) |
-		      LS_64(info->reg_addr_pa >> I40IWQPSQ_PBLADDR_SHIFT, I40IWQPSQ_PBLADDR));
-
-	set_64bit_val(wqe,
-		      16,
-		      info->total_len |
-		      LS_64(info->first_pm_pbl_index, I40IWQPSQ_FIRSTPMPBLIDXLO));
-
-	header = LS_64(info->stag_key, I40IWQPSQ_STAGKEY) |
-		 LS_64(info->stag_idx, I40IWQPSQ_STAGINDEX) |
-		 LS_64(I40IWQP_OP_FAST_REGISTER, I40IWQPSQ_OPCODE) |
-		 LS_64(info->chunk_size, I40IWQPSQ_LPBLSIZE) |
-		 LS_64(page_size, I40IWQPSQ_HPAGESIZE) |
-		 LS_64(info->access_rights, I40IWQPSQ_STAGRIGHTS) |
-		 LS_64(info->addr_type, I40IWQPSQ_VABASEDTO) |
-		 LS_64(info->read_fence, I40IWQPSQ_READFENCE) |
-		 LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) |
-		 LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
-		 LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "FAST_REG WQE",
-			wqe, I40IW_QP_WQE_MIN_SIZE);
-
-	if (post_sq)
-		i40iw_qp_post_wr(&qp->qp_uk);
-	return 0;
-}
-
-/**
- * i40iw_sc_send_lsmm - send last streaming mode message
- * @qp: sc qp struct
- * @lsmm_buf: buffer with lsmm message
- * @size: size of lsmm buffer
- * @stag: stag of lsmm buffer
- */
-static void i40iw_sc_send_lsmm(struct i40iw_sc_qp *qp,
-			       void *lsmm_buf,
-			       u32 size,
-			       i40iw_stag stag)
-{
-	u64 *wqe;
-	u64 header;
-	struct i40iw_qp_uk *qp_uk;
-
-	qp_uk = &qp->qp_uk;
-	wqe = qp_uk->sq_base->elem;
-
-	set_64bit_val(wqe, 0, (uintptr_t)lsmm_buf);
-
-	set_64bit_val(wqe, 8, (size | LS_64(stag, I40IWQPSQ_FRAG_STAG)));
-
-	set_64bit_val(wqe, 16, 0);
-
-	header = LS_64(I40IWQP_OP_RDMA_SEND, I40IWQPSQ_OPCODE) |
-		 LS_64(1, I40IWQPSQ_STREAMMODE) |
-		 LS_64(1, I40IWQPSQ_WAITFORRCVPDU) |
-		 LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(qp->dev, I40IW_DEBUG_QP, "SEND_LSMM WQE",
-			wqe, I40IW_QP_WQE_MIN_SIZE);
-}
-
-/**
- * i40iw_sc_send_lsmm_nostag - for privilege qp
- * @qp: sc qp struct
- * @lsmm_buf: buffer with lsmm message
- * @size: size of lsmm buffer
- */
-static void i40iw_sc_send_lsmm_nostag(struct i40iw_sc_qp *qp,
-				      void *lsmm_buf,
-				      u32 size)
-{
-	u64 *wqe;
-	u64 header;
-	struct i40iw_qp_uk *qp_uk;
-
-	qp_uk = &qp->qp_uk;
-	wqe = qp_uk->sq_base->elem;
-
-	set_64bit_val(wqe, 0, (uintptr_t)lsmm_buf);
-
-	set_64bit_val(wqe, 8, size);
-
-	set_64bit_val(wqe, 16, 0);
-
-	header = LS_64(I40IWQP_OP_RDMA_SEND, I40IWQPSQ_OPCODE) |
-		 LS_64(1, I40IWQPSQ_STREAMMODE) |
-		 LS_64(1, I40IWQPSQ_WAITFORRCVPDU) |
-		 LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "SEND_LSMM_NOSTAG WQE",
-			wqe, I40IW_QP_WQE_MIN_SIZE);
-}
-
-/**
- * i40iw_sc_send_rtt - send last read0 or write0
- * @qp: sc qp struct
- * @read: Do read0 or write0
- */
-static void i40iw_sc_send_rtt(struct i40iw_sc_qp *qp, bool read)
-{
-	u64 *wqe;
-	u64 header;
-	struct i40iw_qp_uk *qp_uk;
-
-	qp_uk = &qp->qp_uk;
-	wqe = qp_uk->sq_base->elem;
-
-	set_64bit_val(wqe, 0, 0);
-	set_64bit_val(wqe, 8, 0);
-	set_64bit_val(wqe, 16, 0);
-	if (read) {
-		header = LS_64(0x1234, I40IWQPSQ_REMSTAG) |
-			 LS_64(I40IWQP_OP_RDMA_READ, I40IWQPSQ_OPCODE) |
-			 LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
-		set_64bit_val(wqe, 8, ((u64)0xabcd << 32));
-	} else {
-		header = LS_64(I40IWQP_OP_RDMA_WRITE, I40IWQPSQ_OPCODE) |
-			 LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
-	}
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "RTR WQE",
-			wqe, I40IW_QP_WQE_MIN_SIZE);
-}
-
-/**
- * i40iw_sc_post_wqe0 - send wqe with opcode
- * @qp: sc qp struct
- * @opcode: opcode to use for wqe0
- */
-static enum i40iw_status_code i40iw_sc_post_wqe0(struct i40iw_sc_qp *qp, u8 opcode)
-{
-	u64 *wqe;
-	u64 header;
-	struct i40iw_qp_uk *qp_uk;
-
-	qp_uk = &qp->qp_uk;
-	wqe = qp_uk->sq_base->elem;
-
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-	switch (opcode) {
-	case I40IWQP_OP_NOP:
-		set_64bit_val(wqe, 0, 0);
-		set_64bit_val(wqe, 8, 0);
-		set_64bit_val(wqe, 16, 0);
-		header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) |
-			 LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
-
-		i40iw_insert_wqe_hdr(wqe, header);
-		break;
-	case I40IWQP_OP_RDMA_SEND:
-		set_64bit_val(wqe, 0, 0);
-		set_64bit_val(wqe, 8, 0);
-		set_64bit_val(wqe, 16, 0);
-		header = LS_64(I40IWQP_OP_RDMA_SEND, I40IWQPSQ_OPCODE) |
-			 LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID) |
-			 LS_64(1, I40IWQPSQ_STREAMMODE) |
-			 LS_64(1, I40IWQPSQ_WAITFORRCVPDU);
-
-		i40iw_insert_wqe_hdr(wqe, header);
-		break;
-	default:
-		i40iw_debug(qp->dev, I40IW_DEBUG_QP, "%s: Invalid WQE zero opcode\n",
-			    __func__);
-		break;
-	}
-	return 0;
-}
-
-/**
- * i40iw_sc_init_iw_hmc() - queries fpm values using cqp and populates hmc_info
- * @dev : ptr to i40iw_dev struct
- * @hmc_fn_id: hmc function id
- */
-enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_id)
-{
-	struct i40iw_hmc_info *hmc_info;
-	struct i40iw_dma_mem query_fpm_mem;
-	struct i40iw_virt_mem virt_mem;
-	struct i40iw_vfdev *vf_dev = NULL;
-	u32 mem_size;
-	enum i40iw_status_code ret_code = 0;
-	bool poll_registers = true;
-	u16 iw_vf_idx;
-	u8 wait_type;
-
-	if (hmc_fn_id >= I40IW_MAX_VF_FPM_ID ||
-	    (dev->hmc_fn_id != hmc_fn_id && hmc_fn_id < I40IW_FIRST_VF_FPM_ID))
-		return I40IW_ERR_INVALID_HMCFN_ID;
-
-	i40iw_debug(dev, I40IW_DEBUG_HMC, "hmc_fn_id %u, dev->hmc_fn_id %u\n", hmc_fn_id,
-		    dev->hmc_fn_id);
-	if (hmc_fn_id == dev->hmc_fn_id) {
-		hmc_info = dev->hmc_info;
-		query_fpm_mem.pa = dev->fpm_query_buf_pa;
-		query_fpm_mem.va = dev->fpm_query_buf;
-	} else {
-		vf_dev = i40iw_vfdev_from_fpm(dev, hmc_fn_id);
-		if (!vf_dev)
-			return I40IW_ERR_INVALID_VF_ID;
-
-		hmc_info = &vf_dev->hmc_info;
-		iw_vf_idx = vf_dev->iw_vf_idx;
-		i40iw_debug(dev, I40IW_DEBUG_HMC, "vf_dev %p, hmc_info %p, hmc_obj %p\n", vf_dev,
-			    hmc_info, hmc_info->hmc_obj);
-		if (!vf_dev->fpm_query_buf) {
-			if (!dev->vf_fpm_query_buf[iw_vf_idx].va) {
-				ret_code = i40iw_alloc_query_fpm_buf(dev,
-								     &dev->vf_fpm_query_buf[iw_vf_idx]);
-				if (ret_code)
-					return ret_code;
-			}
-			vf_dev->fpm_query_buf = dev->vf_fpm_query_buf[iw_vf_idx].va;
-			vf_dev->fpm_query_buf_pa = dev->vf_fpm_query_buf[iw_vf_idx].pa;
-		}
-		query_fpm_mem.pa = vf_dev->fpm_query_buf_pa;
-		query_fpm_mem.va = vf_dev->fpm_query_buf;
-		/**
-		 * It is HARDWARE specific:
-		 * this call is done by PF for VF and
-		 * i40iw_sc_query_fpm_values needs ccq poll
-		 * because PF ccq is already created.
-		 */
-		poll_registers = false;
-	}
-
-	hmc_info->hmc_fn_id = hmc_fn_id;
-
-	if (hmc_fn_id != dev->hmc_fn_id) {
-		ret_code =
-			i40iw_cqp_query_fpm_values_cmd(dev, &query_fpm_mem, hmc_fn_id);
-	} else {
-		wait_type = poll_registers ? (u8)I40IW_CQP_WAIT_POLL_REGS :
-			    (u8)I40IW_CQP_WAIT_POLL_CQ;
-
-		ret_code = i40iw_sc_query_fpm_values(
-					dev->cqp,
-					0,
-					hmc_info->hmc_fn_id,
-					&query_fpm_mem,
-					true,
-					wait_type);
-	}
-	if (ret_code)
-		return ret_code;
-
-	/* parse the fpm_query_buf and fill hmc obj info */
-	ret_code =
-		i40iw_sc_parse_fpm_query_buf((u64 *)query_fpm_mem.va,
-					     hmc_info,
-					     &dev->hmc_fpm_misc);
-	if (ret_code)
-		return ret_code;
-	i40iw_debug_buf(dev, I40IW_DEBUG_HMC, "QUERY FPM BUFFER",
-			query_fpm_mem.va, I40IW_QUERY_FPM_BUF_SIZE);
-
-	if (hmc_fn_id != dev->hmc_fn_id) {
-		i40iw_cqp_commit_fpm_values_cmd(dev, &query_fpm_mem, hmc_fn_id);
-
-		/* parse the fpm_commit_buf and fill hmc obj info */
-		i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj, &hmc_info->sd_table.sd_cnt);
-		mem_size = sizeof(struct i40iw_hmc_sd_entry) *
-			   (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index);
-		ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size);
-		if (ret_code)
-			return ret_code;
-		hmc_info->sd_table.sd_entry = virt_mem.va;
-	}
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_configure_iw_fpm() - commits hmc obj cnt values using cqp command and
- * populates fpm base address in hmc_info
- * @dev : ptr to i40iw_dev struct
- * @hmc_fn_id: hmc function id
- */
-static enum i40iw_status_code i40iw_sc_configure_iw_fpm(struct i40iw_sc_dev *dev,
-							u8 hmc_fn_id)
-{
-	struct i40iw_hmc_info *hmc_info;
-	struct i40iw_hmc_obj_info *obj_info;
-	u64 *buf;
-	struct i40iw_dma_mem commit_fpm_mem;
-	u32 i, j;
-	enum i40iw_status_code ret_code = 0;
-	bool poll_registers = true;
-	u8 wait_type;
-
-	if (hmc_fn_id >= I40IW_MAX_VF_FPM_ID ||
-	    (dev->hmc_fn_id != hmc_fn_id && hmc_fn_id < I40IW_FIRST_VF_FPM_ID))
-		return I40IW_ERR_INVALID_HMCFN_ID;
-
-	if (hmc_fn_id == dev->hmc_fn_id) {
-		hmc_info = dev->hmc_info;
-	} else {
-		hmc_info = i40iw_vf_hmcinfo_from_fpm(dev, hmc_fn_id);
-		poll_registers = false;
-	}
-	if (!hmc_info)
-		return I40IW_ERR_BAD_PTR;
-
-	obj_info = hmc_info->hmc_obj;
-	buf = dev->fpm_commit_buf;
-
-	/* copy cnt values in commit buf */
-	for (i = I40IW_HMC_IW_QP, j = 0; i <= I40IW_HMC_IW_PBLE;
-	     i++, j += 8)
-		set_64bit_val(buf, j, (u64)obj_info[i].cnt);
-
-	set_64bit_val(buf, 40, 0);   /* APBVT rsvd */
-
-	commit_fpm_mem.pa = dev->fpm_commit_buf_pa;
-	commit_fpm_mem.va = dev->fpm_commit_buf;
-	wait_type = poll_registers ? (u8)I40IW_CQP_WAIT_POLL_REGS :
-			(u8)I40IW_CQP_WAIT_POLL_CQ;
-	ret_code = i40iw_sc_commit_fpm_values(
-					dev->cqp,
-					0,
-					hmc_info->hmc_fn_id,
-					&commit_fpm_mem,
-					true,
-					wait_type);
-
-	/* parse the fpm_commit_buf and fill hmc obj info */
-	if (!ret_code)
-		ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf,
-							 hmc_info->hmc_obj,
-							 &hmc_info->sd_table.sd_cnt);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_HMC, "COMMIT FPM BUFFER",
-			commit_fpm_mem.va, I40IW_COMMIT_FPM_BUF_SIZE);
-
-	return ret_code;
-}
-
-/**
- * cqp_sds_wqe_fill - fill cqp wqe doe sd
- * @cqp: struct for cqp hw
- * @info: sd info for wqe
- * @scratch: u64 saved to be used during cqp completion
- */
-static enum i40iw_status_code cqp_sds_wqe_fill(struct i40iw_sc_cqp *cqp,
-					       struct i40iw_update_sds_info *info,
-					       u64 scratch)
-{
-	u64 data;
-	u64 header;
-	u64 *wqe;
-	int mem_entries, wqe_entries;
-	struct i40iw_dma_mem *sdbuf = &cqp->sdbuf;
-	u64 offset;
-	u32 wqe_idx;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe_idx(cqp, scratch, &wqe_idx);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	I40IW_CQP_INIT_WQE(wqe);
-	wqe_entries = (info->cnt > 3) ? 3 : info->cnt;
-	mem_entries = info->cnt - wqe_entries;
-
-	header = LS_64(I40IW_CQP_OP_UPDATE_PE_SDS, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID) |
-		 LS_64(mem_entries, I40IW_CQPSQ_UPESD_ENTRY_COUNT);
-
-	if (mem_entries) {
-		offset = wqe_idx * I40IW_UPDATE_SD_BUF_SIZE;
-		memcpy((char *)sdbuf->va + offset, &info->entry[3],
-		       mem_entries << 4);
-		data = (u64)sdbuf->pa + offset;
-	} else {
-		data = 0;
-	}
-	data |= LS_64(info->hmc_fn_id, I40IW_CQPSQ_UPESD_HMCFNID);
-
-	set_64bit_val(wqe, 16, data);
-
-	switch (wqe_entries) {
-	case 3:
-		set_64bit_val(wqe, 48,
-			      (LS_64(info->entry[2].cmd, I40IW_CQPSQ_UPESD_SDCMD) |
-					LS_64(1, I40IW_CQPSQ_UPESD_ENTRY_VALID)));
-
-		set_64bit_val(wqe, 56, info->entry[2].data);
-		fallthrough;
-	case 2:
-		set_64bit_val(wqe, 32,
-			      (LS_64(info->entry[1].cmd, I40IW_CQPSQ_UPESD_SDCMD) |
-					LS_64(1, I40IW_CQPSQ_UPESD_ENTRY_VALID)));
-
-		set_64bit_val(wqe, 40, info->entry[1].data);
-		fallthrough;
-	case 1:
-		set_64bit_val(wqe, 0,
-			      LS_64(info->entry[0].cmd, I40IW_CQPSQ_UPESD_SDCMD));
-
-		set_64bit_val(wqe, 8, info->entry[0].data);
-		break;
-	default:
-		break;
-	}
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "UPDATE_PE_SDS WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-	return 0;
-}
-
-/**
- * i40iw_update_pe_sds - cqp wqe for sd
- * @dev: ptr to i40iw_dev struct
- * @info: sd info for sd's
- * @scratch: u64 saved to be used during cqp completion
- */
-static enum i40iw_status_code i40iw_update_pe_sds(struct i40iw_sc_dev *dev,
-						  struct i40iw_update_sds_info *info,
-						  u64 scratch)
-{
-	struct i40iw_sc_cqp *cqp = dev->cqp;
-	enum i40iw_status_code ret_code;
-
-	ret_code = cqp_sds_wqe_fill(cqp, info, scratch);
-	if (!ret_code)
-		i40iw_sc_cqp_post_sq(cqp);
-
-	return ret_code;
-}
-
-/**
- * i40iw_update_sds_noccq - update sd before ccq created
- * @dev: sc device struct
- * @info: sd info for sd's
- */
-enum i40iw_status_code i40iw_update_sds_noccq(struct i40iw_sc_dev *dev,
-					      struct i40iw_update_sds_info *info)
-{
-	u32 error, val, tail;
-	struct i40iw_sc_cqp *cqp = dev->cqp;
-	enum i40iw_status_code ret_code;
-
-	ret_code = cqp_sds_wqe_fill(cqp, info, 0);
-	if (ret_code)
-		return ret_code;
-	i40iw_get_cqp_reg_info(cqp, &val, &tail, &error);
-	if (error)
-		return I40IW_ERR_CQP_COMPL_ERROR;
-
-	i40iw_sc_cqp_post_sq(cqp);
-	ret_code = i40iw_cqp_poll_registers(cqp, tail, I40IW_DONE_COUNT);
-
-	return ret_code;
-}
-
-/**
- * i40iw_sc_suspend_qp - suspend qp for param change
- * @cqp: struct for cqp hw
- * @qp: sc qp struct
- * @scratch: u64 saved to be used during cqp completion
- */
-enum i40iw_status_code i40iw_sc_suspend_qp(struct i40iw_sc_cqp *cqp,
-					   struct i40iw_sc_qp *qp,
-					   u64 scratch)
-{
-	u64 header;
-	u64 *wqe;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	header = LS_64(qp->qp_uk.qp_id, I40IW_CQPSQ_SUSPENDQP_QPID) |
-		 LS_64(I40IW_CQP_OP_SUSPEND_QP, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "SUSPEND_QP WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_resume_qp - resume qp after suspend
- * @cqp: struct for cqp hw
- * @qp: sc qp struct
- * @scratch: u64 saved to be used during cqp completion
- */
-enum i40iw_status_code i40iw_sc_resume_qp(struct i40iw_sc_cqp *cqp,
-					  struct i40iw_sc_qp *qp,
-					  u64 scratch)
-{
-	u64 header;
-	u64 *wqe;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe,
-		      16,
-			LS_64(qp->qs_handle, I40IW_CQPSQ_RESUMEQP_QSHANDLE));
-
-	header = LS_64(qp->qp_uk.qp_id, I40IW_CQPSQ_RESUMEQP_QPID) |
-		 LS_64(I40IW_CQP_OP_RESUME_QP, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "RESUME_QP WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-/**
- * i40iw_sc_static_hmc_pages_allocated - cqp wqe to allocate hmc pages
- * @cqp: struct for cqp hw
- * @scratch: u64 saved to be used during cqp completion
- * @hmc_fn_id: hmc function id
- * @post_sq: flag for cqp db to ring
- * @poll_registers: flag to poll register for cqp completion
- */
-enum i40iw_status_code i40iw_sc_static_hmc_pages_allocated(
-					struct i40iw_sc_cqp *cqp,
-					u64 scratch,
-					u8 hmc_fn_id,
-					bool post_sq,
-					bool poll_registers)
-{
-	u64 header;
-	u64 *wqe;
-	u32 tail, val, error;
-	enum i40iw_status_code ret_code = 0;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-	set_64bit_val(wqe,
-		      16,
-		      LS_64(hmc_fn_id, I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID));
-
-	header = LS_64(I40IW_CQP_OP_SHMC_PAGES_ALLOCATED, I40IW_CQPSQ_OPCODE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "SHMC_PAGES_ALLOCATED WQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-	i40iw_get_cqp_reg_info(cqp, &val, &tail, &error);
-	if (error) {
-		ret_code = I40IW_ERR_CQP_COMPL_ERROR;
-		return ret_code;
-	}
-	if (post_sq) {
-		i40iw_sc_cqp_post_sq(cqp);
-		if (poll_registers)
-			/* check for cqp sq tail update */
-			ret_code = i40iw_cqp_poll_registers(cqp, tail, 1000);
-		else
-			ret_code = i40iw_sc_poll_for_cqp_op_done(cqp,
-								 I40IW_CQP_OP_SHMC_PAGES_ALLOCATED,
-								 NULL);
-	}
-
-	return ret_code;
-}
-
-/**
- * i40iw_ring_full - check if cqp ring is full
- * @cqp: struct for cqp hw
- */
-static bool i40iw_ring_full(struct i40iw_sc_cqp *cqp)
-{
-	return I40IW_RING_FULL_ERR(cqp->sq_ring);
-}
-
-/**
- * i40iw_est_sd - returns approximate number of SDs for HMC
- * @dev: sc device struct
- * @hmc_info: hmc structure, size and count for HMC objects
- */
-static u64 i40iw_est_sd(struct i40iw_sc_dev *dev, struct i40iw_hmc_info *hmc_info)
-{
-	int i;
-	u64 size = 0;
-	u64 sd;
-
-	for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_PBLE; i++)
-		size += hmc_info->hmc_obj[i].cnt * hmc_info->hmc_obj[i].size;
-
-	if (dev->is_pf)
-		size += hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size;
-
-	if (size & 0x1FFFFF)
-		sd = (size >> 21) + 1; /* add 1 for remainder */
-	else
-		sd = size >> 21;
-
-	if (!dev->is_pf) {
-		/* 2MB alignment for VF PBLE HMC */
-		size = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size;
-		if (size & 0x1FFFFF)
-			sd += (size >> 21) + 1; /* add 1 for remainder */
-		else
-			sd += size >> 21;
-	}
-
-	return sd;
-}
-
-/**
- * i40iw_config_fpm_values - configure HMC objects
- * @dev: sc device struct
- * @qp_count: desired qp count
- */
-enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_count)
-{
-	struct i40iw_virt_mem virt_mem;
-	u32 i, mem_size;
-	u32 qpwantedoriginal, qpwanted, mrwanted, pblewanted;
-	u64 sd_needed;
-	u32 loop_count = 0;
-
-	struct i40iw_hmc_info *hmc_info;
-	struct i40iw_hmc_fpm_misc *hmc_fpm_misc;
-	enum i40iw_status_code ret_code = 0;
-
-	hmc_info = dev->hmc_info;
-	hmc_fpm_misc = &dev->hmc_fpm_misc;
-
-	ret_code = i40iw_sc_init_iw_hmc(dev, dev->hmc_fn_id);
-	if (ret_code) {
-		i40iw_debug(dev, I40IW_DEBUG_HMC,
-			    "i40iw_sc_init_iw_hmc returned error_code = %d\n",
-			    ret_code);
-		return ret_code;
-	}
-
-	for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++)
-		hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt;
-	sd_needed = i40iw_est_sd(dev, hmc_info);
-	i40iw_debug(dev, I40IW_DEBUG_HMC,
-		    "%s: FW initial max sd_count[%08lld] first_sd_index[%04d]\n",
-		    __func__, sd_needed, hmc_info->first_sd_index);
-	i40iw_debug(dev, I40IW_DEBUG_HMC,
-		    "%s: sd count %d where max sd is %d\n",
-		    __func__, hmc_info->sd_table.sd_cnt,
-		    hmc_fpm_misc->max_sds);
-
-	qpwanted = min(qp_count, hmc_info->hmc_obj[I40IW_HMC_IW_QP].max_cnt);
-	qpwantedoriginal = qpwanted;
-	mrwanted = hmc_info->hmc_obj[I40IW_HMC_IW_MR].max_cnt;
-	pblewanted = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].max_cnt;
-
-	i40iw_debug(dev, I40IW_DEBUG_HMC,
-		    "req_qp=%d max_sd=%d, max_qp = %d, max_cq=%d, max_mr=%d, max_pble=%d\n",
-		    qp_count, hmc_fpm_misc->max_sds,
-		    hmc_info->hmc_obj[I40IW_HMC_IW_QP].max_cnt,
-		    hmc_info->hmc_obj[I40IW_HMC_IW_CQ].max_cnt,
-		    hmc_info->hmc_obj[I40IW_HMC_IW_MR].max_cnt,
-		    hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].max_cnt);
-
-	do {
-		++loop_count;
-		hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt = qpwanted;
-		hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt =
-			min(2 * qpwanted, hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt);
-		hmc_info->hmc_obj[I40IW_HMC_IW_SRQ].cnt = 0x00; /* Reserved */
-		hmc_info->hmc_obj[I40IW_HMC_IW_HTE].cnt =
-					qpwanted * hmc_fpm_misc->ht_multiplier;
-		hmc_info->hmc_obj[I40IW_HMC_IW_ARP].cnt =
-			hmc_info->hmc_obj[I40IW_HMC_IW_ARP].max_cnt;
-		hmc_info->hmc_obj[I40IW_HMC_IW_APBVT_ENTRY].cnt = 1;
-		hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt = mrwanted;
-
-		hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt =
-			roundup_pow_of_two(I40IW_MAX_WQ_ENTRIES * qpwanted);
-		hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt =
-			roundup_pow_of_two(2 * I40IW_MAX_IRD_SIZE * qpwanted);
-		hmc_info->hmc_obj[I40IW_HMC_IW_XFFL].cnt =
-			hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt / hmc_fpm_misc->xf_block_size;
-		hmc_info->hmc_obj[I40IW_HMC_IW_Q1FL].cnt =
-			hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt / hmc_fpm_misc->q1_block_size;
-		hmc_info->hmc_obj[I40IW_HMC_IW_TIMER].cnt =
-			((qpwanted) / 512 + 1) * hmc_fpm_misc->timer_bucket;
-		hmc_info->hmc_obj[I40IW_HMC_IW_FSIMC].cnt = 0x00;
-		hmc_info->hmc_obj[I40IW_HMC_IW_FSIAV].cnt = 0x00;
-		hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt = pblewanted;
-
-		/* How much memory is needed for all the objects. */
-		sd_needed = i40iw_est_sd(dev, hmc_info);
-		if ((loop_count > 1000) ||
-		    ((!(loop_count % 10)) &&
-		    (qpwanted > qpwantedoriginal * 2 / 3))) {
-			if (qpwanted > FPM_MULTIPLIER)
-				qpwanted = roundup_pow_of_two(qpwanted -
-							      FPM_MULTIPLIER);
-			qpwanted >>= 1;
-		}
-		if (mrwanted > FPM_MULTIPLIER * 10)
-			mrwanted -= FPM_MULTIPLIER * 10;
-		if (pblewanted > FPM_MULTIPLIER * 1000)
-			pblewanted -= FPM_MULTIPLIER * 1000;
-	} while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000);
-
-	i40iw_debug(dev, I40IW_DEBUG_HMC,
-		    "loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n",
-		    loop_count, sd_needed,
-		    hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt,
-		    hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt,
-		    hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt,
-		    hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt);
-
-	ret_code = i40iw_sc_configure_iw_fpm(dev, dev->hmc_fn_id);
-	if (ret_code) {
-		i40iw_debug(dev, I40IW_DEBUG_HMC,
-			    "configure_iw_fpm returned error_code[x%08X]\n",
-			    i40iw_rd32(dev->hw, dev->is_pf ? I40E_PFPE_CQPERRCODES : I40E_VFPE_CQPERRCODES1));
-		return ret_code;
-	}
-
-	mem_size = sizeof(struct i40iw_hmc_sd_entry) *
-		   (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index + 1);
-	ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size);
-	if (ret_code) {
-		i40iw_debug(dev, I40IW_DEBUG_HMC,
-			    "%s: failed to allocate memory for sd_entry buffer\n",
-			    __func__);
-		return ret_code;
-	}
-	hmc_info->sd_table.sd_entry = virt_mem.va;
-
-	return ret_code;
-}
-
-/**
- * i40iw_exec_cqp_cmd - execute cqp cmd when wqe are available
- * @dev: rdma device
- * @pcmdinfo: cqp command info
- */
-static enum i40iw_status_code i40iw_exec_cqp_cmd(struct i40iw_sc_dev *dev,
-						 struct cqp_commands_info *pcmdinfo)
-{
-	enum i40iw_status_code status;
-	struct i40iw_dma_mem values_mem;
-
-	dev->cqp_cmd_stats[pcmdinfo->cqp_cmd]++;
-	switch (pcmdinfo->cqp_cmd) {
-	case OP_DELETE_LOCAL_MAC_IPADDR_ENTRY:
-		status = i40iw_sc_del_local_mac_ipaddr_entry(
-				pcmdinfo->in.u.del_local_mac_ipaddr_entry.cqp,
-				pcmdinfo->in.u.del_local_mac_ipaddr_entry.scratch,
-				pcmdinfo->in.u.del_local_mac_ipaddr_entry.entry_idx,
-				pcmdinfo->in.u.del_local_mac_ipaddr_entry.ignore_ref_count,
-				pcmdinfo->post_sq);
-		break;
-	case OP_CEQ_DESTROY:
-		status = i40iw_sc_ceq_destroy(pcmdinfo->in.u.ceq_destroy.ceq,
-					      pcmdinfo->in.u.ceq_destroy.scratch,
-					      pcmdinfo->post_sq);
-		break;
-	case OP_AEQ_DESTROY:
-		status = i40iw_sc_aeq_destroy(pcmdinfo->in.u.aeq_destroy.aeq,
-					      pcmdinfo->in.u.aeq_destroy.scratch,
-					      pcmdinfo->post_sq);
-
-		break;
-	case OP_DELETE_ARP_CACHE_ENTRY:
-		status = i40iw_sc_del_arp_cache_entry(
-				pcmdinfo->in.u.del_arp_cache_entry.cqp,
-				pcmdinfo->in.u.del_arp_cache_entry.scratch,
-				pcmdinfo->in.u.del_arp_cache_entry.arp_index,
-				pcmdinfo->post_sq);
-		break;
-	case OP_MANAGE_APBVT_ENTRY:
-		status = i40iw_sc_manage_apbvt_entry(
-				pcmdinfo->in.u.manage_apbvt_entry.cqp,
-				&pcmdinfo->in.u.manage_apbvt_entry.info,
-				pcmdinfo->in.u.manage_apbvt_entry.scratch,
-				pcmdinfo->post_sq);
-		break;
-	case OP_CEQ_CREATE:
-		status = i40iw_sc_ceq_create(pcmdinfo->in.u.ceq_create.ceq,
-					     pcmdinfo->in.u.ceq_create.scratch,
-					     pcmdinfo->post_sq);
-		break;
-	case OP_AEQ_CREATE:
-		status = i40iw_sc_aeq_create(pcmdinfo->in.u.aeq_create.aeq,
-					     pcmdinfo->in.u.aeq_create.scratch,
-					     pcmdinfo->post_sq);
-		break;
-	case OP_ALLOC_LOCAL_MAC_IPADDR_ENTRY:
-		status = i40iw_sc_alloc_local_mac_ipaddr_entry(
-				pcmdinfo->in.u.alloc_local_mac_ipaddr_entry.cqp,
-				pcmdinfo->in.u.alloc_local_mac_ipaddr_entry.scratch,
-				pcmdinfo->post_sq);
-		break;
-	case OP_ADD_LOCAL_MAC_IPADDR_ENTRY:
-		status = i40iw_sc_add_local_mac_ipaddr_entry(
-				pcmdinfo->in.u.add_local_mac_ipaddr_entry.cqp,
-				&pcmdinfo->in.u.add_local_mac_ipaddr_entry.info,
-				pcmdinfo->in.u.add_local_mac_ipaddr_entry.scratch,
-				pcmdinfo->post_sq);
-		break;
-	case OP_MANAGE_QHASH_TABLE_ENTRY:
-		status = i40iw_sc_manage_qhash_table_entry(
-				pcmdinfo->in.u.manage_qhash_table_entry.cqp,
-				&pcmdinfo->in.u.manage_qhash_table_entry.info,
-				pcmdinfo->in.u.manage_qhash_table_entry.scratch,
-				pcmdinfo->post_sq);
-
-		break;
-	case OP_QP_MODIFY:
-		status = i40iw_sc_qp_modify(
-				pcmdinfo->in.u.qp_modify.qp,
-				&pcmdinfo->in.u.qp_modify.info,
-				pcmdinfo->in.u.qp_modify.scratch,
-				pcmdinfo->post_sq);
-
-		break;
-	case OP_QP_UPLOAD_CONTEXT:
-		status = i40iw_sc_qp_upload_context(
-				pcmdinfo->in.u.qp_upload_context.dev,
-				&pcmdinfo->in.u.qp_upload_context.info,
-				pcmdinfo->in.u.qp_upload_context.scratch,
-				pcmdinfo->post_sq);
-
-		break;
-	case OP_CQ_CREATE:
-		status = i40iw_sc_cq_create(
-				pcmdinfo->in.u.cq_create.cq,
-				pcmdinfo->in.u.cq_create.scratch,
-				pcmdinfo->in.u.cq_create.check_overflow,
-				pcmdinfo->post_sq);
-		break;
-	case OP_CQ_DESTROY:
-		status = i40iw_sc_cq_destroy(
-				pcmdinfo->in.u.cq_destroy.cq,
-				pcmdinfo->in.u.cq_destroy.scratch,
-				pcmdinfo->post_sq);
-
-		break;
-	case OP_QP_CREATE:
-		status = i40iw_sc_qp_create(
-				pcmdinfo->in.u.qp_create.qp,
-				&pcmdinfo->in.u.qp_create.info,
-				pcmdinfo->in.u.qp_create.scratch,
-				pcmdinfo->post_sq);
-		break;
-	case OP_QP_DESTROY:
-		status = i40iw_sc_qp_destroy(
-				pcmdinfo->in.u.qp_destroy.qp,
-				pcmdinfo->in.u.qp_destroy.scratch,
-				pcmdinfo->in.u.qp_destroy.remove_hash_idx,
-				pcmdinfo->in.u.qp_destroy.
-				ignore_mw_bnd,
-				pcmdinfo->post_sq);
-
-		break;
-	case OP_ALLOC_STAG:
-		status = i40iw_sc_alloc_stag(
-				pcmdinfo->in.u.alloc_stag.dev,
-				&pcmdinfo->in.u.alloc_stag.info,
-				pcmdinfo->in.u.alloc_stag.scratch,
-				pcmdinfo->post_sq);
-		break;
-	case OP_MR_REG_NON_SHARED:
-		status = i40iw_sc_mr_reg_non_shared(
-				pcmdinfo->in.u.mr_reg_non_shared.dev,
-				&pcmdinfo->in.u.mr_reg_non_shared.info,
-				pcmdinfo->in.u.mr_reg_non_shared.scratch,
-				pcmdinfo->post_sq);
-
-		break;
-	case OP_DEALLOC_STAG:
-		status = i40iw_sc_dealloc_stag(
-				pcmdinfo->in.u.dealloc_stag.dev,
-				&pcmdinfo->in.u.dealloc_stag.info,
-				pcmdinfo->in.u.dealloc_stag.scratch,
-				pcmdinfo->post_sq);
-
-		break;
-	case OP_MW_ALLOC:
-		status = i40iw_sc_mw_alloc(
-				pcmdinfo->in.u.mw_alloc.dev,
-				pcmdinfo->in.u.mw_alloc.scratch,
-				pcmdinfo->in.u.mw_alloc.mw_stag_index,
-				pcmdinfo->in.u.mw_alloc.pd_id,
-				pcmdinfo->post_sq);
-
-		break;
-	case OP_QP_FLUSH_WQES:
-		status = i40iw_sc_qp_flush_wqes(
-				pcmdinfo->in.u.qp_flush_wqes.qp,
-				&pcmdinfo->in.u.qp_flush_wqes.info,
-				pcmdinfo->in.u.qp_flush_wqes.
-				scratch, pcmdinfo->post_sq);
-		break;
-	case OP_GEN_AE:
-		status = i40iw_sc_gen_ae(
-				pcmdinfo->in.u.gen_ae.qp,
-				&pcmdinfo->in.u.gen_ae.info,
-				pcmdinfo->in.u.gen_ae.scratch,
-				pcmdinfo->post_sq);
-		break;
-	case OP_ADD_ARP_CACHE_ENTRY:
-		status = i40iw_sc_add_arp_cache_entry(
-				pcmdinfo->in.u.add_arp_cache_entry.cqp,
-				&pcmdinfo->in.u.add_arp_cache_entry.info,
-				pcmdinfo->in.u.add_arp_cache_entry.scratch,
-				pcmdinfo->post_sq);
-		break;
-	case OP_UPDATE_PE_SDS:
-		/* case I40IW_CQP_OP_UPDATE_PE_SDS */
-		status = i40iw_update_pe_sds(
-				pcmdinfo->in.u.update_pe_sds.dev,
-				&pcmdinfo->in.u.update_pe_sds.info,
-				pcmdinfo->in.u.update_pe_sds.
-				scratch);
-
-		break;
-	case OP_MANAGE_HMC_PM_FUNC_TABLE:
-		status = i40iw_sc_manage_hmc_pm_func_table(
-				pcmdinfo->in.u.manage_hmc_pm.dev->cqp,
-				pcmdinfo->in.u.manage_hmc_pm.scratch,
-				(u8)pcmdinfo->in.u.manage_hmc_pm.info.vf_id,
-				pcmdinfo->in.u.manage_hmc_pm.info.free_fcn,
-				true);
-		break;
-	case OP_SUSPEND:
-		status = i40iw_sc_suspend_qp(
-				pcmdinfo->in.u.suspend_resume.cqp,
-				pcmdinfo->in.u.suspend_resume.qp,
-				pcmdinfo->in.u.suspend_resume.scratch);
-		break;
-	case OP_RESUME:
-		status = i40iw_sc_resume_qp(
-				pcmdinfo->in.u.suspend_resume.cqp,
-				pcmdinfo->in.u.suspend_resume.qp,
-				pcmdinfo->in.u.suspend_resume.scratch);
-		break;
-	case OP_MANAGE_VF_PBLE_BP:
-		status = i40iw_manage_vf_pble_bp(
-				pcmdinfo->in.u.manage_vf_pble_bp.cqp,
-				&pcmdinfo->in.u.manage_vf_pble_bp.info,
-				pcmdinfo->in.u.manage_vf_pble_bp.scratch, true);
-		break;
-	case OP_QUERY_FPM_VALUES:
-		values_mem.pa = pcmdinfo->in.u.query_fpm_values.fpm_values_pa;
-		values_mem.va = pcmdinfo->in.u.query_fpm_values.fpm_values_va;
-		status = i40iw_sc_query_fpm_values(
-				pcmdinfo->in.u.query_fpm_values.cqp,
-				pcmdinfo->in.u.query_fpm_values.scratch,
-				pcmdinfo->in.u.query_fpm_values.hmc_fn_id,
-				&values_mem, true, I40IW_CQP_WAIT_EVENT);
-		break;
-	case OP_COMMIT_FPM_VALUES:
-		values_mem.pa = pcmdinfo->in.u.commit_fpm_values.fpm_values_pa;
-		values_mem.va = pcmdinfo->in.u.commit_fpm_values.fpm_values_va;
-		status = i40iw_sc_commit_fpm_values(
-				pcmdinfo->in.u.commit_fpm_values.cqp,
-				pcmdinfo->in.u.commit_fpm_values.scratch,
-				pcmdinfo->in.u.commit_fpm_values.hmc_fn_id,
-				&values_mem,
-				true,
-				I40IW_CQP_WAIT_EVENT);
-		break;
-	case OP_QUERY_RDMA_FEATURES:
-		values_mem.pa = pcmdinfo->in.u.query_rdma_features.cap_pa;
-		values_mem.va = pcmdinfo->in.u.query_rdma_features.cap_va;
-		status = i40iw_sc_query_rdma_features(
-			pcmdinfo->in.u.query_rdma_features.cqp, &values_mem,
-			pcmdinfo->in.u.query_rdma_features.scratch);
-		break;
-	default:
-		status = I40IW_NOT_SUPPORTED;
-		break;
-	}
-
-	return status;
-}
-
-/**
- * i40iw_process_cqp_cmd - process all cqp commands
- * @dev: sc device struct
- * @pcmdinfo: cqp command info
- */
-enum i40iw_status_code i40iw_process_cqp_cmd(struct i40iw_sc_dev *dev,
-					     struct cqp_commands_info *pcmdinfo)
-{
-	enum i40iw_status_code status = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev->cqp_lock, flags);
-	if (list_empty(&dev->cqp_cmd_head) && !i40iw_ring_full(dev->cqp))
-		status = i40iw_exec_cqp_cmd(dev, pcmdinfo);
-	else
-		list_add_tail(&pcmdinfo->cqp_cmd_entry, &dev->cqp_cmd_head);
-	spin_unlock_irqrestore(&dev->cqp_lock, flags);
-	return status;
-}
-
-/**
- * i40iw_process_bh - called from tasklet for cqp list
- * @dev: sc device struct
- */
-enum i40iw_status_code i40iw_process_bh(struct i40iw_sc_dev *dev)
-{
-	enum i40iw_status_code status = 0;
-	struct cqp_commands_info *pcmdinfo;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev->cqp_lock, flags);
-	while (!list_empty(&dev->cqp_cmd_head) && !i40iw_ring_full(dev->cqp)) {
-		pcmdinfo = (struct cqp_commands_info *)i40iw_remove_head(&dev->cqp_cmd_head);
-
-		status = i40iw_exec_cqp_cmd(dev, pcmdinfo);
-		if (status)
-			break;
-	}
-	spin_unlock_irqrestore(&dev->cqp_lock, flags);
-	return status;
-}
-
-/**
- * i40iw_iwarp_opcode - determine if incoming is rdma layer
- * @info: aeq info for the packet
- * @pkt: packet for error
- */
-static u32 i40iw_iwarp_opcode(struct i40iw_aeqe_info *info, u8 *pkt)
-{
-	__be16 *mpa;
-	u32 opcode = 0xffffffff;
-
-	if (info->q2_data_written) {
-		mpa = (__be16 *)pkt;
-		opcode = ntohs(mpa[1]) & 0xf;
-	}
-	return opcode;
-}
-
-/**
- * i40iw_locate_mpa - return pointer to mpa in the pkt
- * @pkt: packet with data
- */
-static u8 *i40iw_locate_mpa(u8 *pkt)
-{
-	/* skip over ethernet header */
-	pkt += I40IW_MAC_HLEN;
-
-	/* Skip over IP and TCP headers */
-	pkt += 4 * (pkt[0] & 0x0f);
-	pkt += 4 * ((pkt[12] >> 4) & 0x0f);
-	return pkt;
-}
-
-/**
- * i40iw_setup_termhdr - termhdr for terminate pkt
- * @qp: sc qp ptr for pkt
- * @hdr: term hdr
- * @opcode: flush opcode for termhdr
- * @layer_etype: error layer + error type
- * @err: error cod ein the header
- */
-static void i40iw_setup_termhdr(struct i40iw_sc_qp *qp,
-				struct i40iw_terminate_hdr *hdr,
-				enum i40iw_flush_opcode opcode,
-				u8 layer_etype,
-				u8 err)
-{
-	qp->flush_code = opcode;
-	hdr->layer_etype = layer_etype;
-	hdr->error_code = err;
-}
-
-/**
- * i40iw_bld_terminate_hdr - build terminate message header
- * @qp: qp associated with received terminate AE
- * @info: the struct contiaing AE information
- */
-static int i40iw_bld_terminate_hdr(struct i40iw_sc_qp *qp,
-				   struct i40iw_aeqe_info *info)
-{
-	u8 *pkt = qp->q2_buf + Q2_BAD_FRAME_OFFSET;
-	u16 ddp_seg_len;
-	int copy_len = 0;
-	u8 is_tagged = 0;
-	u32 opcode;
-	struct i40iw_terminate_hdr *termhdr;
-
-	termhdr = (struct i40iw_terminate_hdr *)qp->q2_buf;
-	memset(termhdr, 0, Q2_BAD_FRAME_OFFSET);
-
-	if (info->q2_data_written) {
-		/* Use data from offending packet to fill in ddp & rdma hdrs */
-		pkt = i40iw_locate_mpa(pkt);
-		ddp_seg_len = ntohs(*(__be16 *)pkt);
-		if (ddp_seg_len) {
-			copy_len = 2;
-			termhdr->hdrct = DDP_LEN_FLAG;
-			if (pkt[2] & 0x80) {
-				is_tagged = 1;
-				if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) {
-					copy_len += TERM_DDP_LEN_TAGGED;
-					termhdr->hdrct |= DDP_HDR_FLAG;
-				}
-			} else {
-				if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) {
-					copy_len += TERM_DDP_LEN_UNTAGGED;
-					termhdr->hdrct |= DDP_HDR_FLAG;
-				}
-
-				if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) {
-					if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) {
-						copy_len += TERM_RDMA_LEN;
-						termhdr->hdrct |= RDMA_HDR_FLAG;
-					}
-				}
-			}
-		}
-	}
-
-	opcode = i40iw_iwarp_opcode(info, pkt);
-
-	switch (info->ae_id) {
-	case I40IW_AE_AMP_UNALLOCATED_STAG:
-		qp->eventtype = TERM_EVENT_QP_ACCESS_ERR;
-		if (opcode == I40IW_OP_TYPE_RDMA_WRITE)
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_PROT_ERR,
-					    (LAYER_DDP << 4) | DDP_TAGGED_BUFFER, DDP_TAGGED_INV_STAG);
-		else
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR,
-					    (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_INV_STAG);
-		break;
-	case I40IW_AE_AMP_BOUNDS_VIOLATION:
-		qp->eventtype = TERM_EVENT_QP_ACCESS_ERR;
-		if (info->q2_data_written)
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_PROT_ERR,
-					    (LAYER_DDP << 4) | DDP_TAGGED_BUFFER, DDP_TAGGED_BOUNDS);
-		else
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR,
-					    (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_INV_BOUNDS);
-		break;
-	case I40IW_AE_AMP_BAD_PD:
-		switch (opcode) {
-		case I40IW_OP_TYPE_RDMA_WRITE:
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_PROT_ERR,
-					    (LAYER_DDP << 4) | DDP_TAGGED_BUFFER, DDP_TAGGED_UNASSOC_STAG);
-			break;
-		case I40IW_OP_TYPE_SEND_INV:
-		case I40IW_OP_TYPE_SEND_SOL_INV:
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR,
-					    (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_CANT_INV_STAG);
-			break;
-		default:
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR,
-					    (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_UNASSOC_STAG);
-		}
-		break;
-	case I40IW_AE_AMP_INVALID_STAG:
-		qp->eventtype = TERM_EVENT_QP_ACCESS_ERR;
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR,
-				    (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_INV_STAG);
-		break;
-	case I40IW_AE_AMP_BAD_QP:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_LOC_QP_OP_ERR,
-				    (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_QN);
-		break;
-	case I40IW_AE_AMP_BAD_STAG_KEY:
-	case I40IW_AE_AMP_BAD_STAG_INDEX:
-		qp->eventtype = TERM_EVENT_QP_ACCESS_ERR;
-		switch (opcode) {
-		case I40IW_OP_TYPE_SEND_INV:
-		case I40IW_OP_TYPE_SEND_SOL_INV:
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_OP_ERR,
-					    (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_CANT_INV_STAG);
-			break;
-		default:
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR,
-					    (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_INV_STAG);
-		}
-		break;
-	case I40IW_AE_AMP_RIGHTS_VIOLATION:
-	case I40IW_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
-	case I40IW_AE_PRIV_OPERATION_DENIED:
-		qp->eventtype = TERM_EVENT_QP_ACCESS_ERR;
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR,
-				    (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_ACCESS);
-		break;
-	case I40IW_AE_AMP_TO_WRAP:
-		qp->eventtype = TERM_EVENT_QP_ACCESS_ERR;
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_ACCESS_ERR,
-				    (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT, RDMAP_TO_WRAP);
-		break;
-	case I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR,
-				    (LAYER_MPA << 4) | DDP_LLP, MPA_CRC);
-		break;
-	case I40IW_AE_LLP_SEGMENT_TOO_LARGE:
-	case I40IW_AE_LLP_SEGMENT_TOO_SMALL:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_LOC_LEN_ERR,
-				    (LAYER_DDP << 4) | DDP_CATASTROPHIC, DDP_CATASTROPHIC_LOCAL);
-		break;
-	case I40IW_AE_LCE_QP_CATASTROPHIC:
-	case I40IW_AE_DDP_NO_L_BIT:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_FATAL_ERR,
-				    (LAYER_DDP << 4) | DDP_CATASTROPHIC, DDP_CATASTROPHIC_LOCAL);
-		break;
-	case I40IW_AE_DDP_INVALID_MSN_GAP_IN_MSN:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR,
-				    (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_MSN_RANGE);
-		break;
-	case I40IW_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
-		qp->eventtype = TERM_EVENT_QP_ACCESS_ERR;
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_LOC_LEN_ERR,
-				    (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_TOO_LONG);
-		break;
-	case I40IW_AE_DDP_UBE_INVALID_DDP_VERSION:
-		if (is_tagged)
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR,
-					    (LAYER_DDP << 4) | DDP_TAGGED_BUFFER, DDP_TAGGED_INV_DDP_VER);
-		else
-			i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR,
-					    (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_DDP_VER);
-		break;
-	case I40IW_AE_DDP_UBE_INVALID_MO:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR,
-				    (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_MO);
-		break;
-	case I40IW_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_REM_OP_ERR,
-				    (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_MSN_NO_BUF);
-		break;
-	case I40IW_AE_DDP_UBE_INVALID_QN:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR,
-				    (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER, DDP_UNTAGGED_INV_QN);
-		break;
-	case I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_GENERAL_ERR,
-				    (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_INV_RDMAP_VER);
-		break;
-	case I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_LOC_QP_OP_ERR,
-				    (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_UNEXPECTED_OP);
-		break;
-	default:
-		i40iw_setup_termhdr(qp, termhdr, FLUSH_FATAL_ERR,
-				    (LAYER_RDMA << 4) | RDMAP_REMOTE_OP, RDMAP_UNSPECIFIED);
-		break;
-	}
-
-	if (copy_len)
-		memcpy(termhdr + 1, pkt, copy_len);
-
-	return sizeof(struct i40iw_terminate_hdr) + copy_len;
-}
-
-/**
- * i40iw_terminate_send_fin() - Send fin for terminate message
- * @qp: qp associated with received terminate AE
- */
-void i40iw_terminate_send_fin(struct i40iw_sc_qp *qp)
-{
-	/* Send the fin only */
-	i40iw_term_modify_qp(qp,
-			     I40IW_QP_STATE_TERMINATE,
-			     I40IWQP_TERM_SEND_FIN_ONLY,
-			     0);
-}
-
-/**
- * i40iw_terminate_connection() - Bad AE and send terminate to remote QP
- * @qp: qp associated with received terminate AE
- * @info: the struct contiaing AE information
- */
-void i40iw_terminate_connection(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info)
-{
-	u8 termlen = 0;
-
-	if (qp->term_flags & I40IW_TERM_SENT)
-		return;         /* Sanity check */
-
-	/* Eventtype can change from bld_terminate_hdr */
-	qp->eventtype = TERM_EVENT_QP_FATAL;
-	termlen = i40iw_bld_terminate_hdr(qp, info);
-	i40iw_terminate_start_timer(qp);
-	qp->term_flags |= I40IW_TERM_SENT;
-	i40iw_term_modify_qp(qp, I40IW_QP_STATE_TERMINATE,
-			     I40IWQP_TERM_SEND_TERM_ONLY, termlen);
-}
-
-/**
- * i40iw_terminate_received - handle terminate received AE
- * @qp: qp associated with received terminate AE
- * @info: the struct contiaing AE information
- */
-void i40iw_terminate_received(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info)
-{
-	u8 *pkt = qp->q2_buf + Q2_BAD_FRAME_OFFSET;
-	__be32 *mpa;
-	u8 ddp_ctl;
-	u8 rdma_ctl;
-	u16 aeq_id = 0;
-	struct i40iw_terminate_hdr *termhdr;
-
-	mpa = (__be32 *)i40iw_locate_mpa(pkt);
-	if (info->q2_data_written) {
-		/* did not validate the frame - do it now */
-		ddp_ctl = (ntohl(mpa[0]) >> 8) & 0xff;
-		rdma_ctl = ntohl(mpa[0]) & 0xff;
-		if ((ddp_ctl & 0xc0) != 0x40)
-			aeq_id = I40IW_AE_LCE_QP_CATASTROPHIC;
-		else if ((ddp_ctl & 0x03) != 1)
-			aeq_id = I40IW_AE_DDP_UBE_INVALID_DDP_VERSION;
-		else if (ntohl(mpa[2]) != 2)
-			aeq_id = I40IW_AE_DDP_UBE_INVALID_QN;
-		else if (ntohl(mpa[3]) != 1)
-			aeq_id = I40IW_AE_DDP_INVALID_MSN_GAP_IN_MSN;
-		else if (ntohl(mpa[4]) != 0)
-			aeq_id = I40IW_AE_DDP_UBE_INVALID_MO;
-		else if ((rdma_ctl & 0xc0) != 0x40)
-			aeq_id = I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION;
-
-		info->ae_id = aeq_id;
-		if (info->ae_id) {
-			/* Bad terminate recvd - send back a terminate */
-			i40iw_terminate_connection(qp, info);
-			return;
-		}
-	}
-
-	qp->term_flags |= I40IW_TERM_RCVD;
-	qp->eventtype = TERM_EVENT_QP_FATAL;
-	termhdr = (struct i40iw_terminate_hdr *)&mpa[5];
-	if (termhdr->layer_etype == RDMAP_REMOTE_PROT ||
-	    termhdr->layer_etype == RDMAP_REMOTE_OP) {
-		i40iw_terminate_done(qp, 0);
-	} else {
-		i40iw_terminate_start_timer(qp);
-		i40iw_terminate_send_fin(qp);
-	}
-}
-
-/**
- * i40iw_sc_vsi_init - Initialize virtual device
- * @vsi: pointer to the vsi structure
- * @info: parameters to initialize vsi
- **/
-void i40iw_sc_vsi_init(struct i40iw_sc_vsi *vsi, struct i40iw_vsi_init_info *info)
-{
-	int i;
-
-	vsi->dev = info->dev;
-	vsi->back_vsi = info->back_vsi;
-	vsi->mtu = info->params->mtu;
-	vsi->exception_lan_queue = info->exception_lan_queue;
-	i40iw_fill_qos_list(info->params->qs_handle_list);
-
-	for (i = 0; i < I40IW_MAX_USER_PRIORITY; i++) {
-		vsi->qos[i].qs_handle = info->params->qs_handle_list[i];
-		i40iw_debug(vsi->dev, I40IW_DEBUG_DCB, "qset[%d]: %d\n", i,
-			    vsi->qos[i].qs_handle);
-		spin_lock_init(&vsi->qos[i].lock);
-		INIT_LIST_HEAD(&vsi->qos[i].qplist);
-	}
-}
-
-/**
- * i40iw_hw_stats_init - Initiliaze HW stats table
- * @stats: pestat struct
- * @fcn_idx: PCI fn id
- * @is_pf: Is it a PF?
- *
- * Populate the HW stats table with register offset addr for each
- * stats. And start the perioidic stats timer.
- */
-void i40iw_hw_stats_init(struct i40iw_vsi_pestat *stats, u8 fcn_idx, bool is_pf)
-{
-	u32 stats_reg_offset;
-	u32 stats_index;
-	struct i40iw_dev_hw_stats_offsets *stats_table =
-		&stats->hw_stats_offsets;
-	struct i40iw_dev_hw_stats *last_rd_stats = &stats->last_read_hw_stats;
-
-	if (is_pf) {
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] =
-				I40E_GLPES_PFIP4RXDISCARD(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] =
-				I40E_GLPES_PFIP4RXTRUNC(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] =
-				I40E_GLPES_PFIP4TXNOROUTE(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD] =
-				I40E_GLPES_PFIP6RXDISCARD(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC] =
-				I40E_GLPES_PFIP6RXTRUNC(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE] =
-				I40E_GLPES_PFIP6TXNOROUTE(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRTXSEG] =
-				I40E_GLPES_PFTCPRTXSEG(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRXOPTERR] =
-				I40E_GLPES_PFTCPRXOPTERR(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRXPROTOERR] =
-				I40E_GLPES_PFTCPRXPROTOERR(fcn_idx);
-
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXOCTS] =
-				I40E_GLPES_PFIP4RXOCTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] =
-				I40E_GLPES_PFIP4RXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] =
-				I40E_GLPES_PFIP4RXFRAGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] =
-				I40E_GLPES_PFIP4RXMCPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXOCTS] =
-				I40E_GLPES_PFIP4TXOCTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXPKTS] =
-				I40E_GLPES_PFIP4TXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] =
-				I40E_GLPES_PFIP4TXFRAGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] =
-				I40E_GLPES_PFIP4TXMCPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXOCTS] =
-				I40E_GLPES_PFIP6RXOCTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXPKTS] =
-				I40E_GLPES_PFIP6RXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS] =
-				I40E_GLPES_PFIP6RXFRAGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS] =
-				I40E_GLPES_PFIP6RXMCPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXOCTS] =
-				I40E_GLPES_PFIP6TXOCTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXPKTS] =
-				I40E_GLPES_PFIP6TXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXPKTS] =
-				I40E_GLPES_PFIP6TXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS] =
-				I40E_GLPES_PFIP6TXFRAGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_TCPRXSEGS] =
-				I40E_GLPES_PFTCPRXSEGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_TCPTXSEG] =
-				I40E_GLPES_PFTCPTXSEGLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXRDS] =
-				I40E_GLPES_PFRDMARXRDSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXSNDS] =
-				I40E_GLPES_PFRDMARXSNDSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXWRS] =
-				I40E_GLPES_PFRDMARXWRSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXRDS] =
-				I40E_GLPES_PFRDMATXRDSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXSNDS] =
-				I40E_GLPES_PFRDMATXSNDSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXWRS] =
-				I40E_GLPES_PFRDMATXWRSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMAVBND] =
-				I40E_GLPES_PFRDMAVBNDLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMAVINV] =
-				I40E_GLPES_PFRDMAVINVLO(fcn_idx);
-	} else {
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] =
-				I40E_GLPES_VFIP4RXDISCARD(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] =
-				I40E_GLPES_VFIP4RXTRUNC(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] =
-				I40E_GLPES_VFIP4TXNOROUTE(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD] =
-				I40E_GLPES_VFIP6RXDISCARD(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC] =
-				I40E_GLPES_VFIP6RXTRUNC(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE] =
-				I40E_GLPES_VFIP6TXNOROUTE(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRTXSEG] =
-				I40E_GLPES_VFTCPRTXSEG(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRXOPTERR] =
-				I40E_GLPES_VFTCPRXOPTERR(fcn_idx);
-		stats_table->stats_offset_32[I40IW_HW_STAT_INDEX_TCPRXPROTOERR] =
-				I40E_GLPES_VFTCPRXPROTOERR(fcn_idx);
-
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXOCTS] =
-				I40E_GLPES_VFIP4RXOCTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] =
-				I40E_GLPES_VFIP4RXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] =
-				I40E_GLPES_VFIP4RXFRAGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] =
-				I40E_GLPES_VFIP4RXMCPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXOCTS] =
-				I40E_GLPES_VFIP4TXOCTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXPKTS] =
-				I40E_GLPES_VFIP4TXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] =
-				I40E_GLPES_VFIP4TXFRAGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] =
-				I40E_GLPES_VFIP4TXMCPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXOCTS] =
-				I40E_GLPES_VFIP6RXOCTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXPKTS] =
-				I40E_GLPES_VFIP6RXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS] =
-				I40E_GLPES_VFIP6RXFRAGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS] =
-				I40E_GLPES_VFIP6RXMCPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXOCTS] =
-				I40E_GLPES_VFIP6TXOCTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXPKTS] =
-				I40E_GLPES_VFIP6TXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXPKTS] =
-				I40E_GLPES_VFIP6TXPKTSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS] =
-				I40E_GLPES_VFIP6TXFRAGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_TCPRXSEGS] =
-				I40E_GLPES_VFTCPRXSEGSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_TCPTXSEG] =
-				I40E_GLPES_VFTCPTXSEGLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXRDS] =
-				I40E_GLPES_VFRDMARXRDSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXSNDS] =
-				I40E_GLPES_VFRDMARXSNDSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMARXWRS] =
-				I40E_GLPES_VFRDMARXWRSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXRDS] =
-				I40E_GLPES_VFRDMATXRDSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXSNDS] =
-				I40E_GLPES_VFRDMATXSNDSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMATXWRS] =
-				I40E_GLPES_VFRDMATXWRSLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMAVBND] =
-				I40E_GLPES_VFRDMAVBNDLO(fcn_idx);
-		stats_table->stats_offset_64[I40IW_HW_STAT_INDEX_RDMAVINV] =
-				I40E_GLPES_VFRDMAVINVLO(fcn_idx);
-	}
-
-	for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_64;
-	     stats_index++) {
-		stats_reg_offset = stats_table->stats_offset_64[stats_index];
-		last_rd_stats->stats_value_64[stats_index] =
-			readq(stats->hw->hw_addr + stats_reg_offset);
-	}
-
-	for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_32;
-	     stats_index++) {
-		stats_reg_offset = stats_table->stats_offset_32[stats_index];
-		last_rd_stats->stats_value_32[stats_index] =
-			i40iw_rd32(stats->hw, stats_reg_offset);
-	}
-}
-
-/**
- * i40iw_hw_stats_read_32 - Read 32-bit HW stats counters and accommodates for roll-overs.
- * @stats: pestat struct
- * @index: index in HW stats table which contains offset reg-addr
- * @value: hw stats value
- */
-void i40iw_hw_stats_read_32(struct i40iw_vsi_pestat *stats,
-			    enum i40iw_hw_stats_index_32b index,
-			    u64 *value)
-{
-	struct i40iw_dev_hw_stats_offsets *stats_table =
-		&stats->hw_stats_offsets;
-	struct i40iw_dev_hw_stats *last_rd_stats = &stats->last_read_hw_stats;
-	struct i40iw_dev_hw_stats *hw_stats = &stats->hw_stats;
-	u64 new_stats_value = 0;
-	u32 stats_reg_offset = stats_table->stats_offset_32[index];
-
-	new_stats_value = i40iw_rd32(stats->hw, stats_reg_offset);
-	/*roll-over case */
-	if (new_stats_value < last_rd_stats->stats_value_32[index])
-		hw_stats->stats_value_32[index] += new_stats_value;
-	else
-		hw_stats->stats_value_32[index] +=
-			new_stats_value - last_rd_stats->stats_value_32[index];
-	last_rd_stats->stats_value_32[index] = new_stats_value;
-	*value = hw_stats->stats_value_32[index];
-}
-
-/**
- * i40iw_hw_stats_read_64 - Read HW stats counters (greater than 32-bit) and accommodates for roll-overs.
- * @stats: pestat struct
- * @index: index in HW stats table which contains offset reg-addr
- * @value: hw stats value
- */
-void i40iw_hw_stats_read_64(struct i40iw_vsi_pestat *stats,
-			    enum i40iw_hw_stats_index_64b index,
-			    u64 *value)
-{
-	struct i40iw_dev_hw_stats_offsets *stats_table =
-		&stats->hw_stats_offsets;
-	struct i40iw_dev_hw_stats *last_rd_stats = &stats->last_read_hw_stats;
-	struct i40iw_dev_hw_stats *hw_stats = &stats->hw_stats;
-	u64 new_stats_value = 0;
-	u32 stats_reg_offset = stats_table->stats_offset_64[index];
-
-	new_stats_value = readq(stats->hw->hw_addr + stats_reg_offset);
-	/*roll-over case */
-	if (new_stats_value < last_rd_stats->stats_value_64[index])
-		hw_stats->stats_value_64[index] += new_stats_value;
-	else
-		hw_stats->stats_value_64[index] +=
-			new_stats_value - last_rd_stats->stats_value_64[index];
-	last_rd_stats->stats_value_64[index] = new_stats_value;
-	*value = hw_stats->stats_value_64[index];
-}
-
-/**
- * i40iw_hw_stats_read_all - read all HW stat counters
- * @stats: pestat struct
- * @stats_values: hw stats structure
- *
- * Read all the HW stat counters and populates hw_stats structure
- * of passed-in vsi's pestat as well as copy created in stat_values.
- */
-void i40iw_hw_stats_read_all(struct i40iw_vsi_pestat *stats,
-			     struct i40iw_dev_hw_stats *stats_values)
-{
-	u32 stats_index;
-	unsigned long flags;
-
-	spin_lock_irqsave(&stats->lock, flags);
-
-	for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_32;
-	     stats_index++)
-		i40iw_hw_stats_read_32(stats, stats_index,
-				       &stats_values->stats_value_32[stats_index]);
-	for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_64;
-	     stats_index++)
-		i40iw_hw_stats_read_64(stats, stats_index,
-				       &stats_values->stats_value_64[stats_index]);
-	spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-/**
- * i40iw_hw_stats_refresh_all - Update all HW stats structs
- * @stats: pestat struct
- *
- * Read all the HW stats counters to refresh values in hw_stats structure
- * of passed-in dev's pestat
- */
-void i40iw_hw_stats_refresh_all(struct i40iw_vsi_pestat *stats)
-{
-	u64 stats_value;
-	u32 stats_index;
-	unsigned long flags;
-
-	spin_lock_irqsave(&stats->lock, flags);
-
-	for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_32;
-	     stats_index++)
-		i40iw_hw_stats_read_32(stats, stats_index, &stats_value);
-	for (stats_index = 0; stats_index < I40IW_HW_STAT_INDEX_MAX_64;
-	     stats_index++)
-		i40iw_hw_stats_read_64(stats, stats_index, &stats_value);
-	spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-/**
- * i40iw_get_fcn_id - Return the function id
- * @dev: pointer to the device
- */
-static u8 i40iw_get_fcn_id(struct i40iw_sc_dev *dev)
-{
-	u8 fcn_id = I40IW_INVALID_FCN_ID;
-	u8 i;
-
-	for (i = I40IW_FIRST_NON_PF_STAT; i < I40IW_MAX_STATS_COUNT; i++)
-		if (!dev->fcn_id_array[i]) {
-			fcn_id = i;
-			dev->fcn_id_array[i] = true;
-			break;
-		}
-	return fcn_id;
-}
-
-/**
- * i40iw_vsi_stats_init - Initialize the vsi statistics
- * @vsi: pointer to the vsi structure
- * @info: The info structure used for initialization
- */
-enum i40iw_status_code i40iw_vsi_stats_init(struct i40iw_sc_vsi *vsi, struct i40iw_vsi_stats_info *info)
-{
-	u8 fcn_id = info->fcn_id;
-
-	if (info->alloc_fcn_id)
-		fcn_id = i40iw_get_fcn_id(vsi->dev);
-
-	if (fcn_id == I40IW_INVALID_FCN_ID)
-		return I40IW_ERR_NOT_READY;
-
-	vsi->pestat = info->pestat;
-	vsi->pestat->hw = vsi->dev->hw;
-	vsi->pestat->vsi = vsi;
-
-	if (info->stats_initialize) {
-		i40iw_hw_stats_init(vsi->pestat, fcn_id, true);
-		spin_lock_init(&vsi->pestat->lock);
-		i40iw_hw_stats_start_timer(vsi);
-	}
-	vsi->stats_fcn_id_alloc = info->alloc_fcn_id;
-	vsi->fcn_id = fcn_id;
-	return I40IW_SUCCESS;
-}
-
-/**
- * i40iw_vsi_stats_free - Free the vsi stats
- * @vsi: pointer to the vsi structure
- */
-void i40iw_vsi_stats_free(struct i40iw_sc_vsi *vsi)
-{
-	u8 fcn_id = vsi->fcn_id;
-
-	if (vsi->stats_fcn_id_alloc && fcn_id < I40IW_MAX_STATS_COUNT)
-		vsi->dev->fcn_id_array[fcn_id] = false;
-	i40iw_hw_stats_stop_timer(vsi);
-}
-
-static const struct i40iw_cqp_ops iw_cqp_ops = {
-	.cqp_init = i40iw_sc_cqp_init,
-	.cqp_create = i40iw_sc_cqp_create,
-	.cqp_post_sq = i40iw_sc_cqp_post_sq,
-	.cqp_get_next_send_wqe = i40iw_sc_cqp_get_next_send_wqe,
-	.cqp_destroy = i40iw_sc_cqp_destroy,
-	.poll_for_cqp_op_done = i40iw_sc_poll_for_cqp_op_done
-};
-
-static const struct i40iw_ccq_ops iw_ccq_ops = {
-	.ccq_init = i40iw_sc_ccq_init,
-	.ccq_create = i40iw_sc_ccq_create,
-	.ccq_destroy = i40iw_sc_ccq_destroy,
-	.ccq_create_done = i40iw_sc_ccq_create_done,
-	.ccq_get_cqe_info = i40iw_sc_ccq_get_cqe_info,
-	.ccq_arm = i40iw_sc_ccq_arm
-};
-
-static const struct i40iw_ceq_ops iw_ceq_ops = {
-	.ceq_init = i40iw_sc_ceq_init,
-	.ceq_create = i40iw_sc_ceq_create,
-	.cceq_create_done = i40iw_sc_cceq_create_done,
-	.cceq_destroy_done = i40iw_sc_cceq_destroy_done,
-	.cceq_create = i40iw_sc_cceq_create,
-	.ceq_destroy = i40iw_sc_ceq_destroy,
-	.process_ceq = i40iw_sc_process_ceq
-};
-
-static const struct i40iw_aeq_ops iw_aeq_ops = {
-	.aeq_init = i40iw_sc_aeq_init,
-	.aeq_create = i40iw_sc_aeq_create,
-	.aeq_destroy = i40iw_sc_aeq_destroy,
-	.get_next_aeqe = i40iw_sc_get_next_aeqe,
-	.repost_aeq_entries = i40iw_sc_repost_aeq_entries,
-	.aeq_create_done = i40iw_sc_aeq_create_done,
-	.aeq_destroy_done = i40iw_sc_aeq_destroy_done
-};
-
-/* iwarp pd ops */
-static const struct i40iw_pd_ops iw_pd_ops = {
-	.pd_init = i40iw_sc_pd_init,
-};
-
-static const struct i40iw_priv_qp_ops iw_priv_qp_ops = {
-	.qp_init = i40iw_sc_qp_init,
-	.qp_create = i40iw_sc_qp_create,
-	.qp_modify = i40iw_sc_qp_modify,
-	.qp_destroy = i40iw_sc_qp_destroy,
-	.qp_flush_wqes = i40iw_sc_qp_flush_wqes,
-	.qp_upload_context = i40iw_sc_qp_upload_context,
-	.qp_setctx = i40iw_sc_qp_setctx,
-	.qp_send_lsmm = i40iw_sc_send_lsmm,
-	.qp_send_lsmm_nostag = i40iw_sc_send_lsmm_nostag,
-	.qp_send_rtt = i40iw_sc_send_rtt,
-	.qp_post_wqe0 = i40iw_sc_post_wqe0,
-	.iw_mr_fast_register = i40iw_sc_mr_fast_register
-};
-
-static const struct i40iw_priv_cq_ops iw_priv_cq_ops = {
-	.cq_init = i40iw_sc_cq_init,
-	.cq_create = i40iw_sc_cq_create,
-	.cq_destroy = i40iw_sc_cq_destroy,
-	.cq_modify = i40iw_sc_cq_modify,
-};
-
-static const struct i40iw_mr_ops iw_mr_ops = {
-	.alloc_stag = i40iw_sc_alloc_stag,
-	.mr_reg_non_shared = i40iw_sc_mr_reg_non_shared,
-	.mr_reg_shared = i40iw_sc_mr_reg_shared,
-	.dealloc_stag = i40iw_sc_dealloc_stag,
-	.query_stag = i40iw_sc_query_stag,
-	.mw_alloc = i40iw_sc_mw_alloc
-};
-
-static const struct i40iw_cqp_misc_ops iw_cqp_misc_ops = {
-	.manage_hmc_pm_func_table = i40iw_sc_manage_hmc_pm_func_table,
-	.set_hmc_resource_profile = i40iw_sc_set_hmc_resource_profile,
-	.commit_fpm_values = i40iw_sc_commit_fpm_values,
-	.query_fpm_values = i40iw_sc_query_fpm_values,
-	.static_hmc_pages_allocated = i40iw_sc_static_hmc_pages_allocated,
-	.add_arp_cache_entry = i40iw_sc_add_arp_cache_entry,
-	.del_arp_cache_entry = i40iw_sc_del_arp_cache_entry,
-	.query_arp_cache_entry = i40iw_sc_query_arp_cache_entry,
-	.manage_apbvt_entry = i40iw_sc_manage_apbvt_entry,
-	.manage_qhash_table_entry = i40iw_sc_manage_qhash_table_entry,
-	.alloc_local_mac_ipaddr_table_entry = i40iw_sc_alloc_local_mac_ipaddr_entry,
-	.add_local_mac_ipaddr_entry = i40iw_sc_add_local_mac_ipaddr_entry,
-	.del_local_mac_ipaddr_entry = i40iw_sc_del_local_mac_ipaddr_entry,
-	.cqp_nop = i40iw_sc_cqp_nop,
-	.commit_fpm_values_done = i40iw_sc_commit_fpm_values_done,
-	.query_fpm_values_done = i40iw_sc_query_fpm_values_done,
-	.manage_hmc_pm_func_table_done = i40iw_sc_manage_hmc_pm_func_table_done,
-	.update_suspend_qp = i40iw_sc_suspend_qp,
-	.update_resume_qp = i40iw_sc_resume_qp
-};
-
-static const struct i40iw_hmc_ops iw_hmc_ops = {
-	.init_iw_hmc = i40iw_sc_init_iw_hmc,
-	.parse_fpm_query_buf = i40iw_sc_parse_fpm_query_buf,
-	.configure_iw_fpm = i40iw_sc_configure_iw_fpm,
-	.parse_fpm_commit_buf = i40iw_sc_parse_fpm_commit_buf,
-	.create_hmc_object = i40iw_sc_create_hmc_obj,
-	.del_hmc_object = i40iw_sc_del_hmc_obj
-};
-
-/**
- * i40iw_device_init - Initialize IWARP device
- * @dev: IWARP device pointer
- * @info: IWARP init info
- */
-enum i40iw_status_code i40iw_device_init(struct i40iw_sc_dev *dev,
-					 struct i40iw_device_init_info *info)
-{
-	u32 val;
-	u32 vchnl_ver = 0;
-	u16 hmc_fcn = 0;
-	enum i40iw_status_code ret_code = 0;
-	u8 db_size;
-
-	spin_lock_init(&dev->cqp_lock);
-
-	i40iw_device_init_uk(&dev->dev_uk);
-
-	dev->debug_mask = info->debug_mask;
-
-	dev->hmc_fn_id = info->hmc_fn_id;
-	dev->is_pf = info->is_pf;
-
-	dev->fpm_query_buf_pa = info->fpm_query_buf_pa;
-	dev->fpm_query_buf = info->fpm_query_buf;
-
-	dev->fpm_commit_buf_pa = info->fpm_commit_buf_pa;
-	dev->fpm_commit_buf = info->fpm_commit_buf;
-
-	dev->hw = info->hw;
-	dev->hw->hw_addr = info->bar0;
-
-	if (dev->is_pf) {
-		val = i40iw_rd32(dev->hw, I40E_GLPCI_DREVID);
-		dev->hw_rev = (u8)RS_32(val, I40E_GLPCI_DREVID_DEFAULT_REVID);
-
-		val = i40iw_rd32(dev->hw, I40E_GLPCI_LBARCTRL);
-		db_size = (u8)RS_32(val, I40E_GLPCI_LBARCTRL_PE_DB_SIZE);
-		if ((db_size != I40IW_PE_DB_SIZE_4M) &&
-		    (db_size != I40IW_PE_DB_SIZE_8M)) {
-			i40iw_debug(dev, I40IW_DEBUG_DEV,
-				    "%s: PE doorbell is not enabled in CSR val 0x%x\n",
-				    __func__, val);
-			ret_code = I40IW_ERR_PE_DOORBELL_NOT_ENABLED;
-			return ret_code;
-		}
-		dev->db_addr = dev->hw->hw_addr + I40IW_DB_ADDR_OFFSET;
-		dev->vchnl_if.vchnl_recv = i40iw_vchnl_recv_pf;
-	} else {
-		dev->db_addr = dev->hw->hw_addr + I40IW_VF_DB_ADDR_OFFSET;
-	}
-
-	dev->cqp_ops = &iw_cqp_ops;
-	dev->ccq_ops = &iw_ccq_ops;
-	dev->ceq_ops = &iw_ceq_ops;
-	dev->aeq_ops = &iw_aeq_ops;
-	dev->cqp_misc_ops = &iw_cqp_misc_ops;
-	dev->iw_pd_ops = &iw_pd_ops;
-	dev->iw_priv_qp_ops = &iw_priv_qp_ops;
-	dev->iw_priv_cq_ops = &iw_priv_cq_ops;
-	dev->mr_ops = &iw_mr_ops;
-	dev->hmc_ops = &iw_hmc_ops;
-	dev->vchnl_if.vchnl_send = info->vchnl_send;
-	if (dev->vchnl_if.vchnl_send)
-		dev->vchnl_up = true;
-	else
-		dev->vchnl_up = false;
-	if (!dev->is_pf) {
-		dev->vchnl_if.vchnl_recv = i40iw_vchnl_recv_vf;
-		ret_code = i40iw_vchnl_vf_get_ver(dev, &vchnl_ver);
-		if (!ret_code) {
-			i40iw_debug(dev, I40IW_DEBUG_DEV,
-				    "%s: Get Channel version rc = 0x%0x, version is %u\n",
-				__func__, ret_code, vchnl_ver);
-			ret_code = i40iw_vchnl_vf_get_hmc_fcn(dev, &hmc_fcn);
-			if (!ret_code) {
-				i40iw_debug(dev, I40IW_DEBUG_DEV,
-					    "%s Get HMC function rc = 0x%0x, hmc fcn is %u\n",
-					    __func__, ret_code, hmc_fcn);
-				dev->hmc_fn_id = (u8)hmc_fcn;
-			}
-		}
-	}
-	dev->iw_vf_cqp_ops = &iw_vf_cqp_ops;
-
-	return ret_code;
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h
deleted file mode 100644
index 86d5a33c57cc..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_d.h
+++ /dev/null
@@ -1,1746 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_D_H
-#define I40IW_D_H
-
-#define I40IW_FIRST_USER_QP_ID  2
-
-#define I40IW_DB_ADDR_OFFSET    (4 * 1024 * 1024 - 64 * 1024)
-#define I40IW_VF_DB_ADDR_OFFSET (64 * 1024)
-
-#define I40IW_PE_DB_SIZE_4M     1
-#define I40IW_PE_DB_SIZE_8M     2
-
-#define I40IW_DDP_VER 1
-#define I40IW_RDMAP_VER 1
-
-#define I40IW_RDMA_MODE_RDMAC 0
-#define I40IW_RDMA_MODE_IETF  1
-
-#define I40IW_QP_STATE_INVALID 0
-#define I40IW_QP_STATE_IDLE 1
-#define I40IW_QP_STATE_RTS 2
-#define I40IW_QP_STATE_CLOSING 3
-#define I40IW_QP_STATE_RESERVED 4
-#define I40IW_QP_STATE_TERMINATE 5
-#define I40IW_QP_STATE_ERROR 6
-
-#define I40IW_STAG_STATE_INVALID 0
-#define I40IW_STAG_STATE_VALID 1
-
-#define I40IW_STAG_TYPE_SHARED 0
-#define I40IW_STAG_TYPE_NONSHARED 1
-
-#define I40IW_MAX_USER_PRIORITY 8
-#define I40IW_MAX_STATS_COUNT 16
-#define I40IW_FIRST_NON_PF_STAT	4
-
-
-#define I40IW_MTU_TO_MSS_IPV4		40
-#define I40IW_MTU_TO_MSS_IPV6		60
-#define I40IW_DEFAULT_MTU		1500
-
-#define LS_64_1(val, bits)      ((u64)(uintptr_t)val << bits)
-#define RS_64_1(val, bits)      ((u64)(uintptr_t)val >> bits)
-#define LS_32_1(val, bits)      (u32)(val << bits)
-#define RS_32_1(val, bits)      (u32)(val >> bits)
-#define I40E_HI_DWORD(x)        ((u32)((((x) >> 16) >> 16) & 0xFFFFFFFF))
-
-#define QS_HANDLE_UNKNOWN       0xffff
-
-#define LS_64(val, field) (((u64)val << field ## _SHIFT) & (field ## _MASK))
-
-#define RS_64(val, field) ((u64)(val & field ## _MASK) >> field ## _SHIFT)
-#define LS_32(val, field) ((val << field ## _SHIFT) & (field ## _MASK))
-#define RS_32(val, field) ((val & field ## _MASK) >> field ## _SHIFT)
-
-#define TERM_DDP_LEN_TAGGED     14
-#define TERM_DDP_LEN_UNTAGGED   18
-#define TERM_RDMA_LEN           28
-#define RDMA_OPCODE_MASK        0x0f
-#define RDMA_READ_REQ_OPCODE    1
-#define Q2_BAD_FRAME_OFFSET     72
-#define Q2_FPSN_OFFSET          64
-#define CQE_MAJOR_DRV           0x8000
-
-#define I40IW_TERM_SENT 0x01
-#define I40IW_TERM_RCVD 0x02
-#define I40IW_TERM_DONE 0x04
-#define I40IW_MAC_HLEN  14
-
-#define I40IW_INVALID_WQE_INDEX 0xffffffff
-
-#define I40IW_CQP_WAIT_POLL_REGS 1
-#define I40IW_CQP_WAIT_POLL_CQ 2
-#define I40IW_CQP_WAIT_EVENT 3
-
-#define I40IW_CQP_INIT_WQE(wqe) memset(wqe, 0, 64)
-
-#define I40IW_GET_CURRENT_CQ_ELEMENT(_cq) \
-	( \
-		&((_cq)->cq_base[I40IW_RING_GETCURRENT_HEAD((_cq)->cq_ring)])  \
-	)
-#define I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(_cq) \
-	( \
-		&(((struct i40iw_extended_cqe *)        \
-		   ((_cq)->cq_base))[I40IW_RING_GETCURRENT_HEAD((_cq)->cq_ring)]) \
-	)
-
-#define I40IW_GET_CURRENT_AEQ_ELEMENT(_aeq) \
-	( \
-		&_aeq->aeqe_base[I40IW_RING_GETCURRENT_TAIL(_aeq->aeq_ring)]   \
-	)
-
-#define I40IW_GET_CURRENT_CEQ_ELEMENT(_ceq) \
-	( \
-		&_ceq->ceqe_base[I40IW_RING_GETCURRENT_TAIL(_ceq->ceq_ring)]   \
-	)
-
-#define I40IW_AE_SOURCE_RSVD            0x0
-#define I40IW_AE_SOURCE_RQ              0x1
-#define I40IW_AE_SOURCE_RQ_0011         0x3
-
-#define I40IW_AE_SOURCE_CQ              0x2
-#define I40IW_AE_SOURCE_CQ_0110         0x6
-#define I40IW_AE_SOURCE_CQ_1010         0xA
-#define I40IW_AE_SOURCE_CQ_1110         0xE
-
-#define I40IW_AE_SOURCE_SQ              0x5
-#define I40IW_AE_SOURCE_SQ_0111         0x7
-
-#define I40IW_AE_SOURCE_IN_RR_WR        0x9
-#define I40IW_AE_SOURCE_IN_RR_WR_1011   0xB
-#define I40IW_AE_SOURCE_OUT_RR          0xD
-#define I40IW_AE_SOURCE_OUT_RR_1111     0xF
-
-#define I40IW_TCP_STATE_NON_EXISTENT 0
-#define I40IW_TCP_STATE_CLOSED 1
-#define I40IW_TCP_STATE_LISTEN 2
-#define I40IW_STATE_SYN_SEND 3
-#define I40IW_TCP_STATE_SYN_RECEIVED 4
-#define I40IW_TCP_STATE_ESTABLISHED 5
-#define I40IW_TCP_STATE_CLOSE_WAIT 6
-#define I40IW_TCP_STATE_FIN_WAIT_1 7
-#define I40IW_TCP_STATE_CLOSING  8
-#define I40IW_TCP_STATE_LAST_ACK 9
-#define I40IW_TCP_STATE_FIN_WAIT_2 10
-#define I40IW_TCP_STATE_TIME_WAIT 11
-#define I40IW_TCP_STATE_RESERVED_1 12
-#define I40IW_TCP_STATE_RESERVED_2 13
-#define I40IW_TCP_STATE_RESERVED_3 14
-#define I40IW_TCP_STATE_RESERVED_4 15
-
-/* ILQ CQP hash table fields */
-#define I40IW_CQPSQ_QHASH_VLANID_SHIFT 32
-#define I40IW_CQPSQ_QHASH_VLANID_MASK \
-	((u64)0xfff << I40IW_CQPSQ_QHASH_VLANID_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_QPN_SHIFT 32
-#define I40IW_CQPSQ_QHASH_QPN_MASK \
-	((u64)0x3ffff << I40IW_CQPSQ_QHASH_QPN_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_QS_HANDLE_SHIFT 0
-#define I40IW_CQPSQ_QHASH_QS_HANDLE_MASK ((u64)0x3ff << I40IW_CQPSQ_QHASH_QS_HANDLE_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_SRC_PORT_SHIFT 16
-#define I40IW_CQPSQ_QHASH_SRC_PORT_MASK \
-	((u64)0xffff << I40IW_CQPSQ_QHASH_SRC_PORT_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_DEST_PORT_SHIFT 0
-#define I40IW_CQPSQ_QHASH_DEST_PORT_MASK \
-	((u64)0xffff << I40IW_CQPSQ_QHASH_DEST_PORT_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_ADDR0_SHIFT 32
-#define I40IW_CQPSQ_QHASH_ADDR0_MASK \
-	((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR0_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_ADDR1_SHIFT 0
-#define I40IW_CQPSQ_QHASH_ADDR1_MASK \
-	((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR1_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_ADDR2_SHIFT 32
-#define I40IW_CQPSQ_QHASH_ADDR2_MASK \
-	((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR2_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_ADDR3_SHIFT 0
-#define I40IW_CQPSQ_QHASH_ADDR3_MASK \
-	((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR3_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_WQEVALID_SHIFT 63
-#define I40IW_CQPSQ_QHASH_WQEVALID_MASK \
-	((u64)0x1 << I40IW_CQPSQ_QHASH_WQEVALID_SHIFT)
-#define I40IW_CQPSQ_QHASH_OPCODE_SHIFT 32
-#define I40IW_CQPSQ_QHASH_OPCODE_MASK \
-	((u64)0x3f << I40IW_CQPSQ_QHASH_OPCODE_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_MANAGE_SHIFT 61
-#define I40IW_CQPSQ_QHASH_MANAGE_MASK \
-	((u64)0x3 << I40IW_CQPSQ_QHASH_MANAGE_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_IPV4VALID_SHIFT 60
-#define I40IW_CQPSQ_QHASH_IPV4VALID_MASK \
-	((u64)0x1 << I40IW_CQPSQ_QHASH_IPV4VALID_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_VLANVALID_SHIFT 59
-#define I40IW_CQPSQ_QHASH_VLANVALID_MASK \
-	((u64)0x1 << I40IW_CQPSQ_QHASH_VLANVALID_SHIFT)
-
-#define I40IW_CQPSQ_QHASH_ENTRYTYPE_SHIFT 42
-#define I40IW_CQPSQ_QHASH_ENTRYTYPE_MASK \
-	((u64)0x7 << I40IW_CQPSQ_QHASH_ENTRYTYPE_SHIFT)
-/* CQP Host Context */
-#define I40IW_CQPHC_EN_DC_TCP_SHIFT 0
-#define I40IW_CQPHC_EN_DC_TCP_MASK (1UL << I40IW_CQPHC_EN_DC_TCP_SHIFT)
-
-#define I40IW_CQPHC_SQSIZE_SHIFT 8
-#define I40IW_CQPHC_SQSIZE_MASK (0xfUL << I40IW_CQPHC_SQSIZE_SHIFT)
-
-#define I40IW_CQPHC_DISABLE_PFPDUS_SHIFT 1
-#define I40IW_CQPHC_DISABLE_PFPDUS_MASK (0x1UL << I40IW_CQPHC_DISABLE_PFPDUS_SHIFT)
-
-#define I40IW_CQPHC_ENABLED_VFS_SHIFT 32
-#define I40IW_CQPHC_ENABLED_VFS_MASK (0x3fULL << I40IW_CQPHC_ENABLED_VFS_SHIFT)
-
-#define I40IW_CQPHC_HMC_PROFILE_SHIFT 0
-#define I40IW_CQPHC_HMC_PROFILE_MASK (0x7ULL << I40IW_CQPHC_HMC_PROFILE_SHIFT)
-
-#define I40IW_CQPHC_SVER_SHIFT 24
-#define I40IW_CQPHC_SVER_MASK (0xffUL << I40IW_CQPHC_SVER_SHIFT)
-
-#define I40IW_CQPHC_SQBASE_SHIFT 9
-#define I40IW_CQPHC_SQBASE_MASK \
-	(0xfffffffffffffeULL << I40IW_CQPHC_SQBASE_SHIFT)
-
-#define I40IW_CQPHC_QPCTX_SHIFT 0
-#define I40IW_CQPHC_QPCTX_MASK  \
-	(0xffffffffffffffffULL << I40IW_CQPHC_QPCTX_SHIFT)
-#define I40IW_CQPHC_SVER        1
-
-#define I40IW_CQP_SW_SQSIZE_4 4
-#define I40IW_CQP_SW_SQSIZE_2048 2048
-
-/* iWARP QP Doorbell shadow area */
-#define I40IW_QP_DBSA_HW_SQ_TAIL_SHIFT 0
-#define I40IW_QP_DBSA_HW_SQ_TAIL_MASK \
-	(0x3fffUL << I40IW_QP_DBSA_HW_SQ_TAIL_SHIFT)
-
-/* Completion Queue Doorbell shadow area */
-#define I40IW_CQ_DBSA_CQEIDX_SHIFT 0
-#define I40IW_CQ_DBSA_CQEIDX_MASK (0xfffffUL << I40IW_CQ_DBSA_CQEIDX_SHIFT)
-
-#define I40IW_CQ_DBSA_SW_CQ_SELECT_SHIFT 0
-#define I40IW_CQ_DBSA_SW_CQ_SELECT_MASK \
-	(0x3fffUL << I40IW_CQ_DBSA_SW_CQ_SELECT_SHIFT)
-
-#define I40IW_CQ_DBSA_ARM_NEXT_SHIFT 14
-#define I40IW_CQ_DBSA_ARM_NEXT_MASK (1UL << I40IW_CQ_DBSA_ARM_NEXT_SHIFT)
-
-#define I40IW_CQ_DBSA_ARM_NEXT_SE_SHIFT 15
-#define I40IW_CQ_DBSA_ARM_NEXT_SE_MASK (1UL << I40IW_CQ_DBSA_ARM_NEXT_SE_SHIFT)
-
-#define I40IW_CQ_DBSA_ARM_SEQ_NUM_SHIFT 16
-#define I40IW_CQ_DBSA_ARM_SEQ_NUM_MASK \
-	(0x3UL << I40IW_CQ_DBSA_ARM_SEQ_NUM_SHIFT)
-
-/* CQP and iWARP Completion Queue */
-#define I40IW_CQ_QPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQ_QPCTX_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_CCQ_OPRETVAL_SHIFT 0
-#define I40IW_CCQ_OPRETVAL_MASK (0xffffffffUL << I40IW_CCQ_OPRETVAL_SHIFT)
-
-#define I40IW_CQ_MINERR_SHIFT 0
-#define I40IW_CQ_MINERR_MASK (0xffffUL << I40IW_CQ_MINERR_SHIFT)
-
-#define I40IW_CQ_MAJERR_SHIFT 16
-#define I40IW_CQ_MAJERR_MASK (0xffffUL << I40IW_CQ_MAJERR_SHIFT)
-
-#define I40IW_CQ_WQEIDX_SHIFT 32
-#define I40IW_CQ_WQEIDX_MASK (0x3fffULL << I40IW_CQ_WQEIDX_SHIFT)
-
-#define I40IW_CQ_ERROR_SHIFT 55
-#define I40IW_CQ_ERROR_MASK (1ULL << I40IW_CQ_ERROR_SHIFT)
-
-#define I40IW_CQ_SQ_SHIFT 62
-#define I40IW_CQ_SQ_MASK (1ULL << I40IW_CQ_SQ_SHIFT)
-
-#define I40IW_CQ_VALID_SHIFT 63
-#define I40IW_CQ_VALID_MASK (1ULL << I40IW_CQ_VALID_SHIFT)
-
-#define I40IWCQ_PAYLDLEN_SHIFT 0
-#define I40IWCQ_PAYLDLEN_MASK (0xffffffffUL << I40IWCQ_PAYLDLEN_SHIFT)
-
-#define I40IWCQ_TCPSEQNUM_SHIFT 32
-#define I40IWCQ_TCPSEQNUM_MASK (0xffffffffULL << I40IWCQ_TCPSEQNUM_SHIFT)
-
-#define I40IWCQ_INVSTAG_SHIFT 0
-#define I40IWCQ_INVSTAG_MASK (0xffffffffUL << I40IWCQ_INVSTAG_SHIFT)
-
-#define I40IWCQ_QPID_SHIFT 32
-#define I40IWCQ_QPID_MASK (0x3ffffULL << I40IWCQ_QPID_SHIFT)
-
-#define I40IWCQ_PSHDROP_SHIFT 51
-#define I40IWCQ_PSHDROP_MASK (1ULL << I40IWCQ_PSHDROP_SHIFT)
-
-#define I40IWCQ_SRQ_SHIFT 52
-#define I40IWCQ_SRQ_MASK (1ULL << I40IWCQ_SRQ_SHIFT)
-
-#define I40IWCQ_STAG_SHIFT 53
-#define I40IWCQ_STAG_MASK (1ULL << I40IWCQ_STAG_SHIFT)
-
-#define I40IWCQ_SOEVENT_SHIFT 54
-#define I40IWCQ_SOEVENT_MASK (1ULL << I40IWCQ_SOEVENT_SHIFT)
-
-#define I40IWCQ_OP_SHIFT 56
-#define I40IWCQ_OP_MASK (0x3fULL << I40IWCQ_OP_SHIFT)
-
-/* CEQE format */
-#define I40IW_CEQE_CQCTX_SHIFT 0
-#define I40IW_CEQE_CQCTX_MASK   \
-	(0x7fffffffffffffffULL << I40IW_CEQE_CQCTX_SHIFT)
-
-#define I40IW_CEQE_VALID_SHIFT 63
-#define I40IW_CEQE_VALID_MASK (1ULL << I40IW_CEQE_VALID_SHIFT)
-
-/* AEQE format */
-#define I40IW_AEQE_COMPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_AEQE_COMPCTX_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_AEQE_QPCQID_SHIFT 0
-#define I40IW_AEQE_QPCQID_MASK (0x3ffffUL << I40IW_AEQE_QPCQID_SHIFT)
-
-#define I40IW_AEQE_WQDESCIDX_SHIFT 18
-#define I40IW_AEQE_WQDESCIDX_MASK (0x3fffULL << I40IW_AEQE_WQDESCIDX_SHIFT)
-
-#define I40IW_AEQE_OVERFLOW_SHIFT 33
-#define I40IW_AEQE_OVERFLOW_MASK (1ULL << I40IW_AEQE_OVERFLOW_SHIFT)
-
-#define I40IW_AEQE_AECODE_SHIFT 34
-#define I40IW_AEQE_AECODE_MASK (0xffffULL << I40IW_AEQE_AECODE_SHIFT)
-
-#define I40IW_AEQE_AESRC_SHIFT 50
-#define I40IW_AEQE_AESRC_MASK (0xfULL << I40IW_AEQE_AESRC_SHIFT)
-
-#define I40IW_AEQE_IWSTATE_SHIFT 54
-#define I40IW_AEQE_IWSTATE_MASK (0x7ULL << I40IW_AEQE_IWSTATE_SHIFT)
-
-#define I40IW_AEQE_TCPSTATE_SHIFT 57
-#define I40IW_AEQE_TCPSTATE_MASK (0xfULL << I40IW_AEQE_TCPSTATE_SHIFT)
-
-#define I40IW_AEQE_Q2DATA_SHIFT 61
-#define I40IW_AEQE_Q2DATA_MASK (0x3ULL << I40IW_AEQE_Q2DATA_SHIFT)
-
-#define I40IW_AEQE_VALID_SHIFT 63
-#define I40IW_AEQE_VALID_MASK (1ULL << I40IW_AEQE_VALID_SHIFT)
-
-/* CQP SQ WQES */
-#define I40IW_QP_TYPE_IWARP     1
-#define I40IW_QP_TYPE_UDA       2
-#define I40IW_QP_TYPE_CQP       4
-
-#define I40IW_CQ_TYPE_IWARP     1
-#define I40IW_CQ_TYPE_ILQ       2
-#define I40IW_CQ_TYPE_IEQ       3
-#define I40IW_CQ_TYPE_CQP       4
-
-#define I40IWQP_TERM_SEND_TERM_AND_FIN          0
-#define I40IWQP_TERM_SEND_TERM_ONLY             1
-#define I40IWQP_TERM_SEND_FIN_ONLY              2
-#define I40IWQP_TERM_DONOT_SEND_TERM_OR_FIN     3
-
-#define I40IW_CQP_OP_CREATE_QP                  0
-#define I40IW_CQP_OP_MODIFY_QP                  0x1
-#define I40IW_CQP_OP_DESTROY_QP                 0x02
-#define I40IW_CQP_OP_CREATE_CQ                  0x03
-#define I40IW_CQP_OP_MODIFY_CQ                  0x04
-#define I40IW_CQP_OP_DESTROY_CQ                 0x05
-#define I40IW_CQP_OP_CREATE_SRQ                 0x06
-#define I40IW_CQP_OP_MODIFY_SRQ                 0x07
-#define I40IW_CQP_OP_DESTROY_SRQ                0x08
-#define I40IW_CQP_OP_ALLOC_STAG                 0x09
-#define I40IW_CQP_OP_REG_MR                     0x0a
-#define I40IW_CQP_OP_QUERY_STAG                 0x0b
-#define I40IW_CQP_OP_REG_SMR                    0x0c
-#define I40IW_CQP_OP_DEALLOC_STAG               0x0d
-#define I40IW_CQP_OP_MANAGE_LOC_MAC_IP_TABLE    0x0e
-#define I40IW_CQP_OP_MANAGE_ARP                 0x0f
-#define I40IW_CQP_OP_MANAGE_VF_PBLE_BP          0x10
-#define I40IW_CQP_OP_QUERY_RDMA_FEATURES	0x12
-#define I40IW_CQP_OP_UPLOAD_CONTEXT             0x13
-#define I40IW_CQP_OP_ALLOCATE_LOC_MAC_IP_TABLE_ENTRY 0x14
-#define I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE   0x15
-#define I40IW_CQP_OP_CREATE_CEQ                 0x16
-#define I40IW_CQP_OP_DESTROY_CEQ                0x18
-#define I40IW_CQP_OP_CREATE_AEQ                 0x19
-#define I40IW_CQP_OP_DESTROY_AEQ                0x1b
-#define I40IW_CQP_OP_CREATE_ADDR_VECT           0x1c
-#define I40IW_CQP_OP_MODIFY_ADDR_VECT           0x1d
-#define I40IW_CQP_OP_DESTROY_ADDR_VECT          0x1e
-#define I40IW_CQP_OP_UPDATE_PE_SDS              0x1f
-#define I40IW_CQP_OP_QUERY_FPM_VALUES           0x20
-#define I40IW_CQP_OP_COMMIT_FPM_VALUES          0x21
-#define I40IW_CQP_OP_FLUSH_WQES                 0x22
-/* I40IW_CQP_OP_GEN_AE is the same value as I40IW_CQP_OP_FLUSH_WQES */
-#define I40IW_CQP_OP_GEN_AE                     0x22
-#define I40IW_CQP_OP_MANAGE_APBVT               0x23
-#define I40IW_CQP_OP_NOP                        0x24
-#define I40IW_CQP_OP_MANAGE_QUAD_HASH_TABLE_ENTRY 0x25
-#define I40IW_CQP_OP_CREATE_UDA_MCAST_GROUP     0x26
-#define I40IW_CQP_OP_MODIFY_UDA_MCAST_GROUP     0x27
-#define I40IW_CQP_OP_DESTROY_UDA_MCAST_GROUP    0x28
-#define I40IW_CQP_OP_SUSPEND_QP                 0x29
-#define I40IW_CQP_OP_RESUME_QP                  0x2a
-#define I40IW_CQP_OP_SHMC_PAGES_ALLOCATED       0x2b
-#define I40IW_CQP_OP_SET_HMC_RESOURCE_PROFILE   0x2d
-
-#define I40IW_FEATURE_BUF_SIZE                  (8 * I40IW_MAX_FEATURES)
-
-#define I40IW_FW_VER_MINOR_SHIFT        0
-#define I40IW_FW_VER_MINOR_MASK         \
-	(0xffffULL << I40IW_FW_VER_MINOR_SHIFT)
-
-#define I40IW_FW_VER_MAJOR_SHIFT        16
-#define I40IW_FW_VER_MAJOR_MASK	        \
-	(0xffffULL << I40IW_FW_VER_MAJOR_SHIFT)
-
-#define I40IW_FEATURE_INFO_SHIFT        0
-#define I40IW_FEATURE_INFO_MASK         \
-	(0xffffULL << I40IW_FEATURE_INFO_SHIFT)
-
-#define I40IW_FEATURE_CNT_SHIFT         32
-#define I40IW_FEATURE_CNT_MASK          \
-	(0xffffULL << I40IW_FEATURE_CNT_SHIFT)
-
-#define I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT 16
-#define I40IW_UDA_QPSQ_NEXT_HEADER_MASK ((u64)0xff << I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT)
-
-#define I40IW_UDA_QPSQ_OPCODE_SHIFT 32
-#define I40IW_UDA_QPSQ_OPCODE_MASK ((u64)0x3f << I40IW_UDA_QPSQ_OPCODE_SHIFT)
-
-#define I40IW_UDA_QPSQ_MACLEN_SHIFT 56
-#define I40IW_UDA_QPSQ_MACLEN_MASK \
-	((u64)0x7f << I40IW_UDA_QPSQ_MACLEN_SHIFT)
-
-#define I40IW_UDA_QPSQ_IPLEN_SHIFT 48
-#define I40IW_UDA_QPSQ_IPLEN_MASK \
-	((u64)0x7f << I40IW_UDA_QPSQ_IPLEN_SHIFT)
-
-#define I40IW_UDA_QPSQ_L4T_SHIFT 30
-#define I40IW_UDA_QPSQ_L4T_MASK \
-	((u64)0x3 << I40IW_UDA_QPSQ_L4T_SHIFT)
-
-#define I40IW_UDA_QPSQ_IIPT_SHIFT 28
-#define I40IW_UDA_QPSQ_IIPT_MASK \
-	((u64)0x3 << I40IW_UDA_QPSQ_IIPT_SHIFT)
-
-#define I40IW_UDA_QPSQ_L4LEN_SHIFT 24
-#define I40IW_UDA_QPSQ_L4LEN_MASK ((u64)0xf << I40IW_UDA_QPSQ_L4LEN_SHIFT)
-
-#define I40IW_UDA_QPSQ_AVIDX_SHIFT 0
-#define I40IW_UDA_QPSQ_AVIDX_MASK ((u64)0xffff << I40IW_UDA_QPSQ_AVIDX_SHIFT)
-
-#define I40IW_UDA_QPSQ_VALID_SHIFT 63
-#define I40IW_UDA_QPSQ_VALID_MASK \
-	((u64)0x1 << I40IW_UDA_QPSQ_VALID_SHIFT)
-
-#define I40IW_UDA_QPSQ_SIGCOMPL_SHIFT 62
-#define I40IW_UDA_QPSQ_SIGCOMPL_MASK ((u64)0x1 << I40IW_UDA_QPSQ_SIGCOMPL_SHIFT)
-
-#define I40IW_UDA_PAYLOADLEN_SHIFT 0
-#define I40IW_UDA_PAYLOADLEN_MASK ((u64)0x3fff << I40IW_UDA_PAYLOADLEN_SHIFT)
-
-#define I40IW_UDA_HDRLEN_SHIFT 16
-#define I40IW_UDA_HDRLEN_MASK ((u64)0x1ff << I40IW_UDA_HDRLEN_SHIFT)
-
-#define I40IW_VLAN_TAG_VALID_SHIFT 50
-#define I40IW_VLAN_TAG_VALID_MASK ((u64)0x1 << I40IW_VLAN_TAG_VALID_SHIFT)
-
-#define I40IW_UDA_L3PROTO_SHIFT 0
-#define I40IW_UDA_L3PROTO_MASK ((u64)0x3 << I40IW_UDA_L3PROTO_SHIFT)
-
-#define I40IW_UDA_L4PROTO_SHIFT 16
-#define I40IW_UDA_L4PROTO_MASK ((u64)0x3 << I40IW_UDA_L4PROTO_SHIFT)
-
-#define I40IW_UDA_QPSQ_DOLOOPBACK_SHIFT 44
-#define I40IW_UDA_QPSQ_DOLOOPBACK_MASK \
-	((u64)0x1 << I40IW_UDA_QPSQ_DOLOOPBACK_SHIFT)
-
-/* CQP SQ WQE common fields */
-#define I40IW_CQPSQ_OPCODE_SHIFT 32
-#define I40IW_CQPSQ_OPCODE_MASK (0x3fULL << I40IW_CQPSQ_OPCODE_SHIFT)
-
-#define I40IW_CQPSQ_WQEVALID_SHIFT 63
-#define I40IW_CQPSQ_WQEVALID_MASK (1ULL << I40IW_CQPSQ_WQEVALID_SHIFT)
-
-#define I40IW_CQPSQ_TPHVAL_SHIFT 0
-#define I40IW_CQPSQ_TPHVAL_MASK (0xffUL << I40IW_CQPSQ_TPHVAL_SHIFT)
-
-#define I40IW_CQPSQ_TPHEN_SHIFT 60
-#define I40IW_CQPSQ_TPHEN_MASK (1ULL << I40IW_CQPSQ_TPHEN_SHIFT)
-
-#define I40IW_CQPSQ_PBUFADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_PBUFADDR_MASK I40IW_CQPHC_QPCTX_MASK
-
-/* Create/Modify/Destroy QP */
-
-#define I40IW_CQPSQ_QP_NEWMSS_SHIFT 32
-#define I40IW_CQPSQ_QP_NEWMSS_MASK (0x3fffULL << I40IW_CQPSQ_QP_NEWMSS_SHIFT)
-
-#define I40IW_CQPSQ_QP_TERMLEN_SHIFT 48
-#define I40IW_CQPSQ_QP_TERMLEN_MASK (0xfULL << I40IW_CQPSQ_QP_TERMLEN_SHIFT)
-
-#define I40IW_CQPSQ_QP_QPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_QP_QPCTX_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_CQPSQ_QP_QPID_SHIFT 0
-#define I40IW_CQPSQ_QP_QPID_MASK (0x3FFFFUL)
-/* I40IWCQ_QPID_MASK */
-
-#define I40IW_CQPSQ_QP_OP_SHIFT 32
-#define I40IW_CQPSQ_QP_OP_MASK I40IWCQ_OP_MASK
-
-#define I40IW_CQPSQ_QP_ORDVALID_SHIFT 42
-#define I40IW_CQPSQ_QP_ORDVALID_MASK (1ULL << I40IW_CQPSQ_QP_ORDVALID_SHIFT)
-
-#define I40IW_CQPSQ_QP_TOECTXVALID_SHIFT 43
-#define I40IW_CQPSQ_QP_TOECTXVALID_MASK \
-	(1ULL << I40IW_CQPSQ_QP_TOECTXVALID_SHIFT)
-
-#define I40IW_CQPSQ_QP_CACHEDVARVALID_SHIFT 44
-#define I40IW_CQPSQ_QP_CACHEDVARVALID_MASK      \
-	(1ULL << I40IW_CQPSQ_QP_CACHEDVARVALID_SHIFT)
-
-#define I40IW_CQPSQ_QP_VQ_SHIFT 45
-#define I40IW_CQPSQ_QP_VQ_MASK (1ULL << I40IW_CQPSQ_QP_VQ_SHIFT)
-
-#define I40IW_CQPSQ_QP_FORCELOOPBACK_SHIFT 46
-#define I40IW_CQPSQ_QP_FORCELOOPBACK_MASK       \
-	(1ULL << I40IW_CQPSQ_QP_FORCELOOPBACK_SHIFT)
-
-#define I40IW_CQPSQ_QP_CQNUMVALID_SHIFT 47
-#define I40IW_CQPSQ_QP_CQNUMVALID_MASK  \
-	(1ULL << I40IW_CQPSQ_QP_CQNUMVALID_SHIFT)
-
-#define I40IW_CQPSQ_QP_QPTYPE_SHIFT 48
-#define I40IW_CQPSQ_QP_QPTYPE_MASK (0x3ULL << I40IW_CQPSQ_QP_QPTYPE_SHIFT)
-
-#define I40IW_CQPSQ_QP_MSSCHANGE_SHIFT 52
-#define I40IW_CQPSQ_QP_MSSCHANGE_MASK (1ULL << I40IW_CQPSQ_QP_MSSCHANGE_SHIFT)
-
-#define I40IW_CQPSQ_QP_IGNOREMWBOUND_SHIFT 54
-#define I40IW_CQPSQ_QP_IGNOREMWBOUND_MASK       \
-	(1ULL << I40IW_CQPSQ_QP_IGNOREMWBOUND_SHIFT)
-
-#define I40IW_CQPSQ_QP_REMOVEHASHENTRY_SHIFT 55
-#define I40IW_CQPSQ_QP_REMOVEHASHENTRY_MASK     \
-	(1ULL << I40IW_CQPSQ_QP_REMOVEHASHENTRY_SHIFT)
-
-#define I40IW_CQPSQ_QP_TERMACT_SHIFT 56
-#define I40IW_CQPSQ_QP_TERMACT_MASK (0x3ULL << I40IW_CQPSQ_QP_TERMACT_SHIFT)
-
-#define I40IW_CQPSQ_QP_RESETCON_SHIFT 58
-#define I40IW_CQPSQ_QP_RESETCON_MASK (1ULL << I40IW_CQPSQ_QP_RESETCON_SHIFT)
-
-#define I40IW_CQPSQ_QP_ARPTABIDXVALID_SHIFT 59
-#define I40IW_CQPSQ_QP_ARPTABIDXVALID_MASK      \
-	(1ULL << I40IW_CQPSQ_QP_ARPTABIDXVALID_SHIFT)
-
-#define I40IW_CQPSQ_QP_NEXTIWSTATE_SHIFT 60
-#define I40IW_CQPSQ_QP_NEXTIWSTATE_MASK \
-	(0x7ULL << I40IW_CQPSQ_QP_NEXTIWSTATE_SHIFT)
-
-#define I40IW_CQPSQ_QP_DBSHADOWADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_QP_DBSHADOWADDR_MASK I40IW_CQPHC_QPCTX_MASK
-
-/* Create/Modify/Destroy CQ */
-#define I40IW_CQPSQ_CQ_CQSIZE_SHIFT 0
-#define I40IW_CQPSQ_CQ_CQSIZE_MASK (0x3ffffUL << I40IW_CQPSQ_CQ_CQSIZE_SHIFT)
-
-#define I40IW_CQPSQ_CQ_CQCTX_SHIFT 0
-#define I40IW_CQPSQ_CQ_CQCTX_MASK       \
-	(0x7fffffffffffffffULL << I40IW_CQPSQ_CQ_CQCTX_SHIFT)
-
-#define I40IW_CQPSQ_CQ_CQCTX_SHIFT 0
-#define I40IW_CQPSQ_CQ_CQCTX_MASK       \
-	(0x7fffffffffffffffULL << I40IW_CQPSQ_CQ_CQCTX_SHIFT)
-
-#define I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_SHIFT 0
-#define I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_MASK       \
-	(0x3ffff << I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_SHIFT)
-
-#define I40IW_CQPSQ_CQ_CEQID_SHIFT 24
-#define I40IW_CQPSQ_CQ_CEQID_MASK (0x7fUL << I40IW_CQPSQ_CQ_CEQID_SHIFT)
-
-#define I40IW_CQPSQ_CQ_OP_SHIFT 32
-#define I40IW_CQPSQ_CQ_OP_MASK (0x3fULL << I40IW_CQPSQ_CQ_OP_SHIFT)
-
-#define I40IW_CQPSQ_CQ_CQRESIZE_SHIFT 43
-#define I40IW_CQPSQ_CQ_CQRESIZE_MASK (1ULL << I40IW_CQPSQ_CQ_CQRESIZE_SHIFT)
-
-#define I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT 44
-#define I40IW_CQPSQ_CQ_LPBLSIZE_MASK (3ULL << I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT)
-
-#define I40IW_CQPSQ_CQ_CHKOVERFLOW_SHIFT 46
-#define I40IW_CQPSQ_CQ_CHKOVERFLOW_MASK         \
-	(1ULL << I40IW_CQPSQ_CQ_CHKOVERFLOW_SHIFT)
-
-#define I40IW_CQPSQ_CQ_VIRTMAP_SHIFT 47
-#define I40IW_CQPSQ_CQ_VIRTMAP_MASK (1ULL << I40IW_CQPSQ_CQ_VIRTMAP_SHIFT)
-
-#define I40IW_CQPSQ_CQ_ENCEQEMASK_SHIFT 48
-#define I40IW_CQPSQ_CQ_ENCEQEMASK_MASK  \
-	(1ULL << I40IW_CQPSQ_CQ_ENCEQEMASK_SHIFT)
-
-#define I40IW_CQPSQ_CQ_CEQIDVALID_SHIFT 49
-#define I40IW_CQPSQ_CQ_CEQIDVALID_MASK  \
-	(1ULL << I40IW_CQPSQ_CQ_CEQIDVALID_SHIFT)
-
-#define I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_SHIFT 61
-#define I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_MASK      \
-	(1ULL << I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_SHIFT)
-
-/* Create/Modify/Destroy Shared Receive Queue */
-
-#define I40IW_CQPSQ_SRQ_RQSIZE_SHIFT 0
-#define I40IW_CQPSQ_SRQ_RQSIZE_MASK (0xfUL << I40IW_CQPSQ_SRQ_RQSIZE_SHIFT)
-
-#define I40IW_CQPSQ_SRQ_RQWQESIZE_SHIFT 4
-#define I40IW_CQPSQ_SRQ_RQWQESIZE_MASK \
-	(0x7UL << I40IW_CQPSQ_SRQ_RQWQESIZE_SHIFT)
-
-#define I40IW_CQPSQ_SRQ_SRQLIMIT_SHIFT 32
-#define I40IW_CQPSQ_SRQ_SRQLIMIT_MASK   \
-	(0xfffULL << I40IW_CQPSQ_SRQ_SRQLIMIT_SHIFT)
-
-#define I40IW_CQPSQ_SRQ_SRQCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_SRQ_SRQCTX_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_CQPSQ_SRQ_PDID_SHIFT 16
-#define I40IW_CQPSQ_SRQ_PDID_MASK       \
-	(0x7fffULL << I40IW_CQPSQ_SRQ_PDID_SHIFT)
-
-#define I40IW_CQPSQ_SRQ_SRQID_SHIFT 0
-#define I40IW_CQPSQ_SRQ_SRQID_MASK (0x7fffUL << I40IW_CQPSQ_SRQ_SRQID_SHIFT)
-
-#define I40IW_CQPSQ_SRQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT
-#define I40IW_CQPSQ_SRQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK
-
-#define I40IW_CQPSQ_SRQ_VIRTMAP_SHIFT I40IW_CQPSQ_CQ_VIRTMAP_SHIFT
-#define I40IW_CQPSQ_SRQ_VIRTMAP_MASK I40IW_CQPSQ_CQ_VIRTMAP_MASK
-
-#define I40IW_CQPSQ_SRQ_TPHEN_SHIFT I40IW_CQPSQ_TPHEN_SHIFT
-#define I40IW_CQPSQ_SRQ_TPHEN_MASK I40IW_CQPSQ_TPHEN_MASK
-
-#define I40IW_CQPSQ_SRQ_ARMLIMITEVENT_SHIFT 61
-#define I40IW_CQPSQ_SRQ_ARMLIMITEVENT_MASK      \
-	(1ULL << I40IW_CQPSQ_SRQ_ARMLIMITEVENT_SHIFT)
-
-#define I40IW_CQPSQ_SRQ_DBSHADOWAREA_SHIFT 6
-#define I40IW_CQPSQ_SRQ_DBSHADOWAREA_MASK       \
-	(0x3ffffffffffffffULL << I40IW_CQPSQ_SRQ_DBSHADOWAREA_SHIFT)
-
-#define I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_SHIFT 0
-#define I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_MASK      \
-	(0xfffffffUL << I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_SHIFT)
-
-/* Allocate/Register/Register Shared/Deallocate Stag */
-#define I40IW_CQPSQ_STAG_VA_FBO_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_STAG_VA_FBO_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_CQPSQ_STAG_STAGLEN_SHIFT 0
-#define I40IW_CQPSQ_STAG_STAGLEN_MASK   \
-	(0x3fffffffffffULL << I40IW_CQPSQ_STAG_STAGLEN_SHIFT)
-
-#define I40IW_CQPSQ_STAG_PDID_SHIFT 48
-#define I40IW_CQPSQ_STAG_PDID_MASK (0x7fffULL << I40IW_CQPSQ_STAG_PDID_SHIFT)
-
-#define I40IW_CQPSQ_STAG_KEY_SHIFT 0
-#define I40IW_CQPSQ_STAG_KEY_MASK (0xffUL << I40IW_CQPSQ_STAG_KEY_SHIFT)
-
-#define I40IW_CQPSQ_STAG_IDX_SHIFT 8
-#define I40IW_CQPSQ_STAG_IDX_MASK (0xffffffUL << I40IW_CQPSQ_STAG_IDX_SHIFT)
-
-#define I40IW_CQPSQ_STAG_PARENTSTAGIDX_SHIFT 32
-#define I40IW_CQPSQ_STAG_PARENTSTAGIDX_MASK     \
-	(0xffffffULL << I40IW_CQPSQ_STAG_PARENTSTAGIDX_SHIFT)
-
-#define I40IW_CQPSQ_STAG_MR_SHIFT 43
-#define I40IW_CQPSQ_STAG_MR_MASK (1ULL << I40IW_CQPSQ_STAG_MR_SHIFT)
-
-#define I40IW_CQPSQ_STAG_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT
-#define I40IW_CQPSQ_STAG_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK
-
-#define I40IW_CQPSQ_STAG_HPAGESIZE_SHIFT 46
-#define I40IW_CQPSQ_STAG_HPAGESIZE_MASK \
-	(1ULL << I40IW_CQPSQ_STAG_HPAGESIZE_SHIFT)
-
-#define I40IW_CQPSQ_STAG_ARIGHTS_SHIFT 48
-#define I40IW_CQPSQ_STAG_ARIGHTS_MASK   \
-	(0x1fULL << I40IW_CQPSQ_STAG_ARIGHTS_SHIFT)
-
-#define I40IW_CQPSQ_STAG_REMACCENABLED_SHIFT 53
-#define I40IW_CQPSQ_STAG_REMACCENABLED_MASK     \
-	(1ULL << I40IW_CQPSQ_STAG_REMACCENABLED_SHIFT)
-
-#define I40IW_CQPSQ_STAG_VABASEDTO_SHIFT 59
-#define I40IW_CQPSQ_STAG_VABASEDTO_MASK \
-	(1ULL << I40IW_CQPSQ_STAG_VABASEDTO_SHIFT)
-
-#define I40IW_CQPSQ_STAG_USEHMCFNIDX_SHIFT 60
-#define I40IW_CQPSQ_STAG_USEHMCFNIDX_MASK       \
-	(1ULL << I40IW_CQPSQ_STAG_USEHMCFNIDX_SHIFT)
-
-#define I40IW_CQPSQ_STAG_USEPFRID_SHIFT 61
-#define I40IW_CQPSQ_STAG_USEPFRID_MASK  \
-	(1ULL << I40IW_CQPSQ_STAG_USEPFRID_SHIFT)
-
-#define I40IW_CQPSQ_STAG_PBA_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_STAG_PBA_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_CQPSQ_STAG_HMCFNIDX_SHIFT 0
-#define I40IW_CQPSQ_STAG_HMCFNIDX_MASK \
-	(0x3fUL << I40IW_CQPSQ_STAG_HMCFNIDX_SHIFT)
-
-#define I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_SHIFT 0
-#define I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_MASK     \
-	(0xfffffffUL << I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_SHIFT)
-
-/* Query stag */
-#define I40IW_CQPSQ_QUERYSTAG_IDX_SHIFT I40IW_CQPSQ_STAG_IDX_SHIFT
-#define I40IW_CQPSQ_QUERYSTAG_IDX_MASK I40IW_CQPSQ_STAG_IDX_MASK
-
-/* Allocate Local IP Address Entry */
-
-/* Manage Local IP Address Table - MLIPA */
-#define I40IW_CQPSQ_MLIPA_IPV6LO_SHIFT  I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_MLIPA_IPV6LO_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_CQPSQ_MLIPA_IPV6HI_SHIFT  I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_MLIPA_IPV6HI_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_CQPSQ_MLIPA_IPV4_SHIFT 0
-#define I40IW_CQPSQ_MLIPA_IPV4_MASK \
-	(0xffffffffUL << I40IW_CQPSQ_MLIPA_IPV4_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_IPTABLEIDX_SHIFT 0
-#define I40IW_CQPSQ_MLIPA_IPTABLEIDX_MASK       \
-	(0x3fUL << I40IW_CQPSQ_MLIPA_IPTABLEIDX_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_IPV4VALID_SHIFT 42
-#define I40IW_CQPSQ_MLIPA_IPV4VALID_MASK        \
-	(1ULL << I40IW_CQPSQ_MLIPA_IPV4VALID_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_IPV6VALID_SHIFT 43
-#define I40IW_CQPSQ_MLIPA_IPV6VALID_MASK        \
-	(1ULL << I40IW_CQPSQ_MLIPA_IPV6VALID_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_FREEENTRY_SHIFT 62
-#define I40IW_CQPSQ_MLIPA_FREEENTRY_MASK        \
-	(1ULL << I40IW_CQPSQ_MLIPA_FREEENTRY_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_SHIFT 61
-#define I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_MASK   \
-	(1ULL << I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_MAC0_SHIFT 0
-#define I40IW_CQPSQ_MLIPA_MAC0_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC0_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_MAC1_SHIFT 8
-#define I40IW_CQPSQ_MLIPA_MAC1_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC1_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_MAC2_SHIFT 16
-#define I40IW_CQPSQ_MLIPA_MAC2_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC2_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_MAC3_SHIFT 24
-#define I40IW_CQPSQ_MLIPA_MAC3_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC3_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_MAC4_SHIFT 32
-#define I40IW_CQPSQ_MLIPA_MAC4_MASK (0xffULL << I40IW_CQPSQ_MLIPA_MAC4_SHIFT)
-
-#define I40IW_CQPSQ_MLIPA_MAC5_SHIFT 40
-#define I40IW_CQPSQ_MLIPA_MAC5_MASK (0xffULL << I40IW_CQPSQ_MLIPA_MAC5_SHIFT)
-
-/* Manage ARP Table  - MAT */
-#define I40IW_CQPSQ_MAT_REACHMAX_SHIFT 0
-#define I40IW_CQPSQ_MAT_REACHMAX_MASK   \
-	(0xffffffffUL << I40IW_CQPSQ_MAT_REACHMAX_SHIFT)
-
-#define I40IW_CQPSQ_MAT_MACADDR_SHIFT 0
-#define I40IW_CQPSQ_MAT_MACADDR_MASK    \
-	(0xffffffffffffULL << I40IW_CQPSQ_MAT_MACADDR_SHIFT)
-
-#define I40IW_CQPSQ_MAT_ARPENTRYIDX_SHIFT 0
-#define I40IW_CQPSQ_MAT_ARPENTRYIDX_MASK        \
-	(0xfffUL << I40IW_CQPSQ_MAT_ARPENTRYIDX_SHIFT)
-
-#define I40IW_CQPSQ_MAT_ENTRYVALID_SHIFT 42
-#define I40IW_CQPSQ_MAT_ENTRYVALID_MASK \
-	(1ULL << I40IW_CQPSQ_MAT_ENTRYVALID_SHIFT)
-
-#define I40IW_CQPSQ_MAT_PERMANENT_SHIFT 43
-#define I40IW_CQPSQ_MAT_PERMANENT_MASK  \
-	(1ULL << I40IW_CQPSQ_MAT_PERMANENT_SHIFT)
-
-#define I40IW_CQPSQ_MAT_QUERY_SHIFT 44
-#define I40IW_CQPSQ_MAT_QUERY_MASK (1ULL << I40IW_CQPSQ_MAT_QUERY_SHIFT)
-
-/* Manage VF PBLE Backing Pages - MVPBP*/
-#define I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_SHIFT 0
-#define I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_MASK \
-	(0x3ffULL << I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_SHIFT)
-
-#define I40IW_CQPSQ_MVPBP_FIRST_PD_INX_SHIFT 16
-#define I40IW_CQPSQ_MVPBP_FIRST_PD_INX_MASK \
-	(0x1ffULL << I40IW_CQPSQ_MVPBP_FIRST_PD_INX_SHIFT)
-
-#define I40IW_CQPSQ_MVPBP_SD_INX_SHIFT 32
-#define I40IW_CQPSQ_MVPBP_SD_INX_MASK \
-	(0xfffULL << I40IW_CQPSQ_MVPBP_SD_INX_SHIFT)
-
-#define I40IW_CQPSQ_MVPBP_INV_PD_ENT_SHIFT 62
-#define I40IW_CQPSQ_MVPBP_INV_PD_ENT_MASK \
-	(0x1ULL << I40IW_CQPSQ_MVPBP_INV_PD_ENT_SHIFT)
-
-#define I40IW_CQPSQ_MVPBP_PD_PLPBA_SHIFT 3
-#define I40IW_CQPSQ_MVPBP_PD_PLPBA_MASK \
-	(0x1fffffffffffffffULL << I40IW_CQPSQ_MVPBP_PD_PLPBA_SHIFT)
-
-#define I40IW_INVALID_PUSH_PAGE_INDEX 0xffff
-
-#define I40IW_CQPSQ_MPP_QS_HANDLE_SHIFT 0
-#define I40IW_CQPSQ_MPP_QS_HANDLE_MASK (0xffffUL << \
-					I40IW_CQPSQ_MPP_QS_HANDLE_SHIFT)
-
-#define I40IW_CQPSQ_MPP_PPIDX_SHIFT 0
-#define I40IW_CQPSQ_MPP_PPIDX_MASK (0x3ffUL << I40IW_CQPSQ_MPP_PPIDX_SHIFT)
-
-#define I40IW_CQPSQ_MPP_FREE_PAGE_SHIFT 62
-#define I40IW_CQPSQ_MPP_FREE_PAGE_MASK (1ULL << I40IW_CQPSQ_MPP_FREE_PAGE_SHIFT)
-
-/* Upload Context - UCTX */
-#define I40IW_CQPSQ_UCTX_QPCTXADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IW_CQPSQ_UCTX_QPCTXADDR_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IW_CQPSQ_UCTX_QPID_SHIFT 0
-#define I40IW_CQPSQ_UCTX_QPID_MASK (0x3ffffUL << I40IW_CQPSQ_UCTX_QPID_SHIFT)
-
-#define I40IW_CQPSQ_UCTX_QPTYPE_SHIFT 48
-#define I40IW_CQPSQ_UCTX_QPTYPE_MASK (0xfULL << I40IW_CQPSQ_UCTX_QPTYPE_SHIFT)
-
-#define I40IW_CQPSQ_UCTX_RAWFORMAT_SHIFT 61
-#define I40IW_CQPSQ_UCTX_RAWFORMAT_MASK \
-	(1ULL << I40IW_CQPSQ_UCTX_RAWFORMAT_SHIFT)
-
-#define I40IW_CQPSQ_UCTX_FREEZEQP_SHIFT 62
-#define I40IW_CQPSQ_UCTX_FREEZEQP_MASK  \
-	(1ULL << I40IW_CQPSQ_UCTX_FREEZEQP_SHIFT)
-
-/* Manage HMC PM Function Table - MHMC */
-#define I40IW_CQPSQ_MHMC_VFIDX_SHIFT 0
-#define I40IW_CQPSQ_MHMC_VFIDX_MASK (0x7fUL << I40IW_CQPSQ_MHMC_VFIDX_SHIFT)
-
-#define I40IW_CQPSQ_MHMC_FREEPMFN_SHIFT 62
-#define I40IW_CQPSQ_MHMC_FREEPMFN_MASK  \
-	(1ULL << I40IW_CQPSQ_MHMC_FREEPMFN_SHIFT)
-
-/* Set HMC Resource Profile - SHMCRP */
-#define I40IW_CQPSQ_SHMCRP_HMC_PROFILE_SHIFT 0
-#define I40IW_CQPSQ_SHMCRP_HMC_PROFILE_MASK \
-	(0x7ULL << I40IW_CQPSQ_SHMCRP_HMC_PROFILE_SHIFT)
-#define I40IW_CQPSQ_SHMCRP_VFNUM_SHIFT 32
-#define I40IW_CQPSQ_SHMCRP_VFNUM_MASK (0x3fULL << I40IW_CQPSQ_SHMCRP_VFNUM_SHIFT)
-
-/* Create/Destroy CEQ */
-#define I40IW_CQPSQ_CEQ_CEQSIZE_SHIFT 0
-#define I40IW_CQPSQ_CEQ_CEQSIZE_MASK \
-	(0x1ffffUL << I40IW_CQPSQ_CEQ_CEQSIZE_SHIFT)
-
-#define I40IW_CQPSQ_CEQ_CEQID_SHIFT 0
-#define I40IW_CQPSQ_CEQ_CEQID_MASK (0x7fUL << I40IW_CQPSQ_CEQ_CEQID_SHIFT)
-
-#define I40IW_CQPSQ_CEQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT
-#define I40IW_CQPSQ_CEQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK
-
-#define I40IW_CQPSQ_CEQ_VMAP_SHIFT 47
-#define I40IW_CQPSQ_CEQ_VMAP_MASK (1ULL << I40IW_CQPSQ_CEQ_VMAP_SHIFT)
-
-#define I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_SHIFT 0
-#define I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_MASK      \
-	(0xfffffffUL << I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_SHIFT)
-
-/* Create/Destroy AEQ */
-#define I40IW_CQPSQ_AEQ_AEQECNT_SHIFT 0
-#define I40IW_CQPSQ_AEQ_AEQECNT_MASK \
-	(0x7ffffUL << I40IW_CQPSQ_AEQ_AEQECNT_SHIFT)
-
-#define I40IW_CQPSQ_AEQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT
-#define I40IW_CQPSQ_AEQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK
-
-#define I40IW_CQPSQ_AEQ_VMAP_SHIFT 47
-#define I40IW_CQPSQ_AEQ_VMAP_MASK (1ULL << I40IW_CQPSQ_AEQ_VMAP_SHIFT)
-
-#define I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_SHIFT 0
-#define I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_MASK      \
-	(0xfffffffUL << I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_SHIFT)
-
-/* Commit FPM Values - CFPM */
-#define I40IW_CQPSQ_CFPM_HMCFNID_SHIFT 0
-#define I40IW_CQPSQ_CFPM_HMCFNID_MASK (0x3fUL << I40IW_CQPSQ_CFPM_HMCFNID_SHIFT)
-
-/* Flush WQEs - FWQE */
-#define I40IW_CQPSQ_FWQE_AECODE_SHIFT 0
-#define I40IW_CQPSQ_FWQE_AECODE_MASK (0xffffUL << I40IW_CQPSQ_FWQE_AECODE_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_AESOURCE_SHIFT 16
-#define I40IW_CQPSQ_FWQE_AESOURCE_MASK \
-	(0xfUL << I40IW_CQPSQ_FWQE_AESOURCE_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_RQMNERR_SHIFT 0
-#define I40IW_CQPSQ_FWQE_RQMNERR_MASK \
-	(0xffffUL << I40IW_CQPSQ_FWQE_RQMNERR_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_RQMJERR_SHIFT 16
-#define I40IW_CQPSQ_FWQE_RQMJERR_MASK \
-	(0xffffUL << I40IW_CQPSQ_FWQE_RQMJERR_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_SQMNERR_SHIFT 32
-#define I40IW_CQPSQ_FWQE_SQMNERR_MASK   \
-	(0xffffULL << I40IW_CQPSQ_FWQE_SQMNERR_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_SQMJERR_SHIFT 48
-#define I40IW_CQPSQ_FWQE_SQMJERR_MASK   \
-	(0xffffULL << I40IW_CQPSQ_FWQE_SQMJERR_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_QPID_SHIFT 0
-#define I40IW_CQPSQ_FWQE_QPID_MASK (0x3ffffULL << I40IW_CQPSQ_FWQE_QPID_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_GENERATE_AE_SHIFT 59
-#define I40IW_CQPSQ_FWQE_GENERATE_AE_MASK (1ULL <<      \
-					   I40IW_CQPSQ_FWQE_GENERATE_AE_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_USERFLCODE_SHIFT 60
-#define I40IW_CQPSQ_FWQE_USERFLCODE_MASK        \
-	(1ULL << I40IW_CQPSQ_FWQE_USERFLCODE_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_FLUSHSQ_SHIFT 61
-#define I40IW_CQPSQ_FWQE_FLUSHSQ_MASK (1ULL << I40IW_CQPSQ_FWQE_FLUSHSQ_SHIFT)
-
-#define I40IW_CQPSQ_FWQE_FLUSHRQ_SHIFT 62
-#define I40IW_CQPSQ_FWQE_FLUSHRQ_MASK (1ULL << I40IW_CQPSQ_FWQE_FLUSHRQ_SHIFT)
-
-/* Manage Accelerated Port Table - MAPT */
-#define I40IW_CQPSQ_MAPT_PORT_SHIFT 0
-#define I40IW_CQPSQ_MAPT_PORT_MASK (0xffffUL << I40IW_CQPSQ_MAPT_PORT_SHIFT)
-
-#define I40IW_CQPSQ_MAPT_ADDPORT_SHIFT 62
-#define I40IW_CQPSQ_MAPT_ADDPORT_MASK (1ULL << I40IW_CQPSQ_MAPT_ADDPORT_SHIFT)
-
-/* Update Protocol Engine SDs */
-#define I40IW_CQPSQ_UPESD_SDCMD_SHIFT 0
-#define I40IW_CQPSQ_UPESD_SDCMD_MASK (0xffffffffUL << I40IW_CQPSQ_UPESD_SDCMD_SHIFT)
-
-#define I40IW_CQPSQ_UPESD_SDDATALOW_SHIFT 0
-#define I40IW_CQPSQ_UPESD_SDDATALOW_MASK        \
-	(0xffffffffUL << I40IW_CQPSQ_UPESD_SDDATALOW_SHIFT)
-
-#define I40IW_CQPSQ_UPESD_SDDATAHI_SHIFT 32
-#define I40IW_CQPSQ_UPESD_SDDATAHI_MASK \
-	(0xffffffffULL << I40IW_CQPSQ_UPESD_SDDATAHI_SHIFT)
-#define I40IW_CQPSQ_UPESD_HMCFNID_SHIFT 0
-#define I40IW_CQPSQ_UPESD_HMCFNID_MASK  \
-	(0x3fUL << I40IW_CQPSQ_UPESD_HMCFNID_SHIFT)
-
-#define I40IW_CQPSQ_UPESD_ENTRY_VALID_SHIFT 63
-#define I40IW_CQPSQ_UPESD_ENTRY_VALID_MASK      \
-	((u64)1 << I40IW_CQPSQ_UPESD_ENTRY_VALID_SHIFT)
-
-#define I40IW_CQPSQ_UPESD_ENTRY_COUNT_SHIFT 0
-#define I40IW_CQPSQ_UPESD_ENTRY_COUNT_MASK      \
-	(0xfUL << I40IW_CQPSQ_UPESD_ENTRY_COUNT_SHIFT)
-
-#define I40IW_CQPSQ_UPESD_SKIP_ENTRY_SHIFT 7
-#define I40IW_CQPSQ_UPESD_SKIP_ENTRY_MASK       \
-	(0x1UL << I40IW_CQPSQ_UPESD_SKIP_ENTRY_SHIFT)
-
-/* Suspend QP */
-#define I40IW_CQPSQ_SUSPENDQP_QPID_SHIFT 0
-#define I40IW_CQPSQ_SUSPENDQP_QPID_MASK (0x3FFFFUL)
-/* I40IWCQ_QPID_MASK */
-
-/* Resume QP */
-#define I40IW_CQPSQ_RESUMEQP_QSHANDLE_SHIFT 0
-#define I40IW_CQPSQ_RESUMEQP_QSHANDLE_MASK      \
-	(0xffffffffUL << I40IW_CQPSQ_RESUMEQP_QSHANDLE_SHIFT)
-
-#define I40IW_CQPSQ_RESUMEQP_QPID_SHIFT 0
-#define I40IW_CQPSQ_RESUMEQP_QPID_MASK (0x3FFFFUL)
-/* I40IWCQ_QPID_MASK */
-
-/* IW QP Context */
-#define I40IWQPC_DDP_VER_SHIFT 0
-#define I40IWQPC_DDP_VER_MASK (3UL << I40IWQPC_DDP_VER_SHIFT)
-
-#define I40IWQPC_SNAP_SHIFT 2
-#define I40IWQPC_SNAP_MASK (1UL << I40IWQPC_SNAP_SHIFT)
-
-#define I40IWQPC_IPV4_SHIFT 3
-#define I40IWQPC_IPV4_MASK (1UL << I40IWQPC_IPV4_SHIFT)
-
-#define I40IWQPC_NONAGLE_SHIFT 4
-#define I40IWQPC_NONAGLE_MASK (1UL << I40IWQPC_NONAGLE_SHIFT)
-
-#define I40IWQPC_INSERTVLANTAG_SHIFT 5
-#define I40IWQPC_INSERTVLANTAG_MASK (1 << I40IWQPC_INSERTVLANTAG_SHIFT)
-
-#define I40IWQPC_USESRQ_SHIFT 6
-#define I40IWQPC_USESRQ_MASK (1UL << I40IWQPC_USESRQ_SHIFT)
-
-#define I40IWQPC_TIMESTAMP_SHIFT 7
-#define I40IWQPC_TIMESTAMP_MASK (1UL << I40IWQPC_TIMESTAMP_SHIFT)
-
-#define I40IWQPC_RQWQESIZE_SHIFT 8
-#define I40IWQPC_RQWQESIZE_MASK (3UL << I40IWQPC_RQWQESIZE_SHIFT)
-
-#define I40IWQPC_INSERTL2TAG2_SHIFT 11
-#define I40IWQPC_INSERTL2TAG2_MASK (1UL << I40IWQPC_INSERTL2TAG2_SHIFT)
-
-#define I40IWQPC_LIMIT_SHIFT 12
-#define I40IWQPC_LIMIT_MASK (3UL << I40IWQPC_LIMIT_SHIFT)
-
-#define I40IWQPC_DROPOOOSEG_SHIFT 15
-#define I40IWQPC_DROPOOOSEG_MASK (1UL << I40IWQPC_DROPOOOSEG_SHIFT)
-
-#define I40IWQPC_DUPACK_THRESH_SHIFT 16
-#define I40IWQPC_DUPACK_THRESH_MASK (7UL << I40IWQPC_DUPACK_THRESH_SHIFT)
-
-#define I40IWQPC_ERR_RQ_IDX_VALID_SHIFT 19
-#define I40IWQPC_ERR_RQ_IDX_VALID_MASK  (1UL << I40IWQPC_ERR_RQ_IDX_VALID_SHIFT)
-
-#define I40IWQPC_DIS_VLAN_CHECKS_SHIFT 19
-#define I40IWQPC_DIS_VLAN_CHECKS_MASK (7UL << I40IWQPC_DIS_VLAN_CHECKS_SHIFT)
-
-#define I40IWQPC_RCVTPHEN_SHIFT 28
-#define I40IWQPC_RCVTPHEN_MASK (1UL << I40IWQPC_RCVTPHEN_SHIFT)
-
-#define I40IWQPC_XMITTPHEN_SHIFT 29
-#define I40IWQPC_XMITTPHEN_MASK (1ULL << I40IWQPC_XMITTPHEN_SHIFT)
-
-#define I40IWQPC_RQTPHEN_SHIFT 30
-#define I40IWQPC_RQTPHEN_MASK (1UL << I40IWQPC_RQTPHEN_SHIFT)
-
-#define I40IWQPC_SQTPHEN_SHIFT 31
-#define I40IWQPC_SQTPHEN_MASK (1ULL << I40IWQPC_SQTPHEN_SHIFT)
-
-#define I40IWQPC_PPIDX_SHIFT 32
-#define I40IWQPC_PPIDX_MASK (0x3ffULL << I40IWQPC_PPIDX_SHIFT)
-
-#define I40IWQPC_PMENA_SHIFT 47
-#define I40IWQPC_PMENA_MASK (1ULL << I40IWQPC_PMENA_SHIFT)
-
-#define I40IWQPC_RDMAP_VER_SHIFT 62
-#define I40IWQPC_RDMAP_VER_MASK (3ULL << I40IWQPC_RDMAP_VER_SHIFT)
-
-#define I40IWQPC_SQADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IWQPC_SQADDR_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IWQPC_RQADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IWQPC_RQADDR_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IWQPC_TTL_SHIFT 0
-#define I40IWQPC_TTL_MASK (0xffUL << I40IWQPC_TTL_SHIFT)
-
-#define I40IWQPC_RQSIZE_SHIFT 8
-#define I40IWQPC_RQSIZE_MASK (0xfUL << I40IWQPC_RQSIZE_SHIFT)
-
-#define I40IWQPC_SQSIZE_SHIFT 12
-#define I40IWQPC_SQSIZE_MASK (0xfUL << I40IWQPC_SQSIZE_SHIFT)
-
-#define I40IWQPC_SRCMACADDRIDX_SHIFT 16
-#define I40IWQPC_SRCMACADDRIDX_MASK (0x3fUL << I40IWQPC_SRCMACADDRIDX_SHIFT)
-
-#define I40IWQPC_AVOIDSTRETCHACK_SHIFT 23
-#define I40IWQPC_AVOIDSTRETCHACK_MASK (1UL << I40IWQPC_AVOIDSTRETCHACK_SHIFT)
-
-#define I40IWQPC_TOS_SHIFT 24
-#define I40IWQPC_TOS_MASK (0xffUL << I40IWQPC_TOS_SHIFT)
-
-#define I40IWQPC_SRCPORTNUM_SHIFT 32
-#define I40IWQPC_SRCPORTNUM_MASK (0xffffULL << I40IWQPC_SRCPORTNUM_SHIFT)
-
-#define I40IWQPC_DESTPORTNUM_SHIFT 48
-#define I40IWQPC_DESTPORTNUM_MASK (0xffffULL << I40IWQPC_DESTPORTNUM_SHIFT)
-
-#define I40IWQPC_DESTIPADDR0_SHIFT 32
-#define I40IWQPC_DESTIPADDR0_MASK       \
-	(0xffffffffULL << I40IWQPC_DESTIPADDR0_SHIFT)
-
-#define I40IWQPC_DESTIPADDR1_SHIFT 0
-#define I40IWQPC_DESTIPADDR1_MASK       \
-	(0xffffffffULL << I40IWQPC_DESTIPADDR1_SHIFT)
-
-#define I40IWQPC_DESTIPADDR2_SHIFT 32
-#define I40IWQPC_DESTIPADDR2_MASK       \
-	(0xffffffffULL << I40IWQPC_DESTIPADDR2_SHIFT)
-
-#define I40IWQPC_DESTIPADDR3_SHIFT 0
-#define I40IWQPC_DESTIPADDR3_MASK       \
-	(0xffffffffULL << I40IWQPC_DESTIPADDR3_SHIFT)
-
-#define I40IWQPC_SNDMSS_SHIFT 16
-#define I40IWQPC_SNDMSS_MASK (0x3fffUL << I40IWQPC_SNDMSS_SHIFT)
-
-#define I40IW_UDA_QPC_MAXFRAMESIZE_SHIFT 16
-#define I40IW_UDA_QPC_MAXFRAMESIZE_MASK (0x3fffUL << I40IW_UDA_QPC_MAXFRAMESIZE_SHIFT)
-
-#define I40IWQPC_VLANTAG_SHIFT 32
-#define I40IWQPC_VLANTAG_MASK (0xffffULL << I40IWQPC_VLANTAG_SHIFT)
-
-#define I40IWQPC_ARPIDX_SHIFT 48
-#define I40IWQPC_ARPIDX_MASK (0xffffULL << I40IWQPC_ARPIDX_SHIFT)
-
-#define I40IWQPC_FLOWLABEL_SHIFT 0
-#define I40IWQPC_FLOWLABEL_MASK (0xfffffUL << I40IWQPC_FLOWLABEL_SHIFT)
-
-#define I40IWQPC_WSCALE_SHIFT 20
-#define I40IWQPC_WSCALE_MASK (1UL << I40IWQPC_WSCALE_SHIFT)
-
-#define I40IWQPC_KEEPALIVE_SHIFT 21
-#define I40IWQPC_KEEPALIVE_MASK (1UL << I40IWQPC_KEEPALIVE_SHIFT)
-
-#define I40IWQPC_IGNORE_TCP_OPT_SHIFT 22
-#define I40IWQPC_IGNORE_TCP_OPT_MASK (1UL << I40IWQPC_IGNORE_TCP_OPT_SHIFT)
-
-#define I40IWQPC_IGNORE_TCP_UNS_OPT_SHIFT 23
-#define I40IWQPC_IGNORE_TCP_UNS_OPT_MASK        \
-	(1UL << I40IWQPC_IGNORE_TCP_UNS_OPT_SHIFT)
-
-#define I40IWQPC_TCPSTATE_SHIFT 28
-#define I40IWQPC_TCPSTATE_MASK (0xfUL << I40IWQPC_TCPSTATE_SHIFT)
-
-#define I40IWQPC_RCVSCALE_SHIFT 32
-#define I40IWQPC_RCVSCALE_MASK (0xfULL << I40IWQPC_RCVSCALE_SHIFT)
-
-#define I40IWQPC_SNDSCALE_SHIFT 40
-#define I40IWQPC_SNDSCALE_MASK (0xfULL << I40IWQPC_SNDSCALE_SHIFT)
-
-#define I40IWQPC_PDIDX_SHIFT 48
-#define I40IWQPC_PDIDX_MASK (0x7fffULL << I40IWQPC_PDIDX_SHIFT)
-
-#define I40IWQPC_KALIVE_TIMER_MAX_PROBES_SHIFT 16
-#define I40IWQPC_KALIVE_TIMER_MAX_PROBES_MASK   \
-	(0xffUL << I40IWQPC_KALIVE_TIMER_MAX_PROBES_SHIFT)
-
-#define I40IWQPC_KEEPALIVE_INTERVAL_SHIFT 24
-#define I40IWQPC_KEEPALIVE_INTERVAL_MASK        \
-	(0xffUL << I40IWQPC_KEEPALIVE_INTERVAL_SHIFT)
-
-#define I40IWQPC_TIMESTAMP_RECENT_SHIFT 0
-#define I40IWQPC_TIMESTAMP_RECENT_MASK  \
-	(0xffffffffUL << I40IWQPC_TIMESTAMP_RECENT_SHIFT)
-
-#define I40IWQPC_TIMESTAMP_AGE_SHIFT 32
-#define I40IWQPC_TIMESTAMP_AGE_MASK     \
-	(0xffffffffULL << I40IWQPC_TIMESTAMP_AGE_SHIFT)
-
-#define I40IWQPC_SNDNXT_SHIFT 0
-#define I40IWQPC_SNDNXT_MASK (0xffffffffUL << I40IWQPC_SNDNXT_SHIFT)
-
-#define I40IWQPC_SNDWND_SHIFT 32
-#define I40IWQPC_SNDWND_MASK (0xffffffffULL << I40IWQPC_SNDWND_SHIFT)
-
-#define I40IWQPC_RCVNXT_SHIFT 0
-#define I40IWQPC_RCVNXT_MASK (0xffffffffUL << I40IWQPC_RCVNXT_SHIFT)
-
-#define I40IWQPC_RCVWND_SHIFT 32
-#define I40IWQPC_RCVWND_MASK (0xffffffffULL << I40IWQPC_RCVWND_SHIFT)
-
-#define I40IWQPC_SNDMAX_SHIFT 0
-#define I40IWQPC_SNDMAX_MASK (0xffffffffUL << I40IWQPC_SNDMAX_SHIFT)
-
-#define I40IWQPC_SNDUNA_SHIFT 32
-#define I40IWQPC_SNDUNA_MASK (0xffffffffULL << I40IWQPC_SNDUNA_SHIFT)
-
-#define I40IWQPC_SRTT_SHIFT 0
-#define I40IWQPC_SRTT_MASK (0xffffffffUL << I40IWQPC_SRTT_SHIFT)
-
-#define I40IWQPC_RTTVAR_SHIFT 32
-#define I40IWQPC_RTTVAR_MASK (0xffffffffULL << I40IWQPC_RTTVAR_SHIFT)
-
-#define I40IWQPC_SSTHRESH_SHIFT 0
-#define I40IWQPC_SSTHRESH_MASK (0xffffffffUL << I40IWQPC_SSTHRESH_SHIFT)
-
-#define I40IWQPC_CWND_SHIFT 32
-#define I40IWQPC_CWND_MASK (0xffffffffULL << I40IWQPC_CWND_SHIFT)
-
-#define I40IWQPC_SNDWL1_SHIFT 0
-#define I40IWQPC_SNDWL1_MASK (0xffffffffUL << I40IWQPC_SNDWL1_SHIFT)
-
-#define I40IWQPC_SNDWL2_SHIFT 32
-#define I40IWQPC_SNDWL2_MASK (0xffffffffULL << I40IWQPC_SNDWL2_SHIFT)
-
-#define I40IWQPC_ERR_RQ_IDX_SHIFT 32
-#define I40IWQPC_ERR_RQ_IDX_MASK  (0x3fffULL << I40IWQPC_ERR_RQ_IDX_SHIFT)
-
-#define I40IWQPC_MAXSNDWND_SHIFT 0
-#define I40IWQPC_MAXSNDWND_MASK (0xffffffffUL << I40IWQPC_MAXSNDWND_SHIFT)
-
-#define I40IWQPC_REXMIT_THRESH_SHIFT 48
-#define I40IWQPC_REXMIT_THRESH_MASK (0x3fULL << I40IWQPC_REXMIT_THRESH_SHIFT)
-
-#define I40IWQPC_TXCQNUM_SHIFT 0
-#define I40IWQPC_TXCQNUM_MASK (0x1ffffUL << I40IWQPC_TXCQNUM_SHIFT)
-
-#define I40IWQPC_RXCQNUM_SHIFT 32
-#define I40IWQPC_RXCQNUM_MASK (0x1ffffULL << I40IWQPC_RXCQNUM_SHIFT)
-
-#define I40IWQPC_STAT_INDEX_SHIFT 0
-#define I40IWQPC_STAT_INDEX_MASK (0x1fULL << I40IWQPC_STAT_INDEX_SHIFT)
-
-#define I40IWQPC_Q2ADDR_SHIFT 0
-#define I40IWQPC_Q2ADDR_MASK (0xffffffffffffff00ULL << I40IWQPC_Q2ADDR_SHIFT)
-
-#define I40IWQPC_LASTBYTESENT_SHIFT 0
-#define I40IWQPC_LASTBYTESENT_MASK (0xffUL << I40IWQPC_LASTBYTESENT_SHIFT)
-
-#define I40IWQPC_SRQID_SHIFT 32
-#define I40IWQPC_SRQID_MASK (0xffULL << I40IWQPC_SRQID_SHIFT)
-
-#define I40IWQPC_ORDSIZE_SHIFT 0
-#define I40IWQPC_ORDSIZE_MASK (0x7fUL << I40IWQPC_ORDSIZE_SHIFT)
-
-#define I40IWQPC_IRDSIZE_SHIFT 16
-#define I40IWQPC_IRDSIZE_MASK (0x3UL << I40IWQPC_IRDSIZE_SHIFT)
-
-#define I40IWQPC_WRRDRSPOK_SHIFT 20
-#define I40IWQPC_WRRDRSPOK_MASK (1UL << I40IWQPC_WRRDRSPOK_SHIFT)
-
-#define I40IWQPC_RDOK_SHIFT 21
-#define I40IWQPC_RDOK_MASK (1UL << I40IWQPC_RDOK_SHIFT)
-
-#define I40IWQPC_SNDMARKERS_SHIFT 22
-#define I40IWQPC_SNDMARKERS_MASK (1UL << I40IWQPC_SNDMARKERS_SHIFT)
-
-#define I40IWQPC_BINDEN_SHIFT 23
-#define I40IWQPC_BINDEN_MASK (1UL << I40IWQPC_BINDEN_SHIFT)
-
-#define I40IWQPC_FASTREGEN_SHIFT 24
-#define I40IWQPC_FASTREGEN_MASK (1UL << I40IWQPC_FASTREGEN_SHIFT)
-
-#define I40IWQPC_PRIVEN_SHIFT 25
-#define I40IWQPC_PRIVEN_MASK (1UL << I40IWQPC_PRIVEN_SHIFT)
-
-#define I40IWQPC_USESTATSINSTANCE_SHIFT 26
-#define I40IWQPC_USESTATSINSTANCE_MASK (1UL << I40IWQPC_USESTATSINSTANCE_SHIFT)
-
-#define I40IWQPC_IWARPMODE_SHIFT 28
-#define I40IWQPC_IWARPMODE_MASK (1UL << I40IWQPC_IWARPMODE_SHIFT)
-
-#define I40IWQPC_RCVMARKERS_SHIFT 29
-#define I40IWQPC_RCVMARKERS_MASK (1UL << I40IWQPC_RCVMARKERS_SHIFT)
-
-#define I40IWQPC_ALIGNHDRS_SHIFT 30
-#define I40IWQPC_ALIGNHDRS_MASK (1UL << I40IWQPC_ALIGNHDRS_SHIFT)
-
-#define I40IWQPC_RCVNOMPACRC_SHIFT 31
-#define I40IWQPC_RCVNOMPACRC_MASK (1UL << I40IWQPC_RCVNOMPACRC_SHIFT)
-
-#define I40IWQPC_RCVMARKOFFSET_SHIFT 33
-#define I40IWQPC_RCVMARKOFFSET_MASK (0x1ffULL << I40IWQPC_RCVMARKOFFSET_SHIFT)
-
-#define I40IWQPC_SNDMARKOFFSET_SHIFT 48
-#define I40IWQPC_SNDMARKOFFSET_MASK (0x1ffULL << I40IWQPC_SNDMARKOFFSET_SHIFT)
-
-#define I40IWQPC_QPCOMPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IWQPC_QPCOMPCTX_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IWQPC_SQTPHVAL_SHIFT 0
-#define I40IWQPC_SQTPHVAL_MASK (0xffUL << I40IWQPC_SQTPHVAL_SHIFT)
-
-#define I40IWQPC_RQTPHVAL_SHIFT 8
-#define I40IWQPC_RQTPHVAL_MASK (0xffUL << I40IWQPC_RQTPHVAL_SHIFT)
-
-#define I40IWQPC_QSHANDLE_SHIFT 16
-#define I40IWQPC_QSHANDLE_MASK (0x3ffUL << I40IWQPC_QSHANDLE_SHIFT)
-
-#define I40IWQPC_EXCEPTION_LAN_QUEUE_SHIFT 32
-#define I40IWQPC_EXCEPTION_LAN_QUEUE_MASK (0xfffULL <<  \
-					   I40IWQPC_EXCEPTION_LAN_QUEUE_SHIFT)
-
-#define I40IWQPC_LOCAL_IPADDR3_SHIFT 0
-#define I40IWQPC_LOCAL_IPADDR3_MASK \
-	(0xffffffffUL << I40IWQPC_LOCAL_IPADDR3_SHIFT)
-
-#define I40IWQPC_LOCAL_IPADDR2_SHIFT 32
-#define I40IWQPC_LOCAL_IPADDR2_MASK     \
-	(0xffffffffULL << I40IWQPC_LOCAL_IPADDR2_SHIFT)
-
-#define I40IWQPC_LOCAL_IPADDR1_SHIFT 0
-#define I40IWQPC_LOCAL_IPADDR1_MASK     \
-	(0xffffffffUL << I40IWQPC_LOCAL_IPADDR1_SHIFT)
-
-#define I40IWQPC_LOCAL_IPADDR0_SHIFT 32
-#define I40IWQPC_LOCAL_IPADDR0_MASK     \
-	(0xffffffffULL << I40IWQPC_LOCAL_IPADDR0_SHIFT)
-
-/* wqe size considering 32 bytes per wqe*/
-#define I40IW_QP_SW_MIN_WQSIZE 4		/*in WRs*/
-#define I40IW_SQ_RSVD 2
-#define I40IW_RQ_RSVD 1
-#define I40IW_MAX_QUANTAS_PER_WR 2
-#define I40IW_QP_SW_MAX_SQ_QUANTAS 2048
-#define I40IW_QP_SW_MAX_RQ_QUANTAS 16384
-#define I40IW_MAX_QP_WRS ((I40IW_QP_SW_MAX_SQ_QUANTAS / I40IW_MAX_QUANTAS_PER_WR) - 1)
-
-#define I40IWQP_OP_RDMA_WRITE 0
-#define I40IWQP_OP_RDMA_READ 1
-#define I40IWQP_OP_RDMA_SEND 3
-#define I40IWQP_OP_RDMA_SEND_INV 4
-#define I40IWQP_OP_RDMA_SEND_SOL_EVENT 5
-#define I40IWQP_OP_RDMA_SEND_SOL_EVENT_INV 6
-#define I40IWQP_OP_BIND_MW 8
-#define I40IWQP_OP_FAST_REGISTER 9
-#define I40IWQP_OP_LOCAL_INVALIDATE 10
-#define I40IWQP_OP_RDMA_READ_LOC_INV 11
-#define I40IWQP_OP_NOP 12
-
-#define I40IW_RSVD_SHIFT        41
-#define I40IW_RSVD_MASK (0x7fffULL << I40IW_RSVD_SHIFT)
-
-/* iwarp QP SQ WQE common fields */
-#define I40IWQPSQ_OPCODE_SHIFT 32
-#define I40IWQPSQ_OPCODE_MASK (0x3fULL << I40IWQPSQ_OPCODE_SHIFT)
-
-#define I40IWQPSQ_ADDFRAGCNT_SHIFT 38
-#define I40IWQPSQ_ADDFRAGCNT_MASK (0x7ULL << I40IWQPSQ_ADDFRAGCNT_SHIFT)
-
-#define I40IWQPSQ_STREAMMODE_SHIFT 58
-#define I40IWQPSQ_STREAMMODE_MASK (1ULL << I40IWQPSQ_STREAMMODE_SHIFT)
-
-#define I40IWQPSQ_WAITFORRCVPDU_SHIFT 59
-#define I40IWQPSQ_WAITFORRCVPDU_MASK (1ULL << I40IWQPSQ_WAITFORRCVPDU_SHIFT)
-
-#define I40IWQPSQ_READFENCE_SHIFT 60
-#define I40IWQPSQ_READFENCE_MASK (1ULL << I40IWQPSQ_READFENCE_SHIFT)
-
-#define I40IWQPSQ_LOCALFENCE_SHIFT 61
-#define I40IWQPSQ_LOCALFENCE_MASK (1ULL << I40IWQPSQ_LOCALFENCE_SHIFT)
-
-#define I40IWQPSQ_SIGCOMPL_SHIFT 62
-#define I40IWQPSQ_SIGCOMPL_MASK (1ULL << I40IWQPSQ_SIGCOMPL_SHIFT)
-
-#define I40IWQPSQ_VALID_SHIFT 63
-#define I40IWQPSQ_VALID_MASK (1ULL << I40IWQPSQ_VALID_SHIFT)
-
-#define I40IWQPSQ_FRAG_TO_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IWQPSQ_FRAG_TO_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IWQPSQ_FRAG_LEN_SHIFT 0
-#define I40IWQPSQ_FRAG_LEN_MASK (0xffffffffUL << I40IWQPSQ_FRAG_LEN_SHIFT)
-
-#define I40IWQPSQ_FRAG_STAG_SHIFT 32
-#define I40IWQPSQ_FRAG_STAG_MASK (0xffffffffULL << I40IWQPSQ_FRAG_STAG_SHIFT)
-
-#define I40IWQPSQ_REMSTAGINV_SHIFT 0
-#define I40IWQPSQ_REMSTAGINV_MASK (0xffffffffUL << I40IWQPSQ_REMSTAGINV_SHIFT)
-
-#define I40IWQPSQ_INLINEDATAFLAG_SHIFT 57
-#define I40IWQPSQ_INLINEDATAFLAG_MASK (1ULL << I40IWQPSQ_INLINEDATAFLAG_SHIFT)
-
-#define I40IWQPSQ_INLINEDATALEN_SHIFT 48
-#define I40IWQPSQ_INLINEDATALEN_MASK    \
-	(0x7fULL << I40IWQPSQ_INLINEDATALEN_SHIFT)
-
-/* iwarp send with push mode */
-#define I40IWQPSQ_WQDESCIDX_SHIFT 0
-#define I40IWQPSQ_WQDESCIDX_MASK (0x3fffUL << I40IWQPSQ_WQDESCIDX_SHIFT)
-
-/* rdma write */
-#define I40IWQPSQ_REMSTAG_SHIFT 0
-#define I40IWQPSQ_REMSTAG_MASK (0xffffffffUL << I40IWQPSQ_REMSTAG_SHIFT)
-
-#define I40IWQPSQ_REMTO_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IWQPSQ_REMTO_MASK I40IW_CQPHC_QPCTX_MASK
-
-/* memory window */
-#define I40IWQPSQ_STAGRIGHTS_SHIFT 48
-#define I40IWQPSQ_STAGRIGHTS_MASK (0x1fULL << I40IWQPSQ_STAGRIGHTS_SHIFT)
-
-#define I40IWQPSQ_VABASEDTO_SHIFT 53
-#define I40IWQPSQ_VABASEDTO_MASK (1ULL << I40IWQPSQ_VABASEDTO_SHIFT)
-
-#define I40IWQPSQ_MWLEN_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IWQPSQ_MWLEN_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IWQPSQ_PARENTMRSTAG_SHIFT 0
-#define I40IWQPSQ_PARENTMRSTAG_MASK \
-	(0xffffffffUL << I40IWQPSQ_PARENTMRSTAG_SHIFT)
-
-#define I40IWQPSQ_MWSTAG_SHIFT 32
-#define I40IWQPSQ_MWSTAG_MASK (0xffffffffULL << I40IWQPSQ_MWSTAG_SHIFT)
-
-#define I40IWQPSQ_BASEVA_TO_FBO_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IWQPSQ_BASEVA_TO_FBO_MASK I40IW_CQPHC_QPCTX_MASK
-
-/* Local Invalidate */
-#define I40IWQPSQ_LOCSTAG_SHIFT 32
-#define I40IWQPSQ_LOCSTAG_MASK (0xffffffffULL << I40IWQPSQ_LOCSTAG_SHIFT)
-
-/* Fast Register */
-#define I40IWQPSQ_STAGKEY_SHIFT 0
-#define I40IWQPSQ_STAGKEY_MASK (0xffUL << I40IWQPSQ_STAGKEY_SHIFT)
-
-#define I40IWQPSQ_STAGINDEX_SHIFT 8
-#define I40IWQPSQ_STAGINDEX_MASK (0xffffffUL << I40IWQPSQ_STAGINDEX_SHIFT)
-
-#define I40IWQPSQ_COPYHOSTPBLS_SHIFT 43
-#define I40IWQPSQ_COPYHOSTPBLS_MASK (1ULL << I40IWQPSQ_COPYHOSTPBLS_SHIFT)
-
-#define I40IWQPSQ_LPBLSIZE_SHIFT 44
-#define I40IWQPSQ_LPBLSIZE_MASK (3ULL << I40IWQPSQ_LPBLSIZE_SHIFT)
-
-#define I40IWQPSQ_HPAGESIZE_SHIFT 46
-#define I40IWQPSQ_HPAGESIZE_MASK (3ULL << I40IWQPSQ_HPAGESIZE_SHIFT)
-
-#define I40IWQPSQ_STAGLEN_SHIFT 0
-#define I40IWQPSQ_STAGLEN_MASK (0x1ffffffffffULL << I40IWQPSQ_STAGLEN_SHIFT)
-
-#define I40IWQPSQ_FIRSTPMPBLIDXLO_SHIFT 48
-#define I40IWQPSQ_FIRSTPMPBLIDXLO_MASK  \
-	(0xffffULL << I40IWQPSQ_FIRSTPMPBLIDXLO_SHIFT)
-
-#define I40IWQPSQ_FIRSTPMPBLIDXHI_SHIFT 0
-#define I40IWQPSQ_FIRSTPMPBLIDXHI_MASK  \
-	(0xfffUL << I40IWQPSQ_FIRSTPMPBLIDXHI_SHIFT)
-
-#define I40IWQPSQ_PBLADDR_SHIFT 12
-#define I40IWQPSQ_PBLADDR_MASK (0xfffffffffffffULL << I40IWQPSQ_PBLADDR_SHIFT)
-
-/*  iwarp QP RQ WQE common fields */
-#define I40IWQPRQ_ADDFRAGCNT_SHIFT I40IWQPSQ_ADDFRAGCNT_SHIFT
-#define I40IWQPRQ_ADDFRAGCNT_MASK I40IWQPSQ_ADDFRAGCNT_MASK
-
-#define I40IWQPRQ_VALID_SHIFT I40IWQPSQ_VALID_SHIFT
-#define I40IWQPRQ_VALID_MASK I40IWQPSQ_VALID_MASK
-
-#define I40IWQPRQ_COMPLCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT
-#define I40IWQPRQ_COMPLCTX_MASK I40IW_CQPHC_QPCTX_MASK
-
-#define I40IWQPRQ_FRAG_LEN_SHIFT I40IWQPSQ_FRAG_LEN_SHIFT
-#define I40IWQPRQ_FRAG_LEN_MASK I40IWQPSQ_FRAG_LEN_MASK
-
-#define I40IWQPRQ_STAG_SHIFT I40IWQPSQ_FRAG_STAG_SHIFT
-#define I40IWQPRQ_STAG_MASK I40IWQPSQ_FRAG_STAG_MASK
-
-#define I40IWQPRQ_TO_SHIFT I40IWQPSQ_FRAG_TO_SHIFT
-#define I40IWQPRQ_TO_MASK I40IWQPSQ_FRAG_TO_MASK
-
-/* Query FPM CQP buf */
-#define I40IW_QUERY_FPM_MAX_QPS_SHIFT 0
-#define I40IW_QUERY_FPM_MAX_QPS_MASK               \
-	(0x7ffffUL << I40IW_QUERY_FPM_MAX_QPS_SHIFT)
-
-#define I40IW_QUERY_FPM_MAX_CQS_SHIFT 0
-#define I40IW_QUERY_FPM_MAX_CQS_MASK               \
-	(0x3ffffUL << I40IW_QUERY_FPM_MAX_CQS_SHIFT)
-
-#define I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_SHIFT 0
-#define I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_MASK  \
-	(0x3fffUL << I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_SHIFT)
-
-#define I40IW_QUERY_FPM_MAX_PE_SDS_SHIFT 32
-#define I40IW_QUERY_FPM_MAX_PE_SDS_MASK \
-	(0x3fffULL << I40IW_QUERY_FPM_MAX_PE_SDS_SHIFT)
-
-#define I40IW_QUERY_FPM_MAX_QPS_SHIFT 0
-#define I40IW_QUERY_FPM_MAX_QPS_MASK    \
-	(0x7ffffUL << I40IW_QUERY_FPM_MAX_QPS_SHIFT)
-
-#define I40IW_QUERY_FPM_MAX_CQS_SHIFT 0
-#define I40IW_QUERY_FPM_MAX_CQS_MASK    \
-	(0x3ffffUL << I40IW_QUERY_FPM_MAX_CQS_SHIFT)
-
-#define I40IW_QUERY_FPM_MAX_CEQS_SHIFT 0
-#define I40IW_QUERY_FPM_MAX_CEQS_MASK   \
-	(0xffUL << I40IW_QUERY_FPM_MAX_CEQS_SHIFT)
-
-#define I40IW_QUERY_FPM_XFBLOCKSIZE_SHIFT 32
-#define I40IW_QUERY_FPM_XFBLOCKSIZE_MASK        \
-	(0xffffffffULL << I40IW_QUERY_FPM_XFBLOCKSIZE_SHIFT)
-
-#define I40IW_QUERY_FPM_Q1BLOCKSIZE_SHIFT 32
-#define I40IW_QUERY_FPM_Q1BLOCKSIZE_MASK        \
-	(0xffffffffULL << I40IW_QUERY_FPM_Q1BLOCKSIZE_SHIFT)
-
-#define I40IW_QUERY_FPM_HTMULTIPLIER_SHIFT 16
-#define I40IW_QUERY_FPM_HTMULTIPLIER_MASK       \
-	(0xfUL << I40IW_QUERY_FPM_HTMULTIPLIER_SHIFT)
-
-#define I40IW_QUERY_FPM_TIMERBUCKET_SHIFT 32
-#define I40IW_QUERY_FPM_TIMERBUCKET_MASK        \
-	(0xffFFULL << I40IW_QUERY_FPM_TIMERBUCKET_SHIFT)
-
-/* Static HMC pages allocated buf */
-#define I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_SHIFT 0
-#define I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_MASK        \
-	(0x3fUL << I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_SHIFT)
-
-#define I40IW_HW_PAGE_SIZE	4096
-#define I40IW_DONE_COUNT	1000
-#define I40IW_SLEEP_COUNT	10
-
-enum {
-	I40IW_QUEUES_ALIGNMENT_MASK =		(128 - 1),
-	I40IW_AEQ_ALIGNMENT_MASK =		(256 - 1),
-	I40IW_Q2_ALIGNMENT_MASK =		(256 - 1),
-	I40IW_CEQ_ALIGNMENT_MASK =		(256 - 1),
-	I40IW_CQ0_ALIGNMENT_MASK =		(256 - 1),
-	I40IW_HOST_CTX_ALIGNMENT_MASK =		(4 - 1),
-	I40IW_SHADOWAREA_MASK =			(128 - 1),
-	I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK =	(4 - 1),
-	I40IW_FPM_COMMIT_BUF_ALIGNMENT_MASK =	(4 - 1)
-};
-
-enum i40iw_alignment {
-	I40IW_CQP_ALIGNMENT =		0x200,
-	I40IW_AEQ_ALIGNMENT =		0x100,
-	I40IW_CEQ_ALIGNMENT =		0x100,
-	I40IW_CQ0_ALIGNMENT =		0x100,
-	I40IW_SD_BUF_ALIGNMENT =	0x80,
-	I40IW_FEATURE_BUF_ALIGNMENT =	0x8
-};
-
-#define I40IW_WQE_SIZE_64	64
-
-#define I40IW_QP_WQE_MIN_SIZE	32
-#define I40IW_QP_WQE_MAX_SIZE	128
-
-#define I40IW_UPDATE_SD_BUF_SIZE 128
-
-#define I40IW_CQE_QTYPE_RQ 0
-#define I40IW_CQE_QTYPE_SQ 1
-
-#define I40IW_RING_INIT(_ring, _size) \
-	{ \
-		(_ring).head = 0; \
-		(_ring).tail = 0; \
-		(_ring).size = (_size); \
-	}
-#define I40IW_RING_GETSIZE(_ring) ((_ring).size)
-#define I40IW_RING_GETCURRENT_HEAD(_ring) ((_ring).head)
-#define I40IW_RING_GETCURRENT_TAIL(_ring) ((_ring).tail)
-
-#define I40IW_RING_MOVE_HEAD(_ring, _retcode) \
-	{ \
-		register u32 size; \
-		size = (_ring).size;  \
-		if (!I40IW_RING_FULL_ERR(_ring)) { \
-			(_ring).head = ((_ring).head + 1) % size; \
-			(_retcode) = 0; \
-		} else { \
-			(_retcode) = I40IW_ERR_RING_FULL; \
-		} \
-	}
-
-#define I40IW_RING_MOVE_HEAD_BY_COUNT(_ring, _count, _retcode) \
-	{ \
-		register u32 size; \
-		size = (_ring).size; \
-		if ((I40IW_RING_WORK_AVAILABLE(_ring) + (_count)) < size) { \
-			(_ring).head = ((_ring).head + (_count)) % size; \
-			(_retcode) = 0; \
-		} else { \
-			(_retcode) = I40IW_ERR_RING_FULL; \
-		} \
-	}
-
-#define I40IW_RING_MOVE_TAIL(_ring) \
-	(_ring).tail = ((_ring).tail + 1) % (_ring).size
-
-#define I40IW_RING_MOVE_HEAD_NOCHECK(_ring) \
-	(_ring).head = ((_ring).head + 1) % (_ring).size
-
-#define I40IW_RING_MOVE_TAIL_BY_COUNT(_ring, _count) \
-	(_ring).tail = ((_ring).tail + (_count)) % (_ring).size
-
-#define I40IW_RING_SET_TAIL(_ring, _pos) \
-	(_ring).tail = (_pos) % (_ring).size
-
-#define I40IW_RING_FULL_ERR(_ring) \
-	( \
-		(I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 1))  \
-	)
-
-#define I40IW_ERR_RING_FULL2(_ring) \
-	( \
-		(I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 2))  \
-	)
-
-#define I40IW_ERR_RING_FULL3(_ring) \
-	( \
-		(I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 3))  \
-	)
-
-#define I40IW_RING_MORE_WORK(_ring) \
-	( \
-		(I40IW_RING_WORK_AVAILABLE(_ring) != 0) \
-	)
-
-#define I40IW_RING_WORK_AVAILABLE(_ring) \
-	( \
-		(((_ring).head + (_ring).size - (_ring).tail) % (_ring).size) \
-	)
-
-#define I40IW_RING_GET_WQES_AVAILABLE(_ring) \
-	( \
-		((_ring).size - I40IW_RING_WORK_AVAILABLE(_ring) - 1) \
-	)
-
-#define I40IW_ATOMIC_RING_MOVE_HEAD(_ring, index, _retcode) \
-	{ \
-		index = I40IW_RING_GETCURRENT_HEAD(_ring); \
-		I40IW_RING_MOVE_HEAD(_ring, _retcode); \
-	}
-
-/* Async Events codes */
-#define I40IW_AE_AMP_UNALLOCATED_STAG                                   0x0102
-#define I40IW_AE_AMP_INVALID_STAG                                       0x0103
-#define I40IW_AE_AMP_BAD_QP                                             0x0104
-#define I40IW_AE_AMP_BAD_PD                                             0x0105
-#define I40IW_AE_AMP_BAD_STAG_KEY                                       0x0106
-#define I40IW_AE_AMP_BAD_STAG_INDEX                                     0x0107
-#define I40IW_AE_AMP_BOUNDS_VIOLATION                                   0x0108
-#define I40IW_AE_AMP_RIGHTS_VIOLATION                                   0x0109
-#define I40IW_AE_AMP_TO_WRAP                                            0x010a
-#define I40IW_AE_AMP_FASTREG_SHARED                                     0x010b
-#define I40IW_AE_AMP_FASTREG_VALID_STAG                                 0x010c
-#define I40IW_AE_AMP_FASTREG_MW_STAG                                    0x010d
-#define I40IW_AE_AMP_FASTREG_INVALID_RIGHTS                             0x010e
-#define I40IW_AE_AMP_FASTREG_PBL_TABLE_OVERFLOW                         0x010f
-#define I40IW_AE_AMP_FASTREG_INVALID_LENGTH                             0x0110
-#define I40IW_AE_AMP_INVALIDATE_SHARED                                  0x0111
-#define I40IW_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS                 0x0112
-#define I40IW_AE_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS                   0x0113
-#define I40IW_AE_AMP_MWBIND_VALID_STAG                                  0x0114
-#define I40IW_AE_AMP_MWBIND_OF_MR_STAG                                  0x0115
-#define I40IW_AE_AMP_MWBIND_TO_ZERO_BASED_STAG                          0x0116
-#define I40IW_AE_AMP_MWBIND_TO_MW_STAG                                  0x0117
-#define I40IW_AE_AMP_MWBIND_INVALID_RIGHTS                              0x0118
-#define I40IW_AE_AMP_MWBIND_INVALID_BOUNDS                              0x0119
-#define I40IW_AE_AMP_MWBIND_TO_INVALID_PARENT                           0x011a
-#define I40IW_AE_AMP_MWBIND_BIND_DISABLED                               0x011b
-#define I40IW_AE_UDA_XMIT_DGRAM_TOO_LONG                                0x0132
-#define I40IW_AE_UDA_XMIT_DGRAM_TOO_SHORT                               0x0134
-#define I40IW_AE_BAD_CLOSE                                              0x0201
-#define I40IW_AE_RDMAP_ROE_BAD_LLP_CLOSE                                0x0202
-#define I40IW_AE_CQ_OPERATION_ERROR                                     0x0203
-#define I40IW_AE_PRIV_OPERATION_DENIED                                  0x011c
-#define I40IW_AE_RDMA_READ_WHILE_ORD_ZERO                               0x0205
-#define I40IW_AE_STAG_ZERO_INVALID                                      0x0206
-#define I40IW_AE_IB_RREQ_AND_Q1_FULL                                    0x0207
-#define I40IW_AE_WQE_UNEXPECTED_OPCODE                                  0x020a
-#define I40IW_AE_WQE_INVALID_PARAMETER                                  0x020b
-#define I40IW_AE_WQE_LSMM_TOO_LONG                                      0x0220
-#define I40IW_AE_DDP_INVALID_MSN_GAP_IN_MSN                             0x0301
-#define I40IW_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER      0x0303
-#define I40IW_AE_DDP_UBE_INVALID_DDP_VERSION                            0x0304
-#define I40IW_AE_DDP_UBE_INVALID_MO                                     0x0305
-#define I40IW_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE                0x0306
-#define I40IW_AE_DDP_UBE_INVALID_QN                                     0x0307
-#define I40IW_AE_DDP_NO_L_BIT                                           0x0308
-#define I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION                        0x0311
-#define I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE                            0x0312
-#define I40IW_AE_ROE_INVALID_RDMA_READ_REQUEST                          0x0313
-#define I40IW_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP                    0x0314
-#define I40IW_AE_INVALID_ARP_ENTRY                                      0x0401
-#define I40IW_AE_INVALID_TCP_OPTION_RCVD                                0x0402
-#define I40IW_AE_STALE_ARP_ENTRY                                        0x0403
-#define I40IW_AE_INVALID_MAC_ENTRY                                      0x0405
-#define I40IW_AE_LLP_CLOSE_COMPLETE                                     0x0501
-#define I40IW_AE_LLP_CONNECTION_RESET                                   0x0502
-#define I40IW_AE_LLP_FIN_RECEIVED                                       0x0503
-#define I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR                             0x0505
-#define I40IW_AE_LLP_SEGMENT_TOO_LARGE                                  0x0506
-#define I40IW_AE_LLP_SEGMENT_TOO_SMALL                                  0x0507
-#define I40IW_AE_LLP_SYN_RECEIVED                                       0x0508
-#define I40IW_AE_LLP_TERMINATE_RECEIVED                                 0x0509
-#define I40IW_AE_LLP_TOO_MANY_RETRIES                                   0x050a
-#define I40IW_AE_LLP_TOO_MANY_KEEPALIVE_RETRIES                         0x050b
-#define I40IW_AE_LLP_DOUBT_REACHABILITY                                 0x050c
-#define I40IW_AE_LLP_RX_VLAN_MISMATCH                                   0x050d
-#define I40IW_AE_RESOURCE_EXHAUSTION                                    0x0520
-#define I40IW_AE_RESET_SENT                                             0x0601
-#define I40IW_AE_TERMINATE_SENT                                         0x0602
-#define I40IW_AE_RESET_NOT_SENT                                         0x0603
-#define I40IW_AE_LCE_QP_CATASTROPHIC                                    0x0700
-#define I40IW_AE_LCE_FUNCTION_CATASTROPHIC                              0x0701
-#define I40IW_AE_LCE_CQ_CATASTROPHIC                                    0x0702
-#define I40IW_AE_QP_SUSPEND_COMPLETE                                    0x0900
-
-#define OP_DELETE_LOCAL_MAC_IPADDR_ENTRY        1
-#define OP_CEQ_DESTROY                          2
-#define OP_AEQ_DESTROY                          3
-#define OP_DELETE_ARP_CACHE_ENTRY               4
-#define OP_MANAGE_APBVT_ENTRY                   5
-#define OP_CEQ_CREATE                           6
-#define OP_AEQ_CREATE                           7
-#define OP_ALLOC_LOCAL_MAC_IPADDR_ENTRY         8
-#define OP_ADD_LOCAL_MAC_IPADDR_ENTRY           9
-#define OP_MANAGE_QHASH_TABLE_ENTRY             10
-#define OP_QP_MODIFY                            11
-#define OP_QP_UPLOAD_CONTEXT                    12
-#define OP_CQ_CREATE                            13
-#define OP_CQ_DESTROY                           14
-#define OP_QP_CREATE                            15
-#define OP_QP_DESTROY                           16
-#define OP_ALLOC_STAG                           17
-#define OP_MR_REG_NON_SHARED                    18
-#define OP_DEALLOC_STAG                         19
-#define OP_MW_ALLOC                             20
-#define OP_QP_FLUSH_WQES                        21
-#define OP_ADD_ARP_CACHE_ENTRY                  22
-#define OP_UPDATE_PE_SDS                        23
-#define OP_MANAGE_HMC_PM_FUNC_TABLE             24
-#define OP_SUSPEND                              25
-#define OP_RESUME                               26
-#define OP_MANAGE_VF_PBLE_BP                    27
-#define OP_QUERY_FPM_VALUES                     28
-#define OP_COMMIT_FPM_VALUES                    29
-#define OP_REQUESTED_COMMANDS                   30
-#define OP_COMPLETED_COMMANDS                   31
-#define OP_GEN_AE                               32
-#define OP_QUERY_RDMA_FEATURES                  33
-#define OP_SIZE_CQP_STAT_ARRAY			34
-
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_hmc.c b/drivers/infiniband/hw/i40iw/i40iw_hmc.c
deleted file mode 100644
index b44bfc1d239b..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_hmc.c
+++ /dev/null
@@ -1,821 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include "i40iw_osdep.h"
-#include "i40iw_register.h"
-#include "i40iw_status.h"
-#include "i40iw_hmc.h"
-#include "i40iw_d.h"
-#include "i40iw_type.h"
-#include "i40iw_p.h"
-#include "i40iw_vf.h"
-#include "i40iw_virtchnl.h"
-
-/**
- * i40iw_find_sd_index_limit - finds segment descriptor index limit
- * @hmc_info: pointer to the HMC configuration information structure
- * @type: type of HMC resources we're searching
- * @idx: starting index for the object
- * @cnt: number of objects we're trying to create
- * @sd_idx: pointer to return index of the segment descriptor in question
- * @sd_limit: pointer to return the maximum number of segment descriptors
- *
- * This function calculates the segment descriptor index and index limit
- * for the resource defined by i40iw_hmc_rsrc_type.
- */
-
-static inline void i40iw_find_sd_index_limit(struct i40iw_hmc_info *hmc_info,
-					     u32 type,
-					     u32 idx,
-					     u32 cnt,
-					     u32 *sd_idx,
-					     u32 *sd_limit)
-{
-	u64 fpm_addr, fpm_limit;
-
-	fpm_addr = hmc_info->hmc_obj[(type)].base +
-			hmc_info->hmc_obj[type].size * idx;
-	fpm_limit = fpm_addr + hmc_info->hmc_obj[type].size * cnt;
-	*sd_idx = (u32)(fpm_addr / I40IW_HMC_DIRECT_BP_SIZE);
-	*sd_limit = (u32)((fpm_limit - 1) / I40IW_HMC_DIRECT_BP_SIZE);
-	*sd_limit += 1;
-}
-
-/**
- * i40iw_find_pd_index_limit - finds page descriptor index limit
- * @hmc_info: pointer to the HMC configuration information struct
- * @type: HMC resource type we're examining
- * @idx: starting index for the object
- * @cnt: number of objects we're trying to create
- * @pd_idx: pointer to return page descriptor index
- * @pd_limit: pointer to return page descriptor index limit
- *
- * Calculates the page descriptor index and index limit for the resource
- * defined by i40iw_hmc_rsrc_type.
- */
-
-static inline void i40iw_find_pd_index_limit(struct i40iw_hmc_info *hmc_info,
-					     u32 type,
-					     u32 idx,
-					     u32 cnt,
-					     u32 *pd_idx,
-					     u32 *pd_limit)
-{
-	u64 fpm_adr, fpm_limit;
-
-	fpm_adr = hmc_info->hmc_obj[type].base +
-			hmc_info->hmc_obj[type].size * idx;
-	fpm_limit = fpm_adr + (hmc_info)->hmc_obj[(type)].size * (cnt);
-	*(pd_idx) = (u32)(fpm_adr / I40IW_HMC_PAGED_BP_SIZE);
-	*(pd_limit) = (u32)((fpm_limit - 1) / I40IW_HMC_PAGED_BP_SIZE);
-	*(pd_limit) += 1;
-}
-
-/**
- * i40iw_set_sd_entry - setup entry for sd programming
- * @pa: physical addr
- * @idx: sd index
- * @type: paged or direct sd
- * @entry: sd entry ptr
- */
-static inline void i40iw_set_sd_entry(u64 pa,
-				      u32 idx,
-				      enum i40iw_sd_entry_type type,
-				      struct update_sd_entry *entry)
-{
-	entry->data = pa | (I40IW_HMC_MAX_BP_COUNT << I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) |
-			(((type == I40IW_SD_TYPE_PAGED) ? 0 : 1) <<
-				I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) |
-			(1 << I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT);
-	entry->cmd = (idx | (1 << I40E_PFHMC_SDCMD_PMSDWR_SHIFT) | (1 << 15));
-}
-
-/**
- * i40iw_clr_sd_entry - setup entry for sd clear
- * @idx: sd index
- * @type: paged or direct sd
- * @entry: sd entry ptr
- */
-static inline void i40iw_clr_sd_entry(u32 idx, enum i40iw_sd_entry_type type,
-				      struct update_sd_entry *entry)
-{
-	entry->data = (I40IW_HMC_MAX_BP_COUNT <<
-			I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) |
-			(((type == I40IW_SD_TYPE_PAGED) ? 0 : 1) <<
-				I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT);
-	entry->cmd = (idx | (1 << I40E_PFHMC_SDCMD_PMSDWR_SHIFT) | (1 << 15));
-}
-
-/**
- * i40iw_hmc_sd_one - setup 1 sd entry for cqp
- * @dev: pointer to the device structure
- * @hmc_fn_id: hmc's function id
- * @pa: physical addr
- * @sd_idx: sd index
- * @type: paged or direct sd
- * @setsd: flag to set or clear sd
- */
-enum i40iw_status_code i40iw_hmc_sd_one(struct i40iw_sc_dev *dev,
-					u8 hmc_fn_id,
-					u64 pa, u32 sd_idx,
-					enum i40iw_sd_entry_type type,
-					bool setsd)
-{
-	struct i40iw_update_sds_info sdinfo;
-
-	sdinfo.cnt = 1;
-	sdinfo.hmc_fn_id = hmc_fn_id;
-	if (setsd)
-		i40iw_set_sd_entry(pa, sd_idx, type, sdinfo.entry);
-	else
-		i40iw_clr_sd_entry(sd_idx, type, sdinfo.entry);
-
-	return dev->cqp->process_cqp_sds(dev, &sdinfo);
-}
-
-/**
- * i40iw_hmc_sd_grp - setup group od sd entries for cqp
- * @dev: pointer to the device structure
- * @hmc_info: pointer to the HMC configuration information struct
- * @sd_index: sd index
- * @sd_cnt: number of sd entries
- * @setsd: flag to set or clear sd
- */
-static enum i40iw_status_code i40iw_hmc_sd_grp(struct i40iw_sc_dev *dev,
-					       struct i40iw_hmc_info *hmc_info,
-					       u32 sd_index,
-					       u32 sd_cnt,
-					       bool setsd)
-{
-	struct i40iw_hmc_sd_entry *sd_entry;
-	struct i40iw_update_sds_info sdinfo;
-	u64 pa;
-	u32 i;
-	enum i40iw_status_code ret_code = 0;
-
-	memset(&sdinfo, 0, sizeof(sdinfo));
-	sdinfo.hmc_fn_id = hmc_info->hmc_fn_id;
-	for (i = sd_index; i < sd_index + sd_cnt; i++) {
-		sd_entry = &hmc_info->sd_table.sd_entry[i];
-		if (!sd_entry ||
-		    (!sd_entry->valid && setsd) ||
-		    (sd_entry->valid && !setsd))
-			continue;
-		if (setsd) {
-			pa = (sd_entry->entry_type == I40IW_SD_TYPE_PAGED) ?
-			    sd_entry->u.pd_table.pd_page_addr.pa :
-			    sd_entry->u.bp.addr.pa;
-			i40iw_set_sd_entry(pa, i, sd_entry->entry_type,
-					   &sdinfo.entry[sdinfo.cnt]);
-		} else {
-			i40iw_clr_sd_entry(i, sd_entry->entry_type,
-					   &sdinfo.entry[sdinfo.cnt]);
-		}
-		sdinfo.cnt++;
-		if (sdinfo.cnt == I40IW_MAX_SD_ENTRIES) {
-			ret_code = dev->cqp->process_cqp_sds(dev, &sdinfo);
-			if (ret_code) {
-				i40iw_debug(dev, I40IW_DEBUG_HMC,
-					    "i40iw_hmc_sd_grp: sd_programming failed err=%d\n",
-					    ret_code);
-				return ret_code;
-			}
-			sdinfo.cnt = 0;
-		}
-	}
-	if (sdinfo.cnt)
-		ret_code = dev->cqp->process_cqp_sds(dev, &sdinfo);
-
-	return ret_code;
-}
-
-/**
- * i40iw_vfdev_from_fpm - return vf dev ptr for hmc function id
- * @dev: pointer to the device structure
- * @hmc_fn_id: hmc's function id
- */
-struct i40iw_vfdev *i40iw_vfdev_from_fpm(struct i40iw_sc_dev *dev, u8 hmc_fn_id)
-{
-	struct i40iw_vfdev *vf_dev = NULL;
-	u16 idx;
-
-	for (idx = 0; idx < I40IW_MAX_PE_ENABLED_VF_COUNT; idx++) {
-		if (dev->vf_dev[idx] &&
-		    ((u8)dev->vf_dev[idx]->pmf_index == hmc_fn_id)) {
-			vf_dev = dev->vf_dev[idx];
-			break;
-		}
-	}
-	return vf_dev;
-}
-
-/**
- * i40iw_vf_hmcinfo_from_fpm - get ptr to hmc for func_id
- * @dev: pointer to the device structure
- * @hmc_fn_id: hmc's function id
- */
-struct i40iw_hmc_info *i40iw_vf_hmcinfo_from_fpm(struct i40iw_sc_dev *dev,
-						 u8 hmc_fn_id)
-{
-	struct i40iw_hmc_info *hmc_info = NULL;
-	u16 idx;
-
-	for (idx = 0; idx < I40IW_MAX_PE_ENABLED_VF_COUNT; idx++) {
-		if (dev->vf_dev[idx] &&
-		    ((u8)dev->vf_dev[idx]->pmf_index == hmc_fn_id)) {
-			hmc_info = &dev->vf_dev[idx]->hmc_info;
-			break;
-		}
-	}
-	return hmc_info;
-}
-
-/**
- * i40iw_hmc_finish_add_sd_reg - program sd entries for objects
- * @dev: pointer to the device structure
- * @info: create obj info
- */
-static enum i40iw_status_code i40iw_hmc_finish_add_sd_reg(struct i40iw_sc_dev *dev,
-							  struct i40iw_hmc_create_obj_info *info)
-{
-	if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt)
-		return I40IW_ERR_INVALID_HMC_OBJ_INDEX;
-
-	if ((info->start_idx + info->count) >
-			info->hmc_info->hmc_obj[info->rsrc_type].cnt)
-		return I40IW_ERR_INVALID_HMC_OBJ_COUNT;
-
-	if (!info->add_sd_cnt)
-		return 0;
-
-	return i40iw_hmc_sd_grp(dev, info->hmc_info,
-				info->hmc_info->sd_indexes[0],
-				info->add_sd_cnt, true);
-}
-
-/**
- * i40iw_sc_create_hmc_obj - allocate backing store for hmc objects
- * @dev: pointer to the device structure
- * @info: pointer to i40iw_hmc_iw_create_obj_info struct
- *
- * This will allocate memory for PDs and backing pages and populate
- * the sd and pd entries.
- */
-enum i40iw_status_code i40iw_sc_create_hmc_obj(struct i40iw_sc_dev *dev,
-					       struct i40iw_hmc_create_obj_info *info)
-{
-	struct i40iw_hmc_sd_entry *sd_entry;
-	u32 sd_idx, sd_lmt;
-	u32 pd_idx = 0, pd_lmt = 0;
-	u32 pd_idx1 = 0, pd_lmt1 = 0;
-	u32 i, j;
-	bool pd_error = false;
-	enum i40iw_status_code ret_code = 0;
-
-	if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt)
-		return I40IW_ERR_INVALID_HMC_OBJ_INDEX;
-
-	if ((info->start_idx + info->count) >
-	    info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
-		i40iw_debug(dev, I40IW_DEBUG_HMC,
-			    "%s: error type %u, start = %u, req cnt %u, cnt = %u\n",
-			    __func__, info->rsrc_type, info->start_idx, info->count,
-			    info->hmc_info->hmc_obj[info->rsrc_type].cnt);
-		return I40IW_ERR_INVALID_HMC_OBJ_COUNT;
-	}
-
-	if (!dev->is_pf)
-		return i40iw_vchnl_vf_add_hmc_objs(dev, info->rsrc_type, 0, info->count);
-
-	i40iw_find_sd_index_limit(info->hmc_info, info->rsrc_type,
-				  info->start_idx, info->count,
-				  &sd_idx, &sd_lmt);
-	if (sd_idx >= info->hmc_info->sd_table.sd_cnt ||
-	    sd_lmt > info->hmc_info->sd_table.sd_cnt) {
-		return I40IW_ERR_INVALID_SD_INDEX;
-	}
-	i40iw_find_pd_index_limit(info->hmc_info, info->rsrc_type,
-				  info->start_idx, info->count, &pd_idx, &pd_lmt);
-
-	for (j = sd_idx; j < sd_lmt; j++) {
-		ret_code = i40iw_add_sd_table_entry(dev->hw, info->hmc_info,
-						    j,
-						    info->entry_type,
-						    I40IW_HMC_DIRECT_BP_SIZE);
-		if (ret_code)
-			goto exit_sd_error;
-		sd_entry = &info->hmc_info->sd_table.sd_entry[j];
-
-		if ((sd_entry->entry_type == I40IW_SD_TYPE_PAGED) &&
-		    ((dev->hmc_info == info->hmc_info) &&
-		     (info->rsrc_type != I40IW_HMC_IW_PBLE))) {
-			pd_idx1 = max(pd_idx, (j * I40IW_HMC_MAX_BP_COUNT));
-			pd_lmt1 = min(pd_lmt,
-				      (j + 1) * I40IW_HMC_MAX_BP_COUNT);
-			for (i = pd_idx1; i < pd_lmt1; i++) {
-				/* update the pd table entry */
-				ret_code = i40iw_add_pd_table_entry(dev->hw, info->hmc_info,
-								    i, NULL);
-				if (ret_code) {
-					pd_error = true;
-					break;
-				}
-			}
-			if (pd_error) {
-				while (i && (i > pd_idx1)) {
-					i40iw_remove_pd_bp(dev->hw, info->hmc_info, (i - 1),
-							   info->is_pf);
-					i--;
-				}
-			}
-		}
-		if (sd_entry->valid)
-			continue;
-
-		info->hmc_info->sd_indexes[info->add_sd_cnt] = (u16)j;
-		info->add_sd_cnt++;
-		sd_entry->valid = true;
-	}
-	return i40iw_hmc_finish_add_sd_reg(dev, info);
-
-exit_sd_error:
-	while (j && (j > sd_idx)) {
-		sd_entry = &info->hmc_info->sd_table.sd_entry[j - 1];
-		switch (sd_entry->entry_type) {
-		case I40IW_SD_TYPE_PAGED:
-			pd_idx1 = max(pd_idx,
-				      (j - 1) * I40IW_HMC_MAX_BP_COUNT);
-			pd_lmt1 = min(pd_lmt, (j * I40IW_HMC_MAX_BP_COUNT));
-			for (i = pd_idx1; i < pd_lmt1; i++)
-				i40iw_prep_remove_pd_page(info->hmc_info, i);
-			break;
-		case I40IW_SD_TYPE_DIRECT:
-			i40iw_prep_remove_pd_page(info->hmc_info, (j - 1));
-			break;
-		default:
-			ret_code = I40IW_ERR_INVALID_SD_TYPE;
-			break;
-		}
-		j--;
-	}
-
-	return ret_code;
-}
-
-/**
- * i40iw_finish_del_sd_reg - delete sd entries for objects
- * @dev: pointer to the device structure
- * @info: dele obj info
- * @reset: true if called before reset
- */
-static enum i40iw_status_code i40iw_finish_del_sd_reg(struct i40iw_sc_dev *dev,
-						      struct i40iw_hmc_del_obj_info *info,
-						      bool reset)
-{
-	struct i40iw_hmc_sd_entry *sd_entry;
-	enum i40iw_status_code ret_code = 0;
-	u32 i, sd_idx;
-	struct i40iw_dma_mem *mem;
-
-	if (dev->is_pf && !reset)
-		ret_code = i40iw_hmc_sd_grp(dev, info->hmc_info,
-					    info->hmc_info->sd_indexes[0],
-					    info->del_sd_cnt, false);
-
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: error cqp sd sd_grp\n", __func__);
-
-	for (i = 0; i < info->del_sd_cnt; i++) {
-		sd_idx = info->hmc_info->sd_indexes[i];
-		sd_entry = &info->hmc_info->sd_table.sd_entry[sd_idx];
-		if (!sd_entry)
-			continue;
-		mem = (sd_entry->entry_type == I40IW_SD_TYPE_PAGED) ?
-			&sd_entry->u.pd_table.pd_page_addr :
-			&sd_entry->u.bp.addr;
-
-		if (!mem || !mem->va)
-			i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: error cqp sd mem\n", __func__);
-		else
-			i40iw_free_dma_mem(dev->hw, mem);
-	}
-	return ret_code;
-}
-
-/**
- * i40iw_sc_del_hmc_obj - remove pe hmc objects
- * @dev: pointer to the device structure
- * @info: pointer to i40iw_hmc_del_obj_info struct
- * @reset: true if called before reset
- *
- * This will de-populate the SDs and PDs.  It frees
- * the memory for PDS and backing storage.  After this function is returned,
- * caller should deallocate memory allocated previously for
- * book-keeping information about PDs and backing storage.
- */
-enum i40iw_status_code i40iw_sc_del_hmc_obj(struct i40iw_sc_dev *dev,
-					    struct i40iw_hmc_del_obj_info *info,
-					    bool reset)
-{
-	struct i40iw_hmc_pd_table *pd_table;
-	u32 sd_idx, sd_lmt;
-	u32 pd_idx, pd_lmt, rel_pd_idx;
-	u32 i, j;
-	enum i40iw_status_code ret_code = 0;
-
-	if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
-		i40iw_debug(dev, I40IW_DEBUG_HMC,
-			    "%s: error start_idx[%04d]  >= [type %04d].cnt[%04d]\n",
-			    __func__, info->start_idx, info->rsrc_type,
-			    info->hmc_info->hmc_obj[info->rsrc_type].cnt);
-		return I40IW_ERR_INVALID_HMC_OBJ_INDEX;
-	}
-
-	if ((info->start_idx + info->count) >
-	    info->hmc_info->hmc_obj[info->rsrc_type].cnt) {
-		i40iw_debug(dev, I40IW_DEBUG_HMC,
-			    "%s: error start_idx[%04d] + count %04d  >= [type %04d].cnt[%04d]\n",
-			    __func__, info->start_idx, info->count,
-			    info->rsrc_type,
-			    info->hmc_info->hmc_obj[info->rsrc_type].cnt);
-		return I40IW_ERR_INVALID_HMC_OBJ_COUNT;
-	}
-	if (!dev->is_pf) {
-		ret_code = i40iw_vchnl_vf_del_hmc_obj(dev, info->rsrc_type, 0,
-						      info->count);
-		if (info->rsrc_type != I40IW_HMC_IW_PBLE)
-			return ret_code;
-	}
-
-	i40iw_find_pd_index_limit(info->hmc_info, info->rsrc_type,
-				  info->start_idx, info->count, &pd_idx, &pd_lmt);
-
-	for (j = pd_idx; j < pd_lmt; j++) {
-		sd_idx = j / I40IW_HMC_PD_CNT_IN_SD;
-
-		if (info->hmc_info->sd_table.sd_entry[sd_idx].entry_type !=
-		    I40IW_SD_TYPE_PAGED)
-			continue;
-
-		rel_pd_idx = j % I40IW_HMC_PD_CNT_IN_SD;
-		pd_table = &info->hmc_info->sd_table.sd_entry[sd_idx].u.pd_table;
-		if (pd_table->pd_entry[rel_pd_idx].valid) {
-			ret_code = i40iw_remove_pd_bp(dev->hw, info->hmc_info, j,
-						      info->is_pf);
-			if (ret_code) {
-				i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: error\n", __func__);
-				return ret_code;
-			}
-		}
-	}
-
-	i40iw_find_sd_index_limit(info->hmc_info, info->rsrc_type,
-				  info->start_idx, info->count, &sd_idx, &sd_lmt);
-	if (sd_idx >= info->hmc_info->sd_table.sd_cnt ||
-	    sd_lmt > info->hmc_info->sd_table.sd_cnt) {
-		i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: error invalid sd_idx\n", __func__);
-		return I40IW_ERR_INVALID_SD_INDEX;
-	}
-
-	for (i = sd_idx; i < sd_lmt; i++) {
-		if (!info->hmc_info->sd_table.sd_entry[i].valid)
-			continue;
-		switch (info->hmc_info->sd_table.sd_entry[i].entry_type) {
-		case I40IW_SD_TYPE_DIRECT:
-			ret_code = i40iw_prep_remove_sd_bp(info->hmc_info, i);
-			if (!ret_code) {
-				info->hmc_info->sd_indexes[info->del_sd_cnt] = (u16)i;
-				info->del_sd_cnt++;
-			}
-			break;
-		case I40IW_SD_TYPE_PAGED:
-			ret_code = i40iw_prep_remove_pd_page(info->hmc_info, i);
-			if (!ret_code) {
-				info->hmc_info->sd_indexes[info->del_sd_cnt] = (u16)i;
-				info->del_sd_cnt++;
-			}
-			break;
-		default:
-			break;
-		}
-	}
-	return i40iw_finish_del_sd_reg(dev, info, reset);
-}
-
-/**
- * i40iw_add_sd_table_entry - Adds a segment descriptor to the table
- * @hw: pointer to our hw struct
- * @hmc_info: pointer to the HMC configuration information struct
- * @sd_index: segment descriptor index to manipulate
- * @type: what type of segment descriptor we're manipulating
- * @direct_mode_sz: size to alloc in direct mode
- */
-enum i40iw_status_code i40iw_add_sd_table_entry(struct i40iw_hw *hw,
-						struct i40iw_hmc_info *hmc_info,
-						u32 sd_index,
-						enum i40iw_sd_entry_type type,
-						u64 direct_mode_sz)
-{
-	enum i40iw_status_code ret_code = 0;
-	struct i40iw_hmc_sd_entry *sd_entry;
-	bool dma_mem_alloc_done = false;
-	struct i40iw_dma_mem mem;
-	u64 alloc_len;
-
-	sd_entry = &hmc_info->sd_table.sd_entry[sd_index];
-	if (!sd_entry->valid) {
-		if (type == I40IW_SD_TYPE_PAGED)
-			alloc_len = I40IW_HMC_PAGED_BP_SIZE;
-		else
-			alloc_len = direct_mode_sz;
-
-		/* allocate a 4K pd page or 2M backing page */
-		ret_code = i40iw_allocate_dma_mem(hw, &mem, alloc_len,
-						  I40IW_HMC_PD_BP_BUF_ALIGNMENT);
-		if (ret_code)
-			goto exit;
-		dma_mem_alloc_done = true;
-		if (type == I40IW_SD_TYPE_PAGED) {
-			ret_code = i40iw_allocate_virt_mem(hw,
-							   &sd_entry->u.pd_table.pd_entry_virt_mem,
-							   sizeof(struct i40iw_hmc_pd_entry) * 512);
-			if (ret_code)
-				goto exit;
-			sd_entry->u.pd_table.pd_entry = (struct i40iw_hmc_pd_entry *)
-							 sd_entry->u.pd_table.pd_entry_virt_mem.va;
-
-			memcpy(&sd_entry->u.pd_table.pd_page_addr, &mem, sizeof(struct i40iw_dma_mem));
-		} else {
-			memcpy(&sd_entry->u.bp.addr, &mem, sizeof(struct i40iw_dma_mem));
-			sd_entry->u.bp.sd_pd_index = sd_index;
-		}
-
-		hmc_info->sd_table.sd_entry[sd_index].entry_type = type;
-
-		I40IW_INC_SD_REFCNT(&hmc_info->sd_table);
-	}
-	if (sd_entry->entry_type == I40IW_SD_TYPE_DIRECT)
-		I40IW_INC_BP_REFCNT(&sd_entry->u.bp);
-exit:
-	if (ret_code)
-		if (dma_mem_alloc_done)
-			i40iw_free_dma_mem(hw, &mem);
-
-	return ret_code;
-}
-
-/**
- * i40iw_add_pd_table_entry - Adds page descriptor to the specified table
- * @hw: pointer to our HW structure
- * @hmc_info: pointer to the HMC configuration information structure
- * @pd_index: which page descriptor index to manipulate
- * @rsrc_pg: if not NULL, use preallocated page instead of allocating new one.
- *
- * This function:
- *	1. Initializes the pd entry
- *	2. Adds pd_entry in the pd_table
- *	3. Mark the entry valid in i40iw_hmc_pd_entry structure
- *	4. Initializes the pd_entry's ref count to 1
- * assumptions:
- *	1. The memory for pd should be pinned down, physically contiguous and
- *	   aligned on 4K boundary and zeroed memory.
- *	2. It should be 4K in size.
- */
-enum i40iw_status_code i40iw_add_pd_table_entry(struct i40iw_hw *hw,
-						struct i40iw_hmc_info *hmc_info,
-						u32 pd_index,
-						struct i40iw_dma_mem *rsrc_pg)
-{
-	enum i40iw_status_code ret_code = 0;
-	struct i40iw_hmc_pd_table *pd_table;
-	struct i40iw_hmc_pd_entry *pd_entry;
-	struct i40iw_dma_mem mem;
-	struct i40iw_dma_mem *page = &mem;
-	u32 sd_idx, rel_pd_idx;
-	u64 *pd_addr;
-	u64 page_desc;
-
-	if (pd_index / I40IW_HMC_PD_CNT_IN_SD >= hmc_info->sd_table.sd_cnt)
-		return I40IW_ERR_INVALID_PAGE_DESC_INDEX;
-
-	sd_idx = (pd_index / I40IW_HMC_PD_CNT_IN_SD);
-	if (hmc_info->sd_table.sd_entry[sd_idx].entry_type != I40IW_SD_TYPE_PAGED)
-		return 0;
-
-	rel_pd_idx = (pd_index % I40IW_HMC_PD_CNT_IN_SD);
-	pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table;
-	pd_entry = &pd_table->pd_entry[rel_pd_idx];
-	if (!pd_entry->valid) {
-		if (rsrc_pg) {
-			pd_entry->rsrc_pg = true;
-			page = rsrc_pg;
-		} else {
-			ret_code = i40iw_allocate_dma_mem(hw, page,
-							  I40IW_HMC_PAGED_BP_SIZE,
-							  I40IW_HMC_PD_BP_BUF_ALIGNMENT);
-			if (ret_code)
-				return ret_code;
-			pd_entry->rsrc_pg = false;
-		}
-
-		memcpy(&pd_entry->bp.addr, page, sizeof(struct i40iw_dma_mem));
-		pd_entry->bp.sd_pd_index = pd_index;
-		pd_entry->bp.entry_type = I40IW_SD_TYPE_PAGED;
-		page_desc = page->pa | 0x1;
-
-		pd_addr = (u64 *)pd_table->pd_page_addr.va;
-		pd_addr += rel_pd_idx;
-
-		memcpy(pd_addr, &page_desc, sizeof(*pd_addr));
-
-		pd_entry->sd_index = sd_idx;
-		pd_entry->valid = true;
-		I40IW_INC_PD_REFCNT(pd_table);
-		if (hmc_info->hmc_fn_id < I40IW_FIRST_VF_FPM_ID)
-			I40IW_INVALIDATE_PF_HMC_PD(hw, sd_idx, rel_pd_idx);
-		else if (hw->hmc.hmc_fn_id != hmc_info->hmc_fn_id)
-			I40IW_INVALIDATE_VF_HMC_PD(hw, sd_idx, rel_pd_idx,
-						   hmc_info->hmc_fn_id);
-	}
-	I40IW_INC_BP_REFCNT(&pd_entry->bp);
-
-	return 0;
-}
-
-/**
- * i40iw_remove_pd_bp - remove a backing page from a page descriptor
- * @hw: pointer to our HW structure
- * @hmc_info: pointer to the HMC configuration information structure
- * @idx: the page index
- * @is_pf: distinguishes a VF from a PF
- *
- * This function:
- *	1. Marks the entry in pd table (for paged address mode) or in sd table
- *	   (for direct address mode) invalid.
- *	2. Write to register PMPDINV to invalidate the backing page in FV cache
- *	3. Decrement the ref count for the pd _entry
- * assumptions:
- *	1. Caller can deallocate the memory used by backing storage after this
- *	   function returns.
- */
-enum i40iw_status_code i40iw_remove_pd_bp(struct i40iw_hw *hw,
-					  struct i40iw_hmc_info *hmc_info,
-					  u32 idx,
-					  bool is_pf)
-{
-	struct i40iw_hmc_pd_entry *pd_entry;
-	struct i40iw_hmc_pd_table *pd_table;
-	struct i40iw_hmc_sd_entry *sd_entry;
-	u32 sd_idx, rel_pd_idx;
-	struct i40iw_dma_mem *mem;
-	u64 *pd_addr;
-
-	sd_idx = idx / I40IW_HMC_PD_CNT_IN_SD;
-	rel_pd_idx = idx % I40IW_HMC_PD_CNT_IN_SD;
-	if (sd_idx >= hmc_info->sd_table.sd_cnt)
-		return I40IW_ERR_INVALID_PAGE_DESC_INDEX;
-
-	sd_entry = &hmc_info->sd_table.sd_entry[sd_idx];
-	if (sd_entry->entry_type != I40IW_SD_TYPE_PAGED)
-		return I40IW_ERR_INVALID_SD_TYPE;
-
-	pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table;
-	pd_entry = &pd_table->pd_entry[rel_pd_idx];
-	I40IW_DEC_BP_REFCNT(&pd_entry->bp);
-	if (pd_entry->bp.ref_cnt)
-		return 0;
-
-	pd_entry->valid = false;
-	I40IW_DEC_PD_REFCNT(pd_table);
-	pd_addr = (u64 *)pd_table->pd_page_addr.va;
-	pd_addr += rel_pd_idx;
-	memset(pd_addr, 0, sizeof(u64));
-	if (is_pf)
-		I40IW_INVALIDATE_PF_HMC_PD(hw, sd_idx, idx);
-	else
-		I40IW_INVALIDATE_VF_HMC_PD(hw, sd_idx, idx,
-					   hmc_info->hmc_fn_id);
-
-	if (!pd_entry->rsrc_pg) {
-		mem = &pd_entry->bp.addr;
-		if (!mem || !mem->va)
-			return I40IW_ERR_PARAM;
-		i40iw_free_dma_mem(hw, mem);
-	}
-	if (!pd_table->ref_cnt)
-		i40iw_free_virt_mem(hw, &pd_table->pd_entry_virt_mem);
-
-	return 0;
-}
-
-/**
- * i40iw_prep_remove_sd_bp - Prepares to remove a backing page from a sd entry
- * @hmc_info: pointer to the HMC configuration information structure
- * @idx: the page index
- */
-enum i40iw_status_code i40iw_prep_remove_sd_bp(struct i40iw_hmc_info *hmc_info, u32 idx)
-{
-	struct i40iw_hmc_sd_entry *sd_entry;
-
-	sd_entry = &hmc_info->sd_table.sd_entry[idx];
-	I40IW_DEC_BP_REFCNT(&sd_entry->u.bp);
-	if (sd_entry->u.bp.ref_cnt)
-		return I40IW_ERR_NOT_READY;
-
-	I40IW_DEC_SD_REFCNT(&hmc_info->sd_table);
-	sd_entry->valid = false;
-
-	return 0;
-}
-
-/**
- * i40iw_prep_remove_pd_page - Prepares to remove a PD page from sd entry.
- * @hmc_info: pointer to the HMC configuration information structure
- * @idx: segment descriptor index to find the relevant page descriptor
- */
-enum i40iw_status_code i40iw_prep_remove_pd_page(struct i40iw_hmc_info *hmc_info,
-						 u32 idx)
-{
-	struct i40iw_hmc_sd_entry *sd_entry;
-
-	sd_entry = &hmc_info->sd_table.sd_entry[idx];
-
-	if (sd_entry->u.pd_table.ref_cnt)
-		return I40IW_ERR_NOT_READY;
-
-	sd_entry->valid = false;
-	I40IW_DEC_SD_REFCNT(&hmc_info->sd_table);
-
-	return 0;
-}
-
-/**
- * i40iw_pf_init_vfhmc -
- * @vf_cnt_array: array of cnt values of iwarp hmc objects
- * @vf_hmc_fn_id: hmc function id ofr vf driver
- * @dev: pointer to i40iw_dev struct
- *
- * Called by pf driver to initialize hmc_info for vf driver instance.
- */
-enum i40iw_status_code i40iw_pf_init_vfhmc(struct i40iw_sc_dev *dev,
-					   u8 vf_hmc_fn_id,
-					   u32 *vf_cnt_array)
-{
-	struct i40iw_hmc_info *hmc_info;
-	enum i40iw_status_code ret_code = 0;
-	u32 i;
-
-	if ((vf_hmc_fn_id < I40IW_FIRST_VF_FPM_ID) ||
-	    (vf_hmc_fn_id >= I40IW_FIRST_VF_FPM_ID +
-	     I40IW_MAX_PE_ENABLED_VF_COUNT)) {
-		i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: invalid vf_hmc_fn_id  0x%x\n",
-			    __func__, vf_hmc_fn_id);
-		return I40IW_ERR_INVALID_HMCFN_ID;
-	}
-
-	ret_code = i40iw_sc_init_iw_hmc(dev, vf_hmc_fn_id);
-	if (ret_code)
-		return ret_code;
-
-	hmc_info = i40iw_vf_hmcinfo_from_fpm(dev, vf_hmc_fn_id);
-
-	for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++)
-		if (vf_cnt_array)
-			hmc_info->hmc_obj[i].cnt =
-			    vf_cnt_array[i - I40IW_HMC_IW_QP];
-		else
-			hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt;
-
-	return 0;
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_hmc.h b/drivers/infiniband/hw/i40iw/i40iw_hmc.h
deleted file mode 100644
index 4c3fdd875621..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_hmc.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_HMC_H
-#define I40IW_HMC_H
-
-#include "i40iw_d.h"
-
-struct i40iw_hw;
-enum i40iw_status_code;
-
-#define I40IW_HMC_MAX_BP_COUNT 512
-#define I40IW_MAX_SD_ENTRIES 11
-#define I40IW_HW_DBG_HMC_INVALID_BP_MARK     0xCA
-
-#define I40IW_HMC_INFO_SIGNATURE	0x484D5347
-#define I40IW_HMC_PD_CNT_IN_SD		512
-#define I40IW_HMC_DIRECT_BP_SIZE	0x200000
-#define I40IW_HMC_MAX_SD_COUNT		4096
-#define I40IW_HMC_PAGED_BP_SIZE		4096
-#define I40IW_HMC_PD_BP_BUF_ALIGNMENT	4096
-#define I40IW_FIRST_VF_FPM_ID		16
-#define FPM_MULTIPLIER			1024
-
-#define I40IW_INC_SD_REFCNT(sd_table)   ((sd_table)->ref_cnt++)
-#define I40IW_INC_PD_REFCNT(pd_table)   ((pd_table)->ref_cnt++)
-#define I40IW_INC_BP_REFCNT(bp)         ((bp)->ref_cnt++)
-
-#define I40IW_DEC_SD_REFCNT(sd_table)   ((sd_table)->ref_cnt--)
-#define I40IW_DEC_PD_REFCNT(pd_table)   ((pd_table)->ref_cnt--)
-#define I40IW_DEC_BP_REFCNT(bp)         ((bp)->ref_cnt--)
-
-/**
- * I40IW_INVALIDATE_PF_HMC_PD - Invalidates the pd cache in the hardware
- * @hw: pointer to our hw struct
- * @sd_idx: segment descriptor index
- * @pd_idx: page descriptor index
- */
-#define I40IW_INVALIDATE_PF_HMC_PD(hw, sd_idx, pd_idx)                  \
-	i40iw_wr32((hw), I40E_PFHMC_PDINV,                                    \
-		(((sd_idx) << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) |             \
-		(0x1 << I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT) | \
-		((pd_idx) << I40E_PFHMC_PDINV_PMPDIDX_SHIFT)))
-
-/**
- * I40IW_INVALIDATE_VF_HMC_PD - Invalidates the pd cache in the hardware
- * @hw: pointer to our hw struct
- * @sd_idx: segment descriptor index
- * @pd_idx: page descriptor index
- * @hmc_fn_id: VF's function id
- */
-#define I40IW_INVALIDATE_VF_HMC_PD(hw, sd_idx, pd_idx, hmc_fn_id)        \
-	i40iw_wr32(hw, I40E_GLHMC_VFPDINV(hmc_fn_id - I40IW_FIRST_VF_FPM_ID),  \
-	     ((sd_idx << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) |              \
-	      (pd_idx << I40E_PFHMC_PDINV_PMPDIDX_SHIFT)))
-
-struct i40iw_hmc_obj_info {
-	u64 base;
-	u32 max_cnt;
-	u32 cnt;
-	u64 size;
-};
-
-enum i40iw_sd_entry_type {
-	I40IW_SD_TYPE_INVALID = 0,
-	I40IW_SD_TYPE_PAGED = 1,
-	I40IW_SD_TYPE_DIRECT = 2
-};
-
-struct i40iw_hmc_bp {
-	enum i40iw_sd_entry_type entry_type;
-	struct i40iw_dma_mem addr;
-	u32 sd_pd_index;
-	u32 ref_cnt;
-};
-
-struct i40iw_hmc_pd_entry {
-	struct i40iw_hmc_bp bp;
-	u32 sd_index;
-	bool rsrc_pg;
-	bool valid;
-};
-
-struct i40iw_hmc_pd_table {
-	struct i40iw_dma_mem pd_page_addr;
-	struct i40iw_hmc_pd_entry *pd_entry;
-	struct i40iw_virt_mem pd_entry_virt_mem;
-	u32 ref_cnt;
-	u32 sd_index;
-};
-
-struct i40iw_hmc_sd_entry {
-	enum i40iw_sd_entry_type entry_type;
-	bool valid;
-
-	union {
-		struct i40iw_hmc_pd_table pd_table;
-		struct i40iw_hmc_bp bp;
-	} u;
-};
-
-struct i40iw_hmc_sd_table {
-	struct i40iw_virt_mem addr;
-	u32 sd_cnt;
-	u32 ref_cnt;
-	struct i40iw_hmc_sd_entry *sd_entry;
-};
-
-struct i40iw_hmc_info {
-	u32 signature;
-	u8 hmc_fn_id;
-	u16 first_sd_index;
-
-	struct i40iw_hmc_obj_info *hmc_obj;
-	struct i40iw_virt_mem hmc_obj_virt_mem;
-	struct i40iw_hmc_sd_table sd_table;
-	u16 sd_indexes[I40IW_HMC_MAX_SD_COUNT];
-};
-
-struct update_sd_entry {
-	u64 cmd;
-	u64 data;
-};
-
-struct i40iw_update_sds_info {
-	u32 cnt;
-	u8 hmc_fn_id;
-	struct update_sd_entry entry[I40IW_MAX_SD_ENTRIES];
-};
-
-struct i40iw_ccq_cqe_info;
-struct i40iw_hmc_fcn_info {
-	void (*callback_fcn)(struct i40iw_sc_dev *, void *,
-			     struct i40iw_ccq_cqe_info *);
-	void *cqp_callback_param;
-	u32 vf_id;
-	u16 iw_vf_idx;
-	bool free_fcn;
-};
-
-enum i40iw_hmc_rsrc_type {
-	I40IW_HMC_IW_QP = 0,
-	I40IW_HMC_IW_CQ = 1,
-	I40IW_HMC_IW_SRQ = 2,
-	I40IW_HMC_IW_HTE = 3,
-	I40IW_HMC_IW_ARP = 4,
-	I40IW_HMC_IW_APBVT_ENTRY = 5,
-	I40IW_HMC_IW_MR = 6,
-	I40IW_HMC_IW_XF = 7,
-	I40IW_HMC_IW_XFFL = 8,
-	I40IW_HMC_IW_Q1 = 9,
-	I40IW_HMC_IW_Q1FL = 10,
-	I40IW_HMC_IW_TIMER = 11,
-	I40IW_HMC_IW_FSIMC = 12,
-	I40IW_HMC_IW_FSIAV = 13,
-	I40IW_HMC_IW_PBLE = 14,
-	I40IW_HMC_IW_MAX = 15,
-};
-
-struct i40iw_hmc_create_obj_info {
-	struct i40iw_hmc_info *hmc_info;
-	struct i40iw_virt_mem add_sd_virt_mem;
-	u32 rsrc_type;
-	u32 start_idx;
-	u32 count;
-	u32 add_sd_cnt;
-	enum i40iw_sd_entry_type entry_type;
-	bool is_pf;
-};
-
-struct i40iw_hmc_del_obj_info {
-	struct i40iw_hmc_info *hmc_info;
-	struct i40iw_virt_mem del_sd_virt_mem;
-	u32 rsrc_type;
-	u32 start_idx;
-	u32 count;
-	u32 del_sd_cnt;
-	bool is_pf;
-};
-
-enum i40iw_status_code i40iw_copy_dma_mem(struct i40iw_hw *hw, void *dest_buf,
-					  struct i40iw_dma_mem *src_mem, u64 src_offset, u64 size);
-enum i40iw_status_code i40iw_sc_create_hmc_obj(struct i40iw_sc_dev *dev,
-					       struct i40iw_hmc_create_obj_info *info);
-enum i40iw_status_code i40iw_sc_del_hmc_obj(struct i40iw_sc_dev *dev,
-					    struct i40iw_hmc_del_obj_info *info,
-					    bool reset);
-enum i40iw_status_code i40iw_hmc_sd_one(struct i40iw_sc_dev *dev, u8 hmc_fn_id,
-					u64 pa, u32 sd_idx, enum i40iw_sd_entry_type type,
-					bool setsd);
-enum i40iw_status_code i40iw_update_sds_noccq(struct i40iw_sc_dev *dev,
-					      struct i40iw_update_sds_info *info);
-struct i40iw_vfdev *i40iw_vfdev_from_fpm(struct i40iw_sc_dev *dev, u8 hmc_fn_id);
-struct i40iw_hmc_info *i40iw_vf_hmcinfo_from_fpm(struct i40iw_sc_dev *dev,
-						 u8 hmc_fn_id);
-enum i40iw_status_code i40iw_add_sd_table_entry(struct i40iw_hw *hw,
-						struct i40iw_hmc_info *hmc_info, u32 sd_index,
-						enum i40iw_sd_entry_type type, u64 direct_mode_sz);
-enum i40iw_status_code i40iw_add_pd_table_entry(struct i40iw_hw *hw,
-						struct i40iw_hmc_info *hmc_info, u32 pd_index,
-						struct i40iw_dma_mem *rsrc_pg);
-enum i40iw_status_code i40iw_remove_pd_bp(struct i40iw_hw *hw,
-					  struct i40iw_hmc_info *hmc_info, u32 idx, bool is_pf);
-enum i40iw_status_code i40iw_prep_remove_sd_bp(struct i40iw_hmc_info *hmc_info, u32 idx);
-enum i40iw_status_code i40iw_prep_remove_pd_page(struct i40iw_hmc_info *hmc_info, u32 idx);
-
-#define     ENTER_SHARED_FUNCTION()
-#define     EXIT_SHARED_FUNCTION()
-
-#endif				/* I40IW_HMC_H */
diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c
deleted file mode 100644
index d167ac10c751..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_hw.c
+++ /dev/null
@@ -1,851 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/if_vlan.h>
-
-#include "i40iw.h"
-
-/**
- * i40iw_initialize_hw_resources - initialize hw resource during open
- * @iwdev: iwarp device
- */
-u32 i40iw_initialize_hw_resources(struct i40iw_device *iwdev)
-{
-	unsigned long num_pds;
-	u32 resources_size;
-	u32 max_mr;
-	u32 max_qp;
-	u32 max_cq;
-	u32 arp_table_size;
-	u32 mrdrvbits;
-	void *resource_ptr;
-
-	max_qp = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt;
-	max_cq = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt;
-	max_mr = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt;
-	arp_table_size = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_ARP].cnt;
-	iwdev->max_cqe = 0xFFFFF;
-	num_pds = I40IW_MAX_PDS;
-	resources_size = sizeof(struct i40iw_arp_entry) * arp_table_size;
-	resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp);
-	resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr);
-	resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq);
-	resources_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds);
-	resources_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size);
-	resources_size += sizeof(struct i40iw_qp **) * max_qp;
-	iwdev->mem_resources = kzalloc(resources_size, GFP_KERNEL);
-
-	if (!iwdev->mem_resources)
-		return -ENOMEM;
-
-	iwdev->max_qp = max_qp;
-	iwdev->max_mr = max_mr;
-	iwdev->max_cq = max_cq;
-	iwdev->max_pd = num_pds;
-	iwdev->arp_table_size = arp_table_size;
-	iwdev->arp_table = (struct i40iw_arp_entry *)iwdev->mem_resources;
-	resource_ptr = iwdev->mem_resources + (sizeof(struct i40iw_arp_entry) * arp_table_size);
-
-	iwdev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
-	    IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_MGT_EXTENSIONS;
-
-	iwdev->allocated_qps = resource_ptr;
-	iwdev->allocated_cqs = &iwdev->allocated_qps[BITS_TO_LONGS(max_qp)];
-	iwdev->allocated_mrs = &iwdev->allocated_cqs[BITS_TO_LONGS(max_cq)];
-	iwdev->allocated_pds = &iwdev->allocated_mrs[BITS_TO_LONGS(max_mr)];
-	iwdev->allocated_arps = &iwdev->allocated_pds[BITS_TO_LONGS(num_pds)];
-	iwdev->qp_table = (struct i40iw_qp **)(&iwdev->allocated_arps[BITS_TO_LONGS(arp_table_size)]);
-	set_bit(0, iwdev->allocated_mrs);
-	set_bit(0, iwdev->allocated_qps);
-	set_bit(0, iwdev->allocated_cqs);
-	set_bit(0, iwdev->allocated_pds);
-	set_bit(0, iwdev->allocated_arps);
-
-	/* Following for ILQ/IEQ */
-	set_bit(1, iwdev->allocated_qps);
-	set_bit(1, iwdev->allocated_cqs);
-	set_bit(1, iwdev->allocated_pds);
-	set_bit(2, iwdev->allocated_cqs);
-	set_bit(2, iwdev->allocated_pds);
-
-	spin_lock_init(&iwdev->resource_lock);
-	spin_lock_init(&iwdev->qptable_lock);
-	/* stag index mask has a minimum of 14 bits */
-	mrdrvbits = 24 - max(get_count_order(iwdev->max_mr), 14);
-	iwdev->mr_stagmask = ~(((1 << mrdrvbits) - 1) << (32 - mrdrvbits));
-	return 0;
-}
-
-/**
- * i40iw_cqp_ce_handler - handle cqp completions
- * @iwdev: iwarp device
- * @arm: flag to arm after completions
- * @cq: cq for cqp completions
- */
-static void i40iw_cqp_ce_handler(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq, bool arm)
-{
-	struct i40iw_cqp_request *cqp_request;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	u32 cqe_count = 0;
-	struct i40iw_ccq_cqe_info info;
-	int ret;
-
-	do {
-		memset(&info, 0, sizeof(info));
-		ret = dev->ccq_ops->ccq_get_cqe_info(cq, &info);
-		if (ret)
-			break;
-		cqp_request = (struct i40iw_cqp_request *)(unsigned long)info.scratch;
-		if (info.error)
-			i40iw_pr_err("opcode = 0x%x maj_err_code = 0x%x min_err_code = 0x%x\n",
-				     info.op_code, info.maj_err_code, info.min_err_code);
-		if (cqp_request) {
-			cqp_request->compl_info.maj_err_code = info.maj_err_code;
-			cqp_request->compl_info.min_err_code = info.min_err_code;
-			cqp_request->compl_info.op_ret_val = info.op_ret_val;
-			cqp_request->compl_info.error = info.error;
-
-			if (cqp_request->waiting) {
-				cqp_request->request_done = true;
-				wake_up(&cqp_request->waitq);
-				i40iw_put_cqp_request(&iwdev->cqp, cqp_request);
-			} else {
-				if (cqp_request->callback_fcn)
-					cqp_request->callback_fcn(cqp_request, 1);
-				i40iw_put_cqp_request(&iwdev->cqp, cqp_request);
-			}
-		}
-
-		cqe_count++;
-	} while (1);
-
-	if (arm && cqe_count) {
-		i40iw_process_bh(dev);
-		dev->ccq_ops->ccq_arm(cq);
-	}
-}
-
-/**
- * i40iw_iwarp_ce_handler - handle iwarp completions
- * @iwdev: iwarp device
- * @iwcq: iwarp cq receiving event
- */
-static void i40iw_iwarp_ce_handler(struct i40iw_device *iwdev,
-				   struct i40iw_sc_cq *iwcq)
-{
-	struct i40iw_cq *i40iwcq = iwcq->back_cq;
-
-	if (i40iwcq->ibcq.comp_handler)
-		i40iwcq->ibcq.comp_handler(&i40iwcq->ibcq,
-					   i40iwcq->ibcq.cq_context);
-}
-
-/**
- * i40iw_puda_ce_handler - handle puda completion events
- * @iwdev: iwarp device
- * @cq: puda completion q for event
- */
-static void i40iw_puda_ce_handler(struct i40iw_device *iwdev,
-				  struct i40iw_sc_cq *cq)
-{
-	struct i40iw_sc_dev *dev = (struct i40iw_sc_dev *)&iwdev->sc_dev;
-	enum i40iw_status_code status;
-	u32 compl_error;
-
-	do {
-		status = i40iw_puda_poll_completion(dev, cq, &compl_error);
-		if (status == I40IW_ERR_QUEUE_EMPTY)
-			break;
-		if (status) {
-			i40iw_pr_err("puda  status = %d\n", status);
-			break;
-		}
-		if (compl_error) {
-			i40iw_pr_err("puda compl_err  =0x%x\n", compl_error);
-			break;
-		}
-	} while (1);
-
-	dev->ccq_ops->ccq_arm(cq);
-}
-
-/**
- * i40iw_process_ceq - handle ceq for completions
- * @iwdev: iwarp device
- * @ceq: ceq having cq for completion
- */
-void i40iw_process_ceq(struct i40iw_device *iwdev, struct i40iw_ceq *ceq)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_sc_ceq *sc_ceq;
-	struct i40iw_sc_cq *cq;
-	bool arm = true;
-
-	sc_ceq = &ceq->sc_ceq;
-	do {
-		cq = dev->ceq_ops->process_ceq(dev, sc_ceq);
-		if (!cq)
-			break;
-
-		if (cq->cq_type == I40IW_CQ_TYPE_CQP)
-			i40iw_cqp_ce_handler(iwdev, cq, arm);
-		else if (cq->cq_type == I40IW_CQ_TYPE_IWARP)
-			i40iw_iwarp_ce_handler(iwdev, cq);
-		else if ((cq->cq_type == I40IW_CQ_TYPE_ILQ) ||
-			 (cq->cq_type == I40IW_CQ_TYPE_IEQ))
-			i40iw_puda_ce_handler(iwdev, cq);
-	} while (1);
-}
-
-/**
- * i40iw_next_iw_state - modify qp state
- * @iwqp: iwarp qp to modify
- * @state: next state for qp
- * @del_hash: del hash
- * @term: term message
- * @termlen: length of term message
- */
-void i40iw_next_iw_state(struct i40iw_qp *iwqp,
-			 u8 state,
-			 u8 del_hash,
-			 u8 term,
-			 u8 termlen)
-{
-	struct i40iw_modify_qp_info info;
-
-	memset(&info, 0, sizeof(info));
-	info.next_iwarp_state = state;
-	info.remove_hash_idx = del_hash;
-	info.cq_num_valid = true;
-	info.arp_cache_idx_valid = true;
-	info.dont_send_term = true;
-	info.dont_send_fin = true;
-	info.termlen = termlen;
-
-	if (term & I40IWQP_TERM_SEND_TERM_ONLY)
-		info.dont_send_term = false;
-	if (term & I40IWQP_TERM_SEND_FIN_ONLY)
-		info.dont_send_fin = false;
-	if (iwqp->sc_qp.term_flags && (state == I40IW_QP_STATE_ERROR))
-		info.reset_tcp_conn = true;
-	iwqp->hw_iwarp_state = state;
-	i40iw_hw_modify_qp(iwqp->iwdev, iwqp, &info, 0);
-}
-
-/**
- * i40iw_process_aeq - handle aeq events
- * @iwdev: iwarp device
- */
-void i40iw_process_aeq(struct i40iw_device *iwdev)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_aeq *aeq = &iwdev->aeq;
-	struct i40iw_sc_aeq *sc_aeq = &aeq->sc_aeq;
-	struct i40iw_aeqe_info aeinfo;
-	struct i40iw_aeqe_info *info = &aeinfo;
-	int ret;
-	struct i40iw_qp *iwqp = NULL;
-	struct i40iw_sc_cq *cq = NULL;
-	struct i40iw_cq *iwcq = NULL;
-	struct i40iw_sc_qp *qp = NULL;
-	struct i40iw_qp_host_ctx_info *ctx_info = NULL;
-	unsigned long flags;
-
-	u32 aeqcnt = 0;
-
-	if (!sc_aeq->size)
-		return;
-
-	do {
-		memset(info, 0, sizeof(*info));
-		ret = dev->aeq_ops->get_next_aeqe(sc_aeq, info);
-		if (ret)
-			break;
-
-		aeqcnt++;
-		i40iw_debug(dev, I40IW_DEBUG_AEQ,
-			    "%s ae_id = 0x%x bool qp=%d qp_id = %d\n",
-			    __func__, info->ae_id, info->qp, info->qp_cq_id);
-		if (info->qp) {
-			spin_lock_irqsave(&iwdev->qptable_lock, flags);
-			iwqp = iwdev->qp_table[info->qp_cq_id];
-			if (!iwqp) {
-				spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
-				i40iw_debug(dev, I40IW_DEBUG_AEQ,
-					    "%s qp_id %d is already freed\n",
-					    __func__, info->qp_cq_id);
-				continue;
-			}
-			i40iw_qp_add_ref(&iwqp->ibqp);
-			spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
-			qp = &iwqp->sc_qp;
-			spin_lock_irqsave(&iwqp->lock, flags);
-			iwqp->hw_tcp_state = info->tcp_state;
-			iwqp->hw_iwarp_state = info->iwarp_state;
-			iwqp->last_aeq = info->ae_id;
-			spin_unlock_irqrestore(&iwqp->lock, flags);
-			ctx_info = &iwqp->ctx_info;
-			ctx_info->err_rq_idx_valid = true;
-		} else {
-			if (info->ae_id != I40IW_AE_CQ_OPERATION_ERROR)
-				continue;
-		}
-
-		switch (info->ae_id) {
-		case I40IW_AE_LLP_FIN_RECEIVED:
-			if (qp->term_flags)
-				break;
-			if (atomic_inc_return(&iwqp->close_timer_started) == 1) {
-				iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSE_WAIT;
-				if ((iwqp->hw_tcp_state == I40IW_TCP_STATE_CLOSE_WAIT) &&
-				    (iwqp->ibqp_state == IB_QPS_RTS)) {
-					i40iw_next_iw_state(iwqp,
-							    I40IW_QP_STATE_CLOSING, 0, 0, 0);
-					i40iw_cm_disconn(iwqp);
-				}
-				iwqp->cm_id->add_ref(iwqp->cm_id);
-				i40iw_schedule_cm_timer(iwqp->cm_node,
-							(struct i40iw_puda_buf *)iwqp,
-							I40IW_TIMER_TYPE_CLOSE, 1, 0);
-			}
-			break;
-		case I40IW_AE_LLP_CLOSE_COMPLETE:
-			if (qp->term_flags)
-				i40iw_terminate_done(qp, 0);
-			else
-				i40iw_cm_disconn(iwqp);
-			break;
-		case I40IW_AE_BAD_CLOSE:
-		case I40IW_AE_RESET_SENT:
-			i40iw_next_iw_state(iwqp, I40IW_QP_STATE_ERROR, 1, 0, 0);
-			i40iw_cm_disconn(iwqp);
-			break;
-		case I40IW_AE_LLP_CONNECTION_RESET:
-			if (atomic_read(&iwqp->close_timer_started))
-				break;
-			i40iw_cm_disconn(iwqp);
-			break;
-		case I40IW_AE_QP_SUSPEND_COMPLETE:
-			i40iw_qp_suspend_resume(dev, &iwqp->sc_qp, false);
-			break;
-		case I40IW_AE_TERMINATE_SENT:
-			i40iw_terminate_send_fin(qp);
-			break;
-		case I40IW_AE_LLP_TERMINATE_RECEIVED:
-			i40iw_terminate_received(qp, info);
-			break;
-		case I40IW_AE_CQ_OPERATION_ERROR:
-			i40iw_pr_err("Processing an iWARP related AE for CQ misc = 0x%04X\n",
-				     info->ae_id);
-			cq = (struct i40iw_sc_cq *)(unsigned long)info->compl_ctx;
-			iwcq = (struct i40iw_cq *)cq->back_cq;
-
-			if (iwcq->ibcq.event_handler) {
-				struct ib_event ibevent;
-
-				ibevent.device = iwcq->ibcq.device;
-				ibevent.event = IB_EVENT_CQ_ERR;
-				ibevent.element.cq = &iwcq->ibcq;
-				iwcq->ibcq.event_handler(&ibevent, iwcq->ibcq.cq_context);
-			}
-			break;
-		case I40IW_AE_LLP_DOUBT_REACHABILITY:
-			break;
-		case I40IW_AE_PRIV_OPERATION_DENIED:
-		case I40IW_AE_STAG_ZERO_INVALID:
-		case I40IW_AE_IB_RREQ_AND_Q1_FULL:
-		case I40IW_AE_DDP_UBE_INVALID_DDP_VERSION:
-		case I40IW_AE_DDP_UBE_INVALID_MO:
-		case I40IW_AE_DDP_UBE_INVALID_QN:
-		case I40IW_AE_DDP_NO_L_BIT:
-		case I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION:
-		case I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE:
-		case I40IW_AE_ROE_INVALID_RDMA_READ_REQUEST:
-		case I40IW_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
-		case I40IW_AE_INVALID_ARP_ENTRY:
-		case I40IW_AE_INVALID_TCP_OPTION_RCVD:
-		case I40IW_AE_STALE_ARP_ENTRY:
-		case I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR:
-		case I40IW_AE_LLP_SEGMENT_TOO_SMALL:
-		case I40IW_AE_LLP_SYN_RECEIVED:
-		case I40IW_AE_LLP_TOO_MANY_RETRIES:
-		case I40IW_AE_LCE_QP_CATASTROPHIC:
-		case I40IW_AE_LCE_FUNCTION_CATASTROPHIC:
-		case I40IW_AE_LCE_CQ_CATASTROPHIC:
-		case I40IW_AE_UDA_XMIT_DGRAM_TOO_LONG:
-		case I40IW_AE_UDA_XMIT_DGRAM_TOO_SHORT:
-			ctx_info->err_rq_idx_valid = false;
-			fallthrough;
-		default:
-			if (!info->sq && ctx_info->err_rq_idx_valid) {
-				ctx_info->err_rq_idx = info->wqe_idx;
-				ctx_info->tcp_info_valid = false;
-				ctx_info->iwarp_info_valid = false;
-				ret = dev->iw_priv_qp_ops->qp_setctx(&iwqp->sc_qp,
-								     iwqp->host_ctx.va,
-								     ctx_info);
-			}
-			i40iw_terminate_connection(qp, info);
-			break;
-		}
-		if (info->qp)
-			i40iw_qp_rem_ref(&iwqp->ibqp);
-	} while (1);
-
-	if (aeqcnt)
-		dev->aeq_ops->repost_aeq_entries(dev, aeqcnt);
-}
-
-/**
- * i40iw_cqp_manage_abvpt_cmd - send cqp command manage abpvt
- * @iwdev: iwarp device
- * @accel_local_port: port for apbvt
- * @add_port: add or delete port
- */
-static enum i40iw_status_code
-i40iw_cqp_manage_abvpt_cmd(struct i40iw_device *iwdev,
-			   u16 accel_local_port,
-			   bool add_port)
-{
-	struct i40iw_apbvt_info *info;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	enum i40iw_status_code status;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, add_port);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-
-	cqp_info = &cqp_request->info;
-	info = &cqp_info->in.u.manage_apbvt_entry.info;
-
-	memset(info, 0, sizeof(*info));
-	info->add = add_port;
-	info->port = cpu_to_le16(accel_local_port);
-
-	cqp_info->cqp_cmd = OP_MANAGE_APBVT_ENTRY;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.manage_apbvt_entry.cqp = &iwdev->cqp.sc_cqp;
-	cqp_info->in.u.manage_apbvt_entry.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Manage APBVT entry fail");
-
-	return status;
-}
-
-/**
- * i40iw_manage_apbvt - add or delete tcp port
- * @iwdev: iwarp device
- * @accel_local_port: port for apbvt
- * @add_port: add or delete port
- */
-enum i40iw_status_code i40iw_manage_apbvt(struct i40iw_device *iwdev,
-					  u16 accel_local_port,
-					  bool add_port)
-{
-	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
-	enum i40iw_status_code status;
-	unsigned long flags;
-	bool in_use;
-
-	/* apbvt_lock is held across CQP delete APBVT OP (non-waiting) to
-	 * protect against race where add APBVT CQP can race ahead of the delete
-	 * APBVT for same port.
-	 */
-	if (add_port) {
-		spin_lock_irqsave(&cm_core->apbvt_lock, flags);
-		in_use = __test_and_set_bit(accel_local_port,
-					    cm_core->ports_in_use);
-		spin_unlock_irqrestore(&cm_core->apbvt_lock, flags);
-		if (in_use)
-			return 0;
-		return i40iw_cqp_manage_abvpt_cmd(iwdev, accel_local_port,
-						  true);
-	} else {
-		spin_lock_irqsave(&cm_core->apbvt_lock, flags);
-		in_use = i40iw_port_in_use(cm_core, accel_local_port);
-		if (in_use) {
-			spin_unlock_irqrestore(&cm_core->apbvt_lock, flags);
-			return 0;
-		}
-		__clear_bit(accel_local_port, cm_core->ports_in_use);
-		status = i40iw_cqp_manage_abvpt_cmd(iwdev, accel_local_port,
-						    false);
-		spin_unlock_irqrestore(&cm_core->apbvt_lock, flags);
-		return status;
-	}
-}
-
-/**
- * i40iw_manage_arp_cache - manage hw arp cache
- * @iwdev: iwarp device
- * @mac_addr: mac address ptr
- * @ip_addr: ip addr for arp cache
- * @ipv4: flag indicating IPv4 when true
- * @action: add, delete or modify
- */
-void i40iw_manage_arp_cache(struct i40iw_device *iwdev,
-			    unsigned char *mac_addr,
-			    u32 *ip_addr,
-			    bool ipv4,
-			    u32 action)
-{
-	struct i40iw_add_arp_cache_entry_info *info;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	int arp_index;
-
-	arp_index = i40iw_arp_table(iwdev, ip_addr, ipv4, mac_addr, action);
-	if (arp_index < 0)
-		return;
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false);
-	if (!cqp_request)
-		return;
-
-	cqp_info = &cqp_request->info;
-	if (action == I40IW_ARP_ADD) {
-		cqp_info->cqp_cmd = OP_ADD_ARP_CACHE_ENTRY;
-		info = &cqp_info->in.u.add_arp_cache_entry.info;
-		memset(info, 0, sizeof(*info));
-		info->arp_index = cpu_to_le16((u16)arp_index);
-		info->permanent = true;
-		ether_addr_copy(info->mac_addr, mac_addr);
-		cqp_info->in.u.add_arp_cache_entry.scratch = (uintptr_t)cqp_request;
-		cqp_info->in.u.add_arp_cache_entry.cqp = &iwdev->cqp.sc_cqp;
-	} else {
-		cqp_info->cqp_cmd = OP_DELETE_ARP_CACHE_ENTRY;
-		cqp_info->in.u.del_arp_cache_entry.scratch = (uintptr_t)cqp_request;
-		cqp_info->in.u.del_arp_cache_entry.cqp = &iwdev->cqp.sc_cqp;
-		cqp_info->in.u.del_arp_cache_entry.arp_index = arp_index;
-	}
-
-	cqp_info->in.u.add_arp_cache_entry.cqp = &iwdev->cqp.sc_cqp;
-	cqp_info->in.u.add_arp_cache_entry.scratch = (uintptr_t)cqp_request;
-	cqp_info->post_sq = 1;
-	if (i40iw_handle_cqp_op(iwdev, cqp_request))
-		i40iw_pr_err("CQP-OP Add/Del Arp Cache entry fail");
-}
-
-/**
- * i40iw_send_syn_cqp_callback - do syn/ack after qhash
- * @cqp_request: qhash cqp completion
- * @send_ack: flag send ack
- */
-static void i40iw_send_syn_cqp_callback(struct i40iw_cqp_request *cqp_request, u32 send_ack)
-{
-	i40iw_send_syn(cqp_request->param, send_ack);
-}
-
-/**
- * i40iw_manage_qhash - add or modify qhash
- * @iwdev: iwarp device
- * @cminfo: cm info for qhash
- * @etype: type (syn or quad)
- * @mtype: type of qhash
- * @cmnode: cmnode associated with connection
- * @wait: wait for completion
- */
-enum i40iw_status_code i40iw_manage_qhash(struct i40iw_device *iwdev,
-					  struct i40iw_cm_info *cminfo,
-					  enum i40iw_quad_entry_type etype,
-					  enum i40iw_quad_hash_manage_type mtype,
-					  void *cmnode,
-					  bool wait)
-{
-	struct i40iw_qhash_table_info *info;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_sc_vsi *vsi = &iwdev->vsi;
-	enum i40iw_status_code status;
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-
-	cqp_request = i40iw_get_cqp_request(iwcqp, wait);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-	cqp_info = &cqp_request->info;
-	info = &cqp_info->in.u.manage_qhash_table_entry.info;
-	memset(info, 0, sizeof(*info));
-
-	info->vsi = &iwdev->vsi;
-	info->manage = mtype;
-	info->entry_type = etype;
-	if (cminfo->vlan_id != 0xFFFF) {
-		info->vlan_valid = true;
-		info->vlan_id = cpu_to_le16(cminfo->vlan_id);
-	} else {
-		info->vlan_valid = false;
-	}
-
-	info->ipv4_valid = cminfo->ipv4;
-	info->user_pri = cminfo->user_pri;
-	ether_addr_copy(info->mac_addr, iwdev->netdev->dev_addr);
-	info->qp_num = cpu_to_le32(vsi->ilq->qp_id);
-	info->dest_port = cpu_to_le16(cminfo->loc_port);
-	info->dest_ip[0] = cpu_to_le32(cminfo->loc_addr[0]);
-	info->dest_ip[1] = cpu_to_le32(cminfo->loc_addr[1]);
-	info->dest_ip[2] = cpu_to_le32(cminfo->loc_addr[2]);
-	info->dest_ip[3] = cpu_to_le32(cminfo->loc_addr[3]);
-	if (etype == I40IW_QHASH_TYPE_TCP_ESTABLISHED) {
-		info->src_port = cpu_to_le16(cminfo->rem_port);
-		info->src_ip[0] = cpu_to_le32(cminfo->rem_addr[0]);
-		info->src_ip[1] = cpu_to_le32(cminfo->rem_addr[1]);
-		info->src_ip[2] = cpu_to_le32(cminfo->rem_addr[2]);
-		info->src_ip[3] = cpu_to_le32(cminfo->rem_addr[3]);
-	}
-	if (cmnode) {
-		cqp_request->callback_fcn = i40iw_send_syn_cqp_callback;
-		cqp_request->param = (void *)cmnode;
-	}
-
-	if (info->ipv4_valid)
-		i40iw_debug(dev, I40IW_DEBUG_CM,
-			    "%s:%s IP=%pI4, port=%d, mac=%pM, vlan_id=%d\n",
-			    __func__, (!mtype) ? "DELETE" : "ADD",
-			    info->dest_ip,
-			    info->dest_port, info->mac_addr, cminfo->vlan_id);
-	else
-		i40iw_debug(dev, I40IW_DEBUG_CM,
-			    "%s:%s IP=%pI6, port=%d, mac=%pM, vlan_id=%d\n",
-			    __func__, (!mtype) ? "DELETE" : "ADD",
-			    info->dest_ip,
-			    info->dest_port, info->mac_addr, cminfo->vlan_id);
-	cqp_info->in.u.manage_qhash_table_entry.cqp = &iwdev->cqp.sc_cqp;
-	cqp_info->in.u.manage_qhash_table_entry.scratch = (uintptr_t)cqp_request;
-	cqp_info->cqp_cmd = OP_MANAGE_QHASH_TABLE_ENTRY;
-	cqp_info->post_sq = 1;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Manage Qhash Entry fail");
-	return status;
-}
-
-/**
- * i40iw_hw_flush_wqes - flush qp's wqe
- * @iwdev: iwarp device
- * @qp: hardware control qp
- * @info: info for flush
- * @wait: flag wait for completion
- */
-enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev,
-					   struct i40iw_sc_qp *qp,
-					   struct i40iw_qp_flush_info *info,
-					   bool wait)
-{
-	enum i40iw_status_code status;
-	struct i40iw_qp_flush_info *hw_info;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	struct i40iw_qp *iwqp = (struct i40iw_qp *)qp->back_qp;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-
-	cqp_info = &cqp_request->info;
-	hw_info = &cqp_request->info.in.u.qp_flush_wqes.info;
-	memcpy(hw_info, info, sizeof(*hw_info));
-
-	cqp_info->cqp_cmd = OP_QP_FLUSH_WQES;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.qp_flush_wqes.qp = qp;
-	cqp_info->in.u.qp_flush_wqes.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status) {
-		i40iw_pr_err("CQP-OP Flush WQE's fail");
-		complete(&iwqp->sq_drained);
-		complete(&iwqp->rq_drained);
-		return status;
-	}
-	if (!cqp_request->compl_info.maj_err_code) {
-		switch (cqp_request->compl_info.min_err_code) {
-		case I40IW_CQP_COMPL_RQ_WQE_FLUSHED:
-			complete(&iwqp->sq_drained);
-			break;
-		case I40IW_CQP_COMPL_SQ_WQE_FLUSHED:
-			complete(&iwqp->rq_drained);
-			break;
-		case I40IW_CQP_COMPL_RQ_SQ_WQE_FLUSHED:
-			break;
-		default:
-			complete(&iwqp->sq_drained);
-			complete(&iwqp->rq_drained);
-			break;
-		}
-	}
-
-	return 0;
-}
-
-/**
- * i40iw_gen_ae - generate AE
- * @iwdev: iwarp device
- * @qp: qp associated with AE
- * @info: info for ae
- * @wait: wait for completion
- */
-void i40iw_gen_ae(struct i40iw_device *iwdev,
-		  struct i40iw_sc_qp *qp,
-		  struct i40iw_gen_ae_info *info,
-		  bool wait)
-{
-	struct i40iw_gen_ae_info *ae_info;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait);
-	if (!cqp_request)
-		return;
-
-	cqp_info = &cqp_request->info;
-	ae_info = &cqp_request->info.in.u.gen_ae.info;
-	memcpy(ae_info, info, sizeof(*ae_info));
-
-	cqp_info->cqp_cmd = OP_GEN_AE;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.gen_ae.qp = qp;
-	cqp_info->in.u.gen_ae.scratch = (uintptr_t)cqp_request;
-	if (i40iw_handle_cqp_op(iwdev, cqp_request))
-		i40iw_pr_err("CQP OP failed attempting to generate ae_code=0x%x\n",
-			     info->ae_code);
-}
-
-/**
- * i40iw_hw_manage_vf_pble_bp - manage vf pbles
- * @iwdev: iwarp device
- * @info: info for managing pble
- * @wait: flag wait for completion
- */
-enum i40iw_status_code i40iw_hw_manage_vf_pble_bp(struct i40iw_device *iwdev,
-						  struct i40iw_manage_vf_pble_info *info,
-						  bool wait)
-{
-	enum i40iw_status_code status;
-	struct i40iw_manage_vf_pble_info *hw_info;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-
-	if ((iwdev->init_state < CCQ_CREATED) && wait)
-		wait = false;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-
-	cqp_info = &cqp_request->info;
-	hw_info = &cqp_request->info.in.u.manage_vf_pble_bp.info;
-	memcpy(hw_info, info, sizeof(*hw_info));
-
-	cqp_info->cqp_cmd = OP_MANAGE_VF_PBLE_BP;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.manage_vf_pble_bp.cqp = &iwdev->cqp.sc_cqp;
-	cqp_info->in.u.manage_vf_pble_bp.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Manage VF pble_bp fail");
-	return status;
-}
-
-/**
- * i40iw_get_ib_wc - return change flush code to IB's
- * @opcode: iwarp flush code
- */
-static enum ib_wc_status i40iw_get_ib_wc(enum i40iw_flush_opcode opcode)
-{
-	switch (opcode) {
-	case FLUSH_PROT_ERR:
-		return IB_WC_LOC_PROT_ERR;
-	case FLUSH_REM_ACCESS_ERR:
-		return IB_WC_REM_ACCESS_ERR;
-	case FLUSH_LOC_QP_OP_ERR:
-		return IB_WC_LOC_QP_OP_ERR;
-	case FLUSH_REM_OP_ERR:
-		return IB_WC_REM_OP_ERR;
-	case FLUSH_LOC_LEN_ERR:
-		return IB_WC_LOC_LEN_ERR;
-	case FLUSH_GENERAL_ERR:
-		return IB_WC_GENERAL_ERR;
-	case FLUSH_FATAL_ERR:
-	default:
-		return IB_WC_FATAL_ERR;
-	}
-}
-
-/**
- * i40iw_set_flush_info - set flush info
- * @pinfo: set flush info
- * @min: minor err
- * @maj: major err
- * @opcode: flush error code
- */
-static void i40iw_set_flush_info(struct i40iw_qp_flush_info *pinfo,
-				 u16 *min,
-				 u16 *maj,
-				 enum i40iw_flush_opcode opcode)
-{
-	*min = (u16)i40iw_get_ib_wc(opcode);
-	*maj = CQE_MAJOR_DRV;
-	pinfo->userflushcode = true;
-}
-
-/**
- * i40iw_flush_wqes - flush wqe for qp
- * @iwdev: iwarp device
- * @iwqp: qp to flush wqes
- */
-void i40iw_flush_wqes(struct i40iw_device *iwdev, struct i40iw_qp *iwqp)
-{
-	struct i40iw_qp_flush_info info;
-	struct i40iw_qp_flush_info *pinfo = &info;
-
-	struct i40iw_sc_qp *qp = &iwqp->sc_qp;
-
-	memset(pinfo, 0, sizeof(*pinfo));
-	info.sq = true;
-	info.rq = true;
-	if (qp->term_flags) {
-		i40iw_set_flush_info(pinfo, &pinfo->sq_minor_code,
-				     &pinfo->sq_major_code, qp->flush_code);
-		i40iw_set_flush_info(pinfo, &pinfo->rq_minor_code,
-				     &pinfo->rq_major_code, qp->flush_code);
-	}
-	(void)i40iw_hw_flush_wqes(iwdev, &iwqp->sc_qp, &info, true);
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
deleted file mode 100644
index 364f69cd620f..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ /dev/null
@@ -1,2064 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/if_vlan.h>
-#include <net/addrconf.h>
-
-#include "i40iw.h"
-#include "i40iw_register.h"
-#include <net/netevent.h>
-#define CLIENT_IW_INTERFACE_VERSION_MAJOR 0
-#define CLIENT_IW_INTERFACE_VERSION_MINOR 01
-#define CLIENT_IW_INTERFACE_VERSION_BUILD 00
-
-#define DRV_VERSION_MAJOR 0
-#define DRV_VERSION_MINOR 5
-#define DRV_VERSION_BUILD 123
-#define DRV_VERSION	__stringify(DRV_VERSION_MAJOR) "."		\
-	__stringify(DRV_VERSION_MINOR) "." __stringify(DRV_VERSION_BUILD)
-
-static int debug;
-module_param(debug, int, 0644);
-MODULE_PARM_DESC(debug, "debug flags: 0=disabled (default), 0x7fffffff=all");
-
-static int resource_profile;
-module_param(resource_profile, int, 0644);
-MODULE_PARM_DESC(resource_profile,
-		 "Resource Profile: 0=no VF RDMA support (default), 1=Weighted VF, 2=Even Distribution");
-
-static int max_rdma_vfs = 32;
-module_param(max_rdma_vfs, int, 0644);
-MODULE_PARM_DESC(max_rdma_vfs, "Maximum VF count: 0-32 32=default");
-static int mpa_version = 2;
-module_param(mpa_version, int, 0644);
-MODULE_PARM_DESC(mpa_version, "MPA version to be used in MPA Req/Resp 1 or 2");
-
-MODULE_AUTHOR("Intel Corporation, <e1000-rdma@lists.sourceforge.net>");
-MODULE_DESCRIPTION("Intel(R) Ethernet Connection X722 iWARP RDMA Driver");
-MODULE_LICENSE("Dual BSD/GPL");
-
-static struct i40e_client i40iw_client;
-static char i40iw_client_name[I40E_CLIENT_STR_LENGTH] = "i40iw";
-
-static LIST_HEAD(i40iw_handlers);
-static DEFINE_SPINLOCK(i40iw_handler_lock);
-
-static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev,
-						  u32 vf_id, u8 *msg, u16 len);
-
-static struct notifier_block i40iw_inetaddr_notifier = {
-	.notifier_call = i40iw_inetaddr_event
-};
-
-static struct notifier_block i40iw_inetaddr6_notifier = {
-	.notifier_call = i40iw_inet6addr_event
-};
-
-static struct notifier_block i40iw_net_notifier = {
-	.notifier_call = i40iw_net_event
-};
-
-static struct notifier_block i40iw_netdevice_notifier = {
-	.notifier_call = i40iw_netdevice_event
-};
-
-/**
- * i40iw_find_i40e_handler - find a handler given a client info
- * @ldev: pointer to a client info
- */
-static struct i40iw_handler *i40iw_find_i40e_handler(struct i40e_info *ldev)
-{
-	struct i40iw_handler *hdl;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i40iw_handler_lock, flags);
-	list_for_each_entry(hdl, &i40iw_handlers, list) {
-		if (hdl->ldev.netdev == ldev->netdev) {
-			spin_unlock_irqrestore(&i40iw_handler_lock, flags);
-			return hdl;
-		}
-	}
-	spin_unlock_irqrestore(&i40iw_handler_lock, flags);
-	return NULL;
-}
-
-/**
- * i40iw_find_netdev - find a handler given a netdev
- * @netdev: pointer to net_device
- */
-struct i40iw_handler *i40iw_find_netdev(struct net_device *netdev)
-{
-	struct i40iw_handler *hdl;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i40iw_handler_lock, flags);
-	list_for_each_entry(hdl, &i40iw_handlers, list) {
-		if (hdl->ldev.netdev == netdev) {
-			spin_unlock_irqrestore(&i40iw_handler_lock, flags);
-			return hdl;
-		}
-	}
-	spin_unlock_irqrestore(&i40iw_handler_lock, flags);
-	return NULL;
-}
-
-/**
- * i40iw_add_handler - add a handler to the list
- * @hdl: handler to be added to the handler list
- */
-static void i40iw_add_handler(struct i40iw_handler *hdl)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&i40iw_handler_lock, flags);
-	list_add(&hdl->list, &i40iw_handlers);
-	spin_unlock_irqrestore(&i40iw_handler_lock, flags);
-}
-
-/**
- * i40iw_del_handler - delete a handler from the list
- * @hdl: handler to be deleted from the handler list
- */
-static int i40iw_del_handler(struct i40iw_handler *hdl)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&i40iw_handler_lock, flags);
-	list_del(&hdl->list);
-	spin_unlock_irqrestore(&i40iw_handler_lock, flags);
-	return 0;
-}
-
-/**
- * i40iw_enable_intr - set up device interrupts
- * @dev: hardware control device structure
- * @msix_id: id of the interrupt to be enabled
- */
-static void i40iw_enable_intr(struct i40iw_sc_dev *dev, u32 msix_id)
-{
-	u32 val;
-
-	val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
-		I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
-		(3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT);
-	if (dev->is_pf)
-		i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_id - 1), val);
-	else
-		i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_id - 1), val);
-}
-
-/**
- * i40iw_dpc - tasklet for aeq and ceq 0
- * @t: Timer context to fetch pointer to iwarp device
- */
-static void i40iw_dpc(struct tasklet_struct *t)
-{
-	struct i40iw_device *iwdev = from_tasklet(iwdev, t, dpc_tasklet);
-
-	if (iwdev->msix_shared)
-		i40iw_process_ceq(iwdev, iwdev->ceqlist);
-	i40iw_process_aeq(iwdev);
-	i40iw_enable_intr(&iwdev->sc_dev, iwdev->iw_msixtbl[0].idx);
-}
-
-/**
- * i40iw_ceq_dpc - dpc handler for CEQ
- * @t: Timer context to fetch pointer to CEQ data
- */
-static void i40iw_ceq_dpc(struct tasklet_struct *t)
-{
-	struct i40iw_ceq *iwceq = from_tasklet(iwceq, t, dpc_tasklet);
-	struct i40iw_device *iwdev = iwceq->iwdev;
-
-	i40iw_process_ceq(iwdev, iwceq);
-	i40iw_enable_intr(&iwdev->sc_dev, iwceq->msix_idx);
-}
-
-/**
- * i40iw_irq_handler - interrupt handler for aeq and ceq0
- * @irq: Interrupt request number
- * @data: iwarp device
- */
-static irqreturn_t i40iw_irq_handler(int irq, void *data)
-{
-	struct i40iw_device *iwdev = (struct i40iw_device *)data;
-
-	tasklet_schedule(&iwdev->dpc_tasklet);
-	return IRQ_HANDLED;
-}
-
-/**
- * i40iw_destroy_cqp  - destroy control qp
- * @iwdev: iwarp device
- * @free_hwcqp: 1 if CQP should be destroyed
- *
- * Issue destroy cqp request and
- * free the resources associated with the cqp
- */
-static void i40iw_destroy_cqp(struct i40iw_device *iwdev, bool free_hwcqp)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_cqp *cqp = &iwdev->cqp;
-
-	if (free_hwcqp)
-		dev->cqp_ops->cqp_destroy(dev->cqp);
-
-	i40iw_cleanup_pending_cqp_op(iwdev);
-
-	i40iw_free_dma_mem(dev->hw, &cqp->sq);
-	kfree(cqp->scratch_array);
-	iwdev->cqp.scratch_array = NULL;
-
-	kfree(cqp->cqp_requests);
-	cqp->cqp_requests = NULL;
-}
-
-/**
- * i40iw_disable_irq - disable device interrupts
- * @dev: hardware control device structure
- * @msix_vec: msix vector to disable irq
- * @dev_id: parameter to pass to free_irq (used during irq setup)
- *
- * The function is called when destroying aeq/ceq
- */
-static void i40iw_disable_irq(struct i40iw_sc_dev *dev,
-			      struct i40iw_msix_vector *msix_vec,
-			      void *dev_id)
-{
-	if (dev->is_pf)
-		i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_vec->idx - 1), 0);
-	else
-		i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_vec->idx - 1), 0);
-	irq_set_affinity_hint(msix_vec->irq, NULL);
-	free_irq(msix_vec->irq, dev_id);
-}
-
-/**
- * i40iw_destroy_aeq - destroy aeq
- * @iwdev: iwarp device
- *
- * Issue a destroy aeq request and
- * free the resources associated with the aeq
- * The function is called during driver unload
- */
-static void i40iw_destroy_aeq(struct i40iw_device *iwdev)
-{
-	enum i40iw_status_code status = I40IW_ERR_NOT_READY;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_aeq *aeq = &iwdev->aeq;
-
-	if (!iwdev->msix_shared)
-		i40iw_disable_irq(dev, iwdev->iw_msixtbl, (void *)iwdev);
-	if (iwdev->reset)
-		goto exit;
-
-	if (!dev->aeq_ops->aeq_destroy(&aeq->sc_aeq, 0, 1))
-		status = dev->aeq_ops->aeq_destroy_done(&aeq->sc_aeq);
-	if (status)
-		i40iw_pr_err("destroy aeq failed %d\n", status);
-
-exit:
-	i40iw_free_dma_mem(dev->hw, &aeq->mem);
-}
-
-/**
- * i40iw_destroy_ceq - destroy ceq
- * @iwdev: iwarp device
- * @iwceq: ceq to be destroyed
- *
- * Issue a destroy ceq request and
- * free the resources associated with the ceq
- */
-static void i40iw_destroy_ceq(struct i40iw_device *iwdev,
-			      struct i40iw_ceq *iwceq)
-{
-	enum i40iw_status_code status;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-
-	if (iwdev->reset)
-		goto exit;
-
-	status = dev->ceq_ops->ceq_destroy(&iwceq->sc_ceq, 0, 1);
-	if (status) {
-		i40iw_pr_err("ceq destroy command failed %d\n", status);
-		goto exit;
-	}
-
-	status = dev->ceq_ops->cceq_destroy_done(&iwceq->sc_ceq);
-	if (status)
-		i40iw_pr_err("ceq destroy completion failed %d\n", status);
-exit:
-	i40iw_free_dma_mem(dev->hw, &iwceq->mem);
-}
-
-/**
- * i40iw_dele_ceqs - destroy all ceq's
- * @iwdev: iwarp device
- *
- * Go through all of the device ceq's and for each ceq
- * disable the ceq interrupt and destroy the ceq
- */
-static void i40iw_dele_ceqs(struct i40iw_device *iwdev)
-{
-	u32 i = 0;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_ceq *iwceq = iwdev->ceqlist;
-	struct i40iw_msix_vector *msix_vec = iwdev->iw_msixtbl;
-
-	if (iwdev->msix_shared) {
-		i40iw_disable_irq(dev, msix_vec, (void *)iwdev);
-		i40iw_destroy_ceq(iwdev, iwceq);
-		iwceq++;
-		i++;
-	}
-
-	for (msix_vec++; i < iwdev->ceqs_count; i++, msix_vec++, iwceq++) {
-		i40iw_disable_irq(dev, msix_vec, (void *)iwceq);
-		i40iw_destroy_ceq(iwdev, iwceq);
-	}
-
-	iwdev->sc_dev.ceq_valid = false;
-}
-
-/**
- * i40iw_destroy_ccq - destroy control cq
- * @iwdev: iwarp device
- *
- * Issue destroy ccq request and
- * free the resources associated with the ccq
- */
-static void i40iw_destroy_ccq(struct i40iw_device *iwdev)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_ccq *ccq = &iwdev->ccq;
-	enum i40iw_status_code status = 0;
-
-	if (!iwdev->reset)
-		status = dev->ccq_ops->ccq_destroy(dev->ccq, 0, true);
-	if (status)
-		i40iw_pr_err("ccq destroy failed %d\n", status);
-	i40iw_free_dma_mem(dev->hw, &ccq->mem_cq);
-}
-
-/* types of hmc objects */
-static enum i40iw_hmc_rsrc_type iw_hmc_obj_types[] = {
-	I40IW_HMC_IW_QP,
-	I40IW_HMC_IW_CQ,
-	I40IW_HMC_IW_HTE,
-	I40IW_HMC_IW_ARP,
-	I40IW_HMC_IW_APBVT_ENTRY,
-	I40IW_HMC_IW_MR,
-	I40IW_HMC_IW_XF,
-	I40IW_HMC_IW_XFFL,
-	I40IW_HMC_IW_Q1,
-	I40IW_HMC_IW_Q1FL,
-	I40IW_HMC_IW_TIMER,
-};
-
-/**
- * i40iw_close_hmc_objects_type - delete hmc objects of a given type
- * @dev: iwarp device
- * @obj_type: the hmc object type to be deleted
- * @hmc_info: pointer to the HMC configuration information
- * @is_pf: true if the function is PF otherwise false
- * @reset: true if called before reset
- */
-static void i40iw_close_hmc_objects_type(struct i40iw_sc_dev *dev,
-					 enum i40iw_hmc_rsrc_type obj_type,
-					 struct i40iw_hmc_info *hmc_info,
-					 bool is_pf,
-					 bool reset)
-{
-	struct i40iw_hmc_del_obj_info info;
-
-	memset(&info, 0, sizeof(info));
-	info.hmc_info = hmc_info;
-	info.rsrc_type = obj_type;
-	info.count = hmc_info->hmc_obj[obj_type].cnt;
-	info.is_pf = is_pf;
-	if (dev->hmc_ops->del_hmc_object(dev, &info, reset))
-		i40iw_pr_err("del obj of type %d failed\n", obj_type);
-}
-
-/**
- * i40iw_del_hmc_objects - remove all device hmc objects
- * @dev: iwarp device
- * @hmc_info: hmc_info to free
- * @is_pf: true if hmc_info belongs to PF, not vf nor allocated
- *	   by PF on behalf of VF
- * @reset: true if called before reset
- */
-static void i40iw_del_hmc_objects(struct i40iw_sc_dev *dev,
-				  struct i40iw_hmc_info *hmc_info,
-				  bool is_pf,
-				  bool reset)
-{
-	unsigned int i;
-
-	for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++)
-		i40iw_close_hmc_objects_type(dev, iw_hmc_obj_types[i], hmc_info, is_pf, reset);
-}
-
-/**
- * i40iw_ceq_handler - interrupt handler for ceq
- * @irq: interrupt request number
- * @data: ceq pointer
- */
-static irqreturn_t i40iw_ceq_handler(int irq, void *data)
-{
-	struct i40iw_ceq *iwceq = (struct i40iw_ceq *)data;
-
-	if (iwceq->irq != irq)
-		i40iw_pr_err("expected irq = %d received irq = %d\n", iwceq->irq, irq);
-	tasklet_schedule(&iwceq->dpc_tasklet);
-	return IRQ_HANDLED;
-}
-
-/**
- * i40iw_create_hmc_obj_type - create hmc object of a given type
- * @dev: hardware control device structure
- * @info: information for the hmc object to create
- */
-static enum i40iw_status_code i40iw_create_hmc_obj_type(struct i40iw_sc_dev *dev,
-							struct i40iw_hmc_create_obj_info *info)
-{
-	return dev->hmc_ops->create_hmc_object(dev, info);
-}
-
-/**
- * i40iw_create_hmc_objs - create all hmc objects for the device
- * @iwdev: iwarp device
- * @is_pf: true if the function is PF otherwise false
- *
- * Create the device hmc objects and allocate hmc pages
- * Return 0 if successful, otherwise clean up and return error
- */
-static enum i40iw_status_code i40iw_create_hmc_objs(struct i40iw_device *iwdev,
-						    bool is_pf)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_hmc_create_obj_info info;
-	enum i40iw_status_code status;
-	int i;
-
-	memset(&info, 0, sizeof(info));
-	info.hmc_info = dev->hmc_info;
-	info.is_pf = is_pf;
-	info.entry_type = iwdev->sd_type;
-	for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++) {
-		info.rsrc_type = iw_hmc_obj_types[i];
-		info.count = dev->hmc_info->hmc_obj[info.rsrc_type].cnt;
-		info.add_sd_cnt = 0;
-		status = i40iw_create_hmc_obj_type(dev, &info);
-		if (status) {
-			i40iw_pr_err("create obj type %d status = %d\n",
-				     iw_hmc_obj_types[i], status);
-			break;
-		}
-	}
-	if (!status)
-		return (dev->cqp_misc_ops->static_hmc_pages_allocated(dev->cqp, 0,
-								      dev->hmc_fn_id,
-								      true, true));
-
-	while (i) {
-		i--;
-		/* destroy the hmc objects of a given type */
-		i40iw_close_hmc_objects_type(dev,
-					     iw_hmc_obj_types[i],
-					     dev->hmc_info,
-					     is_pf,
-					     false);
-	}
-	return status;
-}
-
-/**
- * i40iw_obj_aligned_mem - get aligned memory from device allocated memory
- * @iwdev: iwarp device
- * @memptr: points to the memory addresses
- * @size: size of memory needed
- * @mask: mask for the aligned memory
- *
- * Get aligned memory of the requested size and
- * update the memptr to point to the new aligned memory
- * Return 0 if successful, otherwise return no memory error
- */
-enum i40iw_status_code i40iw_obj_aligned_mem(struct i40iw_device *iwdev,
-					     struct i40iw_dma_mem *memptr,
-					     u32 size,
-					     u32 mask)
-{
-	unsigned long va, newva;
-	unsigned long extra;
-
-	va = (unsigned long)iwdev->obj_next.va;
-	newva = va;
-	if (mask)
-		newva = ALIGN(va, (mask + 1));
-	extra = newva - va;
-	memptr->va = (u8 *)va + extra;
-	memptr->pa = iwdev->obj_next.pa + extra;
-	memptr->size = size;
-	if ((memptr->va + size) > (iwdev->obj_mem.va + iwdev->obj_mem.size))
-		return I40IW_ERR_NO_MEMORY;
-
-	iwdev->obj_next.va = memptr->va + size;
-	iwdev->obj_next.pa = memptr->pa + size;
-	return 0;
-}
-
-/**
- * i40iw_create_cqp - create control qp
- * @iwdev: iwarp device
- *
- * Return 0, if the cqp and all the resources associated with it
- * are successfully created, otherwise return error
- */
-static enum i40iw_status_code i40iw_create_cqp(struct i40iw_device *iwdev)
-{
-	enum i40iw_status_code status;
-	u32 sqsize = I40IW_CQP_SW_SQSIZE_2048;
-	struct i40iw_dma_mem mem;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_cqp_init_info cqp_init_info;
-	struct i40iw_cqp *cqp = &iwdev->cqp;
-	u16 maj_err, min_err;
-	int i;
-
-	cqp->cqp_requests = kcalloc(sqsize, sizeof(*cqp->cqp_requests), GFP_KERNEL);
-	if (!cqp->cqp_requests)
-		return I40IW_ERR_NO_MEMORY;
-	cqp->scratch_array = kcalloc(sqsize, sizeof(*cqp->scratch_array), GFP_KERNEL);
-	if (!cqp->scratch_array) {
-		kfree(cqp->cqp_requests);
-		return I40IW_ERR_NO_MEMORY;
-	}
-	dev->cqp = &cqp->sc_cqp;
-	dev->cqp->dev = dev;
-	memset(&cqp_init_info, 0, sizeof(cqp_init_info));
-	status = i40iw_allocate_dma_mem(dev->hw, &cqp->sq,
-					(sizeof(struct i40iw_cqp_sq_wqe) * sqsize),
-					I40IW_CQP_ALIGNMENT);
-	if (status)
-		goto exit;
-	status = i40iw_obj_aligned_mem(iwdev, &mem, sizeof(struct i40iw_cqp_ctx),
-				       I40IW_HOST_CTX_ALIGNMENT_MASK);
-	if (status)
-		goto exit;
-	dev->cqp->host_ctx_pa = mem.pa;
-	dev->cqp->host_ctx = mem.va;
-	/* populate the cqp init info */
-	cqp_init_info.dev = dev;
-	cqp_init_info.sq_size = sqsize;
-	cqp_init_info.sq = cqp->sq.va;
-	cqp_init_info.sq_pa = cqp->sq.pa;
-	cqp_init_info.host_ctx_pa = mem.pa;
-	cqp_init_info.host_ctx = mem.va;
-	cqp_init_info.hmc_profile = iwdev->resource_profile;
-	cqp_init_info.enabled_vf_count = iwdev->max_rdma_vfs;
-	cqp_init_info.scratch_array = cqp->scratch_array;
-	status = dev->cqp_ops->cqp_init(dev->cqp, &cqp_init_info);
-	if (status) {
-		i40iw_pr_err("cqp init status %d\n", status);
-		goto exit;
-	}
-	status = dev->cqp_ops->cqp_create(dev->cqp, &maj_err, &min_err);
-	if (status) {
-		i40iw_pr_err("cqp create status %d maj_err %d min_err %d\n",
-			     status, maj_err, min_err);
-		goto exit;
-	}
-	spin_lock_init(&cqp->req_lock);
-	INIT_LIST_HEAD(&cqp->cqp_avail_reqs);
-	INIT_LIST_HEAD(&cqp->cqp_pending_reqs);
-	/* init the waitq of the cqp_requests and add them to the list */
-	for (i = 0; i < sqsize; i++) {
-		init_waitqueue_head(&cqp->cqp_requests[i].waitq);
-		list_add_tail(&cqp->cqp_requests[i].list, &cqp->cqp_avail_reqs);
-	}
-	return 0;
-exit:
-	/* clean up the created resources */
-	i40iw_destroy_cqp(iwdev, false);
-	return status;
-}
-
-/**
- * i40iw_create_ccq - create control cq
- * @iwdev: iwarp device
- *
- * Return 0, if the ccq and the resources associated with it
- * are successfully created, otherwise return error
- */
-static enum i40iw_status_code i40iw_create_ccq(struct i40iw_device *iwdev)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_dma_mem mem;
-	enum i40iw_status_code status;
-	struct i40iw_ccq_init_info info;
-	struct i40iw_ccq *ccq = &iwdev->ccq;
-
-	memset(&info, 0, sizeof(info));
-	dev->ccq = &ccq->sc_cq;
-	dev->ccq->dev = dev;
-	info.dev = dev;
-	ccq->shadow_area.size = sizeof(struct i40iw_cq_shadow_area);
-	ccq->mem_cq.size = sizeof(struct i40iw_cqe) * IW_CCQ_SIZE;
-	status = i40iw_allocate_dma_mem(dev->hw, &ccq->mem_cq,
-					ccq->mem_cq.size, I40IW_CQ0_ALIGNMENT);
-	if (status)
-		goto exit;
-	status = i40iw_obj_aligned_mem(iwdev, &mem, ccq->shadow_area.size,
-				       I40IW_SHADOWAREA_MASK);
-	if (status)
-		goto exit;
-	ccq->sc_cq.back_cq = (void *)ccq;
-	/* populate the ccq init info */
-	info.cq_base = ccq->mem_cq.va;
-	info.cq_pa = ccq->mem_cq.pa;
-	info.num_elem = IW_CCQ_SIZE;
-	info.shadow_area = mem.va;
-	info.shadow_area_pa = mem.pa;
-	info.ceqe_mask = false;
-	info.ceq_id_valid = true;
-	info.shadow_read_threshold = 16;
-	status = dev->ccq_ops->ccq_init(dev->ccq, &info);
-	if (!status)
-		status = dev->ccq_ops->ccq_create(dev->ccq, 0, true, true);
-exit:
-	if (status)
-		i40iw_free_dma_mem(dev->hw, &ccq->mem_cq);
-	return status;
-}
-
-/**
- * i40iw_configure_ceq_vector - set up the msix interrupt vector for ceq
- * @iwdev: iwarp device
- * @msix_vec: interrupt vector information
- * @iwceq: ceq associated with the vector
- * @ceq_id: the id number of the iwceq
- *
- * Allocate interrupt resources and enable irq handling
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_configure_ceq_vector(struct i40iw_device *iwdev,
-							 struct i40iw_ceq *iwceq,
-							 u32 ceq_id,
-							 struct i40iw_msix_vector *msix_vec)
-{
-	enum i40iw_status_code status;
-
-	if (iwdev->msix_shared && !ceq_id) {
-		tasklet_setup(&iwdev->dpc_tasklet, i40iw_dpc);
-		status = request_irq(msix_vec->irq, i40iw_irq_handler, 0, "AEQCEQ", iwdev);
-	} else {
-		tasklet_setup(&iwceq->dpc_tasklet, i40iw_ceq_dpc);
-		status = request_irq(msix_vec->irq, i40iw_ceq_handler, 0, "CEQ", iwceq);
-	}
-
-	cpumask_clear(&msix_vec->mask);
-	cpumask_set_cpu(msix_vec->cpu_affinity, &msix_vec->mask);
-	irq_set_affinity_hint(msix_vec->irq, &msix_vec->mask);
-
-	if (status) {
-		i40iw_pr_err("ceq irq config fail\n");
-		return I40IW_ERR_CONFIG;
-	}
-	msix_vec->ceq_id = ceq_id;
-
-	return 0;
-}
-
-/**
- * i40iw_create_ceq - create completion event queue
- * @iwdev: iwarp device
- * @iwceq: pointer to the ceq resources to be created
- * @ceq_id: the id number of the iwceq
- *
- * Return 0, if the ceq and the resources associated with it
- * are successfully created, otherwise return error
- */
-static enum i40iw_status_code i40iw_create_ceq(struct i40iw_device *iwdev,
-					       struct i40iw_ceq *iwceq,
-					       u32 ceq_id)
-{
-	enum i40iw_status_code status;
-	struct i40iw_ceq_init_info info;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	u64 scratch;
-
-	memset(&info, 0, sizeof(info));
-	info.ceq_id = ceq_id;
-	iwceq->iwdev = iwdev;
-	iwceq->mem.size = sizeof(struct i40iw_ceqe) *
-		iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt;
-	status = i40iw_allocate_dma_mem(dev->hw, &iwceq->mem, iwceq->mem.size,
-					I40IW_CEQ_ALIGNMENT);
-	if (status)
-		goto exit;
-	info.ceq_id = ceq_id;
-	info.ceqe_base = iwceq->mem.va;
-	info.ceqe_pa = iwceq->mem.pa;
-
-	info.elem_cnt = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt;
-	iwceq->sc_ceq.ceq_id = ceq_id;
-	info.dev = dev;
-	scratch = (uintptr_t)&iwdev->cqp.sc_cqp;
-	status = dev->ceq_ops->ceq_init(&iwceq->sc_ceq, &info);
-	if (!status)
-		status = dev->ceq_ops->cceq_create(&iwceq->sc_ceq, scratch);
-
-exit:
-	if (status)
-		i40iw_free_dma_mem(dev->hw, &iwceq->mem);
-	return status;
-}
-
-void i40iw_request_reset(struct i40iw_device *iwdev)
-{
-	struct i40e_info *ldev = iwdev->ldev;
-
-	ldev->ops->request_reset(ldev, iwdev->client, 1);
-}
-
-/**
- * i40iw_setup_ceqs - manage the device ceq's and their interrupt resources
- * @iwdev: iwarp device
- * @ldev: i40e lan device
- *
- * Allocate a list for all device completion event queues
- * Create the ceq's and configure their msix interrupt vectors
- * Return 0, if at least one ceq is successfully set up, otherwise return error
- */
-static enum i40iw_status_code i40iw_setup_ceqs(struct i40iw_device *iwdev,
-					       struct i40e_info *ldev)
-{
-	u32 i;
-	u32 ceq_id;
-	struct i40iw_ceq *iwceq;
-	struct i40iw_msix_vector *msix_vec;
-	enum i40iw_status_code status = 0;
-	u32 num_ceqs;
-
-	if (ldev && ldev->ops && ldev->ops->setup_qvlist) {
-		status = ldev->ops->setup_qvlist(ldev, &i40iw_client,
-						 iwdev->iw_qvlist);
-		if (status)
-			goto exit;
-	} else {
-		status = I40IW_ERR_BAD_PTR;
-		goto exit;
-	}
-
-	num_ceqs = min(iwdev->msix_count, iwdev->sc_dev.hmc_fpm_misc.max_ceqs);
-	iwdev->ceqlist = kcalloc(num_ceqs, sizeof(*iwdev->ceqlist), GFP_KERNEL);
-	if (!iwdev->ceqlist) {
-		status = I40IW_ERR_NO_MEMORY;
-		goto exit;
-	}
-	i = (iwdev->msix_shared) ? 0 : 1;
-	for (ceq_id = 0; i < num_ceqs; i++, ceq_id++) {
-		iwceq = &iwdev->ceqlist[ceq_id];
-		status = i40iw_create_ceq(iwdev, iwceq, ceq_id);
-		if (status) {
-			i40iw_pr_err("create ceq status = %d\n", status);
-			break;
-		}
-
-		msix_vec = &iwdev->iw_msixtbl[i];
-		iwceq->irq = msix_vec->irq;
-		iwceq->msix_idx = msix_vec->idx;
-		status = i40iw_configure_ceq_vector(iwdev, iwceq, ceq_id, msix_vec);
-		if (status) {
-			i40iw_destroy_ceq(iwdev, iwceq);
-			break;
-		}
-		i40iw_enable_intr(&iwdev->sc_dev, msix_vec->idx);
-		iwdev->ceqs_count++;
-	}
-exit:
-	if (status && !iwdev->ceqs_count) {
-		kfree(iwdev->ceqlist);
-		iwdev->ceqlist = NULL;
-		return status;
-	} else {
-		iwdev->sc_dev.ceq_valid = true;
-		return 0;
-	}
-
-}
-
-/**
- * i40iw_configure_aeq_vector - set up the msix vector for aeq
- * @iwdev: iwarp device
- *
- * Allocate interrupt resources and enable irq handling
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_configure_aeq_vector(struct i40iw_device *iwdev)
-{
-	struct i40iw_msix_vector *msix_vec = iwdev->iw_msixtbl;
-	u32 ret = 0;
-
-	if (!iwdev->msix_shared) {
-		tasklet_setup(&iwdev->dpc_tasklet, i40iw_dpc);
-		ret = request_irq(msix_vec->irq, i40iw_irq_handler, 0, "i40iw", iwdev);
-	}
-	if (ret) {
-		i40iw_pr_err("aeq irq config fail\n");
-		return I40IW_ERR_CONFIG;
-	}
-
-	return 0;
-}
-
-/**
- * i40iw_create_aeq - create async event queue
- * @iwdev: iwarp device
- *
- * Return 0, if the aeq and the resources associated with it
- * are successfully created, otherwise return error
- */
-static enum i40iw_status_code i40iw_create_aeq(struct i40iw_device *iwdev)
-{
-	enum i40iw_status_code status;
-	struct i40iw_aeq_init_info info;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_aeq *aeq = &iwdev->aeq;
-	u64 scratch = 0;
-	u32 aeq_size;
-
-	aeq_size = 2 * iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt +
-		iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt;
-	memset(&info, 0, sizeof(info));
-	aeq->mem.size = sizeof(struct i40iw_sc_aeqe) * aeq_size;
-	status = i40iw_allocate_dma_mem(dev->hw, &aeq->mem, aeq->mem.size,
-					I40IW_AEQ_ALIGNMENT);
-	if (status)
-		goto exit;
-
-	info.aeqe_base = aeq->mem.va;
-	info.aeq_elem_pa = aeq->mem.pa;
-	info.elem_cnt = aeq_size;
-	info.dev = dev;
-	status = dev->aeq_ops->aeq_init(&aeq->sc_aeq, &info);
-	if (status)
-		goto exit;
-	status = dev->aeq_ops->aeq_create(&aeq->sc_aeq, scratch, 1);
-	if (!status)
-		status = dev->aeq_ops->aeq_create_done(&aeq->sc_aeq);
-exit:
-	if (status)
-		i40iw_free_dma_mem(dev->hw, &aeq->mem);
-	return status;
-}
-
-/**
- * i40iw_setup_aeq - set up the device aeq
- * @iwdev: iwarp device
- *
- * Create the aeq and configure its msix interrupt vector
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_setup_aeq(struct i40iw_device *iwdev)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	enum i40iw_status_code status;
-
-	status = i40iw_create_aeq(iwdev);
-	if (status)
-		return status;
-
-	status = i40iw_configure_aeq_vector(iwdev);
-	if (status) {
-		i40iw_destroy_aeq(iwdev);
-		return status;
-	}
-
-	if (!iwdev->msix_shared)
-		i40iw_enable_intr(dev, iwdev->iw_msixtbl[0].idx);
-	return 0;
-}
-
-/**
- * i40iw_initialize_ilq - create iwarp local queue for cm
- * @iwdev: iwarp device
- *
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_initialize_ilq(struct i40iw_device *iwdev)
-{
-	struct i40iw_puda_rsrc_info info;
-	enum i40iw_status_code status;
-
-	memset(&info, 0, sizeof(info));
-	info.type = I40IW_PUDA_RSRC_TYPE_ILQ;
-	info.cq_id = 1;
-	info.qp_id = 0;
-	info.count = 1;
-	info.pd_id = 1;
-	info.sq_size = 8192;
-	info.rq_size = 8192;
-	info.buf_size = 1024;
-	info.tx_buf_cnt = 16384;
-	info.receive = i40iw_receive_ilq;
-	info.xmit_complete = i40iw_free_sqbuf;
-	status = i40iw_puda_create_rsrc(&iwdev->vsi, &info);
-	if (status)
-		i40iw_pr_err("ilq create fail\n");
-	return status;
-}
-
-/**
- * i40iw_initialize_ieq - create iwarp exception queue
- * @iwdev: iwarp device
- *
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_initialize_ieq(struct i40iw_device *iwdev)
-{
-	struct i40iw_puda_rsrc_info info;
-	enum i40iw_status_code status;
-
-	memset(&info, 0, sizeof(info));
-	info.type = I40IW_PUDA_RSRC_TYPE_IEQ;
-	info.cq_id = 2;
-	info.qp_id = iwdev->vsi.exception_lan_queue;
-	info.count = 1;
-	info.pd_id = 2;
-	info.sq_size = 8192;
-	info.rq_size = 8192;
-	info.buf_size = iwdev->vsi.mtu + VLAN_ETH_HLEN;
-	info.tx_buf_cnt = 4096;
-	status = i40iw_puda_create_rsrc(&iwdev->vsi, &info);
-	if (status)
-		i40iw_pr_err("ieq create fail\n");
-	return status;
-}
-
-/**
- * i40iw_reinitialize_ieq - destroy and re-create ieq
- * @dev: iwarp device
- */
-void i40iw_reinitialize_ieq(struct i40iw_sc_dev *dev)
-{
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	i40iw_puda_dele_resources(&iwdev->vsi, I40IW_PUDA_RSRC_TYPE_IEQ, false);
-	if (i40iw_initialize_ieq(iwdev)) {
-		iwdev->reset = true;
-		i40iw_request_reset(iwdev);
-	}
-}
-
-/**
- * i40iw_hmc_setup - create hmc objects for the device
- * @iwdev: iwarp device
- *
- * Set up the device private memory space for the number and size of
- * the hmc objects and create the objects
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_hmc_setup(struct i40iw_device *iwdev)
-{
-	enum i40iw_status_code status;
-
-	iwdev->sd_type = I40IW_SD_TYPE_DIRECT;
-	status = i40iw_config_fpm_values(&iwdev->sc_dev, IW_CFG_FPM_QP_COUNT);
-	if (status)
-		goto exit;
-	status = i40iw_create_hmc_objs(iwdev, true);
-	if (status)
-		goto exit;
-	iwdev->init_state = HMC_OBJS_CREATED;
-exit:
-	return status;
-}
-
-/**
- * i40iw_del_init_mem - deallocate memory resources
- * @iwdev: iwarp device
- */
-static void i40iw_del_init_mem(struct i40iw_device *iwdev)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-
-	i40iw_free_dma_mem(&iwdev->hw, &iwdev->obj_mem);
-	kfree(dev->hmc_info->sd_table.sd_entry);
-	dev->hmc_info->sd_table.sd_entry = NULL;
-	kfree(iwdev->mem_resources);
-	iwdev->mem_resources = NULL;
-	kfree(iwdev->ceqlist);
-	iwdev->ceqlist = NULL;
-	kfree(iwdev->iw_msixtbl);
-	iwdev->iw_msixtbl = NULL;
-	kfree(iwdev->hmc_info_mem);
-	iwdev->hmc_info_mem = NULL;
-}
-
-/**
- * i40iw_del_macip_entry - remove a mac ip address entry from the hw table
- * @iwdev: iwarp device
- * @idx: the index of the mac ip address to delete
- */
-static void i40iw_del_macip_entry(struct i40iw_device *iwdev, u8 idx)
-{
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	enum i40iw_status_code status = 0;
-
-	cqp_request = i40iw_get_cqp_request(iwcqp, true);
-	if (!cqp_request) {
-		i40iw_pr_err("cqp_request memory failed\n");
-		return;
-	}
-	cqp_info = &cqp_request->info;
-	cqp_info->cqp_cmd = OP_DELETE_LOCAL_MAC_IPADDR_ENTRY;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.del_local_mac_ipaddr_entry.cqp = &iwcqp->sc_cqp;
-	cqp_info->in.u.del_local_mac_ipaddr_entry.scratch = (uintptr_t)cqp_request;
-	cqp_info->in.u.del_local_mac_ipaddr_entry.entry_idx = idx;
-	cqp_info->in.u.del_local_mac_ipaddr_entry.ignore_ref_count = 0;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Del MAC Ip entry fail");
-}
-
-/**
- * i40iw_add_mac_ipaddr_entry - add a mac ip address entry to the hw table
- * @iwdev: iwarp device
- * @mac_addr: pointer to mac address
- * @idx: the index of the mac ip address to add
- */
-static enum i40iw_status_code i40iw_add_mac_ipaddr_entry(struct i40iw_device *iwdev,
-							 u8 *mac_addr,
-							 u8 idx)
-{
-	struct i40iw_local_mac_ipaddr_entry_info *info;
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	enum i40iw_status_code status = 0;
-
-	cqp_request = i40iw_get_cqp_request(iwcqp, true);
-	if (!cqp_request) {
-		i40iw_pr_err("cqp_request memory failed\n");
-		return I40IW_ERR_NO_MEMORY;
-	}
-
-	cqp_info = &cqp_request->info;
-
-	cqp_info->post_sq = 1;
-	info = &cqp_info->in.u.add_local_mac_ipaddr_entry.info;
-	ether_addr_copy(info->mac_addr, mac_addr);
-	info->entry_idx = idx;
-	cqp_info->in.u.add_local_mac_ipaddr_entry.scratch = (uintptr_t)cqp_request;
-	cqp_info->cqp_cmd = OP_ADD_LOCAL_MAC_IPADDR_ENTRY;
-	cqp_info->in.u.add_local_mac_ipaddr_entry.cqp = &iwcqp->sc_cqp;
-	cqp_info->in.u.add_local_mac_ipaddr_entry.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Add MAC Ip entry fail");
-	return status;
-}
-
-/**
- * i40iw_alloc_local_mac_ipaddr_entry - allocate a mac ip address entry
- * @iwdev: iwarp device
- * @mac_ip_tbl_idx: the index of the new mac ip address
- *
- * Allocate a mac ip address entry and update the mac_ip_tbl_idx
- * to hold the index of the newly created mac ip address
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_alloc_local_mac_ipaddr_entry(struct i40iw_device *iwdev,
-								 u16 *mac_ip_tbl_idx)
-{
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	enum i40iw_status_code status = 0;
-
-	cqp_request = i40iw_get_cqp_request(iwcqp, true);
-	if (!cqp_request) {
-		i40iw_pr_err("cqp_request memory failed\n");
-		return I40IW_ERR_NO_MEMORY;
-	}
-
-	/* increment refcount, because we need the cqp request ret value */
-	atomic_inc(&cqp_request->refcount);
-
-	cqp_info = &cqp_request->info;
-	cqp_info->cqp_cmd = OP_ALLOC_LOCAL_MAC_IPADDR_ENTRY;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.alloc_local_mac_ipaddr_entry.cqp = &iwcqp->sc_cqp;
-	cqp_info->in.u.alloc_local_mac_ipaddr_entry.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (!status)
-		*mac_ip_tbl_idx = cqp_request->compl_info.op_ret_val;
-	else
-		i40iw_pr_err("CQP-OP Alloc MAC Ip entry fail");
-	/* decrement refcount and free the cqp request, if no longer used */
-	i40iw_put_cqp_request(iwcqp, cqp_request);
-	return status;
-}
-
-/**
- * i40iw_alloc_set_mac_ipaddr - set up a mac ip address table entry
- * @iwdev: iwarp device
- * @macaddr: pointer to mac address
- *
- * Allocate a mac ip address entry and add it to the hw table
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_alloc_set_mac_ipaddr(struct i40iw_device *iwdev,
-							 u8 *macaddr)
-{
-	enum i40iw_status_code status;
-
-	status = i40iw_alloc_local_mac_ipaddr_entry(iwdev, &iwdev->mac_ip_table_idx);
-	if (!status) {
-		status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr,
-						    (u8)iwdev->mac_ip_table_idx);
-		if (status)
-			i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx);
-	}
-	return status;
-}
-
-/**
- * i40iw_add_ipv6_addr - add ipv6 address to the hw arp table
- * @iwdev: iwarp device
- */
-static void i40iw_add_ipv6_addr(struct i40iw_device *iwdev)
-{
-	struct net_device *ip_dev;
-	struct inet6_dev *idev;
-	struct inet6_ifaddr *ifp, *tmp;
-	u32 local_ipaddr6[4];
-
-	rcu_read_lock();
-	for_each_netdev_rcu(&init_net, ip_dev) {
-		if ((((rdma_vlan_dev_vlan_id(ip_dev) < 0xFFFF) &&
-		      (rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev)) ||
-		     (ip_dev == iwdev->netdev)) && (ip_dev->flags & IFF_UP)) {
-			idev = __in6_dev_get(ip_dev);
-			if (!idev) {
-				i40iw_pr_err("ipv6 inet device not found\n");
-				break;
-			}
-			list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
-				i40iw_pr_info("IP=%pI6, vlan_id=%d, MAC=%pM\n", &ifp->addr,
-					      rdma_vlan_dev_vlan_id(ip_dev), ip_dev->dev_addr);
-				i40iw_copy_ip_ntohl(local_ipaddr6,
-						    ifp->addr.in6_u.u6_addr32);
-				i40iw_manage_arp_cache(iwdev,
-						       ip_dev->dev_addr,
-						       local_ipaddr6,
-						       false,
-						       I40IW_ARP_ADD);
-			}
-		}
-	}
-	rcu_read_unlock();
-}
-
-/**
- * i40iw_add_ipv4_addr - add ipv4 address to the hw arp table
- * @iwdev: iwarp device
- */
-static void i40iw_add_ipv4_addr(struct i40iw_device *iwdev)
-{
-	struct net_device *dev;
-	struct in_device *idev;
-	u32 ip_addr;
-
-	rcu_read_lock();
-	for_each_netdev_rcu(&init_net, dev) {
-		if ((((rdma_vlan_dev_vlan_id(dev) < 0xFFFF) &&
-		      (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
-		    (dev == iwdev->netdev)) && (READ_ONCE(dev->flags) & IFF_UP)) {
-			const struct in_ifaddr *ifa;
-
-			idev = __in_dev_get_rcu(dev);
-			if (!idev)
-				continue;
-			in_dev_for_each_ifa_rcu(ifa, idev) {
-				i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM,
-					    "IP=%pI4, vlan_id=%d, MAC=%pM\n", &ifa->ifa_address,
-					     rdma_vlan_dev_vlan_id(dev), dev->dev_addr);
-
-				ip_addr = ntohl(ifa->ifa_address);
-				i40iw_manage_arp_cache(iwdev,
-						       dev->dev_addr,
-						       &ip_addr,
-						       true,
-						       I40IW_ARP_ADD);
-			}
-		}
-	}
-	rcu_read_unlock();
-}
-
-/**
- * i40iw_add_mac_ip - add mac and ip addresses
- * @iwdev: iwarp device
- *
- * Create and add a mac ip address entry to the hw table and
- * ipv4/ipv6 addresses to the arp cache
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_add_mac_ip(struct i40iw_device *iwdev)
-{
-	struct net_device *netdev = iwdev->netdev;
-	enum i40iw_status_code status;
-
-	status = i40iw_alloc_set_mac_ipaddr(iwdev, (u8 *)netdev->dev_addr);
-	if (status)
-		return status;
-	i40iw_add_ipv4_addr(iwdev);
-	i40iw_add_ipv6_addr(iwdev);
-	return 0;
-}
-
-/**
- * i40iw_wait_pe_ready - Check if firmware is ready
- * @hw: provides access to registers
- */
-static void i40iw_wait_pe_ready(struct i40iw_hw *hw)
-{
-	u32 statusfw;
-	u32 statuscpu0;
-	u32 statuscpu1;
-	u32 statuscpu2;
-	u32 retrycount = 0;
-
-	do {
-		statusfw = i40iw_rd32(hw, I40E_GLPE_FWLDSTATUS);
-		i40iw_pr_info("[%04d] fm load status[x%04X]\n", __LINE__, statusfw);
-		statuscpu0 = i40iw_rd32(hw, I40E_GLPE_CPUSTATUS0);
-		i40iw_pr_info("[%04d] CSR_CQP status[x%04X]\n", __LINE__, statuscpu0);
-		statuscpu1 = i40iw_rd32(hw, I40E_GLPE_CPUSTATUS1);
-		i40iw_pr_info("[%04d] I40E_GLPE_CPUSTATUS1 status[x%04X]\n",
-			      __LINE__, statuscpu1);
-		statuscpu2 = i40iw_rd32(hw, I40E_GLPE_CPUSTATUS2);
-		i40iw_pr_info("[%04d] I40E_GLPE_CPUSTATUS2 status[x%04X]\n",
-			      __LINE__, statuscpu2);
-		if ((statuscpu0 == 0x80) && (statuscpu1 == 0x80) && (statuscpu2 == 0x80))
-			break;	/* SUCCESS */
-		msleep(1000);
-		retrycount++;
-	} while (retrycount < 14);
-	i40iw_wr32(hw, 0xb4040, 0x4C104C5);
-}
-
-/**
- * i40iw_initialize_dev - initialize device
- * @iwdev: iwarp device
- * @ldev: lan device information
- *
- * Allocate memory for the hmc objects and initialize iwdev
- * Return 0 if successful, otherwise clean up the resources
- * and return error
- */
-static enum i40iw_status_code i40iw_initialize_dev(struct i40iw_device *iwdev,
-						   struct i40e_info *ldev)
-{
-	enum i40iw_status_code status;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_device_init_info info;
-	struct i40iw_vsi_init_info vsi_info;
-	struct i40iw_dma_mem mem;
-	struct i40iw_l2params l2params;
-	u32 size;
-	struct i40iw_vsi_stats_info stats_info;
-	u16 last_qset = I40IW_NO_QSET;
-	u16 qset;
-	u32 i;
-
-	memset(&l2params, 0, sizeof(l2params));
-	memset(&info, 0, sizeof(info));
-	size = sizeof(struct i40iw_hmc_pble_rsrc) + sizeof(struct i40iw_hmc_info) +
-				(sizeof(struct i40iw_hmc_obj_info) * I40IW_HMC_IW_MAX);
-	iwdev->hmc_info_mem = kzalloc(size, GFP_KERNEL);
-	if (!iwdev->hmc_info_mem)
-		return I40IW_ERR_NO_MEMORY;
-
-	iwdev->pble_rsrc = (struct i40iw_hmc_pble_rsrc *)iwdev->hmc_info_mem;
-	dev->hmc_info = &iwdev->hw.hmc;
-	dev->hmc_info->hmc_obj = (struct i40iw_hmc_obj_info *)(iwdev->pble_rsrc + 1);
-	status = i40iw_obj_aligned_mem(iwdev, &mem, I40IW_QUERY_FPM_BUF_SIZE,
-				       I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK);
-	if (status)
-		goto error;
-	info.fpm_query_buf_pa = mem.pa;
-	info.fpm_query_buf = mem.va;
-	status = i40iw_obj_aligned_mem(iwdev, &mem, I40IW_COMMIT_FPM_BUF_SIZE,
-				       I40IW_FPM_COMMIT_BUF_ALIGNMENT_MASK);
-	if (status)
-		goto error;
-	info.fpm_commit_buf_pa = mem.pa;
-	info.fpm_commit_buf = mem.va;
-	info.hmc_fn_id = ldev->fid;
-	info.is_pf = (ldev->ftype) ? false : true;
-	info.bar0 = ldev->hw_addr;
-	info.hw = &iwdev->hw;
-	info.debug_mask = debug;
-	l2params.mtu =
-		(ldev->params.mtu) ? ldev->params.mtu : I40IW_DEFAULT_MTU;
-	for (i = 0; i < I40E_CLIENT_MAX_USER_PRIORITY; i++) {
-		qset = ldev->params.qos.prio_qos[i].qs_handle;
-		l2params.qs_handle_list[i] = qset;
-		if (last_qset == I40IW_NO_QSET)
-			last_qset = qset;
-		else if ((qset != last_qset) && (qset != I40IW_NO_QSET))
-			iwdev->dcb = true;
-	}
-	i40iw_pr_info("DCB is set/clear = %d\n", iwdev->dcb);
-	info.vchnl_send = i40iw_virtchnl_send;
-	status = i40iw_device_init(&iwdev->sc_dev, &info);
-
-	if (status)
-		goto error;
-	memset(&vsi_info, 0, sizeof(vsi_info));
-	vsi_info.dev = &iwdev->sc_dev;
-	vsi_info.back_vsi = (void *)iwdev;
-	vsi_info.params = &l2params;
-	vsi_info.exception_lan_queue = 1;
-	i40iw_sc_vsi_init(&iwdev->vsi, &vsi_info);
-
-	if (dev->is_pf) {
-		memset(&stats_info, 0, sizeof(stats_info));
-		stats_info.fcn_id = ldev->fid;
-		stats_info.pestat = kzalloc(sizeof(*stats_info.pestat), GFP_KERNEL);
-		if (!stats_info.pestat) {
-			status = I40IW_ERR_NO_MEMORY;
-			goto error;
-		}
-		stats_info.stats_initialize = true;
-		if (stats_info.pestat)
-			i40iw_vsi_stats_init(&iwdev->vsi, &stats_info);
-	}
-	return status;
-error:
-	kfree(iwdev->hmc_info_mem);
-	iwdev->hmc_info_mem = NULL;
-	return status;
-}
-
-/**
- * i40iw_register_notifiers - register tcp ip notifiers
- */
-static void i40iw_register_notifiers(void)
-{
-	register_inetaddr_notifier(&i40iw_inetaddr_notifier);
-	register_inet6addr_notifier(&i40iw_inetaddr6_notifier);
-	register_netevent_notifier(&i40iw_net_notifier);
-	register_netdevice_notifier(&i40iw_netdevice_notifier);
-}
-
-/**
- * i40iw_unregister_notifiers - unregister tcp ip notifiers
- */
-
-static void i40iw_unregister_notifiers(void)
-{
-	unregister_netevent_notifier(&i40iw_net_notifier);
-	unregister_inetaddr_notifier(&i40iw_inetaddr_notifier);
-	unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier);
-	unregister_netdevice_notifier(&i40iw_netdevice_notifier);
-}
-
-/**
- * i40iw_save_msix_info - copy msix vector information to iwarp device
- * @iwdev: iwarp device
- * @ldev: lan device information
- *
- * Allocate iwdev msix table and copy the ldev msix info to the table
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_save_msix_info(struct i40iw_device *iwdev,
-						   struct i40e_info *ldev)
-{
-	struct i40e_qvlist_info *iw_qvlist;
-	struct i40e_qv_info *iw_qvinfo;
-	u32 ceq_idx;
-	u32 i;
-	size_t size;
-
-	if (!ldev->msix_count) {
-		i40iw_pr_err("No MSI-X vectors\n");
-		return I40IW_ERR_CONFIG;
-	}
-
-	iwdev->msix_count = ldev->msix_count;
-
-	size = sizeof(struct i40iw_msix_vector) * iwdev->msix_count;
-	size += struct_size(iw_qvlist, qv_info, iwdev->msix_count);
-	iwdev->iw_msixtbl = kzalloc(size, GFP_KERNEL);
-
-	if (!iwdev->iw_msixtbl)
-		return I40IW_ERR_NO_MEMORY;
-	iwdev->iw_qvlist = (struct i40e_qvlist_info *)(&iwdev->iw_msixtbl[iwdev->msix_count]);
-	iw_qvlist = iwdev->iw_qvlist;
-	iw_qvinfo = iw_qvlist->qv_info;
-	iw_qvlist->num_vectors = iwdev->msix_count;
-	if (iwdev->msix_count <= num_online_cpus())
-		iwdev->msix_shared = true;
-	for (i = 0, ceq_idx = 0; i < iwdev->msix_count; i++, iw_qvinfo++) {
-		iwdev->iw_msixtbl[i].idx = ldev->msix_entries[i].entry;
-		iwdev->iw_msixtbl[i].irq = ldev->msix_entries[i].vector;
-		iwdev->iw_msixtbl[i].cpu_affinity = ceq_idx;
-		if (i == 0) {
-			iw_qvinfo->aeq_idx = 0;
-			if (iwdev->msix_shared)
-				iw_qvinfo->ceq_idx = ceq_idx++;
-			else
-				iw_qvinfo->ceq_idx = I40E_QUEUE_INVALID_IDX;
-		} else {
-			iw_qvinfo->aeq_idx = I40E_QUEUE_INVALID_IDX;
-			iw_qvinfo->ceq_idx = ceq_idx++;
-		}
-		iw_qvinfo->itr_idx = 3;
-		iw_qvinfo->v_idx = iwdev->iw_msixtbl[i].idx;
-	}
-	return 0;
-}
-
-/**
- * i40iw_deinit_device - clean up the device resources
- * @iwdev: iwarp device
- *
- * Destroy the ib device interface, remove the mac ip entry and ipv4/ipv6 addresses,
- * destroy the device queues and free the pble and the hmc objects
- */
-static void i40iw_deinit_device(struct i40iw_device *iwdev)
-{
-	struct i40e_info *ldev = iwdev->ldev;
-
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-
-	i40iw_pr_info("state = %d\n", iwdev->init_state);
-	if (iwdev->param_wq)
-		destroy_workqueue(iwdev->param_wq);
-
-	switch (iwdev->init_state) {
-	case RDMA_DEV_REGISTERED:
-		iwdev->iw_status = 0;
-		i40iw_port_ibevent(iwdev);
-		i40iw_destroy_rdma_device(iwdev->iwibdev);
-		fallthrough;
-	case IP_ADDR_REGISTERED:
-		if (!iwdev->reset)
-			i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx);
-		fallthrough;
-	case PBLE_CHUNK_MEM:
-		i40iw_destroy_pble_pool(dev, iwdev->pble_rsrc);
-		fallthrough;
-	case CEQ_CREATED:
-		i40iw_dele_ceqs(iwdev);
-		fallthrough;
-	case AEQ_CREATED:
-		i40iw_destroy_aeq(iwdev);
-		fallthrough;
-	case IEQ_CREATED:
-		i40iw_puda_dele_resources(&iwdev->vsi, I40IW_PUDA_RSRC_TYPE_IEQ, iwdev->reset);
-		fallthrough;
-	case ILQ_CREATED:
-		i40iw_puda_dele_resources(&iwdev->vsi, I40IW_PUDA_RSRC_TYPE_ILQ, iwdev->reset);
-		fallthrough;
-	case CCQ_CREATED:
-		i40iw_destroy_ccq(iwdev);
-		fallthrough;
-	case HMC_OBJS_CREATED:
-		i40iw_del_hmc_objects(dev, dev->hmc_info, true, iwdev->reset);
-		fallthrough;
-	case CQP_CREATED:
-		i40iw_destroy_cqp(iwdev, true);
-		fallthrough;
-	case INITIAL_STATE:
-		i40iw_cleanup_cm_core(&iwdev->cm_core);
-		if (iwdev->vsi.pestat) {
-			i40iw_vsi_stats_free(&iwdev->vsi);
-			kfree(iwdev->vsi.pestat);
-		}
-		i40iw_del_init_mem(iwdev);
-		break;
-	case INVALID_STATE:
-	default:
-		i40iw_pr_err("bad init_state = %d\n", iwdev->init_state);
-		break;
-	}
-
-	i40iw_del_handler(i40iw_find_i40e_handler(ldev));
-	kfree(iwdev->hdl);
-}
-
-/**
- * i40iw_setup_init_state - set up the initial device struct
- * @hdl: handler for iwarp device - one per instance
- * @ldev: lan device information
- * @client: iwarp client information, provided during registration
- *
- * Initialize the iwarp device and its hdl information
- * using the ldev and client information
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl,
-						     struct i40e_info *ldev,
-						     struct i40e_client *client)
-{
-	struct i40iw_device *iwdev = &hdl->device;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	enum i40iw_status_code status;
-
-	memcpy(&hdl->ldev, ldev, sizeof(*ldev));
-
-	iwdev->mpa_version = mpa_version;
-	iwdev->resource_profile = (resource_profile < I40IW_HMC_PROFILE_EQUAL) ?
-	    (u8)resource_profile + I40IW_HMC_PROFILE_DEFAULT :
-	    I40IW_HMC_PROFILE_DEFAULT;
-	iwdev->max_rdma_vfs =
-		(iwdev->resource_profile != I40IW_HMC_PROFILE_DEFAULT) ?  max_rdma_vfs : 0;
-	iwdev->max_enabled_vfs = iwdev->max_rdma_vfs;
-	iwdev->netdev = ldev->netdev;
-	hdl->client = client;
-	if (!ldev->ftype)
-		iwdev->db_start = pci_resource_start(ldev->pcidev, 0) + I40IW_DB_ADDR_OFFSET;
-	else
-		iwdev->db_start = pci_resource_start(ldev->pcidev, 0) + I40IW_VF_DB_ADDR_OFFSET;
-
-	status = i40iw_save_msix_info(iwdev, ldev);
-	if (status)
-		return status;
-	iwdev->hw.pcidev = ldev->pcidev;
-	iwdev->hw.hw_addr = ldev->hw_addr;
-	status = i40iw_allocate_dma_mem(&iwdev->hw,
-					&iwdev->obj_mem, 8192, 4096);
-	if (status)
-		goto exit;
-	iwdev->obj_next = iwdev->obj_mem;
-
-	init_waitqueue_head(&iwdev->vchnl_waitq);
-	init_waitqueue_head(&dev->vf_reqs);
-	init_waitqueue_head(&iwdev->close_wq);
-
-	status = i40iw_initialize_dev(iwdev, ldev);
-exit:
-	if (status) {
-		kfree(iwdev->iw_msixtbl);
-		i40iw_free_dma_mem(dev->hw, &iwdev->obj_mem);
-		iwdev->iw_msixtbl = NULL;
-	}
-	return status;
-}
-
-/**
- * i40iw_get_used_rsrc - determine resources used internally
- * @iwdev: iwarp device
- *
- * Called after internal allocations
- */
-static void i40iw_get_used_rsrc(struct i40iw_device *iwdev)
-{
-	iwdev->used_pds = find_next_zero_bit(iwdev->allocated_pds, iwdev->max_pd, 0);
-	iwdev->used_qps = find_next_zero_bit(iwdev->allocated_qps, iwdev->max_qp, 0);
-	iwdev->used_cqs = find_next_zero_bit(iwdev->allocated_cqs, iwdev->max_cq, 0);
-	iwdev->used_mrs = find_next_zero_bit(iwdev->allocated_mrs, iwdev->max_mr, 0);
-}
-
-/**
- * i40iw_open - client interface operation open for iwarp/uda device
- * @ldev: lan device information
- * @client: iwarp client information, provided during registration
- *
- * Called by the lan driver during the processing of client register
- * Create device resources, set up queues, pble and hmc objects and
- * register the device with the ib verbs interface
- * Return 0 if successful, otherwise return error
- */
-static int i40iw_open(struct i40e_info *ldev, struct i40e_client *client)
-{
-	struct i40iw_device *iwdev;
-	struct i40iw_sc_dev *dev;
-	enum i40iw_status_code status;
-	struct i40iw_handler *hdl;
-
-	hdl = i40iw_find_netdev(ldev->netdev);
-	if (hdl)
-		return 0;
-
-	hdl = kzalloc(sizeof(*hdl), GFP_KERNEL);
-	if (!hdl)
-		return -ENOMEM;
-	iwdev = &hdl->device;
-	iwdev->hdl = hdl;
-	dev = &iwdev->sc_dev;
-	if (i40iw_setup_cm_core(iwdev)) {
-		kfree(iwdev->hdl);
-		return -ENOMEM;
-	}
-
-	dev->back_dev = (void *)iwdev;
-	iwdev->ldev = &hdl->ldev;
-	iwdev->client = client;
-	mutex_init(&iwdev->pbl_mutex);
-	i40iw_add_handler(hdl);
-
-	do {
-		status = i40iw_setup_init_state(hdl, ldev, client);
-		if (status)
-			break;
-		iwdev->init_state = INITIAL_STATE;
-		if (dev->is_pf)
-			i40iw_wait_pe_ready(dev->hw);
-		status = i40iw_create_cqp(iwdev);
-		if (status)
-			break;
-		iwdev->init_state = CQP_CREATED;
-		status = i40iw_hmc_setup(iwdev);
-		if (status)
-			break;
-		status = i40iw_create_ccq(iwdev);
-		if (status)
-			break;
-		iwdev->init_state = CCQ_CREATED;
-		status = i40iw_initialize_ilq(iwdev);
-		if (status)
-			break;
-		iwdev->init_state = ILQ_CREATED;
-		status = i40iw_initialize_ieq(iwdev);
-		if (status)
-			break;
-		iwdev->init_state = IEQ_CREATED;
-		status = i40iw_setup_aeq(iwdev);
-		if (status)
-			break;
-		iwdev->init_state = AEQ_CREATED;
-		status = i40iw_setup_ceqs(iwdev, ldev);
-		if (status)
-			break;
-
-		status = i40iw_get_rdma_features(dev);
-		if (status)
-			dev->feature_info[I40IW_FEATURE_FW_INFO] =
-				I40IW_FW_VER_DEFAULT;
-
-		iwdev->init_state = CEQ_CREATED;
-		status = i40iw_initialize_hw_resources(iwdev);
-		if (status)
-			break;
-		i40iw_get_used_rsrc(iwdev);
-		dev->ccq_ops->ccq_arm(dev->ccq);
-		status = i40iw_hmc_init_pble(&iwdev->sc_dev, iwdev->pble_rsrc);
-		if (status)
-			break;
-		iwdev->init_state = PBLE_CHUNK_MEM;
-		iwdev->virtchnl_wq = alloc_ordered_workqueue("iwvch", WQ_MEM_RECLAIM);
-		status = i40iw_add_mac_ip(iwdev);
-		if (status)
-			break;
-		iwdev->init_state = IP_ADDR_REGISTERED;
-		if (i40iw_register_rdma_device(iwdev)) {
-			i40iw_pr_err("register rdma device fail\n");
-			break;
-		};
-
-		iwdev->init_state = RDMA_DEV_REGISTERED;
-		iwdev->iw_status = 1;
-		i40iw_port_ibevent(iwdev);
-		iwdev->param_wq = alloc_ordered_workqueue("l2params", WQ_MEM_RECLAIM);
-		if(iwdev->param_wq == NULL)
-			break;
-		i40iw_pr_info("i40iw_open completed\n");
-		return 0;
-	} while (0);
-
-	i40iw_pr_err("status = %d last completion = %d\n", status, iwdev->init_state);
-	i40iw_deinit_device(iwdev);
-	return -ERESTART;
-}
-
-/**
- * i40iw_l2params_worker - worker for l2 params change
- * @work: work pointer for l2 params
- */
-static void i40iw_l2params_worker(struct work_struct *work)
-{
-	struct l2params_work *dwork =
-	    container_of(work, struct l2params_work, work);
-	struct i40iw_device *iwdev = dwork->iwdev;
-
-	i40iw_change_l2params(&iwdev->vsi, &dwork->l2params);
-	atomic_dec(&iwdev->params_busy);
-	kfree(work);
-}
-
-/**
- * i40iw_l2param_change - handle qs handles for qos and mss change
- * @ldev: lan device information
- * @client: client for paramater change
- * @params: new parameters from L2
- */
-static void i40iw_l2param_change(struct i40e_info *ldev, struct i40e_client *client,
-				 struct i40e_params *params)
-{
-	struct i40iw_handler *hdl;
-	struct i40iw_l2params *l2params;
-	struct l2params_work *work;
-	struct i40iw_device *iwdev;
-	int i;
-
-	hdl = i40iw_find_i40e_handler(ldev);
-	if (!hdl)
-		return;
-
-	iwdev = &hdl->device;
-
-	if (atomic_read(&iwdev->params_busy))
-		return;
-
-
-	work = kzalloc(sizeof(*work), GFP_KERNEL);
-	if (!work)
-		return;
-
-	atomic_inc(&iwdev->params_busy);
-
-	work->iwdev = iwdev;
-	l2params = &work->l2params;
-	for (i = 0; i < I40E_CLIENT_MAX_USER_PRIORITY; i++)
-		l2params->qs_handle_list[i] = params->qos.prio_qos[i].qs_handle;
-
-	l2params->mtu = (params->mtu) ? params->mtu : iwdev->vsi.mtu;
-
-	INIT_WORK(&work->work, i40iw_l2params_worker);
-	queue_work(iwdev->param_wq, &work->work);
-}
-
-/**
- * i40iw_close - client interface operation close for iwarp/uda device
- * @ldev: lan device information
- * @reset: true if called before reset
- * @client: client to close
- *
- * Called by the lan driver during the processing of client unregister
- * Destroy and clean up the driver resources
- */
-static void i40iw_close(struct i40e_info *ldev, struct i40e_client *client, bool reset)
-{
-	struct i40iw_device *iwdev;
-	struct i40iw_handler *hdl;
-
-	hdl = i40iw_find_i40e_handler(ldev);
-	if (!hdl)
-		return;
-
-	iwdev = &hdl->device;
-	iwdev->closing = true;
-
-	if (reset)
-		iwdev->reset = true;
-
-	i40iw_cm_teardown_connections(iwdev, NULL, NULL, true);
-	destroy_workqueue(iwdev->virtchnl_wq);
-	i40iw_deinit_device(iwdev);
-}
-
-/**
- * i40iw_vf_reset - process VF reset
- * @ldev: lan device information
- * @client: client interface instance
- * @vf_id: virtual function id
- *
- * Called when a VF is reset by the PF
- * Destroy and clean up the VF resources
- */
-static void i40iw_vf_reset(struct i40e_info *ldev, struct i40e_client *client, u32 vf_id)
-{
-	struct i40iw_handler *hdl;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_hmc_fcn_info hmc_fcn_info;
-	struct i40iw_virt_mem vf_dev_mem;
-	struct i40iw_vfdev *tmp_vfdev;
-	unsigned int i;
-	unsigned long flags;
-	struct i40iw_device *iwdev;
-
-	hdl = i40iw_find_i40e_handler(ldev);
-	if (!hdl)
-		return;
-
-	dev = &hdl->device.sc_dev;
-	iwdev = (struct i40iw_device *)dev->back_dev;
-
-	for (i = 0; i < I40IW_MAX_PE_ENABLED_VF_COUNT; i++) {
-		if (!dev->vf_dev[i] || (dev->vf_dev[i]->vf_id != vf_id))
-			continue;
-		/* free all resources allocated on behalf of vf */
-		tmp_vfdev = dev->vf_dev[i];
-		spin_lock_irqsave(&iwdev->vsi.pestat->lock, flags);
-		dev->vf_dev[i] = NULL;
-		spin_unlock_irqrestore(&iwdev->vsi.pestat->lock, flags);
-		i40iw_del_hmc_objects(dev, &tmp_vfdev->hmc_info, false, false);
-		/* remove vf hmc function */
-		memset(&hmc_fcn_info, 0, sizeof(hmc_fcn_info));
-		hmc_fcn_info.vf_id = vf_id;
-		hmc_fcn_info.iw_vf_idx = tmp_vfdev->iw_vf_idx;
-		hmc_fcn_info.free_fcn = true;
-		i40iw_cqp_manage_hmc_fcn_cmd(dev, &hmc_fcn_info);
-		/* free vf_dev */
-		vf_dev_mem.va = tmp_vfdev;
-		vf_dev_mem.size = sizeof(struct i40iw_vfdev) +
-					sizeof(struct i40iw_hmc_obj_info) * I40IW_HMC_IW_MAX;
-		i40iw_free_virt_mem(dev->hw, &vf_dev_mem);
-		break;
-	}
-}
-
-/**
- * i40iw_vf_enable - enable a number of VFs
- * @ldev: lan device information
- * @client: client interface instance
- * @num_vfs: number of VFs for the PF
- *
- * Called when the number of VFs changes
- */
-static void i40iw_vf_enable(struct i40e_info *ldev,
-			    struct i40e_client *client,
-			    u32 num_vfs)
-{
-	struct i40iw_handler *hdl;
-
-	hdl = i40iw_find_i40e_handler(ldev);
-	if (!hdl)
-		return;
-
-	if (num_vfs > I40IW_MAX_PE_ENABLED_VF_COUNT)
-		hdl->device.max_enabled_vfs = I40IW_MAX_PE_ENABLED_VF_COUNT;
-	else
-		hdl->device.max_enabled_vfs = num_vfs;
-}
-
-/**
- * i40iw_vf_capable - check if VF capable
- * @ldev: lan device information
- * @client: client interface instance
- * @vf_id: virtual function id
- *
- * Return 1 if a VF slot is available or if VF is already RDMA enabled
- * Return 0 otherwise
- */
-static int i40iw_vf_capable(struct i40e_info *ldev,
-			    struct i40e_client *client,
-			    u32 vf_id)
-{
-	struct i40iw_handler *hdl;
-	struct i40iw_sc_dev *dev;
-	unsigned int i;
-
-	hdl = i40iw_find_i40e_handler(ldev);
-	if (!hdl)
-		return 0;
-
-	dev = &hdl->device.sc_dev;
-
-	for (i = 0; i < hdl->device.max_enabled_vfs; i++) {
-		if (!dev->vf_dev[i] || (dev->vf_dev[i]->vf_id == vf_id))
-			return 1;
-	}
-
-	return 0;
-}
-
-/**
- * i40iw_virtchnl_receive - receive a message through the virtual channel
- * @ldev: lan device information
- * @client: client interface instance
- * @vf_id: virtual function id associated with the message
- * @msg: message buffer pointer
- * @len: length of the message
- *
- * Invoke virtual channel receive operation for the given msg
- * Return 0 if successful, otherwise return error
- */
-static int i40iw_virtchnl_receive(struct i40e_info *ldev,
-				  struct i40e_client *client,
-				  u32 vf_id,
-				  u8 *msg,
-				  u16 len)
-{
-	struct i40iw_handler *hdl;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_device *iwdev;
-	int ret_code = I40IW_NOT_SUPPORTED;
-
-	if (!len || !msg)
-		return I40IW_ERR_PARAM;
-
-	hdl = i40iw_find_i40e_handler(ldev);
-	if (!hdl)
-		return I40IW_ERR_PARAM;
-
-	dev = &hdl->device.sc_dev;
-	iwdev = dev->back_dev;
-
-	if (dev->vchnl_if.vchnl_recv) {
-		ret_code = dev->vchnl_if.vchnl_recv(dev, vf_id, msg, len);
-		if (!dev->is_pf) {
-			atomic_dec(&iwdev->vchnl_msgs);
-			wake_up(&iwdev->vchnl_waitq);
-		}
-	}
-	return ret_code;
-}
-
-/**
- * i40iw_vf_clear_to_send - wait to send virtual channel message
- * @dev: iwarp device *
- * Wait for until virtual channel is clear
- * before sending the next message
- *
- * Returns false if error
- * Returns true if clear to send
- */
-bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev)
-{
-	struct i40iw_device *iwdev;
-	wait_queue_entry_t wait;
-
-	iwdev = dev->back_dev;
-
-	if (!wq_has_sleeper(&dev->vf_reqs) &&
-	    (atomic_read(&iwdev->vchnl_msgs) == 0))
-		return true; /* virtual channel is clear */
-
-	init_wait(&wait);
-	add_wait_queue_exclusive(&dev->vf_reqs, &wait);
-
-	if (!wait_event_timeout(dev->vf_reqs,
-				(atomic_read(&iwdev->vchnl_msgs) == 0),
-				I40IW_VCHNL_EVENT_TIMEOUT))
-		dev->vchnl_up = false;
-
-	remove_wait_queue(&dev->vf_reqs, &wait);
-
-	return dev->vchnl_up;
-}
-
-/**
- * i40iw_virtchnl_send - send a message through the virtual channel
- * @dev: iwarp device
- * @vf_id: virtual function id associated with the message
- * @msg: virtual channel message buffer pointer
- * @len: length of the message
- *
- * Invoke virtual channel send operation for the given msg
- * Return 0 if successful, otherwise return error
- */
-static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev,
-						  u32 vf_id,
-						  u8 *msg,
-						  u16 len)
-{
-	struct i40iw_device *iwdev;
-	struct i40e_info *ldev;
-
-	if (!dev || !dev->back_dev)
-		return I40IW_ERR_BAD_PTR;
-
-	iwdev = dev->back_dev;
-	ldev = iwdev->ldev;
-
-	if (ldev && ldev->ops && ldev->ops->virtchnl_send)
-		return ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len);
-	return I40IW_ERR_BAD_PTR;
-}
-
-/* client interface functions */
-static const struct i40e_client_ops i40e_ops = {
-	.open = i40iw_open,
-	.close = i40iw_close,
-	.l2_param_change = i40iw_l2param_change,
-	.virtchnl_receive = i40iw_virtchnl_receive,
-	.vf_reset = i40iw_vf_reset,
-	.vf_enable = i40iw_vf_enable,
-	.vf_capable = i40iw_vf_capable
-};
-
-/**
- * i40iw_init_module - driver initialization function
- *
- * First function to call when the driver is loaded
- * Register the driver as i40e client and port mapper client
- */
-static int __init i40iw_init_module(void)
-{
-	int ret;
-
-	memset(&i40iw_client, 0, sizeof(i40iw_client));
-	i40iw_client.version.major = CLIENT_IW_INTERFACE_VERSION_MAJOR;
-	i40iw_client.version.minor = CLIENT_IW_INTERFACE_VERSION_MINOR;
-	i40iw_client.version.build = CLIENT_IW_INTERFACE_VERSION_BUILD;
-	i40iw_client.ops = &i40e_ops;
-	memcpy(i40iw_client.name, i40iw_client_name, I40E_CLIENT_STR_LENGTH);
-	i40iw_client.type = I40E_CLIENT_IWARP;
-	ret = i40e_register_client(&i40iw_client);
-	i40iw_register_notifiers();
-
-	return ret;
-}
-
-/**
- * i40iw_exit_module - driver exit clean up function
- *
- * The function is called just before the driver is unloaded
- * Unregister the driver as i40e client and port mapper client
- */
-static void __exit i40iw_exit_module(void)
-{
-	i40iw_unregister_notifiers();
-	i40e_unregister_client(&i40iw_client);
-}
-
-module_init(i40iw_init_module);
-module_exit(i40iw_exit_module);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_osdep.h b/drivers/infiniband/hw/i40iw/i40iw_osdep.h
deleted file mode 100644
index d938ccb195b1..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_osdep.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_OSDEP_H
-#define I40IW_OSDEP_H
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/bitops.h>
-#include <net/tcp.h>
-#include <crypto/hash.h>
-/* get readq/writeq support for 32 bit kernels, use the low-first version */
-#include <linux/io-64-nonatomic-lo-hi.h>
-
-#define STATS_TIMER_DELAY 1000
-
-static inline void set_64bit_val(u64 *wqe_words, u32 byte_index, u64 value)
-{
-	wqe_words[byte_index >> 3] = value;
-}
-
-/**
- * get_64bit_val - read 64 bit value from wqe
- * @wqe_words: wqe addr
- * @byte_index: index to read from
- * @value: read value
- **/
-static inline void get_64bit_val(u64 *wqe_words, u32 byte_index, u64 *value)
-{
-	*value = wqe_words[byte_index >> 3];
-}
-
-struct i40iw_dma_mem {
-	void *va;
-	dma_addr_t pa;
-	u32 size;
-} __packed;
-
-struct i40iw_virt_mem {
-	void *va;
-	u32 size;
-} __packed;
-
-#define i40iw_debug(h, m, s, ...)                               \
-do {                                                            \
-	if (((m) & (h)->debug_mask))                            \
-		pr_info("i40iw " s, ##__VA_ARGS__);             \
-} while (0)
-
-#define i40iw_flush(a)          readl((a)->hw_addr + I40E_GLGEN_STAT)
-
-#define I40E_GLHMC_VFSDCMD(_i)  (0x000C8000 + ((_i) * 4)) \
-				/* _i=0...31 */
-#define I40E_GLHMC_VFSDCMD_MAX_INDEX    31
-#define I40E_GLHMC_VFSDCMD_PMSDIDX_SHIFT  0
-#define I40E_GLHMC_VFSDCMD_PMSDIDX_MASK  (0xFFF \
-					  << I40E_GLHMC_VFSDCMD_PMSDIDX_SHIFT)
-#define I40E_GLHMC_VFSDCMD_PF_SHIFT       16
-#define I40E_GLHMC_VFSDCMD_PF_MASK        (0xF << I40E_GLHMC_VFSDCMD_PF_SHIFT)
-#define I40E_GLHMC_VFSDCMD_VF_SHIFT       20
-#define I40E_GLHMC_VFSDCMD_VF_MASK        (0x1FF << I40E_GLHMC_VFSDCMD_VF_SHIFT)
-#define I40E_GLHMC_VFSDCMD_PMF_TYPE_SHIFT 29
-#define I40E_GLHMC_VFSDCMD_PMF_TYPE_MASK  (0x3 \
-					   << I40E_GLHMC_VFSDCMD_PMF_TYPE_SHIFT)
-#define I40E_GLHMC_VFSDCMD_PMSDWR_SHIFT   31
-#define I40E_GLHMC_VFSDCMD_PMSDWR_MASK  (0x1 << I40E_GLHMC_VFSDCMD_PMSDWR_SHIFT)
-
-#define I40E_GLHMC_VFSDDATAHIGH(_i)     (0x000C8200 + ((_i) * 4)) \
-				/* _i=0...31 */
-#define I40E_GLHMC_VFSDDATAHIGH_MAX_INDEX       31
-#define I40E_GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_SHIFT 0
-#define I40E_GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_MASK  (0xFFFFFFFF \
-			<< I40E_GLHMC_VFSDDATAHIGH_PMSDDATAHIGH_SHIFT)
-
-#define I40E_GLHMC_VFSDDATALOW(_i)      (0x000C8100 + ((_i) * 4)) \
-				/* _i=0...31 */
-#define I40E_GLHMC_VFSDDATALOW_MAX_INDEX        31
-#define I40E_GLHMC_VFSDDATALOW_PMSDVALID_SHIFT   0
-#define I40E_GLHMC_VFSDDATALOW_PMSDVALID_MASK  (0x1 \
-			<< I40E_GLHMC_VFSDDATALOW_PMSDVALID_SHIFT)
-#define I40E_GLHMC_VFSDDATALOW_PMSDTYPE_SHIFT    1
-#define I40E_GLHMC_VFSDDATALOW_PMSDTYPE_MASK  (0x1 \
-			<< I40E_GLHMC_VFSDDATALOW_PMSDTYPE_SHIFT)
-#define I40E_GLHMC_VFSDDATALOW_PMSDBPCOUNT_SHIFT 2
-#define I40E_GLHMC_VFSDDATALOW_PMSDBPCOUNT_MASK  (0x3FF \
-			<< I40E_GLHMC_VFSDDATALOW_PMSDBPCOUNT_SHIFT)
-#define I40E_GLHMC_VFSDDATALOW_PMSDDATALOW_SHIFT 12
-#define I40E_GLHMC_VFSDDATALOW_PMSDDATALOW_MASK  (0xFFFFF \
-			<< I40E_GLHMC_VFSDDATALOW_PMSDDATALOW_SHIFT)
-
-#define I40E_GLPE_FWLDSTATUS                     0x0000D200
-#define I40E_GLPE_FWLDSTATUS_LOAD_REQUESTED_SHIFT 0
-#define I40E_GLPE_FWLDSTATUS_LOAD_REQUESTED_MASK  (0x1 \
-			<< I40E_GLPE_FWLDSTATUS_LOAD_REQUESTED_SHIFT)
-#define I40E_GLPE_FWLDSTATUS_DONE_SHIFT           1
-#define I40E_GLPE_FWLDSTATUS_DONE_MASK  (0x1 << I40E_GLPE_FWLDSTATUS_DONE_SHIFT)
-#define I40E_GLPE_FWLDSTATUS_CQP_FAIL_SHIFT       2
-#define I40E_GLPE_FWLDSTATUS_CQP_FAIL_MASK  (0x1 \
-			 << I40E_GLPE_FWLDSTATUS_CQP_FAIL_SHIFT)
-#define I40E_GLPE_FWLDSTATUS_TEP_FAIL_SHIFT       3
-#define I40E_GLPE_FWLDSTATUS_TEP_FAIL_MASK  (0x1 \
-			 << I40E_GLPE_FWLDSTATUS_TEP_FAIL_SHIFT)
-#define I40E_GLPE_FWLDSTATUS_OOP_FAIL_SHIFT       4
-#define I40E_GLPE_FWLDSTATUS_OOP_FAIL_MASK  (0x1 \
-			 << I40E_GLPE_FWLDSTATUS_OOP_FAIL_SHIFT)
-
-struct i40iw_sc_dev;
-struct i40iw_sc_qp;
-struct i40iw_puda_buf;
-struct i40iw_puda_completion_info;
-struct i40iw_update_sds_info;
-struct i40iw_hmc_fcn_info;
-struct i40iw_virtchnl_work_info;
-struct i40iw_manage_vf_pble_info;
-struct i40iw_device;
-struct i40iw_hmc_info;
-struct i40iw_hw;
-
-u8 __iomem *i40iw_get_hw_addr(void *dev);
-void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp);
-enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev);
-bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev);
-enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc, void *addr,
-					      u32 length, u32 value);
-struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, struct i40iw_puda_buf *buf);
-void i40iw_ieq_update_tcpip_info(struct i40iw_puda_buf *buf, u16 length, u32 seqnum);
-void i40iw_free_hash_desc(struct shash_desc *);
-enum i40iw_status_code i40iw_init_hash_desc(struct shash_desc **);
-enum i40iw_status_code i40iw_puda_get_tcpip_info(struct i40iw_puda_completion_info *info,
-						 struct i40iw_puda_buf *buf);
-enum i40iw_status_code i40iw_cqp_sds_cmd(struct i40iw_sc_dev *dev,
-					 struct i40iw_update_sds_info *info);
-enum i40iw_status_code i40iw_cqp_manage_hmc_fcn_cmd(struct i40iw_sc_dev *dev,
-						    struct i40iw_hmc_fcn_info *hmcfcninfo);
-enum i40iw_status_code i40iw_cqp_query_fpm_values_cmd(struct i40iw_sc_dev *dev,
-						      struct i40iw_dma_mem *values_mem,
-						      u8 hmc_fn_id);
-enum i40iw_status_code i40iw_cqp_commit_fpm_values_cmd(struct i40iw_sc_dev *dev,
-						       struct i40iw_dma_mem *values_mem,
-						       u8 hmc_fn_id);
-enum i40iw_status_code i40iw_alloc_query_fpm_buf(struct i40iw_sc_dev *dev,
-						 struct i40iw_dma_mem *mem);
-enum i40iw_status_code i40iw_cqp_manage_vf_pble_bp(struct i40iw_sc_dev *dev,
-						   struct i40iw_manage_vf_pble_info *info);
-void i40iw_cqp_spawn_worker(struct i40iw_sc_dev *dev,
-			    struct i40iw_virtchnl_work_info *work_info, u32 iw_vf_idx);
-void *i40iw_remove_head(struct list_head *list);
-void i40iw_qp_suspend_resume(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp, bool suspend);
-
-void i40iw_term_modify_qp(struct i40iw_sc_qp *qp, u8 next_state, u8 term, u8 term_len);
-void i40iw_terminate_done(struct i40iw_sc_qp *qp, int timeout_occurred);
-void i40iw_terminate_start_timer(struct i40iw_sc_qp *qp);
-void i40iw_terminate_del_timer(struct i40iw_sc_qp *qp);
-
-enum i40iw_status_code i40iw_hw_manage_vf_pble_bp(struct i40iw_device *iwdev,
-						  struct i40iw_manage_vf_pble_info *info,
-						  bool wait);
-struct i40iw_sc_vsi;
-void i40iw_hw_stats_start_timer(struct i40iw_sc_vsi *vsi);
-void i40iw_hw_stats_stop_timer(struct i40iw_sc_vsi *vsi);
-#define i40iw_mmiowb() do { } while (0)
-void i40iw_wr32(struct i40iw_hw *hw, u32 reg, u32 value);
-u32  i40iw_rd32(struct i40iw_hw *hw, u32 reg);
-#endif				/* _I40IW_OSDEP_H_ */
diff --git a/drivers/infiniband/hw/i40iw/i40iw_p.h b/drivers/infiniband/hw/i40iw/i40iw_p.h
deleted file mode 100644
index 4c429567bbb4..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_p.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_P_H
-#define I40IW_P_H
-
-#define PAUSE_TIMER_VALUE       0xFFFF
-#define REFRESH_THRESHOLD       0x7FFF
-#define HIGH_THRESHOLD          0x800
-#define LOW_THRESHOLD           0x200
-#define ALL_TC2PFC              0xFF
-#define CQP_COMPL_WAIT_TIME     0x3E8
-#define CQP_TIMEOUT_THRESHOLD   5
-
-void i40iw_debug_buf(struct i40iw_sc_dev *dev, enum i40iw_debug_flag mask,
-		     char *desc, u64 *buf, u32 size);
-/* init operations */
-enum i40iw_status_code i40iw_device_init(struct i40iw_sc_dev *dev,
-					 struct i40iw_device_init_info *info);
-
-void i40iw_sc_cqp_post_sq(struct i40iw_sc_cqp *cqp);
-
-u64 *i40iw_sc_cqp_get_next_send_wqe(struct i40iw_sc_cqp *cqp, u64 scratch);
-
-void i40iw_check_cqp_progress(struct i40iw_cqp_timeout *cqp_timeout, struct i40iw_sc_dev *dev);
-
-enum i40iw_status_code i40iw_sc_mr_fast_register(struct i40iw_sc_qp *qp,
-						 struct i40iw_fast_reg_stag_info *info,
-						 bool post_sq);
-
-void i40iw_insert_wqe_hdr(u64 *wqe, u64 header);
-
-/* HMC/FPM functions */
-enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev,
-					    u8 hmc_fn_id);
-
-enum i40iw_status_code i40iw_pf_init_vfhmc(struct i40iw_sc_dev *dev, u8 vf_hmc_fn_id,
-					   u32 *vf_cnt_array);
-
-/* stats functions */
-void i40iw_hw_stats_refresh_all(struct i40iw_vsi_pestat *stats);
-void i40iw_hw_stats_read_all(struct i40iw_vsi_pestat *stats, struct i40iw_dev_hw_stats *stats_values);
-void i40iw_hw_stats_read_32(struct i40iw_vsi_pestat *stats,
-			    enum i40iw_hw_stats_index_32b index,
-			    u64 *value);
-void i40iw_hw_stats_read_64(struct i40iw_vsi_pestat *stats,
-			    enum i40iw_hw_stats_index_64b index,
-			    u64 *value);
-void i40iw_hw_stats_init(struct i40iw_vsi_pestat *stats, u8 index, bool is_pf);
-
-/* vsi misc functions */
-enum i40iw_status_code i40iw_vsi_stats_init(struct i40iw_sc_vsi *vsi, struct i40iw_vsi_stats_info *info);
-void i40iw_vsi_stats_free(struct i40iw_sc_vsi *vsi);
-void i40iw_sc_vsi_init(struct i40iw_sc_vsi *vsi, struct i40iw_vsi_init_info *info);
-
-void i40iw_change_l2params(struct i40iw_sc_vsi *vsi, struct i40iw_l2params *l2params);
-void i40iw_qp_add_qos(struct i40iw_sc_qp *qp);
-void i40iw_qp_rem_qos(struct i40iw_sc_qp *qp);
-void i40iw_terminate_send_fin(struct i40iw_sc_qp *qp);
-
-void i40iw_terminate_connection(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info);
-
-void i40iw_terminate_received(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info);
-
-enum i40iw_status_code i40iw_sc_suspend_qp(struct i40iw_sc_cqp *cqp,
-					   struct i40iw_sc_qp *qp, u64 scratch);
-
-enum i40iw_status_code i40iw_sc_resume_qp(struct i40iw_sc_cqp *cqp,
-					  struct i40iw_sc_qp *qp, u64 scratch);
-
-enum i40iw_status_code i40iw_sc_static_hmc_pages_allocated(struct i40iw_sc_cqp *cqp,
-							   u64 scratch, u8 hmc_fn_id,
-							   bool post_sq,
-							   bool poll_registers);
-
-enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_count);
-enum i40iw_status_code i40iw_get_rdma_features(struct i40iw_sc_dev *dev);
-
-void free_sd_mem(struct i40iw_sc_dev *dev);
-
-enum i40iw_status_code i40iw_process_cqp_cmd(struct i40iw_sc_dev *dev,
-					     struct cqp_commands_info *pcmdinfo);
-
-enum i40iw_status_code i40iw_process_bh(struct i40iw_sc_dev *dev);
-
-/* prototype for functions used for dynamic memory allocation */
-enum i40iw_status_code i40iw_allocate_dma_mem(struct i40iw_hw *hw,
-					      struct i40iw_dma_mem *mem, u64 size,
-					      u32 alignment);
-void i40iw_free_dma_mem(struct i40iw_hw *hw, struct i40iw_dma_mem *mem);
-enum i40iw_status_code i40iw_allocate_virt_mem(struct i40iw_hw *hw,
-					       struct i40iw_virt_mem *mem, u32 size);
-enum i40iw_status_code i40iw_free_virt_mem(struct i40iw_hw *hw,
-					   struct i40iw_virt_mem *mem);
-u8 i40iw_get_encoded_wqe_size(u32 wqsize, bool cqpsq);
-void i40iw_reinitialize_ieq(struct i40iw_sc_dev *dev);
-
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c b/drivers/infiniband/hw/i40iw/i40iw_pble.c
deleted file mode 100644
index 146a4148219b..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_pble.c
+++ /dev/null
@@ -1,611 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include "i40iw_status.h"
-#include "i40iw_osdep.h"
-#include "i40iw_register.h"
-#include "i40iw_hmc.h"
-
-#include "i40iw_d.h"
-#include "i40iw_type.h"
-#include "i40iw_p.h"
-
-#include <linux/pci.h>
-#include <linux/genalloc.h>
-#include <linux/vmalloc.h>
-#include "i40iw_pble.h"
-#include "i40iw.h"
-
-struct i40iw_device;
-static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev,
-					    struct i40iw_hmc_pble_rsrc *pble_rsrc);
-static void i40iw_free_vmalloc_mem(struct i40iw_hw *hw, struct i40iw_chunk *chunk);
-
-/**
- * i40iw_destroy_pble_pool - destroy pool during module unload
- * @dev: i40iw_sc_dev struct
- * @pble_rsrc:	pble resources
- */
-void i40iw_destroy_pble_pool(struct i40iw_sc_dev *dev, struct i40iw_hmc_pble_rsrc *pble_rsrc)
-{
-	struct list_head *clist;
-	struct list_head *tlist;
-	struct i40iw_chunk *chunk;
-	struct i40iw_pble_pool *pinfo = &pble_rsrc->pinfo;
-
-	if (pinfo->pool) {
-		list_for_each_safe(clist, tlist, &pinfo->clist) {
-			chunk = list_entry(clist, struct i40iw_chunk, list);
-			if (chunk->type == I40IW_VMALLOC)
-				i40iw_free_vmalloc_mem(dev->hw, chunk);
-			kfree(chunk);
-		}
-		gen_pool_destroy(pinfo->pool);
-	}
-}
-
-/**
- * i40iw_hmc_init_pble - Initialize pble resources during module load
- * @dev: i40iw_sc_dev struct
- * @pble_rsrc:	pble resources
- */
-enum i40iw_status_code i40iw_hmc_init_pble(struct i40iw_sc_dev *dev,
-					   struct i40iw_hmc_pble_rsrc *pble_rsrc)
-{
-	struct i40iw_hmc_info *hmc_info;
-	u32 fpm_idx = 0;
-
-	hmc_info = dev->hmc_info;
-	pble_rsrc->fpm_base_addr = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].base;
-	/* Now start the pble' on 4k boundary */
-	if (pble_rsrc->fpm_base_addr & 0xfff)
-		fpm_idx = (PAGE_SIZE - (pble_rsrc->fpm_base_addr & 0xfff)) >> 3;
-
-	pble_rsrc->unallocated_pble =
-	    hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt - fpm_idx;
-	pble_rsrc->next_fpm_addr = pble_rsrc->fpm_base_addr + (fpm_idx << 3);
-
-	pble_rsrc->pinfo.pool_shift = POOL_SHIFT;
-	pble_rsrc->pinfo.pool = gen_pool_create(pble_rsrc->pinfo.pool_shift, -1);
-	INIT_LIST_HEAD(&pble_rsrc->pinfo.clist);
-	if (!pble_rsrc->pinfo.pool)
-		goto error;
-
-	if (add_pble_pool(dev, pble_rsrc))
-		goto error;
-
-	return 0;
-
- error:i40iw_destroy_pble_pool(dev, pble_rsrc);
-	return I40IW_ERR_NO_MEMORY;
-}
-
-/**
- * get_sd_pd_idx -  Returns sd index, pd index and rel_pd_idx from fpm address
- * @pble_rsrc:	structure containing fpm address
- * @idx: where to return indexes
- */
-static inline void get_sd_pd_idx(struct i40iw_hmc_pble_rsrc *pble_rsrc,
-				 struct sd_pd_idx *idx)
-{
-	idx->sd_idx = (u32)(pble_rsrc->next_fpm_addr) / I40IW_HMC_DIRECT_BP_SIZE;
-	idx->pd_idx = (u32)(pble_rsrc->next_fpm_addr) / I40IW_HMC_PAGED_BP_SIZE;
-	idx->rel_pd_idx = (idx->pd_idx % I40IW_HMC_PD_CNT_IN_SD);
-}
-
-/**
- * add_sd_direct - add sd direct for pble
- * @dev: hardware control device structure
- * @pble_rsrc: pble resource ptr
- * @info: page info for sd
- */
-static enum i40iw_status_code add_sd_direct(struct i40iw_sc_dev *dev,
-					    struct i40iw_hmc_pble_rsrc *pble_rsrc,
-					    struct i40iw_add_page_info *info)
-{
-	enum i40iw_status_code ret_code = 0;
-	struct sd_pd_idx *idx = &info->idx;
-	struct i40iw_chunk *chunk = info->chunk;
-	struct i40iw_hmc_info *hmc_info = info->hmc_info;
-	struct i40iw_hmc_sd_entry *sd_entry = info->sd_entry;
-	u32 offset = 0;
-
-	if (!sd_entry->valid) {
-		if (dev->is_pf) {
-			ret_code = i40iw_add_sd_table_entry(dev->hw, hmc_info,
-							    info->idx.sd_idx,
-							    I40IW_SD_TYPE_DIRECT,
-							    I40IW_HMC_DIRECT_BP_SIZE);
-			if (ret_code)
-				return ret_code;
-			chunk->type = I40IW_DMA_COHERENT;
-		}
-	}
-	offset = idx->rel_pd_idx << I40IW_HMC_PAGED_BP_SHIFT;
-	chunk->size = info->pages << I40IW_HMC_PAGED_BP_SHIFT;
-	chunk->vaddr = ((u8 *)sd_entry->u.bp.addr.va + offset);
-	chunk->fpm_addr = pble_rsrc->next_fpm_addr;
-	i40iw_debug(dev, I40IW_DEBUG_PBLE, "chunk_size[%d] = 0x%x vaddr=%p fpm_addr = %llx\n",
-		    chunk->size, chunk->size, chunk->vaddr, chunk->fpm_addr);
-	return 0;
-}
-
-/**
- * i40iw_free_vmalloc_mem - free vmalloc during close
- * @hw: hw struct
- * @chunk: chunk information for vmalloc
- */
-static void i40iw_free_vmalloc_mem(struct i40iw_hw *hw, struct i40iw_chunk *chunk)
-{
-	struct pci_dev *pcidev = hw->pcidev;
-	int i;
-
-	if (!chunk->pg_cnt)
-		goto done;
-	for (i = 0; i < chunk->pg_cnt; i++)
-		dma_unmap_page(&pcidev->dev, chunk->dmaaddrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
-
- done:
-	kfree(chunk->dmaaddrs);
-	chunk->dmaaddrs = NULL;
-	vfree(chunk->vaddr);
-	chunk->vaddr = NULL;
-	chunk->type = 0;
-}
-
-/**
- * i40iw_get_vmalloc_mem - get 2M page for sd
- * @hw: hardware address
- * @chunk: chunk to adf
- * @pg_cnt: #of 4 K pages
- */
-static enum i40iw_status_code i40iw_get_vmalloc_mem(struct i40iw_hw *hw,
-						    struct i40iw_chunk *chunk,
-						    int pg_cnt)
-{
-	struct pci_dev *pcidev = hw->pcidev;
-	struct page *page;
-	u8 *addr;
-	u32 size;
-	int i;
-
-	chunk->dmaaddrs = kzalloc(pg_cnt << 3, GFP_KERNEL);
-	if (!chunk->dmaaddrs)
-		return I40IW_ERR_NO_MEMORY;
-	size = PAGE_SIZE * pg_cnt;
-	chunk->vaddr = vmalloc(size);
-	if (!chunk->vaddr) {
-		kfree(chunk->dmaaddrs);
-		chunk->dmaaddrs = NULL;
-		return I40IW_ERR_NO_MEMORY;
-	}
-	chunk->size = size;
-	addr = (u8 *)chunk->vaddr;
-	for (i = 0; i < pg_cnt; i++) {
-		page = vmalloc_to_page((void *)addr);
-		if (!page)
-			break;
-		chunk->dmaaddrs[i] = dma_map_page(&pcidev->dev, page, 0,
-						  PAGE_SIZE, DMA_BIDIRECTIONAL);
-		if (dma_mapping_error(&pcidev->dev, chunk->dmaaddrs[i]))
-			break;
-		addr += PAGE_SIZE;
-	}
-
-	chunk->pg_cnt = i;
-	chunk->type = I40IW_VMALLOC;
-	if (i == pg_cnt)
-		return 0;
-
-	i40iw_free_vmalloc_mem(hw, chunk);
-	return I40IW_ERR_NO_MEMORY;
-}
-
-/**
- * fpm_to_idx - given fpm address, get pble index
- * @pble_rsrc: pble resource management
- * @addr: fpm address for index
- */
-static inline u32 fpm_to_idx(struct i40iw_hmc_pble_rsrc *pble_rsrc, u64 addr)
-{
-	return (addr - (pble_rsrc->fpm_base_addr)) >> 3;
-}
-
-/**
- * add_bp_pages - add backing pages for sd
- * @dev: hardware control device structure
- * @pble_rsrc: pble resource management
- * @info: page info for sd
- */
-static enum i40iw_status_code add_bp_pages(struct i40iw_sc_dev *dev,
-					   struct i40iw_hmc_pble_rsrc *pble_rsrc,
-					   struct i40iw_add_page_info *info)
-{
-	u8 *addr;
-	struct i40iw_dma_mem mem;
-	struct i40iw_hmc_pd_entry *pd_entry;
-	struct i40iw_hmc_sd_entry *sd_entry = info->sd_entry;
-	struct i40iw_hmc_info *hmc_info = info->hmc_info;
-	struct i40iw_chunk *chunk = info->chunk;
-	struct i40iw_manage_vf_pble_info vf_pble_info;
-	enum i40iw_status_code status = 0;
-	u32 rel_pd_idx = info->idx.rel_pd_idx;
-	u32 pd_idx = info->idx.pd_idx;
-	u32 i;
-
-	status = i40iw_get_vmalloc_mem(dev->hw, chunk, info->pages);
-	if (status)
-		return I40IW_ERR_NO_MEMORY;
-	status = i40iw_add_sd_table_entry(dev->hw, hmc_info,
-					  info->idx.sd_idx, I40IW_SD_TYPE_PAGED,
-					  I40IW_HMC_DIRECT_BP_SIZE);
-	if (status)
-		goto error;
-	if (!dev->is_pf) {
-		status = i40iw_vchnl_vf_add_hmc_objs(dev, I40IW_HMC_IW_PBLE,
-						     fpm_to_idx(pble_rsrc,
-								pble_rsrc->next_fpm_addr),
-						     (info->pages << PBLE_512_SHIFT));
-		if (status) {
-			i40iw_pr_err("allocate PBLEs in the PF.  Error %i\n", status);
-			goto error;
-		}
-	}
-	addr = chunk->vaddr;
-	for (i = 0; i < info->pages; i++) {
-		mem.pa = chunk->dmaaddrs[i];
-		mem.size = PAGE_SIZE;
-		mem.va = (void *)(addr);
-		pd_entry = &sd_entry->u.pd_table.pd_entry[rel_pd_idx++];
-		if (!pd_entry->valid) {
-			status = i40iw_add_pd_table_entry(dev->hw, hmc_info, pd_idx++, &mem);
-			if (status)
-				goto error;
-			addr += PAGE_SIZE;
-		} else {
-			i40iw_pr_err("pd entry is valid expecting to be invalid\n");
-		}
-	}
-	if (!dev->is_pf) {
-		vf_pble_info.first_pd_index = info->idx.rel_pd_idx;
-		vf_pble_info.inv_pd_ent = false;
-		vf_pble_info.pd_entry_cnt = PBLE_PER_PAGE;
-		vf_pble_info.pd_pl_pba = sd_entry->u.pd_table.pd_page_addr.pa;
-		vf_pble_info.sd_index = info->idx.sd_idx;
-		status = i40iw_hw_manage_vf_pble_bp(dev->back_dev,
-						    &vf_pble_info, true);
-		if (status) {
-			i40iw_pr_err("CQP manage VF PBLE BP failed.  %i\n", status);
-			goto error;
-		}
-	}
-	chunk->fpm_addr = pble_rsrc->next_fpm_addr;
-	return 0;
-error:
-	i40iw_free_vmalloc_mem(dev->hw, chunk);
-	return status;
-}
-
-/**
- * add_pble_pool - add a sd entry for pble resoure
- * @dev: hardware control device structure
- * @pble_rsrc: pble resource management
- */
-static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev,
-					    struct i40iw_hmc_pble_rsrc *pble_rsrc)
-{
-	struct i40iw_hmc_sd_entry *sd_entry;
-	struct i40iw_hmc_info *hmc_info;
-	struct i40iw_chunk *chunk;
-	struct i40iw_add_page_info info;
-	struct sd_pd_idx *idx = &info.idx;
-	enum i40iw_status_code ret_code = 0;
-	enum i40iw_sd_entry_type sd_entry_type;
-	u64 sd_reg_val = 0;
-	u32 pages;
-
-	if (pble_rsrc->unallocated_pble < PBLE_PER_PAGE)
-		return I40IW_ERR_NO_MEMORY;
-	if (pble_rsrc->next_fpm_addr & 0xfff) {
-		i40iw_pr_err("next fpm_addr %llx\n", pble_rsrc->next_fpm_addr);
-		return I40IW_ERR_INVALID_PAGE_DESC_INDEX;
-	}
-	chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
-	if (!chunk)
-		return I40IW_ERR_NO_MEMORY;
-	hmc_info = dev->hmc_info;
-	chunk->fpm_addr = pble_rsrc->next_fpm_addr;
-	get_sd_pd_idx(pble_rsrc, idx);
-	sd_entry = &hmc_info->sd_table.sd_entry[idx->sd_idx];
-	pages = (idx->rel_pd_idx) ? (I40IW_HMC_PD_CNT_IN_SD -
-			idx->rel_pd_idx) : I40IW_HMC_PD_CNT_IN_SD;
-	pages = min(pages, pble_rsrc->unallocated_pble >> PBLE_512_SHIFT);
-	info.chunk = chunk;
-	info.hmc_info = hmc_info;
-	info.pages = pages;
-	info.sd_entry = sd_entry;
-	if (!sd_entry->valid) {
-		sd_entry_type = (!idx->rel_pd_idx &&
-				 (pages == I40IW_HMC_PD_CNT_IN_SD) &&
-				 dev->is_pf) ? I40IW_SD_TYPE_DIRECT : I40IW_SD_TYPE_PAGED;
-	} else {
-		sd_entry_type = sd_entry->entry_type;
-	}
-	i40iw_debug(dev, I40IW_DEBUG_PBLE,
-		    "pages = %d, unallocated_pble[%u] current_fpm_addr = %llx\n",
-		    pages, pble_rsrc->unallocated_pble, pble_rsrc->next_fpm_addr);
-	i40iw_debug(dev, I40IW_DEBUG_PBLE, "sd_entry_type = %d sd_entry valid = %d\n",
-		    sd_entry_type, sd_entry->valid);
-
-	if (sd_entry_type == I40IW_SD_TYPE_DIRECT)
-		ret_code = add_sd_direct(dev, pble_rsrc, &info);
-	if (ret_code)
-		sd_entry_type = I40IW_SD_TYPE_PAGED;
-	else
-		pble_rsrc->stats_direct_sds++;
-
-	if (sd_entry_type == I40IW_SD_TYPE_PAGED) {
-		ret_code = add_bp_pages(dev, pble_rsrc, &info);
-		if (ret_code)
-			goto error;
-		else
-			pble_rsrc->stats_paged_sds++;
-	}
-
-	if (gen_pool_add_virt(pble_rsrc->pinfo.pool, (unsigned long)chunk->vaddr,
-			      (phys_addr_t)chunk->fpm_addr, chunk->size, -1)) {
-		i40iw_pr_err("could not allocate memory by gen_pool_addr_virt()\n");
-		ret_code = I40IW_ERR_NO_MEMORY;
-		goto error;
-	}
-	pble_rsrc->next_fpm_addr += chunk->size;
-	i40iw_debug(dev, I40IW_DEBUG_PBLE, "next_fpm_addr = %llx chunk_size[%u] = 0x%x\n",
-		    pble_rsrc->next_fpm_addr, chunk->size, chunk->size);
-	pble_rsrc->unallocated_pble -= (chunk->size >> 3);
-	sd_reg_val = (sd_entry_type == I40IW_SD_TYPE_PAGED) ?
-			sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa;
-	if (dev->is_pf && !sd_entry->valid) {
-		ret_code = i40iw_hmc_sd_one(dev, hmc_info->hmc_fn_id,
-					    sd_reg_val, idx->sd_idx,
-					    sd_entry->entry_type, true);
-		if (ret_code) {
-			i40iw_pr_err("cqp cmd failed for sd (pbles)\n");
-			goto error;
-		}
-	}
-
-	sd_entry->valid = true;
-	list_add(&chunk->list, &pble_rsrc->pinfo.clist);
-	return 0;
- error:
-	kfree(chunk);
-	return ret_code;
-}
-
-/**
- * free_lvl2 - fee level 2 pble
- * @pble_rsrc: pble resource management
- * @palloc: level 2 pble allocation
- */
-static void free_lvl2(struct i40iw_hmc_pble_rsrc *pble_rsrc,
-		      struct i40iw_pble_alloc *palloc)
-{
-	u32 i;
-	struct gen_pool *pool;
-	struct i40iw_pble_level2 *lvl2 = &palloc->level2;
-	struct i40iw_pble_info *root = &lvl2->root;
-	struct i40iw_pble_info *leaf = lvl2->leaf;
-
-	pool = pble_rsrc->pinfo.pool;
-
-	for (i = 0; i < lvl2->leaf_cnt; i++, leaf++) {
-		if (leaf->addr)
-			gen_pool_free(pool, leaf->addr, (leaf->cnt << 3));
-		else
-			break;
-	}
-
-	if (root->addr)
-		gen_pool_free(pool, root->addr, (root->cnt << 3));
-
-	kfree(lvl2->leaf);
-	lvl2->leaf = NULL;
-}
-
-/**
- * get_lvl2_pble - get level 2 pble resource
- * @pble_rsrc: pble resource management
- * @palloc: level 2 pble allocation
- * @pool: pool pointer
- */
-static enum i40iw_status_code get_lvl2_pble(struct i40iw_hmc_pble_rsrc *pble_rsrc,
-					    struct i40iw_pble_alloc *palloc,
-					    struct gen_pool *pool)
-{
-	u32 lf4k, lflast, total, i;
-	u32 pblcnt = PBLE_PER_PAGE;
-	u64 *addr;
-	struct i40iw_pble_level2 *lvl2 = &palloc->level2;
-	struct i40iw_pble_info *root = &lvl2->root;
-	struct i40iw_pble_info *leaf;
-
-	/* number of full 512 (4K) leafs) */
-	lf4k = palloc->total_cnt >> 9;
-	lflast = palloc->total_cnt % PBLE_PER_PAGE;
-	total = (lflast == 0) ? lf4k : lf4k + 1;
-	lvl2->leaf_cnt = total;
-
-	leaf = kzalloc((sizeof(*leaf) * total), GFP_ATOMIC);
-	if (!leaf)
-		return I40IW_ERR_NO_MEMORY;
-	lvl2->leaf = leaf;
-	/* allocate pbles for the root */
-	root->addr = gen_pool_alloc(pool, (total << 3));
-	if (!root->addr) {
-		kfree(lvl2->leaf);
-		lvl2->leaf = NULL;
-		return I40IW_ERR_NO_MEMORY;
-	}
-	root->idx = fpm_to_idx(pble_rsrc,
-			       (u64)gen_pool_virt_to_phys(pool, root->addr));
-	root->cnt = total;
-	addr = (u64 *)root->addr;
-	for (i = 0; i < total; i++, leaf++) {
-		pblcnt = (lflast && ((i + 1) == total)) ? lflast : PBLE_PER_PAGE;
-		leaf->addr = gen_pool_alloc(pool, (pblcnt << 3));
-		if (!leaf->addr)
-			goto error;
-		leaf->idx = fpm_to_idx(pble_rsrc, (u64)gen_pool_virt_to_phys(pool, leaf->addr));
-
-		leaf->cnt = pblcnt;
-		*addr = (u64)leaf->idx;
-		addr++;
-	}
-	palloc->level = I40IW_LEVEL_2;
-	pble_rsrc->stats_lvl2++;
-	return 0;
- error:
-	free_lvl2(pble_rsrc, palloc);
-	return I40IW_ERR_NO_MEMORY;
-}
-
-/**
- * get_lvl1_pble - get level 1 pble resource
- * @dev: hardware control device structure
- * @pble_rsrc: pble resource management
- * @palloc: level 1 pble allocation
- */
-static enum i40iw_status_code get_lvl1_pble(struct i40iw_sc_dev *dev,
-					    struct i40iw_hmc_pble_rsrc *pble_rsrc,
-					    struct i40iw_pble_alloc *palloc)
-{
-	u64 *addr;
-	struct gen_pool *pool;
-	struct i40iw_pble_info *lvl1 = &palloc->level1;
-
-	pool = pble_rsrc->pinfo.pool;
-	addr = (u64 *)gen_pool_alloc(pool, (palloc->total_cnt << 3));
-
-	if (!addr)
-		return I40IW_ERR_NO_MEMORY;
-
-	palloc->level = I40IW_LEVEL_1;
-	lvl1->addr = (unsigned long)addr;
-	lvl1->idx = fpm_to_idx(pble_rsrc, (u64)gen_pool_virt_to_phys(pool,
-			       (unsigned long)addr));
-	lvl1->cnt = palloc->total_cnt;
-	pble_rsrc->stats_lvl1++;
-	return 0;
-}
-
-/**
- * get_lvl1_lvl2_pble - calls get_lvl1 and get_lvl2 pble routine
- * @dev: i40iw_sc_dev struct
- * @pble_rsrc:	pble resources
- * @palloc: contains all inforamtion regarding pble (idx + pble addr)
- * @pool: pointer to general purpose special memory pool descriptor
- */
-static inline enum i40iw_status_code get_lvl1_lvl2_pble(struct i40iw_sc_dev *dev,
-							struct i40iw_hmc_pble_rsrc *pble_rsrc,
-							struct i40iw_pble_alloc *palloc,
-							struct gen_pool *pool)
-{
-	enum i40iw_status_code status = 0;
-
-	status = get_lvl1_pble(dev, pble_rsrc, palloc);
-	if (status && (palloc->total_cnt > PBLE_PER_PAGE))
-		status = get_lvl2_pble(pble_rsrc, palloc, pool);
-	return status;
-}
-
-/**
- * i40iw_get_pble - allocate pbles from the pool
- * @dev: i40iw_sc_dev struct
- * @pble_rsrc:	pble resources
- * @palloc: contains all inforamtion regarding pble (idx + pble addr)
- * @pble_cnt: #of pbles requested
- */
-enum i40iw_status_code i40iw_get_pble(struct i40iw_sc_dev *dev,
-				      struct i40iw_hmc_pble_rsrc *pble_rsrc,
-				      struct i40iw_pble_alloc *palloc,
-				      u32 pble_cnt)
-{
-	struct gen_pool *pool;
-	enum i40iw_status_code status = 0;
-	u32 max_sds = 0;
-	int i;
-
-	pool = pble_rsrc->pinfo.pool;
-	palloc->total_cnt = pble_cnt;
-	palloc->level = I40IW_LEVEL_0;
-	/*check first to see if we can get pble's without acquiring additional sd's */
-	status = get_lvl1_lvl2_pble(dev, pble_rsrc, palloc, pool);
-	if (!status)
-		goto exit;
-	max_sds = (palloc->total_cnt >> 18) + 1;
-	for (i = 0; i < max_sds; i++) {
-		status = add_pble_pool(dev, pble_rsrc);
-		if (status)
-			break;
-		status = get_lvl1_lvl2_pble(dev, pble_rsrc, palloc, pool);
-		if (!status)
-			break;
-	}
-exit:
-	if (!status)
-		pble_rsrc->stats_alloc_ok++;
-	else
-		pble_rsrc->stats_alloc_fail++;
-
-	return status;
-}
-
-/**
- * i40iw_free_pble - put pbles back into pool
- * @pble_rsrc:	pble resources
- * @palloc: contains all inforamtion regarding pble resource being freed
- */
-void i40iw_free_pble(struct i40iw_hmc_pble_rsrc *pble_rsrc,
-		     struct i40iw_pble_alloc *palloc)
-{
-	struct gen_pool *pool;
-
-	pool = pble_rsrc->pinfo.pool;
-	if (palloc->level == I40IW_LEVEL_2)
-		free_lvl2(pble_rsrc, palloc);
-	else
-		gen_pool_free(pool, palloc->level1.addr,
-			      (palloc->level1.cnt << 3));
-	pble_rsrc->stats_alloc_freed++;
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.h b/drivers/infiniband/hw/i40iw/i40iw_pble.h
deleted file mode 100644
index 7b1851d21cc0..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_pble.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_PBLE_H
-#define I40IW_PBLE_H
-
-#define POOL_SHIFT      6
-#define PBLE_PER_PAGE   512
-#define I40IW_HMC_PAGED_BP_SHIFT 12
-#define PBLE_512_SHIFT  9
-
-enum i40iw_pble_level {
-	I40IW_LEVEL_0 = 0,
-	I40IW_LEVEL_1 = 1,
-	I40IW_LEVEL_2 = 2
-};
-
-enum i40iw_alloc_type {
-	I40IW_NO_ALLOC = 0,
-	I40IW_DMA_COHERENT = 1,
-	I40IW_VMALLOC = 2
-};
-
-struct i40iw_pble_info {
-	unsigned long addr;
-	u32 idx;
-	u32 cnt;
-};
-
-struct i40iw_pble_level2 {
-	struct i40iw_pble_info root;
-	struct i40iw_pble_info *leaf;
-	u32 leaf_cnt;
-};
-
-struct i40iw_pble_alloc {
-	u32 total_cnt;
-	enum i40iw_pble_level level;
-	union {
-		struct i40iw_pble_info level1;
-		struct i40iw_pble_level2 level2;
-	};
-};
-
-struct sd_pd_idx {
-	u32 sd_idx;
-	u32 pd_idx;
-	u32 rel_pd_idx;
-};
-
-struct i40iw_add_page_info {
-	struct i40iw_chunk *chunk;
-	struct i40iw_hmc_sd_entry *sd_entry;
-	struct i40iw_hmc_info *hmc_info;
-	struct sd_pd_idx idx;
-	u32 pages;
-};
-
-struct i40iw_chunk {
-	struct list_head list;
-	u32 size;
-	void *vaddr;
-	u64 fpm_addr;
-	u32 pg_cnt;
-	dma_addr_t *dmaaddrs;
-	enum i40iw_alloc_type type;
-};
-
-struct i40iw_pble_pool {
-	struct gen_pool *pool;
-	struct list_head clist;
-	u32 total_pble_alloc;
-	u32 free_pble_cnt;
-	u32 pool_shift;
-};
-
-struct i40iw_hmc_pble_rsrc {
-	u32 unallocated_pble;
-	u64 fpm_base_addr;
-	u64 next_fpm_addr;
-	struct i40iw_pble_pool pinfo;
-
-	u32 stats_direct_sds;
-	u32 stats_paged_sds;
-	u64 stats_alloc_ok;
-	u64 stats_alloc_fail;
-	u64 stats_alloc_freed;
-	u64 stats_lvl1;
-	u64 stats_lvl2;
-};
-
-void i40iw_destroy_pble_pool(struct i40iw_sc_dev *dev, struct i40iw_hmc_pble_rsrc *pble_rsrc);
-enum i40iw_status_code i40iw_hmc_init_pble(struct i40iw_sc_dev *dev,
-					   struct i40iw_hmc_pble_rsrc *pble_rsrc);
-void i40iw_free_pble(struct i40iw_hmc_pble_rsrc *pble_rsrc, struct i40iw_pble_alloc *palloc);
-enum i40iw_status_code i40iw_get_pble(struct i40iw_sc_dev *dev,
-				      struct i40iw_hmc_pble_rsrc *pble_rsrc,
-				      struct i40iw_pble_alloc *palloc,
-				      u32 pble_cnt);
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c
deleted file mode 100644
index 88fb68e866ba..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_puda.c
+++ /dev/null
@@ -1,1496 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include "i40iw_osdep.h"
-#include "i40iw_register.h"
-#include "i40iw_status.h"
-#include "i40iw_hmc.h"
-
-#include "i40iw_d.h"
-#include "i40iw_type.h"
-#include "i40iw_p.h"
-#include "i40iw_puda.h"
-
-static void i40iw_ieq_receive(struct i40iw_sc_vsi *vsi,
-			      struct i40iw_puda_buf *buf);
-static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid);
-static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx);
-static enum i40iw_status_code i40iw_puda_replenish_rq(struct i40iw_puda_rsrc
-						      *rsrc, bool initial);
-/**
- * i40iw_puda_get_listbuf - get buffer from puda list
- * @list: list to use for buffers (ILQ or IEQ)
- */
-static struct i40iw_puda_buf *i40iw_puda_get_listbuf(struct list_head *list)
-{
-	struct i40iw_puda_buf *buf = NULL;
-
-	if (!list_empty(list)) {
-		buf = (struct i40iw_puda_buf *)list->next;
-		list_del((struct list_head *)&buf->list);
-	}
-	return buf;
-}
-
-/**
- * i40iw_puda_get_bufpool - return buffer from resource
- * @rsrc: resource to use for buffer
- */
-struct i40iw_puda_buf *i40iw_puda_get_bufpool(struct i40iw_puda_rsrc *rsrc)
-{
-	struct i40iw_puda_buf *buf = NULL;
-	struct list_head *list = &rsrc->bufpool;
-	unsigned long	flags;
-
-	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
-	buf = i40iw_puda_get_listbuf(list);
-	if (buf)
-		rsrc->avail_buf_count--;
-	else
-		rsrc->stats_buf_alloc_fail++;
-	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
-	return buf;
-}
-
-/**
- * i40iw_puda_ret_bufpool - return buffer to rsrc list
- * @rsrc: resource to use for buffer
- * @buf: buffe to return to resouce
- */
-void i40iw_puda_ret_bufpool(struct i40iw_puda_rsrc *rsrc,
-			    struct i40iw_puda_buf *buf)
-{
-	unsigned long	flags;
-
-	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
-	list_add(&buf->list, &rsrc->bufpool);
-	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
-	rsrc->avail_buf_count++;
-}
-
-/**
- * i40iw_puda_post_recvbuf - set wqe for rcv buffer
- * @rsrc: resource ptr
- * @wqe_idx: wqe index to use
- * @buf: puda buffer for rcv q
- * @initial: flag if during init time
- */
-static void i40iw_puda_post_recvbuf(struct i40iw_puda_rsrc *rsrc, u32 wqe_idx,
-				    struct i40iw_puda_buf *buf, bool initial)
-{
-	u64 *wqe;
-	struct i40iw_sc_qp *qp = &rsrc->qp;
-	u64 offset24 = 0;
-
-	qp->qp_uk.rq_wrid_array[wqe_idx] = (uintptr_t)buf;
-	wqe = qp->qp_uk.rq_base[wqe_idx].elem;
-	i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA,
-		    "%s: wqe_idx= %d buf = %p wqe = %p\n", __func__,
-		    wqe_idx, buf, wqe);
-	if (!initial)
-		get_64bit_val(wqe, 24, &offset24);
-
-	offset24 = (offset24) ? 0 : LS_64(1, I40IWQPSQ_VALID);
-
-	set_64bit_val(wqe, 0, buf->mem.pa);
-	set_64bit_val(wqe, 8,
-		      LS_64(buf->mem.size, I40IWQPSQ_FRAG_LEN));
-	i40iw_insert_wqe_hdr(wqe, offset24);
-}
-
-/**
- * i40iw_puda_replenish_rq - post rcv buffers
- * @rsrc: resource to use for buffer
- * @initial: flag if during init time
- */
-static enum i40iw_status_code i40iw_puda_replenish_rq(struct i40iw_puda_rsrc *rsrc,
-						      bool initial)
-{
-	u32 i;
-	u32 invalid_cnt = rsrc->rxq_invalid_cnt;
-	struct i40iw_puda_buf *buf = NULL;
-
-	for (i = 0; i < invalid_cnt; i++) {
-		buf = i40iw_puda_get_bufpool(rsrc);
-		if (!buf)
-			return I40IW_ERR_list_empty;
-		i40iw_puda_post_recvbuf(rsrc, rsrc->rx_wqe_idx, buf,
-					initial);
-		rsrc->rx_wqe_idx =
-		    ((rsrc->rx_wqe_idx + 1) % rsrc->rq_size);
-		rsrc->rxq_invalid_cnt--;
-	}
-	return 0;
-}
-
-/**
- * i40iw_puda_alloc_buf - allocate mem for buffer
- * @dev: iwarp device
- * @length: length of buffer
- */
-static struct i40iw_puda_buf *i40iw_puda_alloc_buf(struct i40iw_sc_dev *dev,
-						   u32 length)
-{
-	struct i40iw_puda_buf *buf = NULL;
-	struct i40iw_virt_mem buf_mem;
-	enum i40iw_status_code ret;
-
-	ret = i40iw_allocate_virt_mem(dev->hw, &buf_mem,
-				      sizeof(struct i40iw_puda_buf));
-	if (ret) {
-		i40iw_debug(dev, I40IW_DEBUG_PUDA,
-			    "%s: error mem for buf\n", __func__);
-		return NULL;
-	}
-	buf = (struct i40iw_puda_buf *)buf_mem.va;
-	ret = i40iw_allocate_dma_mem(dev->hw, &buf->mem, length, 1);
-	if (ret) {
-		i40iw_debug(dev, I40IW_DEBUG_PUDA,
-			    "%s: error dma mem for buf\n", __func__);
-		i40iw_free_virt_mem(dev->hw, &buf_mem);
-		return NULL;
-	}
-	buf->buf_mem.va = buf_mem.va;
-	buf->buf_mem.size = buf_mem.size;
-	return buf;
-}
-
-/**
- * i40iw_puda_dele_buf - delete buffer back to system
- * @dev: iwarp device
- * @buf: buffer to free
- */
-static void i40iw_puda_dele_buf(struct i40iw_sc_dev *dev,
-				struct i40iw_puda_buf *buf)
-{
-	i40iw_free_dma_mem(dev->hw, &buf->mem);
-	i40iw_free_virt_mem(dev->hw, &buf->buf_mem);
-}
-
-/**
- * i40iw_puda_get_next_send_wqe - return next wqe for processing
- * @qp: puda qp for wqe
- * @wqe_idx: wqe index for caller
- */
-static u64 *i40iw_puda_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx)
-{
-	u64 *wqe = NULL;
-	enum i40iw_status_code ret_code = 0;
-
-	*wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
-	if (!*wqe_idx)
-		qp->swqe_polarity = !qp->swqe_polarity;
-	I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
-	if (ret_code)
-		return wqe;
-	wqe = qp->sq_base[*wqe_idx].elem;
-
-	return wqe;
-}
-
-/**
- * i40iw_puda_poll_info - poll cq for completion
- * @cq: cq for poll
- * @info: info return for successful completion
- */
-static enum i40iw_status_code i40iw_puda_poll_info(struct i40iw_sc_cq *cq,
-						   struct i40iw_puda_completion_info *info)
-{
-	u64 qword0, qword2, qword3;
-	u64 *cqe;
-	u64 comp_ctx;
-	bool valid_bit;
-	u32 major_err, minor_err;
-	bool error;
-
-	cqe = (u64 *)I40IW_GET_CURRENT_CQ_ELEMENT(&cq->cq_uk);
-	get_64bit_val(cqe, 24, &qword3);
-	valid_bit = (bool)RS_64(qword3, I40IW_CQ_VALID);
-
-	if (valid_bit != cq->cq_uk.polarity)
-		return I40IW_ERR_QUEUE_EMPTY;
-
-	i40iw_debug_buf(cq->dev, I40IW_DEBUG_PUDA, "PUDA CQE", cqe, 32);
-	error = (bool)RS_64(qword3, I40IW_CQ_ERROR);
-	if (error) {
-		i40iw_debug(cq->dev, I40IW_DEBUG_PUDA, "%s receive error\n", __func__);
-		major_err = (u32)(RS_64(qword3, I40IW_CQ_MAJERR));
-		minor_err = (u32)(RS_64(qword3, I40IW_CQ_MINERR));
-		info->compl_error = major_err << 16 | minor_err;
-		return I40IW_ERR_CQ_COMPL_ERROR;
-	}
-
-	get_64bit_val(cqe, 0, &qword0);
-	get_64bit_val(cqe, 16, &qword2);
-
-	info->q_type = (u8)RS_64(qword3, I40IW_CQ_SQ);
-	info->qp_id = (u32)RS_64(qword2, I40IWCQ_QPID);
-
-	get_64bit_val(cqe, 8, &comp_ctx);
-	info->qp = (struct i40iw_qp_uk *)(unsigned long)comp_ctx;
-	info->wqe_idx = (u32)RS_64(qword3, I40IW_CQ_WQEIDX);
-
-	if (info->q_type == I40IW_CQE_QTYPE_RQ) {
-		info->vlan_valid = (bool)RS_64(qword3, I40IW_VLAN_TAG_VALID);
-		info->l4proto = (u8)RS_64(qword2, I40IW_UDA_L4PROTO);
-		info->l3proto = (u8)RS_64(qword2, I40IW_UDA_L3PROTO);
-		info->payload_len = (u16)RS_64(qword0, I40IW_UDA_PAYLOADLEN);
-	}
-
-	return 0;
-}
-
-/**
- * i40iw_puda_poll_completion - processes completion for cq
- * @dev: iwarp device
- * @cq: cq getting interrupt
- * @compl_err: return any completion err
- */
-enum i40iw_status_code i40iw_puda_poll_completion(struct i40iw_sc_dev *dev,
-						  struct i40iw_sc_cq *cq, u32 *compl_err)
-{
-	struct i40iw_qp_uk *qp;
-	struct i40iw_cq_uk *cq_uk = &cq->cq_uk;
-	struct i40iw_puda_completion_info info;
-	enum i40iw_status_code ret = 0;
-	struct i40iw_puda_buf *buf;
-	struct i40iw_puda_rsrc *rsrc;
-	void *sqwrid;
-	u8 cq_type = cq->cq_type;
-	unsigned long	flags;
-
-	if ((cq_type == I40IW_CQ_TYPE_ILQ) || (cq_type == I40IW_CQ_TYPE_IEQ)) {
-		rsrc = (cq_type == I40IW_CQ_TYPE_ILQ) ? cq->vsi->ilq : cq->vsi->ieq;
-	} else {
-		i40iw_debug(dev, I40IW_DEBUG_PUDA, "%s qp_type error\n", __func__);
-		return I40IW_ERR_BAD_PTR;
-	}
-	memset(&info, 0, sizeof(info));
-	ret = i40iw_puda_poll_info(cq, &info);
-	*compl_err = info.compl_error;
-	if (ret == I40IW_ERR_QUEUE_EMPTY)
-		return ret;
-	if (ret)
-		goto done;
-
-	qp = info.qp;
-	if (!qp || !rsrc) {
-		ret = I40IW_ERR_BAD_PTR;
-		goto done;
-	}
-
-	if (qp->qp_id != rsrc->qp_id) {
-		ret = I40IW_ERR_BAD_PTR;
-		goto done;
-	}
-
-	if (info.q_type == I40IW_CQE_QTYPE_RQ) {
-		buf = (struct i40iw_puda_buf *)(uintptr_t)qp->rq_wrid_array[info.wqe_idx];
-		/* Get all the tcpip information in the buf header */
-		ret = i40iw_puda_get_tcpip_info(&info, buf);
-		if (ret) {
-			rsrc->stats_rcvd_pkt_err++;
-			if (cq_type == I40IW_CQ_TYPE_ILQ) {
-				i40iw_ilq_putback_rcvbuf(&rsrc->qp,
-							 info.wqe_idx);
-			} else {
-				i40iw_puda_ret_bufpool(rsrc, buf);
-				i40iw_puda_replenish_rq(rsrc, false);
-			}
-			goto done;
-		}
-
-		rsrc->stats_pkt_rcvd++;
-		rsrc->compl_rxwqe_idx = info.wqe_idx;
-		i40iw_debug(dev, I40IW_DEBUG_PUDA, "%s RQ completion\n", __func__);
-		rsrc->receive(rsrc->vsi, buf);
-		if (cq_type == I40IW_CQ_TYPE_ILQ)
-			i40iw_ilq_putback_rcvbuf(&rsrc->qp, info.wqe_idx);
-		else
-			i40iw_puda_replenish_rq(rsrc, false);
-
-	} else {
-		i40iw_debug(dev, I40IW_DEBUG_PUDA, "%s SQ completion\n", __func__);
-		sqwrid = (void *)(uintptr_t)qp->sq_wrtrk_array[info.wqe_idx].wrid;
-		I40IW_RING_SET_TAIL(qp->sq_ring, info.wqe_idx);
-		rsrc->xmit_complete(rsrc->vsi, sqwrid);
-		spin_lock_irqsave(&rsrc->bufpool_lock, flags);
-		rsrc->tx_wqe_avail_cnt++;
-		spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
-		if (!list_empty(&rsrc->txpend))
-			i40iw_puda_send_buf(rsrc, NULL);
-	}
-
-done:
-	I40IW_RING_MOVE_HEAD(cq_uk->cq_ring, ret);
-	if (I40IW_RING_GETCURRENT_HEAD(cq_uk->cq_ring) == 0)
-		cq_uk->polarity = !cq_uk->polarity;
-	/* update cq tail in cq shadow memory also */
-	I40IW_RING_MOVE_TAIL(cq_uk->cq_ring);
-	set_64bit_val(cq_uk->shadow_area, 0,
-		      I40IW_RING_GETCURRENT_HEAD(cq_uk->cq_ring));
-	return 0;
-}
-
-/**
- * i40iw_puda_send - complete send wqe for transmit
- * @qp: puda qp for send
- * @info: buffer information for transmit
- */
-enum i40iw_status_code i40iw_puda_send(struct i40iw_sc_qp *qp,
-				       struct i40iw_puda_send_info *info)
-{
-	u64 *wqe;
-	u32 iplen, l4len;
-	u64 header[2];
-	u32 wqe_idx;
-	u8 iipt;
-
-	/* number of 32 bits DWORDS in header */
-	l4len = info->tcplen >> 2;
-	if (info->ipv4) {
-		iipt = 3;
-		iplen = 5;
-	} else {
-		iipt = 1;
-		iplen = 10;
-	}
-
-	wqe = i40iw_puda_get_next_send_wqe(&qp->qp_uk, &wqe_idx);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-	qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid = (uintptr_t)info->scratch;
-	/* Third line of WQE descriptor */
-	/* maclen is in words */
-	header[0] = LS_64((info->maclen >> 1), I40IW_UDA_QPSQ_MACLEN) |
-		    LS_64(iplen, I40IW_UDA_QPSQ_IPLEN) | LS_64(1, I40IW_UDA_QPSQ_L4T) |
-		    LS_64(iipt, I40IW_UDA_QPSQ_IIPT) |
-		    LS_64(l4len, I40IW_UDA_QPSQ_L4LEN);
-	/* Forth line of WQE descriptor */
-	header[1] = LS_64(I40IW_OP_TYPE_SEND, I40IW_UDA_QPSQ_OPCODE) |
-		    LS_64(1, I40IW_UDA_QPSQ_SIGCOMPL) |
-		    LS_64(info->doloopback, I40IW_UDA_QPSQ_DOLOOPBACK) |
-		    LS_64(qp->qp_uk.swqe_polarity, I40IW_UDA_QPSQ_VALID);
-
-	set_64bit_val(wqe, 0, info->paddr);
-	set_64bit_val(wqe, 8, LS_64(info->len, I40IWQPSQ_FRAG_LEN));
-	set_64bit_val(wqe, 16, header[0]);
-
-	i40iw_insert_wqe_hdr(wqe, header[1]);
-
-	i40iw_debug_buf(qp->dev, I40IW_DEBUG_PUDA, "PUDA SEND WQE", wqe, 32);
-	i40iw_qp_post_wr(&qp->qp_uk);
-	return 0;
-}
-
-/**
- * i40iw_puda_send_buf - transmit puda buffer
- * @rsrc: resource to use for buffer
- * @buf: puda buffer to transmit
- */
-void i40iw_puda_send_buf(struct i40iw_puda_rsrc *rsrc, struct i40iw_puda_buf *buf)
-{
-	struct i40iw_puda_send_info info;
-	enum i40iw_status_code ret = 0;
-	unsigned long	flags;
-
-	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
-	/* if no wqe available or not from a completion and we have
-	 * pending buffers, we must queue new buffer
-	 */
-	if (!rsrc->tx_wqe_avail_cnt || (buf && !list_empty(&rsrc->txpend))) {
-		list_add_tail(&buf->list, &rsrc->txpend);
-		spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
-		rsrc->stats_sent_pkt_q++;
-		if (rsrc->type == I40IW_PUDA_RSRC_TYPE_ILQ)
-			i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA,
-				    "%s: adding to txpend\n", __func__);
-		return;
-	}
-	rsrc->tx_wqe_avail_cnt--;
-	/* if we are coming from a completion and have pending buffers
-	 * then Get one from pending list
-	 */
-	if (!buf) {
-		buf = i40iw_puda_get_listbuf(&rsrc->txpend);
-		if (!buf)
-			goto done;
-	}
-
-	info.scratch = (void *)buf;
-	info.paddr = buf->mem.pa;
-	info.len = buf->totallen;
-	info.tcplen = buf->tcphlen;
-	info.maclen = buf->maclen;
-	info.ipv4 = buf->ipv4;
-	info.doloopback = (rsrc->type == I40IW_PUDA_RSRC_TYPE_IEQ);
-
-	ret = i40iw_puda_send(&rsrc->qp, &info);
-	if (ret) {
-		rsrc->tx_wqe_avail_cnt++;
-		rsrc->stats_sent_pkt_q++;
-		list_add(&buf->list, &rsrc->txpend);
-		if (rsrc->type == I40IW_PUDA_RSRC_TYPE_ILQ)
-			i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA,
-				    "%s: adding to puda_send\n", __func__);
-	} else {
-		rsrc->stats_pkt_sent++;
-	}
-done:
-	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
-}
-
-/**
- * i40iw_puda_qp_setctx - during init, set qp's context
- * @rsrc: qp's resource
- */
-static void i40iw_puda_qp_setctx(struct i40iw_puda_rsrc *rsrc)
-{
-	struct i40iw_sc_qp *qp = &rsrc->qp;
-	u64 *qp_ctx = qp->hw_host_ctx;
-
-	set_64bit_val(qp_ctx, 8, qp->sq_pa);
-	set_64bit_val(qp_ctx, 16, qp->rq_pa);
-
-	set_64bit_val(qp_ctx, 24,
-		      LS_64(qp->hw_rq_size, I40IWQPC_RQSIZE) |
-		      LS_64(qp->hw_sq_size, I40IWQPC_SQSIZE));
-
-	set_64bit_val(qp_ctx, 48, LS_64(rsrc->buf_size, I40IW_UDA_QPC_MAXFRAMESIZE));
-	set_64bit_val(qp_ctx, 56, 0);
-	set_64bit_val(qp_ctx, 64, 1);
-
-	set_64bit_val(qp_ctx, 136,
-		      LS_64(rsrc->cq_id, I40IWQPC_TXCQNUM) |
-		      LS_64(rsrc->cq_id, I40IWQPC_RXCQNUM));
-
-	set_64bit_val(qp_ctx, 160, LS_64(1, I40IWQPC_PRIVEN));
-
-	set_64bit_val(qp_ctx, 168,
-		      LS_64((uintptr_t)qp, I40IWQPC_QPCOMPCTX));
-
-	set_64bit_val(qp_ctx, 176,
-		      LS_64(qp->sq_tph_val, I40IWQPC_SQTPHVAL) |
-		      LS_64(qp->rq_tph_val, I40IWQPC_RQTPHVAL) |
-		      LS_64(qp->qs_handle, I40IWQPC_QSHANDLE));
-
-	i40iw_debug_buf(rsrc->dev, I40IW_DEBUG_PUDA, "PUDA QP CONTEXT",
-			qp_ctx, I40IW_QP_CTX_SIZE);
-}
-
-/**
- * i40iw_puda_qp_wqe - setup wqe for qp create
- * @dev: iwarp device
- * @qp: resource for qp
- */
-static enum i40iw_status_code i40iw_puda_qp_wqe(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp)
-{
-	struct i40iw_sc_cqp *cqp;
-	u64 *wqe;
-	u64 header;
-	struct i40iw_ccq_cqe_info compl_info;
-	enum i40iw_status_code status = 0;
-
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, 0);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 16, qp->hw_host_ctx_pa);
-	set_64bit_val(wqe, 40, qp->shadow_area_pa);
-	header = qp->qp_uk.qp_id |
-		 LS_64(I40IW_CQP_OP_CREATE_QP, I40IW_CQPSQ_OPCODE) |
-		 LS_64(I40IW_QP_TYPE_UDA, I40IW_CQPSQ_QP_QPTYPE) |
-		 LS_64(1, I40IW_CQPSQ_QP_CQNUMVALID) |
-		 LS_64(2, I40IW_CQPSQ_QP_NEXTIWSTATE) |
-		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_PUDA, "PUDA CQE", wqe, 32);
-	i40iw_sc_cqp_post_sq(cqp);
-	status = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp,
-						    I40IW_CQP_OP_CREATE_QP,
-						    &compl_info);
-	return status;
-}
-
-/**
- * i40iw_puda_qp_create - create qp for resource
- * @rsrc: resource to use for buffer
- */
-static enum i40iw_status_code i40iw_puda_qp_create(struct i40iw_puda_rsrc *rsrc)
-{
-	struct i40iw_sc_qp *qp = &rsrc->qp;
-	struct i40iw_qp_uk *ukqp = &qp->qp_uk;
-	enum i40iw_status_code ret = 0;
-	u32 sq_size, rq_size, t_size;
-	struct i40iw_dma_mem *mem;
-
-	sq_size = rsrc->sq_size * I40IW_QP_WQE_MIN_SIZE;
-	rq_size = rsrc->rq_size * I40IW_QP_WQE_MIN_SIZE;
-	t_size = (sq_size + rq_size + (I40IW_SHADOW_AREA_SIZE << 3) +
-		  I40IW_QP_CTX_SIZE);
-	/* Get page aligned memory */
-	ret =
-	    i40iw_allocate_dma_mem(rsrc->dev->hw, &rsrc->qpmem, t_size,
-				   I40IW_HW_PAGE_SIZE);
-	if (ret) {
-		i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA, "%s: error dma mem\n", __func__);
-		return ret;
-	}
-
-	mem = &rsrc->qpmem;
-	memset(mem->va, 0, t_size);
-	qp->hw_sq_size = i40iw_get_encoded_wqe_size(rsrc->sq_size, false);
-	qp->hw_rq_size = i40iw_get_encoded_wqe_size(rsrc->rq_size, false);
-	qp->pd = &rsrc->sc_pd;
-	qp->qp_type = I40IW_QP_TYPE_UDA;
-	qp->dev = rsrc->dev;
-	qp->back_qp = (void *)rsrc;
-	qp->sq_pa = mem->pa;
-	qp->rq_pa = qp->sq_pa + sq_size;
-	qp->vsi = rsrc->vsi;
-	ukqp->sq_base = mem->va;
-	ukqp->rq_base = &ukqp->sq_base[rsrc->sq_size];
-	ukqp->shadow_area = ukqp->rq_base[rsrc->rq_size].elem;
-	qp->shadow_area_pa = qp->rq_pa + rq_size;
-	qp->hw_host_ctx = ukqp->shadow_area + I40IW_SHADOW_AREA_SIZE;
-	qp->hw_host_ctx_pa =
-		qp->shadow_area_pa + (I40IW_SHADOW_AREA_SIZE << 3);
-	ukqp->qp_id = rsrc->qp_id;
-	ukqp->sq_wrtrk_array = rsrc->sq_wrtrk_array;
-	ukqp->rq_wrid_array = rsrc->rq_wrid_array;
-
-	ukqp->qp_id = rsrc->qp_id;
-	ukqp->sq_size = rsrc->sq_size;
-	ukqp->rq_size = rsrc->rq_size;
-
-	I40IW_RING_INIT(ukqp->sq_ring, ukqp->sq_size);
-	I40IW_RING_INIT(ukqp->initial_ring, ukqp->sq_size);
-	I40IW_RING_INIT(ukqp->rq_ring, ukqp->rq_size);
-
-	if (qp->pd->dev->is_pf)
-		ukqp->wqe_alloc_reg = (u32 __iomem *)(i40iw_get_hw_addr(qp->pd->dev) +
-						    I40E_PFPE_WQEALLOC);
-	else
-		ukqp->wqe_alloc_reg = (u32 __iomem *)(i40iw_get_hw_addr(qp->pd->dev) +
-						    I40E_VFPE_WQEALLOC1);
-
-	qp->user_pri = 0;
-	i40iw_qp_add_qos(qp);
-	i40iw_puda_qp_setctx(rsrc);
-	if (rsrc->dev->ceq_valid)
-		ret = i40iw_cqp_qp_create_cmd(rsrc->dev, qp);
-	else
-		ret = i40iw_puda_qp_wqe(rsrc->dev, qp);
-	if (ret) {
-		i40iw_qp_rem_qos(qp);
-		i40iw_free_dma_mem(rsrc->dev->hw, &rsrc->qpmem);
-	}
-	return ret;
-}
-
-/**
- * i40iw_puda_cq_wqe - setup wqe for cq create
- * @dev: iwarp device
- * @cq: cq to setup
- */
-static enum i40iw_status_code i40iw_puda_cq_wqe(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq)
-{
-	u64 *wqe;
-	struct i40iw_sc_cqp *cqp;
-	u64 header;
-	struct i40iw_ccq_cqe_info compl_info;
-	enum i40iw_status_code status = 0;
-
-	cqp = dev->cqp;
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, 0);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	set_64bit_val(wqe, 0, cq->cq_uk.cq_size);
-	set_64bit_val(wqe, 8, RS_64_1(cq, 1));
-	set_64bit_val(wqe, 16,
-		      LS_64(cq->shadow_read_threshold,
-			    I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD));
-	set_64bit_val(wqe, 32, cq->cq_pa);
-
-	set_64bit_val(wqe, 40, cq->shadow_area_pa);
-
-	header = cq->cq_uk.cq_id |
-	    LS_64(I40IW_CQP_OP_CREATE_CQ, I40IW_CQPSQ_OPCODE) |
-	    LS_64(1, I40IW_CQPSQ_CQ_CHKOVERFLOW) |
-	    LS_64(1, I40IW_CQPSQ_CQ_ENCEQEMASK) |
-	    LS_64(1, I40IW_CQPSQ_CQ_CEQIDVALID) |
-	    LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-	i40iw_insert_wqe_hdr(wqe, header);
-
-	i40iw_debug_buf(dev, I40IW_DEBUG_PUDA, "PUDA CQE",
-			wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	i40iw_sc_cqp_post_sq(dev->cqp);
-	status = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp,
-						 I40IW_CQP_OP_CREATE_CQ,
-						 &compl_info);
-	return status;
-}
-
-/**
- * i40iw_puda_cq_create - create cq for resource
- * @rsrc: resource for which cq to create
- */
-static enum i40iw_status_code i40iw_puda_cq_create(struct i40iw_puda_rsrc *rsrc)
-{
-	struct i40iw_sc_dev *dev = rsrc->dev;
-	struct i40iw_sc_cq *cq = &rsrc->cq;
-	enum i40iw_status_code ret = 0;
-	u32 tsize, cqsize;
-	struct i40iw_dma_mem *mem;
-	struct i40iw_cq_init_info info;
-	struct i40iw_cq_uk_init_info *init_info = &info.cq_uk_init_info;
-
-	cq->vsi = rsrc->vsi;
-	cqsize = rsrc->cq_size * (sizeof(struct i40iw_cqe));
-	tsize = cqsize + sizeof(struct i40iw_cq_shadow_area);
-	ret = i40iw_allocate_dma_mem(dev->hw, &rsrc->cqmem, tsize,
-				     I40IW_CQ0_ALIGNMENT);
-	if (ret)
-		return ret;
-
-	mem = &rsrc->cqmem;
-	memset(&info, 0, sizeof(info));
-	info.dev = dev;
-	info.type = (rsrc->type == I40IW_PUDA_RSRC_TYPE_ILQ) ?
-			 I40IW_CQ_TYPE_ILQ : I40IW_CQ_TYPE_IEQ;
-	info.shadow_read_threshold = rsrc->cq_size >> 2;
-	info.ceq_id_valid = true;
-	info.cq_base_pa = mem->pa;
-	info.shadow_area_pa = mem->pa + cqsize;
-	init_info->cq_base = mem->va;
-	init_info->shadow_area = (u64 *)((u8 *)mem->va + cqsize);
-	init_info->cq_size = rsrc->cq_size;
-	init_info->cq_id = rsrc->cq_id;
-	info.ceqe_mask = true;
-	info.ceq_id_valid = true;
-	ret = dev->iw_priv_cq_ops->cq_init(cq, &info);
-	if (ret)
-		goto error;
-	if (rsrc->dev->ceq_valid)
-		ret = i40iw_cqp_cq_create_cmd(dev, cq);
-	else
-		ret = i40iw_puda_cq_wqe(dev, cq);
-error:
-	if (ret)
-		i40iw_free_dma_mem(dev->hw, &rsrc->cqmem);
-	return ret;
-}
-
-/**
- * i40iw_puda_free_qp - free qp for resource
- * @rsrc: resource for which qp to free
- */
-static void i40iw_puda_free_qp(struct i40iw_puda_rsrc *rsrc)
-{
-	enum i40iw_status_code ret;
-	struct i40iw_ccq_cqe_info compl_info;
-	struct i40iw_sc_dev *dev = rsrc->dev;
-
-	if (rsrc->dev->ceq_valid) {
-		i40iw_cqp_qp_destroy_cmd(dev, &rsrc->qp);
-		return;
-	}
-
-	ret = dev->iw_priv_qp_ops->qp_destroy(&rsrc->qp,
-			0, false, true, true);
-	if (ret)
-		i40iw_debug(dev, I40IW_DEBUG_PUDA,
-			    "%s error puda qp destroy wqe\n",
-			    __func__);
-
-	if (!ret) {
-		ret = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp,
-				I40IW_CQP_OP_DESTROY_QP,
-				&compl_info);
-		if (ret)
-			i40iw_debug(dev, I40IW_DEBUG_PUDA,
-				    "%s error puda qp destroy failed\n",
-				    __func__);
-	}
-}
-
-/**
- * i40iw_puda_free_cq - free cq for resource
- * @rsrc: resource for which cq to free
- */
-static void i40iw_puda_free_cq(struct i40iw_puda_rsrc *rsrc)
-{
-	enum i40iw_status_code ret;
-	struct i40iw_ccq_cqe_info compl_info;
-	struct i40iw_sc_dev *dev = rsrc->dev;
-
-	if (rsrc->dev->ceq_valid) {
-		i40iw_cqp_cq_destroy_cmd(dev, &rsrc->cq);
-		return;
-	}
-	ret = dev->iw_priv_cq_ops->cq_destroy(&rsrc->cq, 0, true);
-
-	if (ret)
-		i40iw_debug(dev, I40IW_DEBUG_PUDA,
-			    "%s error ieq cq destroy\n",
-			    __func__);
-
-	if (!ret) {
-		ret = dev->cqp_ops->poll_for_cqp_op_done(dev->cqp,
-				I40IW_CQP_OP_DESTROY_CQ,
-				&compl_info);
-		if (ret)
-			i40iw_debug(dev, I40IW_DEBUG_PUDA,
-				    "%s error ieq qp destroy done\n",
-				    __func__);
-	}
-}
-
-/**
- * i40iw_puda_dele_resources - delete all resources during close
- * @vsi: pointer to vsi structure
- * @type: type of resource to dele
- * @reset: true if reset chip
- */
-void i40iw_puda_dele_resources(struct i40iw_sc_vsi *vsi,
-			       enum puda_resource_type type,
-			       bool reset)
-{
-	struct i40iw_sc_dev *dev = vsi->dev;
-	struct i40iw_puda_rsrc *rsrc;
-	struct i40iw_puda_buf *buf = NULL;
-	struct i40iw_puda_buf *nextbuf = NULL;
-	struct i40iw_virt_mem *vmem;
-
-	switch (type) {
-	case I40IW_PUDA_RSRC_TYPE_ILQ:
-		rsrc = vsi->ilq;
-		vmem = &vsi->ilq_mem;
-		break;
-	case I40IW_PUDA_RSRC_TYPE_IEQ:
-		rsrc = vsi->ieq;
-		vmem = &vsi->ieq_mem;
-		break;
-	default:
-		i40iw_debug(dev, I40IW_DEBUG_PUDA, "%s: error resource type = 0x%x\n",
-			    __func__, type);
-		return;
-	}
-
-	switch (rsrc->completion) {
-	case PUDA_HASH_CRC_COMPLETE:
-		i40iw_free_hash_desc(rsrc->hash_desc);
-		fallthrough;
-	case PUDA_QP_CREATED:
-		if (!reset)
-			i40iw_puda_free_qp(rsrc);
-
-		i40iw_free_dma_mem(dev->hw, &rsrc->qpmem);
-		fallthrough;
-	case PUDA_CQ_CREATED:
-		if (!reset)
-			i40iw_puda_free_cq(rsrc);
-
-		i40iw_free_dma_mem(dev->hw, &rsrc->cqmem);
-		break;
-	default:
-		i40iw_debug(rsrc->dev, I40IW_DEBUG_PUDA, "%s error no resources\n", __func__);
-		break;
-	}
-	/* Free all allocated puda buffers for both tx and rx */
-	buf = rsrc->alloclist;
-	while (buf) {
-		nextbuf = buf->next;
-		i40iw_puda_dele_buf(dev, buf);
-		buf = nextbuf;
-		rsrc->alloc_buf_count--;
-	}
-	i40iw_free_virt_mem(dev->hw, vmem);
-}
-
-/**
- * i40iw_puda_allocbufs - allocate buffers for resource
- * @rsrc: resource for buffer allocation
- * @count: number of buffers to create
- */
-static enum i40iw_status_code i40iw_puda_allocbufs(struct i40iw_puda_rsrc *rsrc,
-						   u32 count)
-{
-	u32 i;
-	struct i40iw_puda_buf *buf;
-	struct i40iw_puda_buf *nextbuf;
-
-	for (i = 0; i < count; i++) {
-		buf = i40iw_puda_alloc_buf(rsrc->dev, rsrc->buf_size);
-		if (!buf) {
-			rsrc->stats_buf_alloc_fail++;
-			return I40IW_ERR_NO_MEMORY;
-		}
-		i40iw_puda_ret_bufpool(rsrc, buf);
-		rsrc->alloc_buf_count++;
-		if (!rsrc->alloclist) {
-			rsrc->alloclist = buf;
-		} else {
-			nextbuf = rsrc->alloclist;
-			rsrc->alloclist = buf;
-			buf->next = nextbuf;
-		}
-	}
-	rsrc->avail_buf_count = rsrc->alloc_buf_count;
-	return 0;
-}
-
-/**
- * i40iw_puda_create_rsrc - create resouce (ilq or ieq)
- * @vsi: pointer to vsi structure
- * @info: resource information
- */
-enum i40iw_status_code i40iw_puda_create_rsrc(struct i40iw_sc_vsi *vsi,
-					      struct i40iw_puda_rsrc_info *info)
-{
-	struct i40iw_sc_dev *dev = vsi->dev;
-	enum i40iw_status_code ret = 0;
-	struct i40iw_puda_rsrc *rsrc;
-	u32 pudasize;
-	u32 sqwridsize, rqwridsize;
-	struct i40iw_virt_mem *vmem;
-
-	info->count = 1;
-	pudasize = sizeof(struct i40iw_puda_rsrc);
-	sqwridsize = info->sq_size * sizeof(struct i40iw_sq_uk_wr_trk_info);
-	rqwridsize = info->rq_size * 8;
-	switch (info->type) {
-	case I40IW_PUDA_RSRC_TYPE_ILQ:
-		vmem = &vsi->ilq_mem;
-		break;
-	case I40IW_PUDA_RSRC_TYPE_IEQ:
-		vmem = &vsi->ieq_mem;
-		break;
-	default:
-		return I40IW_NOT_SUPPORTED;
-	}
-	ret =
-	    i40iw_allocate_virt_mem(dev->hw, vmem,
-				    pudasize + sqwridsize + rqwridsize);
-	if (ret)
-		return ret;
-	rsrc = (struct i40iw_puda_rsrc *)vmem->va;
-	spin_lock_init(&rsrc->bufpool_lock);
-	if (info->type == I40IW_PUDA_RSRC_TYPE_ILQ) {
-		vsi->ilq = (struct i40iw_puda_rsrc *)vmem->va;
-		vsi->ilq_count = info->count;
-		rsrc->receive = info->receive;
-		rsrc->xmit_complete = info->xmit_complete;
-	} else {
-		vmem = &vsi->ieq_mem;
-		vsi->ieq_count = info->count;
-		vsi->ieq = (struct i40iw_puda_rsrc *)vmem->va;
-		rsrc->receive = i40iw_ieq_receive;
-		rsrc->xmit_complete = i40iw_ieq_tx_compl;
-	}
-
-	rsrc->type = info->type;
-	rsrc->sq_wrtrk_array = (struct i40iw_sq_uk_wr_trk_info *)((u8 *)vmem->va + pudasize);
-	rsrc->rq_wrid_array = (u64 *)((u8 *)vmem->va + pudasize + sqwridsize);
-	/* Initialize all ieq lists */
-	INIT_LIST_HEAD(&rsrc->bufpool);
-	INIT_LIST_HEAD(&rsrc->txpend);
-
-	rsrc->tx_wqe_avail_cnt = info->sq_size - 1;
-	dev->iw_pd_ops->pd_init(dev, &rsrc->sc_pd, info->pd_id, -1);
-	rsrc->qp_id = info->qp_id;
-	rsrc->cq_id = info->cq_id;
-	rsrc->sq_size = info->sq_size;
-	rsrc->rq_size = info->rq_size;
-	rsrc->cq_size = info->rq_size + info->sq_size;
-	rsrc->buf_size = info->buf_size;
-	rsrc->dev = dev;
-	rsrc->vsi = vsi;
-
-	ret = i40iw_puda_cq_create(rsrc);
-	if (!ret) {
-		rsrc->completion = PUDA_CQ_CREATED;
-		ret = i40iw_puda_qp_create(rsrc);
-	}
-	if (ret) {
-		i40iw_debug(dev, I40IW_DEBUG_PUDA, "[%s] error qp_create\n",
-			    __func__);
-		goto error;
-	}
-	rsrc->completion = PUDA_QP_CREATED;
-
-	ret = i40iw_puda_allocbufs(rsrc, info->tx_buf_cnt + info->rq_size);
-	if (ret) {
-		i40iw_debug(dev, I40IW_DEBUG_PUDA, "[%s] error alloc_buf\n",
-			    __func__);
-		goto error;
-	}
-
-	rsrc->rxq_invalid_cnt = info->rq_size;
-	ret = i40iw_puda_replenish_rq(rsrc, true);
-	if (ret)
-		goto error;
-
-	if (info->type == I40IW_PUDA_RSRC_TYPE_IEQ) {
-		if (!i40iw_init_hash_desc(&rsrc->hash_desc)) {
-			rsrc->check_crc = true;
-			rsrc->completion = PUDA_HASH_CRC_COMPLETE;
-			ret = 0;
-		}
-	}
-
-	dev->ccq_ops->ccq_arm(&rsrc->cq);
-	return ret;
- error:
-	i40iw_puda_dele_resources(vsi, info->type, false);
-
-	return ret;
-}
-
-/**
- * i40iw_ilq_putback_rcvbuf - ilq buffer to put back on rq
- * @qp: ilq's qp resource
- * @wqe_idx:  wqe index of completed rcvbuf
- */
-static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx)
-{
-	u64 *wqe;
-	u64 offset24;
-
-	wqe = qp->qp_uk.rq_base[wqe_idx].elem;
-	get_64bit_val(wqe, 24, &offset24);
-	offset24 = (offset24) ? 0 : LS_64(1, I40IWQPSQ_VALID);
-	set_64bit_val(wqe, 24, offset24);
-}
-
-/**
- * i40iw_ieq_get_fpdu_length - given length return fpdu length
- * @length: length if fpdu
- */
-static u16 i40iw_ieq_get_fpdu_length(u16 length)
-{
-	u16 fpdu_len;
-
-	fpdu_len = length + I40IW_IEQ_MPA_FRAMING;
-	fpdu_len = (fpdu_len + 3) & 0xfffffffc;
-	return fpdu_len;
-}
-
-/**
- * i40iw_ieq_copy_to_txbuf - copydata from rcv buf to tx buf
- * @buf: rcv buffer with partial
- * @txbuf: tx buffer for sendign back
- * @buf_offset: rcv buffer offset to copy from
- * @txbuf_offset: at offset in tx buf to copy
- * @length: length of data to copy
- */
-static void i40iw_ieq_copy_to_txbuf(struct i40iw_puda_buf *buf,
-				    struct i40iw_puda_buf *txbuf,
-				    u16 buf_offset, u32 txbuf_offset,
-				    u32 length)
-{
-	void *mem1 = (u8 *)buf->mem.va + buf_offset;
-	void *mem2 = (u8 *)txbuf->mem.va + txbuf_offset;
-
-	memcpy(mem2, mem1, length);
-}
-
-/**
- * i40iw_ieq_setup_tx_buf - setup tx buffer for partial handling
- * @buf: reeive buffer with partial
- * @txbuf: buffer to prepare
- */
-static void i40iw_ieq_setup_tx_buf(struct i40iw_puda_buf *buf,
-				   struct i40iw_puda_buf *txbuf)
-{
-	txbuf->maclen = buf->maclen;
-	txbuf->tcphlen = buf->tcphlen;
-	txbuf->ipv4 = buf->ipv4;
-	txbuf->hdrlen = buf->hdrlen;
-	i40iw_ieq_copy_to_txbuf(buf, txbuf, 0, 0, buf->hdrlen);
-}
-
-/**
- * i40iw_ieq_check_first_buf - check if rcv buffer's seq is in range
- * @buf: receive exception buffer
- * @fps: first partial sequence number
- */
-static void i40iw_ieq_check_first_buf(struct i40iw_puda_buf *buf, u32 fps)
-{
-	u32 offset;
-
-	if (buf->seqnum < fps) {
-		offset = fps - buf->seqnum;
-		if (offset > buf->datalen)
-			return;
-		buf->data += offset;
-		buf->datalen -= (u16)offset;
-		buf->seqnum = fps;
-	}
-}
-
-/**
- * i40iw_ieq_compl_pfpdu - write txbuf with full fpdu
- * @ieq: ieq resource
- * @rxlist: ieq's received buffer list
- * @pbufl: temporary list for buffers for fpddu
- * @txbuf: tx buffer for fpdu
- * @fpdu_len: total length of fpdu
- */
-static void  i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq,
-				   struct list_head *rxlist,
-				   struct list_head *pbufl,
-				   struct i40iw_puda_buf *txbuf,
-				   u16 fpdu_len)
-{
-	struct i40iw_puda_buf *buf;
-	u32 nextseqnum;
-	u16 txoffset, bufoffset;
-
-	buf = i40iw_puda_get_listbuf(pbufl);
-	if (!buf)
-		return;
-	nextseqnum = buf->seqnum + fpdu_len;
-	txbuf->totallen = buf->hdrlen + fpdu_len;
-	txbuf->data = (u8 *)txbuf->mem.va + buf->hdrlen;
-	i40iw_ieq_setup_tx_buf(buf, txbuf);
-
-	txoffset = buf->hdrlen;
-	bufoffset = (u16)(buf->data - (u8 *)buf->mem.va);
-
-	do {
-		if (buf->datalen >= fpdu_len) {
-			/* copied full fpdu */
-			i40iw_ieq_copy_to_txbuf(buf, txbuf, bufoffset, txoffset, fpdu_len);
-			buf->datalen -= fpdu_len;
-			buf->data += fpdu_len;
-			buf->seqnum = nextseqnum;
-			break;
-		}
-		/* copy partial fpdu */
-		i40iw_ieq_copy_to_txbuf(buf, txbuf, bufoffset, txoffset, buf->datalen);
-		txoffset += buf->datalen;
-		fpdu_len -= buf->datalen;
-		i40iw_puda_ret_bufpool(ieq, buf);
-		buf = i40iw_puda_get_listbuf(pbufl);
-		if (!buf)
-			return;
-		bufoffset = (u16)(buf->data - (u8 *)buf->mem.va);
-	} while (1);
-
-	/* last buffer on the list*/
-	if (buf->datalen)
-		list_add(&buf->list, rxlist);
-	else
-		i40iw_puda_ret_bufpool(ieq, buf);
-}
-
-/**
- * i40iw_ieq_create_pbufl - create buffer list for single fpdu
- * @pfpdu: partial management per user qp
- * @rxlist: resource list for receive ieq buffes
- * @pbufl: temp. list for buffers for fpddu
- * @buf: first receive buffer
- * @fpdu_len: total length of fpdu
- */
-static enum i40iw_status_code i40iw_ieq_create_pbufl(
-						     struct i40iw_pfpdu *pfpdu,
-						     struct list_head *rxlist,
-						     struct list_head *pbufl,
-						     struct i40iw_puda_buf *buf,
-						     u16 fpdu_len)
-{
-	enum i40iw_status_code status = 0;
-	struct i40iw_puda_buf *nextbuf;
-	u32	nextseqnum;
-	u16 plen = fpdu_len - buf->datalen;
-	bool done = false;
-
-	nextseqnum = buf->seqnum + buf->datalen;
-	do {
-		nextbuf = i40iw_puda_get_listbuf(rxlist);
-		if (!nextbuf) {
-			status = I40IW_ERR_list_empty;
-			break;
-		}
-		list_add_tail(&nextbuf->list, pbufl);
-		if (nextbuf->seqnum != nextseqnum) {
-			pfpdu->bad_seq_num++;
-			status = I40IW_ERR_SEQ_NUM;
-			break;
-		}
-		if (nextbuf->datalen >= plen) {
-			done = true;
-		} else {
-			plen -= nextbuf->datalen;
-			nextseqnum = nextbuf->seqnum + nextbuf->datalen;
-		}
-
-	} while (!done);
-
-	return status;
-}
-
-/**
- * i40iw_ieq_handle_partial - process partial fpdu buffer
- * @ieq: ieq resource
- * @pfpdu: partial management per user qp
- * @buf: receive buffer
- * @fpdu_len: fpdu len in the buffer
- */
-static enum i40iw_status_code i40iw_ieq_handle_partial(struct i40iw_puda_rsrc *ieq,
-						       struct i40iw_pfpdu *pfpdu,
-						       struct i40iw_puda_buf *buf,
-						       u16 fpdu_len)
-{
-	enum i40iw_status_code status = 0;
-	u8 *crcptr;
-	u32 mpacrc;
-	u32 seqnum = buf->seqnum;
-	struct list_head pbufl;	/* partial buffer list */
-	struct i40iw_puda_buf *txbuf = NULL;
-	struct list_head *rxlist = &pfpdu->rxlist;
-
-	INIT_LIST_HEAD(&pbufl);
-	list_add(&buf->list, &pbufl);
-
-	status = i40iw_ieq_create_pbufl(pfpdu, rxlist, &pbufl, buf, fpdu_len);
-	if (status)
-		goto error;
-
-	txbuf = i40iw_puda_get_bufpool(ieq);
-	if (!txbuf) {
-		pfpdu->no_tx_bufs++;
-		status = I40IW_ERR_NO_TXBUFS;
-		goto error;
-	}
-
-	i40iw_ieq_compl_pfpdu(ieq, rxlist, &pbufl, txbuf, fpdu_len);
-	i40iw_ieq_update_tcpip_info(txbuf, fpdu_len, seqnum);
-	crcptr = txbuf->data + fpdu_len - 4;
-	mpacrc = *(u32 *)crcptr;
-	if (ieq->check_crc) {
-		status = i40iw_ieq_check_mpacrc(ieq->hash_desc, txbuf->data,
-						(fpdu_len - 4), mpacrc);
-		if (status) {
-			i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ,
-				    "%s: error bad crc\n", __func__);
-			goto error;
-		}
-	}
-
-	i40iw_debug_buf(ieq->dev, I40IW_DEBUG_IEQ, "IEQ TX BUFFER",
-			txbuf->mem.va, txbuf->totallen);
-	i40iw_puda_send_buf(ieq, txbuf);
-	pfpdu->rcv_nxt = seqnum + fpdu_len;
-	return status;
- error:
-	while (!list_empty(&pbufl)) {
-		buf = (struct i40iw_puda_buf *)(pbufl.prev);
-		list_del(&buf->list);
-		list_add(&buf->list, rxlist);
-	}
-	if (txbuf)
-		i40iw_puda_ret_bufpool(ieq, txbuf);
-	return status;
-}
-
-/**
- * i40iw_ieq_process_buf - process buffer rcvd for ieq
- * @ieq: ieq resource
- * @pfpdu: partial management per user qp
- * @buf: receive buffer
- */
-static enum i40iw_status_code i40iw_ieq_process_buf(struct i40iw_puda_rsrc *ieq,
-						    struct i40iw_pfpdu *pfpdu,
-						    struct i40iw_puda_buf *buf)
-{
-	u16 fpdu_len = 0;
-	u16 datalen = buf->datalen;
-	u8 *datap = buf->data;
-	u8 *crcptr;
-	u16 ioffset = 0;
-	u32 mpacrc;
-	u32 seqnum = buf->seqnum;
-	u16 length = 0;
-	u16 full = 0;
-	bool partial = false;
-	struct i40iw_puda_buf *txbuf;
-	struct list_head *rxlist = &pfpdu->rxlist;
-	enum i40iw_status_code ret = 0;
-	enum i40iw_status_code status = 0;
-
-	ioffset = (u16)(buf->data - (u8 *)buf->mem.va);
-	while (datalen) {
-		fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(__be16 *)datap));
-		if (fpdu_len > pfpdu->max_fpdu_data) {
-			i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ,
-				    "%s: error bad fpdu_len\n", __func__);
-			status = I40IW_ERR_MPA_CRC;
-			list_add(&buf->list, rxlist);
-			return status;
-		}
-
-		if (datalen < fpdu_len) {
-			partial = true;
-			break;
-		}
-		crcptr = datap + fpdu_len - 4;
-		mpacrc = *(u32 *)crcptr;
-		if (ieq->check_crc)
-			ret = i40iw_ieq_check_mpacrc(ieq->hash_desc,
-						     datap, fpdu_len - 4, mpacrc);
-		if (ret) {
-			status = I40IW_ERR_MPA_CRC;
-			list_add(&buf->list, rxlist);
-			return status;
-		}
-		full++;
-		pfpdu->fpdu_processed++;
-		datap += fpdu_len;
-		length += fpdu_len;
-		datalen -= fpdu_len;
-	}
-	if (full) {
-		/* copy full pdu's in the txbuf and send them out */
-		txbuf = i40iw_puda_get_bufpool(ieq);
-		if (!txbuf) {
-			pfpdu->no_tx_bufs++;
-			status = I40IW_ERR_NO_TXBUFS;
-			list_add(&buf->list, rxlist);
-			return status;
-		}
-		/* modify txbuf's buffer header */
-		i40iw_ieq_setup_tx_buf(buf, txbuf);
-		/* copy full fpdu's to new buffer */
-		i40iw_ieq_copy_to_txbuf(buf, txbuf, ioffset, buf->hdrlen,
-					length);
-		txbuf->totallen = buf->hdrlen + length;
-
-		i40iw_ieq_update_tcpip_info(txbuf, length, buf->seqnum);
-		i40iw_puda_send_buf(ieq, txbuf);
-
-		if (!datalen) {
-			pfpdu->rcv_nxt = buf->seqnum + length;
-			i40iw_puda_ret_bufpool(ieq, buf);
-			return status;
-		}
-		buf->data = datap;
-		buf->seqnum = seqnum + length;
-		buf->datalen = datalen;
-		pfpdu->rcv_nxt = buf->seqnum;
-	}
-	if (partial)
-		status = i40iw_ieq_handle_partial(ieq, pfpdu, buf, fpdu_len);
-
-	return status;
-}
-
-/**
- * i40iw_ieq_process_fpdus - process fpdu's buffers on its list
- * @qp: qp for which partial fpdus
- * @ieq: ieq resource
- */
-static void i40iw_ieq_process_fpdus(struct i40iw_sc_qp *qp,
-				    struct i40iw_puda_rsrc *ieq)
-{
-	struct i40iw_pfpdu *pfpdu = &qp->pfpdu;
-	struct list_head *rxlist = &pfpdu->rxlist;
-	struct i40iw_puda_buf *buf;
-	enum i40iw_status_code status;
-
-	do {
-		if (list_empty(rxlist))
-			break;
-		buf = i40iw_puda_get_listbuf(rxlist);
-		if (!buf) {
-			i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ,
-				    "%s: error no buf\n", __func__);
-			break;
-		}
-		if (buf->seqnum != pfpdu->rcv_nxt) {
-			/* This could be out of order or missing packet */
-			pfpdu->out_of_order++;
-			list_add(&buf->list, rxlist);
-			break;
-		}
-		/* keep processing buffers from the head of the list */
-		status = i40iw_ieq_process_buf(ieq, pfpdu, buf);
-		if (status == I40IW_ERR_MPA_CRC) {
-			pfpdu->mpa_crc_err = true;
-			while (!list_empty(rxlist)) {
-				buf = i40iw_puda_get_listbuf(rxlist);
-				i40iw_puda_ret_bufpool(ieq, buf);
-				pfpdu->crc_err++;
-			}
-			/* create CQP for AE */
-			i40iw_ieq_mpa_crc_ae(ieq->dev, qp);
-		}
-	} while (!status);
-}
-
-/**
- * i40iw_ieq_handle_exception - handle qp's exception
- * @ieq: ieq resource
- * @qp: qp receiving excpetion
- * @buf: receive buffer
- */
-static void i40iw_ieq_handle_exception(struct i40iw_puda_rsrc *ieq,
-				       struct i40iw_sc_qp *qp,
-				       struct i40iw_puda_buf *buf)
-{
-	struct i40iw_puda_buf *tmpbuf = NULL;
-	struct i40iw_pfpdu *pfpdu = &qp->pfpdu;
-	u32 *hw_host_ctx = (u32 *)qp->hw_host_ctx;
-	u32 rcv_wnd = hw_host_ctx[23];
-	/* first partial seq # in q2 */
-	u32 fps = *(u32 *)(qp->q2_buf + Q2_FPSN_OFFSET);
-	struct list_head *rxlist = &pfpdu->rxlist;
-	struct list_head *plist;
-
-	pfpdu->total_ieq_bufs++;
-
-	if (pfpdu->mpa_crc_err) {
-		pfpdu->crc_err++;
-		goto error;
-	}
-	if (pfpdu->mode && (fps != pfpdu->fps)) {
-		/* clean up qp as it is new partial sequence */
-		i40iw_ieq_cleanup_qp(ieq, qp);
-		i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ,
-			    "%s: restarting new partial\n", __func__);
-		pfpdu->mode = false;
-	}
-
-	if (!pfpdu->mode) {
-		i40iw_debug_buf(ieq->dev, I40IW_DEBUG_IEQ, "Q2 BUFFER", (u64 *)qp->q2_buf, 128);
-		/* First_Partial_Sequence_Number check */
-		pfpdu->rcv_nxt = fps;
-		pfpdu->fps = fps;
-		pfpdu->mode = true;
-		pfpdu->max_fpdu_data = (buf->ipv4) ? (ieq->vsi->mtu - I40IW_MTU_TO_MSS_IPV4) :
-				       (ieq->vsi->mtu - I40IW_MTU_TO_MSS_IPV6);
-		pfpdu->pmode_count++;
-		INIT_LIST_HEAD(rxlist);
-		i40iw_ieq_check_first_buf(buf, fps);
-	}
-
-	if (!(rcv_wnd >= (buf->seqnum - pfpdu->rcv_nxt))) {
-		pfpdu->bad_seq_num++;
-		goto error;
-	}
-
-	if (!list_empty(rxlist)) {
-		tmpbuf = (struct i40iw_puda_buf *)rxlist->next;
-		while ((struct list_head *)tmpbuf != rxlist) {
-			if ((int)(buf->seqnum - tmpbuf->seqnum) < 0)
-				break;
-			plist = &tmpbuf->list;
-			tmpbuf = (struct i40iw_puda_buf *)plist->next;
-		}
-		/* Insert buf before tmpbuf */
-		list_add_tail(&buf->list, &tmpbuf->list);
-	} else {
-		list_add_tail(&buf->list, rxlist);
-	}
-	i40iw_ieq_process_fpdus(qp, ieq);
-	return;
- error:
-	i40iw_puda_ret_bufpool(ieq, buf);
-}
-
-/**
- * i40iw_ieq_receive - received exception buffer
- * @vsi: pointer to vsi structure
- * @buf: exception buffer received
- */
-static void i40iw_ieq_receive(struct i40iw_sc_vsi *vsi,
-			      struct i40iw_puda_buf *buf)
-{
-	struct i40iw_puda_rsrc *ieq = vsi->ieq;
-	struct i40iw_sc_qp *qp = NULL;
-	u32 wqe_idx = ieq->compl_rxwqe_idx;
-
-	qp = i40iw_ieq_get_qp(vsi->dev, buf);
-	if (!qp) {
-		ieq->stats_bad_qp_id++;
-		i40iw_puda_ret_bufpool(ieq, buf);
-	} else {
-		i40iw_ieq_handle_exception(ieq, qp, buf);
-	}
-	/*
-	 * ieq->rx_wqe_idx is used by i40iw_puda_replenish_rq()
-	 * on which wqe_idx to start replenish rq
-	 */
-	if (!ieq->rxq_invalid_cnt)
-		ieq->rx_wqe_idx = wqe_idx;
-	ieq->rxq_invalid_cnt++;
-}
-
-/**
- * i40iw_ieq_tx_compl - put back after sending completed exception buffer
- * @vsi: pointer to the vsi structure
- * @sqwrid: pointer to puda buffer
- */
-static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid)
-{
-	struct i40iw_puda_rsrc *ieq = vsi->ieq;
-	struct i40iw_puda_buf *buf = (struct i40iw_puda_buf *)sqwrid;
-
-	i40iw_puda_ret_bufpool(ieq, buf);
-}
-
-/**
- * i40iw_ieq_cleanup_qp - qp is being destroyed
- * @ieq: ieq resource
- * @qp: all pending fpdu buffers
- */
-void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp)
-{
-	struct i40iw_puda_buf *buf;
-	struct i40iw_pfpdu *pfpdu = &qp->pfpdu;
-	struct list_head *rxlist = &pfpdu->rxlist;
-
-	if (!pfpdu->mode)
-		return;
-	while (!list_empty(rxlist)) {
-		buf = i40iw_puda_get_listbuf(rxlist);
-		i40iw_puda_ret_bufpool(ieq, buf);
-	}
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.h b/drivers/infiniband/hw/i40iw/i40iw_puda.h
deleted file mode 100644
index 53a7d58c84b5..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_puda.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_PUDA_H
-#define I40IW_PUDA_H
-
-#define I40IW_IEQ_MPA_FRAMING 6
-
-struct i40iw_sc_dev;
-struct i40iw_sc_qp;
-struct i40iw_sc_cq;
-
-enum puda_resource_type {
-	I40IW_PUDA_RSRC_TYPE_ILQ = 1,
-	I40IW_PUDA_RSRC_TYPE_IEQ
-};
-
-enum puda_rsrc_complete {
-	PUDA_CQ_CREATED = 1,
-	PUDA_QP_CREATED,
-	PUDA_TX_COMPLETE,
-	PUDA_RX_COMPLETE,
-	PUDA_HASH_CRC_COMPLETE
-};
-
-struct i40iw_puda_completion_info {
-	struct i40iw_qp_uk *qp;
-	u8 q_type;
-	u8 vlan_valid;
-	u8 l3proto;
-	u8 l4proto;
-	u16 payload_len;
-	u32 compl_error;	/* No_err=0, else major and minor err code */
-	u32 qp_id;
-	u32 wqe_idx;
-};
-
-struct i40iw_puda_send_info {
-	u64 paddr;		/* Physical address */
-	u32 len;
-	u8 tcplen;
-	u8 maclen;
-	bool ipv4;
-	bool doloopback;
-	void *scratch;
-};
-
-struct i40iw_puda_buf {
-	struct list_head list;	/* MUST be first entry */
-	struct i40iw_dma_mem mem;	/* DMA memory for the buffer */
-	struct i40iw_puda_buf *next;	/* for alloclist in rsrc struct */
-	struct i40iw_virt_mem buf_mem;	/* Buffer memory for this buffer */
-	void *scratch;
-	u8 *iph;
-	u8 *tcph;
-	u8 *data;
-	u16 datalen;
-	u16 vlan_id;
-	u8 tcphlen;		/* tcp length in bytes */
-	u8 maclen;		/* mac length in bytes */
-	u32 totallen;		/* machlen+iphlen+tcphlen+datalen */
-	atomic_t refcount;
-	u8 hdrlen;
-	bool ipv4;
-	u32 seqnum;
-};
-
-struct i40iw_puda_rsrc_info {
-	enum puda_resource_type type;	/* ILQ or IEQ */
-	u32 count;
-	u16 pd_id;
-	u32 cq_id;
-	u32 qp_id;
-	u32 sq_size;
-	u32 rq_size;
-	u16 buf_size;
-	u16 mss;
-	u32 tx_buf_cnt;		/* total bufs allocated will be rq_size + tx_buf_cnt */
-	void (*receive)(struct i40iw_sc_vsi *, struct i40iw_puda_buf *);
-	void (*xmit_complete)(struct i40iw_sc_vsi *, void *);
-};
-
-struct i40iw_puda_rsrc {
-	struct i40iw_sc_cq cq;
-	struct i40iw_sc_qp qp;
-	struct i40iw_sc_pd sc_pd;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_sc_vsi *vsi;
-	struct i40iw_dma_mem cqmem;
-	struct i40iw_dma_mem qpmem;
-	struct i40iw_virt_mem ilq_mem;
-	enum puda_rsrc_complete completion;
-	enum puda_resource_type type;
-	u16 buf_size;		/*buffer must be max datalen + tcpip hdr + mac */
-	u16 mss;
-	u32 cq_id;
-	u32 qp_id;
-	u32 sq_size;
-	u32 rq_size;
-	u32 cq_size;
-	struct i40iw_sq_uk_wr_trk_info *sq_wrtrk_array;
-	u64 *rq_wrid_array;
-	u32 compl_rxwqe_idx;
-	u32 rx_wqe_idx;
-	u32 rxq_invalid_cnt;
-	u32 tx_wqe_avail_cnt;
-	bool check_crc;
-	struct shash_desc *hash_desc;
-	struct list_head txpend;
-	struct list_head bufpool;	/* free buffers pool list for recv and xmit */
-	u32 alloc_buf_count;
-	u32 avail_buf_count;		/* snapshot of currently available buffers */
-	spinlock_t bufpool_lock;
-	struct i40iw_puda_buf *alloclist;
-	void (*receive)(struct i40iw_sc_vsi *, struct i40iw_puda_buf *);
-	void (*xmit_complete)(struct i40iw_sc_vsi *, void *);
-	/* puda stats */
-	u64 stats_buf_alloc_fail;
-	u64 stats_pkt_rcvd;
-	u64 stats_pkt_sent;
-	u64 stats_rcvd_pkt_err;
-	u64 stats_sent_pkt_q;
-	u64 stats_bad_qp_id;
-};
-
-struct i40iw_puda_buf *i40iw_puda_get_bufpool(struct i40iw_puda_rsrc *rsrc);
-void i40iw_puda_ret_bufpool(struct i40iw_puda_rsrc *rsrc,
-			    struct i40iw_puda_buf *buf);
-void i40iw_puda_send_buf(struct i40iw_puda_rsrc *rsrc,
-			 struct i40iw_puda_buf *buf);
-enum i40iw_status_code i40iw_puda_send(struct i40iw_sc_qp *qp,
-				       struct i40iw_puda_send_info *info);
-enum i40iw_status_code i40iw_puda_create_rsrc(struct i40iw_sc_vsi *vsi,
-					      struct i40iw_puda_rsrc_info *info);
-void i40iw_puda_dele_resources(struct i40iw_sc_vsi *vsi,
-			       enum puda_resource_type type,
-			       bool reset);
-enum i40iw_status_code i40iw_puda_poll_completion(struct i40iw_sc_dev *dev,
-						  struct i40iw_sc_cq *cq, u32 *compl_err);
-
-struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev,
-				     struct i40iw_puda_buf *buf);
-enum i40iw_status_code i40iw_puda_get_tcpip_info(struct i40iw_puda_completion_info *info,
-						 struct i40iw_puda_buf *buf);
-enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc,
-					      void *addr, u32 length, u32 value);
-enum i40iw_status_code i40iw_init_hash_desc(struct shash_desc **desc);
-void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp);
-void i40iw_free_hash_desc(struct shash_desc *desc);
-void i40iw_ieq_update_tcpip_info(struct i40iw_puda_buf *buf, u16 length,
-				 u32 seqnum);
-enum i40iw_status_code i40iw_cqp_qp_create_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp);
-enum i40iw_status_code i40iw_cqp_cq_create_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq);
-void i40iw_cqp_qp_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp);
-void i40iw_cqp_cq_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq);
-void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp);
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_register.h b/drivers/infiniband/hw/i40iw/i40iw_register.h
deleted file mode 100644
index 57768184e251..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_register.h
+++ /dev/null
@@ -1,1030 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_REGISTER_H
-#define I40IW_REGISTER_H
-
-#define I40E_GLGEN_STAT               0x000B612C /* Reset: POR */
-
-#define I40E_PFHMC_PDINV               0x000C0300 /* Reset: PFR */
-#define I40E_PFHMC_PDINV_PMSDIDX_SHIFT 0
-#define I40E_PFHMC_PDINV_PMSDIDX_MASK  (0xFFF <<  I40E_PFHMC_PDINV_PMSDIDX_SHIFT)
-#define I40E_PFHMC_PDINV_PMPDIDX_SHIFT 16
-#define I40E_PFHMC_PDINV_PMPDIDX_MASK  (0x1FF <<  I40E_PFHMC_PDINV_PMPDIDX_SHIFT)
-#define I40E_PFHMC_SDCMD_PMSDWR_SHIFT  31
-#define I40E_PFHMC_SDCMD_PMSDWR_MASK   (0x1 <<  I40E_PFHMC_SDCMD_PMSDWR_SHIFT)
-#define I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT   0
-#define I40E_PFHMC_SDDATALOW_PMSDVALID_MASK    (0x1 <<  I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT)
-#define I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT    1
-#define I40E_PFHMC_SDDATALOW_PMSDTYPE_MASK     (0x1 <<  I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT)
-#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT 2
-#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_MASK  (0x3FF <<  I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT)
-
-#define I40E_PFINT_DYN_CTLN(_INTPF) (0x00034800 + ((_INTPF) * 4)) /* _i=0...511 */	/* Reset: PFR */
-#define I40E_PFINT_DYN_CTLN_INTENA_SHIFT          0
-#define I40E_PFINT_DYN_CTLN_INTENA_MASK           (0x1 <<  I40E_PFINT_DYN_CTLN_INTENA_SHIFT)
-#define I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT        1
-#define I40E_PFINT_DYN_CTLN_CLEARPBA_MASK         (0x1 <<  I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT)
-#define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT        3
-#define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK         (0x3 <<  I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT)
-
-#define I40E_VFINT_DYN_CTLN1(_INTVF)               (0x00003800 + ((_INTVF) * 4)) /* _i=0...15 */ /* Reset: VFR */
-#define I40E_GLHMC_VFPDINV(_i)               (0x000C8300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
-
-#define I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT 15
-#define I40E_PFHMC_PDINV_PMSDPARTSEL_MASK  (0x1 <<  I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT)
-#define I40E_GLPCI_LBARCTRL                    0x000BE484 /* Reset: POR */
-#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT    4
-#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_MASK     (0x3 <<  I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT)
-#define I40E_GLPCI_DREVID			0x0009C480 /* Reset: PCIR */
-#define I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT 0
-#define I40E_GLPCI_DREVID_DEFAULT_REVID_MASK 0xFF
-
-#define I40E_PFPE_AEQALLOC               0x00131180 /* Reset: PFR */
-#define I40E_PFPE_AEQALLOC_AECOUNT_SHIFT 0
-#define I40E_PFPE_AEQALLOC_AECOUNT_MASK  (0xFFFFFFFF <<  I40E_PFPE_AEQALLOC_AECOUNT_SHIFT)
-#define I40E_PFPE_CCQPHIGH                  0x00008200 /* Reset: PFR */
-#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0
-#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_MASK  (0xFFFFFFFF <<  I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT)
-#define I40E_PFPE_CCQPLOW                 0x00008180 /* Reset: PFR */
-#define I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT 0
-#define I40E_PFPE_CCQPLOW_PECCQPLOW_MASK  (0xFFFFFFFF <<  I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT)
-#define I40E_PFPE_CCQPSTATUS                   0x00008100 /* Reset: PFR */
-#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT   0
-#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_MASK    (0x1 <<  I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT)
-#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4
-#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_MASK  (0x7 <<  I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT)
-#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16
-#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_MASK  (0x3F <<  I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT)
-#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT    31
-#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_MASK     (0x1 <<  I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT)
-#define I40E_PFPE_CQACK              0x00131100 /* Reset: PFR */
-#define I40E_PFPE_CQACK_PECQID_SHIFT 0
-#define I40E_PFPE_CQACK_PECQID_MASK  (0x1FFFF <<  I40E_PFPE_CQACK_PECQID_SHIFT)
-#define I40E_PFPE_CQARM              0x00131080 /* Reset: PFR */
-#define I40E_PFPE_CQARM_PECQID_SHIFT 0
-#define I40E_PFPE_CQARM_PECQID_MASK  (0x1FFFF <<  I40E_PFPE_CQARM_PECQID_SHIFT)
-#define I40E_PFPE_CQPDB              0x00008000 /* Reset: PFR */
-#define I40E_PFPE_CQPDB_WQHEAD_SHIFT 0
-#define I40E_PFPE_CQPDB_WQHEAD_MASK  (0x7FF <<  I40E_PFPE_CQPDB_WQHEAD_SHIFT)
-#define I40E_PFPE_CQPERRCODES                      0x00008880 /* Reset: PFR */
-#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0
-#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_MASK  (0xFFFF <<  I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT)
-#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16
-#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK  (0xFFFF <<  I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT)
-#define I40E_PFPE_CQPTAIL                  0x00008080 /* Reset: PFR */
-#define I40E_PFPE_CQPTAIL_WQTAIL_SHIFT     0
-#define I40E_PFPE_CQPTAIL_WQTAIL_MASK      (0x7FF <<  I40E_PFPE_CQPTAIL_WQTAIL_SHIFT)
-#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31
-#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_MASK  (0x1 <<  I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT)
-#define I40E_PFPE_FLMQ1ALLOCERR                   0x00008980 /* Reset: PFR */
-#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0
-#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_MASK  (0xFFFF <<  I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT)
-#define I40E_PFPE_FLMXMITALLOCERR                   0x00008900 /* Reset: PFR */
-#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT 0
-#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_MASK  (0xFFFF <<  I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT)
-#define I40E_PFPE_IPCONFIG0                        0x00008280 /* Reset: PFR */
-#define I40E_PFPE_IPCONFIG0_PEIPID_SHIFT           0
-#define I40E_PFPE_IPCONFIG0_PEIPID_MASK            (0xFFFF <<  I40E_PFPE_IPCONFIG0_PEIPID_SHIFT)
-#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16
-#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_MASK  (0x1 <<  I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT)
-#define I40E_PFPE_MRTEIDXMASK                       0x00008600 /* Reset: PFR */
-#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0
-#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK  (0x1F <<  I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT)
-#define I40E_PFPE_RCVUNEXPECTEDERROR                        0x00008680 /* Reset: PFR */
-#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0
-#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK  (0xFFFFFF <<  I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT)
-#define I40E_PFPE_TCPNOWTIMER               0x00008580 /* Reset: PFR */
-#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0
-#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_MASK  (0xFFFFFFFF <<  I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT)
-
-#define I40E_PFPE_WQEALLOC                      0x00138C00 /* Reset: PFR */
-#define I40E_PFPE_WQEALLOC_PEQPID_SHIFT         0
-#define I40E_PFPE_WQEALLOC_PEQPID_MASK          (0x3FFFF <<  I40E_PFPE_WQEALLOC_PEQPID_SHIFT)
-#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20
-#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_MASK  (0xFFF <<  I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT)
-
-#define I40E_VFPE_AEQALLOC(_VF)          (0x00130C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_AEQALLOC_MAX_INDEX     127
-#define I40E_VFPE_AEQALLOC_AECOUNT_SHIFT 0
-#define I40E_VFPE_AEQALLOC_AECOUNT_MASK  (0xFFFFFFFF <<  I40E_VFPE_AEQALLOC_AECOUNT_SHIFT)
-#define I40E_VFPE_CCQPHIGH(_VF)             (0x00001000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_CCQPHIGH_MAX_INDEX        127
-#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0
-#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_MASK  (0xFFFFFFFF <<  I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT)
-#define I40E_VFPE_CCQPLOW(_VF)            (0x00000C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_CCQPLOW_MAX_INDEX       127
-#define I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT 0
-#define I40E_VFPE_CCQPLOW_PECCQPLOW_MASK  (0xFFFFFFFF <<  I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT)
-#define I40E_VFPE_CCQPSTATUS(_VF)              (0x00000800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_CCQPSTATUS_MAX_INDEX         127
-#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT   0
-#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_MASK    (0x1 <<  I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT)
-#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4
-#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_MASK  (0x7 <<  I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT)
-#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16
-#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_MASK  (0x3F <<  I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT)
-#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT    31
-#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_MASK     (0x1 <<  I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT)
-#define I40E_VFPE_CQACK(_VF)         (0x00130800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_CQACK_MAX_INDEX    127
-#define I40E_VFPE_CQACK_PECQID_SHIFT 0
-#define I40E_VFPE_CQACK_PECQID_MASK  (0x1FFFF <<  I40E_VFPE_CQACK_PECQID_SHIFT)
-#define I40E_VFPE_CQARM(_VF)         (0x00130400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_CQARM_MAX_INDEX    127
-#define I40E_VFPE_CQARM_PECQID_SHIFT 0
-#define I40E_VFPE_CQARM_PECQID_MASK  (0x1FFFF <<  I40E_VFPE_CQARM_PECQID_SHIFT)
-#define I40E_VFPE_CQPDB(_VF)         (0x00000000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_CQPDB_MAX_INDEX    127
-#define I40E_VFPE_CQPDB_WQHEAD_SHIFT 0
-#define I40E_VFPE_CQPDB_WQHEAD_MASK  (0x7FF <<  I40E_VFPE_CQPDB_WQHEAD_SHIFT)
-#define I40E_VFPE_CQPERRCODES(_VF)                 (0x00001800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_CQPERRCODES_MAX_INDEX            127
-#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0
-#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_MASK  (0xFFFF <<  I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT)
-#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16
-#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK  (0xFFFF <<  I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT)
-#define I40E_VFPE_CQPTAIL(_VF)             (0x00000400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_CQPTAIL_MAX_INDEX        127
-#define I40E_VFPE_CQPTAIL_WQTAIL_SHIFT     0
-#define I40E_VFPE_CQPTAIL_WQTAIL_MASK      (0x7FF <<  I40E_VFPE_CQPTAIL_WQTAIL_SHIFT)
-#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31
-#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_MASK  (0x1 <<  I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT)
-#define I40E_VFPE_IPCONFIG0(_VF)                   (0x00001400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_IPCONFIG0_MAX_INDEX              127
-#define I40E_VFPE_IPCONFIG0_PEIPID_SHIFT           0
-#define I40E_VFPE_IPCONFIG0_PEIPID_MASK            (0xFFFF <<  I40E_VFPE_IPCONFIG0_PEIPID_SHIFT)
-#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16
-#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_MASK  (0x1 <<  I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT)
-#define I40E_VFPE_MRTEIDXMASK(_VF)                  (0x00003000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_MRTEIDXMASK_MAX_INDEX             127
-#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0
-#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK  (0x1F <<  I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT)
-#define I40E_VFPE_RCVUNEXPECTEDERROR(_VF)                   (0x00003400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_RCVUNEXPECTEDERROR_MAX_INDEX              127
-#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0
-#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK  (0xFFFFFF <<  I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT)
-#define I40E_VFPE_TCPNOWTIMER(_VF)          (0x00002C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_TCPNOWTIMER_MAX_INDEX     127
-#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0
-#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_MASK  (0xFFFFFFFF <<  I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT)
-#define I40E_VFPE_WQEALLOC(_VF)                 (0x00138000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */
-#define I40E_VFPE_WQEALLOC_MAX_INDEX            127
-#define I40E_VFPE_WQEALLOC_PEQPID_SHIFT         0
-#define I40E_VFPE_WQEALLOC_PEQPID_MASK          (0x3FFFF <<  I40E_VFPE_WQEALLOC_PEQPID_SHIFT)
-#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20
-#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_MASK  (0xFFF <<  I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT)
-
-#define I40E_GLPE_CPUSTATUS0                    0x0000D040 /* Reset: PE_CORER */
-#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT 0
-#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_MASK  (0xFFFFFFFF <<  I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT)
-#define I40E_GLPE_CPUSTATUS1                    0x0000D044 /* Reset: PE_CORER */
-#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT 0
-#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_MASK  (0xFFFFFFFF <<  I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT)
-#define I40E_GLPE_CPUSTATUS2                    0x0000D048 /* Reset: PE_CORER */
-#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT 0
-#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_MASK  (0xFFFFFFFF <<  I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT)
-#define I40E_GLPE_CPUTRIG0                   0x0000D060 /* Reset: PE_CORER */
-#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT  0
-#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_MASK   (0xFFFF <<  I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT)
-#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT 17
-#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_MASK  (0x1 <<  I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT)
-#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT 18
-#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_MASK  (0x1 <<  I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT)
-#define I40E_GLPE_DUAL40_RUPM                     0x0000DA04 /* Reset: PE_CORER */
-#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT 0
-#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_MASK  (0x1 <<  I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT)
-#define I40E_GLPE_PFAEQEDROPCNT(_i)               (0x00131440 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
-#define I40E_GLPE_PFAEQEDROPCNT_MAX_INDEX         15
-#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0
-#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_MASK  (0xFFFF <<  I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT)
-#define I40E_GLPE_PFCEQEDROPCNT(_i)               (0x001313C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
-#define I40E_GLPE_PFCEQEDROPCNT_MAX_INDEX         15
-#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0
-#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_MASK  (0xFFFF <<  I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT)
-#define I40E_GLPE_PFCQEDROPCNT(_i)              (0x00131340 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */
-#define I40E_GLPE_PFCQEDROPCNT_MAX_INDEX        15
-#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT 0
-#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_MASK  (0xFFFF <<  I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT)
-#define I40E_GLPE_RUPM_CQPPOOL                0x0000DACC /* Reset: PE_CORER */
-#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT 0
-#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_MASK  (0xFF <<  I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT)
-#define I40E_GLPE_RUPM_FLRPOOL                0x0000DAC4 /* Reset: PE_CORER */
-#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT 0
-#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_MASK  (0xFF <<  I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT)
-#define I40E_GLPE_RUPM_GCTL                   0x0000DA00 /* Reset: PE_CORER */
-#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT    0
-#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_MASK     (0xFF <<  I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT)
-#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT 26
-#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_MASK  (0x1 <<  I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT)
-#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT 27
-#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_MASK  (0x1 <<  I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT)
-#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT 28
-#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_MASK  (0x1 <<  I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT)
-#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT 29
-#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_MASK  (0x1 <<  I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT)
-#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT    30
-#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_MASK     (0x1 <<  I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT)
-#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT   31
-#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_MASK    (0x1 <<  I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT)
-#define I40E_GLPE_RUPM_PTXPOOL                0x0000DAC8 /* Reset: PE_CORER */
-#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT 0
-#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_MASK  (0xFF <<  I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT)
-#define I40E_GLPE_RUPM_PUSHPOOL                 0x0000DAC0 /* Reset: PE_CORER */
-#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT 0
-#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_MASK  (0xFF <<  I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT)
-#define I40E_GLPE_RUPM_TXHOST_EN                 0x0000DA08 /* Reset: PE_CORER */
-#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT 0
-#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_MASK  (0x1 <<  I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT)
-#define I40E_GLPE_VFAEQEDROPCNT(_i)               (0x00132540 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
-#define I40E_GLPE_VFAEQEDROPCNT_MAX_INDEX         31
-#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0
-#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_MASK  (0xFFFF <<  I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT)
-#define I40E_GLPE_VFCEQEDROPCNT(_i)               (0x00132440 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
-#define I40E_GLPE_VFCEQEDROPCNT_MAX_INDEX         31
-#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0
-#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_MASK  (0xFFFF <<  I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT)
-#define I40E_GLPE_VFCQEDROPCNT(_i)              (0x00132340 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
-#define I40E_GLPE_VFCQEDROPCNT_MAX_INDEX        31
-#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT 0
-#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_MASK  (0xFFFF <<  I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT)
-#define I40E_GLPE_VFFLMOBJCTRL(_i)                  (0x0000D400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPE_VFFLMOBJCTRL_MAX_INDEX            31
-#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT 0
-#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_MASK  (0x7 <<  I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT)
-#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT   8
-#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_MASK    (0x7 <<  I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT)
-#define I40E_GLPE_VFFLMQ1ALLOCERR(_i)               (0x0000C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPE_VFFLMQ1ALLOCERR_MAX_INDEX         31
-#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0
-#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_MASK  (0xFFFF <<  I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT)
-#define I40E_GLPE_VFFLMXMITALLOCERR(_i)               (0x0000C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPE_VFFLMXMITALLOCERR_MAX_INDEX         31
-#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT 0
-#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_MASK  (0xFFFF <<  I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT)
-#define I40E_GLPE_VFUDACTRL(_i)                    (0x0000C000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPE_VFUDACTRL_MAX_INDEX              31
-#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT  0
-#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_MASK   (0x1 <<  I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT)
-#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT  1
-#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_MASK   (0x1 <<  I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT)
-#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT  2
-#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_MASK   (0x1 <<  I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT)
-#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT  3
-#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_MASK   (0x1 <<  I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT)
-#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT 4
-#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_MASK  (0x1 <<  I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT)
-#define I40E_GLPE_VFUDAUCFBQPN(_i)         (0x0000C100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPE_VFUDAUCFBQPN_MAX_INDEX   31
-#define I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT   0
-#define I40E_GLPE_VFUDAUCFBQPN_QPN_MASK    (0x3FFFF <<  I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT)
-#define I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT 31
-#define I40E_GLPE_VFUDAUCFBQPN_VALID_MASK  (0x1 <<  I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT)
-
-#define I40E_GLPES_PFIP4RXDISCARD(_i)                (0x00010600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXDISCARD_MAX_INDEX          15
-#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0
-#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT)
-#define I40E_GLPES_PFIP4RXFRAGSHI(_i)                (0x00010804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXFRAGSHI_MAX_INDEX          15
-#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0
-#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT)
-#define I40E_GLPES_PFIP4RXFRAGSLO(_i)                (0x00010800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXFRAGSLO_MAX_INDEX          15
-#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0
-#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT)
-#define I40E_GLPES_PFIP4RXMCOCTSHI(_i)                 (0x00010A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXMCOCTSHI_MAX_INDEX           15
-#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0
-#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT)
-#define I40E_GLPES_PFIP4RXMCOCTSLO(_i)                 (0x00010A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXMCOCTSLO_MAX_INDEX           15
-#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0
-#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT)
-#define I40E_GLPES_PFIP4RXMCPKTSHI(_i)                 (0x00010C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXMCPKTSHI_MAX_INDEX           15
-#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0
-#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT)
-#define I40E_GLPES_PFIP4RXMCPKTSLO(_i)                 (0x00010C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXMCPKTSLO_MAX_INDEX           15
-#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0
-#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT)
-#define I40E_GLPES_PFIP4RXOCTSHI(_i)               (0x00010204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXOCTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0
-#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT)
-#define I40E_GLPES_PFIP4RXOCTSLO(_i)               (0x00010200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXOCTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0
-#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT)
-#define I40E_GLPES_PFIP4RXPKTSHI(_i)               (0x00010404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXPKTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0
-#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT)
-#define I40E_GLPES_PFIP4RXPKTSLO(_i)               (0x00010400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXPKTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0
-#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT)
-#define I40E_GLPES_PFIP4RXTRUNC(_i)              (0x00010700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4RXTRUNC_MAX_INDEX        15
-#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0
-#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT)
-#define I40E_GLPES_PFIP4TXFRAGSHI(_i)                (0x00011E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXFRAGSHI_MAX_INDEX          15
-#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0
-#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT)
-#define I40E_GLPES_PFIP4TXFRAGSLO(_i)                (0x00011E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXFRAGSLO_MAX_INDEX          15
-#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0
-#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT)
-#define I40E_GLPES_PFIP4TXMCOCTSHI(_i)                 (0x00012004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXMCOCTSHI_MAX_INDEX           15
-#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0
-#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT)
-#define I40E_GLPES_PFIP4TXMCOCTSLO(_i)                 (0x00012000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXMCOCTSLO_MAX_INDEX           15
-#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0
-#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT)
-#define I40E_GLPES_PFIP4TXMCPKTSHI(_i)                 (0x00012204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXMCPKTSHI_MAX_INDEX           15
-#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0
-#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT)
-#define I40E_GLPES_PFIP4TXMCPKTSLO(_i)                 (0x00012200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXMCPKTSLO_MAX_INDEX           15
-#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0
-#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT)
-#define I40E_GLPES_PFIP4TXNOROUTE(_i)                (0x00012E00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXNOROUTE_MAX_INDEX          15
-#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0
-#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_MASK  (0xFFFFFF <<  I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT)
-#define I40E_GLPES_PFIP4TXOCTSHI(_i)               (0x00011A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXOCTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0
-#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT)
-#define I40E_GLPES_PFIP4TXOCTSLO(_i)               (0x00011A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXOCTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0
-#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT)
-#define I40E_GLPES_PFIP4TXPKTSHI(_i)               (0x00011C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXPKTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0
-#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT)
-#define I40E_GLPES_PFIP4TXPKTSLO(_i)               (0x00011C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP4TXPKTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0
-#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT)
-#define I40E_GLPES_PFIP6RXDISCARD(_i)                (0x00011200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXDISCARD_MAX_INDEX          15
-#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0
-#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT)
-#define I40E_GLPES_PFIP6RXFRAGSHI(_i)                (0x00011404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXFRAGSHI_MAX_INDEX          15
-#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0
-#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT)
-#define I40E_GLPES_PFIP6RXFRAGSLO(_i)                (0x00011400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXFRAGSLO_MAX_INDEX          15
-#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0
-#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT)
-#define I40E_GLPES_PFIP6RXMCOCTSHI(_i)                 (0x00011604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXMCOCTSHI_MAX_INDEX           15
-#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0
-#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT)
-#define I40E_GLPES_PFIP6RXMCOCTSLO(_i)                 (0x00011600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXMCOCTSLO_MAX_INDEX           15
-#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0
-#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT)
-#define I40E_GLPES_PFIP6RXMCPKTSHI(_i)                 (0x00011804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXMCPKTSHI_MAX_INDEX           15
-#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0
-#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT)
-#define I40E_GLPES_PFIP6RXMCPKTSLO(_i)                 (0x00011800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXMCPKTSLO_MAX_INDEX           15
-#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0
-#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT)
-#define I40E_GLPES_PFIP6RXOCTSHI(_i)               (0x00010E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXOCTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0
-#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT)
-#define I40E_GLPES_PFIP6RXOCTSLO(_i)               (0x00010E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXOCTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0
-#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT)
-#define I40E_GLPES_PFIP6RXPKTSHI(_i)               (0x00011004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXPKTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0
-#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT)
-#define I40E_GLPES_PFIP6RXPKTSLO(_i)               (0x00011000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXPKTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0
-#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT)
-#define I40E_GLPES_PFIP6RXTRUNC(_i)              (0x00011300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6RXTRUNC_MAX_INDEX        15
-#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0
-#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT)
-#define I40E_GLPES_PFIP6TXFRAGSHI(_i)                (0x00012804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXFRAGSHI_MAX_INDEX          15
-#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0
-#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT)
-#define I40E_GLPES_PFIP6TXFRAGSLO(_i)                (0x00012800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXFRAGSLO_MAX_INDEX          15
-#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0
-#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT)
-#define I40E_GLPES_PFIP6TXMCOCTSHI(_i)                 (0x00012A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXMCOCTSHI_MAX_INDEX           15
-#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0
-#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT)
-#define I40E_GLPES_PFIP6TXMCOCTSLO(_i)                 (0x00012A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXMCOCTSLO_MAX_INDEX           15
-#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0
-#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT)
-#define I40E_GLPES_PFIP6TXMCPKTSHI(_i)                 (0x00012C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXMCPKTSHI_MAX_INDEX           15
-#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0
-#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT)
-#define I40E_GLPES_PFIP6TXMCPKTSLO(_i)                 (0x00012C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXMCPKTSLO_MAX_INDEX           15
-#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0
-#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT)
-#define I40E_GLPES_PFIP6TXNOROUTE(_i)                (0x00012F00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXNOROUTE_MAX_INDEX          15
-#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0
-#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_MASK  (0xFFFFFF <<  I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT)
-#define I40E_GLPES_PFIP6TXOCTSHI(_i)               (0x00012404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXOCTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0
-#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT)
-#define I40E_GLPES_PFIP6TXOCTSLO(_i)               (0x00012400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXOCTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0
-#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT)
-#define I40E_GLPES_PFIP6TXPKTSHI(_i)               (0x00012604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXPKTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0
-#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT)
-#define I40E_GLPES_PFIP6TXPKTSLO(_i)               (0x00012600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFIP6TXPKTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0
-#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT)
-#define I40E_GLPES_PFRDMARXRDSHI(_i)               (0x00013E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMARXRDSHI_MAX_INDEX         15
-#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0
-#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_MASK  (0xFFFF <<  I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT)
-#define I40E_GLPES_PFRDMARXRDSLO(_i)               (0x00013E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMARXRDSLO_MAX_INDEX         15
-#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0
-#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT)
-#define I40E_GLPES_PFRDMARXSNDSHI(_i)                (0x00014004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMARXSNDSHI_MAX_INDEX          15
-#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0
-#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_MASK  (0xFFFF <<  I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT)
-#define I40E_GLPES_PFRDMARXSNDSLO(_i)                (0x00014000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMARXSNDSLO_MAX_INDEX          15
-#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0
-#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT)
-#define I40E_GLPES_PFRDMARXWRSHI(_i)               (0x00013C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMARXWRSHI_MAX_INDEX         15
-#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0
-#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_MASK  (0xFFFF <<  I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT)
-#define I40E_GLPES_PFRDMARXWRSLO(_i)               (0x00013C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMARXWRSLO_MAX_INDEX         15
-#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0
-#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT)
-#define I40E_GLPES_PFRDMATXRDSHI(_i)               (0x00014404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMATXRDSHI_MAX_INDEX         15
-#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0
-#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_MASK  (0xFFFF <<  I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT)
-#define I40E_GLPES_PFRDMATXRDSLO(_i)               (0x00014400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMATXRDSLO_MAX_INDEX         15
-#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0
-#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT)
-#define I40E_GLPES_PFRDMATXSNDSHI(_i)                (0x00014604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMATXSNDSHI_MAX_INDEX          15
-#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0
-#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_MASK  (0xFFFF <<  I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT)
-#define I40E_GLPES_PFRDMATXSNDSLO(_i)                (0x00014600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMATXSNDSLO_MAX_INDEX          15
-#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0
-#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT)
-#define I40E_GLPES_PFRDMATXWRSHI(_i)               (0x00014204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMATXWRSHI_MAX_INDEX         15
-#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0
-#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_MASK  (0xFFFF <<  I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT)
-#define I40E_GLPES_PFRDMATXWRSLO(_i)               (0x00014200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMATXWRSLO_MAX_INDEX         15
-#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0
-#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT)
-#define I40E_GLPES_PFRDMAVBNDHI(_i)              (0x00014804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMAVBNDHI_MAX_INDEX        15
-#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0
-#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT)
-#define I40E_GLPES_PFRDMAVBNDLO(_i)              (0x00014800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMAVBNDLO_MAX_INDEX        15
-#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0
-#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT)
-#define I40E_GLPES_PFRDMAVINVHI(_i)              (0x00014A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMAVINVHI_MAX_INDEX        15
-#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT 0
-#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT)
-#define I40E_GLPES_PFRDMAVINVLO(_i)              (0x00014A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRDMAVINVLO_MAX_INDEX        15
-#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT 0
-#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT)
-#define I40E_GLPES_PFRXVLANERR(_i)             (0x00010000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFRXVLANERR_MAX_INDEX       15
-#define I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT 0
-#define I40E_GLPES_PFRXVLANERR_RXVLANERR_MASK  (0xFFFFFF <<  I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT)
-#define I40E_GLPES_PFTCPRTXSEG(_i)             (0x00013600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFTCPRTXSEG_MAX_INDEX       15
-#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT 0
-#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT)
-#define I40E_GLPES_PFTCPRXOPTERR(_i)               (0x00013200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFTCPRXOPTERR_MAX_INDEX         15
-#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0
-#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_MASK  (0xFFFFFF <<  I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT)
-#define I40E_GLPES_PFTCPRXPROTOERR(_i)                 (0x00013300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFTCPRXPROTOERR_MAX_INDEX           15
-#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0
-#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_MASK  (0xFFFFFF <<  I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT)
-#define I40E_GLPES_PFTCPRXSEGSHI(_i)               (0x00013004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFTCPRXSEGSHI_MAX_INDEX         15
-#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0
-#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_MASK  (0xFFFF <<  I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT)
-#define I40E_GLPES_PFTCPRXSEGSLO(_i)               (0x00013000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFTCPRXSEGSLO_MAX_INDEX         15
-#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0
-#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT)
-#define I40E_GLPES_PFTCPTXSEGHI(_i)              (0x00013404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFTCPTXSEGHI_MAX_INDEX        15
-#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0
-#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_MASK  (0xFFFF <<  I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT)
-#define I40E_GLPES_PFTCPTXSEGLO(_i)              (0x00013400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFTCPTXSEGLO_MAX_INDEX        15
-#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0
-#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT)
-#define I40E_GLPES_PFUDPRXPKTSHI(_i)               (0x00013804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFUDPRXPKTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0
-#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT)
-#define I40E_GLPES_PFUDPRXPKTSLO(_i)               (0x00013800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFUDPRXPKTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0
-#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT)
-#define I40E_GLPES_PFUDPTXPKTSHI(_i)               (0x00013A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFUDPTXPKTSHI_MAX_INDEX         15
-#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0
-#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT)
-#define I40E_GLPES_PFUDPTXPKTSLO(_i)               (0x00013A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */
-#define I40E_GLPES_PFUDPTXPKTSLO_MAX_INDEX         15
-#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0
-#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT)
-#define I40E_GLPES_RDMARXMULTFPDUSHI                         0x0001E014 /* Reset: PE_CORER */
-#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT 0
-#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_MASK  (0xFFFFFF <<  I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT)
-#define I40E_GLPES_RDMARXMULTFPDUSLO                         0x0001E010 /* Reset: PE_CORER */
-#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT 0
-#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT)
-#define I40E_GLPES_RDMARXOOODDPHI                      0x0001E01C /* Reset: PE_CORER */
-#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT 0
-#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_MASK  (0xFFFFFF <<  I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT)
-#define I40E_GLPES_RDMARXOOODDPLO                      0x0001E018 /* Reset: PE_CORER */
-#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT 0
-#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT)
-#define I40E_GLPES_RDMARXOOONOMARK                     0x0001E004 /* Reset: PE_CORER */
-#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT 0
-#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_MASK  (0xFFFFFFFF <<  I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT)
-#define I40E_GLPES_RDMARXUNALIGN                     0x0001E000 /* Reset: PE_CORER */
-#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT 0
-#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_MASK  (0xFFFFFFFF <<  I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT)
-#define I40E_GLPES_TCPRXFOURHOLEHI                       0x0001E044 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT 0
-#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_MASK  (0xFFFFFF <<  I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT)
-#define I40E_GLPES_TCPRXFOURHOLELO                       0x0001E040 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT 0
-#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_MASK  (0xFFFFFFFF <<  I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT)
-#define I40E_GLPES_TCPRXONEHOLEHI                      0x0001E02C /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT 0
-#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_MASK  (0xFFFFFF <<  I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT)
-#define I40E_GLPES_TCPRXONEHOLELO                      0x0001E028 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT 0
-#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_MASK  (0xFFFFFFFF <<  I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT)
-#define I40E_GLPES_TCPRXPUREACKHI                       0x0001E024 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT 0
-#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_MASK  (0xFFFFFF <<  I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT)
-#define I40E_GLPES_TCPRXPUREACKSLO                      0x0001E020 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT 0
-#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT)
-#define I40E_GLPES_TCPRXTHREEHOLEHI                        0x0001E03C /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT 0
-#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_MASK  (0xFFFFFF <<  I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT)
-#define I40E_GLPES_TCPRXTHREEHOLELO                        0x0001E038 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT 0
-#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_MASK  (0xFFFFFFFF <<  I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT)
-#define I40E_GLPES_TCPRXTWOHOLEHI                      0x0001E034 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT 0
-#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_MASK  (0xFFFFFF <<  I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT)
-#define I40E_GLPES_TCPRXTWOHOLELO                      0x0001E030 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT 0
-#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_MASK  (0xFFFFFFFF <<  I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT)
-#define I40E_GLPES_TCPTXRETRANSFASTHI                          0x0001E04C /* Reset: PE_CORER */
-#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT 0
-#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_MASK  (0xFFFFFF <<  I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT)
-#define I40E_GLPES_TCPTXRETRANSFASTLO                          0x0001E048 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT 0
-#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT)
-#define I40E_GLPES_TCPTXTOUTSFASTHI                        0x0001E054 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT 0
-#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_MASK  (0xFFFFFF <<  I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT)
-#define I40E_GLPES_TCPTXTOUTSFASTLO                        0x0001E050 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT 0
-#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT)
-#define I40E_GLPES_TCPTXTOUTSHI                    0x0001E05C /* Reset: PE_CORER */
-#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT 0
-#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_MASK  (0xFFFFFF <<  I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT)
-#define I40E_GLPES_TCPTXTOUTSLO                    0x0001E058 /* Reset: PE_CORER */
-#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT 0
-#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT)
-#define I40E_GLPES_VFIP4RXDISCARD(_i)                (0x00018600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXDISCARD_MAX_INDEX          31
-#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0
-#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT)
-#define I40E_GLPES_VFIP4RXFRAGSHI(_i)                (0x00018804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXFRAGSHI_MAX_INDEX          31
-#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0
-#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT)
-#define I40E_GLPES_VFIP4RXFRAGSLO(_i)                (0x00018800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXFRAGSLO_MAX_INDEX          31
-#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0
-#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT)
-#define I40E_GLPES_VFIP4RXMCOCTSHI(_i)                 (0x00018A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXMCOCTSHI_MAX_INDEX           31
-#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0
-#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT)
-#define I40E_GLPES_VFIP4RXMCOCTSLO(_i)                 (0x00018A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXMCOCTSLO_MAX_INDEX           31
-#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0
-#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT)
-#define I40E_GLPES_VFIP4RXMCPKTSHI(_i)                 (0x00018C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXMCPKTSHI_MAX_INDEX           31
-#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0
-#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT)
-#define I40E_GLPES_VFIP4RXMCPKTSLO(_i)                 (0x00018C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXMCPKTSLO_MAX_INDEX           31
-#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0
-#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT)
-#define I40E_GLPES_VFIP4RXOCTSHI(_i)               (0x00018204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXOCTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0
-#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT)
-#define I40E_GLPES_VFIP4RXOCTSLO(_i)               (0x00018200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXOCTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0
-#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT)
-#define I40E_GLPES_VFIP4RXPKTSHI(_i)               (0x00018404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXPKTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0
-#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT)
-#define I40E_GLPES_VFIP4RXPKTSLO(_i)               (0x00018400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXPKTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0
-#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT)
-#define I40E_GLPES_VFIP4RXTRUNC(_i)              (0x00018700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4RXTRUNC_MAX_INDEX        31
-#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0
-#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT)
-#define I40E_GLPES_VFIP4TXFRAGSHI(_i)                (0x00019E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXFRAGSHI_MAX_INDEX          31
-#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0
-#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT)
-#define I40E_GLPES_VFIP4TXFRAGSLO(_i)                (0x00019E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXFRAGSLO_MAX_INDEX          31
-#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0
-#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT)
-#define I40E_GLPES_VFIP4TXMCOCTSHI(_i)                 (0x0001A004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXMCOCTSHI_MAX_INDEX           31
-#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0
-#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT)
-#define I40E_GLPES_VFIP4TXMCOCTSLO(_i)                 (0x0001A000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXMCOCTSLO_MAX_INDEX           31
-#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0
-#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT)
-#define I40E_GLPES_VFIP4TXMCPKTSHI(_i)                 (0x0001A204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXMCPKTSHI_MAX_INDEX           31
-#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0
-#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT)
-#define I40E_GLPES_VFIP4TXMCPKTSLO(_i)                 (0x0001A200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXMCPKTSLO_MAX_INDEX           31
-#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0
-#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT)
-#define I40E_GLPES_VFIP4TXNOROUTE(_i)                (0x0001AE00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXNOROUTE_MAX_INDEX          31
-#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0
-#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_MASK  (0xFFFFFF <<  I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT)
-#define I40E_GLPES_VFIP4TXOCTSHI(_i)               (0x00019A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXOCTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0
-#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT)
-#define I40E_GLPES_VFIP4TXOCTSLO(_i)               (0x00019A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXOCTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0
-#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT)
-#define I40E_GLPES_VFIP4TXPKTSHI(_i)               (0x00019C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXPKTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0
-#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT)
-#define I40E_GLPES_VFIP4TXPKTSLO(_i)               (0x00019C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP4TXPKTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0
-#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT)
-#define I40E_GLPES_VFIP6RXDISCARD(_i)                (0x00019200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXDISCARD_MAX_INDEX          31
-#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0
-#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT)
-#define I40E_GLPES_VFIP6RXFRAGSHI(_i)                (0x00019404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXFRAGSHI_MAX_INDEX          31
-#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0
-#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT)
-#define I40E_GLPES_VFIP6RXFRAGSLO(_i)                (0x00019400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXFRAGSLO_MAX_INDEX          31
-#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0
-#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT)
-#define I40E_GLPES_VFIP6RXMCOCTSHI(_i)                 (0x00019604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXMCOCTSHI_MAX_INDEX           31
-#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0
-#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT)
-#define I40E_GLPES_VFIP6RXMCOCTSLO(_i)                 (0x00019600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXMCOCTSLO_MAX_INDEX           31
-#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0
-#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT)
-#define I40E_GLPES_VFIP6RXMCPKTSHI(_i)                 (0x00019804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXMCPKTSHI_MAX_INDEX           31
-#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0
-#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT)
-#define I40E_GLPES_VFIP6RXMCPKTSLO(_i)                 (0x00019800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXMCPKTSLO_MAX_INDEX           31
-#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0
-#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT)
-#define I40E_GLPES_VFIP6RXOCTSHI(_i)               (0x00018E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXOCTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0
-#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT)
-#define I40E_GLPES_VFIP6RXOCTSLO(_i)               (0x00018E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXOCTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0
-#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT)
-#define I40E_GLPES_VFIP6RXPKTSHI(_i)               (0x00019004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXPKTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0
-#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT)
-#define I40E_GLPES_VFIP6RXPKTSLO(_i)               (0x00019000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXPKTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0
-#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT)
-#define I40E_GLPES_VFIP6RXTRUNC(_i)              (0x00019300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6RXTRUNC_MAX_INDEX        31
-#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0
-#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT)
-#define I40E_GLPES_VFIP6TXFRAGSHI(_i)                (0x0001A804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXFRAGSHI_MAX_INDEX          31
-#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0
-#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT)
-#define I40E_GLPES_VFIP6TXFRAGSLO(_i)                (0x0001A800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXFRAGSLO_MAX_INDEX          31
-#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0
-#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT)
-#define I40E_GLPES_VFIP6TXMCOCTSHI(_i)                 (0x0001AA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXMCOCTSHI_MAX_INDEX           31
-#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0
-#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT)
-#define I40E_GLPES_VFIP6TXMCOCTSLO(_i)                 (0x0001AA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXMCOCTSLO_MAX_INDEX           31
-#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0
-#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT)
-#define I40E_GLPES_VFIP6TXMCPKTSHI(_i)                 (0x0001AC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXMCPKTSHI_MAX_INDEX           31
-#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0
-#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT)
-#define I40E_GLPES_VFIP6TXMCPKTSLO(_i)                 (0x0001AC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXMCPKTSLO_MAX_INDEX           31
-#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0
-#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT)
-#define I40E_GLPES_VFIP6TXNOROUTE(_i)                (0x0001AF00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXNOROUTE_MAX_INDEX          31
-#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0
-#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_MASK  (0xFFFFFF <<  I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT)
-#define I40E_GLPES_VFIP6TXOCTSHI(_i)               (0x0001A404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXOCTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0
-#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT)
-#define I40E_GLPES_VFIP6TXOCTSLO(_i)               (0x0001A400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXOCTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0
-#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT)
-#define I40E_GLPES_VFIP6TXPKTSHI(_i)               (0x0001A604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXPKTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0
-#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT)
-#define I40E_GLPES_VFIP6TXPKTSLO(_i)               (0x0001A600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFIP6TXPKTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0
-#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT)
-#define I40E_GLPES_VFRDMARXRDSHI(_i)               (0x0001BE04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMARXRDSHI_MAX_INDEX         31
-#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0
-#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_MASK  (0xFFFF <<  I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT)
-#define I40E_GLPES_VFRDMARXRDSLO(_i)               (0x0001BE00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMARXRDSLO_MAX_INDEX         31
-#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0
-#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT)
-#define I40E_GLPES_VFRDMARXSNDSHI(_i)                (0x0001C004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMARXSNDSHI_MAX_INDEX          31
-#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0
-#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_MASK  (0xFFFF <<  I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT)
-#define I40E_GLPES_VFRDMARXSNDSLO(_i)                (0x0001C000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMARXSNDSLO_MAX_INDEX          31
-#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0
-#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT)
-#define I40E_GLPES_VFRDMARXWRSHI(_i)               (0x0001BC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMARXWRSHI_MAX_INDEX         31
-#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0
-#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_MASK  (0xFFFF <<  I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT)
-#define I40E_GLPES_VFRDMARXWRSLO(_i)               (0x0001BC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMARXWRSLO_MAX_INDEX         31
-#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0
-#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT)
-#define I40E_GLPES_VFRDMATXRDSHI(_i)               (0x0001C404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMATXRDSHI_MAX_INDEX         31
-#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0
-#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_MASK  (0xFFFF <<  I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT)
-#define I40E_GLPES_VFRDMATXRDSLO(_i)               (0x0001C400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMATXRDSLO_MAX_INDEX         31
-#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0
-#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT)
-#define I40E_GLPES_VFRDMATXSNDSHI(_i)                (0x0001C604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMATXSNDSHI_MAX_INDEX          31
-#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0
-#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_MASK  (0xFFFF <<  I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT)
-#define I40E_GLPES_VFRDMATXSNDSLO(_i)                (0x0001C600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMATXSNDSLO_MAX_INDEX          31
-#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0
-#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT)
-#define I40E_GLPES_VFRDMATXWRSHI(_i)               (0x0001C204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMATXWRSHI_MAX_INDEX         31
-#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0
-#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_MASK  (0xFFFF <<  I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT)
-#define I40E_GLPES_VFRDMATXWRSLO(_i)               (0x0001C200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMATXWRSLO_MAX_INDEX         31
-#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0
-#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT)
-#define I40E_GLPES_VFRDMAVBNDHI(_i)              (0x0001C804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMAVBNDHI_MAX_INDEX        31
-#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0
-#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT)
-#define I40E_GLPES_VFRDMAVBNDLO(_i)              (0x0001C800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMAVBNDLO_MAX_INDEX        31
-#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0
-#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT)
-#define I40E_GLPES_VFRDMAVINVHI(_i)              (0x0001CA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMAVINVHI_MAX_INDEX        31
-#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT 0
-#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT)
-#define I40E_GLPES_VFRDMAVINVLO(_i)              (0x0001CA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRDMAVINVLO_MAX_INDEX        31
-#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT 0
-#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT)
-#define I40E_GLPES_VFRXVLANERR(_i)             (0x00018000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFRXVLANERR_MAX_INDEX       31
-#define I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT 0
-#define I40E_GLPES_VFRXVLANERR_RXVLANERR_MASK  (0xFFFFFF <<  I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT)
-#define I40E_GLPES_VFTCPRTXSEG(_i)             (0x0001B600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFTCPRTXSEG_MAX_INDEX       31
-#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT 0
-#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT)
-#define I40E_GLPES_VFTCPRXOPTERR(_i)               (0x0001B200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFTCPRXOPTERR_MAX_INDEX         31
-#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0
-#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_MASK  (0xFFFFFF <<  I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT)
-#define I40E_GLPES_VFTCPRXPROTOERR(_i)                 (0x0001B300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFTCPRXPROTOERR_MAX_INDEX           31
-#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0
-#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_MASK  (0xFFFFFF <<  I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT)
-#define I40E_GLPES_VFTCPRXSEGSHI(_i)               (0x0001B004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFTCPRXSEGSHI_MAX_INDEX         31
-#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0
-#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_MASK  (0xFFFF <<  I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT)
-#define I40E_GLPES_VFTCPRXSEGSLO(_i)               (0x0001B000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFTCPRXSEGSLO_MAX_INDEX         31
-#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0
-#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT)
-#define I40E_GLPES_VFTCPTXSEGHI(_i)              (0x0001B404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFTCPTXSEGHI_MAX_INDEX        31
-#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0
-#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_MASK  (0xFFFF <<  I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT)
-#define I40E_GLPES_VFTCPTXSEGLO(_i)              (0x0001B400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFTCPTXSEGLO_MAX_INDEX        31
-#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0
-#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT)
-#define I40E_GLPES_VFUDPRXPKTSHI(_i)               (0x0001B804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFUDPRXPKTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0
-#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT)
-#define I40E_GLPES_VFUDPRXPKTSLO(_i)               (0x0001B800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFUDPRXPKTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0
-#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT)
-#define I40E_GLPES_VFUDPTXPKTSHI(_i)               (0x0001BA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFUDPTXPKTSHI_MAX_INDEX         31
-#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0
-#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_MASK  (0xFFFF <<  I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT)
-#define I40E_GLPES_VFUDPTXPKTSLO(_i)               (0x0001BA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */
-#define I40E_GLPES_VFUDPTXPKTSLO_MAX_INDEX         31
-#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0
-#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_MASK  (0xFFFFFFFF <<  I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT)
-
-#define I40E_VFPE_AEQALLOC1               0x0000A400 /* Reset: VFR */
-#define I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT 0
-#define I40E_VFPE_AEQALLOC1_AECOUNT_MASK  (0xFFFFFFFF <<  I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT)
-#define I40E_VFPE_CCQPHIGH1                  0x00009800 /* Reset: VFR */
-#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT 0
-#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_MASK  (0xFFFFFFFF <<  I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT)
-#define I40E_VFPE_CCQPLOW1                 0x0000AC00 /* Reset: VFR */
-#define I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT 0
-#define I40E_VFPE_CCQPLOW1_PECCQPLOW_MASK  (0xFFFFFFFF <<  I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT)
-#define I40E_VFPE_CCQPSTATUS1                   0x0000B800 /* Reset: VFR */
-#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT   0
-#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_MASK    (0x1 <<  I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT)
-#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT 4
-#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_MASK  (0x7 <<  I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT)
-#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT 16
-#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_MASK  (0x3F <<  I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT)
-#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT    31
-#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_MASK     (0x1 <<  I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT)
-#define I40E_VFPE_CQACK1              0x0000B000 /* Reset: VFR */
-#define I40E_VFPE_CQACK1_PECQID_SHIFT 0
-#define I40E_VFPE_CQACK1_PECQID_MASK  (0x1FFFF <<  I40E_VFPE_CQACK1_PECQID_SHIFT)
-#define I40E_VFPE_CQARM1              0x0000B400 /* Reset: VFR */
-#define I40E_VFPE_CQARM1_PECQID_SHIFT 0
-#define I40E_VFPE_CQARM1_PECQID_MASK  (0x1FFFF <<  I40E_VFPE_CQARM1_PECQID_SHIFT)
-#define I40E_VFPE_CQPDB1              0x0000BC00 /* Reset: VFR */
-#define I40E_VFPE_CQPDB1_WQHEAD_SHIFT 0
-#define I40E_VFPE_CQPDB1_WQHEAD_MASK  (0x7FF <<  I40E_VFPE_CQPDB1_WQHEAD_SHIFT)
-#define I40E_VFPE_CQPERRCODES1                      0x00009C00 /* Reset: VFR */
-#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT 0
-#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_MASK  (0xFFFF <<  I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT)
-#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT 16
-#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_MASK  (0xFFFF <<  I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT)
-#define I40E_VFPE_CQPTAIL1                  0x0000A000 /* Reset: VFR */
-#define I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT     0
-#define I40E_VFPE_CQPTAIL1_WQTAIL_MASK      (0x7FF <<  I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT)
-#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT 31
-#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_MASK  (0x1 <<  I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT)
-#define I40E_VFPE_IPCONFIG01                        0x00008C00 /* Reset: VFR */
-#define I40E_VFPE_IPCONFIG01_PEIPID_SHIFT           0
-#define I40E_VFPE_IPCONFIG01_PEIPID_MASK            (0xFFFF <<  I40E_VFPE_IPCONFIG01_PEIPID_SHIFT)
-#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT 16
-#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_MASK  (0x1 <<  I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT)
-#define I40E_VFPE_MRTEIDXMASK1                       0x00009000 /* Reset: VFR */
-#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT 0
-#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_MASK  (0x1F <<  I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT)
-#define I40E_VFPE_RCVUNEXPECTEDERROR1                        0x00009400 /* Reset: VFR */
-#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT 0
-#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_MASK  (0xFFFFFF <<  I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT)
-#define I40E_VFPE_TCPNOWTIMER1               0x0000A800 /* Reset: VFR */
-#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT 0
-#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_MASK  (0xFFFFFFFF <<  I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT)
-#define I40E_VFPE_WQEALLOC1                      0x0000C000 /* Reset: VFR */
-#define I40E_VFPE_WQEALLOC1_PEQPID_SHIFT         0
-#define I40E_VFPE_WQEALLOC1_PEQPID_MASK          (0x3FFFF <<  I40E_VFPE_WQEALLOC1_PEQPID_SHIFT)
-#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT 20
-#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_MASK  (0xFFF <<  I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT)
-#endif /* I40IW_REGISTER_H */
diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h
deleted file mode 100644
index 36a19c4e5bba..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_status.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_STATUS_H
-#define I40IW_STATUS_H
-
-/* Error Codes */
-enum i40iw_status_code {
-	I40IW_SUCCESS = 0,
-	I40IW_ERR_NVM = -1,
-	I40IW_ERR_NVM_CHECKSUM = -2,
-	I40IW_ERR_CONFIG = -4,
-	I40IW_ERR_PARAM = -5,
-	I40IW_ERR_DEVICE_NOT_SUPPORTED = -6,
-	I40IW_ERR_RESET_FAILED = -7,
-	I40IW_ERR_SWFW_SYNC = -8,
-	I40IW_ERR_NO_MEMORY = -9,
-	I40IW_ERR_BAD_PTR = -10,
-	I40IW_ERR_INVALID_PD_ID = -11,
-	I40IW_ERR_INVALID_QP_ID = -12,
-	I40IW_ERR_INVALID_CQ_ID = -13,
-	I40IW_ERR_INVALID_CEQ_ID = -14,
-	I40IW_ERR_INVALID_AEQ_ID = -15,
-	I40IW_ERR_INVALID_SIZE = -16,
-	I40IW_ERR_INVALID_ARP_INDEX = -17,
-	I40IW_ERR_INVALID_FPM_FUNC_ID = -18,
-	I40IW_ERR_QP_INVALID_MSG_SIZE = -19,
-	I40IW_ERR_QP_TOOMANY_WRS_POSTED = -20,
-	I40IW_ERR_INVALID_FRAG_COUNT = -21,
-	I40IW_ERR_QUEUE_EMPTY = -22,
-	I40IW_ERR_INVALID_ALIGNMENT = -23,
-	I40IW_ERR_FLUSHED_QUEUE = -24,
-	I40IW_ERR_INVALID_INLINE_DATA_SIZE = -26,
-	I40IW_ERR_TIMEOUT = -27,
-	I40IW_ERR_OPCODE_MISMATCH = -28,
-	I40IW_ERR_CQP_COMPL_ERROR = -29,
-	I40IW_ERR_INVALID_VF_ID = -30,
-	I40IW_ERR_INVALID_HMCFN_ID = -31,
-	I40IW_ERR_BACKING_PAGE_ERROR = -32,
-	I40IW_ERR_NO_PBLCHUNKS_AVAILABLE = -33,
-	I40IW_ERR_INVALID_PBLE_INDEX = -34,
-	I40IW_ERR_INVALID_SD_INDEX = -35,
-	I40IW_ERR_INVALID_PAGE_DESC_INDEX = -36,
-	I40IW_ERR_INVALID_SD_TYPE = -37,
-	I40IW_ERR_MEMCPY_FAILED = -38,
-	I40IW_ERR_INVALID_HMC_OBJ_INDEX = -39,
-	I40IW_ERR_INVALID_HMC_OBJ_COUNT = -40,
-	I40IW_ERR_INVALID_SRQ_ARM_LIMIT = -41,
-	I40IW_ERR_SRQ_ENABLED = -42,
-	I40IW_ERR_BUF_TOO_SHORT = -43,
-	I40IW_ERR_BAD_IWARP_CQE = -44,
-	I40IW_ERR_NVM_BLANK_MODE = -45,
-	I40IW_ERR_NOT_IMPLEMENTED = -46,
-	I40IW_ERR_PE_DOORBELL_NOT_ENABLED = -47,
-	I40IW_ERR_NOT_READY = -48,
-	I40IW_NOT_SUPPORTED = -49,
-	I40IW_ERR_FIRMWARE_API_VERSION = -50,
-	I40IW_ERR_RING_FULL = -51,
-	I40IW_ERR_MPA_CRC = -61,
-	I40IW_ERR_NO_TXBUFS = -62,
-	I40IW_ERR_SEQ_NUM = -63,
-	I40IW_ERR_list_empty = -64,
-	I40IW_ERR_INVALID_MAC_ADDR = -65,
-	I40IW_ERR_BAD_STAG      = -66,
-	I40IW_ERR_CQ_COMPL_ERROR = -67,
-	I40IW_ERR_QUEUE_DESTROYED = -68,
-	I40IW_ERR_INVALID_FEAT_CNT = -69
-
-};
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h
deleted file mode 100644
index 394e182686cf..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_type.h
+++ /dev/null
@@ -1,1358 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_TYPE_H
-#define I40IW_TYPE_H
-#include "i40iw_user.h"
-#include "i40iw_hmc.h"
-#include "i40iw_vf.h"
-#include "i40iw_virtchnl.h"
-
-struct i40iw_cqp_sq_wqe {
-	u64 buf[I40IW_CQP_WQE_SIZE];
-};
-
-struct i40iw_sc_aeqe {
-	u64 buf[I40IW_AEQE_SIZE];
-};
-
-struct i40iw_ceqe {
-	u64 buf[I40IW_CEQE_SIZE];
-};
-
-struct i40iw_cqp_ctx {
-	u64 buf[I40IW_CQP_CTX_SIZE];
-};
-
-struct i40iw_cq_shadow_area {
-	u64 buf[I40IW_SHADOW_AREA_SIZE];
-};
-
-struct i40iw_sc_dev;
-struct i40iw_hmc_info;
-struct i40iw_vsi_pestat;
-
-struct i40iw_cqp_ops;
-struct i40iw_ccq_ops;
-struct i40iw_ceq_ops;
-struct i40iw_aeq_ops;
-struct i40iw_mr_ops;
-struct i40iw_cqp_misc_ops;
-struct i40iw_pd_ops;
-struct i40iw_priv_qp_ops;
-struct i40iw_priv_cq_ops;
-struct i40iw_hmc_ops;
-struct pci_dev;
-
-enum i40iw_page_size {
-	I40IW_PAGE_SIZE_4K,
-	I40IW_PAGE_SIZE_2M
-};
-
-enum i40iw_resource_indicator_type {
-	I40IW_RSRC_INDICATOR_TYPE_ADAPTER = 0,
-	I40IW_RSRC_INDICATOR_TYPE_CQ,
-	I40IW_RSRC_INDICATOR_TYPE_QP,
-	I40IW_RSRC_INDICATOR_TYPE_SRQ
-};
-
-enum i40iw_hdrct_flags {
-	DDP_LEN_FLAG = 0x80,
-	DDP_HDR_FLAG = 0x40,
-	RDMA_HDR_FLAG = 0x20
-};
-
-enum i40iw_term_layers {
-	LAYER_RDMA = 0,
-	LAYER_DDP = 1,
-	LAYER_MPA = 2
-};
-
-enum i40iw_term_error_types {
-	RDMAP_REMOTE_PROT = 1,
-	RDMAP_REMOTE_OP = 2,
-	DDP_CATASTROPHIC = 0,
-	DDP_TAGGED_BUFFER = 1,
-	DDP_UNTAGGED_BUFFER = 2,
-	DDP_LLP = 3
-};
-
-enum i40iw_term_rdma_errors {
-	RDMAP_INV_STAG = 0x00,
-	RDMAP_INV_BOUNDS = 0x01,
-	RDMAP_ACCESS = 0x02,
-	RDMAP_UNASSOC_STAG = 0x03,
-	RDMAP_TO_WRAP = 0x04,
-	RDMAP_INV_RDMAP_VER = 0x05,
-	RDMAP_UNEXPECTED_OP = 0x06,
-	RDMAP_CATASTROPHIC_LOCAL = 0x07,
-	RDMAP_CATASTROPHIC_GLOBAL = 0x08,
-	RDMAP_CANT_INV_STAG = 0x09,
-	RDMAP_UNSPECIFIED = 0xff
-};
-
-enum i40iw_term_ddp_errors {
-	DDP_CATASTROPHIC_LOCAL = 0x00,
-	DDP_TAGGED_INV_STAG = 0x00,
-	DDP_TAGGED_BOUNDS = 0x01,
-	DDP_TAGGED_UNASSOC_STAG = 0x02,
-	DDP_TAGGED_TO_WRAP = 0x03,
-	DDP_TAGGED_INV_DDP_VER = 0x04,
-	DDP_UNTAGGED_INV_QN = 0x01,
-	DDP_UNTAGGED_INV_MSN_NO_BUF = 0x02,
-	DDP_UNTAGGED_INV_MSN_RANGE = 0x03,
-	DDP_UNTAGGED_INV_MO = 0x04,
-	DDP_UNTAGGED_INV_TOO_LONG = 0x05,
-	DDP_UNTAGGED_INV_DDP_VER = 0x06
-};
-
-enum i40iw_term_mpa_errors {
-	MPA_CLOSED = 0x01,
-	MPA_CRC = 0x02,
-	MPA_MARKER = 0x03,
-	MPA_REQ_RSP = 0x04,
-};
-
-enum i40iw_flush_opcode {
-	FLUSH_INVALID = 0,
-	FLUSH_PROT_ERR,
-	FLUSH_REM_ACCESS_ERR,
-	FLUSH_LOC_QP_OP_ERR,
-	FLUSH_REM_OP_ERR,
-	FLUSH_LOC_LEN_ERR,
-	FLUSH_GENERAL_ERR,
-	FLUSH_FATAL_ERR
-};
-
-enum i40iw_term_eventtypes {
-	TERM_EVENT_QP_FATAL,
-	TERM_EVENT_QP_ACCESS_ERR
-};
-
-struct i40iw_terminate_hdr {
-	u8 layer_etype;
-	u8 error_code;
-	u8 hdrct;
-	u8 rsvd;
-};
-
-enum i40iw_debug_flag {
-	I40IW_DEBUG_NONE	= 0x00000000,
-	I40IW_DEBUG_ERR		= 0x00000001,
-	I40IW_DEBUG_INIT	= 0x00000002,
-	I40IW_DEBUG_DEV		= 0x00000004,
-	I40IW_DEBUG_CM		= 0x00000008,
-	I40IW_DEBUG_VERBS	= 0x00000010,
-	I40IW_DEBUG_PUDA	= 0x00000020,
-	I40IW_DEBUG_ILQ		= 0x00000040,
-	I40IW_DEBUG_IEQ		= 0x00000080,
-	I40IW_DEBUG_QP		= 0x00000100,
-	I40IW_DEBUG_CQ		= 0x00000200,
-	I40IW_DEBUG_MR		= 0x00000400,
-	I40IW_DEBUG_PBLE	= 0x00000800,
-	I40IW_DEBUG_WQE		= 0x00001000,
-	I40IW_DEBUG_AEQ		= 0x00002000,
-	I40IW_DEBUG_CQP		= 0x00004000,
-	I40IW_DEBUG_HMC		= 0x00008000,
-	I40IW_DEBUG_USER	= 0x00010000,
-	I40IW_DEBUG_VIRT	= 0x00020000,
-	I40IW_DEBUG_DCB		= 0x00040000,
-	I40IW_DEBUG_CQE		= 0x00800000,
-	I40IW_DEBUG_ALL		= 0xFFFFFFFF
-};
-
-enum i40iw_hw_stats_index_32b {
-	I40IW_HW_STAT_INDEX_IP4RXDISCARD = 0,
-	I40IW_HW_STAT_INDEX_IP4RXTRUNC,
-	I40IW_HW_STAT_INDEX_IP4TXNOROUTE,
-	I40IW_HW_STAT_INDEX_IP6RXDISCARD,
-	I40IW_HW_STAT_INDEX_IP6RXTRUNC,
-	I40IW_HW_STAT_INDEX_IP6TXNOROUTE,
-	I40IW_HW_STAT_INDEX_TCPRTXSEG,
-	I40IW_HW_STAT_INDEX_TCPRXOPTERR,
-	I40IW_HW_STAT_INDEX_TCPRXPROTOERR,
-	I40IW_HW_STAT_INDEX_MAX_32
-};
-
-enum i40iw_hw_stats_index_64b {
-	I40IW_HW_STAT_INDEX_IP4RXOCTS = 0,
-	I40IW_HW_STAT_INDEX_IP4RXPKTS,
-	I40IW_HW_STAT_INDEX_IP4RXFRAGS,
-	I40IW_HW_STAT_INDEX_IP4RXMCPKTS,
-	I40IW_HW_STAT_INDEX_IP4TXOCTS,
-	I40IW_HW_STAT_INDEX_IP4TXPKTS,
-	I40IW_HW_STAT_INDEX_IP4TXFRAGS,
-	I40IW_HW_STAT_INDEX_IP4TXMCPKTS,
-	I40IW_HW_STAT_INDEX_IP6RXOCTS,
-	I40IW_HW_STAT_INDEX_IP6RXPKTS,
-	I40IW_HW_STAT_INDEX_IP6RXFRAGS,
-	I40IW_HW_STAT_INDEX_IP6RXMCPKTS,
-	I40IW_HW_STAT_INDEX_IP6TXOCTS,
-	I40IW_HW_STAT_INDEX_IP6TXPKTS,
-	I40IW_HW_STAT_INDEX_IP6TXFRAGS,
-	I40IW_HW_STAT_INDEX_IP6TXMCPKTS,
-	I40IW_HW_STAT_INDEX_TCPRXSEGS,
-	I40IW_HW_STAT_INDEX_TCPTXSEG,
-	I40IW_HW_STAT_INDEX_RDMARXRDS,
-	I40IW_HW_STAT_INDEX_RDMARXSNDS,
-	I40IW_HW_STAT_INDEX_RDMARXWRS,
-	I40IW_HW_STAT_INDEX_RDMATXRDS,
-	I40IW_HW_STAT_INDEX_RDMATXSNDS,
-	I40IW_HW_STAT_INDEX_RDMATXWRS,
-	I40IW_HW_STAT_INDEX_RDMAVBND,
-	I40IW_HW_STAT_INDEX_RDMAVINV,
-	I40IW_HW_STAT_INDEX_MAX_64
-};
-
-enum i40iw_feature_type {
-	I40IW_FEATURE_FW_INFO = 0,
-	I40IW_MAX_FEATURES
-};
-
-struct i40iw_dev_hw_stats_offsets {
-	u32 stats_offset_32[I40IW_HW_STAT_INDEX_MAX_32];
-	u32 stats_offset_64[I40IW_HW_STAT_INDEX_MAX_64];
-};
-
-struct i40iw_dev_hw_stats {
-	u64 stats_value_32[I40IW_HW_STAT_INDEX_MAX_32];
-	u64 stats_value_64[I40IW_HW_STAT_INDEX_MAX_64];
-};
-
-struct i40iw_vsi_pestat {
-	struct i40iw_hw *hw;
-	struct i40iw_dev_hw_stats hw_stats;
-	struct i40iw_dev_hw_stats last_read_hw_stats;
-	struct i40iw_dev_hw_stats_offsets hw_stats_offsets;
-	struct timer_list stats_timer;
-	struct i40iw_sc_vsi *vsi;
-	spinlock_t lock; /* rdma stats lock */
-};
-
-struct i40iw_hw {
-	u8 __iomem *hw_addr;
-	struct pci_dev *pcidev;
-	struct i40iw_hmc_info hmc;
-};
-
-struct i40iw_pfpdu {
-	struct list_head rxlist;
-	u32 rcv_nxt;
-	u32 fps;
-	u32 max_fpdu_data;
-	bool mode;
-	bool mpa_crc_err;
-	u64 total_ieq_bufs;
-	u64 fpdu_processed;
-	u64 bad_seq_num;
-	u64 crc_err;
-	u64 no_tx_bufs;
-	u64 tx_err;
-	u64 out_of_order;
-	u64 pmode_count;
-};
-
-struct i40iw_sc_pd {
-	u32 size;
-	struct i40iw_sc_dev *dev;
-	u16 pd_id;
-	int abi_ver;
-};
-
-struct i40iw_cqp_quanta {
-	u64 elem[I40IW_CQP_WQE_SIZE];
-};
-
-struct i40iw_sc_cqp {
-	u32 size;
-	u64 sq_pa;
-	u64 host_ctx_pa;
-	void *back_cqp;
-	struct i40iw_sc_dev *dev;
-	enum i40iw_status_code (*process_cqp_sds)(struct i40iw_sc_dev *,
-						  struct i40iw_update_sds_info *);
-	struct i40iw_dma_mem sdbuf;
-	struct i40iw_ring sq_ring;
-	struct i40iw_cqp_quanta *sq_base;
-	u64 *host_ctx;
-	u64 *scratch_array;
-	u32 cqp_id;
-	u32 sq_size;
-	u32 hw_sq_size;
-	u8 struct_ver;
-	u8 polarity;
-	bool en_datacenter_tcp;
-	u8 hmc_profile;
-	u8 enabled_vf_count;
-	u8 timeout_count;
-};
-
-struct i40iw_sc_aeq {
-	u32 size;
-	u64 aeq_elem_pa;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_sc_aeqe *aeqe_base;
-	void *pbl_list;
-	u32 elem_cnt;
-	struct i40iw_ring aeq_ring;
-	bool virtual_map;
-	u8 pbl_chunk_size;
-	u32 first_pm_pbl_idx;
-	u8 polarity;
-};
-
-struct i40iw_sc_ceq {
-	u32 size;
-	u64 ceq_elem_pa;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_ceqe *ceqe_base;
-	void *pbl_list;
-	u32 ceq_id;
-	u32 elem_cnt;
-	struct i40iw_ring ceq_ring;
-	bool virtual_map;
-	u8 pbl_chunk_size;
-	bool tph_en;
-	u8 tph_val;
-	u32 first_pm_pbl_idx;
-	u8 polarity;
-};
-
-struct i40iw_sc_cq {
-	struct i40iw_cq_uk cq_uk;
-	u64 cq_pa;
-	u64 shadow_area_pa;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_sc_vsi *vsi;
-	void *pbl_list;
-	void *back_cq;
-	u32 ceq_id;
-	u32 shadow_read_threshold;
-	bool ceqe_mask;
-	bool virtual_map;
-	u8 pbl_chunk_size;
-	u8 cq_type;
-	bool ceq_id_valid;
-	bool tph_en;
-	u8 tph_val;
-	u32 first_pm_pbl_idx;
-	bool check_overflow;
-};
-
-struct i40iw_sc_qp {
-	struct i40iw_qp_uk qp_uk;
-	u64 sq_pa;
-	u64 rq_pa;
-	u64 hw_host_ctx_pa;
-	u64 shadow_area_pa;
-	u64 q2_pa;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_sc_vsi *vsi;
-	struct i40iw_sc_pd *pd;
-	u64 *hw_host_ctx;
-	void *llp_stream_handle;
-	void *back_qp;
-	struct i40iw_pfpdu pfpdu;
-	u8 *q2_buf;
-	u64 qp_compl_ctx;
-	u16 qs_handle;
-	u8 sq_tph_val;
-	u8 rq_tph_val;
-	u8 qp_state;
-	u8 qp_type;
-	u8 hw_sq_size;
-	u8 hw_rq_size;
-	u8 src_mac_addr_idx;
-	bool sq_tph_en;
-	bool rq_tph_en;
-	bool rcv_tph_en;
-	bool xmit_tph_en;
-	bool virtual_map;
-	bool flush_sq;
-	bool flush_rq;
-	u8 user_pri;
-	struct list_head list;
-	bool on_qoslist;
-	bool sq_flush;
-	enum i40iw_flush_opcode flush_code;
-	enum i40iw_term_eventtypes eventtype;
-	u8 term_flags;
-};
-
-struct i40iw_hmc_fpm_misc {
-	u32 max_ceqs;
-	u32 max_sds;
-	u32 xf_block_size;
-	u32 q1_block_size;
-	u32 ht_multiplier;
-	u32 timer_bucket;
-};
-
-struct i40iw_vchnl_if {
-	enum i40iw_status_code (*vchnl_recv)(struct i40iw_sc_dev *, u32, u8 *, u16);
-	enum i40iw_status_code (*vchnl_send)(struct i40iw_sc_dev *dev, u32, u8 *, u16);
-};
-
-#define I40IW_VCHNL_MAX_VF_MSG_SIZE 512
-
-struct i40iw_vchnl_vf_msg_buffer {
-	struct i40iw_virtchnl_op_buf vchnl_msg;
-	char parm_buffer[I40IW_VCHNL_MAX_VF_MSG_SIZE - 1];
-};
-
-struct i40iw_qos {
-	struct list_head qplist;
-	spinlock_t lock;	/* qos list */
-	u16 qs_handle;
-};
-
-struct i40iw_vfdev {
-	struct i40iw_sc_dev *pf_dev;
-	u8 *hmc_info_mem;
-	struct i40iw_vsi_pestat pestat;
-	struct i40iw_hmc_pble_info *pble_info;
-	struct i40iw_hmc_info hmc_info;
-	struct i40iw_vchnl_vf_msg_buffer vf_msg_buffer;
-	u64 fpm_query_buf_pa;
-	u64 *fpm_query_buf;
-	u32 vf_id;
-	u32 msg_count;
-	bool pf_hmc_initialized;
-	u16 pmf_index;
-	u16 iw_vf_idx;		/* VF Device table index */
-	bool stats_initialized;
-};
-
-#define I40IW_INVALID_FCN_ID 0xff
-struct i40iw_sc_vsi {
-	struct i40iw_sc_dev *dev;
-	void *back_vsi; /* Owned by OS */
-	u32 ilq_count;
-	struct i40iw_virt_mem ilq_mem;
-	struct i40iw_puda_rsrc *ilq;
-	u32 ieq_count;
-	struct i40iw_virt_mem ieq_mem;
-	struct i40iw_puda_rsrc *ieq;
-	u16 exception_lan_queue;
-	u16 mtu;
-	u8 fcn_id;
-	bool stats_fcn_id_alloc;
-	struct i40iw_qos qos[I40IW_MAX_USER_PRIORITY];
-	struct i40iw_vsi_pestat *pestat;
-};
-
-struct i40iw_sc_dev {
-	struct list_head cqp_cmd_head;	/* head of the CQP command list */
-	spinlock_t cqp_lock; /* cqp list sync */
-	struct i40iw_dev_uk dev_uk;
-	bool fcn_id_array[I40IW_MAX_STATS_COUNT];
-	struct i40iw_dma_mem vf_fpm_query_buf[I40IW_MAX_PE_ENABLED_VF_COUNT];
-	u64 fpm_query_buf_pa;
-	u64 fpm_commit_buf_pa;
-	u64 *fpm_query_buf;
-	u64 *fpm_commit_buf;
-	void *back_dev;
-	struct i40iw_hw *hw;
-	u8 __iomem *db_addr;
-	struct i40iw_hmc_info *hmc_info;
-	struct i40iw_hmc_pble_info *pble_info;
-	struct i40iw_vfdev *vf_dev[I40IW_MAX_PE_ENABLED_VF_COUNT];
-	struct i40iw_sc_cqp *cqp;
-	struct i40iw_sc_aeq *aeq;
-	struct i40iw_sc_ceq *ceq[I40IW_CEQ_MAX_COUNT];
-	struct i40iw_sc_cq *ccq;
-	const struct i40iw_cqp_ops *cqp_ops;
-	const struct i40iw_ccq_ops *ccq_ops;
-	const struct i40iw_ceq_ops *ceq_ops;
-	const struct i40iw_aeq_ops *aeq_ops;
-	const struct i40iw_pd_ops *iw_pd_ops;
-	const struct i40iw_priv_qp_ops *iw_priv_qp_ops;
-	const struct i40iw_priv_cq_ops *iw_priv_cq_ops;
-	const struct i40iw_mr_ops *mr_ops;
-	const struct i40iw_cqp_misc_ops *cqp_misc_ops;
-	const struct i40iw_hmc_ops *hmc_ops;
-	struct i40iw_vchnl_if vchnl_if;
-	const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops;
-
-	struct i40iw_hmc_fpm_misc hmc_fpm_misc;
-	u64 feature_info[I40IW_MAX_FEATURES];
-	u32 debug_mask;
-	u8 hmc_fn_id;
-	bool is_pf;
-	bool vchnl_up;
-	bool ceq_valid;
-	u8 vf_id;
-	wait_queue_head_t vf_reqs;
-	u64 cqp_cmd_stats[OP_SIZE_CQP_STAT_ARRAY];
-	struct i40iw_vchnl_vf_msg_buffer vchnl_vf_msg_buf;
-	u8 hw_rev;
-};
-
-struct i40iw_modify_cq_info {
-	u64 cq_pa;
-	struct i40iw_cqe *cq_base;
-	void *pbl_list;
-	u32 ceq_id;
-	u32 cq_size;
-	u32 shadow_read_threshold;
-	bool virtual_map;
-	u8 pbl_chunk_size;
-	bool check_overflow;
-	bool cq_resize;
-	bool ceq_change;
-	bool check_overflow_change;
-	u32 first_pm_pbl_idx;
-	bool ceq_valid;
-};
-
-struct i40iw_create_qp_info {
-	u8 next_iwarp_state;
-	bool ord_valid;
-	bool tcp_ctx_valid;
-	bool cq_num_valid;
-	bool arp_cache_idx_valid;
-};
-
-struct i40iw_modify_qp_info {
-	u64 rx_win0;
-	u64 rx_win1;
-	u8 next_iwarp_state;
-	u8 termlen;
-	bool ord_valid;
-	bool tcp_ctx_valid;
-	bool cq_num_valid;
-	bool arp_cache_idx_valid;
-	bool reset_tcp_conn;
-	bool remove_hash_idx;
-	bool dont_send_term;
-	bool dont_send_fin;
-	bool cached_var_valid;
-	bool force_loopback;
-};
-
-struct i40iw_ccq_cqe_info {
-	struct i40iw_sc_cqp *cqp;
-	u64 scratch;
-	u32 op_ret_val;
-	u16 maj_err_code;
-	u16 min_err_code;
-	u8 op_code;
-	bool error;
-};
-
-struct i40iw_l2params {
-	u16 qs_handle_list[I40IW_MAX_USER_PRIORITY];
-	u16 mtu;
-};
-
-struct i40iw_vsi_init_info {
-	struct i40iw_sc_dev *dev;
-	void  *back_vsi;
-	struct i40iw_l2params *params;
-	u16 exception_lan_queue;
-};
-
-struct i40iw_vsi_stats_info {
-	struct i40iw_vsi_pestat *pestat;
-	u8 fcn_id;
-	bool alloc_fcn_id;
-	bool stats_initialize;
-};
-
-struct i40iw_device_init_info {
-	u64 fpm_query_buf_pa;
-	u64 fpm_commit_buf_pa;
-	u64 *fpm_query_buf;
-	u64 *fpm_commit_buf;
-	struct i40iw_hw *hw;
-	void __iomem *bar0;
-	enum i40iw_status_code (*vchnl_send)(struct i40iw_sc_dev *, u32, u8 *, u16);
-	u8 hmc_fn_id;
-	bool is_pf;
-	u32 debug_mask;
-};
-
-enum i40iw_cqp_hmc_profile {
-	I40IW_HMC_PROFILE_DEFAULT = 1,
-	I40IW_HMC_PROFILE_FAVOR_VF = 2,
-	I40IW_HMC_PROFILE_EQUAL = 3,
-};
-
-struct i40iw_cqp_init_info {
-	u64 cqp_compl_ctx;
-	u64 host_ctx_pa;
-	u64 sq_pa;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_cqp_quanta *sq;
-	u64 *host_ctx;
-	u64 *scratch_array;
-	u32 sq_size;
-	u8 struct_ver;
-	bool en_datacenter_tcp;
-	u8 hmc_profile;
-	u8 enabled_vf_count;
-};
-
-struct i40iw_ceq_init_info {
-	u64 ceqe_pa;
-	struct i40iw_sc_dev *dev;
-	u64 *ceqe_base;
-	void *pbl_list;
-	u32 elem_cnt;
-	u32 ceq_id;
-	bool virtual_map;
-	u8 pbl_chunk_size;
-	bool tph_en;
-	u8 tph_val;
-	u32 first_pm_pbl_idx;
-};
-
-struct i40iw_aeq_init_info {
-	u64 aeq_elem_pa;
-	struct i40iw_sc_dev *dev;
-	u32 *aeqe_base;
-	void *pbl_list;
-	u32 elem_cnt;
-	bool virtual_map;
-	u8 pbl_chunk_size;
-	u32 first_pm_pbl_idx;
-};
-
-struct i40iw_ccq_init_info {
-	u64 cq_pa;
-	u64 shadow_area_pa;
-	struct i40iw_sc_dev *dev;
-	struct i40iw_cqe *cq_base;
-	u64 *shadow_area;
-	void *pbl_list;
-	u32 num_elem;
-	u32 ceq_id;
-	u32 shadow_read_threshold;
-	bool ceqe_mask;
-	bool ceq_id_valid;
-	bool tph_en;
-	u8 tph_val;
-	bool avoid_mem_cflct;
-	bool virtual_map;
-	u8 pbl_chunk_size;
-	u32 first_pm_pbl_idx;
-};
-
-struct i40iwarp_offload_info {
-	u16 rcv_mark_offset;
-	u16 snd_mark_offset;
-	u16 pd_id;
-	u8 ddp_ver;
-	u8 rdmap_ver;
-	u8 ord_size;
-	u8 ird_size;
-	bool wr_rdresp_en;
-	bool rd_enable;
-	bool snd_mark_en;
-	bool rcv_mark_en;
-	bool bind_en;
-	bool fast_reg_en;
-	bool priv_mode_en;
-	bool lsmm_present;
-	u8 iwarp_mode;
-	bool align_hdrs;
-	bool rcv_no_mpa_crc;
-
-	u8 last_byte_sent;
-};
-
-struct i40iw_tcp_offload_info {
-	bool ipv4;
-	bool no_nagle;
-	bool insert_vlan_tag;
-	bool time_stamp;
-	u8 cwnd_inc_limit;
-	bool drop_ooo_seg;
-	u8 dup_ack_thresh;
-	u8 ttl;
-	u8 src_mac_addr_idx;
-	bool avoid_stretch_ack;
-	u8 tos;
-	u16 src_port;
-	u16 dst_port;
-	u32 dest_ip_addr0;
-	u32 dest_ip_addr1;
-	u32 dest_ip_addr2;
-	u32 dest_ip_addr3;
-	u32 snd_mss;
-	u16 vlan_tag;
-	u16 arp_idx;
-	u32 flow_label;
-	bool wscale;
-	u8 tcp_state;
-	u8 snd_wscale;
-	u8 rcv_wscale;
-	u32 time_stamp_recent;
-	u32 time_stamp_age;
-	u32 snd_nxt;
-	u32 snd_wnd;
-	u32 rcv_nxt;
-	u32 rcv_wnd;
-	u32 snd_max;
-	u32 snd_una;
-	u32 srtt;
-	u32 rtt_var;
-	u32 ss_thresh;
-	u32 cwnd;
-	u32 snd_wl1;
-	u32 snd_wl2;
-	u32 max_snd_window;
-	u8 rexmit_thresh;
-	u32 local_ipaddr0;
-	u32 local_ipaddr1;
-	u32 local_ipaddr2;
-	u32 local_ipaddr3;
-	bool ignore_tcp_opt;
-	bool ignore_tcp_uns_opt;
-};
-
-struct i40iw_qp_host_ctx_info {
-	u64 qp_compl_ctx;
-	struct i40iw_tcp_offload_info *tcp_info;
-	struct i40iwarp_offload_info *iwarp_info;
-	u32 send_cq_num;
-	u32 rcv_cq_num;
-	bool tcp_info_valid;
-	bool iwarp_info_valid;
-	bool err_rq_idx_valid;
-	u16 err_rq_idx;
-	bool add_to_qoslist;
-	u8 user_pri;
-};
-
-struct i40iw_aeqe_info {
-	u64 compl_ctx;
-	u32 qp_cq_id;
-	u16 ae_id;
-	u16 wqe_idx;
-	u8 tcp_state;
-	u8 iwarp_state;
-	bool qp;
-	bool cq;
-	bool sq;
-	bool in_rdrsp_wr;
-	bool out_rdrsp;
-	u8 q2_data_written;
-	bool aeqe_overflow;
-};
-
-struct i40iw_allocate_stag_info {
-	u64 total_len;
-	u32 chunk_size;
-	u32 stag_idx;
-	u32 page_size;
-	u16 pd_id;
-	u16 access_rights;
-	bool remote_access;
-	bool use_hmc_fcn_index;
-	u8 hmc_fcn_index;
-	bool use_pf_rid;
-};
-
-struct i40iw_reg_ns_stag_info {
-	u64 reg_addr_pa;
-	u64 fbo;
-	void *va;
-	u64 total_len;
-	u32 page_size;
-	u32 chunk_size;
-	u32 first_pm_pbl_index;
-	enum i40iw_addressing_type addr_type;
-	i40iw_stag_index stag_idx;
-	u16 access_rights;
-	u16 pd_id;
-	i40iw_stag_key stag_key;
-	bool use_hmc_fcn_index;
-	u8 hmc_fcn_index;
-	bool use_pf_rid;
-};
-
-struct i40iw_fast_reg_stag_info {
-	u64 wr_id;
-	u64 reg_addr_pa;
-	u64 fbo;
-	void *va;
-	u64 total_len;
-	u32 page_size;
-	u32 chunk_size;
-	u32 first_pm_pbl_index;
-	enum i40iw_addressing_type addr_type;
-	i40iw_stag_index stag_idx;
-	u16 access_rights;
-	u16 pd_id;
-	i40iw_stag_key stag_key;
-	bool local_fence;
-	bool read_fence;
-	bool signaled;
-	bool use_hmc_fcn_index;
-	u8 hmc_fcn_index;
-	bool use_pf_rid;
-	bool defer_flag;
-};
-
-struct i40iw_dealloc_stag_info {
-	u32 stag_idx;
-	u16 pd_id;
-	bool mr;
-	bool dealloc_pbl;
-};
-
-struct i40iw_register_shared_stag {
-	void *va;
-	enum i40iw_addressing_type addr_type;
-	i40iw_stag_index new_stag_idx;
-	i40iw_stag_index parent_stag_idx;
-	u32 access_rights;
-	u16 pd_id;
-	i40iw_stag_key new_stag_key;
-};
-
-struct i40iw_qp_init_info {
-	struct i40iw_qp_uk_init_info qp_uk_init_info;
-	struct i40iw_sc_pd *pd;
-	struct i40iw_sc_vsi *vsi;
-	u64 *host_ctx;
-	u8 *q2;
-	u64 sq_pa;
-	u64 rq_pa;
-	u64 host_ctx_pa;
-	u64 q2_pa;
-	u64 shadow_area_pa;
-	int abi_ver;
-	u8 sq_tph_val;
-	u8 rq_tph_val;
-	u8 type;
-	bool sq_tph_en;
-	bool rq_tph_en;
-	bool rcv_tph_en;
-	bool xmit_tph_en;
-	bool virtual_map;
-};
-
-struct i40iw_cq_init_info {
-	struct i40iw_sc_dev *dev;
-	u64 cq_base_pa;
-	u64 shadow_area_pa;
-	u32 ceq_id;
-	u32 shadow_read_threshold;
-	bool virtual_map;
-	bool ceqe_mask;
-	u8 pbl_chunk_size;
-	u32 first_pm_pbl_idx;
-	bool ceq_id_valid;
-	bool tph_en;
-	u8 tph_val;
-	u8 type;
-	struct i40iw_cq_uk_init_info cq_uk_init_info;
-};
-
-struct i40iw_upload_context_info {
-	u64 buf_pa;
-	bool freeze_qp;
-	bool raw_format;
-	u32 qp_id;
-	u8 qp_type;
-};
-
-struct i40iw_add_arp_cache_entry_info {
-	u8 mac_addr[6];
-	u32 reach_max;
-	u16 arp_index;
-	bool permanent;
-};
-
-struct i40iw_apbvt_info {
-	u16 port;
-	bool add;
-};
-
-enum i40iw_quad_entry_type {
-	I40IW_QHASH_TYPE_TCP_ESTABLISHED = 1,
-	I40IW_QHASH_TYPE_TCP_SYN,
-};
-
-enum i40iw_quad_hash_manage_type {
-	I40IW_QHASH_MANAGE_TYPE_DELETE = 0,
-	I40IW_QHASH_MANAGE_TYPE_ADD,
-	I40IW_QHASH_MANAGE_TYPE_MODIFY
-};
-
-struct i40iw_qhash_table_info {
-	struct i40iw_sc_vsi *vsi;
-	enum i40iw_quad_hash_manage_type manage;
-	enum i40iw_quad_entry_type entry_type;
-	bool vlan_valid;
-	bool ipv4_valid;
-	u8 mac_addr[6];
-	u16 vlan_id;
-	u8 user_pri;
-	u32 qp_num;
-	u32 dest_ip[4];
-	u32 src_ip[4];
-	u16 dest_port;
-	u16 src_port;
-};
-
-struct i40iw_local_mac_ipaddr_entry_info {
-	u8 mac_addr[6];
-	u8 entry_idx;
-};
-
-struct i40iw_qp_flush_info {
-	u16 sq_minor_code;
-	u16 sq_major_code;
-	u16 rq_minor_code;
-	u16 rq_major_code;
-	u16 ae_code;
-	u8 ae_source;
-	bool sq;
-	bool rq;
-	bool userflushcode;
-	bool generate_ae;
-};
-
-struct i40iw_cqp_commit_fpm_values {
-	u64 qp_base;
-	u64 cq_base;
-	u32 hte_base;
-	u32 arp_base;
-	u32 apbvt_inuse_base;
-	u32 mr_base;
-	u32 xf_base;
-	u32 xffl_base;
-	u32 q1_base;
-	u32 q1fl_base;
-	u32 fsimc_base;
-	u32 fsiav_base;
-	u32 pbl_base;
-
-	u32 qp_cnt;
-	u32 cq_cnt;
-	u32 hte_cnt;
-	u32 arp_cnt;
-	u32 mr_cnt;
-	u32 xf_cnt;
-	u32 xffl_cnt;
-	u32 q1_cnt;
-	u32 q1fl_cnt;
-	u32 fsimc_cnt;
-	u32 fsiav_cnt;
-	u32 pbl_cnt;
-};
-
-struct i40iw_cqp_query_fpm_values {
-	u16 first_pe_sd_index;
-	u32 qp_objsize;
-	u32 cq_objsize;
-	u32 hte_objsize;
-	u32 arp_objsize;
-	u32 mr_objsize;
-	u32 xf_objsize;
-	u32 q1_objsize;
-	u32 fsimc_objsize;
-	u32 fsiav_objsize;
-
-	u32 qp_max;
-	u32 cq_max;
-	u32 hte_max;
-	u32 arp_max;
-	u32 mr_max;
-	u32 xf_max;
-	u32 xffl_max;
-	u32 q1_max;
-	u32 q1fl_max;
-	u32 fsimc_max;
-	u32 fsiav_max;
-	u32 pbl_max;
-};
-
-struct i40iw_gen_ae_info {
-	u16 ae_code;
-	u8 ae_source;
-};
-
-struct i40iw_cqp_ops {
-	enum i40iw_status_code (*cqp_init)(struct i40iw_sc_cqp *,
-					   struct i40iw_cqp_init_info *);
-	enum i40iw_status_code (*cqp_create)(struct i40iw_sc_cqp *, u16 *, u16 *);
-	void (*cqp_post_sq)(struct i40iw_sc_cqp *);
-	u64 *(*cqp_get_next_send_wqe)(struct i40iw_sc_cqp *, u64 scratch);
-	enum i40iw_status_code (*cqp_destroy)(struct i40iw_sc_cqp *);
-	enum i40iw_status_code (*poll_for_cqp_op_done)(struct i40iw_sc_cqp *, u8,
-						       struct i40iw_ccq_cqe_info *);
-};
-
-struct i40iw_ccq_ops {
-	enum i40iw_status_code (*ccq_init)(struct i40iw_sc_cq *,
-					   struct i40iw_ccq_init_info *);
-	enum i40iw_status_code (*ccq_create)(struct i40iw_sc_cq *, u64, bool, bool);
-	enum i40iw_status_code (*ccq_destroy)(struct i40iw_sc_cq *, u64, bool);
-	enum i40iw_status_code (*ccq_create_done)(struct i40iw_sc_cq *);
-	enum i40iw_status_code (*ccq_get_cqe_info)(struct i40iw_sc_cq *,
-						   struct i40iw_ccq_cqe_info *);
-	void (*ccq_arm)(struct i40iw_sc_cq *);
-};
-
-struct i40iw_ceq_ops {
-	enum i40iw_status_code (*ceq_init)(struct i40iw_sc_ceq *,
-					   struct i40iw_ceq_init_info *);
-	enum i40iw_status_code (*ceq_create)(struct i40iw_sc_ceq *, u64, bool);
-	enum i40iw_status_code (*cceq_create_done)(struct i40iw_sc_ceq *);
-	enum i40iw_status_code (*cceq_destroy_done)(struct i40iw_sc_ceq *);
-	enum i40iw_status_code (*cceq_create)(struct i40iw_sc_ceq *, u64);
-	enum i40iw_status_code (*ceq_destroy)(struct i40iw_sc_ceq *, u64, bool);
-	void *(*process_ceq)(struct i40iw_sc_dev *, struct i40iw_sc_ceq *);
-};
-
-struct i40iw_aeq_ops {
-	enum i40iw_status_code (*aeq_init)(struct i40iw_sc_aeq *,
-					   struct i40iw_aeq_init_info *);
-	enum i40iw_status_code (*aeq_create)(struct i40iw_sc_aeq *, u64, bool);
-	enum i40iw_status_code (*aeq_destroy)(struct i40iw_sc_aeq *, u64, bool);
-	enum i40iw_status_code (*get_next_aeqe)(struct i40iw_sc_aeq *,
-						struct i40iw_aeqe_info *);
-	enum i40iw_status_code (*repost_aeq_entries)(struct i40iw_sc_dev *, u32);
-	enum i40iw_status_code (*aeq_create_done)(struct i40iw_sc_aeq *);
-	enum i40iw_status_code (*aeq_destroy_done)(struct i40iw_sc_aeq *);
-};
-
-struct i40iw_pd_ops {
-	void (*pd_init)(struct i40iw_sc_dev *, struct i40iw_sc_pd *, u16, int);
-};
-
-struct i40iw_priv_qp_ops {
-	enum i40iw_status_code (*qp_init)(struct i40iw_sc_qp *, struct i40iw_qp_init_info *);
-	enum i40iw_status_code (*qp_create)(struct i40iw_sc_qp *,
-					    struct i40iw_create_qp_info *, u64, bool);
-	enum i40iw_status_code (*qp_modify)(struct i40iw_sc_qp *,
-					    struct i40iw_modify_qp_info *, u64, bool);
-	enum i40iw_status_code (*qp_destroy)(struct i40iw_sc_qp *, u64, bool, bool, bool);
-	enum i40iw_status_code (*qp_flush_wqes)(struct i40iw_sc_qp *,
-						struct i40iw_qp_flush_info *, u64, bool);
-	enum i40iw_status_code (*qp_upload_context)(struct i40iw_sc_dev *,
-						    struct i40iw_upload_context_info *,
-						    u64, bool);
-	enum i40iw_status_code (*qp_setctx)(struct i40iw_sc_qp *, u64 *,
-					    struct i40iw_qp_host_ctx_info *);
-
-	void (*qp_send_lsmm)(struct i40iw_sc_qp *, void *, u32, i40iw_stag);
-	void (*qp_send_lsmm_nostag)(struct i40iw_sc_qp *, void *, u32);
-	void (*qp_send_rtt)(struct i40iw_sc_qp *, bool);
-	enum i40iw_status_code (*qp_post_wqe0)(struct i40iw_sc_qp *, u8);
-	enum i40iw_status_code (*iw_mr_fast_register)(struct i40iw_sc_qp *,
-						      struct i40iw_fast_reg_stag_info *,
-						      bool);
-};
-
-struct i40iw_priv_cq_ops {
-	enum i40iw_status_code (*cq_init)(struct i40iw_sc_cq *, struct i40iw_cq_init_info *);
-	enum i40iw_status_code (*cq_create)(struct i40iw_sc_cq *, u64, bool, bool);
-	enum i40iw_status_code (*cq_destroy)(struct i40iw_sc_cq *, u64, bool);
-	enum i40iw_status_code (*cq_modify)(struct i40iw_sc_cq *,
-					    struct i40iw_modify_cq_info *, u64, bool);
-};
-
-struct i40iw_mr_ops {
-	enum i40iw_status_code (*alloc_stag)(struct i40iw_sc_dev *,
-					     struct i40iw_allocate_stag_info *, u64, bool);
-	enum i40iw_status_code (*mr_reg_non_shared)(struct i40iw_sc_dev *,
-						    struct i40iw_reg_ns_stag_info *,
-						    u64, bool);
-	enum i40iw_status_code (*mr_reg_shared)(struct i40iw_sc_dev *,
-						struct i40iw_register_shared_stag *,
-						u64, bool);
-	enum i40iw_status_code (*dealloc_stag)(struct i40iw_sc_dev *,
-					       struct i40iw_dealloc_stag_info *,
-					       u64, bool);
-	enum i40iw_status_code (*query_stag)(struct i40iw_sc_dev *, u64, u32, bool);
-	enum i40iw_status_code (*mw_alloc)(struct i40iw_sc_dev *, u64, u32, u16, bool);
-};
-
-struct i40iw_cqp_misc_ops {
-	enum i40iw_status_code (*manage_hmc_pm_func_table)(struct i40iw_sc_cqp *,
-							   u64, u8, bool, bool);
-	enum i40iw_status_code (*set_hmc_resource_profile)(struct i40iw_sc_cqp *,
-							   u64, u8, u8, bool, bool);
-	enum i40iw_status_code (*commit_fpm_values)(struct i40iw_sc_cqp *, u64, u8,
-						    struct i40iw_dma_mem *, bool, u8);
-	enum i40iw_status_code (*query_fpm_values)(struct i40iw_sc_cqp *, u64, u8,
-						   struct i40iw_dma_mem *, bool, u8);
-	enum i40iw_status_code (*static_hmc_pages_allocated)(struct i40iw_sc_cqp *,
-							     u64, u8, bool, bool);
-	enum i40iw_status_code (*add_arp_cache_entry)(struct i40iw_sc_cqp *,
-						      struct i40iw_add_arp_cache_entry_info *,
-						      u64, bool);
-	enum i40iw_status_code (*del_arp_cache_entry)(struct i40iw_sc_cqp *, u64, u16, bool);
-	enum i40iw_status_code (*query_arp_cache_entry)(struct i40iw_sc_cqp *, u64, u16, bool);
-	enum i40iw_status_code (*manage_apbvt_entry)(struct i40iw_sc_cqp *,
-						     struct i40iw_apbvt_info *, u64, bool);
-	enum i40iw_status_code (*manage_qhash_table_entry)(struct i40iw_sc_cqp *,
-							   struct i40iw_qhash_table_info *, u64, bool);
-	enum i40iw_status_code (*alloc_local_mac_ipaddr_table_entry)(struct i40iw_sc_cqp *, u64, bool);
-	enum i40iw_status_code (*add_local_mac_ipaddr_entry)(struct i40iw_sc_cqp *,
-							     struct i40iw_local_mac_ipaddr_entry_info *,
-							     u64, bool);
-	enum i40iw_status_code (*del_local_mac_ipaddr_entry)(struct i40iw_sc_cqp *, u64, u8, u8, bool);
-	enum i40iw_status_code (*cqp_nop)(struct i40iw_sc_cqp *, u64, bool);
-	enum i40iw_status_code (*commit_fpm_values_done)(struct i40iw_sc_cqp
-							  *);
-	enum i40iw_status_code (*query_fpm_values_done)(struct i40iw_sc_cqp *);
-	enum i40iw_status_code (*manage_hmc_pm_func_table_done)(struct i40iw_sc_cqp *);
-	enum i40iw_status_code (*update_suspend_qp)(struct i40iw_sc_cqp *, struct i40iw_sc_qp *, u64);
-	enum i40iw_status_code (*update_resume_qp)(struct i40iw_sc_cqp *, struct i40iw_sc_qp *, u64);
-};
-
-struct i40iw_hmc_ops {
-	enum i40iw_status_code (*init_iw_hmc)(struct i40iw_sc_dev *, u8);
-	enum i40iw_status_code (*parse_fpm_query_buf)(u64 *, struct i40iw_hmc_info *,
-						      struct i40iw_hmc_fpm_misc *);
-	enum i40iw_status_code (*configure_iw_fpm)(struct i40iw_sc_dev *, u8);
-	enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *, u32 *sd);
-	enum i40iw_status_code (*create_hmc_object)(struct i40iw_sc_dev *dev,
-						    struct i40iw_hmc_create_obj_info *);
-	enum i40iw_status_code (*del_hmc_object)(struct i40iw_sc_dev *dev,
-						 struct i40iw_hmc_del_obj_info *,
-						 bool reset);
-	enum i40iw_status_code (*pf_init_vfhmc)(struct i40iw_sc_dev *, u8, u32 *);
-	enum i40iw_status_code (*vf_configure_vffpm)(struct i40iw_sc_dev *, u32 *);
-};
-
-struct cqp_info {
-	union {
-		struct {
-			struct i40iw_sc_qp *qp;
-			struct i40iw_create_qp_info info;
-			u64 scratch;
-		} qp_create;
-
-		struct {
-			struct i40iw_sc_qp *qp;
-			struct i40iw_modify_qp_info info;
-			u64 scratch;
-		} qp_modify;
-
-		struct {
-			struct i40iw_sc_qp *qp;
-			u64 scratch;
-			bool remove_hash_idx;
-			bool ignore_mw_bnd;
-		} qp_destroy;
-
-		struct {
-			struct i40iw_sc_cq *cq;
-			u64 scratch;
-			bool check_overflow;
-		} cq_create;
-
-		struct {
-			struct i40iw_sc_cq *cq;
-			u64 scratch;
-		} cq_destroy;
-
-		struct {
-			struct i40iw_sc_dev *dev;
-			struct i40iw_allocate_stag_info info;
-			u64 scratch;
-		} alloc_stag;
-
-		struct {
-			struct i40iw_sc_dev *dev;
-			u64 scratch;
-			u32 mw_stag_index;
-			u16 pd_id;
-		} mw_alloc;
-
-		struct {
-			struct i40iw_sc_dev *dev;
-			struct i40iw_reg_ns_stag_info info;
-			u64 scratch;
-		} mr_reg_non_shared;
-
-		struct {
-			struct i40iw_sc_dev *dev;
-			struct i40iw_dealloc_stag_info info;
-			u64 scratch;
-		} dealloc_stag;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			struct i40iw_local_mac_ipaddr_entry_info info;
-			u64 scratch;
-		} add_local_mac_ipaddr_entry;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			struct i40iw_add_arp_cache_entry_info info;
-			u64 scratch;
-		} add_arp_cache_entry;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			u64 scratch;
-			u8 entry_idx;
-			u8 ignore_ref_count;
-		} del_local_mac_ipaddr_entry;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			u64 scratch;
-			u16 arp_index;
-		} del_arp_cache_entry;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			struct i40iw_manage_vf_pble_info info;
-			u64 scratch;
-		} manage_vf_pble_bp;
-
-		struct {
-			struct i40iw_sc_dev *dev;
-			struct i40iw_upload_context_info info;
-			u64 scratch;
-		} qp_upload_context;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			u64 scratch;
-		} alloc_local_mac_ipaddr_entry;
-
-		struct {
-			struct i40iw_sc_dev *dev;
-			struct i40iw_hmc_fcn_info info;
-			u64 scratch;
-		} manage_hmc_pm;
-
-		struct {
-			struct i40iw_sc_ceq *ceq;
-			u64 scratch;
-		} ceq_create;
-
-		struct {
-			struct i40iw_sc_ceq *ceq;
-			u64 scratch;
-		} ceq_destroy;
-
-		struct {
-			struct i40iw_sc_aeq *aeq;
-			u64 scratch;
-		} aeq_create;
-
-		struct {
-			struct i40iw_sc_aeq *aeq;
-			u64 scratch;
-		} aeq_destroy;
-
-		struct {
-			struct i40iw_sc_qp *qp;
-			struct i40iw_qp_flush_info info;
-			u64 scratch;
-		} qp_flush_wqes;
-
-		struct {
-			struct i40iw_sc_qp *qp;
-			struct i40iw_gen_ae_info info;
-			u64 scratch;
-		} gen_ae;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			void *fpm_values_va;
-			u64 fpm_values_pa;
-			u8 hmc_fn_id;
-			u64 scratch;
-		} query_fpm_values;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			void *fpm_values_va;
-			u64 fpm_values_pa;
-			u8 hmc_fn_id;
-			u64 scratch;
-		} commit_fpm_values;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			struct i40iw_apbvt_info info;
-			u64 scratch;
-		} manage_apbvt_entry;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			struct i40iw_qhash_table_info info;
-			u64 scratch;
-		} manage_qhash_table_entry;
-
-		struct {
-			struct i40iw_sc_dev *dev;
-			struct i40iw_update_sds_info info;
-			u64 scratch;
-		} update_pe_sds;
-
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			struct i40iw_sc_qp *qp;
-			u64 scratch;
-		} suspend_resume;
-		struct {
-			struct i40iw_sc_cqp *cqp;
-			void *cap_va;
-			u64 cap_pa;
-			u64 scratch;
-		} query_rdma_features;
-	} u;
-};
-
-struct cqp_commands_info {
-	struct list_head cqp_cmd_entry;
-	u8 cqp_cmd;
-	u8 post_sq;
-	struct cqp_info in;
-};
-
-struct i40iw_virtchnl_work_info {
-	void (*callback_fcn)(void *vf_dev);
-	void *worker_vf_dev;
-};
-
-struct i40iw_cqp_timeout {
-	u64 compl_cqp_cmds;
-	u8 count;
-};
-
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c
deleted file mode 100644
index f521be16bf31..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_uk.c
+++ /dev/null
@@ -1,1200 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include "i40iw_osdep.h"
-#include "i40iw_status.h"
-#include "i40iw_d.h"
-#include "i40iw_user.h"
-#include "i40iw_register.h"
-
-static u32 nop_signature = 0x55550000;
-
-/**
- * i40iw_nop_1 - insert a nop wqe and move head. no post work
- * @qp: hw qp ptr
- */
-static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp)
-{
-	u64 header, *wqe;
-	u64 *wqe_0 = NULL;
-	u32 wqe_idx, peek_head;
-	bool signaled = false;
-
-	if (!qp->sq_ring.head)
-		return I40IW_ERR_PARAM;
-
-	wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
-	wqe = qp->sq_base[wqe_idx].elem;
-
-	qp->sq_wrtrk_array[wqe_idx].wqe_size = I40IW_QP_WQE_MIN_SIZE;
-
-	peek_head = (qp->sq_ring.head + 1) % qp->sq_ring.size;
-	wqe_0 = qp->sq_base[peek_head].elem;
-	if (peek_head)
-		wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID);
-	else
-		wqe_0[3] = LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	set_64bit_val(wqe, 0, 0);
-	set_64bit_val(wqe, 8, 0);
-	set_64bit_val(wqe, 16, 0);
-
-	header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) |
-	    LS_64(signaled, I40IWQPSQ_SIGCOMPL) |
-	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID) | nop_signature++;
-
-	wmb();	/* Memory barrier to ensure data is written before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-	return 0;
-}
-
-/**
- * i40iw_qp_post_wr - post wr to hrdware
- * @qp: hw qp ptr
- */
-void i40iw_qp_post_wr(struct i40iw_qp_uk *qp)
-{
-	u64 temp;
-	u32 hw_sq_tail;
-	u32 sw_sq_head;
-
-	mb(); /* valid bit is written and loads completed before reading shadow */
-
-	/* read the doorbell shadow area */
-	get_64bit_val(qp->shadow_area, 0, &temp);
-
-	hw_sq_tail = (u32)RS_64(temp, I40IW_QP_DBSA_HW_SQ_TAIL);
-	sw_sq_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
-	if (sw_sq_head != hw_sq_tail) {
-		if (sw_sq_head > qp->initial_ring.head) {
-			if ((hw_sq_tail >= qp->initial_ring.head) &&
-			    (hw_sq_tail < sw_sq_head)) {
-				writel(qp->qp_id, qp->wqe_alloc_reg);
-			}
-		} else if (sw_sq_head != qp->initial_ring.head) {
-			if ((hw_sq_tail >= qp->initial_ring.head) ||
-			    (hw_sq_tail < sw_sq_head)) {
-				writel(qp->qp_id, qp->wqe_alloc_reg);
-			}
-		}
-	}
-
-	qp->initial_ring.head = qp->sq_ring.head;
-}
-
-/**
- * i40iw_qp_get_next_send_wqe - return next wqe ptr
- * @qp: hw qp ptr
- * @wqe_idx: return wqe index
- * @wqe_size: size of sq wqe
- * @total_size: work request length
- * @wr_id: work request id
- */
-u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp,
-				u32 *wqe_idx,
-				u8 wqe_size,
-				u32 total_size,
-				u64 wr_id
-				)
-{
-	u64 *wqe = NULL;
-	u64 wqe_ptr;
-	u32 peek_head = 0;
-	u16 offset;
-	enum i40iw_status_code ret_code = 0;
-	u8 nop_wqe_cnt = 0, i;
-	u64 *wqe_0 = NULL;
-
-	*wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
-
-	if (!*wqe_idx)
-		qp->swqe_polarity = !qp->swqe_polarity;
-	wqe_ptr = (uintptr_t)qp->sq_base[*wqe_idx].elem;
-	offset = (u16)(wqe_ptr) & 0x7F;
-	if ((offset + wqe_size) > I40IW_QP_WQE_MAX_SIZE) {
-		nop_wqe_cnt = (u8)(I40IW_QP_WQE_MAX_SIZE - offset) / I40IW_QP_WQE_MIN_SIZE;
-		for (i = 0; i < nop_wqe_cnt; i++) {
-			i40iw_nop_1(qp);
-			I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
-			if (ret_code)
-				return NULL;
-		}
-
-		*wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
-		if (!*wqe_idx)
-			qp->swqe_polarity = !qp->swqe_polarity;
-	}
-
-	if (((*wqe_idx & 3) == 1) && (wqe_size == I40IW_WQE_SIZE_64)) {
-		i40iw_nop_1(qp);
-		I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
-		if (ret_code)
-			return NULL;
-		*wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
-		if (!*wqe_idx)
-			qp->swqe_polarity = !qp->swqe_polarity;
-	}
-	I40IW_RING_MOVE_HEAD_BY_COUNT(qp->sq_ring,
-				      wqe_size / I40IW_QP_WQE_MIN_SIZE, ret_code);
-	if (ret_code)
-		return NULL;
-
-	wqe = qp->sq_base[*wqe_idx].elem;
-
-	peek_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
-	wqe_0 = qp->sq_base[peek_head].elem;
-
-	if (((peek_head & 3) == 1) || ((peek_head & 3) == 3)) {
-		if (RS_64(wqe_0[3], I40IWQPSQ_VALID) != !qp->swqe_polarity)
-			wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID);
-	}
-
-	qp->sq_wrtrk_array[*wqe_idx].wrid = wr_id;
-	qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size;
-	qp->sq_wrtrk_array[*wqe_idx].wqe_size = wqe_size;
-	return wqe;
-}
-
-/**
- * i40iw_set_fragment - set fragment in wqe
- * @wqe: wqe for setting fragment
- * @offset: offset value
- * @sge: sge length and stag
- */
-static void i40iw_set_fragment(u64 *wqe, u32 offset, struct i40iw_sge *sge)
-{
-	if (sge) {
-		set_64bit_val(wqe, offset, LS_64(sge->tag_off, I40IWQPSQ_FRAG_TO));
-		set_64bit_val(wqe, (offset + 8),
-			      (LS_64(sge->len, I40IWQPSQ_FRAG_LEN) |
-			       LS_64(sge->stag, I40IWQPSQ_FRAG_STAG)));
-	}
-}
-
-/**
- * i40iw_qp_get_next_recv_wqe - get next qp's rcv wqe
- * @qp: hw qp ptr
- * @wqe_idx: return wqe index
- */
-u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx)
-{
-	u64 *wqe = NULL;
-	enum i40iw_status_code ret_code;
-
-	if (I40IW_RING_FULL_ERR(qp->rq_ring))
-		return NULL;
-
-	I40IW_ATOMIC_RING_MOVE_HEAD(qp->rq_ring, *wqe_idx, ret_code);
-	if (ret_code)
-		return NULL;
-	if (!*wqe_idx)
-		qp->rwqe_polarity = !qp->rwqe_polarity;
-	/* rq_wqe_size_multiplier is no of qwords in one rq wqe */
-	wqe = qp->rq_base[*wqe_idx * (qp->rq_wqe_size_multiplier >> 2)].elem;
-
-	return wqe;
-}
-
-/**
- * i40iw_rdma_write - rdma write operation
- * @qp: hw qp ptr
- * @info: post sq information
- * @post_sq: flag to post sq
- */
-static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp,
-					       struct i40iw_post_sq_info *info,
-					       bool post_sq)
-{
-	u64 header;
-	u64 *wqe;
-	struct i40iw_rdma_write *op_info;
-	u32 i, wqe_idx;
-	u32 total_size = 0, byte_off;
-	enum i40iw_status_code ret_code;
-	bool read_fence = false;
-	u8 wqe_size;
-
-	op_info = &info->op.rdma_write;
-	if (op_info->num_lo_sges > qp->max_sq_frag_cnt)
-		return I40IW_ERR_INVALID_FRAG_COUNT;
-
-	for (i = 0; i < op_info->num_lo_sges; i++)
-		total_size += op_info->lo_sg_list[i].len;
-
-	if (total_size > I40IW_MAX_OUTBOUND_MESSAGE_SIZE)
-		return I40IW_ERR_QP_INVALID_MSG_SIZE;
-
-	read_fence |= info->read_fence;
-
-	ret_code = i40iw_fragcnt_to_wqesize_sq(op_info->num_lo_sges, &wqe_size);
-	if (ret_code)
-		return ret_code;
-
-	wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-	set_64bit_val(wqe, 16,
-		      LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
-	if (!op_info->rem_addr.stag)
-		return I40IW_ERR_BAD_STAG;
-
-	header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) |
-		 LS_64(I40IWQP_OP_RDMA_WRITE, I40IWQPSQ_OPCODE) |
-		 LS_64((op_info->num_lo_sges > 1 ?  (op_info->num_lo_sges - 1) : 0), I40IWQPSQ_ADDFRAGCNT) |
-		 LS_64(read_fence, I40IWQPSQ_READFENCE) |
-		 LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) |
-		 LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
-		 LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	i40iw_set_fragment(wqe, 0, op_info->lo_sg_list);
-
-	for (i = 1, byte_off = 32; i < op_info->num_lo_sges; i++) {
-		i40iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i]);
-		byte_off += 16;
-	}
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-
-	if (post_sq)
-		i40iw_qp_post_wr(qp);
-
-	return 0;
-}
-
-/**
- * i40iw_rdma_read - rdma read command
- * @qp: hw qp ptr
- * @info: post sq information
- * @inv_stag: flag for inv_stag
- * @post_sq: flag to post sq
- */
-static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp,
-					      struct i40iw_post_sq_info *info,
-					      bool inv_stag,
-					      bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_rdma_read *op_info;
-	u64 header;
-	u32 wqe_idx;
-	enum i40iw_status_code ret_code;
-	u8 wqe_size;
-	bool local_fence = false;
-
-	op_info = &info->op.rdma_read;
-	ret_code = i40iw_fragcnt_to_wqesize_sq(1, &wqe_size);
-	if (ret_code)
-		return ret_code;
-	wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->lo_addr.len, info->wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-	local_fence |= info->local_fence;
-
-	set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
-	header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) |
-		 LS_64((inv_stag ? I40IWQP_OP_RDMA_READ_LOC_INV : I40IWQP_OP_RDMA_READ), I40IWQPSQ_OPCODE) |
-		 LS_64(info->read_fence, I40IWQPSQ_READFENCE) |
-		 LS_64(local_fence, I40IWQPSQ_LOCALFENCE) |
-		 LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
-		 LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	i40iw_set_fragment(wqe, 0, &op_info->lo_addr);
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-	if (post_sq)
-		i40iw_qp_post_wr(qp);
-
-	return 0;
-}
-
-/**
- * i40iw_send - rdma send command
- * @qp: hw qp ptr
- * @info: post sq information
- * @stag_to_inv: stag_to_inv value
- * @post_sq: flag to post sq
- */
-static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp,
-					 struct i40iw_post_sq_info *info,
-					 u32 stag_to_inv,
-					 bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_post_send *op_info;
-	u64 header;
-	u32 i, wqe_idx, total_size = 0, byte_off;
-	enum i40iw_status_code ret_code;
-	bool read_fence = false;
-	u8 wqe_size;
-
-	op_info = &info->op.send;
-	if (qp->max_sq_frag_cnt < op_info->num_sges)
-		return I40IW_ERR_INVALID_FRAG_COUNT;
-
-	for (i = 0; i < op_info->num_sges; i++)
-		total_size += op_info->sg_list[i].len;
-	ret_code = i40iw_fragcnt_to_wqesize_sq(op_info->num_sges, &wqe_size);
-	if (ret_code)
-		return ret_code;
-
-	wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-	read_fence |= info->read_fence;
-	set_64bit_val(wqe, 16, 0);
-	header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) |
-		 LS_64(info->op_type, I40IWQPSQ_OPCODE) |
-		 LS_64((op_info->num_sges > 1 ? (op_info->num_sges - 1) : 0),
-		       I40IWQPSQ_ADDFRAGCNT) |
-		 LS_64(read_fence, I40IWQPSQ_READFENCE) |
-		 LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) |
-		 LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
-		 LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	i40iw_set_fragment(wqe, 0, op_info->sg_list);
-
-	for (i = 1, byte_off = 32; i < op_info->num_sges; i++) {
-		i40iw_set_fragment(wqe, byte_off, &op_info->sg_list[i]);
-		byte_off += 16;
-	}
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-	if (post_sq)
-		i40iw_qp_post_wr(qp);
-
-	return 0;
-}
-
-/**
- * i40iw_inline_rdma_write - inline rdma write operation
- * @qp: hw qp ptr
- * @info: post sq information
- * @post_sq: flag to post sq
- */
-static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp,
-						      struct i40iw_post_sq_info *info,
-						      bool post_sq)
-{
-	u64 *wqe;
-	u8 *dest, *src;
-	struct i40iw_inline_rdma_write *op_info;
-	u64 header = 0;
-	u32 wqe_idx;
-	enum i40iw_status_code ret_code;
-	bool read_fence = false;
-	u8 wqe_size;
-
-	op_info = &info->op.inline_rdma_write;
-	if (op_info->len > I40IW_MAX_INLINE_DATA_SIZE)
-		return I40IW_ERR_INVALID_INLINE_DATA_SIZE;
-
-	ret_code = i40iw_inline_data_size_to_wqesize(op_info->len, &wqe_size);
-	if (ret_code)
-		return ret_code;
-
-	wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-	read_fence |= info->read_fence;
-	set_64bit_val(wqe, 16,
-		      LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
-
-	header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) |
-		 LS_64(I40IWQP_OP_RDMA_WRITE, I40IWQPSQ_OPCODE) |
-		 LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) |
-		 LS_64(1, I40IWQPSQ_INLINEDATAFLAG) |
-		 LS_64(read_fence, I40IWQPSQ_READFENCE) |
-		 LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) |
-		 LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
-		 LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	dest = (u8 *)wqe;
-	src = (u8 *)(op_info->data);
-
-	if (op_info->len <= 16) {
-		memcpy(dest, src, op_info->len);
-	} else {
-		memcpy(dest, src, 16);
-		src += 16;
-		dest = (u8 *)wqe + 32;
-		memcpy(dest, src, op_info->len - 16);
-	}
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-
-	if (post_sq)
-		i40iw_qp_post_wr(qp);
-
-	return 0;
-}
-
-/**
- * i40iw_inline_send - inline send operation
- * @qp: hw qp ptr
- * @info: post sq information
- * @stag_to_inv: remote stag
- * @post_sq: flag to post sq
- */
-static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp,
-						struct i40iw_post_sq_info *info,
-						u32 stag_to_inv,
-						bool post_sq)
-{
-	u64 *wqe;
-	u8 *dest, *src;
-	struct i40iw_post_inline_send *op_info;
-	u64 header;
-	u32 wqe_idx;
-	enum i40iw_status_code ret_code;
-	bool read_fence = false;
-	u8 wqe_size;
-
-	op_info = &info->op.inline_send;
-	if (op_info->len > I40IW_MAX_INLINE_DATA_SIZE)
-		return I40IW_ERR_INVALID_INLINE_DATA_SIZE;
-
-	ret_code = i40iw_inline_data_size_to_wqesize(op_info->len, &wqe_size);
-	if (ret_code)
-		return ret_code;
-
-	wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-	read_fence |= info->read_fence;
-	header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) |
-	    LS_64(info->op_type, I40IWQPSQ_OPCODE) |
-	    LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) |
-	    LS_64(1, I40IWQPSQ_INLINEDATAFLAG) |
-	    LS_64(read_fence, I40IWQPSQ_READFENCE) |
-	    LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) |
-	    LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
-	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	dest = (u8 *)wqe;
-	src = (u8 *)(op_info->data);
-
-	if (op_info->len <= 16) {
-		memcpy(dest, src, op_info->len);
-	} else {
-		memcpy(dest, src, 16);
-		src += 16;
-		dest = (u8 *)wqe + 32;
-		memcpy(dest, src, op_info->len - 16);
-	}
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-
-	if (post_sq)
-		i40iw_qp_post_wr(qp);
-
-	return 0;
-}
-
-/**
- * i40iw_stag_local_invalidate - stag invalidate operation
- * @qp: hw qp ptr
- * @info: post sq information
- * @post_sq: flag to post sq
- */
-static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp,
-							  struct i40iw_post_sq_info *info,
-							  bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_inv_local_stag *op_info;
-	u64 header;
-	u32 wqe_idx;
-	bool local_fence = false;
-
-	op_info = &info->op.inv_local_stag;
-	local_fence = info->local_fence;
-
-	wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-	set_64bit_val(wqe, 0, 0);
-	set_64bit_val(wqe, 8,
-		      LS_64(op_info->target_stag, I40IWQPSQ_LOCSTAG));
-	set_64bit_val(wqe, 16, 0);
-	header = LS_64(I40IW_OP_TYPE_INV_STAG, I40IWQPSQ_OPCODE) |
-	    LS_64(info->read_fence, I40IWQPSQ_READFENCE) |
-	    LS_64(local_fence, I40IWQPSQ_LOCALFENCE) |
-	    LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
-	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-
-	if (post_sq)
-		i40iw_qp_post_wr(qp);
-
-	return 0;
-}
-
-/**
- * i40iw_mw_bind - Memory Window bind operation
- * @qp: hw qp ptr
- * @info: post sq information
- * @post_sq: flag to post sq
- */
-static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp,
-					    struct i40iw_post_sq_info *info,
-					    bool post_sq)
-{
-	u64 *wqe;
-	struct i40iw_bind_window *op_info;
-	u64 header;
-	u32 wqe_idx;
-	bool local_fence = false;
-
-	op_info = &info->op.bind_window;
-
-	local_fence |= info->local_fence;
-	wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-	set_64bit_val(wqe, 0, (uintptr_t)op_info->va);
-	set_64bit_val(wqe, 8,
-		      LS_64(op_info->mr_stag, I40IWQPSQ_PARENTMRSTAG) |
-		      LS_64(op_info->mw_stag, I40IWQPSQ_MWSTAG));
-	set_64bit_val(wqe, 16, op_info->bind_length);
-	header = LS_64(I40IW_OP_TYPE_BIND_MW, I40IWQPSQ_OPCODE) |
-	    LS_64(((op_info->enable_reads << 2) |
-		   (op_info->enable_writes << 3)),
-		  I40IWQPSQ_STAGRIGHTS) |
-	    LS_64((op_info->addressing_type == I40IW_ADDR_TYPE_VA_BASED ?  1 : 0),
-		  I40IWQPSQ_VABASEDTO) |
-	    LS_64(info->read_fence, I40IWQPSQ_READFENCE) |
-	    LS_64(local_fence, I40IWQPSQ_LOCALFENCE) |
-	    LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
-	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-
-	if (post_sq)
-		i40iw_qp_post_wr(qp);
-
-	return 0;
-}
-
-/**
- * i40iw_post_receive - post receive wqe
- * @qp: hw qp ptr
- * @info: post rq information
- */
-static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp,
-						 struct i40iw_post_rq_info *info)
-{
-	u64 *wqe;
-	u64 header;
-	u32 total_size = 0, wqe_idx, i, byte_off;
-
-	if (qp->max_rq_frag_cnt < info->num_sges)
-		return I40IW_ERR_INVALID_FRAG_COUNT;
-	for (i = 0; i < info->num_sges; i++)
-		total_size += info->sg_list[i].len;
-	wqe = i40iw_qp_get_next_recv_wqe(qp, &wqe_idx);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
-	qp->rq_wrid_array[wqe_idx] = info->wr_id;
-	set_64bit_val(wqe, 16, 0);
-
-	header = LS_64((info->num_sges > 1 ? (info->num_sges - 1) : 0),
-		       I40IWQPSQ_ADDFRAGCNT) |
-	    LS_64(qp->rwqe_polarity, I40IWQPSQ_VALID);
-
-	i40iw_set_fragment(wqe, 0, info->sg_list);
-
-	for (i = 1, byte_off = 32; i < info->num_sges; i++) {
-		i40iw_set_fragment(wqe, byte_off, &info->sg_list[i]);
-		byte_off += 16;
-	}
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-
-	return 0;
-}
-
-/**
- * i40iw_cq_request_notification - cq notification request (door bell)
- * @cq: hw cq
- * @cq_notify: notification type
- */
-static void i40iw_cq_request_notification(struct i40iw_cq_uk *cq,
-					  enum i40iw_completion_notify cq_notify)
-{
-	u64 temp_val;
-	u16 sw_cq_sel;
-	u8 arm_next_se = 0;
-	u8 arm_next = 0;
-	u8 arm_seq_num;
-
-	get_64bit_val(cq->shadow_area, 32, &temp_val);
-	arm_seq_num = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_SEQ_NUM);
-	arm_seq_num++;
-
-	sw_cq_sel = (u16)RS_64(temp_val, I40IW_CQ_DBSA_SW_CQ_SELECT);
-	arm_next_se = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_NEXT_SE);
-	arm_next_se |= 1;
-	if (cq_notify == IW_CQ_COMPL_EVENT)
-		arm_next = 1;
-	temp_val = LS_64(arm_seq_num, I40IW_CQ_DBSA_ARM_SEQ_NUM) |
-	    LS_64(sw_cq_sel, I40IW_CQ_DBSA_SW_CQ_SELECT) |
-	    LS_64(arm_next_se, I40IW_CQ_DBSA_ARM_NEXT_SE) |
-	    LS_64(arm_next, I40IW_CQ_DBSA_ARM_NEXT);
-
-	set_64bit_val(cq->shadow_area, 32, temp_val);
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	writel(cq->cq_id, cq->cqe_alloc_reg);
-}
-
-/**
- * i40iw_cq_post_entries - update tail in shadow memory
- * @cq: hw cq
- * @count: # of entries processed
- */
-static enum i40iw_status_code i40iw_cq_post_entries(struct i40iw_cq_uk *cq,
-						    u8 count)
-{
-	I40IW_RING_MOVE_TAIL_BY_COUNT(cq->cq_ring, count);
-	set_64bit_val(cq->shadow_area, 0,
-		      I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
-	return 0;
-}
-
-/**
- * i40iw_cq_poll_completion - get cq completion info
- * @cq: hw cq
- * @info: cq poll information returned
- */
-static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
-						       struct i40iw_cq_poll_info *info)
-{
-	u64 comp_ctx, qword0, qword2, qword3, wqe_qword;
-	u64 *cqe, *sw_wqe;
-	struct i40iw_qp_uk *qp;
-	struct i40iw_ring *pring = NULL;
-	u32 wqe_idx, q_type, array_idx = 0;
-	enum i40iw_status_code ret_code = 0;
-	bool move_cq_head = true;
-	u8 polarity;
-	u8 addl_wqes = 0;
-
-	if (cq->avoid_mem_cflct)
-		cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(cq);
-	else
-		cqe = (u64 *)I40IW_GET_CURRENT_CQ_ELEMENT(cq);
-
-	get_64bit_val(cqe, 24, &qword3);
-	polarity = (u8)RS_64(qword3, I40IW_CQ_VALID);
-
-	if (polarity != cq->polarity)
-		return I40IW_ERR_QUEUE_EMPTY;
-
-	q_type = (u8)RS_64(qword3, I40IW_CQ_SQ);
-	info->error = (bool)RS_64(qword3, I40IW_CQ_ERROR);
-	if (info->error) {
-		info->comp_status = I40IW_COMPL_STATUS_FLUSHED;
-		info->major_err = (bool)RS_64(qword3, I40IW_CQ_MAJERR);
-		info->minor_err = (bool)RS_64(qword3, I40IW_CQ_MINERR);
-	} else {
-		info->comp_status = I40IW_COMPL_STATUS_SUCCESS;
-	}
-
-	get_64bit_val(cqe, 0, &qword0);
-	get_64bit_val(cqe, 16, &qword2);
-
-	info->tcp_seq_num = (u32)RS_64(qword0, I40IWCQ_TCPSEQNUM);
-
-	info->qp_id = (u32)RS_64(qword2, I40IWCQ_QPID);
-
-	get_64bit_val(cqe, 8, &comp_ctx);
-
-	info->solicited_event = (bool)RS_64(qword3, I40IWCQ_SOEVENT);
-	info->is_srq = (bool)RS_64(qword3, I40IWCQ_SRQ);
-
-	qp = (struct i40iw_qp_uk *)(unsigned long)comp_ctx;
-	if (!qp) {
-		ret_code = I40IW_ERR_QUEUE_DESTROYED;
-		goto exit;
-	}
-	wqe_idx = (u32)RS_64(qword3, I40IW_CQ_WQEIDX);
-	info->qp_handle = (i40iw_qp_handle)(unsigned long)qp;
-
-	if (q_type == I40IW_CQE_QTYPE_RQ) {
-		array_idx = (wqe_idx * 4) / qp->rq_wqe_size_multiplier;
-		if (info->comp_status == I40IW_COMPL_STATUS_FLUSHED) {
-			info->wr_id = qp->rq_wrid_array[qp->rq_ring.tail];
-			array_idx = qp->rq_ring.tail;
-		} else {
-			info->wr_id = qp->rq_wrid_array[array_idx];
-		}
-
-		info->op_type = I40IW_OP_TYPE_REC;
-		if (qword3 & I40IWCQ_STAG_MASK) {
-			info->stag_invalid_set = true;
-			info->inv_stag = (u32)RS_64(qword2, I40IWCQ_INVSTAG);
-		} else {
-			info->stag_invalid_set = false;
-		}
-		info->bytes_xfered = (u32)RS_64(qword0, I40IWCQ_PAYLDLEN);
-		I40IW_RING_SET_TAIL(qp->rq_ring, array_idx + 1);
-		pring = &qp->rq_ring;
-	} else {
-		if (qp->first_sq_wq) {
-			qp->first_sq_wq = false;
-			if (!wqe_idx && (qp->sq_ring.head == qp->sq_ring.tail)) {
-				I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
-				I40IW_RING_MOVE_TAIL(cq->cq_ring);
-				set_64bit_val(cq->shadow_area, 0,
-					      I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
-				memset(info, 0, sizeof(struct i40iw_cq_poll_info));
-				return i40iw_cq_poll_completion(cq, info);
-			}
-		}
-
-		if (info->comp_status != I40IW_COMPL_STATUS_FLUSHED) {
-			info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
-			info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len;
-
-			info->op_type = (u8)RS_64(qword3, I40IWCQ_OP);
-			sw_wqe = qp->sq_base[wqe_idx].elem;
-			get_64bit_val(sw_wqe, 24, &wqe_qword);
-
-			addl_wqes = qp->sq_wrtrk_array[wqe_idx].wqe_size / I40IW_QP_WQE_MIN_SIZE;
-			I40IW_RING_SET_TAIL(qp->sq_ring, (wqe_idx + addl_wqes));
-		} else {
-			do {
-				u8 op_type;
-				u32 tail;
-
-				tail = qp->sq_ring.tail;
-				sw_wqe = qp->sq_base[tail].elem;
-				get_64bit_val(sw_wqe, 24, &wqe_qword);
-				op_type = (u8)RS_64(wqe_qword, I40IWQPSQ_OPCODE);
-				info->op_type = op_type;
-				addl_wqes = qp->sq_wrtrk_array[tail].wqe_size / I40IW_QP_WQE_MIN_SIZE;
-				I40IW_RING_SET_TAIL(qp->sq_ring, (tail + addl_wqes));
-				if (op_type != I40IWQP_OP_NOP) {
-					info->wr_id = qp->sq_wrtrk_array[tail].wrid;
-					info->bytes_xfered = qp->sq_wrtrk_array[tail].wr_len;
-					break;
-				}
-			} while (1);
-		}
-		pring = &qp->sq_ring;
-	}
-
-	ret_code = 0;
-
-exit:
-	if (!ret_code &&
-	    (info->comp_status == I40IW_COMPL_STATUS_FLUSHED))
-		if (pring && (I40IW_RING_MORE_WORK(*pring)))
-			move_cq_head = false;
-
-	if (move_cq_head) {
-		I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
-
-		if (I40IW_RING_GETCURRENT_HEAD(cq->cq_ring) == 0)
-			cq->polarity ^= 1;
-
-		I40IW_RING_MOVE_TAIL(cq->cq_ring);
-		set_64bit_val(cq->shadow_area, 0,
-			      I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
-	} else {
-		if (info->is_srq)
-			return ret_code;
-		qword3 &= ~I40IW_CQ_WQEIDX_MASK;
-		qword3 |= LS_64(pring->tail, I40IW_CQ_WQEIDX);
-		set_64bit_val(cqe, 24, qword3);
-	}
-
-	return ret_code;
-}
-
-/**
- * i40iw_get_wqe_shift - get shift count for maximum wqe size
- * @sge: Maximum Scatter Gather Elements wqe
- * @inline_data: Maximum inline data size
- * @shift: Returns the shift needed based on sge
- *
- * Shift can be used to left shift the wqe size based on number of SGEs and inlind data size.
- * For 1 SGE or inline data <= 16, shift = 0 (wqe size of 32 bytes).
- * For 2 or 3 SGEs or inline data <= 48, shift = 1 (wqe size of 64 bytes).
- * Shift of 2 otherwise (wqe size of 128 bytes).
- */
-void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift)
-{
-	*shift = 0;
-	if (sge > 1 || inline_data > 16)
-		*shift = (sge < 4 && inline_data <= 48) ? 1 : 2;
-}
-
-/*
- * i40iw_get_sqdepth - get SQ depth (quantas)
- * @sq_size: SQ size
- * @shift: shift which determines size of WQE
- * @sqdepth: depth of SQ
- *
- */
-enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth)
-{
-	*sqdepth = roundup_pow_of_two((sq_size << shift) + I40IW_SQ_RSVD);
-
-	if (*sqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift))
-		*sqdepth = I40IW_QP_SW_MIN_WQSIZE << shift;
-	else if (*sqdepth > I40IW_QP_SW_MAX_SQ_QUANTAS)
-		return I40IW_ERR_INVALID_SIZE;
-
-	return 0;
-}
-
-/*
- * i40iw_get_rq_depth - get RQ depth (quantas)
- * @rq_size: RQ size
- * @shift: shift which determines size of WQE
- * @rqdepth: depth of RQ
- *
- */
-enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth)
-{
-	*rqdepth = roundup_pow_of_two((rq_size << shift) + I40IW_RQ_RSVD);
-
-	if (*rqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift))
-		*rqdepth = I40IW_QP_SW_MIN_WQSIZE << shift;
-	else if (*rqdepth > I40IW_QP_SW_MAX_RQ_QUANTAS)
-		return I40IW_ERR_INVALID_SIZE;
-
-	return 0;
-}
-
-static const struct i40iw_qp_uk_ops iw_qp_uk_ops = {
-	.iw_qp_post_wr = i40iw_qp_post_wr,
-	.iw_rdma_write = i40iw_rdma_write,
-	.iw_rdma_read = i40iw_rdma_read,
-	.iw_send = i40iw_send,
-	.iw_inline_rdma_write = i40iw_inline_rdma_write,
-	.iw_inline_send = i40iw_inline_send,
-	.iw_stag_local_invalidate = i40iw_stag_local_invalidate,
-	.iw_mw_bind = i40iw_mw_bind,
-	.iw_post_receive = i40iw_post_receive,
-	.iw_post_nop = i40iw_nop
-};
-
-static const struct i40iw_cq_ops iw_cq_ops = {
-	.iw_cq_request_notification = i40iw_cq_request_notification,
-	.iw_cq_poll_completion = i40iw_cq_poll_completion,
-	.iw_cq_post_entries = i40iw_cq_post_entries,
-	.iw_cq_clean = i40iw_clean_cq
-};
-
-static const struct i40iw_device_uk_ops iw_device_uk_ops = {
-	.iwarp_cq_uk_init = i40iw_cq_uk_init,
-	.iwarp_qp_uk_init = i40iw_qp_uk_init,
-};
-
-/**
- * i40iw_qp_uk_init - initialize shared qp
- * @qp: hw qp (user and kernel)
- * @info: qp initialization info
- *
- * initializes the vars used in both user and kernel mode.
- * size of the wqe depends on numbers of max. fragements
- * allowed. Then size of wqe * the number of wqes should be the
- * amount of memory allocated for sq and rq. If srq is used,
- * then rq_base will point to one rq wqe only (not the whole
- * array of wqes)
- */
-enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp,
-					struct i40iw_qp_uk_init_info *info)
-{
-	enum i40iw_status_code ret_code = 0;
-	u32 sq_ring_size;
-	u8 sqshift, rqshift;
-
-	if (info->max_sq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT)
-		return I40IW_ERR_INVALID_FRAG_COUNT;
-
-	if (info->max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT)
-		return I40IW_ERR_INVALID_FRAG_COUNT;
-	i40iw_get_wqe_shift(info->max_sq_frag_cnt, info->max_inline_data, &sqshift);
-
-	qp->sq_base = info->sq;
-	qp->rq_base = info->rq;
-	qp->shadow_area = info->shadow_area;
-	qp->sq_wrtrk_array = info->sq_wrtrk_array;
-	qp->rq_wrid_array = info->rq_wrid_array;
-
-	qp->wqe_alloc_reg = info->wqe_alloc_reg;
-	qp->qp_id = info->qp_id;
-	qp->sq_size = info->sq_size;
-	qp->max_sq_frag_cnt = info->max_sq_frag_cnt;
-	sq_ring_size = qp->sq_size << sqshift;
-
-	I40IW_RING_INIT(qp->sq_ring, sq_ring_size);
-	I40IW_RING_INIT(qp->initial_ring, sq_ring_size);
-	I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
-	I40IW_RING_MOVE_TAIL(qp->sq_ring);
-	I40IW_RING_MOVE_HEAD(qp->initial_ring, ret_code);
-	qp->swqe_polarity = 1;
-	qp->first_sq_wq = true;
-	qp->swqe_polarity_deferred = 1;
-	qp->rwqe_polarity = 0;
-
-	if (!qp->use_srq) {
-		qp->rq_size = info->rq_size;
-		qp->max_rq_frag_cnt = info->max_rq_frag_cnt;
-		I40IW_RING_INIT(qp->rq_ring, qp->rq_size);
-		switch (info->abi_ver) {
-		case 4:
-			i40iw_get_wqe_shift(info->max_rq_frag_cnt, 0, &rqshift);
-			break;
-		case 5: /* fallthrough until next ABI version */
-		default:
-			rqshift = I40IW_MAX_RQ_WQE_SHIFT;
-			break;
-		}
-		qp->rq_wqe_size = rqshift;
-		qp->rq_wqe_size_multiplier = 4 << rqshift;
-	}
-	qp->ops = iw_qp_uk_ops;
-
-	return ret_code;
-}
-
-/**
- * i40iw_cq_uk_init - initialize shared cq (user and kernel)
- * @cq: hw cq
- * @info: hw cq initialization info
- */
-enum i40iw_status_code i40iw_cq_uk_init(struct i40iw_cq_uk *cq,
-					struct i40iw_cq_uk_init_info *info)
-{
-	if ((info->cq_size < I40IW_MIN_CQ_SIZE) ||
-	    (info->cq_size > I40IW_MAX_CQ_SIZE))
-		return I40IW_ERR_INVALID_SIZE;
-	cq->cq_base = (struct i40iw_cqe *)info->cq_base;
-	cq->cq_id = info->cq_id;
-	cq->cq_size = info->cq_size;
-	cq->cqe_alloc_reg = info->cqe_alloc_reg;
-	cq->shadow_area = info->shadow_area;
-	cq->avoid_mem_cflct = info->avoid_mem_cflct;
-
-	I40IW_RING_INIT(cq->cq_ring, cq->cq_size);
-	cq->polarity = 1;
-	cq->ops = iw_cq_ops;
-
-	return 0;
-}
-
-/**
- * i40iw_device_init_uk - setup routines for iwarp shared device
- * @dev: iwarp shared (user and kernel)
- */
-void i40iw_device_init_uk(struct i40iw_dev_uk *dev)
-{
-	dev->ops_uk = iw_device_uk_ops;
-}
-
-/**
- * i40iw_clean_cq - clean cq entries
- * @queue: completion context
- * @cq: cq to clean
- */
-void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq)
-{
-	u64 *cqe;
-	u64 qword3, comp_ctx;
-	u32 cq_head;
-	u8 polarity, temp;
-
-	cq_head = cq->cq_ring.head;
-	temp = cq->polarity;
-	do {
-		if (cq->avoid_mem_cflct)
-			cqe = (u64 *)&(((struct i40iw_extended_cqe *)cq->cq_base)[cq_head]);
-		else
-			cqe = (u64 *)&cq->cq_base[cq_head];
-		get_64bit_val(cqe, 24, &qword3);
-		polarity = (u8)RS_64(qword3, I40IW_CQ_VALID);
-
-		if (polarity != temp)
-			break;
-
-		get_64bit_val(cqe, 8, &comp_ctx);
-		if ((void *)(unsigned long)comp_ctx == queue)
-			set_64bit_val(cqe, 8, 0);
-
-		cq_head = (cq_head + 1) % cq->cq_ring.size;
-		if (!cq_head)
-			temp ^= 1;
-	} while (true);
-}
-
-/**
- * i40iw_nop - send a nop
- * @qp: hw qp ptr
- * @wr_id: work request id
- * @signaled: flag if signaled for completion
- * @post_sq: flag to post sq
- */
-enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp,
-				 u64 wr_id,
-				 bool signaled,
-				 bool post_sq)
-{
-	u64 header, *wqe;
-	u32 wqe_idx;
-
-	wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, wr_id);
-	if (!wqe)
-		return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-	set_64bit_val(wqe, 0, 0);
-	set_64bit_val(wqe, 8, 0);
-	set_64bit_val(wqe, 16, 0);
-
-	header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) |
-	    LS_64(signaled, I40IWQPSQ_SIGCOMPL) |
-	    LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
-
-	wmb(); /* make sure WQE is populated before valid bit is set */
-
-	set_64bit_val(wqe, 24, header);
-	if (post_sq)
-		i40iw_qp_post_wr(qp);
-
-	return 0;
-}
-
-/**
- * i40iw_fragcnt_to_wqesize_sq - calculate wqe size based on fragment count for SQ
- * @frag_cnt: number of fragments
- * @wqe_size: size of sq wqe returned
- */
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size)
-{
-	switch (frag_cnt) {
-	case 0:
-	case 1:
-		*wqe_size = I40IW_QP_WQE_MIN_SIZE;
-		break;
-	case 2:
-	case 3:
-		*wqe_size = 64;
-		break;
-	case 4:
-	case 5:
-		*wqe_size = 96;
-		break;
-	case 6:
-	case 7:
-		*wqe_size = 128;
-		break;
-	default:
-		return I40IW_ERR_INVALID_FRAG_COUNT;
-	}
-
-	return 0;
-}
-
-/**
- * i40iw_fragcnt_to_wqesize_rq - calculate wqe size based on fragment count for RQ
- * @frag_cnt: number of fragments
- * @wqe_size: size of rq wqe returned
- */
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size)
-{
-	switch (frag_cnt) {
-	case 0:
-	case 1:
-		*wqe_size = 32;
-		break;
-	case 2:
-	case 3:
-		*wqe_size = 64;
-		break;
-	case 4:
-	case 5:
-	case 6:
-	case 7:
-		*wqe_size = 128;
-		break;
-	default:
-		return I40IW_ERR_INVALID_FRAG_COUNT;
-	}
-
-	return 0;
-}
-
-/**
- * i40iw_inline_data_size_to_wqesize - based on inline data, wqe size
- * @data_size: data size for inline
- * @wqe_size: size of sq wqe returned
- */
-enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size,
-							 u8 *wqe_size)
-{
-	if (data_size > I40IW_MAX_INLINE_DATA_SIZE)
-		return I40IW_ERR_INVALID_INLINE_DATA_SIZE;
-
-	if (data_size <= 16)
-		*wqe_size = I40IW_QP_WQE_MIN_SIZE;
-	else
-		*wqe_size = 64;
-
-	return 0;
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h
deleted file mode 100644
index 93fc3081dd65..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_user.h
+++ /dev/null
@@ -1,422 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_USER_H
-#define I40IW_USER_H
-
-enum i40iw_device_capabilities_const {
-	I40IW_WQE_SIZE =			4,
-	I40IW_CQP_WQE_SIZE =			8,
-	I40IW_CQE_SIZE =			4,
-	I40IW_EXTENDED_CQE_SIZE =		8,
-	I40IW_AEQE_SIZE =			2,
-	I40IW_CEQE_SIZE =			1,
-	I40IW_CQP_CTX_SIZE =			8,
-	I40IW_SHADOW_AREA_SIZE =		8,
-	I40IW_CEQ_MAX_COUNT =			256,
-	I40IW_QUERY_FPM_BUF_SIZE =		128,
-	I40IW_COMMIT_FPM_BUF_SIZE =		128,
-	I40IW_MIN_IW_QP_ID =			1,
-	I40IW_MAX_IW_QP_ID =			262143,
-	I40IW_MIN_CEQID =			0,
-	I40IW_MAX_CEQID =			256,
-	I40IW_MIN_CQID =			0,
-	I40IW_MAX_CQID =			131071,
-	I40IW_MIN_AEQ_ENTRIES =			1,
-	I40IW_MAX_AEQ_ENTRIES =			524287,
-	I40IW_MIN_CEQ_ENTRIES =			1,
-	I40IW_MAX_CEQ_ENTRIES =			131071,
-	I40IW_MIN_CQ_SIZE =			1,
-	I40IW_MAX_CQ_SIZE =			1048575,
-	I40IW_DB_ID_ZERO =			0,
-	I40IW_MAX_WQ_FRAGMENT_COUNT =		3,
-	I40IW_MAX_SGE_RD =			1,
-	I40IW_MAX_OUTBOUND_MESSAGE_SIZE =	2147483647,
-	I40IW_MAX_INBOUND_MESSAGE_SIZE =	2147483647,
-	I40IW_MAX_PE_ENABLED_VF_COUNT =		32,
-	I40IW_MAX_VF_FPM_ID =			47,
-	I40IW_MAX_VF_PER_PF =			127,
-	I40IW_MAX_SQ_PAYLOAD_SIZE =		2145386496,
-	I40IW_MAX_INLINE_DATA_SIZE =		48,
-	I40IW_MAX_IRD_SIZE =			64,
-	I40IW_MAX_ORD_SIZE =			127,
-	I40IW_MAX_WQ_ENTRIES =			2048,
-	I40IW_Q2_BUFFER_SIZE =			(248 + 100),
-	I40IW_MAX_WQE_SIZE_RQ =			128,
-	I40IW_QP_CTX_SIZE =			248,
-	I40IW_MAX_PDS = 			32768
-};
-
-#define i40iw_handle void *
-#define i40iw_adapter_handle i40iw_handle
-#define i40iw_qp_handle i40iw_handle
-#define i40iw_cq_handle i40iw_handle
-#define i40iw_srq_handle i40iw_handle
-#define i40iw_pd_id i40iw_handle
-#define i40iw_stag_handle i40iw_handle
-#define i40iw_stag_index u32
-#define i40iw_stag u32
-#define i40iw_stag_key u8
-
-#define i40iw_tagged_offset u64
-#define i40iw_access_privileges u32
-#define i40iw_physical_fragment u64
-#define i40iw_address_list u64 *
-
-#define	I40IW_MAX_MR_SIZE	0x10000000000L
-#define	I40IW_MAX_RQ_WQE_SHIFT	2
-
-struct i40iw_qp_uk;
-struct i40iw_cq_uk;
-struct i40iw_srq_uk;
-struct i40iw_qp_uk_init_info;
-struct i40iw_cq_uk_init_info;
-struct i40iw_srq_uk_init_info;
-
-struct i40iw_sge {
-	i40iw_tagged_offset tag_off;
-	u32 len;
-	i40iw_stag stag;
-};
-
-#define i40iw_sgl struct i40iw_sge *
-
-struct i40iw_ring {
-	u32 head;
-	u32 tail;
-	u32 size;
-};
-
-struct i40iw_cqe {
-	u64 buf[I40IW_CQE_SIZE];
-};
-
-struct i40iw_extended_cqe {
-	u64 buf[I40IW_EXTENDED_CQE_SIZE];
-};
-
-struct i40iw_wqe {
-	u64 buf[I40IW_WQE_SIZE];
-};
-
-struct i40iw_qp_uk_ops;
-
-enum i40iw_addressing_type {
-	I40IW_ADDR_TYPE_ZERO_BASED = 0,
-	I40IW_ADDR_TYPE_VA_BASED = 1,
-};
-
-#define I40IW_ACCESS_FLAGS_LOCALREAD		0x01
-#define I40IW_ACCESS_FLAGS_LOCALWRITE		0x02
-#define I40IW_ACCESS_FLAGS_REMOTEREAD_ONLY	0x04
-#define I40IW_ACCESS_FLAGS_REMOTEREAD		0x05
-#define I40IW_ACCESS_FLAGS_REMOTEWRITE_ONLY	0x08
-#define I40IW_ACCESS_FLAGS_REMOTEWRITE		0x0a
-#define I40IW_ACCESS_FLAGS_BIND_WINDOW		0x10
-#define I40IW_ACCESS_FLAGS_ALL			0x1F
-
-#define I40IW_OP_TYPE_RDMA_WRITE	0
-#define I40IW_OP_TYPE_RDMA_READ		1
-#define I40IW_OP_TYPE_SEND		3
-#define I40IW_OP_TYPE_SEND_INV		4
-#define I40IW_OP_TYPE_SEND_SOL		5
-#define I40IW_OP_TYPE_SEND_SOL_INV	6
-#define I40IW_OP_TYPE_REC		7
-#define I40IW_OP_TYPE_BIND_MW		8
-#define I40IW_OP_TYPE_FAST_REG_NSMR	9
-#define I40IW_OP_TYPE_INV_STAG		10
-#define I40IW_OP_TYPE_RDMA_READ_INV_STAG 11
-#define I40IW_OP_TYPE_NOP		12
-
-enum i40iw_completion_status {
-	I40IW_COMPL_STATUS_SUCCESS = 0,
-	I40IW_COMPL_STATUS_FLUSHED,
-	I40IW_COMPL_STATUS_INVALID_WQE,
-	I40IW_COMPL_STATUS_QP_CATASTROPHIC,
-	I40IW_COMPL_STATUS_REMOTE_TERMINATION,
-	I40IW_COMPL_STATUS_INVALID_STAG,
-	I40IW_COMPL_STATUS_BASE_BOUND_VIOLATION,
-	I40IW_COMPL_STATUS_ACCESS_VIOLATION,
-	I40IW_COMPL_STATUS_INVALID_PD_ID,
-	I40IW_COMPL_STATUS_WRAP_ERROR,
-	I40IW_COMPL_STATUS_STAG_INVALID_PDID,
-	I40IW_COMPL_STATUS_RDMA_READ_ZERO_ORD,
-	I40IW_COMPL_STATUS_QP_NOT_PRIVLEDGED,
-	I40IW_COMPL_STATUS_STAG_NOT_INVALID,
-	I40IW_COMPL_STATUS_INVALID_PHYS_BUFFER_SIZE,
-	I40IW_COMPL_STATUS_INVALID_PHYS_BUFFER_ENTRY,
-	I40IW_COMPL_STATUS_INVALID_FBO,
-	I40IW_COMPL_STATUS_INVALID_LENGTH,
-	I40IW_COMPL_STATUS_INVALID_ACCESS,
-	I40IW_COMPL_STATUS_PHYS_BUFFER_LIST_TOO_LONG,
-	I40IW_COMPL_STATUS_INVALID_VIRT_ADDRESS,
-	I40IW_COMPL_STATUS_INVALID_REGION,
-	I40IW_COMPL_STATUS_INVALID_WINDOW,
-	I40IW_COMPL_STATUS_INVALID_TOTAL_LENGTH
-};
-
-enum i40iw_completion_notify {
-	IW_CQ_COMPL_EVENT = 0,
-	IW_CQ_COMPL_SOLICITED = 1
-};
-
-struct i40iw_post_send {
-	i40iw_sgl sg_list;
-	u32 num_sges;
-};
-
-struct i40iw_post_inline_send {
-	void *data;
-	u32 len;
-};
-
-struct i40iw_rdma_write {
-	i40iw_sgl lo_sg_list;
-	u32 num_lo_sges;
-	struct i40iw_sge rem_addr;
-};
-
-struct i40iw_inline_rdma_write {
-	void *data;
-	u32 len;
-	struct i40iw_sge rem_addr;
-};
-
-struct i40iw_rdma_read {
-	struct i40iw_sge lo_addr;
-	struct i40iw_sge rem_addr;
-};
-
-struct i40iw_bind_window {
-	i40iw_stag mr_stag;
-	u64 bind_length;
-	void *va;
-	enum i40iw_addressing_type addressing_type;
-	bool enable_reads;
-	bool enable_writes;
-	i40iw_stag mw_stag;
-};
-
-struct i40iw_inv_local_stag {
-	i40iw_stag target_stag;
-};
-
-struct i40iw_post_sq_info {
-	u64 wr_id;
-	u8 op_type;
-	bool signaled;
-	bool read_fence;
-	bool local_fence;
-	bool inline_data;
-	bool defer_flag;
-	union {
-		struct i40iw_post_send send;
-		struct i40iw_rdma_write rdma_write;
-		struct i40iw_rdma_read rdma_read;
-		struct i40iw_rdma_read rdma_read_inv;
-		struct i40iw_bind_window bind_window;
-		struct i40iw_inv_local_stag inv_local_stag;
-		struct i40iw_inline_rdma_write inline_rdma_write;
-		struct i40iw_post_inline_send inline_send;
-	} op;
-};
-
-struct i40iw_post_rq_info {
-	u64 wr_id;
-	i40iw_sgl sg_list;
-	u32 num_sges;
-};
-
-struct i40iw_cq_poll_info {
-	u64 wr_id;
-	i40iw_qp_handle qp_handle;
-	u32 bytes_xfered;
-	u32 tcp_seq_num;
-	u32 qp_id;
-	i40iw_stag inv_stag;
-	enum i40iw_completion_status comp_status;
-	u16 major_err;
-	u16 minor_err;
-	u8 op_type;
-	bool stag_invalid_set;
-	bool error;
-	bool is_srq;
-	bool solicited_event;
-};
-
-struct i40iw_qp_uk_ops {
-	void (*iw_qp_post_wr)(struct i40iw_qp_uk *);
-	enum i40iw_status_code (*iw_rdma_write)(struct i40iw_qp_uk *,
-						struct i40iw_post_sq_info *, bool);
-	enum i40iw_status_code (*iw_rdma_read)(struct i40iw_qp_uk *,
-					       struct i40iw_post_sq_info *, bool, bool);
-	enum i40iw_status_code (*iw_send)(struct i40iw_qp_uk *,
-					  struct i40iw_post_sq_info *, u32, bool);
-	enum i40iw_status_code (*iw_inline_rdma_write)(struct i40iw_qp_uk *,
-						       struct i40iw_post_sq_info *, bool);
-	enum i40iw_status_code (*iw_inline_send)(struct i40iw_qp_uk *,
-						 struct i40iw_post_sq_info *, u32, bool);
-	enum i40iw_status_code (*iw_stag_local_invalidate)(struct i40iw_qp_uk *,
-							   struct i40iw_post_sq_info *, bool);
-	enum i40iw_status_code (*iw_mw_bind)(struct i40iw_qp_uk *,
-					     struct i40iw_post_sq_info *, bool);
-	enum i40iw_status_code (*iw_post_receive)(struct i40iw_qp_uk *,
-						  struct i40iw_post_rq_info *);
-	enum i40iw_status_code (*iw_post_nop)(struct i40iw_qp_uk *, u64, bool, bool);
-};
-
-struct i40iw_cq_ops {
-	void (*iw_cq_request_notification)(struct i40iw_cq_uk *,
-					   enum i40iw_completion_notify);
-	enum i40iw_status_code (*iw_cq_poll_completion)(struct i40iw_cq_uk *,
-							struct i40iw_cq_poll_info *);
-	enum i40iw_status_code (*iw_cq_post_entries)(struct i40iw_cq_uk *, u8 count);
-	void (*iw_cq_clean)(void *, struct i40iw_cq_uk *);
-};
-
-struct i40iw_dev_uk;
-
-struct i40iw_device_uk_ops {
-	enum i40iw_status_code (*iwarp_cq_uk_init)(struct i40iw_cq_uk *,
-						   struct i40iw_cq_uk_init_info *);
-	enum i40iw_status_code (*iwarp_qp_uk_init)(struct i40iw_qp_uk *,
-						   struct i40iw_qp_uk_init_info *);
-};
-
-struct i40iw_dev_uk {
-	struct i40iw_device_uk_ops ops_uk;
-};
-
-struct i40iw_sq_uk_wr_trk_info {
-	u64 wrid;
-	u32 wr_len;
-	u8 wqe_size;
-	u8 reserved[3];
-};
-
-struct i40iw_qp_quanta {
-	u64 elem[I40IW_WQE_SIZE];
-};
-
-struct i40iw_qp_uk {
-	struct i40iw_qp_quanta *sq_base;
-	struct i40iw_qp_quanta *rq_base;
-	u32 __iomem *wqe_alloc_reg;
-	struct i40iw_sq_uk_wr_trk_info *sq_wrtrk_array;
-	u64 *rq_wrid_array;
-	u64 *shadow_area;
-	struct i40iw_ring sq_ring;
-	struct i40iw_ring rq_ring;
-	struct i40iw_ring initial_ring;
-	u32 qp_id;
-	u32 sq_size;
-	u32 rq_size;
-	u32 max_sq_frag_cnt;
-	u32 max_rq_frag_cnt;
-	struct i40iw_qp_uk_ops ops;
-	bool use_srq;
-	u8 swqe_polarity;
-	u8 swqe_polarity_deferred;
-	u8 rwqe_polarity;
-	u8 rq_wqe_size;
-	u8 rq_wqe_size_multiplier;
-	bool first_sq_wq;
-	bool deferred_flag;
-};
-
-struct i40iw_cq_uk {
-	struct i40iw_cqe *cq_base;
-	u32 __iomem *cqe_alloc_reg;
-	u64 *shadow_area;
-	u32 cq_id;
-	u32 cq_size;
-	struct i40iw_ring cq_ring;
-	u8 polarity;
-	bool avoid_mem_cflct;
-
-	struct i40iw_cq_ops ops;
-};
-
-struct i40iw_qp_uk_init_info {
-	struct i40iw_qp_quanta *sq;
-	struct i40iw_qp_quanta *rq;
-	u32 __iomem *wqe_alloc_reg;
-	u64 *shadow_area;
-	struct i40iw_sq_uk_wr_trk_info *sq_wrtrk_array;
-	u64 *rq_wrid_array;
-	u32 qp_id;
-	u32 sq_size;
-	u32 rq_size;
-	u32 max_sq_frag_cnt;
-	u32 max_rq_frag_cnt;
-	u32 max_inline_data;
-	int abi_ver;
-};
-
-struct i40iw_cq_uk_init_info {
-	u32 __iomem *cqe_alloc_reg;
-	struct i40iw_cqe *cq_base;
-	u64 *shadow_area;
-	u32 cq_size;
-	u32 cq_id;
-	bool avoid_mem_cflct;
-};
-
-void i40iw_device_init_uk(struct i40iw_dev_uk *dev);
-
-void i40iw_qp_post_wr(struct i40iw_qp_uk *qp);
-u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx,
-				u8 wqe_size,
-				u32 total_size,
-				u64 wr_id
-				);
-u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx);
-u64 *i40iw_qp_get_next_srq_wqe(struct i40iw_srq_uk *srq, u32 *wqe_idx);
-
-enum i40iw_status_code i40iw_cq_uk_init(struct i40iw_cq_uk *cq,
-					struct i40iw_cq_uk_init_info *info);
-enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp,
-					struct i40iw_qp_uk_init_info *info);
-
-void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq);
-enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, u64 wr_id,
-				 bool signaled, bool post_sq);
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size);
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size);
-enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size,
-							 u8 *wqe_size);
-void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift);
-enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth);
-enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth);
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
deleted file mode 100644
index 9ff825f7860b..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ /dev/null
@@ -1,1518 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-#include <net/netevent.h>
-#include <net/neighbour.h>
-#include "i40iw.h"
-
-/**
- * i40iw_arp_table - manage arp table
- * @iwdev: iwarp device
- * @ip_addr: ip address for device
- * @ipv4: flag indicating IPv4 when true
- * @mac_addr: mac address ptr
- * @action: modify, delete or add
- */
-int i40iw_arp_table(struct i40iw_device *iwdev,
-		    u32 *ip_addr,
-		    bool ipv4,
-		    u8 *mac_addr,
-		    u32 action)
-{
-	int arp_index;
-	int err;
-	u32 ip[4];
-
-	if (ipv4) {
-		memset(ip, 0, sizeof(ip));
-		ip[0] = *ip_addr;
-	} else {
-		memcpy(ip, ip_addr, sizeof(ip));
-	}
-
-	for (arp_index = 0; (u32)arp_index < iwdev->arp_table_size; arp_index++)
-		if (memcmp(iwdev->arp_table[arp_index].ip_addr, ip, sizeof(ip)) == 0)
-			break;
-	switch (action) {
-	case I40IW_ARP_ADD:
-		if (arp_index != iwdev->arp_table_size)
-			return -1;
-
-		arp_index = 0;
-		err = i40iw_alloc_resource(iwdev, iwdev->allocated_arps,
-					   iwdev->arp_table_size,
-					   (u32 *)&arp_index,
-					   &iwdev->next_arp_index);
-
-		if (err)
-			return err;
-
-		memcpy(iwdev->arp_table[arp_index].ip_addr, ip, sizeof(ip));
-		ether_addr_copy(iwdev->arp_table[arp_index].mac_addr, mac_addr);
-		break;
-	case I40IW_ARP_RESOLVE:
-		if (arp_index == iwdev->arp_table_size)
-			return -1;
-		break;
-	case I40IW_ARP_DELETE:
-		if (arp_index == iwdev->arp_table_size)
-			return -1;
-		memset(iwdev->arp_table[arp_index].ip_addr, 0,
-		       sizeof(iwdev->arp_table[arp_index].ip_addr));
-		eth_zero_addr(iwdev->arp_table[arp_index].mac_addr);
-		i40iw_free_resource(iwdev, iwdev->allocated_arps, arp_index);
-		break;
-	default:
-		return -1;
-	}
-	return arp_index;
-}
-
-/**
- * i40iw_wr32 - write 32 bits to hw register
- * @hw: hardware information including registers
- * @reg: register offset
- * @value: vvalue to write to register
- */
-inline void i40iw_wr32(struct i40iw_hw *hw, u32 reg, u32 value)
-{
-	writel(value, hw->hw_addr + reg);
-}
-
-/**
- * i40iw_rd32 - read a 32 bit hw register
- * @hw: hardware information including registers
- * @reg: register offset
- *
- * Return value of register content
- */
-inline u32 i40iw_rd32(struct i40iw_hw *hw, u32 reg)
-{
-	return readl(hw->hw_addr + reg);
-}
-
-/**
- * i40iw_inetaddr_event - system notifier for ipv4 addr events
- * @notifier: not used
- * @event: event for notifier
- * @ptr: if address
- */
-int i40iw_inetaddr_event(struct notifier_block *notifier,
-			 unsigned long event,
-			 void *ptr)
-{
-	struct in_ifaddr *ifa = ptr;
-	struct net_device *event_netdev = ifa->ifa_dev->dev;
-	struct net_device *netdev;
-	struct net_device *upper_dev;
-	struct i40iw_device *iwdev;
-	struct i40iw_handler *hdl;
-	u32 local_ipaddr;
-	u32 action = I40IW_ARP_ADD;
-
-	hdl = i40iw_find_netdev(event_netdev);
-	if (!hdl)
-		return NOTIFY_DONE;
-
-	iwdev = &hdl->device;
-	if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing)
-		return NOTIFY_DONE;
-
-	netdev = iwdev->ldev->netdev;
-	upper_dev = netdev_master_upper_dev_get(netdev);
-	if (netdev != event_netdev)
-		return NOTIFY_DONE;
-
-	if (upper_dev) {
-		struct in_device *in;
-
-		rcu_read_lock();
-		in = __in_dev_get_rcu(upper_dev);
-
-		local_ipaddr = 0;
-		if (in) {
-			struct in_ifaddr *ifa;
-
-			ifa = rcu_dereference(in->ifa_list);
-			if (ifa)
-				local_ipaddr = ntohl(ifa->ifa_address);
-		}
-
-		rcu_read_unlock();
-	} else {
-		local_ipaddr = ntohl(ifa->ifa_address);
-	}
-	switch (event) {
-	case NETDEV_DOWN:
-		action = I40IW_ARP_DELETE;
-		fallthrough;
-	case NETDEV_UP:
-	case NETDEV_CHANGEADDR:
-
-		/* Just skip if no need to handle ARP cache */
-		if (!local_ipaddr)
-			break;
-
-		i40iw_manage_arp_cache(iwdev,
-				       netdev->dev_addr,
-				       &local_ipaddr,
-				       true,
-				       action);
-		i40iw_if_notify(iwdev, netdev, &local_ipaddr, true,
-				(action == I40IW_ARP_ADD) ? true : false);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-/**
- * i40iw_inet6addr_event - system notifier for ipv6 addr events
- * @notifier: not used
- * @event: event for notifier
- * @ptr: if address
- */
-int i40iw_inet6addr_event(struct notifier_block *notifier,
-			  unsigned long event,
-			  void *ptr)
-{
-	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
-	struct net_device *event_netdev = ifa->idev->dev;
-	struct net_device *netdev;
-	struct i40iw_device *iwdev;
-	struct i40iw_handler *hdl;
-	u32 local_ipaddr6[4];
-	u32 action = I40IW_ARP_ADD;
-
-	hdl = i40iw_find_netdev(event_netdev);
-	if (!hdl)
-		return NOTIFY_DONE;
-
-	iwdev = &hdl->device;
-	if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing)
-		return NOTIFY_DONE;
-
-	netdev = iwdev->ldev->netdev;
-	if (netdev != event_netdev)
-		return NOTIFY_DONE;
-
-	i40iw_copy_ip_ntohl(local_ipaddr6, ifa->addr.in6_u.u6_addr32);
-	switch (event) {
-	case NETDEV_DOWN:
-		action = I40IW_ARP_DELETE;
-		fallthrough;
-	case NETDEV_UP:
-	case NETDEV_CHANGEADDR:
-		i40iw_manage_arp_cache(iwdev,
-				       netdev->dev_addr,
-				       local_ipaddr6,
-				       false,
-				       action);
-		i40iw_if_notify(iwdev, netdev, local_ipaddr6, false,
-				(action == I40IW_ARP_ADD) ? true : false);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-/**
- * i40iw_net_event - system notifier for netevents
- * @notifier: not used
- * @event: event for notifier
- * @ptr: neighbor
- */
-int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void *ptr)
-{
-	struct neighbour *neigh = ptr;
-	struct i40iw_device *iwdev;
-	struct i40iw_handler *iwhdl;
-	__be32 *p;
-	u32 local_ipaddr[4];
-
-	switch (event) {
-	case NETEVENT_NEIGH_UPDATE:
-		iwhdl = i40iw_find_netdev((struct net_device *)neigh->dev);
-		if (!iwhdl)
-			return NOTIFY_DONE;
-		iwdev = &iwhdl->device;
-		if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing)
-			return NOTIFY_DONE;
-		p = (__be32 *)neigh->primary_key;
-		i40iw_copy_ip_ntohl(local_ipaddr, p);
-		if (neigh->nud_state & NUD_VALID) {
-			i40iw_manage_arp_cache(iwdev,
-					       neigh->ha,
-					       local_ipaddr,
-					       false,
-					       I40IW_ARP_ADD);
-
-		} else {
-			i40iw_manage_arp_cache(iwdev,
-					       neigh->ha,
-					       local_ipaddr,
-					       false,
-					       I40IW_ARP_DELETE);
-		}
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-/**
- * i40iw_netdevice_event - system notifier for netdev events
- * @notifier: not used
- * @event: event for notifier
- * @ptr: netdev
- */
-int i40iw_netdevice_event(struct notifier_block *notifier,
-			  unsigned long event,
-			  void *ptr)
-{
-	struct net_device *event_netdev;
-	struct net_device *netdev;
-	struct i40iw_device *iwdev;
-	struct i40iw_handler *hdl;
-
-	event_netdev = netdev_notifier_info_to_dev(ptr);
-
-	hdl = i40iw_find_netdev(event_netdev);
-	if (!hdl)
-		return NOTIFY_DONE;
-
-	iwdev = &hdl->device;
-	if (iwdev->init_state < RDMA_DEV_REGISTERED || iwdev->closing)
-		return NOTIFY_DONE;
-
-	netdev = iwdev->ldev->netdev;
-	if (netdev != event_netdev)
-		return NOTIFY_DONE;
-
-	iwdev->iw_status = 1;
-
-	switch (event) {
-	case NETDEV_DOWN:
-		iwdev->iw_status = 0;
-		fallthrough;
-	case NETDEV_UP:
-		i40iw_port_ibevent(iwdev);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-/**
- * i40iw_get_cqp_request - get cqp struct
- * @cqp: device cqp ptr
- * @wait: cqp to be used in wait mode
- */
-struct i40iw_cqp_request *i40iw_get_cqp_request(struct i40iw_cqp *cqp, bool wait)
-{
-	struct i40iw_cqp_request *cqp_request = NULL;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cqp->req_lock, flags);
-	if (!list_empty(&cqp->cqp_avail_reqs)) {
-		cqp_request = list_entry(cqp->cqp_avail_reqs.next,
-					 struct i40iw_cqp_request, list);
-		list_del_init(&cqp_request->list);
-	}
-	spin_unlock_irqrestore(&cqp->req_lock, flags);
-	if (!cqp_request) {
-		cqp_request = kzalloc(sizeof(*cqp_request), GFP_ATOMIC);
-		if (cqp_request) {
-			cqp_request->dynamic = true;
-			INIT_LIST_HEAD(&cqp_request->list);
-			init_waitqueue_head(&cqp_request->waitq);
-		}
-	}
-	if (!cqp_request) {
-		i40iw_pr_err("CQP Request Fail: No Memory");
-		return NULL;
-	}
-
-	if (wait) {
-		atomic_set(&cqp_request->refcount, 2);
-		cqp_request->waiting = true;
-	} else {
-		atomic_set(&cqp_request->refcount, 1);
-	}
-	return cqp_request;
-}
-
-/**
- * i40iw_free_cqp_request - free cqp request
- * @cqp: cqp ptr
- * @cqp_request: to be put back in cqp list
- */
-void i40iw_free_cqp_request(struct i40iw_cqp *cqp, struct i40iw_cqp_request *cqp_request)
-{
-	struct i40iw_device *iwdev = container_of(cqp, struct i40iw_device, cqp);
-	unsigned long flags;
-
-	if (cqp_request->dynamic) {
-		kfree(cqp_request);
-	} else {
-		cqp_request->request_done = false;
-		cqp_request->callback_fcn = NULL;
-		cqp_request->waiting = false;
-
-		spin_lock_irqsave(&cqp->req_lock, flags);
-		list_add_tail(&cqp_request->list, &cqp->cqp_avail_reqs);
-		spin_unlock_irqrestore(&cqp->req_lock, flags);
-	}
-	wake_up(&iwdev->close_wq);
-}
-
-/**
- * i40iw_put_cqp_request - dec ref count and free if 0
- * @cqp: cqp ptr
- * @cqp_request: to be put back in cqp list
- */
-void i40iw_put_cqp_request(struct i40iw_cqp *cqp,
-			   struct i40iw_cqp_request *cqp_request)
-{
-	if (atomic_dec_and_test(&cqp_request->refcount))
-		i40iw_free_cqp_request(cqp, cqp_request);
-}
-
-/**
- * i40iw_free_pending_cqp_request -free pending cqp request objs
- * @cqp: cqp ptr
- * @cqp_request: to be put back in cqp list
- */
-static void i40iw_free_pending_cqp_request(struct i40iw_cqp *cqp,
-					   struct i40iw_cqp_request *cqp_request)
-{
-	struct i40iw_device *iwdev = container_of(cqp, struct i40iw_device, cqp);
-
-	if (cqp_request->waiting) {
-		cqp_request->compl_info.error = true;
-		cqp_request->request_done = true;
-		wake_up(&cqp_request->waitq);
-	}
-	i40iw_put_cqp_request(cqp, cqp_request);
-	wait_event_timeout(iwdev->close_wq,
-			   !atomic_read(&cqp_request->refcount),
-			   1000);
-}
-
-/**
- * i40iw_cleanup_pending_cqp_op - clean-up cqp with no completions
- * @iwdev: iwarp device
- */
-void i40iw_cleanup_pending_cqp_op(struct i40iw_device *iwdev)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_cqp *cqp = &iwdev->cqp;
-	struct i40iw_cqp_request *cqp_request = NULL;
-	struct cqp_commands_info *pcmdinfo = NULL;
-	u32 i, pending_work, wqe_idx;
-
-	pending_work = I40IW_RING_WORK_AVAILABLE(cqp->sc_cqp.sq_ring);
-	wqe_idx = I40IW_RING_GETCURRENT_TAIL(cqp->sc_cqp.sq_ring);
-	for (i = 0; i < pending_work; i++) {
-		cqp_request = (struct i40iw_cqp_request *)(unsigned long)cqp->scratch_array[wqe_idx];
-		if (cqp_request)
-			i40iw_free_pending_cqp_request(cqp, cqp_request);
-		wqe_idx = (wqe_idx + 1) % I40IW_RING_GETSIZE(cqp->sc_cqp.sq_ring);
-	}
-
-	while (!list_empty(&dev->cqp_cmd_head)) {
-		pcmdinfo = (struct cqp_commands_info *)i40iw_remove_head(&dev->cqp_cmd_head);
-		cqp_request = container_of(pcmdinfo, struct i40iw_cqp_request, info);
-		if (cqp_request)
-			i40iw_free_pending_cqp_request(cqp, cqp_request);
-	}
-}
-
-/**
- * i40iw_wait_event - wait for completion
- * @iwdev: iwarp device
- * @cqp_request: cqp request to wait
- */
-static int i40iw_wait_event(struct i40iw_device *iwdev,
-			    struct i40iw_cqp_request *cqp_request)
-{
-	struct cqp_commands_info *info = &cqp_request->info;
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_cqp_timeout cqp_timeout;
-	bool cqp_error = false;
-	int err_code = 0;
-	memset(&cqp_timeout, 0, sizeof(cqp_timeout));
-	cqp_timeout.compl_cqp_cmds = iwdev->sc_dev.cqp_cmd_stats[OP_COMPLETED_COMMANDS];
-	do {
-		if (wait_event_timeout(cqp_request->waitq,
-				       cqp_request->request_done, CQP_COMPL_WAIT_TIME))
-			break;
-
-		i40iw_check_cqp_progress(&cqp_timeout, &iwdev->sc_dev);
-
-		if (cqp_timeout.count < CQP_TIMEOUT_THRESHOLD)
-			continue;
-
-		i40iw_pr_err("error cqp command 0x%x timed out", info->cqp_cmd);
-		err_code = -ETIME;
-		if (!iwdev->reset) {
-			iwdev->reset = true;
-			i40iw_request_reset(iwdev);
-		}
-		goto done;
-	} while (1);
-	cqp_error = cqp_request->compl_info.error;
-	if (cqp_error) {
-		i40iw_pr_err("error cqp command 0x%x completion maj = 0x%x min=0x%x\n",
-			     info->cqp_cmd, cqp_request->compl_info.maj_err_code,
-			     cqp_request->compl_info.min_err_code);
-		err_code = -EPROTO;
-		goto done;
-	}
-done:
-	i40iw_put_cqp_request(iwcqp, cqp_request);
-	return err_code;
-}
-
-/**
- * i40iw_handle_cqp_op - process cqp command
- * @iwdev: iwarp device
- * @cqp_request: cqp request to process
- */
-enum i40iw_status_code i40iw_handle_cqp_op(struct i40iw_device *iwdev,
-					   struct i40iw_cqp_request
-					   *cqp_request)
-{
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	enum i40iw_status_code status;
-	struct cqp_commands_info *info = &cqp_request->info;
-	int err_code = 0;
-
-	if (iwdev->reset) {
-		i40iw_free_cqp_request(&iwdev->cqp, cqp_request);
-		return I40IW_ERR_CQP_COMPL_ERROR;
-	}
-
-	status = i40iw_process_cqp_cmd(dev, info);
-	if (status) {
-		i40iw_pr_err("error cqp command 0x%x failed\n", info->cqp_cmd);
-		i40iw_free_cqp_request(&iwdev->cqp, cqp_request);
-		return status;
-	}
-	if (cqp_request->waiting)
-		err_code = i40iw_wait_event(iwdev, cqp_request);
-	if (err_code)
-		status = I40IW_ERR_CQP_COMPL_ERROR;
-	return status;
-}
-
-/**
- * i40iw_add_devusecount - add dev refcount
- * @iwdev: dev for refcount
- */
-void i40iw_add_devusecount(struct i40iw_device *iwdev)
-{
-	atomic64_inc(&iwdev->use_count);
-}
-
-/**
- * i40iw_rem_devusecount - decrement refcount for dev
- * @iwdev: device
- */
-void i40iw_rem_devusecount(struct i40iw_device *iwdev)
-{
-	if (!atomic64_dec_and_test(&iwdev->use_count))
-		return;
-	wake_up(&iwdev->close_wq);
-}
-
-/**
- * i40iw_add_pdusecount - add pd refcount
- * @iwpd: pd for refcount
- */
-void i40iw_add_pdusecount(struct i40iw_pd *iwpd)
-{
-	atomic_inc(&iwpd->usecount);
-}
-
-/**
- * i40iw_rem_pdusecount - decrement refcount for pd and free if 0
- * @iwpd: pd for refcount
- * @iwdev: iwarp device
- */
-void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev)
-{
-	if (!atomic_dec_and_test(&iwpd->usecount))
-		return;
-	i40iw_free_resource(iwdev, iwdev->allocated_pds, iwpd->sc_pd.pd_id);
-}
-
-/**
- * i40iw_qp_add_ref - add refcount for qp
- * @ibqp: iqarp qp
- */
-void i40iw_qp_add_ref(struct ib_qp *ibqp)
-{
-	struct i40iw_qp *iwqp = (struct i40iw_qp *)ibqp;
-
-	refcount_inc(&iwqp->refcount);
-}
-
-/**
- * i40iw_qp_rem_ref - rem refcount for qp and free if 0
- * @ibqp: iqarp qp
- */
-void i40iw_qp_rem_ref(struct ib_qp *ibqp)
-{
-	struct i40iw_qp *iwqp;
-	struct i40iw_device *iwdev;
-	u32 qp_num;
-	unsigned long flags;
-
-	iwqp = to_iwqp(ibqp);
-	iwdev = iwqp->iwdev;
-	spin_lock_irqsave(&iwdev->qptable_lock, flags);
-	if (!refcount_dec_and_test(&iwqp->refcount)) {
-		spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
-		return;
-	}
-
-	qp_num = iwqp->ibqp.qp_num;
-	iwdev->qp_table[qp_num] = NULL;
-	spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
-	complete(&iwqp->free_qp);
-
-}
-
-/**
- * i40iw_get_qp - get qp address
- * @device: iwarp device
- * @qpn: qp number
- */
-struct ib_qp *i40iw_get_qp(struct ib_device *device, int qpn)
-{
-	struct i40iw_device *iwdev = to_iwdev(device);
-
-	if ((qpn < IW_FIRST_QPN) || (qpn >= iwdev->max_qp))
-		return NULL;
-
-	return &iwdev->qp_table[qpn]->ibqp;
-}
-
-/**
- * i40iw_debug_buf - print debug msg and buffer is mask set
- * @dev: hardware control device structure
- * @mask: mask to compare if to print debug buffer
- * @desc: identifying string
- * @buf: points buffer addr
- * @size: saize of buffer to print
- */
-void i40iw_debug_buf(struct i40iw_sc_dev *dev,
-		     enum i40iw_debug_flag mask,
-		     char *desc,
-		     u64 *buf,
-		     u32 size)
-{
-	u32 i;
-
-	if (!(dev->debug_mask & mask))
-		return;
-	i40iw_debug(dev, mask, "%s\n", desc);
-	i40iw_debug(dev, mask, "starting address virt=%p phy=%llxh\n", buf,
-		    (unsigned long long)virt_to_phys(buf));
-
-	for (i = 0; i < size; i += 8)
-		i40iw_debug(dev, mask, "index %03d val: %016llx\n", i, buf[i / 8]);
-}
-
-/**
- * i40iw_get_hw_addr - return hw addr
- * @par: points to shared dev
- */
-u8 __iomem *i40iw_get_hw_addr(void *par)
-{
-	struct i40iw_sc_dev *dev = (struct i40iw_sc_dev *)par;
-
-	return dev->hw->hw_addr;
-}
-
-/**
- * i40iw_remove_head - return head entry and remove from list
- * @list: list for entry
- */
-void *i40iw_remove_head(struct list_head *list)
-{
-	struct list_head *entry;
-
-	if (list_empty(list))
-		return NULL;
-
-	entry = (void *)list->next;
-	list_del(entry);
-	return (void *)entry;
-}
-
-/**
- * i40iw_allocate_dma_mem - Memory alloc helper fn
- * @hw:   pointer to the HW structure
- * @mem:  ptr to mem struct to fill out
- * @size: size of memory requested
- * @alignment: what to align the allocation to
- */
-enum i40iw_status_code i40iw_allocate_dma_mem(struct i40iw_hw *hw,
-					      struct i40iw_dma_mem *mem,
-					      u64 size,
-					      u32 alignment)
-{
-	struct pci_dev *pcidev = hw->pcidev;
-
-	if (!mem)
-		return I40IW_ERR_PARAM;
-	mem->size = ALIGN(size, alignment);
-	mem->va = dma_alloc_coherent(&pcidev->dev, mem->size,
-				     (dma_addr_t *)&mem->pa, GFP_KERNEL);
-	if (!mem->va)
-		return I40IW_ERR_NO_MEMORY;
-	return 0;
-}
-
-/**
- * i40iw_free_dma_mem - Memory free helper fn
- * @hw:   pointer to the HW structure
- * @mem:  ptr to mem struct to free
- */
-void i40iw_free_dma_mem(struct i40iw_hw *hw, struct i40iw_dma_mem *mem)
-{
-	struct pci_dev *pcidev = hw->pcidev;
-
-	if (!mem || !mem->va)
-		return;
-
-	dma_free_coherent(&pcidev->dev, mem->size,
-			  mem->va, (dma_addr_t)mem->pa);
-	mem->va = NULL;
-}
-
-/**
- * i40iw_allocate_virt_mem - virtual memory alloc helper fn
- * @hw:   pointer to the HW structure
- * @mem:  ptr to mem struct to fill out
- * @size: size of memory requested
- */
-enum i40iw_status_code i40iw_allocate_virt_mem(struct i40iw_hw *hw,
-					       struct i40iw_virt_mem *mem,
-					       u32 size)
-{
-	if (!mem)
-		return I40IW_ERR_PARAM;
-
-	mem->size = size;
-	mem->va = kzalloc(size, GFP_KERNEL);
-
-	if (mem->va)
-		return 0;
-	else
-		return I40IW_ERR_NO_MEMORY;
-}
-
-/**
- * i40iw_free_virt_mem - virtual memory free helper fn
- * @hw:   pointer to the HW structure
- * @mem:  ptr to mem struct to free
- */
-enum i40iw_status_code i40iw_free_virt_mem(struct i40iw_hw *hw,
-					   struct i40iw_virt_mem *mem)
-{
-	if (!mem)
-		return I40IW_ERR_PARAM;
-	/*
-	 * mem->va points to the parent of mem, so both mem and mem->va
-	 * can not be touched once mem->va is freed
-	 */
-	kfree(mem->va);
-	return 0;
-}
-
-/**
- * i40iw_cqp_sds_cmd - create cqp command for sd
- * @dev: hardware control device structure
- * @sdinfo: information  for sd cqp
- *
- */
-enum i40iw_status_code i40iw_cqp_sds_cmd(struct i40iw_sc_dev *dev,
-					 struct i40iw_update_sds_info *sdinfo)
-{
-	enum i40iw_status_code status;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-	cqp_info = &cqp_request->info;
-	memcpy(&cqp_info->in.u.update_pe_sds.info, sdinfo,
-	       sizeof(cqp_info->in.u.update_pe_sds.info));
-	cqp_info->cqp_cmd = OP_UPDATE_PE_SDS;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.update_pe_sds.dev = dev;
-	cqp_info->in.u.update_pe_sds.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Update SD's fail");
-	return status;
-}
-
-/**
- * i40iw_qp_suspend_resume - cqp command for suspend/resume
- * @dev: hardware control device structure
- * @qp: hardware control qp
- * @suspend: flag if suspend or resume
- */
-void i40iw_qp_suspend_resume(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp, bool suspend)
-{
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-	struct i40iw_cqp_request *cqp_request;
-	struct i40iw_sc_cqp *cqp = dev->cqp;
-	struct cqp_commands_info *cqp_info;
-	enum i40iw_status_code status;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false);
-	if (!cqp_request)
-		return;
-
-	cqp_info = &cqp_request->info;
-	cqp_info->cqp_cmd = (suspend) ? OP_SUSPEND : OP_RESUME;
-	cqp_info->in.u.suspend_resume.cqp = cqp;
-	cqp_info->in.u.suspend_resume.qp = qp;
-	cqp_info->in.u.suspend_resume.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP QP Suspend/Resume fail");
-}
-
-/**
- * i40iw_term_modify_qp - modify qp for term message
- * @qp: hardware control qp
- * @next_state: qp's next state
- * @term: terminate code
- * @term_len: length
- */
-void i40iw_term_modify_qp(struct i40iw_sc_qp *qp, u8 next_state, u8 term, u8 term_len)
-{
-	struct i40iw_qp *iwqp;
-
-	iwqp = (struct i40iw_qp *)qp->back_qp;
-	i40iw_next_iw_state(iwqp, next_state, 0, term, term_len);
-};
-
-/**
- * i40iw_terminate_done - after terminate is completed
- * @qp: hardware control qp
- * @timeout_occurred: indicates if terminate timer expired
- */
-void i40iw_terminate_done(struct i40iw_sc_qp *qp, int timeout_occurred)
-{
-	struct i40iw_qp *iwqp;
-	u32 next_iwarp_state = I40IW_QP_STATE_ERROR;
-	u8 hte = 0;
-	bool first_time;
-	unsigned long flags;
-
-	iwqp = (struct i40iw_qp *)qp->back_qp;
-	spin_lock_irqsave(&iwqp->lock, flags);
-	if (iwqp->hte_added) {
-		iwqp->hte_added = 0;
-		hte = 1;
-	}
-	first_time = !(qp->term_flags & I40IW_TERM_DONE);
-	qp->term_flags |= I40IW_TERM_DONE;
-	spin_unlock_irqrestore(&iwqp->lock, flags);
-	if (first_time) {
-		if (!timeout_occurred)
-			i40iw_terminate_del_timer(qp);
-		else
-			next_iwarp_state = I40IW_QP_STATE_CLOSING;
-
-		i40iw_next_iw_state(iwqp, next_iwarp_state, hte, 0, 0);
-		i40iw_cm_disconn(iwqp);
-	}
-}
-
-/**
- * i40iw_terminate_timeout - timeout happened
- * @t: points to iwarp qp
- */
-static void i40iw_terminate_timeout(struct timer_list *t)
-{
-	struct i40iw_qp *iwqp = from_timer(iwqp, t, terminate_timer);
-	struct i40iw_sc_qp *qp = (struct i40iw_sc_qp *)&iwqp->sc_qp;
-
-	i40iw_terminate_done(qp, 1);
-	i40iw_qp_rem_ref(&iwqp->ibqp);
-}
-
-/**
- * i40iw_terminate_start_timer - start terminate timeout
- * @qp: hardware control qp
- */
-void i40iw_terminate_start_timer(struct i40iw_sc_qp *qp)
-{
-	struct i40iw_qp *iwqp;
-
-	iwqp = (struct i40iw_qp *)qp->back_qp;
-	i40iw_qp_add_ref(&iwqp->ibqp);
-	timer_setup(&iwqp->terminate_timer, i40iw_terminate_timeout, 0);
-	iwqp->terminate_timer.expires = jiffies + HZ;
-	add_timer(&iwqp->terminate_timer);
-}
-
-/**
- * i40iw_terminate_del_timer - delete terminate timeout
- * @qp: hardware control qp
- */
-void i40iw_terminate_del_timer(struct i40iw_sc_qp *qp)
-{
-	struct i40iw_qp *iwqp;
-
-	iwqp = (struct i40iw_qp *)qp->back_qp;
-	if (del_timer(&iwqp->terminate_timer))
-		i40iw_qp_rem_ref(&iwqp->ibqp);
-}
-
-/**
- * i40iw_cqp_generic_worker - generic worker for cqp
- * @work: work pointer
- */
-static void i40iw_cqp_generic_worker(struct work_struct *work)
-{
-	struct i40iw_virtchnl_work_info *work_info =
-	    &((struct virtchnl_work *)work)->work_info;
-
-	if (work_info->worker_vf_dev)
-		work_info->callback_fcn(work_info->worker_vf_dev);
-}
-
-/**
- * i40iw_cqp_spawn_worker - spawn worket thread
- * @dev: device struct pointer
- * @work_info: work request info
- * @iw_vf_idx: virtual function index
- */
-void i40iw_cqp_spawn_worker(struct i40iw_sc_dev *dev,
-			    struct i40iw_virtchnl_work_info *work_info,
-			    u32 iw_vf_idx)
-{
-	struct virtchnl_work *work;
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	work = &iwdev->virtchnl_w[iw_vf_idx];
-	memcpy(&work->work_info, work_info, sizeof(*work_info));
-	INIT_WORK(&work->work, i40iw_cqp_generic_worker);
-	queue_work(iwdev->virtchnl_wq, &work->work);
-}
-
-/**
- * i40iw_cqp_manage_hmc_fcn_worker -
- * @work: work pointer for hmc info
- */
-static void i40iw_cqp_manage_hmc_fcn_worker(struct work_struct *work)
-{
-	struct i40iw_cqp_request *cqp_request =
-	    ((struct virtchnl_work *)work)->cqp_request;
-	struct i40iw_ccq_cqe_info ccq_cqe_info;
-	struct i40iw_hmc_fcn_info *hmcfcninfo =
-			&cqp_request->info.in.u.manage_hmc_pm.info;
-	struct i40iw_device *iwdev =
-	    (struct i40iw_device *)cqp_request->info.in.u.manage_hmc_pm.dev->back_dev;
-
-	ccq_cqe_info.cqp = NULL;
-	ccq_cqe_info.maj_err_code = cqp_request->compl_info.maj_err_code;
-	ccq_cqe_info.min_err_code = cqp_request->compl_info.min_err_code;
-	ccq_cqe_info.op_code = cqp_request->compl_info.op_code;
-	ccq_cqe_info.op_ret_val = cqp_request->compl_info.op_ret_val;
-	ccq_cqe_info.scratch = 0;
-	ccq_cqe_info.error = cqp_request->compl_info.error;
-	hmcfcninfo->callback_fcn(cqp_request->info.in.u.manage_hmc_pm.dev,
-				 hmcfcninfo->cqp_callback_param, &ccq_cqe_info);
-	i40iw_put_cqp_request(&iwdev->cqp, cqp_request);
-}
-
-/**
- * i40iw_cqp_manage_hmc_fcn_callback - called function after cqp completion
- * @cqp_request: cqp request info struct for hmc fun
- * @unused: unused param of callback
- */
-static void i40iw_cqp_manage_hmc_fcn_callback(struct i40iw_cqp_request *cqp_request,
-					      u32 unused)
-{
-	struct virtchnl_work *work;
-	struct i40iw_hmc_fcn_info *hmcfcninfo =
-	    &cqp_request->info.in.u.manage_hmc_pm.info;
-	struct i40iw_device *iwdev =
-	    (struct i40iw_device *)cqp_request->info.in.u.manage_hmc_pm.dev->
-	    back_dev;
-
-	if (hmcfcninfo && hmcfcninfo->callback_fcn) {
-		i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_HMC, "%s1\n", __func__);
-		atomic_inc(&cqp_request->refcount);
-		work = &iwdev->virtchnl_w[hmcfcninfo->iw_vf_idx];
-		work->cqp_request = cqp_request;
-		INIT_WORK(&work->work, i40iw_cqp_manage_hmc_fcn_worker);
-		queue_work(iwdev->virtchnl_wq, &work->work);
-		i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_HMC, "%s2\n", __func__);
-	} else {
-		i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_HMC, "%s: Something wrong\n", __func__);
-	}
-}
-
-/**
- * i40iw_cqp_manage_hmc_fcn_cmd - issue cqp command to manage hmc
- * @dev: hardware control device structure
- * @hmcfcninfo: info for hmc
- */
-enum i40iw_status_code i40iw_cqp_manage_hmc_fcn_cmd(struct i40iw_sc_dev *dev,
-						    struct i40iw_hmc_fcn_info *hmcfcninfo)
-{
-	enum i40iw_status_code status;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_HMC, "%s\n", __func__);
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-	cqp_info = &cqp_request->info;
-	cqp_request->callback_fcn = i40iw_cqp_manage_hmc_fcn_callback;
-	cqp_request->param = hmcfcninfo;
-	memcpy(&cqp_info->in.u.manage_hmc_pm.info, hmcfcninfo,
-	       sizeof(*hmcfcninfo));
-	cqp_info->in.u.manage_hmc_pm.dev = dev;
-	cqp_info->cqp_cmd = OP_MANAGE_HMC_PM_FUNC_TABLE;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.manage_hmc_pm.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Manage HMC fail");
-	return status;
-}
-
-/**
- * i40iw_cqp_query_fpm_values_cmd - send cqp command for fpm
- * @dev: function device struct
- * @values_mem: buffer for fpm
- * @hmc_fn_id: function id for fpm
- */
-enum i40iw_status_code i40iw_cqp_query_fpm_values_cmd(struct i40iw_sc_dev *dev,
-						      struct i40iw_dma_mem *values_mem,
-						      u8 hmc_fn_id)
-{
-	enum i40iw_status_code status;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-	cqp_info = &cqp_request->info;
-	cqp_request->param = NULL;
-	cqp_info->in.u.query_fpm_values.cqp = dev->cqp;
-	cqp_info->in.u.query_fpm_values.fpm_values_pa = values_mem->pa;
-	cqp_info->in.u.query_fpm_values.fpm_values_va = values_mem->va;
-	cqp_info->in.u.query_fpm_values.hmc_fn_id = hmc_fn_id;
-	cqp_info->cqp_cmd = OP_QUERY_FPM_VALUES;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.query_fpm_values.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Query FPM fail");
-	return status;
-}
-
-/**
- * i40iw_cqp_commit_fpm_values_cmd - commit fpm values in hw
- * @dev: hardware control device structure
- * @values_mem: buffer with fpm values
- * @hmc_fn_id: function id for fpm
- */
-enum i40iw_status_code i40iw_cqp_commit_fpm_values_cmd(struct i40iw_sc_dev *dev,
-						       struct i40iw_dma_mem *values_mem,
-						       u8 hmc_fn_id)
-{
-	enum i40iw_status_code status;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-	cqp_info = &cqp_request->info;
-	cqp_request->param = NULL;
-	cqp_info->in.u.commit_fpm_values.cqp = dev->cqp;
-	cqp_info->in.u.commit_fpm_values.fpm_values_pa = values_mem->pa;
-	cqp_info->in.u.commit_fpm_values.fpm_values_va = values_mem->va;
-	cqp_info->in.u.commit_fpm_values.hmc_fn_id = hmc_fn_id;
-	cqp_info->cqp_cmd = OP_COMMIT_FPM_VALUES;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.commit_fpm_values.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Commit FPM fail");
-	return status;
-}
-
-/**
- * i40iw_vf_wait_vchnl_resp - wait for channel msg
- * @dev: function's device struct
- */
-enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev)
-{
-	struct i40iw_device *iwdev = dev->back_dev;
-	int timeout_ret;
-
-	i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s[%u] dev %p, iwdev %p\n",
-		    __func__, __LINE__, dev, iwdev);
-
-	atomic_set(&iwdev->vchnl_msgs, 2);
-	timeout_ret = wait_event_timeout(iwdev->vchnl_waitq,
-					 (atomic_read(&iwdev->vchnl_msgs) == 1),
-					 I40IW_VCHNL_EVENT_TIMEOUT);
-	atomic_dec(&iwdev->vchnl_msgs);
-	if (!timeout_ret) {
-		i40iw_pr_err("virt channel completion timeout = 0x%x\n", timeout_ret);
-		atomic_set(&iwdev->vchnl_msgs, 0);
-		dev->vchnl_up = false;
-		return I40IW_ERR_TIMEOUT;
-	}
-	wake_up(&dev->vf_reqs);
-	return 0;
-}
-
-/**
- * i40iw_cqp_cq_create_cmd - create a cq for the cqp
- * @dev: device pointer
- * @cq: pointer to created cq
- */
-enum i40iw_status_code i40iw_cqp_cq_create_cmd(struct i40iw_sc_dev *dev,
-					       struct i40iw_sc_cq *cq)
-{
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	enum i40iw_status_code status;
-
-	cqp_request = i40iw_get_cqp_request(iwcqp, true);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-
-	cqp_info = &cqp_request->info;
-	cqp_info->cqp_cmd = OP_CQ_CREATE;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.cq_create.cq = cq;
-	cqp_info->in.u.cq_create.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Create QP fail");
-
-	return status;
-}
-
-/**
- * i40iw_cqp_qp_create_cmd - create a qp for the cqp
- * @dev: device pointer
- * @qp: pointer to created qp
- */
-enum i40iw_status_code i40iw_cqp_qp_create_cmd(struct i40iw_sc_dev *dev,
-					       struct i40iw_sc_qp *qp)
-{
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	struct i40iw_create_qp_info *qp_info;
-	enum i40iw_status_code status;
-
-	cqp_request = i40iw_get_cqp_request(iwcqp, true);
-	if (!cqp_request)
-		return I40IW_ERR_NO_MEMORY;
-
-	cqp_info = &cqp_request->info;
-	qp_info = &cqp_request->info.in.u.qp_create.info;
-
-	memset(qp_info, 0, sizeof(*qp_info));
-
-	qp_info->cq_num_valid = true;
-	qp_info->next_iwarp_state = I40IW_QP_STATE_RTS;
-
-	cqp_info->cqp_cmd = OP_QP_CREATE;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.qp_create.qp = qp;
-	cqp_info->in.u.qp_create.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP QP create fail");
-	return status;
-}
-
-/**
- * i40iw_cqp_cq_destroy_cmd - destroy the cqp cq
- * @dev: device pointer
- * @cq: pointer to cq
- */
-void i40iw_cqp_cq_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq)
-{
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	i40iw_cq_wq_destroy(iwdev, cq);
-}
-
-/**
- * i40iw_cqp_qp_destroy_cmd - destroy the cqp
- * @dev: device pointer
- * @qp: pointer to qp
- */
-void i40iw_cqp_qp_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp)
-{
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	enum i40iw_status_code status;
-
-	cqp_request = i40iw_get_cqp_request(iwcqp, true);
-	if (!cqp_request)
-		return;
-
-	cqp_info = &cqp_request->info;
-	memset(cqp_info, 0, sizeof(*cqp_info));
-
-	cqp_info->cqp_cmd = OP_QP_DESTROY;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.qp_destroy.qp = qp;
-	cqp_info->in.u.qp_destroy.scratch = (uintptr_t)cqp_request;
-	cqp_info->in.u.qp_destroy.remove_hash_idx = true;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP QP_DESTROY fail");
-}
-
-
-/**
- * i40iw_ieq_mpa_crc_ae - generate AE for crc error
- * @dev: hardware control device structure
- * @qp: hardware control qp
- */
-void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp)
-{
-	struct i40iw_gen_ae_info info;
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	i40iw_debug(dev, I40IW_DEBUG_AEQ, "%s entered\n", __func__);
-	info.ae_code = I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR;
-	info.ae_source = I40IW_AE_SOURCE_RQ;
-	i40iw_gen_ae(iwdev, qp, &info, false);
-}
-
-/**
- * i40iw_init_hash_desc - initialize hash for crc calculation
- * @desc: cryption type
- */
-enum i40iw_status_code i40iw_init_hash_desc(struct shash_desc **desc)
-{
-	struct crypto_shash *tfm;
-	struct shash_desc *tdesc;
-
-	tfm = crypto_alloc_shash("crc32c", 0, 0);
-	if (IS_ERR(tfm))
-		return I40IW_ERR_MPA_CRC;
-
-	tdesc = kzalloc(sizeof(*tdesc) + crypto_shash_descsize(tfm),
-			GFP_KERNEL);
-	if (!tdesc) {
-		crypto_free_shash(tfm);
-		return I40IW_ERR_MPA_CRC;
-	}
-	tdesc->tfm = tfm;
-	*desc = tdesc;
-
-	return 0;
-}
-
-/**
- * i40iw_free_hash_desc - free hash desc
- * @desc: to be freed
- */
-void i40iw_free_hash_desc(struct shash_desc *desc)
-{
-	if (desc) {
-		crypto_free_shash(desc->tfm);
-		kfree(desc);
-	}
-}
-
-/**
- * i40iw_alloc_query_fpm_buf - allocate buffer for fpm
- * @dev: hardware control device structure
- * @mem: buffer ptr for fpm to be allocated
- * @return: memory allocation status
- */
-enum i40iw_status_code i40iw_alloc_query_fpm_buf(struct i40iw_sc_dev *dev,
-						 struct i40iw_dma_mem *mem)
-{
-	enum i40iw_status_code status;
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-
-	status = i40iw_obj_aligned_mem(iwdev, mem, I40IW_QUERY_FPM_BUF_SIZE,
-				       I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK);
-	return status;
-}
-
-/**
- * i40iw_ieq_check_mpacrc - check if mpa crc is OK
- * @desc: desc for hash
- * @addr: address of buffer for crc
- * @length: length of buffer
- * @value: value to be compared
- */
-enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc,
-					      void *addr,
-					      u32 length,
-					      u32 value)
-{
-	u32 crc = 0;
-	int ret;
-	enum i40iw_status_code ret_code = 0;
-
-	crypto_shash_init(desc);
-	ret = crypto_shash_update(desc, addr, length);
-	if (!ret)
-		crypto_shash_final(desc, (u8 *)&crc);
-	if (crc != value) {
-		i40iw_pr_err("mpa crc check fail\n");
-		ret_code = I40IW_ERR_MPA_CRC;
-	}
-	return ret_code;
-}
-
-/**
- * i40iw_ieq_get_qp - get qp based on quad in puda buffer
- * @dev: hardware control device structure
- * @buf: receive puda buffer on exception q
- */
-struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev,
-				     struct i40iw_puda_buf *buf)
-{
-	struct i40iw_device *iwdev = (struct i40iw_device *)dev->back_dev;
-	struct i40iw_qp *iwqp;
-	struct i40iw_cm_node *cm_node;
-	u32 loc_addr[4], rem_addr[4];
-	u16 loc_port, rem_port;
-	struct ipv6hdr *ip6h;
-	struct iphdr *iph = (struct iphdr *)buf->iph;
-	struct tcphdr *tcph = (struct tcphdr *)buf->tcph;
-
-	if (iph->version == 4) {
-		memset(loc_addr, 0, sizeof(loc_addr));
-		loc_addr[0] = ntohl(iph->daddr);
-		memset(rem_addr, 0, sizeof(rem_addr));
-		rem_addr[0] = ntohl(iph->saddr);
-	} else {
-		ip6h = (struct ipv6hdr *)buf->iph;
-		i40iw_copy_ip_ntohl(loc_addr, ip6h->daddr.in6_u.u6_addr32);
-		i40iw_copy_ip_ntohl(rem_addr, ip6h->saddr.in6_u.u6_addr32);
-	}
-	loc_port = ntohs(tcph->dest);
-	rem_port = ntohs(tcph->source);
-
-	cm_node = i40iw_find_node(&iwdev->cm_core, rem_port, rem_addr, loc_port,
-				  loc_addr, false, true);
-	if (!cm_node)
-		return NULL;
-	iwqp = cm_node->iwqp;
-	return &iwqp->sc_qp;
-}
-
-/**
- * i40iw_ieq_update_tcpip_info - update tcpip in the buffer
- * @buf: puda to update
- * @length: length of buffer
- * @seqnum: seq number for tcp
- */
-void i40iw_ieq_update_tcpip_info(struct i40iw_puda_buf *buf, u16 length, u32 seqnum)
-{
-	struct tcphdr *tcph;
-	struct iphdr *iph;
-	u16 iphlen;
-	u16 packetsize;
-	u8 *addr = (u8 *)buf->mem.va;
-
-	iphlen = (buf->ipv4) ? 20 : 40;
-	iph = (struct iphdr *)(addr + buf->maclen);
-	tcph = (struct tcphdr *)(addr + buf->maclen + iphlen);
-	packetsize = length + buf->tcphlen + iphlen;
-
-	iph->tot_len = htons(packetsize);
-	tcph->seq = htonl(seqnum);
-}
-
-/**
- * i40iw_puda_get_tcpip_info - get tcpip info from puda buffer
- * @info: to get information
- * @buf: puda buffer
- */
-enum i40iw_status_code i40iw_puda_get_tcpip_info(struct i40iw_puda_completion_info *info,
-						 struct i40iw_puda_buf *buf)
-{
-	struct iphdr *iph;
-	struct ipv6hdr *ip6h;
-	struct tcphdr *tcph;
-	u16 iphlen;
-	u16 pkt_len;
-	u8 *mem = (u8 *)buf->mem.va;
-	struct ethhdr *ethh = (struct ethhdr *)buf->mem.va;
-
-	if (ethh->h_proto == htons(0x8100)) {
-		info->vlan_valid = true;
-		buf->vlan_id = ntohs(((struct vlan_ethhdr *)ethh)->h_vlan_TCI) & VLAN_VID_MASK;
-	}
-	buf->maclen = (info->vlan_valid) ? 18 : 14;
-	iphlen = (info->l3proto) ? 40 : 20;
-	buf->ipv4 = (info->l3proto) ? false : true;
-	buf->iph = mem + buf->maclen;
-	iph = (struct iphdr *)buf->iph;
-
-	buf->tcph = buf->iph + iphlen;
-	tcph = (struct tcphdr *)buf->tcph;
-
-	if (buf->ipv4) {
-		pkt_len = ntohs(iph->tot_len);
-	} else {
-		ip6h = (struct ipv6hdr *)buf->iph;
-		pkt_len = ntohs(ip6h->payload_len) + iphlen;
-	}
-
-	buf->totallen = pkt_len + buf->maclen;
-
-	if (info->payload_len < buf->totallen) {
-		i40iw_pr_err("payload_len = 0x%x totallen expected0x%x\n",
-			     info->payload_len, buf->totallen);
-		return I40IW_ERR_INVALID_SIZE;
-	}
-
-	buf->tcphlen = (tcph->doff) << 2;
-	buf->datalen = pkt_len - iphlen - buf->tcphlen;
-	buf->data = (buf->datalen) ? buf->tcph + buf->tcphlen : NULL;
-	buf->hdrlen = buf->maclen + iphlen + buf->tcphlen;
-	buf->seqnum = ntohl(tcph->seq);
-	return 0;
-}
-
-/**
- * i40iw_hw_stats_timeout - Stats timer-handler which updates all HW stats
- * @t: Timer context containing pointer to the vsi structure
- */
-static void i40iw_hw_stats_timeout(struct timer_list *t)
-{
-	struct i40iw_vsi_pestat *pf_devstat = from_timer(pf_devstat, t,
-						       stats_timer);
-	struct i40iw_sc_vsi *sc_vsi = pf_devstat->vsi;
-	struct i40iw_sc_dev *pf_dev = sc_vsi->dev;
-	struct i40iw_vsi_pestat *vf_devstat = NULL;
-	u16 iw_vf_idx;
-	unsigned long flags;
-
-	/*PF*/
-	i40iw_hw_stats_read_all(pf_devstat, &pf_devstat->hw_stats);
-
-	for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; iw_vf_idx++) {
-		spin_lock_irqsave(&pf_devstat->lock, flags);
-		if (pf_dev->vf_dev[iw_vf_idx]) {
-			if (pf_dev->vf_dev[iw_vf_idx]->stats_initialized) {
-				vf_devstat = &pf_dev->vf_dev[iw_vf_idx]->pestat;
-				i40iw_hw_stats_read_all(vf_devstat, &vf_devstat->hw_stats);
-			}
-		}
-		spin_unlock_irqrestore(&pf_devstat->lock, flags);
-	}
-
-	mod_timer(&pf_devstat->stats_timer,
-		  jiffies + msecs_to_jiffies(STATS_TIMER_DELAY));
-}
-
-/**
- * i40iw_hw_stats_start_timer - Start periodic stats timer
- * @vsi: pointer to the vsi structure
- */
-void i40iw_hw_stats_start_timer(struct i40iw_sc_vsi *vsi)
-{
-	struct i40iw_vsi_pestat *devstat = vsi->pestat;
-
-	timer_setup(&devstat->stats_timer, i40iw_hw_stats_timeout, 0);
-	mod_timer(&devstat->stats_timer,
-		  jiffies + msecs_to_jiffies(STATS_TIMER_DELAY));
-}
-
-/**
- * i40iw_hw_stats_stop_timer - Delete periodic stats timer
- * @vsi: pointer to the vsi structure
- */
-void i40iw_hw_stats_stop_timer(struct i40iw_sc_vsi *vsi)
-{
-	struct i40iw_vsi_pestat *devstat = vsi->pestat;
-
-	del_timer_sync(&devstat->stats_timer);
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
deleted file mode 100644
index b876d722fcc8..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ /dev/null
@@ -1,2652 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/random.h>
-#include <linux/highmem.h>
-#include <linux/time.h>
-#include <linux/hugetlb.h>
-#include <linux/irq.h>
-#include <asm/byteorder.h>
-#include <net/ip.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/iw_cm.h>
-#include <rdma/ib_user_verbs.h>
-#include <rdma/ib_umem.h>
-#include <rdma/uverbs_ioctl.h>
-#include "i40iw.h"
-
-/**
- * i40iw_query_device - get device attributes
- * @ibdev: device pointer from stack
- * @props: returning device attributes
- * @udata: user data
- */
-static int i40iw_query_device(struct ib_device *ibdev,
-			      struct ib_device_attr *props,
-			      struct ib_udata *udata)
-{
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
-
-	if (udata->inlen || udata->outlen)
-		return -EINVAL;
-	memset(props, 0, sizeof(*props));
-	ether_addr_copy((u8 *)&props->sys_image_guid, iwdev->netdev->dev_addr);
-	props->fw_ver = i40iw_fw_major_ver(&iwdev->sc_dev) << 32 |
-			i40iw_fw_minor_ver(&iwdev->sc_dev);
-	props->device_cap_flags = iwdev->device_cap_flags;
-	props->vendor_id = iwdev->ldev->pcidev->vendor;
-	props->vendor_part_id = iwdev->ldev->pcidev->device;
-	props->hw_ver = (u32)iwdev->sc_dev.hw_rev;
-	props->max_mr_size = I40IW_MAX_OUTBOUND_MESSAGE_SIZE;
-	props->max_qp = iwdev->max_qp - iwdev->used_qps;
-	props->max_qp_wr = I40IW_MAX_QP_WRS;
-	props->max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
-	props->max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
-	props->max_cq = iwdev->max_cq - iwdev->used_cqs;
-	props->max_cqe = iwdev->max_cqe;
-	props->max_mr = iwdev->max_mr - iwdev->used_mrs;
-	props->max_pd = iwdev->max_pd - iwdev->used_pds;
-	props->max_sge_rd = I40IW_MAX_SGE_RD;
-	props->max_qp_rd_atom = I40IW_MAX_IRD_SIZE;
-	props->max_qp_init_rd_atom = props->max_qp_rd_atom;
-	props->atomic_cap = IB_ATOMIC_NONE;
-	props->max_fast_reg_page_list_len = I40IW_MAX_PAGES_PER_FMR;
-	return 0;
-}
-
-/**
- * i40iw_query_port - get port attrubutes
- * @ibdev: device pointer from stack
- * @port: port number for query
- * @props: returning device attributes
- */
-static int i40iw_query_port(struct ib_device *ibdev,
-			    u32 port,
-			    struct ib_port_attr *props)
-{
-	props->lid = 1;
-	props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
-		IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
-	props->gid_tbl_len = 1;
-	props->active_width = IB_WIDTH_4X;
-	props->active_speed = 1;
-	props->max_msg_sz = I40IW_MAX_OUTBOUND_MESSAGE_SIZE;
-	return 0;
-}
-
-/**
- * i40iw_alloc_ucontext - Allocate the user context data structure
- * @uctx: Uverbs context pointer from stack
- * @udata: user data
- *
- * This keeps track of all objects associated with a particular
- * user-mode client.
- */
-static int i40iw_alloc_ucontext(struct ib_ucontext *uctx,
-				struct ib_udata *udata)
-{
-	struct ib_device *ibdev = uctx->device;
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
-	struct i40iw_alloc_ucontext_req req;
-	struct i40iw_alloc_ucontext_resp uresp = {};
-	struct i40iw_ucontext *ucontext = to_ucontext(uctx);
-
-	if (ib_copy_from_udata(&req, udata, sizeof(req)))
-		return -EINVAL;
-
-	if (req.userspace_ver < 4 || req.userspace_ver > I40IW_ABI_VER) {
-		i40iw_pr_err("Unsupported provider library version %u.\n", req.userspace_ver);
-		return -EINVAL;
-	}
-
-	uresp.max_qps = iwdev->max_qp;
-	uresp.max_pds = iwdev->max_pd;
-	uresp.wq_size = iwdev->max_qp_wr * 2;
-	uresp.kernel_ver = req.userspace_ver;
-
-	ucontext->iwdev = iwdev;
-	ucontext->abi_ver = req.userspace_ver;
-
-	if (ib_copy_to_udata(udata, &uresp, sizeof(uresp)))
-		return -EFAULT;
-
-	INIT_LIST_HEAD(&ucontext->cq_reg_mem_list);
-	spin_lock_init(&ucontext->cq_reg_mem_list_lock);
-	INIT_LIST_HEAD(&ucontext->qp_reg_mem_list);
-	spin_lock_init(&ucontext->qp_reg_mem_list_lock);
-
-	return 0;
-}
-
-/**
- * i40iw_dealloc_ucontext - deallocate the user context data structure
- * @context: user context created during alloc
- */
-static void i40iw_dealloc_ucontext(struct ib_ucontext *context)
-{
-	return;
-}
-
-/**
- * i40iw_mmap - user memory map
- * @context: context created during alloc
- * @vma: kernel info for user memory map
- */
-static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-	struct i40iw_ucontext *ucontext = to_ucontext(context);
-	u64 dbaddr;
-
-	if (vma->vm_pgoff || vma->vm_end - vma->vm_start != PAGE_SIZE)
-		return -EINVAL;
-
-	dbaddr = I40IW_DB_ADDR_OFFSET + pci_resource_start(ucontext->iwdev->ldev->pcidev, 0);
-
-	return rdma_user_mmap_io(context, vma, dbaddr >> PAGE_SHIFT, PAGE_SIZE,
-				 pgprot_noncached(vma->vm_page_prot), NULL);
-}
-
-/**
- * i40iw_alloc_pd - allocate protection domain
- * @pd: PD pointer
- * @udata: user data
- */
-static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
-{
-	struct i40iw_pd *iwpd = to_iwpd(pd);
-	struct i40iw_device *iwdev = to_iwdev(pd->device);
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_alloc_pd_resp uresp;
-	struct i40iw_sc_pd *sc_pd;
-	u32 pd_id = 0;
-	int err;
-
-	if (iwdev->closing)
-		return -ENODEV;
-
-	err = i40iw_alloc_resource(iwdev, iwdev->allocated_pds,
-				   iwdev->max_pd, &pd_id, &iwdev->next_pd);
-	if (err) {
-		i40iw_pr_err("alloc resource failed\n");
-		return err;
-	}
-
-	sc_pd = &iwpd->sc_pd;
-
-	if (udata) {
-		struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
-			udata, struct i40iw_ucontext, ibucontext);
-		dev->iw_pd_ops->pd_init(dev, sc_pd, pd_id, ucontext->abi_ver);
-		memset(&uresp, 0, sizeof(uresp));
-		uresp.pd_id = pd_id;
-		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
-			err = -EFAULT;
-			goto error;
-		}
-	} else {
-		dev->iw_pd_ops->pd_init(dev, sc_pd, pd_id, -1);
-	}
-
-	i40iw_add_pdusecount(iwpd);
-	return 0;
-
-error:
-	i40iw_free_resource(iwdev, iwdev->allocated_pds, pd_id);
-	return err;
-}
-
-/**
- * i40iw_dealloc_pd - deallocate pd
- * @ibpd: ptr of pd to be deallocated
- * @udata: user data or null for kernel object
- */
-static int i40iw_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
-{
-	struct i40iw_pd *iwpd = to_iwpd(ibpd);
-	struct i40iw_device *iwdev = to_iwdev(ibpd->device);
-
-	i40iw_rem_pdusecount(iwpd, iwdev);
-	return 0;
-}
-
-/**
- * i40iw_get_pbl - Retrieve pbl from a list given a virtual
- * address
- * @va: user virtual address
- * @pbl_list: pbl list to search in (QP's or CQ's)
- */
-static struct i40iw_pbl *i40iw_get_pbl(unsigned long va,
-				       struct list_head *pbl_list)
-{
-	struct i40iw_pbl *iwpbl;
-
-	list_for_each_entry(iwpbl, pbl_list, list) {
-		if (iwpbl->user_base == va) {
-			iwpbl->on_list = false;
-			list_del(&iwpbl->list);
-			return iwpbl;
-		}
-	}
-	return NULL;
-}
-
-/**
- * i40iw_free_qp_resources - free up memory resources for qp
- * @iwqp: qp ptr (user or kernel)
- */
-void i40iw_free_qp_resources(struct i40iw_qp *iwqp)
-{
-	struct i40iw_pbl *iwpbl = &iwqp->iwpbl;
-	struct i40iw_device *iwdev = iwqp->iwdev;
-	u32 qp_num = iwqp->ibqp.qp_num;
-
-	i40iw_ieq_cleanup_qp(iwdev->vsi.ieq, &iwqp->sc_qp);
-	if (qp_num)
-		i40iw_free_resource(iwdev, iwdev->allocated_qps, qp_num);
-	if (iwpbl->pbl_allocated)
-		i40iw_free_pble(iwdev->pble_rsrc, &iwpbl->pble_alloc);
-	i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->q2_ctx_mem);
-	i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->kqp.dma_mem);
-	kfree(iwqp->kqp.wrid_mem);
-	iwqp->kqp.wrid_mem = NULL;
-	kfree(iwqp);
-}
-
-/**
- * i40iw_clean_cqes - clean cq entries for qp
- * @iwqp: qp ptr (user or kernel)
- * @iwcq: cq ptr
- */
-static void i40iw_clean_cqes(struct i40iw_qp *iwqp, struct i40iw_cq *iwcq)
-{
-	struct i40iw_cq_uk *ukcq = &iwcq->sc_cq.cq_uk;
-
-	ukcq->ops.iw_cq_clean(&iwqp->sc_qp.qp_uk, ukcq);
-}
-
-/**
- * i40iw_destroy_qp - destroy qp
- * @ibqp: qp's ib pointer also to get to device's qp address
- * @udata: user data
- */
-static int i40iw_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
-{
-	struct i40iw_qp *iwqp = to_iwqp(ibqp);
-	struct ib_qp_attr attr;
-	struct i40iw_device *iwdev = iwqp->iwdev;
-
-	memset(&attr, 0, sizeof(attr));
-
-	iwqp->destroyed = 1;
-
-	if (iwqp->ibqp_state >= IB_QPS_INIT && iwqp->ibqp_state < IB_QPS_RTS)
-		i40iw_next_iw_state(iwqp, I40IW_QP_STATE_ERROR, 0, 0, 0);
-
-	if (!iwqp->user_mode) {
-		if (iwqp->iwscq) {
-			i40iw_clean_cqes(iwqp, iwqp->iwscq);
-			if (iwqp->iwrcq != iwqp->iwscq)
-				i40iw_clean_cqes(iwqp, iwqp->iwrcq);
-		}
-	}
-
-	attr.qp_state = IB_QPS_ERR;
-	i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL);
-	i40iw_qp_rem_ref(&iwqp->ibqp);
-	wait_for_completion(&iwqp->free_qp);
-	i40iw_cqp_qp_destroy_cmd(&iwdev->sc_dev, &iwqp->sc_qp);
-	i40iw_rem_pdusecount(iwqp->iwpd, iwdev);
-	i40iw_free_qp_resources(iwqp);
-	i40iw_rem_devusecount(iwdev);
-
-	return 0;
-}
-
-/**
- * i40iw_setup_virt_qp - setup for allocation of virtual qp
- * @iwdev: iwarp device
- * @iwqp: qp ptr
- * @init_info: initialize info to return
- */
-static int i40iw_setup_virt_qp(struct i40iw_device *iwdev,
-			       struct i40iw_qp *iwqp,
-			       struct i40iw_qp_init_info *init_info)
-{
-	struct i40iw_pbl *iwpbl = &iwqp->iwpbl;
-	struct i40iw_qp_mr *qpmr = &iwpbl->qp_mr;
-
-	iwqp->page = qpmr->sq_page;
-	init_info->shadow_area_pa = cpu_to_le64(qpmr->shadow);
-	if (iwpbl->pbl_allocated) {
-		init_info->virtual_map = true;
-		init_info->sq_pa = qpmr->sq_pbl.idx;
-		init_info->rq_pa = qpmr->rq_pbl.idx;
-	} else {
-		init_info->sq_pa = qpmr->sq_pbl.addr;
-		init_info->rq_pa = qpmr->rq_pbl.addr;
-	}
-	return 0;
-}
-
-/**
- * i40iw_setup_kmode_qp - setup initialization for kernel mode qp
- * @iwdev: iwarp device
- * @iwqp: qp ptr (user or kernel)
- * @info: initialize info to return
- */
-static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev,
-				struct i40iw_qp *iwqp,
-				struct i40iw_qp_init_info *info)
-{
-	struct i40iw_dma_mem *mem = &iwqp->kqp.dma_mem;
-	u32 sqdepth, rqdepth;
-	u8 sqshift;
-	u32 size;
-	enum i40iw_status_code status;
-	struct i40iw_qp_uk_init_info *ukinfo = &info->qp_uk_init_info;
-
-	i40iw_get_wqe_shift(ukinfo->max_sq_frag_cnt, ukinfo->max_inline_data, &sqshift);
-	status = i40iw_get_sqdepth(ukinfo->sq_size, sqshift, &sqdepth);
-	if (status)
-		return -ENOMEM;
-
-	status = i40iw_get_rqdepth(ukinfo->rq_size, I40IW_MAX_RQ_WQE_SHIFT, &rqdepth);
-	if (status)
-		return -ENOMEM;
-
-	size = sqdepth * sizeof(struct i40iw_sq_uk_wr_trk_info) + (rqdepth << 3);
-	iwqp->kqp.wrid_mem = kzalloc(size, GFP_KERNEL);
-
-	ukinfo->sq_wrtrk_array = (struct i40iw_sq_uk_wr_trk_info *)iwqp->kqp.wrid_mem;
-	if (!ukinfo->sq_wrtrk_array)
-		return -ENOMEM;
-
-	ukinfo->rq_wrid_array = (u64 *)&ukinfo->sq_wrtrk_array[sqdepth];
-
-	size = (sqdepth + rqdepth) * I40IW_QP_WQE_MIN_SIZE;
-	size += (I40IW_SHADOW_AREA_SIZE << 3);
-
-	status = i40iw_allocate_dma_mem(iwdev->sc_dev.hw, mem, size, 256);
-	if (status) {
-		kfree(ukinfo->sq_wrtrk_array);
-		ukinfo->sq_wrtrk_array = NULL;
-		return -ENOMEM;
-	}
-
-	ukinfo->sq = mem->va;
-	info->sq_pa = mem->pa;
-
-	ukinfo->rq = &ukinfo->sq[sqdepth];
-	info->rq_pa = info->sq_pa + (sqdepth * I40IW_QP_WQE_MIN_SIZE);
-
-	ukinfo->shadow_area = ukinfo->rq[rqdepth].elem;
-	info->shadow_area_pa = info->rq_pa + (rqdepth * I40IW_QP_WQE_MIN_SIZE);
-
-	ukinfo->sq_size = sqdepth >> sqshift;
-	ukinfo->rq_size = rqdepth >> I40IW_MAX_RQ_WQE_SHIFT;
-	ukinfo->qp_id = iwqp->ibqp.qp_num;
-	return 0;
-}
-
-/**
- * i40iw_create_qp - create qp
- * @ibpd: ptr of pd
- * @init_attr: attributes for qp
- * @udata: user data for create qp
- */
-static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
-				     struct ib_qp_init_attr *init_attr,
-				     struct ib_udata *udata)
-{
-	struct i40iw_pd *iwpd = to_iwpd(ibpd);
-	struct i40iw_device *iwdev = to_iwdev(ibpd->device);
-	struct i40iw_cqp *iwcqp = &iwdev->cqp;
-	struct i40iw_qp *iwqp;
-	struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
-		udata, struct i40iw_ucontext, ibucontext);
-	struct i40iw_create_qp_req req;
-	struct i40iw_create_qp_resp uresp;
-	u32 qp_num = 0;
-	enum i40iw_status_code ret;
-	int err_code;
-	int sq_size;
-	int rq_size;
-	struct i40iw_sc_qp *qp;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_qp_init_info init_info;
-	struct i40iw_create_qp_info *qp_info;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-
-	struct i40iw_qp_host_ctx_info *ctx_info;
-	struct i40iwarp_offload_info *iwarp_info;
-	unsigned long flags;
-
-	if (iwdev->closing)
-		return ERR_PTR(-ENODEV);
-
-	if (init_attr->create_flags)
-		return ERR_PTR(-EOPNOTSUPP);
-	if (init_attr->cap.max_inline_data > I40IW_MAX_INLINE_DATA_SIZE)
-		init_attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE;
-
-	if (init_attr->cap.max_send_sge > I40IW_MAX_WQ_FRAGMENT_COUNT)
-		init_attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
-
-	if (init_attr->cap.max_recv_sge > I40IW_MAX_WQ_FRAGMENT_COUNT)
-		init_attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
-
-	memset(&init_info, 0, sizeof(init_info));
-
-	sq_size = init_attr->cap.max_send_wr;
-	rq_size = init_attr->cap.max_recv_wr;
-
-	init_info.vsi = &iwdev->vsi;
-	init_info.qp_uk_init_info.sq_size = sq_size;
-	init_info.qp_uk_init_info.rq_size = rq_size;
-	init_info.qp_uk_init_info.max_sq_frag_cnt = init_attr->cap.max_send_sge;
-	init_info.qp_uk_init_info.max_rq_frag_cnt = init_attr->cap.max_recv_sge;
-	init_info.qp_uk_init_info.max_inline_data = init_attr->cap.max_inline_data;
-
-	iwqp = kzalloc(sizeof(*iwqp), GFP_KERNEL);
-	if (!iwqp)
-		return ERR_PTR(-ENOMEM);
-
-	qp = &iwqp->sc_qp;
-	qp->back_qp = (void *)iwqp;
-	iwqp->iwdev = iwdev;
-	iwqp->ctx_info.iwarp_info = &iwqp->iwarp_info;
-
-	if (i40iw_allocate_dma_mem(dev->hw,
-				   &iwqp->q2_ctx_mem,
-				   I40IW_Q2_BUFFER_SIZE + I40IW_QP_CTX_SIZE,
-				   256)) {
-		i40iw_pr_err("dma_mem failed\n");
-		err_code = -ENOMEM;
-		goto error;
-	}
-
-	init_info.q2 = iwqp->q2_ctx_mem.va;
-	init_info.q2_pa = iwqp->q2_ctx_mem.pa;
-
-	init_info.host_ctx = (void *)init_info.q2 + I40IW_Q2_BUFFER_SIZE;
-	init_info.host_ctx_pa = init_info.q2_pa + I40IW_Q2_BUFFER_SIZE;
-
-	err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_qps, iwdev->max_qp,
-					&qp_num, &iwdev->next_qp);
-	if (err_code) {
-		i40iw_pr_err("qp resource\n");
-		goto error;
-	}
-
-	iwqp->iwpd = iwpd;
-	iwqp->ibqp.qp_num = qp_num;
-	qp = &iwqp->sc_qp;
-	iwqp->iwscq = to_iwcq(init_attr->send_cq);
-	iwqp->iwrcq = to_iwcq(init_attr->recv_cq);
-
-	iwqp->host_ctx.va = init_info.host_ctx;
-	iwqp->host_ctx.pa = init_info.host_ctx_pa;
-	iwqp->host_ctx.size = I40IW_QP_CTX_SIZE;
-
-	init_info.pd = &iwpd->sc_pd;
-	init_info.qp_uk_init_info.qp_id = iwqp->ibqp.qp_num;
-	iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp;
-
-	if (init_attr->qp_type != IB_QPT_RC) {
-		err_code = -EOPNOTSUPP;
-		goto error;
-	}
-	if (udata) {
-		err_code = ib_copy_from_udata(&req, udata, sizeof(req));
-		if (err_code) {
-			i40iw_pr_err("ib_copy_from_data\n");
-			goto error;
-		}
-		iwqp->ctx_info.qp_compl_ctx = req.user_compl_ctx;
-		iwqp->user_mode = 1;
-
-		if (req.user_wqe_buffers) {
-			struct i40iw_pbl *iwpbl;
-
-			spin_lock_irqsave(
-			    &ucontext->qp_reg_mem_list_lock, flags);
-			iwpbl = i40iw_get_pbl(
-			    (unsigned long)req.user_wqe_buffers,
-			    &ucontext->qp_reg_mem_list);
-			spin_unlock_irqrestore(
-			    &ucontext->qp_reg_mem_list_lock, flags);
-
-			if (!iwpbl) {
-				err_code = -ENODATA;
-				i40iw_pr_err("no pbl info\n");
-				goto error;
-			}
-			memcpy(&iwqp->iwpbl, iwpbl, sizeof(iwqp->iwpbl));
-		}
-		err_code = i40iw_setup_virt_qp(iwdev, iwqp, &init_info);
-	} else {
-		err_code = i40iw_setup_kmode_qp(iwdev, iwqp, &init_info);
-	}
-
-	if (err_code) {
-		i40iw_pr_err("setup qp failed\n");
-		goto error;
-	}
-
-	init_info.type = I40IW_QP_TYPE_IWARP;
-	ret = dev->iw_priv_qp_ops->qp_init(qp, &init_info);
-	if (ret) {
-		err_code = -EPROTO;
-		i40iw_pr_err("qp_init fail\n");
-		goto error;
-	}
-	ctx_info = &iwqp->ctx_info;
-	iwarp_info = &iwqp->iwarp_info;
-	iwarp_info->rd_enable = true;
-	iwarp_info->wr_rdresp_en = true;
-	if (!iwqp->user_mode) {
-		iwarp_info->fast_reg_en = true;
-		iwarp_info->priv_mode_en = true;
-	}
-	iwarp_info->ddp_ver = 1;
-	iwarp_info->rdmap_ver = 1;
-
-	ctx_info->iwarp_info_valid = true;
-	ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id;
-	ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id;
-	ret = dev->iw_priv_qp_ops->qp_setctx(&iwqp->sc_qp,
-					     (u64 *)iwqp->host_ctx.va,
-					     ctx_info);
-	ctx_info->iwarp_info_valid = false;
-	cqp_request = i40iw_get_cqp_request(iwcqp, true);
-	if (!cqp_request) {
-		err_code = -ENOMEM;
-		goto error;
-	}
-	cqp_info = &cqp_request->info;
-	qp_info = &cqp_request->info.in.u.qp_create.info;
-
-	memset(qp_info, 0, sizeof(*qp_info));
-
-	qp_info->cq_num_valid = true;
-	qp_info->next_iwarp_state = I40IW_QP_STATE_IDLE;
-
-	cqp_info->cqp_cmd = OP_QP_CREATE;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.qp_create.qp = qp;
-	cqp_info->in.u.qp_create.scratch = (uintptr_t)cqp_request;
-	ret = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (ret) {
-		i40iw_pr_err("CQP-OP QP create fail");
-		err_code = -EACCES;
-		goto error;
-	}
-
-	refcount_set(&iwqp->refcount, 1);
-	spin_lock_init(&iwqp->lock);
-	iwqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) ? 1 : 0;
-	iwdev->qp_table[qp_num] = iwqp;
-	i40iw_add_pdusecount(iwqp->iwpd);
-	i40iw_add_devusecount(iwdev);
-	if (udata) {
-		memset(&uresp, 0, sizeof(uresp));
-		uresp.actual_sq_size = sq_size;
-		uresp.actual_rq_size = rq_size;
-		uresp.qp_id = qp_num;
-		uresp.push_idx = I40IW_INVALID_PUSH_PAGE_INDEX;
-		err_code = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
-		if (err_code) {
-			i40iw_pr_err("copy_to_udata failed\n");
-			i40iw_destroy_qp(&iwqp->ibqp, udata);
-			/* let the completion of the qp destroy free the qp */
-			return ERR_PTR(err_code);
-		}
-	}
-	init_completion(&iwqp->sq_drained);
-	init_completion(&iwqp->rq_drained);
-	init_completion(&iwqp->free_qp);
-
-	return &iwqp->ibqp;
-error:
-	i40iw_free_qp_resources(iwqp);
-	return ERR_PTR(err_code);
-}
-
-/**
- * i40iw_query_qp - query qp attributes
- * @ibqp: qp pointer
- * @attr: attributes pointer
- * @attr_mask: Not used
- * @init_attr: qp attributes to return
- */
-static int i40iw_query_qp(struct ib_qp *ibqp,
-			  struct ib_qp_attr *attr,
-			  int attr_mask,
-			  struct ib_qp_init_attr *init_attr)
-{
-	struct i40iw_qp *iwqp = to_iwqp(ibqp);
-	struct i40iw_sc_qp *qp = &iwqp->sc_qp;
-
-	attr->qp_state = iwqp->ibqp_state;
-	attr->cur_qp_state = attr->qp_state;
-	attr->qp_access_flags = 0;
-	attr->cap.max_send_wr = qp->qp_uk.sq_size;
-	attr->cap.max_recv_wr = qp->qp_uk.rq_size;
-	attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE;
-	attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
-	attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
-	attr->port_num = 1;
-	init_attr->event_handler = iwqp->ibqp.event_handler;
-	init_attr->qp_context = iwqp->ibqp.qp_context;
-	init_attr->send_cq = iwqp->ibqp.send_cq;
-	init_attr->recv_cq = iwqp->ibqp.recv_cq;
-	init_attr->srq = iwqp->ibqp.srq;
-	init_attr->cap = attr->cap;
-	init_attr->port_num = 1;
-	return 0;
-}
-
-/**
- * i40iw_hw_modify_qp - setup cqp for modify qp
- * @iwdev: iwarp device
- * @iwqp: qp ptr (user or kernel)
- * @info: info for modify qp
- * @wait: flag to wait or not for modify qp completion
- */
-void i40iw_hw_modify_qp(struct i40iw_device *iwdev, struct i40iw_qp *iwqp,
-			struct i40iw_modify_qp_info *info, bool wait)
-{
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	struct i40iw_modify_qp_info *m_info;
-	struct i40iw_gen_ae_info ae_info;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, wait);
-	if (!cqp_request)
-		return;
-
-	cqp_info = &cqp_request->info;
-	m_info = &cqp_info->in.u.qp_modify.info;
-	memcpy(m_info, info, sizeof(*m_info));
-	cqp_info->cqp_cmd = OP_QP_MODIFY;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.qp_modify.qp = &iwqp->sc_qp;
-	cqp_info->in.u.qp_modify.scratch = (uintptr_t)cqp_request;
-	if (!i40iw_handle_cqp_op(iwdev, cqp_request))
-		return;
-
-	switch (m_info->next_iwarp_state) {
-	case I40IW_QP_STATE_RTS:
-		if (iwqp->iwarp_state == I40IW_QP_STATE_IDLE)
-			i40iw_send_reset(iwqp->cm_node);
-		fallthrough;
-	case I40IW_QP_STATE_IDLE:
-	case I40IW_QP_STATE_TERMINATE:
-	case I40IW_QP_STATE_CLOSING:
-		ae_info.ae_code = I40IW_AE_BAD_CLOSE;
-		ae_info.ae_source = 0;
-		i40iw_gen_ae(iwdev, &iwqp->sc_qp, &ae_info, false);
-		break;
-	case I40IW_QP_STATE_ERROR:
-	default:
-		break;
-	}
-}
-
-/**
- * i40iw_modify_qp - modify qp request
- * @ibqp: qp's pointer for modify
- * @attr: access attributes
- * @attr_mask: state mask
- * @udata: user data
- */
-int i40iw_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		    int attr_mask, struct ib_udata *udata)
-{
-	struct i40iw_qp *iwqp = to_iwqp(ibqp);
-	struct i40iw_device *iwdev = iwqp->iwdev;
-	struct i40iw_qp_host_ctx_info *ctx_info;
-	struct i40iwarp_offload_info *iwarp_info;
-	struct i40iw_modify_qp_info info;
-	u8 issue_modify_qp = 0;
-	u8 dont_wait = 0;
-	u32 err;
-	unsigned long flags;
-
-	if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
-		return -EOPNOTSUPP;
-
-	memset(&info, 0, sizeof(info));
-	ctx_info = &iwqp->ctx_info;
-	iwarp_info = &iwqp->iwarp_info;
-
-	spin_lock_irqsave(&iwqp->lock, flags);
-
-	if (attr_mask & IB_QP_STATE) {
-		if (iwdev->closing && attr->qp_state != IB_QPS_ERR) {
-			err = -EINVAL;
-			goto exit;
-		}
-
-		switch (attr->qp_state) {
-		case IB_QPS_INIT:
-		case IB_QPS_RTR:
-			if (iwqp->iwarp_state > (u32)I40IW_QP_STATE_IDLE) {
-				err = -EINVAL;
-				goto exit;
-			}
-			if (iwqp->iwarp_state == I40IW_QP_STATE_INVALID) {
-				info.next_iwarp_state = I40IW_QP_STATE_IDLE;
-				issue_modify_qp = 1;
-			}
-			break;
-		case IB_QPS_RTS:
-			if ((iwqp->iwarp_state > (u32)I40IW_QP_STATE_RTS) ||
-			    (!iwqp->cm_id)) {
-				err = -EINVAL;
-				goto exit;
-			}
-
-			issue_modify_qp = 1;
-			iwqp->hw_tcp_state = I40IW_TCP_STATE_ESTABLISHED;
-			iwqp->hte_added = 1;
-			info.next_iwarp_state = I40IW_QP_STATE_RTS;
-			info.tcp_ctx_valid = true;
-			info.ord_valid = true;
-			info.arp_cache_idx_valid = true;
-			info.cq_num_valid = true;
-			break;
-		case IB_QPS_SQD:
-			if (iwqp->hw_iwarp_state > (u32)I40IW_QP_STATE_RTS) {
-				err = 0;
-				goto exit;
-			}
-			if ((iwqp->iwarp_state == (u32)I40IW_QP_STATE_CLOSING) ||
-			    (iwqp->iwarp_state < (u32)I40IW_QP_STATE_RTS)) {
-				err = 0;
-				goto exit;
-			}
-			if (iwqp->iwarp_state > (u32)I40IW_QP_STATE_CLOSING) {
-				err = -EINVAL;
-				goto exit;
-			}
-			info.next_iwarp_state = I40IW_QP_STATE_CLOSING;
-			issue_modify_qp = 1;
-			break;
-		case IB_QPS_SQE:
-			if (iwqp->iwarp_state >= (u32)I40IW_QP_STATE_TERMINATE) {
-				err = -EINVAL;
-				goto exit;
-			}
-			info.next_iwarp_state = I40IW_QP_STATE_TERMINATE;
-			issue_modify_qp = 1;
-			break;
-		case IB_QPS_ERR:
-		case IB_QPS_RESET:
-			if (iwqp->iwarp_state == (u32)I40IW_QP_STATE_ERROR) {
-				err = -EINVAL;
-				goto exit;
-			}
-			if (iwqp->sc_qp.term_flags)
-				i40iw_terminate_del_timer(&iwqp->sc_qp);
-			info.next_iwarp_state = I40IW_QP_STATE_ERROR;
-			if ((iwqp->hw_tcp_state > I40IW_TCP_STATE_CLOSED) &&
-			    iwdev->iw_status &&
-			    (iwqp->hw_tcp_state != I40IW_TCP_STATE_TIME_WAIT))
-				info.reset_tcp_conn = true;
-			else
-				dont_wait = 1;
-			issue_modify_qp = 1;
-			info.next_iwarp_state = I40IW_QP_STATE_ERROR;
-			break;
-		default:
-			err = -EINVAL;
-			goto exit;
-		}
-
-		iwqp->ibqp_state = attr->qp_state;
-
-	}
-	if (attr_mask & IB_QP_ACCESS_FLAGS) {
-		ctx_info->iwarp_info_valid = true;
-		if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE)
-			iwarp_info->wr_rdresp_en = true;
-		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
-			iwarp_info->wr_rdresp_en = true;
-		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
-			iwarp_info->rd_enable = true;
-		if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
-			iwarp_info->bind_en = true;
-
-		if (iwqp->user_mode) {
-			iwarp_info->rd_enable = true;
-			iwarp_info->wr_rdresp_en = true;
-			iwarp_info->priv_mode_en = false;
-		}
-	}
-
-	if (ctx_info->iwarp_info_valid) {
-		struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-		int ret;
-
-		ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id;
-		ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id;
-		ret = dev->iw_priv_qp_ops->qp_setctx(&iwqp->sc_qp,
-						     (u64 *)iwqp->host_ctx.va,
-						     ctx_info);
-		if (ret) {
-			i40iw_pr_err("setting QP context\n");
-			err = -EINVAL;
-			goto exit;
-		}
-	}
-
-	spin_unlock_irqrestore(&iwqp->lock, flags);
-
-	if (issue_modify_qp) {
-		i40iw_hw_modify_qp(iwdev, iwqp, &info, true);
-
-		spin_lock_irqsave(&iwqp->lock, flags);
-		iwqp->iwarp_state = info.next_iwarp_state;
-		spin_unlock_irqrestore(&iwqp->lock, flags);
-	}
-
-	if (issue_modify_qp && (iwqp->ibqp_state > IB_QPS_RTS)) {
-		if (dont_wait) {
-			if (iwqp->cm_id && iwqp->hw_tcp_state) {
-				spin_lock_irqsave(&iwqp->lock, flags);
-				iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSED;
-				iwqp->last_aeq = I40IW_AE_RESET_SENT;
-				spin_unlock_irqrestore(&iwqp->lock, flags);
-				i40iw_cm_disconn(iwqp);
-			}
-		} else {
-			spin_lock_irqsave(&iwqp->lock, flags);
-			if (iwqp->cm_id) {
-				if (atomic_inc_return(&iwqp->close_timer_started) == 1) {
-					iwqp->cm_id->add_ref(iwqp->cm_id);
-					i40iw_schedule_cm_timer(iwqp->cm_node,
-								(struct i40iw_puda_buf *)iwqp,
-								 I40IW_TIMER_TYPE_CLOSE, 1, 0);
-				}
-			}
-			spin_unlock_irqrestore(&iwqp->lock, flags);
-		}
-	}
-	return 0;
-exit:
-	spin_unlock_irqrestore(&iwqp->lock, flags);
-	return err;
-}
-
-/**
- * cq_free_resources - free up recources for cq
- * @iwdev: iwarp device
- * @iwcq: cq ptr
- */
-static void cq_free_resources(struct i40iw_device *iwdev, struct i40iw_cq *iwcq)
-{
-	struct i40iw_sc_cq *cq = &iwcq->sc_cq;
-
-	if (!iwcq->user_mode)
-		i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwcq->kmem);
-	i40iw_free_resource(iwdev, iwdev->allocated_cqs, cq->cq_uk.cq_id);
-}
-
-/**
- * i40iw_cq_wq_destroy - send cq destroy cqp
- * @iwdev: iwarp device
- * @cq: hardware control cq
- */
-void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq)
-{
-	enum i40iw_status_code status;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
-	if (!cqp_request)
-		return;
-
-	cqp_info = &cqp_request->info;
-
-	cqp_info->cqp_cmd = OP_CQ_DESTROY;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.cq_destroy.cq = cq;
-	cqp_info->in.u.cq_destroy.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP Destroy QP fail");
-}
-
-/**
- * i40iw_destroy_cq - destroy cq
- * @ib_cq: cq pointer
- * @udata: user data or NULL for kernel object
- */
-static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
-{
-	struct i40iw_cq *iwcq;
-	struct i40iw_device *iwdev;
-	struct i40iw_sc_cq *cq;
-
-	iwcq = to_iwcq(ib_cq);
-	iwdev = to_iwdev(ib_cq->device);
-	cq = &iwcq->sc_cq;
-	i40iw_cq_wq_destroy(iwdev, cq);
-	cq_free_resources(iwdev, iwcq);
-	i40iw_rem_devusecount(iwdev);
-	return 0;
-}
-
-/**
- * i40iw_create_cq - create cq
- * @ibcq: CQ allocated
- * @attr: attributes for cq
- * @udata: user data
- */
-static int i40iw_create_cq(struct ib_cq *ibcq,
-			   const struct ib_cq_init_attr *attr,
-			   struct ib_udata *udata)
-{
-	struct ib_device *ibdev = ibcq->device;
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
-	struct i40iw_cq *iwcq = to_iwcq(ibcq);
-	struct i40iw_pbl *iwpbl;
-	u32 cq_num = 0;
-	struct i40iw_sc_cq *cq;
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_cq_init_info info = {};
-	enum i40iw_status_code status;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	struct i40iw_cq_uk_init_info *ukinfo = &info.cq_uk_init_info;
-	unsigned long flags;
-	int err_code;
-	int entries = attr->cqe;
-
-	if (attr->flags)
-		return -EOPNOTSUPP;
-
-	if (iwdev->closing)
-		return -ENODEV;
-
-	if (entries > iwdev->max_cqe)
-		return -EINVAL;
-
-	err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_cqs,
-					iwdev->max_cq, &cq_num,
-					&iwdev->next_cq);
-	if (err_code)
-		return err_code;
-
-	cq = &iwcq->sc_cq;
-	cq->back_cq = (void *)iwcq;
-	spin_lock_init(&iwcq->lock);
-
-	info.dev = dev;
-	ukinfo->cq_size = max(entries, 4);
-	ukinfo->cq_id = cq_num;
-	iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size;
-	info.ceqe_mask = 0;
-	if (attr->comp_vector < iwdev->ceqs_count)
-		info.ceq_id = attr->comp_vector;
-	info.ceq_id_valid = true;
-	info.ceqe_mask = 1;
-	info.type = I40IW_CQ_TYPE_IWARP;
-	if (udata) {
-		struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
-			udata, struct i40iw_ucontext, ibucontext);
-		struct i40iw_create_cq_req req;
-		struct i40iw_cq_mr *cqmr;
-
-		memset(&req, 0, sizeof(req));
-		iwcq->user_mode = true;
-		if (ib_copy_from_udata(&req, udata, sizeof(struct i40iw_create_cq_req))) {
-			err_code = -EFAULT;
-			goto cq_free_resources;
-		}
-
-		spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
-		iwpbl = i40iw_get_pbl((unsigned long)req.user_cq_buffer,
-				      &ucontext->cq_reg_mem_list);
-		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
-		if (!iwpbl) {
-			err_code = -EPROTO;
-			goto cq_free_resources;
-		}
-
-		iwcq->iwpbl = iwpbl;
-		iwcq->cq_mem_size = 0;
-		cqmr = &iwpbl->cq_mr;
-		info.shadow_area_pa = cpu_to_le64(cqmr->shadow);
-		if (iwpbl->pbl_allocated) {
-			info.virtual_map = true;
-			info.pbl_chunk_size = 1;
-			info.first_pm_pbl_idx = cqmr->cq_pbl.idx;
-		} else {
-			info.cq_base_pa = cqmr->cq_pbl.addr;
-		}
-	} else {
-		/* Kmode allocations */
-		int rsize;
-		int shadow;
-
-		rsize = info.cq_uk_init_info.cq_size * sizeof(struct i40iw_cqe);
-		rsize = round_up(rsize, 256);
-		shadow = I40IW_SHADOW_AREA_SIZE << 3;
-		status = i40iw_allocate_dma_mem(dev->hw, &iwcq->kmem,
-						rsize + shadow, 256);
-		if (status) {
-			err_code = -ENOMEM;
-			goto cq_free_resources;
-		}
-		ukinfo->cq_base = iwcq->kmem.va;
-		info.cq_base_pa = iwcq->kmem.pa;
-		info.shadow_area_pa = info.cq_base_pa + rsize;
-		ukinfo->shadow_area = iwcq->kmem.va + rsize;
-	}
-
-	if (dev->iw_priv_cq_ops->cq_init(cq, &info)) {
-		i40iw_pr_err("init cq fail\n");
-		err_code = -EPROTO;
-		goto cq_free_resources;
-	}
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
-	if (!cqp_request) {
-		err_code = -ENOMEM;
-		goto cq_free_resources;
-	}
-
-	cqp_info = &cqp_request->info;
-	cqp_info->cqp_cmd = OP_CQ_CREATE;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.cq_create.cq = cq;
-	cqp_info->in.u.cq_create.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status) {
-		i40iw_pr_err("CQP-OP Create QP fail");
-		err_code = -EPROTO;
-		goto cq_free_resources;
-	}
-
-	if (udata) {
-		struct i40iw_create_cq_resp resp;
-
-		memset(&resp, 0, sizeof(resp));
-		resp.cq_id = info.cq_uk_init_info.cq_id;
-		resp.cq_size = info.cq_uk_init_info.cq_size;
-		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
-			i40iw_pr_err("copy to user data\n");
-			err_code = -EPROTO;
-			goto cq_destroy;
-		}
-	}
-
-	i40iw_add_devusecount(iwdev);
-	return 0;
-
-cq_destroy:
-	i40iw_cq_wq_destroy(iwdev, cq);
-cq_free_resources:
-	cq_free_resources(iwdev, iwcq);
-	return err_code;
-}
-
-/**
- * i40iw_get_user_access - get hw access from IB access
- * @acc: IB access to return hw access
- */
-static inline u16 i40iw_get_user_access(int acc)
-{
-	u16 access = 0;
-
-	access |= (acc & IB_ACCESS_LOCAL_WRITE) ? I40IW_ACCESS_FLAGS_LOCALWRITE : 0;
-	access |= (acc & IB_ACCESS_REMOTE_WRITE) ? I40IW_ACCESS_FLAGS_REMOTEWRITE : 0;
-	access |= (acc & IB_ACCESS_REMOTE_READ) ? I40IW_ACCESS_FLAGS_REMOTEREAD : 0;
-	access |= (acc & IB_ACCESS_MW_BIND) ? I40IW_ACCESS_FLAGS_BIND_WINDOW : 0;
-	return access;
-}
-
-/**
- * i40iw_free_stag - free stag resource
- * @iwdev: iwarp device
- * @stag: stag to free
- */
-static void i40iw_free_stag(struct i40iw_device *iwdev, u32 stag)
-{
-	u32 stag_idx;
-
-	stag_idx = (stag & iwdev->mr_stagmask) >> I40IW_CQPSQ_STAG_IDX_SHIFT;
-	i40iw_free_resource(iwdev, iwdev->allocated_mrs, stag_idx);
-	i40iw_rem_devusecount(iwdev);
-}
-
-/**
- * i40iw_create_stag - create random stag
- * @iwdev: iwarp device
- */
-static u32 i40iw_create_stag(struct i40iw_device *iwdev)
-{
-	u32 stag = 0;
-	u32 stag_index = 0;
-	u32 next_stag_index;
-	u32 driver_key;
-	u32 random;
-	u8 consumer_key;
-	int ret;
-
-	get_random_bytes(&random, sizeof(random));
-	consumer_key = (u8)random;
-
-	driver_key = random & ~iwdev->mr_stagmask;
-	next_stag_index = (random & iwdev->mr_stagmask) >> 8;
-	next_stag_index %= iwdev->max_mr;
-
-	ret = i40iw_alloc_resource(iwdev,
-				   iwdev->allocated_mrs, iwdev->max_mr,
-				   &stag_index, &next_stag_index);
-	if (!ret) {
-		stag = stag_index << I40IW_CQPSQ_STAG_IDX_SHIFT;
-		stag |= driver_key;
-		stag += (u32)consumer_key;
-		i40iw_add_devusecount(iwdev);
-	}
-	return stag;
-}
-
-/**
- * i40iw_next_pbl_addr - Get next pbl address
- * @pbl: pointer to a pble
- * @pinfo: info pointer
- * @idx: index
- */
-static inline u64 *i40iw_next_pbl_addr(u64 *pbl,
-				       struct i40iw_pble_info **pinfo,
-				       u32 *idx)
-{
-	*idx += 1;
-	if ((!(*pinfo)) || (*idx != (*pinfo)->cnt))
-		return ++pbl;
-	*idx = 0;
-	(*pinfo)++;
-	return (u64 *)(*pinfo)->addr;
-}
-
-/**
- * i40iw_copy_user_pgaddrs - copy user page address to pble's os locally
- * @iwmr: iwmr for IB's user page addresses
- * @pbl: ple pointer to save 1 level or 0 level pble
- * @level: indicated level 0, 1 or 2
- */
-static void i40iw_copy_user_pgaddrs(struct i40iw_mr *iwmr,
-				    u64 *pbl,
-				    enum i40iw_pble_level level)
-{
-	struct ib_umem *region = iwmr->region;
-	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
-	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
-	struct i40iw_pble_info *pinfo;
-	struct ib_block_iter biter;
-	u32 idx = 0;
-
-	pinfo = (level == I40IW_LEVEL_1) ? NULL : palloc->level2.leaf;
-
-	if (iwmr->type == IW_MEMREG_TYPE_QP)
-		iwpbl->qp_mr.sq_page = sg_page(region->sg_head.sgl);
-
-	rdma_umem_for_each_dma_block(region, &biter, iwmr->page_size) {
-		*pbl = rdma_block_iter_dma_address(&biter);
-		pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
-	}
-}
-
-/**
- * i40iw_check_mem_contiguous - check if pbls stored in arr are contiguous
- * @arr: lvl1 pbl array
- * @npages: page count
- * @pg_size: page size
- *
- */
-static bool i40iw_check_mem_contiguous(u64 *arr, u32 npages, u32 pg_size)
-{
-	u32 pg_idx;
-
-	for (pg_idx = 0; pg_idx < npages; pg_idx++) {
-		if ((*arr + (pg_size * pg_idx)) != arr[pg_idx])
-			return false;
-	}
-	return true;
-}
-
-/**
- * i40iw_check_mr_contiguous - check if MR is physically contiguous
- * @palloc: pbl allocation struct
- * @pg_size: page size
- */
-static bool i40iw_check_mr_contiguous(struct i40iw_pble_alloc *palloc, u32 pg_size)
-{
-	struct i40iw_pble_level2 *lvl2 = &palloc->level2;
-	struct i40iw_pble_info *leaf = lvl2->leaf;
-	u64 *arr = NULL;
-	u64 *start_addr = NULL;
-	int i;
-	bool ret;
-
-	if (palloc->level == I40IW_LEVEL_1) {
-		arr = (u64 *)palloc->level1.addr;
-		ret = i40iw_check_mem_contiguous(arr, palloc->total_cnt, pg_size);
-		return ret;
-	}
-
-	start_addr = (u64 *)leaf->addr;
-
-	for (i = 0; i < lvl2->leaf_cnt; i++, leaf++) {
-		arr = (u64 *)leaf->addr;
-		if ((*start_addr + (i * pg_size * PBLE_PER_PAGE)) != *arr)
-			return false;
-		ret = i40iw_check_mem_contiguous(arr, leaf->cnt, pg_size);
-		if (!ret)
-			return false;
-	}
-
-	return true;
-}
-
-/**
- * i40iw_setup_pbles - copy user pg address to pble's
- * @iwdev: iwarp device
- * @iwmr: mr pointer for this memory registration
- * @use_pbles: flag if to use pble's
- */
-static int i40iw_setup_pbles(struct i40iw_device *iwdev,
-			     struct i40iw_mr *iwmr,
-			     bool use_pbles)
-{
-	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
-	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
-	struct i40iw_pble_info *pinfo;
-	u64 *pbl;
-	enum i40iw_status_code status;
-	enum i40iw_pble_level level = I40IW_LEVEL_1;
-
-	if (use_pbles) {
-		mutex_lock(&iwdev->pbl_mutex);
-		status = i40iw_get_pble(&iwdev->sc_dev, iwdev->pble_rsrc, palloc, iwmr->page_cnt);
-		mutex_unlock(&iwdev->pbl_mutex);
-		if (status)
-			return -ENOMEM;
-
-		iwpbl->pbl_allocated = true;
-		level = palloc->level;
-		pinfo = (level == I40IW_LEVEL_1) ? &palloc->level1 : palloc->level2.leaf;
-		pbl = (u64 *)pinfo->addr;
-	} else {
-		pbl = iwmr->pgaddrmem;
-	}
-
-	i40iw_copy_user_pgaddrs(iwmr, pbl, level);
-
-	if (use_pbles)
-		iwmr->pgaddrmem[0] = *pbl;
-
-	return 0;
-}
-
-/**
- * i40iw_handle_q_mem - handle memory for qp and cq
- * @iwdev: iwarp device
- * @req: information for q memory management
- * @iwpbl: pble struct
- * @use_pbles: flag to use pble
- */
-static int i40iw_handle_q_mem(struct i40iw_device *iwdev,
-			      struct i40iw_mem_reg_req *req,
-			      struct i40iw_pbl *iwpbl,
-			      bool use_pbles)
-{
-	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
-	struct i40iw_mr *iwmr = iwpbl->iwmr;
-	struct i40iw_qp_mr *qpmr = &iwpbl->qp_mr;
-	struct i40iw_cq_mr *cqmr = &iwpbl->cq_mr;
-	struct i40iw_hmc_pble *hmc_p;
-	u64 *arr = iwmr->pgaddrmem;
-	u32 pg_size;
-	int err;
-	int total;
-	bool ret = true;
-
-	total = req->sq_pages + req->rq_pages + req->cq_pages;
-	pg_size = iwmr->page_size;
-
-	err = i40iw_setup_pbles(iwdev, iwmr, use_pbles);
-	if (err)
-		return err;
-
-	if (use_pbles && (palloc->level != I40IW_LEVEL_1)) {
-		i40iw_free_pble(iwdev->pble_rsrc, palloc);
-		iwpbl->pbl_allocated = false;
-		return -ENOMEM;
-	}
-
-	if (use_pbles)
-		arr = (u64 *)palloc->level1.addr;
-
-	if (iwmr->type == IW_MEMREG_TYPE_QP) {
-		hmc_p = &qpmr->sq_pbl;
-		qpmr->shadow = (dma_addr_t)arr[total];
-
-		if (use_pbles) {
-			ret = i40iw_check_mem_contiguous(arr, req->sq_pages, pg_size);
-			if (ret)
-				ret = i40iw_check_mem_contiguous(&arr[req->sq_pages], req->rq_pages, pg_size);
-		}
-
-		if (!ret) {
-			hmc_p->idx = palloc->level1.idx;
-			hmc_p = &qpmr->rq_pbl;
-			hmc_p->idx = palloc->level1.idx + req->sq_pages;
-		} else {
-			hmc_p->addr = arr[0];
-			hmc_p = &qpmr->rq_pbl;
-			hmc_p->addr = arr[req->sq_pages];
-		}
-	} else {		/* CQ */
-		hmc_p = &cqmr->cq_pbl;
-		cqmr->shadow = (dma_addr_t)arr[total];
-
-		if (use_pbles)
-			ret = i40iw_check_mem_contiguous(arr, req->cq_pages, pg_size);
-
-		if (!ret)
-			hmc_p->idx = palloc->level1.idx;
-		else
-			hmc_p->addr = arr[0];
-	}
-
-	if (use_pbles && ret) {
-		i40iw_free_pble(iwdev->pble_rsrc, palloc);
-		iwpbl->pbl_allocated = false;
-	}
-
-	return err;
-}
-
-/**
- * i40iw_hw_alloc_stag - cqp command to allocate stag
- * @iwdev: iwarp device
- * @iwmr: iwarp mr pointer
- */
-static int i40iw_hw_alloc_stag(struct i40iw_device *iwdev, struct i40iw_mr *iwmr)
-{
-	struct i40iw_allocate_stag_info *info;
-	struct i40iw_pd *iwpd = to_iwpd(iwmr->ibmr.pd);
-	enum i40iw_status_code status;
-	int err = 0;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
-	if (!cqp_request)
-		return -ENOMEM;
-
-	cqp_info = &cqp_request->info;
-	info = &cqp_info->in.u.alloc_stag.info;
-	memset(info, 0, sizeof(*info));
-	info->page_size = PAGE_SIZE;
-	info->stag_idx = iwmr->stag >> I40IW_CQPSQ_STAG_IDX_SHIFT;
-	info->pd_id = iwpd->sc_pd.pd_id;
-	info->total_len = iwmr->length;
-	info->remote_access = true;
-	cqp_info->cqp_cmd = OP_ALLOC_STAG;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.alloc_stag.dev = &iwdev->sc_dev;
-	cqp_info->in.u.alloc_stag.scratch = (uintptr_t)cqp_request;
-
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status) {
-		err = -ENOMEM;
-		i40iw_pr_err("CQP-OP MR Reg fail");
-	}
-	return err;
-}
-
-/**
- * i40iw_alloc_mr - register stag for fast memory registration
- * @pd: ibpd pointer
- * @mr_type: memory for stag registrion
- * @max_num_sg: man number of pages
- */
-static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-				    u32 max_num_sg)
-{
-	struct i40iw_pd *iwpd = to_iwpd(pd);
-	struct i40iw_device *iwdev = to_iwdev(pd->device);
-	struct i40iw_pble_alloc *palloc;
-	struct i40iw_pbl *iwpbl;
-	struct i40iw_mr *iwmr;
-	enum i40iw_status_code status;
-	u32 stag;
-	int err_code = -ENOMEM;
-
-	iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL);
-	if (!iwmr)
-		return ERR_PTR(-ENOMEM);
-
-	stag = i40iw_create_stag(iwdev);
-	if (!stag) {
-		err_code = -EOVERFLOW;
-		goto err;
-	}
-	stag &= ~I40IW_CQPSQ_STAG_KEY_MASK;
-	iwmr->stag = stag;
-	iwmr->ibmr.rkey = stag;
-	iwmr->ibmr.lkey = stag;
-	iwmr->ibmr.pd = pd;
-	iwmr->ibmr.device = pd->device;
-	iwpbl = &iwmr->iwpbl;
-	iwpbl->iwmr = iwmr;
-	iwmr->type = IW_MEMREG_TYPE_MEM;
-	palloc = &iwpbl->pble_alloc;
-	iwmr->page_cnt = max_num_sg;
-	mutex_lock(&iwdev->pbl_mutex);
-	status = i40iw_get_pble(&iwdev->sc_dev, iwdev->pble_rsrc, palloc, iwmr->page_cnt);
-	mutex_unlock(&iwdev->pbl_mutex);
-	if (status)
-		goto err1;
-
-	if (palloc->level != I40IW_LEVEL_1)
-		goto err2;
-	err_code = i40iw_hw_alloc_stag(iwdev, iwmr);
-	if (err_code)
-		goto err2;
-	iwpbl->pbl_allocated = true;
-	i40iw_add_pdusecount(iwpd);
-	return &iwmr->ibmr;
-err2:
-	i40iw_free_pble(iwdev->pble_rsrc, palloc);
-err1:
-	i40iw_free_stag(iwdev, stag);
-err:
-	kfree(iwmr);
-	return ERR_PTR(err_code);
-}
-
-/**
- * i40iw_set_page - populate pbl list for fmr
- * @ibmr: ib mem to access iwarp mr pointer
- * @addr: page dma address fro pbl list
- */
-static int i40iw_set_page(struct ib_mr *ibmr, u64 addr)
-{
-	struct i40iw_mr *iwmr = to_iwmr(ibmr);
-	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
-	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
-	u64 *pbl;
-
-	if (unlikely(iwmr->npages == iwmr->page_cnt))
-		return -ENOMEM;
-
-	pbl = (u64 *)palloc->level1.addr;
-	pbl[iwmr->npages++] = cpu_to_le64(addr);
-	return 0;
-}
-
-/**
- * i40iw_map_mr_sg - map of sg list for fmr
- * @ibmr: ib mem to access iwarp mr pointer
- * @sg: scatter gather list for fmr
- * @sg_nents: number of sg pages
- * @sg_offset: scatter gather offset
- */
-static int i40iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
-			   int sg_nents, unsigned int *sg_offset)
-{
-	struct i40iw_mr *iwmr = to_iwmr(ibmr);
-
-	iwmr->npages = 0;
-	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, i40iw_set_page);
-}
-
-/**
- * i40iw_drain_sq - drain the send queue
- * @ibqp: ib qp pointer
- */
-static void i40iw_drain_sq(struct ib_qp *ibqp)
-{
-	struct i40iw_qp *iwqp = to_iwqp(ibqp);
-	struct i40iw_sc_qp *qp = &iwqp->sc_qp;
-
-	if (I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring))
-		wait_for_completion(&iwqp->sq_drained);
-}
-
-/**
- * i40iw_drain_rq - drain the receive queue
- * @ibqp: ib qp pointer
- */
-static void i40iw_drain_rq(struct ib_qp *ibqp)
-{
-	struct i40iw_qp *iwqp = to_iwqp(ibqp);
-	struct i40iw_sc_qp *qp = &iwqp->sc_qp;
-
-	if (I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring))
-		wait_for_completion(&iwqp->rq_drained);
-}
-
-/**
- * i40iw_hwreg_mr - send cqp command for memory registration
- * @iwdev: iwarp device
- * @iwmr: iwarp mr pointer
- * @access: access for MR
- */
-static int i40iw_hwreg_mr(struct i40iw_device *iwdev,
-			  struct i40iw_mr *iwmr,
-			  u16 access)
-{
-	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
-	struct i40iw_reg_ns_stag_info *stag_info;
-	struct i40iw_pd *iwpd = to_iwpd(iwmr->ibmr.pd);
-	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
-	enum i40iw_status_code status;
-	int err = 0;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
-	if (!cqp_request)
-		return -ENOMEM;
-
-	cqp_info = &cqp_request->info;
-	stag_info = &cqp_info->in.u.mr_reg_non_shared.info;
-	memset(stag_info, 0, sizeof(*stag_info));
-	stag_info->va = (void *)(unsigned long)iwpbl->user_base;
-	stag_info->stag_idx = iwmr->stag >> I40IW_CQPSQ_STAG_IDX_SHIFT;
-	stag_info->stag_key = (u8)iwmr->stag;
-	stag_info->total_len = iwmr->length;
-	stag_info->access_rights = access;
-	stag_info->pd_id = iwpd->sc_pd.pd_id;
-	stag_info->addr_type = I40IW_ADDR_TYPE_VA_BASED;
-	stag_info->page_size = iwmr->page_size;
-
-	if (iwpbl->pbl_allocated) {
-		if (palloc->level == I40IW_LEVEL_1) {
-			stag_info->first_pm_pbl_index = palloc->level1.idx;
-			stag_info->chunk_size = 1;
-		} else {
-			stag_info->first_pm_pbl_index = palloc->level2.root.idx;
-			stag_info->chunk_size = 3;
-		}
-	} else {
-		stag_info->reg_addr_pa = iwmr->pgaddrmem[0];
-	}
-
-	cqp_info->cqp_cmd = OP_MR_REG_NON_SHARED;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.mr_reg_non_shared.dev = &iwdev->sc_dev;
-	cqp_info->in.u.mr_reg_non_shared.scratch = (uintptr_t)cqp_request;
-
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status) {
-		err = -ENOMEM;
-		i40iw_pr_err("CQP-OP MR Reg fail");
-	}
-	return err;
-}
-
-/**
- * i40iw_reg_user_mr - Register a user memory region
- * @pd: ptr of pd
- * @start: virtual start address
- * @length: length of mr
- * @virt: virtual address
- * @acc: access of mr
- * @udata: user data
- */
-static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
-				       u64 start,
-				       u64 length,
-				       u64 virt,
-				       int acc,
-				       struct ib_udata *udata)
-{
-	struct i40iw_pd *iwpd = to_iwpd(pd);
-	struct i40iw_device *iwdev = to_iwdev(pd->device);
-	struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
-		udata, struct i40iw_ucontext, ibucontext);
-	struct i40iw_pble_alloc *palloc;
-	struct i40iw_pbl *iwpbl;
-	struct i40iw_mr *iwmr;
-	struct ib_umem *region;
-	struct i40iw_mem_reg_req req;
-	u32 stag = 0;
-	u16 access;
-	bool use_pbles = false;
-	unsigned long flags;
-	int err = -ENOSYS;
-	int ret;
-
-	if (!udata)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	if (iwdev->closing)
-		return ERR_PTR(-ENODEV);
-
-	if (length > I40IW_MAX_MR_SIZE)
-		return ERR_PTR(-EINVAL);
-	region = ib_umem_get(pd->device, start, length, acc);
-	if (IS_ERR(region))
-		return (struct ib_mr *)region;
-
-	if (ib_copy_from_udata(&req, udata, sizeof(req))) {
-		ib_umem_release(region);
-		return ERR_PTR(-EFAULT);
-	}
-
-	iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL);
-	if (!iwmr) {
-		ib_umem_release(region);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	iwpbl = &iwmr->iwpbl;
-	iwpbl->iwmr = iwmr;
-	iwmr->region = region;
-	iwmr->ibmr.pd = pd;
-	iwmr->ibmr.device = pd->device;
-
-	iwmr->page_size = PAGE_SIZE;
-	if (req.reg_type == IW_MEMREG_TYPE_MEM)
-		iwmr->page_size = ib_umem_find_best_pgsz(region, SZ_4K | SZ_2M,
-							 virt);
-	iwmr->length = region->length;
-
-	iwpbl->user_base = virt;
-	palloc = &iwpbl->pble_alloc;
-
-	iwmr->type = req.reg_type;
-	iwmr->page_cnt = ib_umem_num_dma_blocks(region, iwmr->page_size);
-
-	switch (req.reg_type) {
-	case IW_MEMREG_TYPE_QP:
-		use_pbles = ((req.sq_pages + req.rq_pages) > 2);
-		err = i40iw_handle_q_mem(iwdev, &req, iwpbl, use_pbles);
-		if (err)
-			goto error;
-		spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags);
-		list_add_tail(&iwpbl->list, &ucontext->qp_reg_mem_list);
-		iwpbl->on_list = true;
-		spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
-		break;
-	case IW_MEMREG_TYPE_CQ:
-		use_pbles = (req.cq_pages > 1);
-		err = i40iw_handle_q_mem(iwdev, &req, iwpbl, use_pbles);
-		if (err)
-			goto error;
-
-		spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
-		list_add_tail(&iwpbl->list, &ucontext->cq_reg_mem_list);
-		iwpbl->on_list = true;
-		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
-		break;
-	case IW_MEMREG_TYPE_MEM:
-		use_pbles = (iwmr->page_cnt != 1);
-		access = I40IW_ACCESS_FLAGS_LOCALREAD;
-
-		err = i40iw_setup_pbles(iwdev, iwmr, use_pbles);
-		if (err)
-			goto error;
-
-		if (use_pbles) {
-			ret = i40iw_check_mr_contiguous(palloc, iwmr->page_size);
-			if (ret) {
-				i40iw_free_pble(iwdev->pble_rsrc, palloc);
-				iwpbl->pbl_allocated = false;
-			}
-		}
-
-		access |= i40iw_get_user_access(acc);
-		stag = i40iw_create_stag(iwdev);
-		if (!stag) {
-			err = -ENOMEM;
-			goto error;
-		}
-
-		iwmr->stag = stag;
-		iwmr->ibmr.rkey = stag;
-		iwmr->ibmr.lkey = stag;
-
-		err = i40iw_hwreg_mr(iwdev, iwmr, access);
-		if (err) {
-			i40iw_free_stag(iwdev, stag);
-			goto error;
-		}
-
-		break;
-	default:
-		goto error;
-	}
-
-	iwmr->type = req.reg_type;
-	if (req.reg_type == IW_MEMREG_TYPE_MEM)
-		i40iw_add_pdusecount(iwpd);
-	return &iwmr->ibmr;
-
-error:
-	if (palloc->level != I40IW_LEVEL_0 && iwpbl->pbl_allocated)
-		i40iw_free_pble(iwdev->pble_rsrc, palloc);
-	ib_umem_release(region);
-	kfree(iwmr);
-	return ERR_PTR(err);
-}
-
-/**
- * i40iw_reg_phys_mr - register kernel physical memory
- * @pd: ibpd pointer
- * @addr: physical address of memory to register
- * @size: size of memory to register
- * @acc: Access rights
- * @iova_start: start of virtual address for physical buffers
- */
-struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *pd,
-				u64 addr,
-				u64 size,
-				int acc,
-				u64 *iova_start)
-{
-	struct i40iw_pd *iwpd = to_iwpd(pd);
-	struct i40iw_device *iwdev = to_iwdev(pd->device);
-	struct i40iw_pbl *iwpbl;
-	struct i40iw_mr *iwmr;
-	enum i40iw_status_code status;
-	u32 stag;
-	u16 access = I40IW_ACCESS_FLAGS_LOCALREAD;
-	int ret;
-
-	iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL);
-	if (!iwmr)
-		return ERR_PTR(-ENOMEM);
-	iwmr->ibmr.pd = pd;
-	iwmr->ibmr.device = pd->device;
-	iwpbl = &iwmr->iwpbl;
-	iwpbl->iwmr = iwmr;
-	iwmr->type = IW_MEMREG_TYPE_MEM;
-	iwpbl->user_base = *iova_start;
-	stag = i40iw_create_stag(iwdev);
-	if (!stag) {
-		ret = -EOVERFLOW;
-		goto err;
-	}
-	access |= i40iw_get_user_access(acc);
-	iwmr->stag = stag;
-	iwmr->ibmr.rkey = stag;
-	iwmr->ibmr.lkey = stag;
-	iwmr->page_cnt = 1;
-	iwmr->pgaddrmem[0]  = addr;
-	iwmr->length = size;
-	status = i40iw_hwreg_mr(iwdev, iwmr, access);
-	if (status) {
-		i40iw_free_stag(iwdev, stag);
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	i40iw_add_pdusecount(iwpd);
-	return &iwmr->ibmr;
- err:
-	kfree(iwmr);
-	return ERR_PTR(ret);
-}
-
-/**
- * i40iw_get_dma_mr - register physical mem
- * @pd: ptr of pd
- * @acc: access for memory
- */
-static struct ib_mr *i40iw_get_dma_mr(struct ib_pd *pd, int acc)
-{
-	u64 kva = 0;
-
-	return i40iw_reg_phys_mr(pd, 0, 0, acc, &kva);
-}
-
-/**
- * i40iw_del_memlist - Deleting pbl list entries for CQ/QP
- * @iwmr: iwmr for IB's user page addresses
- * @ucontext: ptr to user context
- */
-static void i40iw_del_memlist(struct i40iw_mr *iwmr,
-			      struct i40iw_ucontext *ucontext)
-{
-	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
-	unsigned long flags;
-
-	switch (iwmr->type) {
-	case IW_MEMREG_TYPE_CQ:
-		spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
-		if (iwpbl->on_list) {
-			iwpbl->on_list = false;
-			list_del(&iwpbl->list);
-		}
-		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
-		break;
-	case IW_MEMREG_TYPE_QP:
-		spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags);
-		if (iwpbl->on_list) {
-			iwpbl->on_list = false;
-			list_del(&iwpbl->list);
-		}
-		spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
-		break;
-	default:
-		break;
-	}
-}
-
-/**
- * i40iw_dereg_mr - deregister mr
- * @ib_mr: mr ptr for dereg
- * @udata: user data
- */
-static int i40iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
-{
-	struct ib_pd *ibpd = ib_mr->pd;
-	struct i40iw_pd *iwpd = to_iwpd(ibpd);
-	struct i40iw_mr *iwmr = to_iwmr(ib_mr);
-	struct i40iw_device *iwdev = to_iwdev(ib_mr->device);
-	enum i40iw_status_code status;
-	struct i40iw_dealloc_stag_info *info;
-	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
-	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
-	struct i40iw_cqp_request *cqp_request;
-	struct cqp_commands_info *cqp_info;
-	u32 stag_idx;
-
-	ib_umem_release(iwmr->region);
-
-	if (iwmr->type != IW_MEMREG_TYPE_MEM) {
-		/* region is released. only test for userness. */
-		if (iwmr->region) {
-			struct i40iw_ucontext *ucontext =
-				rdma_udata_to_drv_context(
-					udata,
-					struct i40iw_ucontext,
-					ibucontext);
-
-			i40iw_del_memlist(iwmr, ucontext);
-		}
-		if (iwpbl->pbl_allocated && iwmr->type != IW_MEMREG_TYPE_QP)
-			i40iw_free_pble(iwdev->pble_rsrc, palloc);
-		kfree(iwmr);
-		return 0;
-	}
-
-	cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
-	if (!cqp_request)
-		return -ENOMEM;
-
-	cqp_info = &cqp_request->info;
-	info = &cqp_info->in.u.dealloc_stag.info;
-	memset(info, 0, sizeof(*info));
-
-	info->pd_id = cpu_to_le32(iwpd->sc_pd.pd_id & 0x00007fff);
-	info->stag_idx = RS_64_1(ib_mr->rkey, I40IW_CQPSQ_STAG_IDX_SHIFT);
-	stag_idx = info->stag_idx;
-	info->mr = true;
-	if (iwpbl->pbl_allocated)
-		info->dealloc_pbl = true;
-
-	cqp_info->cqp_cmd = OP_DEALLOC_STAG;
-	cqp_info->post_sq = 1;
-	cqp_info->in.u.dealloc_stag.dev = &iwdev->sc_dev;
-	cqp_info->in.u.dealloc_stag.scratch = (uintptr_t)cqp_request;
-	status = i40iw_handle_cqp_op(iwdev, cqp_request);
-	if (status)
-		i40iw_pr_err("CQP-OP dealloc failed for stag_idx = 0x%x\n", stag_idx);
-	i40iw_rem_pdusecount(iwpd, iwdev);
-	i40iw_free_stag(iwdev, iwmr->stag);
-	if (iwpbl->pbl_allocated)
-		i40iw_free_pble(iwdev->pble_rsrc, palloc);
-	kfree(iwmr);
-	return 0;
-}
-
-/*
- * hw_rev_show
- */
-static ssize_t hw_rev_show(struct device *dev,
-			   struct device_attribute *attr, char *buf)
-{
-	struct i40iw_ib_device *iwibdev =
-		rdma_device_to_drv_device(dev, struct i40iw_ib_device, ibdev);
-	u32 hw_rev = iwibdev->iwdev->sc_dev.hw_rev;
-
-	return sysfs_emit(buf, "%x\n", hw_rev);
-}
-static DEVICE_ATTR_RO(hw_rev);
-
-/*
- * hca_type_show
- */
-static ssize_t hca_type_show(struct device *dev,
-			     struct device_attribute *attr, char *buf)
-{
-	return sysfs_emit(buf, "I40IW\n");
-}
-static DEVICE_ATTR_RO(hca_type);
-
-/*
- * board_id_show
- */
-static ssize_t board_id_show(struct device *dev,
-			     struct device_attribute *attr, char *buf)
-{
-	return sysfs_emit(buf, "%.*s\n", 32, "I40IW Board ID");
-}
-static DEVICE_ATTR_RO(board_id);
-
-static struct attribute *i40iw_dev_attributes[] = {
-	&dev_attr_hw_rev.attr,
-	&dev_attr_hca_type.attr,
-	&dev_attr_board_id.attr,
-	NULL
-};
-
-static const struct attribute_group i40iw_attr_group = {
-	.attrs = i40iw_dev_attributes,
-};
-
-/**
- * i40iw_copy_sg_list - copy sg list for qp
- * @sg_list: copied into sg_list
- * @sgl: copy from sgl
- * @num_sges: count of sg entries
- */
-static void i40iw_copy_sg_list(struct i40iw_sge *sg_list, struct ib_sge *sgl, int num_sges)
-{
-	unsigned int i;
-
-	for (i = 0; (i < num_sges) && (i < I40IW_MAX_WQ_FRAGMENT_COUNT); i++) {
-		sg_list[i].tag_off = sgl[i].addr;
-		sg_list[i].len = sgl[i].length;
-		sg_list[i].stag = sgl[i].lkey;
-	}
-}
-
-/**
- * i40iw_post_send -  kernel application wr
- * @ibqp: qp ptr for wr
- * @ib_wr: work request ptr
- * @bad_wr: return of bad wr if err
- */
-static int i40iw_post_send(struct ib_qp *ibqp,
-			   const struct ib_send_wr *ib_wr,
-			   const struct ib_send_wr **bad_wr)
-{
-	struct i40iw_qp *iwqp;
-	struct i40iw_qp_uk *ukqp;
-	struct i40iw_post_sq_info info;
-	enum i40iw_status_code ret;
-	int err = 0;
-	unsigned long flags;
-	bool inv_stag;
-
-	iwqp = (struct i40iw_qp *)ibqp;
-	ukqp = &iwqp->sc_qp.qp_uk;
-
-	spin_lock_irqsave(&iwqp->lock, flags);
-
-	if (iwqp->flush_issued) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	while (ib_wr) {
-		inv_stag = false;
-		memset(&info, 0, sizeof(info));
-		info.wr_id = (u64)(ib_wr->wr_id);
-		if ((ib_wr->send_flags & IB_SEND_SIGNALED) || iwqp->sig_all)
-			info.signaled = true;
-		if (ib_wr->send_flags & IB_SEND_FENCE)
-			info.read_fence = true;
-
-		switch (ib_wr->opcode) {
-		case IB_WR_SEND:
-		case IB_WR_SEND_WITH_INV:
-			if (ib_wr->opcode == IB_WR_SEND) {
-				if (ib_wr->send_flags & IB_SEND_SOLICITED)
-					info.op_type = I40IW_OP_TYPE_SEND_SOL;
-				else
-					info.op_type = I40IW_OP_TYPE_SEND;
-			} else {
-				if (ib_wr->send_flags & IB_SEND_SOLICITED)
-					info.op_type = I40IW_OP_TYPE_SEND_SOL_INV;
-				else
-					info.op_type = I40IW_OP_TYPE_SEND_INV;
-			}
-
-			if (ib_wr->send_flags & IB_SEND_INLINE) {
-				info.op.inline_send.data = (void *)(unsigned long)ib_wr->sg_list[0].addr;
-				info.op.inline_send.len = ib_wr->sg_list[0].length;
-				ret = ukqp->ops.iw_inline_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false);
-			} else {
-				info.op.send.num_sges = ib_wr->num_sge;
-				info.op.send.sg_list = (struct i40iw_sge *)ib_wr->sg_list;
-				ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false);
-			}
-
-			if (ret) {
-				if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
-					err = -ENOMEM;
-				else
-					err = -EINVAL;
-			}
-			break;
-		case IB_WR_RDMA_WRITE:
-			info.op_type = I40IW_OP_TYPE_RDMA_WRITE;
-
-			if (ib_wr->send_flags & IB_SEND_INLINE) {
-				info.op.inline_rdma_write.data = (void *)(unsigned long)ib_wr->sg_list[0].addr;
-				info.op.inline_rdma_write.len = ib_wr->sg_list[0].length;
-				info.op.inline_rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr;
-				info.op.inline_rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey;
-				ret = ukqp->ops.iw_inline_rdma_write(ukqp, &info, false);
-			} else {
-				info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list;
-				info.op.rdma_write.num_lo_sges = ib_wr->num_sge;
-				info.op.rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr;
-				info.op.rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey;
-				ret = ukqp->ops.iw_rdma_write(ukqp, &info, false);
-			}
-
-			if (ret) {
-				if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
-					err = -ENOMEM;
-				else
-					err = -EINVAL;
-			}
-			break;
-		case IB_WR_RDMA_READ_WITH_INV:
-			inv_stag = true;
-			fallthrough;
-		case IB_WR_RDMA_READ:
-			if (ib_wr->num_sge > I40IW_MAX_SGE_RD) {
-				err = -EINVAL;
-				break;
-			}
-			info.op_type = I40IW_OP_TYPE_RDMA_READ;
-			info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr;
-			info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey;
-			info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr;
-			info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey;
-			info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length;
-			ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false);
-			if (ret) {
-				if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
-					err = -ENOMEM;
-				else
-					err = -EINVAL;
-			}
-			break;
-		case IB_WR_LOCAL_INV:
-			info.op_type = I40IW_OP_TYPE_INV_STAG;
-			info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
-			ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true);
-			if (ret)
-				err = -ENOMEM;
-			break;
-		case IB_WR_REG_MR:
-		{
-			struct i40iw_mr *iwmr = to_iwmr(reg_wr(ib_wr)->mr);
-			int flags = reg_wr(ib_wr)->access;
-			struct i40iw_pble_alloc *palloc = &iwmr->iwpbl.pble_alloc;
-			struct i40iw_sc_dev *dev = &iwqp->iwdev->sc_dev;
-			struct i40iw_fast_reg_stag_info info;
-
-			memset(&info, 0, sizeof(info));
-			info.access_rights = I40IW_ACCESS_FLAGS_LOCALREAD;
-			info.access_rights |= i40iw_get_user_access(flags);
-			info.stag_key = reg_wr(ib_wr)->key & 0xff;
-			info.stag_idx = reg_wr(ib_wr)->key >> 8;
-			info.page_size = reg_wr(ib_wr)->mr->page_size;
-			info.wr_id = ib_wr->wr_id;
-
-			info.addr_type = I40IW_ADDR_TYPE_VA_BASED;
-			info.va = (void *)(uintptr_t)iwmr->ibmr.iova;
-			info.total_len = iwmr->ibmr.length;
-			info.reg_addr_pa = *(u64 *)palloc->level1.addr;
-			info.first_pm_pbl_index = palloc->level1.idx;
-			info.local_fence = ib_wr->send_flags & IB_SEND_FENCE;
-			info.signaled = ib_wr->send_flags & IB_SEND_SIGNALED;
-
-			if (iwmr->npages > I40IW_MIN_PAGES_PER_FMR)
-				info.chunk_size = 1;
-
-			ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true);
-			if (ret)
-				err = -ENOMEM;
-			break;
-		}
-		default:
-			err = -EINVAL;
-			i40iw_pr_err(" upost_send bad opcode = 0x%x\n",
-				     ib_wr->opcode);
-			break;
-		}
-
-		if (err)
-			break;
-		ib_wr = ib_wr->next;
-	}
-
-out:
-	if (err)
-		*bad_wr = ib_wr;
-	else
-		ukqp->ops.iw_qp_post_wr(ukqp);
-	spin_unlock_irqrestore(&iwqp->lock, flags);
-
-	return err;
-}
-
-/**
- * i40iw_post_recv - post receive wr for kernel application
- * @ibqp: ib qp pointer
- * @ib_wr: work request for receive
- * @bad_wr: bad wr caused an error
- */
-static int i40iw_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *ib_wr,
-			   const struct ib_recv_wr **bad_wr)
-{
-	struct i40iw_qp *iwqp;
-	struct i40iw_qp_uk *ukqp;
-	struct i40iw_post_rq_info post_recv;
-	struct i40iw_sge sg_list[I40IW_MAX_WQ_FRAGMENT_COUNT];
-	enum i40iw_status_code ret = 0;
-	unsigned long flags;
-	int err = 0;
-
-	iwqp = (struct i40iw_qp *)ibqp;
-	ukqp = &iwqp->sc_qp.qp_uk;
-
-	memset(&post_recv, 0, sizeof(post_recv));
-	spin_lock_irqsave(&iwqp->lock, flags);
-
-	if (iwqp->flush_issued) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	while (ib_wr) {
-		post_recv.num_sges = ib_wr->num_sge;
-		post_recv.wr_id = ib_wr->wr_id;
-		i40iw_copy_sg_list(sg_list, ib_wr->sg_list, ib_wr->num_sge);
-		post_recv.sg_list = sg_list;
-		ret = ukqp->ops.iw_post_receive(ukqp, &post_recv);
-		if (ret) {
-			i40iw_pr_err(" post_recv err %d\n", ret);
-			if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
-				err = -ENOMEM;
-			else
-				err = -EINVAL;
-			*bad_wr = ib_wr;
-			goto out;
-		}
-		ib_wr = ib_wr->next;
-	}
- out:
-	spin_unlock_irqrestore(&iwqp->lock, flags);
-	return err;
-}
-
-/**
- * i40iw_poll_cq - poll cq for completion (kernel apps)
- * @ibcq: cq to poll
- * @num_entries: number of entries to poll
- * @entry: wr of entry completed
- */
-static int i40iw_poll_cq(struct ib_cq *ibcq,
-			 int num_entries,
-			 struct ib_wc *entry)
-{
-	struct i40iw_cq *iwcq;
-	int cqe_count = 0;
-	struct i40iw_cq_poll_info cq_poll_info;
-	enum i40iw_status_code ret;
-	struct i40iw_cq_uk *ukcq;
-	struct i40iw_sc_qp *qp;
-	struct i40iw_qp *iwqp;
-	unsigned long flags;
-
-	iwcq = (struct i40iw_cq *)ibcq;
-	ukcq = &iwcq->sc_cq.cq_uk;
-
-	spin_lock_irqsave(&iwcq->lock, flags);
-	while (cqe_count < num_entries) {
-		ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info);
-		if (ret == I40IW_ERR_QUEUE_EMPTY) {
-			break;
-		} else if (ret == I40IW_ERR_QUEUE_DESTROYED) {
-			continue;
-		} else if (ret) {
-			if (!cqe_count)
-				cqe_count = -1;
-			break;
-		}
-		entry->wc_flags = 0;
-		entry->wr_id = cq_poll_info.wr_id;
-		if (cq_poll_info.error) {
-			entry->status = IB_WC_WR_FLUSH_ERR;
-			entry->vendor_err = cq_poll_info.major_err << 16 | cq_poll_info.minor_err;
-		} else {
-			entry->status = IB_WC_SUCCESS;
-		}
-
-		switch (cq_poll_info.op_type) {
-		case I40IW_OP_TYPE_RDMA_WRITE:
-			entry->opcode = IB_WC_RDMA_WRITE;
-			break;
-		case I40IW_OP_TYPE_RDMA_READ_INV_STAG:
-		case I40IW_OP_TYPE_RDMA_READ:
-			entry->opcode = IB_WC_RDMA_READ;
-			break;
-		case I40IW_OP_TYPE_SEND_SOL:
-		case I40IW_OP_TYPE_SEND_SOL_INV:
-		case I40IW_OP_TYPE_SEND_INV:
-		case I40IW_OP_TYPE_SEND:
-			entry->opcode = IB_WC_SEND;
-			break;
-		case I40IW_OP_TYPE_REC:
-			entry->opcode = IB_WC_RECV;
-			break;
-		default:
-			entry->opcode = IB_WC_RECV;
-			break;
-		}
-
-		entry->ex.imm_data = 0;
-		qp = (struct i40iw_sc_qp *)cq_poll_info.qp_handle;
-		entry->qp = (struct ib_qp *)qp->back_qp;
-		entry->src_qp = cq_poll_info.qp_id;
-		iwqp = (struct i40iw_qp *)qp->back_qp;
-		if (iwqp->iwarp_state > I40IW_QP_STATE_RTS) {
-			if (!I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring))
-				complete(&iwqp->sq_drained);
-			if (!I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring))
-				complete(&iwqp->rq_drained);
-		}
-		entry->byte_len = cq_poll_info.bytes_xfered;
-		entry++;
-		cqe_count++;
-	}
-	spin_unlock_irqrestore(&iwcq->lock, flags);
-	return cqe_count;
-}
-
-/**
- * i40iw_req_notify_cq - arm cq kernel application
- * @ibcq: cq to arm
- * @notify_flags: notofication flags
- */
-static int i40iw_req_notify_cq(struct ib_cq *ibcq,
-			       enum ib_cq_notify_flags notify_flags)
-{
-	struct i40iw_cq *iwcq;
-	struct i40iw_cq_uk *ukcq;
-	unsigned long flags;
-	enum i40iw_completion_notify cq_notify = IW_CQ_COMPL_EVENT;
-
-	iwcq = (struct i40iw_cq *)ibcq;
-	ukcq = &iwcq->sc_cq.cq_uk;
-	if (notify_flags == IB_CQ_SOLICITED)
-		cq_notify = IW_CQ_COMPL_SOLICITED;
-	spin_lock_irqsave(&iwcq->lock, flags);
-	ukcq->ops.iw_cq_request_notification(ukcq, cq_notify);
-	spin_unlock_irqrestore(&iwcq->lock, flags);
-	return 0;
-}
-
-/**
- * i40iw_port_immutable - return port's immutable data
- * @ibdev: ib dev struct
- * @port_num: port number
- * @immutable: immutable data for the port return
- */
-static int i40iw_port_immutable(struct ib_device *ibdev, u32 port_num,
-				struct ib_port_immutable *immutable)
-{
-	struct ib_port_attr attr;
-	int err;
-
-	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
-
-	err = ib_query_port(ibdev, port_num, &attr);
-
-	if (err)
-		return err;
-
-	immutable->gid_tbl_len = attr.gid_tbl_len;
-
-	return 0;
-}
-
-static const char * const i40iw_hw_stat_names[] = {
-	// 32bit names
-	[I40IW_HW_STAT_INDEX_IP4RXDISCARD] = "ip4InDiscards",
-	[I40IW_HW_STAT_INDEX_IP4RXTRUNC] = "ip4InTruncatedPkts",
-	[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = "ip4OutNoRoutes",
-	[I40IW_HW_STAT_INDEX_IP6RXDISCARD] = "ip6InDiscards",
-	[I40IW_HW_STAT_INDEX_IP6RXTRUNC] = "ip6InTruncatedPkts",
-	[I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = "ip6OutNoRoutes",
-	[I40IW_HW_STAT_INDEX_TCPRTXSEG] = "tcpRetransSegs",
-	[I40IW_HW_STAT_INDEX_TCPRXOPTERR] = "tcpInOptErrors",
-	[I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = "tcpInProtoErrors",
-	// 64bit names
-	[I40IW_HW_STAT_INDEX_IP4RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip4InOctets",
-	[I40IW_HW_STAT_INDEX_IP4RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip4InPkts",
-	[I40IW_HW_STAT_INDEX_IP4RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip4InReasmRqd",
-	[I40IW_HW_STAT_INDEX_IP4RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip4InMcastPkts",
-	[I40IW_HW_STAT_INDEX_IP4TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip4OutOctets",
-	[I40IW_HW_STAT_INDEX_IP4TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip4OutPkts",
-	[I40IW_HW_STAT_INDEX_IP4TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip4OutSegRqd",
-	[I40IW_HW_STAT_INDEX_IP4TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip4OutMcastPkts",
-	[I40IW_HW_STAT_INDEX_IP6RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip6InOctets",
-	[I40IW_HW_STAT_INDEX_IP6RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip6InPkts",
-	[I40IW_HW_STAT_INDEX_IP6RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip6InReasmRqd",
-	[I40IW_HW_STAT_INDEX_IP6RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip6InMcastPkts",
-	[I40IW_HW_STAT_INDEX_IP6TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip6OutOctets",
-	[I40IW_HW_STAT_INDEX_IP6TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip6OutPkts",
-	[I40IW_HW_STAT_INDEX_IP6TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip6OutSegRqd",
-	[I40IW_HW_STAT_INDEX_IP6TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"ip6OutMcastPkts",
-	[I40IW_HW_STAT_INDEX_TCPRXSEGS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"tcpInSegs",
-	[I40IW_HW_STAT_INDEX_TCPTXSEG + I40IW_HW_STAT_INDEX_MAX_32] =
-		"tcpOutSegs",
-	[I40IW_HW_STAT_INDEX_RDMARXRDS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"iwInRdmaReads",
-	[I40IW_HW_STAT_INDEX_RDMARXSNDS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"iwInRdmaSends",
-	[I40IW_HW_STAT_INDEX_RDMARXWRS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"iwInRdmaWrites",
-	[I40IW_HW_STAT_INDEX_RDMATXRDS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"iwOutRdmaReads",
-	[I40IW_HW_STAT_INDEX_RDMATXSNDS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"iwOutRdmaSends",
-	[I40IW_HW_STAT_INDEX_RDMATXWRS + I40IW_HW_STAT_INDEX_MAX_32] =
-		"iwOutRdmaWrites",
-	[I40IW_HW_STAT_INDEX_RDMAVBND + I40IW_HW_STAT_INDEX_MAX_32] =
-		"iwRdmaBnd",
-	[I40IW_HW_STAT_INDEX_RDMAVINV + I40IW_HW_STAT_INDEX_MAX_32] =
-		"iwRdmaInv"
-};
-
-static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str)
-{
-	struct i40iw_device *iwdev = to_iwdev(dev);
-
-	snprintf(str, IB_FW_VERSION_NAME_MAX, "%llu.%llu",
-		 i40iw_fw_major_ver(&iwdev->sc_dev),
-		 i40iw_fw_minor_ver(&iwdev->sc_dev));
-}
-
-/**
- * i40iw_alloc_hw_stats - Allocate a hw stats structure
- * @ibdev: device pointer from stack
- * @port_num: port number
- */
-static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev,
-						  u32 port_num)
-{
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	int num_counters = I40IW_HW_STAT_INDEX_MAX_32 +
-		I40IW_HW_STAT_INDEX_MAX_64;
-	unsigned long lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN;
-
-	BUILD_BUG_ON(ARRAY_SIZE(i40iw_hw_stat_names) !=
-		     (I40IW_HW_STAT_INDEX_MAX_32 +
-		      I40IW_HW_STAT_INDEX_MAX_64));
-
-	/*
-	 * PFs get the default update lifespan, but VFs only update once
-	 * per second
-	 */
-	if (!dev->is_pf)
-		lifespan = 1000;
-	return rdma_alloc_hw_stats_struct(i40iw_hw_stat_names, num_counters,
-					  lifespan);
-}
-
-/**
- * i40iw_get_hw_stats - Populates the rdma_hw_stats structure
- * @ibdev: device pointer from stack
- * @stats: stats pointer from stack
- * @port_num: port number
- * @index: which hw counter the stack is requesting we update
- */
-static int i40iw_get_hw_stats(struct ib_device *ibdev,
-			      struct rdma_hw_stats *stats,
-			      u32 port_num, int index)
-{
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
-	struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-	struct i40iw_vsi_pestat *devstat = iwdev->vsi.pestat;
-	struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats;
-
-	if (dev->is_pf) {
-		i40iw_hw_stats_read_all(devstat, &devstat->hw_stats);
-	} else {
-		if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats))
-			return -ENOSYS;
-	}
-
-	memcpy(&stats->value[0], hw_stats, sizeof(*hw_stats));
-
-	return stats->num_counters;
-}
-
-/**
- * i40iw_query_gid - Query port GID
- * @ibdev: device pointer from stack
- * @port: port number
- * @index: Entry index
- * @gid: Global ID
- */
-static int i40iw_query_gid(struct ib_device *ibdev,
-			   u32 port,
-			   int index,
-			   union ib_gid *gid)
-{
-	struct i40iw_device *iwdev = to_iwdev(ibdev);
-
-	memset(gid->raw, 0, sizeof(gid->raw));
-	ether_addr_copy(gid->raw, iwdev->netdev->dev_addr);
-	return 0;
-}
-
-static const struct ib_device_ops i40iw_dev_ops = {
-	.owner = THIS_MODULE,
-	.driver_id = RDMA_DRIVER_I40IW,
-	/* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */
-	.uverbs_abi_ver = I40IW_ABI_VER,
-
-	.alloc_hw_stats = i40iw_alloc_hw_stats,
-	.alloc_mr = i40iw_alloc_mr,
-	.alloc_pd = i40iw_alloc_pd,
-	.alloc_ucontext = i40iw_alloc_ucontext,
-	.create_cq = i40iw_create_cq,
-	.create_qp = i40iw_create_qp,
-	.dealloc_pd = i40iw_dealloc_pd,
-	.dealloc_ucontext = i40iw_dealloc_ucontext,
-	.dereg_mr = i40iw_dereg_mr,
-	.destroy_cq = i40iw_destroy_cq,
-	.destroy_qp = i40iw_destroy_qp,
-	.drain_rq = i40iw_drain_rq,
-	.drain_sq = i40iw_drain_sq,
-	.get_dev_fw_str = i40iw_get_dev_fw_str,
-	.get_dma_mr = i40iw_get_dma_mr,
-	.get_hw_stats = i40iw_get_hw_stats,
-	.get_port_immutable = i40iw_port_immutable,
-	.iw_accept = i40iw_accept,
-	.iw_add_ref = i40iw_qp_add_ref,
-	.iw_connect = i40iw_connect,
-	.iw_create_listen = i40iw_create_listen,
-	.iw_destroy_listen = i40iw_destroy_listen,
-	.iw_get_qp = i40iw_get_qp,
-	.iw_reject = i40iw_reject,
-	.iw_rem_ref = i40iw_qp_rem_ref,
-	.map_mr_sg = i40iw_map_mr_sg,
-	.mmap = i40iw_mmap,
-	.modify_qp = i40iw_modify_qp,
-	.poll_cq = i40iw_poll_cq,
-	.post_recv = i40iw_post_recv,
-	.post_send = i40iw_post_send,
-	.query_device = i40iw_query_device,
-	.query_gid = i40iw_query_gid,
-	.query_port = i40iw_query_port,
-	.query_qp = i40iw_query_qp,
-	.reg_user_mr = i40iw_reg_user_mr,
-	.req_notify_cq = i40iw_req_notify_cq,
-	INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd),
-	INIT_RDMA_OBJ_SIZE(ib_cq, i40iw_cq, ibcq),
-	INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext),
-};
-
-/**
- * i40iw_init_rdma_device - initialization of iwarp device
- * @iwdev: iwarp device
- */
-static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev)
-{
-	struct i40iw_ib_device *iwibdev;
-	struct net_device *netdev = iwdev->netdev;
-	struct pci_dev *pcidev = iwdev->hw.pcidev;
-
-	iwibdev = ib_alloc_device(i40iw_ib_device, ibdev);
-	if (!iwibdev) {
-		i40iw_pr_err("iwdev == NULL\n");
-		return NULL;
-	}
-	iwdev->iwibdev = iwibdev;
-	iwibdev->iwdev = iwdev;
-
-	iwibdev->ibdev.node_type = RDMA_NODE_RNIC;
-	ether_addr_copy((u8 *)&iwibdev->ibdev.node_guid, netdev->dev_addr);
-
-	iwibdev->ibdev.phys_port_cnt = 1;
-	iwibdev->ibdev.num_comp_vectors = iwdev->ceqs_count;
-	iwibdev->ibdev.dev.parent = &pcidev->dev;
-	memcpy(iwibdev->ibdev.iw_ifname, netdev->name,
-	       sizeof(iwibdev->ibdev.iw_ifname));
-	ib_set_device_ops(&iwibdev->ibdev, &i40iw_dev_ops);
-
-	return iwibdev;
-}
-
-/**
- * i40iw_port_ibevent - indicate port event
- * @iwdev: iwarp device
- */
-void i40iw_port_ibevent(struct i40iw_device *iwdev)
-{
-	struct i40iw_ib_device *iwibdev = iwdev->iwibdev;
-	struct ib_event event;
-
-	event.device = &iwibdev->ibdev;
-	event.element.port_num = 1;
-	event.event = iwdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
-	ib_dispatch_event(&event);
-}
-
-/**
- * i40iw_destroy_rdma_device - destroy rdma device and free resources
- * @iwibdev: IB device ptr
- */
-void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev)
-{
-	ib_unregister_device(&iwibdev->ibdev);
-	wait_event_timeout(iwibdev->iwdev->close_wq,
-			   !atomic64_read(&iwibdev->iwdev->use_count),
-			   I40IW_EVENT_TIMEOUT);
-	ib_dealloc_device(&iwibdev->ibdev);
-}
-
-/**
- * i40iw_register_rdma_device - register iwarp device to IB
- * @iwdev: iwarp device
- */
-int i40iw_register_rdma_device(struct i40iw_device *iwdev)
-{
-	int ret;
-	struct i40iw_ib_device *iwibdev;
-
-	iwdev->iwibdev = i40iw_init_rdma_device(iwdev);
-	if (!iwdev->iwibdev)
-		return -ENOMEM;
-	iwibdev = iwdev->iwibdev;
-	rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
-	ret = ib_device_set_netdev(&iwibdev->ibdev, iwdev->netdev, 1);
-	if (ret)
-		goto error;
-
-	dma_set_max_seg_size(&iwdev->hw.pcidev->dev, UINT_MAX);
-	ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", &iwdev->hw.pcidev->dev);
-	if (ret)
-		goto error;
-
-	return 0;
-error:
-	ib_dealloc_device(&iwdev->iwibdev->ibdev);
-	return ret;
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
deleted file mode 100644
index bab71f3e5637..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_VERBS_H
-#define I40IW_VERBS_H
-
-struct i40iw_ucontext {
-	struct ib_ucontext ibucontext;
-	struct i40iw_device *iwdev;
-	struct list_head cq_reg_mem_list;
-	spinlock_t cq_reg_mem_list_lock; /* memory list for cq's */
-	struct list_head qp_reg_mem_list;
-	spinlock_t qp_reg_mem_list_lock; /* memory list for qp's */
-	int abi_ver;
-};
-
-struct i40iw_pd {
-	struct ib_pd ibpd;
-	struct i40iw_sc_pd sc_pd;
-	atomic_t usecount;
-};
-
-struct i40iw_hmc_pble {
-	union {
-		u32 idx;
-		dma_addr_t addr;
-	};
-};
-
-struct i40iw_cq_mr {
-	struct i40iw_hmc_pble cq_pbl;
-	dma_addr_t shadow;
-};
-
-struct i40iw_qp_mr {
-	struct i40iw_hmc_pble sq_pbl;
-	struct i40iw_hmc_pble rq_pbl;
-	dma_addr_t shadow;
-	struct page *sq_page;
-};
-
-struct i40iw_pbl {
-	struct list_head list;
-	union {
-		struct i40iw_qp_mr qp_mr;
-		struct i40iw_cq_mr cq_mr;
-	};
-
-	bool pbl_allocated;
-	bool on_list;
-	u64 user_base;
-	struct i40iw_pble_alloc pble_alloc;
-	struct i40iw_mr *iwmr;
-};
-
-#define MAX_SAVE_PAGE_ADDRS     4
-struct i40iw_mr {
-	union {
-		struct ib_mr ibmr;
-		struct ib_mw ibmw;
-	};
-	struct ib_umem *region;
-	u16 type;
-	u32 page_cnt;
-	u64 page_size;
-	u32 npages;
-	u32 stag;
-	u64 length;
-	u64 pgaddrmem[MAX_SAVE_PAGE_ADDRS];
-	struct i40iw_pbl iwpbl;
-};
-
-struct i40iw_cq {
-	struct ib_cq ibcq;
-	struct i40iw_sc_cq sc_cq;
-	u16 cq_head;
-	u16 cq_size;
-	u16 cq_number;
-	bool user_mode;
-	u32 polled_completions;
-	u32 cq_mem_size;
-	struct i40iw_dma_mem kmem;
-	spinlock_t lock; /* for poll cq */
-	struct i40iw_pbl *iwpbl;
-};
-
-struct disconn_work {
-	struct work_struct work;
-	struct i40iw_qp *iwqp;
-};
-
-struct iw_cm_id;
-struct ietf_mpa_frame;
-struct i40iw_ud_file;
-
-struct i40iw_qp_kmode {
-	struct i40iw_dma_mem dma_mem;
-	u64 *wrid_mem;
-};
-
-struct i40iw_qp {
-	struct ib_qp ibqp;
-	struct i40iw_sc_qp sc_qp;
-	struct i40iw_device *iwdev;
-	struct i40iw_cq *iwscq;
-	struct i40iw_cq *iwrcq;
-	struct i40iw_pd *iwpd;
-	struct i40iw_qp_host_ctx_info ctx_info;
-	struct i40iwarp_offload_info iwarp_info;
-	void *allocated_buffer;
-	refcount_t refcount;
-	struct iw_cm_id *cm_id;
-	void *cm_node;
-	struct ib_mr *lsmm_mr;
-	struct work_struct work;
-	enum ib_qp_state ibqp_state;
-	u32 iwarp_state;
-	u32 qp_mem_size;
-	u32 last_aeq;
-	atomic_t close_timer_started;
-	spinlock_t lock; /* for post work requests */
-	struct i40iw_qp_context *iwqp_context;
-	void *pbl_vbase;
-	dma_addr_t pbl_pbase;
-	struct page *page;
-	u8 active_conn:1;
-	u8 user_mode:1;
-	u8 hte_added:1;
-	u8 flush_issued:1;
-	u8 destroyed:1;
-	u8 sig_all:1;
-	u8 pau_mode:1;
-	u8 rsvd:1;
-	u16 term_sq_flush_code;
-	u16 term_rq_flush_code;
-	u8 hw_iwarp_state;
-	u8 hw_tcp_state;
-	struct i40iw_qp_kmode kqp;
-	struct i40iw_dma_mem host_ctx;
-	struct timer_list terminate_timer;
-	struct i40iw_pbl iwpbl;
-	struct i40iw_dma_mem q2_ctx_mem;
-	struct i40iw_dma_mem ietf_mem;
-	struct completion sq_drained;
-	struct completion rq_drained;
-	struct completion free_qp;
-};
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.c b/drivers/infiniband/hw/i40iw/i40iw_vf.c
deleted file mode 100644
index e33d4810965c..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_vf.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include "i40iw_osdep.h"
-#include "i40iw_register.h"
-#include "i40iw_status.h"
-#include "i40iw_hmc.h"
-#include "i40iw_d.h"
-#include "i40iw_type.h"
-#include "i40iw_p.h"
-#include "i40iw_vf.h"
-
-/**
- * i40iw_manage_vf_pble_bp - manage vf pble
- * @cqp: cqp for cqp' sq wqe
- * @info: pble info
- * @scratch: pointer for completion
- * @post_sq: to post and ring
- */
-enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp,
-					       struct i40iw_manage_vf_pble_info *info,
-					       u64 scratch,
-					       bool post_sq)
-{
-	u64 *wqe;
-	u64 temp, header, pd_pl_pba = 0;
-
-	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
-	if (!wqe)
-		return I40IW_ERR_RING_FULL;
-
-	temp = LS_64(info->pd_entry_cnt, I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT) |
-	    LS_64(info->first_pd_index, I40IW_CQPSQ_MVPBP_FIRST_PD_INX) |
-	    LS_64(info->sd_index, I40IW_CQPSQ_MVPBP_SD_INX);
-	set_64bit_val(wqe, 16, temp);
-
-	header = LS_64((info->inv_pd_ent ? 1 : 0), I40IW_CQPSQ_MVPBP_INV_PD_ENT) |
-	    LS_64(I40IW_CQP_OP_MANAGE_VF_PBLE_BP, I40IW_CQPSQ_OPCODE) |
-	    LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-	set_64bit_val(wqe, 24, header);
-
-	pd_pl_pba = LS_64(info->pd_pl_pba >> 3, I40IW_CQPSQ_MVPBP_PD_PLPBA);
-	set_64bit_val(wqe, 32, pd_pl_pba);
-
-	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE VF_PBLE_BP WQE", wqe, I40IW_CQP_WQE_SIZE * 8);
-
-	if (post_sq)
-		i40iw_sc_cqp_post_sq(cqp);
-	return 0;
-}
-
-const struct i40iw_vf_cqp_ops iw_vf_cqp_ops = {
-	i40iw_manage_vf_pble_bp
-};
diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.h b/drivers/infiniband/hw/i40iw/i40iw_vf.h
deleted file mode 100644
index 4359559ece9c..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_vf.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_VF_H
-#define I40IW_VF_H
-
-struct i40iw_sc_cqp;
-
-struct i40iw_manage_vf_pble_info {
-	u32 sd_index;
-	u16 first_pd_index;
-	u16 pd_entry_cnt;
-	u8 inv_pd_ent;
-	u64 pd_pl_pba;
-};
-
-struct i40iw_vf_cqp_ops {
-	enum i40iw_status_code (*manage_vf_pble_bp)(struct i40iw_sc_cqp *,
-						    struct i40iw_manage_vf_pble_info *,
-						    u64,
-						    bool);
-};
-
-enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp,
-					       struct i40iw_manage_vf_pble_info *info,
-					       u64 scratch,
-					       bool post_sq);
-
-extern const struct i40iw_vf_cqp_ops iw_vf_cqp_ops;
-
-#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
deleted file mode 100644
index e34a1522132c..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
+++ /dev/null
@@ -1,759 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#include "i40iw_osdep.h"
-#include "i40iw_register.h"
-#include "i40iw_status.h"
-#include "i40iw_hmc.h"
-#include "i40iw_d.h"
-#include "i40iw_type.h"
-#include "i40iw_p.h"
-#include "i40iw_virtchnl.h"
-
-/**
- * vchnl_vf_send_get_ver_req - Request Channel version
- * @dev: IWARP device pointer
- * @vchnl_req: Virtual channel message request pointer
- */
-static enum i40iw_status_code vchnl_vf_send_get_ver_req(struct i40iw_sc_dev *dev,
-							struct i40iw_virtchnl_req *vchnl_req)
-{
-	enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY;
-	struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg;
-
-	if (!dev->vchnl_up)
-		return ret_code;
-
-	memset(vchnl_msg, 0, sizeof(*vchnl_msg));
-	vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req;
-	vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg);
-	vchnl_msg->iw_op_code = I40IW_VCHNL_OP_GET_VER;
-	vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_GET_VER_V0;
-	ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len);
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-	return ret_code;
-}
-
-/**
- * vchnl_vf_send_get_hmc_fcn_req - Request HMC Function from VF
- * @dev: IWARP device pointer
- * @vchnl_req: Virtual channel message request pointer
- */
-static enum i40iw_status_code vchnl_vf_send_get_hmc_fcn_req(struct i40iw_sc_dev *dev,
-							    struct i40iw_virtchnl_req *vchnl_req)
-{
-	enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY;
-	struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg;
-
-	if (!dev->vchnl_up)
-		return ret_code;
-
-	memset(vchnl_msg, 0, sizeof(*vchnl_msg));
-	vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req;
-	vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg);
-	vchnl_msg->iw_op_code = I40IW_VCHNL_OP_GET_HMC_FCN;
-	vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_GET_HMC_FCN_V0;
-	ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len);
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-	return ret_code;
-}
-
-/**
- * vchnl_vf_send_get_pe_stats_req - Request PE stats from VF
- * @dev: IWARP device pointer
- * @vchnl_req: Virtual channel message request pointer
- */
-static enum i40iw_status_code vchnl_vf_send_get_pe_stats_req(struct i40iw_sc_dev *dev,
-							     struct i40iw_virtchnl_req  *vchnl_req)
-{
-	enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY;
-	struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg;
-
-	if (!dev->vchnl_up)
-		return ret_code;
-
-	memset(vchnl_msg, 0, sizeof(*vchnl_msg));
-	vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req;
-	vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg) + sizeof(struct i40iw_dev_hw_stats) - 1;
-	vchnl_msg->iw_op_code = I40IW_VCHNL_OP_GET_STATS;
-	vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_GET_STATS_V0;
-	ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len);
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-	return ret_code;
-}
-
-/*
- * vchnl_vf_send_add_hmc_objs_req - Add HMC objects
- * @dev: IWARP device pointer
- * @vchnl_req: Virtual channel message request pointer
- */
-static enum i40iw_status_code vchnl_vf_send_add_hmc_objs_req(struct i40iw_sc_dev *dev,
-							     struct i40iw_virtchnl_req *vchnl_req,
-							     enum i40iw_hmc_rsrc_type rsrc_type,
-							     u32 start_index,
-							     u32 rsrc_count)
-{
-	enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY;
-	struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg;
-	struct i40iw_virtchnl_hmc_obj_range *add_hmc_obj;
-
-	if (!dev->vchnl_up)
-		return ret_code;
-
-	add_hmc_obj = (struct i40iw_virtchnl_hmc_obj_range *)vchnl_msg->iw_chnl_buf;
-	memset(vchnl_msg, 0, sizeof(*vchnl_msg));
-	memset(add_hmc_obj, 0, sizeof(*add_hmc_obj));
-	vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req;
-	vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg) + sizeof(struct i40iw_virtchnl_hmc_obj_range) - 1;
-	vchnl_msg->iw_op_code = I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE;
-	vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE_V0;
-	add_hmc_obj->obj_type = (u16)rsrc_type;
-	add_hmc_obj->start_index = start_index;
-	add_hmc_obj->obj_count = rsrc_count;
-	ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len);
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-	return ret_code;
-}
-
-/**
- * vchnl_vf_send_del_hmc_objs_req - del HMC objects
- * @dev: IWARP device pointer
- * @vchnl_req: Virtual channel message request pointer
- * @rsrc_type: resource type to delete
- * @start_index: starting index for resource
- * @rsrc_count: number of resource type to delete
- */
-static enum i40iw_status_code vchnl_vf_send_del_hmc_objs_req(struct i40iw_sc_dev *dev,
-							     struct i40iw_virtchnl_req *vchnl_req,
-							     enum i40iw_hmc_rsrc_type rsrc_type,
-							     u32 start_index,
-							     u32 rsrc_count)
-{
-	enum i40iw_status_code ret_code = I40IW_ERR_NOT_READY;
-	struct i40iw_virtchnl_op_buf *vchnl_msg = vchnl_req->vchnl_msg;
-	struct i40iw_virtchnl_hmc_obj_range *add_hmc_obj;
-
-	if (!dev->vchnl_up)
-		return ret_code;
-
-	add_hmc_obj = (struct i40iw_virtchnl_hmc_obj_range *)vchnl_msg->iw_chnl_buf;
-	memset(vchnl_msg, 0, sizeof(*vchnl_msg));
-	memset(add_hmc_obj, 0, sizeof(*add_hmc_obj));
-	vchnl_msg->iw_chnl_op_ctx = (uintptr_t)vchnl_req;
-	vchnl_msg->iw_chnl_buf_len = sizeof(*vchnl_msg) + sizeof(struct i40iw_virtchnl_hmc_obj_range) - 1;
-	vchnl_msg->iw_op_code = I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE;
-	vchnl_msg->iw_op_ver = I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE_V0;
-	add_hmc_obj->obj_type = (u16)rsrc_type;
-	add_hmc_obj->start_index = start_index;
-	add_hmc_obj->obj_count = rsrc_count;
-	ret_code = dev->vchnl_if.vchnl_send(dev, 0, (u8 *)vchnl_msg, vchnl_msg->iw_chnl_buf_len);
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-	return ret_code;
-}
-
-/**
- * vchnl_pf_send_get_ver_resp - Send channel version to VF
- * @dev: IWARP device pointer
- * @vf_id: Virtual function ID associated with the message
- * @vchnl_msg: Virtual channel message buffer pointer
- */
-static void vchnl_pf_send_get_ver_resp(struct i40iw_sc_dev *dev,
-				       u32 vf_id,
-				       struct i40iw_virtchnl_op_buf *vchnl_msg)
-{
-	enum i40iw_status_code ret_code;
-	u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(u32) - 1];
-	struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)resp_buffer;
-
-	memset(resp_buffer, 0, sizeof(*resp_buffer));
-	vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx;
-	vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer);
-	vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS;
-	*((u32 *)vchnl_msg_resp->iw_chnl_buf) = I40IW_VCHNL_CHNL_VER_V0;
-	ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer));
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-}
-
-/**
- * vchnl_pf_send_get_hmc_fcn_resp - Send HMC Function to VF
- * @dev: IWARP device pointer
- * @vf_id: Virtual function ID associated with the message
- * @vchnl_msg: Virtual channel message buffer pointer
- * @hmc_fcn: HMC function index pointer
- */
-static void vchnl_pf_send_get_hmc_fcn_resp(struct i40iw_sc_dev *dev,
-					   u32 vf_id,
-					   struct i40iw_virtchnl_op_buf *vchnl_msg,
-					   u16 hmc_fcn)
-{
-	enum i40iw_status_code ret_code;
-	u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(u16) - 1];
-	struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)resp_buffer;
-
-	memset(resp_buffer, 0, sizeof(*resp_buffer));
-	vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx;
-	vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer);
-	vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS;
-	*((u16 *)vchnl_msg_resp->iw_chnl_buf) = hmc_fcn;
-	ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer));
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-}
-
-/**
- * vchnl_pf_send_get_pe_stats_resp - Send PE Stats to VF
- * @dev: IWARP device pointer
- * @vf_id: Virtual function ID associated with the message
- * @vchnl_msg: Virtual channel message buffer pointer
- * @hw_stats: HW Stats struct
- */
-
-static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev,
-					    u32 vf_id,
-					    struct i40iw_virtchnl_op_buf *vchnl_msg,
-					    struct i40iw_dev_hw_stats *hw_stats)
-{
-	enum i40iw_status_code ret_code;
-	u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(struct i40iw_dev_hw_stats) - 1];
-	struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)resp_buffer;
-
-	memset(resp_buffer, 0, sizeof(*resp_buffer));
-	vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx;
-	vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer);
-	vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS;
-	*((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = *hw_stats;
-	ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer));
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-}
-
-/**
- * vchnl_pf_send_error_resp - Send an error response to VF
- * @dev: IWARP device pointer
- * @vf_id: Virtual function ID associated with the message
- * @vchnl_msg: Virtual channel message buffer pointer
- * @op_ret_code: I40IW_ERR_* status code
- */
-static void vchnl_pf_send_error_resp(struct i40iw_sc_dev *dev, u32 vf_id,
-				     struct i40iw_virtchnl_op_buf *vchnl_msg,
-				     u16 op_ret_code)
-{
-	enum i40iw_status_code ret_code;
-	u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf)];
-	struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)resp_buffer;
-
-	memset(resp_buffer, 0, sizeof(resp_buffer));
-	vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx;
-	vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer);
-	vchnl_msg_resp->iw_op_ret_code = (u16)op_ret_code;
-	ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer));
-	if (ret_code)
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: virt channel send failed 0x%x\n", __func__, ret_code);
-}
-
-/**
- * pf_cqp_get_hmc_fcn_callback - Callback for Get HMC Fcn
- * @dev: IWARP device pointer
- * @callback_param: unused CQP callback parameter
- * @cqe_info: CQE information pointer
- */
-static void pf_cqp_get_hmc_fcn_callback(struct i40iw_sc_dev *dev, void *callback_param,
-					struct i40iw_ccq_cqe_info *cqe_info)
-{
-	struct i40iw_vfdev *vf_dev = callback_param;
-	struct i40iw_virt_mem vf_dev_mem;
-
-	if (cqe_info->error) {
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "CQP Completion Error on Get HMC Function.  Maj = 0x%04x, Minor = 0x%04x\n",
-			    cqe_info->maj_err_code, cqe_info->min_err_code);
-		dev->vf_dev[vf_dev->iw_vf_idx] = NULL;
-		vchnl_pf_send_error_resp(dev, vf_dev->vf_id, &vf_dev->vf_msg_buffer.vchnl_msg,
-					 (u16)I40IW_ERR_CQP_COMPL_ERROR);
-		vf_dev_mem.va = vf_dev;
-		vf_dev_mem.size = sizeof(*vf_dev);
-		i40iw_free_virt_mem(dev->hw, &vf_dev_mem);
-	} else {
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "CQP Completion Operation Return information = 0x%08x\n",
-			    cqe_info->op_ret_val);
-		vf_dev->pmf_index = (u16)cqe_info->op_ret_val;
-		vf_dev->msg_count--;
-		vchnl_pf_send_get_hmc_fcn_resp(dev,
-					       vf_dev->vf_id,
-					       &vf_dev->vf_msg_buffer.vchnl_msg,
-					       vf_dev->pmf_index);
-	}
-}
-
-/**
- * pf_add_hmc_obj_callback - Callback for Add HMC Object
- * @work_vf_dev: pointer to the VF Device
- */
-static void pf_add_hmc_obj_callback(void *work_vf_dev)
-{
-	struct i40iw_vfdev *vf_dev = (struct i40iw_vfdev *)work_vf_dev;
-	struct i40iw_hmc_info *hmc_info = &vf_dev->hmc_info;
-	struct i40iw_virtchnl_op_buf *vchnl_msg = &vf_dev->vf_msg_buffer.vchnl_msg;
-	struct i40iw_hmc_create_obj_info info;
-	struct i40iw_virtchnl_hmc_obj_range *add_hmc_obj;
-	enum i40iw_status_code ret_code;
-
-	if (!vf_dev->pf_hmc_initialized) {
-		ret_code = i40iw_pf_init_vfhmc(vf_dev->pf_dev, (u8)vf_dev->pmf_index, NULL);
-		if (ret_code)
-			goto add_out;
-		vf_dev->pf_hmc_initialized = true;
-	}
-
-	add_hmc_obj = (struct i40iw_virtchnl_hmc_obj_range *)vchnl_msg->iw_chnl_buf;
-
-	memset(&info, 0, sizeof(info));
-	info.hmc_info = hmc_info;
-	info.is_pf = false;
-	info.rsrc_type = (u32)add_hmc_obj->obj_type;
-	info.entry_type = (info.rsrc_type == I40IW_HMC_IW_PBLE) ? I40IW_SD_TYPE_PAGED : I40IW_SD_TYPE_DIRECT;
-	info.start_idx = add_hmc_obj->start_index;
-	info.count = add_hmc_obj->obj_count;
-	i40iw_debug(vf_dev->pf_dev, I40IW_DEBUG_VIRT,
-		    "I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE.  Add %u type %u objects\n",
-		    info.count, info.rsrc_type);
-	ret_code = i40iw_sc_create_hmc_obj(vf_dev->pf_dev, &info);
-	if (!ret_code)
-		vf_dev->hmc_info.hmc_obj[add_hmc_obj->obj_type].cnt = add_hmc_obj->obj_count;
-add_out:
-	vf_dev->msg_count--;
-	vchnl_pf_send_error_resp(vf_dev->pf_dev, vf_dev->vf_id, vchnl_msg, (u16)ret_code);
-}
-
-/**
- * pf_del_hmc_obj_callback - Callback for delete HMC Object
- * @work_vf_dev: pointer to the VF Device
- */
-static void pf_del_hmc_obj_callback(void *work_vf_dev)
-{
-	struct i40iw_vfdev *vf_dev = (struct i40iw_vfdev *)work_vf_dev;
-	struct i40iw_hmc_info *hmc_info = &vf_dev->hmc_info;
-	struct i40iw_virtchnl_op_buf *vchnl_msg = &vf_dev->vf_msg_buffer.vchnl_msg;
-	struct i40iw_hmc_del_obj_info info;
-	struct i40iw_virtchnl_hmc_obj_range *del_hmc_obj;
-	enum i40iw_status_code ret_code = I40IW_SUCCESS;
-
-	if (!vf_dev->pf_hmc_initialized)
-		goto del_out;
-
-	del_hmc_obj = (struct i40iw_virtchnl_hmc_obj_range *)vchnl_msg->iw_chnl_buf;
-
-	memset(&info, 0, sizeof(info));
-	info.hmc_info = hmc_info;
-	info.is_pf = false;
-	info.rsrc_type = (u32)del_hmc_obj->obj_type;
-	info.start_idx = del_hmc_obj->start_index;
-	info.count = del_hmc_obj->obj_count;
-	i40iw_debug(vf_dev->pf_dev, I40IW_DEBUG_VIRT,
-		    "I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE.  Delete %u type %u objects\n",
-		    info.count, info.rsrc_type);
-	ret_code = i40iw_sc_del_hmc_obj(vf_dev->pf_dev, &info, false);
-del_out:
-	vf_dev->msg_count--;
-	vchnl_pf_send_error_resp(vf_dev->pf_dev, vf_dev->vf_id, vchnl_msg, (u16)ret_code);
-}
-
-/**
- * i40iw_vf_init_pestat - Initialize stats for VF
- * @dev: pointer to the VF Device
- * @stats: Statistics structure pointer
- * @index: Stats index
- */
-static void i40iw_vf_init_pestat(struct i40iw_sc_dev *dev, struct i40iw_vsi_pestat *stats, u16 index)
-{
-	stats->hw = dev->hw;
-	i40iw_hw_stats_init(stats, (u8)index, false);
-	spin_lock_init(&stats->lock);
-}
-
-/**
- * i40iw_vchnl_recv_pf - Receive PF virtual channel messages
- * @dev: IWARP device pointer
- * @vf_id: Virtual function ID associated with the message
- * @msg: Virtual channel message buffer pointer
- * @len: Length of the virtual channels message
- */
-enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev,
-					   u32 vf_id,
-					   u8 *msg,
-					   u16 len)
-{
-	struct i40iw_virtchnl_op_buf *vchnl_msg = (struct i40iw_virtchnl_op_buf *)msg;
-	struct i40iw_vfdev *vf_dev = NULL;
-	struct i40iw_hmc_fcn_info hmc_fcn_info;
-	u16 iw_vf_idx;
-	u16 first_avail_iw_vf = I40IW_MAX_PE_ENABLED_VF_COUNT;
-	struct i40iw_virt_mem vf_dev_mem;
-	struct i40iw_virtchnl_work_info work_info;
-	struct i40iw_vsi_pestat *stats;
-	enum i40iw_status_code ret_code;
-
-	if (!dev || !msg || !len)
-		return I40IW_ERR_PARAM;
-
-	if (!dev->vchnl_up)
-		return I40IW_ERR_NOT_READY;
-	if (vchnl_msg->iw_op_code == I40IW_VCHNL_OP_GET_VER) {
-		vchnl_pf_send_get_ver_resp(dev, vf_id, vchnl_msg);
-		return I40IW_SUCCESS;
-	}
-	for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; iw_vf_idx++) {
-		if (!dev->vf_dev[iw_vf_idx]) {
-			if (first_avail_iw_vf == I40IW_MAX_PE_ENABLED_VF_COUNT)
-				first_avail_iw_vf = iw_vf_idx;
-			continue;
-		}
-		if (dev->vf_dev[iw_vf_idx]->vf_id == vf_id) {
-			vf_dev = dev->vf_dev[iw_vf_idx];
-			break;
-		}
-	}
-	if (vf_dev) {
-		if (!vf_dev->msg_count) {
-			vf_dev->msg_count++;
-		} else {
-			i40iw_debug(dev, I40IW_DEBUG_VIRT,
-				    "VF%u already has a channel message in progress.\n",
-				    vf_id);
-			return I40IW_SUCCESS;
-		}
-	}
-	switch (vchnl_msg->iw_op_code) {
-	case I40IW_VCHNL_OP_GET_HMC_FCN:
-		if (!vf_dev &&
-		    (first_avail_iw_vf != I40IW_MAX_PE_ENABLED_VF_COUNT)) {
-			ret_code = i40iw_allocate_virt_mem(dev->hw, &vf_dev_mem, sizeof(struct i40iw_vfdev) +
-							   (sizeof(struct i40iw_hmc_obj_info) * I40IW_HMC_IW_MAX));
-			if (!ret_code) {
-				vf_dev = vf_dev_mem.va;
-				vf_dev->stats_initialized = false;
-				vf_dev->pf_dev = dev;
-				vf_dev->msg_count = 1;
-				vf_dev->vf_id = vf_id;
-				vf_dev->iw_vf_idx = first_avail_iw_vf;
-				vf_dev->pf_hmc_initialized = false;
-				vf_dev->hmc_info.hmc_obj = (struct i40iw_hmc_obj_info *)(&vf_dev[1]);
-				i40iw_debug(dev, I40IW_DEBUG_VIRT,
-					    "vf_dev %p, hmc_info %p, hmc_obj %p\n",
-					    vf_dev, &vf_dev->hmc_info, vf_dev->hmc_info.hmc_obj);
-				dev->vf_dev[first_avail_iw_vf] = vf_dev;
-				iw_vf_idx = first_avail_iw_vf;
-			} else {
-				i40iw_debug(dev, I40IW_DEBUG_VIRT,
-					    "VF%u Unable to allocate a VF device structure.\n",
-					    vf_id);
-				vchnl_pf_send_error_resp(dev, vf_id, vchnl_msg, (u16)I40IW_ERR_NO_MEMORY);
-				return I40IW_SUCCESS;
-			}
-			memcpy(&vf_dev->vf_msg_buffer.vchnl_msg, vchnl_msg, len);
-			hmc_fcn_info.callback_fcn = pf_cqp_get_hmc_fcn_callback;
-			hmc_fcn_info.vf_id = vf_id;
-			hmc_fcn_info.iw_vf_idx = vf_dev->iw_vf_idx;
-			hmc_fcn_info.cqp_callback_param = vf_dev;
-			hmc_fcn_info.free_fcn = false;
-			ret_code = i40iw_cqp_manage_hmc_fcn_cmd(dev, &hmc_fcn_info);
-			if (ret_code)
-				i40iw_debug(dev, I40IW_DEBUG_VIRT,
-					    "VF%u error CQP HMC Function operation.\n",
-					    vf_id);
-			i40iw_vf_init_pestat(dev, &vf_dev->pestat, vf_dev->pmf_index);
-			vf_dev->stats_initialized = true;
-		} else {
-			if (vf_dev) {
-				vf_dev->msg_count--;
-				vchnl_pf_send_get_hmc_fcn_resp(dev, vf_id, vchnl_msg, vf_dev->pmf_index);
-			} else {
-				vchnl_pf_send_error_resp(dev, vf_id, vchnl_msg,
-							 (u16)I40IW_ERR_NO_MEMORY);
-			}
-		}
-		break;
-	case I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE:
-		if (!vf_dev)
-			return I40IW_ERR_BAD_PTR;
-		work_info.worker_vf_dev = vf_dev;
-		work_info.callback_fcn = pf_add_hmc_obj_callback;
-		memcpy(&vf_dev->vf_msg_buffer.vchnl_msg, vchnl_msg, len);
-		i40iw_cqp_spawn_worker(dev, &work_info, vf_dev->iw_vf_idx);
-		break;
-	case I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE:
-		if (!vf_dev)
-			return I40IW_ERR_BAD_PTR;
-		work_info.worker_vf_dev = vf_dev;
-		work_info.callback_fcn = pf_del_hmc_obj_callback;
-		memcpy(&vf_dev->vf_msg_buffer.vchnl_msg, vchnl_msg, len);
-		i40iw_cqp_spawn_worker(dev, &work_info, vf_dev->iw_vf_idx);
-		break;
-	case I40IW_VCHNL_OP_GET_STATS:
-		if (!vf_dev)
-			return I40IW_ERR_BAD_PTR;
-		stats = &vf_dev->pestat;
-		i40iw_hw_stats_read_all(stats, &stats->hw_stats);
-		vf_dev->msg_count--;
-		vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, &stats->hw_stats);
-		break;
-	default:
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "40iw_vchnl_recv_pf: Invalid OpCode 0x%x\n",
-			    vchnl_msg->iw_op_code);
-		vchnl_pf_send_error_resp(dev, vf_id,
-					 vchnl_msg, (u16)I40IW_ERR_NOT_IMPLEMENTED);
-	}
-	return I40IW_SUCCESS;
-}
-
-/**
- * i40iw_vchnl_recv_vf - Receive VF virtual channel messages
- * @dev: IWARP device pointer
- * @vf_id: Virtual function ID associated with the message
- * @msg: Virtual channel message buffer pointer
- * @len: Length of the virtual channels message
- */
-enum i40iw_status_code i40iw_vchnl_recv_vf(struct i40iw_sc_dev *dev,
-					   u32 vf_id,
-					   u8 *msg,
-					   u16 len)
-{
-	struct i40iw_virtchnl_resp_buf *vchnl_msg_resp = (struct i40iw_virtchnl_resp_buf *)msg;
-	struct i40iw_virtchnl_req *vchnl_req;
-
-	vchnl_req = (struct i40iw_virtchnl_req *)(uintptr_t)vchnl_msg_resp->iw_chnl_op_ctx;
-	vchnl_req->ret_code = (enum i40iw_status_code)vchnl_msg_resp->iw_op_ret_code;
-	if (len == (sizeof(*vchnl_msg_resp) + vchnl_req->parm_len - 1)) {
-		if (vchnl_req->parm_len && vchnl_req->parm)
-			memcpy(vchnl_req->parm, vchnl_msg_resp->iw_chnl_buf, vchnl_req->parm_len);
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: Got response, data size %u\n", __func__,
-			    vchnl_req->parm_len);
-	} else {
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s: error length on response, Got %u, expected %u\n", __func__,
-			    len, (u32)(sizeof(*vchnl_msg_resp) + vchnl_req->parm_len - 1));
-	}
-
-	return I40IW_SUCCESS;
-}
-
-/**
- * i40iw_vchnl_vf_get_ver - Request Channel version
- * @dev: IWARP device pointer
- * @vchnl_ver: Virtual channel message version pointer
- */
-enum i40iw_status_code i40iw_vchnl_vf_get_ver(struct i40iw_sc_dev *dev,
-					      u32 *vchnl_ver)
-{
-	struct i40iw_virtchnl_req vchnl_req;
-	enum i40iw_status_code ret_code;
-
-	if (!i40iw_vf_clear_to_send(dev))
-		return I40IW_ERR_TIMEOUT;
-	memset(&vchnl_req, 0, sizeof(vchnl_req));
-	vchnl_req.dev = dev;
-	vchnl_req.parm = vchnl_ver;
-	vchnl_req.parm_len = sizeof(*vchnl_ver);
-	vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
-
-	ret_code = vchnl_vf_send_get_ver_req(dev, &vchnl_req);
-	if (ret_code) {
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s Send message failed 0x%0x\n", __func__, ret_code);
-		return ret_code;
-	}
-	ret_code = i40iw_vf_wait_vchnl_resp(dev);
-	if (ret_code)
-		return ret_code;
-	else
-		return vchnl_req.ret_code;
-}
-
-/**
- * i40iw_vchnl_vf_get_hmc_fcn - Request HMC Function
- * @dev: IWARP device pointer
- * @hmc_fcn: HMC function index pointer
- */
-enum i40iw_status_code i40iw_vchnl_vf_get_hmc_fcn(struct i40iw_sc_dev *dev,
-						  u16 *hmc_fcn)
-{
-	struct i40iw_virtchnl_req vchnl_req;
-	enum i40iw_status_code ret_code;
-
-	if (!i40iw_vf_clear_to_send(dev))
-		return I40IW_ERR_TIMEOUT;
-	memset(&vchnl_req, 0, sizeof(vchnl_req));
-	vchnl_req.dev = dev;
-	vchnl_req.parm = hmc_fcn;
-	vchnl_req.parm_len = sizeof(*hmc_fcn);
-	vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
-
-	ret_code = vchnl_vf_send_get_hmc_fcn_req(dev, &vchnl_req);
-	if (ret_code) {
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s Send message failed 0x%0x\n", __func__, ret_code);
-		return ret_code;
-	}
-	ret_code = i40iw_vf_wait_vchnl_resp(dev);
-	if (ret_code)
-		return ret_code;
-	else
-		return vchnl_req.ret_code;
-}
-
-/**
- * i40iw_vchnl_vf_add_hmc_objs - Add HMC Object
- * @dev: IWARP device pointer
- * @rsrc_type: HMC Resource type
- * @start_index: Starting index of the objects to be added
- * @rsrc_count: Number of resources to be added
- */
-enum i40iw_status_code i40iw_vchnl_vf_add_hmc_objs(struct i40iw_sc_dev *dev,
-						   enum i40iw_hmc_rsrc_type rsrc_type,
-						   u32 start_index,
-						   u32 rsrc_count)
-{
-	struct i40iw_virtchnl_req vchnl_req;
-	enum i40iw_status_code ret_code;
-
-	if (!i40iw_vf_clear_to_send(dev))
-		return I40IW_ERR_TIMEOUT;
-	memset(&vchnl_req, 0, sizeof(vchnl_req));
-	vchnl_req.dev = dev;
-	vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
-
-	ret_code = vchnl_vf_send_add_hmc_objs_req(dev,
-						  &vchnl_req,
-						  rsrc_type,
-						  start_index,
-						  rsrc_count);
-	if (ret_code) {
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s Send message failed 0x%0x\n", __func__, ret_code);
-		return ret_code;
-	}
-	ret_code = i40iw_vf_wait_vchnl_resp(dev);
-	if (ret_code)
-		return ret_code;
-	else
-		return vchnl_req.ret_code;
-}
-
-/**
- * i40iw_vchnl_vf_del_hmc_obj - del HMC obj
- * @dev: IWARP device pointer
- * @rsrc_type: HMC Resource type
- * @start_index: Starting index of the object to delete
- * @rsrc_count: Number of resources to be delete
- */
-enum i40iw_status_code i40iw_vchnl_vf_del_hmc_obj(struct i40iw_sc_dev *dev,
-						  enum i40iw_hmc_rsrc_type rsrc_type,
-						  u32 start_index,
-						  u32 rsrc_count)
-{
-	struct i40iw_virtchnl_req vchnl_req;
-	enum i40iw_status_code ret_code;
-
-	if (!i40iw_vf_clear_to_send(dev))
-		return I40IW_ERR_TIMEOUT;
-	memset(&vchnl_req, 0, sizeof(vchnl_req));
-	vchnl_req.dev = dev;
-	vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
-
-	ret_code = vchnl_vf_send_del_hmc_objs_req(dev,
-						  &vchnl_req,
-						  rsrc_type,
-						  start_index,
-						  rsrc_count);
-	if (ret_code) {
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s Send message failed 0x%0x\n", __func__, ret_code);
-		return ret_code;
-	}
-	ret_code = i40iw_vf_wait_vchnl_resp(dev);
-	if (ret_code)
-		return ret_code;
-	else
-		return vchnl_req.ret_code;
-}
-
-/**
- * i40iw_vchnl_vf_get_pe_stats - Get PE stats
- * @dev: IWARP device pointer
- * @hw_stats: HW stats struct
- */
-enum i40iw_status_code i40iw_vchnl_vf_get_pe_stats(struct i40iw_sc_dev *dev,
-						   struct i40iw_dev_hw_stats *hw_stats)
-{
-	struct i40iw_virtchnl_req  vchnl_req;
-	enum i40iw_status_code ret_code;
-
-	if (!i40iw_vf_clear_to_send(dev))
-		return I40IW_ERR_TIMEOUT;
-	memset(&vchnl_req, 0, sizeof(vchnl_req));
-	vchnl_req.dev = dev;
-	vchnl_req.parm = hw_stats;
-	vchnl_req.parm_len = sizeof(*hw_stats);
-	vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
-
-	ret_code = vchnl_vf_send_get_pe_stats_req(dev, &vchnl_req);
-	if (ret_code) {
-		i40iw_debug(dev, I40IW_DEBUG_VIRT,
-			    "%s Send message failed 0x%0x\n", __func__, ret_code);
-		return ret_code;
-	}
-	ret_code = i40iw_vf_wait_vchnl_resp(dev);
-	if (ret_code)
-		return ret_code;
-	else
-		return vchnl_req.ret_code;
-}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.h b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.h
deleted file mode 100644
index 24886ef08293..000000000000
--- a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*******************************************************************************
-*
-* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenFabrics.org BSD license below:
-*
-*   Redistribution and use in source and binary forms, with or
-*   without modification, are permitted provided that the following
-*   conditions are met:
-*
-*    - Redistributions of source code must retain the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer.
-*
-*    - Redistributions in binary form must reproduce the above
-*	copyright notice, this list of conditions and the following
-*	disclaimer in the documentation and/or other materials
-*	provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*
-*******************************************************************************/
-
-#ifndef I40IW_VIRTCHNL_H
-#define I40IW_VIRTCHNL_H
-
-#include "i40iw_hmc.h"
-
-#pragma pack(push, 1)
-
-struct i40iw_virtchnl_op_buf {
-	u16 iw_op_code;
-	u16 iw_op_ver;
-	u16 iw_chnl_buf_len;
-	u16 rsvd;
-	u64 iw_chnl_op_ctx;
-	/* Member alignment MUST be maintained above this location */
-	u8 iw_chnl_buf[1];
-};
-
-struct i40iw_virtchnl_resp_buf {
-	u64 iw_chnl_op_ctx;
-	u16 iw_chnl_buf_len;
-	s16 iw_op_ret_code;
-	/* Member alignment MUST be maintained above this location */
-	u16 rsvd[2];
-	u8 iw_chnl_buf[1];
-};
-
-enum i40iw_virtchnl_ops {
-	I40IW_VCHNL_OP_GET_VER = 0,
-	I40IW_VCHNL_OP_GET_HMC_FCN,
-	I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE,
-	I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE,
-	I40IW_VCHNL_OP_GET_STATS
-};
-
-#define I40IW_VCHNL_OP_GET_VER_V0 0
-#define I40IW_VCHNL_OP_GET_HMC_FCN_V0 0
-#define I40IW_VCHNL_OP_ADD_HMC_OBJ_RANGE_V0 0
-#define I40IW_VCHNL_OP_DEL_HMC_OBJ_RANGE_V0 0
-#define I40IW_VCHNL_OP_GET_STATS_V0 0
-#define I40IW_VCHNL_CHNL_VER_V0 0
-
-struct i40iw_dev_hw_stats;
-
-struct i40iw_virtchnl_hmc_obj_range {
-	u16 obj_type;
-	u16 rsvd;
-	u32 start_index;
-	u32 obj_count;
-};
-
-enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev,
-					   u32 vf_id,
-					   u8 *msg,
-					   u16 len);
-
-enum i40iw_status_code i40iw_vchnl_recv_vf(struct i40iw_sc_dev *dev,
-					   u32 vf_id,
-					   u8 *msg,
-					   u16 len);
-
-struct i40iw_virtchnl_req {
-	struct i40iw_sc_dev *dev;
-	struct i40iw_virtchnl_op_buf *vchnl_msg;
-	void *parm;
-	u32 vf_id;
-	u16 parm_len;
-	s16 ret_code;
-};
-
-#pragma pack(pop)
-
-enum i40iw_status_code i40iw_vchnl_vf_get_ver(struct i40iw_sc_dev *dev,
-					      u32 *vchnl_ver);
-
-enum i40iw_status_code i40iw_vchnl_vf_get_hmc_fcn(struct i40iw_sc_dev *dev,
-						  u16 *hmc_fcn);
-
-enum i40iw_status_code i40iw_vchnl_vf_add_hmc_objs(struct i40iw_sc_dev *dev,
-						   enum i40iw_hmc_rsrc_type rsrc_type,
-						   u32 start_index,
-						   u32 rsrc_count);
-
-enum i40iw_status_code i40iw_vchnl_vf_del_hmc_obj(struct i40iw_sc_dev *dev,
-						  enum i40iw_hmc_rsrc_type rsrc_type,
-						  u32 start_index,
-						  u32 rsrc_count);
-
-enum i40iw_status_code i40iw_vchnl_vf_get_pe_stats(struct i40iw_sc_dev *dev,
-						   struct i40iw_dev_hw_stats *hw_stats);
-#endif
diff --git a/drivers/infiniband/hw/irdma/Kconfig b/drivers/infiniband/hw/irdma/Kconfig
new file mode 100644
index 000000000000..dab88286d549
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config INFINIBAND_IRDMA
+	tristate "Intel(R) Ethernet Protocol Driver for RDMA"
+	depends on INET
+	depends on IPV6 || !IPV6
+	depends on PCI
+	depends on ICE && I40E
+	select GENERIC_ALLOCATOR
+	select CONFIG_AUXILIARY_BUS
+	help
+	  This is an Intel(R) Ethernet Protocol Driver for RDMA driver
+	  that support E810 (iWARP/RoCE) and X722 (iWARP) network devices.
diff --git a/drivers/infiniband/hw/irdma/Makefile b/drivers/infiniband/hw/irdma/Makefile
new file mode 100644
index 000000000000..48c3854235a0
--- /dev/null
+++ b/drivers/infiniband/hw/irdma/Makefile
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+# Copyright (c) 2019, Intel Corporation.
+
+#
+# Makefile for the Intel(R) Ethernet Connection RDMA Linux Driver
+#
+
+obj-$(CONFIG_INFINIBAND_IRDMA) += irdma.o
+
+irdma-objs := cm.o        \
+              ctrl.o      \
+              hmc.o       \
+              hw.o        \
+              i40iw_hw.o  \
+              i40iw_if.o  \
+              icrdma_hw.o \
+              main.o      \
+              pble.o      \
+              puda.o      \
+              trace.o     \
+              uda.o       \
+              uk.o        \
+              utils.o     \
+              verbs.o     \
+              ws.o        \
+
+CFLAGS_trace.o = -I$(src)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c
index e07ed065d3a4..ea2bb0140a6e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -8,8 +8,6 @@
 #include "i40e.h"
 #include "i40e_prototype.h"
 
-static const char i40e_client_interface_version_str[] = I40E_CLIENT_VERSION_STR;
-static struct i40e_client *registered_client;
 static LIST_HEAD(i40e_devices);
 static DEFINE_MUTEX(i40e_device_mutex);
 DEFINE_IDA(i40e_client_ida);
@@ -367,7 +365,6 @@ static void i40e_client_add_instance(struct i40e_pf *pf)
 	else
 		dev_err(&pf->pdev->dev, "MAC address list is empty!\n");
 
-	cdev->client = registered_client;
 	pf->cinst = cdev;
 
 	cdev->lan_info.msix_count = pf->num_iwarp_msix;
@@ -525,69 +522,6 @@ int i40e_lan_del_device(struct i40e_pf *pf)
 	return ret;
 }
 
-/**
- * i40e_client_release - release client specific resources
- * @client: pointer to the registered client
- *
- **/
-static void i40e_client_release(struct i40e_client *client)
-{
-	struct i40e_client_instance *cdev;
-	struct i40e_device *ldev;
-	struct i40e_pf *pf;
-
-	mutex_lock(&i40e_device_mutex);
-	list_for_each_entry(ldev, &i40e_devices, list) {
-		pf = ldev->pf;
-		cdev = pf->cinst;
-		if (!cdev)
-			continue;
-
-		while (test_and_set_bit(__I40E_SERVICE_SCHED,
-					pf->state))
-			usleep_range(500, 1000);
-
-		if (test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) {
-			if (client->ops && client->ops->close)
-				client->ops->close(&cdev->lan_info, client,
-						   false);
-			i40e_client_release_qvlist(&cdev->lan_info);
-			clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state);
-
-			dev_warn(&pf->pdev->dev,
-				 "Client %s instance for PF id %d closed\n",
-				 client->name, pf->hw.pf_id);
-		}
-		/* delete the client instance */
-		i40e_client_del_instance(pf);
-		dev_info(&pf->pdev->dev, "Deleted client instance of Client %s\n",
-			 client->name);
-		clear_bit(__I40E_SERVICE_SCHED, pf->state);
-	}
-	mutex_unlock(&i40e_device_mutex);
-}
-
-/**
- * i40e_client_prepare - prepare client specific resources
- * @client: pointer to the registered client
- *
- **/
-static void i40e_client_prepare(struct i40e_client *client)
-{
-	struct i40e_device *ldev;
-	struct i40e_pf *pf;
-
-	mutex_lock(&i40e_device_mutex);
-	list_for_each_entry(ldev, &i40e_devices, list) {
-		pf = ldev->pf;
-		i40e_client_add_instance(pf);
-		/* Start the client subtask */
-		set_bit(__I40E_CLIENT_SERVICE_REQUESTED, pf->state);
-		i40e_service_event_schedule(pf);
-	}
-	mutex_unlock(&i40e_device_mutex);
-}
-
 /**
  * i40e_client_virtchnl_send - TBD
  * @ldev: pointer to L2 context
@@ -817,86 +751,3 @@ void i40e_client_device_unregister(struct i40e_info *ldev)
 	clear_bit(__I40E_SERVICE_SCHED, pf->state);
 }
 EXPORT_SYMBOL_GPL(i40e_client_device_unregister);
-
-/* Retain these legacy global registration/unregistration calls till i40iw is
- * removed from the kernel. The irdma unified driver does not use these
- * exported symbols.
- */
-/**
- * i40e_register_client - Register a i40e client driver with the L2 driver
- * @client: pointer to the i40e_client struct
- *
- * Returns 0 on success or non-0 on error
- **/
-int i40e_register_client(struct i40e_client *client)
-{
-	int ret = 0;
-
-	if (!client) {
-		ret = -EIO;
-		goto out;
-	}
-
-	if (strlen(client->name) == 0) {
-		pr_info("i40e: Failed to register client with no name\n");
-		ret = -EIO;
-		goto out;
-	}
-
-	if (registered_client) {
-		pr_info("i40e: Client %s has already been registered!\n",
-			client->name);
-		ret = -EEXIST;
-		goto out;
-	}
-
-	if ((client->version.major != I40E_CLIENT_VERSION_MAJOR) ||
-	    (client->version.minor != I40E_CLIENT_VERSION_MINOR)) {
-		pr_info("i40e: Failed to register client %s due to mismatched client interface version\n",
-			client->name);
-		pr_info("Client is using version: %02d.%02d.%02d while LAN driver supports %s\n",
-			client->version.major, client->version.minor,
-			client->version.build,
-			i40e_client_interface_version_str);
-		ret = -EIO;
-		goto out;
-	}
-
-	registered_client = client;
-
-	i40e_client_prepare(client);
-
-	pr_info("i40e: Registered client %s\n", client->name);
-out:
-	return ret;
-}
-EXPORT_SYMBOL(i40e_register_client);
-
-/**
- * i40e_unregister_client - Unregister a i40e client driver with the L2 driver
- * @client: pointer to the i40e_client struct
- *
- * Returns 0 on success or non-0 on error
- **/
-int i40e_unregister_client(struct i40e_client *client)
-{
-	int ret = 0;
-
-	if (registered_client != client) {
-		pr_info("i40e: Client %s has not been registered\n",
-			client->name);
-		ret = -ENODEV;
-		goto out;
-	}
-	registered_client = NULL;
-	/* When a unregister request comes through we would have to send
-	 * a close for each of the client instances that were opened.
-	 * client_release function is called to handle this.
-	 */
-	i40e_client_release(client);
-
-	pr_info("i40e: Unregistered client %s\n", client->name);
-out:
-	return ret;
-}
-EXPORT_SYMBOL(i40e_unregister_client);
diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h
index 41f24b5241ab..6b3267b49755 100644
--- a/include/linux/net/intel/i40e_client.h
+++ b/include/linux/net/intel/i40e_client.h
@@ -197,8 +197,5 @@ static inline bool i40e_client_is_registered(struct i40e_client *client)
 
 void i40e_client_device_register(struct i40e_info *ldev, struct i40e_client *client);
 void i40e_client_device_unregister(struct i40e_info *ldev);
-/* used by clients */
-int i40e_register_client(struct i40e_client *client);
-int i40e_unregister_client(struct i40e_client *client);
 
 #endif /* _I40E_CLIENT_H_ */
diff --git a/include/uapi/rdma/i40iw-abi.h b/include/uapi/rdma/i40iw-abi.h
deleted file mode 100644
index 79890baa6fdb..000000000000
--- a/include/uapi/rdma/i40iw-abi.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2006 - 2016 Intel Corporation.  All rights reserved.
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef I40IW_ABI_H
-#define I40IW_ABI_H
-
-#include <linux/types.h>
-
-#define I40IW_ABI_VER 5
-
-struct i40iw_alloc_ucontext_req {
-	__u32 reserved32;
-	__u8 userspace_ver;
-	__u8 reserved8[3];
-};
-
-struct i40iw_alloc_ucontext_resp {
-	__u32 max_pds;		/* maximum pds allowed for this user process */
-	__u32 max_qps;		/* maximum qps allowed for this user process */
-	__u32 wq_size;		/* size of the WQs (sq+rq) allocated to the mmaped area */
-	__u8 kernel_ver;
-	__u8 reserved[3];
-};
-
-struct i40iw_alloc_pd_resp {
-	__u32 pd_id;
-	__u8 reserved[4];
-};
-
-struct i40iw_create_cq_req {
-	__aligned_u64 user_cq_buffer;
-	__aligned_u64 user_shadow_area;
-};
-
-struct i40iw_create_qp_req {
-	__aligned_u64 user_wqe_buffers;
-	__aligned_u64 user_compl_ctx;
-
-	/* UDA QP PHB */
-	__aligned_u64 user_sq_phb;	/* place for VA of the sq phb buff */
-	__aligned_u64 user_rq_phb;	/* place for VA of the rq phb buff */
-};
-
-enum i40iw_memreg_type {
-	IW_MEMREG_TYPE_MEM = 0x0000,
-	IW_MEMREG_TYPE_QP = 0x0001,
-	IW_MEMREG_TYPE_CQ = 0x0002,
-};
-
-struct i40iw_mem_reg_req {
-	__u16 reg_type;		/* Memory, QP or CQ */
-	__u16 cq_pages;
-	__u16 rq_pages;
-	__u16 sq_pages;
-};
-
-struct i40iw_create_cq_resp {
-	__u32 cq_id;
-	__u32 cq_size;
-	__u32 mmap_db_index;
-	__u32 reserved;
-};
-
-struct i40iw_create_qp_resp {
-	__u32 qp_id;
-	__u32 actual_sq_size;
-	__u32 actual_rq_size;
-	__u32 i40iw_drv_opt;
-	__u16 push_idx;
-	__u8  lsmm;
-	__u8  rsvd2;
-};
-
-#endif
-- 
cgit v1.2.3


From 12d55d3b5370448f6568d0031b5e401cc050c29e Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 27 May 2021 14:38:41 -0500
Subject: of: Move reserved memory private function declarations

fdt_init_reserved_mem() and fdt_reserved_mem_save_node() are private to
the DT code, so move there declarations to of_private.h. There's no need
for the dummy functions as CONFIG_OF_RESERVED_MEM is always enabled for
CONFIG_OF_EARLY_FLATTREE.

Cc: Frank Rowand <frowand.list@gmail.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210527193841.1284169-1-robh@kernel.org
---
 drivers/of/of_private.h         | 4 ++++
 drivers/of/of_reserved_mem.c    | 2 ++
 include/linux/of_reserved_mem.h | 6 ------
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index d717efbd637d..631489f7f8c0 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -171,4 +171,8 @@ static inline int of_dma_get_range(struct device_node *np,
 }
 #endif
 
+void fdt_init_reserved_mem(void);
+void fdt_reserved_mem_save_node(unsigned long node, const char *uname,
+			       phys_addr_t base, phys_addr_t size);
+
 #endif /* _LINUX_OF_PRIVATE_H */
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 15e2417974d6..4592b71aba5c 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -22,6 +22,8 @@
 #include <linux/slab.h>
 #include <linux/memblock.h>
 
+#include "of_private.h"
+
 #define MAX_RESERVED_REGIONS	64
 static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS];
 static int reserved_mem_count;
diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h
index 8216a4156263..76e4a0fffba4 100644
--- a/include/linux/of_reserved_mem.h
+++ b/include/linux/of_reserved_mem.h
@@ -39,9 +39,6 @@ int of_reserved_mem_device_init_by_name(struct device *dev,
 					const char *name);
 void of_reserved_mem_device_release(struct device *dev);
 
-void fdt_init_reserved_mem(void);
-void fdt_reserved_mem_save_node(unsigned long node, const char *uname,
-			       phys_addr_t base, phys_addr_t size);
 struct reserved_mem *of_reserved_mem_lookup(struct device_node *np);
 #else
 static inline int of_reserved_mem_device_init_by_idx(struct device *dev,
@@ -59,9 +56,6 @@ static inline int of_reserved_mem_device_init_by_name(struct device *dev,
 
 static inline void of_reserved_mem_device_release(struct device *pdev) { }
 
-static inline void fdt_init_reserved_mem(void) { }
-static inline void fdt_reserved_mem_save_node(unsigned long node,
-		const char *uname, phys_addr_t base, phys_addr_t size) { }
 static inline struct reserved_mem *of_reserved_mem_lookup(struct device_node *np)
 {
 	return NULL;
-- 
cgit v1.2.3


From 00dcc7cf1a49f93efaa281cc85c88005995ecf63 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 27 May 2021 14:45:44 -0500
Subject: PCI: Add empty stub for pci_register_io_range()

Add an empty stub for pci_register_io_range() when !CONFIG_PCI. It's needed
to convert of_pci_range_to_resource() to use IS_ENABLED(CONFIG_PCI).

Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20210527194547.1287934-2-robh@kernel.org
---
 include/linux/pci.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index c20211e59a57..29da7598f8d0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1772,6 +1772,10 @@ static inline int pci_request_regions(struct pci_dev *dev, const char *res_name)
 { return -EIO; }
 static inline void pci_release_regions(struct pci_dev *dev) { }
 
+static inline int pci_register_io_range(struct fwnode_handle *fwnode,
+					phys_addr_t addr, resource_size_t size)
+{ return -EINVAL; }
+
 static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; }
 
 static inline struct pci_bus *pci_find_next_bus(const struct pci_bus *from)
-- 
cgit v1.2.3


From 050a2c62dfc7d9ef457405f6ab4b715e9a2e32d7 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 27 May 2021 14:45:45 -0500
Subject: of: Merge of_get_address() and of_get_pci_address() implementations

of_get_address() and of_get_pci_address() are the same implementation
except of_get_pci_address() takes the PCI BAR number rather than an
index. Modify the of_get_address() implementation to work on either
index or BAR and provide wrapper functions for the existing functions.

Cc: Frank Rowand <frowand.list@gmail.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210527194547.1287934-3-robh@kernel.org
---
 drivers/of/address.c       | 62 ++++++++++------------------------------------
 include/linux/of_address.h | 27 ++++++++++++--------
 2 files changed, 29 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/address.c b/drivers/of/address.c
index aca94c348bd4..aa766437995c 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -199,50 +199,6 @@ static int of_bus_pci_translate(__be32 *addr, u64 offset, int na)
 	return of_bus_default_translate(addr + 1, offset, na - 1);
 }
 
-const __be32 *of_get_pci_address(struct device_node *dev, int bar_no, u64 *size,
-			unsigned int *flags)
-{
-	const __be32 *prop;
-	unsigned int psize;
-	struct device_node *parent;
-	struct of_bus *bus;
-	int onesize, i, na, ns;
-
-	/* Get parent & match bus type */
-	parent = of_get_parent(dev);
-	if (parent == NULL)
-		return NULL;
-	bus = of_match_bus(parent);
-	if (strcmp(bus->name, "pci")) {
-		of_node_put(parent);
-		return NULL;
-	}
-	bus->count_cells(dev, &na, &ns);
-	of_node_put(parent);
-	if (!OF_CHECK_ADDR_COUNT(na))
-		return NULL;
-
-	/* Get "reg" or "assigned-addresses" property */
-	prop = of_get_property(dev, bus->addresses, &psize);
-	if (prop == NULL)
-		return NULL;
-	psize /= 4;
-
-	onesize = na + ns;
-	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) {
-		u32 val = be32_to_cpu(prop[0]);
-		if ((val & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0)) {
-			if (size)
-				*size = of_read_number(prop + na, ns);
-			if (flags)
-				*flags = bus->get_flags(prop);
-			return prop;
-		}
-	}
-	return NULL;
-}
-EXPORT_SYMBOL(of_get_pci_address);
-
 int of_pci_address_to_resource(struct device_node *dev, int bar,
 			       struct resource *r)
 {
@@ -675,8 +631,8 @@ u64 of_translate_dma_address(struct device_node *dev, const __be32 *in_addr)
 }
 EXPORT_SYMBOL(of_translate_dma_address);
 
-const __be32 *of_get_address(struct device_node *dev, int index, u64 *size,
-		    unsigned int *flags)
+const __be32 *__of_get_address(struct device_node *dev, int index, int bar_no,
+			       u64 *size, unsigned int *flags)
 {
 	const __be32 *prop;
 	unsigned int psize;
@@ -689,6 +645,10 @@ const __be32 *of_get_address(struct device_node *dev, int index, u64 *size,
 	if (parent == NULL)
 		return NULL;
 	bus = of_match_bus(parent);
+	if (strcmp(bus->name, "pci") && (bar_no >= 0)) {
+		of_node_put(parent);
+		return NULL;
+	}
 	bus->count_cells(dev, &na, &ns);
 	of_node_put(parent);
 	if (!OF_CHECK_ADDR_COUNT(na))
@@ -701,17 +661,21 @@ const __be32 *of_get_address(struct device_node *dev, int index, u64 *size,
 	psize /= 4;
 
 	onesize = na + ns;
-	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++)
-		if (i == index) {
+	for (i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++) {
+		u32 val = be32_to_cpu(prop[0]);
+		/* PCI bus matches on BAR number instead of index */
+		if (((bar_no >= 0) && ((val & 0xff) == ((bar_no * 4) + PCI_BASE_ADDRESS_0))) ||
+		    ((index >= 0) && (i == index))) {
 			if (size)
 				*size = of_read_number(prop + na, ns);
 			if (flags)
 				*flags = bus->get_flags(prop);
 			return prop;
 		}
+	}
 	return NULL;
 }
-EXPORT_SYMBOL(of_get_address);
+EXPORT_SYMBOL(__of_get_address);
 
 static int parser_init(struct of_pci_range_parser *parser,
 			struct device_node *node, const char *name)
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 88bc943405cd..b72807faf037 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -51,8 +51,8 @@ void __iomem *of_io_request_and_map(struct device_node *device,
  * the address space flags too. The PCI version uses a BAR number
  * instead of an absolute index
  */
-extern const __be32 *of_get_address(struct device_node *dev, int index,
-			   u64 *size, unsigned int *flags);
+extern const __be32 *__of_get_address(struct device_node *dev, int index, int bar_no,
+				      u64 *size, unsigned int *flags);
 
 extern int of_pci_range_parser_init(struct of_pci_range_parser *parser,
 			struct device_node *node);
@@ -75,8 +75,8 @@ static inline u64 of_translate_address(struct device_node *np,
 	return OF_BAD_ADDR;
 }
 
-static inline const __be32 *of_get_address(struct device_node *dev, int index,
-					u64 *size, unsigned int *flags)
+static inline const __be32 *__of_get_address(struct device_node *dev, int index, int bar_no,
+					     u64 *size, unsigned int *flags)
 {
 	return NULL;
 }
@@ -125,8 +125,6 @@ static inline void __iomem *of_iomap(struct device_node *device, int index)
 #define of_range_parser_init of_pci_range_parser_init
 
 #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_PCI)
-extern const __be32 *of_get_pci_address(struct device_node *dev, int bar_no,
-			       u64 *size, unsigned int *flags);
 extern int of_pci_address_to_resource(struct device_node *dev, int bar,
 				      struct resource *r);
 extern int of_pci_range_to_resource(struct of_pci_range *range,
@@ -139,11 +137,6 @@ static inline int of_pci_address_to_resource(struct device_node *dev, int bar,
 	return -ENOSYS;
 }
 
-static inline const __be32 *of_get_pci_address(struct device_node *dev,
-		int bar_no, u64 *size, unsigned int *flags)
-{
-	return NULL;
-}
 static inline int of_pci_range_to_resource(struct of_pci_range *range,
 					   struct device_node *np,
 					   struct resource *res)
@@ -152,4 +145,16 @@ static inline int of_pci_range_to_resource(struct of_pci_range *range,
 }
 #endif /* CONFIG_OF_ADDRESS && CONFIG_PCI */
 
+static inline const __be32 *of_get_address(struct device_node *dev, int index,
+					   u64 *size, unsigned int *flags)
+{
+	return __of_get_address(dev, index, -1, size, flags);
+}
+
+static inline const __be32 *of_get_pci_address(struct device_node *dev, int bar_no,
+					       u64 *size, unsigned int *flags)
+{
+	return __of_get_address(dev, -1, bar_no, size, flags);
+}
+
 #endif /* __OF_ADDRESS_H */
-- 
cgit v1.2.3


From c3c0dc75774b488770f33598109161040d291367 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 27 May 2021 14:45:46 -0500
Subject: of: address: Use IS_ENABLED() for !CONFIG_PCI

Convert address.c to use IS_ENABLED() instead of ifdefs for the
public PCI functions. This simplifies the ifdefs in of_address.h.

Cc: Frank Rowand <frowand.list@gmail.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210527194547.1287934-4-robh@kernel.org
---
 drivers/of/address.c       |  8 +++++++-
 include/linux/of_address.h | 39 ++++++++++++++++++---------------------
 2 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/address.c b/drivers/of/address.c
index aa766437995c..e643f999743a 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -198,6 +198,7 @@ static int of_bus_pci_translate(__be32 *addr, u64 offset, int na)
 {
 	return of_bus_default_translate(addr + 1, offset, na - 1);
 }
+#endif /* CONFIG_PCI */
 
 int of_pci_address_to_resource(struct device_node *dev, int bar,
 			       struct resource *r)
@@ -206,6 +207,9 @@ int of_pci_address_to_resource(struct device_node *dev, int bar,
 	u64		size;
 	unsigned int	flags;
 
+	if (!IS_ENABLED(CONFIG_PCI))
+		return -ENOSYS;
+
 	addrp = of_get_pci_address(dev, bar, &size, &flags);
 	if (addrp == NULL)
 		return -EINVAL;
@@ -236,6 +240,9 @@ int of_pci_range_to_resource(struct of_pci_range *range,
 	res->parent = res->child = res->sibling = NULL;
 	res->name = np->full_name;
 
+	if (!IS_ENABLED(CONFIG_PCI))
+		return -ENOSYS;
+
 	if (res->flags & IORESOURCE_IO) {
 		unsigned long port;
 		err = pci_register_io_range(&np->fwnode, range->cpu_addr,
@@ -266,7 +273,6 @@ invalid_range:
 	return err;
 }
 EXPORT_SYMBOL(of_pci_range_to_resource);
-#endif /* CONFIG_PCI */
 
 /*
  * ISA bus specific translator
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index b72807faf037..45598dbec269 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -61,6 +61,11 @@ extern int of_pci_dma_range_parser_init(struct of_pci_range_parser *parser,
 extern struct of_pci_range *of_pci_range_parser_one(
 					struct of_pci_range_parser *parser,
 					struct of_pci_range *range);
+extern int of_pci_address_to_resource(struct device_node *dev, int bar,
+				      struct resource *r);
+extern int of_pci_range_to_resource(struct of_pci_range *range,
+				    struct device_node *np,
+				    struct resource *res);
 extern bool of_dma_is_coherent(struct device_node *np);
 #else /* CONFIG_OF_ADDRESS */
 static inline void __iomem *of_io_request_and_map(struct device_node *device,
@@ -100,6 +105,19 @@ static inline struct of_pci_range *of_pci_range_parser_one(
 	return NULL;
 }
 
+static inline int of_pci_address_to_resource(struct device_node *dev, int bar,
+				             struct resource *r)
+{
+	return -ENOSYS;
+}
+
+static inline int of_pci_range_to_resource(struct of_pci_range *range,
+					   struct device_node *np,
+					   struct resource *res)
+{
+	return -ENOSYS;
+}
+
 static inline bool of_dma_is_coherent(struct device_node *np)
 {
 	return false;
@@ -124,27 +142,6 @@ static inline void __iomem *of_iomap(struct device_node *device, int index)
 #endif
 #define of_range_parser_init of_pci_range_parser_init
 
-#if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_PCI)
-extern int of_pci_address_to_resource(struct device_node *dev, int bar,
-				      struct resource *r);
-extern int of_pci_range_to_resource(struct of_pci_range *range,
-				    struct device_node *np,
-				    struct resource *res);
-#else /* CONFIG_OF_ADDRESS && CONFIG_PCI */
-static inline int of_pci_address_to_resource(struct device_node *dev, int bar,
-				             struct resource *r)
-{
-	return -ENOSYS;
-}
-
-static inline int of_pci_range_to_resource(struct of_pci_range *range,
-					   struct device_node *np,
-					   struct resource *res)
-{
-	return -ENOSYS;
-}
-#endif /* CONFIG_OF_ADDRESS && CONFIG_PCI */
-
 static inline const __be32 *of_get_address(struct device_node *dev, int index,
 					   u64 *size, unsigned int *flags)
 {
-- 
cgit v1.2.3


From c545a90567125b874d817509036ec7d6698097ac Mon Sep 17 00:00:00 2001
From: JC Kuo <jckuo@nvidia.com>
Date: Wed, 20 Jan 2021 15:34:06 +0800
Subject: phy: tegra: xusb: Add sleepwalk and suspend/resume

This commit adds sleepwalk/wake and suspend/resume interfaces
to Tegra XUSB PHY driver.

Tegra XUSB host controller driver makes use of sleepwalk functions
to enable/disable sleepwalk circuit which is in always-on partition
and can respond to USB resume signals when controller is not powered.
Sleepwalk can be enabled/disabled for any USB UPHY individually.

  - tegra_xusb_padctl_enable_phy_sleepwalk()
  - tegra_xusb_padctl_disable_phy_sleepwalk()

Tegra XUSB host controller driver makes use of wake functions to
enable/disable/query wake circuit which is in always-on partition
can wake system up when USB resume happens.
Wake circuit can be enabled/disabled for any USB PHY individually.

  - tegra_xusb_padctl_enable_phy_wake()
  - tegra_xusb_padctl_disable_phy_wake()
  - tegra_xusb_padctl_remote_wake_detected()

This commit also adds two system suspend stubs that can be used to
save and restore XUSB PADCTL context during system suspend and
resume.
  - tegra_xusb_padctl_suspend_noirq()
  - tegra_xusb_padctl_resume_noirq()

Signed-off-by: JC Kuo <jckuo@nvidia.com>
Acked-By: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/phy/tegra/xusb.c       | 82 ++++++++++++++++++++++++++++++++++++++++++
 drivers/phy/tegra/xusb.h       |  8 +++++
 include/linux/phy/tegra/xusb.h | 10 +++++-
 3 files changed, 99 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
index a34d304677bb..0aadac678191 100644
--- a/drivers/phy/tegra/xusb.c
+++ b/drivers/phy/tegra/xusb.c
@@ -1273,10 +1273,36 @@ static int tegra_xusb_padctl_remove(struct platform_device *pdev)
 	return err;
 }
 
+static int tegra_xusb_padctl_suspend_noirq(struct device *dev)
+{
+	struct tegra_xusb_padctl *padctl = dev_get_drvdata(dev);
+
+	if (padctl->soc && padctl->soc->ops && padctl->soc->ops->suspend_noirq)
+		return padctl->soc->ops->suspend_noirq(padctl);
+
+	return 0;
+}
+
+static int tegra_xusb_padctl_resume_noirq(struct device *dev)
+{
+	struct tegra_xusb_padctl *padctl = dev_get_drvdata(dev);
+
+	if (padctl->soc && padctl->soc->ops && padctl->soc->ops->resume_noirq)
+		return padctl->soc->ops->resume_noirq(padctl);
+
+	return 0;
+}
+
+static const struct dev_pm_ops tegra_xusb_padctl_pm_ops = {
+	SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_xusb_padctl_suspend_noirq,
+				      tegra_xusb_padctl_resume_noirq)
+};
+
 static struct platform_driver tegra_xusb_padctl_driver = {
 	.driver = {
 		.name = "tegra-xusb-padctl",
 		.of_match_table = tegra_xusb_padctl_of_match,
+		.pm = &tegra_xusb_padctl_pm_ops,
 	},
 	.probe = tegra_xusb_padctl_probe,
 	.remove = tegra_xusb_padctl_remove,
@@ -1343,6 +1369,62 @@ int tegra_xusb_padctl_hsic_set_idle(struct tegra_xusb_padctl *padctl,
 }
 EXPORT_SYMBOL_GPL(tegra_xusb_padctl_hsic_set_idle);
 
+int tegra_xusb_padctl_enable_phy_sleepwalk(struct tegra_xusb_padctl *padctl, struct phy *phy,
+					   enum usb_device_speed speed)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	if (lane->pad->ops->enable_phy_sleepwalk)
+		return lane->pad->ops->enable_phy_sleepwalk(lane, speed);
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_enable_phy_sleepwalk);
+
+int tegra_xusb_padctl_disable_phy_sleepwalk(struct tegra_xusb_padctl *padctl, struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	if (lane->pad->ops->disable_phy_sleepwalk)
+		return lane->pad->ops->disable_phy_sleepwalk(lane);
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_disable_phy_sleepwalk);
+
+int tegra_xusb_padctl_enable_phy_wake(struct tegra_xusb_padctl *padctl, struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	if (lane->pad->ops->enable_phy_wake)
+		return lane->pad->ops->enable_phy_wake(lane);
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_enable_phy_wake);
+
+int tegra_xusb_padctl_disable_phy_wake(struct tegra_xusb_padctl *padctl, struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	if (lane->pad->ops->disable_phy_wake)
+		return lane->pad->ops->disable_phy_wake(lane);
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_disable_phy_wake);
+
+bool tegra_xusb_padctl_remote_wake_detected(struct tegra_xusb_padctl *padctl, struct phy *phy)
+{
+	struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
+
+	if (lane->pad->ops->remote_wake_detected)
+		return lane->pad->ops->remote_wake_detected(lane);
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(tegra_xusb_padctl_remote_wake_detected);
+
 int tegra_xusb_padctl_usb3_set_lfps_detect(struct tegra_xusb_padctl *padctl,
 					   unsigned int port, bool enable)
 {
diff --git a/drivers/phy/tegra/xusb.h b/drivers/phy/tegra/xusb.h
index e789d5ff4eb8..034f7a2c28d6 100644
--- a/drivers/phy/tegra/xusb.h
+++ b/drivers/phy/tegra/xusb.h
@@ -11,6 +11,7 @@
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 
+#include <linux/usb/ch9.h>
 #include <linux/usb/otg.h>
 #include <linux/usb/role.h>
 
@@ -132,6 +133,11 @@ struct tegra_xusb_lane_ops {
 	void (*remove)(struct tegra_xusb_lane *lane);
 	void (*iddq_enable)(struct tegra_xusb_lane *lane);
 	void (*iddq_disable)(struct tegra_xusb_lane *lane);
+	int (*enable_phy_sleepwalk)(struct tegra_xusb_lane *lane, enum usb_device_speed speed);
+	int (*disable_phy_sleepwalk)(struct tegra_xusb_lane *lane);
+	int (*enable_phy_wake)(struct tegra_xusb_lane *lane);
+	int (*disable_phy_wake)(struct tegra_xusb_lane *lane);
+	bool (*remote_wake_detected)(struct tegra_xusb_lane *lane);
 };
 
 bool tegra_xusb_lane_check(struct tegra_xusb_lane *lane, const char *function);
@@ -396,6 +402,8 @@ struct tegra_xusb_padctl_ops {
 			 const struct tegra_xusb_padctl_soc *soc);
 	void (*remove)(struct tegra_xusb_padctl *padctl);
 
+	int (*suspend_noirq)(struct tegra_xusb_padctl *padctl);
+	int (*resume_noirq)(struct tegra_xusb_padctl *padctl);
 	int (*usb3_save_context)(struct tegra_xusb_padctl *padctl,
 				 unsigned int index);
 	int (*hsic_set_idle)(struct tegra_xusb_padctl *padctl,
diff --git a/include/linux/phy/tegra/xusb.h b/include/linux/phy/tegra/xusb.h
index 71d956935405..3a35e74cdc61 100644
--- a/include/linux/phy/tegra/xusb.h
+++ b/include/linux/phy/tegra/xusb.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION.  All rights reserved.
  */
 
 #ifndef PHY_TEGRA_XUSB_H
@@ -8,6 +8,7 @@
 
 struct tegra_xusb_padctl;
 struct device;
+enum usb_device_speed;
 
 struct tegra_xusb_padctl *tegra_xusb_padctl_get(struct device *dev);
 void tegra_xusb_padctl_put(struct tegra_xusb_padctl *padctl);
@@ -23,4 +24,11 @@ int tegra_xusb_padctl_set_vbus_override(struct tegra_xusb_padctl *padctl,
 int tegra_phy_xusb_utmi_port_reset(struct phy *phy);
 int tegra_xusb_padctl_get_usb3_companion(struct tegra_xusb_padctl *padctl,
 					 unsigned int port);
+int tegra_xusb_padctl_enable_phy_sleepwalk(struct tegra_xusb_padctl *padctl, struct phy *phy,
+					   enum usb_device_speed speed);
+int tegra_xusb_padctl_disable_phy_sleepwalk(struct tegra_xusb_padctl *padctl, struct phy *phy);
+int tegra_xusb_padctl_enable_phy_wake(struct tegra_xusb_padctl *padctl, struct phy *phy);
+int tegra_xusb_padctl_disable_phy_wake(struct tegra_xusb_padctl *padctl, struct phy *phy);
+bool tegra_xusb_padctl_remote_wake_detected(struct tegra_xusb_padctl *padctl, struct phy *phy);
+
 #endif /* PHY_TEGRA_XUSB_H */
-- 
cgit v1.2.3


From c955a0cc8a286e5da1ebb88c19201e9bab8c2422 Mon Sep 17 00:00:00 2001
From: Patrice Chotard <patrice.chotard@foss.st.com>
Date: Tue, 18 May 2021 18:27:52 +0200
Subject: spi: spi-mem: add automatic poll status functions

With STM32 QSPI, it is possible to poll the status register of the device.
This could be done to offload the CPU during an operation (erase or
program a SPI NAND for example).

spi_mem_poll_status API has been added to handle this feature.
This new function take care of the offload/non-offload cases.

For the non-offload case, use read_poll_timeout() to poll the status in
order to release CPU during this phase.
For example, previously, when erasing large area, in non-offload case,
CPU load can reach ~50%, now it decrease to ~35%.

Signed-off-by: Patrice Chotard <patrice.chotard@foss.st.com>
Signed-off-by: Christophe Kerello <christophe.kerello@foss.st.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Link: https://lore.kernel.org/r/20210518162754.15940-2-patrice.chotard@foss.st.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-mem.c       | 86 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/spi-mem.h | 16 +++++++++
 2 files changed, 102 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index 1513553e4080..177b3e21febf 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -6,6 +6,7 @@
  * Author: Boris Brezillon <boris.brezillon@bootlin.com>
  */
 #include <linux/dmaengine.h>
+#include <linux/iopoll.h>
 #include <linux/pm_runtime.h>
 #include <linux/spi/spi.h>
 #include <linux/spi/spi-mem.h>
@@ -743,6 +744,91 @@ static inline struct spi_mem_driver *to_spi_mem_drv(struct device_driver *drv)
 	return container_of(drv, struct spi_mem_driver, spidrv.driver);
 }
 
+static int spi_mem_read_status(struct spi_mem *mem,
+			       const struct spi_mem_op *op,
+			       u16 *status)
+{
+	const u8 *bytes = (u8 *)op->data.buf.in;
+	int ret;
+
+	ret = spi_mem_exec_op(mem, op);
+	if (ret)
+		return ret;
+
+	if (op->data.nbytes > 1)
+		*status = ((u16)bytes[0] << 8) | bytes[1];
+	else
+		*status = bytes[0];
+
+	return 0;
+}
+
+/**
+ * spi_mem_poll_status() - Poll memory device status
+ * @mem: SPI memory device
+ * @op: the memory operation to execute
+ * @mask: status bitmask to ckeck
+ * @match: (status & mask) expected value
+ * @initial_delay_us: delay in us before starting to poll
+ * @polling_delay_us: time to sleep between reads in us
+ * @timeout_ms: timeout in milliseconds
+ *
+ * This function polls a status register and returns when
+ * (status & mask) == match or when the timeout has expired.
+ *
+ * Return: 0 in case of success, -ETIMEDOUT in case of error,
+ *         -EOPNOTSUPP if not supported.
+ */
+int spi_mem_poll_status(struct spi_mem *mem,
+			const struct spi_mem_op *op,
+			u16 mask, u16 match,
+			unsigned long initial_delay_us,
+			unsigned long polling_delay_us,
+			u16 timeout_ms)
+{
+	struct spi_controller *ctlr = mem->spi->controller;
+	int ret = -EOPNOTSUPP;
+	int read_status_ret;
+	u16 status;
+
+	if (op->data.nbytes < 1 || op->data.nbytes > 2 ||
+	    op->data.dir != SPI_MEM_DATA_IN)
+		return -EINVAL;
+
+	if (ctlr->mem_ops && ctlr->mem_ops->poll_status) {
+		ret = spi_mem_access_start(mem);
+		if (ret)
+			return ret;
+
+		ret = ctlr->mem_ops->poll_status(mem, op, mask, match,
+						 initial_delay_us, polling_delay_us,
+						 timeout_ms);
+
+		spi_mem_access_end(mem);
+	}
+
+	if (ret == -EOPNOTSUPP) {
+		if (!spi_mem_supports_op(mem, op))
+			return ret;
+
+		if (initial_delay_us < 10)
+			udelay(initial_delay_us);
+		else
+			usleep_range((initial_delay_us >> 2) + 1,
+				     initial_delay_us);
+
+		ret = read_poll_timeout(spi_mem_read_status, read_status_ret,
+					(read_status_ret || ((status) & mask) == match),
+					polling_delay_us, timeout_ms * 1000, false, mem,
+					op, &status);
+		if (read_status_ret)
+			return read_status_ret;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(spi_mem_poll_status);
+
 static int spi_mem_probe(struct spi_device *spi)
 {
 	struct spi_mem_driver *memdrv = to_spi_mem_drv(spi->dev.driver);
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index 2b65c9edc34e..85e2ff7b840d 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -250,6 +250,9 @@ static inline void *spi_mem_get_drvdata(struct spi_mem *mem)
  *		  the currently mapped area), and the caller of
  *		  spi_mem_dirmap_write() is responsible for calling it again in
  *		  this case.
+ * @poll_status: poll memory device status until (status & mask) == match or
+ *               when the timeout has expired. It fills the data buffer with
+ *               the last status value.
  *
  * This interface should be implemented by SPI controllers providing an
  * high-level interface to execute SPI memory operation, which is usually the
@@ -274,6 +277,12 @@ struct spi_controller_mem_ops {
 			       u64 offs, size_t len, void *buf);
 	ssize_t (*dirmap_write)(struct spi_mem_dirmap_desc *desc,
 				u64 offs, size_t len, const void *buf);
+	int (*poll_status)(struct spi_mem *mem,
+			   const struct spi_mem_op *op,
+			   u16 mask, u16 match,
+			   unsigned long initial_delay_us,
+			   unsigned long polling_rate_us,
+			   unsigned long timeout_ms);
 };
 
 /**
@@ -369,6 +378,13 @@ devm_spi_mem_dirmap_create(struct device *dev, struct spi_mem *mem,
 void devm_spi_mem_dirmap_destroy(struct device *dev,
 				 struct spi_mem_dirmap_desc *desc);
 
+int spi_mem_poll_status(struct spi_mem *mem,
+			const struct spi_mem_op *op,
+			u16 mask, u16 match,
+			unsigned long initial_delay_us,
+			unsigned long polling_delay_us,
+			u16 timeout_ms);
+
 int spi_mem_driver_register_with_owner(struct spi_mem_driver *drv,
 				       struct module *owner);
 
-- 
cgit v1.2.3


From 8941cd8d295e40f8ea1c0a5045d6d068b8e33eec Mon Sep 17 00:00:00 2001
From: Patrice Chotard <patrice.chotard@foss.st.com>
Date: Tue, 18 May 2021 18:27:53 +0200
Subject: mtd: spinand: use the spi-mem poll status APIs

Make use of spi-mem poll status APIs to let advanced controllers
optimize wait operations.
This should also fix the high CPU usage for system that don't have
a dedicated STATUS poll block logic.

Signed-off-by: Patrice Chotard <patrice.chotard@foss.st.com>
Signed-off-by: Christophe Kerello <christophe.kerello@foss.st.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20210518162754.15940-3-patrice.chotard@foss.st.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/mtd/nand/spi/core.c | 45 ++++++++++++++++++++++++++++++++-------------
 include/linux/mtd/spinand.h | 22 ++++++++++++++++++++++
 2 files changed, 54 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 17f63f95f4a2..3131fae0c715 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -473,20 +473,26 @@ static int spinand_erase_op(struct spinand_device *spinand,
 	return spi_mem_exec_op(spinand->spimem, &op);
 }
 
-static int spinand_wait(struct spinand_device *spinand, u8 *s)
+static int spinand_wait(struct spinand_device *spinand,
+			unsigned long initial_delay_us,
+			unsigned long poll_delay_us,
+			u8 *s)
 {
-	unsigned long timeo =  jiffies + msecs_to_jiffies(400);
+	struct spi_mem_op op = SPINAND_GET_FEATURE_OP(REG_STATUS,
+						      spinand->scratchbuf);
 	u8 status;
 	int ret;
 
-	do {
-		ret = spinand_read_status(spinand, &status);
-		if (ret)
-			return ret;
+	ret = spi_mem_poll_status(spinand->spimem, &op, STATUS_BUSY, 0,
+				  initial_delay_us,
+				  poll_delay_us,
+				  SPINAND_WAITRDY_TIMEOUT_MS);
+	if (ret)
+		return ret;
 
-		if (!(status & STATUS_BUSY))
-			goto out;
-	} while (time_before(jiffies, timeo));
+	status = *spinand->scratchbuf;
+	if (!(status & STATUS_BUSY))
+		goto out;
 
 	/*
 	 * Extra read, just in case the STATUS_READY bit has changed
@@ -526,7 +532,10 @@ static int spinand_reset_op(struct spinand_device *spinand)
 	if (ret)
 		return ret;
 
-	return spinand_wait(spinand, NULL);
+	return spinand_wait(spinand,
+			    SPINAND_RESET_INITIAL_DELAY_US,
+			    SPINAND_RESET_POLL_DELAY_US,
+			    NULL);
 }
 
 static int spinand_lock_block(struct spinand_device *spinand, u8 lock)
@@ -549,7 +558,10 @@ static int spinand_read_page(struct spinand_device *spinand,
 	if (ret)
 		return ret;
 
-	ret = spinand_wait(spinand, &status);
+	ret = spinand_wait(spinand,
+			   SPINAND_READ_INITIAL_DELAY_US,
+			   SPINAND_READ_POLL_DELAY_US,
+			   &status);
 	if (ret < 0)
 		return ret;
 
@@ -585,7 +597,10 @@ static int spinand_write_page(struct spinand_device *spinand,
 	if (ret)
 		return ret;
 
-	ret = spinand_wait(spinand, &status);
+	ret = spinand_wait(spinand,
+			   SPINAND_WRITE_INITIAL_DELAY_US,
+			   SPINAND_WRITE_POLL_DELAY_US,
+			   &status);
 	if (!ret && (status & STATUS_PROG_FAILED))
 		return -EIO;
 
@@ -768,7 +783,11 @@ static int spinand_erase(struct nand_device *nand, const struct nand_pos *pos)
 	if (ret)
 		return ret;
 
-	ret = spinand_wait(spinand, &status);
+	ret = spinand_wait(spinand,
+			   SPINAND_ERASE_INITIAL_DELAY_US,
+			   SPINAND_ERASE_POLL_DELAY_US,
+			   &status);
+
 	if (!ret && (status & STATUS_ERASE_FAILED))
 		ret = -EIO;
 
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 6bb92f26833e..6988956b8492 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -170,6 +170,28 @@ struct spinand_op;
 struct spinand_device;
 
 #define SPINAND_MAX_ID_LEN	4
+/*
+ * For erase, write and read operation, we got the following timings :
+ * tBERS (erase) 1ms to 4ms
+ * tPROG 300us to 400us
+ * tREAD 25us to 100us
+ * In order to minimize latency, the min value is divided by 4 for the
+ * initial delay, and dividing by 20 for the poll delay.
+ * For reset, 5us/10us/500us if the device is respectively
+ * reading/programming/erasing when the RESET occurs. Since we always
+ * issue a RESET when the device is IDLE, 5us is selected for both initial
+ * and poll delay.
+ */
+#define SPINAND_READ_INITIAL_DELAY_US	6
+#define SPINAND_READ_POLL_DELAY_US	5
+#define SPINAND_RESET_INITIAL_DELAY_US	5
+#define SPINAND_RESET_POLL_DELAY_US	5
+#define SPINAND_WRITE_INITIAL_DELAY_US	75
+#define SPINAND_WRITE_POLL_DELAY_US	15
+#define SPINAND_ERASE_INITIAL_DELAY_US	250
+#define SPINAND_ERASE_POLL_DELAY_US	50
+
+#define SPINAND_WAITRDY_TIMEOUT_MS	400
 
 /**
  * struct spinand_id - SPI NAND id structure
-- 
cgit v1.2.3


From 68d7a190682aa4eb02db477328088ebad15acc83 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 2 Jun 2021 16:58:08 +0200
Subject: sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling

The util_est internal UTIL_AVG_UNCHANGED flag which is used to prevent
unnecessary util_est updates uses the LSB of util_est.enqueued. It is
exposed via _task_util_est() (and task_util_est()).

Commit 92a801e5d5b7 ("sched/fair: Mask UTIL_AVG_UNCHANGED usages")
mentions that the LSB is lost for util_est resolution but
find_energy_efficient_cpu() checks if task_util_est() returns 0 to
return prev_cpu early.

_task_util_est() returns the max value of util_est.ewma and
util_est.enqueued or'ed w/ UTIL_AVG_UNCHANGED.
So task_util_est() returning the max of task_util() and
_task_util_est() will never return 0 under the default
SCHED_FEAT(UTIL_EST, true).

To fix this use the MSB of util_est.enqueued instead and keep the flag
util_est internal, i.e. don't export it via _task_util_est().

The maximal possible util_avg value for a task is 1024 so the MSB of
'unsigned int util_est.enqueued' isn't used to store a util value.

As a caveat the code behind the util_est_se trace point has to filter
UTIL_AVG_UNCHANGED to see the real util_est.enqueued value which should
be easy to do.

This also fixes an issue report by Xuewen Yan that util_est_update()
only used UTIL_AVG_UNCHANGED for the subtrahend of the equation:

  last_enqueued_diff = ue.enqueued - (task_util() | UTIL_AVG_UNCHANGED)

Fixes: b89997aa88f0b sched/pelt: Fix task util_est update filtering
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Xuewen Yan <xuewen.yan@unisoc.com>
Reviewed-by: Vincent Donnefort <vincent.donnefort@arm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20210602145808.1562603-1-dietmar.eggemann@arm.com
---
 include/linux/sched.h |  8 ++++++++
 kernel/sched/debug.c  |  3 ++-
 kernel/sched/fair.c   |  5 +++--
 kernel/sched/pelt.h   | 11 +----------
 4 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c881384517..28a98fc4ded4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -350,11 +350,19 @@ struct load_weight {
  * Only for tasks we track a moving average of the past instantaneous
  * estimated utilization. This allows to absorb sporadic drops in utilization
  * of an otherwise almost periodic task.
+ *
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est.enqueued at dequeue
+ * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
+ * for a task) it is safe to use MSB.
  */
 struct util_est {
 	unsigned int			enqueued;
 	unsigned int			ewma;
 #define UTIL_EST_WEIGHT_SHIFT		2
+#define UTIL_AVG_UNCHANGED		0x80000000
 } __attribute__((__aligned__(sizeof(u64))));
 
 /*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9c882f20803e..c5aacbd492a1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
 #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
 #define __P(F) __PS(#F, F)
 #define   P(F) __PS(#F, p->F)
+#define   PM(F, M) __PS(#F, p->F & (M))
 #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
 #define __PN(F) __PSN(#F, F)
 #define   PN(F) __PSN(#F, p->F)
@@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
 	P(se.avg.util_est.ewma);
-	P(se.avg.util_est.enqueued);
+	PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
 	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b98fb37330a..2c8a9352590d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3907,7 +3907,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -4007,7 +4007,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Reset EWMA on utilization increases, the moving average is used only
 	 * to smooth utilization decreases.
 	 */
-	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+	ue.enqueued = task_util(p);
 	if (sched_feat(UTIL_EST_FASTUP)) {
 		if (ue.ewma < ue.enqueued) {
 			ue.ewma = ue.enqueued;
@@ -4056,6 +4056,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	ue.ewma  += last_ewma_diff;
 	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
+	ue.enqueued |= UTIL_AVG_UNCHANGED;
 	WRITE_ONCE(p->se.avg.util_est, ue);
 
 	trace_sched_util_est_se_tp(&p->se);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 1462846d244e..cfe94ffd2b38 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
 	return LOAD_AVG_MAX - 1024 + avg->period_contrib;
 }
 
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
-
 static inline void cfs_se_util_change(struct sched_avg *avg)
 {
 	unsigned int enqueued;
@@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 	if (!sched_feat(UTIL_EST))
 		return;
 
-	/* Avoid store if the flag has been already set */
+	/* Avoid store if the flag has been already reset */
 	enqueued = avg->util_est.enqueued;
 	if (!(enqueued & UTIL_AVG_UNCHANGED))
 		return;
-- 
cgit v1.2.3


From 66cd071a1f839b4834d45aa7dde622151041b1a0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 9 Apr 2021 19:10:53 +0100
Subject: iov_iter: Remove iov_iter_for_each_range()

Remove iov_iter_for_each_range() as it's no longer used with the removal of
lustre.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  4 ----
 lib/iov_iter.c      | 27 ---------------------------
 2 files changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index d3ec87706d75..74a401f04bd3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -294,8 +294,4 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 int import_single_range(int type, void __user *buf, size_t len,
 		 struct iovec *iov, struct iov_iter *i);
 
-int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
-			    int (*f)(struct kvec *vec, void *context),
-			    void *context);
-
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index c701b7a187f2..8f5ce5b1ff91 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -2093,30 +2093,3 @@ int import_single_range(int rw, void __user *buf, size_t len,
 	return 0;
 }
 EXPORT_SYMBOL(import_single_range);
-
-int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
-			    int (*f)(struct kvec *vec, void *context),
-			    void *context)
-{
-	struct kvec w;
-	int err = -EINVAL;
-	if (!bytes)
-		return 0;
-
-	iterate_all_kinds(i, bytes, v, -EINVAL, ({
-		w.iov_base = kmap(v.bv_page) + v.bv_offset;
-		w.iov_len = v.bv_len;
-		err = f(&w, context);
-		kunmap(v.bv_page);
-		err;}), ({
-		w = v;
-		err = f(&w, context);}), ({
-		w.iov_base = kmap(v.bv_page) + v.bv_offset;
-		w.iov_len = v.bv_len;
-		err = f(&w, context);
-		kunmap(v.bv_page);
-		err;})
-	)
-	return err;
-}
-EXPORT_SYMBOL(iov_iter_for_each_range);
-- 
cgit v1.2.3


From 066ebe8ca1e4734471772df734233af5c53d21ae Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Tue, 1 Jun 2021 15:35:59 +0200
Subject: power: ab8500: remove unused header

The ab8500.h header in linux/power is not referenced/included, so can be
safely removed.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power/ab8500.h | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 include/linux/power/ab8500.h

(limited to 'include/linux')

diff --git a/include/linux/power/ab8500.h b/include/linux/power/ab8500.h
deleted file mode 100644
index 51976b52f373..000000000000
--- a/include/linux/power/ab8500.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson 2013
- * Author: Hongbo Zhang <hongbo.zhang@linaro.com>
- */
-
-#ifndef PWR_AB8500_H
-#define PWR_AB8500_H
-
-extern const struct abx500_res_to_temp ab8500_temp_tbl_a_thermistor[];
-extern const int ab8500_temp_tbl_a_size;
-
-extern const struct abx500_res_to_temp ab8500_temp_tbl_b_thermistor[];
-extern const int ab8500_temp_tbl_b_size;
-
-#endif /* PWR_AB8500_H */
-- 
cgit v1.2.3


From aa8c8bf64b6e11f846087301f033b0e5977b1342 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sat, 29 May 2021 01:59:52 +0200
Subject: power: supply: pm2301_charger: Delete driver

The PM2301 was only used in tandem with AB9540, part of U9540,
a platform that was cancelled and never deployed in products.
Delete it.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/Makefile         |    2 +-
 drivers/power/supply/pm2301_charger.c | 1249 ---------------------------------
 include/linux/pm2301_charger.h        |   48 --
 3 files changed, 1 insertion(+), 1298 deletions(-)
 delete mode 100644 drivers/power/supply/pm2301_charger.c
 delete mode 100644 include/linux/pm2301_charger.h

(limited to 'include/linux')

diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index a7309a3d1a47..16ebfaf6d55b 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -60,7 +60,7 @@ obj-$(CONFIG_BATTERY_TWL4030_MADC)	+= twl4030_madc_battery.o
 obj-$(CONFIG_CHARGER_88PM860X)	+= 88pm860x_charger.o
 obj-$(CONFIG_CHARGER_PCF50633)	+= pcf50633-charger.o
 obj-$(CONFIG_BATTERY_RX51)	+= rx51_battery.o
-obj-$(CONFIG_AB8500_BM)		+= ab8500_bmdata.o ab8500_charger.o ab8500_fg.o ab8500_btemp.o abx500_chargalg.o pm2301_charger.o
+obj-$(CONFIG_AB8500_BM)		+= ab8500_bmdata.o ab8500_charger.o ab8500_fg.o ab8500_btemp.o abx500_chargalg.o
 obj-$(CONFIG_CHARGER_CPCAP)	+= cpcap-charger.o
 obj-$(CONFIG_CHARGER_ISP1704)	+= isp1704_charger.o
 obj-$(CONFIG_CHARGER_MAX8903)	+= max8903_charger.o
diff --git a/drivers/power/supply/pm2301_charger.c b/drivers/power/supply/pm2301_charger.c
deleted file mode 100644
index f86bbbeaff6c..000000000000
--- a/drivers/power/supply/pm2301_charger.c
+++ /dev/null
@@ -1,1249 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2012 ST Ericsson.
- *
- * Power supply driver for ST Ericsson pm2xxx_charger charger
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/platform_device.h>
-#include <linux/power_supply.h>
-#include <linux/regulator/consumer.h>
-#include <linux/err.h>
-#include <linux/i2c.h>
-#include <linux/workqueue.h>
-#include <linux/mfd/abx500/ab8500.h>
-#include <linux/pm2301_charger.h>
-#include <linux/gpio.h>
-#include <linux/pm_runtime.h>
-#include <linux/pm.h>
-
-#include "ab8500-bm.h"
-#include "ab8500-chargalg.h"
-#include "pm2301_charger.h"
-
-#define to_pm2xxx_charger_ac_device_info(x) container_of((x), \
-		struct pm2xxx_charger, ac_chg)
-#define SLEEP_MIN		50
-#define SLEEP_MAX		100
-#define PM2XXX_AUTOSUSPEND_DELAY 500
-
-static int pm2xxx_interrupt_registers[] = {
-	PM2XXX_REG_INT1,
-	PM2XXX_REG_INT2,
-	PM2XXX_REG_INT3,
-	PM2XXX_REG_INT4,
-	PM2XXX_REG_INT5,
-	PM2XXX_REG_INT6,
-};
-
-static enum power_supply_property pm2xxx_charger_ac_props[] = {
-	POWER_SUPPLY_PROP_HEALTH,
-	POWER_SUPPLY_PROP_PRESENT,
-	POWER_SUPPLY_PROP_ONLINE,
-	POWER_SUPPLY_PROP_VOLTAGE_AVG,
-};
-
-static int pm2xxx_charger_voltage_map[] = {
-	3500,
-	3525,
-	3550,
-	3575,
-	3600,
-	3625,
-	3650,
-	3675,
-	3700,
-	3725,
-	3750,
-	3775,
-	3800,
-	3825,
-	3850,
-	3875,
-	3900,
-	3925,
-	3950,
-	3975,
-	4000,
-	4025,
-	4050,
-	4075,
-	4100,
-	4125,
-	4150,
-	4175,
-	4200,
-	4225,
-	4250,
-	4275,
-	4300,
-};
-
-static int pm2xxx_charger_current_map[] = {
-	200,
-	200,
-	400,
-	600,
-	800,
-	1000,
-	1200,
-	1400,
-	1600,
-	1800,
-	2000,
-	2200,
-	2400,
-	2600,
-	2800,
-	3000,
-};
-
-static void set_lpn_pin(struct pm2xxx_charger *pm2)
-{
-	if (!pm2->ac.charger_connected && gpio_is_valid(pm2->lpn_pin)) {
-		gpio_set_value(pm2->lpn_pin, 1);
-		usleep_range(SLEEP_MIN, SLEEP_MAX);
-	}
-}
-
-static void clear_lpn_pin(struct pm2xxx_charger *pm2)
-{
-	if (!pm2->ac.charger_connected && gpio_is_valid(pm2->lpn_pin))
-		gpio_set_value(pm2->lpn_pin, 0);
-}
-
-static int pm2xxx_reg_read(struct pm2xxx_charger *pm2, int reg, u8 *val)
-{
-	int ret;
-
-	/* wake up the device */
-	pm_runtime_get_sync(pm2->dev);
-
-	ret = i2c_smbus_read_i2c_block_data(pm2->config.pm2xxx_i2c, reg,
-				1, val);
-	if (ret < 0)
-		dev_err(pm2->dev, "Error reading register at 0x%x\n", reg);
-	else
-		ret = 0;
-
-	pm_runtime_put_sync(pm2->dev);
-
-	return ret;
-}
-
-static int pm2xxx_reg_write(struct pm2xxx_charger *pm2, int reg, u8 val)
-{
-	int ret;
-
-	/* wake up the device */
-	pm_runtime_get_sync(pm2->dev);
-
-	ret = i2c_smbus_write_i2c_block_data(pm2->config.pm2xxx_i2c, reg,
-				1, &val);
-	if (ret < 0)
-		dev_err(pm2->dev, "Error writing register at 0x%x\n", reg);
-	else
-		ret = 0;
-
-	pm_runtime_put_sync(pm2->dev);
-
-	return ret;
-}
-
-static int pm2xxx_charging_enable_mngt(struct pm2xxx_charger *pm2)
-{
-	int ret;
-
-	/* Enable charging */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG2,
-			(PM2XXX_CH_AUTO_RESUME_EN | PM2XXX_CHARGER_ENA));
-
-	return ret;
-}
-
-static int pm2xxx_charging_disable_mngt(struct pm2xxx_charger *pm2)
-{
-	int ret;
-
-	/* Disable SW EOC ctrl */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_SW_CTRL_REG, PM2XXX_SWCTRL_HW);
-	if (ret < 0) {
-		dev_err(pm2->dev, "%s pm2xxx write failed\n", __func__);
-		return ret;
-	}
-
-	/* Disable charging */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG2,
-			(PM2XXX_CH_AUTO_RESUME_DIS | PM2XXX_CHARGER_DIS));
-	if (ret < 0) {
-		dev_err(pm2->dev, "%s pm2xxx write failed\n", __func__);
-		return ret;
-	}
-
-	return 0;
-}
-
-static int pm2xxx_charger_batt_therm_mngt(struct pm2xxx_charger *pm2, int val)
-{
-	queue_work(pm2->charger_wq, &pm2->check_main_thermal_prot_work);
-
-	return 0;
-}
-
-
-static int pm2xxx_charger_die_therm_mngt(struct pm2xxx_charger *pm2, int val)
-{
-	queue_work(pm2->charger_wq, &pm2->check_main_thermal_prot_work);
-
-	return 0;
-}
-
-static int pm2xxx_charger_ovv_mngt(struct pm2xxx_charger *pm2, int val)
-{
-	dev_err(pm2->dev, "Overvoltage detected\n");
-	pm2->flags.ovv = true;
-	power_supply_changed(pm2->ac_chg.psy);
-
-	/* Schedule a new HW failure check */
-	queue_delayed_work(pm2->charger_wq, &pm2->check_hw_failure_work, 0);
-
-	return 0;
-}
-
-static int pm2xxx_charger_wd_exp_mngt(struct pm2xxx_charger *pm2, int val)
-{
-	dev_dbg(pm2->dev , "20 minutes watchdog expired\n");
-
-	pm2->ac.wd_expired = true;
-	power_supply_changed(pm2->ac_chg.psy);
-
-	return 0;
-}
-
-static int pm2xxx_charger_vbat_lsig_mngt(struct pm2xxx_charger *pm2, int val)
-{
-	int ret;
-
-	switch (val) {
-	case PM2XXX_INT1_ITVBATLOWR:
-		dev_dbg(pm2->dev, "VBAT grows above VBAT_LOW level\n");
-		/* Enable SW EOC ctrl */
-		ret = pm2xxx_reg_write(pm2, PM2XXX_SW_CTRL_REG,
-							PM2XXX_SWCTRL_SW);
-		if (ret < 0) {
-			dev_err(pm2->dev, "%s pm2xxx write failed\n", __func__);
-			return ret;
-		}
-		break;
-
-	case PM2XXX_INT1_ITVBATLOWF:
-		dev_dbg(pm2->dev, "VBAT drops below VBAT_LOW level\n");
-		/* Disable SW EOC ctrl */
-		ret = pm2xxx_reg_write(pm2, PM2XXX_SW_CTRL_REG,
-							PM2XXX_SWCTRL_HW);
-		if (ret < 0) {
-			dev_err(pm2->dev, "%s pm2xxx write failed\n", __func__);
-			return ret;
-		}
-		break;
-
-	default:
-		dev_err(pm2->dev, "Unknown VBAT level\n");
-	}
-
-	return 0;
-}
-
-static int pm2xxx_charger_bat_disc_mngt(struct pm2xxx_charger *pm2, int val)
-{
-	dev_dbg(pm2->dev, "battery disconnected\n");
-
-	return 0;
-}
-
-static int pm2xxx_charger_detection(struct pm2xxx_charger *pm2, u8 *val)
-{
-	int ret;
-
-	ret = pm2xxx_reg_read(pm2, PM2XXX_SRCE_REG_INT2, val);
-
-	if (ret < 0) {
-		dev_err(pm2->dev, "Charger detection failed\n");
-		goto out;
-	}
-
-	*val &= (PM2XXX_INT2_S_ITVPWR1PLUG | PM2XXX_INT2_S_ITVPWR2PLUG);
-
-out:
-	return ret;
-}
-
-static int pm2xxx_charger_itv_pwr_plug_mngt(struct pm2xxx_charger *pm2, int val)
-{
-
-	int ret;
-	u8 read_val;
-
-	/*
-	 * Since we can't be sure that the events are received
-	 * synchronously, we have the check if the main charger is
-	 * connected by reading the interrupt source register.
-	 */
-	ret = pm2xxx_charger_detection(pm2, &read_val);
-
-	if ((ret == 0) && read_val) {
-		pm2->ac.charger_connected = 1;
-		pm2->ac_conn = true;
-		queue_work(pm2->charger_wq, &pm2->ac_work);
-	}
-
-
-	return ret;
-}
-
-static int pm2xxx_charger_itv_pwr_unplug_mngt(struct pm2xxx_charger *pm2,
-								int val)
-{
-	pm2->ac.charger_connected = 0;
-	queue_work(pm2->charger_wq, &pm2->ac_work);
-
-	return 0;
-}
-
-static int pm2_int_reg0(void *pm2_data, int val)
-{
-	struct pm2xxx_charger *pm2 = pm2_data;
-	int ret = 0;
-
-	if (val & PM2XXX_INT1_ITVBATLOWR) {
-		ret = pm2xxx_charger_vbat_lsig_mngt(pm2,
-						PM2XXX_INT1_ITVBATLOWR);
-		if (ret < 0)
-			goto out;
-	}
-
-	if (val & PM2XXX_INT1_ITVBATLOWF) {
-		ret = pm2xxx_charger_vbat_lsig_mngt(pm2,
-						PM2XXX_INT1_ITVBATLOWF);
-		if (ret < 0)
-			goto out;
-	}
-
-	if (val & PM2XXX_INT1_ITVBATDISCONNECT) {
-		ret = pm2xxx_charger_bat_disc_mngt(pm2,
-				PM2XXX_INT1_ITVBATDISCONNECT);
-		if (ret < 0)
-			goto out;
-	}
-out:
-	return ret;
-}
-
-static int pm2_int_reg1(void *pm2_data, int val)
-{
-	struct pm2xxx_charger *pm2 = pm2_data;
-	int ret = 0;
-
-	if (val & (PM2XXX_INT2_ITVPWR1PLUG | PM2XXX_INT2_ITVPWR2PLUG)) {
-		dev_dbg(pm2->dev , "Main charger plugged\n");
-		ret = pm2xxx_charger_itv_pwr_plug_mngt(pm2, val &
-			(PM2XXX_INT2_ITVPWR1PLUG | PM2XXX_INT2_ITVPWR2PLUG));
-	}
-
-	if (val &
-		(PM2XXX_INT2_ITVPWR1UNPLUG | PM2XXX_INT2_ITVPWR2UNPLUG)) {
-		dev_dbg(pm2->dev , "Main charger unplugged\n");
-		ret = pm2xxx_charger_itv_pwr_unplug_mngt(pm2, val &
-						(PM2XXX_INT2_ITVPWR1UNPLUG |
-						PM2XXX_INT2_ITVPWR2UNPLUG));
-	}
-
-	return ret;
-}
-
-static int pm2_int_reg2(void *pm2_data, int val)
-{
-	struct pm2xxx_charger *pm2 = pm2_data;
-	int ret = 0;
-
-	if (val & PM2XXX_INT3_ITAUTOTIMEOUTWD)
-		ret = pm2xxx_charger_wd_exp_mngt(pm2, val);
-
-	if (val & (PM2XXX_INT3_ITCHPRECHARGEWD |
-				PM2XXX_INT3_ITCHCCWD | PM2XXX_INT3_ITCHCVWD)) {
-		dev_dbg(pm2->dev,
-			"Watchdog occurred for precharge, CC and CV charge\n");
-	}
-
-	return ret;
-}
-
-static int pm2_int_reg3(void *pm2_data, int val)
-{
-	struct pm2xxx_charger *pm2 = pm2_data;
-	int ret = 0;
-
-	if (val & (PM2XXX_INT4_ITCHARGINGON)) {
-		dev_dbg(pm2->dev ,
-			"charging operation has started\n");
-	}
-
-	if (val & (PM2XXX_INT4_ITVRESUME)) {
-		dev_dbg(pm2->dev,
-			"battery discharged down to VResume threshold\n");
-	}
-
-	if (val & (PM2XXX_INT4_ITBATTFULL)) {
-		dev_dbg(pm2->dev , "battery fully detected\n");
-	}
-
-	if (val & (PM2XXX_INT4_ITCVPHASE)) {
-		dev_dbg(pm2->dev, "CV phase enter with 0.5C charging\n");
-	}
-
-	if (val & (PM2XXX_INT4_ITVPWR2OVV | PM2XXX_INT4_ITVPWR1OVV)) {
-		pm2->failure_case = VPWR_OVV;
-		ret = pm2xxx_charger_ovv_mngt(pm2, val &
-			(PM2XXX_INT4_ITVPWR2OVV | PM2XXX_INT4_ITVPWR1OVV));
-		dev_dbg(pm2->dev, "VPWR/VSYSTEM overvoltage detected\n");
-	}
-
-	if (val & (PM2XXX_INT4_S_ITBATTEMPCOLD |
-				PM2XXX_INT4_S_ITBATTEMPHOT)) {
-		ret = pm2xxx_charger_batt_therm_mngt(pm2, val &
-			(PM2XXX_INT4_S_ITBATTEMPCOLD |
-			PM2XXX_INT4_S_ITBATTEMPHOT));
-		dev_dbg(pm2->dev, "BTEMP is too Low/High\n");
-	}
-
-	return ret;
-}
-
-static int pm2_int_reg4(void *pm2_data, int val)
-{
-	struct pm2xxx_charger *pm2 = pm2_data;
-	int ret = 0;
-
-	if (val & PM2XXX_INT5_ITVSYSTEMOVV) {
-		pm2->failure_case = VSYSTEM_OVV;
-		ret = pm2xxx_charger_ovv_mngt(pm2, val &
-						PM2XXX_INT5_ITVSYSTEMOVV);
-		dev_dbg(pm2->dev, "VSYSTEM overvoltage detected\n");
-	}
-
-	if (val & (PM2XXX_INT5_ITTHERMALWARNINGFALL |
-				PM2XXX_INT5_ITTHERMALWARNINGRISE |
-				PM2XXX_INT5_ITTHERMALSHUTDOWNFALL |
-				PM2XXX_INT5_ITTHERMALSHUTDOWNRISE)) {
-		dev_dbg(pm2->dev, "BTEMP die temperature is too Low/High\n");
-		ret = pm2xxx_charger_die_therm_mngt(pm2, val &
-			(PM2XXX_INT5_ITTHERMALWARNINGFALL |
-			PM2XXX_INT5_ITTHERMALWARNINGRISE |
-			PM2XXX_INT5_ITTHERMALSHUTDOWNFALL |
-			PM2XXX_INT5_ITTHERMALSHUTDOWNRISE));
-	}
-
-	return ret;
-}
-
-static int pm2_int_reg5(void *pm2_data, int val)
-{
-	struct pm2xxx_charger *pm2 = pm2_data;
-
-	if (val & (PM2XXX_INT6_ITVPWR2DROP | PM2XXX_INT6_ITVPWR1DROP)) {
-		dev_dbg(pm2->dev, "VMPWR drop to VBAT level\n");
-	}
-
-	if (val & (PM2XXX_INT6_ITVPWR2VALIDRISE |
-			PM2XXX_INT6_ITVPWR1VALIDRISE |
-			PM2XXX_INT6_ITVPWR2VALIDFALL |
-			PM2XXX_INT6_ITVPWR1VALIDFALL)) {
-		dev_dbg(pm2->dev, "Falling/Rising edge on WPWR1/2\n");
-	}
-
-	return 0;
-}
-
-static irqreturn_t  pm2xxx_irq_int(int irq, void *data)
-{
-	struct pm2xxx_charger *pm2 = data;
-	struct pm2xxx_interrupts *interrupt = pm2->pm2_int;
-	int i;
-
-	/* wake up the device */
-	pm_runtime_get_sync(pm2->dev);
-
-	do {
-		for (i = 0; i < PM2XXX_NUM_INT_REG; i++) {
-			pm2xxx_reg_read(pm2,
-				pm2xxx_interrupt_registers[i],
-				&(interrupt->reg[i]));
-
-			if (interrupt->reg[i] > 0)
-				interrupt->handler[i](pm2, interrupt->reg[i]);
-		}
-	} while (gpio_get_value(pm2->pdata->gpio_irq_number) == 0);
-
-	pm_runtime_mark_last_busy(pm2->dev);
-	pm_runtime_put_autosuspend(pm2->dev);
-
-	return IRQ_HANDLED;
-}
-
-static int pm2xxx_charger_get_ac_cv(struct pm2xxx_charger *pm2)
-{
-	int ret = 0;
-	u8 val;
-
-	if (pm2->ac.charger_connected && pm2->ac.charger_online) {
-
-		ret = pm2xxx_reg_read(pm2, PM2XXX_SRCE_REG_INT4, &val);
-		if (ret < 0) {
-			dev_err(pm2->dev, "%s pm2xxx read failed\n", __func__);
-			goto out;
-		}
-
-		if (val & PM2XXX_INT4_S_ITCVPHASE)
-			ret = PM2XXX_CONST_VOLT;
-		else
-			ret = PM2XXX_CONST_CURR;
-	}
-out:
-	return ret;
-}
-
-static int pm2xxx_current_to_regval(int curr)
-{
-	int i;
-
-	if (curr < pm2xxx_charger_current_map[0])
-		return 0;
-
-	for (i = 1; i < ARRAY_SIZE(pm2xxx_charger_current_map); i++) {
-		if (curr < pm2xxx_charger_current_map[i])
-			return (i - 1);
-	}
-
-	i = ARRAY_SIZE(pm2xxx_charger_current_map) - 1;
-	if (curr == pm2xxx_charger_current_map[i])
-		return i;
-	else
-		return -EINVAL;
-}
-
-static int pm2xxx_voltage_to_regval(int curr)
-{
-	int i;
-
-	if (curr < pm2xxx_charger_voltage_map[0])
-		return 0;
-
-	for (i = 1; i < ARRAY_SIZE(pm2xxx_charger_voltage_map); i++) {
-		if (curr < pm2xxx_charger_voltage_map[i])
-			return i - 1;
-	}
-
-	i = ARRAY_SIZE(pm2xxx_charger_voltage_map) - 1;
-	if (curr == pm2xxx_charger_voltage_map[i])
-		return i;
-	else
-		return -EINVAL;
-}
-
-static int pm2xxx_charger_update_charger_current(struct ux500_charger *charger,
-		int ich_out)
-{
-	int ret;
-	int curr_index;
-	struct pm2xxx_charger *pm2;
-	u8 val;
-
-	if (charger->psy->desc->type == POWER_SUPPLY_TYPE_MAINS)
-		pm2 = to_pm2xxx_charger_ac_device_info(charger);
-	else
-		return -ENXIO;
-
-	curr_index = pm2xxx_current_to_regval(ich_out);
-	if (curr_index < 0) {
-		dev_err(pm2->dev,
-			"Charger current too high, charging not started\n");
-		return -ENXIO;
-	}
-
-	ret = pm2xxx_reg_read(pm2, PM2XXX_BATT_CTRL_REG6, &val);
-	if (ret >= 0) {
-		val &= ~PM2XXX_DIR_CH_CC_CURRENT_MASK;
-		val |= curr_index;
-		ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG6, val);
-		if (ret < 0) {
-			dev_err(pm2->dev,
-				"%s write failed\n", __func__);
-		}
-	}
-	else
-		dev_err(pm2->dev, "%s read failed\n", __func__);
-
-	return ret;
-}
-
-static int pm2xxx_charger_ac_get_property(struct power_supply *psy,
-	enum power_supply_property psp,
-	union power_supply_propval *val)
-{
-	struct pm2xxx_charger *pm2;
-
-	pm2 = to_pm2xxx_charger_ac_device_info(psy_to_ux500_charger(psy));
-
-	switch (psp) {
-	case POWER_SUPPLY_PROP_HEALTH:
-		if (pm2->flags.mainextchnotok)
-			val->intval = POWER_SUPPLY_HEALTH_UNSPEC_FAILURE;
-		else if (pm2->ac.wd_expired)
-			val->intval = POWER_SUPPLY_HEALTH_DEAD;
-		else if (pm2->flags.main_thermal_prot)
-			val->intval = POWER_SUPPLY_HEALTH_OVERHEAT;
-		else if (pm2->flags.ovv)
-			val->intval = POWER_SUPPLY_HEALTH_OVERVOLTAGE;
-		else
-			val->intval = POWER_SUPPLY_HEALTH_GOOD;
-		break;
-	case POWER_SUPPLY_PROP_ONLINE:
-		val->intval = pm2->ac.charger_online;
-		break;
-	case POWER_SUPPLY_PROP_PRESENT:
-		val->intval = pm2->ac.charger_connected;
-		break;
-	case POWER_SUPPLY_PROP_VOLTAGE_AVG:
-		pm2->ac.cv_active = pm2xxx_charger_get_ac_cv(pm2);
-		val->intval = pm2->ac.cv_active;
-		break;
-	default:
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int pm2xxx_charging_init(struct pm2xxx_charger *pm2)
-{
-	int ret = 0;
-
-	/* enable CC and CV watchdog */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG3,
-		(PM2XXX_CH_WD_CV_PHASE_60MIN | PM2XXX_CH_WD_CC_PHASE_60MIN));
-	if( ret < 0)
-		return ret;
-
-	/* enable precharge watchdog */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG4,
-					PM2XXX_CH_WD_PRECH_PHASE_60MIN);
-
-	/* Disable auto timeout */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG5,
-					PM2XXX_CH_WD_AUTO_TIMEOUT_20MIN);
-
-	/*
-     * EOC current level = 100mA
-	 * Precharge current level = 100mA
-	 * CC current level = 1000mA
-	 */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG6,
-		(PM2XXX_DIR_CH_CC_CURRENT_1000MA |
-		PM2XXX_CH_PRECH_CURRENT_100MA |
-		PM2XXX_CH_EOC_CURRENT_100MA));
-
-	/*
-     * recharge threshold = 3.8V
-	 * Precharge to CC threshold = 2.9V
-	 */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG7,
-		(PM2XXX_CH_PRECH_VOL_2_9 | PM2XXX_CH_VRESUME_VOL_3_8));
-
-	/* float voltage charger level = 4.2V */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG8,
-		PM2XXX_CH_VOLT_4_2);
-
-	/* Voltage drop between VBAT and VSYS in HW charging = 300mV */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG9,
-		(PM2XXX_CH_150MV_DROP_300MV | PM2XXX_CHARCHING_INFO_DIS |
-		PM2XXX_CH_CC_REDUCED_CURRENT_IDENT |
-		PM2XXX_CH_CC_MODEDROP_DIS));
-
-	/* Input charger level of over voltage = 10V */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_INP_VOLT_VPWR2,
-					PM2XXX_VPWR2_OVV_10);
-	ret = pm2xxx_reg_write(pm2, PM2XXX_INP_VOLT_VPWR1,
-					PM2XXX_VPWR1_OVV_10);
-
-	/* Input charger drop */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_INP_DROP_VPWR2,
-		(PM2XXX_VPWR2_HW_OPT_DIS | PM2XXX_VPWR2_VALID_DIS |
-		PM2XXX_VPWR2_DROP_DIS));
-	ret = pm2xxx_reg_write(pm2, PM2XXX_INP_DROP_VPWR1,
-		(PM2XXX_VPWR1_HW_OPT_DIS | PM2XXX_VPWR1_VALID_DIS |
-		PM2XXX_VPWR1_DROP_DIS));
-
-	/* Disable battery low monitoring */
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_LOW_LEV_COMP_REG,
-		PM2XXX_VBAT_LOW_MONITORING_ENA);
-
-	return ret;
-}
-
-static int pm2xxx_charger_ac_en(struct ux500_charger *charger,
-	int enable, int vset, int iset)
-{
-	int ret;
-	int volt_index;
-	int curr_index;
-	u8 val;
-
-	struct pm2xxx_charger *pm2 = to_pm2xxx_charger_ac_device_info(charger);
-
-	if (enable) {
-		if (!pm2->ac.charger_connected) {
-			dev_dbg(pm2->dev, "AC charger not connected\n");
-			return -ENXIO;
-		}
-
-		dev_dbg(pm2->dev, "Enable AC: %dmV %dmA\n", vset, iset);
-		if (!pm2->vddadc_en_ac) {
-			ret = regulator_enable(pm2->regu);
-			if (ret)
-				dev_warn(pm2->dev,
-					"Failed to enable vddadc regulator\n");
-			else
-				pm2->vddadc_en_ac = true;
-		}
-
-		ret = pm2xxx_charging_init(pm2);
-		if (ret < 0) {
-			dev_err(pm2->dev, "%s charging init failed\n",
-					__func__);
-			goto error_occured;
-		}
-
-		volt_index = pm2xxx_voltage_to_regval(vset);
-		curr_index = pm2xxx_current_to_regval(iset);
-
-		if (volt_index < 0 || curr_index < 0) {
-			dev_err(pm2->dev,
-				"Charger voltage or current too high, "
-				"charging not started\n");
-			return -ENXIO;
-		}
-
-		ret = pm2xxx_reg_read(pm2, PM2XXX_BATT_CTRL_REG8, &val);
-		if (ret < 0) {
-			dev_err(pm2->dev, "%s pm2xxx read failed\n", __func__);
-			goto error_occured;
-		}
-		val &= ~PM2XXX_CH_VOLT_MASK;
-		val |= volt_index;
-		ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG8, val);
-		if (ret < 0) {
-			dev_err(pm2->dev, "%s pm2xxx write failed\n", __func__);
-			goto error_occured;
-		}
-
-		ret = pm2xxx_reg_read(pm2, PM2XXX_BATT_CTRL_REG6, &val);
-		if (ret < 0) {
-			dev_err(pm2->dev, "%s pm2xxx read failed\n", __func__);
-			goto error_occured;
-		}
-		val &= ~PM2XXX_DIR_CH_CC_CURRENT_MASK;
-		val |= curr_index;
-		ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_CTRL_REG6, val);
-		if (ret < 0) {
-			dev_err(pm2->dev, "%s pm2xxx write failed\n", __func__);
-			goto error_occured;
-		}
-
-		if (!pm2->bat->enable_overshoot) {
-			ret = pm2xxx_reg_read(pm2, PM2XXX_LED_CTRL_REG, &val);
-			if (ret < 0) {
-				dev_err(pm2->dev, "%s pm2xxx read failed\n",
-								__func__);
-				goto error_occured;
-			}
-			val |= PM2XXX_ANTI_OVERSHOOT_EN;
-			ret = pm2xxx_reg_write(pm2, PM2XXX_LED_CTRL_REG, val);
-			if (ret < 0) {
-				dev_err(pm2->dev, "%s pm2xxx write failed\n",
-								__func__);
-				goto error_occured;
-			}
-		}
-
-		ret = pm2xxx_charging_enable_mngt(pm2);
-		if (ret < 0) {
-			dev_err(pm2->dev, "Failed to enable"
-						"pm2xxx ac charger\n");
-			goto error_occured;
-		}
-
-		pm2->ac.charger_online = 1;
-	} else {
-		pm2->ac.charger_online = 0;
-		pm2->ac.wd_expired = false;
-
-		/* Disable regulator if enabled */
-		if (pm2->vddadc_en_ac) {
-			regulator_disable(pm2->regu);
-			pm2->vddadc_en_ac = false;
-		}
-
-		ret = pm2xxx_charging_disable_mngt(pm2);
-		if (ret < 0) {
-			dev_err(pm2->dev, "failed to disable"
-						"pm2xxx ac charger\n");
-			goto error_occured;
-		}
-
-		dev_dbg(pm2->dev, "PM2301: " "Disabled AC charging\n");
-	}
-	power_supply_changed(pm2->ac_chg.psy);
-
-error_occured:
-	return ret;
-}
-
-static int pm2xxx_charger_watchdog_kick(struct ux500_charger *charger)
-{
-	int ret;
-	struct pm2xxx_charger *pm2;
-
-	if (charger->psy->desc->type == POWER_SUPPLY_TYPE_MAINS)
-		pm2 = to_pm2xxx_charger_ac_device_info(charger);
-	else
-		return -ENXIO;
-
-	ret = pm2xxx_reg_write(pm2, PM2XXX_BATT_WD_KICK, WD_TIMER);
-	if (ret)
-		dev_err(pm2->dev, "Failed to kick WD!\n");
-
-	return ret;
-}
-
-static void pm2xxx_charger_ac_work(struct work_struct *work)
-{
-	struct pm2xxx_charger *pm2 = container_of(work,
-		struct pm2xxx_charger, ac_work);
-
-
-	power_supply_changed(pm2->ac_chg.psy);
-	sysfs_notify(&pm2->ac_chg.psy->dev.kobj, NULL, "present");
-};
-
-static void pm2xxx_charger_check_hw_failure_work(struct work_struct *work)
-{
-	u8 reg_value;
-
-	struct pm2xxx_charger *pm2 = container_of(work,
-		struct pm2xxx_charger, check_hw_failure_work.work);
-
-	if (pm2->flags.ovv) {
-		pm2xxx_reg_read(pm2, PM2XXX_SRCE_REG_INT4, &reg_value);
-
-		if (!(reg_value & (PM2XXX_INT4_S_ITVPWR1OVV |
-					PM2XXX_INT4_S_ITVPWR2OVV))) {
-			pm2->flags.ovv = false;
-			power_supply_changed(pm2->ac_chg.psy);
-		}
-	}
-
-	/* If we still have a failure, schedule a new check */
-	if (pm2->flags.ovv) {
-		queue_delayed_work(pm2->charger_wq,
-			&pm2->check_hw_failure_work, round_jiffies(HZ));
-	}
-}
-
-static void pm2xxx_charger_check_main_thermal_prot_work(
-	struct work_struct *work)
-{
-	int ret;
-	u8 val;
-
-	struct pm2xxx_charger *pm2 = container_of(work, struct pm2xxx_charger,
-					check_main_thermal_prot_work);
-
-	/* Check if die temp warning is still active */
-	ret = pm2xxx_reg_read(pm2, PM2XXX_SRCE_REG_INT5, &val);
-	if (ret < 0) {
-		dev_err(pm2->dev, "%s pm2xxx read failed\n", __func__);
-		return;
-	}
-	if (val & (PM2XXX_INT5_S_ITTHERMALWARNINGRISE
-			| PM2XXX_INT5_S_ITTHERMALSHUTDOWNRISE))
-		pm2->flags.main_thermal_prot = true;
-	else if (val & (PM2XXX_INT5_S_ITTHERMALWARNINGFALL
-				| PM2XXX_INT5_S_ITTHERMALSHUTDOWNFALL))
-		pm2->flags.main_thermal_prot = false;
-
-	power_supply_changed(pm2->ac_chg.psy);
-}
-
-static struct pm2xxx_interrupts pm2xxx_int = {
-	.handler[0] = pm2_int_reg0,
-	.handler[1] = pm2_int_reg1,
-	.handler[2] = pm2_int_reg2,
-	.handler[3] = pm2_int_reg3,
-	.handler[4] = pm2_int_reg4,
-	.handler[5] = pm2_int_reg5,
-};
-
-static struct pm2xxx_irq pm2xxx_charger_irq[] = {
-	{"PM2XXX_IRQ_INT", pm2xxx_irq_int},
-};
-
-static int __maybe_unused pm2xxx_wall_charger_resume(struct device *dev)
-{
-	struct i2c_client *i2c_client = to_i2c_client(dev);
-	struct pm2xxx_charger *pm2;
-
-	pm2 =  (struct pm2xxx_charger *)i2c_get_clientdata(i2c_client);
-	set_lpn_pin(pm2);
-
-	/* If we still have a HW failure, schedule a new check */
-	if (pm2->flags.ovv)
-		queue_delayed_work(pm2->charger_wq,
-				&pm2->check_hw_failure_work, 0);
-
-	return 0;
-}
-
-static int __maybe_unused pm2xxx_wall_charger_suspend(struct device *dev)
-{
-	struct i2c_client *i2c_client = to_i2c_client(dev);
-	struct pm2xxx_charger *pm2;
-
-	pm2 =  (struct pm2xxx_charger *)i2c_get_clientdata(i2c_client);
-	clear_lpn_pin(pm2);
-
-	/* Cancel any pending HW failure check */
-	if (delayed_work_pending(&pm2->check_hw_failure_work))
-		cancel_delayed_work(&pm2->check_hw_failure_work);
-
-	flush_work(&pm2->ac_work);
-	flush_work(&pm2->check_main_thermal_prot_work);
-
-	return 0;
-}
-
-static int __maybe_unused pm2xxx_runtime_suspend(struct device *dev)
-{
-	struct i2c_client *pm2xxx_i2c_client = to_i2c_client(dev);
-	struct pm2xxx_charger *pm2;
-
-	pm2 = (struct pm2xxx_charger *)i2c_get_clientdata(pm2xxx_i2c_client);
-	clear_lpn_pin(pm2);
-
-	return 0;
-}
-
-static int __maybe_unused pm2xxx_runtime_resume(struct device *dev)
-{
-	struct i2c_client *pm2xxx_i2c_client = to_i2c_client(dev);
-	struct pm2xxx_charger *pm2;
-
-	pm2 = (struct pm2xxx_charger *)i2c_get_clientdata(pm2xxx_i2c_client);
-
-	if (gpio_is_valid(pm2->lpn_pin) && gpio_get_value(pm2->lpn_pin) == 0)
-		set_lpn_pin(pm2);
-
-	return 0;
-}
-
-static const struct dev_pm_ops pm2xxx_pm_ops __maybe_unused = {
-	SET_SYSTEM_SLEEP_PM_OPS(pm2xxx_wall_charger_suspend,
-		pm2xxx_wall_charger_resume)
-	SET_RUNTIME_PM_OPS(pm2xxx_runtime_suspend, pm2xxx_runtime_resume, NULL)
-};
-
-static int pm2xxx_wall_charger_probe(struct i2c_client *i2c_client,
-		const struct i2c_device_id *id)
-{
-	struct pm2xxx_platform_data *pl_data = i2c_client->dev.platform_data;
-	struct power_supply_config psy_cfg = {};
-	struct pm2xxx_charger *pm2;
-	int ret = 0;
-	u8 val;
-	int i;
-
-	if (!pl_data) {
-		dev_err(&i2c_client->dev, "No platform data supplied\n");
-		return -EINVAL;
-	}
-
-	pm2 = kzalloc(sizeof(struct pm2xxx_charger), GFP_KERNEL);
-	if (!pm2) {
-		dev_err(&i2c_client->dev, "pm2xxx_charger allocation failed\n");
-		return -ENOMEM;
-	}
-
-	/* get parent data */
-	pm2->dev = &i2c_client->dev;
-
-	pm2->pm2_int = &pm2xxx_int;
-
-	/* get charger spcific platform data */
-	if (!pl_data->wall_charger) {
-		dev_err(pm2->dev, "no charger platform data supplied\n");
-		ret = -EINVAL;
-		goto free_device_info;
-	}
-
-	pm2->pdata = pl_data->wall_charger;
-
-	/* get battery specific platform data */
-	if (!pl_data->battery) {
-		dev_err(pm2->dev, "no battery platform data supplied\n");
-		ret = -EINVAL;
-		goto free_device_info;
-	}
-
-	pm2->bat = pl_data->battery;
-
-	if (!i2c_check_functionality(i2c_client->adapter,
-			I2C_FUNC_SMBUS_BYTE_DATA |
-			I2C_FUNC_SMBUS_READ_WORD_DATA)) {
-		ret = -ENODEV;
-		dev_info(pm2->dev, "pm2301 i2c_check_functionality failed\n");
-		goto free_device_info;
-	}
-
-	pm2->config.pm2xxx_i2c = i2c_client;
-	pm2->config.pm2xxx_id = (struct i2c_device_id *) id;
-	i2c_set_clientdata(i2c_client, pm2);
-
-	/* AC supply */
-	/* power_supply base class */
-	pm2->ac_chg_desc.name = pm2->pdata->label;
-	pm2->ac_chg_desc.type = POWER_SUPPLY_TYPE_MAINS;
-	pm2->ac_chg_desc.properties = pm2xxx_charger_ac_props;
-	pm2->ac_chg_desc.num_properties = ARRAY_SIZE(pm2xxx_charger_ac_props);
-	pm2->ac_chg_desc.get_property = pm2xxx_charger_ac_get_property;
-
-	psy_cfg.supplied_to = pm2->pdata->supplied_to;
-	psy_cfg.num_supplicants = pm2->pdata->num_supplicants;
-	/* pm2xxx_charger sub-class */
-	pm2->ac_chg.ops.enable = &pm2xxx_charger_ac_en;
-	pm2->ac_chg.ops.kick_wd = &pm2xxx_charger_watchdog_kick;
-	pm2->ac_chg.ops.update_curr = &pm2xxx_charger_update_charger_current;
-	pm2->ac_chg.max_out_volt = pm2xxx_charger_voltage_map[
-		ARRAY_SIZE(pm2xxx_charger_voltage_map) - 1];
-	pm2->ac_chg.max_out_curr = pm2xxx_charger_current_map[
-		ARRAY_SIZE(pm2xxx_charger_current_map) - 1];
-	pm2->ac_chg.wdt_refresh = WD_KICK_INTERVAL;
-	pm2->ac_chg.enabled = true;
-	pm2->ac_chg.external = true;
-
-	/* Create a work queue for the charger */
-	pm2->charger_wq = alloc_ordered_workqueue("pm2xxx_charger_wq",
-						  WQ_MEM_RECLAIM);
-	if (pm2->charger_wq == NULL) {
-		ret = -ENOMEM;
-		dev_err(pm2->dev, "failed to create work queue\n");
-		goto free_device_info;
-	}
-
-	/* Init work for charger detection */
-	INIT_WORK(&pm2->ac_work, pm2xxx_charger_ac_work);
-
-	/* Init work for checking HW status */
-	INIT_WORK(&pm2->check_main_thermal_prot_work,
-		pm2xxx_charger_check_main_thermal_prot_work);
-
-	/* Init work for HW failure check */
-	INIT_DEFERRABLE_WORK(&pm2->check_hw_failure_work,
-		pm2xxx_charger_check_hw_failure_work);
-
-	/*
-	 * VDD ADC supply needs to be enabled from this driver when there
-	 * is a charger connected to avoid erroneous BTEMP_HIGH/LOW
-	 * interrupts during charging
-	 */
-	pm2->regu = regulator_get(pm2->dev, "vddadc");
-	if (IS_ERR(pm2->regu)) {
-		ret = PTR_ERR(pm2->regu);
-		dev_err(pm2->dev, "failed to get vddadc regulator\n");
-		goto free_charger_wq;
-	}
-
-	/* Register AC charger class */
-	pm2->ac_chg.psy = power_supply_register(pm2->dev, &pm2->ac_chg_desc,
-						&psy_cfg);
-	if (IS_ERR(pm2->ac_chg.psy)) {
-		dev_err(pm2->dev, "failed to register AC charger\n");
-		ret = PTR_ERR(pm2->ac_chg.psy);
-		goto free_regulator;
-	}
-
-	/* Register interrupts */
-	ret = request_threaded_irq(gpio_to_irq(pm2->pdata->gpio_irq_number),
-				NULL,
-				pm2xxx_charger_irq[0].isr,
-				pm2->pdata->irq_type | IRQF_ONESHOT,
-				pm2xxx_charger_irq[0].name, pm2);
-
-	if (ret != 0) {
-		dev_err(pm2->dev, "failed to request %s IRQ %d: %d\n",
-		pm2xxx_charger_irq[0].name,
-			gpio_to_irq(pm2->pdata->gpio_irq_number), ret);
-		goto unregister_pm2xxx_charger;
-	}
-
-	ret = pm_runtime_set_active(pm2->dev);
-	if (ret)
-		dev_err(pm2->dev, "set active Error\n");
-
-	pm_runtime_enable(pm2->dev);
-	pm_runtime_set_autosuspend_delay(pm2->dev, PM2XXX_AUTOSUSPEND_DELAY);
-	pm_runtime_use_autosuspend(pm2->dev);
-	pm_runtime_resume(pm2->dev);
-
-	/* pm interrupt can wake up system */
-	ret = enable_irq_wake(gpio_to_irq(pm2->pdata->gpio_irq_number));
-	if (ret) {
-		dev_err(pm2->dev, "failed to set irq wake\n");
-		goto unregister_pm2xxx_interrupt;
-	}
-
-	mutex_init(&pm2->lock);
-
-	if (gpio_is_valid(pm2->pdata->lpn_gpio)) {
-		/* get lpn GPIO from platform data */
-		pm2->lpn_pin = pm2->pdata->lpn_gpio;
-
-		/*
-		 * Charger detection mechanism requires pulling up the LPN pin
-		 * while i2c communication if Charger is not connected
-		 * LPN pin of PM2301 is GPIO60 of AB9540
-		 */
-		ret = gpio_request(pm2->lpn_pin, "pm2301_lpm_gpio");
-
-		if (ret < 0) {
-			dev_err(pm2->dev, "pm2301_lpm_gpio request failed\n");
-			goto disable_pm2_irq_wake;
-		}
-		ret = gpio_direction_output(pm2->lpn_pin, 0);
-		if (ret < 0) {
-			dev_err(pm2->dev, "pm2301_lpm_gpio direction failed\n");
-			goto free_gpio;
-		}
-		set_lpn_pin(pm2);
-	}
-
-	/* read  interrupt registers */
-	for (i = 0; i < PM2XXX_NUM_INT_REG; i++)
-		pm2xxx_reg_read(pm2,
-			pm2xxx_interrupt_registers[i],
-			&val);
-
-	ret = pm2xxx_charger_detection(pm2, &val);
-
-	if ((ret == 0) && val) {
-		pm2->ac.charger_connected = 1;
-		ab8500_override_turn_on_stat(~AB8500_POW_KEY_1_ON,
-					     AB8500_MAIN_CH_DET);
-		pm2->ac_conn = true;
-		power_supply_changed(pm2->ac_chg.psy);
-		sysfs_notify(&pm2->ac_chg.psy->dev.kobj, NULL, "present");
-	}
-
-	return 0;
-
-free_gpio:
-	if (gpio_is_valid(pm2->lpn_pin))
-		gpio_free(pm2->lpn_pin);
-disable_pm2_irq_wake:
-	disable_irq_wake(gpio_to_irq(pm2->pdata->gpio_irq_number));
-unregister_pm2xxx_interrupt:
-	/* disable interrupt */
-	free_irq(gpio_to_irq(pm2->pdata->gpio_irq_number), pm2);
-unregister_pm2xxx_charger:
-	/* unregister power supply */
-	power_supply_unregister(pm2->ac_chg.psy);
-free_regulator:
-	/* disable the regulator */
-	regulator_put(pm2->regu);
-free_charger_wq:
-	destroy_workqueue(pm2->charger_wq);
-free_device_info:
-	kfree(pm2);
-
-	return ret;
-}
-
-static int pm2xxx_wall_charger_remove(struct i2c_client *i2c_client)
-{
-	struct pm2xxx_charger *pm2 = i2c_get_clientdata(i2c_client);
-
-	/* Disable pm_runtime */
-	pm_runtime_disable(pm2->dev);
-	/* Disable AC charging */
-	pm2xxx_charger_ac_en(&pm2->ac_chg, false, 0, 0);
-
-	/* Disable wake by pm interrupt */
-	disable_irq_wake(gpio_to_irq(pm2->pdata->gpio_irq_number));
-
-	/* Disable interrupts */
-	free_irq(gpio_to_irq(pm2->pdata->gpio_irq_number), pm2);
-
-	/* Delete the work queue */
-	destroy_workqueue(pm2->charger_wq);
-
-	flush_scheduled_work();
-
-	/* disable the regulator */
-	regulator_put(pm2->regu);
-
-	power_supply_unregister(pm2->ac_chg.psy);
-
-	if (gpio_is_valid(pm2->lpn_pin))
-		gpio_free(pm2->lpn_pin);
-
-	kfree(pm2);
-
-	return 0;
-}
-
-static const struct i2c_device_id pm2xxx_id[] = {
-	{ "pm2301", 0 },
-	{ }
-};
-
-MODULE_DEVICE_TABLE(i2c, pm2xxx_id);
-
-static struct i2c_driver pm2xxx_charger_driver = {
-	.probe = pm2xxx_wall_charger_probe,
-	.remove = pm2xxx_wall_charger_remove,
-	.driver = {
-		.name = "pm2xxx-wall_charger",
-		.pm = IS_ENABLED(CONFIG_PM) ? &pm2xxx_pm_ops : NULL,
-	},
-	.id_table = pm2xxx_id,
-};
-
-static int __init pm2xxx_charger_init(void)
-{
-	return i2c_add_driver(&pm2xxx_charger_driver);
-}
-
-static void __exit pm2xxx_charger_exit(void)
-{
-	i2c_del_driver(&pm2xxx_charger_driver);
-}
-
-device_initcall_sync(pm2xxx_charger_init);
-module_exit(pm2xxx_charger_exit);
-
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Rajkumar kasirajan, Olivier Launay");
-MODULE_DESCRIPTION("PM2xxx charger management driver");
diff --git a/include/linux/pm2301_charger.h b/include/linux/pm2301_charger.h
deleted file mode 100644
index b8fac96f05aa..000000000000
--- a/include/linux/pm2301_charger.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * PM2301 charger driver.
- *
- * Copyright (C) 2012 ST Ericsson Corporation
- *
- * Contact: Olivier LAUNAY (olivier.launay@stericsson.com
- */
-
-#ifndef __LINUX_PM2301_H
-#define __LINUX_PM2301_H
-
-/**
- * struct pm2xxx_bm_charger_parameters - Charger specific parameters
- * @ac_volt_max:	maximum allowed AC charger voltage in mV
- * @ac_curr_max:	maximum allowed AC charger current in mA
- */
-struct pm2xxx_bm_charger_parameters {
-	int ac_volt_max;
-	int ac_curr_max;
-};
-
-/**
- * struct pm2xxx_bm_data - pm2xxx battery management data
- * @enable_overshoot    flag to enable VBAT overshoot control
- * @chg_params	  charger parameters
- */
-struct pm2xxx_bm_data {
-	bool enable_overshoot;
-	const struct pm2xxx_bm_charger_parameters *chg_params;
-};
-
-struct pm2xxx_charger_platform_data {
-	char **supplied_to;
-	size_t num_supplicants;
-	int i2c_bus;
-	const char *label;
-	int gpio_irq_number;
-	unsigned int lpn_gpio;
-	int irq_type;
-};
-
-struct pm2xxx_platform_data {
-	struct pm2xxx_charger_platform_data *wall_charger;
-	struct pm2xxx_bm_data *battery;
-};
-
-#endif /* __LINUX_PM2301_H */
-- 
cgit v1.2.3


From 404e5a12691fe797486475fe28cc0b80cb8bef2c Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Thu, 3 Jun 2021 16:19:39 +0300
Subject: RDMA/mlx4: Do not map the core_clock page to user space unless
 enabled

Currently when mlx4 maps the hca_core_clock page to the user space there
are read-modifiable registers, one of which is semaphore, on this page as
well as the clock counter. If user reads the wrong offset, it can modify
the semaphore and hang the device.

Do not map the hca_core_clock page to the user space unless the device has
been put in a backwards compatibility mode to support this feature.

After this patch, mlx4 core_clock won't be mapped to user space on the
majority of existing devices and the uverbs device time feature in
ibv_query_rt_values_ex() will be disabled.

Fixes: 52033cfb5aab ("IB/mlx4: Add mmap call to map the hardware clock")
Link: https://lore.kernel.org/r/9632304e0d6790af84b3b706d8c18732bc0d5e27.1622726305.git.leonro@nvidia.com
Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/mlx4/main.c         | 5 +----
 drivers/net/ethernet/mellanox/mlx4/fw.c   | 3 +++
 drivers/net/ethernet/mellanox/mlx4/fw.h   | 1 +
 drivers/net/ethernet/mellanox/mlx4/main.c | 6 ++++++
 include/linux/mlx4/device.h               | 1 +
 5 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 22898d97ecbd..16704262fc3a 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -581,12 +581,9 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 	props->cq_caps.max_cq_moderation_count = MLX4_MAX_CQ_COUNT;
 	props->cq_caps.max_cq_moderation_period = MLX4_MAX_CQ_PERIOD;
 
-	if (!mlx4_is_slave(dev->dev))
-		err = mlx4_get_internal_clock_params(dev->dev, &clock_params);
-
 	if (uhw->outlen >= resp.response_length + sizeof(resp.hca_core_clock_offset)) {
 		resp.response_length += sizeof(resp.hca_core_clock_offset);
-		if (!err && !mlx4_is_slave(dev->dev)) {
+		if (!mlx4_get_internal_clock_params(dev->dev, &clock_params)) {
 			resp.comp_mask |= MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET;
 			resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE;
 		}
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index f6cfec81ccc3..dc4ac1a2b6b6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -823,6 +823,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MAD_DEMUX_OFFSET		0xb0
 #define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET	0xa8
 #define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET	0xac
+#define QUERY_DEV_CAP_MAP_CLOCK_TO_USER 0xc1
 #define QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET	0xcc
 #define QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET	0xd0
 #define QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET	0xd2
@@ -841,6 +842,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 
 	if (mlx4_is_mfunc(dev))
 		disable_unsupported_roce_caps(outbox);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAP_CLOCK_TO_USER);
+	dev_cap->map_clock_to_user = field & 0x80;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
 	dev_cap->reserved_qps = 1 << (field & 0xf);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 8f020f26ebf5..cf64e54eecb0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -131,6 +131,7 @@ struct mlx4_dev_cap {
 	u32 health_buffer_addrs;
 	struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1];
 	bool wol_port[MLX4_MAX_PORTS + 1];
+	bool map_clock_to_user;
 };
 
 struct mlx4_func_cap {
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index c326b434734e..00c84656b2e7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -498,6 +498,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		}
 	}
 
+	dev->caps.map_clock_to_user  = dev_cap->map_clock_to_user;
 	dev->caps.uar_page_size	     = PAGE_SIZE;
 	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
 	dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay;
@@ -1948,6 +1949,11 @@ int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
 	if (mlx4_is_slave(dev))
 		return -EOPNOTSUPP;
 
+	if (!dev->caps.map_clock_to_user) {
+		mlx4_dbg(dev, "Map clock to user is not supported.\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (!params)
 		return -EINVAL;
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 236a7d04f891..30bb59fe970c 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -630,6 +630,7 @@ struct mlx4_caps {
 	bool			wol_port[MLX4_MAX_PORTS + 1];
 	struct mlx4_rate_limit_caps rl_caps;
 	u32			health_buffer_addrs;
+	bool			map_clock_to_user;
 };
 
 struct mlx4_buf_list {
-- 
cgit v1.2.3


From b892770a2c553fd905ebd3ced55d5a437669b54d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 18 May 2021 14:25:46 +0300
Subject: iio: Drop Duplicated "mount-matrix" parameter

All of the users of iio_read_mount_matrix() are using the very same
property name. Moreover, the property name is hard coded in the API
documentation.

Make this clear and avoid duplication now and in the future.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Sean Nyekjaer <sean@geanix.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210518112546.44592-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/bma180.c                       | 3 +--
 drivers/iio/accel/bma400_core.c                  | 2 +-
 drivers/iio/accel/bmc150-accel-core.c            | 3 +--
 drivers/iio/accel/fxls8962af-core.c              | 2 +-
 drivers/iio/accel/kxcjk-1013.c                   | 3 +--
 drivers/iio/accel/kxsd9.c                        | 2 +-
 drivers/iio/gyro/bmg160_core.c                   | 3 +--
 drivers/iio/gyro/itg3200_core.c                  | 3 +--
 drivers/iio/gyro/mpu3050-core.c                  | 2 +-
 drivers/iio/imu/bmi160/bmi160_core.c             | 3 +--
 drivers/iio/imu/inv_icm42600/inv_icm42600_core.c | 2 +-
 drivers/iio/imu/inv_mpu6050/inv_mpu_core.c       | 3 +--
 drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c     | 2 +-
 drivers/iio/industrialio-core.c                  | 7 ++-----
 drivers/iio/magnetometer/ak8974.c                | 3 +--
 drivers/iio/magnetometer/ak8975.c                | 2 +-
 drivers/iio/magnetometer/bmc150_magn.c           | 3 +--
 drivers/iio/magnetometer/hmc5843_core.c          | 3 +--
 drivers/iio/magnetometer/yamaha-yas530.c         | 2 +-
 include/linux/iio/iio.h                          | 3 +--
 20 files changed, 21 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/bma180.c b/drivers/iio/accel/bma180.c
index 71c76bbd81d4..97e991581960 100644
--- a/drivers/iio/accel/bma180.c
+++ b/drivers/iio/accel/bma180.c
@@ -1001,8 +1001,7 @@ static int bma180_probe(struct i2c_client *client,
 		chip = id->driver_data;
 	data->part_info = &bma180_part_info[chip];
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix",
-				&data->orientation);
+	ret = iio_read_mount_matrix(dev, &data->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/accel/bma400_core.c b/drivers/iio/accel/bma400_core.c
index 7eeba80e32cb..21520e022a21 100644
--- a/drivers/iio/accel/bma400_core.c
+++ b/drivers/iio/accel/bma400_core.c
@@ -811,7 +811,7 @@ int bma400_probe(struct device *dev, struct regmap *regmap, const char *name)
 	if (ret)
 		return ret;
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix", &data->orientation);
+	ret = iio_read_mount_matrix(dev, &data->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c
index 43cfadf8f6b7..46ab7675186c 100644
--- a/drivers/iio/accel/bmc150-accel-core.c
+++ b/drivers/iio/accel/bmc150-accel-core.c
@@ -1685,8 +1685,7 @@ int bmc150_accel_core_probe(struct device *dev, struct regmap *regmap, int irq,
 	data->regmap = regmap;
 
 	if (!bmc150_apply_acpi_orientation(dev, &data->orientation)) {
-		ret = iio_read_mount_matrix(dev, "mount-matrix",
-					     &data->orientation);
+		ret = iio_read_mount_matrix(dev, &data->orientation);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/iio/accel/fxls8962af-core.c b/drivers/iio/accel/fxls8962af-core.c
index 9fe5a18a605c..078d87865fde 100644
--- a/drivers/iio/accel/fxls8962af-core.c
+++ b/drivers/iio/accel/fxls8962af-core.c
@@ -862,7 +862,7 @@ int fxls8962af_core_probe(struct device *dev, struct regmap *regmap, int irq)
 	dev_set_drvdata(dev, indio_dev);
 	data->regmap = regmap;
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix", &data->orientation);
+	ret = iio_read_mount_matrix(dev, &data->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/accel/kxcjk-1013.c b/drivers/iio/accel/kxcjk-1013.c
index 7cd647315194..a51fdd3c9b5b 100644
--- a/drivers/iio/accel/kxcjk-1013.c
+++ b/drivers/iio/accel/kxcjk-1013.c
@@ -1455,8 +1455,7 @@ static int kxcjk1013_probe(struct i2c_client *client,
 	} else {
 		data->active_high_intr = true; /* default polarity */
 
-		ret = iio_read_mount_matrix(&client->dev, "mount-matrix",
-					    &data->orientation);
+		ret = iio_read_mount_matrix(&client->dev, &data->orientation);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/iio/accel/kxsd9.c b/drivers/iio/accel/kxsd9.c
index 0e18b92e2099..bf7ed9e7d00f 100644
--- a/drivers/iio/accel/kxsd9.c
+++ b/drivers/iio/accel/kxsd9.c
@@ -420,7 +420,7 @@ int kxsd9_common_probe(struct device *dev,
 	indio_dev->available_scan_masks = kxsd9_scan_masks;
 
 	/* Read the mounting matrix, if present */
-	ret = iio_read_mount_matrix(dev, "mount-matrix", &st->orientation);
+	ret = iio_read_mount_matrix(dev, &st->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/gyro/bmg160_core.c b/drivers/iio/gyro/bmg160_core.c
index a7cc2cad8bbf..17b939a367ad 100644
--- a/drivers/iio/gyro/bmg160_core.c
+++ b/drivers/iio/gyro/bmg160_core.c
@@ -1106,8 +1106,7 @@ int bmg160_core_probe(struct device *dev, struct regmap *regmap, int irq,
 	if (ret)
 		return ret;
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix",
-				&data->orientation);
+	ret = iio_read_mount_matrix(dev, &data->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/gyro/itg3200_core.c b/drivers/iio/gyro/itg3200_core.c
index e9804664db73..a7f1bbb5f289 100644
--- a/drivers/iio/gyro/itg3200_core.c
+++ b/drivers/iio/gyro/itg3200_core.c
@@ -308,8 +308,7 @@ static int itg3200_probe(struct i2c_client *client,
 
 	st = iio_priv(indio_dev);
 
-	ret = iio_read_mount_matrix(&client->dev, "mount-matrix",
-				&st->orientation);
+	ret = iio_read_mount_matrix(&client->dev, &st->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/gyro/mpu3050-core.c b/drivers/iio/gyro/mpu3050-core.c
index 2b930c7f4d86..3225de1f023b 100644
--- a/drivers/iio/gyro/mpu3050-core.c
+++ b/drivers/iio/gyro/mpu3050-core.c
@@ -1164,7 +1164,7 @@ int mpu3050_common_probe(struct device *dev,
 	mpu3050->divisor = 99;
 
 	/* Read the mounting matrix, if present */
-	ret = iio_read_mount_matrix(dev, "mount-matrix", &mpu3050->orientation);
+	ret = iio_read_mount_matrix(dev, &mpu3050->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/imu/bmi160/bmi160_core.c b/drivers/iio/imu/bmi160/bmi160_core.c
index b63bd7e5e5e5..824b5124a5f5 100644
--- a/drivers/iio/imu/bmi160/bmi160_core.c
+++ b/drivers/iio/imu/bmi160/bmi160_core.c
@@ -852,8 +852,7 @@ int bmi160_core_probe(struct device *dev, struct regmap *regmap,
 		return ret;
 	}
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix",
-				    &data->orientation);
+	ret = iio_read_mount_matrix(dev, &data->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c
index 8bd77185ccb7..86858da9cc38 100644
--- a/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c
+++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_core.c
@@ -592,7 +592,7 @@ int inv_icm42600_core_probe(struct regmap *regmap, int chip, int irq,
 	st->chip = chip;
 	st->map = regmap;
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix", &st->orientation);
+	ret = iio_read_mount_matrix(dev, &st->orientation);
 	if (ret) {
 		dev_err(dev, "failed to retrieve mounting matrix %d\n", ret);
 		return ret;
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
index 6244a07048df..64704b55f6eb 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_core.c
@@ -1455,8 +1455,7 @@ int inv_mpu_core_probe(struct regmap *regmap, int irq, const char *name,
 
 	pdata = dev_get_platdata(dev);
 	if (!pdata) {
-		result = iio_read_mount_matrix(dev, "mount-matrix",
-					       &st->orientation);
+		result = iio_read_mount_matrix(dev, &st->orientation);
 		if (result) {
 			dev_err(dev, "Failed to retrieve mounting matrix %d\n",
 				result);
diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
index e8d242ee6743..db45f1fc0b81 100644
--- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
+++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
@@ -2256,7 +2256,7 @@ int st_lsm6dsx_probe(struct device *dev, int irq, int hw_id,
 			return err;
 	}
 
-	err = iio_read_mount_matrix(hw->dev, "mount-matrix", &hw->orientation);
+	err = iio_read_mount_matrix(hw->dev, &hw->orientation);
 	if (err)
 		return err;
 
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 75e92bac78f3..6d2175eb7af2 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -617,7 +617,6 @@ EXPORT_SYMBOL_GPL(iio_show_mount_matrix);
  * iio_read_mount_matrix() - retrieve iio device mounting matrix from
  *                           device "mount-matrix" property
  * @dev:	device the mounting matrix property is assigned to
- * @propname:	device specific mounting matrix property name
  * @matrix:	where to store retrieved matrix
  *
  * If device is assigned no mounting matrix property, a default 3x3 identity
@@ -625,14 +624,12 @@ EXPORT_SYMBOL_GPL(iio_show_mount_matrix);
  *
  * Return: 0 if success, or a negative error code on failure.
  */
-int iio_read_mount_matrix(struct device *dev, const char *propname,
-			  struct iio_mount_matrix *matrix)
+int iio_read_mount_matrix(struct device *dev, struct iio_mount_matrix *matrix)
 {
 	size_t len = ARRAY_SIZE(iio_mount_idmatrix.rotation);
 	int err;
 
-	err = device_property_read_string_array(dev, propname,
-						matrix->rotation, len);
+	err = device_property_read_string_array(dev, "mount-matrix", matrix->rotation, len);
 	if (err == len)
 		return 0;
 
diff --git a/drivers/iio/magnetometer/ak8974.c b/drivers/iio/magnetometer/ak8974.c
index 24b2f7b1fe44..e54feacfb980 100644
--- a/drivers/iio/magnetometer/ak8974.c
+++ b/drivers/iio/magnetometer/ak8974.c
@@ -833,8 +833,7 @@ static int ak8974_probe(struct i2c_client *i2c,
 	ak8974->i2c = i2c;
 	mutex_init(&ak8974->lock);
 
-	ret = iio_read_mount_matrix(&i2c->dev, "mount-matrix",
-				    &ak8974->orientation);
+	ret = iio_read_mount_matrix(&i2c->dev, &ak8974->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/magnetometer/ak8975.c b/drivers/iio/magnetometer/ak8975.c
index d988b6ac3659..42b8a2680e3a 100644
--- a/drivers/iio/magnetometer/ak8975.c
+++ b/drivers/iio/magnetometer/ak8975.c
@@ -890,7 +890,7 @@ static int ak8975_probe(struct i2c_client *client,
 	data->reset_gpiod = reset_gpiod;
 	data->eoc_irq = 0;
 
-	err = iio_read_mount_matrix(&client->dev, "mount-matrix", &data->orientation);
+	err = iio_read_mount_matrix(&client->dev, &data->orientation);
 	if (err)
 		return err;
 
diff --git a/drivers/iio/magnetometer/bmc150_magn.c b/drivers/iio/magnetometer/bmc150_magn.c
index 5f28220a7994..f96f53175349 100644
--- a/drivers/iio/magnetometer/bmc150_magn.c
+++ b/drivers/iio/magnetometer/bmc150_magn.c
@@ -890,8 +890,7 @@ int bmc150_magn_probe(struct device *dev, struct regmap *regmap,
 	if (ret)
 		return dev_err_probe(dev, ret, "failed to get regulators\n");
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix",
-				&data->orientation);
+	ret = iio_read_mount_matrix(dev, &data->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/magnetometer/hmc5843_core.c b/drivers/iio/magnetometer/hmc5843_core.c
index 221563e0c18f..cf62057480cf 100644
--- a/drivers/iio/magnetometer/hmc5843_core.c
+++ b/drivers/iio/magnetometer/hmc5843_core.c
@@ -637,8 +637,7 @@ int hmc5843_common_probe(struct device *dev, struct regmap *regmap,
 	data->variant = &hmc5843_chip_info_tbl[id];
 	mutex_init(&data->lock);
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix",
-				&data->orientation);
+	ret = iio_read_mount_matrix(dev, &data->orientation);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/magnetometer/yamaha-yas530.c b/drivers/iio/magnetometer/yamaha-yas530.c
index 2f2f8cb3c26c..9ff7b0e56cf6 100644
--- a/drivers/iio/magnetometer/yamaha-yas530.c
+++ b/drivers/iio/magnetometer/yamaha-yas530.c
@@ -831,7 +831,7 @@ static int yas5xx_probe(struct i2c_client *i2c,
 	yas5xx->dev = dev;
 	mutex_init(&yas5xx->lock);
 
-	ret = iio_read_mount_matrix(dev, "mount-matrix", &yas5xx->orientation);
+	ret = iio_read_mount_matrix(dev, &yas5xx->orientation);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 5606a3f4c4cb..324561b7a5e8 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -127,8 +127,7 @@ struct iio_mount_matrix {
 
 ssize_t iio_show_mount_matrix(struct iio_dev *indio_dev, uintptr_t priv,
 			      const struct iio_chan_spec *chan, char *buf);
-int iio_read_mount_matrix(struct device *dev, const char *propname,
-			  struct iio_mount_matrix *matrix);
+int iio_read_mount_matrix(struct device *dev, struct iio_mount_matrix *matrix);
 
 typedef const struct iio_mount_matrix *
 	(iio_get_mount_matrix_t)(const struct iio_dev *indio_dev,
-- 
cgit v1.2.3


From 42ef8aa2263b19b06e69a318dbd8f1639013ded3 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 19 May 2021 01:07:18 +0200
Subject: iio: st_sensors: Create extended attr macro

Extend ST_SENSORS_LSM_CHANNELS() to a version that will accept extended
attributes named ST_SENSORS_LSM_CHANNELS_EXT() and wrap the former as a
specialized version of the former.

Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Denis Ciocca <denis.ciocca@st.com>
Cc: Daniel Drake <drake@endlessm.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20210518230722.522446-1-linus.walleij@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/common/st_sensors.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index 0b9aeb479f48..8e0d76b42db9 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -48,8 +48,8 @@
 #define ST_SENSORS_MAX_NAME			17
 #define ST_SENSORS_MAX_4WAI			8
 
-#define ST_SENSORS_LSM_CHANNELS(device_type, mask, index, mod, \
-					ch2, s, endian, rbits, sbits, addr) \
+#define ST_SENSORS_LSM_CHANNELS_EXT(device_type, mask, index, mod, \
+				    ch2, s, endian, rbits, sbits, addr, ext) \
 { \
 	.type = device_type, \
 	.modified = mod, \
@@ -65,8 +65,14 @@
 		.storagebits = sbits, \
 		.endianness = endian, \
 	}, \
+	.ext_info = ext, \
 }
 
+#define ST_SENSORS_LSM_CHANNELS(device_type, mask, index, mod, \
+				ch2, s, endian, rbits, sbits, addr)	\
+	ST_SENSORS_LSM_CHANNELS_EXT(device_type, mask, index, mod,	\
+				    ch2, s, endian, rbits, sbits, addr, NULL)
+
 #define ST_SENSORS_DEV_ATTR_SAMP_FREQ_AVAIL() \
 		IIO_DEV_ATTR_SAMP_FREQ_AVAIL( \
 			st_sensors_sysfs_sampling_frequency_avail)
-- 
cgit v1.2.3


From 3d8ad94bb175c2de7200569bb706d67c45903838 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 19 May 2021 01:07:19 +0200
Subject: iio: accel: st_sensors: Support generic mounting matrix

The ST accelerators support a special type of quirky mounting matrix found
in ACPI systems, but not a generic mounting matrix such as from the device
tree.

Augment the ACPI hack to be a bit more generic and accept a mounting
matrix from device properties.

This makes it possible to fix orientation on the Ux500 HREF device.

Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Denis Ciocca <denis.ciocca@st.com>
Cc: Daniel Drake <drake@endlessm.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20210518230722.522446-2-linus.walleij@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/st_accel_core.c     | 112 ++++++++++++++++++----------------
 include/linux/iio/common/st_sensors.h |   4 +-
 2 files changed, 62 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c
index dc32ebefe3fc..9abcebf767b1 100644
--- a/drivers/iio/accel/st_accel_core.c
+++ b/drivers/iio/accel/st_accel_core.c
@@ -41,51 +41,74 @@
 #define ST_ACCEL_FS_AVL_200G			200
 #define ST_ACCEL_FS_AVL_400G			400
 
+static const struct iio_mount_matrix *
+st_accel_get_mount_matrix(const struct iio_dev *indio_dev,
+			  const struct iio_chan_spec *chan)
+{
+	struct st_sensor_data *adata = iio_priv(indio_dev);
+
+	return &adata->mount_matrix;
+}
+
+static const struct iio_chan_spec_ext_info st_accel_mount_matrix_ext_info[] = {
+	IIO_MOUNT_MATRIX(IIO_SHARED_BY_ALL, st_accel_get_mount_matrix),
+	{ }
+};
+
 static const struct iio_chan_spec st_accel_8bit_channels[] = {
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 8, 8,
-			ST_ACCEL_DEFAULT_OUT_X_L_ADDR+1),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			ST_ACCEL_DEFAULT_OUT_X_L_ADDR+1,
+			st_accel_mount_matrix_ext_info),
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 8, 8,
-			ST_ACCEL_DEFAULT_OUT_Y_L_ADDR+1),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			ST_ACCEL_DEFAULT_OUT_Y_L_ADDR+1,
+			st_accel_mount_matrix_ext_info),
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 8, 8,
-			ST_ACCEL_DEFAULT_OUT_Z_L_ADDR+1),
+			ST_ACCEL_DEFAULT_OUT_Z_L_ADDR+1,
+			st_accel_mount_matrix_ext_info),
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
 static const struct iio_chan_spec st_accel_12bit_channels[] = {
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 12, 16,
-			ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			ST_ACCEL_DEFAULT_OUT_X_L_ADDR,
+			st_accel_mount_matrix_ext_info),
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 12, 16,
-			ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			ST_ACCEL_DEFAULT_OUT_Y_L_ADDR,
+			st_accel_mount_matrix_ext_info),
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 12, 16,
-			ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
+			ST_ACCEL_DEFAULT_OUT_Z_L_ADDR,
+			st_accel_mount_matrix_ext_info),
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
 static const struct iio_chan_spec st_accel_16bit_channels[] = {
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
-			ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			ST_ACCEL_DEFAULT_OUT_X_L_ADDR,
+			st_accel_mount_matrix_ext_info),
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
-			ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
-	ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+			ST_ACCEL_DEFAULT_OUT_Y_L_ADDR,
+			st_accel_mount_matrix_ext_info),
+	ST_SENSORS_LSM_CHANNELS_EXT(IIO_ACCEL,
 			BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
 			ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
-			ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
+			ST_ACCEL_DEFAULT_OUT_Z_L_ADDR,
+			st_accel_mount_matrix_ext_info),
 	IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
@@ -1162,25 +1185,10 @@ static const struct iio_trigger_ops st_accel_trigger_ops = {
 #endif
 
 #ifdef CONFIG_ACPI
-static const struct iio_mount_matrix *
-get_mount_matrix(const struct iio_dev *indio_dev,
-		 const struct iio_chan_spec *chan)
-{
-	struct st_sensor_data *adata = iio_priv(indio_dev);
-
-	return adata->mount_matrix;
-}
-
-static const struct iio_chan_spec_ext_info mount_matrix_ext_info[] = {
-	IIO_MOUNT_MATRIX(IIO_SHARED_BY_ALL, get_mount_matrix),
-	{ },
-};
-
 /* Read ST-specific _ONT orientation data from ACPI and generate an
  * appropriate mount matrix.
  */
-static int apply_acpi_orientation(struct iio_dev *indio_dev,
-				  struct iio_chan_spec *channels)
+static int apply_acpi_orientation(struct iio_dev *indio_dev)
 {
 	struct st_sensor_data *adata = iio_priv(indio_dev);
 	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
@@ -1269,14 +1277,6 @@ static int apply_acpi_orientation(struct iio_dev *indio_dev,
 	}
 
 	/* Convert our integer matrix to a string-based iio_mount_matrix */
-	adata->mount_matrix = devm_kmalloc(&indio_dev->dev,
-					   sizeof(*adata->mount_matrix),
-					   GFP_KERNEL);
-	if (!adata->mount_matrix) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	for (i = 0; i < 3; i++) {
 		for (j = 0; j < 3; j++) {
 			int matrix_val = final_ont[i][j];
@@ -1295,26 +1295,25 @@ static int apply_acpi_orientation(struct iio_dev *indio_dev,
 			default:
 				goto out;
 			}
-			adata->mount_matrix->rotation[i * 3 + j] = str_value;
+			adata->mount_matrix.rotation[i * 3 + j] = str_value;
 		}
 	}
 
-	/* Expose the mount matrix via ext_info */
-	for (i = 0; i < indio_dev->num_channels; i++)
-		channels[i].ext_info = mount_matrix_ext_info;
-
 	ret = 0;
 	dev_info(&indio_dev->dev, "computed mount matrix from ACPI\n");
 
 out:
 	kfree(buffer.pointer);
+	if (ret)
+		dev_dbg(&indio_dev->dev,
+			"failed to apply ACPI orientation data: %d\n", ret);
+
 	return ret;
 }
 #else /* !CONFIG_ACPI */
-static int apply_acpi_orientation(struct iio_dev *indio_dev,
-				  struct iio_chan_spec *channels)
+static int apply_acpi_orientation(struct iio_dev *indio_dev)
 {
-	return 0;
+	return -EINVAL;
 }
 #endif
 
@@ -1361,9 +1360,16 @@ int st_accel_common_probe(struct iio_dev *indio_dev)
 	if (!channels)
 		return -ENOMEM;
 
-	if (apply_acpi_orientation(indio_dev, channels))
-		dev_warn(&indio_dev->dev,
-			 "failed to apply ACPI orientation data: %d\n", err);
+	/*
+	 * First try specific ACPI methods to retrieve orientation then try the
+	 * generic function.
+	 */
+	err = apply_acpi_orientation(indio_dev);
+	if (err) {
+		err = iio_read_mount_matrix(adata->dev, &adata->mount_matrix);
+		if (err)
+			return err;
+	}
 
 	indio_dev->channels = channels;
 	adata->current_fullscale = &adata->sensor_settings->fs.fs_avl[0];
diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index 8e0d76b42db9..8bdbaf3f3796 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -13,6 +13,7 @@
 #include <linux/i2c.h>
 #include <linux/spi/spi.h>
 #include <linux/irqreturn.h>
+#include <linux/iio/iio.h>
 #include <linux/iio/trigger.h>
 #include <linux/bitops.h>
 #include <linux/regulator/consumer.h>
@@ -221,6 +222,7 @@ struct st_sensor_settings {
  * struct st_sensor_data - ST sensor device status
  * @dev: Pointer to instance of struct device (I2C or SPI).
  * @trig: The trigger in use by the core driver.
+ * @mount_matrix: The mounting matrix of the sensor.
  * @sensor_settings: Pointer to the specific sensor settings in use.
  * @current_fullscale: Maximum range of measure by the sensor.
  * @vdd: Pointer to sensor's Vdd power supply
@@ -240,7 +242,7 @@ struct st_sensor_settings {
 struct st_sensor_data {
 	struct device *dev;
 	struct iio_trigger *trig;
-	struct iio_mount_matrix *mount_matrix;
+	struct iio_mount_matrix mount_matrix;
 	struct st_sensor_settings *sensor_settings;
 	struct st_sensor_fullscale_avl *current_fullscale;
 	struct regulator *vdd;
-- 
cgit v1.2.3


From 490dcecabbf93e705006af498fa6815251404a54 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 19 May 2021 10:18:25 -0700
Subject: mlx5: count all link events

mlx5 devices were observed generating MLX5_PORT_CHANGE_SUBTYPE_ACTIVE
events without an intervening MLX5_PORT_CHANGE_SUBTYPE_DOWN. This
breaks link flap detection based on Linux carrier state transition
count as netif_carrier_on() does nothing if carrier is already on.
Make sure we count such events.

netif_carrier_event() increments the counters and fires the linkwatch
events. The latter is not necessary for the use case but seems like
the right thing to do.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  6 +++++-
 include/linux/netdevice.h                         |  2 +-
 net/sched/sch_generic.c                           | 18 ++++++++++++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ad0f69480b9c..e36d0c6a08db 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -91,12 +91,16 @@ void mlx5e_update_carrier(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u8 port_state;
+	bool up;
 
 	port_state = mlx5_query_vport_state(mdev,
 					    MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT,
 					    0);
 
-	if (port_state == VPORT_STATE_UP) {
+	up = port_state == VPORT_STATE_UP;
+	if (up == netif_carrier_ok(priv->netdev))
+		netif_carrier_event(priv->netdev);
+	if (up) {
 		netdev_info(priv->netdev, "Link up\n");
 		netif_carrier_on(priv->netdev);
 	} else {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5cbc950b34df..be1dcceda5e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4187,8 +4187,8 @@ unsigned long dev_trans_start(struct net_device *dev);
 void __netdev_watchdog_up(struct net_device *dev);
 
 void netif_carrier_on(struct net_device *dev);
-
 void netif_carrier_off(struct net_device *dev);
+void netif_carrier_event(struct net_device *dev);
 
 /**
  *	netif_dormant_on - mark device as dormant.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index fc8b56bcabf3..e9c0afc8becc 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -540,6 +540,24 @@ void netif_carrier_off(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_carrier_off);
 
+/**
+ *	netif_carrier_event - report carrier state event
+ *	@dev: network device
+ *
+ * Device has detected a carrier event but the carrier state wasn't changed.
+ * Use in drivers when querying carrier state asynchronously, to avoid missing
+ * events (link flaps) if link recovers before it's queried.
+ */
+void netif_carrier_event(struct net_device *dev)
+{
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		return;
+	atomic_inc(&dev->carrier_up_count);
+	atomic_inc(&dev->carrier_down_count);
+	linkwatch_fire_event(dev);
+}
+EXPORT_SYMBOL_GPL(netif_carrier_event);
+
 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
    under all circumstances. It is difficult to invent anything faster or
    cheaper.
-- 
cgit v1.2.3


From b81017aeee4eb9159296cdb68889932649317b9b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Jun 2021 19:20:11 +0300
Subject: net: pcs: xpcs: delete shim definition for mdio_xpcs_get_ops()

CONFIG_STMMAC_ETH selects CONFIG_PCS_XPCS, so there should be no
situation where the shim should be needed.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pcs/pcs-xpcs.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 5938ced805f4..c4d0a2c469c7 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -36,13 +36,6 @@ struct mdio_xpcs_ops {
 			  int enable);
 };
 
-#if IS_ENABLED(CONFIG_PCS_XPCS)
 struct mdio_xpcs_ops *mdio_xpcs_get_ops(void);
-#else
-static inline struct mdio_xpcs_ops *mdio_xpcs_get_ops(void)
-{
-	return NULL;
-}
-#endif
 
 #endif /* __LINUX_PCS_XPCS_H */
-- 
cgit v1.2.3


From 9900074ecccec472c9d89929c3d37c235f45d33a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Jun 2021 19:20:13 +0300
Subject: net: pcs: xpcs: make the checks related to the PHY interface mode
 stateless

The operating mode of the driver is currently to populate its
struct mdio_xpcs_args::supported and struct mdio_xpcs_args::an_mode
statically in xpcs_probe(), based on the passed phy_interface_t,
and work with those.

However this is not the operation that phylink expects from a PCS
driver, because the port might be attached to an SFP cage that triggers
changes of the phy_interface_t dynamically as one SFP module is
unpluggged and another is plugged.

To migrate towards that model, the struct mdio_xpcs_args should not
cache anything related to the phy_interface_t, but just look up the
statically defined, const struct xpcs_compat structure corresponding to
the detected PCS OUI/model number.

So we delete the "supported" and "an_mode" members of struct
mdio_xpcs_args, and add the "id" structure there (since the ID is not
expected to change at runtime).

Since xpcs->supported is used deep in the code in _xpcs_config_aneg_c73(),
we need to modify some function headers to pass the xpcs_compat from all
callers. In turn, the xpcs_compat is always supplied externally to the
xpcs module:
- Most of the time by phylink
- In xpcs_probe() it is needed because xpcs_soft_reset() writes to
  MDIO_MMD_PCS or to MDIO_MMD_VEND2 depending on whether an_mode is clause
  37 or clause 73. In order to not introduce functional changes related
  to when the soft reset is issued, we continue to require the initial
  phy_interface_t argument to be passed to xpcs_probe() so we can pass
  this on to xpcs_soft_reset().
- stmmac_open() wants to know whether to call stmmac_init_phy() or not,
  and for that it looks inside xpcs->an_mode, because the clause 73
  (backplane) AN modes supposedly do not have a PHY. Because we moved
  an_mode outside of struct mdio_xpcs_args, this is now no longer
  directly possible, so we introduce a helper function xpcs_get_an_mode()
  which protects the data encapsulation of the xpcs module and requires
  a phy_interface_t to be passed as argument. This function can look up
  the appropriate compat based on the phy_interface_t.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |   4 +-
 drivers/net/pcs/pcs-xpcs.c                        | 175 ++++++++++++++--------
 include/linux/pcs/pcs-xpcs.h                      |   6 +-
 3 files changed, 120 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 13720bf6f6ff..c96a89fa4e3c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3638,6 +3638,7 @@ static int stmmac_request_irq(struct net_device *dev)
 int stmmac_open(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
+	int mode = priv->plat->phy_interface;
 	int bfsize = 0;
 	u32 chan;
 	int ret;
@@ -3650,7 +3651,8 @@ int stmmac_open(struct net_device *dev)
 
 	if (priv->hw->pcs != STMMAC_PCS_TBI &&
 	    priv->hw->pcs != STMMAC_PCS_RTBI &&
-	    priv->hw->xpcs_args.an_mode != DW_AN_C73) {
+	    (!priv->hw->xpcs ||
+	     xpcs_get_an_mode(&priv->hw->xpcs_args, mode) != DW_AN_C73)) {
 		ret = stmmac_init_phy(dev);
 		if (ret) {
 			netdev_err(priv->dev,
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 9f2da9e873c4..610073cb55d0 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -195,6 +195,49 @@ struct xpcs_id {
 	const struct xpcs_compat *compat;
 };
 
+static const struct xpcs_compat *xpcs_find_compat(const struct xpcs_id *id,
+						  phy_interface_t interface)
+{
+	int i, j;
+
+	for (i = 0; i < DW_XPCS_INTERFACE_MAX; i++) {
+		const struct xpcs_compat *compat = &id->compat[i];
+
+		for (j = 0; j < compat->num_interfaces; j++)
+			if (compat->interface[j] == interface)
+				return compat;
+	}
+
+	return NULL;
+}
+
+int xpcs_get_an_mode(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
+{
+	const struct xpcs_compat *compat;
+
+	compat = xpcs_find_compat(xpcs->id, interface);
+	if (!compat)
+		return -ENODEV;
+
+	return compat->an_mode;
+}
+EXPORT_SYMBOL_GPL(xpcs_get_an_mode);
+
+static bool __xpcs_linkmode_supported(const struct xpcs_compat *compat,
+				      enum ethtool_link_mode_bit_indices linkmode)
+{
+	int i;
+
+	for (i = 0; compat->supported[i] != __ETHTOOL_LINK_MODE_MASK_NBITS; i++)
+		if (compat->supported[i] == linkmode)
+			return true;
+
+	return false;
+}
+
+#define xpcs_linkmode_supported(compat, mode) \
+	__xpcs_linkmode_supported(compat, ETHTOOL_LINK_MODE_ ## mode ## _BIT)
+
 static int xpcs_read(struct mdio_xpcs_args *xpcs, int dev, u32 reg)
 {
 	u32 reg_addr = MII_ADDR_C45 | dev << 16 | reg;
@@ -246,11 +289,12 @@ static int xpcs_poll_reset(struct mdio_xpcs_args *xpcs, int dev)
 	return (ret & MDIO_CTRL1_RESET) ? -ETIMEDOUT : 0;
 }
 
-static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs)
+static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs,
+			   const struct xpcs_compat *compat)
 {
 	int ret, dev;
 
-	switch (xpcs->an_mode) {
+	switch (compat->an_mode) {
 	case DW_AN_C73:
 		dev = MDIO_MMD_PCS;
 		break;
@@ -419,7 +463,8 @@ static int xpcs_config_usxgmii(struct mdio_xpcs_args *xpcs, int speed)
 	return xpcs_write_vpcs(xpcs, MDIO_CTRL1, ret | DW_USXGMII_RST);
 }
 
-static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
+static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs,
+				 const struct xpcs_compat *compat)
 {
 	int ret, adv;
 
@@ -431,7 +476,7 @@ static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
 
 	/* SR_AN_ADV3 */
 	adv = 0;
-	if (phylink_test(xpcs->supported, 2500baseX_Full))
+	if (xpcs_linkmode_supported(compat, 2500baseX_Full))
 		adv |= DW_C73_2500KX;
 
 	/* TODO: 5000baseKR */
@@ -442,11 +487,11 @@ static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
 
 	/* SR_AN_ADV2 */
 	adv = 0;
-	if (phylink_test(xpcs->supported, 1000baseKX_Full))
+	if (xpcs_linkmode_supported(compat, 1000baseKX_Full))
 		adv |= DW_C73_1000KX;
-	if (phylink_test(xpcs->supported, 10000baseKX4_Full))
+	if (xpcs_linkmode_supported(compat, 10000baseKX4_Full))
 		adv |= DW_C73_10000KX4;
-	if (phylink_test(xpcs->supported, 10000baseKR_Full))
+	if (xpcs_linkmode_supported(compat, 10000baseKR_Full))
 		adv |= DW_C73_10000KR;
 
 	ret = xpcs_write(xpcs, MDIO_MMD_AN, DW_SR_AN_ADV2, adv);
@@ -455,19 +500,20 @@ static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
 
 	/* SR_AN_ADV1 */
 	adv = DW_C73_AN_ADV_SF;
-	if (phylink_test(xpcs->supported, Pause))
+	if (xpcs_linkmode_supported(compat, Pause))
 		adv |= DW_C73_PAUSE;
-	if (phylink_test(xpcs->supported, Asym_Pause))
+	if (xpcs_linkmode_supported(compat, Asym_Pause))
 		adv |= DW_C73_ASYM_PAUSE;
 
 	return xpcs_write(xpcs, MDIO_MMD_AN, DW_SR_AN_ADV1, adv);
 }
 
-static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
+static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs,
+				const struct xpcs_compat *compat)
 {
 	int ret;
 
-	ret = _xpcs_config_aneg_c73(xpcs);
+	ret = _xpcs_config_aneg_c73(xpcs, compat);
 	if (ret < 0)
 		return ret;
 
@@ -481,7 +527,8 @@ static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
 }
 
 static int xpcs_aneg_done_c73(struct mdio_xpcs_args *xpcs,
-			      struct phylink_link_state *state)
+			      struct phylink_link_state *state,
+			      const struct xpcs_compat *compat)
 {
 	int ret;
 
@@ -496,7 +543,7 @@ static int xpcs_aneg_done_c73(struct mdio_xpcs_args *xpcs,
 
 		/* Check if Aneg outcome is valid */
 		if (!(ret & DW_C73_AN_ADV_SF)) {
-			xpcs_config_aneg_c73(xpcs);
+			xpcs_config_aneg_c73(xpcs, compat);
 			return 0;
 		}
 
@@ -642,8 +689,31 @@ static int xpcs_validate(struct mdio_xpcs_args *xpcs,
 			 unsigned long *supported,
 			 struct phylink_link_state *state)
 {
-	linkmode_and(supported, supported, xpcs->supported);
-	linkmode_and(state->advertising, state->advertising, xpcs->supported);
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(xpcs_supported);
+	const struct xpcs_compat *compat;
+	int i;
+
+	/* phylink expects us to report all supported modes with
+	 * PHY_INTERFACE_MODE_NA, just don't limit the supported and
+	 * advertising masks and exit.
+	 */
+	if (state->interface == PHY_INTERFACE_MODE_NA)
+		return 0;
+
+	bitmap_zero(xpcs_supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+	compat = xpcs_find_compat(xpcs->id, state->interface);
+
+	/* Populate the supported link modes for this
+	 * PHY interface type
+	 */
+	if (compat)
+		for (i = 0; compat->supported[i] != __ETHTOOL_LINK_MODE_MASK_NBITS; i++)
+			set_bit(compat->supported[i], xpcs_supported);
+
+	linkmode_and(supported, supported, xpcs_supported);
+	linkmode_and(state->advertising, state->advertising, xpcs_supported);
+
 	return 0;
 }
 
@@ -724,12 +794,17 @@ static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs)
 static int xpcs_config(struct mdio_xpcs_args *xpcs,
 		       const struct phylink_link_state *state)
 {
+	const struct xpcs_compat *compat;
 	int ret;
 
-	switch (xpcs->an_mode) {
+	compat = xpcs_find_compat(xpcs->id, state->interface);
+	if (!compat)
+		return -ENODEV;
+
+	switch (compat->an_mode) {
 	case DW_AN_C73:
 		if (state->an_enabled) {
-			ret = xpcs_config_aneg_c73(xpcs);
+			ret = xpcs_config_aneg_c73(xpcs, compat);
 			if (ret)
 				return ret;
 		}
@@ -747,7 +822,8 @@ static int xpcs_config(struct mdio_xpcs_args *xpcs,
 }
 
 static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
-			      struct phylink_link_state *state)
+			      struct phylink_link_state *state,
+			      const struct xpcs_compat *compat)
 {
 	int ret;
 
@@ -757,7 +833,7 @@ static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
 	/* ... and then we check the faults. */
 	ret = xpcs_read_fault_c73(xpcs, state);
 	if (ret) {
-		ret = xpcs_soft_reset(xpcs);
+		ret = xpcs_soft_reset(xpcs, compat);
 		if (ret)
 			return ret;
 
@@ -766,7 +842,7 @@ static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
 		return xpcs_config(xpcs, state);
 	}
 
-	if (state->an_enabled && xpcs_aneg_done_c73(xpcs, state)) {
+	if (state->an_enabled && xpcs_aneg_done_c73(xpcs, state, compat)) {
 		state->an_complete = true;
 		xpcs_read_lpa_c73(xpcs, state);
 		xpcs_resolve_lpa_c73(xpcs, state);
@@ -823,11 +899,16 @@ static int xpcs_get_state_c37_sgmii(struct mdio_xpcs_args *xpcs,
 static int xpcs_get_state(struct mdio_xpcs_args *xpcs,
 			  struct phylink_link_state *state)
 {
+	const struct xpcs_compat *compat;
 	int ret;
 
-	switch (xpcs->an_mode) {
+	compat = xpcs_find_compat(xpcs->id, state->interface);
+	if (!compat)
+		return -ENODEV;
+
+	switch (compat->an_mode) {
 	case DW_AN_C73:
-		ret = xpcs_get_state_c73(xpcs, state);
+		ret = xpcs_get_state_c73(xpcs, state, compat);
 		if (ret)
 			return ret;
 		break;
@@ -890,40 +971,6 @@ static u32 xpcs_get_id(struct mdio_xpcs_args *xpcs)
 	return 0xffffffff;
 }
 
-static bool xpcs_check_features(struct mdio_xpcs_args *xpcs,
-				const struct xpcs_id *match,
-				phy_interface_t interface)
-{
-	int i, j;
-
-	for (i = 0; i < DW_XPCS_INTERFACE_MAX; i++) {
-		const struct xpcs_compat *compat = &match->compat[i];
-		bool supports_interface = false;
-
-		for (j = 0; j < compat->num_interfaces; j++) {
-			if (compat->interface[j] == interface) {
-				supports_interface = true;
-				break;
-			}
-		}
-
-		if (!supports_interface)
-			continue;
-
-		/* Populate the supported link modes for this
-		 * PHY interface type
-		 */
-		for (j = 0; compat->supported[j] != __ETHTOOL_LINK_MODE_MASK_NBITS; j++)
-			set_bit(compat->supported[j], xpcs->supported);
-
-		xpcs->an_mode = compat->an_mode;
-
-		return true;
-	}
-
-	return false;
-}
-
 static const struct xpcs_compat synopsys_xpcs_compat[DW_XPCS_INTERFACE_MAX] = {
 	[DW_XPCS_USXGMII] = {
 		.supported = xpcs_usxgmii_features,
@@ -961,19 +1008,23 @@ static const struct xpcs_id xpcs_id_list[] = {
 
 static int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
 {
-	const struct xpcs_id *match = NULL;
 	u32 xpcs_id = xpcs_get_id(xpcs);
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(xpcs_id_list); i++) {
 		const struct xpcs_id *entry = &xpcs_id_list[i];
+		const struct xpcs_compat *compat;
 
-		if ((xpcs_id & entry->mask) == entry->id) {
-			match = entry;
+		if ((xpcs_id & entry->mask) != entry->id)
+			continue;
 
-			if (xpcs_check_features(xpcs, match, interface))
-				return xpcs_soft_reset(xpcs);
-		}
+		xpcs->id = entry;
+
+		compat = xpcs_find_compat(entry, interface);
+		if (!compat)
+			return -ENODEV;
+
+		return xpcs_soft_reset(xpcs, compat);
 	}
 
 	return -ENODEV;
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index c4d0a2c469c7..c2ec440d2c5d 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -14,11 +14,12 @@
 #define DW_AN_C73			1
 #define DW_AN_C37_SGMII			2
 
+struct xpcs_id;
+
 struct mdio_xpcs_args {
-	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
 	struct mii_bus *bus;
+	const struct xpcs_id *id;
 	int addr;
-	int an_mode;
 };
 
 struct mdio_xpcs_ops {
@@ -36,6 +37,7 @@ struct mdio_xpcs_ops {
 			  int enable);
 };
 
+int xpcs_get_an_mode(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
 struct mdio_xpcs_ops *mdio_xpcs_get_ops(void);
 
 #endif /* __LINUX_PCS_XPCS_H */
-- 
cgit v1.2.3


From a1a753ed1d4ae46c1c1874fb1af899f6579a7547 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Jun 2021 19:20:14 +0300
Subject: net: pcs: xpcs: export xpcs_validate

Calling a function pointer with a single implementation through
struct mdio_xpcs_ops is clunky, and the stmmac_do_callback system forces
this to return int, even though it always returns zero.

Simply remove the "validate" function pointer from struct mdio_xpcs_ops
and replace it with an exported xpcs_validate symbol which is called
directly by stmmac.

priv->hw->xpcs is of the type "const struct mdio_xpcs_ops *" and is used
as a placeholder/synonym for priv->plat->mdio_bus_data->has_xpcs. It is
done that way because the mdio_bus_data pointer might or might not be
populated in all stmmac instantiations.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/hwif.h        |  2 --
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  3 ++-
 drivers/net/pcs/pcs-xpcs.c                        | 11 ++++-------
 include/linux/pcs/pcs-xpcs.h                      |  5 ++---
 4 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index dbafedb24290..441985f9cf49 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -614,8 +614,6 @@ struct stmmac_mmc_ops {
 	stmmac_do_void_callback(__priv, mmc, read, __args)
 
 /* XPCS callbacks */
-#define stmmac_xpcs_validate(__priv, __args...) \
-	stmmac_do_callback(__priv, xpcs, validate, __args)
 #define stmmac_xpcs_config(__priv, __args...) \
 	stmmac_do_callback(__priv, xpcs, config, __args)
 #define stmmac_xpcs_get_state(__priv, __args...) \
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index c96a89fa4e3c..b7e6ab05ddd9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -996,7 +996,8 @@ static void stmmac_validate(struct phylink_config *config,
 	linkmode_andnot(state->advertising, state->advertising, mask);
 
 	/* If PCS is supported, check which modes it supports. */
-	stmmac_xpcs_validate(priv, &priv->hw->xpcs_args, supported, state);
+	if (priv->hw->xpcs)
+		xpcs_validate(&priv->hw->xpcs_args, supported, state);
 }
 
 static void stmmac_mac_pcs_get_state(struct phylink_config *config,
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 610073cb55d0..2f7791bcf07b 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -685,9 +685,8 @@ static void xpcs_resolve_pma(struct mdio_xpcs_args *xpcs,
 	}
 }
 
-static int xpcs_validate(struct mdio_xpcs_args *xpcs,
-			 unsigned long *supported,
-			 struct phylink_link_state *state)
+void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
+		   struct phylink_link_state *state)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(xpcs_supported);
 	const struct xpcs_compat *compat;
@@ -698,7 +697,7 @@ static int xpcs_validate(struct mdio_xpcs_args *xpcs,
 	 * advertising masks and exit.
 	 */
 	if (state->interface == PHY_INTERFACE_MODE_NA)
-		return 0;
+		return;
 
 	bitmap_zero(xpcs_supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
 
@@ -713,9 +712,8 @@ static int xpcs_validate(struct mdio_xpcs_args *xpcs,
 
 	linkmode_and(supported, supported, xpcs_supported);
 	linkmode_and(state->advertising, state->advertising, xpcs_supported);
-
-	return 0;
 }
+EXPORT_SYMBOL_GPL(xpcs_validate);
 
 static int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
 			   int enable)
@@ -1031,7 +1029,6 @@ static int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
 }
 
 static struct mdio_xpcs_ops xpcs_ops = {
-	.validate = xpcs_validate,
 	.config = xpcs_config,
 	.get_state = xpcs_get_state,
 	.link_up = xpcs_link_up,
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index c2ec440d2c5d..5ec9aaca01fe 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -23,9 +23,6 @@ struct mdio_xpcs_args {
 };
 
 struct mdio_xpcs_ops {
-	int (*validate)(struct mdio_xpcs_args *xpcs,
-			unsigned long *supported,
-			struct phylink_link_state *state);
 	int (*config)(struct mdio_xpcs_args *xpcs,
 		      const struct phylink_link_state *state);
 	int (*get_state)(struct mdio_xpcs_args *xpcs,
@@ -39,5 +36,7 @@ struct mdio_xpcs_ops {
 
 int xpcs_get_an_mode(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
 struct mdio_xpcs_ops *mdio_xpcs_get_ops(void);
+void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
+		   struct phylink_link_state *state);
 
 #endif /* __LINUX_PCS_XPCS_H */
-- 
cgit v1.2.3


From 14b517cb62d6efc8866f176c922de03dfe1564f3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Jun 2021 19:20:15 +0300
Subject: net: pcs: xpcs: export xpcs_config_eee

There is no good reason why we need to go through:

stmmac_xpcs_config_eee
-> stmmac_do_callback
   -> mdio_xpcs_ops->config_eee
      -> xpcs_config_eee

when we can simply call xpcs_config_eee.

priv->hw->xpcs is of the type "const struct mdio_xpcs_ops *" and is used
as a placeholder/synonym for priv->plat->mdio_bus_data->has_xpcs. It is
done that way because the mdio_bus_data pointer might or might not be
populated in all stmmac instantiations.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/hwif.h           |  2 --
 drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 12 +++++++-----
 drivers/net/pcs/pcs-xpcs.c                           |  6 +++---
 include/linux/pcs/pcs-xpcs.h                         |  4 ++--
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 441985f9cf49..c10d11dbde61 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -622,8 +622,6 @@ struct stmmac_mmc_ops {
 	stmmac_do_callback(__priv, xpcs, link_up, __args)
 #define stmmac_xpcs_probe(__priv, __args...) \
 	stmmac_do_callback(__priv, xpcs, probe, __args)
-#define stmmac_xpcs_config_eee(__priv, __args...) \
-	stmmac_do_callback(__priv, xpcs, config_eee, __args)
 
 struct stmmac_regs_off {
 	u32 ptp_off;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 1f6d749fd9a3..ba7d0f40723a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -720,11 +720,13 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev,
 		netdev_warn(priv->dev,
 			    "Setting EEE tx-lpi is not supported\n");
 
-	ret = stmmac_xpcs_config_eee(priv, &priv->hw->xpcs_args,
-				     priv->plat->mult_fact_100ns,
-				     edata->eee_enabled);
-	if (ret)
-		return ret;
+	if (priv->hw->xpcs) {
+		ret = xpcs_config_eee(&priv->hw->xpcs_args,
+				      priv->plat->mult_fact_100ns,
+				      edata->eee_enabled);
+		if (ret)
+			return ret;
+	}
 
 	if (!edata->eee_enabled)
 		stmmac_disable_eee_mode(priv);
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 2f7791bcf07b..2f2ffab855aa 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -715,8 +715,8 @@ void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
 }
 EXPORT_SYMBOL_GPL(xpcs_validate);
 
-static int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
-			   int enable)
+int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
+		    int enable)
 {
 	int ret;
 
@@ -747,6 +747,7 @@ static int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
 	ret |= DW_VR_MII_EEE_TRN_LPI;
 	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_EEE_MCTRL1, ret);
 }
+EXPORT_SYMBOL_GPL(xpcs_config_eee);
 
 static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs)
 {
@@ -1033,7 +1034,6 @@ static struct mdio_xpcs_ops xpcs_ops = {
 	.get_state = xpcs_get_state,
 	.link_up = xpcs_link_up,
 	.probe = xpcs_probe,
-	.config_eee = xpcs_config_eee,
 };
 
 struct mdio_xpcs_ops *mdio_xpcs_get_ops(void)
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 5ec9aaca01fe..ae74a336dcb9 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -30,13 +30,13 @@ struct mdio_xpcs_ops {
 	int (*link_up)(struct mdio_xpcs_args *xpcs, int speed,
 		       phy_interface_t interface);
 	int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
-	int (*config_eee)(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
-			  int enable);
 };
 
 int xpcs_get_an_mode(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
 struct mdio_xpcs_ops *mdio_xpcs_get_ops(void);
 void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
 		   struct phylink_link_state *state);
+int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
+		    int enable);
 
 #endif /* __LINUX_PCS_XPCS_H */
-- 
cgit v1.2.3


From 8e2bb9569942f9cb2ef816dbf66fbf3e8d722720 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Jun 2021 19:20:16 +0300
Subject: net: pcs: xpcs: export xpcs_probe

Similar to the other recently functions, it is not necessary for
xpcs_probe to be a function pointer, so export it so that it can be
called directly.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/hwif.h        | 2 --
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 2 +-
 drivers/net/pcs/pcs-xpcs.c                        | 4 ++--
 include/linux/pcs/pcs-xpcs.h                      | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index c10d11dbde61..3dc291ff9f32 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -620,8 +620,6 @@ struct stmmac_mmc_ops {
 	stmmac_do_callback(__priv, xpcs, get_state, __args)
 #define stmmac_xpcs_link_up(__priv, __args...) \
 	stmmac_do_callback(__priv, xpcs, link_up, __args)
-#define stmmac_xpcs_probe(__priv, __args...) \
-	stmmac_do_callback(__priv, xpcs, probe, __args)
 
 struct stmmac_regs_off {
 	u32 ptp_off;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index e293bf1ce9f3..56deb92a8430 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -521,7 +521,7 @@ int stmmac_mdio_register(struct net_device *ndev)
 		for (addr = 0; addr < max_addr; addr++) {
 			xpcs->addr = addr;
 
-			ret = stmmac_xpcs_probe(priv, xpcs, mode);
+			ret = xpcs_probe(xpcs, mode);
 			if (!ret) {
 				found = 1;
 				break;
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 2f2ffab855aa..7f51eb4bbaa4 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -1005,7 +1005,7 @@ static const struct xpcs_id xpcs_id_list[] = {
 	},
 };
 
-static int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
+int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
 {
 	u32 xpcs_id = xpcs_get_id(xpcs);
 	int i;
@@ -1028,12 +1028,12 @@ static int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
 
 	return -ENODEV;
 }
+EXPORT_SYMBOL_GPL(xpcs_probe);
 
 static struct mdio_xpcs_ops xpcs_ops = {
 	.config = xpcs_config,
 	.get_state = xpcs_get_state,
 	.link_up = xpcs_link_up,
-	.probe = xpcs_probe,
 };
 
 struct mdio_xpcs_ops *mdio_xpcs_get_ops(void)
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index ae74a336dcb9..1d8581b74d81 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -29,7 +29,6 @@ struct mdio_xpcs_ops {
 			 struct phylink_link_state *state);
 	int (*link_up)(struct mdio_xpcs_args *xpcs, int speed,
 		       phy_interface_t interface);
-	int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
 };
 
 int xpcs_get_an_mode(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
@@ -38,5 +37,6 @@ void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
 		   struct phylink_link_state *state);
 int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
 		    int enable);
+int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
 
 #endif /* __LINUX_PCS_XPCS_H */
-- 
cgit v1.2.3


From 2cac15dae2f6e2f86bef1acc2a7f78fc97a0a060 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Jun 2021 19:20:18 +0300
Subject: net: pcs: xpcs: convert to mdio_device

Unify the 2 existing PCS drivers (lynx and xpcs) by doing a similar
thing on probe, which is to have a *_create function that takes a
struct mdio_device * given by the caller, and builds a private PCS
structure around that.

This changes stmmac to hold only a pointer to the xpcs, as opposed to
the full structure. This will be used in the next patch when struct
mdio_xpcs_ops is removed. Currently a pointer to struct mdio_xpcs_ops
is used as a shorthand to determine whether the port has an XPCS or not.
We can do the same now with the mdio_xpcs_args pointer.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |  2 +-
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 10 ++--
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c  | 39 +++++++++-------
 drivers/net/pcs/pcs-xpcs.c                         | 53 +++++++++++++++++-----
 include/linux/pcs/pcs-xpcs.h                       |  7 +--
 6 files changed, 76 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 619e3c0760d6..4bcd1d340766 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -504,7 +504,7 @@ struct mac_device_info {
 	const struct stmmac_tc_ops *tc;
 	const struct stmmac_mmc_ops *mmc;
 	const struct mdio_xpcs_ops *xpcs;
-	struct mdio_xpcs_args xpcs_args;
+	struct mdio_xpcs_args *xpcs_args;
 	struct mii_regs mii;	/* MII register Addresses */
 	struct mac_link link;
 	void __iomem *pcsr;     /* vpointer to device CSRs */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index ba7d0f40723a..050576ee704d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -721,7 +721,7 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev,
 			    "Setting EEE tx-lpi is not supported\n");
 
 	if (priv->hw->xpcs) {
-		ret = xpcs_config_eee(&priv->hw->xpcs_args,
+		ret = xpcs_config_eee(priv->hw->xpcs_args,
 				      priv->plat->mult_fact_100ns,
 				      edata->eee_enabled);
 		if (ret)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index b7e6ab05ddd9..e9e5bcb79d48 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -997,7 +997,7 @@ static void stmmac_validate(struct phylink_config *config,
 
 	/* If PCS is supported, check which modes it supports. */
 	if (priv->hw->xpcs)
-		xpcs_validate(&priv->hw->xpcs_args, supported, state);
+		xpcs_validate(priv->hw->xpcs_args, supported, state);
 }
 
 static void stmmac_mac_pcs_get_state(struct phylink_config *config,
@@ -1006,7 +1006,7 @@ static void stmmac_mac_pcs_get_state(struct phylink_config *config,
 	struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
 
 	state->link = 0;
-	stmmac_xpcs_get_state(priv, &priv->hw->xpcs_args, state);
+	stmmac_xpcs_get_state(priv, priv->hw->xpcs_args, state);
 }
 
 static void stmmac_mac_config(struct phylink_config *config, unsigned int mode,
@@ -1014,7 +1014,7 @@ static void stmmac_mac_config(struct phylink_config *config, unsigned int mode,
 {
 	struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
 
-	stmmac_xpcs_config(priv, &priv->hw->xpcs_args, state);
+	stmmac_xpcs_config(priv, priv->hw->xpcs_args, state);
 }
 
 static void stmmac_mac_an_restart(struct phylink_config *config)
@@ -1061,7 +1061,7 @@ static void stmmac_mac_link_up(struct phylink_config *config,
 	struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
 	u32 ctrl;
 
-	stmmac_xpcs_link_up(priv, &priv->hw->xpcs_args, speed, interface);
+	stmmac_xpcs_link_up(priv, priv->hw->xpcs_args, speed, interface);
 
 	ctrl = readl(priv->ioaddr + MAC_CTRL_REG);
 	ctrl &= ~priv->hw->link.speed_mask;
@@ -3653,7 +3653,7 @@ int stmmac_open(struct net_device *dev)
 	if (priv->hw->pcs != STMMAC_PCS_TBI &&
 	    priv->hw->pcs != STMMAC_PCS_RTBI &&
 	    (!priv->hw->xpcs ||
-	     xpcs_get_an_mode(&priv->hw->xpcs_args, mode) != DW_AN_C73)) {
+	     xpcs_get_an_mode(priv->hw->xpcs_args, mode) != DW_AN_C73)) {
 		ret = stmmac_init_phy(dev);
 		if (ret) {
 			netdev_err(priv->dev,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 56deb92a8430..9b4bf78d2eaa 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -510,25 +510,27 @@ int stmmac_mdio_register(struct net_device *ndev)
 	}
 
 	/* Try to probe the XPCS by scanning all addresses. */
-	if (priv->hw->xpcs) {
-		struct mdio_xpcs_args *xpcs = &priv->hw->xpcs_args;
-		int ret, mode = priv->plat->phy_interface;
-		max_addr = PHY_MAX_ADDR;
-
-		xpcs->bus = new_bus;
-
-		found = 0;
-		for (addr = 0; addr < max_addr; addr++) {
-			xpcs->addr = addr;
-
-			ret = xpcs_probe(xpcs, mode);
-			if (!ret) {
-				found = 1;
-				break;
+	if (mdio_bus_data->has_xpcs) {
+		int mode = priv->plat->phy_interface;
+		struct mdio_device *mdiodev;
+		struct mdio_xpcs_args *xpcs;
+
+		for (addr = 0; addr < PHY_MAX_ADDR; addr++) {
+			mdiodev = mdio_device_create(new_bus, addr);
+			if (IS_ERR(mdiodev))
+				continue;
+
+			xpcs = xpcs_create(mdiodev, mode);
+			if (IS_ERR_OR_NULL(xpcs)) {
+				mdio_device_free(mdiodev);
+				continue;
 			}
+
+			priv->hw->xpcs_args = xpcs;
+			break;
 		}
 
-		if (!found && !mdio_node) {
+		if (!priv->hw->xpcs_args) {
 			dev_warn(dev, "No XPCS found\n");
 			err = -ENODEV;
 			goto no_xpcs_found;
@@ -560,6 +562,11 @@ int stmmac_mdio_unregister(struct net_device *ndev)
 	if (!priv->mii)
 		return 0;
 
+	if (priv->hw->xpcs) {
+		mdio_device_free(priv->hw->xpcs_args->mdiodev);
+		xpcs_destroy(priv->hw->xpcs_args);
+	}
+
 	mdiobus_unregister(priv->mii);
 	priv->mii->priv = NULL;
 	mdiobus_free(priv->mii);
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index afabb9209c52..e17e72175ebb 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -241,15 +241,19 @@ static bool __xpcs_linkmode_supported(const struct xpcs_compat *compat,
 static int xpcs_read(struct mdio_xpcs_args *xpcs, int dev, u32 reg)
 {
 	u32 reg_addr = mdiobus_c45_addr(dev, reg);
+	struct mii_bus *bus = xpcs->mdiodev->bus;
+	int addr = xpcs->mdiodev->addr;
 
-	return mdiobus_read(xpcs->bus, xpcs->addr, reg_addr);
+	return mdiobus_read(bus, addr, reg_addr);
 }
 
 static int xpcs_write(struct mdio_xpcs_args *xpcs, int dev, u32 reg, u16 val)
 {
 	u32 reg_addr = mdiobus_c45_addr(dev, reg);
+	struct mii_bus *bus = xpcs->mdiodev->bus;
+	int addr = xpcs->mdiodev->addr;
 
-	return mdiobus_write(xpcs->bus, xpcs->addr, reg_addr, val);
+	return mdiobus_write(bus, addr, reg_addr, val);
 }
 
 static int xpcs_read_vendor(struct mdio_xpcs_args *xpcs, int dev, u32 reg)
@@ -315,7 +319,7 @@ static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs,
 #define xpcs_warn(__xpcs, __state, __args...) \
 ({ \
 	if ((__state)->link) \
-		dev_warn(&(__xpcs)->bus->dev, ##__args); \
+		dev_warn(&(__xpcs)->mdiodev->dev, ##__args); \
 })
 
 static int xpcs_read_fault_c73(struct mdio_xpcs_args *xpcs,
@@ -1005,10 +1009,20 @@ static const struct xpcs_id xpcs_id_list[] = {
 	},
 };
 
-int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
+struct mdio_xpcs_args *xpcs_create(struct mdio_device *mdiodev,
+				   phy_interface_t interface)
 {
-	u32 xpcs_id = xpcs_get_id(xpcs);
-	int i;
+	struct mdio_xpcs_args *xpcs;
+	u32 xpcs_id;
+	int i, ret;
+
+	xpcs = kzalloc(sizeof(*xpcs), GFP_KERNEL);
+	if (!xpcs)
+		return NULL;
+
+	xpcs->mdiodev = mdiodev;
+
+	xpcs_id = xpcs_get_id(xpcs);
 
 	for (i = 0; i < ARRAY_SIZE(xpcs_id_list); i++) {
 		const struct xpcs_id *entry = &xpcs_id_list[i];
@@ -1020,15 +1034,32 @@ int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
 		xpcs->id = entry;
 
 		compat = xpcs_find_compat(entry, interface);
-		if (!compat)
-			return -ENODEV;
+		if (!compat) {
+			ret = -ENODEV;
+			goto out;
+		}
 
-		return xpcs_soft_reset(xpcs, compat);
+		ret = xpcs_soft_reset(xpcs, compat);
+		if (ret)
+			goto out;
+
+		return xpcs;
 	}
 
-	return -ENODEV;
+	ret = -ENODEV;
+
+out:
+	kfree(xpcs);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(xpcs_create);
+
+void xpcs_destroy(struct mdio_xpcs_args *xpcs)
+{
+	kfree(xpcs);
 }
-EXPORT_SYMBOL_GPL(xpcs_probe);
+EXPORT_SYMBOL_GPL(xpcs_destroy);
 
 static struct mdio_xpcs_ops xpcs_ops = {
 	.config = xpcs_config,
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 1d8581b74d81..57a199393d63 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -17,9 +17,8 @@
 struct xpcs_id;
 
 struct mdio_xpcs_args {
-	struct mii_bus *bus;
+	struct mdio_device *mdiodev;
 	const struct xpcs_id *id;
-	int addr;
 };
 
 struct mdio_xpcs_ops {
@@ -37,6 +36,8 @@ void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
 		   struct phylink_link_state *state);
 int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
 		    int enable);
-int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
+struct mdio_xpcs_args *xpcs_create(struct mdio_device *mdiodev,
+				   phy_interface_t interface);
+void xpcs_destroy(struct mdio_xpcs_args *xpcs);
 
 #endif /* __LINUX_PCS_XPCS_H */
-- 
cgit v1.2.3


From 11059740e616f4d83d8d9e3f8a63dafefdc2ae5d Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 2 Jun 2021 19:20:19 +0300
Subject: net: pcs: xpcs: convert to phylink_pcs_ops

Since all the remaining members of struct mdio_xpcs_ops have direct
equivalents in struct phylink_pcs_ops, it is about time we remove it
altogether.

Since the phylink ops return void, we need to remove the error
propagation from the various xpcs methods and simply print an error
message where appropriate.

Since xpcs_get_state_c73() detects link faults and attempts to reset the
link on its own by calling xpcs_config(), but xpcs_config() now has a
lot of phylink arguments which are not needed and cannot be simply
fabricated by anybody else except phylink, the actual implementation has
been moved into a smaller xpcs_do_config().

The const struct mdio_xpcs_ops *priv->hw->xpcs has been removed, so we
need to look at the struct mdio_xpcs_args pointer now as an indication
whether the port has an XPCS or not.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |  3 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  8 --
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 36 +++-----
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c  | 16 +---
 drivers/net/pcs/pcs-xpcs.c                         | 99 +++++++++++++---------
 include/linux/pcs/pcs-xpcs.h                       | 11 +--
 7 files changed, 78 insertions(+), 97 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 4bcd1d340766..8a83f9e1e95b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -503,8 +503,7 @@ struct mac_device_info {
 	const struct stmmac_hwtimestamp *ptp;
 	const struct stmmac_tc_ops *tc;
 	const struct stmmac_mmc_ops *mmc;
-	const struct mdio_xpcs_ops *xpcs;
-	struct mdio_xpcs_args *xpcs_args;
+	struct mdio_xpcs_args *xpcs;
 	struct mii_regs mii;	/* MII register Addresses */
 	struct mac_link link;
 	void __iomem *pcsr;     /* vpointer to device CSRs */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 3dc291ff9f32..6dc1c98ebec8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -613,14 +613,6 @@ struct stmmac_mmc_ops {
 #define stmmac_mmc_read(__priv, __args...) \
 	stmmac_do_void_callback(__priv, mmc, read, __args)
 
-/* XPCS callbacks */
-#define stmmac_xpcs_config(__priv, __args...) \
-	stmmac_do_callback(__priv, xpcs, config, __args)
-#define stmmac_xpcs_get_state(__priv, __args...) \
-	stmmac_do_callback(__priv, xpcs, get_state, __args)
-#define stmmac_xpcs_link_up(__priv, __args...) \
-	stmmac_do_callback(__priv, xpcs, link_up, __args)
-
 struct stmmac_regs_off {
 	u32 ptp_off;
 	u32 mmc_off;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 050576ee704d..d0ce608b81c3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -721,7 +721,7 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev,
 			    "Setting EEE tx-lpi is not supported\n");
 
 	if (priv->hw->xpcs) {
-		ret = xpcs_config_eee(priv->hw->xpcs_args,
+		ret = xpcs_config_eee(priv->hw->xpcs,
 				      priv->plat->mult_fact_100ns,
 				      edata->eee_enabled);
 		if (ret)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index e9e5bcb79d48..6d41dd6f9f7a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -997,29 +997,13 @@ static void stmmac_validate(struct phylink_config *config,
 
 	/* If PCS is supported, check which modes it supports. */
 	if (priv->hw->xpcs)
-		xpcs_validate(priv->hw->xpcs_args, supported, state);
-}
-
-static void stmmac_mac_pcs_get_state(struct phylink_config *config,
-				     struct phylink_link_state *state)
-{
-	struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
-
-	state->link = 0;
-	stmmac_xpcs_get_state(priv, priv->hw->xpcs_args, state);
+		xpcs_validate(priv->hw->xpcs, supported, state);
 }
 
 static void stmmac_mac_config(struct phylink_config *config, unsigned int mode,
 			      const struct phylink_link_state *state)
 {
-	struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
-
-	stmmac_xpcs_config(priv, priv->hw->xpcs_args, state);
-}
-
-static void stmmac_mac_an_restart(struct phylink_config *config)
-{
-	/* Not Supported */
+	/* Nothing to do, xpcs_config() handles everything */
 }
 
 static void stmmac_fpe_link_state_handle(struct stmmac_priv *priv, bool is_up)
@@ -1061,8 +1045,6 @@ static void stmmac_mac_link_up(struct phylink_config *config,
 	struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev));
 	u32 ctrl;
 
-	stmmac_xpcs_link_up(priv, priv->hw->xpcs_args, speed, interface);
-
 	ctrl = readl(priv->ioaddr + MAC_CTRL_REG);
 	ctrl &= ~priv->hw->link.speed_mask;
 
@@ -1155,9 +1137,7 @@ static void stmmac_mac_link_up(struct phylink_config *config,
 
 static const struct phylink_mac_ops stmmac_phylink_mac_ops = {
 	.validate = stmmac_validate,
-	.mac_pcs_get_state = stmmac_mac_pcs_get_state,
 	.mac_config = stmmac_mac_config,
-	.mac_an_restart = stmmac_mac_an_restart,
 	.mac_link_down = stmmac_mac_link_down,
 	.mac_link_up = stmmac_mac_link_up,
 };
@@ -1234,6 +1214,7 @@ static int stmmac_init_phy(struct net_device *dev)
 
 static int stmmac_phy_setup(struct stmmac_priv *priv)
 {
+	struct stmmac_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data;
 	struct fwnode_handle *fwnode = of_fwnode_handle(priv->plat->phylink_node);
 	int mode = priv->plat->phy_interface;
 	struct phylink *phylink;
@@ -1241,8 +1222,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv)
 	priv->phylink_config.dev = &priv->dev->dev;
 	priv->phylink_config.type = PHYLINK_NETDEV;
 	priv->phylink_config.pcs_poll = true;
-	priv->phylink_config.ovr_an_inband =
-		priv->plat->mdio_bus_data->xpcs_an_inband;
+	priv->phylink_config.ovr_an_inband = mdio_bus_data->xpcs_an_inband;
 
 	if (!fwnode)
 		fwnode = dev_fwnode(priv->device);
@@ -1252,6 +1232,12 @@ static int stmmac_phy_setup(struct stmmac_priv *priv)
 	if (IS_ERR(phylink))
 		return PTR_ERR(phylink);
 
+	if (mdio_bus_data->has_xpcs) {
+		struct mdio_xpcs_args *xpcs = priv->hw->xpcs;
+
+		phylink_set_pcs(phylink, &xpcs->pcs);
+	}
+
 	priv->phylink = phylink;
 	return 0;
 }
@@ -3653,7 +3639,7 @@ int stmmac_open(struct net_device *dev)
 	if (priv->hw->pcs != STMMAC_PCS_TBI &&
 	    priv->hw->pcs != STMMAC_PCS_RTBI &&
 	    (!priv->hw->xpcs ||
-	     xpcs_get_an_mode(priv->hw->xpcs_args, mode) != DW_AN_C73)) {
+	     xpcs_get_an_mode(priv->hw->xpcs, mode) != DW_AN_C73)) {
 		ret = stmmac_init_phy(dev);
 		if (ret) {
 			netdev_err(priv->dev,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 9b4bf78d2eaa..6312a152c8ad 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -444,14 +444,6 @@ int stmmac_mdio_register(struct net_device *ndev)
 		max_addr = PHY_MAX_ADDR;
 	}
 
-	if (mdio_bus_data->has_xpcs) {
-		priv->hw->xpcs = mdio_xpcs_get_ops();
-		if (!priv->hw->xpcs) {
-			err = -ENODEV;
-			goto bus_register_fail;
-		}
-	}
-
 	if (mdio_bus_data->needs_reset)
 		new_bus->reset = &stmmac_mdio_reset;
 
@@ -526,11 +518,11 @@ int stmmac_mdio_register(struct net_device *ndev)
 				continue;
 			}
 
-			priv->hw->xpcs_args = xpcs;
+			priv->hw->xpcs = xpcs;
 			break;
 		}
 
-		if (!priv->hw->xpcs_args) {
+		if (!priv->hw->xpcs) {
 			dev_warn(dev, "No XPCS found\n");
 			err = -ENODEV;
 			goto no_xpcs_found;
@@ -563,8 +555,8 @@ int stmmac_mdio_unregister(struct net_device *ndev)
 		return 0;
 
 	if (priv->hw->xpcs) {
-		mdio_device_free(priv->hw->xpcs_args->mdiodev);
-		xpcs_destroy(priv->hw->xpcs_args);
+		mdio_device_free(priv->hw->xpcs->mdiodev);
+		xpcs_destroy(priv->hw->xpcs);
 	}
 
 	mdiobus_unregister(priv->mii);
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index e17e72175ebb..34164437c135 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -100,6 +100,9 @@
 /* VR MII EEE Control 1 defines */
 #define DW_VR_MII_EEE_TRN_LPI		BIT(0)	/* Transparent Mode Enable */
 
+#define phylink_pcs_to_xpcs(pl_pcs) \
+	container_of((pl_pcs), struct mdio_xpcs_args, pcs)
+
 static const int xpcs_usxgmii_features[] = {
 	ETHTOOL_LINK_MODE_Pause_BIT,
 	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
@@ -413,7 +416,7 @@ static int xpcs_get_max_usxgmii_speed(const unsigned long *supported)
 	return max;
 }
 
-static int xpcs_config_usxgmii(struct mdio_xpcs_args *xpcs, int speed)
+static void xpcs_config_usxgmii(struct mdio_xpcs_args *xpcs, int speed)
 {
 	int ret, speed_sel;
 
@@ -438,33 +441,40 @@ static int xpcs_config_usxgmii(struct mdio_xpcs_args *xpcs, int speed)
 		break;
 	default:
 		/* Nothing to do here */
-		return -EINVAL;
+		return;
 	}
 
 	ret = xpcs_read_vpcs(xpcs, MDIO_CTRL1);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	ret = xpcs_write_vpcs(xpcs, MDIO_CTRL1, ret | DW_USXGMII_EN);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, MDIO_CTRL1);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	ret &= ~DW_USXGMII_SS_MASK;
 	ret |= speed_sel | DW_USXGMII_FULL;
 
 	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, MDIO_CTRL1, ret);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	ret = xpcs_read_vpcs(xpcs, MDIO_CTRL1);
 	if (ret < 0)
-		return ret;
+		goto out;
+
+	ret = xpcs_write_vpcs(xpcs, MDIO_CTRL1, ret | DW_USXGMII_RST);
+	if (ret < 0)
+		goto out;
+
+	return;
 
-	return xpcs_write_vpcs(xpcs, MDIO_CTRL1, ret | DW_USXGMII_RST);
+out:
+	pr_err("%s: XPCS access returned %pe\n", __func__, ERR_PTR(ret));
 }
 
 static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs,
@@ -794,19 +804,19 @@ static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret);
 }
 
-static int xpcs_config(struct mdio_xpcs_args *xpcs,
-		       const struct phylink_link_state *state)
+static int xpcs_do_config(struct mdio_xpcs_args *xpcs,
+			  phy_interface_t interface, unsigned int mode)
 {
 	const struct xpcs_compat *compat;
 	int ret;
 
-	compat = xpcs_find_compat(xpcs->id, state->interface);
+	compat = xpcs_find_compat(xpcs->id, interface);
 	if (!compat)
 		return -ENODEV;
 
 	switch (compat->an_mode) {
 	case DW_AN_C73:
-		if (state->an_enabled) {
+		if (phylink_autoneg_inband(mode)) {
 			ret = xpcs_config_aneg_c73(xpcs, compat);
 			if (ret)
 				return ret;
@@ -824,6 +834,16 @@ static int xpcs_config(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
+static int xpcs_config(struct phylink_pcs *pcs, unsigned int mode,
+		       phy_interface_t interface,
+		       const unsigned long *advertising,
+		       bool permit_pause_to_mac)
+{
+	struct mdio_xpcs_args *xpcs = phylink_pcs_to_xpcs(pcs);
+
+	return xpcs_do_config(xpcs, interface, mode);
+}
+
 static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
 			      struct phylink_link_state *state,
 			      const struct xpcs_compat *compat)
@@ -842,7 +862,7 @@ static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
 
 		state->link = 0;
 
-		return xpcs_config(xpcs, state);
+		return xpcs_do_config(xpcs, state->interface, MLO_AN_INBAND);
 	}
 
 	if (state->an_enabled && xpcs_aneg_done_c73(xpcs, state, compat)) {
@@ -899,41 +919,45 @@ static int xpcs_get_state_c37_sgmii(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
-static int xpcs_get_state(struct mdio_xpcs_args *xpcs,
-			  struct phylink_link_state *state)
+static void xpcs_get_state(struct phylink_pcs *pcs,
+			   struct phylink_link_state *state)
 {
+	struct mdio_xpcs_args *xpcs = phylink_pcs_to_xpcs(pcs);
 	const struct xpcs_compat *compat;
 	int ret;
 
 	compat = xpcs_find_compat(xpcs->id, state->interface);
 	if (!compat)
-		return -ENODEV;
+		return;
 
 	switch (compat->an_mode) {
 	case DW_AN_C73:
 		ret = xpcs_get_state_c73(xpcs, state, compat);
-		if (ret)
-			return ret;
+		if (ret) {
+			pr_err("xpcs_get_state_c73 returned %pe\n",
+			       ERR_PTR(ret));
+			return;
+		}
 		break;
 	case DW_AN_C37_SGMII:
 		ret = xpcs_get_state_c37_sgmii(xpcs, state);
-		if (ret)
-			return ret;
+		if (ret) {
+			pr_err("xpcs_get_state_c37_sgmii returned %pe\n",
+			       ERR_PTR(ret));
+		}
 		break;
 	default:
-		return -1;
+		return;
 	}
-
-	return 0;
 }
 
-static int xpcs_link_up(struct mdio_xpcs_args *xpcs, int speed,
-			phy_interface_t interface)
+static void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
+			 phy_interface_t interface, int speed, int duplex)
 {
+	struct mdio_xpcs_args *xpcs = phylink_pcs_to_xpcs(pcs);
+
 	if (interface == PHY_INTERFACE_MODE_USXGMII)
 		return xpcs_config_usxgmii(xpcs, speed);
-
-	return 0;
 }
 
 static u32 xpcs_get_id(struct mdio_xpcs_args *xpcs)
@@ -1009,6 +1033,12 @@ static const struct xpcs_id xpcs_id_list[] = {
 	},
 };
 
+static const struct phylink_pcs_ops xpcs_phylink_ops = {
+	.pcs_config = xpcs_config,
+	.pcs_get_state = xpcs_get_state,
+	.pcs_link_up = xpcs_link_up,
+};
+
 struct mdio_xpcs_args *xpcs_create(struct mdio_device *mdiodev,
 				   phy_interface_t interface)
 {
@@ -1039,6 +1069,9 @@ struct mdio_xpcs_args *xpcs_create(struct mdio_device *mdiodev,
 			goto out;
 		}
 
+		xpcs->pcs.ops = &xpcs_phylink_ops;
+		xpcs->pcs.poll = true;
+
 		ret = xpcs_soft_reset(xpcs, compat);
 		if (ret)
 			goto out;
@@ -1061,16 +1094,4 @@ void xpcs_destroy(struct mdio_xpcs_args *xpcs)
 }
 EXPORT_SYMBOL_GPL(xpcs_destroy);
 
-static struct mdio_xpcs_ops xpcs_ops = {
-	.config = xpcs_config,
-	.get_state = xpcs_get_state,
-	.link_up = xpcs_link_up,
-};
-
-struct mdio_xpcs_ops *mdio_xpcs_get_ops(void)
-{
-	return &xpcs_ops;
-}
-EXPORT_SYMBOL_GPL(mdio_xpcs_get_ops);
-
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 57a199393d63..0860a5b59f10 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -19,19 +19,10 @@ struct xpcs_id;
 struct mdio_xpcs_args {
 	struct mdio_device *mdiodev;
 	const struct xpcs_id *id;
-};
-
-struct mdio_xpcs_ops {
-	int (*config)(struct mdio_xpcs_args *xpcs,
-		      const struct phylink_link_state *state);
-	int (*get_state)(struct mdio_xpcs_args *xpcs,
-			 struct phylink_link_state *state);
-	int (*link_up)(struct mdio_xpcs_args *xpcs, int speed,
-		       phy_interface_t interface);
+	struct phylink_pcs pcs;
 };
 
 int xpcs_get_an_mode(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
-struct mdio_xpcs_ops *mdio_xpcs_get_ops(void);
 void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
 		   struct phylink_link_state *state);
 int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
-- 
cgit v1.2.3


From 1bd4f5716fc3bb4882033fbeeb97472503f1c7e2 Mon Sep 17 00:00:00 2001
From: Omkar Kulkarni <okulkarni@marvell.com>
Date: Wed, 2 Jun 2021 20:16:49 +0300
Subject: qed: Add TCP_ULP FW resource layout

Add TCP_ULP as a storage common TCP offload FW resource layout.
This will be used by the core driver (QED) for both the NVMeTCP and iSCSI.

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_cxt.c         | 18 +++++++++---------
 drivers/net/ethernet/qlogic/qed/qed_cxt.h         |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_iscsi.c       | 20 ++++++++++----------
 drivers/net/ethernet/qlogic/qed/qed_ll2.c         |  8 ++++----
 drivers/net/ethernet/qlogic/qed/qed_ooo.c         |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  2 +-
 include/linux/qed/common_hsi.h                    |  2 +-
 include/linux/qed/qed_ll2_if.h                    |  2 +-
 10 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index 0a22f8ce9a2c..fcabbaa518df 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -94,14 +94,14 @@ struct src_ent {
 
 static bool src_proto(enum protocol_type type)
 {
-	return type == PROTOCOLID_ISCSI ||
+	return type == PROTOCOLID_TCP_ULP ||
 	       type == PROTOCOLID_FCOE ||
 	       type == PROTOCOLID_IWARP;
 }
 
 static bool tm_cid_proto(enum protocol_type type)
 {
-	return type == PROTOCOLID_ISCSI ||
+	return type == PROTOCOLID_TCP_ULP ||
 	       type == PROTOCOLID_FCOE ||
 	       type == PROTOCOLID_ROCE ||
 	       type == PROTOCOLID_IWARP;
@@ -2090,13 +2090,13 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks)
 
 		if (p_params->num_cons && p_params->num_tasks) {
 			qed_cxt_set_proto_cid_count(p_hwfn,
-						    PROTOCOLID_ISCSI,
+						    PROTOCOLID_TCP_ULP,
 						    p_params->num_cons,
 						    0);
 
 			qed_cxt_set_proto_tid_count(p_hwfn,
-						    PROTOCOLID_ISCSI,
-						    QED_CXT_ISCSI_TID_SEG,
+						    PROTOCOLID_TCP_ULP,
+						    QED_CXT_TCP_ULP_TID_SEG,
 						    0,
 						    p_params->num_tasks,
 						    true);
@@ -2129,8 +2129,8 @@ int qed_cxt_get_tid_mem_info(struct qed_hwfn *p_hwfn,
 		seg = QED_CXT_FCOE_TID_SEG;
 		break;
 	case QED_PCI_ISCSI:
-		proto = PROTOCOLID_ISCSI;
-		seg = QED_CXT_ISCSI_TID_SEG;
+		proto = PROTOCOLID_TCP_ULP;
+		seg = QED_CXT_TCP_ULP_TID_SEG;
 		break;
 	default:
 		return -EINVAL;
@@ -2455,8 +2455,8 @@ int qed_cxt_get_task_ctx(struct qed_hwfn *p_hwfn,
 		seg = QED_CXT_FCOE_TID_SEG;
 		break;
 	case QED_PCI_ISCSI:
-		proto = PROTOCOLID_ISCSI;
-		seg = QED_CXT_ISCSI_TID_SEG;
+		proto = PROTOCOLID_TCP_ULP;
+		seg = QED_CXT_TCP_ULP_TID_SEG;
 		break;
 	default:
 		return -EINVAL;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index 056e79620a0e..8adb7ed0c12d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -50,7 +50,7 @@ int qed_cxt_get_cid_info(struct qed_hwfn *p_hwfn,
 int qed_cxt_get_tid_mem_info(struct qed_hwfn *p_hwfn,
 			     struct qed_tid_mem *p_info);
 
-#define QED_CXT_ISCSI_TID_SEG	PROTOCOLID_ISCSI
+#define QED_CXT_TCP_ULP_TID_SEG	PROTOCOLID_TCP_ULP
 #define QED_CXT_ROCE_TID_SEG	PROTOCOLID_ROCE
 #define QED_CXT_FCOE_TID_SEG	PROTOCOLID_FCOE
 enum qed_cxt_elem_type {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index d2f5855b2ea7..c231d0e56571 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -2266,7 +2266,7 @@ int qed_resc_alloc(struct qed_dev *cdev)
 		} else if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) {
 			num_cons =
 			    qed_cxt_get_proto_cid_count(p_hwfn,
-							PROTOCOLID_ISCSI,
+							PROTOCOLID_TCP_ULP,
 							NULL);
 			n_eqes += 2 * num_cons;
 		}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 559df9f4d656..9dbeb2efdc51 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -1118,7 +1118,7 @@ struct outer_tag_config_struct {
 /* personality per PF */
 enum personality_type {
 	BAD_PERSONALITY_TYP,
-	PERSONALITY_ISCSI,
+	PERSONALITY_TCP_ULP,
 	PERSONALITY_FCOE,
 	PERSONALITY_RDMA_AND_ETH,
 	PERSONALITY_RDMA,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
index 448567a1f520..db926d8b3033 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
@@ -158,7 +158,7 @@ qed_sp_iscsi_func_start(struct qed_hwfn *p_hwfn,
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
 				 ISCSI_RAMROD_CMD_ID_INIT_FUNC,
-				 PROTOCOLID_ISCSI, &init_data);
+				 PROTOCOLID_TCP_ULP, &init_data);
 	if (rc)
 		return rc;
 
@@ -250,7 +250,7 @@ qed_sp_iscsi_func_start(struct qed_hwfn *p_hwfn,
 	p_hwfn->p_iscsi_info->event_context = event_context;
 	p_hwfn->p_iscsi_info->event_cb = async_event_cb;
 
-	qed_spq_register_async_cb(p_hwfn, PROTOCOLID_ISCSI,
+	qed_spq_register_async_cb(p_hwfn, PROTOCOLID_TCP_ULP,
 				  qed_iscsi_async_event);
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
@@ -286,7 +286,7 @@ static int qed_sp_iscsi_conn_offload(struct qed_hwfn *p_hwfn,
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
 				 ISCSI_RAMROD_CMD_ID_OFFLOAD_CONN,
-				 PROTOCOLID_ISCSI, &init_data);
+				 PROTOCOLID_TCP_ULP, &init_data);
 	if (rc)
 		return rc;
 
@@ -465,7 +465,7 @@ static int qed_sp_iscsi_conn_update(struct qed_hwfn *p_hwfn,
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
 				 ISCSI_RAMROD_CMD_ID_UPDATE_CONN,
-				 PROTOCOLID_ISCSI, &init_data);
+				 PROTOCOLID_TCP_ULP, &init_data);
 	if (rc)
 		return rc;
 
@@ -506,7 +506,7 @@ qed_sp_iscsi_mac_update(struct qed_hwfn *p_hwfn,
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
 				 ISCSI_RAMROD_CMD_ID_MAC_UPDATE,
-				 PROTOCOLID_ISCSI, &init_data);
+				 PROTOCOLID_TCP_ULP, &init_data);
 	if (rc)
 		return rc;
 
@@ -548,7 +548,7 @@ static int qed_sp_iscsi_conn_terminate(struct qed_hwfn *p_hwfn,
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
 				 ISCSI_RAMROD_CMD_ID_TERMINATION_CONN,
-				 PROTOCOLID_ISCSI, &init_data);
+				 PROTOCOLID_TCP_ULP, &init_data);
 	if (rc)
 		return rc;
 
@@ -582,7 +582,7 @@ static int qed_sp_iscsi_conn_clear_sq(struct qed_hwfn *p_hwfn,
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
 				 ISCSI_RAMROD_CMD_ID_CLEAR_SQ,
-				 PROTOCOLID_ISCSI, &init_data);
+				 PROTOCOLID_TCP_ULP, &init_data);
 	if (rc)
 		return rc;
 
@@ -606,13 +606,13 @@ static int qed_sp_iscsi_func_stop(struct qed_hwfn *p_hwfn,
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
 				 ISCSI_RAMROD_CMD_ID_DESTROY_FUNC,
-				 PROTOCOLID_ISCSI, &init_data);
+				 PROTOCOLID_TCP_ULP, &init_data);
 	if (rc)
 		return rc;
 
 	rc = qed_spq_post(p_hwfn, p_ent, NULL);
 
-	qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_ISCSI);
+	qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_TCP_ULP);
 	return rc;
 }
 
@@ -786,7 +786,7 @@ static int qed_iscsi_acquire_connection(struct qed_hwfn *p_hwfn,
 	u32 icid;
 
 	spin_lock_bh(&p_hwfn->p_iscsi_info->lock);
-	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ISCSI, &icid);
+	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_TCP_ULP, &icid);
 	spin_unlock_bh(&p_hwfn->p_iscsi_info->lock);
 	if (rc)
 		return rc;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 49783f365079..286e53927866 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1037,8 +1037,8 @@ static int qed_sp_ll2_tx_queue_start(struct qed_hwfn *p_hwfn,
 	case QED_LL2_TYPE_FCOE:
 		p_ramrod->conn_type = PROTOCOLID_FCOE;
 		break;
-	case QED_LL2_TYPE_ISCSI:
-		p_ramrod->conn_type = PROTOCOLID_ISCSI;
+	case QED_LL2_TYPE_TCP_ULP:
+		p_ramrod->conn_type = PROTOCOLID_TCP_ULP;
 		break;
 	case QED_LL2_TYPE_ROCE:
 		p_ramrod->conn_type = PROTOCOLID_ROCE;
@@ -1048,7 +1048,7 @@ static int qed_sp_ll2_tx_queue_start(struct qed_hwfn *p_hwfn,
 		break;
 	case QED_LL2_TYPE_OOO:
 		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI)
-			p_ramrod->conn_type = PROTOCOLID_ISCSI;
+			p_ramrod->conn_type = PROTOCOLID_TCP_ULP;
 		else
 			p_ramrod->conn_type = PROTOCOLID_IWARP;
 		break;
@@ -2442,7 +2442,7 @@ static int __qed_ll2_start(struct qed_hwfn *p_hwfn,
 		conn_type = QED_LL2_TYPE_FCOE;
 		break;
 	case QED_PCI_ISCSI:
-		conn_type = QED_LL2_TYPE_ISCSI;
+		conn_type = QED_LL2_TYPE_TCP_ULP;
 		break;
 	case QED_PCI_ETH_ROCE:
 		conn_type = QED_LL2_TYPE_ROCE;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ooo.c b/drivers/net/ethernet/qlogic/qed/qed_ooo.c
index 88353aa404dc..599da0d7366b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ooo.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ooo.c
@@ -83,7 +83,7 @@ int qed_ooo_alloc(struct qed_hwfn *p_hwfn)
 
 	switch (p_hwfn->hw_info.personality) {
 	case QED_PCI_ISCSI:
-		proto = PROTOCOLID_ISCSI;
+		proto = PROTOCOLID_TCP_ULP;
 		break;
 	case QED_PCI_ETH_RDMA:
 	case QED_PCI_ETH_IWARP:
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index aa71adcf31ee..ee7dc0a7da6c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -385,7 +385,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		p_ramrod->personality = PERSONALITY_FCOE;
 		break;
 	case QED_PCI_ISCSI:
-		p_ramrod->personality = PERSONALITY_ISCSI;
+		p_ramrod->personality = PERSONALITY_TCP_ULP;
 		break;
 	case QED_PCI_ETH_ROCE:
 	case QED_PCI_ETH_IWARP:
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 977807e1be53..0a3807e927c5 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -702,7 +702,7 @@ enum mf_mode {
 
 /* Per-protocol connection types */
 enum protocol_type {
-	PROTOCOLID_ISCSI,
+	PROTOCOLID_TCP_ULP,
 	PROTOCOLID_FCOE,
 	PROTOCOLID_ROCE,
 	PROTOCOLID_CORE,
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index ea273ba1c991..ff808d248883 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -18,7 +18,7 @@
 
 enum qed_ll2_conn_type {
 	QED_LL2_TYPE_FCOE,
-	QED_LL2_TYPE_ISCSI,
+	QED_LL2_TYPE_TCP_ULP,
 	QED_LL2_TYPE_TEST,
 	QED_LL2_TYPE_OOO,
 	QED_LL2_TYPE_RESERVED2,
-- 
cgit v1.2.3


From 897e87a10c35fb37a20886af6f731748d92c1836 Mon Sep 17 00:00:00 2001
From: Shai Malin <smalin@marvell.com>
Date: Wed, 2 Jun 2021 20:16:50 +0300
Subject: qed: Add NVMeTCP Offload PF Level FW and HW HSI

This patch introduces the NVMeTCP device and PF level HSI and HSI
functionality in order to initialize and interact with the HW device.
The patch also adds qed NVMeTCP personality.

This patch is based on the qede, qedr, qedi, qedf drivers HSI.

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Dean Balandin <dbalandin@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/Kconfig               |   3 +
 drivers/net/ethernet/qlogic/qed/Makefile          |   2 +
 drivers/net/ethernet/qlogic/qed/qed.h             |   6 +
 drivers/net/ethernet/qlogic/qed/qed_cxt.c         |  27 ++-
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |  48 +++-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_ll2.c         |  32 ++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         |   3 +
 drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c     |   3 +-
 drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c     | 266 ++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h     |  51 +++++
 drivers/net/ethernet/qlogic/qed/qed_ooo.c         |   3 +-
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |   2 +
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |   1 +
 include/linux/qed/nvmetcp_common.h                |  54 +++++
 include/linux/qed/qed_if.h                        |  18 ++
 include/linux/qed/qed_nvmetcp_if.h                |  71 ++++++
 17 files changed, 572 insertions(+), 22 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h
 create mode 100644 include/linux/qed/nvmetcp_common.h
 create mode 100644 include/linux/qed/qed_nvmetcp_if.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig
index 6b5ddb07ee83..98f430905ffa 100644
--- a/drivers/net/ethernet/qlogic/Kconfig
+++ b/drivers/net/ethernet/qlogic/Kconfig
@@ -110,6 +110,9 @@ config QED_RDMA
 config QED_ISCSI
 	bool
 
+config QED_NVMETCP
+	bool
+
 config QED_FCOE
 	bool
 
diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index 8251755ec18c..7cb0db67ba5b 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -28,6 +28,8 @@ qed-$(CONFIG_QED_ISCSI) += qed_iscsi.o
 qed-$(CONFIG_QED_LL2) += qed_ll2.o
 qed-$(CONFIG_QED_OOO) += qed_ooo.o
 
+qed-$(CONFIG_QED_NVMETCP) += qed_nvmetcp.o
+
 qed-$(CONFIG_QED_RDMA) +=	\
 	qed_iwarp.o		\
 	qed_rdma.o		\
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index a20cb8a0c377..bc9bdb9d1bb9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -200,6 +200,7 @@ enum qed_pci_personality {
 	QED_PCI_ETH,
 	QED_PCI_FCOE,
 	QED_PCI_ISCSI,
+	QED_PCI_NVMETCP,
 	QED_PCI_ETH_ROCE,
 	QED_PCI_ETH_IWARP,
 	QED_PCI_ETH_RDMA,
@@ -239,6 +240,7 @@ enum QED_FEATURE {
 	QED_PF_L2_QUE,
 	QED_VF,
 	QED_RDMA_CNQ,
+	QED_NVMETCP_CQ,
 	QED_ISCSI_CQ,
 	QED_FCOE_CQ,
 	QED_VF_L2_QUE,
@@ -284,6 +286,8 @@ struct qed_hw_info {
 	((dev)->hw_info.personality == QED_PCI_FCOE)
 #define QED_IS_ISCSI_PERSONALITY(dev)					\
 	((dev)->hw_info.personality == QED_PCI_ISCSI)
+#define QED_IS_NVMETCP_PERSONALITY(dev)					\
+	((dev)->hw_info.personality == QED_PCI_NVMETCP)
 
 	/* Resource Allocation scheme results */
 	u32				resc_start[QED_MAX_RESC];
@@ -592,6 +596,7 @@ struct qed_hwfn {
 	struct qed_ooo_info		*p_ooo_info;
 	struct qed_rdma_info		*p_rdma_info;
 	struct qed_iscsi_info		*p_iscsi_info;
+	struct qed_nvmetcp_info		*p_nvmetcp_info;
 	struct qed_fcoe_info		*p_fcoe_info;
 	struct qed_pf_params		pf_params;
 
@@ -828,6 +833,7 @@ struct qed_dev {
 		struct qed_eth_cb_ops		*eth;
 		struct qed_fcoe_cb_ops		*fcoe;
 		struct qed_iscsi_cb_ops		*iscsi;
+		struct qed_nvmetcp_cb_ops	*nvmetcp;
 	} protocol_ops;
 	void				*ops_cookie;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index fcabbaa518df..5a0a3cbcc1c1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -2072,7 +2072,6 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks)
 						    PROTOCOLID_FCOE,
 						    p_params->num_cons,
 						    0);
-
 			qed_cxt_set_proto_tid_count(p_hwfn, PROTOCOLID_FCOE,
 						    QED_CXT_FCOE_TID_SEG, 0,
 						    p_params->num_tasks, true);
@@ -2093,7 +2092,6 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks)
 						    PROTOCOLID_TCP_ULP,
 						    p_params->num_cons,
 						    0);
-
 			qed_cxt_set_proto_tid_count(p_hwfn,
 						    PROTOCOLID_TCP_ULP,
 						    QED_CXT_TCP_ULP_TID_SEG,
@@ -2106,6 +2104,29 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks)
 		}
 		break;
 	}
+	case QED_PCI_NVMETCP:
+	{
+		struct qed_nvmetcp_pf_params *p_params;
+
+		p_params = &p_hwfn->pf_params.nvmetcp_pf_params;
+
+		if (p_params->num_cons && p_params->num_tasks) {
+			qed_cxt_set_proto_cid_count(p_hwfn,
+						    PROTOCOLID_TCP_ULP,
+						    p_params->num_cons,
+						    0);
+			qed_cxt_set_proto_tid_count(p_hwfn,
+						    PROTOCOLID_TCP_ULP,
+						    QED_CXT_TCP_ULP_TID_SEG,
+						    0,
+						    p_params->num_tasks,
+						    true);
+		} else {
+			DP_INFO(p_hwfn->cdev,
+				"NvmeTCP personality used without setting params!\n");
+		}
+		break;
+	}
 	default:
 		return -EINVAL;
 	}
@@ -2129,6 +2150,7 @@ int qed_cxt_get_tid_mem_info(struct qed_hwfn *p_hwfn,
 		seg = QED_CXT_FCOE_TID_SEG;
 		break;
 	case QED_PCI_ISCSI:
+	case QED_PCI_NVMETCP:
 		proto = PROTOCOLID_TCP_ULP;
 		seg = QED_CXT_TCP_ULP_TID_SEG;
 		break;
@@ -2455,6 +2477,7 @@ int qed_cxt_get_task_ctx(struct qed_hwfn *p_hwfn,
 		seg = QED_CXT_FCOE_TID_SEG;
 		break;
 	case QED_PCI_ISCSI:
+	case QED_PCI_NVMETCP:
 		proto = PROTOCOLID_TCP_ULP;
 		seg = QED_CXT_TCP_ULP_TID_SEG;
 		break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index c231d0e56571..932b892f1ef1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -37,6 +37,7 @@
 #include "qed_sriov.h"
 #include "qed_vf.h"
 #include "qed_rdma.h"
+#include "qed_nvmetcp.h"
 
 static DEFINE_SPINLOCK(qm_lock);
 
@@ -667,7 +668,8 @@ qed_llh_set_engine_affin(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 	}
 
 	/* Storage PF is bound to a single engine while L2 PF uses both */
-	if (QED_IS_FCOE_PERSONALITY(p_hwfn) || QED_IS_ISCSI_PERSONALITY(p_hwfn))
+	if (QED_IS_FCOE_PERSONALITY(p_hwfn) || QED_IS_ISCSI_PERSONALITY(p_hwfn) ||
+	    QED_IS_NVMETCP_PERSONALITY(p_hwfn))
 		eng = cdev->fir_affin ? QED_ENG1 : QED_ENG0;
 	else			/* L2_PERSONALITY */
 		eng = QED_BOTH_ENG;
@@ -1164,6 +1166,9 @@ void qed_llh_remove_mac_filter(struct qed_dev *cdev,
 	if (!test_bit(QED_MF_LLH_MAC_CLSS, &cdev->mf_bits))
 		goto out;
 
+	if (QED_IS_NVMETCP_PERSONALITY(p_hwfn))
+		return;
+
 	ether_addr_copy(filter.mac.addr, mac_addr);
 	rc = qed_llh_shadow_remove_filter(cdev, ppfid, &filter, &filter_idx,
 					  &ref_cnt);
@@ -1381,6 +1386,11 @@ void qed_resc_free(struct qed_dev *cdev)
 			qed_ooo_free(p_hwfn);
 		}
 
+		if (p_hwfn->hw_info.personality == QED_PCI_NVMETCP) {
+			qed_nvmetcp_free(p_hwfn);
+			qed_ooo_free(p_hwfn);
+		}
+
 		if (QED_IS_RDMA_PERSONALITY(p_hwfn) && rdma_info) {
 			qed_spq_unregister_async_cb(p_hwfn, rdma_info->proto);
 			qed_rdma_info_free(p_hwfn);
@@ -1423,6 +1433,7 @@ static u32 qed_get_pq_flags(struct qed_hwfn *p_hwfn)
 		flags |= PQ_FLAGS_OFLD;
 		break;
 	case QED_PCI_ISCSI:
+	case QED_PCI_NVMETCP:
 		flags |= PQ_FLAGS_ACK | PQ_FLAGS_OOO | PQ_FLAGS_OFLD;
 		break;
 	case QED_PCI_ETH_ROCE:
@@ -2263,7 +2274,8 @@ int qed_resc_alloc(struct qed_dev *cdev)
 			 * at the same time
 			 */
 			n_eqes += num_cons + 2 * MAX_NUM_VFS_BB + n_srq;
-		} else if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) {
+		} else if (p_hwfn->hw_info.personality == QED_PCI_ISCSI ||
+			   p_hwfn->hw_info.personality == QED_PCI_NVMETCP) {
 			num_cons =
 			    qed_cxt_get_proto_cid_count(p_hwfn,
 							PROTOCOLID_TCP_ULP,
@@ -2313,6 +2325,15 @@ int qed_resc_alloc(struct qed_dev *cdev)
 				goto alloc_err;
 		}
 
+		if (p_hwfn->hw_info.personality == QED_PCI_NVMETCP) {
+			rc = qed_nvmetcp_alloc(p_hwfn);
+			if (rc)
+				goto alloc_err;
+			rc = qed_ooo_alloc(p_hwfn);
+			if (rc)
+				goto alloc_err;
+		}
+
 		if (QED_IS_RDMA_PERSONALITY(p_hwfn)) {
 			rc = qed_rdma_info_alloc(p_hwfn);
 			if (rc)
@@ -2393,6 +2414,11 @@ void qed_resc_setup(struct qed_dev *cdev)
 			qed_iscsi_setup(p_hwfn);
 			qed_ooo_setup(p_hwfn);
 		}
+
+		if (p_hwfn->hw_info.personality == QED_PCI_NVMETCP) {
+			qed_nvmetcp_setup(p_hwfn);
+			qed_ooo_setup(p_hwfn);
+		}
 	}
 }
 
@@ -2854,7 +2880,8 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 
 	/* Protocol Configuration */
 	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_TCP_RT_OFFSET,
-		     (p_hwfn->hw_info.personality == QED_PCI_ISCSI) ? 1 : 0);
+		     ((p_hwfn->hw_info.personality == QED_PCI_ISCSI) ||
+			 (p_hwfn->hw_info.personality == QED_PCI_NVMETCP)) ? 1 : 0);
 	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_FCOE_RT_OFFSET,
 		     (p_hwfn->hw_info.personality == QED_PCI_FCOE) ? 1 : 0);
 	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_ROCE_RT_OFFSET, 0);
@@ -3535,14 +3562,21 @@ static void qed_hw_set_feat(struct qed_hwfn *p_hwfn)
 		feat_num[QED_ISCSI_CQ] = min_t(u32, sb_cnt.cnt,
 					       RESC_NUM(p_hwfn,
 							QED_CMDQS_CQS));
+
+	if (QED_IS_NVMETCP_PERSONALITY(p_hwfn))
+		feat_num[QED_NVMETCP_CQ] = min_t(u32, sb_cnt.cnt,
+						 RESC_NUM(p_hwfn,
+							  QED_CMDQS_CQS));
+
 	DP_VERBOSE(p_hwfn,
 		   NETIF_MSG_PROBE,
-		   "#PF_L2_QUEUES=%d VF_L2_QUEUES=%d #ROCE_CNQ=%d FCOE_CQ=%d ISCSI_CQ=%d #SBS=%d\n",
+		   "#PF_L2_QUEUES=%d VF_L2_QUEUES=%d #ROCE_CNQ=%d FCOE_CQ=%d ISCSI_CQ=%d NVMETCP_CQ=%d #SBS=%d\n",
 		   (int)FEAT_NUM(p_hwfn, QED_PF_L2_QUE),
 		   (int)FEAT_NUM(p_hwfn, QED_VF_L2_QUE),
 		   (int)FEAT_NUM(p_hwfn, QED_RDMA_CNQ),
 		   (int)FEAT_NUM(p_hwfn, QED_FCOE_CQ),
 		   (int)FEAT_NUM(p_hwfn, QED_ISCSI_CQ),
+		   (int)FEAT_NUM(p_hwfn, QED_NVMETCP_CQ),
 		   (int)sb_cnt.cnt);
 }
 
@@ -3734,7 +3768,8 @@ int qed_hw_get_dflt_resc(struct qed_hwfn *p_hwfn,
 		break;
 	case QED_BDQ:
 		if (p_hwfn->hw_info.personality != QED_PCI_ISCSI &&
-		    p_hwfn->hw_info.personality != QED_PCI_FCOE)
+		    p_hwfn->hw_info.personality != QED_PCI_FCOE &&
+			p_hwfn->hw_info.personality != QED_PCI_NVMETCP)
 			*p_resc_num = 0;
 		else
 			*p_resc_num = 1;
@@ -3755,7 +3790,8 @@ int qed_hw_get_dflt_resc(struct qed_hwfn *p_hwfn,
 			*p_resc_start = 0;
 		else if (p_hwfn->cdev->num_ports_in_engine == 4)
 			*p_resc_start = p_hwfn->port_id;
-		else if (p_hwfn->hw_info.personality == QED_PCI_ISCSI)
+		else if (p_hwfn->hw_info.personality == QED_PCI_ISCSI ||
+			 p_hwfn->hw_info.personality == QED_PCI_NVMETCP)
 			*p_resc_start = p_hwfn->port_id;
 		else if (p_hwfn->hw_info.personality == QED_PCI_FCOE)
 			*p_resc_start = p_hwfn->port_id + 2;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 9dbeb2efdc51..fb1baa2da2d0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -20,6 +20,7 @@
 #include <linux/qed/fcoe_common.h>
 #include <linux/qed/eth_common.h>
 #include <linux/qed/iscsi_common.h>
+#include <linux/qed/nvmetcp_common.h>
 #include <linux/qed/iwarp_common.h>
 #include <linux/qed/rdma_common.h>
 #include <linux/qed/roce_common.h>
@@ -12147,7 +12148,8 @@ struct public_func {
 #define FUNC_MF_CFG_PROTOCOL_ISCSI              0x00000010
 #define FUNC_MF_CFG_PROTOCOL_FCOE               0x00000020
 #define FUNC_MF_CFG_PROTOCOL_ROCE               0x00000030
-#define FUNC_MF_CFG_PROTOCOL_MAX	0x00000030
+#define FUNC_MF_CFG_PROTOCOL_NVMETCP    0x00000040
+#define FUNC_MF_CFG_PROTOCOL_MAX	0x00000040
 
 #define FUNC_MF_CFG_MIN_BW_MASK		0x0000ff00
 #define FUNC_MF_CFG_MIN_BW_SHIFT	8
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 286e53927866..02a4610d9330 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -960,7 +960,8 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn,
 
 	if (test_bit(QED_MF_LL2_NON_UNICAST, &p_hwfn->cdev->mf_bits) &&
 	    p_ramrod->main_func_queue && conn_type != QED_LL2_TYPE_ROCE &&
-	    conn_type != QED_LL2_TYPE_IWARP) {
+	    conn_type != QED_LL2_TYPE_IWARP &&
+		(!QED_IS_NVMETCP_PERSONALITY(p_hwfn))) {
 		p_ramrod->mf_si_bcast_accept_all = 1;
 		p_ramrod->mf_si_mcast_accept_all = 1;
 	} else {
@@ -1047,7 +1048,8 @@ static int qed_sp_ll2_tx_queue_start(struct qed_hwfn *p_hwfn,
 		p_ramrod->conn_type = PROTOCOLID_IWARP;
 		break;
 	case QED_LL2_TYPE_OOO:
-		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI)
+		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI ||
+		    p_hwfn->hw_info.personality == QED_PCI_NVMETCP)
 			p_ramrod->conn_type = PROTOCOLID_TCP_ULP;
 		else
 			p_ramrod->conn_type = PROTOCOLID_IWARP;
@@ -1634,7 +1636,8 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
 	if (rc)
 		goto out;
 
-	if (!QED_IS_RDMA_PERSONALITY(p_hwfn))
+	if (!QED_IS_RDMA_PERSONALITY(p_hwfn) &&
+	    !QED_IS_NVMETCP_PERSONALITY(p_hwfn))
 		qed_wr(p_hwfn, p_ptt, PRS_REG_USE_LIGHT_L2, 1);
 
 	qed_ll2_establish_connection_ooo(p_hwfn, p_ll2_conn);
@@ -2376,7 +2379,8 @@ out:
 static bool qed_ll2_is_storage_eng1(struct qed_dev *cdev)
 {
 	return (QED_IS_FCOE_PERSONALITY(QED_LEADING_HWFN(cdev)) ||
-		QED_IS_ISCSI_PERSONALITY(QED_LEADING_HWFN(cdev))) &&
+		QED_IS_ISCSI_PERSONALITY(QED_LEADING_HWFN(cdev)) ||
+		QED_IS_NVMETCP_PERSONALITY(QED_LEADING_HWFN(cdev))) &&
 		(QED_AFFIN_HWFN(cdev) != QED_LEADING_HWFN(cdev));
 }
 
@@ -2402,11 +2406,13 @@ static int qed_ll2_stop(struct qed_dev *cdev)
 
 	if (cdev->ll2->handle == QED_LL2_UNUSED_HANDLE)
 		return 0;
+	if (!QED_IS_NVMETCP_PERSONALITY(p_hwfn))
+		qed_llh_remove_mac_filter(cdev, 0, cdev->ll2_mac_address);
 
 	qed_llh_remove_mac_filter(cdev, 0, cdev->ll2_mac_address);
 	eth_zero_addr(cdev->ll2_mac_address);
 
-	if (QED_IS_ISCSI_PERSONALITY(p_hwfn))
+	if (QED_IS_ISCSI_PERSONALITY(p_hwfn) || QED_IS_NVMETCP_PERSONALITY(p_hwfn))
 		qed_ll2_stop_ooo(p_hwfn);
 
 	/* In CMT mode, LL2 is always started on engine 0 for a storage PF */
@@ -2442,6 +2448,7 @@ static int __qed_ll2_start(struct qed_hwfn *p_hwfn,
 		conn_type = QED_LL2_TYPE_FCOE;
 		break;
 	case QED_PCI_ISCSI:
+	case QED_PCI_NVMETCP:
 		conn_type = QED_LL2_TYPE_TCP_ULP;
 		break;
 	case QED_PCI_ETH_ROCE:
@@ -2567,7 +2574,7 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 		}
 	}
 
-	if (QED_IS_ISCSI_PERSONALITY(p_hwfn)) {
+	if (QED_IS_ISCSI_PERSONALITY(p_hwfn) || QED_IS_NVMETCP_PERSONALITY(p_hwfn)) {
 		DP_VERBOSE(cdev, QED_MSG_STORAGE, "Starting OOO LL2 queue\n");
 		rc = qed_ll2_start_ooo(p_hwfn, params);
 		if (rc) {
@@ -2576,10 +2583,13 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 		}
 	}
 
-	rc = qed_llh_add_mac_filter(cdev, 0, params->ll2_mac_address);
-	if (rc) {
-		DP_NOTICE(cdev, "Failed to add an LLH filter\n");
-		goto err3;
+	if (!QED_IS_NVMETCP_PERSONALITY(p_hwfn)) {
+		rc = qed_llh_add_mac_filter(cdev, 0, params->ll2_mac_address);
+		if (rc) {
+			DP_NOTICE(cdev, "Failed to add an LLH filter\n");
+			goto err3;
+		}
+
 	}
 
 	ether_addr_copy(cdev->ll2_mac_address, params->ll2_mac_address);
@@ -2587,7 +2597,7 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 	return 0;
 
 err3:
-	if (QED_IS_ISCSI_PERSONALITY(p_hwfn))
+	if (QED_IS_ISCSI_PERSONALITY(p_hwfn) || QED_IS_NVMETCP_PERSONALITY(p_hwfn))
 		qed_ll2_stop_ooo(p_hwfn);
 err2:
 	if (b_is_storage_eng1)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index cd882c453394..4387292c37e2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -2446,6 +2446,9 @@ qed_mcp_get_shmem_proto(struct qed_hwfn *p_hwfn,
 	case FUNC_MF_CFG_PROTOCOL_ISCSI:
 		*p_proto = QED_PCI_ISCSI;
 		break;
+	case FUNC_MF_CFG_PROTOCOL_NVMETCP:
+		*p_proto = QED_PCI_NVMETCP;
+		break;
 	case FUNC_MF_CFG_PROTOCOL_FCOE:
 		*p_proto = QED_PCI_FCOE;
 		break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
index 3e3192a3ad9b..6190adf965bc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
@@ -1306,7 +1306,8 @@ int qed_mfw_process_tlv_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 	}
 
 	if ((tlv_group & QED_MFW_TLV_ISCSI) &&
-	    p_hwfn->hw_info.personality != QED_PCI_ISCSI) {
+	    p_hwfn->hw_info.personality != QED_PCI_ISCSI &&
+		p_hwfn->hw_info.personality != QED_PCI_NVMETCP) {
 		DP_VERBOSE(p_hwfn, QED_MSG_SP,
 			   "Skipping iSCSI TLVs for non-iSCSI function\n");
 		tlv_group &= ~QED_MFW_TLV_ISCSI;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
new file mode 100644
index 000000000000..cb9c71109b2d
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+/* Copyright 2021 Marvell. All rights reserved. */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <asm/param.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/qed/qed_nvmetcp_if.h>
+#include "qed.h"
+#include "qed_cxt.h"
+#include "qed_dev_api.h"
+#include "qed_hsi.h"
+#include "qed_hw.h"
+#include "qed_int.h"
+#include "qed_nvmetcp.h"
+#include "qed_ll2.h"
+#include "qed_mcp.h"
+#include "qed_sp.h"
+#include "qed_reg_addr.h"
+
+static int qed_nvmetcp_async_event(struct qed_hwfn *p_hwfn, u8 fw_event_code,
+				   u16 echo, union event_ring_data *data,
+				   u8 fw_return_code)
+{
+	if (p_hwfn->p_nvmetcp_info->event_cb) {
+		struct qed_nvmetcp_info *p_nvmetcp = p_hwfn->p_nvmetcp_info;
+
+		return p_nvmetcp->event_cb(p_nvmetcp->event_context,
+					 fw_event_code, data);
+	} else {
+		DP_NOTICE(p_hwfn, "nvmetcp async completion is not set\n");
+
+		return -EINVAL;
+	}
+}
+
+static int qed_sp_nvmetcp_func_start(struct qed_hwfn *p_hwfn,
+				     enum spq_mode comp_mode,
+				     struct qed_spq_comp_cb *p_comp_addr,
+				     void *event_context,
+				     nvmetcp_event_cb_t async_event_cb)
+{
+	struct nvmetcp_init_ramrod_params *p_ramrod = NULL;
+	struct qed_nvmetcp_pf_params *p_params = NULL;
+	struct scsi_init_func_queues *p_queue = NULL;
+	struct nvmetcp_spe_func_init *p_init = NULL;
+	struct qed_sp_init_data init_data = {};
+	struct qed_spq_entry *p_ent = NULL;
+	int rc = 0;
+	u16 val;
+	u8 i;
+
+	/* Get SPQ entry */
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 NVMETCP_RAMROD_CMD_ID_INIT_FUNC,
+				 PROTOCOLID_TCP_ULP, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.nvmetcp_init;
+	p_init = &p_ramrod->nvmetcp_init_spe;
+	p_params = &p_hwfn->pf_params.nvmetcp_pf_params;
+	p_queue = &p_init->q_params;
+	p_init->num_sq_pages_in_ring = p_params->num_sq_pages_in_ring;
+	p_init->num_r2tq_pages_in_ring = p_params->num_r2tq_pages_in_ring;
+	p_init->num_uhq_pages_in_ring = p_params->num_uhq_pages_in_ring;
+	p_init->ll2_rx_queue_id = RESC_START(p_hwfn, QED_LL2_RAM_QUEUE) +
+					p_params->ll2_ooo_queue_id;
+	SET_FIELD(p_init->flags, NVMETCP_SPE_FUNC_INIT_NVMETCP_MODE, 1);
+	p_init->func_params.log_page_size = ilog2(PAGE_SIZE);
+	p_init->func_params.num_tasks = cpu_to_le16(p_params->num_tasks);
+	p_init->debug_flags = p_params->debug_mode;
+	DMA_REGPAIR_LE(p_queue->glbl_q_params_addr,
+		       p_params->glbl_q_params_addr);
+	p_queue->cq_num_entries = cpu_to_le16(QED_NVMETCP_FW_CQ_SIZE);
+	p_queue->num_queues = p_params->num_queues;
+	val = RESC_START(p_hwfn, QED_CMDQS_CQS);
+	p_queue->queue_relative_offset = cpu_to_le16((u16)val);
+	p_queue->cq_sb_pi = p_params->gl_rq_pi;
+
+	for (i = 0; i < p_params->num_queues; i++) {
+		val = qed_get_igu_sb_id(p_hwfn, i);
+		p_queue->cq_cmdq_sb_num_arr[i] = cpu_to_le16(val);
+	}
+
+	SET_FIELD(p_queue->q_validity,
+		  SCSI_INIT_FUNC_QUEUES_CMD_VALID, 0);
+	p_queue->cmdq_num_entries = 0;
+	p_queue->bdq_resource_id = (u8)RESC_START(p_hwfn, QED_BDQ);
+	p_ramrod->tcp_init.two_msl_timer = cpu_to_le32(QED_TCP_TWO_MSL_TIMER);
+	p_ramrod->tcp_init.tx_sws_timer = cpu_to_le16(QED_TCP_SWS_TIMER);
+	p_init->half_way_close_timeout = cpu_to_le16(QED_TCP_HALF_WAY_CLOSE_TIMEOUT);
+	p_ramrod->tcp_init.max_fin_rt = QED_TCP_MAX_FIN_RT;
+	SET_FIELD(p_ramrod->nvmetcp_init_spe.params,
+		  NVMETCP_SPE_FUNC_INIT_MAX_SYN_RT, QED_TCP_MAX_FIN_RT);
+	p_hwfn->p_nvmetcp_info->event_context = event_context;
+	p_hwfn->p_nvmetcp_info->event_cb = async_event_cb;
+	qed_spq_register_async_cb(p_hwfn, PROTOCOLID_TCP_ULP,
+				  qed_nvmetcp_async_event);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+static int qed_sp_nvmetcp_func_stop(struct qed_hwfn *p_hwfn,
+				    enum spq_mode comp_mode,
+				    struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 NVMETCP_RAMROD_CMD_ID_DESTROY_FUNC,
+				 PROTOCOLID_TCP_ULP, &init_data);
+	if (rc)
+		return rc;
+
+	rc = qed_spq_post(p_hwfn, p_ent, NULL);
+	qed_spq_unregister_async_cb(p_hwfn, PROTOCOLID_TCP_ULP);
+
+	return rc;
+}
+
+static int qed_fill_nvmetcp_dev_info(struct qed_dev *cdev,
+				     struct qed_dev_nvmetcp_info *info)
+{
+	struct qed_hwfn *hwfn = QED_AFFIN_HWFN(cdev);
+	int rc;
+
+	memset(info, 0, sizeof(*info));
+	rc = qed_fill_dev_info(cdev, &info->common);
+	info->port_id = MFW_PORT(hwfn);
+	info->num_cqs = FEAT_NUM(hwfn, QED_NVMETCP_CQ);
+
+	return rc;
+}
+
+static void qed_register_nvmetcp_ops(struct qed_dev *cdev,
+				     struct qed_nvmetcp_cb_ops *ops,
+				     void *cookie)
+{
+	cdev->protocol_ops.nvmetcp = ops;
+	cdev->ops_cookie = cookie;
+}
+
+static int qed_nvmetcp_stop(struct qed_dev *cdev)
+{
+	int rc;
+
+	if (!(cdev->flags & QED_FLAG_STORAGE_STARTED)) {
+		DP_NOTICE(cdev, "nvmetcp already stopped\n");
+
+		return 0;
+	}
+
+	if (!hash_empty(cdev->connections)) {
+		DP_NOTICE(cdev,
+			  "Can't stop nvmetcp - not all connections were returned\n");
+
+		return -EINVAL;
+	}
+
+	/* Stop the nvmetcp */
+	rc = qed_sp_nvmetcp_func_stop(QED_AFFIN_HWFN(cdev), QED_SPQ_MODE_EBLOCK,
+				      NULL);
+	cdev->flags &= ~QED_FLAG_STORAGE_STARTED;
+
+	return rc;
+}
+
+static int qed_nvmetcp_start(struct qed_dev *cdev,
+			     struct qed_nvmetcp_tid *tasks,
+			     void *event_context,
+			     nvmetcp_event_cb_t async_event_cb)
+{
+	struct qed_tid_mem *tid_info;
+	int rc;
+
+	if (cdev->flags & QED_FLAG_STORAGE_STARTED) {
+		DP_NOTICE(cdev, "nvmetcp already started;\n");
+
+		return 0;
+	}
+
+	rc = qed_sp_nvmetcp_func_start(QED_AFFIN_HWFN(cdev),
+				       QED_SPQ_MODE_EBLOCK, NULL,
+				       event_context, async_event_cb);
+	if (rc) {
+		DP_NOTICE(cdev, "Failed to start nvmetcp\n");
+
+		return rc;
+	}
+
+	cdev->flags |= QED_FLAG_STORAGE_STARTED;
+	hash_init(cdev->connections);
+
+	if (!tasks)
+		return 0;
+
+	tid_info = kzalloc(sizeof(*tid_info), GFP_KERNEL);
+	if (!tid_info) {
+		qed_nvmetcp_stop(cdev);
+
+		return -ENOMEM;
+	}
+
+	rc = qed_cxt_get_tid_mem_info(QED_AFFIN_HWFN(cdev), tid_info);
+	if (rc) {
+		DP_NOTICE(cdev, "Failed to gather task information\n");
+		qed_nvmetcp_stop(cdev);
+		kfree(tid_info);
+
+		return rc;
+	}
+
+	/* Fill task information */
+	tasks->size = tid_info->tid_size;
+	tasks->num_tids_per_block = tid_info->num_tids_per_block;
+	memcpy(tasks->blocks, tid_info->blocks,
+	       MAX_TID_BLOCKS_NVMETCP * sizeof(u8 *));
+	kfree(tid_info);
+
+	return 0;
+}
+
+static const struct qed_nvmetcp_ops qed_nvmetcp_ops_pass = {
+	.common = &qed_common_ops_pass,
+	.ll2 = &qed_ll2_ops_pass,
+	.fill_dev_info = &qed_fill_nvmetcp_dev_info,
+	.register_ops = &qed_register_nvmetcp_ops,
+	.start = &qed_nvmetcp_start,
+	.stop = &qed_nvmetcp_stop,
+
+	/* Placeholder - Connection level ops */
+};
+
+const struct qed_nvmetcp_ops *qed_get_nvmetcp_ops(void)
+{
+	return &qed_nvmetcp_ops_pass;
+}
+EXPORT_SYMBOL(qed_get_nvmetcp_ops);
+
+void qed_put_nvmetcp_ops(void)
+{
+}
+EXPORT_SYMBOL(qed_put_nvmetcp_ops);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h
new file mode 100644
index 000000000000..774b46ade408
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/* Copyright 2021 Marvell. All rights reserved. */
+
+#ifndef _QED_NVMETCP_H
+#define _QED_NVMETCP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/qed/tcp_common.h>
+#include <linux/qed/qed_nvmetcp_if.h>
+#include <linux/qed/qed_chain.h>
+#include "qed.h"
+#include "qed_hsi.h"
+#include "qed_mcp.h"
+#include "qed_sp.h"
+
+#define QED_NVMETCP_FW_CQ_SIZE (4 * 1024)
+
+/* tcp parameters */
+#define QED_TCP_TWO_MSL_TIMER 4000
+#define QED_TCP_HALF_WAY_CLOSE_TIMEOUT 10
+#define QED_TCP_MAX_FIN_RT 2
+#define QED_TCP_SWS_TIMER 5000
+
+struct qed_nvmetcp_info {
+	spinlock_t lock; /* Connection resources. */
+	struct list_head free_list;
+	u16 max_num_outstanding_tasks;
+	void *event_context;
+	nvmetcp_event_cb_t event_cb;
+};
+
+#if IS_ENABLED(CONFIG_QED_NVMETCP)
+int qed_nvmetcp_alloc(struct qed_hwfn *p_hwfn);
+void qed_nvmetcp_setup(struct qed_hwfn *p_hwfn);
+void qed_nvmetcp_free(struct qed_hwfn *p_hwfn);
+
+#else /* IS_ENABLED(CONFIG_QED_NVMETCP) */
+static inline int qed_nvmetcp_alloc(struct qed_hwfn *p_hwfn)
+{
+	return -EINVAL;
+}
+
+static inline void qed_nvmetcp_setup(struct qed_hwfn *p_hwfn) {}
+static inline void qed_nvmetcp_free(struct qed_hwfn *p_hwfn) {}
+
+#endif /* IS_ENABLED(CONFIG_QED_NVMETCP) */
+
+#endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ooo.c b/drivers/net/ethernet/qlogic/qed/qed_ooo.c
index 599da0d7366b..b8c5641b29a8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ooo.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ooo.c
@@ -16,7 +16,7 @@
 #include "qed_ll2.h"
 #include "qed_ooo.h"
 #include "qed_cxt.h"
-
+#include "qed_nvmetcp.h"
 static struct qed_ooo_archipelago
 *qed_ooo_seek_archipelago(struct qed_hwfn *p_hwfn,
 			  struct qed_ooo_info
@@ -83,6 +83,7 @@ int qed_ooo_alloc(struct qed_hwfn *p_hwfn)
 
 	switch (p_hwfn->hw_info.personality) {
 	case QED_PCI_ISCSI:
+	case QED_PCI_NVMETCP:
 		proto = PROTOCOLID_TCP_ULP;
 		break;
 	case QED_PCI_ETH_RDMA:
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 993f1357b6fc..525159e747a5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -100,6 +100,8 @@ union ramrod_data {
 	struct iscsi_spe_conn_mac_update iscsi_conn_mac_update;
 	struct iscsi_spe_conn_termination iscsi_conn_terminate;
 
+	struct nvmetcp_init_ramrod_params nvmetcp_init;
+
 	struct vf_start_ramrod_data vf_start;
 	struct vf_stop_ramrod_data vf_stop;
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index ee7dc0a7da6c..b4ed54ffef9b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -385,6 +385,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		p_ramrod->personality = PERSONALITY_FCOE;
 		break;
 	case QED_PCI_ISCSI:
+	case QED_PCI_NVMETCP:
 		p_ramrod->personality = PERSONALITY_TCP_ULP;
 		break;
 	case QED_PCI_ETH_ROCE:
diff --git a/include/linux/qed/nvmetcp_common.h b/include/linux/qed/nvmetcp_common.h
new file mode 100644
index 000000000000..e9ccfc07041d
--- /dev/null
+++ b/include/linux/qed/nvmetcp_common.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/* Copyright 2021 Marvell. All rights reserved. */
+
+#ifndef __NVMETCP_COMMON__
+#define __NVMETCP_COMMON__
+
+#include "tcp_common.h"
+
+/* NVMeTCP firmware function init parameters */
+struct nvmetcp_spe_func_init {
+	__le16 half_way_close_timeout;
+	u8 num_sq_pages_in_ring;
+	u8 num_r2tq_pages_in_ring;
+	u8 num_uhq_pages_in_ring;
+	u8 ll2_rx_queue_id;
+	u8 flags;
+#define NVMETCP_SPE_FUNC_INIT_COUNTERS_EN_MASK 0x1
+#define NVMETCP_SPE_FUNC_INIT_COUNTERS_EN_SHIFT 0
+#define NVMETCP_SPE_FUNC_INIT_NVMETCP_MODE_MASK 0x1
+#define NVMETCP_SPE_FUNC_INIT_NVMETCP_MODE_SHIFT 1
+#define NVMETCP_SPE_FUNC_INIT_RESERVED0_MASK 0x3F
+#define NVMETCP_SPE_FUNC_INIT_RESERVED0_SHIFT 2
+	u8 debug_flags;
+	__le16 reserved1;
+	u8 params;
+#define NVMETCP_SPE_FUNC_INIT_MAX_SYN_RT_MASK	0xF
+#define NVMETCP_SPE_FUNC_INIT_MAX_SYN_RT_SHIFT	0
+#define NVMETCP_SPE_FUNC_INIT_RESERVED1_MASK	0xF
+#define NVMETCP_SPE_FUNC_INIT_RESERVED1_SHIFT	4
+	u8 reserved2[5];
+	struct scsi_init_func_params func_params;
+	struct scsi_init_func_queues q_params;
+};
+
+/* NVMeTCP init params passed by driver to FW in NVMeTCP init ramrod. */
+struct nvmetcp_init_ramrod_params {
+	struct nvmetcp_spe_func_init nvmetcp_init_spe;
+	struct tcp_init_params tcp_init;
+};
+
+/* NVMeTCP Ramrod Command IDs */
+enum nvmetcp_ramrod_cmd_id {
+	NVMETCP_RAMROD_CMD_ID_UNUSED = 0,
+	NVMETCP_RAMROD_CMD_ID_INIT_FUNC = 1,
+	NVMETCP_RAMROD_CMD_ID_DESTROY_FUNC = 2,
+	MAX_NVMETCP_RAMROD_CMD_ID
+};
+
+struct nvmetcp_glbl_queue_entry {
+	struct regpair cq_pbl_addr;
+	struct regpair reserved;
+};
+
+#endif /* __NVMETCP_COMMON__ */
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 68d17a4fbf20..850b98991670 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -542,6 +542,22 @@ struct qed_iscsi_pf_params {
 	u8 bdq_pbl_num_entries[3];
 };
 
+struct qed_nvmetcp_pf_params {
+	u64 glbl_q_params_addr;
+	u16 cq_num_entries;
+	u16 num_cons;
+	u16 num_tasks;
+	u8 num_sq_pages_in_ring;
+	u8 num_r2tq_pages_in_ring;
+	u8 num_uhq_pages_in_ring;
+	u8 num_queues;
+	u8 gl_rq_pi;
+	u8 gl_cmd_pi;
+	u8 debug_mode;
+	u8 ll2_ooo_queue_id;
+	u16 min_rto;
+};
+
 struct qed_rdma_pf_params {
 	/* Supplied to QED during resource allocation (may affect the ILT and
 	 * the doorbell BAR).
@@ -560,6 +576,7 @@ struct qed_pf_params {
 	struct qed_eth_pf_params eth_pf_params;
 	struct qed_fcoe_pf_params fcoe_pf_params;
 	struct qed_iscsi_pf_params iscsi_pf_params;
+	struct qed_nvmetcp_pf_params nvmetcp_pf_params;
 	struct qed_rdma_pf_params rdma_pf_params;
 };
 
@@ -662,6 +679,7 @@ enum qed_sb_type {
 enum qed_protocol {
 	QED_PROTOCOL_ETH,
 	QED_PROTOCOL_ISCSI,
+	QED_PROTOCOL_NVMETCP = QED_PROTOCOL_ISCSI,
 	QED_PROTOCOL_FCOE,
 };
 
diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h
new file mode 100644
index 000000000000..76868bdf0883
--- /dev/null
+++ b/include/linux/qed/qed_nvmetcp_if.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/* Copyright 2021 Marvell. All rights reserved. */
+
+#ifndef _QED_NVMETCP_IF_H
+#define _QED_NVMETCP_IF_H
+#include <linux/types.h>
+#include <linux/qed/qed_if.h>
+
+#define QED_NVMETCP_MAX_IO_SIZE	0x800000
+
+typedef int (*nvmetcp_event_cb_t) (void *context,
+				   u8 fw_event_code, void *fw_handle);
+
+struct qed_dev_nvmetcp_info {
+	struct qed_dev_info common;
+	u8 port_id;  /* Physical port */
+	u8 num_cqs;
+};
+
+#define MAX_TID_BLOCKS_NVMETCP (512)
+struct qed_nvmetcp_tid {
+	u32 size;		/* In bytes per task */
+	u32 num_tids_per_block;
+	u8 *blocks[MAX_TID_BLOCKS_NVMETCP];
+};
+
+struct qed_nvmetcp_cb_ops {
+	struct qed_common_cb_ops common;
+};
+
+/**
+ * struct qed_nvmetcp_ops - qed NVMeTCP operations.
+ * @common:		common operations pointer
+ * @ll2:		light L2 operations pointer
+ * @fill_dev_info:	fills NVMeTCP specific information
+ *			@param cdev
+ *			@param info
+ *			@return 0 on success, otherwise error value.
+ * @register_ops:	register nvmetcp operations
+ *			@param cdev
+ *			@param ops - specified using qed_nvmetcp_cb_ops
+ *			@param cookie - driver private
+ * @start:		nvmetcp in FW
+ *			@param cdev
+ *			@param tasks - qed will fill information about tasks
+ *			return 0 on success, otherwise error value.
+ * @stop:		nvmetcp in FW
+ *			@param cdev
+ *			return 0 on success, otherwise error value.
+ */
+struct qed_nvmetcp_ops {
+	const struct qed_common_ops *common;
+
+	const struct qed_ll2_ops *ll2;
+
+	int (*fill_dev_info)(struct qed_dev *cdev,
+			     struct qed_dev_nvmetcp_info *info);
+
+	void (*register_ops)(struct qed_dev *cdev,
+			     struct qed_nvmetcp_cb_ops *ops, void *cookie);
+
+	int (*start)(struct qed_dev *cdev,
+		     struct qed_nvmetcp_tid *tasks,
+		     void *event_context, nvmetcp_event_cb_t async_event_cb);
+
+	int (*stop)(struct qed_dev *cdev);
+};
+
+const struct qed_nvmetcp_ops *qed_get_nvmetcp_ops(void);
+void qed_put_nvmetcp_ops(void);
+#endif
-- 
cgit v1.2.3


From 76684ab8f4f95394df6a752cee37b197b4c8732b Mon Sep 17 00:00:00 2001
From: Shai Malin <smalin@marvell.com>
Date: Wed, 2 Jun 2021 20:16:51 +0300
Subject: qed: Add NVMeTCP Offload Connection Level FW and HW HSI

This patch introduces the NVMeTCP HSI and HSI functionality in order to
initialize and interact with the HW device as part of the connection level
HSI.

This includes:
- Connection offload: offload a TCP connection to the FW.
- Connection update: update the ICReq-ICResp params
- Connection clear SQ: outstanding IOs FW flush.
- Connection termination: terminate the TCP connection and flush the FW.

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c | 557 +++++++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h |  52 +++
 drivers/net/ethernet/qlogic/qed/qed_sp.h      |   3 +
 include/linux/qed/nvmetcp_common.h            | 143 +++++++
 include/linux/qed/qed_nvmetcp_if.h            |  94 +++++
 5 files changed, 847 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
index cb9c71109b2d..7943804e88cd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
@@ -243,6 +243,555 @@ static int qed_nvmetcp_start(struct qed_dev *cdev,
 	return 0;
 }
 
+static struct qed_hash_nvmetcp_con *qed_nvmetcp_get_hash(struct qed_dev *cdev,
+							 u32 handle)
+{
+	struct qed_hash_nvmetcp_con *hash_con = NULL;
+
+	if (!(cdev->flags & QED_FLAG_STORAGE_STARTED))
+		return NULL;
+
+	hash_for_each_possible(cdev->connections, hash_con, node, handle) {
+		if (hash_con->con->icid == handle)
+			break;
+	}
+
+	if (!hash_con || hash_con->con->icid != handle)
+		return NULL;
+
+	return hash_con;
+}
+
+static int qed_sp_nvmetcp_conn_offload(struct qed_hwfn *p_hwfn,
+				       struct qed_nvmetcp_conn *p_conn,
+				       enum spq_mode comp_mode,
+				       struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct nvmetcp_spe_conn_offload *p_ramrod = NULL;
+	struct tcp_offload_params_opt2 *p_tcp = NULL;
+	struct qed_sp_init_data init_data = { 0 };
+	struct qed_spq_entry *p_ent = NULL;
+	dma_addr_t r2tq_pbl_addr;
+	dma_addr_t xhq_pbl_addr;
+	dma_addr_t uhq_pbl_addr;
+	u16 physical_q;
+	int rc = 0;
+	u8 i;
+
+	/* Get SPQ entry */
+	init_data.cid = p_conn->icid;
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 NVMETCP_RAMROD_CMD_ID_OFFLOAD_CONN,
+				 PROTOCOLID_TCP_ULP, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.nvmetcp_conn_offload;
+
+	/* Transmission PQ is the first of the PF */
+	physical_q = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_OFLD);
+	p_conn->physical_q0 = cpu_to_le16(physical_q);
+	p_ramrod->nvmetcp.physical_q0 = cpu_to_le16(physical_q);
+
+	/* nvmetcp Pure-ACK PQ */
+	physical_q = qed_get_cm_pq_idx(p_hwfn, PQ_FLAGS_ACK);
+	p_conn->physical_q1 = cpu_to_le16(physical_q);
+	p_ramrod->nvmetcp.physical_q1 = cpu_to_le16(physical_q);
+	p_ramrod->conn_id = cpu_to_le16(p_conn->conn_id);
+	DMA_REGPAIR_LE(p_ramrod->nvmetcp.sq_pbl_addr, p_conn->sq_pbl_addr);
+	r2tq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->r2tq);
+	DMA_REGPAIR_LE(p_ramrod->nvmetcp.r2tq_pbl_addr, r2tq_pbl_addr);
+	xhq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->xhq);
+	DMA_REGPAIR_LE(p_ramrod->nvmetcp.xhq_pbl_addr, xhq_pbl_addr);
+	uhq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->uhq);
+	DMA_REGPAIR_LE(p_ramrod->nvmetcp.uhq_pbl_addr, uhq_pbl_addr);
+	p_ramrod->nvmetcp.flags = p_conn->offl_flags;
+	p_ramrod->nvmetcp.default_cq = p_conn->default_cq;
+	p_ramrod->nvmetcp.initial_ack = 0;
+	DMA_REGPAIR_LE(p_ramrod->nvmetcp.nvmetcp.cccid_itid_table_addr,
+		       p_conn->nvmetcp_cccid_itid_table_addr);
+	p_ramrod->nvmetcp.nvmetcp.cccid_max_range =
+		 cpu_to_le16(p_conn->nvmetcp_cccid_max_range);
+	p_tcp = &p_ramrod->tcp;
+	qed_set_fw_mac_addr(&p_tcp->remote_mac_addr_hi,
+			    &p_tcp->remote_mac_addr_mid,
+			    &p_tcp->remote_mac_addr_lo, p_conn->remote_mac);
+	qed_set_fw_mac_addr(&p_tcp->local_mac_addr_hi,
+			    &p_tcp->local_mac_addr_mid,
+			    &p_tcp->local_mac_addr_lo, p_conn->local_mac);
+	p_tcp->vlan_id = cpu_to_le16(p_conn->vlan_id);
+	p_tcp->flags = cpu_to_le16(p_conn->tcp_flags);
+	p_tcp->ip_version = p_conn->ip_version;
+	if (p_tcp->ip_version == TCP_IPV6) {
+		for (i = 0; i < 4; i++) {
+			p_tcp->remote_ip[i] = cpu_to_le32(p_conn->remote_ip[i]);
+			p_tcp->local_ip[i] = cpu_to_le32(p_conn->local_ip[i]);
+		}
+	} else {
+		p_tcp->remote_ip[0] = cpu_to_le32(p_conn->remote_ip[0]);
+		p_tcp->local_ip[0] = cpu_to_le32(p_conn->local_ip[0]);
+	}
+
+	p_tcp->flow_label = cpu_to_le32(p_conn->flow_label);
+	p_tcp->ttl = p_conn->ttl;
+	p_tcp->tos_or_tc = p_conn->tos_or_tc;
+	p_tcp->remote_port = cpu_to_le16(p_conn->remote_port);
+	p_tcp->local_port = cpu_to_le16(p_conn->local_port);
+	p_tcp->mss = cpu_to_le16(p_conn->mss);
+	p_tcp->rcv_wnd_scale = p_conn->rcv_wnd_scale;
+	p_tcp->connect_mode = p_conn->connect_mode;
+	p_tcp->cwnd = cpu_to_le32(p_conn->cwnd);
+	p_tcp->ka_max_probe_cnt = p_conn->ka_max_probe_cnt;
+	p_tcp->ka_timeout = cpu_to_le32(p_conn->ka_timeout);
+	p_tcp->max_rt_time = cpu_to_le32(p_conn->max_rt_time);
+	p_tcp->ka_interval = cpu_to_le32(p_conn->ka_interval);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+static int qed_sp_nvmetcp_conn_update(struct qed_hwfn *p_hwfn,
+				      struct qed_nvmetcp_conn *p_conn,
+				      enum spq_mode comp_mode,
+				      struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct nvmetcp_conn_update_ramrod_params *p_ramrod = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+	u32 dval;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = p_conn->icid;
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 NVMETCP_RAMROD_CMD_ID_UPDATE_CONN,
+				 PROTOCOLID_TCP_ULP, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.nvmetcp_conn_update;
+	p_ramrod->conn_id = cpu_to_le16(p_conn->conn_id);
+	p_ramrod->flags = p_conn->update_flag;
+	p_ramrod->max_seq_size = cpu_to_le32(p_conn->max_seq_size);
+	dval = p_conn->max_recv_pdu_length;
+	p_ramrod->max_recv_pdu_length = cpu_to_le32(dval);
+	dval = p_conn->max_send_pdu_length;
+	p_ramrod->max_send_pdu_length = cpu_to_le32(dval);
+	p_ramrod->first_seq_length = cpu_to_le32(p_conn->first_seq_length);
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+static int qed_sp_nvmetcp_conn_terminate(struct qed_hwfn *p_hwfn,
+					 struct qed_nvmetcp_conn *p_conn,
+					 enum spq_mode comp_mode,
+					 struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct nvmetcp_spe_conn_termination *p_ramrod = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = p_conn->icid;
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 NVMETCP_RAMROD_CMD_ID_TERMINATION_CONN,
+				 PROTOCOLID_TCP_ULP, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.nvmetcp_conn_terminate;
+	p_ramrod->conn_id = cpu_to_le16(p_conn->conn_id);
+	p_ramrod->abortive = p_conn->abortive_dsconnect;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+static int qed_sp_nvmetcp_conn_clear_sq(struct qed_hwfn *p_hwfn,
+					struct qed_nvmetcp_conn *p_conn,
+					enum spq_mode comp_mode,
+					struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EINVAL;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = p_conn->icid;
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 NVMETCP_RAMROD_CMD_ID_CLEAR_SQ,
+				 PROTOCOLID_TCP_ULP, &init_data);
+	if (rc)
+		return rc;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+static void __iomem *qed_nvmetcp_get_db_addr(struct qed_hwfn *p_hwfn, u32 cid)
+{
+	return (u8 __iomem *)p_hwfn->doorbells +
+			     qed_db_addr(cid, DQ_DEMS_LEGACY);
+}
+
+static int qed_nvmetcp_allocate_connection(struct qed_hwfn *p_hwfn,
+					   struct qed_nvmetcp_conn **p_out_conn)
+{
+	struct qed_chain_init_params params = {
+		.mode		= QED_CHAIN_MODE_PBL,
+		.intended_use	= QED_CHAIN_USE_TO_CONSUME_PRODUCE,
+		.cnt_type	= QED_CHAIN_CNT_TYPE_U16,
+	};
+	struct qed_nvmetcp_pf_params *p_params = NULL;
+	struct qed_nvmetcp_conn *p_conn = NULL;
+	int rc = 0;
+
+	/* Try finding a free connection that can be used */
+	spin_lock_bh(&p_hwfn->p_nvmetcp_info->lock);
+	if (!list_empty(&p_hwfn->p_nvmetcp_info->free_list))
+		p_conn = list_first_entry(&p_hwfn->p_nvmetcp_info->free_list,
+					  struct qed_nvmetcp_conn, list_entry);
+	if (p_conn) {
+		list_del(&p_conn->list_entry);
+		spin_unlock_bh(&p_hwfn->p_nvmetcp_info->lock);
+		*p_out_conn = p_conn;
+
+		return 0;
+	}
+	spin_unlock_bh(&p_hwfn->p_nvmetcp_info->lock);
+
+	/* Need to allocate a new connection */
+	p_params = &p_hwfn->pf_params.nvmetcp_pf_params;
+	p_conn = kzalloc(sizeof(*p_conn), GFP_KERNEL);
+	if (!p_conn)
+		return -ENOMEM;
+
+	params.num_elems = p_params->num_r2tq_pages_in_ring *
+			   QED_CHAIN_PAGE_SIZE / sizeof(struct nvmetcp_wqe);
+	params.elem_size = sizeof(struct nvmetcp_wqe);
+	rc = qed_chain_alloc(p_hwfn->cdev, &p_conn->r2tq, &params);
+	if (rc)
+		goto nomem_r2tq;
+
+	params.num_elems = p_params->num_uhq_pages_in_ring *
+			   QED_CHAIN_PAGE_SIZE / sizeof(struct iscsi_uhqe);
+	params.elem_size = sizeof(struct iscsi_uhqe);
+	rc = qed_chain_alloc(p_hwfn->cdev, &p_conn->uhq, &params);
+	if (rc)
+		goto nomem_uhq;
+
+	params.elem_size = sizeof(struct iscsi_xhqe);
+	rc = qed_chain_alloc(p_hwfn->cdev, &p_conn->xhq, &params);
+	if (rc)
+		goto nomem;
+
+	p_conn->free_on_delete = true;
+	*p_out_conn = p_conn;
+
+	return 0;
+
+nomem:
+	qed_chain_free(p_hwfn->cdev, &p_conn->uhq);
+nomem_uhq:
+	qed_chain_free(p_hwfn->cdev, &p_conn->r2tq);
+nomem_r2tq:
+	kfree(p_conn);
+
+	return -ENOMEM;
+}
+
+static int qed_nvmetcp_acquire_connection(struct qed_hwfn *p_hwfn,
+					  struct qed_nvmetcp_conn **p_out_conn)
+{
+	struct qed_nvmetcp_conn *p_conn = NULL;
+	int rc = 0;
+	u32 icid;
+
+	spin_lock_bh(&p_hwfn->p_nvmetcp_info->lock);
+	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_TCP_ULP, &icid);
+	spin_unlock_bh(&p_hwfn->p_nvmetcp_info->lock);
+
+	if (rc)
+		return rc;
+
+	rc = qed_nvmetcp_allocate_connection(p_hwfn, &p_conn);
+	if (rc) {
+		spin_lock_bh(&p_hwfn->p_nvmetcp_info->lock);
+		qed_cxt_release_cid(p_hwfn, icid);
+		spin_unlock_bh(&p_hwfn->p_nvmetcp_info->lock);
+
+		return rc;
+	}
+
+	p_conn->icid = icid;
+	p_conn->conn_id = (u16)icid;
+	p_conn->fw_cid = (p_hwfn->hw_info.opaque_fid << 16) | icid;
+	*p_out_conn = p_conn;
+
+	return rc;
+}
+
+static void qed_nvmetcp_release_connection(struct qed_hwfn *p_hwfn,
+					   struct qed_nvmetcp_conn *p_conn)
+{
+	spin_lock_bh(&p_hwfn->p_nvmetcp_info->lock);
+	list_add_tail(&p_conn->list_entry, &p_hwfn->p_nvmetcp_info->free_list);
+	qed_cxt_release_cid(p_hwfn, p_conn->icid);
+	spin_unlock_bh(&p_hwfn->p_nvmetcp_info->lock);
+}
+
+static void qed_nvmetcp_free_connection(struct qed_hwfn *p_hwfn,
+					struct qed_nvmetcp_conn *p_conn)
+{
+	qed_chain_free(p_hwfn->cdev, &p_conn->xhq);
+	qed_chain_free(p_hwfn->cdev, &p_conn->uhq);
+	qed_chain_free(p_hwfn->cdev, &p_conn->r2tq);
+	kfree(p_conn);
+}
+
+int qed_nvmetcp_alloc(struct qed_hwfn *p_hwfn)
+{
+	struct qed_nvmetcp_info *p_nvmetcp_info;
+
+	p_nvmetcp_info = kzalloc(sizeof(*p_nvmetcp_info), GFP_KERNEL);
+	if (!p_nvmetcp_info)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&p_nvmetcp_info->free_list);
+	p_hwfn->p_nvmetcp_info = p_nvmetcp_info;
+
+	return 0;
+}
+
+void qed_nvmetcp_setup(struct qed_hwfn *p_hwfn)
+{
+	spin_lock_init(&p_hwfn->p_nvmetcp_info->lock);
+}
+
+void qed_nvmetcp_free(struct qed_hwfn *p_hwfn)
+{
+	struct qed_nvmetcp_conn *p_conn = NULL;
+
+	if (!p_hwfn->p_nvmetcp_info)
+		return;
+
+	while (!list_empty(&p_hwfn->p_nvmetcp_info->free_list)) {
+		p_conn = list_first_entry(&p_hwfn->p_nvmetcp_info->free_list,
+					  struct qed_nvmetcp_conn, list_entry);
+		if (p_conn) {
+			list_del(&p_conn->list_entry);
+			qed_nvmetcp_free_connection(p_hwfn, p_conn);
+		}
+	}
+
+	kfree(p_hwfn->p_nvmetcp_info);
+	p_hwfn->p_nvmetcp_info = NULL;
+}
+
+static int qed_nvmetcp_acquire_conn(struct qed_dev *cdev,
+				    u32 *handle,
+				    u32 *fw_cid, void __iomem **p_doorbell)
+{
+	struct qed_hash_nvmetcp_con *hash_con;
+	int rc;
+
+	/* Allocate a hashed connection */
+	hash_con = kzalloc(sizeof(*hash_con), GFP_ATOMIC);
+	if (!hash_con)
+		return -ENOMEM;
+
+	/* Acquire the connection */
+	rc = qed_nvmetcp_acquire_connection(QED_AFFIN_HWFN(cdev),
+					    &hash_con->con);
+	if (rc) {
+		DP_NOTICE(cdev, "Failed to acquire Connection\n");
+		kfree(hash_con);
+
+		return rc;
+	}
+
+	/* Added the connection to hash table */
+	*handle = hash_con->con->icid;
+	*fw_cid = hash_con->con->fw_cid;
+	hash_add(cdev->connections, &hash_con->node, *handle);
+	if (p_doorbell)
+		*p_doorbell = qed_nvmetcp_get_db_addr(QED_AFFIN_HWFN(cdev),
+						      *handle);
+
+	return 0;
+}
+
+static int qed_nvmetcp_release_conn(struct qed_dev *cdev, u32 handle)
+{
+	struct qed_hash_nvmetcp_con *hash_con;
+
+	hash_con = qed_nvmetcp_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+
+		return -EINVAL;
+	}
+
+	hlist_del(&hash_con->node);
+	qed_nvmetcp_release_connection(QED_AFFIN_HWFN(cdev), hash_con->con);
+	kfree(hash_con);
+
+	return 0;
+}
+
+static int qed_nvmetcp_offload_conn(struct qed_dev *cdev, u32 handle,
+				    struct qed_nvmetcp_params_offload *conn_info)
+{
+	struct qed_hash_nvmetcp_con *hash_con;
+	struct qed_nvmetcp_conn *con;
+
+	hash_con = qed_nvmetcp_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+
+		return -EINVAL;
+	}
+
+	/* Update the connection with information from the params */
+	con = hash_con->con;
+
+	/* FW initializations */
+	con->layer_code = NVMETCP_SLOW_PATH_LAYER_CODE;
+	con->sq_pbl_addr = conn_info->sq_pbl_addr;
+	con->nvmetcp_cccid_max_range = conn_info->nvmetcp_cccid_max_range;
+	con->nvmetcp_cccid_itid_table_addr = conn_info->nvmetcp_cccid_itid_table_addr;
+	con->default_cq = conn_info->default_cq;
+	SET_FIELD(con->offl_flags, NVMETCP_CONN_OFFLOAD_PARAMS_TARGET_MODE, 0);
+	SET_FIELD(con->offl_flags, NVMETCP_CONN_OFFLOAD_PARAMS_NVMETCP_MODE, 1);
+	SET_FIELD(con->offl_flags, NVMETCP_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B, 1);
+
+	/* Networking and TCP stack initializations */
+	ether_addr_copy(con->local_mac, conn_info->src.mac);
+	ether_addr_copy(con->remote_mac, conn_info->dst.mac);
+	memcpy(con->local_ip, conn_info->src.ip, sizeof(con->local_ip));
+	memcpy(con->remote_ip, conn_info->dst.ip, sizeof(con->remote_ip));
+	con->local_port = conn_info->src.port;
+	con->remote_port = conn_info->dst.port;
+	con->vlan_id = conn_info->vlan_id;
+
+	if (conn_info->timestamp_en)
+		SET_FIELD(con->tcp_flags, TCP_OFFLOAD_PARAMS_OPT2_TS_EN, 1);
+
+	if (conn_info->delayed_ack_en)
+		SET_FIELD(con->tcp_flags, TCP_OFFLOAD_PARAMS_OPT2_DA_EN, 1);
+
+	if (conn_info->tcp_keep_alive_en)
+		SET_FIELD(con->tcp_flags, TCP_OFFLOAD_PARAMS_OPT2_KA_EN, 1);
+
+	if (conn_info->ecn_en)
+		SET_FIELD(con->tcp_flags, TCP_OFFLOAD_PARAMS_OPT2_ECN_EN, 1);
+
+	con->ip_version = conn_info->ip_version;
+	con->flow_label = QED_TCP_FLOW_LABEL;
+	con->ka_max_probe_cnt = conn_info->ka_max_probe_cnt;
+	con->ka_timeout = conn_info->ka_timeout;
+	con->ka_interval = conn_info->ka_interval;
+	con->max_rt_time = conn_info->max_rt_time;
+	con->ttl = conn_info->ttl;
+	con->tos_or_tc = conn_info->tos_or_tc;
+	con->mss = conn_info->mss;
+	con->cwnd = conn_info->cwnd;
+	con->rcv_wnd_scale = conn_info->rcv_wnd_scale;
+	con->connect_mode = 0;
+
+	return qed_sp_nvmetcp_conn_offload(QED_AFFIN_HWFN(cdev), con,
+					 QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_nvmetcp_update_conn(struct qed_dev *cdev,
+				   u32 handle,
+				   struct qed_nvmetcp_params_update *conn_info)
+{
+	struct qed_hash_nvmetcp_con *hash_con;
+	struct qed_nvmetcp_conn *con;
+
+	hash_con = qed_nvmetcp_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+
+		return -EINVAL;
+	}
+
+	/* Update the connection with information from the params */
+	con = hash_con->con;
+	SET_FIELD(con->update_flag,
+		  ISCSI_CONN_UPDATE_RAMROD_PARAMS_INITIAL_R2T, 0);
+	SET_FIELD(con->update_flag,
+		  ISCSI_CONN_UPDATE_RAMROD_PARAMS_IMMEDIATE_DATA, 1);
+	if (conn_info->hdr_digest_en)
+		SET_FIELD(con->update_flag, ISCSI_CONN_UPDATE_RAMROD_PARAMS_HD_EN, 1);
+
+	if (conn_info->data_digest_en)
+		SET_FIELD(con->update_flag, ISCSI_CONN_UPDATE_RAMROD_PARAMS_DD_EN, 1);
+
+	/* Placeholder - initialize pfv, cpda, hpda */
+
+	con->max_seq_size = conn_info->max_io_size;
+	con->max_recv_pdu_length = conn_info->max_recv_pdu_length;
+	con->max_send_pdu_length = conn_info->max_send_pdu_length;
+	con->first_seq_length = conn_info->max_io_size;
+
+	return qed_sp_nvmetcp_conn_update(QED_AFFIN_HWFN(cdev), con,
+					QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_nvmetcp_clear_conn_sq(struct qed_dev *cdev, u32 handle)
+{
+	struct qed_hash_nvmetcp_con *hash_con;
+
+	hash_con = qed_nvmetcp_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+
+		return -EINVAL;
+	}
+
+	return qed_sp_nvmetcp_conn_clear_sq(QED_AFFIN_HWFN(cdev), hash_con->con,
+					    QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_nvmetcp_destroy_conn(struct qed_dev *cdev,
+				    u32 handle, u8 abrt_conn)
+{
+	struct qed_hash_nvmetcp_con *hash_con;
+
+	hash_con = qed_nvmetcp_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+
+		return -EINVAL;
+	}
+
+	hash_con->con->abortive_dsconnect = abrt_conn;
+
+	return qed_sp_nvmetcp_conn_terminate(QED_AFFIN_HWFN(cdev), hash_con->con,
+					   QED_SPQ_MODE_EBLOCK, NULL);
+}
+
 static const struct qed_nvmetcp_ops qed_nvmetcp_ops_pass = {
 	.common = &qed_common_ops_pass,
 	.ll2 = &qed_ll2_ops_pass,
@@ -250,8 +799,12 @@ static const struct qed_nvmetcp_ops qed_nvmetcp_ops_pass = {
 	.register_ops = &qed_register_nvmetcp_ops,
 	.start = &qed_nvmetcp_start,
 	.stop = &qed_nvmetcp_stop,
-
-	/* Placeholder - Connection level ops */
+	.acquire_conn = &qed_nvmetcp_acquire_conn,
+	.release_conn = &qed_nvmetcp_release_conn,
+	.offload_conn = &qed_nvmetcp_offload_conn,
+	.update_conn = &qed_nvmetcp_update_conn,
+	.destroy_conn = &qed_nvmetcp_destroy_conn,
+	.clear_sq = &qed_nvmetcp_clear_conn_sq,
 };
 
 const struct qed_nvmetcp_ops *qed_get_nvmetcp_ops(void)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h
index 774b46ade408..e5e9d075bf4f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.h
@@ -19,6 +19,7 @@
 #define QED_NVMETCP_FW_CQ_SIZE (4 * 1024)
 
 /* tcp parameters */
+#define QED_TCP_FLOW_LABEL 0
 #define QED_TCP_TWO_MSL_TIMER 4000
 #define QED_TCP_HALF_WAY_CLOSE_TIMEOUT 10
 #define QED_TCP_MAX_FIN_RT 2
@@ -32,6 +33,57 @@ struct qed_nvmetcp_info {
 	nvmetcp_event_cb_t event_cb;
 };
 
+struct qed_hash_nvmetcp_con {
+	struct hlist_node node;
+	struct qed_nvmetcp_conn *con;
+};
+
+struct qed_nvmetcp_conn {
+	struct list_head list_entry;
+	bool free_on_delete;
+	u16 conn_id;
+	u32 icid;
+	u32 fw_cid;
+	u8 layer_code;
+	u8 offl_flags;
+	u8 connect_mode;
+	dma_addr_t sq_pbl_addr;
+	struct qed_chain r2tq;
+	struct qed_chain xhq;
+	struct qed_chain uhq;
+	u8 local_mac[6];
+	u8 remote_mac[6];
+	u8 ip_version;
+	u8 ka_max_probe_cnt;
+	u16 vlan_id;
+	u16 tcp_flags;
+	u32 remote_ip[4];
+	u32 local_ip[4];
+	u32 flow_label;
+	u32 ka_timeout;
+	u32 ka_interval;
+	u32 max_rt_time;
+	u8 ttl;
+	u8 tos_or_tc;
+	u16 remote_port;
+	u16 local_port;
+	u16 mss;
+	u8 rcv_wnd_scale;
+	u32 rcv_wnd;
+	u32 cwnd;
+	u8 update_flag;
+	u8 default_cq;
+	u8 abortive_dsconnect;
+	u32 max_seq_size;
+	u32 max_recv_pdu_length;
+	u32 max_send_pdu_length;
+	u32 first_seq_length;
+	u16 physical_q0;
+	u16 physical_q1;
+	u16 nvmetcp_cccid_max_range;
+	dma_addr_t nvmetcp_cccid_itid_table_addr;
+};
+
 #if IS_ENABLED(CONFIG_QED_NVMETCP)
 int qed_nvmetcp_alloc(struct qed_hwfn *p_hwfn);
 void qed_nvmetcp_setup(struct qed_hwfn *p_hwfn);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 525159e747a5..60ff3222bf55 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -101,6 +101,9 @@ union ramrod_data {
 	struct iscsi_spe_conn_termination iscsi_conn_terminate;
 
 	struct nvmetcp_init_ramrod_params nvmetcp_init;
+	struct nvmetcp_spe_conn_offload nvmetcp_conn_offload;
+	struct nvmetcp_conn_update_ramrod_params nvmetcp_conn_update;
+	struct nvmetcp_spe_conn_termination nvmetcp_conn_terminate;
 
 	struct vf_start_ramrod_data vf_start;
 	struct vf_stop_ramrod_data vf_stop;
diff --git a/include/linux/qed/nvmetcp_common.h b/include/linux/qed/nvmetcp_common.h
index e9ccfc07041d..c8836b71b866 100644
--- a/include/linux/qed/nvmetcp_common.h
+++ b/include/linux/qed/nvmetcp_common.h
@@ -6,6 +6,8 @@
 
 #include "tcp_common.h"
 
+#define NVMETCP_SLOW_PATH_LAYER_CODE (6)
+
 /* NVMeTCP firmware function init parameters */
 struct nvmetcp_spe_func_init {
 	__le16 half_way_close_timeout;
@@ -43,6 +45,10 @@ enum nvmetcp_ramrod_cmd_id {
 	NVMETCP_RAMROD_CMD_ID_UNUSED = 0,
 	NVMETCP_RAMROD_CMD_ID_INIT_FUNC = 1,
 	NVMETCP_RAMROD_CMD_ID_DESTROY_FUNC = 2,
+	NVMETCP_RAMROD_CMD_ID_OFFLOAD_CONN = 3,
+	NVMETCP_RAMROD_CMD_ID_UPDATE_CONN = 4,
+	NVMETCP_RAMROD_CMD_ID_TERMINATION_CONN = 5,
+	NVMETCP_RAMROD_CMD_ID_CLEAR_SQ = 6,
 	MAX_NVMETCP_RAMROD_CMD_ID
 };
 
@@ -51,4 +57,141 @@ struct nvmetcp_glbl_queue_entry {
 	struct regpair reserved;
 };
 
+/* NVMeTCP conn level EQEs */
+enum nvmetcp_eqe_opcode {
+	NVMETCP_EVENT_TYPE_INIT_FUNC = 0, /* Response after init Ramrod */
+	NVMETCP_EVENT_TYPE_DESTROY_FUNC, /* Response after destroy Ramrod */
+	NVMETCP_EVENT_TYPE_OFFLOAD_CONN,/* Response after option 2 offload Ramrod */
+	NVMETCP_EVENT_TYPE_UPDATE_CONN, /* Response after update Ramrod */
+	NVMETCP_EVENT_TYPE_CLEAR_SQ, /* Response after clear sq Ramrod */
+	NVMETCP_EVENT_TYPE_TERMINATE_CONN, /* Response after termination Ramrod */
+	NVMETCP_EVENT_TYPE_RESERVED0,
+	NVMETCP_EVENT_TYPE_RESERVED1,
+	NVMETCP_EVENT_TYPE_ASYN_CONNECT_COMPLETE, /* Connect completed (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_ASYN_TERMINATE_DONE, /* Termination completed (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_START_OF_ERROR_TYPES = 10, /* Separate EQs from err EQs */
+	NVMETCP_EVENT_TYPE_ASYN_ABORT_RCVD, /* TCP RST packet receive (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_ASYN_CLOSE_RCVD, /* TCP FIN packet receive (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_ASYN_SYN_RCVD, /* TCP SYN+ACK packet receive (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_ASYN_MAX_RT_TIME, /* TCP max retransmit time (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_ASYN_MAX_RT_CNT, /* TCP max retransmit count (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_ASYN_MAX_KA_PROBES_CNT, /* TCP ka probes count (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_ASYN_FIN_WAIT2, /* TCP fin wait 2 (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_NVMETCP_CONN_ERROR, /* NVMeTCP error response (A-syn EQE) */
+	NVMETCP_EVENT_TYPE_TCP_CONN_ERROR, /* NVMeTCP error - tcp error (A-syn EQE) */
+	MAX_NVMETCP_EQE_OPCODE
+};
+
+struct nvmetcp_conn_offload_section {
+	struct regpair cccid_itid_table_addr; /* CCCID to iTID table address */
+	__le16 cccid_max_range; /* CCCID max value - used for validation */
+	__le16 reserved[3];
+};
+
+/* NVMe TCP connection offload params passed by driver to FW in NVMeTCP offload ramrod */
+struct nvmetcp_conn_offload_params {
+	struct regpair sq_pbl_addr;
+	struct regpair r2tq_pbl_addr;
+	struct regpair xhq_pbl_addr;
+	struct regpair uhq_pbl_addr;
+	__le16 physical_q0;
+	__le16 physical_q1;
+	u8 flags;
+#define NVMETCP_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B_MASK 0x1
+#define NVMETCP_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B_SHIFT 0
+#define NVMETCP_CONN_OFFLOAD_PARAMS_TARGET_MODE_MASK 0x1
+#define NVMETCP_CONN_OFFLOAD_PARAMS_TARGET_MODE_SHIFT 1
+#define NVMETCP_CONN_OFFLOAD_PARAMS_RESTRICTED_MODE_MASK 0x1
+#define NVMETCP_CONN_OFFLOAD_PARAMS_RESTRICTED_MODE_SHIFT 2
+#define NVMETCP_CONN_OFFLOAD_PARAMS_NVMETCP_MODE_MASK 0x1
+#define NVMETCP_CONN_OFFLOAD_PARAMS_NVMETCP_MODE_SHIFT 3
+#define NVMETCP_CONN_OFFLOAD_PARAMS_RESERVED1_MASK 0xF
+#define NVMETCP_CONN_OFFLOAD_PARAMS_RESERVED1_SHIFT 4
+	u8 default_cq;
+	__le16 reserved0;
+	__le32 reserved1;
+	__le32 initial_ack;
+
+	struct nvmetcp_conn_offload_section nvmetcp; /* NVMe/TCP section */
+};
+
+/* NVMe TCP and TCP connection offload params passed by driver to FW in NVMeTCP offload ramrod. */
+struct nvmetcp_spe_conn_offload {
+	__le16 reserved;
+	__le16 conn_id;
+	__le32 fw_cid;
+	struct nvmetcp_conn_offload_params nvmetcp;
+	struct tcp_offload_params_opt2 tcp;
+};
+
+/* NVMeTCP connection update params passed by driver to FW in NVMETCP update ramrod. */
+struct nvmetcp_conn_update_ramrod_params {
+	__le16 reserved0;
+	__le16 conn_id;
+	__le32 reserved1;
+	u8 flags;
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_HD_EN_MASK 0x1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_HD_EN_SHIFT 0
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_DD_EN_MASK 0x1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_DD_EN_SHIFT 1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED0_MASK 0x1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED0_SHIFT 2
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_MASK 0x1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_DATA_SHIFT 3
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED2_MASK 0x1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED2_SHIFT 4
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED3_MASK 0x1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED3_SHIFT 5
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED4_MASK 0x1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED4_SHIFT 6
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED5_MASK 0x1
+#define NVMETCP_CONN_UPDATE_RAMROD_PARAMS_RESERVED5_SHIFT 7
+	u8 reserved3[3];
+	__le32 max_seq_size;
+	__le32 max_send_pdu_length;
+	__le32 max_recv_pdu_length;
+	__le32 first_seq_length;
+	__le32 reserved4[5];
+};
+
+/* NVMeTCP connection termination request */
+struct nvmetcp_spe_conn_termination {
+	__le16 reserved0;
+	__le16 conn_id;
+	__le32 reserved1;
+	u8 abortive;
+	u8 reserved2[7];
+	struct regpair reserved3;
+	struct regpair reserved4;
+};
+
+struct nvmetcp_dif_flags {
+	u8 flags;
+};
+
+enum nvmetcp_wqe_type {
+	NVMETCP_WQE_TYPE_NORMAL,
+	NVMETCP_WQE_TYPE_TASK_CLEANUP,
+	NVMETCP_WQE_TYPE_MIDDLE_PATH,
+	NVMETCP_WQE_TYPE_IC,
+	MAX_NVMETCP_WQE_TYPE
+};
+
+struct nvmetcp_wqe {
+	__le16 task_id;
+	u8 flags;
+#define NVMETCP_WQE_WQE_TYPE_MASK 0x7 /* [use nvmetcp_wqe_type] */
+#define NVMETCP_WQE_WQE_TYPE_SHIFT 0
+#define NVMETCP_WQE_NUM_SGES_MASK 0xF
+#define NVMETCP_WQE_NUM_SGES_SHIFT 3
+#define NVMETCP_WQE_RESPONSE_MASK 0x1
+#define NVMETCP_WQE_RESPONSE_SHIFT 7
+	struct nvmetcp_dif_flags prot_flags;
+	__le32 contlen_cdbsize;
+#define NVMETCP_WQE_CONT_LEN_MASK 0xFFFFFF
+#define NVMETCP_WQE_CONT_LEN_SHIFT 0
+#define NVMETCP_WQE_CDB_SIZE_OR_NVMETCP_CMD_MASK 0xFF
+#define NVMETCP_WQE_CDB_SIZE_OR_NVMETCP_CMD_SHIFT 24
+};
+
 #endif /* __NVMETCP_COMMON__ */
diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h
index 76868bdf0883..5baf1c5ce798 100644
--- a/include/linux/qed/qed_nvmetcp_if.h
+++ b/include/linux/qed/qed_nvmetcp_if.h
@@ -24,6 +24,50 @@ struct qed_nvmetcp_tid {
 	u8 *blocks[MAX_TID_BLOCKS_NVMETCP];
 };
 
+struct qed_nvmetcp_id_params {
+	u8 mac[ETH_ALEN];
+	u32 ip[4];
+	u16 port;
+};
+
+struct qed_nvmetcp_params_offload {
+	/* FW initializations */
+	dma_addr_t sq_pbl_addr;
+	dma_addr_t nvmetcp_cccid_itid_table_addr;
+	u16 nvmetcp_cccid_max_range;
+	u8 default_cq;
+
+	/* Networking and TCP stack initializations */
+	struct qed_nvmetcp_id_params src;
+	struct qed_nvmetcp_id_params dst;
+	u32 ka_timeout;
+	u32 ka_interval;
+	u32 max_rt_time;
+	u32 cwnd;
+	u16 mss;
+	u16 vlan_id;
+	bool timestamp_en;
+	bool delayed_ack_en;
+	bool tcp_keep_alive_en;
+	bool ecn_en;
+	u8 ip_version;
+	u8 ka_max_probe_cnt;
+	u8 ttl;
+	u8 tos_or_tc;
+	u8 rcv_wnd_scale;
+};
+
+struct qed_nvmetcp_params_update {
+	u32 max_io_size;
+	u32 max_recv_pdu_length;
+	u32 max_send_pdu_length;
+
+	/* Placeholder: pfv, cpda, hpda */
+
+	bool hdr_digest_en;
+	bool data_digest_en;
+};
+
 struct qed_nvmetcp_cb_ops {
 	struct qed_common_cb_ops common;
 };
@@ -47,6 +91,38 @@ struct qed_nvmetcp_cb_ops {
  * @stop:		nvmetcp in FW
  *			@param cdev
  *			return 0 on success, otherwise error value.
+ * @acquire_conn:	acquire a new nvmetcp connection
+ *			@param cdev
+ *			@param handle - qed will fill handle that should be
+ *				used henceforth as identifier of the
+ *				connection.
+ *			@param p_doorbell - qed will fill the address of the
+ *				doorbell.
+ *			@return 0 on sucesss, otherwise error value.
+ * @release_conn:	release a previously acquired nvmetcp connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
+ * @offload_conn:	configures an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@param conn_info - the configuration to use for the
+ *				offload.
+ *			@return 0 on success, otherwise error value.
+ * @update_conn:	updates an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@param conn_info - the configuration to use for the
+ *				offload.
+ *			@return 0 on success, otherwise error value.
+ * @destroy_conn:	stops an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
+ * @clear_sq:		clear all task in sq
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
  */
 struct qed_nvmetcp_ops {
 	const struct qed_common_ops *common;
@@ -64,6 +140,24 @@ struct qed_nvmetcp_ops {
 		     void *event_context, nvmetcp_event_cb_t async_event_cb);
 
 	int (*stop)(struct qed_dev *cdev);
+
+	int (*acquire_conn)(struct qed_dev *cdev,
+			    u32 *handle,
+			    u32 *fw_cid, void __iomem **p_doorbell);
+
+	int (*release_conn)(struct qed_dev *cdev, u32 handle);
+
+	int (*offload_conn)(struct qed_dev *cdev,
+			    u32 handle,
+			    struct qed_nvmetcp_params_offload *conn_info);
+
+	int (*update_conn)(struct qed_dev *cdev,
+			   u32 handle,
+			   struct qed_nvmetcp_params_update *conn_info);
+
+	int (*destroy_conn)(struct qed_dev *cdev, u32 handle, u8 abrt_conn);
+
+	int (*clear_sq)(struct qed_dev *cdev, u32 handle);
 };
 
 const struct qed_nvmetcp_ops *qed_get_nvmetcp_ops(void);
-- 
cgit v1.2.3


From 203d136e8958a7c65834601f669bdd0fcaa6fcbd Mon Sep 17 00:00:00 2001
From: Prabhakar Kushwaha <pkushwaha@marvell.com>
Date: Wed, 2 Jun 2021 20:16:52 +0300
Subject: qed: Add support of HW filter block

This patch introduces the functionality of HW filter block.
It adds and removes filters based on source and target TCP port.

It also add functionality to clear all filters at once.

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h         |  8 +++
 drivers/net/ethernet/qlogic/qed/qed_dev.c     | 90 +++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c |  5 ++
 include/linux/qed/qed_nvmetcp_if.h            | 24 +++++++
 4 files changed, 127 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index bc9bdb9d1bb9..b590c70539b5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -49,6 +49,8 @@ extern const struct qed_common_ops qed_common_ops_pass;
 #define QED_MIN_WIDS		(4)
 #define QED_PF_DEMS_SIZE        (4)
 
+#define QED_LLH_DONT_CARE 0
+
 /* cau states */
 enum qed_coalescing_mode {
 	QED_COAL_MODE_DISABLE,
@@ -1005,4 +1007,10 @@ int qed_mfw_fill_tlv_data(struct qed_hwfn *hwfn,
 void qed_hw_info_set_offload_tc(struct qed_hw_info *p_info, u8 tc);
 
 void qed_periodic_db_rec_start(struct qed_hwfn *p_hwfn);
+
+int qed_llh_add_src_tcp_port_filter(struct qed_dev *cdev, u16 src_port);
+int qed_llh_add_dst_tcp_port_filter(struct qed_dev *cdev, u16 dest_port);
+void qed_llh_remove_src_tcp_port_filter(struct qed_dev *cdev, u16 src_port);
+void qed_llh_remove_dst_tcp_port_filter(struct qed_dev *cdev, u16 src_port);
+void qed_llh_clear_all_filters(struct qed_dev *cdev);
 #endif /* _QED_H */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 932b892f1ef1..0410c3604abd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -5362,3 +5362,93 @@ void qed_set_fw_mac_addr(__le16 *fw_msb,
 	((u8 *)fw_lsb)[0] = mac[5];
 	((u8 *)fw_lsb)[1] = mac[4];
 }
+
+static int qed_llh_shadow_remove_all_filters(struct qed_dev *cdev, u8 ppfid)
+{
+	struct qed_llh_info *p_llh_info = cdev->p_llh_info;
+	struct qed_llh_filter_info *p_filters;
+	int rc;
+
+	rc = qed_llh_shadow_sanity(cdev, ppfid, 0, "remove_all");
+	if (rc)
+		return rc;
+
+	p_filters = p_llh_info->pp_filters[ppfid];
+	memset(p_filters, 0, NIG_REG_LLH_FUNC_FILTER_EN_SIZE *
+	       sizeof(*p_filters));
+
+	return 0;
+}
+
+static void qed_llh_clear_ppfid_filters(struct qed_dev *cdev, u8 ppfid)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn);
+	u8 filter_idx, abs_ppfid;
+	int rc = 0;
+
+	if (!p_ptt)
+		return;
+
+	if (!test_bit(QED_MF_LLH_PROTO_CLSS, &cdev->mf_bits) &&
+	    !test_bit(QED_MF_LLH_MAC_CLSS, &cdev->mf_bits))
+		goto out;
+
+	rc = qed_llh_abs_ppfid(cdev, ppfid, &abs_ppfid);
+	if (rc)
+		goto out;
+
+	rc = qed_llh_shadow_remove_all_filters(cdev, ppfid);
+	if (rc)
+		goto out;
+
+	for (filter_idx = 0; filter_idx < NIG_REG_LLH_FUNC_FILTER_EN_SIZE;
+	     filter_idx++) {
+		rc = qed_llh_remove_filter(p_hwfn, p_ptt,
+					   abs_ppfid, filter_idx);
+		if (rc)
+			goto out;
+	}
+out:
+	qed_ptt_release(p_hwfn, p_ptt);
+}
+
+int qed_llh_add_src_tcp_port_filter(struct qed_dev *cdev, u16 src_port)
+{
+	return qed_llh_add_protocol_filter(cdev, 0,
+					   QED_LLH_FILTER_TCP_SRC_PORT,
+					   src_port, QED_LLH_DONT_CARE);
+}
+
+void qed_llh_remove_src_tcp_port_filter(struct qed_dev *cdev, u16 src_port)
+{
+	qed_llh_remove_protocol_filter(cdev, 0,
+				       QED_LLH_FILTER_TCP_SRC_PORT,
+				       src_port, QED_LLH_DONT_CARE);
+}
+
+int qed_llh_add_dst_tcp_port_filter(struct qed_dev *cdev, u16 dest_port)
+{
+	return qed_llh_add_protocol_filter(cdev, 0,
+					   QED_LLH_FILTER_TCP_DEST_PORT,
+					   QED_LLH_DONT_CARE, dest_port);
+}
+
+void qed_llh_remove_dst_tcp_port_filter(struct qed_dev *cdev, u16 dest_port)
+{
+	qed_llh_remove_protocol_filter(cdev, 0,
+				       QED_LLH_FILTER_TCP_DEST_PORT,
+				       QED_LLH_DONT_CARE, dest_port);
+}
+
+void qed_llh_clear_all_filters(struct qed_dev *cdev)
+{
+	u8 ppfid;
+
+	if (!test_bit(QED_MF_LLH_PROTO_CLSS, &cdev->mf_bits) &&
+	    !test_bit(QED_MF_LLH_MAC_CLSS, &cdev->mf_bits))
+		return;
+
+	for (ppfid = 0; ppfid < cdev->p_llh_info->num_ppfid; ppfid++)
+		qed_llh_clear_ppfid_filters(cdev, ppfid);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
index 7943804e88cd..d4d609a4d3a3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
@@ -805,6 +805,11 @@ static const struct qed_nvmetcp_ops qed_nvmetcp_ops_pass = {
 	.update_conn = &qed_nvmetcp_update_conn,
 	.destroy_conn = &qed_nvmetcp_destroy_conn,
 	.clear_sq = &qed_nvmetcp_clear_conn_sq,
+	.add_src_tcp_port_filter = &qed_llh_add_src_tcp_port_filter,
+	.remove_src_tcp_port_filter = &qed_llh_remove_src_tcp_port_filter,
+	.add_dst_tcp_port_filter = &qed_llh_add_dst_tcp_port_filter,
+	.remove_dst_tcp_port_filter = &qed_llh_remove_dst_tcp_port_filter,
+	.clear_all_filters = &qed_llh_clear_all_filters
 };
 
 const struct qed_nvmetcp_ops *qed_get_nvmetcp_ops(void)
diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h
index 5baf1c5ce798..5180edad24e5 100644
--- a/include/linux/qed/qed_nvmetcp_if.h
+++ b/include/linux/qed/qed_nvmetcp_if.h
@@ -123,6 +123,20 @@ struct qed_nvmetcp_cb_ops {
  *			@param cdev
  *			@param handle - the connection handle.
  *			@return 0 on success, otherwise error value.
+ * @add_src_tcp_port_filter: Add source tcp port filter
+ *			@param cdev
+ *			@param src_port
+ * @remove_src_tcp_port_filter: Remove source tcp port filter
+ *			@param cdev
+ *			@param src_port
+ * @add_dst_tcp_port_filter: Add destination tcp port filter
+ *			@param cdev
+ *			@param dest_port
+ * @remove_dst_tcp_port_filter: Remove destination tcp port filter
+ *			@param cdev
+ *			@param dest_port
+ * @clear_all_filters: Clear all filters.
+ *			@param cdev
  */
 struct qed_nvmetcp_ops {
 	const struct qed_common_ops *common;
@@ -158,6 +172,16 @@ struct qed_nvmetcp_ops {
 	int (*destroy_conn)(struct qed_dev *cdev, u32 handle, u8 abrt_conn);
 
 	int (*clear_sq)(struct qed_dev *cdev, u32 handle);
+
+	int (*add_src_tcp_port_filter)(struct qed_dev *cdev, u16 src_port);
+
+	void (*remove_src_tcp_port_filter)(struct qed_dev *cdev, u16 src_port);
+
+	int (*add_dst_tcp_port_filter)(struct qed_dev *cdev, u16 dest_port);
+
+	void (*remove_dst_tcp_port_filter)(struct qed_dev *cdev, u16 dest_port);
+
+	void (*clear_all_filters)(struct qed_dev *cdev);
 };
 
 const struct qed_nvmetcp_ops *qed_get_nvmetcp_ops(void);
-- 
cgit v1.2.3


From ab47bdfd2e2e9670172a737d12ebfc94bf9d299d Mon Sep 17 00:00:00 2001
From: Shai Malin <smalin@marvell.com>
Date: Wed, 2 Jun 2021 20:16:53 +0300
Subject: qed: Add NVMeTCP Offload IO Level FW and HW HSI

This patch introduces the NVMeTCP Offload FW and HW  HSI in order
to initialize the IO level configuration into a per IO HW
resource ("task") as part of the IO path flow.

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/nvmetcp_common.h | 335 ++++++++++++++++++++++++++++++++++++-
 include/linux/qed/qed_nvmetcp_if.h |  31 ++++
 2 files changed, 365 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/nvmetcp_common.h b/include/linux/qed/nvmetcp_common.h
index c8836b71b866..ad745a9c2264 100644
--- a/include/linux/qed/nvmetcp_common.h
+++ b/include/linux/qed/nvmetcp_common.h
@@ -7,6 +7,7 @@
 #include "tcp_common.h"
 
 #define NVMETCP_SLOW_PATH_LAYER_CODE (6)
+#define NVMETCP_WQE_NUM_SGES_SLOWIO (0xf)
 
 /* NVMeTCP firmware function init parameters */
 struct nvmetcp_spe_func_init {
@@ -194,4 +195,336 @@ struct nvmetcp_wqe {
 #define NVMETCP_WQE_CDB_SIZE_OR_NVMETCP_CMD_SHIFT 24
 };
 
-#endif /* __NVMETCP_COMMON__ */
+struct nvmetcp_host_cccid_itid_entry {
+	__le16 itid;
+};
+
+struct nvmetcp_connect_done_results {
+	__le16 icid;
+	__le16 conn_id;
+	struct tcp_ulp_connect_done_params params;
+};
+
+struct nvmetcp_eqe_data {
+	__le16 icid;
+	__le16 conn_id;
+	__le16 reserved;
+	u8 error_code;
+	u8 error_pdu_opcode_reserved;
+#define NVMETCP_EQE_DATA_ERROR_PDU_OPCODE_MASK 0x3F
+#define NVMETCP_EQE_DATA_ERROR_PDU_OPCODE_SHIFT  0
+#define NVMETCP_EQE_DATA_ERROR_PDU_OPCODE_VALID_MASK  0x1
+#define NVMETCP_EQE_DATA_ERROR_PDU_OPCODE_VALID_SHIFT  6
+#define NVMETCP_EQE_DATA_RESERVED0_MASK 0x1
+#define NVMETCP_EQE_DATA_RESERVED0_SHIFT 7
+};
+
+enum nvmetcp_task_type {
+	NVMETCP_TASK_TYPE_HOST_WRITE,
+	NVMETCP_TASK_TYPE_HOST_READ,
+	NVMETCP_TASK_TYPE_INIT_CONN_REQUEST,
+	NVMETCP_TASK_TYPE_RESERVED0,
+	NVMETCP_TASK_TYPE_CLEANUP,
+	NVMETCP_TASK_TYPE_HOST_READ_NO_CQE,
+	MAX_NVMETCP_TASK_TYPE
+};
+
+struct nvmetcp_db_data {
+	u8 params;
+#define NVMETCP_DB_DATA_DEST_MASK 0x3 /* destination of doorbell (use enum db_dest) */
+#define NVMETCP_DB_DATA_DEST_SHIFT 0
+#define NVMETCP_DB_DATA_AGG_CMD_MASK 0x3 /* aggregative command to CM (use enum db_agg_cmd_sel) */
+#define NVMETCP_DB_DATA_AGG_CMD_SHIFT 2
+#define NVMETCP_DB_DATA_BYPASS_EN_MASK 0x1 /* enable QM bypass */
+#define NVMETCP_DB_DATA_BYPASS_EN_SHIFT 4
+#define NVMETCP_DB_DATA_RESERVED_MASK 0x1
+#define NVMETCP_DB_DATA_RESERVED_SHIFT 5
+#define NVMETCP_DB_DATA_AGG_VAL_SEL_MASK 0x3 /* aggregative value selection */
+#define NVMETCP_DB_DATA_AGG_VAL_SEL_SHIFT 6
+	u8 agg_flags; /* bit for every DQ counter flags in CM context that DQ can increment */
+	__le16 sq_prod;
+};
+
+struct nvmetcp_fw_nvmf_cqe {
+	__le32 reserved[4];
+};
+
+struct nvmetcp_icresp_mdata {
+	u8  digest;
+	u8  cpda;
+	__le16  pfv;
+	__le32 maxdata;
+	__le16 rsvd[4];
+};
+
+union nvmetcp_fw_cqe_data {
+	struct nvmetcp_fw_nvmf_cqe nvme_cqe;
+	struct nvmetcp_icresp_mdata icresp_mdata;
+};
+
+struct nvmetcp_fw_cqe {
+	__le16 conn_id;
+	u8 cqe_type;
+	u8 cqe_error_status_bits;
+#define CQE_ERROR_BITMAP_DIF_ERR_BITS_MASK 0x7
+#define CQE_ERROR_BITMAP_DIF_ERR_BITS_SHIFT 0
+#define CQE_ERROR_BITMAP_DATA_DIGEST_ERR_MASK 0x1
+#define CQE_ERROR_BITMAP_DATA_DIGEST_ERR_SHIFT 3
+#define CQE_ERROR_BITMAP_RCV_ON_INVALID_CONN_MASK 0x1
+#define CQE_ERROR_BITMAP_RCV_ON_INVALID_CONN_SHIFT 4
+	__le16 itid;
+	u8 task_type;
+	u8 fw_dbg_field;
+	u8 caused_conn_err;
+	u8 reserved0[3];
+	__le32 reserved1;
+	union nvmetcp_fw_cqe_data cqe_data;
+	struct regpair task_opaque;
+	__le32 reserved[6];
+};
+
+enum nvmetcp_fw_cqes_type {
+	NVMETCP_FW_CQE_TYPE_NORMAL = 1,
+	NVMETCP_FW_CQE_TYPE_RESERVED0,
+	NVMETCP_FW_CQE_TYPE_RESERVED1,
+	NVMETCP_FW_CQE_TYPE_CLEANUP,
+	NVMETCP_FW_CQE_TYPE_DUMMY,
+	MAX_NVMETCP_FW_CQES_TYPE
+};
+
+struct ystorm_nvmetcp_task_state {
+	struct scsi_cached_sges data_desc;
+	struct scsi_sgl_params sgl_params;
+	__le32 resrved0;
+	__le32 buffer_offset;
+	__le16 cccid;
+	struct nvmetcp_dif_flags dif_flags;
+	u8 flags;
+#define YSTORM_NVMETCP_TASK_STATE_LOCAL_COMP_MASK 0x1
+#define YSTORM_NVMETCP_TASK_STATE_LOCAL_COMP_SHIFT 0
+#define YSTORM_NVMETCP_TASK_STATE_SLOW_IO_MASK 0x1
+#define YSTORM_NVMETCP_TASK_STATE_SLOW_IO_SHIFT 1
+#define YSTORM_NVMETCP_TASK_STATE_SET_DIF_OFFSET_MASK 0x1
+#define YSTORM_NVMETCP_TASK_STATE_SET_DIF_OFFSET_SHIFT 2
+#define YSTORM_NVMETCP_TASK_STATE_SEND_W_RSP_MASK 0x1
+#define YSTORM_NVMETCP_TASK_STATE_SEND_W_RSP_SHIFT 3
+};
+
+struct ystorm_nvmetcp_task_rxmit_opt {
+	__le32 reserved[4];
+};
+
+struct nvmetcp_task_hdr {
+	__le32 reg[18];
+};
+
+struct nvmetcp_task_hdr_aligned {
+	struct nvmetcp_task_hdr task_hdr;
+	__le32 reserved[2];	/* HSI_COMMENT: Align to QREG */
+};
+
+struct e5_tdif_task_context {
+	__le32 reserved[16];
+};
+
+struct e5_rdif_task_context {
+	__le32 reserved[12];
+};
+
+struct ystorm_nvmetcp_task_st_ctx {
+	struct ystorm_nvmetcp_task_state state;
+	struct ystorm_nvmetcp_task_rxmit_opt rxmit_opt;
+	struct nvmetcp_task_hdr_aligned pdu_hdr;
+};
+
+struct mstorm_nvmetcp_task_st_ctx {
+	struct scsi_cached_sges data_desc;
+	struct scsi_sgl_params sgl_params;
+	__le32 rem_task_size;
+	__le32 data_buffer_offset;
+	u8 task_type;
+	struct nvmetcp_dif_flags dif_flags;
+	__le16 dif_task_icid;
+	struct regpair reserved0;
+	__le32 expected_itt;
+	__le32 reserved1;
+};
+
+struct ustorm_nvmetcp_task_st_ctx {
+	__le32 rem_rcv_len;
+	__le32 exp_data_transfer_len;
+	__le32 exp_data_sn;
+	struct regpair reserved0;
+	__le32 reg1_map;
+#define REG1_NUM_SGES_MASK 0xF
+#define REG1_NUM_SGES_SHIFT 0
+#define REG1_RESERVED1_MASK 0xFFFFFFF
+#define REG1_RESERVED1_SHIFT 4
+	u8 flags2;
+#define USTORM_NVMETCP_TASK_ST_CTX_AHS_EXIST_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_AHS_EXIST_SHIFT 0
+#define USTORM_NVMETCP_TASK_ST_CTX_RESERVED1_MASK 0x7F
+#define USTORM_NVMETCP_TASK_ST_CTX_RESERVED1_SHIFT 1
+	struct nvmetcp_dif_flags dif_flags;
+	__le16 reserved3;
+	__le16 tqe_opaque[2];
+	__le32 reserved5;
+	__le32 nvme_tcp_opaque_lo;
+	__le32 nvme_tcp_opaque_hi;
+	u8 task_type;
+	u8 error_flags;
+#define USTORM_NVMETCP_TASK_ST_CTX_DATA_DIGEST_ERROR_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_DATA_DIGEST_ERROR_SHIFT 0
+#define USTORM_NVMETCP_TASK_ST_CTX_DATA_TRUNCATED_ERROR_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_DATA_TRUNCATED_ERROR_SHIFT 1
+#define USTORM_NVMETCP_TASK_ST_CTX_UNDER_RUN_ERROR_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_UNDER_RUN_ERROR_SHIFT 2
+#define USTORM_NVMETCP_TASK_ST_CTX_NVME_TCP_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_NVME_TCP_SHIFT 3
+	u8 flags;
+#define USTORM_NVMETCP_TASK_ST_CTX_CQE_WRITE_MASK 0x3
+#define USTORM_NVMETCP_TASK_ST_CTX_CQE_WRITE_SHIFT 0
+#define USTORM_NVMETCP_TASK_ST_CTX_LOCAL_COMP_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_LOCAL_COMP_SHIFT 2
+#define USTORM_NVMETCP_TASK_ST_CTX_Q0_R2TQE_WRITE_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_Q0_R2TQE_WRITE_SHIFT 3
+#define USTORM_NVMETCP_TASK_ST_CTX_TOTAL_DATA_ACKED_DONE_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_TOTAL_DATA_ACKED_DONE_SHIFT 4
+#define USTORM_NVMETCP_TASK_ST_CTX_HQ_SCANNED_DONE_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_HQ_SCANNED_DONE_SHIFT 5
+#define USTORM_NVMETCP_TASK_ST_CTX_R2T2RECV_DONE_MASK 0x1
+#define USTORM_NVMETCP_TASK_ST_CTX_R2T2RECV_DONE_SHIFT 6
+	u8 cq_rss_number;
+};
+
+struct e5_ystorm_nvmetcp_task_ag_ctx {
+	u8 reserved /* cdu_validation */;
+	u8 byte1 /* state_and_core_id */;
+	__le16 word0 /* icid */;
+	u8 flags0;
+	u8 flags1;
+	u8 flags2;
+	u8 flags3;
+	__le32 TTT;
+	u8 byte2;
+	u8 byte3;
+	u8 byte4;
+	u8 e4_reserved7;
+};
+
+struct e5_mstorm_nvmetcp_task_ag_ctx {
+	u8 cdu_validation;
+	u8 byte1;
+	__le16 task_cid;
+	u8 flags0;
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK 0x1
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT 5
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_VALID_MASK 0x1
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_VALID_SHIFT 6
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_TASK_CLEANUP_FLAG_MASK 0x1
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_TASK_CLEANUP_FLAG_SHIFT 7
+	u8 flags1;
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_TASK_CLEANUP_CF_MASK 0x3
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_TASK_CLEANUP_CF_SHIFT 0
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CF1_MASK 0x3
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CF1_SHIFT 2
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CF2_MASK 0x3
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CF2_SHIFT 4
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_TASK_CLEANUP_CF_EN_MASK 0x1
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_TASK_CLEANUP_CF_EN_SHIFT 6
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CF1EN_MASK 0x1
+#define E5_MSTORM_NVMETCP_TASK_AG_CTX_CF1EN_SHIFT 7
+	u8 flags2;
+	u8 flags3;
+	__le32 reg0;
+	u8 byte2;
+	u8 byte3;
+	u8 byte4;
+	u8 e4_reserved7;
+};
+
+struct e5_ustorm_nvmetcp_task_ag_ctx {
+	u8 reserved;
+	u8 state_and_core_id;
+	__le16 icid;
+	u8 flags0;
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT 5
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_HQ_SCANNED_CF_MASK 0x3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_HQ_SCANNED_CF_SHIFT 6
+	u8 flags1;
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED1_MASK 0x3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED1_SHIFT 0
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_R2T2RECV_MASK 0x3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_R2T2RECV_SHIFT 2
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CF3_MASK 0x3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CF3_SHIFT 4
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_CF_MASK 0x3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_CF_SHIFT 6
+	u8 flags2;
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_HQ_SCANNED_CF_EN_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_HQ_SCANNED_CF_EN_SHIFT 0
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_DISABLE_DATA_ACKED_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_DISABLE_DATA_ACKED_SHIFT 1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_R2T2RECV_EN_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_R2T2RECV_EN_SHIFT 2
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CF3EN_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CF3EN_SHIFT 3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_SHIFT 5
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RULE1EN_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_RULE1EN_SHIFT 6
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_SHIFT 7
+	u8 flags3;
+	u8 flags4;
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED5_MASK 0x3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED5_SHIFT 0
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED6_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED6_SHIFT 2
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED7_MASK 0x1
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED7_SHIFT 3
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF
+#define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4
+	u8 byte2;
+	u8 byte3;
+	u8 e4_reserved8;
+	__le32 dif_err_intervals;
+	__le32 dif_error_1st_interval;
+	__le32 rcv_cont_len;
+	__le32 exp_cont_len;
+	__le32 total_data_acked;
+	__le32 exp_data_acked;
+	__le16 word1;
+	__le16 next_tid;
+	__le32 hdr_residual_count;
+	__le32 exp_r2t_sn;
+};
+
+struct e5_nvmetcp_task_context {
+	struct ystorm_nvmetcp_task_st_ctx ystorm_st_context;
+	struct e5_ystorm_nvmetcp_task_ag_ctx ystorm_ag_context;
+	struct regpair ystorm_ag_padding[2];
+	struct e5_tdif_task_context tdif_context;
+	struct e5_mstorm_nvmetcp_task_ag_ctx mstorm_ag_context;
+	struct regpair mstorm_ag_padding[2];
+	struct e5_ustorm_nvmetcp_task_ag_ctx ustorm_ag_context;
+	struct regpair ustorm_ag_padding[2];
+	struct mstorm_nvmetcp_task_st_ctx mstorm_st_context;
+	struct regpair mstorm_st_padding[2];
+	struct ustorm_nvmetcp_task_st_ctx ustorm_st_context;
+	struct regpair ustorm_st_padding[2];
+	struct e5_rdif_task_context rdif_context;
+};
+
+#endif /* __NVMETCP_COMMON__*/
diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h
index 5180edad24e5..606427ebb63c 100644
--- a/include/linux/qed/qed_nvmetcp_if.h
+++ b/include/linux/qed/qed_nvmetcp_if.h
@@ -5,6 +5,8 @@
 #define _QED_NVMETCP_IF_H
 #include <linux/types.h>
 #include <linux/qed/qed_if.h>
+#include <linux/qed/storage_common.h>
+#include <linux/qed/nvmetcp_common.h>
 
 #define QED_NVMETCP_MAX_IO_SIZE	0x800000
 
@@ -72,6 +74,35 @@ struct qed_nvmetcp_cb_ops {
 	struct qed_common_cb_ops common;
 };
 
+struct nvmetcp_sge {
+	struct regpair sge_addr; /* SGE address */
+	__le32 sge_len; /* SGE length */
+	__le32 reserved;
+};
+
+/* IO path HSI function SGL params */
+struct storage_sgl_task_params {
+	struct nvmetcp_sge *sgl;
+	struct regpair sgl_phys_addr;
+	u32 total_buffer_size;
+	u16 num_sges;
+	bool small_mid_sge;
+};
+
+/* IO path HSI function FW task context params */
+struct nvmetcp_task_params {
+	void *context; /* Output parameter - set/filled by the HSI function */
+	struct nvmetcp_wqe *sqe;
+	u32 tx_io_size; /* in bytes (Without DIF, if exists) */
+	u32 rx_io_size; /* in bytes (Without DIF, if exists) */
+	u16 conn_icid;
+	u16 itid;
+	struct regpair opq; /* qedn_task_ctx address */
+	u16 host_cccid;
+	u8 cq_rss_number;
+	bool send_write_incapsule;
+};
+
 /**
  * struct qed_nvmetcp_ops - qed NVMeTCP operations.
  * @common:		common operations pointer
-- 
cgit v1.2.3


From 826da4861430898495fa49f072335e795e8adfd3 Mon Sep 17 00:00:00 2001
From: Shai Malin <smalin@marvell.com>
Date: Wed, 2 Jun 2021 20:16:54 +0300
Subject: qed: Add NVMeTCP Offload IO Level FW Initializations

This patch introduces the NVMeTCP FW initializations which is used
to initialize the IO level configuration into a per IO HW
resource ("task") as part of the IO path flow.

This includes:
- Write IO FW initialization
- Read IO FW initialization.
- IC-Req and IC-Resp FW exchange.
- FW Cleanup flow (Flush IO).

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile           |   5 +-
 drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c      |   7 +-
 .../net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.c | 376 +++++++++++++++++++++
 .../net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.h |  40 +++
 include/linux/qed/nvmetcp_common.h                 |   1 +
 include/linux/qed/qed_nvmetcp_if.h                 |  20 ++
 6 files changed, 447 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index 7cb0db67ba5b..0d9c2fe0245d 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -28,7 +28,10 @@ qed-$(CONFIG_QED_ISCSI) += qed_iscsi.o
 qed-$(CONFIG_QED_LL2) += qed_ll2.o
 qed-$(CONFIG_QED_OOO) += qed_ooo.o
 
-qed-$(CONFIG_QED_NVMETCP) += qed_nvmetcp.o
+qed-$(CONFIG_QED_NVMETCP) +=	\
+	qed_nvmetcp.o		\
+	qed_nvmetcp_fw_funcs.o	\
+	qed_nvmetcp_ip_services.o
 
 qed-$(CONFIG_QED_RDMA) +=	\
 	qed_iwarp.o		\
diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
index d4d609a4d3a3..f19128c8d9cc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp.c
@@ -27,6 +27,7 @@
 #include "qed_mcp.h"
 #include "qed_sp.h"
 #include "qed_reg_addr.h"
+#include "qed_nvmetcp_fw_funcs.h"
 
 static int qed_nvmetcp_async_event(struct qed_hwfn *p_hwfn, u8 fw_event_code,
 				   u16 echo, union event_ring_data *data,
@@ -809,7 +810,11 @@ static const struct qed_nvmetcp_ops qed_nvmetcp_ops_pass = {
 	.remove_src_tcp_port_filter = &qed_llh_remove_src_tcp_port_filter,
 	.add_dst_tcp_port_filter = &qed_llh_add_dst_tcp_port_filter,
 	.remove_dst_tcp_port_filter = &qed_llh_remove_dst_tcp_port_filter,
-	.clear_all_filters = &qed_llh_clear_all_filters
+	.clear_all_filters = &qed_llh_clear_all_filters,
+	.init_read_io = &init_nvmetcp_host_read_task,
+	.init_write_io = &init_nvmetcp_host_write_task,
+	.init_icreq_exchange = &init_nvmetcp_init_conn_req_task,
+	.init_task_cleanup = &init_cleanup_task_nvmetcp
 };
 
 const struct qed_nvmetcp_ops *qed_get_nvmetcp_ops(void)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.c
new file mode 100644
index 000000000000..c1dd71d19f3f
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+/* Copyright 2021 Marvell. All rights reserved. */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <linux/qed/common_hsi.h>
+#include <linux/qed/storage_common.h>
+#include <linux/qed/nvmetcp_common.h>
+#include <linux/qed/qed_nvmetcp_if.h>
+#include "qed_nvmetcp_fw_funcs.h"
+
+#define NVMETCP_NUM_SGES_IN_CACHE 0x4
+
+bool nvmetcp_is_slow_sgl(u16 num_sges, bool small_mid_sge)
+{
+	return (num_sges > SCSI_NUM_SGES_SLOW_SGL_THR && small_mid_sge);
+}
+
+void init_scsi_sgl_context(struct scsi_sgl_params *ctx_sgl_params,
+			   struct scsi_cached_sges *ctx_data_desc,
+			   struct storage_sgl_task_params *sgl_params)
+{
+	u8 num_sges_to_init = (u8)(sgl_params->num_sges > NVMETCP_NUM_SGES_IN_CACHE ?
+				   NVMETCP_NUM_SGES_IN_CACHE : sgl_params->num_sges);
+	u8 sge_index;
+
+	/* sgl params */
+	ctx_sgl_params->sgl_addr.lo = cpu_to_le32(sgl_params->sgl_phys_addr.lo);
+	ctx_sgl_params->sgl_addr.hi = cpu_to_le32(sgl_params->sgl_phys_addr.hi);
+	ctx_sgl_params->sgl_total_length = cpu_to_le32(sgl_params->total_buffer_size);
+	ctx_sgl_params->sgl_num_sges = cpu_to_le16(sgl_params->num_sges);
+
+	for (sge_index = 0; sge_index < num_sges_to_init; sge_index++) {
+		ctx_data_desc->sge[sge_index].sge_addr.lo =
+			cpu_to_le32(sgl_params->sgl[sge_index].sge_addr.lo);
+		ctx_data_desc->sge[sge_index].sge_addr.hi =
+			cpu_to_le32(sgl_params->sgl[sge_index].sge_addr.hi);
+		ctx_data_desc->sge[sge_index].sge_len =
+			cpu_to_le32(sgl_params->sgl[sge_index].sge_len);
+	}
+}
+
+static inline u32 calc_rw_task_size(struct nvmetcp_task_params *task_params,
+				    enum nvmetcp_task_type task_type)
+{
+	u32 io_size;
+
+	if (task_type == NVMETCP_TASK_TYPE_HOST_WRITE)
+		io_size = task_params->tx_io_size;
+	else
+		io_size = task_params->rx_io_size;
+
+	if (unlikely(!io_size))
+		return 0;
+
+	return io_size;
+}
+
+static inline void init_sqe(struct nvmetcp_task_params *task_params,
+			    struct storage_sgl_task_params *sgl_task_params,
+			    enum nvmetcp_task_type task_type)
+{
+	if (!task_params->sqe)
+		return;
+
+	memset(task_params->sqe, 0, sizeof(*task_params->sqe));
+	task_params->sqe->task_id = cpu_to_le16(task_params->itid);
+
+	switch (task_type) {
+	case NVMETCP_TASK_TYPE_HOST_WRITE: {
+		u32 buf_size = 0;
+		u32 num_sges = 0;
+
+		SET_FIELD(task_params->sqe->contlen_cdbsize,
+			  NVMETCP_WQE_CDB_SIZE_OR_NVMETCP_CMD, 1);
+		SET_FIELD(task_params->sqe->flags, NVMETCP_WQE_WQE_TYPE,
+			  NVMETCP_WQE_TYPE_NORMAL);
+		if (task_params->tx_io_size) {
+			if (task_params->send_write_incapsule)
+				buf_size = calc_rw_task_size(task_params, task_type);
+
+			if (nvmetcp_is_slow_sgl(sgl_task_params->num_sges,
+						sgl_task_params->small_mid_sge))
+				num_sges = NVMETCP_WQE_NUM_SGES_SLOWIO;
+			else
+				num_sges = min((u16)sgl_task_params->num_sges,
+					       (u16)SCSI_NUM_SGES_SLOW_SGL_THR);
+		}
+		SET_FIELD(task_params->sqe->flags, NVMETCP_WQE_NUM_SGES, num_sges);
+		SET_FIELD(task_params->sqe->contlen_cdbsize, NVMETCP_WQE_CONT_LEN, buf_size);
+	} break;
+
+	case NVMETCP_TASK_TYPE_HOST_READ: {
+		SET_FIELD(task_params->sqe->flags, NVMETCP_WQE_WQE_TYPE,
+			  NVMETCP_WQE_TYPE_NORMAL);
+		SET_FIELD(task_params->sqe->contlen_cdbsize,
+			  NVMETCP_WQE_CDB_SIZE_OR_NVMETCP_CMD, 1);
+	} break;
+
+	case NVMETCP_TASK_TYPE_INIT_CONN_REQUEST: {
+		SET_FIELD(task_params->sqe->flags, NVMETCP_WQE_WQE_TYPE,
+			  NVMETCP_WQE_TYPE_MIDDLE_PATH);
+
+		if (task_params->tx_io_size) {
+			SET_FIELD(task_params->sqe->contlen_cdbsize, NVMETCP_WQE_CONT_LEN,
+				  task_params->tx_io_size);
+			SET_FIELD(task_params->sqe->flags, NVMETCP_WQE_NUM_SGES,
+				  min((u16)sgl_task_params->num_sges,
+				      (u16)SCSI_NUM_SGES_SLOW_SGL_THR));
+		}
+	} break;
+
+	case NVMETCP_TASK_TYPE_CLEANUP:
+		SET_FIELD(task_params->sqe->flags, NVMETCP_WQE_WQE_TYPE,
+			  NVMETCP_WQE_TYPE_TASK_CLEANUP);
+
+	default:
+		break;
+	}
+}
+
+/* The following function initializes of NVMeTCP task params */
+static inline void
+init_nvmetcp_task_params(struct e5_nvmetcp_task_context *context,
+			 struct nvmetcp_task_params *task_params,
+			 enum nvmetcp_task_type task_type)
+{
+	context->ystorm_st_context.state.cccid = task_params->host_cccid;
+	SET_FIELD(context->ustorm_st_context.error_flags, USTORM_NVMETCP_TASK_ST_CTX_NVME_TCP, 1);
+	context->ustorm_st_context.nvme_tcp_opaque_lo = cpu_to_le32(task_params->opq.lo);
+	context->ustorm_st_context.nvme_tcp_opaque_hi = cpu_to_le32(task_params->opq.hi);
+}
+
+/* The following function initializes default values to all tasks */
+static inline void
+init_default_nvmetcp_task(struct nvmetcp_task_params *task_params,
+			  void *pdu_header, void *nvme_cmd,
+			  enum nvmetcp_task_type task_type)
+{
+	struct e5_nvmetcp_task_context *context = task_params->context;
+	const u8 val_byte = context->mstorm_ag_context.cdu_validation;
+	u8 dw_index;
+
+	memset(context, 0, sizeof(*context));
+	init_nvmetcp_task_params(context, task_params,
+				 (enum nvmetcp_task_type)task_type);
+
+	/* Swapping requirements used below, will be removed in future FW versions */
+	if (task_type == NVMETCP_TASK_TYPE_HOST_WRITE ||
+	    task_type == NVMETCP_TASK_TYPE_HOST_READ) {
+		for (dw_index = 0;
+		     dw_index < QED_NVMETCP_CMN_HDR_SIZE / sizeof(u32);
+		     dw_index++)
+			context->ystorm_st_context.pdu_hdr.task_hdr.reg[dw_index] =
+				cpu_to_le32(__swab32(((u32 *)pdu_header)[dw_index]));
+
+		for (dw_index = QED_NVMETCP_CMN_HDR_SIZE / sizeof(u32);
+		     dw_index < QED_NVMETCP_CMD_HDR_SIZE / sizeof(u32);
+		     dw_index++)
+			context->ystorm_st_context.pdu_hdr.task_hdr.reg[dw_index] =
+				cpu_to_le32(__swab32(((u32 *)nvme_cmd)[dw_index - 2]));
+	} else {
+		for (dw_index = 0;
+		     dw_index < QED_NVMETCP_NON_IO_HDR_SIZE / sizeof(u32);
+		     dw_index++)
+			context->ystorm_st_context.pdu_hdr.task_hdr.reg[dw_index] =
+				cpu_to_le32(__swab32(((u32 *)pdu_header)[dw_index]));
+	}
+
+	/* M-Storm Context: */
+	context->mstorm_ag_context.cdu_validation = val_byte;
+	context->mstorm_st_context.task_type = (u8)(task_type);
+	context->mstorm_ag_context.task_cid = cpu_to_le16(task_params->conn_icid);
+
+	/* Ustorm Context: */
+	SET_FIELD(context->ustorm_ag_context.flags1, E5_USTORM_NVMETCP_TASK_AG_CTX_R2T2RECV, 1);
+	context->ustorm_st_context.task_type = (u8)(task_type);
+	context->ustorm_st_context.cq_rss_number = task_params->cq_rss_number;
+	context->ustorm_ag_context.icid = cpu_to_le16(task_params->conn_icid);
+}
+
+/* The following function initializes the U-Storm Task Contexts */
+static inline void
+init_ustorm_task_contexts(struct ustorm_nvmetcp_task_st_ctx *ustorm_st_context,
+			  struct e5_ustorm_nvmetcp_task_ag_ctx *ustorm_ag_context,
+			  u32 remaining_recv_len,
+			  u32 expected_data_transfer_len, u8 num_sges,
+			  bool tx_dif_conn_err_en)
+{
+	/* Remaining data to be received in bytes. Used in validations*/
+	ustorm_st_context->rem_rcv_len = cpu_to_le32(remaining_recv_len);
+	ustorm_ag_context->exp_data_acked = cpu_to_le32(expected_data_transfer_len);
+	ustorm_st_context->exp_data_transfer_len = cpu_to_le32(expected_data_transfer_len);
+	SET_FIELD(ustorm_st_context->reg1_map, REG1_NUM_SGES, num_sges);
+	SET_FIELD(ustorm_ag_context->flags2, E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_CF_EN,
+		  tx_dif_conn_err_en ? 1 : 0);
+}
+
+/* The following function initializes Local Completion Contexts: */
+static inline void
+set_local_completion_context(struct e5_nvmetcp_task_context *context)
+{
+	SET_FIELD(context->ystorm_st_context.state.flags,
+		  YSTORM_NVMETCP_TASK_STATE_LOCAL_COMP, 1);
+	SET_FIELD(context->ustorm_st_context.flags,
+		  USTORM_NVMETCP_TASK_ST_CTX_LOCAL_COMP, 1);
+}
+
+/* Common Fastpath task init function: */
+static inline void
+init_rw_nvmetcp_task(struct nvmetcp_task_params *task_params,
+		     enum nvmetcp_task_type task_type,
+		     void *pdu_header, void *nvme_cmd,
+		     struct storage_sgl_task_params *sgl_task_params)
+{
+	struct e5_nvmetcp_task_context *context = task_params->context;
+	u32 task_size = calc_rw_task_size(task_params, task_type);
+	bool slow_io = false;
+	u8 num_sges = 0;
+
+	init_default_nvmetcp_task(task_params, pdu_header, nvme_cmd, task_type);
+
+	/* Tx/Rx: */
+	if (task_params->tx_io_size) {
+		/* if data to transmit: */
+		init_scsi_sgl_context(&context->ystorm_st_context.state.sgl_params,
+				      &context->ystorm_st_context.state.data_desc,
+				      sgl_task_params);
+		slow_io = nvmetcp_is_slow_sgl(sgl_task_params->num_sges,
+					      sgl_task_params->small_mid_sge);
+		num_sges =
+			(u8)(!slow_io ? min((u32)sgl_task_params->num_sges,
+					    (u32)SCSI_NUM_SGES_SLOW_SGL_THR) :
+					    NVMETCP_WQE_NUM_SGES_SLOWIO);
+		if (slow_io) {
+			SET_FIELD(context->ystorm_st_context.state.flags,
+				  YSTORM_NVMETCP_TASK_STATE_SLOW_IO, 1);
+		}
+	} else if (task_params->rx_io_size) {
+		/* if data to receive: */
+		init_scsi_sgl_context(&context->mstorm_st_context.sgl_params,
+				      &context->mstorm_st_context.data_desc,
+				      sgl_task_params);
+		num_sges =
+			(u8)(!nvmetcp_is_slow_sgl(sgl_task_params->num_sges,
+						  sgl_task_params->small_mid_sge) ?
+						  min((u32)sgl_task_params->num_sges,
+						      (u32)SCSI_NUM_SGES_SLOW_SGL_THR) :
+						      NVMETCP_WQE_NUM_SGES_SLOWIO);
+		context->mstorm_st_context.rem_task_size = cpu_to_le32(task_size);
+	}
+
+	/* Ustorm context: */
+	init_ustorm_task_contexts(&context->ustorm_st_context,
+				  &context->ustorm_ag_context,
+				  /* Remaining Receive length is the Task Size */
+				  task_size,
+				  /* The size of the transmitted task */
+				  task_size,
+				  /* num_sges */
+				  num_sges,
+				  false);
+
+	/* Set exp_data_acked */
+	if (task_type == NVMETCP_TASK_TYPE_HOST_WRITE) {
+		if (task_params->send_write_incapsule)
+			context->ustorm_ag_context.exp_data_acked = task_size;
+		else
+			context->ustorm_ag_context.exp_data_acked = 0;
+	} else if (task_type == NVMETCP_TASK_TYPE_HOST_READ) {
+		context->ustorm_ag_context.exp_data_acked = 0;
+	}
+
+	context->ustorm_ag_context.exp_cont_len = 0;
+	init_sqe(task_params, sgl_task_params, task_type);
+}
+
+static void
+init_common_initiator_read_task(struct nvmetcp_task_params *task_params,
+				struct nvme_tcp_cmd_pdu *cmd_pdu_header,
+				struct nvme_command *nvme_cmd,
+				struct storage_sgl_task_params *sgl_task_params)
+{
+	init_rw_nvmetcp_task(task_params, NVMETCP_TASK_TYPE_HOST_READ,
+			     cmd_pdu_header, nvme_cmd, sgl_task_params);
+}
+
+void init_nvmetcp_host_read_task(struct nvmetcp_task_params *task_params,
+				 struct nvme_tcp_cmd_pdu *cmd_pdu_header,
+				 struct nvme_command *nvme_cmd,
+				 struct storage_sgl_task_params *sgl_task_params)
+{
+	init_common_initiator_read_task(task_params, (void *)cmd_pdu_header,
+					(void *)nvme_cmd, sgl_task_params);
+}
+
+static void
+init_common_initiator_write_task(struct nvmetcp_task_params *task_params,
+				 struct nvme_tcp_cmd_pdu *cmd_pdu_header,
+				 struct nvme_command *nvme_cmd,
+				 struct storage_sgl_task_params *sgl_task_params)
+{
+	init_rw_nvmetcp_task(task_params, NVMETCP_TASK_TYPE_HOST_WRITE,
+			     cmd_pdu_header, nvme_cmd, sgl_task_params);
+}
+
+void init_nvmetcp_host_write_task(struct nvmetcp_task_params *task_params,
+				  struct nvme_tcp_cmd_pdu *cmd_pdu_header,
+				  struct nvme_command *nvme_cmd,
+				  struct storage_sgl_task_params *sgl_task_params)
+{
+	init_common_initiator_write_task(task_params, (void *)cmd_pdu_header,
+					 (void *)nvme_cmd, sgl_task_params);
+}
+
+static void
+init_common_login_request_task(struct nvmetcp_task_params *task_params,
+			       void *login_req_pdu_header,
+			       struct storage_sgl_task_params *tx_sgl_task_params,
+			       struct storage_sgl_task_params *rx_sgl_task_params)
+{
+	struct e5_nvmetcp_task_context *context = task_params->context;
+
+	init_default_nvmetcp_task(task_params, (void *)login_req_pdu_header, NULL,
+				  NVMETCP_TASK_TYPE_INIT_CONN_REQUEST);
+
+	/* Ustorm Context: */
+	init_ustorm_task_contexts(&context->ustorm_st_context,
+				  &context->ustorm_ag_context,
+
+				  /* Remaining Receive length is the Task Size */
+				  task_params->rx_io_size ?
+				  rx_sgl_task_params->total_buffer_size : 0,
+
+				  /* The size of the transmitted task */
+				  task_params->tx_io_size ?
+				  tx_sgl_task_params->total_buffer_size : 0,
+				  0, /* num_sges */
+				  0); /* tx_dif_conn_err_en */
+
+	/* SGL context: */
+	if (task_params->tx_io_size)
+		init_scsi_sgl_context(&context->ystorm_st_context.state.sgl_params,
+				      &context->ystorm_st_context.state.data_desc,
+				      tx_sgl_task_params);
+	if (task_params->rx_io_size)
+		init_scsi_sgl_context(&context->mstorm_st_context.sgl_params,
+				      &context->mstorm_st_context.data_desc,
+				      rx_sgl_task_params);
+
+	context->mstorm_st_context.rem_task_size =
+		cpu_to_le32(task_params->rx_io_size ?
+				 rx_sgl_task_params->total_buffer_size : 0);
+	init_sqe(task_params, tx_sgl_task_params, NVMETCP_TASK_TYPE_INIT_CONN_REQUEST);
+}
+
+/* The following function initializes Login task in Host mode: */
+void init_nvmetcp_init_conn_req_task(struct nvmetcp_task_params *task_params,
+				     struct nvme_tcp_icreq_pdu *init_conn_req_pdu_hdr,
+				     struct storage_sgl_task_params *tx_sgl_task_params,
+				     struct storage_sgl_task_params *rx_sgl_task_params)
+{
+	init_common_login_request_task(task_params, init_conn_req_pdu_hdr,
+				       tx_sgl_task_params, rx_sgl_task_params);
+}
+
+void init_cleanup_task_nvmetcp(struct nvmetcp_task_params *task_params)
+{
+	init_sqe(task_params, NULL, NVMETCP_TASK_TYPE_CLEANUP);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.h b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.h
new file mode 100644
index 000000000000..4c7ac2bd2ea5
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_fw_funcs.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/* Copyright 2021 Marvell. All rights reserved. */
+
+#ifndef _QED_NVMETCP_FW_FUNCS_H
+#define _QED_NVMETCP_FW_FUNCS_H
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <linux/qed/common_hsi.h>
+#include <linux/qed/storage_common.h>
+#include <linux/qed/nvmetcp_common.h>
+#include <linux/qed/qed_nvmetcp_if.h>
+
+#if IS_ENABLED(CONFIG_QED_NVMETCP)
+
+void init_nvmetcp_host_read_task(struct nvmetcp_task_params *task_params,
+				 struct nvme_tcp_cmd_pdu *cmd_pdu_header,
+				 struct nvme_command *nvme_cmd,
+				 struct storage_sgl_task_params *sgl_task_params);
+void init_nvmetcp_host_write_task(struct nvmetcp_task_params *task_params,
+				  struct nvme_tcp_cmd_pdu *cmd_pdu_header,
+				  struct nvme_command *nvme_cmd,
+				  struct storage_sgl_task_params *sgl_task_params);
+void init_nvmetcp_init_conn_req_task(struct nvmetcp_task_params *task_params,
+				     struct nvme_tcp_icreq_pdu *init_conn_req_pdu_hdr,
+				     struct storage_sgl_task_params *tx_sgl_task_params,
+				     struct storage_sgl_task_params *rx_sgl_task_params);
+void init_cleanup_task_nvmetcp(struct nvmetcp_task_params *task_params);
+
+#else /* IS_ENABLED(CONFIG_QED_NVMETCP) */
+
+#endif /* IS_ENABLED(CONFIG_QED_NVMETCP) */
+
+#endif /* _QED_NVMETCP_FW_FUNCS_H */
diff --git a/include/linux/qed/nvmetcp_common.h b/include/linux/qed/nvmetcp_common.h
index ad745a9c2264..5a2ab0606308 100644
--- a/include/linux/qed/nvmetcp_common.h
+++ b/include/linux/qed/nvmetcp_common.h
@@ -5,6 +5,7 @@
 #define __NVMETCP_COMMON__
 
 #include "tcp_common.h"
+#include <linux/nvme-tcp.h>
 
 #define NVMETCP_SLOW_PATH_LAYER_CODE (6)
 #define NVMETCP_WQE_NUM_SGES_SLOWIO (0xf)
diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h
index 606427ebb63c..14671bc19ed1 100644
--- a/include/linux/qed/qed_nvmetcp_if.h
+++ b/include/linux/qed/qed_nvmetcp_if.h
@@ -9,6 +9,9 @@
 #include <linux/qed/nvmetcp_common.h>
 
 #define QED_NVMETCP_MAX_IO_SIZE	0x800000
+#define QED_NVMETCP_CMN_HDR_SIZE (sizeof(struct nvme_tcp_hdr))
+#define QED_NVMETCP_CMD_HDR_SIZE (sizeof(struct nvme_tcp_cmd_pdu))
+#define QED_NVMETCP_NON_IO_HDR_SIZE ((QED_NVMETCP_CMN_HDR_SIZE + 16))
 
 typedef int (*nvmetcp_event_cb_t) (void *context,
 				   u8 fw_event_code, void *fw_handle);
@@ -213,6 +216,23 @@ struct qed_nvmetcp_ops {
 	void (*remove_dst_tcp_port_filter)(struct qed_dev *cdev, u16 dest_port);
 
 	void (*clear_all_filters)(struct qed_dev *cdev);
+
+	void (*init_read_io)(struct nvmetcp_task_params *task_params,
+			     struct nvme_tcp_cmd_pdu *cmd_pdu_header,
+			     struct nvme_command *nvme_cmd,
+			     struct storage_sgl_task_params *sgl_task_params);
+
+	void (*init_write_io)(struct nvmetcp_task_params *task_params,
+			      struct nvme_tcp_cmd_pdu *cmd_pdu_header,
+			      struct nvme_command *nvme_cmd,
+			      struct storage_sgl_task_params *sgl_task_params);
+
+	void (*init_icreq_exchange)(struct nvmetcp_task_params *task_params,
+				    struct nvme_tcp_icreq_pdu *init_conn_req_pdu_hdr,
+				    struct storage_sgl_task_params *tx_sgl_task_params,
+				    struct storage_sgl_task_params *rx_sgl_task_params);
+
+	void (*init_task_cleanup)(struct nvmetcp_task_params *task_params);
 };
 
 const struct qed_nvmetcp_ops *qed_get_nvmetcp_ops(void);
-- 
cgit v1.2.3


From 806ee7f81a2b037e3f57275adcdf974453cc3254 Mon Sep 17 00:00:00 2001
From: Nikolay Assa <nassa@marvell.com>
Date: Wed, 2 Jun 2021 20:16:55 +0300
Subject: qed: Add IP services APIs support

This patch introduces APIs which the NVMeTCP Offload device (qedn)
will use through the paired net-device (qede).
It includes APIs for:
- ipv4/ipv6 routing
- get VLAN from net-device
- TCP ports reservation

Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Nikolay Assa <nassa@marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
Signed-off-by: Omkar Kulkarni <okulkarni@marvell.com>
Signed-off-by: Michal Kalderon <mkalderon@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
Signed-off-by: Shai Malin <smalin@marvell.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/qlogic/qed/qed_nvmetcp_ip_services.c  | 238 +++++++++++++++++++++
 include/linux/qed/qed_nvmetcp_ip_services_if.h     |  29 +++
 2 files changed, 267 insertions(+)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_nvmetcp_ip_services.c
 create mode 100644 include/linux/qed/qed_nvmetcp_ip_services_if.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_ip_services.c b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_ip_services.c
new file mode 100644
index 000000000000..96a2077fd315
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_nvmetcp_ip_services.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+/*
+ * Copyright 2021 Marvell. All rights reserved.
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <asm/param.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/errno.h>
+
+#include <net/tcp.h>
+
+#include <linux/qed/qed_nvmetcp_ip_services_if.h>
+
+#define QED_IP_RESOL_TIMEOUT  4
+
+int qed_route_ipv4(struct sockaddr_storage *local_addr,
+		   struct sockaddr_storage *remote_addr,
+		   struct sockaddr *hardware_address,
+		   struct net_device **ndev)
+{
+	struct neighbour *neigh = NULL;
+	__be32 *loc_ip, *rem_ip;
+	struct rtable *rt;
+	int rc = -ENXIO;
+	int retry;
+
+	loc_ip = &((struct sockaddr_in *)local_addr)->sin_addr.s_addr;
+	rem_ip = &((struct sockaddr_in *)remote_addr)->sin_addr.s_addr;
+	*ndev = NULL;
+	rt = ip_route_output(&init_net, *rem_ip, *loc_ip, 0/*tos*/, 0/*oif*/);
+	if (IS_ERR(rt)) {
+		pr_err("lookup route failed\n");
+		rc = PTR_ERR(rt);
+		goto return_err;
+	}
+
+	neigh = dst_neigh_lookup(&rt->dst, rem_ip);
+	if (!neigh) {
+		rc = -ENOMEM;
+		ip_rt_put(rt);
+		goto return_err;
+	}
+
+	*ndev = rt->dst.dev;
+	ip_rt_put(rt);
+
+	/* If not resolved, kick-off state machine towards resolution */
+	if (!(neigh->nud_state & NUD_VALID))
+		neigh_event_send(neigh, NULL);
+
+	/* query neighbor until resolved or timeout */
+	retry = QED_IP_RESOL_TIMEOUT;
+	while (!(neigh->nud_state & NUD_VALID) && retry > 0) {
+		msleep(1000);
+		retry--;
+	}
+
+	if (neigh->nud_state & NUD_VALID) {
+		/* copy resolved MAC address */
+		neigh_ha_snapshot(hardware_address->sa_data, neigh, *ndev);
+		hardware_address->sa_family = (*ndev)->type;
+		rc = 0;
+	}
+
+	neigh_release(neigh);
+	if (!(*loc_ip)) {
+		*loc_ip = inet_select_addr(*ndev, *rem_ip, RT_SCOPE_UNIVERSE);
+		local_addr->ss_family = AF_INET;
+	}
+
+return_err:
+
+	return rc;
+}
+EXPORT_SYMBOL(qed_route_ipv4);
+
+int qed_route_ipv6(struct sockaddr_storage *local_addr,
+		   struct sockaddr_storage *remote_addr,
+		   struct sockaddr *hardware_address,
+		   struct net_device **ndev)
+{
+	struct neighbour *neigh = NULL;
+	struct dst_entry *dst;
+	struct flowi6 fl6;
+	int rc = -ENXIO;
+	int retry;
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.saddr = ((struct sockaddr_in6 *)local_addr)->sin6_addr;
+	fl6.daddr = ((struct sockaddr_in6 *)remote_addr)->sin6_addr;
+	dst = ip6_route_output(&init_net, NULL, &fl6);
+	if (!dst || dst->error) {
+		if (dst) {
+			dst_release(dst);
+			pr_err("lookup route failed %d\n", dst->error);
+		}
+
+		goto out;
+	}
+
+	neigh = dst_neigh_lookup(dst, &fl6.daddr);
+	if (neigh) {
+		*ndev = ip6_dst_idev(dst)->dev;
+
+		/* If not resolved, kick-off state machine towards resolution */
+		if (!(neigh->nud_state & NUD_VALID))
+			neigh_event_send(neigh, NULL);
+
+		/* query neighbor until resolved or timeout */
+		retry = QED_IP_RESOL_TIMEOUT;
+		while (!(neigh->nud_state & NUD_VALID) && retry > 0) {
+			msleep(1000);
+			retry--;
+		}
+
+		if (neigh->nud_state & NUD_VALID) {
+			neigh_ha_snapshot((u8 *)hardware_address->sa_data,
+					  neigh, *ndev);
+			hardware_address->sa_family = (*ndev)->type;
+			rc = 0;
+		}
+
+		neigh_release(neigh);
+
+		if (ipv6_addr_any(&fl6.saddr)) {
+			if (ipv6_dev_get_saddr(dev_net(*ndev), *ndev,
+					       &fl6.daddr, 0, &fl6.saddr)) {
+				pr_err("Unable to find source IP address\n");
+				goto out;
+			}
+
+			local_addr->ss_family = AF_INET6;
+			((struct sockaddr_in6 *)local_addr)->sin6_addr =
+								fl6.saddr;
+		}
+	}
+
+	dst_release(dst);
+
+out:
+
+	return rc;
+}
+EXPORT_SYMBOL(qed_route_ipv6);
+
+void qed_vlan_get_ndev(struct net_device **ndev, u16 *vlan_id)
+{
+	if (is_vlan_dev(*ndev)) {
+		*vlan_id = vlan_dev_vlan_id(*ndev);
+		*ndev = vlan_dev_real_dev(*ndev);
+	}
+}
+EXPORT_SYMBOL(qed_vlan_get_ndev);
+
+struct pci_dev *qed_validate_ndev(struct net_device *ndev)
+{
+	struct pci_dev *pdev = NULL;
+	struct net_device *upper;
+
+	for_each_pci_dev(pdev) {
+		if (pdev && pdev->driver &&
+		    !strcmp(pdev->driver->name, "qede")) {
+			upper = pci_get_drvdata(pdev);
+			if (upper->ifindex == ndev->ifindex)
+				return pdev;
+		}
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(qed_validate_ndev);
+
+__be16 qed_get_in_port(struct sockaddr_storage *sa)
+{
+	return sa->ss_family == AF_INET
+		? ((struct sockaddr_in *)sa)->sin_port
+		: ((struct sockaddr_in6 *)sa)->sin6_port;
+}
+EXPORT_SYMBOL(qed_get_in_port);
+
+int qed_fetch_tcp_port(struct sockaddr_storage local_ip_addr,
+		       struct socket **sock, u16 *port)
+{
+	struct sockaddr_storage sa;
+	int rc = 0;
+
+	rc = sock_create(local_ip_addr.ss_family, SOCK_STREAM, IPPROTO_TCP,
+			 sock);
+	if (rc) {
+		pr_warn("failed to create socket: %d\n", rc);
+		goto err;
+	}
+
+	(*sock)->sk->sk_allocation = GFP_KERNEL;
+	sk_set_memalloc((*sock)->sk);
+
+	rc = kernel_bind(*sock, (struct sockaddr *)&local_ip_addr,
+			 sizeof(local_ip_addr));
+
+	if (rc) {
+		pr_warn("failed to bind socket: %d\n", rc);
+		goto err_sock;
+	}
+
+	rc = kernel_getsockname(*sock, (struct sockaddr *)&sa);
+	if (rc < 0) {
+		pr_warn("getsockname() failed: %d\n", rc);
+		goto err_sock;
+	}
+
+	*port = ntohs(qed_get_in_port(&sa));
+
+	return 0;
+
+err_sock:
+	sock_release(*sock);
+	sock = NULL;
+err:
+
+	return rc;
+}
+EXPORT_SYMBOL(qed_fetch_tcp_port);
+
+void qed_return_tcp_port(struct socket *sock)
+{
+	if (sock && sock->sk) {
+		tcp_set_state(sock->sk, TCP_CLOSE);
+		sock_release(sock);
+	}
+}
+EXPORT_SYMBOL(qed_return_tcp_port);
diff --git a/include/linux/qed/qed_nvmetcp_ip_services_if.h b/include/linux/qed/qed_nvmetcp_ip_services_if.h
new file mode 100644
index 000000000000..3604aee53796
--- /dev/null
+++ b/include/linux/qed/qed_nvmetcp_ip_services_if.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/*
+ * Copyright 2021 Marvell. All rights reserved.
+ */
+
+#ifndef _QED_IP_SERVICES_IF_H
+#define _QED_IP_SERVICES_IF_H
+
+#include <linux/types.h>
+#include <net/route.h>
+#include <net/ip6_route.h>
+#include <linux/inetdevice.h>
+
+int qed_route_ipv4(struct sockaddr_storage *local_addr,
+		   struct sockaddr_storage *remote_addr,
+		   struct sockaddr *hardware_address,
+		   struct net_device **ndev);
+int qed_route_ipv6(struct sockaddr_storage *local_addr,
+		   struct sockaddr_storage *remote_addr,
+		   struct sockaddr *hardware_address,
+		   struct net_device **ndev);
+void qed_vlan_get_ndev(struct net_device **ndev, u16 *vlan_id);
+struct pci_dev *qed_validate_ndev(struct net_device *ndev);
+void qed_return_tcp_port(struct socket *sock);
+int qed_fetch_tcp_port(struct sockaddr_storage local_ip_addr,
+		       struct socket **sock, u16 *port);
+__be16 qed_get_in_port(struct sockaddr_storage *sa);
+
+#endif /* _QED_IP_SERVICES_IF_H */
-- 
cgit v1.2.3


From 9e40ee18a1dc1623a5368d6232aaed52fd29dada Mon Sep 17 00:00:00 2001
From: Clemens Gruber <clemens.gruber@pqgruber.com>
Date: Fri, 7 May 2021 15:18:42 +0200
Subject: pwm: core: Support new usage_power setting in PWM state

If usage_power is set, the PWM driver is only required to maintain
the power output but has more freedom regarding signal form.

If supported, the signal can be optimized, for example to
improve EMI by phase shifting individual channels.

Signed-off-by: Clemens Gruber <clemens.gruber@pqgruber.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 Documentation/driver-api/pwm.rst | 4 ++++
 drivers/pwm/core.c               | 6 +++++-
 include/linux/pwm.h              | 7 +++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst
index a7ca4f58305a..750734a7f874 100644
--- a/Documentation/driver-api/pwm.rst
+++ b/Documentation/driver-api/pwm.rst
@@ -48,6 +48,10 @@ After being requested, a PWM has to be configured using::
 
 This API controls both the PWM period/duty_cycle config and the
 enable/disable state.
+There is also a usage_power setting: If set, the PWM driver is only required to
+maintain the power output but has more freedom regarding signal form.
+If supported by the driver, the signal can be optimized, for example to improve
+EMI by phase shifting the individual channels of a chip.
 
 The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers
 around pwm_apply_state() and should not be used if the user wants to change
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index c165c5822703..a42999f877d2 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -536,7 +536,8 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state)
 	if (state->period == pwm->state.period &&
 	    state->duty_cycle == pwm->state.duty_cycle &&
 	    state->polarity == pwm->state.polarity &&
-	    state->enabled == pwm->state.enabled)
+	    state->enabled == pwm->state.enabled &&
+	    state->usage_power == pwm->state.usage_power)
 		return 0;
 
 	if (chip->ops->apply) {
@@ -1241,6 +1242,9 @@ static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
 		seq_printf(s, " polarity: %s",
 			   state.polarity ? "inverse" : "normal");
 
+		if (state.usage_power)
+			seq_puts(s, " usage_power");
+
 		seq_puts(s, "\n");
 	}
 }
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 5bb90af4997e..5a73251d28e3 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -54,12 +54,17 @@ enum {
  * @duty_cycle: PWM duty cycle (in nanoseconds)
  * @polarity: PWM polarity
  * @enabled: PWM enabled status
+ * @usage_power: If set, the PWM driver is only required to maintain the power
+ *               output but has more freedom regarding signal form.
+ *               If supported, the signal can be optimized, for example to
+ *               improve EMI by phase shifting individual channels.
  */
 struct pwm_state {
 	u64 period;
 	u64 duty_cycle;
 	enum pwm_polarity polarity;
 	bool enabled;
+	bool usage_power;
 };
 
 /**
@@ -188,6 +193,7 @@ static inline void pwm_init_state(const struct pwm_device *pwm,
 	state->period = args.period;
 	state->polarity = args.polarity;
 	state->duty_cycle = 0;
+	state->usage_power = false;
 }
 
 /**
@@ -558,6 +564,7 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
 	state.enabled = false;
 	state.polarity = pwm->args.polarity;
 	state.period = pwm->args.period;
+	state.usage_power = false;
 
 	pwm_apply_state(pwm, &state);
 }
-- 
cgit v1.2.3


From cd70c85c5752f060b09b0cf5b7694717471ce998 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Date: Fri, 7 May 2021 12:19:27 -0400
Subject: power: supply: max17040: drop unused platform data support

There are no platforms using the driver with platform data (no board
files with the driver), so the dead code can be dropped.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/max17040_battery.c |  3 ---
 include/linux/max17040_battery.h        | 16 ----------------
 2 files changed, 19 deletions(-)
 delete mode 100644 include/linux/max17040_battery.h

(limited to 'include/linux')

diff --git a/drivers/power/supply/max17040_battery.c b/drivers/power/supply/max17040_battery.c
index aecc0c840351..3cea92e28dc3 100644
--- a/drivers/power/supply/max17040_battery.c
+++ b/drivers/power/supply/max17040_battery.c
@@ -16,7 +16,6 @@
 #include <linux/interrupt.h>
 #include <linux/power_supply.h>
 #include <linux/of_device.h>
-#include <linux/max17040_battery.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
 
@@ -142,7 +141,6 @@ struct max17040_chip {
 	struct regmap			*regmap;
 	struct delayed_work		work;
 	struct power_supply		*battery;
-	struct max17040_platform_data	*pdata;
 	struct chip_data		data;
 
 	/* battery capacity */
@@ -451,7 +449,6 @@ static int max17040_probe(struct i2c_client *client,
 
 	chip->client = client;
 	chip->regmap = devm_regmap_init_i2c(client, &max17040_regmap);
-	chip->pdata = client->dev.platform_data;
 	chip_id = (enum chip_id) id->driver_data;
 	if (client->dev.of_node) {
 		ret = max17040_get_of_data(chip);
diff --git a/include/linux/max17040_battery.h b/include/linux/max17040_battery.h
deleted file mode 100644
index 593602fc9317..000000000000
--- a/include/linux/max17040_battery.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- *  Copyright (C) 2009 Samsung Electronics
- *  Minkyu Kang <mk7.kang@samsung.com>
- */
-
-#ifndef __MAX17040_BATTERY_H_
-#define __MAX17040_BATTERY_H_
-
-struct max17040_platform_data {
-	int (*battery_online)(void);
-	int (*charger_online)(void);
-	int (*charger_enable)(void);
-};
-
-#endif
-- 
cgit v1.2.3


From 063933f47a7af01650af9c4fbcc5831f1c4eb7d9 Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Tue, 1 Jun 2021 00:49:28 +0800
Subject: usb: typec: tcpm: Properly handle Alert and Status Messages

When receiving Alert Message, if it is not unexpected but is
unsupported for some reason, the port should return Not_Supported
Message response.

Also, according to PD3.0 Spec 6.5.2.1.4 Event Flags Field, the
OTP/OVP/OCP flags in the Event Flags field in Status Message no longer
require Get_PPS_Status Message to clear them. Thus remove it when
receiving Status Message with those flags being set.

In addition, add the missing AMS operations for Status Message.

Fixes: 64f7c494a3c0 ("typec: tcpm: Add support for sink PPS related messages")
Fixes: 0908c5aca31e ("usb: typec: tcpm: AMS and Collision Avoidance")
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210531164928.2368606-1-kyletso@google.com
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c  | 52 ++++++++++++++++++++++--------------------
 include/linux/usb/pd_ext_sdb.h |  4 ----
 2 files changed, 27 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 6161a0c1dc0e..938a1afd43ec 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -2188,20 +2188,25 @@ static void tcpm_handle_alert(struct tcpm_port *port, const __le32 *payload,
 
 	if (!type) {
 		tcpm_log(port, "Alert message received with no type");
+		tcpm_queue_message(port, PD_MSG_CTRL_NOT_SUPP);
 		return;
 	}
 
 	/* Just handling non-battery alerts for now */
 	if (!(type & USB_PD_ADO_TYPE_BATT_STATUS_CHANGE)) {
-		switch (port->state) {
-		case SRC_READY:
-		case SNK_READY:
+		if (port->pwr_role == TYPEC_SOURCE) {
+			port->upcoming_state = GET_STATUS_SEND;
+			tcpm_ams_start(port, GETTING_SOURCE_SINK_STATUS);
+		} else {
+			/*
+			 * Do not check SinkTxOk here in case the Source doesn't set its Rp to
+			 * SinkTxOk in time.
+			 */
+			port->ams = GETTING_SOURCE_SINK_STATUS;
 			tcpm_set_state(port, GET_STATUS_SEND, 0);
-			break;
-		default:
-			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
-			break;
 		}
+	} else {
+		tcpm_queue_message(port, PD_MSG_CTRL_NOT_SUPP);
 	}
 }
 
@@ -2445,7 +2450,12 @@ static void tcpm_pd_data_request(struct tcpm_port *port,
 		tcpm_pd_handle_state(port, BIST_RX, BIST, 0);
 		break;
 	case PD_DATA_ALERT:
-		tcpm_handle_alert(port, msg->payload, cnt);
+		if (port->state != SRC_READY && port->state != SNK_READY)
+			tcpm_pd_handle_state(port, port->pwr_role == TYPEC_SOURCE ?
+					     SRC_SOFT_RESET_WAIT_SNK_TX : SNK_SOFT_RESET,
+					     NONE_AMS, 0);
+		else
+			tcpm_handle_alert(port, msg->payload, cnt);
 		break;
 	case PD_DATA_BATT_STATUS:
 	case PD_DATA_GET_COUNTRY_INFO:
@@ -2769,24 +2779,16 @@ static void tcpm_pd_ext_msg_request(struct tcpm_port *port,
 
 	switch (type) {
 	case PD_EXT_STATUS:
-		/*
-		 * If PPS related events raised then get PPS status to clear
-		 * (see USB PD 3.0 Spec, 6.5.2.4)
-		 */
-		if (msg->ext_msg.data[USB_PD_EXT_SDB_EVENT_FLAGS] &
-		    USB_PD_EXT_SDB_PPS_EVENTS)
-			tcpm_pd_handle_state(port, GET_PPS_STATUS_SEND,
-					     GETTING_SOURCE_SINK_STATUS, 0);
-
-		else
-			tcpm_pd_handle_state(port, ready_state(port), NONE_AMS, 0);
-		break;
 	case PD_EXT_PPS_STATUS:
-		/*
-		 * For now the PPS status message is used to clear events
-		 * and nothing more.
-		 */
-		tcpm_pd_handle_state(port, ready_state(port), NONE_AMS, 0);
+		if (port->ams == GETTING_SOURCE_SINK_STATUS) {
+			tcpm_ams_finish(port);
+			tcpm_set_state(port, ready_state(port), 0);
+		} else {
+			/* unexpected Status or PPS_Status Message */
+			tcpm_pd_handle_state(port, port->pwr_role == TYPEC_SOURCE ?
+					     SRC_SOFT_RESET_WAIT_SNK_TX : SNK_SOFT_RESET,
+					     NONE_AMS, 0);
+		}
 		break;
 	case PD_EXT_SOURCE_CAP_EXT:
 	case PD_EXT_GET_BATT_CAP:
diff --git a/include/linux/usb/pd_ext_sdb.h b/include/linux/usb/pd_ext_sdb.h
index 0eb83ce19597..b517ebc8f0ff 100644
--- a/include/linux/usb/pd_ext_sdb.h
+++ b/include/linux/usb/pd_ext_sdb.h
@@ -24,8 +24,4 @@ enum usb_pd_ext_sdb_fields {
 #define USB_PD_EXT_SDB_EVENT_OVP		BIT(3)
 #define USB_PD_EXT_SDB_EVENT_CF_CV_MODE		BIT(4)
 
-#define USB_PD_EXT_SDB_PPS_EVENTS	(USB_PD_EXT_SDB_EVENT_OCP |	\
-					 USB_PD_EXT_SDB_EVENT_OTP |	\
-					 USB_PD_EXT_SDB_EVENT_OVP)
-
 #endif /* __LINUX_USB_PD_EXT_SDB_H */
-- 
cgit v1.2.3


From 7dc0c55e9f302e7048e040ee4437437bbea1e2cd Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Thu, 20 May 2021 16:21:44 -0400
Subject: USB: UDC core: Add udc_async_callbacks gadget op

The Gadget API has a theoretical race when a gadget driver is unbound.
Although the pull-up is turned off before the driver's ->unbind
callback runs, if the USB cable were to be unplugged at just the wrong
moment there would be nothing to prevent the UDC driver from invoking
the ->disconnect callback after the unbind has finished.  In theory,
other asynchronous callbacks could also happen during the time before
the UDC driver's udc_stop routine is called, and the gadget driver
would not be prepared to handle any of them.

We need a way to tell UDC drivers to stop issuing asynchronous (that is,
->suspend, ->resume, ->disconnect, ->reset, or ->setup) callbacks at
some point after the pull-up has been turned off and before the
->unbind callback runs.  This patch adds a new ->udc_async_callbacks
callback to the usb_gadget_ops structure for precisely this purpose,
and it adds the corresponding support to the UDC core.

Later patches in this series add support for udc_async_callbacks to
several UDC drivers.

Acked-by: Felipe Balbi <balbi@kernel.org>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20210520202144.GC1216852@rowland.harvard.edu
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/udc/core.c | 49 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/gadget.h    |  1 +
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 493ff93f7dda..b7f0b1ebaaa8 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1147,6 +1147,53 @@ static inline void usb_gadget_udc_set_speed(struct usb_udc *udc,
 		gadget->ops->udc_set_speed(gadget, s);
 }
 
+/**
+ * usb_gadget_enable_async_callbacks - tell usb device controller to enable asynchronous callbacks
+ * @udc: The UDC which should enable async callbacks
+ *
+ * This routine is used when binding gadget drivers.  It undoes the effect
+ * of usb_gadget_disable_async_callbacks(); the UDC driver should enable IRQs
+ * (if necessary) and resume issuing callbacks.
+ *
+ * This routine will always be called in process context.
+ */
+static inline void usb_gadget_enable_async_callbacks(struct usb_udc *udc)
+{
+	struct usb_gadget *gadget = udc->gadget;
+
+	if (gadget->ops->udc_async_callbacks)
+		gadget->ops->udc_async_callbacks(gadget, true);
+}
+
+/**
+ * usb_gadget_disable_async_callbacks - tell usb device controller to disable asynchronous callbacks
+ * @udc: The UDC which should disable async callbacks
+ *
+ * This routine is used when unbinding gadget drivers.  It prevents a race:
+ * The UDC driver doesn't know when the gadget driver's ->unbind callback
+ * runs, so unless it is told to disable asynchronous callbacks, it might
+ * issue a callback (such as ->disconnect) after the unbind has completed.
+ *
+ * After this function runs, the UDC driver must suppress all ->suspend,
+ * ->resume, ->disconnect, ->reset, and ->setup callbacks to the gadget driver
+ * until async callbacks are again enabled.  A simple-minded but effective
+ * way to accomplish this is to tell the UDC hardware not to generate any
+ * more IRQs.
+ *
+ * Request completion callbacks must still be issued.  However, it's okay
+ * to defer them until the request is cancelled, since the pull-up will be
+ * turned off during the time period when async callbacks are disabled.
+ *
+ * This routine will always be called in process context.
+ */
+static inline void usb_gadget_disable_async_callbacks(struct usb_udc *udc)
+{
+	struct usb_gadget *gadget = udc->gadget;
+
+	if (gadget->ops->udc_async_callbacks)
+		gadget->ops->udc_async_callbacks(gadget, false);
+}
+
 /**
  * usb_udc_release - release the usb_udc struct
  * @dev: the dev member within usb_udc
@@ -1361,6 +1408,7 @@ static void usb_gadget_remove_driver(struct usb_udc *udc)
 	kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE);
 
 	usb_gadget_disconnect(udc->gadget);
+	usb_gadget_disable_async_callbacks(udc);
 	if (udc->gadget->irq)
 		synchronize_irq(udc->gadget->irq);
 	udc->driver->unbind(udc->gadget);
@@ -1442,6 +1490,7 @@ static int udc_bind_to_driver(struct usb_udc *udc, struct usb_gadget_driver *dri
 		driver->unbind(udc->gadget);
 		goto err1;
 	}
+	usb_gadget_enable_async_callbacks(udc);
 	usb_udc_connect_control(udc);
 
 	kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE);
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 8811eb96e5cc..75c7538e350a 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -325,6 +325,7 @@ struct usb_gadget_ops {
 	void	(*udc_set_speed)(struct usb_gadget *, enum usb_device_speed);
 	void	(*udc_set_ssp_rate)(struct usb_gadget *gadget,
 			enum usb_ssp_rate rate);
+	void	(*udc_async_callbacks)(struct usb_gadget *gadget, bool enable);
 	struct usb_ep *(*match_ep)(struct usb_gadget *,
 			struct usb_endpoint_descriptor *,
 			struct usb_ss_ep_comp_descriptor *);
-- 
cgit v1.2.3


From 43582f29b161d820717bc13f562bca27af12e3cf Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 3 Jun 2021 23:40:04 +0100
Subject: gpiolib: acpi: Introduce acpi_get_and_request_gpiod() helper

We need to be able to translate GPIO resources in an ACPI device's _CRS
into GPIO descriptor array. Those are represented in _CRS as a pathname
to a GPIO device plus the pin's index number: the acpi_get_gpiod()
function is perfect for that purpose.

As it's currently only used internally within the GPIO layer, provide and
export a wrapper function that additionally holds a reference to the GPIO
device.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Daniel Scally <djrscally@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/gpio/gpiolib-acpi.c   | 28 ++++++++++++++++++++++++++++
 include/linux/gpio/consumer.h |  2 ++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 3ef22a3c104d..75cd0c5a5cc4 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -128,6 +128,34 @@ static struct gpio_desc *acpi_get_gpiod(char *path, int pin)
 	return gpiochip_get_desc(chip, pin);
 }
 
+/**
+ * acpi_get_and_request_gpiod - Translate ACPI GPIO pin to GPIO descriptor and
+ *                              hold a refcount to the GPIO device.
+ * @path:      ACPI GPIO controller full path name, (e.g. "\\_SB.GPO1")
+ * @pin:       ACPI GPIO pin number (0-based, controller-relative)
+ * @label:     Label to pass to gpiod_request()
+ *
+ * This function is a simple pass-through to acpi_get_gpiod(), except that
+ * as it is intended for use outside of the GPIO layer (in a similar fashion to
+ * gpiod_get_index() for example) it also holds a reference to the GPIO device.
+ */
+struct gpio_desc *acpi_get_and_request_gpiod(char *path, int pin, char *label)
+{
+	struct gpio_desc *gpio;
+	int ret;
+
+	gpio = acpi_get_gpiod(path, pin);
+	if (IS_ERR(gpio))
+		return gpio;
+
+	ret = gpiod_request(gpio, label);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return gpio;
+}
+EXPORT_SYMBOL_GPL(acpi_get_and_request_gpiod);
+
 static irqreturn_t acpi_gpio_irq_handler(int irq, void *data)
 {
 	struct acpi_gpio_event *event = data;
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index c73b25bc9213..566feb56601f 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -692,6 +692,8 @@ int devm_acpi_dev_add_driver_gpios(struct device *dev,
 				   const struct acpi_gpio_mapping *gpios);
 void devm_acpi_dev_remove_driver_gpios(struct device *dev);
 
+struct gpio_desc *acpi_get_and_request_gpiod(char *path, int pin, char *label);
+
 #else  /* CONFIG_GPIOLIB && CONFIG_ACPI */
 
 struct acpi_device;
-- 
cgit v1.2.3


From 043d7f09bf614809c10c4acbf0695ef731958300 Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 3 Jun 2021 23:40:05 +0100
Subject: gpiolib: acpi: Add acpi_gpio_get_io_resource()

Add a function to verify that a given ACPI resource represents a GpioIo()
type of resource, and return it if so.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Daniel Scally <djrscally@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/gpio/gpiolib-acpi.c | 23 +++++++++++++++++++++++
 include/linux/acpi.h        |  7 +++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 75cd0c5a5cc4..cf99a5752fe0 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -196,6 +196,29 @@ bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
 }
 EXPORT_SYMBOL_GPL(acpi_gpio_get_irq_resource);
 
+/**
+ * acpi_gpio_get_io_resource - Fetch details of an ACPI resource if it is a GPIO
+ *			       I/O resource or return False if not.
+ * @ares:	Pointer to the ACPI resource to fetch
+ * @agpio:	Pointer to a &struct acpi_resource_gpio to store the output pointer
+ */
+bool acpi_gpio_get_io_resource(struct acpi_resource *ares,
+			       struct acpi_resource_gpio **agpio)
+{
+	struct acpi_resource_gpio *gpio;
+
+	if (ares->type != ACPI_RESOURCE_TYPE_GPIO)
+		return false;
+
+	gpio = &ares->data.gpio;
+	if (gpio->connection_type != ACPI_RESOURCE_GPIO_TYPE_IO)
+		return false;
+
+	*agpio = gpio;
+	return true;
+}
+EXPORT_SYMBOL_GPL(acpi_gpio_get_io_resource);
+
 static void acpi_gpiochip_request_irq(struct acpi_gpio_chip *acpi_gpio,
 				      struct acpi_gpio_event *event)
 {
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..a74d37a3b618 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1096,6 +1096,8 @@ void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const c
 #if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB)
 bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
 				struct acpi_resource_gpio **agpio);
+bool acpi_gpio_get_io_resource(struct acpi_resource *ares,
+			       struct acpi_resource_gpio **agpio);
 int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index);
 #else
 static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
@@ -1103,6 +1105,11 @@ static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
 {
 	return false;
 }
+static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares,
+					     struct acpi_resource_gpio **agpio)
+{
+	return false;
+}
 static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev,
 					   const char *name, int index)
 {
-- 
cgit v1.2.3


From 603e4922f1c81fc2ed3a87b4f91a8d3aafc7e093 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 May 2021 10:25:26 +0300
Subject: remove the raw driver

The raw driver used to provide direct unbuffered access to block devices
before O_DIRECT was invented.  It has been obsolete for more than a
decade.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/lkml/Pine.LNX.4.64.0703180754060.6605@CPE00045a9c397f-CM001225dbafb6/
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210531072526.97052-1-hch@lst.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/char/Kconfig     |  21 ---
 drivers/char/Makefile    |   1 -
 drivers/char/mem.c       |   1 -
 drivers/char/raw.c       | 362 -----------------------------------------------
 fs/block_dev.c           |   6 +-
 include/linux/fs.h       |   3 -
 include/uapi/linux/raw.h |  17 ---
 7 files changed, 2 insertions(+), 409 deletions(-)
 delete mode 100644 drivers/char/raw.c
 delete mode 100644 include/uapi/linux/raw.h

(limited to 'include/linux')

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index b151e0fcdeb5..8e516aad632b 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -357,27 +357,6 @@ config NVRAM
 	  To compile this driver as a module, choose M here: the
 	  module will be called nvram.
 
-config RAW_DRIVER
-	tristate "RAW driver (/dev/raw/rawN)"
-	depends on BLOCK
-	help
-	  The raw driver permits block devices to be bound to /dev/raw/rawN.
-	  Once bound, I/O against /dev/raw/rawN uses efficient zero-copy I/O.
-	  See the raw(8) manpage for more details.
-
-	  Applications should preferably open the device (eg /dev/hda1)
-	  with the O_DIRECT flag.
-
-config MAX_RAW_DEVS
-	int "Maximum number of RAW devices to support (1-65536)"
-	depends on RAW_DRIVER
-	range 1 65536
-	default "256"
-	help
-	  The maximum number of RAW devices that are supported.
-	  Default is 256. Increase this number in case you need lots of
-	  raw devices.
-
 config DEVPORT
 	bool "/dev/port character device"
 	depends on ISA || PCI
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index c7e4fc733a37..264eb398fdd4 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -8,7 +8,6 @@ obj-$(CONFIG_TTY_PRINTK)	+= ttyprintk.o
 obj-y				+= misc.o
 obj-$(CONFIG_ATARI_DSP56K)	+= dsp56k.o
 obj-$(CONFIG_VIRTIO_CONSOLE)	+= virtio_console.o
-obj-$(CONFIG_RAW_DRIVER)	+= raw.o
 obj-$(CONFIG_MSPEC)		+= mspec.o
 obj-$(CONFIG_UV_MMTIMER)	+= uv_mmtimer.o
 obj-$(CONFIG_IBM_BSR)		+= bsr.o
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 15dc54fa1d47..1c596b5cdb27 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -16,7 +16,6 @@
 #include <linux/mman.h>
 #include <linux/random.h>
 #include <linux/init.h>
-#include <linux/raw.h>
 #include <linux/tty.h>
 #include <linux/capability.h>
 #include <linux/ptrace.h>
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
deleted file mode 100644
index 5d52a1f4738c..000000000000
--- a/drivers/char/raw.c
+++ /dev/null
@@ -1,362 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/drivers/char/raw.c
- *
- * Front-end raw character devices.  These can be bound to any block
- * devices to provide genuine Unix raw character device semantics.
- *
- * We reserve minor number 0 for a control interface.  ioctl()s on this
- * device are used to bind the other minor numbers to block devices.
- */
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/major.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/module.h>
-#include <linux/raw.h>
-#include <linux/capability.h>
-#include <linux/uio.h>
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/mutex.h>
-#include <linux/gfp.h>
-#include <linux/compat.h>
-#include <linux/vmalloc.h>
-
-#include <linux/uaccess.h>
-
-struct raw_device_data {
-	dev_t binding;
-	struct block_device *bdev;
-	int inuse;
-};
-
-static struct class *raw_class;
-static struct raw_device_data *raw_devices;
-static DEFINE_MUTEX(raw_mutex);
-static const struct file_operations raw_ctl_fops; /* forward declaration */
-
-static int max_raw_minors = CONFIG_MAX_RAW_DEVS;
-
-module_param(max_raw_minors, int, 0);
-MODULE_PARM_DESC(max_raw_minors, "Maximum number of raw devices (1-65536)");
-
-/*
- * Open/close code for raw IO.
- *
- * We just rewrite the i_mapping for the /dev/raw/rawN file descriptor to
- * point at the blockdev's address_space and set the file handle to use
- * O_DIRECT.
- *
- * Set the device's soft blocksize to the minimum possible.  This gives the
- * finest possible alignment and has no adverse impact on performance.
- */
-static int raw_open(struct inode *inode, struct file *filp)
-{
-	const int minor = iminor(inode);
-	struct block_device *bdev;
-	int err;
-
-	if (minor == 0) {	/* It is the control device */
-		filp->f_op = &raw_ctl_fops;
-		return 0;
-	}
-
-	pr_warn_ratelimited(
-		"process %s (pid %d) is using the deprecated raw device\n"
-		"support will be removed in Linux 5.14.\n",
-		current->comm, current->pid);
-
-	mutex_lock(&raw_mutex);
-
-	/*
-	 * All we need to do on open is check that the device is bound.
-	 */
-	err = -ENODEV;
-	if (!raw_devices[minor].binding)
-		goto out;
-	bdev = blkdev_get_by_dev(raw_devices[minor].binding,
-				 filp->f_mode | FMODE_EXCL, raw_open);
-	if (IS_ERR(bdev)) {
-		err = PTR_ERR(bdev);
-		goto out;
-	}
-	err = set_blocksize(bdev, bdev_logical_block_size(bdev));
-	if (err)
-		goto out1;
-	filp->f_flags |= O_DIRECT;
-	filp->f_mapping = bdev->bd_inode->i_mapping;
-	if (++raw_devices[minor].inuse == 1)
-		file_inode(filp)->i_mapping =
-			bdev->bd_inode->i_mapping;
-	filp->private_data = bdev;
-	raw_devices[minor].bdev = bdev;
-	mutex_unlock(&raw_mutex);
-	return 0;
-
-out1:
-	blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
-out:
-	mutex_unlock(&raw_mutex);
-	return err;
-}
-
-/*
- * When the final fd which refers to this character-special node is closed, we
- * make its ->mapping point back at its own i_data.
- */
-static int raw_release(struct inode *inode, struct file *filp)
-{
-	const int minor= iminor(inode);
-	struct block_device *bdev;
-
-	mutex_lock(&raw_mutex);
-	bdev = raw_devices[minor].bdev;
-	if (--raw_devices[minor].inuse == 0)
-		/* Here  inode->i_mapping == bdev->bd_inode->i_mapping  */
-		inode->i_mapping = &inode->i_data;
-	mutex_unlock(&raw_mutex);
-
-	blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
-	return 0;
-}
-
-/*
- * Forward ioctls to the underlying block device.
- */
-static long
-raw_ioctl(struct file *filp, unsigned int command, unsigned long arg)
-{
-	struct block_device *bdev = filp->private_data;
-	return blkdev_ioctl(bdev, 0, command, arg);
-}
-
-static int bind_set(int number, u64 major, u64 minor)
-{
-	dev_t dev = MKDEV(major, minor);
-	dev_t raw = MKDEV(RAW_MAJOR, number);
-	struct raw_device_data *rawdev;
-	int err = 0;
-
-	if (number <= 0 || number >= max_raw_minors)
-		return -EINVAL;
-
-	if (MAJOR(dev) != major || MINOR(dev) != minor)
-		return -EINVAL;
-
-	rawdev = &raw_devices[number];
-
-	/*
-	 * This is like making block devices, so demand the
-	 * same capability
-	 */
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	/*
-	 * For now, we don't need to check that the underlying
-	 * block device is present or not: we can do that when
-	 * the raw device is opened.  Just check that the
-	 * major/minor numbers make sense.
-	 */
-
-	if (MAJOR(dev) == 0 && dev != 0)
-		return -EINVAL;
-
-	mutex_lock(&raw_mutex);
-	if (rawdev->inuse) {
-		mutex_unlock(&raw_mutex);
-		return -EBUSY;
-	}
-	if (rawdev->binding)
-		module_put(THIS_MODULE);
-
-	rawdev->binding = dev;
-	if (!dev) {
-		/* unbind */
-		device_destroy(raw_class, raw);
-	} else {
-		__module_get(THIS_MODULE);
-		device_destroy(raw_class, raw);
-		device_create(raw_class, NULL, raw, NULL, "raw%d", number);
-	}
-	mutex_unlock(&raw_mutex);
-	return err;
-}
-
-static int bind_get(int number, dev_t *dev)
-{
-	if (number <= 0 || number >= max_raw_minors)
-		return -EINVAL;
-	*dev = raw_devices[number].binding;
-	return 0;
-}
-
-/*
- * Deal with ioctls against the raw-device control interface, to bind
- * and unbind other raw devices.
- */
-static long raw_ctl_ioctl(struct file *filp, unsigned int command,
-			  unsigned long arg)
-{
-	struct raw_config_request rq;
-	dev_t dev;
-	int err;
-
-	switch (command) {
-	case RAW_SETBIND:
-		if (copy_from_user(&rq, (void __user *) arg, sizeof(rq)))
-			return -EFAULT;
-
-		return bind_set(rq.raw_minor, rq.block_major, rq.block_minor);
-
-	case RAW_GETBIND:
-		if (copy_from_user(&rq, (void __user *) arg, sizeof(rq)))
-			return -EFAULT;
-
-		err = bind_get(rq.raw_minor, &dev);
-		if (err)
-			return err;
-
-		rq.block_major = MAJOR(dev);
-		rq.block_minor = MINOR(dev);
-
-		if (copy_to_user((void __user *)arg, &rq, sizeof(rq)))
-			return -EFAULT;
-
-		return 0;
-	}
-
-	return -EINVAL;
-}
-
-#ifdef CONFIG_COMPAT
-struct raw32_config_request {
-	compat_int_t	raw_minor;
-	compat_u64	block_major;
-	compat_u64	block_minor;
-};
-
-static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd,
-				unsigned long arg)
-{
-	struct raw32_config_request __user *user_req = compat_ptr(arg);
-	struct raw32_config_request rq;
-	dev_t dev;
-	int err = 0;
-
-	switch (cmd) {
-	case RAW_SETBIND:
-		if (copy_from_user(&rq, user_req, sizeof(rq)))
-			return -EFAULT;
-
-		return bind_set(rq.raw_minor, rq.block_major, rq.block_minor);
-
-	case RAW_GETBIND:
-		if (copy_from_user(&rq, user_req, sizeof(rq)))
-			return -EFAULT;
-
-		err = bind_get(rq.raw_minor, &dev);
-		if (err)
-			return err;
-
-		rq.block_major = MAJOR(dev);
-		rq.block_minor = MINOR(dev);
-
-		if (copy_to_user(user_req, &rq, sizeof(rq)))
-			return -EFAULT;
-
-		return 0;
-	}
-
-	return -EINVAL;
-}
-#endif
-
-static const struct file_operations raw_fops = {
-	.read_iter	= blkdev_read_iter,
-	.write_iter	= blkdev_write_iter,
-	.fsync		= blkdev_fsync,
-	.open		= raw_open,
-	.release	= raw_release,
-	.unlocked_ioctl = raw_ioctl,
-	.llseek		= default_llseek,
-	.owner		= THIS_MODULE,
-};
-
-static const struct file_operations raw_ctl_fops = {
-	.unlocked_ioctl = raw_ctl_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= raw_ctl_compat_ioctl,
-#endif
-	.open		= raw_open,
-	.owner		= THIS_MODULE,
-	.llseek		= noop_llseek,
-};
-
-static struct cdev raw_cdev;
-
-static char *raw_devnode(struct device *dev, umode_t *mode)
-{
-	return kasprintf(GFP_KERNEL, "raw/%s", dev_name(dev));
-}
-
-static int __init raw_init(void)
-{
-	dev_t dev = MKDEV(RAW_MAJOR, 0);
-	int ret;
-
-	if (max_raw_minors < 1 || max_raw_minors > 65536) {
-		pr_warn("raw: invalid max_raw_minors (must be between 1 and 65536), using %d\n",
-			CONFIG_MAX_RAW_DEVS);
-		max_raw_minors = CONFIG_MAX_RAW_DEVS;
-	}
-
-	raw_devices = vzalloc(array_size(max_raw_minors,
-					 sizeof(struct raw_device_data)));
-	if (!raw_devices) {
-		printk(KERN_ERR "Not enough memory for raw device structures\n");
-		ret = -ENOMEM;
-		goto error;
-	}
-
-	ret = register_chrdev_region(dev, max_raw_minors, "raw");
-	if (ret)
-		goto error;
-
-	cdev_init(&raw_cdev, &raw_fops);
-	ret = cdev_add(&raw_cdev, dev, max_raw_minors);
-	if (ret)
-		goto error_region;
-	raw_class = class_create(THIS_MODULE, "raw");
-	if (IS_ERR(raw_class)) {
-		printk(KERN_ERR "Error creating raw class.\n");
-		cdev_del(&raw_cdev);
-		ret = PTR_ERR(raw_class);
-		goto error_region;
-	}
-	raw_class->devnode = raw_devnode;
-	device_create(raw_class, NULL, MKDEV(RAW_MAJOR, 0), NULL, "rawctl");
-
-	return 0;
-
-error_region:
-	unregister_chrdev_region(dev, max_raw_minors);
-error:
-	vfree(raw_devices);
-	return ret;
-}
-
-static void __exit raw_exit(void)
-{
-	device_destroy(raw_class, MKDEV(RAW_MAJOR, 0));
-	class_destroy(raw_class);
-	cdev_del(&raw_cdev);
-	unregister_chrdev_region(MKDEV(RAW_MAJOR, 0), max_raw_minors);
-}
-
-module_init(raw_init);
-module_exit(raw_exit);
-MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6cc4d4cfe0c2..15c448eb8226 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1669,7 +1669,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
  * Does not take i_mutex for the write and thus is not for general purpose
  * use.
  */
-ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *bd_inode = bdev_file_inode(file);
@@ -1707,9 +1707,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	blk_finish_plug(&plug);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(blkdev_write_iter);
 
-ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *bd_inode = bdev_file_inode(file);
@@ -1731,7 +1730,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(blkdev_read_iter);
 
 /*
  * Try to release a page associated with block device when the system
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3c88fdb9b2a..8652ed7cdce8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3242,11 +3242,8 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
 			    struct iov_iter *iter);
 
 /* fs/block_dev.c */
-extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
-extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from);
 extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
 			int datasync);
-extern void block_sync_page(struct page *page);
 
 /* fs/splice.c */
 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
diff --git a/include/uapi/linux/raw.h b/include/uapi/linux/raw.h
deleted file mode 100644
index 47874919d0b9..000000000000
--- a/include/uapi/linux/raw.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_RAW_H
-#define __LINUX_RAW_H
-
-#include <linux/types.h>
-
-#define RAW_SETBIND	_IO( 0xac, 0 )
-#define RAW_GETBIND	_IO( 0xac, 1 )
-
-struct raw_config_request 
-{
-	int	raw_minor;
-	__u64	block_major;
-	__u64	block_minor;
-};
-
-#endif /* __LINUX_RAW_H */
-- 
cgit v1.2.3


From 9fb9b1690f0ba6b2c9ced91facc1fc44f5a0d5c1 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Fri, 4 Jun 2021 12:52:29 +0100
Subject: ASoC: codecs: wcd934x: add mbhc support

WCD934x has Multi Button Headset Control hardware to support Headset insertion,
type detection, 8 headset buttons detection, Over Current detection and Impedence
measurements.

This patch adds support for this feature via common mbhc layer.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20210604115230.23259-4-srinivas.kandagatla@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/wcd934x/registers.h |  57 +++
 sound/soc/codecs/Kconfig              |   1 +
 sound/soc/codecs/wcd934x.c            | 884 +++++++++++++++++++++++++++++++++-
 3 files changed, 927 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/wcd934x/registers.h b/include/linux/mfd/wcd934x/registers.h
index bb8d2e276668..76a943c83c63 100644
--- a/include/linux/mfd/wcd934x/registers.h
+++ b/include/linux/mfd/wcd934x/registers.h
@@ -18,6 +18,8 @@
 #define WCD934X_EFUSE_SENSE_STATE_DEF				0x10
 #define WCD934X_EFUSE_SENSE_EN_MASK				BIT(0)
 #define WCD934X_EFUSE_SENSE_ENABLE				BIT(0)
+#define WCD934X_CHIP_TIER_CTRL_EFUSE_VAL_OUT1			0x002a
+#define WCD934X_CHIP_TIER_CTRL_EFUSE_VAL_OUT2			0x002b
 #define WCD934X_CHIP_TIER_CTRL_EFUSE_VAL_OUT14			0x0037
 #define WCD934X_CHIP_TIER_CTRL_EFUSE_VAL_OUT15			0x0038
 #define WCD934X_CHIP_TIER_CTRL_EFUSE_STATUS			0x0039
@@ -103,21 +105,58 @@
 #define WCD934X_ANA_AMIC3					0x0610
 #define WCD934X_ANA_AMIC4					0x0611
 #define WCD934X_ANA_MBHC_MECH					0x0614
+#define WCD934X_MBHC_L_DET_EN_MASK				BIT(7)
+#define WCD934X_MBHC_L_DET_EN					BIT(7)
+#define WCD934X_MBHC_GND_DET_EN_MASK				BIT(6)
+#define WCD934X_MBHC_MECH_DETECT_TYPE_MASK			BIT(5)
+#define WCD934X_MBHC_MECH_DETECT_TYPE_INS			1
+#define WCD934X_MBHC_HPHL_PLUG_TYPE_MASK			BIT(4)
+#define WCD934X_MBHC_HPHL_PLUG_TYPE_NO				1
+#define WCD934X_MBHC_GND_PLUG_TYPE_MASK				BIT(3)
+#define WCD934X_MBHC_GND_PLUG_TYPE_NO				1
+#define WCD934X_MBHC_HSL_PULLUP_COMP_EN				BIT(2)
+#define WCD934X_MBHC_HSG_PULLUP_COMP_EN				BIT(1)
+#define WCD934X_MBHC_HPHL_100K_TO_GND_EN			BIT(0)
 #define WCD934X_ANA_MBHC_ELECT					0x0615
+#define WCD934X_ANA_MBHC_BIAS_EN_MASK				BIT(0)
+#define WCD934X_ANA_MBHC_BIAS_EN				BIT(0)
 #define WCD934X_ANA_MBHC_ZDET					0x0616
 #define WCD934X_ANA_MBHC_RESULT_1				0x0617
 #define WCD934X_ANA_MBHC_RESULT_2				0x0618
 #define WCD934X_ANA_MBHC_RESULT_3				0x0619
+#define WCD934X_ANA_MBHC_BTN0					0x061a
+#define WCD934X_VTH_MASK					GENMASK(7, 2)
+#define WCD934X_ANA_MBHC_BTN1					0x061b
+#define WCD934X_ANA_MBHC_BTN2					0x061c
+#define WCD934X_ANA_MBHC_BTN3					0x061d
+#define WCD934X_ANA_MBHC_BTN4					0x061e
+#define WCD934X_ANA_MBHC_BTN5					0x061f
+#define WCD934X_ANA_MBHC_BTN6					0x0620
+#define WCD934X_ANA_MBHC_BTN7					0x0621
+#define WCD934X_MBHC_BTN_VTH_MASK				GENMASK(7, 2)
 #define WCD934X_ANA_MICB1					0x0622
 #define WCD934X_MICB_VAL_MASK					GENMASK(5, 0)
 #define WCD934X_ANA_MICB_EN_MASK				GENMASK(7, 6)
+#define WCD934X_MICB_DISABLE					0
+#define WCD934X_MICB_ENABLE					1
+#define WCD934X_MICB_PULL_UP					2
+#define WCD934X_MICB_PULL_DOWN					3
 #define WCD934X_ANA_MICB_PULL_UP				0x80
 #define WCD934X_ANA_MICB_ENABLE					0x40
 #define WCD934X_ANA_MICB_DISABLE				0x0
 #define WCD934X_ANA_MICB2					0x0623
+#define WCD934X_ANA_MICB2_ENABLE				BIT(6)
+#define WCD934X_ANA_MICB2_ENABLE_MASK				GENMASK(7, 6)
+#define WCD934X_ANA_MICB2_VOUT_MASK				GENMASK(5, 0)
+#define WCD934X_ANA_MICB2_RAMP					0x0624
+#define WCD934X_RAMP_EN_MASK					BIT(7)
+#define WCD934X_RAMP_SHIFT_CTRL_MASK				GENMASK(4, 2)
 #define WCD934X_ANA_MICB3					0x0625
 #define WCD934X_ANA_MICB4					0x0626
 #define WCD934X_BIAS_VBG_FINE_ADJ				0x0629
+#define WCD934X_MBHC_CTL_CLK					0x0656
+#define WCD934X_MBHC_CTL_BCS					0x065a
+#define WCD934X_MBHC_STATUS_SPARE_1				0x065b
 #define WCD934X_MICB1_TEST_CTL_1				0x066b
 #define WCD934X_MICB1_TEST_CTL_2				0x066c
 #define WCD934X_MICB2_TEST_CTL_1				0x066e
@@ -141,7 +180,11 @@
 #define WCD934X_HPH_CNP_WG_CTL					0x06cc
 #define WCD934X_HPH_GM3_BOOST_EN_MASK				BIT(7)
 #define WCD934X_HPH_GM3_BOOST_ENABLE				BIT(7)
+#define WCD934X_HPH_CNP_WG_TIME					0x06cd
 #define WCD934X_HPH_OCP_CTL					0x06ce
+#define WCD934X_HPH_PA_CTL2					0x06d2
+#define WCD934X_HPHPA_GND_R_MASK				BIT(6)
+#define WCD934X_HPHPA_GND_L_MASK				BIT(4)
 #define WCD934X_HPH_L_EN					0x06d3
 #define WCD934X_HPH_GAIN_SRC_SEL_MASK				BIT(5)
 #define WCD934X_HPH_GAIN_SRC_SEL_COMPANDER			0
@@ -152,6 +195,8 @@
 #define WCD934X_HPH_OCP_DET_MASK				BIT(0)
 #define WCD934X_HPH_OCP_DET_ENABLE				BIT(0)
 #define WCD934X_HPH_OCP_DET_DISABLE				0
+#define WCD934X_HPH_R_ATEST					0x06d8
+#define WCD934X_HPHPA_GND_OVR_MASK				BIT(1)
 #define WCD934X_DIFF_LO_LO2_COMPANDER				0x06ea
 #define WCD934X_DIFF_LO_LO1_COMPANDER				0x06eb
 #define WCD934X_CLK_SYS_MCLK_PRG				0x0711
@@ -172,7 +217,19 @@
 #define WCD934X_SIDO_NEW_VOUT_D_FREQ2				0x071e
 #define WCD934X_SIDO_RIPPLE_FREQ_EN_MASK			BIT(0)
 #define WCD934X_SIDO_RIPPLE_FREQ_ENABLE				BIT(0)
+#define WCD934X_MBHC_NEW_CTL_1					0x0720
+#define WCD934X_MBHC_CTL_RCO_EN_MASK				BIT(7)
+#define WCD935X_MBHC_CTL_RCO_EN					BIT(7)
 #define WCD934X_MBHC_NEW_CTL_2					0x0721
+#define WCD934X_M_RTH_CTL_MASK					GENMASK(3, 2)
+#define WCD934X_MBHC_NEW_PLUG_DETECT_CTL			0x0722
+#define WCD934X_HSDET_PULLUP_C_MASK				GENMASK(7, 6)
+#define WCD934X_MBHC_NEW_ZDET_ANA_CTL				0x0723
+#define WCD934X_ZDET_RANGE_CTL_MASK				GENMASK(3, 0)
+#define WCD934X_ZDET_MAXV_CTL_MASK				GENMASK(6, 4)
+#define WCD934X_MBHC_NEW_ZDET_RAMP_CTL				0x0724
+#define WCD934X_MBHC_NEW_FSM_STATUS				0x0725
+#define WCD934X_MBHC_NEW_ADC_RESULT				0x0726
 #define WCD934X_TX_NEW_AMIC_4_5_SEL				0x0727
 #define WCD934X_HPH_NEW_INT_RDAC_HD2_CTL_L			0x0733
 #define WCD934X_HPH_NEW_INT_RDAC_OVERRIDE_CTL			0x0735
diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig
index 71a1ef9063cc..8252f7e9ef71 100644
--- a/sound/soc/codecs/Kconfig
+++ b/sound/soc/codecs/Kconfig
@@ -1542,6 +1542,7 @@ config SND_SOC_WCD_MBHC
 config SND_SOC_WCD934X
 	tristate "WCD9340/WCD9341 Codec"
 	depends on COMMON_CLK
+	select SND_SOC_WCD_MBHC
 	depends on MFD_WCD934X
 	help
 	  The WCD9340/9341 is a audio codec IC Integrated in
diff --git a/sound/soc/codecs/wcd934x.c b/sound/soc/codecs/wcd934x.c
index 046874ef490e..16fd1ab62609 100644
--- a/sound/soc/codecs/wcd934x.c
+++ b/sound/soc/codecs/wcd934x.c
@@ -21,6 +21,7 @@
 #include <sound/soc-dapm.h>
 #include <sound/tlv.h>
 #include "wcd-clsh-v2.h"
+#include "wcd-mbhc-v2.h"
 
 #define WCD934X_RATES_MASK (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000 |\
 			    SNDRV_PCM_RATE_32000 | SNDRV_PCM_RATE_48000 |\
@@ -131,6 +132,24 @@
 	} \
 }
 
+/* Z value defined in milliohm */
+#define WCD934X_ZDET_VAL_32             32000
+#define WCD934X_ZDET_VAL_400            400000
+#define WCD934X_ZDET_VAL_1200           1200000
+#define WCD934X_ZDET_VAL_100K           100000000
+/* Z floating defined in ohms */
+#define WCD934X_ZDET_FLOATING_IMPEDANCE 0x0FFFFFFE
+
+#define WCD934X_ZDET_NUM_MEASUREMENTS   900
+#define WCD934X_MBHC_GET_C1(c)          ((c & 0xC000) >> 14)
+#define WCD934X_MBHC_GET_X1(x)          (x & 0x3FFF)
+/* Z value compared in milliOhm */
+#define WCD934X_MBHC_IS_SECOND_RAMP_REQUIRED(z) ((z > 400000) || (z < 32000))
+#define WCD934X_MBHC_ZDET_CONST         (86 * 16384)
+#define WCD934X_MBHC_MOISTURE_RREF      R_24_KOHM
+#define WCD934X_MBHC_MAX_BUTTONS	(8)
+#define WCD_MBHC_HS_V_MAX           1600
+
 #define WCD934X_INTERPOLATOR_PATH(id)			\
 	{"RX INT" #id "_1 MIX1 INP0", "RX0", "SLIM RX0"},	\
 	{"RX INT" #id "_1 MIX1 INP0", "RX1", "SLIM RX1"},	\
@@ -287,12 +306,7 @@
 	{"AIF3_CAP Mixer", "SLIM TX" #id, "SLIM TX" #id },	\
 	{"SLIM TX" #id, NULL, "CDC_IF TX" #id " MUX"}
 
-enum {
-	MIC_BIAS_1 = 1,
-	MIC_BIAS_2,
-	MIC_BIAS_3,
-	MIC_BIAS_4
-};
+#define WCD934X_MAX_MICBIAS	MIC_BIAS_4
 
 enum {
 	SIDO_SOURCE_INTERNAL,
@@ -486,6 +500,15 @@ static struct interp_sample_rate sr_val_tbl[] = {
 	{352800, 0xC},
 };
 
+struct wcd934x_mbhc_zdet_param {
+	u16 ldo_ctl;
+	u16 noff;
+	u16 nshift;
+	u16 btn5;
+	u16 btn6;
+	u16 btn7;
+};
+
 struct wcd_slim_codec_dai_data {
 	struct list_head slim_ch_list;
 	struct slim_stream_config sconfig;
@@ -541,6 +564,18 @@ struct wcd934x_codec {
 	int comp_enabled[COMPANDER_MAX];
 	int sysclk_users;
 	struct mutex sysclk_mutex;
+	/* mbhc module */
+	struct wcd_mbhc *mbhc;
+	struct wcd_mbhc_config mbhc_cfg;
+	struct wcd_mbhc_intr intr_ids;
+	bool mbhc_started;
+	struct mutex micb_lock;
+	u32 micb_ref[WCD934X_MAX_MICBIAS];
+	u32 pullup_ref[WCD934X_MAX_MICBIAS];
+	u32 micb1_mv;
+	u32 micb2_mv;
+	u32 micb3_mv;
+	u32 micb4_mv;
 };
 
 #define to_wcd934x_codec(_hw) container_of(_hw, struct wcd934x_codec, hw)
@@ -1183,6 +1218,57 @@ static const struct soc_enum cdc_if_tx13_mux_enum =
 	SOC_ENUM_SINGLE(WCD934X_DATA_HUB_SB_TX13_INP_CFG, 0,
 			ARRAY_SIZE(cdc_if_tx13_mux_text), cdc_if_tx13_mux_text);
 
+static struct wcd_mbhc_field wcd_mbhc_fields[WCD_MBHC_REG_FUNC_MAX] = {
+	WCD_MBHC_FIELD(WCD_MBHC_L_DET_EN, WCD934X_ANA_MBHC_MECH, 0x80),
+	WCD_MBHC_FIELD(WCD_MBHC_GND_DET_EN, WCD934X_ANA_MBHC_MECH, 0x40),
+	WCD_MBHC_FIELD(WCD_MBHC_MECH_DETECTION_TYPE, WCD934X_ANA_MBHC_MECH, 0x20),
+	WCD_MBHC_FIELD(WCD_MBHC_MIC_CLAMP_CTL, WCD934X_MBHC_NEW_PLUG_DETECT_CTL, 0x30),
+	WCD_MBHC_FIELD(WCD_MBHC_ELECT_DETECTION_TYPE, WCD934X_ANA_MBHC_ELECT, 0x08),
+	WCD_MBHC_FIELD(WCD_MBHC_HS_L_DET_PULL_UP_CTRL, WCD934X_MBHC_NEW_PLUG_DETECT_CTL, 0xC0),
+	WCD_MBHC_FIELD(WCD_MBHC_HS_L_DET_PULL_UP_COMP_CTRL, WCD934X_ANA_MBHC_MECH, 0x04),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHL_PLUG_TYPE, WCD934X_ANA_MBHC_MECH, 0x10),
+	WCD_MBHC_FIELD(WCD_MBHC_GND_PLUG_TYPE, WCD934X_ANA_MBHC_MECH, 0x08),
+	WCD_MBHC_FIELD(WCD_MBHC_SW_HPH_LP_100K_TO_GND, WCD934X_ANA_MBHC_MECH, 0x01),
+	WCD_MBHC_FIELD(WCD_MBHC_ELECT_SCHMT_ISRC, WCD934X_ANA_MBHC_ELECT, 0x06),
+	WCD_MBHC_FIELD(WCD_MBHC_FSM_EN, WCD934X_ANA_MBHC_ELECT, 0x80),
+	WCD_MBHC_FIELD(WCD_MBHC_INSREM_DBNC, WCD934X_MBHC_NEW_PLUG_DETECT_CTL, 0x0F),
+	WCD_MBHC_FIELD(WCD_MBHC_BTN_DBNC, WCD934X_MBHC_NEW_CTL_1, 0x03),
+	WCD_MBHC_FIELD(WCD_MBHC_HS_VREF, WCD934X_MBHC_NEW_CTL_2, 0x03),
+	WCD_MBHC_FIELD(WCD_MBHC_HS_COMP_RESULT, WCD934X_ANA_MBHC_RESULT_3, 0x08),
+	WCD_MBHC_FIELD(WCD_MBHC_IN2P_CLAMP_STATE, WCD934X_ANA_MBHC_RESULT_3, 0x10),
+	WCD_MBHC_FIELD(WCD_MBHC_MIC_SCHMT_RESULT, WCD934X_ANA_MBHC_RESULT_3, 0x20),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHL_SCHMT_RESULT, WCD934X_ANA_MBHC_RESULT_3, 0x80),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHR_SCHMT_RESULT, WCD934X_ANA_MBHC_RESULT_3, 0x40),
+	WCD_MBHC_FIELD(WCD_MBHC_OCP_FSM_EN, WCD934X_HPH_OCP_CTL, 0x10),
+	WCD_MBHC_FIELD(WCD_MBHC_BTN_RESULT, WCD934X_ANA_MBHC_RESULT_3, 0x07),
+	WCD_MBHC_FIELD(WCD_MBHC_BTN_ISRC_CTL, WCD934X_ANA_MBHC_ELECT, 0x70),
+	WCD_MBHC_FIELD(WCD_MBHC_ELECT_RESULT, WCD934X_ANA_MBHC_RESULT_3, 0xFF),
+	WCD_MBHC_FIELD(WCD_MBHC_MICB_CTRL, WCD934X_ANA_MICB2, 0xC0),
+	WCD_MBHC_FIELD(WCD_MBHC_HPH_CNP_WG_TIME, WCD934X_HPH_CNP_WG_TIME, 0xFF),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHR_PA_EN, WCD934X_ANA_HPH, 0x40),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHL_PA_EN, WCD934X_ANA_HPH, 0x80),
+	WCD_MBHC_FIELD(WCD_MBHC_HPH_PA_EN, WCD934X_ANA_HPH, 0xC0),
+	WCD_MBHC_FIELD(WCD_MBHC_SWCH_LEVEL_REMOVE, WCD934X_ANA_MBHC_RESULT_3, 0x10),
+	WCD_MBHC_FIELD(WCD_MBHC_ANC_DET_EN, WCD934X_MBHC_CTL_BCS, 0x02),
+	WCD_MBHC_FIELD(WCD_MBHC_FSM_STATUS, WCD934X_MBHC_STATUS_SPARE_1, 0x01),
+	WCD_MBHC_FIELD(WCD_MBHC_MUX_CTL, WCD934X_MBHC_NEW_CTL_2, 0x70),
+	WCD_MBHC_FIELD(WCD_MBHC_MOISTURE_STATUS, WCD934X_MBHC_NEW_FSM_STATUS, 0x20),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHR_GND, WCD934X_HPH_PA_CTL2, 0x40),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHL_GND, WCD934X_HPH_PA_CTL2, 0x10),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHL_OCP_DET_EN, WCD934X_HPH_L_TEST, 0x01),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHR_OCP_DET_EN, WCD934X_HPH_R_TEST, 0x01),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHL_OCP_STATUS, WCD934X_INTR_PIN1_STATUS0, 0x04),
+	WCD_MBHC_FIELD(WCD_MBHC_HPHR_OCP_STATUS, WCD934X_INTR_PIN1_STATUS0, 0x08),
+	WCD_MBHC_FIELD(WCD_MBHC_ADC_EN, WCD934X_MBHC_NEW_CTL_1, 0x08),
+	WCD_MBHC_FIELD(WCD_MBHC_ADC_COMPLETE, WCD934X_MBHC_NEW_FSM_STATUS, 0x40),
+	WCD_MBHC_FIELD(WCD_MBHC_ADC_TIMEOUT, WCD934X_MBHC_NEW_FSM_STATUS, 0x80),
+	WCD_MBHC_FIELD(WCD_MBHC_ADC_RESULT, WCD934X_MBHC_NEW_ADC_RESULT, 0xFF),
+	WCD_MBHC_FIELD(WCD_MBHC_MICB2_VOUT, WCD934X_ANA_MICB2, 0x3F),
+	WCD_MBHC_FIELD(WCD_MBHC_ADC_MODE, WCD934X_MBHC_NEW_CTL_1, 0x10),
+	WCD_MBHC_FIELD(WCD_MBHC_DETECTION_DONE, WCD934X_MBHC_NEW_CTL_1, 0x04),
+	WCD_MBHC_FIELD(WCD_MBHC_ELECT_ISRC_EN, WCD934X_ANA_MBHC_ZDET, 0x02),
+};
+
 static int wcd934x_set_sido_input_src(struct wcd934x_codec *wcd, int sido_src)
 {
 	if (sido_src == wcd->sido_input_src)
@@ -2127,7 +2213,8 @@ static struct clk *wcd934x_register_mclk_output(struct wcd934x_codec *wcd)
 	return NULL;
 }
 
-static int wcd934x_get_micbias_val(struct device *dev, const char *micbias)
+static int wcd934x_get_micbias_val(struct device *dev, const char *micbias,
+				   u32 *micb_mv)
 {
 	int mv;
 
@@ -2145,6 +2232,8 @@ static int wcd934x_get_micbias_val(struct device *dev, const char *micbias)
 		mv = WCD934X_DEF_MICBIAS_MV;
 	}
 
+	*micb_mv = mv;
+
 	return (mv - 1000) / 50;
 }
 
@@ -2155,13 +2244,17 @@ static int wcd934x_init_dmic(struct snd_soc_component *comp)
 	u32 def_dmic_rate, dmic_clk_drv;
 
 	vout_ctl_1 = wcd934x_get_micbias_val(comp->dev,
-					     "qcom,micbias1-microvolt");
+					     "qcom,micbias1-microvolt",
+					     &wcd->micb1_mv);
 	vout_ctl_2 = wcd934x_get_micbias_val(comp->dev,
-					     "qcom,micbias2-microvolt");
+					     "qcom,micbias2-microvolt",
+					     &wcd->micb2_mv);
 	vout_ctl_3 = wcd934x_get_micbias_val(comp->dev,
-					     "qcom,micbias3-microvolt");
+					     "qcom,micbias3-microvolt",
+					     &wcd->micb3_mv);
 	vout_ctl_4 = wcd934x_get_micbias_val(comp->dev,
-					     "qcom,micbias4-microvolt");
+					     "qcom,micbias4-microvolt",
+					     &wcd->micb4_mv);
 
 	snd_soc_component_update_bits(comp, WCD934X_ANA_MICB1,
 				      WCD934X_MICB_VAL_MASK, vout_ctl_1);
@@ -2287,6 +2380,695 @@ static irqreturn_t wcd934x_slim_irq_handler(int irq, void *data)
 	return ret;
 }
 
+static void wcd934x_mbhc_clk_setup(struct snd_soc_component *component,
+				   bool enable)
+{
+	snd_soc_component_write_field(component, WCD934X_MBHC_NEW_CTL_1,
+				      WCD934X_MBHC_CTL_RCO_EN_MASK, enable);
+}
+
+static void wcd934x_mbhc_mbhc_bias_control(struct snd_soc_component *component,
+					   bool enable)
+{
+	snd_soc_component_write_field(component, WCD934X_ANA_MBHC_ELECT,
+				      WCD934X_ANA_MBHC_BIAS_EN, enable);
+}
+
+static void wcd934x_mbhc_program_btn_thr(struct snd_soc_component *component,
+					 int *btn_low, int *btn_high,
+					 int num_btn, bool is_micbias)
+{
+	int i, vth;
+
+	if (num_btn > WCD_MBHC_DEF_BUTTONS) {
+		dev_err(component->dev, "%s: invalid number of buttons: %d\n",
+			__func__, num_btn);
+		return;
+	}
+
+	for (i = 0; i < num_btn; i++) {
+		vth = ((btn_high[i] * 2) / 25) & 0x3F;
+		snd_soc_component_write_field(component, WCD934X_ANA_MBHC_BTN0 + i,
+					   WCD934X_MBHC_BTN_VTH_MASK, vth);
+	}
+}
+
+static bool wcd934x_mbhc_micb_en_status(struct snd_soc_component *component, int micb_num)
+{
+	u8 val;
+
+	if (micb_num == MIC_BIAS_2) {
+		val = snd_soc_component_read_field(component, WCD934X_ANA_MICB2,
+						   WCD934X_ANA_MICB2_ENABLE_MASK);
+		if (val == WCD934X_MICB_ENABLE)
+			return true;
+	}
+	return false;
+}
+
+static void wcd934x_mbhc_hph_l_pull_up_control(struct snd_soc_component *component,
+					       enum mbhc_hs_pullup_iref pull_up_cur)
+{
+	/* Default pull up current to 2uA */
+	if (pull_up_cur < I_OFF || pull_up_cur > I_3P0_UA ||
+	    pull_up_cur == I_DEFAULT)
+		pull_up_cur = I_2P0_UA;
+
+
+	snd_soc_component_write_field(component, WCD934X_MBHC_NEW_PLUG_DETECT_CTL,
+				      WCD934X_HSDET_PULLUP_C_MASK, pull_up_cur);
+}
+
+static int wcd934x_micbias_control(struct snd_soc_component *component,
+			    int micb_num, int req, bool is_dapm)
+{
+	struct wcd934x_codec *wcd934x = snd_soc_component_get_drvdata(component);
+	int micb_index = micb_num - 1;
+	u16 micb_reg;
+
+	switch (micb_num) {
+	case MIC_BIAS_1:
+		micb_reg = WCD934X_ANA_MICB1;
+		break;
+	case MIC_BIAS_2:
+		micb_reg = WCD934X_ANA_MICB2;
+		break;
+	case MIC_BIAS_3:
+		micb_reg = WCD934X_ANA_MICB3;
+		break;
+	case MIC_BIAS_4:
+		micb_reg = WCD934X_ANA_MICB4;
+		break;
+	default:
+		dev_err(component->dev, "%s: Invalid micbias number: %d\n",
+			__func__, micb_num);
+		return -EINVAL;
+	};
+	mutex_lock(&wcd934x->micb_lock);
+
+	switch (req) {
+	case MICB_PULLUP_ENABLE:
+		wcd934x->pullup_ref[micb_index]++;
+		if ((wcd934x->pullup_ref[micb_index] == 1) &&
+		    (wcd934x->micb_ref[micb_index] == 0))
+			snd_soc_component_write_field(component, micb_reg,
+						      WCD934X_ANA_MICB_EN_MASK,
+						      WCD934X_MICB_PULL_UP);
+		break;
+	case MICB_PULLUP_DISABLE:
+		if (wcd934x->pullup_ref[micb_index] > 0)
+			wcd934x->pullup_ref[micb_index]--;
+
+		if ((wcd934x->pullup_ref[micb_index] == 0) &&
+		    (wcd934x->micb_ref[micb_index] == 0))
+			snd_soc_component_write_field(component, micb_reg,
+						      WCD934X_ANA_MICB_EN_MASK, 0);
+		break;
+	case MICB_ENABLE:
+		wcd934x->micb_ref[micb_index]++;
+		if (wcd934x->micb_ref[micb_index] == 1) {
+			snd_soc_component_write_field(component, micb_reg,
+						      WCD934X_ANA_MICB_EN_MASK,
+						      WCD934X_MICB_ENABLE);
+			if (micb_num  == MIC_BIAS_2)
+				wcd_mbhc_event_notify(wcd934x->mbhc,
+						      WCD_EVENT_POST_MICBIAS_2_ON);
+		}
+
+		if (micb_num  == MIC_BIAS_2 && is_dapm)
+			wcd_mbhc_event_notify(wcd934x->mbhc,
+					      WCD_EVENT_POST_DAPM_MICBIAS_2_ON);
+		break;
+	case MICB_DISABLE:
+		if (wcd934x->micb_ref[micb_index] > 0)
+			wcd934x->micb_ref[micb_index]--;
+
+		if ((wcd934x->micb_ref[micb_index] == 0) &&
+		    (wcd934x->pullup_ref[micb_index] > 0))
+			snd_soc_component_write_field(component, micb_reg,
+						      WCD934X_ANA_MICB_EN_MASK,
+						      WCD934X_MICB_PULL_UP);
+		else if ((wcd934x->micb_ref[micb_index] == 0) &&
+			 (wcd934x->pullup_ref[micb_index] == 0)) {
+			if (micb_num  == MIC_BIAS_2)
+				wcd_mbhc_event_notify(wcd934x->mbhc,
+						      WCD_EVENT_PRE_MICBIAS_2_OFF);
+
+			snd_soc_component_write_field(component, micb_reg,
+						      WCD934X_ANA_MICB_EN_MASK, 0);
+			if (micb_num  == MIC_BIAS_2)
+				wcd_mbhc_event_notify(wcd934x->mbhc,
+						      WCD_EVENT_POST_MICBIAS_2_OFF);
+		}
+		if (is_dapm && micb_num  == MIC_BIAS_2)
+			wcd_mbhc_event_notify(wcd934x->mbhc,
+					      WCD_EVENT_POST_DAPM_MICBIAS_2_OFF);
+		break;
+	};
+
+	mutex_unlock(&wcd934x->micb_lock);
+
+	return 0;
+}
+
+static int wcd934x_mbhc_request_micbias(struct snd_soc_component *component,
+					int micb_num, int req)
+{
+	struct wcd934x_codec *wcd = dev_get_drvdata(component->dev);
+	int ret;
+
+	if (req == MICB_ENABLE)
+		__wcd934x_cdc_mclk_enable(wcd, true);
+
+	ret = wcd934x_micbias_control(component, micb_num, req, false);
+
+	if (req == MICB_DISABLE)
+		__wcd934x_cdc_mclk_enable(wcd, false);
+
+	return ret;
+}
+
+static void wcd934x_mbhc_micb_ramp_control(struct snd_soc_component *component,
+					   bool enable)
+{
+	if (enable) {
+		snd_soc_component_write_field(component, WCD934X_ANA_MICB2_RAMP,
+				    WCD934X_RAMP_SHIFT_CTRL_MASK, 0x3);
+		snd_soc_component_write_field(component, WCD934X_ANA_MICB2_RAMP,
+				    WCD934X_RAMP_EN_MASK, 1);
+	} else {
+		snd_soc_component_write_field(component, WCD934X_ANA_MICB2_RAMP,
+				    WCD934X_RAMP_EN_MASK, 0);
+		snd_soc_component_write_field(component, WCD934X_ANA_MICB2_RAMP,
+				    WCD934X_RAMP_SHIFT_CTRL_MASK, 0);
+	}
+}
+
+static int wcd934x_get_micb_vout_ctl_val(u32 micb_mv)
+{
+	/* min micbias voltage is 1V and maximum is 2.85V */
+	if (micb_mv < 1000 || micb_mv > 2850)
+		return -EINVAL;
+
+	return (micb_mv - 1000) / 50;
+}
+
+static int wcd934x_mbhc_micb_adjust_voltage(struct snd_soc_component *component,
+					    int req_volt, int micb_num)
+{
+	struct wcd934x_codec *wcd934x = snd_soc_component_get_drvdata(component);
+	int cur_vout_ctl, req_vout_ctl, micb_reg, micb_en, ret = 0;
+
+	switch (micb_num) {
+	case MIC_BIAS_1:
+		micb_reg = WCD934X_ANA_MICB1;
+		break;
+	case MIC_BIAS_2:
+		micb_reg = WCD934X_ANA_MICB2;
+		break;
+	case MIC_BIAS_3:
+		micb_reg = WCD934X_ANA_MICB3;
+		break;
+	case MIC_BIAS_4:
+		micb_reg = WCD934X_ANA_MICB4;
+		break;
+	default:
+		return -EINVAL;
+	}
+	mutex_lock(&wcd934x->micb_lock);
+	/*
+	 * If requested micbias voltage is same as current micbias
+	 * voltage, then just return. Otherwise, adjust voltage as
+	 * per requested value. If micbias is already enabled, then
+	 * to avoid slow micbias ramp-up or down enable pull-up
+	 * momentarily, change the micbias value and then re-enable
+	 * micbias.
+	 */
+	micb_en = snd_soc_component_read_field(component, micb_reg,
+						WCD934X_ANA_MICB_EN_MASK);
+	cur_vout_ctl = snd_soc_component_read_field(component, micb_reg,
+						    WCD934X_MICB_VAL_MASK);
+
+	req_vout_ctl = wcd934x_get_micb_vout_ctl_val(req_volt);
+	if (req_vout_ctl < 0) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	if (cur_vout_ctl == req_vout_ctl) {
+		ret = 0;
+		goto exit;
+	}
+
+	if (micb_en == WCD934X_MICB_ENABLE)
+		snd_soc_component_write_field(component, micb_reg,
+					      WCD934X_ANA_MICB_EN_MASK,
+					      WCD934X_MICB_PULL_UP);
+
+	snd_soc_component_write_field(component, micb_reg,
+				      WCD934X_MICB_VAL_MASK,
+				      req_vout_ctl);
+
+	if (micb_en == WCD934X_MICB_ENABLE) {
+		snd_soc_component_write_field(component, micb_reg,
+					      WCD934X_ANA_MICB_EN_MASK,
+					      WCD934X_MICB_ENABLE);
+		/*
+		 * Add 2ms delay as per HW requirement after enabling
+		 * micbias
+		 */
+		usleep_range(2000, 2100);
+	}
+exit:
+	mutex_unlock(&wcd934x->micb_lock);
+	return ret;
+}
+
+static int wcd934x_mbhc_micb_ctrl_threshold_mic(struct snd_soc_component *component,
+						int micb_num, bool req_en)
+{
+	struct wcd934x_codec *wcd934x = snd_soc_component_get_drvdata(component);
+	int rc, micb_mv;
+
+	if (micb_num != MIC_BIAS_2)
+		return -EINVAL;
+	/*
+	 * If device tree micbias level is already above the minimum
+	 * voltage needed to detect threshold microphone, then do
+	 * not change the micbias, just return.
+	 */
+	if (wcd934x->micb2_mv >= WCD_MBHC_THR_HS_MICB_MV)
+		return 0;
+
+	micb_mv = req_en ? WCD_MBHC_THR_HS_MICB_MV : wcd934x->micb2_mv;
+
+	rc = wcd934x_mbhc_micb_adjust_voltage(component, micb_mv, MIC_BIAS_2);
+
+	return rc;
+}
+
+static inline void wcd934x_mbhc_get_result_params(struct wcd934x_codec *wcd934x,
+						s16 *d1_a, u16 noff,
+						int32_t *zdet)
+{
+	int i;
+	int val, val1;
+	s16 c1;
+	s32 x1, d1;
+	int32_t denom;
+	int minCode_param[] = {
+			3277, 1639, 820, 410, 205, 103, 52, 26
+	};
+
+	regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_ZDET, 0x20, 0x20);
+	for (i = 0; i < WCD934X_ZDET_NUM_MEASUREMENTS; i++) {
+		regmap_read(wcd934x->regmap, WCD934X_ANA_MBHC_RESULT_2, &val);
+		if (val & 0x80)
+			break;
+	}
+	val = val << 0x8;
+	regmap_read(wcd934x->regmap, WCD934X_ANA_MBHC_RESULT_1, &val1);
+	val |= val1;
+	regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_ZDET, 0x20, 0x00);
+	x1 = WCD934X_MBHC_GET_X1(val);
+	c1 = WCD934X_MBHC_GET_C1(val);
+	/* If ramp is not complete, give additional 5ms */
+	if ((c1 < 2) && x1)
+		usleep_range(5000, 5050);
+
+	if (!c1 || !x1) {
+		dev_err(wcd934x->dev, "%s: Impedance detect ramp error, c1=%d, x1=0x%x\n",
+			__func__, c1, x1);
+		goto ramp_down;
+	}
+	d1 = d1_a[c1];
+	denom = (x1 * d1) - (1 << (14 - noff));
+	if (denom > 0)
+		*zdet = (WCD934X_MBHC_ZDET_CONST * 1000) / denom;
+	else if (x1 < minCode_param[noff])
+		*zdet = WCD934X_ZDET_FLOATING_IMPEDANCE;
+
+	dev_info(wcd934x->dev, "%s: d1=%d, c1=%d, x1=0x%x, z_val=%d(milliOhm)\n",
+		__func__, d1, c1, x1, *zdet);
+ramp_down:
+	i = 0;
+
+	while (x1) {
+		regmap_read(wcd934x->regmap, WCD934X_ANA_MBHC_RESULT_1, &val);
+		regmap_read(wcd934x->regmap, WCD934X_ANA_MBHC_RESULT_2, &val1);
+		val = val << 0x08;
+		val |= val1;
+		x1 = WCD934X_MBHC_GET_X1(val);
+		i++;
+		if (i == WCD934X_ZDET_NUM_MEASUREMENTS)
+			break;
+	}
+}
+
+static void wcd934x_mbhc_zdet_ramp(struct snd_soc_component *component,
+				 struct wcd934x_mbhc_zdet_param *zdet_param,
+				 int32_t *zl, int32_t *zr, s16 *d1_a)
+{
+	struct wcd934x_codec *wcd934x = dev_get_drvdata(component->dev);
+	int32_t zdet = 0;
+
+	snd_soc_component_write_field(component, WCD934X_MBHC_NEW_ZDET_ANA_CTL,
+				WCD934X_ZDET_MAXV_CTL_MASK, zdet_param->ldo_ctl);
+	snd_soc_component_update_bits(component, WCD934X_ANA_MBHC_BTN5,
+				    WCD934X_VTH_MASK, zdet_param->btn5);
+	snd_soc_component_update_bits(component, WCD934X_ANA_MBHC_BTN6,
+				      WCD934X_VTH_MASK, zdet_param->btn6);
+	snd_soc_component_update_bits(component, WCD934X_ANA_MBHC_BTN7,
+				     WCD934X_VTH_MASK, zdet_param->btn7);
+	snd_soc_component_write_field(component, WCD934X_MBHC_NEW_ZDET_ANA_CTL,
+				WCD934X_ZDET_RANGE_CTL_MASK, zdet_param->noff);
+	snd_soc_component_update_bits(component, WCD934X_MBHC_NEW_ZDET_RAMP_CTL,
+				0x0F, zdet_param->nshift);
+
+	if (!zl)
+		goto z_right;
+	/* Start impedance measurement for HPH_L */
+	regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_ZDET, 0x80, 0x80);
+	wcd934x_mbhc_get_result_params(wcd934x, d1_a, zdet_param->noff, &zdet);
+	regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_ZDET, 0x80, 0x00);
+
+	*zl = zdet;
+
+z_right:
+	if (!zr)
+		return;
+	/* Start impedance measurement for HPH_R */
+	regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_ZDET, 0x40, 0x40);
+	wcd934x_mbhc_get_result_params(wcd934x, d1_a, zdet_param->noff, &zdet);
+	regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_ZDET, 0x40, 0x00);
+
+	*zr = zdet;
+}
+
+static inline void wcd934x_wcd_mbhc_qfuse_cal(struct snd_soc_component *component,
+					      int32_t *z_val, int flag_l_r)
+{
+	s16 q1;
+	int q1_cal;
+
+	if (*z_val < (WCD934X_ZDET_VAL_400/1000))
+		q1 = snd_soc_component_read(component,
+			WCD934X_CHIP_TIER_CTRL_EFUSE_VAL_OUT1 + (2 * flag_l_r));
+	else
+		q1 = snd_soc_component_read(component,
+			WCD934X_CHIP_TIER_CTRL_EFUSE_VAL_OUT2 + (2 * flag_l_r));
+	if (q1 & 0x80)
+		q1_cal = (10000 - ((q1 & 0x7F) * 25));
+	else
+		q1_cal = (10000 + (q1 * 25));
+	if (q1_cal > 0)
+		*z_val = ((*z_val) * 10000) / q1_cal;
+}
+
+static void wcd934x_wcd_mbhc_calc_impedance(struct snd_soc_component *component,
+					    uint32_t *zl, uint32_t *zr)
+{
+	struct wcd934x_codec *wcd934x = dev_get_drvdata(component->dev);
+	s16 reg0, reg1, reg2, reg3, reg4;
+	int32_t z1L, z1R, z1Ls;
+	int zMono, z_diff1, z_diff2;
+	bool is_fsm_disable = false;
+	struct wcd934x_mbhc_zdet_param zdet_param[] = {
+		{4, 0, 4, 0x08, 0x14, 0x18}, /* < 32ohm */
+		{2, 0, 3, 0x18, 0x7C, 0x90}, /* 32ohm < Z < 400ohm */
+		{1, 4, 5, 0x18, 0x7C, 0x90}, /* 400ohm < Z < 1200ohm */
+		{1, 6, 7, 0x18, 0x7C, 0x90}, /* >1200ohm */
+	};
+	struct wcd934x_mbhc_zdet_param *zdet_param_ptr = NULL;
+	s16 d1_a[][4] = {
+		{0, 30, 90, 30},
+		{0, 30, 30, 5},
+		{0, 30, 30, 5},
+		{0, 30, 30, 5},
+	};
+	s16 *d1 = NULL;
+
+	reg0 = snd_soc_component_read(component, WCD934X_ANA_MBHC_BTN5);
+	reg1 = snd_soc_component_read(component, WCD934X_ANA_MBHC_BTN6);
+	reg2 = snd_soc_component_read(component, WCD934X_ANA_MBHC_BTN7);
+	reg3 = snd_soc_component_read(component, WCD934X_MBHC_CTL_CLK);
+	reg4 = snd_soc_component_read(component, WCD934X_MBHC_NEW_ZDET_ANA_CTL);
+
+	if (snd_soc_component_read(component, WCD934X_ANA_MBHC_ELECT) & 0x80) {
+		is_fsm_disable = true;
+		regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_ELECT, 0x80, 0x00);
+	}
+
+	/* For NO-jack, disable L_DET_EN before Z-det measurements */
+	if (wcd934x->mbhc_cfg.hphl_swh)
+		regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_MECH, 0x80, 0x00);
+
+	/* Turn off 100k pull down on HPHL */
+	regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_MECH, 0x01, 0x00);
+
+	/* First get impedance on Left */
+	d1 = d1_a[1];
+	zdet_param_ptr = &zdet_param[1];
+	wcd934x_mbhc_zdet_ramp(component, zdet_param_ptr, &z1L, NULL, d1);
+
+	if (!WCD934X_MBHC_IS_SECOND_RAMP_REQUIRED(z1L))
+		goto left_ch_impedance;
+
+	/* Second ramp for left ch */
+	if (z1L < WCD934X_ZDET_VAL_32) {
+		zdet_param_ptr = &zdet_param[0];
+		d1 = d1_a[0];
+	} else if ((z1L > WCD934X_ZDET_VAL_400) &&
+		  (z1L <= WCD934X_ZDET_VAL_1200)) {
+		zdet_param_ptr = &zdet_param[2];
+		d1 = d1_a[2];
+	} else if (z1L > WCD934X_ZDET_VAL_1200) {
+		zdet_param_ptr = &zdet_param[3];
+		d1 = d1_a[3];
+	}
+	wcd934x_mbhc_zdet_ramp(component, zdet_param_ptr, &z1L, NULL, d1);
+
+left_ch_impedance:
+	if ((z1L == WCD934X_ZDET_FLOATING_IMPEDANCE) ||
+		(z1L > WCD934X_ZDET_VAL_100K)) {
+		*zl = WCD934X_ZDET_FLOATING_IMPEDANCE;
+		zdet_param_ptr = &zdet_param[1];
+		d1 = d1_a[1];
+	} else {
+		*zl = z1L/1000;
+		wcd934x_wcd_mbhc_qfuse_cal(component, zl, 0);
+	}
+	dev_info(component->dev, "%s: impedance on HPH_L = %d(ohms)\n",
+		__func__, *zl);
+
+	/* Start of right impedance ramp and calculation */
+	wcd934x_mbhc_zdet_ramp(component, zdet_param_ptr, NULL, &z1R, d1);
+	if (WCD934X_MBHC_IS_SECOND_RAMP_REQUIRED(z1R)) {
+		if (((z1R > WCD934X_ZDET_VAL_1200) &&
+			(zdet_param_ptr->noff == 0x6)) ||
+			((*zl) != WCD934X_ZDET_FLOATING_IMPEDANCE))
+			goto right_ch_impedance;
+		/* Second ramp for right ch */
+		if (z1R < WCD934X_ZDET_VAL_32) {
+			zdet_param_ptr = &zdet_param[0];
+			d1 = d1_a[0];
+		} else if ((z1R > WCD934X_ZDET_VAL_400) &&
+			(z1R <= WCD934X_ZDET_VAL_1200)) {
+			zdet_param_ptr = &zdet_param[2];
+			d1 = d1_a[2];
+		} else if (z1R > WCD934X_ZDET_VAL_1200) {
+			zdet_param_ptr = &zdet_param[3];
+			d1 = d1_a[3];
+		}
+		wcd934x_mbhc_zdet_ramp(component, zdet_param_ptr, NULL, &z1R, d1);
+	}
+right_ch_impedance:
+	if ((z1R == WCD934X_ZDET_FLOATING_IMPEDANCE) ||
+		(z1R > WCD934X_ZDET_VAL_100K)) {
+		*zr = WCD934X_ZDET_FLOATING_IMPEDANCE;
+	} else {
+		*zr = z1R/1000;
+		wcd934x_wcd_mbhc_qfuse_cal(component, zr, 1);
+	}
+	dev_err(component->dev, "%s: impedance on HPH_R = %d(ohms)\n",
+		__func__, *zr);
+
+	/* Mono/stereo detection */
+	if ((*zl == WCD934X_ZDET_FLOATING_IMPEDANCE) &&
+		(*zr == WCD934X_ZDET_FLOATING_IMPEDANCE)) {
+		dev_dbg(component->dev,
+			"%s: plug type is invalid or extension cable\n",
+			__func__);
+		goto zdet_complete;
+	}
+	if ((*zl == WCD934X_ZDET_FLOATING_IMPEDANCE) ||
+	    (*zr == WCD934X_ZDET_FLOATING_IMPEDANCE) ||
+	    ((*zl < WCD_MONO_HS_MIN_THR) && (*zr > WCD_MONO_HS_MIN_THR)) ||
+	    ((*zl > WCD_MONO_HS_MIN_THR) && (*zr < WCD_MONO_HS_MIN_THR))) {
+		dev_dbg(component->dev,
+			"%s: Mono plug type with one ch floating or shorted to GND\n",
+			__func__);
+		wcd_mbhc_set_hph_type(wcd934x->mbhc, WCD_MBHC_HPH_MONO);
+		goto zdet_complete;
+	}
+	snd_soc_component_write_field(component, WCD934X_HPH_R_ATEST,
+				      WCD934X_HPHPA_GND_OVR_MASK, 1);
+	snd_soc_component_write_field(component, WCD934X_HPH_PA_CTL2,
+				      WCD934X_HPHPA_GND_R_MASK, 1);
+	if (*zl < (WCD934X_ZDET_VAL_32/1000))
+		wcd934x_mbhc_zdet_ramp(component, &zdet_param[0], &z1Ls, NULL, d1);
+	else
+		wcd934x_mbhc_zdet_ramp(component, &zdet_param[1], &z1Ls, NULL, d1);
+	snd_soc_component_write_field(component, WCD934X_HPH_PA_CTL2,
+				      WCD934X_HPHPA_GND_R_MASK, 0);
+	snd_soc_component_write_field(component, WCD934X_HPH_R_ATEST,
+				      WCD934X_HPHPA_GND_OVR_MASK, 0);
+	z1Ls /= 1000;
+	wcd934x_wcd_mbhc_qfuse_cal(component, &z1Ls, 0);
+	/* Parallel of left Z and 9 ohm pull down resistor */
+	zMono = ((*zl) * 9) / ((*zl) + 9);
+	z_diff1 = (z1Ls > zMono) ? (z1Ls - zMono) : (zMono - z1Ls);
+	z_diff2 = ((*zl) > z1Ls) ? ((*zl) - z1Ls) : (z1Ls - (*zl));
+	if ((z_diff1 * (*zl + z1Ls)) > (z_diff2 * (z1Ls + zMono))) {
+		dev_err(component->dev, "%s: stereo plug type detected\n",
+			__func__);
+		wcd_mbhc_set_hph_type(wcd934x->mbhc, WCD_MBHC_HPH_STEREO);
+	} else {
+		dev_err(component->dev, "%s: MONO plug type detected\n",
+			__func__);
+		wcd_mbhc_set_hph_type(wcd934x->mbhc, WCD_MBHC_HPH_MONO);
+	}
+
+zdet_complete:
+	snd_soc_component_write(component, WCD934X_ANA_MBHC_BTN5, reg0);
+	snd_soc_component_write(component, WCD934X_ANA_MBHC_BTN6, reg1);
+	snd_soc_component_write(component, WCD934X_ANA_MBHC_BTN7, reg2);
+	/* Turn on 100k pull down on HPHL */
+	regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_MECH, 0x01, 0x01);
+
+	/* For NO-jack, re-enable L_DET_EN after Z-det measurements */
+	if (wcd934x->mbhc_cfg.hphl_swh)
+		regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_MECH, 0x80, 0x80);
+
+	snd_soc_component_write(component, WCD934X_MBHC_NEW_ZDET_ANA_CTL, reg4);
+	snd_soc_component_write(component, WCD934X_MBHC_CTL_CLK, reg3);
+	if (is_fsm_disable)
+		regmap_update_bits(wcd934x->regmap, WCD934X_ANA_MBHC_ELECT, 0x80, 0x80);
+}
+
+static void wcd934x_mbhc_gnd_det_ctrl(struct snd_soc_component *component,
+			bool enable)
+{
+	if (enable) {
+		snd_soc_component_write_field(component, WCD934X_ANA_MBHC_MECH,
+					      WCD934X_MBHC_HSG_PULLUP_COMP_EN, 1);
+		snd_soc_component_write_field(component, WCD934X_ANA_MBHC_MECH,
+					      WCD934X_MBHC_GND_DET_EN_MASK, 1);
+	} else {
+		snd_soc_component_write_field(component, WCD934X_ANA_MBHC_MECH,
+					      WCD934X_MBHC_GND_DET_EN_MASK, 0);
+		snd_soc_component_write_field(component, WCD934X_ANA_MBHC_MECH,
+					      WCD934X_MBHC_HSG_PULLUP_COMP_EN, 0);
+	}
+}
+
+static void wcd934x_mbhc_hph_pull_down_ctrl(struct snd_soc_component *component,
+					  bool enable)
+{
+	snd_soc_component_write_field(component, WCD934X_HPH_PA_CTL2,
+				      WCD934X_HPHPA_GND_R_MASK, enable);
+	snd_soc_component_write_field(component, WCD934X_HPH_PA_CTL2,
+				      WCD934X_HPHPA_GND_L_MASK, enable);
+}
+
+static const struct wcd_mbhc_cb mbhc_cb = {
+	.clk_setup = wcd934x_mbhc_clk_setup,
+	.mbhc_bias = wcd934x_mbhc_mbhc_bias_control,
+	.set_btn_thr = wcd934x_mbhc_program_btn_thr,
+	.micbias_enable_status = wcd934x_mbhc_micb_en_status,
+	.hph_pull_up_control = wcd934x_mbhc_hph_l_pull_up_control,
+	.mbhc_micbias_control = wcd934x_mbhc_request_micbias,
+	.mbhc_micb_ramp_control = wcd934x_mbhc_micb_ramp_control,
+	.mbhc_micb_ctrl_thr_mic = wcd934x_mbhc_micb_ctrl_threshold_mic,
+	.compute_impedance = wcd934x_wcd_mbhc_calc_impedance,
+	.mbhc_gnd_det_ctrl = wcd934x_mbhc_gnd_det_ctrl,
+	.hph_pull_down_ctrl = wcd934x_mbhc_hph_pull_down_ctrl,
+};
+
+static int wcd934x_get_hph_type(struct snd_kcontrol *kcontrol,
+			      struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol);
+	struct wcd934x_codec *wcd = snd_soc_component_get_drvdata(component);
+
+	ucontrol->value.integer.value[0] = wcd_mbhc_get_hph_type(wcd->mbhc);
+
+	return 0;
+}
+
+static int wcd934x_hph_impedance_get(struct snd_kcontrol *kcontrol,
+				   struct snd_ctl_elem_value *ucontrol)
+{
+	uint32_t zl, zr;
+	bool hphr;
+	struct soc_mixer_control *mc;
+	struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol);
+	struct wcd934x_codec *wcd = snd_soc_component_get_drvdata(component);
+
+	mc = (struct soc_mixer_control *)(kcontrol->private_value);
+	hphr = mc->shift;
+	wcd_mbhc_get_impedance(wcd->mbhc, &zl, &zr);
+	dev_dbg(component->dev, "%s: zl=%u(ohms), zr=%u(ohms)\n", __func__, zl, zr);
+	ucontrol->value.integer.value[0] = hphr ? zr : zl;
+
+	return 0;
+}
+static const struct snd_kcontrol_new hph_type_detect_controls[] = {
+	SOC_SINGLE_EXT("HPH Type", 0, 0, UINT_MAX, 0,
+		       wcd934x_get_hph_type, NULL),
+};
+
+static const struct snd_kcontrol_new impedance_detect_controls[] = {
+	SOC_SINGLE_EXT("HPHL Impedance", 0, 0, UINT_MAX, 0,
+		       wcd934x_hph_impedance_get, NULL),
+	SOC_SINGLE_EXT("HPHR Impedance", 0, 1, UINT_MAX, 0,
+		       wcd934x_hph_impedance_get, NULL),
+};
+
+static int wcd934x_mbhc_init(struct snd_soc_component *component)
+{
+	struct wcd934x_ddata *data = dev_get_drvdata(component->dev->parent);
+	struct wcd934x_codec *wcd = snd_soc_component_get_drvdata(component);
+	struct wcd_mbhc_intr *intr_ids = &wcd->intr_ids;
+
+	intr_ids->mbhc_sw_intr = regmap_irq_get_virq(data->irq_data,
+						     WCD934X_IRQ_MBHC_SW_DET);
+	intr_ids->mbhc_btn_press_intr = regmap_irq_get_virq(data->irq_data,
+							    WCD934X_IRQ_MBHC_BUTTON_PRESS_DET);
+	intr_ids->mbhc_btn_release_intr = regmap_irq_get_virq(data->irq_data,
+							      WCD934X_IRQ_MBHC_BUTTON_RELEASE_DET);
+	intr_ids->mbhc_hs_ins_intr = regmap_irq_get_virq(data->irq_data,
+							 WCD934X_IRQ_MBHC_ELECT_INS_REM_LEG_DET);
+	intr_ids->mbhc_hs_rem_intr = regmap_irq_get_virq(data->irq_data,
+							 WCD934X_IRQ_MBHC_ELECT_INS_REM_DET);
+	intr_ids->hph_left_ocp = regmap_irq_get_virq(data->irq_data,
+						     WCD934X_IRQ_HPH_PA_OCPL_FAULT);
+	intr_ids->hph_right_ocp = regmap_irq_get_virq(data->irq_data,
+						      WCD934X_IRQ_HPH_PA_OCPR_FAULT);
+
+	wcd->mbhc = wcd_mbhc_init(component, &mbhc_cb, intr_ids, wcd_mbhc_fields, true);
+	if (IS_ERR(wcd->mbhc)) {
+		wcd->mbhc = NULL;
+		return -EINVAL;
+	}
+
+	snd_soc_add_component_controls(component, impedance_detect_controls,
+				       ARRAY_SIZE(impedance_detect_controls));
+	snd_soc_add_component_controls(component, hph_type_detect_controls,
+				       ARRAY_SIZE(hph_type_detect_controls));
+
+	return 0;
+}
 static int wcd934x_comp_probe(struct snd_soc_component *component)
 {
 	struct wcd934x_codec *wcd = dev_get_drvdata(component->dev);
@@ -2309,6 +3091,10 @@ static int wcd934x_comp_probe(struct snd_soc_component *component)
 		INIT_LIST_HEAD(&wcd->dai[i].slim_ch_list);
 
 	wcd934x_init_dmic(component);
+
+	if (wcd934x_mbhc_init(component))
+		dev_err(component->dev, "Failed to Initialize MBHC\n");
+
 	return 0;
 }
 
@@ -3756,6 +4542,7 @@ static int wcd934x_codec_enable_hphl_pa(struct snd_soc_dapm_widget *w,
 					int event)
 {
 	struct snd_soc_component *comp = snd_soc_dapm_to_component(w->dapm);
+	struct wcd934x_codec *wcd = snd_soc_component_get_drvdata(comp);
 
 	switch (event) {
 	case SND_SOC_DAPM_POST_PMU:
@@ -3788,6 +4575,7 @@ static int wcd934x_codec_enable_hphl_pa(struct snd_soc_dapm_widget *w,
 				WCD934X_CDC_RX_PGA_MUTE_EN_MASK, 0x00);
 		break;
 	case SND_SOC_DAPM_PRE_PMD:
+		wcd_mbhc_event_notify(wcd->mbhc, WCD_EVENT_POST_HPHL_PA_OFF);
 		/* Enable DSD Mute before PA disable */
 		snd_soc_component_update_bits(comp, WCD934X_HPH_L_TEST,
 					      WCD934X_HPH_OCP_DET_MASK,
@@ -3806,6 +4594,7 @@ static int wcd934x_codec_enable_hphl_pa(struct snd_soc_dapm_widget *w,
 		 * disabled, then 20ms delay is needed after PA disable.
 		 */
 		usleep_range(20000, 20100);
+		wcd_mbhc_event_notify(wcd->mbhc, WCD_EVENT_POST_HPHL_PA_OFF);
 		break;
 	}
 
@@ -3817,6 +4606,7 @@ static int wcd934x_codec_enable_hphr_pa(struct snd_soc_dapm_widget *w,
 					int event)
 {
 	struct snd_soc_component *comp = snd_soc_dapm_to_component(w->dapm);
+	struct wcd934x_codec *wcd = snd_soc_component_get_drvdata(comp);
 
 	switch (event) {
 	case SND_SOC_DAPM_POST_PMU:
@@ -3851,6 +4641,7 @@ static int wcd934x_codec_enable_hphr_pa(struct snd_soc_dapm_widget *w,
 					      WCD934X_CDC_RX_PGA_MUTE_DISABLE);
 		break;
 	case SND_SOC_DAPM_PRE_PMD:
+		wcd_mbhc_event_notify(wcd->mbhc, WCD_EVENT_PRE_HPHR_PA_OFF);
 		snd_soc_component_update_bits(comp, WCD934X_HPH_R_TEST,
 					      WCD934X_HPH_OCP_DET_MASK,
 					      WCD934X_HPH_OCP_DET_DISABLE);
@@ -3868,6 +4659,7 @@ static int wcd934x_codec_enable_hphr_pa(struct snd_soc_dapm_widget *w,
 		 * disabled, then 20ms delay is needed after PA disable.
 		 */
 		usleep_range(20000, 20100);
+		wcd_mbhc_event_notify(wcd->mbhc, WCD_EVENT_POST_HPHR_PA_OFF);
 		break;
 	}
 
@@ -4323,6 +5115,29 @@ static int wcd934x_codec_enable_adc(struct snd_soc_dapm_widget *w,
 	return 0;
 }
 
+static int wcd934x_codec_enable_micbias(struct snd_soc_dapm_widget *w,
+					struct snd_kcontrol *kcontrol,
+					int event)
+{
+	struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
+	int micb_num = w->shift;
+
+	switch (event) {
+	case SND_SOC_DAPM_PRE_PMU:
+		wcd934x_micbias_control(component, micb_num, MICB_ENABLE, true);
+		break;
+	case SND_SOC_DAPM_POST_PMU:
+		/* 1 msec delay as per HW requirement */
+		usleep_range(1000, 1100);
+		break;
+	case SND_SOC_DAPM_POST_PMD:
+		wcd934x_micbias_control(component, micb_num, MICB_DISABLE, true);
+		break;
+	};
+
+	return 0;
+}
+
 static const struct snd_soc_dapm_widget wcd934x_dapm_widgets[] = {
 	/* Analog Outputs */
 	SND_SOC_DAPM_OUTPUT("EAR"),
@@ -4778,13 +5593,17 @@ static const struct snd_soc_dapm_widget wcd934x_dapm_widgets[] = {
 			   wcd934x_codec_enable_adc, SND_SOC_DAPM_PRE_PMU),
 	SND_SOC_DAPM_ADC_E("ADC4", NULL, WCD934X_ANA_AMIC4, 7, 0,
 			   wcd934x_codec_enable_adc, SND_SOC_DAPM_PRE_PMU),
-	SND_SOC_DAPM_SUPPLY("MIC BIAS1", WCD934X_ANA_MICB1, 6, 0, NULL,
+	SND_SOC_DAPM_SUPPLY("MIC BIAS1", SND_SOC_NOPM, MIC_BIAS_1, 0,
+			    wcd934x_codec_enable_micbias, SND_SOC_DAPM_PRE_PMU |
 			    SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD),
-	SND_SOC_DAPM_SUPPLY("MIC BIAS2", WCD934X_ANA_MICB2, 6, 0, NULL,
+	SND_SOC_DAPM_SUPPLY("MIC BIAS2", SND_SOC_NOPM, MIC_BIAS_2, 0,
+			    wcd934x_codec_enable_micbias, SND_SOC_DAPM_PRE_PMU |
 			    SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD),
-	SND_SOC_DAPM_SUPPLY("MIC BIAS3", WCD934X_ANA_MICB3, 6, 0, NULL,
+	SND_SOC_DAPM_SUPPLY("MIC BIAS3", SND_SOC_NOPM, MIC_BIAS_3, 0,
+			    wcd934x_codec_enable_micbias, SND_SOC_DAPM_PRE_PMU |
 			    SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD),
-	SND_SOC_DAPM_SUPPLY("MIC BIAS4", WCD934X_ANA_MICB4, 6, 0, NULL,
+	SND_SOC_DAPM_SUPPLY("MIC BIAS4", SND_SOC_NOPM, MIC_BIAS_4, 0,
+			    wcd934x_codec_enable_micbias, SND_SOC_DAPM_PRE_PMU |
 			    SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_POST_PMD),
 
 	SND_SOC_DAPM_MUX("AMIC4_5 SEL", SND_SOC_NOPM, 0, 0, &tx_amic4_5),
@@ -4961,6 +5780,26 @@ static const struct snd_soc_dapm_route wcd934x_audio_map[] = {
 	{"SRC1", NULL, "IIR1"},
 };
 
+static int wcd934x_codec_set_jack(struct snd_soc_component *comp,
+				  struct snd_soc_jack *jack, void *data)
+{
+	struct wcd934x_codec *wcd = dev_get_drvdata(comp->dev);
+	int ret = 0;
+
+	if (!wcd->mbhc)
+		return -ENOTSUPP;
+
+	if (jack && !wcd->mbhc_started) {
+		ret = wcd_mbhc_start(wcd->mbhc, &wcd->mbhc_cfg, jack);
+		wcd->mbhc_started = true;
+	} else if (wcd->mbhc_started) {
+		wcd_mbhc_stop(wcd->mbhc);
+		wcd->mbhc_started = false;
+	}
+
+	return ret;
+}
+
 static const struct snd_soc_component_driver wcd934x_component_drv = {
 	.probe = wcd934x_comp_probe,
 	.remove = wcd934x_comp_remove,
@@ -4971,11 +5810,13 @@ static const struct snd_soc_component_driver wcd934x_component_drv = {
 	.num_dapm_widgets = ARRAY_SIZE(wcd934x_dapm_widgets),
 	.dapm_routes = wcd934x_audio_map,
 	.num_dapm_routes = ARRAY_SIZE(wcd934x_audio_map),
+	.set_jack = wcd934x_codec_set_jack,
 };
 
 static int wcd934x_codec_parse_data(struct wcd934x_codec *wcd)
 {
 	struct device *dev = &wcd->sdev->dev;
+	struct wcd_mbhc_config *cfg = &wcd->mbhc_cfg;
 	struct device_node *ifc_dev_np;
 
 	ifc_dev_np = of_parse_phandle(dev->of_node, "slim-ifc-dev", 0);
@@ -5001,6 +5842,18 @@ static int wcd934x_codec_parse_data(struct wcd934x_codec *wcd)
 	of_property_read_u32(dev->parent->of_node, "qcom,dmic-sample-rate",
 			     &wcd->dmic_sample_rate);
 
+	cfg->mbhc_micbias = MIC_BIAS_2;
+	cfg->anc_micbias = MIC_BIAS_2;
+	cfg->v_hs_max = WCD_MBHC_HS_V_MAX;
+	cfg->num_btn = WCD934X_MBHC_MAX_BUTTONS;
+	cfg->micb_mv = wcd->micb2_mv;
+	cfg->linein_th = 5000;
+	cfg->hs_thr = 1700;
+	cfg->hph_thr = 50;
+
+	wcd_dt_parse_mbhc_data(dev, cfg);
+
+
 	return 0;
 }
 
@@ -5020,6 +5873,7 @@ static int wcd934x_codec_probe(struct platform_device *pdev)
 	wcd->extclk = data->extclk;
 	wcd->sdev = to_slim_device(data->dev);
 	mutex_init(&wcd->sysclk_mutex);
+	mutex_init(&wcd->micb_lock);
 
 	ret = wcd934x_codec_parse_data(wcd);
 	if (ret) {
-- 
cgit v1.2.3


From 519d8ab17682da5f2fae5941d906d85b9fd3593a Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 19 May 2021 21:43:50 +0200
Subject: virtchnl: Add missing padding to virtchnl_proto_hdrs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On m68k (Coldfire M547x):

      CC      drivers/net/ethernet/intel/i40e/i40e_main.o
    In file included from drivers/net/ethernet/intel/i40e/i40e_prototype.h:9,
		     from drivers/net/ethernet/intel/i40e/i40e.h:41,
		     from drivers/net/ethernet/intel/i40e/i40e_main.c:12:
    include/linux/avf/virtchnl.h:153:36: warning: division by zero [-Wdiv-by-zero]
      153 |  { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) }
	  |                                    ^
    include/linux/avf/virtchnl.h:844:1: note: in expansion of macro ‘VIRTCHNL_CHECK_STRUCT_LEN’
      844 | VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs);
	  | ^~~~~~~~~~~~~~~~~~~~~~~~~
    include/linux/avf/virtchnl.h:844:33: error: enumerator value for ‘virtchnl_static_assert_virtchnl_proto_hdrs’ is not an integer constant
      844 | VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs);
	  |                                 ^~~~~~~~~~~~~~~~~~~

On m68k, integers are aligned on addresses that are multiples of two,
not four, bytes.  Hence the size of a structure containing integers may
not be divisible by 4.

Fix this by adding explicit padding.

Fixes: 1f7ea1cd6a374842 ("ice: Enable FDIR Configure for AVF")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 565deea6ffe8..8612f8fc86c1 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -830,6 +830,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_proto_hdr);
 
 struct virtchnl_proto_hdrs {
 	u8 tunnel_level;
+	u8 pad[3];
 	/**
 	 * specify where protocol header start from.
 	 * 0 - from the outer layer
-- 
cgit v1.2.3


From 912e887505a07123917e537b657859723ce5d472 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 26 May 2021 06:24:57 +0900
Subject: dm: Introduce dm_report_zones()

To simplify the implementation of the report_zones operation of a zoned
target, introduce the function dm_report_zones() to set a target
mapping start sector in struct dm_report_zones_args and call
blkdev_report_zones(). This new function is exported and the report
zones callback function dm_report_zones_cb() is not.

dm-linear, dm-flakey and dm-crypt are modified to use dm_report_zones().

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c         |  7 +++----
 drivers/md/dm-flakey.c        |  7 +++----
 drivers/md/dm-linear.c        |  7 +++----
 drivers/md/dm-zone.c          | 22 ++++++++++++++++++++--
 include/linux/device-mapper.h |  3 ++-
 5 files changed, 31 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b0ab080f2567..f410ceee51d7 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3138,11 +3138,10 @@ static int crypt_report_zones(struct dm_target *ti,
 		struct dm_report_zones_args *args, unsigned int nr_zones)
 {
 	struct crypt_config *cc = ti->private;
-	sector_t sector = cc->start + dm_target_offset(ti, args->next_sector);
 
-	args->start = cc->start;
-	return blkdev_report_zones(cc->dev->bdev, sector, nr_zones,
-				   dm_report_zones_cb, args);
+	return dm_report_zones(cc->dev->bdev, cc->start,
+			cc->start + dm_target_offset(ti, args->next_sector),
+			args, nr_zones);
 }
 #else
 #define crypt_report_zones NULL
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b7fee9936f05..5877220c01ed 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -463,11 +463,10 @@ static int flakey_report_zones(struct dm_target *ti,
 		struct dm_report_zones_args *args, unsigned int nr_zones)
 {
 	struct flakey_c *fc = ti->private;
-	sector_t sector = flakey_map_sector(ti, args->next_sector);
 
-	args->start = fc->start;
-	return blkdev_report_zones(fc->dev->bdev, sector, nr_zones,
-				   dm_report_zones_cb, args);
+	return dm_report_zones(fc->dev->bdev, fc->start,
+			       flakey_map_sector(ti, args->next_sector),
+			       args, nr_zones);
 }
 #else
 #define flakey_report_zones NULL
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 92db0f5e7f28..c91f1e2e2f65 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -140,11 +140,10 @@ static int linear_report_zones(struct dm_target *ti,
 		struct dm_report_zones_args *args, unsigned int nr_zones)
 {
 	struct linear_c *lc = ti->private;
-	sector_t sector = linear_map_sector(ti, args->next_sector);
 
-	args->start = lc->start;
-	return blkdev_report_zones(lc->dev->bdev, sector, nr_zones,
-				   dm_report_zones_cb, args);
+	return dm_report_zones(lc->dev->bdev, lc->start,
+			       linear_map_sector(ti, args->next_sector),
+			       args, nr_zones);
 }
 #else
 #define linear_report_zones NULL
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 9a34d0f319fd..b42474043249 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -56,7 +56,8 @@ out:
 	return ret;
 }
 
-int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
+static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
+			      void *data)
 {
 	struct dm_report_zones_args *args = data;
 	sector_t sector_diff = args->tgt->begin - args->start;
@@ -84,7 +85,24 @@ int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
 	args->next_sector = zone->start + zone->len;
 	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
 }
-EXPORT_SYMBOL_GPL(dm_report_zones_cb);
+
+/*
+ * Helper for drivers of zoned targets to implement struct target_type
+ * report_zones operation.
+ */
+int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
+		    struct dm_report_zones_args *args, unsigned int nr_zones)
+{
+	/*
+	 * Set the target mapping start sector first so that
+	 * dm_report_zones_cb() can correctly remap zone information.
+	 */
+	args->start = start;
+
+	return blkdev_report_zones(bdev, sector, nr_zones,
+				   dm_report_zones_cb, args);
+}
+EXPORT_SYMBOL_GPL(dm_report_zones);
 
 void dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
 {
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ff700fb6ce1d..caea0a079d2d 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -478,7 +478,8 @@ struct dm_report_zones_args {
 	/* must be filled by ->report_zones before calling dm_report_zones_cb */
 	sector_t start;
 };
-int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data);
+int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
+		    struct dm_report_zones_args *args, unsigned int nr_zones);
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 /*
-- 
cgit v1.2.3


From d0ea6bde141df9311bc36e7b07ad37b449f2c4f5 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 26 May 2021 06:24:52 +0900
Subject: block: introduce bio zone helpers

Introduce the helper functions bio_zone_no() and bio_zone_is_seq().
Both are the BIO counterparts of the request helpers blk_rq_zone_no()
and blk_rq_zone_is_seq(), respectively returning the number of the
target zone of a bio and true if the BIO target zone is sequential.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 include/linux/blkdev.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f69c75bd6d27..2db0f376f5d9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1008,6 +1008,18 @@ static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
 /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);
 
+static inline unsigned int bio_zone_no(struct bio *bio)
+{
+	return blk_queue_zone_no(bdev_get_queue(bio->bi_bdev),
+				 bio->bi_iter.bi_sector);
+}
+
+static inline unsigned int bio_zone_is_seq(struct bio *bio)
+{
+	return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
+				     bio->bi_iter.bi_sector);
+}
+
 static inline unsigned int blk_rq_zone_no(struct request *rq)
 {
 	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
-- 
cgit v1.2.3


From 9ffbbb435d8f566a0924ce4b5dc7fc1bceb6dbf8 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 26 May 2021 06:24:53 +0900
Subject: block: introduce BIO_ZONE_WRITE_LOCKED bio flag

Introduce the BIO flag BIO_ZONE_WRITE_LOCKED to indicate that a BIO owns
the write lock of the zone it is targeting. This is the counterpart of
the struct request flag RQF_ZONE_WRITE_LOCKED.

This new BIO flag is reserved for now for zone write locking control
for device mapper targets exposing a zoned block device. Since in this
case, the lock flag must not be propagated to the struct request that
will be used to process the BIO, a BIO private flag is used rather than
changing the RQF_ZONE_WRITE_LOCKED request flag into a common REQ_XXX
flag that could be used for both BIO and request. This avoids conflicts
down the stack with the block IO scheduler zone write locking
(in mq-deadline).

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 include/linux/blk_types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index db026b6ec15a..e5cf12f102a2 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -304,6 +304,7 @@ enum {
 	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
 	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
 	BIO_REMAPPED,
+	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
 	BIO_FLAG_LAST
 };
 
-- 
cgit v1.2.3


From bb37d77239af25cde59693dbe3fac04dd17d7b29 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 26 May 2021 06:25:00 +0900
Subject: dm: introduce zone append emulation

For zoned targets that cannot support zone append operations, implement
an emulation using regular write operations. If the original BIO
submitted by the user is a zone append operation, change its clone into
a regular write operation directed at the target zone write pointer
position.

To do so, an array of write pointer offsets (write pointer position
relative to the start of a zone) is added to struct mapped_device. All
operations that modify a sequential zone write pointer (writes, zone
reset, zone finish and zone append) are intersepted in __map_bio() and
processed using the new functions dm_zone_map_bio().

Detection of the target ability to natively support zone append
operations is done from dm_table_set_restrictions() by calling the
function dm_set_zones_restrictions(). A target that does not support
zone append operation, either by explicitly declaring it using the new
struct dm_target field zone_append_not_supported, or because the device
table contains a non-zoned device, has its mapped device marked with the
new flag DMF_ZONE_APPEND_EMULATED. The helper function
dm_emulate_zone_append() is introduced to test a mapped device for this
new flag.

Atomicity of the zones write pointer tracking and updates is done using
a zone write locking mechanism based on a bitmap. This is similar to
the block layer method but based on BIOs rather than struct request.
A zone write lock is taken in dm_zone_map_bio() for any clone BIO with
an operation type that changes the BIO target zone write pointer
position. The zone write lock is released if the clone BIO is failed
before submission or when dm_zone_endio() is called when the clone BIO
completes.

The zone write lock bitmap of the mapped device, together with a bitmap
indicating zone types (conv_zones_bitmap) and the write pointer offset
array (zwp_offset) are allocated and initialized with a full device zone
report in dm_set_zones_restrictions() using the function
dm_revalidate_zones().

For failed operations that may have modified a zone write pointer, the
zone write pointer offset is marked as invalid in dm_zone_endio().
Zones with an invalid write pointer offset are checked and the write
pointer updated using an internal report zone operation when the
faulty zone is accessed again by the user.

All functions added for this emulation have a minimal overhead for
zoned targets natively supporting zone append operations. Regular
device targets are also not affected. The added code also does not
impact builds with CONFIG_BLK_DEV_ZONED disabled by stubbing out all
dm zone related functions.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-core.h          |  13 +
 drivers/md/dm-table.c         |  19 +-
 drivers/md/dm-zone.c          | 580 +++++++++++++++++++++++++++++++++++++++---
 drivers/md/dm.c               |  38 +--
 drivers/md/dm.h               |  16 +-
 include/linux/device-mapper.h |   6 +
 6 files changed, 618 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index cfabc1c91f9f..edc1553c4eea 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -114,6 +114,11 @@ struct mapped_device {
 	bool init_tio_pdu:1;
 
 	struct srcu_struct io_barrier;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+	unsigned int nr_zones;
+	unsigned int *zwp_offset;
+#endif
 };
 
 /*
@@ -128,6 +133,7 @@ struct mapped_device {
 #define DMF_DEFERRED_REMOVE 6
 #define DMF_SUSPENDED_INTERNALLY 7
 #define DMF_POST_SUSPENDING 8
+#define DMF_EMULATE_ZONE_APPEND 9
 
 void disable_discard(struct mapped_device *md);
 void disable_write_same(struct mapped_device *md);
@@ -143,6 +149,13 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md)
 	return &md->stats;
 }
 
+static inline bool dm_emulate_zone_append(struct mapped_device *md)
+{
+	if (blk_queue_is_zoned(md->queue))
+		return test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+	return false;
+}
+
 #define DM_TABLE_MAX_DEPTH 16
 
 struct dm_table {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1134ceed800f..0543cdf89e92 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1981,11 +1981,12 @@ static int device_requires_stable_pages(struct dm_target *ti,
 	return blk_queue_stable_writes(q);
 }
 
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
-			       struct queue_limits *limits)
+int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+			      struct queue_limits *limits)
 {
 	bool wc = false, fua = false;
 	int page_size = PAGE_SIZE;
+	int r;
 
 	/*
 	 * Copy table's limits to the DM device's request_queue
@@ -2064,12 +2065,20 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	    dm_table_any_dev_attr(t, device_is_not_random, NULL))
 		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
 
-	/* For a zoned target, setup the zones related queue attributes */
-	if (blk_queue_is_zoned(q))
-		dm_set_zones_restrictions(t, q);
+	/*
+	 * For a zoned target, setup the zones related queue attributes
+	 * and resources necessary for zone append emulation if necessary.
+	 */
+	if (blk_queue_is_zoned(q)) {
+		r = dm_set_zones_restrictions(t, q);
+		if (r)
+			return r;
+	}
 
 	dm_update_keyslot_manager(q, t);
 	blk_queue_update_readahead(q);
+
+	return 0;
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index edc3bbb45637..c2f26949f5ee 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -4,55 +4,72 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
 
 #include "dm-core.h"
 
+#define DM_MSG_PREFIX "zone"
+
+#define DM_ZONE_INVALID_WP_OFST		UINT_MAX
+
 /*
- * User facing dm device block device report zone operation. This calls the
- * report_zones operation for each target of a device table. This operation is
- * generally implemented by targets using dm_report_zones().
+ * For internal zone reports bypassing the top BIO submission path.
  */
-int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
-			unsigned int nr_zones, report_zones_cb cb, void *data)
+static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
+				  sector_t sector, unsigned int nr_zones,
+				  report_zones_cb cb, void *data)
 {
-	struct mapped_device *md = disk->private_data;
-	struct dm_table *map;
-	int srcu_idx, ret;
+	struct gendisk *disk = md->disk;
+	int ret;
 	struct dm_report_zones_args args = {
 		.next_sector = sector,
 		.orig_data = data,
 		.orig_cb = cb,
 	};
 
-	if (dm_suspended_md(md))
-		return -EAGAIN;
-
-	map = dm_get_live_table(md, &srcu_idx);
-	if (!map) {
-		ret = -EIO;
-		goto out;
-	}
-
 	do {
 		struct dm_target *tgt;
 
-		tgt = dm_table_find_target(map, args.next_sector);
-		if (WARN_ON_ONCE(!tgt->type->report_zones)) {
-			ret = -EIO;
-			goto out;
-		}
+		tgt = dm_table_find_target(t, args.next_sector);
+		if (WARN_ON_ONCE(!tgt->type->report_zones))
+			return -EIO;
 
 		args.tgt = tgt;
 		ret = tgt->type->report_zones(tgt, &args,
 					      nr_zones - args.zone_idx);
 		if (ret < 0)
-			goto out;
+			return ret;
 	} while (args.zone_idx < nr_zones &&
 		 args.next_sector < get_capacity(disk));
 
-	ret = args.zone_idx;
-out:
+	return args.zone_idx;
+}
+
+/*
+ * User facing dm device block device report zone operation. This calls the
+ * report_zones operation for each target of a device table. This operation is
+ * generally implemented by targets using dm_report_zones().
+ */
+int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
+			unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct mapped_device *md = disk->private_data;
+	struct dm_table *map;
+	int srcu_idx, ret;
+
+	if (dm_suspended_md(md))
+		return -EAGAIN;
+
+	map = dm_get_live_table(md, &srcu_idx);
+	if (!map)
+		return -EIO;
+
+	ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
+
 	dm_put_live_table(md, srcu_idx);
+
 	return ret;
 }
 
@@ -121,16 +138,517 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
 	}
 }
 
-void dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
+void dm_cleanup_zoned_dev(struct mapped_device *md)
 {
-	if (!blk_queue_is_zoned(q))
-		return;
+	struct request_queue *q = md->queue;
+
+	if (q) {
+		kfree(q->conv_zones_bitmap);
+		q->conv_zones_bitmap = NULL;
+		kfree(q->seq_zones_wlock);
+		q->seq_zones_wlock = NULL;
+	}
+
+	kvfree(md->zwp_offset);
+	md->zwp_offset = NULL;
+	md->nr_zones = 0;
+}
+
+static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
+{
+	switch (zone->cond) {
+	case BLK_ZONE_COND_IMP_OPEN:
+	case BLK_ZONE_COND_EXP_OPEN:
+	case BLK_ZONE_COND_CLOSED:
+		return zone->wp - zone->start;
+	case BLK_ZONE_COND_FULL:
+		return zone->len;
+	case BLK_ZONE_COND_EMPTY:
+	case BLK_ZONE_COND_NOT_WP:
+	case BLK_ZONE_COND_OFFLINE:
+	case BLK_ZONE_COND_READONLY:
+	default:
+		/*
+		 * Conventional, offline and read-only zones do not have a valid
+		 * write pointer. Use 0 as for an empty zone.
+		 */
+		return 0;
+	}
+}
+
+static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
+				 void *data)
+{
+	struct mapped_device *md = data;
+	struct request_queue *q = md->queue;
+
+	switch (zone->type) {
+	case BLK_ZONE_TYPE_CONVENTIONAL:
+		if (!q->conv_zones_bitmap) {
+			q->conv_zones_bitmap =
+				kcalloc(BITS_TO_LONGS(q->nr_zones),
+					sizeof(unsigned long), GFP_NOIO);
+			if (!q->conv_zones_bitmap)
+				return -ENOMEM;
+		}
+		set_bit(idx, q->conv_zones_bitmap);
+		break;
+	case BLK_ZONE_TYPE_SEQWRITE_REQ:
+	case BLK_ZONE_TYPE_SEQWRITE_PREF:
+		if (!q->seq_zones_wlock) {
+			q->seq_zones_wlock =
+				kcalloc(BITS_TO_LONGS(q->nr_zones),
+					sizeof(unsigned long), GFP_NOIO);
+			if (!q->seq_zones_wlock)
+				return -ENOMEM;
+		}
+		if (!md->zwp_offset) {
+			md->zwp_offset =
+				kvcalloc(q->nr_zones, sizeof(unsigned int),
+					 GFP_NOIO);
+			if (!md->zwp_offset)
+				return -ENOMEM;
+		}
+		md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
+
+		break;
+	default:
+		DMERR("Invalid zone type 0x%x at sectors %llu",
+		      (int)zone->type, zone->start);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * Revalidate the zones of a mapped device to initialize resource necessary
+ * for zone append emulation. Note that we cannot simply use the block layer
+ * blk_revalidate_disk_zones() function here as the mapped device is suspended
+ * (this is called from __bind() context).
+ */
+static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
+{
+	struct request_queue *q = md->queue;
+	int ret;
+
+	/*
+	 * Check if something changed. If yes, cleanup the current resources
+	 * and reallocate everything.
+	 */
+	if (!q->nr_zones || q->nr_zones != md->nr_zones)
+		dm_cleanup_zoned_dev(md);
+	if (md->nr_zones)
+		return 0;
+
+	/* Scan all zones to initialize everything */
+	ret = dm_blk_do_report_zones(md, t, 0, q->nr_zones,
+				     dm_zone_revalidate_cb, md);
+	if (ret < 0)
+		goto err;
+	if (ret != q->nr_zones) {
+		ret = -EIO;
+		goto err;
+	}
+
+	md->nr_zones = q->nr_zones;
+
+	return 0;
+
+err:
+	DMERR("Revalidate zones failed %d", ret);
+	dm_cleanup_zoned_dev(md);
+	return ret;
+}
+
+static int device_not_zone_append_capable(struct dm_target *ti,
+					  struct dm_dev *dev, sector_t start,
+					  sector_t len, void *data)
+{
+	return !blk_queue_is_zoned(bdev_get_queue(dev->bdev));
+}
+
+static bool dm_table_supports_zone_append(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned int i;
+
+	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+		ti = dm_table_get_target(t, i);
+
+		if (ti->emulate_zone_append)
+			return false;
+
+		if (!ti->type->iterate_devices ||
+		    ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
+			return false;
+	}
+
+	return true;
+}
+
+int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
+{
+	struct mapped_device *md = t->md;
 
 	/*
 	 * For a zoned target, the number of zones should be updated for the
-	 * correct value to be exposed in sysfs queue/nr_zones. For a BIO based
-	 * target, this is all that is needed.
+	 * correct value to be exposed in sysfs queue/nr_zones.
 	 */
 	WARN_ON_ONCE(queue_is_mq(q));
-	q->nr_zones = blkdev_nr_zones(t->md->disk);
+	q->nr_zones = blkdev_nr_zones(md->disk);
+
+	/* Check if zone append is natively supported */
+	if (dm_table_supports_zone_append(t)) {
+		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+		dm_cleanup_zoned_dev(md);
+		return 0;
+	}
+
+	/*
+	 * Mark the mapped device as needing zone append emulation and
+	 * initialize the emulation resources once the capacity is set.
+	 */
+	set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+	if (!get_capacity(md->disk))
+		return 0;
+
+	return dm_revalidate_zones(md, t);
+}
+
+static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
+				       void *data)
+{
+	unsigned int *wp_offset = data;
+
+	*wp_offset = dm_get_zone_wp_offset(zone);
+
+	return 0;
+}
+
+static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
+				    unsigned int *wp_ofst)
+{
+	sector_t sector = zno * blk_queue_zone_sectors(md->queue);
+	unsigned int noio_flag;
+	struct dm_table *t;
+	int srcu_idx, ret;
+
+	t = dm_get_live_table(md, &srcu_idx);
+	if (!t)
+		return -EIO;
+
+	/*
+	 * Ensure that all memory allocations in this context are done as if
+	 * GFP_NOIO was specified.
+	 */
+	noio_flag = memalloc_noio_save();
+	ret = dm_blk_do_report_zones(md, t, sector, 1,
+				     dm_update_zone_wp_offset_cb, wp_ofst);
+	memalloc_noio_restore(noio_flag);
+
+	dm_put_live_table(md, srcu_idx);
+
+	if (ret != 1)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * First phase of BIO mapping for targets with zone append emulation:
+ * check all BIO that change a zone writer pointer and change zone
+ * append operations into regular write operations.
+ */
+static bool dm_zone_map_bio_begin(struct mapped_device *md,
+				  struct bio *orig_bio, struct bio *clone)
+{
+	sector_t zsectors = blk_queue_zone_sectors(md->queue);
+	unsigned int zno = bio_zone_no(orig_bio);
+	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
+
+	/*
+	 * If the target zone is in an error state, recover by inspecting the
+	 * zone to get its current write pointer position. Note that since the
+	 * target zone is already locked, a BIO issuing context should never
+	 * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
+	 */
+	if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
+		if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
+			return false;
+		WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
+	}
+
+	switch (bio_op(orig_bio)) {
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_FINISH:
+		return true;
+	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE:
+		/* Writes must be aligned to the zone write pointer */
+		if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
+			return false;
+		break;
+	case REQ_OP_ZONE_APPEND:
+		/*
+		 * Change zone append operations into a non-mergeable regular
+		 * writes directed at the current write pointer position of the
+		 * target zone.
+		 */
+		clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
+			(orig_bio->bi_opf & (~REQ_OP_MASK));
+		clone->bi_iter.bi_sector =
+			orig_bio->bi_iter.bi_sector + zwp_offset;
+		break;
+	default:
+		DMWARN_LIMIT("Invalid BIO operation");
+		return false;
+	}
+
+	/* Cannot write to a full zone */
+	if (zwp_offset >= zsectors)
+		return false;
+
+	return true;
+}
+
+/*
+ * Second phase of BIO mapping for targets with zone append emulation:
+ * update the zone write pointer offset array to account for the additional
+ * data written to a zone. Note that at this point, the remapped clone BIO
+ * may already have completed, so we do not touch it.
+ */
+static blk_status_t dm_zone_map_bio_end(struct mapped_device *md,
+					struct bio *orig_bio,
+					unsigned int nr_sectors)
+{
+	unsigned int zno = bio_zone_no(orig_bio);
+	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
+
+	/* The clone BIO may already have been completed and failed */
+	if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
+		return BLK_STS_IOERR;
+
+	/* Update the zone wp offset */
+	switch (bio_op(orig_bio)) {
+	case REQ_OP_ZONE_RESET:
+		WRITE_ONCE(md->zwp_offset[zno], 0);
+		return BLK_STS_OK;
+	case REQ_OP_ZONE_FINISH:
+		WRITE_ONCE(md->zwp_offset[zno],
+			   blk_queue_zone_sectors(md->queue));
+		return BLK_STS_OK;
+	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE:
+		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
+		return BLK_STS_OK;
+	case REQ_OP_ZONE_APPEND:
+		/*
+		 * Check that the target did not truncate the write operation
+		 * emulating a zone append.
+		 */
+		if (nr_sectors != bio_sectors(orig_bio)) {
+			DMWARN_LIMIT("Truncated write for zone append");
+			return BLK_STS_IOERR;
+		}
+		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
+		return BLK_STS_OK;
+	default:
+		DMWARN_LIMIT("Invalid BIO operation");
+		return BLK_STS_IOERR;
+	}
+}
+
+static inline void dm_zone_lock(struct request_queue *q,
+				unsigned int zno, struct bio *clone)
+{
+	if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
+		return;
+
+	wait_on_bit_lock_io(q->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
+	bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
+}
+
+static inline void dm_zone_unlock(struct request_queue *q,
+				  unsigned int zno, struct bio *clone)
+{
+	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
+		return;
+
+	WARN_ON_ONCE(!test_bit(zno, q->seq_zones_wlock));
+	clear_bit_unlock(zno, q->seq_zones_wlock);
+	smp_mb__after_atomic();
+	wake_up_bit(q->seq_zones_wlock, zno);
+
+	bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
+}
+
+static bool dm_need_zone_wp_tracking(struct bio *orig_bio)
+{
+	/*
+	 * Special processing is not needed for operations that do not need the
+	 * zone write lock, that is, all operations that target conventional
+	 * zones and all operations that do not modify directly a sequential
+	 * zone write pointer.
+	 */
+	if (op_is_flush(orig_bio->bi_opf) && !bio_sectors(orig_bio))
+		return false;
+	switch (bio_op(orig_bio)) {
+	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE:
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_FINISH:
+	case REQ_OP_ZONE_APPEND:
+		return bio_zone_is_seq(orig_bio);
+	default:
+		return false;
+	}
+}
+
+/*
+ * Special IO mapping for targets needing zone append emulation.
+ */
+int dm_zone_map_bio(struct dm_target_io *tio)
+{
+	struct dm_io *io = tio->io;
+	struct dm_target *ti = tio->ti;
+	struct mapped_device *md = io->md;
+	struct request_queue *q = md->queue;
+	struct bio *orig_bio = io->orig_bio;
+	struct bio *clone = &tio->clone;
+	unsigned int zno;
+	blk_status_t sts;
+	int r;
+
+	/*
+	 * IOs that do not change a zone write pointer do not need
+	 * any additional special processing.
+	 */
+	if (!dm_need_zone_wp_tracking(orig_bio))
+		return ti->type->map(ti, clone);
+
+	/* Lock the target zone */
+	zno = bio_zone_no(orig_bio);
+	dm_zone_lock(q, zno, clone);
+
+	/*
+	 * Check that the bio and the target zone write pointer offset are
+	 * both valid, and if the bio is a zone append, remap it to a write.
+	 */
+	if (!dm_zone_map_bio_begin(md, orig_bio, clone)) {
+		dm_zone_unlock(q, zno, clone);
+		return DM_MAPIO_KILL;
+	}
+
+	/*
+	 * The target map function may issue and complete the IO quickly.
+	 * Take an extra reference on the IO to make sure it does disappear
+	 * until we run dm_zone_map_bio_end().
+	 */
+	dm_io_inc_pending(io);
+
+	/* Let the target do its work */
+	r = ti->type->map(ti, clone);
+	switch (r) {
+	case DM_MAPIO_SUBMITTED:
+		/*
+		 * The target submitted the clone BIO. The target zone will
+		 * be unlocked on completion of the clone.
+		 */
+		sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr);
+		break;
+	case DM_MAPIO_REMAPPED:
+		/*
+		 * The target only remapped the clone BIO. In case of error,
+		 * unlock the target zone here as the clone will not be
+		 * submitted.
+		 */
+		sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr);
+		if (sts != BLK_STS_OK)
+			dm_zone_unlock(q, zno, clone);
+		break;
+	case DM_MAPIO_REQUEUE:
+	case DM_MAPIO_KILL:
+	default:
+		dm_zone_unlock(q, zno, clone);
+		sts = BLK_STS_IOERR;
+		break;
+	}
+
+	/* Drop the extra reference on the IO */
+	dm_io_dec_pending(io, sts);
+
+	if (sts != BLK_STS_OK)
+		return DM_MAPIO_KILL;
+
+	return r;
+}
+
+/*
+ * IO completion callback called from clone_endio().
+ */
+void dm_zone_endio(struct dm_io *io, struct bio *clone)
+{
+	struct mapped_device *md = io->md;
+	struct request_queue *q = md->queue;
+	struct bio *orig_bio = io->orig_bio;
+	unsigned int zwp_offset;
+	unsigned int zno;
+
+	/*
+	 * For targets that do not emulate zone append, we only need to
+	 * handle native zone-append bios.
+	 */
+	if (!dm_emulate_zone_append(md)) {
+		/*
+		 * Get the offset within the zone of the written sector
+		 * and add that to the original bio sector position.
+		 */
+		if (clone->bi_status == BLK_STS_OK &&
+		    bio_op(clone) == REQ_OP_ZONE_APPEND) {
+			sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1;
+
+			orig_bio->bi_iter.bi_sector +=
+				clone->bi_iter.bi_sector & mask;
+		}
+
+		return;
+	}
+
+	/*
+	 * For targets that do emulate zone append, if the clone BIO does not
+	 * own the target zone write lock, we have nothing to do.
+	 */
+	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
+		return;
+
+	zno = bio_zone_no(orig_bio);
+
+	if (clone->bi_status != BLK_STS_OK) {
+		/*
+		 * BIOs that modify a zone write pointer may leave the zone
+		 * in an unknown state in case of failure (e.g. the write
+		 * pointer was only partially advanced). In this case, set
+		 * the target zone write pointer as invalid unless it is
+		 * already being updated.
+		 */
+		WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
+	} else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
+		/*
+		 * Get the written sector for zone append operation that were
+		 * emulated using regular write operations.
+		 */
+		zwp_offset = READ_ONCE(md->zwp_offset[zno]);
+		if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
+			WRITE_ONCE(md->zwp_offset[zno],
+				   DM_ZONE_INVALID_WP_OFST);
+		else
+			orig_bio->bi_iter.bi_sector +=
+				zwp_offset - bio_sectors(orig_bio);
+	}
+
+	dm_zone_unlock(q, zno, clone);
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 49bd18e99af6..420a12b42708 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -876,7 +876,6 @@ static void clone_endio(struct bio *bio)
 	struct dm_io *io = tio->io;
 	struct mapped_device *md = tio->io->md;
 	dm_endio_fn endio = tio->ti->type->end_io;
-	struct bio *orig_bio = io->orig_bio;
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 
 	if (unlikely(error == BLK_STS_TARGET)) {
@@ -891,17 +890,8 @@ static void clone_endio(struct bio *bio)
 			disable_write_zeroes(md);
 	}
 
-	/*
-	 * For zone-append bios get offset in zone of the written
-	 * sector and add that to the original bio sector pos.
-	 */
-	if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
-		sector_t written_sector = bio->bi_iter.bi_sector;
-		struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue;
-		u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
-
-		orig_bio->bi_iter.bi_sector += written_sector & mask;
-	}
+	if (blk_queue_is_zoned(q))
+		dm_zone_endio(io, bio);
 
 	if (endio) {
 		int r = endio(tio->ti, bio, &error);
@@ -1213,7 +1203,16 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
 		down(&md->swap_bios_semaphore);
 	}
 
-	r = ti->type->map(ti, clone);
+	/*
+	 * Check if the IO needs a special mapping due to zone append emulation
+	 * on zoned target. In this case, dm_zone_map_bio() calls the target
+	 * map operation.
+	 */
+	if (dm_emulate_zone_append(io->md))
+		r = dm_zone_map_bio(tio);
+	else
+		r = ti->type->map(ti, clone);
+
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
 		break;
@@ -1711,6 +1710,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
 	mutex_destroy(&md->swap_bios_lock);
 
 	dm_mq_cleanup_mapped_device(md);
+	dm_cleanup_zoned_dev(md);
 }
 
 /*
@@ -1956,11 +1956,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 		goto out;
 	}
 
+	ret = dm_table_set_restrictions(t, q, limits);
+	if (ret) {
+		old_map = ERR_PTR(ret);
+		goto out;
+	}
+
 	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
 	rcu_assign_pointer(md->map, (void *)t);
 	md->immutable_target_type = dm_table_get_immutable_target_type(t);
 
-	dm_table_set_restrictions(t, q, limits);
 	if (old_map)
 		dm_sync_table(md);
 
@@ -2079,7 +2084,10 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		DMERR("Cannot calculate initial queue limits");
 		return r;
 	}
-	dm_table_set_restrictions(t, md->queue, &limits);
+	r = dm_table_set_restrictions(t, md->queue, &limits);
+	if (r)
+		return r;
+
 	blk_register_queue(md->disk);
 
 	return 0;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 39c243258e24..742d9c80efe1 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -45,6 +45,8 @@ struct dm_dev_internal {
 
 struct dm_table;
 struct dm_md_mempools;
+struct dm_target_io;
+struct dm_io;
 
 /*-----------------------------------------------------------------
  * Internal table functions.
@@ -56,8 +58,8 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
 bool dm_table_has_no_data_devices(struct dm_table *table);
 int dm_calculate_queue_limits(struct dm_table *table,
 			      struct queue_limits *limits);
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
-			       struct queue_limits *limits);
+int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+			      struct queue_limits *limits);
 struct list_head *dm_table_get_devices(struct dm_table *t);
 void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_presuspend_undo_targets(struct dm_table *t);
@@ -103,17 +105,25 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
 /*
  * Zoned targets related functions.
  */
-void dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
+int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
+void dm_zone_endio(struct dm_io *io, struct bio *clone);
 #ifdef CONFIG_BLK_DEV_ZONED
+void dm_cleanup_zoned_dev(struct mapped_device *md);
 int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data);
 bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
+int dm_zone_map_bio(struct dm_target_io *io);
 #else
+static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {}
 #define dm_blk_report_zones	NULL
 static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
 {
 	return false;
 }
+static inline int dm_zone_map_bio(struct dm_target_io *tio)
+{
+	return DM_MAPIO_KILL;
+}
 #endif
 
 /*-----------------------------------------------------------------
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index caea0a079d2d..7457d49acf9a 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -361,6 +361,12 @@ struct dm_target {
 	 * Set if we need to limit the number of in-flight bios when swapping.
 	 */
 	bool limit_swap_bios:1;
+
+	/*
+	 * Set if this target implements a a zoned device and needs emulation of
+	 * zone append operations using regular writes.
+	 */
+	bool emulate_zone_append:1;
 };
 
 void *dm_per_bio_data(struct bio *bio, size_t data_size);
-- 
cgit v1.2.3


From 92638b4e1b47f97d7269e74465dedf73096f777d Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Wed, 2 Jun 2021 16:52:27 -0700
Subject: mm: arch: remove indirection level in
 alloc_zeroed_user_highpage_movable()

In an upcoming change we would like to add a flag to
GFP_HIGHUSER_MOVABLE so that it would no longer be an OR
of GFP_HIGHUSER and __GFP_MOVABLE. This poses a problem for
alloc_zeroed_user_highpage_movable() which passes __GFP_MOVABLE
into an arch-specific __alloc_zeroed_user_highpage() hook which ORs
in GFP_HIGHUSER.

Since __alloc_zeroed_user_highpage() is only ever called from
alloc_zeroed_user_highpage_movable(), we can remove one level
of indirection here. Remove __alloc_zeroed_user_highpage(),
make alloc_zeroed_user_highpage_movable() the hook, and use
GFP_HIGHUSER_MOVABLE in the hook implementations so that they will
pick up the new flag that we are going to add.

Signed-off-by: Peter Collingbourne <pcc@google.com>
Link: https://linux-review.googlesource.com/id/Ic6361c657b2cdcd896adbe0cf7cb5a7fbb1ed7bf
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20210602235230.3928842-2-pcc@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/alpha/include/asm/page.h   |  6 +++---
 arch/arm64/include/asm/page.h   |  6 +++---
 arch/ia64/include/asm/page.h    |  6 +++---
 arch/m68k/include/asm/page_no.h |  6 +++---
 arch/s390/include/asm/page.h    |  6 +++---
 arch/x86/include/asm/page.h     |  6 +++---
 include/linux/highmem.h         | 35 ++++++++---------------------------
 7 files changed, 26 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h
index 268f99b4602b..18f48a6f2ff6 100644
--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -17,9 +17,9 @@
 extern void clear_page(void *page);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
-	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vmaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vmaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
 
 extern void copy_page(void * _to, void * _from);
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 012cffc574e8..e1fc0f60e79f 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -28,9 +28,9 @@ void copy_user_highpage(struct page *to, struct page *from,
 void copy_highpage(struct page *to, struct page *from);
 #define __HAVE_ARCH_COPY_HIGHPAGE
 
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
-	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
 
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index f4dc81fa7146..1b990466d540 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -82,16 +82,16 @@ do {						\
 } while (0)
 
 
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr)		\
+#define alloc_zeroed_user_highpage_movable(vma, vaddr)			\
 ({									\
 	struct page *page = alloc_page_vma(				\
-		GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr);	\
+		GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr);		\
 	if (page)							\
  		flush_dcache_page(page);				\
 	page;								\
 })
 
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
 
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
 
diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h
index 8d0f862ee9d7..c9d0d84158a4 100644
--- a/arch/m68k/include/asm/page_no.h
+++ b/arch/m68k/include/asm/page_no.h
@@ -13,9 +13,9 @@ extern unsigned long memory_end;
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
-	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
 
 #define __pa(vaddr)		((unsigned long)(vaddr))
 #define __va(paddr)		((void *)((unsigned long)(paddr)))
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index cc98f9b78fd4..479dc76e0eca 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -68,9 +68,9 @@ static inline void copy_page(void *to, void *from)
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
-	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
 
 /*
  * These are used to make use of C type-checking..
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 7555b48803a8..4d5810c8fab7 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -34,9 +34,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
 	copy_page(to, from);
 }
 
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
-	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
 
 #ifndef __pa
 #define __pa(x)		__phys_addr((unsigned long)(x))
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 832b49b50c7b..54d0643b8fcf 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -152,28 +152,24 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
 }
 #endif
 
-#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
 /**
- * __alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA with caller-specified movable GFP flags
- * @movableflags: The GFP flags related to the pages future ability to move like __GFP_MOVABLE
+ * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move
  * @vma: The VMA the page is to be allocated for
  * @vaddr: The virtual address the page will be inserted into
  *
- * This function will allocate a page for a VMA but the caller is expected
- * to specify via movableflags whether the page will be movable in the
- * future or not
+ * This function will allocate a page for a VMA that the caller knows will
+ * be able to migrate in the future using move_pages() or reclaimed
  *
  * An architecture may override this function by defining
- * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE and providing their own
+ * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own
  * implementation.
  */
 static inline struct page *
-__alloc_zeroed_user_highpage(gfp_t movableflags,
-			struct vm_area_struct *vma,
-			unsigned long vaddr)
+alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+				   unsigned long vaddr)
 {
-	struct page *page = alloc_page_vma(GFP_HIGHUSER | movableflags,
-			vma, vaddr);
+	struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
 
 	if (page)
 		clear_user_highpage(page, vaddr);
@@ -182,21 +178,6 @@ __alloc_zeroed_user_highpage(gfp_t movableflags,
 }
 #endif
 
-/**
- * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move
- * @vma: The VMA the page is to be allocated for
- * @vaddr: The virtual address the page will be inserted into
- *
- * This function will allocate a page for a VMA that the caller knows will
- * be able to migrate in the future using move_pages() or reclaimed
- */
-static inline struct page *
-alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
-					unsigned long vaddr)
-{
-	return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr);
-}
-
 static inline void clear_highpage(struct page *page)
 {
 	void *kaddr = kmap_atomic(page);
-- 
cgit v1.2.3


From 7a3b835371883558eb63e069d891bd87f562380d Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Wed, 2 Jun 2021 16:52:28 -0700
Subject: kasan: use separate (un)poison implementation for integrated init

Currently with integrated init page_alloc.c needs to know whether
kasan_alloc_pages() will zero initialize memory, but this will start
becoming more complicated once we start adding tag initialization
support for user pages. To avoid page_alloc.c needing to know more
details of what integrated init will do, move the unpoisoning logic
for integrated init into the HW tags implementation. Currently the
logic is identical but it will diverge in subsequent patches.

For symmetry do the same for poisoning although this logic will
be unaffected by subsequent patches.

Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://linux-review.googlesource.com/id/I2c550234c6c4a893c48c18ff0c6ce658c7c67056
Link: https://lore.kernel.org/r/20210602235230.3928842-3-pcc@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/kasan.h | 64 ++++++++++++++++++++++++++++++---------------------
 mm/kasan/common.c     |  4 ++--
 mm/kasan/hw_tags.c    | 22 ++++++++++++++++++
 mm/mempool.c          |  6 +++--
 mm/page_alloc.c       | 55 ++++++++++++++++++++++---------------------
 5 files changed, 95 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index b1678a61e6a7..a1c7ce5f3e4f 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_KASAN_H
 #define _LINUX_KASAN_H
 
+#include <linux/bug.h>
 #include <linux/static_key.h>
 #include <linux/types.h>
 
@@ -79,14 +80,6 @@ static inline void kasan_disable_current(void) {}
 
 #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
-#ifdef CONFIG_KASAN
-
-struct kasan_cache {
-	int alloc_meta_offset;
-	int free_meta_offset;
-	bool is_kmalloc;
-};
-
 #ifdef CONFIG_KASAN_HW_TAGS
 
 DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled);
@@ -101,11 +94,14 @@ static inline bool kasan_has_integrated_init(void)
 	return kasan_enabled();
 }
 
+void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags);
+void kasan_free_pages(struct page *page, unsigned int order);
+
 #else /* CONFIG_KASAN_HW_TAGS */
 
 static inline bool kasan_enabled(void)
 {
-	return true;
+	return IS_ENABLED(CONFIG_KASAN);
 }
 
 static inline bool kasan_has_integrated_init(void)
@@ -113,8 +109,30 @@ static inline bool kasan_has_integrated_init(void)
 	return false;
 }
 
+static __always_inline void kasan_alloc_pages(struct page *page,
+					      unsigned int order, gfp_t flags)
+{
+	/* Only available for integrated init. */
+	BUILD_BUG();
+}
+
+static __always_inline void kasan_free_pages(struct page *page,
+					     unsigned int order)
+{
+	/* Only available for integrated init. */
+	BUILD_BUG();
+}
+
 #endif /* CONFIG_KASAN_HW_TAGS */
 
+#ifdef CONFIG_KASAN
+
+struct kasan_cache {
+	int alloc_meta_offset;
+	int free_meta_offset;
+	bool is_kmalloc;
+};
+
 slab_flags_t __kasan_never_merge(void);
 static __always_inline slab_flags_t kasan_never_merge(void)
 {
@@ -130,20 +148,20 @@ static __always_inline void kasan_unpoison_range(const void *addr, size_t size)
 		__kasan_unpoison_range(addr, size);
 }
 
-void __kasan_alloc_pages(struct page *page, unsigned int order, bool init);
-static __always_inline void kasan_alloc_pages(struct page *page,
+void __kasan_poison_pages(struct page *page, unsigned int order, bool init);
+static __always_inline void kasan_poison_pages(struct page *page,
 						unsigned int order, bool init)
 {
 	if (kasan_enabled())
-		__kasan_alloc_pages(page, order, init);
+		__kasan_poison_pages(page, order, init);
 }
 
-void __kasan_free_pages(struct page *page, unsigned int order, bool init);
-static __always_inline void kasan_free_pages(struct page *page,
-						unsigned int order, bool init)
+void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init);
+static __always_inline void kasan_unpoison_pages(struct page *page,
+						 unsigned int order, bool init)
 {
 	if (kasan_enabled())
-		__kasan_free_pages(page, order, init);
+		__kasan_unpoison_pages(page, order, init);
 }
 
 void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
@@ -285,21 +303,15 @@ void kasan_restore_multi_shot(bool enabled);
 
 #else /* CONFIG_KASAN */
 
-static inline bool kasan_enabled(void)
-{
-	return false;
-}
-static inline bool kasan_has_integrated_init(void)
-{
-	return false;
-}
 static inline slab_flags_t kasan_never_merge(void)
 {
 	return 0;
 }
 static inline void kasan_unpoison_range(const void *address, size_t size) {}
-static inline void kasan_alloc_pages(struct page *page, unsigned int order, bool init) {}
-static inline void kasan_free_pages(struct page *page, unsigned int order, bool init) {}
+static inline void kasan_poison_pages(struct page *page, unsigned int order,
+				      bool init) {}
+static inline void kasan_unpoison_pages(struct page *page, unsigned int order,
+					bool init) {}
 static inline void kasan_cache_create(struct kmem_cache *cache,
 				      unsigned int *size,
 				      slab_flags_t *flags) {}
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6bb87f2acd4e..0ecd293af344 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void)
 	return 0;
 }
 
-void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
+void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
 {
 	u8 tag;
 	unsigned long i;
@@ -111,7 +111,7 @@ void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
 	kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
 }
 
-void __kasan_free_pages(struct page *page, unsigned int order, bool init)
+void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
 {
 	if (likely(!PageHighMem(page)))
 		kasan_poison(page_address(page), PAGE_SIZE << order,
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 4004388b4e4b..9d0f6f934016 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -238,6 +238,28 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
 	return &alloc_meta->free_track[0];
 }
 
+void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
+{
+	/*
+	 * This condition should match the one in post_alloc_hook() in
+	 * page_alloc.c.
+	 */
+	bool init = !want_init_on_free() && want_init_on_alloc(flags);
+
+	kasan_unpoison_pages(page, order, init);
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+	/*
+	 * This condition should match the one in free_pages_prepare() in
+	 * page_alloc.c.
+	 */
+	bool init = want_init_on_free();
+
+	kasan_poison_pages(page, order, init);
+}
+
 #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
 
 void kasan_set_tagging_report_once(bool state)
diff --git a/mm/mempool.c b/mm/mempool.c
index a258cf4de575..0b8afbec3e35 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -106,7 +106,8 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
 	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
 		kasan_slab_free_mempool(element);
 	else if (pool->alloc == mempool_alloc_pages)
-		kasan_free_pages(element, (unsigned long)pool->pool_data, false);
+		kasan_poison_pages(element, (unsigned long)pool->pool_data,
+				   false);
 }
 
 static void kasan_unpoison_element(mempool_t *pool, void *element)
@@ -114,7 +115,8 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
 	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
 		kasan_unpoison_range(element, __ksize(element));
 	else if (pool->alloc == mempool_alloc_pages)
-		kasan_alloc_pages(element, (unsigned long)pool->pool_data, false);
+		kasan_unpoison_pages(element, (unsigned long)pool->pool_data,
+				     false);
 }
 
 static __always_inline void add_element(mempool_t *pool, void *element)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aaa1655cf682..4fddb7cac3c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -382,7 +382,7 @@ int page_group_by_mobility_disabled __read_mostly;
 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
 
 /*
- * Calling kasan_free_pages() only after deferred memory initialization
+ * Calling kasan_poison_pages() only after deferred memory initialization
  * has completed. Poisoning pages during deferred memory init will greatly
  * lengthen the process and cause problem in large memory systems as the
  * deferred pages initialization is done with interrupt disabled.
@@ -394,15 +394,11 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
  * on-demand allocation and then freed again before the deferred pages
  * initialization is done, but this is not likely to happen.
  */
-static inline void kasan_free_nondeferred_pages(struct page *page, int order,
-						bool init, fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(fpi_t fpi_flags)
 {
-	if (static_branch_unlikely(&deferred_pages))
-		return;
-	if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
-			(fpi_flags & FPI_SKIP_KASAN_POISON))
-		return;
-	kasan_free_pages(page, order, init);
+	return static_branch_unlikely(&deferred_pages) ||
+	       (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+		(fpi_flags & FPI_SKIP_KASAN_POISON));
 }
 
 /* Returns true if the struct page for the pfn is uninitialised */
@@ -453,13 +449,10 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 	return false;
 }
 #else
-static inline void kasan_free_nondeferred_pages(struct page *page, int order,
-						bool init, fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(fpi_t fpi_flags)
 {
-	if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
-			(fpi_flags & FPI_SKIP_KASAN_POISON))
-		return;
-	kasan_free_pages(page, order, init);
+	return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+		(fpi_flags & FPI_SKIP_KASAN_POISON));
 }
 
 static inline bool early_page_uninitialised(unsigned long pfn)
@@ -1245,7 +1238,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 			unsigned int order, bool check_free, fpi_t fpi_flags)
 {
 	int bad = 0;
-	bool init;
+	bool skip_kasan_poison = should_skip_kasan_poison(fpi_flags);
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
@@ -1314,10 +1307,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	 * With hardware tag-based KASAN, memory tags must be set before the
 	 * page becomes unavailable via debug_pagealloc or arch_free_page.
 	 */
-	init = want_init_on_free();
-	if (init && !kasan_has_integrated_init())
-		kernel_init_free_pages(page, 1 << order);
-	kasan_free_nondeferred_pages(page, order, init, fpi_flags);
+	if (kasan_has_integrated_init()) {
+		if (!skip_kasan_poison)
+			kasan_free_pages(page, order);
+	} else {
+		bool init = want_init_on_free();
+
+		if (init)
+			kernel_init_free_pages(page, 1 << order);
+		if (!skip_kasan_poison)
+			kasan_poison_pages(page, order, init);
+	}
 
 	/*
 	 * arch_free_page() can make the page's contents inaccessible.  s390
@@ -2324,8 +2324,6 @@ static bool check_new_pages(struct page *page, unsigned int order)
 inline void post_alloc_hook(struct page *page, unsigned int order,
 				gfp_t gfp_flags)
 {
-	bool init;
-
 	set_page_private(page, 0);
 	set_page_refcounted(page);
 
@@ -2344,10 +2342,15 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 	 * kasan_alloc_pages and kernel_init_free_pages must be
 	 * kept together to avoid discrepancies in behavior.
 	 */
-	init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
-	kasan_alloc_pages(page, order, init);
-	if (init && !kasan_has_integrated_init())
-		kernel_init_free_pages(page, 1 << order);
+	if (kasan_has_integrated_init()) {
+		kasan_alloc_pages(page, order, gfp_flags);
+	} else {
+		bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+
+		kasan_unpoison_pages(page, order, init);
+		if (init)
+			kernel_init_free_pages(page, 1 << order);
+	}
 
 	set_page_owner(page, order, gfp_flags);
 }
-- 
cgit v1.2.3


From 013bb59dbb7cf876449df860946458a595a96d51 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Wed, 2 Jun 2021 16:52:29 -0700
Subject: arm64: mte: handle tags zeroing at page allocation time

Currently, on an anonymous page fault, the kernel allocates a zeroed
page and maps it in user space. If the mapping is tagged (PROT_MTE),
set_pte_at() additionally clears the tags. It is, however, more
efficient to clear the tags at the same time as zeroing the data on
allocation. To avoid clearing the tags on any page (which may not be
mapped as tagged), only do this if the vma flags contain VM_MTE. This
requires introducing a new GFP flag that is used to determine whether
to clear the tags.

The DC GZVA instruction with a 0 top byte (and 0 tag) requires
top-byte-ignore. Set the TCR_EL1.{TBI1,TBID1} bits irrespective of
whether KASAN_HW is enabled.

Signed-off-by: Peter Collingbourne <pcc@google.com>
Co-developed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://linux-review.googlesource.com/id/Id46dc94e30fe11474f7e54f5d65e7658dbdddb26
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://lore.kernel.org/r/20210602235230.3928842-4-pcc@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/mte.h  |  4 ++++
 arch/arm64/include/asm/page.h |  8 ++++++--
 arch/arm64/lib/mte.S          | 20 ++++++++++++++++++++
 arch/arm64/mm/fault.c         | 26 ++++++++++++++++++++++++++
 arch/arm64/mm/proc.S          | 10 +++++++---
 include/linux/gfp.h           |  9 +++++++--
 include/linux/highmem.h       |  8 ++++++++
 mm/kasan/hw_tags.c            |  9 ++++++++-
 mm/page_alloc.c               | 13 ++++++++++---
 9 files changed, 96 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index bc88a1ced0d7..67bf259ae768 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -37,6 +37,7 @@ void mte_free_tag_storage(char *storage);
 /* track which pages have valid allocation tags */
 #define PG_mte_tagged	PG_arch_2
 
+void mte_zero_clear_page_tags(void *addr);
 void mte_sync_tags(pte_t *ptep, pte_t pte);
 void mte_copy_page_tags(void *kto, const void *kfrom);
 void mte_thread_init_user(void);
@@ -53,6 +54,9 @@ int mte_ptrace_copy_tags(struct task_struct *child, long request,
 /* unused if !CONFIG_ARM64_MTE, silence the compiler */
 #define PG_mte_tagged	0
 
+static inline void mte_zero_clear_page_tags(void *addr)
+{
+}
 static inline void mte_sync_tags(pte_t *ptep, pte_t pte)
 {
 }
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index e1fc0f60e79f..ed1b9dcf12b2 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -13,6 +13,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/personality.h> /* for READ_IMPLIES_EXEC */
+#include <linux/types.h> /* for gfp_t */
 #include <asm/pgtable-types.h>
 
 struct page;
@@ -28,10 +29,13 @@ void copy_user_highpage(struct page *to, struct page *from,
 void copy_highpage(struct page *to, struct page *from);
 #define __HAVE_ARCH_COPY_HIGHPAGE
 
-#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
-	alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
+struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+						unsigned long vaddr);
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
 
+void tag_clear_highpage(struct page *to);
+#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
+
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index 351537c12f36..e83643b3995f 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -36,6 +36,26 @@ SYM_FUNC_START(mte_clear_page_tags)
 	ret
 SYM_FUNC_END(mte_clear_page_tags)
 
+/*
+ * Zero the page and tags at the same time
+ *
+ * Parameters:
+ *	x0 - address to the beginning of the page
+ */
+SYM_FUNC_START(mte_zero_clear_page_tags)
+	mrs	x1, dczid_el0
+	and	w1, w1, #0xf
+	mov	x2, #4
+	lsl	x1, x2, x1
+	and	x0, x0, #(1 << MTE_TAG_SHIFT) - 1	// clear the tag
+
+1:	dc	gzva, x0
+	add	x0, x0, x1
+	tst	x0, #(PAGE_SIZE - 1)
+	b.ne	1b
+	ret
+SYM_FUNC_END(mte_zero_clear_page_tags)
+
 /*
  * Copy the tags from the source page to the destination one
  *   x0 - address of the destination page
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 871c82ab0a30..180c0343d82a 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -921,3 +921,29 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
 	debug_exception_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug_exception);
+
+/*
+ * Used during anonymous page fault handling.
+ */
+struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+						unsigned long vaddr)
+{
+	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;
+
+	/*
+	 * If the page is mapped with PROT_MTE, initialise the tags at the
+	 * point of allocation and page zeroing as this is usually faster than
+	 * separate DC ZVA and STGM.
+	 */
+	if (vma->vm_flags & VM_MTE)
+		flags |= __GFP_ZEROTAGS;
+
+	return alloc_page_vma(flags, vma, vaddr);
+}
+
+void tag_clear_highpage(struct page *page)
+{
+	mte_zero_clear_page_tags(page_address(page));
+	page_kasan_tag_reset(page);
+	set_bit(PG_mte_tagged, &page->flags);
+}
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 97d7bcd8d4f2..48fd1df3d05a 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -46,9 +46,13 @@
 #endif
 
 #ifdef CONFIG_KASAN_HW_TAGS
-#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1
+#define TCR_MTE_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1
 #else
-#define TCR_KASAN_HW_FLAGS 0
+/*
+ * The mte_zero_clear_page_tags() implementation uses DC GZVA, which relies on
+ * TBI being enabled at EL1.
+ */
+#define TCR_MTE_FLAGS TCR_TBI1 | TCR_TBID1
 #endif
 
 /*
@@ -464,7 +468,7 @@ SYM_FUNC_START(__cpu_setup)
 	msr_s	SYS_TFSRE0_EL1, xzr
 
 	/* set the TCR_EL1 bits */
-	mov_q	x10, TCR_KASAN_HW_FLAGS
+	mov_q	x10, TCR_MTE_FLAGS
 	orr	tcr, tcr, x10
 1:
 #endif
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 11da8af06704..68ba237365dc 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -53,8 +53,9 @@ struct vm_area_struct;
 #define ___GFP_HARDWALL		0x100000u
 #define ___GFP_THISNODE		0x200000u
 #define ___GFP_ACCOUNT		0x400000u
+#define ___GFP_ZEROTAGS		0x800000u
 #ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP	0x800000u
+#define ___GFP_NOLOCKDEP	0x1000000u
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
@@ -229,16 +230,20 @@ struct vm_area_struct;
  * %__GFP_COMP address compound page metadata.
  *
  * %__GFP_ZERO returns a zeroed page on success.
+ *
+ * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if
+ * __GFP_ZERO is set.
  */
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
+#define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
 
 /* Disable lockdep for GFP context tracking */
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (24 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /**
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 54d0643b8fcf..8c6e8e996c87 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -185,6 +185,14 @@ static inline void clear_highpage(struct page *page)
 	kunmap_atomic(kaddr);
 }
 
+#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
+
+static inline void tag_clear_highpage(struct page *page)
+{
+}
+
+#endif
+
 /*
  * If we pass in a base or tail page, we can zero up to PAGE_SIZE.
  * If we pass in a head page, we can zero up to the size of the compound page.
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9d0f6f934016..41fd5326ee0a 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -246,7 +246,14 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
 	 */
 	bool init = !want_init_on_free() && want_init_on_alloc(flags);
 
-	kasan_unpoison_pages(page, order, init);
+	if (flags & __GFP_ZEROTAGS) {
+		int i;
+
+		for (i = 0; i != 1 << order; ++i)
+			tag_clear_highpage(page + i);
+	} else {
+		kasan_unpoison_pages(page, order, init);
+	}
 }
 
 void kasan_free_pages(struct page *page, unsigned int order)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4fddb7cac3c6..13937e793fda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1219,10 +1219,16 @@ out:
 	return ret;
 }
 
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
 {
 	int i;
 
+	if (zero_tags) {
+		for (i = 0; i < numpages; i++)
+			tag_clear_highpage(page + i);
+		return;
+	}
+
 	/* s390's use of memset() could override KASAN redzones. */
 	kasan_disable_current();
 	for (i = 0; i < numpages; i++) {
@@ -1314,7 +1320,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 		bool init = want_init_on_free();
 
 		if (init)
-			kernel_init_free_pages(page, 1 << order);
+			kernel_init_free_pages(page, 1 << order, false);
 		if (!skip_kasan_poison)
 			kasan_poison_pages(page, order, init);
 	}
@@ -2349,7 +2355,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 
 		kasan_unpoison_pages(page, order, init);
 		if (init)
-			kernel_init_free_pages(page, 1 << order);
+			kernel_init_free_pages(page, 1 << order,
+					       gfp_flags & __GFP_ZEROTAGS);
 	}
 
 	set_page_owner(page, order, gfp_flags);
-- 
cgit v1.2.3


From c275c5c6d50a0518cdb0584e85905d10e7cefc6e Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Wed, 2 Jun 2021 16:52:30 -0700
Subject: kasan: disable freed user page poisoning with HW tags

Poisoning freed pages protects against kernel use-after-free. The
likelihood of such a bug involving kernel pages is significantly higher
than that for user pages. At the same time, poisoning freed pages can
impose a significant performance cost, which cannot always be justified
for user pages given the lower probability of finding a bug. Therefore,
disable freed user page poisoning when using HW tags. We identify
"user" pages via the flag set GFP_HIGHUSER_MOVABLE, which indicates
a strong likelihood of not being directly accessible to the kernel.

Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://linux-review.googlesource.com/id/I716846e2de8ef179f44e835770df7e6307be96c9
Link: https://lore.kernel.org/r/20210602235230.3928842-5-pcc@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/gfp.h            | 13 ++++++++++---
 include/linux/page-flags.h     |  9 +++++++++
 include/trace/events/mmflags.h |  9 ++++++++-
 mm/kasan/hw_tags.c             |  3 +++
 mm/page_alloc.c                | 12 +++++++-----
 5 files changed, 37 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 68ba237365dc..e6102dfa4faa 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -54,8 +54,9 @@ struct vm_area_struct;
 #define ___GFP_THISNODE		0x200000u
 #define ___GFP_ACCOUNT		0x400000u
 #define ___GFP_ZEROTAGS		0x800000u
+#define ___GFP_SKIP_KASAN_POISON	0x1000000u
 #ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP	0x1000000u
+#define ___GFP_NOLOCKDEP	0x2000000u
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
@@ -233,17 +234,22 @@ struct vm_area_struct;
  *
  * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if
  * __GFP_ZERO is set.
+ *
+ * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned
+ * on deallocation. Typically used for userspace pages. Currently only has an
+ * effect in HW tags mode.
  */
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
 #define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
+#define __GFP_SKIP_KASAN_POISON	((__force gfp_t)___GFP_SKIP_KASAN_POISON)
 
 /* Disable lockdep for GFP context tracking */
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (24 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /**
@@ -324,7 +330,8 @@ struct vm_area_struct;
 #define GFP_DMA		__GFP_DMA
 #define GFP_DMA32	__GFP_DMA32
 #define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
-#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE)
+#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | \
+			 __GFP_SKIP_KASAN_POISON)
 #define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
 			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
 #define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 04a34c08e0a6..40e2c5000585 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -137,6 +137,9 @@ enum pageflags {
 #endif
 #ifdef CONFIG_64BIT
 	PG_arch_2,
+#endif
+#ifdef CONFIG_KASAN_HW_TAGS
+	PG_skip_kasan_poison,
 #endif
 	__NR_PAGEFLAGS,
 
@@ -443,6 +446,12 @@ TESTCLEARFLAG(Young, young, PF_ANY)
 PAGEFLAG(Idle, idle, PF_ANY)
 #endif
 
+#ifdef CONFIG_KASAN_HW_TAGS
+PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
+#else
+PAGEFLAG_FALSE(SkipKASanPoison)
+#endif
+
 /*
  * PageReported() is used to track reported free pages within the Buddy
  * allocator. We can use the non-atomic version of the test and set
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 629c7a0eaff2..390270e00a1d 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -85,6 +85,12 @@
 #define IF_HAVE_PG_ARCH_2(flag,string)
 #endif
 
+#ifdef CONFIG_KASAN_HW_TAGS
+#define IF_HAVE_PG_SKIP_KASAN_POISON(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_SKIP_KASAN_POISON(flag,string)
+#endif
+
 #define __def_pageflag_names						\
 	{1UL << PG_locked,		"locked"	},		\
 	{1UL << PG_waiters,		"waiters"	},		\
@@ -112,7 +118,8 @@ IF_HAVE_PG_UNCACHED(PG_uncached,	"uncached"	)		\
 IF_HAVE_PG_HWPOISON(PG_hwpoison,	"hwpoison"	)		\
 IF_HAVE_PG_IDLE(PG_young,		"young"		)		\
 IF_HAVE_PG_IDLE(PG_idle,		"idle"		)		\
-IF_HAVE_PG_ARCH_2(PG_arch_2,		"arch_2"	)
+IF_HAVE_PG_ARCH_2(PG_arch_2,		"arch_2"	)		\
+IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison")
 
 #define show_page_flags(flags)						\
 	(flags) ? __print_flags(flags, "|",				\
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 41fd5326ee0a..ed5e5b833d61 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -246,6 +246,9 @@ void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
 	 */
 	bool init = !want_init_on_free() && want_init_on_alloc(flags);
 
+	if (flags & __GFP_SKIP_KASAN_POISON)
+		SetPageSkipKASanPoison(page);
+
 	if (flags & __GFP_ZEROTAGS) {
 		int i;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 13937e793fda..5ad76e540a22 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -394,11 +394,12 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
  * on-demand allocation and then freed again before the deferred pages
  * initialization is done, but this is not likely to happen.
  */
-static inline bool should_skip_kasan_poison(fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
 {
 	return static_branch_unlikely(&deferred_pages) ||
 	       (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
-		(fpi_flags & FPI_SKIP_KASAN_POISON));
+		(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+	       PageSkipKASanPoison(page);
 }
 
 /* Returns true if the struct page for the pfn is uninitialised */
@@ -449,10 +450,11 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 	return false;
 }
 #else
-static inline bool should_skip_kasan_poison(fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
 {
 	return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
-		(fpi_flags & FPI_SKIP_KASAN_POISON));
+		(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+	       PageSkipKASanPoison(page);
 }
 
 static inline bool early_page_uninitialised(unsigned long pfn)
@@ -1244,7 +1246,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 			unsigned int order, bool check_free, fpi_t fpi_flags)
 {
 	int bad = 0;
-	bool skip_kasan_poison = should_skip_kasan_poison(fpi_flags);
+	bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
-- 
cgit v1.2.3


From c7c90e121e992eefdf07945e5a6e9cf097b29463 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 1 Jun 2021 16:31:43 -0500
Subject: kconfig.h: explain IS_MODULE(), IS_ENABLED()

Extend IS_MODULE() and IS_ENABLED comments to explain why one might use
"#if IS_ENABLED(CONFIG_FOO)" instead of "#ifdef CONFIG_FOO".

To wit, "#ifdef CONFIG_FOO" is true only for CONFIG_FOO=y, while
"#if IS_ENABLED(CONFIG_FOO)" is true for both CONFIG_FOO=y and
CONFIG_FOO=m.

This is because "CONFIG_FOO=m" in .config does not result in "CONFIG_FOO"
being defined.  The actual definitions are in autoconf.h, where:

  CONFIG_FOO=y   results in   #define CONFIG_FOO 1
  CONFIG_FOO=m   results in   #define CONFIG_FOO_MODULE 1

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/kconfig.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index cc8fa109cfa3..20d1079e92b4 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -51,7 +51,8 @@
 
 /*
  * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0
- * otherwise.
+ * otherwise.  CONFIG_FOO=m results in "#define CONFIG_FOO_MODULE 1" in
+ * autoconf.h.
  */
 #define IS_MODULE(option) __is_defined(option##_MODULE)
 
@@ -66,7 +67,8 @@
 
 /*
  * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm',
- * 0 otherwise.
+ * 0 otherwise.  Note that CONFIG_FOO=y results in "#define CONFIG_FOO 1" in
+ * autoconf.h, while CONFIG_FOO=m results in "#define CONFIG_FOO_MODULE 1".
  */
 #define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option))
 
-- 
cgit v1.2.3


From 50c25ee97cf6ab011542167ab590c17012cea4ed Mon Sep 17 00:00:00 2001
From: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Date: Fri, 4 Jun 2021 20:01:08 -0700
Subject: Revert "MIPS: make userspace mapping young by default"

This reverts commit f685a533a7fab35c5d069dcd663f59c8e4171a75.

The MIPS cache flush logic needs to know whether the mapping was already
established to decide how to flush caches.  This is done by checking the
valid bit in the PTE.  The commit above breaks this logic by setting the
valid in the PTE in new mappings, which causes kernel crashes.

Link: https://lkml.kernel.org/r/20210526094335.92948-1-tsbogend@alpha.franken.de
Fixes: f685a533a7f ("MIPS: make userspace mapping young by default")
Reported-by: Zhou Yanjie <zhouyanjie@wanyeetech.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Huang Pei <huangpei@loongson.cn>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/mm/cache.c    | 30 ++++++++++++++----------------
 include/linux/pgtable.h |  8 ++++++++
 mm/memory.c             |  4 ++++
 3 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index a7bf0c80371c..830ab91e574f 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -158,31 +158,29 @@ unsigned long _page_cachable_default;
 EXPORT_SYMBOL(_page_cachable_default);
 
 #define PM(p)	__pgprot(_page_cachable_default | (p))
-#define PVA(p)	PM(_PAGE_VALID | _PAGE_ACCESSED | (p))
 
 static inline void setup_protection_map(void)
 {
 	protection_map[0]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[1]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[2]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[3]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[4]  = PVA(_PAGE_PRESENT);
-	protection_map[5]  = PVA(_PAGE_PRESENT);
-	protection_map[6]  = PVA(_PAGE_PRESENT);
-	protection_map[7]  = PVA(_PAGE_PRESENT);
+	protection_map[1]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[2]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
+	protection_map[3]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[4]  = PM(_PAGE_PRESENT);
+	protection_map[5]  = PM(_PAGE_PRESENT);
+	protection_map[6]  = PM(_PAGE_PRESENT);
+	protection_map[7]  = PM(_PAGE_PRESENT);
 
 	protection_map[8]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[9]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[10] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE |
+	protection_map[9]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE |
 				_PAGE_NO_READ);
-	protection_map[11] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE);
-	protection_map[12] = PVA(_PAGE_PRESENT);
-	protection_map[13] = PVA(_PAGE_PRESENT);
-	protection_map[14] = PVA(_PAGE_PRESENT);
-	protection_map[15] = PVA(_PAGE_PRESENT);
+	protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE);
+	protection_map[12] = PM(_PAGE_PRESENT);
+	protection_map[13] = PM(_PAGE_PRESENT);
+	protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE);
+	protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE);
 }
 
-#undef _PVA
 #undef PM
 
 void cpu_cache_init(void)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 46b13780c2c8..a43047b1030d 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -432,6 +432,14 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
  * To be differentiate with macro pte_mkyoung, this macro is used on platforms
  * where software maintains page access bit.
  */
+#ifndef pte_sw_mkyoung
+static inline pte_t pte_sw_mkyoung(pte_t pte)
+{
+	return pte;
+}
+#define pte_sw_mkyoung	pte_sw_mkyoung
+#endif
+
 #ifndef pte_savedwrite
 #define pte_savedwrite pte_write
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 730daa00952b..f3ffab9b9e39 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2939,6 +2939,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
+		entry = pte_sw_mkyoung(entry);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 		/*
@@ -3602,6 +3603,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	__SetPageUptodate(page);
 
 	entry = mk_pte(page, vma->vm_page_prot);
+	entry = pte_sw_mkyoung(entry);
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
@@ -3786,6 +3788,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 
 	if (prefault && arch_wants_old_prefaulted_pte())
 		entry = pte_mkold(entry);
+	else
+		entry = pte_sw_mkyoung(entry);
 
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-- 
cgit v1.2.3


From 4d5c8aedc8aa6a1f5d1b06eb4f5517dc60dd9440 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 2 Jun 2021 18:09:30 -0700
Subject: mm, memcg: introduce mem_cgroup_kmem_disabled()

Introduce a new mem_cgroup_kmem_disabled() helper, similar to
mem_cgroup_disabled(), to check whether the kernel memory accounting
is off. A user could disable it using a boot option to eliminate
some associated costs.

The helper can be used outside of memcontrol.c to dynamically disable
the kmem-related code. The returned value is stable after the kernel
initialization is finished.

Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/memcontrol.h | 5 +++++
 mm/memcontrol.c            | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c04d39a7967..8ef51c58f470 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1583,6 +1583,7 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
+bool mem_cgroup_kmem_disabled(void);
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_page(struct page *page, int order);
 
@@ -1636,6 +1637,10 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 struct mem_cgroup *mem_cgroup_from_obj(void *p);
 
 #else
+static inline bool mem_cgroup_kmem_disabled(void)
+{
+	return true;
+}
 
 static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
 					 int order)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6203ee24a11..1fa9b00ec71d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 #ifdef CONFIG_MEMCG_KMEM
 extern spinlock_t css_set_lock;
 
+bool mem_cgroup_kmem_disabled(void)
+{
+	return cgroup_memory_nokmem;
+}
+
 static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
 			       unsigned int nr_pages);
 static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
-- 
cgit v1.2.3


From 6abee582034c123d995cd454a1ccdcf0b8699da0 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sat, 5 Jun 2021 17:04:25 -0700
Subject: Input: cy8ctmg110_ts - rely on platform code to supply interrupt

Instead of using platform data to specify GPIO that is used as interrupt
source, rely on the platform and I2C core to set it up properly.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210603043726.3793876-1-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/cy8ctmg110_ts.c | 32 +------------------------------
 include/linux/input/cy8ctmg110_pdata.h    |  1 -
 2 files changed, 1 insertion(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/cy8ctmg110_ts.c b/drivers/input/touchscreen/cy8ctmg110_ts.c
index f465bae618fe..691f35f1bdd7 100644
--- a/drivers/input/touchscreen/cy8ctmg110_ts.c
+++ b/drivers/input/touchscreen/cy8ctmg110_ts.c
@@ -46,7 +46,6 @@ struct cy8ctmg110 {
 	char phys[32];
 	struct i2c_client *client;
 	int reset_pin;
-	int irq_pin;
 };
 
 /*
@@ -191,7 +190,6 @@ static int cy8ctmg110_probe(struct i2c_client *client,
 	ts->client = client;
 	ts->input = input_dev;
 	ts->reset_pin = pdata->reset_pin;
-	ts->irq_pin = pdata->irq_pin;
 
 	snprintf(ts->phys, sizeof(ts->phys),
 		 "%s/input0", dev_name(&client->dev));
@@ -222,38 +220,13 @@ static int cy8ctmg110_probe(struct i2c_client *client,
 	cy8ctmg110_power(ts, true);
 	cy8ctmg110_set_sleepmode(ts, false);
 
-	err = gpio_request(ts->irq_pin, "touch_irq_key");
-	if (err < 0) {
-		dev_err(&client->dev,
-			"Failed to request GPIO %d, error %d\n",
-			ts->irq_pin, err);
-		goto err_shutoff_device;
-	}
-
-	err = gpio_direction_input(ts->irq_pin);
-	if (err < 0) {
-		dev_err(&client->dev,
-			"Failed to configure input direction for GPIO %d, error %d\n",
-			ts->irq_pin, err);
-		goto err_free_irq_gpio;
-	}
-
-	client->irq = gpio_to_irq(ts->irq_pin);
-	if (client->irq < 0) {
-		err = client->irq;
-		dev_err(&client->dev,
-			"Unable to get irq number for GPIO %d, error %d\n",
-			ts->irq_pin, err);
-		goto err_free_irq_gpio;
-	}
-
 	err = request_threaded_irq(client->irq, NULL, cy8ctmg110_irq_thread,
 				   IRQF_TRIGGER_RISING | IRQF_ONESHOT,
 				   "touch_reset_key", ts);
 	if (err < 0) {
 		dev_err(&client->dev,
 			"irq %d busy? error %d\n", client->irq, err);
-		goto err_free_irq_gpio;
+		goto err_shutoff_device;
 	}
 
 	err = input_register_device(input_dev);
@@ -266,8 +239,6 @@ static int cy8ctmg110_probe(struct i2c_client *client,
 
 err_free_irq:
 	free_irq(client->irq, ts);
-err_free_irq_gpio:
-	gpio_free(ts->irq_pin);
 err_shutoff_device:
 	cy8ctmg110_set_sleepmode(ts, true);
 	cy8ctmg110_power(ts, false);
@@ -318,7 +289,6 @@ static int cy8ctmg110_remove(struct i2c_client *client)
 
 	free_irq(client->irq, ts);
 	input_unregister_device(ts->input);
-	gpio_free(ts->irq_pin);
 	if (ts->reset_pin)
 		gpio_free(ts->reset_pin);
 	kfree(ts);
diff --git a/include/linux/input/cy8ctmg110_pdata.h b/include/linux/input/cy8ctmg110_pdata.h
index 77582ae1745a..ee1d44545f30 100644
--- a/include/linux/input/cy8ctmg110_pdata.h
+++ b/include/linux/input/cy8ctmg110_pdata.h
@@ -5,7 +5,6 @@
 struct cy8ctmg110_pdata
 {
 	int reset_pin;		/* Reset pin is wired to this GPIO (optional) */
-	int irq_pin;		/* IRQ pin is wired to this GPIO */
 };
 
 #endif
-- 
cgit v1.2.3


From aa5c8b25392800bbefa82dd19eeff8ebbf261ace Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Thu, 3 Jun 2021 17:58:35 +0100
Subject: i2c: core: Add stub for i2c_verify_client() if !CONFIG_I2C

If I2C is not compiled, there is no way we should see a call to
i2c_verify_client() on a device that is an i2c client. As such,
provide a stub to return NULL to resolve an associated build failure.

The build is failing with this link error
ld: fxls8962af-core.o: in function `fxls8962af_fifo_transfer':
fxls8962af-core.c: undefined reference to `i2c_verify_client'

Reported-by: Tom Rix <trix@redhat.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Fixes: af959b7b96b8 ("iio: accel: fxls8962af: fix errata bug E3 - I2C burst reads")
Reviewed-by: Sean Nyekjaer <sean@geanix.com>
Acked-by: Wolfram Sang <wsa@kernel.org>
Link: https://lore.kernel.org/r/20210603165835.3594557-1-jic23@kernel.org
---
 include/linux/i2c.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e8f2ac8c9c3d..7d71131c394e 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -343,7 +343,6 @@ struct i2c_client {
 };
 #define to_i2c_client(d) container_of(d, struct i2c_client, dev)
 
-struct i2c_client *i2c_verify_client(struct device *dev);
 struct i2c_adapter *i2c_verify_adapter(struct device *dev);
 const struct i2c_device_id *i2c_match_id(const struct i2c_device_id *id,
 					 const struct i2c_client *client);
@@ -477,6 +476,13 @@ i2c_new_ancillary_device(struct i2c_client *client,
 			 u16 default_addr);
 
 void i2c_unregister_device(struct i2c_client *client);
+
+struct i2c_client *i2c_verify_client(struct device *dev);
+#else
+static inline struct i2c_client *i2c_verify_client(struct device *dev)
+{
+	return NULL;
+}
 #endif /* I2C */
 
 /* Mainboard arch_initcall() code should register all its I2C devices.
-- 
cgit v1.2.3


From 64c2c2c62f92339b176ea24403d8db16db36f9e6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 May 2021 16:07:48 +0200
Subject: quota: Change quotactl_path() systcall to an fd-based one

Some users have pointed out that path-based syscalls are problematic in
some environments and at least directory fd argument and possibly also
resolve flags are desirable for such syscalls. Rather than
reimplementing all details of pathname lookup and following where it may
eventually evolve, let's go for full file descriptor based syscall
similar to how ioctl(2) works since the beginning. Managing of quotas
isn't performance sensitive so the extra overhead of open does not
matter and we are able to consume O_PATH descriptors as well which makes
open cheap anyway. Also for frequent operations (such as retrieving
usage information for all users) we can reuse single fd and in fact get
even better performance as well as avoiding races with possible remounts
etc.

Tested-by: Sascha Hauer <s.hauer@pengutronix.de>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/quota.c                  | 28 +++++++++++++---------------
 include/linux/syscalls.h          |  4 ++--
 include/uapi/asm-generic/unistd.h |  4 ++--
 kernel/sys_ni.c                   |  2 +-
 4 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 05e4bd9ab6d6..2bcc9a6f1bfc 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -968,31 +968,30 @@ out:
 	return ret;
 }
 
-SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *,
-		mountpoint, qid_t, id, void __user *, addr)
+SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
+		qid_t, id, void __user *, addr)
 {
 	struct super_block *sb;
-	struct path mountpath;
 	unsigned int cmds = cmd >> SUBCMDSHIFT;
 	unsigned int type = cmd & SUBCMDMASK;
+	struct fd f;
 	int ret;
 
-	if (type >= MAXQUOTAS)
-		return -EINVAL;
+	f = fdget_raw(fd);
+	if (!f.file)
+		return -EBADF;
 
-	ret = user_path_at(AT_FDCWD, mountpoint,
-			     LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, &mountpath);
-	if (ret)
-		return ret;
-
-	sb = mountpath.mnt->mnt_sb;
+	ret = -EINVAL;
+	if (type >= MAXQUOTAS)
+		goto out;
 
 	if (quotactl_cmd_write(cmds)) {
-		ret = mnt_want_write(mountpath.mnt);
+		ret = mnt_want_write(f.file->f_path.mnt);
 		if (ret)
 			goto out;
 	}
 
+	sb = f.file->f_path.mnt->mnt_sb;
 	if (quotactl_cmd_onoff(cmds))
 		down_write(&sb->s_umount);
 	else
@@ -1006,9 +1005,8 @@ SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *,
 		up_read(&sb->s_umount);
 
 	if (quotactl_cmd_write(cmds))
-		mnt_drop_write(mountpath.mnt);
+		mnt_drop_write(f.file->f_path.mnt);
 out:
-	path_put(&mountpath);
-
+	fdput(f);
 	return ret;
 }
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 050511e8f1f8..586128d5c3b8 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -485,8 +485,8 @@ asmlinkage long sys_pipe2(int __user *fildes, int flags);
 /* fs/quota.c */
 asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special,
 				qid_t id, void __user *addr);
-asmlinkage long sys_quotactl_path(unsigned int cmd, const char __user *mountpoint,
-				  qid_t id, void __user *addr);
+asmlinkage long sys_quotactl_fd(unsigned int fd, unsigned int cmd, qid_t id,
+				void __user *addr);
 
 /* fs/readdir.c */
 asmlinkage long sys_getdents64(unsigned int fd,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 6de5a7fc066b..f211961ce1da 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -863,8 +863,8 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise)
 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
 #define __NR_mount_setattr 442
 __SYSCALL(__NR_mount_setattr, sys_mount_setattr)
-#define __NR_quotactl_path 443
-__SYSCALL(__NR_quotactl_path, sys_quotactl_path)
+#define __NR_quotactl_fd 443
+__SYSCALL(__NR_quotactl_fd, sys_quotactl_fd)
 
 #define __NR_landlock_create_ruleset 444
 __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0ea8128468c3..dad4d994641e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -99,7 +99,7 @@ COND_SYSCALL(flock);
 
 /* fs/quota.c */
 COND_SYSCALL(quotactl);
-COND_SYSCALL(quotactl_path);
+COND_SYSCALL(quotactl_fd);
 
 /* fs/readdir.c */
 
-- 
cgit v1.2.3


From ef4b65e53cc77e2b3ca4667b461047ad04fb45fa Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 31 May 2021 00:08:09 +0200
Subject: netfilter: nfnetlink: add struct nfgenmsg to struct nfnl_info and use
 it

Update the nfnl_info structure to add a pointer to the nfnetlink header.
This simplifies the existing codebase since this header is usually
accessed. Update existing clients to use this new field.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h  |  1 +
 net/netfilter/nf_conntrack_netlink.c | 23 ++++++---------
 net/netfilter/nf_tables_api.c        | 55 +++++++++++++-----------------------
 net/netfilter/nfnetlink.c            |  2 ++
 net/netfilter/nfnetlink_log.c        |  5 ++--
 net/netfilter/nfnetlink_queue.c      |  9 ++----
 net/netfilter/nft_compat.c           | 17 ++++-------
 7 files changed, 42 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 515ce53aa20d..241e005f290a 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -11,6 +11,7 @@ struct nfnl_info {
 	struct net		*net;
 	struct sock		*sk;
 	const struct nlmsghdr	*nlh;
+	const struct nfgenmsg	*nfmsg;
 	struct netlink_ext_ack	*extack;
 };
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 220f51f055ab..4e1a9dba7077 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1528,7 +1528,7 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb,
 				   const struct nfnl_info *info,
 				   const struct nlattr * const cda[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	u8 family = info->nfmsg->nfgen_family;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_zone zone;
@@ -1541,12 +1541,12 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb,
 
 	if (cda[CTA_TUPLE_ORIG])
 		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
-					    nfmsg->nfgen_family, &zone);
+					    family, &zone);
 	else if (cda[CTA_TUPLE_REPLY])
 		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
-					    nfmsg->nfgen_family, &zone);
+					    family, &zone);
 	else {
-		u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC;
+		u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC;
 
 		return ctnetlink_flush_conntrack(info->net, cda,
 						 NETLINK_CB(skb).portid,
@@ -1586,8 +1586,7 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
 				   const struct nfnl_info *info,
 				   const struct nlattr * const cda[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
-	u_int8_t u3 = nfmsg->nfgen_family;
+	u_int8_t u3 = info->nfmsg->nfgen_family;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_zone zone;
@@ -2363,10 +2362,9 @@ static int ctnetlink_new_conntrack(struct sk_buff *skb,
 				   const struct nfnl_info *info,
 				   const struct nlattr * const cda[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct nf_conntrack_tuple otuple, rtuple;
 	struct nf_conntrack_tuple_hash *h = NULL;
-	u_int8_t u3 = nfmsg->nfgen_family;
+	u_int8_t u3 = info->nfmsg->nfgen_family;
 	struct nf_conntrack_zone zone;
 	struct nf_conn *ct;
 	int err;
@@ -3259,8 +3257,7 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
 				const struct nfnl_info *info,
 				const struct nlattr * const cda[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
-	u_int8_t u3 = nfmsg->nfgen_family;
+	u_int8_t u3 = info->nfmsg->nfgen_family;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_expect *exp;
 	struct nf_conntrack_zone zone;
@@ -3349,8 +3346,7 @@ static int ctnetlink_del_expect(struct sk_buff *skb,
 				const struct nfnl_info *info,
 				const struct nlattr * const cda[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
-	u_int8_t u3 = nfmsg->nfgen_family;
+	u_int8_t u3 = info->nfmsg->nfgen_family;
 	struct nf_conntrack_expect *exp;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_zone zone;
@@ -3601,8 +3597,7 @@ static int ctnetlink_new_expect(struct sk_buff *skb,
 				const struct nfnl_info *info,
 				const struct nlattr * const cda[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
-	u_int8_t u3 = nfmsg->nfgen_family;
+	u_int8_t u3 = info->nfmsg->nfgen_family;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_expect *exp;
 	struct nf_conntrack_zone zone;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d63d2d8f769c..b2b4e03ce036 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -861,10 +861,9 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
 static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info,
 			      const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	const struct nft_table *table;
 	struct net *net = info->net;
 	struct sk_buff *skb2;
@@ -1059,10 +1058,9 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
 			      const struct nlattr * const nla[])
 {
 	struct nftables_pernet *nft_net = nft_pernet(info->net);
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
@@ -1254,10 +1252,9 @@ out:
 static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
 			      const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
@@ -1627,10 +1624,9 @@ done:
 static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
 			      const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	const struct nft_chain *chain;
 	struct net *net = info->net;
 	struct nft_table *table;
@@ -2355,10 +2351,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
 			      const struct nlattr * const nla[])
 {
 	struct nftables_pernet *nft_net = nft_pernet(info->net);
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	struct nft_chain *chain = NULL;
 	struct net *net = info->net;
 	const struct nlattr *attr;
@@ -2453,10 +2448,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
 static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
 			      const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
@@ -3080,10 +3074,9 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
 			     const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
 	struct net *net = info->net;
@@ -3221,13 +3214,12 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
 			     const struct nlattr * const nla[])
 {
 	struct nftables_pernet *nft_net = nft_pernet(info->net);
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	unsigned int size, i, n, ulen = 0, usize = 0;
 	u8 genmask = nft_genmask_next(info->net);
 	struct nft_rule *rule, *old_rule = NULL;
 	struct nft_expr_info *expr_info = NULL;
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	struct net *net = info->net;
 	struct nft_flow_rule *flow;
 	struct nft_userdata *udata;
@@ -3459,15 +3451,15 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
 static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
 			     const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
-	int family = nfmsg->nfgen_family, err = 0;
 	u8 genmask = nft_genmask_next(info->net);
+	u8 family = info->nfmsg->nfgen_family;
 	struct nft_chain *chain = NULL;
 	struct net *net = info->net;
 	struct nft_table *table;
 	struct nft_rule *rule;
 	struct nft_ctx ctx;
+	int err = 0;
 
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
 				 NETLINK_CB(skb).portid);
@@ -4050,7 +4042,6 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
 static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
 			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
 	struct net *net = info->net;
@@ -4078,7 +4069,7 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
 	}
 
 	/* Only accept unspec with dump */
-	if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+	if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
 		return -EAFNOSUPPORT;
 	if (!nla[NFTA_SET_TABLE])
 		return -EINVAL;
@@ -4171,11 +4162,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
 static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
 			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	u32 ktype, dtype, flags, policy, gc_int, objtype;
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	const struct nft_set_ops *ops;
 	struct nft_expr *expr = NULL;
 	struct net *net = info->net;
@@ -4475,7 +4465,6 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
 static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
 			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
 	struct net *net = info->net;
@@ -4484,7 +4473,7 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
 	struct nft_ctx ctx;
 	int err;
 
-	if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+	if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
 		return -EAFNOSUPPORT;
 	if (nla[NFTA_SET_TABLE] == NULL)
 		return -EINVAL;
@@ -6527,11 +6516,10 @@ err_free_trans:
 static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
 			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
+	u8 family = info->nfmsg->nfgen_family;
 	const struct nft_object_type *type;
-	int family = nfmsg->nfgen_family;
 	struct net *net = info->net;
 	struct nft_table *table;
 	struct nft_object *obj;
@@ -6783,10 +6771,9 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
 			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	const struct nft_table *table;
 	struct net *net = info->net;
 	struct nft_object *obj;
@@ -6873,10 +6860,9 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
 static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
 			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
@@ -7304,12 +7290,11 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
 				  const struct nfnl_info *info,
 				  const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	struct nft_flowtable_hook flowtable_hook;
 	u8 genmask = nft_genmask_next(info->net);
+	u8 family = info->nfmsg->nfgen_family;
 	const struct nf_flowtable_type *type;
-	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
 	struct nft_hook *hook, *next;
 	struct net *net = info->net;
@@ -7493,10 +7478,9 @@ static int nf_tables_delflowtable(struct sk_buff *skb,
 				  const struct nfnl_info *info,
 				  const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
 	struct net *net = info->net;
 	const struct nlattr *attr;
@@ -7688,9 +7672,8 @@ static int nf_tables_getflowtable(struct sk_buff *skb,
 				  const struct nfnl_info *info,
 				  const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	u8 genmask = nft_genmask_cur(info->net);
-	int family = nfmsg->nfgen_family;
+	u8 family = info->nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
 	const struct nft_table *table;
 	struct net *net = info->net;
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e8dbd8379027..028a1f39318b 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -256,6 +256,7 @@ replay:
 			.net	= net,
 			.sk	= nfnlnet->nfnl,
 			.nlh	= nlh,
+			.nfmsg	= nlmsg_data(nlh),
 			.extack	= extack,
 		};
 
@@ -491,6 +492,7 @@ replay_abort:
 				.net	= net,
 				.sk	= nfnlnet->nfnl,
 				.nlh	= nlh,
+				.nfmsg	= nlmsg_data(nlh),
 				.extack	= &extack,
 			};
 
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 587086b18c36..691ef4cffdd9 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -871,15 +871,14 @@ static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
 			      const struct nlattr * const nfula[])
 {
 	struct nfnl_log_net *log = nfnl_log_pernet(info->net);
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
-	u_int16_t group_num = ntohs(nfmsg->res_id);
+	u_int16_t group_num = ntohs(info->nfmsg->res_id);
 	struct nfulnl_msg_config_cmd *cmd = NULL;
 	struct nfulnl_instance *inst;
 	u16 flags = 0;
 	int ret = 0;
 
 	if (nfula[NFULA_CFG_CMD]) {
-		u_int8_t pf = nfmsg->nfgen_family;
+		u_int8_t pf = info->nfmsg->nfgen_family;
 		cmd = nla_data(nfula[NFULA_CFG_CMD]);
 
 		/* Commands without queue context */
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f37a575ebd7f..f774de0fc24f 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1051,8 +1051,7 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
 				    const struct nlattr * const nfqa[])
 {
 	struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
-	u16 queue_num = ntohs(nfmsg->res_id);
+	u16 queue_num = ntohs(info->nfmsg->res_id);
 	struct nf_queue_entry *entry, *tmp;
 	struct nfqnl_msg_verdict_hdr *vhdr;
 	struct nfqnl_instance *queue;
@@ -1160,8 +1159,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
 			      const struct nlattr * const nfqa[])
 {
 	struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
-	u_int16_t queue_num = ntohs(nfmsg->res_id);
+	u_int16_t queue_num = ntohs(info->nfmsg->res_id);
 	struct nfqnl_msg_verdict_hdr *vhdr;
 	enum ip_conntrack_info ctinfo;
 	struct nfqnl_instance *queue;
@@ -1243,8 +1241,7 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
 			     const struct nlattr * const nfqa[])
 {
 	struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
-	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
-	u_int16_t queue_num = ntohs(nfmsg->res_id);
+	u_int16_t queue_num = ntohs(info->nfmsg->res_id);
 	struct nfqnl_msg_config_cmd *cmd = NULL;
 	struct nfqnl_instance *queue;
 	__u32 flags = 0, mask = 0;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 3144a9ad2f6a..639c337c885b 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -625,7 +625,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb,
 			       const struct nfnl_info *info,
 			       const struct nlattr * const tb[])
 {
-	struct nfgenmsg *nfmsg;
+	u8 family = info->nfmsg->nfgen_family;
 	const char *name, *fmt;
 	struct sk_buff *skb2;
 	int ret = 0, target;
@@ -640,9 +640,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb,
 	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
 	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
 
-	nfmsg = nlmsg_data(info->nlh);
-
-	switch(nfmsg->nfgen_family) {
+	switch(family) {
 	case AF_INET:
 		fmt = "ipt_%s";
 		break;
@@ -656,8 +654,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb,
 		fmt = "arpt_%s";
 		break;
 	default:
-		pr_err("nft_compat: unsupported protocol %d\n",
-			nfmsg->nfgen_family);
+		pr_err("nft_compat: unsupported protocol %d\n", family);
 		return -EINVAL;
 	}
 
@@ -665,9 +662,8 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb,
 		return -EINVAL;
 
 	rcu_read_unlock();
-	try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
-						 rev, target, &ret),
-						 fmt, name);
+	try_then_request_module(xt_find_revision(family, name, rev, target, &ret),
+				fmt, name);
 	if (ret < 0)
 		goto out_put;
 
@@ -682,8 +678,7 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb,
 				  info->nlh->nlmsg_seq,
 				  NFNL_MSG_TYPE(info->nlh->nlmsg_type),
 				  NFNL_MSG_COMPAT_GET,
-				  nfmsg->nfgen_family,
-				  name, ret, target) <= 0) {
+				  family, name, ret, target) <= 0) {
 		kfree_skb(skb2);
 		goto out_put;
 	}
-- 
cgit v1.2.3


From 7b4b2fa37587394fb89fa51a4bea0820a1b37a5d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 4 Jun 2021 12:27:06 +0200
Subject: netfilter: annotate nf_tables base hook ops

This will allow a followup patch to treat the 'ops->priv' pointer
as nft_chain argument without having to first walk the table/chains
to check if there is a matching base chain pointer.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h     | 8 +++++++-
 net/netfilter/nf_tables_api.c | 4 +++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index f161569fbe2f..3fda1a508733 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -77,12 +77,18 @@ struct nf_hook_state {
 typedef unsigned int nf_hookfn(void *priv,
 			       struct sk_buff *skb,
 			       const struct nf_hook_state *state);
+enum nf_hook_ops_type {
+	NF_HOOK_OP_UNDEFINED,
+	NF_HOOK_OP_NF_TABLES,
+};
+
 struct nf_hook_ops {
 	/* User fills in from here down. */
 	nf_hookfn		*hook;
 	struct net_device	*dev;
 	void			*priv;
-	u_int8_t		pf;
+	u8			pf;
+	enum nf_hook_ops_type	hook_ops_type:8;
 	unsigned int		hooknum;
 	/* Hooks are ordered in ascending priority. */
 	int			priority;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6c2000a11c7e..c9308241b688 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2168,8 +2168,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 	}
 
 	nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
-	if (nft_is_base_chain(chain))
+	if (nft_is_base_chain(chain)) {
+		basechain->ops.hook_ops_type = NF_HOOK_OP_NF_TABLES;
 		nft_trans_chain_policy(trans) = policy;
+	}
 
 	err = nft_chain_add(table, chain);
 	if (err < 0) {
-- 
cgit v1.2.3


From 5c1a72a0fbe1b02c3ce0537f85f92ea935e0beec Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 4 Jun 2021 19:50:45 +0300
Subject: ACPI: property: Constify stubs for CONFIG_ACPI=n case

There is a few stubs that left untouched during constification of
the fwnode related APIs. Constify three more stubs here.

Fixes: 8b9d6802583a ("ACPI: Constify acpi_bus helper functions, switch to macros")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
[ rjw: Subject edit ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..40657f220f8b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -765,7 +765,7 @@ static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode)
 	return false;
 }
 
-static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwnode)
+static inline struct acpi_device *to_acpi_device_node(const struct fwnode_handle *fwnode)
 {
 	return NULL;
 }
@@ -775,12 +775,12 @@ static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode)
 	return false;
 }
 
-static inline struct acpi_data_node *to_acpi_data_node(struct fwnode_handle *fwnode)
+static inline struct acpi_data_node *to_acpi_data_node(const struct fwnode_handle *fwnode)
 {
 	return NULL;
 }
 
-static inline bool acpi_data_node_match(struct fwnode_handle *fwnode,
+static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode,
 					const char *name)
 {
 	return false;
-- 
cgit v1.2.3


From 3d7c821c1d8071e517048c8b4afdf33109441c0f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 4 Jun 2021 19:50:46 +0300
Subject: ACPI: scan: Constify acpi_dma_supported() helper function

Constify arguments to acpi_dma_supported(). The function doesn't need
to change the content of the passed argument and when it's const it
allows to supply the result of other functions that may return a pointer
to a constant object.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
[ rjw: Subject edit ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/scan.c     | 2 +-
 include/acpi/acpi_bus.h | 2 +-
 include/linux/acpi.h    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index a22778e880c2..609405ca11e2 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1404,7 +1404,7 @@ void acpi_free_pnp_ids(struct acpi_device_pnp *pnp)
  *
  * Return false if DMA is not supported. Otherwise, return true
  */
-bool acpi_dma_supported(struct acpi_device *adev)
+bool acpi_dma_supported(const struct acpi_device *adev)
 {
 	if (!adev)
 		return false;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 3a82faac5767..2fbd2c2e0568 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -586,7 +586,7 @@ struct acpi_pci_root {
 
 /* helper */
 
-bool acpi_dma_supported(struct acpi_device *adev);
+bool acpi_dma_supported(const struct acpi_device *adev);
 enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
 int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
 		       u64 *size);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 40657f220f8b..97f1c5588b5a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -911,7 +911,7 @@ acpi_create_platform_device(struct acpi_device *adev,
 	return NULL;
 }
 
-static inline bool acpi_dma_supported(struct acpi_device *adev)
+static inline bool acpi_dma_supported(const struct acpi_device *adev)
 {
 	return false;
 }
-- 
cgit v1.2.3


From fb38f314fbd173e2e9f9f0f2e720a5f4889562da Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 4 Jun 2021 19:50:47 +0300
Subject: device property: Unify access to of_node

Historically we have a few variants how we access dev->fwnode
and dev->of_node. Some of the functions during development
gained different versions of the getters. Unify access to of_node
and as a side change slightly refactor ACPI specific branches.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 29 +++++++++++++----------------
 include/linux/property.h |  2 +-
 2 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index dd98759d688b..1f533b314efc 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -759,13 +759,8 @@ EXPORT_SYMBOL_GPL(fwnode_get_next_available_child_node);
 struct fwnode_handle *device_get_next_child_node(struct device *dev,
 						 struct fwnode_handle *child)
 {
-	struct acpi_device *adev = ACPI_COMPANION(dev);
-	struct fwnode_handle *fwnode = NULL, *next;
-
-	if (dev->of_node)
-		fwnode = of_fwnode_handle(dev->of_node);
-	else if (adev)
-		fwnode = acpi_fwnode_handle(adev);
+	const struct fwnode_handle *fwnode = dev_fwnode(dev);
+	struct fwnode_handle *next;
 
 	/* Try to find a child in primary fwnode */
 	next = fwnode_get_next_child_node(fwnode, child);
@@ -868,28 +863,31 @@ EXPORT_SYMBOL_GPL(device_get_child_node_count);
 
 bool device_dma_supported(struct device *dev)
 {
+	const struct fwnode_handle *fwnode = dev_fwnode(dev);
+
 	/* For DT, this is always supported.
 	 * For ACPI, this depends on CCA, which
 	 * is determined by the acpi_dma_supported().
 	 */
-	if (IS_ENABLED(CONFIG_OF) && dev->of_node)
+	if (is_of_node(fwnode))
 		return true;
 
-	return acpi_dma_supported(ACPI_COMPANION(dev));
+	return acpi_dma_supported(to_acpi_device_node(fwnode));
 }
 EXPORT_SYMBOL_GPL(device_dma_supported);
 
 enum dev_dma_attr device_get_dma_attr(struct device *dev)
 {
+	const struct fwnode_handle *fwnode = dev_fwnode(dev);
 	enum dev_dma_attr attr = DEV_DMA_NOT_SUPPORTED;
 
-	if (IS_ENABLED(CONFIG_OF) && dev->of_node) {
-		if (of_dma_is_coherent(dev->of_node))
+	if (is_of_node(fwnode)) {
+		if (of_dma_is_coherent(to_of_node(fwnode)))
 			attr = DEV_DMA_COHERENT;
 		else
 			attr = DEV_DMA_NON_COHERENT;
 	} else
-		attr = acpi_get_dma_attr(ACPI_COMPANION(dev));
+		attr = acpi_get_dma_attr(to_acpi_device_node(fwnode));
 
 	return attr;
 }
@@ -1007,14 +1005,13 @@ EXPORT_SYMBOL(device_get_mac_address);
  * Returns Linux IRQ number on success. Other values are determined
  * accordingly to acpi_/of_ irq_get() operation.
  */
-int fwnode_irq_get(struct fwnode_handle *fwnode, unsigned int index)
+int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index)
 {
-	struct device_node *of_node = to_of_node(fwnode);
 	struct resource res;
 	int ret;
 
-	if (IS_ENABLED(CONFIG_OF) && of_node)
-		return of_irq_get(of_node, index);
+	if (is_of_node(fwnode))
+		return of_irq_get(to_of_node(fwnode), index);
 
 	ret = acpi_irq_get(ACPI_HANDLE_FWNODE(fwnode), index, &res);
 	if (ret)
diff --git a/include/linux/property.h b/include/linux/property.h
index 0d876316e61d..073e680c35e2 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -119,7 +119,7 @@ struct fwnode_handle *device_get_named_child_node(struct device *dev,
 struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode);
 void fwnode_handle_put(struct fwnode_handle *fwnode);
 
-int fwnode_irq_get(struct fwnode_handle *fwnode, unsigned int index);
+int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index);
 
 unsigned int device_get_child_node_count(struct device *dev);
 
-- 
cgit v1.2.3


From 9b3c47f124b60770f7738710e95801284d69d24f Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Sat, 5 Jun 2021 00:58:57 +0200
Subject: gpio: regmap: move drvdata to config data

Drop gpio_regmap_set_drvdata() and instead add it to the configuration
data passed to gpio_regmap_register().

gpio_regmap_set_drvdata() can't really be used in a race free way. This
is because the gpio_regmap object which is needed by _set_drvdata() is
returned by gpio_regmap_register(). On the other hand, the callbacks
which use the drvdata might already be called right after the
gpiochip_add() call in gpio_regmap_register(). Therefore, we have to
provide the drvdata early before we call gpiochip_add().

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
 drivers/gpio/gpio-regmap.c  | 7 +------
 include/linux/gpio/regmap.h | 6 +++++-
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c
index 1ead1290eb3f..69c219742083 100644
--- a/drivers/gpio/gpio-regmap.c
+++ b/drivers/gpio/gpio-regmap.c
@@ -178,12 +178,6 @@ static int gpio_regmap_direction_output(struct gpio_chip *chip,
 	return gpio_regmap_set_direction(chip, offset, true);
 }
 
-void gpio_regmap_set_drvdata(struct gpio_regmap *gpio, void *data)
-{
-	gpio->driver_data = data;
-}
-EXPORT_SYMBOL_GPL(gpio_regmap_set_drvdata);
-
 void *gpio_regmap_get_drvdata(struct gpio_regmap *gpio)
 {
 	return gpio->driver_data;
@@ -226,6 +220,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 		return ERR_PTR(-ENOMEM);
 
 	gpio->parent = config->parent;
+	gpio->driver_data = config->drvdata;
 	gpio->regmap = config->regmap;
 	gpio->ngpio_per_reg = config->ngpio_per_reg;
 	gpio->reg_stride = config->reg_stride;
diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h
index 334dd928042b..a9f7b7faf57b 100644
--- a/include/linux/gpio/regmap.h
+++ b/include/linux/gpio/regmap.h
@@ -37,6 +37,9 @@ struct regmap;
  *			offset to a register/bitmask pair. If not
  *			given the default gpio_regmap_simple_xlate()
  *			is used.
+ * @drvdata:		(Optional) Pointer to driver specific data which is
+ *			not used by gpio-remap but is provided "as is" to the
+ *			driver callback(s).
  *
  * The ->reg_mask_xlate translates a given base address and GPIO offset to
  * register and mask pair. The base address is one of the given register
@@ -78,13 +81,14 @@ struct gpio_regmap_config {
 	int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base,
 			      unsigned int offset, unsigned int *reg,
 			      unsigned int *mask);
+
+	void *drvdata;
 };
 
 struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config);
 void gpio_regmap_unregister(struct gpio_regmap *gpio);
 struct gpio_regmap *devm_gpio_regmap_register(struct device *dev,
 					      const struct gpio_regmap_config *config);
-void gpio_regmap_set_drvdata(struct gpio_regmap *gpio, void *data);
 void *gpio_regmap_get_drvdata(struct gpio_regmap *gpio);
 
 #endif /* _LINUX_GPIO_REGMAP_H */
-- 
cgit v1.2.3


From a9e10e58730432e5de840eb3ddd55c75f29341b3 Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 3 Jun 2021 23:40:02 +0100
Subject: ACPI: scan: Extend acpi_walk_dep_device_list()

The acpi_walk_dep_device_list() function is not as generic as its
name implies, serving only to decrement the dependency count for each
dependent device of the input.

Extend it to accept a callback which can be applied to all the
dependencies in acpi_dep_list.

Replace all existing calls to the function with calls to a wrapper,
passing a callback that applies the same dependency reduction.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Maximilian Luz <luzmaximilian@gmail.com>  # for platform/surface parts
Signed-off-by: Daniel Scally <djrscally@gmail.com>
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/ec.c                              |  2 +-
 drivers/acpi/pmic/intel_pmic_chtdc_ti.c        |  2 +-
 drivers/acpi/scan.c                            | 69 +++++++++++++++++++-------
 drivers/gpio/gpiolib-acpi.c                    | 10 ++--
 drivers/i2c/i2c-core-acpi.c                    |  8 +--
 drivers/platform/surface/aggregator/core.c     |  6 +--
 drivers/platform/surface/surface3_power.c      | 22 ++++----
 drivers/platform/surface/surface_acpi_notify.c |  7 +--
 include/acpi/acpi_bus.h                        |  7 +++
 include/linux/acpi.h                           |  4 +-
 10 files changed, 90 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 13565629ce0a..3f7680a007a3 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1627,7 +1627,7 @@ static int acpi_ec_add(struct acpi_device *device)
 	WARN(!ret, "Could not request EC cmd io port 0x%lx", ec->command_addr);
 
 	/* Reprobe devices depending on the EC */
-	acpi_walk_dep_device_list(ec->handle);
+	acpi_dev_clear_dependencies(device);
 
 	acpi_handle_debug(ec->handle, "enumerated.\n");
 	return 0;
diff --git a/drivers/acpi/pmic/intel_pmic_chtdc_ti.c b/drivers/acpi/pmic/intel_pmic_chtdc_ti.c
index a5101b07611a..fef7831d0d63 100644
--- a/drivers/acpi/pmic/intel_pmic_chtdc_ti.c
+++ b/drivers/acpi/pmic/intel_pmic_chtdc_ti.c
@@ -117,7 +117,7 @@ static int chtdc_ti_pmic_opregion_probe(struct platform_device *pdev)
 		return err;
 
 	/* Re-enumerate devices depending on PMIC */
-	acpi_walk_dep_device_list(ACPI_HANDLE(pdev->dev.parent));
+	acpi_dev_clear_dependencies(ACPI_COMPANION(pdev->dev.parent));
 	return 0;
 }
 
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index a2df7bcf4d07..2277add6da2f 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -47,12 +47,6 @@ static DEFINE_MUTEX(acpi_hp_context_lock);
  */
 static u64 spcr_uart_addr;
 
-struct acpi_dep_data {
-	struct list_head node;
-	acpi_handle supplier;
-	acpi_handle consumer;
-};
-
 void acpi_scan_lock_acquire(void)
 {
 	mutex_lock(&acpi_scan_lock);
@@ -2107,30 +2101,69 @@ static void acpi_bus_attach(struct acpi_device *device, bool first_pass)
 		device->handler->hotplug.notify_online(device);
 }
 
-void acpi_walk_dep_device_list(acpi_handle handle)
+static int acpi_scan_clear_dep(struct acpi_dep_data *dep, void *data)
 {
-	struct acpi_dep_data *dep, *tmp;
 	struct acpi_device *adev;
 
+	acpi_bus_get_device(dep->consumer, &adev);
+
+	if (adev) {
+		adev->dep_unmet--;
+		if (!adev->dep_unmet)
+			acpi_bus_attach(adev, true);
+	}
+
+	list_del(&dep->node);
+	kfree(dep);
+
+	return 0;
+}
+
+/**
+ * acpi_walk_dep_device_list - Apply a callback to every entry in acpi_dep_list
+ * @handle:	The ACPI handle of the supplier device
+ * @callback:	Pointer to the callback function to apply
+ * @data:	Pointer to some data to pass to the callback
+ *
+ * The return value of the callback determines this function's behaviour. If 0
+ * is returned we continue to iterate over acpi_dep_list. If a positive value
+ * is returned then the loop is broken but this function returns 0. If a
+ * negative value is returned by the callback then the loop is broken and that
+ * value is returned as the final error.
+ */
+int acpi_walk_dep_device_list(acpi_handle handle,
+			      int (*callback)(struct acpi_dep_data *, void *),
+			      void *data)
+{
+	struct acpi_dep_data *dep, *tmp;
+	int ret;
+
 	mutex_lock(&acpi_dep_list_lock);
 	list_for_each_entry_safe(dep, tmp, &acpi_dep_list, node) {
 		if (dep->supplier == handle) {
-			acpi_bus_get_device(dep->consumer, &adev);
-
-			if (adev) {
-				adev->dep_unmet--;
-				if (!adev->dep_unmet)
-					acpi_bus_attach(adev, true);
-			}
-
-			list_del(&dep->node);
-			kfree(dep);
+			ret = callback(dep, data);
+			if (ret)
+				break;
 		}
 	}
 	mutex_unlock(&acpi_dep_list_lock);
+
+	return ret > 0 ? 0 : ret;
 }
 EXPORT_SYMBOL_GPL(acpi_walk_dep_device_list);
 
+/**
+ * acpi_dev_clear_dependencies - Inform consumers that the device is now active
+ * @supplier: Pointer to the supplier &struct acpi_device
+ *
+ * Clear dependencies on the given device.
+ */
+void acpi_dev_clear_dependencies(struct acpi_device *supplier)
+{
+	acpi_walk_dep_device_list(supplier->handle, acpi_scan_clear_dep, NULL);
+}
+EXPORT_SYMBOL_GPL(acpi_dev_clear_dependencies);
+
 /**
  * acpi_bus_scan - Add ACPI device node objects in a given namespace scope.
  * @handle: Root of the namespace scope to scan.
diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 3ef22a3c104d..5b4111e4be3f 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -1233,14 +1233,14 @@ static void acpi_gpiochip_scan_gpios(struct acpi_gpio_chip *achip)
 void acpi_gpiochip_add(struct gpio_chip *chip)
 {
 	struct acpi_gpio_chip *acpi_gpio;
-	acpi_handle handle;
+	struct acpi_device *adev;
 	acpi_status status;
 
 	if (!chip || !chip->parent)
 		return;
 
-	handle = ACPI_HANDLE(chip->parent);
-	if (!handle)
+	adev = ACPI_COMPANION(chip->parent);
+	if (!adev)
 		return;
 
 	acpi_gpio = kzalloc(sizeof(*acpi_gpio), GFP_KERNEL);
@@ -1254,7 +1254,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip)
 	INIT_LIST_HEAD(&acpi_gpio->events);
 	INIT_LIST_HEAD(&acpi_gpio->deferred_req_irqs_list_entry);
 
-	status = acpi_attach_data(handle, acpi_gpio_chip_dh, acpi_gpio);
+	status = acpi_attach_data(adev->handle, acpi_gpio_chip_dh, acpi_gpio);
 	if (ACPI_FAILURE(status)) {
 		dev_err(chip->parent, "Failed to attach ACPI GPIO chip\n");
 		kfree(acpi_gpio);
@@ -1263,7 +1263,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip)
 
 	acpi_gpiochip_request_regions(acpi_gpio);
 	acpi_gpiochip_scan_gpios(acpi_gpio);
-	acpi_walk_dep_device_list(handle);
+	acpi_dev_clear_dependencies(adev);
 }
 
 void acpi_gpiochip_remove(struct gpio_chip *chip)
diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c
index 8ceaa88dd78f..6f0aa0ed3241 100644
--- a/drivers/i2c/i2c-core-acpi.c
+++ b/drivers/i2c/i2c-core-acpi.c
@@ -259,8 +259,8 @@ static acpi_status i2c_acpi_add_device(acpi_handle handle, u32 level,
  */
 void i2c_acpi_register_devices(struct i2c_adapter *adap)
 {
+	struct acpi_device *adev;
 	acpi_status status;
-	acpi_handle handle;
 
 	if (!has_acpi_companion(&adap->dev))
 		return;
@@ -275,11 +275,11 @@ void i2c_acpi_register_devices(struct i2c_adapter *adap)
 	if (!adap->dev.parent)
 		return;
 
-	handle = ACPI_HANDLE(adap->dev.parent);
-	if (!handle)
+	adev = ACPI_COMPANION(adap->dev.parent);
+	if (!adev)
 		return;
 
-	acpi_walk_dep_device_list(handle);
+	acpi_dev_clear_dependencies(adev);
 }
 
 static const struct acpi_device_id i2c_acpi_force_400khz_device_ids[] = {
diff --git a/drivers/platform/surface/aggregator/core.c b/drivers/platform/surface/aggregator/core.c
index 8dc2c267bcd6..517f774a6e60 100644
--- a/drivers/platform/surface/aggregator/core.c
+++ b/drivers/platform/surface/aggregator/core.c
@@ -621,8 +621,8 @@ static const struct acpi_gpio_mapping ssam_acpi_gpios[] = {
 
 static int ssam_serial_hub_probe(struct serdev_device *serdev)
 {
+	struct acpi_device *ssh = ACPI_COMPANION(&serdev->dev);
 	struct ssam_controller *ctrl;
-	acpi_handle *ssh = ACPI_HANDLE(&serdev->dev);
 	acpi_status astatus;
 	int status;
 
@@ -652,7 +652,7 @@ static int ssam_serial_hub_probe(struct serdev_device *serdev)
 	if (status)
 		goto err_devopen;
 
-	astatus = ssam_serdev_setup_via_acpi(ssh, serdev);
+	astatus = ssam_serdev_setup_via_acpi(ssh->handle, serdev);
 	if (ACPI_FAILURE(astatus)) {
 		status = -ENXIO;
 		goto err_devinit;
@@ -706,7 +706,7 @@ static int ssam_serial_hub_probe(struct serdev_device *serdev)
 	 *       For now let's thus default power/wakeup to false.
 	 */
 	device_set_wakeup_capable(&serdev->dev, true);
-	acpi_walk_dep_device_list(ssh);
+	acpi_dev_clear_dependencies(ssh);
 
 	return 0;
 
diff --git a/drivers/platform/surface/surface3_power.c b/drivers/platform/surface/surface3_power.c
index cc4f9cba6856..dea82aa1abd4 100644
--- a/drivers/platform/surface/surface3_power.c
+++ b/drivers/platform/surface/surface3_power.c
@@ -446,12 +446,12 @@ mshw0011_space_handler(u32 function, acpi_physical_address command,
 
 static int mshw0011_install_space_handler(struct i2c_client *client)
 {
-	acpi_handle handle;
+	struct acpi_device *adev;
 	struct mshw0011_handler_data *data;
 	acpi_status status;
 
-	handle = ACPI_HANDLE(&client->dev);
-	if (!handle)
+	adev = ACPI_COMPANION(&client->dev);
+	if (!adev)
 		return -ENODEV;
 
 	data = kzalloc(sizeof(struct mshw0011_handler_data),
@@ -460,25 +460,25 @@ static int mshw0011_install_space_handler(struct i2c_client *client)
 		return -ENOMEM;
 
 	data->client = client;
-	status = acpi_bus_attach_private_data(handle, (void *)data);
+	status = acpi_bus_attach_private_data(adev->handle, (void *)data);
 	if (ACPI_FAILURE(status)) {
 		kfree(data);
 		return -ENOMEM;
 	}
 
-	status = acpi_install_address_space_handler(handle,
-				ACPI_ADR_SPACE_GSBUS,
-				&mshw0011_space_handler,
-				NULL,
-				data);
+	status = acpi_install_address_space_handler(adev->handle,
+						    ACPI_ADR_SPACE_GSBUS,
+						    &mshw0011_space_handler,
+						    NULL,
+						    data);
 	if (ACPI_FAILURE(status)) {
 		dev_err(&client->dev, "Error installing i2c space handler\n");
-		acpi_bus_detach_private_data(handle);
+		acpi_bus_detach_private_data(adev->handle);
 		kfree(data);
 		return -ENOMEM;
 	}
 
-	acpi_walk_dep_device_list(handle);
+	acpi_dev_clear_dependencies(adev);
 	return 0;
 }
 
diff --git a/drivers/platform/surface/surface_acpi_notify.c b/drivers/platform/surface/surface_acpi_notify.c
index ef9c1f8e8336..8339988d95c1 100644
--- a/drivers/platform/surface/surface_acpi_notify.c
+++ b/drivers/platform/surface/surface_acpi_notify.c
@@ -798,7 +798,7 @@ static int san_consumer_links_setup(struct platform_device *pdev)
 
 static int san_probe(struct platform_device *pdev)
 {
-	acpi_handle san = ACPI_HANDLE(&pdev->dev);
+	struct acpi_device *san = ACPI_COMPANION(&pdev->dev);
 	struct ssam_controller *ctrl;
 	struct san_data *data;
 	acpi_status astatus;
@@ -821,7 +821,8 @@ static int san_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, data);
 
-	astatus = acpi_install_address_space_handler(san, ACPI_ADR_SPACE_GSBUS,
+	astatus = acpi_install_address_space_handler(san->handle,
+						     ACPI_ADR_SPACE_GSBUS,
 						     &san_opreg_handler, NULL,
 						     &data->info);
 	if (ACPI_FAILURE(astatus))
@@ -835,7 +836,7 @@ static int san_probe(struct platform_device *pdev)
 	if (status)
 		goto err_install_dev;
 
-	acpi_walk_dep_device_list(san);
+	acpi_dev_clear_dependencies(san);
 	return 0;
 
 err_install_dev:
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 3a82faac5767..0b2c4f170f4d 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -280,6 +280,12 @@ struct acpi_device_power {
 	struct acpi_device_power_state states[ACPI_D_STATE_COUNT];	/* Power states (D0-D3Cold) */
 };
 
+struct acpi_dep_data {
+	struct list_head node;
+	acpi_handle supplier;
+	acpi_handle consumer;
+};
+
 /* Performance Management */
 
 struct acpi_device_perf_flags {
@@ -685,6 +691,7 @@ static inline bool acpi_device_can_poweroff(struct acpi_device *adev)
 
 bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2);
 
+void acpi_dev_clear_dependencies(struct acpi_device *supplier);
 struct acpi_device *
 acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const char *uid, s64 hrv);
 struct acpi_device *
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..170b9bebdb2b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -666,7 +666,9 @@ extern bool acpi_driver_match_device(struct device *dev,
 				     const struct device_driver *drv);
 int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *);
 int acpi_device_modalias(struct device *, char *, int);
-void acpi_walk_dep_device_list(acpi_handle handle);
+int acpi_walk_dep_device_list(acpi_handle handle,
+			      int (*callback)(struct acpi_dep_data *, void *),
+			      void *data);
 
 struct platform_device *acpi_create_platform_device(struct acpi_device *,
 						    struct property_entry *);
-- 
cgit v1.2.3


From bcd23f93d3984a94d64ce0b6bbfa3789c0e8ebaf Mon Sep 17 00:00:00 2001
From: Maxim Kochetkov <fido_max@inbox.ru>
Date: Tue, 25 May 2021 06:42:03 +0300
Subject: regmap-irq: Introduce inverted status registers support

Some interrupt controllers have inverted status register:
cleared bits is active interrupts and set bits is inactive interrupts,
so add inverted status support to the framework.

Signed-off-by: Maxim Kochetkov <fido_max@inbox.ru>
Link: https://lore.kernel.org/r/20210525034204.5272-1-fido_max@inbox.ru
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 7 +++++++
 include/linux/regmap.h           | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 760296a4b606..d2656581a608 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -531,6 +531,10 @@ static irqreturn_t regmap_irq_thread(int irq, void *d)
 		}
 	}
 
+	if (chip->status_invert)
+		for (i = 0; i < data->chip->num_regs; i++)
+			data->status_buf[i] = ~data->status_buf[i];
+
 	/*
 	 * Ignore masked IRQs and ack if we need to; we ack early so
 	 * there is no race between handling and acknowleding the
@@ -800,6 +804,9 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 			goto err_alloc;
 		}
 
+		if (chip->status_invert)
+			d->status_buf[i] = ~d->status_buf[i];
+
 		if (d->status_buf[i] && (chip->ack_base || chip->use_ack)) {
 			reg = sub_irq_reg(d, d->chip->ack_base, i);
 			if (chip->ack_invert)
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index bf5a834d1774..f5f08dd0a116 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1449,6 +1449,7 @@ struct regmap_irq_sub_irq_map {
  * @not_fixed_stride: Used when chip peripherals are not laid out with fixed
  * 		      stride. Must be used with sub_reg_offsets containing the
  * 		      offsets to each peripheral.
+ * @status_invert: Inverted status register: cleared bits are active interrupts.
  * @runtime_pm:  Hold a runtime PM lock on the device when accessing it.
  *
  * @num_regs:    Number of registers in each control bank.
@@ -1501,6 +1502,7 @@ struct regmap_irq_chip {
 	bool type_in_mask:1;
 	bool clear_on_unmask:1;
 	bool not_fixed_stride:1;
+	bool status_invert:1;
 
 	int num_regs;
 
-- 
cgit v1.2.3


From eb550f53099bf5ff8dc5de93e275378510c891c9 Mon Sep 17 00:00:00 2001
From: Brett Creeley <brett.creeley@intel.com>
Date: Thu, 17 Sep 2020 13:13:33 -0700
Subject: virtchnl: Use pad byte in virtchnl_ether_addr to specify MAC type

Currently, there is no way for a VF driver to specify that it wants to
change its device/primary unicast MAC address. This makes it
difficult/impossible for the PF driver to track the VF's device/primary
unicast MAC address, which is used for VM/VF reboot and displaying on
the host. Fix this by using 2 bits of a pad byte in the
virtchnl_ether_addr structure so the VF can specify what type of MAC
it's adding/deleting.

Below are the values that should be used by all VF drivers going
forward.

VIRTCHNL_ETHER_ADDR_LEGACY(0):
	- The type should only ever be 0 for legacy AVF drivers (i.e.
	  drivers that don't support the new type bits). The PF drivers
	  will track VF's device/primary unicast MAC, but this will only
	  be a best effort.

VIRTCHNL_ETHER_ADDR_PRIMARY(1):
	- This type should only be used when the VF is changing their
	  device/primary unicast MAC. It should be used for both delete
	  and add cases related to the device/primary unicast MAC.

VIRTCHNL_ETHER_ADDR_EXTRA(2):
	- This type should be used when the VF is adding and/or deleting
	  MAC addresses that are not the device/primary unicast MAC. For
	  example, extra unicast addresses and multicast addresses
	  assuming the PF supports "extra" addresses at all.

If a PF is parsing the type field of the virtchnl_ether_addr, then it
should use the VIRTCHNL_ETHER_ADDR_TYPE_MASK to mask the first two bits
of the type field since 0, 1, and 2 are the only valid values.

Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 565deea6ffe8..1fc07f3f99ab 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -412,9 +412,36 @@ VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_queue_select);
  * PF removes the filters and returns status.
  */
 
+/* VIRTCHNL_ETHER_ADDR_LEGACY
+ * Prior to adding the @type member to virtchnl_ether_addr, there were 2 pad
+ * bytes. Moving forward all VF drivers should not set type to
+ * VIRTCHNL_ETHER_ADDR_LEGACY. This is only here to not break previous/legacy
+ * behavior. The control plane function (i.e. PF) can use a best effort method
+ * of tracking the primary/device unicast in this case, but there is no
+ * guarantee and functionality depends on the implementation of the PF.
+ */
+
+/* VIRTCHNL_ETHER_ADDR_PRIMARY
+ * All VF drivers should set @type to VIRTCHNL_ETHER_ADDR_PRIMARY for the
+ * primary/device unicast MAC address filter for VIRTCHNL_OP_ADD_ETH_ADDR and
+ * VIRTCHNL_OP_DEL_ETH_ADDR. This allows for the underlying control plane
+ * function (i.e. PF) to accurately track and use this MAC address for
+ * displaying on the host and for VM/function reset.
+ */
+
+/* VIRTCHNL_ETHER_ADDR_EXTRA
+ * All VF drivers should set @type to VIRTCHNL_ETHER_ADDR_EXTRA for any extra
+ * unicast and/or multicast filters that are being added/deleted via
+ * VIRTCHNL_OP_DEL_ETH_ADDR/VIRTCHNL_OP_ADD_ETH_ADDR respectively.
+ */
 struct virtchnl_ether_addr {
 	u8 addr[ETH_ALEN];
-	u8 pad[2];
+	u8 type;
+#define VIRTCHNL_ETHER_ADDR_LEGACY	0
+#define VIRTCHNL_ETHER_ADDR_PRIMARY	1
+#define VIRTCHNL_ETHER_ADDR_EXTRA	2
+#define VIRTCHNL_ETHER_ADDR_TYPE_MASK	3 /* first two bits of type are valid */
+	u8 pad;
 };
 
 VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_ether_addr);
-- 
cgit v1.2.3


From c858d436be8b949c368de0e079084acaff3d4aaf Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 4 Jun 2021 17:01:48 +0300
Subject: net: phy: introduce PHY_INTERFACE_MODE_REVRMII

The "reverse RMII" protocol name is a personal invention, derived from
"reverse MII".

Just like MII, RMII is an asymmetric protocol in that a PHY behaves
differently than a MAC. In the case of RMII, for example:
- the 50 MHz clock signals are either driven by the MAC or by an
  external oscillator (but never by the PHY).
- the PHY can transmit extra in-band control symbols via RXD[1:0] which
  the MAC is supposed to understand, but a PHY isn't.

The "reverse MII" protocol is not standardized either, except for this
web document:
https://www.eetimes.com/reverse-media-independent-interface-revmii-block-architecture/#

In short, it means that the Ethernet controller speaks the 4-bit data
parallel protocol from the perspective of a PHY (it acts like a PHY).
This might mean that it implements clause 22 compatible registers,
although that is optional - the important bit is that its pins can be
connected to an MII MAC and it will 'just work'.

In this discussion thread:
https://lore.kernel.org/netdev/20210201214515.cx6ivvme2tlquge2@skbuf/

we agreed that it would be an abuse of terms to use the "RevMII" name
for anything than the 4-bit parallel MII protocol. But since all the
same concepts can be applied to the 2-bit Reduced MII protocol as well,
here we are introducing a "Reverse RMII" protocol. This means: "behave
like an RMII PHY".

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/ethernet-controller.yaml | 1 +
 include/linux/phy.h                                            | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
index e8f04687a3e0..d97b561003ed 100644
--- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml
+++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
@@ -68,6 +68,7 @@ properties:
       - tbi
       - rev-mii
       - rmii
+      - rev-rmii
 
       # RX and TX delays are added by the MAC when required
       - rgmii
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 852743f07e3e..ed332ac92e25 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -93,6 +93,7 @@ extern const int phy_10gbit_features_array[1];
  * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface
  * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface
  * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface
+ * @PHY_INTERFACE_MODE_REVRMII: Reduced Media Independent Interface in PHY role
  * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface
  * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay
  * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay
@@ -126,6 +127,7 @@ typedef enum {
 	PHY_INTERFACE_MODE_TBI,
 	PHY_INTERFACE_MODE_REVMII,
 	PHY_INTERFACE_MODE_RMII,
+	PHY_INTERFACE_MODE_REVRMII,
 	PHY_INTERFACE_MODE_RGMII,
 	PHY_INTERFACE_MODE_RGMII_ID,
 	PHY_INTERFACE_MODE_RGMII_RXID,
@@ -185,6 +187,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "rev-mii";
 	case PHY_INTERFACE_MODE_RMII:
 		return "rmii";
+	case PHY_INTERFACE_MODE_REVRMII:
+		return "rev-rmii";
 	case PHY_INTERFACE_MODE_RGMII:
 		return "rgmii";
 	case PHY_INTERFACE_MODE_RGMII_ID:
-- 
cgit v1.2.3


From c07aea3ef4d4076f18f567b98ed01e082e02ed51 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@microsoft.com>
Date: Mon, 7 Jun 2021 21:02:36 +0200
Subject: mm: add a signature in struct page

This is needed by the page_pool to avoid recycling a page not allocated
via page_pool.

The page->signature field is aliased to page->lru.next and
page->compound_head, but it can't be set by mistake because the
signature value is a bad pointer, and can't trigger a false positive
in PageTail() because the last bit is 0.

Co-developed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mm.h       | 11 ++++++-----
 include/linux/mm_types.h |  7 +++++++
 include/linux/poison.h   |  3 +++
 net/core/page_pool.c     |  6 ++++++
 4 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c274f75efcf9..a0434e8c2617 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1668,10 +1668,11 @@ struct address_space *page_mapping(struct page *page);
 static inline bool page_is_pfmemalloc(const struct page *page)
 {
 	/*
-	 * Page index cannot be this large so this must be
-	 * a pfmemalloc page.
+	 * lru.next has bit 1 set if the page is allocated from the
+	 * pfmemalloc reserves.  Callers may simply overwrite it if
+	 * they do not need to preserve that information.
 	 */
-	return page->index == -1UL;
+	return (uintptr_t)page->lru.next & BIT(1);
 }
 
 /*
@@ -1680,12 +1681,12 @@ static inline bool page_is_pfmemalloc(const struct page *page)
  */
 static inline void set_page_pfmemalloc(struct page *page)
 {
-	page->index = -1UL;
+	page->lru.next = (void *)BIT(1);
 }
 
 static inline void clear_page_pfmemalloc(struct page *page)
 {
-	page->index = 0;
+	page->lru.next = NULL;
 }
 
 /*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5aacc1c10a45..ed6862eacb52 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -96,6 +96,13 @@ struct page {
 			unsigned long private;
 		};
 		struct {	/* page_pool used by netstack */
+			/**
+			 * @pp_magic: magic value to avoid recycling non
+			 * page_pool allocated pages.
+			 */
+			unsigned long pp_magic;
+			struct page_pool *pp;
+			unsigned long _pp_mapping_pad;
 			/**
 			 * @dma_addr: might require a 64-bit value on
 			 * 32-bit architectures.
diff --git a/include/linux/poison.h b/include/linux/poison.h
index aff1c9250c82..d62ef5a6b4e9 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -78,4 +78,7 @@
 /********** security/ **********/
 #define KEY_DESTROY		0xbd
 
+/********** net/core/page_pool.c **********/
+#define PP_SIGNATURE		(0x40 + POISON_POINTER_DELTA)
+
 #endif
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 3c4c4c7a0402..e1321bc9d316 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -17,6 +17,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/page-flags.h>
 #include <linux/mm.h> /* for __put_page() */
+#include <linux/poison.h>
 
 #include <trace/events/page_pool.h>
 
@@ -221,6 +222,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 		return NULL;
 	}
 
+	page->pp_magic |= PP_SIGNATURE;
+
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
 	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
@@ -263,6 +266,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 			put_page(page);
 			continue;
 		}
+		page->pp_magic |= PP_SIGNATURE;
 		pool->alloc.cache[pool->alloc.count++] = page;
 		/* Track how many pages are held 'in-flight' */
 		pool->pages_state_hold_cnt++;
@@ -341,6 +345,8 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
+	page->pp_magic = 0;
+
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
 	 */
-- 
cgit v1.2.3


From c420c98982fa9e749c99e022845d5f323d098b72 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@microsoft.com>
Date: Mon, 7 Jun 2021 21:02:37 +0200
Subject: skbuff: add a parameter to __skb_frag_unref

This is a prerequisite patch, the next one is enabling recycling of
skbs and fragments. Add an extra argument on __skb_frag_unref() to
handle recycling, and update the current users of the function with that.

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/sky2.c        | 2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +-
 include/linux/skbuff.h                     | 8 +++++---
 net/core/skbuff.c                          | 4 ++--
 net/tls/tls_device.c                       | 2 +-
 5 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 324c280cc22c..8b8bff59c8fe 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -2503,7 +2503,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
 
 		if (length == 0) {
 			/* don't need this page */
-			__skb_frag_unref(frag);
+			__skb_frag_unref(frag, false);
 			--skb_shinfo(skb)->nr_frags;
 		} else {
 			size = min(length, (unsigned) PAGE_SIZE);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index e35e4d7ef4d1..cea62b8f554c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -526,7 +526,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 fail:
 	while (nr > 0) {
 		nr--;
-		__skb_frag_unref(skb_shinfo(skb)->frags + nr);
+		__skb_frag_unref(skb_shinfo(skb)->frags + nr, false);
 	}
 	return 0;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index dbf820a50a39..7fcfea7e7b21 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3081,10 +3081,12 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
+ * @recycle: recycle the page if allocated via page_pool
  *
- * Releases a reference on the paged fragment @frag.
+ * Releases a reference on the paged fragment @frag
+ * or recycles the page via the page_pool API.
  */
-static inline void __skb_frag_unref(skb_frag_t *frag)
+static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
 	put_page(skb_frag_page(frag));
 }
@@ -3098,7 +3100,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag)
  */
 static inline void skb_frag_unref(struct sk_buff *skb, int f)
 {
-	__skb_frag_unref(&skb_shinfo(skb)->frags[f]);
+	__skb_frag_unref(&skb_shinfo(skb)->frags[f], false);
 }
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3ad22870298c..12b7e90dd2b5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -664,7 +664,7 @@ static void skb_release_data(struct sk_buff *skb)
 	skb_zcopy_clear(skb, true);
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i]);
+		__skb_frag_unref(&shinfo->frags[i], false);
 
 	if (shinfo->frag_list)
 		kfree_skb_list(shinfo->frag_list);
@@ -3495,7 +3495,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
 		fragto = &skb_shinfo(tgt)->frags[merge];
 
 		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
-		__skb_frag_unref(fragfrom);
+		__skb_frag_unref(fragfrom, false);
 	}
 
 	/* Reposition in the original skb */
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index bd9f1567aa39..b932469ee69c 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -128,7 +128,7 @@ static void destroy_record(struct tls_record_info *record)
 	int i;
 
 	for (i = 0; i < record->num_frags; i++)
-		__skb_frag_unref(&record->frags[i]);
+		__skb_frag_unref(&record->frags[i], false);
 	kfree(record);
 }
 
-- 
cgit v1.2.3


From 6a5bcd84e886a9a91982e515c539529c28acdcc2 Mon Sep 17 00:00:00 2001
From: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Date: Mon, 7 Jun 2021 21:02:38 +0200
Subject: page_pool: Allow drivers to hint on SKB recycling

Up to now several high speed NICs have custom mechanisms of recycling
the allocated memory they use for their payloads.
Our page_pool API already has recycling capabilities that are always
used when we are running in 'XDP mode'. So let's tweak the API and the
kernel network stack slightly and allow the recycling to happen even
during the standard operation.
The API doesn't take into account 'split page' policies used by those
drivers currently, but can be extended once we have users for that.

The idea is to be able to intercept the packet on skb_release_data().
If it's a buffer coming from our page_pool API recycle it back to the
pool for further usage or just release the packet entirely.

To achieve that we introduce a bit in struct sk_buff (pp_recycle:1) and
a field in struct page (page->pp) to store the page_pool pointer.
Storing the information in page->pp allows us to recycle both SKBs and
their fragments.
We could have skipped the skb bit entirely, since identical information
can bederived from struct page. However, in an effort to affect the free path
as less as possible, reading a single bit in the skb which is already
in cache, is better that trying to derive identical information for the
page stored data.

The driver or page_pool has to take care of the sync operations on it's own
during the buffer recycling since the buffer is, after opting-in to the
recycling, never unmapped.

Since the gain on the drivers depends on the architecture, we are not
enabling recycling by default if the page_pool API is used on a driver.
In order to enable recycling the driver must call skb_mark_for_recycle()
to store the information we need for recycling in page->pp and
enabling the recycling bit, or page_pool_store_mem_info() for a fragment.

Co-developed-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Co-developed-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h  | 33 ++++++++++++++++++++++++++++++---
 include/net/page_pool.h |  9 +++++++++
 net/core/page_pool.c    | 22 ++++++++++++++++++++++
 net/core/skbuff.c       | 20 ++++++++++++++++----
 4 files changed, 77 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7fcfea7e7b21..b2db9cd9a73f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -37,6 +37,7 @@
 #include <linux/in6.h>
 #include <linux/if_packet.h>
 #include <net/flow.h>
+#include <net/page_pool.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <linux/netfilter/nf_conntrack_common.h>
 #endif
@@ -667,6 +668,8 @@ typedef unsigned char *sk_buff_data_t;
  *	@head_frag: skb was allocated from page fragments,
  *		not allocated by kmalloc() or vmalloc().
  *	@pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
+ *	@pp_recycle: mark the packet for recycling instead of freeing (implies
+ *		page_pool support on driver)
  *	@active_extensions: active extensions (skb_ext_id types)
  *	@ndisc_nodetype: router type (from link layer)
  *	@ooo_okay: allow the mapping of a socket to a queue to be changed
@@ -791,10 +794,12 @@ struct sk_buff {
 				fclone:2,
 				peeked:1,
 				head_frag:1,
-				pfmemalloc:1;
+				pfmemalloc:1,
+				pp_recycle:1; /* page_pool recycle indicator */
 #ifdef CONFIG_SKB_EXTENSIONS
 	__u8			active_extensions;
 #endif
+
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
 	 */
@@ -3088,7 +3093,13 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	put_page(skb_frag_page(frag));
+	struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+	if (recycle && page_pool_return_skb_page(page))
+		return;
+#endif
+	put_page(page);
 }
 
 /**
@@ -3100,7 +3111,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
  */
 static inline void skb_frag_unref(struct sk_buff *skb, int f)
 {
-	__skb_frag_unref(&skb_shinfo(skb)->frags[f], false);
+	__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
 }
 
 /**
@@ -4699,5 +4710,21 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
 #endif
 }
 
+#ifdef CONFIG_PAGE_POOL
+static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page,
+					struct page_pool *pp)
+{
+	skb->pp_recycle = 1;
+	page_pool_store_mem_info(page, pp);
+}
+#endif
+
+static inline bool skb_pp_recycle(struct sk_buff *skb, void *data)
+{
+	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
+		return false;
+	return page_pool_return_skb_page(virt_to_page(data));
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index b4b6de909c93..3dd62dd73027 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -146,6 +146,8 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 	return pool->p.dma_dir;
 }
 
+bool page_pool_return_skb_page(struct page *page);
+
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
 #ifdef CONFIG_PAGE_POOL
@@ -251,4 +253,11 @@ static inline void page_pool_ring_unlock(struct page_pool *pool)
 		spin_unlock_bh(&pool->ring.producer_lock);
 }
 
+/* Store mem_info on struct page and use it while recycling skb frags */
+static inline
+void page_pool_store_mem_info(struct page *page, struct page_pool *pp)
+{
+	page->pp = pp;
+}
+
 #endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e1321bc9d316..5e4eb45b139c 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -628,3 +628,25 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
 	}
 }
 EXPORT_SYMBOL(page_pool_update_nid);
+
+bool page_pool_return_skb_page(struct page *page)
+{
+	struct page_pool *pp;
+
+	page = compound_head(page);
+	if (unlikely(page->pp_magic != PP_SIGNATURE))
+		return false;
+
+	pp = page->pp;
+
+	/* Driver set this to memory recycling info. Reset it on recycle.
+	 * This will *not* work for NIC using a split-page memory model.
+	 * The page will be returned to the pool here regardless of the
+	 * 'flipped' fragment being in use or not.
+	 */
+	page->pp = NULL;
+	page_pool_put_full_page(pp, page, false);
+
+	return true;
+}
+EXPORT_SYMBOL(page_pool_return_skb_page);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 12b7e90dd2b5..a0b1d4847efe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -70,6 +70,7 @@
 #include <net/xfrm.h>
 #include <net/mpls.h>
 #include <net/mptcp.h>
+#include <net/page_pool.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -645,10 +646,13 @@ static void skb_free_head(struct sk_buff *skb)
 {
 	unsigned char *head = skb->head;
 
-	if (skb->head_frag)
+	if (skb->head_frag) {
+		if (skb_pp_recycle(skb, head))
+			return;
 		skb_free_frag(head);
-	else
+	} else {
 		kfree(head);
+	}
 }
 
 static void skb_release_data(struct sk_buff *skb)
@@ -664,7 +668,7 @@ static void skb_release_data(struct sk_buff *skb)
 	skb_zcopy_clear(skb, true);
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i], false);
+		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
 
 	if (shinfo->frag_list)
 		kfree_skb_list(shinfo->frag_list);
@@ -1046,6 +1050,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	n->nohdr = 0;
 	n->peeked = 0;
 	C(pfmemalloc);
+	C(pp_recycle);
 	n->destructor = NULL;
 	C(tail);
 	C(end);
@@ -3495,7 +3500,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
 		fragto = &skb_shinfo(tgt)->frags[merge];
 
 		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
-		__skb_frag_unref(fragfrom, false);
+		__skb_frag_unref(fragfrom, skb->pp_recycle);
 	}
 
 	/* Reposition in the original skb */
@@ -5285,6 +5290,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 	if (skb_cloned(to))
 		return false;
 
+	/* The page pool signature of struct page will eventually figure out
+	 * which pages can be recycled or not but for now let's prohibit slab
+	 * allocated and page_pool allocated SKBs from being coalesced.
+	 */
+	if (to->pp_recycle != from->pp_recycle)
+		return false;
+
 	if (len <= skb_tailroom(to)) {
 		if (len)
 			BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
-- 
cgit v1.2.3


From bb6bfd79d9bc69f0808a4156ec3ca9fb78694039 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 27 May 2021 14:37:09 -0500
Subject: iommu: Remove unused of_get_dma_window()

of_get_dma_window() was added in 2012 and removed in 2014 in commit
891846516317 ("memory: Add NVIDIA Tegra memory controller support").
Remove it and simplify the header to use forward declarations for
structs rather than includes.

Cc: Joerg Roedel <joro@8bytes.org>
Cc: Will Deacon <will@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210527193710.1281746-1-robh@kernel.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/of_iommu.c | 68 ------------------------------------------------
 include/linux/of_iommu.h | 17 +++---------
 2 files changed, 3 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index a9d2df001149..5696314ae69e 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -19,74 +19,6 @@
 
 #define NO_IOMMU	1
 
-/**
- * of_get_dma_window - Parse *dma-window property and returns 0 if found.
- *
- * @dn: device node
- * @prefix: prefix for property name if any
- * @index: index to start to parse
- * @busno: Returns busno if supported. Otherwise pass NULL
- * @addr: Returns address that DMA starts
- * @size: Returns the range that DMA can handle
- *
- * This supports different formats flexibly. "prefix" can be
- * configured if any. "busno" and "index" are optionally
- * specified. Set 0(or NULL) if not used.
- */
-int of_get_dma_window(struct device_node *dn, const char *prefix, int index,
-		      unsigned long *busno, dma_addr_t *addr, size_t *size)
-{
-	const __be32 *dma_window, *end;
-	int bytes, cur_index = 0;
-	char propname[NAME_MAX], addrname[NAME_MAX], sizename[NAME_MAX];
-
-	if (!dn || !addr || !size)
-		return -EINVAL;
-
-	if (!prefix)
-		prefix = "";
-
-	snprintf(propname, sizeof(propname), "%sdma-window", prefix);
-	snprintf(addrname, sizeof(addrname), "%s#dma-address-cells", prefix);
-	snprintf(sizename, sizeof(sizename), "%s#dma-size-cells", prefix);
-
-	dma_window = of_get_property(dn, propname, &bytes);
-	if (!dma_window)
-		return -ENODEV;
-	end = dma_window + bytes / sizeof(*dma_window);
-
-	while (dma_window < end) {
-		u32 cells;
-		const void *prop;
-
-		/* busno is one cell if supported */
-		if (busno)
-			*busno = be32_to_cpup(dma_window++);
-
-		prop = of_get_property(dn, addrname, NULL);
-		if (!prop)
-			prop = of_get_property(dn, "#address-cells", NULL);
-
-		cells = prop ? be32_to_cpup(prop) : of_n_addr_cells(dn);
-		if (!cells)
-			return -EINVAL;
-		*addr = of_read_number(dma_window, cells);
-		dma_window += cells;
-
-		prop = of_get_property(dn, sizename, NULL);
-		cells = prop ? be32_to_cpup(prop) : of_n_size_cells(dn);
-		if (!cells)
-			return -EINVAL;
-		*size = of_read_number(dma_window, cells);
-		dma_window += cells;
-
-		if (cur_index++ == index)
-			break;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(of_get_dma_window);
-
 static int of_iommu_xlate(struct device *dev,
 			  struct of_phandle_args *iommu_spec)
 {
diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h
index 16f4b3e87f20..55c1eb300a86 100644
--- a/include/linux/of_iommu.h
+++ b/include/linux/of_iommu.h
@@ -2,29 +2,18 @@
 #ifndef __OF_IOMMU_H
 #define __OF_IOMMU_H
 
-#include <linux/device.h>
-#include <linux/iommu.h>
-#include <linux/of.h>
+struct device;
+struct device_node;
+struct iommu_ops;
 
 #ifdef CONFIG_OF_IOMMU
 
-extern int of_get_dma_window(struct device_node *dn, const char *prefix,
-			     int index, unsigned long *busno, dma_addr_t *addr,
-			     size_t *size);
-
 extern const struct iommu_ops *of_iommu_configure(struct device *dev,
 					struct device_node *master_np,
 					const u32 *id);
 
 #else
 
-static inline int of_get_dma_window(struct device_node *dn, const char *prefix,
-			    int index, unsigned long *busno, dma_addr_t *addr,
-			    size_t *size)
-{
-	return -EINVAL;
-}
-
 static inline const struct iommu_ops *of_iommu_configure(struct device *dev,
 					 struct device_node *master_np,
 					 const u32 *id)
-- 
cgit v1.2.3


From cfa7ff959a789a953eac40c8ac793e2cfc2db931 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 3 Jun 2021 19:41:18 +0100
Subject: arm64: smccc: Support SMCCC v1.3 SVE register saving hint

SMCCC v1.2 requires that all SVE state be preserved over SMC calls which
introduces substantial overhead in the common case where there is no SVE
state in the registers. To avoid this SMCCC v1.3 introduces a flag which
allows the caller to say that there is no state that needs to be preserved
in the registers. Make use of this flag, setting it if the SMCCC version
indicates support for it and the TIF_ flags indicate that there is no live
SVE state in the registers, this avoids placing any constraints on when
SMCCC calls can be done or triggering extra saving and reloading of SVE
register state in the kernel.

This would be straightforward enough except for the rather entertaining
inline assembly we use to do SMCCC v1.1 calls to allow us to take advantage
of the limited number of registers it clobbers. Deal with this by having a
function which we call immediately before issuing the SMCCC call to make
our checks and set the flag. Using alternatives the overhead if SVE is
supported but not detected at runtime can be reduced to a single NOP.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210603184118.15090-1-broonie@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/smccc-call.S | 26 ++++++++++++++++++++++++++
 drivers/firmware/smccc/smccc.c |  4 ++++
 include/linux/arm-smccc.h      | 33 +++++++++++++++++++++++++++++++--
 3 files changed, 61 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
index 2def9d0dd3dd..d3d37f932b97 100644
--- a/arch/arm64/kernel/smccc-call.S
+++ b/arch/arm64/kernel/smccc-call.S
@@ -7,8 +7,34 @@
 
 #include <asm/asm-offsets.h>
 #include <asm/assembler.h>
+#include <asm/thread_info.h>
+
+/*
+ * If we have SMCCC v1.3 and (as is likely) no SVE state in
+ * the registers then set the SMCCC hint bit to say there's no
+ * need to preserve it.  Do this by directly adjusting the SMCCC
+ * function value which is already stored in x0 ready to be called.
+ */
+SYM_FUNC_START(__arm_smccc_sve_check)
+
+	ldr_l	x16, smccc_has_sve_hint
+	cbz	x16, 2f
+
+	get_current_task x16
+	ldr	x16, [x16, #TSK_TI_FLAGS]
+	tbnz	x16, #TIF_FOREIGN_FPSTATE, 1f	// Any live FP state?
+	tbnz	x16, #TIF_SVE, 2f		// Does that state include SVE?
+
+1:	orr	x0, x0, ARM_SMCCC_1_3_SVE_HINT
+
+2:	ret
+SYM_FUNC_END(__arm_smccc_sve_check)
+EXPORT_SYMBOL(__arm_smccc_sve_check)
 
 	.macro SMCCC instr
+alternative_if ARM64_SVE
+	bl	__arm_smccc_sve_check
+alternative_else_nop_endif
 	\instr	#0
 	ldr	x4, [sp]
 	stp	x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS]
diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c
index 028f81d702cc..9f937b125ab0 100644
--- a/drivers/firmware/smccc/smccc.c
+++ b/drivers/firmware/smccc/smccc.c
@@ -15,6 +15,7 @@ static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
 static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE;
 
 bool __ro_after_init smccc_trng_available = false;
+u64 __ro_after_init smccc_has_sve_hint = false;
 
 void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
 {
@@ -22,6 +23,9 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
 	smccc_conduit = conduit;
 
 	smccc_trng_available = smccc_probe_trng();
+	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
+	    smccc_version >= ARM_SMCCC_VERSION_1_3)
+		smccc_has_sve_hint = true;
 }
 
 enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void)
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 5cef2b8b0479..7d1cabe15262 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -63,6 +63,9 @@
 #define ARM_SMCCC_VERSION_1_0		0x10000
 #define ARM_SMCCC_VERSION_1_1		0x10001
 #define ARM_SMCCC_VERSION_1_2		0x10002
+#define ARM_SMCCC_VERSION_1_3		0x10003
+
+#define ARM_SMCCC_1_3_SVE_HINT		0x10000
 
 #define ARM_SMCCC_VERSION_FUNC_ID					\
 	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
@@ -216,6 +219,8 @@ u32 arm_smccc_get_version(void);
 
 void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit);
 
+extern u64 smccc_has_sve_hint;
+
 /**
  * struct arm_smccc_res - Result from SMC/HVC call
  * @a0-a3 result values from registers 0 to 3
@@ -295,6 +300,15 @@ struct arm_smccc_quirk {
 	} state;
 };
 
+/**
+ * __arm_smccc_sve_check() - Set the SVE hint bit when doing SMC calls
+ *
+ * Sets the SMCCC hint bit to indicate if there is live state in the SVE
+ * registers, this modifies x0 in place and should never be called from C
+ * code.
+ */
+asmlinkage unsigned long __arm_smccc_sve_check(unsigned long x0);
+
 /**
  * __arm_smccc_smc() - make SMC calls
  * @a0-a7: arguments passed in registers 0 to 7
@@ -352,6 +366,20 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 
 #endif
 
+/* nVHE hypervisor doesn't have a current thread so needs separate checks */
+#if defined(CONFIG_ARM64_SVE) && !defined(__KVM_NVHE_HYPERVISOR__)
+
+#define SMCCC_SVE_CHECK ALTERNATIVE("nop \n",  "bl __arm_smccc_sve_check \n", \
+				    ARM64_SVE)
+#define smccc_sve_clobbers "x16", "x30", "cc",
+
+#else
+
+#define SMCCC_SVE_CHECK
+#define smccc_sve_clobbers
+
+#endif
+
 #define ___count_args(_0, _1, _2, _3, _4, _5, _6, _7, _8, x, ...) x
 
 #define __count_args(...)						\
@@ -419,7 +447,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 
 #define ___constraints(count)						\
 	: __constraint_read_ ## count					\
-	: "memory"
+	: smccc_sve_clobbers "memory"
 #define __constraints(count)	___constraints(count)
 
 /*
@@ -434,7 +462,8 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 		register unsigned long r2 asm("r2");			\
 		register unsigned long r3 asm("r3"); 			\
 		__declare_args(__count_args(__VA_ARGS__), __VA_ARGS__);	\
-		asm volatile(inst "\n" :				\
+		asm volatile(SMCCC_SVE_CHECK				\
+			     inst "\n" :				\
 			     "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3)	\
 			     __constraints(__count_args(__VA_ARGS__)));	\
 		if (___res)						\
-- 
cgit v1.2.3


From 5617c9125bb66a923f3560d5739eb7f3a21c00b5 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 29 May 2021 16:52:32 +0200
Subject: clkdev: remove unused clkdev_alloc() interfaces

The last user of clkdev_alloc() and clkdev_hw_alloc() was
removed last year, so everything now calls clkdev_create()
and clkdev_hw_create() instead.

Removing the unused functions lets the compiler optimize
the remaining ones slightly better.

Fixes: e5006671acc7 ("clk: versatile: Drop the legacy IM-PD1 clock code")
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/clk/clkdev.c   | 28 ----------------------------
 include/linux/clkdev.h |  5 -----
 2 files changed, 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c
index 0f2e3fcf0f19..67f601a41023 100644
--- a/drivers/clk/clkdev.c
+++ b/drivers/clk/clkdev.c
@@ -190,34 +190,6 @@ vclkdev_create(struct clk_hw *hw, const char *con_id, const char *dev_fmt,
 	return cl;
 }
 
-struct clk_lookup * __ref
-clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...)
-{
-	struct clk_lookup *cl;
-	va_list ap;
-
-	va_start(ap, dev_fmt);
-	cl = vclkdev_alloc(__clk_get_hw(clk), con_id, dev_fmt, ap);
-	va_end(ap);
-
-	return cl;
-}
-EXPORT_SYMBOL(clkdev_alloc);
-
-struct clk_lookup *
-clkdev_hw_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt, ...)
-{
-	struct clk_lookup *cl;
-	va_list ap;
-
-	va_start(ap, dev_fmt);
-	cl = vclkdev_alloc(hw, con_id, dev_fmt, ap);
-	va_end(ap);
-
-	return cl;
-}
-EXPORT_SYMBOL(clkdev_hw_alloc);
-
 /**
  * clkdev_create - allocate and add a clkdev lookup structure
  * @clk: struct clk to associate with all clk_lookups
diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index fd06b2780a22..8a8423eb8e9a 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -30,11 +30,6 @@ struct clk_lookup {
 		.clk = c,	\
 	}
 
-struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id,
-	const char *dev_fmt, ...) __printf(3, 4);
-struct clk_lookup *clkdev_hw_alloc(struct clk_hw *hw, const char *con_id,
-	const char *dev_fmt, ...) __printf(3, 4);
-
 void clkdev_add(struct clk_lookup *cl);
 void clkdev_drop(struct clk_lookup *cl);
 
-- 
cgit v1.2.3


From 149876d96877eedce0ae3ffbd64edb56360b8926 Mon Sep 17 00:00:00 2001
From: Huilong Deng <denghuilong@cdjrlc.com>
Date: Sat, 5 Jun 2021 12:53:02 +0800
Subject: seqlock: Remove trailing semicolon in macros

Macros should not use a trailing semicolon.

Signed-off-by: Huilong Deng <denghuilong@cdjrlc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210605045302.37154-1-denghuilong@cdjrlc.com
---
 include/linux/seqlock.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index f61e34fbaaea..37ded6b8fee6 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -182,9 +182,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
 
 #define seqcount_raw_spinlock_init(s, lock)	seqcount_LOCKNAME_init(s, lock, raw_spinlock)
 #define seqcount_spinlock_init(s, lock)		seqcount_LOCKNAME_init(s, lock, spinlock)
-#define seqcount_rwlock_init(s, lock)		seqcount_LOCKNAME_init(s, lock, rwlock);
-#define seqcount_mutex_init(s, lock)		seqcount_LOCKNAME_init(s, lock, mutex);
-#define seqcount_ww_mutex_init(s, lock)		seqcount_LOCKNAME_init(s, lock, ww_mutex);
+#define seqcount_rwlock_init(s, lock)		seqcount_LOCKNAME_init(s, lock, rwlock)
+#define seqcount_mutex_init(s, lock)		seqcount_LOCKNAME_init(s, lock, mutex)
+#define seqcount_ww_mutex_init(s, lock)		seqcount_LOCKNAME_init(s, lock, ww_mutex)
 
 /*
  * SEQCOUNT_LOCKNAME()	- Instantiate seqcount_LOCKNAME_t and helpers
-- 
cgit v1.2.3


From 3958e2d0c34e18c41b60dc01832bd670a59ef70f Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 24 May 2021 12:53:39 -0700
Subject: cgroup: make per-cgroup pressure stall tracking configurable

PSI accounts stalls for each cgroup separately and aggregates it at each
level of the hierarchy. This causes additional overhead with psi_avgs_work
being called for each cgroup in the hierarchy. psi_avgs_work has been
highly optimized, however on systems with large number of cgroups the
overhead becomes noticeable.
Systems which use PSI only at the system level could avoid this overhead
if PSI can be configured to skip per-cgroup stall accounting.
Add "cgroup_disable=pressure" kernel command-line option to allow
requesting system-wide only pressure stall accounting. When set, it
keeps system-wide accounting under /proc/pressure/ but skips accounting
for individual cgroups and does not expose PSI nodes in cgroup hierarchy.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  9 +++--
 include/linux/cgroup-defs.h                     |  1 +
 include/linux/cgroup.h                          |  7 ++++
 kernel/cgroup/cgroup.c                          | 48 +++++++++++++++++++++++++
 kernel/sched/psi.c                              | 30 +++++++++-------
 5 files changed, 80 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cb89dbdedc46..653c62142f07 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -497,16 +497,21 @@
 	ccw_timeout_log	[S390]
 			See Documentation/s390/common_io.rst for details.
 
-	cgroup_disable=	[KNL] Disable a particular controller
-			Format: {name of the controller(s) to disable}
+	cgroup_disable=	[KNL] Disable a particular controller or optional feature
+			Format: {name of the controller(s) or feature(s) to disable}
 			The effects of cgroup_disable=foo are:
 			- foo isn't auto-mounted if you mount all cgroups in
 			  a single hierarchy
 			- foo isn't visible as an individually mountable
 			  subsystem
+			- if foo is an optional feature then the feature is
+			  disabled and corresponding cgroup files are not
+			  created
 			{Currently only "memory" controller deal with this and
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}
+			Specifying "pressure" disables per-cgroup pressure
+			stall information accounting feature
 
 	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
 			Format: { { controller | "all" | "named" }
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1a1f3e8faceb..e1c705fdfa7c 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -113,6 +113,7 @@ enum {
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 	CFTYPE_DEBUG		= (1 << 5),	/* create when cgroup_debug */
+	CFTYPE_PRESSURE		= (1 << 6),	/* only if pressure feature is enabled */
 
 	/* internal flags, do not use outside cgroup core proper */
 	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9047fa853dd3..2cc237e3e8b3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -676,6 +676,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return &cgrp->psi;
 }
 
+bool cgroup_psi_enabled(void);
+
 static inline void cgroup_init_kthreadd(void)
 {
 	/*
@@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return NULL;
 }
 
+static inline bool cgroup_psi_enabled(void)
+{
+	return false;
+}
+
 static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 					       struct cgroup *ancestor)
 {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 74e3cc801615..e13a37f4acab 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
 
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+	OPT_FEATURE_PRESSURE,
+#endif
+	OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+	"pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
+
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
@@ -3631,6 +3647,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	psi_trigger_replace(&of->priv, NULL);
 }
+
+bool cgroup_psi_enabled(void)
+{
+	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3955,6 +3983,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 restart:
 	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
 			continue;
 		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -4032,6 +4062,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 
 		WARN_ON(cft->ss || cft->kf_ops);
 
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
+
 		if (cft->seq_start)
 			kf_ops = &cgroup_kf_ops;
 		else
@@ -4945,6 +4978,7 @@ static struct cftype cgroup_base_files[] = {
 #ifdef CONFIG_PSI
 	{
 		.name = "io.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4952,6 +4986,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "memory.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4959,6 +4994,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cpu.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -6313,6 +6349,15 @@ static int __init cgroup_disable(char *str)
 			pr_info("Disabling %s control group subsystem\n",
 				ss->name);
 		}
+
+		for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+			if (strcmp(token, cgroup_opt_feature_names[i]))
+				continue;
+			cgroup_feature_disable_mask |= 1 << i;
+			pr_info("Disabling %s control group feature\n",
+				cgroup_opt_feature_names[i]);
+			break;
+		}
 	}
 	return 1;
 }
@@ -6611,6 +6656,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
 		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
 			continue;
 
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
+
 		if (prefix)
 			ret += snprintf(buf + ret, size - ret, "%s.", prefix);
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index cc25a3cff41f..b773cae4c24b 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -148,6 +148,7 @@
 static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
+DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
 
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
 static bool psi_enable;
@@ -211,6 +212,9 @@ void __init psi_init(void)
 		return;
 	}
 
+	if (!cgroup_psi_enabled())
+		static_branch_disable(&psi_cgroups_enabled);
+
 	psi_period = jiffies_to_nsecs(PSI_FREQ);
 	group_init(&psi_system);
 }
@@ -744,23 +748,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
 {
+	if (*iter == &psi_system)
+		return NULL;
+
 #ifdef CONFIG_CGROUPS
-	struct cgroup *cgroup = NULL;
+	if (static_branch_likely(&psi_cgroups_enabled)) {
+		struct cgroup *cgroup = NULL;
 
-	if (!*iter)
-		cgroup = task->cgroups->dfl_cgrp;
-	else if (*iter == &psi_system)
-		return NULL;
-	else
-		cgroup = cgroup_parent(*iter);
+		if (!*iter)
+			cgroup = task->cgroups->dfl_cgrp;
+		else
+			cgroup = cgroup_parent(*iter);
 
-	if (cgroup && cgroup_parent(cgroup)) {
-		*iter = cgroup;
-		return cgroup_psi(cgroup);
+		if (cgroup && cgroup_parent(cgroup)) {
+			*iter = cgroup;
+			return cgroup_psi(cgroup);
+		}
 	}
-#else
-	if (*iter)
-		return NULL;
 #endif
 	*iter = &psi_system;
 	return &psi_system;
-- 
cgit v1.2.3


From c9c9762d4d44dcb1b2ba90cfb4122dc11ceebf31 Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Mon, 7 Jun 2021 12:34:05 -0700
Subject: block: return the correct bvec when checking for gaps

After commit 07173c3ec276 ("block: enable multipage bvecs"), a bvec can
have multiple pages. But bio_will_gap() still assumes one page bvec while
checking for merging. If the pages in the bvec go across the
seg_boundary_mask, this check for merging can potentially succeed if only
the 1st page is tested, and can fail if all the pages are tested.

Later, when SCSI builds the SG list the same check for merging is done in
__blk_segment_map_sg_merge() with all the pages in the bvec tested. This
time the check may fail if the pages in bvec go across the
seg_boundary_mask (but tested okay in bio_will_gap() earlier, so those
BIOs were merged). If this check fails, we end up with a broken SG list
for drivers assuming the SG list not having offsets in intermediate pages.
This results in incorrect pages written to the disk.

Fix this by returning the multi-page bvec when testing gaps for merging.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Pavel Begunkov <asml.silence@gmail.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
Fixes: 07173c3ec276 ("block: enable multipage bvecs")
Signed-off-by: Long Li <longli@microsoft.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/1623094445-22332-1-git-send-email-longli@linuxonhyperv.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index a0b4cfdf62a4..d2b98efb5cc5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -44,9 +44,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs)
 #define bio_offset(bio)		bio_iter_offset((bio), (bio)->bi_iter)
 #define bio_iovec(bio)		bio_iter_iovec((bio), (bio)->bi_iter)
 
-#define bio_multiple_segments(bio)				\
-	((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len)
-
 #define bvec_iter_sectors(iter)	((iter).bi_size >> 9)
 #define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter)))
 
@@ -271,7 +268,7 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
 
 static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
 {
-	*bv = bio_iovec(bio);
+	*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 }
 
 static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
@@ -279,10 +276,9 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
 	struct bvec_iter iter = bio->bi_iter;
 	int idx;
 
-	if (unlikely(!bio_multiple_segments(bio))) {
-		*bv = bio_iovec(bio);
-		return;
-	}
+	bio_get_first_bvec(bio, bv);
+	if (bv->bv_len == bio->bi_iter.bi_size)
+		return;		/* this bio only has a single bvec */
 
 	bio_advance_iter(bio, &iter, iter.bi_size);
 
-- 
cgit v1.2.3


From da27a83fd6cc7780fea190e1f5c19e87019da65c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 8 Jun 2021 15:31:42 -0400
Subject: kvm: avoid speculation-based attacks from out-of-range memslot
 accesses

KVM's mechanism for accessing guest memory translates a guest physical
address (gpa) to a host virtual address using the right-shifted gpa
(also known as gfn) and a struct kvm_memory_slot.  The translation is
performed in __gfn_to_hva_memslot using the following formula:

      hva = slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE

It is expected that gfn falls within the boundaries of the guest's
physical memory.  However, a guest can access invalid physical addresses
in such a way that the gfn is invalid.

__gfn_to_hva_memslot is called from kvm_vcpu_gfn_to_hva_prot, which first
retrieves a memslot through __gfn_to_memslot.  While __gfn_to_memslot
does check that the gfn falls within the boundaries of the guest's
physical memory or not, a CPU can speculate the result of the check and
continue execution speculatively using an illegal gfn. The speculation
can result in calculating an out-of-bounds hva.  If the resulting host
virtual address is used to load another guest physical address, this
is effectively a Spectre gadget consisting of two consecutive reads,
the second of which is data dependent on the first.

Right now it's not clear if there are any cases in which this is
exploitable.  One interesting case was reported by the original author
of this patch, and involves visiting guest page tables on x86.  Right
now these are not vulnerable because the hva read goes through get_user(),
which contains an LFENCE speculation barrier.  However, there are
patches in progress for x86 uaccess.h to mask kernel addresses instead of
using LFENCE; once these land, a guest could use speculation to read
from the VMM's ring 3 address space.  Other architectures such as ARM
already use the address masking method, and would be susceptible to
this same kind of data-dependent access gadgets.  Therefore, this patch
proactively protects from these attacks by masking out-of-bounds gfns
in __gfn_to_hva_memslot, which blocks speculation of invalid hvas.

Sean Christopherson noted that this patch does not cover
kvm_read_guest_offset_cached.  This however is limited to a few bytes
past the end of the cache, and therefore it is unlikely to be useful in
the context of building a chain of data dependent accesses.

Reported-by: Artemiy Margaritov <artemiy.margaritov@gmail.com>
Co-developed-by: Artemiy Margaritov <artemiy.margaritov@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 76102efbf079..74995f0a2a3c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1185,7 +1185,15 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 static inline unsigned long
 __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+	/*
+	 * The index was checked originally in search_memslots.  To avoid
+	 * that a malicious guest builds a Spectre gadget out of e.g. page
+	 * table walks, do not let the processor speculate loads outside
+	 * the guest's registered memslots.
+	 */
+	unsigned long offset = array_index_nospec(gfn - slot->base_gfn,
+						  slot->npages);
+	return slot->userspace_addr + offset * PAGE_SIZE;
 }
 
 static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
-- 
cgit v1.2.3


From 11c7aa0ddea8611007768d3e6b58d45dc60a19e1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 7 Jun 2021 13:26:13 +0200
Subject: rq-qos: fix missed wake-ups in rq_qos_throttle try two

Commit 545fbd0775ba ("rq-qos: fix missed wake-ups in rq_qos_throttle")
tried to fix a problem that a process could be sleeping in rq_qos_wait()
without anyone to wake it up. However the fix is not complete and the
following can still happen:

CPU1 (waiter1)		CPU2 (waiter2)		CPU3 (waker)
rq_qos_wait()		rq_qos_wait()
  acquire_inflight_cb() -> fails
			  acquire_inflight_cb() -> fails

						completes IOs, inflight
						  decreased
  prepare_to_wait_exclusive()
			  prepare_to_wait_exclusive()
  has_sleeper = !wq_has_single_sleeper() -> true as there are two sleepers
			  has_sleeper = !wq_has_single_sleeper() -> true
  io_schedule()		  io_schedule()

Deadlock as now there's nobody to wakeup the two waiters. The logic
automatically blocking when there are already sleepers is really subtle
and the only way to make it work reliably is that we check whether there
are some waiters in the queue when adding ourselves there. That way, we
are guaranteed that at least the first process to enter the wait queue
will recheck the waiting condition before going to sleep and thus
guarantee forward progress.

Fixes: 545fbd0775ba ("rq-qos: fix missed wake-ups in rq_qos_throttle")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210607112613.25344-1-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-rq-qos.c   | 4 ++--
 include/linux/wait.h | 2 +-
 kernel/sched/wait.c  | 9 +++++++--
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 656460636ad3..e83af7bc7591 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -266,8 +266,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
 	if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
 		return;
 
-	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
-	has_sleeper = !wq_has_single_sleeper(&rqw->wait);
+	has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
+						 TASK_UNINTERRUPTIBLE);
 	do {
 		/* The memory barrier in set_task_state saves us here. */
 		if (data.got_token)
diff --git a/include/linux/wait.h b/include/linux/wait.h
index fe10e8570a52..6598ae35e1b5 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -1136,7 +1136,7 @@ do {										\
  * Waitqueues which are removed from the waitqueue_head at wakeup time
  */
 void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
-void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 183cc6ae68a6..76577d1642a5 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -264,17 +264,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent
 }
 EXPORT_SYMBOL(prepare_to_wait);
 
-void
+/* Returns true if we are the first waiter in the queue, false otherwise. */
+bool
 prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
 {
 	unsigned long flags;
+	bool was_empty = false;
 
 	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&wq_head->lock, flags);
-	if (list_empty(&wq_entry->entry))
+	if (list_empty(&wq_entry->entry)) {
+		was_empty = list_empty(&wq_head->head);
 		__add_wait_queue_entry_tail(wq_head, wq_entry);
+	}
 	set_current_state(state);
 	spin_unlock_irqrestore(&wq_head->lock, flags);
+	return was_empty;
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
-- 
cgit v1.2.3


From f27abde3042ab4d30d0003eaf5e6641baef94a56 Mon Sep 17 00:00:00 2001
From: Voon Weifeng <weifeng.voon@intel.com>
Date: Tue, 8 Jun 2021 11:51:57 +0800
Subject: net: pcs: add 2500BASEX support for Intel mGbE controller

XPCS IP supports 2500BASEX as PHY interface. It is configured as
autonegotiation disable to cater for PHYs that does not supports 2500BASEX
autonegotiation.

v2: Add supported link speed masking.
v3: Restructure to introduce xpcs_config_2500basex() used to configure the
    xpcs for 2.5G speeds. Added 2500BASEX specific information for
    configuration.
v4: Fix indentation error

Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>
Signed-off-by: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/pcs-xpcs.c   | 56 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pcs/pcs-xpcs.h |  1 +
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 34164437c135..98c4a3973402 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -57,9 +57,12 @@
 
 /* Clause 37 Defines */
 /* VR MII MMD registers offsets */
+#define DW_VR_MII_MMD_CTRL		0x0000
 #define DW_VR_MII_DIG_CTRL1		0x8000
 #define DW_VR_MII_AN_CTRL		0x8001
 #define DW_VR_MII_AN_INTR_STS		0x8002
+/* Enable 2.5G Mode */
+#define DW_VR_MII_DIG_CTRL1_2G5_EN	BIT(2)
 /* EEE Mode Control Register */
 #define DW_VR_MII_EEE_MCTRL0		0x8006
 #define DW_VR_MII_EEE_MCTRL1		0x800b
@@ -86,6 +89,11 @@
 #define DW_VR_MII_C37_ANSGM_SP_1000		0x2
 #define DW_VR_MII_C37_ANSGM_SP_LNKSTS		BIT(4)
 
+/* SR MII MMD Control defines */
+#define AN_CL37_EN		BIT(12)	/* Enable Clause 37 auto-nego */
+#define SGMII_SPEED_SS13	BIT(13)	/* SGMII speed along with SS6 */
+#define SGMII_SPEED_SS6		BIT(6)	/* SGMII speed along with SS13 */
+
 /* VR MII EEE Control 0 defines */
 #define DW_VR_MII_EEE_LTX_EN		BIT(0)  /* LPI Tx Enable */
 #define DW_VR_MII_EEE_LRX_EN		BIT(1)  /* LPI Rx Enable */
@@ -161,6 +169,14 @@ static const int xpcs_sgmii_features[] = {
 	__ETHTOOL_LINK_MODE_MASK_NBITS,
 };
 
+static const int xpcs_2500basex_features[] = {
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+	ETHTOOL_LINK_MODE_Autoneg_BIT,
+	ETHTOOL_LINK_MODE_2500baseX_Full_BIT,
+	ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
+	__ETHTOOL_LINK_MODE_MASK_NBITS,
+};
+
 static const phy_interface_t xpcs_usxgmii_interfaces[] = {
 	PHY_INTERFACE_MODE_USXGMII,
 };
@@ -177,11 +193,17 @@ static const phy_interface_t xpcs_sgmii_interfaces[] = {
 	PHY_INTERFACE_MODE_SGMII,
 };
 
+static const phy_interface_t xpcs_2500basex_interfaces[] = {
+	PHY_INTERFACE_MODE_2500BASEX,
+	PHY_INTERFACE_MODE_MAX,
+};
+
 enum {
 	DW_XPCS_USXGMII,
 	DW_XPCS_10GKR,
 	DW_XPCS_XLGMII,
 	DW_XPCS_SGMII,
+	DW_XPCS_2500BASEX,
 	DW_XPCS_INTERFACE_MAX,
 };
 
@@ -306,6 +328,7 @@ static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs,
 		dev = MDIO_MMD_PCS;
 		break;
 	case DW_AN_C37_SGMII:
+	case DW_2500BASEX:
 		dev = MDIO_MMD_VEND2;
 		break;
 	default:
@@ -804,6 +827,28 @@ static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret);
 }
 
+static int xpcs_config_2500basex(struct mdio_xpcs_args *xpcs)
+{
+	int ret;
+
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1);
+	if (ret < 0)
+		return ret;
+	ret |= DW_VR_MII_DIG_CTRL1_2G5_EN;
+	ret &= ~DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW;
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret);
+	if (ret < 0)
+		return ret;
+
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL);
+	if (ret < 0)
+		return ret;
+	ret &= ~AN_CL37_EN;
+	ret |= SGMII_SPEED_SS6;
+	ret &= ~SGMII_SPEED_SS13;
+	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL, ret);
+}
+
 static int xpcs_do_config(struct mdio_xpcs_args *xpcs,
 			  phy_interface_t interface, unsigned int mode)
 {
@@ -827,6 +872,11 @@ static int xpcs_do_config(struct mdio_xpcs_args *xpcs,
 		if (ret)
 			return ret;
 		break;
+	case DW_2500BASEX:
+		ret = xpcs_config_2500basex(xpcs);
+		if (ret)
+			return ret;
+		break;
 	default:
 		return -1;
 	}
@@ -1023,6 +1073,12 @@ static const struct xpcs_compat synopsys_xpcs_compat[DW_XPCS_INTERFACE_MAX] = {
 		.num_interfaces = ARRAY_SIZE(xpcs_sgmii_interfaces),
 		.an_mode = DW_AN_C37_SGMII,
 	},
+	[DW_XPCS_2500BASEX] = {
+		.supported = xpcs_2500basex_features,
+		.interface = xpcs_2500basex_interfaces,
+		.num_interfaces = ARRAY_SIZE(xpcs_2500basex_features),
+		.an_mode = DW_2500BASEX,
+	},
 };
 
 static const struct xpcs_id xpcs_id_list[] = {
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 0860a5b59f10..4d815f03b4b2 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -13,6 +13,7 @@
 /* AN mode */
 #define DW_AN_C73			1
 #define DW_AN_C37_SGMII			2
+#define DW_2500BASEX			3
 
 struct xpcs_id;
 
-- 
cgit v1.2.3


From 46682cb86a37da435e5668db98555a1de0f0448b Mon Sep 17 00:00:00 2001
From: Voon Weifeng <weifeng.voon@intel.com>
Date: Tue, 8 Jun 2021 11:51:58 +0800
Subject: net: stmmac: enable Intel mGbE 2.5Gbps link speed

The Intel mGbE supports 2.5Gbps link speed by increasing the clock rate by
2.5 times of the original rate. In this mode, the serdes/PHY operates at a
serial baud rate of 3.125 Gbps and the PCS data path and GMII interface of
the MAC operate at 312.5 MHz instead of 125 MHz.

For Intel mGbE, the overclocking of 2.5 times clock rate to support 2.5G is
only able to be configured in the BIOS during boot time. Kernel driver has
no access to modify the clock rate for 1Gbps/2.5G mode. The way to
determined the current 1G/2.5G mode is by reading a dedicated adhoc
register through mdio bus. In short, after the system boot up, it is either
in 1G mode or 2.5G mode which not able to be changed on the fly.

Compared to 1G mode, the 2.5G mode selects the 2500BASEX as PHY interface and
disables the xpcs_an_inband. This is to cater for some PHYs that only
supports 2500BASEX PHY interface with no autonegotiation.

v2: remove MAC supported link speed masking
v3: Restructure  to introduce intel_speed_mode_2500() to read serdes registers
    for max speed supported and select the appropritate configuration.
    Use max_speed to determine the supported link speed mask.

Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>
Signed-off-by: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 48 ++++++++++++++++++++++-
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h | 13 ++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |  1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  7 ++++
 include/linux/stmmac.h                            |  1 +
 5 files changed, 69 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 2ecf93c84b9d..6a9a19b0844c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -102,6 +102,22 @@ static int intel_serdes_powerup(struct net_device *ndev, void *priv_data)
 
 	serdes_phy_addr = intel_priv->mdio_adhoc_addr;
 
+	/* Set the serdes rate and the PCLK rate */
+	data = mdiobus_read(priv->mii, serdes_phy_addr,
+			    SERDES_GCR0);
+
+	data &= ~SERDES_RATE_MASK;
+	data &= ~SERDES_PCLK_MASK;
+
+	if (priv->plat->max_speed == 2500)
+		data |= SERDES_RATE_PCIE_GEN2 << SERDES_RATE_PCIE_SHIFT |
+			SERDES_PCLK_37p5MHZ << SERDES_PCLK_SHIFT;
+	else
+		data |= SERDES_RATE_PCIE_GEN1 << SERDES_RATE_PCIE_SHIFT |
+			SERDES_PCLK_70MHZ << SERDES_PCLK_SHIFT;
+
+	mdiobus_write(priv->mii, serdes_phy_addr, SERDES_GCR0, data);
+
 	/* assert clk_req */
 	data = mdiobus_read(priv->mii, serdes_phy_addr, SERDES_GCR0);
 	data |= SERDES_PLL_CLK;
@@ -230,6 +246,32 @@ static void intel_serdes_powerdown(struct net_device *ndev, void *intel_data)
 	}
 }
 
+static void intel_speed_mode_2500(struct net_device *ndev, void *intel_data)
+{
+	struct intel_priv_data *intel_priv = intel_data;
+	struct stmmac_priv *priv = netdev_priv(ndev);
+	int serdes_phy_addr = 0;
+	u32 data = 0;
+
+	serdes_phy_addr = intel_priv->mdio_adhoc_addr;
+
+	/* Determine the link speed mode: 2.5Gbps/1Gbps */
+	data = mdiobus_read(priv->mii, serdes_phy_addr,
+			    SERDES_GCR);
+
+	if (((data & SERDES_LINK_MODE_MASK) >> SERDES_LINK_MODE_SHIFT) ==
+	    SERDES_LINK_MODE_2G5) {
+		dev_info(priv->device, "Link Speed Mode: 2.5Gbps\n");
+		priv->plat->max_speed = 2500;
+		priv->plat->phy_interface = PHY_INTERFACE_MODE_2500BASEX;
+		priv->plat->mdio_bus_data->xpcs_an_inband = false;
+	} else {
+		priv->plat->max_speed = 1000;
+		priv->plat->phy_interface = PHY_INTERFACE_MODE_SGMII;
+		priv->plat->mdio_bus_data->xpcs_an_inband = true;
+	}
+}
+
 /* Program PTP Clock Frequency for different variant of
  * Intel mGBE that has slightly different GPO mapping
  */
@@ -586,7 +628,7 @@ static int ehl_sgmii_data(struct pci_dev *pdev,
 {
 	plat->bus_id = 1;
 	plat->phy_interface = PHY_INTERFACE_MODE_SGMII;
-
+	plat->speed_mode_2500 = intel_speed_mode_2500;
 	plat->serdes_powerup = intel_serdes_powerup;
 	plat->serdes_powerdown = intel_serdes_powerdown;
 
@@ -639,6 +681,7 @@ static int ehl_pse0_sgmii1g_data(struct pci_dev *pdev,
 				 struct plat_stmmacenet_data *plat)
 {
 	plat->phy_interface = PHY_INTERFACE_MODE_SGMII;
+	plat->speed_mode_2500 = intel_speed_mode_2500;
 	plat->serdes_powerup = intel_serdes_powerup;
 	plat->serdes_powerdown = intel_serdes_powerdown;
 	return ehl_pse0_common_data(pdev, plat);
@@ -677,6 +720,7 @@ static int ehl_pse1_sgmii1g_data(struct pci_dev *pdev,
 				 struct plat_stmmacenet_data *plat)
 {
 	plat->phy_interface = PHY_INTERFACE_MODE_SGMII;
+	plat->speed_mode_2500 = intel_speed_mode_2500;
 	plat->serdes_powerup = intel_serdes_powerup;
 	plat->serdes_powerdown = intel_serdes_powerdown;
 	return ehl_pse1_common_data(pdev, plat);
@@ -711,6 +755,7 @@ static int tgl_sgmii_phy0_data(struct pci_dev *pdev,
 {
 	plat->bus_id = 1;
 	plat->phy_interface = PHY_INTERFACE_MODE_SGMII;
+	plat->speed_mode_2500 = intel_speed_mode_2500;
 	plat->serdes_powerup = intel_serdes_powerup;
 	plat->serdes_powerdown = intel_serdes_powerdown;
 	return tgl_common_data(pdev, plat);
@@ -725,6 +770,7 @@ static int tgl_sgmii_phy1_data(struct pci_dev *pdev,
 {
 	plat->bus_id = 2;
 	plat->phy_interface = PHY_INTERFACE_MODE_SGMII;
+	plat->speed_mode_2500 = intel_speed_mode_2500;
 	plat->serdes_powerup = intel_serdes_powerup;
 	plat->serdes_powerdown = intel_serdes_powerdown;
 	return tgl_common_data(pdev, plat);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h
index 542acb8ce467..20d14e588044 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h
@@ -9,6 +9,7 @@
 #define POLL_DELAY_US 8
 
 /* SERDES Register */
+#define SERDES_GCR	0x0	/* Global Conguration */
 #define SERDES_GSR0	0x5	/* Global Status Reg0 */
 #define SERDES_GCR0	0xb	/* Global Configuration Reg0 */
 
@@ -17,8 +18,20 @@
 #define SERDES_PHY_RX_CLK	BIT(1)		/* PSE SGMII PHY rx clk */
 #define SERDES_RST		BIT(2)		/* Serdes Reset */
 #define SERDES_PWR_ST_MASK	GENMASK(6, 4)	/* Serdes Power state*/
+#define SERDES_RATE_MASK	GENMASK(9, 8)
+#define SERDES_PCLK_MASK	GENMASK(14, 12)	/* PCLK rate to PHY */
+#define SERDES_LINK_MODE_MASK	GENMASK(2, 1)
+#define SERDES_LINK_MODE_SHIFT	1
 #define SERDES_PWR_ST_SHIFT	4
 #define SERDES_PWR_ST_P0	0x0
 #define SERDES_PWR_ST_P3	0x3
+#define SERDES_LINK_MODE_2G5	0x3
+#define SERSED_LINK_MODE_1G	0x2
+#define SERDES_PCLK_37p5MHZ	0x0
+#define SERDES_PCLK_70MHZ	0x1
+#define SERDES_RATE_PCIE_GEN1	0x0
+#define SERDES_RATE_PCIE_GEN2	0x1
+#define SERDES_RATE_PCIE_SHIFT	8
+#define SERDES_PCLK_SHIFT	12
 
 #endif /* __DWMAC_INTEL_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index f35c03c9f91e..67ba083eb90c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -1358,6 +1358,7 @@ int dwmac4_setup(struct stmmac_priv *priv)
 	mac->link.speed10 = GMAC_CONFIG_PS;
 	mac->link.speed100 = GMAC_CONFIG_FES | GMAC_CONFIG_PS;
 	mac->link.speed1000 = 0;
+	mac->link.speed2500 = GMAC_CONFIG_FES;
 	mac->link.speed_mask = GMAC_CONFIG_FES | GMAC_CONFIG_PS;
 	mac->mii.addr = GMAC_MDIO_ADDR;
 	mac->mii.data = GMAC_MDIO_DATA;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index af406ea3dd46..1b12a2f8bfb5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -931,6 +931,10 @@ static void stmmac_validate(struct phylink_config *config,
 	if ((max_speed > 0) && (max_speed < 1000)) {
 		phylink_set(mask, 1000baseT_Full);
 		phylink_set(mask, 1000baseX_Full);
+	} else if (priv->plat->has_gmac4) {
+		if (!max_speed || max_speed >= 2500)
+			phylink_set(mac_supported, 2500baseT_Full);
+			phylink_set(mac_supported, 2500baseX_Full);
 	} else if (priv->plat->has_xgmac) {
 		if (!max_speed || (max_speed >= 2500)) {
 			phylink_set(mac_supported, 2500baseT_Full);
@@ -6993,6 +6997,9 @@ int stmmac_dvr_probe(struct device *device,
 		}
 	}
 
+	if (priv->plat->speed_mode_2500)
+		priv->plat->speed_mode_2500(ndev, priv->plat->bsp_priv);
+
 	if (priv->plat->mdio_bus_data) {
 		if (priv->plat->mdio_bus_data->has_xpcs) {
 			ret = stmmac_xpcs_setup(priv->mii);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index e55a4807e3ea..b10be3385a30 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -223,6 +223,7 @@ struct plat_stmmacenet_data {
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	int (*serdes_powerup)(struct net_device *ndev, void *priv);
 	void (*serdes_powerdown)(struct net_device *ndev, void *priv);
+	void (*speed_mode_2500)(struct net_device *ndev, void *priv);
 	void (*ptp_clk_freq_config)(void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
-- 
cgit v1.2.3


From b64d76b782264aa91c236c11c72646459b04c301 Mon Sep 17 00:00:00 2001
From: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Date: Tue, 8 Jun 2021 07:02:34 +0300
Subject: net: wwan: make WWAN_PORT_MAX meaning less surprised

It is quite unusual when some value can not be equal to a defined range
max value. Also most subsystems defines FOO_TYPE_MAX as a maximum valid
value. So turn the WAN_PORT_MAX meaning from the number of supported
port types to the maximum valid port type.

Signed-off-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Reviewed-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/wwan_core.c |  2 +-
 include/linux/wwan.h         | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index 6e8f19c71a9e..632ff86398ac 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -250,7 +250,7 @@ struct wwan_port *wwan_create_port(struct device *parent,
 	struct wwan_port *port;
 	int minor, err = -ENOMEM;
 
-	if (type >= WWAN_PORT_MAX || !ops)
+	if (type > WWAN_PORT_MAX || !ops)
 		return ERR_PTR(-EINVAL);
 
 	/* A port is always a child of a WWAN device, retrieve (allocate or
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 7216c114d758..fa33cc16d931 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -15,8 +15,10 @@
  * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control
  * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface
  * @WWAN_PORT_FIREHOSE: XML based command protocol
- * @WWAN_PORT_UNKNOWN: Unknown port type
- * @WWAN_PORT_MAX: Number of supported port types
+ *
+ * @WWAN_PORT_MAX: Highest supported port types
+ * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type
+ * @__WWAN_PORT_MAX: Internal use
  */
 enum wwan_port_type {
 	WWAN_PORT_AT,
@@ -24,8 +26,12 @@ enum wwan_port_type {
 	WWAN_PORT_QMI,
 	WWAN_PORT_QCDM,
 	WWAN_PORT_FIREHOSE,
+
+	/* Add new port types above this line */
+
+	__WWAN_PORT_MAX,
+	WWAN_PORT_MAX = __WWAN_PORT_MAX - 1,
 	WWAN_PORT_UNKNOWN,
-	WWAN_PORT_MAX = WWAN_PORT_UNKNOWN,
 };
 
 struct wwan_port;
-- 
cgit v1.2.3


From e67f325e9cd67562b761e884680c0fec03a6f404 Mon Sep 17 00:00:00 2001
From: Matthew Hagan <mnhagan88@gmail.com>
Date: Tue, 8 Jun 2021 19:59:06 +0100
Subject: net: stmmac: explicitly deassert GMAC_AHB_RESET

We are currently assuming that GMAC_AHB_RESET will already be deasserted
by the bootloader. However if this has not been done, probing of the GMAC
will fail. To remedy this we must ensure GMAC_AHB_RESET has been deasserted
prior to probing.

v2 changes:
 - remove NULL condition check for stmmac_ahb_rst in stmmac_main.c
 - unwrap dev_err() message in stmmac_main.c
 - add PTR_ERR() around plat->stmmac_ahb_rst in stmmac_platform.c

v3 changes:
 - add error pointer to dev_err() output
 - add reset_control_assert(stmmac_ahb_rst) in stmmac_dvr_remove
 - revert PTR_ERR() around plat->stmmac_ahb_rst since this is performed
   on the returned value of ret by the calling function

Signed-off-by: Matthew Hagan <mnhagan88@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     | 6 ++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 7 +++++++
 include/linux/stmmac.h                                | 1 +
 3 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index f2adff59b07d..1c881ec8cd04 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -6843,6 +6843,11 @@ int stmmac_dvr_probe(struct device *device,
 			reset_control_reset(priv->plat->stmmac_rst);
 	}
 
+	ret = reset_control_deassert(priv->plat->stmmac_ahb_rst);
+	if (ret == -ENOTSUPP)
+		dev_err(priv->device, "unable to bring out of ahb reset: %pe\n",
+			ERR_PTR(ret));
+
 	/* Init MAC and get the capabilities */
 	ret = stmmac_hw_init(priv);
 	if (ret)
@@ -7086,6 +7091,7 @@ int stmmac_dvr_remove(struct device *dev)
 	phylink_destroy(priv->phylink);
 	if (priv->plat->stmmac_rst)
 		reset_control_assert(priv->plat->stmmac_rst);
+	reset_control_assert(priv->plat->stmmac_ahb_rst);
 	pm_runtime_put(dev);
 	pm_runtime_disable(dev);
 	if (priv->hw->pcs != STMMAC_PCS_TBI &&
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 97a1fedcc9ac..d8ae58bdbbe3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -600,6 +600,13 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
 		goto error_hw_init;
 	}
 
+	plat->stmmac_ahb_rst = devm_reset_control_get_optional_shared(
+							&pdev->dev, "ahb");
+	if (IS_ERR(plat->stmmac_ahb_rst)) {
+		ret = plat->stmmac_ahb_rst;
+		goto error_hw_init;
+	}
+
 	return plat;
 
 error_hw_init:
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index b10be3385a30..3867980d1447 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -240,6 +240,7 @@ struct plat_stmmacenet_data {
 	unsigned int mult_fact_100ns;
 	s32 ptp_max_adj;
 	struct reset_control *stmmac_rst;
+	struct reset_control *stmmac_ahb_rst;
 	struct stmmac_axi *axi;
 	int has_gmac4;
 	bool has_sun8i;
-- 
cgit v1.2.3


From 0899431f95a7a695f342527548b24ffd902c68ab Mon Sep 17 00:00:00 2001
From: Dario Binacchi <dariobin@libero.it>
Date: Sun, 6 Jun 2021 22:22:53 +0200
Subject: clk: ti: add am33xx/am43xx spread spectrum clock support

The patch enables spread spectrum clocking (SSC) for MPU and LCD PLLs.
As reported by the TI spruh73x/spruhl7x RM, SSC is only supported for
the DISP/LCD and MPU PLLs on am33xx/am43xx. SSC is not supported for
DDR, PER, and CORE PLLs.

Calculating the required values and setting the registers accordingly
was taken from the set_mpu_spreadspectrum routine contained in the
arch/arm/mach-omap2/am33xx/clock_am33xx.c file of the u-boot project.

In locked condition, DPLL output clock = CLKINP *[M/N]. In case of
SSC enabled, the reference manual explains that there is a restriction
of range of M values. Since the omap2_dpll_round_rate routine attempts
to select the minimum possible N, the value of M obtained is not
guaranteed to be within the range required. With the new "ti,min-div"
parameter it is possible to increase N and consequently M to satisfy the
constraint imposed by SSC.

Signed-off-by: Dario Binacchi <dariobin@libero.it>
Reviewed-by: Tero Kristo <kristo@kernel.org>
Link: https://lore.kernel.org/r/20210606202253.31649-6-dariobin@libero.it
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/ti/dpll.c     | 39 ++++++++++++++++++++++
 drivers/clk/ti/dpll3xxx.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/clk/ti.h    | 22 ++++++++++++
 3 files changed, 146 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c
index d6f1ac5b53e1..e9f9aee936ae 100644
--- a/drivers/clk/ti/dpll.c
+++ b/drivers/clk/ti/dpll.c
@@ -290,7 +290,9 @@ static void __init of_ti_dpll_setup(struct device_node *node,
 	struct clk_init_data *init = NULL;
 	const char **parent_names = NULL;
 	struct dpll_data *dd = NULL;
+	int ssc_clk_index;
 	u8 dpll_mode = 0;
+	u32 min_div;
 
 	dd = kmemdup(ddt, sizeof(*dd), GFP_KERNEL);
 	clk_hw = kzalloc(sizeof(*clk_hw), GFP_KERNEL);
@@ -345,6 +347,27 @@ static void __init of_ti_dpll_setup(struct device_node *node,
 	if (dd->autoidle_mask) {
 		if (ti_clk_get_reg_addr(node, 3, &dd->autoidle_reg))
 			goto cleanup;
+
+		ssc_clk_index = 4;
+	} else {
+		ssc_clk_index = 3;
+	}
+
+	if (dd->ssc_deltam_int_mask && dd->ssc_deltam_frac_mask &&
+	    dd->ssc_modfreq_mant_mask && dd->ssc_modfreq_exp_mask) {
+		if (ti_clk_get_reg_addr(node, ssc_clk_index++,
+					&dd->ssc_deltam_reg))
+			goto cleanup;
+
+		if (ti_clk_get_reg_addr(node, ssc_clk_index++,
+					&dd->ssc_modfreq_reg))
+			goto cleanup;
+
+		of_property_read_u32(node, "ti,ssc-modfreq-hz",
+				     &dd->ssc_modfreq);
+		of_property_read_u32(node, "ti,ssc-deltam", &dd->ssc_deltam);
+		dd->ssc_downspread =
+			of_property_read_bool(node, "ti,ssc-downspread");
 	}
 
 	if (of_property_read_bool(node, "ti,low-power-stop"))
@@ -356,6 +379,10 @@ static void __init of_ti_dpll_setup(struct device_node *node,
 	if (of_property_read_bool(node, "ti,lock"))
 		dpll_mode |= 1 << DPLL_LOCKED;
 
+	if (!of_property_read_u32(node, "ti,min-div", &min_div) &&
+	    min_div > dd->min_divider)
+		dd->min_divider = min_div;
+
 	if (dpll_mode)
 		dd->modes = dpll_mode;
 
@@ -585,8 +612,14 @@ static void __init of_ti_am3_no_gate_dpll_setup(struct device_node *node)
 	const struct dpll_data dd = {
 		.idlest_mask = 0x1,
 		.enable_mask = 0x7,
+		.ssc_enable_mask = 0x1 << 12,
+		.ssc_downspread_mask = 0x1 << 14,
 		.mult_mask = 0x7ff << 8,
 		.div1_mask = 0x7f,
+		.ssc_deltam_int_mask = 0x3 << 18,
+		.ssc_deltam_frac_mask = 0x3ffff,
+		.ssc_modfreq_mant_mask = 0x7f,
+		.ssc_modfreq_exp_mask = 0x7 << 8,
 		.max_multiplier = 2047,
 		.max_divider = 128,
 		.min_divider = 1,
@@ -645,8 +678,14 @@ static void __init of_ti_am3_dpll_setup(struct device_node *node)
 	const struct dpll_data dd = {
 		.idlest_mask = 0x1,
 		.enable_mask = 0x7,
+		.ssc_enable_mask = 0x1 << 12,
+		.ssc_downspread_mask = 0x1 << 14,
 		.mult_mask = 0x7ff << 8,
 		.div1_mask = 0x7f,
+		.ssc_deltam_int_mask = 0x3 << 18,
+		.ssc_deltam_frac_mask = 0x3ffff,
+		.ssc_modfreq_mant_mask = 0x7f,
+		.ssc_modfreq_exp_mask = 0x7 << 8,
 		.max_multiplier = 2047,
 		.max_divider = 128,
 		.min_divider = 1,
diff --git a/drivers/clk/ti/dpll3xxx.c b/drivers/clk/ti/dpll3xxx.c
index 94d5b5fe9a2b..e32b3515f9e7 100644
--- a/drivers/clk/ti/dpll3xxx.c
+++ b/drivers/clk/ti/dpll3xxx.c
@@ -291,6 +291,88 @@ static void _lookup_sddiv(struct clk_hw_omap *clk, u8 *sd_div, u16 m, u8 n)
 	*sd_div = sd;
 }
 
+/**
+ * omap3_noncore_dpll_ssc_program - set spread-spectrum clocking registers
+ * @clk:	struct clk * of DPLL to set
+ *
+ * Enable the DPLL spread spectrum clocking if frequency modulation and
+ * frequency spreading have been set, otherwise disable it.
+ */
+static void omap3_noncore_dpll_ssc_program(struct clk_hw_omap *clk)
+{
+	struct dpll_data *dd = clk->dpll_data;
+	unsigned long ref_rate;
+	u32 v, ctrl, mod_freq_divider, exponent, mantissa;
+	u32 deltam_step, deltam_ceil;
+
+	ctrl = ti_clk_ll_ops->clk_readl(&dd->control_reg);
+
+	if (dd->ssc_modfreq && dd->ssc_deltam) {
+		ctrl |= dd->ssc_enable_mask;
+
+		if (dd->ssc_downspread)
+			ctrl |= dd->ssc_downspread_mask;
+		else
+			ctrl &= ~dd->ssc_downspread_mask;
+
+		ref_rate = clk_hw_get_rate(dd->clk_ref);
+		mod_freq_divider =
+		    (ref_rate / dd->last_rounded_n) / (4 * dd->ssc_modfreq);
+		if (dd->ssc_modfreq > (ref_rate / 70))
+			pr_warn("clock: SSC modulation frequency of DPLL %s greater than %ld\n",
+				__clk_get_name(clk->hw.clk), ref_rate / 70);
+
+		exponent = 0;
+		mantissa = mod_freq_divider;
+		while ((mantissa > 127) && (exponent < 7)) {
+			exponent++;
+			mantissa /= 2;
+		}
+		if (mantissa > 127)
+			mantissa = 127;
+
+		v = ti_clk_ll_ops->clk_readl(&dd->ssc_modfreq_reg);
+		v &= ~(dd->ssc_modfreq_mant_mask | dd->ssc_modfreq_exp_mask);
+		v |= mantissa << __ffs(dd->ssc_modfreq_mant_mask);
+		v |= exponent << __ffs(dd->ssc_modfreq_exp_mask);
+		ti_clk_ll_ops->clk_writel(v, &dd->ssc_modfreq_reg);
+
+		deltam_step = dd->last_rounded_m * dd->ssc_deltam;
+		deltam_step /= 10;
+		if (dd->ssc_downspread)
+			deltam_step /= 2;
+
+		deltam_step <<= __ffs(dd->ssc_deltam_int_mask);
+		deltam_step /= 100;
+		deltam_step /= mod_freq_divider;
+		if (deltam_step > 0xFFFFF)
+			deltam_step = 0xFFFFF;
+
+		deltam_ceil = (deltam_step & dd->ssc_deltam_int_mask) >>
+		    __ffs(dd->ssc_deltam_int_mask);
+		if (deltam_step & dd->ssc_deltam_frac_mask)
+			deltam_ceil++;
+
+		if ((dd->ssc_downspread &&
+		     ((dd->last_rounded_m - (2 * deltam_ceil)) < 20 ||
+		      dd->last_rounded_m > 2045)) ||
+		    ((dd->last_rounded_m - deltam_ceil) < 20 ||
+		     (dd->last_rounded_m + deltam_ceil) > 2045))
+			pr_warn("clock: SSC multiplier of DPLL %s is out of range\n",
+				__clk_get_name(clk->hw.clk));
+
+		v = ti_clk_ll_ops->clk_readl(&dd->ssc_deltam_reg);
+		v &= ~(dd->ssc_deltam_int_mask | dd->ssc_deltam_frac_mask);
+		v |= deltam_step << __ffs(dd->ssc_deltam_int_mask |
+					  dd->ssc_deltam_frac_mask);
+		ti_clk_ll_ops->clk_writel(v, &dd->ssc_deltam_reg);
+	} else {
+		ctrl &= ~dd->ssc_enable_mask;
+	}
+
+	ti_clk_ll_ops->clk_writel(ctrl, &dd->control_reg);
+}
+
 /**
  * omap3_noncore_dpll_program - set non-core DPLL M,N values directly
  * @clk:	struct clk * of DPLL to set
@@ -390,6 +472,9 @@ static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel)
 		ti_clk_ll_ops->clk_writel(v, &dd->control_reg);
 	}
 
+	if (dd->ssc_enable_mask)
+		omap3_noncore_dpll_ssc_program(clk);
+
 	/* We let the clock framework set the other output dividers later */
 
 	/* REVISIT: Set ramp-up delay? */
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index c62f6fa6763d..3486f20a3753 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -63,6 +63,17 @@ struct clk_omap_reg {
  * @auto_recal_bit: bitshift of the driftguard enable bit in @control_reg
  * @recal_en_bit: bitshift of the PRM_IRQENABLE_* bit for recalibration IRQs
  * @recal_st_bit: bitshift of the PRM_IRQSTATUS_* bit for recalibration IRQs
+ * @ssc_deltam_reg: register containing the DPLL SSC frequency spreading
+ * @ssc_modfreq_reg: register containing the DPLL SSC modulation frequency
+ * @ssc_modfreq_mant_mask: mask of the mantissa component in @ssc_modfreq_reg
+ * @ssc_modfreq_exp_mask: mask of the exponent component in @ssc_modfreq_reg
+ * @ssc_enable_mask: mask of the DPLL SSC enable bit in @control_reg
+ * @ssc_downspread_mask: mask of the DPLL SSC low frequency only bit in
+ *                       @control_reg
+ * @ssc_modfreq: the DPLL SSC frequency modulation in kHz
+ * @ssc_deltam: the DPLL SSC frequency spreading in permille (10th of percent)
+ * @ssc_downspread: require the only low frequency spread of the DPLL in SSC
+ *                   mode
  * @flags: DPLL type/features (see below)
  *
  * Possible values for @flags:
@@ -110,6 +121,17 @@ struct dpll_data {
 	u8			auto_recal_bit;
 	u8			recal_en_bit;
 	u8			recal_st_bit;
+	struct clk_omap_reg	ssc_deltam_reg;
+	struct clk_omap_reg	ssc_modfreq_reg;
+	u32			ssc_deltam_int_mask;
+	u32			ssc_deltam_frac_mask;
+	u32			ssc_modfreq_mant_mask;
+	u32			ssc_modfreq_exp_mask;
+	u32                     ssc_enable_mask;
+	u32                     ssc_downspread_mask;
+	u32                     ssc_modfreq;
+	u32                     ssc_deltam;
+	bool                    ssc_downspread;
 	u8			flags;
 };
 
-- 
cgit v1.2.3


From 4422829e8053068e0225e4d0ef42dc41ea7c9ef5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jun 2021 01:49:13 -0400
Subject: kvm: fix previous commit for 32-bit builds

array_index_nospec does not work for uint64_t on 32-bit builds.
However, the size of a memory slot must be less than 20 bits wide
on those system, since the memory slot must fit in the user
address space.  So just store it in an unsigned long.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 74995f0a2a3c..8583ed3ff344 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1191,8 +1191,8 @@ __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 	 * table walks, do not let the processor speculate loads outside
 	 * the guest's registered memslots.
 	 */
-	unsigned long offset = array_index_nospec(gfn - slot->base_gfn,
-						  slot->npages);
+	unsigned long offset = gfn - slot->base_gfn;
+	offset = array_index_nospec(offset, slot->npages);
 	return slot->userspace_addr + offset * PAGE_SIZE;
 }
 
-- 
cgit v1.2.3


From 4ccf359849ce709f4bf0214b4b5b8b6891d38770 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 9 Jun 2021 09:19:18 +0200
Subject: spi: remove spi_set_cs_timing()

No one seems to be using this global and exported function, so remove it
as it is no longer needed.

Cc: Mark Brown <broonie@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20210609071918.2852069-1-gregkh@linuxfoundation.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 73 -------------------------------------------------
 include/linux/spi/spi.h |  5 ----
 2 files changed, 78 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index ac892cc83171..a0a232669dc1 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3472,79 +3472,6 @@ int spi_setup(struct spi_device *spi)
 }
 EXPORT_SYMBOL_GPL(spi_setup);
 
-/**
- * spi_set_cs_timing - configure CS setup, hold, and inactive delays
- * @spi: the device that requires specific CS timing configuration
- * @setup: CS setup time specified via @spi_delay
- * @hold: CS hold time specified via @spi_delay
- * @inactive: CS inactive delay between transfers specified via @spi_delay
- *
- * Return: zero on success, else a negative error code.
- */
-int spi_set_cs_timing(struct spi_device *spi, struct spi_delay *setup,
-		      struct spi_delay *hold, struct spi_delay *inactive)
-{
-	struct device *parent = spi->controller->dev.parent;
-	size_t len;
-	int status;
-
-	if (spi->controller->set_cs_timing &&
-	    !(spi->cs_gpiod || gpio_is_valid(spi->cs_gpio))) {
-		mutex_lock(&spi->controller->io_mutex);
-
-		if (spi->controller->auto_runtime_pm) {
-			status = pm_runtime_get_sync(parent);
-			if (status < 0) {
-				mutex_unlock(&spi->controller->io_mutex);
-				pm_runtime_put_noidle(parent);
-				dev_err(&spi->controller->dev, "Failed to power device: %d\n",
-					status);
-				return status;
-			}
-
-			status = spi->controller->set_cs_timing(spi, setup,
-								hold, inactive);
-			pm_runtime_mark_last_busy(parent);
-			pm_runtime_put_autosuspend(parent);
-		} else {
-			status = spi->controller->set_cs_timing(spi, setup, hold,
-							      inactive);
-		}
-
-		mutex_unlock(&spi->controller->io_mutex);
-		return status;
-	}
-
-	if ((setup && setup->unit == SPI_DELAY_UNIT_SCK) ||
-	    (hold && hold->unit == SPI_DELAY_UNIT_SCK) ||
-	    (inactive && inactive->unit == SPI_DELAY_UNIT_SCK)) {
-		dev_err(&spi->dev,
-			"Clock-cycle delays for CS not supported in SW mode\n");
-		return -ENOTSUPP;
-	}
-
-	len = sizeof(struct spi_delay);
-
-	/* copy delays to controller */
-	if (setup)
-		memcpy(&spi->controller->cs_setup, setup, len);
-	else
-		memset(&spi->controller->cs_setup, 0, len);
-
-	if (hold)
-		memcpy(&spi->controller->cs_hold, hold, len);
-	else
-		memset(&spi->controller->cs_hold, 0, len);
-
-	if (inactive)
-		memcpy(&spi->controller->cs_inactive, inactive, len);
-	else
-		memset(&spi->controller->cs_inactive, 0, len);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(spi_set_cs_timing);
-
 static int _spi_xfer_word_delay_update(struct spi_transfer *xfer,
 				       struct spi_device *spi)
 {
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 74239d65c7fd..f924160e995f 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -1108,11 +1108,6 @@ static inline void spi_message_free(struct spi_message *m)
 	kfree(m);
 }
 
-extern int spi_set_cs_timing(struct spi_device *spi,
-			     struct spi_delay *setup,
-			     struct spi_delay *hold,
-			     struct spi_delay *inactive);
-
 extern int spi_setup(struct spi_device *spi);
 extern int spi_async(struct spi_device *spi, struct spi_message *message);
 extern int spi_async_locked(struct spi_device *spi,
-- 
cgit v1.2.3


From 48a74b1147f7db4623eaed591cc01eb740b871c0 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 9 Jun 2021 13:28:06 +0200
Subject: reset: Add compile-test stubs

Add stubs for the reset controller registration functions to allow
building reset controller provider drivers with the COMPILE_TEST
Kconfig option enabled.

Reported-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Suggested-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
Link: https://lore.kernel.org/r/20210609112806.3565057-3-thierry.reding@gmail.com
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 include/linux/reset-controller.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h
index ec35814e0bbb..0fa4f60e1186 100644
--- a/include/linux/reset-controller.h
+++ b/include/linux/reset-controller.h
@@ -79,6 +79,7 @@ struct reset_controller_dev {
 	unsigned int nr_resets;
 };
 
+#if IS_ENABLED(CONFIG_RESET_CONTROLLER)
 int reset_controller_register(struct reset_controller_dev *rcdev);
 void reset_controller_unregister(struct reset_controller_dev *rcdev);
 
@@ -88,5 +89,26 @@ int devm_reset_controller_register(struct device *dev,
 
 void reset_controller_add_lookup(struct reset_control_lookup *lookup,
 				 unsigned int num_entries);
+#else
+static inline int reset_controller_register(struct reset_controller_dev *rcdev)
+{
+	return 0;
+}
+
+static inline void reset_controller_unregister(struct reset_controller_dev *rcdev)
+{
+}
+
+static inline int devm_reset_controller_register(struct device *dev,
+						 struct reset_controller_dev *rcdev)
+{
+	return 0;
+}
+
+static inline void reset_controller_add_lookup(struct reset_control_lookup *lookup,
+					       unsigned int num_entries)
+{
+}
+#endif
 
 #endif
-- 
cgit v1.2.3


From a69008475fc565cec5a760f1997f326773c84aac Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Mon, 7 Jun 2021 18:48:53 +0800
Subject: vt: vt_kern.h, remove the repeated declaration

Function 'vt_set_led_state' is declared twice, so remove the
repeated declaration.

Cc: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Link: https://lore.kernel.org/r/1623062933-52943-1-git-send-email-zhangshaokun@hisilicon.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/vt_kern.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 94e7a315479c..0da94a6dee15 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -166,7 +166,6 @@ int vt_get_kbd_mode_bit(int console, int bit);
 void vt_set_kbd_mode_bit(int console, int bit);
 void vt_clr_kbd_mode_bit(int console, int bit);
 void vt_set_led_state(int console, int leds);
-void vt_set_led_state(int console, int leds);
 void vt_kbd_con_start(int console);
 void vt_kbd_con_stop(int console);
 
-- 
cgit v1.2.3


From e7555cf6c263d95d2bb2bddb5bb57c240f0d608a Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Tue, 8 Jun 2021 14:23:45 -0700
Subject: fpga: bridge: change FPGA indirect article to an

Change use of 'a fpga' to 'an fpga'

Signed-off-by: Tom Rix <trix@redhat.com>
Link: https://lore.kernel.org/r/20210608212350.3029742-8-trix@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fpga/fpga-bridge.c       | 22 +++++++++++-----------
 include/linux/fpga/fpga-bridge.h |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/fpga/fpga-bridge.c b/drivers/fpga/fpga-bridge.c
index e9266b2a357f..eeb71e38efa7 100644
--- a/drivers/fpga/fpga-bridge.c
+++ b/drivers/fpga/fpga-bridge.c
@@ -85,14 +85,14 @@ err_dev:
 }
 
 /**
- * of_fpga_bridge_get - get an exclusive reference to a fpga bridge
+ * of_fpga_bridge_get - get an exclusive reference to an fpga bridge
  *
- * @np: node pointer of a FPGA bridge
+ * @np: node pointer of an FPGA bridge
  * @info: fpga image specific information
  *
  * Return fpga_bridge struct if successful.
  * Return -EBUSY if someone already has a reference to the bridge.
- * Return -ENODEV if @np is not a FPGA Bridge.
+ * Return -ENODEV if @np is not an FPGA Bridge.
  */
 struct fpga_bridge *of_fpga_bridge_get(struct device_node *np,
 				       struct fpga_image_info *info)
@@ -113,11 +113,11 @@ static int fpga_bridge_dev_match(struct device *dev, const void *data)
 }
 
 /**
- * fpga_bridge_get - get an exclusive reference to a fpga bridge
+ * fpga_bridge_get - get an exclusive reference to an fpga bridge
  * @dev:	parent device that fpga bridge was registered with
  * @info:	fpga manager info
  *
- * Given a device, get an exclusive reference to a fpga bridge.
+ * Given a device, get an exclusive reference to an fpga bridge.
  *
  * Return: fpga bridge struct or IS_ERR() condition containing error code.
  */
@@ -224,7 +224,7 @@ EXPORT_SYMBOL_GPL(fpga_bridges_put);
 /**
  * of_fpga_bridge_get_to_list - get a bridge, add it to a list
  *
- * @np: node pointer of a FPGA bridge
+ * @np: node pointer of an FPGA bridge
  * @info: fpga image specific information
  * @bridge_list: list of FPGA bridges
  *
@@ -373,7 +373,7 @@ error_kfree:
 EXPORT_SYMBOL_GPL(fpga_bridge_create);
 
 /**
- * fpga_bridge_free - free a fpga bridge created by fpga_bridge_create()
+ * fpga_bridge_free - free an fpga bridge created by fpga_bridge_create()
  * @bridge:	FPGA bridge struct
  */
 void fpga_bridge_free(struct fpga_bridge *bridge)
@@ -397,7 +397,7 @@ static void devm_fpga_bridge_release(struct device *dev, void *res)
  * @br_ops:	pointer to structure of fpga bridge ops
  * @priv:	FPGA bridge private data
  *
- * This function is intended for use in a FPGA bridge driver's probe function.
+ * This function is intended for use in an FPGA bridge driver's probe function.
  * After the bridge driver creates the struct with devm_fpga_bridge_create(), it
  * should register the bridge with fpga_bridge_register().  The bridge driver's
  * remove function should call fpga_bridge_unregister().  The bridge struct
@@ -430,7 +430,7 @@ struct fpga_bridge
 EXPORT_SYMBOL_GPL(devm_fpga_bridge_create);
 
 /**
- * fpga_bridge_register - register a FPGA bridge
+ * fpga_bridge_register - register an FPGA bridge
  *
  * @bridge: FPGA bridge struct
  *
@@ -454,11 +454,11 @@ int fpga_bridge_register(struct fpga_bridge *bridge)
 EXPORT_SYMBOL_GPL(fpga_bridge_register);
 
 /**
- * fpga_bridge_unregister - unregister a FPGA bridge
+ * fpga_bridge_unregister - unregister an FPGA bridge
  *
  * @bridge: FPGA bridge struct
  *
- * This function is intended for use in a FPGA bridge driver's remove function.
+ * This function is intended for use in an FPGA bridge driver's remove function.
  */
 void fpga_bridge_unregister(struct fpga_bridge *bridge)
 {
diff --git a/include/linux/fpga/fpga-bridge.h b/include/linux/fpga/fpga-bridge.h
index 817600a32c93..6c3c28806ff1 100644
--- a/include/linux/fpga/fpga-bridge.h
+++ b/include/linux/fpga/fpga-bridge.h
@@ -11,7 +11,7 @@ struct fpga_bridge;
 /**
  * struct fpga_bridge_ops - ops for low level FPGA bridge drivers
  * @enable_show: returns the FPGA bridge's status
- * @enable_set: set a FPGA bridge as enabled or disabled
+ * @enable_set: set an FPGA bridge as enabled or disabled
  * @fpga_bridge_remove: set FPGA into a specific state during driver remove
  * @groups: optional attribute groups.
  */
-- 
cgit v1.2.3


From 895ec9c09aa77e9f0129576995cb21191d3958f1 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Tue, 8 Jun 2021 14:23:46 -0700
Subject: fpga-mgr: change FPGA indirect article to an

Change use of 'a fpga' to 'an fpga'

Signed-off-by: Tom Rix <trix@redhat.com>
Link: https://lore.kernel.org/r/20210608212350.3029742-9-trix@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fpga/fpga-mgr.c       | 22 +++++++++++-----------
 include/linux/fpga/fpga-mgr.h |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c
index b85bc47c91a9..ae21202105af 100644
--- a/drivers/fpga/fpga-mgr.c
+++ b/drivers/fpga/fpga-mgr.c
@@ -26,7 +26,7 @@ struct fpga_mgr_devres {
 };
 
 /**
- * fpga_image_info_alloc - Allocate a FPGA image info struct
+ * fpga_image_info_alloc - Allocate an FPGA image info struct
  * @dev: owning device
  *
  * Return: struct fpga_image_info or NULL
@@ -50,7 +50,7 @@ struct fpga_image_info *fpga_image_info_alloc(struct device *dev)
 EXPORT_SYMBOL_GPL(fpga_image_info_alloc);
 
 /**
- * fpga_image_info_free - Free a FPGA image info struct
+ * fpga_image_info_free - Free an FPGA image info struct
  * @info: FPGA image info struct to free
  */
 void fpga_image_info_free(struct fpga_image_info *info)
@@ -470,7 +470,7 @@ static int fpga_mgr_dev_match(struct device *dev, const void *data)
 }
 
 /**
- * fpga_mgr_get - Given a device, get a reference to a fpga mgr.
+ * fpga_mgr_get - Given a device, get a reference to an fpga mgr.
  * @dev:	parent device that fpga mgr was registered with
  *
  * Return: fpga manager struct or IS_ERR() condition containing error code.
@@ -487,7 +487,7 @@ struct fpga_manager *fpga_mgr_get(struct device *dev)
 EXPORT_SYMBOL_GPL(fpga_mgr_get);
 
 /**
- * of_fpga_mgr_get - Given a device node, get a reference to a fpga mgr.
+ * of_fpga_mgr_get - Given a device node, get a reference to an fpga mgr.
  *
  * @node:	device node
  *
@@ -506,7 +506,7 @@ struct fpga_manager *of_fpga_mgr_get(struct device_node *node)
 EXPORT_SYMBOL_GPL(of_fpga_mgr_get);
 
 /**
- * fpga_mgr_put - release a reference to a fpga manager
+ * fpga_mgr_put - release a reference to an fpga manager
  * @mgr:	fpga manager structure
  */
 void fpga_mgr_put(struct fpga_manager *mgr)
@@ -550,7 +550,7 @@ void fpga_mgr_unlock(struct fpga_manager *mgr)
 EXPORT_SYMBOL_GPL(fpga_mgr_unlock);
 
 /**
- * fpga_mgr_create - create and initialize a FPGA manager struct
+ * fpga_mgr_create - create and initialize an FPGA manager struct
  * @dev:	fpga manager device from pdev
  * @name:	fpga manager name
  * @mops:	pointer to structure of fpga manager ops
@@ -617,7 +617,7 @@ error_kfree:
 EXPORT_SYMBOL_GPL(fpga_mgr_create);
 
 /**
- * fpga_mgr_free - free a FPGA manager created with fpga_mgr_create()
+ * fpga_mgr_free - free an FPGA manager created with fpga_mgr_create()
  * @mgr:	fpga manager struct
  */
 void fpga_mgr_free(struct fpga_manager *mgr)
@@ -641,7 +641,7 @@ static void devm_fpga_mgr_release(struct device *dev, void *res)
  * @mops:	pointer to structure of fpga manager ops
  * @priv:	fpga manager private data
  *
- * This function is intended for use in a FPGA manager driver's probe function.
+ * This function is intended for use in an FPGA manager driver's probe function.
  * After the manager driver creates the manager struct with
  * devm_fpga_mgr_create(), it should register it with fpga_mgr_register().  The
  * manager driver's remove function should call fpga_mgr_unregister().  The
@@ -674,7 +674,7 @@ struct fpga_manager *devm_fpga_mgr_create(struct device *dev, const char *name,
 EXPORT_SYMBOL_GPL(devm_fpga_mgr_create);
 
 /**
- * fpga_mgr_register - register a FPGA manager
+ * fpga_mgr_register - register an FPGA manager
  * @mgr: fpga manager struct
  *
  * Return: 0 on success, negative error code otherwise.
@@ -706,10 +706,10 @@ error_device:
 EXPORT_SYMBOL_GPL(fpga_mgr_register);
 
 /**
- * fpga_mgr_unregister - unregister a FPGA manager
+ * fpga_mgr_unregister - unregister an FPGA manager
  * @mgr: fpga manager struct
  *
- * This function is intended for use in a FPGA manager driver's remove function.
+ * This function is intended for use in an FPGA manager driver's remove function.
  */
 void fpga_mgr_unregister(struct fpga_manager *mgr)
 {
diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h
index 2bc3030a69e5..ec2cd8bfceb0 100644
--- a/include/linux/fpga/fpga-mgr.h
+++ b/include/linux/fpga/fpga-mgr.h
@@ -75,7 +75,7 @@ enum fpga_mgr_states {
 #define FPGA_MGR_COMPRESSED_BITSTREAM	BIT(4)
 
 /**
- * struct fpga_image_info - information specific to a FPGA image
+ * struct fpga_image_info - information specific to an FPGA image
  * @flags: boolean flags as defined above
  * @enable_timeout_us: maximum time to enable traffic through bridge (uSec)
  * @disable_timeout_us: maximum time to disable traffic through bridge (uSec)
-- 
cgit v1.2.3


From 3df4fce739e2b263120f528c5e0fe6b2f8937b5b Mon Sep 17 00:00:00 2001
From: Ricky Wu <ricky_wu@realtek.com>
Date: Mon, 7 Jun 2021 18:16:34 +0800
Subject: misc: rtsx: separate aspm mode into MODE_REG and MODE_CFG

aspm (Active State Power Management)
rtsx_comm_set_aspm: this function is for driver to make sure
not enter power saving when processing of init and card_detcct
ASPM_MODE_CFG: 8411 5209 5227 5229 5249 5250
Change back to use original way to control aspm
ASPM_MODE_REG: 5227A 524A 5250A 5260 5261 5228
Keep the new way to control aspm

Fixes: 121e9c6b5c4c ("misc: rtsx: modify and fix init_hw function")
Reported-by: Chris Chiu <chris.chiu@canonical.com>
Tested-by: Gordon Lack <gordon.lack@dsl.pipex.com>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
Link: https://lore.kernel.org/r/20210607101634.4948-1-ricky_wu@realtek.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/cardreader/rtl8411.c  |  1 +
 drivers/misc/cardreader/rts5209.c  |  1 +
 drivers/misc/cardreader/rts5227.c  |  2 ++
 drivers/misc/cardreader/rts5228.c  |  1 +
 drivers/misc/cardreader/rts5229.c  |  1 +
 drivers/misc/cardreader/rts5249.c  |  3 +++
 drivers/misc/cardreader/rts5260.c  |  1 +
 drivers/misc/cardreader/rts5261.c  |  1 +
 drivers/misc/cardreader/rtsx_pcr.c | 44 +++++++++++++++++++++++++++-----------
 include/linux/rtsx_pci.h           |  2 ++
 10 files changed, 44 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/cardreader/rtl8411.c b/drivers/misc/cardreader/rtl8411.c
index a07674ed0596..4c5621b17a6f 100644
--- a/drivers/misc/cardreader/rtl8411.c
+++ b/drivers/misc/cardreader/rtl8411.c
@@ -468,6 +468,7 @@ static void rtl8411_init_common_params(struct rtsx_pcr *pcr)
 	pcr->sd30_drive_sel_1v8 = DRIVER_TYPE_B;
 	pcr->sd30_drive_sel_3v3 = DRIVER_TYPE_D;
 	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_CFG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(23, 7, 14);
 	pcr->rx_initial_phase = SET_CLOCK_PHASE(4, 3, 10);
 	pcr->ic_version = rtl8411_get_ic_version(pcr);
diff --git a/drivers/misc/cardreader/rts5209.c b/drivers/misc/cardreader/rts5209.c
index 39a6a7ecc32e..29f5414072bf 100644
--- a/drivers/misc/cardreader/rts5209.c
+++ b/drivers/misc/cardreader/rts5209.c
@@ -255,6 +255,7 @@ void rts5209_init_params(struct rtsx_pcr *pcr)
 	pcr->sd30_drive_sel_1v8 = DRIVER_TYPE_B;
 	pcr->sd30_drive_sel_3v3 = DRIVER_TYPE_D;
 	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_CFG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(27, 27, 16);
 	pcr->rx_initial_phase = SET_CLOCK_PHASE(24, 6, 5);
 
diff --git a/drivers/misc/cardreader/rts5227.c b/drivers/misc/cardreader/rts5227.c
index 8200af22b529..4bcfbc9afbac 100644
--- a/drivers/misc/cardreader/rts5227.c
+++ b/drivers/misc/cardreader/rts5227.c
@@ -358,6 +358,7 @@ void rts5227_init_params(struct rtsx_pcr *pcr)
 	pcr->sd30_drive_sel_1v8 = CFG_DRIVER_TYPE_B;
 	pcr->sd30_drive_sel_3v3 = CFG_DRIVER_TYPE_B;
 	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_CFG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(27, 27, 15);
 	pcr->rx_initial_phase = SET_CLOCK_PHASE(30, 7, 7);
 
@@ -483,6 +484,7 @@ void rts522a_init_params(struct rtsx_pcr *pcr)
 
 	rts5227_init_params(pcr);
 	pcr->ops = &rts522a_pcr_ops;
+	pcr->aspm_mode = ASPM_MODE_REG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(20, 20, 11);
 	pcr->reg_pm_ctrl3 = RTS522A_PM_CTRL3;
 
diff --git a/drivers/misc/cardreader/rts5228.c b/drivers/misc/cardreader/rts5228.c
index 781a86def59a..ffc128278613 100644
--- a/drivers/misc/cardreader/rts5228.c
+++ b/drivers/misc/cardreader/rts5228.c
@@ -718,6 +718,7 @@ void rts5228_init_params(struct rtsx_pcr *pcr)
 	pcr->sd30_drive_sel_1v8 = CFG_DRIVER_TYPE_B;
 	pcr->sd30_drive_sel_3v3 = CFG_DRIVER_TYPE_B;
 	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_REG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(28, 27, 11);
 	pcr->rx_initial_phase = SET_CLOCK_PHASE(24, 6, 5);
 
diff --git a/drivers/misc/cardreader/rts5229.c b/drivers/misc/cardreader/rts5229.c
index 89e6f124ca5c..c748eaf1ec1f 100644
--- a/drivers/misc/cardreader/rts5229.c
+++ b/drivers/misc/cardreader/rts5229.c
@@ -246,6 +246,7 @@ void rts5229_init_params(struct rtsx_pcr *pcr)
 	pcr->sd30_drive_sel_1v8 = DRIVER_TYPE_B;
 	pcr->sd30_drive_sel_3v3 = DRIVER_TYPE_D;
 	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_CFG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(27, 27, 15);
 	pcr->rx_initial_phase = SET_CLOCK_PHASE(30, 6, 6);
 
diff --git a/drivers/misc/cardreader/rts5249.c b/drivers/misc/cardreader/rts5249.c
index b2676e7f5027..53f3a1f45c4a 100644
--- a/drivers/misc/cardreader/rts5249.c
+++ b/drivers/misc/cardreader/rts5249.c
@@ -566,6 +566,7 @@ void rts5249_init_params(struct rtsx_pcr *pcr)
 	pcr->sd30_drive_sel_1v8 = CFG_DRIVER_TYPE_B;
 	pcr->sd30_drive_sel_3v3 = CFG_DRIVER_TYPE_B;
 	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_CFG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(1, 29, 16);
 	pcr->rx_initial_phase = SET_CLOCK_PHASE(24, 6, 5);
 
@@ -729,6 +730,7 @@ static const struct pcr_ops rts524a_pcr_ops = {
 void rts524a_init_params(struct rtsx_pcr *pcr)
 {
 	rts5249_init_params(pcr);
+	pcr->aspm_mode = ASPM_MODE_REG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(27, 29, 11);
 	pcr->option.ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5250_DEF;
 	pcr->option.ltr_l1off_snooze_sspwrgate =
@@ -845,6 +847,7 @@ static const struct pcr_ops rts525a_pcr_ops = {
 void rts525a_init_params(struct rtsx_pcr *pcr)
 {
 	rts5249_init_params(pcr);
+	pcr->aspm_mode = ASPM_MODE_REG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(25, 29, 11);
 	pcr->option.ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5250_DEF;
 	pcr->option.ltr_l1off_snooze_sspwrgate =
diff --git a/drivers/misc/cardreader/rts5260.c b/drivers/misc/cardreader/rts5260.c
index 080a7d67a8e1..9b42b20a3e5a 100644
--- a/drivers/misc/cardreader/rts5260.c
+++ b/drivers/misc/cardreader/rts5260.c
@@ -628,6 +628,7 @@ void rts5260_init_params(struct rtsx_pcr *pcr)
 	pcr->sd30_drive_sel_1v8 = CFG_DRIVER_TYPE_B;
 	pcr->sd30_drive_sel_3v3 = CFG_DRIVER_TYPE_B;
 	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_REG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(27, 29, 11);
 	pcr->rx_initial_phase = SET_CLOCK_PHASE(24, 6, 5);
 
diff --git a/drivers/misc/cardreader/rts5261.c b/drivers/misc/cardreader/rts5261.c
index 6c64dade8e1a..1fd4e0e50730 100644
--- a/drivers/misc/cardreader/rts5261.c
+++ b/drivers/misc/cardreader/rts5261.c
@@ -783,6 +783,7 @@ void rts5261_init_params(struct rtsx_pcr *pcr)
 	pcr->sd30_drive_sel_1v8 = 0x00;
 	pcr->sd30_drive_sel_3v3 = 0x00;
 	pcr->aspm_en = ASPM_L1_EN;
+	pcr->aspm_mode = ASPM_MODE_REG;
 	pcr->tx_initial_phase = SET_CLOCK_PHASE(27, 27, 11);
 	pcr->rx_initial_phase = SET_CLOCK_PHASE(24, 6, 5);
 
diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c
index 273311184669..baf83594a01d 100644
--- a/drivers/misc/cardreader/rtsx_pcr.c
+++ b/drivers/misc/cardreader/rtsx_pcr.c
@@ -85,12 +85,18 @@ static void rtsx_comm_set_aspm(struct rtsx_pcr *pcr, bool enable)
 	if (pcr->aspm_enabled == enable)
 		return;
 
-	if (pcr->aspm_en & 0x02)
-		rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, FORCE_ASPM_CTL0 |
-			FORCE_ASPM_CTL1, enable ? 0 : FORCE_ASPM_CTL0 | FORCE_ASPM_CTL1);
-	else
-		rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, FORCE_ASPM_CTL0 |
-			FORCE_ASPM_CTL1, FORCE_ASPM_CTL0 | FORCE_ASPM_CTL1);
+	if (pcr->aspm_mode == ASPM_MODE_CFG) {
+		pcie_capability_clear_and_set_word(pcr->pci, PCI_EXP_LNKCTL,
+						PCI_EXP_LNKCTL_ASPMC,
+						enable ? pcr->aspm_en : 0);
+	} else if (pcr->aspm_mode == ASPM_MODE_REG) {
+		if (pcr->aspm_en & 0x02)
+			rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, FORCE_ASPM_CTL0 |
+				FORCE_ASPM_CTL1, enable ? 0 : FORCE_ASPM_CTL0 | FORCE_ASPM_CTL1);
+		else
+			rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, FORCE_ASPM_CTL0 |
+				FORCE_ASPM_CTL1, FORCE_ASPM_CTL0 | FORCE_ASPM_CTL1);
+	}
 
 	if (!enable && (pcr->aspm_en & 0x02))
 		mdelay(10);
@@ -1394,7 +1400,8 @@ static int rtsx_pci_init_hw(struct rtsx_pcr *pcr)
 			return err;
 	}
 
-	rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0x30, 0x30);
+	if (pcr->aspm_mode == ASPM_MODE_REG)
+		rtsx_pci_write_register(pcr, ASPM_FORCE_CTL, 0x30, 0x30);
 
 	/* No CD interrupt if probing driver with card inserted.
 	 * So we need to initialize pcr->card_exist here.
@@ -1410,6 +1417,8 @@ static int rtsx_pci_init_hw(struct rtsx_pcr *pcr)
 static int rtsx_pci_init_chip(struct rtsx_pcr *pcr)
 {
 	int err;
+	u16 cfg_val;
+	u8 val;
 
 	spin_lock_init(&pcr->lock);
 	mutex_init(&pcr->pcr_mutex);
@@ -1477,6 +1486,21 @@ static int rtsx_pci_init_chip(struct rtsx_pcr *pcr)
 	if (!pcr->slots)
 		return -ENOMEM;
 
+	if (pcr->aspm_mode == ASPM_MODE_CFG) {
+		pcie_capability_read_word(pcr->pci, PCI_EXP_LNKCTL, &cfg_val);
+		if (cfg_val & PCI_EXP_LNKCTL_ASPM_L1)
+			pcr->aspm_enabled = true;
+		else
+			pcr->aspm_enabled = false;
+
+	} else if (pcr->aspm_mode == ASPM_MODE_REG) {
+		rtsx_pci_read_register(pcr, ASPM_FORCE_CTL, &val);
+		if (val & FORCE_ASPM_CTL0 && val & FORCE_ASPM_CTL1)
+			pcr->aspm_enabled = false;
+		else
+			pcr->aspm_enabled = true;
+	}
+
 	if (pcr->ops->fetch_vendor_settings)
 		pcr->ops->fetch_vendor_settings(pcr);
 
@@ -1506,7 +1530,6 @@ static int rtsx_pci_probe(struct pci_dev *pcidev,
 	struct pcr_handle *handle;
 	u32 base, len;
 	int ret, i, bar = 0;
-	u8 val;
 
 	dev_dbg(&(pcidev->dev),
 		": Realtek PCI-E Card Reader found at %s [%04x:%04x] (rev %x)\n",
@@ -1572,11 +1595,6 @@ static int rtsx_pci_probe(struct pci_dev *pcidev,
 	pcr->host_cmds_addr = pcr->rtsx_resv_buf_addr;
 	pcr->host_sg_tbl_ptr = pcr->rtsx_resv_buf + HOST_CMDS_BUF_LEN;
 	pcr->host_sg_tbl_addr = pcr->rtsx_resv_buf_addr + HOST_CMDS_BUF_LEN;
-	rtsx_pci_read_register(pcr, ASPM_FORCE_CTL, &val);
-	if (val & FORCE_ASPM_CTL0 && val & FORCE_ASPM_CTL1)
-		pcr->aspm_enabled = false;
-	else
-		pcr->aspm_enabled = true;
 	pcr->card_inserted = 0;
 	pcr->card_removed = 0;
 	INIT_DELAYED_WORK(&pcr->carddet_work, rtsx_pci_card_detect);
diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h
index 6f155f99aa16..4ab7bfc675f1 100644
--- a/include/linux/rtsx_pci.h
+++ b/include/linux/rtsx_pci.h
@@ -1109,6 +1109,7 @@ struct pcr_ops {
 };
 
 enum PDEV_STAT  {PDEV_STAT_IDLE, PDEV_STAT_RUN};
+enum ASPM_MODE  {ASPM_MODE_CFG, ASPM_MODE_REG};
 
 #define ASPM_L1_1_EN			BIT(0)
 #define ASPM_L1_2_EN			BIT(1)
@@ -1234,6 +1235,7 @@ struct rtsx_pcr {
 	u8				card_drive_sel;
 #define ASPM_L1_EN			0x02
 	u8				aspm_en;
+	enum ASPM_MODE			aspm_mode;
 	bool				aspm_enabled;
 
 #define PCR_MS_PMOS			(1 << 0)
-- 
cgit v1.2.3


From a3e5fd9314dfc4314a9567cde96e1aef83a7458a Mon Sep 17 00:00:00 2001
From: Dima Chumak <dchumak@nvidia.com>
Date: Wed, 26 May 2021 13:45:10 +0300
Subject: net/mlx5e: Fix page reclaim for dead peer hairpin

When adding a hairpin flow, a firmware-side send queue is created for
the peer net device, which claims some host memory pages for its
internal ring buffer. If the peer net device is removed/unbound before
the hairpin flow is deleted, then the send queue is not destroyed which
leads to a stack trace on pci device remove:

[ 748.005230] mlx5_core 0000:08:00.2: wait_func:1094:(pid 12985): MANAGE_PAGES(0x108) timeout. Will cause a leak of a command resource
[ 748.005231] mlx5_core 0000:08:00.2: reclaim_pages:514:(pid 12985): failed reclaiming pages: err -110
[ 748.001835] mlx5_core 0000:08:00.2: mlx5_reclaim_root_pages:653:(pid 12985): failed reclaiming pages (-110) for func id 0x0
[ 748.002171] ------------[ cut here ]------------
[ 748.001177] FW pages counter is 4 after reclaiming all pages
[ 748.001186] WARNING: CPU: 1 PID: 12985 at drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c:685 mlx5_reclaim_startup_pages+0x34b/0x460 [mlx5_core]                      [  +0.002771] Modules linked in: cls_flower mlx5_ib mlx5_core ptp pps_core act_mirred sch_ingress openvswitch nsh xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi rdma_cm ib_umad ib_ipoib iw_cm ib_cm ib_uverbs ib_core overlay fuse [last unloaded: pps_core]
[ 748.007225] CPU: 1 PID: 12985 Comm: tee Not tainted 5.12.0+ #1
[ 748.001376] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 748.002315] RIP: 0010:mlx5_reclaim_startup_pages+0x34b/0x460 [mlx5_core]
[ 748.001679] Code: 28 00 00 00 0f 85 22 01 00 00 48 81 c4 b0 00 00 00 31 c0 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 c7 c7 40 cc 19 a1 e8 9f 71 0e e2 <0f> 0b e9 30 ff ff ff 48 c7 c7 a0 cc 19 a1 e8 8c 71 0e e2 0f 0b e9
[ 748.003781] RSP: 0018:ffff88815220faf8 EFLAGS: 00010286
[ 748.001149] RAX: 0000000000000000 RBX: ffff8881b4900280 RCX: 0000000000000000
[ 748.001445] RDX: 0000000000000027 RSI: 0000000000000004 RDI: ffffed102a441f51
[ 748.001614] RBP: 00000000000032b9 R08: 0000000000000001 R09: ffffed1054a15ee8
[ 748.001446] R10: ffff8882a50af73b R11: ffffed1054a15ee7 R12: fffffbfff07c1e30
[ 748.001447] R13: dffffc0000000000 R14: ffff8881b492cba8 R15: 0000000000000000
[ 748.001429] FS:  00007f58bd08b580(0000) GS:ffff8882a5080000(0000) knlGS:0000000000000000
[ 748.001695] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 748.001309] CR2: 000055a026351740 CR3: 00000001d3b48006 CR4: 0000000000370ea0
[ 748.001506] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 748.001483] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 748.001654] Call Trace:
[ 748.000576]  ? mlx5_satisfy_startup_pages+0x290/0x290 [mlx5_core]
[ 748.001416]  ? mlx5_cmd_teardown_hca+0xa2/0xd0 [mlx5_core]
[ 748.001354]  ? mlx5_cmd_init_hca+0x280/0x280 [mlx5_core]
[ 748.001203]  mlx5_function_teardown+0x30/0x60 [mlx5_core]
[ 748.001275]  mlx5_uninit_one+0xa7/0xc0 [mlx5_core]
[ 748.001200]  remove_one+0x5f/0xc0 [mlx5_core]
[ 748.001075]  pci_device_remove+0x9f/0x1d0
[ 748.000833]  device_release_driver_internal+0x1e0/0x490
[ 748.001207]  unbind_store+0x19f/0x200
[ 748.000942]  ? sysfs_file_ops+0x170/0x170
[ 748.001000]  kernfs_fop_write_iter+0x2bc/0x450
[ 748.000970]  new_sync_write+0x373/0x610
[ 748.001124]  ? new_sync_read+0x600/0x600
[ 748.001057]  ? lock_acquire+0x4d6/0x700
[ 748.000908]  ? lockdep_hardirqs_on_prepare+0x400/0x400
[ 748.001126]  ? fd_install+0x1c9/0x4d0
[ 748.000951]  vfs_write+0x4d0/0x800
[ 748.000804]  ksys_write+0xf9/0x1d0
[ 748.000868]  ? __x64_sys_read+0xb0/0xb0
[ 748.000811]  ? filp_open+0x50/0x50
[ 748.000919]  ? syscall_enter_from_user_mode+0x1d/0x50
[ 748.001223]  do_syscall_64+0x3f/0x80
[ 748.000892]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 748.001026] RIP: 0033:0x7f58bcfb22f7
[ 748.000944] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
[ 748.003925] RSP: 002b:00007fffd7f2aaa8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 748.001732] RAX: ffffffffffffffda RBX: 000000000000000d RCX: 00007f58bcfb22f7
[ 748.001426] RDX: 000000000000000d RSI: 00007fffd7f2abc0 RDI: 0000000000000003
[ 748.001746] RBP: 00007fffd7f2abc0 R08: 0000000000000000 R09: 0000000000000001
[ 748.001631] R10: 00000000000001b6 R11: 0000000000000246 R12: 000000000000000d
[ 748.001537] R13: 00005597ac2c24a0 R14: 000000000000000d R15: 00007f58bd084700
[ 748.001564] irq event stamp: 0
[ 748.000787] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
[ 748.001399] hardirqs last disabled at (0): [<ffffffff813132cf>] copy_process+0x146f/0x5eb0
[ 748.001854] softirqs last  enabled at (0): [<ffffffff8131330e>] copy_process+0x14ae/0x5eb0
[ 748.013431] softirqs last disabled at (0): [<0000000000000000>] 0x0
[ 748.001492] ---[ end trace a6fabd773d1c51ae ]---

Fix by destroying the send queue of a hairpin peer net device that is
being removed/unbound, which returns the allocated ring buffer pages to
the host.

Fixes: 4d8fcf216c90 ("net/mlx5e: Avoid unbounded peer devices when unpairing TC hairpin rules")
Signed-off-by: Dima Chumak <dchumak@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 30 +++++++++++++++++-----
 include/linux/mlx5/transobj.h                      |  1 +
 3 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index dd64878e5b38..d4b0f270b6bb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -4765,7 +4765,7 @@ static void mlx5e_tc_hairpin_update_dead_peer(struct mlx5e_priv *priv,
 	list_for_each_entry_safe(hpe, tmp, &init_wait_list, dead_peer_wait_list) {
 		wait_for_completion(&hpe->res_ready);
 		if (!IS_ERR_OR_NULL(hpe->hp) && hpe->peer_vhca_id == peer_vhca_id)
-			hpe->hp->pair->peer_gone = true;
+			mlx5_core_hairpin_clear_dead_peer(hpe->hp->pair);
 
 		mlx5e_hairpin_put(priv, hpe);
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index 01cc00ad8acf..b6931bbe52d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -424,6 +424,15 @@ err_modify_sq:
 	return err;
 }
 
+static void mlx5_hairpin_unpair_peer_sq(struct mlx5_hairpin *hp)
+{
+	int i;
+
+	for (i = 0; i < hp->num_channels; i++)
+		mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], MLX5_SQC_STATE_RDY,
+				       MLX5_SQC_STATE_RST, 0, 0);
+}
+
 static void mlx5_hairpin_unpair_queues(struct mlx5_hairpin *hp)
 {
 	int i;
@@ -432,13 +441,9 @@ static void mlx5_hairpin_unpair_queues(struct mlx5_hairpin *hp)
 	for (i = 0; i < hp->num_channels; i++)
 		mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[i], MLX5_RQC_STATE_RDY,
 				       MLX5_RQC_STATE_RST, 0, 0);
-
 	/* unset peer SQs */
-	if (hp->peer_gone)
-		return;
-	for (i = 0; i < hp->num_channels; i++)
-		mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], MLX5_SQC_STATE_RDY,
-				       MLX5_SQC_STATE_RST, 0, 0);
+	if (!hp->peer_gone)
+		mlx5_hairpin_unpair_peer_sq(hp);
 }
 
 struct mlx5_hairpin *
@@ -485,3 +490,16 @@ void mlx5_core_hairpin_destroy(struct mlx5_hairpin *hp)
 	mlx5_hairpin_destroy_queues(hp);
 	kfree(hp);
 }
+
+void mlx5_core_hairpin_clear_dead_peer(struct mlx5_hairpin *hp)
+{
+	int i;
+
+	mlx5_hairpin_unpair_peer_sq(hp);
+
+	/* destroy peer SQ */
+	for (i = 0; i < hp->num_channels; i++)
+		mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[i]);
+
+	hp->peer_gone = true;
+}
diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h
index 028f442530cf..60ffeb6b67ae 100644
--- a/include/linux/mlx5/transobj.h
+++ b/include/linux/mlx5/transobj.h
@@ -85,4 +85,5 @@ mlx5_core_hairpin_create(struct mlx5_core_dev *func_mdev,
 			 struct mlx5_hairpin_params *params);
 
 void mlx5_core_hairpin_destroy(struct mlx5_hairpin *pair);
+void mlx5_core_hairpin_clear_dead_peer(struct mlx5_hairpin *hp);
 #endif /* __TRANSOBJ_H__ */
-- 
cgit v1.2.3


From 67133eaa93e810f5c510cd0ec6e2e7ca76fc1340 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 9 Mar 2021 03:29:16 +0200
Subject: net/mlx5: mlx5_ifc support for header insert/remove

Add support for HCA caps 2 that contains capabilities for the new
insert/remove header actions.

Added the required definitions for supporting the new reformat type:
added packet reformat parameters, reformat anchors and definitions
to allow copy/set into the inserted EMD (Embedded MetaData) tag.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c |  6 +++++
 include/linux/mlx5/device.h                  | 10 +++++++
 include/linux/mlx5/mlx5_ifc.h                | 40 +++++++++++++++++++++++-----
 3 files changed, 50 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 02558ac2ace6..016d26f809a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -148,6 +148,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 	if (err)
 		return err;
 
+	if (MLX5_CAP_GEN(dev, hca_cap_2)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL_2);
+		if (err)
+			return err;
+	}
+
 	if (MLX5_CAP_GEN(dev, eth_net_offloads)) {
 		err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS);
 		if (err)
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 578c4ccae91c..0025913505ab 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1179,6 +1179,7 @@ enum mlx5_cap_type {
 	MLX5_CAP_VDPA_EMULATION = 0x13,
 	MLX5_CAP_DEV_EVENT = 0x14,
 	MLX5_CAP_IPSEC,
+	MLX5_CAP_GENERAL_2 = 0x20,
 	/* NUM OF CAP Types */
 	MLX5_CAP_NUM
 };
@@ -1220,6 +1221,15 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_GEN_MAX(mdev, cap) \
 	MLX5_GET(cmd_hca_cap, mdev->caps.hca_max[MLX5_CAP_GENERAL], cap)
 
+#define MLX5_CAP_GEN_2(mdev, cap) \
+	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca_cur[MLX5_CAP_GENERAL_2], cap)
+
+#define MLX5_CAP_GEN_2_64(mdev, cap) \
+	MLX5_GET64(cmd_hca_cap_2, mdev->caps.hca_cur[MLX5_CAP_GENERAL_2], cap)
+
+#define MLX5_CAP_GEN_2_MAX(mdev, cap) \
+	MLX5_GET(cmd_hca_cap_2, mdev->caps.hca_max[MLX5_CAP_GENERAL_2], cap)
+
 #define MLX5_CAP_ETH(mdev, cap) \
 	MLX5_GET(per_protocol_networking_offload_caps,\
 		 mdev->caps.hca_cur[MLX5_CAP_ETHERNET_OFFLOADS], cap)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index eb86e80e4643..057db0eaf195 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -435,7 +435,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 
 	u8         reserved_at_40[0x20];
 
-	u8         reserved_at_60[0x18];
+	u8         reserved_at_60[0x2];
+	u8         reformat_insert[0x1];
+	u8         reformat_remove[0x1];
+	u8         reserver_at_64[0x14];
 	u8         log_max_ft_num[0x8];
 
 	u8         reserved_at_80[0x10];
@@ -1312,7 +1315,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_0[0x1f];
 	u8         vhca_resource_manager[0x1];
 
-	u8         reserved_at_20[0x3];
+	u8         hca_cap_2[0x1];
+	u8         reserved_at_21[0x2];
 	u8         event_on_vhca_state_teardown_request[0x1];
 	u8         event_on_vhca_state_in_use[0x1];
 	u8         event_on_vhca_state_active[0x1];
@@ -1732,6 +1736,17 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   reserved_at_7c0[0x40];
 };
 
+struct mlx5_ifc_cmd_hca_cap_2_bits {
+	u8	   reserved_at_0[0xa0];
+
+	u8	   max_reformat_insert_size[0x8];
+	u8	   max_reformat_insert_offset[0x8];
+	u8	   max_reformat_remove_size[0x8];
+	u8	   max_reformat_remove_offset[0x8];
+
+	u8	   reserved_at_c0[0x740];
+};
+
 enum mlx5_flow_destination_type {
 	MLX5_FLOW_DESTINATION_TYPE_VPORT        = 0x0,
 	MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE   = 0x1,
@@ -3105,6 +3120,7 @@ struct mlx5_ifc_roce_addr_layout_bits {
 
 union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
+	struct mlx5_ifc_cmd_hca_cap_2_bits cmd_hca_cap_2;
 	struct mlx5_ifc_odp_cap_bits odp_cap;
 	struct mlx5_ifc_atomic_caps_bits atomic_caps;
 	struct mlx5_ifc_roce_cap_bits roce_cap;
@@ -5785,12 +5801,14 @@ struct mlx5_ifc_query_eq_in_bits {
 };
 
 struct mlx5_ifc_packet_reformat_context_in_bits {
-	u8         reserved_at_0[0x5];
-	u8         reformat_type[0x3];
-	u8         reserved_at_8[0xe];
+	u8         reformat_type[0x8];
+	u8         reserved_at_8[0x4];
+	u8         reformat_param_0[0x4];
+	u8         reserved_at_10[0x6];
 	u8         reformat_data_size[0xa];
 
-	u8         reserved_at_20[0x10];
+	u8         reformat_param_1[0x8];
+	u8         reserved_at_28[0x8];
 	u8         reformat_data[2][0x8];
 
 	u8         more_reformat_data[][0x8];
@@ -5830,12 +5848,20 @@ struct mlx5_ifc_alloc_packet_reformat_context_out_bits {
 	u8         reserved_at_60[0x20];
 };
 
+enum {
+	MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START = 0x1,
+	MLX5_REFORMAT_CONTEXT_ANCHOR_IP_START = 0x7,
+	MLX5_REFORMAT_CONTEXT_ANCHOR_TCP_UDP_START = 0x9,
+};
+
 enum mlx5_reformat_ctx_type {
 	MLX5_REFORMAT_TYPE_L2_TO_VXLAN = 0x0,
 	MLX5_REFORMAT_TYPE_L2_TO_NVGRE = 0x1,
 	MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2,
 	MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3,
 	MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4,
+	MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf,
+	MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10,
 };
 
 struct mlx5_ifc_alloc_packet_reformat_context_in_bits {
@@ -5956,6 +5982,8 @@ enum {
 	MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM   = 0x59,
 	MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM   = 0x5B,
 	MLX5_ACTION_IN_FIELD_IPSEC_SYNDROME    = 0x5D,
+	MLX5_ACTION_IN_FIELD_OUT_EMD_47_32     = 0x6F,
+	MLX5_ACTION_IN_FIELD_OUT_EMD_31_0      = 0x70,
 };
 
 struct mlx5_ifc_alloc_modify_header_context_out_bits {
-- 
cgit v1.2.3


From 3f3f05ab88722224fef5b0b78a0969f6b54f2cba Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 9 Mar 2021 03:30:44 +0200
Subject: net/mlx5: Added new parameters to reformat context

Adding new reformat context type (INSERT_HEADER) requires adding two new
parameters to reformat context - reformat_param_0 and reformat_param_1.
As defined by HW spec, these parameters have different meaning for
different reformat context type.

The first parameter (reformat_param_0) is not new to HW spec, but it
wasn't used by any of the supported reformats. The second parameter
(reformat_param_1) is new to the HW spec - it was added to allow
supporting INSERT_HEADER.

For NSERT_HEADER, reformat_param_0 indicates the header used to
reference the location of the inserted header, and reformat_param_1
indicates the offset of the inserted header from the reference point
defined by reformat_param_0.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/infiniband/hw/mlx5/fs.c                    |  9 +++--
 .../net/ethernet/mellanox/mlx5/core/en/tc_tun.c    | 38 +++++++++++++++-------
 .../ethernet/mellanox/mlx5/core/en/tc_tun_encap.c  | 17 +++++++---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   | 29 +++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h   |  4 +--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |  9 ++---
 .../mellanox/mlx5/core/steering/dr_action.c        |  2 ++
 .../ethernet/mellanox/mlx5/core/steering/fs_dr.c   | 17 +++++-----
 .../ethernet/mellanox/mlx5/core/steering/mlx5dr.h  |  2 ++
 include/linux/mlx5/fs.h                            | 12 +++++--
 10 files changed, 86 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index 2fc6a60c4e77..941adf5cf3d0 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -2280,6 +2280,7 @@ static int mlx5_ib_flow_action_create_packet_reformat_ctx(
 	u8 ft_type, u8 dv_prt,
 	void *in, size_t len)
 {
+	struct mlx5_pkt_reformat_params reformat_params;
 	enum mlx5_flow_namespace_type namespace;
 	u8 prm_prt;
 	int ret;
@@ -2292,9 +2293,13 @@ static int mlx5_ib_flow_action_create_packet_reformat_ctx(
 	if (ret)
 		return ret;
 
+	memset(&reformat_params, 0, sizeof(reformat_params));
+	reformat_params.type = prm_prt;
+	reformat_params.size = len;
+	reformat_params.data = in;
 	maction->flow_action_raw.pkt_reformat =
-		mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
-					   in, namespace);
+		mlx5_packet_reformat_alloc(dev->mdev, &reformat_params,
+					   namespace);
 	if (IS_ERR(maction->flow_action_raw.pkt_reformat)) {
 		ret = PTR_ERR(maction->flow_action_raw.pkt_reformat);
 		return ret;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
index 172e0474f2e6..8f79f04eccd6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
@@ -212,6 +212,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv,
 {
 	int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
 	const struct ip_tunnel_key *tun_key = &e->tun_info->key;
+	struct mlx5_pkt_reformat_params reformat_params;
 	struct mlx5e_neigh m_neigh = {};
 	TC_TUN_ROUTE_ATTR_INIT(attr);
 	int ipv4_encap_size;
@@ -295,9 +296,12 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv,
 		 */
 		goto release_neigh;
 	}
-	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
-						     e->reformat_type,
-						     ipv4_encap_size, encap_header,
+
+	memset(&reformat_params, 0, sizeof(reformat_params));
+	reformat_params.type = e->reformat_type;
+	reformat_params.size = ipv4_encap_size;
+	reformat_params.data = encap_header;
+	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, &reformat_params,
 						     MLX5_FLOW_NAMESPACE_FDB);
 	if (IS_ERR(e->pkt_reformat)) {
 		err = PTR_ERR(e->pkt_reformat);
@@ -324,6 +328,7 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv,
 {
 	int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
 	const struct ip_tunnel_key *tun_key = &e->tun_info->key;
+	struct mlx5_pkt_reformat_params reformat_params;
 	TC_TUN_ROUTE_ATTR_INIT(attr);
 	int ipv4_encap_size;
 	char *encap_header;
@@ -396,9 +401,12 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv,
 		 */
 		goto release_neigh;
 	}
-	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
-						     e->reformat_type,
-						     ipv4_encap_size, encap_header,
+
+	memset(&reformat_params, 0, sizeof(reformat_params));
+	reformat_params.type = e->reformat_type;
+	reformat_params.size = ipv4_encap_size;
+	reformat_params.data = encap_header;
+	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, &reformat_params,
 						     MLX5_FLOW_NAMESPACE_FDB);
 	if (IS_ERR(e->pkt_reformat)) {
 		err = PTR_ERR(e->pkt_reformat);
@@ -471,6 +479,7 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv,
 {
 	int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
 	const struct ip_tunnel_key *tun_key = &e->tun_info->key;
+	struct mlx5_pkt_reformat_params reformat_params;
 	struct mlx5e_neigh m_neigh = {};
 	TC_TUN_ROUTE_ATTR_INIT(attr);
 	struct ipv6hdr *ip6h;
@@ -553,9 +562,11 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv,
 		goto release_neigh;
 	}
 
-	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
-						     e->reformat_type,
-						     ipv6_encap_size, encap_header,
+	memset(&reformat_params, 0, sizeof(reformat_params));
+	reformat_params.type = e->reformat_type;
+	reformat_params.size = ipv6_encap_size;
+	reformat_params.data = encap_header;
+	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, &reformat_params,
 						     MLX5_FLOW_NAMESPACE_FDB);
 	if (IS_ERR(e->pkt_reformat)) {
 		err = PTR_ERR(e->pkt_reformat);
@@ -582,6 +593,7 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv,
 {
 	int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
 	const struct ip_tunnel_key *tun_key = &e->tun_info->key;
+	struct mlx5_pkt_reformat_params reformat_params;
 	TC_TUN_ROUTE_ATTR_INIT(attr);
 	struct ipv6hdr *ip6h;
 	int ipv6_encap_size;
@@ -654,9 +666,11 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv,
 		goto release_neigh;
 	}
 
-	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
-						     e->reformat_type,
-						     ipv6_encap_size, encap_header,
+	memset(&reformat_params, 0, sizeof(reformat_params));
+	reformat_params.type = e->reformat_type;
+	reformat_params.size = ipv6_encap_size;
+	reformat_params.data = encap_header;
+	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, &reformat_params,
 						     MLX5_FLOW_NAMESPACE_FDB);
 	if (IS_ERR(e->pkt_reformat)) {
 		err = PTR_ERR(e->pkt_reformat);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
index f1fb11680d20..0dfd51d2d178 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
@@ -120,6 +120,7 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 			      struct list_head *flow_list)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5_pkt_reformat_params reformat_params;
 	struct mlx5_esw_flow_attr *esw_attr;
 	struct mlx5_flow_handle *rule;
 	struct mlx5_flow_attr *attr;
@@ -130,9 +131,12 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 	if (e->flags & MLX5_ENCAP_ENTRY_NO_ROUTE)
 		return;
 
+	memset(&reformat_params, 0, sizeof(reformat_params));
+	reformat_params.type = e->reformat_type;
+	reformat_params.size = e->encap_size;
+	reformat_params.data = e->encap_header;
 	e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
-						     e->reformat_type,
-						     e->encap_size, e->encap_header,
+						     &reformat_params,
 						     MLX5_FLOW_NAMESPACE_FDB);
 	if (IS_ERR(e->pkt_reformat)) {
 		mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %lu\n",
@@ -812,6 +816,7 @@ int mlx5e_attach_decap(struct mlx5e_priv *priv,
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr;
+	struct mlx5_pkt_reformat_params reformat_params;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 	struct mlx5e_decap_entry *d;
 	struct mlx5e_decap_key key;
@@ -853,10 +858,12 @@ int mlx5e_attach_decap(struct mlx5e_priv *priv,
 	hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key);
 	mutex_unlock(&esw->offloads.decap_tbl_lock);
 
+	memset(&reformat_params, 0, sizeof(reformat_params));
+	reformat_params.type = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
+	reformat_params.size = sizeof(parse_attr->eth);
+	reformat_params.data = &parse_attr->eth;
 	d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
-						     MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2,
-						     sizeof(parse_attr->eth),
-						     &parse_attr->eth,
+						     &reformat_params,
 						     MLX5_FLOW_NAMESPACE_FDB);
 	if (IS_ERR(d->pkt_reformat)) {
 		err = PTR_ERR(d->pkt_reformat);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index b7aae8b75760..896a6c3dbdb7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -111,9 +111,7 @@ static int mlx5_cmd_stub_delete_fte(struct mlx5_flow_root_namespace *ns,
 }
 
 static int mlx5_cmd_stub_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns,
-					       int reformat_type,
-					       size_t size,
-					       void *reformat_data,
+					       struct mlx5_pkt_reformat_params *params,
 					       enum mlx5_flow_namespace_type namespace,
 					       struct mlx5_pkt_reformat *pkt_reformat)
 {
@@ -701,9 +699,7 @@ int mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, u32 base_id, int bulk_len,
 }
 
 static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns,
-					  int reformat_type,
-					  size_t size,
-					  void *reformat_data,
+					  struct mlx5_pkt_reformat_params *params,
 					  enum mlx5_flow_namespace_type namespace,
 					  struct mlx5_pkt_reformat *pkt_reformat)
 {
@@ -721,14 +717,14 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns,
 	else
 		max_encap_size = MLX5_CAP_FLOWTABLE(dev, max_encap_header_size);
 
-	if (size > max_encap_size) {
+	if (params->size > max_encap_size) {
 		mlx5_core_warn(dev, "encap size %zd too big, max supported is %d\n",
-			       size, max_encap_size);
+			       params->size, max_encap_size);
 		return -EINVAL;
 	}
 
-	in = kzalloc(MLX5_ST_SZ_BYTES(alloc_packet_reformat_context_in) + size,
-		     GFP_KERNEL);
+	in = kzalloc(MLX5_ST_SZ_BYTES(alloc_packet_reformat_context_in) +
+		     params->size, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
 
@@ -737,15 +733,20 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns,
 	reformat = MLX5_ADDR_OF(packet_reformat_context_in,
 				packet_reformat_context_in,
 				reformat_data);
-	inlen = reformat - (void *)in  + size;
+	inlen = reformat - (void *)in + params->size;
 
 	MLX5_SET(alloc_packet_reformat_context_in, in, opcode,
 		 MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT);
 	MLX5_SET(packet_reformat_context_in, packet_reformat_context_in,
-		 reformat_data_size, size);
+		 reformat_data_size, params->size);
 	MLX5_SET(packet_reformat_context_in, packet_reformat_context_in,
-		 reformat_type, reformat_type);
-	memcpy(reformat, reformat_data, size);
+		 reformat_type, params->type);
+	MLX5_SET(packet_reformat_context_in, packet_reformat_context_in,
+		 reformat_param_0, params->param_0);
+	MLX5_SET(packet_reformat_context_in, packet_reformat_context_in,
+		 reformat_param_1, params->param_1);
+	if (params->data && params->size)
+		memcpy(reformat, params->data, params->size);
 
 	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index c2e102ed82ad..5ecd33cdc087 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -77,9 +77,7 @@ struct mlx5_flow_cmds {
 			      bool disconnect);
 
 	int (*packet_reformat_alloc)(struct mlx5_flow_root_namespace *ns,
-				     int reformat_type,
-				     size_t size,
-				     void *reformat_data,
+				     struct mlx5_pkt_reformat_params *params,
 				     enum mlx5_flow_namespace_type namespace,
 				     struct mlx5_pkt_reformat *pkt_reformat);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 1b7a1cde097c..c0936b4e53a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -3165,9 +3165,7 @@ void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev,
 EXPORT_SYMBOL(mlx5_modify_header_dealloc);
 
 struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
-						     int reformat_type,
-						     size_t size,
-						     void *reformat_data,
+						     struct mlx5_pkt_reformat_params *params,
 						     enum mlx5_flow_namespace_type ns_type)
 {
 	struct mlx5_pkt_reformat *pkt_reformat;
@@ -3183,9 +3181,8 @@ struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
 		return ERR_PTR(-ENOMEM);
 
 	pkt_reformat->ns_type = ns_type;
-	pkt_reformat->reformat_type = reformat_type;
-	err = root->cmds->packet_reformat_alloc(root, reformat_type, size,
-						reformat_data, ns_type,
+	pkt_reformat->reformat_type = params->type;
+	err = root->cmds->packet_reformat_alloc(root, params, ns_type,
 						pkt_reformat);
 	if (err) {
 		kfree(pkt_reformat);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
index 1b7a0e94d432..13fceba11d3f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
@@ -937,6 +937,8 @@ struct mlx5dr_action *mlx5dr_action_create_push_vlan(struct mlx5dr_domain *dmn,
 struct mlx5dr_action *
 mlx5dr_action_create_packet_reformat(struct mlx5dr_domain *dmn,
 				     enum mlx5dr_action_reformat_type reformat_type,
+				     u8 reformat_param_0,
+				     u8 reformat_param_1,
 				     size_t data_sz,
 				     void *data)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
index ee0e9d79aaec..d866cd609d0b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
@@ -289,7 +289,8 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns,
 			DR_ACTION_REFORMAT_TYP_TNL_L2_TO_L2;
 
 		tmp_action = mlx5dr_action_create_packet_reformat(domain,
-								  decap_type, 0,
+								  decap_type,
+								  0, 0, 0,
 								  NULL);
 		if (!tmp_action) {
 			err = -ENOMEM;
@@ -522,9 +523,7 @@ out_err:
 }
 
 static int mlx5_cmd_dr_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns,
-					     int reformat_type,
-					     size_t size,
-					     void *reformat_data,
+					     struct mlx5_pkt_reformat_params *params,
 					     enum mlx5_flow_namespace_type namespace,
 					     struct mlx5_pkt_reformat *pkt_reformat)
 {
@@ -532,7 +531,7 @@ static int mlx5_cmd_dr_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns
 	struct mlx5dr_action *action;
 	int dr_reformat;
 
-	switch (reformat_type) {
+	switch (params->type) {
 	case MLX5_REFORMAT_TYPE_L2_TO_VXLAN:
 	case MLX5_REFORMAT_TYPE_L2_TO_NVGRE:
 	case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
@@ -546,14 +545,16 @@ static int mlx5_cmd_dr_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns
 		break;
 	default:
 		mlx5_core_err(ns->dev, "Packet-reformat not supported(%d)\n",
-			      reformat_type);
+			      params->type);
 		return -EOPNOTSUPP;
 	}
 
 	action = mlx5dr_action_create_packet_reformat(dr_domain,
 						      dr_reformat,
-						      size,
-						      reformat_data);
+						      params->param_0,
+						      params->param_1,
+						      params->size,
+						      params->data);
 	if (!action) {
 		mlx5_core_err(ns->dev, "Failed allocating packet-reformat action\n");
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
index 612b0ac31db2..8d821bbe3309 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
@@ -105,6 +105,8 @@ mlx5dr_action_create_flow_counter(u32 counter_id);
 struct mlx5dr_action *
 mlx5dr_action_create_packet_reformat(struct mlx5dr_domain *dmn,
 				     enum mlx5dr_action_reformat_type reformat_type,
+				     u8 reformat_param_0,
+				     u8 reformat_param_1,
 				     size_t data_sz,
 				     void *data);
 
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 1f51f4c3b1af..f69f68fba946 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -254,10 +254,16 @@ struct mlx5_modify_hdr *mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
 void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev,
 				struct mlx5_modify_hdr *modify_hdr);
 
+struct mlx5_pkt_reformat_params {
+	int type;
+	u8 param_0;
+	u8 param_1;
+	size_t size;
+	void *data;
+};
+
 struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
-						     int reformat_type,
-						     size_t size,
-						     void *reformat_data,
+						     struct mlx5_pkt_reformat_params *params,
 						     enum mlx5_flow_namespace_type ns_type);
 void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 				  struct mlx5_pkt_reformat *reformat);
-- 
cgit v1.2.3


From ec3be8873df3bf467ead27f7cedc896cbb2bd819 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Thu, 4 Mar 2021 13:09:53 +0200
Subject: net/mlx5: Create TC-miss priority and table

In order to adhere to kernel software datapath model bridge offloads must
come after TC and NF FDBs. Following patches in this series add new FDB
priority for bridge after FDB_FT_OFFLOAD. However, since netfilter offload
is implemented with unmanaged tables, its miss path is not automatically
connected to next priority and requires the code to manually connect with
slow table. To keep bridge offloads encapsulated and not mix it with
eswitch offloads, create a new FDB_TC_MISS priority between FDB_FT_OFFLOAD
and FDB_SLOW_PATH:

          +
          |
+---------v----------+
|                    |
|   FDB_TC_OFFLOAD   |
|                    |
+---------+----------+
          |
          |
          |
+---------v----------+
|                    |
|   FDB_FT_OFFLOAD   |
|                    |
+---------+----------+
          |
          |
          |
+---------v----------+
|                    |
|    FDB_TC_MISS     |
|                    |
+---------+----------+
          |
          |
          |
+---------v----------+
|                    |
|   FDB_SLOW_PATH    |
|                    |
+---------+----------+
          |
          v

Initialize the new priority with single default empty managed table and use
the table as TC/NF miss patch instead of slow table. This approach allows
bridge offloads to be created as new FDB namespace priority between
FDB_TC_MISS and FDB_SLOW_PATH without exposing its internal tables to any
other modules since miss path of managed TC-miss table is automatically
wired to next priority.

Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h     |  1 +
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c    | 19 ++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c     |  6 ++++++
 include/linux/mlx5/fs.h                               |  1 +
 4 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 64ccb2bc0b58..55404eabff39 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -196,6 +196,7 @@ struct mlx5_eswitch_fdb {
 
 		struct offloads_fdb {
 			struct mlx5_flow_namespace *ns;
+			struct mlx5_flow_table *tc_miss_table;
 			struct mlx5_flow_table *slow_fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
 			struct mlx5_flow_group *send_to_vport_meta_grp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index d18a28a6e9a6..7579f3402776 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1634,7 +1634,21 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw)
 	}
 	esw->fdb_table.offloads.slow_fdb = fdb;
 
-	err = esw_chains_create(esw, fdb);
+	/* Create empty TC-miss managed table. This allows plugging in following
+	 * priorities without directly exposing their level 0 table to
+	 * eswitch_offloads and passing it as miss_fdb to following call to
+	 * esw_chains_create().
+	 */
+	memset(&ft_attr, 0, sizeof(ft_attr));
+	ft_attr.prio = FDB_TC_MISS;
+	esw->fdb_table.offloads.tc_miss_table = mlx5_create_flow_table(root_ns, &ft_attr);
+	if (IS_ERR(esw->fdb_table.offloads.tc_miss_table)) {
+		err = PTR_ERR(esw->fdb_table.offloads.tc_miss_table);
+		esw_warn(dev, "Failed to create TC miss FDB Table err %d\n", err);
+		goto tc_miss_table_err;
+	}
+
+	err = esw_chains_create(esw, esw->fdb_table.offloads.tc_miss_table);
 	if (err) {
 		esw_warn(dev, "Failed to open fdb chains err(%d)\n", err);
 		goto fdb_chains_err;
@@ -1779,6 +1793,8 @@ send_vport_meta_err:
 send_vport_err:
 	esw_chains_destroy(esw, esw_chains(esw));
 fdb_chains_err:
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.tc_miss_table);
+tc_miss_table_err:
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 slow_fdb_err:
 	/* Holds true only as long as DMFS is the default */
@@ -1806,6 +1822,7 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 
 	esw_chains_destroy(esw, esw_chains(esw));
 
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.tc_miss_table);
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 	/* Holds true only as long as DMFS is the default */
 	mlx5_flow_namespace_set_mode(esw->fdb_table.offloads.ns,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index c0936b4e53a9..fc70c4ed8469 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2780,6 +2780,12 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
 	if (err)
 		goto out_err;
 
+	maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_TC_MISS, 1);
+	if (IS_ERR(maj_prio)) {
+		err = PTR_ERR(maj_prio);
+		goto out_err;
+	}
+
 	maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_SLOW_PATH, 1);
 	if (IS_ERR(maj_prio)) {
 		err = PTR_ERR(maj_prio);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index f69f68fba946..271f2f4d6b60 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -87,6 +87,7 @@ enum {
 	FDB_BYPASS_PATH,
 	FDB_TC_OFFLOAD,
 	FDB_FT_OFFLOAD,
+	FDB_TC_MISS,
 	FDB_SLOW_PATH,
 	FDB_PER_VPORT,
 };
-- 
cgit v1.2.3


From 19e9bfa044f32655f1c14e95784be93da34e103e Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Fri, 2 Apr 2021 15:57:02 +0300
Subject: net/mlx5: Bridge, add offload infrastructure

Create new files bridge.{c|h} in en/rep directory that implement bridge
interaction with representor netdevices and handle required
events/notifications, bridge.{c|h} in esw directory that implement all
necessary eswitch offloading infrastructure and works on vport/eswitch
level. Provide new kconfig MLX5_BRIDGE which is automatically selected when
both kernel bridge and mlx5 eswitch configs are enabled.

Provide basic infrastructure for bridge offloads:

- struct mlx5_esw_bridge_offloads - per-eswitch bridge offload structure
that encapsulates generic bridge-offloads data (notifier blocks, ingress
flow table/group, etc.) that is created/deleted on enable/disable eswitch
offloads.

- struct mlx5_esw_bridge - per-bridge structure that encapsulates
per-bridge data (reference counter, FDB, egress flow table/group, etc.)
that is created when first eswitch represetor is attached to new bridge and
deleted when last representor is removed from the bridge as a result of
NETDEV_CHANGEUPPER event.

The bridge tables are created with new priority FDB_BR_OFFLOAD in FDB
namespace. The new priority is between tc-miss and slow path priorities.
Priority consist of two levels: the ingress table that is global per
eswitch and matches incoming packets by src_mac/vid and redirects them to
next level (egress table) that is chosen according to ingress port bridge
membership and matches on dst_mac/vid in order to redirect packet to vport
according to the following diagram:

                +
                |
      +---------v----------+
      |                    |
      |   FDB_TC_OFFLOAD   |
      |                    |
      +---------+----------+
                |
                |
      +---------v----------+
      |                    |
      |   FDB_FT_OFFLOAD   |
      |                    |
      +---------+----------+
                |
                |
      +---------v----------+
      |                    |
      |    FDB_TC_MISS     |
      |                    |
      +---------+----------+
                |
+--------------------------------------+
|               |                      |
|        +------+                      |
|        |                             |
| +------v--------+   FDB_BR_OFFLOAD   |
| | INGRESS_TABLE |                    |
| +------+---+----+                    |
|        |   |      match              |
|        |   +---------+               |
|        |             |               |    +-------+
|        |     +-------v-------+ match |    |       |
|        |     | EGRESS_TABLE  +------------> vport |
|        |     +-------+-------+       |    |       |
|        |             |               |    +-------+
|        |    miss     |               |
|        +------+------+               |
|               |                      |
+--------------------------------------+
                |
                |
      +---------v----------+
      |                    |
      |   FDB_SLOW_PATH    |
      |                    |
      +---------+----------+
                |
                v

Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    |  10 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   1 +
 .../ethernet/mellanox/mlx5/core/en/rep/bridge.c    | 108 +++++++
 .../ethernet/mellanox/mlx5/core/en/rep/bridge.h    |  21 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |   3 +
 .../net/ethernet/mellanox/mlx5/core/esw/bridge.c   | 354 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/esw/bridge.h   |  30 ++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |   6 +
 include/linux/mlx5/fs.h                            |   1 +
 10 files changed, 540 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 461a43f338e6..d62f90aedade 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -79,6 +79,16 @@ config MLX5_ESWITCH
 	        Legacy SRIOV mode (L2 mac vlan steering based).
 	        Switchdev mode (eswitch offloads).
 
+config MLX5_BRIDGE
+	bool
+	depends on MLX5_ESWITCH && BRIDGE
+	default y
+	help
+	  mlx5 ConnectX offloads support for Ethernet Bridging (BRIDGE).
+	  Enable adding representors of mlx5 uplink and VF ports to Bridge and
+	  offloading rules for traffic between such ports. Supports VLANs (trunk and
+	  access modes).
+
 config MLX5_CLS_ACT
 	bool "MLX5 TC classifier action support"
 	depends on MLX5_ESWITCH && NET_CLS_ACT
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8dbdf1aef00f..b5072a3a2585 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -56,6 +56,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += esw/acl/helper.o \
 				      esw/acl/ingress_lgcy.o esw/acl/ingress_ofld.o \
 				      esw/devlink_port.o esw/vporttbl.o
 mlx5_core-$(CONFIG_MLX5_TC_SAMPLE) += esw/sample.o
+mlx5_core-$(CONFIG_MLX5_BRIDGE)    += esw/bridge.o en/rep/bridge.o
 
 mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c
new file mode 100644
index 000000000000..de7a68488a9d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2021 Mellanox Technologies. */
+
+#include <linux/netdevice.h>
+#include <net/netevent.h>
+#include <net/switchdev.h>
+#include "bridge.h"
+#include "esw/bridge.h"
+#include "en_rep.h"
+
+static int mlx5_esw_bridge_port_changeupper(struct notifier_block *nb, void *ptr)
+{
+	struct mlx5_esw_bridge_offloads *br_offloads = container_of(nb,
+								    struct mlx5_esw_bridge_offloads,
+								    netdev_nb);
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_changeupper_info *info = ptr;
+	struct netlink_ext_ack *extack;
+	struct mlx5e_rep_priv *rpriv;
+	struct mlx5_eswitch *esw;
+	struct mlx5_vport *vport;
+	struct net_device *upper;
+	struct mlx5e_priv *priv;
+	u16 vport_num;
+
+	if (!mlx5e_eswitch_rep(dev))
+		return 0;
+
+	upper = info->upper_dev;
+	if (!netif_is_bridge_master(upper))
+		return 0;
+
+	esw = br_offloads->esw;
+	priv = netdev_priv(dev);
+	if (esw != priv->mdev->priv.eswitch)
+		return 0;
+
+	rpriv = priv->ppriv;
+	vport_num = rpriv->rep->vport;
+	vport = mlx5_eswitch_get_vport(esw, vport_num);
+	if (IS_ERR(vport))
+		return PTR_ERR(vport);
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	return info->linking ?
+		mlx5_esw_bridge_vport_link(upper->ifindex, br_offloads, vport, extack) :
+		mlx5_esw_bridge_vport_unlink(upper->ifindex, br_offloads, vport, extack);
+}
+
+static int mlx5_esw_bridge_switchdev_port_event(struct notifier_block *nb,
+						unsigned long event, void *ptr)
+{
+	int err = 0;
+
+	switch (event) {
+	case NETDEV_PRECHANGEUPPER:
+		break;
+
+	case NETDEV_CHANGEUPPER:
+		err = mlx5_esw_bridge_port_changeupper(nb, ptr);
+		break;
+	}
+
+	return notifier_from_errno(err);
+}
+
+void mlx5e_rep_bridge_init(struct mlx5e_priv *priv)
+{
+	struct mlx5_esw_bridge_offloads *br_offloads;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_eswitch *esw =
+		mdev->priv.eswitch;
+	int err;
+
+	rtnl_lock();
+	br_offloads = mlx5_esw_bridge_init(esw);
+	rtnl_unlock();
+	if (IS_ERR(br_offloads)) {
+		esw_warn(mdev, "Failed to init esw bridge (err=%ld)\n", PTR_ERR(br_offloads));
+		return;
+	}
+
+	br_offloads->netdev_nb.notifier_call = mlx5_esw_bridge_switchdev_port_event;
+	err = register_netdevice_notifier(&br_offloads->netdev_nb);
+	if (err) {
+		esw_warn(mdev, "Failed to register bridge offloads netdevice notifier (err=%d)\n",
+			 err);
+		mlx5_esw_bridge_cleanup(esw);
+	}
+}
+
+void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv)
+{
+	struct mlx5_esw_bridge_offloads *br_offloads;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_eswitch *esw =
+		mdev->priv.eswitch;
+
+	br_offloads = esw->br_offloads;
+	if (!br_offloads)
+		return;
+
+	unregister_netdevice_notifier(&br_offloads->netdev_nb);
+	rtnl_lock();
+	mlx5_esw_bridge_cleanup(esw);
+	rtnl_unlock();
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.h
new file mode 100644
index 000000000000..fbeb64242831
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_REP_BRIDGE__
+#define __MLX5_EN_REP_BRIDGE__
+
+#include "en.h"
+
+#if IS_ENABLED(CONFIG_MLX5_BRIDGE)
+
+void mlx5e_rep_bridge_init(struct mlx5e_priv *priv);
+void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv);
+
+#else /* CONFIG_MLX5_BRIDGE */
+
+static inline void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) {}
+static inline void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv) {}
+
+#endif /* CONFIG_MLX5_BRIDGE */
+
+#endif /* __MLX5_EN_REP_BRIDGE__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 40db54412041..8290e0086178 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -45,6 +45,7 @@
 #include "en_tc.h"
 #include "en/rep/tc.h"
 #include "en/rep/neigh.h"
+#include "en/rep/bridge.h"
 #include "en/devlink.h"
 #include "fs_core.h"
 #include "lib/mlx5.h"
@@ -981,6 +982,7 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
 	mlx5e_dcbnl_initialize(priv);
 	mlx5e_dcbnl_init_app(priv);
 	mlx5e_rep_neigh_init(rpriv);
+	mlx5e_rep_bridge_init(priv);
 
 	netdev->wanted_features |= NETIF_F_HW_TC;
 
@@ -1002,6 +1004,7 @@ static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
 	netif_device_detach(priv->netdev);
 	rtnl_unlock();
 
+	mlx5e_rep_bridge_cleanup(priv);
 	mlx5e_rep_neigh_cleanup(rpriv);
 	mlx5e_dcbnl_delete_app(priv);
 	mlx5_notifier_unregister(mdev, &priv->events_nb);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c
new file mode 100644
index 000000000000..b503562f97d0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2021 Mellanox Technologies. */
+
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <net/switchdev.h>
+#include "bridge.h"
+#include "eswitch.h"
+#include "fs_core.h"
+
+#define MLX5_ESW_BRIDGE_INGRESS_TABLE_SIZE 64000
+#define MLX5_ESW_BRIDGE_INGRESS_TABLE_MAC_GRP_IDX_FROM 0
+#define MLX5_ESW_BRIDGE_INGRESS_TABLE_MAC_GRP_IDX_TO (MLX5_ESW_BRIDGE_INGRESS_TABLE_SIZE - 1)
+
+#define MLX5_ESW_BRIDGE_EGRESS_TABLE_SIZE 64000
+#define MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_FROM 0
+#define MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_TO (MLX5_ESW_BRIDGE_EGRESS_TABLE_SIZE - 1)
+
+enum {
+	MLX5_ESW_BRIDGE_LEVEL_INGRESS_TABLE,
+	MLX5_ESW_BRIDGE_LEVEL_EGRESS_TABLE,
+};
+
+struct mlx5_esw_bridge {
+	int ifindex;
+	int refcnt;
+	struct list_head list;
+
+	struct mlx5_flow_table *egress_ft;
+	struct mlx5_flow_group *egress_mac_fg;
+};
+
+static struct mlx5_flow_table *
+mlx5_esw_bridge_table_create(int max_fte, u32 level, struct mlx5_eswitch *esw)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_flow_namespace *ns;
+	struct mlx5_flow_table *fdb;
+
+	ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+	if (!ns) {
+		esw_warn(dev, "Failed to get FDB namespace\n");
+		return ERR_PTR(-ENOENT);
+	}
+
+	ft_attr.max_fte = max_fte;
+	ft_attr.level = level;
+	ft_attr.prio = FDB_BR_OFFLOAD;
+	fdb = mlx5_create_flow_table(ns, &ft_attr);
+	if (IS_ERR(fdb))
+		esw_warn(dev, "Failed to create bridge FDB Table (err=%ld)\n", PTR_ERR(fdb));
+
+	return fdb;
+}
+
+static struct mlx5_flow_group *
+mlx5_esw_bridge_ingress_mac_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *ingress_ft)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5_flow_group *fg;
+	u32 *in, *match;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return ERR_PTR(-ENOMEM);
+
+	MLX5_SET(create_flow_group_in, in, match_criteria_enable,
+		 MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS_2);
+	match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+
+	MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.smac_47_16);
+	MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.smac_15_0);
+
+	MLX5_SET(fte_match_param, match, misc_parameters_2.metadata_reg_c_0,
+		 mlx5_eswitch_get_vport_metadata_mask());
+
+	MLX5_SET(create_flow_group_in, in, start_flow_index,
+		 MLX5_ESW_BRIDGE_INGRESS_TABLE_MAC_GRP_IDX_FROM);
+	MLX5_SET(create_flow_group_in, in, end_flow_index,
+		 MLX5_ESW_BRIDGE_INGRESS_TABLE_MAC_GRP_IDX_TO);
+
+	fg = mlx5_create_flow_group(ingress_ft, in);
+	if (IS_ERR(fg))
+		esw_warn(esw->dev,
+			 "Failed to create bridge ingress table MAC flow group (err=%ld)\n",
+			 PTR_ERR(fg));
+
+	kvfree(in);
+	return fg;
+}
+
+static struct mlx5_flow_group *
+mlx5_esw_bridge_egress_mac_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *egress_ft)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5_flow_group *fg;
+	u32 *in, *match;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return ERR_PTR(-ENOMEM);
+
+	MLX5_SET(create_flow_group_in, in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+
+	MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.dmac_47_16);
+	MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.dmac_15_0);
+
+	MLX5_SET(create_flow_group_in, in, start_flow_index,
+		 MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_FROM);
+	MLX5_SET(create_flow_group_in, in, end_flow_index,
+		 MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_TO);
+
+	fg = mlx5_create_flow_group(egress_ft, in);
+	if (IS_ERR(fg))
+		esw_warn(esw->dev,
+			 "Failed to create bridge egress table MAC flow group (err=%ld)\n",
+			 PTR_ERR(fg));
+	kvfree(in);
+	return fg;
+}
+
+static int
+mlx5_esw_bridge_ingress_table_init(struct mlx5_esw_bridge_offloads *br_offloads)
+{
+	struct mlx5_flow_table *ingress_ft;
+	struct mlx5_flow_group *mac_fg;
+	int err;
+
+	ingress_ft = mlx5_esw_bridge_table_create(MLX5_ESW_BRIDGE_INGRESS_TABLE_SIZE,
+						  MLX5_ESW_BRIDGE_LEVEL_INGRESS_TABLE,
+						  br_offloads->esw);
+	if (IS_ERR(ingress_ft))
+		return PTR_ERR(ingress_ft);
+
+	mac_fg = mlx5_esw_bridge_ingress_mac_fg_create(br_offloads->esw, ingress_ft);
+	if (IS_ERR(mac_fg)) {
+		err = PTR_ERR(mac_fg);
+		goto err_mac_fg;
+	}
+
+	br_offloads->ingress_ft = ingress_ft;
+	br_offloads->ingress_mac_fg = mac_fg;
+	return 0;
+
+err_mac_fg:
+	mlx5_destroy_flow_table(ingress_ft);
+	return err;
+}
+
+static void
+mlx5_esw_bridge_ingress_table_cleanup(struct mlx5_esw_bridge_offloads *br_offloads)
+{
+	mlx5_destroy_flow_group(br_offloads->ingress_mac_fg);
+	br_offloads->ingress_mac_fg = NULL;
+	mlx5_destroy_flow_table(br_offloads->ingress_ft);
+	br_offloads->ingress_ft = NULL;
+}
+
+static int
+mlx5_esw_bridge_egress_table_init(struct mlx5_esw_bridge_offloads *br_offloads,
+				  struct mlx5_esw_bridge *bridge)
+{
+	struct mlx5_flow_table *egress_ft;
+	struct mlx5_flow_group *mac_fg;
+	int err;
+
+	egress_ft = mlx5_esw_bridge_table_create(MLX5_ESW_BRIDGE_EGRESS_TABLE_SIZE,
+						 MLX5_ESW_BRIDGE_LEVEL_EGRESS_TABLE,
+						 br_offloads->esw);
+	if (IS_ERR(egress_ft))
+		return PTR_ERR(egress_ft);
+
+	mac_fg = mlx5_esw_bridge_egress_mac_fg_create(br_offloads->esw, egress_ft);
+	if (IS_ERR(mac_fg)) {
+		err = PTR_ERR(mac_fg);
+		goto err_mac_fg;
+	}
+
+	bridge->egress_ft = egress_ft;
+	bridge->egress_mac_fg = mac_fg;
+	return 0;
+
+err_mac_fg:
+	mlx5_destroy_flow_table(egress_ft);
+	return err;
+}
+
+static void
+mlx5_esw_bridge_egress_table_cleanup(struct mlx5_esw_bridge *bridge)
+{
+	mlx5_destroy_flow_group(bridge->egress_mac_fg);
+	mlx5_destroy_flow_table(bridge->egress_ft);
+}
+
+static struct mlx5_esw_bridge *mlx5_esw_bridge_create(int ifindex,
+						      struct mlx5_esw_bridge_offloads *br_offloads)
+{
+	struct mlx5_esw_bridge *bridge;
+	int err;
+
+	bridge = kvzalloc(sizeof(*bridge), GFP_KERNEL);
+	if (!bridge)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_esw_bridge_egress_table_init(br_offloads, bridge);
+	if (err)
+		goto err_egress_tbl;
+
+	bridge->ifindex = ifindex;
+	bridge->refcnt = 1;
+	list_add(&bridge->list, &br_offloads->bridges);
+
+	return bridge;
+
+err_egress_tbl:
+	kvfree(bridge);
+	return ERR_PTR(err);
+}
+
+static void mlx5_esw_bridge_get(struct mlx5_esw_bridge *bridge)
+{
+	bridge->refcnt++;
+}
+
+static void mlx5_esw_bridge_put(struct mlx5_esw_bridge_offloads *br_offloads,
+				struct mlx5_esw_bridge *bridge)
+{
+	if (--bridge->refcnt)
+		return;
+
+	mlx5_esw_bridge_egress_table_cleanup(bridge);
+	list_del(&bridge->list);
+	kvfree(bridge);
+
+	if (list_empty(&br_offloads->bridges))
+		mlx5_esw_bridge_ingress_table_cleanup(br_offloads);
+}
+
+static struct mlx5_esw_bridge *
+mlx5_esw_bridge_lookup(int ifindex, struct mlx5_esw_bridge_offloads *br_offloads)
+{
+	struct mlx5_esw_bridge *bridge;
+
+	ASSERT_RTNL();
+
+	list_for_each_entry(bridge, &br_offloads->bridges, list) {
+		if (bridge->ifindex == ifindex) {
+			mlx5_esw_bridge_get(bridge);
+			return bridge;
+		}
+	}
+
+	if (!br_offloads->ingress_ft) {
+		int err = mlx5_esw_bridge_ingress_table_init(br_offloads);
+
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	bridge = mlx5_esw_bridge_create(ifindex, br_offloads);
+	if (IS_ERR(bridge) && list_empty(&br_offloads->bridges))
+		mlx5_esw_bridge_ingress_table_cleanup(br_offloads);
+	return bridge;
+}
+
+static int mlx5_esw_bridge_vport_init(struct mlx5_esw_bridge *bridge,
+				      struct mlx5_vport *vport)
+{
+	vport->bridge = bridge;
+	return 0;
+}
+
+static int mlx5_esw_bridge_vport_cleanup(struct mlx5_esw_bridge_offloads *br_offloads,
+					 struct mlx5_vport *vport)
+{
+	mlx5_esw_bridge_put(br_offloads, vport->bridge);
+	vport->bridge = NULL;
+	return 0;
+}
+
+int mlx5_esw_bridge_vport_link(int ifindex, struct mlx5_esw_bridge_offloads *br_offloads,
+			       struct mlx5_vport *vport, struct netlink_ext_ack *extack)
+{
+	struct mlx5_esw_bridge *bridge;
+
+	WARN_ON(vport->bridge);
+
+	bridge = mlx5_esw_bridge_lookup(ifindex, br_offloads);
+	if (IS_ERR(bridge)) {
+		NL_SET_ERR_MSG_MOD(extack, "Error checking for existing bridge with same ifindex");
+		return PTR_ERR(bridge);
+	}
+
+	return mlx5_esw_bridge_vport_init(bridge, vport);
+}
+
+int mlx5_esw_bridge_vport_unlink(int ifindex, struct mlx5_esw_bridge_offloads *br_offloads,
+				 struct mlx5_vport *vport, struct netlink_ext_ack *extack)
+{
+	if (!vport->bridge) {
+		NL_SET_ERR_MSG_MOD(extack, "Port is not attached to any bridge");
+		return -EINVAL;
+	}
+	if (vport->bridge->ifindex != ifindex) {
+		NL_SET_ERR_MSG_MOD(extack, "Port is attached to another bridge");
+		return -EINVAL;
+	}
+
+	return mlx5_esw_bridge_vport_cleanup(br_offloads, vport);
+}
+
+static void mlx5_esw_bridge_flush(struct mlx5_esw_bridge_offloads *br_offloads)
+{
+	struct mlx5_eswitch *esw = br_offloads->esw;
+	struct mlx5_vport *vport;
+	unsigned long i;
+
+	mlx5_esw_for_each_vport(esw, i, vport)
+		if (vport->bridge)
+			mlx5_esw_bridge_vport_cleanup(br_offloads, vport);
+
+	WARN_ONCE(!list_empty(&br_offloads->bridges),
+		  "Cleaning up bridge offloads while still having bridges attached\n");
+}
+
+struct mlx5_esw_bridge_offloads *mlx5_esw_bridge_init(struct mlx5_eswitch *esw)
+{
+	struct mlx5_esw_bridge_offloads *br_offloads;
+
+	br_offloads = kvzalloc(sizeof(*br_offloads), GFP_KERNEL);
+	if (!br_offloads)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&br_offloads->bridges);
+	br_offloads->esw = esw;
+	esw->br_offloads = br_offloads;
+
+	return br_offloads;
+}
+
+void mlx5_esw_bridge_cleanup(struct mlx5_eswitch *esw)
+{
+	struct mlx5_esw_bridge_offloads *br_offloads = esw->br_offloads;
+
+	if (!br_offloads)
+		return;
+
+	mlx5_esw_bridge_flush(br_offloads);
+
+	esw->br_offloads = NULL;
+	kvfree(br_offloads);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.h
new file mode 100644
index 000000000000..319b6f1db0ba
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021 Mellanox Technologies. */
+
+#ifndef __MLX5_ESW_BRIDGE_H__
+#define __MLX5_ESW_BRIDGE_H__
+
+#include <linux/notifier.h>
+#include <linux/list.h>
+#include "eswitch.h"
+
+struct mlx5_flow_table;
+struct mlx5_flow_group;
+
+struct mlx5_esw_bridge_offloads {
+	struct mlx5_eswitch *esw;
+	struct list_head bridges;
+	struct notifier_block netdev_nb;
+
+	struct mlx5_flow_table *ingress_ft;
+	struct mlx5_flow_group *ingress_mac_fg;
+};
+
+struct mlx5_esw_bridge_offloads *mlx5_esw_bridge_init(struct mlx5_eswitch *esw);
+void mlx5_esw_bridge_cleanup(struct mlx5_eswitch *esw);
+int mlx5_esw_bridge_vport_link(int ifindex, struct mlx5_esw_bridge_offloads *br_offloads,
+			       struct mlx5_vport *vport, struct netlink_ext_ack *extack);
+int mlx5_esw_bridge_vport_unlink(int ifindex, struct mlx5_esw_bridge_offloads *br_offloads,
+				 struct mlx5_vport *vport, struct netlink_ext_ack *extack);
+
+#endif /* __MLX5_ESW_BRIDGE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 55404eabff39..48cac5bf606d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -150,6 +150,8 @@ enum mlx5_eswitch_vport_event {
 	MLX5_VPORT_PROMISC_CHANGE = BIT(3),
 };
 
+struct mlx5_esw_bridge;
+
 struct mlx5_vport {
 	struct mlx5_core_dev    *dev;
 	struct hlist_head       uc_list[MLX5_L2_ADDR_HASH_SIZE];
@@ -178,6 +180,7 @@ struct mlx5_vport {
 	enum mlx5_eswitch_vport_event enabled_events;
 	int index;
 	struct devlink_port *dl_port;
+	struct mlx5_esw_bridge *bridge;
 };
 
 struct mlx5_esw_indir_table;
@@ -271,6 +274,8 @@ enum {
 	MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED = BIT(1),
 };
 
+struct mlx5_esw_bridge_offloads;
+
 struct mlx5_eswitch {
 	struct mlx5_core_dev    *dev;
 	struct mlx5_nb          nb;
@@ -300,6 +305,7 @@ struct mlx5_eswitch {
 		u32             root_tsar_id;
 	} qos;
 
+	struct mlx5_esw_bridge_offloads *br_offloads;
 	struct mlx5_esw_offload offloads;
 	int                     mode;
 	u16                     manager_vport;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index fc70c4ed8469..fc37ac9eab12 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2786,6 +2786,12 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
 		goto out_err;
 	}
 
+	maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_BR_OFFLOAD, 2);
+	if (IS_ERR(maj_prio)) {
+		err = PTR_ERR(maj_prio);
+		goto out_err;
+	}
+
 	maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_SLOW_PATH, 1);
 	if (IS_ERR(maj_prio)) {
 		err = PTR_ERR(maj_prio);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 271f2f4d6b60..77746f7e35b8 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -88,6 +88,7 @@ enum {
 	FDB_TC_OFFLOAD,
 	FDB_FT_OFFLOAD,
 	FDB_TC_MISS,
+	FDB_BR_OFFLOAD,
 	FDB_SLOW_PATH,
 	FDB_PER_VPORT,
 };
-- 
cgit v1.2.3


From 40483774141625b9685b177fb6e1f36de48d33f8 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Thu, 10 Jun 2021 10:00:59 +0800
Subject: iommu/vt-d: Use iommu_sva_alloc(free)_pasid() helpers

Align the pasid alloc/free code with the generic helpers defined in the
iommu core. This also refactored the SVA binding code to improve the
readability.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/r/20210610020115.1637656-8-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/Kconfig |   1 +
 drivers/iommu/intel/iommu.c |   3 +
 drivers/iommu/intel/svm.c   | 278 ++++++++++++++++++--------------------------
 include/linux/intel-iommu.h |   1 -
 4 files changed, 120 insertions(+), 163 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 7e5b240b801d..a37bd54c5b90 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -42,6 +42,7 @@ config INTEL_IOMMU_SVM
 	select PCI_PRI
 	select MMU_NOTIFIER
 	select IOASID
+	select IOMMU_SVA_LIB
 	help
 	  Shared Virtual Memory (SVM) provides a facility for devices
 	  to access DMA resources through process address space by
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index b0ba187cb7f8..0ca7f8a2f38e 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5411,6 +5411,9 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
 		if (!info)
 			return -EINVAL;
 
+		if (intel_iommu_enable_pasid(info->iommu, dev))
+			return -ENODEV;
+
 		if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
 			return -EINVAL;
 
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 82b0627ad7e7..da4310686ed3 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -23,9 +23,11 @@
 #include <asm/fpu/api.h>
 
 #include "pasid.h"
+#include "../iommu-sva-lib.h"
 
 static irqreturn_t prq_event_thread(int irq, void *d);
 static void intel_svm_drain_prq(struct device *dev, u32 pasid);
+#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
 
 #define PRQ_ORDER 0
 
@@ -222,7 +224,6 @@ static const struct mmu_notifier_ops intel_mmuops = {
 };
 
 static DEFINE_MUTEX(pasid_mutex);
-static LIST_HEAD(global_svm_list);
 
 #define for_each_svm_dev(sdev, svm, d)			\
 	list_for_each_entry((sdev), &(svm)->devs, list)	\
@@ -477,79 +478,80 @@ static void load_pasid(struct mm_struct *mm, u32 pasid)
 	mutex_unlock(&mm->context.lock);
 }
 
-/* Caller must hold pasid_mutex, mm reference */
-static int
-intel_svm_bind_mm(struct device *dev, unsigned int flags,
-		  struct mm_struct *mm, struct intel_svm_dev **sd)
+static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
+				 unsigned int flags)
 {
-	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
-	struct intel_svm *svm = NULL, *t;
-	struct device_domain_info *info;
-	struct intel_svm_dev *sdev;
-	unsigned long iflags;
-	int pasid_max;
-	int ret;
+	ioasid_t max_pasid = dev_is_pci(dev) ?
+			pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id;
 
-	if (!iommu || dmar_disabled)
-		return -EINVAL;
+	return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
+}
 
-	if (!intel_svm_capable(iommu))
-		return -ENOTSUPP;
+static void intel_svm_free_pasid(struct mm_struct *mm)
+{
+	iommu_sva_free_pasid(mm);
+}
 
-	if (dev_is_pci(dev)) {
-		pasid_max = pci_max_pasids(to_pci_dev(dev));
-		if (pasid_max < 0)
-			return -EINVAL;
-	} else
-		pasid_max = 1 << 20;
+static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
+					   struct device *dev,
+					   struct mm_struct *mm,
+					   unsigned int flags)
+{
+	struct device_domain_info *info = get_domain_info(dev);
+	unsigned long iflags, sflags;
+	struct intel_svm_dev *sdev;
+	struct intel_svm *svm;
+	int ret = 0;
 
-	/* Bind supervisor PASID shuld have mm = NULL */
-	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
-		if (!ecap_srs(iommu->ecap) || mm) {
-			pr_err("Supervisor PASID with user provided mm.\n");
-			return -EINVAL;
-		}
-	}
+	svm = pasid_private_find(mm->pasid);
+	if (!svm) {
+		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+		if (!svm)
+			return ERR_PTR(-ENOMEM);
 
-	list_for_each_entry(t, &global_svm_list, list) {
-		if (t->mm != mm)
-			continue;
+		svm->pasid = mm->pasid;
+		svm->mm = mm;
+		svm->flags = flags;
+		INIT_LIST_HEAD_RCU(&svm->devs);
 
-		svm = t;
-		if (svm->pasid >= pasid_max) {
-			dev_warn(dev,
-				 "Limited PASID width. Cannot use existing PASID %d\n",
-				 svm->pasid);
-			ret = -ENOSPC;
-			goto out;
+		if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) {
+			svm->notifier.ops = &intel_mmuops;
+			ret = mmu_notifier_register(&svm->notifier, mm);
+			if (ret) {
+				kfree(svm);
+				return ERR_PTR(ret);
+			}
 		}
 
-		/* Find the matching device in svm list */
-		for_each_svm_dev(sdev, svm, dev) {
-			sdev->users++;
-			goto success;
+		ret = pasid_private_add(svm->pasid, svm);
+		if (ret) {
+			if (svm->notifier.ops)
+				mmu_notifier_unregister(&svm->notifier, mm);
+			kfree(svm);
+			return ERR_PTR(ret);
 		}
+	}
 
-		break;
+	/* Find the matching device in svm list */
+	for_each_svm_dev(sdev, svm, dev) {
+		sdev->users++;
+		goto success;
 	}
 
 	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
 	if (!sdev) {
 		ret = -ENOMEM;
-		goto out;
+		goto free_svm;
 	}
+
 	sdev->dev = dev;
 	sdev->iommu = iommu;
-
-	ret = intel_iommu_enable_pasid(iommu, dev);
-	if (ret) {
-		kfree(sdev);
-		goto out;
-	}
-
-	info = get_domain_info(dev);
 	sdev->did = FLPT_DEFAULT_DID;
 	sdev->sid = PCI_DEVID(info->bus, info->devfn);
+	sdev->users = 1;
+	sdev->pasid = svm->pasid;
+	sdev->sva.dev = dev;
+	init_rcu_head(&sdev->rcu);
 	if (info->ats_enabled) {
 		sdev->dev_iotlb = 1;
 		sdev->qdep = info->ats_qdep;
@@ -557,96 +559,37 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
 			sdev->qdep = 0;
 	}
 
-	/* Finish the setup now we know we're keeping it */
-	sdev->users = 1;
-	init_rcu_head(&sdev->rcu);
-
-	if (!svm) {
-		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
-		if (!svm) {
-			ret = -ENOMEM;
-			goto sdev_err;
-		}
-
-		if (pasid_max > intel_pasid_max_id)
-			pasid_max = intel_pasid_max_id;
-
-		/* Do not use PASID 0, reserved for RID to PASID */
-		svm->pasid = ioasid_alloc(NULL, PASID_MIN,
-					  pasid_max - 1, NULL);
-		if (svm->pasid == INVALID_IOASID) {
-			ret = -ENOSPC;
-			goto svm_err;
-		}
-
-		ret = pasid_private_add(svm->pasid, svm);
-		if (ret)
-			goto pasid_err;
+	/* Setup the pasid table: */
+	sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ?
+			PASID_FLAG_SUPERVISOR_MODE : 0;
+	sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
+	spin_lock_irqsave(&iommu->lock, iflags);
+	ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid,
+					    FLPT_DEFAULT_DID, sflags);
+	spin_unlock_irqrestore(&iommu->lock, iflags);
 
-		svm->notifier.ops = &intel_mmuops;
-		svm->mm = mm;
-		svm->flags = flags;
-		INIT_LIST_HEAD_RCU(&svm->devs);
-		INIT_LIST_HEAD(&svm->list);
-		ret = -ENOMEM;
-		if (mm) {
-			ret = mmu_notifier_register(&svm->notifier, mm);
-			if (ret)
-				goto priv_err;
-		}
+	if (ret)
+		goto free_sdev;
 
-		spin_lock_irqsave(&iommu->lock, iflags);
-		ret = intel_pasid_setup_first_level(iommu, dev,
-				mm ? mm->pgd : init_mm.pgd,
-				svm->pasid, FLPT_DEFAULT_DID,
-				(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
-				(cpu_feature_enabled(X86_FEATURE_LA57) ?
-				 PASID_FLAG_FL5LP : 0));
-		spin_unlock_irqrestore(&iommu->lock, iflags);
-		if (ret) {
-			if (mm)
-				mmu_notifier_unregister(&svm->notifier, mm);
-priv_err:
-			pasid_private_remove(svm->pasid);
-pasid_err:
-			ioasid_put(svm->pasid);
-svm_err:
-			kfree(svm);
-sdev_err:
-			kfree(sdev);
-			goto out;
-		}
+	/* The newly allocated pasid is loaded to the mm. */
+	if (!(flags & SVM_FLAG_SUPERVISOR_MODE) && list_empty(&svm->devs))
+		load_pasid(mm, svm->pasid);
 
-		list_add_tail(&svm->list, &global_svm_list);
-		if (mm) {
-			/* The newly allocated pasid is loaded to the mm. */
-			load_pasid(mm, svm->pasid);
-		}
-	} else {
-		/*
-		 * Binding a new device with existing PASID, need to setup
-		 * the PASID entry.
-		 */
-		spin_lock_irqsave(&iommu->lock, iflags);
-		ret = intel_pasid_setup_first_level(iommu, dev,
-						mm ? mm->pgd : init_mm.pgd,
-						svm->pasid, FLPT_DEFAULT_DID,
-						(mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
-						(cpu_feature_enabled(X86_FEATURE_LA57) ?
-						PASID_FLAG_FL5LP : 0));
-		spin_unlock_irqrestore(&iommu->lock, iflags);
-		if (ret)
-			goto sdev_err;
-	}
 	list_add_rcu(&sdev->list, &svm->devs);
 success:
-	sdev->pasid = svm->pasid;
-	sdev->sva.dev = dev;
-	if (sd)
-		*sd = sdev;
-	ret = 0;
-out:
-	return ret;
+	return &sdev->sva;
+
+free_sdev:
+	kfree(sdev);
+free_svm:
+	if (list_empty(&svm->devs)) {
+		if (svm->notifier.ops)
+			mmu_notifier_unregister(&svm->notifier, mm);
+		pasid_private_remove(mm->pasid);
+		kfree(svm);
+	}
+
+	return ERR_PTR(ret);
 }
 
 /* Caller must hold pasid_mutex */
@@ -655,6 +598,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
 	struct intel_svm_dev *sdev;
 	struct intel_iommu *iommu;
 	struct intel_svm *svm;
+	struct mm_struct *mm;
 	int ret = -EINVAL;
 
 	iommu = device_to_iommu(dev, NULL, NULL);
@@ -664,6 +608,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
 	ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
 	if (ret)
 		goto out;
+	mm = svm->mm;
 
 	if (sdev) {
 		sdev->users--;
@@ -682,13 +627,12 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
 			kfree_rcu(sdev, rcu);
 
 			if (list_empty(&svm->devs)) {
-				ioasid_put(svm->pasid);
-				if (svm->mm) {
-					mmu_notifier_unregister(&svm->notifier, svm->mm);
+				intel_svm_free_pasid(mm);
+				if (svm->notifier.ops) {
+					mmu_notifier_unregister(&svm->notifier, mm);
 					/* Clear mm's pasid. */
-					load_pasid(svm->mm, PASID_DISABLED);
+					load_pasid(mm, PASID_DISABLED);
 				}
-				list_del(&svm->list);
 				pasid_private_remove(svm->pasid);
 				/* We mandate that no page faults may be outstanding
 				 * for the PASID when intel_svm_unbind_mm() is called.
@@ -1073,31 +1017,42 @@ prq_advance:
 	return IRQ_RETVAL(handled);
 }
 
-#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
-struct iommu_sva *
-intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
 {
-	struct iommu_sva *sva = ERR_PTR(-EINVAL);
-	struct intel_svm_dev *sdev = NULL;
+	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
 	unsigned int flags = 0;
+	struct iommu_sva *sva;
 	int ret;
 
-	/*
-	 * TODO: Consolidate with generic iommu-sva bind after it is merged.
-	 * It will require shared SVM data structures, i.e. combine io_mm
-	 * and intel_svm etc.
-	 */
 	if (drvdata)
 		flags = *(unsigned int *)drvdata;
+
+	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
+		if (!ecap_srs(iommu->ecap)) {
+			dev_err(dev, "%s: Supervisor PASID not supported\n",
+				iommu->name);
+			return ERR_PTR(-EOPNOTSUPP);
+		}
+
+		if (mm) {
+			dev_err(dev, "%s: Supervisor PASID with user provided mm\n",
+				iommu->name);
+			return ERR_PTR(-EINVAL);
+		}
+
+		mm = &init_mm;
+	}
+
 	mutex_lock(&pasid_mutex);
-	ret = intel_svm_bind_mm(dev, flags, mm, &sdev);
-	if (ret)
-		sva = ERR_PTR(ret);
-	else if (sdev)
-		sva = &sdev->sva;
-	else
-		WARN(!sdev, "SVM bind succeeded with no sdev!\n");
+	ret = intel_svm_alloc_pasid(dev, mm, flags);
+	if (ret) {
+		mutex_unlock(&pasid_mutex);
+		return ERR_PTR(ret);
+	}
 
+	sva = intel_svm_bind_mm(iommu, dev, mm, flags);
+	if (IS_ERR_OR_NULL(sva))
+		intel_svm_free_pasid(mm);
 	mutex_unlock(&pasid_mutex);
 
 	return sva;
@@ -1105,10 +1060,9 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
 
 void intel_svm_unbind(struct iommu_sva *sva)
 {
-	struct intel_svm_dev *sdev;
+	struct intel_svm_dev *sdev = to_intel_svm_dev(sva);
 
 	mutex_lock(&pasid_mutex);
-	sdev = to_intel_svm_dev(sva);
 	intel_svm_unbind_mm(sdev->dev, sdev->pasid);
 	mutex_unlock(&pasid_mutex);
 }
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 03faf20a6817..4e8bb186daa7 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -791,7 +791,6 @@ struct intel_svm {
 	u32 pasid;
 	int gpasid; /* In case that guest PASID is different from host PASID */
 	struct list_head devs;
-	struct list_head list;
 };
 #else
 static inline void intel_svm_check(struct intel_iommu *iommu) {}
-- 
cgit v1.2.3


From 4c82b88696ac57810ab923b3c5b0734646b9b69f Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Thu, 10 Jun 2021 10:01:02 +0800
Subject: iommu/vt-d: Allocate/register iopf queue for sva devices

This allocates and registers the iopf queue infrastructure for devices
which want to support IO page fault for SVA.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/r/20210610020115.1637656-11-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 66 ++++++++++++++++++++++++++++++++-------------
 drivers/iommu/intel/svm.c   | 37 ++++++++++++++++++++-----
 include/linux/intel-iommu.h |  2 ++
 3 files changed, 79 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 0ca7f8a2f38e..e78773d46d7d 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -46,6 +46,7 @@
 #include <asm/iommu.h>
 
 #include "../irq_remapping.h"
+#include "../iommu-sva-lib.h"
 #include "pasid.h"
 #include "cap_audit.h"
 
@@ -5338,6 +5339,34 @@ static int intel_iommu_disable_auxd(struct device *dev)
 	return 0;
 }
 
+static int intel_iommu_enable_sva(struct device *dev)
+{
+	struct device_domain_info *info = get_domain_info(dev);
+	struct intel_iommu *iommu = info->iommu;
+
+	if (!info || !iommu || dmar_disabled)
+		return -EINVAL;
+
+	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
+		return -ENODEV;
+
+	if (intel_iommu_enable_pasid(iommu, dev))
+		return -ENODEV;
+
+	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
+		return -EINVAL;
+
+	return iopf_queue_add_device(iommu->iopf_queue, dev);
+}
+
+static int intel_iommu_disable_sva(struct device *dev)
+{
+	struct device_domain_info *info = get_domain_info(dev);
+	struct intel_iommu *iommu = info->iommu;
+
+	return iopf_queue_remove_device(iommu->iopf_queue, dev);
+}
+
 /*
  * A PCI express designated vendor specific extended capability is defined
  * in the section 3.7 of Intel scalable I/O virtualization technical spec
@@ -5399,38 +5428,37 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
 static int
 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
 {
-	if (feat == IOMMU_DEV_FEAT_AUX)
+	switch (feat) {
+	case IOMMU_DEV_FEAT_AUX:
 		return intel_iommu_enable_auxd(dev);
 
-	if (feat == IOMMU_DEV_FEAT_IOPF)
+	case IOMMU_DEV_FEAT_IOPF:
 		return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
 
-	if (feat == IOMMU_DEV_FEAT_SVA) {
-		struct device_domain_info *info = get_domain_info(dev);
-
-		if (!info)
-			return -EINVAL;
-
-		if (intel_iommu_enable_pasid(info->iommu, dev))
-			return -ENODEV;
+	case IOMMU_DEV_FEAT_SVA:
+		return intel_iommu_enable_sva(dev);
 
-		if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
-			return -EINVAL;
-
-		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
-			return 0;
+	default:
+		return -ENODEV;
 	}
-
-	return -ENODEV;
 }
 
 static int
 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
 {
-	if (feat == IOMMU_DEV_FEAT_AUX)
+	switch (feat) {
+	case IOMMU_DEV_FEAT_AUX:
 		return intel_iommu_disable_auxd(dev);
 
-	return -ENODEV;
+	case IOMMU_DEV_FEAT_IOPF:
+		return 0;
+
+	case IOMMU_DEV_FEAT_SVA:
+		return intel_iommu_disable_sva(dev);
+
+	default:
+		return -ENODEV;
+	}
 }
 
 static bool
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index d51ddece4259..4dc3ab36e9ae 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -84,6 +84,7 @@ svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev)
 
 int intel_svm_enable_prq(struct intel_iommu *iommu)
 {
+	struct iopf_queue *iopfq;
 	struct page *pages;
 	int irq, ret;
 
@@ -100,13 +101,20 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
 		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
 		       iommu->name);
 		ret = -EINVAL;
-	err:
-		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
-		iommu->prq = NULL;
-		return ret;
+		goto free_prq;
 	}
 	iommu->pr_irq = irq;
 
+	snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name),
+		 "dmar%d-iopfq", iommu->seq_id);
+	iopfq = iopf_queue_alloc(iommu->iopfq_name);
+	if (!iopfq) {
+		pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name);
+		ret = -ENOMEM;
+		goto free_hwirq;
+	}
+	iommu->iopf_queue = iopfq;
+
 	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
 
 	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
@@ -114,9 +122,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
 	if (ret) {
 		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
 		       iommu->name);
-		dmar_free_hwirq(irq);
-		iommu->pr_irq = 0;
-		goto err;
+		goto free_iopfq;
 	}
 	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
 	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
@@ -125,6 +131,18 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
 	init_completion(&iommu->prq_complete);
 
 	return 0;
+
+free_iopfq:
+	iopf_queue_free(iommu->iopf_queue);
+	iommu->iopf_queue = NULL;
+free_hwirq:
+	dmar_free_hwirq(irq);
+	iommu->pr_irq = 0;
+free_prq:
+	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
+	iommu->prq = NULL;
+
+	return ret;
 }
 
 int intel_svm_finish_prq(struct intel_iommu *iommu)
@@ -139,6 +157,11 @@ int intel_svm_finish_prq(struct intel_iommu *iommu)
 		iommu->pr_irq = 0;
 	}
 
+	if (iommu->iopf_queue) {
+		iopf_queue_free(iommu->iopf_queue);
+		iommu->iopf_queue = NULL;
+	}
+
 	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
 	iommu->prq = NULL;
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 4e8bb186daa7..222520d149c1 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -606,6 +606,8 @@ struct intel_iommu {
 	struct completion prq_complete;
 	struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */
 #endif
+	struct iopf_queue *iopf_queue;
+	unsigned char iopfq_name[16];
 	struct q_inval  *qi;            /* Queued invalidation info */
 	u32 *iommu_state; /* Store iommu states between suspend and resume.*/
 
-- 
cgit v1.2.3


From e93a67f5a0eef3e9ab5b4649cac5c3b831c6a9db Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Thu, 10 Jun 2021 10:01:04 +0800
Subject: iommu/vt-d: Add prq_report trace event

This adds a new trace event to track the page fault request report.
This event will provide almost all information defined in a page
request descriptor.

A sample output:
| prq_report: dmar0/0000:00:0a.0 seq# 1: rid=0x50 addr=0x559ef6f97 r---- pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 2: rid=0x50 addr=0x559ef6f9c rw--l pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 3: rid=0x50 addr=0x559ef6f98 r---- pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 4: rid=0x50 addr=0x559ef6f9d rw--l pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 5: rid=0x50 addr=0x559ef6f99 r---- pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 6: rid=0x50 addr=0x559ef6f9e rw--l pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 7: rid=0x50 addr=0x559ef6f9a r---- pasid=0x2 index=0x1
| prq_report: dmar0/0000:00:0a.0 seq# 8: rid=0x50 addr=0x559ef6f9f rw--l pasid=0x2 index=0x1

This will be helpful for I/O page fault related debugging.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/r/20210610020115.1637656-13-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/svm.c          |  7 +++++++
 include/linux/intel-iommu.h        | 29 +++++++++++++++++++++++++++++
 include/trace/events/intel_iommu.h | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index ade157b64ce7..d3d028c6a727 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -21,6 +21,7 @@
 #include <linux/ioasid.h>
 #include <asm/page.h>
 #include <asm/fpu/api.h>
+#include <trace/events/intel_iommu.h>
 
 #include "pasid.h"
 #include "../iommu-sva-lib.h"
@@ -976,12 +977,18 @@ bad_req:
 				goto bad_req;
 		}
 
+		sdev->prq_seq_number++;
+
 		/*
 		 * If prq is to be handled outside iommu driver via receiver of
 		 * the fault notifiers, we skip the page response here.
 		 */
 		if (intel_svm_prq_report(sdev->dev, req))
 			handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
+
+		trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1,
+				 req->priv_data[0], req->priv_data[1],
+				 sdev->prq_seq_number);
 prq_advance:
 		head = (head + sizeof(*req)) & PRQ_RING_MASK;
 	}
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 222520d149c1..98b04fa9373e 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -778,6 +778,7 @@ struct intel_svm_dev {
 	struct device *dev;
 	struct intel_iommu *iommu;
 	struct iommu_sva sva;
+	unsigned long prq_seq_number;
 	u32 pasid;
 	int users;
 	u16 did;
@@ -828,4 +829,32 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 #define intel_iommu_enabled (0)
 #endif
 
+static inline const char *decode_prq_descriptor(char *str, size_t size,
+		u64 dw0, u64 dw1, u64 dw2, u64 dw3)
+{
+	char *buf = str;
+	int bytes;
+
+	bytes = snprintf(buf, size,
+			 "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx",
+			 FIELD_GET(GENMASK_ULL(31, 16), dw0),
+			 FIELD_GET(GENMASK_ULL(63, 12), dw1),
+			 dw1 & BIT_ULL(0) ? 'r' : '-',
+			 dw1 & BIT_ULL(1) ? 'w' : '-',
+			 dw0 & BIT_ULL(52) ? 'x' : '-',
+			 dw0 & BIT_ULL(53) ? 'p' : '-',
+			 dw1 & BIT_ULL(2) ? 'l' : '-',
+			 FIELD_GET(GENMASK_ULL(51, 32), dw0),
+			 FIELD_GET(GENMASK_ULL(11, 3), dw1));
+
+	/* Private Data */
+	if (dw0 & BIT_ULL(9)) {
+		size -= bytes;
+		buf += bytes;
+		snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3);
+	}
+
+	return str;
+}
+
 #endif
diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h
index d233f2916584..e5c1ca6d16ee 100644
--- a/include/trace/events/intel_iommu.h
+++ b/include/trace/events/intel_iommu.h
@@ -15,6 +15,8 @@
 #include <linux/tracepoint.h>
 #include <linux/intel-iommu.h>
 
+#define MSG_MAX		256
+
 TRACE_EVENT(qi_submit,
 	TP_PROTO(struct intel_iommu *iommu, u64 qw0, u64 qw1, u64 qw2, u64 qw3),
 
@@ -51,6 +53,41 @@ TRACE_EVENT(qi_submit,
 		__entry->qw0, __entry->qw1, __entry->qw2, __entry->qw3
 	)
 );
+
+TRACE_EVENT(prq_report,
+	TP_PROTO(struct intel_iommu *iommu, struct device *dev,
+		 u64 dw0, u64 dw1, u64 dw2, u64 dw3,
+		 unsigned long seq),
+
+	TP_ARGS(iommu, dev, dw0, dw1, dw2, dw3, seq),
+
+	TP_STRUCT__entry(
+		__field(u64, dw0)
+		__field(u64, dw1)
+		__field(u64, dw2)
+		__field(u64, dw3)
+		__field(unsigned long, seq)
+		__string(iommu, iommu->name)
+		__string(dev, dev_name(dev))
+		__dynamic_array(char, buff, MSG_MAX)
+	),
+
+	TP_fast_assign(
+		__entry->dw0 = dw0;
+		__entry->dw1 = dw1;
+		__entry->dw2 = dw2;
+		__entry->dw3 = dw3;
+		__entry->seq = seq;
+		__assign_str(iommu, iommu->name);
+		__assign_str(dev, dev_name(dev));
+	),
+
+	TP_printk("%s/%s seq# %ld: %s",
+		__get_str(iommu), __get_str(dev), __entry->seq,
+		decode_prq_descriptor(__get_str(buff), MSG_MAX, __entry->dw0,
+				      __entry->dw1, __entry->dw2, __entry->dw3)
+	)
+);
 #endif /* _TRACE_INTEL_IOMMU_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 55ee5e67a59a1b6f388d7a1c7b24022145f47a3e Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Thu, 10 Jun 2021 10:01:05 +0800
Subject: iommu/vt-d: Add common code for dmar latency performance monitors

The execution time of some operations is very performance critical, such
as cache invalidation and PRQ processing time. This adds some common code
to monitor the execution time range of those operations. The interfaces
include enabling/disabling, checking status, updating sampling data and
providing a common string format for users.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210520031531.712333-1-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/r/20210610020115.1637656-14-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/Kconfig  |   3 +
 drivers/iommu/intel/Makefile |   1 +
 drivers/iommu/intel/perf.c   | 166 +++++++++++++++++++++++++++++++++++++++++++
 drivers/iommu/intel/perf.h   |  73 +++++++++++++++++++
 include/linux/intel-iommu.h  |   1 +
 5 files changed, 244 insertions(+)
 create mode 100644 drivers/iommu/intel/perf.c
 create mode 100644 drivers/iommu/intel/perf.h

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index a37bd54c5b90..59be5447b775 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -3,6 +3,9 @@
 config DMAR_TABLE
 	bool
 
+config DMAR_PERF
+	bool
+
 config INTEL_IOMMU
 	bool "Support for Intel IOMMU using DMA Remapping Devices"
 	depends on PCI_MSI && ACPI && (X86 || IA64)
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index ae236ec7d219..fa0dae16441c 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -2,6 +2,7 @@
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
 obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
+obj-$(CONFIG_DMAR_PERF) += perf.o
 obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
 obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
 obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c
new file mode 100644
index 000000000000..faaa96dda437
--- /dev/null
+++ b/drivers/iommu/intel/perf.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * perf.c - performance monitor
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ *         Fenghua Yu <fenghua.yu@intel.com>
+ */
+
+#include <linux/spinlock.h>
+#include <linux/intel-iommu.h>
+
+#include "perf.h"
+
+static DEFINE_SPINLOCK(latency_lock);
+
+bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type)
+{
+	struct latency_statistic *lstat = iommu->perf_statistic;
+
+	return lstat && lstat[type].enabled;
+}
+
+int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type)
+{
+	struct latency_statistic *lstat;
+	unsigned long flags;
+	int ret = -EBUSY;
+
+	if (dmar_latency_enabled(iommu, type))
+		return 0;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	if (!iommu->perf_statistic) {
+		iommu->perf_statistic = kzalloc(sizeof(*lstat) * DMAR_LATENCY_NUM,
+						GFP_ATOMIC);
+		if (!iommu->perf_statistic) {
+			ret = -ENOMEM;
+			goto unlock_out;
+		}
+	}
+
+	lstat = iommu->perf_statistic;
+
+	if (!lstat[type].enabled) {
+		lstat[type].enabled = true;
+		lstat[type].counter[COUNTS_MIN] = UINT_MAX;
+		ret = 0;
+	}
+unlock_out:
+	spin_unlock_irqrestore(&latency_lock, flags);
+
+	return ret;
+}
+
+void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type)
+{
+	struct latency_statistic *lstat = iommu->perf_statistic;
+	unsigned long flags;
+
+	if (!dmar_latency_enabled(iommu, type))
+		return;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	memset(&lstat[type], 0, sizeof(*lstat) * DMAR_LATENCY_NUM);
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency)
+{
+	struct latency_statistic *lstat = iommu->perf_statistic;
+	unsigned long flags;
+	u64 min, max;
+
+	if (!dmar_latency_enabled(iommu, type))
+		return;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	if (latency < 100)
+		lstat[type].counter[COUNTS_10e2]++;
+	else if (latency < 1000)
+		lstat[type].counter[COUNTS_10e3]++;
+	else if (latency < 10000)
+		lstat[type].counter[COUNTS_10e4]++;
+	else if (latency < 100000)
+		lstat[type].counter[COUNTS_10e5]++;
+	else if (latency < 1000000)
+		lstat[type].counter[COUNTS_10e6]++;
+	else if (latency < 10000000)
+		lstat[type].counter[COUNTS_10e7]++;
+	else
+		lstat[type].counter[COUNTS_10e8_plus]++;
+
+	min = lstat[type].counter[COUNTS_MIN];
+	max = lstat[type].counter[COUNTS_MAX];
+	lstat[type].counter[COUNTS_MIN] = min_t(u64, min, latency);
+	lstat[type].counter[COUNTS_MAX] = max_t(u64, max, latency);
+	lstat[type].counter[COUNTS_SUM] += latency;
+	lstat[type].samples++;
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static char *latency_counter_names[] = {
+	"                  <0.1us",
+	"   0.1us-1us", "    1us-10us", "  10us-100us",
+	"   100us-1ms", "    1ms-10ms", "      >=10ms",
+	"     min(us)", "     max(us)", " average(us)"
+};
+
+static char *latency_type_names[] = {
+	"   inv_iotlb", "  inv_devtlb", "     inv_iec",
+	"     svm_prq"
+};
+
+int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
+{
+	struct latency_statistic *lstat = iommu->perf_statistic;
+	unsigned long flags;
+	int bytes = 0, i, j;
+
+	memset(str, 0, size);
+
+	for (i = 0; i < COUNTS_NUM; i++)
+		bytes += snprintf(str + bytes, size - bytes,
+				  "%s", latency_counter_names[i]);
+
+	spin_lock_irqsave(&latency_lock, flags);
+	for (i = 0; i < DMAR_LATENCY_NUM; i++) {
+		if (!dmar_latency_enabled(iommu, i))
+			continue;
+
+		bytes += snprintf(str + bytes, size - bytes,
+				  "\n%s", latency_type_names[i]);
+
+		for (j = 0; j < COUNTS_NUM; j++) {
+			u64 val = lstat[i].counter[j];
+
+			switch (j) {
+			case COUNTS_MIN:
+				if (val == UINT_MAX)
+					val = 0;
+				else
+					val /= 1000;
+				break;
+			case COUNTS_MAX:
+				val /= 1000;
+				break;
+			case COUNTS_SUM:
+				if (lstat[i].samples)
+					val /= (lstat[i].samples * 1000);
+				else
+					val = 0;
+				break;
+			default:
+				break;
+			}
+
+			bytes += snprintf(str + bytes, size - bytes,
+					  "%12lld", val);
+		}
+	}
+	spin_unlock_irqrestore(&latency_lock, flags);
+
+	return bytes;
+}
diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h
new file mode 100644
index 000000000000..fd6db8049d1a
--- /dev/null
+++ b/drivers/iommu/intel/perf.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * perf.h - performance monitor header
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+enum latency_type {
+	DMAR_LATENCY_INV_IOTLB = 0,
+	DMAR_LATENCY_INV_DEVTLB,
+	DMAR_LATENCY_INV_IEC,
+	DMAR_LATENCY_PRQ,
+	DMAR_LATENCY_NUM
+};
+
+enum latency_count {
+	COUNTS_10e2 = 0,	/* < 0.1us	*/
+	COUNTS_10e3,		/* 0.1us ~ 1us	*/
+	COUNTS_10e4,		/* 1us ~ 10us	*/
+	COUNTS_10e5,		/* 10us ~ 100us	*/
+	COUNTS_10e6,		/* 100us ~ 1ms	*/
+	COUNTS_10e7,		/* 1ms ~ 10ms	*/
+	COUNTS_10e8_plus,	/* 10ms and plus*/
+	COUNTS_MIN,
+	COUNTS_MAX,
+	COUNTS_SUM,
+	COUNTS_NUM
+};
+
+struct latency_statistic {
+	bool enabled;
+	u64 counter[COUNTS_NUM];
+	u64 samples;
+};
+
+#ifdef CONFIG_DMAR_PERF
+int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type);
+void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type);
+bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type);
+void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type,
+			 u64 latency);
+int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size);
+#else
+static inline int
+dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type)
+{
+	return -EINVAL;
+}
+
+static inline void
+dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type)
+{
+}
+
+static inline bool
+dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type)
+{
+	return false;
+}
+
+static inline void
+dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency)
+{
+}
+
+static inline int
+dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size)
+{
+	return 0;
+}
+#endif /* CONFIG_DMAR_PERF */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 98b04fa9373e..f5cf31dd7280 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -621,6 +621,7 @@ struct intel_iommu {
 	u32		flags;      /* Software defined flags */
 
 	struct dmar_drhd_unit *drhd;
+	void *perf_statistic;
 };
 
 /* Per subdevice private data */
-- 
cgit v1.2.3


From 1f106ff0ea2782a6bc49bb927e4789681a2ec507 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Thu, 10 Jun 2021 10:01:11 +0800
Subject: iommu/vt-d: Use bitfields for DMAR capabilities

IOTLB device presence, iommu coherency and snooping are boolean
capabilities. Use them as bits and keep them adjacent.

Structure layout before the reorg.
$ pahole -C dmar_domain drivers/iommu/intel/dmar.o
struct dmar_domain {
        int                        nid;                  /*     0     4 */
        unsigned int               iommu_refcnt[128];    /*     4   512 */
        /* --- cacheline 8 boundary (512 bytes) was 4 bytes ago --- */
        u16                        iommu_did[128];       /*   516   256 */
        /* --- cacheline 12 boundary (768 bytes) was 4 bytes ago --- */
        bool                       has_iotlb_device;     /*   772     1 */

        /* XXX 3 bytes hole, try to pack */

        struct list_head           devices;              /*   776    16 */
        struct list_head           subdevices;           /*   792    16 */
        struct iova_domain         iovad __attribute__((__aligned__(8)));
							 /*   808  2320 */
        /* --- cacheline 48 boundary (3072 bytes) was 56 bytes ago --- */
        struct dma_pte *           pgd;                  /*  3128     8 */
        /* --- cacheline 49 boundary (3136 bytes) --- */
        int                        gaw;                  /*  3136     4 */
        int                        agaw;                 /*  3140     4 */
        int                        flags;                /*  3144     4 */
        int                        iommu_coherency;      /*  3148     4 */
        int                        iommu_snooping;       /*  3152     4 */
        int                        iommu_count;          /*  3156     4 */
        int                        iommu_superpage;      /*  3160     4 */

        /* XXX 4 bytes hole, try to pack */

        u64                        max_addr;             /*  3168     8 */
        u32                        default_pasid;        /*  3176     4 */

        /* XXX 4 bytes hole, try to pack */

        struct iommu_domain        domain;               /*  3184    72 */

        /* size: 3256, cachelines: 51, members: 18 */
        /* sum members: 3245, holes: 3, sum holes: 11 */
        /* forced alignments: 1 */
        /* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));

After arranging it for natural padding and to make flags as u8 bits, it
saves 8 bytes for the struct.

struct dmar_domain {
        int                        nid;                  /*     0     4 */
        unsigned int               iommu_refcnt[128];    /*     4   512 */
        /* --- cacheline 8 boundary (512 bytes) was 4 bytes ago --- */
        u16                        iommu_did[128];       /*   516   256 */
        /* --- cacheline 12 boundary (768 bytes) was 4 bytes ago --- */
        u8                         has_iotlb_device:1;   /*   772: 0  1 */
        u8                         iommu_coherency:1;    /*   772: 1  1 */
        u8                         iommu_snooping:1;     /*   772: 2  1 */

        /* XXX 5 bits hole, try to pack */
        /* XXX 3 bytes hole, try to pack */

        struct list_head           devices;              /*   776    16 */
        struct list_head           subdevices;           /*   792    16 */
        struct iova_domain         iovad __attribute__((__aligned__(8)));
							 /*   808  2320 */
        /* --- cacheline 48 boundary (3072 bytes) was 56 bytes ago --- */
        struct dma_pte *           pgd;                  /*  3128     8 */
        /* --- cacheline 49 boundary (3136 bytes) --- */
        int                        gaw;                  /*  3136     4 */
        int                        agaw;                 /*  3140     4 */
        int                        flags;                /*  3144     4 */
        int                        iommu_count;          /*  3148     4 */
        int                        iommu_superpage;      /*  3152     4 */

        /* XXX 4 bytes hole, try to pack */

        u64                        max_addr;             /*  3160     8 */
        u32                        default_pasid;        /*  3168     4 */

        /* XXX 4 bytes hole, try to pack */

        struct iommu_domain        domain;               /*  3176    72 */

        /* size: 3248, cachelines: 51, members: 18 */
        /* sum members: 3236, holes: 3, sum holes: 11 */
        /* sum bitfield members: 3 bits, bit holes: 1, sum bit holes: 5 bits */
        /* forced alignments: 1 */
        /* last cacheline: 48 bytes */
} __attribute__((__aligned__(8)));

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210530075053.264218-1-parav@nvidia.com
Link: https://lore.kernel.org/r/20210610020115.1637656-20-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 18 +++++++++---------
 include/linux/intel-iommu.h |  8 ++++----
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 65458aee0d95..430ef2232d47 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -626,12 +626,12 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain)
 	bool found = false;
 	int i;
 
-	domain->iommu_coherency = 1;
+	domain->iommu_coherency = true;
 
 	for_each_domain_iommu(i, domain) {
 		found = true;
 		if (!iommu_paging_structure_coherency(g_iommus[i])) {
-			domain->iommu_coherency = 0;
+			domain->iommu_coherency = false;
 			break;
 		}
 	}
@@ -642,18 +642,18 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain)
 	rcu_read_lock();
 	for_each_active_iommu(iommu, drhd) {
 		if (!iommu_paging_structure_coherency(iommu)) {
-			domain->iommu_coherency = 0;
+			domain->iommu_coherency = false;
 			break;
 		}
 	}
 	rcu_read_unlock();
 }
 
-static int domain_update_iommu_snooping(struct intel_iommu *skip)
+static bool domain_update_iommu_snooping(struct intel_iommu *skip)
 {
 	struct dmar_drhd_unit *drhd;
 	struct intel_iommu *iommu;
-	int ret = 1;
+	bool ret = true;
 
 	rcu_read_lock();
 	for_each_active_iommu(iommu, drhd) {
@@ -666,7 +666,7 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip)
 			 */
 			if (!sm_supported(iommu) &&
 			    !ecap_sc_support(iommu->ecap)) {
-				ret = 0;
+				ret = false;
 				break;
 			}
 		}
@@ -4506,8 +4506,8 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
 	adjust_width = guestwidth_to_adjustwidth(guest_width);
 	domain->agaw = width_to_agaw(adjust_width);
 
-	domain->iommu_coherency = 0;
-	domain->iommu_snooping = 0;
+	domain->iommu_coherency = false;
+	domain->iommu_snooping = false;
 	domain->iommu_superpage = 0;
 	domain->max_addr = 0;
 
@@ -5131,7 +5131,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 static bool intel_iommu_capable(enum iommu_cap cap)
 {
 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
-		return domain_update_iommu_snooping(NULL) == 1;
+		return domain_update_iommu_snooping(NULL);
 	if (cap == IOMMU_CAP_INTR_REMAP)
 		return irq_remapping_enabled == 1;
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index f5cf31dd7280..2621eff04c82 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -546,7 +546,10 @@ struct dmar_domain {
 					 * domain ids are 16 bit wide according
 					 * to VT-d spec, section 9.3 */
 
-	bool has_iotlb_device;
+	u8 has_iotlb_device: 1;
+	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
+	u8 iommu_snooping: 1;		/* indicate snooping control feature */
+
 	struct list_head devices;	/* all devices' list */
 	struct list_head subdevices;	/* all subdevices' list */
 	struct iova_domain iovad;	/* iova's that belong to this domain */
@@ -558,9 +561,6 @@ struct dmar_domain {
 	int		agaw;
 
 	int		flags;		/* flags to find out type of domain */
-
-	int		iommu_coherency;/* indicate coherency of iommu access */
-	int		iommu_snooping; /* indicate snooping control feature*/
 	int		iommu_count;	/* reference count of iommu */
 	int		iommu_superpage;/* Level of superpages supported:
 					   0 == 4KiB (no superpages), 1 == 2MiB,
-- 
cgit v1.2.3


From 74f6d776ae0b8498cfdb574ab24992bd50a2a2f1 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Thu, 10 Jun 2021 10:01:12 +0800
Subject: iommu/vt-d: Removed unused iommu_count in dmar domain

DMAR domain uses per DMAR refcount. It is indexed by iommu seq_id.
Older iommu_count is only incremented and decremented but no decisions
are taken based on this refcount. This is not of much use.

Hence, remove iommu_count and further simplify domain_detach_iommu()
by returning void.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210530075053.264218-1-parav@nvidia.com
Link: https://lore.kernel.org/r/20210610020115.1637656-21-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 11 +++--------
 include/linux/intel-iommu.h |  1 -
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 430ef2232d47..dd8ecfbfdb23 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1920,7 +1920,6 @@ static int domain_attach_iommu(struct dmar_domain *domain,
 	assert_spin_locked(&iommu->lock);
 
 	domain->iommu_refcnt[iommu->seq_id] += 1;
-	domain->iommu_count += 1;
 	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
 		ndomains = cap_ndoms(iommu->cap);
 		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
@@ -1928,7 +1927,6 @@ static int domain_attach_iommu(struct dmar_domain *domain,
 		if (num >= ndomains) {
 			pr_err("%s: No free domain ids\n", iommu->name);
 			domain->iommu_refcnt[iommu->seq_id] -= 1;
-			domain->iommu_count -= 1;
 			return -ENOSPC;
 		}
 
@@ -1944,16 +1942,15 @@ static int domain_attach_iommu(struct dmar_domain *domain,
 	return 0;
 }
 
-static int domain_detach_iommu(struct dmar_domain *domain,
-			       struct intel_iommu *iommu)
+static void domain_detach_iommu(struct dmar_domain *domain,
+				struct intel_iommu *iommu)
 {
-	int num, count;
+	int num;
 
 	assert_spin_locked(&device_domain_lock);
 	assert_spin_locked(&iommu->lock);
 
 	domain->iommu_refcnt[iommu->seq_id] -= 1;
-	count = --domain->iommu_count;
 	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
 		num = domain->iommu_did[iommu->seq_id];
 		clear_bit(num, iommu->domain_ids);
@@ -1962,8 +1959,6 @@ static int domain_detach_iommu(struct dmar_domain *domain,
 		domain_update_iommu_cap(domain);
 		domain->iommu_did[iommu->seq_id] = 0;
 	}
-
-	return count;
 }
 
 static inline int guestwidth_to_adjustwidth(int gaw)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 2621eff04c82..574b932dfe86 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -561,7 +561,6 @@ struct dmar_domain {
 	int		agaw;
 
 	int		flags;		/* flags to find out type of domain */
-	int		iommu_count;	/* reference count of iommu */
 	int		iommu_superpage;/* Level of superpages supported:
 					   0 == 4KiB (no superpages), 1 == 2MiB,
 					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-- 
cgit v1.2.3


From 9739ba327c01e26f672661ea751132c29a54d3d9 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Thu, 10 Jun 2021 10:01:14 +0800
Subject: iommu/vt-d: Define counter explicitly as unsigned int

Avoid below checkpatch warning.

WARNING: Prefer 'unsigned int' to bare use of 'unsigned'
+       unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];

Fixes: 29a27719abaa ("iommu/vt-d: Replace iommu_bmp with a refcount")
Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210530075053.264218-1-parav@nvidia.com
Link: https://lore.kernel.org/r/20210610020115.1637656-23-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 574b932dfe86..d0fa0b31994d 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -537,7 +537,7 @@ struct context_entry {
 struct dmar_domain {
 	int	nid;			/* node id */
 
-	unsigned	iommu_refcnt[DMAR_UNITS_SUPPORTED];
+	unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED];
 					/* Refcount of devices per iommu */
 
 
-- 
cgit v1.2.3


From 405e94e9aed2a38bdcd22efe53c36c6cd53185a6 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 13 Sep 2018 10:42:25 +0100
Subject: irqdomain: Kill irq_domain_add_legacy_isa

This helper doesn't have a user anymore, let's remove it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 Documentation/core-api/irq/irq-domain.rst |  1 -
 include/linux/irqdomain.h                 | 11 -----------
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst
index 8214e215a8bf..53283b3729a1 100644
--- a/Documentation/core-api/irq/irq-domain.rst
+++ b/Documentation/core-api/irq/irq-domain.rst
@@ -146,7 +146,6 @@ Legacy
 
 	irq_domain_add_simple()
 	irq_domain_add_legacy()
-	irq_domain_add_legacy_isa()
 	irq_domain_create_simple()
 	irq_domain_create_legacy()
 
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 62a8e3d23829..9f884c948739 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -45,9 +45,6 @@ struct cpumask;
 struct seq_file;
 struct irq_affinity_desc;
 
-/* Number of irqs reserved for a legacy isa controller */
-#define NUM_ISA_INTERRUPTS	16
-
 #define IRQ_DOMAIN_IRQ_SPEC_PARAMS 16
 
 /**
@@ -355,14 +352,6 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod
 {
 	return __irq_domain_add(of_node_to_fwnode(of_node), 0, max_irq, max_irq, ops, host_data);
 }
-static inline struct irq_domain *irq_domain_add_legacy_isa(
-				struct device_node *of_node,
-				const struct irq_domain_ops *ops,
-				void *host_data)
-{
-	return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops,
-				     host_data);
-}
 static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
 					 const struct irq_domain_ops *ops,
 					 void *host_data)
-- 
cgit v1.2.3


From 1da027362a7db422243601e895e6f8288389f435 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Fri, 2 Apr 2021 12:50:14 +0100
Subject: irqdomain: Reimplement irq_linear_revmap() with irq_find_mapping()

irq_linear_revmap() is supposed to be a fast path for domain
lookups, but it only exposes low-level details of the irqdomain
implementation, details which are better kept private.

The *overhead* between the two is only a function call and
a couple of tests, so it is likely that noone can show any
meaningful difference compared to the cost of taking an
interrupt.

Reimplement irq_linear_revmap() with irq_find_mapping()
in order to preserve source code compatibility, and
rename the internal field for a measure.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdomain.h | 22 +++++++++-------------
 kernel/irq/irqdomain.c    |  6 +++---
 2 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 9f884c948739..42b3f7d03a32 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -151,9 +151,9 @@ struct irq_domain_chip_generic;
  * Revmap data, used internally by irq_domain
  * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that
  *                         support direct mapping
- * @revmap_size: Size of the linear map table @linear_revmap[]
+ * @revmap_size: Size of the linear map table @revmap[]
  * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map
- * @linear_revmap: Linear table of hwirq->virq reverse mappings
+ * @revmap: Linear table of hwirq->virq reverse mappings
  */
 struct irq_domain {
 	struct list_head link;
@@ -177,7 +177,7 @@ struct irq_domain {
 	unsigned int revmap_size;
 	struct radix_tree_root revmap_tree;
 	struct mutex revmap_tree_mutex;
-	unsigned int linear_revmap[];
+	unsigned int revmap[];
 };
 
 /* Irq domain flags */
@@ -394,24 +394,20 @@ static inline unsigned int irq_create_mapping(struct irq_domain *host,
 	return irq_create_mapping_affinity(host, hwirq, NULL);
 }
 
-
 /**
- * irq_linear_revmap() - Find a linux irq from a hw irq number.
+ * irq_find_mapping() - Find a linux irq from a hw irq number.
  * @domain: domain owning this hardware interrupt
  * @hwirq: hardware irq number in that domain space
- *
- * This is a fast path alternative to irq_find_mapping() that can be
- * called directly by irq controller code to save a handful of
- * instructions. It is always safe to call, but won't find irqs mapped
- * using the radix tree.
  */
+extern unsigned int irq_find_mapping(struct irq_domain *host,
+				     irq_hw_number_t hwirq);
+
 static inline unsigned int irq_linear_revmap(struct irq_domain *domain,
 					     irq_hw_number_t hwirq)
 {
-	return hwirq < domain->revmap_size ? domain->linear_revmap[hwirq] : 0;
+	return irq_find_mapping(domain, hwirq);
 }
-extern unsigned int irq_find_mapping(struct irq_domain *host,
-				     irq_hw_number_t hwirq);
+
 extern unsigned int irq_create_direct_mapping(struct irq_domain *host);
 
 extern const struct irq_domain_ops irq_domain_simple_ops;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6284443b87ec..8bd012253989 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -486,7 +486,7 @@ static void irq_domain_clear_mapping(struct irq_domain *domain,
 				     irq_hw_number_t hwirq)
 {
 	if (hwirq < domain->revmap_size) {
-		domain->linear_revmap[hwirq] = 0;
+		domain->revmap[hwirq] = 0;
 	} else {
 		mutex_lock(&domain->revmap_tree_mutex);
 		radix_tree_delete(&domain->revmap_tree, hwirq);
@@ -499,7 +499,7 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
 				   struct irq_data *irq_data)
 {
 	if (hwirq < domain->revmap_size) {
-		domain->linear_revmap[hwirq] = irq_data->irq;
+		domain->revmap[hwirq] = irq_data->irq;
 	} else {
 		mutex_lock(&domain->revmap_tree_mutex);
 		radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
@@ -885,7 +885,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
 
 	/* Check if the hwirq is in the linear revmap. */
 	if (hwirq < domain->revmap_size)
-		return domain->linear_revmap[hwirq];
+		return domain->revmap[hwirq];
 
 	rcu_read_lock();
 	data = radix_tree_lookup(&domain->revmap_tree, hwirq);
-- 
cgit v1.2.3


From e37af8011a9631996e6cd32dd81a152708eee7d4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sun, 4 Apr 2021 13:06:39 +0100
Subject: powerpc: Move the use of irq_domain_add_nomap() behind a config
 option

Only a handful of old PPC systems are still using the old 'nomap'
variant of the irqdomain library. Move the associated definitions
behind a configuration option, which will allow us to make some
more radical changes.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/powerpc/platforms/cell/Kconfig     | 1 +
 arch/powerpc/platforms/powermac/Kconfig | 1 +
 arch/powerpc/platforms/ps3/Kconfig      | 1 +
 arch/powerpc/sysdev/xive/Kconfig        | 1 +
 include/linux/irqdomain.h               | 8 ++++++--
 kernel/irq/Kconfig                      | 5 +++++
 kernel/irq/irqdomain.c                  | 2 ++
 7 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig
index e7c976bcadff..cb70c5f25bc6 100644
--- a/arch/powerpc/platforms/cell/Kconfig
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -35,6 +35,7 @@ config PPC_IBM_CELL_BLADE
 config AXON_MSI
 	bool
 	depends on PPC_IBM_CELL_BLADE && PCI_MSI
+	select IRQ_DOMAIN_NOMAP
 	default y
 
 menu "Cell Broadband Engine options"
diff --git a/arch/powerpc/platforms/powermac/Kconfig b/arch/powerpc/platforms/powermac/Kconfig
index c02d8c503b29..b97bf12801eb 100644
--- a/arch/powerpc/platforms/powermac/Kconfig
+++ b/arch/powerpc/platforms/powermac/Kconfig
@@ -24,6 +24,7 @@ config PPC_PMAC32_PSURGE
 	bool "Support for powersurge upgrade cards" if EXPERT
 	depends on SMP && PPC32 && PPC_PMAC
 	select PPC_SMP_MUXED_IPI
+	select IRQ_DOMAIN_NOMAP
 	default y
 	help
 	  The powersurge cpu boards can be used in the generation
diff --git a/arch/powerpc/platforms/ps3/Kconfig b/arch/powerpc/platforms/ps3/Kconfig
index e32406e918d0..4d0535cc7946 100644
--- a/arch/powerpc/platforms/ps3/Kconfig
+++ b/arch/powerpc/platforms/ps3/Kconfig
@@ -7,6 +7,7 @@ config PPC_PS3
 	select USB_OHCI_BIG_ENDIAN_MMIO
 	select USB_EHCI_BIG_ENDIAN_MMIO
 	select HAVE_PCI
+	select IRQ_DOMAIN_NOMAP
 	help
 	  This option enables support for the Sony PS3 game console
 	  and other platforms using the PS3 hypervisor.  Enabling this
diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig
index 785c292d104b..97796c6b63f0 100644
--- a/arch/powerpc/sysdev/xive/Kconfig
+++ b/arch/powerpc/sysdev/xive/Kconfig
@@ -3,6 +3,7 @@ config PPC_XIVE
 	bool
 	select PPC_SMP_MUXED_IPI
 	select HARDIRQS_SW_RESEND
+	select IRQ_DOMAIN_NOMAP
 
 config PPC_XIVE_NATIVE
 	bool
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 42b3f7d03a32..723495ec5a2f 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -345,6 +345,8 @@ static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_no
 {
 	return __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data);
 }
+
+#ifdef CONFIG_IRQ_DOMAIN_NOMAP
 static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
 					 unsigned int max_irq,
 					 const struct irq_domain_ops *ops,
@@ -352,6 +354,10 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod
 {
 	return __irq_domain_add(of_node_to_fwnode(of_node), 0, max_irq, max_irq, ops, host_data);
 }
+
+extern unsigned int irq_create_direct_mapping(struct irq_domain *host);
+#endif
+
 static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
 					 const struct irq_domain_ops *ops,
 					 void *host_data)
@@ -408,8 +414,6 @@ static inline unsigned int irq_linear_revmap(struct irq_domain *domain,
 	return irq_find_mapping(domain, hwirq);
 }
 
-extern unsigned int irq_create_direct_mapping(struct irq_domain *host);
-
 extern const struct irq_domain_ops irq_domain_simple_ops;
 
 /* stock xlate functions */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d79ef2493a28..fbc54c2a7f23 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -70,6 +70,11 @@ config IRQ_DOMAIN_HIERARCHY
 	bool
 	select IRQ_DOMAIN
 
+# Support for obsolete non-mapping irq domains
+config IRQ_DOMAIN_NOMAP
+	bool
+	select IRQ_DOMAIN
+
 # Support for hierarchical fasteoi+edge and fasteoi+level handlers
 config IRQ_FASTEOI_HIERARCHY_HANDLERS
 	bool
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8bd012253989..e0143e640683 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -604,6 +604,7 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
 }
 EXPORT_SYMBOL_GPL(irq_domain_associate_many);
 
+#ifdef CONFIG_IRQ_DOMAIN_NOMAP
 /**
  * irq_create_direct_mapping() - Allocate an irq for direct mapping
  * @domain: domain to allocate the irq for or NULL for default domain
@@ -644,6 +645,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
 	return virq;
 }
 EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
+#endif
 
 /**
  * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
-- 
cgit v1.2.3


From 4f86a06e2d6ece5316e4c42fbf946ee22acb30f3 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 10 Sep 2018 18:33:46 +0100
Subject: irqdomain: Make normal and nomap irqdomains exclusive

Direct mappings are completely exclusive of normal mappings, meaning
that we can refactor the code slightly so that we can get rid of
the revmap_direct_max_irq field and use the revmap_size field
instead, reducing the size of the irqdomain structure.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdomain.h |  6 +++---
 kernel/irq/irqdomain.c    | 45 +++++++++++++++++++++++++++++++++++----------
 2 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 723495ec5a2f..0916cf9c6e20 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -149,8 +149,6 @@ struct irq_domain_chip_generic;
  * @parent: Pointer to parent irq_domain to support hierarchy irq_domains
  *
  * Revmap data, used internally by irq_domain
- * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that
- *                         support direct mapping
  * @revmap_size: Size of the linear map table @revmap[]
  * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map
  * @revmap: Linear table of hwirq->virq reverse mappings
@@ -173,7 +171,6 @@ struct irq_domain {
 
 	/* reverse map data. The linear map gets appended to the irq_domain */
 	irq_hw_number_t hwirq_max;
-	unsigned int revmap_direct_max_irq;
 	unsigned int revmap_size;
 	struct radix_tree_root revmap_tree;
 	struct mutex revmap_tree_mutex;
@@ -207,6 +204,9 @@ enum {
 	 */
 	IRQ_DOMAIN_MSI_NOMASK_QUIRK	= (1 << 6),
 
+	/* Irq domain doesn't translate anything */
+	IRQ_DOMAIN_FLAG_NO_MAP		= (1 << 7),
+
 	/*
 	 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
 	 * for implementation specific purposes and ignored by the
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e0143e640683..fa94c86e47d4 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -146,6 +146,10 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 
 	static atomic_t unknown_domains;
 
+	if (WARN_ON((size && direct_max) ||
+		    (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max)))
+		return NULL;
+
 	domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
 			      GFP_KERNEL, of_node_to_nid(to_of_node(fwnode)));
 	if (!domain)
@@ -213,8 +217,14 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 	domain->ops = ops;
 	domain->host_data = host_data;
 	domain->hwirq_max = hwirq_max;
+
+	if (direct_max) {
+		size = direct_max;
+		domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
+	}
+
 	domain->revmap_size = size;
-	domain->revmap_direct_max_irq = direct_max;
+
 	irq_domain_check_hierarchy(domain);
 
 	mutex_lock(&irq_domain_mutex);
@@ -482,9 +492,18 @@ struct irq_domain *irq_get_default_host(void)
 	return irq_default_domain;
 }
 
+static bool irq_domain_is_nomap(struct irq_domain *domain)
+{
+	return IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) &&
+	       (domain->flags & IRQ_DOMAIN_FLAG_NO_MAP);
+}
+
 static void irq_domain_clear_mapping(struct irq_domain *domain,
 				     irq_hw_number_t hwirq)
 {
+	if (irq_domain_is_nomap(domain))
+		return;
+
 	if (hwirq < domain->revmap_size) {
 		domain->revmap[hwirq] = 0;
 	} else {
@@ -498,6 +517,9 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
 				   irq_hw_number_t hwirq,
 				   struct irq_data *irq_data)
 {
+	if (irq_domain_is_nomap(domain))
+		return;
+
 	if (hwirq < domain->revmap_size) {
 		domain->revmap[hwirq] = irq_data->irq;
 	} else {
@@ -629,9 +651,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
 		pr_debug("create_direct virq allocation failed\n");
 		return 0;
 	}
-	if (virq >= domain->revmap_direct_max_irq) {
+	if (virq >= domain->revmap_size) {
 		pr_err("ERROR: no free irqs available below %i maximum\n",
-			domain->revmap_direct_max_irq);
+			domain->revmap_size);
 		irq_free_desc(virq);
 		return 0;
 	}
@@ -879,10 +901,14 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
 	if (domain == NULL)
 		return 0;
 
-	if (hwirq < domain->revmap_direct_max_irq) {
-		data = irq_domain_get_irq_data(domain, hwirq);
-		if (data && data->hwirq == hwirq)
-			return hwirq;
+	if (irq_domain_is_nomap(domain)) {
+		if (hwirq < domain->revmap_size) {
+			data = irq_domain_get_irq_data(domain, hwirq);
+			if (data && data->hwirq == hwirq)
+				return hwirq;
+		}
+
+		return 0;
 	}
 
 	/* Check if the hwirq is in the linear revmap. */
@@ -1470,7 +1496,7 @@ static void irq_domain_fix_revmap(struct irq_data *d)
 {
 	void __rcu **slot;
 
-	if (d->hwirq < d->domain->revmap_size)
+	if (irq_domain_is_nomap(d->domain) || d->hwirq < d->domain->revmap_size)
 		return; /* Not using radix tree. */
 
 	/* Fix up the revmap. */
@@ -1830,8 +1856,7 @@ static void
 irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
 {
 	seq_printf(m, "%*sname:   %s\n", ind, "", d->name);
-	seq_printf(m, "%*ssize:   %u\n", ind + 1, "",
-		   d->revmap_size + d->revmap_direct_max_irq);
+	seq_printf(m, "%*ssize:   %u\n", ind + 1, "", d->revmap_size);
 	seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
 	seq_printf(m, "%*sflags:  0x%08x\n", ind +1 , "", d->flags);
 	if (d->ops && d->ops->debug_show)
-- 
cgit v1.2.3


From 48b15a7921d60680babe59f64e127816585a585c Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 5 Apr 2021 11:46:53 +0100
Subject: irqdomain: Cache irq_data instead of a virq number in the revmap

Caching a virq number in the revmap is pretty inefficient, as
it means we will need to convert it back to either an irq_data
or irq_desc to do anything with it.

It is also a bit odd, as the radix tree does cache irq_data
pointers.

Change the revmap type to be an irq_data pointer instead of
an unsigned int, and preserve the current API for now.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdomain.h |  4 ++--
 kernel/irq/irqdomain.c    | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 0916cf9c6e20..340cc04611dd 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -151,7 +151,7 @@ struct irq_domain_chip_generic;
  * Revmap data, used internally by irq_domain
  * @revmap_size: Size of the linear map table @revmap[]
  * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map
- * @revmap: Linear table of hwirq->virq reverse mappings
+ * @revmap: Linear table of irq_data pointers
  */
 struct irq_domain {
 	struct list_head link;
@@ -174,7 +174,7 @@ struct irq_domain {
 	unsigned int revmap_size;
 	struct radix_tree_root revmap_tree;
 	struct mutex revmap_tree_mutex;
-	unsigned int revmap[];
+	struct irq_data *revmap[];
 };
 
 /* Irq domain flags */
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cdcb1989cd20..7a4e38804487 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -505,7 +505,7 @@ static void irq_domain_clear_mapping(struct irq_domain *domain,
 		return;
 
 	if (hwirq < domain->revmap_size) {
-		domain->revmap[hwirq] = 0;
+		domain->revmap[hwirq] = NULL;
 	} else {
 		mutex_lock(&domain->revmap_tree_mutex);
 		radix_tree_delete(&domain->revmap_tree, hwirq);
@@ -521,7 +521,7 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
 		return;
 
 	if (hwirq < domain->revmap_size) {
-		domain->revmap[hwirq] = irq_data->irq;
+		domain->revmap[hwirq] = irq_data;
 	} else {
 		mutex_lock(&domain->revmap_tree_mutex);
 		radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
@@ -913,7 +913,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
 
 	/* Check if the hwirq is in the linear revmap. */
 	if (hwirq < domain->revmap_size)
-		return domain->revmap[hwirq];
+		return domain->revmap[hwirq]->irq;
 
 	rcu_read_lock();
 	data = radix_tree_lookup(&domain->revmap_tree, hwirq);
@@ -1496,8 +1496,14 @@ static void irq_domain_fix_revmap(struct irq_data *d)
 {
 	void __rcu **slot;
 
-	if (irq_domain_is_nomap(d->domain) || d->hwirq < d->domain->revmap_size)
-		return; /* Not using radix tree. */
+	if (irq_domain_is_nomap(d->domain))
+		return;
+
+	if (d->hwirq < d->domain->revmap_size) {
+		/* Not using radix tree */
+		d->domain->revmap[d->hwirq] = d;
+		return;
+	}
 
 	/* Fix up the revmap. */
 	mutex_lock(&d->domain->revmap_tree_mutex);
-- 
cgit v1.2.3


From d4a45c68dc81f9117ceaff9f058d5fae674181b9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 5 Apr 2021 12:57:27 +0100
Subject: irqdomain: Protect the linear revmap with RCU

It is pretty odd that the radix tree uses RCU while the linear
portion doesn't, leading to potential surprises for the users,
depending on how the irqdomain has been created.

Fix this by moving the update of the linear revmap under
the mutex, and the lookup under the RCU read-side lock.

The mutex name is updated to reflect that it doesn't only
cover the radix-tree anymore.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdomain.h |  5 +++--
 kernel/irq/irqdomain.c    | 49 ++++++++++++++++++++++-------------------------
 2 files changed, 26 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 340cc04611dd..2b696c9bcaaf 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -151,6 +151,7 @@ struct irq_domain_chip_generic;
  * Revmap data, used internally by irq_domain
  * @revmap_size: Size of the linear map table @revmap[]
  * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map
+ * @revmap_mutex: Lock for the revmap
  * @revmap: Linear table of irq_data pointers
  */
 struct irq_domain {
@@ -173,8 +174,8 @@ struct irq_domain {
 	irq_hw_number_t hwirq_max;
 	unsigned int revmap_size;
 	struct radix_tree_root revmap_tree;
-	struct mutex revmap_tree_mutex;
-	struct irq_data *revmap[];
+	struct mutex revmap_mutex;
+	struct irq_data __rcu *revmap[];
 };
 
 /* Irq domain flags */
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 7a4e38804487..8fbadeefc814 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -213,7 +213,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 
 	/* Fill structure */
 	INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
-	mutex_init(&domain->revmap_tree_mutex);
+	mutex_init(&domain->revmap_mutex);
 	domain->ops = ops;
 	domain->host_data = host_data;
 	domain->hwirq_max = hwirq_max;
@@ -504,13 +504,12 @@ static void irq_domain_clear_mapping(struct irq_domain *domain,
 	if (irq_domain_is_nomap(domain))
 		return;
 
-	if (hwirq < domain->revmap_size) {
-		domain->revmap[hwirq] = NULL;
-	} else {
-		mutex_lock(&domain->revmap_tree_mutex);
+	mutex_lock(&domain->revmap_mutex);
+	if (hwirq < domain->revmap_size)
+		rcu_assign_pointer(domain->revmap[hwirq], NULL);
+	else
 		radix_tree_delete(&domain->revmap_tree, hwirq);
-		mutex_unlock(&domain->revmap_tree_mutex);
-	}
+	mutex_unlock(&domain->revmap_mutex);
 }
 
 static void irq_domain_set_mapping(struct irq_domain *domain,
@@ -520,13 +519,12 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
 	if (irq_domain_is_nomap(domain))
 		return;
 
-	if (hwirq < domain->revmap_size) {
-		domain->revmap[hwirq] = irq_data;
-	} else {
-		mutex_lock(&domain->revmap_tree_mutex);
+	mutex_lock(&domain->revmap_mutex);
+	if (hwirq < domain->revmap_size)
+		rcu_assign_pointer(domain->revmap[hwirq], irq_data);
+	else
 		radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
-		mutex_unlock(&domain->revmap_tree_mutex);
-	}
+	mutex_unlock(&domain->revmap_mutex);
 }
 
 static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
@@ -911,12 +909,12 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
 		return 0;
 	}
 
+	rcu_read_lock();
 	/* Check if the hwirq is in the linear revmap. */
 	if (hwirq < domain->revmap_size)
-		return domain->revmap[hwirq]->irq;
-
-	rcu_read_lock();
-	data = radix_tree_lookup(&domain->revmap_tree, hwirq);
+		data = rcu_dereference(domain->revmap[hwirq]);
+	else
+		data = radix_tree_lookup(&domain->revmap_tree, hwirq);
 	rcu_read_unlock();
 	return data ? data->irq : 0;
 }
@@ -1499,18 +1497,17 @@ static void irq_domain_fix_revmap(struct irq_data *d)
 	if (irq_domain_is_nomap(d->domain))
 		return;
 
+	/* Fix up the revmap. */
+	mutex_lock(&d->domain->revmap_mutex);
 	if (d->hwirq < d->domain->revmap_size) {
 		/* Not using radix tree */
-		d->domain->revmap[d->hwirq] = d;
-		return;
+		rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
+	} else {
+		slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
+		if (slot)
+			radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
 	}
-
-	/* Fix up the revmap. */
-	mutex_lock(&d->domain->revmap_tree_mutex);
-	slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
-	if (slot)
-		radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
-	mutex_unlock(&d->domain->revmap_tree_mutex);
+	mutex_unlock(&d->domain->revmap_mutex);
 }
 
 /**
-- 
cgit v1.2.3


From d22558dd0a6c888b1829f9d3a0a627e330e27585 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 4 May 2021 14:00:13 +0100
Subject: irqdomain: Introduce irq_resolve_mapping()

Rework irq_find_mapping() to return an both an irq_desc pointer,
optionally the virtual irq number, and rename the result to
__irq_resolve_mapping(). a new helper called irq_resolve_mapping()
is provided for code that doesn't need the virtual irq number.

irq_find_mapping() is also rewritten in terms of __irq_resolve_mapping().

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdomain.h | 23 +++++++++++++++++++++--
 kernel/irq/irqdomain.c    | 28 ++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 2b696c9bcaaf..23e4ee523576 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -41,6 +41,7 @@ struct fwnode_handle;
 struct irq_domain;
 struct irq_chip;
 struct irq_data;
+struct irq_desc;
 struct cpumask;
 struct seq_file;
 struct irq_affinity_desc;
@@ -401,13 +402,31 @@ static inline unsigned int irq_create_mapping(struct irq_domain *host,
 	return irq_create_mapping_affinity(host, hwirq, NULL);
 }
 
+extern struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
+					      irq_hw_number_t hwirq,
+					      unsigned int *irq);
+
+static inline struct irq_desc *irq_resolve_mapping(struct irq_domain *domain,
+						   irq_hw_number_t hwirq)
+{
+	return __irq_resolve_mapping(domain, hwirq, NULL);
+}
+
 /**
  * irq_find_mapping() - Find a linux irq from a hw irq number.
  * @domain: domain owning this hardware interrupt
  * @hwirq: hardware irq number in that domain space
  */
-extern unsigned int irq_find_mapping(struct irq_domain *host,
-				     irq_hw_number_t hwirq);
+static inline unsigned int irq_find_mapping(struct irq_domain *domain,
+					    irq_hw_number_t hwirq)
+{
+	unsigned int irq;
+
+	if (__irq_resolve_mapping(domain, hwirq, &irq))
+		return irq;
+
+	return 0;
+}
 
 static inline unsigned int irq_linear_revmap(struct irq_domain *domain,
 					     irq_hw_number_t hwirq)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8fbadeefc814..51c483ce2447 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -884,29 +884,34 @@ void irq_dispose_mapping(unsigned int virq)
 EXPORT_SYMBOL_GPL(irq_dispose_mapping);
 
 /**
- * irq_find_mapping() - Find a linux irq from a hw irq number.
+ * __irq_resolve_mapping() - Find a linux irq from a hw irq number.
  * @domain: domain owning this hardware interrupt
  * @hwirq: hardware irq number in that domain space
+ * @irq: optional pointer to return the Linux irq if required
+ *
+ * Returns the interrupt descriptor.
  */
-unsigned int irq_find_mapping(struct irq_domain *domain,
-			      irq_hw_number_t hwirq)
+struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
+				       irq_hw_number_t hwirq,
+				       unsigned int *irq)
 {
+	struct irq_desc *desc = NULL;
 	struct irq_data *data;
 
 	/* Look for default domain if necessary */
 	if (domain == NULL)
 		domain = irq_default_domain;
 	if (domain == NULL)
-		return 0;
+		return desc;
 
 	if (irq_domain_is_nomap(domain)) {
 		if (hwirq < domain->revmap_size) {
 			data = irq_domain_get_irq_data(domain, hwirq);
 			if (data && data->hwirq == hwirq)
-				return hwirq;
+				desc = irq_data_to_desc(data);
 		}
 
-		return 0;
+		return desc;
 	}
 
 	rcu_read_lock();
@@ -915,10 +920,17 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
 		data = rcu_dereference(domain->revmap[hwirq]);
 	else
 		data = radix_tree_lookup(&domain->revmap_tree, hwirq);
+
+	if (likely(data)) {
+		desc = irq_data_to_desc(data);
+		if (irq)
+			*irq = data->irq;
+	}
+
 	rcu_read_unlock();
-	return data ? data->irq : 0;
+	return desc;
 }
-EXPORT_SYMBOL_GPL(irq_find_mapping);
+EXPORT_SYMBOL_GPL(__irq_resolve_mapping);
 
 /**
  * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
-- 
cgit v1.2.3


From a3016b26ee6ee13d5647d701404a7912d4eaea9e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 4 May 2021 14:24:37 +0100
Subject: genirq: Use irq_resolve_mapping() to implement __handle_domain_irq()
 and co

In order to start reaping the benefits of irq_resolve_mapping(),
start using it in __handle_domain_irq() and handle_domain_nmi().

This involves splitting generic_handle_irq() to be able to directly
provide the irq_desc.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdesc.h |  1 +
 kernel/irq/irqdesc.c    | 60 ++++++++++++++++++++++++++++---------------------
 2 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index df4651250785..cdd1cf8207f6 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -158,6 +158,7 @@ static inline void generic_handle_irq_desc(struct irq_desc *desc)
 	desc->handle_irq(desc);
 }
 
+int handle_irq_desc(struct irq_desc *desc);
 int generic_handle_irq(unsigned int irq);
 
 #ifdef CONFIG_HANDLE_DOMAIN_IRQ
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4a617d7312a4..684c5b7b7832 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -632,14 +632,8 @@ void irq_init_desc(unsigned int irq)
 
 #endif /* !CONFIG_SPARSE_IRQ */
 
-/**
- * generic_handle_irq - Invoke the handler for a particular irq
- * @irq:	The irq number to handle
- *
- */
-int generic_handle_irq(unsigned int irq)
+int handle_irq_desc(struct irq_desc *desc)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_data *data;
 
 	if (!desc)
@@ -652,6 +646,17 @@ int generic_handle_irq(unsigned int irq)
 	generic_handle_irq_desc(desc);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(handle_irq_desc);
+
+/**
+ * generic_handle_irq - Invoke the handler for a particular irq
+ * @irq:	The irq number to handle
+ *
+ */
+int generic_handle_irq(unsigned int irq)
+{
+	return handle_irq_desc(irq_to_desc(irq));
+}
 EXPORT_SYMBOL_GPL(generic_handle_irq);
 
 #ifdef CONFIG_HANDLE_DOMAIN_IRQ
@@ -668,27 +673,32 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
 			bool lookup, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
-	unsigned int irq = hwirq;
+	struct irq_desc *desc;
 	int ret = 0;
 
 	irq_enter();
 
-#ifdef CONFIG_IRQ_DOMAIN
-	if (lookup)
-		irq = irq_find_mapping(domain, hwirq);
-#endif
-
-	/*
-	 * Some hardware gives randomly wrong interrupts.  Rather
-	 * than crashing, do something sensible.
-	 */
-	if (unlikely(!irq || irq >= nr_irqs)) {
-		ack_bad_irq(irq);
-		ret = -EINVAL;
+	if (likely(IS_ENABLED(CONFIG_IRQ_DOMAIN) && lookup)) {
+		/* The irqdomain code provides boundary checks */
+		desc = irq_resolve_mapping(domain, hwirq);
 	} else {
-		generic_handle_irq(irq);
+		/*
+		 * Some hardware gives randomly wrong interrupts.  Rather
+		 * than crashing, do something sensible.
+		 */
+		if (unlikely(!hwirq || hwirq >= nr_irqs)) {
+			ack_bad_irq(hwirq);
+			desc = NULL;
+		} else {
+			desc = irq_to_desc(hwirq);
+		}
 	}
 
+	if (likely(desc))
+		handle_irq_desc(desc);
+	else
+		ret = -EINVAL;
+
 	irq_exit();
 	set_irq_regs(old_regs);
 	return ret;
@@ -709,7 +719,7 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
 		      struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
-	unsigned int irq;
+	struct irq_desc *desc;
 	int ret = 0;
 
 	/*
@@ -717,14 +727,14 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
 	 */
 	WARN_ON(!in_nmi());
 
-	irq = irq_find_mapping(domain, hwirq);
+	desc = irq_resolve_mapping(domain, hwirq);
 
 	/*
 	 * ack_bad_irq is not NMI-safe, just report
 	 * an invalid interrupt.
 	 */
-	if (likely(irq))
-		generic_handle_irq(irq);
+	if (likely(desc))
+		handle_irq_desc(desc);
 	else
 		ret = -EINVAL;
 
-- 
cgit v1.2.3


From 9626d18a20e166a864e8d1f6ed6bbb84a0fa4989 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 4 May 2021 14:33:24 +0100
Subject: irqdesc: Fix __handle_domain_irq() comment

It appears that the comment about a NULL domain meaning anything
has always been wrong. Fix it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdesc.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index cdd1cf8207f6..2971eb7e65f1 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -165,8 +165,7 @@ int generic_handle_irq(unsigned int irq);
 /*
  * Convert a HW interrupt number to a logical one using a IRQ domain,
  * and handle the result interrupt number. Return -EINVAL if
- * conversion failed. Providing a NULL domain indicates that the
- * conversion has already been done.
+ * conversion failed.
  */
 int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
 			bool lookup, struct pt_regs *regs);
-- 
cgit v1.2.3


From 8240ef50d4864325b346e40bb9d30cda9f22102d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 12 May 2021 13:45:52 +0100
Subject: genirq: Add generic_handle_domain_irq() helper

Provide generic_handle_domain_irq() as a pendent to handle_domain_irq()
for non-root interrupt controllers

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdesc.h |  2 ++
 kernel/irq/irqdesc.c    | 19 ++++++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 2971eb7e65f1..0f226c6b0c70 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -170,6 +170,8 @@ int generic_handle_irq(unsigned int irq);
 int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
 			bool lookup, struct pt_regs *regs);
 
+int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq);
+
 static inline int handle_domain_irq(struct irq_domain *domain,
 				    unsigned int hwirq, struct pt_regs *regs)
 {
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 684c5b7b7832..6179d5bde88e 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -661,7 +661,24 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
 
 #ifdef CONFIG_HANDLE_DOMAIN_IRQ
 /**
- * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
+ * generic_handle_domain_irq - Invoke the handler for a HW irq belonging
+ *                             to a domain, usually for a non-root interrupt
+ *                             controller
+ * @domain:	The domain where to perform the lookup
+ * @hwirq:	The HW irq number to convert to a logical one
+ *
+ * Returns:	0 on success, or -EINVAL if conversion has failed
+ *
+ */
+int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
+{
+	return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
+
+/**
+ * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain,
+ *                       usually for a root interrupt controller
  * @domain:	The domain where to perform the lookup
  * @hwirq:	The HW irq number to convert to a logical one
  * @lookup:	Whether to perform the domain lookup or not
-- 
cgit v1.2.3


From e1c054918c6c7a30a35d2c183ed86600a071cdab Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 12 May 2021 16:18:15 +0100
Subject: genirq: Move non-irqdomain handle_domain_irq() handling into ARM's
 handle_IRQ()

Despite the name, handle_domain_irq() deals with non-irqdomain
handling for the sake of a handful of legacy ARM platforms.

Move such handling into ARM's handle_IRQ(), allowing for better
code generation for everyone else. This allows us get rid of
some complexity, and to rearrange the guards on the various helpers
in a more logical way.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm/kernel/irq.c   | 22 +++++++++++++++++++++-
 include/linux/irqdesc.h | 14 ++++----------
 kernel/irq/irqdesc.c    | 30 ++++++++----------------------
 3 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 698b6f636156..20ab1e607522 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -63,7 +63,27 @@ int arch_show_interrupts(struct seq_file *p, int prec)
  */
 void handle_IRQ(unsigned int irq, struct pt_regs *regs)
 {
-	__handle_domain_irq(NULL, irq, false, regs);
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	struct irq_desc *desc;
+
+	irq_enter();
+
+	/*
+	 * Some hardware gives randomly wrong interrupts.  Rather
+	 * than crashing, do something sensible.
+	 */
+	if (unlikely(!irq || irq >= nr_irqs))
+		desc = NULL;
+	else
+		desc = irq_to_desc(irq);
+
+	if (likely(desc))
+		handle_irq_desc(desc);
+	else
+		ack_bad_irq(irq);
+
+	irq_exit();
+	set_irq_regs(old_regs);
 }
 
 /*
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 0f226c6b0c70..59aea39785bf 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -161,24 +161,18 @@ static inline void generic_handle_irq_desc(struct irq_desc *desc)
 int handle_irq_desc(struct irq_desc *desc);
 int generic_handle_irq(unsigned int irq);
 
-#ifdef CONFIG_HANDLE_DOMAIN_IRQ
+#ifdef CONFIG_IRQ_DOMAIN
 /*
  * Convert a HW interrupt number to a logical one using a IRQ domain,
  * and handle the result interrupt number. Return -EINVAL if
  * conversion failed.
  */
-int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
-			bool lookup, struct pt_regs *regs);
-
 int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq);
 
-static inline int handle_domain_irq(struct irq_domain *domain,
-				    unsigned int hwirq, struct pt_regs *regs)
-{
-	return __handle_domain_irq(domain, hwirq, true, regs);
-}
+#ifdef CONFIG_HANDLE_DOMAIN_IRQ
+int handle_domain_irq(struct irq_domain *domain,
+		      unsigned int hwirq, struct pt_regs *regs);
 
-#ifdef CONFIG_IRQ_DOMAIN
 int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
 		      struct pt_regs *regs);
 #endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 6179d5bde88e..f4dd5186858a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -659,7 +659,7 @@ int generic_handle_irq(unsigned int irq)
 }
 EXPORT_SYMBOL_GPL(generic_handle_irq);
 
-#ifdef CONFIG_HANDLE_DOMAIN_IRQ
+#ifdef CONFIG_IRQ_DOMAIN
 /**
  * generic_handle_domain_irq - Invoke the handler for a HW irq belonging
  *                             to a domain, usually for a non-root interrupt
@@ -676,9 +676,10 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
 }
 EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
 
+#ifdef CONFIG_HANDLE_DOMAIN_IRQ
 /**
- * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain,
- *                       usually for a root interrupt controller
+ * handle_domain_irq - Invoke the handler for a HW irq belonging to a domain,
+ *                     usually for a root interrupt controller
  * @domain:	The domain where to perform the lookup
  * @hwirq:	The HW irq number to convert to a logical one
  * @lookup:	Whether to perform the domain lookup or not
@@ -686,8 +687,8 @@ EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
  *
  * Returns:	0 on success, or -EINVAL if conversion has failed
  */
-int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
-			bool lookup, struct pt_regs *regs)
+int handle_domain_irq(struct irq_domain *domain,
+		      unsigned int hwirq, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	struct irq_desc *desc;
@@ -695,22 +696,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
 
 	irq_enter();
 
-	if (likely(IS_ENABLED(CONFIG_IRQ_DOMAIN) && lookup)) {
-		/* The irqdomain code provides boundary checks */
-		desc = irq_resolve_mapping(domain, hwirq);
-	} else {
-		/*
-		 * Some hardware gives randomly wrong interrupts.  Rather
-		 * than crashing, do something sensible.
-		 */
-		if (unlikely(!hwirq || hwirq >= nr_irqs)) {
-			ack_bad_irq(hwirq);
-			desc = NULL;
-		} else {
-			desc = irq_to_desc(hwirq);
-		}
-	}
-
+	/* The irqdomain code provides boundary checks */
+	desc = irq_resolve_mapping(domain, hwirq);
 	if (likely(desc))
 		handle_irq_desc(desc);
 	else
@@ -721,7 +708,6 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
 	return ret;
 }
 
-#ifdef CONFIG_IRQ_DOMAIN
 /**
  * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
  * @domain:	The domain where to perform the lookup
-- 
cgit v1.2.3


From cefc7ca46235f01d5233e3abd4b79452af01d9e9 Mon Sep 17 00:00:00 2001
From: Erik Kaneda <erik.kaneda@intel.com>
Date: Wed, 9 Jun 2021 20:41:52 -0700
Subject: ACPI: PRM: implement OperationRegion handler for the
 PlatformRtMechanism subtype

Platform Runtime Mechanism (PRM) is a firmware interface that exposes
a set of binary executables that can either be called from the AML
interpreter or device drivers by bypassing the AML interpreter.
This change implements the AML interpreter path.

According to the specification [1], PRM services are listed in an
ACPI table called the PRMT. This patch parses module and handler
information listed in the PRMT and registers the PlatformRtMechanism
OpRegion handler before ACPI tables are loaded.

Each service is defined by a 16-byte GUID and called from writing a
26-byte ASL buffer containing the identifier to a FieldUnit object
defined inside a PlatformRtMechanism OperationRegion.

    OperationRegion (PRMR, PlatformRtMechanism, 0, 26)
    Field (PRMR, BufferAcc, NoLock, Preserve)
    {
        PRMF, 208 // Write to this field to invoke the OperationRegion Handler
    }

The 26-byte ASL buffer is defined as the following:

Byte Offset   Byte Length    Description
=============================================================
     0             1         PRM OperationRegion handler status
     1             8         PRM service status
     9             1         PRM command
    10            16         PRM handler GUID

The ASL caller fills out a 26-byte buffer containing the PRM command
and the PRM handler GUID like so:

    /* Local0 is the PRM data buffer */
    Local0 = buffer (26){}

    /* Create byte fields over the buffer */
    CreateByteField (Local0, 0x9, CMD)
    CreateField (Local0, 0x50, 0x80, GUID)

    /* Fill in the command and data fields of the data buffer */
    CMD = 0 // run command
    GUID = ToUUID("xxxx-xx-xxx-xxxx")

    /*
     * Invoke PRM service with an ID that matches GUID and save the
     * result.
     */
    Local0 = (\_SB.PRMT.PRMF = Local0)

Byte offset 0 - 8 are written by the handler as a status passed back to AML
and used by ASL like so:

    /* Create byte fields over the buffer */
    CreateByteField (Local0, 0x0, PSTA)
    CreateQWordField (Local0, 0x1, USTA)

In this ASL code, PSTA contains a status from the OperationRegion and
USTA contains a status from the PRM service.

The 26-byte buffer is recieved by acpi_platformrt_space_handler. This
handler will look at the command value and the handler guid and take
the approperiate actions.

Command value    Action
=====================================================================
    0            Run the PRM service indicated by the PRM handler
                 GUID (bytes 10-26)

    1            Prevent PRM runtime updates from happening to the
                 service's parent module

    2            Allow PRM updates from happening to the service's parent module

This patch enables command value 0.

Link: https://uefi.org/sites/default/files/resources/Platform%20Runtime%20Mechanism%20-%20with%20legal%20notice.pdf # [1]
Signed-off-by: Erik Kaneda <erik.kaneda@intel.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Kconfig  |   5 +
 drivers/acpi/Makefile |   1 +
 drivers/acpi/bus.c    |   2 +
 drivers/acpi/prmt.c   | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/tables.c |   9 ++
 include/linux/acpi.h  |   1 +
 include/linux/prmt.h  |   7 ++
 7 files changed, 328 insertions(+)
 create mode 100644 drivers/acpi/prmt.c
 create mode 100644 include/linux/prmt.h

(limited to 'include/linux')

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index eedec61e3476..3972de7b7565 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -543,3 +543,8 @@ config X86_PM_TIMER
 
 	  You should nearly always say Y here because many modern
 	  systems require this timer.
+
+config ACPI_PRMT
+	bool "Platform Runtime Mechanism Support"
+	depends on EFI && X86_64
+	default y
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 700b41adf2db..efb0d1f64019 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -61,6 +61,7 @@ acpi-$(CONFIG_ACPI_FPDT)	+= acpi_fpdt.o
 acpi-$(CONFIG_ACPI_LPIT)	+= acpi_lpit.o
 acpi-$(CONFIG_ACPI_GENERIC_GSI) += irq.o
 acpi-$(CONFIG_ACPI_WATCHDOG)	+= acpi_watchdog.o
+acpi-$(CONFIG_ACPI_PRMT)	+= prmt.o
 
 # Address translation
 acpi-$(CONFIG_ACPI_ADXL)	+= acpi_adxl.o
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index be7da23fad76..3484497923d5 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -30,6 +30,7 @@
 #include <linux/pci.h>
 #include <acpi/apei.h>
 #include <linux/suspend.h>
+#include <linux/prmt.h>
 
 #include "internal.h"
 
@@ -1330,6 +1331,7 @@ static int __init acpi_init(void)
 		acpi_kobj = NULL;
 	}
 
+	init_prmt();
 	result = acpi_bus_init();
 	if (result) {
 		disable_acpi();
diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c
new file mode 100644
index 000000000000..33c274698d07
--- /dev/null
+++ b/drivers/acpi/prmt.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Author: Erik Kaneda <erik.kaneda@intel.com>
+ * Copyright 2020 Intel Corporation
+ *
+ * prmt.c
+ *
+ * Each PRM service is an executable that is run in a restricted environment
+ * that is invoked by writing to the PlatformRtMechanism OperationRegion from
+ * AML bytecode.
+ *
+ * init_prmt initializes the Platform Runtime Mechanism (PRM) services by
+ * processing data in the PRMT as well as registering an ACPI OperationRegion
+ * handler for the PlatformRtMechanism subtype.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/efi.h>
+#include <linux/acpi.h>
+#include <linux/prmt.h>
+#include <asm/efi.h>
+
+#pragma pack(1)
+struct prm_mmio_addr_range {
+	u64 phys_addr;
+	u64 virt_addr;
+	u32 length;
+};
+
+struct prm_mmio_info {
+	u64 mmio_count;
+	struct prm_mmio_addr_range addr_ranges[];
+};
+
+struct prm_buffer {
+	u8 prm_status;
+	u64 efi_status;
+	u8 prm_cmd;
+	guid_t handler_guid;
+};
+
+struct prm_context_buffer {
+	char signature[ACPI_NAMESEG_SIZE];
+	u16 revision;
+	u16 reserved;
+	guid_t identifier;
+	u64 static_data_buffer;
+	struct prm_mmio_info *mmio_ranges;
+};
+#pragma pack()
+
+
+LIST_HEAD(prm_module_list);
+
+struct prm_handler_info {
+	guid_t guid;
+	u64 handler_addr;
+	u64 static_data_buffer_addr;
+	u64 acpi_param_buffer_addr;
+
+	struct list_head handler_list;
+};
+
+struct prm_module_info {
+	guid_t guid;
+	u16 major_rev;
+	u16 minor_rev;
+	u16 handler_count;
+	struct prm_mmio_info *mmio_info;
+	bool updatable;
+
+	struct list_head module_list;
+	struct prm_handler_info handlers[];
+};
+
+
+static u64 efi_pa_va_lookup(u64 pa)
+{
+	efi_memory_desc_t *md;
+	u64 pa_offset = pa & ~PAGE_MASK;
+	u64 page = pa & PAGE_MASK;
+
+	for_each_efi_memory_desc(md) {
+		if (md->phys_addr < pa && pa < md->phys_addr + PAGE_SIZE * md->num_pages)
+			return pa_offset + md->virt_addr + page - md->phys_addr;
+	}
+
+	return 0;
+}
+
+
+#define get_first_handler(a) ((struct acpi_prmt_handler_info *) ((char *) (a) + a->handler_info_offset))
+#define get_next_handler(a) ((struct acpi_prmt_handler_info *) (sizeof(struct acpi_prmt_handler_info) + (char *) a))
+
+static int __init
+acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end)
+{
+	struct acpi_prmt_module_info *module_info;
+	struct acpi_prmt_handler_info *handler_info;
+	struct prm_handler_info *th;
+	struct prm_module_info *tm;
+	u64 mmio_count = 0;
+	u64 cur_handler = 0;
+	u32 module_info_size = 0;
+	u64 mmio_range_size = 0;
+	void *temp_mmio;
+
+	module_info = (struct acpi_prmt_module_info *) header;
+	module_info_size = struct_size(tm, handlers, module_info->handler_info_count);
+	tm = kmalloc(module_info_size, GFP_KERNEL);
+
+	guid_copy(&tm->guid, (guid_t *) module_info->module_guid);
+	tm->major_rev = module_info->major_rev;
+	tm->minor_rev = module_info->minor_rev;
+	tm->handler_count = module_info->handler_info_count;
+	tm->updatable = true;
+
+	if (module_info->mmio_list_pointer) {
+		/*
+		 * Each module is associated with a list of addr
+		 * ranges that it can use during the service
+		 */
+		mmio_count = *(u64 *) memremap(module_info->mmio_list_pointer, 8, MEMREMAP_WB);
+		mmio_range_size = struct_size(tm->mmio_info, addr_ranges, mmio_count);
+		tm->mmio_info = kmalloc(mmio_range_size, GFP_KERNEL);
+		temp_mmio = memremap(module_info->mmio_list_pointer, mmio_range_size, MEMREMAP_WB);
+		memmove(tm->mmio_info, temp_mmio, mmio_range_size);
+	} else {
+		mmio_range_size = struct_size(tm->mmio_info, addr_ranges, mmio_count);
+		tm->mmio_info = kmalloc(mmio_range_size, GFP_KERNEL);
+		tm->mmio_info->mmio_count = 0;
+	}
+
+	INIT_LIST_HEAD(&tm->module_list);
+	list_add(&tm->module_list, &prm_module_list);
+
+	handler_info = get_first_handler(module_info);
+	do {
+		th = &tm->handlers[cur_handler];
+
+		guid_copy(&th->guid, (guid_t *)handler_info->handler_guid);
+		th->handler_addr = efi_pa_va_lookup(handler_info->handler_address);
+		th->static_data_buffer_addr = efi_pa_va_lookup(handler_info->static_data_buffer_address);
+		th->acpi_param_buffer_addr = efi_pa_va_lookup(handler_info->acpi_param_buffer_address);
+	} while (++cur_handler < tm->handler_count && (handler_info = get_next_handler(handler_info)));
+
+	return 0;
+}
+
+#define GET_MODULE	0
+#define GET_HANDLER	1
+
+static void *find_guid_info(const guid_t *guid, u8 mode)
+{
+	struct prm_handler_info *cur_handler;
+	struct prm_module_info *cur_module;
+	int i = 0;
+
+	list_for_each_entry(cur_module, &prm_module_list, module_list) {
+		for (i = 0; i < cur_module->handler_count; ++i) {
+			cur_handler = &cur_module->handlers[i];
+			if (guid_equal(guid, &cur_handler->guid)) {
+				if (mode == GET_MODULE)
+					return (void *)cur_module;
+				else
+					return (void *)cur_handler;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+
+static struct prm_module_info *find_prm_module(const guid_t *guid)
+{
+	return (struct prm_module_info *)find_guid_info(guid, GET_MODULE);
+}
+
+static struct prm_handler_info *find_prm_handler(const guid_t *guid)
+{
+	return (struct prm_handler_info *) find_guid_info(guid, GET_HANDLER);
+}
+
+/* In-coming PRM commands */
+
+#define PRM_CMD_RUN_SERVICE		0
+#define PRM_CMD_START_TRANSACTION	1
+#define PRM_CMD_END_TRANSACTION		2
+
+/* statuses that can be passed back to ASL */
+
+#define PRM_HANDLER_SUCCESS 		0
+#define PRM_HANDLER_ERROR 		1
+#define INVALID_PRM_COMMAND 		2
+#define PRM_HANDLER_GUID_NOT_FOUND 	3
+#define UPDATE_LOCK_ALREADY_HELD 	4
+#define UPDATE_UNLOCK_WITHOUT_LOCK 	5
+
+/*
+ * This is the PlatformRtMechanism opregion space handler.
+ * @function: indicates the read/write. In fact as the PlatformRtMechanism
+ * message is driven by command, only write is meaningful.
+ *
+ * @addr   : not used
+ * @bits   : not used.
+ * @value  : it is an in/out parameter. It points to the PRM message buffer.
+ * @handler_context: not used
+ */
+static acpi_status acpi_platformrt_space_handler(u32 function,
+						 acpi_physical_address addr,
+						 u32 bits, acpi_integer *value,
+						 void *handler_context,
+						 void *region_context)
+{
+	struct prm_buffer *buffer = ACPI_CAST_PTR(struct prm_buffer, value);
+	struct prm_handler_info *handler;
+	struct prm_module_info *module;
+	efi_status_t status;
+	struct prm_context_buffer context;
+
+	/*
+	 * The returned acpi_status will always be AE_OK. Error values will be
+	 * saved in the first byte of the PRM message buffer to be used by ASL.
+	 */
+	switch (buffer->prm_cmd) {
+	case PRM_CMD_RUN_SERVICE:
+
+		handler = find_prm_handler(&buffer->handler_guid);
+		module = find_prm_module(&buffer->handler_guid);
+		if (!handler || !module)
+			goto invalid_guid;
+
+		ACPI_COPY_NAMESEG(context.signature, "PRMC");
+		context.revision = 0x0;
+		context.reserved = 0x0;
+		context.identifier = handler->guid;
+		context.static_data_buffer = handler->static_data_buffer_addr;
+		context.mmio_ranges = module->mmio_info;
+
+		status = efi_call_virt_pointer(handler, handler_addr,
+					       handler->acpi_param_buffer_addr,
+					       &context);
+		if (status == EFI_SUCCESS) {
+			buffer->prm_status = PRM_HANDLER_SUCCESS;
+		} else {
+			buffer->prm_status = PRM_HANDLER_ERROR;
+			buffer->efi_status = status;
+		}
+		break;
+
+	case PRM_CMD_START_TRANSACTION:
+
+		module = find_prm_module(&buffer->handler_guid);
+		if (!module)
+			goto invalid_guid;
+
+		if (module->updatable)
+			module->updatable = false;
+		else
+			buffer->prm_status = UPDATE_LOCK_ALREADY_HELD;
+		break;
+
+	case PRM_CMD_END_TRANSACTION:
+
+		module = find_prm_module(&buffer->handler_guid);
+		if (!module)
+			goto invalid_guid;
+
+		if (module->updatable)
+			buffer->prm_status = UPDATE_UNLOCK_WITHOUT_LOCK;
+		else
+			module->updatable = true;
+		break;
+
+	default:
+
+		buffer->prm_status = INVALID_PRM_COMMAND;
+		break;
+	}
+
+	return AE_OK;
+
+invalid_guid:
+	buffer->prm_status = PRM_HANDLER_GUID_NOT_FOUND;
+	return AE_OK;
+}
+
+void __init init_prmt(void)
+{
+	acpi_status status;
+	int mc = acpi_table_parse_entries(ACPI_SIG_PRMT, sizeof(struct acpi_table_prmt) +
+					  sizeof (struct acpi_table_prmt_header),
+					  0, acpi_parse_prmt, 0);
+	pr_info("PRM: found %u modules\n", mc);
+
+	status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT,
+						    ACPI_ADR_SPACE_PLATFORM_RT,
+						    &acpi_platformrt_space_handler,
+						    NULL, NULL);
+	if (ACPI_FAILURE(status))
+		pr_alert("PRM: OperationRegion handler could not be installed\n");
+}
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 9d581045acff..a37a1532a575 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -39,6 +39,7 @@ static int acpi_apic_instance __initdata;
 enum acpi_subtable_type {
 	ACPI_SUBTABLE_COMMON,
 	ACPI_SUBTABLE_HMAT,
+	ACPI_SUBTABLE_PRMT,
 };
 
 struct acpi_subtable_entry {
@@ -222,6 +223,8 @@ acpi_get_entry_type(struct acpi_subtable_entry *entry)
 		return entry->hdr->common.type;
 	case ACPI_SUBTABLE_HMAT:
 		return entry->hdr->hmat.type;
+	case ACPI_SUBTABLE_PRMT:
+		return 0;
 	}
 	return 0;
 }
@@ -234,6 +237,8 @@ acpi_get_entry_length(struct acpi_subtable_entry *entry)
 		return entry->hdr->common.length;
 	case ACPI_SUBTABLE_HMAT:
 		return entry->hdr->hmat.length;
+	case ACPI_SUBTABLE_PRMT:
+		return entry->hdr->prmt.length;
 	}
 	return 0;
 }
@@ -246,6 +251,8 @@ acpi_get_subtable_header_length(struct acpi_subtable_entry *entry)
 		return sizeof(entry->hdr->common);
 	case ACPI_SUBTABLE_HMAT:
 		return sizeof(entry->hdr->hmat);
+	case ACPI_SUBTABLE_PRMT:
+		return sizeof(entry->hdr->prmt);
 	}
 	return 0;
 }
@@ -255,6 +262,8 @@ acpi_get_subtable_type(char *id)
 {
 	if (strncmp(id, ACPI_SIG_HMAT, 4) == 0)
 		return ACPI_SUBTABLE_HMAT;
+	if (strncmp(id, ACPI_SIG_PRMT, 4) == 0)
+		return ACPI_SUBTABLE_PRMT;
 	return ACPI_SUBTABLE_COMMON;
 }
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..4c07ac22c6ba 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -132,6 +132,7 @@ enum acpi_address_range_id {
 union acpi_subtable_headers {
 	struct acpi_subtable_header common;
 	struct acpi_hmat_structure hmat;
+	struct acpi_prmt_module_header prmt;
 };
 
 typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table);
diff --git a/include/linux/prmt.h b/include/linux/prmt.h
new file mode 100644
index 000000000000..24da8364b919
--- /dev/null
+++ b/include/linux/prmt.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifdef CONFIG_ACPI_PRMT
+void init_prmt(void);
+#else
+static inline void init_prmt(void) { }
+#endif
-- 
cgit v1.2.3


From 60faa8f1ac6e0588d53eb9a345adcdbcc96a8f47 Mon Sep 17 00:00:00 2001
From: Erik Kaneda <erik.kaneda@intel.com>
Date: Wed, 9 Jun 2021 20:41:53 -0700
Subject: ACPI: Add \_SB._OSC bit for PRM

Signed-off-by: Erik Kaneda <erik.kaneda@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c   | 1 +
 include/linux/acpi.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 3484497923d5..e8119a9eca28 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -305,6 +305,7 @@ static void acpi_bus_osc_negotiate_platform_control(void)
 
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT;
+	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT;
 
 #ifdef CONFIG_ARM64
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4c07ac22c6ba..a618ba698a5c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -551,6 +551,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_OSLPI_SUPPORT			0x00000100
 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
 #define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00002000
+#define OSC_SB_PRM_SUPPORT			0x00020000
 #define OSC_SB_NATIVE_USB4_SUPPORT		0x00040000
 
 extern bool osc_sb_apei_support_acked;
-- 
cgit v1.2.3


From 6b658c4863c15936872a93c9ee879043bf6393c9 Mon Sep 17 00:00:00 2001
From: Muneendra Kumar <muneendra.kumar@broadcom.com>
Date: Tue, 8 Jun 2021 10:05:44 +0530
Subject: scsi: cgroup: Add cgroup_get_from_id()

Add a new function, cgroup_get_from_id(), to retrieve the cgroup associated
with a cgroup id. Also export the function cgroup_get_e_css() as this is
needed in blk-cgroup.h.

Link: https://lore.kernel.org/r/20210608043556.274139-2-muneendra.kumar@broadcom.com
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Muneendra Kumar <muneendra.kumar@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/cgroup.h |  6 ++++++
 kernel/cgroup/cgroup.c | 26 ++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4f2f79de083e..d2eace88d9d1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -696,6 +696,7 @@ static inline void cgroup_kthread_ready(void)
 }
 
 void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
+struct cgroup *cgroup_get_from_id(u64 id);
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
@@ -743,6 +744,11 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 
 static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
 {}
+
+static inline struct cgroup *cgroup_get_from_id(u64 id)
+{
+	return NULL;
+}
 #endif /* !CONFIG_CGROUPS */
 
 #ifdef CONFIG_CGROUPS
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e049edd66776..692779d3cdfa 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -577,6 +577,7 @@ out_unlock:
 	rcu_read_unlock();
 	return css;
 }
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
 
 static void cgroup_get_live(struct cgroup *cgrp)
 {
@@ -5776,6 +5777,31 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
 	kernfs_put(kn);
 }
 
+/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp, on failure return NULL
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+	struct kernfs_node *kn;
+	struct cgroup *cgrp = NULL;
+
+	mutex_lock(&cgroup_mutex);
+	kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
+	if (!kn)
+		goto out_unlock;
+
+	cgrp = kn->priv;
+	if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
+		cgrp = NULL;
+	kernfs_put(kn);
+out_unlock:
+	mutex_unlock(&cgroup_mutex);
+	return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_id);
+
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
-- 
cgit v1.2.3


From d2bcbeab4200665b694ec4f92a7a2fd58b70b1e8 Mon Sep 17 00:00:00 2001
From: Muneendra Kumar <muneendra.kumar@broadcom.com>
Date: Tue, 8 Jun 2021 10:05:45 +0530
Subject: scsi: blkcg: Add app identifier support for blkcg

Add a unique application identifier (i.e fc_app_id member) in blkcg. This
allows identification of traffic belonging to an specific both on the host
and in the fabric infrastructure. As an example, this allows the storage
stack to uniquely identify traffic belong to particular virtual machine.

Link: https://lore.kernel.org/r/20210608043556.274139-3-muneendra.kumar@broadcom.com
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Muneendra Kumar <muneendra.kumar@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/Kconfig              |  9 +++++++
 drivers/scsi/Kconfig       | 13 ++++++++++
 include/linux/blk-cgroup.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+)

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index a2297edfdde8..03886d105301 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -144,6 +144,15 @@ config BLK_CGROUP_IOLATENCY
 
 	Note, this is an experimental interface and could be changed someday.
 
+config BLK_CGROUP_FC_APPID
+	bool "Enable support to track FC I/O Traffic across cgroup applications"
+	depends on BLK_CGROUP=y
+	help
+	  Enabling this option enables the support to track FC I/O traffic across
+	  cgroup applications. It enables the Fabric and the storage targets to
+	  identify, monitor, and handle FC traffic based on VM tags by inserting
+	  application specific identification into the FC frame.
+
 config BLK_CGROUP_IOCOST
 	bool "Enable support for cost model based cgroup IO controller"
 	depends on BLK_CGROUP=y
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 23e015af0e93..cd5fbd9d46ba 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -235,6 +235,19 @@ config SCSI_FC_ATTRS
 	  each attached FiberChannel device to sysfs, say Y.
 	  Otherwise, say N.
 
+config FC_APPID
+	bool "Enable support to track FC I/O Traffic"
+	depends on BLOCK && BLK_CGROUP
+	depends on SCSI
+	select BLK_CGROUP_FC_APPID
+	default y
+	help
+	  If you say Y here, it enables the support to track
+	  FC I/O traffic over fabric. It enables the Fabric and the
+	  storage targets to identify, monitor, and handle FC traffic
+	  based on VM tags by inserting application specific
+	  identification into the FC frame.
+
 config SCSI_ISCSI_ATTRS
 	tristate "iSCSI Transport Attributes"
 	depends on SCSI && NET
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index b9f3c246c3c9..37048438872c 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -30,6 +30,8 @@
 
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
+#define FC_APPID_LEN              129
+
 
 #ifdef CONFIG_BLK_CGROUP
 
@@ -55,6 +57,9 @@ struct blkcg {
 	struct blkcg_policy_data	*cpd[BLKCG_MAX_POLS];
 
 	struct list_head		all_blkcgs_node;
+#ifdef CONFIG_BLK_CGROUP_FC_APPID
+	char                            fc_app_id[FC_APPID_LEN];
+#endif
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head		cgwb_list;
 #endif
@@ -660,4 +665,62 @@ static inline void blk_cgroup_bio_start(struct bio *bio) { }
 
 #endif	/* CONFIG_BLOCK */
 #endif	/* CONFIG_BLK_CGROUP */
+
+#ifdef CONFIG_BLK_CGROUP_FC_APPID
+/*
+ * Sets the fc_app_id field associted to blkcg
+ * @app_id: application identifier
+ * @cgrp_id: cgroup id
+ * @app_id_len: size of application identifier
+ */
+static inline int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len)
+{
+	struct cgroup *cgrp;
+	struct cgroup_subsys_state *css;
+	struct blkcg *blkcg;
+	int ret  = 0;
+
+	if (app_id_len > FC_APPID_LEN)
+		return -EINVAL;
+
+	cgrp = cgroup_get_from_id(cgrp_id);
+	if (!cgrp)
+		return -ENOENT;
+	css = cgroup_get_e_css(cgrp, &io_cgrp_subsys);
+	if (!css) {
+		ret = -ENOENT;
+		goto out_cgrp_put;
+	}
+	blkcg = css_to_blkcg(css);
+	/*
+	 * There is a slight race condition on setting the appid.
+	 * Worst case an I/O may not find the right id.
+	 * This is no different from the I/O we let pass while obtaining
+	 * the vmid from the fabric.
+	 * Adding the overhead of a lock is not necessary.
+	 */
+	strlcpy(blkcg->fc_app_id, app_id, app_id_len);
+	css_put(css);
+out_cgrp_put:
+	cgroup_put(cgrp);
+	return ret;
+}
+
+/**
+ * blkcg_get_fc_appid - get the fc app identifier associated with a bio
+ * @bio: target bio
+ *
+ * On success return the fc_app_id, on failure return NULL
+ */
+static inline char *blkcg_get_fc_appid(struct bio *bio)
+{
+	if (bio && bio->bi_blkg &&
+		(bio->bi_blkg->blkcg->fc_app_id[0] != '\0'))
+		return bio->bi_blkg->blkcg->fc_app_id;
+	return NULL;
+}
+#else
+static inline int blkcg_set_fc_appid(char *buf, u64 id, size_t len) { return -EINVAL; }
+static inline char *blkcg_get_fc_appid(struct bio *bio) { return NULL; }
+#endif /*CONFIG_BLK_CGROUP_FC_APPID*/
 #endif	/* _BLK_CGROUP_H */
-- 
cgit v1.2.3


From 493db2b05d9217da5889840ee31121856627e3c6 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Thu, 10 Jun 2021 10:20:34 +0200
Subject: memory: pl353-smc: Let lower level controller drivers handle inits

There is no point in having all these definitions at the SMC bus level,
these are extremely tight to the NAND controller driver implementation,
are not particularly generic, imply more boilerplate than needed, do
not really follow the device model by receiving no argument and some of
them are actually buggy.

Let's get rid of these right now as there is no current user and keep
this driver at a simple level: only the SMC bare initializations.

The NAND controller driver which I am going to introduce will take care
of redefining properly all these helpers and using them directly.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20210610082040.2075611-13-miquel.raynal@bootlin.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
---
 drivers/memory/pl353-smc.c | 294 ---------------------------------------------
 include/linux/pl353-smc.h  |  30 -----
 2 files changed, 324 deletions(-)
 delete mode 100644 include/linux/pl353-smc.h

(limited to 'include/linux')

diff --git a/drivers/memory/pl353-smc.c b/drivers/memory/pl353-smc.c
index 14720430bf9e..5b57926461a0 100644
--- a/drivers/memory/pl353-smc.c
+++ b/drivers/memory/pl353-smc.c
@@ -8,76 +8,12 @@
  */
 
 #include <linux/clk.h>
-#include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/pl353-smc.h>
 #include <linux/amba/bus.h>
 
-/* Register definitions */
-#define PL353_SMC_MEMC_STATUS_OFFS	0	/* Controller status reg, RO */
-#define PL353_SMC_CFG_CLR_OFFS		0xC	/* Clear config reg, WO */
-#define PL353_SMC_DIRECT_CMD_OFFS	0x10	/* Direct command reg, WO */
-#define PL353_SMC_SET_CYCLES_OFFS	0x14	/* Set cycles register, WO */
-#define PL353_SMC_SET_OPMODE_OFFS	0x18	/* Set opmode register, WO */
-#define PL353_SMC_ECC_STATUS_OFFS	0x400	/* ECC status register */
-#define PL353_SMC_ECC_MEMCFG_OFFS	0x404	/* ECC mem config reg */
-#define PL353_SMC_ECC_MEMCMD1_OFFS	0x408	/* ECC mem cmd1 reg */
-#define PL353_SMC_ECC_MEMCMD2_OFFS	0x40C	/* ECC mem cmd2 reg */
-#define PL353_SMC_ECC_VALUE0_OFFS	0x418	/* ECC value 0 reg */
-
-/* Controller status register specific constants */
-#define PL353_SMC_MEMC_STATUS_RAW_INT_1_SHIFT	6
-
-/* Clear configuration register specific constants */
-#define PL353_SMC_CFG_CLR_INT_CLR_1	0x10
-#define PL353_SMC_CFG_CLR_ECC_INT_DIS_1	0x40
-#define PL353_SMC_CFG_CLR_INT_DIS_1	0x2
-#define PL353_SMC_CFG_CLR_DEFAULT_MASK	(PL353_SMC_CFG_CLR_INT_CLR_1 | \
-					 PL353_SMC_CFG_CLR_ECC_INT_DIS_1 | \
-					 PL353_SMC_CFG_CLR_INT_DIS_1)
-
-/* Set cycles register specific constants */
-#define PL353_SMC_SET_CYCLES_T0_MASK	0xF
-#define PL353_SMC_SET_CYCLES_T0_SHIFT	0
-#define PL353_SMC_SET_CYCLES_T1_MASK	0xF
-#define PL353_SMC_SET_CYCLES_T1_SHIFT	4
-#define PL353_SMC_SET_CYCLES_T2_MASK	0x7
-#define PL353_SMC_SET_CYCLES_T2_SHIFT	8
-#define PL353_SMC_SET_CYCLES_T3_MASK	0x7
-#define PL353_SMC_SET_CYCLES_T3_SHIFT	11
-#define PL353_SMC_SET_CYCLES_T4_MASK	0x7
-#define PL353_SMC_SET_CYCLES_T4_SHIFT	14
-#define PL353_SMC_SET_CYCLES_T5_MASK	0x7
-#define PL353_SMC_SET_CYCLES_T5_SHIFT	17
-#define PL353_SMC_SET_CYCLES_T6_MASK	0xF
-#define PL353_SMC_SET_CYCLES_T6_SHIFT	20
-
-/* ECC status register specific constants */
-#define PL353_SMC_ECC_STATUS_BUSY	BIT(6)
-#define PL353_SMC_ECC_REG_SIZE_OFFS	4
-
-/* ECC memory config register specific constants */
-#define PL353_SMC_ECC_MEMCFG_MODE_MASK	0xC
-#define PL353_SMC_ECC_MEMCFG_MODE_SHIFT	2
-#define PL353_SMC_ECC_MEMCFG_PGSIZE_MASK	0x3
-
-#define PL353_SMC_DC_UPT_NAND_REGS	((4 << 23) |	/* CS: NAND chip */ \
-				 (2 << 21))	/* UpdateRegs operation */
-
-#define PL353_NAND_ECC_CMD1	((0x80)       |	/* Write command */ \
-				 (0 << 8)     |	/* Read command */ \
-				 (0x30 << 16) |	/* Read End command */ \
-				 (1 << 24))	/* Read End command calid */
-
-#define PL353_NAND_ECC_CMD2	((0x85)	      |	/* Write col change cmd */ \
-				 (5 << 8)     |	/* Read col change cmd */ \
-				 (0xE0 << 16) |	/* Read col change end cmd */ \
-				 (1 << 24)) /* Read col change end cmd valid */
-#define PL353_NAND_ECC_BUSY_TIMEOUT	(1 * HZ)
 /**
  * struct pl353_smc_data - Private smc driver structure
  * @memclk:		Pointer to the peripheral clock
@@ -88,183 +24,6 @@ struct pl353_smc_data {
 	struct clk		*aclk;
 };
 
-/* SMC virtual register base */
-static void __iomem *pl353_smc_base;
-
-/**
- * pl353_smc_set_buswidth - Set memory buswidth
- * @bw: Memory buswidth (8 | 16)
- * Return: 0 on success or negative errno.
- */
-int pl353_smc_set_buswidth(unsigned int bw)
-{
-	if (bw != PL353_SMC_MEM_WIDTH_8  && bw != PL353_SMC_MEM_WIDTH_16)
-		return -EINVAL;
-
-	writel(bw, pl353_smc_base + PL353_SMC_SET_OPMODE_OFFS);
-	writel(PL353_SMC_DC_UPT_NAND_REGS, pl353_smc_base +
-	       PL353_SMC_DIRECT_CMD_OFFS);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pl353_smc_set_buswidth);
-
-/**
- * pl353_smc_set_cycles - Set memory timing parameters
- * @timings: NAND controller timing parameters
- *
- * Sets NAND chip specific timing parameters.
- */
-void pl353_smc_set_cycles(u32 timings[])
-{
-	/*
-	 * Set write pulse timing. This one is easy to extract:
-	 *
-	 * NWE_PULSE = tWP
-	 */
-	timings[0] &= PL353_SMC_SET_CYCLES_T0_MASK;
-	timings[1] = (timings[1] & PL353_SMC_SET_CYCLES_T1_MASK) <<
-			PL353_SMC_SET_CYCLES_T1_SHIFT;
-	timings[2] = (timings[2]  & PL353_SMC_SET_CYCLES_T2_MASK) <<
-			PL353_SMC_SET_CYCLES_T2_SHIFT;
-	timings[3] = (timings[3]  & PL353_SMC_SET_CYCLES_T3_MASK) <<
-			PL353_SMC_SET_CYCLES_T3_SHIFT;
-	timings[4] = (timings[4] & PL353_SMC_SET_CYCLES_T4_MASK) <<
-			PL353_SMC_SET_CYCLES_T4_SHIFT;
-	timings[5]  = (timings[5]  & PL353_SMC_SET_CYCLES_T5_MASK) <<
-			PL353_SMC_SET_CYCLES_T5_SHIFT;
-	timings[6]  = (timings[6]  & PL353_SMC_SET_CYCLES_T6_MASK) <<
-			PL353_SMC_SET_CYCLES_T6_SHIFT;
-	timings[0] |= timings[1] | timings[2] | timings[3] |
-			timings[4] | timings[5] | timings[6];
-
-	writel(timings[0], pl353_smc_base + PL353_SMC_SET_CYCLES_OFFS);
-	writel(PL353_SMC_DC_UPT_NAND_REGS, pl353_smc_base +
-	       PL353_SMC_DIRECT_CMD_OFFS);
-}
-EXPORT_SYMBOL_GPL(pl353_smc_set_cycles);
-
-/**
- * pl353_smc_ecc_is_busy - Read ecc busy flag
- * Return: the ecc_status bit from the ecc_status register. 1 = busy, 0 = idle
- */
-bool pl353_smc_ecc_is_busy(void)
-{
-	return ((readl(pl353_smc_base + PL353_SMC_ECC_STATUS_OFFS) &
-		  PL353_SMC_ECC_STATUS_BUSY) == PL353_SMC_ECC_STATUS_BUSY);
-}
-EXPORT_SYMBOL_GPL(pl353_smc_ecc_is_busy);
-
-/**
- * pl353_smc_get_ecc_val - Read ecc_valueN registers
- * @ecc_reg: Index of the ecc_value reg (0..3)
- * Return: the content of the requested ecc_value register.
- *
- * There are four valid ecc_value registers. The argument is truncated to stay
- * within this valid boundary.
- */
-u32 pl353_smc_get_ecc_val(int ecc_reg)
-{
-	u32 addr, reg;
-
-	addr = PL353_SMC_ECC_VALUE0_OFFS +
-		(ecc_reg * PL353_SMC_ECC_REG_SIZE_OFFS);
-	reg = readl(pl353_smc_base + addr);
-
-	return reg;
-}
-EXPORT_SYMBOL_GPL(pl353_smc_get_ecc_val);
-
-/**
- * pl353_smc_get_nand_int_status_raw - Get NAND interrupt status bit
- * Return: the raw_int_status1 bit from the memc_status register
- */
-int pl353_smc_get_nand_int_status_raw(void)
-{
-	u32 reg;
-
-	reg = readl(pl353_smc_base + PL353_SMC_MEMC_STATUS_OFFS);
-	reg >>= PL353_SMC_MEMC_STATUS_RAW_INT_1_SHIFT;
-	reg &= 1;
-
-	return reg;
-}
-EXPORT_SYMBOL_GPL(pl353_smc_get_nand_int_status_raw);
-
-/**
- * pl353_smc_clr_nand_int - Clear NAND interrupt
- */
-void pl353_smc_clr_nand_int(void)
-{
-	writel(PL353_SMC_CFG_CLR_INT_CLR_1,
-	       pl353_smc_base + PL353_SMC_CFG_CLR_OFFS);
-}
-EXPORT_SYMBOL_GPL(pl353_smc_clr_nand_int);
-
-/**
- * pl353_smc_set_ecc_mode - Set SMC ECC mode
- * @mode: ECC mode (BYPASS, APB, MEM)
- * Return: 0 on success or negative errno.
- */
-int pl353_smc_set_ecc_mode(enum pl353_smc_ecc_mode mode)
-{
-	u32 reg;
-	int ret = 0;
-
-	switch (mode) {
-	case PL353_SMC_ECCMODE_BYPASS:
-	case PL353_SMC_ECCMODE_APB:
-	case PL353_SMC_ECCMODE_MEM:
-
-		reg = readl(pl353_smc_base + PL353_SMC_ECC_MEMCFG_OFFS);
-		reg &= ~PL353_SMC_ECC_MEMCFG_MODE_MASK;
-		reg |= mode << PL353_SMC_ECC_MEMCFG_MODE_SHIFT;
-		writel(reg, pl353_smc_base + PL353_SMC_ECC_MEMCFG_OFFS);
-
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(pl353_smc_set_ecc_mode);
-
-/**
- * pl353_smc_set_ecc_pg_size - Set SMC ECC page size
- * @pg_sz: ECC page size
- * Return: 0 on success or negative errno.
- */
-int pl353_smc_set_ecc_pg_size(unsigned int pg_sz)
-{
-	u32 reg, sz;
-
-	switch (pg_sz) {
-	case 0:
-		sz = 0;
-		break;
-	case SZ_512:
-		sz = 1;
-		break;
-	case SZ_1K:
-		sz = 2;
-		break;
-	case SZ_2K:
-		sz = 3;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	reg = readl(pl353_smc_base + PL353_SMC_ECC_MEMCFG_OFFS);
-	reg &= ~PL353_SMC_ECC_MEMCFG_PGSIZE_MASK;
-	reg |= sz;
-	writel(reg, pl353_smc_base + PL353_SMC_ECC_MEMCFG_OFFS);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pl353_smc_set_ecc_pg_size);
-
 static int __maybe_unused pl353_smc_suspend(struct device *dev)
 {
 	struct pl353_smc_data *pl353_smc = dev_get_drvdata(dev);
@@ -296,52 +55,15 @@ static int __maybe_unused pl353_smc_resume(struct device *dev)
 	return ret;
 }
 
-static struct amba_driver pl353_smc_driver;
-
 static SIMPLE_DEV_PM_OPS(pl353_smc_dev_pm_ops, pl353_smc_suspend,
 			 pl353_smc_resume);
 
-/**
- * pl353_smc_init_nand_interface - Initialize the NAND interface
- * @adev: Pointer to the amba_device struct
- * @nand_node: Pointer to the pl353_nand device_node struct
- */
-static void pl353_smc_init_nand_interface(struct amba_device *adev,
-					  struct device_node *nand_node)
-{
-	unsigned long timeout;
-
-	pl353_smc_set_buswidth(PL353_SMC_MEM_WIDTH_8);
-	writel(PL353_SMC_CFG_CLR_INT_CLR_1,
-	       pl353_smc_base + PL353_SMC_CFG_CLR_OFFS);
-	writel(PL353_SMC_DC_UPT_NAND_REGS, pl353_smc_base +
-	       PL353_SMC_DIRECT_CMD_OFFS);
-
-	timeout = jiffies + PL353_NAND_ECC_BUSY_TIMEOUT;
-	/* Wait till the ECC operation is complete */
-	do {
-		if (pl353_smc_ecc_is_busy())
-			cpu_relax();
-		else
-			break;
-	} while (!time_after_eq(jiffies, timeout));
-
-	if (time_after_eq(jiffies, timeout))
-		return;
-
-	writel(PL353_NAND_ECC_CMD1,
-	       pl353_smc_base + PL353_SMC_ECC_MEMCMD1_OFFS);
-	writel(PL353_NAND_ECC_CMD2,
-	       pl353_smc_base + PL353_SMC_ECC_MEMCMD2_OFFS);
-}
-
 static const struct of_device_id pl353_smc_supported_children[] = {
 	{
 		.compatible = "cfi-flash"
 	},
 	{
 		.compatible = "arm,pl353-nand-r2p1",
-		.data = pl353_smc_init_nand_interface
 	},
 	{}
 };
@@ -350,23 +72,14 @@ static int pl353_smc_probe(struct amba_device *adev, const struct amba_id *id)
 {
 	struct pl353_smc_data *pl353_smc;
 	struct device_node *child;
-	struct resource *res;
 	int err;
 	struct device_node *of_node = adev->dev.of_node;
-	static void (*init)(struct amba_device *adev,
-			    struct device_node *nand_node);
 	const struct of_device_id *match = NULL;
 
 	pl353_smc = devm_kzalloc(&adev->dev, sizeof(*pl353_smc), GFP_KERNEL);
 	if (!pl353_smc)
 		return -ENOMEM;
 
-	/* Get the NAND controller virtual address */
-	res = &adev->res;
-	pl353_smc_base = devm_ioremap_resource(&adev->dev, res);
-	if (IS_ERR(pl353_smc_base))
-		return PTR_ERR(pl353_smc_base);
-
 	pl353_smc->aclk = devm_clk_get(&adev->dev, "apb_pclk");
 	if (IS_ERR(pl353_smc->aclk)) {
 		dev_err(&adev->dev, "aclk clock not found.\n");
@@ -393,10 +106,6 @@ static int pl353_smc_probe(struct amba_device *adev, const struct amba_id *id)
 
 	amba_set_drvdata(adev, pl353_smc);
 
-	/* clear interrupts */
-	writel(PL353_SMC_CFG_CLR_DEFAULT_MASK,
-	       pl353_smc_base + PL353_SMC_CFG_CLR_OFFS);
-
 	/* Find compatible children. Only a single child is supported */
 	for_each_available_child_of_node(of_node, child) {
 		match = of_match_node(pl353_smc_supported_children, child);
@@ -411,9 +120,6 @@ static int pl353_smc_probe(struct amba_device *adev, const struct amba_id *id)
 		goto disable_mem_clk;
 	}
 
-	init = match->data;
-	if (init)
-		init(adev, child);
 	of_platform_device_create(child, NULL, &adev->dev);
 
 	return 0;
diff --git a/include/linux/pl353-smc.h b/include/linux/pl353-smc.h
deleted file mode 100644
index 0e0d3df9bf72..000000000000
--- a/include/linux/pl353-smc.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * ARM PL353 SMC Driver Header
- *
- * Copyright (C) 2012 - 2018 Xilinx, Inc
- */
-
-#ifndef __LINUX_PL353_SMC_H
-#define __LINUX_PL353_SMC_H
-
-enum pl353_smc_ecc_mode {
-	PL353_SMC_ECCMODE_BYPASS = 0,
-	PL353_SMC_ECCMODE_APB = 1,
-	PL353_SMC_ECCMODE_MEM = 2
-};
-
-enum pl353_smc_mem_width {
-	PL353_SMC_MEM_WIDTH_8 = 0,
-	PL353_SMC_MEM_WIDTH_16 = 1
-};
-
-u32 pl353_smc_get_ecc_val(int ecc_reg);
-bool pl353_smc_ecc_is_busy(void);
-int pl353_smc_get_nand_int_status_raw(void);
-void pl353_smc_clr_nand_int(void);
-int pl353_smc_set_ecc_mode(enum pl353_smc_ecc_mode mode);
-int pl353_smc_set_ecc_pg_size(unsigned int pg_sz);
-int pl353_smc_set_buswidth(unsigned int bw);
-void pl353_smc_set_cycles(u32 timings[]);
-#endif
-- 
cgit v1.2.3


From 4b6c132b7da6430cf5dcc96948b04849dea0a32a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 Apr 2021 21:16:56 -0400
Subject: iov_iter: switch ..._full() variants of primitives to use of
 iov_iter_revert()

Use corresponding plain variants, revert on short copy.  That's the way it
should've been done from the very beginning, except that we didn't have
iov_iter_revert() back then...

[fixed another braino caught by Qian Cai <quic_qiancai@quicinc.com>]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  32 ++++++++++------
 lib/iov_iter.c      | 104 ----------------------------------------------------
 2 files changed, 21 insertions(+), 115 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 74a401f04bd3..68079e2f34eb 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -132,9 +132,7 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 
 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
-bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
-bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i);
 
 static __always_inline __must_check
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
@@ -157,10 +155,11 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 static __always_inline __must_check
 bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
 {
-	if (unlikely(!check_copy_size(addr, bytes, false)))
-		return false;
-	else
-		return _copy_from_iter_full(addr, bytes, i);
+	size_t copied = copy_from_iter(addr, bytes, i);
+	if (likely(copied == bytes))
+		return true;
+	iov_iter_revert(i, copied);
+	return false;
 }
 
 static __always_inline __must_check
@@ -175,10 +174,11 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 static __always_inline __must_check
 bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
-	if (unlikely(!check_copy_size(addr, bytes, false)))
-		return false;
-	else
-		return _copy_from_iter_full_nocache(addr, bytes, i);
+	size_t copied = copy_from_iter_nocache(addr, bytes, i);
+	if (likely(copied == bytes))
+		return true;
+	iov_iter_revert(i, copied);
+	return false;
 }
 
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
@@ -278,7 +278,17 @@ struct csum_state {
 
 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i);
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
-bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
+
+static __always_inline __must_check
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
+				  __wsum *csum, struct iov_iter *i)
+{
+	size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i);
+	if (likely(copied == bytes))
+		return true;
+	iov_iter_revert(i, copied);
+	return false;
+}
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
 
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index bdbe6691457d..ce9f8b9168ea 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -819,35 +819,6 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 }
 EXPORT_SYMBOL(_copy_from_iter);
 
-bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
-{
-	char *to = addr;
-	if (unlikely(iov_iter_is_pipe(i))) {
-		WARN_ON(1);
-		return false;
-	}
-	if (unlikely(i->count < bytes))
-		return false;
-
-	if (iter_is_iovec(i))
-		might_fault();
-	iterate_all_kinds(i, bytes, v, ({
-		if (copyin((to += v.iov_len) - v.iov_len,
-				      v.iov_base, v.iov_len))
-			return false;
-		0;}),
-		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len),
-		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
-		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len)
-	)
-
-	iov_iter_advance(i, bytes);
-	return true;
-}
-EXPORT_SYMBOL(_copy_from_iter_full);
-
 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
 	char *to = addr;
@@ -907,32 +878,6 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
 #endif
 
-bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
-{
-	char *to = addr;
-	if (unlikely(iov_iter_is_pipe(i))) {
-		WARN_ON(1);
-		return false;
-	}
-	if (unlikely(i->count < bytes))
-		return false;
-	iterate_all_kinds(i, bytes, v, ({
-		if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
-					     v.iov_base, v.iov_len))
-			return false;
-		0;}),
-		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len),
-		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
-		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
-				 v.bv_offset, v.bv_len)
-	)
-
-	iov_iter_advance(i, bytes);
-	return true;
-}
-EXPORT_SYMBOL(_copy_from_iter_full_nocache);
-
 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
 {
 	struct page *head;
@@ -1740,55 +1685,6 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 }
 EXPORT_SYMBOL(csum_and_copy_from_iter);
 
-bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
-			       struct iov_iter *i)
-{
-	char *to = addr;
-	__wsum sum, next;
-	size_t off = 0;
-	sum = *csum;
-	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
-		WARN_ON(1);
-		return false;
-	}
-	if (unlikely(i->count < bytes))
-		return false;
-	iterate_all_kinds(i, bytes, v, ({
-		next = csum_and_copy_from_user(v.iov_base,
-					       (to += v.iov_len) - v.iov_len,
-					       v.iov_len);
-		if (!next)
-			return false;
-		sum = csum_block_add(sum, next, off);
-		off += v.iov_len;
-		0;
-	}), ({
-		char *p = kmap_atomic(v.bv_page);
-		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
-				      p + v.bv_offset, v.bv_len,
-				      sum, off);
-		kunmap_atomic(p);
-		off += v.bv_len;
-	}),({
-		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
-				      v.iov_base, v.iov_len,
-				      sum, off);
-		off += v.iov_len;
-	}), ({
-		char *p = kmap_atomic(v.bv_page);
-		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
-				      p + v.bv_offset, v.bv_len,
-				      sum, off);
-		kunmap_atomic(p);
-		off += v.bv_len;
-	})
-	)
-	*csum = sum;
-	iov_iter_advance(i, bytes);
-	return true;
-}
-EXPORT_SYMBOL(csum_and_copy_from_iter_full);
-
 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 			     struct iov_iter *i)
 {
-- 
cgit v1.2.3


From 8cd54c1c848031a87820e58d772166ffdf8c08c0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 22 Apr 2021 14:50:39 -0400
Subject: iov_iter: separate direction from flavour

Instead of having them mixed in iter->type, use separate ->iter_type
and ->data_source (u8 and bool resp.)  And don't bother with (pseudo-)
bitmap for the former - microoptimizations from being able to check
if the flavour is one of two values are not worth the confusion for
optimizer.  It can't prove that we never get e.g. ITER_IOVEC | ITER_PIPE,
so we end up with extra headache.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h | 24 +++++++--------
 lib/iov_iter.c      | 85 ++++++++++++++++++++++++++++++-----------------------
 2 files changed, 58 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 68079e2f34eb..ad76eef356b0 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -19,21 +19,17 @@ struct kvec {
 
 enum iter_type {
 	/* iter types */
-	ITER_IOVEC = 4,
-	ITER_KVEC = 8,
-	ITER_BVEC = 16,
-	ITER_PIPE = 32,
-	ITER_DISCARD = 64,
-	ITER_XARRAY = 128,
+	ITER_IOVEC,
+	ITER_KVEC,
+	ITER_BVEC,
+	ITER_PIPE,
+	ITER_XARRAY,
+	ITER_DISCARD,
 };
 
 struct iov_iter {
-	/*
-	 * Bit 0 is the read/write bit, set if we're writing.
-	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
-	 * the caller isn't expecting to drop a page reference when done.
-	 */
-	unsigned int type;
+	u8 iter_type;
+	bool data_source;
 	size_t iov_offset;
 	size_t count;
 	union {
@@ -55,7 +51,7 @@ struct iov_iter {
 
 static inline enum iter_type iov_iter_type(const struct iov_iter *i)
 {
-	return i->type & ~(READ | WRITE);
+	return i->iter_type;
 }
 
 static inline bool iter_is_iovec(const struct iov_iter *i)
@@ -90,7 +86,7 @@ static inline bool iov_iter_is_xarray(const struct iov_iter *i)
 
 static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 {
-	return i->type & (READ | WRITE);
+	return i->data_source ? WRITE : READ;
 }
 
 /*
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e6c5834da32d..03c4e677b075 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -489,19 +489,15 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 			size_t count)
 {
 	WARN_ON(direction & ~(READ | WRITE));
-	direction &= READ | WRITE;
-
-	/* It will get better.  Eventually... */
-	if (uaccess_kernel()) {
-		i->type = ITER_KVEC | direction;
-		i->kvec = (struct kvec *)iov;
-	} else {
-		i->type = ITER_IOVEC | direction;
-		i->iov = iov;
-	}
-	i->nr_segs = nr_segs;
-	i->iov_offset = 0;
-	i->count = count;
+	WARN_ON_ONCE(uaccess_kernel());
+	*i = (struct iov_iter) {
+		.iter_type = ITER_IOVEC,
+		.data_source = direction,
+		.iov = iov,
+		.nr_segs = nr_segs,
+		.iov_offset = 0,
+		.count = count
+	};
 }
 EXPORT_SYMBOL(iov_iter_init);
 
@@ -1218,11 +1214,14 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
 			size_t count)
 {
 	WARN_ON(direction & ~(READ | WRITE));
-	i->type = ITER_KVEC | (direction & (READ | WRITE));
-	i->kvec = kvec;
-	i->nr_segs = nr_segs;
-	i->iov_offset = 0;
-	i->count = count;
+	*i = (struct iov_iter){
+		.iter_type = ITER_KVEC,
+		.data_source = direction,
+		.kvec = kvec,
+		.nr_segs = nr_segs,
+		.iov_offset = 0,
+		.count = count
+	};
 }
 EXPORT_SYMBOL(iov_iter_kvec);
 
@@ -1231,11 +1230,14 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 			size_t count)
 {
 	WARN_ON(direction & ~(READ | WRITE));
-	i->type = ITER_BVEC | (direction & (READ | WRITE));
-	i->bvec = bvec;
-	i->nr_segs = nr_segs;
-	i->iov_offset = 0;
-	i->count = count;
+	*i = (struct iov_iter){
+		.iter_type = ITER_BVEC,
+		.data_source = direction,
+		.bvec = bvec,
+		.nr_segs = nr_segs,
+		.iov_offset = 0,
+		.count = count
+	};
 }
 EXPORT_SYMBOL(iov_iter_bvec);
 
@@ -1245,12 +1247,15 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
 {
 	BUG_ON(direction != READ);
 	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
-	i->type = ITER_PIPE | READ;
-	i->pipe = pipe;
-	i->head = pipe->head;
-	i->iov_offset = 0;
-	i->count = count;
-	i->start_head = i->head;
+	*i = (struct iov_iter){
+		.iter_type = ITER_PIPE,
+		.data_source = false,
+		.pipe = pipe,
+		.head = pipe->head,
+		.start_head = pipe->head,
+		.iov_offset = 0,
+		.count = count
+	};
 }
 EXPORT_SYMBOL(iov_iter_pipe);
 
@@ -1271,11 +1276,14 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
 		     struct xarray *xarray, loff_t start, size_t count)
 {
 	BUG_ON(direction & ~1);
-	i->type = ITER_XARRAY | (direction & (READ | WRITE));
-	i->xarray = xarray;
-	i->xarray_start = start;
-	i->count = count;
-	i->iov_offset = 0;
+	*i = (struct iov_iter) {
+		.iter_type = ITER_XARRAY,
+		.data_source = direction,
+		.xarray = xarray,
+		.xarray_start = start,
+		.count = count,
+		.iov_offset = 0
+	};
 }
 EXPORT_SYMBOL(iov_iter_xarray);
 
@@ -1291,9 +1299,12 @@ EXPORT_SYMBOL(iov_iter_xarray);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
 {
 	BUG_ON(direction != READ);
-	i->type = ITER_DISCARD | READ;
-	i->count = count;
-	i->iov_offset = 0;
+	*i = (struct iov_iter){
+		.iter_type = ITER_DISCARD,
+		.data_source = false,
+		.count = count,
+		.iov_offset = 0
+	};
 }
 EXPORT_SYMBOL(iov_iter_discard);
 
-- 
cgit v1.2.3


From 8409a0d261e20180361e7afe6d89847d1bad4ce8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 May 2021 11:57:37 -0400
Subject: sanitize iov_iter_fault_in_readable()

1) constify iov_iter argument; we are not advancing it in this primitive.

2) cap the amount requested by the amount of data in iov_iter.  All
existing callers should've been safe, but the check is really cheap and
doing it here makes for easier analysis, as well as more consistent
semantics among the primitives.

3) don't bother with iterate_iovec().  Explicit loop is not any harder
to follow, and we get rid of standalone iterate_iovec() users - it's
only used by iterate_and_advance() and (soon to be gone) iterate_all_kinds().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  2 +-
 lib/iov_iter.c      | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index ad76eef356b0..b5cf54859109 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -119,7 +119,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 		struct iov_iter *i, unsigned long offset, size_t bytes);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
 void iov_iter_revert(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
+int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(const struct iov_iter *i);
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index cd23c79acb94..2b543bea1e0d 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -466,19 +466,25 @@ out:
  * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
  * because it is an invalid address).
  */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
 {
-	size_t skip = i->iov_offset;
-	const struct iovec *iov;
-	int err;
-	struct iovec v;
-
 	if (iter_is_iovec(i)) {
-		iterate_iovec(i, bytes, v, iov, skip, ({
-			err = fault_in_pages_readable(v.iov_base, v.iov_len);
+		const struct iovec *p;
+		size_t skip;
+
+		if (bytes > i->count)
+			bytes = i->count;
+		for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
+			size_t len = min(bytes, p->iov_len - skip);
+			int err;
+
+			if (unlikely(!len))
+				continue;
+			err = fault_in_pages_readable(p->iov_base + skip, len);
 			if (unlikely(err))
-			return err;
-		0;}))
+				return err;
+			bytes -= len;
+		}
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From f0b65f39ac505e8f1dcdaa165aa7b8c0bd6fd454 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 30 Apr 2021 10:26:41 -0400
Subject: iov_iter: replace iov_iter_copy_from_user_atomic() with
 iterator-advancing variant

Replacement is called copy_page_from_iter_atomic(); unlike the old primitive the
callers do *not* need to do iov_iter_advance() after it.  In case when they end
up consuming less than they'd been given they need to do iov_iter_revert() on
everything they had not consumed.  That, however, needs to be done only on slow
paths.

All in-tree callers converted.  And that kills the last user of iterate_all_kinds()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst |  9 +++++++++
 fs/btrfs/file.c                       | 23 +++++++++++------------
 fs/fuse/file.c                        |  3 +--
 fs/iomap/buffered-io.c                | 14 +++++++-------
 fs/ntfs/file.c                        |  4 +---
 include/linux/uio.h                   |  4 ++--
 lib/iov_iter.c                        | 30 ++++--------------------------
 mm/filemap.c                          | 16 ++++++++--------
 8 files changed, 43 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 0302035781be..43b492d08dec 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -890,3 +890,12 @@ been called or returned with non -EIOCBQUEUED code.
 
 mnt_want_write_file() can now only be paired with mnt_drop_write_file(),
 whereas previously it could be paired with mnt_drop_write() as well.
+
+---
+
+**mandatory**
+
+iov_iter_copy_from_user_atomic() is gone; use copy_page_from_iter_atomic().
+The difference is copy_page_from_iter_atomic() advances the iterator and
+you don't need iov_iter_advance() after it.  However, if you decide to use
+only a part of obtained data, you should do iov_iter_revert().
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 864c08d08a35..78cb8f9eaa6b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -398,7 +398,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 		/*
 		 * Copy data from userspace to the current page
 		 */
-		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+		copied = copy_page_from_iter_atomic(page, offset, count, i);
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
@@ -412,20 +412,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 		 * The rest of the btrfs_file_write code will fall
 		 * back to page at a time copies after we return 0.
 		 */
-		if (!PageUptodate(page) && copied < count)
-			copied = 0;
+		if (unlikely(copied < count)) {
+			if (!PageUptodate(page)) {
+				iov_iter_revert(i, copied);
+				copied = 0;
+			}
+			if (!copied)
+				break;
+		}
 
-		iov_iter_advance(i, copied);
 		write_bytes -= copied;
 		total_copied += copied;
-
-		/* Return to btrfs_file_write_iter to fault page */
-		if (unlikely(copied == 0))
-			break;
-
-		if (copied < PAGE_SIZE - offset) {
-			offset += copied;
-		} else {
+		offset += copied;
+		if (offset == PAGE_SIZE) {
 			pg++;
 			offset = 0;
 		}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 44bd301fa4fb..4722fa31a185 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1171,10 +1171,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
-		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+		tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
 		flush_dcache_page(page);
 
-		iov_iter_advance(ii, tmp);
 		if (!tmp) {
 			unlock_page(page);
 			put_page(page);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 354b41d20e5d..c5ff13e0e7cf 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -785,13 +785,15 @@ again:
 		if (mapping_writably_mapped(inode->i_mapping))
 			flush_dcache_page(page);
 
-		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
 
 		status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
 				srcmap);
 
-		cond_resched();
+		if (unlikely(copied != status))
+			iov_iter_revert(i, copied - status);
 
+		cond_resched();
 		if (unlikely(status == 0)) {
 			/*
 			 * A short copy made iomap_write_end() reject the
@@ -803,11 +805,9 @@ again:
 				bytes = copied;
 			goto again;
 		}
-		copied = status;
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
-		length -= copied;
+		pos += status;
+		written += status;
+		length -= status;
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
 	} while (iov_iter_count(i) && length);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 0666d4578137..ab4f3362466d 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1690,9 +1690,7 @@ static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
 		len = PAGE_SIZE - ofs;
 		if (len > bytes)
 			len = bytes;
-		copied = iov_iter_copy_from_user_atomic(*pages, i, ofs,
-				len);
-		iov_iter_advance(i, copied);
+		copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
 		total += copied;
 		bytes -= copied;
 		if (!bytes)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index b5cf54859109..82c3c3e819e0 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -115,8 +115,8 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
 	};
 }
 
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
+size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
+				  size_t bytes, struct iov_iter *i);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
 void iov_iter_revert(struct iov_iter *i, size_t bytes);
 int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 72c5bb794e8d..362e8b5a5dc5 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -114,28 +114,6 @@
 	n = wanted - n;						\
 }
 
-#define iterate_all_kinds(i, n, v, I, B, K, X) {		\
-	if (likely(n)) {					\
-		size_t skip = i->iov_offset;			\
-		if (likely(iter_is_iovec(i))) {			\
-			const struct iovec *iov;		\
-			struct iovec v;				\
-			iterate_iovec(i, n, v, iov, skip, (I))	\
-		} else if (iov_iter_is_bvec(i)) {		\
-			struct bio_vec v;			\
-			struct bvec_iter __bi;			\
-			iterate_bvec(i, n, v, __bi, skip, (B))	\
-		} else if (iov_iter_is_kvec(i)) {		\
-			const struct kvec *kvec;		\
-			struct kvec v;				\
-			iterate_kvec(i, n, v, kvec, skip, (K))	\
-		} else if (iov_iter_is_xarray(i)) {		\
-			struct bio_vec v;			\
-			iterate_xarray(i, n, v, skip, (X));	\
-		}						\
-	}							\
-}
-
 #define iterate_and_advance(i, n, v, I, B, K, X) {		\
 	if (unlikely(i->count < n))				\
 		n = i->count;					\
@@ -1009,8 +987,8 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_zero);
 
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
+size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
+				  struct iov_iter *i)
 {
 	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
 	if (unlikely(!page_copy_sane(page, offset, bytes))) {
@@ -1022,7 +1000,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 		WARN_ON(1);
 		return 0;
 	}
-	iterate_all_kinds(i, bytes, v,
+	iterate_and_advance(i, bytes, v,
 		copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
 		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
 				 v.bv_offset, v.bv_len),
@@ -1033,7 +1011,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 	kunmap_atomic(kaddr);
 	return bytes;
 }
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+EXPORT_SYMBOL(copy_page_from_iter_atomic);
 
 static inline void pipe_truncate(struct iov_iter *i)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index 0be24942bf8e..cf9de790f493 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3661,14 +3661,16 @@ again:
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
-		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
 		flush_dcache_page(page);
 
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
 						page, fsdata);
-		if (unlikely(status < 0))
-			break;
-
+		if (unlikely(status != copied)) {
+			iov_iter_revert(i, copied - max(status, 0L));
+			if (unlikely(status < 0))
+				break;
+		}
 		cond_resched();
 
 		if (unlikely(status == 0)) {
@@ -3682,10 +3684,8 @@ again:
 				bytes = copied;
 			goto again;
 		}
-		copied = status;
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
+		pos += status;
+		written += status;
 
 		balance_dirty_pages_ratelimited(mapping);
 	} while (iov_iter_count(i));
-- 
cgit v1.2.3


From ca24306d83a125df187ad53eddb038fe0cffb8ca Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 2 Jun 2021 17:18:58 +0900
Subject: bootconfig: Change array value to use child node

It is not possible to put an array value with subkeys under
a key node, because both of subkeys and the array elements
are using "next" field of the xbc_node.

Thus this changes the array values to use "child" field in
the array case. The reason why split this change is to
test it easily.

Link: https://lkml.kernel.org/r/162262193838.264090.16044473274501498656.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 fs/proc/bootconfig.c       |  2 +-
 include/linux/bootconfig.h |  6 +++---
 lib/bootconfig.c           | 23 +++++++++++++++++++----
 tools/bootconfig/main.c    |  2 +-
 4 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index ad31ec4ad627..6d8d4bf20837 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -49,7 +49,7 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
 				else
 					q = '"';
 				ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
-					q, val, q, vnode->next ? ", " : "\n");
+					q, val, q, xbc_node_is_array(vnode) ? ", " : "\n");
 				if (ret < 0)
 					goto out;
 				dst += ret;
diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 2696eb0fc149..3178a31fdabc 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -71,7 +71,7 @@ static inline __init bool xbc_node_is_key(struct xbc_node *node)
  */
 static inline __init bool xbc_node_is_array(struct xbc_node *node)
 {
-	return xbc_node_is_value(node) && node->next != 0;
+	return xbc_node_is_value(node) && node->child != 0;
 }
 
 /**
@@ -140,7 +140,7 @@ static inline struct xbc_node * __init xbc_find_node(const char *key)
  */
 #define xbc_array_for_each_value(anode, value)				\
 	for (value = xbc_node_get_data(anode); anode != NULL ;		\
-	     anode = xbc_node_get_next(anode),				\
+	     anode = xbc_node_get_child(anode),				\
 	     value = anode ? xbc_node_get_data(anode) : NULL)
 
 /**
@@ -171,7 +171,7 @@ static inline struct xbc_node * __init xbc_find_node(const char *key)
  */
 #define xbc_node_for_each_array_value(node, key, anode, value)		\
 	for (value = xbc_node_find_value(node, key, &anode); value != NULL; \
-	     anode = xbc_node_get_next(anode),				\
+	     anode = xbc_node_get_child(anode),				\
 	     value = anode ? xbc_node_get_data(anode) : NULL)
 
 /**
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 9f8c70a98fcf..44dcdcbd746a 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -367,6 +367,14 @@ static inline __init struct xbc_node *xbc_last_sibling(struct xbc_node *node)
 	return node;
 }
 
+static inline __init struct xbc_node *xbc_last_child(struct xbc_node *node)
+{
+	while (node->child)
+		node = xbc_node_get_child(node);
+
+	return node;
+}
+
 static struct xbc_node * __init xbc_add_sibling(char *data, u32 flag)
 {
 	struct xbc_node *sib, *node = xbc_add_node(data, flag);
@@ -517,17 +525,20 @@ static int __init xbc_parse_array(char **__v)
 	char *next;
 	int c = 0;
 
+	if (last_parent->child)
+		last_parent = xbc_node_get_child(last_parent);
+
 	do {
 		c = __xbc_parse_value(__v, &next);
 		if (c < 0)
 			return c;
 
-		node = xbc_add_sibling(*__v, XBC_VALUE);
+		node = xbc_add_child(*__v, XBC_VALUE);
 		if (!node)
 			return -ENOMEM;
 		*__v = next;
 	} while (c == ',');
-	node->next = 0;
+	node->child = 0;
 
 	return c;
 }
@@ -615,8 +626,12 @@ static int __init xbc_parse_kv(char **k, char *v, int op)
 
 	if (op == ':' && child) {
 		xbc_init_node(child, v, XBC_VALUE);
-	} else if (!xbc_add_sibling(v, XBC_VALUE))
-		return -ENOMEM;
+	} else {
+		if (op == '+' && child)
+			last_parent = xbc_last_child(child);
+		if (!xbc_add_sibling(v, XBC_VALUE))
+			return -ENOMEM;
+	}
 
 	if (c == ',') {	/* Array */
 		c = xbc_parse_array(&next);
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index 268b72f4cc92..23569fb634f1 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -27,7 +27,7 @@ static int xbc_show_value(struct xbc_node *node, bool semicolon)
 			q = '\'';
 		else
 			q = '"';
-		printf("%c%s%c%s", q, val, q, node->next ? ", " : eol);
+		printf("%c%s%c%s", q, val, q, xbc_node_is_array(node) ? ", " : eol);
 		i++;
 	}
 	return i;
-- 
cgit v1.2.3


From e5efaeb8a8f527d6e91289ff1f67fbcae452b2ca Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 2 Jun 2021 17:19:07 +0900
Subject: bootconfig: Support mixing a value and subkeys under a key

Support mixing a value and subkeys under a key. Since kernel cmdline
options will support "aaa.bbb=value1 aaa.bbb.ccc=value2", it is
better that the bootconfig supports such configuration too.

Note that this does not change syntax itself but just accepts
mixed value and subkeys e.g.

key = value1
key.subkey = value2

But this is not accepted;

key {
 value1
 subkey = value2
}

That will make value1 as a subkey.

Also, the order of the value node under a key is fixed. If there
are a value and subkeys, the value is always the first child node
of the key. Thus if user specifies subkeys first, e.g.

key.subkey = value1
key = value2

In the program (and /proc/bootconfig), it will be shown as below

key = value2
key.subkey = value1

Link: https://lkml.kernel.org/r/162262194685.264090.7738574774030567419.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h | 32 +++++++++++++++++++++++
 lib/bootconfig.c           | 65 ++++++++++++++++++++++++++++++++--------------
 tools/bootconfig/main.c    | 45 ++++++++++++++++++++++++++------
 3 files changed, 114 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 3178a31fdabc..e49043ac77c9 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -80,6 +80,8 @@ static inline __init bool xbc_node_is_array(struct xbc_node *node)
  *
  * Test the @node is a leaf key node which is a key node and has a value node
  * or no child. Returns true if it is a leaf node, or false if not.
+ * Note that the leaf node can have subkey nodes in addition to the
+ * value node.
  */
 static inline __init bool xbc_node_is_leaf(struct xbc_node *node)
 {
@@ -129,6 +131,23 @@ static inline struct xbc_node * __init xbc_find_node(const char *key)
 	return xbc_node_find_child(NULL, key);
 }
 
+/**
+ * xbc_node_get_subkey() - Return the first subkey node if exists
+ * @node: Parent node
+ *
+ * Return the first subkey node of the @node. If the @node has no child
+ * or only value node, this will return NULL.
+ */
+static inline struct xbc_node * __init xbc_node_get_subkey(struct xbc_node *node)
+{
+	struct xbc_node *child = xbc_node_get_child(node);
+
+	if (child && xbc_node_is_value(child))
+		return xbc_node_get_next(child);
+	else
+		return child;
+}
+
 /**
  * xbc_array_for_each_value() - Iterate value nodes on an array
  * @anode: An XBC arraied value node
@@ -149,11 +168,24 @@ static inline struct xbc_node * __init xbc_find_node(const char *key)
  * @child: Iterated XBC node.
  *
  * Iterate child nodes of @parent. Each child nodes are stored to @child.
+ * The @child can be mixture of a value node and subkey nodes.
  */
 #define xbc_node_for_each_child(parent, child)				\
 	for (child = xbc_node_get_child(parent); child != NULL ;	\
 	     child = xbc_node_get_next(child))
 
+/**
+ * xbc_node_for_each_subkey() - Iterate child subkey nodes
+ * @parent: An XBC node.
+ * @child: Iterated XBC node.
+ *
+ * Iterate subkey nodes of @parent. Each child nodes are stored to @child.
+ * The @child is only the subkey node.
+ */
+#define xbc_node_for_each_subkey(parent, child)				\
+	for (child = xbc_node_get_subkey(parent); child != NULL ;	\
+	     child = xbc_node_get_next(child))
+
 /**
  * xbc_node_for_each_array_value() - Iterate array entries of geven key
  * @node: An XBC node.
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 44dcdcbd746a..927017431fb6 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -156,7 +156,7 @@ xbc_node_find_child(struct xbc_node *parent, const char *key)
 	struct xbc_node *node;
 
 	if (parent)
-		node = xbc_node_get_child(parent);
+		node = xbc_node_get_subkey(parent);
 	else
 		node = xbc_root_node();
 
@@ -164,7 +164,7 @@ xbc_node_find_child(struct xbc_node *parent, const char *key)
 		if (!xbc_node_match_prefix(node, &key))
 			node = xbc_node_get_next(node);
 		else if (*key != '\0')
-			node = xbc_node_get_child(node);
+			node = xbc_node_get_subkey(node);
 		else
 			break;
 	}
@@ -274,6 +274,8 @@ int __init xbc_node_compose_key_after(struct xbc_node *root,
 struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root,
 						 struct xbc_node *node)
 {
+	struct xbc_node *next;
+
 	if (unlikely(!xbc_data))
 		return NULL;
 
@@ -282,6 +284,13 @@ struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root,
 		if (!node)
 			node = xbc_nodes;
 	} else {
+		/* Leaf node may have a subkey */
+		next = xbc_node_get_subkey(node);
+		if (next) {
+			node = next;
+			goto found;
+		}
+
 		if (node == root)	/* @root was a leaf, no child node. */
 			return NULL;
 
@@ -296,6 +305,7 @@ struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root,
 		node = xbc_node_get_next(node);
 	}
 
+found:
 	while (node && !xbc_node_is_leaf(node))
 		node = xbc_node_get_child(node);
 
@@ -375,18 +385,20 @@ static inline __init struct xbc_node *xbc_last_child(struct xbc_node *node)
 	return node;
 }
 
-static struct xbc_node * __init xbc_add_sibling(char *data, u32 flag)
+static struct xbc_node * __init __xbc_add_sibling(char *data, u32 flag, bool head)
 {
 	struct xbc_node *sib, *node = xbc_add_node(data, flag);
 
 	if (node) {
 		if (!last_parent) {
+			/* Ignore @head in this case */
 			node->parent = XBC_NODE_MAX;
 			sib = xbc_last_sibling(xbc_nodes);
 			sib->next = xbc_node_index(node);
 		} else {
 			node->parent = xbc_node_index(last_parent);
-			if (!last_parent->child) {
+			if (!last_parent->child || head) {
+				node->next = last_parent->child;
 				last_parent->child = xbc_node_index(node);
 			} else {
 				sib = xbc_node_get_child(last_parent);
@@ -400,6 +412,16 @@ static struct xbc_node * __init xbc_add_sibling(char *data, u32 flag)
 	return node;
 }
 
+static inline struct xbc_node * __init xbc_add_sibling(char *data, u32 flag)
+{
+	return __xbc_add_sibling(data, flag, false);
+}
+
+static inline struct xbc_node * __init xbc_add_head_sibling(char *data, u32 flag)
+{
+	return __xbc_add_sibling(data, flag, true);
+}
+
 static inline __init struct xbc_node *xbc_add_child(char *data, u32 flag)
 {
 	struct xbc_node *node = xbc_add_sibling(data, flag);
@@ -568,8 +590,9 @@ static int __init __xbc_add_key(char *k)
 		node = find_match_node(xbc_nodes, k);
 	else {
 		child = xbc_node_get_child(last_parent);
+		/* Since the value node is the first child, skip it. */
 		if (child && xbc_node_is_value(child))
-			return xbc_parse_error("Subkey is mixed with value", k);
+			child = xbc_node_get_next(child);
 		node = find_match_node(child, k);
 	}
 
@@ -612,27 +635,29 @@ static int __init xbc_parse_kv(char **k, char *v, int op)
 	if (ret)
 		return ret;
 
-	child = xbc_node_get_child(last_parent);
-	if (child) {
-		if (xbc_node_is_key(child))
-			return xbc_parse_error("Value is mixed with subkey", v);
-		else if (op == '=')
-			return xbc_parse_error("Value is redefined", v);
-	}
-
 	c = __xbc_parse_value(&v, &next);
 	if (c < 0)
 		return c;
 
-	if (op == ':' && child) {
-		xbc_init_node(child, v, XBC_VALUE);
-	} else {
-		if (op == '+' && child)
-			last_parent = xbc_last_child(child);
-		if (!xbc_add_sibling(v, XBC_VALUE))
-			return -ENOMEM;
+	child = xbc_node_get_child(last_parent);
+	if (child && xbc_node_is_value(child)) {
+		if (op == '=')
+			return xbc_parse_error("Value is redefined", v);
+		if (op == ':') {
+			unsigned short nidx = child->next;
+
+			xbc_init_node(child, v, XBC_VALUE);
+			child->next = nidx;	/* keep subkeys */
+			goto array;
+		}
+		/* op must be '+' */
+		last_parent = xbc_last_child(child);
 	}
+	/* The value node should always be the first child */
+	if (!xbc_add_head_sibling(v, XBC_VALUE))
+		return -ENOMEM;
 
+array:
 	if (c == ',') {	/* Array */
 		c = xbc_parse_array(&next);
 		if (c < 0)
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index 23569fb634f1..62a3b5064b17 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -35,30 +35,55 @@ static int xbc_show_value(struct xbc_node *node, bool semicolon)
 
 static void xbc_show_compact_tree(void)
 {
-	struct xbc_node *node, *cnode;
+	struct xbc_node *node, *cnode = NULL, *vnode;
 	int depth = 0, i;
 
 	node = xbc_root_node();
 	while (node && xbc_node_is_key(node)) {
 		for (i = 0; i < depth; i++)
 			printf("\t");
-		cnode = xbc_node_get_child(node);
+		if (!cnode)
+			cnode = xbc_node_get_child(node);
 		while (cnode && xbc_node_is_key(cnode) && !cnode->next) {
+			vnode = xbc_node_get_child(cnode);
+			/*
+			 * If @cnode has value and subkeys, this
+			 * should show it as below.
+			 *
+			 * key(@node) {
+			 *      key(@cnode) = value;
+			 *      key(@cnode) {
+			 *          subkeys;
+			 *      }
+			 * }
+			 */
+			if (vnode && xbc_node_is_value(vnode) && vnode->next)
+				break;
 			printf("%s.", xbc_node_get_data(node));
 			node = cnode;
-			cnode = xbc_node_get_child(node);
+			cnode = vnode;
 		}
 		if (cnode && xbc_node_is_key(cnode)) {
 			printf("%s {\n", xbc_node_get_data(node));
 			depth++;
 			node = cnode;
+			cnode = NULL;
 			continue;
 		} else if (cnode && xbc_node_is_value(cnode)) {
 			printf("%s = ", xbc_node_get_data(node));
 			xbc_show_value(cnode, true);
+			/*
+			 * If @node has value and subkeys, continue
+			 * looping on subkeys with same node.
+			 */
+			if (cnode->next) {
+				cnode = xbc_node_get_next(cnode);
+				continue;
+			}
 		} else {
 			printf("%s;\n", xbc_node_get_data(node));
 		}
+		cnode = NULL;
 
 		if (node->next) {
 			node = xbc_node_get_next(node);
@@ -70,10 +95,12 @@ static void xbc_show_compact_tree(void)
 				return;
 			if (!xbc_node_get_child(node)->next)
 				continue;
-			depth--;
-			for (i = 0; i < depth; i++)
-				printf("\t");
-			printf("}\n");
+			if (depth) {
+				depth--;
+				for (i = 0; i < depth; i++)
+					printf("\t");
+				printf("}\n");
+			}
 		}
 		node = xbc_node_get_next(node);
 	}
@@ -86,8 +113,10 @@ static void xbc_show_list(void)
 	const char *val;
 
 	xbc_for_each_key_value(leaf, val) {
-		if (xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX) < 0)
+		if (xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX) < 0) {
+			fprintf(stderr, "Failed to compose key %d\n", ret);
 			break;
+		}
 		printf("%s = ", key);
 		if (!val || val[0] == '\0') {
 			printf("\"\"\n");
-- 
cgit v1.2.3


From 99f4f5d62338cab9dcf45735344541574daedd20 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 2 Jun 2021 17:19:34 +0900
Subject: bootconfig: Share the checksum function with tools

Move the checksum calculation function into the header for sharing it
with tools/bootconfig.

Link: https://lkml.kernel.org/r/162262197470.264090.16325743685807878807.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/bootconfig.h | 20 ++++++++++++++++++++
 init/main.c                | 12 +-----------
 tools/bootconfig/main.c    | 15 ++-------------
 3 files changed, 23 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index e49043ac77c9..6bdd94cff4e2 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -16,6 +16,26 @@
 #define BOOTCONFIG_ALIGN	(1 << BOOTCONFIG_ALIGN_SHIFT)
 #define BOOTCONFIG_ALIGN_MASK	(BOOTCONFIG_ALIGN - 1)
 
+/**
+ * xbc_calc_checksum() - Calculate checksum of bootconfig
+ * @data: Bootconfig data.
+ * @size: The size of the bootconfig data.
+ *
+ * Calculate the checksum value of the bootconfig data.
+ * The checksum will be used with the BOOTCONFIG_MAGIC and the size for
+ * embedding the bootconfig in the initrd image.
+ */
+static inline __init u32 xbc_calc_checksum(void *data, u32 size)
+{
+	unsigned char *p = data;
+	u32 ret = 0;
+
+	while (size--)
+		ret += *p++;
+
+	return ret;
+}
+
 /* XBC tree node */
 struct xbc_node {
 	u16 next;
diff --git a/init/main.c b/init/main.c
index e9c42a183e33..7b150f0501e2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -386,16 +386,6 @@ static char * __init xbc_make_cmdline(const char *key)
 	return new_cmdline;
 }
 
-static u32 boot_config_checksum(unsigned char *p, u32 size)
-{
-	u32 ret = 0;
-
-	while (size--)
-		ret += *p++;
-
-	return ret;
-}
-
 static int __init bootconfig_params(char *param, char *val,
 				    const char *unused, void *arg)
 {
@@ -439,7 +429,7 @@ static void __init setup_boot_config(void)
 		return;
 	}
 
-	if (boot_config_checksum((unsigned char *)data, size) != csum) {
+	if (xbc_calc_checksum(data, size) != csum) {
 		pr_err("bootconfig checksum failed\n");
 		return;
 	}
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index 62a3b5064b17..f45fa992e01d 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -126,17 +126,6 @@ static void xbc_show_list(void)
 	}
 }
 
-/* Simple real checksum */
-static int checksum(unsigned char *buf, int len)
-{
-	int i, sum = 0;
-
-	for (i = 0; i < len; i++)
-		sum += buf[i];
-
-	return sum;
-}
-
 #define PAGE_SIZE	4096
 
 static int load_xbc_fd(int fd, char **buf, int size)
@@ -232,7 +221,7 @@ static int load_xbc_from_initrd(int fd, char **buf)
 		return ret;
 
 	/* Wrong Checksum */
-	rcsum = checksum((unsigned char *)*buf, size);
+	rcsum = xbc_calc_checksum(*buf, size);
 	if (csum != rcsum) {
 		pr_err("checksum error: %d != %d\n", csum, rcsum);
 		return -EINVAL;
@@ -381,7 +370,7 @@ static int apply_xbc(const char *path, const char *xbc_path)
 		return ret;
 	}
 	size = strlen(buf) + 1;
-	csum = checksum((unsigned char *)buf, size);
+	csum = xbc_calc_checksum(buf, size);
 
 	/* Backup the bootconfig data */
 	data = calloc(size + BOOTCONFIG_ALIGN +
-- 
cgit v1.2.3


From ad4e600cbf897f47525b342cd4b02e88ed300a83 Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Wed, 26 May 2021 06:51:26 -0400
Subject: drivers/soc/litex: remove 8-bit subregister option

Since upstream LiteX recommends that Linux support be limited to
designs configured with 32-bit CSR subregisters (see commit a2b71fde
in upstream LiteX, https://github.com/enjoy-digital/litex), remove
the option to select 8-bit subregisters, significantly reducing the
complexity of LiteX CSR (MMIO register) accessor methods.

NOTE: for details on the underlying mechanics of LiteX CSR registers,
see https://github.com/enjoy-digital/litex/wiki/CSR-Bus or the original
LiteX accessors (litex/soc/software/include/hw/common.h in the upstream
repository).

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Florent Kermarrec <florent@enjoy-digital.fr>
Cc: Mateusz Holenko <mholenko@antmicro.com>
Cc: Joel Stanley <joel@jms.id.au>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 drivers/soc/litex/Kconfig          |  12 -----
 drivers/soc/litex/litex_soc_ctrl.c |   3 +-
 include/linux/litex.h              | 103 ++++++-------------------------------
 3 files changed, 16 insertions(+), 102 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig
index e7011d665b15..e6ba3573a772 100644
--- a/drivers/soc/litex/Kconfig
+++ b/drivers/soc/litex/Kconfig
@@ -17,16 +17,4 @@ config LITEX_SOC_CONTROLLER
 	  All drivers that use functions from litex.h must depend on
 	  LITEX.
 
-config LITEX_SUBREG_SIZE
-	int "Size of a LiteX CSR subregister, in bytes"
-	depends on LITEX
-	range 1 4
-	default 4
-	help
-	LiteX MMIO registers (referred to as Configuration and Status
-	registers, or CSRs) are spread across adjacent 8- or 32-bit
-	subregisters, located at 32-bit aligned MMIO addresses. Use
-	this to select the appropriate size (1 or 4 bytes) matching
-	your particular LiteX build.
-
 endmenu
diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c
index c3e379a990f2..f75790091d38 100644
--- a/drivers/soc/litex/litex_soc_ctrl.c
+++ b/drivers/soc/litex/litex_soc_ctrl.c
@@ -62,8 +62,7 @@ static int litex_check_csr_access(void __iomem *reg_addr)
 	/* restore original value of the SCRATCH register */
 	litex_write32(reg_addr + SCRATCH_REG_OFF, SCRATCH_REG_VALUE);
 
-	pr_info("LiteX SoC Controller driver initialized: subreg:%d, align:%d",
-		LITEX_SUBREG_SIZE, LITEX_SUBREG_ALIGN);
+	pr_info("LiteX SoC Controller driver initialized");
 
 	return 0;
 }
diff --git a/include/linux/litex.h b/include/linux/litex.h
index 5ea9ccf5cce4..f2edb86d5f44 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -11,18 +11,6 @@
 
 #include <linux/io.h>
 
-/* LiteX SoCs support 8- or 32-bit CSR Bus data width (i.e., subreg. size) */
-#if defined(CONFIG_LITEX_SUBREG_SIZE) && \
-	(CONFIG_LITEX_SUBREG_SIZE == 1 || CONFIG_LITEX_SUBREG_SIZE == 4)
-#define LITEX_SUBREG_SIZE      CONFIG_LITEX_SUBREG_SIZE
-#else
-#error LiteX subregister size (LITEX_SUBREG_SIZE) must be 4 or 1!
-#endif
-#define LITEX_SUBREG_SIZE_BIT	 (LITEX_SUBREG_SIZE * 8)
-
-/* LiteX subregisters of any width are always aligned on a 4-byte boundary */
-#define LITEX_SUBREG_ALIGN	  0x4
-
 static inline void _write_litex_subregister(u32 val, void __iomem *addr)
 {
 	writel((u32 __force)cpu_to_le32(val), addr);
@@ -42,115 +30,54 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
  * 32-bit wide logical CSR will be laid out as four 32-bit physical
  * subregisters, each one containing one byte of meaningful data.
  *
- * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus
- */
-
-/* number of LiteX subregisters needed to store a register of given reg_size */
-#define _litex_num_subregs(reg_size) \
-	(((reg_size) - 1) / LITEX_SUBREG_SIZE + 1)
-
-/*
- * since the number of 4-byte aligned subregisters required to store a single
- * LiteX CSR (MMIO) register varies with LITEX_SUBREG_SIZE, the offset of the
- * next adjacent LiteX CSR register w.r.t. the offset of the current one also
- * depends on how many subregisters the latter is spread across
- */
-#define _next_reg_off(off, size) \
-	((off) + _litex_num_subregs(size) * LITEX_SUBREG_ALIGN)
-
-/*
- * The purpose of `_litex_[set|get]_reg()` is to implement the logic of
- * writing to/reading from the LiteX CSR in a single place that can be then
- * reused by all LiteX drivers via the `litex_[write|read][8|16|32|64]()`
- * accessors for the appropriate data width.
- * NOTE: direct use of `_litex_[set|get]_reg()` by LiteX drivers is strongly
- * discouraged, as they perform no error checking on the requested data width!
- */
-
-/**
- * _litex_set_reg() - Writes a value to the LiteX CSR (Control&Status Register)
- * @reg: Address of the CSR
- * @reg_size: The width of the CSR expressed in the number of bytes
- * @val: Value to be written to the CSR
+ * For Linux support, upstream LiteX enforces a 32-bit wide CSR bus, which
+ * means that only larger-than-32-bit CSRs will be split across multiple
+ * subregisters (e.g., a 64-bit CSR will be spread across two consecutive
+ * 32-bit subregisters).
  *
- * This function splits a single (possibly multi-byte) LiteX CSR write into
- * a series of subregister writes with a proper offset.
- * NOTE: caller is responsible for ensuring (0 < reg_size <= sizeof(u64)).
- */
-static inline void _litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
-{
-	u8 shift = _litex_num_subregs(reg_size) * LITEX_SUBREG_SIZE_BIT;
-
-	while (shift > 0) {
-		shift -= LITEX_SUBREG_SIZE_BIT;
-		_write_litex_subregister(val >> shift, reg);
-		reg += LITEX_SUBREG_ALIGN;
-	}
-}
-
-/**
- * _litex_get_reg() - Reads a value of the LiteX CSR (Control&Status Register)
- * @reg: Address of the CSR
- * @reg_size: The width of the CSR expressed in the number of bytes
- *
- * Return: Value read from the CSR
- *
- * This function generates a series of subregister reads with a proper offset
- * and joins their results into a single (possibly multi-byte) LiteX CSR value.
- * NOTE: caller is responsible for ensuring (0 < reg_size <= sizeof(u64)).
+ * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus
  */
-static inline u64 _litex_get_reg(void __iomem *reg, size_t reg_size)
-{
-	u64 r;
-	u8 i;
-
-	r = _read_litex_subregister(reg);
-	for (i = 1; i < _litex_num_subregs(reg_size); i++) {
-		r <<= LITEX_SUBREG_SIZE_BIT;
-		reg += LITEX_SUBREG_ALIGN;
-		r |= _read_litex_subregister(reg);
-	}
-	return r;
-}
 
 static inline void litex_write8(void __iomem *reg, u8 val)
 {
-	_litex_set_reg(reg, sizeof(u8), val);
+	_write_litex_subregister(val, reg);
 }
 
 static inline void litex_write16(void __iomem *reg, u16 val)
 {
-	_litex_set_reg(reg, sizeof(u16), val);
+	_write_litex_subregister(val, reg);
 }
 
 static inline void litex_write32(void __iomem *reg, u32 val)
 {
-	_litex_set_reg(reg, sizeof(u32), val);
+	_write_litex_subregister(val, reg);
 }
 
 static inline void litex_write64(void __iomem *reg, u64 val)
 {
-	_litex_set_reg(reg, sizeof(u64), val);
+	_write_litex_subregister(val >> 32, reg);
+	_write_litex_subregister(val, reg + 4);
 }
 
 static inline u8 litex_read8(void __iomem *reg)
 {
-	return _litex_get_reg(reg, sizeof(u8));
+	return _read_litex_subregister(reg);
 }
 
 static inline u16 litex_read16(void __iomem *reg)
 {
-	return _litex_get_reg(reg, sizeof(u16));
+	return _read_litex_subregister(reg);
 }
 
 static inline u32 litex_read32(void __iomem *reg)
 {
-	return _litex_get_reg(reg, sizeof(u32));
+	return _read_litex_subregister(reg);
 }
 
 static inline u64 litex_read64(void __iomem *reg)
 {
-	return _litex_get_reg(reg, sizeof(u64));
+	return ((u64)_read_litex_subregister(reg) << 32) |
+		_read_litex_subregister(reg + 4);
 }
 
 #endif /* _LINUX_LITEX_H */
-- 
cgit v1.2.3


From f8e6d24144d1bfbb8714faa9044e135c0c00bd89 Mon Sep 17 00:00:00 2001
From: Qi Liu <liuqi115@huawei.com>
Date: Wed, 9 Jun 2021 14:40:57 +0800
Subject: perf: Add EVENT_ATTR_ID to simplify event attributes

Similar EVENT_ATTR macros are defined in many PMU drivers,
like Arm PMU driver, Arm SMMU PMU driver. So add a generic
macro to simplify code.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Qi Liu <liuqi115@huawei.com>
Link: https://lore.kernel.org/r/1623220863-58233-2-git-send-email-liuqi115@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf_event.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f5a6a2f069ed..2d510ad750ed 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1576,6 +1576,12 @@ static struct perf_pmu_events_attr _var = {				    \
 	.event_str	= _str,						    \
 };
 
+#define PMU_EVENT_ATTR_ID(_name, _show, _id)				\
+	(&((struct perf_pmu_events_attr[]) {				\
+		{ .attr = __ATTR(_name, 0444, _show, NULL),		\
+		  .id = _id, }						\
+	})[0].attr.attr)
+
 #define PMU_FORMAT_ATTR(_name, _format)					\
 static ssize_t								\
 _name##_show(struct device *dev,					\
-- 
cgit v1.2.3


From fd307a4ad332ef50be5569c92490219e7cd84ce5 Mon Sep 17 00:00:00 2001
From: Jiri Prchal <jiri.prchal@aksignal.cz>
Date: Fri, 11 Jun 2021 11:45:58 +0200
Subject: nvmem: prepare basics for FRAM support

Added enum and string for FRAM (ferroelectric RAM) to expose it as file
named "fram".
Added documentation of sysfs file.

Signed-off-by: Jiri Prchal <jiri.prchal@aksignal.cz>
Link: https://lore.kernel.org/r/20210611094601.95131-2-jiri.prchal@aksignal.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-spi-eeprom   |  19 +++
 Documentation/devicetree/bindings/eeprom/at25.yaml |  31 +++-
 drivers/misc/eeprom/Kconfig                        |   5 +-
 drivers/misc/eeprom/at25.c                         | 161 +++++++++++++++++----
 drivers/nvmem/core.c                               |   4 +
 include/linux/nvmem-provider.h                     |   1 +
 6 files changed, 183 insertions(+), 38 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-class-spi-eeprom

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-spi-eeprom b/Documentation/ABI/testing/sysfs-class-spi-eeprom
new file mode 100644
index 000000000000..1ff757982079
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-spi-eeprom
@@ -0,0 +1,19 @@
+What:		/sys/class/spi_master/spi<bus>/spi<bus>.<dev>/fram
+Date:		June 2021
+KernelVersion:	5.14
+Contact:	Jiri Prchal <jiri.prchal@aksignal.cz>
+Description:
+	Contains the FRAM binary data. Same as EEPROM, just another file
+	name to indicate that it employs ferroelectric process.
+	It performs write operations at bus speed - no write delays.
+
+What:		/sys/class/spi_master/spi<bus>/spi<bus>.<dev>/sernum
+Date:		May 2021
+KernelVersion:	5.14
+Contact:	Jiri Prchal <jiri.prchal@aksignal.cz>
+Description:
+	Contains the serial number of the Cypress FRAM (FM25VN) if it is
+	present.  It will be displayed as a 8 byte hex string, as read
+	from the device.
+
+	This is a read-only attribute.
diff --git a/Documentation/devicetree/bindings/eeprom/at25.yaml b/Documentation/devicetree/bindings/eeprom/at25.yaml
index 6a2dc8b3ed14..fbf99e346966 100644
--- a/Documentation/devicetree/bindings/eeprom/at25.yaml
+++ b/Documentation/devicetree/bindings/eeprom/at25.yaml
@@ -4,14 +4,16 @@
 $id: "http://devicetree.org/schemas/eeprom/at25.yaml#"
 $schema: "http://devicetree.org/meta-schemas/core.yaml#"
 
-title: SPI EEPROMs compatible with Atmel's AT25
+title: SPI EEPROMs or FRAMs compatible with Atmel's AT25
 
 maintainers:
   - Christian Eggers <ceggers@arri.de>
 
 properties:
   $nodename:
-    pattern: "^eeprom@[0-9a-f]{1,2}$"
+    anyOf:
+      - pattern: "^eeprom@[0-9a-f]{1,2}$"
+      - pattern: "^fram@[0-9a-f]{1,2}$"
 
   # There are multiple known vendors who manufacture EEPROM chips compatible
   # with Atmel's AT25. The compatible string requires two items where the
@@ -31,6 +33,7 @@ properties:
               - microchip,25lc040
               - st,m95m02
               - st,m95256
+              - cypress,fm25
 
           - const: atmel,at25
 
@@ -47,7 +50,7 @@ properties:
     $ref: /schemas/types.yaml#/definitions/uint32
     enum: [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
     description:
-      Size of the eeprom page.
+      Size of the eeprom page. FRAMs don't have pages.
 
   size:
     $ref: /schemas/types.yaml#/definitions/uint32
@@ -100,9 +103,19 @@ required:
   - compatible
   - reg
   - spi-max-frequency
-  - pagesize
-  - size
-  - address-width
+
+allOf:
+  - if:
+      properties:
+        compatible:
+          not:
+            contains:
+              const: cypress,fm25
+    then:
+      required:
+        - pagesize
+        - size
+        - address-width
 
 additionalProperties: false
 
@@ -125,4 +138,10 @@ examples:
             size = <32768>;
             address-width = <16>;
         };
+
+        fram@1 {
+            compatible = "cypress,fm25", "atmel,at25";
+            reg = <1>;
+            spi-max-frequency = <40000000>;
+        };
     };
diff --git a/drivers/misc/eeprom/Kconfig b/drivers/misc/eeprom/Kconfig
index 0f791bfdc1f5..f0a7531f354c 100644
--- a/drivers/misc/eeprom/Kconfig
+++ b/drivers/misc/eeprom/Kconfig
@@ -32,12 +32,13 @@ config EEPROM_AT24
 	  will be called at24.
 
 config EEPROM_AT25
-	tristate "SPI EEPROMs from most vendors"
+	tristate "SPI EEPROMs (FRAMs) from most vendors"
 	depends on SPI && SYSFS
 	select NVMEM
 	select NVMEM_SYSFS
 	help
-	  Enable this driver to get read/write support to most SPI EEPROMs,
+	  Enable this driver to get read/write support to most SPI EEPROMs
+	  and Cypress FRAMs,
 	  after you configure the board init code to know about each eeprom
 	  on your target board.
 
diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c
index b76e4901b4a4..6e26de68a001 100644
--- a/drivers/misc/eeprom/at25.c
+++ b/drivers/misc/eeprom/at25.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * at25.c -- support most SPI EEPROMs, such as Atmel AT25 models
+ *	     and Cypress FRAMs FM25 models
  *
  * Copyright (C) 2006 David Brownell
  */
@@ -16,6 +17,9 @@
 #include <linux/spi/spi.h>
 #include <linux/spi/eeprom.h>
 #include <linux/property.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/math.h>
 
 /*
  * NOTE: this is an *EEPROM* driver.  The vagaries of product naming
@@ -27,6 +31,7 @@
  *   AT25M02, AT25128B
  */
 
+#define	FM25_SN_LEN	8		/* serial number length */
 struct at25_data {
 	struct spi_device	*spi;
 	struct mutex		lock;
@@ -34,6 +39,7 @@ struct at25_data {
 	unsigned		addrlen;
 	struct nvmem_config	nvmem_config;
 	struct nvmem_device	*nvmem;
+	u8 sernum[FM25_SN_LEN];
 };
 
 #define	AT25_WREN	0x06		/* latch the write enable */
@@ -42,6 +48,9 @@ struct at25_data {
 #define	AT25_WRSR	0x01		/* write status register */
 #define	AT25_READ	0x03		/* read byte(s) */
 #define	AT25_WRITE	0x02		/* write byte(s)/sector */
+#define	FM25_SLEEP	0xb9		/* enter sleep mode */
+#define	FM25_RDID	0x9f		/* read device ID */
+#define	FM25_RDSN	0xc3		/* read S/N */
 
 #define	AT25_SR_nRDY	0x01		/* nRDY = write-in-progress */
 #define	AT25_SR_WEN	0x02		/* write enable (latched) */
@@ -51,6 +60,8 @@ struct at25_data {
 
 #define	AT25_INSTR_BIT3	0x08		/* Additional address bit in instr */
 
+#define	FM25_ID_LEN	9		/* ID length */
+
 #define EE_MAXADDRLEN	3		/* 24 bit addresses, up to 2 MBytes */
 
 /* Specs often allow 5 msec for a page write, sometimes 20 msec;
@@ -58,6 +69,9 @@ struct at25_data {
  */
 #define	EE_TIMEOUT	25
 
+#define	IS_EEPROM	0
+#define	IS_FRAM		1
+
 /*-------------------------------------------------------------------------*/
 
 #define	io_limit	PAGE_SIZE	/* bytes */
@@ -129,6 +143,51 @@ static int at25_ee_read(void *priv, unsigned int offset,
 	return status;
 }
 
+/*
+ * read extra registers as ID or serial number
+ */
+static int fm25_aux_read(struct at25_data *at25, u8 *buf, uint8_t command,
+			 int len)
+{
+	int status;
+	struct spi_transfer t[2];
+	struct spi_message m;
+
+	spi_message_init(&m);
+	memset(t, 0, sizeof(t));
+
+	t[0].tx_buf = &command;
+	t[0].len = 1;
+	spi_message_add_tail(&t[0], &m);
+
+	t[1].rx_buf = buf;
+	t[1].len = len;
+	spi_message_add_tail(&t[1], &m);
+
+	mutex_lock(&at25->lock);
+
+	status = spi_sync(at25->spi, &m);
+	dev_dbg(&at25->spi->dev, "read %d aux bytes --> %d\n", len, status);
+
+	mutex_unlock(&at25->lock);
+	return status;
+}
+
+static ssize_t sernum_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct at25_data *at25;
+
+	at25 = dev_get_drvdata(dev);
+	return sysfs_emit(buf, "%*ph\n", sizeof(at25->sernum), at25->sernum);
+}
+static DEVICE_ATTR_RO(sernum);
+
+static struct attribute *sernum_attrs[] = {
+	&dev_attr_sernum.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(sernum);
+
 static int at25_ee_write(void *priv, unsigned int off, void *val, size_t count)
 {
 	struct at25_data *at25 = priv;
@@ -303,34 +362,39 @@ static int at25_fw_to_chip(struct device *dev, struct spi_eeprom *chip)
 	return 0;
 }
 
+static const struct of_device_id at25_of_match[] = {
+	{ .compatible = "atmel,at25", .data = (const void *)IS_EEPROM },
+	{ .compatible = "cypress,fm25", .data = (const void *)IS_FRAM },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, at25_of_match);
+
 static int at25_probe(struct spi_device *spi)
 {
 	struct at25_data	*at25 = NULL;
 	struct spi_eeprom	chip;
 	int			err;
 	int			sr;
-	int			addrlen;
+	u8 id[FM25_ID_LEN];
+	u8 sernum[FM25_SN_LEN];
+	int i;
+	const struct of_device_id *match;
+	int is_fram = 0;
+
+	match = of_match_device(of_match_ptr(at25_of_match), &spi->dev);
+	if (match)
+		is_fram = (int)match->data;
 
 	/* Chip description */
 	if (!spi->dev.platform_data) {
-		err = at25_fw_to_chip(&spi->dev, &chip);
-		if (err)
-			return err;
+		if (!is_fram) {
+			err = at25_fw_to_chip(&spi->dev, &chip);
+			if (err)
+				return err;
+		}
 	} else
 		chip = *(struct spi_eeprom *)spi->dev.platform_data;
 
-	/* For now we only support 8/16/24 bit addressing */
-	if (chip.flags & EE_ADDR1)
-		addrlen = 1;
-	else if (chip.flags & EE_ADDR2)
-		addrlen = 2;
-	else if (chip.flags & EE_ADDR3)
-		addrlen = 3;
-	else {
-		dev_dbg(&spi->dev, "unsupported address type\n");
-		return -EINVAL;
-	}
-
 	/* Ping the chip ... the status register is pretty portable,
 	 * unlike probing manufacturer IDs.  We do expect that system
 	 * firmware didn't write it in the past few milliseconds!
@@ -349,9 +413,51 @@ static int at25_probe(struct spi_device *spi)
 	at25->chip = chip;
 	at25->spi = spi;
 	spi_set_drvdata(spi, at25);
-	at25->addrlen = addrlen;
 
-	at25->nvmem_config.type = NVMEM_TYPE_EEPROM;
+	if (is_fram) {
+		/* Get ID of chip */
+		fm25_aux_read(at25, id, FM25_RDID, FM25_ID_LEN);
+		if (id[6] != 0xc2) {
+			dev_err(&spi->dev,
+				"Error: no Cypress FRAM (id %02x)\n", id[6]);
+			return -ENODEV;
+		}
+		/* set size found in ID */
+		if (id[7] < 0x21 || id[7] > 0x26) {
+			dev_err(&spi->dev, "Error: unsupported size (id %02x)\n", id[7]);
+			return -ENODEV;
+		}
+		chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024;
+
+		if (at25->chip.byte_len > 64 * 1024)
+			at25->chip.flags |= EE_ADDR3;
+		else
+			at25->chip.flags |= EE_ADDR2;
+
+		if (id[8]) {
+			fm25_aux_read(at25, sernum, FM25_RDSN, FM25_SN_LEN);
+			/* swap byte order */
+			for (i = 0; i < FM25_SN_LEN; i++)
+				at25->sernum[i] = sernum[FM25_SN_LEN - 1 - i];
+		}
+
+		at25->chip.page_size = PAGE_SIZE;
+		strncpy(at25->chip.name, "fm25", sizeof(at25->chip.name));
+	}
+
+	/* For now we only support 8/16/24 bit addressing */
+	if (at25->chip.flags & EE_ADDR1)
+		at25->addrlen = 1;
+	else if (at25->chip.flags & EE_ADDR2)
+		at25->addrlen = 2;
+	else if (at25->chip.flags & EE_ADDR3)
+		at25->addrlen = 3;
+	else {
+		dev_dbg(&spi->dev, "unsupported address type\n");
+		return -EINVAL;
+	}
+
+	at25->nvmem_config.type = is_fram ? NVMEM_TYPE_FRAM : NVMEM_TYPE_EEPROM;
 	at25->nvmem_config.name = dev_name(&spi->dev);
 	at25->nvmem_config.dev = &spi->dev;
 	at25->nvmem_config.read_only = chip.flags & EE_READONLY;
@@ -370,27 +476,22 @@ static int at25_probe(struct spi_device *spi)
 	if (IS_ERR(at25->nvmem))
 		return PTR_ERR(at25->nvmem);
 
-	dev_info(&spi->dev, "%d %s %s eeprom%s, pagesize %u\n",
-		(chip.byte_len < 1024) ? chip.byte_len : (chip.byte_len / 1024),
-		(chip.byte_len < 1024) ? "Byte" : "KByte",
-		at25->chip.name,
-		(chip.flags & EE_READONLY) ? " (readonly)" : "",
-		at25->chip.page_size);
+	dev_info(&spi->dev, "%d %s %s %s%s, pagesize %u\n",
+		 (chip.byte_len < 1024) ? chip.byte_len : (chip.byte_len / 1024),
+		 (chip.byte_len < 1024) ? "Byte" : "KByte",
+		 at25->chip.name, is_fram ? "fram" : "eeprom",
+		 (chip.flags & EE_READONLY) ? " (readonly)" : "",
+		 at25->chip.page_size);
 	return 0;
 }
 
 /*-------------------------------------------------------------------------*/
 
-static const struct of_device_id at25_of_match[] = {
-	{ .compatible = "atmel,at25", },
-	{ }
-};
-MODULE_DEVICE_TABLE(of, at25_of_match);
-
 static struct spi_driver at25_driver = {
 	.driver = {
 		.name		= "at25",
 		.of_match_table = at25_of_match,
+		.dev_groups	= sernum_groups,
 	},
 	.probe		= at25_probe,
 };
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index b3c28a2d4c10..4d1c4f83b22f 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -180,6 +180,7 @@ static const char * const nvmem_type_str[] = {
 	[NVMEM_TYPE_EEPROM] = "EEPROM",
 	[NVMEM_TYPE_OTP] = "OTP",
 	[NVMEM_TYPE_BATTERY_BACKED] = "Battery backed",
+	[NVMEM_TYPE_FRAM] = "FRAM",
 };
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -359,6 +360,9 @@ static int nvmem_sysfs_setup_compat(struct nvmem_device *nvmem,
 	if (!config->base_dev)
 		return -EINVAL;
 
+	if (config->type == NVMEM_TYPE_FRAM)
+		bin_attr_nvmem_eeprom_compat.attr.name = "fram";
+
 	nvmem->eeprom = bin_attr_nvmem_eeprom_compat;
 	nvmem->eeprom.attr.mode = nvmem_bin_attr_get_umode(nvmem);
 	nvmem->eeprom.size = nvmem->size;
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index e162b757b6d5..890003565761 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -25,6 +25,7 @@ enum nvmem_type {
 	NVMEM_TYPE_EEPROM,
 	NVMEM_TYPE_OTP,
 	NVMEM_TYPE_BATTERY_BACKED,
+	NVMEM_TYPE_FRAM,
 };
 
 #define NVMEM_DEVID_NONE	(-1)
-- 
cgit v1.2.3


From 03cb4473be92a4207a3d1df25186dafd1a5add4d Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Wed, 9 Jun 2021 09:39:49 -0700
Subject: ice: add low level PTP clock access functions

Add the ice_ptp_hw.c file and some associated definitions to the ice
driver folder. This file contains basic low level definitions for
functions that interact with the device hardware.

For now, only E810-based devices are supported. The ice hardware
supports 2 major variants which have different PHYs with different
procedures necessary for interacting with the device clock.

Because the device captures timestamps in the PHY, each PHY has its own
internal timer. The timers are synchronized in hardware by first
preparing the source timer and the PHY timer shadow registers, and then
issuing a synchronization command. This ensures that both the source
timer and PHY timers are programmed simultaneously. The timers
themselves are all driven from the same oscillator source.

The functions in ice_ptp_hw.c abstract over the differences between how
the PHYs in E810 are programmed vs how the PHYs in E822 devices are
programmed. This series only implements E810 support, but E822 support
will be added in a future change.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_hw_autogen.h |  17 +
 drivers/net/ethernet/intel/ice/ice_ptp_hw.c     | 653 ++++++++++++++++++++++++
 drivers/net/ethernet/intel/ice/ice_ptp_hw.h     |  79 +++
 drivers/net/ethernet/intel/ice/ice_type.h       |   9 +
 include/linux/kernel.h                          |  12 +
 5 files changed, 770 insertions(+)
 create mode 100644 drivers/net/ethernet/intel/ice/ice_ptp_hw.c
 create mode 100644 drivers/net/ethernet/intel/ice/ice_ptp_hw.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
index 84d5d43fe029..f6f5ced50be2 100644
--- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
+++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
@@ -433,6 +433,23 @@
 #define GLV_UPRCL(_i)				(0x003B2000 + ((_i) * 8))
 #define GLV_UPTCL(_i)				(0x0030A000 + ((_i) * 8))
 #define PRTRPB_RDPC				0x000AC260
+#define GLTSYN_CMD				0x00088810
+#define GLTSYN_CMD_SYNC				0x00088814
+#define GLTSYN_ENA(_i)				(0x00088808 + ((_i) * 4))
+#define GLTSYN_ENA_TSYN_ENA_M			BIT(0)
+#define GLTSYN_INCVAL_H(_i)			(0x00088920 + ((_i) * 4))
+#define GLTSYN_INCVAL_L(_i)			(0x00088918 + ((_i) * 4))
+#define GLTSYN_SHADJ_H(_i)			(0x00088910 + ((_i) * 4))
+#define GLTSYN_SHADJ_L(_i)			(0x00088908 + ((_i) * 4))
+#define GLTSYN_SHTIME_0(_i)			(0x000888E0 + ((_i) * 4))
+#define GLTSYN_SHTIME_H(_i)			(0x000888F0 + ((_i) * 4))
+#define GLTSYN_SHTIME_L(_i)			(0x000888E8 + ((_i) * 4))
+#define GLTSYN_STAT(_i)				(0x000888C0 + ((_i) * 4))
+#define GLTSYN_SYNC_DLAY			0x00088818
+#define GLTSYN_TIME_H(_i)			(0x000888D8 + ((_i) * 4))
+#define GLTSYN_TIME_L(_i)			(0x000888D0 + ((_i) * 4))
+#define PFTSYN_SEM				0x00088880
+#define PFTSYN_SEM_BUSY_M			BIT(0)
 #define VSIQF_FD_CNT(_VSI)			(0x00464000 + ((_VSI) * 4))
 #define VSIQF_FD_CNT_FD_GCNT_S			0
 #define VSIQF_FD_CNT_FD_GCNT_M			ICE_M(0x3FFF, 0)
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
new file mode 100644
index 000000000000..267312fad59a
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
@@ -0,0 +1,653 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021, Intel Corporation. */
+
+#include "ice_common.h"
+#include "ice_ptp_hw.h"
+
+/* Low level functions for interacting with and managing the device clock used
+ * for the Precision Time Protocol.
+ *
+ * The ice hardware represents the current time using three registers:
+ *
+ *    GLTSYN_TIME_H     GLTSYN_TIME_L     GLTSYN_TIME_R
+ *  +---------------+ +---------------+ +---------------+
+ *  |    32 bits    | |    32 bits    | |    32 bits    |
+ *  +---------------+ +---------------+ +---------------+
+ *
+ * The registers are incremented every clock tick using a 40bit increment
+ * value defined over two registers:
+ *
+ *                     GLTSYN_INCVAL_H   GLTSYN_INCVAL_L
+ *                    +---------------+ +---------------+
+ *                    |    8 bit s    | |    32 bits    |
+ *                    +---------------+ +---------------+
+ *
+ * The increment value is added to the GLSTYN_TIME_R and GLSTYN_TIME_L
+ * registers every clock source tick. Depending on the specific device
+ * configuration, the clock source frequency could be one of a number of
+ * values.
+ *
+ * For E810 devices, the increment frequency is 812.5 MHz
+ *
+ * The hardware captures timestamps in the PHY for incoming packets, and for
+ * outgoing packets on request. To support this, the PHY maintains a timer
+ * that matches the lower 64 bits of the global source timer.
+ *
+ * In order to ensure that the PHY timers and the source timer are equivalent,
+ * shadow registers are used to prepare the desired initial values. A special
+ * sync command is issued to trigger copying from the shadow registers into
+ * the appropriate source and PHY registers simultaneously.
+ */
+
+/**
+ * ice_get_ptp_src_clock_index - determine source clock index
+ * @hw: pointer to HW struct
+ *
+ * Determine the source clock index currently in use, based on device
+ * capabilities reported during initialization.
+ */
+u8 ice_get_ptp_src_clock_index(struct ice_hw *hw)
+{
+	return hw->func_caps.ts_func_info.tmr_index_assoc;
+}
+
+/* E810 functions
+ *
+ * The following functions operate on the E810 series devices which use
+ * a separate external PHY.
+ */
+
+/**
+ * ice_read_phy_reg_e810 - Read register from external PHY on E810
+ * @hw: pointer to the HW struct
+ * @addr: the address to read from
+ * @val: On return, the value read from the PHY
+ *
+ * Read a register from the external PHY on the E810 device.
+ */
+static int ice_read_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 *val)
+{
+	struct ice_sbq_msg_input msg = {0};
+	int status;
+
+	msg.msg_addr_low = lower_16_bits(addr);
+	msg.msg_addr_high = upper_16_bits(addr);
+	msg.opcode = ice_sbq_msg_rd;
+	msg.dest_dev = rmn_0;
+
+	status = ice_sbq_rw_reg(hw, &msg);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to PHY, status %d\n",
+			  status);
+		return status;
+	}
+
+	*val = msg.data;
+
+	return 0;
+}
+
+/**
+ * ice_write_phy_reg_e810 - Write register on external PHY on E810
+ * @hw: pointer to the HW struct
+ * @addr: the address to writem to
+ * @val: the value to write to the PHY
+ *
+ * Write a value to a register of the external PHY on the E810 device.
+ */
+static int ice_write_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 val)
+{
+	struct ice_sbq_msg_input msg = {0};
+	int status;
+
+	msg.msg_addr_low = lower_16_bits(addr);
+	msg.msg_addr_high = upper_16_bits(addr);
+	msg.opcode = ice_sbq_msg_wr;
+	msg.dest_dev = rmn_0;
+	msg.data = val;
+
+	status = ice_sbq_rw_reg(hw, &msg);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to PHY, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_read_phy_tstamp_e810 - Read a PHY timestamp out of the external PHY
+ * @hw: pointer to the HW struct
+ * @lport: the lport to read from
+ * @idx: the timestamp index to read
+ * @tstamp: on return, the 40bit timestamp value
+ *
+ * Read a 40bit timestamp value out of the timestamp block of the external PHY
+ * on the E810 device.
+ */
+static int
+ice_read_phy_tstamp_e810(struct ice_hw *hw, u8 lport, u8 idx, u64 *tstamp)
+{
+	u32 lo_addr, hi_addr, lo, hi;
+	int status;
+
+	lo_addr = TS_EXT(LOW_TX_MEMORY_BANK_START, lport, idx);
+	hi_addr = TS_EXT(HIGH_TX_MEMORY_BANK_START, lport, idx);
+
+	status = ice_read_phy_reg_e810(hw, lo_addr, &lo);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read low PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_read_phy_reg_e810(hw, hi_addr, &hi);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read high PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	/* For E810 devices, the timestamp is reported with the lower 32 bits
+	 * in the low register, and the upper 8 bits in the high register.
+	 */
+	*tstamp = ((u64)hi) << TS_HIGH_S | ((u64)lo & TS_LOW_M);
+
+	return 0;
+}
+
+/**
+ * ice_clear_phy_tstamp_e810 - Clear a timestamp from the external PHY
+ * @hw: pointer to the HW struct
+ * @lport: the lport to read from
+ * @idx: the timestamp index to reset
+ *
+ * Clear a timestamp, resetting its valid bit, from the timestamp block of the
+ * external PHY on the E810 device.
+ */
+static int ice_clear_phy_tstamp_e810(struct ice_hw *hw, u8 lport, u8 idx)
+{
+	u32 lo_addr, hi_addr;
+	int status;
+
+	lo_addr = TS_EXT(LOW_TX_MEMORY_BANK_START, lport, idx);
+	hi_addr = TS_EXT(HIGH_TX_MEMORY_BANK_START, lport, idx);
+
+	status = ice_write_phy_reg_e810(hw, lo_addr, 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to clear low PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810(hw, hi_addr, 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to clear high PTP timestamp register, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_init_phy_e810 - Enable PTP function on the external PHY
+ * @hw: pointer to HW struct
+ *
+ * Enable the timesync PTP functionality for the external PHY connected to
+ * this function.
+ */
+int ice_ptp_init_phy_e810(struct ice_hw *hw)
+{
+	int status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_ENA(tmr_idx),
+					GLTSYN_ENA_TSYN_ENA_M);
+	if (status)
+		ice_debug(hw, ICE_DBG_PTP, "PTP failed in ena_phy_time_syn %d\n",
+			  status);
+
+	return status;
+}
+
+/**
+ * ice_ptp_prep_phy_time_e810 - Prepare PHY port with initial time
+ * @hw: Board private structure
+ * @time: Time to initialize the PHY port clock to
+ *
+ * Program the PHY port ETH_GLTSYN_SHTIME registers in preparation setting the
+ * initial clock time. The time will not actually be programmed until the
+ * driver issues an INIT_TIME command.
+ *
+ * The time value is the upper 32 bits of the PHY timer, usually in units of
+ * nominal nanoseconds.
+ */
+static int ice_ptp_prep_phy_time_e810(struct ice_hw *hw, u32 time)
+{
+	int status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_0(tmr_idx), 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write SHTIME_0, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHTIME_L(tmr_idx), time);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write SHTIME_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_prep_phy_adj_e810 - Prep PHY port for a time adjustment
+ * @hw: pointer to HW struct
+ * @adj: adjustment value to program
+ *
+ * Prepare the PHY port for an atomic adjustment by programming the PHY
+ * ETH_GLTSYN_SHADJ_L and ETH_GLTSYN_SHADJ_H registers. The actual adjustment
+ * is completed by issuing an ADJ_TIME sync command.
+ *
+ * The adjustment value only contains the portion used for the upper 32bits of
+ * the PHY timer, usually in units of nominal nanoseconds. Negative
+ * adjustments are supported using 2s complement arithmetic.
+ */
+static int ice_ptp_prep_phy_adj_e810(struct ice_hw *hw, s32 adj)
+{
+	int status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+
+	/* Adjustments are represented as signed 2's complement values in
+	 * nanoseconds. Sub-nanosecond adjustment is not supported.
+	 */
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHADJ_L(tmr_idx), 0);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write adj to PHY SHADJ_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHADJ_H(tmr_idx), adj);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write adj to PHY SHADJ_H, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_prep_phy_incval_e810 - Prep PHY port increment value change
+ * @hw: pointer to HW struct
+ * @incval: The new 40bit increment value to prepare
+ *
+ * Prepare the PHY port for a new increment value by programming the PHY
+ * ETH_GLTSYN_SHADJ_L and ETH_GLTSYN_SHADJ_H registers. The actual change is
+ * completed by issuing an INIT_INCVAL command.
+ */
+static int ice_ptp_prep_phy_incval_e810(struct ice_hw *hw, u64 incval)
+{
+	u32 high, low;
+	int status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+	low = lower_32_bits(incval);
+	high = upper_32_bits(incval);
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHADJ_L(tmr_idx), low);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write incval to PHY SHADJ_L, status %d\n",
+			  status);
+		return status;
+	}
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_SHADJ_H(tmr_idx), high);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write incval PHY SHADJ_H, status %d\n",
+			  status);
+		return status;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_ptp_port_cmd_e810 - Prepare all external PHYs for a timer command
+ * @hw: pointer to HW struct
+ * @cmd: Command to be sent to the port
+ *
+ * Prepare the external PHYs connected to this device for a timer sync
+ * command.
+ */
+static int ice_ptp_port_cmd_e810(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd)
+{
+	u32 cmd_val, val;
+	int status;
+
+	switch (cmd) {
+	case INIT_TIME:
+		cmd_val = GLTSYN_CMD_INIT_TIME;
+		break;
+	case INIT_INCVAL:
+		cmd_val = GLTSYN_CMD_INIT_INCVAL;
+		break;
+	case ADJ_TIME:
+		cmd_val = GLTSYN_CMD_ADJ_TIME;
+		break;
+	case READ_TIME:
+		cmd_val = GLTSYN_CMD_READ_TIME;
+		break;
+	case ADJ_TIME_AT_TIME:
+		cmd_val = GLTSYN_CMD_ADJ_INIT_TIME;
+		break;
+	}
+
+	/* Read, modify, write */
+	status = ice_read_phy_reg_e810(hw, ETH_GLTSYN_CMD, &val);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to read GLTSYN_CMD, status %d\n", status);
+		return status;
+	}
+
+	/* Modify necessary bits only and perform write */
+	val &= ~TS_CMD_MASK_E810;
+	val |= cmd_val;
+
+	status = ice_write_phy_reg_e810(hw, ETH_GLTSYN_CMD, val);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to write back GLTSYN_CMD, status %d\n", status);
+		return status;
+	}
+
+	return 0;
+}
+
+/* Device agnostic functions
+ *
+ * The following functions implement useful behavior to hide the differences
+ * between E810 and other devices. They call the device-specific
+ * implementations where necessary.
+ *
+ * Currently, the driver only supports E810, but future work will enable
+ * support for E822-based devices.
+ */
+
+/**
+ * ice_ptp_lock - Acquire PTP global semaphore register lock
+ * @hw: pointer to the HW struct
+ *
+ * Acquire the global PTP hardware semaphore lock. Returns true if the lock
+ * was acquired, false otherwise.
+ *
+ * The PFTSYN_SEM register sets the busy bit on read, returning the previous
+ * value. If software sees the busy bit cleared, this means that this function
+ * acquired the lock (and the busy bit is now set). If software sees the busy
+ * bit set, it means that another function acquired the lock.
+ *
+ * Software must clear the busy bit with a write to release the lock for other
+ * functions when done.
+ */
+bool ice_ptp_lock(struct ice_hw *hw)
+{
+	u32 hw_lock;
+	int i;
+
+#define MAX_TRIES 5
+
+	for (i = 0; i < MAX_TRIES; i++) {
+		hw_lock = rd32(hw, PFTSYN_SEM + (PFTSYN_SEM_BYTES * hw->pf_id));
+		hw_lock = hw_lock & PFTSYN_SEM_BUSY_M;
+		if (hw_lock) {
+			/* Somebody is holding the lock */
+			usleep_range(10000, 20000);
+			continue;
+		} else {
+			break;
+		}
+	}
+
+	return !hw_lock;
+}
+
+/**
+ * ice_ptp_unlock - Release PTP global semaphore register lock
+ * @hw: pointer to the HW struct
+ *
+ * Release the global PTP hardware semaphore lock. This is done by writing to
+ * the PFTSYN_SEM register.
+ */
+void ice_ptp_unlock(struct ice_hw *hw)
+{
+	wr32(hw, PFTSYN_SEM + (PFTSYN_SEM_BYTES * hw->pf_id), 0);
+}
+
+/**
+ * ice_ptp_src_cmd - Prepare source timer for a timer command
+ * @hw: pointer to HW structure
+ * @cmd: Timer command
+ *
+ * Prepare the source timer for an upcoming timer sync command.
+ */
+static void ice_ptp_src_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd)
+{
+	u32 cmd_val;
+	u8 tmr_idx;
+
+	tmr_idx = ice_get_ptp_src_clock_index(hw);
+	cmd_val = tmr_idx << SEL_CPK_SRC;
+
+	switch (cmd) {
+	case INIT_TIME:
+		cmd_val |= GLTSYN_CMD_INIT_TIME;
+		break;
+	case INIT_INCVAL:
+		cmd_val |= GLTSYN_CMD_INIT_INCVAL;
+		break;
+	case ADJ_TIME:
+		cmd_val |= GLTSYN_CMD_ADJ_TIME;
+		break;
+	case ADJ_TIME_AT_TIME:
+		cmd_val |= GLTSYN_CMD_ADJ_INIT_TIME;
+		break;
+	case READ_TIME:
+		cmd_val |= GLTSYN_CMD_READ_TIME;
+		break;
+	}
+
+	wr32(hw, GLTSYN_CMD, cmd_val);
+}
+
+/**
+ * ice_ptp_tmr_cmd - Prepare and trigger a timer sync command
+ * @hw: pointer to HW struct
+ * @cmd: the command to issue
+ *
+ * Prepare the source timer and PHY timers and then trigger the requested
+ * command. This causes the shadow registers previously written in preparation
+ * for the command to be synchronously applied to both the source and PHY
+ * timers.
+ */
+static int ice_ptp_tmr_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd)
+{
+	int status;
+
+	/* First, prepare the source timer */
+	ice_ptp_src_cmd(hw, cmd);
+
+	/* Next, prepare the ports */
+	status = ice_ptp_port_cmd_e810(hw, cmd);
+	if (status) {
+		ice_debug(hw, ICE_DBG_PTP, "Failed to prepare PHY ports for timer command %u, status %d\n",
+			  cmd, status);
+		return status;
+	}
+
+	/* Write the sync command register to drive both source and PHY timer commands
+	 * synchronously
+	 */
+	wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD);
+
+	return 0;
+}
+
+/**
+ * ice_ptp_init_time - Initialize device time to provided value
+ * @hw: pointer to HW struct
+ * @time: 64bits of time (GLTSYN_TIME_L and GLTSYN_TIME_H)
+ *
+ * Initialize the device to the specified time provided. This requires a three
+ * step process:
+ *
+ * 1) write the new init time to the source timer shadow registers
+ * 2) write the new init time to the PHY timer shadow registers
+ * 3) issue an init_time timer command to synchronously switch both the source
+ *    and port timers to the new init time value at the next clock cycle.
+ */
+int ice_ptp_init_time(struct ice_hw *hw, u64 time)
+{
+	int status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+
+	/* Source timers */
+	wr32(hw, GLTSYN_SHTIME_L(tmr_idx), lower_32_bits(time));
+	wr32(hw, GLTSYN_SHTIME_H(tmr_idx), upper_32_bits(time));
+	wr32(hw, GLTSYN_SHTIME_0(tmr_idx), 0);
+
+	/* PHY timers */
+	/* Fill Rx and Tx ports and send msg to PHY */
+	status = ice_ptp_prep_phy_time_e810(hw, time & 0xFFFFFFFF);
+	if (status)
+		return status;
+
+	return ice_ptp_tmr_cmd(hw, INIT_TIME);
+}
+
+/**
+ * ice_ptp_write_incval - Program PHC with new increment value
+ * @hw: pointer to HW struct
+ * @incval: Source timer increment value per clock cycle
+ *
+ * Program the PHC with a new increment value. This requires a three-step
+ * process:
+ *
+ * 1) Write the increment value to the source timer shadow registers
+ * 2) Write the increment value to the PHY timer shadow registers
+ * 3) Issue an INIT_INCVAL timer command to synchronously switch both the
+ *    source and port timers to the new increment value at the next clock
+ *    cycle.
+ */
+int ice_ptp_write_incval(struct ice_hw *hw, u64 incval)
+{
+	int status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+
+	/* Shadow Adjust */
+	wr32(hw, GLTSYN_SHADJ_L(tmr_idx), lower_32_bits(incval));
+	wr32(hw, GLTSYN_SHADJ_H(tmr_idx), upper_32_bits(incval));
+
+	status = ice_ptp_prep_phy_incval_e810(hw, incval);
+	if (status)
+		return status;
+
+	return ice_ptp_tmr_cmd(hw, INIT_INCVAL);
+}
+
+/**
+ * ice_ptp_write_incval_locked - Program new incval while holding semaphore
+ * @hw: pointer to HW struct
+ * @incval: Source timer increment value per clock cycle
+ *
+ * Program a new PHC incval while holding the PTP semaphore.
+ */
+int ice_ptp_write_incval_locked(struct ice_hw *hw, u64 incval)
+{
+	int status;
+
+	if (!ice_ptp_lock(hw))
+		return -EBUSY;
+
+	status = ice_ptp_write_incval(hw, incval);
+
+	ice_ptp_unlock(hw);
+
+	return status;
+}
+
+/**
+ * ice_ptp_adj_clock - Adjust PHC clock time atomically
+ * @hw: pointer to HW struct
+ * @adj: Adjustment in nanoseconds
+ *
+ * Perform an atomic adjustment of the PHC time by the specified number of
+ * nanoseconds. This requires a three-step process:
+ *
+ * 1) Write the adjustment to the source timer shadow registers
+ * 2) Write the adjustment to the PHY timer shadow registers
+ * 3) Issue an ADJ_TIME timer command to synchronously apply the adjustment to
+ *    both the source and port timers at the next clock cycle.
+ */
+int ice_ptp_adj_clock(struct ice_hw *hw, s32 adj)
+{
+	int status;
+	u8 tmr_idx;
+
+	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
+
+	/* Write the desired clock adjustment into the GLTSYN_SHADJ register.
+	 * For an ADJ_TIME command, this set of registers represents the value
+	 * to add to the clock time. It supports subtraction by interpreting
+	 * the value as a 2's complement integer.
+	 */
+	wr32(hw, GLTSYN_SHADJ_L(tmr_idx), 0);
+	wr32(hw, GLTSYN_SHADJ_H(tmr_idx), adj);
+
+	status = ice_ptp_prep_phy_adj_e810(hw, adj);
+	if (status)
+		return status;
+
+	return ice_ptp_tmr_cmd(hw, ADJ_TIME);
+}
+
+/**
+ * ice_read_phy_tstamp - Read a PHY timestamp from the timestamo block
+ * @hw: pointer to the HW struct
+ * @block: the block to read from
+ * @idx: the timestamp index to read
+ * @tstamp: on return, the 40bit timestamp value
+ *
+ * Read a 40bit timestamp value out of the timestamp block.
+ */
+int ice_read_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx, u64 *tstamp)
+{
+	return ice_read_phy_tstamp_e810(hw, block, idx, tstamp);
+}
+
+/**
+ * ice_clear_phy_tstamp - Clear a timestamp from the timestamp block
+ * @hw: pointer to the HW struct
+ * @block: the block to read from
+ * @idx: the timestamp index to reset
+ *
+ * Clear a timestamp, resetting its valid bit, from the timestamp block.
+ */
+int ice_clear_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx)
+{
+	return ice_clear_phy_tstamp_e810(hw, block, idx);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h
new file mode 100644
index 000000000000..55a414e87018
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021, Intel Corporation. */
+
+#ifndef _ICE_PTP_HW_H_
+#define _ICE_PTP_HW_H_
+
+enum ice_ptp_tmr_cmd {
+	INIT_TIME,
+	INIT_INCVAL,
+	ADJ_TIME,
+	ADJ_TIME_AT_TIME,
+	READ_TIME
+};
+
+/* Increment value to generate nanoseconds in the GLTSYN_TIME_L register for
+ * the E810 devices. Based off of a PLL with an 812.5 MHz frequency.
+ */
+#define ICE_PTP_NOMINAL_INCVAL_E810 0x13b13b13bULL
+
+/* Device agnostic functions */
+u8 ice_get_ptp_src_clock_index(struct ice_hw *hw);
+bool ice_ptp_lock(struct ice_hw *hw);
+void ice_ptp_unlock(struct ice_hw *hw);
+int ice_ptp_init_time(struct ice_hw *hw, u64 time);
+int ice_ptp_write_incval(struct ice_hw *hw, u64 incval);
+int ice_ptp_write_incval_locked(struct ice_hw *hw, u64 incval);
+int ice_ptp_adj_clock(struct ice_hw *hw, s32 adj);
+int ice_read_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx, u64 *tstamp);
+int ice_clear_phy_tstamp(struct ice_hw *hw, u8 block, u8 idx);
+
+/* E810 family functions */
+int ice_ptp_init_phy_e810(struct ice_hw *hw);
+
+#define PFTSYN_SEM_BYTES	4
+
+/* PHY timer commands */
+#define SEL_CPK_SRC	8
+
+/* Time Sync command Definitions */
+#define GLTSYN_CMD_INIT_TIME		BIT(0)
+#define GLTSYN_CMD_INIT_INCVAL		BIT(1)
+#define GLTSYN_CMD_ADJ_TIME		BIT(2)
+#define GLTSYN_CMD_ADJ_INIT_TIME	(BIT(2) | BIT(3))
+#define GLTSYN_CMD_READ_TIME		BIT(7)
+
+#define TS_CMD_MASK_E810		0xFF
+#define SYNC_EXEC_CMD			0x3
+
+/* E810 timesync enable register */
+#define ETH_GLTSYN_ENA(_i)		(0x03000348 + ((_i) * 4))
+
+/* E810 shadow init time registers */
+#define ETH_GLTSYN_SHTIME_0(i)		(0x03000368 + ((i) * 32))
+#define ETH_GLTSYN_SHTIME_L(i)		(0x0300036C + ((i) * 32))
+
+/* E810 shadow time adjust registers */
+#define ETH_GLTSYN_SHADJ_L(_i)		(0x03000378 + ((_i) * 32))
+#define ETH_GLTSYN_SHADJ_H(_i)		(0x0300037C + ((_i) * 32))
+
+/* E810 timer command register */
+#define ETH_GLTSYN_CMD			0x03000344
+
+/* Source timer incval macros */
+#define INCVAL_HIGH_M			0xFF
+
+/* Timestamp block macros */
+#define TS_LOW_M			0xFFFFFFFF
+#define TS_HIGH_S			32
+
+#define BYTES_PER_IDX_ADDR_L_U		8
+
+/* External PHY timestamp address */
+#define TS_EXT(a, port, idx) ((a) + (0x1000 * (port)) +			\
+				 ((idx) * BYTES_PER_IDX_ADDR_L_U))
+
+#define LOW_TX_MEMORY_BANK_START	0x03090000
+#define HIGH_TX_MEMORY_BANK_START	0x03090004
+
+#endif /* _ICE_PTP_HW_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index 0e5d8d52728b..d33d1906103c 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -49,6 +49,7 @@ static inline u32 ice_round_to_num(u32 N, u32 R)
 #define ICE_DBG_RDMA		BIT_ULL(15)
 #define ICE_DBG_PKG		BIT_ULL(16)
 #define ICE_DBG_RES		BIT_ULL(17)
+#define ICE_DBG_PTP		BIT_ULL(19)
 #define ICE_DBG_AQ_MSG		BIT_ULL(24)
 #define ICE_DBG_AQ_DESC		BIT_ULL(25)
 #define ICE_DBG_AQ_DESC_BUF	BIT_ULL(26)
@@ -842,6 +843,14 @@ struct ice_hw {
 
 	u8 ucast_shared;	/* true if VSIs can share unicast addr */
 
+#define ICE_PHY_PER_NAC		1
+#define ICE_MAX_QUAD		2
+#define ICE_NUM_QUAD_TYPE	2
+#define ICE_PORTS_PER_QUAD	4
+#define ICE_PHY_0_LAST_QUAD	1
+#define ICE_PORTS_PER_PHY	8
+#define ICE_NUM_EXTERNAL_PORTS		ICE_PORTS_PER_PHY
+
 	/* Active package version (currently active) */
 	struct ice_pkg_ver active_pkg_ver;
 	u32 active_track_id;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 15d8bad3d2f2..e73f3bc3dba5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -71,6 +71,18 @@
  */
 #define lower_32_bits(n) ((u32)((n) & 0xffffffff))
 
+/**
+ * upper_16_bits - return bits 16-31 of a number
+ * @n: the number we're accessing
+ */
+#define upper_16_bits(n) ((u16)((n) >> 16))
+
+/**
+ * lower_16_bits - return bits 0-15 of a number
+ * @n: the number we're accessing
+ */
+#define lower_16_bits(n) ((u16)((n) & 0xffff))
+
 struct completion;
 struct pt_regs;
 struct user;
-- 
cgit v1.2.3


From 5937c3ce21228d33d2eb3287baa7e4cf6978dba9 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Thu, 3 Jun 2021 11:34:37 +0200
Subject: PM: domains: Drop/restore performance state votes for devices at
 runtime PM

A subsystem/driver that need to manage OPPs for its device, should
typically drop its vote for the OPP when the device becomes runtime
suspended. In this way, the corresponding aggregation of the performance
state votes that is managed in genpd for the attached PM domain, may find
that the aggregated vote can be decreased. Hence, it may allow genpd to set
the lower performance state for the PM domain, thus avoiding to waste
energy.

To accomplish this, typically a subsystem/driver would need to call
dev_pm_opp_set_rate|opp() for its device from its ->runtime_suspend()
callback, to drop the vote for the OPP. Accordingly, it needs another call
to dev_pm_opp_set_rate|opp() to restore the vote for the OPP from its
->runtime_resume() callback.

To avoid boilerplate code in subsystems/driver to deal with these things,
let's instead manage this internally in genpd.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 27 +++++++++++++++++++++++++--
 include/linux/pm_domain.h   |  1 +
 2 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 90a9828fcb2f..ab0b740cc0f1 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -400,6 +400,23 @@ static int genpd_set_performance_state(struct device *dev, unsigned int state)
 	return ret;
 }
 
+static int genpd_drop_performance_state(struct device *dev)
+{
+	unsigned int prev_state = dev_gpd_data(dev)->performance_state;
+
+	if (!genpd_set_performance_state(dev, 0))
+		return prev_state;
+
+	return 0;
+}
+
+static void genpd_restore_performance_state(struct device *dev,
+					    unsigned int state)
+{
+	if (state)
+		genpd_set_performance_state(dev, state);
+}
+
 /**
  * dev_pm_genpd_set_performance_state- Set performance state of device's power
  * domain.
@@ -843,7 +860,8 @@ static int genpd_runtime_suspend(struct device *dev)
 {
 	struct generic_pm_domain *genpd;
 	bool (*suspend_ok)(struct device *__dev);
-	struct gpd_timing_data *td = &dev_gpd_data(dev)->td;
+	struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev);
+	struct gpd_timing_data *td = &gpd_data->td;
 	bool runtime_pm = pm_runtime_enabled(dev);
 	ktime_t time_start;
 	s64 elapsed_ns;
@@ -900,6 +918,7 @@ static int genpd_runtime_suspend(struct device *dev)
 		return 0;
 
 	genpd_lock(genpd);
+	gpd_data->rpm_pstate = genpd_drop_performance_state(dev);
 	genpd_power_off(genpd, true, 0);
 	genpd_unlock(genpd);
 
@@ -917,7 +936,8 @@ static int genpd_runtime_suspend(struct device *dev)
 static int genpd_runtime_resume(struct device *dev)
 {
 	struct generic_pm_domain *genpd;
-	struct gpd_timing_data *td = &dev_gpd_data(dev)->td;
+	struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev);
+	struct gpd_timing_data *td = &gpd_data->td;
 	bool runtime_pm = pm_runtime_enabled(dev);
 	ktime_t time_start;
 	s64 elapsed_ns;
@@ -941,6 +961,8 @@ static int genpd_runtime_resume(struct device *dev)
 
 	genpd_lock(genpd);
 	ret = genpd_power_on(genpd, 0);
+	if (!ret)
+		genpd_restore_performance_state(dev, gpd_data->rpm_pstate);
 	genpd_unlock(genpd);
 
 	if (ret)
@@ -979,6 +1001,7 @@ err_stop:
 err_poweroff:
 	if (!pm_runtime_is_irq_safe(dev) || genpd_is_irq_safe(genpd)) {
 		genpd_lock(genpd);
+		gpd_data->rpm_pstate = genpd_drop_performance_state(dev);
 		genpd_power_off(genpd, true, 0);
 		genpd_unlock(genpd);
 	}
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index dfcfbcecc34b..21a0577305ef 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -198,6 +198,7 @@ struct generic_pm_domain_data {
 	struct notifier_block *power_nb;
 	int cpu;
 	unsigned int performance_state;
+	unsigned int rpm_pstate;
 	ktime_t	next_wakeup;
 	void *data;
 };
-- 
cgit v1.2.3


From cdb14e0f7775e767484843e8ecd736bb21754c58 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 2 Jun 2021 09:53:16 +0300
Subject: blk-mq: factor out a blk_mq_alloc_sq_tag_set helper

Factour out a helper to initialize a simple single hw queue tag_set from
blk_mq_init_sq_queue.  This will allow to phase out blk_mq_init_sq_queue
in favor of a more symmetric and general API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Link: https://lore.kernel.org/r/20210602065345.355274-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 32 ++++++++++++++++++--------------
 include/linux/blk-mq.h |  3 +++
 2 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4261adee9964..867e5faf4f5b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3152,24 +3152,12 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
 	struct request_queue *q;
 	int ret;
 
-	memset(set, 0, sizeof(*set));
-	set->ops = ops;
-	set->nr_hw_queues = 1;
-	set->nr_maps = 1;
-	set->queue_depth = queue_depth;
-	set->numa_node = NUMA_NO_NODE;
-	set->flags = set_flags;
-
-	ret = blk_mq_alloc_tag_set(set);
+	ret = blk_mq_alloc_sq_tag_set(set, ops, queue_depth, set_flags);
 	if (ret)
 		return ERR_PTR(ret);
-
 	q = blk_mq_init_queue(set);
-	if (IS_ERR(q)) {
+	if (IS_ERR(q))
 		blk_mq_free_tag_set(set);
-		return q;
-	}
-
 	return q;
 }
 EXPORT_SYMBOL(blk_mq_init_sq_queue);
@@ -3589,6 +3577,22 @@ out_free_mq_map:
 }
 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
 
+/* allocate and initialize a tagset for a simple single-queue device */
+int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
+		const struct blk_mq_ops *ops, unsigned int queue_depth,
+		unsigned int set_flags)
+{
+	memset(set, 0, sizeof(*set));
+	set->ops = ops;
+	set->nr_hw_queues = 1;
+	set->nr_maps = 1;
+	set->queue_depth = queue_depth;
+	set->numa_node = NUMA_NO_NODE;
+	set->flags = set_flags;
+	return blk_mq_alloc_tag_set(set);
+}
+EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
+
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
 	int i, j;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 359486940fa0..bb950fc669ef 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -439,6 +439,9 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
 void blk_mq_unregister_dev(struct device *, struct request_queue *);
 
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
+int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
+		const struct blk_mq_ops *ops, unsigned int queue_depth,
+		unsigned int set_flags);
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
-- 
cgit v1.2.3


From 26a9750aa875126e4b7fc5ee6de652a529c5b7ee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 2 Jun 2021 09:53:17 +0300
Subject: blk-mq: improve the blk_mq_init_allocated_queue interface

Don't return the passed in request_queue but a normal error code, and
drop the elevator_init argument in favor of just calling elevator_init_mq
directly from dm-rq.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Link: https://lore.kernel.org/r/20210602065345.355274-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c           | 36 ++++++++++++++----------------------
 block/blk.h              |  1 -
 block/elevator.c         |  2 +-
 drivers/md/dm-rq.c       |  9 +++------
 include/linux/blk-mq.h   |  5 ++---
 include/linux/elevator.h |  1 +
 6 files changed, 21 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 867e5faf4f5b..8550ad64982f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3115,21 +3115,18 @@ void blk_mq_release(struct request_queue *q)
 struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 		void *queuedata)
 {
-	struct request_queue *uninit_q, *q;
+	struct request_queue *q;
+	int ret;
 
-	uninit_q = blk_alloc_queue(set->numa_node);
-	if (!uninit_q)
+	q = blk_alloc_queue(set->numa_node);
+	if (!q)
 		return ERR_PTR(-ENOMEM);
-	uninit_q->queuedata = queuedata;
-
-	/*
-	 * Initialize the queue without an elevator. device_add_disk() will do
-	 * the initialization.
-	 */
-	q = blk_mq_init_allocated_queue(set, uninit_q, false);
-	if (IS_ERR(q))
-		blk_cleanup_queue(uninit_q);
-
+	q->queuedata = queuedata;
+	ret = blk_mq_init_allocated_queue(set, q);
+	if (ret) {
+		blk_cleanup_queue(q);
+		return ERR_PTR(ret);
+	}
 	return q;
 }
 EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
@@ -3273,9 +3270,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	mutex_unlock(&q->sysfs_lock);
 }
 
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
-						  struct request_queue *q,
-						  bool elevator_init)
+int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+		struct request_queue *q)
 {
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
@@ -3325,11 +3321,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
-
-	if (elevator_init)
-		elevator_init_mq(q);
-
-	return q;
+	return 0;
 
 err_hctxs:
 	kfree(q->queue_hw_ctx);
@@ -3340,7 +3332,7 @@ err_poll:
 	q->poll_cb = NULL;
 err_exit:
 	q->mq_ops = NULL;
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
 
diff --git a/block/blk.h b/block/blk.h
index 3440142f029b..d3fa47af3607 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -192,7 +192,6 @@ void blk_account_io_done(struct request *req, u64 now);
 
 void blk_insert_flush(struct request *rq);
 
-void elevator_init_mq(struct request_queue *q);
 int elevator_switch_mq(struct request_queue *q,
 			      struct elevator_type *new_e);
 void __elevator_exit(struct request_queue *, struct elevator_queue *);
diff --git a/block/elevator.c b/block/elevator.c
index 440699c28119..06e203426410 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -693,7 +693,7 @@ void elevator_init_mq(struct request_queue *q)
 		elevator_put(e);
 	}
 }
-
+EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */
 
 /*
  * switch to new_e io scheduler. be careful not to introduce deadlocks -
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9c3bc3711b33..0dbd48cbdff9 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -530,7 +530,6 @@ static const struct blk_mq_ops dm_mq_ops = {
 
 int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 {
-	struct request_queue *q;
 	struct dm_target *immutable_tgt;
 	int err;
 
@@ -557,12 +556,10 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 	if (err)
 		goto out_kfree_tag_set;
 
-	q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true);
-	if (IS_ERR(q)) {
-		err = PTR_ERR(q);
+	err = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+	if (err)
 		goto out_tag_set;
-	}
-
+	elevator_init_mq(md->queue);
 	return 0;
 
 out_tag_set:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index bb950fc669ef..73750b2838d2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -429,9 +429,8 @@ enum {
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 		void *queuedata);
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
-						  struct request_queue *q,
-						  bool elevator_init);
+int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+		struct request_queue *q);
 struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
 						const struct blk_mq_ops *ops,
 						unsigned int queue_depth,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index dcb2f9022c1d..783ecb3cb77a 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -120,6 +120,7 @@ extern void elv_merged_request(struct request_queue *, struct request *,
 extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
+void elevator_init_mq(struct request_queue *q);
 
 /*
  * io scheduler registration
-- 
cgit v1.2.3


From b461dfc49eb6fbabc60b9dad476e787ada56b7b4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 2 Jun 2021 09:53:18 +0300
Subject: blk-mq: add the blk_mq_alloc_disk APIs

Add a new API to allocate a gendisk including the request_queue for use
with blk-mq based drivers.  This is to avoid boilerplate code in drivers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Link: https://lore.kernel.org/r/20210602065345.355274-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 19 +++++++++++++++++++
 include/linux/blk-mq.h | 12 ++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8550ad64982f..b123077a0dc4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3137,6 +3137,25 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_init_queue);
 
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
+{
+	struct request_queue *q;
+	struct gendisk *disk;
+
+	q = blk_mq_init_queue_data(set, queuedata);
+	if (IS_ERR(q))
+		return ERR_CAST(q);
+
+	disk = __alloc_disk_node(0, set->numa_node);
+	if (!disk) {
+		blk_cleanup_queue(q);
+		return ERR_PTR(-ENOMEM);
+	}
+	disk->queue = q;
+	return disk;
+}
+EXPORT_SYMBOL(__blk_mq_alloc_disk);
+
 /*
  * Helper for setting up a queue with mq ops, given queue depth, and
  * the passed in mq ops flags.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 73750b2838d2..f496c6c5b5d2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -426,6 +426,18 @@ enum {
 	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
 		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
+#define blk_mq_alloc_disk(set, queuedata)				\
+({									\
+	static struct lock_class_key __key;				\
+	struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata);	\
+									\
+	if (__disk)							\
+		lockdep_init_map(&__disk->lockdep_map,			\
+			"(bio completion)", &__key, 0);			\
+	__disk;								\
+})
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
+		void *queuedata);
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 		void *queuedata);
-- 
cgit v1.2.3


From 08c1d480ed38995690a7d83f2c6a505f6cbbed9f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 2 Jun 2021 09:53:30 +0300
Subject: blk-mq: remove blk_mq_init_sq_queue

All users are gone now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Link: https://lore.kernel.org/r/20210602065345.355274-16-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 22 ----------------------
 include/linux/blk-mq.h |  4 ----
 2 files changed, 26 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b123077a0dc4..3115ea2d0990 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3156,28 +3156,6 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
 }
 EXPORT_SYMBOL(__blk_mq_alloc_disk);
 
-/*
- * Helper for setting up a queue with mq ops, given queue depth, and
- * the passed in mq ops flags.
- */
-struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
-					   const struct blk_mq_ops *ops,
-					   unsigned int queue_depth,
-					   unsigned int set_flags)
-{
-	struct request_queue *q;
-	int ret;
-
-	ret = blk_mq_alloc_sq_tag_set(set, ops, queue_depth, set_flags);
-	if (ret)
-		return ERR_PTR(ret);
-	q = blk_mq_init_queue(set);
-	if (IS_ERR(q))
-		blk_mq_free_tag_set(set);
-	return q;
-}
-EXPORT_SYMBOL(blk_mq_init_sq_queue);
-
 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
 		struct blk_mq_tag_set *set, struct request_queue *q,
 		int hctx_idx, int node)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f496c6c5b5d2..02a4aab0aeac 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -443,10 +443,6 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 		void *queuedata);
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		struct request_queue *q);
-struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
-						const struct blk_mq_ops *ops,
-						unsigned int queue_depth,
-						unsigned int set_flags);
 void blk_mq_unregister_dev(struct device *, struct request_queue *);
 
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
-- 
cgit v1.2.3


From ab6a303c5440156dd475b5884cff26a7245630f8 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 22:01:26 +0300
Subject: net: dsa: tag_8021q: remove shim declarations

All users of tag_8021q select it in Kconfig, so shim functions are not
needed because it is not possible for it to be disabled and its callers
enabled.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/8021q.h | 76 -----------------------------------------------
 1 file changed, 76 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index b12b05f1c8b4..cbf2c9b1ee4f 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -37,8 +37,6 @@ struct dsa_8021q_context {
 
 #define DSA_8021Q_N_SUBVLAN			8
 
-#if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q)
-
 int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled);
 
 int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
@@ -70,78 +68,4 @@ bool vid_is_dsa_8021q_txvlan(u16 vid);
 
 bool vid_is_dsa_8021q(u16 vid);
 
-#else
-
-int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled)
-{
-	return 0;
-}
-
-int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port,
-				    struct dsa_8021q_context *other_ctx,
-				    int other_port)
-{
-	return 0;
-}
-
-int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
-				     struct dsa_8021q_context *other_ctx,
-				     int other_port)
-{
-	return 0;
-}
-
-struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
-			       u16 tpid, u16 tci)
-{
-	return NULL;
-}
-
-u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port)
-{
-	return 0;
-}
-
-u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port)
-{
-	return 0;
-}
-
-u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan)
-{
-	return 0;
-}
-
-int dsa_8021q_rx_switch_id(u16 vid)
-{
-	return 0;
-}
-
-int dsa_8021q_rx_source_port(u16 vid)
-{
-	return 0;
-}
-
-u16 dsa_8021q_rx_subvlan(u16 vid)
-{
-	return 0;
-}
-
-bool vid_is_dsa_8021q_rxvlan(u16 vid)
-{
-	return false;
-}
-
-bool vid_is_dsa_8021q_txvlan(u16 vid)
-{
-	return false;
-}
-
-bool vid_is_dsa_8021q(u16 vid)
-{
-	return false;
-}
-
-#endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */
-
 #endif /* _NET_DSA_8021Q_H */
-- 
cgit v1.2.3


From 233697b3b3f60b17d02ca2a35230aee0ac6f1759 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 22:01:27 +0300
Subject: net: dsa: tag_8021q: refactor RX VLAN parsing into a dedicated
 function

The added value of this function is that it can deal with both the case
where the VLAN header is in the skb head, as well as in the offload field.
This is something I was not able to do using other functions in the
network stack.

Since both ocelot-8021q and sja1105 need to do the same stuff, let's
make it a common service provided by tag_8021q.

This is done as refactoring for the new SJA1110 tagger, which partly
uses tag_8021q as well (just like SJA1105), and will be the third caller.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/8021q.h  |  3 +++
 net/dsa/tag_8021q.c        | 23 +++++++++++++++++++++++
 net/dsa/tag_ocelot_8021q.c | 18 ++----------------
 net/dsa/tag_sja1105.c      | 33 +++++++++++----------------------
 4 files changed, 39 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index cbf2c9b1ee4f..1587961f1a7b 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -50,6 +50,9 @@ int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port,
 struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 			       u16 tpid, u16 tci);
 
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
+		   int *subvlan);
+
 u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port);
 
 u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port);
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 122ad5833fb1..4aa29f90ecea 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -471,4 +471,27 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
 
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
+		   int *subvlan)
+{
+	u16 vid, tci;
+
+	skb_push_rcsum(skb, ETH_HLEN);
+	if (skb_vlan_tag_present(skb)) {
+		tci = skb_vlan_tag_get(skb);
+		__vlan_hwaccel_clear_tag(skb);
+	} else {
+		__skb_vlan_pop(skb, &tci);
+	}
+	skb_pull_rcsum(skb, ETH_HLEN);
+
+	vid = tci & VLAN_VID_MASK;
+
+	*source_port = dsa_8021q_rx_source_port(vid);
+	*switch_id = dsa_8021q_rx_switch_id(vid);
+	*subvlan = dsa_8021q_rx_subvlan(vid);
+	skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_rcv);
+
 MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 663b74793cfc..85ac85c3af8c 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -41,29 +41,15 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 				  struct net_device *netdev,
 				  struct packet_type *pt)
 {
-	int src_port, switch_id, qos_class;
-	u16 vid, tci;
+	int src_port, switch_id, subvlan;
 
-	skb_push_rcsum(skb, ETH_HLEN);
-	if (skb_vlan_tag_present(skb)) {
-		tci = skb_vlan_tag_get(skb);
-		__vlan_hwaccel_clear_tag(skb);
-	} else {
-		__skb_vlan_pop(skb, &tci);
-	}
-	skb_pull_rcsum(skb, ETH_HLEN);
-
-	vid = tci & VLAN_VID_MASK;
-	src_port = dsa_8021q_rx_source_port(vid);
-	switch_id = dsa_8021q_rx_switch_id(vid);
-	qos_class = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+	dsa_8021q_rcv(skb, &src_port, &switch_id, &subvlan);
 
 	skb->dev = dsa_master_find_slave(netdev, switch_id, src_port);
 	if (!skb->dev)
 		return NULL;
 
 	skb->offload_fwd_mark = 1;
-	skb->priority = qos_class;
 
 	return skb;
 }
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 92e147293acf..a70625fe64f7 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -275,44 +275,33 @@ static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan)
 	__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci);
 }
 
+static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb)
+{
+	u16 tpid = ntohs(eth_hdr(skb)->h_proto);
+
+	return tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q ||
+	       skb_vlan_tag_present(skb);
+}
+
 static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 				   struct net_device *netdev,
 				   struct packet_type *pt)
 {
+	int source_port, switch_id, subvlan = 0;
 	struct sja1105_meta meta = {0};
-	int source_port, switch_id;
 	struct ethhdr *hdr;
-	u16 tpid, vid, tci;
 	bool is_link_local;
-	u16 subvlan = 0;
-	bool is_tagged;
 	bool is_meta;
 
 	hdr = eth_hdr(skb);
-	tpid = ntohs(hdr->h_proto);
-	is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q ||
-		     skb_vlan_tag_present(skb));
 	is_link_local = sja1105_is_link_local(skb);
 	is_meta = sja1105_is_meta_frame(skb);
 
 	skb->offload_fwd_mark = 1;
 
-	if (is_tagged) {
+	if (sja1105_skb_has_tag_8021q(skb)) {
 		/* Normal traffic path. */
-		skb_push_rcsum(skb, ETH_HLEN);
-		if (skb_vlan_tag_present(skb)) {
-			tci = skb_vlan_tag_get(skb);
-			__vlan_hwaccel_clear_tag(skb);
-		} else {
-			__skb_vlan_pop(skb, &tci);
-		}
-		skb_pull_rcsum(skb, ETH_HLEN);
-
-		vid = tci & VLAN_VID_MASK;
-		source_port = dsa_8021q_rx_source_port(vid);
-		switch_id = dsa_8021q_rx_switch_id(vid);
-		skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
-		subvlan = dsa_8021q_rx_subvlan(vid);
+		dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan);
 	} else if (is_link_local) {
 		/* Management traffic path. Switch embeds the switch ID and
 		 * port ID into bytes of the destination MAC, courtesy of
-- 
cgit v1.2.3


From 617ef8d9377b9aac381c023cd0823da264c2f463 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 22:01:28 +0300
Subject: net: dsa: sja1105: make SJA1105_SKB_CB fit a full timestamp

In SJA1105, RX timestamps for packets sent to the CPU are transmitted in
separate follow-up packets (metadata frames). These contain partial
timestamps (24 or 32 bits) which are kept in SJA1105_SKB_CB(skb)->meta_tstamp.

Thankfully, SJA1110 improved that, and the RX timestamps are now
transmitted in-band with the actual packet, in the timestamp trailer.
The RX timestamps are now full-width 64 bits.

Because we process the RX DSA tags in the rcv() method in the tagger,
but we would like to preserve the DSA code structure in that we populate
the skb timestamp in the port_rxtstamp() call which only happens later,
the implication is that we must somehow pass the 64-bit timestamp from
the rcv() method all the way to port_rxtstamp(). We can use the skb->cb
for that.

Rename the meta_tstamp from struct sja1105_skb_cb from "meta_tstamp" to
"tstamp", and increase its size to 64 bits.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_ptp.c | 2 +-
 include/linux/dsa/sja1105.h           | 2 +-
 net/dsa/tag_sja1105.c                 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index 0bc566b9e958..dea82f8a40c4 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -397,7 +397,7 @@ static long sja1105_rxtstamp_work(struct ptp_clock_info *ptp)
 
 		*shwt = (struct skb_shared_hwtstamps) {0};
 
-		ts = SJA1105_SKB_CB(skb)->meta_tstamp;
+		ts = SJA1105_SKB_CB(skb)->tstamp;
 		ts = sja1105_tstamp_reconstruct(ds, ticks, ts);
 
 		shwt->hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(ts));
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 1eb84562b311..865a548a6ef2 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -48,7 +48,7 @@ struct sja1105_tagger_data {
 
 struct sja1105_skb_cb {
 	struct sk_buff *clone;
-	u32 meta_tstamp;
+	u64 tstamp;
 };
 
 #define SJA1105_SKB_CB(skb) \
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index a70625fe64f7..11f555dd9566 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -147,7 +147,7 @@ static void sja1105_transfer_meta(struct sk_buff *skb,
 
 	hdr->h_dest[3] = meta->dmac_byte_3;
 	hdr->h_dest[4] = meta->dmac_byte_4;
-	SJA1105_SKB_CB(skb)->meta_tstamp = meta->tstamp;
+	SJA1105_SKB_CB(skb)->tstamp = meta->tstamp;
 }
 
 /* This is a simple state machine which follows the hardware mechanism of
-- 
cgit v1.2.3


From 4913b8ebf8a9c56ce66466b4daa07d7d4678cdd8 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 22:01:29 +0300
Subject: net: dsa: add support for the SJA1110 native tagging protocol

The SJA1110 has improved a few things compared to SJA1105:

- To send a control packet from the host port with SJA1105, one needed
  to program a one-shot "management route" over SPI. This is no longer
  true with SJA1110, you can actually send "in-band control extensions"
  in the packets sent by DSA, these are in fact DSA tags which contain
  the destination port and switch ID.

- When receiving a control packet from the switch with SJA1105, the
  source port and switch ID were written in bytes 3 and 4 of the
  destination MAC address of the frame (which was a very poor shot at a
  DSA header). If the control packet also had an RX timestamp, that
  timestamp was sent in an actual follow-up packet, so there were
  reordering concerns on multi-core/multi-queue DSA masters, where the
  metadata frame with the RX timestamp might get processed before the
  actual packet to which that timestamp belonged (there is no way to
  pair a packet to its timestamp other than the order in which they were
  received). On SJA1110, this is no longer true, control packets have
  the source port, switch ID and timestamp all in the DSA tags.

- Timestamps from the switch were partial: to get a 64-bit timestamp as
  required by PTP stacks, one would need to take the partial 24-bit or
  32-bit timestamp from the packet, then read the current PTP time very
  quickly, and then patch in the high bits of the current PTP time into
  the captured partial timestamp, to reconstruct what the full 64-bit
  timestamp must have been. That is awful because packet processing is
  done in NAPI context, but reading the current PTP time is done over
  SPI and therefore needs sleepable context.

But it also aggravated a few things:

- Not only is there a DSA header in SJA1110, but there is a DSA trailer
  in fact, too. So DSA needs to be extended to support taggers which
  have both a header and a trailer. Very unconventional - my understanding
  is that the trailer exists because the timestamps couldn't be prepared
  in time for putting them in the header area.

- Like SJA1105, not all packets sent to the CPU have the DSA tag added
  to them, only control packets do:

  * the ones which match the destination MAC filters/traps in
    MAC_FLTRES1 and MAC_FLTRES0
  * the ones which match FDB entries which have TRAP or TAKETS bits set

  So we could in theory hack something up to request the switch to take
  timestamps for all packets that reach the CPU, and those would be
  DSA-tagged and contain the source port / switch ID by virtue of the
  fact that there needs to be a timestamp trailer provided. BUT:

- The SJA1110 does not parse its own DSA tags in a way that is useful
  for routing in cross-chip topologies, a la Marvell. And the sja1105
  driver already supports cross-chip bridging from the SJA1105 days.
  It does that by automatically setting up the DSA links as VLAN trunks
  which contain all the necessary tag_8021q RX VLANs that must be
  communicated between the switches that span the same bridge. So when
  using tag_8021q on sja1105, it is possible to have 2 switches with
  ports sw0p0, sw0p1, sw1p0, sw1p1, and 2 VLAN-unaware bridges br0 and
  br1, and br0 can take sw0p0 and sw1p0, and br1 can take sw0p1 and
  sw1p1, and forwarding will happen according to the expected rules of
  the Linux bridge.
  We like that, and we don't want that to go away, so as a matter of
  fact, the SJA1110 tagger still needs to support tag_8021q.

So the sja1110 tagger is a hybrid between tag_8021q for data packets,
and the native hardware support for control packets.

On RX, packets have a 13-byte trailer if they contain an RX timestamp.
That trailer is padded in such a way that its byte 8 (the start of the
"residence time" field - not parsed by Linux because we don't care) is
aligned on a 16 byte boundary. So the padding has a variable length
between 0 and 15 bytes. The DSA header contains the offset of the
beginning of the padding relative to the beginning of the frame (and the
end of the padding is obviously the end of the packet minus 13 bytes,
the length of the trailer). So we discard it.

Packets which don't have a trailer contain the source port and switch ID
information in the header (they are "trap-to-host" packets). Packets
which have a trailer contain the source port and switch ID in the trailer.

On TX, the destination port mask and switch ID is always in the trailer,
so we always need to say in the header that a trailer is present.

The header needs a custom EtherType and this was chosen as 0xdadc, after
0xdada which is for Marvell and 0xdadb which is for VLANs in
VLAN-unaware mode on SJA1105 (and SJA1110 in fact too).

Because we use tag_8021q in concert with the native tagging protocol,
control packets will have 2 DSA tags.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h               |   1 +
 drivers/net/dsa/sja1105/sja1105_main.c          |   6 +-
 drivers/net/dsa/sja1105/sja1105_spi.c           |  10 ++
 drivers/net/dsa/sja1105/sja1105_static_config.c |   1 +
 drivers/net/dsa/sja1105/sja1105_static_config.h |   1 +
 include/linux/dsa/sja1105.h                     |   1 +
 include/net/dsa.h                               |   2 +
 net/dsa/tag_sja1105.c                           | 221 +++++++++++++++++++++++-
 8 files changed, 240 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 4d192331754c..a6d64b27e6a9 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -110,6 +110,7 @@ struct sja1105_info {
 	int max_frame_mem;
 	int num_ports;
 	bool multiple_cascade_ports;
+	enum dsa_tag_protocol tag_proto;
 	const struct sja1105_dynamic_table_ops *dyn_ops;
 	const struct sja1105_table_ops *static_ops;
 	const struct sja1105_regs *regs;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 850bbc793369..6e2cfbf605ef 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -667,6 +667,8 @@ static int sja1105_init_general_params(struct sja1105_private *priv)
 		.tpid2 = ETH_P_SJA1105,
 		/* Enable the TTEthernet engine on SJA1110 */
 		.tte_en = true,
+		/* Set up the EtherType for control packets on SJA1110 */
+		.header_type = ETH_P_SJA1110,
 	};
 	struct sja1105_general_params_entry *general_params;
 	struct dsa_switch *ds = priv->ds;
@@ -2174,7 +2176,9 @@ static enum dsa_tag_protocol
 sja1105_get_tag_protocol(struct dsa_switch *ds, int port,
 			 enum dsa_tag_protocol mp)
 {
-	return DSA_TAG_PROTO_SJA1105;
+	struct sja1105_private *priv = ds->priv;
+
+	return priv->info->tag_proto;
 }
 
 static int sja1105_find_free_subvlan(u16 *subvlan_map, bool pvid)
diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c
index e6c2a37aa617..9156f4cc11f2 100644
--- a/drivers/net/dsa/sja1105/sja1105_spi.c
+++ b/drivers/net/dsa/sja1105/sja1105_spi.c
@@ -569,6 +569,7 @@ const struct sja1105_info sja1105e_info = {
 	.static_ops		= sja1105e_table_ops,
 	.dyn_ops		= sja1105et_dyn_ops,
 	.qinq_tpid		= ETH_P_8021Q,
+	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= false,
 	.ptp_ts_bits		= 24,
 	.ptpegr_ts_bytes	= 4,
@@ -600,6 +601,7 @@ const struct sja1105_info sja1105t_info = {
 	.static_ops		= sja1105t_table_ops,
 	.dyn_ops		= sja1105et_dyn_ops,
 	.qinq_tpid		= ETH_P_8021Q,
+	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= false,
 	.ptp_ts_bits		= 24,
 	.ptpegr_ts_bytes	= 4,
@@ -631,6 +633,7 @@ const struct sja1105_info sja1105p_info = {
 	.static_ops		= sja1105p_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
 	.qinq_tpid		= ETH_P_8021AD,
+	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= true,
 	.ptp_ts_bits		= 32,
 	.ptpegr_ts_bytes	= 8,
@@ -663,6 +666,7 @@ const struct sja1105_info sja1105q_info = {
 	.static_ops		= sja1105q_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
 	.qinq_tpid		= ETH_P_8021AD,
+	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= true,
 	.ptp_ts_bits		= 32,
 	.ptpegr_ts_bytes	= 8,
@@ -695,6 +699,7 @@ const struct sja1105_info sja1105r_info = {
 	.static_ops		= sja1105r_table_ops,
 	.dyn_ops		= sja1105pqrs_dyn_ops,
 	.qinq_tpid		= ETH_P_8021AD,
+	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= true,
 	.ptp_ts_bits		= 32,
 	.ptpegr_ts_bytes	= 8,
@@ -729,6 +734,7 @@ const struct sja1105_info sja1105s_info = {
 	.dyn_ops		= sja1105pqrs_dyn_ops,
 	.regs			= &sja1105pqrs_regs,
 	.qinq_tpid		= ETH_P_8021AD,
+	.tag_proto		= DSA_TAG_PROTO_SJA1105,
 	.can_limit_mcast_flood	= true,
 	.ptp_ts_bits		= 32,
 	.ptpegr_ts_bytes	= 8,
@@ -762,6 +768,7 @@ const struct sja1105_info sja1110a_info = {
 	.dyn_ops		= sja1110_dyn_ops,
 	.regs			= &sja1110_regs,
 	.qinq_tpid		= ETH_P_8021AD,
+	.tag_proto		= DSA_TAG_PROTO_SJA1110,
 	.can_limit_mcast_flood	= true,
 	.multiple_cascade_ports	= true,
 	.ptp_ts_bits		= 32,
@@ -808,6 +815,7 @@ const struct sja1105_info sja1110b_info = {
 	.dyn_ops		= sja1110_dyn_ops,
 	.regs			= &sja1110_regs,
 	.qinq_tpid		= ETH_P_8021AD,
+	.tag_proto		= DSA_TAG_PROTO_SJA1110,
 	.can_limit_mcast_flood	= true,
 	.multiple_cascade_ports	= true,
 	.ptp_ts_bits		= 32,
@@ -854,6 +862,7 @@ const struct sja1105_info sja1110c_info = {
 	.dyn_ops		= sja1110_dyn_ops,
 	.regs			= &sja1110_regs,
 	.qinq_tpid		= ETH_P_8021AD,
+	.tag_proto		= DSA_TAG_PROTO_SJA1110,
 	.can_limit_mcast_flood	= true,
 	.multiple_cascade_ports	= true,
 	.ptp_ts_bits		= 32,
@@ -900,6 +909,7 @@ const struct sja1105_info sja1110d_info = {
 	.dyn_ops		= sja1110_dyn_ops,
 	.regs			= &sja1110_regs,
 	.qinq_tpid		= ETH_P_8021AD,
+	.tag_proto		= DSA_TAG_PROTO_SJA1110,
 	.can_limit_mcast_flood	= true,
 	.multiple_cascade_ports	= true,
 	.ptp_ts_bits		= 32,
diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c
index eda571819d45..1491b72008f3 100644
--- a/drivers/net/dsa/sja1105/sja1105_static_config.c
+++ b/drivers/net/dsa/sja1105/sja1105_static_config.c
@@ -212,6 +212,7 @@ size_t sja1110_general_params_entry_packing(void *buf, void *entry_ptr,
 	sja1105_packing(buf, &entry->egrmirrdei,   110, 110, size, op);
 	sja1105_packing(buf, &entry->replay_port,  109, 106, size, op);
 	sja1105_packing(buf, &entry->tdmaconfigidx, 70,  67, size, op);
+	sja1105_packing(buf, &entry->header_type,   64,  49, size, op);
 	sja1105_packing(buf, &entry->tte_en,        16,  16, size, op);
 	return size;
 }
diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.h b/drivers/net/dsa/sja1105/sja1105_static_config.h
index 9bef51791bff..bce0f5c03d0b 100644
--- a/drivers/net/dsa/sja1105/sja1105_static_config.h
+++ b/drivers/net/dsa/sja1105/sja1105_static_config.h
@@ -217,6 +217,7 @@ struct sja1105_general_params_entry {
 	/* SJA1110 only */
 	u64 tte_en;
 	u64 tdmaconfigidx;
+	u64 header_type;
 };
 
 struct sja1105_schedule_entry_points_entry {
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 865a548a6ef2..b02cf7b515ae 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -14,6 +14,7 @@
 
 #define ETH_P_SJA1105				ETH_P_DSA_8021Q
 #define ETH_P_SJA1105_META			0x0008
+#define ETH_P_SJA1110				0xdadc
 
 /* IEEE 802.3 Annex 57A: Slow Protocols PDUs (01:80:C2:xx:xx:xx) */
 #define SJA1105_LINKLOCAL_FILTER_A		0x0180C2000000ull
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0a10f6fffc3d..289d68e82da0 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -50,6 +50,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE	20
 #define DSA_TAG_PROTO_SEVILLE_VALUE		21
 #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE		22
+#define DSA_TAG_PROTO_SJA1110_VALUE		23
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -75,6 +76,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_XRS700X		= DSA_TAG_PROTO_XRS700X_VALUE,
 	DSA_TAG_PROTO_OCELOT_8021Q	= DSA_TAG_PROTO_OCELOT_8021Q_VALUE,
 	DSA_TAG_PROTO_SEVILLE		= DSA_TAG_PROTO_SEVILLE_VALUE,
+	DSA_TAG_PROTO_SJA1110		= DSA_TAG_PROTO_SJA1110_VALUE,
 };
 
 struct packet_type;
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 11f555dd9566..37e1d64e07c6 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -7,6 +7,47 @@
 #include <linux/packing.h>
 #include "dsa_priv.h"
 
+/* Is this a TX or an RX header? */
+#define SJA1110_HEADER_HOST_TO_SWITCH		BIT(15)
+
+/* RX header */
+#define SJA1110_RX_HEADER_IS_METADATA		BIT(14)
+#define SJA1110_RX_HEADER_HOST_ONLY		BIT(13)
+#define SJA1110_RX_HEADER_HAS_TRAILER		BIT(12)
+
+/* Trap-to-host format (no trailer present) */
+#define SJA1110_RX_HEADER_SRC_PORT(x)		(((x) & GENMASK(7, 4)) >> 4)
+#define SJA1110_RX_HEADER_SWITCH_ID(x)		((x) & GENMASK(3, 0))
+
+/* Timestamp format (trailer present) */
+#define SJA1110_RX_HEADER_TRAILER_POS(x)	((x) & GENMASK(11, 0))
+
+#define SJA1110_RX_TRAILER_SWITCH_ID(x)		(((x) & GENMASK(7, 4)) >> 4)
+#define SJA1110_RX_TRAILER_SRC_PORT(x)		((x) & GENMASK(3, 0))
+
+/* TX header */
+#define SJA1110_TX_HEADER_UPDATE_TC		BIT(14)
+#define SJA1110_TX_HEADER_TAKE_TS		BIT(13)
+#define SJA1110_TX_HEADER_TAKE_TS_CASC		BIT(12)
+#define SJA1110_TX_HEADER_HAS_TRAILER		BIT(11)
+
+/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is false */
+#define SJA1110_TX_HEADER_PRIO(x)		(((x) << 7) & GENMASK(10, 7))
+#define SJA1110_TX_HEADER_TSTAMP_ID(x)		((x) & GENMASK(7, 0))
+
+/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is true */
+#define SJA1110_TX_HEADER_TRAILER_POS(x)	((x) & GENMASK(10, 0))
+
+#define SJA1110_TX_TRAILER_TSTAMP_ID(x)		(((x) << 24) & GENMASK(31, 24))
+#define SJA1110_TX_TRAILER_PRIO(x)		(((x) << 21) & GENMASK(23, 21))
+#define SJA1110_TX_TRAILER_SWITCHID(x)		(((x) << 12) & GENMASK(15, 12))
+#define SJA1110_TX_TRAILER_DESTPORTS(x)		(((x) << 1) & GENMASK(11, 1))
+
+#define SJA1110_HEADER_LEN			4
+#define SJA1110_RX_TRAILER_LEN			13
+#define SJA1110_TX_TRAILER_LEN			4
+#define SJA1110_MAX_PADDING_LEN			15
+
 /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */
 static inline bool sja1105_is_link_local(const struct sk_buff *skb)
 {
@@ -140,6 +181,50 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 			     ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
 }
 
+static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
+				    struct net_device *netdev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index);
+	u16 queue_mapping = skb_get_queue_mapping(skb);
+	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
+	struct ethhdr *eth_hdr;
+	__be32 *tx_trailer;
+	__be16 *tx_header;
+	int trailer_pos;
+
+	/* Transmitting control packets is done using in-band control
+	 * extensions, while data packets are transmitted using
+	 * tag_8021q TX VLANs.
+	 */
+	if (likely(!sja1105_is_link_local(skb)))
+		return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv),
+				     ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
+
+	skb_push(skb, SJA1110_HEADER_LEN);
+
+	/* Move Ethernet header to the left, making space for DSA tag */
+	memmove(skb->data, skb->data + SJA1110_HEADER_LEN, 2 * ETH_ALEN);
+
+	trailer_pos = skb->len;
+
+	/* On TX, skb->data points to skb_mac_header(skb) */
+	eth_hdr = (struct ethhdr *)skb->data;
+	tx_header = (__be16 *)(eth_hdr + 1);
+	tx_trailer = skb_put(skb, SJA1110_TX_TRAILER_LEN);
+
+	eth_hdr->h_proto = htons(ETH_P_SJA1110);
+
+	*tx_header = htons(SJA1110_HEADER_HOST_TO_SWITCH |
+			   SJA1110_TX_HEADER_HAS_TRAILER |
+			   SJA1110_TX_HEADER_TRAILER_POS(trailer_pos));
+	*tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) |
+				  SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) |
+				  SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index)));
+
+	return skb;
+}
+
 static void sja1105_transfer_meta(struct sk_buff *skb,
 				  const struct sja1105_meta *meta)
 {
@@ -283,6 +368,11 @@ static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb)
 	       skb_vlan_tag_present(skb);
 }
 
+static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb)
+{
+	return ntohs(eth_hdr(skb)->h_proto) == ETH_P_SJA1110;
+}
+
 static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 				   struct net_device *netdev,
 				   struct packet_type *pt)
@@ -333,6 +423,98 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 					      is_meta);
 }
 
+static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
+							    int *source_port,
+							    int *switch_id)
+{
+	u16 rx_header;
+
+	if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN)))
+		return NULL;
+
+	/* skb->data points to skb_mac_header(skb) + ETH_HLEN, which is exactly
+	 * what we need because the caller has checked the EtherType (which is
+	 * located 2 bytes back) and we just need a pointer to the header that
+	 * comes afterwards.
+	 */
+	rx_header = ntohs(*(__be16 *)skb->data);
+
+	/* Timestamp frame, we have a trailer */
+	if (rx_header & SJA1110_RX_HEADER_HAS_TRAILER) {
+		int start_of_padding = SJA1110_RX_HEADER_TRAILER_POS(rx_header);
+		u8 *rx_trailer = skb_tail_pointer(skb) - SJA1110_RX_TRAILER_LEN;
+		u64 *tstamp = &SJA1105_SKB_CB(skb)->tstamp;
+		u8 last_byte = rx_trailer[12];
+
+		/* The timestamp is unaligned, so we need to use packing()
+		 * to get it
+		 */
+		packing(rx_trailer, tstamp, 63, 0, 8, UNPACK, 0);
+
+		*source_port = SJA1110_RX_TRAILER_SRC_PORT(last_byte);
+		*switch_id = SJA1110_RX_TRAILER_SWITCH_ID(last_byte);
+
+		/* skb->len counts from skb->data, while start_of_padding
+		 * counts from the destination MAC address. Right now skb->data
+		 * is still as set by the DSA master, so to trim away the
+		 * padding and trailer we need to account for the fact that
+		 * skb->data points to skb_mac_header(skb) + ETH_HLEN.
+		 */
+		pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN);
+	/* Trap-to-host frame, no timestamp trailer */
+	} else {
+		*source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header);
+		*switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header);
+	}
+
+	/* Advance skb->data past the DSA header */
+	skb_pull_rcsum(skb, SJA1110_HEADER_LEN);
+
+	/* Remove the DSA header */
+	memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - SJA1110_HEADER_LEN,
+		2 * ETH_ALEN);
+
+	/* With skb->data in its final place, update the MAC header
+	 * so that eth_hdr() continues to works properly.
+	 */
+	skb_set_mac_header(skb, -ETH_HLEN);
+
+	return skb;
+}
+
+static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
+				   struct net_device *netdev,
+				   struct packet_type *pt)
+{
+	int source_port = -1, switch_id = -1, subvlan = 0;
+
+	skb->offload_fwd_mark = 1;
+
+	if (sja1110_skb_has_inband_control_extension(skb)) {
+		skb = sja1110_rcv_inband_control_extension(skb, &source_port,
+							   &switch_id);
+		if (!skb)
+			return NULL;
+	}
+
+	/* Packets with in-band control extensions might still have RX VLANs */
+	if (likely(sja1105_skb_has_tag_8021q(skb)))
+		dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan);
+
+	skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);
+	if (!skb->dev) {
+		netdev_warn(netdev,
+			    "Couldn't decode source port %d and switch id %d\n",
+			    source_port, switch_id);
+		return NULL;
+	}
+
+	if (subvlan)
+		sja1105_decode_subvlan(skb, subvlan);
+
+	return skb;
+}
+
 static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto,
 				 int *offset)
 {
@@ -343,6 +525,20 @@ static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto,
 	dsa_tag_generic_flow_dissect(skb, proto, offset);
 }
 
+static void sja1110_flow_dissect(const struct sk_buff *skb, __be16 *proto,
+				 int *offset)
+{
+	/* Management frames have 2 DSA tags on RX, so the needed_headroom we
+	 * declared is fine for the generic dissector adjustment procedure.
+	 */
+	if (unlikely(sja1105_is_link_local(skb)))
+		return dsa_tag_generic_flow_dissect(skb, proto, offset);
+
+	/* For the rest, there is a single DSA tag, the tag_8021q one */
+	*offset = VLAN_HLEN;
+	*proto = ((__be16 *)skb->data)[(VLAN_HLEN / 2) - 1];
+}
+
 static const struct dsa_device_ops sja1105_netdev_ops = {
 	.name = "sja1105",
 	.proto = DSA_TAG_PROTO_SJA1105,
@@ -354,7 +550,28 @@ static const struct dsa_device_ops sja1105_netdev_ops = {
 	.promisc_on_master = true,
 };
 
-MODULE_LICENSE("GPL v2");
+DSA_TAG_DRIVER(sja1105_netdev_ops);
 MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105);
 
-module_dsa_tag_driver(sja1105_netdev_ops);
+static const struct dsa_device_ops sja1110_netdev_ops = {
+	.name = "sja1110",
+	.proto = DSA_TAG_PROTO_SJA1110,
+	.xmit = sja1110_xmit,
+	.rcv = sja1110_rcv,
+	.filter = sja1105_filter,
+	.flow_dissect = sja1110_flow_dissect,
+	.needed_headroom = SJA1110_HEADER_LEN + VLAN_HLEN,
+	.needed_tailroom = SJA1110_RX_TRAILER_LEN + SJA1110_MAX_PADDING_LEN,
+};
+
+DSA_TAG_DRIVER(sja1110_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110);
+
+static struct dsa_tag_driver *sja1105_tag_driver_array[] = {
+	&DSA_TAG_DRIVER_NAME(sja1105_netdev_ops),
+	&DSA_TAG_DRIVER_NAME(sja1110_netdev_ops),
+};
+
+module_dsa_tag_drivers(sja1105_tag_driver_array);
+
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From 566b18c8b752f67c4e82f0eb4563dd71f84a8799 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 22:01:31 +0300
Subject: net: dsa: sja1105: implement TX timestamping for SJA1110

The TX timestamping procedure for SJA1105 is a bit unconventional
because the transmit procedure itself is unconventional.

Control packets (and therefore PTP as well) are transmitted to a
specific port in SJA1105 using "management routes" which must be written
over SPI to the switch. These are one-shot rules that match by
destination MAC address on traffic coming from the CPU port, and select
the precise destination port for that packet. So to transmit a packet
from NET_TX softirq context, we actually need to defer to a process
context so that we can perform that SPI write before we send the packet.
The DSA master dev_queue_xmit() runs in process context, and we poll
until the switch confirms it took the TX timestamp, then we annotate the
skb clone with that TX timestamp. This is why the sja1105 driver does
not need an skb queue for TX timestamping.

But the SJA1110 is a bit (not much!) more conventional, and you can
request 2-step TX timestamping through the DSA header, as well as give
the switch a cookie (timestamp ID) which it will give back to you when
it has the timestamp. So now we do need a queue for keeping the skb
clones until their TX timestamps become available.

The interesting part is that the metadata frames from SJA1105 haven't
disappeared completely. On SJA1105 they were used as follow-ups which
contained RX timestamps, but on SJA1110 they are actually TX completion
packets, which contain a variable (up to 32) array of timestamps.
Why an array? Because:
- not only is the TX timestamp on the egress port being communicated,
  but also the RX timestamp on the CPU port. Nice, but we don't care
  about that, so we ignore it.
- because a packet could be multicast to multiple egress ports, each
  port takes its own timestamp, and the TX completion packet contains
  the individual timestamps on each port.

This is unconventional because switches typically have a timestamping
FIFO and raise an interrupt, but this one doesn't. So the tagger needs
to detect and parse meta frames, and call into the main switch driver,
which pairs the timestamps with the skbs in the TX timestamping queue
which are waiting for one.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105.h     |  1 +
 drivers/net/dsa/sja1105/sja1105_ptp.c | 69 +++++++++++++++++++++++++++++++++++
 drivers/net/dsa/sja1105/sja1105_ptp.h |  7 ++++
 drivers/net/dsa/sja1105/sja1105_spi.c |  4 ++
 include/linux/dsa/sja1105.h           | 23 ++++++++++++
 net/dsa/tag_sja1105.c                 | 52 ++++++++++++++++++++++++++
 6 files changed, 156 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index 201bca282884..5f3449351668 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -131,6 +131,7 @@ struct sja1105_info {
 	void (*ptp_cmd_packing)(u8 *buf, struct sja1105_ptp_cmd *cmd,
 				enum packing_op op);
 	bool (*rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb);
+	void (*txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb);
 	int (*clocking_setup)(struct sja1105_private *priv);
 	const char *name;
 	bool supports_mii[SJA1105_MAX_NUM_PORTS];
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index 62fe05b4cb60..691f6dd7e669 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -79,6 +79,7 @@ static int sja1105_change_rxtstamping(struct sja1105_private *priv,
 		priv->tagger_data.stampable_skb = NULL;
 	}
 	ptp_cancel_worker_sync(ptp_data->clock);
+	skb_queue_purge(&ptp_data->skb_txtstamp_queue);
 	skb_queue_purge(&ptp_data->skb_rxtstamp_queue);
 
 	return sja1105_static_config_reload(priv, SJA1105_RX_HWTSTAMPING);
@@ -451,6 +452,67 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port,
 	return priv->info->rxtstamp(ds, port, skb);
 }
 
+void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id,
+				 enum sja1110_meta_tstamp dir, u64 tstamp)
+{
+	struct sja1105_private *priv = ds->priv;
+	struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
+	struct sk_buff *skb, *skb_tmp, *skb_match = NULL;
+	struct skb_shared_hwtstamps shwt = {0};
+
+	/* We don't care about RX timestamps on the CPU port */
+	if (dir == SJA1110_META_TSTAMP_RX)
+		return;
+
+	spin_lock(&ptp_data->skb_txtstamp_queue.lock);
+
+	skb_queue_walk_safe(&ptp_data->skb_txtstamp_queue, skb, skb_tmp) {
+		if (SJA1105_SKB_CB(skb)->ts_id != ts_id)
+			continue;
+
+		__skb_unlink(skb, &ptp_data->skb_txtstamp_queue);
+		skb_match = skb;
+
+		break;
+	}
+
+	spin_unlock(&ptp_data->skb_txtstamp_queue.lock);
+
+	if (WARN_ON(!skb_match))
+		return;
+
+	shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp));
+	skb_complete_tx_timestamp(skb_match, &shwt);
+}
+EXPORT_SYMBOL_GPL(sja1110_process_meta_tstamp);
+
+/* In addition to cloning the skb which is done by the common
+ * sja1105_port_txtstamp, we need to generate a timestamp ID and save the
+ * packet to the TX timestamping queue.
+ */
+void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb)
+{
+	struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone;
+	struct sja1105_private *priv = ds->priv;
+	struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
+	struct sja1105_port *sp = &priv->ports[port];
+	u8 ts_id;
+
+	skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+
+	spin_lock(&sp->data->meta_lock);
+
+	ts_id = sp->data->ts_id;
+	/* Deal automatically with 8-bit wraparound */
+	sp->data->ts_id++;
+
+	SJA1105_SKB_CB(clone)->ts_id = ts_id;
+
+	spin_unlock(&sp->data->meta_lock);
+
+	skb_queue_tail(&ptp_data->skb_txtstamp_queue, clone);
+}
+
 /* Called from dsa_skb_tx_timestamp. This callback is just to clone
  * the skb and have it available in SJA1105_SKB_CB in the .port_deferred_xmit
  * callback, where we will timestamp it synchronously.
@@ -469,6 +531,9 @@ void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb)
 		return;
 
 	SJA1105_SKB_CB(skb)->clone = clone;
+
+	if (priv->info->txtstamp)
+		priv->info->txtstamp(ds, port, skb);
 }
 
 static int sja1105_ptp_reset(struct dsa_switch *ds)
@@ -885,7 +950,10 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds)
 		.n_per_out	= 1,
 	};
 
+	/* Only used on SJA1105 */
 	skb_queue_head_init(&ptp_data->skb_rxtstamp_queue);
+	/* Only used on SJA1110 */
+	skb_queue_head_init(&ptp_data->skb_txtstamp_queue);
 	spin_lock_init(&tagger_data->meta_lock);
 
 	ptp_data->clock = ptp_clock_register(&ptp_data->caps, ds->dev);
@@ -910,6 +978,7 @@ void sja1105_ptp_clock_unregister(struct dsa_switch *ds)
 
 	del_timer_sync(&ptp_data->extts_timer);
 	ptp_cancel_worker_sync(ptp_data->clock);
+	skb_queue_purge(&ptp_data->skb_txtstamp_queue);
 	skb_queue_purge(&ptp_data->skb_rxtstamp_queue);
 	ptp_clock_unregister(ptp_data->clock);
 	ptp_data->clock = NULL;
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h
index bf0c4f1dfed7..3c874bb4c17b 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.h
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.h
@@ -75,7 +75,12 @@ struct sja1105_ptp_cmd {
 
 struct sja1105_ptp_data {
 	struct timer_list extts_timer;
+	/* Used only on SJA1105 to reconstruct partial timestamps */
 	struct sk_buff_head skb_rxtstamp_queue;
+	/* Used on SJA1110 where meta frames are generated only for
+	 * 2-step TX timestamps
+	 */
+	struct sk_buff_head skb_txtstamp_queue;
 	struct ptp_clock_info caps;
 	struct ptp_clock *clock;
 	struct sja1105_ptp_cmd cmd;
@@ -124,6 +129,7 @@ int sja1105_ptp_commit(struct dsa_switch *ds, struct sja1105_ptp_cmd *cmd,
 
 bool sja1105_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb);
 bool sja1110_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb);
+void sja1110_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb);
 
 #else
 
@@ -189,6 +195,7 @@ static inline int sja1105_ptp_commit(struct dsa_switch *ds,
 
 #define sja1105_rxtstamp NULL
 #define sja1110_rxtstamp NULL
+#define sja1110_txtstamp NULL
 
 #endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */
 
diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c
index f7dd86271891..32d00212423c 100644
--- a/drivers/net/dsa/sja1105/sja1105_spi.c
+++ b/drivers/net/dsa/sja1105/sja1105_spi.c
@@ -788,6 +788,7 @@ const struct sja1105_info sja1110a_info = {
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
 	.ptp_cmd_packing	= sja1105pqrs_ptp_cmd_packing,
 	.rxtstamp		= sja1110_rxtstamp,
+	.txtstamp		= sja1110_txtstamp,
 	.clocking_setup		= sja1110_clocking_setup,
 	.port_speed		= {
 		[SJA1105_SPEED_AUTO] = 0,
@@ -836,6 +837,7 @@ const struct sja1105_info sja1110b_info = {
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
 	.ptp_cmd_packing	= sja1105pqrs_ptp_cmd_packing,
 	.rxtstamp		= sja1110_rxtstamp,
+	.txtstamp		= sja1110_txtstamp,
 	.clocking_setup		= sja1110_clocking_setup,
 	.port_speed		= {
 		[SJA1105_SPEED_AUTO] = 0,
@@ -884,6 +886,7 @@ const struct sja1105_info sja1110c_info = {
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
 	.ptp_cmd_packing	= sja1105pqrs_ptp_cmd_packing,
 	.rxtstamp		= sja1110_rxtstamp,
+	.txtstamp		= sja1110_txtstamp,
 	.clocking_setup		= sja1110_clocking_setup,
 	.port_speed		= {
 		[SJA1105_SPEED_AUTO] = 0,
@@ -932,6 +935,7 @@ const struct sja1105_info sja1110d_info = {
 	.fdb_del_cmd		= sja1105pqrs_fdb_del,
 	.ptp_cmd_packing	= sja1105pqrs_ptp_cmd_packing,
 	.rxtstamp		= sja1110_rxtstamp,
+	.txtstamp		= sja1110_txtstamp,
 	.clocking_setup		= sja1110_clocking_setup,
 	.port_speed		= {
 		[SJA1105_SPEED_AUTO] = 0,
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index b02cf7b515ae..b6089b88314c 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -45,11 +45,14 @@ struct sja1105_tagger_data {
 	 */
 	spinlock_t meta_lock;
 	unsigned long state;
+	u8 ts_id;
 };
 
 struct sja1105_skb_cb {
 	struct sk_buff *clone;
 	u64 tstamp;
+	/* Only valid for packets cloned for 2-step TX timestamping */
+	u8 ts_id;
 };
 
 #define SJA1105_SKB_CB(skb) \
@@ -66,4 +69,24 @@ struct sja1105_port {
 	u16 xmit_tpid;
 };
 
+enum sja1110_meta_tstamp {
+	SJA1110_META_TSTAMP_TX = 0,
+	SJA1110_META_TSTAMP_RX = 1,
+};
+
+#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP)
+
+void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id,
+				 enum sja1110_meta_tstamp dir, u64 tstamp);
+
+#else
+
+static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port,
+					       u8 ts_id, enum sja1110_meta_tstamp dir,
+					       u64 tstamp)
+{
+}
+
+#endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */
+
 #endif /* _NET_DSA_SJA1105_H */
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 37e1d64e07c6..9c2df9ece01b 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -25,6 +25,9 @@
 #define SJA1110_RX_TRAILER_SWITCH_ID(x)		(((x) & GENMASK(7, 4)) >> 4)
 #define SJA1110_RX_TRAILER_SRC_PORT(x)		((x) & GENMASK(3, 0))
 
+/* Meta frame format (for 2-step TX timestamps) */
+#define SJA1110_RX_HEADER_N_TS(x)		(((x) & GENMASK(8, 4)) >> 4)
+
 /* TX header */
 #define SJA1110_TX_HEADER_UPDATE_TC		BIT(14)
 #define SJA1110_TX_HEADER_TAKE_TS		BIT(13)
@@ -43,6 +46,8 @@
 #define SJA1110_TX_TRAILER_SWITCHID(x)		(((x) << 12) & GENMASK(15, 12))
 #define SJA1110_TX_TRAILER_DESTPORTS(x)		(((x) << 1) & GENMASK(11, 1))
 
+#define SJA1110_META_TSTAMP_SIZE		10
+
 #define SJA1110_HEADER_LEN			4
 #define SJA1110_RX_TRAILER_LEN			13
 #define SJA1110_TX_TRAILER_LEN			4
@@ -184,6 +189,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
 static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
+	struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone;
 	struct dsa_port *dp = dsa_slave_to_port(netdev);
 	u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
@@ -221,6 +227,12 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
 	*tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) |
 				  SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) |
 				  SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index)));
+	if (clone) {
+		u8 ts_id = SJA1105_SKB_CB(clone)->ts_id;
+
+		*tx_header |= htons(SJA1110_TX_HEADER_TAKE_TS);
+		*tx_trailer |= cpu_to_be32(SJA1110_TX_TRAILER_TSTAMP_ID(ts_id));
+	}
 
 	return skb;
 }
@@ -423,6 +435,43 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 					      is_meta);
 }
 
+static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
+{
+	int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header);
+	int n_ts = SJA1110_RX_HEADER_N_TS(rx_header);
+	struct net_device *master = skb->dev;
+	struct dsa_port *cpu_dp;
+	u8 *buf = skb->data + 2;
+	struct dsa_switch *ds;
+	int i;
+
+	cpu_dp = master->dsa_ptr;
+	ds = dsa_switch_find(cpu_dp->dst->index, switch_id);
+	if (!ds) {
+		net_err_ratelimited("%s: cannot find switch id %d\n",
+				    master->name, switch_id);
+		return NULL;
+	}
+
+	for (i = 0; i <= n_ts; i++) {
+		u8 ts_id, source_port, dir;
+		u64 tstamp;
+
+		ts_id = buf[0];
+		source_port = (buf[1] & GENMASK(7, 4)) >> 4;
+		dir = (buf[1] & BIT(3)) >> 3;
+		tstamp = be64_to_cpu(*(__be64 *)(buf + 2));
+
+		sja1110_process_meta_tstamp(ds, source_port, ts_id, dir,
+					    tstamp);
+
+		buf += SJA1110_META_TSTAMP_SIZE;
+	}
+
+	/* Discard the meta frame, we've consumed the timestamps it contained */
+	return NULL;
+}
+
 static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 							    int *source_port,
 							    int *switch_id)
@@ -439,6 +488,9 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
 	 */
 	rx_header = ntohs(*(__be16 *)skb->data);
 
+	if (rx_header & SJA1110_RX_HEADER_IS_METADATA)
+		return sja1110_rcv_meta(skb, rx_header);
+
 	/* Timestamp frame, we have a trailer */
 	if (rx_header & SJA1110_RX_HEADER_HAS_TRAILER) {
 		int start_of_padding = SJA1110_RX_HEADER_TRAILER_POS(rx_header);
-- 
cgit v1.2.3


From 0fb16976765143cf0d7d0dd78b3f406ab135c494 Mon Sep 17 00:00:00 2001
From: Calvin Johnson <calvin.johnson@oss.nxp.com>
Date: Fri, 11 Jun 2021 13:53:48 +0300
Subject: net: phy: Introduce fwnode_mdio_find_device()

Define fwnode_mdio_find_device() to get a pointer to the
mdio_device from fwnode passed to the function.

Refactor of_mdio_find_device() to use fwnode_mdio_find_device().

Signed-off-by: Calvin Johnson <calvin.johnson@oss.nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Acked-by: Grant Likely <grant.likely@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mdio/of_mdio.c   | 11 +----------
 drivers/net/phy/phy_device.c | 23 +++++++++++++++++++++++
 include/linux/phy.h          |  7 +++++++
 3 files changed, 31 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c
index 8e97d5b825f5..6ef8b6e40189 100644
--- a/drivers/net/mdio/of_mdio.c
+++ b/drivers/net/mdio/of_mdio.c
@@ -347,16 +347,7 @@ EXPORT_SYMBOL(of_mdiobus_register);
  */
 struct mdio_device *of_mdio_find_device(struct device_node *np)
 {
-	struct device *d;
-
-	if (!np)
-		return NULL;
-
-	d = bus_find_device_by_of_node(&mdio_bus_type, np);
-	if (!d)
-		return NULL;
-
-	return to_mdio_device(d);
+	return fwnode_mdio_find_device(of_fwnode_handle(np));
 }
 EXPORT_SYMBOL(of_mdio_find_device);
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 495d86b4af7c..dca454b5c209 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2875,6 +2875,29 @@ static bool phy_drv_supports_irq(struct phy_driver *phydrv)
 	return phydrv->config_intr && phydrv->handle_interrupt;
 }
 
+/**
+ * fwnode_mdio_find_device - Given a fwnode, find the mdio_device
+ * @fwnode: pointer to the mdio_device's fwnode
+ *
+ * If successful, returns a pointer to the mdio_device with the embedded
+ * struct device refcount incremented by one, or NULL on failure.
+ * The caller should call put_device() on the mdio_device after its use.
+ */
+struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode)
+{
+	struct device *d;
+
+	if (!fwnode)
+		return NULL;
+
+	d = bus_find_device_by_fwnode(&mdio_bus_type, fwnode);
+	if (!d)
+		return NULL;
+
+	return to_mdio_device(d);
+}
+EXPORT_SYMBOL(fwnode_mdio_find_device);
+
 /**
  * phy_probe - probe and init a PHY device
  * @dev: device to probe and init
diff --git a/include/linux/phy.h b/include/linux/phy.h
index ed332ac92e25..7aa97f4e5387 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1377,10 +1377,17 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 				     bool is_c45,
 				     struct phy_c45_device_ids *c45_ids);
 #if IS_ENABLED(CONFIG_PHYLIB)
+struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode);
 struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45);
 int phy_device_register(struct phy_device *phy);
 void phy_device_free(struct phy_device *phydev);
 #else
+static inline
+struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode)
+{
+	return 0;
+}
+
 static inline
 struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45)
 {
-- 
cgit v1.2.3


From 425775ed31a6fac8b66ab077f7936fafad895ef6 Mon Sep 17 00:00:00 2001
From: Calvin Johnson <calvin.johnson@oss.nxp.com>
Date: Fri, 11 Jun 2021 13:53:49 +0300
Subject: net: phy: Introduce phy related fwnode functions

Define fwnode_phy_find_device() to iterate an mdiobus and find the
phy device of the provided phy fwnode. Additionally define
device_phy_find_device() to find phy device of provided device.

Define fwnode_get_phy_node() to get phy_node using named reference.

Signed-off-by: Calvin Johnson <calvin.johnson@oss.nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Acked-by: Grant Likely <grant.likely@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 62 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h          | 20 ++++++++++++++
 2 files changed, 82 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index dca454b5c209..786f464216dd 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -9,6 +9,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/acpi.h>
 #include <linux/bitmap.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
@@ -2898,6 +2899,67 @@ struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode)
 }
 EXPORT_SYMBOL(fwnode_mdio_find_device);
 
+/**
+ * fwnode_phy_find_device - For provided phy_fwnode, find phy_device.
+ *
+ * @phy_fwnode: Pointer to the phy's fwnode.
+ *
+ * If successful, returns a pointer to the phy_device with the embedded
+ * struct device refcount incremented by one, or NULL on failure.
+ */
+struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode)
+{
+	struct mdio_device *mdiodev;
+
+	mdiodev = fwnode_mdio_find_device(phy_fwnode);
+	if (!mdiodev)
+		return NULL;
+
+	if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY)
+		return to_phy_device(&mdiodev->dev);
+
+	put_device(&mdiodev->dev);
+
+	return NULL;
+}
+EXPORT_SYMBOL(fwnode_phy_find_device);
+
+/**
+ * device_phy_find_device - For the given device, get the phy_device
+ * @dev: Pointer to the given device
+ *
+ * Refer return conditions of fwnode_phy_find_device().
+ */
+struct phy_device *device_phy_find_device(struct device *dev)
+{
+	return fwnode_phy_find_device(dev_fwnode(dev));
+}
+EXPORT_SYMBOL_GPL(device_phy_find_device);
+
+/**
+ * fwnode_get_phy_node - Get the phy_node using the named reference.
+ * @fwnode: Pointer to fwnode from which phy_node has to be obtained.
+ *
+ * Refer return conditions of fwnode_find_reference().
+ * For ACPI, only "phy-handle" is supported. Legacy DT properties "phy"
+ * and "phy-device" are not supported in ACPI. DT supports all the three
+ * named references to the phy node.
+ */
+struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *phy_node;
+
+	/* Only phy-handle is used for ACPI */
+	phy_node = fwnode_find_reference(fwnode, "phy-handle", 0);
+	if (is_acpi_node(fwnode) || !IS_ERR(phy_node))
+		return phy_node;
+	phy_node = fwnode_find_reference(fwnode, "phy", 0);
+	if (IS_ERR(phy_node))
+		phy_node = fwnode_find_reference(fwnode, "phy-device", 0);
+	return phy_node;
+}
+EXPORT_SYMBOL_GPL(fwnode_get_phy_node);
+
 /**
  * phy_probe - probe and init a PHY device
  * @dev: device to probe and init
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 7aa97f4e5387..f9b5fb099fa6 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1378,6 +1378,9 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 				     struct phy_c45_device_ids *c45_ids);
 #if IS_ENABLED(CONFIG_PHYLIB)
 struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode);
+struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode);
+struct phy_device *device_phy_find_device(struct device *dev);
+struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode);
 struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45);
 int phy_device_register(struct phy_device *phy);
 void phy_device_free(struct phy_device *phydev);
@@ -1388,6 +1391,23 @@ struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode)
 	return 0;
 }
 
+static inline
+struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode)
+{
+	return NULL;
+}
+
+static inline struct phy_device *device_phy_find_device(struct device *dev)
+{
+	return NULL;
+}
+
+static inline
+struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode)
+{
+	return NULL;
+}
+
 static inline
 struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45)
 {
-- 
cgit v1.2.3


From 114dea60043b8f0c82c67dd281719ef8919c2416 Mon Sep 17 00:00:00 2001
From: Calvin Johnson <calvin.johnson@oss.nxp.com>
Date: Fri, 11 Jun 2021 13:53:51 +0300
Subject: net: phy: Introduce fwnode_get_phy_id()

Extract phy_id from compatible string. This will be used by
fwnode_mdiobus_register_phy() to create phy device using the
phy_id.

Signed-off-by: Calvin Johnson <calvin.johnson@oss.nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Acked-by: Grant Likely <grant.likely@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 21 +++++++++++++++++++++
 include/linux/phy.h          |  5 +++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 786f464216dd..f7472a0cf771 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -834,6 +834,27 @@ static int get_phy_c22_id(struct mii_bus *bus, int addr, u32 *phy_id)
 	return 0;
 }
 
+/* Extract the phy ID from the compatible string of the form
+ * ethernet-phy-idAAAA.BBBB.
+ */
+int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id)
+{
+	unsigned int upper, lower;
+	const char *cp;
+	int ret;
+
+	ret = fwnode_property_read_string(fwnode, "compatible", &cp);
+	if (ret)
+		return ret;
+
+	if (sscanf(cp, "ethernet-phy-id%4x.%4x", &upper, &lower) != 2)
+		return -EINVAL;
+
+	*phy_id = ((upper & GENMASK(15, 0)) << 16) | (lower & GENMASK(15, 0));
+	return 0;
+}
+EXPORT_SYMBOL(fwnode_get_phy_id);
+
 /**
  * get_phy_device - reads the specified PHY device and returns its @phy_device
  *		    struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index f9b5fb099fa6..b60694734b07 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1377,6 +1377,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 				     bool is_c45,
 				     struct phy_c45_device_ids *c45_ids);
 #if IS_ENABLED(CONFIG_PHYLIB)
+int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id);
 struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode);
 struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode);
 struct phy_device *device_phy_find_device(struct device *dev);
@@ -1385,6 +1386,10 @@ struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45);
 int phy_device_register(struct phy_device *phy);
 void phy_device_free(struct phy_device *phydev);
 #else
+static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id)
+{
+	return 0;
+}
 static inline
 struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode)
 {
-- 
cgit v1.2.3


From bc1bee3b87ee48bd97ef7fd306445132ba2041b0 Mon Sep 17 00:00:00 2001
From: Calvin Johnson <calvin.johnson@oss.nxp.com>
Date: Fri, 11 Jun 2021 13:53:54 +0300
Subject: net: mdiobus: Introduce fwnode_mdiobus_register_phy()

Introduce fwnode_mdiobus_register_phy() to register PHYs on the
mdiobus. From the compatible string, identify whether the PHY is
c45 and based on this create a PHY device instance which is
registered on the mdiobus.

Along with fwnode_mdiobus_register_phy() also introduce
fwnode_find_mii_timestamper() and fwnode_mdiobus_phy_device_register()
since they are needed.
While at it, also use the newly introduced fwnode operation in
of_mdiobus_phy_device_register().

Signed-off-by: Calvin Johnson <calvin.johnson@oss.nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Acked-by: Grant Likely <grant.likely@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                    |   1 +
 drivers/net/mdio/Kconfig       |   7 ++
 drivers/net/mdio/Makefile      |   3 +-
 drivers/net/mdio/fwnode_mdio.c | 144 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/mdio/of_mdio.c     |  44 ++-----------
 include/linux/fwnode_mdio.h    |  35 ++++++++++
 6 files changed, 194 insertions(+), 40 deletions(-)
 create mode 100644 drivers/net/mdio/fwnode_mdio.c
 create mode 100644 include/linux/fwnode_mdio.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index e69c1991ec3b..e8f8b6c33a51 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6811,6 +6811,7 @@ F:	Documentation/devicetree/bindings/net/mdio*
 F:	Documentation/devicetree/bindings/net/qca,ar803x.yaml
 F:	Documentation/networking/phy.rst
 F:	drivers/net/mdio/
+F:	drivers/net/mdio/fwnode_mdio.c
 F:	drivers/net/mdio/of_mdio.c
 F:	drivers/net/pcs/
 F:	drivers/net/phy/
diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig
index d06e06f5e31a..422e9e042a3c 100644
--- a/drivers/net/mdio/Kconfig
+++ b/drivers/net/mdio/Kconfig
@@ -19,6 +19,13 @@ config MDIO_BUS
 	  reflects whether the mdio_bus/mdio_device code is built as a
 	  loadable module or built-in.
 
+config FWNODE_MDIO
+	def_tristate PHYLIB
+	depends on (ACPI || OF) || COMPILE_TEST
+	select FIXED_PHY
+	help
+	  FWNODE MDIO bus (Ethernet PHY) accessors
+
 config OF_MDIO
 	def_tristate PHYLIB
 	depends on OF
diff --git a/drivers/net/mdio/Makefile b/drivers/net/mdio/Makefile
index c3ec0ef989df..2e6813c709eb 100644
--- a/drivers/net/mdio/Makefile
+++ b/drivers/net/mdio/Makefile
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for Linux MDIO bus drivers
 
-obj-$(CONFIG_OF_MDIO)	+= of_mdio.o
+obj-$(CONFIG_FWNODE_MDIO)	+= fwnode_mdio.o
+obj-$(CONFIG_OF_MDIO)		+= of_mdio.o
 
 obj-$(CONFIG_MDIO_ASPEED)		+= mdio-aspeed.o
 obj-$(CONFIG_MDIO_BCM_IPROC)		+= mdio-bcm-iproc.o
diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c
new file mode 100644
index 000000000000..e96766da8de4
--- /dev/null
+++ b/drivers/net/mdio/fwnode_mdio.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fwnode helpers for the MDIO (Ethernet PHY) API
+ *
+ * This file provides helper functions for extracting PHY device information
+ * out of the fwnode and using it to populate an mii_bus.
+ */
+
+#include <linux/acpi.h>
+#include <linux/fwnode_mdio.h>
+#include <linux/of.h>
+#include <linux/phy.h>
+
+MODULE_AUTHOR("Calvin Johnson <calvin.johnson@oss.nxp.com>");
+MODULE_LICENSE("GPL");
+
+static struct mii_timestamper *
+fwnode_find_mii_timestamper(struct fwnode_handle *fwnode)
+{
+	struct of_phandle_args arg;
+	int err;
+
+	if (is_acpi_node(fwnode))
+		return NULL;
+
+	err = of_parse_phandle_with_fixed_args(to_of_node(fwnode),
+					       "timestamper", 1, 0, &arg);
+	if (err == -ENOENT)
+		return NULL;
+	else if (err)
+		return ERR_PTR(err);
+
+	if (arg.args_count != 1)
+		return ERR_PTR(-EINVAL);
+
+	return register_mii_timestamper(arg.np, arg.args[0]);
+}
+
+int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio,
+				       struct phy_device *phy,
+				       struct fwnode_handle *child, u32 addr)
+{
+	int rc;
+
+	rc = fwnode_irq_get(child, 0);
+	if (rc == -EPROBE_DEFER)
+		return rc;
+
+	if (rc > 0) {
+		phy->irq = rc;
+		mdio->irq[addr] = rc;
+	} else {
+		phy->irq = mdio->irq[addr];
+	}
+
+	if (fwnode_property_read_bool(child, "broken-turn-around"))
+		mdio->phy_ignore_ta_mask |= 1 << addr;
+
+	fwnode_property_read_u32(child, "reset-assert-us",
+				 &phy->mdio.reset_assert_delay);
+	fwnode_property_read_u32(child, "reset-deassert-us",
+				 &phy->mdio.reset_deassert_delay);
+
+	/* Associate the fwnode with the device structure so it
+	 * can be looked up later
+	 */
+	fwnode_handle_get(child);
+	phy->mdio.dev.fwnode = child;
+
+	/* All data is now stored in the phy struct;
+	 * register it
+	 */
+	rc = phy_device_register(phy);
+	if (rc) {
+		fwnode_handle_put(child);
+		return rc;
+	}
+
+	dev_dbg(&mdio->dev, "registered phy %p fwnode at address %i\n",
+		child, addr);
+	return 0;
+}
+EXPORT_SYMBOL(fwnode_mdiobus_phy_device_register);
+
+int fwnode_mdiobus_register_phy(struct mii_bus *bus,
+				struct fwnode_handle *child, u32 addr)
+{
+	struct mii_timestamper *mii_ts = NULL;
+	struct phy_device *phy;
+	bool is_c45 = false;
+	u32 phy_id;
+	int rc;
+
+	mii_ts = fwnode_find_mii_timestamper(child);
+	if (IS_ERR(mii_ts))
+		return PTR_ERR(mii_ts);
+
+	rc = fwnode_property_match_string(child, "compatible",
+					  "ethernet-phy-ieee802.3-c45");
+	if (rc >= 0)
+		is_c45 = true;
+
+	if (is_c45 || fwnode_get_phy_id(child, &phy_id))
+		phy = get_phy_device(bus, addr, is_c45);
+	else
+		phy = phy_device_create(bus, addr, phy_id, 0, NULL);
+	if (IS_ERR(phy)) {
+		unregister_mii_timestamper(mii_ts);
+		return PTR_ERR(phy);
+	}
+
+	if (is_acpi_node(child)) {
+		phy->irq = bus->irq[addr];
+
+		/* Associate the fwnode with the device structure so it
+		 * can be looked up later.
+		 */
+		phy->mdio.dev.fwnode = child;
+
+		/* All data is now stored in the phy struct, so register it */
+		rc = phy_device_register(phy);
+		if (rc) {
+			phy_device_free(phy);
+			fwnode_handle_put(phy->mdio.dev.fwnode);
+			return rc;
+		}
+	} else if (is_of_node(child)) {
+		rc = fwnode_mdiobus_phy_device_register(bus, phy, child, addr);
+		if (rc) {
+			unregister_mii_timestamper(mii_ts);
+			phy_device_free(phy);
+			return rc;
+		}
+	}
+
+	/* phy->mii_ts may already be defined by the PHY driver. A
+	 * mii_timestamper probed via the device tree will still have
+	 * precedence.
+	 */
+	if (mii_ts)
+		phy->mii_ts = mii_ts;
+	return 0;
+}
+EXPORT_SYMBOL(fwnode_mdiobus_register_phy);
diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c
index d73c0570f19c..17327bbc1de4 100644
--- a/drivers/net/mdio/of_mdio.c
+++ b/drivers/net/mdio/of_mdio.c
@@ -10,6 +10,7 @@
 
 #include <linux/device.h>
 #include <linux/err.h>
+#include <linux/fwnode_mdio.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
@@ -51,46 +52,11 @@ static struct mii_timestamper *of_find_mii_timestamper(struct device_node *node)
 }
 
 int of_mdiobus_phy_device_register(struct mii_bus *mdio, struct phy_device *phy,
-			      struct device_node *child, u32 addr)
+				   struct device_node *child, u32 addr)
 {
-	int rc;
-
-	rc = of_irq_get(child, 0);
-	if (rc == -EPROBE_DEFER)
-		return rc;
-
-	if (rc > 0) {
-		phy->irq = rc;
-		mdio->irq[addr] = rc;
-	} else {
-		phy->irq = mdio->irq[addr];
-	}
-
-	if (of_property_read_bool(child, "broken-turn-around"))
-		mdio->phy_ignore_ta_mask |= 1 << addr;
-
-	of_property_read_u32(child, "reset-assert-us",
-			     &phy->mdio.reset_assert_delay);
-	of_property_read_u32(child, "reset-deassert-us",
-			     &phy->mdio.reset_deassert_delay);
-
-	/* Associate the OF node with the device structure so it
-	 * can be looked up later */
-	of_node_get(child);
-	phy->mdio.dev.of_node = child;
-	phy->mdio.dev.fwnode = of_fwnode_handle(child);
-
-	/* All data is now stored in the phy struct;
-	 * register it */
-	rc = phy_device_register(phy);
-	if (rc) {
-		of_node_put(child);
-		return rc;
-	}
-
-	dev_dbg(&mdio->dev, "registered phy %pOFn at address %i\n",
-		child, addr);
-	return 0;
+	return fwnode_mdiobus_phy_device_register(mdio, phy,
+						  of_fwnode_handle(child),
+						  addr);
 }
 EXPORT_SYMBOL(of_mdiobus_phy_device_register);
 
diff --git a/include/linux/fwnode_mdio.h b/include/linux/fwnode_mdio.h
new file mode 100644
index 000000000000..faf603c48c86
--- /dev/null
+++ b/include/linux/fwnode_mdio.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * FWNODE helper for the MDIO (Ethernet PHY) API
+ */
+
+#ifndef __LINUX_FWNODE_MDIO_H
+#define __LINUX_FWNODE_MDIO_H
+
+#include <linux/phy.h>
+
+#if IS_ENABLED(CONFIG_FWNODE_MDIO)
+int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio,
+				       struct phy_device *phy,
+				       struct fwnode_handle *child, u32 addr);
+
+int fwnode_mdiobus_register_phy(struct mii_bus *bus,
+				struct fwnode_handle *child, u32 addr);
+
+#else /* CONFIG_FWNODE_MDIO */
+int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio,
+				       struct phy_device *phy,
+				       struct fwnode_handle *child, u32 addr)
+{
+	return -EINVAL;
+}
+
+static inline int fwnode_mdiobus_register_phy(struct mii_bus *bus,
+					      struct fwnode_handle *child,
+					      u32 addr)
+{
+	return -EINVAL;
+}
+#endif
+
+#endif /* __LINUX_FWNODE_MDIO_H */
-- 
cgit v1.2.3


From 7ec16433cf1e97cfc823e50e9ee4e2fd3abfc4ee Mon Sep 17 00:00:00 2001
From: Calvin Johnson <calvin.johnson@oss.nxp.com>
Date: Fri, 11 Jun 2021 13:53:56 +0300
Subject: ACPI: utils: Introduce acpi_get_local_address()

Introduce a wrapper around the _ADR evaluation.

Signed-off-by: Calvin Johnson <calvin.johnson@oss.nxp.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Grant Likely <grant.likely@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/acpi/utils.c | 14 ++++++++++++++
 include/linux/acpi.h |  7 +++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 3b54b8fd7396..e7ddd281afff 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -277,6 +277,20 @@ acpi_evaluate_integer(acpi_handle handle,
 
 EXPORT_SYMBOL(acpi_evaluate_integer);
 
+int acpi_get_local_address(acpi_handle handle, u32 *addr)
+{
+	unsigned long long adr;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL, &adr);
+	if (ACPI_FAILURE(status))
+		return -ENODATA;
+
+	*addr = (u32)adr;
+	return 0;
+}
+EXPORT_SYMBOL(acpi_get_local_address);
+
 acpi_status
 acpi_evaluate_reference(acpi_handle handle,
 			acpi_string pathname,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..6ace3a0f1415 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -710,6 +710,8 @@ static inline u64 acpi_arch_get_root_pointer(void)
 }
 #endif
 
+int acpi_get_local_address(acpi_handle handle, u32 *addr);
+
 #else	/* !CONFIG_ACPI */
 
 #define acpi_disabled 1
@@ -965,6 +967,11 @@ static inline struct acpi_device *acpi_resource_consumer(struct resource *res)
 	return NULL;
 }
 
+static inline int acpi_get_local_address(acpi_handle handle, u32 *addr)
+{
+	return -ENODEV;
+}
+
 #endif	/* !CONFIG_ACPI */
 
 #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
-- 
cgit v1.2.3


From 803ca24d2f92e2cf393df4705423f7b09a5eabd9 Mon Sep 17 00:00:00 2001
From: Calvin Johnson <calvin.johnson@oss.nxp.com>
Date: Fri, 11 Jun 2021 13:53:57 +0300
Subject: net: mdio: Add ACPI support code for mdio

Define acpi_mdiobus_register() to Register mii_bus and create PHYs for
each ACPI child node.

Signed-off-by: Calvin Johnson <calvin.johnson@oss.nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Grant Likely <grant.likely@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                  |  1 +
 drivers/net/mdio/Kconfig     |  7 ++++++
 drivers/net/mdio/Makefile    |  1 +
 drivers/net/mdio/acpi_mdio.c | 58 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi_mdio.h    | 26 ++++++++++++++++++++
 5 files changed, 93 insertions(+)
 create mode 100644 drivers/net/mdio/acpi_mdio.c
 create mode 100644 include/linux/acpi_mdio.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index e8f8b6c33a51..2172f594be8f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6811,6 +6811,7 @@ F:	Documentation/devicetree/bindings/net/mdio*
 F:	Documentation/devicetree/bindings/net/qca,ar803x.yaml
 F:	Documentation/networking/phy.rst
 F:	drivers/net/mdio/
+F:	drivers/net/mdio/acpi_mdio.c
 F:	drivers/net/mdio/fwnode_mdio.c
 F:	drivers/net/mdio/of_mdio.c
 F:	drivers/net/pcs/
diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig
index 422e9e042a3c..99a6c13a11af 100644
--- a/drivers/net/mdio/Kconfig
+++ b/drivers/net/mdio/Kconfig
@@ -34,6 +34,13 @@ config OF_MDIO
 	help
 	  OpenFirmware MDIO bus (Ethernet PHY) accessors
 
+config ACPI_MDIO
+	def_tristate PHYLIB
+	depends on ACPI
+	depends on PHYLIB
+	help
+	  ACPI MDIO bus (Ethernet PHY) accessors
+
 if MDIO_BUS
 
 config MDIO_DEVRES
diff --git a/drivers/net/mdio/Makefile b/drivers/net/mdio/Makefile
index 2e6813c709eb..15f8dc4042ce 100644
--- a/drivers/net/mdio/Makefile
+++ b/drivers/net/mdio/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for Linux MDIO bus drivers
 
+obj-$(CONFIG_ACPI_MDIO)		+= acpi_mdio.o
 obj-$(CONFIG_FWNODE_MDIO)	+= fwnode_mdio.o
 obj-$(CONFIG_OF_MDIO)		+= of_mdio.o
 
diff --git a/drivers/net/mdio/acpi_mdio.c b/drivers/net/mdio/acpi_mdio.c
new file mode 100644
index 000000000000..d77c987fda9c
--- /dev/null
+++ b/drivers/net/mdio/acpi_mdio.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ACPI helpers for the MDIO (Ethernet PHY) API
+ *
+ * This file provides helper functions for extracting PHY device information
+ * out of the ACPI ASL and using it to populate an mii_bus.
+ */
+
+#include <linux/acpi.h>
+#include <linux/acpi_mdio.h>
+#include <linux/bits.h>
+#include <linux/dev_printk.h>
+#include <linux/fwnode_mdio.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+MODULE_AUTHOR("Calvin Johnson <calvin.johnson@oss.nxp.com>");
+MODULE_LICENSE("GPL");
+
+/**
+ * acpi_mdiobus_register - Register mii_bus and create PHYs from the ACPI ASL.
+ * @mdio: pointer to mii_bus structure
+ * @fwnode: pointer to fwnode of MDIO bus. This fwnode is expected to represent
+ * an ACPI device object corresponding to the MDIO bus and its children are
+ * expected to correspond to the PHY devices on that bus.
+ *
+ * This function registers the mii_bus structure and registers a phy_device
+ * for each child node of @fwnode.
+ */
+int acpi_mdiobus_register(struct mii_bus *mdio, struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *child;
+	u32 addr;
+	int ret;
+
+	/* Mask out all PHYs from auto probing. */
+	mdio->phy_mask = GENMASK(31, 0);
+	ret = mdiobus_register(mdio);
+	if (ret)
+		return ret;
+
+	ACPI_COMPANION_SET(&mdio->dev, to_acpi_device_node(fwnode));
+
+	/* Loop over the child nodes and register a phy_device for each PHY */
+	fwnode_for_each_child_node(fwnode, child) {
+		ret = acpi_get_local_address(ACPI_HANDLE_FWNODE(child), &addr);
+		if (ret || addr >= PHY_MAX_ADDR)
+			continue;
+
+		ret = fwnode_mdiobus_register_phy(mdio, child, addr);
+		if (ret == -ENODEV)
+			dev_err(&mdio->dev,
+				"MDIO device at address %d is missing.\n",
+				addr);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(acpi_mdiobus_register);
diff --git a/include/linux/acpi_mdio.h b/include/linux/acpi_mdio.h
new file mode 100644
index 000000000000..0a24ab7cb66f
--- /dev/null
+++ b/include/linux/acpi_mdio.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * ACPI helper for the MDIO (Ethernet PHY) API
+ */
+
+#ifndef __LINUX_ACPI_MDIO_H
+#define __LINUX_ACPI_MDIO_H
+
+#include <linux/phy.h>
+
+#if IS_ENABLED(CONFIG_ACPI_MDIO)
+int acpi_mdiobus_register(struct mii_bus *mdio, struct fwnode_handle *fwnode);
+#else /* CONFIG_ACPI_MDIO */
+static inline int
+acpi_mdiobus_register(struct mii_bus *mdio, struct fwnode_handle *fwnode)
+{
+	/*
+	 * Fall back to mdiobus_register() function to register a bus.
+	 * This way, we don't have to keep compat bits around in drivers.
+	 */
+
+	return mdiobus_register(mdio);
+}
+#endif
+
+#endif /* __LINUX_ACPI_MDIO_H */
-- 
cgit v1.2.3


From 25396f680dd6257096c5dc6ceb90ce57caba8de1 Mon Sep 17 00:00:00 2001
From: Calvin Johnson <calvin.johnson@oss.nxp.com>
Date: Fri, 11 Jun 2021 13:53:59 +0300
Subject: net: phylink: introduce phylink_fwnode_phy_connect()

Define phylink_fwnode_phy_connect() to connect phy specified by
a fwnode to a phylink instance.

Signed-off-by: Calvin Johnson <calvin.johnson@oss.nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Acked-by: Grant Likely <grant.likely@arm.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phylink.h   |  3 +++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 96d8e88b4e46..9cc0f69faafe 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 2015 Russell King
  */
+#include <linux/acpi.h>
 #include <linux/ethtool.h>
 #include <linux/export.h>
 #include <linux/gpio/consumer.h>
@@ -1125,6 +1126,59 @@ int phylink_of_phy_connect(struct phylink *pl, struct device_node *dn,
 }
 EXPORT_SYMBOL_GPL(phylink_of_phy_connect);
 
+/**
+ * phylink_fwnode_phy_connect() - connect the PHY specified in the fwnode.
+ * @pl: a pointer to a &struct phylink returned from phylink_create()
+ * @fwnode: a pointer to a &struct fwnode_handle.
+ * @flags: PHY-specific flags to communicate to the PHY device driver
+ *
+ * Connect the phy specified @fwnode to the phylink instance specified
+ * by @pl.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int phylink_fwnode_phy_connect(struct phylink *pl,
+			       struct fwnode_handle *fwnode,
+			       u32 flags)
+{
+	struct fwnode_handle *phy_fwnode;
+	struct phy_device *phy_dev;
+	int ret;
+
+	/* Fixed links and 802.3z are handled without needing a PHY */
+	if (pl->cfg_link_an_mode == MLO_AN_FIXED ||
+	    (pl->cfg_link_an_mode == MLO_AN_INBAND &&
+	     phy_interface_mode_is_8023z(pl->link_interface)))
+		return 0;
+
+	phy_fwnode = fwnode_get_phy_node(fwnode);
+	if (IS_ERR(phy_fwnode)) {
+		if (pl->cfg_link_an_mode == MLO_AN_PHY)
+			return -ENODEV;
+		return 0;
+	}
+
+	phy_dev = fwnode_phy_find_device(phy_fwnode);
+	/* We're done with the phy_node handle */
+	fwnode_handle_put(phy_fwnode);
+	if (!phy_dev)
+		return -ENODEV;
+
+	ret = phy_attach_direct(pl->netdev, phy_dev, flags,
+				pl->link_interface);
+	if (ret) {
+		phy_device_free(phy_dev);
+		return ret;
+	}
+
+	ret = phylink_bringup_phy(pl, phy_dev, pl->link_config.interface);
+	if (ret)
+		phy_detach(phy_dev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(phylink_fwnode_phy_connect);
+
 /**
  * phylink_disconnect_phy() - disconnect any PHY attached to the phylink
  *   instance.
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index fd2acfd9b597..afb3ded0b691 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -441,6 +441,9 @@ void phylink_destroy(struct phylink *);
 
 int phylink_connect_phy(struct phylink *, struct phy_device *);
 int phylink_of_phy_connect(struct phylink *, struct device_node *, u32 flags);
+int phylink_fwnode_phy_connect(struct phylink *pl,
+			       struct fwnode_handle *fwnode,
+			       u32 flags);
 void phylink_disconnect_phy(struct phylink *);
 
 void phylink_mac_change(struct phylink *, bool up);
-- 
cgit v1.2.3


From 44931195a5412a97c46d299227fbabad4e09010d Mon Sep 17 00:00:00 2001
From: Arseny Krasnov <arseny.krasnov@kaspersky.com>
Date: Fri, 11 Jun 2021 14:12:38 +0300
Subject: virtio/vsock: dequeue callback for SOCK_SEQPACKET

Callback fetches RW packets from rx queue of socket until whole record
is copied(if user's buffer is full, user is not woken up). This is done
to not stall sender, because if we wake up user and it leaves syscall,
nobody will send credit update for rest of record, and sender will wait
for next enter of read syscall at receiver's side. So if user buffer is
full, we just send credit update and drop data.

Signed-off-by: Arseny Krasnov <arseny.krasnov@kaspersky.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_vsock.h            |  5 ++
 net/vmw_vsock/virtio_transport_common.c | 84 +++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index dc636b727179..1d9a302cb91d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -36,6 +36,7 @@ struct virtio_vsock_sock {
 	u32 rx_bytes;
 	u32 buf_alloc;
 	struct list_head rx_queue;
+	u32 msg_count;
 };
 
 struct virtio_vsock_pkt {
@@ -80,6 +81,10 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
 			       struct msghdr *msg,
 			       size_t len, int flags);
 
+ssize_t
+virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
+				   struct msghdr *msg,
+				   int flags);
 s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
 s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
 
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index ad0d34d41444..1e1df19ec164 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -393,6 +393,78 @@ out:
 	return err;
 }
 
+static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+						 struct msghdr *msg,
+						 int flags)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct virtio_vsock_pkt *pkt;
+	int dequeued_len = 0;
+	size_t user_buf_len = msg_data_left(msg);
+	bool copy_failed = false;
+	bool msg_ready = false;
+
+	spin_lock_bh(&vvs->rx_lock);
+
+	if (vvs->msg_count == 0) {
+		spin_unlock_bh(&vvs->rx_lock);
+		return 0;
+	}
+
+	while (!msg_ready) {
+		pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);
+
+		if (!copy_failed) {
+			size_t pkt_len;
+			size_t bytes_to_copy;
+
+			pkt_len = (size_t)le32_to_cpu(pkt->hdr.len);
+			bytes_to_copy = min(user_buf_len, pkt_len);
+
+			if (bytes_to_copy) {
+				int err;
+
+				/* sk_lock is held by caller so no one else can dequeue.
+				 * Unlock rx_lock since memcpy_to_msg() may sleep.
+				 */
+				spin_unlock_bh(&vvs->rx_lock);
+
+				err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
+				if (err) {
+					/* Copy of message failed, set flag to skip
+					 * copy path for rest of fragments. Rest of
+					 * fragments will be freed without copy.
+					 */
+					copy_failed = true;
+					dequeued_len = err;
+				} else {
+					user_buf_len -= bytes_to_copy;
+				}
+
+				spin_lock_bh(&vvs->rx_lock);
+			}
+
+			if (dequeued_len >= 0)
+				dequeued_len += pkt_len;
+		}
+
+		if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
+			msg_ready = true;
+			vvs->msg_count--;
+		}
+
+		virtio_transport_dec_rx_pkt(vvs, pkt);
+		list_del(&pkt->list);
+		virtio_transport_free_pkt(pkt);
+	}
+
+	spin_unlock_bh(&vvs->rx_lock);
+
+	virtio_transport_send_credit_update(vsk);
+
+	return dequeued_len;
+}
+
 ssize_t
 virtio_transport_stream_dequeue(struct vsock_sock *vsk,
 				struct msghdr *msg,
@@ -405,6 +477,18 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk,
 }
 EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue);
 
+ssize_t
+virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
+				   struct msghdr *msg,
+				   int flags)
+{
+	if (flags & MSG_PEEK)
+		return -EOPNOTSUPP;
+
+	return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);
+
 int
 virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
 			       struct msghdr *msg,
-- 
cgit v1.2.3


From 9ac841f5e9f261245d9d2841ad123566bd160a6e Mon Sep 17 00:00:00 2001
From: Arseny Krasnov <arseny.krasnov@kaspersky.com>
Date: Fri, 11 Jun 2021 14:13:06 +0300
Subject: virtio/vsock: rest of SOCK_SEQPACKET support

Small updates to make SOCK_SEQPACKET work:
1) Send SHUTDOWN on socket close for SEQPACKET type.
2) Set SEQPACKET packet type during send.
3) Set 'VIRTIO_VSOCK_SEQ_EOR' bit in flags for last
   packet of message.
4) Implement data check function for SEQPACKET.
5) Check for max datagram size.

Signed-off-by: Arseny Krasnov <arseny.krasnov@kaspersky.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_vsock.h            |  5 ++++
 net/vmw_vsock/virtio_transport_common.c | 41 +++++++++++++++++++++++++++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 1d9a302cb91d..35d7eedb5e8e 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -81,12 +81,17 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
 			       struct msghdr *msg,
 			       size_t len, int flags);
 
+int
+virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
+				   struct msghdr *msg,
+				   size_t len);
 ssize_t
 virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
 				   struct msghdr *msg,
 				   int flags);
 s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
 s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
+u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk);
 
 int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 				 struct vsock_sock *psk);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 3a658ff8fccb..23704a6bc437 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -74,6 +74,10 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
 		err = memcpy_from_msg(pkt->buf, info->msg, len);
 		if (err)
 			goto out;
+
+		if (msg_data_left(info->msg) == 0 &&
+		    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET)
+			pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
 	}
 
 	trace_virtio_transport_alloc_pkt(src_cid, src_port,
@@ -187,7 +191,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	struct virtio_vsock_pkt *pkt;
 	u32 pkt_len = info->pkt_len;
 
-	info->type = VIRTIO_VSOCK_TYPE_STREAM;
+	info->type = virtio_transport_get_type(sk_vsock(vsk));
 
 	t_ops = virtio_transport_get_ops(vsk);
 	if (unlikely(!t_ops))
@@ -497,6 +501,26 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
 }
 EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);
 
+int
+virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
+				   struct msghdr *msg,
+				   size_t len)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+
+	spin_lock_bh(&vvs->tx_lock);
+
+	if (len > vvs->peer_buf_alloc) {
+		spin_unlock_bh(&vvs->tx_lock);
+		return -EMSGSIZE;
+	}
+
+	spin_unlock_bh(&vvs->tx_lock);
+
+	return virtio_transport_stream_enqueue(vsk, msg, len);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_enqueue);
+
 int
 virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
 			       struct msghdr *msg,
@@ -519,6 +543,19 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data);
 
+u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	u32 msg_count;
+
+	spin_lock_bh(&vvs->rx_lock);
+	msg_count = vvs->msg_count;
+	spin_unlock_bh(&vvs->rx_lock);
+
+	return msg_count;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_has_data);
+
 static s64 virtio_transport_has_space(struct vsock_sock *vsk)
 {
 	struct virtio_vsock_sock *vvs = vsk->trans;
@@ -931,7 +968,7 @@ void virtio_transport_release(struct vsock_sock *vsk)
 	struct sock *sk = &vsk->sk;
 	bool remove_sock = true;
 
-	if (sk->sk_type == SOCK_STREAM)
+	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
 		remove_sock = virtio_transport_close(vsk);
 
 	if (remove_sock) {
-- 
cgit v1.2.3


From 5673ef86380414be1702ba2f1ef92526a14dd1e0 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 23:05:19 +0300
Subject: net: pcs: xpcs: rename mdio_xpcs_args to dw_xpcs

The struct mdio_xpcs_args is reminiscent of when a similarly named
struct mdio_xpcs_ops existed. Now that that is removed, we can shorten
the name to dw_xpcs (dw for DesignWare).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h      |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c |  2 +-
 drivers/net/pcs/pcs-xpcs.c                        | 73 +++++++++++------------
 include/linux/pcs/pcs-xpcs.h                      | 14 ++---
 4 files changed, 45 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 8a83f9e1e95b..5fecc83f175b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -503,7 +503,7 @@ struct mac_device_info {
 	const struct stmmac_hwtimestamp *ptp;
 	const struct stmmac_tc_ops *tc;
 	const struct stmmac_mmc_ops *mmc;
-	struct mdio_xpcs_args *xpcs;
+	struct dw_xpcs *xpcs;
 	struct mii_regs mii;	/* MII register Addresses */
 	struct mac_link link;
 	void __iomem *pcsr;     /* vpointer to device CSRs */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index bc900e240da2..3b3033b20b1d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -401,7 +401,7 @@ int stmmac_xpcs_setup(struct mii_bus *bus)
 {
 	int mode, addr;
 	struct net_device *ndev = bus->priv;
-	struct mdio_xpcs_args *xpcs;
+	struct dw_xpcs *xpcs;
 	struct stmmac_priv *priv;
 	struct mdio_device *mdiodev;
 
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 98c4a3973402..a2cbb2d926b7 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -109,7 +109,7 @@
 #define DW_VR_MII_EEE_TRN_LPI		BIT(0)	/* Transparent Mode Enable */
 
 #define phylink_pcs_to_xpcs(pl_pcs) \
-	container_of((pl_pcs), struct mdio_xpcs_args, pcs)
+	container_of((pl_pcs), struct dw_xpcs, pcs)
 
 static const int xpcs_usxgmii_features[] = {
 	ETHTOOL_LINK_MODE_Pause_BIT,
@@ -236,7 +236,7 @@ static const struct xpcs_compat *xpcs_find_compat(const struct xpcs_id *id,
 	return NULL;
 }
 
-int xpcs_get_an_mode(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
+int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface)
 {
 	const struct xpcs_compat *compat;
 
@@ -263,7 +263,7 @@ static bool __xpcs_linkmode_supported(const struct xpcs_compat *compat,
 #define xpcs_linkmode_supported(compat, mode) \
 	__xpcs_linkmode_supported(compat, ETHTOOL_LINK_MODE_ ## mode ## _BIT)
 
-static int xpcs_read(struct mdio_xpcs_args *xpcs, int dev, u32 reg)
+static int xpcs_read(struct dw_xpcs *xpcs, int dev, u32 reg)
 {
 	u32 reg_addr = mdiobus_c45_addr(dev, reg);
 	struct mii_bus *bus = xpcs->mdiodev->bus;
@@ -272,7 +272,7 @@ static int xpcs_read(struct mdio_xpcs_args *xpcs, int dev, u32 reg)
 	return mdiobus_read(bus, addr, reg_addr);
 }
 
-static int xpcs_write(struct mdio_xpcs_args *xpcs, int dev, u32 reg, u16 val)
+static int xpcs_write(struct dw_xpcs *xpcs, int dev, u32 reg, u16 val)
 {
 	u32 reg_addr = mdiobus_c45_addr(dev, reg);
 	struct mii_bus *bus = xpcs->mdiodev->bus;
@@ -281,28 +281,28 @@ static int xpcs_write(struct mdio_xpcs_args *xpcs, int dev, u32 reg, u16 val)
 	return mdiobus_write(bus, addr, reg_addr, val);
 }
 
-static int xpcs_read_vendor(struct mdio_xpcs_args *xpcs, int dev, u32 reg)
+static int xpcs_read_vendor(struct dw_xpcs *xpcs, int dev, u32 reg)
 {
 	return xpcs_read(xpcs, dev, DW_VENDOR | reg);
 }
 
-static int xpcs_write_vendor(struct mdio_xpcs_args *xpcs, int dev, int reg,
+static int xpcs_write_vendor(struct dw_xpcs *xpcs, int dev, int reg,
 			     u16 val)
 {
 	return xpcs_write(xpcs, dev, DW_VENDOR | reg, val);
 }
 
-static int xpcs_read_vpcs(struct mdio_xpcs_args *xpcs, int reg)
+static int xpcs_read_vpcs(struct dw_xpcs *xpcs, int reg)
 {
 	return xpcs_read_vendor(xpcs, MDIO_MMD_PCS, reg);
 }
 
-static int xpcs_write_vpcs(struct mdio_xpcs_args *xpcs, int reg, u16 val)
+static int xpcs_write_vpcs(struct dw_xpcs *xpcs, int reg, u16 val)
 {
 	return xpcs_write_vendor(xpcs, MDIO_MMD_PCS, reg, val);
 }
 
-static int xpcs_poll_reset(struct mdio_xpcs_args *xpcs, int dev)
+static int xpcs_poll_reset(struct dw_xpcs *xpcs, int dev)
 {
 	/* Poll until the reset bit clears (50ms per retry == 0.6 sec) */
 	unsigned int retries = 12;
@@ -318,7 +318,7 @@ static int xpcs_poll_reset(struct mdio_xpcs_args *xpcs, int dev)
 	return (ret & MDIO_CTRL1_RESET) ? -ETIMEDOUT : 0;
 }
 
-static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs,
+static int xpcs_soft_reset(struct dw_xpcs *xpcs,
 			   const struct xpcs_compat *compat)
 {
 	int ret, dev;
@@ -348,7 +348,7 @@ static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs,
 		dev_warn(&(__xpcs)->mdiodev->dev, ##__args); \
 })
 
-static int xpcs_read_fault_c73(struct mdio_xpcs_args *xpcs,
+static int xpcs_read_fault_c73(struct dw_xpcs *xpcs,
 			       struct phylink_link_state *state)
 {
 	int ret;
@@ -399,7 +399,7 @@ static int xpcs_read_fault_c73(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
-static int xpcs_read_link_c73(struct mdio_xpcs_args *xpcs, bool an)
+static int xpcs_read_link_c73(struct dw_xpcs *xpcs, bool an)
 {
 	bool link = true;
 	int ret;
@@ -439,7 +439,7 @@ static int xpcs_get_max_usxgmii_speed(const unsigned long *supported)
 	return max;
 }
 
-static void xpcs_config_usxgmii(struct mdio_xpcs_args *xpcs, int speed)
+static void xpcs_config_usxgmii(struct dw_xpcs *xpcs, int speed)
 {
 	int ret, speed_sel;
 
@@ -500,7 +500,7 @@ out:
 	pr_err("%s: XPCS access returned %pe\n", __func__, ERR_PTR(ret));
 }
 
-static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs,
+static int _xpcs_config_aneg_c73(struct dw_xpcs *xpcs,
 				 const struct xpcs_compat *compat)
 {
 	int ret, adv;
@@ -545,7 +545,7 @@ static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs,
 	return xpcs_write(xpcs, MDIO_MMD_AN, DW_SR_AN_ADV1, adv);
 }
 
-static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs,
+static int xpcs_config_aneg_c73(struct dw_xpcs *xpcs,
 				const struct xpcs_compat *compat)
 {
 	int ret;
@@ -563,7 +563,7 @@ static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs,
 	return xpcs_write(xpcs, MDIO_MMD_AN, MDIO_CTRL1, ret);
 }
 
-static int xpcs_aneg_done_c73(struct mdio_xpcs_args *xpcs,
+static int xpcs_aneg_done_c73(struct dw_xpcs *xpcs,
 			      struct phylink_link_state *state,
 			      const struct xpcs_compat *compat)
 {
@@ -590,7 +590,7 @@ static int xpcs_aneg_done_c73(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
-static int xpcs_read_lpa_c73(struct mdio_xpcs_args *xpcs,
+static int xpcs_read_lpa_c73(struct dw_xpcs *xpcs,
 			     struct phylink_link_state *state)
 {
 	int ret;
@@ -639,7 +639,7 @@ static int xpcs_read_lpa_c73(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
-static void xpcs_resolve_lpa_c73(struct mdio_xpcs_args *xpcs,
+static void xpcs_resolve_lpa_c73(struct dw_xpcs *xpcs,
 				 struct phylink_link_state *state)
 {
 	int max_speed = xpcs_get_max_usxgmii_speed(state->lp_advertising);
@@ -649,7 +649,7 @@ static void xpcs_resolve_lpa_c73(struct mdio_xpcs_args *xpcs,
 	state->duplex = DUPLEX_FULL;
 }
 
-static int xpcs_get_max_xlgmii_speed(struct mdio_xpcs_args *xpcs,
+static int xpcs_get_max_xlgmii_speed(struct dw_xpcs *xpcs,
 				     struct phylink_link_state *state)
 {
 	unsigned long *adv = state->advertising;
@@ -703,7 +703,7 @@ static int xpcs_get_max_xlgmii_speed(struct mdio_xpcs_args *xpcs,
 	return speed;
 }
 
-static void xpcs_resolve_pma(struct mdio_xpcs_args *xpcs,
+static void xpcs_resolve_pma(struct dw_xpcs *xpcs,
 			     struct phylink_link_state *state)
 {
 	state->pause = MLO_PAUSE_TX | MLO_PAUSE_RX;
@@ -722,7 +722,7 @@ static void xpcs_resolve_pma(struct mdio_xpcs_args *xpcs,
 	}
 }
 
-void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
+void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported,
 		   struct phylink_link_state *state)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(xpcs_supported);
@@ -752,8 +752,7 @@ void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
 }
 EXPORT_SYMBOL_GPL(xpcs_validate);
 
-int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
-		    int enable)
+int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable)
 {
 	int ret;
 
@@ -786,7 +785,7 @@ int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
 }
 EXPORT_SYMBOL_GPL(xpcs_config_eee);
 
-static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs)
+static int xpcs_config_aneg_c37_sgmii(struct dw_xpcs *xpcs)
 {
 	int ret;
 
@@ -827,7 +826,7 @@ static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret);
 }
 
-static int xpcs_config_2500basex(struct mdio_xpcs_args *xpcs)
+static int xpcs_config_2500basex(struct dw_xpcs *xpcs)
 {
 	int ret;
 
@@ -849,8 +848,8 @@ static int xpcs_config_2500basex(struct mdio_xpcs_args *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL, ret);
 }
 
-static int xpcs_do_config(struct mdio_xpcs_args *xpcs,
-			  phy_interface_t interface, unsigned int mode)
+static int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
+			  unsigned int mode)
 {
 	const struct xpcs_compat *compat;
 	int ret;
@@ -889,12 +888,12 @@ static int xpcs_config(struct phylink_pcs *pcs, unsigned int mode,
 		       const unsigned long *advertising,
 		       bool permit_pause_to_mac)
 {
-	struct mdio_xpcs_args *xpcs = phylink_pcs_to_xpcs(pcs);
+	struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs);
 
 	return xpcs_do_config(xpcs, interface, mode);
 }
 
-static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
+static int xpcs_get_state_c73(struct dw_xpcs *xpcs,
 			      struct phylink_link_state *state,
 			      const struct xpcs_compat *compat)
 {
@@ -928,7 +927,7 @@ static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
-static int xpcs_get_state_c37_sgmii(struct mdio_xpcs_args *xpcs,
+static int xpcs_get_state_c37_sgmii(struct dw_xpcs *xpcs,
 				    struct phylink_link_state *state)
 {
 	int ret;
@@ -972,7 +971,7 @@ static int xpcs_get_state_c37_sgmii(struct mdio_xpcs_args *xpcs,
 static void xpcs_get_state(struct phylink_pcs *pcs,
 			   struct phylink_link_state *state)
 {
-	struct mdio_xpcs_args *xpcs = phylink_pcs_to_xpcs(pcs);
+	struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs);
 	const struct xpcs_compat *compat;
 	int ret;
 
@@ -1004,13 +1003,13 @@ static void xpcs_get_state(struct phylink_pcs *pcs,
 static void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
 			 phy_interface_t interface, int speed, int duplex)
 {
-	struct mdio_xpcs_args *xpcs = phylink_pcs_to_xpcs(pcs);
+	struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs);
 
 	if (interface == PHY_INTERFACE_MODE_USXGMII)
 		return xpcs_config_usxgmii(xpcs, speed);
 }
 
-static u32 xpcs_get_id(struct mdio_xpcs_args *xpcs)
+static u32 xpcs_get_id(struct dw_xpcs *xpcs)
 {
 	int ret;
 	u32 id;
@@ -1095,10 +1094,10 @@ static const struct phylink_pcs_ops xpcs_phylink_ops = {
 	.pcs_link_up = xpcs_link_up,
 };
 
-struct mdio_xpcs_args *xpcs_create(struct mdio_device *mdiodev,
-				   phy_interface_t interface)
+struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev,
+			    phy_interface_t interface)
 {
-	struct mdio_xpcs_args *xpcs;
+	struct dw_xpcs *xpcs;
 	u32 xpcs_id;
 	int i, ret;
 
@@ -1144,7 +1143,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(xpcs_create);
 
-void xpcs_destroy(struct mdio_xpcs_args *xpcs)
+void xpcs_destroy(struct dw_xpcs *xpcs)
 {
 	kfree(xpcs);
 }
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 4d815f03b4b2..4f1cdf6f3d4c 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -17,19 +17,19 @@
 
 struct xpcs_id;
 
-struct mdio_xpcs_args {
+struct dw_xpcs {
 	struct mdio_device *mdiodev;
 	const struct xpcs_id *id;
 	struct phylink_pcs pcs;
 };
 
-int xpcs_get_an_mode(struct mdio_xpcs_args *xpcs, phy_interface_t interface);
-void xpcs_validate(struct mdio_xpcs_args *xpcs, unsigned long *supported,
+int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface);
+void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported,
 		   struct phylink_link_state *state);
-int xpcs_config_eee(struct mdio_xpcs_args *xpcs, int mult_fact_100ns,
+int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns,
 		    int enable);
-struct mdio_xpcs_args *xpcs_create(struct mdio_device *mdiodev,
-				   phy_interface_t interface);
-void xpcs_destroy(struct mdio_xpcs_args *xpcs);
+struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev,
+			    phy_interface_t interface);
+void xpcs_destroy(struct dw_xpcs *xpcs);
 
 #endif /* __LINUX_PCS_XPCS_H */
-- 
cgit v1.2.3


From dd0721ea4c7a6c2ec8b309ff57d74d88f08d4c23 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 23:05:25 +0300
Subject: net: pcs: xpcs: add support for NXP SJA1105

The NXP SJA1105 DSA switch integrates a Synopsys SGMII XPCS on port 4.
The generic code works fine, except there is an integration issue which
needs to be dealt with: in this switch, the XPCS is integrated with a
PMA that has the TX lane polarity inverted by default (PLUS is MINUS,
MINUS is PLUS).

To obtain normal non-inverted behavior, the TX lane polarity must be
inverted in the PCS, via the DIGITAL_CONTROL_2 register.

We introduce a pma_config() method in xpcs_compat which is called by the
phylink_pcs_config() implementation.

Also, the NXP SJA1105 returns all zeroes in the PHY ID registers 2 and 3.
We need to hack up an ad-hoc PHY ID (OUI is zero, device ID is 1) in
order for the XPCS driver to recognize it. This PHY ID is added to the
public include/linux/pcs/pcs-xpcs.h for that reason (for the sja1105
driver to be able to use it in a later patch).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                    |  1 +
 drivers/net/pcs/Makefile       |  4 +++-
 drivers/net/pcs/pcs-xpcs-nxp.c | 16 ++++++++++++++++
 drivers/net/pcs/pcs-xpcs.c     | 25 +++++++++++++++++++++++--
 drivers/net/pcs/pcs-xpcs.h     | 10 ++++++++++
 include/linux/pcs/pcs-xpcs.h   |  2 ++
 6 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/pcs/pcs-xpcs-nxp.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index c8214235380e..349a87b42d3c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13203,6 +13203,7 @@ M:	Vladimir Oltean <olteanv@gmail.com>
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 F:	drivers/net/dsa/sja1105
+F:	drivers/net/pcs/pcs-xpcs-nxp.c
 
 NXP TDA998X DRM DRIVER
 M:	Russell King <linux@armlinux.org.uk>
diff --git a/drivers/net/pcs/Makefile b/drivers/net/pcs/Makefile
index c23146755972..0603d469bd57 100644
--- a/drivers/net/pcs/Makefile
+++ b/drivers/net/pcs/Makefile
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for Linux PCS drivers
 
-obj-$(CONFIG_PCS_XPCS)		+= pcs-xpcs.o
+pcs_xpcs-$(CONFIG_PCS_XPCS)	:= pcs-xpcs.o pcs-xpcs-nxp.o
+
+obj-$(CONFIG_PCS_XPCS)		+= pcs_xpcs.o
 obj-$(CONFIG_PCS_LYNX)		+= pcs-lynx.o
diff --git a/drivers/net/pcs/pcs-xpcs-nxp.c b/drivers/net/pcs/pcs-xpcs-nxp.c
new file mode 100644
index 000000000000..51b2fc7d36a9
--- /dev/null
+++ b/drivers/net/pcs/pcs-xpcs-nxp.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2021 NXP Semiconductors
+ */
+#include <linux/pcs/pcs-xpcs.h>
+#include "pcs-xpcs.h"
+
+/* In NXP SJA1105, the PCS is integrated with a PMA that has the TX lane
+ * polarity inverted by default (PLUS is MINUS, MINUS is PLUS). To obtain
+ * normal non-inverted behavior, the TX lane polarity must be inverted in the
+ * PCS, via the DIGITAL_CONTROL_2 register.
+ */
+int nxp_sja1105_sgmii_pma_config(struct dw_xpcs *xpcs)
+{
+	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL2,
+			  DW_VR_MII_DIG_CTRL2_TX_POL_INV);
+}
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index ecf5011977d3..3b1baacfaf8f 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -117,6 +117,7 @@ struct xpcs_compat {
 	const phy_interface_t *interface;
 	int num_interfaces;
 	int an_mode;
+	int (*pma_config)(struct dw_xpcs *xpcs);
 };
 
 struct xpcs_id {
@@ -168,7 +169,7 @@ static bool __xpcs_linkmode_supported(const struct xpcs_compat *compat,
 #define xpcs_linkmode_supported(compat, mode) \
 	__xpcs_linkmode_supported(compat, ETHTOOL_LINK_MODE_ ## mode ## _BIT)
 
-static int xpcs_read(struct dw_xpcs *xpcs, int dev, u32 reg)
+int xpcs_read(struct dw_xpcs *xpcs, int dev, u32 reg)
 {
 	u32 reg_addr = mdiobus_c45_addr(dev, reg);
 	struct mii_bus *bus = xpcs->mdiodev->bus;
@@ -177,7 +178,7 @@ static int xpcs_read(struct dw_xpcs *xpcs, int dev, u32 reg)
 	return mdiobus_read(bus, addr, reg_addr);
 }
 
-static int xpcs_write(struct dw_xpcs *xpcs, int dev, u32 reg, u16 val)
+int xpcs_write(struct dw_xpcs *xpcs, int dev, u32 reg, u16 val)
 {
 	u32 reg_addr = mdiobus_c45_addr(dev, reg);
 	struct mii_bus *bus = xpcs->mdiodev->bus;
@@ -788,6 +789,12 @@ static int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
 		return -1;
 	}
 
+	if (compat->pma_config) {
+		ret = compat->pma_config(xpcs);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
@@ -1022,11 +1029,25 @@ static const struct xpcs_compat synopsys_xpcs_compat[DW_XPCS_INTERFACE_MAX] = {
 	},
 };
 
+static const struct xpcs_compat nxp_sja1105_xpcs_compat[DW_XPCS_INTERFACE_MAX] = {
+	[DW_XPCS_SGMII] = {
+		.supported = xpcs_sgmii_features,
+		.interface = xpcs_sgmii_interfaces,
+		.num_interfaces = ARRAY_SIZE(xpcs_sgmii_interfaces),
+		.an_mode = DW_AN_C37_SGMII,
+		.pma_config = nxp_sja1105_sgmii_pma_config,
+	},
+};
+
 static const struct xpcs_id xpcs_id_list[] = {
 	{
 		.id = SYNOPSYS_XPCS_ID,
 		.mask = SYNOPSYS_XPCS_MASK,
 		.compat = synopsys_xpcs_compat,
+	}, {
+		.id = NXP_SJA1105_XPCS_ID,
+		.mask = SYNOPSYS_XPCS_MASK,
+		.compat = nxp_sja1105_xpcs_compat,
 	},
 };
 
diff --git a/drivers/net/pcs/pcs-xpcs.h b/drivers/net/pcs/pcs-xpcs.h
index 867537a68c63..3daf4276a158 100644
--- a/drivers/net/pcs/pcs-xpcs.h
+++ b/drivers/net/pcs/pcs-xpcs.h
@@ -60,10 +60,15 @@
 /* EEE Mode Control Register */
 #define DW_VR_MII_EEE_MCTRL0		0x8006
 #define DW_VR_MII_EEE_MCTRL1		0x800b
+#define DW_VR_MII_DIG_CTRL2		0x80e1
 
 /* VR_MII_DIG_CTRL1 */
 #define DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW		BIT(9)
 
+/* VR_MII_DIG_CTRL2 */
+#define DW_VR_MII_DIG_CTRL2_TX_POL_INV		BIT(4)
+#define DW_VR_MII_DIG_CTRL2_RX_POL_INV		BIT(0)
+
 /* VR_MII_AN_CTRL */
 #define DW_VR_MII_AN_CTRL_TX_CONFIG_SHIFT	3
 #define DW_VR_MII_TX_CONFIG_MASK		BIT(3)
@@ -101,3 +106,8 @@
 
 /* VR MII EEE Control 1 defines */
 #define DW_VR_MII_EEE_TRN_LPI		BIT(0)	/* Transparent Mode Enable */
+
+int xpcs_read(struct dw_xpcs *xpcs, int dev, u32 reg);
+int xpcs_write(struct dw_xpcs *xpcs, int dev, u32 reg, u16 val);
+
+int nxp_sja1105_sgmii_pma_config(struct dw_xpcs *xpcs);
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 4f1cdf6f3d4c..c594f7cdc304 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -10,6 +10,8 @@
 #include <linux/phy.h>
 #include <linux/phylink.h>
 
+#define NXP_SJA1105_XPCS_ID		0x00000010
+
 /* AN mode */
 #define DW_AN_C73			1
 #define DW_AN_C37_SGMII			2
-- 
cgit v1.2.3


From f7380bba42fd0654bf8195fb741d5f92b0f46df9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 23:05:26 +0300
Subject: net: pcs: xpcs: add support for NXP SJA1110

The NXP SJA1110 switch integrates its own, non-Synopsys PMA, but it
manages it through the register space of the XPCS itself, in a small
register window inside MDIO_MMD_VEND2 from address 0x8030 to 0x806e.

This coincides with where the registers for the default Synopsys PMA
are, but the register definitions are of course not the same.

This situation is an odd hardware quirk, but the simplest way to manage
it is to drive the SJA1110's PMA from within the XPCS driver.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/pcs-xpcs-nxp.c | 169 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/pcs/pcs-xpcs.c     |  21 +++++
 drivers/net/pcs/pcs-xpcs.h     |   2 +
 include/linux/pcs/pcs-xpcs.h   |   1 +
 4 files changed, 193 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs-nxp.c b/drivers/net/pcs/pcs-xpcs-nxp.c
index 51b2fc7d36a9..de99c37cf2ae 100644
--- a/drivers/net/pcs/pcs-xpcs-nxp.c
+++ b/drivers/net/pcs/pcs-xpcs-nxp.c
@@ -4,6 +4,66 @@
 #include <linux/pcs/pcs-xpcs.h>
 #include "pcs-xpcs.h"
 
+/* LANE_DRIVER1_0 register */
+#define SJA1110_LANE_DRIVER1_0		0x8038
+#define SJA1110_TXDRV(x)		(((x) << 12) & GENMASK(14, 12))
+
+/* LANE_DRIVER2_0 register */
+#define SJA1110_LANE_DRIVER2_0		0x803a
+#define SJA1110_TXDRVTRIM_LSB(x)	((x) & GENMASK_ULL(15, 0))
+
+/* LANE_DRIVER2_1 register */
+#define SJA1110_LANE_DRIVER2_1		0x803b
+#define SJA1110_LANE_DRIVER2_1_RSV	BIT(9)
+#define SJA1110_TXDRVTRIM_MSB(x)	(((x) & GENMASK_ULL(23, 16)) >> 16)
+
+/* LANE_TRIM register */
+#define SJA1110_LANE_TRIM		0x8040
+#define SJA1110_TXTEN			BIT(11)
+#define SJA1110_TXRTRIM(x)		(((x) << 8) & GENMASK(10, 8))
+#define SJA1110_TXPLL_BWSEL		BIT(7)
+#define SJA1110_RXTEN			BIT(6)
+#define SJA1110_RXRTRIM(x)		(((x) << 3) & GENMASK(5, 3))
+#define SJA1110_CDR_GAIN		BIT(2)
+#define SJA1110_ACCOUPLE_RXVCM_EN	BIT(0)
+
+/* LANE_DATAPATH_1 register */
+#define SJA1110_LANE_DATAPATH_1		0x8037
+
+/* POWERDOWN_ENABLE register */
+#define SJA1110_POWERDOWN_ENABLE	0x8041
+#define SJA1110_TXPLL_PD		BIT(12)
+#define SJA1110_TXPD			BIT(11)
+#define SJA1110_RXPKDETEN		BIT(10)
+#define SJA1110_RXCH_PD			BIT(9)
+#define SJA1110_RXBIAS_PD		BIT(8)
+#define SJA1110_RESET_SER_EN		BIT(7)
+#define SJA1110_RESET_SER		BIT(6)
+#define SJA1110_RESET_DES		BIT(5)
+#define SJA1110_RCVEN			BIT(4)
+
+/* RXPLL_CTRL0 register */
+#define SJA1110_RXPLL_CTRL0		0x8065
+#define SJA1110_RXPLL_FBDIV(x)		(((x) << 2) & GENMASK(9, 2))
+
+/* RXPLL_CTRL1 register */
+#define SJA1110_RXPLL_CTRL1		0x8066
+#define SJA1110_RXPLL_REFDIV(x)		((x) & GENMASK(4, 0))
+
+/* TXPLL_CTRL0 register */
+#define SJA1110_TXPLL_CTRL0		0x806d
+#define SJA1110_TXPLL_FBDIV(x)		((x) & GENMASK(11, 0))
+
+/* TXPLL_CTRL1 register */
+#define SJA1110_TXPLL_CTRL1		0x806e
+#define SJA1110_TXPLL_REFDIV(x)		((x) & GENMASK(5, 0))
+
+/* RX_DATA_DETECT register */
+#define SJA1110_RX_DATA_DETECT		0x8045
+
+/* RX_CDR_CTLE register */
+#define SJA1110_RX_CDR_CTLE		0x8042
+
 /* In NXP SJA1105, the PCS is integrated with a PMA that has the TX lane
  * polarity inverted by default (PLUS is MINUS, MINUS is PLUS). To obtain
  * normal non-inverted behavior, the TX lane polarity must be inverted in the
@@ -14,3 +74,112 @@ int nxp_sja1105_sgmii_pma_config(struct dw_xpcs *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL2,
 			  DW_VR_MII_DIG_CTRL2_TX_POL_INV);
 }
+
+static int nxp_sja1110_pma_config(struct dw_xpcs *xpcs,
+				  u16 txpll_fbdiv, u16 txpll_refdiv,
+				  u16 rxpll_fbdiv, u16 rxpll_refdiv,
+				  u16 rx_cdr_ctle)
+{
+	u16 val;
+	int ret;
+
+	/* Program TX PLL feedback divider and reference divider settings for
+	 * correct oscillation frequency.
+	 */
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_TXPLL_CTRL0,
+			 SJA1110_TXPLL_FBDIV(txpll_fbdiv));
+	if (ret < 0)
+		return ret;
+
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_TXPLL_CTRL1,
+			 SJA1110_TXPLL_REFDIV(txpll_refdiv));
+	if (ret < 0)
+		return ret;
+
+	/* Program transmitter amplitude and disable amplitude trimming */
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_LANE_DRIVER1_0,
+			 SJA1110_TXDRV(0x5));
+	if (ret < 0)
+		return ret;
+
+	val = SJA1110_TXDRVTRIM_LSB(0xffffffull);
+
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_LANE_DRIVER2_0, val);
+	if (ret < 0)
+		return ret;
+
+	val = SJA1110_TXDRVTRIM_MSB(0xffffffull) | SJA1110_LANE_DRIVER2_1_RSV;
+
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_LANE_DRIVER2_1, val);
+	if (ret < 0)
+		return ret;
+
+	/* Enable input and output resistor terminations for low BER. */
+	val = SJA1110_ACCOUPLE_RXVCM_EN | SJA1110_CDR_GAIN |
+	      SJA1110_RXRTRIM(4) | SJA1110_RXTEN | SJA1110_TXPLL_BWSEL |
+	      SJA1110_TXRTRIM(3) | SJA1110_TXTEN;
+
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_LANE_TRIM, val);
+	if (ret < 0)
+		return ret;
+
+	/* Select PCS as transmitter data source. */
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_LANE_DATAPATH_1, 0);
+	if (ret < 0)
+		return ret;
+
+	/* Program RX PLL feedback divider and reference divider for correct
+	 * oscillation frequency.
+	 */
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_RXPLL_CTRL0,
+			 SJA1110_RXPLL_FBDIV(rxpll_fbdiv));
+	if (ret < 0)
+		return ret;
+
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_RXPLL_CTRL1,
+			 SJA1110_RXPLL_REFDIV(rxpll_refdiv));
+	if (ret < 0)
+		return ret;
+
+	/* Program threshold for receiver signal detector.
+	 * Enable control of RXPLL by receiver signal detector to disable RXPLL
+	 * when an input signal is not present.
+	 */
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_RX_DATA_DETECT, 0x0005);
+	if (ret < 0)
+		return ret;
+
+	/* Enable TX and RX PLLs and circuits.
+	 * Release reset of PMA to enable data flow to/from PCS.
+	 */
+	val = xpcs_read(xpcs, MDIO_MMD_VEND2, SJA1110_POWERDOWN_ENABLE);
+	if (val < 0)
+		return val;
+
+	val &= ~(SJA1110_TXPLL_PD | SJA1110_TXPD | SJA1110_RXCH_PD |
+		 SJA1110_RXBIAS_PD | SJA1110_RESET_SER_EN |
+		 SJA1110_RESET_SER | SJA1110_RESET_DES);
+	val |= SJA1110_RXPKDETEN | SJA1110_RCVEN;
+
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_POWERDOWN_ENABLE, val);
+	if (ret < 0)
+		return ret;
+
+	/* Program continuous-time linear equalizer (CTLE) settings. */
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, SJA1110_RX_CDR_CTLE,
+			 rx_cdr_ctle);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+int nxp_sja1110_sgmii_pma_config(struct dw_xpcs *xpcs)
+{
+	return nxp_sja1110_pma_config(xpcs, 0x19, 0x1, 0x19, 0x1, 0x212a);
+}
+
+int nxp_sja1110_2500basex_pma_config(struct dw_xpcs *xpcs)
+{
+	return nxp_sja1110_pma_config(xpcs, 0x7d, 0x2, 0x7d, 0x2, 0x732a);
+}
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 3b1baacfaf8f..b66e46fc88dc 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -1039,6 +1039,23 @@ static const struct xpcs_compat nxp_sja1105_xpcs_compat[DW_XPCS_INTERFACE_MAX] =
 	},
 };
 
+static const struct xpcs_compat nxp_sja1110_xpcs_compat[DW_XPCS_INTERFACE_MAX] = {
+	[DW_XPCS_SGMII] = {
+		.supported = xpcs_sgmii_features,
+		.interface = xpcs_sgmii_interfaces,
+		.num_interfaces = ARRAY_SIZE(xpcs_sgmii_interfaces),
+		.an_mode = DW_AN_C37_SGMII,
+		.pma_config = nxp_sja1110_sgmii_pma_config,
+	},
+	[DW_XPCS_2500BASEX] = {
+		.supported = xpcs_2500basex_features,
+		.interface = xpcs_2500basex_interfaces,
+		.num_interfaces = ARRAY_SIZE(xpcs_2500basex_interfaces),
+		.an_mode = DW_2500BASEX,
+		.pma_config = nxp_sja1110_2500basex_pma_config,
+	},
+};
+
 static const struct xpcs_id xpcs_id_list[] = {
 	{
 		.id = SYNOPSYS_XPCS_ID,
@@ -1048,6 +1065,10 @@ static const struct xpcs_id xpcs_id_list[] = {
 		.id = NXP_SJA1105_XPCS_ID,
 		.mask = SYNOPSYS_XPCS_MASK,
 		.compat = nxp_sja1105_xpcs_compat,
+	}, {
+		.id = NXP_SJA1110_XPCS_ID,
+		.mask = SYNOPSYS_XPCS_MASK,
+		.compat = nxp_sja1110_xpcs_compat,
 	},
 };
 
diff --git a/drivers/net/pcs/pcs-xpcs.h b/drivers/net/pcs/pcs-xpcs.h
index 3daf4276a158..35651d32a224 100644
--- a/drivers/net/pcs/pcs-xpcs.h
+++ b/drivers/net/pcs/pcs-xpcs.h
@@ -111,3 +111,5 @@ int xpcs_read(struct dw_xpcs *xpcs, int dev, u32 reg);
 int xpcs_write(struct dw_xpcs *xpcs, int dev, u32 reg, u16 val);
 
 int nxp_sja1105_sgmii_pma_config(struct dw_xpcs *xpcs);
+int nxp_sja1110_sgmii_pma_config(struct dw_xpcs *xpcs);
+int nxp_sja1110_2500basex_pma_config(struct dw_xpcs *xpcs);
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index c594f7cdc304..dae7dd8ac683 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -11,6 +11,7 @@
 #include <linux/phylink.h>
 
 #define NXP_SJA1105_XPCS_ID		0x00000010
+#define NXP_SJA1110_XPCS_ID		0x00000020
 
 /* AN mode */
 #define DW_AN_C73			1
-- 
cgit v1.2.3


From a853c68e29bb974ca0cc0a8eaf88c333217556aa Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 11 Jun 2021 23:05:27 +0300
Subject: net: pcs: xpcs: export xpcs_do_config and xpcs_link_up

The sja1105 hardware has a quirk in that some changes require a switch
reset, which loses all configuration. When the reset is initiated,
everything needs to be reprogrammed, including the MACs and the PCS.
This is currently done in sja1105_static_config_reload() - we manually
call sja1105_adjust_port_config(), sja1105_sgmii_pcs_config() and
sja1105_sgmii_pcs_force_speed() which are all internal functions.

There is a desire for sja1105 to use the common xpcs driver, and that
means that the equivalents of those functions, xpcs_do_config() and
xpcs_link_up() respectively, will no longer be local functions.

Forcing phylink to retrigger a resolve somehow, say by doing dev_close()
followed by dev_open() is not really an option, because the CPU port
might have a PCS as well, and there is no net device which we can close
and reopen for that. Additionally, the dev_close/dev_open sequence might
force a renegotiation of the copper-side link for SGMII ports connected
to a PHY, and this is undesirable as well, because the switch reset is
much quicker than a PHY autoneg, so we would have a lot more downtime.

The only solution I see is for the sja1105 driver to keep doing what
it's doing, and that means we need to export the equivalents from xpcs
for sja1105_sgmii_pcs_config and sja1105_sgmii_pcs_force_speed, and call
them directly in sja1105_static_config_reload(). This will be done
during the conversion patch.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/pcs-xpcs.c   | 10 ++++++----
 include/linux/pcs/pcs-xpcs.h |  4 ++++
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index b66e46fc88dc..63fda3fc40aa 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -757,8 +757,8 @@ static int xpcs_config_2500basex(struct dw_xpcs *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_MMD_CTRL, ret);
 }
 
-static int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
-			  unsigned int mode)
+int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
+		   unsigned int mode)
 {
 	const struct xpcs_compat *compat;
 	int ret;
@@ -797,6 +797,7 @@ static int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xpcs_do_config);
 
 static int xpcs_config(struct phylink_pcs *pcs, unsigned int mode,
 		       phy_interface_t interface,
@@ -945,8 +946,8 @@ static void xpcs_link_up_sgmii(struct dw_xpcs *xpcs, unsigned int mode,
 		pr_err("%s: xpcs_write returned %pe\n", __func__, ERR_PTR(ret));
 }
 
-static void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
-			 phy_interface_t interface, int speed, int duplex)
+void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
+		  phy_interface_t interface, int speed, int duplex)
 {
 	struct dw_xpcs *xpcs = phylink_pcs_to_xpcs(pcs);
 
@@ -955,6 +956,7 @@ static void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
 	if (interface == PHY_INTERFACE_MODE_SGMII)
 		return xpcs_link_up_sgmii(xpcs, mode, speed, duplex);
 }
+EXPORT_SYMBOL_GPL(xpcs_link_up);
 
 static u32 xpcs_get_id(struct dw_xpcs *xpcs)
 {
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index dae7dd8ac683..add077a81b21 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -27,6 +27,10 @@ struct dw_xpcs {
 };
 
 int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface);
+void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode,
+		  phy_interface_t interface, int speed, int duplex);
+int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface,
+		   unsigned int mode);
 void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported,
 		   struct phylink_link_state *state);
 int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns,
-- 
cgit v1.2.3


From a56c286865692ac12291afe4c66198915c6b08f9 Mon Sep 17 00:00:00 2001
From: Steen Hegelund <steen.hegelund@microchip.com>
Date: Fri, 11 Jun 2021 14:54:51 +0200
Subject: net: phy: Add 25G BASE-R interface mode

Add 25gbase-r phy interface mode

Signed-off-by: Steen Hegelund <steen.hegelund@microchip.com>
Signed-off-by: Bjarni Jonasson <bjarni.jonasson@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/phy.rst | 6 ++++++
 include/linux/phy.h              | 4 ++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index 3f05d50ecd6e..571ba08386e7 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -292,6 +292,12 @@ Some of the interface modes are described below:
     Note: due to legacy usage, some 10GBASE-R usage incorrectly makes
     use of this definition.
 
+``PHY_INTERFACE_MODE_25GBASER``
+    This is the IEEE 802.3 PCS Clause 107 defined 25GBASE-R protocol.
+    The PCS is identical to 10GBASE-R, i.e. 64B/66B encoded
+    running 2.5 as fast, giving a fixed bit rate of 25.78125 Gbaud.
+    Please refer to the IEEE standard for further information.
+
 ``PHY_INTERFACE_MODE_100BASEX``
     This defines IEEE 802.3 Clause 24.  The link operates at a fixed data
     rate of 125Mpbs using a 4B/5B encoding scheme, resulting in an underlying
diff --git a/include/linux/phy.h b/include/linux/phy.h
index b60694734b07..3b80dc3ed68b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -112,6 +112,7 @@ extern const int phy_10gbit_features_array[1];
  * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI
  * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface
  * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR
+ * @PHY_INTERFACE_MODE_25GBASER: 25G BaseR
  * @PHY_INTERFACE_MODE_USXGMII:  Universal Serial 10GE MII
  * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN
  * @PHY_INTERFACE_MODE_MAX: Book keeping
@@ -147,6 +148,7 @@ typedef enum {
 	PHY_INTERFACE_MODE_XAUI,
 	/* 10GBASE-R, XFI, SFI - single lane 10G Serdes */
 	PHY_INTERFACE_MODE_10GBASER,
+	PHY_INTERFACE_MODE_25GBASER,
 	PHY_INTERFACE_MODE_USXGMII,
 	/* 10GBASE-KR - with Clause 73 AN */
 	PHY_INTERFACE_MODE_10GKR,
@@ -223,6 +225,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "xaui";
 	case PHY_INTERFACE_MODE_10GBASER:
 		return "10gbase-r";
+	case PHY_INTERFACE_MODE_25GBASER:
+		return "25gbase-r";
 	case PHY_INTERFACE_MODE_USXGMII:
 		return "usxgmii";
 	case PHY_INTERFACE_MODE_10GKR:
-- 
cgit v1.2.3


From ea6932d70e223e02fea3ae20a4feff05d7c1ea9a Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Fri, 11 Jun 2021 22:29:59 +0800
Subject: net: make get_net_ns return error if NET_NS is disabled

There is a panic in socket ioctl cmd SIOCGSKNS when NET_NS is not enabled.
The reason is that nsfs tries to access ns->ops but the proc_ns_operations
is not implemented in this case.

[7.670023] Unable to handle kernel NULL pointer dereference at virtual address 00000010
[7.670268] pgd = 32b54000
[7.670544] [00000010] *pgd=00000000
[7.671861] Internal error: Oops: 5 [#1] SMP ARM
[7.672315] Modules linked in:
[7.672918] CPU: 0 PID: 1 Comm: systemd Not tainted 5.13.0-rc3-00375-g6799d4f2da49 #16
[7.673309] Hardware name: Generic DT based system
[7.673642] PC is at nsfs_evict+0x24/0x30
[7.674486] LR is at clear_inode+0x20/0x9c

The same to tun SIOCGSKNS command.

To fix this problem, we make get_net_ns() return -EINVAL when NET_NS is
disabled. Meanwhile move it to right place net/core/net_namespace.c.

Signed-off-by: Changbin Du <changbin.du@gmail.com>
Fixes: c62cce2caee5 ("net: add an ioctl to get a socket network namespace")
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h      |  2 --
 include/net/net_namespace.h |  7 +++++++
 net/core/net_namespace.c    | 12 ++++++++++++
 net/socket.c                | 13 -------------
 4 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index b8fc5c53ba6f..0d8e3dcb7f88 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -438,6 +438,4 @@ extern int __sys_socketpair(int family, int type, int protocol,
 			    int __user *usockvec);
 extern int __sys_shutdown_sock(struct socket *sock, int how);
 extern int __sys_shutdown(int fd, int how);
-
-extern struct ns_common *get_net_ns(struct ns_common *ns);
 #endif /* _LINUX_SOCKET_H */
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index fa5887143f0d..6412d7833d97 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -184,6 +184,8 @@ struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
 void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);
 
 void net_ns_barrier(void);
+
+struct ns_common *get_net_ns(struct ns_common *ns);
 #else /* CONFIG_NET_NS */
 #include <linux/sched.h>
 #include <linux/nsproxy.h>
@@ -203,6 +205,11 @@ static inline void net_ns_get_ownership(const struct net *net,
 }
 
 static inline void net_ns_barrier(void) {}
+
+static inline struct ns_common *get_net_ns(struct ns_common *ns)
+{
+	return ERR_PTR(-EINVAL);
+}
 #endif /* CONFIG_NET_NS */
 
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 43b6ac4c4439..cc8dafb25d61 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -641,6 +641,18 @@ void __put_net(struct net *net)
 }
 EXPORT_SYMBOL_GPL(__put_net);
 
+/**
+ * get_net_ns - increment the refcount of the network namespace
+ * @ns: common namespace (net)
+ *
+ * Returns the net's common namespace.
+ */
+struct ns_common *get_net_ns(struct ns_common *ns)
+{
+	return &get_net(container_of(ns, struct net, ns))->ns;
+}
+EXPORT_SYMBOL_GPL(get_net_ns);
+
 struct net *get_net_ns_by_fd(int fd)
 {
 	struct file *file;
diff --git a/net/socket.c b/net/socket.c
index 27e3e7d53f8e..4f2c6d2795d0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1072,19 +1072,6 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
  *	what to do with it - that's up to the protocol still.
  */
 
-/**
- *	get_net_ns - increment the refcount of the network namespace
- *	@ns: common namespace (net)
- *
- *	Returns the net's common namespace.
- */
-
-struct ns_common *get_net_ns(struct ns_common *ns)
-{
-	return &get_net(container_of(ns, struct net, ns))->ns;
-}
-EXPORT_SYMBOL_GPL(get_net_ns);
-
 static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	struct socket *sock;
-- 
cgit v1.2.3


From 88b710532e53de2466d1033fb1d5125aabf3215a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Sat, 12 Jun 2021 10:20:56 +0200
Subject: wwan: add interface creation support

Add support to create (and destroy) interfaces via a new
rtnetlink kind "wwan". The responsible driver has to use
the new wwan_register_ops() to make this possible.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/wwan_core.c | 245 +++++++++++++++++++++++++++++++++++++++++--
 include/linux/wwan.h         |  24 +++++
 include/uapi/linux/wwan.h    |  16 +++
 net/core/rtnetlink.c         |   1 +
 4 files changed, 279 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/wwan.h

(limited to 'include/linux')

diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index 45a41aee8958..7e728042fc41 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -14,6 +14,8 @@
 #include <linux/types.h>
 #include <linux/termios.h>
 #include <linux/wwan.h>
+#include <net/rtnetlink.h>
+#include <uapi/linux/wwan.h>
 
 /* Maximum number of minors in use */
 #define WWAN_MAX_MINORS		(1 << MINORBITS)
@@ -35,10 +37,16 @@ static int wwan_major;
  *
  * @id: WWAN device unique ID.
  * @dev: Underlying device.
+ * @port_id: Current available port ID to pick.
+ * @ops: wwan device ops
+ * @ops_ctxt: context to pass to ops
  */
 struct wwan_device {
 	unsigned int id;
 	struct device dev;
+	atomic_t port_id;
+	const struct wwan_ops *ops;
+	void *ops_ctxt;
 };
 
 /**
@@ -102,7 +110,8 @@ static const struct device_type wwan_dev_type = {
 
 static int wwan_dev_parent_match(struct device *dev, const void *parent)
 {
-	return (dev->type == &wwan_dev_type && dev->parent == parent);
+	return (dev->type == &wwan_dev_type &&
+		(dev->parent == parent || dev == parent));
 }
 
 static struct wwan_device *wwan_dev_get_by_parent(struct device *parent)
@@ -116,6 +125,23 @@ static struct wwan_device *wwan_dev_get_by_parent(struct device *parent)
 	return to_wwan_dev(dev);
 }
 
+static int wwan_dev_name_match(struct device *dev, const void *name)
+{
+	return dev->type == &wwan_dev_type &&
+	       strcmp(dev_name(dev), name) == 0;
+}
+
+static struct wwan_device *wwan_dev_get_by_name(const char *name)
+{
+	struct device *dev;
+
+	dev = class_find_device(wwan_class, NULL, name, wwan_dev_name_match);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	return to_wwan_dev(dev);
+}
+
 /* This function allocates and registers a new WWAN device OR if a WWAN device
  * already exist for the given parent, it gets a reference and return it.
  * This function is not exported (for now), it is called indirectly via
@@ -180,9 +206,14 @@ static void wwan_remove_dev(struct wwan_device *wwandev)
 	/* WWAN device is created and registered (get+add) along with its first
 	 * child port, and subsequent port registrations only grab a reference
 	 * (get). The WWAN device must then be unregistered (del+put) along with
-	 * its latest port, and reference simply dropped (put) otherwise.
+	 * its last port, and reference simply dropped (put) otherwise. In the
+	 * same fashion, we must not unregister it when the ops are still there.
 	 */
-	ret = device_for_each_child(&wwandev->dev, NULL, is_wwan_child);
+	if (wwandev->ops)
+		ret = 1;
+	else
+		ret = device_for_each_child(&wwandev->dev, NULL, is_wwan_child);
+
 	if (!ret)
 		device_unregister(&wwandev->dev);
 	else
@@ -750,26 +781,226 @@ static const struct file_operations wwan_port_fops = {
 	.llseek = noop_llseek,
 };
 
+/**
+ * wwan_register_ops - register WWAN device ops
+ * @parent: Device to use as parent and shared by all WWAN ports and
+ *	created netdevs
+ * @ops: operations to register
+ * @ctxt: context to pass to operations
+ *
+ * Returns: 0 on success, a negative error code on failure
+ */
+int wwan_register_ops(struct device *parent, const struct wwan_ops *ops,
+		      void *ctxt)
+{
+	struct wwan_device *wwandev;
+
+	if (WARN_ON(!parent || !ops))
+		return -EINVAL;
+
+	wwandev = wwan_create_dev(parent);
+	if (!wwandev)
+		return -ENOMEM;
+
+	if (WARN_ON(wwandev->ops)) {
+		wwan_remove_dev(wwandev);
+		return -EBUSY;
+	}
+
+	if (!try_module_get(ops->owner)) {
+		wwan_remove_dev(wwandev);
+		return -ENODEV;
+	}
+
+	wwandev->ops = ops;
+	wwandev->ops_ctxt = ctxt;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(wwan_register_ops);
+
+/**
+ * wwan_unregister_ops - remove WWAN device ops
+ * @parent: Device to use as parent and shared by all WWAN ports and
+ *	created netdevs
+ */
+void wwan_unregister_ops(struct device *parent)
+{
+	struct wwan_device *wwandev = wwan_dev_get_by_parent(parent);
+	bool has_ops;
+
+	if (WARN_ON(IS_ERR(wwandev)))
+		return;
+
+	has_ops = wwandev->ops;
+
+	/* put the reference obtained by wwan_dev_get_by_parent(),
+	 * we should still have one (that the owner is giving back
+	 * now) due to the ops being assigned, check that below
+	 * and return if not.
+	 */
+	put_device(&wwandev->dev);
+
+	if (WARN_ON(!has_ops))
+		return;
+
+	module_put(wwandev->ops->owner);
+
+	wwandev->ops = NULL;
+	wwandev->ops_ctxt = NULL;
+	wwan_remove_dev(wwandev);
+}
+EXPORT_SYMBOL_GPL(wwan_unregister_ops);
+
+static int wwan_rtnl_validate(struct nlattr *tb[], struct nlattr *data[],
+			      struct netlink_ext_ack *extack)
+{
+	if (!data)
+		return -EINVAL;
+
+	if (!tb[IFLA_PARENT_DEV_NAME])
+		return -EINVAL;
+
+	if (!data[IFLA_WWAN_LINK_ID])
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct device_type wwan_type = { .name = "wwan" };
+
+static struct net_device *wwan_rtnl_alloc(struct nlattr *tb[],
+					  const char *ifname,
+					  unsigned char name_assign_type,
+					  unsigned int num_tx_queues,
+					  unsigned int num_rx_queues)
+{
+	const char *devname = nla_data(tb[IFLA_PARENT_DEV_NAME]);
+	struct wwan_device *wwandev = wwan_dev_get_by_name(devname);
+	struct net_device *dev;
+
+	if (IS_ERR(wwandev))
+		return ERR_CAST(wwandev);
+
+	/* only supported if ops were registered (not just ports) */
+	if (!wwandev->ops) {
+		dev = ERR_PTR(-EOPNOTSUPP);
+		goto out;
+	}
+
+	dev = alloc_netdev_mqs(wwandev->ops->priv_size, ifname, name_assign_type,
+			       wwandev->ops->setup, num_tx_queues, num_rx_queues);
+
+	if (dev) {
+		SET_NETDEV_DEV(dev, &wwandev->dev);
+		SET_NETDEV_DEVTYPE(dev, &wwan_type);
+	}
+
+out:
+	/* release the reference */
+	put_device(&wwandev->dev);
+	return dev;
+}
+
+static int wwan_rtnl_newlink(struct net *src_net, struct net_device *dev,
+			     struct nlattr *tb[], struct nlattr *data[],
+			     struct netlink_ext_ack *extack)
+{
+	struct wwan_device *wwandev = wwan_dev_get_by_parent(dev->dev.parent);
+	u32 link_id = nla_get_u32(data[IFLA_WWAN_LINK_ID]);
+	int ret;
+
+	if (IS_ERR(wwandev))
+		return PTR_ERR(wwandev);
+
+	/* shouldn't have a netdev (left) with us as parent so WARN */
+	if (WARN_ON(!wwandev->ops)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (wwandev->ops->newlink)
+		ret = wwandev->ops->newlink(wwandev->ops_ctxt, dev,
+					    link_id, extack);
+	else
+		ret = register_netdevice(dev);
+
+out:
+	/* release the reference */
+	put_device(&wwandev->dev);
+	return ret;
+}
+
+static void wwan_rtnl_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct wwan_device *wwandev = wwan_dev_get_by_parent(dev->dev.parent);
+
+	if (IS_ERR(wwandev))
+		return;
+
+	/* shouldn't have a netdev (left) with us as parent so WARN */
+	if (WARN_ON(!wwandev->ops))
+		goto out;
+
+	if (wwandev->ops->dellink)
+		wwandev->ops->dellink(wwandev->ops_ctxt, dev, head);
+	else
+		unregister_netdevice(dev);
+
+out:
+	/* release the reference */
+	put_device(&wwandev->dev);
+}
+
+static const struct nla_policy wwan_rtnl_policy[IFLA_WWAN_MAX + 1] = {
+	[IFLA_WWAN_LINK_ID] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops wwan_rtnl_link_ops __read_mostly = {
+	.kind = "wwan",
+	.maxtype = __IFLA_WWAN_MAX,
+	.alloc = wwan_rtnl_alloc,
+	.validate = wwan_rtnl_validate,
+	.newlink = wwan_rtnl_newlink,
+	.dellink = wwan_rtnl_dellink,
+	.policy = wwan_rtnl_policy,
+};
+
 static int __init wwan_init(void)
 {
+	int err;
+
+	err = rtnl_link_register(&wwan_rtnl_link_ops);
+	if (err)
+		return err;
+
 	wwan_class = class_create(THIS_MODULE, "wwan");
-	if (IS_ERR(wwan_class))
-		return PTR_ERR(wwan_class);
+	if (IS_ERR(wwan_class)) {
+		err = PTR_ERR(wwan_class);
+		goto unregister;
+	}
 
 	/* chrdev used for wwan ports */
 	wwan_major = __register_chrdev(0, 0, WWAN_MAX_MINORS, "wwan_port",
 				       &wwan_port_fops);
 	if (wwan_major < 0) {
-		class_destroy(wwan_class);
-		return wwan_major;
+		err = wwan_major;
+		goto destroy;
 	}
 
 	return 0;
+
+destroy:
+	class_destroy(wwan_class);
+unregister:
+	rtnl_link_unregister(&wwan_rtnl_link_ops);
+	return err;
 }
 
 static void __exit wwan_exit(void)
 {
 	__unregister_chrdev(wwan_major, 0, WWAN_MAX_MINORS, "wwan_port");
+	rtnl_link_unregister(&wwan_rtnl_link_ops);
 	class_destroy(wwan_class);
 }
 
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index fa33cc16d931..430a3a0817de 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -7,6 +7,7 @@
 #include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
+#include <linux/netlink.h>
 
 /**
  * enum wwan_port_type - WWAN port types
@@ -116,4 +117,27 @@ void wwan_port_txon(struct wwan_port *port);
  */
 void *wwan_port_get_drvdata(struct wwan_port *port);
 
+/**
+ * struct wwan_ops - WWAN device ops
+ * @owner: module owner of the WWAN ops
+ * @priv_size: size of private netdev data area
+ * @setup: set up a new netdev
+ * @newlink: register the new netdev
+ * @dellink: remove the given netdev
+ */
+struct wwan_ops {
+	struct module *owner;
+	unsigned int priv_size;
+	void (*setup)(struct net_device *dev);
+	int (*newlink)(void *ctxt, struct net_device *dev,
+		       u32 if_id, struct netlink_ext_ack *extack);
+	void (*dellink)(void *ctxt, struct net_device *dev,
+			struct list_head *head);
+};
+
+int wwan_register_ops(struct device *parent, const struct wwan_ops *ops,
+		      void *ctxt);
+
+void wwan_unregister_ops(struct device *parent);
+
 #endif /* __WWAN_H */
diff --git a/include/uapi/linux/wwan.h b/include/uapi/linux/wwan.h
new file mode 100644
index 000000000000..32a2720b4d11
--- /dev/null
+++ b/include/uapi/linux/wwan.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2021 Intel Corporation.
+ */
+#ifndef _UAPI_WWAN_H_
+#define _UAPI_WWAN_H_
+
+enum {
+	IFLA_WWAN_UNSPEC,
+	IFLA_WWAN_LINK_ID, /* u32 */
+
+	__IFLA_WWAN_MAX
+};
+#define IFLA_WWAN_MAX (__IFLA_WWAN_MAX - 1)
+
+#endif /* _UAPI_WWAN_H_ */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 170e97f3b3c6..5baa86bca876 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1890,6 +1890,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PERM_ADDRESS]	= { .type = NLA_REJECT },
 	[IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
 	[IFLA_NEW_IFINDEX]	= NLA_POLICY_MIN(NLA_S32, 1),
+	[IFLA_PARENT_DEV_NAME]	= { .type = NLA_NUL_STRING },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
-- 
cgit v1.2.3


From be754f6435936e78dafe0ebb9d1e9d52c3bde842 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Sat, 12 Jun 2021 09:37:34 -0500
Subject: net: qualcomm: rmnet: trailer value is a checksum

The csum_value field in the rmnet_map_dl_csum_trailer structure is a
"real" Internet checksum.  It is a 16 bit value, in big endian format,
which represents an inverted ones' complement sum over pairs of bytes.

Make that clear by changing its type to __sum16.

This makes a typecast in rmnet_map_ipv4_dl_csum_trailer() and
another in rmnet_map_ipv6_dl_csum_trailer() unnecessary.

Signed-off-by: Alex Elder <elder@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 5 ++---
 include/linux/if_rmnet.h                             | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index 033b8ad3d735..610c8b5a8f46 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -87,7 +87,7 @@ rmnet_map_ipv4_dl_csum_trailer(struct sk_buff *skb,
 	 * header checksum value; it is already accounted for in the
 	 * checksum value found in the trailer.
 	 */
-	ip_payload_csum = (__force __sum16)~csum_trailer->csum_value;
+	ip_payload_csum = ~csum_trailer->csum_value;
 
 	pseudo_csum = ~csum_tcpudp_magic(ip4h->saddr, ip4h->daddr,
 					 ntohs(ip4h->tot_len) - ip4h->ihl * 4,
@@ -132,8 +132,7 @@ rmnet_map_ipv6_dl_csum_trailer(struct sk_buff *skb,
 	 * checksum computed over the pseudo header.
 	 */
 	ip_header_csum = (__force __be16)ip_fast_csum(ip6h, sizeof(*ip6h) / 4);
-	ip6_payload_csum = ~csum16_sub((__force __sum16)csum_trailer->csum_value,
-				       ip_header_csum);
+	ip6_payload_csum = ~csum16_sub(csum_trailer->csum_value, ip_header_csum);
 
 	length = (ip6h->nexthdr == IPPROTO_UDP) ?
 		 ntohs(((struct udphdr *)txporthdr)->len) :
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index be17610a981e..10e7521ecb6c 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -25,7 +25,7 @@ struct rmnet_map_dl_csum_trailer {
 	u8 flags;			/* MAP_CSUM_DL_VALID_FLAG */
 	__be16 csum_start_offset;
 	__be16 csum_length;
-	__be16 csum_value;
+	__sum16 csum_value;
 } __aligned(1);
 
 /* rmnet_map_dl_csum_trailer flags field:
-- 
cgit v1.2.3


From 2e3025434a6ba090c85871a1d4080ff784109e1f Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Fri, 11 Jun 2021 09:54:42 +0800
Subject: mm: relocate 'write_protect_seq' in struct mm_struct

0day robot reported a 9.2% regression for will-it-scale mmap1 test
case[1], caused by commit 57efa1fe5957 ("mm/gup: prevent gup_fast from
racing with COW during fork").

Further debug shows the regression is due to that commit changes the
offset of hot fields 'mmap_lock' inside structure 'mm_struct', thus some
cache alignment changes.

From the perf data, the contention for 'mmap_lock' is very severe and
takes around 95% cpu cycles, and it is a rw_semaphore

        struct rw_semaphore {
                atomic_long_t count;	/* 8 bytes */
                atomic_long_t owner;	/* 8 bytes */
                struct optimistic_spin_queue osq; /* spinner MCS lock */
                ...

Before commit 57efa1fe5957 adds the 'write_protect_seq', it happens to
have a very optimal cache alignment layout, as Linus explained:

 "and before the addition of the 'write_protect_seq' field, the
  mmap_sem was at offset 120 in 'struct mm_struct'.

  Which meant that count and owner were in two different cachelines,
  and then when you have contention and spend time in
  rwsem_down_write_slowpath(), this is probably *exactly* the kind
  of layout you want.

  Because first the rwsem_write_trylock() will do a cmpxchg on the
  first cacheline (for the optimistic fast-path), and then in the
  case of contention, rwsem_down_write_slowpath() will just access
  the second cacheline.

  Which is probably just optimal for a load that spends a lot of
  time contended - new waiters touch that first cacheline, and then
  they queue themselves up on the second cacheline."

After the commit, the rw_semaphore is at offset 128, which means the
'count' and 'owner' fields are now in the same cacheline, and causes
more cache bouncing.

Currently there are 3 "#ifdef CONFIG_XXX" before 'mmap_lock' which will
affect its offset:

  CONFIG_MMU
  CONFIG_MEMBARRIER
  CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES

The layout above is on 64 bits system with 0day's default kernel config
(similar to RHEL-8.3's config), in which all these 3 options are 'y'.
And the layout can vary with different kernel configs.

Relayouting a structure is usually a double-edged sword, as sometimes it
can helps one case, but hurt other cases.  For this case, one solution
is, as the newly added 'write_protect_seq' is a 4 bytes long seqcount_t
(when CONFIG_DEBUG_LOCK_ALLOC=n), placing it into an existing 4 bytes
hole in 'mm_struct' will not change other fields' alignment, while
restoring the regression.

Link: https://lore.kernel.org/lkml/20210525031636.GB7744@xsang-OptiPlex-9020/ [1]
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5aacc1c10a45..8f0fb62e8975 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -445,13 +445,6 @@ struct mm_struct {
 		 */
 		atomic_t has_pinned;
 
-		/**
-		 * @write_protect_seq: Locked when any thread is write
-		 * protecting pages mapped by this mm to enforce a later COW,
-		 * for instance during page table copying for fork().
-		 */
-		seqcount_t write_protect_seq;
-
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* PTE page table pages */
 #endif
@@ -460,6 +453,18 @@ struct mm_struct {
 		spinlock_t page_table_lock; /* Protects page tables and some
 					     * counters
 					     */
+		/*
+		 * With some kernel config, the current mmap_lock's offset
+		 * inside 'mm_struct' is at 0x120, which is very optimal, as
+		 * its two hot fields 'count' and 'owner' sit in 2 different
+		 * cachelines,  and when mmap_lock is highly contended, both
+		 * of the 2 fields will be accessed frequently, current layout
+		 * will help to reduce cache bouncing.
+		 *
+		 * So please be careful with adding new fields before
+		 * mmap_lock, which can easily push the 2 fields into one
+		 * cacheline.
+		 */
 		struct rw_semaphore mmap_lock;
 
 		struct list_head mmlist; /* List of maybe swapped mm's.	These
@@ -480,7 +485,15 @@ struct mm_struct {
 		unsigned long stack_vm;	   /* VM_STACK */
 		unsigned long def_flags;
 
+		/**
+		 * @write_protect_seq: Locked when any thread is write
+		 * protecting pages mapped by this mm to enforce a later COW,
+		 * for instance during page table copying for fork().
+		 */
+		seqcount_t write_protect_seq;
+
 		spinlock_t arg_lock; /* protect the below fields */
+
 		unsigned long start_code, end_code, start_data, end_data;
 		unsigned long start_brk, brk, start_stack;
 		unsigned long arg_start, arg_end, env_start, env_end;
-- 
cgit v1.2.3


From 718fb2bcf1034232599045fc710644d903c2af4b Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@deviqon.com>
Date: Thu, 13 May 2021 15:07:45 +0300
Subject: iio: adc: ad_sigma_delta: introduct
 devm_ad_sd_setup_buffer_and_trigger()

This is a version of ad_sd_setup_buffer_and_trigger() with all underlying
functions (that are used) being replaced with their device-managed
variants.

One thing to take care here is with {devm_}iio_trigger_alloc(), where both
functions take a parent-device object as the first parameter.

To make sure nothing quirky is happening, the devm_ad_sd_probe_trigger()
function is checking that the provided 'dev' reference is the same as the
one stored on the 'struct ad_sigma_delta' driver data.

Signed-off-by: Alexandru Ardelean <aardelean@deviqon.com>
Link: https://lore.kernel.org/r/20210513120752.90074-6-aardelean@deviqon.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad_sigma_delta.c       | 60 ++++++++++++++++++++++++++++++++++
 include/linux/iio/adc/ad_sigma_delta.h |  3 ++
 2 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index 69b979331ccd..d5801a47be07 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -513,6 +513,46 @@ error_ret:
 	return ret;
 }
 
+static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_dev)
+{
+	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
+	int ret;
+
+	if (dev != &sigma_delta->spi->dev) {
+		dev_err(dev, "Trigger parent should be '%s', got '%s'\n",
+			dev_name(dev), dev_name(&sigma_delta->spi->dev));
+		return -EFAULT;
+	}
+
+	sigma_delta->trig = devm_iio_trigger_alloc(dev, "%s-dev%d", indio_dev->name,
+						   iio_device_id(indio_dev));
+	if (sigma_delta->trig == NULL)
+		return -ENOMEM;
+
+	sigma_delta->trig->ops = &ad_sd_trigger_ops;
+	init_completion(&sigma_delta->completion);
+
+	sigma_delta->irq_dis = true;
+	ret = devm_request_irq(dev, sigma_delta->spi->irq,
+			       ad_sd_data_rdy_trig_poll,
+			       sigma_delta->info->irq_flags | IRQF_NO_AUTOEN,
+			       indio_dev->name,
+			       sigma_delta);
+	if (ret)
+		return ret;
+
+	iio_trigger_set_drvdata(sigma_delta->trig, sigma_delta);
+
+	ret = devm_iio_trigger_register(dev, sigma_delta->trig);
+	if (ret)
+		return ret;
+
+	/* select default trigger */
+	indio_dev->trig = iio_trigger_get(sigma_delta->trig);
+
+	return 0;
+}
+
 static void ad_sd_remove_trigger(struct iio_dev *indio_dev)
 {
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
@@ -556,6 +596,26 @@ void ad_sd_cleanup_buffer_and_trigger(struct iio_dev *indio_dev)
 }
 EXPORT_SYMBOL_GPL(ad_sd_cleanup_buffer_and_trigger);
 
+/**
+ * devm_ad_sd_setup_buffer_and_trigger() - Device-managed buffer & trigger setup
+ * @dev: Device object to which to bind the life-time of the resources attached
+ * @indio_dev: The IIO device
+ */
+int devm_ad_sd_setup_buffer_and_trigger(struct device *dev, struct iio_dev *indio_dev)
+{
+	int ret;
+
+	ret = devm_iio_triggered_buffer_setup(dev, indio_dev,
+					      &iio_pollfunc_store_time,
+					      &ad_sd_trigger_handler,
+					      &ad_sd_buffer_setup_ops);
+	if (ret)
+		return ret;
+
+	return devm_ad_sd_probe_trigger(dev, indio_dev);
+}
+EXPORT_SYMBOL_GPL(devm_ad_sd_setup_buffer_and_trigger);
+
 /**
  * ad_sd_init() - Initializes a ad_sigma_delta struct
  * @sigma_delta: The ad_sigma_delta device
diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h
index 7199280d89ca..be81ad39fb7a 100644
--- a/include/linux/iio/adc/ad_sigma_delta.h
+++ b/include/linux/iio/adc/ad_sigma_delta.h
@@ -26,6 +26,7 @@ struct ad_sd_calib_data {
 };
 
 struct ad_sigma_delta;
+struct device;
 struct iio_dev;
 
 /**
@@ -135,6 +136,8 @@ int ad_sd_init(struct ad_sigma_delta *sigma_delta, struct iio_dev *indio_dev,
 int ad_sd_setup_buffer_and_trigger(struct iio_dev *indio_dev);
 void ad_sd_cleanup_buffer_and_trigger(struct iio_dev *indio_dev);
 
+int devm_ad_sd_setup_buffer_and_trigger(struct device *dev, struct iio_dev *indio_dev);
+
 int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig);
 
 #endif
-- 
cgit v1.2.3


From 4b36151d7482654ec50ddc831f19a3e76c8ba4dd Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@deviqon.com>
Date: Thu, 13 May 2021 15:07:52 +0300
Subject: iio: adc: ad_sigma_delta: remove
 ad_sd_{setup,cleanup}_buffer_and_trigger()

Since all AD Sigma-Delta drivers now use the
devm_ad_sd_setup_buffer_and_trigger() function, we can remove the old
ad_sd_{setup,cleanup}_buffer_and_trigger() functions.

This way we can discourage new drivers that use the ad_sigma_delta
lib-driver to use these (older functions).

Signed-off-by: Alexandru Ardelean <aardelean@deviqon.com>
Link: https://lore.kernel.org/r/20210513120752.90074-13-aardelean@deviqon.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad_sigma_delta.c       | 86 ----------------------------------
 include/linux/iio/adc/ad_sigma_delta.h |  3 --
 2 files changed, 89 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index d5801a47be07..1d652d9b2f5c 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -470,49 +470,6 @@ EXPORT_SYMBOL_GPL(ad_sd_validate_trigger);
 static const struct iio_trigger_ops ad_sd_trigger_ops = {
 };
 
-static int ad_sd_probe_trigger(struct iio_dev *indio_dev)
-{
-	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
-	int ret;
-
-	sigma_delta->trig = iio_trigger_alloc(&sigma_delta->spi->dev,
-					      "%s-dev%d", indio_dev->name,
-					      iio_device_id(indio_dev));
-	if (sigma_delta->trig == NULL) {
-		ret = -ENOMEM;
-		goto error_ret;
-	}
-	sigma_delta->trig->ops = &ad_sd_trigger_ops;
-	init_completion(&sigma_delta->completion);
-
-	sigma_delta->irq_dis = true;
-	ret = request_irq(sigma_delta->spi->irq,
-			  ad_sd_data_rdy_trig_poll,
-			  sigma_delta->info->irq_flags | IRQF_NO_AUTOEN,
-			  indio_dev->name,
-			  sigma_delta);
-	if (ret)
-		goto error_free_trig;
-
-	iio_trigger_set_drvdata(sigma_delta->trig, sigma_delta);
-
-	ret = iio_trigger_register(sigma_delta->trig);
-	if (ret)
-		goto error_free_irq;
-
-	/* select default trigger */
-	indio_dev->trig = iio_trigger_get(sigma_delta->trig);
-
-	return 0;
-
-error_free_irq:
-	free_irq(sigma_delta->spi->irq, sigma_delta);
-error_free_trig:
-	iio_trigger_free(sigma_delta->trig);
-error_ret:
-	return ret;
-}
-
 static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_dev)
 {
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
@@ -553,49 +510,6 @@ static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_de
 	return 0;
 }
 
-static void ad_sd_remove_trigger(struct iio_dev *indio_dev)
-{
-	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
-
-	iio_trigger_unregister(sigma_delta->trig);
-	free_irq(sigma_delta->spi->irq, sigma_delta);
-	iio_trigger_free(sigma_delta->trig);
-}
-
-/**
- * ad_sd_setup_buffer_and_trigger() -
- * @indio_dev: The IIO device
- */
-int ad_sd_setup_buffer_and_trigger(struct iio_dev *indio_dev)
-{
-	int ret;
-
-	ret = iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time,
-			&ad_sd_trigger_handler, &ad_sd_buffer_setup_ops);
-	if (ret)
-		return ret;
-
-	ret = ad_sd_probe_trigger(indio_dev);
-	if (ret) {
-		iio_triggered_buffer_cleanup(indio_dev);
-		return ret;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ad_sd_setup_buffer_and_trigger);
-
-/**
- * ad_sd_cleanup_buffer_and_trigger() -
- * @indio_dev: The IIO device
- */
-void ad_sd_cleanup_buffer_and_trigger(struct iio_dev *indio_dev)
-{
-	ad_sd_remove_trigger(indio_dev);
-	iio_triggered_buffer_cleanup(indio_dev);
-}
-EXPORT_SYMBOL_GPL(ad_sd_cleanup_buffer_and_trigger);
-
 /**
  * devm_ad_sd_setup_buffer_and_trigger() - Device-managed buffer & trigger setup
  * @dev: Device object to which to bind the life-time of the resources attached
diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h
index be81ad39fb7a..c525fd51652f 100644
--- a/include/linux/iio/adc/ad_sigma_delta.h
+++ b/include/linux/iio/adc/ad_sigma_delta.h
@@ -133,9 +133,6 @@ int ad_sd_calibrate_all(struct ad_sigma_delta *sigma_delta,
 int ad_sd_init(struct ad_sigma_delta *sigma_delta, struct iio_dev *indio_dev,
 	struct spi_device *spi, const struct ad_sigma_delta_info *info);
 
-int ad_sd_setup_buffer_and_trigger(struct iio_dev *indio_dev);
-void ad_sd_cleanup_buffer_and_trigger(struct iio_dev *indio_dev);
-
 int devm_ad_sd_setup_buffer_and_trigger(struct device *dev, struct iio_dev *indio_dev);
 
 int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig);
-- 
cgit v1.2.3


From 29a269c6f54825c643a5c35762a2829ba5be67f6 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 11 May 2021 13:21:32 +0800
Subject: soundwire: intel: move to auxiliary bus

Now that the auxiliary_bus exists, there's no reason to use platform
devices as children of a PCI device any longer.

This patch refactors the code by extending a basic auxiliary device
with Intel link-specific structures that need to be passed between
controller and link levels. This refactoring is much cleaner with no
need for cross-pointers between device and link structures.

Note that the auxiliary bus API has separate init and add steps, which
requires more attention in the error unwinding paths. The main loop
needs to deal with kfree() and auxiliary_device_uninit() for the
current iteration before jumping to the common label which releases
everything allocated in prior iterations.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20210511052132.28150-1-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/Kconfig           |   1 +
 drivers/soundwire/intel.c           |  56 +++++----
 drivers/soundwire/intel.h           |  14 ++-
 drivers/soundwire/intel_init.c      | 232 ++++++++++++++++++++++++------------
 include/linux/soundwire/sdw_intel.h |   6 +-
 5 files changed, 202 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/Kconfig b/drivers/soundwire/Kconfig
index 016e74230bb7..2b7795233282 100644
--- a/drivers/soundwire/Kconfig
+++ b/drivers/soundwire/Kconfig
@@ -25,6 +25,7 @@ config SOUNDWIRE_INTEL
 	tristate "Intel SoundWire Master driver"
 	select SOUNDWIRE_CADENCE
 	select SOUNDWIRE_GENERIC_ALLOCATION
+	select AUXILIARY_BUS
 	depends on ACPI && SND_SOC
 	help
 	  SoundWire Intel Master driver.
diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index fd95f94630b1..c11e3d8cd308 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -11,7 +11,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
-#include <linux/platform_device.h>
+#include <linux/auxiliary_bus.h>
 #include <sound/pcm_params.h>
 #include <linux/pm_runtime.h>
 #include <sound/soc.h>
@@ -1327,11 +1327,14 @@ static int intel_init(struct sdw_intel *sdw)
 }
 
 /*
- * probe and init
+ * probe and init (aux_dev_id argument is required by function prototype but not used)
  */
-static int intel_master_probe(struct platform_device *pdev)
+static int intel_link_probe(struct auxiliary_device *auxdev,
+			    const struct auxiliary_device_id *aux_dev_id)
+
 {
-	struct device *dev = &pdev->dev;
+	struct device *dev = &auxdev->dev;
+	struct sdw_intel_link_dev *ldev = auxiliary_dev_to_sdw_intel_link_dev(auxdev);
 	struct sdw_intel *sdw;
 	struct sdw_cdns *cdns;
 	struct sdw_bus *bus;
@@ -1344,14 +1347,14 @@ static int intel_master_probe(struct platform_device *pdev)
 	cdns = &sdw->cdns;
 	bus = &cdns->bus;
 
-	sdw->instance = pdev->id;
-	sdw->link_res = dev_get_platdata(dev);
+	sdw->instance = auxdev->id;
+	sdw->link_res = &ldev->link_res;
 	cdns->dev = dev;
 	cdns->registers = sdw->link_res->registers;
 	cdns->instance = sdw->instance;
 	cdns->msg_count = 0;
 
-	bus->link_id = pdev->id;
+	bus->link_id = auxdev->id;
 
 	sdw_cdns_probe(cdns);
 
@@ -1384,10 +1387,10 @@ static int intel_master_probe(struct platform_device *pdev)
 	return 0;
 }
 
-int intel_master_startup(struct platform_device *pdev)
+int intel_link_startup(struct auxiliary_device *auxdev)
 {
 	struct sdw_cdns_stream_config config;
-	struct device *dev = &pdev->dev;
+	struct device *dev = &auxdev->dev;
 	struct sdw_cdns *cdns = dev_get_drvdata(dev);
 	struct sdw_intel *sdw = cdns_to_intel(cdns);
 	struct sdw_bus *bus = &cdns->bus;
@@ -1524,9 +1527,9 @@ err_init:
 	return ret;
 }
 
-static int intel_master_remove(struct platform_device *pdev)
+static void intel_link_remove(struct auxiliary_device *auxdev)
 {
-	struct device *dev = &pdev->dev;
+	struct device *dev = &auxdev->dev;
 	struct sdw_cdns *cdns = dev_get_drvdata(dev);
 	struct sdw_intel *sdw = cdns_to_intel(cdns);
 	struct sdw_bus *bus = &cdns->bus;
@@ -1542,19 +1545,17 @@ static int intel_master_remove(struct platform_device *pdev)
 		snd_soc_unregister_component(dev);
 	}
 	sdw_bus_master_delete(bus);
-
-	return 0;
 }
 
-int intel_master_process_wakeen_event(struct platform_device *pdev)
+int intel_link_process_wakeen_event(struct auxiliary_device *auxdev)
 {
-	struct device *dev = &pdev->dev;
+	struct device *dev = &auxdev->dev;
 	struct sdw_intel *sdw;
 	struct sdw_bus *bus;
 	void __iomem *shim;
 	u16 wake_sts;
 
-	sdw = platform_get_drvdata(pdev);
+	sdw = dev_get_drvdata(dev);
 	bus = &sdw->cdns.bus;
 
 	if (bus->prop.hw_disabled) {
@@ -1976,17 +1977,22 @@ static const struct dev_pm_ops intel_pm = {
 	SET_RUNTIME_PM_OPS(intel_suspend_runtime, intel_resume_runtime, NULL)
 };
 
-static struct platform_driver sdw_intel_drv = {
-	.probe = intel_master_probe,
-	.remove = intel_master_remove,
+static const struct auxiliary_device_id intel_link_id_table[] = {
+	{ .name = "soundwire_intel.link" },
+	{},
+};
+MODULE_DEVICE_TABLE(auxiliary, intel_link_id_table);
+
+static struct auxiliary_driver sdw_intel_drv = {
+	.probe = intel_link_probe,
+	.remove = intel_link_remove,
 	.driver = {
-		.name = "intel-sdw",
+		/* auxiliary_driver_register() sets .name to be the modname */
 		.pm = &intel_pm,
-	}
+	},
+	.id_table = intel_link_id_table
 };
-
-module_platform_driver(sdw_intel_drv);
+module_auxiliary_driver(sdw_intel_drv);
 
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_ALIAS("platform:intel-sdw");
-MODULE_DESCRIPTION("Intel Soundwire Master Driver");
+MODULE_DESCRIPTION("Intel Soundwire Link Driver");
diff --git a/drivers/soundwire/intel.h b/drivers/soundwire/intel.h
index 06bac8ba14e9..0b47b148da3f 100644
--- a/drivers/soundwire/intel.h
+++ b/drivers/soundwire/intel.h
@@ -7,7 +7,6 @@
 /**
  * struct sdw_intel_link_res - Soundwire Intel link resource structure,
  * typically populated by the controller driver.
- * @pdev: platform_device
  * @mmio_base: mmio base of SoundWire registers
  * @registers: Link IO registers base
  * @shim: Audio shim pointer
@@ -23,7 +22,6 @@
  * @list: used to walk-through all masters exposed by the same controller
  */
 struct sdw_intel_link_res {
-	struct platform_device *pdev;
 	void __iomem *mmio_base; /* not strictly needed, useful for debug */
 	void __iomem *registers;
 	void __iomem *shim;
@@ -48,7 +46,15 @@ struct sdw_intel {
 #endif
 };
 
-int intel_master_startup(struct platform_device *pdev);
-int intel_master_process_wakeen_event(struct platform_device *pdev);
+int intel_link_startup(struct auxiliary_device *auxdev);
+int intel_link_process_wakeen_event(struct auxiliary_device *auxdev);
+
+struct sdw_intel_link_dev {
+	struct auxiliary_device auxdev;
+	struct sdw_intel_link_res link_res;
+};
+
+#define auxiliary_dev_to_sdw_intel_link_dev(auxiliary_dev) \
+	container_of(auxiliary_dev, struct sdw_intel_link_dev, auxdev)
 
 #endif /* __SDW_INTEL_LOCAL_H */
diff --git a/drivers/soundwire/intel_init.c b/drivers/soundwire/intel_init.c
index 30ce95ec2d70..9e283bef53d2 100644
--- a/drivers/soundwire/intel_init.c
+++ b/drivers/soundwire/intel_init.c
@@ -12,7 +12,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/module.h>
-#include <linux/platform_device.h>
+#include <linux/auxiliary_bus.h>
 #include <linux/pm_runtime.h>
 #include <linux/soundwire/sdw_intel.h>
 #include "cadence_master.h"
@@ -24,28 +24,108 @@
 #define SDW_LINK_BASE		0x30000
 #define SDW_LINK_SIZE		0x10000
 
+static void intel_link_dev_release(struct device *dev)
+{
+	struct auxiliary_device *auxdev = to_auxiliary_dev(dev);
+	struct sdw_intel_link_dev *ldev = auxiliary_dev_to_sdw_intel_link_dev(auxdev);
+
+	kfree(ldev);
+}
+
+/* alloc, init and add link devices */
+static struct sdw_intel_link_dev *intel_link_dev_register(struct sdw_intel_res *res,
+							  struct sdw_intel_ctx *ctx,
+							  struct fwnode_handle *fwnode,
+							  const char *name,
+							  int link_id)
+{
+	struct sdw_intel_link_dev *ldev;
+	struct sdw_intel_link_res *link;
+	struct auxiliary_device *auxdev;
+	int ret;
+
+	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
+	if (!ldev)
+		return ERR_PTR(-ENOMEM);
+
+	auxdev = &ldev->auxdev;
+	auxdev->name = name;
+	auxdev->dev.parent = res->parent;
+	auxdev->dev.fwnode = fwnode;
+	auxdev->dev.release = intel_link_dev_release;
+
+	/* we don't use an IDA since we already have a link ID */
+	auxdev->id = link_id;
+
+	/*
+	 * keep a handle on the allocated memory, to be used in all other functions.
+	 * Since the same pattern is used to skip links that are not enabled, there is
+	 * no need to check if ctx->ldev[i] is NULL later on.
+	 */
+	ctx->ldev[link_id] = ldev;
+
+	/* Add link information used in the driver probe */
+	link = &ldev->link_res;
+	link->mmio_base = res->mmio_base;
+	link->registers = res->mmio_base + SDW_LINK_BASE
+		+ (SDW_LINK_SIZE * link_id);
+	link->shim = res->mmio_base + SDW_SHIM_BASE;
+	link->alh = res->mmio_base + SDW_ALH_BASE;
+
+	link->ops = res->ops;
+	link->dev = res->dev;
+
+	link->clock_stop_quirks = res->clock_stop_quirks;
+	link->shim_lock = &ctx->shim_lock;
+	link->shim_mask = &ctx->shim_mask;
+	link->link_mask = ctx->link_mask;
+
+	/* now follow the two-step init/add sequence */
+	ret = auxiliary_device_init(auxdev);
+	if (ret < 0) {
+		dev_err(res->parent, "failed to initialize link dev %s link_id %d\n",
+			name, link_id);
+		kfree(ldev);
+		return ERR_PTR(ret);
+	}
+
+	ret = auxiliary_device_add(&ldev->auxdev);
+	if (ret < 0) {
+		dev_err(res->parent, "failed to add link dev %s link_id %d\n",
+			ldev->auxdev.name, link_id);
+		/* ldev will be freed with the put_device() and .release sequence */
+		auxiliary_device_uninit(&ldev->auxdev);
+		return ERR_PTR(ret);
+	}
+
+	return ldev;
+}
+
+static void intel_link_dev_unregister(struct sdw_intel_link_dev *ldev)
+{
+	auxiliary_device_delete(&ldev->auxdev);
+	auxiliary_device_uninit(&ldev->auxdev);
+}
+
 static int sdw_intel_cleanup(struct sdw_intel_ctx *ctx)
 {
-	struct sdw_intel_link_res *link = ctx->links;
+	struct sdw_intel_link_dev *ldev;
 	u32 link_mask;
 	int i;
 
-	if (!link)
-		return 0;
-
 	link_mask = ctx->link_mask;
 
-	for (i = 0; i < ctx->count; i++, link++) {
+	for (i = 0; i < ctx->count; i++) {
 		if (!(link_mask & BIT(i)))
 			continue;
 
-		if (link->pdev) {
-			pm_runtime_disable(&link->pdev->dev);
-			platform_device_unregister(link->pdev);
-		}
+		ldev = ctx->ldev[i];
 
-		if (!link->clock_stop_quirks)
-			pm_runtime_put_noidle(link->dev);
+		pm_runtime_disable(&ldev->auxdev.dev);
+		if (!ldev->link_res.clock_stop_quirks)
+			pm_runtime_put_noidle(ldev->link_res.dev);
+
+		intel_link_dev_unregister(ldev);
 	}
 
 	return 0;
@@ -91,9 +171,8 @@ EXPORT_SYMBOL_NS(sdw_intel_thread, SOUNDWIRE_INTEL_INIT);
 static struct sdw_intel_ctx
 *sdw_intel_probe_controller(struct sdw_intel_res *res)
 {
-	struct platform_device_info pdevinfo;
-	struct platform_device *pdev;
 	struct sdw_intel_link_res *link;
+	struct sdw_intel_link_dev *ldev;
 	struct sdw_intel_ctx *ctx;
 	struct acpi_device *adev;
 	struct sdw_slave *slave;
@@ -116,67 +195,60 @@ static struct sdw_intel_ctx
 	count = res->count;
 	dev_dbg(&adev->dev, "Creating %d SDW Link devices\n", count);
 
-	ctx = devm_kzalloc(&adev->dev, sizeof(*ctx), GFP_KERNEL);
+	/*
+	 * we need to alloc/free memory manually and can't use devm:
+	 * this routine may be called from a workqueue, and not from
+	 * the parent .probe.
+	 * If devm_ was used, the memory might never be freed on errors.
+	 */
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return NULL;
 
 	ctx->count = count;
-	ctx->links = devm_kcalloc(&adev->dev, ctx->count,
-				  sizeof(*ctx->links), GFP_KERNEL);
-	if (!ctx->links)
+
+	/*
+	 * allocate the array of pointers. The link-specific data is allocated
+	 * as part of the first loop below and released with the auxiliary_device_uninit().
+	 * If some links are disabled, the link pointer will remain NULL. Given that the
+	 * number of links is small, this is simpler than using a list to keep track of links.
+	 */
+	ctx->ldev = kcalloc(ctx->count, sizeof(*ctx->ldev), GFP_KERNEL);
+	if (!ctx->ldev) {
+		kfree(ctx);
 		return NULL;
+	}
 
-	ctx->count = count;
 	ctx->mmio_base = res->mmio_base;
 	ctx->link_mask = res->link_mask;
 	ctx->handle = res->handle;
 	mutex_init(&ctx->shim_lock);
 
-	link = ctx->links;
 	link_mask = ctx->link_mask;
 
 	INIT_LIST_HEAD(&ctx->link_list);
 
-	/* Create SDW Master devices */
-	for (i = 0; i < count; i++, link++) {
-		if (!(link_mask & BIT(i))) {
-			dev_dbg(&adev->dev,
-				"Link %d masked, will not be enabled\n", i);
+	for (i = 0; i < count; i++) {
+		if (!(link_mask & BIT(i)))
 			continue;
-		}
 
-		link->mmio_base = res->mmio_base;
-		link->registers = res->mmio_base + SDW_LINK_BASE
-			+ (SDW_LINK_SIZE * i);
-		link->shim = res->mmio_base + SDW_SHIM_BASE;
-		link->alh = res->mmio_base + SDW_ALH_BASE;
-
-		link->ops = res->ops;
-		link->dev = res->dev;
-
-		link->clock_stop_quirks = res->clock_stop_quirks;
-		link->shim_lock = &ctx->shim_lock;
-		link->shim_mask = &ctx->shim_mask;
-		link->link_mask = link_mask;
-
-		memset(&pdevinfo, 0, sizeof(pdevinfo));
-
-		pdevinfo.parent = res->parent;
-		pdevinfo.name = "intel-sdw";
-		pdevinfo.id = i;
-		pdevinfo.fwnode = acpi_fwnode_handle(adev);
-		pdevinfo.data = link;
-		pdevinfo.size_data = sizeof(*link);
-
-		pdev = platform_device_register_full(&pdevinfo);
-		if (IS_ERR(pdev)) {
-			dev_err(&adev->dev,
-				"platform device creation failed: %ld\n",
-				PTR_ERR(pdev));
+		/*
+		 * init and add a device for each link
+		 *
+		 * The name of the device will be soundwire_intel.link.[i],
+		 * with the "soundwire_intel" module prefix automatically added
+		 * by the auxiliary bus core.
+		 */
+		ldev = intel_link_dev_register(res,
+					       ctx,
+					       acpi_fwnode_handle(adev),
+					       "link",
+					       i);
+		if (IS_ERR(ldev))
 			goto err;
-		}
-		link->pdev = pdev;
-		link->cdns = platform_get_drvdata(pdev);
+
+		link = &ldev->link_res;
+		link->cdns = dev_get_drvdata(&ldev->auxdev.dev);
 
 		if (!link->cdns) {
 			dev_err(&adev->dev, "failed to get link->cdns\n");
@@ -194,8 +266,7 @@ static struct sdw_intel_ctx
 			num_slaves++;
 	}
 
-	ctx->ids = devm_kcalloc(&adev->dev, num_slaves,
-				sizeof(*ctx->ids), GFP_KERNEL);
+	ctx->ids = kcalloc(num_slaves, sizeof(*ctx->ids), GFP_KERNEL);
 	if (!ctx->ids)
 		goto err;
 
@@ -213,8 +284,14 @@ static struct sdw_intel_ctx
 	return ctx;
 
 err:
-	ctx->count = i;
-	sdw_intel_cleanup(ctx);
+	while (i--) {
+		if (!(link_mask & BIT(i)))
+			continue;
+		ldev = ctx->ldev[i];
+		intel_link_dev_unregister(ldev);
+	}
+	kfree(ctx->ldev);
+	kfree(ctx);
 	return NULL;
 }
 
@@ -222,7 +299,7 @@ static int
 sdw_intel_startup_controller(struct sdw_intel_ctx *ctx)
 {
 	struct acpi_device *adev;
-	struct sdw_intel_link_res *link;
+	struct sdw_intel_link_dev *ldev;
 	u32 caps;
 	u32 link_mask;
 	int i;
@@ -241,27 +318,28 @@ sdw_intel_startup_controller(struct sdw_intel_ctx *ctx)
 		return -EINVAL;
 	}
 
-	if (!ctx->links)
+	if (!ctx->ldev)
 		return -EINVAL;
 
-	link = ctx->links;
 	link_mask = ctx->link_mask;
 
 	/* Startup SDW Master devices */
-	for (i = 0; i < ctx->count; i++, link++) {
+	for (i = 0; i < ctx->count; i++) {
 		if (!(link_mask & BIT(i)))
 			continue;
 
-		intel_master_startup(link->pdev);
+		ldev = ctx->ldev[i];
+
+		intel_link_startup(&ldev->auxdev);
 
-		if (!link->clock_stop_quirks) {
+		if (!ldev->link_res.clock_stop_quirks) {
 			/*
 			 * we need to prevent the parent PCI device
 			 * from entering pm_runtime suspend, so that
 			 * power rails to the SoundWire IP are not
 			 * turned off.
 			 */
-			pm_runtime_get_noresume(link->dev);
+			pm_runtime_get_noresume(ldev->link_res.dev);
 		}
 	}
 
@@ -272,8 +350,8 @@ sdw_intel_startup_controller(struct sdw_intel_ctx *ctx)
  * sdw_intel_probe() - SoundWire Intel probe routine
  * @res: resource data
  *
- * This registers a platform device for each Master handled by the controller,
- * and SoundWire Master and Slave devices will be created by the platform
+ * This registers an auxiliary device for each Master handled by the controller,
+ * and SoundWire Master and Slave devices will be created by the auxiliary
  * device probe. All the information necessary is stored in the context, and
  * the res argument pointer can be freed after this step.
  * This function will be called after sdw_intel_acpi_scan() by SOF probe.
@@ -306,27 +384,31 @@ EXPORT_SYMBOL_NS(sdw_intel_startup, SOUNDWIRE_INTEL_INIT);
 void sdw_intel_exit(struct sdw_intel_ctx *ctx)
 {
 	sdw_intel_cleanup(ctx);
+	kfree(ctx->ids);
+	kfree(ctx->ldev);
+	kfree(ctx);
 }
 EXPORT_SYMBOL_NS(sdw_intel_exit, SOUNDWIRE_INTEL_INIT);
 
 void sdw_intel_process_wakeen_event(struct sdw_intel_ctx *ctx)
 {
-	struct sdw_intel_link_res *link;
+	struct sdw_intel_link_dev *ldev;
 	u32 link_mask;
 	int i;
 
-	if (!ctx->links)
+	if (!ctx->ldev)
 		return;
 
-	link = ctx->links;
 	link_mask = ctx->link_mask;
 
 	/* Startup SDW Master devices */
-	for (i = 0; i < ctx->count; i++, link++) {
+	for (i = 0; i < ctx->count; i++) {
 		if (!(link_mask & BIT(i)))
 			continue;
 
-		intel_master_process_wakeen_event(link->pdev);
+		ldev = ctx->ldev[i];
+
+		intel_link_process_wakeen_event(&ldev->auxdev);
 	}
 }
 EXPORT_SYMBOL_NS(sdw_intel_process_wakeen_event, SOUNDWIRE_INTEL_INIT);
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 3a5446ac014a..1ebea7764011 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -58,7 +58,7 @@ struct sdw_intel_acpi_info {
 	u32 link_mask;
 };
 
-struct sdw_intel_link_res;
+struct sdw_intel_link_dev;
 
 /* Intel clock-stop/pm_runtime quirk definitions */
 
@@ -109,7 +109,7 @@ struct sdw_intel_slave_id {
  * Controller
  * @num_slaves: total number of devices exposed across all enabled links
  * @handle: ACPI parent handle
- * @links: information for each link (controller-specific and kept
+ * @ldev: information for each link (controller-specific and kept
  * opaque here)
  * @ids: array of slave_id, representing Slaves exposed across all enabled
  * links
@@ -123,7 +123,7 @@ struct sdw_intel_ctx {
 	u32 link_mask;
 	int num_slaves;
 	acpi_handle handle;
-	struct sdw_intel_link_res *links;
+	struct sdw_intel_link_dev **ldev;
 	struct sdw_intel_slave_id *ids;
 	struct list_head link_list;
 	struct mutex shim_lock; /* lock for access to shared SHIM registers */
-- 
cgit v1.2.3


From 307773f525eb9217090bd4b11748d880f7f99355 Mon Sep 17 00:00:00 2001
From: Aswath Govindraju <a-govindraju@ti.com>
Date: Mon, 10 May 2021 10:40:03 +0530
Subject: phy: core: Reword the comment specifying the units of max_link_rate
 to be Mbps

In some subsystems (eg. CAN, SPI), the max link rate supported can be less
than 1 Mbps and if the unit for max_link_rate is Mbps then it can't be
used. Therefore, leave the decision of units to be used, to the producer
and consumer.

Signed-off-by: Aswath Govindraju <a-govindraju@ti.com>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
Link: https://lore.kernel.org/r/20210510051006.11393-2-a-govindraju@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/phy/phy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index 0ed434d02196..f3286f4cd306 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -125,7 +125,7 @@ struct phy_ops {
 /**
  * struct phy_attrs - represents phy attributes
  * @bus_width: Data path width implemented by PHY
- * @max_link_rate: Maximum link rate supported by PHY (in Mbps)
+ * @max_link_rate: Maximum link rate supported by PHY (units to be decided by producer and consumer)
  * @mode: PHY mode
  */
 struct phy_attrs {
-- 
cgit v1.2.3


From dbea8ae9febdea11cb74d094e6b730987079679e Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 4 May 2021 18:12:19 +0200
Subject: mmc: core: Parse the SD SCR register for support of CMD48/49 and
 CMD58/59

In SD spec v4.x the support for CMD48/49 and CMD58/59 were introduced as
optional features. To let the card announce whether it supports the
commands, the SCR register has been extended with corresponding support
bits. Let's parse and store this information for later use.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
Acked-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20210504161222.101536-9-ulf.hansson@linaro.org
---
 drivers/mmc/core/sd.c    | 4 +++-
 include/linux/mmc/card.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 2c48d6504101..de7b5f8df550 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -222,7 +222,9 @@ static int mmc_decode_scr(struct mmc_card *card)
 	else
 		card->erased_byte = 0x0;
 
-	if (scr->sda_spec3)
+	if (scr->sda_spec4)
+		scr->cmds = UNSTUFF_BITS(resp, 32, 4);
+	else if (scr->sda_spec3)
 		scr->cmds = UNSTUFF_BITS(resp, 32, 2);
 
 	/* SD Spec says: any SD Card shall set at least bits 0 and 2 */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index f9ad35dd6012..858fc4d11240 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -139,6 +139,8 @@ struct sd_scr {
 	unsigned char		cmds;
 #define SD_SCR_CMD20_SUPPORT   (1<<0)
 #define SD_SCR_CMD23_SUPPORT   (1<<1)
+#define SD_SCR_CMD48_SUPPORT   (1<<2)
+#define SD_SCR_CMD58_SUPPORT   (1<<3)
 };
 
 struct sd_ssr {
-- 
cgit v1.2.3


From c784f92769ae8eafb2eb489408757528ff7525df Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 4 May 2021 18:12:20 +0200
Subject: mmc: core: Read the SD function extension registers for power
 management

In the SD spec v4.0 the CMD48/49 and CMD58/59 were introduced as optional
commands. In the SD spec v4.1 the SD function extension registers were
introduced, which requires support for CMD48/49/58/59 to be read/written
from/to.

Moreover, a specific function extension register were added to let the card
announce support for optional features in regards to power management. The
features that were added are "Power Off Notification", "Power Down Mode"
and "Power Sustenance".

As a first step to support this, let's read and parse the register for
power management during the SD card initialization and store the
information about the supported features in the struct mmc_card. In this
way, we prepare for subsequent changes to implement the complete support
for the new features.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
Acked-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20210504161222.101536-10-ulf.hansson@linaro.org
---
 drivers/mmc/core/sd.c    | 178 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/card.h |  13 ++++
 include/linux/mmc/sd.h   |   3 +
 3 files changed, 194 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index de7b5f8df550..2e687f1f4542 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -996,6 +996,177 @@ static bool mmc_sd_card_using_v18(struct mmc_card *card)
 	       (SD_MODE_UHS_SDR50 | SD_MODE_UHS_SDR104 | SD_MODE_UHS_DDR50);
 }
 
+static int sd_read_ext_reg(struct mmc_card *card, u8 fno, u8 page,
+			   u16 offset, u16 len, u8 *reg_buf)
+{
+	u32 cmd_args;
+
+	/*
+	 * Command arguments of CMD48:
+	 * [31:31] MIO (0 = memory).
+	 * [30:27] FNO (function number).
+	 * [26:26] reserved (0).
+	 * [25:18] page number.
+	 * [17:9] offset address.
+	 * [8:0] length (0 = 1 byte, 1ff = 512 bytes).
+	 */
+	cmd_args = fno << 27 | page << 18 | offset << 9 | (len -1);
+
+	return mmc_send_adtc_data(card, card->host, SD_READ_EXTR_SINGLE,
+				  cmd_args, reg_buf, 512);
+}
+
+static int sd_parse_ext_reg_power(struct mmc_card *card, u8 fno, u8 page,
+				  u16 offset)
+{
+	int err;
+	u8 *reg_buf;
+
+	reg_buf = kzalloc(512, GFP_KERNEL);
+	if (!reg_buf)
+		return -ENOMEM;
+
+	/* Read the extension register for power management function. */
+	err = sd_read_ext_reg(card, fno, page, offset, 512, reg_buf);
+	if (err) {
+		pr_warn("%s: error %d reading PM func of ext reg\n",
+			mmc_hostname(card->host), err);
+		goto out;
+	}
+
+	/* PM revision consists of 4 bits. */
+	card->ext_power.rev = reg_buf[0] & 0xf;
+
+	/* Power Off Notification support at bit 4. */
+	if (reg_buf[1] & BIT(4))
+		card->ext_power.feature_support |= SD_EXT_POWER_OFF_NOTIFY;
+
+	/* Power Sustenance support at bit 5. */
+	if (reg_buf[1] & BIT(5))
+		card->ext_power.feature_support |= SD_EXT_POWER_SUSTENANCE;
+
+	/* Power Down Mode support at bit 6. */
+	if (reg_buf[1] & BIT(6))
+		card->ext_power.feature_support |= SD_EXT_POWER_DOWN_MODE;
+
+	card->ext_power.fno = fno;
+	card->ext_power.page = page;
+	card->ext_power.offset = offset;
+
+out:
+	kfree(reg_buf);
+	return err;
+}
+
+static int sd_parse_ext_reg(struct mmc_card *card, u8 *gen_info_buf,
+			    u16 *next_ext_addr)
+{
+	u8 num_regs, fno, page;
+	u16 sfc, offset, ext = *next_ext_addr;
+	u32 reg_addr;
+
+	/*
+	 * Parse only one register set per extension, as that is sufficient to
+	 * support the standard functions. This means another 48 bytes in the
+	 * buffer must be available.
+	 */
+	if (ext + 48 > 512)
+		return -EFAULT;
+
+	/* Standard Function Code */
+	memcpy(&sfc, &gen_info_buf[ext], 2);
+
+	/* Address to the next extension. */
+	memcpy(next_ext_addr, &gen_info_buf[ext + 40], 2);
+
+	/* Number of registers for this extension. */
+	num_regs = gen_info_buf[ext + 42];
+
+	/* We support only one register per extension. */
+	if (num_regs != 1)
+		return 0;
+
+	/* Extension register address. */
+	memcpy(&reg_addr, &gen_info_buf[ext + 44], 4);
+
+	/* 9 bits (0 to 8) contains the offset address. */
+	offset = reg_addr & 0x1ff;
+
+	/* 8 bits (9 to 16) contains the page number. */
+	page = reg_addr >> 9 & 0xff ;
+
+	/* 4 bits (18 to 21) contains the function number. */
+	fno = reg_addr >> 18 & 0xf;
+
+	/* Standard Function Code for power management. */
+	if (sfc == 0x1)
+		return sd_parse_ext_reg_power(card, fno, page, offset);
+
+	return 0;
+}
+
+static int sd_read_ext_regs(struct mmc_card *card)
+{
+	int err, i;
+	u8 num_ext, *gen_info_buf;
+	u16 rev, len, next_ext_addr;
+
+	if (mmc_host_is_spi(card->host))
+		return 0;
+
+	if (!(card->scr.cmds & SD_SCR_CMD48_SUPPORT))
+		return 0;
+
+	gen_info_buf = kzalloc(512, GFP_KERNEL);
+	if (!gen_info_buf)
+		return -ENOMEM;
+
+	/*
+	 * Read 512 bytes of general info, which is found at function number 0,
+	 * at page 0 and with no offset.
+	 */
+	err = sd_read_ext_reg(card, 0, 0, 0, 512, gen_info_buf);
+	if (err) {
+		pr_warn("%s: error %d reading general info of SD ext reg\n",
+			mmc_hostname(card->host), err);
+		goto out;
+	}
+
+	/* General info structure revision. */
+	memcpy(&rev, &gen_info_buf[0], 2);
+
+	/* Length of general info in bytes. */
+	memcpy(&len, &gen_info_buf[2], 2);
+
+	/* Number of extensions to be find. */
+	num_ext = gen_info_buf[4];
+
+	/* We support revision 0, but limit it to 512 bytes for simplicity. */
+	if (rev != 0 || len > 512) {
+		pr_warn("%s: non-supported SD ext reg layout\n",
+			mmc_hostname(card->host));
+		goto out;
+	}
+
+	/*
+	 * Parse the extension registers. The first extension should start
+	 * immediately after the general info header (16 bytes).
+	 */
+	next_ext_addr = 16;
+	for (i = 0; i < num_ext; i++) {
+		err = sd_parse_ext_reg(card, gen_info_buf, &next_ext_addr);
+		if (err) {
+			pr_warn("%s: error %d parsing SD ext reg\n",
+				mmc_hostname(card->host), err);
+			goto out;
+		}
+	}
+
+out:
+	kfree(gen_info_buf);
+	return err;
+}
+
 /*
  * Handle the detection and initialisation of a card.
  *
@@ -1144,6 +1315,13 @@ retry:
 		}
 	}
 
+	if (!oldcard) {
+		/* Read/parse the extension registers. */
+		err = sd_read_ext_regs(card);
+		if (err)
+			goto free_card;
+	}
+
 	if (host->cqe_ops && !host->cqe_enabled) {
 		err = host->cqe_ops->cqe_enable(host, card);
 		if (!err) {
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 858fc4d11240..03a862e93594 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -191,6 +191,18 @@ struct sd_switch_caps {
 #define SD_MAX_CURRENT_800	(1 << SD_SET_CURRENT_LIMIT_800)
 };
 
+struct sd_ext_reg {
+	u8			fno;
+	u8			page;
+	u16			offset;
+	u8			rev;
+	u8			feature_support;
+/* Power Management Function. */
+#define SD_EXT_POWER_OFF_NOTIFY	(1<<0)
+#define SD_EXT_POWER_SUSTENANCE	(1<<1)
+#define SD_EXT_POWER_DOWN_MODE	(1<<2)
+};
+
 struct sdio_cccr {
 	unsigned int		sdio_vsn;
 	unsigned int		sd_vsn;
@@ -292,6 +304,7 @@ struct mmc_card {
 	struct sd_scr		scr;		/* extra SD information */
 	struct sd_ssr		ssr;		/* yet more SD information */
 	struct sd_switch_caps	sw_caps;	/* switch (CMD6) caps */
+	struct sd_ext_reg	ext_power;	/* SD extension reg for PM */
 
 	unsigned int		sdio_funcs;	/* number of SDIO functions */
 	atomic_t		sdio_funcs_probed; /* number of probed SDIO funcs */
diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h
index 2236aa540faa..43bfc5c39ad4 100644
--- a/include/linux/mmc/sd.h
+++ b/include/linux/mmc/sd.h
@@ -29,6 +29,9 @@
 #define SD_APP_OP_COND           41   /* bcr  [31:0] OCR         R3  */
 #define SD_APP_SEND_SCR          51   /* adtc                    R1  */
 
+  /* class 11 */
+#define SD_READ_EXTR_SINGLE      48   /* adtc [31:0]             R1  */
+
 /* OCR bit definitions */
 #define SD_OCR_S18R		(1 << 24)    /* 1.8V switching request */
 #define SD_ROCR_S18A		SD_OCR_S18R  /* 1.8V switching accepted by card */
-- 
cgit v1.2.3


From 4e6306e0b83c6251699c2202e859b55ddf7b8c5f Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 4 May 2021 18:12:21 +0200
Subject: mmc: core: Read performance enhancements registers for SD cards

In SD spec v6.x the SD function extension registers for performance
enhancements were introduced. These registers let the SD card announce
supports for various performance related features, like "self-maintenance",
"cache" and "command queuing".

Let's extend the parsing of SD function extension registers and store the
information in the struct mmc_card. This prepares for subsequent changes to
implement the complete support for new the performance enhancement
features.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Avri Altman <avri.altman@wdc.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
Link: https://lore.kernel.org/r/20210504161222.101536-11-ulf.hansson@linaro.org
---
 drivers/mmc/core/sd.c    | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/card.h |  7 +++++++
 2 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 2e687f1f4542..0b882aaedf78 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -1058,6 +1058,55 @@ out:
 	return err;
 }
 
+static int sd_parse_ext_reg_perf(struct mmc_card *card, u8 fno, u8 page,
+				 u16 offset)
+{
+	int err;
+	u8 *reg_buf;
+
+	reg_buf = kzalloc(512, GFP_KERNEL);
+	if (!reg_buf)
+		return -ENOMEM;
+
+	err = sd_read_ext_reg(card, fno, page, offset, 512, reg_buf);
+	if (err) {
+		pr_warn("%s: error %d reading PERF func of ext reg\n",
+			mmc_hostname(card->host), err);
+		goto out;
+	}
+
+	/* PERF revision. */
+	card->ext_perf.rev = reg_buf[0];
+
+	/* FX_EVENT support at bit 0. */
+	if (reg_buf[1] & BIT(0))
+		card->ext_perf.feature_support |= SD_EXT_PERF_FX_EVENT;
+
+	/* Card initiated self-maintenance support at bit 0. */
+	if (reg_buf[2] & BIT(0))
+		card->ext_perf.feature_support |= SD_EXT_PERF_CARD_MAINT;
+
+	/* Host initiated self-maintenance support at bit 1. */
+	if (reg_buf[2] & BIT(1))
+		card->ext_perf.feature_support |= SD_EXT_PERF_HOST_MAINT;
+
+	/* Cache support at bit 0. */
+	if (reg_buf[4] & BIT(0))
+		card->ext_perf.feature_support |= SD_EXT_PERF_CACHE;
+
+	/* Command queue support indicated via queue depth bits (0 to 4). */
+	if (reg_buf[6] & 0x1f)
+		card->ext_perf.feature_support |= SD_EXT_PERF_CMD_QUEUE;
+
+	card->ext_perf.fno = fno;
+	card->ext_perf.page = page;
+	card->ext_perf.offset = offset;
+
+out:
+	kfree(reg_buf);
+	return err;
+}
+
 static int sd_parse_ext_reg(struct mmc_card *card, u8 *gen_info_buf,
 			    u16 *next_ext_addr)
 {
@@ -1102,6 +1151,10 @@ static int sd_parse_ext_reg(struct mmc_card *card, u8 *gen_info_buf,
 	if (sfc == 0x1)
 		return sd_parse_ext_reg_power(card, fno, page, offset);
 
+	/* Standard Function Code for performance enhancement. */
+	if (sfc == 0x2)
+		return sd_parse_ext_reg_perf(card, fno, page, offset);
+
 	return 0;
 }
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 03a862e93594..2867af0635f8 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -201,6 +201,12 @@ struct sd_ext_reg {
 #define SD_EXT_POWER_OFF_NOTIFY	(1<<0)
 #define SD_EXT_POWER_SUSTENANCE	(1<<1)
 #define SD_EXT_POWER_DOWN_MODE	(1<<2)
+/* Performance Enhancement Function. */
+#define SD_EXT_PERF_FX_EVENT	(1<<0)
+#define SD_EXT_PERF_CARD_MAINT	(1<<1)
+#define SD_EXT_PERF_HOST_MAINT	(1<<2)
+#define SD_EXT_PERF_CACHE	(1<<3)
+#define SD_EXT_PERF_CMD_QUEUE	(1<<4)
 };
 
 struct sdio_cccr {
@@ -305,6 +311,7 @@ struct mmc_card {
 	struct sd_ssr		ssr;		/* yet more SD information */
 	struct sd_switch_caps	sw_caps;	/* switch (CMD6) caps */
 	struct sd_ext_reg	ext_power;	/* SD extension reg for PM */
+	struct sd_ext_reg	ext_perf;	/* SD extension reg for PERF */
 
 	unsigned int		sdio_funcs;	/* number of SDIO functions */
 	atomic_t		sdio_funcs_probed; /* number of probed SDIO funcs */
-- 
cgit v1.2.3


From 2c5d42769038045b92160a849aad43c4b3170e2a Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 4 May 2021 18:12:22 +0200
Subject: mmc: core: Add support for Power Off Notification for SD cards

Rather than only deselecting the SD card via a CMD7, before we cut power to
it at system suspend, at runtime suspend or at shutdown, let's add support
for a graceful power off sequence via enabling the SD Power Off
Notification feature.

Note that, the Power Off Notification feature was added in the SD spec
v4.x, which is several years ago. However, it's still a bit unclear how
often the SD card vendors decides to implement support for it. To validate
these changes a Sandisk Extreme PRO A2 64GB has been used, which seems to
work nicely.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Avri Altman <avri.altman@wdc.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Shawn Lin <shawn.lin@rock-chips.com>
Link: https://lore.kernel.org/r/20210504161222.101536-12-ulf.hansson@linaro.org
---
 drivers/mmc/core/sd.c  | 136 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/mmc/sd.h |   1 +
 2 files changed, 134 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 0b882aaedf78..bd40c682d264 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -66,6 +66,13 @@ static const unsigned int sd_au_size[] = {
 		__res & __mask;						\
 	})
 
+#define SD_POWEROFF_NOTIFY_TIMEOUT_MS 2000
+
+struct sd_busy_data {
+	struct mmc_card *card;
+	u8 *reg_buf;
+};
+
 /*
  * Given the decoded CSD structure, decode the raw CID to our CID structure.
  */
@@ -996,6 +1003,66 @@ static bool mmc_sd_card_using_v18(struct mmc_card *card)
 	       (SD_MODE_UHS_SDR50 | SD_MODE_UHS_SDR104 | SD_MODE_UHS_DDR50);
 }
 
+static int sd_write_ext_reg(struct mmc_card *card, u8 fno, u8 page, u16 offset,
+			    u8 reg_data)
+{
+	struct mmc_host *host = card->host;
+	struct mmc_request mrq = {};
+	struct mmc_command cmd = {};
+	struct mmc_data data = {};
+	struct scatterlist sg;
+	u8 *reg_buf;
+
+	reg_buf = kzalloc(512, GFP_KERNEL);
+	if (!reg_buf)
+		return -ENOMEM;
+
+	mrq.cmd = &cmd;
+	mrq.data = &data;
+
+	/*
+	 * Arguments of CMD49:
+	 * [31:31] MIO (0 = memory).
+	 * [30:27] FNO (function number).
+	 * [26:26] MW - mask write mode (0 = disable).
+	 * [25:18] page number.
+	 * [17:9] offset address.
+	 * [8:0] length (0 = 1 byte).
+	 */
+	cmd.arg = fno << 27 | page << 18 | offset << 9;
+
+	/* The first byte in the buffer is the data to be written. */
+	reg_buf[0] = reg_data;
+
+	data.flags = MMC_DATA_WRITE;
+	data.blksz = 512;
+	data.blocks = 1;
+	data.sg = &sg;
+	data.sg_len = 1;
+	sg_init_one(&sg, reg_buf, 512);
+
+	cmd.opcode = SD_WRITE_EXTR_SINGLE;
+	cmd.flags = MMC_RSP_R1 | MMC_CMD_ADTC;
+
+	mmc_set_data_timeout(&data, card);
+	mmc_wait_for_req(host, &mrq);
+
+	kfree(reg_buf);
+
+	/*
+	 * Note that, the SD card is allowed to signal busy on DAT0 up to 1s
+	 * after the CMD49. Although, let's leave this to be managed by the
+	 * caller.
+	 */
+
+	if (cmd.error)
+		return cmd.error;
+	if (data.error)
+		return data.error;
+
+	return 0;
+}
+
 static int sd_read_ext_reg(struct mmc_card *card, u8 fno, u8 page,
 			   u16 offset, u16 len, u8 *reg_buf)
 {
@@ -1446,21 +1513,84 @@ static void mmc_sd_detect(struct mmc_host *host)
 	}
 }
 
+static int sd_can_poweroff_notify(struct mmc_card *card)
+{
+	return card->ext_power.feature_support & SD_EXT_POWER_OFF_NOTIFY;
+}
+
+static int sd_busy_poweroff_notify_cb(void *cb_data, bool *busy)
+{
+	struct sd_busy_data *data = cb_data;
+	struct mmc_card *card = data->card;
+	int err;
+
+	/*
+	 * Read the status register for the power management function. It's at
+	 * one byte offset and is one byte long. The Power Off Notification
+	 * Ready is bit 0.
+	 */
+	err = sd_read_ext_reg(card, card->ext_power.fno, card->ext_power.page,
+			      card->ext_power.offset + 1, 1, data->reg_buf);
+	if (err) {
+		pr_warn("%s: error %d reading status reg of PM func\n",
+			mmc_hostname(card->host), err);
+		return err;
+	}
+
+	*busy = !(data->reg_buf[0] & BIT(0));
+	return 0;
+}
+
+static int sd_poweroff_notify(struct mmc_card *card)
+{
+	struct sd_busy_data cb_data;
+	u8 *reg_buf;
+	int err;
+
+	reg_buf = kzalloc(512, GFP_KERNEL);
+	if (!reg_buf)
+		return -ENOMEM;
+
+	/*
+	 * Set the Power Off Notification bit in the power management settings
+	 * register at 2 bytes offset.
+	 */
+	err = sd_write_ext_reg(card, card->ext_power.fno, card->ext_power.page,
+			       card->ext_power.offset + 2, BIT(0));
+	if (err) {
+		pr_warn("%s: error %d writing Power Off Notify bit\n",
+			mmc_hostname(card->host), err);
+		goto out;
+	}
+
+	cb_data.card = card;
+	cb_data.reg_buf = reg_buf;
+	err = __mmc_poll_for_busy(card, SD_POWEROFF_NOTIFY_TIMEOUT_MS,
+				  &sd_busy_poweroff_notify_cb, &cb_data);
+
+out:
+	kfree(reg_buf);
+	return err;
+}
+
 static int _mmc_sd_suspend(struct mmc_host *host)
 {
+	struct mmc_card *card = host->card;
 	int err = 0;
 
 	mmc_claim_host(host);
 
-	if (mmc_card_suspended(host->card))
+	if (mmc_card_suspended(card))
 		goto out;
 
-	if (!mmc_host_is_spi(host))
+	if (sd_can_poweroff_notify(card))
+		err = sd_poweroff_notify(card);
+	else if (!mmc_host_is_spi(host))
 		err = mmc_deselect_cards(host);
 
 	if (!err) {
 		mmc_power_off(host);
-		mmc_card_set_suspended(host->card);
+		mmc_card_set_suspended(card);
 	}
 
 out:
diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h
index 43bfc5c39ad4..6727576a8755 100644
--- a/include/linux/mmc/sd.h
+++ b/include/linux/mmc/sd.h
@@ -31,6 +31,7 @@
 
   /* class 11 */
 #define SD_READ_EXTR_SINGLE      48   /* adtc [31:0]             R1  */
+#define SD_WRITE_EXTR_SINGLE     49   /* adtc [31:0]             R1  */
 
 /* OCR bit definitions */
 #define SD_OCR_S18R		(1 << 24)    /* 1.8V switching request */
-- 
cgit v1.2.3


From 130206a615a9831a65e186484a5a332f9f6d29c8 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 11 May 2021 12:13:59 +0200
Subject: mmc: core: Add support for cache ctrl for SD cards

In SD spec v6.x the SD function extension registers for performance
enhancements were introduced. As a part of this an optional internal cache
on the SD card, can be used to improve performance.

The let the SD card use the cache, the host needs to enable it and manage
flushing of the cache, so let's add support for this.

Note that for an SD card supporting the cache it's mandatory for it, to
also support the poweroff notification feature. According to the SD spec,
if the cache has been enabled and a poweroff notification is sent to the
card, that implicitly also means that the card should flush its internal
cache. Therefore, dealing with cache flushing for REQ_OP_FLUSH block
requests is sufficient.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20210511101359.83521-1-ulf.hansson@linaro.org
---
 drivers/mmc/core/mmc_ops.c |   1 +
 drivers/mmc/core/mmc_ops.h |   1 +
 drivers/mmc/core/sd.c      | 100 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/card.h   |   1 +
 4 files changed, 103 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c
index af423acc4c88..3c58f6d0f482 100644
--- a/drivers/mmc/core/mmc_ops.c
+++ b/drivers/mmc/core/mmc_ops.c
@@ -456,6 +456,7 @@ static int mmc_busy_cb(void *cb_data, bool *busy)
 		err = R1_STATUS(status) ? -EIO : 0;
 		break;
 	case MMC_BUSY_HPI:
+	case MMC_BUSY_EXTR_SINGLE:
 		break;
 	default:
 		err = -EINVAL;
diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h
index c3c1d9c2577e..41ab4f573a31 100644
--- a/drivers/mmc/core/mmc_ops.h
+++ b/drivers/mmc/core/mmc_ops.h
@@ -14,6 +14,7 @@ enum mmc_busy_cmd {
 	MMC_BUSY_CMD6,
 	MMC_BUSY_ERASE,
 	MMC_BUSY_HPI,
+	MMC_BUSY_EXTR_SINGLE,
 };
 
 struct mmc_host;
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index bd40c682d264..781c1e24308c 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -67,6 +67,7 @@ static const unsigned int sd_au_size[] = {
 	})
 
 #define SD_POWEROFF_NOTIFY_TIMEOUT_MS 2000
+#define SD_WRITE_EXTR_SINGLE_TIMEOUT_MS 1000
 
 struct sd_busy_data {
 	struct mmc_card *card;
@@ -1287,6 +1288,96 @@ out:
 	return err;
 }
 
+static bool sd_cache_enabled(struct mmc_host *host)
+{
+	return host->card->ext_perf.feature_enabled & SD_EXT_PERF_CACHE;
+}
+
+static int sd_flush_cache(struct mmc_host *host)
+{
+	struct mmc_card *card = host->card;
+	u8 *reg_buf, fno, page;
+	u16 offset;
+	int err;
+
+	if (!sd_cache_enabled(host))
+		return 0;
+
+	reg_buf = kzalloc(512, GFP_KERNEL);
+	if (!reg_buf)
+		return -ENOMEM;
+
+	/*
+	 * Set Flush Cache at bit 0 in the performance enhancement register at
+	 * 261 bytes offset.
+	 */
+	fno = card->ext_perf.fno;
+	page = card->ext_perf.page;
+	offset = card->ext_perf.offset + 261;
+
+	err = sd_write_ext_reg(card, fno, page, offset, BIT(0));
+	if (err) {
+		pr_warn("%s: error %d writing Cache Flush bit\n",
+			mmc_hostname(host), err);
+		goto out;
+	}
+
+	err = mmc_poll_for_busy(card, SD_WRITE_EXTR_SINGLE_TIMEOUT_MS, false,
+				MMC_BUSY_EXTR_SINGLE);
+	if (err)
+		goto out;
+
+	/*
+	 * Read the Flush Cache bit. The card shall reset it, to confirm that
+	 * it's has completed the flushing of the cache.
+	 */
+	err = sd_read_ext_reg(card, fno, page, offset, 1, reg_buf);
+	if (err) {
+		pr_warn("%s: error %d reading Cache Flush bit\n",
+			mmc_hostname(host), err);
+		goto out;
+	}
+
+	if (reg_buf[0] & BIT(0))
+		err = -ETIMEDOUT;
+out:
+	kfree(reg_buf);
+	return err;
+}
+
+static int sd_enable_cache(struct mmc_card *card)
+{
+	u8 *reg_buf;
+	int err;
+
+	card->ext_perf.feature_enabled &= ~SD_EXT_PERF_CACHE;
+
+	reg_buf = kzalloc(512, GFP_KERNEL);
+	if (!reg_buf)
+		return -ENOMEM;
+
+	/*
+	 * Set Cache Enable at bit 0 in the performance enhancement register at
+	 * 260 bytes offset.
+	 */
+	err = sd_write_ext_reg(card, card->ext_perf.fno, card->ext_perf.page,
+			       card->ext_perf.offset + 260, BIT(0));
+	if (err) {
+		pr_warn("%s: error %d writing Cache Enable bit\n",
+			mmc_hostname(card->host), err);
+		goto out;
+	}
+
+	err = mmc_poll_for_busy(card, SD_WRITE_EXTR_SINGLE_TIMEOUT_MS, false,
+				MMC_BUSY_EXTR_SINGLE);
+	if (!err)
+		card->ext_perf.feature_enabled |= SD_EXT_PERF_CACHE;
+
+out:
+	kfree(reg_buf);
+	return err;
+}
+
 /*
  * Handle the detection and initialisation of a card.
  *
@@ -1442,6 +1533,13 @@ retry:
 			goto free_card;
 	}
 
+	/* Enable internal SD cache if supported. */
+	if (card->ext_perf.feature_support & SD_EXT_PERF_CACHE) {
+		err = sd_enable_cache(card);
+		if (err)
+			goto free_card;
+	}
+
 	if (host->cqe_ops && !host->cqe_enabled) {
 		err = host->cqe_ops->cqe_enable(host, card);
 		if (!err) {
@@ -1694,6 +1792,8 @@ static const struct mmc_bus_ops mmc_sd_ops = {
 	.alive = mmc_sd_alive,
 	.shutdown = mmc_sd_suspend,
 	.hw_reset = mmc_sd_hw_reset,
+	.cache_enabled = sd_cache_enabled,
+	.flush_cache = sd_flush_cache,
 };
 
 /*
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 2867af0635f8..74e6c0624d27 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -196,6 +196,7 @@ struct sd_ext_reg {
 	u8			page;
 	u16			offset;
 	u8			rev;
+	u8			feature_enabled;
 	u8			feature_support;
 /* Power Management Function. */
 #define SD_EXT_POWER_OFF_NOTIFY	(1<<0)
-- 
cgit v1.2.3


From 21adc2e45f4ef32786807375107543797ff68615 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Tue, 8 Jun 2021 20:06:20 +0200
Subject: mmc: Improve function name when aborting a tuning cmd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'mmc_abort_tuning()' made me think tuning gets completely aborted.
However, it sends only a STOP cmd to cancel the current tuning cmd.
Tuning process may still continue after that. So, rename the function to
'mmc_send_abort_tuning()' to better reflect all this.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://lore.kernel.org/r/20210608180620.40059-1-wsa+renesas@sang-engineering.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/mmc_ops.c           | 4 ++--
 drivers/mmc/host/renesas_sdhi_core.c | 2 +-
 drivers/mmc/host/sdhci.c             | 2 +-
 include/linux/mmc/host.h             | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c
index 3c58f6d0f482..973756ed4016 100644
--- a/drivers/mmc/core/mmc_ops.c
+++ b/drivers/mmc/core/mmc_ops.c
@@ -700,7 +700,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(mmc_send_tuning);
 
-int mmc_abort_tuning(struct mmc_host *host, u32 opcode)
+int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode)
 {
 	struct mmc_command cmd = {};
 
@@ -723,7 +723,7 @@ int mmc_abort_tuning(struct mmc_host *host, u32 opcode)
 
 	return mmc_wait_for_cmd(host, &cmd, 0);
 }
-EXPORT_SYMBOL_GPL(mmc_abort_tuning);
+EXPORT_SYMBOL_GPL(mmc_send_abort_tuning);
 
 static int
 mmc_send_bus_test(struct mmc_card *card, struct mmc_host *host, u8 opcode,
diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c
index baab4c2e1b53..e49ca0f7fe9a 100644
--- a/drivers/mmc/host/renesas_sdhi_core.c
+++ b/drivers/mmc/host/renesas_sdhi_core.c
@@ -704,7 +704,7 @@ static int renesas_sdhi_execute_tuning(struct mmc_host *mmc, u32 opcode)
 			set_bit(i, priv->smpcmp);
 
 		if (cmd_error)
-			mmc_abort_tuning(mmc, opcode);
+			mmc_send_abort_tuning(mmc, opcode);
 	}
 
 	ret = renesas_sdhi_select_tuning(host);
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index bf238ade1602..6aaf5c3ce34c 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -2680,7 +2680,7 @@ void sdhci_abort_tuning(struct sdhci_host *host, u32 opcode)
 
 	sdhci_end_tuning(host);
 
-	mmc_abort_tuning(host->mmc, opcode);
+	mmc_send_abort_tuning(host->mmc, opcode);
 }
 EXPORT_SYMBOL_GPL(sdhci_abort_tuning);
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index c7e7b43600e9..0abd47e9ef9b 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -632,6 +632,6 @@ static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data)
 }
 
 int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
-int mmc_abort_tuning(struct mmc_host *host, u32 opcode);
+int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode);
 
 #endif /* LINUX_MMC_HOST_H */
-- 
cgit v1.2.3


From 771fac5e26c17845de8c679e6a947a4371e86ffc Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 11 Jun 2021 08:48:02 +0530
Subject: Revert "cpufreq: CPPC: Add support for frequency invariance"

This reverts commit 4c38f2df71c8e33c0b64865992d693f5022eeaad.

There are few races in the frequency invariance support for CPPC driver,
namely the driver doesn't stop the kthread_work and irq_work on policy
exit during suspend/resume or CPU hotplug.

A proper fix won't be possible for the 5.13-rc, as it requires a lot of
changes. Lets revert the patch instead for now.

Fixes: 4c38f2df71c8 ("cpufreq: CPPC: Add support for frequency invariance")
Reported-by: Qian Cai <quic_qiancai@quicinc.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm    |  10 --
 drivers/cpufreq/cppc_cpufreq.c | 245 ++---------------------------------------
 include/linux/arch_topology.h  |   1 -
 kernel/sched/core.c            |   1 -
 4 files changed, 12 insertions(+), 245 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index a5c5f70acfc9..e65e0a43be64 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -19,16 +19,6 @@ config ACPI_CPPC_CPUFREQ
 
 	  If in doubt, say N.
 
-config ACPI_CPPC_CPUFREQ_FIE
-	bool "Frequency Invariance support for CPPC cpufreq driver"
-	depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
-	default y
-	help
-	  This extends frequency invariance support in the CPPC cpufreq driver,
-	  by using CPPC delivered and reference performance counters.
-
-	  If in doubt, say N.
-
 config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
 	tristate "Allwinner nvmem based SUN50I CPUFreq driver"
 	depends on ARCH_SUNXI
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 3848b4c222e1..2f769b1630c5 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -10,18 +10,14 @@
 
 #define pr_fmt(fmt)	"CPPC Cpufreq:"	fmt
 
-#include <linux/arch_topology.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/dmi.h>
-#include <linux/irq_work.h>
-#include <linux/kthread.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
-#include <uapi/linux/sched/types.h>
 
 #include <asm/unaligned.h>
 
@@ -61,204 +57,6 @@ static struct cppc_workaround_oem_info wa_info[] = {
 	}
 };
 
-#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
-
-/* Frequency invariance support */
-struct cppc_freq_invariance {
-	int cpu;
-	struct irq_work irq_work;
-	struct kthread_work work;
-	struct cppc_perf_fb_ctrs prev_perf_fb_ctrs;
-	struct cppc_cpudata *cpu_data;
-};
-
-static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
-static struct kthread_worker *kworker_fie;
-static bool fie_disabled;
-
-static struct cpufreq_driver cppc_cpufreq_driver;
-static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu);
-static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
-				 struct cppc_perf_fb_ctrs fb_ctrs_t0,
-				 struct cppc_perf_fb_ctrs fb_ctrs_t1);
-
-/**
- * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance
- * @work: The work item.
- *
- * The CPPC driver register itself with the topology core to provide its own
- * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which
- * gets called by the scheduler on every tick.
- *
- * Note that the arch specific counters have higher priority than CPPC counters,
- * if available, though the CPPC driver doesn't need to have any special
- * handling for that.
- *
- * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we
- * reach here from hard-irq context), which then schedules a normal work item
- * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable
- * based on the counter updates since the last tick.
- */
-static void cppc_scale_freq_workfn(struct kthread_work *work)
-{
-	struct cppc_freq_invariance *cppc_fi;
-	struct cppc_perf_fb_ctrs fb_ctrs = {0};
-	struct cppc_cpudata *cpu_data;
-	unsigned long local_freq_scale;
-	u64 perf;
-
-	cppc_fi = container_of(work, struct cppc_freq_invariance, work);
-	cpu_data = cppc_fi->cpu_data;
-
-	if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) {
-		pr_warn("%s: failed to read perf counters\n", __func__);
-		return;
-	}
-
-	cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
-	perf = cppc_perf_from_fbctrs(cpu_data, cppc_fi->prev_perf_fb_ctrs,
-				     fb_ctrs);
-
-	perf <<= SCHED_CAPACITY_SHIFT;
-	local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf);
-	if (WARN_ON(local_freq_scale > 1024))
-		local_freq_scale = 1024;
-
-	per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale;
-}
-
-static void cppc_irq_work(struct irq_work *irq_work)
-{
-	struct cppc_freq_invariance *cppc_fi;
-
-	cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work);
-	kthread_queue_work(kworker_fie, &cppc_fi->work);
-}
-
-static void cppc_scale_freq_tick(void)
-{
-	struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id());
-
-	/*
-	 * cppc_get_perf_ctrs() can potentially sleep, call that from the right
-	 * context.
-	 */
-	irq_work_queue(&cppc_fi->irq_work);
-}
-
-static struct scale_freq_data cppc_sftd = {
-	.source = SCALE_FREQ_SOURCE_CPPC,
-	.set_freq_scale = cppc_scale_freq_tick,
-};
-
-static void cppc_freq_invariance_policy_init(struct cpufreq_policy *policy,
-					     struct cppc_cpudata *cpu_data)
-{
-	struct cppc_perf_fb_ctrs fb_ctrs = {0};
-	struct cppc_freq_invariance *cppc_fi;
-	int i, ret;
-
-	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
-		return;
-
-	if (fie_disabled)
-		return;
-
-	for_each_cpu(i, policy->cpus) {
-		cppc_fi = &per_cpu(cppc_freq_inv, i);
-		cppc_fi->cpu = i;
-		cppc_fi->cpu_data = cpu_data;
-		kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn);
-		init_irq_work(&cppc_fi->irq_work, cppc_irq_work);
-
-		ret = cppc_get_perf_ctrs(i, &fb_ctrs);
-		if (ret) {
-			pr_warn("%s: failed to read perf counters: %d\n",
-				__func__, ret);
-			fie_disabled = true;
-		} else {
-			cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
-		}
-	}
-}
-
-static void __init cppc_freq_invariance_init(void)
-{
-	struct sched_attr attr = {
-		.size		= sizeof(struct sched_attr),
-		.sched_policy	= SCHED_DEADLINE,
-		.sched_nice	= 0,
-		.sched_priority	= 0,
-		/*
-		 * Fake (unused) bandwidth; workaround to "fix"
-		 * priority inheritance.
-		 */
-		.sched_runtime	= 1000000,
-		.sched_deadline = 10000000,
-		.sched_period	= 10000000,
-	};
-	int ret;
-
-	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
-		return;
-
-	if (fie_disabled)
-		return;
-
-	kworker_fie = kthread_create_worker(0, "cppc_fie");
-	if (IS_ERR(kworker_fie))
-		return;
-
-	ret = sched_setattr_nocheck(kworker_fie->task, &attr);
-	if (ret) {
-		pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__,
-			ret);
-		kthread_destroy_worker(kworker_fie);
-		return;
-	}
-
-	/* Register for freq-invariance */
-	topology_set_scale_freq_source(&cppc_sftd, cpu_present_mask);
-}
-
-static void cppc_freq_invariance_exit(void)
-{
-	struct cppc_freq_invariance *cppc_fi;
-	int i;
-
-	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
-		return;
-
-	if (fie_disabled)
-		return;
-
-	topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, cpu_present_mask);
-
-	for_each_possible_cpu(i) {
-		cppc_fi = &per_cpu(cppc_freq_inv, i);
-		irq_work_sync(&cppc_fi->irq_work);
-	}
-
-	kthread_destroy_worker(kworker_fie);
-	kworker_fie = NULL;
-}
-
-#else
-static inline void
-cppc_freq_invariance_policy_init(struct cpufreq_policy *policy,
-				 struct cppc_cpudata *cpu_data)
-{
-}
-
-static inline void cppc_freq_invariance_init(void)
-{
-}
-
-static inline void cppc_freq_invariance_exit(void)
-{
-}
-#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */
-
 /* Callback function used to retrieve the max frequency from DMI */
 static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private)
 {
@@ -547,12 +345,9 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	cpu_data->perf_ctrls.desired_perf =  caps->highest_perf;
 
 	ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
-	if (ret) {
+	if (ret)
 		pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n",
 			 caps->highest_perf, cpu, ret);
-	} else {
-		cppc_freq_invariance_policy_init(policy, cpu_data);
-	}
 
 	return ret;
 }
@@ -565,12 +360,12 @@ static inline u64 get_delta(u64 t1, u64 t0)
 	return (u32)t1 - (u32)t0;
 }
 
-static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
-				 struct cppc_perf_fb_ctrs fb_ctrs_t0,
-				 struct cppc_perf_fb_ctrs fb_ctrs_t1)
+static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
+				     struct cppc_perf_fb_ctrs fb_ctrs_t0,
+				     struct cppc_perf_fb_ctrs fb_ctrs_t1)
 {
 	u64 delta_reference, delta_delivered;
-	u64 reference_perf;
+	u64 reference_perf, delivered_perf;
 
 	reference_perf = fb_ctrs_t0.reference_perf;
 
@@ -579,21 +374,12 @@ static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
 	delta_delivered = get_delta(fb_ctrs_t1.delivered,
 				    fb_ctrs_t0.delivered);
 
-	/* Check to avoid divide-by zero and invalid delivered_perf */
-	if (!delta_reference || !delta_delivered)
-		return cpu_data->perf_ctrls.desired_perf;
-
-	return (reference_perf * delta_delivered) / delta_reference;
-}
-
-static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
-				     struct cppc_perf_fb_ctrs fb_ctrs_t0,
-				     struct cppc_perf_fb_ctrs fb_ctrs_t1)
-{
-	u64 delivered_perf;
-
-	delivered_perf = cppc_perf_from_fbctrs(cpu_data, fb_ctrs_t0,
-					       fb_ctrs_t1);
+	/* Check to avoid divide-by zero */
+	if (delta_reference || delta_delivered)
+		delivered_perf = (reference_perf * delta_delivered) /
+					delta_reference;
+	else
+		delivered_perf = cpu_data->perf_ctrls.desired_perf;
 
 	return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf);
 }
@@ -718,8 +504,6 @@ static void cppc_check_hisi_workaround(void)
 
 static int __init cppc_cpufreq_init(void)
 {
-	int ret;
-
 	if ((acpi_disabled) || !acpi_cpc_valid())
 		return -ENODEV;
 
@@ -727,11 +511,7 @@ static int __init cppc_cpufreq_init(void)
 
 	cppc_check_hisi_workaround();
 
-	ret = cpufreq_register_driver(&cppc_cpufreq_driver);
-	if (!ret)
-		cppc_freq_invariance_init();
-
-	return ret;
+	return cpufreq_register_driver(&cppc_cpufreq_driver);
 }
 
 static inline void free_cpu_data(void)
@@ -748,7 +528,6 @@ static inline void free_cpu_data(void)
 
 static void __exit cppc_cpufreq_exit(void)
 {
-	cppc_freq_invariance_exit();
 	cpufreq_unregister_driver(&cppc_cpufreq_driver);
 
 	free_cpu_data();
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index f180240dc95f..11e555cfaecb 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -37,7 +37,6 @@ bool topology_scale_freq_invariant(void);
 enum scale_freq_source {
 	SCALE_FREQ_SOURCE_CPUFREQ = 0,
 	SCALE_FREQ_SOURCE_ARCH,
-	SCALE_FREQ_SOURCE_CPPC,
 };
 
 struct scale_freq_data {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5226cc26a095..4ca80df205ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6389,7 +6389,6 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, false, true);
 }
-EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
-- 
cgit v1.2.3


From 590e8a082a5772071d7bcfea2b8e5a2453cecad2 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 2 Jun 2021 16:37:01 +0100
Subject: CFI: Move function_nocfi() into compiler.h

Currently the common definition of function_nocfi() is provided by
<linux/mm.h>, and architectures are expected to provide a definition in
<asm/memory.h>. Due to header dependencies, this can make it hard to use
function_nocfi() in low-level headers.

As function_nocfi() has no dependency on any mm code, nor on any memory
definitions, it doesn't need to live in <linux/mm.h> or <asm/memory.h>.
Generally, it would make more sense for it to live in
<linux/compiler.h>, where an architecture can override it in
<asm/compiler.h>.

Move the definitions accordingly.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210602153701.35957-1-mark.rutland@arm.com
---
 arch/arm64/include/asm/compiler.h | 16 ++++++++++++++++
 arch/arm64/include/asm/memory.h   | 16 ----------------
 include/linux/compiler.h          | 10 ++++++++++
 include/linux/mm.h                | 10 ----------
 4 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/compiler.h b/arch/arm64/include/asm/compiler.h
index 6fb2e6bcc392..dc3ea4080e2e 100644
--- a/arch/arm64/include/asm/compiler.h
+++ b/arch/arm64/include/asm/compiler.h
@@ -23,4 +23,20 @@
 #define __builtin_return_address(val)					\
 	(void *)(ptrauth_clear_pac((unsigned long)__builtin_return_address(val)))
 
+#ifdef CONFIG_CFI_CLANG
+/*
+ * With CONFIG_CFI_CLANG, the compiler replaces function address
+ * references with the address of the function's CFI jump table
+ * entry. The function_nocfi macro always returns the address of the
+ * actual function instead.
+ */
+#define function_nocfi(x) ({						\
+	void *addr;							\
+	asm("adrp %0, " __stringify(x) "\n\t"				\
+	    "add  %0, %0, :lo12:" __stringify(x)			\
+	    : "=r" (addr));						\
+	addr;								\
+})
+#endif
+
 #endif /* __ASM_COMPILER_H */
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 87b90dc27a43..ced44ca3e175 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -323,22 +323,6 @@ static inline void *phys_to_virt(phys_addr_t x)
 #define virt_to_pfn(x)		__phys_to_pfn(__virt_to_phys((unsigned long)(x)))
 #define sym_to_pfn(x)		__phys_to_pfn(__pa_symbol(x))
 
-#ifdef CONFIG_CFI_CLANG
-/*
- * With CONFIG_CFI_CLANG, the compiler replaces function address
- * references with the address of the function's CFI jump table
- * entry. The function_nocfi macro always returns the address of the
- * actual function instead.
- */
-#define function_nocfi(x) ({						\
-	void *addr;							\
-	asm("adrp %0, " __stringify(x) "\n\t"				\
-	    "add  %0, %0, :lo12:" __stringify(x)			\
-	    : "=r" (addr));						\
-	addr;								\
-})
-#endif
-
 /*
  *  virt_to_page(x)	convert a _valid_ virtual address to struct page *
  *  virt_addr_valid(x)	indicates whether a virtual address is valid
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index df5b405e6305..099e529a5d25 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -213,6 +213,16 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	__v;								\
 })
 
+/*
+ * With CONFIG_CFI_CLANG, the compiler replaces function addresses in
+ * instrumented C code with jump table addresses. Architectures that
+ * support CFI can define this macro to return the actual function address
+ * when needed.
+ */
+#ifndef function_nocfi
+#define function_nocfi(x) (x)
+#endif
+
 #endif /* __KERNEL__ */
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c274f75efcf9..b8c28b10f25d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,16 +124,6 @@ extern int mmap_rnd_compat_bits __read_mostly;
 #define lm_alias(x)	__va(__pa_symbol(x))
 #endif
 
-/*
- * With CONFIG_CFI_CLANG, the compiler replaces function addresses in
- * instrumented C code with jump table addresses. Architectures that
- * support CFI can define this macro to return the actual function address
- * when needed.
- */
-#ifndef function_nocfi
-#define function_nocfi(x) (x)
-#endif
-
 /*
  * To prevent common memory management code establishing
  * a zero page mapping on a read fault.
-- 
cgit v1.2.3


From ec4b94f9b37bf028cb9b9c39cd1c1cb5dd1ab40c Mon Sep 17 00:00:00 2001
From: Michael Grzeschik <m.grzeschik@pengutronix.de>
Date: Mon, 14 Jun 2021 06:31:18 +0200
Subject: net: phy: micrel: move phy reg offsets to common header

Some micrel devices share the same PHY register defines. This patch
moves them to one common header so other drivers can reuse them.
And reuse generic MII_* defines where possible.

Signed-off-by: Michael Grzeschik <m.grzeschik@pengutronix.de>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/microchip/ksz8795.c     | 119 ++++++++++++++++----------------
 drivers/net/dsa/microchip/ksz8795_reg.h |  62 -----------------
 drivers/net/ethernet/micrel/ksz884x.c   | 105 ++++------------------------
 include/linux/micrel_phy.h              |  13 ++++
 4 files changed, 88 insertions(+), 211 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c
index ad509a57a945..ba065003623f 100644
--- a/drivers/net/dsa/microchip/ksz8795.c
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -15,6 +15,7 @@
 #include <linux/phy.h>
 #include <linux/etherdevice.h>
 #include <linux/if_bridge.h>
+#include <linux/micrel_phy.h>
 #include <net/dsa.h>
 #include <net/switchdev.h>
 
@@ -731,88 +732,88 @@ static void ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 	u8 p = phy;
 
 	switch (reg) {
-	case PHY_REG_CTRL:
+	case MII_BMCR:
 		ksz_pread8(dev, p, regs[P_NEG_RESTART_CTRL], &restart);
 		ksz_pread8(dev, p, regs[P_SPEED_STATUS], &speed);
 		ksz_pread8(dev, p, regs[P_FORCE_CTRL], &ctrl);
 		if (restart & PORT_PHY_LOOPBACK)
-			data |= PHY_LOOPBACK;
+			data |= BMCR_LOOPBACK;
 		if (ctrl & PORT_FORCE_100_MBIT)
-			data |= PHY_SPEED_100MBIT;
+			data |= BMCR_SPEED100;
 		if (ksz_is_ksz88x3(dev)) {
 			if ((ctrl & PORT_AUTO_NEG_ENABLE))
-				data |= PHY_AUTO_NEG_ENABLE;
+				data |= BMCR_ANENABLE;
 		} else {
 			if (!(ctrl & PORT_AUTO_NEG_DISABLE))
-				data |= PHY_AUTO_NEG_ENABLE;
+				data |= BMCR_ANENABLE;
 		}
 		if (restart & PORT_POWER_DOWN)
-			data |= PHY_POWER_DOWN;
+			data |= BMCR_PDOWN;
 		if (restart & PORT_AUTO_NEG_RESTART)
-			data |= PHY_AUTO_NEG_RESTART;
+			data |= BMCR_ANRESTART;
 		if (ctrl & PORT_FORCE_FULL_DUPLEX)
-			data |= PHY_FULL_DUPLEX;
+			data |= BMCR_FULLDPLX;
 		if (speed & PORT_HP_MDIX)
-			data |= PHY_HP_MDIX;
+			data |= KSZ886X_BMCR_HP_MDIX;
 		if (restart & PORT_FORCE_MDIX)
-			data |= PHY_FORCE_MDIX;
+			data |= KSZ886X_BMCR_FORCE_MDI;
 		if (restart & PORT_AUTO_MDIX_DISABLE)
-			data |= PHY_AUTO_MDIX_DISABLE;
+			data |= KSZ886X_BMCR_DISABLE_AUTO_MDIX;
 		if (restart & PORT_TX_DISABLE)
-			data |= PHY_TRANSMIT_DISABLE;
+			data |= KSZ886X_BMCR_DISABLE_TRANSMIT;
 		if (restart & PORT_LED_OFF)
-			data |= PHY_LED_DISABLE;
+			data |= KSZ886X_BMCR_DISABLE_LED;
 		break;
-	case PHY_REG_STATUS:
+	case MII_BMSR:
 		ksz_pread8(dev, p, regs[P_LINK_STATUS], &link);
-		data = PHY_100BTX_FD_CAPABLE |
-		       PHY_100BTX_CAPABLE |
-		       PHY_10BT_FD_CAPABLE |
-		       PHY_10BT_CAPABLE |
-		       PHY_AUTO_NEG_CAPABLE;
+		data = BMSR_100FULL |
+		       BMSR_100HALF |
+		       BMSR_10FULL |
+		       BMSR_10HALF |
+		       BMSR_ANEGCAPABLE;
 		if (link & PORT_AUTO_NEG_COMPLETE)
-			data |= PHY_AUTO_NEG_ACKNOWLEDGE;
+			data |= BMSR_ANEGCOMPLETE;
 		if (link & PORT_STAT_LINK_GOOD)
-			data |= PHY_LINK_STATUS;
+			data |= BMSR_LSTATUS;
 		break;
-	case PHY_REG_ID_1:
+	case MII_PHYSID1:
 		data = KSZ8795_ID_HI;
 		break;
-	case PHY_REG_ID_2:
+	case MII_PHYSID2:
 		if (ksz_is_ksz88x3(dev))
 			data = KSZ8863_ID_LO;
 		else
 			data = KSZ8795_ID_LO;
 		break;
-	case PHY_REG_AUTO_NEGOTIATION:
+	case MII_ADVERTISE:
 		ksz_pread8(dev, p, regs[P_LOCAL_CTRL], &ctrl);
-		data = PHY_AUTO_NEG_802_3;
+		data = ADVERTISE_CSMA;
 		if (ctrl & PORT_AUTO_NEG_SYM_PAUSE)
-			data |= PHY_AUTO_NEG_SYM_PAUSE;
+			data |= ADVERTISE_PAUSE_CAP;
 		if (ctrl & PORT_AUTO_NEG_100BTX_FD)
-			data |= PHY_AUTO_NEG_100BTX_FD;
+			data |= ADVERTISE_100FULL;
 		if (ctrl & PORT_AUTO_NEG_100BTX)
-			data |= PHY_AUTO_NEG_100BTX;
+			data |= ADVERTISE_100HALF;
 		if (ctrl & PORT_AUTO_NEG_10BT_FD)
-			data |= PHY_AUTO_NEG_10BT_FD;
+			data |= ADVERTISE_10FULL;
 		if (ctrl & PORT_AUTO_NEG_10BT)
-			data |= PHY_AUTO_NEG_10BT;
+			data |= ADVERTISE_10HALF;
 		break;
-	case PHY_REG_REMOTE_CAPABILITY:
+	case MII_LPA:
 		ksz_pread8(dev, p, regs[P_REMOTE_STATUS], &link);
-		data = PHY_AUTO_NEG_802_3;
+		data = LPA_SLCT;
 		if (link & PORT_REMOTE_SYM_PAUSE)
-			data |= PHY_AUTO_NEG_SYM_PAUSE;
+			data |= LPA_PAUSE_CAP;
 		if (link & PORT_REMOTE_100BTX_FD)
-			data |= PHY_AUTO_NEG_100BTX_FD;
+			data |= LPA_100FULL;
 		if (link & PORT_REMOTE_100BTX)
-			data |= PHY_AUTO_NEG_100BTX;
+			data |= LPA_100HALF;
 		if (link & PORT_REMOTE_10BT_FD)
-			data |= PHY_AUTO_NEG_10BT_FD;
+			data |= LPA_10FULL;
 		if (link & PORT_REMOTE_10BT)
-			data |= PHY_AUTO_NEG_10BT;
-		if (data & ~PHY_AUTO_NEG_802_3)
-			data |= PHY_REMOTE_ACKNOWLEDGE_NOT;
+			data |= LPA_10HALF;
+		if (data & ~LPA_SLCT)
+			data |= LPA_LPACK;
 		break;
 	default:
 		processed = false;
@@ -830,14 +831,14 @@ static void ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 	u8 p = phy;
 
 	switch (reg) {
-	case PHY_REG_CTRL:
+	case MII_BMCR:
 
 		/* Do not support PHY reset function. */
-		if (val & PHY_RESET)
+		if (val & BMCR_RESET)
 			break;
 		ksz_pread8(dev, p, regs[P_SPEED_STATUS], &speed);
 		data = speed;
-		if (val & PHY_HP_MDIX)
+		if (val & KSZ886X_BMCR_HP_MDIX)
 			data |= PORT_HP_MDIX;
 		else
 			data &= ~PORT_HP_MDIX;
@@ -846,12 +847,12 @@ static void ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 		ksz_pread8(dev, p, regs[P_FORCE_CTRL], &ctrl);
 		data = ctrl;
 		if (ksz_is_ksz88x3(dev)) {
-			if ((val & PHY_AUTO_NEG_ENABLE))
+			if ((val & BMCR_ANENABLE))
 				data |= PORT_AUTO_NEG_ENABLE;
 			else
 				data &= ~PORT_AUTO_NEG_ENABLE;
 		} else {
-			if (!(val & PHY_AUTO_NEG_ENABLE))
+			if (!(val & BMCR_ANENABLE))
 				data |= PORT_AUTO_NEG_DISABLE;
 			else
 				data &= ~PORT_AUTO_NEG_DISABLE;
@@ -861,11 +862,11 @@ static void ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 				data |= PORT_AUTO_NEG_DISABLE;
 		}
 
-		if (val & PHY_SPEED_100MBIT)
+		if (val & BMCR_SPEED100)
 			data |= PORT_FORCE_100_MBIT;
 		else
 			data &= ~PORT_FORCE_100_MBIT;
-		if (val & PHY_FULL_DUPLEX)
+		if (val & BMCR_FULLDPLX)
 			data |= PORT_FORCE_FULL_DUPLEX;
 		else
 			data &= ~PORT_FORCE_FULL_DUPLEX;
@@ -873,38 +874,38 @@ static void ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 			ksz_pwrite8(dev, p, regs[P_FORCE_CTRL], data);
 		ksz_pread8(dev, p, regs[P_NEG_RESTART_CTRL], &restart);
 		data = restart;
-		if (val & PHY_LED_DISABLE)
+		if (val & KSZ886X_BMCR_DISABLE_LED)
 			data |= PORT_LED_OFF;
 		else
 			data &= ~PORT_LED_OFF;
-		if (val & PHY_TRANSMIT_DISABLE)
+		if (val & KSZ886X_BMCR_DISABLE_TRANSMIT)
 			data |= PORT_TX_DISABLE;
 		else
 			data &= ~PORT_TX_DISABLE;
-		if (val & PHY_AUTO_NEG_RESTART)
+		if (val & BMCR_ANRESTART)
 			data |= PORT_AUTO_NEG_RESTART;
 		else
 			data &= ~(PORT_AUTO_NEG_RESTART);
-		if (val & PHY_POWER_DOWN)
+		if (val & BMCR_PDOWN)
 			data |= PORT_POWER_DOWN;
 		else
 			data &= ~PORT_POWER_DOWN;
-		if (val & PHY_AUTO_MDIX_DISABLE)
+		if (val & KSZ886X_BMCR_DISABLE_AUTO_MDIX)
 			data |= PORT_AUTO_MDIX_DISABLE;
 		else
 			data &= ~PORT_AUTO_MDIX_DISABLE;
-		if (val & PHY_FORCE_MDIX)
+		if (val & KSZ886X_BMCR_FORCE_MDI)
 			data |= PORT_FORCE_MDIX;
 		else
 			data &= ~PORT_FORCE_MDIX;
-		if (val & PHY_LOOPBACK)
+		if (val & BMCR_LOOPBACK)
 			data |= PORT_PHY_LOOPBACK;
 		else
 			data &= ~PORT_PHY_LOOPBACK;
 		if (data != restart)
 			ksz_pwrite8(dev, p, regs[P_NEG_RESTART_CTRL], data);
 		break;
-	case PHY_REG_AUTO_NEGOTIATION:
+	case MII_ADVERTISE:
 		ksz_pread8(dev, p, regs[P_LOCAL_CTRL], &ctrl);
 		data = ctrl;
 		data &= ~(PORT_AUTO_NEG_SYM_PAUSE |
@@ -912,15 +913,15 @@ static void ksz8_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
 			  PORT_AUTO_NEG_100BTX |
 			  PORT_AUTO_NEG_10BT_FD |
 			  PORT_AUTO_NEG_10BT);
-		if (val & PHY_AUTO_NEG_SYM_PAUSE)
+		if (val & ADVERTISE_PAUSE_CAP)
 			data |= PORT_AUTO_NEG_SYM_PAUSE;
-		if (val & PHY_AUTO_NEG_100BTX_FD)
+		if (val & ADVERTISE_100FULL)
 			data |= PORT_AUTO_NEG_100BTX_FD;
-		if (val & PHY_AUTO_NEG_100BTX)
+		if (val & ADVERTISE_100HALF)
 			data |= PORT_AUTO_NEG_100BTX;
-		if (val & PHY_AUTO_NEG_10BT_FD)
+		if (val & ADVERTISE_10FULL)
 			data |= PORT_AUTO_NEG_10BT_FD;
-		if (val & PHY_AUTO_NEG_10BT)
+		if (val & ADVERTISE_10HALF)
 			data |= PORT_AUTO_NEG_10BT;
 		if (data != ctrl)
 			ksz_pwrite8(dev, p, regs[P_LOCAL_CTRL], data);
diff --git a/drivers/net/dsa/microchip/ksz8795_reg.h b/drivers/net/dsa/microchip/ksz8795_reg.h
index c2e52c40a54c..f925ddee5238 100644
--- a/drivers/net/dsa/microchip/ksz8795_reg.h
+++ b/drivers/net/dsa/microchip/ksz8795_reg.h
@@ -744,68 +744,6 @@
 
 #define PORT_ACL_FORCE_DLR_MISS		BIT(0)
 
-#ifndef PHY_REG_CTRL
-#define PHY_REG_CTRL			0
-
-#define PHY_RESET			BIT(15)
-#define PHY_LOOPBACK			BIT(14)
-#define PHY_SPEED_100MBIT		BIT(13)
-#define PHY_AUTO_NEG_ENABLE		BIT(12)
-#define PHY_POWER_DOWN			BIT(11)
-#define PHY_MII_DISABLE			BIT(10)
-#define PHY_AUTO_NEG_RESTART		BIT(9)
-#define PHY_FULL_DUPLEX			BIT(8)
-#define PHY_COLLISION_TEST_NOT		BIT(7)
-#define PHY_HP_MDIX			BIT(5)
-#define PHY_FORCE_MDIX			BIT(4)
-#define PHY_AUTO_MDIX_DISABLE		BIT(3)
-#define PHY_REMOTE_FAULT_DISABLE	BIT(2)
-#define PHY_TRANSMIT_DISABLE		BIT(1)
-#define PHY_LED_DISABLE			BIT(0)
-
-#define PHY_REG_STATUS			1
-
-#define PHY_100BT4_CAPABLE		BIT(15)
-#define PHY_100BTX_FD_CAPABLE		BIT(14)
-#define PHY_100BTX_CAPABLE		BIT(13)
-#define PHY_10BT_FD_CAPABLE		BIT(12)
-#define PHY_10BT_CAPABLE		BIT(11)
-#define PHY_MII_SUPPRESS_CAPABLE_NOT	BIT(6)
-#define PHY_AUTO_NEG_ACKNOWLEDGE	BIT(5)
-#define PHY_REMOTE_FAULT		BIT(4)
-#define PHY_AUTO_NEG_CAPABLE		BIT(3)
-#define PHY_LINK_STATUS			BIT(2)
-#define PHY_JABBER_DETECT_NOT		BIT(1)
-#define PHY_EXTENDED_CAPABILITY		BIT(0)
-
-#define PHY_REG_ID_1			2
-#define PHY_REG_ID_2			3
-
-#define PHY_REG_AUTO_NEGOTIATION	4
-
-#define PHY_AUTO_NEG_NEXT_PAGE_NOT	BIT(15)
-#define PHY_AUTO_NEG_REMOTE_FAULT_NOT	BIT(13)
-#define PHY_AUTO_NEG_SYM_PAUSE		BIT(10)
-#define PHY_AUTO_NEG_100BT4		BIT(9)
-#define PHY_AUTO_NEG_100BTX_FD		BIT(8)
-#define PHY_AUTO_NEG_100BTX		BIT(7)
-#define PHY_AUTO_NEG_10BT_FD		BIT(6)
-#define PHY_AUTO_NEG_10BT		BIT(5)
-#define PHY_AUTO_NEG_SELECTOR		0x001F
-#define PHY_AUTO_NEG_802_3		0x0001
-
-#define PHY_REG_REMOTE_CAPABILITY	5
-
-#define PHY_REMOTE_NEXT_PAGE_NOT	BIT(15)
-#define PHY_REMOTE_ACKNOWLEDGE_NOT	BIT(14)
-#define PHY_REMOTE_REMOTE_FAULT_NOT	BIT(13)
-#define PHY_REMOTE_SYM_PAUSE		BIT(10)
-#define PHY_REMOTE_100BTX_FD		BIT(8)
-#define PHY_REMOTE_100BTX		BIT(7)
-#define PHY_REMOTE_10BT_FD		BIT(6)
-#define PHY_REMOTE_10BT			BIT(5)
-#endif
-
 #define KSZ8795_ID_HI			0x0022
 #define KSZ8795_ID_LO			0x1550
 #define KSZ8863_ID_LO			0x1430
diff --git a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c
index 3532bfe936f6..7945eb5e2fe8 100644
--- a/drivers/net/ethernet/micrel/ksz884x.c
+++ b/drivers/net/ethernet/micrel/ksz884x.c
@@ -25,6 +25,7 @@
 #include <linux/crc32.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/micrel_phy.h>
 
 
 /* DMA Registers */
@@ -271,84 +272,15 @@
 
 #define KS884X_PHY_CTRL_OFFSET		0x00
 
-/* Mode Control Register */
-#define PHY_REG_CTRL			0
-
-#define PHY_RESET			0x8000
-#define PHY_LOOPBACK			0x4000
-#define PHY_SPEED_100MBIT		0x2000
-#define PHY_AUTO_NEG_ENABLE		0x1000
-#define PHY_POWER_DOWN			0x0800
-#define PHY_MII_DISABLE			0x0400
-#define PHY_AUTO_NEG_RESTART		0x0200
-#define PHY_FULL_DUPLEX			0x0100
-#define PHY_COLLISION_TEST		0x0080
-#define PHY_HP_MDIX			0x0020
-#define PHY_FORCE_MDIX			0x0010
-#define PHY_AUTO_MDIX_DISABLE		0x0008
-#define PHY_REMOTE_FAULT_DISABLE	0x0004
-#define PHY_TRANSMIT_DISABLE		0x0002
-#define PHY_LED_DISABLE			0x0001
-
 #define KS884X_PHY_STATUS_OFFSET	0x02
 
-/* Mode Status Register */
-#define PHY_REG_STATUS			1
-
-#define PHY_100BT4_CAPABLE		0x8000
-#define PHY_100BTX_FD_CAPABLE		0x4000
-#define PHY_100BTX_CAPABLE		0x2000
-#define PHY_10BT_FD_CAPABLE		0x1000
-#define PHY_10BT_CAPABLE		0x0800
-#define PHY_MII_SUPPRESS_CAPABLE	0x0040
-#define PHY_AUTO_NEG_ACKNOWLEDGE	0x0020
-#define PHY_REMOTE_FAULT		0x0010
-#define PHY_AUTO_NEG_CAPABLE		0x0008
-#define PHY_LINK_STATUS			0x0004
-#define PHY_JABBER_DETECT		0x0002
-#define PHY_EXTENDED_CAPABILITY		0x0001
-
 #define KS884X_PHY_ID_1_OFFSET		0x04
 #define KS884X_PHY_ID_2_OFFSET		0x06
 
-/* PHY Identifier Registers */
-#define PHY_REG_ID_1			2
-#define PHY_REG_ID_2			3
-
 #define KS884X_PHY_AUTO_NEG_OFFSET	0x08
 
-/* Auto-Negotiation Advertisement Register */
-#define PHY_REG_AUTO_NEGOTIATION	4
-
-#define PHY_AUTO_NEG_NEXT_PAGE		0x8000
-#define PHY_AUTO_NEG_REMOTE_FAULT	0x2000
-/* Not supported. */
-#define PHY_AUTO_NEG_ASYM_PAUSE		0x0800
-#define PHY_AUTO_NEG_SYM_PAUSE		0x0400
-#define PHY_AUTO_NEG_100BT4		0x0200
-#define PHY_AUTO_NEG_100BTX_FD		0x0100
-#define PHY_AUTO_NEG_100BTX		0x0080
-#define PHY_AUTO_NEG_10BT_FD		0x0040
-#define PHY_AUTO_NEG_10BT		0x0020
-#define PHY_AUTO_NEG_SELECTOR		0x001F
-#define PHY_AUTO_NEG_802_3		0x0001
-
-#define PHY_AUTO_NEG_PAUSE  (PHY_AUTO_NEG_SYM_PAUSE | PHY_AUTO_NEG_ASYM_PAUSE)
-
 #define KS884X_PHY_REMOTE_CAP_OFFSET	0x0A
 
-/* Auto-Negotiation Link Partner Ability Register */
-#define PHY_REG_REMOTE_CAPABILITY	5
-
-#define PHY_REMOTE_NEXT_PAGE		0x8000
-#define PHY_REMOTE_ACKNOWLEDGE		0x4000
-#define PHY_REMOTE_REMOTE_FAULT		0x2000
-#define PHY_REMOTE_SYM_PAUSE		0x0400
-#define PHY_REMOTE_100BTX_FD		0x0100
-#define PHY_REMOTE_100BTX		0x0080
-#define PHY_REMOTE_10BT_FD		0x0040
-#define PHY_REMOTE_10BT			0x0020
-
 /* P1VCT */
 #define KS884X_P1VCT_P			0x04F0
 #define KS884X_P1PHYCTRL_P		0x04F2
@@ -2886,15 +2818,6 @@ static void sw_block_addr(struct ksz_hw *hw)
 	}
 }
 
-#define PHY_LINK_SUPPORT		\
-	(PHY_AUTO_NEG_ASYM_PAUSE |	\
-	PHY_AUTO_NEG_SYM_PAUSE |	\
-	PHY_AUTO_NEG_100BT4 |		\
-	PHY_AUTO_NEG_100BTX_FD |	\
-	PHY_AUTO_NEG_100BTX |		\
-	PHY_AUTO_NEG_10BT_FD |		\
-	PHY_AUTO_NEG_10BT)
-
 static inline void hw_r_phy_ctrl(struct ksz_hw *hw, int phy, u16 *data)
 {
 	*data = readw(hw->io + phy + KS884X_PHY_CTRL_OFFSET);
@@ -3238,16 +3161,18 @@ static void determine_flow_ctrl(struct ksz_hw *hw, struct ksz_port *port,
 	rx = tx = 0;
 	if (port->force_link)
 		rx = tx = 1;
-	if (remote & PHY_AUTO_NEG_SYM_PAUSE) {
-		if (local & PHY_AUTO_NEG_SYM_PAUSE) {
+	if (remote & LPA_PAUSE_CAP) {
+		if (local & ADVERTISE_PAUSE_CAP) {
 			rx = tx = 1;
-		} else if ((remote & PHY_AUTO_NEG_ASYM_PAUSE) &&
-				(local & PHY_AUTO_NEG_PAUSE) ==
-				PHY_AUTO_NEG_ASYM_PAUSE) {
+		} else if ((remote & LPA_PAUSE_ASYM) &&
+			   (local &
+			    (ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM)) ==
+			   ADVERTISE_PAUSE_ASYM) {
 			tx = 1;
 		}
-	} else if (remote & PHY_AUTO_NEG_ASYM_PAUSE) {
-		if ((local & PHY_AUTO_NEG_PAUSE) == PHY_AUTO_NEG_PAUSE)
+	} else if (remote & LPA_PAUSE_ASYM) {
+		if ((local & (ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM))
+		    == (ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM))
 			rx = 1;
 	}
 	if (!hw->ksz_switch)
@@ -3428,16 +3353,16 @@ static void port_force_link_speed(struct ksz_port *port)
 		phy = KS884X_PHY_1_CTRL_OFFSET + p * PHY_CTRL_INTERVAL;
 		hw_r_phy_ctrl(hw, phy, &data);
 
-		data &= ~PHY_AUTO_NEG_ENABLE;
+		data &= ~BMCR_ANENABLE;
 
 		if (10 == port->speed)
-			data &= ~PHY_SPEED_100MBIT;
+			data &= ~BMCR_SPEED100;
 		else if (100 == port->speed)
-			data |= PHY_SPEED_100MBIT;
+			data |= BMCR_SPEED100;
 		if (1 == port->duplex)
-			data &= ~PHY_FULL_DUPLEX;
+			data &= ~BMCR_FULLDPLX;
 		else if (2 == port->duplex)
-			data |= PHY_FULL_DUPLEX;
+			data |= BMCR_FULLDPLX;
 		hw_w_phy_ctrl(hw, phy, data);
 	}
 }
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 416ee6dd2574..b03e2afcb53f 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -45,4 +45,17 @@
 #define MICREL_KSZ9021_RGMII_CLK_CTRL_PAD_SCEW	0x104
 #define MICREL_KSZ9021_RGMII_RX_DATA_PAD_SCEW	0x105
 
+/* Device specific MII_BMCR (Reg 0) bits */
+/* 1 = HP Auto MDI/MDI-X mode, 0 = Microchip Auto MDI/MDI-X mode */
+#define KSZ886X_BMCR_HP_MDIX			BIT(5)
+/* 1 = Force MDI (transmit on RXP/RXM pins), 0 = Normal operation
+ * (transmit on TXP/TXM pins)
+ */
+#define KSZ886X_BMCR_FORCE_MDI			BIT(4)
+/* 1 = Disable auto MDI-X */
+#define KSZ886X_BMCR_DISABLE_AUTO_MDIX		BIT(3)
+#define KSZ886X_BMCR_DISABLE_FAR_END_FAULT	BIT(2)
+#define KSZ886X_BMCR_DISABLE_TRANSMIT		BIT(1)
+#define KSZ886X_BMCR_DISABLE_LED		BIT(0)
+
 #endif /* _MICREL_PHY_H */
-- 
cgit v1.2.3


From 52939393bd682248a415de4c0439280aafaccd66 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 14 Jun 2021 06:31:21 +0200
Subject: net: phy/dsa micrel/ksz886x add MDI-X support

Add support for MDI-X status and configuration

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/microchip/ksz8795.c |  5 +++
 drivers/net/phy/micrel.c            | 88 +++++++++++++++++++++++++++++++++++++
 include/linux/micrel_phy.h          |  2 +
 3 files changed, 95 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c
index cfa2a5000cd3..690304c87b02 100644
--- a/drivers/net/dsa/microchip/ksz8795.c
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -816,6 +816,11 @@ static void ksz8_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
 		if (data & ~LPA_SLCT)
 			data |= LPA_LPACK;
 		break;
+	case PHY_REG_PHY_CTRL:
+		ksz_pread8(dev, p, regs[P_LINK_STATUS], &link);
+		if (link & PORT_MDIX_STATUS)
+			data |= KSZ886X_CTRL_MDIX_STAT;
+		break;
 	default:
 		processed = false;
 		break;
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index a2755f2eb7db..9ffca754f6f7 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -1045,6 +1045,92 @@ static int ksz8873mll_config_aneg(struct phy_device *phydev)
 	return 0;
 }
 
+static int ksz886x_config_mdix(struct phy_device *phydev, u8 ctrl)
+{
+	u16 val;
+
+	switch (ctrl) {
+	case ETH_TP_MDI:
+		val = KSZ886X_BMCR_DISABLE_AUTO_MDIX;
+		break;
+	case ETH_TP_MDI_X:
+		/* Note: The naming of the bit KSZ886X_BMCR_FORCE_MDI is bit
+		 * counter intuitive, the "-X" in "1 = Force MDI" in the data
+		 * sheet seems to be missing:
+		 * 1 = Force MDI (sic!) (transmit on RX+/RX- pins)
+		 * 0 = Normal operation (transmit on TX+/TX- pins)
+		 */
+		val = KSZ886X_BMCR_DISABLE_AUTO_MDIX | KSZ886X_BMCR_FORCE_MDI;
+		break;
+	case ETH_TP_MDI_AUTO:
+		val = 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return phy_modify(phydev, MII_BMCR,
+			  KSZ886X_BMCR_HP_MDIX | KSZ886X_BMCR_FORCE_MDI |
+			  KSZ886X_BMCR_DISABLE_AUTO_MDIX,
+			  KSZ886X_BMCR_HP_MDIX | val);
+}
+
+static int ksz886x_config_aneg(struct phy_device *phydev)
+{
+	int ret;
+
+	ret = genphy_config_aneg(phydev);
+	if (ret)
+		return ret;
+
+	/* The MDI-X configuration is automatically changed by the PHY after
+	 * switching from autoneg off to on. So, take MDI-X configuration under
+	 * own control and set it after autoneg configuration was done.
+	 */
+	return ksz886x_config_mdix(phydev, phydev->mdix_ctrl);
+}
+
+static int ksz886x_mdix_update(struct phy_device *phydev)
+{
+	int ret;
+
+	ret = phy_read(phydev, MII_BMCR);
+	if (ret < 0)
+		return ret;
+
+	if (ret & KSZ886X_BMCR_DISABLE_AUTO_MDIX) {
+		if (ret & KSZ886X_BMCR_FORCE_MDI)
+			phydev->mdix_ctrl = ETH_TP_MDI_X;
+		else
+			phydev->mdix_ctrl = ETH_TP_MDI;
+	} else {
+		phydev->mdix_ctrl = ETH_TP_MDI_AUTO;
+	}
+
+	ret = phy_read(phydev, MII_KSZPHY_CTRL);
+	if (ret < 0)
+		return ret;
+
+	/* Same reverse logic as KSZ886X_BMCR_FORCE_MDI */
+	if (ret & KSZ886X_CTRL_MDIX_STAT)
+		phydev->mdix = ETH_TP_MDI_X;
+	else
+		phydev->mdix = ETH_TP_MDI;
+
+	return 0;
+}
+
+static int ksz886x_read_status(struct phy_device *phydev)
+{
+	int ret;
+
+	ret = ksz886x_mdix_update(phydev);
+	if (ret < 0)
+		return ret;
+
+	return genphy_read_status(phydev);
+}
+
 static int kszphy_get_sset_count(struct phy_device *phydev)
 {
 	return ARRAY_SIZE(kszphy_hw_stats);
@@ -1397,6 +1483,8 @@ static struct phy_driver ksphy_driver[] = {
 	.name		= "Micrel KSZ8851 Ethernet MAC or KSZ886X Switch",
 	/* PHY_BASIC_FEATURES */
 	.config_init	= kszphy_config_init,
+	.config_aneg	= ksz886x_config_aneg,
+	.read_status	= ksz886x_read_status,
 	.suspend	= genphy_suspend,
 	.resume		= genphy_resume,
 }, {
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index b03e2afcb53f..58370abd9f4f 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -58,4 +58,6 @@
 #define KSZ886X_BMCR_DISABLE_TRANSMIT		BIT(1)
 #define KSZ886X_BMCR_DISABLE_LED		BIT(0)
 
+#define KSZ886X_CTRL_MDIX_STAT			BIT(4)
+
 #endif /* _MICREL_PHY_H */
-- 
cgit v1.2.3


From 49011e0c1555dd7a689d0f32fd78c1ecd43e59cd Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 14 Jun 2021 06:31:25 +0200
Subject: net: phy: micrel: ksz886x/ksz8081: add cabletest support

This patch support for cable test for the ksz886x switches and the
ksz8081 PHY.

The patch was tested on a KSZ8873RLL switch with following results:

- port 1:
  - provides invalid values, thus return -ENOTSUPP
    (Errata: DS80000830A: "LinkMD does not work on Port 1",
     http://ww1.microchip.com/downloads/en/DeviceDoc/KSZ8873-Errata-DS80000830A.pdf)

- port 2:
  - can detect distance
  - can detect open on each wire of pair A (wire 1 and 2)
  - can detect open only on one wire of pair B (only wire 3)
  - can detect short between wires of a pair (wires 1 + 2 or 3 + 6)
  - short between pairs is detected as open.
    For example short between wires 2 + 3 is detected as open.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/microchip/ksz8795.c |  13 +++
 drivers/net/phy/micrel.c            | 180 ++++++++++++++++++++++++++++++++++++
 include/linux/micrel_phy.h          |   1 +
 3 files changed, 194 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c
index e1731ae4497d..560f6843bb65 100644
--- a/drivers/net/dsa/microchip/ksz8795.c
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -970,6 +970,18 @@ static enum dsa_tag_protocol ksz8_get_tag_protocol(struct dsa_switch *ds,
 		DSA_TAG_PROTO_KSZ9893 : DSA_TAG_PROTO_KSZ8795;
 }
 
+static u32 ksz8_sw_get_phy_flags(struct dsa_switch *ds, int port)
+{
+	/* Silicon Errata Sheet (DS80000830A):
+	 * Port 1 does not work with LinkMD Cable-Testing.
+	 * Port 1 does not respond to received PAUSE control frames.
+	 */
+	if (!port)
+		return MICREL_KSZ8_P1_ERRATA;
+
+	return 0;
+}
+
 static void ksz8_get_strings(struct dsa_switch *ds, int port,
 			     u32 stringset, uint8_t *buf)
 {
@@ -1503,6 +1515,7 @@ unsupported:
 
 static const struct dsa_switch_ops ksz8_switch_ops = {
 	.get_tag_protocol	= ksz8_get_tag_protocol,
+	.get_phy_flags		= ksz8_sw_get_phy_flags,
 	.setup			= ksz8_setup,
 	.phy_read		= ksz_phy_read16,
 	.phy_write		= ksz_phy_write16,
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 5ca39ce8db96..4d53886f7d51 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/ethtool_netlink.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/phy.h>
@@ -53,6 +54,18 @@
 #define KSZPHY_INTCS_STATUS			(KSZPHY_INTCS_LINK_DOWN_STATUS |\
 						 KSZPHY_INTCS_LINK_UP_STATUS)
 
+/* LinkMD Control/Status */
+#define KSZ8081_LMD				0x1d
+#define KSZ8081_LMD_ENABLE_TEST			BIT(15)
+#define KSZ8081_LMD_STAT_NORMAL			0
+#define KSZ8081_LMD_STAT_OPEN			1
+#define KSZ8081_LMD_STAT_SHORT			2
+#define KSZ8081_LMD_STAT_FAIL			3
+#define KSZ8081_LMD_STAT_MASK			GENMASK(14, 13)
+/* Short cable (<10 meter) has been detected by LinkMD */
+#define KSZ8081_LMD_SHORT_INDICATOR		BIT(12)
+#define KSZ8081_LMD_DELTA_TIME_MASK		GENMASK(8, 0)
+
 /* PHY Control 1 */
 #define MII_KSZPHY_CTRL_1			0x1e
 #define KSZ8081_CTRL1_MDIX_STAT			BIT(4)
@@ -1363,6 +1376,167 @@ static int kszphy_probe(struct phy_device *phydev)
 	return 0;
 }
 
+static int ksz886x_cable_test_start(struct phy_device *phydev)
+{
+	if (phydev->dev_flags & MICREL_KSZ8_P1_ERRATA)
+		return -EOPNOTSUPP;
+
+	/* If autoneg is enabled, we won't be able to test cross pair
+	 * short. In this case, the PHY will "detect" a link and
+	 * confuse the internal state machine - disable auto neg here.
+	 * If autoneg is disabled, we should set the speed to 10mbit.
+	 */
+	return phy_clear_bits(phydev, MII_BMCR, BMCR_ANENABLE | BMCR_SPEED100);
+}
+
+static int ksz886x_cable_test_result_trans(u16 status)
+{
+	switch (FIELD_GET(KSZ8081_LMD_STAT_MASK, status)) {
+	case KSZ8081_LMD_STAT_NORMAL:
+		return ETHTOOL_A_CABLE_RESULT_CODE_OK;
+	case KSZ8081_LMD_STAT_SHORT:
+		return ETHTOOL_A_CABLE_RESULT_CODE_SAME_SHORT;
+	case KSZ8081_LMD_STAT_OPEN:
+		return ETHTOOL_A_CABLE_RESULT_CODE_OPEN;
+	case KSZ8081_LMD_STAT_FAIL:
+		fallthrough;
+	default:
+		return ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC;
+	}
+}
+
+static bool ksz886x_cable_test_failed(u16 status)
+{
+	return FIELD_GET(KSZ8081_LMD_STAT_MASK, status) ==
+		KSZ8081_LMD_STAT_FAIL;
+}
+
+static bool ksz886x_cable_test_fault_length_valid(u16 status)
+{
+	switch (FIELD_GET(KSZ8081_LMD_STAT_MASK, status)) {
+	case KSZ8081_LMD_STAT_OPEN:
+		fallthrough;
+	case KSZ8081_LMD_STAT_SHORT:
+		return true;
+	}
+	return false;
+}
+
+static int ksz886x_cable_test_fault_length(u16 status)
+{
+	int dt;
+
+	/* According to the data sheet the distance to the fault is
+	 * DELTA_TIME * 0.4 meters.
+	 */
+	dt = FIELD_GET(KSZ8081_LMD_DELTA_TIME_MASK, status);
+
+	return (dt * 400) / 10;
+}
+
+static int ksz886x_cable_test_wait_for_completion(struct phy_device *phydev)
+{
+	int val, ret;
+
+	ret = phy_read_poll_timeout(phydev, KSZ8081_LMD, val,
+				    !(val & KSZ8081_LMD_ENABLE_TEST),
+				    30000, 100000, true);
+
+	return ret < 0 ? ret : 0;
+}
+
+static int ksz886x_cable_test_one_pair(struct phy_device *phydev, int pair)
+{
+	static const int ethtool_pair[] = {
+		ETHTOOL_A_CABLE_PAIR_A,
+		ETHTOOL_A_CABLE_PAIR_B,
+	};
+	int ret, val, mdix;
+
+	/* There is no way to choice the pair, like we do one ksz9031.
+	 * We can workaround this limitation by using the MDI-X functionality.
+	 */
+	if (pair == 0)
+		mdix = ETH_TP_MDI;
+	else
+		mdix = ETH_TP_MDI_X;
+
+	switch (phydev->phy_id & MICREL_PHY_ID_MASK) {
+	case PHY_ID_KSZ8081:
+		ret = ksz8081_config_mdix(phydev, mdix);
+		break;
+	case PHY_ID_KSZ886X:
+		ret = ksz886x_config_mdix(phydev, mdix);
+		break;
+	default:
+		ret = -ENODEV;
+	}
+
+	if (ret)
+		return ret;
+
+	/* Now we are ready to fire. This command will send a 100ns pulse
+	 * to the pair.
+	 */
+	ret = phy_write(phydev, KSZ8081_LMD, KSZ8081_LMD_ENABLE_TEST);
+	if (ret)
+		return ret;
+
+	ret = ksz886x_cable_test_wait_for_completion(phydev);
+	if (ret)
+		return ret;
+
+	val = phy_read(phydev, KSZ8081_LMD);
+	if (val < 0)
+		return val;
+
+	if (ksz886x_cable_test_failed(val))
+		return -EAGAIN;
+
+	ret = ethnl_cable_test_result(phydev, ethtool_pair[pair],
+				      ksz886x_cable_test_result_trans(val));
+	if (ret)
+		return ret;
+
+	if (!ksz886x_cable_test_fault_length_valid(val))
+		return 0;
+
+	return ethnl_cable_test_fault_length(phydev, ethtool_pair[pair],
+					     ksz886x_cable_test_fault_length(val));
+}
+
+static int ksz886x_cable_test_get_status(struct phy_device *phydev,
+					 bool *finished)
+{
+	unsigned long pair_mask = 0x3;
+	int retries = 20;
+	int pair, ret;
+
+	*finished = false;
+
+	/* Try harder if link partner is active */
+	while (pair_mask && retries--) {
+		for_each_set_bit(pair, &pair_mask, 4) {
+			ret = ksz886x_cable_test_one_pair(phydev, pair);
+			if (ret == -EAGAIN)
+				continue;
+			if (ret < 0)
+				return ret;
+			clear_bit(pair, &pair_mask);
+		}
+		/* If link partner is in autonegotiation mode it will send 2ms
+		 * of FLPs with at least 6ms of silence.
+		 * Add 2ms sleep to have better chances to hit this silence.
+		 */
+		if (pair_mask)
+			msleep(2);
+	}
+
+	*finished = true;
+
+	return ret;
+}
+
 static struct phy_driver ksphy_driver[] = {
 {
 	.phy_id		= PHY_ID_KS8737,
@@ -1469,6 +1643,7 @@ static struct phy_driver ksphy_driver[] = {
 	.phy_id		= PHY_ID_KSZ8081,
 	.name		= "Micrel KSZ8081 or KSZ8091",
 	.phy_id_mask	= MICREL_PHY_ID_MASK,
+	.flags		= PHY_POLL_CABLE_TEST,
 	/* PHY_BASIC_FEATURES */
 	.driver_data	= &ksz8081_type,
 	.probe		= kszphy_probe,
@@ -1483,6 +1658,8 @@ static struct phy_driver ksphy_driver[] = {
 	.get_stats	= kszphy_get_stats,
 	.suspend	= kszphy_suspend,
 	.resume		= kszphy_resume,
+	.cable_test_start	= ksz886x_cable_test_start,
+	.cable_test_get_status	= ksz886x_cable_test_get_status,
 }, {
 	.phy_id		= PHY_ID_KSZ8061,
 	.name		= "Micrel KSZ8061",
@@ -1571,11 +1748,14 @@ static struct phy_driver ksphy_driver[] = {
 	.phy_id_mask	= MICREL_PHY_ID_MASK,
 	.name		= "Micrel KSZ8851 Ethernet MAC or KSZ886X Switch",
 	/* PHY_BASIC_FEATURES */
+	.flags		= PHY_POLL_CABLE_TEST,
 	.config_init	= kszphy_config_init,
 	.config_aneg	= ksz886x_config_aneg,
 	.read_status	= ksz886x_read_status,
 	.suspend	= genphy_suspend,
 	.resume		= genphy_resume,
+	.cable_test_start	= ksz886x_cable_test_start,
+	.cable_test_get_status	= ksz886x_cable_test_get_status,
 }, {
 	.name		= "Micrel KSZ87XX Switch",
 	/* PHY_BASIC_FEATURES */
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 58370abd9f4f..3d43c60b49fa 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -39,6 +39,7 @@
 /* struct phy_device dev_flags definitions */
 #define MICREL_PHY_50MHZ_CLK	0x00000001
 #define MICREL_PHY_FXEN		0x00000002
+#define MICREL_KSZ8_P1_ERRATA	0x00000003
 
 #define MICREL_KSZ9021_EXTREG_CTRL	0xB
 #define MICREL_KSZ9021_EXTREG_DATA_WRITE	0xC
-- 
cgit v1.2.3


From e4e3f24b822f9dc9ae2427a8d686e8c1d80d6bd2 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Tue, 23 Feb 2021 10:37:05 +0200
Subject: net/mlx5: Provide cpumask at EQ creation phase

The users of EQ are running their code on different CPUs and with
various affinity patterns. Move the cpumask setting close to their
actual usage.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/infiniband/hw/mlx5/odp.c                   |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |  27 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c  | 103 +++++----------------
 include/linux/mlx5/eq.h                            |   1 +
 5 files changed, 49 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 782b2af8f211..8f88b044ccbc 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1564,7 +1564,12 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 		.nent = MLX5_IB_NUM_PF_EQE,
 	};
 	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
+	if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
+		err = -ENOMEM;
+		goto err_wq;
+	}
 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
+	free_cpumask_var(param.affinity);
 	if (IS_ERR(eq->core)) {
 		err = PTR_ERR(eq->core);
 		goto err_wq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 7e7bbed3763d..5a88887c1a58 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -310,7 +310,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	mlx5_init_fbc(eq->frag_buf.frags, log_eq_stride, log_eq_size, &eq->fbc);
 	init_eq_buf(eq);
 
-	eq->irq = mlx5_irq_request(dev, vecidx);
+	eq->irq = mlx5_irq_request(dev, vecidx, param->affinity);
 	if (IS_ERR(eq->irq)) {
 		err = PTR_ERR(eq->irq);
 		goto err_buf;
@@ -621,8 +621,11 @@ setup_async_eq(struct mlx5_core_dev *dev, struct mlx5_eq_async *eq,
 
 	eq->irq_nb.notifier_call = mlx5_eq_async_int;
 	spin_lock_init(&eq->lock);
+	if (!zalloc_cpumask_var(&param->affinity, GFP_KERNEL))
+		return -ENOMEM;
 
 	err = create_async_eq(dev, &eq->core, param);
+	free_cpumask_var(param->affinity);
 	if (err) {
 		mlx5_core_warn(dev, "failed to create %s EQ %d\n", name, err);
 		return err;
@@ -740,6 +743,9 @@ mlx5_eq_create_generic(struct mlx5_core_dev *dev,
 	struct mlx5_eq *eq = kvzalloc(sizeof(*eq), GFP_KERNEL);
 	int err;
 
+	if (!param->affinity)
+		return ERR_PTR(-EINVAL);
+
 	if (!eq)
 		return ERR_PTR(-ENOMEM);
 
@@ -850,16 +856,21 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 			.irq_index = vecidx,
 			.nent = nent,
 		};
-		err = create_map_eq(dev, &eq->core, &param);
-		if (err) {
-			kfree(eq);
-			goto clean;
+
+		if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
+			err = -ENOMEM;
+			goto clean_eq;
 		}
+		cpumask_set_cpu(cpumask_local_spread(i, dev->priv.numa_node),
+				param.affinity);
+		err = create_map_eq(dev, &eq->core, &param);
+		free_cpumask_var(param.affinity);
+		if (err)
+			goto clean_eq;
 		err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb);
 		if (err) {
 			destroy_unmap_eq(dev, &eq->core);
-			kfree(eq);
-			goto clean;
+			goto clean_eq;
 		}
 
 		mlx5_core_dbg(dev, "allocated completion EQN %d\n", eq->core.eqn);
@@ -868,6 +879,8 @@ static int create_comp_eqs(struct mlx5_core_dev *dev)
 	}
 
 	return 0;
+clean_eq:
+	kfree(eq);
 clean:
 	destroy_comp_eqs(dev);
 	return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
index dd138b38bf36..81bfb5f0d332 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
@@ -20,7 +20,8 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn,
 			    int msix_vec_count);
 int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs);
 
-struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, int vecidx);
+struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, int vecidx,
+				  struct cpumask *affinity);
 void mlx5_irq_release(struct mlx5_irq *irq);
 int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
 int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index ecace7ca4a01..81b06b5693cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -17,6 +17,7 @@ struct mlx5_irq {
 	struct atomic_notifier_head nh;
 	cpumask_var_t mask;
 	char name[MLX5_MAX_IRQ_NAME];
+	spinlock_t lock; /* protects affinity assignment */
 	struct kref kref;
 	int irqn;
 };
@@ -153,6 +154,8 @@ static void irq_release(struct kref *kref)
 {
 	struct mlx5_irq *irq = container_of(kref, struct mlx5_irq, kref);
 
+	irq_set_affinity_hint(irq->irqn, NULL);
+	free_cpumask_var(irq->mask);
 	free_irq(irq->irqn, &irq->nh);
 }
 
@@ -189,7 +192,8 @@ void mlx5_irq_release(struct mlx5_irq *irq)
 	irq_put(irq);
 }
 
-struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, int vecidx)
+struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, int vecidx,
+				  struct cpumask *affinity)
 {
 	struct mlx5_irq_table *table = mlx5_irq_table_get(dev);
 	struct mlx5_irq *irq = &table->irq[vecidx];
@@ -199,6 +203,16 @@ struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, int vecidx)
 	if (!err)
 		return ERR_PTR(-ENOENT);
 
+	spin_lock(&irq->lock);
+	if (!cpumask_empty(irq->mask)) {
+		/* already configured */
+		spin_unlock(&irq->lock);
+		return irq;
+	}
+
+	cpumask_copy(irq->mask, affinity);
+	irq_set_affinity_hint(irq->irqn, irq->mask);
+	spin_unlock(&irq->lock);
 	return irq;
 }
 
@@ -239,6 +253,12 @@ static int request_irqs(struct mlx5_core_dev *dev, int nvec)
 			mlx5_core_err(dev, "Failed to request irq\n");
 			goto err_request_irq;
 		}
+		if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
+			mlx5_core_warn(dev, "zalloc_cpumask_var failed\n");
+			err = -ENOMEM;
+			goto err_request_irq;
+		}
+		spin_lock_init(&irq->lock);
 		kref_init(&irq->kref);
 	}
 	return 0;
@@ -294,69 +314,6 @@ err_out:
 	return err;
 }
 
-/* Completion IRQ vectors */
-
-static int set_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
-{
-	int vecidx = MLX5_IRQ_VEC_COMP_BASE + i;
-	struct mlx5_irq *irq;
-
-	irq = mlx5_irq_get(mdev, vecidx);
-	if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
-		mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
-		return -ENOMEM;
-	}
-
-	cpumask_set_cpu(cpumask_local_spread(i, mdev->priv.numa_node),
-			irq->mask);
-	if (IS_ENABLED(CONFIG_SMP) &&
-	    irq_set_affinity_hint(irq->irqn, irq->mask))
-		mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x",
-			       irq->irqn);
-
-	return 0;
-}
-
-static void clear_comp_irq_affinity_hint(struct mlx5_core_dev *mdev, int i)
-{
-	int vecidx = MLX5_IRQ_VEC_COMP_BASE + i;
-	struct mlx5_irq *irq;
-
-	irq = mlx5_irq_get(mdev, vecidx);
-	irq_set_affinity_hint(irq->irqn, NULL);
-	free_cpumask_var(irq->mask);
-}
-
-static int set_comp_irq_affinity_hints(struct mlx5_core_dev *mdev)
-{
-	int nvec = mlx5_irq_get_num_comp(mdev->priv.irq_table);
-	int err;
-	int i;
-
-	for (i = 0; i < nvec; i++) {
-		err = set_comp_irq_affinity_hint(mdev, i);
-		if (err)
-			goto err_out;
-	}
-
-	return 0;
-
-err_out:
-	for (i--; i >= 0; i--)
-		clear_comp_irq_affinity_hint(mdev, i);
-
-	return err;
-}
-
-static void clear_comp_irqs_affinity_hints(struct mlx5_core_dev *mdev)
-{
-	int nvec = mlx5_irq_get_num_comp(mdev->priv.irq_table);
-	int i;
-
-	for (i = 0; i < nvec; i++)
-		clear_comp_irq_affinity_hint(mdev, i);
-}
-
 struct cpumask *
 mlx5_irq_get_affinity_mask(struct mlx5_irq_table *irq_table, int vecidx)
 {
@@ -370,15 +327,6 @@ struct cpu_rmap *mlx5_irq_get_rmap(struct mlx5_irq_table *irq_table)
 }
 #endif
 
-static void unrequest_irqs(struct mlx5_core_dev *dev)
-{
-	struct mlx5_irq_table *table = dev->priv.irq_table;
-	int i;
-
-	for (i = 0; i < table->nvec; i++)
-		irq_put(mlx5_irq_get(dev, i));
-}
-
 int mlx5_irq_table_create(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
@@ -419,16 +367,8 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev)
 	if (err)
 		goto err_request_irqs;
 
-	err = set_comp_irq_affinity_hints(dev);
-	if (err) {
-		mlx5_core_err(dev, "Failed to alloc affinity hint cpumask\n");
-		goto err_set_affinity;
-	}
-
 	return 0;
 
-err_set_affinity:
-	unrequest_irqs(dev);
 err_request_irqs:
 	irq_clear_rmap(dev);
 err_set_rmap:
@@ -451,7 +391,6 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
 	 * which should be called after alloc_irq but before request_irq.
 	 */
 	irq_clear_rmap(dev);
-	clear_comp_irqs_affinity_hints(dev);
 	for (i = 0; i < table->nvec; i++)
 		irq_release(&mlx5_irq_get(dev, i)->kref);
 	pci_free_irq_vectors(dev->pdev);
diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h
index e49d8c0d4f26..cea6ecb4b73e 100644
--- a/include/linux/mlx5/eq.h
+++ b/include/linux/mlx5/eq.h
@@ -16,6 +16,7 @@ struct mlx5_eq_param {
 	u8             irq_index;
 	int            nent;
 	u64            mask[4];
+	cpumask_var_t  affinity;
 };
 
 struct mlx5_eq *
-- 
cgit v1.2.3


From 3af26495a2473c95ada3674c6b4dfc658be0a6ec Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Mon, 10 May 2021 09:10:43 +0300
Subject: net/mlx5: Enlarge interrupt field in CREATE_EQ

FW is now supporting more than 256 MSI-X per PF (up to 2K).
Hence, enlarge interrupt field in CREATE_EQ to make use of the new
MSI-X's.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 057db0eaf195..2d1ed78289ff 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -3806,8 +3806,8 @@ struct mlx5_ifc_eqc_bits {
 
 	u8         reserved_at_80[0x20];
 
-	u8         reserved_at_a0[0x18];
-	u8         intr[0x8];
+	u8         reserved_at_a0[0x14];
+	u8         intr[0xc];
 
 	u8         reserved_at_c0[0x3];
 	u8         log_page_size[0x5];
-- 
cgit v1.2.3


From 341466b64f301dabaed791c5862d2ae5a9e72849 Mon Sep 17 00:00:00 2001
From: Russ Weight <russell.h.weight@intel.com>
Date: Mon, 14 Jun 2021 10:09:02 -0700
Subject: fpga: altera-pr-ip: Remove function alt_pr_unregister

Remove the alt_pr_unregister() function; it is no longer used.

Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Reviewed-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
Link: https://lore.kernel.org/r/20210614170909.232415-2-mdf@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fpga/altera-pr-ip-core.c       | 10 ----------
 include/linux/fpga/altera-pr-ip-core.h |  1 -
 2 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/fpga/altera-pr-ip-core.c b/drivers/fpga/altera-pr-ip-core.c
index 5b130c4d9882..dfdf21ed34c4 100644
--- a/drivers/fpga/altera-pr-ip-core.c
+++ b/drivers/fpga/altera-pr-ip-core.c
@@ -199,16 +199,6 @@ int alt_pr_register(struct device *dev, void __iomem *reg_base)
 }
 EXPORT_SYMBOL_GPL(alt_pr_register);
 
-void alt_pr_unregister(struct device *dev)
-{
-	struct fpga_manager *mgr = dev_get_drvdata(dev);
-
-	dev_dbg(dev, "%s\n", __func__);
-
-	fpga_mgr_unregister(mgr);
-}
-EXPORT_SYMBOL_GPL(alt_pr_unregister);
-
 MODULE_AUTHOR("Matthew Gerlach <matthew.gerlach@linux.intel.com>");
 MODULE_DESCRIPTION("Altera Partial Reconfiguration IP Core");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/fpga/altera-pr-ip-core.h b/include/linux/fpga/altera-pr-ip-core.h
index 0b08ac20ab16..a6b4c07858cc 100644
--- a/include/linux/fpga/altera-pr-ip-core.h
+++ b/include/linux/fpga/altera-pr-ip-core.h
@@ -13,6 +13,5 @@
 #include <linux/io.h>
 
 int alt_pr_register(struct device *dev, void __iomem *reg_base);
-void alt_pr_unregister(struct device *dev);
 
 #endif /* _ALT_PR_IP_CORE_H */
-- 
cgit v1.2.3


From 654ee49b7e0883e660be6e6e20876fc4cbdaadd1 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 10 Jun 2021 11:02:44 +0200
Subject: tty: make tty_get_{char,frame}_size available

Many tty drivers contain code to compute bits count depending on termios
cflags. So extract this code from serial core to two separate tty helper
functions:
* tty_get_char_size -- only size of a character, without flags,
* tty_get_frame_size -- complete size of a frame including flags.

In the next patch, calls to these new functions replace many copies of
this code.

Note that we accept only cflag as a parameter. That's because some
callers like pch_uart_startup or sunsab_console_setup don't have at hand
termios which we could pass around.

Cc: Joe Perches <joe@perches.com>
Cc: Johan Hovold <johan@kernel.org>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210610090247.2593-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/serial_core.c | 30 +++------------------------
 drivers/tty/tty_ioctl.c          | 45 ++++++++++++++++++++++++++++++++++++++++
 include/linux/tty.h              |  3 +++
 3 files changed, 51 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 642e24d6c475..69092deba11f 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -334,39 +334,15 @@ void
 uart_update_timeout(struct uart_port *port, unsigned int cflag,
 		    unsigned int baud)
 {
-	unsigned int bits;
+	unsigned int size;
 
-	/* byte size and parity */
-	switch (cflag & CSIZE) {
-	case CS5:
-		bits = 7;
-		break;
-	case CS6:
-		bits = 8;
-		break;
-	case CS7:
-		bits = 9;
-		break;
-	default:
-		bits = 10;
-		break; /* CS8 */
-	}
-
-	if (cflag & CSTOPB)
-		bits++;
-	if (cflag & PARENB)
-		bits++;
-
-	/*
-	 * The total number of bits to be transmitted in the fifo.
-	 */
-	bits = bits * port->fifosize;
+	size = tty_get_frame_size(cflag) * port->fifosize;
 
 	/*
 	 * Figure the timeout to send the above number of bits.
 	 * Add .02 seconds of slop
 	 */
-	port->timeout = (HZ * bits) / baud + HZ/50;
+	port->timeout = (HZ * size) / baud + HZ/50;
 }
 
 EXPORT_SYMBOL(uart_update_timeout);
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 75885d502749..507a25d692bb 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -279,6 +279,51 @@ int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b)
 }
 EXPORT_SYMBOL(tty_termios_hw_change);
 
+/**
+ *	tty_get_char_size	-	get size of a character
+ *	@cflag: termios cflag value
+ *
+ *	Get the size (in bits) of a character depending on @cflag's %CSIZE
+ *	setting.
+ */
+unsigned char tty_get_char_size(unsigned int cflag)
+{
+	switch (cflag & CSIZE) {
+	case CS5:
+		return 5;
+	case CS6:
+		return 6;
+	case CS7:
+		return 7;
+	case CS8:
+	default:
+		return 8;
+	}
+}
+EXPORT_SYMBOL_GPL(tty_get_char_size);
+
+/**
+ *	tty_get_frame_size	-	get size of a frame
+ *	@cflag: termios cflag value
+ *
+ *	Get the size (in bits) of a frame depending on @cflag's %CSIZE, %CSTOPB,
+ *	and %PARENB setting. The result is a sum of character size, start and
+ *	stop bits -- one bit each -- second stop bit (if set), and parity bit
+ *	(if set).
+ */
+unsigned char tty_get_frame_size(unsigned int cflag)
+{
+	unsigned char bits = 2 + tty_get_char_size(cflag);
+
+	if (cflag & CSTOPB)
+		bits++;
+	if (cflag & PARENB)
+		bits++;
+
+	return bits;
+}
+EXPORT_SYMBOL_GPL(tty_get_frame_size);
+
 /**
  *	tty_set_termios		-	update termios values
  *	@tty: tty to update
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 4c0c7ca1d9a4..19dc1097e09c 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -495,6 +495,9 @@ static inline speed_t tty_get_baud_rate(struct tty_struct *tty)
 	return tty_termios_baud_rate(&tty->termios);
 }
 
+unsigned char tty_get_char_size(unsigned int cflag);
+unsigned char tty_get_frame_size(unsigned int cflag);
+
 extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old);
 extern int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b);
 extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);
-- 
cgit v1.2.3


From b4e326165e21d6a11483f6a4de2174b933413554 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Wed, 9 Jun 2021 15:02:46 -0700
Subject: USB: misc: Add onboard_usb_hub driver

The main issue this driver addresses is that a USB hub needs to be
powered before it can be discovered. For discrete onboard hubs (an
example for such a hub is the Realtek RTS5411) this is often solved
by supplying the hub with an 'always-on' regulator, which is kind
of a hack. Some onboard hubs may require further initialization
steps, like changing the state of a GPIO or enabling a clock, which
requires even more hacks. This driver creates a platform device
representing the hub which performs the necessary initialization.
Currently it only supports switching on a single regulator, support
for multiple regulators or other actions can be added as needed.
Different initialization sequences can be supported based on the
compatible string.

Besides performing the initialization the driver can be configured
to power the hub off during system suspend. This can help to extend
battery life on battery powered devices which have no requirements
to keep the hub powered during suspend. The driver can also be
configured to leave the hub powered when a wakeup capable USB device
is connected when suspending, and power it off otherwise.

Technically the driver consists of two drivers, the platform driver
described above and a very thin USB driver that subclasses the
generic driver. The purpose of this driver is to provide the platform
driver with the USB devices corresponding to the hub(s) (a hub
controller may provide multiple 'logical' hubs, e.g. one to support
USB 2.0 and another for USB 3.x).

Note: the current series only supports hubs connected directly to
a root hub (through xhci-plat), support for other configurations
could be added if needed.

Co-developed-by: Ravi Chandra Sadineni <ravisadineni@chromium.org>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Ravi Chandra Sadineni <ravisadineni@chromium.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Link: https://lore.kernel.org/r/20210609150159.v12.2.I7c9a1f1d6ced41dd8310e8a03da666a32364e790@changeid
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../ABI/testing/sysfs-bus-platform-onboard-usb-hub |   8 +
 MAINTAINERS                                        |   7 +
 drivers/usb/misc/Kconfig                           |  17 +
 drivers/usb/misc/Makefile                          |   1 +
 drivers/usb/misc/onboard_usb_hub.c                 | 497 +++++++++++++++++++++
 include/linux/usb/onboard_hub.h                    |  18 +
 6 files changed, 548 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
 create mode 100644 drivers/usb/misc/onboard_usb_hub.c
 create mode 100644 include/linux/usb/onboard_hub.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
new file mode 100644
index 000000000000..e981d83648e6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
@@ -0,0 +1,8 @@
+What:		/sys/bus/platform/devices/<dev>/always_powered_in_suspend
+Date:		March 2021
+KernelVersion:	5.13
+Contact:	Matthias Kaehlcke <matthias@kaehlcke.net>
+		linux-usb@vger.kernel.org
+Description:
+		(RW) Controls whether the USB hub remains always powered
+		during system suspend or not.
\ No newline at end of file
diff --git a/MAINTAINERS b/MAINTAINERS
index bc0ceef87b73..56930e88e97e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13622,6 +13622,13 @@ S:	Maintained
 T:	git git://linuxtv.org/media_tree.git
 F:	drivers/media/i2c/ov9734.c
 
+ONBOARD USB HUB DRIVER
+M:	Matthias Kaehlcke <mka@chromium.org>
+L:	linux-usb@vger.kernel.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/usb/onboard_usb_hub.yaml
+F:	drivers/usb/misc/onboard_usb_hub.c
+
 ONENAND FLASH DRIVER
 M:	Kyungmin Park <kyungmin.park@samsung.com>
 L:	linux-mtd@lists.infradead.org
diff --git a/drivers/usb/misc/Kconfig b/drivers/usb/misc/Kconfig
index 8f1144359012..f534269fbb20 100644
--- a/drivers/usb/misc/Kconfig
+++ b/drivers/usb/misc/Kconfig
@@ -284,3 +284,20 @@ config BRCM_USB_PINMAP
 	  This option enables support for remapping some USB external
 	  signals, which are typically on dedicated pins on the chip,
 	  to any gpio.
+
+config USB_ONBOARD_HUB
+	tristate "Onboard USB hub support"
+	depends on OF || COMPILE_TEST
+	help
+	  Say Y here if you want to support discrete onboard USB hubs that
+	  don't require an additional control bus for initialization, but
+	  need some nontrivial form of initialization, such as enabling a
+	  power regulator. An example for such a hub is the Realtek
+	  RTS5411.
+
+	  The driver can be configured to turn off the power of the hub
+	  during system suspend. This may reduce power consumption while
+	  the system is suspended.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called onboard_usb_hub.
diff --git a/drivers/usb/misc/Makefile b/drivers/usb/misc/Makefile
index 5f4e598573ab..2c5aec6f1b26 100644
--- a/drivers/usb/misc/Makefile
+++ b/drivers/usb/misc/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_USB_CHAOSKEY)		+= chaoskey.o
 obj-$(CONFIG_USB_SISUSBVGA)		+= sisusbvga/
 obj-$(CONFIG_USB_LINK_LAYER_TEST)	+= lvstest.o
 obj-$(CONFIG_BRCM_USB_PINMAP)		+= brcmstb-usb-pinmap.o
+obj-$(CONFIG_USB_ONBOARD_HUB)		+= onboard_usb_hub.o
diff --git a/drivers/usb/misc/onboard_usb_hub.c b/drivers/usb/misc/onboard_usb_hub.c
new file mode 100644
index 000000000000..ed1f5424ea5f
--- /dev/null
+++ b/drivers/usb/misc/onboard_usb_hub.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Driver for onboard USB hubs
+ *
+ * Copyright (c) 2020, Google LLC
+ */
+
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/suspend.h>
+#include <linux/sysfs.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+#include <linux/usb/of.h>
+#include <linux/usb/onboard_hub.h>
+
+static struct usb_device_driver onboard_hub_usbdev_driver;
+
+/************************** Platform driver **************************/
+
+struct udev_node {
+	struct usb_device *udev;
+	struct list_head list;
+};
+
+struct onboard_hub {
+	struct regulator *vdd;
+	struct device *dev;
+	bool always_powered_in_suspend;
+	bool is_powered_on;
+	bool going_away;
+	struct list_head udev_list;
+	struct mutex lock;
+};
+
+static int onboard_hub_power_on(struct onboard_hub *hub)
+{
+	int err;
+
+	err = regulator_enable(hub->vdd);
+	if (err) {
+		dev_err(hub->dev, "failed to enable regulator: %d\n", err);
+		return err;
+	}
+
+	hub->is_powered_on = true;
+
+	return 0;
+}
+
+static int onboard_hub_power_off(struct onboard_hub *hub)
+{
+	int err;
+
+	err = regulator_disable(hub->vdd);
+	if (err) {
+		dev_err(hub->dev, "failed to disable regulator: %d\n", err);
+		return err;
+	}
+
+	hub->is_powered_on = false;
+
+	return 0;
+}
+
+static int __maybe_unused onboard_hub_suspend(struct device *dev)
+{
+	struct onboard_hub *hub = dev_get_drvdata(dev);
+	struct udev_node *node;
+	bool power_off;
+	int rc = 0;
+
+	if (hub->always_powered_in_suspend)
+		return 0;
+
+	power_off = true;
+
+	mutex_lock(&hub->lock);
+
+	list_for_each_entry(node, &hub->udev_list, list) {
+		if (!device_may_wakeup(node->udev->bus->controller))
+			continue;
+
+		if (usb_wakeup_enabled_descendants(node->udev)) {
+			power_off = false;
+			break;
+		}
+	}
+
+	mutex_unlock(&hub->lock);
+
+	if (power_off)
+		rc = onboard_hub_power_off(hub);
+
+	return rc;
+}
+
+static int __maybe_unused onboard_hub_resume(struct device *dev)
+{
+	struct onboard_hub *hub = dev_get_drvdata(dev);
+	int rc = 0;
+
+	if (!hub->is_powered_on)
+		rc = onboard_hub_power_on(hub);
+
+	return rc;
+}
+
+static int onboard_hub_add_usbdev(struct onboard_hub *hub, struct usb_device *udev)
+{
+	struct udev_node *node;
+	char link_name[64];
+	int ret = 0;
+
+	mutex_lock(&hub->lock);
+
+	if (hub->going_away) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	node = devm_kzalloc(hub->dev, sizeof(*node), GFP_KERNEL);
+	if (!node) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	node->udev = udev;
+
+	list_add(&node->list, &hub->udev_list);
+
+	snprintf(link_name, sizeof(link_name), "usb_dev.%s", dev_name(&udev->dev));
+	WARN_ON(sysfs_create_link(&hub->dev->kobj, &udev->dev.kobj, link_name));
+
+unlock:
+	mutex_unlock(&hub->lock);
+
+	return ret;
+}
+
+static void onboard_hub_remove_usbdev(struct onboard_hub *hub, struct usb_device *udev)
+{
+	struct udev_node *node;
+	char link_name[64];
+
+	snprintf(link_name, sizeof(link_name), "usb_dev.%s", dev_name(&udev->dev));
+	sysfs_remove_link(&hub->dev->kobj, link_name);
+
+	mutex_lock(&hub->lock);
+
+	list_for_each_entry(node, &hub->udev_list, list) {
+		if (node->udev == udev) {
+			list_del(&node->list);
+			break;
+		}
+	}
+
+	mutex_unlock(&hub->lock);
+}
+
+static ssize_t always_powered_in_suspend_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct onboard_hub *hub = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%d\n", hub->always_powered_in_suspend);
+}
+
+static ssize_t always_powered_in_suspend_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct onboard_hub *hub = dev_get_drvdata(dev);
+	bool val;
+	int ret;
+
+	ret = kstrtobool(buf, &val);
+	if (ret < 0)
+		return ret;
+
+	hub->always_powered_in_suspend = val;
+
+	return count;
+}
+static DEVICE_ATTR_RW(always_powered_in_suspend);
+
+static struct attribute *onboard_hub_attrs[] = {
+	&dev_attr_always_powered_in_suspend.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(onboard_hub);
+
+static int onboard_hub_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct onboard_hub *hub;
+	int err;
+
+	hub = devm_kzalloc(dev, sizeof(*hub), GFP_KERNEL);
+	if (!hub)
+		return -ENOMEM;
+
+	hub->vdd = devm_regulator_get(dev, "vdd");
+	if (IS_ERR(hub->vdd))
+		return PTR_ERR(hub->vdd);
+
+	hub->dev = dev;
+	mutex_init(&hub->lock);
+	INIT_LIST_HEAD(&hub->udev_list);
+
+	dev_set_drvdata(dev, hub);
+
+	err = onboard_hub_power_on(hub);
+	if (err)
+		return err;
+
+	/*
+	 * The USB driver might have been detached from the USB devices by
+	 * onboard_hub_remove(), make sure to re-attach it if needed.
+	 */
+	err = driver_attach(&onboard_hub_usbdev_driver.drvwrap.driver);
+	if (err) {
+		onboard_hub_power_off(hub);
+		return err;
+	}
+
+	return 0;
+}
+
+static int onboard_hub_remove(struct platform_device *pdev)
+{
+	struct onboard_hub *hub = dev_get_drvdata(&pdev->dev);
+	struct udev_node *node;
+	struct usb_device *udev;
+
+	hub->going_away = true;
+
+	mutex_lock(&hub->lock);
+
+	/* unbind the USB devices to avoid dangling references to this device */
+	while (!list_empty(&hub->udev_list)) {
+		node = list_first_entry(&hub->udev_list, struct udev_node, list);
+		udev = node->udev;
+
+		/*
+		 * Unbinding the driver will call onboard_hub_remove_usbdev(),
+		 * which acquires hub->lock.  We must release the lock first.
+		 */
+		get_device(&udev->dev);
+		mutex_unlock(&hub->lock);
+		device_release_driver(&udev->dev);
+		put_device(&udev->dev);
+		mutex_lock(&hub->lock);
+	}
+
+	mutex_unlock(&hub->lock);
+
+	return onboard_hub_power_off(hub);
+}
+
+static const struct of_device_id onboard_hub_match[] = {
+	{ .compatible = "usbbda,411" },
+	{ .compatible = "usbbda,5411" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, onboard_hub_match);
+
+static bool of_is_onboard_usb_hub(const struct device_node *np)
+{
+	return !!of_match_node(onboard_hub_match, np);
+}
+
+static const struct dev_pm_ops __maybe_unused onboard_hub_pm_ops = {
+	SET_LATE_SYSTEM_SLEEP_PM_OPS(onboard_hub_suspend, onboard_hub_resume)
+};
+
+static struct platform_driver onboard_hub_driver = {
+	.probe = onboard_hub_probe,
+	.remove = onboard_hub_remove,
+
+	.driver = {
+		.name = "onboard-usb-hub",
+		.of_match_table = onboard_hub_match,
+		.pm = pm_ptr(&onboard_hub_pm_ops),
+		.dev_groups = onboard_hub_groups,
+	},
+};
+
+/************************** USB driver **************************/
+
+#define VENDOR_ID_REALTEK	0x0bda
+
+/*
+ * Returns the onboard_hub platform device that is associated with the USB
+ * device passed as parameter.
+ */
+static struct onboard_hub *_find_onboard_hub(struct device *dev)
+{
+	struct platform_device *pdev;
+	struct device_node *np;
+	phandle ph;
+
+	pdev = of_find_device_by_node(dev->of_node);
+	if (!pdev) {
+		if (of_property_read_u32(dev->of_node, "companion-hub", &ph)) {
+			dev_err(dev, "failed to read 'companion-hub' property\n");
+			return ERR_PTR(-EINVAL);
+		}
+
+		np = of_find_node_by_phandle(ph);
+		if (!np) {
+			dev_err(dev, "failed to find device node for companion hub\n");
+			return ERR_PTR(-EINVAL);
+		}
+
+		pdev = of_find_device_by_node(np);
+		of_node_put(np);
+
+		if (!pdev)
+			return ERR_PTR(-EPROBE_DEFER);
+	}
+
+	put_device(&pdev->dev);
+
+	return dev_get_drvdata(&pdev->dev);
+}
+
+static int onboard_hub_usbdev_probe(struct usb_device *udev)
+{
+	struct device *dev = &udev->dev;
+	struct onboard_hub *hub;
+	int err;
+
+	/* ignore supported hubs without device tree node */
+	if (!dev->of_node)
+		return -ENODEV;
+
+	hub = _find_onboard_hub(dev);
+	if (IS_ERR(hub))
+		return PTR_ERR(hub);
+
+	dev_set_drvdata(dev, hub);
+
+	err = onboard_hub_add_usbdev(hub, udev);
+	if (err)
+		return err;
+
+	err = sysfs_create_link(&udev->dev.kobj, &hub->dev->kobj, "onboard_hub_dev");
+	if (err)
+		dev_warn(&udev->dev, "failed to create symlink to platform device: %d\n", err);
+
+	return 0;
+}
+
+static void onboard_hub_usbdev_disconnect(struct usb_device *udev)
+{
+	struct onboard_hub *hub = dev_get_drvdata(&udev->dev);
+
+	sysfs_remove_link(&udev->dev.kobj, "onboard_hub_dev");
+
+	onboard_hub_remove_usbdev(hub, udev);
+}
+
+static const struct usb_device_id onboard_hub_id_table[] = {
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x0411) }, /* RTS5411 USB 3.0 */
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x5411) }, /* RTS5411 USB 2.0 */
+	{},
+};
+
+MODULE_DEVICE_TABLE(usb, onboard_hub_id_table);
+
+static struct usb_device_driver onboard_hub_usbdev_driver = {
+
+	.name = "onboard-usb-hub",
+	.probe = onboard_hub_usbdev_probe,
+	.disconnect = onboard_hub_usbdev_disconnect,
+	.generic_subclass = 1,
+	.supports_autosuspend =	1,
+	.id_table = onboard_hub_id_table,
+};
+
+/*** Helpers for creating/destroying platform devices for onboard hubs ***/
+
+struct pdev_list_entry {
+	struct platform_device *pdev;
+	struct list_head node;
+};
+
+/*
+ * Creates a platform device for each supported onboard hub that is connected to
+ * the given parent hub. To keep track of the platform devices they are added to
+ * a list that is owned by the parent hub.
+ */
+void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list)
+{
+	int i;
+	phandle ph;
+	struct device_node *np, *npc;
+	struct platform_device *pdev;
+	struct pdev_list_entry *pdle;
+
+	for (i = 1; i <= parent_hub->maxchild; i++) {
+		np = usb_of_get_device_node(parent_hub, i);
+		if (!np)
+			continue;
+
+		if (!of_is_onboard_usb_hub(np))
+			goto node_put;
+
+		if (of_property_read_u32(np, "companion-hub", &ph))
+			goto node_put;
+
+		npc = of_find_node_by_phandle(ph);
+		if (!npc)
+			goto node_put;
+
+		pdev = of_find_device_by_node(npc);
+		of_node_put(npc);
+
+		if (pdev) {
+			/* the companion hub already has a platform device, nothing to do here */
+			put_device(&pdev->dev);
+			goto node_put;
+		}
+
+		pdev = of_platform_device_create(np, NULL, &parent_hub->dev);
+		if (pdev) {
+			pdle = kzalloc(sizeof(*pdle), GFP_KERNEL);
+			if (!pdle)
+				goto node_put;
+
+			INIT_LIST_HEAD(&pdle->node);
+
+			pdle->pdev = pdev;
+			list_add(&pdle->node, pdev_list);
+		} else {
+			dev_err(&parent_hub->dev,
+				"failed to create platform device for onboard hub '%s'\n",
+				of_node_full_name(np));
+		}
+
+node_put:
+		of_node_put(np);
+	}
+}
+EXPORT_SYMBOL_GPL(onboard_hub_create_pdevs);
+
+/*
+ * Destroys the platform devices in the given list and frees the memory associated
+ * with the list entry.
+ */
+void onboard_hub_destroy_pdevs(struct list_head *pdev_list)
+{
+	struct pdev_list_entry *pdle, *tmp;
+
+	list_for_each_entry_safe(pdle, tmp, pdev_list, node) {
+		of_platform_device_destroy(&pdle->pdev->dev, NULL);
+		kfree(pdle);
+	}
+}
+EXPORT_SYMBOL_GPL(onboard_hub_destroy_pdevs);
+
+/************************** Driver (de)registration **************************/
+
+static int __init onboard_hub_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&onboard_hub_driver);
+	if (ret)
+		return ret;
+
+	ret = usb_register_device_driver(&onboard_hub_usbdev_driver, THIS_MODULE);
+	if (ret)
+		platform_driver_unregister(&onboard_hub_driver);
+
+	return ret;
+}
+module_init(onboard_hub_init);
+
+static void __exit onboard_hub_exit(void)
+{
+	usb_deregister_device_driver(&onboard_hub_usbdev_driver);
+	platform_driver_unregister(&onboard_hub_driver);
+}
+module_exit(onboard_hub_exit);
+
+MODULE_AUTHOR("Matthias Kaehlcke <mka@chromium.org>");
+MODULE_DESCRIPTION("Driver for discrete onboard USB hubs");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/usb/onboard_hub.h b/include/linux/usb/onboard_hub.h
new file mode 100644
index 000000000000..d9373230556e
--- /dev/null
+++ b/include/linux/usb/onboard_hub.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_USB_ONBOARD_HUB_H
+#define __LINUX_USB_ONBOARD_HUB_H
+
+struct usb_device;
+struct list_head;
+
+#if IS_ENABLED(CONFIG_USB_ONBOARD_HUB)
+void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list);
+void onboard_hub_destroy_pdevs(struct list_head *pdev_list);
+#else
+static inline void onboard_hub_create_pdevs(struct usb_device *parent_hub,
+					    struct list_head *pdev_list) {}
+static inline void onboard_hub_destroy_pdevs(struct list_head *pdev_list) {}
+#endif
+
+#endif /* __LINUX_USB_ONBOARD_HUB_H */
-- 
cgit v1.2.3


From 412981e06294dac3254d83bbf71d4184ea911d05 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Wed, 9 Jun 2021 15:02:47 -0700
Subject: of/platform: Add stubs for of_platform_device_create/destroy()

Code for platform_device_create() and of_platform_device_destroy() is
only generated if CONFIG_OF_ADDRESS=y. Add stubs to avoid unresolved
symbols when CONFIG_OF_ADDRESS is not set.

Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Link: https://lore.kernel.org/r/20210609150159.v12.3.I08fd2e1c775af04f663730e9fb4d00e6bbb38541@changeid
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/of_platform.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 84a966623e78..d15b6cd5e1c3 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -61,16 +61,18 @@ static inline struct platform_device *of_find_device_by_node(struct device_node
 }
 #endif
 
+extern int of_platform_bus_probe(struct device_node *root,
+				 const struct of_device_id *matches,
+				 struct device *parent);
+
+#ifdef CONFIG_OF_ADDRESS
 /* Platform devices and busses creation */
 extern struct platform_device *of_platform_device_create(struct device_node *np,
 						   const char *bus_id,
 						   struct device *parent);
 
 extern int of_platform_device_destroy(struct device *dev, void *data);
-extern int of_platform_bus_probe(struct device_node *root,
-				 const struct of_device_id *matches,
-				 struct device *parent);
-#ifdef CONFIG_OF_ADDRESS
+
 extern int of_platform_populate(struct device_node *root,
 				const struct of_device_id *matches,
 				const struct of_dev_auxdata *lookup,
@@ -84,6 +86,18 @@ extern int devm_of_platform_populate(struct device *dev);
 
 extern void devm_of_platform_depopulate(struct device *dev);
 #else
+/* Platform devices and busses creation */
+static inline struct platform_device *of_platform_device_create(struct device_node *np,
+								const char *bus_id,
+								struct device *parent)
+{
+	return NULL;
+}
+static inline int of_platform_device_destroy(struct device *dev, void *data)
+{
+	return -ENODEV;
+}
+
 static inline int of_platform_populate(struct device_node *root,
 					const struct of_device_id *matches,
 					const struct of_dev_auxdata *lookup,
-- 
cgit v1.2.3


From 09705dcb63d269000595284b5dd7f5c938d647b9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 17 May 2021 15:29:46 +0300
Subject: devres: Enable trace events

In some cases the printf() mechanism is too heavy and can't be used.
For example, when debugging a race condition involving devres API.
When CONFIG_DEBUG_DEVRES is enabled I can't reproduce an issue, and
otherwise it's quite visible with a useful information being collected.

Enable trace events for devres part of the driver core.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210517122946.53161-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/Makefile  |  3 +++
 drivers/base/devres.c  | 47 ++++++++++++++++++------------------------
 drivers/base/trace.c   | 10 +++++++++
 drivers/base/trace.h   | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/device.h |  9 --------
 5 files changed, 89 insertions(+), 36 deletions(-)
 create mode 100644 drivers/base/trace.c
 create mode 100644 drivers/base/trace.h

(limited to 'include/linux')

diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 8b93a7f291ec..ef8e44a7d288 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -30,3 +30,6 @@ obj-y			+= test/
 
 ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
 
+# define_trace.h needs to know how to find our header
+CFLAGS_trace.o		:= -I$(src)
+obj-$(CONFIG_TRACING)	+= trace.o
diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index dee48858663f..eaa9a5cd1db9 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -14,14 +14,13 @@
 #include <asm/sections.h>
 
 #include "base.h"
+#include "trace.h"
 
 struct devres_node {
 	struct list_head		entry;
 	dr_release_t			release;
-#ifdef CONFIG_DEBUG_DEVRES
 	const char			*name;
 	size_t				size;
-#endif
 };
 
 struct devres {
@@ -43,10 +42,6 @@ struct devres_group {
 	/* -- 8 pointers */
 };
 
-#ifdef CONFIG_DEBUG_DEVRES
-static int log_devres = 0;
-module_param_named(log, log_devres, int, S_IRUGO | S_IWUSR);
-
 static void set_node_dbginfo(struct devres_node *node, const char *name,
 			     size_t size)
 {
@@ -54,7 +49,11 @@ static void set_node_dbginfo(struct devres_node *node, const char *name,
 	node->size = size;
 }
 
-static void devres_log(struct device *dev, struct devres_node *node,
+#ifdef CONFIG_DEBUG_DEVRES
+static int log_devres = 0;
+module_param_named(log, log_devres, int, S_IRUGO | S_IWUSR);
+
+static void devres_dbg(struct device *dev, struct devres_node *node,
 		       const char *op)
 {
 	if (unlikely(log_devres))
@@ -62,10 +61,16 @@ static void devres_log(struct device *dev, struct devres_node *node,
 			op, node, node->name, node->size);
 }
 #else /* CONFIG_DEBUG_DEVRES */
-#define set_node_dbginfo(node, n, s)	do {} while (0)
-#define devres_log(dev, node, op)	do {} while (0)
+#define devres_dbg(dev, node, op)	do {} while (0)
 #endif /* CONFIG_DEBUG_DEVRES */
 
+static void devres_log(struct device *dev, struct devres_node *node,
+		       const char *op)
+{
+	trace_devres_log(dev, op, node, node->name, node->size);
+	devres_dbg(dev, node, op);
+}
+
 /*
  * Release functions for devres group.  These callbacks are used only
  * for identification.
@@ -134,26 +139,13 @@ static void replace_dr(struct device *dev,
 	list_replace(&old->entry, &new->entry);
 }
 
-#ifdef CONFIG_DEBUG_DEVRES
-void * __devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid,
-		      const char *name)
-{
-	struct devres *dr;
-
-	dr = alloc_dr(release, size, gfp | __GFP_ZERO, nid);
-	if (unlikely(!dr))
-		return NULL;
-	set_node_dbginfo(&dr->node, name, size);
-	return dr->data;
-}
-EXPORT_SYMBOL_GPL(__devres_alloc_node);
-#else
 /**
- * devres_alloc_node - Allocate device resource data
+ * __devres_alloc_node - Allocate device resource data
  * @release: Release function devres will be associated with
  * @size: Allocation size
  * @gfp: Allocation flags
  * @nid: NUMA node
+ * @name: Name of the resource
  *
  * Allocate devres of @size bytes.  The allocated area is zeroed, then
  * associated with @release.  The returned pointer can be passed to
@@ -162,17 +154,18 @@ EXPORT_SYMBOL_GPL(__devres_alloc_node);
  * RETURNS:
  * Pointer to allocated devres on success, NULL on failure.
  */
-void * devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid)
+void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid,
+			  const char *name)
 {
 	struct devres *dr;
 
 	dr = alloc_dr(release, size, gfp | __GFP_ZERO, nid);
 	if (unlikely(!dr))
 		return NULL;
+	set_node_dbginfo(&dr->node, name, size);
 	return dr->data;
 }
-EXPORT_SYMBOL_GPL(devres_alloc_node);
-#endif
+EXPORT_SYMBOL_GPL(__devres_alloc_node);
 
 /**
  * devres_for_each_res - Resource iterator
diff --git a/drivers/base/trace.c b/drivers/base/trace.c
new file mode 100644
index 000000000000..b24b0a309c4a
--- /dev/null
+++ b/drivers/base/trace.c
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Device core Trace Support
+ * Copyright (C) 2021, Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/drivers/base/trace.h b/drivers/base/trace.h
new file mode 100644
index 000000000000..3192e18f877e
--- /dev/null
+++ b/drivers/base/trace.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Device core Trace Support
+ * Copyright (C) 2021, Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dev
+
+#if !defined(__DEV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __DEV_TRACE_H
+
+#include <linux/device.h>
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+
+DECLARE_EVENT_CLASS(devres,
+	TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size),
+	TP_ARGS(dev, op, node, name, size),
+	TP_STRUCT__entry(
+		__string(devname, dev_name(dev))
+		__field(struct device *, dev)
+		__field(const char *, op)
+		__field(void *, node)
+		__field(const char *, name)
+		__field(size_t, size)
+	),
+	TP_fast_assign(
+		__assign_str(devname, dev_name(dev));
+		__entry->op = op;
+		__entry->node = node;
+		__entry->name = name;
+		__entry->size = size;
+	),
+	TP_printk("%s %3s %p %s (%zu bytes)", __get_str(devname),
+		  __entry->op, __entry->node, __entry->name, __entry->size)
+);
+
+DEFINE_EVENT(devres, devres_log,
+	TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size),
+	TP_ARGS(dev, op, node, name, size)
+);
+
+#endif /* __DEV_TRACE_H */
+
+/* this part has to be here */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
diff --git a/include/linux/device.h b/include/linux/device.h
index f1a00040fa53..b630f183f504 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -165,21 +165,12 @@ void device_remove_bin_file(struct device *dev,
 typedef void (*dr_release_t)(struct device *dev, void *res);
 typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);
 
-#ifdef CONFIG_DEBUG_DEVRES
 void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
 			  int nid, const char *name) __malloc;
 #define devres_alloc(release, size, gfp) \
 	__devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
 #define devres_alloc_node(release, size, gfp, nid) \
 	__devres_alloc_node(release, size, gfp, nid, #release)
-#else
-void *devres_alloc_node(dr_release_t release, size_t size,
-			gfp_t gfp, int nid) __malloc;
-static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp)
-{
-	return devres_alloc_node(release, size, gfp, NUMA_NO_NODE);
-}
-#endif
 
 void devres_for_each_res(struct device *dev, dr_release_t release,
 			 dr_match_t match, void *match_data,
-- 
cgit v1.2.3


From 60f86b9a1c0d81507133173ba3dcfc3edd4d89a5 Mon Sep 17 00:00:00 2001
From: Huilong Deng <denghuilong@cdjrlc.com>
Date: Tue, 15 Jun 2021 23:55:30 +0900
Subject: mcb: Remove trailing semicolon in macros

Macros should not use a trailing semicolon.

Signed-off-by: Huilong Deng <denghuilong@cdjrlc.com>
Signed-off-by: Johannes Thumshirn <jth@kernel.org>
Link: https://lore.kernel.org/r/fe520620eeddaa2ed8c669125f9b673c89d6b5a5.1623768541.git.johannes.thumshirn@wdc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mcb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mcb.h b/include/linux/mcb.h
index 71dd10a3d928..f6efb16f9d1b 100644
--- a/include/linux/mcb.h
+++ b/include/linux/mcb.h
@@ -120,7 +120,7 @@ extern int __must_check __mcb_register_driver(struct mcb_driver *drv,
 	__mcb_register_driver(driver, THIS_MODULE, KBUILD_MODNAME)
 extern void mcb_unregister_driver(struct mcb_driver *driver);
 #define module_mcb_driver(__mcb_driver)		\
-	module_driver(__mcb_driver, mcb_register_driver, mcb_unregister_driver);
+	module_driver(__mcb_driver, mcb_register_driver, mcb_unregister_driver)
 extern void mcb_bus_add_devices(const struct mcb_bus *bus);
 extern int mcb_device_register(struct mcb_bus *bus, struct mcb_device *dev);
 extern struct mcb_bus *mcb_alloc_bus(struct device *carrier);
-- 
cgit v1.2.3


From d5e4ddaeb6ab2c3c7fbb7b247a6d34bb0b18d87e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Sat, 12 Jun 2021 21:32:22 +0900
Subject: bpf: Support socket migration by eBPF.

This patch introduces a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT
to check if the attached eBPF program is capable of migrating sockets. When
the eBPF program is attached, we run it for socket migration if the
expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE or
net.ipv4.tcp_migrate_req is enabled.

Currently, the expected_attach_type is not enforced for the
BPF_PROG_TYPE_SK_REUSEPORT type of program. Thus, this commit follows the
earlier idea in the commit aac3fc320d94 ("bpf: Post-hooks for sys_bind") to
fix up the zero expected_attach_type in bpf_prog_load_fixup_attach_type().

Moreover, this patch adds a new field (migrating_sk) to sk_reuseport_md to
select a new listener based on the child socket. migrating_sk varies
depending on if it is migrating a request in the accept queue or during
3WHS.

  - accept_queue : sock (ESTABLISHED/SYN_RECV)
  - 3WHS         : request_sock (NEW_SYN_RECV)

In the eBPF program, we can select a new listener by
BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning
SK_DROP. This feature is useful when listeners have different settings at
the socket API level or when we want to free resources as soon as possible.

  - SK_PASS with selected_sk, select it as a new listener
  - SK_PASS with selected_sk NULL, fallbacks to the random selection
  - SK_DROP, cancel the migration.

There is a noteworthy point. We select a listening socket in three places,
but we do not have struct skb at closing a listener or retransmitting a
SYN+ACK. On the other hand, some helper functions do not expect skb is NULL
(e.g. skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer()
in BPF_FUNC_skb_load_bytes_relative()). So we allocate an empty skb
temporarily before running the eBPF program.

Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6tg6h@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/netdev/20201203042402.6cskdlit5f3mw4ru@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/bpf/20210612123224.12525-10-kuniyu@amazon.co.jp
---
 include/linux/bpf.h            |  1 +
 include/linux/filter.h         |  2 ++
 include/uapi/linux/bpf.h       | 15 +++++++++++++++
 kernel/bpf/syscall.c           | 13 +++++++++++++
 net/core/filter.c              | 13 ++++++++++++-
 net/core/sock_reuseport.c      | 34 ++++++++++++++++++++++++++++++----
 tools/include/uapi/linux/bpf.h | 15 +++++++++++++++
 7 files changed, 88 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 86dec5001ae2..f309fc1509f2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2048,6 +2048,7 @@ struct sk_reuseport_kern {
 	struct sk_buff *skb;
 	struct sock *sk;
 	struct sock *selected_sk;
+	struct sock *migrating_sk;
 	void *data_end;
 	u32 hash;
 	u32 reuseport_id;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c5ad7df029ed..688856e0b28a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -996,11 +996,13 @@ void bpf_warn_invalid_xdp_action(u32 act);
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 				  struct bpf_prog *prog, struct sk_buff *skb,
+				  struct sock *migrating_sk,
 				  u32 hash);
 #else
 static inline struct sock *
 bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 		     struct bpf_prog *prog, struct sk_buff *skb,
+		     struct sock *migrating_sk,
 		     u32 hash)
 {
 	return NULL;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f3b72588442b..bf9252c7381e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -994,6 +994,8 @@ enum bpf_attach_type {
 	BPF_SK_LOOKUP,
 	BPF_XDP,
 	BPF_SK_SKB_VERDICT,
+	BPF_SK_REUSEPORT_SELECT,
+	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -5416,7 +5418,20 @@ struct sk_reuseport_md {
 	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
 	__u32 bind_inany;	/* Is sock bound to an INANY address? */
 	__u32 hash;		/* A hash of the packet 4 tuples */
+	/* When reuse->migrating_sk is NULL, it is selecting a sk for the
+	 * new incoming connection request (e.g. selecting a listen sk for
+	 * the received SYN in the TCP case).  reuse->sk is one of the sk
+	 * in the reuseport group. The bpf prog can use reuse->sk to learn
+	 * the local listening ip/port without looking into the skb.
+	 *
+	 * When reuse->migrating_sk is not NULL, reuse->sk is closed and
+	 * reuse->migrating_sk is the socket that needs to be migrated
+	 * to another listening socket.  migrating_sk could be a fullsock
+	 * sk that is fully established or a reqsk that is in-the-middle
+	 * of 3-way handshake.
+	 */
 	__bpf_md_ptr(struct bpf_sock *, sk);
+	__bpf_md_ptr(struct bpf_sock *, migrating_sk);
 };
 
 #define BPF_TAG_SIZE	8
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 50457019da27..dbbc5342f221 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1972,6 +1972,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
 			attr->expected_attach_type =
 				BPF_CGROUP_INET_SOCK_CREATE;
 		break;
+	case BPF_PROG_TYPE_SK_REUSEPORT:
+		if (!attr->expected_attach_type)
+			attr->expected_attach_type =
+				BPF_SK_REUSEPORT_SELECT;
+		break;
 	}
 }
 
@@ -2055,6 +2060,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		if (expected_attach_type == BPF_SK_LOOKUP)
 			return 0;
 		return -EINVAL;
+	case BPF_PROG_TYPE_SK_REUSEPORT:
+		switch (expected_attach_type) {
+		case BPF_SK_REUSEPORT_SELECT:
+		case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	case BPF_PROG_TYPE_SYSCALL:
 	case BPF_PROG_TYPE_EXT:
 		if (expected_attach_type)
diff --git a/net/core/filter.c b/net/core/filter.c
index f753ab550525..5b86e47ef079 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10044,11 +10044,13 @@ out:
 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
 				    struct sock_reuseport *reuse,
 				    struct sock *sk, struct sk_buff *skb,
+				    struct sock *migrating_sk,
 				    u32 hash)
 {
 	reuse_kern->skb = skb;
 	reuse_kern->sk = sk;
 	reuse_kern->selected_sk = NULL;
+	reuse_kern->migrating_sk = migrating_sk;
 	reuse_kern->data_end = skb->data + skb_headlen(skb);
 	reuse_kern->hash = hash;
 	reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -10057,12 +10059,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
 
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 				  struct bpf_prog *prog, struct sk_buff *skb,
+				  struct sock *migrating_sk,
 				  u32 hash)
 {
 	struct sk_reuseport_kern reuse_kern;
 	enum sk_action action;
 
-	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
 	action = BPF_PROG_RUN(prog, &reuse_kern);
 
 	if (action == SK_PASS)
@@ -10207,6 +10210,10 @@ sk_reuseport_is_valid_access(int off, int size,
 		info->reg_type = PTR_TO_SOCKET;
 		return size == sizeof(__u64);
 
+	case offsetof(struct sk_reuseport_md, migrating_sk):
+		info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+		return size == sizeof(__u64);
+
 	/* Fields that allow narrowing */
 	case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
 		if (size < sizeof_field(struct sk_buff, protocol))
@@ -10283,6 +10290,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct sk_reuseport_md, sk):
 		SK_REUSEPORT_LOAD_FIELD(sk);
 		break;
+
+	case offsetof(struct sk_reuseport_md, migrating_sk):
+		SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index b239f8cd9d39..de5ee3ae86d5 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -377,13 +377,17 @@ void reuseport_stop_listen_sock(struct sock *sk)
 {
 	if (sk->sk_protocol == IPPROTO_TCP) {
 		struct sock_reuseport *reuse;
+		struct bpf_prog *prog;
 
 		spin_lock_bh(&reuseport_lock);
 
 		reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
 						  lockdep_is_held(&reuseport_lock));
+		prog = rcu_dereference_protected(reuse->prog,
+						 lockdep_is_held(&reuseport_lock));
 
-		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) {
+		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
+		    (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
 			/* Migration capable, move sk from the listening section
 			 * to the closed section.
 			 */
@@ -488,7 +492,7 @@ struct sock *reuseport_select_sock(struct sock *sk,
 			goto select_by_hash;
 
 		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
-			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
 		else
 			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
 
@@ -519,6 +523,8 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
 {
 	struct sock_reuseport *reuse;
 	struct sock *nsk = NULL;
+	bool allocated = false;
+	struct bpf_prog *prog;
 	u16 socks;
 	u32 hash;
 
@@ -536,10 +542,30 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
 	smp_rmb();
 
 	hash = migrating_sk->sk_hash;
-	if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+	prog = rcu_dereference(reuse->prog);
+	if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
+		if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+			goto select_by_hash;
+		goto out;
+	}
+
+	if (!skb) {
+		skb = alloc_skb(0, GFP_ATOMIC);
+		if (!skb)
+			goto out;
+		allocated = true;
+	}
+
+	nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
+
+	if (allocated)
+		kfree_skb(skb);
+
+select_by_hash:
+	if (!nsk)
 		nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
 
-	if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
+	if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
 		nsk = NULL;
 
 out:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f3b72588442b..bf9252c7381e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -994,6 +994,8 @@ enum bpf_attach_type {
 	BPF_SK_LOOKUP,
 	BPF_XDP,
 	BPF_SK_SKB_VERDICT,
+	BPF_SK_REUSEPORT_SELECT,
+	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -5416,7 +5418,20 @@ struct sk_reuseport_md {
 	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
 	__u32 bind_inany;	/* Is sock bound to an INANY address? */
 	__u32 hash;		/* A hash of the packet 4 tuples */
+	/* When reuse->migrating_sk is NULL, it is selecting a sk for the
+	 * new incoming connection request (e.g. selecting a listen sk for
+	 * the received SYN in the TCP case).  reuse->sk is one of the sk
+	 * in the reuseport group. The bpf prog can use reuse->sk to learn
+	 * the local listening ip/port without looking into the skb.
+	 *
+	 * When reuse->migrating_sk is not NULL, reuse->sk is closed and
+	 * reuse->migrating_sk is the socket that needs to be migrated
+	 * to another listening socket.  migrating_sk could be a fullsock
+	 * sk that is fully established or a reqsk that is in-the-middle
+	 * of 3-way handshake.
+	 */
 	__bpf_md_ptr(struct bpf_sock *, sk);
+	__bpf_md_ptr(struct bpf_sock *, migrating_sk);
 };
 
 #define BPF_TAG_SIZE	8
-- 
cgit v1.2.3


From 65b6d89d45a77256b743f421d109d469baefa688 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Mon, 3 May 2021 17:56:50 +0200
Subject: mtd: spi-nor: sfdp: save a copy of the SFDP data

Due to possible mode switching to 8D-8D-8D, it might not be possible to
read the SFDP after the initial probe. To be able to dump the SFDP via
sysfs afterwards, make a complete copy of it.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Tested-by: Heiko Thiery <heiko.thiery@gmail.com>
Reviewed-by: Pratyush Yadav <p.yadav@ti.com>
---
 drivers/mtd/spi-nor/core.h  | 10 ++++++++
 drivers/mtd/spi-nor/sfdp.c  | 58 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spi-nor.h |  2 ++
 3 files changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h
index 9398a8738857..f6dc28091c6d 100644
--- a/drivers/mtd/spi-nor/core.h
+++ b/drivers/mtd/spi-nor/core.h
@@ -461,6 +461,16 @@ struct spi_nor_manufacturer {
 	const struct spi_nor_fixups *fixups;
 };
 
+/**
+ * struct sfdp - SFDP data
+ * @num_dwords: number of entries in the dwords array
+ * @dwords: array of double words of the SFDP data
+ */
+struct sfdp {
+	size_t	num_dwords;
+	u32	*dwords;
+};
+
 /* Manufacturer drivers. */
 extern const struct spi_nor_manufacturer spi_nor_atmel;
 extern const struct spi_nor_manufacturer spi_nor_catalyst;
diff --git a/drivers/mtd/spi-nor/sfdp.c b/drivers/mtd/spi-nor/sfdp.c
index 23c28e91f698..c500c2118a5d 100644
--- a/drivers/mtd/spi-nor/sfdp.c
+++ b/drivers/mtd/spi-nor/sfdp.c
@@ -16,6 +16,7 @@
 	(((p)->parameter_table_pointer[2] << 16) | \
 	 ((p)->parameter_table_pointer[1] <<  8) | \
 	 ((p)->parameter_table_pointer[0] <<  0))
+#define SFDP_PARAM_HEADER_PARAM_LEN(p) ((p)->length * 4)
 
 #define SFDP_BFPT_ID		0xff00	/* Basic Flash Parameter Table */
 #define SFDP_SECTOR_MAP_ID	0xff81	/* Sector Map Table */
@@ -1245,6 +1246,8 @@ int spi_nor_parse_sfdp(struct spi_nor *nor)
 	struct sfdp_parameter_header *param_headers = NULL;
 	struct sfdp_header header;
 	struct device *dev = nor->dev;
+	struct sfdp *sfdp;
+	size_t sfdp_size;
 	size_t psize;
 	int i, err;
 
@@ -1267,6 +1270,9 @@ int spi_nor_parse_sfdp(struct spi_nor *nor)
 	    bfpt_header->major != SFDP_JESD216_MAJOR)
 		return -EINVAL;
 
+	sfdp_size = SFDP_PARAM_HEADER_PTP(bfpt_header) +
+		    SFDP_PARAM_HEADER_PARAM_LEN(bfpt_header);
+
 	/*
 	 * Allocate memory then read all parameter headers with a single
 	 * Read SFDP command. These parameter headers will actually be parsed
@@ -1293,6 +1299,58 @@ int spi_nor_parse_sfdp(struct spi_nor *nor)
 		}
 	}
 
+	/*
+	 * Cache the complete SFDP data. It is not (easily) possible to fetch
+	 * SFDP after probe time and we need it for the sysfs access.
+	 */
+	for (i = 0; i < header.nph; i++) {
+		param_header = &param_headers[i];
+		sfdp_size = max_t(size_t, sfdp_size,
+				  SFDP_PARAM_HEADER_PTP(param_header) +
+				  SFDP_PARAM_HEADER_PARAM_LEN(param_header));
+	}
+
+	/*
+	 * Limit the total size to a reasonable value to avoid allocating too
+	 * much memory just of because the flash returned some insane values.
+	 */
+	if (sfdp_size > PAGE_SIZE) {
+		dev_dbg(dev, "SFDP data (%zu) too big, truncating\n",
+			sfdp_size);
+		sfdp_size = PAGE_SIZE;
+	}
+
+	sfdp = devm_kzalloc(dev, sizeof(*sfdp), GFP_KERNEL);
+	if (!sfdp) {
+		err = -ENOMEM;
+		goto exit;
+	}
+
+	/*
+	 * The SFDP is organized in chunks of DWORDs. Thus, in theory, the
+	 * sfdp_size should be a multiple of DWORDs. But in case a flash
+	 * is not spec compliant, make sure that we have enough space to store
+	 * the complete SFDP data.
+	 */
+	sfdp->num_dwords = DIV_ROUND_UP(sfdp_size, sizeof(*sfdp->dwords));
+	sfdp->dwords = devm_kcalloc(dev, sfdp->num_dwords,
+				    sizeof(*sfdp->dwords), GFP_KERNEL);
+	if (!sfdp->dwords) {
+		err = -ENOMEM;
+		devm_kfree(dev, sfdp);
+		goto exit;
+	}
+
+	err = spi_nor_read_sfdp(nor, 0, sfdp_size, sfdp->dwords);
+	if (err < 0) {
+		dev_dbg(dev, "failed to read SFDP data\n");
+		devm_kfree(dev, sfdp->dwords);
+		devm_kfree(dev, sfdp);
+		goto exit;
+	}
+
+	nor->sfdp = sfdp;
+
 	/*
 	 * Check other parameter headers to get the latest revision of
 	 * the basic flash parameter table.
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index 98ed91b529ea..f67457748ed8 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -383,6 +383,7 @@ struct spi_nor_flash_parameter;
  * @read_proto:		the SPI protocol for read operations
  * @write_proto:	the SPI protocol for write operations
  * @reg_proto:		the SPI protocol for read_reg/write_reg/erase operations
+ * @sfdp:		the SFDP data of the flash
  * @controller_ops:	SPI NOR controller driver specific operations.
  * @params:		[FLASH-SPECIFIC] SPI NOR flash parameters and settings.
  *                      The structure includes legacy flash parameters and
@@ -412,6 +413,7 @@ struct spi_nor {
 	bool			sst_write_second;
 	u32			flags;
 	enum spi_nor_cmd_ext	cmd_ext_type;
+	struct sfdp		*sfdp;
 
 	const struct spi_nor_controller_ops *controller_ops;
 
-- 
cgit v1.2.3


From 475b92f932168a78da8109acd10bfb7578b8f2bb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 14 Jun 2021 15:24:05 -0700
Subject: ptp: improve max_adj check against unreasonable values

Scaled PPM conversion to PPB may (on 64bit systems) result
in a value larger than s32 can hold (freq/scaled_ppm is a long).
This means the kernel will not correctly reject unreasonably
high ->freq values (e.g. > 4294967295ppb, 281474976645 scaled PPM).

The conversion is equivalent to a division by ~66 (65.536),
so the value of ppb is always smaller than ppm, but not small
enough to assume narrowing the type from long -> s32 is okay.

Note that reasonable user space (e.g. ptp4l) will not use such
high values, anyway, 4289046510ppb ~= 4.3x, so the fix is
somewhat pedantic.

Fixes: d39a743511cd ("ptp: validate the requested frequency adjustment.")
Fixes: d94ba80ebbea ("ptp: Added a brand new class driver for ptp clocks.")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c          | 6 +++---
 include/linux/ptp_clock_kernel.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 03a246e60fd9..21c4c34c52d8 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -63,7 +63,7 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue,
 	spin_unlock_irqrestore(&queue->lock, flags);
 }
 
-s32 scaled_ppm_to_ppb(long ppm)
+long scaled_ppm_to_ppb(long ppm)
 {
 	/*
 	 * The 'freq' field in the 'struct timex' is in parts per
@@ -80,7 +80,7 @@ s32 scaled_ppm_to_ppb(long ppm)
 	s64 ppb = 1 + ppm;
 	ppb *= 125;
 	ppb >>= 13;
-	return (s32) ppb;
+	return (long) ppb;
 }
 EXPORT_SYMBOL(scaled_ppm_to_ppb);
 
@@ -138,7 +138,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx)
 		delta = ktime_to_ns(kt);
 		err = ops->adjtime(ops, delta);
 	} else if (tx->modes & ADJ_FREQUENCY) {
-		s32 ppb = scaled_ppm_to_ppb(tx->freq);
+		long ppb = scaled_ppm_to_ppb(tx->freq);
 		if (ppb > ops->max_adj || ppb < -ops->max_adj)
 			return -ERANGE;
 		if (ops->adjfine)
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 0d47fd33b228..51d7f1b8b32a 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -235,7 +235,7 @@ extern int ptp_clock_index(struct ptp_clock *ptp);
  * @ppm:    Parts per million, but with a 16 bit binary fractional field
  */
 
-extern s32 scaled_ppm_to_ppb(long ppm);
+extern long scaled_ppm_to_ppb(long ppm);
 
 /**
  * ptp_find_pin() - obtain the pin index of a given auxiliary function
-- 
cgit v1.2.3


From 293128b1ef5ae2cfa7403d54e183fe689ed5d303 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 15 Jun 2021 14:17:35 -0400
Subject: dm writecache: have ssd writeback wait if the kcopyd workqueue is
 busy

Make dm-writecache wait if the kcopyd workqueue is busy (as will
happen if waiting for page allocation or inside submit_bio).

This change improves performance of "mkfs.ext2" by approximately 20%
on one testbed.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-kcopyd.c     | 6 ++++++
 drivers/md/dm-writecache.c | 5 +++++
 include/linux/dm-kcopyd.h  | 1 +
 3 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index e50625ce74ec..37b03ab7e5c9 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -980,3 +980,9 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
 	kfree(kc);
 }
 EXPORT_SYMBOL(dm_kcopyd_client_destroy);
+
+void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc)
+{
+	flush_workqueue(kc->kcopyd_wq);
+}
+EXPORT_SYMBOL(dm_kcopyd_client_flush);
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 56179a21db0e..28bb6890fcf4 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -1812,6 +1812,11 @@ static void writecache_writeback(struct work_struct *work)
 	struct writeback_list wbl;
 	unsigned long n_walked;
 
+	if (!WC_MODE_PMEM(wc)) {
+		/* Wait for any active kcopyd work on behalf of ssd writeback */
+		dm_kcopyd_client_flush(wc->dm_kcopyd);
+	}
+
 	wc_lock(wc);
 restart:
 	if (writecache_has_error(wc)) {
diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h
index e42de7750c88..c1707ee5b540 100644
--- a/include/linux/dm-kcopyd.h
+++ b/include/linux/dm-kcopyd.h
@@ -51,6 +51,7 @@ MODULE_PARM_DESC(name, description)
 struct dm_kcopyd_client;
 struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle);
 void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc);
+void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc);
 
 /*
  * Submit a copy job to kcopyd.  This is built on top of the
-- 
cgit v1.2.3


From 9c54cd10e43947caa64920aaa7a30858193f8ef5 Mon Sep 17 00:00:00 2001
From: Charles Rose <charles.rose@dell.com>
Date: Tue, 15 Jun 2021 14:08:01 -0500
Subject: ahci: Add support for Dell S140 and later controllers

This patch enables support for Dell S140 and later controllers
that use Intel's PCHs configured as PCI_CLASS_STORAGE_RAID.

Reviewed-by: Mika Westerberg <mika.westerberg@intel.com>
Signed-off-by: Charles Rose <charles.rose@dell.com>
Link: https://lore.kernel.org/r/20210615190801.1744466-1-charles.rose@dell.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ata/ahci.c      | 4 ++++
 include/linux/pci_ids.h | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 33192a8f687d..186cbf90c8ea 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -446,6 +446,10 @@ static const struct pci_device_id ahci_pci_tbl[] = {
 	{ PCI_VENDOR_ID_AMD, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
 	  PCI_CLASS_STORAGE_RAID << 8, 0xffffff, board_ahci },
 
+	/* Dell S140/S150 */
+	{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_SUBVENDOR_ID_DELL, PCI_ANY_ID,
+	  PCI_CLASS_STORAGE_RAID << 8, 0xffffff, board_ahci },
+
 	/* VIA */
 	{ PCI_VDEVICE(VIA, 0x3349), board_ahci_vt8251 }, /* VIA VT8251 */
 	{ PCI_VDEVICE(VIA, 0x6287), board_ahci_vt8251 }, /* VIA VT8251 */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4c3fa5293d76..803ec446a729 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -631,6 +631,8 @@
 #define PCI_DEVICE_ID_DELL_RAC4		0x0012
 #define PCI_DEVICE_ID_DELL_PERC5	0x0015
 
+#define PCI_SUBVENDOR_ID_DELL		0x1028
+
 #define PCI_VENDOR_ID_MATROX		0x102B
 #define PCI_DEVICE_ID_MATROX_MGA_2	0x0518
 #define PCI_DEVICE_ID_MATROX_MIL	0x0519
-- 
cgit v1.2.3


From fd14602d05229671be81018fa226f9afdafdba88 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 15 Jun 2021 16:18:22 -0700
Subject: libnvdimm: Export nvdimm shutdown helper, nvdimm_delete()

CXL is a hotplug bus and arranges for nvdimm devices to be dynamically
discovered and removed. The libnvdimm core manages shutdown of nvdimm
security operations when the device is unregistered. That functionality
is moved to nvdimm_delete() and invoked by the CXL-to-nvdimm glue code.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/162379910271.2993820.2955889139842401250.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c       | 19 ++++---------------
 drivers/nvdimm/dimm_devs.c | 18 ++++++++++++++++++
 include/linux/libnvdimm.h  |  1 +
 3 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 3a777d0073b7..a11821df83b5 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -396,21 +396,10 @@ static int child_unregister(struct device *dev, void *data)
 	if (dev->class)
 		return 0;
 
-	if (is_nvdimm(dev)) {
-		struct nvdimm *nvdimm = to_nvdimm(dev);
-		bool dev_put = false;
-
-		/* We are shutting down. Make state frozen artificially. */
-		nvdimm_bus_lock(dev);
-		set_bit(NVDIMM_SECURITY_FROZEN, &nvdimm->sec.flags);
-		if (test_and_clear_bit(NDD_WORK_PENDING, &nvdimm->flags))
-			dev_put = true;
-		nvdimm_bus_unlock(dev);
-		cancel_delayed_work_sync(&nvdimm->dwork);
-		if (dev_put)
-			put_device(dev);
-	}
-	nd_device_unregister(dev, ND_SYNC);
+	if (is_nvdimm(dev))
+		nvdimm_delete(to_nvdimm(dev));
+	else
+		nd_device_unregister(dev, ND_SYNC);
 
 	return 0;
 }
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 9d208570d059..dc7449a40003 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -642,6 +642,24 @@ struct nvdimm *__nvdimm_create(struct nvdimm_bus *nvdimm_bus,
 }
 EXPORT_SYMBOL_GPL(__nvdimm_create);
 
+void nvdimm_delete(struct nvdimm *nvdimm)
+{
+	struct device *dev = &nvdimm->dev;
+	bool dev_put = false;
+
+	/* We are shutting down. Make state frozen artificially. */
+	nvdimm_bus_lock(dev);
+	set_bit(NVDIMM_SECURITY_FROZEN, &nvdimm->sec.flags);
+	if (test_and_clear_bit(NDD_WORK_PENDING, &nvdimm->flags))
+		dev_put = true;
+	nvdimm_bus_unlock(dev);
+	cancel_delayed_work_sync(&nvdimm->dwork);
+	if (dev_put)
+		put_device(dev);
+	nd_device_unregister(dev, ND_SYNC);
+}
+EXPORT_SYMBOL_GPL(nvdimm_delete);
+
 static void shutdown_security_notify(void *data)
 {
 	struct nvdimm *nvdimm = data;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 89b69e645ac7..7074aa9af525 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -278,6 +278,7 @@ static inline struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus,
 	return __nvdimm_create(nvdimm_bus, provider_data, groups, flags,
 			cmd_mask, num_flush, flush_wpq, NULL, NULL, NULL);
 }
+void nvdimm_delete(struct nvdimm *nvdimm);
 
 const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
-- 
cgit v1.2.3


From 2744d7a0733503931b71c00d156119ced002f22c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 9 Jun 2021 13:40:17 -0500
Subject: ACPI: Check StorageD3Enable _DSD property in ACPI code

Although first implemented for NVME, this check may be usable by
other drivers as well. Microsoft's specification explicitly mentions
that is may be usable by SATA and AHCI devices.  Google also indicates
that they have used this with SDHCI in a downstream kernel tree that
a user can plug a storage device into.

Link: https://docs.microsoft.com/en-us/windows-hardware/design/component-guidelines/power-management-for-storage-hardware-devices-intro
Suggested-by: Keith Busch <kbusch@kernel.org>
CC: Shyam-sundar S-k <Shyam-sundar.S-k@amd.com>
CC: Alexander Deucher <Alexander.Deucher@amd.com>
CC: Rafael J. Wysocki <rjw@rjwysocki.net>
CC: Prike Liang <prike.liang@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/acpi/device_pm.c | 29 +++++++++++++++++++++++++++++
 drivers/nvme/host/pci.c  | 28 +---------------------------
 include/linux/acpi.h     |  5 +++++
 3 files changed, 35 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index d260bc1f3e6e..d76ab50c71dc 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -1340,4 +1340,33 @@ int acpi_dev_pm_attach(struct device *dev, bool power_on)
 	return 1;
 }
 EXPORT_SYMBOL_GPL(acpi_dev_pm_attach);
+
+/**
+ * acpi_storage_d3 - Check if D3 should be used in the suspend path
+ * @dev: Device to check
+ *
+ * Return %true if the platform firmware wants @dev to be programmed
+ * into D3hot or D3cold (if supported) in the suspend path, or %false
+ * when there is no specific preference. On some platforms, if this
+ * hint is ignored, @dev may remain unresponsive after suspending the
+ * platform as a whole.
+ *
+ * Although the property has storage in the name it actually is
+ * applied to the PCIe slot and plugging in a non-storage device the
+ * same platform restrictions will likely apply.
+ */
+bool acpi_storage_d3(struct device *dev)
+{
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	u8 val;
+
+	if (!adev)
+		return false;
+	if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
+			&val))
+		return false;
+	return val == 1;
+}
+EXPORT_SYMBOL_GPL(acpi_storage_d3);
+
 #endif /* CONFIG_PM */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3aa7245a505f..8fbc4c87a0d8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2828,32 +2828,6 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 	return 0;
 }
 
-#ifdef CONFIG_ACPI
-static bool nvme_acpi_storage_d3(struct pci_dev *dev)
-{
-	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
-	u8 val;
-
-	/*
-	 * Look for _DSD property specifying that the storage device on the port
-	 * must use D3 to support deep platform power savings during
-	 * suspend-to-idle.
-	 */
-
-	if (!adev)
-		return false;
-	if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
-			&val))
-		return false;
-	return val == 1;
-}
-#else
-static inline bool nvme_acpi_storage_d3(struct pci_dev *dev)
-{
-	return false;
-}
-#endif /* CONFIG_ACPI */
-
 static void nvme_async_probe(void *data, async_cookie_t cookie)
 {
 	struct nvme_dev *dev = data;
@@ -2903,7 +2877,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	quirks |= check_vendor_combination_bug(pdev);
 
-	if (!noacpi && nvme_acpi_storage_d3(pdev)) {
+	if (!noacpi && acpi_storage_d3(&pdev->dev)) {
 		/*
 		 * Some systems use a bios work around to ask for D3 on
 		 * platforms that support kernel managed suspend.
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..dd0dafd21e33 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1004,6 +1004,7 @@ int acpi_dev_resume(struct device *dev);
 int acpi_subsys_runtime_suspend(struct device *dev);
 int acpi_subsys_runtime_resume(struct device *dev);
 int acpi_dev_pm_attach(struct device *dev, bool power_on);
+bool acpi_storage_d3(struct device *dev);
 #else
 static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; }
 static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; }
@@ -1011,6 +1012,10 @@ static inline int acpi_dev_pm_attach(struct device *dev, bool power_on)
 {
 	return 0;
 }
+static inline bool acpi_storage_d3(struct device *dev)
+{
+	return false;
+}
 #endif
 
 #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP)
-- 
cgit v1.2.3


From 4e7dba070b1f44da9bef4a61fd633f6b73a2e853 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Fri, 28 May 2021 10:04:52 +0100
Subject: ata: include: libata: Move fields commonly over-written to separate
 MACRO

This is a pre-cursor to some upcoming W=1 fix-ups.

Fixes the following W=1 kernel build warning(s):

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mark Lord <mlord@pobox.com>
Cc: Philipp Zabel <p.zabel@pengutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: linux-ide@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20210528090502.1799866-2-lee.jones@linaro.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/libata.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5f550eb27f81..3fcd24236793 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1397,25 +1397,28 @@ extern struct device_attribute *ata_common_sdev_attrs[];
 	ATA_SCSI_COMPAT_IOCTL					\
 	.queuecommand		= ata_scsi_queuecmd,		\
 	.dma_need_drain		= ata_scsi_dma_need_drain,	\
-	.can_queue		= ATA_DEF_QUEUE,		\
-	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,		\
 	.this_id		= ATA_SHT_THIS_ID,		\
 	.emulated		= ATA_SHT_EMULATED,		\
 	.proc_name		= drv_name,			\
-	.slave_configure	= ata_scsi_slave_config,	\
 	.slave_destroy		= ata_scsi_slave_destroy,	\
 	.bios_param		= ata_std_bios_param,		\
 	.unlock_native_capacity	= ata_scsi_unlock_native_capacity
 
-#define ATA_BASE_SHT(drv_name)					\
+#define ATA_SUBBASE_SHT(drv_name)				\
 	__ATA_BASE_SHT(drv_name),				\
+	.can_queue		= ATA_DEF_QUEUE,		\
+	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,		\
+	.slave_configure	= ata_scsi_slave_config
+
+#define ATA_BASE_SHT(drv_name)					\
+	ATA_SUBBASE_SHT(drv_name),				\
 	.sdev_attrs		= ata_common_sdev_attrs
 
 #ifdef CONFIG_SATA_HOST
 extern struct device_attribute *ata_ncq_sdev_attrs[];
 
 #define ATA_NCQ_SHT(drv_name)					\
-	__ATA_BASE_SHT(drv_name),				\
+	ATA_SUBBASE_SHT(drv_name),				\
 	.sdev_attrs		= ata_ncq_sdev_attrs,		\
 	.change_queue_depth	= ata_scsi_change_queue_depth
 #endif
-- 
cgit v1.2.3


From b7fb14d3ac63117e0e8beabe75f4ea52051fbe3a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 16 Jun 2021 15:46:58 +0200
Subject: ide: remove the legacy ide driver

The legay ide driver has been replace with libata starting in 2003 and has
been scheduled for removal for a while.  Finally kill it off so that we
can start cleaning up various bits of cruft it forced on the block layer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/userspace-api/ioctl/hdio.rst |  845 +----------
 MAINTAINERS                                |   16 -
 drivers/Kconfig                            |    2 -
 drivers/Makefile                           |    1 -
 drivers/ide/Kconfig                        |  827 -----------
 drivers/ide/Makefile                       |  109 --
 drivers/ide/aec62xx.c                      |  331 -----
 drivers/ide/ali14xx.c                      |  250 ----
 drivers/ide/alim15x3.c                     |  602 --------
 drivers/ide/amd74xx.c                      |  343 -----
 drivers/ide/atiixp.c                       |  212 ---
 drivers/ide/buddha.c                       |  238 ----
 drivers/ide/cmd640.c                       |  848 -----------
 drivers/ide/cmd64x.c                       |  452 ------
 drivers/ide/cs5520.c                       |  168 ---
 drivers/ide/cs5530.c                       |  295 ----
 drivers/ide/cs5535.c                       |  216 ---
 drivers/ide/cs5536.c                       |  294 ----
 drivers/ide/cy82c693.c                     |  234 ----
 drivers/ide/delkin_cb.c                    |  181 ---
 drivers/ide/dtc2278.c                      |  155 ---
 drivers/ide/falconide.c                    |  226 ---
 drivers/ide/gayle.c                        |  188 ---
 drivers/ide/hpt366.c                       | 1545 ---------------------
 drivers/ide/ht6560b.c                      |  383 -----
 drivers/ide/icside.c                       |  692 ---------
 drivers/ide/ide-4drives.c                  |   65 -
 drivers/ide/ide-acpi.c                     |  622 ---------
 drivers/ide/ide-atapi.c                    |  756 ----------
 drivers/ide/ide-cd.c                       | 1858 -------------------------
 drivers/ide/ide-cd.h                       |  123 --
 drivers/ide/ide-cd_ioctl.c                 |  468 -------
 drivers/ide/ide-cd_verbose.c               |  362 -----
 drivers/ide/ide-cs.c                       |  364 -----
 drivers/ide/ide-devsets.c                  |  192 ---
 drivers/ide/ide-disk.c                     |  795 -----------
 drivers/ide/ide-disk.h                     |   30 -
 drivers/ide/ide-disk_ioctl.c               |   33 -
 drivers/ide/ide-disk_proc.c                |  125 --
 drivers/ide/ide-dma-sff.c                  |  336 -----
 drivers/ide/ide-dma.c                      |  551 --------
 drivers/ide/ide-eh.c                       |  443 ------
 drivers/ide/ide-floppy.c                   |  551 --------
 drivers/ide/ide-floppy.h                   |   42 -
 drivers/ide/ide-floppy_ioctl.c             |  339 -----
 drivers/ide/ide-floppy_proc.c              |   34 -
 drivers/ide/ide-gd.c                       |  432 ------
 drivers/ide/ide-gd.h                       |   43 -
 drivers/ide/ide-generic.c                  |  139 --
 drivers/ide/ide-io-std.c                   |  262 ----
 drivers/ide/ide-io.c                       |  904 ------------
 drivers/ide/ide-ioctls.c                   |  306 ----
 drivers/ide/ide-iops.c                     |  536 -------
 drivers/ide/ide-legacy.c                   |   59 -
 drivers/ide/ide-lib.c                      |  146 --
 drivers/ide/ide-park.c                     |  155 ---
 drivers/ide/ide-pci-generic.c              |  203 ---
 drivers/ide/ide-pio-blacklist.c            |   96 --
 drivers/ide/ide-pm.c                       |  261 ----
 drivers/ide/ide-pnp.c                      |   92 --
 drivers/ide/ide-probe.c                    | 1623 ----------------------
 drivers/ide/ide-proc.c                     |  633 ---------
 drivers/ide/ide-scan-pci.c                 |  113 --
 drivers/ide/ide-sysfs.c                    |  143 --
 drivers/ide/ide-tape.c                     | 2083 ----------------------------
 drivers/ide/ide-taskfile.c                 |  668 ---------
 drivers/ide/ide-timings.c                  |  198 ---
 drivers/ide/ide-xfer-mode.c                |  267 ----
 drivers/ide/ide.c                          |  415 ------
 drivers/ide/ide_platform.c                 |  133 --
 drivers/ide/it8172.c                       |  165 ---
 drivers/ide/it8213.c                       |  217 ---
 drivers/ide/it821x.c                       |  715 ----------
 drivers/ide/jmicron.c                      |  176 ---
 drivers/ide/ns87415.c                      |  350 -----
 drivers/ide/opti621.c                      |  179 ---
 drivers/ide/palm_bk3710.c                  |  387 ------
 drivers/ide/pdc202xx_new.c                 |  557 --------
 drivers/ide/pdc202xx_old.c                 |  362 -----
 drivers/ide/piix.c                         |  476 -------
 drivers/ide/pmac.c                         | 1703 -----------------------
 drivers/ide/qd65xx.c                       |  446 ------
 drivers/ide/qd65xx.h                       |  145 --
 drivers/ide/rapide.c                       |  106 --
 drivers/ide/rz1000.c                       |  100 --
 drivers/ide/sc1200.c                       |  355 -----
 drivers/ide/serverworks.c                  |  456 ------
 drivers/ide/setup-pci.c                    |  682 ---------
 drivers/ide/siimage.c                      |  843 -----------
 drivers/ide/sis5513.c                      |  637 ---------
 drivers/ide/sl82c105.c                     |  367 -----
 drivers/ide/slc90e66.c                     |  182 ---
 drivers/ide/tc86c001.c                     |  270 ----
 drivers/ide/triflex.c                      |  143 --
 drivers/ide/trm290.c                       |  374 -----
 drivers/ide/tx4938ide.c                    |  209 ---
 drivers/ide/tx4939ide.c                    |  628 ---------
 drivers/ide/umc8672.c                      |  184 ---
 drivers/ide/via82cxxx.c                    |  532 -------
 include/linux/ide.h                        | 1623 ----------------------
 100 files changed, 25 insertions(+), 41196 deletions(-)
 delete mode 100644 drivers/ide/Kconfig
 delete mode 100644 drivers/ide/Makefile
 delete mode 100644 drivers/ide/aec62xx.c
 delete mode 100644 drivers/ide/ali14xx.c
 delete mode 100644 drivers/ide/alim15x3.c
 delete mode 100644 drivers/ide/amd74xx.c
 delete mode 100644 drivers/ide/atiixp.c
 delete mode 100644 drivers/ide/buddha.c
 delete mode 100644 drivers/ide/cmd640.c
 delete mode 100644 drivers/ide/cmd64x.c
 delete mode 100644 drivers/ide/cs5520.c
 delete mode 100644 drivers/ide/cs5530.c
 delete mode 100644 drivers/ide/cs5535.c
 delete mode 100644 drivers/ide/cs5536.c
 delete mode 100644 drivers/ide/cy82c693.c
 delete mode 100644 drivers/ide/delkin_cb.c
 delete mode 100644 drivers/ide/dtc2278.c
 delete mode 100644 drivers/ide/falconide.c
 delete mode 100644 drivers/ide/gayle.c
 delete mode 100644 drivers/ide/hpt366.c
 delete mode 100644 drivers/ide/ht6560b.c
 delete mode 100644 drivers/ide/icside.c
 delete mode 100644 drivers/ide/ide-4drives.c
 delete mode 100644 drivers/ide/ide-acpi.c
 delete mode 100644 drivers/ide/ide-atapi.c
 delete mode 100644 drivers/ide/ide-cd.c
 delete mode 100644 drivers/ide/ide-cd.h
 delete mode 100644 drivers/ide/ide-cd_ioctl.c
 delete mode 100644 drivers/ide/ide-cd_verbose.c
 delete mode 100644 drivers/ide/ide-cs.c
 delete mode 100644 drivers/ide/ide-devsets.c
 delete mode 100644 drivers/ide/ide-disk.c
 delete mode 100644 drivers/ide/ide-disk.h
 delete mode 100644 drivers/ide/ide-disk_ioctl.c
 delete mode 100644 drivers/ide/ide-disk_proc.c
 delete mode 100644 drivers/ide/ide-dma-sff.c
 delete mode 100644 drivers/ide/ide-dma.c
 delete mode 100644 drivers/ide/ide-eh.c
 delete mode 100644 drivers/ide/ide-floppy.c
 delete mode 100644 drivers/ide/ide-floppy.h
 delete mode 100644 drivers/ide/ide-floppy_ioctl.c
 delete mode 100644 drivers/ide/ide-floppy_proc.c
 delete mode 100644 drivers/ide/ide-gd.c
 delete mode 100644 drivers/ide/ide-gd.h
 delete mode 100644 drivers/ide/ide-generic.c
 delete mode 100644 drivers/ide/ide-io-std.c
 delete mode 100644 drivers/ide/ide-io.c
 delete mode 100644 drivers/ide/ide-ioctls.c
 delete mode 100644 drivers/ide/ide-iops.c
 delete mode 100644 drivers/ide/ide-legacy.c
 delete mode 100644 drivers/ide/ide-lib.c
 delete mode 100644 drivers/ide/ide-park.c
 delete mode 100644 drivers/ide/ide-pci-generic.c
 delete mode 100644 drivers/ide/ide-pio-blacklist.c
 delete mode 100644 drivers/ide/ide-pm.c
 delete mode 100644 drivers/ide/ide-pnp.c
 delete mode 100644 drivers/ide/ide-probe.c
 delete mode 100644 drivers/ide/ide-proc.c
 delete mode 100644 drivers/ide/ide-scan-pci.c
 delete mode 100644 drivers/ide/ide-sysfs.c
 delete mode 100644 drivers/ide/ide-tape.c
 delete mode 100644 drivers/ide/ide-taskfile.c
 delete mode 100644 drivers/ide/ide-timings.c
 delete mode 100644 drivers/ide/ide-xfer-mode.c
 delete mode 100644 drivers/ide/ide.c
 delete mode 100644 drivers/ide/ide_platform.c
 delete mode 100644 drivers/ide/it8172.c
 delete mode 100644 drivers/ide/it8213.c
 delete mode 100644 drivers/ide/it821x.c
 delete mode 100644 drivers/ide/jmicron.c
 delete mode 100644 drivers/ide/ns87415.c
 delete mode 100644 drivers/ide/opti621.c
 delete mode 100644 drivers/ide/palm_bk3710.c
 delete mode 100644 drivers/ide/pdc202xx_new.c
 delete mode 100644 drivers/ide/pdc202xx_old.c
 delete mode 100644 drivers/ide/piix.c
 delete mode 100644 drivers/ide/pmac.c
 delete mode 100644 drivers/ide/qd65xx.c
 delete mode 100644 drivers/ide/qd65xx.h
 delete mode 100644 drivers/ide/rapide.c
 delete mode 100644 drivers/ide/rz1000.c
 delete mode 100644 drivers/ide/sc1200.c
 delete mode 100644 drivers/ide/serverworks.c
 delete mode 100644 drivers/ide/setup-pci.c
 delete mode 100644 drivers/ide/siimage.c
 delete mode 100644 drivers/ide/sis5513.c
 delete mode 100644 drivers/ide/sl82c105.c
 delete mode 100644 drivers/ide/slc90e66.c
 delete mode 100644 drivers/ide/tc86c001.c
 delete mode 100644 drivers/ide/triflex.c
 delete mode 100644 drivers/ide/trm290.c
 delete mode 100644 drivers/ide/tx4938ide.c
 delete mode 100644 drivers/ide/tx4939ide.c
 delete mode 100644 drivers/ide/umc8672.c
 delete mode 100644 drivers/ide/via82cxxx.c
 delete mode 100644 include/linux/ide.h

(limited to 'include/linux')

diff --git a/Documentation/userspace-api/ioctl/hdio.rst b/Documentation/userspace-api/ioctl/hdio.rst
index 817371bf94e9..6ee8fc88699f 100644
--- a/Documentation/userspace-api/ioctl/hdio.rst
+++ b/Documentation/userspace-api/ioctl/hdio.rst
@@ -7,8 +7,8 @@ Summary of `HDIO_` ioctl calls
 November, 2004
 
 This document attempts to describe the ioctl(2) calls supported by
-the HD/IDE layer.  These are by-and-large implemented (as of Linux 2.6)
-in drivers/ide/ide.c and drivers/block/scsi_ioctl.c
+the HD/IDE layer.  These are by-and-large implemented (as of Linux 5.11)
+drivers/ata/libata-scsi.c.
 
 ioctl values are listed in <linux/hdreg.h>.  As of this writing, they
 are as follows:
@@ -17,50 +17,17 @@ are as follows:
 
 	=======================	=======================================
 	HDIO_GETGEO		get device geometry
-	HDIO_GET_UNMASKINTR	get current unmask setting
-	HDIO_GET_MULTCOUNT	get current IDE blockmode setting
-	HDIO_GET_QDMA		get use-qdma flag
-	HDIO_SET_XFER		set transfer rate via proc
-	HDIO_OBSOLETE_IDENTITY	OBSOLETE, DO NOT USE
-	HDIO_GET_KEEPSETTINGS	get keep-settings-on-reset flag
 	HDIO_GET_32BIT		get current io_32bit setting
-	HDIO_GET_NOWERR		get ignore-write-error flag
-	HDIO_GET_DMA		get use-dma flag
-	HDIO_GET_NICE		get nice flags
 	HDIO_GET_IDENTITY	get IDE identification info
-	HDIO_GET_WCACHE		get write cache mode on|off
-	HDIO_GET_ACOUSTIC	get acoustic value
-	HDIO_GET_ADDRESS	get sector addressing mode
-	HDIO_GET_BUSSTATE	get the bus state of the hwif
-	HDIO_TRISTATE_HWIF	execute a channel tristate
-	HDIO_DRIVE_RESET	execute a device reset
 	HDIO_DRIVE_TASKFILE	execute raw taskfile
 	HDIO_DRIVE_TASK		execute task and special drive command
 	HDIO_DRIVE_CMD		execute a special drive command
-	HDIO_DRIVE_CMD_AEB	HDIO_DRIVE_TASK
 	=======================	=======================================
 
     ioctls that pass non-pointer values:
 
 	=======================	=======================================
-	HDIO_SET_MULTCOUNT	change IDE blockmode
-	HDIO_SET_UNMASKINTR	permit other irqs during I/O
-	HDIO_SET_KEEPSETTINGS	keep ioctl settings on reset
 	HDIO_SET_32BIT		change io_32bit flags
-	HDIO_SET_NOWERR		change ignore-write-error flag
-	HDIO_SET_DMA		change use-dma flag
-	HDIO_SET_PIO_MODE	reconfig interface to new speed
-	HDIO_SCAN_HWIF		register and (re)scan interface
-	HDIO_SET_NICE		set nice flags
-	HDIO_UNREGISTER_HWIF	unregister interface
-	HDIO_SET_WCACHE		change write cache enable-disable
-	HDIO_SET_ACOUSTIC	change acoustic behavior
-	HDIO_SET_BUSSTATE	set the bus state of the hwif
-	HDIO_SET_QDMA		change use-qdma flag
-	HDIO_SET_ADDRESS	change lba addressing modes
-
-	HDIO_SET_IDE_SCSI	Set scsi emulation mode on/off
-	HDIO_SET_SCSI_IDE	not implemented yet
 	=======================	=======================================
 
 
@@ -137,512 +104,49 @@ HDIO_GETGEO
 
 
-
-HDIO_GET_UNMASKINTR
-	get current unmask setting
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_UNMASKINTR, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the drive's current unmask setting
-
-
-
-
-
-HDIO_SET_UNMASKINTR
-	permit other irqs during I/O
-
-
-	usage::
-
-	  unsigned long val;
-
-	  ioctl(fd, HDIO_SET_UNMASKINTR, val);
-
-	inputs:
-		New value for unmask flag
-
-
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 1]
-	  - EBUSY	Controller busy
-
-
-
-
-HDIO_GET_MULTCOUNT
-	get current IDE blockmode setting
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_MULTCOUNT, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the current IDE block mode setting.  This
-		controls how many sectors the drive will transfer per
-		interrupt.
-
-
-
-HDIO_SET_MULTCOUNT
-	change IDE blockmode
-
-
-	usage::
-
-	  int val;
-
-	  ioctl(fd, HDIO_SET_MULTCOUNT, val);
-
-	inputs:
-		New value for IDE block mode setting.  This controls how many
-		sectors the drive will transfer per interrupt.
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range supported by disk.
-	  - EBUSY	Controller busy or blockmode already set.
-	  - EIO		Drive did not accept new block mode.
-
-	notes:
-	  Source code comments read::
-
-	    This is tightly woven into the driver->do_special cannot
-	    touch.  DON'T do it again until a total personality rewrite
-	    is committed.
-
-	  If blockmode has already been set, this ioctl will fail with
-	  -EBUSY
-
-
-
-HDIO_GET_QDMA
-	get use-qdma flag
-
-
-	Not implemented, as of 2.6.8.1
-
-
-
-HDIO_SET_XFER
-	set transfer rate via proc
-
-
-	Not implemented, as of 2.6.8.1
-
-
-
-HDIO_OBSOLETE_IDENTITY
-	OBSOLETE, DO NOT USE
-
-
-	Same as HDIO_GET_IDENTITY (see below), except that it only
-	returns the first 142 bytes of drive identity information.
-
-
-
-HDIO_GET_IDENTITY
-	get IDE identification info
-
-
-	usage::
-
-	  unsigned char identity[512];
-
-	  ioctl(fd, HDIO_GET_IDENTITY, identity);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		ATA drive identity information.  For full description, see
-		the IDENTIFY DEVICE and IDENTIFY PACKET DEVICE commands in
-		the ATA specification.
-
-	error returns:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - ENOMSG	IDENTIFY DEVICE information not available
-
-	notes:
-		Returns information that was obtained when the drive was
-		probed.  Some of this information is subject to change, and
-		this ioctl does not re-probe the drive to update the
-		information.
-
-		This information is also available from /proc/ide/hdX/identify
-
-
-
-HDIO_GET_KEEPSETTINGS
-	get keep-settings-on-reset flag
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_KEEPSETTINGS, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the current "keep settings" flag
-
-
-
-	notes:
-		When set, indicates that kernel should restore settings
-		after a drive reset.
-
-
-
-HDIO_SET_KEEPSETTINGS
-	keep ioctl settings on reset
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_SET_KEEPSETTINGS, val);
-
-	inputs:
-		New value for keep_settings flag
-
-
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 1]
-	  - EBUSY		Controller busy
-
-
-
-HDIO_GET_32BIT
-	get current io_32bit setting
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_32BIT, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the current io_32bit setting
-
-
-
-	notes:
-		0=16-bit, 1=32-bit, 2,3 = 32bit+sync
-
-
-
-
-
-HDIO_GET_NOWERR
-	get ignore-write-error flag
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_NOWERR, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the current ignore-write-error flag
-
-
-
-
-
-HDIO_GET_DMA
-	get use-dma flag
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_DMA, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the current use-dma flag
-
-
-
-
-
-HDIO_GET_NICE
-	get nice flags
-
-
-	usage::
-
-	  long nice;
-
-	  ioctl(fd, HDIO_GET_NICE, &nice);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The drive's "nice" values.
-
-
-
-	notes:
-		Per-drive flags which determine when the system will give more
-		bandwidth to other devices sharing the same IDE bus.
-
-		See <linux/hdreg.h>, near symbol IDE_NICE_DSC_OVERLAP.
-
-
-
-
-HDIO_SET_NICE
-	set nice flags
-
-
-	usage::
-
-	  unsigned long nice;
-
-	  ...
-	  ioctl(fd, HDIO_SET_NICE, nice);
-
-	inputs:
-		bitmask of nice flags.
-
-
-
-	outputs:
-		none
-
-
-
-	error returns:
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EPERM	Flags other than DSC_OVERLAP and NICE_1 set.
-	  - EPERM	DSC_OVERLAP specified but not supported by drive
-
-	notes:
-		This ioctl sets the DSC_OVERLAP and NICE_1 flags from values
-		provided by the user.
-
-		Nice flags are listed in <linux/hdreg.h>, starting with
-		IDE_NICE_DSC_OVERLAP.  These values represent shifts.
-
-
-
-
-
-HDIO_GET_WCACHE
-	get write cache mode on|off
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_WCACHE, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the current write cache mode
-
-
-
-
-
-HDIO_GET_ACOUSTIC
-	get acoustic value
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_ACOUSTIC, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the current acoustic settings
-
-
-
-	notes:
-		See HDIO_SET_ACOUSTIC
-
-
-
-
-
-HDIO_GET_ADDRESS
-	usage::
-
-
-	  long val;
-
-	  ioctl(fd, HDIO_GET_ADDRESS, &val);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		The value of the current addressing mode:
-
-	    =  ===================
-	    0  28-bit
-	    1  48-bit
-	    2  48-bit doing 28-bit
-	    3  64-bit
-	    =  ===================
-
-
-
-HDIO_GET_BUSSTATE
-	get the bus state of the hwif
-
-
-	usage::
-
-	  long state;
-
-	  ioctl(fd, HDIO_SCAN_HWIF, &state);
-
-	inputs:
-		none
-
-
-
-	outputs:
-		Current power state of the IDE bus.  One of BUSSTATE_OFF,
-		BUSSTATE_ON, or BUSSTATE_TRISTATE
-
-	error returns:
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-
-
-
-
-HDIO_SET_BUSSTATE
-	set the bus state of the hwif
+HDIO_GET_IDENTITY
+	get IDE identification info
 
 
 	usage::
 
-	  int state;
+	  unsigned char identity[512];
 
-	  ...
-	  ioctl(fd, HDIO_SCAN_HWIF, state);
+	  ioctl(fd, HDIO_GET_IDENTITY, identity);
 
 	inputs:
-		Desired IDE power state.  One of BUSSTATE_OFF, BUSSTATE_ON,
-		or BUSSTATE_TRISTATE
-
-	outputs:
 		none
 
 
-	error returns:
-	  - EACCES	Access denied:  requires CAP_SYS_RAWIO
-	  - EOPNOTSUPP	Hardware interface does not support bus power control
-
-
-
+	outputs:
+		ATA drive identity information.  For full description, see
+		the IDENTIFY DEVICE and IDENTIFY PACKET DEVICE commands in
+		the ATA specification.
 
-HDIO_TRISTATE_HWIF
-	execute a channel tristate
+	error returns:
+	  - EINVAL	Called on a partition instead of the whole disk device
+	  - ENOMSG	IDENTIFY DEVICE information not available
 
+	notes:
+		Returns information that was obtained when the drive was
+		probed.  Some of this information is subject to change, and
+		this ioctl does not re-probe the drive to update the
+		information.
 
-	Not implemented, as of 2.6.8.1.  See HDIO_SET_BUSSTATE
+		This information is also available from /proc/ide/hdX/identify
 
 
-HDIO_DRIVE_RESET
-	execute a device reset
+HDIO_GET_32BIT
+	get current io_32bit setting
 
 
 	usage::
 
-	  int args[3]
+	  long val;
 
-	  ...
-	  ioctl(fd, HDIO_DRIVE_RESET, args);
+	  ioctl(fd, HDIO_GET_32BIT, &val);
 
 	inputs:
 		none
@@ -650,22 +154,12 @@ HDIO_DRIVE_RESET
 
 
 	outputs:
-		none
-
+		The value of the current io_32bit setting
 
 
-	error returns:
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - ENXIO	No such device:	phy dead or ctl_addr == 0
-	  - EIO		I/O error:	reset timed out or hardware error
 
 	notes:
-
-	  - Execute a reset on the device as soon as the current IO
-	    operation has completed.
-
-	  - Executes an ATAPI soft reset if applicable, otherwise
-	    executes an ATA soft reset on the controller.
+		0=16-bit, 1=32-bit, 2,3 = 32bit+sync
 
 
@@ -1026,14 +520,6 @@ HDIO_DRIVE_TASK
 
 
-HDIO_DRIVE_CMD_AEB
-	HDIO_DRIVE_TASK
-
-
-	Not implemented, as of 2.6.8.1
-
-
-
 HDIO_SET_32BIT
 	change io_32bit flags
 
@@ -1059,284 +545,3 @@ HDIO_SET_32BIT
 	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
 	  - EINVAL	value out of range [0 3]
 	  - EBUSY	Controller busy
-
-
-
-
-HDIO_SET_NOWERR
-	change ignore-write-error flag
-
-
-	usage::
-
-	  int val;
-
-	  ioctl(fd, HDIO_SET_NOWERR, val);
-
-	inputs:
-		New value for ignore-write-error flag.  Used for ignoring
-
-
-	  WRERR_STAT
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 1]
-	  - EBUSY		Controller busy
-
-
-
-HDIO_SET_DMA
-	change use-dma flag
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_SET_DMA, val);
-
-	inputs:
-		New value for use-dma flag
-
-
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 1]
-	  - EBUSY	Controller busy
-
-
-
-HDIO_SET_PIO_MODE
-	reconfig interface to new speed
-
-
-	usage::
-
-	  long val;
-
-	  ioctl(fd, HDIO_SET_PIO_MODE, val);
-
-	inputs:
-		New interface speed.
-
-
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 255]
-	  - EBUSY	Controller busy
-
-
-
-HDIO_SCAN_HWIF
-	register and (re)scan interface
-
-
-	usage::
-
-	  int args[3]
-
-	  ...
-	  ioctl(fd, HDIO_SCAN_HWIF, args);
-
-	inputs:
-
-	  =======	=========================
-	  args[0]	io address to probe
-
-
-	  args[1]	control address to probe
-	  args[2]	irq number
-	  =======	=========================
-
-	outputs:
-		none
-
-
-
-	error returns:
-	  - EACCES	Access denied:  requires CAP_SYS_RAWIO
-	  - EIO		Probe failed.
-
-	notes:
-		This ioctl initializes the addresses and irq for a disk
-		controller, probes for drives, and creates /proc/ide
-		interfaces as appropriate.
-
-
-
-HDIO_UNREGISTER_HWIF
-	unregister interface
-
-
-	usage::
-
-	  int index;
-
-	  ioctl(fd, HDIO_UNREGISTER_HWIF, index);
-
-	inputs:
-		index		index of hardware interface to unregister
-
-
-
-	outputs:
-		none
-
-
-
-	error returns:
-	  - EACCES	Access denied:  requires CAP_SYS_RAWIO
-
-	notes:
-		This ioctl removes a hardware interface from the kernel.
-
-		Currently (2.6.8) this ioctl silently fails if any drive on
-		the interface is busy.
-
-
-
-HDIO_SET_WCACHE
-	change write cache enable-disable
-
-
-	usage::
-
-	  int val;
-
-	  ioctl(fd, HDIO_SET_WCACHE, val);
-
-	inputs:
-		New value for write cache enable
-
-
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 1]
-	  - EBUSY	Controller busy
-
-
-
-HDIO_SET_ACOUSTIC
-	change acoustic behavior
-
-
-	usage::
-
-	  int val;
-
-	  ioctl(fd, HDIO_SET_ACOUSTIC, val);
-
-	inputs:
-		New value for drive acoustic settings
-
-
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 254]
-	  - EBUSY	Controller busy
-
-
-
-HDIO_SET_QDMA
-	change use-qdma flag
-
-
-	Not implemented, as of 2.6.8.1
-
-
-
-HDIO_SET_ADDRESS
-	change lba addressing modes
-
-
-	usage::
-
-	  int val;
-
-	  ioctl(fd, HDIO_SET_ADDRESS, val);
-
-	inputs:
-		New value for addressing mode
-
-	    =   ===================
-	    0   28-bit
-	    1   48-bit
-	    2   48-bit doing 28-bit
-	    =   ===================
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 2]
-	  - EBUSY		Controller busy
-	  - EIO		Drive does not support lba48 mode.
-
-
-HDIO_SET_IDE_SCSI
-	usage::
-
-
-	  long val;
-
-	  ioctl(fd, HDIO_SET_IDE_SCSI, val);
-
-	inputs:
-		New value for scsi emulation mode (?)
-
-
-
-	outputs:
-		none
-
-
-
-	error return:
-	  - EINVAL	Called on a partition instead of the whole disk device
-	  - EACCES	Access denied:  requires CAP_SYS_ADMIN
-	  - EINVAL	value out of range [0 1]
-	  - EBUSY	Controller busy
-
-
-
-HDIO_SET_SCSI_IDE
-	Not implemented, as of 2.6.8.1
diff --git a/MAINTAINERS b/MAINTAINERS
index 008fcad7ac00..4f3a7e3ce93f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8763,22 +8763,6 @@ L:	linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/busses/i2c-icy.c
 
-IDE SUBSYSTEM
-M:	"David S. Miller" <davem@davemloft.net>
-L:	linux-ide@vger.kernel.org
-S:	Maintained
-Q:	http://patchwork.ozlabs.org/project/linux-ide/list/
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/davem/ide.git
-F:	Documentation/ide/
-F:	drivers/ide/
-F:	include/linux/ide.h
-
-IDE/ATAPI DRIVERS
-L:	linux-ide@vger.kernel.org
-S:	Orphan
-F:	Documentation/cdrom/ide-cd.rst
-F:	drivers/ide/ide-cd*
-
 IDEAPAD LAPTOP EXTRAS DRIVER
 M:	Ike Panhc <ike.pan@canonical.com>
 L:	platform-driver-x86@vger.kernel.org
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 47980c6b1945..8bad63417a50 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -33,8 +33,6 @@ source "drivers/nvme/Kconfig"
 
 source "drivers/misc/Kconfig"
 
-source "drivers/ide/Kconfig"
-
 source "drivers/scsi/Kconfig"
 
 source "drivers/ata/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 5a6d613e868d..f85185f9139e 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -78,7 +78,6 @@ obj-$(CONFIG_CXL_BUS)		+= cxl/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-y				+= macintosh/
-obj-$(CONFIG_IDE)		+= ide/
 obj-y				+= scsi/
 obj-y				+= nvme/
 obj-$(CONFIG_ATA)		+= ata/
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
deleted file mode 100644
index 8af1ac69e5f8..000000000000
--- a/drivers/ide/Kconfig
+++ /dev/null
@@ -1,827 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# IDE ATA ATAPI Block device driver configuration
-#
-
-# Select HAVE_IDE if IDE is supported
-config HAVE_IDE
-	bool
-
-menuconfig IDE
-	tristate "ATA/ATAPI/MFM/RLL support (DEPRECATED)"
-	depends on HAVE_IDE
-	depends on BLOCK
-	select BLK_SCSI_REQUEST
-	help
-	  If you say Y here, your kernel will be able to manage ATA/(E)IDE and
-	  ATAPI units. The most common cases are IDE hard drives and ATAPI
-	  CD-ROM drives.
-
-	  This subsystem is currently in maintenance mode with only bug fix
-	  changes applied. Users of ATA hardware are encouraged to migrate to
-	  the newer ATA subsystem ("Serial ATA (prod) and Parallel ATA
-	  (experimental) drivers") which is more actively maintained.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called ide-core.
-
-	  For further information, please read <file:Documentation/ide/ide.rst>.
-
-	  If unsure, say N.
-
-if IDE
-
-comment "Please see Documentation/ide/ide.rst for help/info on IDE drives"
-
-config IDE_XFER_MODE
-	bool
-
-config IDE_TIMINGS
-	bool
-	select IDE_XFER_MODE
-
-config IDE_ATAPI
-	bool
-
-config IDE_LEGACY
-	bool
-
-config BLK_DEV_IDE_SATA
-	bool "Support for SATA (deprecated; conflicts with libata SATA driver)"
-	default n
-	help
-	  There are two drivers for Serial ATA controllers.
-
-	  The main driver, "libata", uses the SCSI subsystem
-	  and supports most modern SATA controllers. In order to use it
-	  you may take a look at "Serial ATA (prod) and Parallel ATA
-	  (experimental) drivers".
-
-	  The IDE driver (which you are currently configuring) supports
-	  a few first-generation SATA controllers.
-
-	  In order to eliminate conflicts between the two subsystems,
-	  this config option enables the IDE driver's SATA support.
-	  Normally this is disabled, as it is preferred that libata
-	  supports SATA controllers, and this (IDE) driver supports
-	  PATA controllers.
-
-	  If unsure, say N.
-
-config IDE_GD
-	tristate "generic ATA/ATAPI disk support"
-	default y
-	help
-	  Support for ATA/ATAPI disks (including ATAPI floppy drives).
-
-	  To compile this driver as a module, choose M here.
-	  The module will be called ide-gd_mod.
-
-	  If unsure, say Y.
-
-config IDE_GD_ATA
-	bool "ATA disk support"
-	depends on IDE_GD
-	default y
-	help
-	  This will include support for ATA hard disks.
-
-	  If unsure, say Y.
-
-config IDE_GD_ATAPI
-	bool "ATAPI floppy support"
-	depends on IDE_GD
-	select IDE_ATAPI
-	help
-	  This will include support for ATAPI floppy drives
-	  (i.e. Iomega ZIP or MKE LS-120).
-
-	  For information about jumper settings and the question
-	  of when a ZIP drive uses a partition table, see
-	  <http://www.win.tue.nl/~aeb/linux/zip/zip-1.html>.
-
-	  If unsure, say N.
-
-config BLK_DEV_IDECS
-	tristate "PCMCIA IDE support"
-	depends on PCMCIA
-	help
-	  Support for Compact Flash cards, outboard IDE disks, tape drives,
-	  and CD-ROM drives connected through a PCMCIA card.
-
-config BLK_DEV_DELKIN
-	tristate "Cardbus IDE support (Delkin/ASKA/Workbit)"
-	depends on CARDBUS && PCI
-	help
-	  Support for Delkin, ASKA, and Workbit Cardbus CompactFlash
-	  Adapters.  This may also work for similar SD and XD adapters.
-
-config BLK_DEV_IDECD
-	tristate "Include IDE/ATAPI CDROM support"
-	depends on BLK_DEV
-	select IDE_ATAPI
-	select CDROM
-	help
-	  If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is
-	  a newer protocol used by IDE CD-ROM and TAPE drives, similar to the
-	  SCSI protocol. Most new CD-ROM drives use ATAPI, including the
-	  NEC-260, Mitsumi FX400, Sony 55E, and just about all non-SCSI
-	  double(2X) or better speed drives.
-
-	  If you say Y here, the CD-ROM drive will be identified at boot time
-	  along with other IDE devices, as "hdb" or "hdc", or something
-	  similar (check the boot messages with dmesg). If this is your only
-	  CD-ROM drive, you can say N to all other CD-ROM options, but be sure
-	  to say Y or M to "ISO 9660 CD-ROM file system support".
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called ide-cd.
-
-config BLK_DEV_IDECD_VERBOSE_ERRORS
-	bool "Verbose error logging for IDE/ATAPI CDROM driver" if EXPERT
-	depends on BLK_DEV_IDECD
-	default y
-	help
-	  Turn this on to have the driver print out the meanings of the
-	  ATAPI error codes.  This will use up additional 8kB of kernel-space
-	  memory, though.
-
-config BLK_DEV_IDETAPE
-	tristate "Include IDE/ATAPI TAPE support"
-	select IDE_ATAPI
-	help
-	  If you have an IDE tape drive using the ATAPI protocol, say Y.
-	  ATAPI is a newer protocol used by IDE tape and CD-ROM drives,
-	  similar to the SCSI protocol.  If you have an SCSI tape drive
-	  however, you can say N here.
-
-	  You should also say Y if you have an OnStream DI-30 tape drive; this
-	  will not work with the SCSI protocol, until there is support for the
-	  SC-30 and SC-50 versions.
-
-	  If you say Y here, the tape drive will be identified at boot time
-	  along with other IDE devices, as "hdb" or "hdc", or something
-	  similar, and will be mapped to a character device such as "ht0"
-	  (check the boot messages with dmesg).  Be sure to consult the
-	  <file:drivers/ide/ide-tape.c> and <file:Documentation/ide/ide.rst>
-	  files for usage information.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called ide-tape.
-
-config BLK_DEV_IDEACPI
-	bool "IDE ACPI support"
-	depends on ACPI
-	help
-	  Implement ACPI support for generic IDE devices. On modern
-	  machines ACPI support is required to properly handle ACPI S3 states.
-
-config IDE_TASK_IOCTL
-	bool "IDE Taskfile Access"
-	help
-	  This is a direct raw access to the media.  It is a complex but
-	  elegant solution to test and validate the domain of the hardware and
-	  perform below the driver data recovery if needed.  This is the most
-	  basic form of media-forensics.
-
-	  If you are unsure, say N here.
-
-config IDE_PROC_FS
-	bool "legacy /proc/ide/ support"
-	depends on IDE && PROC_FS
-	default y
-	help
-	  This option enables support for the various files in
-	  /proc/ide.  In Linux 2.6 this has been superseded by
-	  files in sysfs but many legacy applications rely on this.
-
-	  If unsure say Y.
-
-comment "IDE chipset support/bugfixes"
-
-config IDE_GENERIC
-	tristate "generic/default IDE chipset support"
-	depends on ALPHA || X86 || IA64 || MIPS || ARCH_RPC
-	default ARM && ARCH_RPC
-	help
-	  This is the generic IDE driver.  This driver attaches to the
-	  fixed legacy ports (e.g. on PCs 0x1f0/0x170, 0x1e8/0x168 and
-	  so on).  Please note that if this driver is built into the
-	  kernel or loaded before other ATA (IDE or libata) drivers
-	  and the controller is located at legacy ports, this driver
-	  may grab those ports and thus can prevent the controller
-	  specific driver from attaching.
-
-	  Also, currently, IDE generic doesn't allow IRQ sharing
-	  meaning that the IRQs it grabs won't be available to other
-	  controllers sharing those IRQs which usually makes drivers
-	  for those controllers fail.  Generally, it's not a good idea
-	  to load IDE generic driver on modern systems.
-
-	  If unsure, say N.
-
-config BLK_DEV_PLATFORM
-	tristate "Platform driver for IDE interfaces"
-	help
-	  This is the platform IDE driver, used mostly for Memory Mapped
-	  IDE devices, like Compact Flashes running in True IDE mode.
-
-	  If unsure, say N.
-
-config BLK_DEV_CMD640
-	tristate "CMD640 chipset bugfix/support"
-	depends on X86
-	select IDE_TIMINGS
-	help
-	  The CMD-Technologies CMD640 IDE chip is used on many common 486 and
-	  Pentium motherboards, usually in combination with a "Neptune" or
-	  "SiS" chipset. Unfortunately, it has a number of rather nasty
-	  design flaws that can cause severe data corruption under many common
-	  conditions. Say Y here to include code which tries to automatically
-	  detect and correct the problems under Linux. This option also
-	  enables access to the secondary IDE ports in some CMD640 based
-	  systems.
-
-	  This driver will work automatically in PCI based systems (most new
-	  systems have PCI slots). But if your system uses VESA local bus
-	  (VLB) instead of PCI, you must also supply a kernel boot parameter
-	  to enable the CMD640 bugfix/support: "cmd640.probe_vlb". (Try "man
-	  bootparam" or see the documentation of your boot loader about how to
-	  pass options to the kernel.)
-
-	  The CMD640 chip is also used on add-in cards by Acculogic, and on
-	  the "CSA-6400E PCI to IDE controller" that some people have. For
-	  details, read <file:Documentation/ide/ide.rst>.
-
-config BLK_DEV_CMD640_ENHANCED
-	bool "CMD640 enhanced support"
-	depends on BLK_DEV_CMD640
-	help
-	  This option includes support for setting/autotuning PIO modes and
-	  prefetch on CMD640 IDE interfaces.  For details, read
-	  <file:Documentation/ide/ide.rst>. If you have a CMD640 IDE interface
-	  and your BIOS does not already do this for you, then say Y here.
-	  Otherwise say N.
-
-config BLK_DEV_IDEPNP
-	tristate "PNP EIDE support"
-	depends on PNP
-	help
-	  If you have a PnP (Plug and Play) compatible EIDE card and
-	  would like the kernel to automatically detect and activate
-	  it, say Y here.
-
-config BLK_DEV_IDEDMA_SFF
-	bool
-
-if PCI
-
-comment "PCI IDE chipsets support"
-
-config BLK_DEV_IDEPCI
-	bool
-
-config IDEPCI_PCIBUS_ORDER
-	bool "Probe IDE PCI devices in the PCI bus order (DEPRECATED)"
-	depends on IDE=y && BLK_DEV_IDEPCI
-	default y
-	help
-	  Probe IDE PCI devices in the order in which they appear on the
-	  PCI bus (i.e. 00:1f.1 PCI device before 02:01.0 PCI device)
-	  instead of the order in which IDE PCI host drivers are loaded.
-
-	  Please note that this method of assuring stable naming of
-	  IDE devices is unreliable and use other means for achieving
-	  it (i.e. udev).
-
-	  If in doubt, say N.
-
-# TODO: split it on per host driver config options (or module parameters)
-config BLK_DEV_OFFBOARD
-	bool "Boot off-board chipsets first support (DEPRECATED)"
-	depends on BLK_DEV_IDEPCI && (BLK_DEV_AEC62XX || BLK_DEV_GENERIC || BLK_DEV_HPT366 || BLK_DEV_PDC202XX_NEW || BLK_DEV_PDC202XX_OLD || BLK_DEV_TC86C001)
-	help
-	  Normally, IDE controllers built into the motherboard (on-board
-	  controllers) are assigned to ide0 and ide1 while those on add-in PCI
-	  cards (off-board controllers) are relegated to ide2 and ide3.
-	  Answering Y here will allow you to reverse the situation, with
-	  off-board controllers on ide0/1 and on-board controllers on ide2/3.
-	  This can improve the usability of some boot managers such as lilo
-	  when booting from a drive on an off-board controller.
-
-	  Note that, if you do this, the order of the hd* devices will be
-	  rearranged which may require modification of fstab and other files.
-
-	  Please also note that this method of assuring stable naming of
-	  IDE devices is unreliable and use other means for achieving it
-	  (i.e. udev).
-
-	  If in doubt, say N.
-
-config BLK_DEV_GENERIC
-	tristate "Generic PCI IDE Chipset Support"
-	select BLK_DEV_IDEPCI
-	help
-	  This option provides generic support for various PCI IDE Chipsets
-	  which otherwise might not be supported.
-
-config BLK_DEV_OPTI621
-	tristate "OPTi 82C621 chipset enhanced support"
-	select BLK_DEV_IDEPCI
-	help
-	  This is a driver for the OPTi 82C621 EIDE controller.
-	  Please read the comments at the top of <file:drivers/ide/opti621.c>.
-
-config BLK_DEV_RZ1000
-	tristate "RZ1000 chipset bugfix/support"
-	depends on X86
-	select BLK_DEV_IDEPCI
-	help
-	  The PC-Technologies RZ1000 IDE chip is used on many common 486 and
-	  Pentium motherboards, usually along with the "Neptune" chipset.
-	  Unfortunately, it has a rather nasty design flaw that can cause
-	  severe data corruption under many conditions. Say Y here to include
-	  code which automatically detects and corrects the problem under
-	  Linux. This may slow disk throughput by a few percent, but at least
-	  things will operate 100% reliably.
-
-config BLK_DEV_IDEDMA_PCI
-	bool
-	select BLK_DEV_IDEPCI
-	select BLK_DEV_IDEDMA_SFF
-
-config BLK_DEV_AEC62XX
-	tristate "AEC62XX chipset support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds explicit support for Acard AEC62xx (Artop ATP8xx)
-	  IDE controllers. This allows the kernel to change PIO, DMA and UDMA
-	  speeds and to configure the chip to optimum performance.
-
-config BLK_DEV_ALI15X3
-	tristate "ALI M15x3 chipset support"
-	select IDE_TIMINGS
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver ensures (U)DMA support for ALI 1533, 1543 and 1543C
-	  onboard chipsets.  It also tests for Simplex mode and enables
-	  normal dual channel support.
-
-	  Please read the comments at the top of
-	  <file:drivers/ide/alim15x3.c>.
-
-	  If unsure, say N.
-
-config BLK_DEV_AMD74XX
-	tristate "AMD and nVidia IDE support"
-	depends on !ARM
-	select IDE_TIMINGS
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds explicit support for AMD-7xx and AMD-8111 chips
-	  and also for the nVidia nForce chip.  This allows the kernel to
-	  change PIO, DMA and UDMA speeds and to configure the chip to
-	  optimum performance.
-
-config BLK_DEV_ATIIXP
-	tristate "ATI IXP chipset IDE support"
-	depends on X86
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds explicit support for ATI IXP chipset.
-	  This allows the kernel to change PIO, DMA and UDMA speeds
-	  and to configure the chip to optimum performance.
-
-	  Say Y here if you have an ATI IXP chipset IDE controller.
-
-config BLK_DEV_CMD64X
-	tristate "CMD64{3|6|8|9} chipset support"
-	select IDE_TIMINGS
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  Say Y here if you have an IDE controller which uses any of these
-	  chipsets: CMD643, CMD646, or CMD648.
-
-config BLK_DEV_TRIFLEX
-	tristate "Compaq Triflex IDE support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  Say Y here if you have a Compaq Triflex IDE controller, such
-	  as those commonly found on Compaq Pentium-Pro systems
-
-config BLK_DEV_CY82C693
-	tristate "CY82C693 chipset support"
-	depends on ALPHA
-	select IDE_TIMINGS
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds detection and support for the CY82C693 chipset
-	  used on Digital's PC-Alpha 164SX boards.
-
-config BLK_DEV_CS5520
-	tristate "Cyrix CS5510/20 MediaGX chipset support (VERY EXPERIMENTAL)"
-	depends on X86_32 || COMPILE_TEST
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  Include support for PIO tuning and virtual DMA on the Cyrix MediaGX
-	  5510/5520 chipset. This will automatically be detected and
-	  configured if found.
-
-	  It is safe to say Y to this question.
-
-config BLK_DEV_CS5530
-	tristate "Cyrix/National Semiconductor CS5530 MediaGX chipset support"
-	depends on X86_32 || COMPILE_TEST
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  Include support for UDMA on the Cyrix MediaGX 5530 chipset. This
-	  will automatically be detected and configured if found.
-
-	  It is safe to say Y to this question.
-
-config BLK_DEV_CS5535
-	tristate "AMD CS5535 chipset support"
-	depends on X86_32
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  Include support for UDMA on the NSC/AMD CS5535 companion chipset.
-	  This will automatically be detected and configured if found.
-
-	  It is safe to say Y to this question.
-
-config BLK_DEV_CS5536
-	tristate "CS5536 chipset support"
-	depends on X86_32
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This option enables support for the AMD CS5536
-	  companion chip used with the Geode LX processor family.
-
-	  If unsure, say N.
-
-config BLK_DEV_HPT366
-	tristate "HPT36X/37X chipset support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  HPT366 is an Ultra DMA chipset for ATA-66.
-	  HPT368 is an Ultra DMA chipset for ATA-66 RAID Based.
-	  HPT370 is an Ultra DMA chipset for ATA-100.
-	  HPT372 is an Ultra DMA chipset for ATA-100.
-	  HPT374 is an Ultra DMA chipset for ATA-100.
-
-	  This driver adds up to 4 more EIDE devices sharing a single
-	  interrupt.
-
-	  The HPT366 chipset in its current form is bootable. One solution
-	  for this problem are special LILO commands for redirecting the
-	  reference to device 0x80. The other solution is to say Y to "Boot
-	  off-board chipsets first support" (CONFIG_BLK_DEV_OFFBOARD) unless
-	  your mother board has the chipset natively mounted. Regardless one
-	  should use the fore mentioned option and call at LILO.
-
-	  This driver requires dynamic tuning of the chipset during the
-	  ide-probe at boot. It is reported to support DVD II drives, by the
-	  manufacturer.
-
-config BLK_DEV_JMICRON
-	tristate "JMicron JMB36x support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  Basic support for the JMicron ATA controllers. For full support
-	  use the libata drivers.
-
-config BLK_DEV_SC1200
-	tristate "National SCx200 chipset support"
-	depends on X86_32 || COMPILE_TEST
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds support for the on-board IDE controller on the
-	  National SCx200 series of embedded x86 "Geode" systems.
-
-config BLK_DEV_PIIX
-	tristate "Intel PIIX/ICH chipsets support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds explicit support for Intel PIIX and ICH chips.
-	  This allows the kernel to change PIO, DMA and UDMA speeds and to
-	  configure the chip to optimum performance.
-
-config BLK_DEV_IT8172
-	tristate "IT8172 IDE support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds support for the IDE controller on the
-	  IT8172 System Controller.
-
-config BLK_DEV_IT8213
-	tristate "IT8213 IDE support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds support for the ITE 8213 IDE controller.
-
-config BLK_DEV_IT821X
-	tristate "IT821X IDE support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds support for the ITE 8211 IDE controller and the
-	  IT 8212 IDE RAID controller in both RAID and pass-through mode.
-
-config BLK_DEV_NS87415
-	tristate "NS87415 chipset support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds detection and support for the NS87415 chip
-	  (used mainly on SPARC64 and PA-RISC machines).
-
-	  Please read the comments at the top of <file:drivers/ide/ns87415.c>.
-
-config BLK_DEV_PDC202XX_OLD
-	tristate "PROMISE PDC202{46|62|65|67} support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  Promise Ultra33 or PDC20246
-	  Promise Ultra66 or PDC20262
-	  Promise Ultra100 or PDC20265/PDC20267/PDC20268
-
-	  This driver adds up to 4 more EIDE devices sharing a single
-	  interrupt. This add-on card is a bootable PCI UDMA controller. Since
-	  multiple cards can be installed and there are BIOS ROM problems that
-	  happen if the BIOS revisions of all installed cards (three-max) do
-	  not match, the driver attempts to do dynamic tuning of the chipset
-	  at boot-time for max-speed.  Ultra33 BIOS 1.25 or newer is required
-	  for more than one card.
-
-	  Please read the comments at the top of
-	  <file:drivers/ide/pdc202xx_old.c>.
-
-	  If unsure, say N.
-
-config BLK_DEV_PDC202XX_NEW
-	tristate "PROMISE PDC202{68|69|70|71|75|76|77} support"
-	select BLK_DEV_IDEDMA_PCI
-
-config BLK_DEV_SVWKS
-	tristate "ServerWorks OSB4/CSB5/CSB6 chipsets support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds PIO/(U)DMA support for the ServerWorks OSB4/CSB5
-	  chipsets.
-
-config BLK_DEV_SIIMAGE
-	tristate "Silicon Image chipset support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds PIO/(U)DMA support for the SI CMD680 and SII
-	  3112 (Serial ATA) chips.
-
-config BLK_DEV_SIS5513
-	tristate "SiS5513 chipset support"
-	depends on X86
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver ensures (U)DMA support for SIS5513 chipset family based
-	  mainboards.
-
-	  The following chipsets are supported:
-	  ATA16:  SiS5511, SiS5513
-	  ATA33:  SiS5591, SiS5597, SiS5598, SiS5600
-	  ATA66:  SiS530, SiS540, SiS620, SiS630, SiS640
-	  ATA100: SiS635, SiS645, SiS650, SiS730, SiS735, SiS740,
-	  SiS745, SiS750
-
-	  Please read the comments at the top of <file:drivers/ide/sis5513.c>.
-
-config BLK_DEV_SL82C105
-	tristate "Winbond SL82c105 support"
-	depends on (PPC || ARM)
-	select IDE_TIMINGS
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  If you have a Winbond SL82c105 IDE controller, say Y here to enable
-	  special configuration for this chip. This is common on various CHRP
-	  motherboards, but could be used elsewhere. If in doubt, say Y.
-
-config BLK_DEV_SLC90E66
-	tristate "SLC90E66 chipset support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver ensures (U)DMA support for Victory66 SouthBridges for
-	  SMsC with Intel NorthBridges.  This is an Ultra66 based chipset.
-	  The nice thing about it is that you can mix Ultra/DMA/PIO devices
-	  and it will handle timing cycles.  Since this is an improved
-	  look-a-like to the PIIX4 it should be a nice addition.
-
-	  Please read the comments at the top of
-	  <file:drivers/ide/slc90e66.c>.
-
-config BLK_DEV_TRM290
-	tristate "Tekram TRM290 chipset support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds support for bus master DMA transfers
-	  using the Tekram TRM290 PCI IDE chip. Volunteers are
-	  needed for further tweaking and development.
-	  Please read the comments at the top of <file:drivers/ide/trm290.c>.
-
-config BLK_DEV_VIA82CXXX
-	tristate "VIA82CXXX chipset support"
-	select IDE_TIMINGS
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver adds explicit support for VIA BusMastering IDE chips.
-	  This allows the kernel to change PIO, DMA and UDMA speeds and to
-	  configure the chip to optimum performance.
-
-config BLK_DEV_TC86C001
-	tristate "Toshiba TC86C001 support"
-	select BLK_DEV_IDEDMA_PCI
-	help
-	This driver adds support for Toshiba TC86C001 GOKU-S chip.
-
-endif
-
-# TODO: BLK_DEV_IDEDMA_PCI -> BLK_DEV_IDEDMA_SFF
-config BLK_DEV_IDE_PMAC
-	tristate "PowerMac on-board IDE support"
-	depends on PPC_PMAC
-	select IDE_TIMINGS
-	select BLK_DEV_IDEDMA_PCI
-	help
-	  This driver provides support for the on-board IDE controller on
-	  most of the recent Apple Power Macintoshes and PowerBooks.
-	  If unsure, say Y.
-
-config BLK_DEV_IDE_PMAC_ATA100FIRST
-	bool "Probe on-board ATA/100 (Kauai) first"
-	depends on BLK_DEV_IDE_PMAC
-	help
-	  This option will cause the ATA/100 controller found in UniNorth2
-	  based machines (Windtunnel PowerMac, Aluminium PowerBooks, ...)
-	  to be probed before the ATA/66 and ATA/33 controllers. Without
-	  these, those machine used to have the hard disk on hdc and the
-	  CD-ROM on hda. This option changes this to more natural hda for
-	  hard disk and hdc for CD-ROM.
-
-config BLK_DEV_IDE_TX4938
-	tristate "TX4938 internal IDE support"
-	depends on SOC_TX4938
-	select IDE_TIMINGS
-
-config BLK_DEV_IDE_TX4939
-	tristate "TX4939 internal IDE support"
-	depends on SOC_TX4939
-	select BLK_DEV_IDEDMA_SFF
-
-config BLK_DEV_IDE_ICSIDE
-	tristate "ICS IDE interface support"
-	depends on ARM && ARCH_ACORN
-	help
-	  On Acorn systems, say Y here if you wish to use the ICS IDE
-	  interface card.  This is not required for ICS partition support.
-	  If you are unsure, say N to this.
-
-config BLK_DEV_IDEDMA_ICS
-	bool "ICS DMA support"
-	depends on BLK_DEV_IDE_ICSIDE
-	help
-	  Say Y here if you want to add DMA (Direct Memory Access) support to
-	  the ICS IDE driver.
-
-config BLK_DEV_IDE_RAPIDE
-	tristate "RapIDE interface support"
-	depends on ARM && ARCH_ACORN
-	help
-	  Say Y here if you want to support the Yellowstone RapIDE controller
-	  manufactured for use with Acorn computers.
-
-config BLK_DEV_GAYLE
-	tristate "Amiga Gayle IDE interface support"
-	depends on AMIGA
-	help
-	  This is the IDE driver for the Amiga Gayle IDE interface. It supports
-	  both the `A1200 style' and `A4000 style' of the Gayle IDE interface,
-	  This includes on-board IDE interfaces on some Amiga models (A600,
-	  A1200, A4000, and A4000T), and IDE interfaces on the Zorro expansion
-	  bus (M-Tech E-Matrix 530 expansion card).
-
-	  It also provides support for the so-called `IDE doublers' (made
-	  by various manufacturers, e.g. Eyetech) that can be connected to
-	  the on-board IDE interface of some Amiga models. Using such an IDE
-	  doubler, you can connect up to four instead of two IDE devices to
-	  the Amiga's on-board IDE interface. The feature is enabled at kernel
-	  runtime using the "gayle.doubler" kernel boot parameter.
-
-	  Say Y if you have an Amiga with a Gayle IDE interface and want to use
-	  IDE devices (hard disks, CD-ROM drives, etc.) that are connected to
-	  it.
-
-	  Note that you also have to enable Zorro bus support if you want to
-	  use Gayle IDE interfaces on the Zorro expansion bus.
-
-config BLK_DEV_BUDDHA
-	tristate "Buddha/Catweasel/X-Surf IDE interface support"
-	depends on ZORRO
-	help
-	  This is the IDE driver for the IDE interfaces on the Buddha, Catweasel
-	  and X-Surf expansion boards.  It supports up to two interfaces on the
-	  Buddha, three on the Catweasel and two on the X-Surf.
-
-	  Say Y if you have a Buddha or Catweasel expansion board and want to
-	  use IDE devices (hard disks, CD-ROM drives, etc.) that are connected
-	  to one of its IDE interfaces.
-
-config BLK_DEV_FALCON_IDE
-	tristate "Falcon and Q40/Q60 IDE interface support"
-	depends on ATARI || Q40
-	help
-	  This is the IDE driver for the on-board IDE interface on the Atari
-	  Falcon and Q40/Q60. Say Y if you have such a machine and want to use
-	  IDE devices (hard disks, CD-ROM drives, etc.) that are connected to
-	  the on-board IDE interface.
-
-config BLK_DEV_PALMCHIP_BK3710
-	tristate "Palmchip bk3710 IDE controller support"
-	depends on ARCH_DAVINCI
-	select IDE_TIMINGS
-	select BLK_DEV_IDEDMA_SFF
-	help
-	  Say Y here if you want to support the onchip IDE controller on the
-	  TI DaVinci SoC
-
-# no isa -> no vlb
-if ISA && (ALPHA || X86 || MIPS)
-
-comment "Other IDE chipsets support"
-comment "Note: most of these also require special kernel boot parameters"
-
-config BLK_DEV_4DRIVES
-	tristate "Generic 4 drives/port support"
-	help
-	  Certain older chipsets, including the Tekram 690CD, use a single set
-	  of I/O ports at 0x1f0 to control up to four drives, instead of the
-	  customary two drives per port. Support for this can be enabled at
-	  runtime using the "ide-4drives.probe" kernel boot parameter if you
-	  say Y here.
-
-config BLK_DEV_ALI14XX
-	tristate "ALI M14xx support"
-	select IDE_TIMINGS
-	select IDE_LEGACY
-	help
-	  This driver is enabled at runtime using the "ali14xx.probe" kernel
-	  boot parameter.  It enables support for the secondary IDE interface
-	  of the ALI M1439/1443/1445/1487/1489 chipsets, and permits faster
-	  I/O speeds to be set as well.
-	  See the files <file:Documentation/ide/ide.rst> and
-	  <file:drivers/ide/ali14xx.c> for more info.
-
-config BLK_DEV_DTC2278
-	tristate "DTC-2278 support"
-	select IDE_XFER_MODE
-	select IDE_LEGACY
-	help
-	  This driver is enabled at runtime using the "dtc2278.probe" kernel
-	  boot parameter. It enables support for the secondary IDE interface
-	  of the DTC-2278 card, and permits faster I/O speeds to be set as
-	  well. See the <file:Documentation/ide/ide.rst> and
-	  <file:drivers/ide/dtc2278.c> files for more info.
-
-config BLK_DEV_HT6560B
-	tristate "Holtek HT6560B support"
-	select IDE_TIMINGS
-	select IDE_LEGACY
-	help
-	  This driver is enabled at runtime using the "ht6560b.probe" kernel
-	  boot parameter. It enables support for the secondary IDE interface
-	  of the Holtek card, and permits faster I/O speeds to be set as well.
-	  See the <file:Documentation/ide/ide.rst> and
-	  <file:drivers/ide/ht6560b.c> files for more info.
-
-config BLK_DEV_QD65XX
-	tristate "QDI QD65xx support"
-	select IDE_TIMINGS
-	select IDE_LEGACY
-	help
-	  This driver is enabled at runtime using the "qd65xx.probe" kernel
-	  boot parameter.  It permits faster I/O speeds to be set.  See the
-	  <file:Documentation/ide/ide.rst> and <file:drivers/ide/qd65xx.c>
-	  for more info.
-
-config BLK_DEV_UMC8672
-	tristate "UMC-8672 support"
-	select IDE_XFER_MODE
-	select IDE_LEGACY
-	help
-	  This driver is enabled at runtime using the "umc8672.probe" kernel
-	  boot parameter. It enables support for the secondary IDE interface
-	  of the UMC-8672, and permits faster I/O speeds to be set as well.
-	  See the files <file:Documentation/ide/ide.rst> and
-	  <file:drivers/ide/umc8672.c> for more info.
-
-endif
-
-config BLK_DEV_IDEDMA
-	def_bool BLK_DEV_IDEDMA_SFF || BLK_DEV_IDEDMA_ICS
-	select IDE_XFER_MODE
-
-endif # IDE
diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
deleted file mode 100644
index 991eb72a786b..000000000000
--- a/drivers/ide/Makefile
+++ /dev/null
@@ -1,109 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# link order is important here
-#
-
-ide-core-y += ide.o ide-ioctls.o ide-io.o ide-iops.o ide-lib.o ide-probe.o \
-	      ide-taskfile.o ide-pm.o ide-park.o ide-sysfs.o ide-devsets.o \
-	      ide-io-std.o ide-eh.o
-
-# core IDE code
-ide-core-$(CONFIG_IDE_XFER_MODE)	+= ide-pio-blacklist.o ide-xfer-mode.o
-ide-core-$(CONFIG_IDE_TIMINGS)		+= ide-timings.o
-ide-core-$(CONFIG_IDE_ATAPI)		+= ide-atapi.o
-ide-core-$(CONFIG_BLK_DEV_IDEPCI)	+= setup-pci.o
-ide-core-$(CONFIG_BLK_DEV_IDEDMA)	+= ide-dma.o
-ide-core-$(CONFIG_BLK_DEV_IDEDMA_SFF)	+= ide-dma-sff.o
-ide-core-$(CONFIG_IDE_PROC_FS)		+= ide-proc.o
-ide-core-$(CONFIG_BLK_DEV_IDEACPI)	+= ide-acpi.o
-ide-core-$(CONFIG_IDE_LEGACY)		+= ide-legacy.o
-
-obj-$(CONFIG_IDE)			+= ide-core.o
-
-obj-$(CONFIG_BLK_DEV_ALI14XX)		+= ali14xx.o
-obj-$(CONFIG_BLK_DEV_UMC8672)		+= umc8672.o
-obj-$(CONFIG_BLK_DEV_DTC2278)		+= dtc2278.o
-obj-$(CONFIG_BLK_DEV_HT6560B)		+= ht6560b.o
-obj-$(CONFIG_BLK_DEV_QD65XX)		+= qd65xx.o
-obj-$(CONFIG_BLK_DEV_4DRIVES)		+= ide-4drives.o
-
-obj-$(CONFIG_BLK_DEV_GAYLE)		+= gayle.o
-obj-$(CONFIG_BLK_DEV_FALCON_IDE)	+= falconide.o
-obj-$(CONFIG_BLK_DEV_BUDDHA)		+= buddha.o
-
-obj-$(CONFIG_BLK_DEV_AEC62XX)		+= aec62xx.o
-obj-$(CONFIG_BLK_DEV_ALI15X3)		+= alim15x3.o
-obj-$(CONFIG_BLK_DEV_AMD74XX)		+= amd74xx.o
-obj-$(CONFIG_BLK_DEV_ATIIXP)		+= atiixp.o
-obj-$(CONFIG_BLK_DEV_CMD64X)		+= cmd64x.o
-obj-$(CONFIG_BLK_DEV_CS5520)		+= cs5520.o
-obj-$(CONFIG_BLK_DEV_CS5530)		+= cs5530.o
-obj-$(CONFIG_BLK_DEV_CS5535)		+= cs5535.o
-obj-$(CONFIG_BLK_DEV_CS5536)		+= cs5536.o
-obj-$(CONFIG_BLK_DEV_SC1200)		+= sc1200.o
-obj-$(CONFIG_BLK_DEV_CY82C693)		+= cy82c693.o
-obj-$(CONFIG_BLK_DEV_DELKIN)		+= delkin_cb.o
-obj-$(CONFIG_BLK_DEV_HPT366)		+= hpt366.o
-obj-$(CONFIG_BLK_DEV_IT8172)		+= it8172.o
-obj-$(CONFIG_BLK_DEV_IT8213)		+= it8213.o
-obj-$(CONFIG_BLK_DEV_IT821X)		+= it821x.o
-obj-$(CONFIG_BLK_DEV_JMICRON)		+= jmicron.o
-obj-$(CONFIG_BLK_DEV_NS87415)		+= ns87415.o
-obj-$(CONFIG_BLK_DEV_OPTI621)		+= opti621.o
-obj-$(CONFIG_BLK_DEV_PDC202XX_OLD)	+= pdc202xx_old.o
-obj-$(CONFIG_BLK_DEV_PDC202XX_NEW)	+= pdc202xx_new.o
-obj-$(CONFIG_BLK_DEV_PIIX)		+= piix.o
-obj-$(CONFIG_BLK_DEV_RZ1000)		+= rz1000.o
-obj-$(CONFIG_BLK_DEV_SVWKS)		+= serverworks.o
-obj-$(CONFIG_BLK_DEV_SIIMAGE)		+= siimage.o
-obj-$(CONFIG_BLK_DEV_SIS5513)		+= sis5513.o
-obj-$(CONFIG_BLK_DEV_SL82C105)		+= sl82c105.o
-obj-$(CONFIG_BLK_DEV_SLC90E66)		+= slc90e66.o
-obj-$(CONFIG_BLK_DEV_TC86C001)		+= tc86c001.o
-obj-$(CONFIG_BLK_DEV_TRIFLEX)		+= triflex.o
-obj-$(CONFIG_BLK_DEV_TRM290)		+= trm290.o
-obj-$(CONFIG_BLK_DEV_VIA82CXXX)		+= via82cxxx.o
-
-# Must appear at the end of the block
-obj-$(CONFIG_BLK_DEV_GENERIC)		+= ide-pci-generic.o
-
-obj-$(CONFIG_IDEPCI_PCIBUS_ORDER)	+= ide-scan-pci.o
-
-obj-$(CONFIG_BLK_DEV_CMD640)		+= cmd640.o
-
-obj-$(CONFIG_BLK_DEV_IDE_PMAC)		+= pmac.o
-
-obj-$(CONFIG_IDE_GENERIC)		+= ide-generic.o
-obj-$(CONFIG_BLK_DEV_IDEPNP)		+= ide-pnp.o
-
-ide-gd_mod-y += ide-gd.o
-ide-cd_mod-y += ide-cd.o ide-cd_ioctl.o ide-cd_verbose.o
-
-ifeq ($(CONFIG_IDE_GD_ATA), y)
-	ide-gd_mod-y += ide-disk.o ide-disk_ioctl.o
-ifeq ($(CONFIG_IDE_PROC_FS), y)
-	ide-gd_mod-y += ide-disk_proc.o
-endif
-endif
-
-ifeq ($(CONFIG_IDE_GD_ATAPI), y)
-	ide-gd_mod-y += ide-floppy.o ide-floppy_ioctl.o
-ifeq ($(CONFIG_IDE_PROC_FS), y)
-	ide-gd_mod-y += ide-floppy_proc.o
-endif
-endif
-
-obj-$(CONFIG_IDE_GD)			+= ide-gd_mod.o
-obj-$(CONFIG_BLK_DEV_IDECD)		+= ide-cd_mod.o
-obj-$(CONFIG_BLK_DEV_IDETAPE)		+= ide-tape.o
-
-obj-$(CONFIG_BLK_DEV_IDECS)		+= ide-cs.o
-
-obj-$(CONFIG_BLK_DEV_PLATFORM)		+= ide_platform.o
-
-obj-$(CONFIG_BLK_DEV_IDE_ICSIDE)	+= icside.o
-obj-$(CONFIG_BLK_DEV_IDE_RAPIDE)	+= rapide.o
-obj-$(CONFIG_BLK_DEV_PALMCHIP_BK3710)	+= palm_bk3710.o
-
-obj-$(CONFIG_BLK_DEV_IDE_TX4938)	+= tx4938ide.o
-obj-$(CONFIG_BLK_DEV_IDE_TX4939)	+= tx4939ide.o
diff --git a/drivers/ide/aec62xx.c b/drivers/ide/aec62xx.c
deleted file mode 100644
index 4c959ce41ba9..000000000000
--- a/drivers/ide/aec62xx.c
+++ /dev/null
@@ -1,331 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 1999-2002	Andre Hedrick <andre@linux-ide.org>
- * Copyright (C) 2007		MontaVista Software, Inc. <source@mvista.com>
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "aec62xx"
-
-struct chipset_bus_clock_list_entry {
-	u8 xfer_speed;
-	u8 chipset_settings;
-	u8 ultra_settings;
-};
-
-static const struct chipset_bus_clock_list_entry aec6xxx_33_base [] = {
-	{	XFER_UDMA_6,	0x31,	0x07	},
-	{	XFER_UDMA_5,	0x31,	0x06	},
-	{	XFER_UDMA_4,	0x31,	0x05	},
-	{	XFER_UDMA_3,	0x31,	0x04	},
-	{	XFER_UDMA_2,	0x31,	0x03	},
-	{	XFER_UDMA_1,	0x31,	0x02	},
-	{	XFER_UDMA_0,	0x31,	0x01	},
-
-	{	XFER_MW_DMA_2,	0x31,	0x00	},
-	{	XFER_MW_DMA_1,	0x31,	0x00	},
-	{	XFER_MW_DMA_0,	0x0a,	0x00	},
-	{	XFER_PIO_4,	0x31,	0x00	},
-	{	XFER_PIO_3,	0x33,	0x00	},
-	{	XFER_PIO_2,	0x08,	0x00	},
-	{	XFER_PIO_1,	0x0a,	0x00	},
-	{	XFER_PIO_0,	0x00,	0x00	},
-	{	0,		0x00,	0x00	}
-};
-
-static const struct chipset_bus_clock_list_entry aec6xxx_34_base [] = {
-	{	XFER_UDMA_6,	0x41,	0x06	},
-	{	XFER_UDMA_5,	0x41,	0x05	},
-	{	XFER_UDMA_4,	0x41,	0x04	},
-	{	XFER_UDMA_3,	0x41,	0x03	},
-	{	XFER_UDMA_2,	0x41,	0x02	},
-	{	XFER_UDMA_1,	0x41,	0x01	},
-	{	XFER_UDMA_0,	0x41,	0x01	},
-
-	{	XFER_MW_DMA_2,	0x41,	0x00	},
-	{	XFER_MW_DMA_1,	0x42,	0x00	},
-	{	XFER_MW_DMA_0,	0x7a,	0x00	},
-	{	XFER_PIO_4,	0x41,	0x00	},
-	{	XFER_PIO_3,	0x43,	0x00	},
-	{	XFER_PIO_2,	0x78,	0x00	},
-	{	XFER_PIO_1,	0x7a,	0x00	},
-	{	XFER_PIO_0,	0x70,	0x00	},
-	{	0,		0x00,	0x00	}
-};
-
-/*
- * TO DO: active tuning and correction of cards without a bios.
- */
-static u8 pci_bus_clock_list (u8 speed, struct chipset_bus_clock_list_entry * chipset_table)
-{
-	for ( ; chipset_table->xfer_speed ; chipset_table++)
-		if (chipset_table->xfer_speed == speed) {
-			return chipset_table->chipset_settings;
-		}
-	return chipset_table->chipset_settings;
-}
-
-static u8 pci_bus_clock_list_ultra (u8 speed, struct chipset_bus_clock_list_entry * chipset_table)
-{
-	for ( ; chipset_table->xfer_speed ; chipset_table++)
-		if (chipset_table->xfer_speed == speed) {
-			return chipset_table->ultra_settings;
-		}
-	return chipset_table->ultra_settings;
-}
-
-static void aec6210_set_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	struct ide_host *host	= pci_get_drvdata(dev);
-	struct chipset_bus_clock_list_entry *bus_clock = host->host_priv;
-	u16 d_conf		= 0;
-	u8 ultra = 0, ultra_conf = 0;
-	u8 tmp0 = 0, tmp1 = 0, tmp2 = 0;
-	const u8 speed = drive->dma_mode;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	/* 0x40|(2*drive->dn): Active, 0x41|(2*drive->dn): Recovery */
-	pci_read_config_word(dev, 0x40|(2*drive->dn), &d_conf);
-	tmp0 = pci_bus_clock_list(speed, bus_clock);
-	d_conf = ((tmp0 & 0xf0) << 4) | (tmp0 & 0xf);
-	pci_write_config_word(dev, 0x40|(2*drive->dn), d_conf);
-
-	tmp1 = 0x00;
-	tmp2 = 0x00;
-	pci_read_config_byte(dev, 0x54, &ultra);
-	tmp1 = ((0x00 << (2*drive->dn)) | (ultra & ~(3 << (2*drive->dn))));
-	ultra_conf = pci_bus_clock_list_ultra(speed, bus_clock);
-	tmp2 = ((ultra_conf << (2*drive->dn)) | (tmp1 & ~(3 << (2*drive->dn))));
-	pci_write_config_byte(dev, 0x54, tmp2);
-	local_irq_restore(flags);
-}
-
-static void aec6260_set_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	struct ide_host *host	= pci_get_drvdata(dev);
-	struct chipset_bus_clock_list_entry *bus_clock = host->host_priv;
-	u8 unit			= drive->dn & 1;
-	u8 tmp1 = 0, tmp2 = 0;
-	u8 ultra = 0, drive_conf = 0, ultra_conf = 0;
-	const u8 speed = drive->dma_mode;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	/* high 4-bits: Active, low 4-bits: Recovery */
-	pci_read_config_byte(dev, 0x40|drive->dn, &drive_conf);
-	drive_conf = pci_bus_clock_list(speed, bus_clock);
-	pci_write_config_byte(dev, 0x40|drive->dn, drive_conf);
-
-	pci_read_config_byte(dev, (0x44|hwif->channel), &ultra);
-	tmp1 = ((0x00 << (4*unit)) | (ultra & ~(7 << (4*unit))));
-	ultra_conf = pci_bus_clock_list_ultra(speed, bus_clock);
-	tmp2 = ((ultra_conf << (4*unit)) | (tmp1 & ~(7 << (4*unit))));
-	pci_write_config_byte(dev, (0x44|hwif->channel), tmp2);
-	local_irq_restore(flags);
-}
-
-static void aec_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	drive->dma_mode = drive->pio_mode;
-	hwif->port_ops->set_dma_mode(hwif, drive);
-}
-
-static int init_chipset_aec62xx(struct pci_dev *dev)
-{
-	/* These are necessary to get AEC6280 Macintosh cards to work */
-	if ((dev->device == PCI_DEVICE_ID_ARTOP_ATP865) ||
-	    (dev->device == PCI_DEVICE_ID_ARTOP_ATP865R)) {
-		u8 reg49h = 0, reg4ah = 0;
-		/* Clear reset and test bits.  */
-		pci_read_config_byte(dev, 0x49, &reg49h);
-		pci_write_config_byte(dev, 0x49, reg49h & ~0x30);
-		/* Enable chip interrupt output.  */
-		pci_read_config_byte(dev, 0x4a, &reg4ah);
-		pci_write_config_byte(dev, 0x4a, reg4ah & ~0x01);
-		/* Enable burst mode. */
-		pci_read_config_byte(dev, 0x4a, &reg4ah);
-		pci_write_config_byte(dev, 0x4a, reg4ah | 0x80);
-	}
-
-	return 0;
-}
-
-static u8 atp86x_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u8 ata66 = 0, mask = hwif->channel ? 0x02 : 0x01;
-
-	pci_read_config_byte(dev, 0x49, &ata66);
-
-	return (ata66 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
-}
-
-static const struct ide_port_ops atp850_port_ops = {
-	.set_pio_mode		= aec_set_pio_mode,
-	.set_dma_mode		= aec6210_set_mode,
-};
-
-static const struct ide_port_ops atp86x_port_ops = {
-	.set_pio_mode		= aec_set_pio_mode,
-	.set_dma_mode		= aec6260_set_mode,
-	.cable_detect		= atp86x_cable_detect,
-};
-
-static const struct ide_port_info aec62xx_chipsets[] = {
-	{	/* 0: AEC6210 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_aec62xx,
-		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
-		.port_ops	= &atp850_port_ops,
-		.host_flags	= IDE_HFLAG_SERIALIZE |
-				  IDE_HFLAG_NO_ATAPI_DMA |
-				  IDE_HFLAG_NO_DSC |
-				  IDE_HFLAG_OFF_BOARD,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA2,
-	},
-	{	/* 1: AEC6260 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_aec62xx,
-		.port_ops	= &atp86x_port_ops,
-		.host_flags	= IDE_HFLAG_NO_ATAPI_DMA | IDE_HFLAG_NO_AUTODMA |
-				  IDE_HFLAG_OFF_BOARD,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA4,
-	},
-	{	/* 2: AEC6260R */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_aec62xx,
-		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
-		.port_ops	= &atp86x_port_ops,
-		.host_flags	= IDE_HFLAG_NO_ATAPI_DMA |
-				  IDE_HFLAG_NON_BOOTABLE,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA4,
-	},
-	{	/* 3: AEC6280 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_aec62xx,
-		.port_ops	= &atp86x_port_ops,
-		.host_flags	= IDE_HFLAG_NO_ATAPI_DMA |
-				  IDE_HFLAG_OFF_BOARD,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
-	},
-	{	/* 4: AEC6280R */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_aec62xx,
-		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
-		.port_ops	= &atp86x_port_ops,
-		.host_flags	= IDE_HFLAG_NO_ATAPI_DMA |
-				  IDE_HFLAG_OFF_BOARD,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
-	}
-};
-
-/**
- *	aec62xx_init_one	-	called when a AEC is found
- *	@dev: the aec62xx device
- *	@id: the matching pci id
- *
- *	Called when the PCI registration layer (or the IDE initialization)
- *	finds a device matching our IDE device tables.
- *
- *	NOTE: since we're going to modify the 'name' field for AEC-6[26]80[R]
- *	chips, pass a local copy of 'struct ide_port_info' down the call chain.
- */
-
-static int aec62xx_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	const struct chipset_bus_clock_list_entry *bus_clock;
-	struct ide_port_info d;
-	u8 idx = id->driver_data;
-	int bus_speed = ide_pci_clk ? ide_pci_clk : 33;
-	int err;
-
-	if (bus_speed <= 33)
-		bus_clock = aec6xxx_33_base;
-	else
-		bus_clock = aec6xxx_34_base;
-
-	err = pci_enable_device(dev);
-	if (err)
-		return err;
-
-	d = aec62xx_chipsets[idx];
-
-	if (idx == 3 || idx == 4) {
-		unsigned long dma_base = pci_resource_start(dev, 4);
-
-		if (inb(dma_base + 2) & 0x10) {
-			printk(KERN_INFO DRV_NAME " %s: AEC6880%s card detected"
-				"\n", pci_name(dev), (idx == 4) ? "R" : "");
-			d.udma_mask = ATA_UDMA6;
-		}
-	}
-
-	err = ide_pci_init_one(dev, &d, (void *)bus_clock);
-	if (err)
-		pci_disable_device(dev);
-
-	return err;
-}
-
-static void aec62xx_remove(struct pci_dev *dev)
-{
-	ide_pci_remove(dev);
-	pci_disable_device(dev);
-}
-
-static const struct pci_device_id aec62xx_pci_tbl[] = {
-	{ PCI_VDEVICE(ARTOP, PCI_DEVICE_ID_ARTOP_ATP850UF), 0 },
-	{ PCI_VDEVICE(ARTOP, PCI_DEVICE_ID_ARTOP_ATP860),   1 },
-	{ PCI_VDEVICE(ARTOP, PCI_DEVICE_ID_ARTOP_ATP860R),  2 },
-	{ PCI_VDEVICE(ARTOP, PCI_DEVICE_ID_ARTOP_ATP865),   3 },
-	{ PCI_VDEVICE(ARTOP, PCI_DEVICE_ID_ARTOP_ATP865R),  4 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, aec62xx_pci_tbl);
-
-static struct pci_driver aec62xx_pci_driver = {
-	.name		= "AEC62xx_IDE",
-	.id_table	= aec62xx_pci_tbl,
-	.probe		= aec62xx_init_one,
-	.remove		= aec62xx_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init aec62xx_ide_init(void)
-{
-	return ide_pci_register_driver(&aec62xx_pci_driver);
-}
-
-static void __exit aec62xx_ide_exit(void)
-{
-	pci_unregister_driver(&aec62xx_pci_driver);
-}
-
-module_init(aec62xx_ide_init);
-module_exit(aec62xx_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick");
-MODULE_DESCRIPTION("PCI driver module for ARTOP AEC62xx IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ali14xx.c b/drivers/ide/ali14xx.c
deleted file mode 100644
index 3268931c2c7a..000000000000
--- a/drivers/ide/ali14xx.c
+++ /dev/null
@@ -1,250 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1996  Linus Torvalds & author (see below)
- */
-
-/*
- * ALI M14xx chipset EIDE controller
- *
- * Works for ALI M1439/1443/1445/1487/1489 chipsets.
- *
- * Adapted from code developed by derekn@vw.ece.cmu.edu.  -ml
- * Derek's notes follow:
- *
- * I think the code should be pretty understandable,
- * but I'll be happy to (try to) answer questions.
- *
- * The critical part is in the setupDrive function.  The initRegisters
- * function doesn't seem to be necessary, but the DOS driver does it, so
- * I threw it in.
- *
- * I've only tested this on my system, which only has one disk.  I posted
- * it to comp.sys.linux.hardware, so maybe some other people will try it
- * out.
- *
- * Derek Noonburg  (derekn@ece.cmu.edu)
- * 95-sep-26
- *
- * Update 96-jul-13:
- *
- * I've since upgraded to two disks and a CD-ROM, with no trouble, and
- * I've also heard from several others who have used it successfully.
- * This driver appears to work with both the 1443/1445 and the 1487/1489
- * chipsets.  I've added support for PIO mode 4 for the 1487.  This
- * seems to work just fine on the 1443 also, although I'm not sure it's
- * advertised as supporting mode 4.  (I've been running a WDC AC21200 in
- * mode 4 for a while now with no trouble.)  -Derek
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/ioport.h>
-#include <linux/blkdev.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "ali14xx"
-
-/* port addresses for auto-detection */
-#define ALI_NUM_PORTS 4
-static const int ports[ALI_NUM_PORTS] __initconst =
-	{ 0x074, 0x0f4, 0x034, 0x0e4 };
-
-/* register initialization data */
-typedef struct { u8 reg, data; } RegInitializer;
-
-static const RegInitializer initData[] __initconst = {
-	{0x01, 0x0f}, {0x02, 0x00}, {0x03, 0x00}, {0x04, 0x00},
-	{0x05, 0x00}, {0x06, 0x00}, {0x07, 0x2b}, {0x0a, 0x0f},
-	{0x25, 0x00}, {0x26, 0x00}, {0x27, 0x00}, {0x28, 0x00},
-	{0x29, 0x00}, {0x2a, 0x00}, {0x2f, 0x00}, {0x2b, 0x00},
-	{0x2c, 0x00}, {0x2d, 0x00}, {0x2e, 0x00}, {0x30, 0x00},
-	{0x31, 0x00}, {0x32, 0x00}, {0x33, 0x00}, {0x34, 0xff},
-	{0x35, 0x03}, {0x00, 0x00}
-};
-
-/* timing parameter registers for each drive */
-static struct { u8 reg1, reg2, reg3, reg4; } regTab[4] = {
-	{0x03, 0x26, 0x04, 0x27},     /* drive 0 */
-	{0x05, 0x28, 0x06, 0x29},     /* drive 1 */
-	{0x2b, 0x30, 0x2c, 0x31},     /* drive 2 */
-	{0x2d, 0x32, 0x2e, 0x33},     /* drive 3 */
-};
-
-static int basePort;	/* base port address */
-static int regPort;	/* port for register number */
-static int dataPort;	/* port for register data */
-static u8 regOn;	/* output to base port to access registers */
-static u8 regOff;	/* output to base port to close registers */
-
-/*------------------------------------------------------------------------*/
-
-/*
- * Read a controller register.
- */
-static inline u8 inReg(u8 reg)
-{
-	outb_p(reg, regPort);
-	return inb(dataPort);
-}
-
-/*
- * Write a controller register.
- */
-static void outReg(u8 data, u8 reg)
-{
-	outb_p(reg, regPort);
-	outb_p(data, dataPort);
-}
-
-static DEFINE_SPINLOCK(ali14xx_lock);
-
-/*
- * Set PIO mode for the specified drive.
- * This function computes timing parameters
- * and sets controller registers accordingly.
- */
-static void ali14xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	int driveNum;
-	int time1, time2;
-	u8 param1, param2, param3, param4;
-	unsigned long flags;
-	int bus_speed = ide_vlb_clk ? ide_vlb_clk : 50;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-	struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio);
-
-	/* calculate timing, according to PIO mode */
-	time1 = ide_pio_cycle_time(drive, pio);
-	time2 = t->active;
-	param3 = param1 = (time2 * bus_speed + 999) / 1000;
-	param4 = param2 = (time1 * bus_speed + 999) / 1000 - param1;
-	if (pio < 3) {
-		param3 += 8;
-		param4 += 8;
-	}
-	printk(KERN_DEBUG "%s: PIO mode%d, t1=%dns, t2=%dns, cycles = %d+%d, %d+%d\n",
-		drive->name, pio, time1, time2, param1, param2, param3, param4);
-
-	/* stuff timing parameters into controller registers */
-	driveNum = (drive->hwif->index << 1) + (drive->dn & 1);
-	spin_lock_irqsave(&ali14xx_lock, flags);
-	outb_p(regOn, basePort);
-	outReg(param1, regTab[driveNum].reg1);
-	outReg(param2, regTab[driveNum].reg2);
-	outReg(param3, regTab[driveNum].reg3);
-	outReg(param4, regTab[driveNum].reg4);
-	outb_p(regOff, basePort);
-	spin_unlock_irqrestore(&ali14xx_lock, flags);
-}
-
-/*
- * Auto-detect the IDE controller port.
- */
-static int __init findPort(void)
-{
-	int i;
-	u8 t;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	for (i = 0; i < ALI_NUM_PORTS; ++i) {
-		basePort = ports[i];
-		regOff = inb(basePort);
-		for (regOn = 0x30; regOn <= 0x33; ++regOn) {
-			outb_p(regOn, basePort);
-			if (inb(basePort) == regOn) {
-				regPort = basePort + 4;
-				dataPort = basePort + 8;
-				t = inReg(0) & 0xf0;
-				outb_p(regOff, basePort);
-				local_irq_restore(flags);
-				if (t != 0x50)
-					return 0;
-				return 1;  /* success */
-			}
-		}
-		outb_p(regOff, basePort);
-	}
-	local_irq_restore(flags);
-	return 0;
-}
-
-/*
- * Initialize controller registers with default values.
- */
-static int __init initRegisters(void)
-{
-	const RegInitializer *p;
-	u8 t;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	outb_p(regOn, basePort);
-	for (p = initData; p->reg != 0; ++p)
-		outReg(p->data, p->reg);
-	outb_p(0x01, regPort);
-	t = inb(regPort) & 0x01;
-	outb_p(regOff, basePort);
-	local_irq_restore(flags);
-	return t;
-}
-
-static const struct ide_port_ops ali14xx_port_ops = {
-	.set_pio_mode		= ali14xx_set_pio_mode,
-};
-
-static const struct ide_port_info ali14xx_port_info = {
-	.name			= DRV_NAME,
-	.chipset		= ide_ali14xx,
-	.port_ops		= &ali14xx_port_ops,
-	.host_flags		= IDE_HFLAG_NO_DMA,
-	.pio_mask		= ATA_PIO4,
-};
-
-static int __init ali14xx_probe(void)
-{
-	printk(KERN_DEBUG "ali14xx: base=0x%03x, regOn=0x%02x.\n",
-			  basePort, regOn);
-
-	/* initialize controller registers */
-	if (!initRegisters()) {
-		printk(KERN_ERR "ali14xx: Chip initialization failed.\n");
-		return 1;
-	}
-
-	return ide_legacy_device_add(&ali14xx_port_info, 0);
-}
-
-static bool probe_ali14xx;
-
-module_param_named(probe, probe_ali14xx, bool, 0);
-MODULE_PARM_DESC(probe, "probe for ALI M14xx chipsets");
-
-static int __init ali14xx_init(void)
-{
-	if (probe_ali14xx == 0)
-		goto out;
-
-	/* auto-detect IDE controller port */
-	if (findPort()) {
-		if (ali14xx_probe())
-			return -ENODEV;
-		return 0;
-	}
-	printk(KERN_ERR "ali14xx: not found.\n");
-out:
-	return -ENODEV;
-}
-
-module_init(ali14xx_init);
-
-MODULE_AUTHOR("see local file");
-MODULE_DESCRIPTION("support of ALI 14XX IDE chipsets");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
deleted file mode 100644
index 3265970aee34..000000000000
--- a/drivers/ide/alim15x3.c
+++ /dev/null
@@ -1,602 +0,0 @@
-/*
- *  Copyright (C) 1998-2000 Michel Aubry, Maintainer
- *  Copyright (C) 1998-2000 Andrzej Krzysztofowicz, Maintainer
- *  Copyright (C) 1999-2000 CJ, cjtsai@ali.com.tw, Maintainer
- *
- *  Copyright (C) 1998-2000 Andre Hedrick (andre@linux-ide.org)
- *  May be copied or modified under the terms of the GNU General Public License
- *  Copyright (C) 2002 Alan Cox
- *  ALi (now ULi M5228) support by Clear Zhang <Clear.Zhang@ali.com.tw>
- *  Copyright (C) 2007 MontaVista Software, Inc. <source@mvista.com>
- *  Copyright (C) 2007-2010 Bartlomiej Zolnierkiewicz
- *
- *  (U)DMA capable version of ali 1533/1543(C), 1535(D)
- *
- **********************************************************************
- *  9/7/99 --Parts from the above author are included and need to be
- *  converted into standard interface, once I finish the thought.
- *
- *  Recent changes
- *	Don't use LBA48 mode on ALi <= 0xC4
- *	Don't poke 0x79 with a non ALi northbridge
- *	Don't flip undefined bits on newer chipsets (fix Fujitsu laptop hang)
- *	Allow UDMA6 on revisions > 0xC4
- *
- *  Documentation
- *	Chipset documentation available under NDA only
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "alim15x3"
-
-/*
- *	ALi devices are not plug in. Otherwise these static values would
- *	need to go. They ought to go away anyway
- */
- 
-static u8 m5229_revision;
-static u8 chip_is_1543c_e;
-static struct pci_dev *isa_dev;
-
-static void ali_fifo_control(ide_hwif_t *hwif, ide_drive_t *drive, int on)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	int pio_fifo = 0x54 + hwif->channel;
-	u8 fifo;
-	int shift = 4 * (drive->dn & 1);
-
-	pci_read_config_byte(pdev, pio_fifo, &fifo);
-	fifo &= ~(0x0F << shift);
-	fifo |= (on << shift);
-	pci_write_config_byte(pdev, pio_fifo, fifo);
-}
-
-static void ali_program_timings(ide_hwif_t *hwif, ide_drive_t *drive,
-				struct ide_timing *t, u8 ultra)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	int port = hwif->channel ? 0x5c : 0x58;
-	int udmat = 0x56 + hwif->channel;
-	u8 unit = drive->dn & 1, udma;
-	int shift = 4 * unit;
-
-	/* Set up the UDMA */
-	pci_read_config_byte(dev, udmat, &udma);
-	udma &= ~(0x0F << shift);
-	udma |= ultra << shift;
-	pci_write_config_byte(dev, udmat, udma);
-
-	if (t == NULL)
-		return;
-
-	t->setup = clamp_val(t->setup, 1, 8) & 7;
-	t->act8b = clamp_val(t->act8b, 1, 8) & 7;
-	t->rec8b = clamp_val(t->rec8b, 1, 16) & 15;
-	t->active = clamp_val(t->active, 1, 8) & 7;
-	t->recover = clamp_val(t->recover, 1, 16) & 15;
-
-	pci_write_config_byte(dev, port, t->setup);
-	pci_write_config_byte(dev, port + 1, (t->act8b << 4) | t->rec8b);
-	pci_write_config_byte(dev, port + unit + 2,
-			      (t->active << 4) | t->recover);
-}
-
-/**
- *	ali_set_pio_mode	-	set host controller for PIO mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Program the controller for the given PIO mode.
- */
-
-static void ali_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	ide_drive_t *pair = ide_get_pair_dev(drive);
-	int bus_speed = ide_pci_clk ? ide_pci_clk : 33;
-	unsigned long T =  1000000 / bus_speed; /* PCI clock based */
-	struct ide_timing t;
-
-	ide_timing_compute(drive, drive->pio_mode, &t, T, 1);
-	if (pair) {
-		struct ide_timing p;
-
-		ide_timing_compute(pair, pair->pio_mode, &p, T, 1);
-		ide_timing_merge(&p, &t, &t,
-			IDE_TIMING_SETUP | IDE_TIMING_8BIT);
-		if (pair->dma_mode) {
-			ide_timing_compute(pair, pair->dma_mode, &p, T, 1);
-			ide_timing_merge(&p, &t, &t,
-				IDE_TIMING_SETUP | IDE_TIMING_8BIT);
-		}
-	}
-
-	/* 
-	 * PIO mode => ATA FIFO on, ATAPI FIFO off
-	 */
-	ali_fifo_control(hwif, drive, (drive->media == ide_disk) ? 0x05 : 0x00);
-
-	ali_program_timings(hwif, drive, &t, 0);
-}
-
-/**
- *	ali_udma_filter		-	compute UDMA mask
- *	@drive: IDE device
- *
- *	Return available UDMA modes.
- *
- *	The actual rules for the ALi are:
- *		No UDMA on revisions <= 0x20
- *		Disk only for revisions < 0xC2
- *		Not WDC drives on M1543C-E (?)
- */
-
-static u8 ali_udma_filter(ide_drive_t *drive)
-{
-	if (m5229_revision > 0x20 && m5229_revision < 0xC2) {
-		if (drive->media != ide_disk)
-			return 0;
-		if (chip_is_1543c_e &&
-		    strstr((char *)&drive->id[ATA_ID_PROD], "WDC "))
-			return 0;
-	}
-
-	return drive->hwif->ultra_mask;
-}
-
-/**
- *	ali_set_dma_mode	-	set host controller for DMA mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Configure the hardware for the desired IDE transfer mode.
- */
-
-static void ali_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	static u8 udma_timing[7] = { 0xC, 0xB, 0xA, 0x9, 0x8, 0xF, 0xD };
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	ide_drive_t *pair	= ide_get_pair_dev(drive);
-	int bus_speed		= ide_pci_clk ? ide_pci_clk : 33;
-	unsigned long T		=  1000000 / bus_speed; /* PCI clock based */
-	const u8 speed		= drive->dma_mode;
-	u8 tmpbyte		= 0x00;
-	struct ide_timing t;
-
-	if (speed < XFER_UDMA_0) {
-		ide_timing_compute(drive, drive->dma_mode, &t, T, 1);
-		if (pair) {
-			struct ide_timing p;
-
-			ide_timing_compute(pair, pair->pio_mode, &p, T, 1);
-			ide_timing_merge(&p, &t, &t,
-				IDE_TIMING_SETUP | IDE_TIMING_8BIT);
-			if (pair->dma_mode) {
-				ide_timing_compute(pair, pair->dma_mode,
-						&p, T, 1);
-				ide_timing_merge(&p, &t, &t,
-					IDE_TIMING_SETUP | IDE_TIMING_8BIT);
-			}
-		}
-		ali_program_timings(hwif, drive, &t, 0);
-	} else {
-		ali_program_timings(hwif, drive, NULL,
-				udma_timing[speed - XFER_UDMA_0]);
-		if (speed >= XFER_UDMA_3) {
-			pci_read_config_byte(dev, 0x4b, &tmpbyte);
-			tmpbyte |= 1;
-			pci_write_config_byte(dev, 0x4b, tmpbyte);
-		}
-	}
-}
-
-/**
- *	ali_dma_check	-	DMA check
- *	@drive:	target device
- *	@cmd: command
- *
- *	Returns 1 if the DMA cannot be performed, zero on success.
- */
-
-static int ali_dma_check(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	if (m5229_revision < 0xC2 && drive->media != ide_disk) {
-		if (cmd->tf_flags & IDE_TFLAG_WRITE)
-			return 1;	/* try PIO instead of DMA */
-	}
-	return 0;
-}
-
-/**
- *	init_chipset_ali15x3	-	Initialise an ALi IDE controller
- *	@dev: PCI device
- *
- *	This function initializes the ALI IDE controller and where 
- *	appropriate also sets up the 1533 southbridge.
- */
-
-static int init_chipset_ali15x3(struct pci_dev *dev)
-{
-	unsigned long flags;
-	u8 tmpbyte;
-	struct pci_dev *north = pci_get_slot(dev->bus, PCI_DEVFN(0,0));
-
-	m5229_revision = dev->revision;
-
-	isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
-
-	local_irq_save(flags);
-
-	if (m5229_revision < 0xC2) {
-		/*
-		 * revision 0x20 (1543-E, 1543-F)
-		 * revision 0xC0, 0xC1 (1543C-C, 1543C-D, 1543C-E)
-		 * clear CD-ROM DMA write bit, m5229, 0x4b, bit 7
-		 */
-		pci_read_config_byte(dev, 0x4b, &tmpbyte);
-		/*
-		 * clear bit 7
-		 */
-		pci_write_config_byte(dev, 0x4b, tmpbyte & 0x7F);
-		/*
-		 * check m1533, 0x5e, bit 1~4 == 1001 => & 00011110 = 00010010
-		 */
-		if (m5229_revision >= 0x20 && isa_dev) {
-			pci_read_config_byte(isa_dev, 0x5e, &tmpbyte);
-			chip_is_1543c_e = ((tmpbyte & 0x1e) == 0x12) ? 1: 0;
-		}
-		goto out;
-	}
-
-	/*
-	 * 1543C-B?, 1535, 1535D, 1553
-	 * Note 1: not all "motherboard" support this detection
-	 * Note 2: if no udma 66 device, the detection may "error".
-	 *         but in this case, we will not set the device to
-	 *         ultra 66, the detection result is not important
-	 */
-
-	/*
-	 * enable "Cable Detection", m5229, 0x4b, bit3
-	 */
-	pci_read_config_byte(dev, 0x4b, &tmpbyte);
-	pci_write_config_byte(dev, 0x4b, tmpbyte | 0x08);
-
-	/*
-	 * We should only tune the 1533 enable if we are using an ALi
-	 * North bridge. We might have no north found on some zany
-	 * box without a device at 0:0.0. The ALi bridge will be at
-	 * 0:0.0 so if we didn't find one we know what is cooking.
-	 */
-	if (north && north->vendor != PCI_VENDOR_ID_AL)
-		goto out;
-
-	if (m5229_revision < 0xC5 && isa_dev)
-	{	
-		/*
-		 * set south-bridge's enable bit, m1533, 0x79
-		 */
-
-		pci_read_config_byte(isa_dev, 0x79, &tmpbyte);
-		if (m5229_revision == 0xC2) {
-			/*
-			 * 1543C-B0 (m1533, 0x79, bit 2)
-			 */
-			pci_write_config_byte(isa_dev, 0x79, tmpbyte | 0x04);
-		} else if (m5229_revision >= 0xC3) {
-			/*
-			 * 1553/1535 (m1533, 0x79, bit 1)
-			 */
-			pci_write_config_byte(isa_dev, 0x79, tmpbyte | 0x02);
-		}
-	}
-
-out:
-	/*
-	 * CD_ROM DMA on (m5229, 0x53, bit0)
-	 *      Enable this bit even if we want to use PIO.
-	 * PIO FIFO off (m5229, 0x53, bit1)
-	 *      The hardware will use 0x54h and 0x55h to control PIO FIFO.
-	 *	(Not on later devices it seems)
-	 *
-	 *	0x53 changes meaning on later revs - we must no touch
-	 *	bit 1 on them.  Need to check if 0x20 is the right break.
-	 */
-	if (m5229_revision >= 0x20) {
-		pci_read_config_byte(dev, 0x53, &tmpbyte);
-
-		if (m5229_revision <= 0x20)
-			tmpbyte = (tmpbyte & (~0x02)) | 0x01;
-		else if (m5229_revision == 0xc7 || m5229_revision == 0xc8)
-			tmpbyte |= 0x03;
-		else
-			tmpbyte |= 0x01;
-
-		pci_write_config_byte(dev, 0x53, tmpbyte);
-	}
-	local_irq_restore(flags);
-	pci_dev_put(north);
-	pci_dev_put(isa_dev);
-	return 0;
-}
-
-/*
- *	Cable special cases
- */
-
-static const struct dmi_system_id cable_dmi_table[] = {
-	{
-		.ident = "HP Pavilion N5430",
-		.matches = {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
-			DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
-		},
-	},
-	{
-		.ident = "Toshiba Satellite S1800-814",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "S1800-814"),
-		},
-	},
-	{ }
-};
-
-static int ali_cable_override(struct pci_dev *pdev)
-{
-	/* Fujitsu P2000 */
-	if (pdev->subsystem_vendor == 0x10CF &&
-	    pdev->subsystem_device == 0x10AF)
-		return 1;
-
-	/* Mitac 8317 (Winbook-A) and relatives */
-	if (pdev->subsystem_vendor == 0x1071 &&
-	    pdev->subsystem_device == 0x8317)
-		return 1;
-
-	/* Systems by DMI */
-	if (dmi_check_system(cable_dmi_table))
-		return 1;
-
-	return 0;
-}
-
-/**
- *	ali_cable_detect	-	cable detection
- *	@hwif: IDE interface
- *
- *	This checks if the controller and the cable are capable
- *	of UDMA66 transfers. It doesn't check the drives.
- */
-
-static u8 ali_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u8 cbl = ATA_CBL_PATA40, tmpbyte;
-
-	if (m5229_revision >= 0xC2) {
-		/*
-		 * m5229 80-pin cable detection (from Host View)
-		 *
-		 * 0x4a bit0 is 0 => primary channel has 80-pin
-		 * 0x4a bit1 is 0 => secondary channel has 80-pin
-		 *
-		 * Certain laptops use short but suitable cables
-		 * and don't implement the detect logic.
-		 */
-		if (ali_cable_override(dev))
-			cbl = ATA_CBL_PATA40_SHORT;
-		else {
-			pci_read_config_byte(dev, 0x4a, &tmpbyte);
-			if ((tmpbyte & (1 << hwif->channel)) == 0)
-				cbl = ATA_CBL_PATA80;
-		}
-	}
-
-	return cbl;
-}
-
-#ifndef CONFIG_SPARC64
-/**
- *	init_hwif_ali15x3	-	Initialize the ALI IDE x86 stuff
- *	@hwif: interface to configure
- *
- *	Obtain the IRQ tables for an ALi based IDE solution on the PC
- *	class platforms. This part of the code isn't applicable to the
- *	Sparc systems.
- */
-
-static void init_hwif_ali15x3(ide_hwif_t *hwif)
-{
-	u8 ideic, inmir;
-	s8 irq_routing_table[] = { -1,  9, 3, 10, 4,  5, 7,  6,
-				      1, 11, 0, 12, 0, 14, 0, 15 };
-	int irq = -1;
-
-	if (isa_dev) {
-		/*
-		 * read IDE interface control
-		 */
-		pci_read_config_byte(isa_dev, 0x58, &ideic);
-
-		/* bit0, bit1 */
-		ideic = ideic & 0x03;
-
-		/* get IRQ for IDE Controller */
-		if ((hwif->channel && ideic == 0x03) ||
-		    (!hwif->channel && !ideic)) {
-			/*
-			 * get SIRQ1 routing table
-			 */
-			pci_read_config_byte(isa_dev, 0x44, &inmir);
-			inmir = inmir & 0x0f;
-			irq = irq_routing_table[inmir];
-		} else if (hwif->channel && !(ideic & 0x01)) {
-			/*
-			 * get SIRQ2 routing table
-			 */
-			pci_read_config_byte(isa_dev, 0x75, &inmir);
-			inmir = inmir & 0x0f;
-			irq = irq_routing_table[inmir];
-		}
-		if(irq >= 0)
-			hwif->irq = irq;
-	}
-}
-#else
-#define init_hwif_ali15x3 NULL
-#endif /* CONFIG_SPARC64 */
-
-/**
- *	init_dma_ali15x3	-	set up DMA on ALi15x3
- *	@hwif: IDE interface
- *	@d: IDE port info
- *
- *	Set up the DMA functionality on the ALi 15x3.
- */
-
-static int init_dma_ali15x3(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	unsigned long base = ide_pci_dma_base(hwif, d);
-
-	if (base == 0)
-		return -1;
-
-	hwif->dma_base = base;
-
-	if (ide_pci_check_simplex(hwif, d) < 0)
-		return -1;
-
-	if (ide_pci_set_master(dev, d->name) < 0)
-		return -1;
-
-	if (!hwif->channel)
-		outb(inb(base + 2) & 0x60, base + 2);
-
-	printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
-			 hwif->name, base, base + 7);
-
-	if (ide_allocate_dma_engine(hwif))
-		return -1;
-
-	return 0;
-}
-
-static const struct ide_port_ops ali_port_ops = {
-	.set_pio_mode		= ali_set_pio_mode,
-	.set_dma_mode		= ali_set_dma_mode,
-	.udma_filter		= ali_udma_filter,
-	.cable_detect		= ali_cable_detect,
-};
-
-static const struct ide_dma_ops ali_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= ide_dma_start,
-	.dma_end		= ide_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_check		= ali_dma_check,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_port_info ali15x3_chipset = {
-	.name		= DRV_NAME,
-	.init_chipset	= init_chipset_ali15x3,
-	.init_hwif	= init_hwif_ali15x3,
-	.init_dma	= init_dma_ali15x3,
-	.port_ops	= &ali_port_ops,
-	.dma_ops	= &sff_dma_ops,
-	.pio_mask	= ATA_PIO5,
-	.swdma_mask	= ATA_SWDMA2,
-	.mwdma_mask	= ATA_MWDMA2,
-};
-
-/**
- *	alim15x3_init_one	-	set up an ALi15x3 IDE controller
- *	@dev: PCI device to set up
- *
- *	Perform the actual set up for an ALi15x3 that has been found by the
- *	hot plug layer.
- */
- 
-static int alim15x3_init_one(struct pci_dev *dev,
-			     const struct pci_device_id *id)
-{
-	struct ide_port_info d = ali15x3_chipset;
-	u8 rev = dev->revision, idx = id->driver_data;
-
-	/* don't use LBA48 DMA on ALi devices before rev 0xC5 */
-	if (rev <= 0xC4)
-		d.host_flags |= IDE_HFLAG_NO_LBA48_DMA;
-
-	if (rev >= 0x20) {
-		if (rev == 0x20)
-			d.host_flags |= IDE_HFLAG_NO_ATAPI_DMA;
-
-		if (rev < 0xC2)
-			d.udma_mask = ATA_UDMA2;
-		else if (rev == 0xC2 || rev == 0xC3)
-			d.udma_mask = ATA_UDMA4;
-		else if (rev == 0xC4)
-			d.udma_mask = ATA_UDMA5;
-		else
-			d.udma_mask = ATA_UDMA6;
-
-		d.dma_ops = &ali_dma_ops;
-	} else {
-		d.host_flags |= IDE_HFLAG_NO_DMA;
-
-		d.mwdma_mask = d.swdma_mask = 0;
-	}
-
-	if (idx == 0)
-		d.host_flags |= IDE_HFLAG_CLEAR_SIMPLEX;
-
-	return ide_pci_init_one(dev, &d, NULL);
-}
-
-
-static const struct pci_device_id alim15x3_pci_tbl[] = {
-	{ PCI_VDEVICE(AL, PCI_DEVICE_ID_AL_M5229), 0 },
-	{ PCI_VDEVICE(AL, PCI_DEVICE_ID_AL_M5228), 1 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, alim15x3_pci_tbl);
-
-static struct pci_driver alim15x3_pci_driver = {
-	.name		= "ALI15x3_IDE",
-	.id_table	= alim15x3_pci_tbl,
-	.probe		= alim15x3_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init ali15x3_ide_init(void)
-{
-	return ide_pci_register_driver(&alim15x3_pci_driver);
-}
-
-static void __exit ali15x3_ide_exit(void)
-{
-	pci_unregister_driver(&alim15x3_pci_driver);
-}
-
-module_init(ali15x3_ide_init);
-module_exit(ali15x3_ide_exit);
-
-MODULE_AUTHOR("Michael Aubry, Andrzej Krzysztofowicz, CJ, Andre Hedrick, Alan Cox, Bartlomiej Zolnierkiewicz");
-MODULE_DESCRIPTION("PCI driver module for ALi 15x3 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/amd74xx.c b/drivers/ide/amd74xx.c
deleted file mode 100644
index 7340597a373e..000000000000
--- a/drivers/ide/amd74xx.c
+++ /dev/null
@@ -1,343 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * AMD 755/756/766/8111 and nVidia nForce/2/2s/3/3s/CK804/MCP04
- * IDE driver for Linux.
- *
- * Copyright (c) 2000-2002 Vojtech Pavlik
- * Copyright (c) 2007-2010 Bartlomiej Zolnierkiewicz
- *
- * Based on the work of:
- *      Andre Hedrick
- */
-
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-
-#define DRV_NAME "amd74xx"
-
-enum {
-	AMD_IDE_CONFIG		= 0x41,
-	AMD_CABLE_DETECT	= 0x42,
-	AMD_DRIVE_TIMING	= 0x48,
-	AMD_8BIT_TIMING		= 0x4e,
-	AMD_ADDRESS_SETUP	= 0x4c,
-	AMD_UDMA_TIMING		= 0x50,
-};
-
-static unsigned int amd_80w;
-static unsigned int amd_clock;
-
-static char *amd_dma[] = { "16", "25", "33", "44", "66", "100", "133" };
-static unsigned char amd_cyc2udma[] = { 6, 6, 5, 4, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 7 };
-
-static inline u8 amd_offset(struct pci_dev *dev)
-{
-	return (dev->vendor == PCI_VENDOR_ID_NVIDIA) ? 0x10 : 0;
-}
-
-/*
- * amd_set_speed() writes timing values to the chipset registers
- */
-
-static void amd_set_speed(struct pci_dev *dev, u8 dn, u8 udma_mask,
-			  struct ide_timing *timing)
-{
-	u8 t = 0, offset = amd_offset(dev);
-
-	pci_read_config_byte(dev, AMD_ADDRESS_SETUP + offset, &t);
-	t = (t & ~(3 << ((3 - dn) << 1))) | ((clamp_val(timing->setup, 1, 4) - 1) << ((3 - dn) << 1));
-	pci_write_config_byte(dev, AMD_ADDRESS_SETUP + offset, t);
-
-	pci_write_config_byte(dev, AMD_8BIT_TIMING + offset + (1 - (dn >> 1)),
-		((clamp_val(timing->act8b, 1, 16) - 1) << 4) | (clamp_val(timing->rec8b, 1, 16) - 1));
-
-	pci_write_config_byte(dev, AMD_DRIVE_TIMING + offset + (3 - dn),
-		((clamp_val(timing->active, 1, 16) - 1) << 4) | (clamp_val(timing->recover, 1, 16) - 1));
-
-	switch (udma_mask) {
-	case ATA_UDMA2: t = timing->udma ? (0xc0 | (clamp_val(timing->udma, 2, 5) - 2)) : 0x03; break;
-	case ATA_UDMA4: t = timing->udma ? (0xc0 | amd_cyc2udma[clamp_val(timing->udma, 2, 10)]) : 0x03; break;
-	case ATA_UDMA5: t = timing->udma ? (0xc0 | amd_cyc2udma[clamp_val(timing->udma, 1, 10)]) : 0x03; break;
-	case ATA_UDMA6: t = timing->udma ? (0xc0 | amd_cyc2udma[clamp_val(timing->udma, 1, 15)]) : 0x03; break;
-	default: return;
-	}
-
-	if (timing->udma)
-		pci_write_config_byte(dev, AMD_UDMA_TIMING + offset + 3 - dn, t);
-}
-
-/*
- * amd_set_drive() computes timing values and configures the chipset
- * to a desired transfer mode.  It also can be called by upper layers.
- */
-
-static void amd_set_drive(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	ide_drive_t *peer = ide_get_pair_dev(drive);
-	struct ide_timing t, p;
-	int T, UT;
-	u8 udma_mask = hwif->ultra_mask;
-	const u8 speed = drive->dma_mode;
-
-	T = 1000000000 / amd_clock;
-	UT = (udma_mask == ATA_UDMA2) ? T : (T / 2);
-
-	ide_timing_compute(drive, speed, &t, T, UT);
-
-	if (peer) {
-		ide_timing_compute(peer, peer->pio_mode, &p, T, UT);
-		ide_timing_merge(&p, &t, &t, IDE_TIMING_8BIT);
-	}
-
-	if (speed == XFER_UDMA_5 && amd_clock <= 33333) t.udma = 1;
-	if (speed == XFER_UDMA_6 && amd_clock <= 33333) t.udma = 15;
-
-	amd_set_speed(dev, drive->dn, udma_mask, &t);
-}
-
-/*
- * amd_set_pio_mode() is a callback from upper layers for PIO-only tuning.
- */
-
-static void amd_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	drive->dma_mode = drive->pio_mode;
-	amd_set_drive(hwif, drive);
-}
-
-static void amd7409_cable_detect(struct pci_dev *dev)
-{
-	/* no host side cable detection */
-	amd_80w = 0x03;
-}
-
-static void amd7411_cable_detect(struct pci_dev *dev)
-{
-	int i;
-	u32 u = 0;
-	u8 t = 0, offset = amd_offset(dev);
-
-	pci_read_config_byte(dev, AMD_CABLE_DETECT + offset, &t);
-	pci_read_config_dword(dev, AMD_UDMA_TIMING + offset, &u);
-	amd_80w = ((t & 0x3) ? 1 : 0) | ((t & 0xc) ? 2 : 0);
-	for (i = 24; i >= 0; i -= 8)
-		if (((u >> i) & 4) && !(amd_80w & (1 << (1 - (i >> 4))))) {
-			printk(KERN_WARNING DRV_NAME " %s: BIOS didn't set "
-				"cable bits correctly. Enabling workaround.\n",
-				pci_name(dev));
-			amd_80w |= (1 << (1 - (i >> 4)));
-		}
-}
-
-/*
- * The initialization callback.  Initialize drive independent registers.
- */
-
-static int init_chipset_amd74xx(struct pci_dev *dev)
-{
-	u8 t = 0, offset = amd_offset(dev);
-
-/*
- * Check 80-wire cable presence.
- */
-
-	if (dev->vendor == PCI_VENDOR_ID_AMD &&
-	    dev->device == PCI_DEVICE_ID_AMD_COBRA_7401)
-		; /* no UDMA > 2 */
-	else if (dev->vendor == PCI_VENDOR_ID_AMD &&
-		 dev->device == PCI_DEVICE_ID_AMD_VIPER_7409)
-		amd7409_cable_detect(dev);
-	else
-		amd7411_cable_detect(dev);
-
-/*
- * Take care of prefetch & postwrite.
- */
-
-	pci_read_config_byte(dev, AMD_IDE_CONFIG + offset, &t);
-	/*
-	 * Check for broken FIFO support.
-	 */
-	if (dev->vendor == PCI_VENDOR_ID_AMD &&
-	    dev->device == PCI_DEVICE_ID_AMD_VIPER_7411)
-		t &= 0x0f;
-	else
-		t |= 0xf0;
-	pci_write_config_byte(dev, AMD_IDE_CONFIG + offset, t);
-
-	return 0;
-}
-
-static u8 amd_cable_detect(ide_hwif_t *hwif)
-{
-	if ((amd_80w >> hwif->channel) & 1)
-		return ATA_CBL_PATA80;
-	else
-		return ATA_CBL_PATA40;
-}
-
-static const struct ide_port_ops amd_port_ops = {
-	.set_pio_mode		= amd_set_pio_mode,
-	.set_dma_mode		= amd_set_drive,
-	.cable_detect		= amd_cable_detect,
-};
-
-#define IDE_HFLAGS_AMD \
-	(IDE_HFLAG_PIO_NO_BLACKLIST | \
-	 IDE_HFLAG_POST_SET_MODE | \
-	 IDE_HFLAG_IO_32BIT | \
-	 IDE_HFLAG_UNMASK_IRQS)
-
-#define DECLARE_AMD_DEV(swdma, udma)				\
-	{								\
-		.name		= DRV_NAME,				\
-		.init_chipset	= init_chipset_amd74xx,			\
-		.enablebits	= {{0x40,0x02,0x02}, {0x40,0x01,0x01}},	\
-		.port_ops	= &amd_port_ops,			\
-		.host_flags	= IDE_HFLAGS_AMD,			\
-		.pio_mask	= ATA_PIO5,				\
-		.swdma_mask	= swdma,				\
-		.mwdma_mask	= ATA_MWDMA2,				\
-		.udma_mask	= udma,					\
-	}
-
-#define DECLARE_NV_DEV(udma)					\
-	{								\
-		.name		= DRV_NAME,				\
-		.init_chipset	= init_chipset_amd74xx,			\
-		.enablebits	= {{0x50,0x02,0x02}, {0x50,0x01,0x01}},	\
-		.port_ops	= &amd_port_ops,			\
-		.host_flags	= IDE_HFLAGS_AMD,			\
-		.pio_mask	= ATA_PIO5,				\
-		.swdma_mask	= ATA_SWDMA2,				\
-		.mwdma_mask	= ATA_MWDMA2,				\
-		.udma_mask	= udma,					\
-	}
-
-static const struct ide_port_info amd74xx_chipsets[] = {
-	/* 0: AMD7401 */	DECLARE_AMD_DEV(0x00, ATA_UDMA2),
-	/* 1: AMD7409 */	DECLARE_AMD_DEV(ATA_SWDMA2, ATA_UDMA4),
-	/* 2: AMD7411/7441 */	DECLARE_AMD_DEV(ATA_SWDMA2, ATA_UDMA5),
-	/* 3: AMD8111 */	DECLARE_AMD_DEV(ATA_SWDMA2, ATA_UDMA6),
-
-	/* 4: NFORCE */		DECLARE_NV_DEV(ATA_UDMA5),
-	/* 5: >= NFORCE2 */	DECLARE_NV_DEV(ATA_UDMA6),
-
-	/* 6: AMD5536 */	DECLARE_AMD_DEV(ATA_SWDMA2, ATA_UDMA5),
-};
-
-static int amd74xx_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct ide_port_info d;
-	u8 idx = id->driver_data;
-
-	d = amd74xx_chipsets[idx];
-
-	/*
-	 * Check for bad SWDMA and incorrectly wired Serenade mainboards.
-	 */
-	if (idx == 1) {
-		if (dev->revision <= 7)
-			d.swdma_mask = 0;
-		d.host_flags |= IDE_HFLAG_CLEAR_SIMPLEX;
-	} else if (idx == 3) {
-		if (dev->subsystem_vendor == PCI_VENDOR_ID_AMD &&
-		    dev->subsystem_device == PCI_DEVICE_ID_AMD_SERENADE)
-			d.udma_mask = ATA_UDMA5;
-	}
-
-	/*
-	 * It seems that on some nVidia controllers using AltStatus
-	 * register can be unreliable so default to Status register
-	 * if the device is in Compatibility Mode.
-	 */
-	if (dev->vendor == PCI_VENDOR_ID_NVIDIA &&
-	    ide_pci_is_in_compatibility_mode(dev))
-		d.host_flags |= IDE_HFLAG_BROKEN_ALTSTATUS;
-
-	printk(KERN_INFO "%s %s: UDMA%s controller\n",
-		d.name, pci_name(dev), amd_dma[fls(d.udma_mask) - 1]);
-
-	/*
-	* Determine the system bus clock.
-	*/
-	amd_clock = (ide_pci_clk ? ide_pci_clk : 33) * 1000;
-
-	switch (amd_clock) {
-	case 33000: amd_clock = 33333; break;
-	case 37000: amd_clock = 37500; break;
-	case 41000: amd_clock = 41666; break;
-	}
-
-	if (amd_clock < 20000 || amd_clock > 50000) {
-		printk(KERN_WARNING "%s: User given PCI clock speed impossible"
-				    " (%d), using 33 MHz instead.\n",
-				    d.name, amd_clock);
-		amd_clock = 33333;
-	}
-
-	return ide_pci_init_one(dev, &d, NULL);
-}
-
-static const struct pci_device_id amd74xx_pci_tbl[] = {
-	{ PCI_VDEVICE(AMD,	PCI_DEVICE_ID_AMD_COBRA_7401),		 0 },
-	{ PCI_VDEVICE(AMD,	PCI_DEVICE_ID_AMD_VIPER_7409),		 1 },
-	{ PCI_VDEVICE(AMD,	PCI_DEVICE_ID_AMD_VIPER_7411),		 2 },
-	{ PCI_VDEVICE(AMD,	PCI_DEVICE_ID_AMD_OPUS_7441),		 2 },
-	{ PCI_VDEVICE(AMD,	PCI_DEVICE_ID_AMD_8111_IDE),		 3 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_IDE),	 4 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE2S_IDE),	 5 },
-#ifdef CONFIG_BLK_DEV_IDE_SATA
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE2S_SATA),	 5 },
-#endif
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE3_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE3S_IDE),	 5 },
-#ifdef CONFIG_BLK_DEV_IDE_SATA
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE3S_SATA2),	 5 },
-#endif
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_MCP61_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_MCP65_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_MCP67_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE),	 5 },
-	{ PCI_VDEVICE(NVIDIA,	PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE),	 5 },
-	{ PCI_VDEVICE(AMD,	PCI_DEVICE_ID_AMD_CS5536_IDE),		 6 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, amd74xx_pci_tbl);
-
-static struct pci_driver amd74xx_pci_driver = {
-	.name		= "AMD_IDE",
-	.id_table	= amd74xx_pci_tbl,
-	.probe		= amd74xx_probe,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init amd74xx_ide_init(void)
-{
-	return ide_pci_register_driver(&amd74xx_pci_driver);
-}
-
-static void __exit amd74xx_ide_exit(void)
-{
-	pci_unregister_driver(&amd74xx_pci_driver);
-}
-
-module_init(amd74xx_ide_init);
-module_exit(amd74xx_ide_exit);
-
-MODULE_AUTHOR("Vojtech Pavlik, Bartlomiej Zolnierkiewicz");
-MODULE_DESCRIPTION("AMD PCI IDE driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/atiixp.c b/drivers/ide/atiixp.c
deleted file mode 100644
index e08b0aac08b9..000000000000
--- a/drivers/ide/atiixp.c
+++ /dev/null
@@ -1,212 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 2003 ATI Inc. <hyu@ati.com>
- *  Copyright (C) 2004,2007 Bartlomiej Zolnierkiewicz
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "atiixp"
-
-#define ATIIXP_IDE_PIO_TIMING		0x40
-#define ATIIXP_IDE_MDMA_TIMING		0x44
-#define ATIIXP_IDE_PIO_CONTROL		0x48
-#define ATIIXP_IDE_PIO_MODE		0x4a
-#define ATIIXP_IDE_UDMA_CONTROL		0x54
-#define ATIIXP_IDE_UDMA_MODE		0x56
-
-struct atiixp_ide_timing {
-	u8 command_width;
-	u8 recover_width;
-};
-
-static struct atiixp_ide_timing pio_timing[] = {
-	{ 0x05, 0x0d },
-	{ 0x04, 0x07 },
-	{ 0x03, 0x04 },
-	{ 0x02, 0x02 },
-	{ 0x02, 0x00 },
-};
-
-static struct atiixp_ide_timing mdma_timing[] = {
-	{ 0x07, 0x07 },
-	{ 0x02, 0x01 },
-	{ 0x02, 0x00 },
-};
-
-static DEFINE_SPINLOCK(atiixp_lock);
-
-/**
- *	atiixp_set_pio_mode	-	set host controller for PIO mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Set the interface PIO mode.
- */
-
-static void atiixp_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	unsigned long flags;
-	int timing_shift = (drive->dn ^ 1) * 8;
-	u32 pio_timing_data;
-	u16 pio_mode_data;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	spin_lock_irqsave(&atiixp_lock, flags);
-
-	pci_read_config_word(dev, ATIIXP_IDE_PIO_MODE, &pio_mode_data);
-	pio_mode_data &= ~(0x07 << (drive->dn * 4));
-	pio_mode_data |= (pio << (drive->dn * 4));
-	pci_write_config_word(dev, ATIIXP_IDE_PIO_MODE, pio_mode_data);
-
-	pci_read_config_dword(dev, ATIIXP_IDE_PIO_TIMING, &pio_timing_data);
-	pio_timing_data &= ~(0xff << timing_shift);
-	pio_timing_data |= (pio_timing[pio].recover_width << timing_shift) |
-		 (pio_timing[pio].command_width << (timing_shift + 4));
-	pci_write_config_dword(dev, ATIIXP_IDE_PIO_TIMING, pio_timing_data);
-
-	spin_unlock_irqrestore(&atiixp_lock, flags);
-}
-
-/**
- *	atiixp_set_dma_mode	-	set host controller for DMA mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Set a ATIIXP host controller to the desired DMA mode.  This involves
- *	programming the right timing data into the PCI configuration space.
- */
-
-static void atiixp_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	unsigned long flags;
-	int timing_shift = (drive->dn ^ 1) * 8;
-	u32 tmp32;
-	u16 tmp16;
-	u16 udma_ctl = 0;
-	const u8 speed = drive->dma_mode;
-
-	spin_lock_irqsave(&atiixp_lock, flags);
-
-	pci_read_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, &udma_ctl);
-
-	if (speed >= XFER_UDMA_0) {
-		pci_read_config_word(dev, ATIIXP_IDE_UDMA_MODE, &tmp16);
-		tmp16 &= ~(0x07 << (drive->dn * 4));
-		tmp16 |= ((speed & 0x07) << (drive->dn * 4));
-		pci_write_config_word(dev, ATIIXP_IDE_UDMA_MODE, tmp16);
-
-		udma_ctl |= (1 << drive->dn);
-	} else if (speed >= XFER_MW_DMA_0) {
-		u8 i = speed & 0x03;
-
-		pci_read_config_dword(dev, ATIIXP_IDE_MDMA_TIMING, &tmp32);
-		tmp32 &= ~(0xff << timing_shift);
-		tmp32 |= (mdma_timing[i].recover_width << timing_shift) |
-			 (mdma_timing[i].command_width << (timing_shift + 4));
-		pci_write_config_dword(dev, ATIIXP_IDE_MDMA_TIMING, tmp32);
-
-		udma_ctl &= ~(1 << drive->dn);
-	}
-
-	pci_write_config_word(dev, ATIIXP_IDE_UDMA_CONTROL, udma_ctl);
-
-	spin_unlock_irqrestore(&atiixp_lock, flags);
-}
-
-static u8 atiixp_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	u8 udma_mode = 0, ch = hwif->channel;
-
-	pci_read_config_byte(pdev, ATIIXP_IDE_UDMA_MODE + ch, &udma_mode);
-
-	if ((udma_mode & 0x07) >= 0x04 || (udma_mode & 0x70) >= 0x40)
-		return ATA_CBL_PATA80;
-	else
-		return ATA_CBL_PATA40;
-}
-
-static const struct ide_port_ops atiixp_port_ops = {
-	.set_pio_mode		= atiixp_set_pio_mode,
-	.set_dma_mode		= atiixp_set_dma_mode,
-	.cable_detect		= atiixp_cable_detect,
-};
-
-static const struct ide_port_info atiixp_pci_info[] = {
-	{	/* 0: IXP200/300/400/700 */
-		.name		= DRV_NAME,
-		.enablebits	= {{0x48,0x01,0x00}, {0x48,0x08,0x00}},
-		.port_ops	= &atiixp_port_ops,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
-	},
-	{	/* 1: IXP600 */
-		.name		= DRV_NAME,
-		.enablebits	= {{0x48,0x01,0x00}, {0x00,0x00,0x00}},
-		.port_ops	= &atiixp_port_ops,
-		.host_flags	= IDE_HFLAG_SINGLE,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
- 	},
-};
-
-/**
- *	atiixp_init_one	-	called when a ATIIXP is found
- *	@dev: the atiixp device
- *	@id: the matching pci id
- *
- *	Called when the PCI registration layer (or the IDE initialization)
- *	finds a device matching our IDE device tables.
- */
-
-static int atiixp_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &atiixp_pci_info[id->driver_data], NULL);
-}
-
-static const struct pci_device_id atiixp_pci_tbl[] = {
-	{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP200_IDE), 0 },
-	{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP300_IDE), 0 },
-	{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP400_IDE), 0 },
-	{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP600_IDE), 1 },
-	{ PCI_VDEVICE(ATI, PCI_DEVICE_ID_ATI_IXP700_IDE), 0 },
-	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_HUDSON2_IDE), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, atiixp_pci_tbl);
-
-static struct pci_driver atiixp_pci_driver = {
-	.name		= "ATIIXP_IDE",
-	.id_table	= atiixp_pci_tbl,
-	.probe		= atiixp_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init atiixp_ide_init(void)
-{
-	return ide_pci_register_driver(&atiixp_pci_driver);
-}
-
-static void __exit atiixp_ide_exit(void)
-{
-	pci_unregister_driver(&atiixp_pci_driver);
-}
-
-module_init(atiixp_ide_init);
-module_exit(atiixp_ide_exit);
-
-MODULE_AUTHOR("HUI YU");
-MODULE_DESCRIPTION("PCI driver module for ATI IXP IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/buddha.c b/drivers/ide/buddha.c
deleted file mode 100644
index 46eaf58d881b..000000000000
--- a/drivers/ide/buddha.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- *  Amiga Buddha, Catweasel and X-Surf IDE Driver
- *
- *	Copyright (C) 1997, 2001 by Geert Uytterhoeven and others
- *
- *  This driver was written based on the specifications in README.buddha and
- *  the X-Surf info from Inside_XSurf.txt available at
- *  http://www.jschoenfeld.com
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of this archive for
- *  more details.
- *
- *  TODO:
- *    - test it :-)
- *    - tune the timings using the speed-register
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/blkdev.h>
-#include <linux/zorro.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/module.h>
-
-#include <asm/amigahw.h>
-#include <asm/amigaints.h>
-
-
-    /*
-     *  The Buddha has 2 IDE interfaces, the Catweasel has 3, X-Surf has 2
-     */
-
-#define BUDDHA_NUM_HWIFS	2
-#define CATWEASEL_NUM_HWIFS	3
-#define XSURF_NUM_HWIFS         2
-
-#define MAX_NUM_HWIFS		3
-
-    /*
-     *  Bases of the IDE interfaces (relative to the board address)
-     */
-
-#define BUDDHA_BASE1	0x800
-#define BUDDHA_BASE2	0xa00
-#define BUDDHA_BASE3	0xc00
-
-#define XSURF_BASE1     0xb000 /* 2.5" Interface */
-#define XSURF_BASE2     0xd000 /* 3.5" Interface */
-
-static u_int buddha_bases[CATWEASEL_NUM_HWIFS] __initdata = {
-    BUDDHA_BASE1, BUDDHA_BASE2, BUDDHA_BASE3
-};
-
-static u_int xsurf_bases[XSURF_NUM_HWIFS] __initdata = {
-     XSURF_BASE1, XSURF_BASE2
-};
-
-    /*
-     *  Offsets from one of the above bases
-     */
-
-#define BUDDHA_CONTROL	0x11a
-
-    /*
-     *  Other registers
-     */
-
-#define BUDDHA_IRQ1	0xf00		/* MSB = 1, Harddisk is source of */
-#define BUDDHA_IRQ2	0xf40		/* interrupt */
-#define BUDDHA_IRQ3	0xf80
-
-#define XSURF_IRQ1      0x7e
-#define XSURF_IRQ2      0x7e
-
-static int buddha_irqports[CATWEASEL_NUM_HWIFS] __initdata = {
-    BUDDHA_IRQ1, BUDDHA_IRQ2, BUDDHA_IRQ3
-};
-
-static int xsurf_irqports[XSURF_NUM_HWIFS] __initdata = {
-    XSURF_IRQ1, XSURF_IRQ2
-};
-
-#define BUDDHA_IRQ_MR	0xfc0		/* master interrupt enable */
-
-
-    /*
-     *  Board information
-     */
-
-typedef enum BuddhaType_Enum {
-    BOARD_BUDDHA, BOARD_CATWEASEL, BOARD_XSURF
-} BuddhaType;
-
-static const char *buddha_board_name[] = { "Buddha", "Catweasel", "X-Surf" };
-
-    /*
-     *  Check and acknowledge the interrupt status
-     */
-
-static int buddha_test_irq(ide_hwif_t *hwif)
-{
-    unsigned char ch;
-
-    ch = z_readb(hwif->io_ports.irq_addr);
-    if (!(ch & 0x80))
-	    return 0;
-    return 1;
-}
-
-static void xsurf_clear_irq(ide_drive_t *drive)
-{
-    /*
-     * X-Surf needs 0 written to IRQ register to ensure ISA bit A11 stays at 0
-     */
-    z_writeb(0, drive->hwif->io_ports.irq_addr);
-}
-
-static void __init buddha_setup_ports(struct ide_hw *hw, unsigned long base,
-				      unsigned long ctl, unsigned long irq_port)
-{
-	int i;
-
-	memset(hw, 0, sizeof(*hw));
-
-	hw->io_ports.data_addr = base;
-
-	for (i = 1; i < 8; i++)
-		hw->io_ports_array[i] = base + 2 + i * 4;
-
-	hw->io_ports.ctl_addr = ctl;
-	hw->io_ports.irq_addr = irq_port;
-
-	hw->irq = IRQ_AMIGA_PORTS;
-}
-
-static const struct ide_port_ops buddha_port_ops = {
-	.test_irq		= buddha_test_irq,
-};
-
-static const struct ide_port_ops xsurf_port_ops = {
-	.clear_irq		= xsurf_clear_irq,
-	.test_irq		= buddha_test_irq,
-};
-
-static const struct ide_port_info buddha_port_info = {
-	.port_ops		= &buddha_port_ops,
-	.host_flags		= IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA,
-	.irq_flags		= IRQF_SHARED,
-	.chipset		= ide_generic,
-};
-
-    /*
-     *  Probe for a Buddha or Catweasel IDE interface
-     */
-
-static int __init buddha_init(void)
-{
-	struct zorro_dev *z = NULL;
-	u_long buddha_board = 0;
-	BuddhaType type;
-	int buddha_num_hwifs, i;
-
-	while ((z = zorro_find_device(ZORRO_WILDCARD, z))) {
-		unsigned long board;
-		struct ide_hw hw[MAX_NUM_HWIFS], *hws[MAX_NUM_HWIFS];
-		struct ide_port_info d = buddha_port_info;
-
-		if (z->id == ZORRO_PROD_INDIVIDUAL_COMPUTERS_BUDDHA) {
-			buddha_num_hwifs = BUDDHA_NUM_HWIFS;
-			type=BOARD_BUDDHA;
-		} else if (z->id == ZORRO_PROD_INDIVIDUAL_COMPUTERS_CATWEASEL) {
-			buddha_num_hwifs = CATWEASEL_NUM_HWIFS;
-			type=BOARD_CATWEASEL;
-		} else if (z->id == ZORRO_PROD_INDIVIDUAL_COMPUTERS_X_SURF) {
-			buddha_num_hwifs = XSURF_NUM_HWIFS;
-			type=BOARD_XSURF;
-			d.port_ops = &xsurf_port_ops;
-		} else 
-			continue;
-		
-		board = z->resource.start;
-
-		if(type != BOARD_XSURF) {
-			if (!request_mem_region(board+BUDDHA_BASE1, 0x800, "IDE"))
-				continue;
-		} else {
-			if (!request_mem_region(board+XSURF_BASE1, 0x1000, "IDE"))
-				continue;
-			if (!request_mem_region(board+XSURF_BASE2, 0x1000, "IDE"))
-				goto fail_base2;
-			if (!request_mem_region(board+XSURF_IRQ1, 0x8, "IDE")) {
-				release_mem_region(board+XSURF_BASE2, 0x1000);
-fail_base2:
-				release_mem_region(board+XSURF_BASE1, 0x1000);
-				continue;
-			}
-		}	  
-		buddha_board = (unsigned long)ZTWO_VADDR(board);
-		
-		/* write to BUDDHA_IRQ_MR to enable the board IRQ */
-		/* X-Surf doesn't have this.  IRQs are always on */
-		if (type != BOARD_XSURF)
-			z_writeb(0, buddha_board+BUDDHA_IRQ_MR);
-
-		printk(KERN_INFO "ide: %s IDE controller\n",
-				 buddha_board_name[type]);
-
-		for (i = 0; i < buddha_num_hwifs; i++) {
-			unsigned long base, ctl, irq_port;
-
-			if (type != BOARD_XSURF) {
-				base = buddha_board + buddha_bases[i];
-				ctl = base + BUDDHA_CONTROL;
-				irq_port = buddha_board + buddha_irqports[i];
-			} else {
-				base = buddha_board + xsurf_bases[i];
-				/* X-Surf has no CS1* (Control/AltStat) */
-				ctl = 0;
-				irq_port = buddha_board + xsurf_irqports[i];
-			}
-
-			buddha_setup_ports(&hw[i], base, ctl, irq_port);
-
-			hws[i] = &hw[i];
-		}
-
-		ide_host_add(&d, hws, i, NULL);
-	}
-
-	return 0;
-}
-
-module_init(buddha_init);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/cmd640.c b/drivers/ide/cmd640.c
deleted file mode 100644
index f48decb9fac4..000000000000
--- a/drivers/ide/cmd640.c
+++ /dev/null
@@ -1,848 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1995-1996  Linus Torvalds & authors (see below)
- */
-
-/*
- *  Original authors:	abramov@cecmow.enet.dec.com (Igor Abramov)
- *			mlord@pobox.com (Mark Lord)
- *
- *  See linux/MAINTAINERS for address of current maintainer.
- *
- *  This file provides support for the advanced features and bugs
- *  of IDE interfaces using the CMD Technologies 0640 IDE interface chip.
- *
- *  These chips are basically fucked by design, and getting this driver
- *  to work on every motherboard design that uses this screwed chip seems
- *  bloody well impossible.  However, we're still trying.
- *
- *  Version 0.97 worked for everybody.
- *
- *  User feedback is essential.  Many thanks to the beta test team:
- *
- *  A.Hartgers@stud.tue.nl, JZDQC@CUNYVM.CUNY.edu, abramov@cecmow.enet.dec.com,
- *  bardj@utopia.ppp.sn.no, bart@gaga.tue.nl, bbol001@cs.auckland.ac.nz,
- *  chrisc@dbass.demon.co.uk, dalecki@namu26.Num.Math.Uni-Goettingen.de,
- *  derekn@vw.ece.cmu.edu, florian@btp2x3.phy.uni-bayreuth.de,
- *  flynn@dei.unipd.it, gadio@netvision.net.il, godzilla@futuris.net,
- *  j@pobox.com, jkemp1@mises.uni-paderborn.de, jtoppe@hiwaay.net,
- *  kerouac@ssnet.com, meskes@informatik.rwth-aachen.de, hzoli@cs.elte.hu,
- *  peter@udgaard.isgtec.com, phil@tazenda.demon.co.uk, roadcapw@cfw.com,
- *  s0033las@sun10.vsz.bme.hu, schaffer@tam.cornell.edu, sjd@slip.net,
- *  steve@ei.org, ulrpeg@bigcomm.gun.de, ism@tardis.ed.ac.uk, mack@cray.com
- *  liug@mama.indstate.edu, and others.
- *
- *  Version 0.01	Initial version, hacked out of ide.c,
- *			and #include'd rather than compiled separately.
- *			This will get cleaned up in a subsequent release.
- *
- *  Version 0.02	Fixes for vlb initialization code, enable prefetch
- *			for versions 'B' and 'C' of chip by default,
- *			some code cleanup.
- *
- *  Version 0.03	Added reset of secondary interface,
- *			and black list for devices which are not compatible
- *			with prefetch mode. Separate function for setting
- *			prefetch is added, possibly it will be called some
- *			day from ioctl processing code.
- *
- *  Version 0.04	Now configs/compiles separate from ide.c
- *
- *  Version 0.05	Major rewrite of interface timing code.
- *			Added new function cmd640_set_mode to set PIO mode
- *			from ioctl call. New drives added to black list.
- *
- *  Version 0.06	More code cleanup. Prefetch is enabled only for
- *			detected hard drives, not included in prefetch
- *			black list.
- *
- *  Version 0.07	Changed to more conservative drive tuning policy.
- *			Unknown drives, which report PIO < 4 are set to
- *			(reported_PIO - 1) if it is supported, or to PIO0.
- *			List of known drives extended by info provided by
- *			CMD at their ftp site.
- *
- *  Version 0.08	Added autotune/noautotune support.
- *
- *  Version 0.09	Try to be smarter about 2nd port enabling.
- *  Version 0.10	Be nice and don't reset 2nd port.
- *  Version 0.11	Try to handle more weird situations.
- *
- *  Version 0.12	Lots of bug fixes from Laszlo Peter
- *			irq unmasking disabled for reliability.
- *			try to be even smarter about the second port.
- *			tidy up source code formatting.
- *  Version 0.13	permit irq unmasking again.
- *  Version 0.90	massive code cleanup, some bugs fixed.
- *			defaults all drives to PIO mode0, prefetch off.
- *			autotune is OFF by default, with compile time flag.
- *			prefetch can be turned OFF/ON using "hdparm -p8/-p9"
- *			 (requires hdparm-3.1 or newer)
- *  Version 0.91	first release to linux-kernel list.
- *  Version 0.92	move initial reg dump to separate callable function
- *			change "readahead" to "prefetch" to avoid confusion
- *  Version 0.95	respect original BIOS timings unless autotuning.
- *			tons of code cleanup and rearrangement.
- *			added CONFIG_BLK_DEV_CMD640_ENHANCED option
- *			prevent use of unmask when prefetch is on
- *  Version 0.96	prevent use of io_32bit when prefetch is off
- *  Version 0.97	fix VLB secondary interface for sjd@slip.net
- *			other minor tune-ups:  0.96 was very good.
- *  Version 0.98	ignore PCI version when disabled by BIOS
- *  Version 0.99	display setup/active/recovery clocks with PIO mode
- *  Version 1.00	Mmm.. cannot depend on PCMD_ENA in all systems
- *  Version 1.01	slow/fast devsel can be selected with "hdparm -p6/-p7"
- *			 ("fast" is necessary for 32bit I/O in some systems)
- *  Version 1.02	fix bug that resulted in slow "setup times"
- *			 (patch courtesy of Zoltan Hidvegi)
- */
-
-#define CMD640_PREFETCH_MASKS 1
-
-/*#define CMD640_DUMP_REGS */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/module.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "cmd640"
-
-static bool cmd640_vlb;
-
-/*
- * CMD640 specific registers definition.
- */
-
-#define VID		0x00
-#define DID		0x02
-#define PCMD		0x04
-#define   PCMD_ENA	0x01
-#define PSTTS		0x06
-#define REVID		0x08
-#define PROGIF		0x09
-#define SUBCL		0x0a
-#define BASCL		0x0b
-#define BaseA0		0x10
-#define BaseA1		0x14
-#define BaseA2		0x18
-#define BaseA3		0x1c
-#define INTLINE		0x3c
-#define INPINE		0x3d
-
-#define	CFR		0x50
-#define   CFR_DEVREV		0x03
-#define   CFR_IDE01INTR		0x04
-#define	  CFR_DEVID		0x18
-#define	  CFR_AT_VESA_078h	0x20
-#define	  CFR_DSA1		0x40
-#define	  CFR_DSA0		0x80
-
-#define CNTRL		0x51
-#define	  CNTRL_DIS_RA0		0x40
-#define   CNTRL_DIS_RA1		0x80
-#define	  CNTRL_ENA_2ND		0x08
-
-#define	CMDTIM		0x52
-#define	ARTTIM0		0x53
-#define	DRWTIM0		0x54
-#define ARTTIM1 	0x55
-#define DRWTIM1		0x56
-#define ARTTIM23	0x57
-#define   ARTTIM23_DIS_RA2	0x04
-#define   ARTTIM23_DIS_RA3	0x08
-#define   ARTTIM23_IDE23INTR	0x10
-#define DRWTIM23	0x58
-#define BRST		0x59
-
-/*
- * Registers and masks for easy access by drive index:
- */
-static u8 prefetch_regs[4]  = {CNTRL, CNTRL, ARTTIM23, ARTTIM23};
-static u8 prefetch_masks[4] = {CNTRL_DIS_RA0, CNTRL_DIS_RA1, ARTTIM23_DIS_RA2, ARTTIM23_DIS_RA3};
-
-#ifdef CONFIG_BLK_DEV_CMD640_ENHANCED
-
-static u8 arttim_regs[4] = {ARTTIM0, ARTTIM1, ARTTIM23, ARTTIM23};
-static u8 drwtim_regs[4] = {DRWTIM0, DRWTIM1, DRWTIM23, DRWTIM23};
-
-/*
- * Current cmd640 timing values for each drive.
- * The defaults for each are the slowest possible timings.
- */
-static u8 setup_counts[4]    = {4, 4, 4, 4};     /* Address setup count (in clocks) */
-static u8 active_counts[4]   = {16, 16, 16, 16}; /* Active count   (encoded) */
-static u8 recovery_counts[4] = {16, 16, 16, 16}; /* Recovery count (encoded) */
-
-#endif /* CONFIG_BLK_DEV_CMD640_ENHANCED */
-
-static DEFINE_SPINLOCK(cmd640_lock);
-
-/*
- * Interface to access cmd640x registers
- */
-static unsigned int cmd640_key;
-static void (*__put_cmd640_reg)(u16 reg, u8 val);
-static u8 (*__get_cmd640_reg)(u16 reg);
-
-/*
- * This is read from the CFR reg, and is used in several places.
- */
-static unsigned int cmd640_chip_version;
-
-/*
- * The CMD640x chip does not support DWORD config write cycles, but some
- * of the BIOSes use them to implement the config services.
- * Therefore, we must use direct IO instead.
- */
-
-/* PCI method 1 access */
-
-static void put_cmd640_reg_pci1(u16 reg, u8 val)
-{
-	outl_p((reg & 0xfc) | cmd640_key, 0xcf8);
-	outb_p(val, (reg & 3) | 0xcfc);
-}
-
-static u8 get_cmd640_reg_pci1(u16 reg)
-{
-	outl_p((reg & 0xfc) | cmd640_key, 0xcf8);
-	return inb_p((reg & 3) | 0xcfc);
-}
-
-/* PCI method 2 access (from CMD datasheet) */
-
-static void put_cmd640_reg_pci2(u16 reg, u8 val)
-{
-	outb_p(0x10, 0xcf8);
-	outb_p(val, cmd640_key + reg);
-	outb_p(0, 0xcf8);
-}
-
-static u8 get_cmd640_reg_pci2(u16 reg)
-{
-	u8 b;
-
-	outb_p(0x10, 0xcf8);
-	b = inb_p(cmd640_key + reg);
-	outb_p(0, 0xcf8);
-	return b;
-}
-
-/* VLB access */
-
-static void put_cmd640_reg_vlb(u16 reg, u8 val)
-{
-	outb_p(reg, cmd640_key);
-	outb_p(val, cmd640_key + 4);
-}
-
-static u8 get_cmd640_reg_vlb(u16 reg)
-{
-	outb_p(reg, cmd640_key);
-	return inb_p(cmd640_key + 4);
-}
-
-static u8 get_cmd640_reg(u16 reg)
-{
-	unsigned long flags;
-	u8 b;
-
-	spin_lock_irqsave(&cmd640_lock, flags);
-	b = __get_cmd640_reg(reg);
-	spin_unlock_irqrestore(&cmd640_lock, flags);
-	return b;
-}
-
-static void put_cmd640_reg(u16 reg, u8 val)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cmd640_lock, flags);
-	__put_cmd640_reg(reg, val);
-	spin_unlock_irqrestore(&cmd640_lock, flags);
-}
-
-static int __init match_pci_cmd640_device(void)
-{
-	const u8 ven_dev[4] = {0x95, 0x10, 0x40, 0x06};
-	unsigned int i;
-	for (i = 0; i < 4; i++) {
-		if (get_cmd640_reg(i) != ven_dev[i])
-			return 0;
-	}
-#ifdef STUPIDLY_TRUST_BROKEN_PCMD_ENA_BIT
-	if ((get_cmd640_reg(PCMD) & PCMD_ENA) == 0) {
-		printk("ide: cmd640 on PCI disabled by BIOS\n");
-		return 0;
-	}
-#endif /* STUPIDLY_TRUST_BROKEN_PCMD_ENA_BIT */
-	return 1; /* success */
-}
-
-/*
- * Probe for CMD640x -- pci method 1
- */
-static int __init probe_for_cmd640_pci1(void)
-{
-	__get_cmd640_reg = get_cmd640_reg_pci1;
-	__put_cmd640_reg = put_cmd640_reg_pci1;
-	for (cmd640_key = 0x80000000;
-	     cmd640_key <= 0x8000f800;
-	     cmd640_key += 0x800) {
-		if (match_pci_cmd640_device())
-			return 1; /* success */
-	}
-	return 0;
-}
-
-/*
- * Probe for CMD640x -- pci method 2
- */
-static int __init probe_for_cmd640_pci2(void)
-{
-	__get_cmd640_reg = get_cmd640_reg_pci2;
-	__put_cmd640_reg = put_cmd640_reg_pci2;
-	for (cmd640_key = 0xc000; cmd640_key <= 0xcf00; cmd640_key += 0x100) {
-		if (match_pci_cmd640_device())
-			return 1; /* success */
-	}
-	return 0;
-}
-
-/*
- * Probe for CMD640x -- vlb
- */
-static int __init probe_for_cmd640_vlb(void)
-{
-	u8 b;
-
-	__get_cmd640_reg = get_cmd640_reg_vlb;
-	__put_cmd640_reg = put_cmd640_reg_vlb;
-	cmd640_key = 0x178;
-	b = get_cmd640_reg(CFR);
-	if (b == 0xff || b == 0x00 || (b & CFR_AT_VESA_078h)) {
-		cmd640_key = 0x78;
-		b = get_cmd640_reg(CFR);
-		if (b == 0xff || b == 0x00 || !(b & CFR_AT_VESA_078h))
-			return 0;
-	}
-	return 1; /* success */
-}
-
-/*
- *  Returns 1 if an IDE interface/drive exists at 0x170,
- *  Returns 0 otherwise.
- */
-static int __init secondary_port_responding(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cmd640_lock, flags);
-
-	outb_p(0x0a, 0x176);	/* select drive0 */
-	udelay(100);
-	if ((inb_p(0x176) & 0x1f) != 0x0a) {
-		outb_p(0x1a, 0x176); /* select drive1 */
-		udelay(100);
-		if ((inb_p(0x176) & 0x1f) != 0x1a) {
-			spin_unlock_irqrestore(&cmd640_lock, flags);
-			return 0; /* nothing responded */
-		}
-	}
-	spin_unlock_irqrestore(&cmd640_lock, flags);
-	return 1; /* success */
-}
-
-#ifdef CMD640_DUMP_REGS
-/*
- * Dump out all cmd640 registers.  May be called from ide.c
- */
-static void cmd640_dump_regs(void)
-{
-	unsigned int reg = cmd640_vlb ? 0x50 : 0x00;
-
-	/* Dump current state of chip registers */
-	printk("ide: cmd640 internal register dump:");
-	for (; reg <= 0x59; reg++) {
-		if (!(reg & 0x0f))
-			printk("\n%04x:", reg);
-		printk(" %02x", get_cmd640_reg(reg));
-	}
-	printk("\n");
-}
-#endif
-
-static void __set_prefetch_mode(ide_drive_t *drive, int mode)
-{
-	if (mode) {	/* want prefetch on? */
-#if CMD640_PREFETCH_MASKS
-		drive->dev_flags |= IDE_DFLAG_NO_UNMASK;
-		drive->dev_flags &= ~IDE_DFLAG_UNMASK;
-#endif
-		drive->dev_flags &= ~IDE_DFLAG_NO_IO_32BIT;
-	} else {
-		drive->dev_flags &= ~IDE_DFLAG_NO_UNMASK;
-		drive->dev_flags |= IDE_DFLAG_NO_IO_32BIT;
-		drive->io_32bit = 0;
-	}
-}
-
-#ifndef CONFIG_BLK_DEV_CMD640_ENHANCED
-/*
- * Check whether prefetch is on for a drive,
- * and initialize the unmask flags for safe operation.
- */
-static void __init check_prefetch(ide_drive_t *drive, unsigned int index)
-{
-	u8 b = get_cmd640_reg(prefetch_regs[index]);
-
-	__set_prefetch_mode(drive, (b & prefetch_masks[index]) ? 0 : 1);
-}
-#else
-
-/*
- * Sets prefetch mode for a drive.
- */
-static void set_prefetch_mode(ide_drive_t *drive, unsigned int index, int mode)
-{
-	unsigned long flags;
-	int reg = prefetch_regs[index];
-	u8 b;
-
-	spin_lock_irqsave(&cmd640_lock, flags);
-	b = __get_cmd640_reg(reg);
-	__set_prefetch_mode(drive, mode);
-	if (mode)
-		b &= ~prefetch_masks[index];	/* enable prefetch */
-	else
-		b |= prefetch_masks[index];	/* disable prefetch */
-	__put_cmd640_reg(reg, b);
-	spin_unlock_irqrestore(&cmd640_lock, flags);
-}
-
-/*
- * Dump out current drive clocks settings
- */
-static void display_clocks(unsigned int index)
-{
-	u8 active_count, recovery_count;
-
-	active_count = active_counts[index];
-	if (active_count == 1)
-		++active_count;
-	recovery_count = recovery_counts[index];
-	if (active_count > 3 && recovery_count == 1)
-		++recovery_count;
-	if (cmd640_chip_version > 1)
-		recovery_count += 1;  /* cmd640b uses (count + 1)*/
-	printk(", clocks=%d/%d/%d\n", setup_counts[index], active_count, recovery_count);
-}
-
-/*
- * Pack active and recovery counts into single byte representation
- * used by controller
- */
-static inline u8 pack_nibbles(u8 upper, u8 lower)
-{
-	return ((upper & 0x0f) << 4) | (lower & 0x0f);
-}
-
-/*
- * This routine writes the prepared setup/active/recovery counts
- * for a drive into the cmd640 chipset registers to active them.
- */
-static void program_drive_counts(ide_drive_t *drive, unsigned int index)
-{
-	unsigned long flags;
-	u8 setup_count    = setup_counts[index];
-	u8 active_count   = active_counts[index];
-	u8 recovery_count = recovery_counts[index];
-
-	/*
-	 * Set up address setup count and drive read/write timing registers.
-	 * Primary interface has individual count/timing registers for
-	 * each drive.  Secondary interface has one common set of registers,
-	 * so we merge the timings, using the slowest value for each timing.
-	 */
-	if (index > 1) {
-		ide_drive_t *peer = ide_get_pair_dev(drive);
-		unsigned int mate = index ^ 1;
-
-		if (peer) {
-			if (setup_count < setup_counts[mate])
-				setup_count = setup_counts[mate];
-			if (active_count < active_counts[mate])
-				active_count = active_counts[mate];
-			if (recovery_count < recovery_counts[mate])
-				recovery_count = recovery_counts[mate];
-		}
-	}
-
-	/*
-	 * Convert setup_count to internal chipset representation
-	 */
-	switch (setup_count) {
-	case 4:	 setup_count = 0x00; break;
-	case 3:	 setup_count = 0x80; break;
-	case 1:
-	case 2:	 setup_count = 0x40; break;
-	default: setup_count = 0xc0; /* case 5 */
-	}
-
-	/*
-	 * Now that everything is ready, program the new timings
-	 */
-	spin_lock_irqsave(&cmd640_lock, flags);
-	/*
-	 * Program the address_setup clocks into ARTTIM reg,
-	 * and then the active/recovery counts into the DRWTIM reg
-	 * (this converts counts of 16 into counts of zero -- okay).
-	 */
-	setup_count |= __get_cmd640_reg(arttim_regs[index]) & 0x3f;
-	__put_cmd640_reg(arttim_regs[index], setup_count);
-	__put_cmd640_reg(drwtim_regs[index], pack_nibbles(active_count, recovery_count));
-	spin_unlock_irqrestore(&cmd640_lock, flags);
-}
-
-/*
- * Set a specific pio_mode for a drive
- */
-static void cmd640_set_mode(ide_drive_t *drive, unsigned int index,
-			    u8 pio_mode, unsigned int cycle_time)
-{
-	struct ide_timing *t;
-	int setup_time, active_time, recovery_time, clock_time;
-	u8 setup_count, active_count, recovery_count, recovery_count2, cycle_count;
-	int bus_speed;
-
-	if (cmd640_vlb)
-		bus_speed = ide_vlb_clk ? ide_vlb_clk : 50;
-	else
-		bus_speed = ide_pci_clk ? ide_pci_clk : 33;
-
-	if (pio_mode > 5)
-		pio_mode = 5;
-
-	t = ide_timing_find_mode(XFER_PIO_0 + pio_mode);
-	setup_time  = t->setup;
-	active_time = t->active;
-
-	recovery_time = cycle_time - (setup_time + active_time);
-	clock_time = 1000 / bus_speed;
-	cycle_count = DIV_ROUND_UP(cycle_time, clock_time);
-
-	setup_count = DIV_ROUND_UP(setup_time, clock_time);
-
-	active_count = DIV_ROUND_UP(active_time, clock_time);
-	if (active_count < 2)
-		active_count = 2; /* minimum allowed by cmd640 */
-
-	recovery_count = DIV_ROUND_UP(recovery_time, clock_time);
-	recovery_count2 = cycle_count - (setup_count + active_count);
-	if (recovery_count2 > recovery_count)
-		recovery_count = recovery_count2;
-	if (recovery_count < 2)
-		recovery_count = 2; /* minimum allowed by cmd640 */
-	if (recovery_count > 17) {
-		active_count += recovery_count - 17;
-		recovery_count = 17;
-	}
-	if (active_count > 16)
-		active_count = 16; /* maximum allowed by cmd640 */
-	if (cmd640_chip_version > 1)
-		recovery_count -= 1;  /* cmd640b uses (count + 1)*/
-	if (recovery_count > 16)
-		recovery_count = 16; /* maximum allowed by cmd640 */
-
-	setup_counts[index]    = setup_count;
-	active_counts[index]   = active_count;
-	recovery_counts[index] = recovery_count;
-
-	/*
-	 * In a perfect world, we might set the drive pio mode here
-	 * (using WIN_SETFEATURE) before continuing.
-	 *
-	 * But we do not, because:
-	 *	1) this is the wrong place to do it (proper is do_special() in ide.c)
-	 * 	2) in practice this is rarely, if ever, necessary
-	 */
-	program_drive_counts(drive, index);
-}
-
-static void cmd640_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	unsigned int index = 0, cycle_time;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-	u8 b;
-
-	switch (pio) {
-	case 6: /* set fast-devsel off */
-	case 7: /* set fast-devsel on */
-		b = get_cmd640_reg(CNTRL) & ~0x27;
-		if (pio & 1)
-			b |= 0x27;
-		put_cmd640_reg(CNTRL, b);
-		printk("%s: %sabled cmd640 fast host timing (devsel)\n",
-			drive->name, (pio & 1) ? "en" : "dis");
-		return;
-	case 8: /* set prefetch off */
-	case 9: /* set prefetch on */
-		set_prefetch_mode(drive, index, pio & 1);
-		printk("%s: %sabled cmd640 prefetch\n",
-			drive->name, (pio & 1) ? "en" : "dis");
-		return;
-	}
-
-	cycle_time = ide_pio_cycle_time(drive, pio);
-	cmd640_set_mode(drive, index, pio, cycle_time);
-
-	printk("%s: selected cmd640 PIO mode%d (%dns)",
-		drive->name, pio, cycle_time);
-
-	display_clocks(index);
-}
-#endif /* CONFIG_BLK_DEV_CMD640_ENHANCED */
-
-static void __init cmd640_init_dev(ide_drive_t *drive)
-{
-	unsigned int i = drive->hwif->channel * 2 + (drive->dn & 1);
-
-#ifdef CONFIG_BLK_DEV_CMD640_ENHANCED
-	/*
-	 * Reset timing to the slowest speed and turn off prefetch.
-	 * This way, the drive identify code has a better chance.
-	 */
-	setup_counts[i]    =  4;	/* max possible */
-	active_counts[i]   = 16;	/* max possible */
-	recovery_counts[i] = 16;	/* max possible */
-	program_drive_counts(drive, i);
-	set_prefetch_mode(drive, i, 0);
-	printk(KERN_INFO DRV_NAME ": drive%d timings/prefetch cleared\n", i);
-#else
-	/*
-	 * Set the drive unmask flags to match the prefetch setting.
-	 */
-	check_prefetch(drive, i);
-	printk(KERN_INFO DRV_NAME ": drive%d timings/prefetch(%s) preserved\n",
-		i, (drive->dev_flags & IDE_DFLAG_NO_IO_32BIT) ? "off" : "on");
-#endif /* CONFIG_BLK_DEV_CMD640_ENHANCED */
-}
-
-static int cmd640_test_irq(ide_hwif_t *hwif)
-{
-	int irq_reg		= hwif->channel ? ARTTIM23 : CFR;
-	u8  irq_mask		= hwif->channel ? ARTTIM23_IDE23INTR :
-						  CFR_IDE01INTR;
-	u8  irq_stat		= get_cmd640_reg(irq_reg);
-
-	return (irq_stat & irq_mask) ? 1 : 0;
-}
-
-static const struct ide_port_ops cmd640_port_ops = {
-	.init_dev		= cmd640_init_dev,
-#ifdef CONFIG_BLK_DEV_CMD640_ENHANCED
-	.set_pio_mode		= cmd640_set_pio_mode,
-#endif
-	.test_irq		= cmd640_test_irq,
-};
-
-static int pci_conf1(void)
-{
-	unsigned long flags;
-	u32 tmp;
-
-	spin_lock_irqsave(&cmd640_lock, flags);
-	outb(0x01, 0xCFB);
-	tmp = inl(0xCF8);
-	outl(0x80000000, 0xCF8);
-	if (inl(0xCF8) == 0x80000000) {
-		outl(tmp, 0xCF8);
-		spin_unlock_irqrestore(&cmd640_lock, flags);
-		return 1;
-	}
-	outl(tmp, 0xCF8);
-	spin_unlock_irqrestore(&cmd640_lock, flags);
-	return 0;
-}
-
-static int pci_conf2(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cmd640_lock, flags);
-	outb(0x00, 0xCFB);
-	outb(0x00, 0xCF8);
-	outb(0x00, 0xCFA);
-	if (inb(0xCF8) == 0x00 && inb(0xCF8) == 0x00) {
-		spin_unlock_irqrestore(&cmd640_lock, flags);
-		return 1;
-	}
-	spin_unlock_irqrestore(&cmd640_lock, flags);
-	return 0;
-}
-
-static const struct ide_port_info cmd640_port_info __initconst = {
-	.chipset		= ide_cmd640,
-	.host_flags		= IDE_HFLAG_SERIALIZE |
-				  IDE_HFLAG_NO_DMA |
-				  IDE_HFLAG_ABUSE_PREFETCH |
-				  IDE_HFLAG_ABUSE_FAST_DEVSEL,
-	.port_ops		= &cmd640_port_ops,
-	.pio_mask		= ATA_PIO5,
-};
-
-static int __init cmd640x_init_one(unsigned long base, unsigned long ctl)
-{
-	if (!request_region(base, 8, DRV_NAME)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n",
-				DRV_NAME, base, base + 7);
-		return -EBUSY;
-	}
-
-	if (!request_region(ctl, 1, DRV_NAME)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX not free.\n",
-				DRV_NAME, ctl);
-		release_region(base, 8);
-		return -EBUSY;
-	}
-
-	return 0;
-}
-
-/*
- * Probe for a cmd640 chipset, and initialize it if found.
- */
-static int __init cmd640x_init(void)
-{
-	int second_port_cmd640 = 0, rc;
-	const char *bus_type, *port2;
-	u8 b, cfr;
-	struct ide_hw hw[2], *hws[2];
-
-	if (cmd640_vlb && probe_for_cmd640_vlb()) {
-		bus_type = "VLB";
-	} else {
-		cmd640_vlb = 0;
-		/* Find out what kind of PCI probing is supported otherwise
-		   Justin Gibbs will sulk.. */
-		if (pci_conf1() && probe_for_cmd640_pci1())
-			bus_type = "PCI (type1)";
-		else if (pci_conf2() && probe_for_cmd640_pci2())
-			bus_type = "PCI (type2)";
-		else
-			return 0;
-	}
-	/*
-	 * Undocumented magic (there is no 0x5b reg in specs)
-	 */
-	put_cmd640_reg(0x5b, 0xbd);
-	if (get_cmd640_reg(0x5b) != 0xbd) {
-		printk(KERN_ERR "ide: cmd640 init failed: wrong value in reg 0x5b\n");
-		return 0;
-	}
-	put_cmd640_reg(0x5b, 0);
-
-#ifdef CMD640_DUMP_REGS
-	cmd640_dump_regs();
-#endif
-
-	/*
-	 * Documented magic begins here
-	 */
-	cfr = get_cmd640_reg(CFR);
-	cmd640_chip_version = cfr & CFR_DEVREV;
-	if (cmd640_chip_version == 0) {
-		printk("ide: bad cmd640 revision: %d\n", cmd640_chip_version);
-		return 0;
-	}
-
-	rc = cmd640x_init_one(0x1f0, 0x3f6);
-	if (rc)
-		return rc;
-
-	rc = cmd640x_init_one(0x170, 0x376);
-	if (rc) {
-		release_region(0x3f6, 1);
-		release_region(0x1f0, 8);
-		return rc;
-	}
-
-	memset(&hw, 0, sizeof(hw));
-
-	ide_std_init_ports(&hw[0], 0x1f0, 0x3f6);
-	hw[0].irq = 14;
-
-	ide_std_init_ports(&hw[1], 0x170, 0x376);
-	hw[1].irq = 15;
-
-	printk(KERN_INFO "cmd640: buggy cmd640%c interface on %s, config=0x%02x"
-			 "\n", 'a' + cmd640_chip_version - 1, bus_type, cfr);
-
-	/*
-	 * Initialize data for primary port
-	 */
-	hws[0] = &hw[0];
-
-	/*
-	 * Ensure compatibility by always using the slowest timings
-	 * for access to the drive's command register block,
-	 * and reset the prefetch burstsize to default (512 bytes).
-	 *
-	 * Maybe we need a way to NOT do these on *some* systems?
-	 */
-	put_cmd640_reg(CMDTIM, 0);
-	put_cmd640_reg(BRST, 0x40);
-
-	b = get_cmd640_reg(CNTRL);
-
-	/*
-	 * Try to enable the secondary interface, if not already enabled
-	 */
-	if (secondary_port_responding()) {
-		if ((b & CNTRL_ENA_2ND)) {
-			second_port_cmd640 = 1;
-			port2 = "okay";
-		} else if (cmd640_vlb) {
-			second_port_cmd640 = 1;
-			port2 = "alive";
-		} else
-			port2 = "not cmd640";
-	} else {
-		put_cmd640_reg(CNTRL, b ^ CNTRL_ENA_2ND); /* toggle the bit */
-		if (secondary_port_responding()) {
-			second_port_cmd640 = 1;
-			port2 = "enabled";
-		} else {
-			put_cmd640_reg(CNTRL, b); /* restore original setting */
-			port2 = "not responding";
-		}
-	}
-
-	/*
-	 * Initialize data for secondary cmd640 port, if enabled
-	 */
-	if (second_port_cmd640)
-		hws[1] = &hw[1];
-
-	printk(KERN_INFO "cmd640: %sserialized, secondary interface %s\n",
-			 second_port_cmd640 ? "" : "not ", port2);
-
-#ifdef CMD640_DUMP_REGS
-	cmd640_dump_regs();
-#endif
-
-	return ide_host_add(&cmd640_port_info, hws, second_port_cmd640 ? 2 : 1,
-			    NULL);
-}
-
-module_param_named(probe_vlb, cmd640_vlb, bool, 0);
-MODULE_PARM_DESC(probe_vlb, "probe for VLB version of CMD640 chipset");
-
-module_init(cmd640x_init);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c
deleted file mode 100644
index 943bf944bf72..000000000000
--- a/drivers/ide/cmd64x.c
+++ /dev/null
@@ -1,452 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines.
- *           Due to massive hardware bugs, UltraDMA is only supported
- *           on the 646U2 and not on the 646U.
- *
- * Copyright (C) 1998		Eddie C. Dost  (ecd@skynet.be)
- * Copyright (C) 1998		David S. Miller (davem@redhat.com)
- *
- * Copyright (C) 1999-2002	Andre Hedrick <andre@linux-ide.org>
- * Copyright (C) 2007-2010	Bartlomiej Zolnierkiewicz
- * Copyright (C) 2007,2009	MontaVista Software, Inc. <source@mvista.com>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "cmd64x"
-
-/*
- * CMD64x specific registers definition.
- */
-#define CFR		0x50
-#define   CFR_INTR_CH0		0x04
-
-#define	CMDTIM		0x52
-#define	ARTTIM0		0x53
-#define	DRWTIM0		0x54
-#define ARTTIM1 	0x55
-#define DRWTIM1		0x56
-#define ARTTIM23	0x57
-#define   ARTTIM23_DIS_RA2	0x04
-#define   ARTTIM23_DIS_RA3	0x08
-#define   ARTTIM23_INTR_CH1	0x10
-#define DRWTIM2		0x58
-#define BRST		0x59
-#define DRWTIM3		0x5b
-
-#define BMIDECR0	0x70
-#define MRDMODE		0x71
-#define   MRDMODE_INTR_CH0	0x04
-#define   MRDMODE_INTR_CH1	0x08
-#define UDIDETCR0	0x73
-#define DTPR0		0x74
-#define BMIDECR1	0x78
-#define BMIDECSR	0x79
-#define UDIDETCR1	0x7B
-#define DTPR1		0x7C
-
-static void cmd64x_program_timings(ide_drive_t *drive, u8 mode)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	int bus_speed = ide_pci_clk ? ide_pci_clk : 33;
-	const unsigned long T = 1000000 / bus_speed;
-	static const u8 recovery_values[] =
-		{15, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0};
-	static const u8 setup_values[] = {0x40, 0x40, 0x40, 0x80, 0, 0xc0};
-	static const u8 arttim_regs[4] = {ARTTIM0, ARTTIM1, ARTTIM23, ARTTIM23};
-	static const u8 drwtim_regs[4] = {DRWTIM0, DRWTIM1, DRWTIM2, DRWTIM3};
-	struct ide_timing t;
-	u8 arttim = 0;
-
-	if (drive->dn >= ARRAY_SIZE(drwtim_regs))
-		return;
-
-	ide_timing_compute(drive, mode, &t, T, 0);
-
-	/*
-	 * In case we've got too long recovery phase, try to lengthen
-	 * the active phase
-	 */
-	if (t.recover > 16) {
-		t.active += t.recover - 16;
-		t.recover = 16;
-	}
-	if (t.active > 16)		/* shouldn't actually happen... */
-		t.active = 16;
-
-	/*
-	 * Convert values to internal chipset representation
-	 */
-	t.recover = recovery_values[t.recover];
-	t.active &= 0x0f;
-
-	/* Program the active/recovery counts into the DRWTIM register */
-	pci_write_config_byte(dev, drwtim_regs[drive->dn],
-			      (t.active << 4) | t.recover);
-
-	/*
-	 * The primary channel has individual address setup timing registers
-	 * for each drive and the hardware selects the slowest timing itself.
-	 * The secondary channel has one common register and we have to select
-	 * the slowest address setup timing ourselves.
-	 */
-	if (hwif->channel) {
-		ide_drive_t *pair = ide_get_pair_dev(drive);
-
-		if (pair) {
-			struct ide_timing tp;
-
-			ide_timing_compute(pair, pair->pio_mode, &tp, T, 0);
-			ide_timing_merge(&t, &tp, &t, IDE_TIMING_SETUP);
-			if (pair->dma_mode) {
-				ide_timing_compute(pair, pair->dma_mode,
-						&tp, T, 0);
-				ide_timing_merge(&tp, &t, &t, IDE_TIMING_SETUP);
-			}
-		}
-	}
-
-	if (t.setup > 5)		/* shouldn't actually happen... */
-		t.setup = 5;
-
-	/*
-	 * Program the address setup clocks into the ARTTIM registers.
-	 * Avoid clearing the secondary channel's interrupt bit.
-	 */
-	(void) pci_read_config_byte (dev, arttim_regs[drive->dn], &arttim);
-	if (hwif->channel)
-		arttim &= ~ARTTIM23_INTR_CH1;
-	arttim &= ~0xc0;
-	arttim |= setup_values[t.setup];
-	(void) pci_write_config_byte(dev, arttim_regs[drive->dn], arttim);
-}
-
-/*
- * Attempts to set drive's PIO mode.
- * Special cases are 8: prefetch off, 9: prefetch on (both never worked)
- */
-
-static void cmd64x_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	/*
-	 * Filter out the prefetch control values
-	 * to prevent PIO5 from being programmed
-	 */
-	if (pio == 8 || pio == 9)
-		return;
-
-	cmd64x_program_timings(drive, XFER_PIO_0 + pio);
-}
-
-static void cmd64x_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u8 unit			= drive->dn & 0x01;
-	u8 regU = 0, pciU	= hwif->channel ? UDIDETCR1 : UDIDETCR0;
-	const u8 speed		= drive->dma_mode;
-
-	pci_read_config_byte(dev, pciU, &regU);
-	regU &= ~(unit ? 0xCA : 0x35);
-
-	switch(speed) {
-	case XFER_UDMA_5:
-		regU |= unit ? 0x0A : 0x05;
-		break;
-	case XFER_UDMA_4:
-		regU |= unit ? 0x4A : 0x15;
-		break;
-	case XFER_UDMA_3:
-		regU |= unit ? 0x8A : 0x25;
-		break;
-	case XFER_UDMA_2:
-		regU |= unit ? 0x42 : 0x11;
-		break;
-	case XFER_UDMA_1:
-		regU |= unit ? 0x82 : 0x21;
-		break;
-	case XFER_UDMA_0:
-		regU |= unit ? 0xC2 : 0x31;
-		break;
-	case XFER_MW_DMA_2:
-	case XFER_MW_DMA_1:
-	case XFER_MW_DMA_0:
-		cmd64x_program_timings(drive, speed);
-		break;
-	}
-
-	pci_write_config_byte(dev, pciU, regU);
-}
-
-static void cmd648_clear_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long base	= pci_resource_start(dev, 4);
-	u8  irq_mask		= hwif->channel ? MRDMODE_INTR_CH1 :
-						  MRDMODE_INTR_CH0;
-	u8  mrdmode		= inb(base + 1);
-
-	/* clear the interrupt bit */
-	outb((mrdmode & ~(MRDMODE_INTR_CH0 | MRDMODE_INTR_CH1)) | irq_mask,
-	     base + 1);
-}
-
-static void cmd64x_clear_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	int irq_reg		= hwif->channel ? ARTTIM23 : CFR;
-	u8  irq_mask		= hwif->channel ? ARTTIM23_INTR_CH1 :
-						  CFR_INTR_CH0;
-	u8  irq_stat		= 0;
-
-	(void) pci_read_config_byte(dev, irq_reg, &irq_stat);
-	/* clear the interrupt bit */
-	(void) pci_write_config_byte(dev, irq_reg, irq_stat | irq_mask);
-}
-
-static int cmd648_test_irq(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long base	= pci_resource_start(dev, 4);
-	u8 irq_mask		= hwif->channel ? MRDMODE_INTR_CH1 :
-						  MRDMODE_INTR_CH0;
-	u8 mrdmode		= inb(base + 1);
-
-	pr_debug("%s: mrdmode: 0x%02x irq_mask: 0x%02x\n",
-		 hwif->name, mrdmode, irq_mask);
-
-	return (mrdmode & irq_mask) ? 1 : 0;
-}
-
-static int cmd64x_test_irq(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	int irq_reg		= hwif->channel ? ARTTIM23 : CFR;
-	u8  irq_mask		= hwif->channel ? ARTTIM23_INTR_CH1 :
-						  CFR_INTR_CH0;
-	u8  irq_stat		= 0;
-
-	(void) pci_read_config_byte(dev, irq_reg, &irq_stat);
-
-	pr_debug("%s: irq_stat: 0x%02x irq_mask: 0x%02x\n",
-		 hwif->name, irq_stat, irq_mask);
-
-	return (irq_stat & irq_mask) ? 1 : 0;
-}
-
-/*
- * ASUS P55T2P4D with CMD646 chipset revision 0x01 requires the old
- * event order for DMA transfers.
- */
-
-static int cmd646_1_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 dma_stat = 0, dma_cmd = 0;
-
-	/* get DMA status */
-	dma_stat = inb(hwif->dma_base + ATA_DMA_STATUS);
-	/* read DMA command state */
-	dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
-	/* stop DMA */
-	outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD);
-	/* clear the INTR & ERROR bits */
-	outb(dma_stat | 6, hwif->dma_base + ATA_DMA_STATUS);
-	/* verify good DMA status */
-	return (dma_stat & 7) != 4;
-}
-
-static int init_chipset_cmd64x(struct pci_dev *dev)
-{
-	u8 mrdmode = 0;
-
-	/* Set a good latency timer and cache line size value. */
-	(void) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 64);
-	/* FIXME: pci_set_master() to ensure a good latency timer value */
-
-	/*
-	 * Enable interrupts, select MEMORY READ LINE for reads.
-	 *
-	 * NOTE: although not mentioned in the PCI0646U specs,
-	 * bits 0-1 are write only and won't be read back as
-	 * set or not -- PCI0646U2 specs clarify this point.
-	 */
-	(void) pci_read_config_byte (dev, MRDMODE, &mrdmode);
-	mrdmode &= ~0x30;
-	(void) pci_write_config_byte(dev, MRDMODE, (mrdmode | 0x02));
-
-	return 0;
-}
-
-static u8 cmd64x_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev  *dev	= to_pci_dev(hwif->dev);
-	u8 bmidecsr = 0, mask	= hwif->channel ? 0x02 : 0x01;
-
-	switch (dev->device) {
-	case PCI_DEVICE_ID_CMD_648:
-	case PCI_DEVICE_ID_CMD_649:
- 		pci_read_config_byte(dev, BMIDECSR, &bmidecsr);
-		return (bmidecsr & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
-	default:
-		return ATA_CBL_PATA40;
-	}
-}
-
-static const struct ide_port_ops cmd64x_port_ops = {
-	.set_pio_mode		= cmd64x_set_pio_mode,
-	.set_dma_mode		= cmd64x_set_dma_mode,
-	.clear_irq		= cmd64x_clear_irq,
-	.test_irq		= cmd64x_test_irq,
-	.cable_detect		= cmd64x_cable_detect,
-};
-
-static const struct ide_port_ops cmd648_port_ops = {
-	.set_pio_mode		= cmd64x_set_pio_mode,
-	.set_dma_mode		= cmd64x_set_dma_mode,
-	.clear_irq		= cmd648_clear_irq,
-	.test_irq		= cmd648_test_irq,
-	.cable_detect		= cmd64x_cable_detect,
-};
-
-static const struct ide_dma_ops cmd646_rev1_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= ide_dma_start,
-	.dma_end		= cmd646_1_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_port_info cmd64x_chipsets[] = {
-	{	/* 0: CMD643 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_cmd64x,
-		.enablebits	= {{0x00,0x00,0x00}, {0x51,0x08,0x08}},
-		.port_ops	= &cmd64x_port_ops,
-		.host_flags	= IDE_HFLAG_CLEAR_SIMPLEX |
-				  IDE_HFLAG_ABUSE_PREFETCH |
-				  IDE_HFLAG_SERIALIZE,
-		.pio_mask	= ATA_PIO5,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= 0x00, /* no udma */
-	},
-	{	/* 1: CMD646 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_cmd64x,
-		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
-		.port_ops	= &cmd648_port_ops,
-		.host_flags	= IDE_HFLAG_ABUSE_PREFETCH |
-				  IDE_HFLAG_SERIALIZE,
-		.pio_mask	= ATA_PIO5,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA2,
-	},
-	{	/* 2: CMD648 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_cmd64x,
-		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
-		.port_ops	= &cmd648_port_ops,
-		.host_flags	= IDE_HFLAG_ABUSE_PREFETCH,
-		.pio_mask	= ATA_PIO5,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA4,
-	},
-	{	/* 3: CMD649 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_cmd64x,
-		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
-		.port_ops	= &cmd648_port_ops,
-		.host_flags	= IDE_HFLAG_ABUSE_PREFETCH,
-		.pio_mask	= ATA_PIO5,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
-	}
-};
-
-static int cmd64x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct ide_port_info d;
-	u8 idx = id->driver_data;
-
-	d = cmd64x_chipsets[idx];
-
-	if (idx == 1) {
-		/*
-		 * UltraDMA only supported on PCI646U and PCI646U2, which
-		 * correspond to revisions 0x03, 0x05 and 0x07 respectively.
-		 * Actually, although the CMD tech support people won't
-		 * tell me the details, the 0x03 revision cannot support
-		 * UDMA correctly without hardware modifications, and even
-		 * then it only works with Quantum disks due to some
-		 * hold time assumptions in the 646U part which are fixed
-		 * in the 646U2.
-		 *
-		 * So we only do UltraDMA on revision 0x05 and 0x07 chipsets.
-		 */
-		if (dev->revision < 5) {
-			d.udma_mask = 0x00;
-			/*
-			 * The original PCI0646 didn't have the primary
-			 * channel enable bit, it appeared starting with
-			 * PCI0646U (i.e. revision ID 3).
-			 */
-			if (dev->revision < 3) {
-				d.enablebits[0].reg = 0;
-				d.port_ops = &cmd64x_port_ops;
-				if (dev->revision == 1)
-					d.dma_ops = &cmd646_rev1_dma_ops;
-			}
-		}
-	}
-
-	return ide_pci_init_one(dev, &d, NULL);
-}
-
-static const struct pci_device_id cmd64x_pci_tbl[] = {
-	{ PCI_VDEVICE(CMD, PCI_DEVICE_ID_CMD_643), 0 },
-	{ PCI_VDEVICE(CMD, PCI_DEVICE_ID_CMD_646), 1 },
-	{ PCI_VDEVICE(CMD, PCI_DEVICE_ID_CMD_648), 2 },
-	{ PCI_VDEVICE(CMD, PCI_DEVICE_ID_CMD_649), 3 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, cmd64x_pci_tbl);
-
-static struct pci_driver cmd64x_pci_driver = {
-	.name		= "CMD64x_IDE",
-	.id_table	= cmd64x_pci_tbl,
-	.probe		= cmd64x_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init cmd64x_ide_init(void)
-{
-	return ide_pci_register_driver(&cmd64x_pci_driver);
-}
-
-static void __exit cmd64x_ide_exit(void)
-{
-	pci_unregister_driver(&cmd64x_pci_driver);
-}
-
-module_init(cmd64x_ide_init);
-module_exit(cmd64x_ide_exit);
-
-MODULE_AUTHOR("Eddie Dost, David Miller, Andre Hedrick, Bartlomiej Zolnierkiewicz");
-MODULE_DESCRIPTION("PCI driver module for CMD64x IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/cs5520.c b/drivers/ide/cs5520.c
deleted file mode 100644
index 89a4ff100b7a..000000000000
--- a/drivers/ide/cs5520.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- *	IDE tuning and bus mastering support for the CS5510/CS5520
- *	chipsets
- *
- *	The CS5510/CS5520 are slightly unusual devices. Unlike the 
- *	typical IDE controllers they do bus mastering with the drive in
- *	PIO mode and smarter silicon.
- *
- *	The practical upshot of this is that we must always tune the
- *	drive for the right PIO mode. We must also ignore all the blacklists
- *	and the drive bus mastering DMA information.
- *
- *	*** This driver is strictly experimental ***
- *
- *	(c) Copyright Red Hat Inc 2002
- * 
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * For the avoidance of doubt the "preferred form" of this code is one which
- * is in an open non patent encumbered format. Where cryptographic key signing
- * forms part of the process of creating an executable the information
- * including keys needed to generate an equivalently functional executable
- * are deemed to be part of the source code.
- *
- */
- 
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/dma-mapping.h>
-
-#define DRV_NAME "cs5520"
-
-struct pio_clocks
-{
-	int address;
-	int assert;
-	int recovery;
-};
-
-static struct pio_clocks cs5520_pio_clocks[]={
-	{3, 6, 11},
-	{2, 5, 6},
-	{1, 4, 3},
-	{1, 3, 2},
-	{1, 2, 1}
-};
-
-static void cs5520_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	int controller = drive->dn > 1 ? 1 : 0;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	/* 8bit CAT/CRT - 8bit command timing for channel */
-	pci_write_config_byte(pdev, 0x62 + controller, 
-		(cs5520_pio_clocks[pio].recovery << 4) |
-		(cs5520_pio_clocks[pio].assert));
-
-	/* 0x64 - 16bit Primary, 0x68 - 16bit Secondary */
-
-	/* FIXME: should these use address ? */
-	/* Data read timing */
-	pci_write_config_byte(pdev, 0x64 + 4*controller + (drive->dn&1),
-		(cs5520_pio_clocks[pio].recovery << 4) |
-		(cs5520_pio_clocks[pio].assert));
-	/* Write command timing */
-	pci_write_config_byte(pdev, 0x66 + 4*controller + (drive->dn&1),
-		(cs5520_pio_clocks[pio].recovery << 4) |
-		(cs5520_pio_clocks[pio].assert));
-}
-
-static void cs5520_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	printk(KERN_ERR "cs55x0: bad ide timing.\n");
-
-	drive->pio_mode = XFER_PIO_0 + 0;
-	cs5520_set_pio_mode(hwif, drive);
-}
-
-static const struct ide_port_ops cs5520_port_ops = {
-	.set_pio_mode		= cs5520_set_pio_mode,
-	.set_dma_mode		= cs5520_set_dma_mode,
-};
-
-static const struct ide_port_info cyrix_chipset = {
-	.name		= DRV_NAME,
-	.enablebits	= { { 0x60, 0x01, 0x01 }, { 0x60, 0x02, 0x02 } },
-	.port_ops	= &cs5520_port_ops,
-	.host_flags	= IDE_HFLAG_ISA_PORTS | IDE_HFLAG_CS5520,
-	.pio_mask	= ATA_PIO4,
-};
-
-/*
- *	The 5510/5520 are a bit weird. They don't quite set up the way
- *	the PCI helper layer expects so we must do much of the set up 
- *	work longhand.
- */
- 
-static int cs5520_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	const struct ide_port_info *d = &cyrix_chipset;
-	struct ide_hw hw[2], *hws[] = { NULL, NULL };
-
-	ide_setup_pci_noise(dev, d);
-
-	/* We must not grab the entire device, it has 'ISA' space in its
-	 * BARS too and we will freak out other bits of the kernel
-	 */
-	if (pci_enable_device_io(dev)) {
-		printk(KERN_WARNING "%s: Unable to enable 55x0.\n", d->name);
-		return -ENODEV;
-	}
-	pci_set_master(dev);
-	if (dma_set_mask(&dev->dev, DMA_BIT_MASK(32))) {
-		printk(KERN_WARNING "%s: No suitable DMA available.\n",
-			d->name);
-		return -ENODEV;
-	}
-
-	/*
-	 *	Now the chipset is configured we can let the core
-	 *	do all the device setup for us
-	 */
-
-	ide_pci_setup_ports(dev, d, &hw[0], &hws[0]);
-	hw[0].irq = 14;
-	hw[1].irq = 15;
-
-	return ide_host_add(d, hws, 2, NULL);
-}
-
-static const struct pci_device_id cs5520_pci_tbl[] = {
-	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), 0 },
-	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), 1 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, cs5520_pci_tbl);
-
-static struct pci_driver cs5520_pci_driver = {
-	.name		= "Cyrix_IDE",
-	.id_table	= cs5520_pci_tbl,
-	.probe		= cs5520_init_one,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init cs5520_ide_init(void)
-{
-	return ide_pci_register_driver(&cs5520_pci_driver);
-}
-
-module_init(cs5520_ide_init);
-
-MODULE_AUTHOR("Alan Cox");
-MODULE_DESCRIPTION("PCI driver module for Cyrix 5510/5520 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/cs5530.c b/drivers/ide/cs5530.c
deleted file mode 100644
index 65371599b976..000000000000
--- a/drivers/ide/cs5530.c
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * Copyright (C) 2000			Andre Hedrick <andre@linux-ide.org>
- * Copyright (C) 2000			Mark Lord <mlord@pobox.com>
- * Copyright (C) 2007			Bartlomiej Zolnierkiewicz
- *
- * May be copied or modified under the terms of the GNU General Public License
- *
- * Development of this chipset driver was funded
- * by the nice folks at National Semiconductor.
- *
- * Documentation:
- *	CS5530 documentation available from National Semiconductor.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "cs5530"
-
-/*
- * Here are the standard PIO mode 0-4 timings for each "format".
- * Format-0 uses fast data reg timings, with slower command reg timings.
- * Format-1 uses fast timings for all registers, but won't work with all drives.
- */
-static unsigned int cs5530_pio_timings[2][5] = {
-	{0x00009172, 0x00012171, 0x00020080, 0x00032010, 0x00040010},
-	{0xd1329172, 0x71212171, 0x30200080, 0x20102010, 0x00100010}
-};
-
-/*
- * After chip reset, the PIO timings are set to 0x0000e132, which is not valid.
- */
-#define CS5530_BAD_PIO(timings) (((timings)&~0x80000000)==0x0000e132)
-#define CS5530_BASEREG(hwif)	(((hwif)->dma_base & ~0xf) + ((hwif)->channel ? 0x30 : 0x20))
-
-/**
- *	cs5530_set_pio_mode	-	set host controller for PIO mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Handles setting of PIO mode for the chipset.
- *
- *	The init_hwif_cs5530() routine guarantees that all drives
- *	will have valid default PIO timings set up before we get here.
- */
-
-static void cs5530_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	unsigned long basereg = CS5530_BASEREG(hwif);
-	unsigned int format = (inl(basereg + 4) >> 31) & 1;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	outl(cs5530_pio_timings[format][pio], basereg + ((drive->dn & 1)<<3));
-}
-
-/**
- *	cs5530_udma_filter	-	UDMA filter
- *	@drive: drive
- *
- *	cs5530_udma_filter() does UDMA mask filtering for the given drive
- *	taking into the consideration capabilities of the mate device.
- *
- *	The CS5530 specifies that two drives sharing a cable cannot mix
- *	UDMA/MDMA.  It has to be one or the other, for the pair, though
- *	different timings can still be chosen for each drive.  We could
- *	set the appropriate timing bits on the fly, but that might be
- *	a bit confusing.  So, for now we statically handle this requirement
- *	by looking at our mate drive to see what it is capable of, before
- *	choosing a mode for our own drive.
- *
- *	Note: This relies on the fact we never fail from UDMA to MWDMA2
- *	but instead drop to PIO.
- */
-
-static u8 cs5530_udma_filter(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	ide_drive_t *mate = ide_get_pair_dev(drive);
-	u16 *mateid;
-	u8 mask = hwif->ultra_mask;
-
-	if (mate == NULL)
-		goto out;
-	mateid = mate->id;
-
-	if (ata_id_has_dma(mateid) && __ide_dma_bad_drive(mate) == 0) {
-		if ((mateid[ATA_ID_FIELD_VALID] & 4) &&
-		    (mateid[ATA_ID_UDMA_MODES] & 7))
-			goto out;
-		if (mateid[ATA_ID_MWDMA_MODES] & 7)
-			mask = 0;
-	}
-out:
-	return mask;
-}
-
-static void cs5530_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	unsigned long basereg;
-	unsigned int reg, timings = 0;
-
-	switch (drive->dma_mode) {
-		case XFER_UDMA_0:	timings = 0x00921250; break;
-		case XFER_UDMA_1:	timings = 0x00911140; break;
-		case XFER_UDMA_2:	timings = 0x00911030; break;
-		case XFER_MW_DMA_0:	timings = 0x00077771; break;
-		case XFER_MW_DMA_1:	timings = 0x00012121; break;
-		case XFER_MW_DMA_2:	timings = 0x00002020; break;
-	}
-	basereg = CS5530_BASEREG(hwif);
-	reg = inl(basereg + 4);			/* get drive0 config register */
-	timings |= reg & 0x80000000;		/* preserve PIO format bit */
-	if ((drive-> dn & 1) == 0) {		/* are we configuring drive0? */
-		outl(timings, basereg + 4);	/* write drive0 config register */
-	} else {
-		if (timings & 0x00100000)
-			reg |=  0x00100000;	/* enable UDMA timings for both drives */
-		else
-			reg &= ~0x00100000;	/* disable UDMA timings for both drives */
-		outl(reg, basereg + 4);		/* write drive0 config register */
-		outl(timings, basereg + 12);	/* write drive1 config register */
-	}
-}
-
-/**
- *	init_chipset_5530	-	set up 5530 bridge
- *	@dev: PCI device
- *
- *	Initialize the cs5530 bridge for reliable IDE DMA operation.
- */
-
-static int init_chipset_cs5530(struct pci_dev *dev)
-{
-	struct pci_dev *master_0 = NULL, *cs5530_0 = NULL;
-
-	if (pci_resource_start(dev, 4) == 0)
-		return -EFAULT;
-
-	dev = NULL;
-	while ((dev = pci_get_device(PCI_VENDOR_ID_CYRIX, PCI_ANY_ID, dev)) != NULL) {
-		switch (dev->device) {
-			case PCI_DEVICE_ID_CYRIX_PCI_MASTER:
-				master_0 = pci_dev_get(dev);
-				break;
-			case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
-				cs5530_0 = pci_dev_get(dev);
-				break;
-		}
-	}
-	if (!master_0) {
-		printk(KERN_ERR DRV_NAME ": unable to locate PCI MASTER function\n");
-		goto out;
-	}
-	if (!cs5530_0) {
-		printk(KERN_ERR DRV_NAME ": unable to locate CS5530 LEGACY function\n");
-		goto out;
-	}
-
-	/*
-	 * Enable BusMaster and MemoryWriteAndInvalidate for the cs5530:
-	 * -->  OR 0x14 into 16-bit PCI COMMAND reg of function 0 of the cs5530
-	 */
-
-	pci_set_master(cs5530_0);
-	pci_try_set_mwi(cs5530_0);
-
-	/*
-	 * Set PCI CacheLineSize to 16-bytes:
-	 * --> Write 0x04 into 8-bit PCI CACHELINESIZE reg of function 0 of the cs5530
-	 */
-
-	pci_write_config_byte(cs5530_0, PCI_CACHE_LINE_SIZE, 0x04);
-
-	/*
-	 * Disable trapping of UDMA register accesses (Win98 hack):
-	 * --> Write 0x5006 into 16-bit reg at offset 0xd0 of function 0 of the cs5530
-	 */
-
-	pci_write_config_word(cs5530_0, 0xd0, 0x5006);
-
-	/*
-	 * Bit-1 at 0x40 enables MemoryWriteAndInvalidate on internal X-bus:
-	 * The other settings are what is necessary to get the register
-	 * into a sane state for IDE DMA operation.
-	 */
-
-	pci_write_config_byte(master_0, 0x40, 0x1e);
-
-	/* 
-	 * Set max PCI burst size (16-bytes seems to work best):
-	 *	   16bytes: set bit-1 at 0x41 (reg value of 0x16)
-	 *	all others: clear bit-1 at 0x41, and do:
-	 *	  128bytes: OR 0x00 at 0x41
-	 *	  256bytes: OR 0x04 at 0x41
-	 *	  512bytes: OR 0x08 at 0x41
-	 *	 1024bytes: OR 0x0c at 0x41
-	 */
-
-	pci_write_config_byte(master_0, 0x41, 0x14);
-
-	/*
-	 * These settings are necessary to get the chip
-	 * into a sane state for IDE DMA operation.
-	 */
-
-	pci_write_config_byte(master_0, 0x42, 0x00);
-	pci_write_config_byte(master_0, 0x43, 0xc1);
-
-out:
-	pci_dev_put(master_0);
-	pci_dev_put(cs5530_0);
-	return 0;
-}
-
-/**
- *	init_hwif_cs5530	-	initialise an IDE channel
- *	@hwif: IDE to initialize
- *
- *	This gets invoked by the IDE driver once for each channel. It
- *	performs channel-specific pre-initialization before drive probing.
- */
-
-static void init_hwif_cs5530 (ide_hwif_t *hwif)
-{
-	unsigned long basereg;
-	u32 d0_timings;
-
-	basereg = CS5530_BASEREG(hwif);
-	d0_timings = inl(basereg + 0);
-	if (CS5530_BAD_PIO(d0_timings))
-		outl(cs5530_pio_timings[(d0_timings >> 31) & 1][0], basereg + 0);
-	if (CS5530_BAD_PIO(inl(basereg + 8)))
-		outl(cs5530_pio_timings[(d0_timings >> 31) & 1][0], basereg + 8);
-}
-
-static const struct ide_port_ops cs5530_port_ops = {
-	.set_pio_mode		= cs5530_set_pio_mode,
-	.set_dma_mode		= cs5530_set_dma_mode,
-	.udma_filter		= cs5530_udma_filter,
-};
-
-static const struct ide_port_info cs5530_chipset = {
-	.name		= DRV_NAME,
-	.init_chipset	= init_chipset_cs5530,
-	.init_hwif	= init_hwif_cs5530,
-	.port_ops	= &cs5530_port_ops,
-	.host_flags	= IDE_HFLAG_SERIALIZE |
-			  IDE_HFLAG_POST_SET_MODE,
-	.pio_mask	= ATA_PIO4,
-	.mwdma_mask	= ATA_MWDMA2,
-	.udma_mask	= ATA_UDMA2,
-};
-
-static int cs5530_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &cs5530_chipset, NULL);
-}
-
-static const struct pci_device_id cs5530_pci_tbl[] = {
-	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_IDE), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, cs5530_pci_tbl);
-
-static struct pci_driver cs5530_pci_driver = {
-	.name		= "CS5530 IDE",
-	.id_table	= cs5530_pci_tbl,
-	.probe		= cs5530_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init cs5530_ide_init(void)
-{
-	return ide_pci_register_driver(&cs5530_pci_driver);
-}
-
-static void __exit cs5530_ide_exit(void)
-{
-	pci_unregister_driver(&cs5530_pci_driver);
-}
-
-module_init(cs5530_ide_init);
-module_exit(cs5530_ide_exit);
-
-MODULE_AUTHOR("Mark Lord");
-MODULE_DESCRIPTION("PCI driver module for Cyrix/NS 5530 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/cs5535.c b/drivers/ide/cs5535.c
deleted file mode 100644
index 70fdbe3161f8..000000000000
--- a/drivers/ide/cs5535.c
+++ /dev/null
@@ -1,216 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2004-2005 Advanced Micro Devices, Inc.
- * Copyright (C)      2007 Bartlomiej Zolnierkiewicz
- *
- * History:
- * 09/20/2005 - Jaya Kumar <jayakumar.ide@gmail.com>
- * - Reworked tuneproc, set_drive, misc mods to prep for mainline
- * - Work was sponsored by CIS (M) Sdn Bhd.
- * Ported to Kernel 2.6.11 on June 26, 2005 by
- *   Wolfgang Zuleger <wolfgang.zuleger@gmx.de>
- *   Alexander Kiausch <alex.kiausch@t-online.de>
- * Originally developed by AMD for 2.4/2.6
- *
- * Development of this chipset driver was funded
- * by the nice folks at National Semiconductor/AMD.
- *
- * Documentation:
- *  CS5535 documentation available from AMD
- */
-
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-
-#define DRV_NAME "cs5535"
-
-#define MSR_ATAC_BASE		0x51300000
-#define ATAC_GLD_MSR_CAP	(MSR_ATAC_BASE+0)
-#define ATAC_GLD_MSR_CONFIG	(MSR_ATAC_BASE+0x01)
-#define ATAC_GLD_MSR_SMI	(MSR_ATAC_BASE+0x02)
-#define ATAC_GLD_MSR_ERROR	(MSR_ATAC_BASE+0x03)
-#define ATAC_GLD_MSR_PM		(MSR_ATAC_BASE+0x04)
-#define ATAC_GLD_MSR_DIAG	(MSR_ATAC_BASE+0x05)
-#define ATAC_IO_BAR		(MSR_ATAC_BASE+0x08)
-#define ATAC_RESET		(MSR_ATAC_BASE+0x10)
-#define ATAC_CH0D0_PIO		(MSR_ATAC_BASE+0x20)
-#define ATAC_CH0D0_DMA		(MSR_ATAC_BASE+0x21)
-#define ATAC_CH0D1_PIO		(MSR_ATAC_BASE+0x22)
-#define ATAC_CH0D1_DMA		(MSR_ATAC_BASE+0x23)
-#define ATAC_PCI_ABRTERR	(MSR_ATAC_BASE+0x24)
-#define ATAC_BM0_CMD_PRIM	0x00
-#define ATAC_BM0_STS_PRIM	0x02
-#define ATAC_BM0_PRD		0x04
-#define CS5535_CABLE_DETECT	0x48
-
-/* Format I PIO settings. We separate out cmd and data for safer timings */
-
-static unsigned int cs5535_pio_cmd_timings[5] =
-{ 0xF7F4, 0x53F3, 0x13F1, 0x5131, 0x1131 };
-static unsigned int cs5535_pio_dta_timings[5] =
-{ 0xF7F4, 0xF173, 0x8141, 0x5131, 0x1131 };
-
-static unsigned int cs5535_mwdma_timings[3] =
-{ 0x7F0FFFF3, 0x7F035352, 0x7f024241 };
-
-static unsigned int cs5535_udma_timings[5] =
-{ 0x7F7436A1, 0x7F733481, 0x7F723261, 0x7F713161, 0x7F703061 };
-
-/* Macros to check if the register is the reset value -  reset value is an
-   invalid timing and indicates the register has not been set previously */
-
-#define CS5535_BAD_PIO(timings) ( (timings&~0x80000000UL) == 0x00009172 )
-#define CS5535_BAD_DMA(timings) ( (timings & 0x000FFFFF) == 0x00077771 )
-
-/****
- *	cs5535_set_speed         -     Configure the chipset to the new speed
- *	@drive: Drive to set up
- *	@speed: desired speed
- *
- *	cs5535_set_speed() configures the chipset to a new speed.
- */
-static void cs5535_set_speed(ide_drive_t *drive, const u8 speed)
-{
-	u32 reg = 0, dummy;
-	u8 unit = drive->dn & 1;
-
-	/* Set the PIO timings */
-	if (speed < XFER_SW_DMA_0) {
-		ide_drive_t *pair = ide_get_pair_dev(drive);
-		u8 cmd, pioa;
-
-		cmd = pioa = speed - XFER_PIO_0;
-
-		if (pair) {
-			u8 piob = pair->pio_mode - XFER_PIO_0;
-
-			if (piob < cmd)
-				cmd = piob;
-		}
-
-		/* Write the speed of the current drive */
-		reg = (cs5535_pio_cmd_timings[cmd] << 16) |
-			cs5535_pio_dta_timings[pioa];
-		wrmsr(unit ? ATAC_CH0D1_PIO : ATAC_CH0D0_PIO, reg, 0);
-
-		/* And if nessesary - change the speed of the other drive */
-		rdmsr(unit ?  ATAC_CH0D0_PIO : ATAC_CH0D1_PIO, reg, dummy);
-
-		if (((reg >> 16) & cs5535_pio_cmd_timings[cmd]) !=
-			cs5535_pio_cmd_timings[cmd]) {
-			reg &= 0x0000FFFF;
-			reg |= cs5535_pio_cmd_timings[cmd] << 16;
-			wrmsr(unit ? ATAC_CH0D0_PIO : ATAC_CH0D1_PIO, reg, 0);
-		}
-
-		/* Set bit 31 of the DMA register for PIO format 1 timings */
-		rdmsr(unit ?  ATAC_CH0D1_DMA : ATAC_CH0D0_DMA, reg, dummy);
-		wrmsr(unit ? ATAC_CH0D1_DMA : ATAC_CH0D0_DMA,
-					reg | 0x80000000UL, 0);
-	} else {
-		rdmsr(unit ? ATAC_CH0D1_DMA : ATAC_CH0D0_DMA, reg, dummy);
-
-		reg &= 0x80000000UL;  /* Preserve the PIO format bit */
-
-		if (speed >= XFER_UDMA_0 && speed <= XFER_UDMA_4)
-			reg |= cs5535_udma_timings[speed - XFER_UDMA_0];
-		else if (speed >= XFER_MW_DMA_0 && speed <= XFER_MW_DMA_2)
-			reg |= cs5535_mwdma_timings[speed - XFER_MW_DMA_0];
-		else
-			return;
-
-		wrmsr(unit ? ATAC_CH0D1_DMA : ATAC_CH0D0_DMA, reg, 0);
-	}
-}
-
-/**
- *	cs5535_set_dma_mode	-	set host controller for DMA mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Programs the chipset for DMA mode.
- */
-
-static void cs5535_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	cs5535_set_speed(drive, drive->dma_mode);
-}
-
-/**
- *	cs5535_set_pio_mode	-	set host controller for PIO mode
- *	@hwif: port
- *	@drive: drive
- *
- *	A callback from the upper layers for PIO-only tuning.
- */
-
-static void cs5535_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	cs5535_set_speed(drive, drive->pio_mode);
-}
-
-static u8 cs5535_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u8 bit;
-
-	/* if a 80 wire cable was detected */
-	pci_read_config_byte(dev, CS5535_CABLE_DETECT, &bit);
-
-	return (bit & 1) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
-}
-
-static const struct ide_port_ops cs5535_port_ops = {
-	.set_pio_mode		= cs5535_set_pio_mode,
-	.set_dma_mode		= cs5535_set_dma_mode,
-	.cable_detect		= cs5535_cable_detect,
-};
-
-static const struct ide_port_info cs5535_chipset = {
-	.name		= DRV_NAME,
-	.port_ops	= &cs5535_port_ops,
-	.host_flags	= IDE_HFLAG_SINGLE | IDE_HFLAG_POST_SET_MODE,
-	.pio_mask	= ATA_PIO4,
-	.mwdma_mask	= ATA_MWDMA2,
-	.udma_mask	= ATA_UDMA4,
-};
-
-static int cs5535_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &cs5535_chipset, NULL);
-}
-
-static const struct pci_device_id cs5535_pci_tbl[] = {
-	{ PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_CS5535_IDE), 0 },
-	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5535_IDE), },
-	{ 0, },
-};
-
-MODULE_DEVICE_TABLE(pci, cs5535_pci_tbl);
-
-static struct pci_driver cs5535_pci_driver = {
-	.name		= "CS5535_IDE",
-	.id_table	= cs5535_pci_tbl,
-	.probe		= cs5535_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init cs5535_ide_init(void)
-{
-	return ide_pci_register_driver(&cs5535_pci_driver);
-}
-
-static void __exit cs5535_ide_exit(void)
-{
-	pci_unregister_driver(&cs5535_pci_driver);
-}
-
-module_init(cs5535_ide_init);
-module_exit(cs5535_ide_exit);
-
-MODULE_AUTHOR("AMD");
-MODULE_DESCRIPTION("PCI driver module for AMD/NS CS5535 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/cs5536.c b/drivers/ide/cs5536.c
deleted file mode 100644
index 8b5ca145191b..000000000000
--- a/drivers/ide/cs5536.c
+++ /dev/null
@@ -1,294 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * CS5536 PATA support
- * (C) 2007 Martin K. Petersen <mkp@mkp.net>
- * (C) 2009 Bartlomiej Zolnierkiewicz
- *
- * Documentation:
- *	Available from AMD web site.
- *
- * The IDE timing registers for the CS5536 live in the Geode Machine
- * Specific Register file and not PCI config space.  Most BIOSes
- * virtualize the PCI registers so the chip looks like a standard IDE
- * controller.  Unfortunately not all implementations get this right.
- * In particular some have problems with unaligned accesses to the
- * virtualized PCI registers.  This driver always does full dword
- * writes to work around the issue.  Also, in case of a bad BIOS this
- * driver can be loaded with the "msr=1" parameter which forces using
- * the Machine Specific Registers to configure the device.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-#include <asm/msr.h>
-
-#define DRV_NAME	"cs5536"
-
-enum {
-	MSR_IDE_CFG		= 0x51300010,
-	PCI_IDE_CFG		= 0x40,
-
-	CFG			= 0,
-	DTC			= 2,
-	CAST			= 3,
-	ETC			= 4,
-
-	IDE_CFG_CHANEN		= (1 << 1),
-	IDE_CFG_CABLE		= (1 << 17) | (1 << 16),
-
-	IDE_D0_SHIFT		= 24,
-	IDE_D1_SHIFT		= 16,
-	IDE_DRV_MASK		= 0xff,
-
-	IDE_CAST_D0_SHIFT	= 6,
-	IDE_CAST_D1_SHIFT	= 4,
-	IDE_CAST_DRV_MASK	= 0x3,
-
-	IDE_CAST_CMD_SHIFT	= 24,
-	IDE_CAST_CMD_MASK	= 0xff,
-
-	IDE_ETC_UDMA_MASK	= 0xc0,
-};
-
-static int use_msr;
-
-static int cs5536_read(struct pci_dev *pdev, int reg, u32 *val)
-{
-	if (unlikely(use_msr)) {
-		u32 dummy;
-
-		rdmsr(MSR_IDE_CFG + reg, *val, dummy);
-		return 0;
-	}
-
-	return pci_read_config_dword(pdev, PCI_IDE_CFG + reg * 4, val);
-}
-
-static int cs5536_write(struct pci_dev *pdev, int reg, int val)
-{
-	if (unlikely(use_msr)) {
-		wrmsr(MSR_IDE_CFG + reg, val, 0);
-		return 0;
-	}
-
-	return pci_write_config_dword(pdev, PCI_IDE_CFG + reg * 4, val);
-}
-
-static void cs5536_program_dtc(ide_drive_t *drive, u8 tim)
-{
-	struct pci_dev *pdev = to_pci_dev(drive->hwif->dev);
-	int dshift = (drive->dn & 1) ? IDE_D1_SHIFT : IDE_D0_SHIFT;
-	u32 dtc;
-
-	cs5536_read(pdev, DTC, &dtc);
-	dtc &= ~(IDE_DRV_MASK << dshift);
-	dtc |= tim << dshift;
-	cs5536_write(pdev, DTC, dtc);
-}
-
-/**
- *	cs5536_cable_detect	-	detect cable type
- *	@hwif: Port to detect on
- *
- *	Perform cable detection for ATA66 capable cable.
- *
- *	Returns a cable type.
- */
-
-static u8 cs5536_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	u32 cfg;
-
-	cs5536_read(pdev, CFG, &cfg);
-
-	if (cfg & IDE_CFG_CABLE)
-		return ATA_CBL_PATA80;
-	else
-		return ATA_CBL_PATA40;
-}
-
-/**
- *	cs5536_set_pio_mode		-	PIO timing setup
- *	@hwif: ATA port
- *	@drive: ATA device
- */
-
-static void cs5536_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	static const u8 drv_timings[5] = {
-		0x98, 0x55, 0x32, 0x21, 0x20,
-	};
-
-	static const u8 addr_timings[5] = {
-		0x2, 0x1, 0x0, 0x0, 0x0,
-	};
-
-	static const u8 cmd_timings[5] = {
-		0x99, 0x92, 0x90, 0x22, 0x20,
-	};
-
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	ide_drive_t *pair = ide_get_pair_dev(drive);
-	int cshift = (drive->dn & 1) ? IDE_CAST_D1_SHIFT : IDE_CAST_D0_SHIFT;
-	unsigned long timings = (unsigned long)ide_get_drivedata(drive);
-	u32 cast;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-	u8 cmd_pio = pio;
-
-	if (pair)
-		cmd_pio = min_t(u8, pio, pair->pio_mode - XFER_PIO_0);
-
-	timings &= (IDE_DRV_MASK << 8);
-	timings |= drv_timings[pio];
-	ide_set_drivedata(drive, (void *)timings);
-
-	cs5536_program_dtc(drive, drv_timings[pio]);
-
-	cs5536_read(pdev, CAST, &cast);
-
-	cast &= ~(IDE_CAST_DRV_MASK << cshift);
-	cast |= addr_timings[pio] << cshift;
-
-	cast &= ~(IDE_CAST_CMD_MASK << IDE_CAST_CMD_SHIFT);
-	cast |= cmd_timings[cmd_pio] << IDE_CAST_CMD_SHIFT;
-
-	cs5536_write(pdev, CAST, cast);
-}
-
-/**
- *	cs5536_set_dma_mode		-	DMA timing setup
- *	@hwif: ATA port
- *	@drive: ATA device
- */
-
-static void cs5536_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	static const u8 udma_timings[6] = {
-		0xc2, 0xc1, 0xc0, 0xc4, 0xc5, 0xc6,
-	};
-
-	static const u8 mwdma_timings[3] = {
-		0x67, 0x21, 0x20,
-	};
-
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	int dshift = (drive->dn & 1) ? IDE_D1_SHIFT : IDE_D0_SHIFT;
-	unsigned long timings = (unsigned long)ide_get_drivedata(drive);
-	u32 etc;
-	const u8 mode = drive->dma_mode;
-
-	cs5536_read(pdev, ETC, &etc);
-
-	if (mode >= XFER_UDMA_0) {
-		etc &= ~(IDE_DRV_MASK << dshift);
-		etc |= udma_timings[mode - XFER_UDMA_0] << dshift;
-	} else { /* MWDMA */
-		etc &= ~(IDE_ETC_UDMA_MASK << dshift);
-		timings &= IDE_DRV_MASK;
-		timings |= mwdma_timings[mode - XFER_MW_DMA_0] << 8;
-		ide_set_drivedata(drive, (void *)timings);
-	}
-
-	cs5536_write(pdev, ETC, etc);
-}
-
-static void cs5536_dma_start(ide_drive_t *drive)
-{
-	unsigned long timings = (unsigned long)ide_get_drivedata(drive);
-
-	if (drive->current_speed < XFER_UDMA_0 &&
-	    (timings >> 8) != (timings & IDE_DRV_MASK))
-		cs5536_program_dtc(drive, timings >> 8);
-
-	ide_dma_start(drive);
-}
-
-static int cs5536_dma_end(ide_drive_t *drive)
-{
-	int ret = ide_dma_end(drive);
-	unsigned long timings = (unsigned long)ide_get_drivedata(drive);
-
-	if (drive->current_speed < XFER_UDMA_0 &&
-	    (timings >> 8) != (timings & IDE_DRV_MASK))
-		cs5536_program_dtc(drive, timings & IDE_DRV_MASK);
-
-	return ret;
-}
-
-static const struct ide_port_ops cs5536_port_ops = {
-	.set_pio_mode		= cs5536_set_pio_mode,
-	.set_dma_mode		= cs5536_set_dma_mode,
-	.cable_detect		= cs5536_cable_detect,
-};
-
-static const struct ide_dma_ops cs5536_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= cs5536_dma_start,
-	.dma_end		= cs5536_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_port_info cs5536_info = {
-	.name		= DRV_NAME,
-	.port_ops	= &cs5536_port_ops,
-	.dma_ops	= &cs5536_dma_ops,
-	.host_flags	= IDE_HFLAG_SINGLE,
-	.pio_mask	= ATA_PIO4,
-	.mwdma_mask	= ATA_MWDMA2,
-	.udma_mask	= ATA_UDMA5,
-};
-
-/**
- *	cs5536_init_one
- *	@dev: PCI device
- *	@id: Entry in match table
- */
-
-static int cs5536_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	u32 cfg;
-
-	if (use_msr)
-		printk(KERN_INFO DRV_NAME ": Using MSR regs instead of PCI\n");
-
-	cs5536_read(dev, CFG, &cfg);
-
-	if ((cfg & IDE_CFG_CHANEN) == 0) {
-		printk(KERN_ERR DRV_NAME ": disabled by BIOS\n");
-		return -ENODEV;
-	}
-
-	return ide_pci_init_one(dev, &cs5536_info, NULL);
-}
-
-static const struct pci_device_id cs5536_pci_tbl[] = {
-	{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_IDE), },
-	{ },
-};
-
-static struct pci_driver cs5536_pci_driver = {
-	.name		= DRV_NAME,
-	.id_table	= cs5536_pci_tbl,
-	.probe		= cs5536_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-module_pci_driver(cs5536_pci_driver);
-
-MODULE_AUTHOR("Martin K. Petersen, Bartlomiej Zolnierkiewicz");
-MODULE_DESCRIPTION("low-level driver for the CS5536 IDE controller");
-MODULE_LICENSE("GPL");
-MODULE_DEVICE_TABLE(pci, cs5536_pci_tbl);
-
-module_param_named(msr, use_msr, int, 0644);
-MODULE_PARM_DESC(msr, "Force using MSR to configure IDE function (Default: 0)");
diff --git a/drivers/ide/cy82c693.c b/drivers/ide/cy82c693.c
deleted file mode 100644
index bc01660ee8fd..000000000000
--- a/drivers/ide/cy82c693.c
+++ /dev/null
@@ -1,234 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1998-2000 Andreas S. Krebs (akrebs@altavista.net), Maintainer
- *  Copyright (C) 1998-2002 Andre Hedrick <andre@linux-ide.org>, Integrator
- *  Copyright (C) 2007-2011 Bartlomiej Zolnierkiewicz
- *
- * CYPRESS CY82C693 chipset IDE controller
- *
- * The CY82C693 chipset is used on Digital's PC-Alpha 164SX boards.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "cy82c693"
-
-/*
- *	NOTE: the value for busmaster timeout is tricky and I got it by
- *	trial and error!  By using a to low value will cause DMA timeouts
- *	and drop IDE performance, and by using a to high value will cause
- *	audio playback to scatter.
- *	If you know a better value or how to calc it, please let me know.
- */
-
-/* twice the value written in cy82c693ub datasheet */
-#define BUSMASTER_TIMEOUT	0x50
-/*
- * the value above was tested on my machine and it seems to work okay
- */
-
-/* here are the offset definitions for the registers */
-#define CY82_IDE_CMDREG		0x04
-#define CY82_IDE_ADDRSETUP	0x48
-#define CY82_IDE_MASTER_IOR	0x4C
-#define CY82_IDE_MASTER_IOW	0x4D
-#define CY82_IDE_SLAVE_IOR	0x4E
-#define CY82_IDE_SLAVE_IOW	0x4F
-#define CY82_IDE_MASTER_8BIT	0x50
-#define CY82_IDE_SLAVE_8BIT	0x51
-
-#define CY82_INDEX_PORT		0x22
-#define CY82_DATA_PORT		0x23
-
-#define CY82_INDEX_CHANNEL0	0x30
-#define CY82_INDEX_CHANNEL1	0x31
-#define CY82_INDEX_TIMEOUT	0x32
-
-/*
- * set DMA mode a specific channel for CY82C693
- */
-
-static void cy82c693_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	const u8 mode = drive->dma_mode;
-	u8 single = (mode & 0x10) >> 4, index = 0, data = 0;
-
-	index = hwif->channel ? CY82_INDEX_CHANNEL1 : CY82_INDEX_CHANNEL0;
-
-	data = (mode & 3) | (single << 2);
-
-	outb(index, CY82_INDEX_PORT);
-	outb(data, CY82_DATA_PORT);
-
-	/*
-	 * note: below we set the value for Bus Master IDE TimeOut Register
-	 * I'm not absolutely sure what this does, but it solved my problem
-	 * with IDE DMA and sound, so I now can play sound and work with
-	 * my IDE driver at the same time :-)
-	 *
-	 * If you know the correct (best) value for this register please
-	 * let me know - ASK
-	 */
-
-	data = BUSMASTER_TIMEOUT;
-	outb(CY82_INDEX_TIMEOUT, CY82_INDEX_PORT);
-	outb(data, CY82_DATA_PORT);
-}
-
-static void cy82c693_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	int bus_speed = ide_pci_clk ? ide_pci_clk : 33;
-	const unsigned long T = 1000000 / bus_speed;
-	unsigned int addrCtrl;
-	struct ide_timing t;
-	u8 time_16, time_8;
-
-	/* select primary or secondary channel */
-	if (drive->dn > 1) {  /* drive is on the secondary channel */
-		dev = pci_get_slot(dev->bus, dev->devfn+1);
-		if (!dev) {
-			printk(KERN_ERR "%s: tune_drive: "
-				"Cannot find secondary interface!\n",
-				drive->name);
-			return;
-		}
-	}
-
-	ide_timing_compute(drive, drive->pio_mode, &t, T, 1);
-
-	time_16 = clamp_val(t.recover - 1, 0, 15) |
-		  (clamp_val(t.active - 1, 0, 15) << 4);
-	time_8 = clamp_val(t.act8b - 1, 0, 15) |
-		 (clamp_val(t.rec8b - 1, 0, 15) << 4);
-
-	/* now let's write  the clocks registers */
-	if ((drive->dn & 1) == 0) {
-		/*
-		 * set master drive
-		 * address setup control register
-		 * is 32 bit !!!
-		 */
-		pci_read_config_dword(dev, CY82_IDE_ADDRSETUP, &addrCtrl);
-
-		addrCtrl &= (~0xF);
-		addrCtrl |= clamp_val(t.setup - 1, 0, 15);
-		pci_write_config_dword(dev, CY82_IDE_ADDRSETUP, addrCtrl);
-
-		/* now let's set the remaining registers */
-		pci_write_config_byte(dev, CY82_IDE_MASTER_IOR, time_16);
-		pci_write_config_byte(dev, CY82_IDE_MASTER_IOW, time_16);
-		pci_write_config_byte(dev, CY82_IDE_MASTER_8BIT, time_8);
-	} else {
-		/*
-		 * set slave drive
-		 * address setup control register
-		 * is 32 bit !!!
-		 */
-		pci_read_config_dword(dev, CY82_IDE_ADDRSETUP, &addrCtrl);
-
-		addrCtrl &= (~0xF0);
-		addrCtrl |= (clamp_val(t.setup - 1, 0, 15) << 4);
-		pci_write_config_dword(dev, CY82_IDE_ADDRSETUP, addrCtrl);
-
-		/* now let's set the remaining registers */
-		pci_write_config_byte(dev, CY82_IDE_SLAVE_IOR, time_16);
-		pci_write_config_byte(dev, CY82_IDE_SLAVE_IOW, time_16);
-		pci_write_config_byte(dev, CY82_IDE_SLAVE_8BIT, time_8);
-	}
-	if (drive->dn > 1)
-		pci_dev_put(dev);
-}
-
-static void init_iops_cy82c693(ide_hwif_t *hwif)
-{
-	static ide_hwif_t *primary;
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-
-	if (PCI_FUNC(dev->devfn) == 1)
-		primary = hwif;
-	else {
-		hwif->mate = primary;
-		hwif->channel = 1;
-	}
-}
-
-static const struct ide_port_ops cy82c693_port_ops = {
-	.set_pio_mode		= cy82c693_set_pio_mode,
-	.set_dma_mode		= cy82c693_set_dma_mode,
-};
-
-static const struct ide_port_info cy82c693_chipset = {
-	.name		= DRV_NAME,
-	.init_iops	= init_iops_cy82c693,
-	.port_ops	= &cy82c693_port_ops,
-	.host_flags	= IDE_HFLAG_SINGLE,
-	.pio_mask	= ATA_PIO4,
-	.swdma_mask	= ATA_SWDMA2,
-	.mwdma_mask	= ATA_MWDMA2,
-};
-
-static int cy82c693_init_one(struct pci_dev *dev,
-			     const struct pci_device_id *id)
-{
-	struct pci_dev *dev2;
-	int ret = -ENODEV;
-
-	/* CY82C693 is more than only a IDE controller.
-	   Function 1 is primary IDE channel, function 2 - secondary. */
-	if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE &&
-	    PCI_FUNC(dev->devfn) == 1) {
-		dev2 = pci_get_slot(dev->bus, dev->devfn + 1);
-		ret = ide_pci_init_two(dev, dev2, &cy82c693_chipset, NULL);
-		if (ret)
-			pci_dev_put(dev2);
-	}
-	return ret;
-}
-
-static void cy82c693_remove(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct pci_dev *dev2 = host->dev[1] ? to_pci_dev(host->dev[1]) : NULL;
-
-	ide_pci_remove(dev);
-	pci_dev_put(dev2);
-}
-
-static const struct pci_device_id cy82c693_pci_tbl[] = {
-	{ PCI_VDEVICE(CONTAQ, PCI_DEVICE_ID_CONTAQ_82C693), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, cy82c693_pci_tbl);
-
-static struct pci_driver cy82c693_pci_driver = {
-	.name		= "Cypress_IDE",
-	.id_table	= cy82c693_pci_tbl,
-	.probe		= cy82c693_init_one,
-	.remove		= cy82c693_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init cy82c693_ide_init(void)
-{
-	return ide_pci_register_driver(&cy82c693_pci_driver);
-}
-
-static void __exit cy82c693_ide_exit(void)
-{
-	pci_unregister_driver(&cy82c693_pci_driver);
-}
-
-module_init(cy82c693_ide_init);
-module_exit(cy82c693_ide_exit);
-
-MODULE_AUTHOR("Andreas Krebs, Andre Hedrick, Bartlomiej Zolnierkiewicz");
-MODULE_DESCRIPTION("PCI driver module for the Cypress CY82C693 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/delkin_cb.c b/drivers/ide/delkin_cb.c
deleted file mode 100644
index 300daabaa575..000000000000
--- a/drivers/ide/delkin_cb.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- *  Created 20 Oct 2004 by Mark Lord
- *
- *  Basic support for Delkin/ASKA/Workbit Cardbus CompactFlash adapter
- *
- *  Modeled after the 16-bit PCMCIA driver: ide-cs.c
- *
- *  This is slightly peculiar, in that it is a PCI driver,
- *  but is NOT an IDE PCI driver -- the IDE layer does not directly
- *  support hot insertion/removal of PCI interfaces, so this driver
- *  is unable to use the IDE PCI interfaces.  Instead, it uses the
- *  same interfaces as the ide-cs (PCMCIA) driver uses.
- *  On the plus side, the driver is also smaller/simpler this way.
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of this archive for
- *  more details.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-
-#include <asm/io.h>
-
-/*
- * No chip documentation has yet been found,
- * so these configuration values were pulled from
- * a running Win98 system using "debug".
- * This gives around 3MByte/second read performance,
- * which is about 2/3 of what the chip is capable of.
- *
- * There is also a 4KByte mmio region on the card,
- * but its purpose has yet to be reverse-engineered.
- */
-static const u8 setup[] = {
-	0x00, 0x05, 0xbe, 0x01, 0x20, 0x8f, 0x00, 0x00,
-	0xa4, 0x1f, 0xb3, 0x1b, 0x00, 0x00, 0x00, 0x80,
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x00, 0xa4, 0x83, 0x02, 0x13,
-};
-
-static const struct ide_port_ops delkin_cb_port_ops = {
-	.quirkproc		= ide_undecoded_slave,
-};
-
-static int delkin_cb_init_chipset(struct pci_dev *dev)
-{
-	unsigned long base = pci_resource_start(dev, 0);
-	int i;
-
-	outb(0x02, base + 0x1e);	/* set nIEN to block interrupts */
-	inb(base + 0x17);		/* read status to clear interrupts */
-
-	for (i = 0; i < sizeof(setup); ++i) {
-		if (setup[i])
-			outb(setup[i], base + i);
-	}
-
-	return 0;
-}
-
-static const struct ide_port_info delkin_cb_port_info = {
-	.port_ops		= &delkin_cb_port_ops,
-	.host_flags		= IDE_HFLAG_IO_32BIT | IDE_HFLAG_UNMASK_IRQS |
-				  IDE_HFLAG_NO_DMA,
-	.irq_flags		= IRQF_SHARED,
-	.init_chipset		= delkin_cb_init_chipset,
-	.chipset		= ide_pci,
-};
-
-static int delkin_cb_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct ide_host *host;
-	unsigned long base;
-	int rc;
-	struct ide_hw hw, *hws[] = { &hw };
-
-	rc = pci_enable_device(dev);
-	if (rc) {
-		printk(KERN_ERR "delkin_cb: pci_enable_device failed (%d)\n", rc);
-		return rc;
-	}
-	rc = pci_request_regions(dev, "delkin_cb");
-	if (rc) {
-		printk(KERN_ERR "delkin_cb: pci_request_regions failed (%d)\n", rc);
-		pci_disable_device(dev);
-		return rc;
-	}
-	base = pci_resource_start(dev, 0);
-
-	delkin_cb_init_chipset(dev);
-
-	memset(&hw, 0, sizeof(hw));
-	ide_std_init_ports(&hw, base + 0x10, base + 0x1e);
-	hw.irq = dev->irq;
-	hw.dev = &dev->dev;
-
-	rc = ide_host_add(&delkin_cb_port_info, hws, 1, &host);
-	if (rc)
-		goto out_disable;
-
-	pci_set_drvdata(dev, host);
-
-	return 0;
-
-out_disable:
-	pci_release_regions(dev);
-	pci_disable_device(dev);
-	return rc;
-}
-
-static void
-delkin_cb_remove (struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-
-	ide_host_remove(host);
-
-	pci_release_regions(dev);
-	pci_disable_device(dev);
-}
-
-#ifdef CONFIG_PM
-static int delkin_cb_suspend(struct pci_dev *dev, pm_message_t state)
-{
-	pci_save_state(dev);
-	pci_disable_device(dev);
-	pci_set_power_state(dev, pci_choose_state(dev, state));
-
-	return 0;
-}
-
-static int delkin_cb_resume(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	int rc;
-
-	pci_set_power_state(dev, PCI_D0);
-
-	rc = pci_enable_device(dev);
-	if (rc)
-		return rc;
-
-	pci_restore_state(dev);
-	pci_set_master(dev);
-
-	if (host->init_chipset)
-		host->init_chipset(dev);
-
-	return 0;
-}
-#else
-#define delkin_cb_suspend NULL
-#define delkin_cb_resume NULL
-#endif
-
-static struct pci_device_id delkin_cb_pci_tbl[] = {
-	{ 0x1145, 0xf021, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
-	{ 0x1145, 0xf024, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, delkin_cb_pci_tbl);
-
-static struct pci_driver delkin_cb_pci_driver = {
-	.name		= "Delkin-ASKA-Workbit Cardbus IDE",
-	.id_table	= delkin_cb_pci_tbl,
-	.probe		= delkin_cb_probe,
-	.remove		= delkin_cb_remove,
-	.suspend	= delkin_cb_suspend,
-	.resume		= delkin_cb_resume,
-};
-
-module_pci_driver(delkin_cb_pci_driver);
-
-MODULE_AUTHOR("Mark Lord");
-MODULE_DESCRIPTION("Basic support for Delkin/ASKA/Workbit Cardbus IDE");
-MODULE_LICENSE("GPL");
-
diff --git a/drivers/ide/dtc2278.c b/drivers/ide/dtc2278.c
deleted file mode 100644
index 714e8cd0fa49..000000000000
--- a/drivers/ide/dtc2278.c
+++ /dev/null
@@ -1,155 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1996  Linus Torvalds & author (see below)
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/ioport.h>
-#include <linux/blkdev.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "dtc2278"
-
-/*
- * Changing this #undef to #define may solve start up problems in some systems.
- */
-#undef ALWAYS_SET_DTC2278_PIO_MODE
-
-/*
- * From: andy@cercle.cts.com (Dyan Wile)
- *
- * Below is a patch for DTC-2278 - alike software-programmable controllers
- * The code enables the secondary IDE controller and the PIO4 (3?) timings on
- * the primary (EIDE). You may probably have to enable the 32-bit support to
- * get the full speed. You better get the disk interrupts disabled ( hdparm -u0
- * /dev/hd.. ) for the drives connected to the EIDE interface. (I get my
- * filesystem  corrupted with -u1, but under heavy disk load only :-)
- *
- * This card is now forced to use the "serialize" feature,
- * and irq-unmasking is disallowed.  If io_32bit is enabled,
- * it must be done for BOTH drives on each interface.
- *
- * This code was written for the DTC2278E, but might work with any of these:
- *
- * DTC2278S has only a single IDE interface.
- * DTC2278D has two IDE interfaces and is otherwise identical to the S version.
- * DTC2278E also has serial ports and a printer port
- * DTC2278EB: has onboard BIOS, and "works like a charm" -- Kent Bradford <kent@theory.caltech.edu>
- *
- * There may be a fourth controller type. The S and D versions use the
- * Winbond chip, and I think the E version does also.
- *
- */
-
-static void sub22 (char b, char c)
-{
-	int i;
-
-	for(i = 0; i < 3; ++i) {
-		inb(0x3f6);
-		outb_p(b,0xb0);
-		inb(0x3f6);
-		outb_p(c,0xb4);
-		inb(0x3f6);
-		if(inb(0xb4) == c) {
-			outb_p(7,0xb0);
-			inb(0x3f6);
-			return;	/* success */
-		}
-	}
-}
-
-static DEFINE_SPINLOCK(dtc2278_lock);
-
-static void dtc2278_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	unsigned long flags;
-
-	if (drive->pio_mode >= XFER_PIO_3) {
-		spin_lock_irqsave(&dtc2278_lock, flags);
-		/*
-		 * This enables PIO mode4 (3?) on the first interface
-		 */
-		sub22(1,0xc3);
-		sub22(0,0xa0);
-		spin_unlock_irqrestore(&dtc2278_lock, flags);
-	} else {
-		/* we don't know how to set it back again.. */
-		/* Actually we do - there is a data sheet available for the
-		   Winbond but does anyone actually care */
-	}
-}
-
-static const struct ide_port_ops dtc2278_port_ops = {
-	.set_pio_mode		= dtc2278_set_pio_mode,
-};
-
-static const struct ide_port_info dtc2278_port_info __initconst = {
-	.name			= DRV_NAME,
-	.chipset		= ide_dtc2278,
-	.port_ops		= &dtc2278_port_ops,
-	.host_flags		= IDE_HFLAG_SERIALIZE |
-				  IDE_HFLAG_NO_UNMASK_IRQS |
-				  IDE_HFLAG_IO_32BIT |
-				  /* disallow ->io_32bit changes */
-				  IDE_HFLAG_NO_IO_32BIT |
-				  IDE_HFLAG_NO_DMA |
-				  IDE_HFLAG_DTC2278,
-	.pio_mask		= ATA_PIO4,
-};
-
-static int __init dtc2278_probe(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	/*
-	 * This enables the second interface
-	 */
-	outb_p(4,0xb0);
-	inb(0x3f6);
-	outb_p(0x20,0xb4);
-	inb(0x3f6);
-#ifdef ALWAYS_SET_DTC2278_PIO_MODE
-	/*
-	 * This enables PIO mode4 (3?) on the first interface
-	 * and may solve start-up problems for some people.
-	 */
-	sub22(1,0xc3);
-	sub22(0,0xa0);
-#endif
-	local_irq_restore(flags);
-
-	return ide_legacy_device_add(&dtc2278_port_info, 0);
-}
-
-static bool probe_dtc2278;
-
-module_param_named(probe, probe_dtc2278, bool, 0);
-MODULE_PARM_DESC(probe, "probe for DTC2278xx chipsets");
-
-static int __init dtc2278_init(void)
-{
-	if (probe_dtc2278 == 0)
-		return -ENODEV;
-
-	if (dtc2278_probe()) {
-		printk(KERN_ERR "dtc2278: ide interfaces already in use!\n");
-		return -EBUSY;
-	}
-	return 0;
-}
-
-module_init(dtc2278_init);
-
-MODULE_AUTHOR("See Local File");
-MODULE_DESCRIPTION("support of DTC-2278 VLB IDE chipsets");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/falconide.c b/drivers/ide/falconide.c
deleted file mode 100644
index a73a9dc17e4d..000000000000
--- a/drivers/ide/falconide.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- *  Atari Falcon IDE Driver
- *
- *     Created 12 Jul 1997 by Geert Uytterhoeven
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of this archive for
- *  more details.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/blkdev.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-
-#include <asm/setup.h>
-#include <asm/atarihw.h>
-#include <asm/atariints.h>
-#include <asm/atari_stdma.h>
-#include <asm/ide.h>
-
-#define DRV_NAME "falconide"
-
-#ifdef CONFIG_ATARI
-    /*
-     *  falconide_intr_lock is used to obtain access to the IDE interrupt,
-     *  which is shared between several drivers.
-     */
-
-static int falconide_intr_lock;
-
-static void falconide_release_lock(void)
-{
-	if (falconide_intr_lock == 0) {
-		printk(KERN_ERR "%s: bug\n", __func__);
-		return;
-	}
-	falconide_intr_lock = 0;
-	stdma_release();
-}
-
-static void falconide_get_lock(irq_handler_t handler, void *data)
-{
-	if (falconide_intr_lock == 0) {
-		stdma_lock(handler, data);
-		falconide_intr_lock = 1;
-	}
-}
-#endif
-
-static void falconide_input_data(ide_drive_t *drive, struct ide_cmd *cmd,
-				 void *buf, unsigned int len)
-{
-	unsigned long data_addr = drive->hwif->io_ports.data_addr;
-
-	if (drive->media == ide_disk && cmd && (cmd->tf_flags & IDE_TFLAG_FS)) {
-		__ide_mm_insw(data_addr, buf, (len + 1) / 2);
-		return;
-	}
-
-	raw_insw_swapw((u16 *)data_addr, buf, (len + 1) / 2);
-}
-
-static void falconide_output_data(ide_drive_t *drive, struct ide_cmd *cmd,
-				  void *buf, unsigned int len)
-{
-	unsigned long data_addr = drive->hwif->io_ports.data_addr;
-
-	if (drive->media == ide_disk && cmd && (cmd->tf_flags & IDE_TFLAG_FS)) {
-		__ide_mm_outsw(data_addr, buf, (len + 1) / 2);
-		return;
-	}
-
-	raw_outsw_swapw((u16 *)data_addr, buf, (len + 1) / 2);
-}
-
-/* Atari has a byte-swapped IDE interface */
-static const struct ide_tp_ops falconide_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= ide_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= falconide_input_data,
-	.output_data		= falconide_output_data,
-};
-
-static const struct ide_port_info falconide_port_info = {
-#ifdef CONFIG_ATARI
-	.get_lock		= falconide_get_lock,
-	.release_lock		= falconide_release_lock,
-#endif
-	.tp_ops			= &falconide_tp_ops,
-	.host_flags		= IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE |
-				  IDE_HFLAG_NO_DMA,
-	.irq_flags		= IRQF_SHARED,
-	.chipset		= ide_generic,
-};
-
-static void __init falconide_setup_ports(struct ide_hw *hw, unsigned long base,
-					 unsigned long ctl, int irq)
-{
-	int i;
-
-	memset(hw, 0, sizeof(*hw));
-
-	hw->io_ports.data_addr = base;
-
-	for (i = 1; i < 8; i++)
-		hw->io_ports_array[i] = base + 1 + i * 4;
-
-	hw->io_ports.ctl_addr = ctl + 1;
-
-	hw->irq = irq;
-}
-
-    /*
-     *  Probe for a Falcon IDE interface
-     */
-
-static int __init falconide_init(struct platform_device *pdev)
-{
-	struct resource *base_mem_res, *ctl_mem_res;
-	struct resource *base_res, *ctl_res, *irq_res;
-	struct ide_host *host;
-	struct ide_hw hw, *hws[] = { &hw };
-	int rc;
-	int irq;
-
-	dev_info(&pdev->dev, "Atari Falcon and Q40/Q60 IDE controller\n");
-
-	base_res = platform_get_resource(pdev, IORESOURCE_IO, 0);
-	if (base_res && !devm_request_region(&pdev->dev, base_res->start,
-					   resource_size(base_res), DRV_NAME)) {
-		dev_err(&pdev->dev, "resources busy\n");
-		return -EBUSY;
-	}
-
-	ctl_res = platform_get_resource(pdev, IORESOURCE_IO, 0);
-	if (ctl_res && !devm_request_region(&pdev->dev, ctl_res->start,
-					   resource_size(ctl_res), DRV_NAME)) {
-		dev_err(&pdev->dev, "resources busy\n");
-		return -EBUSY;
-	}
-
-	base_mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!base_mem_res)
-		return -ENODEV;
-
-	if (!devm_request_mem_region(&pdev->dev, base_mem_res->start,
-				     resource_size(base_mem_res), DRV_NAME)) {
-		dev_err(&pdev->dev, "resources busy\n");
-		return -EBUSY;
-	}
-
-	ctl_mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	if (!ctl_mem_res)
-		return -ENODEV;
-
-	if (MACH_IS_ATARI) {
-		irq = IRQ_MFP_IDE;
-	} else {
-		irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-		if (irq_res && irq_res->start > 0)
-			irq = irq_res->start;
-		else
-			return -ENODEV;
-	}
-
-	falconide_setup_ports(&hw, base_mem_res->start, ctl_mem_res->start, irq);
-
-	host = ide_host_alloc(&falconide_port_info, hws, 1);
-	if (!host)
-		return -ENOMEM;
-
-	if (!MACH_IS_ATARI) {
-		host->get_lock = NULL;
-		host->release_lock = NULL;
-	}
-
-	if (host->get_lock)
-		host->get_lock(NULL, NULL);
-	rc = ide_host_register(host, &falconide_port_info, hws);
-	if (host->release_lock)
-		host->release_lock();
-
-	if (rc)
-		goto err_free;
-
-	platform_set_drvdata(pdev, host);
-	return 0;
-err_free:
-	ide_host_free(host);
-	return rc;
-}
-
-static int falconide_remove(struct platform_device *pdev)
-{
-	struct ide_host *host = platform_get_drvdata(pdev);
-
-	ide_host_remove(host);
-
-	return 0;
-}
-
-static struct platform_driver ide_falcon_driver = {
-	.remove = falconide_remove,
-	.driver   = {
-		.name	= "atari-falcon-ide",
-	},
-};
-
-module_platform_driver_probe(ide_falcon_driver, falconide_init);
-
-MODULE_AUTHOR("Geert Uytterhoeven");
-MODULE_DESCRIPTION("low-level driver for Atari Falcon IDE");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:atari-falcon-ide");
diff --git a/drivers/ide/gayle.c b/drivers/ide/gayle.c
deleted file mode 100644
index 901e6ebfeb96..000000000000
--- a/drivers/ide/gayle.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- *  Amiga Gayle IDE Driver
- *
- *     Created 9 Jul 1997 by Geert Uytterhoeven
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of this archive for
- *  more details.
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/blkdev.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/zorro.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-
-#include <asm/setup.h>
-#include <asm/amigahw.h>
-#include <asm/amigaints.h>
-#include <asm/amigayle.h>
-
-
-    /*
-     *  Offsets from one of the above bases
-     */
-
-#define GAYLE_CONTROL	0x101a
-
-    /*
-     *  These are at different offsets from the base
-     */
-
-#define GAYLE_IRQ_4000	0xdd3020	/* MSB = 1, Harddisk is source of */
-#define GAYLE_IRQ_1200	0xda9000	/* interrupt */
-
-
-    /*
-     *  Offset of the secondary port for IDE doublers
-     *  Note that GAYLE_CONTROL is NOT available then!
-     */
-
-#define GAYLE_NEXT_PORT	0x1000
-
-#define GAYLE_NUM_HWIFS		2
-#define GAYLE_NUM_PROBE_HWIFS	(ide_doubler ? GAYLE_NUM_HWIFS : \
-					       GAYLE_NUM_HWIFS-1)
-#define GAYLE_HAS_CONTROL_REG	(!ide_doubler)
-
-static bool ide_doubler;
-module_param_named(doubler, ide_doubler, bool, 0);
-MODULE_PARM_DESC(doubler, "enable support for IDE doublers");
-
-    /*
-     *  Check and acknowledge the interrupt status
-     */
-
-static int gayle_test_irq(ide_hwif_t *hwif)
-{
-	unsigned char ch;
-
-	ch = z_readb(hwif->io_ports.irq_addr);
-	if (!(ch & GAYLE_IRQ_IDE))
-		return 0;
-	return 1;
-}
-
-static void gayle_a1200_clear_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	(void)z_readb(hwif->io_ports.status_addr);
-	z_writeb(0x7c, hwif->io_ports.irq_addr);
-}
-
-static void __init gayle_setup_ports(struct ide_hw *hw, unsigned long base,
-				     unsigned long ctl, unsigned long irq_port)
-{
-	int i;
-
-	memset(hw, 0, sizeof(*hw));
-
-	hw->io_ports.data_addr = base;
-
-	for (i = 1; i < 8; i++)
-		hw->io_ports_array[i] = base + 2 + i * 4;
-
-	hw->io_ports.ctl_addr = ctl;
-	hw->io_ports.irq_addr = irq_port;
-
-	hw->irq = IRQ_AMIGA_PORTS;
-}
-
-static const struct ide_port_ops gayle_a4000_port_ops = {
-	.test_irq		= gayle_test_irq,
-};
-
-static const struct ide_port_ops gayle_a1200_port_ops = {
-	.clear_irq		= gayle_a1200_clear_irq,
-	.test_irq		= gayle_test_irq,
-};
-
-static const struct ide_port_info gayle_port_info = {
-	.host_flags		= IDE_HFLAG_MMIO | IDE_HFLAG_SERIALIZE |
-				  IDE_HFLAG_NO_DMA,
-	.irq_flags		= IRQF_SHARED,
-	.chipset		= ide_generic,
-};
-
-    /*
-     *  Probe for a Gayle IDE interface (and optionally for an IDE doubler)
-     */
-
-static int __init amiga_gayle_ide_probe(struct platform_device *pdev)
-{
-	struct resource *res;
-	struct gayle_ide_platform_data *pdata;
-	unsigned long base, ctrlport, irqport;
-	unsigned int i;
-	int error;
-	struct ide_hw hw[GAYLE_NUM_HWIFS], *hws[GAYLE_NUM_HWIFS];
-	struct ide_port_info d = gayle_port_info;
-	struct ide_host *host;
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res)
-		return -ENODEV;
-
-	if (!request_mem_region(res->start, resource_size(res), "IDE"))
-		return -EBUSY;
-
-	pdata = dev_get_platdata(&pdev->dev);
-	pr_info("ide: Gayle IDE controller (A%u style%s)\n",
-		pdata->explicit_ack ? 1200 : 4000,
-		ide_doubler ? ", IDE doubler" : "");
-
-	base = (unsigned long)ZTWO_VADDR(pdata->base);
-	ctrlport = 0;
-	irqport = (unsigned long)ZTWO_VADDR(pdata->irqport);
-	if (pdata->explicit_ack)
-		d.port_ops = &gayle_a1200_port_ops;
-	else
-		d.port_ops = &gayle_a4000_port_ops;
-
-	for (i = 0; i < GAYLE_NUM_PROBE_HWIFS; i++, base += GAYLE_NEXT_PORT) {
-		if (GAYLE_HAS_CONTROL_REG)
-			ctrlport = base + GAYLE_CONTROL;
-
-		gayle_setup_ports(&hw[i], base, ctrlport, irqport);
-		hws[i] = &hw[i];
-	}
-
-	error = ide_host_add(&d, hws, i, &host);
-	if (error)
-		goto out;
-
-	platform_set_drvdata(pdev, host);
-	return 0;
-
-out:
-	release_mem_region(res->start, resource_size(res));
-	return error;
-}
-
-static int __exit amiga_gayle_ide_remove(struct platform_device *pdev)
-{
-	struct ide_host *host = platform_get_drvdata(pdev);
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-	ide_host_remove(host);
-	release_mem_region(res->start, resource_size(res));
-	return 0;
-}
-
-static struct platform_driver amiga_gayle_ide_driver = {
-	.remove = __exit_p(amiga_gayle_ide_remove),
-	.driver   = {
-		.name	= "amiga-gayle-ide",
-	},
-};
-
-module_platform_driver_probe(amiga_gayle_ide_driver, amiga_gayle_ide_probe);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:amiga-gayle-ide");
diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
deleted file mode 100644
index 50c9a41467c8..000000000000
--- a/drivers/ide/hpt366.c
+++ /dev/null
@@ -1,1545 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 1999-2003		Andre Hedrick <andre@linux-ide.org>
- * Portions Copyright (C) 2001	        Sun Microsystems, Inc.
- * Portions Copyright (C) 2003		Red Hat Inc
- * Portions Copyright (C) 2007		Bartlomiej Zolnierkiewicz
- * Portions Copyright (C) 2005-2009	MontaVista Software, Inc.
- *
- * Thanks to HighPoint Technologies for their assistance, and hardware.
- * Special Thanks to Jon Burchmore in SanDiego for the deep pockets, his
- * donation of an ABit BP6 mainboard, processor, and memory acellerated
- * development and support.
- *
- *
- * HighPoint has its own drivers (open source except for the RAID part)
- * available from http://www.highpoint-tech.com/USA_new/service_support.htm 
- * This may be useful to anyone wanting to work on this driver, however  do not
- * trust  them too much since the code tends to become less and less meaningful
- * as the time passes... :-/
- *
- * Note that final HPT370 support was done by force extraction of GPL.
- *
- * - add function for getting/setting power status of drive
- * - the HPT370's state machine can get confused. reset it before each dma 
- *   xfer to prevent that from happening.
- * - reset state engine whenever we get an error.
- * - check for busmaster state at end of dma. 
- * - use new highpoint timings.
- * - detect bus speed using highpoint register.
- * - use pll if we don't have a clock table. added a 66MHz table that's
- *   just 2x the 33MHz table.
- * - removed turnaround. NOTE: we never want to switch between pll and
- *   pci clocks as the chip can glitch in those cases. the highpoint
- *   approved workaround slows everything down too much to be useful. in
- *   addition, we would have to serialize access to each chip.
- * 	Adrian Sun <a.sun@sun.com>
- *
- * add drive timings for 66MHz PCI bus,
- * fix ATA Cable signal detection, fix incorrect /proc info
- * add /proc display for per-drive PIO/DMA/UDMA mode and
- * per-channel ATA-33/66 Cable detect.
- * 	Duncan Laurie <void@sun.com>
- *
- * fixup /proc output for multiple controllers
- *	Tim Hockin <thockin@sun.com>
- *
- * On hpt366: 
- * Reset the hpt366 on error, reset on dma
- * Fix disabling Fast Interrupt hpt366.
- * 	Mike Waychison <crlf@sun.com>
- *
- * Added support for 372N clocking and clock switching. The 372N needs
- * different clocks on read/write. This requires overloading rw_disk and
- * other deeply crazy things. Thanks to <http://www.hoerstreich.de> for
- * keeping me sane. 
- *		Alan Cox <alan@lxorguk.ukuu.org.uk>
- *
- * - fix the clock turnaround code: it was writing to the wrong ports when
- *   called for the secondary channel, caching the current clock mode per-
- *   channel caused the cached register value to get out of sync with the
- *   actual one, the channels weren't serialized, the turnaround shouldn't
- *   be done on 66 MHz PCI bus
- * - disable UltraATA/100 for HPT370 by default as the 33 MHz clock being used
- *   does not allow for this speed anyway
- * - avoid touching disabled channels (e.g. HPT371/N are single channel chips,
- *   their primary channel is kind of virtual, it isn't tied to any pins)
- * - fix/remove bad/unused timing tables and use one set of tables for the whole
- *   HPT37x chip family; save space by introducing the separate transfer mode
- *   table in which the mode lookup is done
- * - use f_CNT value saved by  the HighPoint BIOS as reading it directly gives
- *   the wrong PCI frequency since DPLL has already been calibrated by BIOS;
- *   read it only from the function 0 of HPT374 chips
- * - fix the hotswap code:  it caused RESET- to glitch when tristating the bus,
- *   and for HPT36x the obsolete HDIO_TRISTATE_HWIF handler was called instead
- * - pass to init_chipset() handlers a copy of the IDE PCI device structure as
- *   they tamper with its fields
- * - pass  to the init_setup handlers a copy of the ide_pci_device_t structure
- *   since they may tamper with its fields
- * - prefix the driver startup messages with the real chip name
- * - claim the extra 240 bytes of I/O space for all chips
- * - optimize the UltraDMA filtering and the drive list lookup code
- * - use pci_get_slot() to get to the function 1 of HPT36x/374
- * - cache offset of the channel's misc. control registers (MCRs) being used
- *   throughout the driver
- * - only touch the relevant MCR when detecting the cable type on HPT374's
- *   function 1
- * - rename all the register related variables consistently
- * - move all the interrupt twiddling code from the speedproc handlers into
- *   init_hwif_hpt366(), also grouping all the DMA related code together there
- * - merge HPT36x/HPT37x speedproc handlers, fix PIO timing register mask and
- *   separate the UltraDMA and MWDMA masks there to avoid changing PIO timings
- *   when setting an UltraDMA mode
- * - fix hpt3xx_tune_drive() to set the PIO mode requested, not always select
- *   the best possible one
- * - clean up DMA timeout handling for HPT370
- * - switch to using the enumeration type to differ between the numerous chip
- *   variants, matching PCI device/revision ID with the chip type early, at the
- *   init_setup stage
- * - extend the hpt_info structure to hold the DPLL and PCI clock frequencies,
- *   stop duplicating it for each channel by storing the pointer in the pci_dev
- *   structure: first, at the init_setup stage, point it to a static "template"
- *   with only the chip type and its specific base DPLL frequency, the highest
- *   UltraDMA mode, and the chip settings table pointer filled,  then, at the
- *   init_chipset stage, allocate per-chip instance  and fill it with the rest
- *   of the necessary information
- * - get rid of the constant thresholds in the HPT37x PCI clock detection code,
- *   switch  to calculating  PCI clock frequency based on the chip's base DPLL
- *   frequency
- * - switch to using the  DPLL clock and enable UltraATA/133 mode by default on
- *   anything  newer than HPT370/A (except HPT374 that is not capable of this
- *   mode according to the manual)
- * - fold PCI clock detection and DPLL setup code into init_chipset_hpt366(),
- *   also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips;
- *   unify HPT36x/37x timing setup code and the speedproc handlers by joining
- *   the register setting lists into the table indexed by the clock selected
- * - set the correct hwif->ultra_mask for each individual chip
- * - add Ultra and MW DMA mode filtering for the HPT37[24] based SATA cards
- * - stop resetting HPT370's state machine before each DMA transfer as that has
- *   caused more harm than good
- *	Sergei Shtylyov, <sshtylyov@ru.mvista.com> or <source@mvista.com>
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-#include <linux/slab.h>
-
-#include <linux/uaccess.h>
-#include <asm/io.h>
-
-#define DRV_NAME "hpt366"
-
-/* various tuning parameters */
-#undef	HPT_RESET_STATE_ENGINE
-#undef	HPT_DELAY_INTERRUPT
-
-static const char *bad_ata100_5[] = {
-	"IBM-DTLA-307075",
-	"IBM-DTLA-307060",
-	"IBM-DTLA-307045",
-	"IBM-DTLA-307030",
-	"IBM-DTLA-307020",
-	"IBM-DTLA-307015",
-	"IBM-DTLA-305040",
-	"IBM-DTLA-305030",
-	"IBM-DTLA-305020",
-	"IC35L010AVER07-0",
-	"IC35L020AVER07-0",
-	"IC35L030AVER07-0",
-	"IC35L040AVER07-0",
-	"IC35L060AVER07-0",
-	"WDC AC310200R",
-	NULL
-};
-
-static const char *bad_ata66_4[] = {
-	"IBM-DTLA-307075",
-	"IBM-DTLA-307060",
-	"IBM-DTLA-307045",
-	"IBM-DTLA-307030",
-	"IBM-DTLA-307020",
-	"IBM-DTLA-307015",
-	"IBM-DTLA-305040",
-	"IBM-DTLA-305030",
-	"IBM-DTLA-305020",
-	"IC35L010AVER07-0",
-	"IC35L020AVER07-0",
-	"IC35L030AVER07-0",
-	"IC35L040AVER07-0",
-	"IC35L060AVER07-0",
-	"WDC AC310200R",
-	"MAXTOR STM3320620A",
-	NULL
-};
-
-static const char *bad_ata66_3[] = {
-	"WDC AC310200R",
-	NULL
-};
-
-static const char *bad_ata33[] = {
-	"Maxtor 92720U8", "Maxtor 92040U6", "Maxtor 91360U4", "Maxtor 91020U3", "Maxtor 90845U3", "Maxtor 90650U2",
-	"Maxtor 91360D8", "Maxtor 91190D7", "Maxtor 91020D6", "Maxtor 90845D5", "Maxtor 90680D4", "Maxtor 90510D3", "Maxtor 90340D2",
-	"Maxtor 91152D8", "Maxtor 91008D7", "Maxtor 90845D6", "Maxtor 90840D6", "Maxtor 90720D5", "Maxtor 90648D5", "Maxtor 90576D4",
-	"Maxtor 90510D4",
-	"Maxtor 90432D3", "Maxtor 90288D2", "Maxtor 90256D2",
-	"Maxtor 91000D8", "Maxtor 90910D8", "Maxtor 90875D7", "Maxtor 90840D7", "Maxtor 90750D6", "Maxtor 90625D5", "Maxtor 90500D4",
-	"Maxtor 91728D8", "Maxtor 91512D7", "Maxtor 91303D6", "Maxtor 91080D5", "Maxtor 90845D4", "Maxtor 90680D4", "Maxtor 90648D3", "Maxtor 90432D2",
-	NULL
-};
-
-static u8 xfer_speeds[] = {
-	XFER_UDMA_6,
-	XFER_UDMA_5,
-	XFER_UDMA_4,
-	XFER_UDMA_3,
-	XFER_UDMA_2,
-	XFER_UDMA_1,
-	XFER_UDMA_0,
-
-	XFER_MW_DMA_2,
-	XFER_MW_DMA_1,
-	XFER_MW_DMA_0,
-
-	XFER_PIO_4,
-	XFER_PIO_3,
-	XFER_PIO_2,
-	XFER_PIO_1,
-	XFER_PIO_0
-};
-
-/* Key for bus clock timings
- * 36x   37x
- * bits  bits
- * 0:3	 0:3	data_high_time. Inactive time of DIOW_/DIOR_ for PIO and MW DMA.
- *		cycles = value + 1
- * 4:7	 4:8	data_low_time. Active time of DIOW_/DIOR_ for PIO and MW DMA.
- *		cycles = value + 1
- * 8:11  9:12	cmd_high_time. Inactive time of DIOW_/DIOR_ during task file
- *		register access.
- * 12:15 13:17	cmd_low_time. Active time of DIOW_/DIOR_ during task file
- *		register access.
- * 16:18 18:20	udma_cycle_time. Clock cycles for UDMA xfer.
- * -	 21	CLK frequency: 0=ATA clock, 1=dual ATA clock.
- * 19:21 22:24	pre_high_time. Time to initialize the 1st cycle for PIO and
- *		MW DMA xfer.
- * 22:24 25:27	cmd_pre_high_time. Time to initialize the 1st PIO cycle for
- *		task file register access.
- * 28	 28	UDMA enable.
- * 29	 29	DMA  enable.
- * 30	 30	PIO MST enable. If set, the chip is in bus master mode during
- *		PIO xfer.
- * 31	 31	FIFO enable.
- */
-
-static u32 forty_base_hpt36x[] = {
-	/* XFER_UDMA_6 */	0x900fd943,
-	/* XFER_UDMA_5 */	0x900fd943,
-	/* XFER_UDMA_4 */	0x900fd943,
-	/* XFER_UDMA_3 */	0x900ad943,
-	/* XFER_UDMA_2 */	0x900bd943,
-	/* XFER_UDMA_1 */	0x9008d943,
-	/* XFER_UDMA_0 */	0x9008d943,
-
-	/* XFER_MW_DMA_2 */	0xa008d943,
-	/* XFER_MW_DMA_1 */	0xa010d955,
-	/* XFER_MW_DMA_0 */	0xa010d9fc,
-
-	/* XFER_PIO_4 */	0xc008d963,
-	/* XFER_PIO_3 */	0xc010d974,
-	/* XFER_PIO_2 */	0xc010d997,
-	/* XFER_PIO_1 */	0xc010d9c7,
-	/* XFER_PIO_0 */	0xc018d9d9
-};
-
-static u32 thirty_three_base_hpt36x[] = {
-	/* XFER_UDMA_6 */	0x90c9a731,
-	/* XFER_UDMA_5 */	0x90c9a731,
-	/* XFER_UDMA_4 */	0x90c9a731,
-	/* XFER_UDMA_3 */	0x90cfa731,
-	/* XFER_UDMA_2 */	0x90caa731,
-	/* XFER_UDMA_1 */	0x90cba731,
-	/* XFER_UDMA_0 */	0x90c8a731,
-
-	/* XFER_MW_DMA_2 */	0xa0c8a731,
-	/* XFER_MW_DMA_1 */	0xa0c8a732,	/* 0xa0c8a733 */
-	/* XFER_MW_DMA_0 */	0xa0c8a797,
-
-	/* XFER_PIO_4 */	0xc0c8a731,
-	/* XFER_PIO_3 */	0xc0c8a742,
-	/* XFER_PIO_2 */	0xc0d0a753,
-	/* XFER_PIO_1 */	0xc0d0a7a3,	/* 0xc0d0a793 */
-	/* XFER_PIO_0 */	0xc0d0a7aa	/* 0xc0d0a7a7 */
-};
-
-static u32 twenty_five_base_hpt36x[] = {
-	/* XFER_UDMA_6 */	0x90c98521,
-	/* XFER_UDMA_5 */	0x90c98521,
-	/* XFER_UDMA_4 */	0x90c98521,
-	/* XFER_UDMA_3 */	0x90cf8521,
-	/* XFER_UDMA_2 */	0x90cf8521,
-	/* XFER_UDMA_1 */	0x90cb8521,
-	/* XFER_UDMA_0 */	0x90cb8521,
-
-	/* XFER_MW_DMA_2 */	0xa0ca8521,
-	/* XFER_MW_DMA_1 */	0xa0ca8532,
-	/* XFER_MW_DMA_0 */	0xa0ca8575,
-
-	/* XFER_PIO_4 */	0xc0ca8521,
-	/* XFER_PIO_3 */	0xc0ca8532,
-	/* XFER_PIO_2 */	0xc0ca8542,
-	/* XFER_PIO_1 */	0xc0d08572,
-	/* XFER_PIO_0 */	0xc0d08585
-};
-
-/*
- * The following are the new timing tables with PIO mode data/taskfile transfer
- * overclocking fixed...
- */
-
-/* This table is taken from the HPT370 data manual rev. 1.02 */
-static u32 thirty_three_base_hpt37x[] = {
-	/* XFER_UDMA_6 */	0x16455031,	/* 0x16655031 ?? */
-	/* XFER_UDMA_5 */	0x16455031,
-	/* XFER_UDMA_4 */	0x16455031,
-	/* XFER_UDMA_3 */	0x166d5031,
-	/* XFER_UDMA_2 */	0x16495031,
-	/* XFER_UDMA_1 */	0x164d5033,
-	/* XFER_UDMA_0 */	0x16515097,
-
-	/* XFER_MW_DMA_2 */	0x26515031,
-	/* XFER_MW_DMA_1 */	0x26515033,
-	/* XFER_MW_DMA_0 */	0x26515097,
-
-	/* XFER_PIO_4 */	0x06515021,
-	/* XFER_PIO_3 */	0x06515022,
-	/* XFER_PIO_2 */	0x06515033,
-	/* XFER_PIO_1 */	0x06915065,
-	/* XFER_PIO_0 */	0x06d1508a
-};
-
-static u32 fifty_base_hpt37x[] = {
-	/* XFER_UDMA_6 */	0x1a861842,
-	/* XFER_UDMA_5 */	0x1a861842,
-	/* XFER_UDMA_4 */	0x1aae1842,
-	/* XFER_UDMA_3 */	0x1a8e1842,
-	/* XFER_UDMA_2 */	0x1a0e1842,
-	/* XFER_UDMA_1 */	0x1a161854,
-	/* XFER_UDMA_0 */	0x1a1a18ea,
-
-	/* XFER_MW_DMA_2 */	0x2a821842,
-	/* XFER_MW_DMA_1 */	0x2a821854,
-	/* XFER_MW_DMA_0 */	0x2a8218ea,
-
-	/* XFER_PIO_4 */	0x0a821842,
-	/* XFER_PIO_3 */	0x0a821843,
-	/* XFER_PIO_2 */	0x0a821855,
-	/* XFER_PIO_1 */	0x0ac218a8,
-	/* XFER_PIO_0 */	0x0b02190c
-};
-
-static u32 sixty_six_base_hpt37x[] = {
-	/* XFER_UDMA_6 */	0x1c86fe62,
-	/* XFER_UDMA_5 */	0x1caefe62,	/* 0x1c8afe62 */
-	/* XFER_UDMA_4 */	0x1c8afe62,
-	/* XFER_UDMA_3 */	0x1c8efe62,
-	/* XFER_UDMA_2 */	0x1c92fe62,
-	/* XFER_UDMA_1 */	0x1c9afe62,
-	/* XFER_UDMA_0 */	0x1c82fe62,
-
-	/* XFER_MW_DMA_2 */	0x2c82fe62,
-	/* XFER_MW_DMA_1 */	0x2c82fe66,
-	/* XFER_MW_DMA_0 */	0x2c82ff2e,
-
-	/* XFER_PIO_4 */	0x0c82fe62,
-	/* XFER_PIO_3 */	0x0c82fe84,
-	/* XFER_PIO_2 */	0x0c82fea6,
-	/* XFER_PIO_1 */	0x0d02ff26,
-	/* XFER_PIO_0 */	0x0d42ff7f
-};
-
-#define HPT371_ALLOW_ATA133_6		1
-#define HPT302_ALLOW_ATA133_6		1
-#define HPT372_ALLOW_ATA133_6		1
-#define HPT370_ALLOW_ATA100_5		0
-#define HPT366_ALLOW_ATA66_4		1
-#define HPT366_ALLOW_ATA66_3		1
-
-/* Supported ATA clock frequencies */
-enum ata_clock {
-	ATA_CLOCK_25MHZ,
-	ATA_CLOCK_33MHZ,
-	ATA_CLOCK_40MHZ,
-	ATA_CLOCK_50MHZ,
-	ATA_CLOCK_66MHZ,
-	NUM_ATA_CLOCKS
-};
-
-struct hpt_timings {
-	u32 pio_mask;
-	u32 dma_mask;
-	u32 ultra_mask;
-	u32 *clock_table[NUM_ATA_CLOCKS];
-};
-
-/*
- *	Hold all the HighPoint chip information in one place.
- */
-
-struct hpt_info {
-	char *chip_name;	/* Chip name */
-	u8 chip_type;		/* Chip type */
-	u8 udma_mask;		/* Allowed UltraDMA modes mask. */
-	u8 dpll_clk;		/* DPLL clock in MHz */
-	u8 pci_clk;		/* PCI  clock in MHz */
-	struct hpt_timings *timings; /* Chipset timing data */
-	u8 clock;		/* ATA clock selected */
-};
-
-/* Supported HighPoint chips */
-enum {
-	HPT36x,
-	HPT370,
-	HPT370A,
-	HPT374,
-	HPT372,
-	HPT372A,
-	HPT302,
-	HPT371,
-	HPT372N,
-	HPT302N,
-	HPT371N
-};
-
-static struct hpt_timings hpt36x_timings = {
-	.pio_mask	= 0xc1f8ffff,
-	.dma_mask	= 0x303800ff,
-	.ultra_mask	= 0x30070000,
-	.clock_table	= {
-		[ATA_CLOCK_25MHZ] = twenty_five_base_hpt36x,
-		[ATA_CLOCK_33MHZ] = thirty_three_base_hpt36x,
-		[ATA_CLOCK_40MHZ] = forty_base_hpt36x,
-		[ATA_CLOCK_50MHZ] = NULL,
-		[ATA_CLOCK_66MHZ] = NULL
-	}
-};
-
-static struct hpt_timings hpt37x_timings = {
-	.pio_mask	= 0xcfc3ffff,
-	.dma_mask	= 0x31c001ff,
-	.ultra_mask	= 0x303c0000,
-	.clock_table	= {
-		[ATA_CLOCK_25MHZ] = NULL,
-		[ATA_CLOCK_33MHZ] = thirty_three_base_hpt37x,
-		[ATA_CLOCK_40MHZ] = NULL,
-		[ATA_CLOCK_50MHZ] = fifty_base_hpt37x,
-		[ATA_CLOCK_66MHZ] = sixty_six_base_hpt37x
-	}
-};
-
-static const struct hpt_info hpt36x = {
-	.chip_name	= "HPT36x",
-	.chip_type	= HPT36x,
-	.udma_mask	= HPT366_ALLOW_ATA66_3 ? (HPT366_ALLOW_ATA66_4 ? ATA_UDMA4 : ATA_UDMA3) : ATA_UDMA2,
-	.dpll_clk	= 0,	/* no DPLL */
-	.timings	= &hpt36x_timings
-};
-
-static const struct hpt_info hpt370 = {
-	.chip_name	= "HPT370",
-	.chip_type	= HPT370,
-	.udma_mask	= HPT370_ALLOW_ATA100_5 ? ATA_UDMA5 : ATA_UDMA4,
-	.dpll_clk	= 48,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt370a = {
-	.chip_name	= "HPT370A",
-	.chip_type	= HPT370A,
-	.udma_mask	= HPT370_ALLOW_ATA100_5 ? ATA_UDMA5 : ATA_UDMA4,
-	.dpll_clk	= 48,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt374 = {
-	.chip_name	= "HPT374",
-	.chip_type	= HPT374,
-	.udma_mask	= ATA_UDMA5,
-	.dpll_clk	= 48,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt372 = {
-	.chip_name	= "HPT372",
-	.chip_type	= HPT372,
-	.udma_mask	= HPT372_ALLOW_ATA133_6 ? ATA_UDMA6 : ATA_UDMA5,
-	.dpll_clk	= 55,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt372a = {
-	.chip_name	= "HPT372A",
-	.chip_type	= HPT372A,
-	.udma_mask	= HPT372_ALLOW_ATA133_6 ? ATA_UDMA6 : ATA_UDMA5,
-	.dpll_clk	= 66,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt302 = {
-	.chip_name	= "HPT302",
-	.chip_type	= HPT302,
-	.udma_mask	= HPT302_ALLOW_ATA133_6 ? ATA_UDMA6 : ATA_UDMA5,
-	.dpll_clk	= 66,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt371 = {
-	.chip_name	= "HPT371",
-	.chip_type	= HPT371,
-	.udma_mask	= HPT371_ALLOW_ATA133_6 ? ATA_UDMA6 : ATA_UDMA5,
-	.dpll_clk	= 66,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt372n = {
-	.chip_name	= "HPT372N",
-	.chip_type	= HPT372N,
-	.udma_mask	= HPT372_ALLOW_ATA133_6 ? ATA_UDMA6 : ATA_UDMA5,
-	.dpll_clk	= 77,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt302n = {
-	.chip_name	= "HPT302N",
-	.chip_type	= HPT302N,
-	.udma_mask	= HPT302_ALLOW_ATA133_6 ? ATA_UDMA6 : ATA_UDMA5,
-	.dpll_clk	= 77,
-	.timings	= &hpt37x_timings
-};
-
-static const struct hpt_info hpt371n = {
-	.chip_name	= "HPT371N",
-	.chip_type	= HPT371N,
-	.udma_mask	= HPT371_ALLOW_ATA133_6 ? ATA_UDMA6 : ATA_UDMA5,
-	.dpll_clk	= 77,
-	.timings	= &hpt37x_timings
-};
-
-static bool check_in_drive_list(ide_drive_t *drive, const char **list)
-{
-	return match_string(list, -1, (char *)&drive->id[ATA_ID_PROD]) >= 0;
-}
-
-static struct hpt_info *hpt3xx_get_info(struct device *dev)
-{
-	struct ide_host *host	= dev_get_drvdata(dev);
-	struct hpt_info *info	= (struct hpt_info *)host->host_priv;
-
-	return dev == host->dev[1] ? info + 1 : info;
-}
-
-/*
- * The Marvell bridge chips used on the HighPoint SATA cards do not seem
- * to support the UltraDMA modes 1, 2, and 3 as well as any MWDMA modes...
- */
-
-static u8 hpt3xx_udma_filter(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct hpt_info *info	= hpt3xx_get_info(hwif->dev);
-	u8 mask 		= hwif->ultra_mask;
-
-	switch (info->chip_type) {
-	case HPT36x:
-		if (!HPT366_ALLOW_ATA66_4 ||
-		    check_in_drive_list(drive, bad_ata66_4))
-			mask = ATA_UDMA3;
-
-		if (!HPT366_ALLOW_ATA66_3 ||
-		    check_in_drive_list(drive, bad_ata66_3))
-			mask = ATA_UDMA2;
-		break;
-	case HPT370:
-		if (!HPT370_ALLOW_ATA100_5 ||
-		    check_in_drive_list(drive, bad_ata100_5))
-			mask = ATA_UDMA4;
-		break;
-	case HPT370A:
-		if (!HPT370_ALLOW_ATA100_5 ||
-		    check_in_drive_list(drive, bad_ata100_5))
-			return ATA_UDMA4;
-		fallthrough;
-	case HPT372 :
-	case HPT372A:
-	case HPT372N:
-	case HPT374 :
-		if (ata_id_is_sata(drive->id))
-			mask &= ~0x0e;
-		fallthrough;
-	default:
-		return mask;
-	}
-
-	return check_in_drive_list(drive, bad_ata33) ? 0x00 : mask;
-}
-
-static u8 hpt3xx_mdma_filter(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct hpt_info *info	= hpt3xx_get_info(hwif->dev);
-
-	switch (info->chip_type) {
-	case HPT372 :
-	case HPT372A:
-	case HPT372N:
-	case HPT374 :
-		if (ata_id_is_sata(drive->id))
-			return 0x00;
-		fallthrough;
-	default:
-		return 0x07;
-	}
-}
-
-static u32 get_speed_setting(u8 speed, struct hpt_info *info)
-{
-	int i;
-
-	/*
-	 * Lookup the transfer mode table to get the index into
-	 * the timing table.
-	 *
-	 * NOTE: For XFER_PIO_SLOW, PIO mode 0 timings will be used.
-	 */
-	for (i = 0; i < ARRAY_SIZE(xfer_speeds) - 1; i++)
-		if (xfer_speeds[i] == speed)
-			break;
-
-	return info->timings->clock_table[info->clock][i];
-}
-
-static void hpt3xx_set_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	struct hpt_info *info	= hpt3xx_get_info(hwif->dev);
-	struct hpt_timings *t	= info->timings;
-	u8  itr_addr		= 0x40 + (drive->dn * 4);
-	u32 old_itr		= 0;
-	const u8 speed		= drive->dma_mode;
-	u32 new_itr		= get_speed_setting(speed, info);
-	u32 itr_mask		= speed < XFER_MW_DMA_0 ? t->pio_mask :
-				 (speed < XFER_UDMA_0   ? t->dma_mask :
-							  t->ultra_mask);
-
-	pci_read_config_dword(dev, itr_addr, &old_itr);
-	new_itr = (old_itr & ~itr_mask) | (new_itr & itr_mask);
-	/*
-	 * Disable on-chip PIO FIFO/buffer (and PIO MST mode as well)
-	 * to avoid problems handling I/O errors later
-	 */
-	new_itr &= ~0xc0000000;
-
-	pci_write_config_dword(dev, itr_addr, new_itr);
-}
-
-static void hpt3xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	drive->dma_mode = drive->pio_mode;
-	hpt3xx_set_mode(hwif, drive);
-}
-
-static void hpt3xx_maskproc(ide_drive_t *drive, int mask)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev	*dev	= to_pci_dev(hwif->dev);
-	struct hpt_info *info	= hpt3xx_get_info(hwif->dev);
-
-	if ((drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) == 0)
-		return;
-
-	if (info->chip_type >= HPT370) {
-		u8 scr1 = 0;
-
-		pci_read_config_byte(dev, 0x5a, &scr1);
-		if (((scr1 & 0x10) >> 4) != mask) {
-			if (mask)
-				scr1 |=  0x10;
-			else
-				scr1 &= ~0x10;
-			pci_write_config_byte(dev, 0x5a, scr1);
-		}
-	} else if (mask)
-		disable_irq(hwif->irq);
-	else
-		enable_irq(hwif->irq);
-}
-
-/*
- * This is specific to the HPT366 UDMA chipset
- * by HighPoint|Triones Technologies, Inc.
- */
-static void hpt366_dma_lost_irq(ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u8 mcr1 = 0, mcr3 = 0, scr1 = 0;
-
-	pci_read_config_byte(dev, 0x50, &mcr1);
-	pci_read_config_byte(dev, 0x52, &mcr3);
-	pci_read_config_byte(dev, 0x5a, &scr1);
-	printk("%s: (%s)  mcr1=0x%02x, mcr3=0x%02x, scr1=0x%02x\n",
-		drive->name, __func__, mcr1, mcr3, scr1);
-	if (scr1 & 0x10)
-		pci_write_config_byte(dev, 0x5a, scr1 & ~0x10);
-	ide_dma_lost_irq(drive);
-}
-
-static void hpt370_clear_engine(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-
-	pci_write_config_byte(dev, hwif->select_data, 0x37);
-	udelay(10);
-}
-
-static void hpt370_irq_timeout(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u16 bfifo		= 0;
-	u8  dma_cmd;
-
-	pci_read_config_word(dev, hwif->select_data + 2, &bfifo);
-	printk(KERN_DEBUG "%s: %d bytes in FIFO\n", drive->name, bfifo & 0x1ff);
-
-	/* get DMA command mode */
-	dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
-	/* stop DMA */
-	outb(dma_cmd & ~ATA_DMA_START, hwif->dma_base + ATA_DMA_CMD);
-	hpt370_clear_engine(drive);
-}
-
-static void hpt370_dma_start(ide_drive_t *drive)
-{
-#ifdef HPT_RESET_STATE_ENGINE
-	hpt370_clear_engine(drive);
-#endif
-	ide_dma_start(drive);
-}
-
-static int hpt370_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	u8  dma_stat		= inb(hwif->dma_base + ATA_DMA_STATUS);
-
-	if (dma_stat & ATA_DMA_ACTIVE) {
-		/* wait a little */
-		udelay(20);
-		dma_stat = inb(hwif->dma_base + ATA_DMA_STATUS);
-		if (dma_stat & ATA_DMA_ACTIVE)
-			hpt370_irq_timeout(drive);
-	}
-	return ide_dma_end(drive);
-}
-
-/* returns 1 if DMA IRQ issued, 0 otherwise */
-static int hpt374_dma_test_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u16 bfifo		= 0;
-	u8  dma_stat;
-
-	pci_read_config_word(dev, hwif->select_data + 2, &bfifo);
-	if (bfifo & 0x1FF) {
-//		printk("%s: %d bytes in FIFO\n", drive->name, bfifo);
-		return 0;
-	}
-
-	dma_stat = inb(hwif->dma_base + ATA_DMA_STATUS);
-	/* return 1 if INTR asserted */
-	if (dma_stat & ATA_DMA_INTR)
-		return 1;
-
-	return 0;
-}
-
-static int hpt374_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u8 mcr	= 0, mcr_addr	= hwif->select_data;
-	u8 bwsr = 0, mask	= hwif->channel ? 0x02 : 0x01;
-
-	pci_read_config_byte(dev, 0x6a, &bwsr);
-	pci_read_config_byte(dev, mcr_addr, &mcr);
-	if (bwsr & mask)
-		pci_write_config_byte(dev, mcr_addr, mcr | 0x30);
-	return ide_dma_end(drive);
-}
-
-/**
- *	hpt3xxn_set_clock	-	perform clock switching dance
- *	@hwif: hwif to switch
- *	@mode: clocking mode (0x21 for write, 0x23 otherwise)
- *
- *	Switch the DPLL clock on the HPT3xxN devices. This is a	right mess.
- */
-
-static void hpt3xxn_set_clock(ide_hwif_t *hwif, u8 mode)
-{
-	unsigned long base = hwif->extra_base;
-	u8 scr2 = inb(base + 0x6b);
-
-	if ((scr2 & 0x7f) == mode)
-		return;
-
-	/* Tristate the bus */
-	outb(0x80, base + 0x63);
-	outb(0x80, base + 0x67);
-
-	/* Switch clock and reset channels */
-	outb(mode, base + 0x6b);
-	outb(0xc0, base + 0x69);
-
-	/*
-	 * Reset the state machines.
-	 * NOTE: avoid accidentally enabling the disabled channels.
-	 */
-	outb(inb(base + 0x60) | 0x32, base + 0x60);
-	outb(inb(base + 0x64) | 0x32, base + 0x64);
-
-	/* Complete reset */
-	outb(0x00, base + 0x69);
-
-	/* Reconnect channels to bus */
-	outb(0x00, base + 0x63);
-	outb(0x00, base + 0x67);
-}
-
-/**
- *	hpt3xxn_rw_disk		-	prepare for I/O
- *	@drive: drive for command
- *	@rq: block request structure
- *
- *	This is called when a disk I/O is issued to HPT3xxN.
- *	We need it because of the clock switching.
- */
-
-static void hpt3xxn_rw_disk(ide_drive_t *drive, struct request *rq)
-{
-	hpt3xxn_set_clock(drive->hwif, rq_data_dir(rq) ? 0x21 : 0x23);
-}
-
-/**
- *	hpt37x_calibrate_dpll	-	calibrate the DPLL
- *	@dev: PCI device
- *
- *	Perform a calibration cycle on the DPLL.
- *	Returns 1 if this succeeds
- */
-static int hpt37x_calibrate_dpll(struct pci_dev *dev, u16 f_low, u16 f_high)
-{
-	u32 dpll = (f_high << 16) | f_low | 0x100;
-	u8  scr2;
-	int i;
-
-	pci_write_config_dword(dev, 0x5c, dpll);
-
-	/* Wait for oscillator ready */
-	for(i = 0; i < 0x5000; ++i) {
-		udelay(50);
-		pci_read_config_byte(dev, 0x5b, &scr2);
-		if (scr2 & 0x80)
-			break;
-	}
-	/* See if it stays ready (we'll just bail out if it's not yet) */
-	for(i = 0; i < 0x1000; ++i) {
-		pci_read_config_byte(dev, 0x5b, &scr2);
-		/* DPLL destabilized? */
-		if(!(scr2 & 0x80))
-			return 0;
-	}
-	/* Turn off tuning, we have the DPLL set */
-	pci_read_config_dword (dev, 0x5c, &dpll);
-	pci_write_config_dword(dev, 0x5c, (dpll & ~0x100));
-	return 1;
-}
-
-static void hpt3xx_disable_fast_irq(struct pci_dev *dev, u8 mcr_addr)
-{
-	struct ide_host *host	= pci_get_drvdata(dev);
-	struct hpt_info *info	= host->host_priv + (&dev->dev == host->dev[1]);
-	u8  chip_type		= info->chip_type;
-	u8  new_mcr, old_mcr	= 0;
-
-	/*
-	 * Disable the "fast interrupt" prediction.  Don't hold off
-	 * on interrupts. (== 0x01 despite what the docs say)
-	 */
-	pci_read_config_byte(dev, mcr_addr + 1, &old_mcr);
-
-	if (chip_type >= HPT374)
-		new_mcr = old_mcr & ~0x07;
-	else if (chip_type >= HPT370) {
-		new_mcr = old_mcr;
-		new_mcr &= ~0x02;
-#ifdef HPT_DELAY_INTERRUPT
-		new_mcr &= ~0x01;
-#else
-		new_mcr |=  0x01;
-#endif
-	} else					/* HPT366 and HPT368  */
-		new_mcr = old_mcr & ~0x80;
-
-	if (new_mcr != old_mcr)
-		pci_write_config_byte(dev, mcr_addr + 1, new_mcr);
-}
-
-static int init_chipset_hpt366(struct pci_dev *dev)
-{
-	unsigned long io_base	= pci_resource_start(dev, 4);
-	struct hpt_info *info	= hpt3xx_get_info(&dev->dev);
-	const char *name	= DRV_NAME;
-	u8 pci_clk,  dpll_clk	= 0;	/* PCI and DPLL clock in MHz */
-	u8 chip_type;
-	enum ata_clock	clock;
-
-	chip_type = info->chip_type;
-
-	pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, (L1_CACHE_BYTES / 4));
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x78);
-	pci_write_config_byte(dev, PCI_MIN_GNT, 0x08);
-	pci_write_config_byte(dev, PCI_MAX_LAT, 0x08);
-
-	/*
-	 * First, try to estimate the PCI clock frequency...
-	 */
-	if (chip_type >= HPT370) {
-		u8  scr1  = 0;
-		u16 f_cnt = 0;
-		u32 temp  = 0;
-
-		/* Interrupt force enable. */
-		pci_read_config_byte(dev, 0x5a, &scr1);
-		if (scr1 & 0x10)
-			pci_write_config_byte(dev, 0x5a, scr1 & ~0x10);
-
-		/*
-		 * HighPoint does this for HPT372A.
-		 * NOTE: This register is only writeable via I/O space.
-		 */
-		if (chip_type == HPT372A)
-			outb(0x0e, io_base + 0x9c);
-
-		/*
-		 * Default to PCI clock. Make sure MA15/16 are set to output
-		 * to prevent drives having problems with 40-pin cables.
-		 */
-		pci_write_config_byte(dev, 0x5b, 0x23);
-
-		/*
-		 * We'll have to read f_CNT value in order to determine
-		 * the PCI clock frequency according to the following ratio:
-		 *
-		 * f_CNT = Fpci * 192 / Fdpll
-		 *
-		 * First try reading the register in which the HighPoint BIOS
-		 * saves f_CNT value before  reprogramming the DPLL from its
-		 * default setting (which differs for the various chips).
-		 *
-		 * NOTE: This register is only accessible via I/O space;
-		 * HPT374 BIOS only saves it for the function 0, so we have to
-		 * always read it from there -- no need to check the result of
-		 * pci_get_slot() for the function 0 as the whole device has
-		 * been already "pinned" (via function 1) in init_setup_hpt374()
-		 */
-		if (chip_type == HPT374 && (PCI_FUNC(dev->devfn) & 1)) {
-			struct pci_dev	*dev1 = pci_get_slot(dev->bus,
-							     dev->devfn - 1);
-			unsigned long io_base = pci_resource_start(dev1, 4);
-
-			temp =	inl(io_base + 0x90);
-			pci_dev_put(dev1);
-		} else
-			temp =	inl(io_base + 0x90);
-
-		/*
-		 * In case the signature check fails, we'll have to
-		 * resort to reading the f_CNT register itself in hopes
-		 * that nobody has touched the DPLL yet...
-		 */
-		if ((temp & 0xFFFFF000) != 0xABCDE000) {
-			int i;
-
-			printk(KERN_WARNING "%s %s: no clock data saved by "
-				"BIOS\n", name, pci_name(dev));
-
-			/* Calculate the average value of f_CNT. */
-			for (temp = i = 0; i < 128; i++) {
-				pci_read_config_word(dev, 0x78, &f_cnt);
-				temp += f_cnt & 0x1ff;
-				mdelay(1);
-			}
-			f_cnt = temp / 128;
-		} else
-			f_cnt = temp & 0x1ff;
-
-		dpll_clk = info->dpll_clk;
-		pci_clk  = (f_cnt * dpll_clk) / 192;
-
-		/* Clamp PCI clock to bands. */
-		if (pci_clk < 40)
-			pci_clk = 33;
-		else if(pci_clk < 45)
-			pci_clk = 40;
-		else if(pci_clk < 55)
-			pci_clk = 50;
-		else
-			pci_clk = 66;
-
-		printk(KERN_INFO "%s %s: DPLL base: %d MHz, f_CNT: %d, "
-			"assuming %d MHz PCI\n", name, pci_name(dev),
-			dpll_clk, f_cnt, pci_clk);
-	} else {
-		u32 itr1 = 0;
-
-		pci_read_config_dword(dev, 0x40, &itr1);
-
-		/* Detect PCI clock by looking at cmd_high_time. */
-		switch ((itr1 >> 8) & 0x0f) {
-			case 0x09:
-				pci_clk = 40;
-				break;
-			case 0x05:
-				pci_clk = 25;
-				break;
-			case 0x07:
-			default:
-				pci_clk = 33;
-				break;
-		}
-	}
-
-	/* Let's assume we'll use PCI clock for the ATA clock... */
-	switch (pci_clk) {
-		case 25:
-			clock = ATA_CLOCK_25MHZ;
-			break;
-		case 33:
-		default:
-			clock = ATA_CLOCK_33MHZ;
-			break;
-		case 40:
-			clock = ATA_CLOCK_40MHZ;
-			break;
-		case 50:
-			clock = ATA_CLOCK_50MHZ;
-			break;
-		case 66:
-			clock = ATA_CLOCK_66MHZ;
-			break;
-	}
-
-	/*
-	 * Only try the DPLL if we don't have a table for the PCI clock that
-	 * we are running at for HPT370/A, always use it  for anything newer...
-	 *
-	 * NOTE: Using the internal DPLL results in slow reads on 33 MHz PCI.
-	 * We also  don't like using  the DPLL because this causes glitches
-	 * on PRST-/SRST- when the state engine gets reset...
-	 */
-	if (chip_type >= HPT374 || info->timings->clock_table[clock] == NULL) {
-		u16 f_low, delta = pci_clk < 50 ? 2 : 4;
-		int adjust;
-
-		 /*
-		  * Select 66 MHz DPLL clock only if UltraATA/133 mode is
-		  * supported/enabled, use 50 MHz DPLL clock otherwise...
-		  */
-		if (info->udma_mask == ATA_UDMA6) {
-			dpll_clk = 66;
-			clock = ATA_CLOCK_66MHZ;
-		} else if (dpll_clk) {	/* HPT36x chips don't have DPLL */
-			dpll_clk = 50;
-			clock = ATA_CLOCK_50MHZ;
-		}
-
-		if (info->timings->clock_table[clock] == NULL) {
-			printk(KERN_ERR "%s %s: unknown bus timing!\n",
-				name, pci_name(dev));
-			return -EIO;
-		}
-
-		/* Select the DPLL clock. */
-		pci_write_config_byte(dev, 0x5b, 0x21);
-
-		/*
-		 * Adjust the DPLL based upon PCI clock, enable it,
-		 * and wait for stabilization...
-		 */
-		f_low = (pci_clk * 48) / dpll_clk;
-
-		for (adjust = 0; adjust < 8; adjust++) {
-			if(hpt37x_calibrate_dpll(dev, f_low, f_low + delta))
-				break;
-
-			/*
-			 * See if it'll settle at a fractionally different clock
-			 */
-			if (adjust & 1)
-				f_low -= adjust >> 1;
-			else
-				f_low += adjust >> 1;
-		}
-		if (adjust == 8) {
-			printk(KERN_ERR "%s %s: DPLL did not stabilize!\n",
-				name, pci_name(dev));
-			return -EIO;
-		}
-
-		printk(KERN_INFO "%s %s: using %d MHz DPLL clock\n",
-			name, pci_name(dev), dpll_clk);
-	} else {
-		/* Mark the fact that we're not using the DPLL. */
-		dpll_clk = 0;
-
-		printk(KERN_INFO "%s %s: using %d MHz PCI clock\n",
-			name, pci_name(dev), pci_clk);
-	}
-
-	/* Store the clock frequencies. */
-	info->dpll_clk	= dpll_clk;
-	info->pci_clk	= pci_clk;
-	info->clock	= clock;
-
-	if (chip_type >= HPT370) {
-		u8  mcr1, mcr4;
-
-		/*
-		 * Reset the state engines.
-		 * NOTE: Avoid accidentally enabling the disabled channels.
-		 */
-		pci_read_config_byte (dev, 0x50, &mcr1);
-		pci_read_config_byte (dev, 0x54, &mcr4);
-		pci_write_config_byte(dev, 0x50, (mcr1 | 0x32));
-		pci_write_config_byte(dev, 0x54, (mcr4 | 0x32));
-		udelay(100);
-	}
-
-	/*
-	 * On  HPT371N, if ATA clock is 66 MHz we must set bit 2 in
-	 * the MISC. register to stretch the UltraDMA Tss timing.
-	 * NOTE: This register is only writeable via I/O space.
-	 */
-	if (chip_type == HPT371N && clock == ATA_CLOCK_66MHZ)
-		outb(inb(io_base + 0x9c) | 0x04, io_base + 0x9c);
-
-	hpt3xx_disable_fast_irq(dev, 0x50);
-	hpt3xx_disable_fast_irq(dev, 0x54);
-
-	return 0;
-}
-
-static u8 hpt3xx_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev	*dev	= to_pci_dev(hwif->dev);
-	struct hpt_info *info	= hpt3xx_get_info(hwif->dev);
-	u8 chip_type		= info->chip_type;
-	u8 scr1 = 0, ata66	= hwif->channel ? 0x01 : 0x02;
-
-	/*
-	 * The HPT37x uses the CBLID pins as outputs for MA15/MA16
-	 * address lines to access an external EEPROM.  To read valid
-	 * cable detect state the pins must be enabled as inputs.
-	 */
-	if (chip_type == HPT374 && (PCI_FUNC(dev->devfn) & 1)) {
-		/*
-		 * HPT374 PCI function 1
-		 * - set bit 15 of reg 0x52 to enable TCBLID as input
-		 * - set bit 15 of reg 0x56 to enable FCBLID as input
-		 */
-		u8  mcr_addr = hwif->select_data + 2;
-		u16 mcr;
-
-		pci_read_config_word(dev, mcr_addr, &mcr);
-		pci_write_config_word(dev, mcr_addr, mcr | 0x8000);
-		/* Debounce, then read cable ID register */
-		udelay(10);
-		pci_read_config_byte(dev, 0x5a, &scr1);
-		pci_write_config_word(dev, mcr_addr, mcr);
-	} else if (chip_type >= HPT370) {
-		/*
-		 * HPT370/372 and 374 pcifn 0
-		 * - clear bit 0 of reg 0x5b to enable P/SCBLID as inputs
-		 */
-		u8 scr2 = 0;
-
-		pci_read_config_byte(dev, 0x5b, &scr2);
-		pci_write_config_byte(dev, 0x5b, scr2 & ~1);
-		/* Debounce, then read cable ID register */
-		udelay(10);
-		pci_read_config_byte(dev, 0x5a, &scr1);
-		pci_write_config_byte(dev, 0x5b, scr2);
-	} else
-		pci_read_config_byte(dev, 0x5a, &scr1);
-
-	return (scr1 & ata66) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
-}
-
-static void init_hwif_hpt366(ide_hwif_t *hwif)
-{
-	struct hpt_info *info	= hpt3xx_get_info(hwif->dev);
-	u8  chip_type		= info->chip_type;
-
-	/* Cache the channel's MISC. control registers' offset */
-	hwif->select_data	= hwif->channel ? 0x54 : 0x50;
-
-	/*
-	 * HPT3xxN chips have some complications:
-	 *
-	 * - on 33 MHz PCI we must clock switch
-	 * - on 66 MHz PCI we must NOT use the PCI clock
-	 */
-	if (chip_type >= HPT372N && info->dpll_clk && info->pci_clk < 66) {
-		/*
-		 * Clock is shared between the channels,
-		 * so we'll have to serialize them... :-(
-		 */
-		hwif->host->host_flags |= IDE_HFLAG_SERIALIZE;
-		hwif->rw_disk = &hpt3xxn_rw_disk;
-	}
-}
-
-static int init_dma_hpt366(ide_hwif_t *hwif,
-				     const struct ide_port_info *d)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	unsigned long flags, base = ide_pci_dma_base(hwif, d);
-	u8 dma_old, dma_new, masterdma = 0, slavedma = 0;
-
-	if (base == 0)
-		return -1;
-
-	hwif->dma_base = base;
-
-	if (ide_pci_check_simplex(hwif, d) < 0)
-		return -1;
-
-	if (ide_pci_set_master(dev, d->name) < 0)
-		return -1;
-
-	dma_old = inb(base + 2);
-
-	local_irq_save(flags);
-
-	dma_new = dma_old;
-	pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
-	pci_read_config_byte(dev, hwif->channel ? 0x4f : 0x47,  &slavedma);
-
-	if (masterdma & 0x30)	dma_new |= 0x20;
-	if ( slavedma & 0x30)	dma_new |= 0x40;
-	if (dma_new != dma_old)
-		outb(dma_new, base + 2);
-
-	local_irq_restore(flags);
-
-	printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
-			 hwif->name, base, base + 7);
-
-	hwif->extra_base = base + (hwif->channel ? 8 : 16);
-
-	if (ide_allocate_dma_engine(hwif))
-		return -1;
-
-	return 0;
-}
-
-static void hpt374_init(struct pci_dev *dev, struct pci_dev *dev2)
-{
-	if (dev2->irq != dev->irq) {
-		/* FIXME: we need a core pci_set_interrupt() */
-		dev2->irq = dev->irq;
-		printk(KERN_INFO DRV_NAME " %s: PCI config space interrupt "
-			"fixed\n", pci_name(dev2));
-	}
-}
-
-static void hpt371_init(struct pci_dev *dev)
-{
-	u8 mcr1 = 0;
-
-	/*
-	 * HPT371 chips physically have only one channel, the secondary one,
-	 * but the primary channel registers do exist!  Go figure...
-	 * So,  we manually disable the non-existing channel here
-	 * (if the BIOS hasn't done this already).
-	 */
-	pci_read_config_byte(dev, 0x50, &mcr1);
-	if (mcr1 & 0x04)
-		pci_write_config_byte(dev, 0x50, mcr1 & ~0x04);
-}
-
-static int hpt36x_init(struct pci_dev *dev, struct pci_dev *dev2)
-{
-	u8 mcr1 = 0, pin1 = 0, pin2 = 0;
-
-	/*
-	 * Now we'll have to force both channels enabled if
-	 * at least one of them has been enabled by BIOS...
-	 */
-	pci_read_config_byte(dev, 0x50, &mcr1);
-	if (mcr1 & 0x30)
-		pci_write_config_byte(dev, 0x50, mcr1 | 0x30);
-
-	pci_read_config_byte(dev,  PCI_INTERRUPT_PIN, &pin1);
-	pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin2);
-
-	if (pin1 != pin2 && dev->irq == dev2->irq) {
-		printk(KERN_INFO DRV_NAME " %s: onboard version of chipset, "
-			"pin1=%d pin2=%d\n", pci_name(dev), pin1, pin2);
-		return 1;
-	}
-
-	return 0;
-}
-
-#define IDE_HFLAGS_HPT3XX \
-	(IDE_HFLAG_NO_ATAPI_DMA | \
-	 IDE_HFLAG_OFF_BOARD)
-
-static const struct ide_port_ops hpt3xx_port_ops = {
-	.set_pio_mode		= hpt3xx_set_pio_mode,
-	.set_dma_mode		= hpt3xx_set_mode,
-	.maskproc		= hpt3xx_maskproc,
-	.mdma_filter		= hpt3xx_mdma_filter,
-	.udma_filter		= hpt3xx_udma_filter,
-	.cable_detect		= hpt3xx_cable_detect,
-};
-
-static const struct ide_dma_ops hpt37x_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= ide_dma_start,
-	.dma_end		= hpt374_dma_end,
-	.dma_test_irq		= hpt374_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_dma_ops hpt370_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= hpt370_dma_start,
-	.dma_end		= hpt370_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_clear		= hpt370_irq_timeout,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_dma_ops hpt36x_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= ide_dma_start,
-	.dma_end		= ide_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= hpt366_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_port_info hpt366_chipsets[] = {
-	{	/* 0: HPT36x */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_hpt366,
-		.init_hwif	= init_hwif_hpt366,
-		.init_dma	= init_dma_hpt366,
-		/*
-		 * HPT36x chips have one channel per function and have
-		 * both channel enable bits located differently and visible
-		 * to both functions -- really stupid design decision... :-(
-		 * Bit 4 is for the primary channel, bit 5 for the secondary.
-		 */
-		.enablebits	= {{0x50,0x10,0x10}, {0x54,0x04,0x04}},
-		.port_ops	= &hpt3xx_port_ops,
-		.dma_ops	= &hpt36x_dma_ops,
-		.host_flags	= IDE_HFLAGS_HPT3XX | IDE_HFLAG_SINGLE,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-	},
-	{	/* 1: HPT3xx */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_hpt366,
-		.init_hwif	= init_hwif_hpt366,
-		.init_dma	= init_dma_hpt366,
-		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
-		.port_ops	= &hpt3xx_port_ops,
-		.dma_ops	= &hpt37x_dma_ops,
-		.host_flags	= IDE_HFLAGS_HPT3XX,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-	}
-};
-
-/**
- *	hpt366_init_one	-	called when an HPT366 is found
- *	@dev: the hpt366 device
- *	@id: the matching pci id
- *
- *	Called when the PCI registration layer (or the IDE initialization)
- *	finds a device matching our IDE device tables.
- */
-static int hpt366_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	const struct hpt_info *info = NULL;
-	struct hpt_info *dyn_info;
-	struct pci_dev *dev2 = NULL;
-	struct ide_port_info d;
-	u8 idx = id->driver_data;
-	u8 rev = dev->revision;
-	int ret;
-
-	if ((idx == 0 || idx == 4) && (PCI_FUNC(dev->devfn) & 1))
-		return -ENODEV;
-
-	switch (idx) {
-	case 0:
-		if (rev < 3)
-			info = &hpt36x;
-		else {
-			switch (min_t(u8, rev, 6)) {
-			case 3: info = &hpt370;  break;
-			case 4: info = &hpt370a; break;
-			case 5: info = &hpt372;  break;
-			case 6: info = &hpt372n; break;
-			}
-			idx++;
-		}
-		break;
-	case 1:
-		info = (rev > 1) ? &hpt372n : &hpt372a;
-		break;
-	case 2:
-		info = (rev > 1) ? &hpt302n : &hpt302;
-		break;
-	case 3:
-		hpt371_init(dev);
-		info = (rev > 1) ? &hpt371n : &hpt371;
-		break;
-	case 4:
-		info = &hpt374;
-		break;
-	case 5:
-		info = &hpt372n;
-		break;
-	}
-
-	printk(KERN_INFO DRV_NAME ": %s chipset detected\n", info->chip_name);
-
-	d = hpt366_chipsets[min_t(u8, idx, 1)];
-
-	d.udma_mask = info->udma_mask;
-
-	/* fixup ->dma_ops for HPT370/HPT370A */
-	if (info == &hpt370 || info == &hpt370a)
-		d.dma_ops = &hpt370_dma_ops;
-
-	if (info == &hpt36x || info == &hpt374)
-		dev2 = pci_get_slot(dev->bus, dev->devfn + 1);
-
-	dyn_info = kcalloc(dev2 ? 2 : 1, sizeof(*dyn_info), GFP_KERNEL);
-	if (dyn_info == NULL) {
-		printk(KERN_ERR "%s %s: out of memory!\n",
-			d.name, pci_name(dev));
-		pci_dev_put(dev2);
-		return -ENOMEM;
-	}
-
-	/*
-	 * Copy everything from a static "template" structure
-	 * to just allocated per-chip hpt_info structure.
-	 */
-	memcpy(dyn_info, info, sizeof(*dyn_info));
-
-	if (dev2) {
-		memcpy(dyn_info + 1, info, sizeof(*dyn_info));
-
-		if (info == &hpt374)
-			hpt374_init(dev, dev2);
-		else {
-			if (hpt36x_init(dev, dev2))
-				d.host_flags &= ~IDE_HFLAG_NON_BOOTABLE;
-		}
-
-		ret = ide_pci_init_two(dev, dev2, &d, dyn_info);
-		if (ret < 0) {
-			pci_dev_put(dev2);
-			kfree(dyn_info);
-		}
-		return ret;
-	}
-
-	ret = ide_pci_init_one(dev, &d, dyn_info);
-	if (ret < 0)
-		kfree(dyn_info);
-
-	return ret;
-}
-
-static void hpt366_remove(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct ide_info *info = host->host_priv;
-	struct pci_dev *dev2 = host->dev[1] ? to_pci_dev(host->dev[1]) : NULL;
-
-	ide_pci_remove(dev);
-	pci_dev_put(dev2);
-	kfree(info);
-}
-
-static const struct pci_device_id hpt366_pci_tbl[] = {
-	{ PCI_VDEVICE(TTI, PCI_DEVICE_ID_TTI_HPT366),  0 },
-	{ PCI_VDEVICE(TTI, PCI_DEVICE_ID_TTI_HPT372),  1 },
-	{ PCI_VDEVICE(TTI, PCI_DEVICE_ID_TTI_HPT302),  2 },
-	{ PCI_VDEVICE(TTI, PCI_DEVICE_ID_TTI_HPT371),  3 },
-	{ PCI_VDEVICE(TTI, PCI_DEVICE_ID_TTI_HPT374),  4 },
-	{ PCI_VDEVICE(TTI, PCI_DEVICE_ID_TTI_HPT372N), 5 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, hpt366_pci_tbl);
-
-static struct pci_driver hpt366_pci_driver = {
-	.name		= "HPT366_IDE",
-	.id_table	= hpt366_pci_tbl,
-	.probe		= hpt366_init_one,
-	.remove		= hpt366_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init hpt366_ide_init(void)
-{
-	return ide_pci_register_driver(&hpt366_pci_driver);
-}
-
-static void __exit hpt366_ide_exit(void)
-{
-	pci_unregister_driver(&hpt366_pci_driver);
-}
-
-module_init(hpt366_ide_init);
-module_exit(hpt366_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick");
-MODULE_DESCRIPTION("PCI driver module for Highpoint HPT366 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ht6560b.c b/drivers/ide/ht6560b.c
deleted file mode 100644
index 743bc3693ac8..000000000000
--- a/drivers/ide/ht6560b.c
+++ /dev/null
@@ -1,383 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1995-2000  Linus Torvalds & author (see below)
- */
-
-/*
- *  HT-6560B EIDE-controller support
- *  To activate controller support use kernel parameter "ide0=ht6560b".
- *  Use hdparm utility to enable PIO mode support.
- *
- *  Author:    Mikko Ala-Fossi            <maf@iki.fi>
- *             Jan Evert van Grootheest   <j.e.van.grootheest@caiway.nl>
- *
- */
-
-#define DRV_NAME	"ht6560b"
-#define HT6560B_VERSION "v0.08"
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/ioport.h>
-#include <linux/blkdev.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-/* #define DEBUG */  /* remove comments for DEBUG messages */
-
-/*
- * The special i/o-port that HT-6560B uses to configuration:
- *    bit0 (0x01): "1" selects secondary interface
- *    bit2 (0x04): "1" enables FIFO function
- *    bit5 (0x20): "1" enables prefetched data read function  (???)
- *
- * The special i/o-port that HT-6560A uses to configuration:
- *    bit0 (0x01): "1" selects secondary interface
- *    bit1 (0x02): "1" enables prefetched data read function
- *    bit2 (0x04): "0" enables multi-master system	      (?)
- *    bit3 (0x08): "1" 3 cycle time, "0" 2 cycle time	      (?)
- */
-#define HT_CONFIG_PORT	  0x3e6
-
-static inline u8 HT_CONFIG(ide_drive_t *drive)
-{
-	return ((unsigned long)ide_get_drivedata(drive) & 0xff00) >> 8;
-}
-
-/*
- * FIFO + PREFETCH (both a/b-model)
- */
-#define HT_CONFIG_DEFAULT 0x1c /* no prefetch */
-/* #define HT_CONFIG_DEFAULT 0x3c */ /* with prefetch */
-#define HT_SECONDARY_IF	  0x01
-#define HT_PREFETCH_MODE  0x20
-
-/*
- * ht6560b Timing values:
- *
- * I reviewed some assembler source listings of htide drivers and found
- * out how they setup those cycle time interfacing values, as they at Holtek
- * call them. IDESETUP.COM that is supplied with the drivers figures out
- * optimal values and fetches those values to drivers. I found out that
- * they use Select register to fetch timings to the ide board right after
- * interface switching. After that it was quite easy to add code to
- * ht6560b.c.
- *
- * IDESETUP.COM gave me values 0x24, 0x45, 0xaa, 0xff that worked fine
- * for hda and hdc. But hdb needed higher values to work, so I guess
- * that sometimes it is necessary to give higher value than IDESETUP
- * gives.   [see cmd640.c for an extreme example of this. -ml]
- *
- * Perhaps I should explain something about these timing values:
- * The higher nibble of value is the Recovery Time  (rt) and the lower nibble
- * of the value is the Active Time  (at). Minimum value 2 is the fastest and
- * the maximum value 15 is the slowest. Default values should be 15 for both.
- * So 0x24 means 2 for rt and 4 for at. Each of the drives should have
- * both values, and IDESETUP gives automatically rt=15 st=15 for CDROMs or
- * similar. If value is too small there will be all sorts of failures.
- *
- * Timing byte consists of
- *	High nibble:  Recovery Cycle Time  (rt)
- *	     The valid values range from 2 to 15. The default is 15.
- *
- *	Low nibble:   Active Cycle Time	   (at)
- *	     The valid values range from 2 to 15. The default is 15.
- *
- * You can obtain optimized timing values by running Holtek IDESETUP.COM
- * for DOS. DOS drivers get their timing values from command line, where
- * the first value is the Recovery Time and the second value is the
- * Active Time for each drive. Smaller value gives higher speed.
- * In case of failures you should probably fall back to a higher value.
- */
-static inline u8 HT_TIMING(ide_drive_t *drive)
-{
-	return (unsigned long)ide_get_drivedata(drive) & 0x00ff;
-}
-
-#define HT_TIMING_DEFAULT 0xff
-
-/*
- * This routine handles interface switching for the peculiar hardware design
- * on the F.G.I./Holtek HT-6560B VLB IDE interface.
- * The HT-6560B can only enable one IDE port at a time, and requires a
- * silly sequence (below) whenever we switch between primary and secondary.
- */
-
-/*
- * This routine is invoked from ide.c to prepare for access to a given drive.
- */
-static void ht6560b_dev_select(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned long flags;
-	static u8 current_select = 0;
-	static u8 current_timing = 0;
-	u8 select, timing;
-	
-	local_irq_save(flags);
-
-	select = HT_CONFIG(drive);
-	timing = HT_TIMING(drive);
-
-	/*
-	 * Need to enforce prefetch sometimes because otherwise
-	 * it'll hang (hard).
-	 */
-	if (drive->media != ide_disk ||
-	    (drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
-		select |= HT_PREFETCH_MODE;
-
-	if (select != current_select || timing != current_timing) {
-		current_select = select;
-		current_timing = timing;
-		(void)inb(HT_CONFIG_PORT);
-		(void)inb(HT_CONFIG_PORT);
-		(void)inb(HT_CONFIG_PORT);
-		(void)inb(HT_CONFIG_PORT);
-		outb(select, HT_CONFIG_PORT);
-		/*
-		 * Set timing for this drive:
-		 */
-		outb(timing, hwif->io_ports.device_addr);
-		(void)inb(hwif->io_ports.status_addr);
-#ifdef DEBUG
-		printk("ht6560b: %s: select=%#x timing=%#x\n",
-			drive->name, select, timing);
-#endif
-	}
-	local_irq_restore(flags);
-
-	outb(drive->select | ATA_DEVICE_OBS, hwif->io_ports.device_addr);
-}
-
-/*
- * Autodetection and initialization of ht6560b
- */
-static int __init try_to_init_ht6560b(void)
-{
-	u8 orig_value;
-	int i;
-	
-	/* Autodetect ht6560b */
-	if ((orig_value = inb(HT_CONFIG_PORT)) == 0xff)
-		return 0;
-	
-	for (i=3;i>0;i--) {
-		outb(0x00, HT_CONFIG_PORT);
-		if (!( (~inb(HT_CONFIG_PORT)) & 0x3f )) {
-			outb(orig_value, HT_CONFIG_PORT);
-			return 0;
-		}
-	}
-	outb(0x00, HT_CONFIG_PORT);
-	if ((~inb(HT_CONFIG_PORT))& 0x3f) {
-		outb(orig_value, HT_CONFIG_PORT);
-		return 0;
-	}
-	/*
-	 * Ht6560b autodetected
-	 */
-	outb(HT_CONFIG_DEFAULT, HT_CONFIG_PORT);
-	outb(HT_TIMING_DEFAULT, 0x1f6);	/* Select register */
-	(void)inb(0x1f7);		/* Status register */
-
-	printk("ht6560b " HT6560B_VERSION
-	       ": chipset detected and initialized"
-#ifdef DEBUG
-	       " with debug enabled"
-#endif
-	       "\n"
-		);
-	return 1;
-}
-
-static u8 ht_pio2timings(ide_drive_t *drive, const u8 pio)
-{
-	int active_time, recovery_time;
-	int active_cycles, recovery_cycles;
-	int bus_speed = ide_vlb_clk ? ide_vlb_clk : 50;
-
-        if (pio) {
-		unsigned int cycle_time;
-		struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio);
-
-		cycle_time = ide_pio_cycle_time(drive, pio);
-
-		/*
-		 *  Just like opti621.c we try to calculate the
-		 *  actual cycle time for recovery and activity
-		 *  according system bus speed.
-		 */
-		active_time = t->active;
-		recovery_time = cycle_time - active_time - t->setup;
-		/*
-		 *  Cycle times should be Vesa bus cycles
-		 */
-		active_cycles   = (active_time   * bus_speed + 999) / 1000;
-		recovery_cycles = (recovery_time * bus_speed + 999) / 1000;
-		/*
-		 *  Upper and lower limits
-		 */
-		if (active_cycles   < 2)  active_cycles   = 2;
-		if (recovery_cycles < 2)  recovery_cycles = 2;
-		if (active_cycles   > 15) active_cycles   = 15;
-		if (recovery_cycles > 15) recovery_cycles = 0;  /* 0==16 */
-		
-#ifdef DEBUG
-		printk("ht6560b: drive %s setting pio=%d recovery=%d (%dns) active=%d (%dns)\n", drive->name, pio, recovery_cycles, recovery_time, active_cycles, active_time);
-#endif
-		
-		return (u8)((recovery_cycles << 4) | active_cycles);
-	} else {
-		
-#ifdef DEBUG
-		printk("ht6560b: drive %s setting pio=0\n", drive->name);
-#endif
-		
-		return HT_TIMING_DEFAULT;    /* default setting */
-	}
-}
-
-static DEFINE_SPINLOCK(ht6560b_lock);
-
-/*
- *  Enable/Disable so called prefetch mode
- */
-static void ht_set_prefetch(ide_drive_t *drive, u8 state)
-{
-	unsigned long flags, config;
-	int t = HT_PREFETCH_MODE << 8;
-
-	spin_lock_irqsave(&ht6560b_lock, flags);
-
-	config = (unsigned long)ide_get_drivedata(drive);
-
-	/*
-	 *  Prefetch mode and unmask irq seems to conflict
-	 */
-	if (state) {
-		config |= t;   /* enable prefetch mode */
-		drive->dev_flags |= IDE_DFLAG_NO_UNMASK;
-		drive->dev_flags &= ~IDE_DFLAG_UNMASK;
-	} else {
-		config &= ~t;  /* disable prefetch mode */
-		drive->dev_flags &= ~IDE_DFLAG_NO_UNMASK;
-	}
-
-	ide_set_drivedata(drive, (void *)config);
-
-	spin_unlock_irqrestore(&ht6560b_lock, flags);
-
-#ifdef DEBUG
-	printk("ht6560b: drive %s prefetch mode %sabled\n", drive->name, (state ? "en" : "dis"));
-#endif
-}
-
-static void ht6560b_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	unsigned long flags, config;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-	u8 timing;
-	
-	switch (pio) {
-	case 8:         /* set prefetch off */
-	case 9:         /* set prefetch on */
-		ht_set_prefetch(drive, pio & 1);
-		return;
-	}
-
-	timing = ht_pio2timings(drive, pio);
-
-	spin_lock_irqsave(&ht6560b_lock, flags);
-	config = (unsigned long)ide_get_drivedata(drive);
-	config &= 0xff00;
-	config |= timing;
-	ide_set_drivedata(drive, (void *)config);
-	spin_unlock_irqrestore(&ht6560b_lock, flags);
-
-#ifdef DEBUG
-	printk("ht6560b: drive %s tuned to pio mode %#x timing=%#x\n", drive->name, pio, timing);
-#endif
-}
-
-static void __init ht6560b_init_dev(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	/* Setting default configurations for drives. */
-	unsigned long t = (HT_CONFIG_DEFAULT << 8) | HT_TIMING_DEFAULT;
-
-	if (hwif->channel)
-		t |= (HT_SECONDARY_IF << 8);
-
-	ide_set_drivedata(drive, (void *)t);
-}
-
-static bool probe_ht6560b;
-
-module_param_named(probe, probe_ht6560b, bool, 0);
-MODULE_PARM_DESC(probe, "probe for HT6560B chipset");
-
-static const struct ide_tp_ops ht6560b_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= ht6560b_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
-
-static const struct ide_port_ops ht6560b_port_ops = {
-	.init_dev		= ht6560b_init_dev,
-	.set_pio_mode		= ht6560b_set_pio_mode,
-};
-
-static const struct ide_port_info ht6560b_port_info __initconst = {
-	.name			= DRV_NAME,
-	.chipset		= ide_ht6560b,
-	.tp_ops 		= &ht6560b_tp_ops,
-	.port_ops		= &ht6560b_port_ops,
-	.host_flags		= IDE_HFLAG_SERIALIZE | /* is this needed? */
-				  IDE_HFLAG_NO_DMA |
-				  IDE_HFLAG_ABUSE_PREFETCH,
-	.pio_mask		= ATA_PIO4,
-};
-
-static int __init ht6560b_init(void)
-{
-	if (probe_ht6560b == 0)
-		return -ENODEV;
-
-	if (!request_region(HT_CONFIG_PORT, 1, DRV_NAME)) {
-		printk(KERN_NOTICE "%s: HT_CONFIG_PORT not found\n",
-			__func__);
-		return -ENODEV;
-	}
-
-	if (!try_to_init_ht6560b()) {
-		printk(KERN_NOTICE "%s: HBA not found\n", __func__);
-		goto release_region;
-	}
-
-	return ide_legacy_device_add(&ht6560b_port_info, 0);
-
-release_region:
-	release_region(HT_CONFIG_PORT, 1);
-	return -ENODEV;
-}
-
-module_init(ht6560b_init);
-
-MODULE_AUTHOR("See Local File");
-MODULE_DESCRIPTION("HT-6560B EIDE-controller support");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/icside.c b/drivers/ide/icside.c
deleted file mode 100644
index 329c7e4bc9d0..000000000000
--- a/drivers/ide/icside.c
+++ /dev/null
@@ -1,692 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 1996-2004 Russell King.
- *
- * Please note that this platform does not support 32-bit IDE IO.
- */
-
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-#include <linux/errno.h>
-#include <linux/ide.h>
-#include <linux/dma-mapping.h>
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/scatterlist.h>
-#include <linux/io.h>
-
-#include <asm/dma.h>
-#include <asm/ecard.h>
-
-#define DRV_NAME "icside"
-
-#define ICS_IDENT_OFFSET		0x2280
-
-#define ICS_ARCIN_V5_INTRSTAT		0x0000
-#define ICS_ARCIN_V5_INTROFFSET		0x0004
-#define ICS_ARCIN_V5_IDEOFFSET		0x2800
-#define ICS_ARCIN_V5_IDEALTOFFSET	0x2b80
-#define ICS_ARCIN_V5_IDESTEPPING	6
-
-#define ICS_ARCIN_V6_IDEOFFSET_1	0x2000
-#define ICS_ARCIN_V6_INTROFFSET_1	0x2200
-#define ICS_ARCIN_V6_INTRSTAT_1		0x2290
-#define ICS_ARCIN_V6_IDEALTOFFSET_1	0x2380
-#define ICS_ARCIN_V6_IDEOFFSET_2	0x3000
-#define ICS_ARCIN_V6_INTROFFSET_2	0x3200
-#define ICS_ARCIN_V6_INTRSTAT_2		0x3290
-#define ICS_ARCIN_V6_IDEALTOFFSET_2	0x3380
-#define ICS_ARCIN_V6_IDESTEPPING	6
-
-struct cardinfo {
-	unsigned int dataoffset;
-	unsigned int ctrloffset;
-	unsigned int stepping;
-};
-
-static struct cardinfo icside_cardinfo_v5 = {
-	.dataoffset	= ICS_ARCIN_V5_IDEOFFSET,
-	.ctrloffset	= ICS_ARCIN_V5_IDEALTOFFSET,
-	.stepping	= ICS_ARCIN_V5_IDESTEPPING,
-};
-
-static struct cardinfo icside_cardinfo_v6_1 = {
-	.dataoffset	= ICS_ARCIN_V6_IDEOFFSET_1,
-	.ctrloffset	= ICS_ARCIN_V6_IDEALTOFFSET_1,
-	.stepping	= ICS_ARCIN_V6_IDESTEPPING,
-};
-
-static struct cardinfo icside_cardinfo_v6_2 = {
-	.dataoffset	= ICS_ARCIN_V6_IDEOFFSET_2,
-	.ctrloffset	= ICS_ARCIN_V6_IDEALTOFFSET_2,
-	.stepping	= ICS_ARCIN_V6_IDESTEPPING,
-};
-
-struct icside_state {
-	unsigned int channel;
-	unsigned int enabled;
-	void __iomem *irq_port;
-	void __iomem *ioc_base;
-	unsigned int sel;
-	unsigned int type;
-	struct ide_host *host;
-};
-
-#define ICS_TYPE_A3IN	0
-#define ICS_TYPE_A3USER	1
-#define ICS_TYPE_V6	3
-#define ICS_TYPE_V5	15
-#define ICS_TYPE_NOTYPE	((unsigned int)-1)
-
-/* ---------------- Version 5 PCB Support Functions --------------------- */
-/* Prototype: icside_irqenable_arcin_v5 (struct expansion_card *ec, int irqnr)
- * Purpose  : enable interrupts from card
- */
-static void icside_irqenable_arcin_v5 (struct expansion_card *ec, int irqnr)
-{
-	struct icside_state *state = ec->irq_data;
-
-	writeb(0, state->irq_port + ICS_ARCIN_V5_INTROFFSET);
-}
-
-/* Prototype: icside_irqdisable_arcin_v5 (struct expansion_card *ec, int irqnr)
- * Purpose  : disable interrupts from card
- */
-static void icside_irqdisable_arcin_v5 (struct expansion_card *ec, int irqnr)
-{
-	struct icside_state *state = ec->irq_data;
-
-	readb(state->irq_port + ICS_ARCIN_V5_INTROFFSET);
-}
-
-static const expansioncard_ops_t icside_ops_arcin_v5 = {
-	.irqenable	= icside_irqenable_arcin_v5,
-	.irqdisable	= icside_irqdisable_arcin_v5,
-};
-
-
-/* ---------------- Version 6 PCB Support Functions --------------------- */
-/* Prototype: icside_irqenable_arcin_v6 (struct expansion_card *ec, int irqnr)
- * Purpose  : enable interrupts from card
- */
-static void icside_irqenable_arcin_v6 (struct expansion_card *ec, int irqnr)
-{
-	struct icside_state *state = ec->irq_data;
-	void __iomem *base = state->irq_port;
-
-	state->enabled = 1;
-
-	switch (state->channel) {
-	case 0:
-		writeb(0, base + ICS_ARCIN_V6_INTROFFSET_1);
-		readb(base + ICS_ARCIN_V6_INTROFFSET_2);
-		break;
-	case 1:
-		writeb(0, base + ICS_ARCIN_V6_INTROFFSET_2);
-		readb(base + ICS_ARCIN_V6_INTROFFSET_1);
-		break;
-	}
-}
-
-/* Prototype: icside_irqdisable_arcin_v6 (struct expansion_card *ec, int irqnr)
- * Purpose  : disable interrupts from card
- */
-static void icside_irqdisable_arcin_v6 (struct expansion_card *ec, int irqnr)
-{
-	struct icside_state *state = ec->irq_data;
-
-	state->enabled = 0;
-
-	readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1);
-	readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2);
-}
-
-/* Prototype: icside_irqprobe(struct expansion_card *ec)
- * Purpose  : detect an active interrupt from card
- */
-static int icside_irqpending_arcin_v6(struct expansion_card *ec)
-{
-	struct icside_state *state = ec->irq_data;
-
-	return readb(state->irq_port + ICS_ARCIN_V6_INTRSTAT_1) & 1 ||
-	       readb(state->irq_port + ICS_ARCIN_V6_INTRSTAT_2) & 1;
-}
-
-static const expansioncard_ops_t icside_ops_arcin_v6 = {
-	.irqenable	= icside_irqenable_arcin_v6,
-	.irqdisable	= icside_irqdisable_arcin_v6,
-	.irqpending	= icside_irqpending_arcin_v6,
-};
-
-/*
- * Handle routing of interrupts.  This is called before
- * we write the command to the drive.
- */
-static void icside_maskproc(ide_drive_t *drive, int mask)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct expansion_card *ec = ECARD_DEV(hwif->dev);
-	struct icside_state *state = ecard_get_drvdata(ec);
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	state->channel = hwif->channel;
-
-	if (state->enabled && !mask) {
-		switch (hwif->channel) {
-		case 0:
-			writeb(0, state->irq_port + ICS_ARCIN_V6_INTROFFSET_1);
-			readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2);
-			break;
-		case 1:
-			writeb(0, state->irq_port + ICS_ARCIN_V6_INTROFFSET_2);
-			readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1);
-			break;
-		}
-	} else {
-		readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_2);
-		readb(state->irq_port + ICS_ARCIN_V6_INTROFFSET_1);
-	}
-
-	local_irq_restore(flags);
-}
-
-static const struct ide_port_ops icside_v6_no_dma_port_ops = {
-	.maskproc		= icside_maskproc,
-};
-
-#ifdef CONFIG_BLK_DEV_IDEDMA_ICS
-/*
- * SG-DMA support.
- *
- * Similar to the BM-DMA, but we use the RiscPCs IOMD DMA controllers.
- * There is only one DMA controller per card, which means that only
- * one drive can be accessed at one time.  NOTE! We do not enforce that
- * here, but we rely on the main IDE driver spotting that both
- * interfaces use the same IRQ, which should guarantee this.
- */
-
-/*
- * Configure the IOMD to give the appropriate timings for the transfer
- * mode being requested.  We take the advice of the ATA standards, and
- * calculate the cycle time based on the transfer mode, and the EIDE
- * MW DMA specs that the drive provides in the IDENTIFY command.
- *
- * We have the following IOMD DMA modes to choose from:
- *
- *	Type	Active		Recovery	Cycle
- *	A	250 (250)	312 (550)	562 (800)
- *	B	187		250		437
- *	C	125 (125)	125 (375)	250 (500)
- *	D	62		125		187
- *
- * (figures in brackets are actual measured timings)
- *
- * However, we also need to take care of the read/write active and
- * recovery timings:
- *
- *			Read	Write
- *  	Mode	Active	-- Recovery --	Cycle	IOMD type
- *	MW0	215	50	215	480	A
- *	MW1	80	50	50	150	C
- *	MW2	70	25	25	120	C
- */
-static void icside_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	unsigned long cycle_time = 0;
-	int use_dma_info = 0;
-	const u8 xfer_mode = drive->dma_mode;
-
-	switch (xfer_mode) {
-	case XFER_MW_DMA_2:
-		cycle_time = 250;
-		use_dma_info = 1;
-		break;
-
-	case XFER_MW_DMA_1:
-		cycle_time = 250;
-		use_dma_info = 1;
-		break;
-
-	case XFER_MW_DMA_0:
-		cycle_time = 480;
-		break;
-
-	case XFER_SW_DMA_2:
-	case XFER_SW_DMA_1:
-	case XFER_SW_DMA_0:
-		cycle_time = 480;
-		break;
-	}
-
-	/*
-	 * If we're going to be doing MW_DMA_1 or MW_DMA_2, we should
-	 * take care to note the values in the ID...
-	 */
-	if (use_dma_info && drive->id[ATA_ID_EIDE_DMA_TIME] > cycle_time)
-		cycle_time = drive->id[ATA_ID_EIDE_DMA_TIME];
-
-	ide_set_drivedata(drive, (void *)cycle_time);
-
-	printk(KERN_INFO "%s: %s selected (peak %luMB/s)\n",
-	       drive->name, ide_xfer_verbose(xfer_mode),
-	       2000 / (cycle_time ? cycle_time : (unsigned long) -1));
-}
-
-static const struct ide_port_ops icside_v6_port_ops = {
-	.set_dma_mode		= icside_set_dma_mode,
-	.maskproc		= icside_maskproc,
-};
-
-static void icside_dma_host_set(ide_drive_t *drive, int on)
-{
-}
-
-static int icside_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct expansion_card *ec = ECARD_DEV(hwif->dev);
-
-	disable_dma(ec->dma);
-
-	return get_dma_residue(ec->dma) != 0;
-}
-
-static void icside_dma_start(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct expansion_card *ec = ECARD_DEV(hwif->dev);
-
-	/* We can not enable DMA on both channels simultaneously. */
-	BUG_ON(dma_channel_active(ec->dma));
-	enable_dma(ec->dma);
-}
-
-static int icside_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct expansion_card *ec = ECARD_DEV(hwif->dev);
-	struct icside_state *state = ecard_get_drvdata(ec);
-	unsigned int dma_mode;
-
-	if (cmd->tf_flags & IDE_TFLAG_WRITE)
-		dma_mode = DMA_MODE_WRITE;
-	else
-		dma_mode = DMA_MODE_READ;
-
-	/*
-	 * We can not enable DMA on both channels.
-	 */
-	BUG_ON(dma_channel_active(ec->dma));
-
-	/*
-	 * Ensure that we have the right interrupt routed.
-	 */
-	icside_maskproc(drive, 0);
-
-	/*
-	 * Route the DMA signals to the correct interface.
-	 */
-	writeb(state->sel | hwif->channel, state->ioc_base);
-
-	/*
-	 * Select the correct timing for this drive.
-	 */
-	set_dma_speed(ec->dma, (unsigned long)ide_get_drivedata(drive));
-
-	/*
-	 * Tell the DMA engine about the SG table and
-	 * data direction.
-	 */
-	set_dma_sg(ec->dma, hwif->sg_table, cmd->sg_nents);
-	set_dma_mode(ec->dma, dma_mode);
-
-	return 0;
-}
-
-static int icside_dma_test_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct expansion_card *ec = ECARD_DEV(hwif->dev);
-	struct icside_state *state = ecard_get_drvdata(ec);
-
-	return readb(state->irq_port +
-		     (hwif->channel ?
-			ICS_ARCIN_V6_INTRSTAT_2 :
-			ICS_ARCIN_V6_INTRSTAT_1)) & 1;
-}
-
-static int icside_dma_init(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	hwif->dmatable_cpu	= NULL;
-	hwif->dmatable_dma	= 0;
-
-	return 0;
-}
-
-static const struct ide_dma_ops icside_v6_dma_ops = {
-	.dma_host_set		= icside_dma_host_set,
-	.dma_setup		= icside_dma_setup,
-	.dma_start		= icside_dma_start,
-	.dma_end		= icside_dma_end,
-	.dma_test_irq		= icside_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-};
-#endif
-
-static int icside_dma_off_init(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	return -EOPNOTSUPP;
-}
-
-static void icside_setup_ports(struct ide_hw *hw, void __iomem *base,
-			       struct cardinfo *info, struct expansion_card *ec)
-{
-	unsigned long port = (unsigned long)base + info->dataoffset;
-
-	hw->io_ports.data_addr	 = port;
-	hw->io_ports.error_addr	 = port + (1 << info->stepping);
-	hw->io_ports.nsect_addr	 = port + (2 << info->stepping);
-	hw->io_ports.lbal_addr	 = port + (3 << info->stepping);
-	hw->io_ports.lbam_addr	 = port + (4 << info->stepping);
-	hw->io_ports.lbah_addr	 = port + (5 << info->stepping);
-	hw->io_ports.device_addr = port + (6 << info->stepping);
-	hw->io_ports.status_addr = port + (7 << info->stepping);
-	hw->io_ports.ctl_addr	 = (unsigned long)base + info->ctrloffset;
-
-	hw->irq = ec->irq;
-	hw->dev = &ec->dev;
-}
-
-static const struct ide_port_info icside_v5_port_info = {
-	.host_flags		= IDE_HFLAG_NO_DMA,
-	.chipset		= ide_acorn,
-};
-
-static int icside_register_v5(struct icside_state *state,
-			      struct expansion_card *ec)
-{
-	void __iomem *base;
-	struct ide_host *host;
-	struct ide_hw hw, *hws[] = { &hw };
-	int ret;
-
-	base = ecardm_iomap(ec, ECARD_RES_MEMC, 0, 0);
-	if (!base)
-		return -ENOMEM;
-
-	state->irq_port = base;
-
-	ec->irqaddr  = base + ICS_ARCIN_V5_INTRSTAT;
-	ec->irqmask  = 1;
-
-	ecard_setirq(ec, &icside_ops_arcin_v5, state);
-
-	/*
-	 * Be on the safe side - disable interrupts
-	 */
-	icside_irqdisable_arcin_v5(ec, 0);
-
-	icside_setup_ports(&hw, base, &icside_cardinfo_v5, ec);
-
-	host = ide_host_alloc(&icside_v5_port_info, hws, 1);
-	if (host == NULL)
-		return -ENODEV;
-
-	state->host = host;
-
-	ecard_set_drvdata(ec, state);
-
-	ret = ide_host_register(host, &icside_v5_port_info, hws);
-	if (ret)
-		goto err_free;
-
-	return 0;
-err_free:
-	ide_host_free(host);
-	ecard_set_drvdata(ec, NULL);
-	return ret;
-}
-
-static const struct ide_port_info icside_v6_port_info = {
-	.init_dma		= icside_dma_off_init,
-	.port_ops		= &icside_v6_no_dma_port_ops,
-	.host_flags		= IDE_HFLAG_SERIALIZE | IDE_HFLAG_MMIO,
-	.mwdma_mask		= ATA_MWDMA2,
-	.swdma_mask		= ATA_SWDMA2,
-	.chipset		= ide_acorn,
-};
-
-static int icside_register_v6(struct icside_state *state,
-			      struct expansion_card *ec)
-{
-	void __iomem *ioc_base, *easi_base;
-	struct ide_host *host;
-	unsigned int sel = 0;
-	int ret;
-	struct ide_hw hw[2], *hws[] = { &hw[0], &hw[1] };
-	struct ide_port_info d = icside_v6_port_info;
-
-	ioc_base = ecardm_iomap(ec, ECARD_RES_IOCFAST, 0, 0);
-	if (!ioc_base) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	easi_base = ioc_base;
-
-	if (ecard_resource_flags(ec, ECARD_RES_EASI)) {
-		easi_base = ecardm_iomap(ec, ECARD_RES_EASI, 0, 0);
-		if (!easi_base) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		/*
-		 * Enable access to the EASI region.
-		 */
-		sel = 1 << 5;
-	}
-
-	writeb(sel, ioc_base);
-
-	ecard_setirq(ec, &icside_ops_arcin_v6, state);
-
-	state->irq_port   = easi_base;
-	state->ioc_base   = ioc_base;
-	state->sel	  = sel;
-
-	/*
-	 * Be on the safe side - disable interrupts
-	 */
-	icside_irqdisable_arcin_v6(ec, 0);
-
-	icside_setup_ports(&hw[0], easi_base, &icside_cardinfo_v6_1, ec);
-	icside_setup_ports(&hw[1], easi_base, &icside_cardinfo_v6_2, ec);
-
-	host = ide_host_alloc(&d, hws, 2);
-	if (host == NULL)
-		return -ENODEV;
-
-	state->host = host;
-
-	ecard_set_drvdata(ec, state);
-
-#ifdef CONFIG_BLK_DEV_IDEDMA_ICS
-	if (ec->dma != NO_DMA && !request_dma(ec->dma, DRV_NAME)) {
-		d.init_dma = icside_dma_init;
-		d.port_ops = &icside_v6_port_ops;
-		d.dma_ops  = &icside_v6_dma_ops;
-	}
-#endif
-
-	ret = ide_host_register(host, &d, hws);
-	if (ret)
-		goto err_free;
-
-	return 0;
-err_free:
-	ide_host_free(host);
-	if (d.dma_ops)
-		free_dma(ec->dma);
-	ecard_set_drvdata(ec, NULL);
-out:
-	return ret;
-}
-
-static int icside_probe(struct expansion_card *ec, const struct ecard_id *id)
-{
-	struct icside_state *state;
-	void __iomem *idmem;
-	int ret;
-
-	ret = ecard_request_resources(ec);
-	if (ret)
-		goto out;
-
-	state = kzalloc(sizeof(struct icside_state), GFP_KERNEL);
-	if (!state) {
-		ret = -ENOMEM;
-		goto release;
-	}
-
-	state->type	= ICS_TYPE_NOTYPE;
-
-	idmem = ecardm_iomap(ec, ECARD_RES_IOCFAST, 0, 0);
-	if (idmem) {
-		unsigned int type;
-
-		type = readb(idmem + ICS_IDENT_OFFSET) & 1;
-		type |= (readb(idmem + ICS_IDENT_OFFSET + 4) & 1) << 1;
-		type |= (readb(idmem + ICS_IDENT_OFFSET + 8) & 1) << 2;
-		type |= (readb(idmem + ICS_IDENT_OFFSET + 12) & 1) << 3;
-		ecardm_iounmap(ec, idmem);
-
-		state->type = type;
-	}
-
-	switch (state->type) {
-	case ICS_TYPE_A3IN:
-		dev_warn(&ec->dev, "A3IN unsupported\n");
-		ret = -ENODEV;
-		break;
-
-	case ICS_TYPE_A3USER:
-		dev_warn(&ec->dev, "A3USER unsupported\n");
-		ret = -ENODEV;
-		break;
-
-	case ICS_TYPE_V5:
-		ret = icside_register_v5(state, ec);
-		break;
-
-	case ICS_TYPE_V6:
-		ret = icside_register_v6(state, ec);
-		break;
-
-	default:
-		dev_warn(&ec->dev, "unknown interface type\n");
-		ret = -ENODEV;
-		break;
-	}
-
-	if (ret == 0)
-		goto out;
-
-	kfree(state);
- release:
-	ecard_release_resources(ec);
- out:
-	return ret;
-}
-
-static void icside_remove(struct expansion_card *ec)
-{
-	struct icside_state *state = ecard_get_drvdata(ec);
-
-	switch (state->type) {
-	case ICS_TYPE_V5:
-		/* FIXME: tell IDE to stop using the interface */
-
-		/* Disable interrupts */
-		icside_irqdisable_arcin_v5(ec, 0);
-		break;
-
-	case ICS_TYPE_V6:
-		/* FIXME: tell IDE to stop using the interface */
-		if (ec->dma != NO_DMA)
-			free_dma(ec->dma);
-
-		/* Disable interrupts */
-		icside_irqdisable_arcin_v6(ec, 0);
-
-		/* Reset the ROM pointer/EASI selection */
-		writeb(0, state->ioc_base);
-		break;
-	}
-
-	ecard_set_drvdata(ec, NULL);
-
-	kfree(state);
-	ecard_release_resources(ec);
-}
-
-static void icside_shutdown(struct expansion_card *ec)
-{
-	struct icside_state *state = ecard_get_drvdata(ec);
-	unsigned long flags;
-
-	/*
-	 * Disable interrupts from this card.  We need to do
-	 * this before disabling EASI since we may be accessing
-	 * this register via that region.
-	 */
-	local_irq_save(flags);
-	ec->ops->irqdisable(ec, 0);
-	local_irq_restore(flags);
-
-	/*
-	 * Reset the ROM pointer so that we can read the ROM
-	 * after a soft reboot.  This also disables access to
-	 * the IDE taskfile via the EASI region.
-	 */
-	if (state->ioc_base)
-		writeb(0, state->ioc_base);
-}
-
-static const struct ecard_id icside_ids[] = {
-	{ MANU_ICS,  PROD_ICS_IDE  },
-	{ MANU_ICS2, PROD_ICS2_IDE },
-	{ 0xffff, 0xffff }
-};
-
-static struct ecard_driver icside_driver = {
-	.probe		= icside_probe,
-	.remove		= icside_remove,
-	.shutdown	= icside_shutdown,
-	.id_table	= icside_ids,
-	.drv = {
-		.name	= "icside",
-	},
-};
-
-static int __init icside_init(void)
-{
-	return ecard_register_driver(&icside_driver);
-}
-
-static void __exit icside_exit(void)
-{
-	ecard_remove_driver(&icside_driver);
-}
-
-MODULE_AUTHOR("Russell King <rmk@arm.linux.org.uk>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("ICS IDE driver");
-
-module_init(icside_init);
-module_exit(icside_exit);
diff --git a/drivers/ide/ide-4drives.c b/drivers/ide/ide-4drives.c
deleted file mode 100644
index 06c6215e0cbe..000000000000
--- a/drivers/ide/ide-4drives.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/ide.h>
-
-#define DRV_NAME "ide-4drives"
-
-static bool probe_4drives;
-
-module_param_named(probe, probe_4drives, bool, 0);
-MODULE_PARM_DESC(probe, "probe for generic IDE chipset with 4 drives/port");
-
-static void ide_4drives_init_dev(ide_drive_t *drive)
-{
-	if (drive->hwif->channel)
-		drive->select ^= 0x20;
-}
-
-static const struct ide_port_ops ide_4drives_port_ops = {
-	.init_dev		= ide_4drives_init_dev,
-};
-
-static const struct ide_port_info ide_4drives_port_info = {
-	.port_ops		= &ide_4drives_port_ops,
-	.host_flags		= IDE_HFLAG_SERIALIZE | IDE_HFLAG_NO_DMA |
-				  IDE_HFLAG_4DRIVES,
-	.chipset		= ide_4drives,
-};
-
-static int __init ide_4drives_init(void)
-{
-	unsigned long base = 0x1f0, ctl = 0x3f6;
-	struct ide_hw hw, *hws[] = { &hw, &hw };
-
-	if (probe_4drives == 0)
-		return -ENODEV;
-
-	if (!request_region(base, 8, DRV_NAME)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n",
-				DRV_NAME, base, base + 7);
-		return -EBUSY;
-	}
-
-	if (!request_region(ctl, 1, DRV_NAME)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX not free.\n",
-				DRV_NAME, ctl);
-		release_region(base, 8);
-		return -EBUSY;
-	}
-
-	memset(&hw, 0, sizeof(hw));
-
-	ide_std_init_ports(&hw, base, ctl);
-	hw.irq = 14;
-
-	return ide_host_add(&ide_4drives_port_info, hws, 2, NULL);
-}
-
-module_init(ide_4drives_init);
-
-MODULE_AUTHOR("Bartlomiej Zolnierkiewicz");
-MODULE_DESCRIPTION("generic IDE chipset with 4 drives/port support");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c
deleted file mode 100644
index 05e18d658141..000000000000
--- a/drivers/ide/ide-acpi.c
+++ /dev/null
@@ -1,622 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Provides ACPI support for IDE drives.
- *
- * Copyright (C) 2005 Intel Corp.
- * Copyright (C) 2005 Randy Dunlap
- * Copyright (C) 2006 SUSE Linux Products GmbH
- * Copyright (C) 2006 Hannes Reinecke
- */
-
-#include <linux/acpi.h>
-#include <linux/ata.h>
-#include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/ide.h>
-#include <linux/pci.h>
-#include <linux/dmi.h>
-#include <linux/module.h>
-
-#define REGS_PER_GTF		7
-
-struct GTM_buffer {
-	u32	PIO_speed0;
-	u32	DMA_speed0;
-	u32	PIO_speed1;
-	u32	DMA_speed1;
-	u32	GTM_flags;
-};
-
-struct ide_acpi_drive_link {
-	acpi_handle	 obj_handle;
-	u8		 idbuff[512];
-};
-
-struct ide_acpi_hwif_link {
-	ide_hwif_t			*hwif;
-	acpi_handle			 obj_handle;
-	struct GTM_buffer		 gtm;
-	struct ide_acpi_drive_link	 master;
-	struct ide_acpi_drive_link	 slave;
-};
-
-#undef DEBUGGING
-/* note: adds function name and KERN_DEBUG */
-#ifdef DEBUGGING
-#define DEBPRINT(fmt, args...)	\
-		printk(KERN_DEBUG "%s: " fmt, __func__, ## args)
-#else
-#define DEBPRINT(fmt, args...)	do {} while (0)
-#endif	/* DEBUGGING */
-
-static bool ide_noacpi;
-module_param_named(noacpi, ide_noacpi, bool, 0);
-MODULE_PARM_DESC(noacpi, "disable IDE ACPI support");
-
-static bool ide_acpigtf;
-module_param_named(acpigtf, ide_acpigtf, bool, 0);
-MODULE_PARM_DESC(acpigtf, "enable IDE ACPI _GTF support");
-
-static bool ide_acpionboot;
-module_param_named(acpionboot, ide_acpionboot, bool, 0);
-MODULE_PARM_DESC(acpionboot, "call IDE ACPI methods on boot");
-
-static bool ide_noacpi_psx;
-static int no_acpi_psx(const struct dmi_system_id *id)
-{
-	ide_noacpi_psx = true;
-	printk(KERN_NOTICE"%s detected - disable ACPI _PSx.\n", id->ident);
-	return 0;
-}
-
-static const struct dmi_system_id ide_acpi_dmi_table[] = {
-	/* Bug 9673. */
-	/* We should check if this is because ACPI NVS isn't save/restored. */
-	{
-		.callback = no_acpi_psx,
-		.ident    = "HP nx9005",
-		.matches  = {
-			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies Ltd."),
-			DMI_MATCH(DMI_BIOS_VERSION, "KAM1.60")
-		},
-	},
-
-	{ }	/* terminate list */
-};
-
-int ide_acpi_init(void)
-{
-	dmi_check_system(ide_acpi_dmi_table);
-	return 0;
-}
-
-bool ide_port_acpi(ide_hwif_t *hwif)
-{
-	return ide_noacpi == 0 && hwif->acpidata;
-}
-
-static acpi_handle acpi_get_child(acpi_handle handle, u64 addr)
-{
-	struct acpi_device *adev;
-
-	if (!handle || acpi_bus_get_device(handle, &adev))
-		return NULL;
-
-	adev = acpi_find_child_device(adev, addr, false);
-	return adev ? adev->handle : NULL;
-}
-
-/**
- * ide_get_dev_handle - finds acpi_handle and PCI device.function
- * @dev: device to locate
- * @handle: returned acpi_handle for @dev
- * @pcidevfn: return PCI device.func for @dev
- *
- * Returns the ACPI object handle to the corresponding PCI device.
- *
- * Returns 0 on success, <0 on error.
- */
-static int ide_get_dev_handle(struct device *dev, acpi_handle *handle,
-			       u64 *pcidevfn)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	unsigned int bus, devnum, func;
-	u64 addr;
-	acpi_handle dev_handle;
-	acpi_status status;
-	struct acpi_device_info	*dinfo = NULL;
-	int ret = -ENODEV;
-
-	bus = pdev->bus->number;
-	devnum = PCI_SLOT(pdev->devfn);
-	func = PCI_FUNC(pdev->devfn);
-	/* ACPI _ADR encoding for PCI bus: */
-	addr = (u64)(devnum << 16 | func);
-
-	DEBPRINT("ENTER: pci %02x:%02x.%01x\n", bus, devnum, func);
-
-	dev_handle = ACPI_HANDLE(dev);
-	if (!dev_handle) {
-		DEBPRINT("no acpi handle for device\n");
-		goto err;
-	}
-
-	status = acpi_get_object_info(dev_handle, &dinfo);
-	if (ACPI_FAILURE(status)) {
-		DEBPRINT("get_object_info for device failed\n");
-		goto err;
-	}
-	if (dinfo && (dinfo->valid & ACPI_VALID_ADR) &&
-	    dinfo->address == addr) {
-		*pcidevfn = addr;
-		*handle = dev_handle;
-	} else {
-		DEBPRINT("get_object_info for device has wrong "
-			" address: %llu, should be %u\n",
-			dinfo ? (unsigned long long)dinfo->address : -1ULL,
-			(unsigned int)addr);
-		goto err;
-	}
-
-	DEBPRINT("for dev=0x%x.%x, addr=0x%llx, *handle=0x%p\n",
-		 devnum, func, (unsigned long long)addr, *handle);
-	ret = 0;
-err:
-	kfree(dinfo);
-	return ret;
-}
-
-/**
- * ide_acpi_hwif_get_handle - Get ACPI object handle for a given hwif
- * @hwif: device to locate
- *
- * Retrieves the object handle for a given hwif.
- *
- * Returns handle on success, 0 on error.
- */
-static acpi_handle ide_acpi_hwif_get_handle(ide_hwif_t *hwif)
-{
-	struct device		*dev = hwif->gendev.parent;
-	acpi_handle		dev_handle;
-	u64			pcidevfn;
-	acpi_handle		chan_handle;
-	int			err;
-
-	DEBPRINT("ENTER: device %s\n", hwif->name);
-
-	if (!dev) {
-		DEBPRINT("no PCI device for %s\n", hwif->name);
-		return NULL;
-	}
-
-	err = ide_get_dev_handle(dev, &dev_handle, &pcidevfn);
-	if (err < 0) {
-		DEBPRINT("ide_get_dev_handle failed (%d)\n", err);
-		return NULL;
-	}
-
-	/* get child objects of dev_handle == channel objects,
-	 * + _their_ children == drive objects */
-	/* channel is hwif->channel */
-	chan_handle = acpi_get_child(dev_handle, hwif->channel);
-	DEBPRINT("chan adr=%d: handle=0x%p\n",
-		 hwif->channel, chan_handle);
-
-	return chan_handle;
-}
-
-/**
- * do_drive_get_GTF - get the drive bootup default taskfile settings
- * @drive: the drive for which the taskfile settings should be retrieved
- * @gtf_length: number of bytes of _GTF data returned at @gtf_address
- * @gtf_address: buffer containing _GTF taskfile arrays
- *
- * The _GTF method has no input parameters.
- * It returns a variable number of register set values (registers
- * hex 1F1..1F7, taskfiles).
- * The <variable number> is not known in advance, so have ACPI-CA
- * allocate the buffer as needed and return it, then free it later.
- *
- * The returned @gtf_length and @gtf_address are only valid if the
- * function return value is 0.
- */
-static int do_drive_get_GTF(ide_drive_t *drive,
-		     unsigned int *gtf_length, unsigned long *gtf_address,
-		     unsigned long *obj_loc)
-{
-	acpi_status			status;
-	struct acpi_buffer		output;
-	union acpi_object 		*out_obj;
-	int				err = -ENODEV;
-
-	*gtf_length = 0;
-	*gtf_address = 0UL;
-	*obj_loc = 0UL;
-
-	if (!drive->acpidata->obj_handle) {
-		DEBPRINT("No ACPI object found for %s\n", drive->name);
-		goto out;
-	}
-
-	/* Setting up output buffer */
-	output.length = ACPI_ALLOCATE_BUFFER;
-	output.pointer = NULL;	/* ACPI-CA sets this; save/free it later */
-
-	/* _GTF has no input parameters */
-	err = -EIO;
-	status = acpi_evaluate_object(drive->acpidata->obj_handle, "_GTF",
-				      NULL, &output);
-	if (ACPI_FAILURE(status)) {
-		printk(KERN_DEBUG
-		       "%s: Run _GTF error: status = 0x%x\n",
-		       __func__, status);
-		goto out;
-	}
-
-	if (!output.length || !output.pointer) {
-		DEBPRINT("Run _GTF: "
-		       "length or ptr is NULL (0x%llx, 0x%p)\n",
-		       (unsigned long long)output.length,
-		       output.pointer);
-		goto out;
-	}
-
-	out_obj = output.pointer;
-	if (out_obj->type != ACPI_TYPE_BUFFER) {
-		DEBPRINT("Run _GTF: error: "
-		       "expected object type of ACPI_TYPE_BUFFER, "
-		       "got 0x%x\n", out_obj->type);
-		err = -ENOENT;
-		kfree(output.pointer);
-		goto out;
-	}
-
-	if (!out_obj->buffer.length || !out_obj->buffer.pointer ||
-	    out_obj->buffer.length % REGS_PER_GTF) {
-		printk(KERN_ERR
-		       "%s: unexpected GTF length (%d) or addr (0x%p)\n",
-		       __func__, out_obj->buffer.length,
-		       out_obj->buffer.pointer);
-		err = -ENOENT;
-		kfree(output.pointer);
-		goto out;
-	}
-
-	*gtf_length = out_obj->buffer.length;
-	*gtf_address = (unsigned long)out_obj->buffer.pointer;
-	*obj_loc = (unsigned long)out_obj;
-	DEBPRINT("returning gtf_length=%d, gtf_address=0x%lx, obj_loc=0x%lx\n",
-		 *gtf_length, *gtf_address, *obj_loc);
-	err = 0;
-out:
-	return err;
-}
-
-/**
- * do_drive_set_taskfiles - write the drive taskfile settings from _GTF
- * @drive: the drive to which the taskfile command should be sent
- * @gtf_length: total number of bytes of _GTF taskfiles
- * @gtf_address: location of _GTF taskfile arrays
- *
- * Write {gtf_address, length gtf_length} in groups of
- * REGS_PER_GTF bytes.
- */
-static int do_drive_set_taskfiles(ide_drive_t *drive,
-				  unsigned int gtf_length,
-				  unsigned long gtf_address)
-{
-	int			rc = 0, err;
-	int			gtf_count = gtf_length / REGS_PER_GTF;
-	int			ix;
-
-	DEBPRINT("total GTF bytes=%u (0x%x), gtf_count=%d, addr=0x%lx\n",
-		 gtf_length, gtf_length, gtf_count, gtf_address);
-
-	/* send all taskfile registers (0x1f1-0x1f7) *in*that*order* */
-	for (ix = 0; ix < gtf_count; ix++) {
-		u8 *gtf = (u8 *)(gtf_address + ix * REGS_PER_GTF);
-		struct ide_cmd cmd;
-
-		DEBPRINT("(0x1f1-1f7): "
-			 "hex: %02x %02x %02x %02x %02x %02x %02x\n",
-			 gtf[0], gtf[1], gtf[2],
-			 gtf[3], gtf[4], gtf[5], gtf[6]);
-
-		if (!ide_acpigtf) {
-			DEBPRINT("_GTF execution disabled\n");
-			continue;
-		}
-
-		/* convert GTF to taskfile */
-		memset(&cmd, 0, sizeof(cmd));
-		memcpy(&cmd.tf.feature, gtf, REGS_PER_GTF);
-		cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-		cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-
-		err = ide_no_data_taskfile(drive, &cmd);
-		if (err) {
-			printk(KERN_ERR "%s: ide_no_data_taskfile failed: %u\n",
-					__func__, err);
-			rc = err;
-		}
-	}
-
-	return rc;
-}
-
-/**
- * ide_acpi_exec_tfs - get then write drive taskfile settings
- * @drive: the drive for which the taskfile settings should be
- *         written.
- *
- * According to the ACPI spec this should be called after _STM
- * has been evaluated for the interface. Some ACPI vendors interpret
- * that as a hard requirement and modify the taskfile according
- * to the Identify Drive information passed down with _STM.
- * So one should really make sure to call this only after _STM has
- * been executed.
- */
-int ide_acpi_exec_tfs(ide_drive_t *drive)
-{
-	int		ret;
-	unsigned int	gtf_length;
-	unsigned long	gtf_address;
-	unsigned long	obj_loc;
-
-	DEBPRINT("call get_GTF, drive=%s port=%d\n", drive->name, drive->dn);
-
-	ret = do_drive_get_GTF(drive, &gtf_length, &gtf_address, &obj_loc);
-	if (ret < 0) {
-		DEBPRINT("get_GTF error (%d)\n", ret);
-		return ret;
-	}
-
-	DEBPRINT("call set_taskfiles, drive=%s\n", drive->name);
-
-	ret = do_drive_set_taskfiles(drive, gtf_length, gtf_address);
-	kfree((void *)obj_loc);
-	if (ret < 0) {
-		DEBPRINT("set_taskfiles error (%d)\n", ret);
-	}
-
-	DEBPRINT("ret=%d\n", ret);
-
-	return ret;
-}
-
-/**
- * ide_acpi_get_timing - get the channel (controller) timings
- * @hwif: target IDE interface (channel)
- *
- * This function executes the _GTM ACPI method for the target channel.
- *
- */
-void ide_acpi_get_timing(ide_hwif_t *hwif)
-{
-	acpi_status		status;
-	struct acpi_buffer	output;
-	union acpi_object 	*out_obj;
-
-	/* Setting up output buffer for _GTM */
-	output.length = ACPI_ALLOCATE_BUFFER;
-	output.pointer = NULL;	/* ACPI-CA sets this; save/free it later */
-
-	/* _GTM has no input parameters */
-	status = acpi_evaluate_object(hwif->acpidata->obj_handle, "_GTM",
-				      NULL, &output);
-
-	DEBPRINT("_GTM status: %d, outptr: 0x%p, outlen: 0x%llx\n",
-		 status, output.pointer,
-		 (unsigned long long)output.length);
-
-	if (ACPI_FAILURE(status)) {
-		DEBPRINT("Run _GTM error: status = 0x%x\n", status);
-		return;
-	}
-
-	if (!output.length || !output.pointer) {
-		DEBPRINT("Run _GTM: length or ptr is NULL (0x%llx, 0x%p)\n",
-		       (unsigned long long)output.length,
-		       output.pointer);
-		kfree(output.pointer);
-		return;
-	}
-
-	out_obj = output.pointer;
-	if (out_obj->type != ACPI_TYPE_BUFFER) {
-		DEBPRINT("Run _GTM: error: "
-		       "expected object type of ACPI_TYPE_BUFFER, "
-		       "got 0x%x\n", out_obj->type);
-		kfree(output.pointer);
-		return;
-	}
-
-	if (!out_obj->buffer.length || !out_obj->buffer.pointer ||
-	    out_obj->buffer.length != sizeof(struct GTM_buffer)) {
-		printk(KERN_ERR
-			"%s: unexpected _GTM length (0x%x)[should be 0x%zx] or "
-			"addr (0x%p)\n",
-			__func__, out_obj->buffer.length,
-			sizeof(struct GTM_buffer), out_obj->buffer.pointer);
-		kfree(output.pointer);
-		return;
-	}
-
-	memcpy(&hwif->acpidata->gtm, out_obj->buffer.pointer,
-	       sizeof(struct GTM_buffer));
-
-	DEBPRINT("_GTM info: ptr: 0x%p, len: 0x%x, exp.len: 0x%zx\n",
-		 out_obj->buffer.pointer, out_obj->buffer.length,
-		 sizeof(struct GTM_buffer));
-
-	DEBPRINT("_GTM fields: 0x%x, 0x%x, 0x%x, 0x%x, 0x%x\n",
-		 hwif->acpidata->gtm.PIO_speed0,
-		 hwif->acpidata->gtm.DMA_speed0,
-		 hwif->acpidata->gtm.PIO_speed1,
-		 hwif->acpidata->gtm.DMA_speed1,
-		 hwif->acpidata->gtm.GTM_flags);
-
-	kfree(output.pointer);
-}
-
-/**
- * ide_acpi_push_timing - set the channel (controller) timings
- * @hwif: target IDE interface (channel)
- *
- * This function executes the _STM ACPI method for the target channel.
- *
- * _STM requires Identify Drive data, which has to passed as an argument.
- * Unfortunately drive->id is a mangled version which we can't readily
- * use; hence we'll get the information afresh.
- */
-void ide_acpi_push_timing(ide_hwif_t *hwif)
-{
-	acpi_status		status;
-	struct acpi_object_list	input;
-	union acpi_object 	in_params[3];
-	struct ide_acpi_drive_link	*master = &hwif->acpidata->master;
-	struct ide_acpi_drive_link	*slave = &hwif->acpidata->slave;
-
-	/* Give the GTM buffer + drive Identify data to the channel via the
-	 * _STM method: */
-	/* setup input parameters buffer for _STM */
-	input.count = 3;
-	input.pointer = in_params;
-	in_params[0].type = ACPI_TYPE_BUFFER;
-	in_params[0].buffer.length = sizeof(struct GTM_buffer);
-	in_params[0].buffer.pointer = (u8 *)&hwif->acpidata->gtm;
-	in_params[1].type = ACPI_TYPE_BUFFER;
-	in_params[1].buffer.length = ATA_ID_WORDS * 2;
-	in_params[1].buffer.pointer = (u8 *)&master->idbuff;
-	in_params[2].type = ACPI_TYPE_BUFFER;
-	in_params[2].buffer.length = ATA_ID_WORDS * 2;
-	in_params[2].buffer.pointer = (u8 *)&slave->idbuff;
-	/* Output buffer: _STM has no output */
-
-	status = acpi_evaluate_object(hwif->acpidata->obj_handle, "_STM",
-				      &input, NULL);
-
-	if (ACPI_FAILURE(status)) {
-		DEBPRINT("Run _STM error: status = 0x%x\n", status);
-	}
-	DEBPRINT("_STM status: %d\n", status);
-}
-
-/**
- * ide_acpi_set_state - set the channel power state
- * @hwif: target IDE interface
- * @on: state, on/off
- *
- * This function executes the _PS0/_PS3 ACPI method to set the power state.
- * ACPI spec requires _PS0 when IDE power on and _PS3 when power off
- */
-void ide_acpi_set_state(ide_hwif_t *hwif, int on)
-{
-	ide_drive_t *drive;
-	int i;
-
-	if (ide_noacpi_psx)
-		return;
-
-	DEBPRINT("ENTER:\n");
-
-	/* channel first and then drives for power on and verse versa for power off */
-	if (on)
-		acpi_bus_set_power(hwif->acpidata->obj_handle, ACPI_STATE_D0);
-
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		if (drive->acpidata->obj_handle)
-			acpi_bus_set_power(drive->acpidata->obj_handle,
-				on ? ACPI_STATE_D0 : ACPI_STATE_D3_COLD);
-	}
-
-	if (!on)
-		acpi_bus_set_power(hwif->acpidata->obj_handle,
-				   ACPI_STATE_D3_COLD);
-}
-
-/**
- * ide_acpi_init_port - initialize the ACPI link for an IDE interface
- * @hwif: target IDE interface (channel)
- *
- * The ACPI spec is not quite clear when the drive identify buffer
- * should be obtained. Calling IDENTIFY DEVICE during shutdown
- * is not the best of ideas as the drive might already being put to
- * sleep. And obviously we can't call it during resume.
- * So we get the information during startup; but this means that
- * any changes during run-time will be lost after resume.
- */
-void ide_acpi_init_port(ide_hwif_t *hwif)
-{
-	hwif->acpidata = kzalloc(sizeof(struct ide_acpi_hwif_link), GFP_KERNEL);
-	if (!hwif->acpidata)
-		return;
-
-	hwif->acpidata->obj_handle = ide_acpi_hwif_get_handle(hwif);
-	if (!hwif->acpidata->obj_handle) {
-		DEBPRINT("no ACPI object for %s found\n", hwif->name);
-		kfree(hwif->acpidata);
-		hwif->acpidata = NULL;
-	}
-}
-
-void ide_acpi_port_init_devices(ide_hwif_t *hwif)
-{
-	ide_drive_t *drive;
-	int i, err;
-
-	if (hwif->acpidata == NULL)
-		return;
-
-	/*
-	 * The ACPI spec mandates that we send information
-	 * for both drives, regardless whether they are connected
-	 * or not.
-	 */
-	hwif->devices[0]->acpidata = &hwif->acpidata->master;
-	hwif->devices[1]->acpidata = &hwif->acpidata->slave;
-
-	/* get _ADR info for each device */
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		acpi_handle dev_handle;
-
-		DEBPRINT("ENTER: %s at channel#: %d port#: %d\n",
-			 drive->name, hwif->channel, drive->dn & 1);
-
-		/* TBD: could also check ACPI object VALID bits */
-		dev_handle = acpi_get_child(hwif->acpidata->obj_handle,
-					    drive->dn & 1);
-
-		DEBPRINT("drive %s handle 0x%p\n", drive->name, dev_handle);
-
-		drive->acpidata->obj_handle = dev_handle;
-	}
-
-	/* send IDENTIFY for each device */
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		err = taskfile_lib_get_identify(drive, drive->acpidata->idbuff);
-		if (err)
-			DEBPRINT("identify device %s failed (%d)\n",
-				 drive->name, err);
-	}
-
-	if (ide_noacpi || ide_acpionboot == 0) {
-		DEBPRINT("ACPI methods disabled on boot\n");
-		return;
-	}
-
-	/* ACPI _PS0 before _STM */
-	ide_acpi_set_state(hwif, 1);
-	/*
-	 * ACPI requires us to call _STM on startup
-	 */
-	ide_acpi_get_timing(hwif);
-	ide_acpi_push_timing(hwif);
-
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		ide_acpi_exec_tfs(drive);
-	}
-}
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
deleted file mode 100644
index a1ce9f5ac3aa..000000000000
--- a/drivers/ide/ide-atapi.c
+++ /dev/null
@@ -1,756 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * ATAPI support.
- */
-
-#include <linux/kernel.h>
-#include <linux/cdrom.h>
-#include <linux/delay.h>
-#include <linux/export.h>
-#include <linux/ide.h>
-#include <linux/scatterlist.h>
-#include <linux/gfp.h>
-
-#include <scsi/scsi.h>
-
-#define DRV_NAME "ide-atapi"
-#define PFX DRV_NAME ": "
-
-#ifdef DEBUG
-#define debug_log(fmt, args...) \
-	printk(KERN_INFO "ide: " fmt, ## args)
-#else
-#define debug_log(fmt, args...) do {} while (0)
-#endif
-
-#define ATAPI_MIN_CDB_BYTES	12
-
-static inline int dev_is_idecd(ide_drive_t *drive)
-{
-	return drive->media == ide_cdrom || drive->media == ide_optical;
-}
-
-/*
- * Check whether we can support a device,
- * based on the ATAPI IDENTIFY command results.
- */
-int ide_check_atapi_device(ide_drive_t *drive, const char *s)
-{
-	u16 *id = drive->id;
-	u8 gcw[2], protocol, device_type, removable, drq_type, packet_size;
-
-	*((u16 *)&gcw) = id[ATA_ID_CONFIG];
-
-	protocol    = (gcw[1] & 0xC0) >> 6;
-	device_type =  gcw[1] & 0x1F;
-	removable   = (gcw[0] & 0x80) >> 7;
-	drq_type    = (gcw[0] & 0x60) >> 5;
-	packet_size =  gcw[0] & 0x03;
-
-#ifdef CONFIG_PPC
-	/* kludge for Apple PowerBook internal zip */
-	if (drive->media == ide_floppy && device_type == 5 &&
-	    !strstr((char *)&id[ATA_ID_PROD], "CD-ROM") &&
-	    strstr((char *)&id[ATA_ID_PROD], "ZIP"))
-		device_type = 0;
-#endif
-
-	if (protocol != 2)
-		printk(KERN_ERR "%s: %s: protocol (0x%02x) is not ATAPI\n",
-			s, drive->name, protocol);
-	else if ((drive->media == ide_floppy && device_type != 0) ||
-		 (drive->media == ide_tape && device_type != 1))
-		printk(KERN_ERR "%s: %s: invalid device type (0x%02x)\n",
-			s, drive->name, device_type);
-	else if (removable == 0)
-		printk(KERN_ERR "%s: %s: the removable flag is not set\n",
-			s, drive->name);
-	else if (drive->media == ide_floppy && drq_type == 3)
-		printk(KERN_ERR "%s: %s: sorry, DRQ type (0x%02x) not "
-			"supported\n", s, drive->name, drq_type);
-	else if (packet_size != 0)
-		printk(KERN_ERR "%s: %s: packet size (0x%02x) is not 12 "
-			"bytes\n", s, drive->name, packet_size);
-	else
-		return 1;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_check_atapi_device);
-
-void ide_init_pc(struct ide_atapi_pc *pc)
-{
-	memset(pc, 0, sizeof(*pc));
-}
-EXPORT_SYMBOL_GPL(ide_init_pc);
-
-/*
- * Add a special packet command request to the tail of the request queue,
- * and wait for it to be serviced.
- */
-int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
-		      struct ide_atapi_pc *pc, void *buf, unsigned int bufflen)
-{
-	struct request *rq;
-	int error;
-
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
-	ide_req(rq)->type = ATA_PRIV_MISC;
-	ide_req(rq)->special = pc;
-
-	if (buf && bufflen) {
-		error = blk_rq_map_kern(drive->queue, rq, buf, bufflen,
-					GFP_NOIO);
-		if (error)
-			goto put_req;
-	}
-
-	memcpy(scsi_req(rq)->cmd, pc->c, 12);
-	if (drive->media == ide_tape)
-		scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
-	blk_execute_rq(disk, rq, 0);
-	error = scsi_req(rq)->result ? -EIO : 0;
-put_req:
-	blk_put_request(rq);
-	return error;
-}
-EXPORT_SYMBOL_GPL(ide_queue_pc_tail);
-
-int ide_do_test_unit_ready(ide_drive_t *drive, struct gendisk *disk)
-{
-	struct ide_atapi_pc pc;
-
-	ide_init_pc(&pc);
-	pc.c[0] = TEST_UNIT_READY;
-
-	return ide_queue_pc_tail(drive, disk, &pc, NULL, 0);
-}
-EXPORT_SYMBOL_GPL(ide_do_test_unit_ready);
-
-int ide_do_start_stop(ide_drive_t *drive, struct gendisk *disk, int start)
-{
-	struct ide_atapi_pc pc;
-
-	ide_init_pc(&pc);
-	pc.c[0] = START_STOP;
-	pc.c[4] = start;
-
-	if (drive->media == ide_tape)
-		pc.flags |= PC_FLAG_WAIT_FOR_DSC;
-
-	return ide_queue_pc_tail(drive, disk, &pc, NULL, 0);
-}
-EXPORT_SYMBOL_GPL(ide_do_start_stop);
-
-int ide_set_media_lock(ide_drive_t *drive, struct gendisk *disk, int on)
-{
-	struct ide_atapi_pc pc;
-
-	if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0)
-		return 0;
-
-	ide_init_pc(&pc);
-	pc.c[0] = ALLOW_MEDIUM_REMOVAL;
-	pc.c[4] = on;
-
-	return ide_queue_pc_tail(drive, disk, &pc, NULL, 0);
-}
-EXPORT_SYMBOL_GPL(ide_set_media_lock);
-
-void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-	ide_init_pc(pc);
-	pc->c[0] = REQUEST_SENSE;
-	if (drive->media == ide_floppy) {
-		pc->c[4] = 255;
-		pc->req_xfer = 18;
-	} else {
-		pc->c[4] = 20;
-		pc->req_xfer = 20;
-	}
-}
-EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
-
-void ide_prep_sense(ide_drive_t *drive, struct request *rq)
-{
-	struct request_sense *sense = &drive->sense_data;
-	struct request *sense_rq;
-	struct scsi_request *req;
-	unsigned int cmd_len, sense_len;
-	int err;
-
-	switch (drive->media) {
-	case ide_floppy:
-		cmd_len = 255;
-		sense_len = 18;
-		break;
-	case ide_tape:
-		cmd_len = 20;
-		sense_len = 20;
-		break;
-	default:
-		cmd_len = 18;
-		sense_len = 18;
-	}
-
-	BUG_ON(sense_len > sizeof(*sense));
-
-	if (ata_sense_request(rq) || drive->sense_rq_armed)
-		return;
-
-	sense_rq = drive->sense_rq;
-	if (!sense_rq) {
-		sense_rq = blk_mq_alloc_request(drive->queue, REQ_OP_DRV_IN,
-					BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
-		drive->sense_rq = sense_rq;
-	}
-	req = scsi_req(sense_rq);
-
-	memset(sense, 0, sizeof(*sense));
-
-	scsi_req_init(req);
-
-	err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
-			      GFP_NOIO);
-	if (unlikely(err)) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING PFX "%s: failed to map sense "
-					    "buffer\n", drive->name);
-		blk_mq_free_request(sense_rq);
-		drive->sense_rq = NULL;
-		return;
-	}
-
-	sense_rq->rq_disk = rq->rq_disk;
-	sense_rq->cmd_flags = REQ_OP_DRV_IN;
-	ide_req(sense_rq)->type = ATA_PRIV_SENSE;
-
-	req->cmd[0] = GPCMD_REQUEST_SENSE;
-	req->cmd[4] = cmd_len;
-	if (drive->media == ide_tape)
-		req->cmd[13] = REQ_IDETAPE_PC1;
-
-	drive->sense_rq_armed = true;
-}
-EXPORT_SYMBOL_GPL(ide_prep_sense);
-
-int ide_queue_sense_rq(ide_drive_t *drive, void *special)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct request *sense_rq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&hwif->lock, flags);
-
-	/* deferred failure from ide_prep_sense() */
-	if (!drive->sense_rq_armed) {
-		printk(KERN_WARNING PFX "%s: error queuing a sense request\n",
-		       drive->name);
-		spin_unlock_irqrestore(&hwif->lock, flags);
-		return -ENOMEM;
-	}
-
-	sense_rq = drive->sense_rq;
-	ide_req(sense_rq)->special = special;
-	drive->sense_rq_armed = false;
-
-	drive->hwif->rq = NULL;
-
-	ide_insert_request_head(drive, sense_rq);
-	spin_unlock_irqrestore(&hwif->lock, flags);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
-
-/*
- * Called when an error was detected during the last packet command.
- * We queue a request sense packet command at the head of the request
- * queue.
- */
-void ide_retry_pc(ide_drive_t *drive)
-{
-	struct request *failed_rq = drive->hwif->rq;
-	struct request *sense_rq = drive->sense_rq;
-	struct ide_atapi_pc *pc = &drive->request_sense_pc;
-
-	(void)ide_read_error(drive);
-
-	/* init pc from sense_rq */
-	ide_init_pc(pc);
-	memcpy(pc->c, scsi_req(sense_rq)->cmd, 12);
-
-	if (drive->media == ide_tape)
-		drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC;
-
-	/*
-	 * Push back the failed request and put request sense on top
-	 * of it.  The failed command will be retried after sense data
-	 * is acquired.
-	 */
-	drive->hwif->rq = NULL;
-	ide_requeue_and_plug(drive, failed_rq);
-	if (ide_queue_sense_rq(drive, pc))
-		ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq));
-}
-EXPORT_SYMBOL_GPL(ide_retry_pc);
-
-int ide_cd_expiry(ide_drive_t *drive)
-{
-	struct request *rq = drive->hwif->rq;
-	unsigned long wait = 0;
-
-	debug_log("%s: scsi_req(rq)->cmd[0]: 0x%x\n", __func__, scsi_req(rq)->cmd[0]);
-
-	/*
-	 * Some commands are *slow* and normally take a long time to complete.
-	 * Usually we can use the ATAPI "disconnect" to bypass this, but not all
-	 * commands/drives support that. Let ide_timer_expiry keep polling us
-	 * for these.
-	 */
-	switch (scsi_req(rq)->cmd[0]) {
-	case GPCMD_BLANK:
-	case GPCMD_FORMAT_UNIT:
-	case GPCMD_RESERVE_RZONE_TRACK:
-	case GPCMD_CLOSE_TRACK:
-	case GPCMD_FLUSH_CACHE:
-		wait = ATAPI_WAIT_PC;
-		break;
-	default:
-		if (!(rq->rq_flags & RQF_QUIET))
-			printk(KERN_INFO PFX "cmd 0x%x timed out\n",
-					 scsi_req(rq)->cmd[0]);
-		wait = 0;
-		break;
-	}
-	return wait;
-}
-EXPORT_SYMBOL_GPL(ide_cd_expiry);
-
-int ide_cd_get_xferlen(struct request *rq)
-{
-	switch (req_op(rq)) {
-	default:
-		return 32768;
-	case REQ_OP_SCSI_IN:
-	case REQ_OP_SCSI_OUT:
-		return blk_rq_bytes(rq);
-	case REQ_OP_DRV_IN:
-	case REQ_OP_DRV_OUT:
-		switch (ide_req(rq)->type) {
-		case ATA_PRIV_PC:
-		case ATA_PRIV_SENSE:
-			return blk_rq_bytes(rq);
-		default:
-			return 0;
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(ide_cd_get_xferlen);
-
-void ide_read_bcount_and_ireason(ide_drive_t *drive, u16 *bcount, u8 *ireason)
-{
-	struct ide_taskfile tf;
-
-	drive->hwif->tp_ops->tf_read(drive, &tf, IDE_VALID_NSECT |
-				     IDE_VALID_LBAM | IDE_VALID_LBAH);
-
-	*bcount = (tf.lbah << 8) | tf.lbam;
-	*ireason = tf.nsect & 3;
-}
-EXPORT_SYMBOL_GPL(ide_read_bcount_and_ireason);
-
-/*
- * Check the contents of the interrupt reason register and attempt to recover if
- * there are problems.
- *
- * Returns:
- * - 0 if everything's ok
- * - 1 if the request has to be terminated.
- */
-int ide_check_ireason(ide_drive_t *drive, struct request *rq, int len,
-		      int ireason, int rw)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	debug_log("ireason: 0x%x, rw: 0x%x\n", ireason, rw);
-
-	if (ireason == (!rw << 1))
-		return 0;
-	else if (ireason == (rw << 1)) {
-		printk(KERN_ERR PFX "%s: %s: wrong transfer direction!\n",
-				drive->name, __func__);
-
-		if (dev_is_idecd(drive))
-			ide_pad_transfer(drive, rw, len);
-	} else if (!rw && ireason == ATAPI_COD) {
-		if (dev_is_idecd(drive)) {
-			/*
-			 * Some drives (ASUS) seem to tell us that status info
-			 * is available.  Just get it and ignore.
-			 */
-			(void)hwif->tp_ops->read_status(hwif);
-			return 0;
-		}
-	} else {
-		if (ireason & ATAPI_COD)
-			printk(KERN_ERR PFX "%s: CoD != 0 in %s\n", drive->name,
-					__func__);
-
-		/* drive wants a command packet, or invalid ireason... */
-		printk(KERN_ERR PFX "%s: %s: bad interrupt reason 0x%02x\n",
-				drive->name, __func__, ireason);
-	}
-
-	if (dev_is_idecd(drive) && ata_pc_request(rq))
-		rq->rq_flags |= RQF_FAILED;
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(ide_check_ireason);
-
-/*
- * This is the usual interrupt handler which will be called during a packet
- * command.  We will transfer some of the data (as requested by the drive)
- * and will re-point interrupt handler to us.
- */
-static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
-{
-	struct ide_atapi_pc *pc = drive->pc;
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_cmd *cmd = &hwif->cmd;
-	struct request *rq = hwif->rq;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	unsigned int timeout, done;
-	u16 bcount;
-	u8 stat, ireason, dsc = 0;
-	u8 write = !!(pc->flags & PC_FLAG_WRITING);
-
-	debug_log("Enter %s - interrupt handler\n", __func__);
-
-	timeout = (drive->media == ide_floppy) ? WAIT_FLOPPY_CMD
-					       : WAIT_TAPE_CMD;
-
-	/* Clear the interrupt */
-	stat = tp_ops->read_status(hwif);
-
-	if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) {
-		int rc;
-
-		drive->waiting_for_dma = 0;
-		rc = hwif->dma_ops->dma_end(drive);
-		ide_dma_unmap_sg(drive, cmd);
-
-		if (rc || (drive->media == ide_tape && (stat & ATA_ERR))) {
-			if (drive->media == ide_floppy)
-				printk(KERN_ERR PFX "%s: DMA %s error\n",
-					drive->name, rq_data_dir(pc->rq)
-						     ? "write" : "read");
-			pc->flags |= PC_FLAG_DMA_ERROR;
-		} else
-			scsi_req(rq)->resid_len = 0;
-		debug_log("%s: DMA finished\n", drive->name);
-	}
-
-	/* No more interrupts */
-	if ((stat & ATA_DRQ) == 0) {
-		int uptodate;
-		blk_status_t error;
-
-		debug_log("Packet command completed, %d bytes transferred\n",
-			  blk_rq_bytes(rq));
-
-		pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS;
-
-		local_irq_enable_in_hardirq();
-
-		if (drive->media == ide_tape &&
-		    (stat & ATA_ERR) && scsi_req(rq)->cmd[0] == REQUEST_SENSE)
-			stat &= ~ATA_ERR;
-
-		if ((stat & ATA_ERR) || (pc->flags & PC_FLAG_DMA_ERROR)) {
-			/* Error detected */
-			debug_log("%s: I/O error\n", drive->name);
-
-			if (drive->media != ide_tape)
-				scsi_req(pc->rq)->result++;
-
-			if (scsi_req(rq)->cmd[0] == REQUEST_SENSE) {
-				printk(KERN_ERR PFX "%s: I/O error in request "
-						"sense command\n", drive->name);
-				return ide_do_reset(drive);
-			}
-
-			debug_log("[cmd %x]: check condition\n", scsi_req(rq)->cmd[0]);
-
-			/* Retry operation */
-			ide_retry_pc(drive);
-
-			/* queued, but not started */
-			return ide_stopped;
-		}
-		pc->error = 0;
-
-		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
-			dsc = 1;
-
-		/*
-		 * ->pc_callback() might change rq->data_len for
-		 * residual count, cache total length.
-		 */
-		done = blk_rq_bytes(rq);
-
-		/* Command finished - Call the callback function */
-		uptodate = drive->pc_callback(drive, dsc);
-
-		if (uptodate == 0)
-			drive->failed_pc = NULL;
-
-		if (ata_misc_request(rq)) {
-			scsi_req(rq)->result = 0;
-			error = BLK_STS_OK;
-		} else {
-
-			if (blk_rq_is_passthrough(rq) && uptodate <= 0) {
-				if (scsi_req(rq)->result == 0)
-					scsi_req(rq)->result = -EIO;
-			}
-
-			error = uptodate ? BLK_STS_OK : BLK_STS_IOERR;
-		}
-
-		ide_complete_rq(drive, error, blk_rq_bytes(rq));
-		return ide_stopped;
-	}
-
-	if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) {
-		pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS;
-		printk(KERN_ERR PFX "%s: The device wants to issue more "
-				"interrupts in DMA mode\n", drive->name);
-		ide_dma_off(drive);
-		return ide_do_reset(drive);
-	}
-
-	/* Get the number of bytes to transfer on this interrupt. */
-	ide_read_bcount_and_ireason(drive, &bcount, &ireason);
-
-	if (ide_check_ireason(drive, rq, bcount, ireason, write))
-		return ide_do_reset(drive);
-
-	done = min_t(unsigned int, bcount, cmd->nleft);
-	ide_pio_bytes(drive, cmd, write, done);
-
-	/* Update transferred byte count */
-	scsi_req(rq)->resid_len -= done;
-
-	bcount -= done;
-
-	if (bcount)
-		ide_pad_transfer(drive, write, bcount);
-
-	debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n",
-		  scsi_req(rq)->cmd[0], done, bcount, scsi_req(rq)->resid_len);
-
-	/* And set the interrupt handler again */
-	ide_set_handler(drive, ide_pc_intr, timeout);
-	return ide_started;
-}
-
-static void ide_init_packet_cmd(struct ide_cmd *cmd, u8 valid_tf,
-				u16 bcount, u8 dma)
-{
-	cmd->protocol = dma ? ATAPI_PROT_DMA : ATAPI_PROT_PIO;
-	cmd->valid.out.tf = IDE_VALID_LBAH | IDE_VALID_LBAM |
-			    IDE_VALID_FEATURE | valid_tf;
-	cmd->tf.command = ATA_CMD_PACKET;
-	cmd->tf.feature = dma;		/* Use PIO/DMA */
-	cmd->tf.lbam    = bcount & 0xff;
-	cmd->tf.lbah    = (bcount >> 8) & 0xff;
-}
-
-static u8 ide_read_ireason(ide_drive_t *drive)
-{
-	struct ide_taskfile tf;
-
-	drive->hwif->tp_ops->tf_read(drive, &tf, IDE_VALID_NSECT);
-
-	return tf.nsect & 3;
-}
-
-static u8 ide_wait_ireason(ide_drive_t *drive, u8 ireason)
-{
-	int retries = 100;
-
-	while (retries-- && ((ireason & ATAPI_COD) == 0 ||
-		(ireason & ATAPI_IO))) {
-		printk(KERN_ERR PFX "%s: (IO,CoD != (0,1) while issuing "
-				"a packet command, retrying\n", drive->name);
-		udelay(100);
-		ireason = ide_read_ireason(drive);
-		if (retries == 0) {
-			printk(KERN_ERR PFX "%s: (IO,CoD != (0,1) while issuing"
-					" a packet command, ignoring\n",
-					drive->name);
-			ireason |= ATAPI_COD;
-			ireason &= ~ATAPI_IO;
-		}
-	}
-
-	return ireason;
-}
-
-static int ide_delayed_transfer_pc(ide_drive_t *drive)
-{
-	/* Send the actual packet */
-	drive->hwif->tp_ops->output_data(drive, NULL, drive->pc->c, 12);
-
-	/* Timeout for the packet command */
-	return WAIT_FLOPPY_CMD;
-}
-
-static ide_startstop_t ide_transfer_pc(ide_drive_t *drive)
-{
-	struct ide_atapi_pc *pc;
-	ide_hwif_t *hwif = drive->hwif;
-	struct request *rq = hwif->rq;
-	ide_expiry_t *expiry;
-	unsigned int timeout;
-	int cmd_len;
-	ide_startstop_t startstop;
-	u8 ireason;
-
-	if (ide_wait_stat(&startstop, drive, ATA_DRQ, ATA_BUSY, WAIT_READY)) {
-		printk(KERN_ERR PFX "%s: Strange, packet command initiated yet "
-				"DRQ isn't asserted\n", drive->name);
-		return startstop;
-	}
-
-	if (drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT) {
-		if (drive->dma)
-			drive->waiting_for_dma = 1;
-	}
-
-	if (dev_is_idecd(drive)) {
-		/* ATAPI commands get padded out to 12 bytes minimum */
-		cmd_len = COMMAND_SIZE(scsi_req(rq)->cmd[0]);
-		if (cmd_len < ATAPI_MIN_CDB_BYTES)
-			cmd_len = ATAPI_MIN_CDB_BYTES;
-
-		timeout = rq->timeout;
-		expiry  = ide_cd_expiry;
-	} else {
-		pc = drive->pc;
-
-		cmd_len = ATAPI_MIN_CDB_BYTES;
-
-		/*
-		 * If necessary schedule the packet transfer to occur 'timeout'
-		 * milliseconds later in ide_delayed_transfer_pc() after the
-		 * device says it's ready for a packet.
-		 */
-		if (drive->atapi_flags & IDE_AFLAG_ZIP_DRIVE) {
-			timeout = drive->pc_delay;
-			expiry = &ide_delayed_transfer_pc;
-		} else {
-			timeout = (drive->media == ide_floppy) ? WAIT_FLOPPY_CMD
-							       : WAIT_TAPE_CMD;
-			expiry = NULL;
-		}
-
-		ireason = ide_read_ireason(drive);
-		if (drive->media == ide_tape)
-			ireason = ide_wait_ireason(drive, ireason);
-
-		if ((ireason & ATAPI_COD) == 0 || (ireason & ATAPI_IO)) {
-			printk(KERN_ERR PFX "%s: (IO,CoD) != (0,1) while "
-				"issuing a packet command\n", drive->name);
-
-			return ide_do_reset(drive);
-		}
-	}
-
-	hwif->expiry = expiry;
-
-	/* Set the interrupt routine */
-	ide_set_handler(drive,
-			(dev_is_idecd(drive) ? drive->irq_handler
-					     : ide_pc_intr),
-			timeout);
-
-	/* Send the actual packet */
-	if ((drive->atapi_flags & IDE_AFLAG_ZIP_DRIVE) == 0)
-		hwif->tp_ops->output_data(drive, NULL, scsi_req(rq)->cmd, cmd_len);
-
-	/* Begin DMA, if necessary */
-	if (dev_is_idecd(drive)) {
-		if (drive->dma)
-			hwif->dma_ops->dma_start(drive);
-	} else {
-		if (pc->flags & PC_FLAG_DMA_OK) {
-			pc->flags |= PC_FLAG_DMA_IN_PROGRESS;
-			hwif->dma_ops->dma_start(drive);
-		}
-	}
-
-	return ide_started;
-}
-
-ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	struct ide_atapi_pc *pc;
-	ide_hwif_t *hwif = drive->hwif;
-	ide_expiry_t *expiry = NULL;
-	struct request *rq = hwif->rq;
-	unsigned int timeout, bytes;
-	u16 bcount;
-	u8 valid_tf;
-	u8 drq_int = !!(drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT);
-
-	if (dev_is_idecd(drive)) {
-		valid_tf = IDE_VALID_NSECT | IDE_VALID_LBAL;
-		bcount = ide_cd_get_xferlen(rq);
-		expiry = ide_cd_expiry;
-		timeout = ATAPI_WAIT_PC;
-
-		if (drive->dma)
-			drive->dma = !ide_dma_prepare(drive, cmd);
-	} else {
-		pc = drive->pc;
-
-		valid_tf = IDE_VALID_DEVICE;
-		bytes = blk_rq_bytes(rq);
-		bcount = ((drive->media == ide_tape) ? bytes
-						     : min_t(unsigned int,
-							     bytes, 63 * 1024));
-
-		/* We haven't transferred any data yet */
-		scsi_req(rq)->resid_len = bcount;
-
-		if (pc->flags & PC_FLAG_DMA_ERROR) {
-			pc->flags &= ~PC_FLAG_DMA_ERROR;
-			ide_dma_off(drive);
-		}
-
-		if (pc->flags & PC_FLAG_DMA_OK)
-			drive->dma = !ide_dma_prepare(drive, cmd);
-
-		if (!drive->dma)
-			pc->flags &= ~PC_FLAG_DMA_OK;
-
-		timeout = (drive->media == ide_floppy) ? WAIT_FLOPPY_CMD
-						       : WAIT_TAPE_CMD;
-	}
-
-	ide_init_packet_cmd(cmd, valid_tf, bcount, drive->dma);
-
-	(void)do_rw_taskfile(drive, cmd);
-
-	if (drq_int) {
-		if (drive->dma)
-			drive->waiting_for_dma = 0;
-		hwif->expiry = expiry;
-	}
-
-	ide_execute_command(drive, cmd, ide_transfer_pc, timeout);
-
-	return drq_int ? ide_started : ide_transfer_pc(drive);
-}
-EXPORT_SYMBOL_GPL(ide_issue_pc);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
deleted file mode 100644
index cffbcc27a34c..000000000000
--- a/drivers/ide/ide-cd.c
+++ /dev/null
@@ -1,1858 +0,0 @@
-/*
- * ATAPI CD-ROM driver.
- *
- * Copyright (C) 1994-1996   Scott Snyder <snyder@fnald0.fnal.gov>
- * Copyright (C) 1996-1998   Erik Andersen <andersee@debian.org>
- * Copyright (C) 1998-2000   Jens Axboe <axboe@suse.de>
- * Copyright (C) 2005, 2007-2009  Bartlomiej Zolnierkiewicz
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * See Documentation/cdrom/ide-cd.rst for usage information.
- *
- * Suggestions are welcome. Patches that work are more welcome though. ;-)
- *
- * Documentation:
- *	Mt. Fuji (SFF8090 version 4) and ATAPI (SFF-8020i rev 2.6) standards.
- *
- * For historical changelog please see:
- *	Documentation/ide/ChangeLog.ide-cd.1994-2004
- */
-
-#define DRV_NAME "ide-cd"
-#define PFX DRV_NAME ": "
-
-#define IDECD_VERSION "5.00"
-
-#include <linux/compat.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched/task_stack.h>
-#include <linux/delay.h>
-#include <linux/timer.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/errno.h>
-#include <linux/cdrom.h>
-#include <linux/ide.h>
-#include <linux/completion.h>
-#include <linux/mutex.h>
-#include <linux/bcd.h>
-
-/* For SCSI -> ATAPI command conversion */
-#include <scsi/scsi.h>
-
-#include <linux/io.h>
-#include <asm/byteorder.h>
-#include <linux/uaccess.h>
-#include <asm/unaligned.h>
-
-#include "ide-cd.h"
-
-static DEFINE_MUTEX(ide_cd_mutex);
-static DEFINE_MUTEX(idecd_ref_mutex);
-
-static void ide_cd_release(struct device *);
-
-static struct cdrom_info *ide_cd_get(struct gendisk *disk)
-{
-	struct cdrom_info *cd = NULL;
-
-	mutex_lock(&idecd_ref_mutex);
-	cd = ide_drv_g(disk, cdrom_info);
-	if (cd) {
-		if (ide_device_get(cd->drive))
-			cd = NULL;
-		else
-			get_device(&cd->dev);
-
-	}
-	mutex_unlock(&idecd_ref_mutex);
-	return cd;
-}
-
-static void ide_cd_put(struct cdrom_info *cd)
-{
-	ide_drive_t *drive = cd->drive;
-
-	mutex_lock(&idecd_ref_mutex);
-	put_device(&cd->dev);
-	ide_device_put(drive);
-	mutex_unlock(&idecd_ref_mutex);
-}
-
-/*
- * Generic packet command support and error handling routines.
- */
-
-/* Mark that we've seen a media change and invalidate our internal buffers. */
-static void cdrom_saw_media_change(ide_drive_t *drive)
-{
-	drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
-	drive->atapi_flags &= ~IDE_AFLAG_TOC_VALID;
-}
-
-static int cdrom_log_sense(ide_drive_t *drive, struct request *rq)
-{
-	struct request_sense *sense = &drive->sense_data;
-	int log = 0;
-
-	if (!sense || !rq || (rq->rq_flags & RQF_QUIET))
-		return 0;
-
-	ide_debug_log(IDE_DBG_SENSE, "sense_key: 0x%x", sense->sense_key);
-
-	switch (sense->sense_key) {
-	case NO_SENSE:
-	case RECOVERED_ERROR:
-		break;
-	case NOT_READY:
-		/*
-		 * don't care about tray state messages for e.g. capacity
-		 * commands or in-progress or becoming ready
-		 */
-		if (sense->asc == 0x3a || sense->asc == 0x04)
-			break;
-		log = 1;
-		break;
-	case ILLEGAL_REQUEST:
-		/*
-		 * don't log START_STOP unit with LoEj set, since we cannot
-		 * reliably check if drive can auto-close
-		 */
-		if (scsi_req(rq)->cmd[0] == GPCMD_START_STOP_UNIT && sense->asc == 0x24)
-			break;
-		log = 1;
-		break;
-	case UNIT_ATTENTION:
-		/*
-		 * Make good and sure we've seen this potential media change.
-		 * Some drives (i.e. Creative) fail to present the correct sense
-		 * key in the error register.
-		 */
-		cdrom_saw_media_change(drive);
-		break;
-	default:
-		log = 1;
-		break;
-	}
-	return log;
-}
-
-static void cdrom_analyze_sense_data(ide_drive_t *drive,
-				     struct request *failed_command)
-{
-	struct request_sense *sense = &drive->sense_data;
-	struct cdrom_info *info = drive->driver_data;
-	unsigned long sector;
-	unsigned long bio_sectors;
-
-	ide_debug_log(IDE_DBG_SENSE, "error_code: 0x%x, sense_key: 0x%x",
-				     sense->error_code, sense->sense_key);
-
-	if (failed_command)
-		ide_debug_log(IDE_DBG_SENSE, "failed cmd: 0x%x",
-					     failed_command->cmd[0]);
-
-	if (!cdrom_log_sense(drive, failed_command))
-		return;
-
-	/*
-	 * If a read toc is executed for a CD-R or CD-RW medium where the first
-	 * toc has not been recorded yet, it will fail with 05/24/00 (which is a
-	 * confusing error)
-	 */
-	if (failed_command && scsi_req(failed_command)->cmd[0] == GPCMD_READ_TOC_PMA_ATIP)
-		if (sense->sense_key == 0x05 && sense->asc == 0x24)
-			return;
-
-	/* current error */
-	if (sense->error_code == 0x70) {
-		switch (sense->sense_key) {
-		case MEDIUM_ERROR:
-		case VOLUME_OVERFLOW:
-		case ILLEGAL_REQUEST:
-			if (!sense->valid)
-				break;
-			if (failed_command == NULL ||
-			    blk_rq_is_passthrough(failed_command))
-				break;
-			sector = (sense->information[0] << 24) |
-				 (sense->information[1] << 16) |
-				 (sense->information[2] <<  8) |
-				 (sense->information[3]);
-
-			if (queue_logical_block_size(drive->queue) == 2048)
-				/* device sector size is 2K */
-				sector <<= 2;
-
-			bio_sectors = max(bio_sectors(failed_command->bio), 4U);
-			sector &= ~(bio_sectors - 1);
-
-			/*
-			 * The SCSI specification allows for the value
-			 * returned by READ CAPACITY to be up to 75 2K
-			 * sectors past the last readable block.
-			 * Therefore, if we hit a medium error within the
-			 * last 75 2K sectors, we decrease the saved size
-			 * value.
-			 */
-			if (sector < get_capacity(info->disk) &&
-			    drive->probed_capacity - sector < 4 * 75)
-				set_capacity(info->disk, sector);
-		}
-	}
-
-	ide_cd_log_error(drive->name, failed_command, sense);
-}
-
-static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
-{
-	/*
-	 * For ATA_PRIV_SENSE, "ide_req(rq)->special" points to the original
-	 * failed request.  Also, the sense data should be read
-	 * directly from rq which might be different from the original
-	 * sense buffer if it got copied during mapping.
-	 */
-	struct request *failed = ide_req(rq)->special;
-	void *sense = bio_data(rq->bio);
-
-	if (failed) {
-		/*
-		 * Sense is always read into drive->sense_data, copy back to the
-		 * original request.
-		 */
-		memcpy(scsi_req(failed)->sense, sense, 18);
-		scsi_req(failed)->sense_len = scsi_req(rq)->sense_len;
-		cdrom_analyze_sense_data(drive, failed);
-
-		if (ide_end_rq(drive, failed, BLK_STS_IOERR, blk_rq_bytes(failed)))
-			BUG();
-	} else
-		cdrom_analyze_sense_data(drive, NULL);
-}
-
-
-/*
- * Allow the drive 5 seconds to recover; some devices will return NOT_READY
- * while flushing data from cache.
- *
- * returns: 0 failed (write timeout expired)
- *	    1 success
- */
-static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
-{
-
-	struct cdrom_info *info = drive->driver_data;
-
-	if (!scsi_req(rq)->result)
-		info->write_timeout = jiffies +	ATAPI_WAIT_WRITE_BUSY;
-
-	scsi_req(rq)->result = 1;
-
-	if (time_after(jiffies, info->write_timeout))
-		return 0;
-	else {
-		/*
-		 * take a breather
-		 */
-		blk_mq_requeue_request(rq, false);
-		blk_mq_delay_kick_requeue_list(drive->queue, 1);
-		return 1;
-	}
-}
-
-static void ide_cd_free_sense(ide_drive_t *drive)
-{
-	if (!drive->sense_rq)
-		return;
-
-	blk_mq_free_request(drive->sense_rq);
-	drive->sense_rq = NULL;
-	drive->sense_rq_armed = false;
-}
-
-/**
- * Returns:
- * 0: if the request should be continued.
- * 1: if the request will be going through error recovery.
- * 2: if the request should be ended.
- */
-static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct request *rq = hwif->rq;
-	int err, sense_key, do_end_request = 0;
-
-	/* get the IDE error register */
-	err = ide_read_error(drive);
-	sense_key = err >> 4;
-
-	ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, rq->cmd_type: 0x%x, err: 0x%x, "
-				  "stat 0x%x",
-				  rq->cmd[0], rq->cmd_type, err, stat);
-
-	if (ata_sense_request(rq)) {
-		/*
-		 * We got an error trying to get sense info from the drive
-		 * (probably while trying to recover from a former error).
-		 * Just give up.
-		 */
-		rq->rq_flags |= RQF_FAILED;
-		return 2;
-	}
-
-	/* if we have an error, pass CHECK_CONDITION as the SCSI status byte */
-	if (blk_rq_is_scsi(rq) && !scsi_req(rq)->result)
-		scsi_req(rq)->result = SAM_STAT_CHECK_CONDITION;
-
-	if (blk_noretry_request(rq))
-		do_end_request = 1;
-
-	switch (sense_key) {
-	case NOT_READY:
-		if (req_op(rq) == REQ_OP_WRITE) {
-			if (ide_cd_breathe(drive, rq))
-				return 1;
-		} else {
-			cdrom_saw_media_change(drive);
-
-			if (!blk_rq_is_passthrough(rq) &&
-			    !(rq->rq_flags & RQF_QUIET))
-				printk(KERN_ERR PFX "%s: tray open\n",
-					drive->name);
-		}
-		do_end_request = 1;
-		break;
-	case UNIT_ATTENTION:
-		cdrom_saw_media_change(drive);
-
-		if (blk_rq_is_passthrough(rq))
-			return 0;
-
-		/*
-		 * Arrange to retry the request but be sure to give up if we've
-		 * retried too many times.
-		 */
-		if (++scsi_req(rq)->result > ERROR_MAX)
-			do_end_request = 1;
-		break;
-	case ILLEGAL_REQUEST:
-		/*
-		 * Don't print error message for this condition -- SFF8090i
-		 * indicates that 5/24/00 is the correct response to a request
-		 * to close the tray if the drive doesn't have that capability.
-		 *
-		 * cdrom_log_sense() knows this!
-		 */
-		if (scsi_req(rq)->cmd[0] == GPCMD_START_STOP_UNIT)
-			break;
-		fallthrough;
-	case DATA_PROTECT:
-		/*
-		 * No point in retrying after an illegal request or data
-		 * protect error.
-		 */
-		if (!(rq->rq_flags & RQF_QUIET))
-			ide_dump_status(drive, "command error", stat);
-		do_end_request = 1;
-		break;
-	case MEDIUM_ERROR:
-		/*
-		 * No point in re-trying a zillion times on a bad sector.
-		 * If we got here the error is not correctable.
-		 */
-		if (!(rq->rq_flags & RQF_QUIET))
-			ide_dump_status(drive, "media error "
-					"(bad sector)", stat);
-		do_end_request = 1;
-		break;
-	case BLANK_CHECK:
-		/* disk appears blank? */
-		if (!(rq->rq_flags & RQF_QUIET))
-			ide_dump_status(drive, "media error (blank)",
-					stat);
-		do_end_request = 1;
-		break;
-	default:
-		if (blk_rq_is_passthrough(rq))
-			break;
-		if (err & ~ATA_ABORTED) {
-			/* go to the default handler for other errors */
-			ide_error(drive, "cdrom_decode_status", stat);
-			return 1;
-		} else if (++scsi_req(rq)->result > ERROR_MAX)
-			/* we've racked up too many retries, abort */
-			do_end_request = 1;
-	}
-
-	if (blk_rq_is_passthrough(rq)) {
-		rq->rq_flags |= RQF_FAILED;
-		do_end_request = 1;
-	}
-
-	/*
-	 * End a request through request sense analysis when we have sense data.
-	 * We need this in order to perform end of media processing.
-	 */
-	if (do_end_request)
-		goto end_request;
-
-	/* if we got a CHECK_CONDITION status, queue a request sense command */
-	if (stat & ATA_ERR)
-		return ide_queue_sense_rq(drive, NULL) ? 2 : 1;
-	return 1;
-
-end_request:
-	if (stat & ATA_ERR) {
-		hwif->rq = NULL;
-		return ide_queue_sense_rq(drive, rq) ? 2 : 1;
-	} else
-		return 2;
-}
-
-static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	struct request *rq = cmd->rq;
-
-	ide_debug_log(IDE_DBG_FUNC, "rq->cmd[0]: 0x%x", rq->cmd[0]);
-
-	/*
-	 * Some of the trailing request sense fields are optional,
-	 * and some drives don't send them.  Sigh.
-	 */
-	if (scsi_req(rq)->cmd[0] == GPCMD_REQUEST_SENSE &&
-	    cmd->nleft > 0 && cmd->nleft <= 5)
-		cmd->nleft = 0;
-}
-
-int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
-		    int write, void *buffer, unsigned *bufflen,
-		    struct scsi_sense_hdr *sshdr, int timeout,
-		    req_flags_t rq_flags)
-{
-	struct cdrom_info *info = drive->driver_data;
-	struct scsi_sense_hdr local_sshdr;
-	int retries = 10;
-	bool failed;
-
-	ide_debug_log(IDE_DBG_PC, "cmd[0]: 0x%x, write: 0x%x, timeout: %d, "
-				  "rq_flags: 0x%x",
-				  cmd[0], write, timeout, rq_flags);
-
-	if (!sshdr)
-		sshdr = &local_sshdr;
-
-	/* start of retry loop */
-	do {
-		struct request *rq;
-		int error;
-		bool delay = false;
-
-		rq = blk_get_request(drive->queue,
-			write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
-		memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB);
-		ide_req(rq)->type = ATA_PRIV_PC;
-		rq->rq_flags |= rq_flags;
-		rq->timeout = timeout;
-		if (buffer) {
-			error = blk_rq_map_kern(drive->queue, rq, buffer,
-						*bufflen, GFP_NOIO);
-			if (error) {
-				blk_put_request(rq);
-				return error;
-			}
-		}
-
-		blk_execute_rq(info->disk, rq, 0);
-		error = scsi_req(rq)->result ? -EIO : 0;
-
-		if (buffer)
-			*bufflen = scsi_req(rq)->resid_len;
-		scsi_normalize_sense(scsi_req(rq)->sense,
-				     scsi_req(rq)->sense_len, sshdr);
-
-		/*
-		 * FIXME: we should probably abort/retry or something in case of
-		 * failure.
-		 */
-		failed = (rq->rq_flags & RQF_FAILED) != 0;
-		if (failed) {
-			/*
-			 * The request failed.  Retry if it was due to a unit
-			 * attention status (usually means media was changed).
-			 */
-			if (sshdr->sense_key == UNIT_ATTENTION)
-				cdrom_saw_media_change(drive);
-			else if (sshdr->sense_key == NOT_READY &&
-				 sshdr->asc == 4 && sshdr->ascq != 4) {
-				/*
-				 * The drive is in the process of loading
-				 * a disk.  Retry, but wait a little to give
-				 * the drive time to complete the load.
-				 */
-				delay = true;
-			} else {
-				/* otherwise, don't retry */
-				retries = 0;
-			}
-			--retries;
-		}
-		blk_put_request(rq);
-		if (delay)
-			ssleep(2);
-	} while (failed && retries >= 0);
-
-	/* return an error if the command failed */
-	return failed ? -EIO : 0;
-}
-
-/*
- * returns true if rq has been completed
- */
-static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	unsigned int nr_bytes = cmd->nbytes - cmd->nleft;
-
-	if (cmd->tf_flags & IDE_TFLAG_WRITE)
-		nr_bytes -= cmd->last_xfer_len;
-
-	if (nr_bytes > 0) {
-		ide_complete_rq(drive, BLK_STS_OK, nr_bytes);
-		return true;
-	}
-
-	return false;
-}
-
-/* standard prep_rq that builds 10 byte cmds */
-static bool ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
-{
-	int hard_sect = queue_logical_block_size(q);
-	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
-	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
-	struct scsi_request *req = scsi_req(rq);
-
-	if (rq_data_dir(rq) == READ)
-		req->cmd[0] = GPCMD_READ_10;
-	else
-		req->cmd[0] = GPCMD_WRITE_10;
-
-	/*
-	 * fill in lba
-	 */
-	req->cmd[2] = (block >> 24) & 0xff;
-	req->cmd[3] = (block >> 16) & 0xff;
-	req->cmd[4] = (block >>  8) & 0xff;
-	req->cmd[5] = block & 0xff;
-
-	/*
-	 * and transfer length
-	 */
-	req->cmd[7] = (blocks >> 8) & 0xff;
-	req->cmd[8] = blocks & 0xff;
-	req->cmd_len = 10;
-	return true;
-}
-
-/*
- * Most of the SCSI commands are supported directly by ATAPI devices.
- * This transform handles the few exceptions.
- */
-static bool ide_cdrom_prep_pc(struct request *rq)
-{
-	u8 *c = scsi_req(rq)->cmd;
-
-	/* transform 6-byte read/write commands to the 10-byte version */
-	if (c[0] == READ_6 || c[0] == WRITE_6) {
-		c[8] = c[4];
-		c[5] = c[3];
-		c[4] = c[2];
-		c[3] = c[1] & 0x1f;
-		c[2] = 0;
-		c[1] &= 0xe0;
-		c[0] += (READ_10 - READ_6);
-		scsi_req(rq)->cmd_len = 10;
-		return true;
-	}
-
-	/*
-	 * it's silly to pretend we understand 6-byte sense commands, just
-	 * reject with ILLEGAL_REQUEST and the caller should take the
-	 * appropriate action
-	 */
-	if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
-		scsi_req(rq)->result = ILLEGAL_REQUEST;
-		return false;
-	}
-
-	return true;
-}
-
-static bool ide_cdrom_prep_rq(ide_drive_t *drive, struct request *rq)
-{
-	if (!blk_rq_is_passthrough(rq)) {
-		scsi_req_init(scsi_req(rq));
-
-		return ide_cdrom_prep_fs(drive->queue, rq);
-	} else if (blk_rq_is_scsi(rq))
-		return ide_cdrom_prep_pc(rq);
-
-	return true;
-}
-
-static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_cmd *cmd = &hwif->cmd;
-	struct request *rq = hwif->rq;
-	ide_expiry_t *expiry = NULL;
-	int dma_error = 0, dma, thislen, uptodate = 0;
-	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0;
-	int sense = ata_sense_request(rq);
-	unsigned int timeout;
-	u16 len;
-	u8 ireason, stat;
-
-	ide_debug_log(IDE_DBG_PC, "cmd: 0x%x, write: 0x%x", rq->cmd[0], write);
-
-	/* check for errors */
-	dma = drive->dma;
-	if (dma) {
-		drive->dma = 0;
-		drive->waiting_for_dma = 0;
-		dma_error = hwif->dma_ops->dma_end(drive);
-		ide_dma_unmap_sg(drive, cmd);
-		if (dma_error) {
-			printk(KERN_ERR PFX "%s: DMA %s error\n", drive->name,
-					write ? "write" : "read");
-			ide_dma_off(drive);
-		}
-	}
-
-	/* check status */
-	stat = hwif->tp_ops->read_status(hwif);
-
-	if (!OK_STAT(stat, 0, BAD_R_STAT)) {
-		rc = cdrom_decode_status(drive, stat);
-		if (rc) {
-			if (rc == 2)
-				goto out_end;
-			return ide_stopped;
-		}
-	}
-
-	/* using dma, transfer is complete now */
-	if (dma) {
-		if (dma_error)
-			return ide_error(drive, "dma error", stat);
-		uptodate = 1;
-		goto out_end;
-	}
-
-	ide_read_bcount_and_ireason(drive, &len, &ireason);
-
-	thislen = !blk_rq_is_passthrough(rq) ? len : cmd->nleft;
-	if (thislen > len)
-		thislen = len;
-
-	ide_debug_log(IDE_DBG_PC, "DRQ: stat: 0x%x, thislen: %d",
-				  stat, thislen);
-
-	/* If DRQ is clear, the command has completed. */
-	if ((stat & ATA_DRQ) == 0) {
-		switch (req_op(rq)) {
-		default:
-			/*
-			 * If we're not done reading/writing, complain.
-			 * Otherwise, complete the command normally.
-			 */
-			uptodate = 1;
-			if (cmd->nleft > 0) {
-				printk(KERN_ERR PFX "%s: %s: data underrun "
-					"(%u bytes)\n", drive->name, __func__,
-					cmd->nleft);
-				if (!write)
-					rq->rq_flags |= RQF_FAILED;
-				uptodate = 0;
-			}
-			goto out_end;
-		case REQ_OP_DRV_IN:
-		case REQ_OP_DRV_OUT:
-			ide_cd_request_sense_fixup(drive, cmd);
-
-			uptodate = cmd->nleft ? 0 : 1;
-
-			/*
-			 * suck out the remaining bytes from the drive in an
-			 * attempt to complete the data xfer. (see BZ#13399)
-			 */
-			if (!(stat & ATA_ERR) && !uptodate && thislen) {
-				ide_pio_bytes(drive, cmd, write, thislen);
-				uptodate = cmd->nleft ? 0 : 1;
-			}
-
-			if (!uptodate)
-				rq->rq_flags |= RQF_FAILED;
-			goto out_end;
-		case REQ_OP_SCSI_IN:
-		case REQ_OP_SCSI_OUT:
-			goto out_end;
-		}
-	}
-
-	rc = ide_check_ireason(drive, rq, len, ireason, write);
-	if (rc)
-		goto out_end;
-
-	cmd->last_xfer_len = 0;
-
-	ide_debug_log(IDE_DBG_PC, "data transfer, rq->cmd_type: 0x%x, "
-				  "ireason: 0x%x",
-				  rq->cmd_type, ireason);
-
-	/* transfer data */
-	while (thislen > 0) {
-		int blen = min_t(int, thislen, cmd->nleft);
-
-		if (cmd->nleft == 0)
-			break;
-
-		ide_pio_bytes(drive, cmd, write, blen);
-		cmd->last_xfer_len += blen;
-
-		thislen -= blen;
-		len -= blen;
-
-		if (sense && write == 0)
-			scsi_req(rq)->sense_len += blen;
-	}
-
-	/* pad, if necessary */
-	if (len > 0) {
-		if (blk_rq_is_passthrough(rq) || write == 0)
-			ide_pad_transfer(drive, write, len);
-		else {
-			printk(KERN_ERR PFX "%s: confused, missing data\n",
-				drive->name);
-			blk_dump_rq_flags(rq, "cdrom_newpc_intr");
-		}
-	}
-
-	switch (req_op(rq)) {
-	case REQ_OP_SCSI_IN:
-	case REQ_OP_SCSI_OUT:
-		timeout = rq->timeout;
-		break;
-	case REQ_OP_DRV_IN:
-	case REQ_OP_DRV_OUT:
-		expiry = ide_cd_expiry;
-		fallthrough;
-	default:
-		timeout = ATAPI_WAIT_PC;
-		break;
-	}
-
-	hwif->expiry = expiry;
-	ide_set_handler(drive, cdrom_newpc_intr, timeout);
-	return ide_started;
-
-out_end:
-	if (blk_rq_is_scsi(rq) && rc == 0) {
-		scsi_req(rq)->resid_len = 0;
-		blk_mq_end_request(rq, BLK_STS_OK);
-		hwif->rq = NULL;
-	} else {
-		if (sense && uptodate)
-			ide_cd_complete_failed_rq(drive, rq);
-
-		if (!blk_rq_is_passthrough(rq)) {
-			if (cmd->nleft == 0)
-				uptodate = 1;
-		} else {
-			if (uptodate <= 0 && scsi_req(rq)->result == 0)
-				scsi_req(rq)->result = -EIO;
-		}
-
-		if (uptodate == 0 && rq->bio)
-			if (ide_cd_error_cmd(drive, cmd))
-				return ide_stopped;
-
-		/* make sure it's fully ended */
-		if (blk_rq_is_passthrough(rq)) {
-			scsi_req(rq)->resid_len -= cmd->nbytes - cmd->nleft;
-			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
-				scsi_req(rq)->resid_len += cmd->last_xfer_len;
-		}
-
-		ide_complete_rq(drive, uptodate ? BLK_STS_OK : BLK_STS_IOERR, blk_rq_bytes(rq));
-
-		if (sense && rc == 2)
-			ide_error(drive, "request sense failure", stat);
-	}
-
-	ide_cd_free_sense(drive);
-	return ide_stopped;
-}
-
-static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
-{
-	struct cdrom_info *cd = drive->driver_data;
-	struct request_queue *q = drive->queue;
-	int write = rq_data_dir(rq) == WRITE;
-	unsigned short sectors_per_frame =
-		queue_logical_block_size(q) >> SECTOR_SHIFT;
-
-	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, "
-				  "secs_per_frame: %u",
-				  rq->cmd[0], rq->cmd_flags, sectors_per_frame);
-
-	if (write) {
-		/* disk has become write protected */
-		if (get_disk_ro(cd->disk))
-			return ide_stopped;
-	} else {
-		/*
-		 * We may be retrying this request after an error.  Fix up any
-		 * weirdness which might be present in the request packet.
-		 */
-		ide_cdrom_prep_rq(drive, rq);
-	}
-
-	/* fs requests *must* be hardware frame aligned */
-	if ((blk_rq_sectors(rq) & (sectors_per_frame - 1)) ||
-	    (blk_rq_pos(rq) & (sectors_per_frame - 1)))
-		return ide_stopped;
-
-	/* use DMA, if possible */
-	drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
-
-	if (write)
-		cd->devinfo.media_written = 1;
-
-	rq->timeout = ATAPI_WAIT_PC;
-
-	return ide_started;
-}
-
-static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
-{
-
-	ide_debug_log(IDE_DBG_PC, "rq->cmd[0]: 0x%x, rq->cmd_type: 0x%x",
-				  rq->cmd[0], rq->cmd_type);
-
-	if (blk_rq_is_scsi(rq))
-		rq->rq_flags |= RQF_QUIET;
-	else
-		rq->rq_flags &= ~RQF_FAILED;
-
-	drive->dma = 0;
-
-	/* sg request */
-	if (rq->bio) {
-		struct request_queue *q = drive->queue;
-		char *buf = bio_data(rq->bio);
-		unsigned int alignment;
-
-		drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
-
-		/*
-		 * check if dma is safe
-		 *
-		 * NOTE! The "len" and "addr" checks should possibly have
-		 * separate masks.
-		 */
-		alignment = queue_dma_alignment(q) | q->dma_pad_mask;
-		if ((unsigned long)buf & alignment
-		    || blk_rq_bytes(rq) & q->dma_pad_mask
-		    || object_is_on_stack(buf))
-			drive->dma = 0;
-	}
-}
-
-static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
-					sector_t block)
-{
-	struct ide_cmd cmd;
-	int uptodate = 0;
-	unsigned int nsectors;
-
-	ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, block: %llu",
-				  rq->cmd[0], (unsigned long long)block);
-
-	if (drive->debug_mask & IDE_DBG_RQ)
-		blk_dump_rq_flags(rq, "ide_cd_do_request");
-
-	switch (req_op(rq)) {
-	default:
-		if (cdrom_start_rw(drive, rq) == ide_stopped)
-			goto out_end;
-		break;
-	case REQ_OP_SCSI_IN:
-	case REQ_OP_SCSI_OUT:
-	handle_pc:
-		if (!rq->timeout)
-			rq->timeout = ATAPI_WAIT_PC;
-		cdrom_do_block_pc(drive, rq);
-		break;
-	case REQ_OP_DRV_IN:
-	case REQ_OP_DRV_OUT:
-		switch (ide_req(rq)->type) {
-		case ATA_PRIV_MISC:
-			/* right now this can only be a reset... */
-			uptodate = 1;
-			goto out_end;
-		case ATA_PRIV_SENSE:
-		case ATA_PRIV_PC:
-			goto handle_pc;
-		default:
-			BUG();
-		}
-	}
-
-	/* prepare sense request for this command */
-	ide_prep_sense(drive, rq);
-
-	memset(&cmd, 0, sizeof(cmd));
-
-	if (rq_data_dir(rq))
-		cmd.tf_flags |= IDE_TFLAG_WRITE;
-
-	cmd.rq = rq;
-
-	if (!blk_rq_is_passthrough(rq) || blk_rq_bytes(rq)) {
-		ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
-		ide_map_sg(drive, &cmd);
-	}
-
-	return ide_issue_pc(drive, &cmd);
-out_end:
-	nsectors = blk_rq_sectors(rq);
-
-	if (nsectors == 0)
-		nsectors = 1;
-
-	ide_complete_rq(drive, uptodate ? BLK_STS_OK : BLK_STS_IOERR, nsectors << 9);
-
-	return ide_stopped;
-}
-
-/*
- * Ioctl handling.
- *
- * Routines which queue packet commands take as a final argument a pointer to a
- * request_sense struct. If execution of the command results in an error with a
- * CHECK CONDITION status, this structure will be filled with the results of the
- * subsequent request sense command. The pointer can also be NULL, in which case
- * no sense information is returned.
- */
-static void msf_from_bcd(struct atapi_msf *msf)
-{
-	msf->minute = bcd2bin(msf->minute);
-	msf->second = bcd2bin(msf->second);
-	msf->frame  = bcd2bin(msf->frame);
-}
-
-int cdrom_check_status(ide_drive_t *drive, struct scsi_sense_hdr *sshdr)
-{
-	struct cdrom_info *info = drive->driver_data;
-	struct cdrom_device_info *cdi;
-	unsigned char cmd[BLK_MAX_CDB];
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	if (!info)
-		return -EIO;
-
-	cdi = &info->devinfo;
-
-	memset(cmd, 0, BLK_MAX_CDB);
-	cmd[0] = GPCMD_TEST_UNIT_READY;
-
-	/*
-	 * Sanyo 3 CD changer uses byte 7 of TEST_UNIT_READY to switch CDs
-	 * instead of supporting the LOAD_UNLOAD opcode.
-	 */
-	cmd[7] = cdi->sanyo_slot % 3;
-
-	return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sshdr, 0, RQF_QUIET);
-}
-
-static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
-			       unsigned long *sectors_per_frame)
-{
-	struct {
-		__be32 lba;
-		__be32 blocklen;
-	} capbuf;
-
-	int stat;
-	unsigned char cmd[BLK_MAX_CDB];
-	unsigned len = sizeof(capbuf);
-	u32 blocklen;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	memset(cmd, 0, BLK_MAX_CDB);
-	cmd[0] = GPCMD_READ_CDVD_CAPACITY;
-
-	stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, NULL, 0,
-			       RQF_QUIET);
-	if (stat)
-		return stat;
-
-	/*
-	 * Sanity check the given block size, in so far as making
-	 * sure the sectors_per_frame we give to the caller won't
-	 * end up being bogus.
-	 */
-	blocklen = be32_to_cpu(capbuf.blocklen);
-	blocklen = (blocklen >> SECTOR_SHIFT) << SECTOR_SHIFT;
-	switch (blocklen) {
-	case 512:
-	case 1024:
-	case 2048:
-	case 4096:
-		break;
-	default:
-		printk_once(KERN_ERR PFX "%s: weird block size %u; "
-				"setting default block size to 2048\n",
-				drive->name, blocklen);
-		blocklen = 2048;
-		break;
-	}
-
-	*capacity = 1 + be32_to_cpu(capbuf.lba);
-	*sectors_per_frame = blocklen >> SECTOR_SHIFT;
-
-	ide_debug_log(IDE_DBG_PROBE, "cap: %lu, sectors_per_frame: %lu",
-				     *capacity, *sectors_per_frame);
-
-	return 0;
-}
-
-static int ide_cdrom_read_tocentry(ide_drive_t *drive, int trackno,
-		int msf_flag, int format, char *buf, int buflen)
-{
-	unsigned char cmd[BLK_MAX_CDB];
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	memset(cmd, 0, BLK_MAX_CDB);
-
-	cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
-	cmd[6] = trackno;
-	cmd[7] = (buflen >> 8);
-	cmd[8] = (buflen & 0xff);
-	cmd[9] = (format << 6);
-
-	if (msf_flag)
-		cmd[1] = 2;
-
-	return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, NULL, 0, RQF_QUIET);
-}
-
-/* Try to read the entire TOC for the disk into our internal buffer. */
-int ide_cd_read_toc(ide_drive_t *drive)
-{
-	int stat, ntracks, i;
-	struct cdrom_info *info = drive->driver_data;
-	struct cdrom_device_info *cdi = &info->devinfo;
-	struct atapi_toc *toc = info->toc;
-	struct {
-		struct atapi_toc_header hdr;
-		struct atapi_toc_entry  ent;
-	} ms_tmp;
-	long last_written;
-	unsigned long sectors_per_frame = SECTORS_PER_FRAME;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	if (toc == NULL) {
-		/* try to allocate space */
-		toc = kmalloc(sizeof(struct atapi_toc), GFP_KERNEL);
-		if (toc == NULL) {
-			printk(KERN_ERR PFX "%s: No cdrom TOC buffer!\n",
-					drive->name);
-			return -ENOMEM;
-		}
-		info->toc = toc;
-	}
-
-	/*
-	 * Check to see if the existing data is still valid. If it is,
-	 * just return.
-	 */
-	(void) cdrom_check_status(drive, NULL);
-
-	if (drive->atapi_flags & IDE_AFLAG_TOC_VALID)
-		return 0;
-
-	/* try to get the total cdrom capacity and sector size */
-	stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame);
-	if (stat)
-		toc->capacity = 0x1fffff;
-
-	set_capacity(info->disk, toc->capacity * sectors_per_frame);
-	/* save a private copy of the TOC capacity for error handling */
-	drive->probed_capacity = toc->capacity * sectors_per_frame;
-
-	blk_queue_logical_block_size(drive->queue,
-				     sectors_per_frame << SECTOR_SHIFT);
-
-	/* first read just the header, so we know how long the TOC is */
-	stat = ide_cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
-				    sizeof(struct atapi_toc_header));
-	if (stat)
-		return stat;
-
-	if (drive->atapi_flags & IDE_AFLAG_TOCTRACKS_AS_BCD) {
-		toc->hdr.first_track = bcd2bin(toc->hdr.first_track);
-		toc->hdr.last_track  = bcd2bin(toc->hdr.last_track);
-	}
-
-	ntracks = toc->hdr.last_track - toc->hdr.first_track + 1;
-	if (ntracks <= 0)
-		return -EIO;
-	if (ntracks > MAX_TRACKS)
-		ntracks = MAX_TRACKS;
-
-	/* now read the whole schmeer */
-	stat = ide_cdrom_read_tocentry(drive, toc->hdr.first_track, 1, 0,
-				  (char *)&toc->hdr,
-				   sizeof(struct atapi_toc_header) +
-				   (ntracks + 1) *
-				   sizeof(struct atapi_toc_entry));
-
-	if (stat && toc->hdr.first_track > 1) {
-		/*
-		 * Cds with CDI tracks only don't have any TOC entries, despite
-		 * of this the returned values are
-		 * first_track == last_track = number of CDI tracks + 1,
-		 * so that this case is indistinguishable from the same layout
-		 * plus an additional audio track. If we get an error for the
-		 * regular case, we assume a CDI without additional audio
-		 * tracks. In this case the readable TOC is empty (CDI tracks
-		 * are not included) and only holds the Leadout entry.
-		 *
-		 * Heiko Eißfeldt.
-		 */
-		ntracks = 0;
-		stat = ide_cdrom_read_tocentry(drive, CDROM_LEADOUT, 1, 0,
-					   (char *)&toc->hdr,
-					   sizeof(struct atapi_toc_header) +
-					   (ntracks + 1) *
-					   sizeof(struct atapi_toc_entry));
-		if (stat)
-			return stat;
-
-		if (drive->atapi_flags & IDE_AFLAG_TOCTRACKS_AS_BCD) {
-			toc->hdr.first_track = (u8)bin2bcd(CDROM_LEADOUT);
-			toc->hdr.last_track = (u8)bin2bcd(CDROM_LEADOUT);
-		} else {
-			toc->hdr.first_track = CDROM_LEADOUT;
-			toc->hdr.last_track = CDROM_LEADOUT;
-		}
-	}
-
-	if (stat)
-		return stat;
-
-	toc->hdr.toc_length = be16_to_cpu(toc->hdr.toc_length);
-
-	if (drive->atapi_flags & IDE_AFLAG_TOCTRACKS_AS_BCD) {
-		toc->hdr.first_track = bcd2bin(toc->hdr.first_track);
-		toc->hdr.last_track  = bcd2bin(toc->hdr.last_track);
-	}
-
-	for (i = 0; i <= ntracks; i++) {
-		if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) {
-			if (drive->atapi_flags & IDE_AFLAG_TOCTRACKS_AS_BCD)
-				toc->ent[i].track = bcd2bin(toc->ent[i].track);
-			msf_from_bcd(&toc->ent[i].addr.msf);
-		}
-		toc->ent[i].addr.lba = msf_to_lba(toc->ent[i].addr.msf.minute,
-						  toc->ent[i].addr.msf.second,
-						  toc->ent[i].addr.msf.frame);
-	}
-
-	if (toc->hdr.first_track != CDROM_LEADOUT) {
-		/* read the multisession information */
-		stat = ide_cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp,
-					   sizeof(ms_tmp));
-		if (stat)
-			return stat;
-
-		toc->last_session_lba = be32_to_cpu(ms_tmp.ent.addr.lba);
-	} else {
-		ms_tmp.hdr.last_track = CDROM_LEADOUT;
-		ms_tmp.hdr.first_track = ms_tmp.hdr.last_track;
-		toc->last_session_lba = msf_to_lba(0, 2, 0); /* 0m 2s 0f */
-	}
-
-	if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) {
-		/* re-read multisession information using MSF format */
-		stat = ide_cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp,
-					   sizeof(ms_tmp));
-		if (stat)
-			return stat;
-
-		msf_from_bcd(&ms_tmp.ent.addr.msf);
-		toc->last_session_lba = msf_to_lba(ms_tmp.ent.addr.msf.minute,
-						   ms_tmp.ent.addr.msf.second,
-						   ms_tmp.ent.addr.msf.frame);
-	}
-
-	toc->xa_flag = (ms_tmp.hdr.first_track != ms_tmp.hdr.last_track);
-
-	/* now try to get the total cdrom capacity */
-	stat = cdrom_get_last_written(cdi, &last_written);
-	if (!stat && (last_written > toc->capacity)) {
-		toc->capacity = last_written;
-		set_capacity(info->disk, toc->capacity * sectors_per_frame);
-		drive->probed_capacity = toc->capacity * sectors_per_frame;
-	}
-
-	/* Remember that we've read this stuff. */
-	drive->atapi_flags |= IDE_AFLAG_TOC_VALID;
-
-	return 0;
-}
-
-int ide_cdrom_get_capabilities(ide_drive_t *drive, u8 *buf)
-{
-	struct cdrom_info *info = drive->driver_data;
-	struct cdrom_device_info *cdi = &info->devinfo;
-	struct packet_command cgc;
-	int stat, attempts = 3, size = ATAPI_CAPABILITIES_PAGE_SIZE;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	if ((drive->atapi_flags & IDE_AFLAG_FULL_CAPS_PAGE) == 0)
-		size -= ATAPI_CAPABILITIES_PAGE_PAD_SIZE;
-
-	init_cdrom_command(&cgc, buf, size, CGC_DATA_UNKNOWN);
-	do {
-		/* we seem to get stat=0x01,err=0x00 the first time (??) */
-		stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
-		if (!stat)
-			break;
-	} while (--attempts);
-	return stat;
-}
-
-void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf)
-{
-	struct cdrom_info *cd = drive->driver_data;
-	u16 curspeed, maxspeed;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	if (drive->atapi_flags & IDE_AFLAG_LE_SPEED_FIELDS) {
-		curspeed = le16_to_cpup((__le16 *)&buf[8 + 14]);
-		maxspeed = le16_to_cpup((__le16 *)&buf[8 + 8]);
-	} else {
-		curspeed = be16_to_cpup((__be16 *)&buf[8 + 14]);
-		maxspeed = be16_to_cpup((__be16 *)&buf[8 + 8]);
-	}
-
-	ide_debug_log(IDE_DBG_PROBE, "curspeed: %u, maxspeed: %u",
-				     curspeed, maxspeed);
-
-	cd->current_speed = DIV_ROUND_CLOSEST(curspeed, 176);
-	cd->max_speed = DIV_ROUND_CLOSEST(maxspeed, 176);
-}
-
-#define IDE_CD_CAPABILITIES \
-	(CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK | CDC_SELECT_SPEED | \
-	 CDC_SELECT_DISC | CDC_MULTI_SESSION | CDC_MCN | CDC_MEDIA_CHANGED | \
-	 CDC_PLAY_AUDIO | CDC_RESET | CDC_DRIVE_STATUS | CDC_CD_R | \
-	 CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM | CDC_GENERIC_PACKET | \
-	 CDC_MO_DRIVE | CDC_MRW | CDC_MRW_W | CDC_RAM)
-
-static const struct cdrom_device_ops ide_cdrom_dops = {
-	.open			= ide_cdrom_open_real,
-	.release		= ide_cdrom_release_real,
-	.drive_status		= ide_cdrom_drive_status,
-	.check_events		= ide_cdrom_check_events_real,
-	.tray_move		= ide_cdrom_tray_move,
-	.lock_door		= ide_cdrom_lock_door,
-	.select_speed		= ide_cdrom_select_speed,
-	.get_last_session	= ide_cdrom_get_last_session,
-	.get_mcn		= ide_cdrom_get_mcn,
-	.reset			= ide_cdrom_reset,
-	.audio_ioctl		= ide_cdrom_audio_ioctl,
-	.capability		= IDE_CD_CAPABILITIES,
-	.generic_packet		= ide_cdrom_packet,
-};
-
-static int ide_cdrom_register(ide_drive_t *drive, int nslots)
-{
-	struct cdrom_info *info = drive->driver_data;
-	struct cdrom_device_info *devinfo = &info->devinfo;
-
-	ide_debug_log(IDE_DBG_PROBE, "nslots: %d", nslots);
-
-	devinfo->ops = &ide_cdrom_dops;
-	devinfo->speed = info->current_speed;
-	devinfo->capacity = nslots;
-	devinfo->handle = drive;
-	strcpy(devinfo->name, drive->name);
-
-	if (drive->atapi_flags & IDE_AFLAG_NO_SPEED_SELECT)
-		devinfo->mask |= CDC_SELECT_SPEED;
-
-	return register_cdrom(info->disk, devinfo);
-}
-
-static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
-{
-	struct cdrom_info *cd = drive->driver_data;
-	struct cdrom_device_info *cdi = &cd->devinfo;
-	u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE];
-	mechtype_t mechtype;
-	int nslots = 1;
-
-	ide_debug_log(IDE_DBG_PROBE, "media: 0x%x, atapi_flags: 0x%lx",
-				     drive->media, drive->atapi_flags);
-
-	cdi->mask = (CDC_CD_R | CDC_CD_RW | CDC_DVD | CDC_DVD_R |
-		     CDC_DVD_RAM | CDC_SELECT_DISC | CDC_PLAY_AUDIO |
-		     CDC_MO_DRIVE | CDC_RAM);
-
-	if (drive->media == ide_optical) {
-		cdi->mask &= ~(CDC_MO_DRIVE | CDC_RAM);
-		printk(KERN_ERR PFX "%s: ATAPI magneto-optical drive\n",
-				drive->name);
-		return nslots;
-	}
-
-	if (drive->atapi_flags & IDE_AFLAG_PRE_ATAPI12) {
-		drive->atapi_flags &= ~IDE_AFLAG_NO_EJECT;
-		cdi->mask &= ~CDC_PLAY_AUDIO;
-		return nslots;
-	}
-
-	/*
-	 * We have to cheat a little here. the packet will eventually be queued
-	 * with ide_cdrom_packet(), which extracts the drive from cdi->handle.
-	 * Since this device hasn't been registered with the Uniform layer yet,
-	 * it can't do this. Same goes for cdi->ops.
-	 */
-	cdi->handle = drive;
-	cdi->ops = &ide_cdrom_dops;
-
-	if (ide_cdrom_get_capabilities(drive, buf))
-		return 0;
-
-	if ((buf[8 + 6] & 0x01) == 0)
-		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
-	if (buf[8 + 6] & 0x08)
-		drive->atapi_flags &= ~IDE_AFLAG_NO_EJECT;
-	if (buf[8 + 3] & 0x01)
-		cdi->mask &= ~CDC_CD_R;
-	if (buf[8 + 3] & 0x02)
-		cdi->mask &= ~(CDC_CD_RW | CDC_RAM);
-	if (buf[8 + 2] & 0x38)
-		cdi->mask &= ~CDC_DVD;
-	if (buf[8 + 3] & 0x20)
-		cdi->mask &= ~(CDC_DVD_RAM | CDC_RAM);
-	if (buf[8 + 3] & 0x10)
-		cdi->mask &= ~CDC_DVD_R;
-	if ((buf[8 + 4] & 0x01) || (drive->atapi_flags & IDE_AFLAG_PLAY_AUDIO_OK))
-		cdi->mask &= ~CDC_PLAY_AUDIO;
-
-	mechtype = buf[8 + 6] >> 5;
-	if (mechtype == mechtype_caddy ||
-	    mechtype == mechtype_popup ||
-	    (drive->atapi_flags & IDE_AFLAG_NO_AUTOCLOSE))
-		cdi->mask |= CDC_CLOSE_TRAY;
-
-	if (cdi->sanyo_slot > 0) {
-		cdi->mask &= ~CDC_SELECT_DISC;
-		nslots = 3;
-	} else if (mechtype == mechtype_individual_changer ||
-		   mechtype == mechtype_cartridge_changer) {
-		nslots = cdrom_number_of_slots(cdi);
-		if (nslots > 1)
-			cdi->mask &= ~CDC_SELECT_DISC;
-	}
-
-	ide_cdrom_update_speed(drive, buf);
-
-	printk(KERN_INFO PFX "%s: ATAPI", drive->name);
-
-	/* don't print speed if the drive reported 0 */
-	if (cd->max_speed)
-		printk(KERN_CONT " %dX", cd->max_speed);
-
-	printk(KERN_CONT " %s", (cdi->mask & CDC_DVD) ? "CD-ROM" : "DVD-ROM");
-
-	if ((cdi->mask & CDC_DVD_R) == 0 || (cdi->mask & CDC_DVD_RAM) == 0)
-		printk(KERN_CONT " DVD%s%s",
-				 (cdi->mask & CDC_DVD_R) ? "" : "-R",
-				 (cdi->mask & CDC_DVD_RAM) ? "" : "/RAM");
-
-	if ((cdi->mask & CDC_CD_R) == 0 || (cdi->mask & CDC_CD_RW) == 0)
-		printk(KERN_CONT " CD%s%s",
-				 (cdi->mask & CDC_CD_R) ? "" : "-R",
-				 (cdi->mask & CDC_CD_RW) ? "" : "/RW");
-
-	if ((cdi->mask & CDC_SELECT_DISC) == 0)
-		printk(KERN_CONT " changer w/%d slots", nslots);
-	else
-		printk(KERN_CONT " drive");
-
-	printk(KERN_CONT ", %dkB Cache\n",
-			 be16_to_cpup((__be16 *)&buf[8 + 12]));
-
-	return nslots;
-}
-
-struct cd_list_entry {
-	const char	*id_model;
-	const char	*id_firmware;
-	unsigned int	cd_flags;
-};
-
-#ifdef CONFIG_IDE_PROC_FS
-static sector_t ide_cdrom_capacity(ide_drive_t *drive)
-{
-	unsigned long capacity, sectors_per_frame;
-
-	if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame))
-		return 0;
-
-	return capacity * sectors_per_frame;
-}
-
-static int idecd_capacity_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t *drive = m->private;
-
-	seq_printf(m, "%llu\n", (long long)ide_cdrom_capacity(drive));
-	return 0;
-}
-
-static ide_proc_entry_t idecd_proc[] = {
-	{ "capacity", S_IFREG|S_IRUGO, idecd_capacity_proc_show },
-	{}
-};
-
-static ide_proc_entry_t *ide_cd_proc_entries(ide_drive_t *drive)
-{
-	return idecd_proc;
-}
-
-static const struct ide_proc_devset *ide_cd_proc_devsets(ide_drive_t *drive)
-{
-	return NULL;
-}
-#endif
-
-static const struct cd_list_entry ide_cd_quirks_list[] = {
-	/* SCR-3231 doesn't support the SET_CD_SPEED command. */
-	{ "SAMSUNG CD-ROM SCR-3231", NULL,   IDE_AFLAG_NO_SPEED_SELECT	     },
-	/* Old NEC260 (not R) was released before ATAPI 1.2 spec. */
-	{ "NEC CD-ROM DRIVE:260",    "1.01", IDE_AFLAG_TOCADDR_AS_BCD |
-					     IDE_AFLAG_PRE_ATAPI12,	     },
-	/* Vertos 300, some versions of this drive like to talk BCD. */
-	{ "V003S0DS",		     NULL,   IDE_AFLAG_VERTOS_300_SSD,	     },
-	/* Vertos 600 ESD. */
-	{ "V006E0DS",		     NULL,   IDE_AFLAG_VERTOS_600_ESD,	     },
-	/*
-	 * Sanyo 3 CD changer uses a non-standard command for CD changing
-	 * (by default standard ATAPI support for CD changers is used).
-	 */
-	{ "CD-ROM CDR-C3 G",	     NULL,   IDE_AFLAG_SANYO_3CD	     },
-	{ "CD-ROM CDR-C3G",	     NULL,   IDE_AFLAG_SANYO_3CD	     },
-	{ "CD-ROM CDR_C36",	     NULL,   IDE_AFLAG_SANYO_3CD	     },
-	/* Stingray 8X CD-ROM. */
-	{ "STINGRAY 8422 IDE 8X CD-ROM 7-27-95", NULL, IDE_AFLAG_PRE_ATAPI12 },
-	/*
-	 * ACER 50X CD-ROM and WPI 32X CD-ROM require the full spec length
-	 * mode sense page capabilities size, but older drives break.
-	 */
-	{ "ATAPI CD ROM DRIVE 50X MAX",	NULL,	IDE_AFLAG_FULL_CAPS_PAGE     },
-	{ "WPI CDS-32X",		NULL,	IDE_AFLAG_FULL_CAPS_PAGE     },
-	/* ACER/AOpen 24X CD-ROM has the speed fields byte-swapped. */
-	{ "",			     "241N", IDE_AFLAG_LE_SPEED_FIELDS       },
-	/*
-	 * Some drives used by Apple don't advertise audio play
-	 * but they do support reading TOC & audio datas.
-	 */
-	{ "MATSHITADVD-ROM SR-8187", NULL,   IDE_AFLAG_PLAY_AUDIO_OK	     },
-	{ "MATSHITADVD-ROM SR-8186", NULL,   IDE_AFLAG_PLAY_AUDIO_OK	     },
-	{ "MATSHITADVD-ROM SR-8176", NULL,   IDE_AFLAG_PLAY_AUDIO_OK	     },
-	{ "MATSHITADVD-ROM SR-8174", NULL,   IDE_AFLAG_PLAY_AUDIO_OK	     },
-	{ "Optiarc DVD RW AD-5200A", NULL,   IDE_AFLAG_PLAY_AUDIO_OK	     },
-	{ "Optiarc DVD RW AD-7200A", NULL,   IDE_AFLAG_PLAY_AUDIO_OK	     },
-	{ "Optiarc DVD RW AD-7543A", NULL,   IDE_AFLAG_NO_AUTOCLOSE	     },
-	{ "TEAC CD-ROM CD-224E",     NULL,   IDE_AFLAG_NO_AUTOCLOSE	     },
-	{ NULL, NULL, 0 }
-};
-
-static unsigned int ide_cd_flags(u16 *id)
-{
-	const struct cd_list_entry *cle = ide_cd_quirks_list;
-
-	while (cle->id_model) {
-		if (strcmp(cle->id_model, (char *)&id[ATA_ID_PROD]) == 0 &&
-		    (cle->id_firmware == NULL ||
-		     strstr((char *)&id[ATA_ID_FW_REV], cle->id_firmware)))
-			return cle->cd_flags;
-		cle++;
-	}
-
-	return 0;
-}
-
-static int ide_cdrom_setup(ide_drive_t *drive)
-{
-	struct cdrom_info *cd = drive->driver_data;
-	struct cdrom_device_info *cdi = &cd->devinfo;
-	struct request_queue *q = drive->queue;
-	u16 *id = drive->id;
-	char *fw_rev = (char *)&id[ATA_ID_FW_REV];
-	int nslots;
-
-	ide_debug_log(IDE_DBG_PROBE, "enter");
-
-	drive->prep_rq = ide_cdrom_prep_rq;
-	blk_queue_dma_alignment(q, 31);
-	blk_queue_update_dma_pad(q, 15);
-
-	drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
-	drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id);
-
-	if ((drive->atapi_flags & IDE_AFLAG_VERTOS_300_SSD) &&
-	    fw_rev[4] == '1' && fw_rev[6] <= '2')
-		drive->atapi_flags |= (IDE_AFLAG_TOCTRACKS_AS_BCD |
-				     IDE_AFLAG_TOCADDR_AS_BCD);
-	else if ((drive->atapi_flags & IDE_AFLAG_VERTOS_600_ESD) &&
-		 fw_rev[4] == '1' && fw_rev[6] <= '2')
-		drive->atapi_flags |= IDE_AFLAG_TOCTRACKS_AS_BCD;
-	else if (drive->atapi_flags & IDE_AFLAG_SANYO_3CD)
-		/* 3 => use CD in slot 0 */
-		cdi->sanyo_slot = 3;
-
-	nslots = ide_cdrom_probe_capabilities(drive);
-
-	blk_queue_logical_block_size(q, CD_FRAMESIZE);
-
-	if (ide_cdrom_register(drive, nslots)) {
-		printk(KERN_ERR PFX "%s: %s failed to register device with the"
-				" cdrom driver.\n", drive->name, __func__);
-		cd->devinfo.handle = NULL;
-		return 1;
-	}
-
-	ide_proc_register_driver(drive, cd->driver);
-	return 0;
-}
-
-static void ide_cd_remove(ide_drive_t *drive)
-{
-	struct cdrom_info *info = drive->driver_data;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	ide_proc_unregister_driver(drive, info->driver);
-	device_del(&info->dev);
-	del_gendisk(info->disk);
-
-	mutex_lock(&idecd_ref_mutex);
-	put_device(&info->dev);
-	mutex_unlock(&idecd_ref_mutex);
-}
-
-static void ide_cd_release(struct device *dev)
-{
-	struct cdrom_info *info = to_ide_drv(dev, cdrom_info);
-	struct cdrom_device_info *devinfo = &info->devinfo;
-	ide_drive_t *drive = info->drive;
-	struct gendisk *g = info->disk;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	kfree(info->toc);
-	if (devinfo->handle == drive)
-		unregister_cdrom(devinfo);
-	drive->driver_data = NULL;
-	drive->prep_rq = NULL;
-	g->private_data = NULL;
-	put_disk(g);
-	kfree(info);
-}
-
-static int ide_cd_probe(ide_drive_t *);
-
-static struct ide_driver ide_cdrom_driver = {
-	.gen_driver = {
-		.owner		= THIS_MODULE,
-		.name		= "ide-cdrom",
-		.bus		= &ide_bus_type,
-	},
-	.probe			= ide_cd_probe,
-	.remove			= ide_cd_remove,
-	.version		= IDECD_VERSION,
-	.do_request		= ide_cd_do_request,
-#ifdef CONFIG_IDE_PROC_FS
-	.proc_entries		= ide_cd_proc_entries,
-	.proc_devsets		= ide_cd_proc_devsets,
-#endif
-};
-
-static int idecd_open(struct block_device *bdev, fmode_t mode)
-{
-	struct cdrom_info *info;
-	int rc = -ENXIO;
-
-	if (bdev_check_media_change(bdev)) {
-		info = ide_drv_g(bdev->bd_disk, cdrom_info);
-
-		ide_cd_read_toc(info->drive);
-	}
-
-	mutex_lock(&ide_cd_mutex);
-	info = ide_cd_get(bdev->bd_disk);
-	if (!info)
-		goto out;
-
-	rc = cdrom_open(&info->devinfo, bdev, mode);
-	if (rc < 0)
-		ide_cd_put(info);
-out:
-	mutex_unlock(&ide_cd_mutex);
-	return rc;
-}
-
-static void idecd_release(struct gendisk *disk, fmode_t mode)
-{
-	struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
-
-	mutex_lock(&ide_cd_mutex);
-	cdrom_release(&info->devinfo, mode);
-
-	ide_cd_put(info);
-	mutex_unlock(&ide_cd_mutex);
-}
-
-static int idecd_set_spindown(struct cdrom_device_info *cdi, unsigned long arg)
-{
-	struct packet_command cgc;
-	char buffer[16];
-	int stat;
-	char spindown;
-
-	if (copy_from_user(&spindown, (void __user *)arg, sizeof(char)))
-		return -EFAULT;
-
-	init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_UNKNOWN);
-
-	stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CDROM_PAGE, 0);
-	if (stat)
-		return stat;
-
-	buffer[11] = (buffer[11] & 0xf0) | (spindown & 0x0f);
-	return cdrom_mode_select(cdi, &cgc);
-}
-
-static int idecd_get_spindown(struct cdrom_device_info *cdi, unsigned long arg)
-{
-	struct packet_command cgc;
-	char buffer[16];
-	int stat;
-	char spindown;
-
-	init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_UNKNOWN);
-
-	stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CDROM_PAGE, 0);
-	if (stat)
-		return stat;
-
-	spindown = buffer[11] & 0x0f;
-	if (copy_to_user((void __user *)arg, &spindown, sizeof(char)))
-		return -EFAULT;
-	return 0;
-}
-
-static int idecd_locked_ioctl(struct block_device *bdev, fmode_t mode,
-			unsigned int cmd, unsigned long arg)
-{
-	struct cdrom_info *info = ide_drv_g(bdev->bd_disk, cdrom_info);
-	int err;
-
-	switch (cmd) {
-	case CDROMSETSPINDOWN:
-		return idecd_set_spindown(&info->devinfo, arg);
-	case CDROMGETSPINDOWN:
-		return idecd_get_spindown(&info->devinfo, arg);
-	default:
-		break;
-	}
-
-	err = generic_ide_ioctl(info->drive, bdev, cmd, arg);
-	if (err == -EINVAL)
-		err = cdrom_ioctl(&info->devinfo, bdev, mode, cmd, arg);
-
-	return err;
-}
-
-static int idecd_ioctl(struct block_device *bdev, fmode_t mode,
-			     unsigned int cmd, unsigned long arg)
-{
-	int ret;
-
-	mutex_lock(&ide_cd_mutex);
-	ret = idecd_locked_ioctl(bdev, mode, cmd, arg);
-	mutex_unlock(&ide_cd_mutex);
-
-	return ret;
-}
-
-static int idecd_locked_compat_ioctl(struct block_device *bdev, fmode_t mode,
-			unsigned int cmd, unsigned long arg)
-{
-	struct cdrom_info *info = ide_drv_g(bdev->bd_disk, cdrom_info);
-	void __user *argp = compat_ptr(arg);
-	int err;
-
-	switch (cmd) {
-	case CDROMSETSPINDOWN:
-		return idecd_set_spindown(&info->devinfo, (unsigned long)argp);
-	case CDROMGETSPINDOWN:
-		return idecd_get_spindown(&info->devinfo, (unsigned long)argp);
-	default:
-		break;
-	}
-
-	err = generic_ide_ioctl(info->drive, bdev, cmd, arg);
-	if (err == -EINVAL)
-		err = cdrom_ioctl(&info->devinfo, bdev, mode, cmd,
-				  (unsigned long)argp);
-
-	return err;
-}
-
-static int idecd_compat_ioctl(struct block_device *bdev, fmode_t mode,
-			     unsigned int cmd, unsigned long arg)
-{
-	int ret;
-
-	mutex_lock(&ide_cd_mutex);
-	ret = idecd_locked_compat_ioctl(bdev, mode, cmd, arg);
-	mutex_unlock(&ide_cd_mutex);
-
-	return ret;
-}
-
-static unsigned int idecd_check_events(struct gendisk *disk,
-				       unsigned int clearing)
-{
-	struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
-	return cdrom_check_events(&info->devinfo, clearing);
-}
-
-static const struct block_device_operations idecd_ops = {
-	.owner			= THIS_MODULE,
-	.open			= idecd_open,
-	.release		= idecd_release,
-	.ioctl			= idecd_ioctl,
-	.compat_ioctl		= IS_ENABLED(CONFIG_COMPAT) ?
-				  idecd_compat_ioctl : NULL,
-	.check_events		= idecd_check_events,
-};
-
-/* module options */
-static unsigned long debug_mask;
-module_param(debug_mask, ulong, 0644);
-
-MODULE_DESCRIPTION("ATAPI CD-ROM Driver");
-
-static int ide_cd_probe(ide_drive_t *drive)
-{
-	struct cdrom_info *info;
-	struct gendisk *g;
-
-	ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x",
-				     drive->driver_req, drive->media);
-
-	if (!strstr("ide-cdrom", drive->driver_req))
-		goto failed;
-
-	if (drive->media != ide_cdrom && drive->media != ide_optical)
-		goto failed;
-
-	drive->debug_mask = debug_mask;
-	drive->irq_handler = cdrom_newpc_intr;
-
-	info = kzalloc(sizeof(struct cdrom_info), GFP_KERNEL);
-	if (info == NULL) {
-		printk(KERN_ERR PFX "%s: Can't allocate a cdrom structure\n",
-				drive->name);
-		goto failed;
-	}
-
-	g = alloc_disk(1 << PARTN_BITS);
-	if (!g)
-		goto out_free_cd;
-
-	ide_init_disk(g, drive);
-
-	info->dev.parent = &drive->gendev;
-	info->dev.release = ide_cd_release;
-	dev_set_name(&info->dev, "%s", dev_name(&drive->gendev));
-
-	if (device_register(&info->dev))
-		goto out_free_disk;
-
-	info->drive = drive;
-	info->driver = &ide_cdrom_driver;
-	info->disk = g;
-
-	g->private_data = &info->driver;
-
-	drive->driver_data = info;
-
-	g->minors = 1;
-	g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE;
-	if (ide_cdrom_setup(drive)) {
-		put_device(&info->dev);
-		goto failed;
-	}
-
-	ide_cd_read_toc(drive);
-	g->fops = &idecd_ops;
-	g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
-	g->events = DISK_EVENT_MEDIA_CHANGE;
-	device_add_disk(&drive->gendev, g, NULL);
-	return 0;
-
-out_free_disk:
-	put_disk(g);
-out_free_cd:
-	kfree(info);
-failed:
-	return -ENODEV;
-}
-
-static void __exit ide_cdrom_exit(void)
-{
-	driver_unregister(&ide_cdrom_driver.gen_driver);
-}
-
-static int __init ide_cdrom_init(void)
-{
-	printk(KERN_INFO DRV_NAME " driver " IDECD_VERSION "\n");
-	return driver_register(&ide_cdrom_driver.gen_driver);
-}
-
-MODULE_ALIAS("ide:*m-cdrom*");
-MODULE_ALIAS("ide-cd");
-module_init(ide_cdrom_init);
-module_exit(ide_cdrom_exit);
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
deleted file mode 100644
index a69dc7f61c4d..000000000000
--- a/drivers/ide/ide-cd.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *  Copyright (C) 1996-98  Erik Andersen
- *  Copyright (C) 1998-2000 Jens Axboe
- */
-#ifndef _IDE_CD_H
-#define _IDE_CD_H
-
-#include <linux/cdrom.h>
-#include <asm/byteorder.h>
-
-#define IDECD_DEBUG_LOG		0
-
-#if IDECD_DEBUG_LOG
-#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, ## args)
-#else
-#define ide_debug_log(lvl, fmt, args...) do {} while (0)
-#endif
-
-#define ATAPI_WAIT_WRITE_BUSY	(10 * HZ)
-
-/************************************************************************/
-
-#define SECTORS_PER_FRAME	(CD_FRAMESIZE >> SECTOR_SHIFT)
-#define SECTOR_BUFFER_SIZE	(CD_FRAMESIZE * 32)
-
-/* Capabilities Page size including 8 bytes of Mode Page Header */
-#define ATAPI_CAPABILITIES_PAGE_SIZE		(8 + 20)
-#define ATAPI_CAPABILITIES_PAGE_PAD_SIZE	4
-
-/* Structure of a MSF cdrom address. */
-struct atapi_msf {
-	u8 reserved;
-	u8 minute;
-	u8 second;
-	u8 frame;
-};
-
-/* Space to hold the disk TOC. */
-#define MAX_TRACKS 99
-struct atapi_toc_header {
-	unsigned short toc_length;
-	u8 first_track;
-	u8 last_track;
-};
-
-struct atapi_toc_entry {
-	u8 reserved1;
-#if defined(__BIG_ENDIAN_BITFIELD)
-	u8 adr     : 4;
-	u8 control : 4;
-#elif defined(__LITTLE_ENDIAN_BITFIELD)
-	u8 control : 4;
-	u8 adr     : 4;
-#else
-#error "Please fix <asm/byteorder.h>"
-#endif
-	u8 track;
-	u8 reserved2;
-	union {
-		unsigned lba;
-		struct atapi_msf msf;
-	} addr;
-};
-
-struct atapi_toc {
-	int    last_session_lba;
-	int    xa_flag;
-	unsigned long capacity;
-	struct atapi_toc_header hdr;
-	struct atapi_toc_entry  ent[MAX_TRACKS+1];
-	  /* One extra for the leadout. */
-};
-
-/* Extra per-device info for cdrom drives. */
-struct cdrom_info {
-	ide_drive_t		*drive;
-	struct ide_driver	*driver;
-	struct gendisk		*disk;
-	struct device		dev;
-
-	/* Buffer for table of contents.  NULL if we haven't allocated
-	   a TOC buffer for this device yet. */
-
-	struct atapi_toc *toc;
-
-	u8 max_speed;		/* Max speed of the drive. */
-	u8 current_speed;	/* Current speed of the drive. */
-
-        /* Per-device info needed by cdrom.c generic driver. */
-        struct cdrom_device_info devinfo;
-
-	unsigned long write_timeout;
-};
-
-/* ide-cd_verbose.c */
-void ide_cd_log_error(const char *, struct request *, struct request_sense *);
-
-/* ide-cd.c functions used by ide-cd_ioctl.c */
-int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *,
-		    unsigned *, struct scsi_sense_hdr *, int, req_flags_t);
-int ide_cd_read_toc(ide_drive_t *);
-int ide_cdrom_get_capabilities(ide_drive_t *, u8 *);
-void ide_cdrom_update_speed(ide_drive_t *, u8 *);
-int cdrom_check_status(ide_drive_t *, struct scsi_sense_hdr *);
-
-/* ide-cd_ioctl.c */
-int ide_cdrom_open_real(struct cdrom_device_info *, int);
-void ide_cdrom_release_real(struct cdrom_device_info *);
-int ide_cdrom_drive_status(struct cdrom_device_info *, int);
-unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *,
-					 unsigned int clearing, int slot_nr);
-int ide_cdrom_tray_move(struct cdrom_device_info *, int);
-int ide_cdrom_lock_door(struct cdrom_device_info *, int);
-int ide_cdrom_select_speed(struct cdrom_device_info *, int);
-int ide_cdrom_get_last_session(struct cdrom_device_info *,
-			       struct cdrom_multisession *);
-int ide_cdrom_get_mcn(struct cdrom_device_info *, struct cdrom_mcn *);
-int ide_cdrom_reset(struct cdrom_device_info *cdi);
-int ide_cdrom_audio_ioctl(struct cdrom_device_info *, unsigned int, void *);
-int ide_cdrom_packet(struct cdrom_device_info *, struct packet_command *);
-
-#endif /* _IDE_CD_H */
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
deleted file mode 100644
index 011eab9c69b7..000000000000
--- a/drivers/ide/ide-cd_ioctl.c
+++ /dev/null
@@ -1,468 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * cdrom.c IOCTLs handling for ide-cd driver.
- *
- * Copyright (C) 1994-1996  Scott Snyder <snyder@fnald0.fnal.gov>
- * Copyright (C) 1996-1998  Erik Andersen <andersee@debian.org>
- * Copyright (C) 1998-2000  Jens Axboe <axboe@suse.de>
- */
-
-#include <linux/kernel.h>
-#include <linux/cdrom.h>
-#include <linux/gfp.h>
-#include <linux/ide.h>
-#include <scsi/scsi.h>
-
-#include "ide-cd.h"
-
-/****************************************************************************
- * Other driver requests (open, close, check media change).
- */
-int ide_cdrom_open_real(struct cdrom_device_info *cdi, int purpose)
-{
-	return 0;
-}
-
-/*
- * Close down the device.  Invalidate all cached blocks.
- */
-void ide_cdrom_release_real(struct cdrom_device_info *cdi)
-{
-	ide_drive_t *drive = cdi->handle;
-
-	if (!cdi->use_count)
-		drive->atapi_flags &= ~IDE_AFLAG_TOC_VALID;
-}
-
-/*
- * add logic to try GET_EVENT command first to check for media and tray
- * status. this should be supported by newer cd-r/w and all DVD etc
- * drives
- */
-int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
-{
-	ide_drive_t *drive = cdi->handle;
-	struct media_event_desc med;
-	struct scsi_sense_hdr sshdr;
-	int stat;
-
-	if (slot_nr != CDSL_CURRENT)
-		return -EINVAL;
-
-	stat = cdrom_check_status(drive, &sshdr);
-	if (!stat || sshdr.sense_key == UNIT_ATTENTION)
-		return CDS_DISC_OK;
-
-	if (!cdrom_get_media_event(cdi, &med)) {
-		if (med.media_present)
-			return CDS_DISC_OK;
-		else if (med.door_open)
-			return CDS_TRAY_OPEN;
-		else
-			return CDS_NO_DISC;
-	}
-
-	if (sshdr.sense_key == NOT_READY && sshdr.asc == 0x04
-			&& sshdr.ascq == 0x04)
-		return CDS_DISC_OK;
-
-	/*
-	 * If not using Mt Fuji extended media tray reports,
-	 * just return TRAY_OPEN since ATAPI doesn't provide
-	 * any other way to detect this...
-	 */
-	if (sshdr.sense_key == NOT_READY) {
-		if (sshdr.asc == 0x3a && sshdr.ascq == 1)
-			return CDS_NO_DISC;
-		else
-			return CDS_TRAY_OPEN;
-	}
-	return CDS_DRIVE_NOT_READY;
-}
-
-/*
- * ide-cd always generates media changed event if media is missing, which
- * makes it impossible to use for proper event reporting, so
- * DISK_EVENT_FLAG_UEVENT is cleared in disk->event_flags
- * and the following function is used only to trigger
- * revalidation and never propagated to userland.
- */
-unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi,
-					 unsigned int clearing, int slot_nr)
-{
-	ide_drive_t *drive = cdi->handle;
-	int retval;
-
-	if (slot_nr == CDSL_CURRENT) {
-		(void) cdrom_check_status(drive, NULL);
-		retval = (drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED) ? 1 : 0;
-		drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
-		return retval ? DISK_EVENT_MEDIA_CHANGE : 0;
-	} else {
-		return 0;
-	}
-}
-
-/* Eject the disk if EJECTFLAG is 0.
-   If EJECTFLAG is 1, try to reload the disk. */
-static
-int cdrom_eject(ide_drive_t *drive, int ejectflag)
-{
-	struct cdrom_info *cd = drive->driver_data;
-	struct cdrom_device_info *cdi = &cd->devinfo;
-	char loej = 0x02;
-	unsigned char cmd[BLK_MAX_CDB];
-
-	if ((drive->atapi_flags & IDE_AFLAG_NO_EJECT) && !ejectflag)
-		return -EDRIVE_CANT_DO_THIS;
-
-	/* reload fails on some drives, if the tray is locked */
-	if ((drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED) && ejectflag)
-		return 0;
-
-	/* only tell drive to close tray if open, if it can do that */
-	if (ejectflag && (cdi->mask & CDC_CLOSE_TRAY))
-		loej = 0;
-
-	memset(cmd, 0, BLK_MAX_CDB);
-
-	cmd[0] = GPCMD_START_STOP_UNIT;
-	cmd[4] = loej | (ejectflag != 0);
-
-	return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
-}
-
-/* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */
-static
-int ide_cd_lockdoor(ide_drive_t *drive, int lockflag)
-{
-	struct scsi_sense_hdr sshdr;
-	int stat;
-
-	/* If the drive cannot lock the door, just pretend. */
-	if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) {
-		stat = 0;
-	} else {
-		unsigned char cmd[BLK_MAX_CDB];
-
-		memset(cmd, 0, BLK_MAX_CDB);
-
-		cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
-		cmd[4] = lockflag ? 1 : 0;
-
-		stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL,
-				       &sshdr, 0, 0);
-	}
-
-	/* If we got an illegal field error, the drive
-	   probably cannot lock the door. */
-	if (stat != 0 &&
-	    sshdr.sense_key == ILLEGAL_REQUEST &&
-	    (sshdr.asc == 0x24 || sshdr.asc == 0x20)) {
-		printk(KERN_ERR "%s: door locking not supported\n",
-			drive->name);
-		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
-		stat = 0;
-	}
-
-	/* no medium, that's alright. */
-	if (stat != 0 && sshdr.sense_key == NOT_READY && sshdr.asc == 0x3a)
-		stat = 0;
-
-	if (stat == 0) {
-		if (lockflag)
-			drive->atapi_flags |= IDE_AFLAG_DOOR_LOCKED;
-		else
-			drive->atapi_flags &= ~IDE_AFLAG_DOOR_LOCKED;
-	}
-
-	return stat;
-}
-
-int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position)
-{
-	ide_drive_t *drive = cdi->handle;
-
-	if (position) {
-		int stat = ide_cd_lockdoor(drive, 0);
-
-		if (stat)
-			return stat;
-	}
-
-	return cdrom_eject(drive, !position);
-}
-
-int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock)
-{
-	ide_drive_t *drive = cdi->handle;
-
-	return ide_cd_lockdoor(drive, lock);
-}
-
-/*
- * ATAPI devices are free to select the speed you request or any slower
- * rate. :-(  Requesting too fast a speed will _not_ produce an error.
- */
-int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
-{
-	ide_drive_t *drive = cdi->handle;
-	struct cdrom_info *cd = drive->driver_data;
-	u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE];
-	int stat;
-	unsigned char cmd[BLK_MAX_CDB];
-
-	if (speed == 0)
-		speed = 0xffff; /* set to max */
-	else
-		speed *= 177;   /* Nx to kbytes/s */
-
-	memset(cmd, 0, BLK_MAX_CDB);
-
-	cmd[0] = GPCMD_SET_SPEED;
-	/* Read Drive speed in kbytes/second MSB/LSB */
-	cmd[2] = (speed >> 8) & 0xff;
-	cmd[3] = speed & 0xff;
-	if ((cdi->mask & (CDC_CD_R | CDC_CD_RW | CDC_DVD_R)) !=
-	    (CDC_CD_R | CDC_CD_RW | CDC_DVD_R)) {
-		/* Write Drive speed in kbytes/second MSB/LSB */
-		cmd[4] = (speed >> 8) & 0xff;
-		cmd[5] = speed & 0xff;
-	}
-
-	stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
-
-	if (!ide_cdrom_get_capabilities(drive, buf)) {
-		ide_cdrom_update_speed(drive, buf);
-		cdi->speed = cd->current_speed;
-	}
-
-	return 0;
-}
-
-int ide_cdrom_get_last_session(struct cdrom_device_info *cdi,
-			       struct cdrom_multisession *ms_info)
-{
-	struct atapi_toc *toc;
-	ide_drive_t *drive = cdi->handle;
-	struct cdrom_info *info = drive->driver_data;
-	int ret;
-
-	if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) {
-		ret = ide_cd_read_toc(drive);
-		if (ret)
-			return ret;
-	}
-
-	toc = info->toc;
-	ms_info->addr.lba = toc->last_session_lba;
-	ms_info->xa_flag = toc->xa_flag;
-
-	return 0;
-}
-
-int ide_cdrom_get_mcn(struct cdrom_device_info *cdi,
-		      struct cdrom_mcn *mcn_info)
-{
-	ide_drive_t *drive = cdi->handle;
-	int stat, mcnlen;
-	char buf[24];
-	unsigned char cmd[BLK_MAX_CDB];
-	unsigned len = sizeof(buf);
-
-	memset(cmd, 0, BLK_MAX_CDB);
-
-	cmd[0] = GPCMD_READ_SUBCHANNEL;
-	cmd[1] = 2;		/* MSF addressing */
-	cmd[2] = 0x40;	/* request subQ data */
-	cmd[3] = 2;		/* format */
-	cmd[8] = len;
-
-	stat = ide_cd_queue_pc(drive, cmd, 0, buf, &len, NULL, 0, 0);
-	if (stat)
-		return stat;
-
-	mcnlen = sizeof(mcn_info->medium_catalog_number) - 1;
-	memcpy(mcn_info->medium_catalog_number, buf + 9, mcnlen);
-	mcn_info->medium_catalog_number[mcnlen] = '\0';
-
-	return 0;
-}
-
-int ide_cdrom_reset(struct cdrom_device_info *cdi)
-{
-	ide_drive_t *drive = cdi->handle;
-	struct cdrom_info *cd = drive->driver_data;
-	struct request *rq;
-	int ret;
-
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
-	ide_req(rq)->type = ATA_PRIV_MISC;
-	rq->rq_flags = RQF_QUIET;
-	blk_execute_rq(cd->disk, rq, 0);
-	ret = scsi_req(rq)->result ? -EIO : 0;
-	blk_put_request(rq);
-	/*
-	 * A reset will unlock the door. If it was previously locked,
-	 * lock it again.
-	 */
-	if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED)
-		(void)ide_cd_lockdoor(drive, 1);
-
-	return ret;
-}
-
-static int ide_cd_get_toc_entry(ide_drive_t *drive, int track,
-				struct atapi_toc_entry **ent)
-{
-	struct cdrom_info *info = drive->driver_data;
-	struct atapi_toc *toc = info->toc;
-	int ntracks;
-
-	/*
-	 * don't serve cached data, if the toc isn't valid
-	 */
-	if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0)
-		return -EINVAL;
-
-	/* Check validity of requested track number. */
-	ntracks = toc->hdr.last_track - toc->hdr.first_track + 1;
-
-	if (toc->hdr.first_track == CDROM_LEADOUT)
-		ntracks = 0;
-
-	if (track == CDROM_LEADOUT)
-		*ent = &toc->ent[ntracks];
-	else if (track < toc->hdr.first_track || track > toc->hdr.last_track)
-		return -EINVAL;
-	else
-		*ent = &toc->ent[track - toc->hdr.first_track];
-
-	return 0;
-}
-
-static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
-{
-	struct cdrom_ti *ti = arg;
-	struct atapi_toc_entry *first_toc, *last_toc;
-	unsigned long lba_start, lba_end;
-	int stat;
-	unsigned char cmd[BLK_MAX_CDB];
-
-	stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc);
-	if (stat)
-		return stat;
-
-	stat = ide_cd_get_toc_entry(drive, ti->cdti_trk1, &last_toc);
-	if (stat)
-		return stat;
-
-	if (ti->cdti_trk1 != CDROM_LEADOUT)
-		++last_toc;
-	lba_start = first_toc->addr.lba;
-	lba_end   = last_toc->addr.lba;
-
-	if (lba_end <= lba_start)
-		return -EINVAL;
-
-	memset(cmd, 0, BLK_MAX_CDB);
-
-	cmd[0] = GPCMD_PLAY_AUDIO_MSF;
-	lba_to_msf(lba_start,   &cmd[3], &cmd[4], &cmd[5]);
-	lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]);
-
-	return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
-}
-
-static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
-{
-	struct cdrom_info *cd = drive->driver_data;
-	struct cdrom_tochdr *tochdr = arg;
-	struct atapi_toc *toc;
-	int stat;
-
-	/* Make sure our saved TOC is valid. */
-	stat = ide_cd_read_toc(drive);
-	if (stat)
-		return stat;
-
-	toc = cd->toc;
-	tochdr->cdth_trk0 = toc->hdr.first_track;
-	tochdr->cdth_trk1 = toc->hdr.last_track;
-
-	return 0;
-}
-
-static int ide_cd_read_tocentry(ide_drive_t *drive, void *arg)
-{
-	struct cdrom_tocentry *tocentry = arg;
-	struct atapi_toc_entry *toce;
-	int stat;
-
-	stat = ide_cd_get_toc_entry(drive, tocentry->cdte_track, &toce);
-	if (stat)
-		return stat;
-
-	tocentry->cdte_ctrl = toce->control;
-	tocentry->cdte_adr  = toce->adr;
-	if (tocentry->cdte_format == CDROM_MSF) {
-		lba_to_msf(toce->addr.lba,
-			   &tocentry->cdte_addr.msf.minute,
-			   &tocentry->cdte_addr.msf.second,
-			   &tocentry->cdte_addr.msf.frame);
-	} else
-		tocentry->cdte_addr.lba = toce->addr.lba;
-
-	return 0;
-}
-
-int ide_cdrom_audio_ioctl(struct cdrom_device_info *cdi,
-			  unsigned int cmd, void *arg)
-{
-	ide_drive_t *drive = cdi->handle;
-
-	switch (cmd) {
-	/*
-	 * emulate PLAY_AUDIO_TI command with PLAY_AUDIO_10, since
-	 * atapi doesn't support it
-	 */
-	case CDROMPLAYTRKIND:
-		return ide_cd_fake_play_trkind(drive, arg);
-	case CDROMREADTOCHDR:
-		return ide_cd_read_tochdr(drive, arg);
-	case CDROMREADTOCENTRY:
-		return ide_cd_read_tocentry(drive, arg);
-	default:
-		return -EINVAL;
-	}
-}
-
-/* the generic packet interface to cdrom.c */
-int ide_cdrom_packet(struct cdrom_device_info *cdi,
-			    struct packet_command *cgc)
-{
-	ide_drive_t *drive = cdi->handle;
-	req_flags_t flags = 0;
-	unsigned len = cgc->buflen;
-
-	if (cgc->timeout <= 0)
-		cgc->timeout = ATAPI_WAIT_PC;
-
-	/* here we queue the commands from the uniform CD-ROM
-	   layer. the packet must be complete, as we do not
-	   touch it at all. */
-
-	if (cgc->sshdr)
-		memset(cgc->sshdr, 0, sizeof(*cgc->sshdr));
-
-	if (cgc->quiet)
-		flags |= RQF_QUIET;
-
-	cgc->stat = ide_cd_queue_pc(drive, cgc->cmd,
-				    cgc->data_direction == CGC_DATA_WRITE,
-				    cgc->buffer, &len,
-				    cgc->sshdr, cgc->timeout, flags);
-	if (!cgc->stat)
-		cgc->buflen -= len;
-	return cgc->stat;
-}
diff --git a/drivers/ide/ide-cd_verbose.c b/drivers/ide/ide-cd_verbose.c
deleted file mode 100644
index 5ecd5b2f03a3..000000000000
--- a/drivers/ide/ide-cd_verbose.c
+++ /dev/null
@@ -1,362 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Verbose error logging for ATAPI CD/DVD devices.
- *
- * Copyright (C) 1994-1996  Scott Snyder <snyder@fnald0.fnal.gov>
- * Copyright (C) 1996-1998  Erik Andersen <andersee@debian.org>
- * Copyright (C) 1998-2000  Jens Axboe <axboe@suse.de>
- */
-
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/cdrom.h>
-#include <linux/ide.h>
-#include <scsi/scsi.h>
-#include "ide-cd.h"
-
-#ifndef CONFIG_BLK_DEV_IDECD_VERBOSE_ERRORS
-void ide_cd_log_error(const char *name, struct request *failed_command,
-		      struct request_sense *sense)
-{
-	/* Suppress printing unit attention and `in progress of becoming ready'
-	   errors when we're not being verbose. */
-	if (sense->sense_key == UNIT_ATTENTION ||
-	    (sense->sense_key == NOT_READY && (sense->asc == 4 ||
-						sense->asc == 0x3a)))
-		return;
-
-	printk(KERN_ERR "%s: error code: 0x%02x  sense_key: 0x%02x  "
-			"asc: 0x%02x  ascq: 0x%02x\n",
-			name, sense->error_code, sense->sense_key,
-			sense->asc, sense->ascq);
-}
-#else
-/* The generic packet command opcodes for CD/DVD Logical Units,
- * From Table 57 of the SFF8090 Ver. 3 (Mt. Fuji) draft standard. */
-static const struct {
-	unsigned short packet_command;
-	const char * const text;
-} packet_command_texts[] = {
-	{ GPCMD_TEST_UNIT_READY, "Test Unit Ready" },
-	{ GPCMD_REQUEST_SENSE, "Request Sense" },
-	{ GPCMD_FORMAT_UNIT, "Format Unit" },
-	{ GPCMD_INQUIRY, "Inquiry" },
-	{ GPCMD_START_STOP_UNIT, "Start/Stop Unit" },
-	{ GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, "Prevent/Allow Medium Removal" },
-	{ GPCMD_READ_FORMAT_CAPACITIES, "Read Format Capacities" },
-	{ GPCMD_READ_CDVD_CAPACITY, "Read Cd/Dvd Capacity" },
-	{ GPCMD_READ_10, "Read 10" },
-	{ GPCMD_WRITE_10, "Write 10" },
-	{ GPCMD_SEEK, "Seek" },
-	{ GPCMD_WRITE_AND_VERIFY_10, "Write and Verify 10" },
-	{ GPCMD_VERIFY_10, "Verify 10" },
-	{ GPCMD_FLUSH_CACHE, "Flush Cache" },
-	{ GPCMD_READ_SUBCHANNEL, "Read Subchannel" },
-	{ GPCMD_READ_TOC_PMA_ATIP, "Read Table of Contents" },
-	{ GPCMD_READ_HEADER, "Read Header" },
-	{ GPCMD_PLAY_AUDIO_10, "Play Audio 10" },
-	{ GPCMD_GET_CONFIGURATION, "Get Configuration" },
-	{ GPCMD_PLAY_AUDIO_MSF, "Play Audio MSF" },
-	{ GPCMD_PLAYAUDIO_TI, "Play Audio TrackIndex" },
-	{ GPCMD_GET_EVENT_STATUS_NOTIFICATION,
-		"Get Event Status Notification" },
-	{ GPCMD_PAUSE_RESUME, "Pause/Resume" },
-	{ GPCMD_STOP_PLAY_SCAN, "Stop Play/Scan" },
-	{ GPCMD_READ_DISC_INFO, "Read Disc Info" },
-	{ GPCMD_READ_TRACK_RZONE_INFO, "Read Track Rzone Info" },
-	{ GPCMD_RESERVE_RZONE_TRACK, "Reserve Rzone Track" },
-	{ GPCMD_SEND_OPC, "Send OPC" },
-	{ GPCMD_MODE_SELECT_10, "Mode Select 10" },
-	{ GPCMD_REPAIR_RZONE_TRACK, "Repair Rzone Track" },
-	{ GPCMD_MODE_SENSE_10, "Mode Sense 10" },
-	{ GPCMD_CLOSE_TRACK, "Close Track" },
-	{ GPCMD_BLANK, "Blank" },
-	{ GPCMD_SEND_EVENT, "Send Event" },
-	{ GPCMD_SEND_KEY, "Send Key" },
-	{ GPCMD_REPORT_KEY, "Report Key" },
-	{ GPCMD_LOAD_UNLOAD, "Load/Unload" },
-	{ GPCMD_SET_READ_AHEAD, "Set Read-ahead" },
-	{ GPCMD_READ_12, "Read 12" },
-	{ GPCMD_GET_PERFORMANCE, "Get Performance" },
-	{ GPCMD_SEND_DVD_STRUCTURE, "Send DVD Structure" },
-	{ GPCMD_READ_DVD_STRUCTURE, "Read DVD Structure" },
-	{ GPCMD_SET_STREAMING, "Set Streaming" },
-	{ GPCMD_READ_CD_MSF, "Read CD MSF" },
-	{ GPCMD_SCAN, "Scan" },
-	{ GPCMD_SET_SPEED, "Set Speed" },
-	{ GPCMD_PLAY_CD, "Play CD" },
-	{ GPCMD_MECHANISM_STATUS, "Mechanism Status" },
-	{ GPCMD_READ_CD, "Read CD" },
-};
-
-/* From Table 303 of the SFF8090 Ver. 3 (Mt. Fuji) draft standard. */
-static const char * const sense_key_texts[16] = {
-	"No sense data",
-	"Recovered error",
-	"Not ready",
-	"Medium error",
-	"Hardware error",
-	"Illegal request",
-	"Unit attention",
-	"Data protect",
-	"Blank check",
-	"(reserved)",
-	"(reserved)",
-	"Aborted command",
-	"(reserved)",
-	"(reserved)",
-	"Miscompare",
-	"(reserved)",
-};
-
-/* From Table 304 of the SFF8090 Ver. 3 (Mt. Fuji) draft standard. */
-static const struct {
-	unsigned long asc_ascq;
-	const char * const text;
-} sense_data_texts[] = {
-	{ 0x000000, "No additional sense information" },
-	{ 0x000011, "Play operation in progress" },
-	{ 0x000012, "Play operation paused" },
-	{ 0x000013, "Play operation successfully completed" },
-	{ 0x000014, "Play operation stopped due to error" },
-	{ 0x000015, "No current audio status to return" },
-	{ 0x010c0a, "Write error - padding blocks added" },
-	{ 0x011700, "Recovered data with no error correction applied" },
-	{ 0x011701, "Recovered data with retries" },
-	{ 0x011702, "Recovered data with positive head offset" },
-	{ 0x011703, "Recovered data with negative head offset" },
-	{ 0x011704, "Recovered data with retries and/or CIRC applied" },
-	{ 0x011705, "Recovered data using previous sector ID" },
-	{ 0x011800, "Recovered data with error correction applied" },
-	{ 0x011801, "Recovered data with error correction and retries applied"},
-	{ 0x011802, "Recovered data - the data was auto-reallocated" },
-	{ 0x011803, "Recovered data with CIRC" },
-	{ 0x011804, "Recovered data with L-EC" },
-	{ 0x015d00, "Failure prediction threshold exceeded"
-		    " - Predicted logical unit failure" },
-	{ 0x015d01, "Failure prediction threshold exceeded"
-		    " - Predicted media failure" },
-	{ 0x015dff, "Failure prediction threshold exceeded - False" },
-	{ 0x017301, "Power calibration area almost full" },
-	{ 0x020400, "Logical unit not ready - cause not reportable" },
-	/* Following is misspelled in ATAPI 2.6, _and_ in Mt. Fuji */
-	{ 0x020401, "Logical unit not ready"
-		    " - in progress [sic] of becoming ready" },
-	{ 0x020402, "Logical unit not ready - initializing command required" },
-	{ 0x020403, "Logical unit not ready - manual intervention required" },
-	{ 0x020404, "Logical unit not ready - format in progress" },
-	{ 0x020407, "Logical unit not ready - operation in progress" },
-	{ 0x020408, "Logical unit not ready - long write in progress" },
-	{ 0x020600, "No reference position found (media may be upside down)" },
-	{ 0x023000, "Incompatible medium installed" },
-	{ 0x023a00, "Medium not present" },
-	{ 0x025300, "Media load or eject failed" },
-	{ 0x025700, "Unable to recover table of contents" },
-	{ 0x030300, "Peripheral device write fault" },
-	{ 0x030301, "No write current" },
-	{ 0x030302, "Excessive write errors" },
-	{ 0x030c00, "Write error" },
-	{ 0x030c01, "Write error - Recovered with auto reallocation" },
-	{ 0x030c02, "Write error - auto reallocation failed" },
-	{ 0x030c03, "Write error - recommend reassignment" },
-	{ 0x030c04, "Compression check miscompare error" },
-	{ 0x030c05, "Data expansion occurred during compress" },
-	{ 0x030c06, "Block not compressible" },
-	{ 0x030c07, "Write error - recovery needed" },
-	{ 0x030c08, "Write error - recovery failed" },
-	{ 0x030c09, "Write error - loss of streaming" },
-	{ 0x031100, "Unrecovered read error" },
-	{ 0x031106, "CIRC unrecovered error" },
-	{ 0x033101, "Format command failed" },
-	{ 0x033200, "No defect spare location available" },
-	{ 0x033201, "Defect list update failure" },
-	{ 0x035100, "Erase failure" },
-	{ 0x037200, "Session fixation error" },
-	{ 0x037201, "Session fixation error writin lead-in" },
-	{ 0x037202, "Session fixation error writin lead-out" },
-	{ 0x037300, "CD control error" },
-	{ 0x037302, "Power calibration area is full" },
-	{ 0x037303, "Power calibration area error" },
-	{ 0x037304, "Program memory area / RMA update failure" },
-	{ 0x037305, "Program memory area / RMA is full" },
-	{ 0x037306, "Program memory area / RMA is (almost) full" },
-	{ 0x040200, "No seek complete" },
-	{ 0x040300, "Write fault" },
-	{ 0x040900, "Track following error" },
-	{ 0x040901, "Tracking servo failure" },
-	{ 0x040902, "Focus servo failure" },
-	{ 0x040903, "Spindle servo failure" },
-	{ 0x041500, "Random positioning error" },
-	{ 0x041501, "Mechanical positioning or changer error" },
-	{ 0x041502, "Positioning error detected by read of medium" },
-	{ 0x043c00, "Mechanical positioning or changer error" },
-	{ 0x044000, "Diagnostic failure on component (ASCQ)" },
-	{ 0x044400, "Internal CD/DVD logical unit failure" },
-	{ 0x04b600, "Media load mechanism failed" },
-	{ 0x051a00, "Parameter list length error" },
-	{ 0x052000, "Invalid command operation code" },
-	{ 0x052100, "Logical block address out of range" },
-	{ 0x052102, "Invalid address for write" },
-	{ 0x052400, "Invalid field in command packet" },
-	{ 0x052600, "Invalid field in parameter list" },
-	{ 0x052601, "Parameter not supported" },
-	{ 0x052602, "Parameter value invalid" },
-	{ 0x052700, "Write protected media" },
-	{ 0x052c00, "Command sequence error" },
-	{ 0x052c03, "Current program area is not empty" },
-	{ 0x052c04, "Current program area is empty" },
-	{ 0x053001, "Cannot read medium - unknown format" },
-	{ 0x053002, "Cannot read medium - incompatible format" },
-	{ 0x053900, "Saving parameters not supported" },
-	{ 0x054e00, "Overlapped commands attempted" },
-	{ 0x055302, "Medium removal prevented" },
-	{ 0x055500, "System resource failure" },
-	{ 0x056300, "End of user area encountered on this track" },
-	{ 0x056400, "Illegal mode for this track or incompatible medium" },
-	{ 0x056f00, "Copy protection key exchange failure"
-		    " - Authentication failure" },
-	{ 0x056f01, "Copy protection key exchange failure - Key not present" },
-	{ 0x056f02, "Copy protection key exchange failure"
-		     " - Key not established" },
-	{ 0x056f03, "Read of scrambled sector without authentication" },
-	{ 0x056f04, "Media region code is mismatched to logical unit" },
-	{ 0x056f05, "Drive region must be permanent"
-		    " / region reset count error" },
-	{ 0x057203, "Session fixation error - incomplete track in session" },
-	{ 0x057204, "Empty or partially written reserved track" },
-	{ 0x057205, "No more RZONE reservations are allowed" },
-	{ 0x05bf00, "Loss of streaming" },
-	{ 0x062800, "Not ready to ready transition, medium may have changed" },
-	{ 0x062900, "Power on, reset or hardware reset occurred" },
-	{ 0x062a00, "Parameters changed" },
-	{ 0x062a01, "Mode parameters changed" },
-	{ 0x062e00, "Insufficient time for operation" },
-	{ 0x063f00, "Logical unit operating conditions have changed" },
-	{ 0x063f01, "Microcode has been changed" },
-	{ 0x065a00, "Operator request or state change input (unspecified)" },
-	{ 0x065a01, "Operator medium removal request" },
-	{ 0x0bb900, "Play operation aborted" },
-	/* Here we use 0xff for the key (not a valid key) to signify
-	 * that these can have _any_ key value associated with them... */
-	{ 0xff0401, "Logical unit is in process of becoming ready" },
-	{ 0xff0400, "Logical unit not ready, cause not reportable" },
-	{ 0xff0402, "Logical unit not ready, initializing command required" },
-	{ 0xff0403, "Logical unit not ready, manual intervention required" },
-	{ 0xff0500, "Logical unit does not respond to selection" },
-	{ 0xff0800, "Logical unit communication failure" },
-	{ 0xff0802, "Logical unit communication parity error" },
-	{ 0xff0801, "Logical unit communication time-out" },
-	{ 0xff2500, "Logical unit not supported" },
-	{ 0xff4c00, "Logical unit failed self-configuration" },
-	{ 0xff3e00, "Logical unit has not self-configured yet" },
-};
-
-void ide_cd_log_error(const char *name, struct request *failed_command,
-		      struct request_sense *sense)
-{
-	int i;
-	const char *s = "bad sense key!";
-	char buf[80];
-
-	printk(KERN_ERR "ATAPI device %s:\n", name);
-	if (sense->error_code == 0x70)
-		printk(KERN_CONT "  Error: ");
-	else if (sense->error_code == 0x71)
-		printk("  Deferred Error: ");
-	else if (sense->error_code == 0x7f)
-		printk(KERN_CONT "  Vendor-specific Error: ");
-	else
-		printk(KERN_CONT "  Unknown Error Type: ");
-
-	if (sense->sense_key < ARRAY_SIZE(sense_key_texts))
-		s = sense_key_texts[sense->sense_key];
-
-	printk(KERN_CONT "%s -- (Sense key=0x%02x)\n", s, sense->sense_key);
-
-	if (sense->asc == 0x40) {
-		sprintf(buf, "Diagnostic failure on component 0x%02x",
-			sense->ascq);
-		s = buf;
-	} else {
-		int lo = 0, mid, hi = ARRAY_SIZE(sense_data_texts);
-		unsigned long key = (sense->sense_key << 16);
-
-		key |= (sense->asc << 8);
-		if (!(sense->ascq >= 0x80 && sense->ascq <= 0xdd))
-			key |= sense->ascq;
-		s = NULL;
-
-		while (hi > lo) {
-			mid = (lo + hi) / 2;
-			if (sense_data_texts[mid].asc_ascq == key ||
-			    sense_data_texts[mid].asc_ascq == (0xff0000|key)) {
-				s = sense_data_texts[mid].text;
-				break;
-			} else if (sense_data_texts[mid].asc_ascq > key)
-				hi = mid;
-			else
-				lo = mid + 1;
-		}
-	}
-
-	if (s == NULL) {
-		if (sense->asc > 0x80)
-			s = "(vendor-specific error)";
-		else
-			s = "(reserved error code)";
-	}
-
-	printk(KERN_ERR "  %s -- (asc=0x%02x, ascq=0x%02x)\n",
-			s, sense->asc, sense->ascq);
-
-	if (failed_command != NULL) {
-		int lo = 0, mid, hi = ARRAY_SIZE(packet_command_texts);
-		s = NULL;
-
-		while (hi > lo) {
-			mid = (lo + hi) / 2;
-			if (packet_command_texts[mid].packet_command ==
-			    scsi_req(failed_command)->cmd[0]) {
-				s = packet_command_texts[mid].text;
-				break;
-			}
-			if (packet_command_texts[mid].packet_command >
-			    scsi_req(failed_command)->cmd[0])
-				hi = mid;
-			else
-				lo = mid + 1;
-		}
-
-		printk(KERN_ERR "  The failed \"%s\" packet command "
-				"was: \n  \"", s);
-		for (i = 0; i < BLK_MAX_CDB; i++)
-			printk(KERN_CONT "%02x ", scsi_req(failed_command)->cmd[i]);
-		printk(KERN_CONT "\"\n");
-	}
-
-	/* The SKSV bit specifies validity of the sense_key_specific
-	 * in the next two commands. It is bit 7 of the first byte.
-	 * In the case of NOT_READY, if SKSV is set the drive can
-	 * give us nice ETA readings.
-	 */
-	if (sense->sense_key == NOT_READY && (sense->sks[0] & 0x80)) {
-		int progress = (sense->sks[1] << 8 | sense->sks[2]) * 100;
-
-		printk(KERN_ERR "  Command is %02d%% complete\n",
-				progress / 0xffff);
-	}
-
-	if (sense->sense_key == ILLEGAL_REQUEST &&
-	    (sense->sks[0] & 0x80) != 0) {
-		printk(KERN_ERR "  Error in %s byte %d",
-				(sense->sks[0] & 0x40) != 0 ?
-				"command packet" : "command data",
-				(sense->sks[1] << 8) + sense->sks[2]);
-
-		if ((sense->sks[0] & 0x40) != 0)
-			printk(KERN_CONT " bit %d", sense->sks[0] & 0x07);
-
-		printk(KERN_CONT "\n");
-	}
-}
-#endif
diff --git a/drivers/ide/ide-cs.c b/drivers/ide/ide-cs.c
deleted file mode 100644
index f1e922e2479a..000000000000
--- a/drivers/ide/ide-cs.c
+++ /dev/null
@@ -1,364 +0,0 @@
-/*======================================================================
-
-    A driver for PCMCIA IDE/ATA disk cards
-
-    The contents of this file are subject to the Mozilla Public
-    License Version 1.1 (the "License"); you may not use this file
-    except in compliance with the License. You may obtain a copy of
-    the License at http://www.mozilla.org/MPL/
-
-    Software distributed under the License is distributed on an "AS
-    IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
-    implied. See the License for the specific language governing
-    rights and limitations under the License.
-
-    The initial developer of the original code is David A. Hinds
-    <dahinds@users.sourceforge.net>.  Portions created by David A. Hinds
-    are Copyright (C) 1999 David A. Hinds.  All Rights Reserved.
-
-    Alternatively, the contents of this file may be used under the
-    terms of the GNU General Public License version 2 (the "GPL"), in
-    which case the provisions of the GPL are applicable instead of the
-    above.  If you wish to allow the use of your version of this file
-    only under the terms of the GPL and not to allow others to use
-    your version of this file under the MPL, indicate your decision
-    by deleting the provisions above and replace them with the notice
-    and other provisions required by the GPL.  If you do not delete
-    the provisions above, a recipient may use your version of this
-    file under either the MPL or the GPL.
-
-======================================================================*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/timer.h>
-#include <linux/ioport.h>
-#include <linux/ide.h>
-#include <linux/major.h>
-#include <linux/delay.h>
-#include <asm/io.h>
-
-#include <pcmcia/cistpl.h>
-#include <pcmcia/ds.h>
-#include <pcmcia/cisreg.h>
-#include <pcmcia/ciscode.h>
-
-#define DRV_NAME "ide-cs"
-
-/*====================================================================*/
-
-/* Module parameters */
-
-MODULE_AUTHOR("David Hinds <dahinds@users.sourceforge.net>");
-MODULE_DESCRIPTION("PCMCIA ATA/IDE card driver");
-MODULE_LICENSE("Dual MPL/GPL");
-
-/*====================================================================*/
-
-typedef struct ide_info_t {
-	struct pcmcia_device	*p_dev;
-	struct ide_host		*host;
-	int			ndev;
-} ide_info_t;
-
-static void ide_release(struct pcmcia_device *);
-static int ide_config(struct pcmcia_device *);
-
-static void ide_detach(struct pcmcia_device *p_dev);
-
-static int ide_probe(struct pcmcia_device *link)
-{
-    ide_info_t *info;
-
-    dev_dbg(&link->dev, "ide_attach()\n");
-
-    /* Create new ide device */
-    info = kzalloc(sizeof(*info), GFP_KERNEL);
-    if (!info)
-	return -ENOMEM;
-
-    info->p_dev = link;
-    link->priv = info;
-
-    link->config_flags |= CONF_ENABLE_IRQ | CONF_AUTO_SET_IO |
-	    CONF_AUTO_SET_VPP | CONF_AUTO_CHECK_VCC;
-
-    return ide_config(link);
-} /* ide_attach */
-
-static void ide_detach(struct pcmcia_device *link)
-{
-    ide_info_t *info = link->priv;
-
-    dev_dbg(&link->dev, "ide_detach(0x%p)\n", link);
-
-    ide_release(link);
-
-    kfree(info);
-} /* ide_detach */
-
-static const struct ide_port_ops idecs_port_ops = {
-	.quirkproc		= ide_undecoded_slave,
-};
-
-static const struct ide_port_info idecs_port_info = {
-	.port_ops		= &idecs_port_ops,
-	.host_flags		= IDE_HFLAG_NO_DMA,
-	.irq_flags		= IRQF_SHARED,
-	.chipset		= ide_pci,
-};
-
-static struct ide_host *idecs_register(unsigned long io, unsigned long ctl,
-				unsigned long irq, struct pcmcia_device *handle)
-{
-    struct ide_host *host;
-    ide_hwif_t *hwif;
-    int i, rc;
-    struct ide_hw hw, *hws[] = { &hw };
-
-    if (!request_region(io, 8, DRV_NAME)) {
-	printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n",
-			DRV_NAME, io, io + 7);
-	return NULL;
-    }
-
-    if (!request_region(ctl, 1, DRV_NAME)) {
-	printk(KERN_ERR "%s: I/O resource 0x%lX not free.\n",
-			DRV_NAME, ctl);
-	release_region(io, 8);
-	return NULL;
-    }
-
-    memset(&hw, 0, sizeof(hw));
-    ide_std_init_ports(&hw, io, ctl);
-    hw.irq = irq;
-    hw.dev = &handle->dev;
-
-    rc = ide_host_add(&idecs_port_info, hws, 1, &host);
-    if (rc)
-	goto out_release;
-
-    hwif = host->ports[0];
-
-    if (hwif->present)
-	return host;
-
-    /* retry registration in case device is still spinning up */
-    for (i = 0; i < 10; i++) {
-	msleep(100);
-	ide_port_scan(hwif);
-	if (hwif->present)
-	    return host;
-    }
-
-    return host;
-
-out_release:
-    release_region(ctl, 1);
-    release_region(io, 8);
-    return NULL;
-}
-
-static int pcmcia_check_one_config(struct pcmcia_device *pdev, void *priv_data)
-{
-	int *is_kme = priv_data;
-
-	if ((pdev->resource[0]->flags & IO_DATA_PATH_WIDTH)
-	    != IO_DATA_PATH_WIDTH_8) {
-		pdev->resource[0]->flags &= ~IO_DATA_PATH_WIDTH;
-		pdev->resource[0]->flags |= IO_DATA_PATH_WIDTH_AUTO;
-	}
-	pdev->resource[1]->flags &= ~IO_DATA_PATH_WIDTH;
-	pdev->resource[1]->flags |= IO_DATA_PATH_WIDTH_8;
-
-	if (pdev->resource[1]->end) {
-		pdev->resource[0]->end = 8;
-		pdev->resource[1]->end = (*is_kme) ? 2 : 1;
-	} else {
-		if (pdev->resource[0]->end < 16)
-			return -ENODEV;
-	}
-
-	return pcmcia_request_io(pdev);
-}
-
-static int ide_config(struct pcmcia_device *link)
-{
-    ide_info_t *info = link->priv;
-    int ret = 0, is_kme = 0;
-    unsigned long io_base, ctl_base;
-    struct ide_host *host;
-
-    dev_dbg(&link->dev, "ide_config(0x%p)\n", link);
-
-    is_kme = ((link->manf_id == MANFID_KME) &&
-	      ((link->card_id == PRODID_KME_KXLC005_A) ||
-	       (link->card_id == PRODID_KME_KXLC005_B)));
-
-    if (pcmcia_loop_config(link, pcmcia_check_one_config, &is_kme)) {
-	    link->config_flags &= ~CONF_AUTO_CHECK_VCC;
-	    if (pcmcia_loop_config(link, pcmcia_check_one_config, &is_kme))
-		    goto failed; /* No suitable config found */
-    }
-    io_base = link->resource[0]->start;
-    if (link->resource[1]->end)
-	    ctl_base = link->resource[1]->start;
-    else
-	    ctl_base = link->resource[0]->start + 0x0e;
-
-    if (!link->irq)
-	    goto failed;
-
-    ret = pcmcia_enable_device(link);
-    if (ret)
-	    goto failed;
-
-    /* disable drive interrupts during IDE probe */
-    outb(0x02, ctl_base);
-
-    /* special setup for KXLC005 card */
-    if (is_kme)
-	outb(0x81, ctl_base+1);
-
-     host = idecs_register(io_base, ctl_base, link->irq, link);
-     if (host == NULL && resource_size(link->resource[0]) == 0x20) {
-	    outb(0x02, ctl_base + 0x10);
-	    host = idecs_register(io_base + 0x10, ctl_base + 0x10,
-				  link->irq, link);
-    }
-
-    if (host == NULL)
-	goto failed;
-
-    info->ndev = 1;
-    info->host = host;
-    dev_info(&link->dev, "ide-cs: hd%c: Vpp = %d.%d\n",
-	    'a' + host->ports[0]->index * 2,
-	    link->vpp / 10, link->vpp % 10);
-
-    return 0;
-
-failed:
-    ide_release(link);
-    return -ENODEV;
-} /* ide_config */
-
-static void ide_release(struct pcmcia_device *link)
-{
-    ide_info_t *info = link->priv;
-    struct ide_host *host = info->host;
-
-    dev_dbg(&link->dev, "ide_release(0x%p)\n", link);
-
-    if (info->ndev) {
-	ide_hwif_t *hwif = host->ports[0];
-	unsigned long data_addr, ctl_addr;
-
-	data_addr = hwif->io_ports.data_addr;
-	ctl_addr = hwif->io_ports.ctl_addr;
-
-	ide_host_remove(host);
-	info->ndev = 0;
-
-	release_region(ctl_addr, 1);
-	release_region(data_addr, 8);
-    }
-
-    pcmcia_disable_device(link);
-} /* ide_release */
-
-
-static const struct pcmcia_device_id ide_ids[] = {
-	PCMCIA_DEVICE_FUNC_ID(4),
-	PCMCIA_DEVICE_MANF_CARD(0x0000, 0x0000),	/* Corsair */
-	PCMCIA_DEVICE_MANF_CARD(0x0007, 0x0000),	/* Hitachi */
-	PCMCIA_DEVICE_MANF_CARD(0x000a, 0x0000),	/* I-O Data CFA */
-	PCMCIA_DEVICE_MANF_CARD(0x001c, 0x0001),	/* Mitsubishi CFA */
-	PCMCIA_DEVICE_MANF_CARD(0x0032, 0x0704),
-	PCMCIA_DEVICE_MANF_CARD(0x0032, 0x2904),
-	PCMCIA_DEVICE_MANF_CARD(0x0045, 0x0401),	/* SanDisk CFA */
-	PCMCIA_DEVICE_MANF_CARD(0x004f, 0x0000),	/* Kingston */
-	PCMCIA_DEVICE_MANF_CARD(0x0097, 0x1620), 	/* TI emulated */
-	PCMCIA_DEVICE_MANF_CARD(0x0098, 0x0000),	/* Toshiba */
-	PCMCIA_DEVICE_MANF_CARD(0x00a4, 0x002d),
-	PCMCIA_DEVICE_MANF_CARD(0x00ce, 0x0000),	/* Samsung */
-	PCMCIA_DEVICE_MANF_CARD(0x0319, 0x0000),	/* Hitachi */
-	PCMCIA_DEVICE_MANF_CARD(0x2080, 0x0001),
-	PCMCIA_DEVICE_MANF_CARD(0x4e01, 0x0100),	/* Viking CFA */
-	PCMCIA_DEVICE_MANF_CARD(0x4e01, 0x0200),	/* Lexar, Viking CFA */
-	PCMCIA_DEVICE_PROD_ID123("Caravelle", "PSC-IDE ", "PSC000", 0x8c36137c, 0xd0693ab8, 0x2768a9f0),
-	PCMCIA_DEVICE_PROD_ID123("CDROM", "IDE", "MCD-601p", 0x1b9179ca, 0xede88951, 0x0d902f74),
-	PCMCIA_DEVICE_PROD_ID123("PCMCIA", "IDE CARD", "F1", 0x281f1c5d, 0x1907960c, 0xf7fde8b9),
-	PCMCIA_DEVICE_PROD_ID12("ARGOSY", "CD-ROM", 0x78f308dc, 0x66536591),
-	PCMCIA_DEVICE_PROD_ID12("ARGOSY", "PnPIDE", 0x78f308dc, 0x0c694728),
-	PCMCIA_DEVICE_PROD_ID12("CNF   ", "CD-ROM", 0x46d7db81, 0x66536591),
-	PCMCIA_DEVICE_PROD_ID12("CNF CD-M", "CD-ROM", 0x7d93b852, 0x66536591),
-	PCMCIA_DEVICE_PROD_ID12("Creative Technology Ltd.", "PCMCIA CD-ROM Interface Card", 0xff8c8a45, 0xfe8020c4),
-	PCMCIA_DEVICE_PROD_ID12("Digital Equipment Corporation.", "Digital Mobile Media CD-ROM", 0x17692a66, 0xef1dcbde),
-	PCMCIA_DEVICE_PROD_ID12("EXP", "CD+GAME", 0x6f58c983, 0x63c13aaf),
-	PCMCIA_DEVICE_PROD_ID12("EXP   ", "CD-ROM", 0x0a5c52fd, 0x66536591),
-	PCMCIA_DEVICE_PROD_ID12("EXP   ", "PnPIDE", 0x0a5c52fd, 0x0c694728),
-	PCMCIA_DEVICE_PROD_ID12("FREECOM", "PCCARD-IDE", 0x5714cbf7, 0x48e0ab8e),
-	PCMCIA_DEVICE_PROD_ID12("HITACHI", "FLASH", 0xf4f43949, 0x9eb86aae),
-	PCMCIA_DEVICE_PROD_ID12("HITACHI", "microdrive", 0xf4f43949, 0xa6d76178),
-	PCMCIA_DEVICE_PROD_ID12("Hyperstone", "Model1", 0x3d5b9ef5, 0xca6ab420),
-	PCMCIA_DEVICE_PROD_ID12("IBM", "microdrive", 0xb569a6e5, 0xa6d76178),
-	PCMCIA_DEVICE_PROD_ID12("IBM", "IBM17JSSFP20", 0xb569a6e5, 0xf2508753),
-	PCMCIA_DEVICE_PROD_ID12("KINGSTON", "CF CARD 1GB", 0x2e6d1829, 0x55d5bffb),
-	PCMCIA_DEVICE_PROD_ID12("KINGSTON", "CF CARD 4GB", 0x2e6d1829, 0x531e7d10),
-	PCMCIA_DEVICE_PROD_ID12("KINGSTON", "CF8GB", 0x2e6d1829, 0xacbe682e),
-	PCMCIA_DEVICE_PROD_ID12("IO DATA", "CBIDE2      ", 0x547e66dc, 0x8671043b),
-	PCMCIA_DEVICE_PROD_ID12("IO DATA", "PCIDE", 0x547e66dc, 0x5c5ab149),
-	PCMCIA_DEVICE_PROD_ID12("IO DATA", "PCIDEII", 0x547e66dc, 0xb3662674),
-	PCMCIA_DEVICE_PROD_ID12("LOOKMEET", "CBIDE2      ", 0xe37be2b5, 0x8671043b),
-	PCMCIA_DEVICE_PROD_ID12("M-Systems", "CF300", 0x7ed2ad87, 0x7e9e78ee),
-	PCMCIA_DEVICE_PROD_ID12("M-Systems", "CF500", 0x7ed2ad87, 0x7a13045c),
-	PCMCIA_DEVICE_PROD_ID2("NinjaATA-", 0xebe0bd79),
-	PCMCIA_DEVICE_PROD_ID12("PCMCIA", "CD-ROM", 0x281f1c5d, 0x66536591),
-	PCMCIA_DEVICE_PROD_ID12("PCMCIA", "PnPIDE", 0x281f1c5d, 0x0c694728),
-	PCMCIA_DEVICE_PROD_ID12("SHUTTLE TECHNOLOGY LTD.", "PCCARD-IDE/ATAPI Adapter", 0x4a3f0ba0, 0x322560e1),
-	PCMCIA_DEVICE_PROD_ID12("SEAGATE", "ST1", 0x87c1b330, 0xe1f30883),
-	PCMCIA_DEVICE_PROD_ID12("SAMSUNG", "04/05/06", 0x43d74cb4, 0x6a22777d),
-	PCMCIA_DEVICE_PROD_ID12("SMI VENDOR", "SMI PRODUCT", 0x30896c92, 0x703cc5f6),
-	PCMCIA_DEVICE_PROD_ID12("TOSHIBA", "MK2001MPL", 0xb4585a1a, 0x3489e003),
-	PCMCIA_DEVICE_PROD_ID1("TRANSCEND    512M   ", 0xd0909443),
-	PCMCIA_DEVICE_PROD_ID12("TRANSCEND", "TS1GCF45", 0x709b1bf1, 0xf68b6f32),
-	PCMCIA_DEVICE_PROD_ID12("TRANSCEND", "TS1GCF80", 0x709b1bf1, 0x2a54d4b1),
-	PCMCIA_DEVICE_PROD_ID12("TRANSCEND", "TS2GCF120", 0x709b1bf1, 0x969aa4f2),
-	PCMCIA_DEVICE_PROD_ID12("TRANSCEND", "TS4GCF120", 0x709b1bf1, 0xf54a91c8),
-	PCMCIA_DEVICE_PROD_ID12("TRANSCEND", "TS4GCF133", 0x709b1bf1, 0x7558f133),
-	PCMCIA_DEVICE_PROD_ID12("TRANSCEND", "TS8GCF133", 0x709b1bf1, 0xb2f89b47),
-	PCMCIA_DEVICE_PROD_ID12("WIT", "IDE16", 0x244e5994, 0x3e232852),
-	PCMCIA_DEVICE_PROD_ID12("WEIDA", "TWTTI", 0xcc7cf69c, 0x212bb918),
-	PCMCIA_DEVICE_PROD_ID1("STI Flash", 0xe4a13209),
-	PCMCIA_DEVICE_PROD_ID12("STI", "Flash 5.0", 0xbf2df18d, 0x8cb57a0e),
-	PCMCIA_MFC_DEVICE_PROD_ID12(1, "SanDisk", "ConnectPlus", 0x7a954bd9, 0x74be00c6),
-	PCMCIA_DEVICE_PROD_ID2("Flash Card", 0x5a362506),
-	PCMCIA_DEVICE_NULL,
-};
-MODULE_DEVICE_TABLE(pcmcia, ide_ids);
-
-static struct pcmcia_driver ide_cs_driver = {
-	.owner		= THIS_MODULE,
-	.name		= "ide-cs",
-	.probe		= ide_probe,
-	.remove		= ide_detach,
-	.id_table       = ide_ids,
-};
-
-static int __init init_ide_cs(void)
-{
-	return pcmcia_register_driver(&ide_cs_driver);
-}
-
-static void __exit exit_ide_cs(void)
-{
-	pcmcia_unregister_driver(&ide_cs_driver);
-}
-
-late_initcall(init_ide_cs);
-module_exit(exit_ide_cs);
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
deleted file mode 100644
index ca1d4b3d3878..000000000000
--- a/drivers/ide/ide-devsets.c
+++ /dev/null
@@ -1,192 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/ide.h>
-
-DEFINE_MUTEX(ide_setting_mtx);
-
-ide_devset_get(io_32bit, io_32bit);
-
-static int set_io_32bit(ide_drive_t *drive, int arg)
-{
-	if (drive->dev_flags & IDE_DFLAG_NO_IO_32BIT)
-		return -EPERM;
-
-	if (arg < 0 || arg > 1 + (SUPPORT_VLB_SYNC << 1))
-		return -EINVAL;
-
-	drive->io_32bit = arg;
-
-	return 0;
-}
-
-ide_devset_get_flag(ksettings, IDE_DFLAG_KEEP_SETTINGS);
-
-static int set_ksettings(ide_drive_t *drive, int arg)
-{
-	if (arg < 0 || arg > 1)
-		return -EINVAL;
-
-	if (arg)
-		drive->dev_flags |= IDE_DFLAG_KEEP_SETTINGS;
-	else
-		drive->dev_flags &= ~IDE_DFLAG_KEEP_SETTINGS;
-
-	return 0;
-}
-
-ide_devset_get_flag(using_dma, IDE_DFLAG_USING_DMA);
-
-static int set_using_dma(ide_drive_t *drive, int arg)
-{
-#ifdef CONFIG_BLK_DEV_IDEDMA
-	int err = -EPERM;
-
-	if (arg < 0 || arg > 1)
-		return -EINVAL;
-
-	if (ata_id_has_dma(drive->id) == 0)
-		goto out;
-
-	if (drive->hwif->dma_ops == NULL)
-		goto out;
-
-	err = 0;
-
-	if (arg) {
-		if (ide_set_dma(drive))
-			err = -EIO;
-	} else
-		ide_dma_off(drive);
-
-out:
-	return err;
-#else
-	if (arg < 0 || arg > 1)
-		return -EINVAL;
-
-	return -EPERM;
-#endif
-}
-
-/*
- * handle HDIO_SET_PIO_MODE ioctl abusers here, eventually it will go away
- */
-static int set_pio_mode_abuse(ide_hwif_t *hwif, u8 req_pio)
-{
-	switch (req_pio) {
-	case 202:
-	case 201:
-	case 200:
-	case 102:
-	case 101:
-	case 100:
-		return (hwif->host_flags & IDE_HFLAG_ABUSE_DMA_MODES) ? 1 : 0;
-	case 9:
-	case 8:
-		return (hwif->host_flags & IDE_HFLAG_ABUSE_PREFETCH) ? 1 : 0;
-	case 7:
-	case 6:
-		return (hwif->host_flags & IDE_HFLAG_ABUSE_FAST_DEVSEL) ? 1 : 0;
-	default:
-		return 0;
-	}
-}
-
-static int set_pio_mode(ide_drive_t *drive, int arg)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-
-	if (arg < 0 || arg > 255)
-		return -EINVAL;
-
-	if (port_ops == NULL || port_ops->set_pio_mode == NULL ||
-	    (hwif->host_flags & IDE_HFLAG_NO_SET_MODE))
-		return -ENOSYS;
-
-	if (set_pio_mode_abuse(drive->hwif, arg)) {
-		drive->pio_mode = arg + XFER_PIO_0;
-
-		if (arg == 8 || arg == 9) {
-			unsigned long flags;
-
-			/* take lock for IDE_DFLAG_[NO_]UNMASK/[NO_]IO_32BIT */
-			spin_lock_irqsave(&hwif->lock, flags);
-			port_ops->set_pio_mode(hwif, drive);
-			spin_unlock_irqrestore(&hwif->lock, flags);
-		} else
-			port_ops->set_pio_mode(hwif, drive);
-	} else {
-		int keep_dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
-
-		ide_set_pio(drive, arg);
-
-		if (hwif->host_flags & IDE_HFLAG_SET_PIO_MODE_KEEP_DMA) {
-			if (keep_dma)
-				ide_dma_on(drive);
-		}
-	}
-
-	return 0;
-}
-
-ide_devset_get_flag(unmaskirq, IDE_DFLAG_UNMASK);
-
-static int set_unmaskirq(ide_drive_t *drive, int arg)
-{
-	if (drive->dev_flags & IDE_DFLAG_NO_UNMASK)
-		return -EPERM;
-
-	if (arg < 0 || arg > 1)
-		return -EINVAL;
-
-	if (arg)
-		drive->dev_flags |= IDE_DFLAG_UNMASK;
-	else
-		drive->dev_flags &= ~IDE_DFLAG_UNMASK;
-
-	return 0;
-}
-
-ide_ext_devset_rw_sync(io_32bit, io_32bit);
-ide_ext_devset_rw_sync(keepsettings, ksettings);
-ide_ext_devset_rw_sync(unmaskirq, unmaskirq);
-ide_ext_devset_rw_sync(using_dma, using_dma);
-__IDE_DEVSET(pio_mode, DS_SYNC, NULL, set_pio_mode);
-
-int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
-		       int arg)
-{
-	struct request_queue *q = drive->queue;
-	struct request *rq;
-	int ret = 0;
-
-	if (!(setting->flags & DS_SYNC))
-		return setting->set(drive, arg);
-
-	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
-	ide_req(rq)->type = ATA_PRIV_MISC;
-	scsi_req(rq)->cmd_len = 5;
-	scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
-	*(int *)&scsi_req(rq)->cmd[1] = arg;
-	ide_req(rq)->special = setting->set;
-
-	blk_execute_rq(NULL, rq, 0);
-	ret = scsi_req(rq)->result;
-	blk_put_request(rq);
-
-	return ret;
-}
-
-ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
-{
-	int err, (*setfunc)(ide_drive_t *, int) = ide_req(rq)->special;
-
-	err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
-	if (err)
-		scsi_req(rq)->result = err;
-	ide_complete_rq(drive, 0, blk_rq_bytes(rq));
-	return ide_stopped;
-}
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
deleted file mode 100644
index 8413731c6259..000000000000
--- a/drivers/ide/ide-disk.c
+++ /dev/null
@@ -1,795 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *  Copyright (C) 1994-1998	   Linus Torvalds & authors (see below)
- *  Copyright (C) 1998-2002	   Linux ATA Development
- *				      Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2003		   Red Hat
- *  Copyright (C) 2003-2005, 2007  Bartlomiej Zolnierkiewicz
- */
-
-/*
- *  Mostly written by Mark Lord <mlord@pobox.com>
- *                and Gadi Oxman <gadio@netvision.net.il>
- *                and Andre Hedrick <andre@linux-ide.org>
- *
- * This is the IDE/ATA disk driver, as evolved from hd.c and ide.c.
- */
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/major.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/mutex.h>
-#include <linux/leds.h>
-#include <linux/ide.h>
-
-#include <asm/byteorder.h>
-#include <asm/irq.h>
-#include <linux/uaccess.h>
-#include <asm/io.h>
-#include <asm/div64.h>
-
-#include "ide-disk.h"
-
-static const u8 ide_rw_cmds[] = {
-	ATA_CMD_READ_MULTI,
-	ATA_CMD_WRITE_MULTI,
-	ATA_CMD_READ_MULTI_EXT,
-	ATA_CMD_WRITE_MULTI_EXT,
-	ATA_CMD_PIO_READ,
-	ATA_CMD_PIO_WRITE,
-	ATA_CMD_PIO_READ_EXT,
-	ATA_CMD_PIO_WRITE_EXT,
-	ATA_CMD_READ,
-	ATA_CMD_WRITE,
-	ATA_CMD_READ_EXT,
-	ATA_CMD_WRITE_EXT,
-};
-
-static void ide_tf_set_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 dma)
-{
-	u8 index, lba48, write;
-
-	lba48 = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 2 : 0;
-	write = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 0;
-
-	if (dma) {
-		cmd->protocol = ATA_PROT_DMA;
-		index = 8;
-	} else {
-		cmd->protocol = ATA_PROT_PIO;
-		if (drive->mult_count) {
-			cmd->tf_flags |= IDE_TFLAG_MULTI_PIO;
-			index = 0;
-		} else
-			index = 4;
-	}
-
-	cmd->tf.command = ide_rw_cmds[index + lba48 + write];
-}
-
-/*
- * __ide_do_rw_disk() issues READ and WRITE commands to a disk,
- * using LBA if supported, or CHS otherwise, to address sectors.
- */
-static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
-					sector_t block)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	u16 nsectors		= (u16)blk_rq_sectors(rq);
-	u8 lba48		= !!(drive->dev_flags & IDE_DFLAG_LBA48);
-	u8 dma			= !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
-	struct ide_cmd		cmd;
-	struct ide_taskfile	*tf = &cmd.tf;
-	ide_startstop_t		rc;
-
-	if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && lba48 && dma) {
-		if (block + blk_rq_sectors(rq) > 1ULL << 28)
-			dma = 0;
-		else
-			lba48 = 0;
-	}
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-
-	if (drive->dev_flags & IDE_DFLAG_LBA) {
-		if (lba48) {
-			pr_debug("%s: LBA=0x%012llx\n", drive->name,
-					(unsigned long long)block);
-
-			tf->nsect  = nsectors & 0xff;
-			tf->lbal   = (u8) block;
-			tf->lbam   = (u8)(block >>  8);
-			tf->lbah   = (u8)(block >> 16);
-			tf->device = ATA_LBA;
-
-			tf = &cmd.hob;
-			tf->nsect = (nsectors >> 8) & 0xff;
-			tf->lbal  = (u8)(block >> 24);
-			if (sizeof(block) != 4) {
-				tf->lbam = (u8)((u64)block >> 32);
-				tf->lbah = (u8)((u64)block >> 40);
-			}
-
-			cmd.valid.out.hob = IDE_VALID_OUT_HOB;
-			cmd.valid.in.hob  = IDE_VALID_IN_HOB;
-			cmd.tf_flags |= IDE_TFLAG_LBA48;
-		} else {
-			tf->nsect  = nsectors & 0xff;
-			tf->lbal   = block;
-			tf->lbam   = block >>= 8;
-			tf->lbah   = block >>= 8;
-			tf->device = ((block >> 8) & 0xf) | ATA_LBA;
-		}
-	} else {
-		unsigned int sect, head, cyl, track;
-
-		track = (int)block / drive->sect;
-		sect  = (int)block % drive->sect + 1;
-		head  = track % drive->head;
-		cyl   = track / drive->head;
-
-		pr_debug("%s: CHS=%u/%u/%u\n", drive->name, cyl, head, sect);
-
-		tf->nsect  = nsectors & 0xff;
-		tf->lbal   = sect;
-		tf->lbam   = cyl;
-		tf->lbah   = cyl >> 8;
-		tf->device = head;
-	}
-
-	cmd.tf_flags |= IDE_TFLAG_FS;
-
-	if (rq_data_dir(rq))
-		cmd.tf_flags |= IDE_TFLAG_WRITE;
-
-	ide_tf_set_cmd(drive, &cmd, dma);
-	cmd.rq = rq;
-
-	if (dma == 0) {
-		ide_init_sg_cmd(&cmd, nsectors << 9);
-		ide_map_sg(drive, &cmd);
-	}
-
-	rc = do_rw_taskfile(drive, &cmd);
-
-	if (rc == ide_stopped && dma) {
-		/* fallback to PIO */
-		cmd.tf_flags |= IDE_TFLAG_DMA_PIO_FALLBACK;
-		ide_tf_set_cmd(drive, &cmd, 0);
-		ide_init_sg_cmd(&cmd, nsectors << 9);
-		rc = do_rw_taskfile(drive, &cmd);
-	}
-
-	return rc;
-}
-
-/*
- * 268435455  == 137439 MB or 28bit limit
- * 320173056  == 163929 MB or 48bit addressing
- * 1073741822 == 549756 MB or 48bit addressing fake drive
- */
-
-static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
-				      sector_t block)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	BUG_ON(drive->dev_flags & IDE_DFLAG_BLOCKED);
-	BUG_ON(blk_rq_is_passthrough(rq));
-
-	ledtrig_disk_activity(rq_data_dir(rq) == WRITE);
-
-	pr_debug("%s: %sing: block=%llu, sectors=%u\n",
-		 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
-		 (unsigned long long)block, blk_rq_sectors(rq));
-
-	if (hwif->rw_disk)
-		hwif->rw_disk(drive, rq);
-
-	return __ide_do_rw_disk(drive, rq, block);
-}
-
-/*
- * Queries for true maximum capacity of the drive.
- * Returns maximum LBA address (> 0) of the drive, 0 if failed.
- */
-static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48)
-{
-	struct ide_cmd cmd;
-	struct ide_taskfile *tf = &cmd.tf;
-	u64 addr = 0;
-
-	memset(&cmd, 0, sizeof(cmd));
-	if (lba48)
-		tf->command = ATA_CMD_READ_NATIVE_MAX_EXT;
-	else
-		tf->command = ATA_CMD_READ_NATIVE_MAX;
-	tf->device  = ATA_LBA;
-
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	if (lba48) {
-		cmd.valid.out.hob = IDE_VALID_OUT_HOB;
-		cmd.valid.in.hob  = IDE_VALID_IN_HOB;
-		cmd.tf_flags = IDE_TFLAG_LBA48;
-	}
-
-	ide_no_data_taskfile(drive, &cmd);
-
-	/* if OK, compute maximum address value */
-	if (!(tf->status & ATA_ERR))
-		addr = ide_get_lba_addr(&cmd, lba48) + 1;
-
-	return addr;
-}
-
-/*
- * Sets maximum virtual LBA address of the drive.
- * Returns new maximum virtual LBA address (> 0) or 0 on failure.
- */
-static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48)
-{
-	struct ide_cmd cmd;
-	struct ide_taskfile *tf = &cmd.tf;
-	u64 addr_set = 0;
-
-	addr_req--;
-
-	memset(&cmd, 0, sizeof(cmd));
-	tf->lbal     = (addr_req >>  0) & 0xff;
-	tf->lbam     = (addr_req >>= 8) & 0xff;
-	tf->lbah     = (addr_req >>= 8) & 0xff;
-	if (lba48) {
-		cmd.hob.lbal = (addr_req >>= 8) & 0xff;
-		cmd.hob.lbam = (addr_req >>= 8) & 0xff;
-		cmd.hob.lbah = (addr_req >>= 8) & 0xff;
-		tf->command  = ATA_CMD_SET_MAX_EXT;
-	} else {
-		tf->device   = (addr_req >>= 8) & 0x0f;
-		tf->command  = ATA_CMD_SET_MAX;
-	}
-	tf->device |= ATA_LBA;
-
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	if (lba48) {
-		cmd.valid.out.hob = IDE_VALID_OUT_HOB;
-		cmd.valid.in.hob  = IDE_VALID_IN_HOB;
-		cmd.tf_flags = IDE_TFLAG_LBA48;
-	}
-
-	ide_no_data_taskfile(drive, &cmd);
-
-	/* if OK, compute maximum address value */
-	if (!(tf->status & ATA_ERR))
-		addr_set = ide_get_lba_addr(&cmd, lba48) + 1;
-
-	return addr_set;
-}
-
-static unsigned long long sectors_to_MB(unsigned long long n)
-{
-	n <<= 9;		/* make it bytes */
-	do_div(n, 1000000);	/* make it MB */
-	return n;
-}
-
-/*
- * Some disks report total number of sectors instead of
- * maximum sector address.  We list them here.
- */
-static const struct drive_list_entry hpa_list[] = {
-	{ "ST340823A",	NULL },
-	{ "ST320413A",	NULL },
-	{ "ST310211A",	NULL },
-	{ NULL,		NULL }
-};
-
-static u64 ide_disk_hpa_get_native_capacity(ide_drive_t *drive, int lba48)
-{
-	u64 capacity, set_max;
-
-	capacity = drive->capacity64;
-	set_max  = idedisk_read_native_max_address(drive, lba48);
-
-	if (ide_in_drive_list(drive->id, hpa_list)) {
-		/*
-		 * Since we are inclusive wrt to firmware revisions do this
-		 * extra check and apply the workaround only when needed.
-		 */
-		if (set_max == capacity + 1)
-			set_max--;
-	}
-
-	return set_max;
-}
-
-static u64 ide_disk_hpa_set_capacity(ide_drive_t *drive, u64 set_max, int lba48)
-{
-	set_max = idedisk_set_max_address(drive, set_max, lba48);
-	if (set_max)
-		drive->capacity64 = set_max;
-
-	return set_max;
-}
-
-static void idedisk_check_hpa(ide_drive_t *drive)
-{
-	u64 capacity, set_max;
-	int lba48 = ata_id_lba48_enabled(drive->id);
-
-	capacity = drive->capacity64;
-	set_max  = ide_disk_hpa_get_native_capacity(drive, lba48);
-
-	if (set_max <= capacity)
-		return;
-
-	drive->probed_capacity = set_max;
-
-	printk(KERN_INFO "%s: Host Protected Area detected.\n"
-			 "\tcurrent capacity is %llu sectors (%llu MB)\n"
-			 "\tnative  capacity is %llu sectors (%llu MB)\n",
-			 drive->name,
-			 capacity, sectors_to_MB(capacity),
-			 set_max, sectors_to_MB(set_max));
-
-	if ((drive->dev_flags & IDE_DFLAG_NOHPA) == 0)
-		return;
-
-	set_max = ide_disk_hpa_set_capacity(drive, set_max, lba48);
-	if (set_max)
-		printk(KERN_INFO "%s: Host Protected Area disabled.\n",
-				 drive->name);
-}
-
-static int ide_disk_get_capacity(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-	int lba;
-
-	if (ata_id_lba48_enabled(id)) {
-		/* drive speaks 48-bit LBA */
-		lba = 1;
-		drive->capacity64 = ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
-	} else if (ata_id_has_lba(id) && ata_id_is_lba_capacity_ok(id)) {
-		/* drive speaks 28-bit LBA */
-		lba = 1;
-		drive->capacity64 = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
-	} else {
-		/* drive speaks boring old 28-bit CHS */
-		lba = 0;
-		drive->capacity64 = drive->cyl * drive->head * drive->sect;
-	}
-
-	drive->probed_capacity = drive->capacity64;
-
-	if (lba) {
-		drive->dev_flags |= IDE_DFLAG_LBA;
-
-		/*
-		* If this device supports the Host Protected Area feature set,
-		* then we may need to change our opinion about its capacity.
-		*/
-		if (ata_id_hpa_enabled(id))
-			idedisk_check_hpa(drive);
-	}
-
-	/* limit drive capacity to 137GB if LBA48 cannot be used */
-	if ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 &&
-	    drive->capacity64 > 1ULL << 28) {
-		printk(KERN_WARNING "%s: cannot use LBA48 - full capacity "
-		       "%llu sectors (%llu MB)\n",
-		       drive->name, (unsigned long long)drive->capacity64,
-		       sectors_to_MB(drive->capacity64));
-		drive->probed_capacity = drive->capacity64 = 1ULL << 28;
-	}
-
-	if ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) &&
-	    (drive->dev_flags & IDE_DFLAG_LBA48)) {
-		if (drive->capacity64 > 1ULL << 28) {
-			printk(KERN_INFO "%s: cannot use LBA48 DMA - PIO mode"
-					 " will be used for accessing sectors "
-					 "> %u\n", drive->name, 1 << 28);
-		} else
-			drive->dev_flags &= ~IDE_DFLAG_LBA48;
-	}
-
-	return 0;
-}
-
-static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-	int lba48 = ata_id_lba48_enabled(id);
-
-	if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 ||
-	    ata_id_hpa_enabled(id) == 0)
-		return;
-
-	/*
-	 * according to the spec the SET MAX ADDRESS command shall be
-	 * immediately preceded by a READ NATIVE MAX ADDRESS command
-	 */
-	if (!ide_disk_hpa_get_native_capacity(drive, lba48))
-		return;
-
-	if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48))
-		drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
-}
-
-static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq)
-{
-	struct ide_cmd *cmd;
-
-	if (req_op(rq) != REQ_OP_FLUSH)
-		return true;
-
-	if (ide_req(rq)->special) {
-		cmd = ide_req(rq)->special;
-		memset(cmd, 0, sizeof(*cmd));
-	} else {
-		cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC);
-	}
-
-	/* FIXME: map struct ide_taskfile on rq->cmd[] */
-	BUG_ON(cmd == NULL);
-
-	if (ata_id_flush_ext_enabled(drive->id) &&
-	    (drive->capacity64 >= (1UL << 28)))
-		cmd->tf.command = ATA_CMD_FLUSH_EXT;
-	else
-		cmd->tf.command = ATA_CMD_FLUSH;
-	cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd->tf_flags = IDE_TFLAG_DYN;
-	cmd->protocol = ATA_PROT_NODATA;
-	rq->cmd_flags &= ~REQ_OP_MASK;
-	rq->cmd_flags |= REQ_OP_DRV_OUT;
-	ide_req(rq)->type = ATA_PRIV_TASKFILE;
-	ide_req(rq)->special = cmd;
-	cmd->rq = rq;
-
-	return true;
-}
-
-ide_devset_get(multcount, mult_count);
-
-/*
- * This is tightly woven into the driver->do_special can not touch.
- * DON'T do it again until a total personality rewrite is committed.
- */
-static int set_multcount(ide_drive_t *drive, int arg)
-{
-	struct request *rq;
-
-	if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff))
-		return -EINVAL;
-
-	if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
-		return -EBUSY;
-
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
-	ide_req(rq)->type = ATA_PRIV_TASKFILE;
-
-	drive->mult_req = arg;
-	drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
-	blk_execute_rq(NULL, rq, 0);
-	blk_put_request(rq);
-
-	return (drive->mult_count == arg) ? 0 : -EIO;
-}
-
-ide_devset_get_flag(nowerr, IDE_DFLAG_NOWERR);
-
-static int set_nowerr(ide_drive_t *drive, int arg)
-{
-	if (arg < 0 || arg > 1)
-		return -EINVAL;
-
-	if (arg)
-		drive->dev_flags |= IDE_DFLAG_NOWERR;
-	else
-		drive->dev_flags &= ~IDE_DFLAG_NOWERR;
-
-	drive->bad_wstat = arg ? BAD_R_STAT : BAD_W_STAT;
-
-	return 0;
-}
-
-static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect)
-{
-	struct ide_cmd cmd;
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.tf.feature = feature;
-	cmd.tf.nsect   = nsect;
-	cmd.tf.command = ATA_CMD_SET_FEATURES;
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-
-	return ide_no_data_taskfile(drive, &cmd);
-}
-
-static void update_flush(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-	bool wc = false;
-
-	if (drive->dev_flags & IDE_DFLAG_WCACHE) {
-		unsigned long long capacity;
-		int barrier;
-		/*
-		 * We must avoid issuing commands a drive does not
-		 * understand or we may crash it. We check flush cache
-		 * is supported. We also check we have the LBA48 flush
-		 * cache if the drive capacity is too large. By this
-		 * time we have trimmed the drive capacity if LBA48 is
-		 * not available so we don't need to recheck that.
-		 */
-		capacity = ide_gd_capacity(drive);
-		barrier = ata_id_flush_enabled(id) &&
-			(drive->dev_flags & IDE_DFLAG_NOFLUSH) == 0 &&
-			((drive->dev_flags & IDE_DFLAG_LBA48) == 0 ||
-			 capacity <= (1ULL << 28) ||
-			 ata_id_flush_ext_enabled(id));
-
-		printk(KERN_INFO "%s: cache flushes %ssupported\n",
-		       drive->name, barrier ? "" : "not ");
-
-		if (barrier) {
-			wc = true;
-			drive->prep_rq = idedisk_prep_rq;
-		}
-	}
-
-	blk_queue_write_cache(drive->queue, wc, false);
-}
-
-ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
-
-static int set_wcache(ide_drive_t *drive, int arg)
-{
-	int err = 1;
-
-	if (arg < 0 || arg > 1)
-		return -EINVAL;
-
-	if (ata_id_flush_enabled(drive->id)) {
-		err = ide_do_setfeature(drive,
-			arg ? SETFEATURES_WC_ON : SETFEATURES_WC_OFF, 0);
-		if (err == 0) {
-			if (arg)
-				drive->dev_flags |= IDE_DFLAG_WCACHE;
-			else
-				drive->dev_flags &= ~IDE_DFLAG_WCACHE;
-		}
-	}
-
-	update_flush(drive);
-
-	return err;
-}
-
-static int do_idedisk_flushcache(ide_drive_t *drive)
-{
-	struct ide_cmd cmd;
-
-	memset(&cmd, 0, sizeof(cmd));
-	if (ata_id_flush_ext_enabled(drive->id))
-		cmd.tf.command = ATA_CMD_FLUSH_EXT;
-	else
-		cmd.tf.command = ATA_CMD_FLUSH;
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-
-	return ide_no_data_taskfile(drive, &cmd);
-}
-
-ide_devset_get(acoustic, acoustic);
-
-static int set_acoustic(ide_drive_t *drive, int arg)
-{
-	if (arg < 0 || arg > 254)
-		return -EINVAL;
-
-	ide_do_setfeature(drive,
-		arg ? SETFEATURES_AAM_ON : SETFEATURES_AAM_OFF, arg);
-
-	drive->acoustic = arg;
-
-	return 0;
-}
-
-ide_devset_get_flag(addressing, IDE_DFLAG_LBA48);
-
-/*
- * drive->addressing:
- *	0: 28-bit
- *	1: 48-bit
- *	2: 48-bit capable doing 28-bit
- */
-static int set_addressing(ide_drive_t *drive, int arg)
-{
-	if (arg < 0 || arg > 2)
-		return -EINVAL;
-
-	if (arg && ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48) ||
-	    ata_id_lba48_enabled(drive->id) == 0))
-		return -EIO;
-
-	if (arg == 2)
-		arg = 0;
-
-	if (arg)
-		drive->dev_flags |= IDE_DFLAG_LBA48;
-	else
-		drive->dev_flags &= ~IDE_DFLAG_LBA48;
-
-	return 0;
-}
-
-ide_ext_devset_rw(acoustic, acoustic);
-ide_ext_devset_rw(address, addressing);
-ide_ext_devset_rw(multcount, multcount);
-ide_ext_devset_rw(wcache, wcache);
-
-ide_ext_devset_rw_sync(nowerr, nowerr);
-
-static int ide_disk_check(ide_drive_t *drive, const char *s)
-{
-	return 1;
-}
-
-static void ide_disk_setup(ide_drive_t *drive)
-{
-	struct ide_disk_obj *idkp = drive->driver_data;
-	struct request_queue *q = drive->queue;
-	ide_hwif_t *hwif = drive->hwif;
-	u16 *id = drive->id;
-	char *m = (char *)&id[ATA_ID_PROD];
-	unsigned long long capacity;
-
-	ide_proc_register_driver(drive, idkp->driver);
-
-	if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0)
-		return;
-
-	if (drive->dev_flags & IDE_DFLAG_REMOVABLE) {
-		/*
-		 * Removable disks (eg. SYQUEST); ignore 'WD' drives
-		 */
-		if (m[0] != 'W' || m[1] != 'D')
-			drive->dev_flags |= IDE_DFLAG_DOORLOCKING;
-	}
-
-	(void)set_addressing(drive, 1);
-
-	if (drive->dev_flags & IDE_DFLAG_LBA48) {
-		int max_s = 2048;
-
-		if (max_s > hwif->rqsize)
-			max_s = hwif->rqsize;
-
-		blk_queue_max_hw_sectors(q, max_s);
-	}
-
-	printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name,
-	       queue_max_sectors(q) / 2);
-
-	if (ata_id_is_ssd(id)) {
-		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
-		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
-	}
-
-	/* calculate drive capacity, and select LBA if possible */
-	ide_disk_get_capacity(drive);
-
-	/*
-	 * if possible, give fdisk access to more of the drive,
-	 * by correcting bios_cyls:
-	 */
-	capacity = ide_gd_capacity(drive);
-
-	if ((drive->dev_flags & IDE_DFLAG_FORCED_GEOM) == 0) {
-		if (ata_id_lba48_enabled(drive->id)) {
-			/* compatibility */
-			drive->bios_sect = 63;
-			drive->bios_head = 255;
-		}
-
-		if (drive->bios_sect && drive->bios_head) {
-			unsigned int cap0 = capacity; /* truncate to 32 bits */
-			unsigned int cylsz, cyl;
-
-			if (cap0 != capacity)
-				drive->bios_cyl = 65535;
-			else {
-				cylsz = drive->bios_sect * drive->bios_head;
-				cyl = cap0 / cylsz;
-				if (cyl > 65535)
-					cyl = 65535;
-				if (cyl > drive->bios_cyl)
-					drive->bios_cyl = cyl;
-			}
-		}
-	}
-	printk(KERN_INFO "%s: %llu sectors (%llu MB)",
-			 drive->name, capacity, sectors_to_MB(capacity));
-
-	/* Only print cache size when it was specified */
-	if (id[ATA_ID_BUF_SIZE])
-		printk(KERN_CONT " w/%dKiB Cache", id[ATA_ID_BUF_SIZE] / 2);
-
-	printk(KERN_CONT ", CHS=%d/%d/%d\n",
-			 drive->bios_cyl, drive->bios_head, drive->bios_sect);
-
-	/* write cache enabled? */
-	if ((id[ATA_ID_CSFO] & 1) || ata_id_wcache_enabled(id))
-		drive->dev_flags |= IDE_DFLAG_WCACHE;
-
-	set_wcache(drive, 1);
-
-	if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 &&
-	    (drive->head == 0 || drive->head > 16))
-		printk(KERN_ERR "%s: invalid geometry: %d physical heads?\n",
-			drive->name, drive->head);
-}
-
-static void ide_disk_flush(ide_drive_t *drive)
-{
-	if (ata_id_flush_enabled(drive->id) == 0 ||
-	    (drive->dev_flags & IDE_DFLAG_WCACHE) == 0)
-		return;
-
-	if (do_idedisk_flushcache(drive))
-		printk(KERN_INFO "%s: wcache flush failed!\n", drive->name);
-}
-
-static int ide_disk_init_media(ide_drive_t *drive, struct gendisk *disk)
-{
-	return 0;
-}
-
-static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk,
-				 int on)
-{
-	struct ide_cmd cmd;
-	int ret;
-
-	if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0)
-		return 0;
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK;
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-
-	ret = ide_no_data_taskfile(drive, &cmd);
-
-	if (ret)
-		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
-
-	return ret;
-}
-
-const struct ide_disk_ops ide_ata_disk_ops = {
-	.check			= ide_disk_check,
-	.unlock_native_capacity	= ide_disk_unlock_native_capacity,
-	.get_capacity		= ide_disk_get_capacity,
-	.setup			= ide_disk_setup,
-	.flush			= ide_disk_flush,
-	.init_media		= ide_disk_init_media,
-	.set_doorlock		= ide_disk_set_doorlock,
-	.do_request		= ide_do_rw_disk,
-	.ioctl			= ide_disk_ioctl,
-	.compat_ioctl		= ide_disk_ioctl,
-};
diff --git a/drivers/ide/ide-disk.h b/drivers/ide/ide-disk.h
deleted file mode 100644
index 0e8cc18bfda6..000000000000
--- a/drivers/ide/ide-disk.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __IDE_DISK_H
-#define __IDE_DISK_H
-
-#include "ide-gd.h"
-
-#ifdef CONFIG_IDE_GD_ATA
-/* ide-disk.c */
-extern const struct ide_disk_ops ide_ata_disk_ops;
-ide_decl_devset(address);
-ide_decl_devset(multcount);
-ide_decl_devset(nowerr);
-ide_decl_devset(wcache);
-ide_decl_devset(acoustic);
-
-/* ide-disk_ioctl.c */
-int ide_disk_ioctl(ide_drive_t *, struct block_device *, fmode_t, unsigned int,
-		   unsigned long);
-
-#ifdef CONFIG_IDE_PROC_FS
-/* ide-disk_proc.c */
-extern ide_proc_entry_t ide_disk_proc[];
-extern const struct ide_proc_devset ide_disk_settings[];
-#endif
-#else
-#define ide_disk_proc		NULL
-#define ide_disk_settings	NULL
-#endif
-
-#endif /* __IDE_DISK_H */
diff --git a/drivers/ide/ide-disk_ioctl.c b/drivers/ide/ide-disk_ioctl.c
deleted file mode 100644
index 2c45616cff4f..000000000000
--- a/drivers/ide/ide-disk_ioctl.c
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/ide.h>
-#include <linux/hdreg.h>
-#include <linux/mutex.h>
-
-#include "ide-disk.h"
-
-static DEFINE_MUTEX(ide_disk_ioctl_mutex);
-static const struct ide_ioctl_devset ide_disk_ioctl_settings[] = {
-{ HDIO_GET_ADDRESS,	HDIO_SET_ADDRESS,   &ide_devset_address   },
-{ HDIO_GET_MULTCOUNT,	HDIO_SET_MULTCOUNT, &ide_devset_multcount },
-{ HDIO_GET_NOWERR,	HDIO_SET_NOWERR,    &ide_devset_nowerr	  },
-{ HDIO_GET_WCACHE,	HDIO_SET_WCACHE,    &ide_devset_wcache	  },
-{ HDIO_GET_ACOUSTIC,	HDIO_SET_ACOUSTIC,  &ide_devset_acoustic  },
-{ 0 }
-};
-
-int ide_disk_ioctl(ide_drive_t *drive, struct block_device *bdev, fmode_t mode,
-		   unsigned int cmd, unsigned long arg)
-{
-	int err;
-
-	mutex_lock(&ide_disk_ioctl_mutex);
-	err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_disk_ioctl_settings);
-	if (err != -EOPNOTSUPP)
-		goto out;
-
-	err = generic_ide_ioctl(drive, bdev, cmd, arg);
-out:
-	mutex_unlock(&ide_disk_ioctl_mutex);
-	return err;
-}
diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c
deleted file mode 100644
index 95d239b2f646..000000000000
--- a/drivers/ide/ide-disk_proc.c
+++ /dev/null
@@ -1,125 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/ide.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/seq_file.h>
-
-#include "ide-disk.h"
-
-static int smart_enable(ide_drive_t *drive)
-{
-	struct ide_cmd cmd;
-	struct ide_taskfile *tf = &cmd.tf;
-
-	memset(&cmd, 0, sizeof(cmd));
-	tf->feature = ATA_SMART_ENABLE;
-	tf->lbam    = ATA_SMART_LBAM_PASS;
-	tf->lbah    = ATA_SMART_LBAH_PASS;
-	tf->command = ATA_CMD_SMART;
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-
-	return ide_no_data_taskfile(drive, &cmd);
-}
-
-static int get_smart_data(ide_drive_t *drive, u8 *buf, u8 sub_cmd)
-{
-	struct ide_cmd cmd;
-	struct ide_taskfile *tf = &cmd.tf;
-
-	memset(&cmd, 0, sizeof(cmd));
-	tf->feature = sub_cmd;
-	tf->nsect   = 0x01;
-	tf->lbam    = ATA_SMART_LBAM_PASS;
-	tf->lbah    = ATA_SMART_LBAH_PASS;
-	tf->command = ATA_CMD_SMART;
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	cmd.protocol = ATA_PROT_PIO;
-
-	return ide_raw_taskfile(drive, &cmd, buf, 1);
-}
-
-static int idedisk_cache_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t	*drive = (ide_drive_t *) m->private;
-
-	if (drive->dev_flags & IDE_DFLAG_ID_READ)
-		seq_printf(m, "%i\n", drive->id[ATA_ID_BUF_SIZE] / 2);
-	else
-		seq_printf(m, "(none)\n");
-	return 0;
-}
-
-static int idedisk_capacity_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t*drive = (ide_drive_t *)m->private;
-
-	seq_printf(m, "%llu\n", (long long)ide_gd_capacity(drive));
-	return 0;
-}
-
-static int __idedisk_proc_show(struct seq_file *m, ide_drive_t *drive, u8 sub_cmd)
-{
-	u8 *buf;
-
-	buf = kmalloc(SECTOR_SIZE, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	(void)smart_enable(drive);
-
-	if (get_smart_data(drive, buf, sub_cmd) == 0) {
-		__le16 *val = (__le16 *)buf;
-		int i;
-
-		for (i = 0; i < SECTOR_SIZE / 2; i++) {
-			seq_printf(m, "%04x%c", le16_to_cpu(val[i]),
-					(i % 8) == 7 ? '\n' : ' ');
-		}
-	}
-	kfree(buf);
-	return 0;
-}
-
-static int idedisk_sv_proc_show(struct seq_file *m, void *v)
-{
-	return __idedisk_proc_show(m, m->private, ATA_SMART_READ_VALUES);
-}
-
-static int idedisk_st_proc_show(struct seq_file *m, void *v)
-{
-	return __idedisk_proc_show(m, m->private, ATA_SMART_READ_THRESHOLDS);
-}
-
-ide_proc_entry_t ide_disk_proc[] = {
-	{ "cache",	  S_IFREG|S_IRUGO, idedisk_cache_proc_show	},
-	{ "capacity",	  S_IFREG|S_IRUGO, idedisk_capacity_proc_show	},
-	{ "geometry",	  S_IFREG|S_IRUGO, ide_geometry_proc_show	},
-	{ "smart_values", S_IFREG|S_IRUSR, idedisk_sv_proc_show		},
-	{ "smart_thresholds", S_IFREG|S_IRUSR, idedisk_st_proc_show	},
-	{}
-};
-
-ide_devset_rw_field(bios_cyl, bios_cyl);
-ide_devset_rw_field(bios_head, bios_head);
-ide_devset_rw_field(bios_sect, bios_sect);
-ide_devset_rw_field(failures, failures);
-ide_devset_rw_field(lun, lun);
-ide_devset_rw_field(max_failures, max_failures);
-
-const struct ide_proc_devset ide_disk_settings[] = {
-	IDE_PROC_DEVSET(acoustic,	0,   254),
-	IDE_PROC_DEVSET(address,	0,     2),
-	IDE_PROC_DEVSET(bios_cyl,	0, 65535),
-	IDE_PROC_DEVSET(bios_head,	0,   255),
-	IDE_PROC_DEVSET(bios_sect,	0,    63),
-	IDE_PROC_DEVSET(failures,	0, 65535),
-	IDE_PROC_DEVSET(lun,		0,     7),
-	IDE_PROC_DEVSET(max_failures,	0, 65535),
-	IDE_PROC_DEVSET(multcount,	0,    16),
-	IDE_PROC_DEVSET(nowerr,		0,     1),
-	IDE_PROC_DEVSET(wcache,		0,     1),
-	{ NULL },
-};
diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c
deleted file mode 100644
index b7c2c0bd18b5..000000000000
--- a/drivers/ide/ide-dma-sff.c
+++ /dev/null
@@ -1,336 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ide.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-#include <linux/io.h>
-
-/**
- *	config_drive_for_dma	-	attempt to activate IDE DMA
- *	@drive: the drive to place in DMA mode
- *
- *	If the drive supports at least mode 2 DMA or UDMA of any kind
- *	then attempt to place it into DMA mode. Drives that are known to
- *	support DMA but predate the DMA properties or that are known
- *	to have DMA handling bugs are also set up appropriately based
- *	on the good/bad drive lists.
- */
-
-int config_drive_for_dma(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u16 *id = drive->id;
-
-	if (drive->media != ide_disk) {
-		if (hwif->host_flags & IDE_HFLAG_NO_ATAPI_DMA)
-			return 0;
-	}
-
-	/*
-	 * Enable DMA on any drive that has
-	 * UltraDMA (mode 0/1/2/3/4/5/6) enabled
-	 */
-	if ((id[ATA_ID_FIELD_VALID] & 4) &&
-	    ((id[ATA_ID_UDMA_MODES] >> 8) & 0x7f))
-		return 1;
-
-	/*
-	 * Enable DMA on any drive that has mode2 DMA
-	 * (multi or single) enabled
-	 */
-	if ((id[ATA_ID_MWDMA_MODES] & 0x404) == 0x404 ||
-	    (id[ATA_ID_SWDMA_MODES] & 0x404) == 0x404)
-		return 1;
-
-	/* Consult the list of known "good" drives */
-	if (ide_dma_good_drive(drive))
-		return 1;
-
-	return 0;
-}
-
-u8 ide_dma_sff_read_status(ide_hwif_t *hwif)
-{
-	unsigned long addr = hwif->dma_base + ATA_DMA_STATUS;
-
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		return readb((void __iomem *)addr);
-	else
-		return inb(addr);
-}
-EXPORT_SYMBOL_GPL(ide_dma_sff_read_status);
-
-static void ide_dma_sff_write_status(ide_hwif_t *hwif, u8 val)
-{
-	unsigned long addr = hwif->dma_base + ATA_DMA_STATUS;
-
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		writeb(val, (void __iomem *)addr);
-	else
-		outb(val, addr);
-}
-
-/**
- *	ide_dma_host_set	-	Enable/disable DMA on a host
- *	@drive: drive to control
- *
- *	Enable/disable DMA on an IDE controller following generic
- *	bus-mastering IDE controller behaviour.
- */
-
-void ide_dma_host_set(ide_drive_t *drive, int on)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 unit = drive->dn & 1;
-	u8 dma_stat = hwif->dma_ops->dma_sff_read_status(hwif);
-
-	if (on)
-		dma_stat |= (1 << (5 + unit));
-	else
-		dma_stat &= ~(1 << (5 + unit));
-
-	ide_dma_sff_write_status(hwif, dma_stat);
-}
-EXPORT_SYMBOL_GPL(ide_dma_host_set);
-
-/**
- *	ide_build_dmatable	-	build IDE DMA table
- *
- *	ide_build_dmatable() prepares a dma request. We map the command
- *	to get the pci bus addresses of the buffers and then build up
- *	the PRD table that the IDE layer wants to be fed.
- *
- *	Most chipsets correctly interpret a length of 0x0000 as 64KB,
- *	but at least one (e.g. CS5530) misinterprets it as zero (!).
- *	So we break the 64KB entry into two 32KB entries instead.
- *
- *	Returns the number of built PRD entries if all went okay,
- *	returns 0 otherwise.
- *
- *	May also be invoked from trm290.c
- */
-
-int ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	__le32 *table = (__le32 *)hwif->dmatable_cpu;
-	unsigned int count = 0;
-	int i;
-	struct scatterlist *sg;
-	u8 is_trm290 = !!(hwif->host_flags & IDE_HFLAG_TRM290);
-
-	for_each_sg(hwif->sg_table, sg, cmd->sg_nents, i) {
-		u32 cur_addr, cur_len, xcount, bcount;
-
-		cur_addr = sg_dma_address(sg);
-		cur_len = sg_dma_len(sg);
-
-		/*
-		 * Fill in the dma table, without crossing any 64kB boundaries.
-		 * Most hardware requires 16-bit alignment of all blocks,
-		 * but the trm290 requires 32-bit alignment.
-		 */
-
-		while (cur_len) {
-			if (count++ >= PRD_ENTRIES)
-				goto use_pio_instead;
-
-			bcount = 0x10000 - (cur_addr & 0xffff);
-			if (bcount > cur_len)
-				bcount = cur_len;
-			*table++ = cpu_to_le32(cur_addr);
-			xcount = bcount & 0xffff;
-			if (is_trm290)
-				xcount = ((xcount >> 2) - 1) << 16;
-			else if (xcount == 0x0000) {
-				if (count++ >= PRD_ENTRIES)
-					goto use_pio_instead;
-				*table++ = cpu_to_le32(0x8000);
-				*table++ = cpu_to_le32(cur_addr + 0x8000);
-				xcount = 0x8000;
-			}
-			*table++ = cpu_to_le32(xcount);
-			cur_addr += bcount;
-			cur_len -= bcount;
-		}
-	}
-
-	if (count) {
-		if (!is_trm290)
-			*--table |= cpu_to_le32(0x80000000);
-		return count;
-	}
-
-use_pio_instead:
-	printk(KERN_ERR "%s: %s\n", drive->name,
-		count ? "DMA table too small" : "empty DMA table?");
-
-	return 0; /* revert to PIO for this request */
-}
-EXPORT_SYMBOL_GPL(ide_build_dmatable);
-
-/**
- *	ide_dma_setup	-	begin a DMA phase
- *	@drive: target device
- *	@cmd: command
- *
- *	Build an IDE DMA PRD (IDE speak for scatter gather table)
- *	and then set up the DMA transfer registers for a device
- *	that follows generic IDE PCI DMA behaviour. Controllers can
- *	override this function if they need to
- *
- *	Returns 0 on success. If a PIO fallback is required then 1
- *	is returned.
- */
-
-int ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
-	u8 rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 0 : ATA_DMA_WR;
-	u8 dma_stat;
-
-	/* fall back to pio! */
-	if (ide_build_dmatable(drive, cmd) == 0) {
-		ide_map_sg(drive, cmd);
-		return 1;
-	}
-
-	/* PRD table */
-	if (mmio)
-		writel(hwif->dmatable_dma,
-		       (void __iomem *)(hwif->dma_base + ATA_DMA_TABLE_OFS));
-	else
-		outl(hwif->dmatable_dma, hwif->dma_base + ATA_DMA_TABLE_OFS);
-
-	/* specify r/w */
-	if (mmio)
-		writeb(rw, (void __iomem *)(hwif->dma_base + ATA_DMA_CMD));
-	else
-		outb(rw, hwif->dma_base + ATA_DMA_CMD);
-
-	/* read DMA status for INTR & ERROR flags */
-	dma_stat = hwif->dma_ops->dma_sff_read_status(hwif);
-
-	/* clear INTR & ERROR flags */
-	ide_dma_sff_write_status(hwif, dma_stat | ATA_DMA_ERR | ATA_DMA_INTR);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_dma_setup);
-
-/**
- *	ide_dma_sff_timer_expiry	-	handle a DMA timeout
- *	@drive: Drive that timed out
- *
- *	An IDE DMA transfer timed out. In the event of an error we ask
- *	the driver to resolve the problem, if a DMA transfer is still
- *	in progress we continue to wait (arguably we need to add a
- *	secondary 'I don't care what the drive thinks' timeout here)
- *	Finally if we have an interrupt we let it complete the I/O.
- *	But only one time - we clear expiry and if it's still not
- *	completed after WAIT_CMD, we error and retry in PIO.
- *	This can occur if an interrupt is lost or due to hang or bugs.
- */
-
-int ide_dma_sff_timer_expiry(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 dma_stat = hwif->dma_ops->dma_sff_read_status(hwif);
-
-	printk(KERN_WARNING "%s: %s: DMA status (0x%02x)\n",
-		drive->name, __func__, dma_stat);
-
-	if ((dma_stat & 0x18) == 0x18)	/* BUSY Stupid Early Timer !! */
-		return WAIT_CMD;
-
-	hwif->expiry = NULL;	/* one free ride for now */
-
-	if (dma_stat & ATA_DMA_ERR)	/* ERROR */
-		return -1;
-
-	if (dma_stat & ATA_DMA_ACTIVE)	/* DMAing */
-		return WAIT_CMD;
-
-	if (dma_stat & ATA_DMA_INTR)	/* Got an Interrupt */
-		return WAIT_CMD;
-
-	return 0;	/* Status is unknown -- reset the bus */
-}
-EXPORT_SYMBOL_GPL(ide_dma_sff_timer_expiry);
-
-void ide_dma_start(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 dma_cmd;
-
-	/* Note that this is done *after* the cmd has
-	 * been issued to the drive, as per the BM-IDE spec.
-	 * The Promise Ultra33 doesn't work correctly when
-	 * we do this part before issuing the drive cmd.
-	 */
-	if (hwif->host_flags & IDE_HFLAG_MMIO) {
-		dma_cmd = readb((void __iomem *)(hwif->dma_base + ATA_DMA_CMD));
-		writeb(dma_cmd | ATA_DMA_START,
-		       (void __iomem *)(hwif->dma_base + ATA_DMA_CMD));
-	} else {
-		dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
-		outb(dma_cmd | ATA_DMA_START, hwif->dma_base + ATA_DMA_CMD);
-	}
-}
-EXPORT_SYMBOL_GPL(ide_dma_start);
-
-/* returns 1 on error, 0 otherwise */
-int ide_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 dma_stat = 0, dma_cmd = 0;
-
-	/* stop DMA */
-	if (hwif->host_flags & IDE_HFLAG_MMIO) {
-		dma_cmd = readb((void __iomem *)(hwif->dma_base + ATA_DMA_CMD));
-		writeb(dma_cmd & ~ATA_DMA_START,
-		       (void __iomem *)(hwif->dma_base + ATA_DMA_CMD));
-	} else {
-		dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
-		outb(dma_cmd & ~ATA_DMA_START, hwif->dma_base + ATA_DMA_CMD);
-	}
-
-	/* get DMA status */
-	dma_stat = hwif->dma_ops->dma_sff_read_status(hwif);
-
-	/* clear INTR & ERROR bits */
-	ide_dma_sff_write_status(hwif, dma_stat | ATA_DMA_ERR | ATA_DMA_INTR);
-
-#define CHECK_DMA_MASK (ATA_DMA_ACTIVE | ATA_DMA_ERR | ATA_DMA_INTR)
-
-	/* verify good DMA status */
-	if ((dma_stat & CHECK_DMA_MASK) != ATA_DMA_INTR)
-		return 0x10 | dma_stat;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_dma_end);
-
-/* returns 1 if dma irq issued, 0 otherwise */
-int ide_dma_test_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 dma_stat = hwif->dma_ops->dma_sff_read_status(hwif);
-
-	return (dma_stat & ATA_DMA_INTR) ? 1 : 0;
-}
-EXPORT_SYMBOL_GPL(ide_dma_test_irq);
-
-const struct ide_dma_ops sff_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= ide_dma_start,
-	.dma_end		= ide_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-EXPORT_SYMBOL_GPL(sff_dma_ops);
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
deleted file mode 100644
index 6f344654ef22..000000000000
--- a/drivers/ide/ide-dma.c
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- *  IDE DMA support (including IDE PCI BM-DMA).
- *
- *  Copyright (C) 1995-1998   Mark Lord
- *  Copyright (C) 1999-2000   Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2004, 2007  Bartlomiej Zolnierkiewicz
- *
- *  May be copied or modified under the terms of the GNU General Public License
- *
- *  DMA is supported for all IDE devices (disk drives, cdroms, tapes, floppies).
- */
-
-/*
- *  Special Thanks to Mark for his Six years of work.
- */
-
-/*
- * Thanks to "Christopher J. Reimer" <reimer@doe.carleton.ca> for
- * fixing the problem with the BIOS on some Acer motherboards.
- *
- * Thanks to "Benoit Poulot-Cazajous" <poulot@chorus.fr> for testing
- * "TX" chipset compatibility and for providing patches for the "TX" chipset.
- *
- * Thanks to Christian Brunner <chb@muc.de> for taking a good first crack
- * at generic DMA -- his patches were referred to when preparing this code.
- *
- * Most importantly, thanks to Robert Bringman <rob@mars.trion.com>
- * for supplying a Promise UDMA board & WD UDMA drive for this work!
- */
-
-#include <linux/types.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ide.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-
-static const struct drive_list_entry drive_whitelist[] = {
-	{ "Micropolis 2112A"	,       NULL		},
-	{ "CONNER CTMA 4000"	,       NULL		},
-	{ "CONNER CTT8000-A"	,       NULL		},
-	{ "ST34342A"		,	NULL		},
-	{ NULL			,	NULL		}
-};
-
-static const struct drive_list_entry drive_blacklist[] = {
-	{ "WDC AC11000H"	,	NULL 		},
-	{ "WDC AC22100H"	,	NULL 		},
-	{ "WDC AC32500H"	,	NULL 		},
-	{ "WDC AC33100H"	,	NULL 		},
-	{ "WDC AC31600H"	,	NULL 		},
-	{ "WDC AC32100H"	,	"24.09P07"	},
-	{ "WDC AC23200L"	,	"21.10N21"	},
-	{ "Compaq CRD-8241B"	,	NULL 		},
-	{ "CRD-8400B"		,	NULL 		},
-	{ "CRD-8480B",			NULL 		},
-	{ "CRD-8482B",			NULL 		},
-	{ "CRD-84"		,	NULL 		},
-	{ "SanDisk SDP3B"	,	NULL 		},
-	{ "SanDisk SDP3B-64"	,	NULL 		},
-	{ "SANYO CD-ROM CRD"	,	NULL 		},
-	{ "HITACHI CDR-8"	,	NULL 		},
-	{ "HITACHI CDR-8335"	,	NULL 		},
-	{ "HITACHI CDR-8435"	,	NULL 		},
-	{ "Toshiba CD-ROM XM-6202B"	,	NULL 		},
-	{ "TOSHIBA CD-ROM XM-1702BC",	NULL 		},
-	{ "CD-532E-A"		,	NULL 		},
-	{ "E-IDE CD-ROM CR-840",	NULL 		},
-	{ "CD-ROM Drive/F5A",	NULL 		},
-	{ "WPI CDD-820",		NULL 		},
-	{ "SAMSUNG CD-ROM SC-148C",	NULL 		},
-	{ "SAMSUNG CD-ROM SC",	NULL 		},
-	{ "ATAPI CD-ROM DRIVE 40X MAXIMUM",	NULL 		},
-	{ "_NEC DV5800A",               NULL            },
-	{ "SAMSUNG CD-ROM SN-124",	"N001" },
-	{ "Seagate STT20000A",		NULL  },
-	{ "CD-ROM CDR_U200",		"1.09" },
-	{ NULL			,	NULL		}
-
-};
-
-/**
- *	ide_dma_intr	-	IDE DMA interrupt handler
- *	@drive: the drive the interrupt is for
- *
- *	Handle an interrupt completing a read/write DMA transfer on an
- *	IDE device
- */
-
-ide_startstop_t ide_dma_intr(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_cmd *cmd = &hwif->cmd;
-	u8 stat = 0, dma_stat = 0;
-
-	drive->waiting_for_dma = 0;
-	dma_stat = hwif->dma_ops->dma_end(drive);
-	ide_dma_unmap_sg(drive, cmd);
-	stat = hwif->tp_ops->read_status(hwif);
-
-	if (OK_STAT(stat, DRIVE_READY, drive->bad_wstat | ATA_DRQ)) {
-		if (!dma_stat) {
-			if ((cmd->tf_flags & IDE_TFLAG_FS) == 0)
-				ide_finish_cmd(drive, cmd, stat);
-			else
-				ide_complete_rq(drive, BLK_STS_OK,
-						blk_rq_sectors(cmd->rq) << 9);
-			return ide_stopped;
-		}
-		printk(KERN_ERR "%s: %s: bad DMA status (0x%02x)\n",
-			drive->name, __func__, dma_stat);
-	}
-	return ide_error(drive, "dma_intr", stat);
-}
-
-int ide_dma_good_drive(ide_drive_t *drive)
-{
-	return ide_in_drive_list(drive->id, drive_whitelist);
-}
-
-/**
- *	ide_dma_map_sg	-	map IDE scatter gather for DMA I/O
- *	@drive: the drive to map the DMA table for
- *	@cmd: command
- *
- *	Perform the DMA mapping magic necessary to access the source or
- *	target buffers of a request via DMA.  The lower layers of the
- *	kernel provide the necessary cache management so that we can
- *	operate in a portable fashion.
- */
-
-static int ide_dma_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct scatterlist *sg = hwif->sg_table;
-	int i;
-
-	if (cmd->tf_flags & IDE_TFLAG_WRITE)
-		cmd->sg_dma_direction = DMA_TO_DEVICE;
-	else
-		cmd->sg_dma_direction = DMA_FROM_DEVICE;
-
-	i = dma_map_sg(hwif->dev, sg, cmd->sg_nents, cmd->sg_dma_direction);
-	if (i) {
-		cmd->orig_sg_nents = cmd->sg_nents;
-		cmd->sg_nents = i;
-	}
-
-	return i;
-}
-
-/**
- *	ide_dma_unmap_sg	-	clean up DMA mapping
- *	@drive: The drive to unmap
- *
- *	Teardown mappings after DMA has completed. This must be called
- *	after the completion of each use of ide_build_dmatable and before
- *	the next use of ide_build_dmatable. Failure to do so will cause
- *	an oops as only one mapping can be live for each target at a given
- *	time.
- */
-
-void ide_dma_unmap_sg(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	dma_unmap_sg(hwif->dev, hwif->sg_table, cmd->orig_sg_nents,
-		     cmd->sg_dma_direction);
-}
-EXPORT_SYMBOL_GPL(ide_dma_unmap_sg);
-
-/**
- *	ide_dma_off_quietly	-	Generic DMA kill
- *	@drive: drive to control
- *
- *	Turn off the current DMA on this IDE controller.
- */
-
-void ide_dma_off_quietly(ide_drive_t *drive)
-{
-	drive->dev_flags &= ~IDE_DFLAG_USING_DMA;
-
-	drive->hwif->dma_ops->dma_host_set(drive, 0);
-}
-EXPORT_SYMBOL(ide_dma_off_quietly);
-
-/**
- *	ide_dma_off	-	disable DMA on a device
- *	@drive: drive to disable DMA on
- *
- *	Disable IDE DMA for a device on this IDE controller.
- *	Inform the user that DMA has been disabled.
- */
-
-void ide_dma_off(ide_drive_t *drive)
-{
-	printk(KERN_INFO "%s: DMA disabled\n", drive->name);
-	ide_dma_off_quietly(drive);
-}
-EXPORT_SYMBOL(ide_dma_off);
-
-/**
- *	ide_dma_on		-	Enable DMA on a device
- *	@drive: drive to enable DMA on
- *
- *	Enable IDE DMA for a device on this IDE controller.
- */
-
-void ide_dma_on(ide_drive_t *drive)
-{
-	drive->dev_flags |= IDE_DFLAG_USING_DMA;
-
-	drive->hwif->dma_ops->dma_host_set(drive, 1);
-}
-
-int __ide_dma_bad_drive(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-
-	int blacklist = ide_in_drive_list(id, drive_blacklist);
-	if (blacklist) {
-		printk(KERN_WARNING "%s: Disabling (U)DMA for %s (blacklisted)\n",
-				    drive->name, (char *)&id[ATA_ID_PROD]);
-		return blacklist;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(__ide_dma_bad_drive);
-
-static const u8 xfer_mode_bases[] = {
-	XFER_UDMA_0,
-	XFER_MW_DMA_0,
-	XFER_SW_DMA_0,
-};
-
-static unsigned int ide_get_mode_mask(ide_drive_t *drive, u8 base, u8 req_mode)
-{
-	u16 *id = drive->id;
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-	unsigned int mask = 0;
-
-	switch (base) {
-	case XFER_UDMA_0:
-		if ((id[ATA_ID_FIELD_VALID] & 4) == 0)
-			break;
-		mask = id[ATA_ID_UDMA_MODES];
-		if (port_ops && port_ops->udma_filter)
-			mask &= port_ops->udma_filter(drive);
-		else
-			mask &= hwif->ultra_mask;
-
-		/*
-		 * avoid false cable warning from eighty_ninty_three()
-		 */
-		if (req_mode > XFER_UDMA_2) {
-			if ((mask & 0x78) && (eighty_ninty_three(drive) == 0))
-				mask &= 0x07;
-		}
-		break;
-	case XFER_MW_DMA_0:
-		mask = id[ATA_ID_MWDMA_MODES];
-
-		/* Also look for the CF specific MWDMA modes... */
-		if (ata_id_is_cfa(id) && (id[ATA_ID_CFA_MODES] & 0x38)) {
-			u8 mode = ((id[ATA_ID_CFA_MODES] & 0x38) >> 3) - 1;
-
-			mask |= ((2 << mode) - 1) << 3;
-		}
-
-		if (port_ops && port_ops->mdma_filter)
-			mask &= port_ops->mdma_filter(drive);
-		else
-			mask &= hwif->mwdma_mask;
-		break;
-	case XFER_SW_DMA_0:
-		mask = id[ATA_ID_SWDMA_MODES];
-		if (!(mask & ATA_SWDMA2) && (id[ATA_ID_OLD_DMA_MODES] >> 8)) {
-			u8 mode = id[ATA_ID_OLD_DMA_MODES] >> 8;
-
-			/*
-			 * if the mode is valid convert it to the mask
-			 * (the maximum allowed mode is XFER_SW_DMA_2)
-			 */
-			if (mode <= 2)
-				mask = (2 << mode) - 1;
-		}
-		mask &= hwif->swdma_mask;
-		break;
-	default:
-		BUG();
-		break;
-	}
-
-	return mask;
-}
-
-/**
- *	ide_find_dma_mode	-	compute DMA speed
- *	@drive: IDE device
- *	@req_mode: requested mode
- *
- *	Checks the drive/host capabilities and finds the speed to use for
- *	the DMA transfer.  The speed is then limited by the requested mode.
- *
- *	Returns 0 if the drive/host combination is incapable of DMA transfers
- *	or if the requested mode is not a DMA mode.
- */
-
-u8 ide_find_dma_mode(ide_drive_t *drive, u8 req_mode)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned int mask;
-	int x, i;
-	u8 mode = 0;
-
-	if (drive->media != ide_disk) {
-		if (hwif->host_flags & IDE_HFLAG_NO_ATAPI_DMA)
-			return 0;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(xfer_mode_bases); i++) {
-		if (req_mode < xfer_mode_bases[i])
-			continue;
-		mask = ide_get_mode_mask(drive, xfer_mode_bases[i], req_mode);
-		x = fls(mask) - 1;
-		if (x >= 0) {
-			mode = xfer_mode_bases[i] + x;
-			break;
-		}
-	}
-
-	if (hwif->chipset == ide_acorn && mode == 0) {
-		/*
-		 * is this correct?
-		 */
-		if (ide_dma_good_drive(drive) &&
-		    drive->id[ATA_ID_EIDE_DMA_TIME] < 150)
-			mode = XFER_MW_DMA_1;
-	}
-
-	mode = min(mode, req_mode);
-
-	printk(KERN_INFO "%s: %s mode selected\n", drive->name,
-			  mode ? ide_xfer_verbose(mode) : "no DMA");
-
-	return mode;
-}
-
-static int ide_tune_dma(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 speed;
-
-	if (ata_id_has_dma(drive->id) == 0 ||
-	    (drive->dev_flags & IDE_DFLAG_NODMA))
-		return 0;
-
-	/* consult the list of known "bad" drives */
-	if (__ide_dma_bad_drive(drive))
-		return 0;
-
-	if (hwif->host_flags & IDE_HFLAG_TRUST_BIOS_FOR_DMA)
-		return config_drive_for_dma(drive);
-
-	speed = ide_max_dma_mode(drive);
-
-	if (!speed)
-		return 0;
-
-	if (ide_set_dma_mode(drive, speed))
-		return 0;
-
-	return 1;
-}
-
-static int ide_dma_check(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	if (ide_tune_dma(drive))
-		return 0;
-
-	/* TODO: always do PIO fallback */
-	if (hwif->host_flags & IDE_HFLAG_TRUST_BIOS_FOR_DMA)
-		return -1;
-
-	ide_set_max_pio(drive);
-
-	return -1;
-}
-
-int ide_set_dma(ide_drive_t *drive)
-{
-	int rc;
-
-	/*
-	 * Force DMAing for the beginning of the check.
-	 * Some chipsets appear to do interesting
-	 * things, if not checked and cleared.
-	 *   PARANOIA!!!
-	 */
-	ide_dma_off_quietly(drive);
-
-	rc = ide_dma_check(drive);
-	if (rc)
-		return rc;
-
-	ide_dma_on(drive);
-
-	return 0;
-}
-
-void ide_check_dma_crc(ide_drive_t *drive)
-{
-	u8 mode;
-
-	ide_dma_off_quietly(drive);
-	drive->crc_count = 0;
-	mode = drive->current_speed;
-	/*
-	 * Don't try non Ultra-DMA modes without iCRC's.  Force the
-	 * device to PIO and make the user enable SWDMA/MWDMA modes.
-	 */
-	if (mode > XFER_UDMA_0 && mode <= XFER_UDMA_7)
-		mode--;
-	else
-		mode = XFER_PIO_4;
-	ide_set_xfer_rate(drive, mode);
-	if (drive->current_speed >= XFER_SW_DMA_0)
-		ide_dma_on(drive);
-}
-
-void ide_dma_lost_irq(ide_drive_t *drive)
-{
-	printk(KERN_ERR "%s: DMA interrupt recovery\n", drive->name);
-}
-EXPORT_SYMBOL_GPL(ide_dma_lost_irq);
-
-/*
- * un-busy the port etc, and clear any pending DMA status. we want to
- * retry the current request in pio mode instead of risking tossing it
- * all away
- */
-ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_dma_ops *dma_ops = hwif->dma_ops;
-	struct ide_cmd *cmd = &hwif->cmd;
-	ide_startstop_t ret = ide_stopped;
-
-	/*
-	 * end current dma transaction
-	 */
-
-	if (error < 0) {
-		printk(KERN_WARNING "%s: DMA timeout error\n", drive->name);
-		drive->waiting_for_dma = 0;
-		(void)dma_ops->dma_end(drive);
-		ide_dma_unmap_sg(drive, cmd);
-		ret = ide_error(drive, "dma timeout error",
-				hwif->tp_ops->read_status(hwif));
-	} else {
-		printk(KERN_WARNING "%s: DMA timeout retry\n", drive->name);
-		if (dma_ops->dma_clear)
-			dma_ops->dma_clear(drive);
-		printk(KERN_ERR "%s: timeout waiting for DMA\n", drive->name);
-		if (dma_ops->dma_test_irq(drive) == 0) {
-			ide_dump_status(drive, "DMA timeout",
-					hwif->tp_ops->read_status(hwif));
-			drive->waiting_for_dma = 0;
-			(void)dma_ops->dma_end(drive);
-			ide_dma_unmap_sg(drive, cmd);
-		}
-	}
-
-	/*
-	 * disable dma for now, but remember that we did so because of
-	 * a timeout -- we'll reenable after we finish this next request
-	 * (or rather the first chunk of it) in pio.
-	 */
-	drive->dev_flags |= IDE_DFLAG_DMA_PIO_RETRY;
-	drive->retry_pio++;
-	ide_dma_off_quietly(drive);
-
-	/*
-	 * make sure request is sane
-	 */
-	if (hwif->rq)
-		scsi_req(hwif->rq)->result = 0;
-	return ret;
-}
-
-void ide_release_dma_engine(ide_hwif_t *hwif)
-{
-	if (hwif->dmatable_cpu) {
-		int prd_size = hwif->prd_max_nents * hwif->prd_ent_size;
-
-		dma_free_coherent(hwif->dev, prd_size,
-				  hwif->dmatable_cpu, hwif->dmatable_dma);
-		hwif->dmatable_cpu = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(ide_release_dma_engine);
-
-int ide_allocate_dma_engine(ide_hwif_t *hwif)
-{
-	int prd_size;
-
-	if (hwif->prd_max_nents == 0)
-		hwif->prd_max_nents = PRD_ENTRIES;
-	if (hwif->prd_ent_size == 0)
-		hwif->prd_ent_size = PRD_BYTES;
-
-	prd_size = hwif->prd_max_nents * hwif->prd_ent_size;
-
-	hwif->dmatable_cpu = dma_alloc_coherent(hwif->dev, prd_size,
-						&hwif->dmatable_dma,
-						GFP_ATOMIC);
-	if (hwif->dmatable_cpu == NULL) {
-		printk(KERN_ERR "%s: unable to allocate PRD table\n",
-			hwif->name);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_allocate_dma_engine);
-
-int ide_dma_prepare(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	const struct ide_dma_ops *dma_ops = drive->hwif->dma_ops;
-
-	if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0 ||
-	    (dma_ops->dma_check && dma_ops->dma_check(drive, cmd)))
-		goto out;
-	ide_map_sg(drive, cmd);
-	if (ide_dma_map_sg(drive, cmd) == 0)
-		goto out_map;
-	if (dma_ops->dma_setup(drive, cmd))
-		goto out_dma_unmap;
-	drive->waiting_for_dma = 1;
-	return 0;
-out_dma_unmap:
-	ide_dma_unmap_sg(drive, cmd);
-out_map:
-	ide_map_sg(drive, cmd);
-out:
-	return 1;
-}
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
deleted file mode 100644
index 2f378213e9b5..000000000000
--- a/drivers/ide/ide-eh.c
+++ /dev/null
@@ -1,443 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ide.h>
-#include <linux/delay.h>
-
-static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
-				     u8 stat, u8 err)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	if ((stat & ATA_BUSY) ||
-	    ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
-		/* other bits are useless when BUSY */
-		scsi_req(rq)->result |= ERROR_RESET;
-	} else if (stat & ATA_ERR) {
-		/* err has different meaning on cdrom and tape */
-		if (err == ATA_ABORTED) {
-			if ((drive->dev_flags & IDE_DFLAG_LBA) &&
-			    /* some newer drives don't support ATA_CMD_INIT_DEV_PARAMS */
-			    hwif->tp_ops->read_status(hwif) == ATA_CMD_INIT_DEV_PARAMS)
-				return ide_stopped;
-		} else if ((err & BAD_CRC) == BAD_CRC) {
-			/* UDMA crc error, just retry the operation */
-			drive->crc_count++;
-		} else if (err & (ATA_BBK | ATA_UNC)) {
-			/* retries won't help these */
-			scsi_req(rq)->result = ERROR_MAX;
-		} else if (err & ATA_TRK0NF) {
-			/* help it find track zero */
-			scsi_req(rq)->result |= ERROR_RECAL;
-		}
-	}
-
-	if ((stat & ATA_DRQ) && rq_data_dir(rq) == READ &&
-	    (hwif->host_flags & IDE_HFLAG_ERROR_STOPS_FIFO) == 0) {
-		int nsect = drive->mult_count ? drive->mult_count : 1;
-
-		ide_pad_transfer(drive, READ, nsect * SECTOR_SIZE);
-	}
-
-	if (scsi_req(rq)->result >= ERROR_MAX || blk_noretry_request(rq)) {
-		ide_kill_rq(drive, rq);
-		return ide_stopped;
-	}
-
-	if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
-		scsi_req(rq)->result |= ERROR_RESET;
-
-	if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
-		++scsi_req(rq)->result;
-		return ide_do_reset(drive);
-	}
-
-	if ((scsi_req(rq)->result & ERROR_RECAL) == ERROR_RECAL)
-		drive->special_flags |= IDE_SFLAG_RECALIBRATE;
-
-	++scsi_req(rq)->result;
-
-	return ide_stopped;
-}
-
-static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
-				       u8 stat, u8 err)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	if ((stat & ATA_BUSY) ||
-	    ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
-		/* other bits are useless when BUSY */
-		scsi_req(rq)->result |= ERROR_RESET;
-	} else {
-		/* add decoding error stuff */
-	}
-
-	if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
-		/* force an abort */
-		hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE);
-
-	if (scsi_req(rq)->result >= ERROR_MAX) {
-		ide_kill_rq(drive, rq);
-	} else {
-		if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
-			++scsi_req(rq)->result;
-			return ide_do_reset(drive);
-		}
-		++scsi_req(rq)->result;
-	}
-
-	return ide_stopped;
-}
-
-static ide_startstop_t __ide_error(ide_drive_t *drive, struct request *rq,
-				   u8 stat, u8 err)
-{
-	if (drive->media == ide_disk)
-		return ide_ata_error(drive, rq, stat, err);
-	return ide_atapi_error(drive, rq, stat, err);
-}
-
-/**
- *	ide_error	-	handle an error on the IDE
- *	@drive: drive the error occurred on
- *	@msg: message to report
- *	@stat: status bits
- *
- *	ide_error() takes action based on the error returned by the drive.
- *	For normal I/O that may well include retries. We deal with
- *	both new-style (taskfile) and old style command handling here.
- *	In the case of taskfile command handling there is work left to
- *	do
- */
-
-ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
-{
-	struct request *rq;
-	u8 err;
-
-	err = ide_dump_status(drive, msg, stat);
-
-	rq = drive->hwif->rq;
-	if (rq == NULL)
-		return ide_stopped;
-
-	/* retry only "normal" I/O: */
-	if (blk_rq_is_passthrough(rq)) {
-		if (ata_taskfile_request(rq)) {
-			struct ide_cmd *cmd = ide_req(rq)->special;
-
-			if (cmd)
-				ide_complete_cmd(drive, cmd, stat, err);
-		} else if (ata_pm_request(rq)) {
-			scsi_req(rq)->result = 1;
-			ide_complete_pm_rq(drive, rq);
-			return ide_stopped;
-		}
-		scsi_req(rq)->result = err;
-		ide_complete_rq(drive, err ? BLK_STS_IOERR : BLK_STS_OK, blk_rq_bytes(rq));
-		return ide_stopped;
-	}
-
-	return __ide_error(drive, rq, stat, err);
-}
-EXPORT_SYMBOL_GPL(ide_error);
-
-static inline void ide_complete_drive_reset(ide_drive_t *drive, blk_status_t err)
-{
-	struct request *rq = drive->hwif->rq;
-
-	if (rq && ata_misc_request(rq) &&
-	    scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) {
-		if (err <= 0 && scsi_req(rq)->result == 0)
-			scsi_req(rq)->result = -EIO;
-		ide_complete_rq(drive, err, blk_rq_bytes(rq));
-	}
-}
-
-/* needed below */
-static ide_startstop_t do_reset1(ide_drive_t *, int);
-
-/*
- * atapi_reset_pollfunc() gets invoked to poll the interface for completion
- * every 50ms during an atapi drive reset operation.  If the drive has not yet
- * responded, and we have not yet hit our maximum waiting time, then the timer
- * is restarted for another 50ms.
- */
-static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	u8 stat;
-
-	tp_ops->dev_select(drive);
-	udelay(10);
-	stat = tp_ops->read_status(hwif);
-
-	if (OK_STAT(stat, 0, ATA_BUSY))
-		printk(KERN_INFO "%s: ATAPI reset complete\n", drive->name);
-	else {
-		if (time_before(jiffies, hwif->poll_timeout)) {
-			ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20);
-			/* continue polling */
-			return ide_started;
-		}
-		/* end of polling */
-		hwif->polling = 0;
-		printk(KERN_ERR "%s: ATAPI reset timed-out, status=0x%02x\n",
-			drive->name, stat);
-		/* do it the old fashioned way */
-		return do_reset1(drive, 1);
-	}
-	/* done polling */
-	hwif->polling = 0;
-	ide_complete_drive_reset(drive, BLK_STS_OK);
-	return ide_stopped;
-}
-
-static void ide_reset_report_error(ide_hwif_t *hwif, u8 err)
-{
-	static const char *err_master_vals[] =
-		{ NULL, "passed", "formatter device error",
-		  "sector buffer error", "ECC circuitry error",
-		  "controlling MPU error" };
-
-	u8 err_master = err & 0x7f;
-
-	printk(KERN_ERR "%s: reset: master: ", hwif->name);
-	if (err_master && err_master < 6)
-		printk(KERN_CONT "%s", err_master_vals[err_master]);
-	else
-		printk(KERN_CONT "error (0x%02x?)", err);
-	if (err & 0x80)
-		printk(KERN_CONT "; slave: failed");
-	printk(KERN_CONT "\n");
-}
-
-/*
- * reset_pollfunc() gets invoked to poll the interface for completion every 50ms
- * during an ide reset operation. If the drives have not yet responded,
- * and we have not yet hit our maximum waiting time, then the timer is restarted
- * for another 50ms.
- */
-static ide_startstop_t reset_pollfunc(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-	u8 tmp;
-	blk_status_t err = BLK_STS_OK;
-
-	if (port_ops && port_ops->reset_poll) {
-		err = port_ops->reset_poll(drive);
-		if (err) {
-			printk(KERN_ERR "%s: host reset_poll failure for %s.\n",
-				hwif->name, drive->name);
-			goto out;
-		}
-	}
-
-	tmp = hwif->tp_ops->read_status(hwif);
-
-	if (!OK_STAT(tmp, 0, ATA_BUSY)) {
-		if (time_before(jiffies, hwif->poll_timeout)) {
-			ide_set_handler(drive, &reset_pollfunc, HZ/20);
-			/* continue polling */
-			return ide_started;
-		}
-		printk(KERN_ERR "%s: reset timed-out, status=0x%02x\n",
-			hwif->name, tmp);
-		drive->failures++;
-		err = BLK_STS_IOERR;
-	} else  {
-		tmp = ide_read_error(drive);
-
-		if (tmp == 1) {
-			printk(KERN_INFO "%s: reset: success\n", hwif->name);
-			drive->failures = 0;
-		} else {
-			ide_reset_report_error(hwif, tmp);
-			drive->failures++;
-			err = BLK_STS_IOERR;
-		}
-	}
-out:
-	hwif->polling = 0;	/* done polling */
-	ide_complete_drive_reset(drive, err);
-	return ide_stopped;
-}
-
-static void ide_disk_pre_reset(ide_drive_t *drive)
-{
-	int legacy = (drive->id[ATA_ID_CFS_ENABLE_2] & 0x0400) ? 0 : 1;
-
-	drive->special_flags =
-		legacy ? (IDE_SFLAG_SET_GEOMETRY | IDE_SFLAG_RECALIBRATE) : 0;
-
-	drive->mult_count = 0;
-	drive->dev_flags &= ~IDE_DFLAG_PARKED;
-
-	if ((drive->dev_flags & IDE_DFLAG_KEEP_SETTINGS) == 0 &&
-	    (drive->dev_flags & IDE_DFLAG_USING_DMA) == 0)
-		drive->mult_req = 0;
-
-	if (drive->mult_req != drive->mult_count)
-		drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
-}
-
-static void pre_reset(ide_drive_t *drive)
-{
-	const struct ide_port_ops *port_ops = drive->hwif->port_ops;
-
-	if (drive->media == ide_disk)
-		ide_disk_pre_reset(drive);
-	else
-		drive->dev_flags |= IDE_DFLAG_POST_RESET;
-
-	if (drive->dev_flags & IDE_DFLAG_USING_DMA) {
-		if (drive->crc_count)
-			ide_check_dma_crc(drive);
-		else
-			ide_dma_off(drive);
-	}
-
-	if ((drive->dev_flags & IDE_DFLAG_KEEP_SETTINGS) == 0) {
-		if ((drive->dev_flags & IDE_DFLAG_USING_DMA) == 0) {
-			drive->dev_flags &= ~IDE_DFLAG_UNMASK;
-			drive->io_32bit = 0;
-		}
-		return;
-	}
-
-	if (port_ops && port_ops->pre_reset)
-		port_ops->pre_reset(drive);
-
-	if (drive->current_speed != 0xff)
-		drive->desired_speed = drive->current_speed;
-	drive->current_speed = 0xff;
-}
-
-/*
- * do_reset1() attempts to recover a confused drive by resetting it.
- * Unfortunately, resetting a disk drive actually resets all devices on
- * the same interface, so it can really be thought of as resetting the
- * interface rather than resetting the drive.
- *
- * ATAPI devices have their own reset mechanism which allows them to be
- * individually reset without clobbering other devices on the same interface.
- *
- * Unfortunately, the IDE interface does not generate an interrupt to let
- * us know when the reset operation has finished, so we must poll for this.
- * Equally poor, though, is the fact that this may a very long time to complete,
- * (up to 30 seconds worstcase).  So, instead of busy-waiting here for it,
- * we set a timer to poll at 50ms intervals.
- */
-static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_io_ports *io_ports = &hwif->io_ports;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	const struct ide_port_ops *port_ops;
-	ide_drive_t *tdrive;
-	unsigned long flags, timeout;
-	int i;
-	DEFINE_WAIT(wait);
-
-	spin_lock_irqsave(&hwif->lock, flags);
-
-	/* We must not reset with running handlers */
-	BUG_ON(hwif->handler != NULL);
-
-	/* For an ATAPI device, first try an ATAPI SRST. */
-	if (drive->media != ide_disk && !do_not_try_atapi) {
-		pre_reset(drive);
-		tp_ops->dev_select(drive);
-		udelay(20);
-		tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET);
-		ndelay(400);
-		hwif->poll_timeout = jiffies + WAIT_WORSTCASE;
-		hwif->polling = 1;
-		__ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20);
-		spin_unlock_irqrestore(&hwif->lock, flags);
-		return ide_started;
-	}
-
-	/* We must not disturb devices in the IDE_DFLAG_PARKED state. */
-	do {
-		unsigned long now;
-
-		prepare_to_wait(&ide_park_wq, &wait, TASK_UNINTERRUPTIBLE);
-		timeout = jiffies;
-		ide_port_for_each_present_dev(i, tdrive, hwif) {
-			if ((tdrive->dev_flags & IDE_DFLAG_PARKED) &&
-			    time_after(tdrive->sleep, timeout))
-				timeout = tdrive->sleep;
-		}
-
-		now = jiffies;
-		if (time_before_eq(timeout, now))
-			break;
-
-		spin_unlock_irqrestore(&hwif->lock, flags);
-		timeout = schedule_timeout_uninterruptible(timeout - now);
-		spin_lock_irqsave(&hwif->lock, flags);
-	} while (timeout);
-	finish_wait(&ide_park_wq, &wait);
-
-	/*
-	 * First, reset any device state data we were maintaining
-	 * for any of the drives on this interface.
-	 */
-	ide_port_for_each_dev(i, tdrive, hwif)
-		pre_reset(tdrive);
-
-	if (io_ports->ctl_addr == 0) {
-		spin_unlock_irqrestore(&hwif->lock, flags);
-		ide_complete_drive_reset(drive, BLK_STS_IOERR);
-		return ide_stopped;
-	}
-
-	/*
-	 * Note that we also set nIEN while resetting the device,
-	 * to mask unwanted interrupts from the interface during the reset.
-	 * However, due to the design of PC hardware, this will cause an
-	 * immediate interrupt due to the edge transition it produces.
-	 * This single interrupt gives us a "fast poll" for drives that
-	 * recover from reset very quickly, saving us the first 50ms wait time.
-	 */
-	/* set SRST and nIEN */
-	tp_ops->write_devctl(hwif, ATA_SRST | ATA_NIEN | ATA_DEVCTL_OBS);
-	/* more than enough time */
-	udelay(10);
-	/* clear SRST, leave nIEN (unless device is on the quirk list) */
-	tp_ops->write_devctl(hwif,
-		((drive->dev_flags & IDE_DFLAG_NIEN_QUIRK) ? 0 : ATA_NIEN) |
-		 ATA_DEVCTL_OBS);
-	/* more than enough time */
-	udelay(10);
-	hwif->poll_timeout = jiffies + WAIT_WORSTCASE;
-	hwif->polling = 1;
-	__ide_set_handler(drive, &reset_pollfunc, HZ/20);
-
-	/*
-	 * Some weird controller like resetting themselves to a strange
-	 * state when the disks are reset this way. At least, the Winbond
-	 * 553 documentation says that
-	 */
-	port_ops = hwif->port_ops;
-	if (port_ops && port_ops->resetproc)
-		port_ops->resetproc(drive);
-
-	spin_unlock_irqrestore(&hwif->lock, flags);
-	return ide_started;
-}
-
-/*
- * ide_do_reset() is the entry point to the drive/interface reset code.
- */
-
-ide_startstop_t ide_do_reset(ide_drive_t *drive)
-{
-	return do_reset1(drive, 0);
-}
-EXPORT_SYMBOL(ide_do_reset);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
deleted file mode 100644
index f5a2870aaf54..000000000000
--- a/drivers/ide/ide-floppy.c
+++ /dev/null
@@ -1,551 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * IDE ATAPI floppy driver.
- *
- * Copyright (C) 1996-1999  Gadi Oxman <gadio@netvision.net.il>
- * Copyright (C) 2000-2002  Paul Bristow <paul@paulbristow.net>
- * Copyright (C) 2005       Bartlomiej Zolnierkiewicz
- *
- * This driver supports the following IDE floppy drives:
- *
- * LS-120/240 SuperDisk
- * Iomega Zip 100/250
- * Iomega PC Card Clik!/PocketZip
- *
- * For a historical changelog see
- * Documentation/ide/ChangeLog.ide-floppy.1996-2002
- */
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/compat.h>
-#include <linux/delay.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/major.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/cdrom.h>
-#include <linux/ide.h>
-#include <linux/hdreg.h>
-#include <linux/bitops.h>
-#include <linux/mutex.h>
-#include <linux/scatterlist.h>
-
-#include <scsi/scsi_ioctl.h>
-
-#include <asm/byteorder.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-#include <asm/unaligned.h>
-
-#include "ide-floppy.h"
-
-/*
- * After each failed packet command we issue a request sense command and retry
- * the packet command IDEFLOPPY_MAX_PC_RETRIES times.
- */
-#define IDEFLOPPY_MAX_PC_RETRIES	3
-
-/* format capacities descriptor codes */
-#define CAPACITY_INVALID	0x00
-#define CAPACITY_UNFORMATTED	0x01
-#define CAPACITY_CURRENT	0x02
-#define CAPACITY_NO_CARTRIDGE	0x03
-
-/*
- * The following delay solves a problem with ATAPI Zip 100 drive where BSY bit
- * was apparently being deasserted before the unit was ready to receive data.
- */
-#define IDEFLOPPY_PC_DELAY	(HZ/20)	/* default delay for ZIP 100 (50ms) */
-
-static int ide_floppy_callback(ide_drive_t *drive, int dsc)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	struct ide_atapi_pc *pc = drive->pc;
-	struct request *rq = pc->rq;
-	int uptodate = pc->error ? 0 : 1;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	if (drive->failed_pc == pc)
-		drive->failed_pc = NULL;
-
-	if (pc->c[0] == GPCMD_READ_10 || pc->c[0] == GPCMD_WRITE_10 ||
-	    blk_rq_is_scsi(rq))
-		uptodate = 1; /* FIXME */
-	else if (pc->c[0] == GPCMD_REQUEST_SENSE) {
-
-		u8 *buf = bio_data(rq->bio);
-
-		if (!pc->error) {
-			floppy->sense_key = buf[2] & 0x0F;
-			floppy->asc = buf[12];
-			floppy->ascq = buf[13];
-			floppy->progress_indication = buf[15] & 0x80 ?
-				(u16)get_unaligned((u16 *)&buf[16]) : 0x10000;
-
-			if (drive->failed_pc)
-				ide_debug_log(IDE_DBG_PC, "pc = %x",
-					      drive->failed_pc->c[0]);
-
-			ide_debug_log(IDE_DBG_SENSE, "sense key = %x, asc = %x,"
-				      "ascq = %x", floppy->sense_key,
-				      floppy->asc, floppy->ascq);
-		} else
-			printk(KERN_ERR PFX "Error in REQUEST SENSE itself - "
-			       "Aborting request!\n");
-	}
-
-	if (ata_misc_request(rq))
-		scsi_req(rq)->result = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
-
-	return uptodate;
-}
-
-static void ide_floppy_report_error(struct ide_disk_obj *floppy,
-				    struct ide_atapi_pc *pc)
-{
-	/* suppress error messages resulting from Medium not present */
-	if (floppy->sense_key == 0x02 &&
-	    floppy->asc       == 0x3a &&
-	    floppy->ascq      == 0x00)
-		return;
-
-	printk(KERN_ERR PFX "%s: I/O error, pc = %2x, key = %2x, "
-			"asc = %2x, ascq = %2x\n",
-			floppy->drive->name, pc->c[0], floppy->sense_key,
-			floppy->asc, floppy->ascq);
-
-}
-
-static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
-					   struct ide_cmd *cmd,
-					   struct ide_atapi_pc *pc)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-
-	if (drive->failed_pc == NULL &&
-	    pc->c[0] != GPCMD_REQUEST_SENSE)
-		drive->failed_pc = pc;
-
-	/* Set the current packet command */
-	drive->pc = pc;
-
-	if (pc->retries > IDEFLOPPY_MAX_PC_RETRIES) {
-		unsigned int done = blk_rq_bytes(drive->hwif->rq);
-
-		if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR))
-			ide_floppy_report_error(floppy, pc);
-
-		/* Giving up */
-		pc->error = IDE_DRV_ERROR_GENERAL;
-
-		drive->failed_pc = NULL;
-		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, BLK_STS_IOERR, done);
-		return ide_stopped;
-	}
-
-	ide_debug_log(IDE_DBG_FUNC, "retry #%d", pc->retries);
-
-	pc->retries++;
-
-	return ide_issue_pc(drive, cmd);
-}
-
-void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *pc)
-{
-	ide_init_pc(pc);
-	pc->c[0] = GPCMD_READ_FORMAT_CAPACITIES;
-	pc->c[7] = 255;
-	pc->c[8] = 255;
-	pc->req_xfer = 255;
-}
-
-/* A mode sense command is used to "sense" floppy parameters. */
-void ide_floppy_create_mode_sense_cmd(struct ide_atapi_pc *pc, u8 page_code)
-{
-	u16 length = 8; /* sizeof(Mode Parameter Header) = 8 Bytes */
-
-	ide_init_pc(pc);
-	pc->c[0] = GPCMD_MODE_SENSE_10;
-	pc->c[1] = 0;
-	pc->c[2] = page_code;
-
-	switch (page_code) {
-	case IDEFLOPPY_CAPABILITIES_PAGE:
-		length += 12;
-		break;
-	case IDEFLOPPY_FLEXIBLE_DISK_PAGE:
-		length += 32;
-		break;
-	default:
-		printk(KERN_ERR PFX "unsupported page code in %s\n", __func__);
-	}
-	put_unaligned(cpu_to_be16(length), (u16 *) &pc->c[7]);
-	pc->req_xfer = length;
-}
-
-static void idefloppy_create_rw_cmd(ide_drive_t *drive,
-				    struct ide_atapi_pc *pc, struct request *rq,
-				    unsigned long sector)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	int block = sector / floppy->bs_factor;
-	int blocks = blk_rq_sectors(rq) / floppy->bs_factor;
-	int cmd = rq_data_dir(rq);
-
-	ide_debug_log(IDE_DBG_FUNC, "block: %d, blocks: %d", block, blocks);
-
-	ide_init_pc(pc);
-	pc->c[0] = cmd == READ ? GPCMD_READ_10 : GPCMD_WRITE_10;
-	put_unaligned(cpu_to_be16(blocks), (unsigned short *)&pc->c[7]);
-	put_unaligned(cpu_to_be32(block), (unsigned int *) &pc->c[2]);
-
-	memcpy(scsi_req(rq)->cmd, pc->c, 12);
-
-	pc->rq = rq;
-	if (cmd == WRITE)
-		pc->flags |= PC_FLAG_WRITING;
-
-	pc->flags |= PC_FLAG_DMA_OK;
-}
-
-static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
-		struct ide_atapi_pc *pc, struct request *rq)
-{
-	ide_init_pc(pc);
-	memcpy(pc->c, scsi_req(rq)->cmd, sizeof(pc->c));
-	pc->rq = rq;
-	if (blk_rq_bytes(rq)) {
-		pc->flags |= PC_FLAG_DMA_OK;
-		if (rq_data_dir(rq) == WRITE)
-			pc->flags |= PC_FLAG_WRITING;
-	}
-}
-
-static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
-					     struct request *rq, sector_t block)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	struct ide_cmd cmd;
-	struct ide_atapi_pc *pc;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter, cmd: 0x%x\n", rq->cmd[0]);
-
-	if (drive->debug_mask & IDE_DBG_RQ)
-		blk_dump_rq_flags(rq, (rq->rq_disk
-					? rq->rq_disk->disk_name
-					: "dev?"));
-
-	if (scsi_req(rq)->result >= ERROR_MAX) {
-		if (drive->failed_pc) {
-			ide_floppy_report_error(floppy, drive->failed_pc);
-			drive->failed_pc = NULL;
-		} else
-			printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
-
-		if (ata_misc_request(rq)) {
-			scsi_req(rq)->result = 0;
-			ide_complete_rq(drive, BLK_STS_OK, blk_rq_bytes(rq));
-			return ide_stopped;
-		} else
-			goto out_end;
-	}
-
-	switch (req_op(rq)) {
-	default:
-		if (((long)blk_rq_pos(rq) % floppy->bs_factor) ||
-		    (blk_rq_sectors(rq) % floppy->bs_factor)) {
-			printk(KERN_ERR PFX "%s: unsupported r/w rq size\n",
-				drive->name);
-			goto out_end;
-		}
-		pc = &floppy->queued_pc;
-		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
-		break;
-	case REQ_OP_SCSI_IN:
-	case REQ_OP_SCSI_OUT:
-		pc = &floppy->queued_pc;
-		idefloppy_blockpc_cmd(floppy, pc, rq);
-		break;
-	case REQ_OP_DRV_IN:
-	case REQ_OP_DRV_OUT:
-		switch (ide_req(rq)->type) {
-		case ATA_PRIV_MISC:
-		case ATA_PRIV_SENSE:
-			pc = (struct ide_atapi_pc *)ide_req(rq)->special;
-			break;
-		default:
-			BUG();
-		}
-	}
-
-	ide_prep_sense(drive, rq);
-
-	memset(&cmd, 0, sizeof(cmd));
-
-	if (rq_data_dir(rq))
-		cmd.tf_flags |= IDE_TFLAG_WRITE;
-
-	cmd.rq = rq;
-
-	if (!blk_rq_is_passthrough(rq) || blk_rq_bytes(rq)) {
-		ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
-		ide_map_sg(drive, &cmd);
-	}
-
-	pc->rq = rq;
-
-	return ide_floppy_issue_pc(drive, &cmd, pc);
-out_end:
-	drive->failed_pc = NULL;
-	if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
-		scsi_req(rq)->result = -EIO;
-	ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq));
-	return ide_stopped;
-}
-
-/*
- * Look at the flexible disk page parameters. We ignore the CHS capacity
- * parameters and use the LBA parameters instead.
- */
-static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive,
-					     struct ide_atapi_pc *pc)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	struct gendisk *disk = floppy->disk;
-	u8 *page, buf[40];
-	int capacity, lba_capacity;
-	u16 transfer_rate, sector_size, cyls, rpm;
-	u8 heads, sectors;
-
-	ide_floppy_create_mode_sense_cmd(pc, IDEFLOPPY_FLEXIBLE_DISK_PAGE);
-
-	if (ide_queue_pc_tail(drive, disk, pc, buf, pc->req_xfer)) {
-		printk(KERN_ERR PFX "Can't get flexible disk page params\n");
-		return 1;
-	}
-
-	if (buf[3] & 0x80)
-		drive->dev_flags |= IDE_DFLAG_WP;
-	else
-		drive->dev_flags &= ~IDE_DFLAG_WP;
-
-	set_disk_ro(disk, !!(drive->dev_flags & IDE_DFLAG_WP));
-
-	page = &buf[8];
-
-	transfer_rate = be16_to_cpup((__be16 *)&buf[8 + 2]);
-	sector_size   = be16_to_cpup((__be16 *)&buf[8 + 6]);
-	cyls          = be16_to_cpup((__be16 *)&buf[8 + 8]);
-	rpm           = be16_to_cpup((__be16 *)&buf[8 + 28]);
-	heads         = buf[8 + 4];
-	sectors       = buf[8 + 5];
-
-	capacity = cyls * heads * sectors * sector_size;
-
-	if (memcmp(page, &floppy->flexible_disk_page, 32))
-		printk(KERN_INFO PFX "%s: %dkB, %d/%d/%d CHS, %d kBps, "
-				"%d sector size, %d rpm\n",
-				drive->name, capacity / 1024, cyls, heads,
-				sectors, transfer_rate / 8, sector_size, rpm);
-
-	memcpy(&floppy->flexible_disk_page, page, 32);
-	drive->bios_cyl = cyls;
-	drive->bios_head = heads;
-	drive->bios_sect = sectors;
-	lba_capacity = floppy->blocks * floppy->block_size;
-
-	if (capacity < lba_capacity) {
-		printk(KERN_NOTICE PFX "%s: The disk reports a capacity of %d "
-			"bytes, but the drive only handles %d\n",
-			drive->name, lba_capacity, capacity);
-		floppy->blocks = floppy->block_size ?
-			capacity / floppy->block_size : 0;
-		drive->capacity64 = floppy->blocks * floppy->bs_factor;
-	}
-
-	return 0;
-}
-
-/*
- * Determine if a media is present in the floppy drive, and if so, its LBA
- * capacity.
- */
-static int ide_floppy_get_capacity(ide_drive_t *drive)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	struct gendisk *disk = floppy->disk;
-	struct ide_atapi_pc pc;
-	u8 *cap_desc;
-	u8 pc_buf[256], header_len, desc_cnt;
-	int i, rc = 1, blocks, length;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	drive->bios_cyl = 0;
-	drive->bios_head = drive->bios_sect = 0;
-	floppy->blocks = 0;
-	floppy->bs_factor = 1;
-	drive->capacity64 = 0;
-
-	ide_floppy_create_read_capacity_cmd(&pc);
-	if (ide_queue_pc_tail(drive, disk, &pc, pc_buf, pc.req_xfer)) {
-		printk(KERN_ERR PFX "Can't get floppy parameters\n");
-		return 1;
-	}
-	header_len = pc_buf[3];
-	cap_desc = &pc_buf[4];
-	desc_cnt = header_len / 8; /* capacity descriptor of 8 bytes */
-
-	for (i = 0; i < desc_cnt; i++) {
-		unsigned int desc_start = 4 + i*8;
-
-		blocks = be32_to_cpup((__be32 *)&pc_buf[desc_start]);
-		length = be16_to_cpup((__be16 *)&pc_buf[desc_start + 6]);
-
-		ide_debug_log(IDE_DBG_PROBE, "Descriptor %d: %dkB, %d blocks, "
-					     "%d sector size",
-					     i, blocks * length / 1024,
-					     blocks, length);
-
-		if (i)
-			continue;
-		/*
-		 * the code below is valid only for the 1st descriptor, ie i=0
-		 */
-
-		switch (pc_buf[desc_start + 4] & 0x03) {
-		/* Clik! drive returns this instead of CAPACITY_CURRENT */
-		case CAPACITY_UNFORMATTED:
-			if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE))
-				/*
-				 * If it is not a clik drive, break out
-				 * (maintains previous driver behaviour)
-				 */
-				break;
-			fallthrough;
-		case CAPACITY_CURRENT:
-			/* Normal Zip/LS-120 disks */
-			if (memcmp(cap_desc, &floppy->cap_desc, 8))
-				printk(KERN_INFO PFX "%s: %dkB, %d blocks, %d "
-				       "sector size\n",
-				       drive->name, blocks * length / 1024,
-				       blocks, length);
-			memcpy(&floppy->cap_desc, cap_desc, 8);
-
-			if (!length || length % 512) {
-				printk(KERN_NOTICE PFX "%s: %d bytes block size"
-				       " not supported\n", drive->name, length);
-			} else {
-				floppy->blocks = blocks;
-				floppy->block_size = length;
-				floppy->bs_factor = length / 512;
-				if (floppy->bs_factor != 1)
-					printk(KERN_NOTICE PFX "%s: Warning: "
-					       "non 512 bytes block size not "
-					       "fully supported\n",
-					       drive->name);
-				drive->capacity64 =
-					floppy->blocks * floppy->bs_factor;
-				rc = 0;
-			}
-			break;
-		case CAPACITY_NO_CARTRIDGE:
-			/*
-			 * This is a KERN_ERR so it appears on screen
-			 * for the user to see
-			 */
-			printk(KERN_ERR PFX "%s: No disk in drive\n",
-			       drive->name);
-			break;
-		case CAPACITY_INVALID:
-			printk(KERN_ERR PFX "%s: Invalid capacity for disk "
-				"in drive\n", drive->name);
-			break;
-		}
-		ide_debug_log(IDE_DBG_PROBE, "Descriptor 0 Code: %d",
-					     pc_buf[desc_start + 4] & 0x03);
-	}
-
-	/* Clik! disk does not support get_flexible_disk_page */
-	if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE))
-		(void) ide_floppy_get_flexible_disk_page(drive, &pc);
-
-	return rc;
-}
-
-static void ide_floppy_setup(ide_drive_t *drive)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	u16 *id = drive->id;
-
-	drive->pc_callback	 = ide_floppy_callback;
-
-	/*
-	 * We used to check revisions here. At this point however I'm giving up.
-	 * Just assume they are all broken, its easier.
-	 *
-	 * The actual reason for the workarounds was likely a driver bug after
-	 * all rather than a firmware bug, and the workaround below used to hide
-	 * it. It should be fixed as of version 1.9, but to be on the safe side
-	 * we'll leave the limitation below for the 2.2.x tree.
-	 */
-	if (strstarts((char *)&id[ATA_ID_PROD], "IOMEGA ZIP 100 ATAPI")) {
-		drive->atapi_flags |= IDE_AFLAG_ZIP_DRIVE;
-		/* This value will be visible in the /proc/ide/hdx/settings */
-		drive->pc_delay = IDEFLOPPY_PC_DELAY;
-		blk_queue_max_hw_sectors(drive->queue, 64);
-	}
-
-	/*
-	 * Guess what? The IOMEGA Clik! drive also needs the above fix. It makes
-	 * nasty clicking noises without it, so please don't remove this.
-	 */
-	if (strstarts((char *)&id[ATA_ID_PROD], "IOMEGA Clik!")) {
-		blk_queue_max_hw_sectors(drive->queue, 64);
-		drive->atapi_flags |= IDE_AFLAG_CLIK_DRIVE;
-		/* IOMEGA Clik! drives do not support lock/unlock commands */
-		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
-	}
-
-	(void) ide_floppy_get_capacity(drive);
-
-	ide_proc_register_driver(drive, floppy->driver);
-}
-
-static void ide_floppy_flush(ide_drive_t *drive)
-{
-}
-
-static int ide_floppy_init_media(ide_drive_t *drive, struct gendisk *disk)
-{
-	int ret = 0;
-
-	if (ide_do_test_unit_ready(drive, disk))
-		ide_do_start_stop(drive, disk, 1);
-
-	ret = ide_floppy_get_capacity(drive);
-
-	set_capacity(disk, ide_gd_capacity(drive));
-
-	return ret;
-}
-
-const struct ide_disk_ops ide_atapi_disk_ops = {
-	.check		= ide_check_atapi_device,
-	.get_capacity	= ide_floppy_get_capacity,
-	.setup		= ide_floppy_setup,
-	.flush		= ide_floppy_flush,
-	.init_media	= ide_floppy_init_media,
-	.set_doorlock	= ide_set_media_lock,
-	.do_request	= ide_floppy_do_request,
-	.ioctl		= ide_floppy_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= ide_floppy_compat_ioctl,
-#endif
-};
diff --git a/drivers/ide/ide-floppy.h b/drivers/ide/ide-floppy.h
deleted file mode 100644
index 8505a5f58f4e..000000000000
--- a/drivers/ide/ide-floppy.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __IDE_FLOPPY_H
-#define __IDE_FLOPPY_H
-
-#include "ide-gd.h"
-
-#ifdef CONFIG_IDE_GD_ATAPI
-/*
- * Pages of the SELECT SENSE / MODE SENSE packet commands.
- * See SFF-8070i spec.
- */
-#define	IDEFLOPPY_CAPABILITIES_PAGE	0x1b
-#define IDEFLOPPY_FLEXIBLE_DISK_PAGE	0x05
-
-/* IOCTLs used in low-level formatting. */
-#define	IDEFLOPPY_IOCTL_FORMAT_SUPPORTED	0x4600
-#define	IDEFLOPPY_IOCTL_FORMAT_GET_CAPACITY	0x4601
-#define	IDEFLOPPY_IOCTL_FORMAT_START		0x4602
-#define IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS	0x4603
-
-/* ide-floppy.c */
-extern const struct ide_disk_ops ide_atapi_disk_ops;
-void ide_floppy_create_mode_sense_cmd(struct ide_atapi_pc *, u8);
-void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *);
-
-/* ide-floppy_ioctl.c */
-int ide_floppy_ioctl(ide_drive_t *, struct block_device *, fmode_t,
-		     unsigned int, unsigned long);
-int ide_floppy_compat_ioctl(ide_drive_t *, struct block_device *, fmode_t,
-			    unsigned int, unsigned long);
-
-#ifdef CONFIG_IDE_PROC_FS
-/* ide-floppy_proc.c */
-extern ide_proc_entry_t ide_floppy_proc[];
-extern const struct ide_proc_devset ide_floppy_settings[];
-#endif
-#else
-#define ide_floppy_proc		NULL
-#define ide_floppy_settings	NULL
-#endif
-
-#endif /*__IDE_FLOPPY_H */
diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c
deleted file mode 100644
index 39a790ac6cc3..000000000000
--- a/drivers/ide/ide-floppy_ioctl.c
+++ /dev/null
@@ -1,339 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ide-floppy IOCTLs handling.
- */
-
-#include <linux/kernel.h>
-#include <linux/ide.h>
-#include <linux/compat.h>
-#include <linux/cdrom.h>
-#include <linux/mutex.h>
-
-#include <asm/unaligned.h>
-
-#include <scsi/scsi_ioctl.h>
-
-#include "ide-floppy.h"
-
-/*
- * Obtain the list of formattable capacities.
- * Very similar to ide_floppy_get_capacity, except that we push the capacity
- * descriptors to userland, instead of our own structures.
- *
- * Userland gives us the following structure:
- *
- * struct idefloppy_format_capacities {
- *	int nformats;
- *	struct {
- *		int nblocks;
- *		int blocksize;
- *	} formats[];
- * };
- *
- * userland initializes nformats to the number of allocated formats[] records.
- * On exit we set nformats to the number of records we've actually initialized.
- */
-
-static DEFINE_MUTEX(ide_floppy_ioctl_mutex);
-static int ide_floppy_get_format_capacities(ide_drive_t *drive,
-					    struct ide_atapi_pc *pc,
-					    int __user *arg)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	int i, blocks, length, u_array_size, u_index;
-	int __user *argp;
-	u8 pc_buf[256], header_len, desc_cnt;
-
-	if (get_user(u_array_size, arg))
-		return -EFAULT;
-
-	if (u_array_size <= 0)
-		return -EINVAL;
-
-	ide_floppy_create_read_capacity_cmd(pc);
-
-	if (ide_queue_pc_tail(drive, floppy->disk, pc, pc_buf, pc->req_xfer)) {
-		printk(KERN_ERR "ide-floppy: Can't get floppy parameters\n");
-		return -EIO;
-	}
-
-	header_len = pc_buf[3];
-	desc_cnt = header_len / 8; /* capacity descriptor of 8 bytes */
-
-	u_index = 0;
-	argp = arg + 1;
-
-	/*
-	 * We always skip the first capacity descriptor.  That's the current
-	 * capacity.  We are interested in the remaining descriptors, the
-	 * formattable capacities.
-	 */
-	for (i = 1; i < desc_cnt; i++) {
-		unsigned int desc_start = 4 + i*8;
-
-		if (u_index >= u_array_size)
-			break;	/* User-supplied buffer too small */
-
-		blocks = be32_to_cpup((__be32 *)&pc_buf[desc_start]);
-		length = be16_to_cpup((__be16 *)&pc_buf[desc_start + 6]);
-
-		if (put_user(blocks, argp))
-			return -EFAULT;
-
-		++argp;
-
-		if (put_user(length, argp))
-			return -EFAULT;
-
-		++argp;
-
-		++u_index;
-	}
-
-	if (put_user(u_index, arg))
-		return -EFAULT;
-
-	return 0;
-}
-
-static void ide_floppy_create_format_unit_cmd(struct ide_atapi_pc *pc,
-					      u8 *buf, int b, int l,
-					      int flags)
-{
-	ide_init_pc(pc);
-	pc->c[0] = GPCMD_FORMAT_UNIT;
-	pc->c[1] = 0x17;
-
-	memset(buf, 0, 12);
-	buf[1] = 0xA2;
-	/* Default format list header, u8 1: FOV/DCRT/IMM bits set */
-
-	if (flags & 1)				/* Verify bit on... */
-		buf[1] ^= 0x20;			/* ... turn off DCRT bit */
-	buf[3] = 8;
-
-	put_unaligned(cpu_to_be32(b), (unsigned int *)(&buf[4]));
-	put_unaligned(cpu_to_be32(l), (unsigned int *)(&buf[8]));
-	pc->req_xfer = 12;
-	pc->flags |= PC_FLAG_WRITING;
-}
-
-static int ide_floppy_get_sfrp_bit(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	u8 buf[20];
-
-	drive->atapi_flags &= ~IDE_AFLAG_SRFP;
-
-	ide_floppy_create_mode_sense_cmd(pc, IDEFLOPPY_CAPABILITIES_PAGE);
-	pc->flags |= PC_FLAG_SUPPRESS_ERROR;
-
-	if (ide_queue_pc_tail(drive, floppy->disk, pc, buf, pc->req_xfer))
-		return 1;
-
-	if (buf[8 + 2] & 0x40)
-		drive->atapi_flags |= IDE_AFLAG_SRFP;
-
-	return 0;
-}
-
-static int ide_floppy_format_unit(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				  int __user *arg)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	u8 buf[12];
-	int blocks, length, flags, err = 0;
-
-	if (floppy->openers > 1) {
-		/* Don't format if someone is using the disk */
-		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
-		return -EBUSY;
-	}
-
-	drive->dev_flags |= IDE_DFLAG_FORMAT_IN_PROGRESS;
-
-	/*
-	 * Send ATAPI_FORMAT_UNIT to the drive.
-	 *
-	 * Userland gives us the following structure:
-	 *
-	 * struct idefloppy_format_command {
-	 *        int nblocks;
-	 *        int blocksize;
-	 *        int flags;
-	 *        } ;
-	 *
-	 * flags is a bitmask, currently, the only defined flag is:
-	 *
-	 *        0x01 - verify media after format.
-	 */
-	if (get_user(blocks, arg) ||
-			get_user(length, arg+1) ||
-			get_user(flags, arg+2)) {
-		err = -EFAULT;
-		goto out;
-	}
-
-	ide_floppy_get_sfrp_bit(drive, pc);
-	ide_floppy_create_format_unit_cmd(pc, buf, blocks, length, flags);
-
-	if (ide_queue_pc_tail(drive, floppy->disk, pc, buf, pc->req_xfer))
-		err = -EIO;
-
-out:
-	if (err)
-		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
-	return err;
-}
-
-/*
- * Get ATAPI_FORMAT_UNIT progress indication.
- *
- * Userland gives a pointer to an int.  The int is set to a progress
- * indicator 0-65536, with 65536=100%.
- *
- * If the drive does not support format progress indication, we just check
- * the dsc bit, and return either 0 or 65536.
- */
-
-static int ide_floppy_get_format_progress(ide_drive_t *drive,
-					  struct ide_atapi_pc *pc,
-					  int __user *arg)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	u8 sense_buf[18];
-	int progress_indication = 0x10000;
-
-	if (drive->atapi_flags & IDE_AFLAG_SRFP) {
-		ide_create_request_sense_cmd(drive, pc);
-		if (ide_queue_pc_tail(drive, floppy->disk, pc, sense_buf,
-				      pc->req_xfer))
-			return -EIO;
-
-		if (floppy->sense_key == 2 &&
-		    floppy->asc == 4 &&
-		    floppy->ascq == 4)
-			progress_indication = floppy->progress_indication;
-
-		/* Else assume format_unit has finished, and we're at 0x10000 */
-	} else {
-		ide_hwif_t *hwif = drive->hwif;
-		unsigned long flags;
-		u8 stat;
-
-		local_irq_save(flags);
-		stat = hwif->tp_ops->read_status(hwif);
-		local_irq_restore(flags);
-
-		progress_indication = ((stat & ATA_DSC) == 0) ? 0 : 0x10000;
-	}
-
-	if (put_user(progress_indication, arg))
-		return -EFAULT;
-
-	return 0;
-}
-
-static int ide_floppy_lockdoor(ide_drive_t *drive, struct ide_atapi_pc *pc,
-			       unsigned long arg, unsigned int cmd)
-{
-	struct ide_disk_obj *floppy = drive->driver_data;
-	struct gendisk *disk = floppy->disk;
-	int prevent = (arg && cmd != CDROMEJECT) ? 1 : 0;
-
-	if (floppy->openers > 1)
-		return -EBUSY;
-
-	ide_set_media_lock(drive, disk, prevent);
-
-	if (cmd == CDROMEJECT)
-		ide_do_start_stop(drive, disk, 2);
-
-	return 0;
-}
-
-static int ide_floppy_format_ioctl(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				   fmode_t mode, unsigned int cmd,
-				   void __user *argp)
-{
-	switch (cmd) {
-	case IDEFLOPPY_IOCTL_FORMAT_SUPPORTED:
-		return 0;
-	case IDEFLOPPY_IOCTL_FORMAT_GET_CAPACITY:
-		return ide_floppy_get_format_capacities(drive, pc, argp);
-	case IDEFLOPPY_IOCTL_FORMAT_START:
-		if (!(mode & FMODE_WRITE))
-			return -EPERM;
-		return ide_floppy_format_unit(drive, pc, (int __user *)argp);
-	case IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS:
-		return ide_floppy_get_format_progress(drive, pc, argp);
-	default:
-		return -ENOTTY;
-	}
-}
-
-int ide_floppy_ioctl(ide_drive_t *drive, struct block_device *bdev,
-		     fmode_t mode, unsigned int cmd, unsigned long arg)
-{
-	struct ide_atapi_pc pc;
-	void __user *argp = (void __user *)arg;
-	int err;
-
-	mutex_lock(&ide_floppy_ioctl_mutex);
-	if (cmd == CDROMEJECT || cmd == CDROM_LOCKDOOR) {
-		err = ide_floppy_lockdoor(drive, &pc, arg, cmd);
-		goto out;
-	}
-
-	err = ide_floppy_format_ioctl(drive, &pc, mode, cmd, argp);
-	if (err != -ENOTTY)
-		goto out;
-
-	/*
-	 * skip SCSI_IOCTL_SEND_COMMAND (deprecated)
-	 * and CDROM_SEND_PACKET (legacy) ioctls
-	 */
-	if (cmd != CDROM_SEND_PACKET && cmd != SCSI_IOCTL_SEND_COMMAND)
-		err = scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
-
-	if (err == -ENOTTY)
-		err = generic_ide_ioctl(drive, bdev, cmd, arg);
-
-out:
-	mutex_unlock(&ide_floppy_ioctl_mutex);
-	return err;
-}
-
-#ifdef CONFIG_COMPAT
-int ide_floppy_compat_ioctl(ide_drive_t *drive, struct block_device *bdev,
-			    fmode_t mode, unsigned int cmd, unsigned long arg)
-{
-	struct ide_atapi_pc pc;
-	void __user *argp = compat_ptr(arg);
-	int err;
-
-	mutex_lock(&ide_floppy_ioctl_mutex);
-	if (cmd == CDROMEJECT || cmd == CDROM_LOCKDOOR) {
-		err = ide_floppy_lockdoor(drive, &pc, arg, cmd);
-		goto out;
-	}
-
-	err = ide_floppy_format_ioctl(drive, &pc, mode, cmd, argp);
-	if (err != -ENOTTY)
-		goto out;
-
-	/*
-	 * skip SCSI_IOCTL_SEND_COMMAND (deprecated)
-	 * and CDROM_SEND_PACKET (legacy) ioctls
-	 */
-	if (cmd != CDROM_SEND_PACKET && cmd != SCSI_IOCTL_SEND_COMMAND)
-		err = scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
-
-	if (err == -ENOTTY)
-		err = generic_ide_ioctl(drive, bdev, cmd, arg);
-
-out:
-	mutex_unlock(&ide_floppy_ioctl_mutex);
-	return err;
-}
-#endif
diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c
deleted file mode 100644
index 7f697ddb5fe5..000000000000
--- a/drivers/ide/ide-floppy_proc.c
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ide.h>
-#include <linux/seq_file.h>
-
-#include "ide-floppy.h"
-
-static int idefloppy_capacity_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t*drive = (ide_drive_t *)m->private;
-
-	seq_printf(m, "%llu\n", (long long)ide_gd_capacity(drive));
-	return 0;
-}
-
-ide_proc_entry_t ide_floppy_proc[] = {
-	{ "capacity",	S_IFREG|S_IRUGO, idefloppy_capacity_proc_show	},
-	{ "geometry",	S_IFREG|S_IRUGO, ide_geometry_proc_show		},
-	{}
-};
-
-ide_devset_rw_field(bios_cyl, bios_cyl);
-ide_devset_rw_field(bios_head, bios_head);
-ide_devset_rw_field(bios_sect, bios_sect);
-ide_devset_rw_field(ticks, pc_delay);
-
-const struct ide_proc_devset ide_floppy_settings[] = {
-	IDE_PROC_DEVSET(bios_cyl,  0, 1023),
-	IDE_PROC_DEVSET(bios_head, 0,  255),
-	IDE_PROC_DEVSET(bios_sect, 0,   63),
-	IDE_PROC_DEVSET(ticks,	   0,  255),
-	{ NULL },
-};
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
deleted file mode 100644
index e2b6c82586ce..000000000000
--- a/drivers/ide/ide-gd.c
+++ /dev/null
@@ -1,432 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/mutex.h>
-#include <linux/ide.h>
-#include <linux/hdreg.h>
-#include <linux/dmi.h>
-#include <linux/slab.h>
-
-#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
-#define IDE_DISK_MINORS		(1 << PARTN_BITS)
-#else
-#define IDE_DISK_MINORS		0
-#endif
-
-#include "ide-disk.h"
-#include "ide-floppy.h"
-
-#define IDE_GD_VERSION	"1.18"
-
-/* module parameters */
-static DEFINE_MUTEX(ide_gd_mutex);
-static unsigned long debug_mask;
-module_param(debug_mask, ulong, 0644);
-
-static DEFINE_MUTEX(ide_disk_ref_mutex);
-
-static void ide_disk_release(struct device *);
-
-static struct ide_disk_obj *ide_disk_get(struct gendisk *disk)
-{
-	struct ide_disk_obj *idkp = NULL;
-
-	mutex_lock(&ide_disk_ref_mutex);
-	idkp = ide_drv_g(disk, ide_disk_obj);
-	if (idkp) {
-		if (ide_device_get(idkp->drive))
-			idkp = NULL;
-		else
-			get_device(&idkp->dev);
-	}
-	mutex_unlock(&ide_disk_ref_mutex);
-	return idkp;
-}
-
-static void ide_disk_put(struct ide_disk_obj *idkp)
-{
-	ide_drive_t *drive = idkp->drive;
-
-	mutex_lock(&ide_disk_ref_mutex);
-	put_device(&idkp->dev);
-	ide_device_put(drive);
-	mutex_unlock(&ide_disk_ref_mutex);
-}
-
-sector_t ide_gd_capacity(ide_drive_t *drive)
-{
-	return drive->capacity64;
-}
-
-static int ide_gd_probe(ide_drive_t *);
-
-static void ide_gd_remove(ide_drive_t *drive)
-{
-	struct ide_disk_obj *idkp = drive->driver_data;
-	struct gendisk *g = idkp->disk;
-
-	ide_proc_unregister_driver(drive, idkp->driver);
-	device_del(&idkp->dev);
-	del_gendisk(g);
-	drive->disk_ops->flush(drive);
-
-	mutex_lock(&ide_disk_ref_mutex);
-	put_device(&idkp->dev);
-	mutex_unlock(&ide_disk_ref_mutex);
-}
-
-static void ide_disk_release(struct device *dev)
-{
-	struct ide_disk_obj *idkp = to_ide_drv(dev, ide_disk_obj);
-	ide_drive_t *drive = idkp->drive;
-	struct gendisk *g = idkp->disk;
-
-	drive->disk_ops = NULL;
-	drive->driver_data = NULL;
-	g->private_data = NULL;
-	put_disk(g);
-	kfree(idkp);
-}
-
-/*
- * On HPA drives the capacity needs to be
- * reinitialized on resume otherwise the disk
- * can not be used and a hard reset is required
- */
-static void ide_gd_resume(ide_drive_t *drive)
-{
-	if (ata_id_hpa_enabled(drive->id))
-		(void)drive->disk_ops->get_capacity(drive);
-}
-
-static const struct dmi_system_id ide_coldreboot_table[] = {
-	{
-		/* Acer TravelMate 66x cuts power during reboot */
-		.ident   = "Acer TravelMate 660",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 660"),
-		},
-	},
-
-	{ }	/* terminate list */
-};
-
-static void ide_gd_shutdown(ide_drive_t *drive)
-{
-#ifdef	CONFIG_ALPHA
-	/* On Alpha, halt(8) doesn't actually turn the machine off,
-	   it puts you into the sort of firmware monitor. Typically,
-	   it's used to boot another kernel image, so it's not much
-	   different from reboot(8). Therefore, we don't need to
-	   spin down the disk in this case, especially since Alpha
-	   firmware doesn't handle disks in standby mode properly.
-	   On the other hand, it's reasonably safe to turn the power
-	   off when the shutdown process reaches the firmware prompt,
-	   as the firmware initialization takes rather long time -
-	   at least 10 seconds, which should be sufficient for
-	   the disk to expire its write cache. */
-	if (system_state != SYSTEM_POWER_OFF) {
-#else
-	if (system_state == SYSTEM_RESTART &&
-		!dmi_check_system(ide_coldreboot_table)) {
-#endif
-		drive->disk_ops->flush(drive);
-		return;
-	}
-
-	printk(KERN_INFO "Shutdown: %s\n", drive->name);
-
-	drive->gendev.bus->suspend(&drive->gendev, PMSG_SUSPEND);
-}
-
-#ifdef CONFIG_IDE_PROC_FS
-static ide_proc_entry_t *ide_disk_proc_entries(ide_drive_t *drive)
-{
-	return (drive->media == ide_disk) ? ide_disk_proc : ide_floppy_proc;
-}
-
-static const struct ide_proc_devset *ide_disk_proc_devsets(ide_drive_t *drive)
-{
-	return (drive->media == ide_disk) ? ide_disk_settings
-					  : ide_floppy_settings;
-}
-#endif
-
-static ide_startstop_t ide_gd_do_request(ide_drive_t *drive,
-					 struct request *rq, sector_t sector)
-{
-	return drive->disk_ops->do_request(drive, rq, sector);
-}
-
-static struct ide_driver ide_gd_driver = {
-	.gen_driver = {
-		.owner		= THIS_MODULE,
-		.name		= "ide-gd",
-		.bus		= &ide_bus_type,
-	},
-	.probe			= ide_gd_probe,
-	.remove			= ide_gd_remove,
-	.resume			= ide_gd_resume,
-	.shutdown		= ide_gd_shutdown,
-	.version		= IDE_GD_VERSION,
-	.do_request		= ide_gd_do_request,
-#ifdef CONFIG_IDE_PROC_FS
-	.proc_entries		= ide_disk_proc_entries,
-	.proc_devsets		= ide_disk_proc_devsets,
-#endif
-};
-
-static int ide_gd_open(struct block_device *bdev, fmode_t mode)
-{
-	struct gendisk *disk = bdev->bd_disk;
-	struct ide_disk_obj *idkp;
-	ide_drive_t *drive;
-	int ret = 0;
-
-	idkp = ide_disk_get(disk);
-	if (idkp == NULL)
-		return -ENXIO;
-
-	drive = idkp->drive;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	idkp->openers++;
-
-	if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) {
-		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
-		/* Just in case */
-
-		ret = drive->disk_ops->init_media(drive, disk);
-
-		/*
-		 * Allow O_NDELAY to open a drive without a disk, or with an
-		 * unreadable disk, so that we can get the format capacity
-		 * of the drive or begin the format - Sam
-		 */
-		if (ret && (mode & FMODE_NDELAY) == 0) {
-			ret = -EIO;
-			goto out_put_idkp;
-		}
-
-		if ((drive->dev_flags & IDE_DFLAG_WP) && (mode & FMODE_WRITE)) {
-			ret = -EROFS;
-			goto out_put_idkp;
-		}
-
-		/*
-		 * Ignore the return code from door_lock,
-		 * since the open() has already succeeded,
-		 * and the door_lock is irrelevant at this point.
-		 */
-		drive->disk_ops->set_doorlock(drive, disk, 1);
-		if (__invalidate_device(bdev, true))
-			pr_warn("VFS: busy inodes on changed media %s\n",
-				bdev->bd_disk->disk_name);
-		drive->disk_ops->get_capacity(drive);
-		set_capacity(disk, ide_gd_capacity(drive));
-		set_bit(GD_NEED_PART_SCAN, &disk->state);
-	} else if (drive->dev_flags & IDE_DFLAG_FORMAT_IN_PROGRESS) {
-		ret = -EBUSY;
-		goto out_put_idkp;
-	}
-	return 0;
-
-out_put_idkp:
-	idkp->openers--;
-	ide_disk_put(idkp);
-	return ret;
-}
-
-static int ide_gd_unlocked_open(struct block_device *bdev, fmode_t mode)
-{
-	int ret;
-
-	mutex_lock(&ide_gd_mutex);
-	ret = ide_gd_open(bdev, mode);
-	mutex_unlock(&ide_gd_mutex);
-
-	return ret;
-}
-
-
-static void ide_gd_release(struct gendisk *disk, fmode_t mode)
-{
-	struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
-	ide_drive_t *drive = idkp->drive;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	mutex_lock(&ide_gd_mutex);
-	if (idkp->openers == 1)
-		drive->disk_ops->flush(drive);
-
-	if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) {
-		drive->disk_ops->set_doorlock(drive, disk, 0);
-		drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS;
-	}
-
-	idkp->openers--;
-
-	ide_disk_put(idkp);
-	mutex_unlock(&ide_gd_mutex);
-}
-
-static int ide_gd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj);
-	ide_drive_t *drive = idkp->drive;
-
-	geo->heads = drive->bios_head;
-	geo->sectors = drive->bios_sect;
-	geo->cylinders = (u16)drive->bios_cyl; /* truncate */
-	return 0;
-}
-
-static void ide_gd_unlock_native_capacity(struct gendisk *disk)
-{
-	struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
-	ide_drive_t *drive = idkp->drive;
-	const struct ide_disk_ops *disk_ops = drive->disk_ops;
-
-	if (disk_ops->unlock_native_capacity)
-		disk_ops->unlock_native_capacity(drive);
-}
-
-static int ide_gd_ioctl(struct block_device *bdev, fmode_t mode,
-			     unsigned int cmd, unsigned long arg)
-{
-	struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj);
-	ide_drive_t *drive = idkp->drive;
-
-	return drive->disk_ops->ioctl(drive, bdev, mode, cmd, arg);
-}
-
-#ifdef CONFIG_COMPAT
-static int ide_gd_compat_ioctl(struct block_device *bdev, fmode_t mode,
-			       unsigned int cmd, unsigned long arg)
-{
-	struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj);
-	ide_drive_t *drive = idkp->drive;
-
-	if (!drive->disk_ops->compat_ioctl)
-		return -ENOIOCTLCMD;
-
-	return drive->disk_ops->compat_ioctl(drive, bdev, mode, cmd, arg);
-}
-#endif
-
-static const struct block_device_operations ide_gd_ops = {
-	.owner			= THIS_MODULE,
-	.open			= ide_gd_unlocked_open,
-	.release		= ide_gd_release,
-	.ioctl			= ide_gd_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl		= ide_gd_compat_ioctl,
-#endif
-	.getgeo			= ide_gd_getgeo,
-	.unlock_native_capacity	= ide_gd_unlock_native_capacity,
-};
-
-static int ide_gd_probe(ide_drive_t *drive)
-{
-	const struct ide_disk_ops *disk_ops = NULL;
-	struct ide_disk_obj *idkp;
-	struct gendisk *g;
-
-	/* strstr("foo", "") is non-NULL */
-	if (!strstr("ide-gd", drive->driver_req))
-		goto failed;
-
-#ifdef CONFIG_IDE_GD_ATA
-	if (drive->media == ide_disk)
-		disk_ops = &ide_ata_disk_ops;
-#endif
-#ifdef CONFIG_IDE_GD_ATAPI
-	if (drive->media == ide_floppy)
-		disk_ops = &ide_atapi_disk_ops;
-#endif
-	if (disk_ops == NULL)
-		goto failed;
-
-	if (disk_ops->check(drive, DRV_NAME) == 0) {
-		printk(KERN_ERR PFX "%s: not supported by this driver\n",
-			drive->name);
-		goto failed;
-	}
-
-	idkp = kzalloc(sizeof(*idkp), GFP_KERNEL);
-	if (!idkp) {
-		printk(KERN_ERR PFX "%s: can't allocate a disk structure\n",
-			drive->name);
-		goto failed;
-	}
-
-	g = alloc_disk_node(IDE_DISK_MINORS, hwif_to_node(drive->hwif));
-	if (!g)
-		goto out_free_idkp;
-
-	ide_init_disk(g, drive);
-
-	idkp->dev.parent = &drive->gendev;
-	idkp->dev.release = ide_disk_release;
-	dev_set_name(&idkp->dev, "%s", dev_name(&drive->gendev));
-
-	if (device_register(&idkp->dev))
-		goto out_free_disk;
-
-	idkp->drive = drive;
-	idkp->driver = &ide_gd_driver;
-	idkp->disk = g;
-
-	g->private_data = &idkp->driver;
-
-	drive->driver_data = idkp;
-	drive->debug_mask = debug_mask;
-	drive->disk_ops = disk_ops;
-
-	disk_ops->setup(drive);
-
-	set_capacity(g, ide_gd_capacity(drive));
-
-	g->minors = IDE_DISK_MINORS;
-	g->flags |= GENHD_FL_EXT_DEVT;
-	if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
-		g->flags = GENHD_FL_REMOVABLE;
-	g->fops = &ide_gd_ops;
-	g->events = DISK_EVENT_MEDIA_CHANGE;
-	device_add_disk(&drive->gendev, g, NULL);
-	return 0;
-
-out_free_disk:
-	put_disk(g);
-out_free_idkp:
-	kfree(idkp);
-failed:
-	return -ENODEV;
-}
-
-static int __init ide_gd_init(void)
-{
-	printk(KERN_INFO DRV_NAME " driver " IDE_GD_VERSION "\n");
-	return driver_register(&ide_gd_driver.gen_driver);
-}
-
-static void __exit ide_gd_exit(void)
-{
-	driver_unregister(&ide_gd_driver.gen_driver);
-}
-
-MODULE_ALIAS("ide:*m-disk*");
-MODULE_ALIAS("ide-disk");
-MODULE_ALIAS("ide:*m-floppy*");
-MODULE_ALIAS("ide-floppy");
-module_init(ide_gd_init);
-module_exit(ide_gd_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("generic ATA/ATAPI disk driver");
diff --git a/drivers/ide/ide-gd.h b/drivers/ide/ide-gd.h
deleted file mode 100644
index af3fe1880e9e..000000000000
--- a/drivers/ide/ide-gd.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __IDE_GD_H
-#define __IDE_GD_H
-
-#define DRV_NAME "ide-gd"
-#define PFX DRV_NAME ": "
-
-/* define to see debug info */
-#define IDE_GD_DEBUG_LOG	0
-
-#if IDE_GD_DEBUG_LOG
-#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, ## args)
-#else
-#define ide_debug_log(lvl, fmt, args...) do {} while (0)
-#endif
-
-struct ide_disk_obj {
-	ide_drive_t		*drive;
-	struct ide_driver	*driver;
-	struct gendisk		*disk;
-	struct device		dev;
-	unsigned int		openers;	/* protected by BKL for now */
-
-	/* used for blk_{fs,pc}_request() requests */
-	struct ide_atapi_pc queued_pc;
-
-	/* Last error information */
-	u8 sense_key, asc, ascq;
-
-	int progress_indication;
-
-	/* Device information */
-	/* Current format */
-	int blocks, block_size, bs_factor;
-	/* Last format capacity descriptor */
-	u8 cap_desc[8];
-	/* Copy of the flexible disk page */
-	u8 flexible_disk_page[32];
-};
-
-sector_t ide_gd_capacity(ide_drive_t *);
-
-#endif /* __IDE_GD_H */
diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c
deleted file mode 100644
index 80c0d69b83ac..000000000000
--- a/drivers/ide/ide-generic.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * generic/default IDE host driver
- *
- * Copyright (C) 2004, 2008-2009 Bartlomiej Zolnierkiewicz
- * This code was split off from ide.c.  See it for original copyrights.
- *
- * May be copied or modified under the terms of the GNU General Public License.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/ide.h>
-#include <linux/pci_ids.h>
-
-/* FIXME: convert arm to use ide_platform host driver */
-#ifdef CONFIG_ARM
-#include <asm/irq.h>
-#endif
-
-#define DRV_NAME	"ide_generic"
-
-static int probe_mask;
-module_param(probe_mask, int, 0);
-MODULE_PARM_DESC(probe_mask, "probe mask for legacy ISA IDE ports");
-
-static const struct ide_port_info ide_generic_port_info = {
-	.host_flags		= IDE_HFLAG_NO_DMA,
-	.chipset		= ide_generic,
-};
-
-#ifdef CONFIG_ARM
-static const u16 legacy_bases[] = { 0x1f0 };
-static const int legacy_irqs[]  = { IRQ_HARDDISK };
-#elif defined(CONFIG_ALPHA)
-static const u16 legacy_bases[] = { 0x1f0, 0x170, 0x1e8, 0x168 };
-static const int legacy_irqs[]  = { 14, 15, 11, 10 };
-#else
-static const u16 legacy_bases[] = { 0x1f0, 0x170, 0x1e8, 0x168, 0x1e0, 0x160 };
-static const int legacy_irqs[]  = { 14, 15, 11, 10, 8, 12 };
-#endif
-
-static void ide_generic_check_pci_legacy_iobases(int *primary, int *secondary)
-{
-#ifdef CONFIG_PCI
-	struct pci_dev *p = NULL;
-	u16 val;
-
-	for_each_pci_dev(p) {
-		if (pci_resource_start(p, 0) == 0x1f0)
-			*primary = 1;
-		if (pci_resource_start(p, 2) == 0x170)
-			*secondary = 1;
-
-		/* Cyrix CS55{1,2}0 pre SFF MWDMA ATA on the bridge */
-		if (p->vendor == PCI_VENDOR_ID_CYRIX &&
-		    (p->device == PCI_DEVICE_ID_CYRIX_5510 ||
-		     p->device == PCI_DEVICE_ID_CYRIX_5520))
-			*primary = *secondary = 1;
-
-		/* Intel MPIIX - PIO ATA on non PCI side of bridge */
-		if (p->vendor == PCI_VENDOR_ID_INTEL &&
-		    p->device == PCI_DEVICE_ID_INTEL_82371MX) {
-			pci_read_config_word(p, 0x6C, &val);
-			if (val & 0x8000) {
-				/* ATA port enabled */
-				if (val & 0x4000)
-					*secondary = 1;
-				else
-					*primary = 1;
-			}
-		}
-	}
-#endif
-}
-
-static int __init ide_generic_init(void)
-{
-	struct ide_hw hw, *hws[] = { &hw };
-	unsigned long io_addr;
-	int i, rc = 0, primary = 0, secondary = 0;
-
-	ide_generic_check_pci_legacy_iobases(&primary, &secondary);
-
-	if (!probe_mask) {
-		printk(KERN_INFO DRV_NAME ": please use \"probe_mask=0x3f\" "
-		     "module parameter for probing all legacy ISA IDE ports\n");
-
-		if (primary == 0)
-			probe_mask |= 0x1;
-
-		if (secondary == 0)
-			probe_mask |= 0x2;
-	} else
-		printk(KERN_INFO DRV_NAME ": enforcing probing of I/O ports "
-			"upon user request\n");
-
-	for (i = 0; i < ARRAY_SIZE(legacy_bases); i++) {
-		io_addr = legacy_bases[i];
-
-		if ((probe_mask & (1 << i)) && io_addr) {
-			if (!request_region(io_addr, 8, DRV_NAME)) {
-				printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX "
-						"not free.\n",
-						DRV_NAME, io_addr, io_addr + 7);
-				rc = -EBUSY;
-				continue;
-			}
-
-			if (!request_region(io_addr + 0x206, 1, DRV_NAME)) {
-				printk(KERN_ERR "%s: I/O resource 0x%lX "
-						"not free.\n",
-						DRV_NAME, io_addr + 0x206);
-				release_region(io_addr, 8);
-				rc = -EBUSY;
-				continue;
-			}
-
-			memset(&hw, 0, sizeof(hw));
-			ide_std_init_ports(&hw, io_addr, io_addr + 0x206);
-#ifdef CONFIG_IA64
-			hw.irq = isa_irq_to_vector(legacy_irqs[i]);
-#else
-			hw.irq = legacy_irqs[i];
-#endif
-			rc = ide_host_add(&ide_generic_port_info, hws, 1, NULL);
-			if (rc) {
-				release_region(io_addr + 0x206, 1);
-				release_region(io_addr, 8);
-			}
-		}
-	}
-
-	return rc;
-}
-
-module_init(ide_generic_init);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
deleted file mode 100644
index 94bdcf1ea186..000000000000
--- a/drivers/ide/ide-io-std.c
+++ /dev/null
@@ -1,262 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ide.h>
-
-#if defined(CONFIG_ARM) || defined(CONFIG_M68K) || defined(CONFIG_MIPS) || \
-    defined(CONFIG_PARISC) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
-#include <asm/ide.h>
-#else
-#include <asm-generic/ide_iops.h>
-#endif
-
-/*
- *	Conventional PIO operations for ATA devices
- */
-
-static u8 ide_inb(unsigned long port)
-{
-	return (u8) inb(port);
-}
-
-static void ide_outb(u8 val, unsigned long port)
-{
-	outb(val, port);
-}
-
-/*
- *	MMIO operations, typically used for SATA controllers
- */
-
-static u8 ide_mm_inb(unsigned long port)
-{
-	return (u8) readb((void __iomem *) port);
-}
-
-static void ide_mm_outb(u8 value, unsigned long port)
-{
-	writeb(value, (void __iomem *) port);
-}
-
-void ide_exec_command(ide_hwif_t *hwif, u8 cmd)
-{
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		writeb(cmd, (void __iomem *)hwif->io_ports.command_addr);
-	else
-		outb(cmd, hwif->io_ports.command_addr);
-}
-EXPORT_SYMBOL_GPL(ide_exec_command);
-
-u8 ide_read_status(ide_hwif_t *hwif)
-{
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		return readb((void __iomem *)hwif->io_ports.status_addr);
-	else
-		return inb(hwif->io_ports.status_addr);
-}
-EXPORT_SYMBOL_GPL(ide_read_status);
-
-u8 ide_read_altstatus(ide_hwif_t *hwif)
-{
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		return readb((void __iomem *)hwif->io_ports.ctl_addr);
-	else
-		return inb(hwif->io_ports.ctl_addr);
-}
-EXPORT_SYMBOL_GPL(ide_read_altstatus);
-
-void ide_write_devctl(ide_hwif_t *hwif, u8 ctl)
-{
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr);
-	else
-		outb(ctl, hwif->io_ports.ctl_addr);
-}
-EXPORT_SYMBOL_GPL(ide_write_devctl);
-
-void ide_dev_select(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 select = drive->select | ATA_DEVICE_OBS;
-
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		writeb(select, (void __iomem *)hwif->io_ports.device_addr);
-	else
-		outb(select, hwif->io_ports.device_addr);
-}
-EXPORT_SYMBOL_GPL(ide_dev_select);
-
-void ide_tf_load(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_io_ports *io_ports = &hwif->io_ports;
-	void (*tf_outb)(u8 addr, unsigned long port);
-	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
-
-	if (mmio)
-		tf_outb = ide_mm_outb;
-	else
-		tf_outb = ide_outb;
-
-	if (valid & IDE_VALID_FEATURE)
-		tf_outb(tf->feature, io_ports->feature_addr);
-	if (valid & IDE_VALID_NSECT)
-		tf_outb(tf->nsect, io_ports->nsect_addr);
-	if (valid & IDE_VALID_LBAL)
-		tf_outb(tf->lbal, io_ports->lbal_addr);
-	if (valid & IDE_VALID_LBAM)
-		tf_outb(tf->lbam, io_ports->lbam_addr);
-	if (valid & IDE_VALID_LBAH)
-		tf_outb(tf->lbah, io_ports->lbah_addr);
-	if (valid & IDE_VALID_DEVICE)
-		tf_outb(tf->device, io_ports->device_addr);
-}
-EXPORT_SYMBOL_GPL(ide_tf_load);
-
-void ide_tf_read(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_io_ports *io_ports = &hwif->io_ports;
-	u8 (*tf_inb)(unsigned long port);
-	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
-
-	if (mmio)
-		tf_inb  = ide_mm_inb;
-	else
-		tf_inb  = ide_inb;
-
-	if (valid & IDE_VALID_ERROR)
-		tf->error  = tf_inb(io_ports->feature_addr);
-	if (valid & IDE_VALID_NSECT)
-		tf->nsect  = tf_inb(io_ports->nsect_addr);
-	if (valid & IDE_VALID_LBAL)
-		tf->lbal   = tf_inb(io_ports->lbal_addr);
-	if (valid & IDE_VALID_LBAM)
-		tf->lbam   = tf_inb(io_ports->lbam_addr);
-	if (valid & IDE_VALID_LBAH)
-		tf->lbah   = tf_inb(io_ports->lbah_addr);
-	if (valid & IDE_VALID_DEVICE)
-		tf->device = tf_inb(io_ports->device_addr);
-}
-EXPORT_SYMBOL_GPL(ide_tf_read);
-
-/*
- * Some localbus EIDE interfaces require a special access sequence
- * when using 32-bit I/O instructions to transfer data.  We call this
- * the "vlb_sync" sequence, which consists of three successive reads
- * of the sector count register location, with interrupts disabled
- * to ensure that the reads all happen together.
- */
-static void ata_vlb_sync(unsigned long port)
-{
-	(void)inb(port);
-	(void)inb(port);
-	(void)inb(port);
-}
-
-/*
- * This is used for most PIO data transfers *from* the IDE interface
- *
- * These routines will round up any request for an odd number of bytes,
- * so if an odd len is specified, be sure that there's at least one
- * extra byte allocated for the buffer.
- */
-void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
-		    unsigned int len)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_io_ports *io_ports = &hwif->io_ports;
-	unsigned long data_addr = io_ports->data_addr;
-	unsigned int words = (len + 1) >> 1;
-	u8 io_32bit = drive->io_32bit;
-	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
-
-	if (io_32bit) {
-		unsigned long flags;
-
-		if ((io_32bit & 2) && !mmio) {
-			local_irq_save(flags);
-			ata_vlb_sync(io_ports->nsect_addr);
-		}
-
-		words >>= 1;
-		if (mmio)
-			__ide_mm_insl((void __iomem *)data_addr, buf, words);
-		else
-			insl(data_addr, buf, words);
-
-		if ((io_32bit & 2) && !mmio)
-			local_irq_restore(flags);
-
-		if (((len + 1) & 3) < 2)
-			return;
-
-		buf += len & ~3;
-		words = 1;
-	}
-
-	if (mmio)
-		__ide_mm_insw((void __iomem *)data_addr, buf, words);
-	else
-		insw(data_addr, buf, words);
-}
-EXPORT_SYMBOL_GPL(ide_input_data);
-
-/*
- * This is used for most PIO data transfers *to* the IDE interface
- */
-void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
-		     unsigned int len)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_io_ports *io_ports = &hwif->io_ports;
-	unsigned long data_addr = io_ports->data_addr;
-	unsigned int words = (len + 1) >> 1;
-	u8 io_32bit = drive->io_32bit;
-	u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
-
-	if (io_32bit) {
-		unsigned long flags;
-
-		if ((io_32bit & 2) && !mmio) {
-			local_irq_save(flags);
-			ata_vlb_sync(io_ports->nsect_addr);
-		}
-
-		words >>= 1;
-		if (mmio)
-			__ide_mm_outsl((void __iomem *)data_addr, buf, words);
-		else
-			outsl(data_addr, buf, words);
-
-		if ((io_32bit & 2) && !mmio)
-			local_irq_restore(flags);
-
-		if (((len + 1) & 3) < 2)
-			return;
-
-		buf += len & ~3;
-		words = 1;
-	}
-
-	if (mmio)
-		__ide_mm_outsw((void __iomem *)data_addr, buf, words);
-	else
-		outsw(data_addr, buf, words);
-}
-EXPORT_SYMBOL_GPL(ide_output_data);
-
-const struct ide_tp_ops default_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= ide_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
deleted file mode 100644
index 4867b67b60d6..000000000000
--- a/drivers/ide/ide-io.c
+++ /dev/null
@@ -1,904 +0,0 @@
-/*
- *	IDE I/O functions
- *
- *	Basic PIO and command management functionality.
- *
- * This code was split off from ide.c. See ide.c for history and original
- * copyrights.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * For the avoidance of doubt the "preferred form" of this code is one which
- * is in an open non patent encumbered format. Where cryptographic key signing
- * forms part of the process of creating an executable the information
- * including keys needed to generate an equivalently functional executable
- * are deemed to be part of the source code.
- */
- 
- 
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/major.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/blkpg.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/ide.h>
-#include <linux/completion.h>
-#include <linux/reboot.h>
-#include <linux/cdrom.h>
-#include <linux/seq_file.h>
-#include <linux/device.h>
-#include <linux/kmod.h>
-#include <linux/scatterlist.h>
-#include <linux/bitops.h>
-
-#include <asm/byteorder.h>
-#include <asm/irq.h>
-#include <linux/uaccess.h>
-#include <asm/io.h>
-
-int ide_end_rq(ide_drive_t *drive, struct request *rq, blk_status_t error,
-	       unsigned int nr_bytes)
-{
-	/*
-	 * decide whether to reenable DMA -- 3 is a random magic for now,
-	 * if we DMA timeout more than 3 times, just stay in PIO
-	 */
-	if ((drive->dev_flags & IDE_DFLAG_DMA_PIO_RETRY) &&
-	    drive->retry_pio <= 3) {
-		drive->dev_flags &= ~IDE_DFLAG_DMA_PIO_RETRY;
-		ide_dma_on(drive);
-	}
-
-	if (!blk_update_request(rq, error, nr_bytes)) {
-		if (rq == drive->sense_rq) {
-			drive->sense_rq = NULL;
-			drive->sense_rq_active = false;
-		}
-
-		__blk_mq_end_request(rq, error);
-		return 0;
-	}
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(ide_end_rq);
-
-void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
-{
-	const struct ide_tp_ops *tp_ops = drive->hwif->tp_ops;
-	struct ide_taskfile *tf = &cmd->tf;
-	struct request *rq = cmd->rq;
-	u8 tf_cmd = tf->command;
-
-	tf->error = err;
-	tf->status = stat;
-
-	if (cmd->ftf_flags & IDE_FTFLAG_IN_DATA) {
-		u8 data[2];
-
-		tp_ops->input_data(drive, cmd, data, 2);
-
-		cmd->tf.data  = data[0];
-		cmd->hob.data = data[1];
-	}
-
-	ide_tf_readback(drive, cmd);
-
-	if ((cmd->tf_flags & IDE_TFLAG_CUSTOM_HANDLER) &&
-	    tf_cmd == ATA_CMD_IDLEIMMEDIATE) {
-		if (tf->lbal != 0xc4) {
-			printk(KERN_ERR "%s: head unload failed!\n",
-			       drive->name);
-			ide_tf_dump(drive->name, cmd);
-		} else
-			drive->dev_flags |= IDE_DFLAG_PARKED;
-	}
-
-	if (rq && ata_taskfile_request(rq)) {
-		struct ide_cmd *orig_cmd = ide_req(rq)->special;
-
-		if (cmd->tf_flags & IDE_TFLAG_DYN)
-			kfree(orig_cmd);
-		else if (cmd != orig_cmd)
-			memcpy(orig_cmd, cmd, sizeof(*cmd));
-	}
-}
-
-int ide_complete_rq(ide_drive_t *drive, blk_status_t error, unsigned int nr_bytes)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct request *rq = hwif->rq;
-	int rc;
-
-	/*
-	 * if failfast is set on a request, override number of sectors
-	 * and complete the whole request right now
-	 */
-	if (blk_noretry_request(rq) && error)
-		nr_bytes = blk_rq_sectors(rq) << 9;
-
-	rc = ide_end_rq(drive, rq, error, nr_bytes);
-	if (rc == 0)
-		hwif->rq = NULL;
-
-	return rc;
-}
-EXPORT_SYMBOL(ide_complete_rq);
-
-void ide_kill_rq(ide_drive_t *drive, struct request *rq)
-{
-	u8 drv_req = ata_misc_request(rq) && rq->rq_disk;
-	u8 media = drive->media;
-
-	drive->failed_pc = NULL;
-
-	if ((media == ide_floppy || media == ide_tape) && drv_req) {
-		scsi_req(rq)->result = 0;
-	} else {
-		if (media == ide_tape)
-			scsi_req(rq)->result = IDE_DRV_ERROR_GENERAL;
-		else if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
-			scsi_req(rq)->result = -EIO;
-	}
-
-	ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq));
-}
-
-static void ide_tf_set_specify_cmd(ide_drive_t *drive, struct ide_taskfile *tf)
-{
-	tf->nsect   = drive->sect;
-	tf->lbal    = drive->sect;
-	tf->lbam    = drive->cyl;
-	tf->lbah    = drive->cyl >> 8;
-	tf->device  = (drive->head - 1) | drive->select;
-	tf->command = ATA_CMD_INIT_DEV_PARAMS;
-}
-
-static void ide_tf_set_restore_cmd(ide_drive_t *drive, struct ide_taskfile *tf)
-{
-	tf->nsect   = drive->sect;
-	tf->command = ATA_CMD_RESTORE;
-}
-
-static void ide_tf_set_setmult_cmd(ide_drive_t *drive, struct ide_taskfile *tf)
-{
-	tf->nsect   = drive->mult_req;
-	tf->command = ATA_CMD_SET_MULTI;
-}
-
-/**
- *	do_special		-	issue some special commands
- *	@drive: drive the command is for
- *
- *	do_special() is used to issue ATA_CMD_INIT_DEV_PARAMS,
- *	ATA_CMD_RESTORE and ATA_CMD_SET_MULTI commands to a drive.
- */
-
-static ide_startstop_t do_special(ide_drive_t *drive)
-{
-	struct ide_cmd cmd;
-
-#ifdef DEBUG
-	printk(KERN_DEBUG "%s: %s: 0x%02x\n", drive->name, __func__,
-		drive->special_flags);
-#endif
-	if (drive->media != ide_disk) {
-		drive->special_flags = 0;
-		drive->mult_req = 0;
-		return ide_stopped;
-	}
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.protocol = ATA_PROT_NODATA;
-
-	if (drive->special_flags & IDE_SFLAG_SET_GEOMETRY) {
-		drive->special_flags &= ~IDE_SFLAG_SET_GEOMETRY;
-		ide_tf_set_specify_cmd(drive, &cmd.tf);
-	} else if (drive->special_flags & IDE_SFLAG_RECALIBRATE) {
-		drive->special_flags &= ~IDE_SFLAG_RECALIBRATE;
-		ide_tf_set_restore_cmd(drive, &cmd.tf);
-	} else if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) {
-		drive->special_flags &= ~IDE_SFLAG_SET_MULTMODE;
-		ide_tf_set_setmult_cmd(drive, &cmd.tf);
-	} else
-		BUG();
-
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	cmd.tf_flags = IDE_TFLAG_CUSTOM_HANDLER;
-
-	do_rw_taskfile(drive, &cmd);
-
-	return ide_started;
-}
-
-void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct scatterlist *sg = hwif->sg_table, *last_sg = NULL;
-	struct request *rq = cmd->rq;
-
-	cmd->sg_nents = __blk_rq_map_sg(drive->queue, rq, sg, &last_sg);
-	if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & rq->q->dma_pad_mask))
-		last_sg->length +=
-			(rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
-}
-EXPORT_SYMBOL_GPL(ide_map_sg);
-
-void ide_init_sg_cmd(struct ide_cmd *cmd, unsigned int nr_bytes)
-{
-	cmd->nbytes = cmd->nleft = nr_bytes;
-	cmd->cursg_ofs = 0;
-	cmd->cursg = NULL;
-}
-EXPORT_SYMBOL_GPL(ide_init_sg_cmd);
-
-/**
- *	execute_drive_command	-	issue special drive command
- *	@drive: the drive to issue the command on
- *	@rq: the request structure holding the command
- *
- *	execute_drive_cmd() issues a special drive command,  usually 
- *	initiated by ioctl() from the external hdparm program. The
- *	command can be a drive command, drive task or taskfile 
- *	operation. Weirdly you can call it with NULL to wait for
- *	all commands to finish. Don't do this as that is due to change
- */
-
-static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
-		struct request *rq)
-{
-	struct ide_cmd *cmd = ide_req(rq)->special;
-
-	if (cmd) {
-		if (cmd->protocol == ATA_PROT_PIO) {
-			ide_init_sg_cmd(cmd, blk_rq_sectors(rq) << 9);
-			ide_map_sg(drive, cmd);
-		}
-
-		return do_rw_taskfile(drive, cmd);
-	}
-
- 	/*
- 	 * NULL is actually a valid way of waiting for
- 	 * all current requests to be flushed from the queue.
- 	 */
-#ifdef DEBUG
- 	printk("%s: DRIVE_CMD (null)\n", drive->name);
-#endif
-	scsi_req(rq)->result = 0;
-	ide_complete_rq(drive, BLK_STS_OK, blk_rq_bytes(rq));
-
- 	return ide_stopped;
-}
-
-static ide_startstop_t ide_special_rq(ide_drive_t *drive, struct request *rq)
-{
-	u8 cmd = scsi_req(rq)->cmd[0];
-
-	switch (cmd) {
-	case REQ_PARK_HEADS:
-	case REQ_UNPARK_HEADS:
-		return ide_do_park_unpark(drive, rq);
-	case REQ_DEVSET_EXEC:
-		return ide_do_devset(drive, rq);
-	case REQ_DRIVE_RESET:
-		return ide_do_reset(drive);
-	default:
-		BUG();
-	}
-}
-
-/**
- *	start_request	-	start of I/O and command issuing for IDE
- *
- *	start_request() initiates handling of a new I/O request. It
- *	accepts commands and I/O (read/write) requests.
- *
- *	FIXME: this function needs a rename
- */
- 
-static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
-{
-	ide_startstop_t startstop;
-
-#ifdef DEBUG
-	printk("%s: start_request: current=0x%08lx\n",
-		drive->hwif->name, (unsigned long) rq);
-#endif
-
-	/* bail early if we've exceeded max_failures */
-	if (drive->max_failures && (drive->failures > drive->max_failures)) {
-		rq->rq_flags |= RQF_FAILED;
-		goto kill_rq;
-	}
-
-	if (drive->prep_rq && !drive->prep_rq(drive, rq))
-		return ide_stopped;
-
-	if (ata_pm_request(rq))
-		ide_check_pm_state(drive, rq);
-
-	drive->hwif->tp_ops->dev_select(drive);
-	if (ide_wait_stat(&startstop, drive, drive->ready_stat,
-			  ATA_BUSY | ATA_DRQ, WAIT_READY)) {
-		printk(KERN_ERR "%s: drive not ready for command\n", drive->name);
-		return startstop;
-	}
-
-	if (drive->special_flags == 0) {
-		struct ide_driver *drv;
-
-		/*
-		 * We reset the drive so we need to issue a SETFEATURES.
-		 * Do it _after_ do_special() restored device parameters.
-		 */
-		if (drive->current_speed == 0xff)
-			ide_config_drive_speed(drive, drive->desired_speed);
-
-		if (ata_taskfile_request(rq))
-			return execute_drive_cmd(drive, rq);
-		else if (ata_pm_request(rq)) {
-			struct ide_pm_state *pm = ide_req(rq)->special;
-#ifdef DEBUG_PM
-			printk("%s: start_power_step(step: %d)\n",
-				drive->name, pm->pm_step);
-#endif
-			startstop = ide_start_power_step(drive, rq);
-			if (startstop == ide_stopped &&
-			    pm->pm_step == IDE_PM_COMPLETED)
-				ide_complete_pm_rq(drive, rq);
-			return startstop;
-		} else if (!rq->rq_disk && ata_misc_request(rq))
-			/*
-			 * TODO: Once all ULDs have been modified to
-			 * check for specific op codes rather than
-			 * blindly accepting any special request, the
-			 * check for ->rq_disk above may be replaced
-			 * by a more suitable mechanism or even
-			 * dropped entirely.
-			 */
-			return ide_special_rq(drive, rq);
-
-		drv = *(struct ide_driver **)rq->rq_disk->private_data;
-
-		return drv->do_request(drive, rq, blk_rq_pos(rq));
-	}
-	return do_special(drive);
-kill_rq:
-	ide_kill_rq(drive, rq);
-	return ide_stopped;
-}
-
-/**
- *	ide_stall_queue		-	pause an IDE device
- *	@drive: drive to stall
- *	@timeout: time to stall for (jiffies)
- *
- *	ide_stall_queue() can be used by a drive to give excess bandwidth back
- *	to the port by sleeping for timeout jiffies.
- */
- 
-void ide_stall_queue (ide_drive_t *drive, unsigned long timeout)
-{
-	if (timeout > WAIT_WORSTCASE)
-		timeout = WAIT_WORSTCASE;
-	drive->sleep = timeout + jiffies;
-	drive->dev_flags |= IDE_DFLAG_SLEEPING;
-}
-EXPORT_SYMBOL(ide_stall_queue);
-
-static inline int ide_lock_port(ide_hwif_t *hwif)
-{
-	if (hwif->busy)
-		return 1;
-
-	hwif->busy = 1;
-
-	return 0;
-}
-
-static inline void ide_unlock_port(ide_hwif_t *hwif)
-{
-	hwif->busy = 0;
-}
-
-static inline int ide_lock_host(struct ide_host *host, ide_hwif_t *hwif)
-{
-	int rc = 0;
-
-	if (host->host_flags & IDE_HFLAG_SERIALIZE) {
-		rc = test_and_set_bit_lock(IDE_HOST_BUSY, &host->host_busy);
-		if (rc == 0) {
-			if (host->get_lock)
-				host->get_lock(ide_intr, hwif);
-		}
-	}
-	return rc;
-}
-
-static inline void ide_unlock_host(struct ide_host *host)
-{
-	if (host->host_flags & IDE_HFLAG_SERIALIZE) {
-		if (host->release_lock)
-			host->release_lock();
-		clear_bit_unlock(IDE_HOST_BUSY, &host->host_busy);
-	}
-}
-
-void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
-{
-	struct request_queue *q = drive->queue;
-
-	/* Use 3ms as that was the old plug delay */
-	if (rq) {
-		blk_mq_requeue_request(rq, false);
-		blk_mq_delay_kick_requeue_list(q, 3);
-	} else
-		blk_mq_delay_run_hw_queue(q->queue_hw_ctx[0], 3);
-}
-
-blk_status_t ide_issue_rq(ide_drive_t *drive, struct request *rq,
-			  bool local_requeue)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_host *host = hwif->host;
-	ide_startstop_t	startstop;
-
-	if (!blk_rq_is_passthrough(rq) && !(rq->rq_flags & RQF_DONTPREP)) {
-		rq->rq_flags |= RQF_DONTPREP;
-		ide_req(rq)->special = NULL;
-	}
-
-	/* HLD do_request() callback might sleep, make sure it's okay */
-	might_sleep();
-
-	if (ide_lock_host(host, hwif))
-		return BLK_STS_DEV_RESOURCE;
-
-	spin_lock_irq(&hwif->lock);
-
-	if (!ide_lock_port(hwif)) {
-		ide_hwif_t *prev_port;
-
-		WARN_ON_ONCE(hwif->rq);
-repeat:
-		prev_port = hwif->host->cur_port;
-		if (drive->dev_flags & IDE_DFLAG_SLEEPING &&
-		    time_after(drive->sleep, jiffies)) {
-			ide_unlock_port(hwif);
-			goto plug_device;
-		}
-
-		if ((hwif->host->host_flags & IDE_HFLAG_SERIALIZE) &&
-		    hwif != prev_port) {
-			ide_drive_t *cur_dev =
-				prev_port ? prev_port->cur_dev : NULL;
-
-			/*
-			 * set nIEN for previous port, drives in the
-			 * quirk list may not like intr setups/cleanups
-			 */
-			if (cur_dev &&
-			    (cur_dev->dev_flags & IDE_DFLAG_NIEN_QUIRK) == 0)
-				prev_port->tp_ops->write_devctl(prev_port,
-								ATA_NIEN |
-								ATA_DEVCTL_OBS);
-
-			hwif->host->cur_port = hwif;
-		}
-		hwif->cur_dev = drive;
-		drive->dev_flags &= ~(IDE_DFLAG_SLEEPING | IDE_DFLAG_PARKED);
-
-		/*
-		 * Sanity: don't accept a request that isn't a PM request
-		 * if we are currently power managed. This is very important as
-		 * blk_stop_queue() doesn't prevent the blk_fetch_request()
-		 * above to return us whatever is in the queue. Since we call
-		 * ide_do_request() ourselves, we end up taking requests while
-		 * the queue is blocked...
-		 */
-		if ((drive->dev_flags & IDE_DFLAG_BLOCKED) &&
-		    ata_pm_request(rq) == 0 &&
-		    (rq->rq_flags & RQF_PM) == 0) {
-			/* there should be no pending command at this point */
-			ide_unlock_port(hwif);
-			goto plug_device;
-		}
-
-		scsi_req(rq)->resid_len = blk_rq_bytes(rq);
-		hwif->rq = rq;
-
-		spin_unlock_irq(&hwif->lock);
-		startstop = start_request(drive, rq);
-		spin_lock_irq(&hwif->lock);
-
-		if (startstop == ide_stopped) {
-			rq = hwif->rq;
-			hwif->rq = NULL;
-			if (rq)
-				goto repeat;
-			ide_unlock_port(hwif);
-			goto out;
-		}
-	} else {
-plug_device:
-		if (local_requeue)
-			list_add(&rq->queuelist, &drive->rq_list);
-		spin_unlock_irq(&hwif->lock);
-		ide_unlock_host(host);
-		if (!local_requeue)
-			ide_requeue_and_plug(drive, rq);
-		return BLK_STS_OK;
-	}
-
-out:
-	spin_unlock_irq(&hwif->lock);
-	if (rq == NULL)
-		ide_unlock_host(host);
-	return BLK_STS_OK;
-}
-
-/*
- * Issue a new request to a device.
- */
-blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx,
-			  const struct blk_mq_queue_data *bd)
-{
-	ide_drive_t *drive = hctx->queue->queuedata;
-	ide_hwif_t *hwif = drive->hwif;
-
-	spin_lock_irq(&hwif->lock);
-	if (drive->sense_rq_active) {
-		spin_unlock_irq(&hwif->lock);
-		return BLK_STS_DEV_RESOURCE;
-	}
-	spin_unlock_irq(&hwif->lock);
-
-	blk_mq_start_request(bd->rq);
-	return ide_issue_rq(drive, bd->rq, false);
-}
-
-static int drive_is_ready(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 stat = 0;
-
-	if (drive->waiting_for_dma)
-		return hwif->dma_ops->dma_test_irq(drive);
-
-	if (hwif->io_ports.ctl_addr &&
-	    (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0)
-		stat = hwif->tp_ops->read_altstatus(hwif);
-	else
-		/* Note: this may clear a pending IRQ!! */
-		stat = hwif->tp_ops->read_status(hwif);
-
-	if (stat & ATA_BUSY)
-		/* drive busy: definitely not interrupting */
-		return 0;
-
-	/* drive ready: *might* be interrupting */
-	return 1;
-}
-
-/**
- *	ide_timer_expiry	-	handle lack of an IDE interrupt
- *	@data: timer callback magic (hwif)
- *
- *	An IDE command has timed out before the expected drive return
- *	occurred. At this point we attempt to clean up the current
- *	mess. If the current handler includes an expiry handler then
- *	we invoke the expiry handler, and providing it is happy the
- *	work is done. If that fails we apply generic recovery rules
- *	invoking the handler and checking the drive DMA status. We
- *	have an excessively incestuous relationship with the DMA
- *	logic that wants cleaning up.
- */
- 
-void ide_timer_expiry (struct timer_list *t)
-{
-	ide_hwif_t	*hwif = from_timer(hwif, t, timer);
-	ide_drive_t	*drive;
-	ide_handler_t	*handler;
-	unsigned long	flags;
-	int		wait = -1;
-	int		plug_device = 0;
-	struct request	*rq_in_flight;
-
-	spin_lock_irqsave(&hwif->lock, flags);
-
-	handler = hwif->handler;
-
-	if (handler == NULL || hwif->req_gen != hwif->req_gen_timer) {
-		/*
-		 * Either a marginal timeout occurred
-		 * (got the interrupt just as timer expired),
-		 * or we were "sleeping" to give other devices a chance.
-		 * Either way, we don't really want to complain about anything.
-		 */
-	} else {
-		ide_expiry_t *expiry = hwif->expiry;
-		ide_startstop_t startstop = ide_stopped;
-
-		drive = hwif->cur_dev;
-
-		if (expiry) {
-			wait = expiry(drive);
-			if (wait > 0) { /* continue */
-				/* reset timer */
-				hwif->timer.expires = jiffies + wait;
-				hwif->req_gen_timer = hwif->req_gen;
-				add_timer(&hwif->timer);
-				spin_unlock_irqrestore(&hwif->lock, flags);
-				return;
-			}
-		}
-		hwif->handler = NULL;
-		hwif->expiry = NULL;
-		/*
-		 * We need to simulate a real interrupt when invoking
-		 * the handler() function, which means we need to
-		 * globally mask the specific IRQ:
-		 */
-		spin_unlock(&hwif->lock);
-		/* disable_irq_nosync ?? */
-		disable_irq(hwif->irq);
-
-		if (hwif->polling) {
-			startstop = handler(drive);
-		} else if (drive_is_ready(drive)) {
-			if (drive->waiting_for_dma)
-				hwif->dma_ops->dma_lost_irq(drive);
-			if (hwif->port_ops && hwif->port_ops->clear_irq)
-				hwif->port_ops->clear_irq(drive);
-
-			printk(KERN_WARNING "%s: lost interrupt\n",
-				drive->name);
-			startstop = handler(drive);
-		} else {
-			if (drive->waiting_for_dma)
-				startstop = ide_dma_timeout_retry(drive, wait);
-			else
-				startstop = ide_error(drive, "irq timeout",
-					hwif->tp_ops->read_status(hwif));
-		}
-		/* Disable interrupts again, `handler' might have enabled it */
-		spin_lock_irq(&hwif->lock);
-		enable_irq(hwif->irq);
-		if (startstop == ide_stopped && hwif->polling == 0) {
-			rq_in_flight = hwif->rq;
-			hwif->rq = NULL;
-			ide_unlock_port(hwif);
-			plug_device = 1;
-		}
-	}
-	spin_unlock_irqrestore(&hwif->lock, flags);
-
-	if (plug_device) {
-		ide_unlock_host(hwif->host);
-		ide_requeue_and_plug(drive, rq_in_flight);
-	}
-}
-
-/**
- *	unexpected_intr		-	handle an unexpected IDE interrupt
- *	@irq: interrupt line
- *	@hwif: port being processed
- *
- *	There's nothing really useful we can do with an unexpected interrupt,
- *	other than reading the status register (to clear it), and logging it.
- *	There should be no way that an irq can happen before we're ready for it,
- *	so we needn't worry much about losing an "important" interrupt here.
- *
- *	On laptops (and "green" PCs), an unexpected interrupt occurs whenever
- *	the drive enters "idle", "standby", or "sleep" mode, so if the status
- *	looks "good", we just ignore the interrupt completely.
- *
- *	This routine assumes __cli() is in effect when called.
- *
- *	If an unexpected interrupt happens on irq15 while we are handling irq14
- *	and if the two interfaces are "serialized" (CMD640), then it looks like
- *	we could screw up by interfering with a new request being set up for 
- *	irq15.
- *
- *	In reality, this is a non-issue.  The new command is not sent unless 
- *	the drive is ready to accept one, in which case we know the drive is
- *	not trying to interrupt us.  And ide_set_handler() is always invoked
- *	before completing the issuance of any new drive command, so we will not
- *	be accidentally invoked as a result of any valid command completion
- *	interrupt.
- */
-
-static void unexpected_intr(int irq, ide_hwif_t *hwif)
-{
-	u8 stat = hwif->tp_ops->read_status(hwif);
-
-	if (!OK_STAT(stat, ATA_DRDY, BAD_STAT)) {
-		/* Try to not flood the console with msgs */
-		static unsigned long last_msgtime, count;
-		++count;
-
-		if (time_after(jiffies, last_msgtime + HZ)) {
-			last_msgtime = jiffies;
-			printk(KERN_ERR "%s: unexpected interrupt, "
-				"status=0x%02x, count=%ld\n",
-				hwif->name, stat, count);
-		}
-	}
-}
-
-/**
- *	ide_intr	-	default IDE interrupt handler
- *	@irq: interrupt number
- *	@dev_id: hwif
- *	@regs: unused weirdness from the kernel irq layer
- *
- *	This is the default IRQ handler for the IDE layer. You should
- *	not need to override it. If you do be aware it is subtle in
- *	places
- *
- *	hwif is the interface in the group currently performing
- *	a command. hwif->cur_dev is the drive and hwif->handler is
- *	the IRQ handler to call. As we issue a command the handlers
- *	step through multiple states, reassigning the handler to the
- *	next step in the process. Unlike a smart SCSI controller IDE
- *	expects the main processor to sequence the various transfer
- *	stages. We also manage a poll timer to catch up with most
- *	timeout situations. There are still a few where the handlers
- *	don't ever decide to give up.
- *
- *	The handler eventually returns ide_stopped to indicate the
- *	request completed. At this point we issue the next request
- *	on the port and the process begins again.
- */
-
-irqreturn_t ide_intr (int irq, void *dev_id)
-{
-	ide_hwif_t *hwif = (ide_hwif_t *)dev_id;
-	struct ide_host *host = hwif->host;
-	ide_drive_t *drive;
-	ide_handler_t *handler;
-	unsigned long flags;
-	ide_startstop_t startstop;
-	irqreturn_t irq_ret = IRQ_NONE;
-	int plug_device = 0;
-	struct request *rq_in_flight;
-
-	if (host->host_flags & IDE_HFLAG_SERIALIZE) {
-		if (hwif != host->cur_port)
-			goto out_early;
-	}
-
-	spin_lock_irqsave(&hwif->lock, flags);
-
-	if (hwif->port_ops && hwif->port_ops->test_irq &&
-	    hwif->port_ops->test_irq(hwif) == 0)
-		goto out;
-
-	handler = hwif->handler;
-
-	if (handler == NULL || hwif->polling) {
-		/*
-		 * Not expecting an interrupt from this drive.
-		 * That means this could be:
-		 *	(1) an interrupt from another PCI device
-		 *	sharing the same PCI INT# as us.
-		 * or	(2) a drive just entered sleep or standby mode,
-		 *	and is interrupting to let us know.
-		 * or	(3) a spurious interrupt of unknown origin.
-		 *
-		 * For PCI, we cannot tell the difference,
-		 * so in that case we just ignore it and hope it goes away.
-		 */
-		if ((host->irq_flags & IRQF_SHARED) == 0) {
-			/*
-			 * Probably not a shared PCI interrupt,
-			 * so we can safely try to do something about it:
-			 */
-			unexpected_intr(irq, hwif);
-		} else {
-			/*
-			 * Whack the status register, just in case
-			 * we have a leftover pending IRQ.
-			 */
-			(void)hwif->tp_ops->read_status(hwif);
-		}
-		goto out;
-	}
-
-	drive = hwif->cur_dev;
-
-	if (!drive_is_ready(drive))
-		/*
-		 * This happens regularly when we share a PCI IRQ with
-		 * another device.  Unfortunately, it can also happen
-		 * with some buggy drives that trigger the IRQ before
-		 * their status register is up to date.  Hopefully we have
-		 * enough advance overhead that the latter isn't a problem.
-		 */
-		goto out;
-
-	hwif->handler = NULL;
-	hwif->expiry = NULL;
-	hwif->req_gen++;
-	del_timer(&hwif->timer);
-	spin_unlock(&hwif->lock);
-
-	if (hwif->port_ops && hwif->port_ops->clear_irq)
-		hwif->port_ops->clear_irq(drive);
-
-	if (drive->dev_flags & IDE_DFLAG_UNMASK)
-		local_irq_enable_in_hardirq();
-
-	/* service this interrupt, may set handler for next interrupt */
-	startstop = handler(drive);
-
-	spin_lock_irq(&hwif->lock);
-	/*
-	 * Note that handler() may have set things up for another
-	 * interrupt to occur soon, but it cannot happen until
-	 * we exit from this routine, because it will be the
-	 * same irq as is currently being serviced here, and Linux
-	 * won't allow another of the same (on any CPU) until we return.
-	 */
-	if (startstop == ide_stopped && hwif->polling == 0) {
-		BUG_ON(hwif->handler);
-		rq_in_flight = hwif->rq;
-		hwif->rq = NULL;
-		ide_unlock_port(hwif);
-		plug_device = 1;
-	}
-	irq_ret = IRQ_HANDLED;
-out:
-	spin_unlock_irqrestore(&hwif->lock, flags);
-out_early:
-	if (plug_device) {
-		ide_unlock_host(hwif->host);
-		ide_requeue_and_plug(drive, rq_in_flight);
-	}
-
-	return irq_ret;
-}
-EXPORT_SYMBOL_GPL(ide_intr);
-
-void ide_pad_transfer(ide_drive_t *drive, int write, int len)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 buf[4] = { 0 };
-
-	while (len > 0) {
-		if (write)
-			hwif->tp_ops->output_data(drive, NULL, buf, min(4, len));
-		else
-			hwif->tp_ops->input_data(drive, NULL, buf, min(4, len));
-		len -= 4;
-	}
-}
-EXPORT_SYMBOL_GPL(ide_pad_transfer);
-
-void ide_insert_request_head(ide_drive_t *drive, struct request *rq)
-{
-	drive->sense_rq_active = true;
-	list_add_tail(&rq->queuelist, &drive->rq_list);
-	kblockd_schedule_work(&drive->rq_work);
-}
-EXPORT_SYMBOL_GPL(ide_insert_request_head);
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
deleted file mode 100644
index 43fbc37d85c3..000000000000
--- a/drivers/ide/ide-ioctls.c
+++ /dev/null
@@ -1,306 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * IDE ioctls handling.
- */
-
-#include <linux/compat.h>
-#include <linux/export.h>
-#include <linux/hdreg.h>
-#include <linux/ide.h>
-#include <linux/slab.h>
-
-static int put_user_long(long val, unsigned long arg)
-{
-	if (in_compat_syscall())
-		return put_user(val, (compat_long_t __user *)compat_ptr(arg));
-
-	return put_user(val, (long __user *)arg);
-}
-
-static const struct ide_ioctl_devset ide_ioctl_settings[] = {
-{ HDIO_GET_32BIT,	 HDIO_SET_32BIT,	&ide_devset_io_32bit  },
-{ HDIO_GET_KEEPSETTINGS, HDIO_SET_KEEPSETTINGS,	&ide_devset_keepsettings },
-{ HDIO_GET_UNMASKINTR,	 HDIO_SET_UNMASKINTR,	&ide_devset_unmaskirq },
-{ HDIO_GET_DMA,		 HDIO_SET_DMA,		&ide_devset_using_dma },
-{ -1,			 HDIO_SET_PIO_MODE,	&ide_devset_pio_mode  },
-{ 0 }
-};
-
-int ide_setting_ioctl(ide_drive_t *drive, struct block_device *bdev,
-		      unsigned int cmd, unsigned long arg,
-		      const struct ide_ioctl_devset *s)
-{
-	const struct ide_devset *ds;
-	int err = -EOPNOTSUPP;
-
-	for (; (ds = s->setting); s++) {
-		if (ds->get && s->get_ioctl == cmd)
-			goto read_val;
-		else if (ds->set && s->set_ioctl == cmd)
-			goto set_val;
-	}
-
-	return err;
-
-read_val:
-	mutex_lock(&ide_setting_mtx);
-	err = ds->get(drive);
-	mutex_unlock(&ide_setting_mtx);
-	return err >= 0 ? put_user_long(err, arg) : err;
-
-set_val:
-	if (bdev_is_partition(bdev))
-		err = -EINVAL;
-	else {
-		if (!capable(CAP_SYS_ADMIN))
-			err = -EACCES;
-		else {
-			mutex_lock(&ide_setting_mtx);
-			err = ide_devset_execute(drive, ds, arg);
-			mutex_unlock(&ide_setting_mtx);
-		}
-	}
-	return err;
-}
-EXPORT_SYMBOL_GPL(ide_setting_ioctl);
-
-static int ide_get_identity_ioctl(ide_drive_t *drive, unsigned int cmd,
-				  void __user *argp)
-{
-	u16 *id = NULL;
-	int size = (cmd == HDIO_GET_IDENTITY) ? (ATA_ID_WORDS * 2) : 142;
-	int rc = 0;
-
-	if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) {
-		rc = -ENOMSG;
-		goto out;
-	}
-
-	/* ata_id_to_hd_driveid() relies on 'id' to be fully allocated. */
-	id = kmalloc(ATA_ID_WORDS * 2, GFP_KERNEL);
-	if (id == NULL) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	memcpy(id, drive->id, size);
-	ata_id_to_hd_driveid(id);
-
-	if (copy_to_user(argp, id, size))
-		rc = -EFAULT;
-
-	kfree(id);
-out:
-	return rc;
-}
-
-static int ide_get_nice_ioctl(ide_drive_t *drive, unsigned long arg)
-{
-	return put_user_long((!!(drive->dev_flags & IDE_DFLAG_DSC_OVERLAP)
-			 << IDE_NICE_DSC_OVERLAP) |
-			(!!(drive->dev_flags & IDE_DFLAG_NICE1)
-			 << IDE_NICE_1), arg);
-}
-
-static int ide_set_nice_ioctl(ide_drive_t *drive, unsigned long arg)
-{
-	if (arg != (arg & ((1 << IDE_NICE_DSC_OVERLAP) | (1 << IDE_NICE_1))))
-		return -EPERM;
-
-	if (((arg >> IDE_NICE_DSC_OVERLAP) & 1) &&
-	    (drive->media != ide_tape))
-		return -EPERM;
-
-	if ((arg >> IDE_NICE_DSC_OVERLAP) & 1)
-		drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP;
-	else
-		drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
-
-	if ((arg >> IDE_NICE_1) & 1)
-		drive->dev_flags |= IDE_DFLAG_NICE1;
-	else
-		drive->dev_flags &= ~IDE_DFLAG_NICE1;
-
-	return 0;
-}
-
-static int ide_cmd_ioctl(ide_drive_t *drive, void __user *argp)
-{
-	u8 *buf = NULL;
-	int bufsize = 0, err = 0;
-	u8 args[4], xfer_rate = 0;
-	struct ide_cmd cmd;
-	struct ide_taskfile *tf = &cmd.tf;
-
-	if (NULL == argp) {
-		struct request *rq;
-
-		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
-		ide_req(rq)->type = ATA_PRIV_TASKFILE;
-		blk_execute_rq(NULL, rq, 0);
-		err = scsi_req(rq)->result ? -EIO : 0;
-		blk_put_request(rq);
-
-		return err;
-	}
-
-	if (copy_from_user(args, argp, 4))
-		return -EFAULT;
-
-	memset(&cmd, 0, sizeof(cmd));
-	tf->feature = args[2];
-	if (args[0] == ATA_CMD_SMART) {
-		tf->nsect = args[3];
-		tf->lbal  = args[1];
-		tf->lbam  = ATA_SMART_LBAM_PASS;
-		tf->lbah  = ATA_SMART_LBAH_PASS;
-		cmd.valid.out.tf = IDE_VALID_OUT_TF;
-		cmd.valid.in.tf  = IDE_VALID_NSECT;
-	} else {
-		tf->nsect = args[1];
-		cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT;
-		cmd.valid.in.tf  = IDE_VALID_NSECT;
-	}
-	tf->command = args[0];
-	cmd.protocol = args[3] ? ATA_PROT_PIO : ATA_PROT_NODATA;
-
-	if (args[3]) {
-		cmd.tf_flags |= IDE_TFLAG_IO_16BIT;
-		bufsize = SECTOR_SIZE * args[3];
-		buf = kzalloc(bufsize, GFP_KERNEL);
-		if (buf == NULL)
-			return -ENOMEM;
-	}
-
-	if (tf->command == ATA_CMD_SET_FEATURES &&
-	    tf->feature == SETFEATURES_XFER &&
-	    tf->nsect >= XFER_SW_DMA_0) {
-		xfer_rate = ide_find_dma_mode(drive, tf->nsect);
-		if (xfer_rate != tf->nsect) {
-			err = -EINVAL;
-			goto abort;
-		}
-
-		cmd.tf_flags |= IDE_TFLAG_SET_XFER;
-	}
-
-	err = ide_raw_taskfile(drive, &cmd, buf, args[3]);
-
-	args[0] = tf->status;
-	args[1] = tf->error;
-	args[2] = tf->nsect;
-abort:
-	if (copy_to_user(argp, &args, 4))
-		err = -EFAULT;
-	if (buf) {
-		if (copy_to_user((argp + 4), buf, bufsize))
-			err = -EFAULT;
-		kfree(buf);
-	}
-	return err;
-}
-
-static int ide_task_ioctl(ide_drive_t *drive, void __user *p)
-{
-	int err = 0;
-	u8 args[7];
-	struct ide_cmd cmd;
-
-	if (copy_from_user(args, p, 7))
-		return -EFAULT;
-
-	memset(&cmd, 0, sizeof(cmd));
-	memcpy(&cmd.tf.feature, &args[1], 6);
-	cmd.tf.command = args[0];
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-
-	err = ide_no_data_taskfile(drive, &cmd);
-
-	args[0] = cmd.tf.command;
-	memcpy(&args[1], &cmd.tf.feature, 6);
-
-	if (copy_to_user(p, args, 7))
-		err = -EFAULT;
-
-	return err;
-}
-
-static int generic_drive_reset(ide_drive_t *drive)
-{
-	struct request *rq;
-	int ret = 0;
-
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
-	ide_req(rq)->type = ATA_PRIV_MISC;
-	scsi_req(rq)->cmd_len = 1;
-	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
-	blk_execute_rq(NULL, rq, 1);
-	ret = scsi_req(rq)->result;
-	blk_put_request(rq);
-	return ret;
-}
-
-int generic_ide_ioctl(ide_drive_t *drive, struct block_device *bdev,
-		      unsigned int cmd, unsigned long arg)
-{
-	int err;
-	void __user *argp = (void __user *)arg;
-
-	if (in_compat_syscall())
-		argp = compat_ptr(arg);
-
-	err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_ioctl_settings);
-	if (err != -EOPNOTSUPP)
-		return err;
-
-	switch (cmd) {
-	case HDIO_OBSOLETE_IDENTITY:
-	case HDIO_GET_IDENTITY:
-		if (bdev_is_partition(bdev))
-			return -EINVAL;
-		return ide_get_identity_ioctl(drive, cmd, argp);
-	case HDIO_GET_NICE:
-		return ide_get_nice_ioctl(drive, arg);
-	case HDIO_SET_NICE:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		return ide_set_nice_ioctl(drive, arg);
-#ifdef CONFIG_IDE_TASK_IOCTL
-	case HDIO_DRIVE_TASKFILE:
-		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
-			return -EACCES;
-		/* missing compat handler for HDIO_DRIVE_TASKFILE */
-		if (in_compat_syscall())
-			return -ENOTTY;
-		if (drive->media == ide_disk)
-			return ide_taskfile_ioctl(drive, arg);
-		return -ENOMSG;
-#endif
-	case HDIO_DRIVE_CMD:
-		if (!capable(CAP_SYS_RAWIO))
-			return -EACCES;
-		return ide_cmd_ioctl(drive, argp);
-	case HDIO_DRIVE_TASK:
-		if (!capable(CAP_SYS_RAWIO))
-			return -EACCES;
-		return ide_task_ioctl(drive, argp);
-	case HDIO_DRIVE_RESET:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		return generic_drive_reset(drive);
-	case HDIO_GET_BUSSTATE:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		if (put_user_long(BUSSTATE_ON, arg))
-			return -EFAULT;
-		return 0;
-	case HDIO_SET_BUSSTATE:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		return -EOPNOTSUPP;
-	default:
-		return -EINVAL;
-	}
-}
-EXPORT_SYMBOL(generic_ide_ioctl);
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
deleted file mode 100644
index f2be127ee96e..000000000000
--- a/drivers/ide/ide-iops.c
+++ /dev/null
@@ -1,536 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 2000-2002	Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2003		Red Hat
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/major.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/blkpg.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/ide.h>
-#include <linux/bitops.h>
-#include <linux/nmi.h>
-
-#include <asm/byteorder.h>
-#include <asm/irq.h>
-#include <linux/uaccess.h>
-#include <asm/io.h>
-
-void SELECT_MASK(ide_drive_t *drive, int mask)
-{
-	const struct ide_port_ops *port_ops = drive->hwif->port_ops;
-
-	if (port_ops && port_ops->maskproc)
-		port_ops->maskproc(drive, mask);
-}
-
-u8 ide_read_error(ide_drive_t *drive)
-{
-	struct ide_taskfile tf;
-
-	drive->hwif->tp_ops->tf_read(drive, &tf, IDE_VALID_ERROR);
-
-	return tf.error;
-}
-EXPORT_SYMBOL_GPL(ide_read_error);
-
-void ide_fix_driveid(u16 *id)
-{
-#ifndef __LITTLE_ENDIAN
-# ifdef __BIG_ENDIAN
-	int i;
-
-	for (i = 0; i < 256; i++)
-		id[i] = __le16_to_cpu(id[i]);
-# else
-#  error "Please fix <asm/byteorder.h>"
-# endif
-#endif
-}
-
-/*
- * ide_fixstring() cleans up and (optionally) byte-swaps a text string,
- * removing leading/trailing blanks and compressing internal blanks.
- * It is primarily used to tidy up the model name/number fields as
- * returned by the ATA_CMD_ID_ATA[PI] commands.
- */
-
-void ide_fixstring(u8 *s, const int bytecount, const int byteswap)
-{
-	u8 *p, *end = &s[bytecount & ~1]; /* bytecount must be even */
-
-	if (byteswap) {
-		/* convert from big-endian to host byte order */
-		for (p = s ; p != end ; p += 2)
-			be16_to_cpus((u16 *) p);
-	}
-
-	/* strip leading blanks */
-	p = s;
-	while (s != end && *s == ' ')
-		++s;
-	/* compress internal blanks and strip trailing blanks */
-	while (s != end && *s) {
-		if (*s++ != ' ' || (s != end && *s && *s != ' '))
-			*p++ = *(s-1);
-	}
-	/* wipe out trailing garbage */
-	while (p != end)
-		*p++ = '\0';
-}
-EXPORT_SYMBOL(ide_fixstring);
-
-/*
- * This routine busy-waits for the drive status to be not "busy".
- * It then checks the status for all of the "good" bits and none
- * of the "bad" bits, and if all is okay it returns 0.  All other
- * cases return error -- caller may then invoke ide_error().
- *
- * This routine should get fixed to not hog the cpu during extra long waits..
- * That could be done by busy-waiting for the first jiffy or two, and then
- * setting a timer to wake up at half second intervals thereafter,
- * until timeout is achieved, before timing out.
- */
-int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
-		    unsigned long timeout, u8 *rstat)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	unsigned long flags;
-	bool irqs_threaded = force_irqthreads;
-	int i;
-	u8 stat;
-
-	udelay(1);	/* spec allows drive 400ns to assert "BUSY" */
-	stat = tp_ops->read_status(hwif);
-
-	if (stat & ATA_BUSY) {
-		if (!irqs_threaded) {
-			local_save_flags(flags);
-			local_irq_enable_in_hardirq();
-		}
-		timeout += jiffies;
-		while ((stat = tp_ops->read_status(hwif)) & ATA_BUSY) {
-			if (time_after(jiffies, timeout)) {
-				/*
-				 * One last read after the timeout in case
-				 * heavy interrupt load made us not make any
-				 * progress during the timeout..
-				 */
-				stat = tp_ops->read_status(hwif);
-				if ((stat & ATA_BUSY) == 0)
-					break;
-
-				if (!irqs_threaded)
-					local_irq_restore(flags);
-				*rstat = stat;
-				return -EBUSY;
-			}
-		}
-		if (!irqs_threaded)
-			local_irq_restore(flags);
-	}
-	/*
-	 * Allow status to settle, then read it again.
-	 * A few rare drives vastly violate the 400ns spec here,
-	 * so we'll wait up to 10usec for a "good" status
-	 * rather than expensively fail things immediately.
-	 * This fix courtesy of Matthew Faupel & Niccolo Rigacci.
-	 */
-	for (i = 0; i < 10; i++) {
-		udelay(1);
-		stat = tp_ops->read_status(hwif);
-
-		if (OK_STAT(stat, good, bad)) {
-			*rstat = stat;
-			return 0;
-		}
-	}
-	*rstat = stat;
-	return -EFAULT;
-}
-
-/*
- * In case of error returns error value after doing "*startstop = ide_error()".
- * The caller should return the updated value of "startstop" in this case,
- * "startstop" is unchanged when the function returns 0.
- */
-int ide_wait_stat(ide_startstop_t *startstop, ide_drive_t *drive, u8 good,
-		  u8 bad, unsigned long timeout)
-{
-	int err;
-	u8 stat;
-
-	/* bail early if we've exceeded max_failures */
-	if (drive->max_failures && (drive->failures > drive->max_failures)) {
-		*startstop = ide_stopped;
-		return 1;
-	}
-
-	err = __ide_wait_stat(drive, good, bad, timeout, &stat);
-
-	if (err) {
-		char *s = (err == -EBUSY) ? "status timeout" : "status error";
-		*startstop = ide_error(drive, s, stat);
-	}
-
-	return err;
-}
-EXPORT_SYMBOL(ide_wait_stat);
-
-/**
- *	ide_in_drive_list	-	look for drive in black/white list
- *	@id: drive identifier
- *	@table: list to inspect
- *
- *	Look for a drive in the blacklist and the whitelist tables
- *	Returns 1 if the drive is found in the table.
- */
-
-int ide_in_drive_list(u16 *id, const struct drive_list_entry *table)
-{
-	for ( ; table->id_model; table++)
-		if ((!strcmp(table->id_model, (char *)&id[ATA_ID_PROD])) &&
-		    (!table->id_firmware ||
-		     strstr((char *)&id[ATA_ID_FW_REV], table->id_firmware)))
-			return 1;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_in_drive_list);
-
-/*
- * Early UDMA66 devices don't set bit14 to 1, only bit13 is valid.
- * Some optical devices with the buggy firmwares have the same problem.
- */
-static const struct drive_list_entry ivb_list[] = {
-	{ "QUANTUM FIREBALLlct10 05"	, "A03.0900"	},
-	{ "QUANTUM FIREBALLlct20 30"	, "APL.0900"	},
-	{ "TSSTcorp CDDVDW SH-S202J"	, "SB00"	},
-	{ "TSSTcorp CDDVDW SH-S202J"	, "SB01"	},
-	{ "TSSTcorp CDDVDW SH-S202N"	, "SB00"	},
-	{ "TSSTcorp CDDVDW SH-S202N"	, "SB01"	},
-	{ "TSSTcorp CDDVDW SH-S202H"	, "SB00"	},
-	{ "TSSTcorp CDDVDW SH-S202H"	, "SB01"	},
-	{ "SAMSUNG SP0822N"		, "WA100-10"	},
-	{ NULL				, NULL		}
-};
-
-/*
- *  All hosts that use the 80c ribbon must use!
- *  The name is derived from upper byte of word 93 and the 80c ribbon.
- */
-u8 eighty_ninty_three(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u16 *id = drive->id;
-	int ivb = ide_in_drive_list(id, ivb_list);
-
-	if (hwif->cbl == ATA_CBL_SATA || hwif->cbl == ATA_CBL_PATA40_SHORT)
-		return 1;
-
-	if (ivb)
-		printk(KERN_DEBUG "%s: skipping word 93 validity check\n",
-				  drive->name);
-
-	if (ata_id_is_sata(id) && !ivb)
-		return 1;
-
-	if (hwif->cbl != ATA_CBL_PATA80 && !ivb)
-		goto no_80w;
-
-	/*
-	 * FIXME:
-	 * - change master/slave IDENTIFY order
-	 * - force bit13 (80c cable present) check also for !ivb devices
-	 *   (unless the slave device is pre-ATA3)
-	 */
-	if (id[ATA_ID_HW_CONFIG] & 0x4000)
-		return 1;
-
-	if (ivb) {
-		const char *model = (char *)&id[ATA_ID_PROD];
-
-		if (strstr(model, "TSSTcorp CDDVDW SH-S202")) {
-			/*
-			 * These ATAPI devices always report 80c cable
-			 * so we have to depend on the host in this case.
-			 */
-			if (hwif->cbl == ATA_CBL_PATA80)
-				return 1;
-		} else {
-			/* Depend on the device side cable detection. */
-			if (id[ATA_ID_HW_CONFIG] & 0x2000)
-				return 1;
-		}
-	}
-no_80w:
-	if (drive->dev_flags & IDE_DFLAG_UDMA33_WARNED)
-		return 0;
-
-	printk(KERN_WARNING "%s: %s side 80-wire cable detection failed, "
-			    "limiting max speed to UDMA33\n",
-			    drive->name,
-			    hwif->cbl == ATA_CBL_PATA80 ? "drive" : "host");
-
-	drive->dev_flags |= IDE_DFLAG_UDMA33_WARNED;
-
-	return 0;
-}
-
-static const char *nien_quirk_list[] = {
-	"QUANTUM FIREBALLlct08 08",
-	"QUANTUM FIREBALLP KA6.4",
-	"QUANTUM FIREBALLP KA9.1",
-	"QUANTUM FIREBALLP KX13.6",
-	"QUANTUM FIREBALLP KX20.5",
-	"QUANTUM FIREBALLP KX27.3",
-	"QUANTUM FIREBALLP LM20.4",
-	"QUANTUM FIREBALLP LM20.5",
-	"FUJITSU MHZ2160BH G2",
-	NULL
-};
-
-void ide_check_nien_quirk_list(ide_drive_t *drive)
-{
-	const char **list, *m = (char *)&drive->id[ATA_ID_PROD];
-
-	for (list = nien_quirk_list; *list != NULL; list++)
-		if (strstr(m, *list) != NULL) {
-			drive->dev_flags |= IDE_DFLAG_NIEN_QUIRK;
-			return;
-		}
-}
-
-int ide_driveid_update(ide_drive_t *drive)
-{
-	u16 *id;
-	int rc;
-
-	id = kmalloc(SECTOR_SIZE, GFP_ATOMIC);
-	if (id == NULL)
-		return 0;
-
-	SELECT_MASK(drive, 1);
-	rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id, 1);
-	SELECT_MASK(drive, 0);
-
-	if (rc)
-		goto out_err;
-
-	drive->id[ATA_ID_UDMA_MODES]  = id[ATA_ID_UDMA_MODES];
-	drive->id[ATA_ID_MWDMA_MODES] = id[ATA_ID_MWDMA_MODES];
-	drive->id[ATA_ID_SWDMA_MODES] = id[ATA_ID_SWDMA_MODES];
-	drive->id[ATA_ID_CFA_MODES]   = id[ATA_ID_CFA_MODES];
-	/* anything more ? */
-
-	kfree(id);
-
-	return 1;
-out_err:
-	if (rc == 2)
-		printk(KERN_ERR "%s: %s: bad status\n", drive->name, __func__);
-	kfree(id);
-	return 0;
-}
-
-int ide_config_drive_speed(ide_drive_t *drive, u8 speed)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	struct ide_taskfile tf;
-	u16 *id = drive->id, i;
-	int error = 0;
-	u8 stat;
-
-#ifdef CONFIG_BLK_DEV_IDEDMA
-	if (hwif->dma_ops)	/* check if host supports DMA */
-		hwif->dma_ops->dma_host_set(drive, 0);
-#endif
-
-	/* Skip setting PIO flow-control modes on pre-EIDE drives */
-	if ((speed & 0xf8) == XFER_PIO_0 && ata_id_has_iordy(drive->id) == 0)
-		goto skip;
-
-	/*
-	 * Don't use ide_wait_cmd here - it will
-	 * attempt to set_geometry and recalibrate,
-	 * but for some reason these don't work at
-	 * this point (lost interrupt).
-	 */
-
-	udelay(1);
-	tp_ops->dev_select(drive);
-	SELECT_MASK(drive, 1);
-	udelay(1);
-	tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS);
-
-	memset(&tf, 0, sizeof(tf));
-	tf.feature = SETFEATURES_XFER;
-	tf.nsect   = speed;
-
-	tp_ops->tf_load(drive, &tf, IDE_VALID_FEATURE | IDE_VALID_NSECT);
-
-	tp_ops->exec_command(hwif, ATA_CMD_SET_FEATURES);
-
-	if (drive->dev_flags & IDE_DFLAG_NIEN_QUIRK)
-		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
-
-	error = __ide_wait_stat(drive, drive->ready_stat,
-				ATA_BUSY | ATA_DRQ | ATA_ERR,
-				WAIT_CMD, &stat);
-
-	SELECT_MASK(drive, 0);
-
-	if (error) {
-		(void) ide_dump_status(drive, "set_drive_speed_status", stat);
-		return error;
-	}
-
-	if (speed >= XFER_SW_DMA_0) {
-		id[ATA_ID_UDMA_MODES]  &= ~0xFF00;
-		id[ATA_ID_MWDMA_MODES] &= ~0x0700;
-		id[ATA_ID_SWDMA_MODES] &= ~0x0700;
-		if (ata_id_is_cfa(id))
-			id[ATA_ID_CFA_MODES] &= ~0x0E00;
-	} else	if (ata_id_is_cfa(id))
-		id[ATA_ID_CFA_MODES] &= ~0x01C0;
-
- skip:
-#ifdef CONFIG_BLK_DEV_IDEDMA
-	if (speed >= XFER_SW_DMA_0 && (drive->dev_flags & IDE_DFLAG_USING_DMA))
-		hwif->dma_ops->dma_host_set(drive, 1);
-	else if (hwif->dma_ops)	/* check if host supports DMA */
-		ide_dma_off_quietly(drive);
-#endif
-
-	if (speed >= XFER_UDMA_0) {
-		i = 1 << (speed - XFER_UDMA_0);
-		id[ATA_ID_UDMA_MODES] |= (i << 8 | i);
-	} else if (ata_id_is_cfa(id) && speed >= XFER_MW_DMA_3) {
-		i = speed - XFER_MW_DMA_2;
-		id[ATA_ID_CFA_MODES] |= i << 9;
-	} else if (speed >= XFER_MW_DMA_0) {
-		i = 1 << (speed - XFER_MW_DMA_0);
-		id[ATA_ID_MWDMA_MODES] |= (i << 8 | i);
-	} else if (speed >= XFER_SW_DMA_0) {
-		i = 1 << (speed - XFER_SW_DMA_0);
-		id[ATA_ID_SWDMA_MODES] |= (i << 8 | i);
-	} else if (ata_id_is_cfa(id) && speed >= XFER_PIO_5) {
-		i = speed - XFER_PIO_4;
-		id[ATA_ID_CFA_MODES] |= i << 6;
-	}
-
-	if (!drive->init_speed)
-		drive->init_speed = speed;
-	drive->current_speed = speed;
-	return error;
-}
-
-/*
- * This should get invoked any time we exit the driver to
- * wait for an interrupt response from a drive.  handler() points
- * at the appropriate code to handle the next interrupt, and a
- * timer is started to prevent us from waiting forever in case
- * something goes wrong (see the ide_timer_expiry() handler later on).
- *
- * See also ide_execute_command
- */
-void __ide_set_handler(ide_drive_t *drive, ide_handler_t *handler,
-		       unsigned int timeout)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	BUG_ON(hwif->handler);
-	hwif->handler		= handler;
-	hwif->timer.expires	= jiffies + timeout;
-	hwif->req_gen_timer	= hwif->req_gen;
-	add_timer(&hwif->timer);
-}
-
-void ide_set_handler(ide_drive_t *drive, ide_handler_t *handler,
-		     unsigned int timeout)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned long flags;
-
-	spin_lock_irqsave(&hwif->lock, flags);
-	__ide_set_handler(drive, handler, timeout);
-	spin_unlock_irqrestore(&hwif->lock, flags);
-}
-EXPORT_SYMBOL(ide_set_handler);
-
-/**
- *	ide_execute_command	-	execute an IDE command
- *	@drive: IDE drive to issue the command against
- *	@cmd: command
- *	@handler: handler for next phase
- *	@timeout: timeout for command
- *
- *	Helper function to issue an IDE command. This handles the
- *	atomicity requirements, command timing and ensures that the
- *	handler and IRQ setup do not race. All IDE command kick off
- *	should go via this function or do equivalent locking.
- */
-
-void ide_execute_command(ide_drive_t *drive, struct ide_cmd *cmd,
-			 ide_handler_t *handler, unsigned timeout)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned long flags;
-
-	spin_lock_irqsave(&hwif->lock, flags);
-	if ((cmd->protocol != ATAPI_PROT_DMA &&
-	     cmd->protocol != ATAPI_PROT_PIO) ||
-	    (drive->atapi_flags & IDE_AFLAG_DRQ_INTERRUPT))
-		__ide_set_handler(drive, handler, timeout);
-	hwif->tp_ops->exec_command(hwif, cmd->tf.command);
-	/*
-	 * Drive takes 400nS to respond, we must avoid the IRQ being
-	 * serviced before that.
-	 *
-	 * FIXME: we could skip this delay with care on non shared devices
-	 */
-	ndelay(400);
-	spin_unlock_irqrestore(&hwif->lock, flags);
-}
-
-/*
- * ide_wait_not_busy() waits for the currently selected device on the hwif
- * to report a non-busy status, see comments in ide_probe_port().
- */
-int ide_wait_not_busy(ide_hwif_t *hwif, unsigned long timeout)
-{
-	u8 stat = 0;
-
-	while (timeout--) {
-		/*
-		 * Turn this into a schedule() sleep once I'm sure
-		 * about locking issues (2.5 work ?).
-		 */
-		mdelay(1);
-		stat = hwif->tp_ops->read_status(hwif);
-		if ((stat & ATA_BUSY) == 0)
-			return 0;
-		/*
-		 * Assume a value of 0xff means nothing is connected to
-		 * the interface and it doesn't implement the pull-down
-		 * resistor on D7.
-		 */
-		if (stat == 0xff)
-			return -ENODEV;
-		touch_nmi_watchdog();
-	}
-	return -EBUSY;
-}
diff --git a/drivers/ide/ide-legacy.c b/drivers/ide/ide-legacy.c
deleted file mode 100644
index be65b411ab53..000000000000
--- a/drivers/ide/ide-legacy.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ide.h>
-
-static void ide_legacy_init_one(struct ide_hw **hws, struct ide_hw *hw,
-				u8 port_no, const struct ide_port_info *d,
-				unsigned long config)
-{
-	unsigned long base, ctl;
-	int irq;
-
-	if (port_no == 0) {
-		base = 0x1f0;
-		ctl  = 0x3f6;
-		irq  = 14;
-	} else {
-		base = 0x170;
-		ctl  = 0x376;
-		irq  = 15;
-	}
-
-	if (!request_region(base, 8, d->name)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n",
-				d->name, base, base + 7);
-		return;
-	}
-
-	if (!request_region(ctl, 1, d->name)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX not free.\n",
-				d->name, ctl);
-		release_region(base, 8);
-		return;
-	}
-
-	ide_std_init_ports(hw, base, ctl);
-	hw->irq = irq;
-	hw->config = config;
-
-	hws[port_no] = hw;
-}
-
-int ide_legacy_device_add(const struct ide_port_info *d, unsigned long config)
-{
-	struct ide_hw hw[2], *hws[] = { NULL, NULL };
-
-	memset(&hw, 0, sizeof(hw));
-
-	if ((d->host_flags & IDE_HFLAG_QD_2ND_PORT) == 0)
-		ide_legacy_init_one(hws, &hw[0], 0, d, config);
-	ide_legacy_init_one(hws, &hw[1], 1, d, config);
-
-	if (hws[0] == NULL && hws[1] == NULL &&
-	    (d->host_flags & IDE_HFLAG_SINGLE))
-		return -ENOENT;
-
-	return ide_host_add(d, hws, 2, NULL);
-}
-EXPORT_SYMBOL_GPL(ide_legacy_device_add);
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
deleted file mode 100644
index 7b9f655adbc2..000000000000
--- a/drivers/ide/ide-lib.c
+++ /dev/null
@@ -1,146 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/interrupt.h>
-#include <linux/ide.h>
-#include <linux/bitops.h>
-
-u64 ide_get_lba_addr(struct ide_cmd *cmd, int lba48)
-{
-	struct ide_taskfile *tf = &cmd->tf;
-	u32 high, low;
-
-	low  = (tf->lbah << 16) | (tf->lbam << 8) | tf->lbal;
-	if (lba48) {
-		tf = &cmd->hob;
-		high = (tf->lbah << 16) | (tf->lbam << 8) | tf->lbal;
-	} else
-		high = tf->device & 0xf;
-
-	return ((u64)high << 24) | low;
-}
-EXPORT_SYMBOL_GPL(ide_get_lba_addr);
-
-static void ide_dump_sector(ide_drive_t *drive)
-{
-	struct ide_cmd cmd;
-	struct ide_taskfile *tf = &cmd.tf;
-	u8 lba48 = !!(drive->dev_flags & IDE_DFLAG_LBA48);
-
-	memset(&cmd, 0, sizeof(cmd));
-	if (lba48) {
-		cmd.valid.in.tf  = IDE_VALID_LBA;
-		cmd.valid.in.hob = IDE_VALID_LBA;
-		cmd.tf_flags = IDE_TFLAG_LBA48;
-	} else
-		cmd.valid.in.tf  = IDE_VALID_LBA | IDE_VALID_DEVICE;
-
-	ide_tf_readback(drive, &cmd);
-
-	if (lba48 || (tf->device & ATA_LBA))
-		printk(KERN_CONT ", LBAsect=%llu",
-			(unsigned long long)ide_get_lba_addr(&cmd, lba48));
-	else
-		printk(KERN_CONT ", CHS=%d/%d/%d", (tf->lbah << 8) + tf->lbam,
-			tf->device & 0xf, tf->lbal);
-}
-
-static void ide_dump_ata_error(ide_drive_t *drive, u8 err)
-{
-	printk(KERN_CONT "{ ");
-	if (err & ATA_ABORTED)
-		printk(KERN_CONT "DriveStatusError ");
-	if (err & ATA_ICRC)
-		printk(KERN_CONT "%s",
-			(err & ATA_ABORTED) ? "BadCRC " : "BadSector ");
-	if (err & ATA_UNC)
-		printk(KERN_CONT "UncorrectableError ");
-	if (err & ATA_IDNF)
-		printk(KERN_CONT "SectorIdNotFound ");
-	if (err & ATA_TRK0NF)
-		printk(KERN_CONT "TrackZeroNotFound ");
-	if (err & ATA_AMNF)
-		printk(KERN_CONT "AddrMarkNotFound ");
-	printk(KERN_CONT "}");
-	if ((err & (ATA_BBK | ATA_ABORTED)) == ATA_BBK ||
-	    (err & (ATA_UNC | ATA_IDNF | ATA_AMNF))) {
-		struct request *rq = drive->hwif->rq;
-
-		ide_dump_sector(drive);
-
-		if (rq)
-			printk(KERN_CONT ", sector=%llu",
-			       (unsigned long long)blk_rq_pos(rq));
-	}
-	printk(KERN_CONT "\n");
-}
-
-static void ide_dump_atapi_error(ide_drive_t *drive, u8 err)
-{
-	printk(KERN_CONT "{ ");
-	if (err & ATAPI_ILI)
-		printk(KERN_CONT "IllegalLengthIndication ");
-	if (err & ATAPI_EOM)
-		printk(KERN_CONT "EndOfMedia ");
-	if (err & ATA_ABORTED)
-		printk(KERN_CONT "AbortedCommand ");
-	if (err & ATA_MCR)
-		printk(KERN_CONT "MediaChangeRequested ");
-	if (err & ATAPI_LFS)
-		printk(KERN_CONT "LastFailedSense=0x%02x ",
-			(err & ATAPI_LFS) >> 4);
-	printk(KERN_CONT "}\n");
-}
-
-/**
- *	ide_dump_status		-	translate ATA/ATAPI error
- *	@drive: drive that status applies to
- *	@msg: text message to print
- *	@stat: status byte to decode
- *
- *	Error reporting, in human readable form (luxurious, but a memory hog).
- *	Combines the drive name, message and status byte to provide a
- *	user understandable explanation of the device error.
- */
-
-u8 ide_dump_status(ide_drive_t *drive, const char *msg, u8 stat)
-{
-	u8 err = 0;
-
-	printk(KERN_ERR "%s: %s: status=0x%02x { ", drive->name, msg, stat);
-	if (stat & ATA_BUSY)
-		printk(KERN_CONT "Busy ");
-	else {
-		if (stat & ATA_DRDY)
-			printk(KERN_CONT "DriveReady ");
-		if (stat & ATA_DF)
-			printk(KERN_CONT "DeviceFault ");
-		if (stat & ATA_DSC)
-			printk(KERN_CONT "SeekComplete ");
-		if (stat & ATA_DRQ)
-			printk(KERN_CONT "DataRequest ");
-		if (stat & ATA_CORR)
-			printk(KERN_CONT "CorrectedError ");
-		if (stat & ATA_SENSE)
-			printk(KERN_CONT "Sense ");
-		if (stat & ATA_ERR)
-			printk(KERN_CONT "Error ");
-	}
-	printk(KERN_CONT "}\n");
-	if ((stat & (ATA_BUSY | ATA_ERR)) == ATA_ERR) {
-		err = ide_read_error(drive);
-		printk(KERN_ERR "%s: %s: error=0x%02x ", drive->name, msg, err);
-		if (drive->media == ide_disk)
-			ide_dump_ata_error(drive, err);
-		else
-			ide_dump_atapi_error(drive, err);
-	}
-
-	printk(KERN_ERR "%s: possibly failed opcode: 0x%02x\n",
-		drive->name, drive->hwif->cmd.tf.command);
-
-	return err;
-}
-EXPORT_SYMBOL(ide_dump_status);
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
deleted file mode 100644
index a80a0f28f7b9..000000000000
--- a/drivers/ide/ide-park.c
+++ /dev/null
@@ -1,155 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/ide.h>
-#include <linux/jiffies.h>
-#include <linux/blkdev.h>
-
-DECLARE_WAIT_QUEUE_HEAD(ide_park_wq);
-
-static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct request_queue *q = drive->queue;
-	struct request *rq;
-	int rc;
-
-	timeout += jiffies;
-	spin_lock_irq(&hwif->lock);
-	if (drive->dev_flags & IDE_DFLAG_PARKED) {
-		int reset_timer = time_before(timeout, drive->sleep);
-		int start_queue = 0;
-
-		drive->sleep = timeout;
-		wake_up_all(&ide_park_wq);
-		if (reset_timer && del_timer(&hwif->timer))
-			start_queue = 1;
-		spin_unlock_irq(&hwif->lock);
-
-		if (start_queue)
-			blk_mq_run_hw_queues(q, true);
-		return;
-	}
-	spin_unlock_irq(&hwif->lock);
-
-	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
-	scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
-	scsi_req(rq)->cmd_len = 1;
-	ide_req(rq)->type = ATA_PRIV_MISC;
-	ide_req(rq)->special = &timeout;
-	blk_execute_rq(NULL, rq, 1);
-	rc = scsi_req(rq)->result ? -EIO : 0;
-	blk_put_request(rq);
-	if (rc)
-		goto out;
-
-	/*
-	 * Make sure that *some* command is sent to the drive after the
-	 * timeout has expired, so power management will be reenabled.
-	 */
-	rq = blk_get_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
-	if (IS_ERR(rq))
-		goto out;
-
-	scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS;
-	scsi_req(rq)->cmd_len = 1;
-	ide_req(rq)->type = ATA_PRIV_MISC;
-	spin_lock_irq(&hwif->lock);
-	ide_insert_request_head(drive, rq);
-	spin_unlock_irq(&hwif->lock);
-
-out:
-	return;
-}
-
-ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq)
-{
-	struct ide_cmd cmd;
-	struct ide_taskfile *tf = &cmd.tf;
-
-	memset(&cmd, 0, sizeof(cmd));
-	if (scsi_req(rq)->cmd[0] == REQ_PARK_HEADS) {
-		drive->sleep = *(unsigned long *)ide_req(rq)->special;
-		drive->dev_flags |= IDE_DFLAG_SLEEPING;
-		tf->command = ATA_CMD_IDLEIMMEDIATE;
-		tf->feature = 0x44;
-		tf->lbal = 0x4c;
-		tf->lbam = 0x4e;
-		tf->lbah = 0x55;
-		cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-		cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	} else		/* cmd == REQ_UNPARK_HEADS */
-		tf->command = ATA_CMD_CHK_POWER;
-
-	cmd.tf_flags |= IDE_TFLAG_CUSTOM_HANDLER;
-	cmd.protocol = ATA_PROT_NODATA;
-
-	cmd.rq = rq;
-
-	return do_rw_taskfile(drive, &cmd);
-}
-
-ssize_t ide_park_show(struct device *dev, struct device_attribute *attr,
-		      char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned long now;
-	unsigned int msecs;
-
-	if (drive->dev_flags & IDE_DFLAG_NO_UNLOAD)
-		return -EOPNOTSUPP;
-
-	spin_lock_irq(&hwif->lock);
-	now = jiffies;
-	if (drive->dev_flags & IDE_DFLAG_PARKED &&
-	    time_after(drive->sleep, now))
-		msecs = jiffies_to_msecs(drive->sleep - now);
-	else
-		msecs = 0;
-	spin_unlock_irq(&hwif->lock);
-
-	return snprintf(buf, 20, "%u\n", msecs);
-}
-
-ssize_t ide_park_store(struct device *dev, struct device_attribute *attr,
-		       const char *buf, size_t len)
-{
-#define MAX_PARK_TIMEOUT 30000
-	ide_drive_t *drive = to_ide_device(dev);
-	long int input;
-	int rc;
-
-	rc = kstrtol(buf, 10, &input);
-	if (rc)
-		return rc;
-	if (input < -2)
-		return -EINVAL;
-	if (input > MAX_PARK_TIMEOUT) {
-		input = MAX_PARK_TIMEOUT;
-		rc = -EOVERFLOW;
-	}
-
-	mutex_lock(&ide_setting_mtx);
-	if (input >= 0) {
-		if (drive->dev_flags & IDE_DFLAG_NO_UNLOAD)
-			rc = -EOPNOTSUPP;
-		else if (input || drive->dev_flags & IDE_DFLAG_PARKED)
-			issue_park_cmd(drive, msecs_to_jiffies(input));
-	} else {
-		if (drive->media == ide_disk)
-			switch (input) {
-			case -1:
-				drive->dev_flags &= ~IDE_DFLAG_NO_UNLOAD;
-				break;
-			case -2:
-				drive->dev_flags |= IDE_DFLAG_NO_UNLOAD;
-				break;
-			}
-		else
-			rc = -EOPNOTSUPP;
-	}
-	mutex_unlock(&ide_setting_mtx);
-
-	return rc ? rc : len;
-}
diff --git a/drivers/ide/ide-pci-generic.c b/drivers/ide/ide-pci-generic.c
deleted file mode 100644
index 673420db953f..000000000000
--- a/drivers/ide/ide-pci-generic.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- *  Copyright (C) 2001-2002	Andre Hedrick <andre@linux-ide.org>
- *  Portions (C) Copyright 2002  Red Hat Inc
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * For the avoidance of doubt the "preferred form" of this code is one which
- * is in an open non patent encumbered format. Where cryptographic key signing
- * forms part of the process of creating an executable the information
- * including keys needed to generate an equivalently functional executable
- * are deemed to be part of the source code.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "ide_pci_generic"
-
-static bool ide_generic_all;		/* Set to claim all devices */
-
-module_param_named(all_generic_ide, ide_generic_all, bool, 0444);
-MODULE_PARM_DESC(all_generic_ide, "IDE generic will claim all unknown PCI IDE storage controllers.");
-
-static void netcell_quirkproc(ide_drive_t *drive)
-{
-	/* mark words 85-87 as valid */
-	drive->id[ATA_ID_CSF_DEFAULT] |= 0x4000;
-}
-
-static const struct ide_port_ops netcell_port_ops = {
-	.quirkproc		= netcell_quirkproc,
-};
-
-#define DECLARE_GENERIC_PCI_DEV(extra_flags) \
-	{ \
-		.name		= DRV_NAME, \
-		.host_flags	= IDE_HFLAG_TRUST_BIOS_FOR_DMA | \
-				  extra_flags, \
-		.swdma_mask	= ATA_SWDMA2, \
-		.mwdma_mask	= ATA_MWDMA2, \
-		.udma_mask	= ATA_UDMA6, \
-	}
-
-static const struct ide_port_info generic_chipsets[] = {
-	/*  0: Unknown */
-	DECLARE_GENERIC_PCI_DEV(0),
-
-	{	/* 1: NS87410 */
-		.name		= DRV_NAME,
-		.enablebits	= { {0x43, 0x08, 0x08}, {0x47, 0x08, 0x08} },
-		.host_flags	= IDE_HFLAG_TRUST_BIOS_FOR_DMA,
-		.swdma_mask	= ATA_SWDMA2,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA6,
-	},
-
-	/*  2: SAMURAI / HT6565 / HINT_IDE */
-	DECLARE_GENERIC_PCI_DEV(0),
-	/*  3: UM8673F / UM8886A / UM8886BF */
-	DECLARE_GENERIC_PCI_DEV(IDE_HFLAG_NO_DMA),
-	/*  4: VIA_IDE / OPTI621V / Piccolo010{2,3,5} */
-	DECLARE_GENERIC_PCI_DEV(IDE_HFLAG_NO_AUTODMA),
-
-	{	/* 5: VIA8237SATA */
-		.name		= DRV_NAME,
-		.host_flags	= IDE_HFLAG_TRUST_BIOS_FOR_DMA |
-				  IDE_HFLAG_OFF_BOARD,
-		.swdma_mask	= ATA_SWDMA2,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA6,
-	},
-
-	{	/* 6: Revolution */
-		.name		= DRV_NAME,
-		.port_ops	= &netcell_port_ops,
-		.host_flags	= IDE_HFLAG_CLEAR_SIMPLEX |
-				  IDE_HFLAG_TRUST_BIOS_FOR_DMA |
-				  IDE_HFLAG_OFF_BOARD,
-		.swdma_mask	= ATA_SWDMA2,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA6,
-	}
-};
-
-/**
- *	generic_init_one	-	called when a PIIX is found
- *	@dev: the generic device
- *	@id: the matching pci id
- *
- *	Called when the PCI registration layer (or the IDE initialization)
- *	finds a device matching our IDE device tables.
- */
-
-static int generic_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	const struct ide_port_info *d = &generic_chipsets[id->driver_data];
-	int ret = -ENODEV;
-
-	/* Don't use the generic entry unless instructed to do so */
-	if (id->driver_data == 0 && ide_generic_all == 0)
-			goto out;
-
-	switch (dev->vendor) {
-	case PCI_VENDOR_ID_UMC:
-		if (dev->device == PCI_DEVICE_ID_UMC_UM8886A &&
-				!(PCI_FUNC(dev->devfn) & 1))
-			goto out; /* UM8886A/BF pair */
-		break;
-	case PCI_VENDOR_ID_OPTI:
-		if (dev->device == PCI_DEVICE_ID_OPTI_82C558 &&
-				!(PCI_FUNC(dev->devfn) & 1))
-			goto out;
-		break;
-	case PCI_VENDOR_ID_JMICRON:
-		if (dev->device != PCI_DEVICE_ID_JMICRON_JMB368 &&
-				PCI_FUNC(dev->devfn) != 1)
-			goto out;
-		break;
-	case PCI_VENDOR_ID_NS:
-		if (dev->device == PCI_DEVICE_ID_NS_87410 &&
-				(dev->class >> 8) != PCI_CLASS_STORAGE_IDE)
-			goto out;
-		break;
-	}
-
-	if (dev->vendor != PCI_VENDOR_ID_JMICRON) {
-		u16 command;
-		pci_read_config_word(dev, PCI_COMMAND, &command);
-		if (!(command & PCI_COMMAND_IO)) {
-			printk(KERN_INFO "%s %s: skipping disabled "
-				"controller\n", d->name, pci_name(dev));
-			goto out;
-		}
-	}
-	ret = ide_pci_init_one(dev, d, NULL);
-out:
-	return ret;
-}
-
-static const struct pci_device_id generic_pci_tbl[] = {
-	{ PCI_VDEVICE(NS,	PCI_DEVICE_ID_NS_87410),		 1 },
-	{ PCI_VDEVICE(PCTECH,	PCI_DEVICE_ID_PCTECH_SAMURAI_IDE),	 2 },
-	{ PCI_VDEVICE(HOLTEK,	PCI_DEVICE_ID_HOLTEK_6565),		 2 },
-	{ PCI_VDEVICE(UMC,	PCI_DEVICE_ID_UMC_UM8673F),		 3 },
-	{ PCI_VDEVICE(UMC,	PCI_DEVICE_ID_UMC_UM8886A),		 3 },
-	{ PCI_VDEVICE(UMC,	PCI_DEVICE_ID_UMC_UM8886BF),		 3 },
-	{ PCI_VDEVICE(HINT,	PCI_DEVICE_ID_HINT_VXPROII_IDE),	 2 },
-	{ PCI_VDEVICE(VIA,	PCI_DEVICE_ID_VIA_82C561),		 4 },
-	{ PCI_VDEVICE(OPTI,	PCI_DEVICE_ID_OPTI_82C558),		 4 },
-#ifdef CONFIG_BLK_DEV_IDE_SATA
-	{ PCI_VDEVICE(VIA,	PCI_DEVICE_ID_VIA_8237_SATA),		 5 },
-#endif
-	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_1),	 4 },
-	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_2),	 4 },
-	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_3),	 4 },
-	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_5),	 4 },
-	{ PCI_VDEVICE(NETCELL,	PCI_DEVICE_ID_REVOLUTION),		 6 },
-	/*
-	 * Must come last.  If you add entries adjust
-	 * this table and generic_chipsets[] appropriately.
-	 */
-	{ PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_IDE << 8, 0xFFFFFF00UL, 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, generic_pci_tbl);
-
-static struct pci_driver generic_pci_driver = {
-	.name		= "PCI_IDE",
-	.id_table	= generic_pci_tbl,
-	.probe		= generic_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init generic_ide_init(void)
-{
-	return ide_pci_register_driver(&generic_pci_driver);
-}
-
-static void __exit generic_ide_exit(void)
-{
-	pci_unregister_driver(&generic_pci_driver);
-}
-
-module_init(generic_ide_init);
-module_exit(generic_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick");
-MODULE_DESCRIPTION("PCI driver module for generic PCI IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide-pio-blacklist.c b/drivers/ide/ide-pio-blacklist.c
deleted file mode 100644
index 1fd24798e5c9..000000000000
--- a/drivers/ide/ide-pio-blacklist.c
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * PIO blacklist.  Some drives incorrectly report their maximal PIO mode,
- * at least in respect to CMD640.  Here we keep info on some known drives.
- *
- * Changes to the ide_pio_blacklist[] should be made with EXTREME CAUTION
- * to avoid breaking the fragile cmd640.c support.
- */
-
-#include <linux/string.h>
-#include <linux/ide.h>
-
-static struct ide_pio_info {
-	const char	*name;
-	int		pio;
-} ide_pio_blacklist [] = {
-	{ "Conner Peripherals 540MB - CFS540A", 3 },
-
-	{ "WDC AC2700",  3 },
-	{ "WDC AC2540",  3 },
-	{ "WDC AC2420",  3 },
-	{ "WDC AC2340",  3 },
-	{ "WDC AC2250",  0 },
-	{ "WDC AC2200",  0 },
-	{ "WDC AC21200", 4 },
-	{ "WDC AC2120",  0 },
-	{ "WDC AC2850",  3 },
-	{ "WDC AC1270",  3 },
-	{ "WDC AC1170",  1 },
-	{ "WDC AC1210",  1 },
-	{ "WDC AC280",   0 },
-	{ "WDC AC31000", 3 },
-	{ "WDC AC31200", 3 },
-
-	{ "Maxtor 7131 AT", 1 },
-	{ "Maxtor 7171 AT", 1 },
-	{ "Maxtor 7213 AT", 1 },
-	{ "Maxtor 7245 AT", 1 },
-	{ "Maxtor 7345 AT", 1 },
-	{ "Maxtor 7546 AT", 3 },
-	{ "Maxtor 7540 AV", 3 },
-
-	{ "SAMSUNG SHD-3121A", 1 },
-	{ "SAMSUNG SHD-3122A", 1 },
-	{ "SAMSUNG SHD-3172A", 1 },
-
-	{ "ST5660A",  3 },
-	{ "ST3660A",  3 },
-	{ "ST3630A",  3 },
-	{ "ST3655A",  3 },
-	{ "ST3391A",  3 },
-	{ "ST3390A",  1 },
-	{ "ST3600A",  1 },
-	{ "ST3290A",  0 },
-	{ "ST3144A",  0 },
-	{ "ST3491A",  1 }, /* reports 3, should be 1 or 2 (depending on drive)
-			      according to Seagate's FIND-ATA program */
-
-	{ "QUANTUM ELS127A", 0 },
-	{ "QUANTUM ELS170A", 0 },
-	{ "QUANTUM LPS240A", 0 },
-	{ "QUANTUM LPS210A", 3 },
-	{ "QUANTUM LPS270A", 3 },
-	{ "QUANTUM LPS365A", 3 },
-	{ "QUANTUM LPS540A", 3 },
-	{ "QUANTUM LIGHTNING 540A", 3 },
-	{ "QUANTUM LIGHTNING 730A", 3 },
-
-	{ "QUANTUM FIREBALL_540", 3 }, /* Older Quantum Fireballs don't work */
-	{ "QUANTUM FIREBALL_640", 3 },
-	{ "QUANTUM FIREBALL_1080", 3 },
-	{ "QUANTUM FIREBALL_1280", 3 },
-	{ NULL, 0 }
-};
-
-/**
- *	ide_scan_pio_blacklist 	-	check for a blacklisted drive
- *	@model: Drive model string
- *
- *	This routine searches the ide_pio_blacklist for an entry
- *	matching the start/whole of the supplied model name.
- *
- *	Returns -1 if no match found.
- *	Otherwise returns the recommended PIO mode from ide_pio_blacklist[].
- */
-
-int ide_scan_pio_blacklist(char *model)
-{
-	struct ide_pio_info *p;
-
-	for (p = ide_pio_blacklist; p->name != NULL; p++) {
-		if (strncmp(p->name, model, strlen(p->name)) == 0)
-			return p->pio;
-	}
-	return -1;
-}
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
deleted file mode 100644
index d680b3e3295f..000000000000
--- a/drivers/ide/ide-pm.c
+++ /dev/null
@@ -1,261 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/ide.h>
-
-int generic_ide_suspend(struct device *dev, pm_message_t mesg)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	ide_drive_t *pair = ide_get_pair_dev(drive);
-	ide_hwif_t *hwif = drive->hwif;
-	struct request *rq;
-	struct ide_pm_state rqpm;
-	int ret;
-
-	if (ide_port_acpi(hwif)) {
-		/* call ACPI _GTM only once */
-		if ((drive->dn & 1) == 0 || pair == NULL)
-			ide_acpi_get_timing(hwif);
-	}
-
-	memset(&rqpm, 0, sizeof(rqpm));
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
-	ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
-	ide_req(rq)->special = &rqpm;
-	rqpm.pm_step = IDE_PM_START_SUSPEND;
-	if (mesg.event == PM_EVENT_PRETHAW)
-		mesg.event = PM_EVENT_FREEZE;
-	rqpm.pm_state = mesg.event;
-
-	blk_execute_rq(NULL, rq, 0);
-	ret = scsi_req(rq)->result ? -EIO : 0;
-	blk_put_request(rq);
-
-	if (ret == 0 && ide_port_acpi(hwif)) {
-		/* call ACPI _PS3 only after both devices are suspended */
-		if ((drive->dn & 1) || pair == NULL)
-			ide_acpi_set_state(hwif, 0);
-	}
-
-	return ret;
-}
-
-static int ide_pm_execute_rq(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-
-	if (unlikely(blk_queue_dying(q))) {
-		rq->rq_flags |= RQF_QUIET;
-		scsi_req(rq)->result = -ENXIO;
-		blk_mq_end_request(rq, BLK_STS_OK);
-		return -ENXIO;
-	}
-	blk_execute_rq(NULL, rq, true);
-
-	return scsi_req(rq)->result ? -EIO : 0;
-}
-
-int generic_ide_resume(struct device *dev)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	ide_drive_t *pair = ide_get_pair_dev(drive);
-	ide_hwif_t *hwif = drive->hwif;
-	struct request *rq;
-	struct ide_pm_state rqpm;
-	int err;
-
-	blk_mq_start_stopped_hw_queues(drive->queue, true);
-
-	if (ide_port_acpi(hwif)) {
-		/* call ACPI _PS0 / _STM only once */
-		if ((drive->dn & 1) == 0 || pair == NULL) {
-			ide_acpi_set_state(hwif, 1);
-			ide_acpi_push_timing(hwif);
-		}
-
-		ide_acpi_exec_tfs(drive);
-	}
-
-	memset(&rqpm, 0, sizeof(rqpm));
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PM);
-	ide_req(rq)->type = ATA_PRIV_PM_RESUME;
-	ide_req(rq)->special = &rqpm;
-	rqpm.pm_step = IDE_PM_START_RESUME;
-	rqpm.pm_state = PM_EVENT_ON;
-
-	err = ide_pm_execute_rq(rq);
-	blk_put_request(rq);
-
-	if (err == 0 && dev->driver) {
-		struct ide_driver *drv = to_ide_driver(dev->driver);
-
-		if (drv->resume)
-			drv->resume(drive);
-	}
-
-	return err;
-}
-
-void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
-{
-	struct ide_pm_state *pm = ide_req(rq)->special;
-
-#ifdef DEBUG_PM
-	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
-		drive->name, pm->pm_step);
-#endif
-	if (drive->media != ide_disk)
-		return;
-
-	switch (pm->pm_step) {
-	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
-		if (pm->pm_state == PM_EVENT_FREEZE)
-			pm->pm_step = IDE_PM_COMPLETED;
-		else
-			pm->pm_step = IDE_PM_STANDBY;
-		break;
-	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
-		pm->pm_step = IDE_PM_COMPLETED;
-		break;
-	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
-		pm->pm_step = IDE_PM_IDLE;
-		break;
-	case IDE_PM_IDLE:		/* Resume step 2 (idle)*/
-		pm->pm_step = IDE_PM_RESTORE_DMA;
-		break;
-	}
-}
-
-ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
-{
-	struct ide_pm_state *pm = ide_req(rq)->special;
-	struct ide_cmd cmd = { };
-
-	switch (pm->pm_step) {
-	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
-		if (drive->media != ide_disk)
-			break;
-		/* Not supported? Switch to next step now. */
-		if (ata_id_flush_enabled(drive->id) == 0 ||
-		    (drive->dev_flags & IDE_DFLAG_WCACHE) == 0) {
-			ide_complete_power_step(drive, rq);
-			return ide_stopped;
-		}
-		if (ata_id_flush_ext_enabled(drive->id))
-			cmd.tf.command = ATA_CMD_FLUSH_EXT;
-		else
-			cmd.tf.command = ATA_CMD_FLUSH;
-		goto out_do_tf;
-	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
-		cmd.tf.command = ATA_CMD_STANDBYNOW1;
-		goto out_do_tf;
-	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
-		ide_set_max_pio(drive);
-		/*
-		 * skip IDE_PM_IDLE for ATAPI devices
-		 */
-		if (drive->media != ide_disk)
-			pm->pm_step = IDE_PM_RESTORE_DMA;
-		else
-			ide_complete_power_step(drive, rq);
-		return ide_stopped;
-	case IDE_PM_IDLE:		/* Resume step 2 (idle) */
-		cmd.tf.command = ATA_CMD_IDLEIMMEDIATE;
-		goto out_do_tf;
-	case IDE_PM_RESTORE_DMA:	/* Resume step 3 (restore DMA) */
-		/*
-		 * Right now, all we do is call ide_set_dma(drive),
-		 * we could be smarter and check for current xfer_speed
-		 * in struct drive etc...
-		 */
-		if (drive->hwif->dma_ops == NULL)
-			break;
-		/*
-		 * TODO: respect IDE_DFLAG_USING_DMA
-		 */
-		ide_set_dma(drive);
-		break;
-	}
-
-	pm->pm_step = IDE_PM_COMPLETED;
-
-	return ide_stopped;
-
-out_do_tf:
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	cmd.protocol = ATA_PROT_NODATA;
-
-	return do_rw_taskfile(drive, &cmd);
-}
-
-/**
- *	ide_complete_pm_rq - end the current Power Management request
- *	@drive: target drive
- *	@rq: request
- *
- *	This function cleans up the current PM request and stops the queue
- *	if necessary.
- */
-void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
-{
-	struct request_queue *q = drive->queue;
-	struct ide_pm_state *pm = ide_req(rq)->special;
-
-	ide_complete_power_step(drive, rq);
-	if (pm->pm_step != IDE_PM_COMPLETED)
-		return;
-
-#ifdef DEBUG_PM
-	printk("%s: completing PM request, %s\n", drive->name,
-	       (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume");
-#endif
-	if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND)
-		blk_mq_stop_hw_queues(q);
-	else
-		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
-
-	drive->hwif->rq = NULL;
-
-	blk_mq_end_request(rq, BLK_STS_OK);
-}
-
-void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
-{
-	struct ide_pm_state *pm = ide_req(rq)->special;
-
-	if (blk_rq_is_private(rq) &&
-	    ide_req(rq)->type == ATA_PRIV_PM_SUSPEND &&
-	    pm->pm_step == IDE_PM_START_SUSPEND)
-		/* Mark drive blocked when starting the suspend sequence. */
-		drive->dev_flags |= IDE_DFLAG_BLOCKED;
-	else if (blk_rq_is_private(rq) &&
-	         ide_req(rq)->type == ATA_PRIV_PM_RESUME &&
-		 pm->pm_step == IDE_PM_START_RESUME) {
-		/*
-		 * The first thing we do on wakeup is to wait for BSY bit to
-		 * go away (with a looong timeout) as a drive on this hwif may
-		 * just be POSTing itself.
-		 * We do that before even selecting as the "other" device on
-		 * the bus may be broken enough to walk on our toes at this
-		 * point.
-		 */
-		ide_hwif_t *hwif = drive->hwif;
-		const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-		struct request_queue *q = drive->queue;
-		int rc;
-#ifdef DEBUG_PM
-		printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name);
-#endif
-		rc = ide_wait_not_busy(hwif, 35000);
-		if (rc)
-			printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name);
-		tp_ops->dev_select(drive);
-		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
-		rc = ide_wait_not_busy(hwif, 100000);
-		if (rc)
-			printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
-
-		blk_mq_start_hw_queues(q);
-	}
-}
diff --git a/drivers/ide/ide-pnp.c b/drivers/ide/ide-pnp.c
deleted file mode 100644
index fc541f1cf8de..000000000000
--- a/drivers/ide/ide-pnp.c
+++ /dev/null
@@ -1,92 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * This file provides autodetection for ISA PnP IDE interfaces.
- * It was tested with "ESS ES1868 Plug and Play AudioDrive" IDE interface.
- *
- * Copyright (C) 2000 Andrey Panin <pazke@donpac.ru>
- */
-
-#include <linux/init.h>
-#include <linux/pnp.h>
-#include <linux/ide.h>
-#include <linux/module.h>
-
-#define DRV_NAME "ide-pnp"
-
-/* Add your devices here :)) */
-static const struct pnp_device_id idepnp_devices[] = {
-	/* Generic ESDI/IDE/ATA compatible hard disk controller */
-	{.id = "PNP0600", .driver_data = 0},
-	{.id = ""}
-};
-
-static const struct ide_port_info ide_pnp_port_info = {
-	.host_flags		= IDE_HFLAG_NO_DMA,
-	.chipset		= ide_generic,
-};
-
-static int idepnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id)
-{
-	struct ide_host *host;
-	unsigned long base, ctl;
-	int rc;
-	struct ide_hw hw, *hws[] = { &hw };
-
-	printk(KERN_INFO DRV_NAME ": generic PnP IDE interface\n");
-
-	if (!(pnp_port_valid(dev, 0) && pnp_port_valid(dev, 1) && pnp_irq_valid(dev, 0)))
-		return -1;
-
-	base = pnp_port_start(dev, 0);
-	ctl = pnp_port_start(dev, 1);
-
-	if (!request_region(base, 8, DRV_NAME)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX-0x%lX not free.\n",
-				DRV_NAME, base, base + 7);
-		return -EBUSY;
-	}
-
-	if (!request_region(ctl, 1, DRV_NAME)) {
-		printk(KERN_ERR "%s: I/O resource 0x%lX not free.\n",
-				DRV_NAME, ctl);
-		release_region(base, 8);
-		return -EBUSY;
-	}
-
-	memset(&hw, 0, sizeof(hw));
-	ide_std_init_ports(&hw, base, ctl);
-	hw.irq = pnp_irq(dev, 0);
-
-	rc = ide_host_add(&ide_pnp_port_info, hws, 1, &host);
-	if (rc)
-		goto out;
-
-	pnp_set_drvdata(dev, host);
-
-	return 0;
-out:
-	release_region(ctl, 1);
-	release_region(base, 8);
-
-	return rc;
-}
-
-static void idepnp_remove(struct pnp_dev *dev)
-{
-	struct ide_host *host = pnp_get_drvdata(dev);
-
-	ide_host_remove(host);
-
-	release_region(pnp_port_start(dev, 1), 1);
-	release_region(pnp_port_start(dev, 0), 8);
-}
-
-static struct pnp_driver idepnp_driver = {
-	.name		= "ide",
-	.id_table	= idepnp_devices,
-	.probe		= idepnp_probe,
-	.remove		= idepnp_remove,
-};
-
-module_pnp_driver(idepnp_driver);
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
deleted file mode 100644
index aefd74c0d862..000000000000
--- a/drivers/ide/ide-probe.c
+++ /dev/null
@@ -1,1623 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1994-1998   Linus Torvalds & authors (see below)
- *  Copyright (C) 2005, 2007  Bartlomiej Zolnierkiewicz
- */
-
-/*
- *  Mostly written by Mark Lord <mlord@pobox.com>
- *                and Gadi Oxman <gadio@netvision.net.il>
- *                and Andre Hedrick <andre@linux-ide.org>
- *
- *  See linux/MAINTAINERS for address of current maintainer.
- *
- * This is the IDE probe module, as evolved from hd.c and ide.c.
- *
- * -- increase WAIT_PIDENTIFY to avoid CD-ROM locking at boot
- *	 by Andrea Arcangeli
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/major.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/ide.h>
-#include <linux/spinlock.h>
-#include <linux/kmod.h>
-#include <linux/pci.h>
-#include <linux/scatterlist.h>
-
-#include <asm/byteorder.h>
-#include <asm/irq.h>
-#include <linux/uaccess.h>
-#include <asm/io.h>
-
-/**
- *	generic_id		-	add a generic drive id
- *	@drive:	drive to make an ID block for
- *	
- *	Add a fake id field to the drive we are passed. This allows
- *	use to skip a ton of NULL checks (which people always miss) 
- *	and make drive properties unconditional outside of this file
- */
- 
-static void generic_id(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-
-	id[ATA_ID_CUR_CYLS]	= id[ATA_ID_CYLS]	= drive->cyl;
-	id[ATA_ID_CUR_HEADS]	= id[ATA_ID_HEADS]	= drive->head;
-	id[ATA_ID_CUR_SECTORS]	= id[ATA_ID_SECTORS]	= drive->sect;
-}
-
-static void ide_disk_init_chs(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-
-	/* Extract geometry if we did not already have one for the drive */
-	if (!drive->cyl || !drive->head || !drive->sect) {
-		drive->cyl  = drive->bios_cyl  = id[ATA_ID_CYLS];
-		drive->head = drive->bios_head = id[ATA_ID_HEADS];
-		drive->sect = drive->bios_sect = id[ATA_ID_SECTORS];
-	}
-
-	/* Handle logical geometry translation by the drive */
-	if (ata_id_current_chs_valid(id)) {
-		drive->cyl  = id[ATA_ID_CUR_CYLS];
-		drive->head = id[ATA_ID_CUR_HEADS];
-		drive->sect = id[ATA_ID_CUR_SECTORS];
-	}
-
-	/* Use physical geometry if what we have still makes no sense */
-	if (drive->head > 16 && id[ATA_ID_HEADS] && id[ATA_ID_HEADS] <= 16) {
-		drive->cyl  = id[ATA_ID_CYLS];
-		drive->head = id[ATA_ID_HEADS];
-		drive->sect = id[ATA_ID_SECTORS];
-	}
-}
-
-static void ide_disk_init_mult_count(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-	u8 max_multsect = id[ATA_ID_MAX_MULTSECT] & 0xff;
-
-	if (max_multsect) {
-		if ((max_multsect / 2) > 1)
-			id[ATA_ID_MULTSECT] = max_multsect | 0x100;
-		else
-			id[ATA_ID_MULTSECT] &= ~0x1ff;
-
-		drive->mult_req = id[ATA_ID_MULTSECT] & 0xff;
-
-		if (drive->mult_req)
-			drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
-	}
-}
-
-static void ide_classify_ata_dev(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-	char *m = (char *)&id[ATA_ID_PROD];
-	int is_cfa = ata_id_is_cfa(id);
-
-	/* CF devices are *not* removable in Linux definition of the term */
-	if (is_cfa == 0 && (id[ATA_ID_CONFIG] & (1 << 7)))
-		drive->dev_flags |= IDE_DFLAG_REMOVABLE;
-
-	drive->media = ide_disk;
-
-	if (!ata_id_has_unload(drive->id))
-		drive->dev_flags |= IDE_DFLAG_NO_UNLOAD;
-
-	printk(KERN_INFO "%s: %s, %s DISK drive\n", drive->name, m,
-		is_cfa ? "CFA" : "ATA");
-}
-
-static void ide_classify_atapi_dev(ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-	char *m = (char *)&id[ATA_ID_PROD];
-	u8 type = (id[ATA_ID_CONFIG] >> 8) & 0x1f;
-
-	printk(KERN_INFO "%s: %s, ATAPI ", drive->name, m);
-	switch (type) {
-	case ide_floppy:
-		if (!strstr(m, "CD-ROM")) {
-			if (!strstr(m, "oppy") &&
-			    !strstr(m, "poyp") &&
-			    !strstr(m, "ZIP"))
-				printk(KERN_CONT "cdrom or floppy?, assuming ");
-			if (drive->media != ide_cdrom) {
-				printk(KERN_CONT "FLOPPY");
-				drive->dev_flags |= IDE_DFLAG_REMOVABLE;
-				break;
-			}
-		}
-		/* Early cdrom models used zero */
-		type = ide_cdrom;
-		fallthrough;
-	case ide_cdrom:
-		drive->dev_flags |= IDE_DFLAG_REMOVABLE;
-#ifdef CONFIG_PPC
-		/* kludge for Apple PowerBook internal zip */
-		if (!strstr(m, "CD-ROM") && strstr(m, "ZIP")) {
-			printk(KERN_CONT "FLOPPY");
-			type = ide_floppy;
-			break;
-		}
-#endif
-		printk(KERN_CONT "CD/DVD-ROM");
-		break;
-	case ide_tape:
-		printk(KERN_CONT "TAPE");
-		break;
-	case ide_optical:
-		printk(KERN_CONT "OPTICAL");
-		drive->dev_flags |= IDE_DFLAG_REMOVABLE;
-		break;
-	default:
-		printk(KERN_CONT "UNKNOWN (type %d)", type);
-		break;
-	}
-
-	printk(KERN_CONT " drive\n");
-	drive->media = type;
-	/* an ATAPI device ignores DRDY */
-	drive->ready_stat = 0;
-	if (ata_id_cdb_intr(id))
-		drive->atapi_flags |= IDE_AFLAG_DRQ_INTERRUPT;
-	drive->dev_flags |= IDE_DFLAG_DOORLOCKING;
-	/* we don't do head unloading on ATAPI devices */
-	drive->dev_flags |= IDE_DFLAG_NO_UNLOAD;
-}
-
-/**
- *	do_identify	-	identify a drive
- *	@drive: drive to identify 
- *	@cmd: command used
- *	@id: buffer for IDENTIFY data
- *
- *	Called when we have issued a drive identify command to
- *	read and parse the results. This function is run with
- *	interrupts disabled. 
- */
-
-static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	char *m = (char *)&id[ATA_ID_PROD];
-	unsigned long flags;
-	int bswap = 1;
-
-	/* local CPU only; some systems need this */
-	local_irq_save(flags);
-	/* read 512 bytes of id info */
-	hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
-	local_irq_restore(flags);
-
-	drive->dev_flags |= IDE_DFLAG_ID_READ;
-#ifdef DEBUG
-	printk(KERN_INFO "%s: dumping identify data\n", drive->name);
-	ide_dump_identify((u8 *)id);
-#endif
-	ide_fix_driveid(id);
-
-	/*
-	 *  ATA_CMD_ID_ATA returns little-endian info,
-	 *  ATA_CMD_ID_ATAPI *usually* returns little-endian info.
-	 */
-	if (cmd == ATA_CMD_ID_ATAPI) {
-		if ((m[0] == 'N' && m[1] == 'E') ||  /* NEC */
-		    (m[0] == 'F' && m[1] == 'X') ||  /* Mitsumi */
-		    (m[0] == 'P' && m[1] == 'i'))    /* Pioneer */
-			/* Vertos drives may still be weird */
-			bswap ^= 1;
-	}
-
-	ide_fixstring(m, ATA_ID_PROD_LEN, bswap);
-	ide_fixstring((char *)&id[ATA_ID_FW_REV], ATA_ID_FW_REV_LEN, bswap);
-	ide_fixstring((char *)&id[ATA_ID_SERNO], ATA_ID_SERNO_LEN, bswap);
-
-	/* we depend on this a lot! */
-	m[ATA_ID_PROD_LEN - 1] = '\0';
-
-	if (strstr(m, "E X A B Y T E N E S T"))
-		drive->dev_flags &= ~IDE_DFLAG_PRESENT;
-	else
-		drive->dev_flags |= IDE_DFLAG_PRESENT;
-}
-
-/**
- *	ide_dev_read_id	-	send ATA/ATAPI IDENTIFY command
- *	@drive: drive to identify
- *	@cmd: command to use
- *	@id: buffer for IDENTIFY data
- *	@irq_ctx: flag set when called from the IRQ context
- *
- *	Sends an ATA(PI) IDENTIFY request to a drive and waits for a response.
- *
- *	Returns:	0  device was identified
- *			1  device timed-out (no response to identify request)
- *			2  device aborted the command (refused to identify itself)
- */
-
-int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id, int irq_ctx)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_io_ports *io_ports = &hwif->io_ports;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	int use_altstatus = 0, rc;
-	unsigned long timeout;
-	u8 s = 0, a = 0;
-
-	/*
-	 * Disable device IRQ.  Otherwise we'll get spurious interrupts
-	 * during the identify phase that the IRQ handler isn't expecting.
-	 */
-	if (io_ports->ctl_addr)
-		tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS);
-
-	/* take a deep breath */
-	if (irq_ctx)
-		mdelay(50);
-	else
-		msleep(50);
-
-	if (io_ports->ctl_addr &&
-	    (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) {
-		a = tp_ops->read_altstatus(hwif);
-		s = tp_ops->read_status(hwif);
-		if ((a ^ s) & ~ATA_SENSE)
-			/* ancient Seagate drives, broken interfaces */
-			printk(KERN_INFO "%s: probing with STATUS(0x%02x) "
-					 "instead of ALTSTATUS(0x%02x)\n",
-					 drive->name, s, a);
-		else
-			/* use non-intrusive polling */
-			use_altstatus = 1;
-	}
-
-	/* set features register for atapi
-	 * identify command to be sure of reply
-	 */
-	if (cmd == ATA_CMD_ID_ATAPI) {
-		struct ide_taskfile tf;
-
-		memset(&tf, 0, sizeof(tf));
-		/* disable DMA & overlap */
-		tp_ops->tf_load(drive, &tf, IDE_VALID_FEATURE);
-	}
-
-	/* ask drive for ID */
-	tp_ops->exec_command(hwif, cmd);
-
-	timeout = ((cmd == ATA_CMD_ID_ATA) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2;
-
-	/* wait for IRQ and ATA_DRQ */
-	if (irq_ctx) {
-		rc = __ide_wait_stat(drive, ATA_DRQ, BAD_R_STAT, timeout, &s);
-		if (rc)
-			return 1;
-	} else {
-		rc = ide_busy_sleep(drive, timeout, use_altstatus);
-		if (rc)
-			return 1;
-
-		msleep(50);
-		s = tp_ops->read_status(hwif);
-	}
-
-	if (OK_STAT(s, ATA_DRQ, BAD_R_STAT)) {
-		/* drive returned ID */
-		do_identify(drive, cmd, id);
-		/* drive responded with ID */
-		rc = 0;
-		/* clear drive IRQ */
-		(void)tp_ops->read_status(hwif);
-	} else {
-		/* drive refused ID */
-		rc = 2;
-	}
-	return rc;
-}
-
-int ide_busy_sleep(ide_drive_t *drive, unsigned long timeout, int altstatus)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 stat;
-
-	timeout += jiffies;
-
-	do {
-		msleep(50);	/* give drive a breather */
-		stat = altstatus ? hwif->tp_ops->read_altstatus(hwif)
-				 : hwif->tp_ops->read_status(hwif);
-		if ((stat & ATA_BUSY) == 0)
-			return 0;
-	} while (time_before(jiffies, timeout));
-
-	printk(KERN_ERR "%s: timeout in %s\n", drive->name, __func__);
-
-	return 1;	/* drive timed-out */
-}
-
-static u8 ide_read_device(ide_drive_t *drive)
-{
-	struct ide_taskfile tf;
-
-	drive->hwif->tp_ops->tf_read(drive, &tf, IDE_VALID_DEVICE);
-
-	return tf.device;
-}
-
-/**
- *	do_probe		-	probe an IDE device
- *	@drive: drive to probe
- *	@cmd: command to use
- *
- *	do_probe() has the difficult job of finding a drive if it exists,
- *	without getting hung up if it doesn't exist, without trampling on
- *	ethernet cards, and without leaving any IRQs dangling to haunt us later.
- *
- *	If a drive is "known" to exist (from CMOS or kernel parameters),
- *	but does not respond right away, the probe will "hang in there"
- *	for the maximum wait time (about 30 seconds), otherwise it will
- *	exit much more quickly.
- *
- * Returns:	0  device was identified
- *		1  device timed-out (no response to identify request)
- *		2  device aborted the command (refused to identify itself)
- *		3  bad status from device (possible for ATAPI drives)
- *		4  probe was not attempted because failure was obvious
- */
-
-static int do_probe (ide_drive_t *drive, u8 cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	u16 *id = drive->id;
-	int rc;
-	u8 present = !!(drive->dev_flags & IDE_DFLAG_PRESENT), stat;
-
-	/* avoid waiting for inappropriate probes */
-	if (present && drive->media != ide_disk && cmd == ATA_CMD_ID_ATA)
-		return 4;
-
-#ifdef DEBUG
-	printk(KERN_INFO "probing for %s: present=%d, media=%d, probetype=%s\n",
-		drive->name, present, drive->media,
-		(cmd == ATA_CMD_ID_ATA) ? "ATA" : "ATAPI");
-#endif
-
-	/* needed for some systems
-	 * (e.g. crw9624 as drive0 with disk as slave)
-	 */
-	msleep(50);
-	tp_ops->dev_select(drive);
-	msleep(50);
-
-	if (ide_read_device(drive) != drive->select && present == 0) {
-		if (drive->dn & 1) {
-			/* exit with drive0 selected */
-			tp_ops->dev_select(hwif->devices[0]);
-			/* allow ATA_BUSY to assert & clear */
-			msleep(50);
-		}
-		/* no i/f present: mmm.. this should be a 4 -ml */
-		return 3;
-	}
-
-	stat = tp_ops->read_status(hwif);
-
-	if (OK_STAT(stat, ATA_DRDY, ATA_BUSY) ||
-	    present || cmd == ATA_CMD_ID_ATAPI) {
-		rc = ide_dev_read_id(drive, cmd, id, 0);
-		if (rc)
-			/* failed: try again */
-			rc = ide_dev_read_id(drive, cmd, id, 0);
-
-		stat = tp_ops->read_status(hwif);
-
-		if (stat == (ATA_BUSY | ATA_DRDY))
-			return 4;
-
-		if (rc == 1 && cmd == ATA_CMD_ID_ATAPI) {
-			printk(KERN_ERR "%s: no response (status = 0x%02x), "
-					"resetting drive\n", drive->name, stat);
-			msleep(50);
-			tp_ops->dev_select(drive);
-			msleep(50);
-			tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET);
-			(void)ide_busy_sleep(drive, WAIT_WORSTCASE, 0);
-			rc = ide_dev_read_id(drive, cmd, id, 0);
-		}
-
-		/* ensure drive IRQ is clear */
-		stat = tp_ops->read_status(hwif);
-
-		if (rc == 1)
-			printk(KERN_ERR "%s: no response (status = 0x%02x)\n",
-					drive->name, stat);
-	} else {
-		/* not present or maybe ATAPI */
-		rc = 3;
-	}
-	if (drive->dn & 1) {
-		/* exit with drive0 selected */
-		tp_ops->dev_select(hwif->devices[0]);
-		msleep(50);
-		/* ensure drive irq is clear */
-		(void)tp_ops->read_status(hwif);
-	}
-	return rc;
-}
-
-/**
- *	probe_for_drives	-	upper level drive probe
- *	@drive: drive to probe for
- *
- *	probe_for_drive() tests for existence of a given drive using do_probe()
- *	and presents things to the user as needed.
- *
- *	Returns:	0  no device was found
- *			1  device was found
- *			   (note: IDE_DFLAG_PRESENT might still be not set)
- */
-
-static u8 probe_for_drive(ide_drive_t *drive)
-{
-	char *m;
-	int rc;
-	u8 cmd;
-
-	drive->dev_flags &= ~IDE_DFLAG_ID_READ;
-
-	m = (char *)&drive->id[ATA_ID_PROD];
-	strcpy(m, "UNKNOWN");
-
-	/* skip probing? */
-	if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0) {
-		/* if !(success||timed-out) */
-		cmd = ATA_CMD_ID_ATA;
-		rc = do_probe(drive, cmd);
-		if (rc >= 2) {
-			/* look for ATAPI device */
-			cmd = ATA_CMD_ID_ATAPI;
-			rc = do_probe(drive, cmd);
-		}
-
-		if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
-			return 0;
-
-		/* identification failed? */
-		if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) {
-			if (drive->media == ide_disk) {
-				printk(KERN_INFO "%s: non-IDE drive, CHS=%d/%d/%d\n",
-					drive->name, drive->cyl,
-					drive->head, drive->sect);
-			} else if (drive->media == ide_cdrom) {
-				printk(KERN_INFO "%s: ATAPI cdrom (?)\n", drive->name);
-			} else {
-				/* nuke it */
-				printk(KERN_WARNING "%s: Unknown device on bus refused identification. Ignoring.\n", drive->name);
-				drive->dev_flags &= ~IDE_DFLAG_PRESENT;
-			}
-		} else {
-			if (cmd == ATA_CMD_ID_ATAPI)
-				ide_classify_atapi_dev(drive);
-			else
-				ide_classify_ata_dev(drive);
-		}
-	}
-
-	if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
-		return 0;
-
-	/* The drive wasn't being helpful. Add generic info only */
-	if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) {
-		generic_id(drive);
-		return 1;
-	}
-
-	if (drive->media == ide_disk) {
-		ide_disk_init_chs(drive);
-		ide_disk_init_mult_count(drive);
-	}
-
-	return 1;
-}
-
-static void hwif_release_dev(struct device *dev)
-{
-	ide_hwif_t *hwif = container_of(dev, ide_hwif_t, gendev);
-
-	complete(&hwif->gendev_rel_comp);
-}
-
-static int ide_register_port(ide_hwif_t *hwif)
-{
-	int ret;
-
-	/* register with global device tree */
-	dev_set_name(&hwif->gendev, "%s", hwif->name);
-	dev_set_drvdata(&hwif->gendev, hwif);
-	if (hwif->gendev.parent == NULL)
-		hwif->gendev.parent = hwif->dev;
-	hwif->gendev.release = hwif_release_dev;
-
-	ret = device_register(&hwif->gendev);
-	if (ret < 0) {
-		printk(KERN_WARNING "IDE: %s: device_register error: %d\n",
-			__func__, ret);
-		goto out;
-	}
-
-	hwif->portdev = device_create(ide_port_class, &hwif->gendev,
-				      MKDEV(0, 0), hwif, "%s", hwif->name);
-	if (IS_ERR(hwif->portdev)) {
-		ret = PTR_ERR(hwif->portdev);
-		device_unregister(&hwif->gendev);
-	}
-out:
-	return ret;
-}
-
-/**
- *	ide_port_wait_ready	-	wait for port to become ready
- *	@hwif: IDE port
- *
- *	This is needed on some PPCs and a bunch of BIOS-less embedded
- *	platforms.  Typical cases are:
- *
- *	- The firmware hard reset the disk before booting the kernel,
- *	  the drive is still doing it's poweron-reset sequence, that
- *	  can take up to 30 seconds.
- *
- *	- The firmware does nothing (or no firmware), the device is
- *	  still in POST state (same as above actually).
- *
- *	- Some CD/DVD/Writer combo drives tend to drive the bus during
- *	  their reset sequence even when they are non-selected slave
- *	  devices, thus preventing discovery of the main HD.
- *
- *	Doing this wait-for-non-busy should not harm any existing
- *	configuration and fix some issues like the above.
- *
- *	BenH.
- *
- *	Returns 0 on success, error code (< 0) otherwise.
- */
-
-static int ide_port_wait_ready(ide_hwif_t *hwif)
-{
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	ide_drive_t *drive;
-	int i, rc;
-
-	printk(KERN_DEBUG "Probing IDE interface %s...\n", hwif->name);
-
-	/* Let HW settle down a bit from whatever init state we
-	 * come from */
-	mdelay(2);
-
-	/* Wait for BSY bit to go away, spec timeout is 30 seconds,
-	 * I know of at least one disk who takes 31 seconds, I use 35
-	 * here to be safe
-	 */
-	rc = ide_wait_not_busy(hwif, 35000);
-	if (rc)
-		return rc;
-
-	/* Now make sure both master & slave are ready */
-	ide_port_for_each_dev(i, drive, hwif) {
-		/* Ignore disks that we will not probe for later. */
-		if ((drive->dev_flags & IDE_DFLAG_NOPROBE) == 0 ||
-		    (drive->dev_flags & IDE_DFLAG_PRESENT)) {
-			tp_ops->dev_select(drive);
-			tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
-			mdelay(2);
-			rc = ide_wait_not_busy(hwif, 35000);
-			if (rc)
-				goto out;
-		} else
-			printk(KERN_DEBUG "%s: ide_wait_not_busy() skipped\n",
-					  drive->name);
-	}
-out:
-	/* Exit function with master reselected (let's be sane) */
-	if (i)
-		tp_ops->dev_select(hwif->devices[0]);
-
-	return rc;
-}
-
-/**
- *	ide_undecoded_slave	-	look for bad CF adapters
- *	@dev1: slave device
- *
- *	Analyse the drives on the interface and attempt to decide if we
- *	have the same drive viewed twice. This occurs with crap CF adapters
- *	and PCMCIA sometimes.
- */
-
-void ide_undecoded_slave(ide_drive_t *dev1)
-{
-	ide_drive_t *dev0 = dev1->hwif->devices[0];
-
-	if ((dev1->dn & 1) == 0 || (dev0->dev_flags & IDE_DFLAG_PRESENT) == 0)
-		return;
-
-	/* If the models don't match they are not the same product */
-	if (strcmp((char *)&dev0->id[ATA_ID_PROD],
-		   (char *)&dev1->id[ATA_ID_PROD]))
-		return;
-
-	/* Serial numbers do not match */
-	if (strncmp((char *)&dev0->id[ATA_ID_SERNO],
-		    (char *)&dev1->id[ATA_ID_SERNO], ATA_ID_SERNO_LEN))
-		return;
-
-	/* No serial number, thankfully very rare for CF */
-	if (*(char *)&dev0->id[ATA_ID_SERNO] == 0)
-		return;
-
-	/* Appears to be an IDE flash adapter with decode bugs */
-	printk(KERN_WARNING "ide-probe: ignoring undecoded slave\n");
-
-	dev1->dev_flags &= ~IDE_DFLAG_PRESENT;
-}
-
-EXPORT_SYMBOL_GPL(ide_undecoded_slave);
-
-static int ide_probe_port(ide_hwif_t *hwif)
-{
-	ide_drive_t *drive;
-	unsigned int irqd;
-	int i, rc = -ENODEV;
-
-	BUG_ON(hwif->present);
-
-	if ((hwif->devices[0]->dev_flags & IDE_DFLAG_NOPROBE) &&
-	    (hwif->devices[1]->dev_flags & IDE_DFLAG_NOPROBE))
-		return -EACCES;
-
-	/*
-	 * We must always disable IRQ, as probe_for_drive will assert IRQ, but
-	 * we'll install our IRQ driver much later...
-	 */
-	irqd = hwif->irq;
-	if (irqd)
-		disable_irq(hwif->irq);
-
-	if (ide_port_wait_ready(hwif) == -EBUSY)
-		printk(KERN_DEBUG "%s: Wait for ready failed before probe !\n", hwif->name);
-
-	/*
-	 * Second drive should only exist if first drive was found,
-	 * but a lot of cdrom drives are configured as single slaves.
-	 */
-	ide_port_for_each_dev(i, drive, hwif) {
-		(void) probe_for_drive(drive);
-		if (drive->dev_flags & IDE_DFLAG_PRESENT)
-			rc = 0;
-	}
-
-	/*
-	 * Use cached IRQ number. It might be (and is...) changed by probe
-	 * code above
-	 */
-	if (irqd)
-		enable_irq(irqd);
-
-	return rc;
-}
-
-static void ide_port_tune_devices(ide_hwif_t *hwif)
-{
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-	ide_drive_t *drive;
-	int i;
-
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		ide_check_nien_quirk_list(drive);
-
-		if (port_ops && port_ops->quirkproc)
-			port_ops->quirkproc(drive);
-	}
-
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		ide_set_max_pio(drive);
-
-		drive->dev_flags |= IDE_DFLAG_NICE1;
-
-		if (hwif->dma_ops)
-			ide_set_dma(drive);
-	}
-}
-
-static void ide_initialize_rq(struct request *rq)
-{
-	struct ide_request *req = blk_mq_rq_to_pdu(rq);
-
-	req->special = NULL;
-	scsi_req_init(&req->sreq);
-	req->sreq.sense = req->sense;
-}
-
-static const struct blk_mq_ops ide_mq_ops = {
-	.queue_rq		= ide_queue_rq,
-	.initialize_rq_fn	= ide_initialize_rq,
-};
-
-/*
- * init request queue
- */
-static int ide_init_queue(ide_drive_t *drive)
-{
-	struct request_queue *q;
-	ide_hwif_t *hwif = drive->hwif;
-	int max_sectors = 256;
-	int max_sg_entries = PRD_ENTRIES;
-	struct blk_mq_tag_set *set;
-
-	/*
-	 *	Our default set up assumes the normal IDE case,
-	 *	that is 64K segmenting, standard PRD setup
-	 *	and LBA28. Some drivers then impose their own
-	 *	limits and LBA48 we could raise it but as yet
-	 *	do not.
-	 */
-
-	set = &drive->tag_set;
-	set->ops = &ide_mq_ops;
-	set->nr_hw_queues = 1;
-	set->queue_depth = 32;
-	set->reserved_tags = 1;
-	set->cmd_size = sizeof(struct ide_request);
-	set->numa_node = hwif_to_node(hwif);
-	set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
-	if (blk_mq_alloc_tag_set(set))
-		return 1;
-
-	q = blk_mq_init_queue(set);
-	if (IS_ERR(q)) {
-		blk_mq_free_tag_set(set);
-		return 1;
-	}
-
-	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
-
-	q->queuedata = drive;
-	blk_queue_segment_boundary(q, 0xffff);
-
-	if (hwif->rqsize < max_sectors)
-		max_sectors = hwif->rqsize;
-	blk_queue_max_hw_sectors(q, max_sectors);
-
-#ifdef CONFIG_PCI
-	/* When we have an IOMMU, we may have a problem where pci_map_sg()
-	 * creates segments that don't completely match our boundary
-	 * requirements and thus need to be broken up again. Because it
-	 * doesn't align properly either, we may actually have to break up
-	 * to more segments than what was we got in the first place, a max
-	 * worst case is twice as many.
-	 * This will be fixed once we teach pci_map_sg() about our boundary
-	 * requirements, hopefully soon. *FIXME*
-	 */
-	max_sg_entries >>= 1;
-#endif /* CONFIG_PCI */
-
-	blk_queue_max_segments(q, max_sg_entries);
-
-	/* assign drive queue */
-	drive->queue = q;
-
-	return 0;
-}
-
-static DEFINE_MUTEX(ide_cfg_mtx);
-
-/*
- * For any present drive:
- * - allocate the block device queue
- */
-static int ide_port_setup_devices(ide_hwif_t *hwif)
-{
-	ide_drive_t *drive;
-	int i, j = 0;
-
-	mutex_lock(&ide_cfg_mtx);
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		if (ide_init_queue(drive)) {
-			printk(KERN_ERR "ide: failed to init %s\n",
-					drive->name);
-			drive->dev_flags &= ~IDE_DFLAG_PRESENT;
-			continue;
-		}
-
-		j++;
-	}
-	mutex_unlock(&ide_cfg_mtx);
-
-	return j;
-}
-
-static void ide_host_enable_irqs(struct ide_host *host)
-{
-	ide_hwif_t *hwif;
-	int i;
-
-	ide_host_for_each_port(i, hwif, host) {
-		if (hwif == NULL)
-			continue;
-
-		/* clear any pending IRQs */
-		hwif->tp_ops->read_status(hwif);
-
-		/* unmask IRQs */
-		if (hwif->io_ports.ctl_addr)
-			hwif->tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
-	}
-}
-
-/*
- * This routine sets up the IRQ for an IDE interface.
- */
-static int init_irq (ide_hwif_t *hwif)
-{
-	struct ide_io_ports *io_ports = &hwif->io_ports;
-	struct ide_host *host = hwif->host;
-	irq_handler_t irq_handler = host->irq_handler;
-	int sa = host->irq_flags;
-
-	if (irq_handler == NULL)
-		irq_handler = ide_intr;
-
-	if (!host->get_lock)
-		if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif))
-			goto out_up;
-
-#if !defined(__mc68000__)
-	printk(KERN_INFO "%s at 0x%03lx-0x%03lx,0x%03lx on irq %d", hwif->name,
-		io_ports->data_addr, io_ports->status_addr,
-		io_ports->ctl_addr, hwif->irq);
-#else
-	printk(KERN_INFO "%s at 0x%08lx on irq %d", hwif->name,
-		io_ports->data_addr, hwif->irq);
-#endif /* __mc68000__ */
-	if (hwif->host->host_flags & IDE_HFLAG_SERIALIZE)
-		printk(KERN_CONT " (serialized)");
-	printk(KERN_CONT "\n");
-
-	return 0;
-out_up:
-	return 1;
-}
-
-static void ata_probe(dev_t dev)
-{
-	request_module("ide-disk");
-	request_module("ide-cd");
-	request_module("ide-tape");
-	request_module("ide-floppy");
-}
-
-void ide_init_disk(struct gendisk *disk, ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned int unit = drive->dn & 1;
-
-	disk->major = hwif->major;
-	disk->first_minor = unit << PARTN_BITS;
-	sprintf(disk->disk_name, "hd%c", 'a' + hwif->index * MAX_DRIVES + unit);
-	disk->queue = drive->queue;
-}
-
-EXPORT_SYMBOL_GPL(ide_init_disk);
-
-static void drive_release_dev (struct device *dev)
-{
-	ide_drive_t *drive = container_of(dev, ide_drive_t, gendev);
-
-	ide_proc_unregister_device(drive);
-
-	if (drive->sense_rq)
-		blk_mq_free_request(drive->sense_rq);
-
-	blk_cleanup_queue(drive->queue);
-	drive->queue = NULL;
-	blk_mq_free_tag_set(&drive->tag_set);
-
-	drive->dev_flags &= ~IDE_DFLAG_PRESENT;
-
-	complete(&drive->gendev_rel_comp);
-}
-
-static int hwif_init(ide_hwif_t *hwif)
-{
-	if (!hwif->irq) {
-		printk(KERN_ERR "%s: disabled, no IRQ\n", hwif->name);
-		return 0;
-	}
-
-	if (__register_blkdev(hwif->major, hwif->name, ata_probe))
-		return 0;
-
-	if (!hwif->sg_max_nents)
-		hwif->sg_max_nents = PRD_ENTRIES;
-
-	hwif->sg_table = kmalloc_array(hwif->sg_max_nents,
-				       sizeof(struct scatterlist),
-				       GFP_KERNEL);
-	if (!hwif->sg_table) {
-		printk(KERN_ERR "%s: unable to allocate SG table.\n", hwif->name);
-		goto out;
-	}
-
-	sg_init_table(hwif->sg_table, hwif->sg_max_nents);
-	
-	if (init_irq(hwif)) {
-		printk(KERN_ERR "%s: disabled, unable to get IRQ %d\n",
-			hwif->name, hwif->irq);
-		goto out;
-	}
-
-	return 1;
-
-out:
-	unregister_blkdev(hwif->major, hwif->name);
-	return 0;
-}
-
-static void hwif_register_devices(ide_hwif_t *hwif)
-{
-	ide_drive_t *drive;
-	unsigned int i;
-
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		struct device *dev = &drive->gendev;
-		int ret;
-
-		dev_set_name(dev, "%u.%u", hwif->index, i);
-		dev_set_drvdata(dev, drive);
-		dev->parent = &hwif->gendev;
-		dev->bus = &ide_bus_type;
-		dev->release = drive_release_dev;
-
-		ret = device_register(dev);
-		if (ret < 0)
-			printk(KERN_WARNING "IDE: %s: device_register error: "
-					    "%d\n", __func__, ret);
-	}
-}
-
-static void ide_port_init_devices(ide_hwif_t *hwif)
-{
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-	ide_drive_t *drive;
-	int i;
-
-	ide_port_for_each_dev(i, drive, hwif) {
-		drive->dn = i + hwif->channel * 2;
-
-		if (hwif->host_flags & IDE_HFLAG_IO_32BIT)
-			drive->io_32bit = 1;
-		if (hwif->host_flags & IDE_HFLAG_NO_IO_32BIT)
-			drive->dev_flags |= IDE_DFLAG_NO_IO_32BIT;
-		if (hwif->host_flags & IDE_HFLAG_UNMASK_IRQS)
-			drive->dev_flags |= IDE_DFLAG_UNMASK;
-		if (hwif->host_flags & IDE_HFLAG_NO_UNMASK_IRQS)
-			drive->dev_flags |= IDE_DFLAG_NO_UNMASK;
-
-		drive->pio_mode = XFER_PIO_0;
-
-		if (port_ops && port_ops->init_dev)
-			port_ops->init_dev(drive);
-	}
-}
-
-static void ide_init_port(ide_hwif_t *hwif, unsigned int port,
-			  const struct ide_port_info *d)
-{
-	hwif->channel = port;
-
-	hwif->chipset = d->chipset ? d->chipset : ide_pci;
-
-	if (d->init_iops)
-		d->init_iops(hwif);
-
-	/* ->host_flags may be set by ->init_iops (or even earlier...) */
-	hwif->host_flags |= d->host_flags;
-	hwif->pio_mask = d->pio_mask;
-
-	if (d->tp_ops)
-		hwif->tp_ops = d->tp_ops;
-
-	/* ->set_pio_mode for DTC2278 is currently limited to port 0 */
-	if ((hwif->host_flags & IDE_HFLAG_DTC2278) == 0 || hwif->channel == 0)
-		hwif->port_ops = d->port_ops;
-
-	hwif->swdma_mask = d->swdma_mask;
-	hwif->mwdma_mask = d->mwdma_mask;
-	hwif->ultra_mask = d->udma_mask;
-
-	if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) {
-		int rc;
-
-		hwif->dma_ops = d->dma_ops;
-
-		if (d->init_dma)
-			rc = d->init_dma(hwif, d);
-		else
-			rc = ide_hwif_setup_dma(hwif, d);
-
-		if (rc < 0) {
-			printk(KERN_INFO "%s: DMA disabled\n", hwif->name);
-
-			hwif->dma_ops = NULL;
-			hwif->dma_base = 0;
-			hwif->swdma_mask = 0;
-			hwif->mwdma_mask = 0;
-			hwif->ultra_mask = 0;
-		}
-	}
-
-	if ((d->host_flags & IDE_HFLAG_SERIALIZE) ||
-	    ((d->host_flags & IDE_HFLAG_SERIALIZE_DMA) && hwif->dma_base))
-		hwif->host->host_flags |= IDE_HFLAG_SERIALIZE;
-
-	if (d->max_sectors)
-		hwif->rqsize = d->max_sectors;
-	else {
-		if ((hwif->host_flags & IDE_HFLAG_NO_LBA48) ||
-		    (hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA))
-			hwif->rqsize = 256;
-		else
-			hwif->rqsize = 65536;
-	}
-
-	/* call chipset specific routine for each enabled port */
-	if (d->init_hwif)
-		d->init_hwif(hwif);
-}
-
-static void ide_port_cable_detect(ide_hwif_t *hwif)
-{
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-
-	if (port_ops && port_ops->cable_detect && (hwif->ultra_mask & 0x78)) {
-		if (hwif->cbl != ATA_CBL_PATA40_SHORT)
-			hwif->cbl = port_ops->cable_detect(hwif);
-	}
-}
-
-/*
- * Deferred request list insertion handler
- */
-static void drive_rq_insert_work(struct work_struct *work)
-{
-	ide_drive_t *drive = container_of(work, ide_drive_t, rq_work);
-	ide_hwif_t *hwif = drive->hwif;
-	struct request *rq;
-	blk_status_t ret;
-	LIST_HEAD(list);
-
-	blk_mq_quiesce_queue(drive->queue);
-
-	ret = BLK_STS_OK;
-	spin_lock_irq(&hwif->lock);
-	while (!list_empty(&drive->rq_list)) {
-		rq = list_first_entry(&drive->rq_list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-
-		spin_unlock_irq(&hwif->lock);
-		ret = ide_issue_rq(drive, rq, true);
-		spin_lock_irq(&hwif->lock);
-	}
-	spin_unlock_irq(&hwif->lock);
-
-	blk_mq_unquiesce_queue(drive->queue);
-
-	if (ret != BLK_STS_OK)
-		kblockd_schedule_work(&drive->rq_work);
-}
-
-static const u8 ide_hwif_to_major[] =
-	{ IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR,
-	  IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR };
-
-static void ide_port_init_devices_data(ide_hwif_t *hwif)
-{
-	ide_drive_t *drive;
-	int i;
-
-	ide_port_for_each_dev(i, drive, hwif) {
-		u8 j = (hwif->index * MAX_DRIVES) + i;
-		u16 *saved_id = drive->id;
-
-		memset(drive, 0, sizeof(*drive));
-		memset(saved_id, 0, SECTOR_SIZE);
-		drive->id = saved_id;
-
-		drive->media			= ide_disk;
-		drive->select			= (i << 4) | ATA_DEVICE_OBS;
-		drive->hwif			= hwif;
-		drive->ready_stat		= ATA_DRDY;
-		drive->bad_wstat		= BAD_W_STAT;
-		drive->special_flags		= IDE_SFLAG_RECALIBRATE |
-						  IDE_SFLAG_SET_GEOMETRY;
-		drive->name[0]			= 'h';
-		drive->name[1]			= 'd';
-		drive->name[2]			= 'a' + j;
-		drive->max_failures		= IDE_DEFAULT_MAX_FAILURES;
-
-		INIT_LIST_HEAD(&drive->list);
-		init_completion(&drive->gendev_rel_comp);
-
-		INIT_WORK(&drive->rq_work, drive_rq_insert_work);
-		INIT_LIST_HEAD(&drive->rq_list);
-	}
-}
-
-static void ide_init_port_data(ide_hwif_t *hwif, unsigned int index)
-{
-	/* fill in any non-zero initial values */
-	hwif->index	= index;
-	hwif->major	= ide_hwif_to_major[index];
-
-	hwif->name[0]	= 'i';
-	hwif->name[1]	= 'd';
-	hwif->name[2]	= 'e';
-	hwif->name[3]	= '0' + index;
-
-	spin_lock_init(&hwif->lock);
-
-	timer_setup(&hwif->timer, ide_timer_expiry, 0);
-
-	init_completion(&hwif->gendev_rel_comp);
-
-	hwif->tp_ops = &default_tp_ops;
-
-	ide_port_init_devices_data(hwif);
-}
-
-static void ide_init_port_hw(ide_hwif_t *hwif, struct ide_hw *hw)
-{
-	memcpy(&hwif->io_ports, &hw->io_ports, sizeof(hwif->io_ports));
-	hwif->irq = hw->irq;
-	hwif->dev = hw->dev;
-	hwif->gendev.parent = hw->parent ? hw->parent : hw->dev;
-	hwif->config_data = hw->config;
-}
-
-static unsigned int ide_indexes;
-
-/**
- *	ide_find_port_slot	-	find free port slot
- *	@d: IDE port info
- *
- *	Return the new port slot index or -ENOENT if we are out of free slots.
- */
-
-static int ide_find_port_slot(const struct ide_port_info *d)
-{
-	int idx = -ENOENT;
-	u8 bootable = (d && (d->host_flags & IDE_HFLAG_NON_BOOTABLE)) ? 0 : 1;
-	u8 i = (d && (d->host_flags & IDE_HFLAG_QD_2ND_PORT)) ? 1 : 0;
-
-	/*
-	 * Claim an unassigned slot.
-	 *
-	 * Give preference to claiming other slots before claiming ide0/ide1,
-	 * just in case there's another interface yet-to-be-scanned
-	 * which uses ports 0x1f0/0x170 (the ide0/ide1 defaults).
-	 *
-	 * Unless there is a bootable card that does not use the standard
-	 * ports 0x1f0/0x170 (the ide0/ide1 defaults).
-	 */
-	mutex_lock(&ide_cfg_mtx);
-	if (bootable) {
-		if ((ide_indexes | i) != (1 << MAX_HWIFS) - 1)
-			idx = ffz(ide_indexes | i);
-	} else {
-		if ((ide_indexes | 3) != (1 << MAX_HWIFS) - 1)
-			idx = ffz(ide_indexes | 3);
-		else if ((ide_indexes & 3) != 3)
-			idx = ffz(ide_indexes);
-	}
-	if (idx >= 0)
-		ide_indexes |= (1 << idx);
-	mutex_unlock(&ide_cfg_mtx);
-
-	return idx;
-}
-
-static void ide_free_port_slot(int idx)
-{
-	mutex_lock(&ide_cfg_mtx);
-	ide_indexes &= ~(1 << idx);
-	mutex_unlock(&ide_cfg_mtx);
-}
-
-static void ide_port_free_devices(ide_hwif_t *hwif)
-{
-	ide_drive_t *drive;
-	int i;
-
-	ide_port_for_each_dev(i, drive, hwif) {
-		kfree(drive->id);
-		kfree(drive);
-	}
-}
-
-static int ide_port_alloc_devices(ide_hwif_t *hwif, int node)
-{
-	ide_drive_t *drive;
-	int i;
-
-	for (i = 0; i < MAX_DRIVES; i++) {
-		drive = kzalloc_node(sizeof(*drive), GFP_KERNEL, node);
-		if (drive == NULL)
-			goto out_nomem;
-
-		/*
-		 * In order to keep things simple we have an id
-		 * block for all drives at all times. If the device
-		 * is pre ATA or refuses ATA/ATAPI identify we
-		 * will add faked data to this.
-		 *
-		 * Also note that 0 everywhere means "can't do X"
-		 */
-		drive->id = kzalloc_node(SECTOR_SIZE, GFP_KERNEL, node);
-		if (drive->id == NULL)
-			goto out_free_drive;
-
-		hwif->devices[i] = drive;
-	}
-	return 0;
-
-out_free_drive:
-	kfree(drive);
-out_nomem:
-	ide_port_free_devices(hwif);
-	return -ENOMEM;
-}
-
-struct ide_host *ide_host_alloc(const struct ide_port_info *d,
-				struct ide_hw **hws, unsigned int n_ports)
-{
-	struct ide_host *host;
-	struct device *dev = hws[0] ? hws[0]->dev : NULL;
-	int node = dev ? dev_to_node(dev) : -1;
-	int i;
-
-	host = kzalloc_node(sizeof(*host), GFP_KERNEL, node);
-	if (host == NULL)
-		return NULL;
-
-	for (i = 0; i < n_ports; i++) {
-		ide_hwif_t *hwif;
-		int idx;
-
-		if (hws[i] == NULL)
-			continue;
-
-		hwif = kzalloc_node(sizeof(*hwif), GFP_KERNEL, node);
-		if (hwif == NULL)
-			continue;
-
-		if (ide_port_alloc_devices(hwif, node) < 0) {
-			kfree(hwif);
-			continue;
-		}
-
-		idx = ide_find_port_slot(d);
-		if (idx < 0) {
-			printk(KERN_ERR "%s: no free slot for interface\n",
-					d ? d->name : "ide");
-			ide_port_free_devices(hwif);
-			kfree(hwif);
-			continue;
-		}
-
-		ide_init_port_data(hwif, idx);
-
-		hwif->host = host;
-
-		host->ports[i] = hwif;
-		host->n_ports++;
-	}
-
-	if (host->n_ports == 0) {
-		kfree(host);
-		return NULL;
-	}
-
-	host->dev[0] = dev;
-
-	if (d) {
-		host->init_chipset = d->init_chipset;
-		host->get_lock     = d->get_lock;
-		host->release_lock = d->release_lock;
-		host->host_flags = d->host_flags;
-		host->irq_flags = d->irq_flags;
-	}
-
-	return host;
-}
-EXPORT_SYMBOL_GPL(ide_host_alloc);
-
-static void ide_port_free(ide_hwif_t *hwif)
-{
-	ide_port_free_devices(hwif);
-	ide_free_port_slot(hwif->index);
-	kfree(hwif);
-}
-
-static void ide_disable_port(ide_hwif_t *hwif)
-{
-	struct ide_host *host = hwif->host;
-	int i;
-
-	printk(KERN_INFO "%s: disabling port\n", hwif->name);
-
-	for (i = 0; i < MAX_HOST_PORTS; i++) {
-		if (host->ports[i] == hwif) {
-			host->ports[i] = NULL;
-			host->n_ports--;
-		}
-	}
-
-	ide_port_free(hwif);
-}
-
-int ide_host_register(struct ide_host *host, const struct ide_port_info *d,
-		      struct ide_hw **hws)
-{
-	ide_hwif_t *hwif, *mate = NULL;
-	int i, j = 0;
-
-	pr_warn("legacy IDE will be removed in 2021, please switch to libata\n"
-		"Report any missing HW support to linux-ide@vger.kernel.org\n");
-
-	ide_host_for_each_port(i, hwif, host) {
-		if (hwif == NULL) {
-			mate = NULL;
-			continue;
-		}
-
-		ide_init_port_hw(hwif, hws[i]);
-		ide_port_apply_params(hwif);
-
-		if ((i & 1) && mate) {
-			hwif->mate = mate;
-			mate->mate = hwif;
-		}
-
-		mate = (i & 1) ? NULL : hwif;
-
-		ide_init_port(hwif, i & 1, d);
-		ide_port_cable_detect(hwif);
-
-		hwif->port_flags |= IDE_PFLAG_PROBING;
-
-		ide_port_init_devices(hwif);
-	}
-
-	ide_host_for_each_port(i, hwif, host) {
-		if (hwif == NULL)
-			continue;
-
-		if (ide_probe_port(hwif) == 0)
-			hwif->present = 1;
-
-		hwif->port_flags &= ~IDE_PFLAG_PROBING;
-
-		if ((hwif->host_flags & IDE_HFLAG_4DRIVES) == 0 ||
-		    hwif->mate == NULL || hwif->mate->present == 0) {
-			if (ide_register_port(hwif)) {
-				ide_disable_port(hwif);
-				continue;
-			}
-		}
-
-		if (hwif->present)
-			ide_port_tune_devices(hwif);
-	}
-
-	ide_host_enable_irqs(host);
-
-	ide_host_for_each_port(i, hwif, host) {
-		if (hwif == NULL)
-			continue;
-
-		if (hwif_init(hwif) == 0) {
-			printk(KERN_INFO "%s: failed to initialize IDE "
-					 "interface\n", hwif->name);
-			device_unregister(hwif->portdev);
-			device_unregister(&hwif->gendev);
-			ide_disable_port(hwif);
-			continue;
-		}
-
-		if (hwif->present)
-			if (ide_port_setup_devices(hwif) == 0) {
-				hwif->present = 0;
-				continue;
-			}
-
-		j++;
-
-		ide_acpi_init_port(hwif);
-
-		if (hwif->present)
-			ide_acpi_port_init_devices(hwif);
-	}
-
-	ide_host_for_each_port(i, hwif, host) {
-		if (hwif == NULL)
-			continue;
-
-		ide_sysfs_register_port(hwif);
-		ide_proc_register_port(hwif);
-
-		if (hwif->present) {
-			ide_proc_port_register_devices(hwif);
-			hwif_register_devices(hwif);
-		}
-	}
-
-	return j ? 0 : -1;
-}
-EXPORT_SYMBOL_GPL(ide_host_register);
-
-int ide_host_add(const struct ide_port_info *d, struct ide_hw **hws,
-		 unsigned int n_ports, struct ide_host **hostp)
-{
-	struct ide_host *host;
-	int rc;
-
-	host = ide_host_alloc(d, hws, n_ports);
-	if (host == NULL)
-		return -ENOMEM;
-
-	rc = ide_host_register(host, d, hws);
-	if (rc) {
-		ide_host_free(host);
-		return rc;
-	}
-
-	if (hostp)
-		*hostp = host;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_host_add);
-
-static void __ide_port_unregister_devices(ide_hwif_t *hwif)
-{
-	ide_drive_t *drive;
-	int i;
-
-	ide_port_for_each_present_dev(i, drive, hwif) {
-		device_unregister(&drive->gendev);
-		wait_for_completion(&drive->gendev_rel_comp);
-	}
-}
-
-void ide_port_unregister_devices(ide_hwif_t *hwif)
-{
-	mutex_lock(&ide_cfg_mtx);
-	__ide_port_unregister_devices(hwif);
-	hwif->present = 0;
-	ide_port_init_devices_data(hwif);
-	mutex_unlock(&ide_cfg_mtx);
-}
-EXPORT_SYMBOL_GPL(ide_port_unregister_devices);
-
-/**
- *	ide_unregister		-	free an IDE interface
- *	@hwif: IDE interface
- *
- *	Perform the final unregister of an IDE interface.
- *
- *	Locking:
- *	The caller must not hold the IDE locks.
- *
- *	It is up to the caller to be sure there is no pending I/O here,
- *	and that the interface will not be reopened (present/vanishing
- *	locking isn't yet done BTW).
- */
-
-static void ide_unregister(ide_hwif_t *hwif)
-{
-	mutex_lock(&ide_cfg_mtx);
-
-	if (hwif->present) {
-		__ide_port_unregister_devices(hwif);
-		hwif->present = 0;
-	}
-
-	ide_proc_unregister_port(hwif);
-
-	if (!hwif->host->get_lock)
-		free_irq(hwif->irq, hwif);
-
-	device_unregister(hwif->portdev);
-	device_unregister(&hwif->gendev);
-	wait_for_completion(&hwif->gendev_rel_comp);
-
-	/*
-	 * Remove us from the kernel's knowledge
-	 */
-	kfree(hwif->sg_table);
-	unregister_blkdev(hwif->major, hwif->name);
-
-	ide_release_dma_engine(hwif);
-
-	mutex_unlock(&ide_cfg_mtx);
-}
-
-void ide_host_free(struct ide_host *host)
-{
-	ide_hwif_t *hwif;
-	int i;
-
-	ide_host_for_each_port(i, hwif, host) {
-		if (hwif)
-			ide_port_free(hwif);
-	}
-
-	kfree(host);
-}
-EXPORT_SYMBOL_GPL(ide_host_free);
-
-void ide_host_remove(struct ide_host *host)
-{
-	ide_hwif_t *hwif;
-	int i;
-
-	ide_host_for_each_port(i, hwif, host) {
-		if (hwif)
-			ide_unregister(hwif);
-	}
-
-	ide_host_free(host);
-}
-EXPORT_SYMBOL_GPL(ide_host_remove);
-
-void ide_port_scan(ide_hwif_t *hwif)
-{
-	int rc;
-
-	ide_port_apply_params(hwif);
-	ide_port_cable_detect(hwif);
-
-	hwif->port_flags |= IDE_PFLAG_PROBING;
-
-	ide_port_init_devices(hwif);
-
-	rc = ide_probe_port(hwif);
-
-	hwif->port_flags &= ~IDE_PFLAG_PROBING;
-
-	if (rc < 0)
-		return;
-
-	hwif->present = 1;
-
-	ide_port_tune_devices(hwif);
-	ide_port_setup_devices(hwif);
-	ide_acpi_port_init_devices(hwif);
-	hwif_register_devices(hwif);
-	ide_proc_port_register_devices(hwif);
-}
-EXPORT_SYMBOL_GPL(ide_port_scan);
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
deleted file mode 100644
index 15c17f3781ee..000000000000
--- a/drivers/ide/ide-proc.c
+++ /dev/null
@@ -1,633 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1997-1998	Mark Lord
- *  Copyright (C) 2003		Red Hat
- *
- *  Some code was moved here from ide.c, see it for original copyrights.
- */
-
-/*
- * This is the /proc/ide/ filesystem implementation.
- *
- * Drive/Driver settings can be retrieved by reading the drive's
- * "settings" files.  e.g.    "cat /proc/ide0/hda/settings"
- * To write a new value "val" into a specific setting "name", use:
- *   echo "name:val" >/proc/ide/ide0/hda/settings
- */
-
-#include <linux/module.h>
-
-#include <linux/uaccess.h>
-#include <linux/errno.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/mm.h>
-#include <linux/pci.h>
-#include <linux/ctype.h>
-#include <linux/ide.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-
-#include <asm/io.h>
-
-static struct proc_dir_entry *proc_ide_root;
-
-static int ide_imodel_proc_show(struct seq_file *m, void *v)
-{
-	ide_hwif_t	*hwif = (ide_hwif_t *) m->private;
-	const char	*name;
-
-	switch (hwif->chipset) {
-	case ide_generic:	name = "generic";	break;
-	case ide_pci:		name = "pci";		break;
-	case ide_cmd640:	name = "cmd640";	break;
-	case ide_dtc2278:	name = "dtc2278";	break;
-	case ide_ali14xx:	name = "ali14xx";	break;
-	case ide_qd65xx:	name = "qd65xx";	break;
-	case ide_umc8672:	name = "umc8672";	break;
-	case ide_ht6560b:	name = "ht6560b";	break;
-	case ide_4drives:	name = "4drives";	break;
-	case ide_pmac:		name = "mac-io";	break;
-	case ide_au1xxx:	name = "au1xxx";	break;
-	case ide_palm3710:      name = "palm3710";      break;
-	case ide_acorn:		name = "acorn";		break;
-	default:		name = "(unknown)";	break;
-	}
-	seq_printf(m, "%s\n", name);
-	return 0;
-}
-
-static int ide_mate_proc_show(struct seq_file *m, void *v)
-{
-	ide_hwif_t	*hwif = (ide_hwif_t *) m->private;
-
-	if (hwif && hwif->mate)
-		seq_printf(m, "%s\n", hwif->mate->name);
-	else
-		seq_printf(m, "(none)\n");
-	return 0;
-}
-
-static int ide_channel_proc_show(struct seq_file *m, void *v)
-{
-	ide_hwif_t	*hwif = (ide_hwif_t *) m->private;
-
-	seq_printf(m, "%c\n", hwif->channel ? '1' : '0');
-	return 0;
-}
-
-static int ide_identify_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t *drive = (ide_drive_t *)m->private;
-	u8 *buf;
-
-	if (!drive) {
-		seq_putc(m, '\n');
-		return 0;
-	}
-
-	buf = kmalloc(SECTOR_SIZE, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-	if (taskfile_lib_get_identify(drive, buf) == 0) {
-		__le16 *val = (__le16 *)buf;
-		int i;
-
-		for (i = 0; i < SECTOR_SIZE / 2; i++) {
-			seq_printf(m, "%04x%c", le16_to_cpu(val[i]),
-					(i % 8) == 7 ? '\n' : ' ');
-		}
-	} else
-		seq_putc(m, buf[0]);
-	kfree(buf);
-	return 0;
-}
-
-/**
- *	ide_find_setting	-	find a specific setting
- *	@st: setting table pointer
- *	@name: setting name
- *
- *	Scan's the setting table for a matching entry and returns
- *	this or NULL if no entry is found. The caller must hold the
- *	setting semaphore
- */
-
-static
-const struct ide_proc_devset *ide_find_setting(const struct ide_proc_devset *st,
-					       char *name)
-{
-	while (st->name) {
-		if (strcmp(st->name, name) == 0)
-			break;
-		st++;
-	}
-	return st->name ? st : NULL;
-}
-
-/**
- *	ide_read_setting	-	read an IDE setting
- *	@drive: drive to read from
- *	@setting: drive setting
- *
- *	Read a drive setting and return the value. The caller
- *	must hold the ide_setting_mtx when making this call.
- *
- *	BUGS: the data return and error are the same return value
- *	so an error -EINVAL and true return of the same value cannot
- *	be told apart
- */
-
-static int ide_read_setting(ide_drive_t *drive,
-			    const struct ide_proc_devset *setting)
-{
-	const struct ide_devset *ds = setting->setting;
-	int val = -EINVAL;
-
-	if (ds->get)
-		val = ds->get(drive);
-
-	return val;
-}
-
-/**
- *	ide_write_setting	-	read an IDE setting
- *	@drive: drive to read from
- *	@setting: drive setting
- *	@val: value
- *
- *	Write a drive setting if it is possible. The caller
- *	must hold the ide_setting_mtx when making this call.
- *
- *	BUGS: the data return and error are the same return value
- *	so an error -EINVAL and true return of the same value cannot
- *	be told apart
- *
- *	FIXME:  This should be changed to enqueue a special request
- *	to the driver to change settings, and then wait on a sema for completion.
- *	The current scheme of polling is kludgy, though safe enough.
- */
-
-static int ide_write_setting(ide_drive_t *drive,
-			     const struct ide_proc_devset *setting, int val)
-{
-	const struct ide_devset *ds = setting->setting;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-	if (!ds->set)
-		return -EPERM;
-	if ((ds->flags & DS_SYNC)
-	    && (val < setting->min || val > setting->max))
-		return -EINVAL;
-	return ide_devset_execute(drive, ds, val);
-}
-
-ide_devset_get(xfer_rate, current_speed);
-
-static int set_xfer_rate (ide_drive_t *drive, int arg)
-{
-	struct ide_cmd cmd;
-
-	if (arg < XFER_PIO_0 || arg > XFER_UDMA_6)
-		return -EINVAL;
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.tf.command = ATA_CMD_SET_FEATURES;
-	cmd.tf.feature = SETFEATURES_XFER;
-	cmd.tf.nsect   = (u8)arg;
-	cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT;
-	cmd.valid.in.tf  = IDE_VALID_NSECT;
-	cmd.tf_flags   = IDE_TFLAG_SET_XFER;
-
-	return ide_no_data_taskfile(drive, &cmd);
-}
-
-ide_devset_rw(current_speed, xfer_rate);
-ide_devset_rw_field(init_speed, init_speed);
-ide_devset_rw_flag(nice1, IDE_DFLAG_NICE1);
-ide_devset_ro_field(number, dn);
-
-static const struct ide_proc_devset ide_generic_settings[] = {
-	IDE_PROC_DEVSET(current_speed, 0, 70),
-	IDE_PROC_DEVSET(init_speed, 0, 70),
-	IDE_PROC_DEVSET(io_32bit,  0, 1 + (SUPPORT_VLB_SYNC << 1)),
-	IDE_PROC_DEVSET(keepsettings, 0, 1),
-	IDE_PROC_DEVSET(nice1, 0, 1),
-	IDE_PROC_DEVSET(number, 0, 3),
-	IDE_PROC_DEVSET(pio_mode, 0, 255),
-	IDE_PROC_DEVSET(unmaskirq, 0, 1),
-	IDE_PROC_DEVSET(using_dma, 0, 1),
-	{ NULL },
-};
-
-static void proc_ide_settings_warn(void)
-{
-	printk_once(KERN_WARNING "Warning: /proc/ide/hd?/settings interface is "
-			    "obsolete, and will be removed soon!\n");
-}
-
-static int ide_settings_proc_show(struct seq_file *m, void *v)
-{
-	const struct ide_proc_devset *setting, *g, *d;
-	const struct ide_devset *ds;
-	ide_drive_t	*drive = (ide_drive_t *) m->private;
-	int		rc, mul_factor, div_factor;
-
-	proc_ide_settings_warn();
-
-	mutex_lock(&ide_setting_mtx);
-	g = ide_generic_settings;
-	d = drive->settings;
-	seq_printf(m, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n");
-	seq_printf(m, "----\t\t\t-----\t\t---\t\t---\t\t----\n");
-	while (g->name || (d && d->name)) {
-		/* read settings in the alphabetical order */
-		if (g->name && d && d->name) {
-			if (strcmp(d->name, g->name) < 0)
-				setting = d++;
-			else
-				setting = g++;
-		} else if (d && d->name) {
-			setting = d++;
-		} else
-			setting = g++;
-		mul_factor = setting->mulf ? setting->mulf(drive) : 1;
-		div_factor = setting->divf ? setting->divf(drive) : 1;
-		seq_printf(m, "%-24s", setting->name);
-		rc = ide_read_setting(drive, setting);
-		if (rc >= 0)
-			seq_printf(m, "%-16d", rc * mul_factor / div_factor);
-		else
-			seq_printf(m, "%-16s", "write-only");
-		seq_printf(m, "%-16d%-16d", (setting->min * mul_factor + div_factor - 1) / div_factor, setting->max * mul_factor / div_factor);
-		ds = setting->setting;
-		if (ds->get)
-			seq_printf(m, "r");
-		if (ds->set)
-			seq_printf(m, "w");
-		seq_printf(m, "\n");
-	}
-	mutex_unlock(&ide_setting_mtx);
-	return 0;
-}
-
-static int ide_settings_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_settings_proc_show, PDE_DATA(inode));
-}
-
-#define MAX_LEN	30
-
-static ssize_t ide_settings_proc_write(struct file *file, const char __user *buffer,
-				       size_t count, loff_t *pos)
-{
-	ide_drive_t	*drive = PDE_DATA(file_inode(file));
-	char		name[MAX_LEN + 1];
-	int		for_real = 0, mul_factor, div_factor;
-	unsigned long	n;
-
-	const struct ide_proc_devset *setting;
-	char *buf, *s;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
-	proc_ide_settings_warn();
-
-	if (count >= PAGE_SIZE)
-		return -EINVAL;
-
-	s = buf = (char *)__get_free_page(GFP_USER);
-	if (!buf)
-		return -ENOMEM;
-
-	if (copy_from_user(buf, buffer, count)) {
-		free_page((unsigned long)buf);
-		return -EFAULT;
-	}
-
-	buf[count] = '\0';
-
-	/*
-	 * Skip over leading whitespace
-	 */
-	while (count && isspace(*s)) {
-		--count;
-		++s;
-	}
-	/*
-	 * Do one full pass to verify all parameters,
-	 * then do another to actually write the new settings.
-	 */
-	do {
-		char *p = s;
-		n = count;
-		while (n > 0) {
-			unsigned val;
-			char *q = p;
-
-			while (n > 0 && *p != ':') {
-				--n;
-				p++;
-			}
-			if (*p != ':')
-				goto parse_error;
-			if (p - q > MAX_LEN)
-				goto parse_error;
-			memcpy(name, q, p - q);
-			name[p - q] = 0;
-
-			if (n > 0) {
-				--n;
-				p++;
-			} else
-				goto parse_error;
-
-			val = simple_strtoul(p, &q, 10);
-			n -= q - p;
-			p = q;
-			if (n > 0 && !isspace(*p))
-				goto parse_error;
-			while (n > 0 && isspace(*p)) {
-				--n;
-				++p;
-			}
-
-			mutex_lock(&ide_setting_mtx);
-			/* generic settings first, then driver specific ones */
-			setting = ide_find_setting(ide_generic_settings, name);
-			if (!setting) {
-				if (drive->settings)
-					setting = ide_find_setting(drive->settings, name);
-				if (!setting) {
-					mutex_unlock(&ide_setting_mtx);
-					goto parse_error;
-				}
-			}
-			if (for_real) {
-				mul_factor = setting->mulf ? setting->mulf(drive) : 1;
-				div_factor = setting->divf ? setting->divf(drive) : 1;
-				ide_write_setting(drive, setting, val * div_factor / mul_factor);
-			}
-			mutex_unlock(&ide_setting_mtx);
-		}
-	} while (!for_real++);
-	free_page((unsigned long)buf);
-	return count;
-parse_error:
-	free_page((unsigned long)buf);
-	printk("%s(): parse error\n", __func__);
-	return -EINVAL;
-}
-
-static const struct proc_ops ide_settings_proc_ops = {
-	.proc_open	= ide_settings_proc_open,
-	.proc_read	= seq_read,
-	.proc_lseek	= seq_lseek,
-	.proc_release	= single_release,
-	.proc_write	= ide_settings_proc_write,
-};
-
-int ide_capacity_proc_show(struct seq_file *m, void *v)
-{
-	seq_printf(m, "%llu\n", (long long)0x7fffffff);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_capacity_proc_show);
-
-int ide_geometry_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t	*drive = (ide_drive_t *) m->private;
-
-	seq_printf(m, "physical     %d/%d/%d\n",
-			drive->cyl, drive->head, drive->sect);
-	seq_printf(m, "logical      %d/%d/%d\n",
-			drive->bios_cyl, drive->bios_head, drive->bios_sect);
-	return 0;
-}
-EXPORT_SYMBOL(ide_geometry_proc_show);
-
-static int ide_dmodel_proc_show(struct seq_file *seq, void *v)
-{
-	ide_drive_t	*drive = (ide_drive_t *) seq->private;
-	char		*m = (char *)&drive->id[ATA_ID_PROD];
-
-	seq_printf(seq, "%.40s\n", m[0] ? m : "(none)");
-	return 0;
-}
-
-static int ide_driver_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t		*drive = (ide_drive_t *)m->private;
-	struct device		*dev = &drive->gendev;
-	struct ide_driver	*ide_drv;
-
-	if (dev->driver) {
-		ide_drv = to_ide_driver(dev->driver);
-		seq_printf(m, "%s version %s\n",
-				dev->driver->name, ide_drv->version);
-	} else
-		seq_printf(m, "ide-default version 0.9.newide\n");
-	return 0;
-}
-
-static int ide_media_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t	*drive = (ide_drive_t *) m->private;
-	const char	*media;
-
-	switch (drive->media) {
-	case ide_disk:		media = "disk\n";	break;
-	case ide_cdrom:		media = "cdrom\n";	break;
-	case ide_tape:		media = "tape\n";	break;
-	case ide_floppy:	media = "floppy\n";	break;
-	case ide_optical:	media = "optical\n";	break;
-	default:		media = "UNKNOWN\n";	break;
-	}
-	seq_puts(m, media);
-	return 0;
-}
-
-static int ide_media_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ide_media_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ide_media_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ide_media_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static ide_proc_entry_t generic_drive_entries[] = {
-	{ "driver",	S_IFREG|S_IRUGO,	 ide_driver_proc_show	},
-	{ "identify",	S_IFREG|S_IRUSR,	 ide_identify_proc_show	},
-	{ "media",	S_IFREG|S_IRUGO,	 ide_media_proc_show	},
-	{ "model",	S_IFREG|S_IRUGO,	 ide_dmodel_proc_show	},
-	{}
-};
-
-static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p, void *data)
-{
-	struct proc_dir_entry *ent;
-
-	if (!dir || !p)
-		return;
-	while (p->name != NULL) {
-		ent = proc_create_single_data(p->name, p->mode, dir, p->show, data);
-		if (!ent) return;
-		p++;
-	}
-}
-
-static void ide_remove_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p)
-{
-	if (!dir || !p)
-		return;
-	while (p->name != NULL) {
-		remove_proc_entry(p->name, dir);
-		p++;
-	}
-}
-
-void ide_proc_register_driver(ide_drive_t *drive, struct ide_driver *driver)
-{
-	mutex_lock(&ide_setting_mtx);
-	drive->settings = driver->proc_devsets(drive);
-	mutex_unlock(&ide_setting_mtx);
-
-	ide_add_proc_entries(drive->proc, driver->proc_entries(drive), drive);
-}
-
-EXPORT_SYMBOL(ide_proc_register_driver);
-
-/**
- *	ide_proc_unregister_driver	-	remove driver specific data
- *	@drive: drive
- *	@driver: driver
- *
- *	Clean up the driver specific /proc files and IDE settings
- *	for a given drive.
- *
- *	Takes ide_setting_mtx.
- */
-
-void ide_proc_unregister_driver(ide_drive_t *drive, struct ide_driver *driver)
-{
-	ide_remove_proc_entries(drive->proc, driver->proc_entries(drive));
-
-	mutex_lock(&ide_setting_mtx);
-	/*
-	 * ide_setting_mtx protects both the settings list and the use
-	 * of settings (we cannot take a setting out that is being used).
-	 */
-	drive->settings = NULL;
-	mutex_unlock(&ide_setting_mtx);
-}
-EXPORT_SYMBOL(ide_proc_unregister_driver);
-
-void ide_proc_port_register_devices(ide_hwif_t *hwif)
-{
-	struct proc_dir_entry *ent;
-	struct proc_dir_entry *parent = hwif->proc;
-	ide_drive_t *drive;
-	char name[64];
-	int i;
-
-	ide_port_for_each_dev(i, drive, hwif) {
-		if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
-			continue;
-
-		drive->proc = proc_mkdir(drive->name, parent);
-		if (drive->proc) {
-			ide_add_proc_entries(drive->proc, generic_drive_entries, drive);
-			proc_create_data("settings", S_IFREG|S_IRUSR|S_IWUSR,
-					drive->proc, &ide_settings_proc_ops,
-					drive);
-		}
-		sprintf(name, "ide%d/%s", (drive->name[2]-'a')/2, drive->name);
-		ent = proc_symlink(drive->name, proc_ide_root, name);
-		if (!ent) return;
-	}
-}
-
-void ide_proc_unregister_device(ide_drive_t *drive)
-{
-	if (drive->proc) {
-		remove_proc_entry("settings", drive->proc);
-		ide_remove_proc_entries(drive->proc, generic_drive_entries);
-		remove_proc_entry(drive->name, proc_ide_root);
-		remove_proc_entry(drive->name, drive->hwif->proc);
-		drive->proc = NULL;
-	}
-}
-
-static ide_proc_entry_t hwif_entries[] = {
-	{ "channel",	S_IFREG|S_IRUGO,	ide_channel_proc_show	},
-	{ "mate",	S_IFREG|S_IRUGO,	ide_mate_proc_show	},
-	{ "model",	S_IFREG|S_IRUGO,	ide_imodel_proc_show	},
-	{}
-};
-
-void ide_proc_register_port(ide_hwif_t *hwif)
-{
-	if (!hwif->proc) {
-		hwif->proc = proc_mkdir(hwif->name, proc_ide_root);
-
-		if (!hwif->proc)
-			return;
-
-		ide_add_proc_entries(hwif->proc, hwif_entries, hwif);
-	}
-}
-
-void ide_proc_unregister_port(ide_hwif_t *hwif)
-{
-	if (hwif->proc) {
-		ide_remove_proc_entries(hwif->proc, hwif_entries);
-		remove_proc_entry(hwif->name, proc_ide_root);
-		hwif->proc = NULL;
-	}
-}
-
-static int proc_print_driver(struct device_driver *drv, void *data)
-{
-	struct ide_driver *ide_drv = to_ide_driver(drv);
-	struct seq_file *s = data;
-
-	seq_printf(s, "%s version %s\n", drv->name, ide_drv->version);
-
-	return 0;
-}
-
-static int ide_drivers_show(struct seq_file *s, void *p)
-{
-	int err;
-
-	err = bus_for_each_drv(&ide_bus_type, NULL, s, proc_print_driver);
-	if (err < 0)
-		printk(KERN_WARNING "IDE: %s: bus_for_each_drv error: %d\n",
-			__func__, err);
-	return 0;
-}
-
-DEFINE_PROC_SHOW_ATTRIBUTE(ide_drivers);
-
-void proc_ide_create(void)
-{
-	proc_ide_root = proc_mkdir("ide", NULL);
-
-	if (!proc_ide_root)
-		return;
-
-	proc_create("drivers", 0, proc_ide_root, &ide_drivers_proc_ops);
-}
-
-void proc_ide_destroy(void)
-{
-	remove_proc_entry("drivers", proc_ide_root);
-	remove_proc_entry("ide", NULL);
-}
diff --git a/drivers/ide/ide-scan-pci.c b/drivers/ide/ide-scan-pci.c
deleted file mode 100644
index b0411a1827a3..000000000000
--- a/drivers/ide/ide-scan-pci.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * support for probing IDE PCI devices in the PCI bus order
- *
- * Copyright (c) 1998-2000  Andre Hedrick <andre@linux-ide.org>
- * Copyright (c) 1995-1998  Mark Lord
- *
- * May be copied or modified under the terms of the GNU General Public License
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/ide.h>
-
-/*
- *	Module interfaces
- */
-
-static int pre_init = 1;		/* Before first ordered IDE scan */
-static LIST_HEAD(ide_pci_drivers);
-
-/*
- *	__ide_pci_register_driver	-	attach IDE driver
- *	@driver: pci driver
- *	@module: owner module of the driver
- *
- *	Registers a driver with the IDE layer. The IDE layer arranges that
- *	boot time setup is done in the expected device order and then
- *	hands the controllers off to the core PCI code to do the rest of
- *	the work.
- *
- *	Returns are the same as for pci_register_driver
- */
-
-int __ide_pci_register_driver(struct pci_driver *driver, struct module *module,
-			      const char *mod_name)
-{
-	if (!pre_init)
-		return __pci_register_driver(driver, module, mod_name);
-	driver->driver.owner = module;
-	list_add_tail(&driver->node, &ide_pci_drivers);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(__ide_pci_register_driver);
-
-/**
- *	ide_scan_pcidev		-	find an IDE driver for a device
- *	@dev: PCI device to check
- *
- *	Look for an IDE driver to handle the device we are considering.
- *	This is only used during boot up to get the ordering correct. After
- *	boot up the pci layer takes over the job.
- */
-
-static int __init ide_scan_pcidev(struct pci_dev *dev)
-{
-	struct list_head *l;
-	struct pci_driver *d;
-	int ret;
-
-	list_for_each(l, &ide_pci_drivers) {
-		d = list_entry(l, struct pci_driver, node);
-		if (d->id_table) {
-			const struct pci_device_id *id =
-				pci_match_id(d->id_table, dev);
-
-			if (id != NULL) {
-				pci_assign_irq(dev);
-				ret = d->probe(dev, id);
-				if (ret >= 0) {
-					dev->driver = d;
-					pci_dev_get(dev);
-					return 1;
-				}
-			}
-		}
-	}
-	return 0;
-}
-
-/**
- *	ide_scan_pcibus		-	perform the initial IDE driver scan
- *
- *	Perform the initial bus rather than driver ordered scan of the
- *	PCI drivers. After this all IDE pci handling becomes standard
- *	module ordering not traditionally ordered.
- */
-
-static int __init ide_scan_pcibus(void)
-{
-	struct pci_dev *dev = NULL;
-	struct pci_driver *d, *tmp;
-
-	pre_init = 0;
-	for_each_pci_dev(dev)
-		ide_scan_pcidev(dev);
-
-	/*
-	 *	Hand the drivers over to the PCI layer now we
-	 *	are post init.
-	 */
-
-	list_for_each_entry_safe(d, tmp, &ide_pci_drivers, node) {
-		list_del(&d->node);
-		if (__pci_register_driver(d, d->driver.owner,
-					  d->driver.mod_name))
-			printk(KERN_ERR "%s: failed to register %s driver\n",
-					__func__, d->driver.mod_name);
-	}
-
-	return 0;
-}
-device_initcall(ide_scan_pcibus);
diff --git a/drivers/ide/ide-sysfs.c b/drivers/ide/ide-sysfs.c
deleted file mode 100644
index c08a8a0916e2..000000000000
--- a/drivers/ide/ide-sysfs.c
+++ /dev/null
@@ -1,143 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/ide.h>
-
-char *ide_media_string(ide_drive_t *drive)
-{
-	switch (drive->media) {
-	case ide_disk:
-		return "disk";
-	case ide_cdrom:
-		return "cdrom";
-	case ide_tape:
-		return "tape";
-	case ide_floppy:
-		return "floppy";
-	case ide_optical:
-		return "optical";
-	default:
-		return "UNKNOWN";
-	}
-}
-
-static ssize_t media_show(struct device *dev, struct device_attribute *attr,
-			  char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", ide_media_string(drive));
-}
-static DEVICE_ATTR_RO(media);
-
-static ssize_t drivename_show(struct device *dev, struct device_attribute *attr,
-			      char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", drive->name);
-}
-static DEVICE_ATTR_RO(drivename);
-
-static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
-			     char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "ide:m-%s\n", ide_media_string(drive));
-}
-static DEVICE_ATTR_RO(modalias);
-
-static ssize_t model_show(struct device *dev, struct device_attribute *attr,
-			  char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_PROD]);
-}
-static DEVICE_ATTR_RO(model);
-
-static ssize_t firmware_show(struct device *dev, struct device_attribute *attr,
-			     char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_FW_REV]);
-}
-static DEVICE_ATTR_RO(firmware);
-
-static ssize_t serial_show(struct device *dev, struct device_attribute *attr,
-			   char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_SERNO]);
-}
-static DEVICE_ATTR(serial, 0400, serial_show, NULL);
-
-static DEVICE_ATTR(unload_heads, 0644, ide_park_show, ide_park_store);
-
-static struct attribute *ide_attrs[] = {
-	&dev_attr_media.attr,
-	&dev_attr_drivename.attr,
-	&dev_attr_modalias.attr,
-	&dev_attr_model.attr,
-	&dev_attr_firmware.attr,
-	&dev_attr_serial.attr,
-	&dev_attr_unload_heads.attr,
-	NULL,
-};
-
-static const struct attribute_group ide_attr_group = {
-	.attrs = ide_attrs,
-};
-
-const struct attribute_group *ide_dev_groups[] = {
-	&ide_attr_group,
-	NULL,
-};
-
-static ssize_t store_delete_devices(struct device *portdev,
-				    struct device_attribute *attr,
-				    const char *buf, size_t n)
-{
-	ide_hwif_t *hwif = dev_get_drvdata(portdev);
-
-	if (strncmp(buf, "1", n))
-		return -EINVAL;
-
-	ide_port_unregister_devices(hwif);
-
-	return n;
-};
-
-static DEVICE_ATTR(delete_devices, S_IWUSR, NULL, store_delete_devices);
-
-static ssize_t store_scan(struct device *portdev,
-			  struct device_attribute *attr,
-			  const char *buf, size_t n)
-{
-	ide_hwif_t *hwif = dev_get_drvdata(portdev);
-
-	if (strncmp(buf, "1", n))
-		return -EINVAL;
-
-	ide_port_unregister_devices(hwif);
-	ide_port_scan(hwif);
-
-	return n;
-};
-
-static DEVICE_ATTR(scan, S_IWUSR, NULL, store_scan);
-
-static struct device_attribute *ide_port_attrs[] = {
-	&dev_attr_delete_devices,
-	&dev_attr_scan,
-	NULL
-};
-
-int ide_sysfs_register_port(ide_hwif_t *hwif)
-{
-	int i, rc;
-
-	for (i = 0; ide_port_attrs[i]; i++) {
-		rc = device_create_file(hwif->portdev, ide_port_attrs[i]);
-		if (rc)
-			break;
-	}
-
-	return rc;
-}
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
deleted file mode 100644
index fa05e7e7d609..000000000000
--- a/drivers/ide/ide-tape.c
+++ /dev/null
@@ -1,2083 +0,0 @@
-/*
- * IDE ATAPI streaming tape driver.
- *
- * Copyright (C) 1995-1999  Gadi Oxman <gadio@netvision.net.il>
- * Copyright (C) 2003-2005  Bartlomiej Zolnierkiewicz
- *
- * This driver was constructed as a student project in the software laboratory
- * of the faculty of electrical engineering in the Technion - Israel's
- * Institute Of Technology, with the guide of Avner Lottem and Dr. Ilana David.
- *
- * It is hereby placed under the terms of the GNU general public license.
- * (See linux/COPYING).
- *
- * For a historical changelog see
- * Documentation/ide/ChangeLog.ide-tape.1995-2002
- */
-
-#define DRV_NAME "ide-tape"
-
-#define IDETAPE_VERSION "1.20"
-
-#include <linux/compat.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/jiffies.h>
-#include <linux/major.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/completion.h>
-#include <linux/bitops.h>
-#include <linux/mutex.h>
-#include <scsi/scsi.h>
-
-#include <asm/byteorder.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-#include <asm/unaligned.h>
-#include <linux/mtio.h>
-
-/* define to see debug info */
-#undef IDETAPE_DEBUG_LOG
-
-#ifdef IDETAPE_DEBUG_LOG
-#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, ## args)
-#else
-#define ide_debug_log(lvl, fmt, args...) do {} while (0)
-#endif
-
-/**************************** Tunable parameters *****************************/
-/*
- * After each failed packet command we issue a request sense command and retry
- * the packet command IDETAPE_MAX_PC_RETRIES times.
- *
- * Setting IDETAPE_MAX_PC_RETRIES to 0 will disable retries.
- */
-#define IDETAPE_MAX_PC_RETRIES		3
-
-/*
- * The following parameter is used to select the point in the internal tape fifo
- * in which we will start to refill the buffer. Decreasing the following
- * parameter will improve the system's latency and interactive response, while
- * using a high value might improve system throughput.
- */
-#define IDETAPE_FIFO_THRESHOLD		2
-
-/*
- * DSC polling parameters.
- *
- * Polling for DSC (a single bit in the status register) is a very important
- * function in ide-tape. There are two cases in which we poll for DSC:
- *
- * 1. Before a read/write packet command, to ensure that we can transfer data
- * from/to the tape's data buffers, without causing an actual media access.
- * In case the tape is not ready yet, we take out our request from the device
- * request queue, so that ide.c could service requests from the other device
- * on the same interface in the meantime.
- *
- * 2. After the successful initialization of a "media access packet command",
- * which is a command that can take a long time to complete (the interval can
- * range from several seconds to even an hour). Again, we postpone our request
- * in the middle to free the bus for the other device. The polling frequency
- * here should be lower than the read/write frequency since those media access
- * commands are slow. We start from a "fast" frequency - IDETAPE_DSC_MA_FAST
- * (1 second), and if we don't receive DSC after IDETAPE_DSC_MA_THRESHOLD
- * (5 min), we switch it to a lower frequency - IDETAPE_DSC_MA_SLOW (1 min).
- *
- * We also set a timeout for the timer, in case something goes wrong. The
- * timeout should be longer then the maximum execution time of a tape operation.
- */
-
-/* DSC timings. */
-#define IDETAPE_DSC_RW_MIN		5*HZ/100	/* 50 msec */
-#define IDETAPE_DSC_RW_MAX		40*HZ/100	/* 400 msec */
-#define IDETAPE_DSC_RW_TIMEOUT		2*60*HZ		/* 2 minutes */
-#define IDETAPE_DSC_MA_FAST		2*HZ		/* 2 seconds */
-#define IDETAPE_DSC_MA_THRESHOLD	5*60*HZ		/* 5 minutes */
-#define IDETAPE_DSC_MA_SLOW		30*HZ		/* 30 seconds */
-#define IDETAPE_DSC_MA_TIMEOUT		2*60*60*HZ	/* 2 hours */
-
-/*************************** End of tunable parameters ***********************/
-
-/* tape directions */
-enum {
-	IDETAPE_DIR_NONE  = (1 << 0),
-	IDETAPE_DIR_READ  = (1 << 1),
-	IDETAPE_DIR_WRITE = (1 << 2),
-};
-
-/* Tape door status */
-#define DOOR_UNLOCKED			0
-#define DOOR_LOCKED			1
-#define DOOR_EXPLICITLY_LOCKED		2
-
-/* Some defines for the SPACE command */
-#define IDETAPE_SPACE_OVER_FILEMARK	1
-#define IDETAPE_SPACE_TO_EOD		3
-
-/* Some defines for the LOAD UNLOAD command */
-#define IDETAPE_LU_LOAD_MASK		1
-#define IDETAPE_LU_RETENSION_MASK	2
-#define IDETAPE_LU_EOT_MASK		4
-
-/* Structures related to the SELECT SENSE / MODE SENSE packet commands. */
-#define IDETAPE_BLOCK_DESCRIPTOR	0
-#define IDETAPE_CAPABILITIES_PAGE	0x2a
-
-/*
- * Most of our global data which we need to save even as we leave the driver due
- * to an interrupt or a timer event is stored in the struct defined below.
- */
-typedef struct ide_tape_obj {
-	ide_drive_t		*drive;
-	struct ide_driver	*driver;
-	struct gendisk		*disk;
-	struct device		dev;
-
-	/* used by REQ_IDETAPE_{READ,WRITE} requests */
-	struct ide_atapi_pc queued_pc;
-
-	/*
-	 * DSC polling variables.
-	 *
-	 * While polling for DSC we use postponed_rq to postpone the current
-	 * request so that ide.c will be able to service pending requests on the
-	 * other device. Note that at most we will have only one DSC (usually
-	 * data transfer) request in the device request queue.
-	 */
-	bool postponed_rq;
-
-	/* The time in which we started polling for DSC */
-	unsigned long dsc_polling_start;
-	/* Timer used to poll for dsc */
-	struct timer_list dsc_timer;
-	/* Read/Write dsc polling frequency */
-	unsigned long best_dsc_rw_freq;
-	unsigned long dsc_poll_freq;
-	unsigned long dsc_timeout;
-
-	/* Read position information */
-	u8 partition;
-	/* Current block */
-	unsigned int first_frame;
-
-	/* Last error information */
-	u8 sense_key, asc, ascq;
-
-	/* Character device operation */
-	unsigned int minor;
-	/* device name */
-	char name[4];
-	/* Current character device data transfer direction */
-	u8 chrdev_dir;
-
-	/* tape block size, usually 512 or 1024 bytes */
-	unsigned short blk_size;
-	int user_bs_factor;
-
-	/* Copy of the tape's Capabilities and Mechanical Page */
-	u8 caps[20];
-
-	/*
-	 * Active data transfer request parameters.
-	 *
-	 * At most, there is only one ide-tape originated data transfer request
-	 * in the device request queue. This allows ide.c to easily service
-	 * requests from the other device when we postpone our active request.
-	 */
-
-	/* Data buffer size chosen based on the tape's recommendation */
-	int buffer_size;
-	/* Staging buffer of buffer_size bytes */
-	void *buf;
-	/* The read/write cursor */
-	void *cur;
-	/* The number of valid bytes in buf */
-	size_t valid;
-
-	/* Measures average tape speed */
-	unsigned long avg_time;
-	int avg_size;
-	int avg_speed;
-
-	/* the door is currently locked */
-	int door_locked;
-	/* the tape hardware is write protected */
-	char drv_write_prot;
-	/* the tape is write protected (hardware or opened as read-only) */
-	char write_prot;
-} idetape_tape_t;
-
-static DEFINE_MUTEX(ide_tape_mutex);
-static DEFINE_MUTEX(idetape_ref_mutex);
-
-static DEFINE_MUTEX(idetape_chrdev_mutex);
-
-static struct class *idetape_sysfs_class;
-
-static void ide_tape_release(struct device *);
-
-static struct ide_tape_obj *idetape_devs[MAX_HWIFS * MAX_DRIVES];
-
-static struct ide_tape_obj *ide_tape_get(struct gendisk *disk, bool cdev,
-					 unsigned int i)
-{
-	struct ide_tape_obj *tape = NULL;
-
-	mutex_lock(&idetape_ref_mutex);
-
-	if (cdev)
-		tape = idetape_devs[i];
-	else
-		tape = ide_drv_g(disk, ide_tape_obj);
-
-	if (tape) {
-		if (ide_device_get(tape->drive))
-			tape = NULL;
-		else
-			get_device(&tape->dev);
-	}
-
-	mutex_unlock(&idetape_ref_mutex);
-	return tape;
-}
-
-static void ide_tape_put(struct ide_tape_obj *tape)
-{
-	ide_drive_t *drive = tape->drive;
-
-	mutex_lock(&idetape_ref_mutex);
-	put_device(&tape->dev);
-	ide_device_put(drive);
-	mutex_unlock(&idetape_ref_mutex);
-}
-
-/*
- * called on each failed packet command retry to analyze the request sense. We
- * currently do not utilize this information.
- */
-static void idetape_analyze_error(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc *pc = drive->failed_pc;
-	struct request *rq = drive->hwif->rq;
-	u8 *sense = bio_data(rq->bio);
-
-	tape->sense_key = sense[2] & 0xF;
-	tape->asc       = sense[12];
-	tape->ascq      = sense[13];
-
-	ide_debug_log(IDE_DBG_FUNC,
-		      "cmd: 0x%x, sense key = %x, asc = %x, ascq = %x",
-		      rq->cmd[0], tape->sense_key, tape->asc, tape->ascq);
-
-	/* correct remaining bytes to transfer */
-	if (pc->flags & PC_FLAG_DMA_ERROR)
-		scsi_req(rq)->resid_len = tape->blk_size * get_unaligned_be32(&sense[3]);
-
-	/*
-	 * If error was the result of a zero-length read or write command,
-	 * with sense key=5, asc=0x22, ascq=0, let it slide.  Some drives
-	 * (i.e. Seagate STT3401A Travan) don't support 0-length read/writes.
-	 */
-	if ((pc->c[0] == READ_6 || pc->c[0] == WRITE_6)
-	    /* length == 0 */
-	    && pc->c[4] == 0 && pc->c[3] == 0 && pc->c[2] == 0) {
-		if (tape->sense_key == 5) {
-			/* don't report an error, everything's ok */
-			pc->error = 0;
-			/* don't retry read/write */
-			pc->flags |= PC_FLAG_ABORT;
-		}
-	}
-	if (pc->c[0] == READ_6 && (sense[2] & 0x80)) {
-		pc->error = IDE_DRV_ERROR_FILEMARK;
-		pc->flags |= PC_FLAG_ABORT;
-	}
-	if (pc->c[0] == WRITE_6) {
-		if ((sense[2] & 0x40) || (tape->sense_key == 0xd
-		     && tape->asc == 0x0 && tape->ascq == 0x2)) {
-			pc->error = IDE_DRV_ERROR_EOD;
-			pc->flags |= PC_FLAG_ABORT;
-		}
-	}
-	if (pc->c[0] == READ_6 || pc->c[0] == WRITE_6) {
-		if (tape->sense_key == 8) {
-			pc->error = IDE_DRV_ERROR_EOD;
-			pc->flags |= PC_FLAG_ABORT;
-		}
-		if (!(pc->flags & PC_FLAG_ABORT) &&
-		    (blk_rq_bytes(rq) - scsi_req(rq)->resid_len))
-			pc->retries = IDETAPE_MAX_PC_RETRIES + 1;
-	}
-}
-
-static void ide_tape_handle_dsc(ide_drive_t *);
-
-static int ide_tape_callback(ide_drive_t *drive, int dsc)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc *pc = drive->pc;
-	struct request *rq = drive->hwif->rq;
-	int uptodate = pc->error ? 0 : 1;
-	int err = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
-
-	ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, dsc: %d, err: %d", rq->cmd[0],
-		      dsc, err);
-
-	if (dsc)
-		ide_tape_handle_dsc(drive);
-
-	if (drive->failed_pc == pc)
-		drive->failed_pc = NULL;
-
-	if (pc->c[0] == REQUEST_SENSE) {
-		if (uptodate)
-			idetape_analyze_error(drive);
-		else
-			printk(KERN_ERR "ide-tape: Error in REQUEST SENSE "
-					"itself - Aborting request!\n");
-	} else if (pc->c[0] == READ_6 || pc->c[0] == WRITE_6) {
-		unsigned int blocks =
-			(blk_rq_bytes(rq) - scsi_req(rq)->resid_len) / tape->blk_size;
-
-		tape->avg_size += blocks * tape->blk_size;
-
-		if (time_after_eq(jiffies, tape->avg_time + HZ)) {
-			tape->avg_speed = tape->avg_size * HZ /
-				(jiffies - tape->avg_time) / 1024;
-			tape->avg_size = 0;
-			tape->avg_time = jiffies;
-		}
-
-		tape->first_frame += blocks;
-
-		if (pc->error) {
-			uptodate = 0;
-			err = pc->error;
-		}
-	}
-	scsi_req(rq)->result = err;
-
-	return uptodate;
-}
-
-/*
- * Postpone the current request so that ide.c will be able to service requests
- * from another device on the same port while we are polling for DSC.
- */
-static void ide_tape_stall_queue(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, dsc_poll_freq: %lu",
-		      drive->hwif->rq->cmd[0], tape->dsc_poll_freq);
-
-	tape->postponed_rq = true;
-
-	ide_stall_queue(drive, tape->dsc_poll_freq);
-}
-
-static void ide_tape_handle_dsc(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	/* Media access command */
-	tape->dsc_polling_start = jiffies;
-	tape->dsc_poll_freq = IDETAPE_DSC_MA_FAST;
-	tape->dsc_timeout = jiffies + IDETAPE_DSC_MA_TIMEOUT;
-	/* Allow ide.c to handle other requests */
-	ide_tape_stall_queue(drive);
-}
-
-/*
- * Packet Command Interface
- *
- * The current Packet Command is available in drive->pc, and will not change
- * until we finish handling it. Each packet command is associated with a
- * callback function that will be called when the command is finished.
- *
- * The handling will be done in three stages:
- *
- * 1. ide_tape_issue_pc will send the packet command to the drive, and will set
- * the interrupt handler to ide_pc_intr.
- *
- * 2. On each interrupt, ide_pc_intr will be called. This step will be
- * repeated until the device signals us that no more interrupts will be issued.
- *
- * 3. ATAPI Tape media access commands have immediate status with a delayed
- * process. In case of a successful initiation of a media access packet command,
- * the DSC bit will be set when the actual execution of the command is finished.
- * Since the tape drive will not issue an interrupt, we have to poll for this
- * event. In this case, we define the request as "low priority request" by
- * setting rq_status to IDETAPE_RQ_POSTPONED, set a timer to poll for DSC and
- * exit the driver.
- *
- * ide.c will then give higher priority to requests which originate from the
- * other device, until will change rq_status to RQ_ACTIVE.
- *
- * 4. When the packet command is finished, it will be checked for errors.
- *
- * 5. In case an error was found, we queue a request sense packet command in
- * front of the request queue and retry the operation up to
- * IDETAPE_MAX_PC_RETRIES times.
- *
- * 6. In case no error was found, or we decided to give up and not to retry
- * again, the callback function will be called and then we will handle the next
- * request.
- */
-
-static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
-					 struct ide_cmd *cmd,
-					 struct ide_atapi_pc *pc)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct request *rq = drive->hwif->rq;
-
-	if (drive->failed_pc == NULL && pc->c[0] != REQUEST_SENSE)
-		drive->failed_pc = pc;
-
-	/* Set the current packet command */
-	drive->pc = pc;
-
-	if (pc->retries > IDETAPE_MAX_PC_RETRIES ||
-		(pc->flags & PC_FLAG_ABORT)) {
-
-		/*
-		 * We will "abort" retrying a packet command in case legitimate
-		 * error code was received (crossing a filemark, or end of the
-		 * media, for example).
-		 */
-		if (!(pc->flags & PC_FLAG_ABORT)) {
-			if (!(pc->c[0] == TEST_UNIT_READY &&
-			      tape->sense_key == 2 && tape->asc == 4 &&
-			     (tape->ascq == 1 || tape->ascq == 8))) {
-				printk(KERN_ERR "ide-tape: %s: I/O error, "
-						"pc = %2x, key = %2x, "
-						"asc = %2x, ascq = %2x\n",
-						tape->name, pc->c[0],
-						tape->sense_key, tape->asc,
-						tape->ascq);
-			}
-			/* Giving up */
-			pc->error = IDE_DRV_ERROR_GENERAL;
-		}
-
-		drive->failed_pc = NULL;
-		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq));
-		return ide_stopped;
-	}
-	ide_debug_log(IDE_DBG_SENSE, "retry #%d, cmd: 0x%02x", pc->retries,
-		      pc->c[0]);
-
-	pc->retries++;
-
-	return ide_issue_pc(drive, cmd);
-}
-
-/* A mode sense command is used to "sense" tape parameters. */
-static void idetape_create_mode_sense_cmd(struct ide_atapi_pc *pc, u8 page_code)
-{
-	ide_init_pc(pc);
-	pc->c[0] = MODE_SENSE;
-	if (page_code != IDETAPE_BLOCK_DESCRIPTOR)
-		/* DBD = 1 - Don't return block descriptors */
-		pc->c[1] = 8;
-	pc->c[2] = page_code;
-	/*
-	 * Changed pc->c[3] to 0 (255 will at best return unused info).
-	 *
-	 * For SCSI this byte is defined as subpage instead of high byte
-	 * of length and some IDE drives seem to interpret it this way
-	 * and return an error when 255 is used.
-	 */
-	pc->c[3] = 0;
-	/* We will just discard data in that case */
-	pc->c[4] = 255;
-	if (page_code == IDETAPE_BLOCK_DESCRIPTOR)
-		pc->req_xfer = 12;
-	else if (page_code == IDETAPE_CAPABILITIES_PAGE)
-		pc->req_xfer = 24;
-	else
-		pc->req_xfer = 50;
-}
-
-static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc *pc = drive->pc;
-	u8 stat;
-
-	stat = hwif->tp_ops->read_status(hwif);
-
-	if (stat & ATA_DSC) {
-		if (stat & ATA_ERR) {
-			/* Error detected */
-			if (pc->c[0] != TEST_UNIT_READY)
-				printk(KERN_ERR "ide-tape: %s: I/O error, ",
-						tape->name);
-			/* Retry operation */
-			ide_retry_pc(drive);
-			return ide_stopped;
-		}
-		pc->error = 0;
-	} else {
-		pc->error = IDE_DRV_ERROR_GENERAL;
-		drive->failed_pc = NULL;
-	}
-	drive->pc_callback(drive, 0);
-	return ide_stopped;
-}
-
-static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
-				   struct ide_atapi_pc *pc, struct request *rq,
-				   u8 opcode)
-{
-	unsigned int length = blk_rq_sectors(rq) / (tape->blk_size >> 9);
-
-	ide_init_pc(pc);
-	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
-	pc->c[1] = 1;
-
-	if (blk_rq_bytes(rq) == tape->buffer_size)
-		pc->flags |= PC_FLAG_DMA_OK;
-
-	if (opcode == READ_6)
-		pc->c[0] = READ_6;
-	else if (opcode == WRITE_6) {
-		pc->c[0] = WRITE_6;
-		pc->flags |= PC_FLAG_WRITING;
-	}
-
-	memcpy(scsi_req(rq)->cmd, pc->c, 12);
-}
-
-static ide_startstop_t idetape_do_request(ide_drive_t *drive,
-					  struct request *rq, sector_t block)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc *pc = NULL;
-	struct ide_cmd cmd;
-	struct scsi_request *req = scsi_req(rq);
-	u8 stat;
-
-	ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, sector: %llu, nr_sectors: %u",
-		      req->cmd[0], (unsigned long long)blk_rq_pos(rq),
-		      blk_rq_sectors(rq));
-
-	BUG_ON(!blk_rq_is_private(rq));
-	BUG_ON(ide_req(rq)->type != ATA_PRIV_MISC &&
-	       ide_req(rq)->type != ATA_PRIV_SENSE);
-
-	/* Retry a failed packet command */
-	if (drive->failed_pc && drive->pc->c[0] == REQUEST_SENSE) {
-		pc = drive->failed_pc;
-		goto out;
-	}
-
-	/*
-	 * If the tape is still busy, postpone our request and service
-	 * the other device meanwhile.
-	 */
-	stat = hwif->tp_ops->read_status(hwif);
-
-	if ((drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) == 0 &&
-	    (req->cmd[13] & REQ_IDETAPE_PC2) == 0)
-		drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC;
-
-	if (drive->dev_flags & IDE_DFLAG_POST_RESET) {
-		drive->atapi_flags |= IDE_AFLAG_IGNORE_DSC;
-		drive->dev_flags &= ~IDE_DFLAG_POST_RESET;
-	}
-
-	if (!(drive->atapi_flags & IDE_AFLAG_IGNORE_DSC) &&
-	    !(stat & ATA_DSC)) {
-		if (!tape->postponed_rq) {
-			tape->dsc_polling_start = jiffies;
-			tape->dsc_poll_freq = tape->best_dsc_rw_freq;
-			tape->dsc_timeout = jiffies + IDETAPE_DSC_RW_TIMEOUT;
-		} else if (time_after(jiffies, tape->dsc_timeout)) {
-			printk(KERN_ERR "ide-tape: %s: DSC timeout\n",
-				tape->name);
-			if (req->cmd[13] & REQ_IDETAPE_PC2) {
-				idetape_media_access_finished(drive);
-				return ide_stopped;
-			} else {
-				return ide_do_reset(drive);
-			}
-		} else if (time_after(jiffies,
-					tape->dsc_polling_start +
-					IDETAPE_DSC_MA_THRESHOLD))
-			tape->dsc_poll_freq = IDETAPE_DSC_MA_SLOW;
-		ide_tape_stall_queue(drive);
-		return ide_stopped;
-	} else {
-		drive->atapi_flags &= ~IDE_AFLAG_IGNORE_DSC;
-		tape->postponed_rq = false;
-	}
-
-	if (req->cmd[13] & REQ_IDETAPE_READ) {
-		pc = &tape->queued_pc;
-		ide_tape_create_rw_cmd(tape, pc, rq, READ_6);
-		goto out;
-	}
-	if (req->cmd[13] & REQ_IDETAPE_WRITE) {
-		pc = &tape->queued_pc;
-		ide_tape_create_rw_cmd(tape, pc, rq, WRITE_6);
-		goto out;
-	}
-	if (req->cmd[13] & REQ_IDETAPE_PC1) {
-		pc = (struct ide_atapi_pc *)ide_req(rq)->special;
-		req->cmd[13] &= ~(REQ_IDETAPE_PC1);
-		req->cmd[13] |= REQ_IDETAPE_PC2;
-		goto out;
-	}
-	if (req->cmd[13] & REQ_IDETAPE_PC2) {
-		idetape_media_access_finished(drive);
-		return ide_stopped;
-	}
-	BUG();
-
-out:
-	/* prepare sense request for this command */
-	ide_prep_sense(drive, rq);
-
-	memset(&cmd, 0, sizeof(cmd));
-
-	if (rq_data_dir(rq))
-		cmd.tf_flags |= IDE_TFLAG_WRITE;
-
-	cmd.rq = rq;
-
-	ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
-	ide_map_sg(drive, &cmd);
-
-	return ide_tape_issue_pc(drive, &cmd, pc);
-}
-
-/*
- * Write a filemark if write_filemark=1. Flush the device buffers without
- * writing a filemark otherwise.
- */
-static void idetape_create_write_filemark_cmd(ide_drive_t *drive,
-		struct ide_atapi_pc *pc, int write_filemark)
-{
-	ide_init_pc(pc);
-	pc->c[0] = WRITE_FILEMARKS;
-	pc->c[4] = write_filemark;
-	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
-}
-
-static int idetape_wait_ready(ide_drive_t *drive, unsigned long timeout)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct gendisk *disk = tape->disk;
-	int load_attempted = 0;
-
-	/* Wait for the tape to become ready */
-	set_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT), &drive->atapi_flags);
-	timeout += jiffies;
-	while (time_before(jiffies, timeout)) {
-		if (ide_do_test_unit_ready(drive, disk) == 0)
-			return 0;
-		if ((tape->sense_key == 2 && tape->asc == 4 && tape->ascq == 2)
-		    || (tape->asc == 0x3A)) {
-			/* no media */
-			if (load_attempted)
-				return -ENOMEDIUM;
-			ide_do_start_stop(drive, disk, IDETAPE_LU_LOAD_MASK);
-			load_attempted = 1;
-		/* not about to be ready */
-		} else if (!(tape->sense_key == 2 && tape->asc == 4 &&
-			     (tape->ascq == 1 || tape->ascq == 8)))
-			return -EIO;
-		msleep(100);
-	}
-	return -EIO;
-}
-
-static int idetape_flush_tape_buffers(ide_drive_t *drive)
-{
-	struct ide_tape_obj *tape = drive->driver_data;
-	struct ide_atapi_pc pc;
-	int rc;
-
-	idetape_create_write_filemark_cmd(drive, &pc, 0);
-	rc = ide_queue_pc_tail(drive, tape->disk, &pc, NULL, 0);
-	if (rc)
-		return rc;
-	idetape_wait_ready(drive, 60 * 5 * HZ);
-	return 0;
-}
-
-static int ide_tape_read_position(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc pc;
-	u8 buf[20];
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	/* prep cmd */
-	ide_init_pc(&pc);
-	pc.c[0] = READ_POSITION;
-	pc.req_xfer = 20;
-
-	if (ide_queue_pc_tail(drive, tape->disk, &pc, buf, pc.req_xfer))
-		return -1;
-
-	if (!pc.error) {
-		ide_debug_log(IDE_DBG_FUNC, "BOP - %s",
-				(buf[0] & 0x80) ? "Yes" : "No");
-		ide_debug_log(IDE_DBG_FUNC, "EOP - %s",
-				(buf[0] & 0x40) ? "Yes" : "No");
-
-		if (buf[0] & 0x4) {
-			printk(KERN_INFO "ide-tape: Block location is unknown"
-					 "to the tape\n");
-			clear_bit(ilog2(IDE_AFLAG_ADDRESS_VALID),
-				  &drive->atapi_flags);
-			return -1;
-		} else {
-			ide_debug_log(IDE_DBG_FUNC, "Block Location: %u",
-				      be32_to_cpup((__be32 *)&buf[4]));
-
-			tape->partition = buf[1];
-			tape->first_frame = be32_to_cpup((__be32 *)&buf[4]);
-			set_bit(ilog2(IDE_AFLAG_ADDRESS_VALID),
-				&drive->atapi_flags);
-		}
-	}
-
-	return tape->first_frame;
-}
-
-static void idetape_create_locate_cmd(ide_drive_t *drive,
-		struct ide_atapi_pc *pc,
-		unsigned int block, u8 partition, int skip)
-{
-	ide_init_pc(pc);
-	pc->c[0] = POSITION_TO_ELEMENT;
-	pc->c[1] = 2;
-	put_unaligned(cpu_to_be32(block), (unsigned int *) &pc->c[3]);
-	pc->c[8] = partition;
-	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
-}
-
-static void __ide_tape_discard_merge_buffer(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	if (tape->chrdev_dir != IDETAPE_DIR_READ)
-		return;
-
-	clear_bit(ilog2(IDE_AFLAG_FILEMARK), &drive->atapi_flags);
-	tape->valid = 0;
-	if (tape->buf != NULL) {
-		kfree(tape->buf);
-		tape->buf = NULL;
-	}
-
-	tape->chrdev_dir = IDETAPE_DIR_NONE;
-}
-
-/*
- * Position the tape to the requested block using the LOCATE packet command.
- * A READ POSITION command is then issued to check where we are positioned. Like
- * all higher level operations, we queue the commands at the tail of the request
- * queue and wait for their completion.
- */
-static int idetape_position_tape(ide_drive_t *drive, unsigned int block,
-		u8 partition, int skip)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct gendisk *disk = tape->disk;
-	int ret;
-	struct ide_atapi_pc pc;
-
-	if (tape->chrdev_dir == IDETAPE_DIR_READ)
-		__ide_tape_discard_merge_buffer(drive);
-	idetape_wait_ready(drive, 60 * 5 * HZ);
-	idetape_create_locate_cmd(drive, &pc, block, partition, skip);
-	ret = ide_queue_pc_tail(drive, disk, &pc, NULL, 0);
-	if (ret)
-		return ret;
-
-	ret = ide_tape_read_position(drive);
-	if (ret < 0)
-		return ret;
-	return 0;
-}
-
-static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
-					  int restore_position)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	int seek, position;
-
-	__ide_tape_discard_merge_buffer(drive);
-	if (restore_position) {
-		position = ide_tape_read_position(drive);
-		seek = position > 0 ? position : 0;
-		if (idetape_position_tape(drive, seek, 0, 0)) {
-			printk(KERN_INFO "ide-tape: %s: position_tape failed in"
-					 " %s\n", tape->name, __func__);
-			return;
-		}
-	}
-}
-
-/*
- * Generate a read/write request for the block device interface and wait for it
- * to be serviced.
- */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct request *rq;
-	int ret;
-
-	ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, size: %d", cmd, size);
-
-	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
-	BUG_ON(size < 0 || size % tape->blk_size);
-
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
-	ide_req(rq)->type = ATA_PRIV_MISC;
-	scsi_req(rq)->cmd[13] = cmd;
-	rq->rq_disk = tape->disk;
-	rq->__sector = tape->first_frame;
-
-	if (size) {
-		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
-				      GFP_NOIO);
-		if (ret)
-			goto out_put;
-	}
-
-	blk_execute_rq(tape->disk, rq, 0);
-
-	/* calculate the number of transferred bytes and update buffer state */
-	size -= scsi_req(rq)->resid_len;
-	tape->cur = tape->buf;
-	if (cmd == REQ_IDETAPE_READ)
-		tape->valid = size;
-	else
-		tape->valid = 0;
-
-	ret = size;
-	if (scsi_req(rq)->result == IDE_DRV_ERROR_GENERAL)
-		ret = -EIO;
-out_put:
-	blk_put_request(rq);
-	return ret;
-}
-
-static void idetape_create_inquiry_cmd(struct ide_atapi_pc *pc)
-{
-	ide_init_pc(pc);
-	pc->c[0] = INQUIRY;
-	pc->c[4] = 254;
-	pc->req_xfer = 254;
-}
-
-static void idetape_create_rewind_cmd(ide_drive_t *drive,
-		struct ide_atapi_pc *pc)
-{
-	ide_init_pc(pc);
-	pc->c[0] = REZERO_UNIT;
-	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
-}
-
-static void idetape_create_erase_cmd(struct ide_atapi_pc *pc)
-{
-	ide_init_pc(pc);
-	pc->c[0] = ERASE;
-	pc->c[1] = 1;
-	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
-}
-
-static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
-{
-	ide_init_pc(pc);
-	pc->c[0] = SPACE;
-	put_unaligned(cpu_to_be32(count), (unsigned int *) &pc->c[1]);
-	pc->c[1] = cmd;
-	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
-}
-
-static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
-		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
-				" but we are not writing.\n");
-		return;
-	}
-	if (tape->buf) {
-		size_t aligned = roundup(tape->valid, tape->blk_size);
-
-		memset(tape->cur, 0, aligned - tape->valid);
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, aligned);
-		kfree(tape->buf);
-		tape->buf = NULL;
-	}
-	tape->chrdev_dir = IDETAPE_DIR_NONE;
-}
-
-static int idetape_init_rw(ide_drive_t *drive, int dir)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	int rc;
-
-	BUG_ON(dir != IDETAPE_DIR_READ && dir != IDETAPE_DIR_WRITE);
-
-	if (tape->chrdev_dir == dir)
-		return 0;
-
-	if (tape->chrdev_dir == IDETAPE_DIR_READ)
-		ide_tape_discard_merge_buffer(drive, 1);
-	else if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
-		ide_tape_flush_merge_buffer(drive);
-		idetape_flush_tape_buffers(drive);
-	}
-
-	if (tape->buf || tape->valid) {
-		printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-		tape->valid = 0;
-	}
-
-	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-	if (!tape->buf)
-		return -ENOMEM;
-	tape->chrdev_dir = dir;
-	tape->cur = tape->buf;
-
-	/*
-	 * Issue a 0 rw command to ensure that DSC handshake is
-	 * switched from completion mode to buffer available mode.  No
-	 * point in issuing this if DSC overlap isn't supported, some
-	 * drives (Seagate STT3401A) will return an error.
-	 */
-	if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-		int cmd = dir == IDETAPE_DIR_READ ? REQ_IDETAPE_READ
-						  : REQ_IDETAPE_WRITE;
-
-		rc = idetape_queue_rw_tail(drive, cmd, 0);
-		if (rc < 0) {
-			kfree(tape->buf);
-			tape->buf = NULL;
-			tape->chrdev_dir = IDETAPE_DIR_NONE;
-			return rc;
-		}
-	}
-
-	return 0;
-}
-
-static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	memset(tape->buf, 0, tape->buffer_size);
-
-	while (bcount) {
-		unsigned int count = min(tape->buffer_size, bcount);
-
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, count);
-		bcount -= count;
-	}
-}
-
-/*
- * Rewinds the tape to the Beginning Of the current Partition (BOP). We
- * currently support only one partition.
- */
-static int idetape_rewind_tape(ide_drive_t *drive)
-{
-	struct ide_tape_obj *tape = drive->driver_data;
-	struct gendisk *disk = tape->disk;
-	struct ide_atapi_pc pc;
-	int ret;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	idetape_create_rewind_cmd(drive, &pc);
-	ret = ide_queue_pc_tail(drive, disk, &pc, NULL, 0);
-	if (ret)
-		return ret;
-
-	ret = ide_tape_read_position(drive);
-	if (ret < 0)
-		return ret;
-	return 0;
-}
-
-/* mtio.h compatible commands should be issued to the chrdev interface. */
-static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd,
-				unsigned long arg)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	void __user *argp = (void __user *)arg;
-
-	struct idetape_config {
-		int dsc_rw_frequency;
-		int dsc_media_access_frequency;
-		int nr_stages;
-	} config;
-
-	ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%04x", cmd);
-
-	switch (cmd) {
-	case 0x0340:
-		if (copy_from_user(&config, argp, sizeof(config)))
-			return -EFAULT;
-		tape->best_dsc_rw_freq = config.dsc_rw_frequency;
-		break;
-	case 0x0350:
-		memset(&config, 0, sizeof(config));
-		config.dsc_rw_frequency = (int) tape->best_dsc_rw_freq;
-		config.nr_stages = 1;
-		if (copy_to_user(argp, &config, sizeof(config)))
-			return -EFAULT;
-		break;
-	default:
-		return -EIO;
-	}
-	return 0;
-}
-
-static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op,
-					int mt_count)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct gendisk *disk = tape->disk;
-	struct ide_atapi_pc pc;
-	int retval, count = 0;
-	int sprev = !!(tape->caps[4] & 0x20);
-
-
-	ide_debug_log(IDE_DBG_FUNC, "mt_op: %d, mt_count: %d", mt_op, mt_count);
-
-	if (mt_count == 0)
-		return 0;
-	if (MTBSF == mt_op || MTBSFM == mt_op) {
-		if (!sprev)
-			return -EIO;
-		mt_count = -mt_count;
-	}
-
-	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
-		tape->valid = 0;
-		if (test_and_clear_bit(ilog2(IDE_AFLAG_FILEMARK),
-				       &drive->atapi_flags))
-			++count;
-		ide_tape_discard_merge_buffer(drive, 0);
-	}
-
-	switch (mt_op) {
-	case MTFSF:
-	case MTBSF:
-		idetape_create_space_cmd(&pc, mt_count - count,
-					 IDETAPE_SPACE_OVER_FILEMARK);
-		return ide_queue_pc_tail(drive, disk, &pc, NULL, 0);
-	case MTFSFM:
-	case MTBSFM:
-		if (!sprev)
-			return -EIO;
-		retval = idetape_space_over_filemarks(drive, MTFSF,
-						      mt_count - count);
-		if (retval)
-			return retval;
-		count = (MTBSFM == mt_op ? 1 : -1);
-		return idetape_space_over_filemarks(drive, MTFSF, count);
-	default:
-		printk(KERN_ERR "ide-tape: MTIO operation %d not supported\n",
-				mt_op);
-		return -EIO;
-	}
-}
-
-/*
- * Our character device read / write functions.
- *
- * The tape is optimized to maximize throughput when it is transferring an
- * integral number of the "continuous transfer limit", which is a parameter of
- * the specific tape (26kB on my particular tape, 32kB for Onstream).
- *
- * As of version 1.3 of the driver, the character device provides an abstract
- * continuous view of the media - any mix of block sizes (even 1 byte) on the
- * same backup/restore procedure is supported. The driver will internally
- * convert the requests to the recommended transfer unit, so that an unmatch
- * between the user's block size to the recommended size will only result in a
- * (slightly) increased driver overhead, but will no longer hit performance.
- * This is not applicable to Onstream.
- */
-static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
-				   size_t count, loff_t *ppos)
-{
-	struct ide_tape_obj *tape = file->private_data;
-	ide_drive_t *drive = tape->drive;
-	size_t done = 0;
-	ssize_t ret = 0;
-	int rc;
-
-	ide_debug_log(IDE_DBG_FUNC, "count %zd", count);
-
-	if (tape->chrdev_dir != IDETAPE_DIR_READ) {
-		if (test_bit(ilog2(IDE_AFLAG_DETECT_BS), &drive->atapi_flags))
-			if (count > tape->blk_size &&
-			    (count % tape->blk_size) == 0)
-				tape->user_bs_factor = count / tape->blk_size;
-	}
-
-	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
-	if (rc < 0)
-		return rc;
-
-	while (done < count) {
-		size_t todo;
-
-		/* refill if staging buffer is empty */
-		if (!tape->valid) {
-			/* If we are at a filemark, nothing more to read */
-			if (test_bit(ilog2(IDE_AFLAG_FILEMARK),
-				     &drive->atapi_flags))
-				break;
-			/* read */
-			if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ,
-						  tape->buffer_size) <= 0)
-				break;
-		}
-
-		/* copy out */
-		todo = min_t(size_t, count - done, tape->valid);
-		if (copy_to_user(buf + done, tape->cur, todo))
-			ret = -EFAULT;
-
-		tape->cur += todo;
-		tape->valid -= todo;
-		done += todo;
-	}
-
-	if (!done && test_bit(ilog2(IDE_AFLAG_FILEMARK), &drive->atapi_flags)) {
-		idetape_space_over_filemarks(drive, MTFSF, 1);
-		return 0;
-	}
-
-	return ret ? ret : done;
-}
-
-static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
-				     size_t count, loff_t *ppos)
-{
-	struct ide_tape_obj *tape = file->private_data;
-	ide_drive_t *drive = tape->drive;
-	size_t done = 0;
-	ssize_t ret = 0;
-	int rc;
-
-	/* The drive is write protected. */
-	if (tape->write_prot)
-		return -EACCES;
-
-	ide_debug_log(IDE_DBG_FUNC, "count %zd", count);
-
-	/* Initialize write operation */
-	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
-	if (rc < 0)
-		return rc;
-
-	while (done < count) {
-		size_t todo;
-
-		/* flush if staging buffer is full */
-		if (tape->valid == tape->buffer_size &&
-		    idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-					  tape->buffer_size) <= 0)
-			return rc;
-
-		/* copy in */
-		todo = min_t(size_t, count - done,
-			     tape->buffer_size - tape->valid);
-		if (copy_from_user(tape->cur, buf + done, todo))
-			ret = -EFAULT;
-
-		tape->cur += todo;
-		tape->valid += todo;
-		done += todo;
-	}
-
-	return ret ? ret : done;
-}
-
-static int idetape_write_filemark(ide_drive_t *drive)
-{
-	struct ide_tape_obj *tape = drive->driver_data;
-	struct ide_atapi_pc pc;
-
-	/* Write a filemark */
-	idetape_create_write_filemark_cmd(drive, &pc, 1);
-	if (ide_queue_pc_tail(drive, tape->disk, &pc, NULL, 0)) {
-		printk(KERN_ERR "ide-tape: Couldn't write a filemark\n");
-		return -EIO;
-	}
-	return 0;
-}
-
-/*
- * Called from idetape_chrdev_ioctl when the general mtio MTIOCTOP ioctl is
- * requested.
- *
- * Note: MTBSF and MTBSFM are not supported when the tape doesn't support
- * spacing over filemarks in the reverse direction. In this case, MTFSFM is also
- * usually not supported.
- *
- * The following commands are currently not supported:
- *
- * MTFSS, MTBSS, MTWSM, MTSETDENSITY, MTSETDRVBUFFER, MT_ST_BOOLEANS,
- * MT_ST_WRITE_THRESHOLD.
- */
-static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct gendisk *disk = tape->disk;
-	struct ide_atapi_pc pc;
-	int i, retval;
-
-	ide_debug_log(IDE_DBG_FUNC, "MTIOCTOP ioctl: mt_op: %d, mt_count: %d",
-		      mt_op, mt_count);
-
-	switch (mt_op) {
-	case MTFSF:
-	case MTFSFM:
-	case MTBSF:
-	case MTBSFM:
-		if (!mt_count)
-			return 0;
-		return idetape_space_over_filemarks(drive, mt_op, mt_count);
-	default:
-		break;
-	}
-
-	switch (mt_op) {
-	case MTWEOF:
-		if (tape->write_prot)
-			return -EACCES;
-		ide_tape_discard_merge_buffer(drive, 1);
-		for (i = 0; i < mt_count; i++) {
-			retval = idetape_write_filemark(drive);
-			if (retval)
-				return retval;
-		}
-		return 0;
-	case MTREW:
-		ide_tape_discard_merge_buffer(drive, 0);
-		if (idetape_rewind_tape(drive))
-			return -EIO;
-		return 0;
-	case MTLOAD:
-		ide_tape_discard_merge_buffer(drive, 0);
-		return ide_do_start_stop(drive, disk, IDETAPE_LU_LOAD_MASK);
-	case MTUNLOAD:
-	case MTOFFL:
-		/*
-		 * If door is locked, attempt to unlock before
-		 * attempting to eject.
-		 */
-		if (tape->door_locked) {
-			if (!ide_set_media_lock(drive, disk, 0))
-				tape->door_locked = DOOR_UNLOCKED;
-		}
-		ide_tape_discard_merge_buffer(drive, 0);
-		retval = ide_do_start_stop(drive, disk, !IDETAPE_LU_LOAD_MASK);
-		if (!retval)
-			clear_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT),
-				  &drive->atapi_flags);
-		return retval;
-	case MTNOP:
-		ide_tape_discard_merge_buffer(drive, 0);
-		return idetape_flush_tape_buffers(drive);
-	case MTRETEN:
-		ide_tape_discard_merge_buffer(drive, 0);
-		return ide_do_start_stop(drive, disk,
-			IDETAPE_LU_RETENSION_MASK | IDETAPE_LU_LOAD_MASK);
-	case MTEOM:
-		idetape_create_space_cmd(&pc, 0, IDETAPE_SPACE_TO_EOD);
-		return ide_queue_pc_tail(drive, disk, &pc, NULL, 0);
-	case MTERASE:
-		(void)idetape_rewind_tape(drive);
-		idetape_create_erase_cmd(&pc);
-		return ide_queue_pc_tail(drive, disk, &pc, NULL, 0);
-	case MTSETBLK:
-		if (mt_count) {
-			if (mt_count < tape->blk_size ||
-			    mt_count % tape->blk_size)
-				return -EIO;
-			tape->user_bs_factor = mt_count / tape->blk_size;
-			clear_bit(ilog2(IDE_AFLAG_DETECT_BS),
-				  &drive->atapi_flags);
-		} else
-			set_bit(ilog2(IDE_AFLAG_DETECT_BS),
-				&drive->atapi_flags);
-		return 0;
-	case MTSEEK:
-		ide_tape_discard_merge_buffer(drive, 0);
-		return idetape_position_tape(drive,
-			mt_count * tape->user_bs_factor, tape->partition, 0);
-	case MTSETPART:
-		ide_tape_discard_merge_buffer(drive, 0);
-		return idetape_position_tape(drive, 0, mt_count, 0);
-	case MTFSR:
-	case MTBSR:
-	case MTLOCK:
-		retval = ide_set_media_lock(drive, disk, 1);
-		if (retval)
-			return retval;
-		tape->door_locked = DOOR_EXPLICITLY_LOCKED;
-		return 0;
-	case MTUNLOCK:
-		retval = ide_set_media_lock(drive, disk, 0);
-		if (retval)
-			return retval;
-		tape->door_locked = DOOR_UNLOCKED;
-		return 0;
-	default:
-		printk(KERN_ERR "ide-tape: MTIO operation %d not supported\n",
-				mt_op);
-		return -EIO;
-	}
-}
-
-/*
- * Our character device ioctls. General mtio.h magnetic io commands are
- * supported here, and not in the corresponding block interface. Our own
- * ide-tape ioctls are supported on both interfaces.
- */
-static long do_idetape_chrdev_ioctl(struct file *file,
-				unsigned int cmd, unsigned long arg)
-{
-	struct ide_tape_obj *tape = file->private_data;
-	ide_drive_t *drive = tape->drive;
-	struct mtop mtop;
-	struct mtget mtget;
-	struct mtpos mtpos;
-	int block_offset = 0, position = tape->first_frame;
-	void __user *argp = (void __user *)arg;
-
-	ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x", cmd);
-
-	if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
-		ide_tape_flush_merge_buffer(drive);
-		idetape_flush_tape_buffers(drive);
-	}
-	if (cmd == MTIOCGET || cmd == MTIOCPOS) {
-		block_offset = tape->valid /
-			(tape->blk_size * tape->user_bs_factor);
-		position = ide_tape_read_position(drive);
-		if (position < 0)
-			return -EIO;
-	}
-	switch (cmd) {
-	case MTIOCTOP:
-		if (copy_from_user(&mtop, argp, sizeof(struct mtop)))
-			return -EFAULT;
-		return idetape_mtioctop(drive, mtop.mt_op, mtop.mt_count);
-	case MTIOCGET:
-		memset(&mtget, 0, sizeof(struct mtget));
-		mtget.mt_type = MT_ISSCSI2;
-		mtget.mt_blkno = position / tape->user_bs_factor - block_offset;
-		mtget.mt_dsreg =
-			((tape->blk_size * tape->user_bs_factor)
-			 << MT_ST_BLKSIZE_SHIFT) & MT_ST_BLKSIZE_MASK;
-
-		if (tape->drv_write_prot)
-			mtget.mt_gstat |= GMT_WR_PROT(0xffffffff);
-
-		return put_user_mtget(argp, &mtget);
-	case MTIOCPOS:
-		mtpos.mt_blkno = position / tape->user_bs_factor - block_offset;
-		return put_user_mtpos(argp, &mtpos);
-	default:
-		if (tape->chrdev_dir == IDETAPE_DIR_READ)
-			ide_tape_discard_merge_buffer(drive, 1);
-		return idetape_blkdev_ioctl(drive, cmd, arg);
-	}
-}
-
-static long idetape_chrdev_ioctl(struct file *file,
-				unsigned int cmd, unsigned long arg)
-{
-	long ret;
-	mutex_lock(&ide_tape_mutex);
-	ret = do_idetape_chrdev_ioctl(file, cmd, arg);
-	mutex_unlock(&ide_tape_mutex);
-	return ret;
-}
-
-static long idetape_chrdev_compat_ioctl(struct file *file,
-				unsigned int cmd, unsigned long arg)
-{
-	long ret;
-
-	if (cmd == MTIOCPOS32)
-		cmd = MTIOCPOS;
-	else if (cmd == MTIOCGET32)
-		cmd = MTIOCGET;
-
-	mutex_lock(&ide_tape_mutex);
-	ret = do_idetape_chrdev_ioctl(file, cmd, arg);
-	mutex_unlock(&ide_tape_mutex);
-	return ret;
-}
-
-/*
- * Do a mode sense page 0 with block descriptor and if it succeeds set the tape
- * block size with the reported value.
- */
-static void ide_tape_get_bsize_from_bdesc(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc pc;
-	u8 buf[12];
-
-	idetape_create_mode_sense_cmd(&pc, IDETAPE_BLOCK_DESCRIPTOR);
-	if (ide_queue_pc_tail(drive, tape->disk, &pc, buf, pc.req_xfer)) {
-		printk(KERN_ERR "ide-tape: Can't get block descriptor\n");
-		if (tape->blk_size == 0) {
-			printk(KERN_WARNING "ide-tape: Cannot deal with zero "
-					    "block size, assuming 32k\n");
-			tape->blk_size = 32768;
-		}
-		return;
-	}
-	tape->blk_size = (buf[4 + 5] << 16) +
-				(buf[4 + 6] << 8)  +
-				 buf[4 + 7];
-	tape->drv_write_prot = (buf[2] & 0x80) >> 7;
-
-	ide_debug_log(IDE_DBG_FUNC, "blk_size: %d, write_prot: %d",
-		      tape->blk_size, tape->drv_write_prot);
-}
-
-static int idetape_chrdev_open(struct inode *inode, struct file *filp)
-{
-	unsigned int minor = iminor(inode), i = minor & ~0xc0;
-	ide_drive_t *drive;
-	idetape_tape_t *tape;
-	int retval;
-
-	if (i >= MAX_HWIFS * MAX_DRIVES)
-		return -ENXIO;
-
-	mutex_lock(&idetape_chrdev_mutex);
-
-	tape = ide_tape_get(NULL, true, i);
-	if (!tape) {
-		mutex_unlock(&idetape_chrdev_mutex);
-		return -ENXIO;
-	}
-
-	drive = tape->drive;
-	filp->private_data = tape;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	/*
-	 * We really want to do nonseekable_open(inode, filp); here, but some
-	 * versions of tar incorrectly call lseek on tapes and bail out if that
-	 * fails.  So we disallow pread() and pwrite(), but permit lseeks.
-	 */
-	filp->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
-
-
-	if (test_and_set_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags)) {
-		retval = -EBUSY;
-		goto out_put_tape;
-	}
-
-	retval = idetape_wait_ready(drive, 60 * HZ);
-	if (retval) {
-		clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags);
-		printk(KERN_ERR "ide-tape: %s: drive not ready\n", tape->name);
-		goto out_put_tape;
-	}
-
-	ide_tape_read_position(drive);
-	if (!test_bit(ilog2(IDE_AFLAG_ADDRESS_VALID), &drive->atapi_flags))
-		(void)idetape_rewind_tape(drive);
-
-	/* Read block size and write protect status from drive. */
-	ide_tape_get_bsize_from_bdesc(drive);
-
-	/* Set write protect flag if device is opened as read-only. */
-	if ((filp->f_flags & O_ACCMODE) == O_RDONLY)
-		tape->write_prot = 1;
-	else
-		tape->write_prot = tape->drv_write_prot;
-
-	/* Make sure drive isn't write protected if user wants to write. */
-	if (tape->write_prot) {
-		if ((filp->f_flags & O_ACCMODE) == O_WRONLY ||
-		    (filp->f_flags & O_ACCMODE) == O_RDWR) {
-			clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags);
-			retval = -EROFS;
-			goto out_put_tape;
-		}
-	}
-
-	/* Lock the tape drive door so user can't eject. */
-	if (tape->chrdev_dir == IDETAPE_DIR_NONE) {
-		if (!ide_set_media_lock(drive, tape->disk, 1)) {
-			if (tape->door_locked != DOOR_EXPLICITLY_LOCKED)
-				tape->door_locked = DOOR_LOCKED;
-		}
-	}
-	mutex_unlock(&idetape_chrdev_mutex);
-
-	return 0;
-
-out_put_tape:
-	ide_tape_put(tape);
-
-	mutex_unlock(&idetape_chrdev_mutex);
-
-	return retval;
-}
-
-static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	ide_tape_flush_merge_buffer(drive);
-	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-	if (tape->buf != NULL) {
-		idetape_pad_zeros(drive, tape->blk_size *
-				(tape->user_bs_factor - 1));
-		kfree(tape->buf);
-		tape->buf = NULL;
-	}
-	idetape_write_filemark(drive);
-	idetape_flush_tape_buffers(drive);
-	idetape_flush_tape_buffers(drive);
-}
-
-static int idetape_chrdev_release(struct inode *inode, struct file *filp)
-{
-	struct ide_tape_obj *tape = filp->private_data;
-	ide_drive_t *drive = tape->drive;
-	unsigned int minor = iminor(inode);
-
-	mutex_lock(&idetape_chrdev_mutex);
-
-	tape = drive->driver_data;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
-		idetape_write_release(drive, minor);
-	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
-		if (minor < 128)
-			ide_tape_discard_merge_buffer(drive, 1);
-	}
-
-	if (minor < 128 && test_bit(ilog2(IDE_AFLAG_MEDIUM_PRESENT),
-				    &drive->atapi_flags))
-		(void) idetape_rewind_tape(drive);
-
-	if (tape->chrdev_dir == IDETAPE_DIR_NONE) {
-		if (tape->door_locked == DOOR_LOCKED) {
-			if (!ide_set_media_lock(drive, tape->disk, 0))
-				tape->door_locked = DOOR_UNLOCKED;
-		}
-	}
-	clear_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags);
-	ide_tape_put(tape);
-
-	mutex_unlock(&idetape_chrdev_mutex);
-
-	return 0;
-}
-
-static void idetape_get_inquiry_results(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc pc;
-	u8 pc_buf[256];
-	char fw_rev[4], vendor_id[8], product_id[16];
-
-	idetape_create_inquiry_cmd(&pc);
-	if (ide_queue_pc_tail(drive, tape->disk, &pc, pc_buf, pc.req_xfer)) {
-		printk(KERN_ERR "ide-tape: %s: can't get INQUIRY results\n",
-				tape->name);
-		return;
-	}
-	memcpy(vendor_id, &pc_buf[8], 8);
-	memcpy(product_id, &pc_buf[16], 16);
-	memcpy(fw_rev, &pc_buf[32], 4);
-
-	ide_fixstring(vendor_id, 8, 0);
-	ide_fixstring(product_id, 16, 0);
-	ide_fixstring(fw_rev, 4, 0);
-
-	printk(KERN_INFO "ide-tape: %s <-> %s: %.8s %.16s rev %.4s\n",
-			drive->name, tape->name, vendor_id, product_id, fw_rev);
-}
-
-/*
- * Ask the tape about its various parameters. In particular, we will adjust our
- * data transfer buffer	size to the recommended value as returned by the tape.
- */
-static void idetape_get_mode_sense_results(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-	struct ide_atapi_pc pc;
-	u8 buf[24], *caps;
-	u8 speed, max_speed;
-
-	idetape_create_mode_sense_cmd(&pc, IDETAPE_CAPABILITIES_PAGE);
-	if (ide_queue_pc_tail(drive, tape->disk, &pc, buf, pc.req_xfer)) {
-		printk(KERN_ERR "ide-tape: Can't get tape parameters - assuming"
-				" some default values\n");
-		tape->blk_size = 512;
-		put_unaligned(52,   (u16 *)&tape->caps[12]);
-		put_unaligned(540,  (u16 *)&tape->caps[14]);
-		put_unaligned(6*52, (u16 *)&tape->caps[16]);
-		return;
-	}
-	caps = buf + 4 + buf[3];
-
-	/* convert to host order and save for later use */
-	speed = be16_to_cpup((__be16 *)&caps[14]);
-	max_speed = be16_to_cpup((__be16 *)&caps[8]);
-
-	*(u16 *)&caps[8] = max_speed;
-	*(u16 *)&caps[12] = be16_to_cpup((__be16 *)&caps[12]);
-	*(u16 *)&caps[14] = speed;
-	*(u16 *)&caps[16] = be16_to_cpup((__be16 *)&caps[16]);
-
-	if (!speed) {
-		printk(KERN_INFO "ide-tape: %s: invalid tape speed "
-				"(assuming 650KB/sec)\n", drive->name);
-		*(u16 *)&caps[14] = 650;
-	}
-	if (!max_speed) {
-		printk(KERN_INFO "ide-tape: %s: invalid max_speed "
-				"(assuming 650KB/sec)\n", drive->name);
-		*(u16 *)&caps[8] = 650;
-	}
-
-	memcpy(&tape->caps, caps, 20);
-
-	/* device lacks locking support according to capabilities page */
-	if ((caps[6] & 1) == 0)
-		drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
-
-	if (caps[7] & 0x02)
-		tape->blk_size = 512;
-	else if (caps[7] & 0x04)
-		tape->blk_size = 1024;
-}
-
-#ifdef CONFIG_IDE_PROC_FS
-#define ide_tape_devset_get(name, field) \
-static int get_##name(ide_drive_t *drive) \
-{ \
-	idetape_tape_t *tape = drive->driver_data; \
-	return tape->field; \
-}
-
-#define ide_tape_devset_set(name, field) \
-static int set_##name(ide_drive_t *drive, int arg) \
-{ \
-	idetape_tape_t *tape = drive->driver_data; \
-	tape->field = arg; \
-	return 0; \
-}
-
-#define ide_tape_devset_rw_field(_name, _field) \
-ide_tape_devset_get(_name, _field) \
-ide_tape_devset_set(_name, _field) \
-IDE_DEVSET(_name, DS_SYNC, get_##_name, set_##_name)
-
-#define ide_tape_devset_r_field(_name, _field) \
-ide_tape_devset_get(_name, _field) \
-IDE_DEVSET(_name, 0, get_##_name, NULL)
-
-static int mulf_tdsc(ide_drive_t *drive)	{ return 1000; }
-static int divf_tdsc(ide_drive_t *drive)	{ return   HZ; }
-static int divf_buffer(ide_drive_t *drive)	{ return    2; }
-static int divf_buffer_size(ide_drive_t *drive)	{ return 1024; }
-
-ide_devset_rw_flag(dsc_overlap, IDE_DFLAG_DSC_OVERLAP);
-
-ide_tape_devset_rw_field(tdsc, best_dsc_rw_freq);
-
-ide_tape_devset_r_field(avg_speed, avg_speed);
-ide_tape_devset_r_field(speed, caps[14]);
-ide_tape_devset_r_field(buffer, caps[16]);
-ide_tape_devset_r_field(buffer_size, buffer_size);
-
-static const struct ide_proc_devset idetape_settings[] = {
-	__IDE_PROC_DEVSET(avg_speed,	0, 0xffff, NULL, NULL),
-	__IDE_PROC_DEVSET(buffer,	0, 0xffff, NULL, divf_buffer),
-	__IDE_PROC_DEVSET(buffer_size,	0, 0xffff, NULL, divf_buffer_size),
-	__IDE_PROC_DEVSET(dsc_overlap,	0,      1, NULL, NULL),
-	__IDE_PROC_DEVSET(speed,	0, 0xffff, NULL, NULL),
-	__IDE_PROC_DEVSET(tdsc,		IDETAPE_DSC_RW_MIN, IDETAPE_DSC_RW_MAX,
-					mulf_tdsc, divf_tdsc),
-	{ NULL },
-};
-#endif
-
-/*
- * The function below is called to:
- *
- * 1. Initialize our various state variables.
- * 2. Ask the tape for its capabilities.
- * 3. Allocate a buffer which will be used for data transfer. The buffer size
- * is chosen based on the recommendation which we received in step 2.
- *
- * Note that at this point ide.c already assigned us an irq, so that we can
- * queue requests here and wait for their completion.
- */
-static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
-{
-	unsigned long t;
-	int speed;
-	u16 *ctl = (u16 *)&tape->caps[12];
-
-	ide_debug_log(IDE_DBG_FUNC, "minor: %d", minor);
-
-	drive->pc_callback = ide_tape_callback;
-
-	drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP;
-
-	if (drive->hwif->host_flags & IDE_HFLAG_NO_DSC) {
-		printk(KERN_INFO "ide-tape: %s: disabling DSC overlap\n",
-				 tape->name);
-		drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
-	}
-
-	/* Seagate Travan drives do not support DSC overlap. */
-	if (strstr((char *)&drive->id[ATA_ID_PROD], "Seagate STT3401"))
-		drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
-
-	tape->minor = minor;
-	tape->name[0] = 'h';
-	tape->name[1] = 't';
-	tape->name[2] = '0' + minor;
-	tape->chrdev_dir = IDETAPE_DIR_NONE;
-
-	idetape_get_inquiry_results(drive);
-	idetape_get_mode_sense_results(drive);
-	ide_tape_get_bsize_from_bdesc(drive);
-	tape->user_bs_factor = 1;
-	tape->buffer_size = *ctl * tape->blk_size;
-	while (tape->buffer_size > 0xffff) {
-		printk(KERN_NOTICE "ide-tape: decreasing stage size\n");
-		*ctl /= 2;
-		tape->buffer_size = *ctl * tape->blk_size;
-	}
-
-	/* select the "best" DSC read/write polling freq */
-	speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]);
-
-	t = (IDETAPE_FIFO_THRESHOLD * tape->buffer_size * HZ) / (speed * 1000);
-
-	/*
-	 * Ensure that the number we got makes sense; limit it within
-	 * IDETAPE_DSC_RW_MIN and IDETAPE_DSC_RW_MAX.
-	 */
-	tape->best_dsc_rw_freq = clamp_t(unsigned long, t, IDETAPE_DSC_RW_MIN,
-					 IDETAPE_DSC_RW_MAX);
-	printk(KERN_INFO "ide-tape: %s <-> %s: %dKBps, %d*%dkB buffer, "
-		"%ums tDSC%s\n",
-		drive->name, tape->name, *(u16 *)&tape->caps[14],
-		(*(u16 *)&tape->caps[16] * 512) / tape->buffer_size,
-		tape->buffer_size / 1024,
-		jiffies_to_msecs(tape->best_dsc_rw_freq),
-		(drive->dev_flags & IDE_DFLAG_USING_DMA) ? ", DMA" : "");
-
-	ide_proc_register_driver(drive, tape->driver);
-}
-
-static void ide_tape_remove(ide_drive_t *drive)
-{
-	idetape_tape_t *tape = drive->driver_data;
-
-	ide_proc_unregister_driver(drive, tape->driver);
-	device_del(&tape->dev);
-
-	mutex_lock(&idetape_ref_mutex);
-	put_device(&tape->dev);
-	mutex_unlock(&idetape_ref_mutex);
-}
-
-static void ide_tape_release(struct device *dev)
-{
-	struct ide_tape_obj *tape = to_ide_drv(dev, ide_tape_obj);
-	ide_drive_t *drive = tape->drive;
-	struct gendisk *g = tape->disk;
-
-	BUG_ON(tape->valid);
-
-	drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
-	drive->driver_data = NULL;
-	device_destroy(idetape_sysfs_class, MKDEV(IDETAPE_MAJOR, tape->minor));
-	device_destroy(idetape_sysfs_class,
-			MKDEV(IDETAPE_MAJOR, tape->minor + 128));
-	idetape_devs[tape->minor] = NULL;
-	g->private_data = NULL;
-	put_disk(g);
-	kfree(tape);
-}
-
-#ifdef CONFIG_IDE_PROC_FS
-static int idetape_name_proc_show(struct seq_file *m, void *v)
-{
-	ide_drive_t	*drive = (ide_drive_t *) m->private;
-	idetape_tape_t	*tape = drive->driver_data;
-
-	seq_printf(m, "%s\n", tape->name);
-	return 0;
-}
-
-static ide_proc_entry_t idetape_proc[] = {
-	{ "capacity",	S_IFREG|S_IRUGO,	ide_capacity_proc_show	},
-	{ "name",	S_IFREG|S_IRUGO,	idetape_name_proc_show	},
-	{}
-};
-
-static ide_proc_entry_t *ide_tape_proc_entries(ide_drive_t *drive)
-{
-	return idetape_proc;
-}
-
-static const struct ide_proc_devset *ide_tape_proc_devsets(ide_drive_t *drive)
-{
-	return idetape_settings;
-}
-#endif
-
-static int ide_tape_probe(ide_drive_t *);
-
-static struct ide_driver idetape_driver = {
-	.gen_driver = {
-		.owner		= THIS_MODULE,
-		.name		= "ide-tape",
-		.bus		= &ide_bus_type,
-	},
-	.probe			= ide_tape_probe,
-	.remove			= ide_tape_remove,
-	.version		= IDETAPE_VERSION,
-	.do_request		= idetape_do_request,
-#ifdef CONFIG_IDE_PROC_FS
-	.proc_entries		= ide_tape_proc_entries,
-	.proc_devsets		= ide_tape_proc_devsets,
-#endif
-};
-
-/* Our character device supporting functions, passed to register_chrdev. */
-static const struct file_operations idetape_fops = {
-	.owner		= THIS_MODULE,
-	.read		= idetape_chrdev_read,
-	.write		= idetape_chrdev_write,
-	.unlocked_ioctl	= idetape_chrdev_ioctl,
-	.compat_ioctl	= IS_ENABLED(CONFIG_COMPAT) ?
-			  idetape_chrdev_compat_ioctl : NULL,
-	.open		= idetape_chrdev_open,
-	.release	= idetape_chrdev_release,
-	.llseek		= noop_llseek,
-};
-
-static int idetape_open(struct block_device *bdev, fmode_t mode)
-{
-	struct ide_tape_obj *tape;
-
-	mutex_lock(&ide_tape_mutex);
-	tape = ide_tape_get(bdev->bd_disk, false, 0);
-	mutex_unlock(&ide_tape_mutex);
-
-	if (!tape)
-		return -ENXIO;
-
-	return 0;
-}
-
-static void idetape_release(struct gendisk *disk, fmode_t mode)
-{
-	struct ide_tape_obj *tape = ide_drv_g(disk, ide_tape_obj);
-
-	mutex_lock(&ide_tape_mutex);
-	ide_tape_put(tape);
-	mutex_unlock(&ide_tape_mutex);
-}
-
-static int idetape_ioctl(struct block_device *bdev, fmode_t mode,
-			unsigned int cmd, unsigned long arg)
-{
-	struct ide_tape_obj *tape = ide_drv_g(bdev->bd_disk, ide_tape_obj);
-	ide_drive_t *drive = tape->drive;
-	int err;
-
-	mutex_lock(&ide_tape_mutex);
-	err = generic_ide_ioctl(drive, bdev, cmd, arg);
-	if (err == -EINVAL)
-		err = idetape_blkdev_ioctl(drive, cmd, arg);
-	mutex_unlock(&ide_tape_mutex);
-
-	return err;
-}
-
-static int idetape_compat_ioctl(struct block_device *bdev, fmode_t mode,
-				unsigned int cmd, unsigned long arg)
-{
-        if (cmd == 0x0340 || cmd == 0x350)
-		arg = (unsigned long)compat_ptr(arg);
-
-	return idetape_ioctl(bdev, mode, cmd, arg);
-}
-
-static const struct block_device_operations idetape_block_ops = {
-	.owner		= THIS_MODULE,
-	.open		= idetape_open,
-	.release	= idetape_release,
-	.ioctl		= idetape_ioctl,
-	.compat_ioctl	= IS_ENABLED(CONFIG_COMPAT) ?
-				idetape_compat_ioctl : NULL,
-};
-
-static int ide_tape_probe(ide_drive_t *drive)
-{
-	idetape_tape_t *tape;
-	struct gendisk *g;
-	int minor;
-
-	ide_debug_log(IDE_DBG_FUNC, "enter");
-
-	if (!strstr(DRV_NAME, drive->driver_req))
-		goto failed;
-
-	if (drive->media != ide_tape)
-		goto failed;
-
-	if ((drive->dev_flags & IDE_DFLAG_ID_READ) &&
-	    ide_check_atapi_device(drive, DRV_NAME) == 0) {
-		printk(KERN_ERR "ide-tape: %s: not supported by this version of"
-				" the driver\n", drive->name);
-		goto failed;
-	}
-	tape = kzalloc(sizeof(idetape_tape_t), GFP_KERNEL);
-	if (tape == NULL) {
-		printk(KERN_ERR "ide-tape: %s: Can't allocate a tape struct\n",
-				drive->name);
-		goto failed;
-	}
-
-	g = alloc_disk(1 << PARTN_BITS);
-	if (!g)
-		goto out_free_tape;
-
-	ide_init_disk(g, drive);
-
-	tape->dev.parent = &drive->gendev;
-	tape->dev.release = ide_tape_release;
-	dev_set_name(&tape->dev, "%s", dev_name(&drive->gendev));
-
-	if (device_register(&tape->dev))
-		goto out_free_disk;
-
-	tape->drive = drive;
-	tape->driver = &idetape_driver;
-	tape->disk = g;
-
-	g->private_data = &tape->driver;
-
-	drive->driver_data = tape;
-
-	mutex_lock(&idetape_ref_mutex);
-	for (minor = 0; idetape_devs[minor]; minor++)
-		;
-	idetape_devs[minor] = tape;
-	mutex_unlock(&idetape_ref_mutex);
-
-	idetape_setup(drive, tape, minor);
-
-	device_create(idetape_sysfs_class, &drive->gendev,
-		      MKDEV(IDETAPE_MAJOR, minor), NULL, "%s", tape->name);
-	device_create(idetape_sysfs_class, &drive->gendev,
-		      MKDEV(IDETAPE_MAJOR, minor + 128), NULL,
-		      "n%s", tape->name);
-
-	g->fops = &idetape_block_ops;
-
-	return 0;
-
-out_free_disk:
-	put_disk(g);
-out_free_tape:
-	kfree(tape);
-failed:
-	return -ENODEV;
-}
-
-static void __exit idetape_exit(void)
-{
-	driver_unregister(&idetape_driver.gen_driver);
-	class_destroy(idetape_sysfs_class);
-	unregister_chrdev(IDETAPE_MAJOR, "ht");
-}
-
-static int __init idetape_init(void)
-{
-	int error = 1;
-	idetape_sysfs_class = class_create(THIS_MODULE, "ide_tape");
-	if (IS_ERR(idetape_sysfs_class)) {
-		idetape_sysfs_class = NULL;
-		printk(KERN_ERR "Unable to create sysfs class for ide tapes\n");
-		error = -EBUSY;
-		goto out;
-	}
-
-	if (register_chrdev(IDETAPE_MAJOR, "ht", &idetape_fops)) {
-		printk(KERN_ERR "ide-tape: Failed to register chrdev"
-				" interface\n");
-		error = -EBUSY;
-		goto out_free_class;
-	}
-
-	error = driver_register(&idetape_driver.gen_driver);
-	if (error)
-		goto out_free_chrdev;
-
-	return 0;
-
-out_free_chrdev:
-	unregister_chrdev(IDETAPE_MAJOR, "ht");
-out_free_class:
-	class_destroy(idetape_sysfs_class);
-out:
-	return error;
-}
-
-MODULE_ALIAS("ide:*m-tape*");
-module_init(idetape_init);
-module_exit(idetape_exit);
-MODULE_ALIAS_CHARDEV_MAJOR(IDETAPE_MAJOR);
-MODULE_DESCRIPTION("ATAPI Streaming TAPE Driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
deleted file mode 100644
index 6665fc4724b9..000000000000
--- a/drivers/ide/ide-taskfile.c
+++ /dev/null
@@ -1,668 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 2000-2002	   Michael Cornwell <cornwell@acm.org>
- *  Copyright (C) 2000-2002	   Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2001-2002	   Klaus Smolin
- *					IBM Storage Technology Division
- *  Copyright (C) 2003-2004, 2007  Bartlomiej Zolnierkiewicz
- *
- *  The big the bad and the ugly.
- */
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/hdreg.h>
-#include <linux/ide.h>
-#include <linux/nmi.h>
-#include <linux/scatterlist.h>
-#include <linux/uaccess.h>
-
-#include <asm/io.h>
-
-void ide_tf_readback(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-
-	/* Be sure we're looking at the low order bytes */
-	tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
-
-	tp_ops->tf_read(drive, &cmd->tf, cmd->valid.in.tf);
-
-	if (cmd->tf_flags & IDE_TFLAG_LBA48) {
-		tp_ops->write_devctl(hwif, ATA_HOB | ATA_DEVCTL_OBS);
-
-		tp_ops->tf_read(drive, &cmd->hob, cmd->valid.in.hob);
-	}
-}
-
-void ide_tf_dump(const char *s, struct ide_cmd *cmd)
-{
-#ifdef DEBUG
-	printk("%s: tf: feat 0x%02x nsect 0x%02x lbal 0x%02x "
-		"lbam 0x%02x lbah 0x%02x dev 0x%02x cmd 0x%02x\n",
-	       s, cmd->tf.feature, cmd->tf.nsect,
-	       cmd->tf.lbal, cmd->tf.lbam, cmd->tf.lbah,
-	       cmd->tf.device, cmd->tf.command);
-	printk("%s: hob: nsect 0x%02x lbal 0x%02x lbam 0x%02x lbah 0x%02x\n",
-	       s, cmd->hob.nsect, cmd->hob.lbal, cmd->hob.lbam, cmd->hob.lbah);
-#endif
-}
-
-int taskfile_lib_get_identify(ide_drive_t *drive, u8 *buf)
-{
-	struct ide_cmd cmd;
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.tf.nsect = 0x01;
-	if (drive->media == ide_disk)
-		cmd.tf.command = ATA_CMD_ID_ATA;
-	else
-		cmd.tf.command = ATA_CMD_ID_ATAPI;
-	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	cmd.protocol = ATA_PROT_PIO;
-
-	return ide_raw_taskfile(drive, &cmd, buf, 1);
-}
-
-static ide_startstop_t task_no_data_intr(ide_drive_t *);
-static ide_startstop_t pre_task_out_intr(ide_drive_t *, struct ide_cmd *);
-static ide_startstop_t task_pio_intr(ide_drive_t *);
-
-ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_cmd *cmd = &hwif->cmd;
-	struct ide_taskfile *tf = &cmd->tf;
-	ide_handler_t *handler = NULL;
-	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	const struct ide_dma_ops *dma_ops = hwif->dma_ops;
-
-	if (orig_cmd->protocol == ATA_PROT_PIO &&
-	    (orig_cmd->tf_flags & IDE_TFLAG_MULTI_PIO) &&
-	    drive->mult_count == 0) {
-		pr_err("%s: multimode not set!\n", drive->name);
-		return ide_stopped;
-	}
-
-	if (orig_cmd->ftf_flags & IDE_FTFLAG_FLAGGED)
-		orig_cmd->ftf_flags |= IDE_FTFLAG_SET_IN_FLAGS;
-
-	memcpy(cmd, orig_cmd, sizeof(*cmd));
-
-	if ((cmd->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) {
-		ide_tf_dump(drive->name, cmd);
-		tp_ops->write_devctl(hwif, ATA_DEVCTL_OBS);
-
-		if (cmd->ftf_flags & IDE_FTFLAG_OUT_DATA) {
-			u8 data[2] = { cmd->tf.data, cmd->hob.data };
-
-			tp_ops->output_data(drive, cmd, data, 2);
-		}
-
-		if (cmd->valid.out.tf & IDE_VALID_DEVICE) {
-			u8 HIHI = (cmd->tf_flags & IDE_TFLAG_LBA48) ?
-				  0xE0 : 0xEF;
-
-			if (!(cmd->ftf_flags & IDE_FTFLAG_FLAGGED))
-				cmd->tf.device &= HIHI;
-			cmd->tf.device |= drive->select;
-		}
-
-		tp_ops->tf_load(drive, &cmd->hob, cmd->valid.out.hob);
-		tp_ops->tf_load(drive, &cmd->tf,  cmd->valid.out.tf);
-	}
-
-	switch (cmd->protocol) {
-	case ATA_PROT_PIO:
-		if (cmd->tf_flags & IDE_TFLAG_WRITE) {
-			tp_ops->exec_command(hwif, tf->command);
-			ndelay(400);	/* FIXME */
-			return pre_task_out_intr(drive, cmd);
-		}
-		handler = task_pio_intr;
-		fallthrough;
-	case ATA_PROT_NODATA:
-		if (handler == NULL)
-			handler = task_no_data_intr;
-		ide_execute_command(drive, cmd, handler, WAIT_WORSTCASE);
-		return ide_started;
-	case ATA_PROT_DMA:
-		if (ide_dma_prepare(drive, cmd))
-			return ide_stopped;
-		hwif->expiry = dma_ops->dma_timer_expiry;
-		ide_execute_command(drive, cmd, ide_dma_intr, 2 * WAIT_CMD);
-		dma_ops->dma_start(drive);
-		fallthrough;
-	default:
-		return ide_started;
-	}
-}
-EXPORT_SYMBOL_GPL(do_rw_taskfile);
-
-static ide_startstop_t task_no_data_intr(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_cmd *cmd = &hwif->cmd;
-	struct ide_taskfile *tf = &cmd->tf;
-	int custom = (cmd->tf_flags & IDE_TFLAG_CUSTOM_HANDLER) ? 1 : 0;
-	int retries = (custom && tf->command == ATA_CMD_INIT_DEV_PARAMS) ? 5 : 1;
-	u8 stat;
-
-	local_irq_enable_in_hardirq();
-
-	while (1) {
-		stat = hwif->tp_ops->read_status(hwif);
-		if ((stat & ATA_BUSY) == 0 || retries-- == 0)
-			break;
-		udelay(10);
-	};
-
-	if (!OK_STAT(stat, ATA_DRDY, BAD_STAT)) {
-		if (custom && tf->command == ATA_CMD_SET_MULTI) {
-			drive->mult_req = drive->mult_count = 0;
-			drive->special_flags |= IDE_SFLAG_RECALIBRATE;
-			(void)ide_dump_status(drive, __func__, stat);
-			return ide_stopped;
-		} else if (custom && tf->command == ATA_CMD_INIT_DEV_PARAMS) {
-			if ((stat & (ATA_ERR | ATA_DRQ)) == 0) {
-				ide_set_handler(drive, &task_no_data_intr,
-						WAIT_WORSTCASE);
-				return ide_started;
-			}
-		}
-		return ide_error(drive, "task_no_data_intr", stat);
-	}
-
-	if (custom && tf->command == ATA_CMD_SET_MULTI)
-		drive->mult_count = drive->mult_req;
-
-	if (custom == 0 || tf->command == ATA_CMD_IDLEIMMEDIATE ||
-	    tf->command == ATA_CMD_CHK_POWER) {
-		struct request *rq = hwif->rq;
-
-		if (ata_pm_request(rq))
-			ide_complete_pm_rq(drive, rq);
-		else
-			ide_finish_cmd(drive, cmd, stat);
-	}
-
-	return ide_stopped;
-}
-
-static u8 wait_drive_not_busy(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	int retries;
-	u8 stat;
-
-	/*
-	 * Last sector was transferred, wait until device is ready.  This can
-	 * take up to 6 ms on some ATAPI devices, so we will wait max 10 ms.
-	 */
-	for (retries = 0; retries < 1000; retries++) {
-		stat = hwif->tp_ops->read_status(hwif);
-
-		if (stat & ATA_BUSY)
-			udelay(10);
-		else
-			break;
-	}
-
-	if (stat & ATA_BUSY)
-		pr_err("%s: drive still BUSY!\n", drive->name);
-
-	return stat;
-}
-
-void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
-		   unsigned int write, unsigned int len)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct scatterlist *sg = hwif->sg_table;
-	struct scatterlist *cursg = cmd->cursg;
-	struct page *page;
-	unsigned int offset;
-	u8 *buf;
-
-	if (cursg == NULL)
-		cursg = cmd->cursg = sg;
-
-	while (len) {
-		unsigned nr_bytes = min(len, cursg->length - cmd->cursg_ofs);
-
-		page = sg_page(cursg);
-		offset = cursg->offset + cmd->cursg_ofs;
-
-		/* get the current page and offset */
-		page = nth_page(page, (offset >> PAGE_SHIFT));
-		offset %= PAGE_SIZE;
-
-		nr_bytes = min_t(unsigned, nr_bytes, (PAGE_SIZE - offset));
-
-		buf = kmap_atomic(page) + offset;
-
-		cmd->nleft -= nr_bytes;
-		cmd->cursg_ofs += nr_bytes;
-
-		if (cmd->cursg_ofs == cursg->length) {
-			cursg = cmd->cursg = sg_next(cmd->cursg);
-			cmd->cursg_ofs = 0;
-		}
-
-		/* do the actual data transfer */
-		if (write)
-			hwif->tp_ops->output_data(drive, cmd, buf, nr_bytes);
-		else
-			hwif->tp_ops->input_data(drive, cmd, buf, nr_bytes);
-
-		kunmap_atomic(buf);
-
-		len -= nr_bytes;
-	}
-}
-EXPORT_SYMBOL_GPL(ide_pio_bytes);
-
-static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd,
-			      unsigned int write)
-{
-	unsigned int nr_bytes;
-
-	u8 saved_io_32bit = drive->io_32bit;
-
-	if (cmd->tf_flags & IDE_TFLAG_FS)
-		scsi_req(cmd->rq)->result = 0;
-
-	if (cmd->tf_flags & IDE_TFLAG_IO_16BIT)
-		drive->io_32bit = 0;
-
-	touch_softlockup_watchdog();
-
-	if (cmd->tf_flags & IDE_TFLAG_MULTI_PIO)
-		nr_bytes = min_t(unsigned, cmd->nleft, drive->mult_count << 9);
-	else
-		nr_bytes = SECTOR_SIZE;
-
-	ide_pio_bytes(drive, cmd, write, nr_bytes);
-
-	drive->io_32bit = saved_io_32bit;
-}
-
-static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	if (cmd->tf_flags & IDE_TFLAG_FS) {
-		int nr_bytes = cmd->nbytes - cmd->nleft;
-
-		if (cmd->protocol == ATA_PROT_PIO &&
-		    ((cmd->tf_flags & IDE_TFLAG_WRITE) || cmd->nleft == 0)) {
-			if (cmd->tf_flags & IDE_TFLAG_MULTI_PIO)
-				nr_bytes -= drive->mult_count << 9;
-			else
-				nr_bytes -= SECTOR_SIZE;
-		}
-
-		if (nr_bytes > 0)
-			ide_complete_rq(drive, BLK_STS_OK, nr_bytes);
-	}
-}
-
-void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat)
-{
-	struct request *rq = drive->hwif->rq;
-	u8 err = ide_read_error(drive), nsect = cmd->tf.nsect;
-	u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER);
-
-	ide_complete_cmd(drive, cmd, stat, err);
-	scsi_req(rq)->result = err;
-
-	if (err == 0 && set_xfer) {
-		ide_set_xfer_rate(drive, nsect);
-		ide_driveid_update(drive);
-	}
-
-	ide_complete_rq(drive, err ? BLK_STS_IOERR : BLK_STS_OK, blk_rq_bytes(rq));
-}
-
-/*
- * Handler for command with PIO data phase.
- */
-static ide_startstop_t task_pio_intr(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct ide_cmd *cmd = &drive->hwif->cmd;
-	u8 stat = hwif->tp_ops->read_status(hwif);
-	u8 write = !!(cmd->tf_flags & IDE_TFLAG_WRITE);
-
-	if (write == 0) {
-		/* Error? */
-		if (stat & ATA_ERR)
-			goto out_err;
-
-		/* Didn't want any data? Odd. */
-		if ((stat & ATA_DRQ) == 0) {
-			/* Command all done? */
-			if (OK_STAT(stat, ATA_DRDY, ATA_BUSY))
-				goto out_end;
-
-			/* Assume it was a spurious irq */
-			goto out_wait;
-		}
-	} else {
-		if (!OK_STAT(stat, DRIVE_READY, drive->bad_wstat))
-			goto out_err;
-
-		/* Deal with unexpected ATA data phase. */
-		if (((stat & ATA_DRQ) == 0) ^ (cmd->nleft == 0))
-			goto out_err;
-	}
-
-	if (write && cmd->nleft == 0)
-		goto out_end;
-
-	/* Still data left to transfer. */
-	ide_pio_datablock(drive, cmd, write);
-
-	/* Are we done? Check status and finish transfer. */
-	if (write == 0 && cmd->nleft == 0) {
-		stat = wait_drive_not_busy(drive);
-		if (!OK_STAT(stat, 0, BAD_STAT))
-			goto out_err;
-
-		goto out_end;
-	}
-out_wait:
-	/* Still data left to transfer. */
-	ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
-	return ide_started;
-out_end:
-	if ((cmd->tf_flags & IDE_TFLAG_FS) == 0)
-		ide_finish_cmd(drive, cmd, stat);
-	else
-		ide_complete_rq(drive, BLK_STS_OK, blk_rq_sectors(cmd->rq) << 9);
-	return ide_stopped;
-out_err:
-	ide_error_cmd(drive, cmd);
-	return ide_error(drive, __func__, stat);
-}
-
-static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
-					 struct ide_cmd *cmd)
-{
-	ide_startstop_t startstop;
-
-	if (ide_wait_stat(&startstop, drive, ATA_DRQ,
-			  drive->bad_wstat, WAIT_DRQ)) {
-		pr_err("%s: no DRQ after issuing %sWRITE%s\n", drive->name,
-			(cmd->tf_flags & IDE_TFLAG_MULTI_PIO) ? "MULT" : "",
-			(drive->dev_flags & IDE_DFLAG_LBA48) ? "_EXT" : "");
-		return startstop;
-	}
-
-	if (!force_irqthreads && (drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
-		local_irq_disable();
-
-	ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
-
-	ide_pio_datablock(drive, cmd, 1);
-
-	return ide_started;
-}
-
-int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
-		     u16 nsect)
-{
-	struct request *rq;
-	int error;
-
-	rq = blk_get_request(drive->queue,
-		(cmd->tf_flags & IDE_TFLAG_WRITE) ?
-			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
-	ide_req(rq)->type = ATA_PRIV_TASKFILE;
-
-	/*
-	 * (ks) We transfer currently only whole sectors.
-	 * This is suffient for now.  But, it would be great,
-	 * if we would find a solution to transfer any size.
-	 * To support special commands like READ LONG.
-	 */
-	if (nsect) {
-		error = blk_rq_map_kern(drive->queue, rq, buf,
-					nsect * SECTOR_SIZE, GFP_NOIO);
-		if (error)
-			goto put_req;
-	}
-
-	ide_req(rq)->special = cmd;
-	cmd->rq = rq;
-
-	blk_execute_rq(NULL, rq, 0);
-	error = scsi_req(rq)->result ? -EIO : 0;
-put_req:
-	blk_put_request(rq);
-	return error;
-}
-EXPORT_SYMBOL(ide_raw_taskfile);
-
-int ide_no_data_taskfile(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	cmd->protocol = ATA_PROT_NODATA;
-
-	return ide_raw_taskfile(drive, cmd, NULL, 0);
-}
-EXPORT_SYMBOL_GPL(ide_no_data_taskfile);
-
-#ifdef CONFIG_IDE_TASK_IOCTL
-int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg)
-{
-	ide_task_request_t	*req_task;
-	struct ide_cmd		cmd;
-	u8 *outbuf		= NULL;
-	u8 *inbuf		= NULL;
-	u8 *data_buf		= NULL;
-	int err			= 0;
-	int tasksize		= sizeof(struct ide_task_request_s);
-	unsigned int taskin	= 0;
-	unsigned int taskout	= 0;
-	u16 nsect		= 0;
-	char __user *buf = (char __user *)arg;
-
-	req_task = memdup_user(buf, tasksize);
-	if (IS_ERR(req_task))
-		return PTR_ERR(req_task);
-
-	taskout = req_task->out_size;
-	taskin  = req_task->in_size;
-
-	if (taskin > 65536 || taskout > 65536) {
-		err = -EINVAL;
-		goto abort;
-	}
-
-	if (taskout) {
-		int outtotal = tasksize;
-		outbuf = kzalloc(taskout, GFP_KERNEL);
-		if (outbuf == NULL) {
-			err = -ENOMEM;
-			goto abort;
-		}
-		if (copy_from_user(outbuf, buf + outtotal, taskout)) {
-			err = -EFAULT;
-			goto abort;
-		}
-	}
-
-	if (taskin) {
-		int intotal = tasksize + taskout;
-		inbuf = kzalloc(taskin, GFP_KERNEL);
-		if (inbuf == NULL) {
-			err = -ENOMEM;
-			goto abort;
-		}
-		if (copy_from_user(inbuf, buf + intotal, taskin)) {
-			err = -EFAULT;
-			goto abort;
-		}
-	}
-
-	memset(&cmd, 0, sizeof(cmd));
-
-	memcpy(&cmd.hob, req_task->hob_ports, HDIO_DRIVE_HOB_HDR_SIZE - 2);
-	memcpy(&cmd.tf,  req_task->io_ports,  HDIO_DRIVE_TASK_HDR_SIZE);
-
-	cmd.valid.out.tf = IDE_VALID_DEVICE;
-	cmd.valid.in.tf  = IDE_VALID_DEVICE | IDE_VALID_IN_TF;
-	cmd.tf_flags = IDE_TFLAG_IO_16BIT;
-
-	if (drive->dev_flags & IDE_DFLAG_LBA48) {
-		cmd.tf_flags |= IDE_TFLAG_LBA48;
-		cmd.valid.in.hob = IDE_VALID_IN_HOB;
-	}
-
-	if (req_task->out_flags.all) {
-		cmd.ftf_flags |= IDE_FTFLAG_FLAGGED;
-
-		if (req_task->out_flags.b.data)
-			cmd.ftf_flags |= IDE_FTFLAG_OUT_DATA;
-
-		if (req_task->out_flags.b.nsector_hob)
-			cmd.valid.out.hob |= IDE_VALID_NSECT;
-		if (req_task->out_flags.b.sector_hob)
-			cmd.valid.out.hob |= IDE_VALID_LBAL;
-		if (req_task->out_flags.b.lcyl_hob)
-			cmd.valid.out.hob |= IDE_VALID_LBAM;
-		if (req_task->out_flags.b.hcyl_hob)
-			cmd.valid.out.hob |= IDE_VALID_LBAH;
-
-		if (req_task->out_flags.b.error_feature)
-			cmd.valid.out.tf  |= IDE_VALID_FEATURE;
-		if (req_task->out_flags.b.nsector)
-			cmd.valid.out.tf  |= IDE_VALID_NSECT;
-		if (req_task->out_flags.b.sector)
-			cmd.valid.out.tf  |= IDE_VALID_LBAL;
-		if (req_task->out_flags.b.lcyl)
-			cmd.valid.out.tf  |= IDE_VALID_LBAM;
-		if (req_task->out_flags.b.hcyl)
-			cmd.valid.out.tf  |= IDE_VALID_LBAH;
-	} else {
-		cmd.valid.out.tf |= IDE_VALID_OUT_TF;
-		if (cmd.tf_flags & IDE_TFLAG_LBA48)
-			cmd.valid.out.hob |= IDE_VALID_OUT_HOB;
-	}
-
-	if (req_task->in_flags.b.data)
-		cmd.ftf_flags |= IDE_FTFLAG_IN_DATA;
-
-	if (req_task->req_cmd == IDE_DRIVE_TASK_RAW_WRITE) {
-		/* fixup data phase if needed */
-		if (req_task->data_phase == TASKFILE_IN_DMAQ ||
-		    req_task->data_phase == TASKFILE_IN_DMA)
-			cmd.tf_flags |= IDE_TFLAG_WRITE;
-	}
-
-	cmd.protocol = ATA_PROT_DMA;
-
-	switch (req_task->data_phase) {
-	case TASKFILE_MULTI_OUT:
-		if (!drive->mult_count) {
-			/* (hs): give up if multcount is not set */
-			pr_err("%s: %s Multimode Write multcount is not set\n",
-				drive->name, __func__);
-			err = -EPERM;
-			goto abort;
-		}
-		cmd.tf_flags |= IDE_TFLAG_MULTI_PIO;
-		fallthrough;
-	case TASKFILE_OUT:
-		cmd.protocol = ATA_PROT_PIO;
-		fallthrough;
-	case TASKFILE_OUT_DMAQ:
-	case TASKFILE_OUT_DMA:
-		cmd.tf_flags |= IDE_TFLAG_WRITE;
-		nsect = taskout / SECTOR_SIZE;
-		data_buf = outbuf;
-		break;
-	case TASKFILE_MULTI_IN:
-		if (!drive->mult_count) {
-			/* (hs): give up if multcount is not set */
-			pr_err("%s: %s Multimode Read multcount is not set\n",
-				drive->name, __func__);
-			err = -EPERM;
-			goto abort;
-		}
-		cmd.tf_flags |= IDE_TFLAG_MULTI_PIO;
-		fallthrough;
-	case TASKFILE_IN:
-		cmd.protocol = ATA_PROT_PIO;
-		fallthrough;
-	case TASKFILE_IN_DMAQ:
-	case TASKFILE_IN_DMA:
-		nsect = taskin / SECTOR_SIZE;
-		data_buf = inbuf;
-		break;
-	case TASKFILE_NO_DATA:
-		cmd.protocol = ATA_PROT_NODATA;
-		break;
-	default:
-		err = -EFAULT;
-		goto abort;
-	}
-
-	if (req_task->req_cmd == IDE_DRIVE_TASK_NO_DATA)
-		nsect = 0;
-	else if (!nsect) {
-		nsect = (cmd.hob.nsect << 8) | cmd.tf.nsect;
-
-		if (!nsect) {
-			pr_err("%s: in/out command without data\n",
-					drive->name);
-			err = -EFAULT;
-			goto abort;
-		}
-	}
-
-	err = ide_raw_taskfile(drive, &cmd, data_buf, nsect);
-
-	memcpy(req_task->hob_ports, &cmd.hob, HDIO_DRIVE_HOB_HDR_SIZE - 2);
-	memcpy(req_task->io_ports,  &cmd.tf,  HDIO_DRIVE_TASK_HDR_SIZE);
-
-	if ((cmd.ftf_flags & IDE_FTFLAG_SET_IN_FLAGS) &&
-	    req_task->in_flags.all == 0) {
-		req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS;
-		if (drive->dev_flags & IDE_DFLAG_LBA48)
-			req_task->in_flags.all |= (IDE_HOB_STD_IN_FLAGS << 8);
-	}
-
-	if (copy_to_user(buf, req_task, tasksize)) {
-		err = -EFAULT;
-		goto abort;
-	}
-	if (taskout) {
-		int outtotal = tasksize;
-		if (copy_to_user(buf + outtotal, outbuf, taskout)) {
-			err = -EFAULT;
-			goto abort;
-		}
-	}
-	if (taskin) {
-		int intotal = tasksize + taskout;
-		if (copy_to_user(buf + intotal, inbuf, taskin)) {
-			err = -EFAULT;
-			goto abort;
-		}
-	}
-abort:
-	kfree(req_task);
-	kfree(outbuf);
-	kfree(inbuf);
-
-	return err;
-}
-#endif
diff --git a/drivers/ide/ide-timings.c b/drivers/ide/ide-timings.c
deleted file mode 100644
index cfe78df74b7d..000000000000
--- a/drivers/ide/ide-timings.c
+++ /dev/null
@@ -1,198 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  Copyright (c) 1999-2001 Vojtech Pavlik
- *  Copyright (c) 2007-2008 Bartlomiej Zolnierkiewicz
- *
- * Should you need to contact me, the author, you can do so either by
- * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
- * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
- */
-
-#include <linux/kernel.h>
-#include <linux/ide.h>
-#include <linux/module.h>
-
-/*
- * PIO 0-5, MWDMA 0-2 and UDMA 0-6 timings (in nanoseconds).
- * These were taken from ATA/ATAPI-6 standard, rev 0a, except
- * for PIO 5, which is a nonstandard extension and UDMA6, which
- * is currently supported only by Maxtor drives.
- */
-
-static struct ide_timing ide_timing[] = {
-
-	{ XFER_UDMA_6,     0,   0,   0,   0,   0,   0,   0,  15 },
-	{ XFER_UDMA_5,     0,   0,   0,   0,   0,   0,   0,  20 },
-	{ XFER_UDMA_4,     0,   0,   0,   0,   0,   0,   0,  30 },
-	{ XFER_UDMA_3,     0,   0,   0,   0,   0,   0,   0,  45 },
-
-	{ XFER_UDMA_2,     0,   0,   0,   0,   0,   0,   0,  60 },
-	{ XFER_UDMA_1,     0,   0,   0,   0,   0,   0,   0,  80 },
-	{ XFER_UDMA_0,     0,   0,   0,   0,   0,   0,   0, 120 },
-
-	{ XFER_MW_DMA_4,  25,   0,   0,   0,  55,  20,  80,   0 },
-	{ XFER_MW_DMA_3,  25,   0,   0,   0,  65,  25, 100,   0 },
-	{ XFER_MW_DMA_2,  25,   0,   0,   0,  70,  25, 120,   0 },
-	{ XFER_MW_DMA_1,  45,   0,   0,   0,  80,  50, 150,   0 },
-	{ XFER_MW_DMA_0,  60,   0,   0,   0, 215, 215, 480,   0 },
-
-	{ XFER_SW_DMA_2,  60,   0,   0,   0, 120, 120, 240,   0 },
-	{ XFER_SW_DMA_1,  90,   0,   0,   0, 240, 240, 480,   0 },
-	{ XFER_SW_DMA_0, 120,   0,   0,   0, 480, 480, 960,   0 },
-
-	{ XFER_PIO_6,     10,  55,  20,  80,  55,  20,  80,   0 },
-	{ XFER_PIO_5,     15,  65,  25, 100,  65,  25, 100,   0 },
-	{ XFER_PIO_4,     25,  70,  25, 120,  70,  25, 120,   0 },
-	{ XFER_PIO_3,     30,  80,  70, 180,  80,  70, 180,   0 },
-
-	{ XFER_PIO_2,     30, 290,  40, 330, 100,  90, 240,   0 },
-	{ XFER_PIO_1,     50, 290,  93, 383, 125, 100, 383,   0 },
-	{ XFER_PIO_0,     70, 290, 240, 600, 165, 150, 600,   0 },
-
-	{ XFER_PIO_SLOW, 120, 290, 240, 960, 290, 240, 960,   0 },
-
-	{ 0xff }
-};
-
-struct ide_timing *ide_timing_find_mode(u8 speed)
-{
-	struct ide_timing *t;
-
-	for (t = ide_timing; t->mode != speed; t++)
-		if (t->mode == 0xff)
-			return NULL;
-	return t;
-}
-EXPORT_SYMBOL_GPL(ide_timing_find_mode);
-
-u16 ide_pio_cycle_time(ide_drive_t *drive, u8 pio)
-{
-	u16 *id = drive->id;
-	struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio);
-	u16 cycle = 0;
-
-	if (id[ATA_ID_FIELD_VALID] & 2) {
-		if (ata_id_has_iordy(drive->id))
-			cycle = id[ATA_ID_EIDE_PIO_IORDY];
-		else
-			cycle = id[ATA_ID_EIDE_PIO];
-
-		/* conservative "downgrade" for all pre-ATA2 drives */
-		if (pio < 3 && cycle < t->cycle)
-			cycle = 0; /* use standard timing */
-
-		/* Use the standard timing for the CF specific modes too */
-		if (pio > 4 && ata_id_is_cfa(id))
-			cycle = 0;
-	}
-
-	return cycle ? cycle : t->cycle;
-}
-EXPORT_SYMBOL_GPL(ide_pio_cycle_time);
-
-#define ENOUGH(v, unit)		(((v) - 1) / (unit) + 1)
-#define EZ(v, unit)		((v) ? ENOUGH((v) * 1000, unit) : 0)
-
-static void ide_timing_quantize(struct ide_timing *t, struct ide_timing *q,
-				int T, int UT)
-{
-	q->setup   = EZ(t->setup,   T);
-	q->act8b   = EZ(t->act8b,   T);
-	q->rec8b   = EZ(t->rec8b,   T);
-	q->cyc8b   = EZ(t->cyc8b,   T);
-	q->active  = EZ(t->active,  T);
-	q->recover = EZ(t->recover, T);
-	q->cycle   = EZ(t->cycle,   T);
-	q->udma    = EZ(t->udma,    UT);
-}
-
-void ide_timing_merge(struct ide_timing *a, struct ide_timing *b,
-		      struct ide_timing *m, unsigned int what)
-{
-	if (what & IDE_TIMING_SETUP)
-		m->setup   = max(a->setup,   b->setup);
-	if (what & IDE_TIMING_ACT8B)
-		m->act8b   = max(a->act8b,   b->act8b);
-	if (what & IDE_TIMING_REC8B)
-		m->rec8b   = max(a->rec8b,   b->rec8b);
-	if (what & IDE_TIMING_CYC8B)
-		m->cyc8b   = max(a->cyc8b,   b->cyc8b);
-	if (what & IDE_TIMING_ACTIVE)
-		m->active  = max(a->active,  b->active);
-	if (what & IDE_TIMING_RECOVER)
-		m->recover = max(a->recover, b->recover);
-	if (what & IDE_TIMING_CYCLE)
-		m->cycle   = max(a->cycle,   b->cycle);
-	if (what & IDE_TIMING_UDMA)
-		m->udma    = max(a->udma,    b->udma);
-}
-EXPORT_SYMBOL_GPL(ide_timing_merge);
-
-int ide_timing_compute(ide_drive_t *drive, u8 speed,
-		       struct ide_timing *t, int T, int UT)
-{
-	u16 *id = drive->id;
-	struct ide_timing *s, p;
-
-	/*
-	 * Find the mode.
-	 */
-	s = ide_timing_find_mode(speed);
-	if (s == NULL)
-		return -EINVAL;
-
-	/*
-	 * Copy the timing from the table.
-	 */
-	*t = *s;
-
-	/*
-	 * If the drive is an EIDE drive, it can tell us it needs extended
-	 * PIO/MWDMA cycle timing.
-	 */
-	if (id[ATA_ID_FIELD_VALID] & 2) {	/* EIDE drive */
-		memset(&p, 0, sizeof(p));
-
-		if (speed >= XFER_PIO_0 && speed < XFER_SW_DMA_0) {
-			if (speed <= XFER_PIO_2)
-				p.cycle = p.cyc8b = id[ATA_ID_EIDE_PIO];
-			else if ((speed <= XFER_PIO_4) ||
-				 (speed == XFER_PIO_5 && !ata_id_is_cfa(id)))
-				p.cycle = p.cyc8b = id[ATA_ID_EIDE_PIO_IORDY];
-		} else if (speed >= XFER_MW_DMA_0 && speed <= XFER_MW_DMA_2)
-			p.cycle = id[ATA_ID_EIDE_DMA_MIN];
-
-		ide_timing_merge(&p, t, t, IDE_TIMING_CYCLE | IDE_TIMING_CYC8B);
-	}
-
-	/*
-	 * Convert the timing to bus clock counts.
-	 */
-	ide_timing_quantize(t, t, T, UT);
-
-	/*
-	 * Even in DMA/UDMA modes we still use PIO access for IDENTIFY,
-	 * S.M.A.R.T and some other commands. We have to ensure that the
-	 * DMA cycle timing is slower/equal than the current PIO timing.
-	 */
-	if (speed >= XFER_SW_DMA_0) {
-		ide_timing_compute(drive, drive->pio_mode, &p, T, UT);
-		ide_timing_merge(&p, t, t, IDE_TIMING_ALL);
-	}
-
-	/*
-	 * Lengthen active & recovery time so that cycle time is correct.
-	 */
-	if (t->act8b + t->rec8b < t->cyc8b) {
-		t->act8b += (t->cyc8b - (t->act8b + t->rec8b)) / 2;
-		t->rec8b = t->cyc8b - t->act8b;
-	}
-
-	if (t->active + t->recover < t->cycle) {
-		t->active += (t->cycle - (t->active + t->recover)) / 2;
-		t->recover = t->cycle - t->active;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_timing_compute);
diff --git a/drivers/ide/ide-xfer-mode.c b/drivers/ide/ide-xfer-mode.c
deleted file mode 100644
index 0b9709b489b7..000000000000
--- a/drivers/ide/ide-xfer-mode.c
+++ /dev/null
@@ -1,267 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/interrupt.h>
-#include <linux/ide.h>
-#include <linux/bitops.h>
-
-static const char *udma_str[] =
-	 { "UDMA/16", "UDMA/25",  "UDMA/33",  "UDMA/44",
-	   "UDMA/66", "UDMA/100", "UDMA/133", "UDMA7" };
-static const char *mwdma_str[] =
-	{ "MWDMA0", "MWDMA1", "MWDMA2", "MWDMA3", "MWDMA4" };
-static const char *swdma_str[] =
-	{ "SWDMA0", "SWDMA1", "SWDMA2" };
-static const char *pio_str[] =
-	{ "PIO0", "PIO1", "PIO2", "PIO3", "PIO4", "PIO5", "PIO6" };
-
-/**
- *	ide_xfer_verbose	-	return IDE mode names
- *	@mode: transfer mode
- *
- *	Returns a constant string giving the name of the mode
- *	requested.
- */
-
-const char *ide_xfer_verbose(u8 mode)
-{
-	const char *s;
-	u8 i = mode & 0xf;
-
-	if (mode >= XFER_UDMA_0 && mode <= XFER_UDMA_7)
-		s = udma_str[i];
-	else if (mode >= XFER_MW_DMA_0 && mode <= XFER_MW_DMA_4)
-		s = mwdma_str[i];
-	else if (mode >= XFER_SW_DMA_0 && mode <= XFER_SW_DMA_2)
-		s = swdma_str[i];
-	else if (mode >= XFER_PIO_0 && mode <= XFER_PIO_6)
-		s = pio_str[i & 0x7];
-	else if (mode == XFER_PIO_SLOW)
-		s = "PIO SLOW";
-	else
-		s = "XFER ERROR";
-
-	return s;
-}
-EXPORT_SYMBOL(ide_xfer_verbose);
-
-/**
- *	ide_get_best_pio_mode	-	get PIO mode from drive
- *	@drive: drive to consider
- *	@mode_wanted: preferred mode
- *	@max_mode: highest allowed mode
- *
- *	This routine returns the recommended PIO settings for a given drive,
- *	based on the drive->id information and the ide_pio_blacklist[].
- *
- *	Drive PIO mode is auto-selected if 255 is passed as mode_wanted.
- *	This is used by most chipset support modules when "auto-tuning".
- */
-
-static u8 ide_get_best_pio_mode(ide_drive_t *drive, u8 mode_wanted, u8 max_mode)
-{
-	u16 *id = drive->id;
-	int pio_mode = -1, overridden = 0;
-
-	if (mode_wanted != 255)
-		return min_t(u8, mode_wanted, max_mode);
-
-	if ((drive->hwif->host_flags & IDE_HFLAG_PIO_NO_BLACKLIST) == 0)
-		pio_mode = ide_scan_pio_blacklist((char *)&id[ATA_ID_PROD]);
-
-	if (pio_mode != -1) {
-		printk(KERN_INFO "%s: is on PIO blacklist\n", drive->name);
-	} else {
-		pio_mode = id[ATA_ID_OLD_PIO_MODES] >> 8;
-		if (pio_mode > 2) {	/* 2 is maximum allowed tPIO value */
-			pio_mode = 2;
-			overridden = 1;
-		}
-
-		if (id[ATA_ID_FIELD_VALID] & 2) {	      /* ATA2? */
-			if (ata_id_is_cfa(id) && (id[ATA_ID_CFA_MODES] & 7))
-				pio_mode = 4 + min_t(int, 2,
-						     id[ATA_ID_CFA_MODES] & 7);
-			else if (ata_id_has_iordy(id)) {
-				if (id[ATA_ID_PIO_MODES] & 7) {
-					overridden = 0;
-					if (id[ATA_ID_PIO_MODES] & 4)
-						pio_mode = 5;
-					else if (id[ATA_ID_PIO_MODES] & 2)
-						pio_mode = 4;
-					else
-						pio_mode = 3;
-				}
-			}
-		}
-
-		if (overridden)
-			printk(KERN_INFO "%s: tPIO > 2, assuming tPIO = 2\n",
-					 drive->name);
-	}
-
-	if (pio_mode > max_mode)
-		pio_mode = max_mode;
-
-	return pio_mode;
-}
-
-int ide_pio_need_iordy(ide_drive_t *drive, const u8 pio)
-{
-	/*
-	 * IORDY may lead to controller lock up on certain controllers
-	 * if the port is not occupied.
-	 */
-	if (pio == 0 && (drive->hwif->port_flags & IDE_PFLAG_PROBING))
-		return 0;
-	return ata_id_pio_need_iordy(drive->id, pio);
-}
-EXPORT_SYMBOL_GPL(ide_pio_need_iordy);
-
-int ide_set_pio_mode(ide_drive_t *drive, const u8 mode)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-
-	if (hwif->host_flags & IDE_HFLAG_NO_SET_MODE)
-		return 0;
-
-	if (port_ops == NULL || port_ops->set_pio_mode == NULL)
-		return -1;
-
-	/*
-	 * TODO: temporary hack for some legacy host drivers that didn't
-	 * set transfer mode on the device in ->set_pio_mode method...
-	 */
-	if (port_ops->set_dma_mode == NULL) {
-		drive->pio_mode = mode;
-		port_ops->set_pio_mode(hwif, drive);
-		return 0;
-	}
-
-	if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) {
-		if (ide_config_drive_speed(drive, mode))
-			return -1;
-		drive->pio_mode = mode;
-		port_ops->set_pio_mode(hwif, drive);
-		return 0;
-	} else {
-		drive->pio_mode = mode;
-		port_ops->set_pio_mode(hwif, drive);
-		return ide_config_drive_speed(drive, mode);
-	}
-}
-
-int ide_set_dma_mode(ide_drive_t *drive, const u8 mode)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-
-	if (hwif->host_flags & IDE_HFLAG_NO_SET_MODE)
-		return 0;
-
-	if (port_ops == NULL || port_ops->set_dma_mode == NULL)
-		return -1;
-
-	if (hwif->host_flags & IDE_HFLAG_POST_SET_MODE) {
-		if (ide_config_drive_speed(drive, mode))
-			return -1;
-		drive->dma_mode = mode;
-		port_ops->set_dma_mode(hwif, drive);
-		return 0;
-	} else {
-		drive->dma_mode = mode;
-		port_ops->set_dma_mode(hwif, drive);
-		return ide_config_drive_speed(drive, mode);
-	}
-}
-EXPORT_SYMBOL_GPL(ide_set_dma_mode);
-
-/* req_pio == "255" for auto-tune */
-void ide_set_pio(ide_drive_t *drive, u8 req_pio)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-	u8 host_pio, pio;
-
-	if (port_ops == NULL || port_ops->set_pio_mode == NULL ||
-	    (hwif->host_flags & IDE_HFLAG_NO_SET_MODE))
-		return;
-
-	BUG_ON(hwif->pio_mask == 0x00);
-
-	host_pio = fls(hwif->pio_mask) - 1;
-
-	pio = ide_get_best_pio_mode(drive, req_pio, host_pio);
-
-	/*
-	 * TODO:
-	 * - report device max PIO mode
-	 * - check req_pio != 255 against device max PIO mode
-	 */
-	printk(KERN_DEBUG "%s: host max PIO%d wanted PIO%d%s selected PIO%d\n",
-			  drive->name, host_pio, req_pio,
-			  req_pio == 255 ? "(auto-tune)" : "", pio);
-
-	(void)ide_set_pio_mode(drive, XFER_PIO_0 + pio);
-}
-EXPORT_SYMBOL_GPL(ide_set_pio);
-
-/**
- *	ide_rate_filter		-	filter transfer mode
- *	@drive: IDE device
- *	@speed: desired speed
- *
- *	Given the available transfer modes this function returns
- *	the best available speed at or below the speed requested.
- *
- *	TODO: check device PIO capabilities
- */
-
-static u8 ide_rate_filter(ide_drive_t *drive, u8 speed)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 mode = ide_find_dma_mode(drive, speed);
-
-	if (mode == 0) {
-		if (hwif->pio_mask)
-			mode = fls(hwif->pio_mask) - 1 + XFER_PIO_0;
-		else
-			mode = XFER_PIO_4;
-	}
-
-/*	printk("%s: mode 0x%02x, speed 0x%02x\n", __func__, mode, speed); */
-
-	return min(speed, mode);
-}
-
-/**
- *	ide_set_xfer_rate	-	set transfer rate
- *	@drive: drive to set
- *	@rate: speed to attempt to set
- *
- *	General helper for setting the speed of an IDE device. This
- *	function knows about user enforced limits from the configuration
- *	which ->set_pio_mode/->set_dma_mode does not.
- */
-
-int ide_set_xfer_rate(ide_drive_t *drive, u8 rate)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	const struct ide_port_ops *port_ops = hwif->port_ops;
-
-	if (port_ops == NULL || port_ops->set_dma_mode == NULL ||
-	    (hwif->host_flags & IDE_HFLAG_NO_SET_MODE))
-		return -1;
-
-	rate = ide_rate_filter(drive, rate);
-
-	BUG_ON(rate < XFER_PIO_0);
-
-	if (rate >= XFER_PIO_0 && rate <= XFER_PIO_6)
-		return ide_set_pio_mode(drive, rate);
-
-	return ide_set_dma_mode(drive, rate);
-}
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
deleted file mode 100644
index 9a9c64fd1032..000000000000
--- a/drivers/ide/ide.c
+++ /dev/null
@@ -1,415 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1994-1998	    Linus Torvalds & authors (see below)
- *  Copyright (C) 2003-2005, 2007   Bartlomiej Zolnierkiewicz
- */
-
-/*
- *  Mostly written by Mark Lord  <mlord@pobox.com>
- *                and Gadi Oxman <gadio@netvision.net.il>
- *                and Andre Hedrick <andre@linux-ide.org>
- *
- *  See linux/MAINTAINERS for address of current maintainer.
- *
- * This is the multiple IDE interface driver, as evolved from hd.c.
- * It supports up to MAX_HWIFS IDE interfaces, on one or more IRQs
- *   (usually 14 & 15).
- * There can be up to two drives per interface, as per the ATA-2 spec.
- *
- * ...
- *
- *  From hd.c:
- *  |
- *  | It traverses the request-list, using interrupts to jump between functions.
- *  | As nearly all functions can be called within interrupts, we may not sleep.
- *  | Special care is recommended.  Have Fun!
- *  |
- *  | modified by Drew Eckhardt to check nr of hd's from the CMOS.
- *  |
- *  | Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
- *  | in the early extended-partition checks and added DM partitions.
- *  |
- *  | Early work on error handling by Mika Liljeberg (liljeber@cs.Helsinki.FI).
- *  |
- *  | IRQ-unmask, drive-id, multiple-mode, support for ">16 heads",
- *  | and general streamlining by Mark Lord (mlord@pobox.com).
- *
- *  October, 1994 -- Complete line-by-line overhaul for linux 1.1.x, by:
- *
- *	Mark Lord	(mlord@pobox.com)		(IDE Perf.Pkg)
- *	Delman Lee	(delman@ieee.org)		("Mr. atdisk2")
- *	Scott Snyder	(snyder@fnald0.fnal.gov)	(ATAPI IDE cd-rom)
- *
- *  This was a rewrite of just about everything from hd.c, though some original
- *  code is still sprinkled about.  Think of it as a major evolution, with
- *  inspiration from lots of linux users, esp.  hamish@zot.apana.org.au
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/major.h>
-#include <linux/errno.h>
-#include <linux/genhd.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/hdreg.h>
-#include <linux/completion.h>
-#include <linux/device.h>
-
-struct class *ide_port_class;
-
-/**
- * ide_device_get	-	get an additional reference to a ide_drive_t
- * @drive:	device to get a reference to
- *
- * Gets a reference to the ide_drive_t and increments the use count of the
- * underlying LLDD module.
- */
-int ide_device_get(ide_drive_t *drive)
-{
-	struct device *host_dev;
-	struct module *module;
-
-	if (!get_device(&drive->gendev))
-		return -ENXIO;
-
-	host_dev = drive->hwif->host->dev[0];
-	module = host_dev ? host_dev->driver->owner : NULL;
-
-	if (module && !try_module_get(module)) {
-		put_device(&drive->gendev);
-		return -ENXIO;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_device_get);
-
-/**
- * ide_device_put	-	release a reference to a ide_drive_t
- * @drive:	device to release a reference on
- *
- * Release a reference to the ide_drive_t and decrements the use count of
- * the underlying LLDD module.
- */
-void ide_device_put(ide_drive_t *drive)
-{
-#ifdef CONFIG_MODULE_UNLOAD
-	struct device *host_dev = drive->hwif->host->dev[0];
-	struct module *module = host_dev ? host_dev->driver->owner : NULL;
-
-	module_put(module);
-#endif
-	put_device(&drive->gendev);
-}
-EXPORT_SYMBOL_GPL(ide_device_put);
-
-static int ide_bus_match(struct device *dev, struct device_driver *drv)
-{
-	return 1;
-}
-
-static int ide_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-
-	add_uevent_var(env, "MEDIA=%s", ide_media_string(drive));
-	add_uevent_var(env, "DRIVENAME=%s", drive->name);
-	add_uevent_var(env, "MODALIAS=ide:m-%s", ide_media_string(drive));
-	return 0;
-}
-
-static int generic_ide_probe(struct device *dev)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	struct ide_driver *drv = to_ide_driver(dev->driver);
-
-	return drv->probe ? drv->probe(drive) : -ENODEV;
-}
-
-static int generic_ide_remove(struct device *dev)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	struct ide_driver *drv = to_ide_driver(dev->driver);
-
-	if (drv->remove)
-		drv->remove(drive);
-
-	return 0;
-}
-
-static void generic_ide_shutdown(struct device *dev)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	struct ide_driver *drv = to_ide_driver(dev->driver);
-
-	if (dev->driver && drv->shutdown)
-		drv->shutdown(drive);
-}
-
-struct bus_type ide_bus_type = {
-	.name		= "ide",
-	.match		= ide_bus_match,
-	.uevent		= ide_uevent,
-	.probe		= generic_ide_probe,
-	.remove		= generic_ide_remove,
-	.shutdown	= generic_ide_shutdown,
-	.dev_groups	= ide_dev_groups,
-	.suspend	= generic_ide_suspend,
-	.resume		= generic_ide_resume,
-};
-
-EXPORT_SYMBOL_GPL(ide_bus_type);
-
-int ide_vlb_clk;
-EXPORT_SYMBOL_GPL(ide_vlb_clk);
-
-module_param_named(vlb_clock, ide_vlb_clk, int, 0);
-MODULE_PARM_DESC(vlb_clock, "VLB clock frequency (in MHz)");
-
-int ide_pci_clk;
-EXPORT_SYMBOL_GPL(ide_pci_clk);
-
-module_param_named(pci_clock, ide_pci_clk, int, 0);
-MODULE_PARM_DESC(pci_clock, "PCI bus clock frequency (in MHz)");
-
-static int ide_set_dev_param_mask(const char *s, const struct kernel_param *kp)
-{
-	unsigned int a, b, i, j = 1;
-	unsigned int *dev_param_mask = (unsigned int *)kp->arg;
-
-	/* controller . device (0 or 1) [ : 1 (set) | 0 (clear) ] */
-	if (sscanf(s, "%u.%u:%u", &a, &b, &j) != 3 &&
-	    sscanf(s, "%u.%u", &a, &b) != 2)
-		return -EINVAL;
-
-	i = a * MAX_DRIVES + b;
-
-	if (i >= MAX_HWIFS * MAX_DRIVES || j > 1)
-		return -EINVAL;
-
-	if (j)
-		*dev_param_mask |= (1 << i);
-	else
-		*dev_param_mask &= ~(1 << i);
-
-	return 0;
-}
-
-static const struct kernel_param_ops param_ops_ide_dev_mask = {
-	.set = ide_set_dev_param_mask
-};
-
-#define param_check_ide_dev_mask(name, p) param_check_uint(name, p)
-
-static unsigned int ide_nodma;
-
-module_param_named(nodma, ide_nodma, ide_dev_mask, 0);
-MODULE_PARM_DESC(nodma, "disallow DMA for a device");
-
-static unsigned int ide_noflush;
-
-module_param_named(noflush, ide_noflush, ide_dev_mask, 0);
-MODULE_PARM_DESC(noflush, "disable flush requests for a device");
-
-static unsigned int ide_nohpa;
-
-module_param_named(nohpa, ide_nohpa, ide_dev_mask, 0);
-MODULE_PARM_DESC(nohpa, "disable Host Protected Area for a device");
-
-static unsigned int ide_noprobe;
-
-module_param_named(noprobe, ide_noprobe, ide_dev_mask, 0);
-MODULE_PARM_DESC(noprobe, "skip probing for a device");
-
-static unsigned int ide_nowerr;
-
-module_param_named(nowerr, ide_nowerr, ide_dev_mask, 0);
-MODULE_PARM_DESC(nowerr, "ignore the ATA_DF bit for a device");
-
-static unsigned int ide_cdroms;
-
-module_param_named(cdrom, ide_cdroms, ide_dev_mask, 0);
-MODULE_PARM_DESC(cdrom, "force device as a CD-ROM");
-
-struct chs_geom {
-	unsigned int	cyl;
-	u8		head;
-	u8		sect;
-};
-
-static unsigned int ide_disks;
-static struct chs_geom ide_disks_chs[MAX_HWIFS * MAX_DRIVES];
-
-static int ide_set_disk_chs(const char *str, const struct kernel_param *kp)
-{
-	unsigned int a, b, c = 0, h = 0, s = 0, i, j = 1;
-
-	/* controller . device (0 or 1) : Cylinders , Heads , Sectors */
-	/* controller . device (0 or 1) : 1 (use CHS) | 0 (ignore CHS) */
-	if (sscanf(str, "%u.%u:%u,%u,%u", &a, &b, &c, &h, &s) != 5 &&
-	    sscanf(str, "%u.%u:%u", &a, &b, &j) != 3)
-		return -EINVAL;
-
-	i = a * MAX_DRIVES + b;
-
-	if (i >= MAX_HWIFS * MAX_DRIVES || j > 1)
-		return -EINVAL;
-
-	if (c > INT_MAX || h > 255 || s > 255)
-		return -EINVAL;
-
-	if (j)
-		ide_disks |= (1 << i);
-	else
-		ide_disks &= ~(1 << i);
-
-	ide_disks_chs[i].cyl  = c;
-	ide_disks_chs[i].head = h;
-	ide_disks_chs[i].sect = s;
-
-	return 0;
-}
-
-module_param_call(chs, ide_set_disk_chs, NULL, NULL, 0);
-MODULE_PARM_DESC(chs, "force device as a disk (using CHS)");
-
-static void ide_dev_apply_params(ide_drive_t *drive, u8 unit)
-{
-	int i = drive->hwif->index * MAX_DRIVES + unit;
-
-	if (ide_nodma & (1 << i)) {
-		printk(KERN_INFO "ide: disallowing DMA for %s\n", drive->name);
-		drive->dev_flags |= IDE_DFLAG_NODMA;
-	}
-	if (ide_noflush & (1 << i)) {
-		printk(KERN_INFO "ide: disabling flush requests for %s\n",
-				 drive->name);
-		drive->dev_flags |= IDE_DFLAG_NOFLUSH;
-	}
-	if (ide_nohpa & (1 << i)) {
-		printk(KERN_INFO "ide: disabling Host Protected Area for %s\n",
-				 drive->name);
-		drive->dev_flags |= IDE_DFLAG_NOHPA;
-	}
-	if (ide_noprobe & (1 << i)) {
-		printk(KERN_INFO "ide: skipping probe for %s\n", drive->name);
-		drive->dev_flags |= IDE_DFLAG_NOPROBE;
-	}
-	if (ide_nowerr & (1 << i)) {
-		printk(KERN_INFO "ide: ignoring the ATA_DF bit for %s\n",
-				 drive->name);
-		drive->bad_wstat = BAD_R_STAT;
-	}
-	if (ide_cdroms & (1 << i)) {
-		printk(KERN_INFO "ide: forcing %s as a CD-ROM\n", drive->name);
-		drive->dev_flags |= IDE_DFLAG_PRESENT;
-		drive->media = ide_cdrom;
-		/* an ATAPI device ignores DRDY */
-		drive->ready_stat = 0;
-	}
-	if (ide_disks & (1 << i)) {
-		drive->cyl  = drive->bios_cyl  = ide_disks_chs[i].cyl;
-		drive->head = drive->bios_head = ide_disks_chs[i].head;
-		drive->sect = drive->bios_sect = ide_disks_chs[i].sect;
-
-		printk(KERN_INFO "ide: forcing %s as a disk (%d/%d/%d)\n",
-				 drive->name,
-				 drive->cyl, drive->head, drive->sect);
-
-		drive->dev_flags |= IDE_DFLAG_FORCED_GEOM | IDE_DFLAG_PRESENT;
-		drive->media = ide_disk;
-		drive->ready_stat = ATA_DRDY;
-	}
-}
-
-static unsigned int ide_ignore_cable;
-
-static int ide_set_ignore_cable(const char *s, const struct kernel_param *kp)
-{
-	int i, j = 1;
-
-	/* controller (ignore) */
-	/* controller : 1 (ignore) | 0 (use) */
-	if (sscanf(s, "%d:%d", &i, &j) != 2 && sscanf(s, "%d", &i) != 1)
-		return -EINVAL;
-
-	if (i >= MAX_HWIFS || j < 0 || j > 1)
-		return -EINVAL;
-
-	if (j)
-		ide_ignore_cable |= (1 << i);
-	else
-		ide_ignore_cable &= ~(1 << i);
-
-	return 0;
-}
-
-module_param_call(ignore_cable, ide_set_ignore_cable, NULL, NULL, 0);
-MODULE_PARM_DESC(ignore_cable, "ignore cable detection");
-
-void ide_port_apply_params(ide_hwif_t *hwif)
-{
-	ide_drive_t *drive;
-	int i;
-
-	if (ide_ignore_cable & (1 << hwif->index)) {
-		printk(KERN_INFO "ide: ignoring cable detection for %s\n",
-				 hwif->name);
-		hwif->cbl = ATA_CBL_PATA40_SHORT;
-	}
-
-	ide_port_for_each_dev(i, drive, hwif)
-		ide_dev_apply_params(drive, i);
-}
-
-/*
- * This is gets invoked once during initialization, to set *everything* up
- */
-static int __init ide_init(void)
-{
-	int ret;
-
-	printk(KERN_INFO "Uniform Multi-Platform E-IDE driver\n");
-
-	ret = bus_register(&ide_bus_type);
-	if (ret < 0) {
-		printk(KERN_WARNING "IDE: bus_register error: %d\n", ret);
-		return ret;
-	}
-
-	ide_port_class = class_create(THIS_MODULE, "ide_port");
-	if (IS_ERR(ide_port_class)) {
-		ret = PTR_ERR(ide_port_class);
-		goto out_port_class;
-	}
-
-	ide_acpi_init();
-
-	proc_ide_create();
-
-	return 0;
-
-out_port_class:
-	bus_unregister(&ide_bus_type);
-
-	return ret;
-}
-
-static void __exit ide_exit(void)
-{
-	proc_ide_destroy();
-
-	class_destroy(ide_port_class);
-
-	bus_unregister(&ide_bus_type);
-}
-
-module_init(ide_init);
-module_exit(ide_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ide_platform.c b/drivers/ide/ide_platform.c
deleted file mode 100644
index 91639fd6c276..000000000000
--- a/drivers/ide/ide_platform.c
+++ /dev/null
@@ -1,133 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Platform IDE driver
- *
- * Copyright (C) 2007 MontaVista Software
- *
- * Maintainer: Kumar Gala <galak@kernel.crashing.org>
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/ide.h>
-#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/ata_platform.h>
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-static void plat_ide_setup_ports(struct ide_hw *hw, void __iomem *base,
-				 void __iomem *ctrl,
-				 struct pata_platform_info *pdata, int irq)
-{
-	unsigned long port = (unsigned long)base;
-	int i;
-
-	hw->io_ports.data_addr = port;
-
-	port += (1 << pdata->ioport_shift);
-	for (i = 1; i <= 7;
-	     i++, port += (1 << pdata->ioport_shift))
-		hw->io_ports_array[i] = port;
-
-	hw->io_ports.ctl_addr = (unsigned long)ctrl;
-
-	hw->irq = irq;
-}
-
-static const struct ide_port_info platform_ide_port_info = {
-	.host_flags		= IDE_HFLAG_NO_DMA,
-	.chipset		= ide_generic,
-};
-
-static int plat_ide_probe(struct platform_device *pdev)
-{
-	struct resource *res_base, *res_alt, *res_irq;
-	void __iomem *base, *alt_base;
-	struct pata_platform_info *pdata;
-	struct ide_host *host;
-	int ret = 0, mmio = 0;
-	struct ide_hw hw, *hws[] = { &hw };
-	struct ide_port_info d = platform_ide_port_info;
-
-	pdata = dev_get_platdata(&pdev->dev);
-
-	/* get a pointer to the register memory */
-	res_base = platform_get_resource(pdev, IORESOURCE_IO, 0);
-	res_alt = platform_get_resource(pdev, IORESOURCE_IO, 1);
-
-	if (!res_base || !res_alt) {
-		res_base = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-		res_alt = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-		if (!res_base || !res_alt) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		mmio = 1;
-	}
-
-	res_irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-	if (!res_irq) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (mmio) {
-		base = devm_ioremap(&pdev->dev,
-			res_base->start, resource_size(res_base));
-		alt_base = devm_ioremap(&pdev->dev,
-			res_alt->start, resource_size(res_alt));
-	} else {
-		base = devm_ioport_map(&pdev->dev,
-			res_base->start, resource_size(res_base));
-		alt_base = devm_ioport_map(&pdev->dev,
-			res_alt->start, resource_size(res_alt));
-	}
-
-	memset(&hw, 0, sizeof(hw));
-	plat_ide_setup_ports(&hw, base, alt_base, pdata, res_irq->start);
-	hw.dev = &pdev->dev;
-
-	d.irq_flags = res_irq->flags & IRQF_TRIGGER_MASK;
-	if (res_irq->flags & IORESOURCE_IRQ_SHAREABLE)
-		d.irq_flags |= IRQF_SHARED;
-
-	if (mmio)
-		d.host_flags |= IDE_HFLAG_MMIO;
-
-	ret = ide_host_add(&d, hws, 1, &host);
-	if (ret)
-		goto out;
-
-	platform_set_drvdata(pdev, host);
-
-	return 0;
-
-out:
-	return ret;
-}
-
-static int plat_ide_remove(struct platform_device *pdev)
-{
-	struct ide_host *host = dev_get_drvdata(&pdev->dev);
-
-	ide_host_remove(host);
-
-	return 0;
-}
-
-static struct platform_driver platform_ide_driver = {
-	.driver = {
-		.name = "pata_platform",
-	},
-	.probe = plat_ide_probe,
-	.remove = plat_ide_remove,
-};
-
-module_platform_driver(platform_ide_driver);
-
-MODULE_DESCRIPTION("Platform IDE driver");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:pata_platform");
diff --git a/drivers/ide/it8172.c b/drivers/ide/it8172.c
deleted file mode 100644
index b6f674ab4fb7..000000000000
--- a/drivers/ide/it8172.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- *
- * BRIEF MODULE DESCRIPTION
- *      IT8172 IDE controller support
- *
- * Copyright (C) 2000 MontaVista Software Inc.
- * Copyright (C) 2008 Shane McDonald
- *
- *  This program is free software; you can redistribute  it and/or modify it
- *  under  the terms of  the GNU General  Public License as published by the
- *  Free Software Foundation;  either version 2 of the  License, or (at your
- *  option) any later version.
- *
- *  THIS  SOFTWARE  IS PROVIDED   ``AS  IS'' AND   ANY  EXPRESS OR IMPLIED
- *  WARRANTIES,   INCLUDING, BUT NOT  LIMITED  TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
- *  NO  EVENT  SHALL   THE AUTHOR  BE    LIABLE FOR ANY   DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- *  NOT LIMITED   TO, PROCUREMENT OF  SUBSTITUTE GOODS  OR SERVICES; LOSS OF
- *  USE, DATA,  OR PROFITS; OR  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- *  ANY THEORY OF LIABILITY, WHETHER IN  CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *  You should have received a copy of the  GNU General Public License along
- *  with this program; if not, write  to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/ioport.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "IT8172"
-
-static void it8172_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u16 drive_enables;
-	u32 drive_timing;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	/*
-	 * The highest value of DIOR/DIOW pulse width and recovery time
-	 * that can be set in the IT8172 is 8 PCI clock cycles.  As a result,
-	 * it cannot be configured for PIO mode 0.  This table sets these
-	 * parameters to the maximum supported by the IT8172.
-	 */
-	static const u8 timings[] = { 0x3f, 0x3c, 0x1b, 0x12, 0x0a };
-
-	pci_read_config_word(dev, 0x40, &drive_enables);
-	pci_read_config_dword(dev, 0x44, &drive_timing);
-
-	/*
-	 * Enable port 0x44. The IT8172 spec is confused; it calls
-	 * this register the "Slave IDE Timing Register", but in fact,
-	 * it controls timing for both master and slave drives.
-	 */
-	drive_enables |= 0x4000;
-
-	drive_enables &= drive->dn ? 0xc006 : 0xc060;
-	if (drive->media == ide_disk)
-		/* enable prefetch */
-		drive_enables |= 0x0004 << (drive->dn * 4);
-	if (ide_pio_need_iordy(drive, pio))
-		/* enable IORDY sample-point */
-		drive_enables |= 0x0002 << (drive->dn * 4);
-
-	drive_timing &= drive->dn ? 0x00003f00 : 0x000fc000;
-	drive_timing |= timings[pio] << (drive->dn * 6 + 8);
-
-	pci_write_config_word(dev, 0x40, drive_enables);
-	pci_write_config_dword(dev, 0x44, drive_timing);
-}
-
-static void it8172_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	int a_speed		= 3 << (drive->dn * 4);
-	int u_flag		= 1 << drive->dn;
-	int u_speed		= 0;
-	u8 reg48, reg4a;
-	const u8 speed		= drive->dma_mode;
-
-	pci_read_config_byte(dev, 0x48, &reg48);
-	pci_read_config_byte(dev, 0x4a, &reg4a);
-
-	if (speed >= XFER_UDMA_0) {
-		u8 udma = speed - XFER_UDMA_0;
-		u_speed = udma << (drive->dn * 4);
-
-		pci_write_config_byte(dev, 0x48, reg48 | u_flag);
-		reg4a &= ~a_speed;
-		pci_write_config_byte(dev, 0x4a, reg4a | u_speed);
-	} else {
-		const u8 mwdma_to_pio[] = { 0, 3, 4 };
-
-		pci_write_config_byte(dev, 0x48, reg48 & ~u_flag);
-		pci_write_config_byte(dev, 0x4a, reg4a & ~a_speed);
-
-		drive->pio_mode =
-			mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0;
-
-		it8172_set_pio_mode(hwif, drive);
-	}
-}
-
-
-static const struct ide_port_ops it8172_port_ops = {
-	.set_pio_mode	= it8172_set_pio_mode,
-	.set_dma_mode	= it8172_set_dma_mode,
-};
-
-static const struct ide_port_info it8172_port_info = {
-	.name		= DRV_NAME,
-	.port_ops	= &it8172_port_ops,
-	.enablebits	= { {0x41, 0x80, 0x80}, {0x00, 0x00, 0x00} },
-	.host_flags	= IDE_HFLAG_SINGLE,
-	.pio_mask	= ATA_PIO4 & ~ATA_PIO0,
-	.mwdma_mask	= ATA_MWDMA2,
-	.udma_mask	= ATA_UDMA2,
-};
-
-static int it8172_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	if ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE)
-		return -ENODEV; /* IT8172 is more than an IDE controller */
-	return ide_pci_init_one(dev, &it8172_port_info, NULL);
-}
-
-static struct pci_device_id it8172_pci_tbl[] = {
-	{ PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8172), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, it8172_pci_tbl);
-
-static struct pci_driver it8172_pci_driver = {
-	.name		= "IT8172_IDE",
-	.id_table	= it8172_pci_tbl,
-	.probe		= it8172_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init it8172_ide_init(void)
-{
-	return ide_pci_register_driver(&it8172_pci_driver);
-}
-
-static void __exit it8172_ide_exit(void)
-{
-	pci_unregister_driver(&it8172_pci_driver);
-}
-
-module_init(it8172_ide_init);
-module_exit(it8172_ide_exit);
-
-MODULE_AUTHOR("Steve Longerbeam");
-MODULE_DESCRIPTION("PCI driver module for ITE 8172 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/it8213.c b/drivers/ide/it8213.c
deleted file mode 100644
index d0bf4430c437..000000000000
--- a/drivers/ide/it8213.c
+++ /dev/null
@@ -1,217 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * ITE 8213 IDE driver
- *
- * Copyright (C) 2006 Jack Lee
- * Copyright (C) 2006 Alan Cox
- * Copyright (C) 2007 Bartlomiej Zolnierkiewicz
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "it8213"
-
-/**
- *	it8213_set_pio_mode	-	set host controller for PIO mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Set the interface PIO mode.
- */
-
-static void it8213_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	int is_slave		= drive->dn & 1;
-	int master_port		= 0x40;
-	int slave_port		= 0x44;
-	unsigned long flags;
-	u16 master_data;
-	u8 slave_data;
-	static DEFINE_SPINLOCK(tune_lock);
-	int control = 0;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	static const u8 timings[][2] = {
-					{ 0, 0 },
-					{ 0, 0 },
-					{ 1, 0 },
-					{ 2, 1 },
-					{ 2, 3 }, };
-
-	spin_lock_irqsave(&tune_lock, flags);
-	pci_read_config_word(dev, master_port, &master_data);
-
-	if (pio > 1)
-		control |= 1;	/* Programmable timing on */
-	if (drive->media != ide_disk)
-		control |= 4;	/* ATAPI */
-	if (ide_pio_need_iordy(drive, pio))
-		control |= 2;	/* IORDY */
-	if (is_slave) {
-		master_data |=  0x4000;
-		master_data &= ~0x0070;
-		if (pio > 1)
-			master_data = master_data | (control << 4);
-		pci_read_config_byte(dev, slave_port, &slave_data);
-		slave_data = slave_data & 0xf0;
-		slave_data = slave_data | (timings[pio][0] << 2) | timings[pio][1];
-	} else {
-		master_data &= ~0x3307;
-		if (pio > 1)
-			master_data = master_data | control;
-		master_data = master_data | (timings[pio][0] << 12) | (timings[pio][1] << 8);
-	}
-	pci_write_config_word(dev, master_port, master_data);
-	if (is_slave)
-		pci_write_config_byte(dev, slave_port, slave_data);
-	spin_unlock_irqrestore(&tune_lock, flags);
-}
-
-/**
- *	it8213_set_dma_mode	-	set host controller for DMA mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Tune the ITE chipset for the DMA mode.
- */
-
-static void it8213_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u8 maslave		= 0x40;
-	int a_speed		= 3 << (drive->dn * 4);
-	int u_flag		= 1 << drive->dn;
-	int v_flag		= 0x01 << drive->dn;
-	int w_flag		= 0x10 << drive->dn;
-	int u_speed		= 0;
-	u16			reg4042, reg4a;
-	u8			reg48, reg54, reg55;
-	const u8 speed		= drive->dma_mode;
-
-	pci_read_config_word(dev, maslave, &reg4042);
-	pci_read_config_byte(dev, 0x48, &reg48);
-	pci_read_config_word(dev, 0x4a, &reg4a);
-	pci_read_config_byte(dev, 0x54, &reg54);
-	pci_read_config_byte(dev, 0x55, &reg55);
-
-	if (speed >= XFER_UDMA_0) {
-		u8 udma = speed - XFER_UDMA_0;
-
-		u_speed = min_t(u8, 2 - (udma & 1), udma) << (drive->dn * 4);
-
-		if (!(reg48 & u_flag))
-			pci_write_config_byte(dev, 0x48, reg48 | u_flag);
-		if (speed >= XFER_UDMA_5)
-			pci_write_config_byte(dev, 0x55, (u8) reg55|w_flag);
-		else
-			pci_write_config_byte(dev, 0x55, (u8) reg55 & ~w_flag);
-
-		if ((reg4a & a_speed) != u_speed)
-			pci_write_config_word(dev, 0x4a, (reg4a & ~a_speed) | u_speed);
-		if (speed > XFER_UDMA_2) {
-			if (!(reg54 & v_flag))
-				pci_write_config_byte(dev, 0x54, reg54 | v_flag);
-		} else
-			pci_write_config_byte(dev, 0x54, reg54 & ~v_flag);
-	} else {
-		const u8 mwdma_to_pio[] = { 0, 3, 4 };
-
-		if (reg48 & u_flag)
-			pci_write_config_byte(dev, 0x48, reg48 & ~u_flag);
-		if (reg4a & a_speed)
-			pci_write_config_word(dev, 0x4a, reg4a & ~a_speed);
-		if (reg54 & v_flag)
-			pci_write_config_byte(dev, 0x54, reg54 & ~v_flag);
-		if (reg55 & w_flag)
-			pci_write_config_byte(dev, 0x55, (u8) reg55 & ~w_flag);
-
-		if (speed >= XFER_MW_DMA_0)
-			drive->pio_mode =
-				mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0;
-		else
-			drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */
-
-		it8213_set_pio_mode(hwif, drive);
-	}
-}
-
-static u8 it8213_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u8 reg42h = 0;
-
-	pci_read_config_byte(dev, 0x42, &reg42h);
-
-	return (reg42h & 0x02) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
-}
-
-static const struct ide_port_ops it8213_port_ops = {
-	.set_pio_mode		= it8213_set_pio_mode,
-	.set_dma_mode		= it8213_set_dma_mode,
-	.cable_detect		= it8213_cable_detect,
-};
-
-static const struct ide_port_info it8213_chipset = {
-	.name		= DRV_NAME,
-	.enablebits	= { {0x41, 0x80, 0x80} },
-	.port_ops	= &it8213_port_ops,
-	.host_flags	= IDE_HFLAG_SINGLE,
-	.pio_mask	= ATA_PIO4,
-	.swdma_mask	= ATA_SWDMA2_ONLY,
-	.mwdma_mask	= ATA_MWDMA12_ONLY,
-	.udma_mask	= ATA_UDMA6,
-};
-
-/**
- *	it8213_init_one	-	pci layer discovery entry
- *	@dev: PCI device
- *	@id: ident table entry
- *
- *	Called by the PCI code when it finds an ITE8213 controller. As
- *	this device follows the standard interfaces we can use the
- *	standard helper functions to do almost all the work for us.
- */
-
-static int it8213_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &it8213_chipset, NULL);
-}
-
-static const struct pci_device_id it8213_pci_tbl[] = {
-	{ PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8213), 0 },
-	{ 0, },
-};
-
-MODULE_DEVICE_TABLE(pci, it8213_pci_tbl);
-
-static struct pci_driver it8213_pci_driver = {
-	.name		= "ITE8213_IDE",
-	.id_table	= it8213_pci_tbl,
-	.probe		= it8213_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init it8213_ide_init(void)
-{
-	return ide_pci_register_driver(&it8213_pci_driver);
-}
-
-static void __exit it8213_ide_exit(void)
-{
-	pci_unregister_driver(&it8213_pci_driver);
-}
-
-module_init(it8213_ide_init);
-module_exit(it8213_ide_exit);
-
-MODULE_AUTHOR("Jack Lee, Alan Cox");
-MODULE_DESCRIPTION("PCI driver module for the ITE 8213");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/it821x.c b/drivers/ide/it821x.c
deleted file mode 100644
index 36a64c8ea575..000000000000
--- a/drivers/ide/it821x.c
+++ /dev/null
@@ -1,715 +0,0 @@
-/*
- * Copyright (C) 2004		Red Hat
- * Copyright (C) 2007		Bartlomiej Zolnierkiewicz
- *
- *  May be copied or modified under the terms of the GNU General Public License
- *  Based in part on the ITE vendor provided SCSI driver.
- *
- *  Documentation:
- *	Datasheet is freely available, some other documents under NDA.
- *
- *  The ITE8212 isn't exactly a standard IDE controller. It has two
- *  modes. In pass through mode then it is an IDE controller. In its smart
- *  mode its actually quite a capable hardware raid controller disguised
- *  as an IDE controller. Smart mode only understands DMA read/write and
- *  identify, none of the fancier commands apply. The IT8211 is identical
- *  in other respects but lacks the raid mode.
- *
- *  Errata:
- *  o	Rev 0x10 also requires master/slave hold the same DMA timings and
- *	cannot do ATAPI MWDMA.
- *  o	The identify data for raid volumes lacks CHS info (technically ok)
- *	but also fails to set the LBA28 and other bits. We fix these in
- *	the IDE probe quirk code.
- *  o	If you write LBA48 sized I/O's (ie > 256 sector) in smart mode
- *	raid then the controller firmware dies
- *  o	Smart mode without RAID doesn't clear all the necessary identify
- *	bits to reduce the command set to the one used
- *
- *  This has a few impacts on the driver
- *  - In pass through mode we do all the work you would expect
- *  - In smart mode the clocking set up is done by the controller generally
- *    but we must watch the other limits and filter.
- *  - There are a few extra vendor commands that actually talk to the
- *    controller but only work PIO with no IRQ.
- *
- *  Vendor areas of the identify block in smart mode are used for the
- *  timing and policy set up. Each HDD in raid mode also has a serial
- *  block on the disk. The hardware extra commands are get/set chip status,
- *  rebuild, get rebuild status.
- *
- *  In Linux the driver supports pass through mode as if the device was
- *  just another IDE controller. If the smart mode is running then
- *  volumes are managed by the controller firmware and each IDE "disk"
- *  is a raid volume. Even more cute - the controller can do automated
- *  hotplug and rebuild.
- *
- *  The pass through controller itself is a little demented. It has a
- *  flaw that it has a single set of PIO/MWDMA timings per channel so
- *  non UDMA devices restrict each others performance. It also has a
- *  single clock source per channel so mixed UDMA100/133 performance
- *  isn't perfect and we have to pick a clock. Thankfully none of this
- *  matters in smart mode. ATAPI DMA is not currently supported.
- *
- *  It seems the smart mode is a win for RAID1/RAID10 but otherwise not.
- *
- *  TODO
- *	-	ATAPI UDMA is ok but not MWDMA it seems
- *	-	RAID configuration ioctls
- *	-	Move to libata once it grows up
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "it821x"
-
-#define QUIRK_VORTEX86 1
-
-struct it821x_dev
-{
-	unsigned int smart:1,		/* Are we in smart raid mode */
-		timing10:1;		/* Rev 0x10 */
-	u8	clock_mode;		/* 0, ATA_50 or ATA_66 */
-	u8	want[2][2];		/* Mode/Pri log for master slave */
-	/* We need these for switching the clock when DMA goes on/off
-	   The high byte is the 66Mhz timing */
-	u16	pio[2];			/* Cached PIO values */
-	u16	mwdma[2];		/* Cached MWDMA values */
-	u16	udma[2];		/* Cached UDMA values (per drive) */
-	u16	quirks;
-};
-
-#define ATA_66		0
-#define ATA_50		1
-#define ATA_ANY		2
-
-#define UDMA_OFF	0
-#define MWDMA_OFF	0
-
-/*
- *	We allow users to force the card into non raid mode without
- *	flashing the alternative BIOS. This is also necessary right now
- *	for embedded platforms that cannot run a PC BIOS but are using this
- *	device.
- */
-
-static int it8212_noraid;
-
-/**
- *	it821x_program	-	program the PIO/MWDMA registers
- *	@drive: drive to tune
- *	@timing: timing info
- *
- *	Program the PIO/MWDMA timing for this channel according to the
- *	current clock.
- */
-
-static void it821x_program(ide_drive_t *drive, u16 timing)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
-	int channel = hwif->channel;
-	u8 conf;
-
-	/* Program PIO/MWDMA timing bits */
-	if(itdev->clock_mode == ATA_66)
-		conf = timing >> 8;
-	else
-		conf = timing & 0xFF;
-
-	pci_write_config_byte(dev, 0x54 + 4 * channel, conf);
-}
-
-/**
- *	it821x_program_udma	-	program the UDMA registers
- *	@drive: drive to tune
- *	@timing: timing info
- *
- *	Program the UDMA timing for this drive according to the
- *	current clock.
- */
-
-static void it821x_program_udma(ide_drive_t *drive, u16 timing)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
-	int channel = hwif->channel;
-	u8 unit = drive->dn & 1, conf;
-
-	/* Program UDMA timing bits */
-	if(itdev->clock_mode == ATA_66)
-		conf = timing >> 8;
-	else
-		conf = timing & 0xFF;
-
-	if (itdev->timing10 == 0)
-		pci_write_config_byte(dev, 0x56 + 4 * channel + unit, conf);
-	else {
-		pci_write_config_byte(dev, 0x56 + 4 * channel, conf);
-		pci_write_config_byte(dev, 0x56 + 4 * channel + 1, conf);
-	}
-}
-
-/**
- *	it821x_clock_strategy
- *	@drive: drive to set up
- *
- *	Select between the 50 and 66Mhz base clocks to get the best
- *	results for this interface.
- */
-
-static void it821x_clock_strategy(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
-	ide_drive_t *pair = ide_get_pair_dev(drive);
-	int clock, altclock, sel = 0;
-	u8 unit = drive->dn & 1, v;
-
-	if(itdev->want[0][0] > itdev->want[1][0]) {
-		clock = itdev->want[0][1];
-		altclock = itdev->want[1][1];
-	} else {
-		clock = itdev->want[1][1];
-		altclock = itdev->want[0][1];
-	}
-
-	/*
-	 * if both clocks can be used for the mode with the higher priority
-	 * use the clock needed by the mode with the lower priority
-	 */
-	if (clock == ATA_ANY)
-		clock = altclock;
-
-	/* Nobody cares - keep the same clock */
-	if(clock == ATA_ANY)
-		return;
-	/* No change */
-	if(clock == itdev->clock_mode)
-		return;
-
-	/* Load this into the controller ? */
-	if(clock == ATA_66)
-		itdev->clock_mode = ATA_66;
-	else {
-		itdev->clock_mode = ATA_50;
-		sel = 1;
-	}
-
-	pci_read_config_byte(dev, 0x50, &v);
-	v &= ~(1 << (1 + hwif->channel));
-	v |= sel << (1 + hwif->channel);
-	pci_write_config_byte(dev, 0x50, v);
-
-	/*
-	 *	Reprogram the UDMA/PIO of the pair drive for the switch
-	 *	MWDMA will be dealt with by the dma switcher
-	 */
-	if(pair && itdev->udma[1-unit] != UDMA_OFF) {
-		it821x_program_udma(pair, itdev->udma[1-unit]);
-		it821x_program(pair, itdev->pio[1-unit]);
-	}
-	/*
-	 *	Reprogram the UDMA/PIO of our drive for the switch.
-	 *	MWDMA will be dealt with by the dma switcher
-	 */
-	if(itdev->udma[unit] != UDMA_OFF) {
-		it821x_program_udma(drive, itdev->udma[unit]);
-		it821x_program(drive, itdev->pio[unit]);
-	}
-}
-
-/**
- *	it821x_set_pio_mode	-	set host controller for PIO mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Tune the host to the desired PIO mode taking into the consideration
- *	the maximum PIO mode supported by the other device on the cable.
- */
-
-static void it821x_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
-	ide_drive_t *pair = ide_get_pair_dev(drive);
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-	u8 unit = drive->dn & 1, set_pio = pio;
-
-	/* Spec says 89 ref driver uses 88 */
-	static u16 pio_timings[]= { 0xAA88, 0xA382, 0xA181, 0x3332, 0x3121 };
-	static u8 pio_want[]    = { ATA_66, ATA_66, ATA_66, ATA_66, ATA_ANY };
-
-	/*
-	 * Compute the best PIO mode we can for a given device. We must
-	 * pick a speed that does not cause problems with the other device
-	 * on the cable.
-	 */
-	if (pair) {
-		u8 pair_pio = pair->pio_mode - XFER_PIO_0;
-		/* trim PIO to the slowest of the master/slave */
-		if (pair_pio < set_pio)
-			set_pio = pair_pio;
-	}
-
-	/* We prefer 66Mhz clock for PIO 0-3, don't care for PIO4 */
-	itdev->want[unit][1] = pio_want[set_pio];
-	itdev->want[unit][0] = 1;	/* PIO is lowest priority */
-	itdev->pio[unit] = pio_timings[set_pio];
-	it821x_clock_strategy(drive);
-	it821x_program(drive, itdev->pio[unit]);
-}
-
-/**
- *	it821x_tune_mwdma	-	tune a channel for MWDMA
- *	@drive: drive to set up
- *	@mode_wanted: the target operating mode
- *
- *	Load the timing settings for this device mode into the
- *	controller when doing MWDMA in pass through mode. The caller
- *	must manage the whole lack of per device MWDMA/PIO timings and
- *	the shared MWDMA/PIO timing register.
- */
-
-static void it821x_tune_mwdma(ide_drive_t *drive, u8 mode_wanted)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	struct it821x_dev *itdev = (void *)ide_get_hwifdata(hwif);
-	u8 unit = drive->dn & 1, channel = hwif->channel, conf;
-
-	static u16 dma[]	= { 0x8866, 0x3222, 0x3121 };
-	static u8 mwdma_want[]	= { ATA_ANY, ATA_66, ATA_ANY };
-
-	itdev->want[unit][1] = mwdma_want[mode_wanted];
-	itdev->want[unit][0] = 2;	/* MWDMA is low priority */
-	itdev->mwdma[unit] = dma[mode_wanted];
-	itdev->udma[unit] = UDMA_OFF;
-
-	/* UDMA bits off - Revision 0x10 do them in pairs */
-	pci_read_config_byte(dev, 0x50, &conf);
-	if (itdev->timing10)
-		conf |= channel ? 0x60: 0x18;
-	else
-		conf |= 1 << (3 + 2 * channel + unit);
-	pci_write_config_byte(dev, 0x50, conf);
-
-	it821x_clock_strategy(drive);
-	/* FIXME: do we need to program this ? */
-	/* it821x_program(drive, itdev->mwdma[unit]); */
-}
-
-/**
- *	it821x_tune_udma	-	tune a channel for UDMA
- *	@drive: drive to set up
- *	@mode_wanted: the target operating mode
- *
- *	Load the timing settings for this device mode into the
- *	controller when doing UDMA modes in pass through.
- */
-
-static void it821x_tune_udma(ide_drive_t *drive, u8 mode_wanted)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
-	u8 unit = drive->dn & 1, channel = hwif->channel, conf;
-
-	static u16 udma[]	= { 0x4433, 0x4231, 0x3121, 0x2121, 0x1111, 0x2211, 0x1111 };
-	static u8 udma_want[]	= { ATA_ANY, ATA_50, ATA_ANY, ATA_66, ATA_66, ATA_50, ATA_66 };
-
-	itdev->want[unit][1] = udma_want[mode_wanted];
-	itdev->want[unit][0] = 3;	/* UDMA is high priority */
-	itdev->mwdma[unit] = MWDMA_OFF;
-	itdev->udma[unit] = udma[mode_wanted];
-	if(mode_wanted >= 5)
-		itdev->udma[unit] |= 0x8080;	/* UDMA 5/6 select on */
-
-	/* UDMA on. Again revision 0x10 must do the pair */
-	pci_read_config_byte(dev, 0x50, &conf);
-	if (itdev->timing10)
-		conf &= channel ? 0x9F: 0xE7;
-	else
-		conf &= ~ (1 << (3 + 2 * channel + unit));
-	pci_write_config_byte(dev, 0x50, conf);
-
-	it821x_clock_strategy(drive);
-	it821x_program_udma(drive, itdev->udma[unit]);
-
-}
-
-/**
- *	it821x_dma_read	-	DMA hook
- *	@drive: drive for DMA
- *
- *	The IT821x has a single timing register for MWDMA and for PIO
- *	operations. As we flip back and forth we have to reload the
- *	clock. In addition the rev 0x10 device only works if the same
- *	timing value is loaded into the master and slave UDMA clock
- * 	so we must also reload that.
- *
- *	FIXME: we could figure out in advance if we need to do reloads
- */
-
-static void it821x_dma_start(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
-	u8 unit = drive->dn & 1;
-
-	if(itdev->mwdma[unit] != MWDMA_OFF)
-		it821x_program(drive, itdev->mwdma[unit]);
-	else if(itdev->udma[unit] != UDMA_OFF && itdev->timing10)
-		it821x_program_udma(drive, itdev->udma[unit]);
-	ide_dma_start(drive);
-}
-
-/**
- *	it821x_dma_write	-	DMA hook
- *	@drive: drive for DMA stop
- *
- *	The IT821x has a single timing register for MWDMA and for PIO
- *	operations. As we flip back and forth we have to reload the
- *	clock.
- */
-
-static int it821x_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
-	int ret = ide_dma_end(drive);
-	u8 unit = drive->dn & 1;
-
-	if(itdev->mwdma[unit] != MWDMA_OFF)
-		it821x_program(drive, itdev->pio[unit]);
-	return ret;
-}
-
-/**
- *	it821x_set_dma_mode	-	set host controller for DMA mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Tune the ITE chipset for the desired DMA mode.
- */
-
-static void it821x_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	const u8 speed = drive->dma_mode;
-
-	/*
-	 * MWDMA tuning is really hard because our MWDMA and PIO
-	 * timings are kept in the same place.  We can switch in the
-	 * host dma on/off callbacks.
-	 */
-	if (speed >= XFER_UDMA_0 && speed <= XFER_UDMA_6)
-		it821x_tune_udma(drive, speed - XFER_UDMA_0);
-	else if (speed >= XFER_MW_DMA_0 && speed <= XFER_MW_DMA_2)
-		it821x_tune_mwdma(drive, speed - XFER_MW_DMA_0);
-}
-
-/**
- *	it821x_cable_detect	-	cable detection
- *	@hwif: interface to check
- *
- *	Check for the presence of an ATA66 capable cable on the
- *	interface. Problematic as it seems some cards don't have
- *	the needed logic onboard.
- */
-
-static u8 it821x_cable_detect(ide_hwif_t *hwif)
-{
-	/* The reference driver also only does disk side */
-	return ATA_CBL_PATA80;
-}
-
-/**
- *	it821x_quirkproc	-	post init callback
- *	@drive: drive
- *
- *	This callback is run after the drive has been probed but
- *	before anything gets attached. It allows drivers to do any
- *	final tuning that is needed, or fixups to work around bugs.
- */
-
-static void it821x_quirkproc(ide_drive_t *drive)
-{
-	struct it821x_dev *itdev = ide_get_hwifdata(drive->hwif);
-	u16 *id = drive->id;
-
-	if (!itdev->smart) {
-		/*
-		 *	If we are in pass through mode then not much
-		 *	needs to be done, but we do bother to clear the
-		 *	IRQ mask as we may well be in PIO (eg rev 0x10)
-		 *	for now and we know unmasking is safe on this chipset.
-		 */
-		drive->dev_flags |= IDE_DFLAG_UNMASK;
-	} else {
-	/*
-	 *	Perform fixups on smart mode. We need to "lose" some
-	 *	capabilities the firmware lacks but does not filter, and
-	 *	also patch up some capability bits that it forgets to set
-	 *	in RAID mode.
-	 */
-
-		/* Check for RAID v native */
-		if (strstr((char *)&id[ATA_ID_PROD],
-			   "Integrated Technology Express")) {
-			/* In raid mode the ident block is slightly buggy
-			   We need to set the bits so that the IDE layer knows
-			   LBA28. LBA48 and DMA ar valid */
-			id[ATA_ID_CAPABILITY]    |= (3 << 8); /* LBA28, DMA */
-			id[ATA_ID_COMMAND_SET_2] |= 0x0400;   /* LBA48 valid */
-			id[ATA_ID_CFS_ENABLE_2]  |= 0x0400;   /* LBA48 on */
-			/* Reporting logic */
-			printk(KERN_INFO "%s: IT8212 %sRAID %d volume",
-				drive->name, id[147] ? "Bootable " : "",
-				id[ATA_ID_CSFO]);
-			if (id[ATA_ID_CSFO] != 1)
-				printk(KERN_CONT "(%dK stripe)", id[146]);
-			printk(KERN_CONT ".\n");
-		} else {
-			/* Non RAID volume. Fixups to stop the core code
-			   doing unsupported things */
-			id[ATA_ID_FIELD_VALID]	 &= 3;
-			id[ATA_ID_QUEUE_DEPTH]	  = 0;
-			id[ATA_ID_COMMAND_SET_1]  = 0;
-			id[ATA_ID_COMMAND_SET_2] &= 0xC400;
-			id[ATA_ID_CFSSE]	 &= 0xC000;
-			id[ATA_ID_CFS_ENABLE_1]	  = 0;
-			id[ATA_ID_CFS_ENABLE_2]	 &= 0xC400;
-			id[ATA_ID_CSF_DEFAULT]	 &= 0xC000;
-			id[127]			  = 0;
-			id[ATA_ID_DLF]		  = 0;
-			id[ATA_ID_CSFO]		  = 0;
-			id[ATA_ID_CFA_POWER]	  = 0;
-			printk(KERN_INFO "%s: Performing identify fixups.\n",
-				drive->name);
-		}
-
-		/*
-		 * Set MWDMA0 mode as enabled/support - just to tell
-		 * IDE core that DMA is supported (it821x hardware
-		 * takes care of DMA mode programming).
-		 */
-		if (ata_id_has_dma(id)) {
-			id[ATA_ID_MWDMA_MODES] |= 0x0101;
-			drive->current_speed = XFER_MW_DMA_0;
-		}
-	}
-
-}
-
-static const struct ide_dma_ops it821x_pass_through_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= it821x_dma_start,
-	.dma_end		= it821x_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-/**
- *	init_hwif_it821x	-	set up hwif structs
- *	@hwif: interface to set up
- *
- *	We do the basic set up of the interface structure. The IT8212
- *	requires several custom handlers so we override the default
- *	ide DMA handlers appropriately
- */
-
-static void init_hwif_it821x(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct it821x_dev *itdevs = host->host_priv;
-	struct it821x_dev *idev = itdevs + hwif->channel;
-	u8 conf;
-
-	ide_set_hwifdata(hwif, idev);
-
-	pci_read_config_byte(dev, 0x50, &conf);
-	if (conf & 1) {
-		idev->smart = 1;
-		hwif->host_flags |= IDE_HFLAG_NO_ATAPI_DMA;
-		/* Long I/O's although allowed in LBA48 space cause the
-		   onboard firmware to enter the twighlight zone */
-		hwif->rqsize = 256;
-	}
-
-	/* Pull the current clocks from 0x50 also */
-	if (conf & (1 << (1 + hwif->channel)))
-		idev->clock_mode = ATA_50;
-	else
-		idev->clock_mode = ATA_66;
-
-	idev->want[0][1] = ATA_ANY;
-	idev->want[1][1] = ATA_ANY;
-
-	/*
-	 *	Not in the docs but according to the reference driver
-	 *	this is necessary.
-	 */
-
-	if (dev->revision == 0x10) {
-		idev->timing10 = 1;
-		hwif->host_flags |= IDE_HFLAG_NO_ATAPI_DMA;
-		if (idev->smart == 0)
-			printk(KERN_WARNING DRV_NAME " %s: revision 0x10, "
-				"workarounds activated\n", pci_name(dev));
-	}
-
-	if (idev->smart == 0) {
-		/* MWDMA/PIO clock switching for pass through mode */
-		hwif->dma_ops = &it821x_pass_through_dma_ops;
-	} else
-		hwif->host_flags |= IDE_HFLAG_NO_SET_MODE;
-
-	if (hwif->dma_base == 0)
-		return;
-
-	hwif->ultra_mask = ATA_UDMA6;
-	hwif->mwdma_mask = ATA_MWDMA2;
-
-	/* Vortex86SX quirk: prevent Ultra-DMA mode to fix BadCRC issue */
-	if (idev->quirks & QUIRK_VORTEX86) {
-		if (dev->revision == 0x11)
-			hwif->ultra_mask = 0;
-	}
-}
-
-static void it8212_disable_raid(struct pci_dev *dev)
-{
-	/* Reset local CPU, and set BIOS not ready */
-	pci_write_config_byte(dev, 0x5E, 0x01);
-
-	/* Set to bypass mode, and reset PCI bus */
-	pci_write_config_byte(dev, 0x50, 0x00);
-	pci_write_config_word(dev, PCI_COMMAND,
-			      PCI_COMMAND_PARITY | PCI_COMMAND_IO |
-			      PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
-	pci_write_config_word(dev, 0x40, 0xA0F3);
-
-	pci_write_config_dword(dev,0x4C, 0x02040204);
-	pci_write_config_byte(dev, 0x42, 0x36);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x20);
-}
-
-static int init_chipset_it821x(struct pci_dev *dev)
-{
-	u8 conf;
-	static char *mode[2] = { "pass through", "smart" };
-
-	/* Force the card into bypass mode if so requested */
-	if (it8212_noraid) {
-		printk(KERN_INFO DRV_NAME " %s: forcing bypass mode\n",
-			pci_name(dev));
-		it8212_disable_raid(dev);
-	}
-	pci_read_config_byte(dev, 0x50, &conf);
-	printk(KERN_INFO DRV_NAME " %s: controller in %s mode\n",
-		pci_name(dev), mode[conf & 1]);
-	return 0;
-}
-
-static const struct ide_port_ops it821x_port_ops = {
-	/* it821x_set_{pio,dma}_mode() are only used in pass-through mode */
-	.set_pio_mode		= it821x_set_pio_mode,
-	.set_dma_mode		= it821x_set_dma_mode,
-	.quirkproc		= it821x_quirkproc,
-	.cable_detect		= it821x_cable_detect,
-};
-
-static const struct ide_port_info it821x_chipset = {
-	.name		= DRV_NAME,
-	.init_chipset	= init_chipset_it821x,
-	.init_hwif	= init_hwif_it821x,
-	.port_ops	= &it821x_port_ops,
-	.pio_mask	= ATA_PIO4,
-};
-
-/**
- *	it821x_init_one	-	pci layer discovery entry
- *	@dev: PCI device
- *	@id: ident table entry
- *
- *	Called by the PCI code when it finds an ITE821x controller.
- *	We then use the IDE PCI generic helper to do most of the work.
- */
-
-static int it821x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct it821x_dev *itdevs;
-	int rc;
-
-	itdevs = kcalloc(2, sizeof(*itdevs), GFP_KERNEL);
-	if (itdevs == NULL) {
-		printk(KERN_ERR DRV_NAME " %s: out of memory\n", pci_name(dev));
-		return -ENOMEM;
-	}
-
-	itdevs->quirks = id->driver_data;
-
-	rc = ide_pci_init_one(dev, &it821x_chipset, itdevs);
-	if (rc)
-		kfree(itdevs);
-
-	return rc;
-}
-
-static void it821x_remove(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct it821x_dev *itdevs = host->host_priv;
-
-	ide_pci_remove(dev);
-	kfree(itdevs);
-}
-
-static const struct pci_device_id it821x_pci_tbl[] = {
-	{ PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8211), 0 },
-	{ PCI_VDEVICE(ITE, PCI_DEVICE_ID_ITE_8212), 0 },
-	{ PCI_VDEVICE(RDC, PCI_DEVICE_ID_RDC_D1010), QUIRK_VORTEX86 },
-	{ 0, },
-};
-
-MODULE_DEVICE_TABLE(pci, it821x_pci_tbl);
-
-static struct pci_driver it821x_pci_driver = {
-	.name		= "ITE821x IDE",
-	.id_table	= it821x_pci_tbl,
-	.probe		= it821x_init_one,
-	.remove		= it821x_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init it821x_ide_init(void)
-{
-	return ide_pci_register_driver(&it821x_pci_driver);
-}
-
-static void __exit it821x_ide_exit(void)
-{
-	pci_unregister_driver(&it821x_pci_driver);
-}
-
-module_init(it821x_ide_init);
-module_exit(it821x_ide_exit);
-
-module_param_named(noraid, it8212_noraid, int, S_IRUGO);
-MODULE_PARM_DESC(noraid, "Force card into bypass mode");
-
-MODULE_AUTHOR("Alan Cox");
-MODULE_DESCRIPTION("PCI driver module for the ITE 821x");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/jmicron.c b/drivers/ide/jmicron.c
deleted file mode 100644
index ae6480dcbadf..000000000000
--- a/drivers/ide/jmicron.c
+++ /dev/null
@@ -1,176 +0,0 @@
-
-/*
- * Copyright (C) 2006		Red Hat
- *
- *  May be copied or modified under the terms of the GNU General Public License
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "jmicron"
-
-typedef enum {
-	PORT_PATA0 = 0,
-	PORT_PATA1 = 1,
-	PORT_SATA = 2,
-} port_type;
-
-/**
- *	jmicron_cable_detect	-	cable detection
- *	@hwif: IDE port
- *
- *	Returns the cable type.
- */
-
-static u8 jmicron_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-
-	u32 control;
-	u32 control5;
-
-	int port = hwif->channel;
-	port_type port_map[2];
-
-	pci_read_config_dword(pdev, 0x40, &control);
-
-	/* There are two basic mappings. One has the two SATA ports merged
-	   as master/slave and the secondary as PATA, the other has only the
-	   SATA port mapped */
-	if (control & (1 << 23)) {
-		port_map[0] = PORT_SATA;
-		port_map[1] = PORT_PATA0;
-	} else {
-		port_map[0] = PORT_SATA;
-		port_map[1] = PORT_SATA;
-	}
-
-	/* The 365/366 may have this bit set to map the second PATA port
-	   as the internal primary channel */
-	pci_read_config_dword(pdev, 0x80, &control5);
-	if (control5 & (1<<24))
-		port_map[0] = PORT_PATA1;
-
-	/* The two ports may then be logically swapped by the firmware */
-	if (control & (1 << 22))
-		port = port ^ 1;
-
-	/*
-	 *	Now we know which physical port we are talking about we can
-	 *	actually do our cable checking etc. Thankfully we don't need
-	 *	to do the plumbing for other cases.
-	 */
-	switch (port_map[port]) {
-	case PORT_PATA0:
-		if (control & (1 << 3))	/* 40/80 pin primary */
-			return ATA_CBL_PATA40;
-		return ATA_CBL_PATA80;
-	case PORT_PATA1:
-		if (control5 & (1 << 19))	/* 40/80 pin secondary */
-			return ATA_CBL_PATA40;
-		return ATA_CBL_PATA80;
-	case PORT_SATA:
-		break;
-	}
-	/* Avoid bogus "control reaches end of non-void function" */
-	return ATA_CBL_PATA80;
-}
-
-static void jmicron_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-}
-
-/**
- *	jmicron_set_dma_mode	-	set host controller for DMA mode
- *	@hwif: port
- *	@drive: drive
- *
- *	As the JMicron snoops for timings we don't need to do anything here.
- */
-
-static void jmicron_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-}
-
-static const struct ide_port_ops jmicron_port_ops = {
-	.set_pio_mode		= jmicron_set_pio_mode,
-	.set_dma_mode		= jmicron_set_dma_mode,
-	.cable_detect		= jmicron_cable_detect,
-};
-
-static const struct ide_port_info jmicron_chipset = {
-	.name		= DRV_NAME,
-	.enablebits	= { { 0x40, 0x01, 0x01 }, { 0x40, 0x10, 0x10 } },
-	.port_ops	= &jmicron_port_ops,
-	.pio_mask	= ATA_PIO5,
-	.mwdma_mask	= ATA_MWDMA2,
-	.udma_mask	= ATA_UDMA6,
-};
-
-/**
- *	jmicron_init_one	-	pci layer discovery entry
- *	@dev: PCI device
- *	@id: ident table entry
- *
- *	Called by the PCI code when it finds a Jmicron controller.
- *	We then use the IDE PCI generic helper to do most of the work.
- */
-
-static int jmicron_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &jmicron_chipset, NULL);
-}
-
-/* All JMB PATA controllers have and will continue to have the same
- * interface.  Matching vendor and device class is enough for all
- * current and future controllers if the controller is programmed
- * properly.
- *
- * If libata is configured, jmicron PCI quirk programs the controller
- * into the correct mode.  If libata isn't configured, match known
- * device IDs too to maintain backward compatibility.
- */
-static struct pci_device_id jmicron_pci_tbl[] = {
-#if !defined(CONFIG_ATA) && !defined(CONFIG_ATA_MODULE)
-	{ PCI_VDEVICE(JMICRON, PCI_DEVICE_ID_JMICRON_JMB361) },
-	{ PCI_VDEVICE(JMICRON, PCI_DEVICE_ID_JMICRON_JMB363) },
-	{ PCI_VDEVICE(JMICRON, PCI_DEVICE_ID_JMICRON_JMB365) },
-	{ PCI_VDEVICE(JMICRON, PCI_DEVICE_ID_JMICRON_JMB366) },
-	{ PCI_VDEVICE(JMICRON, PCI_DEVICE_ID_JMICRON_JMB368) },
-#endif
-	{ PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
-	  PCI_CLASS_STORAGE_IDE << 8, 0xffff00, 0 },
-	{ 0, },
-};
-
-MODULE_DEVICE_TABLE(pci, jmicron_pci_tbl);
-
-static struct pci_driver jmicron_pci_driver = {
-	.name		= "JMicron IDE",
-	.id_table	= jmicron_pci_tbl,
-	.probe		= jmicron_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init jmicron_ide_init(void)
-{
-	return ide_pci_register_driver(&jmicron_pci_driver);
-}
-
-static void __exit jmicron_ide_exit(void)
-{
-	pci_unregister_driver(&jmicron_pci_driver);
-}
-
-module_init(jmicron_ide_init);
-module_exit(jmicron_ide_exit);
-
-MODULE_AUTHOR("Alan Cox");
-MODULE_DESCRIPTION("PCI driver module for the JMicron in legacy modes");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/ns87415.c b/drivers/ide/ns87415.c
deleted file mode 100644
index 11a672aba6ee..000000000000
--- a/drivers/ide/ns87415.c
+++ /dev/null
@@ -1,350 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 1997-1998	Mark Lord <mlord@pobox.com>
- * Copyright (C) 1998		Eddie C. Dost <ecd@skynet.be>
- * Copyright (C) 1999-2000	Andre Hedrick <andre@linux-ide.org>
- * Copyright (C) 2004		Grant Grundler <grundler at parisc-linux.org>
- *
- * Inspired by an earlier effort from David S. Miller <davem@redhat.com>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "ns87415"
-
-#ifdef CONFIG_SUPERIO
-/* SUPERIO 87560 is a PoS chip that NatSem denies exists.
- * Unfortunately, it's built-in on all Astro-based PA-RISC workstations
- * which use the integrated NS87514 cell for CD-ROM support.
- * i.e we have to support for CD-ROM installs.
- * See drivers/parisc/superio.c for more gory details.
- */
-#include <asm/superio.h>
-
-#define SUPERIO_IDE_MAX_RETRIES 25
-
-/* Because of a defect in Super I/O, all reads of the PCI DMA status 
- * registers, IDE status register and the IDE select register need to be 
- * retried
- */
-static u8 superio_ide_inb (unsigned long port)
-{
-	u8 tmp;
-	int retries = SUPERIO_IDE_MAX_RETRIES;
-
-	/* printk(" [ reading port 0x%x with retry ] ", port); */
-
-	do {
-		tmp = inb(port);
-		if (tmp == 0)
-			udelay(50);
-	} while (tmp == 0 && retries-- > 0);
-
-	return tmp;
-}
-
-static u8 superio_read_status(ide_hwif_t *hwif)
-{
-	return superio_ide_inb(hwif->io_ports.status_addr);
-}
-
-static u8 superio_dma_sff_read_status(ide_hwif_t *hwif)
-{
-	return superio_ide_inb(hwif->dma_base + ATA_DMA_STATUS);
-}
-
-static void superio_tf_read(ide_drive_t *drive, struct ide_taskfile *tf,
-			    u8 valid)
-{
-	struct ide_io_ports *io_ports = &drive->hwif->io_ports;
-
-	if (valid & IDE_VALID_ERROR)
-		tf->error  = inb(io_ports->feature_addr);
-	if (valid & IDE_VALID_NSECT)
-		tf->nsect  = inb(io_ports->nsect_addr);
-	if (valid & IDE_VALID_LBAL)
-		tf->lbal   = inb(io_ports->lbal_addr);
-	if (valid & IDE_VALID_LBAM)
-		tf->lbam   = inb(io_ports->lbam_addr);
-	if (valid & IDE_VALID_LBAH)
-		tf->lbah   = inb(io_ports->lbah_addr);
-	if (valid & IDE_VALID_DEVICE)
-		tf->device = superio_ide_inb(io_ports->device_addr);
-}
-
-static void ns87415_dev_select(ide_drive_t *drive);
-
-static const struct ide_tp_ops superio_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= superio_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= ns87415_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= superio_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
-
-static void superio_init_iops(struct hwif_s *hwif)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	u32 dma_stat;
-	u8 port = hwif->channel, tmp;
-
-	dma_stat = (pci_resource_start(pdev, 4) & ~3) + (!port ? 2 : 0xa);
-
-	/* Clear error/interrupt, enable dma */
-	tmp = superio_ide_inb(dma_stat);
-	outb(tmp | 0x66, dma_stat);
-}
-#else
-#define superio_dma_sff_read_status ide_dma_sff_read_status
-#endif
-
-static unsigned int ns87415_count = 0, ns87415_control[MAX_HWIFS] = { 0 };
-
-/*
- * This routine either enables/disables (according to IDE_DFLAG_PRESENT)
- * the IRQ associated with the port,
- * and selects either PIO or DMA handshaking for the next I/O operation.
- */
-static void ns87415_prepare_drive (ide_drive_t *drive, unsigned int use_dma)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	unsigned int bit, other, new, *old = (unsigned int *) hwif->select_data;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	new = *old;
-
-	/* Adjust IRQ enable bit */
-	bit = 1 << (8 + hwif->channel);
-
-	if (drive->dev_flags & IDE_DFLAG_PRESENT)
-		new &= ~bit;
-	else
-		new |= bit;
-
-	/* Select PIO or DMA, DMA may only be selected for one drive/channel. */
-	bit   = 1 << (20 + (drive->dn & 1) + (hwif->channel << 1));
-	other = 1 << (20 + (1 - (drive->dn & 1)) + (hwif->channel << 1));
-	new = use_dma ? ((new & ~other) | bit) : (new & ~bit);
-
-	if (new != *old) {
-		unsigned char stat;
-
-		/*
-		 * Don't change DMA engine settings while Write Buffers
-		 * are busy.
-		 */
-		(void) pci_read_config_byte(dev, 0x43, &stat);
-		while (stat & 0x03) {
-			udelay(1);
-			(void) pci_read_config_byte(dev, 0x43, &stat);
-		}
-
-		*old = new;
-		(void) pci_write_config_dword(dev, 0x40, new);
-
-		/*
-		 * And let things settle...
-		 */
-		udelay(10);
-	}
-
-	local_irq_restore(flags);
-}
-
-static void ns87415_dev_select(ide_drive_t *drive)
-{
-	ns87415_prepare_drive(drive,
-			      !!(drive->dev_flags & IDE_DFLAG_USING_DMA));
-
-	outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr);
-}
-
-static void ns87415_dma_start(ide_drive_t *drive)
-{
-	ns87415_prepare_drive(drive, 1);
-	ide_dma_start(drive);
-}
-
-static int ns87415_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 dma_stat = 0, dma_cmd = 0;
-
-	dma_stat = hwif->dma_ops->dma_sff_read_status(hwif);
-	/* get DMA command mode */
-	dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
-	/* stop DMA */
-	outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD);
-	/* from ERRATA: clear the INTR & ERROR bits */
-	dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
-	outb(dma_cmd | 6, hwif->dma_base + ATA_DMA_CMD);
-
-	ns87415_prepare_drive(drive, 0);
-
-	/* verify good DMA status */
-	return (dma_stat & 7) != 4;
-}
-
-static void init_hwif_ns87415 (ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	unsigned int ctrl, using_inta;
-	u8 progif;
-#ifdef __sparc_v9__
-	int timeout;
-	u8 stat;
-#endif
-
-	/*
-	 * We cannot probe for IRQ: both ports share common IRQ on INTA.
-	 * Also, leave IRQ masked during drive probing, to prevent infinite
-	 * interrupts from a potentially floating INTA..
-	 *
-	 * IRQs get unmasked in dev_select() when drive is first used.
-	 */
-	(void) pci_read_config_dword(dev, 0x40, &ctrl);
-	(void) pci_read_config_byte(dev, 0x09, &progif);
-	/* is irq in "native" mode? */
-	using_inta = progif & (1 << (hwif->channel << 1));
-	if (!using_inta)
-		using_inta = ctrl & (1 << (4 + hwif->channel));
-	if (hwif->mate) {
-		hwif->select_data = hwif->mate->select_data;
-	} else {
-		hwif->select_data = (unsigned long)
-					&ns87415_control[ns87415_count++];
-		ctrl |= (1 << 8) | (1 << 9);	/* mask both IRQs */
-		if (using_inta)
-			ctrl &= ~(1 << 6);	/* unmask INTA */
-		*((unsigned int *)hwif->select_data) = ctrl;
-		(void) pci_write_config_dword(dev, 0x40, ctrl);
-
-		/*
-		 * Set prefetch size to 512 bytes for both ports,
-		 * but don't turn on/off prefetching here.
-		 */
-		pci_write_config_byte(dev, 0x55, 0xee);
-
-#ifdef __sparc_v9__
-		/*
-		 * XXX: Reset the device, if we don't it will not respond to
-		 *      dev_select() properly during first ide_probe_port().
-		 */
-		timeout = 10000;
-		outb(12, hwif->io_ports.ctl_addr);
-		udelay(10);
-		outb(8, hwif->io_ports.ctl_addr);
-		do {
-			udelay(50);
-			stat = hwif->tp_ops->read_status(hwif);
-			if (stat == 0xff)
-				break;
-		} while ((stat & ATA_BUSY) && --timeout);
-#endif
-	}
-
-	if (!using_inta)
-		hwif->irq = pci_get_legacy_ide_irq(dev, hwif->channel);
-
-	if (!hwif->dma_base)
-		return;
-
-	outb(0x60, hwif->dma_base + ATA_DMA_STATUS);
-}
-
-static const struct ide_tp_ops ns87415_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= ns87415_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
-
-static const struct ide_dma_ops ns87415_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= ns87415_dma_start,
-	.dma_end		= ns87415_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= superio_dma_sff_read_status,
-};
-
-static const struct ide_port_info ns87415_chipset = {
-	.name		= DRV_NAME,
-	.init_hwif	= init_hwif_ns87415,
-	.tp_ops 	= &ns87415_tp_ops,
-	.dma_ops	= &ns87415_dma_ops,
-	.host_flags	= IDE_HFLAG_TRUST_BIOS_FOR_DMA |
-			  IDE_HFLAG_NO_ATAPI_DMA,
-};
-
-static int ns87415_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct ide_port_info d = ns87415_chipset;
-
-#ifdef CONFIG_SUPERIO
-	if (PCI_SLOT(dev->devfn) == 0xE) {
-		/* Built-in - assume it's under superio. */
-		d.init_iops = superio_init_iops;
-		d.tp_ops = &superio_tp_ops;
-	}
-#endif
-	return ide_pci_init_one(dev, &d, NULL);
-}
-
-static const struct pci_device_id ns87415_pci_tbl[] = {
-	{ PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_87415), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, ns87415_pci_tbl);
-
-static struct pci_driver ns87415_pci_driver = {
-	.name		= "NS87415_IDE",
-	.id_table	= ns87415_pci_tbl,
-	.probe		= ns87415_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init ns87415_ide_init(void)
-{
-	return ide_pci_register_driver(&ns87415_pci_driver);
-}
-
-static void __exit ns87415_ide_exit(void)
-{
-	pci_unregister_driver(&ns87415_pci_driver);
-}
-
-module_init(ns87415_ide_init);
-module_exit(ns87415_ide_exit);
-
-MODULE_AUTHOR("Mark Lord, Eddie Dost, Andre Hedrick");
-MODULE_DESCRIPTION("PCI driver module for NS87415 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/opti621.c b/drivers/ide/opti621.c
deleted file mode 100644
index c374f82333c6..000000000000
--- a/drivers/ide/opti621.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1996-1998  Linus Torvalds & authors (see below)
- */
-
-/*
- * Authors:
- * Jaromir Koutek <miri@punknet.cz>,
- * Jan Harkes <jaharkes@cwi.nl>,
- * Mark Lord <mlord@pobox.com>
- * Some parts of code are from ali14xx.c and from rz1000.c.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "opti621"
-
-#define READ_REG 0	/* index of Read cycle timing register */
-#define WRITE_REG 1	/* index of Write cycle timing register */
-#define CNTRL_REG 3	/* index of Control register */
-#define STRAP_REG 5	/* index of Strap register */
-#define MISC_REG 6	/* index of Miscellaneous register */
-
-static int reg_base;
-
-static DEFINE_SPINLOCK(opti621_lock);
-
-/* Write value to register reg, base of register
- * is at reg_base (0x1f0 primary, 0x170 secondary,
- * if not changed by PCI configuration).
- * This is from setupvic.exe program.
- */
-static void write_reg(u8 value, int reg)
-{
-	inw(reg_base + 1);
-	inw(reg_base + 1);
-	outb(3, reg_base + 2);
-	outb(value, reg_base + reg);
-	outb(0x83, reg_base + 2);
-}
-
-/* Read value from register reg, base of register
- * is at reg_base (0x1f0 primary, 0x170 secondary,
- * if not changed by PCI configuration).
- * This is from setupvic.exe program.
- */
-static u8 read_reg(int reg)
-{
-	u8 ret = 0;
-
-	inw(reg_base + 1);
-	inw(reg_base + 1);
-	outb(3, reg_base + 2);
-	ret = inb(reg_base + reg);
-	outb(0x83, reg_base + 2);
-
-	return ret;
-}
-
-static void opti621_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	ide_drive_t *pair = ide_get_pair_dev(drive);
-	unsigned long flags;
-	unsigned long mode = drive->pio_mode, pair_mode;
-	const u8 pio = mode - XFER_PIO_0;
-	u8 tim, misc, addr_pio = pio, clk;
-
-	/* DRDY is default 2 (by OPTi Databook) */
-	static const u8 addr_timings[2][5] = {
-		{ 0x20, 0x10, 0x00, 0x00, 0x00 },	/* 33 MHz */
-		{ 0x10, 0x10, 0x00, 0x00, 0x00 },	/* 25 MHz */
-	};
-	static const u8 data_rec_timings[2][5] = {
-		{ 0x5b, 0x45, 0x32, 0x21, 0x20 },	/* 33 MHz */
-		{ 0x48, 0x34, 0x21, 0x10, 0x10 }	/* 25 MHz */
-	};
-
-	ide_set_drivedata(drive, (void *)mode);
-
-	if (pair) {
-		pair_mode = (unsigned long)ide_get_drivedata(pair);
-		if (pair_mode && pair_mode < mode)
-			addr_pio = pair_mode - XFER_PIO_0;
-	}
-
-	spin_lock_irqsave(&opti621_lock, flags);
-
-	reg_base = hwif->io_ports.data_addr;
-
-	/* allow Register-B */
-	outb(0xc0, reg_base + CNTRL_REG);
-	/* hmm, setupvic.exe does this ;-) */
-	outb(0xff, reg_base + 5);
-	/* if reads 0xff, adapter not exist? */
-	(void)inb(reg_base + CNTRL_REG);
-	/* if reads 0xc0, no interface exist? */
-	read_reg(CNTRL_REG);
-
-	/* check CLK speed */
-	clk = read_reg(STRAP_REG) & 1;
-
-	printk(KERN_INFO "%s: CLK = %d MHz\n", hwif->name, clk ? 25 : 33);
-
-	tim  = data_rec_timings[clk][pio];
-	misc = addr_timings[clk][addr_pio];
-
-	/* select Index-0/1 for Register-A/B */
-	write_reg(drive->dn & 1, MISC_REG);
-	/* set read cycle timings */
-	write_reg(tim, READ_REG);
-	/* set write cycle timings */
-	write_reg(tim, WRITE_REG);
-
-	/* use Register-A for drive 0 */
-	/* use Register-B for drive 1 */
-	write_reg(0x85, CNTRL_REG);
-
-	/* set address setup, DRDY timings,   */
-	/*  and read prefetch for both drives */
-	write_reg(misc, MISC_REG);
-
-	spin_unlock_irqrestore(&opti621_lock, flags);
-}
-
-static const struct ide_port_ops opti621_port_ops = {
-	.set_pio_mode		= opti621_set_pio_mode,
-};
-
-static const struct ide_port_info opti621_chipset = {
-	.name		= DRV_NAME,
-	.enablebits	= { {0x45, 0x80, 0x00}, {0x40, 0x08, 0x00} },
-	.port_ops	= &opti621_port_ops,
-	.host_flags	= IDE_HFLAG_NO_DMA,
-	.pio_mask	= ATA_PIO4,
-};
-
-static int opti621_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &opti621_chipset, NULL);
-}
-
-static const struct pci_device_id opti621_pci_tbl[] = {
-	{ PCI_VDEVICE(OPTI, PCI_DEVICE_ID_OPTI_82C621), 0 },
-	{ PCI_VDEVICE(OPTI, PCI_DEVICE_ID_OPTI_82C825), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, opti621_pci_tbl);
-
-static struct pci_driver opti621_pci_driver = {
-	.name		= "Opti621_IDE",
-	.id_table	= opti621_pci_tbl,
-	.probe		= opti621_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init opti621_ide_init(void)
-{
-	return ide_pci_register_driver(&opti621_pci_driver);
-}
-
-static void __exit opti621_ide_exit(void)
-{
-	pci_unregister_driver(&opti621_pci_driver);
-}
-
-module_init(opti621_ide_init);
-module_exit(opti621_ide_exit);
-
-MODULE_AUTHOR("Jaromir Koutek, Jan Harkes, Mark Lord");
-MODULE_DESCRIPTION("PCI driver module for Opti621 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c
deleted file mode 100644
index d1fe4c13e35c..000000000000
--- a/drivers/ide/palm_bk3710.c
+++ /dev/null
@@ -1,387 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Palmchip bk3710 IDE controller
- *
- * Copyright (C) 2006 Texas Instruments.
- * Copyright (C) 2007 MontaVista Software, Inc., <source@mvista.com>
- *
- * ----------------------------------------------------------------------------
- *
- * ----------------------------------------------------------------------------
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ioport.h>
-#include <linux/ide.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/clk.h>
-#include <linux/platform_device.h>
-
-/* Offset of the primary interface registers */
-#define IDE_PALM_ATA_PRI_REG_OFFSET 0x1F0
-
-/* Primary Control Offset */
-#define IDE_PALM_ATA_PRI_CTL_OFFSET 0x3F6
-
-#define BK3710_BMICP		0x00
-#define BK3710_BMISP		0x02
-#define BK3710_BMIDTP		0x04
-#define BK3710_IDETIMP		0x40
-#define BK3710_IDESTATUS	0x47
-#define BK3710_UDMACTL		0x48
-#define BK3710_MISCCTL		0x50
-#define BK3710_REGSTB		0x54
-#define BK3710_REGRCVR		0x58
-#define BK3710_DATSTB		0x5C
-#define BK3710_DATRCVR		0x60
-#define BK3710_DMASTB		0x64
-#define BK3710_DMARCVR		0x68
-#define BK3710_UDMASTB		0x6C
-#define BK3710_UDMATRP		0x70
-#define BK3710_UDMAENV		0x74
-#define BK3710_IORDYTMP		0x78
-
-static unsigned ideclk_period; /* in nanoseconds */
-
-struct palm_bk3710_udmatiming {
-	unsigned int rptime;	/* tRP -- Ready to pause time (nsec) */
-	unsigned int cycletime;	/* tCYCTYP2/2 -- avg Cycle Time (nsec) */
-				/* tENV is always a minimum of 20 nsec */
-};
-
-static const struct palm_bk3710_udmatiming palm_bk3710_udmatimings[6] = {
-	{ 160, 240 / 2 },	/* UDMA Mode 0 */
-	{ 125, 160 / 2 },	/* UDMA Mode 1 */
-	{ 100, 120 / 2 },	/* UDMA Mode 2 */
-	{ 100,  90 / 2 },	/* UDMA Mode 3 */
-	{ 100,  60 / 2 },	/* UDMA Mode 4 */
-	{  85,  40 / 2 },	/* UDMA Mode 5 */
-};
-
-static void palm_bk3710_setudmamode(void __iomem *base, unsigned int dev,
-				    unsigned int mode)
-{
-	u8 tenv, trp, t0;
-	u32 val32;
-	u16 val16;
-
-	/* DMA Data Setup */
-	t0 = DIV_ROUND_UP(palm_bk3710_udmatimings[mode].cycletime,
-			  ideclk_period) - 1;
-	tenv = DIV_ROUND_UP(20, ideclk_period) - 1;
-	trp = DIV_ROUND_UP(palm_bk3710_udmatimings[mode].rptime,
-			   ideclk_period) - 1;
-
-	/* udmastb Ultra DMA Access Strobe Width */
-	val32 = readl(base + BK3710_UDMASTB) & (0xFF << (dev ? 0 : 8));
-	val32 |= (t0 << (dev ? 8 : 0));
-	writel(val32, base + BK3710_UDMASTB);
-
-	/* udmatrp Ultra DMA Ready to Pause Time */
-	val32 = readl(base + BK3710_UDMATRP) & (0xFF << (dev ? 0 : 8));
-	val32 |= (trp << (dev ? 8 : 0));
-	writel(val32, base + BK3710_UDMATRP);
-
-	/* udmaenv Ultra DMA envelop Time */
-	val32 = readl(base + BK3710_UDMAENV) & (0xFF << (dev ? 0 : 8));
-	val32 |= (tenv << (dev ? 8 : 0));
-	writel(val32, base + BK3710_UDMAENV);
-
-	/* Enable UDMA for Device */
-	val16 = readw(base + BK3710_UDMACTL) | (1 << dev);
-	writew(val16, base + BK3710_UDMACTL);
-}
-
-static void palm_bk3710_setdmamode(void __iomem *base, unsigned int dev,
-				   unsigned short min_cycle,
-				   unsigned int mode)
-{
-	u8 td, tkw, t0;
-	u32 val32;
-	u16 val16;
-	struct ide_timing *t;
-	int cycletime;
-
-	t = ide_timing_find_mode(mode);
-	cycletime = max_t(int, t->cycle, min_cycle);
-
-	/* DMA Data Setup */
-	t0 = DIV_ROUND_UP(cycletime, ideclk_period);
-	td = DIV_ROUND_UP(t->active, ideclk_period);
-	tkw = t0 - td - 1;
-	td -= 1;
-
-	val32 = readl(base + BK3710_DMASTB) & (0xFF << (dev ? 0 : 8));
-	val32 |= (td << (dev ? 8 : 0));
-	writel(val32, base + BK3710_DMASTB);
-
-	val32 = readl(base + BK3710_DMARCVR) & (0xFF << (dev ? 0 : 8));
-	val32 |= (tkw << (dev ? 8 : 0));
-	writel(val32, base + BK3710_DMARCVR);
-
-	/* Disable UDMA for Device */
-	val16 = readw(base + BK3710_UDMACTL) & ~(1 << dev);
-	writew(val16, base + BK3710_UDMACTL);
-}
-
-static void palm_bk3710_setpiomode(void __iomem *base, ide_drive_t *mate,
-				   unsigned int dev, unsigned int cycletime,
-				   unsigned int mode)
-{
-	u8 t2, t2i, t0;
-	u32 val32;
-	struct ide_timing *t;
-
-	t = ide_timing_find_mode(XFER_PIO_0 + mode);
-
-	/* PIO Data Setup */
-	t0 = DIV_ROUND_UP(cycletime, ideclk_period);
-	t2 = DIV_ROUND_UP(t->active, ideclk_period);
-
-	t2i = t0 - t2 - 1;
-	t2 -= 1;
-
-	val32 = readl(base + BK3710_DATSTB) & (0xFF << (dev ? 0 : 8));
-	val32 |= (t2 << (dev ? 8 : 0));
-	writel(val32, base + BK3710_DATSTB);
-
-	val32 = readl(base + BK3710_DATRCVR) & (0xFF << (dev ? 0 : 8));
-	val32 |= (t2i << (dev ? 8 : 0));
-	writel(val32, base + BK3710_DATRCVR);
-
-	if (mate) {
-		u8 mode2 = mate->pio_mode - XFER_PIO_0;
-
-		if (mode2 < mode)
-			mode = mode2;
-	}
-
-	/* TASKFILE Setup */
-	t0 = DIV_ROUND_UP(t->cyc8b, ideclk_period);
-	t2 = DIV_ROUND_UP(t->act8b, ideclk_period);
-
-	t2i = t0 - t2 - 1;
-	t2 -= 1;
-
-	val32 = readl(base + BK3710_REGSTB) & (0xFF << (dev ? 0 : 8));
-	val32 |= (t2 << (dev ? 8 : 0));
-	writel(val32, base + BK3710_REGSTB);
-
-	val32 = readl(base + BK3710_REGRCVR) & (0xFF << (dev ? 0 : 8));
-	val32 |= (t2i << (dev ? 8 : 0));
-	writel(val32, base + BK3710_REGRCVR);
-}
-
-static void palm_bk3710_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	int is_slave = drive->dn & 1;
-	void __iomem *base = (void __iomem *)hwif->dma_base;
-	const u8 xferspeed = drive->dma_mode;
-
-	if (xferspeed >= XFER_UDMA_0) {
-		palm_bk3710_setudmamode(base, is_slave,
-					xferspeed - XFER_UDMA_0);
-	} else {
-		palm_bk3710_setdmamode(base, is_slave,
-				       drive->id[ATA_ID_EIDE_DMA_MIN],
-				       xferspeed);
-	}
-}
-
-static void palm_bk3710_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	unsigned int cycle_time;
-	int is_slave = drive->dn & 1;
-	ide_drive_t *mate;
-	void __iomem *base = (void __iomem *)hwif->dma_base;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	/*
-	 * Obtain the drive PIO data for tuning the Palm Chip registers
-	 */
-	cycle_time = ide_pio_cycle_time(drive, pio);
-	mate = ide_get_pair_dev(drive);
-	palm_bk3710_setpiomode(base, mate, is_slave, cycle_time, pio);
-}
-
-static void palm_bk3710_chipinit(void __iomem *base)
-{
-	/*
-	 * REVISIT:  the ATA reset signal needs to be managed through a
-	 * GPIO, which means it should come from platform_data.  Until
-	 * we get and use such information, we have to trust that things
-	 * have been reset before we get here.
-	 */
-
-	/*
-	 * Program the IDETIMP Register Value based on the following assumptions
-	 *
-	 * (ATA_IDETIMP_IDEEN		, ENABLE ) |
-	 * (ATA_IDETIMP_PREPOST1	, DISABLE) |
-	 * (ATA_IDETIMP_PREPOST0	, DISABLE) |
-	 *
-	 * DM6446 silicon rev 2.1 and earlier have no observed net benefit
-	 * from enabling prefetch/postwrite.
-	 */
-	writew(BIT(15), base + BK3710_IDETIMP);
-
-	/*
-	 * UDMACTL Ultra-ATA DMA Control
-	 * (ATA_UDMACTL_UDMAP1	, 0 ) |
-	 * (ATA_UDMACTL_UDMAP0	, 0 )
-	 *
-	 */
-	writew(0, base + BK3710_UDMACTL);
-
-	/*
-	 * MISCCTL Miscellaneous Conrol Register
-	 * (ATA_MISCCTL_HWNHLD1P	, 1 cycle)
-	 * (ATA_MISCCTL_HWNHLD0P	, 1 cycle)
-	 * (ATA_MISCCTL_TIMORIDE	, 1)
-	 */
-	writel(0x001, base + BK3710_MISCCTL);
-
-	/*
-	 * IORDYTMP IORDY Timer for Primary Register
-	 * (ATA_IORDYTMP_IORDYTMP     , 0xffff  )
-	 */
-	writel(0xFFFF, base + BK3710_IORDYTMP);
-
-	/*
-	 * Configure BMISP Register
-	 * (ATA_BMISP_DMAEN1	, DISABLE )	|
-	 * (ATA_BMISP_DMAEN0	, DISABLE )	|
-	 * (ATA_BMISP_IORDYINT	, CLEAR)	|
-	 * (ATA_BMISP_INTRSTAT	, CLEAR)	|
-	 * (ATA_BMISP_DMAERROR	, CLEAR)
-	 */
-	writew(0, base + BK3710_BMISP);
-
-	palm_bk3710_setpiomode(base, NULL, 0, 600, 0);
-	palm_bk3710_setpiomode(base, NULL, 1, 600, 0);
-}
-
-static u8 palm_bk3710_cable_detect(ide_hwif_t *hwif)
-{
-	return ATA_CBL_PATA80;
-}
-
-static int palm_bk3710_init_dma(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	printk(KERN_INFO "    %s: MMIO-DMA\n", hwif->name);
-
-	if (ide_allocate_dma_engine(hwif))
-		return -1;
-
-	hwif->dma_base = hwif->io_ports.data_addr - IDE_PALM_ATA_PRI_REG_OFFSET;
-
-	return 0;
-}
-
-static const struct ide_port_ops palm_bk3710_ports_ops = {
-	.set_pio_mode		= palm_bk3710_set_pio_mode,
-	.set_dma_mode		= palm_bk3710_set_dma_mode,
-	.cable_detect		= palm_bk3710_cable_detect,
-};
-
-static struct ide_port_info palm_bk3710_port_info __initdata = {
-	.init_dma		= palm_bk3710_init_dma,
-	.port_ops		= &palm_bk3710_ports_ops,
-	.dma_ops		= &sff_dma_ops,
-	.host_flags		= IDE_HFLAG_MMIO,
-	.pio_mask		= ATA_PIO4,
-	.mwdma_mask		= ATA_MWDMA2,
-	.chipset		= ide_palm3710,
-};
-
-static int __init palm_bk3710_probe(struct platform_device *pdev)
-{
-	struct clk *clk;
-	struct resource *mem, *irq;
-	void __iomem *base;
-	unsigned long rate, mem_size;
-	int i, rc;
-	struct ide_hw hw, *hws[] = { &hw };
-
-	clk = clk_get(&pdev->dev, NULL);
-	if (IS_ERR(clk))
-		return -ENODEV;
-
-	clk_enable(clk);
-	rate = clk_get_rate(clk);
-	if (!rate)
-		return -EINVAL;
-
-	/* NOTE:  round *down* to meet minimum timings; we count in clocks */
-	ideclk_period = 1000000000UL / rate;
-
-	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (mem == NULL) {
-		printk(KERN_ERR "failed to get memory region resource\n");
-		return -ENODEV;
-	}
-
-	irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-	if (irq == NULL) {
-		printk(KERN_ERR "failed to get IRQ resource\n");
-		return -ENODEV;
-	}
-
-	mem_size = resource_size(mem);
-	if (request_mem_region(mem->start, mem_size, "palm_bk3710") == NULL) {
-		printk(KERN_ERR "failed to request memory region\n");
-		return -EBUSY;
-	}
-
-	base = ioremap(mem->start, mem_size);
-	if (!base) {
-		printk(KERN_ERR "failed to map IO memory\n");
-		release_mem_region(mem->start, mem_size);
-		return -ENOMEM;
-	}
-
-	/* Configure the Palm Chip controller */
-	palm_bk3710_chipinit(base);
-
-	memset(&hw, 0, sizeof(hw));
-	for (i = 0; i < IDE_NR_PORTS - 2; i++)
-		hw.io_ports_array[i] = (unsigned long)
-				(base + IDE_PALM_ATA_PRI_REG_OFFSET + i);
-	hw.io_ports.ctl_addr = (unsigned long)
-			(base + IDE_PALM_ATA_PRI_CTL_OFFSET);
-	hw.irq = irq->start;
-	hw.dev = &pdev->dev;
-
-	palm_bk3710_port_info.udma_mask = rate < 100000000 ? ATA_UDMA4 :
-							     ATA_UDMA5;
-
-	/* Register the IDE interface with Linux */
-	rc = ide_host_add(&palm_bk3710_port_info, hws, 1, NULL);
-	if (rc)
-		goto out;
-
-	return 0;
-out:
-	printk(KERN_WARNING "Palm Chip BK3710 IDE Register Fail\n");
-	return rc;
-}
-
-/* work with hotplug and coldplug */
-MODULE_ALIAS("platform:palm_bk3710");
-
-static struct platform_driver platform_bk_driver = {
-	.driver = {
-		.name = "palm_bk3710",
-	},
-};
-
-static int __init palm_bk3710_init(void)
-{
-	return platform_driver_probe(&platform_bk_driver, palm_bk3710_probe);
-}
-
-module_init(palm_bk3710_init);
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/pdc202xx_new.c b/drivers/ide/pdc202xx_new.c
deleted file mode 100644
index 4fcafb9121e0..000000000000
--- a/drivers/ide/pdc202xx_new.c
+++ /dev/null
@@ -1,557 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  Promise TX2/TX4/TX2000/133 IDE driver
- *
- *  Split from:
- *  linux/drivers/ide/pdc202xx.c	Version 0.35	Mar. 30, 2002
- *  Copyright (C) 1998-2002		Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2005-2007		MontaVista Software, Inc.
- *  Portions Copyright (C) 1999 Promise Technology, Inc.
- *  Author: Frank Tiernan (frankt@promise.com)
- *  Released under terms of General Public License
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-#include <linux/ktime.h>
-
-#include <asm/io.h>
-
-#ifdef CONFIG_PPC_PMAC
-#include <asm/prom.h>
-#endif
-
-#define DRV_NAME "pdc202xx_new"
-
-#undef DEBUG
-
-#ifdef DEBUG
-#define DBG(fmt, args...) printk("%s: " fmt, __func__, ## args)
-#else
-#define DBG(fmt, args...)
-#endif
-
-static u8 max_dma_rate(struct pci_dev *pdev)
-{
-	u8 mode;
-
-	switch(pdev->device) {
-		case PCI_DEVICE_ID_PROMISE_20277:
-		case PCI_DEVICE_ID_PROMISE_20276:
-		case PCI_DEVICE_ID_PROMISE_20275:
-		case PCI_DEVICE_ID_PROMISE_20271:
-		case PCI_DEVICE_ID_PROMISE_20269:
-			mode = 4;
-			break;
-		case PCI_DEVICE_ID_PROMISE_20270:
-		case PCI_DEVICE_ID_PROMISE_20268:
-			mode = 3;
-			break;
-		default:
-			return 0;
-	}
-
-	return mode;
-}
-
-/**
- * get_indexed_reg - Get indexed register
- * @hwif: for the port address
- * @index: index of the indexed register
- */
-static u8 get_indexed_reg(ide_hwif_t *hwif, u8 index)
-{
-	u8 value;
-
-	outb(index, hwif->dma_base + 1);
-	value = inb(hwif->dma_base + 3);
-
-	DBG("index[%02X] value[%02X]\n", index, value);
-	return value;
-}
-
-/**
- * set_indexed_reg - Set indexed register
- * @hwif: for the port address
- * @index: index of the indexed register
- */
-static void set_indexed_reg(ide_hwif_t *hwif, u8 index, u8 value)
-{
-	outb(index, hwif->dma_base + 1);
-	outb(value, hwif->dma_base + 3);
-	DBG("index[%02X] value[%02X]\n", index, value);
-}
-
-/*
- * ATA Timing Tables based on 133 MHz PLL output clock.
- *
- * If the PLL outputs 100 MHz clock, the ASIC hardware will set
- * the timing registers automatically when "set features" command is
- * issued to the device. However, if the PLL output clock is 133 MHz,
- * the following tables must be used.
- */
-static struct pio_timing {
-	u8 reg0c, reg0d, reg13;
-} pio_timings [] = {
-	{ 0xfb, 0x2b, 0xac },	/* PIO mode 0, IORDY off, Prefetch off */
-	{ 0x46, 0x29, 0xa4 },	/* PIO mode 1, IORDY off, Prefetch off */
-	{ 0x23, 0x26, 0x64 },	/* PIO mode 2, IORDY off, Prefetch off */
-	{ 0x27, 0x0d, 0x35 },	/* PIO mode 3, IORDY on,  Prefetch off */
-	{ 0x23, 0x09, 0x25 },	/* PIO mode 4, IORDY on,  Prefetch off */
-};
-
-static struct mwdma_timing {
-	u8 reg0e, reg0f;
-} mwdma_timings [] = {
-	{ 0xdf, 0x5f }, 	/* MWDMA mode 0 */
-	{ 0x6b, 0x27 }, 	/* MWDMA mode 1 */
-	{ 0x69, 0x25 }, 	/* MWDMA mode 2 */
-};
-
-static struct udma_timing {
-	u8 reg10, reg11, reg12;
-} udma_timings [] = {
-	{ 0x4a, 0x0f, 0xd5 },	/* UDMA mode 0 */
-	{ 0x3a, 0x0a, 0xd0 },	/* UDMA mode 1 */
-	{ 0x2a, 0x07, 0xcd },	/* UDMA mode 2 */
-	{ 0x1a, 0x05, 0xcd },	/* UDMA mode 3 */
-	{ 0x1a, 0x03, 0xcd },	/* UDMA mode 4 */
-	{ 0x1a, 0x02, 0xcb },	/* UDMA mode 5 */
-	{ 0x1a, 0x01, 0xcb },	/* UDMA mode 6 */
-};
-
-static void pdcnew_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u8 adj			= (drive->dn & 1) ? 0x08 : 0x00;
-	const u8 speed		= drive->dma_mode;
-
-	/*
-	 * IDE core issues SETFEATURES_XFER to the drive first (thanks to
-	 * IDE_HFLAG_POST_SET_MODE in ->host_flags).  PDC202xx hardware will
-	 * automatically set the timing registers based on 100 MHz PLL output.
-	 *
-	 * As we set up the PLL to output 133 MHz for UltraDMA/133 capable
-	 * chips, we must override the default register settings...
-	 */
-	if (max_dma_rate(dev) == 4) {
-		u8 mode = speed & 0x07;
-
-		if (speed >= XFER_UDMA_0) {
-			set_indexed_reg(hwif, 0x10 + adj,
-					udma_timings[mode].reg10);
-			set_indexed_reg(hwif, 0x11 + adj,
-					udma_timings[mode].reg11);
-			set_indexed_reg(hwif, 0x12 + adj,
-					udma_timings[mode].reg12);
-		} else {
-			set_indexed_reg(hwif, 0x0e + adj,
-					mwdma_timings[mode].reg0e);
-			set_indexed_reg(hwif, 0x0f + adj,
-					mwdma_timings[mode].reg0f);
-		}
-	} else if (speed == XFER_UDMA_2) {
-		/* Set tHOLD bit to 0 if using UDMA mode 2 */
-		u8 tmp = get_indexed_reg(hwif, 0x10 + adj);
-
-		set_indexed_reg(hwif, 0x10 + adj, tmp & 0x7f);
- 	}
-}
-
-static void pdcnew_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u8 adj = (drive->dn & 1) ? 0x08 : 0x00;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	if (max_dma_rate(dev) == 4) {
-		set_indexed_reg(hwif, 0x0c + adj, pio_timings[pio].reg0c);
-		set_indexed_reg(hwif, 0x0d + adj, pio_timings[pio].reg0d);
-		set_indexed_reg(hwif, 0x13 + adj, pio_timings[pio].reg13);
-	}
-}
-
-static u8 pdcnew_cable_detect(ide_hwif_t *hwif)
-{
-	if (get_indexed_reg(hwif, 0x0b) & 0x04)
-		return ATA_CBL_PATA40;
-	else
-		return ATA_CBL_PATA80;
-}
-
-static void pdcnew_reset(ide_drive_t *drive)
-{
-	/*
-	 * Deleted this because it is redundant from the caller.
-	 */
-	printk(KERN_WARNING "pdc202xx_new: %s channel reset.\n",
-		drive->hwif->channel ? "Secondary" : "Primary");
-}
-
-/**
- * read_counter - Read the byte count registers
- * @dma_base: for the port address
- */
-static long read_counter(u32 dma_base)
-{
-	u32  pri_dma_base = dma_base, sec_dma_base = dma_base + 0x08;
-	u8   cnt0, cnt1, cnt2, cnt3;
-	long count = 0, last;
-	int  retry = 3;
-
-	do {
-		last = count;
-
-		/* Read the current count */
-		outb(0x20, pri_dma_base + 0x01);
-		cnt0 = inb(pri_dma_base + 0x03);
-		outb(0x21, pri_dma_base + 0x01);
-		cnt1 = inb(pri_dma_base + 0x03);
-		outb(0x20, sec_dma_base + 0x01);
-		cnt2 = inb(sec_dma_base + 0x03);
-		outb(0x21, sec_dma_base + 0x01);
-		cnt3 = inb(sec_dma_base + 0x03);
-
-		count = (cnt3 << 23) | (cnt2 << 15) | (cnt1 << 8) | cnt0;
-
-		/*
-		 * The 30-bit decrementing counter is read in 4 pieces.
-		 * Incorrect value may be read when the most significant bytes
-		 * are changing...
-		 */
-	} while (retry-- && (((last ^ count) & 0x3fff8000) || last < count));
-
-	DBG("cnt0[%02X] cnt1[%02X] cnt2[%02X] cnt3[%02X]\n",
-		  cnt0, cnt1, cnt2, cnt3);
-
-	return count;
-}
-
-/**
- * detect_pll_input_clock - Detect the PLL input clock in Hz.
- * @dma_base: for the port address
- * E.g. 16949000 on 33 MHz PCI bus, i.e. half of the PCI clock.
- */
-static long detect_pll_input_clock(unsigned long dma_base)
-{
-	ktime_t start_time, end_time;
-	long start_count, end_count;
-	long pll_input, usec_elapsed;
-	u8 scr1;
-
-	start_count = read_counter(dma_base);
-	start_time = ktime_get();
-
-	/* Start the test mode */
-	outb(0x01, dma_base + 0x01);
-	scr1 = inb(dma_base + 0x03);
-	DBG("scr1[%02X]\n", scr1);
-	outb(scr1 | 0x40, dma_base + 0x03);
-
-	/* Let the counter run for 10 ms. */
-	mdelay(10);
-
-	end_count = read_counter(dma_base);
-	end_time = ktime_get();
-
-	/* Stop the test mode */
-	outb(0x01, dma_base + 0x01);
-	scr1 = inb(dma_base + 0x03);
-	DBG("scr1[%02X]\n", scr1);
-	outb(scr1 & ~0x40, dma_base + 0x03);
-
-	/*
-	 * Calculate the input clock in Hz
-	 * (the clock counter is 30 bit wide and counts down)
-	 */
-	usec_elapsed = ktime_us_delta(end_time, start_time);
-	pll_input = ((start_count - end_count) & 0x3fffffff) / 10 *
-		(10000000 / usec_elapsed);
-
-	DBG("start[%ld] end[%ld]\n", start_count, end_count);
-
-	return pll_input;
-}
-
-#ifdef CONFIG_PPC_PMAC
-static void apple_kiwi_init(struct pci_dev *pdev)
-{
-	struct device_node *np = pci_device_to_OF_node(pdev);
-	u8 conf;
-
-	if (np == NULL || !of_device_is_compatible(np, "kiwi-root"))
-		return;
-
-	if (pdev->revision >= 0x03) {
-		/* Setup chip magic config stuff (from darwin) */
-		pci_read_config_byte (pdev, 0x40, &conf);
-		pci_write_config_byte(pdev, 0x40, (conf | 0x01));
-	}
-}
-#endif /* CONFIG_PPC_PMAC */
-
-static int init_chipset_pdcnew(struct pci_dev *dev)
-{
-	const char *name = DRV_NAME;
-	unsigned long dma_base = pci_resource_start(dev, 4);
-	unsigned long sec_dma_base = dma_base + 0x08;
-	long pll_input, pll_output, ratio;
-	int f, r;
-	u8 pll_ctl0, pll_ctl1;
-
-	if (dma_base == 0)
-		return -EFAULT;
-
-#ifdef CONFIG_PPC_PMAC
-	apple_kiwi_init(dev);
-#endif
-
-	/* Calculate the required PLL output frequency */
-	switch(max_dma_rate(dev)) {
-		case 4: /* it's 133 MHz for Ultra133 chips */
-			pll_output = 133333333;
-			break;
-		case 3: /* and  100 MHz for Ultra100 chips */
-		default:
-			pll_output = 100000000;
-			break;
-	}
-
-	/*
-	 * Detect PLL input clock.
-	 * On some systems, where PCI bus is running at non-standard clock rate
-	 * (e.g. 25 or 40 MHz), we have to adjust the cycle time.
-	 * PDC20268 and newer chips employ PLL circuit to help correct timing
-	 * registers setting.
-	 */
-	pll_input = detect_pll_input_clock(dma_base);
-	printk(KERN_INFO "%s %s: PLL input clock is %ld kHz\n",
-		name, pci_name(dev), pll_input / 1000);
-
-	/* Sanity check */
-	if (unlikely(pll_input < 5000000L || pll_input > 70000000L)) {
-		printk(KERN_ERR "%s %s: Bad PLL input clock %ld Hz, giving up!"
-			"\n", name, pci_name(dev), pll_input);
-		goto out;
-	}
-
-#ifdef DEBUG
-	DBG("pll_output is %ld Hz\n", pll_output);
-
-	/* Show the current clock value of PLL control register
-	 * (maybe already configured by the BIOS)
-	 */
-	outb(0x02, sec_dma_base + 0x01);
-	pll_ctl0 = inb(sec_dma_base + 0x03);
-	outb(0x03, sec_dma_base + 0x01);
-	pll_ctl1 = inb(sec_dma_base + 0x03);
-
-	DBG("pll_ctl[%02X][%02X]\n", pll_ctl0, pll_ctl1);
-#endif
-
-	/*
-	 * Calculate the ratio of F, R and NO
-	 * POUT = (F + 2) / (( R + 2) * NO)
-	 */
-	ratio = pll_output / (pll_input / 1000);
-	if (ratio < 8600L) { /* 8.6x */
-		/* Using NO = 0x01, R = 0x0d */
-		r = 0x0d;
-	} else if (ratio < 12900L) { /* 12.9x */
-		/* Using NO = 0x01, R = 0x08 */
-		r = 0x08;
-	} else if (ratio < 16100L) { /* 16.1x */
-		/* Using NO = 0x01, R = 0x06 */
-		r = 0x06;
-	} else if (ratio < 64000L) { /* 64x */
-		r = 0x00;
-	} else {
-		/* Invalid ratio */
-		printk(KERN_ERR "%s %s: Bad ratio %ld, giving up!\n",
-			name, pci_name(dev), ratio);
-		goto out;
-	}
-
-	f = (ratio * (r + 2)) / 1000 - 2;
-
-	DBG("F[%d] R[%d] ratio*1000[%ld]\n", f, r, ratio);
-
-	if (unlikely(f < 0 || f > 127)) {
-		/* Invalid F */
-		printk(KERN_ERR "%s %s: F[%d] invalid!\n",
-			name, pci_name(dev), f);
-		goto out;
-	}
-
-	pll_ctl0 = (u8) f;
-	pll_ctl1 = (u8) r;
-
-	DBG("Writing pll_ctl[%02X][%02X]\n", pll_ctl0, pll_ctl1);
-
-	outb(0x02,     sec_dma_base + 0x01);
-	outb(pll_ctl0, sec_dma_base + 0x03);
-	outb(0x03,     sec_dma_base + 0x01);
-	outb(pll_ctl1, sec_dma_base + 0x03);
-
-	/* Wait the PLL circuit to be stable */
-	mdelay(30);
-
-#ifdef DEBUG
-	/*
-	 *  Show the current clock value of PLL control register
-	 */
-	outb(0x02, sec_dma_base + 0x01);
-	pll_ctl0 = inb(sec_dma_base + 0x03);
-	outb(0x03, sec_dma_base + 0x01);
-	pll_ctl1 = inb(sec_dma_base + 0x03);
-
-	DBG("pll_ctl[%02X][%02X]\n", pll_ctl0, pll_ctl1);
-#endif
-
- out:
-	return 0;
-}
-
-static struct pci_dev *pdc20270_get_dev2(struct pci_dev *dev)
-{
-	struct pci_dev *dev2;
-
-	dev2 = pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn) + 1,
-						PCI_FUNC(dev->devfn)));
-
-	if (dev2 &&
-	    dev2->vendor == dev->vendor &&
-	    dev2->device == dev->device) {
-
-		if (dev2->irq != dev->irq) {
-			dev2->irq = dev->irq;
-			printk(KERN_INFO DRV_NAME " %s: PCI config space "
-				"interrupt fixed\n", pci_name(dev));
-		}
-
-		return dev2;
-	}
-
-	return NULL;
-}
-
-static const struct ide_port_ops pdcnew_port_ops = {
-	.set_pio_mode		= pdcnew_set_pio_mode,
-	.set_dma_mode		= pdcnew_set_dma_mode,
-	.resetproc		= pdcnew_reset,
-	.cable_detect		= pdcnew_cable_detect,
-};
-
-#define DECLARE_PDCNEW_DEV(udma) \
-	{ \
-		.name		= DRV_NAME, \
-		.init_chipset	= init_chipset_pdcnew, \
-		.port_ops	= &pdcnew_port_ops, \
-		.host_flags	= IDE_HFLAG_POST_SET_MODE | \
-				  IDE_HFLAG_ERROR_STOPS_FIFO | \
-				  IDE_HFLAG_OFF_BOARD, \
-		.pio_mask	= ATA_PIO4, \
-		.mwdma_mask	= ATA_MWDMA2, \
-		.udma_mask	= udma, \
-	}
-
-static const struct ide_port_info pdcnew_chipsets[] = {
-	/* 0: PDC202{68,70} */		DECLARE_PDCNEW_DEV(ATA_UDMA5),
-	/* 1: PDC202{69,71,75,76,77} */	DECLARE_PDCNEW_DEV(ATA_UDMA6),
-};
-
-/**
- *	pdc202new_init_one	-	called when a pdc202xx is found
- *	@dev: the pdc202new device
- *	@id: the matching pci id
- *
- *	Called when the PCI registration layer (or the IDE initialization)
- *	finds a device matching our IDE device tables.
- */
- 
-static int pdc202new_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	const struct ide_port_info *d = &pdcnew_chipsets[id->driver_data];
-	struct pci_dev *bridge = dev->bus->self;
-
-	if (dev->device == PCI_DEVICE_ID_PROMISE_20270 && bridge &&
-	    bridge->vendor == PCI_VENDOR_ID_DEC &&
-	    bridge->device == PCI_DEVICE_ID_DEC_21150) {
-		struct pci_dev *dev2;
-
-		if (PCI_SLOT(dev->devfn) & 2)
-			return -ENODEV;
-
-		dev2 = pdc20270_get_dev2(dev);
-
-		if (dev2) {
-			int ret = ide_pci_init_two(dev, dev2, d, NULL);
-			if (ret < 0)
-				pci_dev_put(dev2);
-			return ret;
-		}
-	}
-
-	if (dev->device == PCI_DEVICE_ID_PROMISE_20276 && bridge &&
-	    bridge->vendor == PCI_VENDOR_ID_INTEL &&
-	    (bridge->device == PCI_DEVICE_ID_INTEL_I960 ||
-	     bridge->device == PCI_DEVICE_ID_INTEL_I960RM)) {
-		printk(KERN_INFO DRV_NAME " %s: attached to I2O RAID controller,"
-			" skipping\n", pci_name(dev));
-		return -ENODEV;
-	}
-
-	return ide_pci_init_one(dev, d, NULL);
-}
-
-static void pdc202new_remove(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct pci_dev *dev2 = host->dev[1] ? to_pci_dev(host->dev[1]) : NULL;
-
-	ide_pci_remove(dev);
-	pci_dev_put(dev2);
-}
-
-static const struct pci_device_id pdc202new_pci_tbl[] = {
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20268), 0 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20269), 1 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20270), 0 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20271), 1 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20275), 1 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20276), 1 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20277), 1 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, pdc202new_pci_tbl);
-
-static struct pci_driver pdc202new_pci_driver = {
-	.name		= "Promise_IDE",
-	.id_table	= pdc202new_pci_tbl,
-	.probe		= pdc202new_init_one,
-	.remove		= pdc202new_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init pdc202new_ide_init(void)
-{
-	return ide_pci_register_driver(&pdc202new_pci_driver);
-}
-
-static void __exit pdc202new_ide_exit(void)
-{
-	pci_unregister_driver(&pdc202new_pci_driver);
-}
-
-module_init(pdc202new_ide_init);
-module_exit(pdc202new_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick, Frank Tiernan");
-MODULE_DESCRIPTION("PCI driver module for Promise PDC20268 and higher");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c
deleted file mode 100644
index 5248ac064e6e..000000000000
--- a/drivers/ide/pdc202xx_old.c
+++ /dev/null
@@ -1,362 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1998-2002		Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2006-2007, 2009	MontaVista Software, Inc.
- *  Copyright (C) 2007-2010		Bartlomiej Zolnierkiewicz
- *
- *  Portions Copyright (C) 1999 Promise Technology, Inc.
- *  Author: Frank Tiernan (frankt@promise.com)
- *  Released under terms of General Public License
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/blkdev.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "pdc202xx_old"
-
-static void pdc202xx_set_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u8 drive_pci		= 0x60 + (drive->dn << 2);
-	const u8 speed		= drive->dma_mode;
-
-	u8			AP = 0, BP = 0, CP = 0;
-	u8			TA = 0, TB = 0, TC = 0;
-
-	pci_read_config_byte(dev, drive_pci,     &AP);
-	pci_read_config_byte(dev, drive_pci + 1, &BP);
-	pci_read_config_byte(dev, drive_pci + 2, &CP);
-
-	switch(speed) {
-		case XFER_UDMA_5:
-		case XFER_UDMA_4:	TB = 0x20; TC = 0x01; break;
-		case XFER_UDMA_2:	TB = 0x20; TC = 0x01; break;
-		case XFER_UDMA_3:
-		case XFER_UDMA_1:	TB = 0x40; TC = 0x02; break;
-		case XFER_UDMA_0:
-		case XFER_MW_DMA_2:	TB = 0x60; TC = 0x03; break;
-		case XFER_MW_DMA_1:	TB = 0x60; TC = 0x04; break;
-		case XFER_MW_DMA_0:	TB = 0xE0; TC = 0x0F; break;
-		case XFER_PIO_4:	TA = 0x01; TB = 0x04; break;
-		case XFER_PIO_3:	TA = 0x02; TB = 0x06; break;
-		case XFER_PIO_2:	TA = 0x03; TB = 0x08; break;
-		case XFER_PIO_1:	TA = 0x05; TB = 0x0C; break;
-		case XFER_PIO_0:
-		default:		TA = 0x09; TB = 0x13; break;
-	}
-
-	if (speed < XFER_SW_DMA_0) {
-		/*
-		 * preserve SYNC_INT / ERDDY_EN bits while clearing
-		 * Prefetch_EN / IORDY_EN / PA[3:0] bits of register A
-		 */
-		AP &= ~0x3f;
-		if (ide_pio_need_iordy(drive, speed - XFER_PIO_0))
-			AP |= 0x20;	/* set IORDY_EN bit */
-		if (drive->media == ide_disk)
-			AP |= 0x10;	/* set Prefetch_EN bit */
-		/* clear PB[4:0] bits of register B */
-		BP &= ~0x1f;
-		pci_write_config_byte(dev, drive_pci,     AP | TA);
-		pci_write_config_byte(dev, drive_pci + 1, BP | TB);
-	} else {
-		/* clear MB[2:0] bits of register B */
-		BP &= ~0xe0;
-		/* clear MC[3:0] bits of register C */
-		CP &= ~0x0f;
-		pci_write_config_byte(dev, drive_pci + 1, BP | TB);
-		pci_write_config_byte(dev, drive_pci + 2, CP | TC);
-	}
-}
-
-static void pdc202xx_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	drive->dma_mode = drive->pio_mode;
-	pdc202xx_set_mode(hwif, drive);
-}
-
-static int pdc202xx_test_irq(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long high_16	= pci_resource_start(dev, 4);
-	u8 sc1d			= inb(high_16 + 0x1d);
-
-	if (hwif->channel) {
-		/*
-		 * bit 7: error, bit 6: interrupting,
-		 * bit 5: FIFO full, bit 4: FIFO empty
-		 */
-		return (sc1d & 0x40) ? 1 : 0;
-	} else	{
-		/*
-		 * bit 3: error, bit 2: interrupting,
-		 * bit 1: FIFO full, bit 0: FIFO empty
-		 */
-		return (sc1d & 0x04) ? 1 : 0;
-	}
-}
-
-static u8 pdc2026x_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u16 CIS, mask = hwif->channel ? (1 << 11) : (1 << 10);
-
-	pci_read_config_word(dev, 0x50, &CIS);
-
-	return (CIS & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
-}
-
-/*
- * Set the control register to use the 66MHz system
- * clock for UDMA 3/4/5 mode operation when necessary.
- *
- * FIXME: this register is shared by both channels, some locking is needed
- *
- * It may also be possible to leave the 66MHz clock on
- * and readjust the timing parameters.
- */
-static void pdc_old_enable_66MHz_clock(ide_hwif_t *hwif)
-{
-	unsigned long clock_reg = hwif->extra_base + 0x01;
-	u8 clock = inb(clock_reg);
-
-	outb(clock | (hwif->channel ? 0x08 : 0x02), clock_reg);
-}
-
-static void pdc_old_disable_66MHz_clock(ide_hwif_t *hwif)
-{
-	unsigned long clock_reg = hwif->extra_base + 0x01;
-	u8 clock = inb(clock_reg);
-
-	outb(clock & ~(hwif->channel ? 0x08 : 0x02), clock_reg);
-}
-
-static void pdc2026x_init_hwif(ide_hwif_t *hwif)
-{
-	pdc_old_disable_66MHz_clock(hwif);
-}
-
-static void pdc202xx_dma_start(ide_drive_t *drive)
-{
-	if (drive->current_speed > XFER_UDMA_2)
-		pdc_old_enable_66MHz_clock(drive->hwif);
-	if (drive->media != ide_disk || (drive->dev_flags & IDE_DFLAG_LBA48)) {
-		ide_hwif_t *hwif	= drive->hwif;
-		struct request *rq	= hwif->rq;
-		unsigned long high_16	= hwif->extra_base - 16;
-		unsigned long atapi_reg	= high_16 + (hwif->channel ? 0x24 : 0x20);
-		u32 word_count	= 0;
-		u8 clock = inb(high_16 + 0x11);
-
-		outb(clock | (hwif->channel ? 0x08 : 0x02), high_16 + 0x11);
-		word_count = (blk_rq_sectors(rq) << 8);
-		word_count = (rq_data_dir(rq) == READ) ?
-					word_count | 0x05000000 :
-					word_count | 0x06000000;
-		outl(word_count, atapi_reg);
-	}
-	ide_dma_start(drive);
-}
-
-static int pdc202xx_dma_end(ide_drive_t *drive)
-{
-	if (drive->media != ide_disk || (drive->dev_flags & IDE_DFLAG_LBA48)) {
-		ide_hwif_t *hwif	= drive->hwif;
-		unsigned long high_16	= hwif->extra_base - 16;
-		unsigned long atapi_reg	= high_16 + (hwif->channel ? 0x24 : 0x20);
-		u8 clock		= 0;
-
-		outl(0, atapi_reg); /* zero out extra */
-		clock = inb(high_16 + 0x11);
-		outb(clock & ~(hwif->channel ? 0x08:0x02), high_16 + 0x11);
-	}
-	if (drive->current_speed > XFER_UDMA_2)
-		pdc_old_disable_66MHz_clock(drive->hwif);
-	return ide_dma_end(drive);
-}
-
-static int init_chipset_pdc202xx(struct pci_dev *dev)
-{
-	unsigned long dmabase = pci_resource_start(dev, 4);
-	u8 udma_speed_flag = 0, primary_mode = 0, secondary_mode = 0;
-
-	if (dmabase == 0)
-		goto out;
-
-	udma_speed_flag	= inb(dmabase | 0x1f);
-	primary_mode	= inb(dmabase | 0x1a);
-	secondary_mode	= inb(dmabase | 0x1b);
-	printk(KERN_INFO "%s: (U)DMA Burst Bit %sABLED " \
-		"Primary %s Mode " \
-		"Secondary %s Mode.\n", pci_name(dev),
-		(udma_speed_flag & 1) ? "EN" : "DIS",
-		(primary_mode & 1) ? "MASTER" : "PCI",
-		(secondary_mode & 1) ? "MASTER" : "PCI" );
-
-	if (!(udma_speed_flag & 1)) {
-		printk(KERN_INFO "%s: FORCING BURST BIT 0x%02x->0x%02x ",
-			pci_name(dev), udma_speed_flag,
-			(udma_speed_flag|1));
-		outb(udma_speed_flag | 1, dmabase | 0x1f);
-		printk("%sACTIVE\n", (inb(dmabase | 0x1f) & 1) ? "" : "IN");
-	}
-out:
-	return 0;
-}
-
-static void pdc202ata4_fixup_irq(struct pci_dev *dev, const char *name)
-{
-	if ((dev->class >> 8) != PCI_CLASS_STORAGE_IDE) {
-		u8 irq = 0, irq2 = 0;
-		pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
-		/* 0xbc */
-		pci_read_config_byte(dev, (PCI_INTERRUPT_LINE)|0x80, &irq2);
-		if (irq != irq2) {
-			pci_write_config_byte(dev,
-				(PCI_INTERRUPT_LINE)|0x80, irq);     /* 0xbc */
-			printk(KERN_INFO "%s %s: PCI config space interrupt "
-				"mirror fixed\n", name, pci_name(dev));
-		}
-	}
-}
-
-#define IDE_HFLAGS_PDC202XX \
-	(IDE_HFLAG_ERROR_STOPS_FIFO | \
-	 IDE_HFLAG_OFF_BOARD)
-
-static const struct ide_port_ops pdc20246_port_ops = {
-	.set_pio_mode		= pdc202xx_set_pio_mode,
-	.set_dma_mode		= pdc202xx_set_mode,
-	.test_irq		= pdc202xx_test_irq,
-};
-
-static const struct ide_port_ops pdc2026x_port_ops = {
-	.set_pio_mode		= pdc202xx_set_pio_mode,
-	.set_dma_mode		= pdc202xx_set_mode,
-	.test_irq		= pdc202xx_test_irq,
-	.cable_detect		= pdc2026x_cable_detect,
-};
-
-static const struct ide_dma_ops pdc2026x_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= pdc202xx_dma_start,
-	.dma_end		= pdc202xx_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-#define DECLARE_PDC2026X_DEV(udma, sectors) \
-	{ \
-		.name		= DRV_NAME, \
-		.init_chipset	= init_chipset_pdc202xx, \
-		.init_hwif	= pdc2026x_init_hwif, \
-		.port_ops	= &pdc2026x_port_ops, \
-		.dma_ops	= &pdc2026x_dma_ops, \
-		.host_flags	= IDE_HFLAGS_PDC202XX, \
-		.pio_mask	= ATA_PIO4, \
-		.mwdma_mask	= ATA_MWDMA2, \
-		.udma_mask	= udma, \
-		.max_sectors	= sectors, \
-	}
-
-static const struct ide_port_info pdc202xx_chipsets[] = {
-	{	/* 0: PDC20246 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_pdc202xx,
-		.port_ops	= &pdc20246_port_ops,
-		.dma_ops	= &sff_dma_ops,
-		.host_flags	= IDE_HFLAGS_PDC202XX,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA2,
-	},
-
-	/* 1: PDC2026{2,3} */
-	DECLARE_PDC2026X_DEV(ATA_UDMA4, 0),
-	/* 2: PDC2026{5,7}: UDMA5, limit LBA48 requests to 256 sectors */
-	DECLARE_PDC2026X_DEV(ATA_UDMA5, 256),
-};
-
-/**
- *	pdc202xx_init_one	-	called when a PDC202xx is found
- *	@dev: the pdc202xx device
- *	@id: the matching pci id
- *
- *	Called when the PCI registration layer (or the IDE initialization)
- *	finds a device matching our IDE device tables.
- */
- 
-static int pdc202xx_init_one(struct pci_dev *dev,
-			     const struct pci_device_id *id)
-{
-	const struct ide_port_info *d;
-	u8 idx = id->driver_data;
-
-	d = &pdc202xx_chipsets[idx];
-
-	if (idx < 2)
-		pdc202ata4_fixup_irq(dev, d->name);
-
-	if (dev->vendor == PCI_DEVICE_ID_PROMISE_20265) {
-		struct pci_dev *bridge = dev->bus->self;
-
-		if (bridge &&
-		    bridge->vendor == PCI_VENDOR_ID_INTEL &&
-		    (bridge->device == PCI_DEVICE_ID_INTEL_I960 ||
-		     bridge->device == PCI_DEVICE_ID_INTEL_I960RM)) {
-			printk(KERN_INFO DRV_NAME " %s: skipping Promise "
-				"PDC20265 attached to I2O RAID controller\n",
-				pci_name(dev));
-			return -ENODEV;
-		}
-	}
-
-	return ide_pci_init_one(dev, d, NULL);
-}
-
-static const struct pci_device_id pdc202xx_pci_tbl[] = {
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20246), 0 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20262), 1 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20263), 1 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20265), 2 },
-	{ PCI_VDEVICE(PROMISE, PCI_DEVICE_ID_PROMISE_20267), 2 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, pdc202xx_pci_tbl);
-
-static struct pci_driver pdc202xx_pci_driver = {
-	.name		= "Promise_Old_IDE",
-	.id_table	= pdc202xx_pci_tbl,
-	.probe		= pdc202xx_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init pdc202xx_ide_init(void)
-{
-	return ide_pci_register_driver(&pdc202xx_pci_driver);
-}
-
-static void __exit pdc202xx_ide_exit(void)
-{
-	pci_unregister_driver(&pdc202xx_pci_driver);
-}
-
-module_init(pdc202xx_ide_init);
-module_exit(pdc202xx_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick, Frank Tiernan, Bartlomiej Zolnierkiewicz");
-MODULE_DESCRIPTION("PCI driver module for older Promise IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/piix.c b/drivers/ide/piix.c
deleted file mode 100644
index a671cead6ae7..000000000000
--- a/drivers/ide/piix.c
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- *  Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer
- *  Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2003 Red Hat
- *  Copyright (C) 2006-2007 MontaVista Software, Inc. <source@mvista.com>
- *
- *  May be copied or modified under the terms of the GNU General Public License
- *
- * Documentation:
- *
- *	Publicly available from Intel web site. Errata documentation
- * is also publicly available. As an aide to anyone hacking on this
- * driver the list of errata that are relevant is below.going back to
- * PIIX4. Older device documentation is now a bit tricky to find.
- *
- * Errata of note:
- *
- * Unfixable
- *	PIIX4    errata #9	- Only on ultra obscure hw
- *	ICH3	 errata #13     - Not observed to affect real hw
- *				  by Intel
- *
- * Things we must deal with
- *	PIIX4	errata #10	- BM IDE hang with non UDMA
- *				  (must stop/start dma to recover)
- *	440MX   errata #15	- As PIIX4 errata #10
- *	PIIX4	errata #15	- Must not read control registers
- * 				  during a PIO transfer
- *	440MX   errata #13	- As PIIX4 errata #15
- *	ICH2	errata #21	- DMA mode 0 doesn't work right
- *	ICH0/1  errata #55	- As ICH2 errata #21
- *	ICH2	spec c #9	- Extra operations needed to handle
- *				  drive hotswap [NOT YET SUPPORTED]
- *	ICH2    spec c #20	- IDE PRD must not cross a 64K boundary
- *				  and must be dword aligned
- *	ICH2    spec c #24	- UDMA mode 4,5 t85/86 should be 6ns not 3.3
- *
- * Should have been BIOS fixed:
- *	450NX:	errata #19	- DMA hangs on old 450NX
- *	450NX:  errata #20	- DMA hangs on old 450NX
- *	450NX:  errata #25	- Corruption with DMA on old 450NX
- *	ICH3    errata #15      - IDE deadlock under high load
- *				  (BIOS must set dev 31 fn 0 bit 23)
- *	ICH3	errata #18	- Don't use native mode
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "piix"
-
-static int no_piix_dma;
-
-/**
- *	piix_set_pio_mode	-	set host controller for PIO mode
- *	@port: port
- *	@drive: drive
- *
- *	Set the interface PIO mode based upon the settings done by AMI BIOS.
- */
-
-static void piix_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	int is_slave		= drive->dn & 1;
-	int master_port		= hwif->channel ? 0x42 : 0x40;
-	int slave_port		= 0x44;
-	unsigned long flags;
-	u16 master_data;
-	u8 slave_data;
-	static DEFINE_SPINLOCK(tune_lock);
-	int control = 0;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-				     /* ISP  RTC */
-	static const u8 timings[][2]= {
-					{ 0, 0 },
-					{ 0, 0 },
-					{ 1, 0 },
-					{ 2, 1 },
-					{ 2, 3 }, };
-
-	/*
-	 * Master vs slave is synchronized above us but the slave register is
-	 * shared by the two hwifs so the corner case of two slave timeouts in
-	 * parallel must be locked.
-	 */
-	spin_lock_irqsave(&tune_lock, flags);
-	pci_read_config_word(dev, master_port, &master_data);
-
-	if (pio > 1)
-		control |= 1;	/* Programmable timing on */
-	if (drive->media == ide_disk)
-		control |= 4;	/* Prefetch, post write */
-	if (ide_pio_need_iordy(drive, pio))
-		control |= 2;	/* IORDY */
-	if (is_slave) {
-		master_data |=  0x4000;
-		master_data &= ~0x0070;
-		if (pio > 1) {
-			/* Set PPE, IE and TIME */
-			master_data |= control << 4;
-		}
-		pci_read_config_byte(dev, slave_port, &slave_data);
-		slave_data &= hwif->channel ? 0x0f : 0xf0;
-		slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) <<
-			       (hwif->channel ? 4 : 0);
-	} else {
-		master_data &= ~0x3307;
-		if (pio > 1) {
-			/* enable PPE, IE and TIME */
-			master_data |= control;
-		}
-		master_data |= (timings[pio][0] << 12) | (timings[pio][1] << 8);
-	}
-	pci_write_config_word(dev, master_port, master_data);
-	if (is_slave)
-		pci_write_config_byte(dev, slave_port, slave_data);
-	spin_unlock_irqrestore(&tune_lock, flags);
-}
-
-/**
- *	piix_set_dma_mode	-	set host controller for DMA mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Set a PIIX host controller to the desired DMA mode.  This involves
- *	programming the right timing data into the PCI configuration space.
- */
-
-static void piix_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u8 maslave		= hwif->channel ? 0x42 : 0x40;
-	int a_speed		= 3 << (drive->dn * 4);
-	int u_flag		= 1 << drive->dn;
-	int v_flag		= 0x01 << drive->dn;
-	int w_flag		= 0x10 << drive->dn;
-	int u_speed		= 0;
-	int			sitre;
-	u16			reg4042, reg4a;
-	u8			reg48, reg54, reg55;
-	const u8 speed		= drive->dma_mode;
-
-	pci_read_config_word(dev, maslave, &reg4042);
-	sitre = (reg4042 & 0x4000) ? 1 : 0;
-	pci_read_config_byte(dev, 0x48, &reg48);
-	pci_read_config_word(dev, 0x4a, &reg4a);
-	pci_read_config_byte(dev, 0x54, &reg54);
-	pci_read_config_byte(dev, 0x55, &reg55);
-
-	if (speed >= XFER_UDMA_0) {
-		u8 udma = speed - XFER_UDMA_0;
-
-		u_speed = min_t(u8, 2 - (udma & 1), udma) << (drive->dn * 4);
-
-		if (!(reg48 & u_flag))
-			pci_write_config_byte(dev, 0x48, reg48 | u_flag);
-		if (speed == XFER_UDMA_5) {
-			pci_write_config_byte(dev, 0x55, (u8) reg55|w_flag);
-		} else {
-			pci_write_config_byte(dev, 0x55, (u8) reg55 & ~w_flag);
-		}
-		if ((reg4a & a_speed) != u_speed)
-			pci_write_config_word(dev, 0x4a, (reg4a & ~a_speed) | u_speed);
-		if (speed > XFER_UDMA_2) {
-			if (!(reg54 & v_flag))
-				pci_write_config_byte(dev, 0x54, reg54 | v_flag);
-		} else
-			pci_write_config_byte(dev, 0x54, reg54 & ~v_flag);
-	} else {
-		const u8 mwdma_to_pio[] = { 0, 3, 4 };
-
-		if (reg48 & u_flag)
-			pci_write_config_byte(dev, 0x48, reg48 & ~u_flag);
-		if (reg4a & a_speed)
-			pci_write_config_word(dev, 0x4a, reg4a & ~a_speed);
-		if (reg54 & v_flag)
-			pci_write_config_byte(dev, 0x54, reg54 & ~v_flag);
-		if (reg55 & w_flag)
-			pci_write_config_byte(dev, 0x55, (u8) reg55 & ~w_flag);
-
-		if (speed >= XFER_MW_DMA_0)
-			drive->pio_mode =
-				mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0;
-		else
-			drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */
-
-		piix_set_pio_mode(hwif, drive);
-	}
-}
-
-/**
- *	init_chipset_ich	-	set up the ICH chipset
- *	@dev: PCI device to set up
- *
- *	Initialize the PCI device as required.  For the ICH this turns
- *	out to be nice and simple.
- */
-
-static int init_chipset_ich(struct pci_dev *dev)
-{
-	u32 extra = 0;
-
-	pci_read_config_dword(dev, 0x54, &extra);
-	pci_write_config_dword(dev, 0x54, extra | 0x400);
-
-	return 0;
-}
-
-/**
- *	ich_clear_irq	-	clear BMDMA status
- *	@drive: IDE drive
- *
- *	ICHx contollers set DMA INTR no matter DMA or PIO.
- *	BMDMA status might need to be cleared even for
- *	PIO interrupts to prevent spurious/lost IRQ.
- */
-static void ich_clear_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 dma_stat;
-
-	/*
-	 * ide_dma_end() needs BMDMA status for error checking.
-	 * So, skip clearing BMDMA status here and leave it
-	 * to ide_dma_end() if this is DMA interrupt.
-	 */
-	if (drive->waiting_for_dma || hwif->dma_base == 0)
-		return;
-
-	/* clear the INTR & ERROR bits */
-	dma_stat = inb(hwif->dma_base + ATA_DMA_STATUS);
-	/* Should we force the bit as well ? */
-	outb(dma_stat, hwif->dma_base + ATA_DMA_STATUS);
-}
-
-struct ich_laptop {
-	u16 device;
-	u16 subvendor;
-	u16 subdevice;
-};
-
-/*
- *	List of laptops that use short cables rather than 80 wire
- */
-
-static const struct ich_laptop ich_laptop[] = {
-	/* devid, subvendor, subdev */
-	{ 0x27DF, 0x1025, 0x0102 },	/* ICH7 on Acer 5602aWLMi */
-	{ 0x27DF, 0x0005, 0x0280 },	/* ICH7 on Acer 5602WLMi */
-	{ 0x27DF, 0x1025, 0x0110 },	/* ICH7 on Acer 3682WLMi */
-	{ 0x27DF, 0x1043, 0x1267 },	/* ICH7 on Asus W5F */
-	{ 0x27DF, 0x103C, 0x30A1 },	/* ICH7 on HP Compaq nc2400 */
-	{ 0x27DF, 0x1071, 0xD221 },	/* ICH7 on Hercules EC-900 */
-	{ 0x24CA, 0x1025, 0x0061 },	/* ICH4 on Acer Aspire 2023WLMi */
-	{ 0x24CA, 0x1025, 0x003d },	/* ICH4 on ACER TM290 */
-	{ 0x266F, 0x1025, 0x0066 },	/* ICH6 on ACER Aspire 1694WLMi */
-	{ 0x2653, 0x1043, 0x82D8 },	/* ICH6M on Asus Eee 701 */
-	{ 0x27df, 0x104d, 0x900e },	/* ICH7 on Sony TZ-90 */
-	/* end marker */
-	{ 0, }
-};
-
-static u8 piix_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	const struct ich_laptop *lap = &ich_laptop[0];
-	u8 reg54h = 0, mask = hwif->channel ? 0xc0 : 0x30;
-
-	/* check for specials */
-	while (lap->device) {
-		if (lap->device == pdev->device &&
-		    lap->subvendor == pdev->subsystem_vendor &&
-		    lap->subdevice == pdev->subsystem_device) {
-			return ATA_CBL_PATA40_SHORT;
-		}
-		lap++;
-	}
-
-	pci_read_config_byte(pdev, 0x54, &reg54h);
-
-	return (reg54h & mask) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
-}
-
-/**
- *	init_hwif_piix		-	fill in the hwif for the PIIX
- *	@hwif: IDE interface
- *
- *	Set up the ide_hwif_t for the PIIX interface according to the
- *	capabilities of the hardware.
- */
-
-static void init_hwif_piix(ide_hwif_t *hwif)
-{
-	if (!hwif->dma_base)
-		return;
-
-	if (no_piix_dma)
-		hwif->ultra_mask = hwif->mwdma_mask = hwif->swdma_mask = 0;
-}
-
-static const struct ide_port_ops piix_port_ops = {
-	.set_pio_mode		= piix_set_pio_mode,
-	.set_dma_mode		= piix_set_dma_mode,
-	.cable_detect		= piix_cable_detect,
-};
-
-static const struct ide_port_ops ich_port_ops = {
-	.set_pio_mode		= piix_set_pio_mode,
-	.set_dma_mode		= piix_set_dma_mode,
-	.clear_irq		= ich_clear_irq,
-	.cable_detect		= piix_cable_detect,
-};
-
-#define DECLARE_PIIX_DEV(udma) \
-	{						\
-		.name		= DRV_NAME,		\
-		.init_hwif	= init_hwif_piix,	\
-		.enablebits	= {{0x41,0x80,0x80}, {0x43,0x80,0x80}}, \
-		.port_ops	= &piix_port_ops,	\
-		.pio_mask	= ATA_PIO4,		\
-		.swdma_mask	= ATA_SWDMA2_ONLY,	\
-		.mwdma_mask	= ATA_MWDMA12_ONLY,	\
-		.udma_mask	= udma,			\
-	}
-
-#define DECLARE_ICH_DEV(mwdma, udma) \
-	{ \
-		.name		= DRV_NAME, \
-		.init_chipset	= init_chipset_ich, \
-		.init_hwif	= init_hwif_piix, \
-		.enablebits	= {{0x41,0x80,0x80}, {0x43,0x80,0x80}}, \
-		.port_ops	= &ich_port_ops, \
-		.pio_mask	= ATA_PIO4, \
-		.swdma_mask	= ATA_SWDMA2_ONLY, \
-		.mwdma_mask	= mwdma, \
-		.udma_mask	= udma, \
-	}
-
-static const struct ide_port_info piix_pci_info[] = {
-	/* 0: MPIIX */
-	{	/*
-		 * MPIIX actually has only a single IDE channel mapped to
-		 * the primary or secondary ports depending on the value
-		 * of the bit 14 of the IDETIM register at offset 0x6c
-		 */
-		.name		= DRV_NAME,
-		.enablebits	= {{0x6d,0xc0,0x80}, {0x6d,0xc0,0xc0}},
-		.host_flags	= IDE_HFLAG_ISA_PORTS | IDE_HFLAG_NO_DMA,
-		.pio_mask	= ATA_PIO4,
-		/* This is a painful system best to let it self tune for now */
-	},
-	/* 1: PIIXa/PIIXb/PIIX3 */
-	DECLARE_PIIX_DEV(0x00), /* no udma */
-	/* 2: PIIX4 */
-	DECLARE_PIIX_DEV(ATA_UDMA2),
-	/* 3: ICH0 */
-	DECLARE_ICH_DEV(ATA_MWDMA12_ONLY, ATA_UDMA2),
-	/* 4: ICH */
-	DECLARE_ICH_DEV(ATA_MWDMA12_ONLY, ATA_UDMA4),
-	/* 5: PIIX4 */
-	DECLARE_PIIX_DEV(ATA_UDMA4),
-	/* 6: ICH[2-6]/ICH[2-3]M/C-ICH/ICH5-SATA/ESB2/ICH8M */
-	DECLARE_ICH_DEV(ATA_MWDMA12_ONLY, ATA_UDMA5),
-	/* 7: ICH7/7-R, no MWDMA1 */
-	DECLARE_ICH_DEV(ATA_MWDMA2_ONLY, ATA_UDMA5),
-};
-
-/**
- *	piix_init_one	-	called when a PIIX is found
- *	@dev: the piix device
- *	@id: the matching pci id
- *
- *	Called when the PCI registration layer (or the IDE initialization)
- *	finds a device matching our IDE device tables.
- */
- 
-static int piix_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &piix_pci_info[id->driver_data], NULL);
-}
-
-/**
- *	piix_check_450nx	-	Check for problem 450NX setup
- *	
- *	Check for the present of 450NX errata #19 and errata #25. If
- *	they are found, disable use of DMA IDE
- */
-
-static void piix_check_450nx(void)
-{
-	struct pci_dev *pdev = NULL;
-	u16 cfg;
-	while((pdev=pci_get_device(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454NX, pdev))!=NULL)
-	{
-		/* Look for 450NX PXB. Check for problem configurations
-		   A PCI quirk checks bit 6 already */
-		pci_read_config_word(pdev, 0x41, &cfg);
-		/* Only on the original revision: IDE DMA can hang */
-		if (pdev->revision == 0x00)
-			no_piix_dma = 1;
-		/* On all revisions below 5 PXB bus lock must be disabled for IDE */
-		else if (cfg & (1<<14) && pdev->revision < 5)
-			no_piix_dma = 2;
-	}
-	if(no_piix_dma)
-		printk(KERN_WARNING DRV_NAME ": 450NX errata present, disabling IDE DMA.\n");
-	if(no_piix_dma == 2)
-		printk(KERN_WARNING DRV_NAME ": A BIOS update may resolve this.\n");
-}		
-
-static const struct pci_device_id piix_pci_tbl[] = {
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82371FB_0),  1 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82371FB_1),  1 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82371MX),    0 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82371SB_1),  1 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82371AB),    2 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801AB_1),  3 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82443MX_1),  2 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801AA_1),  4 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82372FB_1),  5 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82451NX),    2 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801BA_9),  6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801BA_8),  6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801CA_10), 6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801CA_11), 6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801DB_11), 6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801EB_11), 6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801E_11),  6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801DB_10), 6 },
-#ifdef CONFIG_BLK_DEV_IDE_SATA
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801EB_1),  6 },
-#endif
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ESB_2),      6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ICH6_19),    6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ICH7_21),    7 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_82801DB_1),  6 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ESB2_18),    7 },
-	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ICH8_6),     6 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, piix_pci_tbl);
-
-static struct pci_driver piix_pci_driver = {
-	.name		= "PIIX_IDE",
-	.id_table	= piix_pci_tbl,
-	.probe		= piix_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init piix_ide_init(void)
-{
-	piix_check_450nx();
-	return ide_pci_register_driver(&piix_pci_driver);
-}
-
-static void __exit piix_ide_exit(void)
-{
-	pci_unregister_driver(&piix_pci_driver);
-}
-
-module_init(piix_ide_init);
-module_exit(piix_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick, Andrzej Krzysztofowicz");
-MODULE_DESCRIPTION("PCI driver module for Intel PIIX IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
deleted file mode 100644
index ea0b064b5f56..000000000000
--- a/drivers/ide/pmac.c
+++ /dev/null
@@ -1,1703 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Support for IDE interfaces on PowerMacs.
- *
- * These IDE interfaces are memory-mapped and have a DBDMA channel
- * for doing DMA.
- *
- *  Copyright (C) 1998-2003 Paul Mackerras & Ben. Herrenschmidt
- *  Copyright (C) 2007-2008 Bartlomiej Zolnierkiewicz
- *
- * Some code taken from drivers/ide/ide-dma.c:
- *
- *  Copyright (c) 1995-1998  Mark Lord
- *
- * TODO: - Use pre-calculated (kauai) timing tables all the time and
- * get rid of the "rounded" tables used previously, so we have the
- * same table format for all controllers and can then just have one
- * big table
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/ide.h>
-#include <linux/notifier.h>
-#include <linux/module.h>
-#include <linux/reboot.h>
-#include <linux/pci.h>
-#include <linux/adb.h>
-#include <linux/pmu.h>
-#include <linux/scatterlist.h>
-#include <linux/slab.h>
-
-#include <asm/prom.h>
-#include <asm/io.h>
-#include <asm/dbdma.h>
-#include <asm/ide.h>
-#include <asm/machdep.h>
-#include <asm/pmac_feature.h>
-#include <asm/sections.h>
-#include <asm/irq.h>
-#include <asm/mediabay.h>
-
-#define DRV_NAME "ide-pmac"
-
-#undef IDE_PMAC_DEBUG
-
-#define DMA_WAIT_TIMEOUT	50
-
-typedef struct pmac_ide_hwif {
-	unsigned long			regbase;
-	int				irq;
-	int				kind;
-	int				aapl_bus_id;
-	unsigned			broken_dma : 1;
-	unsigned			broken_dma_warn : 1;
-	struct device_node*		node;
-	struct macio_dev		*mdev;
-	u32				timings[4];
-	volatile u32 __iomem *		*kauai_fcr;
-	ide_hwif_t			*hwif;
-
-	/* Those fields are duplicating what is in hwif. We currently
-	 * can't use the hwif ones because of some assumptions that are
-	 * beeing done by the generic code about the kind of dma controller
-	 * and format of the dma table. This will have to be fixed though.
-	 */
-	volatile struct dbdma_regs __iomem *	dma_regs;
-	struct dbdma_cmd*		dma_table_cpu;
-} pmac_ide_hwif_t;
-
-enum {
-	controller_ohare,	/* OHare based */
-	controller_heathrow,	/* Heathrow/Paddington */
-	controller_kl_ata3,	/* KeyLargo ATA-3 */
-	controller_kl_ata4,	/* KeyLargo ATA-4 */
-	controller_un_ata6,	/* UniNorth2 ATA-6 */
-	controller_k2_ata6,	/* K2 ATA-6 */
-	controller_sh_ata6,	/* Shasta ATA-6 */
-};
-
-static const char* model_name[] = {
-	"OHare ATA",		/* OHare based */
-	"Heathrow ATA",		/* Heathrow/Paddington */
-	"KeyLargo ATA-3",	/* KeyLargo ATA-3 (MDMA only) */
-	"KeyLargo ATA-4",	/* KeyLargo ATA-4 (UDMA/66) */
-	"UniNorth ATA-6",	/* UniNorth2 ATA-6 (UDMA/100) */
-	"K2 ATA-6",		/* K2 ATA-6 (UDMA/100) */
-	"Shasta ATA-6",		/* Shasta ATA-6 (UDMA/133) */
-};
-
-/*
- * Extra registers, both 32-bit little-endian
- */
-#define IDE_TIMING_CONFIG	0x200
-#define IDE_INTERRUPT		0x300
-
-/* Kauai (U2) ATA has different register setup */
-#define IDE_KAUAI_PIO_CONFIG	0x200
-#define IDE_KAUAI_ULTRA_CONFIG	0x210
-#define IDE_KAUAI_POLL_CONFIG	0x220
-
-/*
- * Timing configuration register definitions
- */
-
-/* Number of IDE_SYSCLK_NS ticks, argument is in nanoseconds */
-#define SYSCLK_TICKS(t)		(((t) + IDE_SYSCLK_NS - 1) / IDE_SYSCLK_NS)
-#define SYSCLK_TICKS_66(t)	(((t) + IDE_SYSCLK_66_NS - 1) / IDE_SYSCLK_66_NS)
-#define IDE_SYSCLK_NS		30	/* 33Mhz cell */
-#define IDE_SYSCLK_66_NS	15	/* 66Mhz cell */
-
-/* 133Mhz cell, found in shasta.
- * See comments about 100 Mhz Uninorth 2...
- * Note that PIO_MASK and MDMA_MASK seem to overlap
- */
-#define TR_133_PIOREG_PIO_MASK		0xff000fff
-#define TR_133_PIOREG_MDMA_MASK		0x00fff800
-#define TR_133_UDMAREG_UDMA_MASK	0x0003ffff
-#define TR_133_UDMAREG_UDMA_EN		0x00000001
-
-/* 100Mhz cell, found in Uninorth 2. I don't have much infos about
- * this one yet, it appears as a pci device (106b/0033) on uninorth
- * internal PCI bus and it's clock is controlled like gem or fw. It
- * appears to be an evolution of keylargo ATA4 with a timing register
- * extended to 2 32bits registers and a similar DBDMA channel. Other
- * registers seem to exist but I can't tell much about them.
- * 
- * So far, I'm using pre-calculated tables for this extracted from
- * the values used by the MacOS X driver.
- * 
- * The "PIO" register controls PIO and MDMA timings, the "ULTRA"
- * register controls the UDMA timings. At least, it seems bit 0
- * of this one enables UDMA vs. MDMA, and bits 4..7 are the
- * cycle time in units of 10ns. Bits 8..15 are used by I don't
- * know their meaning yet
- */
-#define TR_100_PIOREG_PIO_MASK		0xff000fff
-#define TR_100_PIOREG_MDMA_MASK		0x00fff000
-#define TR_100_UDMAREG_UDMA_MASK	0x0000ffff
-#define TR_100_UDMAREG_UDMA_EN		0x00000001
-
-
-/* 66Mhz cell, found in KeyLargo. Can do ultra mode 0 to 2 on
- * 40 connector cable and to 4 on 80 connector one.
- * Clock unit is 15ns (66Mhz)
- * 
- * 3 Values can be programmed:
- *  - Write data setup, which appears to match the cycle time. They
- *    also call it DIOW setup.
- *  - Ready to pause time (from spec)
- *  - Address setup. That one is weird. I don't see where exactly
- *    it fits in UDMA cycles, I got it's name from an obscure piece
- *    of commented out code in Darwin. They leave it to 0, we do as
- *    well, despite a comment that would lead to think it has a
- *    min value of 45ns.
- * Apple also add 60ns to the write data setup (or cycle time ?) on
- * reads.
- */
-#define TR_66_UDMA_MASK			0xfff00000
-#define TR_66_UDMA_EN			0x00100000 /* Enable Ultra mode for DMA */
-#define TR_66_UDMA_ADDRSETUP_MASK	0xe0000000 /* Address setup */
-#define TR_66_UDMA_ADDRSETUP_SHIFT	29
-#define TR_66_UDMA_RDY2PAUS_MASK	0x1e000000 /* Ready 2 pause time */
-#define TR_66_UDMA_RDY2PAUS_SHIFT	25
-#define TR_66_UDMA_WRDATASETUP_MASK	0x01e00000 /* Write data setup time */
-#define TR_66_UDMA_WRDATASETUP_SHIFT	21
-#define TR_66_MDMA_MASK			0x000ffc00
-#define TR_66_MDMA_RECOVERY_MASK	0x000f8000
-#define TR_66_MDMA_RECOVERY_SHIFT	15
-#define TR_66_MDMA_ACCESS_MASK		0x00007c00
-#define TR_66_MDMA_ACCESS_SHIFT		10
-#define TR_66_PIO_MASK			0x000003ff
-#define TR_66_PIO_RECOVERY_MASK		0x000003e0
-#define TR_66_PIO_RECOVERY_SHIFT	5
-#define TR_66_PIO_ACCESS_MASK		0x0000001f
-#define TR_66_PIO_ACCESS_SHIFT		0
-
-/* 33Mhz cell, found in OHare, Heathrow (& Paddington) and KeyLargo
- * Can do pio & mdma modes, clock unit is 30ns (33Mhz)
- * 
- * The access time and recovery time can be programmed. Some older
- * Darwin code base limit OHare to 150ns cycle time. I decided to do
- * the same here fore safety against broken old hardware ;)
- * The HalfTick bit, when set, adds half a clock (15ns) to the access
- * time and removes one from recovery. It's not supported on KeyLargo
- * implementation afaik. The E bit appears to be set for PIO mode 0 and
- * is used to reach long timings used in this mode.
- */
-#define TR_33_MDMA_MASK			0x003ff800
-#define TR_33_MDMA_RECOVERY_MASK	0x001f0000
-#define TR_33_MDMA_RECOVERY_SHIFT	16
-#define TR_33_MDMA_ACCESS_MASK		0x0000f800
-#define TR_33_MDMA_ACCESS_SHIFT		11
-#define TR_33_MDMA_HALFTICK		0x00200000
-#define TR_33_PIO_MASK			0x000007ff
-#define TR_33_PIO_E			0x00000400
-#define TR_33_PIO_RECOVERY_MASK		0x000003e0
-#define TR_33_PIO_RECOVERY_SHIFT	5
-#define TR_33_PIO_ACCESS_MASK		0x0000001f
-#define TR_33_PIO_ACCESS_SHIFT		0
-
-/*
- * Interrupt register definitions
- */
-#define IDE_INTR_DMA			0x80000000
-#define IDE_INTR_DEVICE			0x40000000
-
-/*
- * FCR Register on Kauai. Not sure what bit 0x4 is  ...
- */
-#define KAUAI_FCR_UATA_MAGIC		0x00000004
-#define KAUAI_FCR_UATA_RESET_N		0x00000002
-#define KAUAI_FCR_UATA_ENABLE		0x00000001
-
-/* Rounded Multiword DMA timings
- * 
- * I gave up finding a generic formula for all controller
- * types and instead, built tables based on timing values
- * used by Apple in Darwin's implementation.
- */
-struct mdma_timings_t {
-	int	accessTime;
-	int	recoveryTime;
-	int	cycleTime;
-};
-
-struct mdma_timings_t mdma_timings_33[] =
-{
-    { 240, 240, 480 },
-    { 180, 180, 360 },
-    { 135, 135, 270 },
-    { 120, 120, 240 },
-    { 105, 105, 210 },
-    {  90,  90, 180 },
-    {  75,  75, 150 },
-    {  75,  45, 120 },
-    {   0,   0,   0 }
-};
-
-struct mdma_timings_t mdma_timings_33k[] =
-{
-    { 240, 240, 480 },
-    { 180, 180, 360 },
-    { 150, 150, 300 },
-    { 120, 120, 240 },
-    {  90, 120, 210 },
-    {  90,  90, 180 },
-    {  90,  60, 150 },
-    {  90,  30, 120 },
-    {   0,   0,   0 }
-};
-
-struct mdma_timings_t mdma_timings_66[] =
-{
-    { 240, 240, 480 },
-    { 180, 180, 360 },
-    { 135, 135, 270 },
-    { 120, 120, 240 },
-    { 105, 105, 210 },
-    {  90,  90, 180 },
-    {  90,  75, 165 },
-    {  75,  45, 120 },
-    {   0,   0,   0 }
-};
-
-/* KeyLargo ATA-4 Ultra DMA timings (rounded) */
-struct {
-	int	addrSetup; /* ??? */
-	int	rdy2pause;
-	int	wrDataSetup;
-} kl66_udma_timings[] =
-{
-    {   0, 180,  120 },	/* Mode 0 */
-    {   0, 150,  90 },	/*      1 */
-    {   0, 120,  60 },	/*      2 */
-    {   0, 90,   45 },	/*      3 */
-    {   0, 90,   30 }	/*      4 */
-};
-
-/* UniNorth 2 ATA/100 timings */
-struct kauai_timing {
-	int	cycle_time;
-	u32	timing_reg;
-};
-
-static struct kauai_timing	kauai_pio_timings[] =
-{
-	{ 930	, 0x08000fff },
-	{ 600	, 0x08000a92 },
-	{ 383	, 0x0800060f },
-	{ 360	, 0x08000492 },
-	{ 330	, 0x0800048f },
-	{ 300	, 0x080003cf },
-	{ 270	, 0x080003cc },
-	{ 240	, 0x0800038b },
-	{ 239	, 0x0800030c },
-	{ 180	, 0x05000249 },
-	{ 120	, 0x04000148 },
-	{ 0	, 0 },
-};
-
-static struct kauai_timing	kauai_mdma_timings[] =
-{
-	{ 1260	, 0x00fff000 },
-	{ 480	, 0x00618000 },
-	{ 360	, 0x00492000 },
-	{ 270	, 0x0038e000 },
-	{ 240	, 0x0030c000 },
-	{ 210	, 0x002cb000 },
-	{ 180	, 0x00249000 },
-	{ 150	, 0x00209000 },
-	{ 120	, 0x00148000 },
-	{ 0	, 0 },
-};
-
-static struct kauai_timing	kauai_udma_timings[] =
-{
-	{ 120	, 0x000070c0 },
-	{ 90	, 0x00005d80 },
-	{ 60	, 0x00004a60 },
-	{ 45	, 0x00003a50 },
-	{ 30	, 0x00002a30 },
-	{ 20	, 0x00002921 },
-	{ 0	, 0 },
-};
-
-static struct kauai_timing	shasta_pio_timings[] =
-{
-	{ 930	, 0x08000fff },
-	{ 600	, 0x0A000c97 },
-	{ 383	, 0x07000712 },
-	{ 360	, 0x040003cd },
-	{ 330	, 0x040003cd },
-	{ 300	, 0x040003cd },
-	{ 270	, 0x040003cd },
-	{ 240	, 0x040003cd },
-	{ 239	, 0x040003cd },
-	{ 180	, 0x0400028b },
-	{ 120	, 0x0400010a },
-	{ 0	, 0 },
-};
-
-static struct kauai_timing	shasta_mdma_timings[] =
-{
-	{ 1260	, 0x00fff000 },
-	{ 480	, 0x00820800 },
-	{ 360	, 0x00820800 },
-	{ 270	, 0x00820800 },
-	{ 240	, 0x00820800 },
-	{ 210	, 0x00820800 },
-	{ 180	, 0x00820800 },
-	{ 150	, 0x0028b000 },
-	{ 120	, 0x001ca000 },
-	{ 0	, 0 },
-};
-
-static struct kauai_timing	shasta_udma133_timings[] =
-{
-	{ 120   , 0x00035901, },
-	{ 90    , 0x000348b1, },
-	{ 60    , 0x00033881, },
-	{ 45    , 0x00033861, },
-	{ 30    , 0x00033841, },
-	{ 20    , 0x00033031, },
-	{ 15    , 0x00033021, },
-	{ 0	, 0 },
-};
-
-
-static inline u32
-kauai_lookup_timing(struct kauai_timing* table, int cycle_time)
-{
-	int i;
-	
-	for (i=0; table[i].cycle_time; i++)
-		if (cycle_time > table[i+1].cycle_time)
-			return table[i].timing_reg;
-	BUG();
-	return 0;
-}
-
-/* allow up to 256 DBDMA commands per xfer */
-#define MAX_DCMDS		256
-
-/* 
- * Wait 1s for disk to answer on IDE bus after a hard reset
- * of the device (via GPIO/FCR).
- * 
- * Some devices seem to "pollute" the bus even after dropping
- * the BSY bit (typically some combo drives slave on the UDMA
- * bus) after a hard reset. Since we hard reset all drives on
- * KeyLargo ATA66, we have to keep that delay around. I may end
- * up not hard resetting anymore on these and keep the delay only
- * for older interfaces instead (we have to reset when coming
- * from MacOS...) --BenH. 
- */
-#define IDE_WAKEUP_DELAY	(1*HZ)
-
-static int pmac_ide_init_dma(ide_hwif_t *, const struct ide_port_info *);
-
-#define PMAC_IDE_REG(x) \
-	((void __iomem *)((drive)->hwif->io_ports.data_addr + (x)))
-
-/*
- * Apply the timings of the proper unit (master/slave) to the shared
- * timing register when selecting that unit. This version is for
- * ASICs with a single timing register
- */
-static void pmac_ide_apply_timings(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-
-	if (drive->dn & 1)
-		writel(pmif->timings[1], PMAC_IDE_REG(IDE_TIMING_CONFIG));
-	else
-		writel(pmif->timings[0], PMAC_IDE_REG(IDE_TIMING_CONFIG));
-	(void)readl(PMAC_IDE_REG(IDE_TIMING_CONFIG));
-}
-
-/*
- * Apply the timings of the proper unit (master/slave) to the shared
- * timing register when selecting that unit. This version is for
- * ASICs with a dual timing register (Kauai)
- */
-static void pmac_ide_kauai_apply_timings(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-
-	if (drive->dn & 1) {
-		writel(pmif->timings[1], PMAC_IDE_REG(IDE_KAUAI_PIO_CONFIG));
-		writel(pmif->timings[3], PMAC_IDE_REG(IDE_KAUAI_ULTRA_CONFIG));
-	} else {
-		writel(pmif->timings[0], PMAC_IDE_REG(IDE_KAUAI_PIO_CONFIG));
-		writel(pmif->timings[2], PMAC_IDE_REG(IDE_KAUAI_ULTRA_CONFIG));
-	}
-	(void)readl(PMAC_IDE_REG(IDE_KAUAI_PIO_CONFIG));
-}
-
-/*
- * Force an update of controller timing values for a given drive
- */
-static void
-pmac_ide_do_update_timings(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-
-	if (pmif->kind == controller_sh_ata6 ||
-	    pmif->kind == controller_un_ata6 ||
-	    pmif->kind == controller_k2_ata6)
-		pmac_ide_kauai_apply_timings(drive);
-	else
-		pmac_ide_apply_timings(drive);
-}
-
-static void pmac_dev_select(ide_drive_t *drive)
-{
-	pmac_ide_apply_timings(drive);
-
-	writeb(drive->select | ATA_DEVICE_OBS,
-	       (void __iomem *)drive->hwif->io_ports.device_addr);
-}
-
-static void pmac_kauai_dev_select(ide_drive_t *drive)
-{
-	pmac_ide_kauai_apply_timings(drive);
-
-	writeb(drive->select | ATA_DEVICE_OBS,
-	       (void __iomem *)drive->hwif->io_ports.device_addr);
-}
-
-static void pmac_exec_command(ide_hwif_t *hwif, u8 cmd)
-{
-	writeb(cmd, (void __iomem *)hwif->io_ports.command_addr);
-	(void)readl((void __iomem *)(hwif->io_ports.data_addr
-				     + IDE_TIMING_CONFIG));
-}
-
-static void pmac_write_devctl(ide_hwif_t *hwif, u8 ctl)
-{
-	writeb(ctl, (void __iomem *)hwif->io_ports.ctl_addr);
-	(void)readl((void __iomem *)(hwif->io_ports.data_addr
-				     + IDE_TIMING_CONFIG));
-}
-
-/*
- * Old tuning functions (called on hdparm -p), sets up drive PIO timings
- */
-static void pmac_ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-	struct ide_timing *tim = ide_timing_find_mode(XFER_PIO_0 + pio);
-	u32 *timings, t;
-	unsigned accessTicks, recTicks;
-	unsigned accessTime, recTime;
-	unsigned int cycle_time;
-
-	/* which drive is it ? */
-	timings = &pmif->timings[drive->dn & 1];
-	t = *timings;
-
-	cycle_time = ide_pio_cycle_time(drive, pio);
-
-	switch (pmif->kind) {
-	case controller_sh_ata6: {
-		/* 133Mhz cell */
-		u32 tr = kauai_lookup_timing(shasta_pio_timings, cycle_time);
-		t = (t & ~TR_133_PIOREG_PIO_MASK) | tr;
-		break;
-		}
-	case controller_un_ata6:
-	case controller_k2_ata6: {
-		/* 100Mhz cell */
-		u32 tr = kauai_lookup_timing(kauai_pio_timings, cycle_time);
-		t = (t & ~TR_100_PIOREG_PIO_MASK) | tr;
-		break;
-		}
-	case controller_kl_ata4:
-		/* 66Mhz cell */
-		recTime = cycle_time - tim->active - tim->setup;
-		recTime = max(recTime, 150U);
-		accessTime = tim->active;
-		accessTime = max(accessTime, 150U);
-		accessTicks = SYSCLK_TICKS_66(accessTime);
-		accessTicks = min(accessTicks, 0x1fU);
-		recTicks = SYSCLK_TICKS_66(recTime);
-		recTicks = min(recTicks, 0x1fU);
-		t = (t & ~TR_66_PIO_MASK) |
-			(accessTicks << TR_66_PIO_ACCESS_SHIFT) |
-			(recTicks << TR_66_PIO_RECOVERY_SHIFT);
-		break;
-	default: {
-		/* 33Mhz cell */
-		int ebit = 0;
-		recTime = cycle_time - tim->active - tim->setup;
-		recTime = max(recTime, 150U);
-		accessTime = tim->active;
-		accessTime = max(accessTime, 150U);
-		accessTicks = SYSCLK_TICKS(accessTime);
-		accessTicks = min(accessTicks, 0x1fU);
-		accessTicks = max(accessTicks, 4U);
-		recTicks = SYSCLK_TICKS(recTime);
-		recTicks = min(recTicks, 0x1fU);
-		recTicks = max(recTicks, 5U) - 4;
-		if (recTicks > 9) {
-			recTicks--; /* guess, but it's only for PIO0, so... */
-			ebit = 1;
-		}
-		t = (t & ~TR_33_PIO_MASK) |
-				(accessTicks << TR_33_PIO_ACCESS_SHIFT) |
-				(recTicks << TR_33_PIO_RECOVERY_SHIFT);
-		if (ebit)
-			t |= TR_33_PIO_E;
-		break;
-		}
-	}
-
-#ifdef IDE_PMAC_DEBUG
-	printk(KERN_ERR "%s: Set PIO timing for mode %d, reg: 0x%08x\n",
-		drive->name, pio,  *timings);
-#endif	
-
-	*timings = t;
-	pmac_ide_do_update_timings(drive);
-}
-
-/*
- * Calculate KeyLargo ATA/66 UDMA timings
- */
-static int
-set_timings_udma_ata4(u32 *timings, u8 speed)
-{
-	unsigned rdyToPauseTicks, wrDataSetupTicks, addrTicks;
-
-	if (speed > XFER_UDMA_4)
-		return 1;
-
-	rdyToPauseTicks = SYSCLK_TICKS_66(kl66_udma_timings[speed & 0xf].rdy2pause);
-	wrDataSetupTicks = SYSCLK_TICKS_66(kl66_udma_timings[speed & 0xf].wrDataSetup);
-	addrTicks = SYSCLK_TICKS_66(kl66_udma_timings[speed & 0xf].addrSetup);
-
-	*timings = ((*timings) & ~(TR_66_UDMA_MASK | TR_66_MDMA_MASK)) |
-			(wrDataSetupTicks << TR_66_UDMA_WRDATASETUP_SHIFT) | 
-			(rdyToPauseTicks << TR_66_UDMA_RDY2PAUS_SHIFT) |
-			(addrTicks <<TR_66_UDMA_ADDRSETUP_SHIFT) |
-			TR_66_UDMA_EN;
-#ifdef IDE_PMAC_DEBUG
-	printk(KERN_ERR "ide_pmac: Set UDMA timing for mode %d, reg: 0x%08x\n",
-		speed & 0xf,  *timings);
-#endif	
-
-	return 0;
-}
-
-/*
- * Calculate Kauai ATA/100 UDMA timings
- */
-static int
-set_timings_udma_ata6(u32 *pio_timings, u32 *ultra_timings, u8 speed)
-{
-	struct ide_timing *t = ide_timing_find_mode(speed);
-	u32 tr;
-
-	if (speed > XFER_UDMA_5 || t == NULL)
-		return 1;
-	tr = kauai_lookup_timing(kauai_udma_timings, (int)t->udma);
-	*ultra_timings = ((*ultra_timings) & ~TR_100_UDMAREG_UDMA_MASK) | tr;
-	*ultra_timings = (*ultra_timings) | TR_100_UDMAREG_UDMA_EN;
-
-	return 0;
-}
-
-/*
- * Calculate Shasta ATA/133 UDMA timings
- */
-static int
-set_timings_udma_shasta(u32 *pio_timings, u32 *ultra_timings, u8 speed)
-{
-	struct ide_timing *t = ide_timing_find_mode(speed);
-	u32 tr;
-
-	if (speed > XFER_UDMA_6 || t == NULL)
-		return 1;
-	tr = kauai_lookup_timing(shasta_udma133_timings, (int)t->udma);
-	*ultra_timings = ((*ultra_timings) & ~TR_133_UDMAREG_UDMA_MASK) | tr;
-	*ultra_timings = (*ultra_timings) | TR_133_UDMAREG_UDMA_EN;
-
-	return 0;
-}
-
-/*
- * Calculate MDMA timings for all cells
- */
-static void
-set_timings_mdma(ide_drive_t *drive, int intf_type, u32 *timings, u32 *timings2,
-		 	u8 speed)
-{
-	u16 *id = drive->id;
-	int cycleTime, accessTime = 0, recTime = 0;
-	unsigned accessTicks, recTicks;
-	struct mdma_timings_t* tm = NULL;
-	int i;
-
-	/* Get default cycle time for mode */
-	switch(speed & 0xf) {
-		case 0: cycleTime = 480; break;
-		case 1: cycleTime = 150; break;
-		case 2: cycleTime = 120; break;
-		default:
-			BUG();
-			break;
-	}
-
-	/* Check if drive provides explicit DMA cycle time */
-	if ((id[ATA_ID_FIELD_VALID] & 2) && id[ATA_ID_EIDE_DMA_TIME])
-		cycleTime = max_t(int, id[ATA_ID_EIDE_DMA_TIME], cycleTime);
-
-	/* OHare limits according to some old Apple sources */	
-	if ((intf_type == controller_ohare) && (cycleTime < 150))
-		cycleTime = 150;
-	/* Get the proper timing array for this controller */
-	switch(intf_type) {
-	        case controller_sh_ata6:
-		case controller_un_ata6:
-		case controller_k2_ata6:
-			break;
-		case controller_kl_ata4:
-			tm = mdma_timings_66;
-			break;
-		case controller_kl_ata3:
-			tm = mdma_timings_33k;
-			break;
-		default:
-			tm = mdma_timings_33;
-			break;
-	}
-	if (tm != NULL) {
-		/* Lookup matching access & recovery times */
-		i = -1;
-		for (;;) {
-			if (tm[i+1].cycleTime < cycleTime)
-				break;
-			i++;
-		}
-		cycleTime = tm[i].cycleTime;
-		accessTime = tm[i].accessTime;
-		recTime = tm[i].recoveryTime;
-
-#ifdef IDE_PMAC_DEBUG
-		printk(KERN_ERR "%s: MDMA, cycleTime: %d, accessTime: %d, recTime: %d\n",
-			drive->name, cycleTime, accessTime, recTime);
-#endif
-	}
-	switch(intf_type) {
-	case controller_sh_ata6: {
-		/* 133Mhz cell */
-		u32 tr = kauai_lookup_timing(shasta_mdma_timings, cycleTime);
-		*timings = ((*timings) & ~TR_133_PIOREG_MDMA_MASK) | tr;
-		*timings2 = (*timings2) & ~TR_133_UDMAREG_UDMA_EN;
-		}
-		break;
-	case controller_un_ata6:
-	case controller_k2_ata6: {
-		/* 100Mhz cell */
-		u32 tr = kauai_lookup_timing(kauai_mdma_timings, cycleTime);
-		*timings = ((*timings) & ~TR_100_PIOREG_MDMA_MASK) | tr;
-		*timings2 = (*timings2) & ~TR_100_UDMAREG_UDMA_EN;
-		}
-		break;
-	case controller_kl_ata4:
-		/* 66Mhz cell */
-		accessTicks = SYSCLK_TICKS_66(accessTime);
-		accessTicks = min(accessTicks, 0x1fU);
-		accessTicks = max(accessTicks, 0x1U);
-		recTicks = SYSCLK_TICKS_66(recTime);
-		recTicks = min(recTicks, 0x1fU);
-		recTicks = max(recTicks, 0x3U);
-		/* Clear out mdma bits and disable udma */
-		*timings = ((*timings) & ~(TR_66_MDMA_MASK | TR_66_UDMA_MASK)) |
-			(accessTicks << TR_66_MDMA_ACCESS_SHIFT) |
-			(recTicks << TR_66_MDMA_RECOVERY_SHIFT);
-		break;
-	case controller_kl_ata3:
-		/* 33Mhz cell on KeyLargo */
-		accessTicks = SYSCLK_TICKS(accessTime);
-		accessTicks = max(accessTicks, 1U);
-		accessTicks = min(accessTicks, 0x1fU);
-		accessTime = accessTicks * IDE_SYSCLK_NS;
-		recTicks = SYSCLK_TICKS(recTime);
-		recTicks = max(recTicks, 1U);
-		recTicks = min(recTicks, 0x1fU);
-		*timings = ((*timings) & ~TR_33_MDMA_MASK) |
-				(accessTicks << TR_33_MDMA_ACCESS_SHIFT) |
-				(recTicks << TR_33_MDMA_RECOVERY_SHIFT);
-		break;
-	default: {
-		/* 33Mhz cell on others */
-		int halfTick = 0;
-		int origAccessTime = accessTime;
-		int origRecTime = recTime;
-		
-		accessTicks = SYSCLK_TICKS(accessTime);
-		accessTicks = max(accessTicks, 1U);
-		accessTicks = min(accessTicks, 0x1fU);
-		accessTime = accessTicks * IDE_SYSCLK_NS;
-		recTicks = SYSCLK_TICKS(recTime);
-		recTicks = max(recTicks, 2U) - 1;
-		recTicks = min(recTicks, 0x1fU);
-		recTime = (recTicks + 1) * IDE_SYSCLK_NS;
-		if ((accessTicks > 1) &&
-		    ((accessTime - IDE_SYSCLK_NS/2) >= origAccessTime) &&
-		    ((recTime - IDE_SYSCLK_NS/2) >= origRecTime)) {
-            		halfTick = 1;
-			accessTicks--;
-		}
-		*timings = ((*timings) & ~TR_33_MDMA_MASK) |
-				(accessTicks << TR_33_MDMA_ACCESS_SHIFT) |
-				(recTicks << TR_33_MDMA_RECOVERY_SHIFT);
-		if (halfTick)
-			*timings |= TR_33_MDMA_HALFTICK;
-		}
-	}
-#ifdef IDE_PMAC_DEBUG
-	printk(KERN_ERR "%s: Set MDMA timing for mode %d, reg: 0x%08x\n",
-		drive->name, speed & 0xf,  *timings);
-#endif	
-}
-
-static void pmac_ide_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	int ret = 0;
-	u32 *timings, *timings2, tl[2];
-	u8 unit = drive->dn & 1;
-	const u8 speed = drive->dma_mode;
-
-	timings = &pmif->timings[unit];
-	timings2 = &pmif->timings[unit+2];
-
-	/* Copy timings to local image */
-	tl[0] = *timings;
-	tl[1] = *timings2;
-
-	if (speed >= XFER_UDMA_0) {
-		if (pmif->kind == controller_kl_ata4)
-			ret = set_timings_udma_ata4(&tl[0], speed);
-		else if (pmif->kind == controller_un_ata6
-			 || pmif->kind == controller_k2_ata6)
-			ret = set_timings_udma_ata6(&tl[0], &tl[1], speed);
-		else if (pmif->kind == controller_sh_ata6)
-			ret = set_timings_udma_shasta(&tl[0], &tl[1], speed);
-		else
-			ret = -1;
-	} else
-		set_timings_mdma(drive, pmif->kind, &tl[0], &tl[1], speed);
-
-	if (ret)
-		return;
-
-	/* Apply timings to controller */
-	*timings = tl[0];
-	*timings2 = tl[1];
-
-	pmac_ide_do_update_timings(drive);	
-}
-
-/*
- * Blast some well known "safe" values to the timing registers at init or
- * wakeup from sleep time, before we do real calculation
- */
-static void
-sanitize_timings(pmac_ide_hwif_t *pmif)
-{
-	unsigned int value, value2 = 0;
-	
-	switch(pmif->kind) {
-		case controller_sh_ata6:
-			value = 0x0a820c97;
-			value2 = 0x00033031;
-			break;
-		case controller_un_ata6:
-		case controller_k2_ata6:
-			value = 0x08618a92;
-			value2 = 0x00002921;
-			break;
-		case controller_kl_ata4:
-			value = 0x0008438c;
-			break;
-		case controller_kl_ata3:
-			value = 0x00084526;
-			break;
-		case controller_heathrow:
-		case controller_ohare:
-		default:
-			value = 0x00074526;
-			break;
-	}
-	pmif->timings[0] = pmif->timings[1] = value;
-	pmif->timings[2] = pmif->timings[3] = value2;
-}
-
-static int on_media_bay(pmac_ide_hwif_t *pmif)
-{
-	return pmif->mdev && pmif->mdev->media_bay != NULL;
-}
-
-/* Suspend call back, should be called after the child devices
- * have actually been suspended
- */
-static int pmac_ide_do_suspend(pmac_ide_hwif_t *pmif)
-{
-	/* We clear the timings */
-	pmif->timings[0] = 0;
-	pmif->timings[1] = 0;
-	
-	disable_irq(pmif->irq);
-
-	/* The media bay will handle itself just fine */
-	if (on_media_bay(pmif))
-		return 0;
-	
-	/* Kauai has bus control FCRs directly here */
-	if (pmif->kauai_fcr) {
-		u32 fcr = readl(pmif->kauai_fcr);
-		fcr &= ~(KAUAI_FCR_UATA_RESET_N | KAUAI_FCR_UATA_ENABLE);
-		writel(fcr, pmif->kauai_fcr);
-	}
-
-	/* Disable the bus on older machines and the cell on kauai */
-	ppc_md.feature_call(PMAC_FTR_IDE_ENABLE, pmif->node, pmif->aapl_bus_id,
-			    0);
-
-	return 0;
-}
-
-/* Resume call back, should be called before the child devices
- * are resumed
- */
-static int pmac_ide_do_resume(pmac_ide_hwif_t *pmif)
-{
-	/* Hard reset & re-enable controller (do we really need to reset ? -BenH) */
-	if (!on_media_bay(pmif)) {
-		ppc_md.feature_call(PMAC_FTR_IDE_RESET, pmif->node, pmif->aapl_bus_id, 1);
-		ppc_md.feature_call(PMAC_FTR_IDE_ENABLE, pmif->node, pmif->aapl_bus_id, 1);
-		msleep(10);
-		ppc_md.feature_call(PMAC_FTR_IDE_RESET, pmif->node, pmif->aapl_bus_id, 0);
-
-		/* Kauai has it different */
-		if (pmif->kauai_fcr) {
-			u32 fcr = readl(pmif->kauai_fcr);
-			fcr |= KAUAI_FCR_UATA_RESET_N | KAUAI_FCR_UATA_ENABLE;
-			writel(fcr, pmif->kauai_fcr);
-		}
-
-		msleep(jiffies_to_msecs(IDE_WAKEUP_DELAY));
-	}
-
-	/* Sanitize drive timings */
-	sanitize_timings(pmif);
-
-	enable_irq(pmif->irq);
-
-	return 0;
-}
-
-static u8 pmac_ide_cable_detect(ide_hwif_t *hwif)
-{
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	struct device_node *np = pmif->node;
-	const char *cable = of_get_property(np, "cable-type", NULL);
-	struct device_node *root = of_find_node_by_path("/");
-	const char *model = of_get_property(root, "model", NULL);
-
-	of_node_put(root);
-	/* Get cable type from device-tree. */
-	if (cable && !strncmp(cable, "80-", 3)) {
-		/* Some drives fail to detect 80c cable in PowerBook */
-		/* These machine use proprietary short IDE cable anyway */
-		if (!strncmp(model, "PowerBook", 9))
-			return ATA_CBL_PATA40_SHORT;
-		else
-			return ATA_CBL_PATA80;
-	}
-
-	/*
-	 * G5's seem to have incorrect cable type in device-tree.
-	 * Let's assume they have a 80 conductor cable, this seem
-	 * to be always the case unless the user mucked around.
-	 */
-	if (of_device_is_compatible(np, "K2-UATA") ||
-	    of_device_is_compatible(np, "shasta-ata"))
-		return ATA_CBL_PATA80;
-
-	return ATA_CBL_PATA40;
-}
-
-static void pmac_ide_init_dev(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-
-	if (on_media_bay(pmif)) {
-		if (check_media_bay(pmif->mdev->media_bay) == MB_CD) {
-			drive->dev_flags &= ~IDE_DFLAG_NOPROBE;
-			return;
-		}
-		drive->dev_flags |= IDE_DFLAG_NOPROBE;
-	}
-}
-
-static const struct ide_tp_ops pmac_tp_ops = {
-	.exec_command		= pmac_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= pmac_write_devctl,
-
-	.dev_select		= pmac_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
-
-static const struct ide_tp_ops pmac_ata6_tp_ops = {
-	.exec_command		= pmac_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= pmac_write_devctl,
-
-	.dev_select		= pmac_kauai_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
-
-static const struct ide_port_ops pmac_ide_ata4_port_ops = {
-	.init_dev		= pmac_ide_init_dev,
-	.set_pio_mode		= pmac_ide_set_pio_mode,
-	.set_dma_mode		= pmac_ide_set_dma_mode,
-	.cable_detect		= pmac_ide_cable_detect,
-};
-
-static const struct ide_port_ops pmac_ide_port_ops = {
-	.init_dev		= pmac_ide_init_dev,
-	.set_pio_mode		= pmac_ide_set_pio_mode,
-	.set_dma_mode		= pmac_ide_set_dma_mode,
-};
-
-static const struct ide_dma_ops pmac_dma_ops;
-
-static const struct ide_port_info pmac_port_info = {
-	.name			= DRV_NAME,
-	.init_dma		= pmac_ide_init_dma,
-	.chipset		= ide_pmac,
-	.tp_ops			= &pmac_tp_ops,
-	.port_ops		= &pmac_ide_port_ops,
-	.dma_ops		= &pmac_dma_ops,
-	.host_flags		= IDE_HFLAG_SET_PIO_MODE_KEEP_DMA |
-				  IDE_HFLAG_POST_SET_MODE |
-				  IDE_HFLAG_MMIO |
-				  IDE_HFLAG_UNMASK_IRQS,
-	.pio_mask		= ATA_PIO4,
-	.mwdma_mask		= ATA_MWDMA2,
-};
-
-/*
- * Setup, register & probe an IDE channel driven by this driver, this is
- * called by one of the 2 probe functions (macio or PCI).
- */
-static int pmac_ide_setup_device(pmac_ide_hwif_t *pmif, struct ide_hw *hw)
-{
-	struct device_node *np = pmif->node;
-	const int *bidp;
-	struct ide_host *host;
-	struct ide_hw *hws[] = { hw };
-	struct ide_port_info d = pmac_port_info;
-	int rc;
-
-	pmif->broken_dma = pmif->broken_dma_warn = 0;
-	if (of_device_is_compatible(np, "shasta-ata")) {
-		pmif->kind = controller_sh_ata6;
-		d.tp_ops = &pmac_ata6_tp_ops;
-		d.port_ops = &pmac_ide_ata4_port_ops;
-		d.udma_mask = ATA_UDMA6;
-	} else if (of_device_is_compatible(np, "kauai-ata")) {
-		pmif->kind = controller_un_ata6;
-		d.tp_ops = &pmac_ata6_tp_ops;
-		d.port_ops = &pmac_ide_ata4_port_ops;
-		d.udma_mask = ATA_UDMA5;
-	} else if (of_device_is_compatible(np, "K2-UATA")) {
-		pmif->kind = controller_k2_ata6;
-		d.tp_ops = &pmac_ata6_tp_ops;
-		d.port_ops = &pmac_ide_ata4_port_ops;
-		d.udma_mask = ATA_UDMA5;
-	} else if (of_device_is_compatible(np, "keylargo-ata")) {
-		if (of_node_name_eq(np, "ata-4")) {
-			pmif->kind = controller_kl_ata4;
-			d.port_ops = &pmac_ide_ata4_port_ops;
-			d.udma_mask = ATA_UDMA4;
-		} else
-			pmif->kind = controller_kl_ata3;
-	} else if (of_device_is_compatible(np, "heathrow-ata")) {
-		pmif->kind = controller_heathrow;
-	} else {
-		pmif->kind = controller_ohare;
-		pmif->broken_dma = 1;
-	}
-
-	bidp = of_get_property(np, "AAPL,bus-id", NULL);
-	pmif->aapl_bus_id =  bidp ? *bidp : 0;
-
-	/* On Kauai-type controllers, we make sure the FCR is correct */
-	if (pmif->kauai_fcr)
-		writel(KAUAI_FCR_UATA_MAGIC |
-		       KAUAI_FCR_UATA_RESET_N |
-		       KAUAI_FCR_UATA_ENABLE, pmif->kauai_fcr);
-	
-	/* Make sure we have sane timings */
-	sanitize_timings(pmif);
-
-	/* If we are on a media bay, wait for it to settle and lock it */
-	if (pmif->mdev)
-		lock_media_bay(pmif->mdev->media_bay);
-
-	host = ide_host_alloc(&d, hws, 1);
-	if (host == NULL) {
-		rc = -ENOMEM;
-		goto bail;
-	}
-	pmif->hwif = host->ports[0];
-
-	if (on_media_bay(pmif)) {
-		/* Fixup bus ID for media bay */
-		if (!bidp)
-			pmif->aapl_bus_id = 1;
-	} else if (pmif->kind == controller_ohare) {
-		/* The code below is having trouble on some ohare machines
-		 * (timing related ?). Until I can put my hand on one of these
-		 * units, I keep the old way
-		 */
-		ppc_md.feature_call(PMAC_FTR_IDE_ENABLE, np, 0, 1);
-	} else {
- 		/* This is necessary to enable IDE when net-booting */
-		ppc_md.feature_call(PMAC_FTR_IDE_RESET, np, pmif->aapl_bus_id, 1);
-		ppc_md.feature_call(PMAC_FTR_IDE_ENABLE, np, pmif->aapl_bus_id, 1);
-		msleep(10);
-		ppc_md.feature_call(PMAC_FTR_IDE_RESET, np, pmif->aapl_bus_id, 0);
-		msleep(jiffies_to_msecs(IDE_WAKEUP_DELAY));
-	}
-
-	printk(KERN_INFO DRV_NAME ": Found Apple %s controller (%s), "
-	       "bus ID %d%s, irq %d\n", model_name[pmif->kind],
-	       pmif->mdev ? "macio" : "PCI", pmif->aapl_bus_id,
-	       on_media_bay(pmif) ? " (mediabay)" : "", hw->irq);
-
-	rc = ide_host_register(host, &d, hws);
-	if (rc)
-		pmif->hwif = NULL;
-
-	if (pmif->mdev)
-		unlock_media_bay(pmif->mdev->media_bay);
-
- bail:
-	if (rc && host)
-		ide_host_free(host);
-	return rc;
-}
-
-static void pmac_ide_init_ports(struct ide_hw *hw, unsigned long base)
-{
-	int i;
-
-	for (i = 0; i < 8; ++i)
-		hw->io_ports_array[i] = base + i * 0x10;
-
-	hw->io_ports.ctl_addr = base + 0x160;
-}
-
-/*
- * Attach to a macio probed interface
- */
-static int pmac_ide_macio_attach(struct macio_dev *mdev,
-				 const struct of_device_id *match)
-{
-	void __iomem *base;
-	unsigned long regbase;
-	pmac_ide_hwif_t *pmif;
-	int irq, rc;
-	struct ide_hw hw;
-
-	pmif = kzalloc(sizeof(*pmif), GFP_KERNEL);
-	if (pmif == NULL)
-		return -ENOMEM;
-
-	if (macio_resource_count(mdev) == 0) {
-		printk(KERN_WARNING "ide-pmac: no address for %pOF\n",
-				    mdev->ofdev.dev.of_node);
-		rc = -ENXIO;
-		goto out_free_pmif;
-	}
-
-	/* Request memory resource for IO ports */
-	if (macio_request_resource(mdev, 0, "ide-pmac (ports)")) {
-		printk(KERN_ERR "ide-pmac: can't request MMIO resource for "
-				"%pOF!\n", mdev->ofdev.dev.of_node);
-		rc = -EBUSY;
-		goto out_free_pmif;
-	}
-			
-	/* XXX This is bogus. Should be fixed in the registry by checking
-	 * the kind of host interrupt controller, a bit like gatwick
-	 * fixes in irq.c. That works well enough for the single case
-	 * where that happens though...
-	 */
-	if (macio_irq_count(mdev) == 0) {
-		printk(KERN_WARNING "ide-pmac: no intrs for device %pOF, using "
-				    "13\n", mdev->ofdev.dev.of_node);
-		irq = irq_create_mapping(NULL, 13);
-	} else
-		irq = macio_irq(mdev, 0);
-
-	base = ioremap(macio_resource_start(mdev, 0), 0x400);
-	regbase = (unsigned long) base;
-
-	pmif->mdev = mdev;
-	pmif->node = mdev->ofdev.dev.of_node;
-	pmif->regbase = regbase;
-	pmif->irq = irq;
-	pmif->kauai_fcr = NULL;
-
-	if (macio_resource_count(mdev) >= 2) {
-		if (macio_request_resource(mdev, 1, "ide-pmac (dma)"))
-			printk(KERN_WARNING "ide-pmac: can't request DMA "
-					    "resource for %pOF!\n",
-					    mdev->ofdev.dev.of_node);
-		else
-			pmif->dma_regs = ioremap(macio_resource_start(mdev, 1), 0x1000);
-	} else
-		pmif->dma_regs = NULL;
-
-	dev_set_drvdata(&mdev->ofdev.dev, pmif);
-
-	memset(&hw, 0, sizeof(hw));
-	pmac_ide_init_ports(&hw, pmif->regbase);
-	hw.irq = irq;
-	hw.dev = &mdev->bus->pdev->dev;
-	hw.parent = &mdev->ofdev.dev;
-
-	rc = pmac_ide_setup_device(pmif, &hw);
-	if (rc != 0) {
-		/* The inteface is released to the common IDE layer */
-		dev_set_drvdata(&mdev->ofdev.dev, NULL);
-		iounmap(base);
-		if (pmif->dma_regs) {
-			iounmap(pmif->dma_regs);
-			macio_release_resource(mdev, 1);
-		}
-		macio_release_resource(mdev, 0);
-		kfree(pmif);
-	}
-
-	return rc;
-
-out_free_pmif:
-	kfree(pmif);
-	return rc;
-}
-
-static int
-pmac_ide_macio_suspend(struct macio_dev *mdev, pm_message_t mesg)
-{
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(&mdev->ofdev.dev);
-	int rc = 0;
-
-	if (mesg.event != mdev->ofdev.dev.power.power_state.event
-			&& (mesg.event & PM_EVENT_SLEEP)) {
-		rc = pmac_ide_do_suspend(pmif);
-		if (rc == 0)
-			mdev->ofdev.dev.power.power_state = mesg;
-	}
-
-	return rc;
-}
-
-static int
-pmac_ide_macio_resume(struct macio_dev *mdev)
-{
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(&mdev->ofdev.dev);
-	int rc = 0;
-
-	if (mdev->ofdev.dev.power.power_state.event != PM_EVENT_ON) {
-		rc = pmac_ide_do_resume(pmif);
-		if (rc == 0)
-			mdev->ofdev.dev.power.power_state = PMSG_ON;
-	}
-
-	return rc;
-}
-
-/*
- * Attach to a PCI probed interface
- */
-static int pmac_ide_pci_attach(struct pci_dev *pdev,
-			       const struct pci_device_id *id)
-{
-	struct device_node *np;
-	pmac_ide_hwif_t *pmif;
-	void __iomem *base;
-	unsigned long rbase, rlen;
-	int rc;
-	struct ide_hw hw;
-
-	np = pci_device_to_OF_node(pdev);
-	if (np == NULL) {
-		printk(KERN_ERR "ide-pmac: cannot find MacIO node for Kauai ATA interface\n");
-		return -ENODEV;
-	}
-
-	pmif = kzalloc(sizeof(*pmif), GFP_KERNEL);
-	if (pmif == NULL)
-		return -ENOMEM;
-
-	if (pci_enable_device(pdev)) {
-		printk(KERN_WARNING "ide-pmac: Can't enable PCI device for "
-				    "%pOF\n", np);
-		rc = -ENXIO;
-		goto out_free_pmif;
-	}
-	pci_set_master(pdev);
-			
-	if (pci_request_regions(pdev, "Kauai ATA")) {
-		printk(KERN_ERR "ide-pmac: Cannot obtain PCI resources for "
-				"%pOF\n", np);
-		rc = -ENXIO;
-		goto out_free_pmif;
-	}
-
-	pmif->mdev = NULL;
-	pmif->node = np;
-
-	rbase = pci_resource_start(pdev, 0);
-	rlen = pci_resource_len(pdev, 0);
-
-	base = ioremap(rbase, rlen);
-	pmif->regbase = (unsigned long) base + 0x2000;
-	pmif->dma_regs = base + 0x1000;
-	pmif->kauai_fcr = base;
-	pmif->irq = pdev->irq;
-
-	pci_set_drvdata(pdev, pmif);
-
-	memset(&hw, 0, sizeof(hw));
-	pmac_ide_init_ports(&hw, pmif->regbase);
-	hw.irq = pdev->irq;
-	hw.dev = &pdev->dev;
-
-	rc = pmac_ide_setup_device(pmif, &hw);
-	if (rc != 0) {
-		/* The inteface is released to the common IDE layer */
-		iounmap(base);
-		pci_release_regions(pdev);
-		kfree(pmif);
-	}
-
-	return rc;
-
-out_free_pmif:
-	kfree(pmif);
-	return rc;
-}
-
-static int
-pmac_ide_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
-{
-	pmac_ide_hwif_t *pmif = pci_get_drvdata(pdev);
-	int rc = 0;
-
-	if (mesg.event != pdev->dev.power.power_state.event
-			&& (mesg.event & PM_EVENT_SLEEP)) {
-		rc = pmac_ide_do_suspend(pmif);
-		if (rc == 0)
-			pdev->dev.power.power_state = mesg;
-	}
-
-	return rc;
-}
-
-static int
-pmac_ide_pci_resume(struct pci_dev *pdev)
-{
-	pmac_ide_hwif_t *pmif = pci_get_drvdata(pdev);
-	int rc = 0;
-
-	if (pdev->dev.power.power_state.event != PM_EVENT_ON) {
-		rc = pmac_ide_do_resume(pmif);
-		if (rc == 0)
-			pdev->dev.power.power_state = PMSG_ON;
-	}
-
-	return rc;
-}
-
-#ifdef CONFIG_PMAC_MEDIABAY
-static void pmac_ide_macio_mb_event(struct macio_dev* mdev, int mb_state)
-{
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(&mdev->ofdev.dev);
-
-	switch(mb_state) {
-	case MB_CD:
-		if (!pmif->hwif->present)
-			ide_port_scan(pmif->hwif);
-		break;
-	default:
-		if (pmif->hwif->present)
-			ide_port_unregister_devices(pmif->hwif);
-	}
-}
-#endif /* CONFIG_PMAC_MEDIABAY */
-
-
-static struct of_device_id pmac_ide_macio_match[] = 
-{
-	{
-	.name 		= "IDE",
-	},
-	{
-	.name 		= "ATA",
-	},
-	{
-	.type		= "ide",
-	},
-	{
-	.type		= "ata",
-	},
-	{},
-};
-
-static struct macio_driver pmac_ide_macio_driver = 
-{
-	.driver = {
-		.name 		= "ide-pmac",
-		.owner		= THIS_MODULE,
-		.of_match_table	= pmac_ide_macio_match,
-	},
-	.probe		= pmac_ide_macio_attach,
-	.suspend	= pmac_ide_macio_suspend,
-	.resume		= pmac_ide_macio_resume,
-#ifdef CONFIG_PMAC_MEDIABAY
-	.mediabay_event	= pmac_ide_macio_mb_event,
-#endif
-};
-
-static const struct pci_device_id pmac_ide_pci_match[] = {
-	{ PCI_VDEVICE(APPLE, PCI_DEVICE_ID_APPLE_UNI_N_ATA),	0 },
-	{ PCI_VDEVICE(APPLE, PCI_DEVICE_ID_APPLE_IPID_ATA100),	0 },
-	{ PCI_VDEVICE(APPLE, PCI_DEVICE_ID_APPLE_K2_ATA100),	0 },
-	{ PCI_VDEVICE(APPLE, PCI_DEVICE_ID_APPLE_SH_ATA),	0 },
-	{ PCI_VDEVICE(APPLE, PCI_DEVICE_ID_APPLE_IPID2_ATA),	0 },
-	{},
-};
-
-static struct pci_driver pmac_ide_pci_driver = {
-	.name		= "ide-pmac",
-	.id_table	= pmac_ide_pci_match,
-	.probe		= pmac_ide_pci_attach,
-	.suspend	= pmac_ide_pci_suspend,
-	.resume		= pmac_ide_pci_resume,
-};
-MODULE_DEVICE_TABLE(pci, pmac_ide_pci_match);
-
-int __init pmac_ide_probe(void)
-{
-	int error;
-
-	if (!machine_is(powermac))
-		return -ENODEV;
-
-#ifdef CONFIG_BLK_DEV_IDE_PMAC_ATA100FIRST
-	error = pci_register_driver(&pmac_ide_pci_driver);
-	if (error)
-		goto out;
-	error = macio_register_driver(&pmac_ide_macio_driver);
-	if (error) {
-		pci_unregister_driver(&pmac_ide_pci_driver);
-		goto out;
-	}
-#else
-	error = macio_register_driver(&pmac_ide_macio_driver);
-	if (error)
-		goto out;
-	error = pci_register_driver(&pmac_ide_pci_driver);
-	if (error) {
-		macio_unregister_driver(&pmac_ide_macio_driver);
-		goto out;
-	}
-#endif
-out:
-	return error;
-}
-
-/*
- * pmac_ide_build_dmatable builds the DBDMA command list
- * for a transfer and sets the DBDMA channel to point to it.
- */
-static int pmac_ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	struct dbdma_cmd *table;
-	volatile struct dbdma_regs __iomem *dma = pmif->dma_regs;
-	struct scatterlist *sg;
-	int wr = !!(cmd->tf_flags & IDE_TFLAG_WRITE);
-	int i = cmd->sg_nents, count = 0;
-
-	/* DMA table is already aligned */
-	table = (struct dbdma_cmd *) pmif->dma_table_cpu;
-
-	/* Make sure DMA controller is stopped (necessary ?) */
-	writel((RUN|PAUSE|FLUSH|WAKE|DEAD) << 16, &dma->control);
-	while (readl(&dma->status) & RUN)
-		udelay(1);
-
-	/* Build DBDMA commands list */
-	sg = hwif->sg_table;
-	while (i && sg_dma_len(sg)) {
-		u32 cur_addr;
-		u32 cur_len;
-
-		cur_addr = sg_dma_address(sg);
-		cur_len = sg_dma_len(sg);
-
-		if (pmif->broken_dma && cur_addr & (L1_CACHE_BYTES - 1)) {
-			if (pmif->broken_dma_warn == 0) {
-				printk(KERN_WARNING "%s: DMA on non aligned address, "
-				       "switching to PIO on Ohare chipset\n", drive->name);
-				pmif->broken_dma_warn = 1;
-			}
-			return 0;
-		}
-		while (cur_len) {
-			unsigned int tc = (cur_len < 0xfe00)? cur_len: 0xfe00;
-
-			if (count++ >= MAX_DCMDS) {
-				printk(KERN_WARNING "%s: DMA table too small\n",
-				       drive->name);
-				return 0;
-			}
-			table->command = cpu_to_le16(wr? OUTPUT_MORE: INPUT_MORE);
-			table->req_count = cpu_to_le16(tc);
-			table->phy_addr = cpu_to_le32(cur_addr);
-			table->cmd_dep = 0;
-			table->xfer_status = 0;
-			table->res_count = 0;
-			cur_addr += tc;
-			cur_len -= tc;
-			++table;
-		}
-		sg = sg_next(sg);
-		i--;
-	}
-
-	/* convert the last command to an input/output last command */
-	if (count) {
-		table[-1].command = cpu_to_le16(wr? OUTPUT_LAST: INPUT_LAST);
-		/* add the stop command to the end of the list */
-		memset(table, 0, sizeof(struct dbdma_cmd));
-		table->command = cpu_to_le16(DBDMA_STOP);
-		mb();
-		writel(hwif->dmatable_dma, &dma->cmdptr);
-		return 1;
-	}
-
-	printk(KERN_DEBUG "%s: empty DMA table?\n", drive->name);
-
-	return 0; /* revert to PIO for this request */
-}
-
-/*
- * Prepare a DMA transfer. We build the DMA table, adjust the timings for
- * a read on KeyLargo ATA/66 and mark us as waiting for DMA completion
- */
-static int pmac_ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	u8 unit = drive->dn & 1, ata4 = (pmif->kind == controller_kl_ata4);
-	u8 write = !!(cmd->tf_flags & IDE_TFLAG_WRITE);
-
-	if (pmac_ide_build_dmatable(drive, cmd) == 0)
-		return 1;
-
-	/* Apple adds 60ns to wrDataSetup on reads */
-	if (ata4 && (pmif->timings[unit] & TR_66_UDMA_EN)) {
-		writel(pmif->timings[unit] + (write ? 0 : 0x00800000UL),
-			PMAC_IDE_REG(IDE_TIMING_CONFIG));
-		(void)readl(PMAC_IDE_REG(IDE_TIMING_CONFIG));
-	}
-
-	return 0;
-}
-
-/*
- * Kick the DMA controller into life after the DMA command has been issued
- * to the drive.
- */
-static void
-pmac_ide_dma_start(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	volatile struct dbdma_regs __iomem *dma;
-
-	dma = pmif->dma_regs;
-
-	writel((RUN << 16) | RUN, &dma->control);
-	/* Make sure it gets to the controller right now */
-	(void)readl(&dma->control);
-}
-
-/*
- * After a DMA transfer, make sure the controller is stopped
- */
-static int
-pmac_ide_dma_end (ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	volatile struct dbdma_regs __iomem *dma = pmif->dma_regs;
-	u32 dstat;
-
-	dstat = readl(&dma->status);
-	writel(((RUN|WAKE|DEAD) << 16), &dma->control);
-
-	/* verify good dma status. we don't check for ACTIVE beeing 0. We should...
-	 * in theory, but with ATAPI decices doing buffer underruns, that would
-	 * cause us to disable DMA, which isn't what we want
-	 */
-	return (dstat & (RUN|DEAD)) != RUN;
-}
-
-/*
- * Check out that the interrupt we got was for us. We can't always know this
- * for sure with those Apple interfaces (well, we could on the recent ones but
- * that's not implemented yet), on the other hand, we don't have shared interrupts
- * so it's not really a problem
- */
-static int
-pmac_ide_dma_test_irq (ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	volatile struct dbdma_regs __iomem *dma = pmif->dma_regs;
-	unsigned long status, timeout;
-
-	/* We have to things to deal with here:
-	 * 
-	 * - The dbdma won't stop if the command was started
-	 * but completed with an error without transferring all
-	 * datas. This happens when bad blocks are met during
-	 * a multi-block transfer.
-	 * 
-	 * - The dbdma fifo hasn't yet finished flushing to
-	 * to system memory when the disk interrupt occurs.
-	 * 
-	 */
-
-	/* If ACTIVE is cleared, the STOP command have passed and
-	 * transfer is complete.
-	 */
-	status = readl(&dma->status);
-	if (!(status & ACTIVE))
-		return 1;
-
-	/* If dbdma didn't execute the STOP command yet, the
-	 * active bit is still set. We consider that we aren't
-	 * sharing interrupts (which is hopefully the case with
-	 * those controllers) and so we just try to flush the
-	 * channel for pending data in the fifo
-	 */
-	udelay(1);
-	writel((FLUSH << 16) | FLUSH, &dma->control);
-	timeout = 0;
-	for (;;) {
-		udelay(1);
-		status = readl(&dma->status);
-		if ((status & FLUSH) == 0)
-			break;
-		if (++timeout > 100) {
-			printk(KERN_WARNING "ide%d, ide_dma_test_irq timeout flushing channel\n",
-			       hwif->index);
-			break;
-		}
-	}	
-	return 1;
-}
-
-static void pmac_ide_dma_host_set(ide_drive_t *drive, int on)
-{
-}
-
-static void
-pmac_ide_dma_lost_irq (ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	volatile struct dbdma_regs __iomem *dma = pmif->dma_regs;
-	unsigned long status = readl(&dma->status);
-
-	printk(KERN_ERR "ide-pmac lost interrupt, dma status: %lx\n", status);
-}
-
-static const struct ide_dma_ops pmac_dma_ops = {
-	.dma_host_set		= pmac_ide_dma_host_set,
-	.dma_setup		= pmac_ide_dma_setup,
-	.dma_start		= pmac_ide_dma_start,
-	.dma_end		= pmac_ide_dma_end,
-	.dma_test_irq		= pmac_ide_dma_test_irq,
-	.dma_lost_irq		= pmac_ide_dma_lost_irq,
-};
-
-/*
- * Allocate the data structures needed for using DMA with an interface
- * and fill the proper list of functions pointers
- */
-static int pmac_ide_init_dma(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	pmac_ide_hwif_t *pmif = dev_get_drvdata(hwif->gendev.parent);
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-
-	/* We won't need pci_dev if we switch to generic consistent
-	 * DMA routines ...
-	 */
-	if (dev == NULL || pmif->dma_regs == 0)
-		return -ENODEV;
-	/*
-	 * Allocate space for the DBDMA commands.
-	 * The +2 is +1 for the stop command and +1 to allow for
-	 * aligning the start address to a multiple of 16 bytes.
-	 */
-	pmif->dma_table_cpu = dma_alloc_coherent(&dev->dev,
-		(MAX_DCMDS + 2) * sizeof(struct dbdma_cmd),
-		&hwif->dmatable_dma, GFP_KERNEL);
-	if (pmif->dma_table_cpu == NULL) {
-		printk(KERN_ERR "%s: unable to allocate DMA command list\n",
-		       hwif->name);
-		return -ENOMEM;
-	}
-
-	hwif->sg_max_nents = MAX_DCMDS;
-
-	return 0;
-}
-
-module_init(pmac_ide_probe);
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c
deleted file mode 100644
index ab79b6289464..000000000000
--- a/drivers/ide/qd65xx.c
+++ /dev/null
@@ -1,446 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1996-2001  Linus Torvalds & author (see below)
- */
-
-/*
- *  Version 0.03	Cleaned auto-tune, added probe
- *  Version 0.04	Added second channel tuning
- *  Version 0.05	Enhanced tuning ; added qd6500 support
- *  Version 0.06	Added dos driver's list
- *  Version 0.07	Second channel bug fix 
- *
- * QDI QD6500/QD6580 EIDE controller fast support
- *
- * To activate controller support, use "ide0=qd65xx"
- */
-
-/*
- * Rewritten from the work of Colten Edwards <pje120@cs.usask.ca> by
- * Samuel Thibault <samuel.thibault@ens-lyon.org>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/ioport.h>
-#include <linux/blkdev.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <asm/io.h>
-
-#define DRV_NAME "qd65xx"
-
-#include "qd65xx.h"
-
-/*
- * I/O ports are 0x30-0x31 (and 0x32-0x33 for qd6580)
- *            or 0xb0-0xb1 (and 0xb2-0xb3 for qd6580)
- *	-- qd6500 is a single IDE interface
- *	-- qd6580 is a dual IDE interface
- *
- * More research on qd6580 being done by willmore@cig.mot.com (David)
- * More Information given by Petr Soucek (petr@ryston.cz)
- * http://www.ryston.cz/petr/vlb
- */
-
-/*
- * base: Timer1
- *
- *
- * base+0x01: Config (R/O)
- *
- * bit 0: ide baseport: 1 = 0x1f0 ; 0 = 0x170 (only useful for qd6500)
- * bit 1: qd65xx baseport: 1 = 0xb0 ; 0 = 0x30
- * bit 2: ID3: bus speed: 1 = <=33MHz ; 0 = >33MHz
- * bit 3: qd6500: 1 = disabled, 0 = enabled
- *        qd6580: 1
- * upper nibble:
- *        qd6500: 1100
- *        qd6580: either 1010 or 0101
- *
- *
- * base+0x02: Timer2 (qd6580 only)
- *
- *
- * base+0x03: Control (qd6580 only)
- *
- * bits 0-3 must always be set 1
- * bit 4 must be set 1, but is set 0 by dos driver while measuring vlb clock
- * bit 0 : 1 = Only primary port enabled : channel 0 for hda, channel 1 for hdb
- *         0 = Primary and Secondary ports enabled : channel 0 for hda & hdb
- *                                                   channel 1 for hdc & hdd
- * bit 1 : 1 = only disks on primary port
- *         0 = disks & ATAPI devices on primary port
- * bit 2-4 : always 0
- * bit 5 : status, but of what ?
- * bit 6 : always set 1 by dos driver
- * bit 7 : set 1 for non-ATAPI devices on primary port
- *	(maybe read-ahead and post-write buffer ?)
- */
-
-static int timings[4]={-1,-1,-1,-1}; /* stores current timing for each timer */
-
-/*
- * qd65xx_select:
- *
- * This routine is invoked to prepare for access to a given drive.
- */
-
-static void qd65xx_dev_select(ide_drive_t *drive)
-{
-	u8 index = ((	(QD_TIMREG(drive)) & 0x80 ) >> 7) |
-			(QD_TIMREG(drive) & 0x02);
-
-	if (timings[index] != QD_TIMING(drive))
-		outb(timings[index] = QD_TIMING(drive), QD_TIMREG(drive));
-
-	outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr);
-}
-
-/*
- * qd6500_compute_timing
- *
- * computes the timing value where
- *	lower nibble represents active time,   in count of VLB clocks
- *	upper nibble represents recovery time, in count of VLB clocks
- */
-
-static u8 qd6500_compute_timing (ide_hwif_t *hwif, int active_time, int recovery_time)
-{
-	int clk = ide_vlb_clk ? ide_vlb_clk : 50;
-	u8 act_cyc, rec_cyc;
-
-	if (clk <= 33) {
-		act_cyc =  9 - IDE_IN(active_time   * clk / 1000 + 1, 2,  9);
-		rec_cyc = 15 - IDE_IN(recovery_time * clk / 1000 + 1, 0, 15);
-	} else {
-		act_cyc =  8 - IDE_IN(active_time   * clk / 1000 + 1, 1,  8);
-		rec_cyc = 18 - IDE_IN(recovery_time * clk / 1000 + 1, 3, 18);
-	}
-
-	return (rec_cyc << 4) | 0x08 | act_cyc;
-}
-
-/*
- * qd6580_compute_timing
- *
- * idem for qd6580
- */
-
-static u8 qd6580_compute_timing (int active_time, int recovery_time)
-{
-	int clk = ide_vlb_clk ? ide_vlb_clk : 50;
-	u8 act_cyc, rec_cyc;
-
-	act_cyc = 17 - IDE_IN(active_time   * clk / 1000 + 1, 2, 17);
-	rec_cyc = 15 - IDE_IN(recovery_time * clk / 1000 + 1, 2, 15);
-
-	return (rec_cyc << 4) | act_cyc;
-}
-
-/*
- * qd_find_disk_type
- *
- * tries to find timing from dos driver's table
- */
-
-static int qd_find_disk_type (ide_drive_t *drive,
-		int *active_time, int *recovery_time)
-{
-	struct qd65xx_timing_s *p;
-	char *m = (char *)&drive->id[ATA_ID_PROD];
-	char model[ATA_ID_PROD_LEN];
-
-	if (*m == 0)
-		return 0;
-
-	strncpy(model, m, ATA_ID_PROD_LEN);
-	ide_fixstring(model, ATA_ID_PROD_LEN, 1); /* byte-swap */
-
-	for (p = qd65xx_timing ; p->offset != -1 ; p++) {
-		if (!strncmp(p->model, model+p->offset, 4)) {
-			printk(KERN_DEBUG "%s: listed !\n", drive->name);
-			*active_time = p->active;
-			*recovery_time = p->recovery;
-			return 1;
-		}
-	}
-	return 0;
-}
-
-/*
- * qd_set_timing:
- *
- * records the timing
- */
-
-static void qd_set_timing (ide_drive_t *drive, u8 timing)
-{
-	unsigned long data = (unsigned long)ide_get_drivedata(drive);
-
-	data &= 0xff00;
-	data |= timing;
-	ide_set_drivedata(drive, (void *)data);
-
-	printk(KERN_DEBUG "%s: %#x\n", drive->name, timing);
-}
-
-static void qd6500_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	u16 *id = drive->id;
-	int active_time   = 175;
-	int recovery_time = 415; /* worst case values from the dos driver */
-
-	/* FIXME: use drive->pio_mode value */
-	if (!qd_find_disk_type(drive, &active_time, &recovery_time) &&
-	    (id[ATA_ID_OLD_PIO_MODES] & 0xff) && (id[ATA_ID_FIELD_VALID] & 2) &&
-	    id[ATA_ID_EIDE_PIO] >= 240) {
-		printk(KERN_INFO "%s: PIO mode%d\n", drive->name,
-			id[ATA_ID_OLD_PIO_MODES] & 0xff);
-		active_time = 110;
-		recovery_time = drive->id[ATA_ID_EIDE_PIO] - 120;
-	}
-
-	qd_set_timing(drive, qd6500_compute_timing(drive->hwif,
-				active_time, recovery_time));
-}
-
-static void qd6580_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-	struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio);
-	unsigned int cycle_time;
-	int active_time   = 175;
-	int recovery_time = 415; /* worst case values from the dos driver */
-	u8 base = (hwif->config_data & 0xff00) >> 8;
-
-	if (drive->id && !qd_find_disk_type(drive, &active_time, &recovery_time)) {
-		cycle_time = ide_pio_cycle_time(drive, pio);
-
-		switch (pio) {
-			case 0: break;
-			case 3:
-				if (cycle_time >= 110) {
-					active_time = 86;
-					recovery_time = cycle_time - 102;
-				} else
-					printk(KERN_WARNING "%s: Strange recovery time !\n",drive->name);
-				break;
-			case 4:
-				if (cycle_time >= 69) {
-					active_time = 70;
-					recovery_time = cycle_time - 61;
-				} else
-					printk(KERN_WARNING "%s: Strange recovery time !\n",drive->name);
-				break;
-			default:
-				if (cycle_time >= 180) {
-					active_time = 110;
-					recovery_time = cycle_time - 120;
-				} else {
-					active_time = t->active;
-					recovery_time = cycle_time - active_time;
-				}
-		}
-		printk(KERN_INFO "%s: PIO mode%d\n", drive->name,pio);
-	}
-
-	if (!hwif->channel && drive->media != ide_disk) {
-		outb(0x5f, QD_CONTROL_PORT);
-		printk(KERN_WARNING "%s: ATAPI: disabled read-ahead FIFO "
-			"and post-write buffer on %s.\n",
-			drive->name, hwif->name);
-	}
-
-	qd_set_timing(drive, qd6580_compute_timing(active_time, recovery_time));
-}
-
-/*
- * qd_testreg
- *
- * tests if the given port is a register
- */
-
-static int __init qd_testreg(int port)
-{
-	unsigned long flags;
-	u8 savereg, readreg;
-
-	local_irq_save(flags);
-	savereg = inb_p(port);
-	outb_p(QD_TESTVAL, port);	/* safe value */
-	readreg = inb_p(port);
-	outb(savereg, port);
-	local_irq_restore(flags);
-
-	if (savereg == QD_TESTVAL) {
-		printk(KERN_ERR "Outch ! the probe for qd65xx isn't reliable !\n");
-		printk(KERN_ERR "Please contact maintainers to tell about your hardware\n");
-		printk(KERN_ERR "Assuming qd65xx is not present.\n");
-		return 1;
-	}
-
-	return (readreg != QD_TESTVAL);
-}
-
-static void __init qd6500_init_dev(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 base = (hwif->config_data & 0xff00) >> 8;
-	u8 config = QD_CONFIG(hwif);
-
-	ide_set_drivedata(drive, (void *)QD6500_DEF_DATA);
-}
-
-static void __init qd6580_init_dev(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned long t1, t2;
-	u8 base = (hwif->config_data & 0xff00) >> 8;
-	u8 config = QD_CONFIG(hwif);
-
-	if (hwif->host_flags & IDE_HFLAG_SINGLE) {
-		t1 = QD6580_DEF_DATA;
-		t2 = QD6580_DEF_DATA2;
-	} else
-		t2 = t1 = hwif->channel ? QD6580_DEF_DATA2 : QD6580_DEF_DATA;
-
-	ide_set_drivedata(drive, (void *)((drive->dn & 1) ? t2 : t1));
-}
-
-static const struct ide_tp_ops qd65xx_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= qd65xx_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
-
-static const struct ide_port_ops qd6500_port_ops = {
-	.init_dev		= qd6500_init_dev,
-	.set_pio_mode		= qd6500_set_pio_mode,
-};
-
-static const struct ide_port_ops qd6580_port_ops = {
-	.init_dev		= qd6580_init_dev,
-	.set_pio_mode		= qd6580_set_pio_mode,
-};
-
-static const struct ide_port_info qd65xx_port_info __initconst = {
-	.name			= DRV_NAME,
-	.tp_ops 		= &qd65xx_tp_ops,
-	.chipset		= ide_qd65xx,
-	.host_flags		= IDE_HFLAG_IO_32BIT |
-				  IDE_HFLAG_NO_DMA,
-	.pio_mask		= ATA_PIO4,
-};
-
-/*
- * qd_probe:
- *
- * looks at the specified baseport, and if qd found, registers & initialises it
- * return 1 if another qd may be probed
- */
-
-static int __init qd_probe(int base)
-{
-	int rc;
-	u8 config, unit, control;
-	struct ide_port_info d = qd65xx_port_info;
-
-	config = inb(QD_CONFIG_PORT);
-
-	if (! ((config & QD_CONFIG_BASEPORT) >> 1 == (base == 0xb0)) )
-		return -ENODEV;
-
-	unit = ! (config & QD_CONFIG_IDE_BASEPORT);
-
-	if (unit)
-		d.host_flags |= IDE_HFLAG_QD_2ND_PORT;
-
-	switch (config & 0xf0) {
-	case QD_CONFIG_QD6500:
-		if (qd_testreg(base))
-			 return -ENODEV;	/* bad register */
-
-		if (config & QD_CONFIG_DISABLED) {
-			printk(KERN_WARNING "qd6500 is disabled !\n");
-			return -ENODEV;
-		}
-
-		printk(KERN_NOTICE "qd6500 at %#x\n", base);
-		printk(KERN_DEBUG "qd6500: config=%#x, ID3=%u\n",
-			config, QD_ID3);
-
-		d.port_ops = &qd6500_port_ops;
-		d.host_flags |= IDE_HFLAG_SINGLE;
-		break;
-	case QD_CONFIG_QD6580_A:
-	case QD_CONFIG_QD6580_B:
-		if (qd_testreg(base) || qd_testreg(base + 0x02))
-			return -ENODEV;	/* bad registers */
-
-		control = inb(QD_CONTROL_PORT);
-
-		printk(KERN_NOTICE "qd6580 at %#x\n", base);
-		printk(KERN_DEBUG "qd6580: config=%#x, control=%#x, ID3=%u\n",
-			config, control, QD_ID3);
-
-		outb(QD_DEF_CONTR, QD_CONTROL_PORT);
-
-		d.port_ops = &qd6580_port_ops;
-		if (control & QD_CONTR_SEC_DISABLED)
-			d.host_flags |= IDE_HFLAG_SINGLE;
-
-		printk(KERN_INFO "qd6580: %s IDE board\n",
-			(control & QD_CONTR_SEC_DISABLED) ? "single" : "dual");
-		break;
-	default:
-		return -ENODEV;
-	}
-
-	rc = ide_legacy_device_add(&d, (base << 8) | config);
-
-	if (d.host_flags & IDE_HFLAG_SINGLE)
-		return (rc == 0) ? 1 : rc;
-
-	return rc;
-}
-
-static bool probe_qd65xx;
-
-module_param_named(probe, probe_qd65xx, bool, 0);
-MODULE_PARM_DESC(probe, "probe for QD65xx chipsets");
-
-static int __init qd65xx_init(void)
-{
-	int rc1, rc2 = -ENODEV;
-
-	if (probe_qd65xx == 0)
-		return -ENODEV;
-
-	rc1 = qd_probe(0x30);
-	if (rc1)
-		rc2 = qd_probe(0xb0);
-
-	if (rc1 < 0 && rc2 < 0)
-		return -ENODEV;
-
-	return 0;
-}
-
-module_init(qd65xx_init);
-
-MODULE_AUTHOR("Samuel Thibault");
-MODULE_DESCRIPTION("support of qd65xx vlb ide chipset");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/qd65xx.h b/drivers/ide/qd65xx.h
deleted file mode 100644
index 01a43ab45e0e..000000000000
--- a/drivers/ide/qd65xx.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2000	Linus Torvalds & authors
- */
-
-/*
- * Authors:	Petr Soucek <petr@ryston.cz>
- * 		Samuel Thibault <samuel.thibault@ens-lyon.org>
- */
-
-/* truncates a in [b,c] */
-#define IDE_IN(a,b,c)   ( ((a)<(b)) ? (b) : ( (a)>(c) ? (c) : (a)) )
-
-#define IDE_IMPLY(a,b)	((!(a)) || (b))
-
-#define QD_TIM1_PORT		(base)
-#define QD_CONFIG_PORT		(base+0x01)
-#define QD_TIM2_PORT		(base+0x02)
-#define QD_CONTROL_PORT		(base+0x03)
-
-#define QD_CONFIG_IDE_BASEPORT	0x01
-#define QD_CONFIG_BASEPORT	0x02
-#define QD_CONFIG_ID3		0x04
-#define QD_CONFIG_DISABLED	0x08
-#define QD_CONFIG_QD6500	0xc0
-#define QD_CONFIG_QD6580_A	0xa0
-#define QD_CONFIG_QD6580_B	0x50
-
-#define QD_CONTR_SEC_DISABLED	0x01
-
-#define QD_ID3			((config & QD_CONFIG_ID3)!=0)
-
-#define QD_CONFIG(hwif)		((hwif)->config_data & 0x00ff)
-
-static inline u8 QD_TIMING(ide_drive_t *drive)
-{
-	return (unsigned long)ide_get_drivedata(drive) & 0x00ff;
-}
-
-static inline u8 QD_TIMREG(ide_drive_t *drive)
-{
-	return ((unsigned long)ide_get_drivedata(drive) & 0xff00) >> 8;
-}
-
-#define QD6500_DEF_DATA		((QD_TIM1_PORT<<8) | (QD_ID3 ? 0x0c : 0x08))
-#define QD6580_DEF_DATA		((QD_TIM1_PORT<<8) | (QD_ID3 ? 0x0a : 0x00))
-#define QD6580_DEF_DATA2	((QD_TIM2_PORT<<8) | (QD_ID3 ? 0x0a : 0x00))
-#define QD_DEF_CONTR		(0x40 | ((control & 0x02) ? 0x9f : 0x1f))
-
-#define QD_TESTVAL		0x19	/* safe value */
-
-/* Drive specific timing taken from DOS driver v3.7 */
-
-static struct qd65xx_timing_s {
-	s8	offset;   /* ofset from the beginning of Model Number" */
-	char	model[4];    /* 4 chars from Model number, no conversion */
-	s16	active;   /* active time */
-	s16	recovery; /* recovery time */
-} qd65xx_timing [] = {
-	{ 30, "2040", 110, 225 },  /* Conner CP30204			*/
-	{ 30, "2045", 135, 225 },  /* Conner CP30254			*/
-	{ 30, "1040", 155, 325 },  /* Conner CP30104			*/
-	{ 30, "1047", 135, 265 },  /* Conner CP30174			*/
-	{ 30, "5344", 135, 225 },  /* Conner CP3544			*/
-	{ 30, "01 4", 175, 405 },  /* Conner CP-3104			*/
-	{ 27, "C030", 175, 375 },  /* Conner CP3000			*/
-	{  8, "PL42", 110, 295 },  /* Quantum LP240			*/
-	{  8, "PL21", 110, 315 },  /* Quantum LP120			*/
-	{  8, "PL25", 175, 385 },  /* Quantum LP52			*/
-	{  4, "PA24", 110, 285 },  /* WD Piranha SP4200			*/
-	{  6, "2200", 110, 260 },  /* WD Caviar AC2200			*/
-	{  6, "3204", 110, 235 },  /* WD Caviar AC2340			*/
-	{  6, "1202", 110, 265 },  /* WD Caviar AC2120			*/
-	{  0, "DS3-", 135, 315 },  /* Teac SD340			*/
-	{  8, "KM32", 175, 355 },  /* Toshiba MK234			*/
-	{  2, "53A1", 175, 355 },  /* Seagate ST351A			*/
-	{  2, "4108", 175, 295 },  /* Seagate ST1480A			*/
-	{  2, "1344", 175, 335 },  /* Seagate ST3144A			*/
-	{  6, "7 12", 110, 225 },  /* Maxtor 7213A			*/
-	{ 30, "02F4", 145, 295 },  /* Conner 3204F			*/
-	{  2, "1302", 175, 335 },  /* Seagate ST3120A			*/
-	{  2, "2334", 145, 265 },  /* Seagate ST3243A			*/
-	{  2, "2338", 145, 275 },  /* Seagate ST3283A			*/
-	{  2, "3309", 145, 275 },  /* Seagate ST3390A			*/
-	{  2, "5305", 145, 275 },  /* Seagate ST3550A			*/
-	{  2, "4100", 175, 295 },  /* Seagate ST1400A			*/
-	{  2, "4110", 175, 295 },  /* Seagate ST1401A			*/
-	{  2, "6300", 135, 265 },  /* Seagate ST3600A			*/
-	{  2, "5300", 135, 265 },  /* Seagate ST3500A			*/
-	{  6, "7 31", 135, 225 },  /* Maxtor 7131 AT			*/
-	{  6, "7 43", 115, 265 },  /* Maxtor 7345 AT			*/
-	{  6, "7 42", 110, 255 },  /* Maxtor 7245 AT			*/
-	{  6, "3 04", 135, 265 },  /* Maxtor 340 AT			*/
-	{  6, "61 0", 135, 285 },  /* WD AC160				*/
-	{  6, "1107", 135, 235 },  /* WD AC1170				*/
-	{  6, "2101", 110, 220 },  /* WD AC1210				*/
-	{  6, "4202", 135, 245 },  /* WD AC2420				*/
-	{  6, "41 0", 175, 355 },  /* WD Caviar 140			*/
-	{  6, "82 0", 175, 355 },  /* WD Caviar 280			*/
-	{  8, "PL01", 175, 375 },  /* Quantum LP105			*/
-	{  8, "PL25", 110, 295 },  /* Quantum LP525			*/
-	{ 10, "4S 2", 175, 385 },  /* Quantum ELS42			*/
-	{ 10, "8S 5", 175, 385 },  /* Quantum ELS85			*/
-	{ 10, "1S72", 175, 385 },  /* Quantum ELS127			*/
-	{ 10, "1S07", 175, 385 },  /* Quantum ELS170			*/
-	{  8, "ZE42", 135, 295 },  /* Quantum EZ240			*/
-	{  8, "ZE21", 175, 385 },  /* Quantum EZ127			*/
-	{  8, "ZE58", 175, 385 },  /* Quantum EZ85			*/
-	{  8, "ZE24", 175, 385 },  /* Quantum EZ42			*/
-	{ 27, "C036", 155, 325 },  /* Conner CP30064			*/
-	{ 27, "C038", 155, 325 },  /* Conner CP30084			*/
-	{  6, "2205", 110, 255 },  /* WDC AC2250			*/
-	{  2, " CHA", 140, 415 },  /* WDC AH series; WDC AH260, WDC	*/
-	{  2, " CLA", 140, 415 },  /* WDC AL series: WDC AL2120, 2170,	*/
-	{  4, "UC41", 140, 415 },  /* WDC CU140				*/
-	{  6, "1207", 130, 275 },  /* WDC AC2170			*/
-	{  6, "2107", 130, 275 },  /* WDC AC1270			*/
-	{  6, "5204", 130, 275 },  /* WDC AC2540			*/
-	{ 30, "3004", 110, 235 },  /* Conner CP30340			*/
-	{ 30, "0345", 135, 255 },  /* Conner CP30544			*/
-	{ 12, "12A3", 175, 320 },  /* MAXTOR LXT-213A			*/
-	{ 12, "43A0", 145, 240 },  /* MAXTOR LXT-340A			*/
-	{  6, "7 21", 180, 290 },  /* Maxtor 7120 AT			*/
-	{  6, "7 71", 135, 240 },  /* Maxtor 7170 AT			*/
-	{ 12, "45\0000", 110, 205 },   /* MAXTOR MXT-540		*/
-	{  8, "PL11", 180, 290 },  /* QUANTUM LP110A			*/
-	{  8, "OG21", 150, 275 },  /* QUANTUM GO120			*/
-	{ 12, "42A5", 175, 320 },  /* MAXTOR LXT-245A			*/
-	{  2, "2309", 175, 295 },  /* ST3290A				*/
-	{  2, "3358", 180, 310 },  /* ST3385A				*/
-	{  2, "6355", 180, 310 },  /* ST3655A				*/
-	{  2, "1900", 175, 270 },  /* ST9100A				*/
-	{  2, "1954", 175, 270 },  /* ST9145A				*/
-	{  2, "1909", 175, 270 },  /* ST9190AG				*/
-	{  2, "2953", 175, 270 },  /* ST9235A				*/
-	{  2, "1359", 175, 270 },  /* ST3195A				*/
-	{ 24, "3R11", 175, 290 },  /* ALPS ELECTRIC Co.,LTD, DR311C	*/
-	{  0, "2M26", 175, 215 },  /* M262XT-0Ah			*/
-	{  4, "2253", 175, 300 },  /* HP C2235A				*/
-	{  4, "-32A", 145, 245 },  /* H3133-A2				*/
-	{ 30, "0326", 150, 270 },  /* Samsung Electronics 120MB		*/
-	{ 30, "3044", 110, 195 },  /* Conner CFA340A			*/
-	{ 30, "43A0", 110, 195 },  /* Conner CFA340A			*/
-	{ -1, "    ", 175, 415 }   /* unknown disk name			*/
-};
diff --git a/drivers/ide/rapide.c b/drivers/ide/rapide.c
deleted file mode 100644
index 0ab8b86b7ed7..000000000000
--- a/drivers/ide/rapide.c
+++ /dev/null
@@ -1,106 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 1996-2002 Russell King.
- */
-
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <linux/errno.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/ecard.h>
-
-static const struct ide_port_info rapide_port_info = {
-	.host_flags		= IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA,
-	.chipset		= ide_generic,
-};
-
-static void rapide_setup_ports(struct ide_hw *hw, void __iomem *base,
-			       void __iomem *ctrl, unsigned int sz, int irq)
-{
-	unsigned long port = (unsigned long)base;
-	int i;
-
-	for (i = 0; i <= 7; i++) {
-		hw->io_ports_array[i] = port;
-		port += sz;
-	}
-	hw->io_ports.ctl_addr = (unsigned long)ctrl;
-	hw->irq = irq;
-}
-
-static int rapide_probe(struct expansion_card *ec, const struct ecard_id *id)
-{
-	void __iomem *base;
-	struct ide_host *host;
-	int ret;
-	struct ide_hw hw, *hws[] = { &hw };
-
-	ret = ecard_request_resources(ec);
-	if (ret)
-		goto out;
-
-	base = ecardm_iomap(ec, ECARD_RES_MEMC, 0, 0);
-	if (!base) {
-		ret = -ENOMEM;
-		goto release;
-	}
-
-	memset(&hw, 0, sizeof(hw));
-	rapide_setup_ports(&hw, base, base + 0x818, 1 << 6, ec->irq);
-	hw.dev = &ec->dev;
-
-	ret = ide_host_add(&rapide_port_info, hws, 1, &host);
-	if (ret)
-		goto release;
-
-	ecard_set_drvdata(ec, host);
-	goto out;
-
- release:
-	ecard_release_resources(ec);
- out:
-	return ret;
-}
-
-static void rapide_remove(struct expansion_card *ec)
-{
-	struct ide_host *host = ecard_get_drvdata(ec);
-
-	ecard_set_drvdata(ec, NULL);
-
-	ide_host_remove(host);
-
-	ecard_release_resources(ec);
-}
-
-static struct ecard_id rapide_ids[] = {
-	{ MANU_YELLOWSTONE, PROD_YELLOWSTONE_RAPIDE32 },
-	{ 0xffff, 0xffff }
-};
-
-static struct ecard_driver rapide_driver = {
-	.probe		= rapide_probe,
-	.remove		= rapide_remove,
-	.id_table	= rapide_ids,
-	.drv = {
-		.name	= "rapide",
-	},
-};
-
-static int __init rapide_init(void)
-{
-	return ecard_register_driver(&rapide_driver);
-}
-
-static void __exit rapide_exit(void)
-{
-	ecard_remove_driver(&rapide_driver);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Yellowstone RAPIDE driver");
-
-module_init(rapide_init);
-module_exit(rapide_exit);
diff --git a/drivers/ide/rz1000.c b/drivers/ide/rz1000.c
deleted file mode 100644
index fce2b7de5a19..000000000000
--- a/drivers/ide/rz1000.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1995-1998  Linus Torvalds & author (see below)
- */
-
-/*
- *  Principal Author:  mlord@pobox.com (Mark Lord)
- *
- *  See linux/MAINTAINERS for address of current maintainer.
- *
- *  This file provides support for disabling the buggy read-ahead
- *  mode of the RZ1000 IDE chipset, commonly used on Intel motherboards.
- *
- *  Dunno if this fixes both ports, or only the primary port (?).
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "rz1000"
-
-static int rz1000_disable_readahead(struct pci_dev *dev)
-{
-	u16 reg;
-
-	if (!pci_read_config_word (dev, 0x40, &reg) &&
-	    !pci_write_config_word(dev, 0x40, reg & 0xdfff)) {
-		printk(KERN_INFO "%s: disabled chipset read-ahead "
-			"(buggy RZ1000/RZ1001)\n", pci_name(dev));
-		return 0;
-	} else {
-		printk(KERN_INFO "%s: serialized, disabled unmasking "
-			"(buggy RZ1000/RZ1001)\n", pci_name(dev));
-		return 1;
-	}
-}
-
-static const struct ide_port_info rz1000_chipset = {
-	.name		= DRV_NAME,
-	.host_flags	= IDE_HFLAG_NO_DMA,
-};
-
-static int rz1000_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct ide_port_info d = rz1000_chipset;
-	int rc;
-
-	rc = pci_enable_device(dev);
-	if (rc)
-		return rc;
-
-	if (rz1000_disable_readahead(dev)) {
-		d.host_flags |= IDE_HFLAG_SERIALIZE;
-		d.host_flags |= IDE_HFLAG_NO_UNMASK_IRQS;
-	}
-
-	return ide_pci_init_one(dev, &d, NULL);
-}
-
-static void rz1000_remove(struct pci_dev *dev)
-{
-	ide_pci_remove(dev);
-	pci_disable_device(dev);
-}
-
-static const struct pci_device_id rz1000_pci_tbl[] = {
-	{ PCI_VDEVICE(PCTECH, PCI_DEVICE_ID_PCTECH_RZ1000), 0 },
-	{ PCI_VDEVICE(PCTECH, PCI_DEVICE_ID_PCTECH_RZ1001), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, rz1000_pci_tbl);
-
-static struct pci_driver rz1000_pci_driver = {
-	.name		= "RZ1000_IDE",
-	.id_table	= rz1000_pci_tbl,
-	.probe		= rz1000_init_one,
-	.remove		= rz1000_remove,
-};
-
-static int __init rz1000_ide_init(void)
-{
-	return ide_pci_register_driver(&rz1000_pci_driver);
-}
-
-static void __exit rz1000_ide_exit(void)
-{
-	pci_unregister_driver(&rz1000_pci_driver);
-}
-
-module_init(rz1000_ide_init);
-module_exit(rz1000_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick");
-MODULE_DESCRIPTION("PCI driver module for RZ1000 IDE");
-MODULE_LICENSE("GPL");
-
diff --git a/drivers/ide/sc1200.c b/drivers/ide/sc1200.c
deleted file mode 100644
index a5b701818405..000000000000
--- a/drivers/ide/sc1200.c
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (C) 2000-2002		Mark Lord <mlord@pobox.com>
- * Copyright (C)      2007		Bartlomiej Zolnierkiewicz
- *
- * May be copied or modified under the terms of the GNU General Public License
- *
- * Development of this chipset driver was funded
- * by the nice folks at National Semiconductor.
- *
- * Documentation:
- *	Available from National Semiconductor
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-#include <linux/pm.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "sc1200"
-
-#define SC1200_REV_A	0x00
-#define SC1200_REV_B1	0x01
-#define SC1200_REV_B3	0x02
-#define SC1200_REV_C1	0x03
-#define SC1200_REV_D1	0x04
-
-#define PCI_CLK_33	0x00
-#define PCI_CLK_48	0x01
-#define PCI_CLK_66	0x02
-#define PCI_CLK_33A	0x03
-
-static unsigned short sc1200_get_pci_clock (void)
-{
-	unsigned char chip_id, silicon_revision;
-	unsigned int pci_clock;
-	/*
-	 * Check the silicon revision, as not all versions of the chip
-	 * have the register with the fast PCI bus timings.
-	 */
-	chip_id = inb (0x903c);
-	silicon_revision = inb (0x903d);
-
-	// Read the fast pci clock frequency
-	if (chip_id == 0x04 && silicon_revision < SC1200_REV_B1) {
-		pci_clock = PCI_CLK_33;
-	} else {
-		// check clock generator configuration (cfcc)
-		// the clock is in bits 8 and 9 of this word
-
-		pci_clock = inw (0x901e);
-		pci_clock >>= 8;
-		pci_clock &= 0x03;
-		if (pci_clock == PCI_CLK_33A)
-			pci_clock = PCI_CLK_33;
-	}
-	return pci_clock;
-}
-
-/*
- * Here are the standard PIO mode 0-4 timings for each "format".
- * Format-0 uses fast data reg timings, with slower command reg timings.
- * Format-1 uses fast timings for all registers, but won't work with all drives.
- */
-static const unsigned int sc1200_pio_timings[4][5] =
-	{{0x00009172, 0x00012171, 0x00020080, 0x00032010, 0x00040010},	// format0  33Mhz
-	 {0xd1329172, 0x71212171, 0x30200080, 0x20102010, 0x00100010},	// format1, 33Mhz
-	 {0xfaa3f4f3, 0xc23232b2, 0x513101c1, 0x31213121, 0x10211021},	// format1, 48Mhz
-	 {0xfff4fff4, 0xf35353d3, 0x814102f1, 0x42314231, 0x11311131}};	// format1, 66Mhz
-
-/*
- * After chip reset, the PIO timings are set to 0x00009172, which is not valid.
- */
-//#define SC1200_BAD_PIO(timings) (((timings)&~0x80000000)==0x00009172)
-
-static void sc1200_tunepio(ide_drive_t *drive, u8 pio)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	unsigned int basereg = hwif->channel ? 0x50 : 0x40, format = 0;
-
-	pci_read_config_dword(pdev, basereg + 4, &format);
-	format = (format >> 31) & 1;
-	if (format)
-		format += sc1200_get_pci_clock();
-	pci_write_config_dword(pdev, basereg + ((drive->dn & 1) << 3),
-			       sc1200_pio_timings[format][pio]);
-}
-
-/*
- *	The SC1200 specifies that two drives sharing a cable cannot mix
- *	UDMA/MDMA.  It has to be one or the other, for the pair, though
- *	different timings can still be chosen for each drive.  We could
- *	set the appropriate timing bits on the fly, but that might be
- *	a bit confusing.  So, for now we statically handle this requirement
- *	by looking at our mate drive to see what it is capable of, before
- *	choosing a mode for our own drive.
- */
-static u8 sc1200_udma_filter(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	ide_drive_t *mate = ide_get_pair_dev(drive);
-	u16 *mateid;
-	u8 mask = hwif->ultra_mask;
-
-	if (mate == NULL)
-		goto out;
-	mateid = mate->id;
-
-	if (ata_id_has_dma(mateid) && __ide_dma_bad_drive(mate) == 0) {
-		if ((mateid[ATA_ID_FIELD_VALID] & 4) &&
-		    (mateid[ATA_ID_UDMA_MODES] & 7))
-			goto out;
-		if (mateid[ATA_ID_MWDMA_MODES] & 7)
-			mask = 0;
-	}
-out:
-	return mask;
-}
-
-static void sc1200_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev		*dev = to_pci_dev(hwif->dev);
-	unsigned int		reg, timings;
-	unsigned short		pci_clock;
-	unsigned int		basereg = hwif->channel ? 0x50 : 0x40;
-	const u8		mode = drive->dma_mode;
-
-	static const u32 udma_timing[3][3] = {
-		{ 0x00921250, 0x00911140, 0x00911030 },
-		{ 0x00932470, 0x00922260, 0x00922140 },
-		{ 0x009436a1, 0x00933481, 0x00923261 },
-	};
-
-	static const u32 mwdma_timing[3][3] = {
-		{ 0x00077771, 0x00012121, 0x00002020 },
-		{ 0x000bbbb2, 0x00024241, 0x00013131 },
-		{ 0x000ffff3, 0x00035352, 0x00015151 },
-	};
-
-	pci_clock = sc1200_get_pci_clock();
-
-	/*
-	 * Note that each DMA mode has several timings associated with it.
-	 * The correct timing depends on the fast PCI clock freq.
-	 */
-
-	if (mode >= XFER_UDMA_0)
-		timings =  udma_timing[pci_clock][mode - XFER_UDMA_0];
-	else
-		timings = mwdma_timing[pci_clock][mode - XFER_MW_DMA_0];
-
-	if ((drive->dn & 1) == 0) {
-		pci_read_config_dword(dev, basereg + 4, &reg);
-		timings |= reg & 0x80000000;	/* preserve PIO format bit */
-		pci_write_config_dword(dev, basereg + 4, timings);
-	} else
-		pci_write_config_dword(dev, basereg + 12, timings);
-}
-
-/*  Replacement for the standard ide_dma_end action in
- *  dma_proc.
- *
- *  returns 1 on error, 0 otherwise
- */
-static int sc1200_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned long dma_base = hwif->dma_base;
-	u8 dma_stat;
-
-	dma_stat = inb(dma_base+2);		/* get DMA status */
-
-	if (!(dma_stat & 4))
-		printk(" ide_dma_end dma_stat=%0x err=%x newerr=%x\n",
-		  dma_stat, ((dma_stat&7)!=4), ((dma_stat&2)==2));
-
-	outb(dma_stat|0x1b, dma_base+2);	/* clear the INTR & ERROR bits */
-	outb(inb(dma_base)&~1, dma_base);	/* !! DO THIS HERE !! stop DMA */
-
-	return (dma_stat & 7) != 4;		/* verify good DMA status */
-}
-
-/*
- * sc1200_set_pio_mode() handles setting of PIO modes
- * for both the chipset and drive.
- *
- * All existing BIOSs for this chipset guarantee that all drives
- * will have valid default PIO timings set up before we get here.
- */
-
-static void sc1200_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	int		mode = -1;
-	const u8	pio = drive->pio_mode - XFER_PIO_0;
-
-	/*
-	 * bad abuse of ->set_pio_mode interface
-	 */
-	switch (pio) {
-		case 200: mode = XFER_UDMA_0;	break;
-		case 201: mode = XFER_UDMA_1;	break;
-		case 202: mode = XFER_UDMA_2;	break;
-		case 100: mode = XFER_MW_DMA_0;	break;
-		case 101: mode = XFER_MW_DMA_1;	break;
-		case 102: mode = XFER_MW_DMA_2;	break;
-	}
-	if (mode != -1) {
-		printk("SC1200: %s: changing (U)DMA mode\n", drive->name);
-		ide_dma_off_quietly(drive);
-		if (ide_set_dma_mode(drive, mode) == 0 &&
-		    (drive->dev_flags & IDE_DFLAG_USING_DMA))
-			hwif->dma_ops->dma_host_set(drive, 1);
-		return;
-	}
-
-	sc1200_tunepio(drive, pio);
-}
-
-#ifdef CONFIG_PM
-struct sc1200_saved_state {
-	u32 regs[8];
-};
-
-static int sc1200_suspend (struct pci_dev *dev, pm_message_t state)
-{
-	printk("SC1200: suspend(%u)\n", state.event);
-
-	/*
-	 * we only save state when going from full power to less
-	 */
-	if (state.event == PM_EVENT_ON) {
-		struct ide_host *host = pci_get_drvdata(dev);
-		struct sc1200_saved_state *ss = host->host_priv;
-		unsigned int r;
-
-		/*
-		 * save timing registers
-		 * (this may be unnecessary if BIOS also does it)
-		 */
-		for (r = 0; r < 8; r++)
-			pci_read_config_dword(dev, 0x40 + r * 4, &ss->regs[r]);
-	}
-
-	pci_disable_device(dev);
-	pci_set_power_state(dev, pci_choose_state(dev, state));
-	return 0;
-}
-
-static int sc1200_resume (struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct sc1200_saved_state *ss = host->host_priv;
-	unsigned int r;
-	int i;
-
-	i = pci_enable_device(dev);
-	if (i)
-		return i;
-
-	/*
-	 * restore timing registers
-	 * (this may be unnecessary if BIOS also does it)
-	 */
-	for (r = 0; r < 8; r++)
-		pci_write_config_dword(dev, 0x40 + r * 4, ss->regs[r]);
-
-	return 0;
-}
-#endif
-
-static const struct ide_port_ops sc1200_port_ops = {
-	.set_pio_mode		= sc1200_set_pio_mode,
-	.set_dma_mode		= sc1200_set_dma_mode,
-	.udma_filter		= sc1200_udma_filter,
-};
-
-static const struct ide_dma_ops sc1200_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= ide_dma_start,
-	.dma_end		= sc1200_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_port_info sc1200_chipset = {
-	.name		= DRV_NAME,
-	.port_ops	= &sc1200_port_ops,
-	.dma_ops	= &sc1200_dma_ops,
-	.host_flags	= IDE_HFLAG_SERIALIZE |
-			  IDE_HFLAG_POST_SET_MODE |
-			  IDE_HFLAG_ABUSE_DMA_MODES,
-	.pio_mask	= ATA_PIO4,
-	.mwdma_mask	= ATA_MWDMA2,
-	.udma_mask	= ATA_UDMA2,
-};
-
-static int sc1200_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct sc1200_saved_state *ss = NULL;
-	int rc;
-
-#ifdef CONFIG_PM
-	ss = kmalloc(sizeof(*ss), GFP_KERNEL);
-	if (ss == NULL)
-		return -ENOMEM;
-#endif
-	rc = ide_pci_init_one(dev, &sc1200_chipset, ss);
-	if (rc)
-		kfree(ss);
-
-	return rc;
-}
-
-static const struct pci_device_id sc1200_pci_tbl[] = {
-	{ PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SCx200_IDE), 0},
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, sc1200_pci_tbl);
-
-static struct pci_driver sc1200_pci_driver = {
-	.name		= "SC1200_IDE",
-	.id_table	= sc1200_pci_tbl,
-	.probe		= sc1200_init_one,
-	.remove		= ide_pci_remove,
-#ifdef CONFIG_PM
-	.suspend	= sc1200_suspend,
-	.resume		= sc1200_resume,
-#endif
-};
-
-static int __init sc1200_ide_init(void)
-{
-	return ide_pci_register_driver(&sc1200_pci_driver);
-}
-
-static void __exit sc1200_ide_exit(void)
-{
-	pci_unregister_driver(&sc1200_pci_driver);
-}
-
-module_init(sc1200_ide_init);
-module_exit(sc1200_ide_exit);
-
-MODULE_AUTHOR("Mark Lord");
-MODULE_DESCRIPTION("PCI driver module for NS SC1200 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c
deleted file mode 100644
index 458e72e034b0..000000000000
--- a/drivers/ide/serverworks.c
+++ /dev/null
@@ -1,456 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 1998-2000 Michel Aubry
- * Copyright (C) 1998-2000 Andrzej Krzysztofowicz
- * Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org>
- * Copyright (C) 2007-2010 Bartlomiej Zolnierkiewicz
- * Portions copyright (c) 2001 Sun Microsystems
- *
- *
- * RCC/ServerWorks IDE driver for Linux
- *
- *   OSB4: `Open South Bridge' IDE Interface (fn 1)
- *         supports UDMA mode 2 (33 MB/s)
- *
- *   CSB5: `Champion South Bridge' IDE Interface (fn 1)
- *         all revisions support UDMA mode 4 (66 MB/s)
- *         revision A2.0 and up support UDMA mode 5 (100 MB/s)
- *
- *         *** The CSB5 does not provide ANY register ***
- *         *** to detect 80-conductor cable presence. ***
- *
- *   CSB6: `Champion South Bridge' IDE Interface (optional: third channel)
- *
- *   HT1000: AKA BCM5785 - Hypertransport Southbridge for Opteron systems. IDE
- *   controller same as the CSB6. Single channel ATA100 only.
- *
- * Documentation:
- *	Available under NDA only. Errata info very hard to get.
- *
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "serverworks"
-
-#define SVWKS_CSB5_REVISION_NEW	0x92 /* min PCI_REVISION_ID for UDMA5 (A2.0) */
-#define SVWKS_CSB6_REVISION	0xa0 /* min PCI_REVISION_ID for UDMA4 (A1.0) */
-
-/* Seagate Barracuda ATA IV Family drives in UDMA mode 5
- * can overrun their FIFOs when used with the CSB5 */
-static const char *svwks_bad_ata100[] = {
-	"ST320011A",
-	"ST340016A",
-	"ST360021A",
-	"ST380021A",
-	NULL
-};
-
-static int check_in_drive_lists (ide_drive_t *drive, const char **list)
-{
-	char *m = (char *)&drive->id[ATA_ID_PROD];
-
-	while (*list)
-		if (!strcmp(*list++, m))
-			return 1;
-	return 0;
-}
-
-static u8 svwks_udma_filter(ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-
-	if (dev->device == PCI_DEVICE_ID_SERVERWORKS_HT1000IDE) {
-		return 0x1f;
-	} else if (dev->revision < SVWKS_CSB5_REVISION_NEW) {
-		return 0x07;
-	} else {
-		u8 btr = 0, mode, mask;
-
-		pci_read_config_byte(dev, 0x5A, &btr);
-		mode = btr & 0x3;
-
-		/* If someone decides to do UDMA133 on CSB5 the same
-		   issue will bite so be inclusive */
-		if (mode > 2 && check_in_drive_lists(drive, svwks_bad_ata100))
-			mode = 2;
-
-		switch(mode) {
-		case 3:	 mask = 0x3f; break;
-		case 2:	 mask = 0x1f; break;
-		case 1:	 mask = 0x07; break;
-		default: mask = 0x00; break;
-		}
-
-		return mask;
-	}
-}
-
-static u8 svwks_csb_check (struct pci_dev *dev)
-{
-	switch (dev->device) {
-		case PCI_DEVICE_ID_SERVERWORKS_CSB5IDE:
-		case PCI_DEVICE_ID_SERVERWORKS_CSB6IDE:
-		case PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2:
-		case PCI_DEVICE_ID_SERVERWORKS_HT1000IDE:
-			return 1;
-		default:
-			break;
-	}
-	return 0;
-}
-
-static void svwks_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	static const u8 pio_modes[] = { 0x5d, 0x47, 0x34, 0x22, 0x20 };
-	static const u8 drive_pci[] = { 0x41, 0x40, 0x43, 0x42 };
-
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	if (drive->dn >= ARRAY_SIZE(drive_pci))
-		return;
-
-	pci_write_config_byte(dev, drive_pci[drive->dn], pio_modes[pio]);
-
-	if (svwks_csb_check(dev)) {
-		u16 csb_pio = 0;
-
-		pci_read_config_word(dev, 0x4a, &csb_pio);
-
-		csb_pio &= ~(0x0f << (4 * drive->dn));
-		csb_pio |= (pio << (4 * drive->dn));
-
-		pci_write_config_word(dev, 0x4a, csb_pio);
-	}
-}
-
-static void svwks_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	static const u8 udma_modes[]		= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05 };
-	static const u8 dma_modes[]		= { 0x77, 0x21, 0x20 };
-	static const u8 drive_pci2[]		= { 0x45, 0x44, 0x47, 0x46 };
-
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	const u8 speed		= drive->dma_mode;
-	u8 unit			= drive->dn & 1;
-
-	u8 ultra_enable	 = 0, ultra_timing = 0, dma_timing = 0;
-
-	if (drive->dn >= ARRAY_SIZE(drive_pci2))
-		return;
-
-	pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing);
-	pci_read_config_byte(dev, 0x54, &ultra_enable);
-
-	ultra_timing	&= ~(0x0F << (4*unit));
-	ultra_enable	&= ~(0x01 << drive->dn);
-
-	if (speed >= XFER_UDMA_0) {
-		dma_timing   |= dma_modes[2];
-		ultra_timing |= (udma_modes[speed - XFER_UDMA_0] << (4 * unit));
-		ultra_enable |= (0x01 << drive->dn);
-	} else if (speed >= XFER_MW_DMA_0)
-		dma_timing   |= dma_modes[speed - XFER_MW_DMA_0];
-
-	pci_write_config_byte(dev, drive_pci2[drive->dn], dma_timing);
-	pci_write_config_byte(dev, (0x56|hwif->channel), ultra_timing);
-	pci_write_config_byte(dev, 0x54, ultra_enable);
-}
-
-static int init_chipset_svwks(struct pci_dev *dev)
-{
-	unsigned int reg;
-	u8 btr;
-
-	/* force Master Latency Timer value to 64 PCICLKs */
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x40);
-
-	/* OSB4 : South Bridge and IDE */
-	if (dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) {
-		struct pci_dev *isa_dev =
-			pci_get_device(PCI_VENDOR_ID_SERVERWORKS,
-					PCI_DEVICE_ID_SERVERWORKS_OSB4, NULL);
-		if (isa_dev) {
-			pci_read_config_dword(isa_dev, 0x64, &reg);
-			reg &= ~0x00002000; /* disable 600ns interrupt mask */
-			if(!(reg & 0x00004000))
-				printk(KERN_DEBUG DRV_NAME " %s: UDMA not BIOS "
-					"enabled.\n", pci_name(dev));
-			reg |=  0x00004000; /* enable UDMA/33 support */
-			pci_write_config_dword(isa_dev, 0x64, reg);
-			pci_dev_put(isa_dev);
-		}
-	}
-
-	/* setup CSB5/CSB6 : South Bridge and IDE option RAID */
-	else if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE) ||
-		 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) ||
-		 (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2)) {
-
-		/* Third Channel Test */
-		if (!(PCI_FUNC(dev->devfn) & 1)) {
-			struct pci_dev * findev = NULL;
-			u32 reg4c = 0;
-			findev = pci_get_device(PCI_VENDOR_ID_SERVERWORKS,
-				PCI_DEVICE_ID_SERVERWORKS_CSB5, NULL);
-			if (findev) {
-				pci_read_config_dword(findev, 0x4C, &reg4c);
-				reg4c &= ~0x000007FF;
-				reg4c |=  0x00000040;
-				reg4c |=  0x00000020;
-				pci_write_config_dword(findev, 0x4C, reg4c);
-				pci_dev_put(findev);
-			}
-			outb_p(0x06, 0x0c00);
-			dev->irq = inb_p(0x0c01);
-		} else {
-			struct pci_dev * findev = NULL;
-			u8 reg41 = 0;
-
-			findev = pci_get_device(PCI_VENDOR_ID_SERVERWORKS,
-					PCI_DEVICE_ID_SERVERWORKS_CSB6, NULL);
-			if (findev) {
-				pci_read_config_byte(findev, 0x41, &reg41);
-				reg41 &= ~0x40;
-				pci_write_config_byte(findev, 0x41, reg41);
-				pci_dev_put(findev);
-			}
-			/*
-			 * This is a device pin issue on CSB6.
-			 * Since there will be a future raid mode,
-			 * early versions of the chipset require the
-			 * interrupt pin to be set, and it is a compatibility
-			 * mode issue.
-			 */
-			if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE)
-				dev->irq = 0;
-		}
-//		pci_read_config_dword(dev, 0x40, &pioreg)
-//		pci_write_config_dword(dev, 0x40, 0x99999999);
-//		pci_read_config_dword(dev, 0x44, &dmareg);
-//		pci_write_config_dword(dev, 0x44, 0xFFFFFFFF);
-		/* setup the UDMA Control register
-		 *
-		 * 1. clear bit 6 to enable DMA
-		 * 2. enable DMA modes with bits 0-1
-		 * 	00 : legacy
-		 * 	01 : udma2
-		 * 	10 : udma2/udma4
-		 * 	11 : udma2/udma4/udma5
-		 */
-		pci_read_config_byte(dev, 0x5A, &btr);
-		btr &= ~0x40;
-		if (!(PCI_FUNC(dev->devfn) & 1))
-			btr |= 0x2;
-		else
-			btr |= (dev->revision >= SVWKS_CSB5_REVISION_NEW) ? 0x3 : 0x2;
-		pci_write_config_byte(dev, 0x5A, btr);
-	}
-	/* Setup HT1000 SouthBridge Controller - Single Channel Only */
-	else if (dev->device == PCI_DEVICE_ID_SERVERWORKS_HT1000IDE) {
-		pci_read_config_byte(dev, 0x5A, &btr);
-		btr &= ~0x40;
-		btr |= 0x3;
-		pci_write_config_byte(dev, 0x5A, btr);
-	}
-
-	return 0;
-}
-
-static u8 ata66_svwks_svwks(ide_hwif_t *hwif)
-{
-	return ATA_CBL_PATA80;
-}
-
-/* On Dell PowerEdge servers with a CSB5/CSB6, the top two bits
- * of the subsystem device ID indicate presence of an 80-pin cable.
- * Bit 15 clear = secondary IDE channel does not have 80-pin cable.
- * Bit 15 set   = secondary IDE channel has 80-pin cable.
- * Bit 14 clear = primary IDE channel does not have 80-pin cable.
- * Bit 14 set   = primary IDE channel has 80-pin cable.
- */
-static u8 ata66_svwks_dell(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-
-	if (dev->subsystem_vendor == PCI_VENDOR_ID_DELL &&
-	    dev->vendor	== PCI_VENDOR_ID_SERVERWORKS &&
-	    (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE ||
-	     dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE))
-		return ((1 << (hwif->channel + 14)) &
-			dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
-	return ATA_CBL_PATA40;
-}
-
-/* Sun Cobalt Alpine hardware avoids the 80-pin cable
- * detect issue by attaching the drives directly to the board.
- * This check follows the Dell precedent (how scary is that?!)
- *
- * WARNING: this only works on Alpine hardware!
- */
-static u8 ata66_svwks_cobalt(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-
-	if (dev->subsystem_vendor == PCI_VENDOR_ID_SUN &&
-	    dev->vendor	== PCI_VENDOR_ID_SERVERWORKS &&
-	    dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE)
-		return ((1 << (hwif->channel + 14)) &
-			dev->subsystem_device) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
-	return ATA_CBL_PATA40;
-}
-
-static u8 svwks_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-
-	/* Server Works */
-	if (dev->subsystem_vendor == PCI_VENDOR_ID_SERVERWORKS)
-		return ata66_svwks_svwks (hwif);
-	
-	/* Dell PowerEdge */
-	if (dev->subsystem_vendor == PCI_VENDOR_ID_DELL)
-		return ata66_svwks_dell (hwif);
-
-	/* Cobalt Alpine */
-	if (dev->subsystem_vendor == PCI_VENDOR_ID_SUN)
-		return ata66_svwks_cobalt (hwif);
-
-	/* Per Specified Design by OEM, and ASIC Architect */
-	if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE) ||
-	    (dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2))
-		return ATA_CBL_PATA80;
-
-	return ATA_CBL_PATA40;
-}
-
-static const struct ide_port_ops osb4_port_ops = {
-	.set_pio_mode		= svwks_set_pio_mode,
-	.set_dma_mode		= svwks_set_dma_mode,
-};
-
-static const struct ide_port_ops svwks_port_ops = {
-	.set_pio_mode		= svwks_set_pio_mode,
-	.set_dma_mode		= svwks_set_dma_mode,
-	.udma_filter		= svwks_udma_filter,
-	.cable_detect		= svwks_cable_detect,
-};
-
-static const struct ide_port_info serverworks_chipsets[] = {
-	{	/* 0: OSB4 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_svwks,
-		.port_ops	= &osb4_port_ops,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= 0x00, /* UDMA is problematic on OSB4 */
-	},
-	{	/* 1: CSB5 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_svwks,
-		.port_ops	= &svwks_port_ops,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
-	},
-	{	/* 2: CSB6 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_svwks,
-		.port_ops	= &svwks_port_ops,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
-	},
-	{	/* 3: CSB6-2 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_svwks,
-		.port_ops	= &svwks_port_ops,
-		.host_flags	= IDE_HFLAG_SINGLE,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
-	},
-	{	/* 4: HT1000 */
-		.name		= DRV_NAME,
-		.init_chipset	= init_chipset_svwks,
-		.port_ops	= &svwks_port_ops,
-		.host_flags	= IDE_HFLAG_SINGLE,
-		.pio_mask	= ATA_PIO4,
-		.mwdma_mask	= ATA_MWDMA2,
-		.udma_mask	= ATA_UDMA5,
-	}
-};
-
-/**
- *	svwks_init_one	-	called when a OSB/CSB is found
- *	@dev: the svwks device
- *	@id: the matching pci id
- *
- *	Called when the PCI registration layer (or the IDE initialization)
- *	finds a device matching our IDE device tables.
- */
- 
-static int svwks_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct ide_port_info d;
-	u8 idx = id->driver_data;
-
-	d = serverworks_chipsets[idx];
-
-	if (idx == 1)
-		d.host_flags |= IDE_HFLAG_CLEAR_SIMPLEX;
-	else if (idx == 2 || idx == 3) {
-		if ((PCI_FUNC(dev->devfn) & 1) == 0) {
-			if (pci_resource_start(dev, 0) != 0x01f1)
-				d.host_flags |= IDE_HFLAG_NON_BOOTABLE;
-			d.host_flags |= IDE_HFLAG_SINGLE;
-		} else
-			d.host_flags &= ~IDE_HFLAG_SINGLE;
-	}
-
-	return ide_pci_init_one(dev, &d, NULL);
-}
-
-static const struct pci_device_id svwks_pci_tbl[] = {
-	{ PCI_VDEVICE(SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_OSB4IDE),   0 },
-	{ PCI_VDEVICE(SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB5IDE),   1 },
-	{ PCI_VDEVICE(SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB6IDE),   2 },
-	{ PCI_VDEVICE(SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2),  3 },
-	{ PCI_VDEVICE(SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT1000IDE), 4 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, svwks_pci_tbl);
-
-static struct pci_driver svwks_pci_driver = {
-	.name		= "Serverworks_IDE",
-	.id_table	= svwks_pci_tbl,
-	.probe		= svwks_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init svwks_ide_init(void)
-{
-	return ide_pci_register_driver(&svwks_pci_driver);
-}
-
-static void __exit svwks_ide_exit(void)
-{
-	pci_unregister_driver(&svwks_pci_driver);
-}
-
-module_init(svwks_ide_init);
-module_exit(svwks_ide_exit);
-
-MODULE_AUTHOR("Michael Aubry. Andrzej Krzysztofowicz, Andre Hedrick, Bartlomiej Zolnierkiewicz");
-MODULE_DESCRIPTION("PCI driver module for Serverworks OSB4/CSB5/CSB6 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
deleted file mode 100644
index fdc8e813170c..000000000000
--- a/drivers/ide/setup-pci.c
+++ /dev/null
@@ -1,682 +0,0 @@
-/*
- *  Copyright (C) 1998-2000  Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 1995-1998  Mark Lord
- *  Copyright (C) 2007-2009  Bartlomiej Zolnierkiewicz
- *
- *  May be copied or modified under the terms of the GNU General Public License
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/ide.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/io.h>
-
-/**
- *	ide_setup_pci_baseregs	-	place a PCI IDE controller native
- *	@dev: PCI device of interface to switch native
- *	@name: Name of interface
- *
- *	We attempt to place the PCI interface into PCI native mode. If
- *	we succeed the BARs are ok and the controller is in PCI mode.
- *	Returns 0 on success or an errno code.
- *
- *	FIXME: if we program the interface and then fail to set the BARS
- *	we don't switch it back to legacy mode. Do we actually care ??
- */
-
-static int ide_setup_pci_baseregs(struct pci_dev *dev, const char *name)
-{
-	u8 progif = 0;
-
-	/*
-	 * Place both IDE interfaces into PCI "native" mode:
-	 */
-	if (pci_read_config_byte(dev, PCI_CLASS_PROG, &progif) ||
-			 (progif & 5) != 5) {
-		if ((progif & 0xa) != 0xa) {
-			printk(KERN_INFO "%s %s: device not capable of full "
-				"native PCI mode\n", name, pci_name(dev));
-			return -EOPNOTSUPP;
-		}
-		printk(KERN_INFO "%s %s: placing both ports into native PCI "
-			"mode\n", name, pci_name(dev));
-		(void) pci_write_config_byte(dev, PCI_CLASS_PROG, progif|5);
-		if (pci_read_config_byte(dev, PCI_CLASS_PROG, &progif) ||
-		    (progif & 5) != 5) {
-			printk(KERN_ERR "%s %s: rewrite of PROGIF failed, "
-				"wanted 0x%04x, got 0x%04x\n",
-				name, pci_name(dev), progif | 5, progif);
-			return -EOPNOTSUPP;
-		}
-	}
-	return 0;
-}
-
-#ifdef CONFIG_BLK_DEV_IDEDMA_PCI
-static int ide_pci_clear_simplex(unsigned long dma_base, const char *name)
-{
-	u8 dma_stat = inb(dma_base + 2);
-
-	outb(dma_stat & 0x60, dma_base + 2);
-	dma_stat = inb(dma_base + 2);
-
-	return (dma_stat & 0x80) ? 1 : 0;
-}
-
-/**
- *	ide_pci_dma_base	-	setup BMIBA
- *	@hwif: IDE interface
- *	@d: IDE port info
- *
- *	Fetch the DMA Bus-Master-I/O-Base-Address (BMIBA) from PCI space.
- */
-
-unsigned long ide_pci_dma_base(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	unsigned long dma_base = 0;
-
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		return hwif->dma_base;
-
-	if (hwif->mate && hwif->mate->dma_base) {
-		dma_base = hwif->mate->dma_base - (hwif->channel ? 0 : 8);
-	} else {
-		u8 baridx = (d->host_flags & IDE_HFLAG_CS5520) ? 2 : 4;
-
-		dma_base = pci_resource_start(dev, baridx);
-
-		if (dma_base == 0) {
-			printk(KERN_ERR "%s %s: DMA base is invalid\n",
-				d->name, pci_name(dev));
-			return 0;
-		}
-	}
-
-	if (hwif->channel)
-		dma_base += 8;
-
-	return dma_base;
-}
-EXPORT_SYMBOL_GPL(ide_pci_dma_base);
-
-int ide_pci_check_simplex(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u8 dma_stat;
-
-	if (d->host_flags & (IDE_HFLAG_MMIO | IDE_HFLAG_CS5520))
-		goto out;
-
-	if (d->host_flags & IDE_HFLAG_CLEAR_SIMPLEX) {
-		if (ide_pci_clear_simplex(hwif->dma_base, d->name))
-			printk(KERN_INFO "%s %s: simplex device: DMA forced\n",
-				d->name, pci_name(dev));
-		goto out;
-	}
-
-	/*
-	 * If the device claims "simplex" DMA, this means that only one of
-	 * the two interfaces can be trusted with DMA at any point in time
-	 * (so we should enable DMA only on one of the two interfaces).
-	 *
-	 * FIXME: At this point we haven't probed the drives so we can't make
-	 * the appropriate decision.  Really we should defer this problem until
-	 * we tune the drive then try to grab DMA ownership if we want to be
-	 * the DMA end.  This has to be become dynamic to handle hot-plug.
-	 */
-	dma_stat = hwif->dma_ops->dma_sff_read_status(hwif);
-	if ((dma_stat & 0x80) && hwif->mate && hwif->mate->dma_base) {
-		printk(KERN_INFO "%s %s: simplex device: DMA disabled\n",
-			d->name, pci_name(dev));
-		return -1;
-	}
-out:
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_pci_check_simplex);
-
-/*
- * Set up BM-DMA capability (PnP BIOS should have done this)
- */
-int ide_pci_set_master(struct pci_dev *dev, const char *name)
-{
-	u16 pcicmd;
-
-	pci_read_config_word(dev, PCI_COMMAND, &pcicmd);
-
-	if ((pcicmd & PCI_COMMAND_MASTER) == 0) {
-		pci_set_master(dev);
-
-		if (pci_read_config_word(dev, PCI_COMMAND, &pcicmd) ||
-		    (pcicmd & PCI_COMMAND_MASTER) == 0) {
-			printk(KERN_ERR "%s %s: error updating PCICMD\n",
-				name, pci_name(dev));
-			return -EIO;
-		}
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_pci_set_master);
-#endif /* CONFIG_BLK_DEV_IDEDMA_PCI */
-
-void ide_setup_pci_noise(struct pci_dev *dev, const struct ide_port_info *d)
-{
-	printk(KERN_INFO "%s %s: IDE controller (0x%04x:0x%04x rev 0x%02x)\n",
-		d->name, pci_name(dev),
-		dev->vendor, dev->device, dev->revision);
-}
-EXPORT_SYMBOL_GPL(ide_setup_pci_noise);
-
-
-/**
- *	ide_pci_enable	-	do PCI enables
- *	@dev: PCI device
- *	@bars: PCI BARs mask
- *	@d: IDE port info
- *
- *	Enable the IDE PCI device. We attempt to enable the device in full
- *	but if that fails then we only need IO space. The PCI code should
- *	have setup the proper resources for us already for controllers in
- *	legacy mode.
- *
- *	Returns zero on success or an error code
- */
-
-static int ide_pci_enable(struct pci_dev *dev, int bars,
-			  const struct ide_port_info *d)
-{
-	int ret;
-
-	if (pci_enable_device(dev)) {
-		ret = pci_enable_device_io(dev);
-		if (ret < 0) {
-			printk(KERN_WARNING "%s %s: couldn't enable device\n",
-				d->name, pci_name(dev));
-			goto out;
-		}
-		printk(KERN_WARNING "%s %s: BIOS configuration fixed\n",
-			d->name, pci_name(dev));
-	}
-
-	/*
-	 * assume all devices can do 32-bit DMA for now, we can add
-	 * a DMA mask field to the struct ide_port_info if we need it
-	 * (or let lower level driver set the DMA mask)
-	 */
-	ret = dma_set_mask(&dev->dev, DMA_BIT_MASK(32));
-	if (ret < 0) {
-		printk(KERN_ERR "%s %s: can't set DMA mask\n",
-			d->name, pci_name(dev));
-		goto out;
-	}
-
-	ret = pci_request_selected_regions(dev, bars, d->name);
-	if (ret < 0)
-		printk(KERN_ERR "%s %s: can't reserve resources\n",
-			d->name, pci_name(dev));
-out:
-	return ret;
-}
-
-/**
- *	ide_pci_configure	-	configure an unconfigured device
- *	@dev: PCI device
- *	@d: IDE port info
- *
- *	Enable and configure the PCI device we have been passed.
- *	Returns zero on success or an error code.
- */
-
-static int ide_pci_configure(struct pci_dev *dev, const struct ide_port_info *d)
-{
-	u16 pcicmd = 0;
-	/*
-	 * PnP BIOS was *supposed* to have setup this device, but we
-	 * can do it ourselves, so long as the BIOS has assigned an IRQ
-	 * (or possibly the device is using a "legacy header" for IRQs).
-	 * Maybe the user deliberately *disabled* the device,
-	 * but we'll eventually ignore it again if no drives respond.
-	 */
-	if (ide_setup_pci_baseregs(dev, d->name) ||
-	    pci_write_config_word(dev, PCI_COMMAND, pcicmd | PCI_COMMAND_IO)) {
-		printk(KERN_INFO "%s %s: device disabled (BIOS)\n",
-			d->name, pci_name(dev));
-		return -ENODEV;
-	}
-	if (pci_read_config_word(dev, PCI_COMMAND, &pcicmd)) {
-		printk(KERN_ERR "%s %s: error accessing PCI regs\n",
-			d->name, pci_name(dev));
-		return -EIO;
-	}
-	if (!(pcicmd & PCI_COMMAND_IO)) {
-		printk(KERN_ERR "%s %s: unable to enable IDE controller\n",
-			d->name, pci_name(dev));
-		return -ENXIO;
-	}
-	return 0;
-}
-
-/**
- *	ide_pci_check_iomem	-	check a register is I/O
- *	@dev: PCI device
- *	@d: IDE port info
- *	@bar: BAR number
- *
- *	Checks if a BAR is configured and points to MMIO space. If so,
- *	return an error code. Otherwise return 0
- */
-
-static int ide_pci_check_iomem(struct pci_dev *dev, const struct ide_port_info *d,
-			       int bar)
-{
-	ulong flags = pci_resource_flags(dev, bar);
-
-	/* Unconfigured ? */
-	if (!flags || pci_resource_len(dev, bar) == 0)
-		return 0;
-
-	/* I/O space */
-	if (flags & IORESOURCE_IO)
-		return 0;
-
-	/* Bad */
-	return -EINVAL;
-}
-
-/**
- *	ide_hw_configure	-	configure a struct ide_hw instance
- *	@dev: PCI device holding interface
- *	@d: IDE port info
- *	@port: port number
- *	@hw: struct ide_hw instance corresponding to this port
- *
- *	Perform the initial set up for the hardware interface structure. This
- *	is done per interface port rather than per PCI device. There may be
- *	more than one port per device.
- *
- *	Returns zero on success or an error code.
- */
-
-static int ide_hw_configure(struct pci_dev *dev, const struct ide_port_info *d,
-			    unsigned int port, struct ide_hw *hw)
-{
-	unsigned long ctl = 0, base = 0;
-
-	if ((d->host_flags & IDE_HFLAG_ISA_PORTS) == 0) {
-		if (ide_pci_check_iomem(dev, d, 2 * port) ||
-		    ide_pci_check_iomem(dev, d, 2 * port + 1)) {
-			printk(KERN_ERR "%s %s: I/O baseregs (BIOS) are "
-				"reported as MEM for port %d!\n",
-				d->name, pci_name(dev), port);
-			return -EINVAL;
-		}
-
-		ctl  = pci_resource_start(dev, 2*port+1);
-		base = pci_resource_start(dev, 2*port);
-	} else {
-		/* Use default values */
-		ctl = port ? 0x374 : 0x3f4;
-		base = port ? 0x170 : 0x1f0;
-	}
-
-	if (!base || !ctl) {
-		printk(KERN_ERR "%s %s: bad PCI BARs for port %d, skipping\n",
-			d->name, pci_name(dev), port);
-		return -EINVAL;
-	}
-
-	memset(hw, 0, sizeof(*hw));
-	hw->dev = &dev->dev;
-	ide_std_init_ports(hw, base, ctl | 2);
-
-	return 0;
-}
-
-#ifdef CONFIG_BLK_DEV_IDEDMA_PCI
-/**
- *	ide_hwif_setup_dma	-	configure DMA interface
- *	@hwif: IDE interface
- *	@d: IDE port info
- *
- *	Set up the DMA base for the interface. Enable the master bits as
- *	necessary and attempt to bring the device DMA into a ready to use
- *	state
- */
-
-int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-
-	if ((d->host_flags & IDE_HFLAG_NO_AUTODMA) == 0 ||
-	    ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE &&
-	     (dev->class & 0x80))) {
-		unsigned long base = ide_pci_dma_base(hwif, d);
-
-		if (base == 0)
-			return -1;
-
-		hwif->dma_base = base;
-
-		if (hwif->dma_ops == NULL)
-			hwif->dma_ops = &sff_dma_ops;
-
-		if (ide_pci_check_simplex(hwif, d) < 0)
-			return -1;
-
-		if (ide_pci_set_master(dev, d->name) < 0)
-			return -1;
-
-		if (hwif->host_flags & IDE_HFLAG_MMIO)
-			printk(KERN_INFO "    %s: MMIO-DMA\n", hwif->name);
-		else
-			printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
-					 hwif->name, base, base + 7);
-
-		hwif->extra_base = base + (hwif->channel ? 8 : 16);
-
-		if (ide_allocate_dma_engine(hwif))
-			return -1;
-	}
-
-	return 0;
-}
-#endif /* CONFIG_BLK_DEV_IDEDMA_PCI */
-
-/**
- *	ide_setup_pci_controller	-	set up IDE PCI
- *	@dev: PCI device
- *	@bars: PCI BARs mask
- *	@d: IDE port info
- *	@noisy: verbose flag
- *
- *	Set up the PCI and controller side of the IDE interface. This brings
- *	up the PCI side of the device, checks that the device is enabled
- *	and enables it if need be
- */
-
-static int ide_setup_pci_controller(struct pci_dev *dev, int bars,
-				    const struct ide_port_info *d, int noisy)
-{
-	int ret;
-	u16 pcicmd;
-
-	if (noisy)
-		ide_setup_pci_noise(dev, d);
-
-	ret = ide_pci_enable(dev, bars, d);
-	if (ret < 0)
-		goto out;
-
-	ret = pci_read_config_word(dev, PCI_COMMAND, &pcicmd);
-	if (ret < 0) {
-		printk(KERN_ERR "%s %s: error accessing PCI regs\n",
-			d->name, pci_name(dev));
-		goto out_free_bars;
-	}
-	if (!(pcicmd & PCI_COMMAND_IO)) {	/* is device disabled? */
-		ret = ide_pci_configure(dev, d);
-		if (ret < 0)
-			goto out_free_bars;
-		printk(KERN_INFO "%s %s: device enabled (Linux)\n",
-			d->name, pci_name(dev));
-	}
-
-	goto out;
-
-out_free_bars:
-	pci_release_selected_regions(dev, bars);
-out:
-	return ret;
-}
-
-/**
- *	ide_pci_setup_ports	-	configure ports/devices on PCI IDE
- *	@dev: PCI device
- *	@d: IDE port info
- *	@hw: struct ide_hw instances corresponding to this PCI IDE device
- *	@hws: struct ide_hw pointers table to update
- *
- *	Scan the interfaces attached to this device and do any
- *	necessary per port setup. Attach the devices and ask the
- *	generic DMA layer to do its work for us.
- *
- *	Normally called automaticall from do_ide_pci_setup_device,
- *	but is also used directly as a helper function by some controllers
- *	where the chipset setup is not the default PCI IDE one.
- */
-
-void ide_pci_setup_ports(struct pci_dev *dev, const struct ide_port_info *d,
-			 struct ide_hw *hw, struct ide_hw **hws)
-{
-	int channels = (d->host_flags & IDE_HFLAG_SINGLE) ? 1 : 2, port;
-	u8 tmp;
-
-	/*
-	 * Set up the IDE ports
-	 */
-
-	for (port = 0; port < channels; ++port) {
-		const struct ide_pci_enablebit *e = &d->enablebits[port];
-
-		if (e->reg && (pci_read_config_byte(dev, e->reg, &tmp) ||
-		    (tmp & e->mask) != e->val)) {
-			printk(KERN_INFO "%s %s: IDE port disabled\n",
-				d->name, pci_name(dev));
-			continue;	/* port not enabled */
-		}
-
-		if (ide_hw_configure(dev, d, port, hw + port))
-			continue;
-
-		*(hws + port) = hw + port;
-	}
-}
-EXPORT_SYMBOL_GPL(ide_pci_setup_ports);
-
-/*
- * ide_setup_pci_device() looks at the primary/secondary interfaces
- * on a PCI IDE device and, if they are enabled, prepares the IDE driver
- * for use with them.  This generic code works for most PCI chipsets.
- *
- * One thing that is not standardized is the location of the
- * primary/secondary interface "enable/disable" bits.  For chipsets that
- * we "know" about, this information is in the struct ide_port_info;
- * for all other chipsets, we just assume both interfaces are enabled.
- */
-static int do_ide_setup_pci_device(struct pci_dev *dev,
-				   const struct ide_port_info *d,
-				   u8 noisy)
-{
-	int pciirq, ret;
-
-	/*
-	 * Can we trust the reported IRQ?
-	 */
-	pciirq = dev->irq;
-
-	/*
-	 * This allows offboard ide-pci cards the enable a BIOS,
-	 * verify interrupt settings of split-mirror pci-config
-	 * space, place chipset into init-mode, and/or preserve
-	 * an interrupt if the card is not native ide support.
-	 */
-	ret = d->init_chipset ? d->init_chipset(dev) : 0;
-	if (ret < 0)
-		goto out;
-
-	if (ide_pci_is_in_compatibility_mode(dev)) {
-		if (noisy)
-			printk(KERN_INFO "%s %s: not 100%% native mode: will "
-				"probe irqs later\n", d->name, pci_name(dev));
-		pciirq = 0;
-	} else if (!pciirq && noisy) {
-		printk(KERN_WARNING "%s %s: bad irq (%d): will probe later\n",
-			d->name, pci_name(dev), pciirq);
-	} else if (noisy) {
-		printk(KERN_INFO "%s %s: 100%% native mode on irq %d\n",
-			d->name, pci_name(dev), pciirq);
-	}
-
-	ret = pciirq;
-out:
-	return ret;
-}
-
-int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
-		     const struct ide_port_info *d, void *priv)
-{
-	struct pci_dev *pdev[] = { dev1, dev2 };
-	struct ide_host *host;
-	int ret, i, n_ports = dev2 ? 4 : 2, bars;
-	struct ide_hw hw[4], *hws[] = { NULL, NULL, NULL, NULL };
-
-	if (d->host_flags & IDE_HFLAG_SINGLE)
-		bars = (1 << 2) - 1;
-	else
-		bars = (1 << 4) - 1;
-
-	if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) {
-		if (d->host_flags & IDE_HFLAG_CS5520)
-			bars |= (1 << 2);
-		else
-			bars |= (1 << 4);
-	}
-
-	for (i = 0; i < n_ports / 2; i++) {
-		ret = ide_setup_pci_controller(pdev[i], bars, d, !i);
-		if (ret < 0) {
-			if (i == 1)
-				pci_release_selected_regions(pdev[0], bars);
-			goto out;
-		}
-
-		ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]);
-	}
-
-	host = ide_host_alloc(d, hws, n_ports);
-	if (host == NULL) {
-		ret = -ENOMEM;
-		goto out_free_bars;
-	}
-
-	host->dev[0] = &dev1->dev;
-	if (dev2)
-		host->dev[1] = &dev2->dev;
-
-	host->host_priv = priv;
-	host->irq_flags = IRQF_SHARED;
-
-	pci_set_drvdata(pdev[0], host);
-	if (dev2)
-		pci_set_drvdata(pdev[1], host);
-
-	for (i = 0; i < n_ports / 2; i++) {
-		ret = do_ide_setup_pci_device(pdev[i], d, !i);
-
-		/*
-		 * FIXME: Mom, mom, they stole me the helper function to undo
-		 * do_ide_setup_pci_device() on the first device!
-		 */
-		if (ret < 0)
-			goto out_free_bars;
-
-		/* fixup IRQ */
-		if (ide_pci_is_in_compatibility_mode(pdev[i])) {
-			hw[i*2].irq = pci_get_legacy_ide_irq(pdev[i], 0);
-			hw[i*2 + 1].irq = pci_get_legacy_ide_irq(pdev[i], 1);
-		} else
-			hw[i*2 + 1].irq = hw[i*2].irq = ret;
-	}
-
-	ret = ide_host_register(host, d, hws);
-	if (ret)
-		ide_host_free(host);
-	else
-		goto out;
-
-out_free_bars:
-	i = n_ports / 2;
-	while (i--)
-		pci_release_selected_regions(pdev[i], bars);
-out:
-	return ret;
-}
-EXPORT_SYMBOL_GPL(ide_pci_init_two);
-
-int ide_pci_init_one(struct pci_dev *dev, const struct ide_port_info *d,
-		     void *priv)
-{
-	return ide_pci_init_two(dev, NULL, d, priv);
-}
-EXPORT_SYMBOL_GPL(ide_pci_init_one);
-
-void ide_pci_remove(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct pci_dev *dev2 = host->dev[1] ? to_pci_dev(host->dev[1]) : NULL;
-	int bars;
-
-	if (host->host_flags & IDE_HFLAG_SINGLE)
-		bars = (1 << 2) - 1;
-	else
-		bars = (1 << 4) - 1;
-
-	if ((host->host_flags & IDE_HFLAG_NO_DMA) == 0) {
-		if (host->host_flags & IDE_HFLAG_CS5520)
-			bars |= (1 << 2);
-		else
-			bars |= (1 << 4);
-	}
-
-	ide_host_remove(host);
-
-	if (dev2)
-		pci_release_selected_regions(dev2, bars);
-	pci_release_selected_regions(dev, bars);
-
-	if (dev2)
-		pci_disable_device(dev2);
-	pci_disable_device(dev);
-}
-EXPORT_SYMBOL_GPL(ide_pci_remove);
-
-#ifdef CONFIG_PM
-int ide_pci_suspend(struct pci_dev *dev, pm_message_t state)
-{
-	pci_save_state(dev);
-	pci_disable_device(dev);
-	pci_set_power_state(dev, pci_choose_state(dev, state));
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_pci_suspend);
-
-int ide_pci_resume(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	int rc;
-
-	pci_set_power_state(dev, PCI_D0);
-
-	rc = pci_enable_device(dev);
-	if (rc)
-		return rc;
-
-	pci_restore_state(dev);
-	pci_set_master(dev);
-
-	if (host->init_chipset)
-		host->init_chipset(dev);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ide_pci_resume);
-#endif
diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c
deleted file mode 100644
index c4b20f350b84..000000000000
--- a/drivers/ide/siimage.c
+++ /dev/null
@@ -1,843 +0,0 @@
-/*
- * Copyright (C) 2001-2002	Andre Hedrick <andre@linux-ide.org>
- * Copyright (C) 2003		Red Hat
- * Copyright (C) 2007-2008	MontaVista Software, Inc.
- * Copyright (C) 2007-2008	Bartlomiej Zolnierkiewicz
- *
- *  May be copied or modified under the terms of the GNU General Public License
- *
- *  Documentation for CMD680:
- *  http://gkernel.sourceforge.net/specs/sii/sii-0680a-v1.31.pdf.bz2
- *
- *  Documentation for SiI 3112:
- *  http://gkernel.sourceforge.net/specs/sii/3112A_SiI-DS-0095-B2.pdf.bz2
- *
- *  Errata and other documentation only available under NDA.
- *
- *
- *  FAQ Items:
- *	If you are using Marvell SATA-IDE adapters with Maxtor drives
- *	ensure the system is set up for ATA100/UDMA5, not UDMA6.
- *
- *	If you are using WD drives with SATA bridges you must set the
- *	drive to "Single". "Master" will hang.
- *
- *	If you have strange problems with nVidia chipset systems please
- *	see the SI support documentation and update your system BIOS
- *	if necessary
- *
- *  The Dell DRAC4 has some interesting features including effectively hot
- *  unplugging/replugging the virtual CD interface when the DRAC is reset.
- *  This often causes drivers/ide/siimage to panic but is ok with the rather
- *  smarter code in libata.
- *
- * TODO:
- * - VDMA support
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/io.h>
-
-#define DRV_NAME "siimage"
-
-/**
- *	pdev_is_sata		-	check if device is SATA
- *	@pdev:	PCI device to check
- *
- *	Returns true if this is a SATA controller
- */
-
-static int pdev_is_sata(struct pci_dev *pdev)
-{
-#ifdef CONFIG_BLK_DEV_IDE_SATA
-	switch (pdev->device) {
-	case PCI_DEVICE_ID_SII_3112:
-	case PCI_DEVICE_ID_SII_1210SA:
-		return 1;
-	case PCI_DEVICE_ID_SII_680:
-		return 0;
-	}
-	BUG();
-#endif
-	return 0;
-}
-
-/**
- *	is_sata			-	check if hwif is SATA
- *	@hwif:	interface to check
- *
- *	Returns true if this is a SATA controller
- */
-
-static inline int is_sata(ide_hwif_t *hwif)
-{
-	return pdev_is_sata(to_pci_dev(hwif->dev));
-}
-
-/**
- *	siimage_selreg		-	return register base
- *	@hwif: interface
- *	@r: config offset
- *
- *	Turn a config register offset into the right address in either
- *	PCI space or MMIO space to access the control register in question
- *	Thankfully this is a configuration operation, so isn't performance
- *	critical.
- */
-
-static unsigned long siimage_selreg(ide_hwif_t *hwif, int r)
-{
-	unsigned long base = (unsigned long)hwif->hwif_data;
-
-	base += 0xA0 + r;
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		base += hwif->channel << 6;
-	else
-		base += hwif->channel << 4;
-	return base;
-}
-
-/**
- *	siimage_seldev		-	return register base
- *	@hwif: interface
- *	@r: config offset
- *
- *	Turn a config register offset into the right address in either
- *	PCI space or MMIO space to access the control register in question
- *	including accounting for the unit shift.
- */
-
-static inline unsigned long siimage_seldev(ide_drive_t *drive, int r)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	unsigned long base	= (unsigned long)hwif->hwif_data;
-	u8 unit			= drive->dn & 1;
-
-	base += 0xA0 + r;
-	if (hwif->host_flags & IDE_HFLAG_MMIO)
-		base += hwif->channel << 6;
-	else
-		base += hwif->channel << 4;
-	base |= unit << unit;
-	return base;
-}
-
-static u8 sil_ioread8(struct pci_dev *dev, unsigned long addr)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	u8 tmp = 0;
-
-	if (host->host_priv)
-		tmp = readb((void __iomem *)addr);
-	else
-		pci_read_config_byte(dev, addr, &tmp);
-
-	return tmp;
-}
-
-static u16 sil_ioread16(struct pci_dev *dev, unsigned long addr)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	u16 tmp = 0;
-
-	if (host->host_priv)
-		tmp = readw((void __iomem *)addr);
-	else
-		pci_read_config_word(dev, addr, &tmp);
-
-	return tmp;
-}
-
-static void sil_iowrite8(struct pci_dev *dev, u8 val, unsigned long addr)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-
-	if (host->host_priv)
-		writeb(val, (void __iomem *)addr);
-	else
-		pci_write_config_byte(dev, addr, val);
-}
-
-static void sil_iowrite16(struct pci_dev *dev, u16 val, unsigned long addr)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-
-	if (host->host_priv)
-		writew(val, (void __iomem *)addr);
-	else
-		pci_write_config_word(dev, addr, val);
-}
-
-static void sil_iowrite32(struct pci_dev *dev, u32 val, unsigned long addr)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-
-	if (host->host_priv)
-		writel(val, (void __iomem *)addr);
-	else
-		pci_write_config_dword(dev, addr, val);
-}
-
-/**
- *	sil_udma_filter		-	compute UDMA mask
- *	@drive: IDE device
- *
- *	Compute the available UDMA speeds for the device on the interface.
- *
- *	For the CMD680 this depends on the clocking mode (scsc), for the
- *	SI3112 SATA controller life is a bit simpler.
- */
-
-static u8 sil_pata_udma_filter(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long base	= (unsigned long)hwif->hwif_data;
-	u8 scsc, mask		= 0;
-
-	base += (hwif->host_flags & IDE_HFLAG_MMIO) ? 0x4A : 0x8A;
-
-	scsc = sil_ioread8(dev, base);
-
-	switch (scsc & 0x30) {
-	case 0x10:	/* 133 */
-		mask = ATA_UDMA6;
-		break;
-	case 0x20:	/* 2xPCI */
-		mask = ATA_UDMA6;
-		break;
-	case 0x00:	/* 100 */
-		mask = ATA_UDMA5;
-		break;
-	default: 	/* Disabled ? */
-		BUG();
-	}
-
-	return mask;
-}
-
-static u8 sil_sata_udma_filter(ide_drive_t *drive)
-{
-	char *m = (char *)&drive->id[ATA_ID_PROD];
-
-	return strstr(m, "Maxtor") ? ATA_UDMA5 : ATA_UDMA6;
-}
-
-/**
- *	sil_set_pio_mode	-	set host controller for PIO mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Load the timing settings for this device mode into the
- *	controller.
- */
-
-static void sil_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	static const u16 tf_speed[]   = { 0x328a, 0x2283, 0x1281, 0x10c3, 0x10c1 };
-	static const u16 data_speed[] = { 0x328a, 0x2283, 0x1104, 0x10c3, 0x10c1 };
-
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	ide_drive_t *pair	= ide_get_pair_dev(drive);
-	u32 speedt		= 0;
-	u16 speedp		= 0;
-	unsigned long addr	= siimage_seldev(drive, 0x04);
-	unsigned long tfaddr	= siimage_selreg(hwif,	0x02);
-	unsigned long base	= (unsigned long)hwif->hwif_data;
-	const u8 pio		= drive->pio_mode - XFER_PIO_0;
-	u8 tf_pio		= pio;
-	u8 mmio			= (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
-	u8 addr_mask		= hwif->channel ? (mmio ? 0xF4 : 0x84)
-						: (mmio ? 0xB4 : 0x80);
-	u8 mode			= 0;
-	u8 unit			= drive->dn & 1;
-
-	/* trim *taskfile* PIO to the slowest of the master/slave */
-	if (pair) {
-		u8 pair_pio = pair->pio_mode - XFER_PIO_0;
-
-		if (pair_pio < tf_pio)
-			tf_pio = pair_pio;
-	}
-
-	/* cheat for now and use the docs */
-	speedp = data_speed[pio];
-	speedt = tf_speed[tf_pio];
-
-	sil_iowrite16(dev, speedp, addr);
-	sil_iowrite16(dev, speedt, tfaddr);
-
-	/* now set up IORDY */
-	speedp = sil_ioread16(dev, tfaddr - 2);
-	speedp &= ~0x200;
-
-	mode = sil_ioread8(dev, base + addr_mask);
-	mode &= ~(unit ? 0x30 : 0x03);
-
-	if (ide_pio_need_iordy(drive, pio)) {
-		speedp |= 0x200;
-		mode |= unit ? 0x10 : 0x01;
-	}
-
-	sil_iowrite16(dev, speedp, tfaddr - 2);
-	sil_iowrite8(dev, mode, base + addr_mask);
-}
-
-/**
- *	sil_set_dma_mode	-	set host controller for DMA mode
- *	@hwif: port
- *	@drive: drive
- *
- *	Tune the SiI chipset for the desired DMA mode.
- */
-
-static void sil_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	static const u8 ultra6[] = { 0x0F, 0x0B, 0x07, 0x05, 0x03, 0x02, 0x01 };
-	static const u8 ultra5[] = { 0x0C, 0x07, 0x05, 0x04, 0x02, 0x01 };
-	static const u16 dma[]	 = { 0x2208, 0x10C2, 0x10C1 };
-
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long base	= (unsigned long)hwif->hwif_data;
-	u16 ultra = 0, multi	= 0;
-	u8 mode = 0, unit	= drive->dn & 1;
-	u8 mmio			= (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0;
-	u8 scsc = 0, addr_mask	= hwif->channel ? (mmio ? 0xF4 : 0x84)
-						: (mmio ? 0xB4 : 0x80);
-	unsigned long ma	= siimage_seldev(drive, 0x08);
-	unsigned long ua	= siimage_seldev(drive, 0x0C);
-	const u8 speed		= drive->dma_mode;
-
-	scsc  = sil_ioread8 (dev, base + (mmio ? 0x4A : 0x8A));
-	mode  = sil_ioread8 (dev, base + addr_mask);
-	multi = sil_ioread16(dev, ma);
-	ultra = sil_ioread16(dev, ua);
-
-	mode  &= ~(unit ? 0x30 : 0x03);
-	ultra &= ~0x3F;
-	scsc = ((scsc & 0x30) == 0x00) ? 0 : 1;
-
-	scsc = is_sata(hwif) ? 1 : scsc;
-
-	if (speed >= XFER_UDMA_0) {
-		multi  = dma[2];
-		ultra |= scsc ? ultra6[speed - XFER_UDMA_0] :
-				ultra5[speed - XFER_UDMA_0];
-		mode  |= unit ? 0x30 : 0x03;
-	} else {
-		multi = dma[speed - XFER_MW_DMA_0];
-		mode |= unit ? 0x20 : 0x02;
-	}
-
-	sil_iowrite8 (dev, mode, base + addr_mask);
-	sil_iowrite16(dev, multi, ma);
-	sil_iowrite16(dev, ultra, ua);
-}
-
-static int sil_test_irq(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long addr	= siimage_selreg(hwif, 1);
-	u8 val			= sil_ioread8(dev, addr);
-
-	/* Return 1 if INTRQ asserted */
-	return (val & 8) ? 1 : 0;
-}
-
-/**
- *	siimage_mmio_dma_test_irq	-	check we caused an IRQ
- *	@drive: drive we are testing
- *
- *	Check if we caused an IDE DMA interrupt. We may also have caused
- *	SATA status interrupts, if so we clean them up and continue.
- */
-
-static int siimage_mmio_dma_test_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	void __iomem *sata_error_addr
-		= (void __iomem *)hwif->sata_scr[SATA_ERROR_OFFSET];
-
-	if (sata_error_addr) {
-		unsigned long base	= (unsigned long)hwif->hwif_data;
-		u32 ext_stat		= readl((void __iomem *)(base + 0x10));
-		u8 watchdog		= 0;
-
-		if (ext_stat & ((hwif->channel) ? 0x40 : 0x10)) {
-			u32 sata_error = readl(sata_error_addr);
-
-			writel(sata_error, sata_error_addr);
-			watchdog = (sata_error & 0x00680000) ? 1 : 0;
-			printk(KERN_WARNING "%s: sata_error = 0x%08x, "
-				"watchdog = %d, %s\n",
-				drive->name, sata_error, watchdog, __func__);
-		} else
-			watchdog = (ext_stat & 0x8000) ? 1 : 0;
-
-		ext_stat >>= 16;
-		if (!(ext_stat & 0x0404) && !watchdog)
-			return 0;
-	}
-
-	/* return 1 if INTR asserted */
-	if (readb((void __iomem *)(hwif->dma_base + ATA_DMA_STATUS)) & 4)
-		return 1;
-
-	return 0;
-}
-
-static int siimage_dma_test_irq(ide_drive_t *drive)
-{
-	if (drive->hwif->host_flags & IDE_HFLAG_MMIO)
-		return siimage_mmio_dma_test_irq(drive);
-	else
-		return ide_dma_test_irq(drive);
-}
-
-/**
- *	sil_sata_reset_poll	-	wait for SATA reset
- *	@drive: drive we are resetting
- *
- *	Poll the SATA phy and see whether it has come back from the dead
- *	yet.
- */
-
-static blk_status_t sil_sata_reset_poll(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	void __iomem *sata_status_addr
-		= (void __iomem *)hwif->sata_scr[SATA_STATUS_OFFSET];
-
-	if (sata_status_addr) {
-		/* SATA Status is available only when in MMIO mode */
-		u32 sata_stat = readl(sata_status_addr);
-
-		if ((sata_stat & 0x03) != 0x03) {
-			printk(KERN_WARNING "%s: reset phy dead, status=0x%08x\n",
-					    hwif->name, sata_stat);
-			return BLK_STS_IOERR;
-		}
-	}
-
-	return BLK_STS_OK;
-}
-
-/**
- *	sil_sata_pre_reset	-	reset hook
- *	@drive: IDE device being reset
- *
- *	For the SATA devices we need to handle recalibration/geometry
- *	differently
- */
-
-static void sil_sata_pre_reset(ide_drive_t *drive)
-{
-	if (drive->media == ide_disk) {
-		drive->special_flags &=
-			~(IDE_SFLAG_SET_GEOMETRY | IDE_SFLAG_RECALIBRATE);
-	}
-}
-
-/**
- *	init_chipset_siimage	-	set up an SI device
- *	@dev: PCI device
- *
- *	Perform the initial PCI set up for this device. Attempt to switch
- *	to 133 MHz clocking if the system isn't already set up to do it.
- */
-
-static int init_chipset_siimage(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	void __iomem *ioaddr = host->host_priv;
-	unsigned long base, scsc_addr;
-	u8 rev = dev->revision, tmp;
-
-	pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, rev ? 1 : 255);
-
-	if (ioaddr)
-		pci_set_master(dev);
-
-	base = (unsigned long)ioaddr;
-
-	if (ioaddr && pdev_is_sata(dev)) {
-		u32 tmp32, irq_mask;
-
-		/* make sure IDE0/1 interrupts are not masked */
-		irq_mask = (1 << 22) | (1 << 23);
-		tmp32 = readl(ioaddr + 0x48);
-		if (tmp32 & irq_mask) {
-			tmp32 &= ~irq_mask;
-			writel(tmp32, ioaddr + 0x48);
-			readl(ioaddr + 0x48); /* flush */
-		}
-		writel(0, ioaddr + 0x148);
-		writel(0, ioaddr + 0x1C8);
-	}
-
-	sil_iowrite8(dev, 0, base ? (base + 0xB4) : 0x80);
-	sil_iowrite8(dev, 0, base ? (base + 0xF4) : 0x84);
-
-	scsc_addr = base ? (base + 0x4A) : 0x8A;
-	tmp = sil_ioread8(dev, scsc_addr);
-
-	switch (tmp & 0x30) {
-	case 0x00:
-		/* On 100 MHz clocking, try and switch to 133 MHz */
-		sil_iowrite8(dev, tmp | 0x10, scsc_addr);
-		break;
-	case 0x30:
-		/* Clocking is disabled, attempt to force 133MHz clocking. */
-		sil_iowrite8(dev, tmp & ~0x20, scsc_addr);
-	case 0x10:
-		/* On 133Mhz clocking. */
-		break;
-	case 0x20:
-		/* On PCIx2 clocking. */
-		break;
-	}
-
-	tmp = sil_ioread8(dev, scsc_addr);
-
-	sil_iowrite8 (dev,       0x72, base + 0xA1);
-	sil_iowrite16(dev,     0x328A, base + 0xA2);
-	sil_iowrite32(dev, 0x62DD62DD, base + 0xA4);
-	sil_iowrite32(dev, 0x43924392, base + 0xA8);
-	sil_iowrite32(dev, 0x40094009, base + 0xAC);
-	sil_iowrite8 (dev,       0x72, base ? (base + 0xE1) : 0xB1);
-	sil_iowrite16(dev,     0x328A, base ? (base + 0xE2) : 0xB2);
-	sil_iowrite32(dev, 0x62DD62DD, base ? (base + 0xE4) : 0xB4);
-	sil_iowrite32(dev, 0x43924392, base ? (base + 0xE8) : 0xB8);
-	sil_iowrite32(dev, 0x40094009, base ? (base + 0xEC) : 0xBC);
-
-	if (base && pdev_is_sata(dev)) {
-		writel(0xFFFF0000, ioaddr + 0x108);
-		writel(0xFFFF0000, ioaddr + 0x188);
-		writel(0x00680000, ioaddr + 0x148);
-		writel(0x00680000, ioaddr + 0x1C8);
-	}
-
-	/* report the clocking mode of the controller */
-	if (!pdev_is_sata(dev)) {
-		static const char *clk_str[] =
-			{ "== 100", "== 133", "== 2X PCI", "DISABLED!" };
-
-		tmp >>= 4;
-		printk(KERN_INFO DRV_NAME " %s: BASE CLOCK %s\n",
-			pci_name(dev), clk_str[tmp & 3]);
-	}
-
-	return 0;
-}
-
-/**
- *	init_mmio_iops_siimage	-	set up the iops for MMIO
- *	@hwif: interface to set up
- *
- *	The basic setup here is fairly simple, we can use standard MMIO
- *	operations. However we do have to set the taskfile register offsets
- *	by hand as there isn't a standard defined layout for them this time.
- *
- *	The hardware supports buffered taskfiles and also some rather nice
- *	extended PRD tables. For better SI3112 support use the libata driver
- */
-
-static void init_mmio_iops_siimage(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	struct ide_host *host	= pci_get_drvdata(dev);
-	void *addr		= host->host_priv;
-	u8 ch			= hwif->channel;
-	struct ide_io_ports *io_ports = &hwif->io_ports;
-	unsigned long base;
-
-	/*
-	 *	Fill in the basic hwif bits
-	 */
-	hwif->host_flags |= IDE_HFLAG_MMIO;
-
-	hwif->hwif_data	= addr;
-
-	/*
-	 *	Now set up the hw. We have to do this ourselves as the
-	 *	MMIO layout isn't the same as the standard port based I/O.
-	 */
-	memset(io_ports, 0, sizeof(*io_ports));
-
-	base = (unsigned long)addr;
-	if (ch)
-		base += 0xC0;
-	else
-		base += 0x80;
-
-	/*
-	 *	The buffered task file doesn't have status/control, so we
-	 *	can't currently use it sanely since we want to use LBA48 mode.
-	 */
-	io_ports->data_addr	= base;
-	io_ports->error_addr	= base + 1;
-	io_ports->nsect_addr	= base + 2;
-	io_ports->lbal_addr	= base + 3;
-	io_ports->lbam_addr	= base + 4;
-	io_ports->lbah_addr	= base + 5;
-	io_ports->device_addr	= base + 6;
-	io_ports->status_addr	= base + 7;
-	io_ports->ctl_addr	= base + 10;
-
-	if (pdev_is_sata(dev)) {
-		base = (unsigned long)addr;
-		if (ch)
-			base += 0x80;
-		hwif->sata_scr[SATA_STATUS_OFFSET]	= base + 0x104;
-		hwif->sata_scr[SATA_ERROR_OFFSET]	= base + 0x108;
-		hwif->sata_scr[SATA_CONTROL_OFFSET]	= base + 0x100;
-	}
-
-	hwif->irq = dev->irq;
-
-	hwif->dma_base = (unsigned long)addr + (ch ? 0x08 : 0x00);
-}
-
-static int is_dev_seagate_sata(ide_drive_t *drive)
-{
-	const char *s	= (const char *)&drive->id[ATA_ID_PROD];
-	unsigned len	= strnlen(s, ATA_ID_PROD_LEN);
-
-	if ((len > 4) && (!memcmp(s, "ST", 2)))
-		if ((!memcmp(s + len - 2, "AS", 2)) ||
-		    (!memcmp(s + len - 3, "ASL", 3))) {
-			printk(KERN_INFO "%s: applying pessimistic Seagate "
-					 "errata fix\n", drive->name);
-			return 1;
-		}
-
-	return 0;
-}
-
-/**
- *	sil_quirkproc		-	post probe fixups
- *	@drive: drive
- *
- *	Called after drive probe we use this to decide whether the
- *	Seagate fixup must be applied. This used to be in init_iops but
- *	that can occur before we know what drives are present.
- */
-
-static void sil_quirkproc(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-
-	/* Try and rise the rqsize */
-	if (!is_sata(hwif) || !is_dev_seagate_sata(drive))
-		hwif->rqsize = 128;
-}
-
-/**
- *	init_iops_siimage	-	set up iops
- *	@hwif: interface to set up
- *
- *	Do the basic setup for the SIIMAGE hardware interface
- *	and then do the MMIO setup if we can. This is the first
- *	look in we get for setting up the hwif so that we
- *	can get the iops right before using them.
- */
-
-static void init_iops_siimage(ide_hwif_t *hwif)
-{
-	struct ide_host *host = dev_get_drvdata(hwif->dev);
-
-	hwif->hwif_data = NULL;
-
-	/* Pessimal until we finish probing */
-	hwif->rqsize = 15;
-
-	if (host->host_priv)
-		init_mmio_iops_siimage(hwif);
-}
-
-/**
- *	sil_cable_detect	-	cable detection
- *	@hwif: interface to check
- *
- *	Check for the presence of an ATA66 capable cable on the interface.
- */
-
-static u8 sil_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long addr	= siimage_selreg(hwif, 0);
-	u8 ata66		= sil_ioread8(dev, addr);
-
-	return (ata66 & 0x01) ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
-}
-
-static const struct ide_port_ops sil_pata_port_ops = {
-	.set_pio_mode		= sil_set_pio_mode,
-	.set_dma_mode		= sil_set_dma_mode,
-	.quirkproc		= sil_quirkproc,
-	.test_irq		= sil_test_irq,
-	.udma_filter		= sil_pata_udma_filter,
-	.cable_detect		= sil_cable_detect,
-};
-
-static const struct ide_port_ops sil_sata_port_ops = {
-	.set_pio_mode		= sil_set_pio_mode,
-	.set_dma_mode		= sil_set_dma_mode,
-	.reset_poll		= sil_sata_reset_poll,
-	.pre_reset		= sil_sata_pre_reset,
-	.quirkproc		= sil_quirkproc,
-	.test_irq		= sil_test_irq,
-	.udma_filter		= sil_sata_udma_filter,
-	.cable_detect		= sil_cable_detect,
-};
-
-static const struct ide_dma_ops sil_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= ide_dma_start,
-	.dma_end		= ide_dma_end,
-	.dma_test_irq		= siimage_dma_test_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-#define DECLARE_SII_DEV(p_ops)				\
-	{						\
-		.name		= DRV_NAME,		\
-		.init_chipset	= init_chipset_siimage,	\
-		.init_iops	= init_iops_siimage,	\
-		.port_ops	= p_ops,		\
-		.dma_ops	= &sil_dma_ops,		\
-		.pio_mask	= ATA_PIO4,		\
-		.mwdma_mask	= ATA_MWDMA2,		\
-		.udma_mask	= ATA_UDMA6,		\
-	}
-
-static const struct ide_port_info siimage_chipsets[] = {
-	/* 0: SiI680 */  DECLARE_SII_DEV(&sil_pata_port_ops),
-	/* 1: SiI3112 */ DECLARE_SII_DEV(&sil_sata_port_ops)
-};
-
-/**
- *	siimage_init_one	-	PCI layer discovery entry
- *	@dev: PCI device
- *	@id: ident table entry
- *
- *	Called by the PCI code when it finds an SiI680 or SiI3112 controller.
- *	We then use the IDE PCI generic helper to do most of the work.
- */
-
-static int siimage_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	void __iomem *ioaddr = NULL;
-	resource_size_t bar5 = pci_resource_start(dev, 5);
-	unsigned long barsize = pci_resource_len(dev, 5);
-	int rc;
-	struct ide_port_info d;
-	u8 idx = id->driver_data;
-	u8 BA5_EN;
-
-	d = siimage_chipsets[idx];
-
-	if (idx) {
-		static int first = 1;
-
-		if (first) {
-			printk(KERN_INFO DRV_NAME ": For full SATA support you "
-				"should use the libata sata_sil module.\n");
-			first = 0;
-		}
-
-		d.host_flags |= IDE_HFLAG_NO_ATAPI_DMA;
-	}
-
-	rc = pci_enable_device(dev);
-	if (rc)
-		return rc;
-
-	pci_read_config_byte(dev, 0x8A, &BA5_EN);
-	if ((BA5_EN & 0x01) || bar5) {
-		/*
-		* Drop back to PIO if we can't map the MMIO. Some systems
-		* seem to get terminally confused in the PCI spaces.
-		*/
-		if (!request_mem_region(bar5, barsize, d.name)) {
-			printk(KERN_WARNING DRV_NAME " %s: MMIO ports not "
-				"available\n", pci_name(dev));
-		} else {
-			ioaddr = pci_ioremap_bar(dev, 5);
-			if (ioaddr == NULL)
-				release_mem_region(bar5, barsize);
-		}
-	}
-
-	rc = ide_pci_init_one(dev, &d, ioaddr);
-	if (rc) {
-		if (ioaddr) {
-			iounmap(ioaddr);
-			release_mem_region(bar5, barsize);
-		}
-		pci_disable_device(dev);
-	}
-
-	return rc;
-}
-
-static void siimage_remove(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	void __iomem *ioaddr = host->host_priv;
-
-	ide_pci_remove(dev);
-
-	if (ioaddr) {
-		resource_size_t bar5 = pci_resource_start(dev, 5);
-		unsigned long barsize = pci_resource_len(dev, 5);
-
-		iounmap(ioaddr);
-		release_mem_region(bar5, barsize);
-	}
-
-	pci_disable_device(dev);
-}
-
-static const struct pci_device_id siimage_pci_tbl[] = {
-	{ PCI_VDEVICE(CMD, PCI_DEVICE_ID_SII_680),    0 },
-#ifdef CONFIG_BLK_DEV_IDE_SATA
-	{ PCI_VDEVICE(CMD, PCI_DEVICE_ID_SII_3112),   1 },
-	{ PCI_VDEVICE(CMD, PCI_DEVICE_ID_SII_1210SA), 1 },
-#endif
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, siimage_pci_tbl);
-
-static struct pci_driver siimage_pci_driver = {
-	.name		= "SiI_IDE",
-	.id_table	= siimage_pci_tbl,
-	.probe		= siimage_init_one,
-	.remove		= siimage_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init siimage_ide_init(void)
-{
-	return ide_pci_register_driver(&siimage_pci_driver);
-}
-
-static void __exit siimage_ide_exit(void)
-{
-	pci_unregister_driver(&siimage_pci_driver);
-}
-
-module_init(siimage_ide_init);
-module_exit(siimage_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick, Alan Cox");
-MODULE_DESCRIPTION("PCI driver module for SiI IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c
deleted file mode 100644
index 1a700bef6c56..000000000000
--- a/drivers/ide/sis5513.c
+++ /dev/null
@@ -1,637 +0,0 @@
-/*
- * Copyright (C) 1999-2000	Andre Hedrick <andre@linux-ide.org>
- * Copyright (C) 2002		Lionel Bouton <Lionel.Bouton@inet6.fr>, Maintainer
- * Copyright (C) 2003		Vojtech Pavlik <vojtech@suse.cz>
- * Copyright (C) 2007-2009	Bartlomiej Zolnierkiewicz
- *
- * May be copied or modified under the terms of the GNU General Public License
- *
- *
- * Thanks :
- *
- * SiS Taiwan		: for direct support and hardware.
- * Daniela Engert	: for initial ATA100 advices and numerous others.
- * John Fremlin, Manfred Spraul, Dave Morgan, Peter Kjellerstedt	:
- *			  for checking code correctness, providing patches.
- *
- *
- * Original tests and design on the SiS620 chipset.
- * ATA100 tests and design on the SiS735 chipset.
- * ATA16/33 support from specs
- * ATA133 support for SiS961/962 by L.C. Chang <lcchang@sis.com.tw>
- * ATA133 961/962/963 fixes by Vojtech Pavlik <vojtech@suse.cz>
- *
- * Documentation:
- *	SiS chipset documentation available under NDA to companies only
- *      (not to individuals).
- */
-
-/*
- * The original SiS5513 comes from a SiS5511/55112/5513 chipset. The original
- * SiS5513 was also used in the SiS5596/5513 chipset. Thus if we see a SiS5511
- * or SiS5596, we can assume we see the first MWDMA-16 capable SiS5513 chip.
- *
- * Later SiS chipsets integrated the 5513 functionality into the NorthBridge,
- * starting with SiS5571 and up to SiS745. The PCI ID didn't change, though. We
- * can figure out that we have a more modern and more capable 5513 by looking
- * for the respective NorthBridge IDs.
- *
- * Even later (96x family) SiS chipsets use the MuTIOL link and place the 5513
- * into the SouthBrige. Here we cannot rely on looking up the NorthBridge PCI
- * ID, while the now ATA-133 capable 5513 still has the same PCI ID.
- * Fortunately the 5513 can be 'unmasked' by fiddling with some config space
- * bits, changing its device id to the true one - 5517 for 961 and 5518 for
- * 962/963.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-
-#define DRV_NAME "sis5513"
-
-/* registers layout and init values are chipset family dependent */
-#undef ATA_16
-#define ATA_16		0x01
-#define ATA_33		0x02
-#define ATA_66		0x03
-#define ATA_100a	0x04 /* SiS730/SiS550 is ATA100 with ATA66 layout */
-#define ATA_100		0x05
-#define ATA_133a	0x06 /* SiS961b with 133 support */
-#define ATA_133		0x07 /* SiS962/963 */
-
-static u8 chipset_family;
-
-/*
- * Devices supported
- */
-static const struct {
-	const char *name;
-	u16 host_id;
-	u8 chipset_family;
-	u8 flags;
-} SiSHostChipInfo[] = {
-	{ "SiS968",	PCI_DEVICE_ID_SI_968,	ATA_133  },
-	{ "SiS966",	PCI_DEVICE_ID_SI_966,	ATA_133  },
-	{ "SiS965",	PCI_DEVICE_ID_SI_965,	ATA_133  },
-	{ "SiS745",	PCI_DEVICE_ID_SI_745,	ATA_100  },
-	{ "SiS735",	PCI_DEVICE_ID_SI_735,	ATA_100  },
-	{ "SiS733",	PCI_DEVICE_ID_SI_733,	ATA_100  },
-	{ "SiS635",	PCI_DEVICE_ID_SI_635,	ATA_100  },
-	{ "SiS633",	PCI_DEVICE_ID_SI_633,	ATA_100  },
-
-	{ "SiS730",	PCI_DEVICE_ID_SI_730,	ATA_100a },
-	{ "SiS550",	PCI_DEVICE_ID_SI_550,	ATA_100a },
-
-	{ "SiS640",	PCI_DEVICE_ID_SI_640,	ATA_66   },
-	{ "SiS630",	PCI_DEVICE_ID_SI_630,	ATA_66   },
-	{ "SiS620",	PCI_DEVICE_ID_SI_620,	ATA_66   },
-	{ "SiS540",	PCI_DEVICE_ID_SI_540,	ATA_66   },
-	{ "SiS530",	PCI_DEVICE_ID_SI_530,	ATA_66   },
-
-	{ "SiS5600",	PCI_DEVICE_ID_SI_5600,	ATA_33   },
-	{ "SiS5598",	PCI_DEVICE_ID_SI_5598,	ATA_33   },
-	{ "SiS5597",	PCI_DEVICE_ID_SI_5597,	ATA_33   },
-	{ "SiS5591/2",	PCI_DEVICE_ID_SI_5591,	ATA_33   },
-	{ "SiS5582",	PCI_DEVICE_ID_SI_5582,	ATA_33   },
-	{ "SiS5581",	PCI_DEVICE_ID_SI_5581,	ATA_33   },
-
-	{ "SiS5596",	PCI_DEVICE_ID_SI_5596,	ATA_16   },
-	{ "SiS5571",	PCI_DEVICE_ID_SI_5571,	ATA_16   },
-	{ "SiS5517",	PCI_DEVICE_ID_SI_5517,	ATA_16   },
-	{ "SiS551x",	PCI_DEVICE_ID_SI_5511,	ATA_16   },
-};
-
-/* Cycle time bits and values vary across chip dma capabilities
-   These three arrays hold the register layout and the values to set.
-   Indexed by chipset_family and (dma_mode - XFER_UDMA_0) */
-
-/* {0, ATA_16, ATA_33, ATA_66, ATA_100a, ATA_100, ATA_133} */
-static u8 cycle_time_offset[] = { 0, 0, 5, 4, 4, 0, 0 };
-static u8 cycle_time_range[]  = { 0, 0, 2, 3, 3, 4, 4 };
-static u8 cycle_time_value[][XFER_UDMA_6 - XFER_UDMA_0 + 1] = {
-	{  0,  0, 0, 0, 0, 0, 0 }, /* no UDMA */
-	{  0,  0, 0, 0, 0, 0, 0 }, /* no UDMA */
-	{  3,  2, 1, 0, 0, 0, 0 }, /* ATA_33 */
-	{  7,  5, 3, 2, 1, 0, 0 }, /* ATA_66 */
-	{  7,  5, 3, 2, 1, 0, 0 }, /* ATA_100a (730 specific),
-				      different cycle_time range and offset */
-	{ 11,  7, 5, 4, 2, 1, 0 }, /* ATA_100 */
-	{ 15, 10, 7, 5, 3, 2, 1 }, /* ATA_133a (earliest 691 southbridges) */
-	{ 15, 10, 7, 5, 3, 2, 1 }, /* ATA_133 */
-};
-/* CRC Valid Setup Time vary across IDE clock setting 33/66/100/133
-   See SiS962 data sheet for more detail */
-static u8 cvs_time_value[][XFER_UDMA_6 - XFER_UDMA_0 + 1] = {
-	{ 0, 0, 0, 0, 0, 0, 0 }, /* no UDMA */
-	{ 0, 0, 0, 0, 0, 0, 0 }, /* no UDMA */
-	{ 2, 1, 1, 0, 0, 0, 0 },
-	{ 4, 3, 2, 1, 0, 0, 0 },
-	{ 4, 3, 2, 1, 0, 0, 0 },
-	{ 6, 4, 3, 1, 1, 1, 0 },
-	{ 9, 6, 4, 2, 2, 2, 2 },
-	{ 9, 6, 4, 2, 2, 2, 2 },
-};
-/* Initialize time, Active time, Recovery time vary across
-   IDE clock settings. These 3 arrays hold the register value
-   for PIO0/1/2/3/4 and DMA0/1/2 mode in order */
-static u8 ini_time_value[][8] = {
-	{ 0, 0, 0, 0, 0, 0, 0, 0 },
-	{ 0, 0, 0, 0, 0, 0, 0, 0 },
-	{ 2, 1, 0, 0, 0, 1, 0, 0 },
-	{ 4, 3, 1, 1, 1, 3, 1, 1 },
-	{ 4, 3, 1, 1, 1, 3, 1, 1 },
-	{ 6, 4, 2, 2, 2, 4, 2, 2 },
-	{ 9, 6, 3, 3, 3, 6, 3, 3 },
-	{ 9, 6, 3, 3, 3, 6, 3, 3 },
-};
-static u8 act_time_value[][8] = {
-	{  0,  0,  0,  0, 0,  0,  0, 0 },
-	{  0,  0,  0,  0, 0,  0,  0, 0 },
-	{  9,  9,  9,  2, 2,  7,  2, 2 },
-	{ 19, 19, 19,  5, 4, 14,  5, 4 },
-	{ 19, 19, 19,  5, 4, 14,  5, 4 },
-	{ 28, 28, 28,  7, 6, 21,  7, 6 },
-	{ 38, 38, 38, 10, 9, 28, 10, 9 },
-	{ 38, 38, 38, 10, 9, 28, 10, 9 },
-};
-static u8 rco_time_value[][8] = {
-	{  0,  0, 0,  0, 0,  0,  0, 0 },
-	{  0,  0, 0,  0, 0,  0,  0, 0 },
-	{  9,  2, 0,  2, 0,  7,  1, 1 },
-	{ 19,  5, 1,  5, 2, 16,  3, 2 },
-	{ 19,  5, 1,  5, 2, 16,  3, 2 },
-	{ 30,  9, 3,  9, 4, 25,  6, 4 },
-	{ 40, 12, 4, 12, 5, 34, 12, 5 },
-	{ 40, 12, 4, 12, 5, 34, 12, 5 },
-};
-
-/*
- * Printing configuration
- */
-/* Used for chipset type printing at boot time */
-static char *chipset_capability[] = {
-	"ATA", "ATA 16",
-	"ATA 33", "ATA 66",
-	"ATA 100 (1st gen)", "ATA 100 (2nd gen)",
-	"ATA 133 (1st gen)", "ATA 133 (2nd gen)"
-};
-
-/*
- * Configuration functions
- */
-
-static u8 sis_ata133_get_base(ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u32 reg54 = 0;
-
-	pci_read_config_dword(dev, 0x54, &reg54);
-
-	return ((reg54 & 0x40000000) ? 0x70 : 0x40) + drive->dn * 4;
-}
-
-static void sis_ata16_program_timings(ide_drive_t *drive, const u8 mode)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u16 t1 = 0;
-	u8 drive_pci = 0x40 + drive->dn * 2;
-
-	const u16 pio_timings[]   = { 0x000, 0x607, 0x404, 0x303, 0x301 };
-	const u16 mwdma_timings[] = { 0x008, 0x302, 0x301 };
-
-	pci_read_config_word(dev, drive_pci, &t1);
-
-	/* clear active/recovery timings */
-	t1 &= ~0x070f;
-	if (mode >= XFER_MW_DMA_0) {
-		if (chipset_family > ATA_16)
-			t1 &= ~0x8000;	/* disable UDMA */
-		t1 |= mwdma_timings[mode - XFER_MW_DMA_0];
-	} else
-		t1 |= pio_timings[mode - XFER_PIO_0];
-
-	pci_write_config_word(dev, drive_pci, t1);
-}
-
-static void sis_ata100_program_timings(ide_drive_t *drive, const u8 mode)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u8 t1, drive_pci = 0x40 + drive->dn * 2;
-
-	/* timing bits: 7:4 active 3:0 recovery */
-	const u8 pio_timings[]   = { 0x00, 0x67, 0x44, 0x33, 0x31 };
-	const u8 mwdma_timings[] = { 0x08, 0x32, 0x31 };
-
-	if (mode >= XFER_MW_DMA_0) {
-		u8 t2 = 0;
-
-		pci_read_config_byte(dev, drive_pci, &t2);
-		t2 &= ~0x80;	/* disable UDMA */
-		pci_write_config_byte(dev, drive_pci, t2);
-
-		t1 = mwdma_timings[mode - XFER_MW_DMA_0];
-	} else
-		t1 = pio_timings[mode - XFER_PIO_0];
-
-	pci_write_config_byte(dev, drive_pci + 1, t1);
-}
-
-static void sis_ata133_program_timings(ide_drive_t *drive, const u8 mode)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u32 t1 = 0;
-	u8 drive_pci = sis_ata133_get_base(drive), clk, idx;
-
-	pci_read_config_dword(dev, drive_pci, &t1);
-
-	t1 &= 0xc0c00fff;
-	clk = (t1 & 0x08) ? ATA_133 : ATA_100;
-	if (mode >= XFER_MW_DMA_0) {
-		t1 &= ~0x04;	/* disable UDMA */
-		idx = mode - XFER_MW_DMA_0 + 5;
-	} else
-		idx = mode - XFER_PIO_0;
-	t1 |= ini_time_value[clk][idx] << 12;
-	t1 |= act_time_value[clk][idx] << 16;
-	t1 |= rco_time_value[clk][idx] << 24;
-
-	pci_write_config_dword(dev, drive_pci, t1);
-}
-
-static void sis_program_timings(ide_drive_t *drive, const u8 mode)
-{
-	if (chipset_family < ATA_100)		/* ATA_16/33/66/100a */
-		sis_ata16_program_timings(drive, mode);
-	else if (chipset_family < ATA_133)	/* ATA_100/133a */
-		sis_ata100_program_timings(drive, mode);
-	else					/* ATA_133 */
-		sis_ata133_program_timings(drive, mode);
-}
-
-static void config_drive_art_rwp(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u8 reg4bh		= 0;
-	u8 rw_prefetch		= 0;
-
-	pci_read_config_byte(dev, 0x4b, &reg4bh);
-
-	rw_prefetch = reg4bh & ~(0x11 << drive->dn);
-
-	if (drive->media == ide_disk)
-		rw_prefetch |= 0x11 << drive->dn;
-
-	if (reg4bh != rw_prefetch)
-		pci_write_config_byte(dev, 0x4b, rw_prefetch);
-}
-
-static void sis_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	config_drive_art_rwp(drive);
-	sis_program_timings(drive, drive->pio_mode);
-}
-
-static void sis_ata133_program_udma_timings(ide_drive_t *drive, const u8 mode)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u32 regdw = 0;
-	u8 drive_pci = sis_ata133_get_base(drive), clk, idx;
-
-	pci_read_config_dword(dev, drive_pci, &regdw);
-
-	regdw |= 0x04;
-	regdw &= 0xfffff00f;
-	/* check if ATA133 enable */
-	clk = (regdw & 0x08) ? ATA_133 : ATA_100;
-	idx = mode - XFER_UDMA_0;
-	regdw |= cycle_time_value[clk][idx] << 4;
-	regdw |= cvs_time_value[clk][idx] << 8;
-
-	pci_write_config_dword(dev, drive_pci, regdw);
-}
-
-static void sis_ata33_program_udma_timings(ide_drive_t *drive, const u8 mode)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u8 drive_pci = 0x40 + drive->dn * 2, reg = 0, i = chipset_family;
-
-	pci_read_config_byte(dev, drive_pci + 1, &reg);
-
-	/* force the UDMA bit on if we want to use UDMA */
-	reg |= 0x80;
-	/* clean reg cycle time bits */
-	reg &= ~((0xff >> (8 - cycle_time_range[i])) << cycle_time_offset[i]);
-	/* set reg cycle time bits */
-	reg |= cycle_time_value[i][mode - XFER_UDMA_0] << cycle_time_offset[i];
-
-	pci_write_config_byte(dev, drive_pci + 1, reg);
-}
-
-static void sis_program_udma_timings(ide_drive_t *drive, const u8 mode)
-{
-	if (chipset_family >= ATA_133)	/* ATA_133 */
-		sis_ata133_program_udma_timings(drive, mode);
-	else				/* ATA_33/66/100a/100/133a */
-		sis_ata33_program_udma_timings(drive, mode);
-}
-
-static void sis_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	const u8 speed = drive->dma_mode;
-
-	if (speed >= XFER_UDMA_0)
-		sis_program_udma_timings(drive, speed);
-	else
-		sis_program_timings(drive, speed);
-}
-
-static u8 sis_ata133_udma_filter(ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u32 regdw = 0;
-	u8 drive_pci = sis_ata133_get_base(drive);
-
-	pci_read_config_dword(dev, drive_pci, &regdw);
-
-	/* if ATA133 disable, we should not set speed above UDMA5 */
-	return (regdw & 0x08) ? ATA_UDMA6 : ATA_UDMA5;
-}
-
-static int sis_find_family(struct pci_dev *dev)
-{
-	struct pci_dev *host;
-	int i = 0;
-
-	chipset_family = 0;
-
-	for (i = 0; i < ARRAY_SIZE(SiSHostChipInfo) && !chipset_family; i++) {
-
-		host = pci_get_device(PCI_VENDOR_ID_SI, SiSHostChipInfo[i].host_id, NULL);
-
-		if (!host)
-			continue;
-
-		chipset_family = SiSHostChipInfo[i].chipset_family;
-
-		/* Special case for SiS630 : 630S/ET is ATA_100a */
-		if (SiSHostChipInfo[i].host_id == PCI_DEVICE_ID_SI_630) {
-			if (host->revision >= 0x30)
-				chipset_family = ATA_100a;
-		}
-		pci_dev_put(host);
-
-		printk(KERN_INFO DRV_NAME " %s: %s %s controller\n",
-			pci_name(dev), SiSHostChipInfo[i].name,
-			chipset_capability[chipset_family]);
-	}
-
-	if (!chipset_family) { /* Belongs to pci-quirks */
-
-			u32 idemisc;
-			u16 trueid;
-
-			/* Disable ID masking and register remapping */
-			pci_read_config_dword(dev, 0x54, &idemisc);
-			pci_write_config_dword(dev, 0x54, (idemisc & 0x7fffffff));
-			pci_read_config_word(dev, PCI_DEVICE_ID, &trueid);
-			pci_write_config_dword(dev, 0x54, idemisc);
-
-			if (trueid == 0x5518) {
-				printk(KERN_INFO DRV_NAME " %s: SiS 962/963 MuTIOL IDE UDMA133 controller\n",
-					pci_name(dev));
-				chipset_family = ATA_133;
-
-				/* Check for 5513 compatibility mapping
-				 * We must use this, else the port enabled code will fail,
-				 * as it expects the enablebits at 0x4a.
-				 */
-				if ((idemisc & 0x40000000) == 0) {
-					pci_write_config_dword(dev, 0x54, idemisc | 0x40000000);
-					printk(KERN_INFO DRV_NAME " %s: Switching to 5513 register mapping\n",
-						pci_name(dev));
-				}
-			}
-	}
-
-	if (!chipset_family) { /* Belongs to pci-quirks */
-
-			struct pci_dev *lpc_bridge;
-			u16 trueid;
-			u8 prefctl;
-			u8 idecfg;
-
-			pci_read_config_byte(dev, 0x4a, &idecfg);
-			pci_write_config_byte(dev, 0x4a, idecfg | 0x10);
-			pci_read_config_word(dev, PCI_DEVICE_ID, &trueid);
-			pci_write_config_byte(dev, 0x4a, idecfg);
-
-			if (trueid == 0x5517) { /* SiS 961/961B */
-
-				lpc_bridge = pci_get_slot(dev->bus, 0x10); /* Bus 0, Dev 2, Fn 0 */
-				pci_read_config_byte(dev, 0x49, &prefctl);
-				pci_dev_put(lpc_bridge);
-
-				if (lpc_bridge->revision == 0x10 && (prefctl & 0x80)) {
-					printk(KERN_INFO DRV_NAME " %s: SiS 961B MuTIOL IDE UDMA133 controller\n",
-						pci_name(dev));
-					chipset_family = ATA_133a;
-				} else {
-					printk(KERN_INFO DRV_NAME " %s: SiS 961 MuTIOL IDE UDMA100 controller\n",
-						pci_name(dev));
-					chipset_family = ATA_100;
-				}
-			}
-	}
-
-	return chipset_family;
-}
-
-static int init_chipset_sis5513(struct pci_dev *dev)
-{
-	/* Make general config ops here
-	   1/ tell IDE channels to operate in Compatibility mode only
-	   2/ tell old chips to allow per drive IDE timings */
-
-	u8 reg;
-	u16 regw;
-
-	switch (chipset_family) {
-	case ATA_133:
-		/* SiS962 operation mode */
-		pci_read_config_word(dev, 0x50, &regw);
-		if (regw & 0x08)
-			pci_write_config_word(dev, 0x50, regw&0xfff7);
-		pci_read_config_word(dev, 0x52, &regw);
-		if (regw & 0x08)
-			pci_write_config_word(dev, 0x52, regw&0xfff7);
-		break;
-	case ATA_133a:
-	case ATA_100:
-		/* Fixup latency */
-		pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x80);
-		/* Set compatibility bit */
-		pci_read_config_byte(dev, 0x49, &reg);
-		if (!(reg & 0x01))
-			pci_write_config_byte(dev, 0x49, reg|0x01);
-		break;
-	case ATA_100a:
-	case ATA_66:
-		/* Fixup latency */
-		pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x10);
-
-		/* On ATA_66 chips the bit was elsewhere */
-		pci_read_config_byte(dev, 0x52, &reg);
-		if (!(reg & 0x04))
-			pci_write_config_byte(dev, 0x52, reg|0x04);
-		break;
-	case ATA_33:
-		/* On ATA_33 we didn't have a single bit to set */
-		pci_read_config_byte(dev, 0x09, &reg);
-		if ((reg & 0x0f) != 0x00)
-			pci_write_config_byte(dev, 0x09, reg&0xf0);
-		fallthrough;
-	case ATA_16:
-		/* force per drive recovery and active timings
-		   needed on ATA_33 and below chips */
-		pci_read_config_byte(dev, 0x52, &reg);
-		if (!(reg & 0x08))
-			pci_write_config_byte(dev, 0x52, reg|0x08);
-		break;
-	}
-
-	return 0;
-}
-
-struct sis_laptop {
-	u16 device;
-	u16 subvendor;
-	u16 subdevice;
-};
-
-static const struct sis_laptop sis_laptop[] = {
-	/* devid, subvendor, subdev */
-	{ 0x5513, 0x1043, 0x1107 },	/* ASUS A6K */
-	{ 0x5513, 0x1734, 0x105f },	/* FSC Amilo A1630 */
-	{ 0x5513, 0x1071, 0x8640 },     /* EasyNote K5305 */
-	/* end marker */
-	{ 0, }
-};
-
-static u8 sis_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	const struct sis_laptop *lap = &sis_laptop[0];
-	u8 ata66 = 0;
-
-	while (lap->device) {
-		if (lap->device == pdev->device &&
-		    lap->subvendor == pdev->subsystem_vendor &&
-		    lap->subdevice == pdev->subsystem_device)
-			return ATA_CBL_PATA40_SHORT;
-		lap++;
-	}
-
-	if (chipset_family >= ATA_133) {
-		u16 regw = 0;
-		u16 reg_addr = hwif->channel ? 0x52: 0x50;
-		pci_read_config_word(pdev, reg_addr, &regw);
-		ata66 = (regw & 0x8000) ? 0 : 1;
-	} else if (chipset_family >= ATA_66) {
-		u8 reg48h = 0;
-		u8 mask = hwif->channel ? 0x20 : 0x10;
-		pci_read_config_byte(pdev, 0x48, &reg48h);
-		ata66 = (reg48h & mask) ? 0 : 1;
-	}
-
-	return ata66 ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
-}
-
-static const struct ide_port_ops sis_port_ops = {
-	.set_pio_mode		= sis_set_pio_mode,
-	.set_dma_mode		= sis_set_dma_mode,
-	.cable_detect		= sis_cable_detect,
-};
-
-static const struct ide_port_ops sis_ata133_port_ops = {
-	.set_pio_mode		= sis_set_pio_mode,
-	.set_dma_mode		= sis_set_dma_mode,
-	.udma_filter		= sis_ata133_udma_filter,
-	.cable_detect		= sis_cable_detect,
-};
-
-static const struct ide_port_info sis5513_chipset = {
-	.name		= DRV_NAME,
-	.init_chipset	= init_chipset_sis5513,
-	.enablebits	= { {0x4a, 0x02, 0x02}, {0x4a, 0x04, 0x04} },
-	.host_flags	= IDE_HFLAG_NO_AUTODMA,
-	.pio_mask	= ATA_PIO4,
-	.mwdma_mask	= ATA_MWDMA2,
-};
-
-static int sis5513_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct ide_port_info d = sis5513_chipset;
-	u8 udma_rates[] = { 0x00, 0x00, 0x07, 0x1f, 0x3f, 0x3f, 0x7f, 0x7f };
-	int rc;
-
-	rc = pci_enable_device(dev);
-	if (rc)
-		return rc;
-
-	if (sis_find_family(dev) == 0)
-		return -ENOTSUPP;
-
-	if (chipset_family >= ATA_133)
-		d.port_ops = &sis_ata133_port_ops;
-	else
-		d.port_ops = &sis_port_ops;
-
-	d.udma_mask = udma_rates[chipset_family];
-
-	return ide_pci_init_one(dev, &d, NULL);
-}
-
-static void sis5513_remove(struct pci_dev *dev)
-{
-	ide_pci_remove(dev);
-	pci_disable_device(dev);
-}
-
-static const struct pci_device_id sis5513_pci_tbl[] = {
-	{ PCI_VDEVICE(SI, PCI_DEVICE_ID_SI_5513), 0 },
-	{ PCI_VDEVICE(SI, PCI_DEVICE_ID_SI_5518), 0 },
-	{ PCI_VDEVICE(SI, PCI_DEVICE_ID_SI_1180), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, sis5513_pci_tbl);
-
-static struct pci_driver sis5513_pci_driver = {
-	.name		= "SIS_IDE",
-	.id_table	= sis5513_pci_tbl,
-	.probe		= sis5513_init_one,
-	.remove		= sis5513_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init sis5513_ide_init(void)
-{
-	return ide_pci_register_driver(&sis5513_pci_driver);
-}
-
-static void __exit sis5513_ide_exit(void)
-{
-	pci_unregister_driver(&sis5513_pci_driver);
-}
-
-module_init(sis5513_ide_init);
-module_exit(sis5513_ide_exit);
-
-MODULE_AUTHOR("Lionel Bouton, L C Chang, Andre Hedrick, Vojtech Pavlik");
-MODULE_DESCRIPTION("PCI driver module for SIS IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/sl82c105.c b/drivers/ide/sl82c105.c
deleted file mode 100644
index 5c24c420c438..000000000000
--- a/drivers/ide/sl82c105.c
+++ /dev/null
@@ -1,367 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SL82C105/Winbond 553 IDE driver
- *
- * Maintainer unknown.
- *
- * Drive tuning added from Rebel.com's kernel sources
- *  -- Russell King (15/11/98) linux@arm.linux.org.uk
- * 
- * Merge in Russell's HW workarounds, fix various problems
- * with the timing registers setup.
- *  -- Benjamin Herrenschmidt (01/11/03) benh@kernel.crashing.org
- *
- * Copyright (C) 2006-2007,2009 MontaVista Software, Inc. <source@mvista.com>
- * Copyright (C)      2007 Bartlomiej Zolnierkiewicz
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "sl82c105"
-
-/*
- * SL82C105 PCI config register 0x40 bits.
- */
-#define CTRL_IDE_IRQB   (1 << 30)
-#define CTRL_IDE_IRQA   (1 << 28)
-#define CTRL_LEGIRQ     (1 << 11)
-#define CTRL_P1F16      (1 << 5)
-#define CTRL_P1EN       (1 << 4)
-#define CTRL_P0F16      (1 << 1)
-#define CTRL_P0EN       (1 << 0)
-
-/*
- * Convert a PIO mode and cycle time to the required on/off times
- * for the interface.  This has protection against runaway timings.
- */
-static unsigned int get_pio_timings(ide_drive_t *drive, u8 pio)
-{
-	struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio);
-	unsigned int cmd_on, cmd_off;
-	u8 iordy = 0;
-
-	cmd_on  = (t->active + 29) / 30;
-	cmd_off = (ide_pio_cycle_time(drive, pio) - 30 * cmd_on + 29) / 30;
-
-	if (cmd_on == 0)
-		cmd_on = 1;
-
-	if (cmd_off == 0)
-		cmd_off = 1;
-
-	if (ide_pio_need_iordy(drive, pio))
-		iordy = 0x40;
-
-	return (cmd_on - 1) << 8 | (cmd_off - 1) | iordy;
-}
-
-/*
- * Configure the chipset for PIO mode.
- */
-static void sl82c105_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long timings	= (unsigned long)ide_get_drivedata(drive);
-	int reg			= 0x44 + drive->dn * 4;
-	u16 drv_ctrl;
-	const u8 pio		= drive->pio_mode - XFER_PIO_0;
-
-	drv_ctrl = get_pio_timings(drive, pio);
-
-	/*
-	 * Store the PIO timings so that we can restore them
-	 * in case DMA will be turned off...
-	 */
-	timings &= 0xffff0000;
-	timings |= drv_ctrl;
-	ide_set_drivedata(drive, (void *)timings);
-
-	pci_write_config_word(dev, reg,  drv_ctrl);
-	pci_read_config_word (dev, reg, &drv_ctrl);
-
-	printk(KERN_DEBUG "%s: selected %s (%dns) (%04X)\n", drive->name,
-			  ide_xfer_verbose(pio + XFER_PIO_0),
-			  ide_pio_cycle_time(drive, pio), drv_ctrl);
-}
-
-/*
- * Configure the chipset for DMA mode.
- */
-static void sl82c105_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	static u16 mwdma_timings[] = {0x0707, 0x0201, 0x0200};
-	unsigned long timings = (unsigned long)ide_get_drivedata(drive);
-	u16 drv_ctrl;
-	const u8 speed = drive->dma_mode;
-
-	drv_ctrl = mwdma_timings[speed - XFER_MW_DMA_0];
-
-	/*
-	 * Store the DMA timings so that we can actually program
-	 * them when DMA will be turned on...
-	 */
-	timings &= 0x0000ffff;
-	timings |= (unsigned long)drv_ctrl << 16;
-	ide_set_drivedata(drive, (void *)timings);
-}
-
-static int sl82c105_test_irq(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u32 val, mask		= hwif->channel ? CTRL_IDE_IRQB : CTRL_IDE_IRQA;
-
-	pci_read_config_dword(dev, 0x40, &val);
-
-	return (val & mask) ? 1 : 0;
-}
-
-/*
- * The SL82C105 holds off all IDE interrupts while in DMA mode until
- * all DMA activity is completed.  Sometimes this causes problems (eg,
- * when the drive wants to report an error condition).
- *
- * 0x7e is a "chip testing" register.  Bit 2 resets the DMA controller
- * state machine.  We need to kick this to work around various bugs.
- */
-static inline void sl82c105_reset_host(struct pci_dev *dev)
-{
-	u16 val;
-
-	pci_read_config_word(dev, 0x7e, &val);
-	pci_write_config_word(dev, 0x7e, val | (1 << 2));
-	pci_write_config_word(dev, 0x7e, val & ~(1 << 2));
-}
-
-/*
- * If we get an IRQ timeout, it might be that the DMA state machine
- * got confused.  Fix from Todd Inglett.  Details from Winbond.
- *
- * This function is called when the IDE timer expires, the drive
- * indicates that it is READY, and we were waiting for DMA to complete.
- */
-static void sl82c105_dma_lost_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u32 val, mask		= hwif->channel ? CTRL_IDE_IRQB : CTRL_IDE_IRQA;
-	u8 dma_cmd;
-
-	printk(KERN_WARNING "sl82c105: lost IRQ, resetting host\n");
-
-	/*
-	 * Check the raw interrupt from the drive.
-	 */
-	pci_read_config_dword(dev, 0x40, &val);
-	if (val & mask)
-		printk(KERN_INFO "sl82c105: drive was requesting IRQ, "
-		       "but host lost it\n");
-
-	/*
-	 * Was DMA enabled?  If so, disable it - we're resetting the
-	 * host.  The IDE layer will be handling the drive for us.
-	 */
-	dma_cmd = inb(hwif->dma_base + ATA_DMA_CMD);
-	if (dma_cmd & 1) {
-		outb(dma_cmd & ~1, hwif->dma_base + ATA_DMA_CMD);
-		printk(KERN_INFO "sl82c105: DMA was enabled\n");
-	}
-
-	sl82c105_reset_host(dev);
-}
-
-/*
- * ATAPI devices can cause the SL82C105 DMA state machine to go gaga.
- * Winbond recommend that the DMA state machine is reset prior to
- * setting the bus master DMA enable bit.
- *
- * The generic IDE core will have disabled the BMEN bit before this
- * function is called.
- */
-static void sl82c105_dma_start(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	int reg 		= 0x44 + drive->dn * 4;
-
-	pci_write_config_word(dev, reg,
-			      (unsigned long)ide_get_drivedata(drive) >> 16);
-
-	sl82c105_reset_host(dev);
-	ide_dma_start(drive);
-}
-
-static void sl82c105_dma_clear(ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-
-	sl82c105_reset_host(dev);
-}
-
-static int sl82c105_dma_end(ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(drive->hwif->dev);
-	int reg 		= 0x44 + drive->dn * 4;
-	int ret			= ide_dma_end(drive);
-
-	pci_write_config_word(dev, reg,
-			      (unsigned long)ide_get_drivedata(drive));
-
-	return ret;
-}
-
-/*
- * ATA reset will clear the 16 bits mode in the control
- * register, we need to reprogram it
- */
-static void sl82c105_resetproc(ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(drive->hwif->dev);
-	u32 val;
-
-	pci_read_config_dword(dev, 0x40, &val);
-	val |= (CTRL_P1F16 | CTRL_P0F16);
-	pci_write_config_dword(dev, 0x40, val);
-}
-
-/*
- * Return the revision of the Winbond bridge
- * which this function is part of.
- */
-static u8 sl82c105_bridge_revision(struct pci_dev *dev)
-{
-	struct pci_dev *bridge;
-
-	/*
-	 * The bridge should be part of the same device, but function 0.
-	 */
-	bridge = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
-					dev->bus->number,
-					PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
-	if (!bridge)
-		return -1;
-
-	/*
-	 * Make sure it is a Winbond 553 and is an ISA bridge.
-	 */
-	if (bridge->vendor != PCI_VENDOR_ID_WINBOND ||
-	    bridge->device != PCI_DEVICE_ID_WINBOND_83C553 ||
-	    bridge->class >> 8 != PCI_CLASS_BRIDGE_ISA) {
-	    	pci_dev_put(bridge);
-		return -1;
-	}
-	/*
-	 * We need to find function 0's revision, not function 1
-	 */
-	pci_dev_put(bridge);
-
-	return bridge->revision;
-}
-
-/*
- * Enable the PCI device
- * 
- * --BenH: It's arch fixup code that should enable channels that
- * have not been enabled by firmware. I decided we can still enable
- * channel 0 here at least, but channel 1 has to be enabled by
- * firmware or arch code. We still set both to 16 bits mode.
- */
-static int init_chipset_sl82c105(struct pci_dev *dev)
-{
-	u32 val;
-
-	pci_read_config_dword(dev, 0x40, &val);
-	val |= CTRL_P0EN | CTRL_P0F16 | CTRL_P1F16;
-	pci_write_config_dword(dev, 0x40, val);
-
-	return 0;
-}
-
-static const struct ide_port_ops sl82c105_port_ops = {
-	.set_pio_mode		= sl82c105_set_pio_mode,
-	.set_dma_mode		= sl82c105_set_dma_mode,
-	.resetproc		= sl82c105_resetproc,
-	.test_irq		= sl82c105_test_irq,
-};
-
-static const struct ide_dma_ops sl82c105_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= sl82c105_dma_start,
-	.dma_end		= sl82c105_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= sl82c105_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_clear		= sl82c105_dma_clear,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_port_info sl82c105_chipset = {
-	.name		= DRV_NAME,
-	.init_chipset	= init_chipset_sl82c105,
-	.enablebits	= {{0x40,0x01,0x01}, {0x40,0x10,0x10}},
-	.port_ops	= &sl82c105_port_ops,
-	.dma_ops	= &sl82c105_dma_ops,
-	.host_flags	= IDE_HFLAG_IO_32BIT |
-			  IDE_HFLAG_UNMASK_IRQS |
-			  IDE_HFLAG_SERIALIZE_DMA |
-			  IDE_HFLAG_NO_AUTODMA,
-	.pio_mask	= ATA_PIO5,
-	.mwdma_mask	= ATA_MWDMA2,
-};
-
-static int sl82c105_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct ide_port_info d = sl82c105_chipset;
-	u8 rev = sl82c105_bridge_revision(dev);
-
-	if (rev <= 5) {
-		/*
-		 * Never ever EVER under any circumstances enable
-		 * DMA when the bridge is this old.
-		 */
-		printk(KERN_INFO DRV_NAME ": Winbond W83C553 bridge "
-				 "revision %d, BM-DMA disabled\n", rev);
-		d.dma_ops = NULL;
-		d.mwdma_mask = 0;
-		d.host_flags &= ~IDE_HFLAG_SERIALIZE_DMA;
-	}
-
-	return ide_pci_init_one(dev, &d, NULL);
-}
-
-static const struct pci_device_id sl82c105_pci_tbl[] = {
-	{ PCI_VDEVICE(WINBOND, PCI_DEVICE_ID_WINBOND_82C105), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, sl82c105_pci_tbl);
-
-static struct pci_driver sl82c105_pci_driver = {
-	.name		= "W82C105_IDE",
-	.id_table	= sl82c105_pci_tbl,
-	.probe		= sl82c105_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init sl82c105_ide_init(void)
-{
-	return ide_pci_register_driver(&sl82c105_pci_driver);
-}
-
-static void __exit sl82c105_ide_exit(void)
-{
-	pci_unregister_driver(&sl82c105_pci_driver);
-}
-
-module_init(sl82c105_ide_init);
-module_exit(sl82c105_ide_exit);
-
-MODULE_DESCRIPTION("PCI driver module for W82C105 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/slc90e66.c b/drivers/ide/slc90e66.c
deleted file mode 100644
index f521d5ebf916..000000000000
--- a/drivers/ide/slc90e66.c
+++ /dev/null
@@ -1,182 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 2000-2002 Andre Hedrick <andre@linux-ide.org>
- *  Copyright (C) 2006-2007 MontaVista Software, Inc. <source@mvista.com>
- *
- * This is a look-alike variation of the ICH0 PIIX4 Ultra-66,
- * but this keeps the ISA-Bridge and slots alive.
- *
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "slc90e66"
-
-static DEFINE_SPINLOCK(slc90e66_lock);
-
-static void slc90e66_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	int is_slave		= drive->dn & 1;
-	int master_port		= hwif->channel ? 0x42 : 0x40;
-	int slave_port		= 0x44;
-	unsigned long flags;
-	u16 master_data;
-	u8 slave_data;
-	int control = 0;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-				     /* ISP  RTC */
-	static const u8 timings[][2] = {
-					{ 0, 0 },
-					{ 0, 0 },
-					{ 1, 0 },
-					{ 2, 1 },
-					{ 2, 3 }, };
-
-	spin_lock_irqsave(&slc90e66_lock, flags);
-	pci_read_config_word(dev, master_port, &master_data);
-
-	if (pio > 1)
-		control |= 1;	/* Programmable timing on */
-	if (drive->media == ide_disk)
-		control |= 4;	/* Prefetch, post write */
-	if (ide_pio_need_iordy(drive, pio))
-		control |= 2;	/* IORDY */
-	if (is_slave) {
-		master_data |=  0x4000;
-		master_data &= ~0x0070;
-		if (pio > 1) {
-			/* Set PPE, IE and TIME */
-			master_data |= control << 4;
-		}
-		pci_read_config_byte(dev, slave_port, &slave_data);
-		slave_data &= hwif->channel ? 0x0f : 0xf0;
-		slave_data |= ((timings[pio][0] << 2) | timings[pio][1]) <<
-			       (hwif->channel ? 4 : 0);
-	} else {
-		master_data &= ~0x3307;
-		if (pio > 1) {
-			/* enable PPE, IE and TIME */
-			master_data |= control;
-		}
-		master_data |= (timings[pio][0] << 12) | (timings[pio][1] << 8);
-	}
-	pci_write_config_word(dev, master_port, master_data);
-	if (is_slave)
-		pci_write_config_byte(dev, slave_port, slave_data);
-	spin_unlock_irqrestore(&slc90e66_lock, flags);
-}
-
-static void slc90e66_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	u8 maslave		= hwif->channel ? 0x42 : 0x40;
-	int sitre = 0, a_speed	= 7 << (drive->dn * 4);
-	int u_speed = 0, u_flag = 1 << drive->dn;
-	u16			reg4042, reg44, reg48, reg4a;
-	const u8 speed		= drive->dma_mode;
-
-	pci_read_config_word(dev, maslave, &reg4042);
-	sitre = (reg4042 & 0x4000) ? 1 : 0;
-	pci_read_config_word(dev, 0x44, &reg44);
-	pci_read_config_word(dev, 0x48, &reg48);
-	pci_read_config_word(dev, 0x4a, &reg4a);
-
-	if (speed >= XFER_UDMA_0) {
-		u_speed = (speed - XFER_UDMA_0) << (drive->dn * 4);
-
-		if (!(reg48 & u_flag))
-			pci_write_config_word(dev, 0x48, reg48|u_flag);
-		if ((reg4a & a_speed) != u_speed) {
-			pci_write_config_word(dev, 0x4a, reg4a & ~a_speed);
-			pci_read_config_word(dev, 0x4a, &reg4a);
-			pci_write_config_word(dev, 0x4a, reg4a|u_speed);
-		}
-	} else {
-		const u8 mwdma_to_pio[] = { 0, 3, 4 };
-
-		if (reg48 & u_flag)
-			pci_write_config_word(dev, 0x48, reg48 & ~u_flag);
-		if (reg4a & a_speed)
-			pci_write_config_word(dev, 0x4a, reg4a & ~a_speed);
-
-		if (speed >= XFER_MW_DMA_0)
-			drive->pio_mode =
-				mwdma_to_pio[speed - XFER_MW_DMA_0] + XFER_PIO_0;
-		else
-			drive->pio_mode = XFER_PIO_2; /* for SWDMA2 */
-
-		slc90e66_set_pio_mode(hwif, drive);
-	}
-}
-
-static u8 slc90e66_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u8 reg47 = 0, mask = hwif->channel ? 0x01 : 0x02;
-
-	pci_read_config_byte(dev, 0x47, &reg47);
-
-	/* bit[0(1)]: 0:80, 1:40 */
-	return (reg47 & mask) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
-}
-
-static const struct ide_port_ops slc90e66_port_ops = {
-	.set_pio_mode		= slc90e66_set_pio_mode,
-	.set_dma_mode		= slc90e66_set_dma_mode,
-	.cable_detect		= slc90e66_cable_detect,
-};
-
-static const struct ide_port_info slc90e66_chipset = {
-	.name		= DRV_NAME,
-	.enablebits	= { {0x41, 0x80, 0x80}, {0x43, 0x80, 0x80} },
-	.port_ops	= &slc90e66_port_ops,
-	.pio_mask	= ATA_PIO4,
-	.swdma_mask	= ATA_SWDMA2_ONLY,
-	.mwdma_mask	= ATA_MWDMA12_ONLY,
-	.udma_mask	= ATA_UDMA4,
-};
-
-static int slc90e66_init_one(struct pci_dev *dev,
-			     const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &slc90e66_chipset, NULL);
-}
-
-static const struct pci_device_id slc90e66_pci_tbl[] = {
-	{ PCI_VDEVICE(EFAR, PCI_DEVICE_ID_EFAR_SLC90E66_1), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, slc90e66_pci_tbl);
-
-static struct pci_driver slc90e66_pci_driver = {
-	.name		= "SLC90e66_IDE",
-	.id_table	= slc90e66_pci_tbl,
-	.probe		= slc90e66_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init slc90e66_ide_init(void)
-{
-	return ide_pci_register_driver(&slc90e66_pci_driver);
-}
-
-static void __exit slc90e66_ide_exit(void)
-{
-	pci_unregister_driver(&slc90e66_pci_driver);
-}
-
-module_init(slc90e66_ide_init);
-module_exit(slc90e66_ide_exit);
-
-MODULE_AUTHOR("Andre Hedrick");
-MODULE_DESCRIPTION("PCI driver module for SLC90E66 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c
deleted file mode 100644
index 17e6132b99bf..000000000000
--- a/drivers/ide/tc86c001.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (C) 2002 Toshiba Corporation
- * Copyright (C) 2005-2006 MontaVista Software, Inc. <source@mvista.com>
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2.  This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- */
-
-#include <linux/types.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/module.h>
-
-#define DRV_NAME "tc86c001"
-
-static void tc86c001_set_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	unsigned long scr_port	= hwif->config_data + (drive->dn ? 0x02 : 0x00);
-	u16 mode, scr		= inw(scr_port);
-	const u8 speed		= drive->dma_mode;
-
-	switch (speed) {
-	case XFER_UDMA_4:	mode = 0x00c0; break;
-	case XFER_UDMA_3:	mode = 0x00b0; break;
-	case XFER_UDMA_2:	mode = 0x00a0; break;
-	case XFER_UDMA_1:	mode = 0x0090; break;
-	case XFER_UDMA_0:	mode = 0x0080; break;
-	case XFER_MW_DMA_2:	mode = 0x0070; break;
-	case XFER_MW_DMA_1:	mode = 0x0060; break;
-	case XFER_MW_DMA_0:	mode = 0x0050; break;
-	case XFER_PIO_4:	mode = 0x0400; break;
-	case XFER_PIO_3:	mode = 0x0300; break;
-	case XFER_PIO_2:	mode = 0x0200; break;
-	case XFER_PIO_1:	mode = 0x0100; break;
-	case XFER_PIO_0:
-	default:		mode = 0x0000; break;
-	}
-
-	scr &= (speed < XFER_MW_DMA_0) ? 0xf8ff : 0xff0f;
-	scr |= mode;
-	outw(scr, scr_port);
-}
-
-static void tc86c001_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	drive->dma_mode = drive->pio_mode;
-	tc86c001_set_mode(hwif, drive);
-}
-
-/*
- * HACKITY HACK
- *
- * This is a workaround for the limitation 5 of the TC86C001 IDE controller:
- * if a DMA transfer terminates prematurely, the controller leaves the device's
- * interrupt request (INTRQ) pending and does not generate a PCI interrupt (or
- * set the interrupt bit in the DMA status register), thus no PCI interrupt
- * will occur until a DMA transfer has been successfully completed.
- *
- * We work around this by initiating dummy, zero-length DMA transfer on
- * a DMA timeout expiration. I found no better way to do this with the current
- * IDE core than to temporarily replace a higher level driver's timer expiry
- * handler with our own backing up to that handler in case our recovery fails.
- */
-static int tc86c001_timer_expiry(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	ide_expiry_t *expiry	= ide_get_hwifdata(hwif);
-	u8 dma_stat		= inb(hwif->dma_base + ATA_DMA_STATUS);
-
-	/* Restore a higher level driver's expiry handler first. */
-	hwif->expiry = expiry;
-
-	if ((dma_stat & 5) == 1) {	/* DMA active and no interrupt */
-		unsigned long sc_base	= hwif->config_data;
-		unsigned long twcr_port	= sc_base + (drive->dn ? 0x06 : 0x04);
-		u8 dma_cmd		= inb(hwif->dma_base + ATA_DMA_CMD);
-
-		printk(KERN_WARNING "%s: DMA interrupt possibly stuck, "
-		       "attempting recovery...\n", drive->name);
-
-		/* Stop DMA */
-		outb(dma_cmd & ~0x01, hwif->dma_base + ATA_DMA_CMD);
-
-		/* Setup the dummy DMA transfer */
-		outw(0, sc_base + 0x0a);	/* Sector Count */
-		outw(0, twcr_port);	/* Transfer Word Count 1 or 2 */
-
-		/* Start the dummy DMA transfer */
-
-		/* clear R_OR_WCTR for write */
-		outb(0x00, hwif->dma_base + ATA_DMA_CMD);
-		/* set START_STOPBM */
-		outb(0x01, hwif->dma_base + ATA_DMA_CMD);
-
-		/*
-		 * If an interrupt was pending, it should come thru shortly.
-		 * If not, a higher level driver's expiry handler should
-		 * eventually cause some kind of recovery from the DMA stall.
-		 */
-		return WAIT_MIN_SLEEP;
-	}
-
-	/* Chain to the restored expiry handler if DMA wasn't active. */
-	if (likely(expiry != NULL))
-		return expiry(drive);
-
-	/* If there was no handler, "emulate" that for ide_timer_expiry()... */
-	return -1;
-}
-
-static void tc86c001_dma_start(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif	= drive->hwif;
-	unsigned long sc_base	= hwif->config_data;
-	unsigned long twcr_port	= sc_base + (drive->dn ? 0x06 : 0x04);
-	unsigned long nsectors	= blk_rq_sectors(hwif->rq);
-
-	/*
-	 * We have to manually load the sector count and size into
-	 * the appropriate system control registers for DMA to work
-	 * with LBA48 and ATAPI devices...
-	 */
-	outw(nsectors, sc_base + 0x0a);	/* Sector Count */
-	outw(SECTOR_SIZE / 2, twcr_port); /* Transfer Word Count 1/2 */
-
-	/* Install our timeout expiry hook, saving the current handler... */
-	ide_set_hwifdata(hwif, hwif->expiry);
-	hwif->expiry = &tc86c001_timer_expiry;
-
-	ide_dma_start(drive);
-}
-
-static u8 tc86c001_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	unsigned long sc_base = pci_resource_start(dev, 5);
-	u16 scr1 = inw(sc_base + 0x00);
-
-	/*
-	 * System Control  1 Register bit 13 (PDIAGN):
-	 * 0=80-pin cable, 1=40-pin cable
-	 */
-	return (scr1 & 0x2000) ? ATA_CBL_PATA40 : ATA_CBL_PATA80;
-}
-
-static void init_hwif_tc86c001(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned long sc_base	= pci_resource_start(dev, 5);
-	u16 scr1		= inw(sc_base + 0x00);
-
-	/* System Control 1 Register bit 15 (Soft Reset) set */
-	outw(scr1 |  0x8000, sc_base + 0x00);
-
-	/* System Control 1 Register bit 14 (FIFO Reset) set */
-	outw(scr1 |  0x4000, sc_base + 0x00);
-
-	/* System Control 1 Register: reset clear */
-	outw(scr1 & ~0xc000, sc_base + 0x00);
-
-	/* Store the system control register base for convenience... */
-	hwif->config_data = sc_base;
-
-	if (!hwif->dma_base)
-		return;
-
-	/*
-	 * Sector Count Control Register bits 0 and 1 set:
-	 * software sets Sector Count Register for master and slave device
-	 */
-	outw(0x0003, sc_base + 0x0c);
-
-	/* Sector Count Register limit */
-	hwif->rqsize	 = 0xffff;
-}
-
-static const struct ide_port_ops tc86c001_port_ops = {
-	.set_pio_mode		= tc86c001_set_pio_mode,
-	.set_dma_mode		= tc86c001_set_mode,
-	.cable_detect		= tc86c001_cable_detect,
-};
-
-static const struct ide_dma_ops tc86c001_dma_ops = {
-	.dma_host_set		= ide_dma_host_set,
-	.dma_setup		= ide_dma_setup,
-	.dma_start		= tc86c001_dma_start,
-	.dma_end		= ide_dma_end,
-	.dma_test_irq		= ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= ide_dma_sff_read_status,
-};
-
-static const struct ide_port_info tc86c001_chipset = {
-	.name		= DRV_NAME,
-	.init_hwif	= init_hwif_tc86c001,
-	.port_ops	= &tc86c001_port_ops,
-	.dma_ops	= &tc86c001_dma_ops,
-	.host_flags	= IDE_HFLAG_SINGLE | IDE_HFLAG_OFF_BOARD,
-	.pio_mask	= ATA_PIO4,
-	.mwdma_mask	= ATA_MWDMA2,
-	.udma_mask	= ATA_UDMA4,
-};
-
-static int tc86c001_init_one(struct pci_dev *dev,
-			     const struct pci_device_id *id)
-{
-	int rc;
-
-	rc = pci_enable_device(dev);
-	if (rc)
-		goto out;
-
-	rc = pci_request_region(dev, 5, DRV_NAME);
-	if (rc) {
-		printk(KERN_ERR DRV_NAME ": system control regs already in use");
-		goto out_disable;
-	}
-
-	rc = ide_pci_init_one(dev, &tc86c001_chipset, NULL);
-	if (rc)
-		goto out_release;
-
-	goto out;
-
-out_release:
-	pci_release_region(dev, 5);
-out_disable:
-	pci_disable_device(dev);
-out:
-	return rc;
-}
-
-static void tc86c001_remove(struct pci_dev *dev)
-{
-	ide_pci_remove(dev);
-	pci_release_region(dev, 5);
-	pci_disable_device(dev);
-}
-
-static const struct pci_device_id tc86c001_pci_tbl[] = {
-	{ PCI_VDEVICE(TOSHIBA_2, PCI_DEVICE_ID_TOSHIBA_TC86C001_IDE), 0 },
-	{ 0, }
-};
-MODULE_DEVICE_TABLE(pci, tc86c001_pci_tbl);
-
-static struct pci_driver tc86c001_pci_driver = {
-	.name		= "TC86C001",
-	.id_table	= tc86c001_pci_tbl,
-	.probe		= tc86c001_init_one,
-	.remove		= tc86c001_remove,
-};
-
-static int __init tc86c001_ide_init(void)
-{
-	return ide_pci_register_driver(&tc86c001_pci_driver);
-}
-
-static void __exit tc86c001_ide_exit(void)
-{
-	pci_unregister_driver(&tc86c001_pci_driver);
-}
-
-module_init(tc86c001_ide_init);
-module_exit(tc86c001_ide_exit);
-
-MODULE_AUTHOR("MontaVista Software, Inc. <source@mvista.com>");
-MODULE_DESCRIPTION("PCI driver module for TC86C001 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/triflex.c b/drivers/ide/triflex.c
deleted file mode 100644
index 16ddd0956832..000000000000
--- a/drivers/ide/triflex.c
+++ /dev/null
@@ -1,143 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * IDE Chipset driver for the Compaq TriFlex IDE controller.
- * 
- * Known to work with the Compaq Workstation 5x00 series.
- *
- * Copyright (C) 2002 Hewlett-Packard Development Group, L.P.
- * Author: Torben Mathiasen <torben.mathiasen@hp.com>
- * 
- * Loosely based on the piix & svwks drivers.
- *
- * Documentation:
- *	Not publicly available.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#define DRV_NAME "triflex"
-
-static void triflex_set_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	u32 triflex_timings = 0;
-	u16 timing = 0;
-	u8 channel_offset = hwif->channel ? 0x74 : 0x70, unit = drive->dn & 1;
-
-	pci_read_config_dword(dev, channel_offset, &triflex_timings);
-
-	switch (drive->dma_mode) {
-		case XFER_MW_DMA_2:
-			timing = 0x0103; 
-			break;
-		case XFER_MW_DMA_1:
-			timing = 0x0203;
-			break;
-		case XFER_MW_DMA_0:
-			timing = 0x0808;
-			break;
-		case XFER_SW_DMA_2:
-		case XFER_SW_DMA_1:
-		case XFER_SW_DMA_0:
-			timing = 0x0f0f;
-			break;
-		case XFER_PIO_4:
-			timing = 0x0202;
-			break;
-		case XFER_PIO_3:
-			timing = 0x0204;
-			break;
-		case XFER_PIO_2:
-			timing = 0x0404;
-			break;
-		case XFER_PIO_1:
-			timing = 0x0508;
-			break;
-		case XFER_PIO_0:
-			timing = 0x0808;
-			break;
-	}
-
-	triflex_timings &= ~(0xFFFF << (16 * unit));
-	triflex_timings |= (timing << (16 * unit));
-	
-	pci_write_config_dword(dev, channel_offset, triflex_timings);
-}
-
-static void triflex_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	drive->dma_mode = drive->pio_mode;
-	triflex_set_mode(hwif, drive);
-}
-
-static const struct ide_port_ops triflex_port_ops = {
-	.set_pio_mode		= triflex_set_pio_mode,
-	.set_dma_mode		= triflex_set_mode,
-};
-
-static const struct ide_port_info triflex_device = {
-	.name		= DRV_NAME,
-	.enablebits	= {{0x80, 0x01, 0x01}, {0x80, 0x02, 0x02}},
-	.port_ops	= &triflex_port_ops,
-	.pio_mask	= ATA_PIO4,
-	.swdma_mask	= ATA_SWDMA2,
-	.mwdma_mask	= ATA_MWDMA2,
-};
-
-static int triflex_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &triflex_device, NULL);
-}
-
-static const struct pci_device_id triflex_pci_tbl[] = {
-	{ PCI_VDEVICE(COMPAQ, PCI_DEVICE_ID_COMPAQ_TRIFLEX_IDE), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, triflex_pci_tbl);
-
-#ifdef CONFIG_PM
-static int triflex_ide_pci_suspend(struct pci_dev *dev, pm_message_t state)
-{
-	/*
-	 * We must not disable or powerdown the device.
-	 * APM bios refuses to suspend if IDE is not accessible.
-	 */
-	pci_save_state(dev);
-	return 0;
-}
-#else
-#define triflex_ide_pci_suspend NULL
-#endif
-
-static struct pci_driver triflex_pci_driver = {
-	.name		= "TRIFLEX_IDE",
-	.id_table	= triflex_pci_tbl,
-	.probe		= triflex_init_one,
-	.remove		= ide_pci_remove,
-	.suspend	= triflex_ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init triflex_ide_init(void)
-{
-	return ide_pci_register_driver(&triflex_pci_driver);
-}
-
-static void __exit triflex_ide_exit(void)
-{
-	pci_unregister_driver(&triflex_pci_driver);
-}
-
-module_init(triflex_ide_init);
-module_exit(triflex_ide_exit);
-
-MODULE_AUTHOR("Torben Mathiasen");
-MODULE_DESCRIPTION("PCI driver module for Compaq Triflex IDE");
-MODULE_LICENSE("GPL");
-
-
diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
deleted file mode 100644
index d550b379b0f1..000000000000
--- a/drivers/ide/trm290.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- *  Copyright (c) 1997-1998  Mark Lord
- *  Copyright (c) 2007       MontaVista Software, Inc. <source@mvista.com>
- *
- *  May be copied or modified under the terms of the GNU General Public License
- *
- *  June 22, 2004 - get rid of check_region
- *                   - Jesper Juhl
- *
- */
-
-/*
- * This module provides support for the bus-master IDE DMA function
- * of the Tekram TRM290 chip, used on a variety of PCI IDE add-on boards,
- * including a "Precision Instruments" board.  The TRM290 pre-dates
- * the sff-8038 standard (ide-dma.c) by a few months, and differs
- * significantly enough to warrant separate routines for some functions,
- * while re-using others from ide-dma.c.
- *
- * EXPERIMENTAL!  It works for me (a sample of one).
- *
- * Works reliably for me in DMA mode (READs only),
- * DMA WRITEs are disabled by default (see #define below);
- *
- * DMA is not enabled automatically for this chipset,
- * but can be turned on manually (with "hdparm -d1") at run time.
- *
- * I need volunteers with "spare" drives for further testing
- * and development, and maybe to help figure out the peculiarities.
- * Even knowing the registers (below), some things behave strangely.
- */
-
-#define TRM290_NO_DMA_WRITES	/* DMA writes seem unreliable sometimes */
-
-/*
- * TRM-290 PCI-IDE2 Bus Master Chip
- * ================================
- * The configuration registers are addressed in normal I/O port space
- * and are used as follows:
- *
- * trm290_base depends on jumper settings, and is probed for by ide-dma.c
- *
- * trm290_base+2 when WRITTEN: chiptest register (byte, write-only)
- *	bit7 must always be written as "1"
- *	bits6-2 undefined
- *	bit1 1=legacy_compatible_mode, 0=native_pci_mode
- *	bit0 1=test_mode, 0=normal(default)
- *
- * trm290_base+2 when READ: status register (byte, read-only)
- *	bits7-2 undefined
- *	bit1 channel0 busmaster interrupt status 0=none, 1=asserted
- *	bit0 channel0 interrupt status 0=none, 1=asserted
- *
- * trm290_base+3 Interrupt mask register
- *	bits7-5 undefined
- *	bit4 legacy_header: 1=present, 0=absent
- *	bit3 channel1 busmaster interrupt status 0=none, 1=asserted (read only)
- *	bit2 channel1 interrupt status 0=none, 1=asserted (read only)
- *	bit1 channel1 interrupt mask: 1=masked, 0=unmasked(default)
- *	bit0 channel0 interrupt mask: 1=masked, 0=unmasked(default)
- *
- * trm290_base+1 "CPR" Config Pointer Register (byte)
- *	bit7 1=autoincrement CPR bits 2-0 after each access of CDR
- *	bit6 1=min. 1 wait-state posted write cycle (default), 0=0 wait-state
- *	bit5 0=enabled master burst access (default), 1=disable  (write only)
- *	bit4 PCI DEVSEL# timing select: 1=medium(default), 0=fast
- *	bit3 0=primary IDE channel, 1=secondary IDE channel
- *	bits2-0 register index for accesses through CDR port
- *
- * trm290_base+0 "CDR" Config Data Register (word)
- *	two sets of seven config registers,
- *	selected by CPR bit 3 (channel) and CPR bits 2-0 (index 0 to 6),
- *	each index defined below:
- *
- * Index-0 Base address register for command block (word)
- *	defaults: 0x1f0 for primary, 0x170 for secondary
- *
- * Index-1 general config register (byte)
- *	bit7 1=DMA enable, 0=DMA disable
- *	bit6 1=activate IDE_RESET, 0=no action (default)
- *	bit5 1=enable IORDY, 0=disable IORDY (default)
- *	bit4 0=16-bit data port(default), 1=8-bit (XT) data port
- *	bit3 interrupt polarity: 1=active_low, 0=active_high(default)
- *	bit2 power-saving-mode(?): 1=enable, 0=disable(default) (write only)
- *	bit1 bus_master_mode(?): 1=enable, 0=disable(default)
- *	bit0 enable_io_ports: 1=enable(default), 0=disable
- *
- * Index-2 read-ahead counter preload bits 0-7 (byte, write only)
- *	bits7-0 bits7-0 of readahead count
- *
- * Index-3 read-ahead config register (byte, write only)
- *	bit7 1=enable_readahead, 0=disable_readahead(default)
- *	bit6 1=clear_FIFO, 0=no_action
- *	bit5 undefined
- *	bit4 mode4 timing control: 1=enable, 0=disable(default)
- *	bit3 undefined
- *	bit2 undefined
- *	bits1-0 bits9-8 of read-ahead count
- *
- * Index-4 base address register for control block (word)
- *	defaults: 0x3f6 for primary, 0x376 for secondary
- *
- * Index-5 data port timings (shared by both drives) (byte)
- *	standard PCI "clk" (clock) counts, default value = 0xf5
- *
- *	bits7-6 setup time:  00=1clk, 01=2clk, 10=3clk, 11=4clk
- *	bits5-3 hold time:	000=1clk, 001=2clk, 010=3clk,
- *				011=4clk, 100=5clk, 101=6clk,
- *				110=8clk, 111=12clk
- *	bits2-0 active time:	000=2clk, 001=3clk, 010=4clk,
- *				011=5clk, 100=6clk, 101=8clk,
- *				110=12clk, 111=16clk
- *
- * Index-6 command/control port timings (shared by both drives) (byte)
- *	same layout as Index-5, default value = 0xde
- *
- * Suggested CDR programming for PIO mode0 (600ns):
- *	0x01f0,0x21,0xff,0x80,0x03f6,0xf5,0xde	; primary
- *	0x0170,0x21,0xff,0x80,0x0376,0xf5,0xde	; secondary
- *
- * Suggested CDR programming for PIO mode3 (180ns):
- *	0x01f0,0x21,0xff,0x80,0x03f6,0x09,0xde	; primary
- *	0x0170,0x21,0xff,0x80,0x0376,0x09,0xde	; secondary
- *
- * Suggested CDR programming for PIO mode4 (120ns):
- *	0x01f0,0x21,0xff,0x80,0x03f6,0x00,0xde	; primary
- *	0x0170,0x21,0xff,0x80,0x0376,0x00,0xde	; secondary
- *
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/blkdev.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/ide.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "trm290"
-
-static void trm290_prepare_drive (ide_drive_t *drive, unsigned int use_dma)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u16 reg = 0;
-	unsigned long flags;
-
-	/* select PIO or DMA */
-	reg = use_dma ? (0x21 | 0x82) : (0x21 & ~0x82);
-
-	local_irq_save(flags);
-
-	if (reg != hwif->select_data) {
-		hwif->select_data = reg;
-		/* set PIO/DMA */
-		outb(0x51 | (hwif->channel << 3), hwif->config_data + 1);
-		outw(reg & 0xff, hwif->config_data);
-	}
-
-	/* enable IRQ if not probing */
-	if (drive->dev_flags & IDE_DFLAG_PRESENT) {
-		reg = inw(hwif->config_data + 3);
-		reg &= 0x13;
-		reg &= ~(1 << hwif->channel);
-		outw(reg, hwif->config_data + 3);
-	}
-
-	local_irq_restore(flags);
-}
-
-static void trm290_dev_select(ide_drive_t *drive)
-{
-	trm290_prepare_drive(drive, !!(drive->dev_flags & IDE_DFLAG_USING_DMA));
-
-	outb(drive->select | ATA_DEVICE_OBS, drive->hwif->io_ports.device_addr);
-}
-
-static int trm290_dma_check(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	if (cmd->tf_flags & IDE_TFLAG_WRITE) {
-#ifdef TRM290_NO_DMA_WRITES
-		/* always use PIO for writes */
-		return 1;
-#endif
-	}
-	return 0;
-}
-
-static int trm290_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	unsigned int count, rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 2;
-
-	count = ide_build_dmatable(drive, cmd);
-	if (count == 0)
-		/* try PIO instead of DMA */
-		return 1;
-
-	outl(hwif->dmatable_dma | rw, hwif->dma_base);
-	/* start DMA */
-	outw(count * 2 - 1, hwif->dma_base + 2);
-
-	return 0;
-}
-
-static void trm290_dma_start(ide_drive_t *drive)
-{
-	trm290_prepare_drive(drive, 1);
-}
-
-static int trm290_dma_end(ide_drive_t *drive)
-{
-	u16 status = inw(drive->hwif->dma_base + 2);
-
-	trm290_prepare_drive(drive, 0);
-
-	return status != 0x00ff;
-}
-
-static int trm290_dma_test_irq(ide_drive_t *drive)
-{
-	u16 status = inw(drive->hwif->dma_base + 2);
-
-	return status == 0x00ff;
-}
-
-static void trm290_dma_host_set(ide_drive_t *drive, int on)
-{
-}
-
-static void init_hwif_trm290(ide_hwif_t *hwif)
-{
-	struct pci_dev *dev	= to_pci_dev(hwif->dev);
-	unsigned int  cfg_base	= pci_resource_start(dev, 4);
-	unsigned long flags;
-	u8 reg = 0;
-
-	if ((dev->class & 5) && cfg_base)
-		printk(KERN_INFO DRV_NAME " %s: chip", pci_name(dev));
-	else {
-		cfg_base = 0x3df0;
-		printk(KERN_INFO DRV_NAME " %s: using default", pci_name(dev));
-	}
-	printk(KERN_CONT " config base at 0x%04x\n", cfg_base);
-	hwif->config_data = cfg_base;
-	hwif->dma_base = (cfg_base + 4) ^ (hwif->channel ? 0x80 : 0);
-
-	printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
-	       hwif->name, hwif->dma_base, hwif->dma_base + 3);
-
-	if (ide_allocate_dma_engine(hwif))
-		return;
-
-	local_irq_save(flags);
-	/* put config reg into first byte of hwif->select_data */
-	outb(0x51 | (hwif->channel << 3), hwif->config_data + 1);
-	/* select PIO as default */
-	hwif->select_data = 0x21;
-	outb(hwif->select_data, hwif->config_data);
-	/* get IRQ info */
-	reg = inb(hwif->config_data + 3);
-	/* mask IRQs for both ports */
-	reg = (reg & 0x10) | 0x03;
-	outb(reg, hwif->config_data + 3);
-	local_irq_restore(flags);
-
-	if (reg & 0x10)
-		/* legacy mode */
-		hwif->irq = hwif->channel ? 15 : 14;
-
-#if 1
-	{
-	/*
-	 * My trm290-based card doesn't seem to work with all possible values
-	 * for the control basereg, so this kludge ensures that we use only
-	 * values that are known to work.  Ugh.		-ml
-	 */
-		u16 new, old, compat = hwif->channel ? 0x374 : 0x3f4;
-		static u16 next_offset = 0;
-		u8 old_mask;
-
-		outb(0x54 | (hwif->channel << 3), hwif->config_data + 1);
-		old = inw(hwif->config_data);
-		old &= ~1;
-		old_mask = inb(old + 2);
-		if (old != compat && old_mask == 0xff) {
-			/* leave lower 10 bits untouched */
-			compat += (next_offset += 0x400);
-			hwif->io_ports.ctl_addr = compat + 2;
-			outw(compat | 1, hwif->config_data);
-			new = inw(hwif->config_data);
-			printk(KERN_INFO "%s: control basereg workaround: "
-				"old=0x%04x, new=0x%04x\n",
-				hwif->name, old, new & ~1);
-		}
-	}
-#endif
-}
-
-static const struct ide_tp_ops trm290_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= trm290_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
-
-static const struct ide_dma_ops trm290_dma_ops = {
-	.dma_host_set		= trm290_dma_host_set,
-	.dma_setup 		= trm290_dma_setup,
-	.dma_start 		= trm290_dma_start,
-	.dma_end		= trm290_dma_end,
-	.dma_test_irq		= trm290_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_check		= trm290_dma_check,
-};
-
-static const struct ide_port_info trm290_chipset = {
-	.name		= DRV_NAME,
-	.init_hwif	= init_hwif_trm290,
-	.tp_ops 	= &trm290_tp_ops,
-	.dma_ops	= &trm290_dma_ops,
-	.host_flags	= IDE_HFLAG_TRM290 |
-			  IDE_HFLAG_NO_ATAPI_DMA |
-#if 0 /* play it safe for now */
-			  IDE_HFLAG_TRUST_BIOS_FOR_DMA |
-#endif
-			  IDE_HFLAG_NO_AUTODMA |
-			  IDE_HFLAG_NO_LBA48,
-};
-
-static int trm290_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	return ide_pci_init_one(dev, &trm290_chipset, NULL);
-}
-
-static const struct pci_device_id trm290_pci_tbl[] = {
-	{ PCI_VDEVICE(TEKRAM, PCI_DEVICE_ID_TEKRAM_DC290), 0 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, trm290_pci_tbl);
-
-static struct pci_driver trm290_pci_driver = {
-	.name		= "TRM290_IDE",
-	.id_table	= trm290_pci_tbl,
-	.probe		= trm290_init_one,
-	.remove		= ide_pci_remove,
-};
-
-static int __init trm290_ide_init(void)
-{
-	return ide_pci_register_driver(&trm290_pci_driver);
-}
-
-static void __exit trm290_ide_exit(void)
-{
-	pci_unregister_driver(&trm290_pci_driver);
-}
-
-module_init(trm290_ide_init);
-module_exit(trm290_ide_exit);
-
-MODULE_AUTHOR("Mark Lord");
-MODULE_DESCRIPTION("PCI driver module for Tekram TRM290 IDE");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/tx4938ide.c b/drivers/ide/tx4938ide.c
deleted file mode 100644
index 962eb92501b5..000000000000
--- a/drivers/ide/tx4938ide.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * TX4938 internal IDE driver
- * Based on tx4939ide.c.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * (C) Copyright TOSHIBA CORPORATION 2005-2007
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-
-#include <asm/ide.h>
-#include <asm/txx9/tx4938.h>
-
-static void tx4938ide_tune_ebusc(unsigned int ebus_ch,
-				 unsigned int gbus_clock,
-				 u8 pio)
-{
-	struct ide_timing *t = ide_timing_find_mode(XFER_PIO_0 + pio);
-	u64 cr = __raw_readq(&tx4938_ebuscptr->cr[ebus_ch]);
-	unsigned int sp = (cr >> 4) & 3;
-	unsigned int clock = gbus_clock / (4 - sp);
-	unsigned int cycle = 1000000000 / clock;
-	unsigned int shwt;
-	int wt;
-
-	/* Minimum DIOx- active time */
-	wt = DIV_ROUND_UP(t->act8b, cycle) - 2;
-	/* IORDY setup time: 35ns */
-	wt = max_t(int, wt, DIV_ROUND_UP(35, cycle));
-	/* actual wait-cycle is max(wt & ~1, 1) */
-	if (wt > 2 && (wt & 1))
-		wt++;
-	wt &= ~1;
-	/* Address-valid to DIOR/DIOW setup */
-	shwt = DIV_ROUND_UP(t->setup, cycle);
-
-	/* -DIOx recovery time (SHWT * 4) and cycle time requirement */
-	while ((shwt * 4 + wt + (wt ? 2 : 3)) * cycle < t->cycle)
-		shwt++;
-	if (shwt > 7) {
-		pr_warn("tx4938ide: SHWT violation (%d)\n", shwt);
-		shwt = 7;
-	}
-	pr_debug("tx4938ide: ebus %d, bus cycle %dns, WT %d, SHWT %d\n",
-		 ebus_ch, cycle, wt, shwt);
-
-	__raw_writeq((cr & ~0x3f007ull) | (wt << 12) | shwt,
-		     &tx4938_ebuscptr->cr[ebus_ch]);
-}
-
-static void tx4938ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct tx4938ide_platform_info *pdata = dev_get_platdata(hwif->dev);
-	u8 safe = drive->pio_mode - XFER_PIO_0;
-	ide_drive_t *pair;
-
-	pair = ide_get_pair_dev(drive);
-	if (pair)
-		safe = min_t(u8, safe, pair->pio_mode - XFER_PIO_0);
-	tx4938ide_tune_ebusc(pdata->ebus_ch, pdata->gbus_clock, safe);
-}
-
-#ifdef __BIG_ENDIAN
-
-/* custom iops (independent from SWAP_IO_SPACE) */
-static void tx4938ide_input_data_swap(ide_drive_t *drive, struct ide_cmd *cmd,
-				void *buf, unsigned int len)
-{
-	unsigned long port = drive->hwif->io_ports.data_addr;
-	unsigned short *ptr = buf;
-	unsigned int count = (len + 1) / 2;
-
-	while (count--)
-		*ptr++ = cpu_to_le16(__raw_readw((void __iomem *)port));
-	__ide_flush_dcache_range((unsigned long)buf, roundup(len, 2));
-}
-
-static void tx4938ide_output_data_swap(ide_drive_t *drive, struct ide_cmd *cmd,
-				void *buf, unsigned int len)
-{
-	unsigned long port = drive->hwif->io_ports.data_addr;
-	unsigned short *ptr = buf;
-	unsigned int count = (len + 1) / 2;
-
-	while (count--) {
-		__raw_writew(le16_to_cpu(*ptr), (void __iomem *)port);
-		ptr++;
-	}
-	__ide_flush_dcache_range((unsigned long)buf, roundup(len, 2));
-}
-
-static const struct ide_tp_ops tx4938ide_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= ide_dev_select,
-	.tf_load		= ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= tx4938ide_input_data_swap,
-	.output_data		= tx4938ide_output_data_swap,
-};
-
-#endif	/* __BIG_ENDIAN */
-
-static const struct ide_port_ops tx4938ide_port_ops = {
-	.set_pio_mode		= tx4938ide_set_pio_mode,
-};
-
-static const struct ide_port_info tx4938ide_port_info __initconst = {
-	.port_ops		= &tx4938ide_port_ops,
-#ifdef __BIG_ENDIAN
-	.tp_ops			= &tx4938ide_tp_ops,
-#endif
-	.host_flags		= IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA,
-	.pio_mask		= ATA_PIO5,
-	.chipset		= ide_generic,
-};
-
-static int __init tx4938ide_probe(struct platform_device *pdev)
-{
-	struct ide_hw hw, *hws[] = { &hw };
-	struct ide_host *host;
-	struct resource *res;
-	struct tx4938ide_platform_info *pdata = dev_get_platdata(&pdev->dev);
-	int irq, ret, i;
-	unsigned long mapbase, mapctl;
-	struct ide_port_info d = tx4938ide_port_info;
-
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0)
-		return -ENODEV;
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res)
-		return -ENODEV;
-
-	if (!devm_request_mem_region(&pdev->dev, res->start,
-				     resource_size(res), "tx4938ide"))
-		return -EBUSY;
-	mapbase = (unsigned long)devm_ioremap(&pdev->dev, res->start,
-					      8 << pdata->ioport_shift);
-	mapctl = (unsigned long)devm_ioremap(&pdev->dev,
-					     res->start + 0x10000 +
-					     (6 << pdata->ioport_shift),
-					     1 << pdata->ioport_shift);
-	if (!mapbase || !mapctl)
-		return -EBUSY;
-
-	memset(&hw, 0, sizeof(hw));
-	if (pdata->ioport_shift) {
-		unsigned long port = mapbase;
-		unsigned long ctl = mapctl;
-
-		hw.io_ports_array[0] = port;
-#ifdef __BIG_ENDIAN
-		port++;
-		ctl++;
-#endif
-		for (i = 1; i <= 7; i++)
-			hw.io_ports_array[i] =
-				port + (i << pdata->ioport_shift);
-		hw.io_ports.ctl_addr = ctl;
-	} else
-		ide_std_init_ports(&hw, mapbase, mapctl);
-	hw.irq = irq;
-	hw.dev = &pdev->dev;
-
-	pr_info("TX4938 IDE interface (base %#lx, ctl %#lx, irq %d)\n",
-		mapbase, mapctl, hw.irq);
-	if (pdata->gbus_clock)
-		tx4938ide_tune_ebusc(pdata->ebus_ch, pdata->gbus_clock, 0);
-	else
-		d.port_ops = NULL;
-	ret = ide_host_add(&d, hws, 1, &host);
-	if (!ret)
-		platform_set_drvdata(pdev, host);
-	return ret;
-}
-
-static int __exit tx4938ide_remove(struct platform_device *pdev)
-{
-	struct ide_host *host = platform_get_drvdata(pdev);
-
-	ide_host_remove(host);
-	return 0;
-}
-
-static struct platform_driver tx4938ide_driver = {
-	.driver		= {
-		.name	= "tx4938ide",
-	},
-	.remove = __exit_p(tx4938ide_remove),
-};
-
-module_platform_driver_probe(tx4938ide_driver, tx4938ide_probe);
-
-MODULE_DESCRIPTION("TX4938 internal IDE driver");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:tx4938ide");
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
deleted file mode 100644
index b1bbf807bb3d..000000000000
--- a/drivers/ide/tx4939ide.c
+++ /dev/null
@@ -1,628 +0,0 @@
-/*
- * TX4939 internal IDE driver
- * Based on RBTX49xx patch from CELF patch archive.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * (C) Copyright TOSHIBA CORPORATION 2005-2007
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <linux/scatterlist.h>
-
-#include <asm/ide.h>
-
-#define MODNAME	"tx4939ide"
-
-/* ATA Shadow Registers (8-bit except for Data which is 16-bit) */
-#define TX4939IDE_Data			0x000
-#define TX4939IDE_Error_Feature		0x001
-#define TX4939IDE_Sec			0x002
-#define TX4939IDE_LBA0			0x003
-#define TX4939IDE_LBA1			0x004
-#define TX4939IDE_LBA2			0x005
-#define TX4939IDE_DevHead		0x006
-#define TX4939IDE_Stat_Cmd		0x007
-#define TX4939IDE_AltStat_DevCtl	0x402
-/* H/W DMA Registers  */
-#define TX4939IDE_DMA_Cmd	0x800	/* 8-bit */
-#define TX4939IDE_DMA_Stat	0x802	/* 8-bit */
-#define TX4939IDE_PRD_Ptr	0x804	/* 32-bit */
-/* ATA100 CORE Registers (16-bit) */
-#define TX4939IDE_Sys_Ctl	0xc00
-#define TX4939IDE_Xfer_Cnt_1	0xc08
-#define TX4939IDE_Xfer_Cnt_2	0xc0a
-#define TX4939IDE_Sec_Cnt	0xc10
-#define TX4939IDE_Start_Lo_Addr	0xc18
-#define TX4939IDE_Start_Up_Addr	0xc20
-#define TX4939IDE_Add_Ctl	0xc28
-#define TX4939IDE_Lo_Burst_Cnt	0xc30
-#define TX4939IDE_Up_Burst_Cnt	0xc38
-#define TX4939IDE_PIO_Addr	0xc88
-#define TX4939IDE_H_Rst_Tim	0xc90
-#define TX4939IDE_Int_Ctl	0xc98
-#define TX4939IDE_Pkt_Cmd	0xcb8
-#define TX4939IDE_Bxfer_Cnt_Hi	0xcc0
-#define TX4939IDE_Bxfer_Cnt_Lo	0xcc8
-#define TX4939IDE_Dev_TErr	0xcd0
-#define TX4939IDE_Pkt_Xfer_Ctl	0xcd8
-#define TX4939IDE_Start_TAddr	0xce0
-
-/* bits for Int_Ctl */
-#define TX4939IDE_INT_ADDRERR	0x80
-#define TX4939IDE_INT_REACHMUL	0x40
-#define TX4939IDE_INT_DEVTIMING	0x20
-#define TX4939IDE_INT_UDMATERM	0x10
-#define TX4939IDE_INT_TIMER	0x08
-#define TX4939IDE_INT_BUSERR	0x04
-#define TX4939IDE_INT_XFEREND	0x02
-#define TX4939IDE_INT_HOST	0x01
-
-#define TX4939IDE_IGNORE_INTS	\
-	(TX4939IDE_INT_ADDRERR | TX4939IDE_INT_REACHMUL | \
-	 TX4939IDE_INT_DEVTIMING | TX4939IDE_INT_UDMATERM | \
-	 TX4939IDE_INT_TIMER | TX4939IDE_INT_XFEREND)
-
-#ifdef __BIG_ENDIAN
-#define tx4939ide_swizzlel(a)	((a) ^ 4)
-#define tx4939ide_swizzlew(a)	((a) ^ 6)
-#define tx4939ide_swizzleb(a)	((a) ^ 7)
-#else
-#define tx4939ide_swizzlel(a)	(a)
-#define tx4939ide_swizzlew(a)	(a)
-#define tx4939ide_swizzleb(a)	(a)
-#endif
-
-static u16 tx4939ide_readw(void __iomem *base, u32 reg)
-{
-	return __raw_readw(base + tx4939ide_swizzlew(reg));
-}
-static u8 tx4939ide_readb(void __iomem *base, u32 reg)
-{
-	return __raw_readb(base + tx4939ide_swizzleb(reg));
-}
-static void tx4939ide_writel(u32 val, void __iomem *base, u32 reg)
-{
-	__raw_writel(val, base + tx4939ide_swizzlel(reg));
-}
-static void tx4939ide_writew(u16 val, void __iomem *base, u32 reg)
-{
-	__raw_writew(val, base + tx4939ide_swizzlew(reg));
-}
-static void tx4939ide_writeb(u8 val, void __iomem *base, u32 reg)
-{
-	__raw_writeb(val, base + tx4939ide_swizzleb(reg));
-}
-
-#define TX4939IDE_BASE(hwif)	((void __iomem *)(hwif)->extra_base)
-
-static void tx4939ide_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	int is_slave = drive->dn;
-	u32 mask, val;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-	u8 safe = pio;
-	ide_drive_t *pair;
-
-	pair = ide_get_pair_dev(drive);
-	if (pair)
-		safe = min_t(u8, safe, pair->pio_mode - XFER_PIO_0);
-	/*
-	 * Update Command Transfer Mode for master/slave and Data
-	 * Transfer Mode for this drive.
-	 */
-	mask = is_slave ? 0x07f00000 : 0x000007f0;
-	val = ((safe << 8) | (pio << 4)) << (is_slave ? 16 : 0);
-	hwif->select_data = (hwif->select_data & ~mask) | val;
-	/* tx4939ide_tf_load_fixup() will set the Sys_Ctl register */
-}
-
-static void tx4939ide_set_dma_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	u32 mask, val;
-	const u8 mode = drive->dma_mode;
-
-	/* Update Data Transfer Mode for this drive. */
-	if (mode >= XFER_UDMA_0)
-		val = mode - XFER_UDMA_0 + 8;
-	else
-		val = mode - XFER_MW_DMA_0 + 5;
-	if (drive->dn) {
-		mask = 0x00f00000;
-		val <<= 20;
-	} else {
-		mask = 0x000000f0;
-		val <<= 4;
-	}
-	hwif->select_data = (hwif->select_data & ~mask) | val;
-	/* tx4939ide_tf_load_fixup() will set the Sys_Ctl register */
-}
-
-static u16 tx4939ide_check_error_ints(ide_hwif_t *hwif)
-{
-	void __iomem *base = TX4939IDE_BASE(hwif);
-	u16 ctl = tx4939ide_readw(base, TX4939IDE_Int_Ctl);
-
-	if (ctl & TX4939IDE_INT_BUSERR) {
-		/* reset FIFO */
-		u16 sysctl = tx4939ide_readw(base, TX4939IDE_Sys_Ctl);
-
-		tx4939ide_writew(sysctl | 0x4000, base, TX4939IDE_Sys_Ctl);
-		/* wait 12GBUSCLK (typ. 60ns @ GBUS200MHz, max 270ns) */
-		ndelay(270);
-		tx4939ide_writew(sysctl, base, TX4939IDE_Sys_Ctl);
-	}
-	if (ctl & (TX4939IDE_INT_ADDRERR |
-		   TX4939IDE_INT_DEVTIMING | TX4939IDE_INT_BUSERR))
-		pr_err("%s: Error interrupt %#x (%s%s%s )\n",
-		       hwif->name, ctl,
-		       ctl & TX4939IDE_INT_ADDRERR ? " Address-Error" : "",
-		       ctl & TX4939IDE_INT_DEVTIMING ? " DEV-Timing" : "",
-		       ctl & TX4939IDE_INT_BUSERR ? " Bus-Error" : "");
-	return ctl;
-}
-
-static void tx4939ide_clear_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif;
-	void __iomem *base;
-	u16 ctl;
-
-	/*
-	 * tx4939ide_dma_test_irq() and tx4939ide_dma_end() do all job
-	 * for DMA case.
-	 */
-	if (drive->waiting_for_dma)
-		return;
-	hwif = drive->hwif;
-	base = TX4939IDE_BASE(hwif);
-	ctl = tx4939ide_check_error_ints(hwif);
-	tx4939ide_writew(ctl, base, TX4939IDE_Int_Ctl);
-}
-
-static u8 tx4939ide_cable_detect(ide_hwif_t *hwif)
-{
-	void __iomem *base = TX4939IDE_BASE(hwif);
-
-	return tx4939ide_readw(base, TX4939IDE_Sys_Ctl) & 0x2000 ?
-		ATA_CBL_PATA40 : ATA_CBL_PATA80;
-}
-
-#ifdef __BIG_ENDIAN
-static void tx4939ide_dma_host_set(ide_drive_t *drive, int on)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 unit = drive->dn;
-	void __iomem *base = TX4939IDE_BASE(hwif);
-	u8 dma_stat = tx4939ide_readb(base, TX4939IDE_DMA_Stat);
-
-	if (on)
-		dma_stat |= (1 << (5 + unit));
-	else
-		dma_stat &= ~(1 << (5 + unit));
-
-	tx4939ide_writeb(dma_stat, base, TX4939IDE_DMA_Stat);
-}
-#else
-#define tx4939ide_dma_host_set	ide_dma_host_set
-#endif
-
-static u8 tx4939ide_clear_dma_status(void __iomem *base)
-{
-	u8 dma_stat;
-
-	/* read DMA status for INTR & ERROR flags */
-	dma_stat = tx4939ide_readb(base, TX4939IDE_DMA_Stat);
-	/* clear INTR & ERROR flags */
-	tx4939ide_writeb(dma_stat | ATA_DMA_INTR | ATA_DMA_ERR, base,
-			 TX4939IDE_DMA_Stat);
-	/* recover intmask cleared by writing to bit2 of DMA_Stat */
-	tx4939ide_writew(TX4939IDE_IGNORE_INTS << 8, base, TX4939IDE_Int_Ctl);
-	return dma_stat;
-}
-
-#ifdef __BIG_ENDIAN
-/* custom ide_build_dmatable to handle swapped layout */
-static int tx4939ide_build_dmatable(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u32 *table = (u32 *)hwif->dmatable_cpu;
-	unsigned int count = 0;
-	int i;
-	struct scatterlist *sg;
-
-	for_each_sg(hwif->sg_table, sg, cmd->sg_nents, i) {
-		u32 cur_addr, cur_len, bcount;
-
-		cur_addr = sg_dma_address(sg);
-		cur_len = sg_dma_len(sg);
-
-		/*
-		 * Fill in the DMA table, without crossing any 64kB boundaries.
-		 */
-
-		while (cur_len) {
-			if (count++ >= PRD_ENTRIES)
-				goto use_pio_instead;
-
-			bcount = 0x10000 - (cur_addr & 0xffff);
-			if (bcount > cur_len)
-				bcount = cur_len;
-			/*
-			 * This workaround for zero count seems required.
-			 * (standard ide_build_dmatable does it too)
-			 */
-			if (bcount == 0x10000)
-				bcount = 0x8000;
-			*table++ = bcount & 0xffff;
-			*table++ = cur_addr;
-			cur_addr += bcount;
-			cur_len -= bcount;
-		}
-	}
-
-	if (count) {
-		*(table - 2) |= 0x80000000;
-		return count;
-	}
-
-use_pio_instead:
-	printk(KERN_ERR "%s: %s\n", drive->name,
-		count ? "DMA table too small" : "empty DMA table?");
-
-	return 0; /* revert to PIO for this request */
-}
-#else
-#define tx4939ide_build_dmatable	ide_build_dmatable
-#endif
-
-static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	void __iomem *base = TX4939IDE_BASE(hwif);
-	u8 rw = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 0 : ATA_DMA_WR;
-
-	/* fall back to PIO! */
-	if (tx4939ide_build_dmatable(drive, cmd) == 0)
-		return 1;
-
-	/* PRD table */
-	tx4939ide_writel(hwif->dmatable_dma, base, TX4939IDE_PRD_Ptr);
-
-	/* specify r/w */
-	tx4939ide_writeb(rw, base, TX4939IDE_DMA_Cmd);
-
-	/* clear INTR & ERROR flags */
-	tx4939ide_clear_dma_status(base);
-
-	tx4939ide_writew(SECTOR_SIZE / 2, base, drive->dn ?
-			 TX4939IDE_Xfer_Cnt_2 : TX4939IDE_Xfer_Cnt_1);
-
-	tx4939ide_writew(blk_rq_sectors(cmd->rq), base, TX4939IDE_Sec_Cnt);
-
-	return 0;
-}
-
-static int tx4939ide_dma_end(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	u8 dma_stat, dma_cmd;
-	void __iomem *base = TX4939IDE_BASE(hwif);
-	u16 ctl = tx4939ide_readw(base, TX4939IDE_Int_Ctl);
-
-	/* get DMA command mode */
-	dma_cmd = tx4939ide_readb(base, TX4939IDE_DMA_Cmd);
-	/* stop DMA */
-	tx4939ide_writeb(dma_cmd & ~ATA_DMA_START, base, TX4939IDE_DMA_Cmd);
-
-	/* read and clear the INTR & ERROR bits */
-	dma_stat = tx4939ide_clear_dma_status(base);
-
-#define CHECK_DMA_MASK (ATA_DMA_ACTIVE | ATA_DMA_ERR | ATA_DMA_INTR)
-
-	/* verify good DMA status */
-	if ((dma_stat & CHECK_DMA_MASK) == 0 &&
-	    (ctl & (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST)) ==
-	    (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST))
-		/* INT_IDE lost... bug? */
-		return 0;
-	return ((dma_stat & CHECK_DMA_MASK) !=
-		ATA_DMA_INTR) ? 0x10 | dma_stat : 0;
-}
-
-/* returns 1 if DMA IRQ issued, 0 otherwise */
-static int tx4939ide_dma_test_irq(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	void __iomem *base = TX4939IDE_BASE(hwif);
-	u16 ctl, ide_int;
-	u8 dma_stat, stat;
-	int found = 0;
-
-	ctl = tx4939ide_check_error_ints(hwif);
-	ide_int = ctl & (TX4939IDE_INT_XFEREND | TX4939IDE_INT_HOST);
-	switch (ide_int) {
-	case TX4939IDE_INT_HOST:
-		/* On error, XFEREND might not be asserted. */
-		stat = tx4939ide_readb(base, TX4939IDE_AltStat_DevCtl);
-		if ((stat & (ATA_BUSY | ATA_DRQ | ATA_ERR)) == ATA_ERR)
-			found = 1;
-		else
-			/* Wait for XFEREND (Mask HOST and unmask XFEREND) */
-			ctl &= ~TX4939IDE_INT_XFEREND << 8;
-		ctl |= ide_int << 8;
-		break;
-	case TX4939IDE_INT_HOST | TX4939IDE_INT_XFEREND:
-		dma_stat = tx4939ide_readb(base, TX4939IDE_DMA_Stat);
-		if (!(dma_stat & ATA_DMA_INTR))
-			pr_warn("%s: weird interrupt status. "
-				"DMA_Stat %#02x int_ctl %#04x\n",
-				hwif->name, dma_stat, ctl);
-		found = 1;
-		break;
-	}
-	/*
-	 * Do not clear XFEREND, HOST now.  They will be cleared by
-	 * clearing bit2 of DMA_Stat.
-	 */
-	ctl &= ~ide_int;
-	tx4939ide_writew(ctl, base, TX4939IDE_Int_Ctl);
-	return found;
-}
-
-#ifdef __BIG_ENDIAN
-static u8 tx4939ide_dma_sff_read_status(ide_hwif_t *hwif)
-{
-	void __iomem *base = TX4939IDE_BASE(hwif);
-
-	return tx4939ide_readb(base, TX4939IDE_DMA_Stat);
-}
-#else
-#define tx4939ide_dma_sff_read_status ide_dma_sff_read_status
-#endif
-
-static void tx4939ide_init_hwif(ide_hwif_t *hwif)
-{
-	void __iomem *base = TX4939IDE_BASE(hwif);
-
-	/* Soft Reset */
-	tx4939ide_writew(0x8000, base, TX4939IDE_Sys_Ctl);
-	/* at least 20 GBUSCLK (typ. 100ns @ GBUS200MHz, max 450ns) */
-	ndelay(450);
-	tx4939ide_writew(0x0000, base, TX4939IDE_Sys_Ctl);
-	/* mask some interrupts and clear all interrupts */
-	tx4939ide_writew((TX4939IDE_IGNORE_INTS << 8) | 0xff, base,
-			 TX4939IDE_Int_Ctl);
-
-	tx4939ide_writew(0x0008, base, TX4939IDE_Lo_Burst_Cnt);
-	tx4939ide_writew(0, base, TX4939IDE_Up_Burst_Cnt);
-}
-
-static int tx4939ide_init_dma(ide_hwif_t *hwif, const struct ide_port_info *d)
-{
-	hwif->dma_base =
-		hwif->extra_base + tx4939ide_swizzleb(TX4939IDE_DMA_Cmd);
-	/*
-	 * Note that we cannot use ATA_DMA_TABLE_OFS, ATA_DMA_STATUS
-	 * for big endian.
-	 */
-	return ide_allocate_dma_engine(hwif);
-}
-
-static void tx4939ide_tf_load_fixup(ide_drive_t *drive)
-{
-	ide_hwif_t *hwif = drive->hwif;
-	void __iomem *base = TX4939IDE_BASE(hwif);
-	u16 sysctl = hwif->select_data >> (drive->dn ? 16 : 0);
-
-	/*
-	 * Fix ATA100 CORE System Control Register. (The write to the
-	 * Device/Head register may write wrong data to the System
-	 * Control Register)
-	 * While Sys_Ctl is written here, dev_select() is not needed.
-	 */
-	tx4939ide_writew(sysctl, base, TX4939IDE_Sys_Ctl);
-}
-
-static void tx4939ide_tf_load(ide_drive_t *drive, struct ide_taskfile *tf,
-			      u8 valid)
-{
-	ide_tf_load(drive, tf, valid);
-
-	if (valid & IDE_VALID_DEVICE)
-		tx4939ide_tf_load_fixup(drive);
-}
-
-#ifdef __BIG_ENDIAN
-
-/* custom iops (independent from SWAP_IO_SPACE) */
-static void tx4939ide_input_data_swap(ide_drive_t *drive, struct ide_cmd *cmd,
-				void *buf, unsigned int len)
-{
-	unsigned long port = drive->hwif->io_ports.data_addr;
-	unsigned short *ptr = buf;
-	unsigned int count = (len + 1) / 2;
-
-	while (count--)
-		*ptr++ = cpu_to_le16(__raw_readw((void __iomem *)port));
-	__ide_flush_dcache_range((unsigned long)buf, roundup(len, 2));
-}
-
-static void tx4939ide_output_data_swap(ide_drive_t *drive, struct ide_cmd *cmd,
-				void *buf, unsigned int len)
-{
-	unsigned long port = drive->hwif->io_ports.data_addr;
-	unsigned short *ptr = buf;
-	unsigned int count = (len + 1) / 2;
-
-	while (count--) {
-		__raw_writew(le16_to_cpu(*ptr), (void __iomem *)port);
-		ptr++;
-	}
-	__ide_flush_dcache_range((unsigned long)buf, roundup(len, 2));
-}
-
-static const struct ide_tp_ops tx4939ide_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= ide_dev_select,
-	.tf_load		= tx4939ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= tx4939ide_input_data_swap,
-	.output_data		= tx4939ide_output_data_swap,
-};
-
-#else	/* __LITTLE_ENDIAN */
-
-static const struct ide_tp_ops tx4939ide_tp_ops = {
-	.exec_command		= ide_exec_command,
-	.read_status		= ide_read_status,
-	.read_altstatus		= ide_read_altstatus,
-	.write_devctl		= ide_write_devctl,
-
-	.dev_select		= ide_dev_select,
-	.tf_load		= tx4939ide_tf_load,
-	.tf_read		= ide_tf_read,
-
-	.input_data		= ide_input_data,
-	.output_data		= ide_output_data,
-};
-
-#endif	/* __LITTLE_ENDIAN */
-
-static const struct ide_port_ops tx4939ide_port_ops = {
-	.set_pio_mode		= tx4939ide_set_pio_mode,
-	.set_dma_mode		= tx4939ide_set_dma_mode,
-	.clear_irq		= tx4939ide_clear_irq,
-	.cable_detect		= tx4939ide_cable_detect,
-};
-
-static const struct ide_dma_ops tx4939ide_dma_ops = {
-	.dma_host_set		= tx4939ide_dma_host_set,
-	.dma_setup		= tx4939ide_dma_setup,
-	.dma_start		= ide_dma_start,
-	.dma_end		= tx4939ide_dma_end,
-	.dma_test_irq		= tx4939ide_dma_test_irq,
-	.dma_lost_irq		= ide_dma_lost_irq,
-	.dma_timer_expiry	= ide_dma_sff_timer_expiry,
-	.dma_sff_read_status	= tx4939ide_dma_sff_read_status,
-};
-
-static const struct ide_port_info tx4939ide_port_info __initconst = {
-	.init_hwif		= tx4939ide_init_hwif,
-	.init_dma		= tx4939ide_init_dma,
-	.port_ops		= &tx4939ide_port_ops,
-	.dma_ops		= &tx4939ide_dma_ops,
-	.tp_ops			= &tx4939ide_tp_ops,
-	.host_flags		= IDE_HFLAG_MMIO,
-	.pio_mask		= ATA_PIO4,
-	.mwdma_mask		= ATA_MWDMA2,
-	.udma_mask		= ATA_UDMA5,
-	.chipset		= ide_generic,
-};
-
-static int __init tx4939ide_probe(struct platform_device *pdev)
-{
-	struct ide_hw hw, *hws[] = { &hw };
-	struct ide_host *host;
-	struct resource *res;
-	int irq, ret;
-	unsigned long mapbase;
-
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0)
-		return -ENODEV;
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res)
-		return -ENODEV;
-
-	if (!devm_request_mem_region(&pdev->dev, res->start,
-				     resource_size(res), MODNAME))
-		return -EBUSY;
-	mapbase = (unsigned long)devm_ioremap(&pdev->dev, res->start,
-					      resource_size(res));
-	if (!mapbase)
-		return -EBUSY;
-	memset(&hw, 0, sizeof(hw));
-	hw.io_ports.data_addr =
-		mapbase + tx4939ide_swizzlew(TX4939IDE_Data);
-	hw.io_ports.error_addr =
-		mapbase + tx4939ide_swizzleb(TX4939IDE_Error_Feature);
-	hw.io_ports.nsect_addr =
-		mapbase + tx4939ide_swizzleb(TX4939IDE_Sec);
-	hw.io_ports.lbal_addr =
-		mapbase + tx4939ide_swizzleb(TX4939IDE_LBA0);
-	hw.io_ports.lbam_addr =
-		mapbase + tx4939ide_swizzleb(TX4939IDE_LBA1);
-	hw.io_ports.lbah_addr =
-		mapbase + tx4939ide_swizzleb(TX4939IDE_LBA2);
-	hw.io_ports.device_addr =
-		mapbase + tx4939ide_swizzleb(TX4939IDE_DevHead);
-	hw.io_ports.command_addr =
-		mapbase + tx4939ide_swizzleb(TX4939IDE_Stat_Cmd);
-	hw.io_ports.ctl_addr =
-		mapbase + tx4939ide_swizzleb(TX4939IDE_AltStat_DevCtl);
-	hw.irq = irq;
-	hw.dev = &pdev->dev;
-
-	pr_info("TX4939 IDE interface (base %#lx, irq %d)\n", mapbase, irq);
-	host = ide_host_alloc(&tx4939ide_port_info, hws, 1);
-	if (!host)
-		return -ENOMEM;
-	/* use extra_base for base address of the all registers */
-	host->ports[0]->extra_base = mapbase;
-	ret = ide_host_register(host, &tx4939ide_port_info, hws);
-	if (ret) {
-		ide_host_free(host);
-		return ret;
-	}
-	platform_set_drvdata(pdev, host);
-	return 0;
-}
-
-static int __exit tx4939ide_remove(struct platform_device *pdev)
-{
-	struct ide_host *host = platform_get_drvdata(pdev);
-
-	ide_host_remove(host);
-	return 0;
-}
-
-#ifdef CONFIG_PM
-static int tx4939ide_resume(struct platform_device *dev)
-{
-	struct ide_host *host = platform_get_drvdata(dev);
-	ide_hwif_t *hwif = host->ports[0];
-
-	tx4939ide_init_hwif(hwif);
-	return 0;
-}
-#else
-#define tx4939ide_resume	NULL
-#endif
-
-static struct platform_driver tx4939ide_driver = {
-	.driver = {
-		.name = MODNAME,
-	},
-	.remove = __exit_p(tx4939ide_remove),
-	.resume = tx4939ide_resume,
-};
-
-module_platform_driver_probe(tx4939ide_driver, tx4939ide_probe);
-
-MODULE_DESCRIPTION("TX4939 internal IDE driver");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:tx4939ide");
diff --git a/drivers/ide/umc8672.c b/drivers/ide/umc8672.c
deleted file mode 100644
index cf996f788292..000000000000
--- a/drivers/ide/umc8672.c
+++ /dev/null
@@ -1,184 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Copyright (C) 1995-1996  Linus Torvalds & author (see below)
- */
-
-/*
- *  Principal Author/Maintainer:  PODIEN@hml2.atlas.de (Wolfram Podien)
- *
- *  This file provides support for the advanced features
- *  of the UMC 8672 IDE interface.
- *
- *  Version 0.01	Initial version, hacked out of ide.c,
- *			and #include'd rather than compiled separately.
- *			This will get cleaned up in a subsequent release.
- *
- *  Version 0.02	now configs/compiles separate from ide.c  -ml
- *  Version 0.03	enhanced auto-tune, fix display bug
- *  Version 0.05	replace sti() with restore_flags()  -ml
- *			add detection of possible race condition  -ml
- */
-
-/*
- * VLB Controller Support from
- * Wolfram Podien
- * Rohoefe 3
- * D28832 Achim
- * Germany
- *
- * To enable UMC8672 support there must a lilo line like
- * append="ide0=umc8672"...
- * To set the speed according to the abilities of the hardware there must be a
- * line like
- * #define UMC_DRIVE0 11
- * in the beginning of the driver, which sets the speed of drive 0 to 11 (there
- * are some lines present). 0 - 11 are allowed speed values. These values are
- * the results from the DOS speed test program supplied from UMC. 11 is the
- * highest speed (about PIO mode 3)
- */
-#define REALLY_SLOW_IO		/* some systems can safely undef this */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/ioport.h>
-#include <linux/blkdev.h>
-#include <linux/ide.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-
-#define DRV_NAME "umc8672"
-
-/*
- * Default speeds.  These can be changed with "auto-tune" and/or hdparm.
- */
-#define UMC_DRIVE0      1              /* DOS measured drive speeds */
-#define UMC_DRIVE1      1              /* 0 to 11 allowed */
-#define UMC_DRIVE2      1              /* 11 = Fastest Speed */
-#define UMC_DRIVE3      1              /* In case of crash reduce speed */
-
-static u8 current_speeds[4] = {UMC_DRIVE0, UMC_DRIVE1, UMC_DRIVE2, UMC_DRIVE3};
-static const u8 pio_to_umc [5] = {0, 3, 7, 10, 11};	/* rough guesses */
-
-/*       0    1    2    3    4    5    6    7    8    9    10   11      */
-static const u8 speedtab [3][12] = {
-	{0x0f, 0x0b, 0x02, 0x02, 0x02, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x1},
-	{0x03, 0x02, 0x02, 0x02, 0x02, 0x02, 0x01, 0x01, 0x01, 0x01, 0x01, 0x1},
-	{0xff, 0xcb, 0xc0, 0x58, 0x36, 0x33, 0x23, 0x22, 0x21, 0x11, 0x10, 0x0}
-};
-
-static void out_umc(char port, char wert)
-{
-	outb_p(port, 0x108);
-	outb_p(wert, 0x109);
-}
-
-static inline u8 in_umc(char port)
-{
-	outb_p(port, 0x108);
-	return inb_p(0x109);
-}
-
-static void umc_set_speeds(u8 speeds[])
-{
-	int i, tmp;
-
-	outb_p(0x5A, 0x108); /* enable umc */
-
-	out_umc(0xd7, (speedtab[0][speeds[2]] | (speedtab[0][speeds[3]]<<4)));
-	out_umc(0xd6, (speedtab[0][speeds[0]] | (speedtab[0][speeds[1]]<<4)));
-	tmp = 0;
-	for (i = 3; i >= 0; i--)
-		tmp = (tmp << 2) | speedtab[1][speeds[i]];
-	out_umc(0xdc, tmp);
-	for (i = 0; i < 4; i++) {
-		out_umc(0xd0 + i, speedtab[2][speeds[i]]);
-		out_umc(0xd8 + i, speedtab[2][speeds[i]]);
-	}
-	outb_p(0xa5, 0x108); /* disable umc */
-
-	printk("umc8672: drive speeds [0 to 11]: %d %d %d %d\n",
-		speeds[0], speeds[1], speeds[2], speeds[3]);
-}
-
-static void umc_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	ide_hwif_t *mate = hwif->mate;
-	unsigned long flags;
-	const u8 pio = drive->pio_mode - XFER_PIO_0;
-
-	printk("%s: setting umc8672 to PIO mode%d (speed %d)\n",
-		drive->name, pio, pio_to_umc[pio]);
-	if (mate)
-		spin_lock_irqsave(&mate->lock, flags);
-	if (mate && mate->handler) {
-		printk(KERN_ERR "umc8672: other interface is busy: exiting tune_umc()\n");
-	} else {
-		current_speeds[drive->name[2] - 'a'] = pio_to_umc[pio];
-		umc_set_speeds(current_speeds);
-	}
-	if (mate)
-		spin_unlock_irqrestore(&mate->lock, flags);
-}
-
-static const struct ide_port_ops umc8672_port_ops = {
-	.set_pio_mode		= umc_set_pio_mode,
-};
-
-static const struct ide_port_info umc8672_port_info __initconst = {
-	.name			= DRV_NAME,
-	.chipset		= ide_umc8672,
-	.port_ops		= &umc8672_port_ops,
-	.host_flags		= IDE_HFLAG_NO_DMA,
-	.pio_mask		= ATA_PIO4,
-};
-
-static int __init umc8672_probe(void)
-{
-	unsigned long flags;
-
-	if (!request_region(0x108, 2, "umc8672")) {
-		printk(KERN_ERR "umc8672: ports 0x108-0x109 already in use.\n");
-		return 1;
-	}
-	local_irq_save(flags);
-	outb_p(0x5A, 0x108); /* enable umc */
-	if (in_umc (0xd5) != 0xa0) {
-		local_irq_restore(flags);
-		printk(KERN_ERR "umc8672: not found\n");
-		release_region(0x108, 2);
-		return 1;
-	}
-	outb_p(0xa5, 0x108); /* disable umc */
-
-	umc_set_speeds(current_speeds);
-	local_irq_restore(flags);
-
-	return ide_legacy_device_add(&umc8672_port_info, 0);
-}
-
-static bool probe_umc8672;
-
-module_param_named(probe, probe_umc8672, bool, 0);
-MODULE_PARM_DESC(probe, "probe for UMC8672 chipset");
-
-static int __init umc8672_init(void)
-{
-	if (probe_umc8672 == 0)
-		goto out;
-
-	if (umc8672_probe() == 0)
-		return 0;
-out:
-	return -ENODEV;
-}
-
-module_init(umc8672_init);
-
-MODULE_AUTHOR("Wolfram Podien");
-MODULE_DESCRIPTION("Support for UMC 8672 IDE chipset");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c
deleted file mode 100644
index 63a3aca506fc..000000000000
--- a/drivers/ide/via82cxxx.c
+++ /dev/null
@@ -1,532 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VIA IDE driver for Linux. Supported southbridges:
- *
- *   vt82c576, vt82c586, vt82c586a, vt82c586b, vt82c596a, vt82c596b,
- *   vt82c686, vt82c686a, vt82c686b, vt8231, vt8233, vt8233c, vt8233a,
- *   vt8235, vt8237, vt8237a
- *
- * Copyright (c) 2000-2002 Vojtech Pavlik
- * Copyright (c) 2007-2010 Bartlomiej Zolnierkiewicz
- *
- * Based on the work of:
- *	Michel Aubry
- *	Jeff Garzik
- *	Andre Hedrick
- *
- * Documentation:
- *	Obsolete device documentation publicly available from via.com.tw
- *	Current device documentation available under NDA only
- */
-
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/ide.h>
-#include <linux/dmi.h>
-
-#ifdef CONFIG_PPC_CHRP
-#include <asm/processor.h>
-#endif
-
-#define DRV_NAME "via82cxxx"
-
-#define VIA_IDE_ENABLE		0x40
-#define VIA_IDE_CONFIG		0x41
-#define VIA_FIFO_CONFIG		0x43
-#define VIA_MISC_1		0x44
-#define VIA_MISC_2		0x45
-#define VIA_MISC_3		0x46
-#define VIA_DRIVE_TIMING	0x48
-#define VIA_8BIT_TIMING		0x4e
-#define VIA_ADDRESS_SETUP	0x4c
-#define VIA_UDMA_TIMING		0x50
-
-#define VIA_BAD_PREQ		0x01 /* Crashes if PREQ# till DDACK# set */
-#define VIA_BAD_CLK66		0x02 /* 66 MHz clock doesn't work correctly */
-#define VIA_SET_FIFO		0x04 /* Needs to have FIFO split set */
-#define VIA_NO_UNMASK		0x08 /* Doesn't work with IRQ unmasking on */
-#define VIA_BAD_ID		0x10 /* Has wrong vendor ID (0x1107) */
-#define VIA_BAD_AST		0x20 /* Don't touch Address Setup Timing */
-#define VIA_SATA_PATA		0x80 /* SATA/PATA combined configuration */
-
-enum {
-	VIA_IDFLAG_SINGLE = (1 << 1), /* single channel controller */
-};
-
-/*
- * VIA SouthBridge chips.
- */
-
-static struct via_isa_bridge {
-	char *name;
-	u16 id;
-	u8 rev_min;
-	u8 rev_max;
-	u8 udma_mask;
-	u8 flags;
-} via_isa_bridges[] = {
-	{ "vx855",	PCI_DEVICE_ID_VIA_VX855,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST | VIA_SATA_PATA },
-	{ "vx800",	PCI_DEVICE_ID_VIA_VX800,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST | VIA_SATA_PATA },
-	{ "cx700",	PCI_DEVICE_ID_VIA_CX700,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST | VIA_SATA_PATA },
-	{ "vt8261",	PCI_DEVICE_ID_VIA_8261,     0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt8237s",	PCI_DEVICE_ID_VIA_8237S,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt6410",	PCI_DEVICE_ID_VIA_6410,     0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt6415",	PCI_DEVICE_ID_VIA_6415,     0x00, 0xff, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt8251",	PCI_DEVICE_ID_VIA_8251,     0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt8237",	PCI_DEVICE_ID_VIA_8237,     0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt8237a",	PCI_DEVICE_ID_VIA_8237A,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt8235",	PCI_DEVICE_ID_VIA_8235,     0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt8233a",	PCI_DEVICE_ID_VIA_8233A,    0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ "vt8233c",	PCI_DEVICE_ID_VIA_8233C_0,  0x00, 0x2f, ATA_UDMA5, },
-	{ "vt8233",	PCI_DEVICE_ID_VIA_8233_0,   0x00, 0x2f, ATA_UDMA5, },
-	{ "vt8231",	PCI_DEVICE_ID_VIA_8231,     0x00, 0x2f, ATA_UDMA5, },
-	{ "vt82c686b",	PCI_DEVICE_ID_VIA_82C686,   0x40, 0x4f, ATA_UDMA5, },
-	{ "vt82c686a",	PCI_DEVICE_ID_VIA_82C686,   0x10, 0x2f, ATA_UDMA4, },
-	{ "vt82c686",	PCI_DEVICE_ID_VIA_82C686,   0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 },
-	{ "vt82c596b",	PCI_DEVICE_ID_VIA_82C596,   0x10, 0x2f, ATA_UDMA4, },
-	{ "vt82c596a",	PCI_DEVICE_ID_VIA_82C596,   0x00, 0x0f, ATA_UDMA2, VIA_BAD_CLK66 },
-	{ "vt82c586b",	PCI_DEVICE_ID_VIA_82C586_0, 0x47, 0x4f, ATA_UDMA2, VIA_SET_FIFO },
-	{ "vt82c586b",	PCI_DEVICE_ID_VIA_82C586_0, 0x40, 0x46, ATA_UDMA2, VIA_SET_FIFO | VIA_BAD_PREQ },
-	{ "vt82c586b",	PCI_DEVICE_ID_VIA_82C586_0, 0x30, 0x3f, ATA_UDMA2, VIA_SET_FIFO },
-	{ "vt82c586a",	PCI_DEVICE_ID_VIA_82C586_0, 0x20, 0x2f, ATA_UDMA2, VIA_SET_FIFO },
-	{ "vt82c586",	PCI_DEVICE_ID_VIA_82C586_0, 0x00, 0x0f,      0x00, VIA_SET_FIFO },
-	{ "vt82c576",	PCI_DEVICE_ID_VIA_82C576,   0x00, 0x2f,      0x00, VIA_SET_FIFO | VIA_NO_UNMASK },
-	{ "vt82c576",	PCI_DEVICE_ID_VIA_82C576,   0x00, 0x2f,      0x00, VIA_SET_FIFO | VIA_NO_UNMASK | VIA_BAD_ID },
-	{ "vtxxxx",	PCI_DEVICE_ID_VIA_ANON,     0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST },
-	{ NULL }
-};
-
-static unsigned int via_clock;
-static char *via_dma[] = { "16", "25", "33", "44", "66", "100", "133" };
-
-struct via82cxxx_dev
-{
-	struct via_isa_bridge *via_config;
-	unsigned int via_80w;
-};
-
-/**
- *	via_set_speed			-	write timing registers
- *	@dev: PCI device
- *	@dn: device
- *	@timing: IDE timing data to use
- *
- *	via_set_speed writes timing values to the chipset registers
- */
-
-static void via_set_speed(ide_hwif_t *hwif, u8 dn, struct ide_timing *timing)
-{
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct via82cxxx_dev *vdev = host->host_priv;
-	u8 t;
-
-	if (~vdev->via_config->flags & VIA_BAD_AST) {
-		pci_read_config_byte(dev, VIA_ADDRESS_SETUP, &t);
-		t = (t & ~(3 << ((3 - dn) << 1))) | ((clamp_val(timing->setup, 1, 4) - 1) << ((3 - dn) << 1));
-		pci_write_config_byte(dev, VIA_ADDRESS_SETUP, t);
-	}
-
-	pci_write_config_byte(dev, VIA_8BIT_TIMING + (1 - (dn >> 1)),
-		((clamp_val(timing->act8b, 1, 16) - 1) << 4) | (clamp_val(timing->rec8b, 1, 16) - 1));
-
-	pci_write_config_byte(dev, VIA_DRIVE_TIMING + (3 - dn),
-		((clamp_val(timing->active, 1, 16) - 1) << 4) | (clamp_val(timing->recover, 1, 16) - 1));
-
-	switch (vdev->via_config->udma_mask) {
-	case ATA_UDMA2: t = timing->udma ? (0xe0 | (clamp_val(timing->udma, 2, 5) - 2)) : 0x03; break;
-	case ATA_UDMA4: t = timing->udma ? (0xe8 | (clamp_val(timing->udma, 2, 9) - 2)) : 0x0f; break;
-	case ATA_UDMA5: t = timing->udma ? (0xe0 | (clamp_val(timing->udma, 2, 9) - 2)) : 0x07; break;
-	case ATA_UDMA6: t = timing->udma ? (0xe0 | (clamp_val(timing->udma, 2, 9) - 2)) : 0x07; break;
-	}
-
-	/* Set UDMA unless device is not UDMA capable */
-	if (vdev->via_config->udma_mask) {
-		u8 udma_etc;
-
-		pci_read_config_byte(dev, VIA_UDMA_TIMING + 3 - dn, &udma_etc);
-
-		/* clear transfer mode bit */
-		udma_etc &= ~0x20;
-
-		if (timing->udma) {
-			/* preserve 80-wire cable detection bit */
-			udma_etc &= 0x10;
-			udma_etc |= t;
-		}
-
-		pci_write_config_byte(dev, VIA_UDMA_TIMING + 3 - dn, udma_etc);
-	}
-}
-
-/**
- *	via_set_drive		-	configure transfer mode
- *	@hwif: port
- *	@drive: Drive to set up
- *
- *	via_set_drive() computes timing values configures the chipset to
- *	a desired transfer mode.  It also can be called by upper layers.
- */
-
-static void via_set_drive(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	ide_drive_t *peer = ide_get_pair_dev(drive);
-	struct ide_host *host = dev_get_drvdata(hwif->dev);
-	struct via82cxxx_dev *vdev = host->host_priv;
-	struct ide_timing t, p;
-	unsigned int T, UT;
-	const u8 speed = drive->dma_mode;
-
-	T = 1000000000 / via_clock;
-
-	switch (vdev->via_config->udma_mask) {
-	case ATA_UDMA2: UT = T;   break;
-	case ATA_UDMA4: UT = T/2; break;
-	case ATA_UDMA5: UT = T/3; break;
-	case ATA_UDMA6: UT = T/4; break;
-	default:	UT = T;
-	}
-
-	ide_timing_compute(drive, speed, &t, T, UT);
-
-	if (peer) {
-		ide_timing_compute(peer, peer->pio_mode, &p, T, UT);
-		ide_timing_merge(&p, &t, &t, IDE_TIMING_8BIT);
-	}
-
-	via_set_speed(hwif, drive->dn, &t);
-}
-
-/**
- *	via_set_pio_mode	-	set host controller for PIO mode
- *	@hwif: port
- *	@drive: drive
- *
- *	A callback from the upper layers for PIO-only tuning.
- */
-
-static void via_set_pio_mode(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	drive->dma_mode = drive->pio_mode;
-	via_set_drive(hwif, drive);
-}
-
-static struct via_isa_bridge *via_config_find(struct pci_dev **isa)
-{
-	struct via_isa_bridge *via_config;
-
-	for (via_config = via_isa_bridges;
-	     via_config->id != PCI_DEVICE_ID_VIA_ANON; via_config++)
-		if ((*isa = pci_get_device(PCI_VENDOR_ID_VIA +
-			!!(via_config->flags & VIA_BAD_ID),
-			via_config->id, NULL))) {
-
-			if ((*isa)->revision >= via_config->rev_min &&
-			    (*isa)->revision <= via_config->rev_max)
-				break;
-			pci_dev_put(*isa);
-		}
-
-	return via_config;
-}
-
-/*
- * Check and handle 80-wire cable presence
- */
-static void via_cable_detect(struct via82cxxx_dev *vdev, u32 u)
-{
-	int i;
-
-	switch (vdev->via_config->udma_mask) {
-		case ATA_UDMA4:
-			for (i = 24; i >= 0; i -= 8)
-				if (((u >> (i & 16)) & 8) &&
-				    ((u >> i) & 0x20) &&
-				     (((u >> i) & 7) < 2)) {
-					/*
-					 * 2x PCI clock and
-					 * UDMA w/ < 3T/cycle
-					 */
-					vdev->via_80w |= (1 << (1 - (i >> 4)));
-				}
-			break;
-
-		case ATA_UDMA5:
-			for (i = 24; i >= 0; i -= 8)
-				if (((u >> i) & 0x10) ||
-				    (((u >> i) & 0x20) &&
-				     (((u >> i) & 7) < 4))) {
-					/* BIOS 80-wire bit or
-					 * UDMA w/ < 60ns/cycle
-					 */
-					vdev->via_80w |= (1 << (1 - (i >> 4)));
-				}
-			break;
-
-		case ATA_UDMA6:
-			for (i = 24; i >= 0; i -= 8)
-				if (((u >> i) & 0x10) ||
-				    (((u >> i) & 0x20) &&
-				     (((u >> i) & 7) < 6))) {
-					/* BIOS 80-wire bit or
-					 * UDMA w/ < 60ns/cycle
-					 */
-					vdev->via_80w |= (1 << (1 - (i >> 4)));
-				}
-			break;
-	}
-}
-
-/**
- *	init_chipset_via82cxxx	-	initialization handler
- *	@dev: PCI device
- *
- *	The initialization callback. Here we determine the IDE chip type
- *	and initialize its drive independent registers.
- */
-
-static int init_chipset_via82cxxx(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct via82cxxx_dev *vdev = host->host_priv;
-	struct via_isa_bridge *via_config = vdev->via_config;
-	u8 t, v;
-	u32 u;
-
-	/*
-	 * Detect cable and configure Clk66
-	 */
-	pci_read_config_dword(dev, VIA_UDMA_TIMING, &u);
-
-	via_cable_detect(vdev, u);
-
-	if (via_config->udma_mask == ATA_UDMA4) {
-		/* Enable Clk66 */
-		pci_write_config_dword(dev, VIA_UDMA_TIMING, u|0x80008);
-	} else if (via_config->flags & VIA_BAD_CLK66) {
-		/* Would cause trouble on 596a and 686 */
-		pci_write_config_dword(dev, VIA_UDMA_TIMING, u & ~0x80008);
-	}
-
-	/*
-	 * Check whether interfaces are enabled.
-	 */
-
-	pci_read_config_byte(dev, VIA_IDE_ENABLE, &v);
-
-	/*
-	 * Set up FIFO sizes and thresholds.
-	 */
-
-	pci_read_config_byte(dev, VIA_FIFO_CONFIG, &t);
-
-	/* Disable PREQ# till DDACK# */
-	if (via_config->flags & VIA_BAD_PREQ) {
-		/* Would crash on 586b rev 41 */
-		t &= 0x7f;
-	}
-
-	/* Fix FIFO split between channels */
-	if (via_config->flags & VIA_SET_FIFO) {
-		t &= (t & 0x9f);
-		switch (v & 3) {
-			case 2: t |= 0x00; break;	/* 16 on primary */
-			case 1: t |= 0x60; break;	/* 16 on secondary */
-			case 3: t |= 0x20; break;	/* 8 pri 8 sec */
-		}
-	}
-
-	pci_write_config_byte(dev, VIA_FIFO_CONFIG, t);
-
-	return 0;
-}
-
-/*
- *	Cable special cases
- */
-
-static const struct dmi_system_id cable_dmi_table[] = {
-	{
-		.ident = "Acer Ferrari 3400",
-		.matches = {
-			DMI_MATCH(DMI_BOARD_VENDOR, "Acer,Inc."),
-			DMI_MATCH(DMI_BOARD_NAME, "Ferrari 3400"),
-		},
-	},
-	{ }
-};
-
-static int via_cable_override(struct pci_dev *pdev)
-{
-	/* Systems by DMI */
-	if (dmi_check_system(cable_dmi_table))
-		return 1;
-
-	/* Arima W730-K8/Targa Visionary 811/... */
-	if (pdev->subsystem_vendor == 0x161F &&
-	    pdev->subsystem_device == 0x2032)
-		return 1;
-
-	return 0;
-}
-
-static u8 via82cxxx_cable_detect(ide_hwif_t *hwif)
-{
-	struct pci_dev *pdev = to_pci_dev(hwif->dev);
-	struct ide_host *host = pci_get_drvdata(pdev);
-	struct via82cxxx_dev *vdev = host->host_priv;
-
-	if (via_cable_override(pdev))
-		return ATA_CBL_PATA40_SHORT;
-
-	if ((vdev->via_config->flags & VIA_SATA_PATA) && hwif->channel == 0)
-		return ATA_CBL_SATA;
-
-	if ((vdev->via_80w >> hwif->channel) & 1)
-		return ATA_CBL_PATA80;
-	else
-		return ATA_CBL_PATA40;
-}
-
-static const struct ide_port_ops via_port_ops = {
-	.set_pio_mode		= via_set_pio_mode,
-	.set_dma_mode		= via_set_drive,
-	.cable_detect		= via82cxxx_cable_detect,
-};
-
-static const struct ide_port_info via82cxxx_chipset = {
-	.name		= DRV_NAME,
-	.init_chipset	= init_chipset_via82cxxx,
-	.enablebits	= { { 0x40, 0x02, 0x02 }, { 0x40, 0x01, 0x01 } },
-	.port_ops	= &via_port_ops,
-	.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST |
-			  IDE_HFLAG_POST_SET_MODE |
-			  IDE_HFLAG_IO_32BIT,
-	.pio_mask	= ATA_PIO5,
-	.swdma_mask	= ATA_SWDMA2,
-	.mwdma_mask	= ATA_MWDMA2,
-};
-
-static int via_init_one(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct pci_dev *isa = NULL;
-	struct via_isa_bridge *via_config;
-	struct via82cxxx_dev *vdev;
-	int rc;
-	u8 idx = id->driver_data;
-	struct ide_port_info d;
-
-	d = via82cxxx_chipset;
-
-	/*
-	 * Find the ISA bridge and check we know what it is.
-	 */
-	via_config = via_config_find(&isa);
-
-	/*
-	 * Print the boot message.
-	 */
-	printk(KERN_INFO DRV_NAME " %s: VIA %s (rev %02x) IDE %sDMA%s\n",
-		pci_name(dev), via_config->name, isa->revision,
-		via_config->udma_mask ? "U" : "MW",
-		via_dma[via_config->udma_mask ?
-			(fls(via_config->udma_mask) - 1) : 0]);
-
-	pci_dev_put(isa);
-
-	/*
-	 * Determine system bus clock.
-	 */
-	via_clock = (ide_pci_clk ? ide_pci_clk : 33) * 1000;
-
-	switch (via_clock) {
-	case 33000: via_clock = 33333; break;
-	case 37000: via_clock = 37500; break;
-	case 41000: via_clock = 41666; break;
-	}
-
-	if (via_clock < 20000 || via_clock > 50000) {
-		printk(KERN_WARNING DRV_NAME ": User given PCI clock speed "
-			"impossible (%d), using 33 MHz instead.\n", via_clock);
-		via_clock = 33333;
-	}
-
-	if (idx == 1)
-		d.enablebits[1].reg = d.enablebits[0].reg = 0;
-	else
-		d.host_flags |= IDE_HFLAG_NO_AUTODMA;
-
-	if (idx == VIA_IDFLAG_SINGLE)
-		d.host_flags |= IDE_HFLAG_SINGLE;
-
-	if ((via_config->flags & VIA_NO_UNMASK) == 0)
-		d.host_flags |= IDE_HFLAG_UNMASK_IRQS;
-
-	d.udma_mask = via_config->udma_mask;
-
-	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
-	if (!vdev) {
-		printk(KERN_ERR DRV_NAME " %s: out of memory :(\n",
-			pci_name(dev));
-		return -ENOMEM;
-	}
-
-	vdev->via_config = via_config;
-
-	rc = ide_pci_init_one(dev, &d, vdev);
-	if (rc)
-		kfree(vdev);
-
-	return rc;
-}
-
-static void via_remove(struct pci_dev *dev)
-{
-	struct ide_host *host = pci_get_drvdata(dev);
-	struct via82cxxx_dev *vdev = host->host_priv;
-
-	ide_pci_remove(dev);
-	kfree(vdev);
-}
-
-static const struct pci_device_id via_pci_tbl[] = {
-	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_82C576_1),  0 },
-	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_82C586_1),  0 },
-	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_CX700_IDE), 0 },
-	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_VX855_IDE), VIA_IDFLAG_SINGLE },
-	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_6410),      1 },
-	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_6415),      1 },
-	{ PCI_VDEVICE(VIA, PCI_DEVICE_ID_VIA_SATA_EIDE), 1 },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, via_pci_tbl);
-
-static struct pci_driver via_pci_driver = {
-	.name 		= "VIA_IDE",
-	.id_table 	= via_pci_tbl,
-	.probe 		= via_init_one,
-	.remove		= via_remove,
-	.suspend	= ide_pci_suspend,
-	.resume		= ide_pci_resume,
-};
-
-static int __init via_ide_init(void)
-{
-	return ide_pci_register_driver(&via_pci_driver);
-}
-
-static void __exit via_ide_exit(void)
-{
-	pci_unregister_driver(&via_pci_driver);
-}
-
-module_init(via_ide_init);
-module_exit(via_ide_exit);
-
-MODULE_AUTHOR("Vojtech Pavlik, Bartlomiej Zolnierkiewicz, Michel Aubry, Jeff Garzik, Andre Hedrick");
-MODULE_DESCRIPTION("PCI driver module for VIA IDE");
-MODULE_LICENSE("GPL");
diff --git a/include/linux/ide.h b/include/linux/ide.h
deleted file mode 100644
index 2c300689a51a..000000000000
--- a/include/linux/ide.h
+++ /dev/null
@@ -1,1623 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _IDE_H
-#define _IDE_H
-/*
- *  linux/include/linux/ide.h
- *
- *  Copyright (C) 1994-2002  Linus Torvalds & authors
- */
-
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/ata.h>
-#include <linux/blk-mq.h>
-#include <linux/proc_fs.h>
-#include <linux/interrupt.h>
-#include <linux/bitops.h>
-#include <linux/bio.h>
-#include <linux/pci.h>
-#include <linux/completion.h>
-#include <linux/pm.h>
-#include <linux/mutex.h>
-/* for request_sense */
-#include <linux/cdrom.h>
-#include <scsi/scsi_cmnd.h>
-#include <asm/byteorder.h>
-#include <asm/io.h>
-
-/*
- * Probably not wise to fiddle with these
- */
-#define SUPPORT_VLB_SYNC 1
-#define IDE_DEFAULT_MAX_FAILURES	1
-#define ERROR_MAX	8	/* Max read/write errors per sector */
-#define ERROR_RESET	3	/* Reset controller every 4th retry */
-#define ERROR_RECAL	1	/* Recalibrate every 2nd retry */
-
-struct device;
-
-/* values for ide_request.type */
-enum ata_priv_type {
-	ATA_PRIV_MISC,
-	ATA_PRIV_TASKFILE,
-	ATA_PRIV_PC,
-	ATA_PRIV_SENSE,		/* sense request */
-	ATA_PRIV_PM_SUSPEND,	/* suspend request */
-	ATA_PRIV_PM_RESUME,	/* resume request */
-};
-
-struct ide_request {
-	struct scsi_request sreq;
-	u8 sense[SCSI_SENSE_BUFFERSIZE];
-	u8 type;
-	void *special;
-};
-
-static inline struct ide_request *ide_req(struct request *rq)
-{
-	return blk_mq_rq_to_pdu(rq);
-}
-
-static inline bool ata_misc_request(struct request *rq)
-{
-	return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_MISC;
-}
-
-static inline bool ata_taskfile_request(struct request *rq)
-{
-	return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_TASKFILE;
-}
-
-static inline bool ata_pc_request(struct request *rq)
-{
-	return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_PC;
-}
-
-static inline bool ata_sense_request(struct request *rq)
-{
-	return blk_rq_is_private(rq) && ide_req(rq)->type == ATA_PRIV_SENSE;
-}
-
-static inline bool ata_pm_request(struct request *rq)
-{
-	return blk_rq_is_private(rq) &&
-		(ide_req(rq)->type == ATA_PRIV_PM_SUSPEND ||
-		 ide_req(rq)->type == ATA_PRIV_PM_RESUME);
-}
-
-/* Error codes returned in result to the higher part of the driver. */
-enum {
-	IDE_DRV_ERROR_GENERAL	= 101,
-	IDE_DRV_ERROR_FILEMARK	= 102,
-	IDE_DRV_ERROR_EOD	= 103,
-};
-
-/*
- * Definitions for accessing IDE controller registers
- */
-#define IDE_NR_PORTS		(10)
-
-struct ide_io_ports {
-	unsigned long	data_addr;
-
-	union {
-		unsigned long error_addr;	/*   read:  error */
-		unsigned long feature_addr;	/*  write: feature */
-	};
-
-	unsigned long	nsect_addr;
-	unsigned long	lbal_addr;
-	unsigned long	lbam_addr;
-	unsigned long	lbah_addr;
-
-	unsigned long	device_addr;
-
-	union {
-		unsigned long status_addr;	/*  read: status  */
-		unsigned long command_addr;	/* write: command */
-	};
-
-	unsigned long	ctl_addr;
-
-	unsigned long	irq_addr;
-};
-
-#define OK_STAT(stat,good,bad)	(((stat)&((good)|(bad)))==(good))
-
-#define BAD_R_STAT	(ATA_BUSY | ATA_ERR)
-#define BAD_W_STAT	(BAD_R_STAT | ATA_DF)
-#define BAD_STAT	(BAD_R_STAT | ATA_DRQ)
-#define DRIVE_READY	(ATA_DRDY | ATA_DSC)
-
-#define BAD_CRC		(ATA_ABORTED | ATA_ICRC)
-
-#define SATA_NR_PORTS		(3)	/* 16 possible ?? */
-
-#define SATA_STATUS_OFFSET	(0)
-#define SATA_ERROR_OFFSET	(1)
-#define SATA_CONTROL_OFFSET	(2)
-
-/*
- * Our Physical Region Descriptor (PRD) table should be large enough
- * to handle the biggest I/O request we are likely to see.  Since requests
- * can have no more than 256 sectors, and since the typical blocksize is
- * two or more sectors, we could get by with a limit of 128 entries here for
- * the usual worst case.  Most requests seem to include some contiguous blocks,
- * further reducing the number of table entries required.
- *
- * The driver reverts to PIO mode for individual requests that exceed
- * this limit (possible with 512 byte blocksizes, eg. MSDOS f/s), so handling
- * 100% of all crazy scenarios here is not necessary.
- *
- * As it turns out though, we must allocate a full 4KB page for this,
- * so the two PRD tables (ide0 & ide1) will each get half of that,
- * allowing each to have about 256 entries (8 bytes each) from this.
- */
-#define PRD_BYTES       8
-#define PRD_ENTRIES	256
-
-/*
- * Some more useful definitions
- */
-#define PARTN_BITS	6	/* number of minor dev bits for partitions */
-#define MAX_DRIVES	2	/* per interface; 2 assumed by lots of code */
-
-/*
- * Timeouts for various operations:
- */
-enum {
-	/* spec allows up to 20ms, but CF cards and SSD drives need more */
-	WAIT_DRQ	= 1 * HZ,	/* 1s */
-	/* some laptops are very slow */
-	WAIT_READY	= 5 * HZ,	/* 5s */
-	/* should be less than 3ms (?), if all ATAPI CD is closed at boot */
-	WAIT_PIDENTIFY	= 10 * HZ,	/* 10s */
-	/* worst case when spinning up */
-	WAIT_WORSTCASE	= 30 * HZ,	/* 30s */
-	/* maximum wait for an IRQ to happen */
-	WAIT_CMD	= 10 * HZ,	/* 10s */
-	/* Some drives require a longer IRQ timeout. */
-	WAIT_FLOPPY_CMD	= 50 * HZ,	/* 50s */
-	/*
-	 * Some drives (for example, Seagate STT3401A Travan) require a very
-	 * long timeout, because they don't return an interrupt or clear their
-	 * BSY bit until after the command completes (even retension commands).
-	 */
-	WAIT_TAPE_CMD	= 900 * HZ,	/* 900s */
-	/* minimum sleep time */
-	WAIT_MIN_SLEEP	= HZ / 50,	/* 20ms */
-};
-
-/*
- * Op codes for special requests to be handled by ide_special_rq().
- * Values should be in the range of 0x20 to 0x3f.
- */
-#define REQ_DRIVE_RESET		0x20
-#define REQ_DEVSET_EXEC		0x21
-#define REQ_PARK_HEADS		0x22
-#define REQ_UNPARK_HEADS	0x23
-
-/*
- * hwif_chipset_t is used to keep track of the specific hardware
- * chipset used by each IDE interface, if known.
- */
-enum {		ide_unknown,	ide_generic,	ide_pci,
-		ide_cmd640,	ide_dtc2278,	ide_ali14xx,
-		ide_qd65xx,	ide_umc8672,	ide_ht6560b,
-		ide_4drives,	ide_pmac,	ide_acorn,
-		ide_au1xxx,	ide_palm3710
-};
-
-typedef u8 hwif_chipset_t;
-
-/*
- * Structure to hold all information about the location of this port
- */
-struct ide_hw {
-	union {
-		struct ide_io_ports	io_ports;
-		unsigned long		io_ports_array[IDE_NR_PORTS];
-	};
-
-	int		irq;			/* our irq number */
-	struct device	*dev, *parent;
-	unsigned long	config;
-};
-
-static inline void ide_std_init_ports(struct ide_hw *hw,
-				      unsigned long io_addr,
-				      unsigned long ctl_addr)
-{
-	unsigned int i;
-
-	for (i = 0; i <= 7; i++)
-		hw->io_ports_array[i] = io_addr++;
-
-	hw->io_ports.ctl_addr = ctl_addr;
-}
-
-#define MAX_HWIFS	10
-
-/*
- * Now for the data we need to maintain per-drive:  ide_drive_t
- */
-
-#define ide_scsi	0x21
-#define ide_disk	0x20
-#define ide_optical	0x7
-#define ide_cdrom	0x5
-#define ide_tape	0x1
-#define ide_floppy	0x0
-
-/*
- * Special Driver Flags
- */
-enum {
-	IDE_SFLAG_SET_GEOMETRY		= BIT(0),
-	IDE_SFLAG_RECALIBRATE		= BIT(1),
-	IDE_SFLAG_SET_MULTMODE		= BIT(2),
-};
-
-/*
- * Status returned from various ide_ functions
- */
-typedef enum {
-	ide_stopped,	/* no drive operation was started */
-	ide_started,	/* a drive operation was started, handler was set */
-} ide_startstop_t;
-
-enum {
-	IDE_VALID_ERROR 		= BIT(1),
-	IDE_VALID_FEATURE		= IDE_VALID_ERROR,
-	IDE_VALID_NSECT 		= BIT(2),
-	IDE_VALID_LBAL			= BIT(3),
-	IDE_VALID_LBAM			= BIT(4),
-	IDE_VALID_LBAH			= BIT(5),
-	IDE_VALID_DEVICE		= BIT(6),
-	IDE_VALID_LBA			= IDE_VALID_LBAL |
-					  IDE_VALID_LBAM |
-					  IDE_VALID_LBAH,
-	IDE_VALID_OUT_TF		= IDE_VALID_FEATURE |
-					  IDE_VALID_NSECT |
-					  IDE_VALID_LBA,
-	IDE_VALID_IN_TF 		= IDE_VALID_NSECT |
-					  IDE_VALID_LBA,
-	IDE_VALID_OUT_HOB		= IDE_VALID_OUT_TF,
-	IDE_VALID_IN_HOB		= IDE_VALID_ERROR |
-					  IDE_VALID_NSECT |
-					  IDE_VALID_LBA,
-};
-
-enum {
-	IDE_TFLAG_LBA48			= BIT(0),
-	IDE_TFLAG_WRITE			= BIT(1),
-	IDE_TFLAG_CUSTOM_HANDLER	= BIT(2),
-	IDE_TFLAG_DMA_PIO_FALLBACK	= BIT(3),
-	/* force 16-bit I/O operations */
-	IDE_TFLAG_IO_16BIT		= BIT(4),
-	/* struct ide_cmd was allocated using kmalloc() */
-	IDE_TFLAG_DYN			= BIT(5),
-	IDE_TFLAG_FS			= BIT(6),
-	IDE_TFLAG_MULTI_PIO		= BIT(7),
-	IDE_TFLAG_SET_XFER		= BIT(8),
-};
-
-enum {
-	IDE_FTFLAG_FLAGGED		= BIT(0),
-	IDE_FTFLAG_SET_IN_FLAGS		= BIT(1),
-	IDE_FTFLAG_OUT_DATA		= BIT(2),
-	IDE_FTFLAG_IN_DATA		= BIT(3),
-};
-
-struct ide_taskfile {
-	u8	data;		/* 0: data byte (for TASKFILE ioctl) */
-	union {			/* 1: */
-		u8 error;	/*  read: error */
-		u8 feature;	/* write: feature */
-	};
-	u8	nsect;		/* 2: number of sectors */
-	u8	lbal;		/* 3: LBA low */
-	u8	lbam;		/* 4: LBA mid */
-	u8	lbah;		/* 5: LBA high */
-	u8	device;		/* 6: device select */
-	union {			/* 7: */
-		u8 status;	/*  read: status */
-		u8 command;	/* write: command */
-	};
-};
-
-struct ide_cmd {
-	struct ide_taskfile	tf;
-	struct ide_taskfile	hob;
-	struct {
-		struct {
-			u8		tf;
-			u8		hob;
-		} out, in;
-	} valid;
-
-	u16			tf_flags;
-	u8			ftf_flags;	/* for TASKFILE ioctl */
-	int			protocol;
-
-	int			sg_nents;	  /* number of sg entries */
-	int			orig_sg_nents;
-	int			sg_dma_direction; /* DMA transfer direction */
-
-	unsigned int		nbytes;
-	unsigned int		nleft;
-	unsigned int		last_xfer_len;
-
-	struct scatterlist	*cursg;
-	unsigned int		cursg_ofs;
-
-	struct request		*rq;		/* copy of request */
-};
-
-/* ATAPI packet command flags */
-enum {
-	/* set when an error is considered normal - no retry (ide-tape) */
-	PC_FLAG_ABORT			= BIT(0),
-	PC_FLAG_SUPPRESS_ERROR		= BIT(1),
-	PC_FLAG_WAIT_FOR_DSC		= BIT(2),
-	PC_FLAG_DMA_OK			= BIT(3),
-	PC_FLAG_DMA_IN_PROGRESS		= BIT(4),
-	PC_FLAG_DMA_ERROR		= BIT(5),
-	PC_FLAG_WRITING			= BIT(6),
-};
-
-#define ATAPI_WAIT_PC		(60 * HZ)
-
-struct ide_atapi_pc {
-	/* actual packet bytes */
-	u8 c[12];
-	/* incremented on each retry */
-	int retries;
-	int error;
-
-	/* bytes to transfer */
-	int req_xfer;
-
-	/* the corresponding request */
-	struct request *rq;
-
-	unsigned long flags;
-
-	/*
-	 * those are more or less driver-specific and some of them are subject
-	 * to change/removal later.
-	 */
-	unsigned long timeout;
-};
-
-struct ide_devset;
-struct ide_driver;
-
-#ifdef CONFIG_BLK_DEV_IDEACPI
-struct ide_acpi_drive_link;
-struct ide_acpi_hwif_link;
-#endif
-
-struct ide_drive_s;
-
-struct ide_disk_ops {
-	int		(*check)(struct ide_drive_s *, const char *);
-	int		(*get_capacity)(struct ide_drive_s *);
-	void		(*unlock_native_capacity)(struct ide_drive_s *);
-	void		(*setup)(struct ide_drive_s *);
-	void		(*flush)(struct ide_drive_s *);
-	int		(*init_media)(struct ide_drive_s *, struct gendisk *);
-	int		(*set_doorlock)(struct ide_drive_s *, struct gendisk *,
-					int);
-	ide_startstop_t	(*do_request)(struct ide_drive_s *, struct request *,
-				      sector_t);
-	int		(*ioctl)(struct ide_drive_s *, struct block_device *,
-				 fmode_t, unsigned int, unsigned long);
-	int		(*compat_ioctl)(struct ide_drive_s *, struct block_device *,
-					fmode_t, unsigned int, unsigned long);
-};
-
-/* ATAPI device flags */
-enum {
-	IDE_AFLAG_DRQ_INTERRUPT		= BIT(0),
-
-	/* ide-cd */
-	/* Drive cannot eject the disc. */
-	IDE_AFLAG_NO_EJECT		= BIT(1),
-	/* Drive is a pre ATAPI 1.2 drive. */
-	IDE_AFLAG_PRE_ATAPI12		= BIT(2),
-	/* TOC addresses are in BCD. */
-	IDE_AFLAG_TOCADDR_AS_BCD	= BIT(3),
-	/* TOC track numbers are in BCD. */
-	IDE_AFLAG_TOCTRACKS_AS_BCD	= BIT(4),
-	/* Saved TOC information is current. */
-	IDE_AFLAG_TOC_VALID		= BIT(6),
-	/* We think that the drive door is locked. */
-	IDE_AFLAG_DOOR_LOCKED		= BIT(7),
-	/* SET_CD_SPEED command is unsupported. */
-	IDE_AFLAG_NO_SPEED_SELECT	= BIT(8),
-	IDE_AFLAG_VERTOS_300_SSD	= BIT(9),
-	IDE_AFLAG_VERTOS_600_ESD	= BIT(10),
-	IDE_AFLAG_SANYO_3CD		= BIT(11),
-	IDE_AFLAG_FULL_CAPS_PAGE	= BIT(12),
-	IDE_AFLAG_PLAY_AUDIO_OK		= BIT(13),
-	IDE_AFLAG_LE_SPEED_FIELDS	= BIT(14),
-
-	/* ide-floppy */
-	/* Avoid commands not supported in Clik drive */
-	IDE_AFLAG_CLIK_DRIVE		= BIT(15),
-	/* Requires BH algorithm for packets */
-	IDE_AFLAG_ZIP_DRIVE		= BIT(16),
-	/* Supports format progress report */
-	IDE_AFLAG_SRFP			= BIT(17),
-
-	/* ide-tape */
-	IDE_AFLAG_IGNORE_DSC		= BIT(18),
-	/* 0 When the tape position is unknown */
-	IDE_AFLAG_ADDRESS_VALID		= BIT(19),
-	/* Device already opened */
-	IDE_AFLAG_BUSY			= BIT(20),
-	/* Attempt to auto-detect the current user block size */
-	IDE_AFLAG_DETECT_BS		= BIT(21),
-	/* Currently on a filemark */
-	IDE_AFLAG_FILEMARK		= BIT(22),
-	/* 0 = no tape is loaded, so we don't rewind after ejecting */
-	IDE_AFLAG_MEDIUM_PRESENT	= BIT(23),
-
-	IDE_AFLAG_NO_AUTOCLOSE		= BIT(24),
-};
-
-/* device flags */
-enum {
-	/* restore settings after device reset */
-	IDE_DFLAG_KEEP_SETTINGS		= BIT(0),
-	/* device is using DMA for read/write */
-	IDE_DFLAG_USING_DMA		= BIT(1),
-	/* okay to unmask other IRQs */
-	IDE_DFLAG_UNMASK		= BIT(2),
-	/* don't attempt flushes */
-	IDE_DFLAG_NOFLUSH		= BIT(3),
-	/* DSC overlap */
-	IDE_DFLAG_DSC_OVERLAP		= BIT(4),
-	/* give potential excess bandwidth */
-	IDE_DFLAG_NICE1			= BIT(5),
-	/* device is physically present */
-	IDE_DFLAG_PRESENT		= BIT(6),
-	/* disable Host Protected Area */
-	IDE_DFLAG_NOHPA			= BIT(7),
-	/* id read from device (synthetic if not set) */
-	IDE_DFLAG_ID_READ		= BIT(8),
-	IDE_DFLAG_NOPROBE		= BIT(9),
-	/* need to do check_media_change() */
-	IDE_DFLAG_REMOVABLE		= BIT(10),
-	IDE_DFLAG_FORCED_GEOM		= BIT(12),
-	/* disallow setting unmask bit */
-	IDE_DFLAG_NO_UNMASK		= BIT(13),
-	/* disallow enabling 32-bit I/O */
-	IDE_DFLAG_NO_IO_32BIT		= BIT(14),
-	/* for removable only: door lock/unlock works */
-	IDE_DFLAG_DOORLOCKING		= BIT(15),
-	/* disallow DMA */
-	IDE_DFLAG_NODMA			= BIT(16),
-	/* powermanagement told us not to do anything, so sleep nicely */
-	IDE_DFLAG_BLOCKED		= BIT(17),
-	/* sleeping & sleep field valid */
-	IDE_DFLAG_SLEEPING		= BIT(18),
-	IDE_DFLAG_POST_RESET		= BIT(19),
-	IDE_DFLAG_UDMA33_WARNED		= BIT(20),
-	IDE_DFLAG_LBA48			= BIT(21),
-	/* status of write cache */
-	IDE_DFLAG_WCACHE		= BIT(22),
-	/* used for ignoring ATA_DF */
-	IDE_DFLAG_NOWERR		= BIT(23),
-	/* retrying in PIO */
-	IDE_DFLAG_DMA_PIO_RETRY		= BIT(24),
-	IDE_DFLAG_LBA			= BIT(25),
-	/* don't unload heads */
-	IDE_DFLAG_NO_UNLOAD		= BIT(26),
-	/* heads unloaded, please don't reset port */
-	IDE_DFLAG_PARKED		= BIT(27),
-	IDE_DFLAG_MEDIA_CHANGED		= BIT(28),
-	/* write protect */
-	IDE_DFLAG_WP			= BIT(29),
-	IDE_DFLAG_FORMAT_IN_PROGRESS	= BIT(30),
-	IDE_DFLAG_NIEN_QUIRK		= BIT(31),
-};
-
-struct ide_drive_s {
-	char		name[4];	/* drive name, such as "hda" */
-        char            driver_req[10];	/* requests specific driver */
-
-	struct request_queue	*queue;	/* request queue */
-
-	bool (*prep_rq)(struct ide_drive_s *, struct request *);
-
-	struct blk_mq_tag_set	tag_set;
-
-	struct request		*rq;	/* current request */
-	void		*driver_data;	/* extra driver data */
-	u16			*id;	/* identification info */
-#ifdef CONFIG_IDE_PROC_FS
-	struct proc_dir_entry *proc;	/* /proc/ide/ directory entry */
-	const struct ide_proc_devset *settings; /* /proc/ide/ drive settings */
-#endif
-	struct hwif_s		*hwif;	/* actually (ide_hwif_t *) */
-
-	const struct ide_disk_ops *disk_ops;
-
-	unsigned long dev_flags;
-
-	unsigned long sleep;		/* sleep until this time */
-	unsigned long timeout;		/* max time to wait for irq */
-
-	u8	special_flags;		/* special action flags */
-
-	u8	select;			/* basic drive/head select reg value */
-	u8	retry_pio;		/* retrying dma capable host in pio */
-	u8	waiting_for_dma;	/* dma currently in progress */
-	u8	dma;			/* atapi dma flag */
-
-        u8	init_speed;	/* transfer rate set at boot */
-        u8	current_speed;	/* current transfer rate set */
-	u8	desired_speed;	/* desired transfer rate set */
-	u8	pio_mode;	/* for ->set_pio_mode _only_ */
-	u8	dma_mode;	/* for ->set_dma_mode _only_ */
-	u8	dn;		/* now wide spread use */
-	u8	acoustic;	/* acoustic management */
-	u8	media;		/* disk, cdrom, tape, floppy, ... */
-	u8	ready_stat;	/* min status value for drive ready */
-	u8	mult_count;	/* current multiple sector setting */
-	u8	mult_req;	/* requested multiple sector setting */
-	u8	io_32bit;	/* 0=16-bit, 1=32-bit, 2/3=32bit+sync */
-	u8	bad_wstat;	/* used for ignoring ATA_DF */
-	u8	head;		/* "real" number of heads */
-	u8	sect;		/* "real" sectors per track */
-	u8	bios_head;	/* BIOS/fdisk/LILO number of heads */
-	u8	bios_sect;	/* BIOS/fdisk/LILO sectors per track */
-
-	/* delay this long before sending packet command */
-	u8 pc_delay;
-
-	unsigned int	bios_cyl;	/* BIOS/fdisk/LILO number of cyls */
-	unsigned int	cyl;		/* "real" number of cyls */
-	void		*drive_data;	/* used by set_pio_mode/dev_select() */
-	unsigned int	failures;	/* current failure count */
-	unsigned int	max_failures;	/* maximum allowed failure count */
-	u64		probed_capacity;/* initial/native media capacity */
-	u64		capacity64;	/* total number of sectors */
-
-	int		lun;		/* logical unit */
-	int		crc_count;	/* crc counter to reduce drive speed */
-
-	unsigned long	debug_mask;	/* debugging levels switch */
-
-#ifdef CONFIG_BLK_DEV_IDEACPI
-	struct ide_acpi_drive_link *acpidata;
-#endif
-	struct list_head list;
-	struct device	gendev;
-	struct completion gendev_rel_comp;	/* to deal with device release() */
-
-	/* current packet command */
-	struct ide_atapi_pc *pc;
-
-	/* last failed packet command */
-	struct ide_atapi_pc *failed_pc;
-
-	/* callback for packet commands */
-	int  (*pc_callback)(struct ide_drive_s *, int);
-
-	ide_startstop_t (*irq_handler)(struct ide_drive_s *);
-
-	unsigned long atapi_flags;
-
-	struct ide_atapi_pc request_sense_pc;
-
-	/* current sense rq and buffer */
-	bool sense_rq_armed;
-	bool sense_rq_active;
-	struct request *sense_rq;
-	struct request_sense sense_data;
-
-	/* async sense insertion */
-	struct work_struct rq_work;
-	struct list_head rq_list;
-};
-
-typedef struct ide_drive_s ide_drive_t;
-
-#define to_ide_device(dev)		container_of(dev, ide_drive_t, gendev)
-
-#define to_ide_drv(obj, cont_type)	\
-	container_of(obj, struct cont_type, dev)
-
-#define ide_drv_g(disk, cont_type)	\
-	container_of((disk)->private_data, struct cont_type, driver)
-
-struct ide_port_info;
-
-struct ide_tp_ops {
-	void	(*exec_command)(struct hwif_s *, u8);
-	u8	(*read_status)(struct hwif_s *);
-	u8	(*read_altstatus)(struct hwif_s *);
-	void	(*write_devctl)(struct hwif_s *, u8);
-
-	void	(*dev_select)(ide_drive_t *);
-	void	(*tf_load)(ide_drive_t *, struct ide_taskfile *, u8);
-	void	(*tf_read)(ide_drive_t *, struct ide_taskfile *, u8);
-
-	void	(*input_data)(ide_drive_t *, struct ide_cmd *,
-			      void *, unsigned int);
-	void	(*output_data)(ide_drive_t *, struct ide_cmd *,
-			       void *, unsigned int);
-};
-
-extern const struct ide_tp_ops default_tp_ops;
-
-/**
- * struct ide_port_ops - IDE port operations
- *
- * @init_dev:		host specific initialization of a device
- * @set_pio_mode:	routine to program host for PIO mode
- * @set_dma_mode:	routine to program host for DMA mode
- * @reset_poll:		chipset polling based on hba specifics
- * @pre_reset:		chipset specific changes to default for device-hba resets
- * @resetproc:		routine to reset controller after a disk reset
- * @maskproc:		special host masking for drive selection
- * @quirkproc:		check host's drive quirk list
- * @clear_irq:		clear IRQ
- *
- * @mdma_filter:	filter MDMA modes
- * @udma_filter:	filter UDMA modes
- *
- * @cable_detect:	detect cable type
- */
-struct ide_port_ops {
-	void	(*init_dev)(ide_drive_t *);
-	void	(*set_pio_mode)(struct hwif_s *, ide_drive_t *);
-	void	(*set_dma_mode)(struct hwif_s *, ide_drive_t *);
-	blk_status_t (*reset_poll)(ide_drive_t *);
-	void	(*pre_reset)(ide_drive_t *);
-	void	(*resetproc)(ide_drive_t *);
-	void	(*maskproc)(ide_drive_t *, int);
-	void	(*quirkproc)(ide_drive_t *);
-	void	(*clear_irq)(ide_drive_t *);
-	int	(*test_irq)(struct hwif_s *);
-
-	u8	(*mdma_filter)(ide_drive_t *);
-	u8	(*udma_filter)(ide_drive_t *);
-
-	u8	(*cable_detect)(struct hwif_s *);
-};
-
-struct ide_dma_ops {
-	void	(*dma_host_set)(struct ide_drive_s *, int);
-	int	(*dma_setup)(struct ide_drive_s *, struct ide_cmd *);
-	void	(*dma_start)(struct ide_drive_s *);
-	int	(*dma_end)(struct ide_drive_s *);
-	int	(*dma_test_irq)(struct ide_drive_s *);
-	void	(*dma_lost_irq)(struct ide_drive_s *);
-	/* below ones are optional */
-	int	(*dma_check)(struct ide_drive_s *, struct ide_cmd *);
-	int	(*dma_timer_expiry)(struct ide_drive_s *);
-	void	(*dma_clear)(struct ide_drive_s *);
-	/*
-	 * The following method is optional and only required to be
-	 * implemented for the SFF-8038i compatible controllers.
-	 */
-	u8	(*dma_sff_read_status)(struct hwif_s *);
-};
-
-enum {
-	IDE_PFLAG_PROBING		= BIT(0),
-};
-
-struct ide_host;
-
-typedef struct hwif_s {
-	struct hwif_s *mate;		/* other hwif from same PCI chip */
-	struct proc_dir_entry *proc;	/* /proc/ide/ directory entry */
-
-	struct ide_host *host;
-
-	char name[6];			/* name of interface, eg. "ide0" */
-
-	struct ide_io_ports	io_ports;
-
-	unsigned long	sata_scr[SATA_NR_PORTS];
-
-	ide_drive_t	*devices[MAX_DRIVES + 1];
-
-	unsigned long	port_flags;
-
-	u8 major;	/* our major number */
-	u8 index;	/* 0 for ide0; 1 for ide1; ... */
-	u8 channel;	/* for dual-port chips: 0=primary, 1=secondary */
-
-	u32 host_flags;
-
-	u8 pio_mask;
-
-	u8 ultra_mask;
-	u8 mwdma_mask;
-	u8 swdma_mask;
-
-	u8 cbl;		/* cable type */
-
-	hwif_chipset_t chipset;	/* sub-module for tuning.. */
-
-	struct device *dev;
-
-	void (*rw_disk)(ide_drive_t *, struct request *);
-
-	const struct ide_tp_ops		*tp_ops;
-	const struct ide_port_ops	*port_ops;
-	const struct ide_dma_ops	*dma_ops;
-
-	/* dma physical region descriptor table (cpu view) */
-	unsigned int	*dmatable_cpu;
-	/* dma physical region descriptor table (dma view) */
-	dma_addr_t	dmatable_dma;
-
-	/* maximum number of PRD table entries */
-	int prd_max_nents;
-	/* PRD entry size in bytes */
-	int prd_ent_size;
-
-	/* Scatter-gather list used to build the above */
-	struct scatterlist *sg_table;
-	int sg_max_nents;		/* Maximum number of entries in it */
-
-	struct ide_cmd cmd;		/* current command */
-
-	int		rqsize;		/* max sectors per request */
-	int		irq;		/* our irq number */
-
-	unsigned long	dma_base;	/* base addr for dma ports */
-
-	unsigned long	config_data;	/* for use by chipset-specific code */
-	unsigned long	select_data;	/* for use by chipset-specific code */
-
-	unsigned long	extra_base;	/* extra addr for dma ports */
-	unsigned	extra_ports;	/* number of extra dma ports */
-
-	unsigned	present    : 1;	/* this interface exists */
-	unsigned	busy	   : 1; /* serializes devices on a port */
-
-	struct device		gendev;
-	struct device		*portdev;
-
-	struct completion gendev_rel_comp; /* To deal with device release() */
-
-	void		*hwif_data;	/* extra hwif data */
-
-#ifdef CONFIG_BLK_DEV_IDEACPI
-	struct ide_acpi_hwif_link *acpidata;
-#endif
-
-	/* IRQ handler, if active */
-	ide_startstop_t	(*handler)(ide_drive_t *);
-
-	/* BOOL: polling active & poll_timeout field valid */
-	unsigned int polling : 1;
-
-	/* current drive */
-	ide_drive_t *cur_dev;
-
-	/* current request */
-	struct request *rq;
-
-	/* failsafe timer */
-	struct timer_list timer;
-	/* timeout value during long polls */
-	unsigned long poll_timeout;
-	/* queried upon timeouts */
-	int (*expiry)(ide_drive_t *);
-
-	int req_gen;
-	int req_gen_timer;
-
-	spinlock_t lock;
-} ____cacheline_internodealigned_in_smp ide_hwif_t;
-
-#define MAX_HOST_PORTS 4
-
-struct ide_host {
-	ide_hwif_t	*ports[MAX_HOST_PORTS + 1];
-	unsigned int	n_ports;
-	struct device	*dev[2];
-
-	int		(*init_chipset)(struct pci_dev *);
-
-	void		(*get_lock)(irq_handler_t, void *);
-	void		(*release_lock)(void);
-
-	irq_handler_t	irq_handler;
-
-	unsigned long	host_flags;
-
-	int		irq_flags;
-
-	void		*host_priv;
-	ide_hwif_t	*cur_port;	/* for hosts requiring serialization */
-
-	/* used for hosts requiring serialization */
-	volatile unsigned long	host_busy;
-};
-
-#define IDE_HOST_BUSY 0
-
-/*
- *  internal ide interrupt handler type
- */
-typedef ide_startstop_t (ide_handler_t)(ide_drive_t *);
-typedef int (ide_expiry_t)(ide_drive_t *);
-
-/* used by ide-cd, ide-floppy, etc. */
-typedef void (xfer_func_t)(ide_drive_t *, struct ide_cmd *, void *, unsigned);
-
-extern struct mutex ide_setting_mtx;
-
-/*
- * configurable drive settings
- */
-
-#define DS_SYNC	BIT(0)
-
-struct ide_devset {
-	int		(*get)(ide_drive_t *);
-	int		(*set)(ide_drive_t *, int);
-	unsigned int	flags;
-};
-
-#define __DEVSET(_flags, _get, _set) { \
-	.flags	= _flags, \
-	.get	= _get,	\
-	.set	= _set,	\
-}
-
-#define ide_devset_get(name, field) \
-static int get_##name(ide_drive_t *drive) \
-{ \
-	return drive->field; \
-}
-
-#define ide_devset_set(name, field) \
-static int set_##name(ide_drive_t *drive, int arg) \
-{ \
-	drive->field = arg; \
-	return 0; \
-}
-
-#define ide_devset_get_flag(name, flag) \
-static int get_##name(ide_drive_t *drive) \
-{ \
-	return !!(drive->dev_flags & flag); \
-}
-
-#define ide_devset_set_flag(name, flag) \
-static int set_##name(ide_drive_t *drive, int arg) \
-{ \
-	if (arg) \
-		drive->dev_flags |= flag; \
-	else \
-		drive->dev_flags &= ~flag; \
-	return 0; \
-}
-
-#define __IDE_DEVSET(_name, _flags, _get, _set) \
-const struct ide_devset ide_devset_##_name = \
-	__DEVSET(_flags, _get, _set)
-
-#define IDE_DEVSET(_name, _flags, _get, _set) \
-static __IDE_DEVSET(_name, _flags, _get, _set)
-
-#define ide_devset_rw(_name, _func) \
-IDE_DEVSET(_name, 0, get_##_func, set_##_func)
-
-#define ide_devset_w(_name, _func) \
-IDE_DEVSET(_name, 0, NULL, set_##_func)
-
-#define ide_ext_devset_rw(_name, _func) \
-__IDE_DEVSET(_name, 0, get_##_func, set_##_func)
-
-#define ide_ext_devset_rw_sync(_name, _func) \
-__IDE_DEVSET(_name, DS_SYNC, get_##_func, set_##_func)
-
-#define ide_decl_devset(_name) \
-extern const struct ide_devset ide_devset_##_name
-
-ide_decl_devset(io_32bit);
-ide_decl_devset(keepsettings);
-ide_decl_devset(pio_mode);
-ide_decl_devset(unmaskirq);
-ide_decl_devset(using_dma);
-
-#ifdef CONFIG_IDE_PROC_FS
-/*
- * /proc/ide interface
- */
-
-#define ide_devset_rw_field(_name, _field) \
-ide_devset_get(_name, _field); \
-ide_devset_set(_name, _field); \
-IDE_DEVSET(_name, DS_SYNC, get_##_name, set_##_name)
-
-#define ide_devset_ro_field(_name, _field) \
-ide_devset_get(_name, _field); \
-IDE_DEVSET(_name, 0, get_##_name, NULL)
-
-#define ide_devset_rw_flag(_name, _field) \
-ide_devset_get_flag(_name, _field); \
-ide_devset_set_flag(_name, _field); \
-IDE_DEVSET(_name, DS_SYNC, get_##_name, set_##_name)
-
-struct ide_proc_devset {
-	const char		*name;
-	const struct ide_devset	*setting;
-	int			min, max;
-	int			(*mulf)(ide_drive_t *);
-	int			(*divf)(ide_drive_t *);
-};
-
-#define __IDE_PROC_DEVSET(_name, _min, _max, _mulf, _divf) { \
-	.name = __stringify(_name), \
-	.setting = &ide_devset_##_name, \
-	.min = _min, \
-	.max = _max, \
-	.mulf = _mulf, \
-	.divf = _divf, \
-}
-
-#define IDE_PROC_DEVSET(_name, _min, _max) \
-__IDE_PROC_DEVSET(_name, _min, _max, NULL, NULL)
-
-typedef struct {
-	const char	*name;
-	umode_t		mode;
-	int (*show)(struct seq_file *, void *);
-} ide_proc_entry_t;
-
-void proc_ide_create(void);
-void proc_ide_destroy(void);
-void ide_proc_register_port(ide_hwif_t *);
-void ide_proc_port_register_devices(ide_hwif_t *);
-void ide_proc_unregister_device(ide_drive_t *);
-void ide_proc_unregister_port(ide_hwif_t *);
-void ide_proc_register_driver(ide_drive_t *, struct ide_driver *);
-void ide_proc_unregister_driver(ide_drive_t *, struct ide_driver *);
-
-int ide_capacity_proc_show(struct seq_file *m, void *v);
-int ide_geometry_proc_show(struct seq_file *m, void *v);
-#else
-static inline void proc_ide_create(void) { ; }
-static inline void proc_ide_destroy(void) { ; }
-static inline void ide_proc_register_port(ide_hwif_t *hwif) { ; }
-static inline void ide_proc_port_register_devices(ide_hwif_t *hwif) { ; }
-static inline void ide_proc_unregister_device(ide_drive_t *drive) { ; }
-static inline void ide_proc_unregister_port(ide_hwif_t *hwif) { ; }
-static inline void ide_proc_register_driver(ide_drive_t *drive,
-					    struct ide_driver *driver) { ; }
-static inline void ide_proc_unregister_driver(ide_drive_t *drive,
-					      struct ide_driver *driver) { ; }
-#endif
-
-enum {
-	/* enter/exit functions */
-	IDE_DBG_FUNC =			BIT(0),
-	/* sense key/asc handling */
-	IDE_DBG_SENSE =			BIT(1),
-	/* packet commands handling */
-	IDE_DBG_PC =			BIT(2),
-	/* request handling */
-	IDE_DBG_RQ =			BIT(3),
-	/* driver probing/setup */
-	IDE_DBG_PROBE =			BIT(4),
-};
-
-/* DRV_NAME has to be defined in the driver before using the macro below */
-#define __ide_debug_log(lvl, fmt, args...)				\
-{									\
-	if (unlikely(drive->debug_mask & lvl))				\
-		printk(KERN_INFO DRV_NAME ": %s: " fmt "\n",		\
-					  __func__, ## args);		\
-}
-
-/*
- * Power Management state machine (rq->pm->pm_step).
- *
- * For each step, the core calls ide_start_power_step() first.
- * This can return:
- *	- ide_stopped :	In this case, the core calls us back again unless
- *			step have been set to ide_power_state_completed.
- *	- ide_started :	In this case, the channel is left busy until an
- *			async event (interrupt) occurs.
- * Typically, ide_start_power_step() will issue a taskfile request with
- * do_rw_taskfile().
- *
- * Upon reception of the interrupt, the core will call ide_complete_power_step()
- * with the error code if any. This routine should update the step value
- * and return. It should not start a new request. The core will call
- * ide_start_power_step() for the new step value, unless step have been
- * set to IDE_PM_COMPLETED.
- */
-enum {
-	IDE_PM_START_SUSPEND,
-	IDE_PM_FLUSH_CACHE	= IDE_PM_START_SUSPEND,
-	IDE_PM_STANDBY,
-
-	IDE_PM_START_RESUME,
-	IDE_PM_RESTORE_PIO	= IDE_PM_START_RESUME,
-	IDE_PM_IDLE,
-	IDE_PM_RESTORE_DMA,
-
-	IDE_PM_COMPLETED,
-};
-
-int generic_ide_suspend(struct device *, pm_message_t);
-int generic_ide_resume(struct device *);
-
-void ide_complete_power_step(ide_drive_t *, struct request *);
-ide_startstop_t ide_start_power_step(ide_drive_t *, struct request *);
-void ide_complete_pm_rq(ide_drive_t *, struct request *);
-void ide_check_pm_state(ide_drive_t *, struct request *);
-
-/*
- * Subdrivers support.
- *
- * The gendriver.owner field should be set to the module owner of this driver.
- * The gendriver.name field should be set to the name of this driver
- */
-struct ide_driver {
-	const char			*version;
-	ide_startstop_t	(*do_request)(ide_drive_t *, struct request *, sector_t);
-	struct device_driver	gen_driver;
-	int		(*probe)(ide_drive_t *);
-	void		(*remove)(ide_drive_t *);
-	void		(*resume)(ide_drive_t *);
-	void		(*shutdown)(ide_drive_t *);
-#ifdef CONFIG_IDE_PROC_FS
-	ide_proc_entry_t *		(*proc_entries)(ide_drive_t *);
-	const struct ide_proc_devset *	(*proc_devsets)(ide_drive_t *);
-#endif
-};
-
-#define to_ide_driver(drv) container_of(drv, struct ide_driver, gen_driver)
-
-int ide_device_get(ide_drive_t *);
-void ide_device_put(ide_drive_t *);
-
-struct ide_ioctl_devset {
-	unsigned int	get_ioctl;
-	unsigned int	set_ioctl;
-	const struct ide_devset *setting;
-};
-
-int ide_setting_ioctl(ide_drive_t *, struct block_device *, unsigned int,
-		      unsigned long, const struct ide_ioctl_devset *);
-
-int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned long);
-
-extern int ide_vlb_clk;
-extern int ide_pci_clk;
-
-int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int);
-void ide_kill_rq(ide_drive_t *, struct request *);
-void ide_insert_request_head(ide_drive_t *, struct request *);
-
-void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int);
-void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int);
-
-void ide_execute_command(ide_drive_t *, struct ide_cmd *, ide_handler_t *,
-			 unsigned int);
-
-void ide_pad_transfer(ide_drive_t *, int, int);
-
-ide_startstop_t ide_error(ide_drive_t *, const char *, u8);
-
-void ide_fix_driveid(u16 *);
-
-extern void ide_fixstring(u8 *, const int, const int);
-
-int ide_busy_sleep(ide_drive_t *, unsigned long, int);
-
-int __ide_wait_stat(ide_drive_t *, u8, u8, unsigned long, u8 *);
-int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long);
-
-ide_startstop_t ide_do_park_unpark(ide_drive_t *, struct request *);
-ide_startstop_t ide_do_devset(ide_drive_t *, struct request *);
-
-extern ide_startstop_t ide_do_reset (ide_drive_t *);
-
-extern int ide_devset_execute(ide_drive_t *drive,
-			      const struct ide_devset *setting, int arg);
-
-void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8);
-int ide_complete_rq(ide_drive_t *, blk_status_t, unsigned int);
-
-void ide_tf_readback(ide_drive_t *drive, struct ide_cmd *cmd);
-void ide_tf_dump(const char *, struct ide_cmd *);
-
-void ide_exec_command(ide_hwif_t *, u8);
-u8 ide_read_status(ide_hwif_t *);
-u8 ide_read_altstatus(ide_hwif_t *);
-void ide_write_devctl(ide_hwif_t *, u8);
-
-void ide_dev_select(ide_drive_t *);
-void ide_tf_load(ide_drive_t *, struct ide_taskfile *, u8);
-void ide_tf_read(ide_drive_t *, struct ide_taskfile *, u8);
-
-void ide_input_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int);
-void ide_output_data(ide_drive_t *, struct ide_cmd *, void *, unsigned int);
-
-void SELECT_MASK(ide_drive_t *, int);
-
-u8 ide_read_error(ide_drive_t *);
-void ide_read_bcount_and_ireason(ide_drive_t *, u16 *, u8 *);
-
-int ide_check_ireason(ide_drive_t *, struct request *, int, int, int);
-
-int ide_check_atapi_device(ide_drive_t *, const char *);
-
-void ide_init_pc(struct ide_atapi_pc *);
-
-/* Disk head parking */
-extern wait_queue_head_t ide_park_wq;
-ssize_t ide_park_show(struct device *dev, struct device_attribute *attr,
-		      char *buf);
-ssize_t ide_park_store(struct device *dev, struct device_attribute *attr,
-		       const char *buf, size_t len);
-
-/*
- * Special requests for ide-tape block device strategy routine.
- *
- * In order to service a character device command, we add special requests to
- * the tail of our block device request queue and wait for their completion.
- */
-enum {
-	REQ_IDETAPE_PC1		= BIT(0), /* packet command (first stage) */
-	REQ_IDETAPE_PC2		= BIT(1), /* packet command (second stage) */
-	REQ_IDETAPE_READ	= BIT(2),
-	REQ_IDETAPE_WRITE	= BIT(3),
-};
-
-int ide_queue_pc_tail(ide_drive_t *, struct gendisk *, struct ide_atapi_pc *,
-		      void *, unsigned int);
-
-int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *);
-int ide_do_start_stop(ide_drive_t *, struct gendisk *, int);
-int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
-void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
-void ide_retry_pc(ide_drive_t *drive);
-
-void ide_prep_sense(ide_drive_t *drive, struct request *rq);
-int ide_queue_sense_rq(ide_drive_t *drive, void *special);
-
-int ide_cd_expiry(ide_drive_t *);
-
-int ide_cd_get_xferlen(struct request *);
-
-ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_cmd *);
-
-ide_startstop_t do_rw_taskfile(ide_drive_t *, struct ide_cmd *);
-
-void ide_pio_bytes(ide_drive_t *, struct ide_cmd *, unsigned int, unsigned int);
-
-void ide_finish_cmd(ide_drive_t *, struct ide_cmd *, u8);
-
-int ide_raw_taskfile(ide_drive_t *, struct ide_cmd *, u8 *, u16);
-int ide_no_data_taskfile(ide_drive_t *, struct ide_cmd *);
-
-int ide_taskfile_ioctl(ide_drive_t *, unsigned long);
-
-int ide_dev_read_id(ide_drive_t *, u8, u16 *, int);
-
-extern int ide_driveid_update(ide_drive_t *);
-extern int ide_config_drive_speed(ide_drive_t *, u8);
-extern u8 eighty_ninty_three (ide_drive_t *);
-extern int taskfile_lib_get_identify(ide_drive_t *drive, u8 *);
-
-extern int ide_wait_not_busy(ide_hwif_t *hwif, unsigned long timeout);
-
-extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout);
-
-extern void ide_timer_expiry(struct timer_list *t);
-extern irqreturn_t ide_intr(int irq, void *dev_id);
-extern blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
-extern blk_status_t ide_issue_rq(ide_drive_t *, struct request *, bool);
-extern void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq);
-
-void ide_init_disk(struct gendisk *, ide_drive_t *);
-
-#ifdef CONFIG_IDEPCI_PCIBUS_ORDER
-extern int __ide_pci_register_driver(struct pci_driver *driver, struct module *owner, const char *mod_name);
-#define ide_pci_register_driver(d) __ide_pci_register_driver(d, THIS_MODULE, KBUILD_MODNAME)
-#else
-#define ide_pci_register_driver(d) pci_register_driver(d)
-#endif
-
-static inline int ide_pci_is_in_compatibility_mode(struct pci_dev *dev)
-{
-	if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && (dev->class & 5) != 5)
-		return 1;
-	return 0;
-}
-
-void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *,
-			 struct ide_hw *, struct ide_hw **);
-void ide_setup_pci_noise(struct pci_dev *, const struct ide_port_info *);
-
-#ifdef CONFIG_BLK_DEV_IDEDMA_PCI
-int ide_pci_set_master(struct pci_dev *, const char *);
-unsigned long ide_pci_dma_base(ide_hwif_t *, const struct ide_port_info *);
-int ide_pci_check_simplex(ide_hwif_t *, const struct ide_port_info *);
-int ide_hwif_setup_dma(ide_hwif_t *, const struct ide_port_info *);
-#else
-static inline int ide_hwif_setup_dma(ide_hwif_t *hwif,
-				     const struct ide_port_info *d)
-{
-	return -EINVAL;
-}
-#endif
-
-struct ide_pci_enablebit {
-	u8	reg;	/* byte pci reg holding the enable-bit */
-	u8	mask;	/* mask to isolate the enable-bit */
-	u8	val;	/* value of masked reg when "enabled" */
-};
-
-enum {
-	/* Uses ISA control ports not PCI ones. */
-	IDE_HFLAG_ISA_PORTS		= BIT(0),
-	/* single port device */
-	IDE_HFLAG_SINGLE		= BIT(1),
-	/* don't use legacy PIO blacklist */
-	IDE_HFLAG_PIO_NO_BLACKLIST	= BIT(2),
-	/* set for the second port of QD65xx */
-	IDE_HFLAG_QD_2ND_PORT		= BIT(3),
-	/* use PIO8/9 for prefetch off/on */
-	IDE_HFLAG_ABUSE_PREFETCH	= BIT(4),
-	/* use PIO6/7 for fast-devsel off/on */
-	IDE_HFLAG_ABUSE_FAST_DEVSEL	= BIT(5),
-	/* use 100-102 and 200-202 PIO values to set DMA modes */
-	IDE_HFLAG_ABUSE_DMA_MODES	= BIT(6),
-	/*
-	 * keep DMA setting when programming PIO mode, may be used only
-	 * for hosts which have separate PIO and DMA timings (ie. PMAC)
-	 */
-	IDE_HFLAG_SET_PIO_MODE_KEEP_DMA	= BIT(7),
-	/* program host for the transfer mode after programming device */
-	IDE_HFLAG_POST_SET_MODE		= BIT(8),
-	/* don't program host/device for the transfer mode ("smart" hosts) */
-	IDE_HFLAG_NO_SET_MODE		= BIT(9),
-	/* trust BIOS for programming chipset/device for DMA */
-	IDE_HFLAG_TRUST_BIOS_FOR_DMA	= BIT(10),
-	/* host is CS5510/CS5520 */
-	IDE_HFLAG_CS5520		= BIT(11),
-	/* ATAPI DMA is unsupported */
-	IDE_HFLAG_NO_ATAPI_DMA		= BIT(12),
-	/* set if host is a "non-bootable" controller */
-	IDE_HFLAG_NON_BOOTABLE		= BIT(13),
-	/* host doesn't support DMA */
-	IDE_HFLAG_NO_DMA		= BIT(14),
-	/* check if host is PCI IDE device before allowing DMA */
-	IDE_HFLAG_NO_AUTODMA		= BIT(15),
-	/* host uses MMIO */
-	IDE_HFLAG_MMIO			= BIT(16),
-	/* no LBA48 */
-	IDE_HFLAG_NO_LBA48		= BIT(17),
-	/* no LBA48 DMA */
-	IDE_HFLAG_NO_LBA48_DMA		= BIT(18),
-	/* data FIFO is cleared by an error */
-	IDE_HFLAG_ERROR_STOPS_FIFO	= BIT(19),
-	/* serialize ports */
-	IDE_HFLAG_SERIALIZE		= BIT(20),
-	/* host is DTC2278 */
-	IDE_HFLAG_DTC2278		= BIT(21),
-	/* 4 devices on a single set of I/O ports */
-	IDE_HFLAG_4DRIVES		= BIT(22),
-	/* host is TRM290 */
-	IDE_HFLAG_TRM290		= BIT(23),
-	/* use 32-bit I/O ops */
-	IDE_HFLAG_IO_32BIT		= BIT(24),
-	/* unmask IRQs */
-	IDE_HFLAG_UNMASK_IRQS		= BIT(25),
-	IDE_HFLAG_BROKEN_ALTSTATUS	= BIT(26),
-	/* serialize ports if DMA is possible (for sl82c105) */
-	IDE_HFLAG_SERIALIZE_DMA		= BIT(27),
-	/* force host out of "simplex" mode */
-	IDE_HFLAG_CLEAR_SIMPLEX		= BIT(28),
-	/* DSC overlap is unsupported */
-	IDE_HFLAG_NO_DSC		= BIT(29),
-	/* never use 32-bit I/O ops */
-	IDE_HFLAG_NO_IO_32BIT		= BIT(30),
-	/* never unmask IRQs */
-	IDE_HFLAG_NO_UNMASK_IRQS	= BIT(31),
-};
-
-#ifdef CONFIG_BLK_DEV_OFFBOARD
-# define IDE_HFLAG_OFF_BOARD	0
-#else
-# define IDE_HFLAG_OFF_BOARD	IDE_HFLAG_NON_BOOTABLE
-#endif
-
-struct ide_port_info {
-	char			*name;
-
-	int			(*init_chipset)(struct pci_dev *);
-
-	void			(*get_lock)(irq_handler_t, void *);
-	void			(*release_lock)(void);
-
-	void			(*init_iops)(ide_hwif_t *);
-	void                    (*init_hwif)(ide_hwif_t *);
-	int			(*init_dma)(ide_hwif_t *,
-					    const struct ide_port_info *);
-
-	const struct ide_tp_ops		*tp_ops;
-	const struct ide_port_ops	*port_ops;
-	const struct ide_dma_ops	*dma_ops;
-
-	struct ide_pci_enablebit	enablebits[2];
-
-	hwif_chipset_t		chipset;
-
-	u16			max_sectors;	/* if < than the default one */
-
-	u32			host_flags;
-
-	int			irq_flags;
-
-	u8			pio_mask;
-	u8			swdma_mask;
-	u8			mwdma_mask;
-	u8			udma_mask;
-};
-
-/*
- * State information carried for REQ_TYPE_ATA_PM_SUSPEND and REQ_TYPE_ATA_PM_RESUME
- * requests.
- */
-struct ide_pm_state {
-	/* PM state machine step value, currently driver specific */
-	int	pm_step;
-	/* requested PM state value (S1, S2, S3, S4, ...) */
-	u32	pm_state;
-	void*	data;		/* for driver use */
-};
-
-
-int ide_pci_init_one(struct pci_dev *, const struct ide_port_info *, void *);
-int ide_pci_init_two(struct pci_dev *, struct pci_dev *,
-		     const struct ide_port_info *, void *);
-void ide_pci_remove(struct pci_dev *);
-
-#ifdef CONFIG_PM
-int ide_pci_suspend(struct pci_dev *, pm_message_t);
-int ide_pci_resume(struct pci_dev *);
-#else
-#define ide_pci_suspend NULL
-#define ide_pci_resume NULL
-#endif
-
-void ide_map_sg(ide_drive_t *, struct ide_cmd *);
-void ide_init_sg_cmd(struct ide_cmd *, unsigned int);
-
-#define BAD_DMA_DRIVE		0
-#define GOOD_DMA_DRIVE		1
-
-struct drive_list_entry {
-	const char *id_model;
-	const char *id_firmware;
-};
-
-int ide_in_drive_list(u16 *, const struct drive_list_entry *);
-
-#ifdef CONFIG_BLK_DEV_IDEDMA
-int ide_dma_good_drive(ide_drive_t *);
-int __ide_dma_bad_drive(ide_drive_t *);
-
-u8 ide_find_dma_mode(ide_drive_t *, u8);
-
-static inline u8 ide_max_dma_mode(ide_drive_t *drive)
-{
-	return ide_find_dma_mode(drive, XFER_UDMA_6);
-}
-
-void ide_dma_off_quietly(ide_drive_t *);
-void ide_dma_off(ide_drive_t *);
-void ide_dma_on(ide_drive_t *);
-int ide_set_dma(ide_drive_t *);
-void ide_check_dma_crc(ide_drive_t *);
-ide_startstop_t ide_dma_intr(ide_drive_t *);
-
-int ide_allocate_dma_engine(ide_hwif_t *);
-void ide_release_dma_engine(ide_hwif_t *);
-
-int ide_dma_prepare(ide_drive_t *, struct ide_cmd *);
-void ide_dma_unmap_sg(ide_drive_t *, struct ide_cmd *);
-
-#ifdef CONFIG_BLK_DEV_IDEDMA_SFF
-int config_drive_for_dma(ide_drive_t *);
-int ide_build_dmatable(ide_drive_t *, struct ide_cmd *);
-void ide_dma_host_set(ide_drive_t *, int);
-int ide_dma_setup(ide_drive_t *, struct ide_cmd *);
-extern void ide_dma_start(ide_drive_t *);
-int ide_dma_end(ide_drive_t *);
-int ide_dma_test_irq(ide_drive_t *);
-int ide_dma_sff_timer_expiry(ide_drive_t *);
-u8 ide_dma_sff_read_status(ide_hwif_t *);
-extern const struct ide_dma_ops sff_dma_ops;
-#else
-static inline int config_drive_for_dma(ide_drive_t *drive) { return 0; }
-#endif /* CONFIG_BLK_DEV_IDEDMA_SFF */
-
-void ide_dma_lost_irq(ide_drive_t *);
-ide_startstop_t ide_dma_timeout_retry(ide_drive_t *, int);
-
-#else
-static inline u8 ide_find_dma_mode(ide_drive_t *drive, u8 speed) { return 0; }
-static inline u8 ide_max_dma_mode(ide_drive_t *drive) { return 0; }
-static inline void ide_dma_off_quietly(ide_drive_t *drive) { ; }
-static inline void ide_dma_off(ide_drive_t *drive) { ; }
-static inline void ide_dma_on(ide_drive_t *drive) { ; }
-static inline void ide_dma_verbose(ide_drive_t *drive) { ; }
-static inline int ide_set_dma(ide_drive_t *drive) { return 1; }
-static inline void ide_check_dma_crc(ide_drive_t *drive) { ; }
-static inline ide_startstop_t ide_dma_intr(ide_drive_t *drive) { return ide_stopped; }
-static inline ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) { return ide_stopped; }
-static inline void ide_release_dma_engine(ide_hwif_t *hwif) { ; }
-static inline int ide_dma_prepare(ide_drive_t *drive,
-				  struct ide_cmd *cmd) { return 1; }
-static inline void ide_dma_unmap_sg(ide_drive_t *drive,
-				    struct ide_cmd *cmd) { ; }
-#endif /* CONFIG_BLK_DEV_IDEDMA */
-
-#ifdef CONFIG_BLK_DEV_IDEACPI
-int ide_acpi_init(void);
-bool ide_port_acpi(ide_hwif_t *hwif);
-extern int ide_acpi_exec_tfs(ide_drive_t *drive);
-extern void ide_acpi_get_timing(ide_hwif_t *hwif);
-extern void ide_acpi_push_timing(ide_hwif_t *hwif);
-void ide_acpi_init_port(ide_hwif_t *);
-void ide_acpi_port_init_devices(ide_hwif_t *);
-extern void ide_acpi_set_state(ide_hwif_t *hwif, int on);
-#else
-static inline int ide_acpi_init(void) { return 0; }
-static inline bool ide_port_acpi(ide_hwif_t *hwif) { return 0; }
-static inline int ide_acpi_exec_tfs(ide_drive_t *drive) { return 0; }
-static inline void ide_acpi_get_timing(ide_hwif_t *hwif) { ; }
-static inline void ide_acpi_push_timing(ide_hwif_t *hwif) { ; }
-static inline void ide_acpi_init_port(ide_hwif_t *hwif) { ; }
-static inline void ide_acpi_port_init_devices(ide_hwif_t *hwif) { ; }
-static inline void ide_acpi_set_state(ide_hwif_t *hwif, int on) {}
-#endif
-
-void ide_check_nien_quirk_list(ide_drive_t *);
-void ide_undecoded_slave(ide_drive_t *);
-
-void ide_port_apply_params(ide_hwif_t *);
-int ide_sysfs_register_port(ide_hwif_t *);
-
-struct ide_host *ide_host_alloc(const struct ide_port_info *, struct ide_hw **,
-				unsigned int);
-void ide_host_free(struct ide_host *);
-int ide_host_register(struct ide_host *, const struct ide_port_info *,
-		      struct ide_hw **);
-int ide_host_add(const struct ide_port_info *, struct ide_hw **, unsigned int,
-		 struct ide_host **);
-void ide_host_remove(struct ide_host *);
-int ide_legacy_device_add(const struct ide_port_info *, unsigned long);
-void ide_port_unregister_devices(ide_hwif_t *);
-void ide_port_scan(ide_hwif_t *);
-
-static inline void *ide_get_hwifdata (ide_hwif_t * hwif)
-{
-	return hwif->hwif_data;
-}
-
-static inline void ide_set_hwifdata (ide_hwif_t * hwif, void *data)
-{
-	hwif->hwif_data = data;
-}
-
-u64 ide_get_lba_addr(struct ide_cmd *, int);
-u8 ide_dump_status(ide_drive_t *, const char *, u8);
-
-struct ide_timing {
-	u8  mode;
-	u8  setup;	/* t1 */
-	u16 act8b;	/* t2 for 8-bit io */
-	u16 rec8b;	/* t2i for 8-bit io */
-	u16 cyc8b;	/* t0 for 8-bit io */
-	u16 active;	/* t2 or tD */
-	u16 recover;	/* t2i or tK */
-	u16 cycle;	/* t0 */
-	u16 udma;	/* t2CYCTYP/2 */
-};
-
-enum {
-	IDE_TIMING_SETUP	= BIT(0),
-	IDE_TIMING_ACT8B	= BIT(1),
-	IDE_TIMING_REC8B	= BIT(2),
-	IDE_TIMING_CYC8B	= BIT(3),
-	IDE_TIMING_8BIT		= IDE_TIMING_ACT8B | IDE_TIMING_REC8B |
-				  IDE_TIMING_CYC8B,
-	IDE_TIMING_ACTIVE	= BIT(4),
-	IDE_TIMING_RECOVER	= BIT(5),
-	IDE_TIMING_CYCLE	= BIT(6),
-	IDE_TIMING_UDMA		= BIT(7),
-	IDE_TIMING_ALL		= IDE_TIMING_SETUP | IDE_TIMING_8BIT |
-				  IDE_TIMING_ACTIVE | IDE_TIMING_RECOVER |
-				  IDE_TIMING_CYCLE | IDE_TIMING_UDMA,
-};
-
-struct ide_timing *ide_timing_find_mode(u8);
-u16 ide_pio_cycle_time(ide_drive_t *, u8);
-void ide_timing_merge(struct ide_timing *, struct ide_timing *,
-		      struct ide_timing *, unsigned int);
-int ide_timing_compute(ide_drive_t *, u8, struct ide_timing *, int, int);
-
-#ifdef CONFIG_IDE_XFER_MODE
-int ide_scan_pio_blacklist(char *);
-const char *ide_xfer_verbose(u8);
-int ide_pio_need_iordy(ide_drive_t *, const u8);
-int ide_set_pio_mode(ide_drive_t *, u8);
-int ide_set_dma_mode(ide_drive_t *, u8);
-void ide_set_pio(ide_drive_t *, u8);
-int ide_set_xfer_rate(ide_drive_t *, u8);
-#else
-static inline void ide_set_pio(ide_drive_t *drive, u8 pio) { ; }
-static inline int ide_set_xfer_rate(ide_drive_t *drive, u8 rate) { return -1; }
-#endif
-
-static inline void ide_set_max_pio(ide_drive_t *drive)
-{
-	ide_set_pio(drive, 255);
-}
-
-char *ide_media_string(ide_drive_t *);
-
-extern const struct attribute_group *ide_dev_groups[];
-extern struct bus_type ide_bus_type;
-extern struct class *ide_port_class;
-
-static inline void ide_dump_identify(u8 *id)
-{
-	print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 2, id, 512, 0);
-}
-
-static inline int hwif_to_node(ide_hwif_t *hwif)
-{
-	return hwif->dev ? dev_to_node(hwif->dev) : -1;
-}
-
-static inline ide_drive_t *ide_get_pair_dev(ide_drive_t *drive)
-{
-	ide_drive_t *peer = drive->hwif->devices[(drive->dn ^ 1) & 1];
-
-	return (peer->dev_flags & IDE_DFLAG_PRESENT) ? peer : NULL;
-}
-
-static inline void *ide_get_drivedata(ide_drive_t *drive)
-{
-	return drive->drive_data;
-}
-
-static inline void ide_set_drivedata(ide_drive_t *drive, void *data)
-{
-	drive->drive_data = data;
-}
-
-#define ide_port_for_each_dev(i, dev, port) \
-	for ((i) = 0; ((dev) = (port)->devices[i]) || (i) < MAX_DRIVES; (i)++)
-
-#define ide_port_for_each_present_dev(i, dev, port) \
-	for ((i) = 0; ((dev) = (port)->devices[i]) || (i) < MAX_DRIVES; (i)++) \
-		if ((dev)->dev_flags & IDE_DFLAG_PRESENT)
-
-#define ide_host_for_each_port(i, port, host) \
-	for ((i) = 0; ((port) = (host)->ports[i]) || (i) < MAX_HOST_PORTS; (i)++)
-
-
-#endif /* _IDE_H */
-- 
cgit v1.2.3


From 0e8512fab9fd6d78e88931c02a43b04d15566d6b Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 4 Jun 2021 15:47:49 +0200
Subject: platform/surface: aggregator: Allow registering notifiers without
 enabling events

Currently, each SSAM event notifier is directly tied to one group of
events. This makes sense as registering a notifier will automatically
take care of enabling the corresponding event group and normally drivers
only need notifications for a very limited number of events, associated
with different callbacks for each group.

However, there are rare cases, especially for debugging, when we want to
get notifications for a whole event target category instead of just a
single group of events in that category. Registering multiple notifiers,
i.e. one per group, may be infeasible due to two issues: a) we might not
know every event enable/disable specification as some events are
auto-enabled by the EC and b) forwarding this to the same callback will
lead to duplicate events as we might not know the full event
specification to perform the appropriate filtering.

This commit introduces observer-notifiers, which are notifiers that are
not tied to a specific event group and do not attempt to manage any
events. In other words, they can be registered without enabling any
event group or incrementing the corresponding reference count and just
act as silent observers, listening to all currently/previously enabled
events based on their match-specification.

Essentially, this allows us to register one single notifier for a full
event target category, meaning that we can process all events of that
target category in a single callback without duplication. Specifically,
this will be used in the cdev debug interface to forward events to
user-space via a device file from which the events can be read.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20210604134755.535590-2-luzmaximilian@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/surface/aggregator/controller.c | 69 ++++++++++++++----------
 include/linux/surface_aggregator/controller.h    | 17 ++++++
 2 files changed, 58 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c
index a06964aa96e7..cd3a6b77f48d 100644
--- a/drivers/platform/surface/aggregator/controller.c
+++ b/drivers/platform/surface/aggregator/controller.c
@@ -2127,9 +2127,15 @@ int ssam_ctrl_notif_d0_entry(struct ssam_controller *ctrl)
  * @ctrl: The controller to register the notifier on.
  * @n:    The event notifier to register.
  *
- * Register an event notifier and increment the usage counter of the
- * associated SAM event. If the event was previously not enabled, it will be
- * enabled during this call.
+ * Register an event notifier. Increment the usage counter of the associated
+ * SAM event if the notifier is not marked as an observer. If the event is not
+ * marked as an observer and is currently not enabled, it will be enabled
+ * during this call. If the notifier is marked as an observer, no attempt will
+ * be made at enabling any event and no reference count will be modified.
+ *
+ * Notifiers marked as observers do not need to be associated with one specific
+ * event, i.e. as long as no event matching is performed, only the event target
+ * category needs to be set.
  *
  * Return: Returns zero on success, %-ENOSPC if there have already been
  * %INT_MAX notifiers for the event ID/type associated with the notifier block
@@ -2138,11 +2144,10 @@ int ssam_ctrl_notif_d0_entry(struct ssam_controller *ctrl)
  * for the specific associated event, returns the status of the event-enable
  * EC-command.
  */
-int ssam_notifier_register(struct ssam_controller *ctrl,
-			   struct ssam_event_notifier *n)
+int ssam_notifier_register(struct ssam_controller *ctrl, struct ssam_event_notifier *n)
 {
 	u16 rqid = ssh_tc_to_rqid(n->event.id.target_category);
-	struct ssam_nf_refcount_entry *entry;
+	struct ssam_nf_refcount_entry *entry = NULL;
 	struct ssam_nf_head *nf_head;
 	struct ssam_nf *nf;
 	int status;
@@ -2155,29 +2160,32 @@ int ssam_notifier_register(struct ssam_controller *ctrl,
 
 	mutex_lock(&nf->lock);
 
-	entry = ssam_nf_refcount_inc(nf, n->event.reg, n->event.id);
-	if (IS_ERR(entry)) {
-		mutex_unlock(&nf->lock);
-		return PTR_ERR(entry);
-	}
+	if (!(n->flags & SSAM_EVENT_NOTIFIER_OBSERVER)) {
+		entry = ssam_nf_refcount_inc(nf, n->event.reg, n->event.id);
+		if (IS_ERR(entry)) {
+			mutex_unlock(&nf->lock);
+			return PTR_ERR(entry);
+		}
 
-	ssam_dbg(ctrl, "enabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
-		 n->event.reg.target_category, n->event.id.target_category,
-		 n->event.id.instance, entry->refcount);
+		ssam_dbg(ctrl, "enabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
+			 n->event.reg.target_category, n->event.id.target_category,
+			 n->event.id.instance, entry->refcount);
+	}
 
 	status = ssam_nfblk_insert(nf_head, &n->base);
 	if (status) {
-		entry = ssam_nf_refcount_dec(nf, n->event.reg, n->event.id);
-		if (entry->refcount == 0)
-			kfree(entry);
+		if (entry) {
+			entry = ssam_nf_refcount_dec(nf, n->event.reg, n->event.id);
+			if (entry->refcount == 0)
+				kfree(entry);
+		}
 
 		mutex_unlock(&nf->lock);
 		return status;
 	}
 
-	if (entry->refcount == 1) {
-		status = ssam_ssh_event_enable(ctrl, n->event.reg, n->event.id,
-					       n->event.flags);
+	if (entry && entry->refcount == 1) {
+		status = ssam_ssh_event_enable(ctrl, n->event.reg, n->event.id, n->event.flags);
 		if (status) {
 			ssam_nfblk_remove(&n->base);
 			kfree(ssam_nf_refcount_dec(nf, n->event.reg, n->event.id));
@@ -2188,7 +2196,7 @@ int ssam_notifier_register(struct ssam_controller *ctrl,
 
 		entry->flags = n->event.flags;
 
-	} else if (entry->flags != n->event.flags) {
+	} else if (entry && entry->flags != n->event.flags) {
 		ssam_warn(ctrl,
 			  "inconsistent flags when enabling event: got %#04x, expected %#04x (reg: %#04x, tc: %#04x, iid: %#04x)\n",
 			  n->event.flags, entry->flags, n->event.reg.target_category,
@@ -2205,17 +2213,16 @@ EXPORT_SYMBOL_GPL(ssam_notifier_register);
  * @ctrl: The controller the notifier has been registered on.
  * @n:    The event notifier to unregister.
  *
- * Unregister an event notifier and decrement the usage counter of the
- * associated SAM event. If the usage counter reaches zero, the event will be
- * disabled.
+ * Unregister an event notifier. Decrement the usage counter of the associated
+ * SAM event if the notifier is not marked as an observer. If the usage counter
+ * reaches zero, the event will be disabled.
  *
  * Return: Returns zero on success, %-ENOENT if the given notifier block has
  * not been registered on the controller. If the given notifier block was the
  * last one associated with its specific event, returns the status of the
  * event-disable EC-command.
  */
-int ssam_notifier_unregister(struct ssam_controller *ctrl,
-			     struct ssam_event_notifier *n)
+int ssam_notifier_unregister(struct ssam_controller *ctrl, struct ssam_event_notifier *n)
 {
 	u16 rqid = ssh_tc_to_rqid(n->event.id.target_category);
 	struct ssam_nf_refcount_entry *entry;
@@ -2236,6 +2243,13 @@ int ssam_notifier_unregister(struct ssam_controller *ctrl,
 		return -ENOENT;
 	}
 
+	/*
+	 * If this is an observer notifier, do not attempt to disable the
+	 * event, just remove it.
+	 */
+	if (n->flags & SSAM_EVENT_NOTIFIER_OBSERVER)
+		goto remove;
+
 	entry = ssam_nf_refcount_dec(nf, n->event.reg, n->event.id);
 	if (WARN_ON(!entry)) {
 		/*
@@ -2260,8 +2274,7 @@ int ssam_notifier_unregister(struct ssam_controller *ctrl,
 	}
 
 	if (entry->refcount == 0) {
-		status = ssam_ssh_event_disable(ctrl, n->event.reg, n->event.id,
-						n->event.flags);
+		status = ssam_ssh_event_disable(ctrl, n->event.reg, n->event.id, n->event.flags);
 		kfree(entry);
 	}
 
diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index 0806796eabcb..cf4bb48a850e 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -795,6 +795,20 @@ enum ssam_event_mask {
 #define SSAM_EVENT_REGISTRY_REG \
 	SSAM_EVENT_REGISTRY(SSAM_SSH_TC_REG, 0x02, 0x01, 0x02)
 
+/**
+ * enum ssam_event_notifier_flags - Flags for event notifiers.
+ * @SSAM_EVENT_NOTIFIER_OBSERVER:
+ *	The corresponding notifier acts as observer. Registering a notifier
+ *	with this flag set will not attempt to enable any event. Equally,
+ *	unregistering will not attempt to disable any event. Note that a
+ *	notifier with this flag may not even correspond to a certain event at
+ *	all, only to a specific event target category. Event matching will not
+ *	be influenced by this flag.
+ */
+enum ssam_event_notifier_flags {
+	SSAM_EVENT_NOTIFIER_OBSERVER = BIT(0),
+};
+
 /**
  * struct ssam_event_notifier - Notifier block for SSAM events.
  * @base:        The base notifier block with callback function and priority.
@@ -803,6 +817,7 @@ enum ssam_event_mask {
  * @event.id:    ID specifying the event.
  * @event.mask:  Flags determining how events are matched to the notifier.
  * @event.flags: Flags used for enabling the event.
+ * @flags:       Notifier flags (see &enum ssam_event_notifier_flags).
  */
 struct ssam_event_notifier {
 	struct ssam_notifier_block base;
@@ -813,6 +828,8 @@ struct ssam_event_notifier {
 		enum ssam_event_mask mask;
 		u8 flags;
 	} event;
+
+	unsigned long flags;
 };
 
 int ssam_notifier_register(struct ssam_controller *ctrl,
-- 
cgit v1.2.3


From 4b38a1dcf378f5075884b54dc5afeb9d0dfe7681 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 4 Jun 2021 15:47:50 +0200
Subject: platform/surface: aggregator: Allow enabling of events without
 notifiers

We can already enable and disable SAM events via one of two ways: either
via a (non-observer) notifier tied to a specific event group, or a
generic event enable/disable request. In some instances, however,
neither method may be desirable.

The first method will tie the event enable request to a specific
notifier, however, when we want to receive notifications for multiple
event groups of the same target category and forward this to the same
notifier callback, we may receive duplicate events, i.e. one event per
registered notifier. The second method will bypass the internal
reference counting mechanism, meaning that a disable request will
disable the event regardless of any other client driver using it, which
may break the functionality of that driver.

To address this problem, add new functions that allow enabling and
disabling of events via the event reference counting mechanism built
into the controller, without needing to register a notifier.

This can then be used in combination with observer notifiers to process
multiple events of the same target category without duplication in the
same callback function.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20210604134755.535590-3-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/surface/aggregator/controller.c | 293 +++++++++++++++++++----
 include/linux/surface_aggregator/controller.h    |   8 +
 2 files changed, 253 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c
index cd3a6b77f48d..cedd0f779f7a 100644
--- a/drivers/platform/surface/aggregator/controller.c
+++ b/drivers/platform/surface/aggregator/controller.c
@@ -407,6 +407,31 @@ ssam_nf_refcount_dec(struct ssam_nf *nf, struct ssam_event_registry reg,
 	return NULL;
 }
 
+/**
+ * ssam_nf_refcount_dec_free() - Decrement reference-/activation-count of the
+ * given event and free its entry if the reference count reaches zero.
+ * @nf:  The notifier system reference.
+ * @reg: The registry used to enable/disable the event.
+ * @id:  The event ID.
+ *
+ * Decrements the reference-/activation-count of the specified event, freeing
+ * its entry if it reaches zero.
+ *
+ * Note: ``nf->lock`` must be held when calling this function.
+ */
+static void ssam_nf_refcount_dec_free(struct ssam_nf *nf,
+				      struct ssam_event_registry reg,
+				      struct ssam_event_id id)
+{
+	struct ssam_nf_refcount_entry *entry;
+
+	lockdep_assert_held(&nf->lock);
+
+	entry = ssam_nf_refcount_dec(nf, reg, id);
+	if (entry && entry->refcount == 0)
+		kfree(entry);
+}
+
 /**
  * ssam_nf_refcount_empty() - Test if the notification system has any
  * enabled/active events.
@@ -2122,6 +2147,109 @@ int ssam_ctrl_notif_d0_entry(struct ssam_controller *ctrl)
 
 /* -- Top-level event registry interface. ----------------------------------- */
 
+/**
+ * ssam_nf_refcount_enable() - Enable event for reference count entry if it has
+ * not already been enabled.
+ * @ctrl:  The controller to enable the event on.
+ * @entry: The reference count entry for the event to be enabled.
+ * @flags: The flags used for enabling the event on the EC.
+ *
+ * Enable the event associated with the given reference count entry if the
+ * reference count equals one, i.e. the event has not previously been enabled.
+ * If the event has already been enabled (i.e. reference count not equal to
+ * one), check that the flags used for enabling match and warn about this if
+ * they do not.
+ *
+ * This does not modify the reference count itself, which is done with
+ * ssam_nf_refcount_inc() / ssam_nf_refcount_dec().
+ *
+ * Note: ``nf->lock`` must be held when calling this function.
+ *
+ * Return: Returns zero on success. If the event is enabled by this call,
+ * returns the status of the event-enable EC command.
+ */
+static int ssam_nf_refcount_enable(struct ssam_controller *ctrl,
+				   struct ssam_nf_refcount_entry *entry, u8 flags)
+{
+	const struct ssam_event_registry reg = entry->key.reg;
+	const struct ssam_event_id id = entry->key.id;
+	struct ssam_nf *nf = &ctrl->cplt.event.notif;
+	int status;
+
+	lockdep_assert_held(&nf->lock);
+
+	ssam_dbg(ctrl, "enabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
+		 reg.target_category, id.target_category, id.instance, entry->refcount);
+
+	if (entry->refcount == 1) {
+		status = ssam_ssh_event_enable(ctrl, reg, id, flags);
+		if (status)
+			return status;
+
+		entry->flags = flags;
+
+	} else if (entry->flags != flags) {
+		ssam_warn(ctrl,
+			  "inconsistent flags when enabling event: got %#04x, expected %#04x (reg: %#04x, tc: %#04x, iid: %#04x)\n",
+			  flags, entry->flags, reg.target_category, id.target_category,
+			  id.instance);
+	}
+
+	return 0;
+}
+
+/**
+ * ssam_nf_refcount_disable_free() - Disable event for reference count entry if it is
+ * no longer in use and free the corresponding entry.
+ * @ctrl:  The controller to disable the event on.
+ * @entry: The reference count entry for the event to be disabled.
+ * @flags: The flags used for enabling the event on the EC.
+ *
+ * If the reference count equals zero, i.e. the event is no longer requested by
+ * any client, the event will be disabled and the corresponding reference count
+ * entry freed. The reference count entry must not be used any more after a
+ * call to this function.
+ *
+ * Also checks if the flags used for disabling the event match the flags used
+ * for enabling the event and warns if they do not (regardless of reference
+ * count).
+ *
+ * This does not modify the reference count itself, which is done with
+ * ssam_nf_refcount_inc() / ssam_nf_refcount_dec().
+ *
+ * Note: ``nf->lock`` must be held when calling this function.
+ *
+ * Return: Returns zero on success. If the event is disabled by this call,
+ * returns the status of the event-enable EC command.
+ */
+static int ssam_nf_refcount_disable_free(struct ssam_controller *ctrl,
+					 struct ssam_nf_refcount_entry *entry, u8 flags)
+{
+	const struct ssam_event_registry reg = entry->key.reg;
+	const struct ssam_event_id id = entry->key.id;
+	struct ssam_nf *nf = &ctrl->cplt.event.notif;
+	int status;
+
+	lockdep_assert_held(&nf->lock);
+
+	ssam_dbg(ctrl, "disabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
+		 reg.target_category, id.target_category, id.instance, entry->refcount);
+
+	if (entry->flags != flags) {
+		ssam_warn(ctrl,
+			  "inconsistent flags when disabling event: got %#04x, expected %#04x (reg: %#04x, tc: %#04x, iid: %#04x)\n",
+			  flags, entry->flags, reg.target_category, id.target_category,
+			  id.instance);
+	}
+
+	if (entry->refcount == 0) {
+		status = ssam_ssh_event_disable(ctrl, reg, id, flags);
+		kfree(entry);
+	}
+
+	return status;
+}
+
 /**
  * ssam_notifier_register() - Register an event notifier.
  * @ctrl: The controller to register the notifier on.
@@ -2166,41 +2294,26 @@ int ssam_notifier_register(struct ssam_controller *ctrl, struct ssam_event_notif
 			mutex_unlock(&nf->lock);
 			return PTR_ERR(entry);
 		}
-
-		ssam_dbg(ctrl, "enabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
-			 n->event.reg.target_category, n->event.id.target_category,
-			 n->event.id.instance, entry->refcount);
 	}
 
 	status = ssam_nfblk_insert(nf_head, &n->base);
 	if (status) {
-		if (entry) {
-			entry = ssam_nf_refcount_dec(nf, n->event.reg, n->event.id);
-			if (entry->refcount == 0)
-				kfree(entry);
-		}
+		if (entry)
+			ssam_nf_refcount_dec_free(nf, n->event.reg, n->event.id);
 
 		mutex_unlock(&nf->lock);
 		return status;
 	}
 
-	if (entry && entry->refcount == 1) {
-		status = ssam_ssh_event_enable(ctrl, n->event.reg, n->event.id, n->event.flags);
+	if (entry) {
+		status = ssam_nf_refcount_enable(ctrl, entry, n->event.flags);
 		if (status) {
 			ssam_nfblk_remove(&n->base);
-			kfree(ssam_nf_refcount_dec(nf, n->event.reg, n->event.id));
+			ssam_nf_refcount_dec_free(nf, n->event.reg, n->event.id);
 			mutex_unlock(&nf->lock);
 			synchronize_srcu(&nf_head->srcu);
 			return status;
 		}
-
-		entry->flags = n->event.flags;
-
-	} else if (entry && entry->flags != n->event.flags) {
-		ssam_warn(ctrl,
-			  "inconsistent flags when enabling event: got %#04x, expected %#04x (reg: %#04x, tc: %#04x, iid: %#04x)\n",
-			  n->event.flags, entry->flags, n->event.reg.target_category,
-			  n->event.id.target_category, n->event.id.instance);
 	}
 
 	mutex_unlock(&nf->lock);
@@ -2247,35 +2360,20 @@ int ssam_notifier_unregister(struct ssam_controller *ctrl, struct ssam_event_not
 	 * If this is an observer notifier, do not attempt to disable the
 	 * event, just remove it.
 	 */
-	if (n->flags & SSAM_EVENT_NOTIFIER_OBSERVER)
-		goto remove;
-
-	entry = ssam_nf_refcount_dec(nf, n->event.reg, n->event.id);
-	if (WARN_ON(!entry)) {
-		/*
-		 * If this does not return an entry, there's a logic error
-		 * somewhere: The notifier block is registered, but the event
-		 * refcount entry is not there. Remove the notifier block
-		 * anyways.
-		 */
-		status = -ENOENT;
-		goto remove;
-	}
-
-	ssam_dbg(ctrl, "disabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
-		 n->event.reg.target_category, n->event.id.target_category,
-		 n->event.id.instance, entry->refcount);
-
-	if (entry->flags != n->event.flags) {
-		ssam_warn(ctrl,
-			  "inconsistent flags when disabling event: got %#04x, expected %#04x (reg: %#04x, tc: %#04x, iid: %#04x)\n",
-			  n->event.flags, entry->flags, n->event.reg.target_category,
-			  n->event.id.target_category, n->event.id.instance);
-	}
+	if (!(n->flags & SSAM_EVENT_NOTIFIER_OBSERVER)) {
+		entry = ssam_nf_refcount_dec(nf, n->event.reg, n->event.id);
+		if (WARN_ON(!entry)) {
+			/*
+			 * If this does not return an entry, there's a logic
+			 * error somewhere: The notifier block is registered,
+			 * but the event refcount entry is not there. Remove
+			 * the notifier block anyways.
+			 */
+			status = -ENOENT;
+			goto remove;
+		}
 
-	if (entry->refcount == 0) {
-		status = ssam_ssh_event_disable(ctrl, n->event.reg, n->event.id, n->event.flags);
-		kfree(entry);
+		status = ssam_nf_refcount_disable_free(ctrl, entry, n->event.flags);
 	}
 
 remove:
@@ -2287,6 +2385,105 @@ remove:
 }
 EXPORT_SYMBOL_GPL(ssam_notifier_unregister);
 
+/**
+ * ssam_controller_event_enable() - Enable the specified event.
+ * @ctrl:  The controller to enable the event for.
+ * @reg:   The event registry to use for enabling the event.
+ * @id:    The event ID specifying the event to be enabled.
+ * @flags: The SAM event flags used for enabling the event.
+ *
+ * Increment the event reference count of the specified event. If the event has
+ * not been enabled previously, it will be enabled by this call.
+ *
+ * Note: In general, ssam_notifier_register() with a non-observer notifier
+ * should be preferred for enabling/disabling events, as this will guarantee
+ * proper ordering and event forwarding in case of errors during event
+ * enabling/disabling.
+ *
+ * Return: Returns zero on success, %-ENOSPC if the reference count for the
+ * specified event has reached its maximum, %-ENOMEM if the corresponding event
+ * entry could not be allocated. If this is the first time that this event has
+ * been enabled (i.e. the reference count was incremented from zero to one by
+ * this call), returns the status of the event-enable EC-command.
+ */
+int ssam_controller_event_enable(struct ssam_controller *ctrl,
+				 struct ssam_event_registry reg,
+				 struct ssam_event_id id, u8 flags)
+{
+	u16 rqid = ssh_tc_to_rqid(id.target_category);
+	struct ssam_nf *nf = &ctrl->cplt.event.notif;
+	struct ssam_nf_refcount_entry *entry;
+	int status;
+
+	if (!ssh_rqid_is_event(rqid))
+		return -EINVAL;
+
+	mutex_lock(&nf->lock);
+
+	entry = ssam_nf_refcount_inc(nf, reg, id);
+	if (IS_ERR(entry)) {
+		mutex_unlock(&nf->lock);
+		return PTR_ERR(entry);
+	}
+
+	status = ssam_nf_refcount_enable(ctrl, entry, flags);
+	if (status) {
+		ssam_nf_refcount_dec_free(nf, reg, id);
+		mutex_unlock(&nf->lock);
+		return status;
+	}
+
+	mutex_unlock(&nf->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ssam_controller_event_enable);
+
+/**
+ * ssam_controller_event_disable() - Disable the specified event.
+ * @ctrl:  The controller to disable the event for.
+ * @reg:   The event registry to use for disabling the event.
+ * @id:    The event ID specifying the event to be disabled.
+ * @flags: The flags used when enabling the event.
+ *
+ * Decrement the reference count of the specified event. If the reference count
+ * reaches zero, the event will be disabled.
+ *
+ * Note: In general, ssam_notifier_register()/ssam_notifier_unregister() with a
+ * non-observer notifier should be preferred for enabling/disabling events, as
+ * this will guarantee proper ordering and event forwarding in case of errors
+ * during event enabling/disabling.
+ *
+ * Return: Returns zero on success, %-ENOENT if the given event has not been
+ * enabled on the controller. If the reference count of the event reaches zero
+ * during this call, returns the status of the event-disable EC-command.
+ */
+int ssam_controller_event_disable(struct ssam_controller *ctrl,
+				  struct ssam_event_registry reg,
+				  struct ssam_event_id id, u8 flags)
+{
+	u16 rqid = ssh_tc_to_rqid(id.target_category);
+	struct ssam_nf *nf = &ctrl->cplt.event.notif;
+	struct ssam_nf_refcount_entry *entry;
+	int status = 0;
+
+	if (!ssh_rqid_is_event(rqid))
+		return -EINVAL;
+
+	mutex_lock(&nf->lock);
+
+	entry = ssam_nf_refcount_dec(nf, reg, id);
+	if (!entry) {
+		mutex_unlock(&nf->lock);
+		return -ENOENT;
+	}
+
+	status = ssam_nf_refcount_disable_free(ctrl, entry, flags);
+
+	mutex_unlock(&nf->lock);
+	return status;
+}
+EXPORT_SYMBOL_GPL(ssam_controller_event_disable);
+
 /**
  * ssam_notifier_disable_registered() - Disable events for all registered
  * notifiers.
diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index cf4bb48a850e..7965bdc669c5 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -838,4 +838,12 @@ int ssam_notifier_register(struct ssam_controller *ctrl,
 int ssam_notifier_unregister(struct ssam_controller *ctrl,
 			     struct ssam_event_notifier *n);
 
+int ssam_controller_event_enable(struct ssam_controller *ctrl,
+				 struct ssam_event_registry reg,
+				 struct ssam_event_id id, u8 flags);
+
+int ssam_controller_event_disable(struct ssam_controller *ctrl,
+				  struct ssam_event_registry reg,
+				  struct ssam_event_id id, u8 flags);
+
 #endif /* _LINUX_SURFACE_AGGREGATOR_CONTROLLER_H */
-- 
cgit v1.2.3


From b2763358feb28590f6b52a4c95c94a645dadfb26 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 4 Jun 2021 15:47:51 +0200
Subject: platform/surface: aggregator: Update copyright

It's 2021, update the copyright accordingly.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20210604134755.535590-4-luzmaximilian@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/surface/aggregator/Kconfig             | 2 +-
 drivers/platform/surface/aggregator/Makefile            | 2 +-
 drivers/platform/surface/aggregator/bus.c               | 2 +-
 drivers/platform/surface/aggregator/bus.h               | 2 +-
 drivers/platform/surface/aggregator/controller.c        | 2 +-
 drivers/platform/surface/aggregator/controller.h        | 2 +-
 drivers/platform/surface/aggregator/core.c              | 2 +-
 drivers/platform/surface/aggregator/ssh_msgb.h          | 2 +-
 drivers/platform/surface/aggregator/ssh_packet_layer.c  | 2 +-
 drivers/platform/surface/aggregator/ssh_packet_layer.h  | 2 +-
 drivers/platform/surface/aggregator/ssh_parser.c        | 2 +-
 drivers/platform/surface/aggregator/ssh_parser.h        | 2 +-
 drivers/platform/surface/aggregator/ssh_request_layer.c | 2 +-
 drivers/platform/surface/aggregator/ssh_request_layer.h | 2 +-
 drivers/platform/surface/aggregator/trace.h             | 2 +-
 include/linux/surface_aggregator/controller.h           | 2 +-
 include/linux/surface_aggregator/device.h               | 2 +-
 include/linux/surface_aggregator/serial_hub.h           | 2 +-
 18 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/surface/aggregator/Kconfig b/drivers/platform/surface/aggregator/Kconfig
index 3aaeea9f0433..fd6dc452f3e8 100644
--- a/drivers/platform/surface/aggregator/Kconfig
+++ b/drivers/platform/surface/aggregator/Kconfig
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0+
-# Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+# Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
 
 menuconfig SURFACE_AGGREGATOR
 	tristate "Microsoft Surface System Aggregator Module Subsystem and Drivers"
diff --git a/drivers/platform/surface/aggregator/Makefile b/drivers/platform/surface/aggregator/Makefile
index c112e2c7112b..c8498c41e758 100644
--- a/drivers/platform/surface/aggregator/Makefile
+++ b/drivers/platform/surface/aggregator/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0+
-# Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+# Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
 
 # For include/trace/define_trace.h to include trace.h
 CFLAGS_core.o = -I$(src)
diff --git a/drivers/platform/surface/aggregator/bus.c b/drivers/platform/surface/aggregator/bus.c
index a9b660af0917..0169677c243e 100644
--- a/drivers/platform/surface/aggregator/bus.c
+++ b/drivers/platform/surface/aggregator/bus.c
@@ -2,7 +2,7 @@
 /*
  * Surface System Aggregator Module bus and device integration.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #include <linux/device.h>
diff --git a/drivers/platform/surface/aggregator/bus.h b/drivers/platform/surface/aggregator/bus.h
index 7712baaed6a5..ed032c2cbdb2 100644
--- a/drivers/platform/surface/aggregator/bus.h
+++ b/drivers/platform/surface/aggregator/bus.h
@@ -2,7 +2,7 @@
 /*
  * Surface System Aggregator Module bus and device integration.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _SURFACE_AGGREGATOR_BUS_H
diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c
index cedd0f779f7a..6646f4d6e10d 100644
--- a/drivers/platform/surface/aggregator/controller.c
+++ b/drivers/platform/surface/aggregator/controller.c
@@ -2,7 +2,7 @@
 /*
  * Main SSAM/SSH controller structure and functionality.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #include <linux/acpi.h>
diff --git a/drivers/platform/surface/aggregator/controller.h b/drivers/platform/surface/aggregator/controller.h
index 8297d34e7489..a0963c3562ff 100644
--- a/drivers/platform/surface/aggregator/controller.h
+++ b/drivers/platform/surface/aggregator/controller.h
@@ -2,7 +2,7 @@
 /*
  * Main SSAM/SSH controller structure and functionality.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _SURFACE_AGGREGATOR_CONTROLLER_H
diff --git a/drivers/platform/surface/aggregator/core.c b/drivers/platform/surface/aggregator/core.c
index 8dc2c267bcd6..5d780e55f4a1 100644
--- a/drivers/platform/surface/aggregator/core.c
+++ b/drivers/platform/surface/aggregator/core.c
@@ -7,7 +7,7 @@
  * Handles communication via requests as well as enabling, disabling, and
  * relaying of events.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #include <linux/acpi.h>
diff --git a/drivers/platform/surface/aggregator/ssh_msgb.h b/drivers/platform/surface/aggregator/ssh_msgb.h
index 1221f642dda1..e562958ffdf0 100644
--- a/drivers/platform/surface/aggregator/ssh_msgb.h
+++ b/drivers/platform/surface/aggregator/ssh_msgb.h
@@ -2,7 +2,7 @@
 /*
  * SSH message builder functions.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _SURFACE_AGGREGATOR_SSH_MSGB_H
diff --git a/drivers/platform/surface/aggregator/ssh_packet_layer.c b/drivers/platform/surface/aggregator/ssh_packet_layer.c
index 15d96eac6811..5e08049fc3ac 100644
--- a/drivers/platform/surface/aggregator/ssh_packet_layer.c
+++ b/drivers/platform/surface/aggregator/ssh_packet_layer.c
@@ -2,7 +2,7 @@
 /*
  * SSH packet transport layer.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #include <asm/unaligned.h>
diff --git a/drivers/platform/surface/aggregator/ssh_packet_layer.h b/drivers/platform/surface/aggregator/ssh_packet_layer.h
index e8757d03f279..2eb329f0b91a 100644
--- a/drivers/platform/surface/aggregator/ssh_packet_layer.h
+++ b/drivers/platform/surface/aggregator/ssh_packet_layer.h
@@ -2,7 +2,7 @@
 /*
  * SSH packet transport layer.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _SURFACE_AGGREGATOR_SSH_PACKET_LAYER_H
diff --git a/drivers/platform/surface/aggregator/ssh_parser.c b/drivers/platform/surface/aggregator/ssh_parser.c
index e2dead8de94a..b77912f8f13b 100644
--- a/drivers/platform/surface/aggregator/ssh_parser.c
+++ b/drivers/platform/surface/aggregator/ssh_parser.c
@@ -2,7 +2,7 @@
 /*
  * SSH message parser.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #include <asm/unaligned.h>
diff --git a/drivers/platform/surface/aggregator/ssh_parser.h b/drivers/platform/surface/aggregator/ssh_parser.h
index 63c38d350988..3bd6e180fd16 100644
--- a/drivers/platform/surface/aggregator/ssh_parser.h
+++ b/drivers/platform/surface/aggregator/ssh_parser.h
@@ -2,7 +2,7 @@
 /*
  * SSH message parser.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _SURFACE_AGGREGATOR_SSH_PARSER_H
diff --git a/drivers/platform/surface/aggregator/ssh_request_layer.c b/drivers/platform/surface/aggregator/ssh_request_layer.c
index 52a83a8fcf82..bfe1aaf38065 100644
--- a/drivers/platform/surface/aggregator/ssh_request_layer.c
+++ b/drivers/platform/surface/aggregator/ssh_request_layer.c
@@ -2,7 +2,7 @@
 /*
  * SSH request transport layer.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #include <asm/unaligned.h>
diff --git a/drivers/platform/surface/aggregator/ssh_request_layer.h b/drivers/platform/surface/aggregator/ssh_request_layer.h
index cb35815858d1..9c3cbae2d4bd 100644
--- a/drivers/platform/surface/aggregator/ssh_request_layer.h
+++ b/drivers/platform/surface/aggregator/ssh_request_layer.h
@@ -2,7 +2,7 @@
 /*
  * SSH request transport layer.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _SURFACE_AGGREGATOR_SSH_REQUEST_LAYER_H
diff --git a/drivers/platform/surface/aggregator/trace.h b/drivers/platform/surface/aggregator/trace.h
index eb332bb53ae4..de64cf169060 100644
--- a/drivers/platform/surface/aggregator/trace.h
+++ b/drivers/platform/surface/aggregator/trace.h
@@ -2,7 +2,7 @@
 /*
  * Trace points for SSAM/SSH.
  *
- * Copyright (C) 2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2020-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #undef TRACE_SYSTEM
diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index 7965bdc669c5..068e1982ad37 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -6,7 +6,7 @@
  * managing access and communication to and from the SSAM EC, as well as main
  * communication structures and definitions.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _LINUX_SURFACE_AGGREGATOR_CONTROLLER_H
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 6ff9c58b3e17..f636c5310321 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -7,7 +7,7 @@
  * Provides support for non-platform/non-ACPI SSAM clients via dedicated
  * subsystem.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _LINUX_SURFACE_AGGREGATOR_DEVICE_H
diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h
index 64276fbfa1d5..c3de43edcffa 100644
--- a/include/linux/surface_aggregator/serial_hub.h
+++ b/include/linux/surface_aggregator/serial_hub.h
@@ -6,7 +6,7 @@
  * Surface System Aggregator Module (SSAM). Provides the interface for basic
  * packet- and request-based communication with the SSAM EC via SSH.
  *
- * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
  */
 
 #ifndef _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H
-- 
cgit v1.2.3


From 25182f05ffed0b45602438693e4eed5d7f3ebadd Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Tue, 15 Jun 2021 18:23:13 -0700
Subject: mm,hwpoison: fix race with hugetlb page allocation

When hugetlb page fault (under overcommitting situation) and
memory_failure() race, VM_BUG_ON_PAGE() is triggered by the following
race:

    CPU0:                           CPU1:

                                    gather_surplus_pages()
                                      page = alloc_surplus_huge_page()
    memory_failure_hugetlb()
      get_hwpoison_page(page)
        __get_hwpoison_page(page)
          get_page_unless_zero(page)
                                      zero = put_page_testzero(page)
                                      VM_BUG_ON_PAGE(!zero, page)
                                      enqueue_huge_page(h, page)
      put_page(page)

__get_hwpoison_page() only checks the page refcount before taking an
additional one for memory error handling, which is not enough because
there's a time window where compound pages have non-zero refcount during
hugetlb page initialization.

So make __get_hwpoison_page() check page status a bit more for hugetlb
pages with get_hwpoison_huge_page().  Checking hugetlb-specific flags
under hugetlb_lock makes sure that the hugetlb page is not transitive.
It's notable that another new function, HWPoisonHandlable(), is helpful
to prevent a race against other transitive page states (like a generic
compound page just before PageHuge becomes true).

Link: https://lkml.kernel.org/r/20210603233632.2964832-2-nao.horiguchi@gmail.com
Fixes: ead07f6a867b ("mm/memory-failure: introduce get_hwpoison_page() for consistent refcount handling")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reported-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: <stable@vger.kernel.org>	[5.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  6 ++++++
 mm/hugetlb.c            | 15 +++++++++++++++
 mm/memory-failure.c     | 29 +++++++++++++++++++++++++++--
 3 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b92f25ccef58..790ae618548d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -149,6 +149,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool isolate_huge_page(struct page *page, struct list_head *list);
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
 void putback_active_hugepage(struct page *page);
 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
@@ -339,6 +340,11 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
 	return false;
 }
 
+static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+	return 0;
+}
+
 static inline void putback_active_hugepage(struct page *page)
 {
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5560b50876fb..85f42ec1a927 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5857,6 +5857,21 @@ unlock:
 	return ret;
 }
 
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+	int ret = 0;
+
+	*hugetlb = false;
+	spin_lock_irq(&hugetlb_lock);
+	if (PageHeadHuge(page)) {
+		*hugetlb = true;
+		if (HPageFreed(page) || HPageMigratable(page))
+			ret = get_page_unless_zero(page);
+	}
+	spin_unlock_irq(&hugetlb_lock);
+	return ret;
+}
+
 void putback_active_hugepage(struct page *page)
 {
 	spin_lock_irq(&hugetlb_lock);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 85ad98c00fd9..29ab7b70d326 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -949,6 +949,17 @@ static int page_action(struct page_state *ps, struct page *p,
 	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
 }
 
+/*
+ * Return true if a page type of a given page is supported by hwpoison
+ * mechanism (while handling could fail), otherwise false.  This function
+ * does not return true for hugetlb or device memory pages, so it's assumed
+ * to be called only in the context where we never have such pages.
+ */
+static inline bool HWPoisonHandlable(struct page *page)
+{
+	return PageLRU(page) || __PageMovable(page);
+}
+
 /**
  * __get_hwpoison_page() - Get refcount for memory error handling:
  * @page:	raw error page (hit by memory error)
@@ -959,8 +970,22 @@ static int page_action(struct page_state *ps, struct page *p,
 static int __get_hwpoison_page(struct page *page)
 {
 	struct page *head = compound_head(page);
+	int ret = 0;
+	bool hugetlb = false;
+
+	ret = get_hwpoison_huge_page(head, &hugetlb);
+	if (hugetlb)
+		return ret;
+
+	/*
+	 * This check prevents from calling get_hwpoison_unless_zero()
+	 * for any unsupported type of page in order to reduce the risk of
+	 * unexpected races caused by taking a page refcount.
+	 */
+	if (!HWPoisonHandlable(head))
+		return 0;
 
-	if (!PageHuge(head) && PageTransHuge(head)) {
+	if (PageTransHuge(head)) {
 		/*
 		 * Non anonymous thp exists only in allocation/free time. We
 		 * can't handle such a case correctly, so let's give it up.
@@ -1017,7 +1042,7 @@ try_again:
 			ret = -EIO;
 		}
 	} else {
-		if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+		if (PageHuge(p) || HWPoisonHandlable(p)) {
 			ret = 1;
 		} else {
 			/*
-- 
cgit v1.2.3


From 099dd6878b9b12d6bbfa6bf29ce0c8ddd38f6901 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 15 Jun 2021 18:23:16 -0700
Subject: mm/swap: fix pte_same_as_swp() not removing uffd-wp bit when compare

I found it by pure code review, that pte_same_as_swp() of unuse_vma()
didn't take uffd-wp bit into account when comparing ptes.
pte_same_as_swp() returning false negative could cause failure to
swapoff swap ptes that was wr-protected by userfaultfd.

Link: https://lkml.kernel.org/r/20210603180546.9083-1-peterx@redhat.com
Fixes: f45ec5ff16a7 ("userfaultfd: wp: support swap and page migration")
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>	[5.7+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapops.h | 15 +++++++++++----
 mm/swapfile.c           |  2 +-
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index d9b7c9132c2f..6430a94c6981 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -23,6 +23,16 @@
 #define SWP_TYPE_SHIFT	(BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
 #define SWP_OFFSET_MASK	((1UL << SWP_TYPE_SHIFT) - 1)
 
+/* Clear all flags but only keep swp_entry_t related information */
+static inline pte_t pte_swp_clear_flags(pte_t pte)
+{
+	if (pte_swp_soft_dirty(pte))
+		pte = pte_swp_clear_soft_dirty(pte);
+	if (pte_swp_uffd_wp(pte))
+		pte = pte_swp_clear_uffd_wp(pte);
+	return pte;
+}
+
 /*
  * Store a type+offset into a swp_entry_t in an arch-independent format
  */
@@ -66,10 +76,7 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 {
 	swp_entry_t arch_entry;
 
-	if (pte_swp_soft_dirty(pte))
-		pte = pte_swp_clear_soft_dirty(pte);
-	if (pte_swp_uffd_wp(pte))
-		pte = pte_swp_clear_uffd_wp(pte);
+	pte = pte_swp_clear_flags(pte);
 	arch_entry = __pte_to_swp_entry(pte);
 	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..996afa8131c8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1900,7 +1900,7 @@ unsigned int count_swap_pages(int type, int free)
 
 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
 {
-	return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
+	return pte_same(pte_swp_clear_flags(pte), swp_pte);
 }
 
 /*
-- 
cgit v1.2.3


From 846be08578edb81f02bc8534577e6c367ef34f41 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 15 Jun 2021 18:23:29 -0700
Subject: mm/hugetlb: expand restore_reserve_on_error functionality

The routine restore_reserve_on_error is called to restore reservation
information when an error occurs after page allocation.  The routine
alloc_huge_page modifies the mapping reserve map and potentially the
reserve count during allocation.  If code calling alloc_huge_page
encounters an error after allocation and needs to free the page, the
reservation information needs to be adjusted.

Currently, restore_reserve_on_error only takes action on pages for which
the reserve count was adjusted(HPageRestoreReserve flag).  There is
nothing wrong with these adjustments.  However, alloc_huge_page ALWAYS
modifies the reserve map during allocation even if the reserve count is
not adjusted.  This can cause issues as observed during development of
this patch [1].

One specific series of operations causing an issue is:

 - Create a shared hugetlb mapping
   Reservations for all pages created by default

 - Fault in a page in the mapping
   Reservation exists so reservation count is decremented

 - Punch a hole in the file/mapping at index previously faulted
   Reservation and any associated pages will be removed

 - Allocate a page to fill the hole
   No reservation entry, so reserve count unmodified
   Reservation entry added to map by alloc_huge_page

 - Error after allocation and before instantiating the page
   Reservation entry remains in map

 - Allocate a page to fill the hole
   Reservation entry exists, so decrement reservation count

This will cause a reservation count underflow as the reservation count
was decremented twice for the same index.

A user would observe a very large number for HugePages_Rsvd in
/proc/meminfo.  This would also likely cause subsequent allocations of
hugetlb pages to fail as it would 'appear' that all pages are reserved.

This sequence of operations is unlikely to happen, however they were
easily reproduced and observed using hacked up code as described in [1].

Address the issue by having the routine restore_reserve_on_error take
action on pages where HPageRestoreReserve is not set.  In this case, we
need to remove any reserve map entry created by alloc_huge_page.  A new
helper routine vma_del_reservation assists with this operation.

There are three callers of alloc_huge_page which do not currently call
restore_reserve_on error before freeing a page on error paths.  Add
those missing calls.

[1] https://lore.kernel.org/linux-mm/20210528005029.88088-1-almasrymina@google.com/

Link: https://lkml.kernel.org/r/20210607204510.22617-1-mike.kravetz@oracle.com
Fixes: 96b96a96ddee ("mm/hugetlb: fix huge page reservation leak in private mapping error paths"
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    |   1 +
 include/linux/hugetlb.h |   2 +
 mm/hugetlb.c            | 120 ++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 100 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 55efd3dd04f6..30dee68458c7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		__SetPageUptodate(page);
 		error = huge_add_to_page_cache(page, mapping, index);
 		if (unlikely(error)) {
+			restore_reserve_on_error(h, &pseudo_vma, addr, page);
 			put_page(page);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			goto out;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 790ae618548d..6504346a1947 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -610,6 +610,8 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
 				unsigned long address);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 			pgoff_t idx);
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+				unsigned long address, struct page *page);
 
 /* arch callback */
 int __init __alloc_bootmem_huge_page(struct hstate *h);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85f42ec1a927..e0a5f9cbbece 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2121,12 +2121,18 @@ out:
  * be restored when a newly allocated huge page must be freed.  It is
  * to be called after calling vma_needs_reservation to determine if a
  * reservation exists.
+ *
+ * vma_del_reservation is used in error paths where an entry in the reserve
+ * map was created during huge page allocation and must be removed.  It is to
+ * be called after calling vma_needs_reservation to determine if a reservation
+ * exists.
  */
 enum vma_resv_mode {
 	VMA_NEEDS_RESV,
 	VMA_COMMIT_RESV,
 	VMA_END_RESV,
 	VMA_ADD_RESV,
+	VMA_DEL_RESV,
 };
 static long __vma_reservation_common(struct hstate *h,
 				struct vm_area_struct *vma, unsigned long addr,
@@ -2170,11 +2176,21 @@ static long __vma_reservation_common(struct hstate *h,
 			ret = region_del(resv, idx, idx + 1);
 		}
 		break;
+	case VMA_DEL_RESV:
+		if (vma->vm_flags & VM_MAYSHARE) {
+			region_abort(resv, idx, idx + 1, 1);
+			ret = region_del(resv, idx, idx + 1);
+		} else {
+			ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+			/* region_add calls of range 1 should never fail. */
+			VM_BUG_ON(ret < 0);
+		}
+		break;
 	default:
 		BUG();
 	}
 
-	if (vma->vm_flags & VM_MAYSHARE)
+	if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
 		return ret;
 	/*
 	 * We know private mapping must have HPAGE_RESV_OWNER set.
@@ -2222,25 +2238,39 @@ static long vma_add_reservation(struct hstate *h,
 	return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
 }
 
+static long vma_del_reservation(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
+}
+
 /*
- * This routine is called to restore a reservation on error paths.  In the
- * specific error paths, a huge page was allocated (via alloc_huge_page)
- * and is about to be freed.  If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set
- * HPageRestoreReserve in the newly allocated page.  When the page is freed
- * via free_huge_page, the global reservation count will be incremented if
- * HPageRestoreReserve is set.  However, free_huge_page can not adjust the
- * reserve map.  Adjust the reserve map here to be consistent with global
- * reserve count adjustments to be made by free_huge_page.
+ * This routine is called to restore reservation information on error paths.
+ * It should ONLY be called for pages allocated via alloc_huge_page(), and
+ * the hugetlb mutex should remain held when calling this routine.
+ *
+ * It handles two specific cases:
+ * 1) A reservation was in place and the page consumed the reservation.
+ *    HPageRestoreReserve is set in the page.
+ * 2) No reservation was in place for the page, so HPageRestoreReserve is
+ *    not set.  However, alloc_huge_page always updates the reserve map.
+ *
+ * In case 1, free_huge_page later in the error path will increment the
+ * global reserve count.  But, free_huge_page does not have enough context
+ * to adjust the reservation map.  This case deals primarily with private
+ * mappings.  Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page.  Make sure the
+ * reserve map indicates there is a reservation present.
+ *
+ * In case 2, simply undo reserve map modifications done by alloc_huge_page.
  */
-static void restore_reserve_on_error(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address,
-			struct page *page)
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+			unsigned long address, struct page *page)
 {
-	if (unlikely(HPageRestoreReserve(page))) {
-		long rc = vma_needs_reservation(h, vma, address);
+	long rc = vma_needs_reservation(h, vma, address);
 
-		if (unlikely(rc < 0)) {
+	if (HPageRestoreReserve(page)) {
+		if (unlikely(rc < 0))
 			/*
 			 * Rare out of memory condition in reserve map
 			 * manipulation.  Clear HPageRestoreReserve so that
@@ -2253,16 +2283,57 @@ static void restore_reserve_on_error(struct hstate *h,
 			 * accounting of reserve counts.
 			 */
 			ClearHPageRestoreReserve(page);
-		} else if (rc) {
-			rc = vma_add_reservation(h, vma, address);
-			if (unlikely(rc < 0))
+		else if (rc)
+			(void)vma_add_reservation(h, vma, address);
+		else
+			vma_end_reservation(h, vma, address);
+	} else {
+		if (!rc) {
+			/*
+			 * This indicates there is an entry in the reserve map
+			 * added by alloc_huge_page.  We know it was added
+			 * before the alloc_huge_page call, otherwise
+			 * HPageRestoreReserve would be set on the page.
+			 * Remove the entry so that a subsequent allocation
+			 * does not consume a reservation.
+			 */
+			rc = vma_del_reservation(h, vma, address);
+			if (rc < 0)
 				/*
-				 * See above comment about rare out of
-				 * memory condition.
+				 * VERY rare out of memory condition.  Since
+				 * we can not delete the entry, set
+				 * HPageRestoreReserve so that the reserve
+				 * count will be incremented when the page
+				 * is freed.  This reserve will be consumed
+				 * on a subsequent allocation.
 				 */
-				ClearHPageRestoreReserve(page);
+				SetHPageRestoreReserve(page);
+		} else if (rc < 0) {
+			/*
+			 * Rare out of memory condition from
+			 * vma_needs_reservation call.  Memory allocation is
+			 * only attempted if a new entry is needed.  Therefore,
+			 * this implies there is not an entry in the
+			 * reserve map.
+			 *
+			 * For shared mappings, no entry in the map indicates
+			 * no reservation.  We are done.
+			 */
+			if (!(vma->vm_flags & VM_MAYSHARE))
+				/*
+				 * For private mappings, no entry indicates
+				 * a reservation is present.  Since we can
+				 * not add an entry, set SetHPageRestoreReserve
+				 * on the page so reserve count will be
+				 * incremented when freed.  This reserve will
+				 * be consumed on a subsequent allocation.
+				 */
+				SetHPageRestoreReserve(page);
 		} else
-			vma_end_reservation(h, vma, address);
+			/*
+			 * No reservation present, do nothing
+			 */
+			 vma_end_reservation(h, vma, address);
 	}
 }
 
@@ -4037,6 +4108,8 @@ again:
 				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 				entry = huge_ptep_get(src_pte);
 				if (!pte_same(src_pte_old, entry)) {
+					restore_reserve_on_error(h, vma, addr,
+								new);
 					put_page(new);
 					/* dst_entry won't change as in child */
 					goto again;
@@ -5006,6 +5079,7 @@ out_release_unlock:
 	if (vm_shared || is_continue)
 		unlock_page(page);
 out_release_nounlock:
+	restore_reserve_on_error(h, dst_vma, dst_addr, page);
 	put_page(page);
 	goto out;
 }
-- 
cgit v1.2.3


From 3b77e8c8cde581dadab9a0f1543a347e24315f11 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 15 Jun 2021 18:23:49 -0700
Subject: mm/thp: make is_huge_zero_pmd() safe and quicker

Most callers of is_huge_zero_pmd() supply a pmd already verified
present; but a few (notably zap_huge_pmd()) do not - it might be a pmd
migration entry, in which the pfn is encoded differently from a present
pmd: which might pass the is_huge_zero_pmd() test (though not on x86,
since L1TF forced us to protect against that); or perhaps even crash in
pmd_page() applied to a swap-like entry.

Make it safe by adding pmd_present() check into is_huge_zero_pmd()
itself; and make it quicker by saving huge_zero_pfn, so that
is_huge_zero_pmd() will not need to do that pmd_page() lookup each time.

__split_huge_pmd_locked() checked pmd_trans_huge() before: that worked,
but is unnecessary now that is_huge_zero_pmd() checks present.

Link: https://lkml.kernel.org/r/21ea9ca-a1f5-8b90-5e88-95fb1c49bbfa@google.com
Fixes: e71769ae5260 ("mm: enable thp migration for shmem thp")
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jue Wang <juew@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Wang Yugui <wangyugui@e16-tech.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 8 +++++++-
 mm/huge_memory.c        | 5 ++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9626fda5efce..2a8ebe6c222e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -286,6 +286,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
 extern struct page *huge_zero_page;
+extern unsigned long huge_zero_pfn;
 
 static inline bool is_huge_zero_page(struct page *page)
 {
@@ -294,7 +295,7 @@ static inline bool is_huge_zero_page(struct page *page)
 
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
-	return is_huge_zero_page(pmd_page(pmd));
+	return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd);
 }
 
 static inline bool is_huge_zero_pud(pud_t pud)
@@ -440,6 +441,11 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return false;
+}
+
 static inline bool is_huge_zero_pud(pud_t pud)
 {
 	return false;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42cfefc6e66e..5885c5f5836f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -62,6 +62,7 @@ static struct shrinker deferred_split_shrinker;
 
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
 bool transparent_hugepage_enabled(struct vm_area_struct *vma)
 {
@@ -98,6 +99,7 @@ retry:
 		__free_pages(zero_page, compound_order(zero_page));
 		goto retry;
 	}
+	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
 
 	/* We take additional reference here. It will be put back by shrinker */
 	atomic_set(&huge_zero_refcount, 2);
@@ -147,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
 		struct page *zero_page = xchg(&huge_zero_page, NULL);
 		BUG_ON(zero_page == NULL);
+		WRITE_ONCE(huge_zero_pfn, ~0UL);
 		__free_pages(zero_page, compound_order(zero_page));
 		return HPAGE_PMD_NR;
 	}
@@ -2071,7 +2074,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		return;
 	}
 
-	if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+	if (is_huge_zero_pmd(*pmd)) {
 		/*
 		 * FIXME: Do we want to invalidate secondary mmu by calling
 		 * mmu_notifier_invalidate_range() see comments below inside
-- 
cgit v1.2.3


From 732ed55823fc3ad998d43b86bf771887bcc5ec67 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 15 Jun 2021 18:23:53 -0700
Subject: mm/thp: try_to_unmap() use TTU_SYNC for safe splitting

Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE
(!unmap_success): with dump_page() showing mapcount:1, but then its raw
struct page output showing _mapcount ffffffff i.e.  mapcount 0.

And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed,
it is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)),
and further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG():
all indicative of some mapcount difficulty in development here perhaps.
But the !CONFIG_DEBUG_VM path handles the failures correctly and
silently.

I believe the problem is that once a racing unmap has cleared pte or
pmd, try_to_unmap_one() may skip taking the page table lock, and emerge
from try_to_unmap() before the racing task has reached decrementing
mapcount.

Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that
follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding
TTU_SYNC to the options, and passing that from unmap_page().

When CONFIG_DEBUG_VM, or for non-debug too? Consensus is to do the same
for both: the slight overhead added should rarely matter, except perhaps
if splitting sparsely-populated multiply-mapped shmem.  Once confident
that bugs are fixed, TTU_SYNC here can be removed, and the race
tolerated.

Link: https://lkml.kernel.org/r/c1e95853-8bcd-d8fd-55fa-e7f2488e78f@google.com
Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers")
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jue Wang <juew@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Wang Yugui <wangyugui@e16-tech.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  1 +
 mm/huge_memory.c     |  2 +-
 mm/page_vma_mapped.c | 11 +++++++++++
 mm/rmap.c            | 17 ++++++++++++++++-
 4 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index def5c62c93b3..8d04e7deedc6 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -91,6 +91,7 @@ enum ttu_flags {
 
 	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
 	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
+	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
 	TTU_IGNORE_HWPOISON	= 0x20,	/* corrupted page is recoverable */
 	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
 					 * and caller guarantees they will
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5885c5f5836f..84ab735139dc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2350,7 +2350,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 
 static void unmap_page(struct page *page)
 {
-	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
+	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
 		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
 	bool unmap_success;
 
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 2cf01d933f13..5b559967410e 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -212,6 +212,17 @@ restart:
 			pvmw->ptl = NULL;
 		}
 	} else if (!pmd_present(pmde)) {
+		/*
+		 * If PVMW_SYNC, take and drop THP pmd lock so that we
+		 * cannot return prematurely, while zap_huge_pmd() has
+		 * cleared *pmd but not decremented compound_mapcount().
+		 */
+		if ((pvmw->flags & PVMW_SYNC) &&
+		    PageTransCompound(pvmw->page)) {
+			spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+			spin_unlock(ptl);
+		}
 		return false;
 	}
 	if (!map_pte(pvmw))
diff --git a/mm/rmap.c b/mm/rmap.c
index 693a610e181d..07811b4ae793 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1405,6 +1405,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
+	/*
+	 * When racing against e.g. zap_pte_range() on another cpu,
+	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+	 * try_to_unmap() may return false when it is about to become true,
+	 * if page table locking is skipped: use TTU_SYNC to wait for that.
+	 */
+	if (flags & TTU_SYNC)
+		pvmw.flags = PVMW_SYNC;
+
 	/* munlock has nothing to gain from examining un-locked vmas */
 	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
 		return true;
@@ -1777,7 +1786,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
 	else
 		rmap_walk(page, &rwc);
 
-	return !page_mapcount(page) ? true : false;
+	/*
+	 * When racing against e.g. zap_pte_range() on another cpu,
+	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+	 * try_to_unmap() may return false when it is about to become true,
+	 * if page table locking is skipped: use TTU_SYNC to wait for that.
+	 */
+	return !page_mapcount(page);
 }
 
 /**
-- 
cgit v1.2.3


From 22061a1ffabdb9c3385de159c5db7aac3a4df1cc Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 15 Jun 2021 18:24:03 -0700
Subject: mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()

There is a race between THP unmapping and truncation, when truncate sees
pmd_none() and skips the entry, after munmap's zap_huge_pmd() cleared
it, but before its page_remove_rmap() gets to decrement
compound_mapcount: generating false "BUG: Bad page cache" reports that
the page is still mapped when deleted.  This commit fixes that, but not
in the way I hoped.

The first attempt used try_to_unmap(page, TTU_SYNC|TTU_IGNORE_MLOCK)
instead of unmap_mapping_range() in truncate_cleanup_page(): it has
often been an annoyance that we usually call unmap_mapping_range() with
no pages locked, but there apply it to a single locked page.
try_to_unmap() looks more suitable for a single locked page.

However, try_to_unmap_one() contains a VM_BUG_ON_PAGE(!pvmw.pte,page):
it is used to insert THP migration entries, but not used to unmap THPs.
Copy zap_huge_pmd() and add THP handling now? Perhaps, but their TLB
needs are different, I'm too ignorant of the DAX cases, and couldn't
decide how far to go for anon+swap.  Set that aside.

The second attempt took a different tack: make no change in truncate.c,
but modify zap_huge_pmd() to insert an invalidated huge pmd instead of
clearing it initially, then pmd_clear() between page_remove_rmap() and
unlocking at the end.  Nice.  But powerpc blows that approach out of the
water, with its serialize_against_pte_lookup(), and interesting pgtable
usage.  It would need serious help to get working on powerpc (with a
minor optimization issue on s390 too).  Set that aside.

Just add an "if (page_mapped(page)) synchronize_rcu();" or other such
delay, after unmapping in truncate_cleanup_page()? Perhaps, but though
that's likely to reduce or eliminate the number of incidents, it would
give less assurance of whether we had identified the problem correctly.

This successful iteration introduces "unmap_mapping_page(page)" instead
of try_to_unmap(), and goes the usual unmap_mapping_range_tree() route,
with an addition to details.  Then zap_pmd_range() watches for this
case, and does spin_unlock(pmd_lock) if so - just like
page_vma_mapped_walk() now does in the PVMW_SYNC case.  Not pretty, but
safe.

Note that unmap_mapping_page() is doing a VM_BUG_ON(!PageLocked) to
assert its interface; but currently that's only used to make sure that
page->mapping is stable, and zap_pmd_range() doesn't care if the page is
locked or not.  Along these lines, in invalidate_inode_pages2_range()
move the initial unmap_mapping_range() out from under page lock, before
then calling unmap_mapping_page() under page lock if still mapped.

Link: https://lkml.kernel.org/r/a2a4a148-cdd8-942c-4ef8-51b77f643dbe@google.com
Fixes: fc127da085c2 ("truncate: handle file thp")
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jue Wang <juew@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Wang Yugui <wangyugui@e16-tech.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  3 +++
 mm/memory.c        | 41 +++++++++++++++++++++++++++++++++++++++++
 mm/truncate.c      | 43 +++++++++++++++++++------------------------
 3 files changed, 63 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c274f75efcf9..8ae31622deef 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1719,6 +1719,7 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
+	struct page *single_page;		/* Locked page to be unmapped */
 };
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
@@ -1766,6 +1767,7 @@ extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 extern int fixup_user_fault(struct mm_struct *mm,
 			    unsigned long address, unsigned int fault_flags,
 			    bool *unlocked);
+void unmap_mapping_page(struct page *page);
 void unmap_mapping_pages(struct address_space *mapping,
 		pgoff_t start, pgoff_t nr, bool even_cows);
 void unmap_mapping_range(struct address_space *mapping,
@@ -1786,6 +1788,7 @@ static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
 	BUG();
 	return -EFAULT;
 }
+static inline void unmap_mapping_page(struct page *page) { }
 static inline void unmap_mapping_pages(struct address_space *mapping,
 		pgoff_t start, pgoff_t nr, bool even_cows) { }
 static inline void unmap_mapping_range(struct address_space *mapping,
diff --git a/mm/memory.c b/mm/memory.c
index f3ffab9b9e39..486f4a2874e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 			else if (zap_huge_pmd(tlb, vma, pmd, addr))
 				goto next;
 			/* fall through */
+		} else if (details && details->single_page &&
+			   PageTransCompound(details->single_page) &&
+			   next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+			spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+			/*
+			 * Take and drop THP pmd lock so that we cannot return
+			 * prematurely, while zap_huge_pmd() has cleared *pmd,
+			 * but not yet decremented compound_mapcount().
+			 */
+			spin_unlock(ptl);
 		}
+
 		/*
 		 * Here there can be other concurrent MADV_DONTNEED or
 		 * trans huge page faults running, and if the pmd is
@@ -3236,6 +3247,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
 	}
 }
 
+/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct zap_details details = { };
+
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageTail(page));
+
+	details.check_mapping = mapping;
+	details.first_index = page->index;
+	details.last_index = page->index + thp_nr_pages(page) - 1;
+	details.single_page = page;
+
+	i_mmap_lock_write(mapping);
+	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+		unmap_mapping_range_tree(&mapping->i_mmap, &details);
+	i_mmap_unlock_write(mapping);
+}
+
 /**
  * unmap_mapping_pages() - Unmap pages from processes.
  * @mapping: The address space containing pages to be unmapped.
diff --git a/mm/truncate.c b/mm/truncate.c
index 95af244b112a..234ddd879caa 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -167,13 +167,10 @@ void do_invalidatepage(struct page *page, unsigned int offset,
  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
  */
-static void
-truncate_cleanup_page(struct address_space *mapping, struct page *page)
+static void truncate_cleanup_page(struct page *page)
 {
-	if (page_mapped(page)) {
-		unsigned int nr = thp_nr_pages(page);
-		unmap_mapping_pages(mapping, page->index, nr, false);
-	}
+	if (page_mapped(page))
+		unmap_mapping_page(page);
 
 	if (page_has_private(page))
 		do_invalidatepage(page, 0, thp_size(page));
@@ -218,7 +215,7 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
 	if (page->mapping != mapping)
 		return -EIO;
 
-	truncate_cleanup_page(mapping, page);
+	truncate_cleanup_page(page);
 	delete_from_page_cache(page);
 	return 0;
 }
@@ -325,7 +322,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		index = indices[pagevec_count(&pvec) - 1] + 1;
 		truncate_exceptional_pvec_entries(mapping, &pvec, indices);
 		for (i = 0; i < pagevec_count(&pvec); i++)
-			truncate_cleanup_page(mapping, pvec.pages[i]);
+			truncate_cleanup_page(pvec.pages[i]);
 		delete_from_page_cache_batch(mapping, &pvec);
 		for (i = 0; i < pagevec_count(&pvec); i++)
 			unlock_page(pvec.pages[i]);
@@ -639,6 +636,16 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				continue;
 			}
 
+			if (!did_range_unmap && page_mapped(page)) {
+				/*
+				 * If page is mapped, before taking its lock,
+				 * zap the rest of the file in one hit.
+				 */
+				unmap_mapping_pages(mapping, index,
+						(1 + end - index), false);
+				did_range_unmap = 1;
+			}
+
 			lock_page(page);
 			WARN_ON(page_to_index(page) != index);
 			if (page->mapping != mapping) {
@@ -646,23 +653,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				continue;
 			}
 			wait_on_page_writeback(page);
-			if (page_mapped(page)) {
-				if (!did_range_unmap) {
-					/*
-					 * Zap the rest of the file in one hit.
-					 */
-					unmap_mapping_pages(mapping, index,
-						(1 + end - index), false);
-					did_range_unmap = 1;
-				} else {
-					/*
-					 * Just zap this page
-					 */
-					unmap_mapping_pages(mapping, index,
-								1, false);
-				}
-			}
+
+			if (page_mapped(page))
+				unmap_mapping_page(page);
 			BUG_ON(page_mapped(page));
+
 			ret2 = do_launder_page(mapping, page);
 			if (ret2 == 0) {
 				if (!invalidate_complete_page2(mapping, page))
-- 
cgit v1.2.3


From 8fe55ef23387ce3c7488375b1fd539420d7654bb Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 13 May 2021 15:18:27 +0100
Subject: PCI: Dynamically map ECAM regions

Attempting to boot 32-bit ARM kernels under QEMU's 3.x virt models fails
when we have more than 512M of RAM in the model as we run out of vmalloc
space for the PCI ECAM regions. This failure will be silent when running
libvirt, as the console in that situation is a PCI device.

In this configuration, the kernel maps the whole ECAM, which QEMU sets up
for 256 buses, even when maybe only seven buses are in use.  Each bus uses
1M of ECAM space, and ioremap() adds an additional guard page between
allocations. The kernel vmap allocator will align these regions to 512K,
resulting in each mapping eating 1.5M of vmalloc space. This means we need
384M of vmalloc space just to map all of these, which is very wasteful of
resources.

Fix this by only mapping the ECAM for buses we are going to be using.  In
my setups, this is around seven buses in most guests, which is 10.5M of
vmalloc space - way smaller than the 384M that would otherwise be required.
This also means that the kernel can boot without forcing extra RAM into
highmem with the vmalloc= argument, or decreasing the virtual RAM available
to the guest.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/E1lhCAV-0002yb-50@rmk-PC.armlinux.org.uk
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/pci/ecam.c       | 54 +++++++++++++++++++++++++++++++++++++++++-------
 include/linux/pci-ecam.h |  1 +
 2 files changed, 47 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c
index d2a1920bb055..1c40d2506aef 100644
--- a/drivers/pci/ecam.c
+++ b/drivers/pci/ecam.c
@@ -32,7 +32,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev,
 	struct pci_config_window *cfg;
 	unsigned int bus_range, bus_range_max, bsz;
 	struct resource *conflict;
-	int i, err;
+	int err;
 
 	if (busr->start > busr->end)
 		return ERR_PTR(-EINVAL);
@@ -50,6 +50,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev,
 	cfg->busr.start = busr->start;
 	cfg->busr.end = busr->end;
 	cfg->busr.flags = IORESOURCE_BUS;
+	cfg->bus_shift = bus_shift;
 	bus_range = resource_size(&cfg->busr);
 	bus_range_max = resource_size(cfgres) >> bus_shift;
 	if (bus_range > bus_range_max) {
@@ -77,13 +78,6 @@ struct pci_config_window *pci_ecam_create(struct device *dev,
 		cfg->winp = kcalloc(bus_range, sizeof(*cfg->winp), GFP_KERNEL);
 		if (!cfg->winp)
 			goto err_exit_malloc;
-		for (i = 0; i < bus_range; i++) {
-			cfg->winp[i] =
-				pci_remap_cfgspace(cfgres->start + i * bsz,
-						   bsz);
-			if (!cfg->winp[i])
-				goto err_exit_iomap;
-		}
 	} else {
 		cfg->win = pci_remap_cfgspace(cfgres->start, bus_range * bsz);
 		if (!cfg->win)
@@ -129,6 +123,44 @@ void pci_ecam_free(struct pci_config_window *cfg)
 }
 EXPORT_SYMBOL_GPL(pci_ecam_free);
 
+static int pci_ecam_add_bus(struct pci_bus *bus)
+{
+	struct pci_config_window *cfg = bus->sysdata;
+	unsigned int bsz = 1 << cfg->bus_shift;
+	unsigned int busn = bus->number;
+	phys_addr_t start;
+
+	if (!per_bus_mapping)
+		return 0;
+
+	if (busn < cfg->busr.start || busn > cfg->busr.end)
+		return -EINVAL;
+
+	busn -= cfg->busr.start;
+	start = cfg->res.start + busn * bsz;
+
+	cfg->winp[busn] = pci_remap_cfgspace(start, bsz);
+	if (!cfg->winp[busn])
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void pci_ecam_remove_bus(struct pci_bus *bus)
+{
+	struct pci_config_window *cfg = bus->sysdata;
+	unsigned int busn = bus->number;
+
+	if (!per_bus_mapping || busn < cfg->busr.start || busn > cfg->busr.end)
+		return;
+
+	busn -= cfg->busr.start;
+	if (cfg->winp[busn]) {
+		iounmap(cfg->winp[busn]);
+		cfg->winp[busn] = NULL;
+	}
+}
+
 /*
  * Function to implement the pci_ops ->map_bus method
  */
@@ -167,6 +199,8 @@ EXPORT_SYMBOL_GPL(pci_ecam_map_bus);
 /* ECAM ops */
 const struct pci_ecam_ops pci_generic_ecam_ops = {
 	.pci_ops	= {
+		.add_bus	= pci_ecam_add_bus,
+		.remove_bus	= pci_ecam_remove_bus,
 		.map_bus	= pci_ecam_map_bus,
 		.read		= pci_generic_config_read,
 		.write		= pci_generic_config_write,
@@ -178,6 +212,8 @@ EXPORT_SYMBOL_GPL(pci_generic_ecam_ops);
 /* ECAM ops for 32-bit access only (non-compliant) */
 const struct pci_ecam_ops pci_32b_ops = {
 	.pci_ops	= {
+		.add_bus	= pci_ecam_add_bus,
+		.remove_bus	= pci_ecam_remove_bus,
 		.map_bus	= pci_ecam_map_bus,
 		.read		= pci_generic_config_read32,
 		.write		= pci_generic_config_write32,
@@ -187,6 +223,8 @@ const struct pci_ecam_ops pci_32b_ops = {
 /* ECAM ops for 32-bit read only (non-compliant) */
 const struct pci_ecam_ops pci_32b_read_ops = {
 	.pci_ops	= {
+		.add_bus	= pci_ecam_add_bus,
+		.remove_bus	= pci_ecam_remove_bus,
 		.map_bus	= pci_ecam_map_bus,
 		.read		= pci_generic_config_read32,
 		.write		= pci_generic_config_write,
diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h
index fbdadd4d8377..adea5a4771cf 100644
--- a/include/linux/pci-ecam.h
+++ b/include/linux/pci-ecam.h
@@ -55,6 +55,7 @@ struct pci_ecam_ops {
 struct pci_config_window {
 	struct resource			res;
 	struct resource			busr;
+	unsigned int			bus_shift;
 	void				*priv;
 	const struct pci_ecam_ops	*ops;
 	union {
-- 
cgit v1.2.3


From a5ae8fc9058e37437c8c1f82b3d412b4abd1b9e6 Mon Sep 17 00:00:00 2001
From: Dmytro Linkin <dlinkin@nvidia.com>
Date: Fri, 14 May 2021 11:14:19 +0300
Subject: net/mlx5e: Don't create devices during unload flow

Running devlink reload command for port in switchdev mode cause
resources to corrupt: driver can't release allocated EQ and reclaim
memory pages, because "rdma" auxiliary device had add CQs which blocks
EQ from deletion.
Erroneous sequence happens during reload-down phase, and is following:

1. detach device - suspends auxiliary devices which support it, destroys
   others. During this step "eth-rep" and "rdma-rep" are destroyed,
   "eth" - suspended.
2. disable SRIOV - moves device to legacy mode; as part of disablement -
   rescans drivers. This step adds "rdma" auxiliary device.
3. destroy EQ table - <failure>.

Driver shouldn't create any device during unload flows. To handle that
implement MLX5_PRIV_FLAGS_DETACH flag, set it on device detach and unset
on device attach. If flag is set do no-op on drivers rescan.

Fixes: a925b5e309c9 ("net/mlx5: Register mlx5 devices to auxiliary virtual bus")
Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c | 4 ++++
 include/linux/mlx5/driver.h                   | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index 8de118adfb54..ceebfc20f65e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -303,6 +303,7 @@ int mlx5_attach_device(struct mlx5_core_dev *dev)
 	int ret = 0, i;
 
 	mutex_lock(&mlx5_intf_mutex);
+	priv->flags &= ~MLX5_PRIV_FLAGS_DETACH;
 	for (i = 0; i < ARRAY_SIZE(mlx5_adev_devices); i++) {
 		if (!priv->adev[i]) {
 			bool is_supported = false;
@@ -375,6 +376,7 @@ skip_suspend:
 		del_adev(&priv->adev[i]->adev);
 		priv->adev[i] = NULL;
 	}
+	priv->flags |= MLX5_PRIV_FLAGS_DETACH;
 	mutex_unlock(&mlx5_intf_mutex);
 }
 
@@ -463,6 +465,8 @@ int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev)
 	struct mlx5_priv *priv = &dev->priv;
 
 	lockdep_assert_held(&mlx5_intf_mutex);
+	if (priv->flags & MLX5_PRIV_FLAGS_DETACH)
+		return 0;
 
 	delete_drivers(dev);
 	if (priv->flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 020a8f7fdbdd..f8902bcd91e2 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -542,6 +542,10 @@ struct mlx5_core_roce {
 enum {
 	MLX5_PRIV_FLAGS_DISABLE_IB_ADEV = 1 << 0,
 	MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV = 1 << 1,
+	/* Set during device detach to block any further devices
+	 * creation/deletion on drivers rescan. Unset during device attach.
+	 */
+	MLX5_PRIV_FLAGS_DETACH = 1 << 2,
 };
 
 struct mlx5_adev {
-- 
cgit v1.2.3


From 1d1f6cc5818c750ac69473e4951e7165913fbf16 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 15 Jun 2021 10:19:13 -0700
Subject: pstore/blk: Include zone in pstore_device_info

Information was redundant between struct pstore_zone_info and struct
pstore_device_info. Use struct pstore_zone_info, with member name "zone".

Additionally untangle the logic for the "best effort" block device
instance.

Signed-off-by: Kees Cook <keescook@chromium.org>
Fixed-by: Pu Lehui <pulehui@huawei.com>
Link: https://lore.kernel.org/lkml/20210617005424.182305-1-pulehui@huawei.com
---
 drivers/mtd/mtdpstore.c    |  10 ++--
 fs/pstore/blk.c            | 143 +++++++++++++++++++++++++--------------------
 include/linux/pstore_blk.h |  27 +--------
 3 files changed, 87 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdpstore.c b/drivers/mtd/mtdpstore.c
index a3ae8778f6a9..e13d42c0acb0 100644
--- a/drivers/mtd/mtdpstore.c
+++ b/drivers/mtd/mtdpstore.c
@@ -423,13 +423,13 @@ static void mtdpstore_notify_add(struct mtd_info *mtd)
 	longcnt = BITS_TO_LONGS(div_u64(mtd->size, mtd->erasesize));
 	cxt->badmap = kcalloc(longcnt, sizeof(long), GFP_KERNEL);
 
-	cxt->dev.total_size = mtd->size;
 	/* just support dmesg right now */
 	cxt->dev.flags = PSTORE_FLAGS_DMESG;
-	cxt->dev.read = mtdpstore_read;
-	cxt->dev.write = mtdpstore_write;
-	cxt->dev.erase = mtdpstore_erase;
-	cxt->dev.panic_write = mtdpstore_panic_write;
+	cxt->dev.zone.read = mtdpstore_read;
+	cxt->dev.zone.write = mtdpstore_write;
+	cxt->dev.zone.erase = mtdpstore_erase;
+	cxt->dev.zone.panic_write = mtdpstore_panic_write;
+	cxt->dev.zone.total_size = mtd->size;
 
 	ret = register_pstore_device(&cxt->dev);
 	if (ret) {
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index c373e0d73e6c..04ce58c939a0 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -70,7 +70,7 @@ MODULE_PARM_DESC(blkdev, "block device for pstore storage");
  */
 static DEFINE_MUTEX(pstore_blk_lock);
 static struct file *psblk_file;
-static struct pstore_zone_info *pstore_zone_info;
+static struct pstore_device_info *pstore_device_info;
 
 #define check_size(name, alignsize) ({				\
 	long _##name_ = (name);					\
@@ -91,7 +91,7 @@ static struct pstore_zone_info *pstore_zone_info;
 		_##name_ = 0;					\
 	/* Synchronize module parameters with resuls. */	\
 	name = _##name_ / 1024;					\
-	pstore_zone_info->name = _##name_;			\
+	dev->zone.name = _##name_;				\
 }
 
 static int __register_pstore_device(struct pstore_device_info *dev)
@@ -104,50 +104,42 @@ static int __register_pstore_device(struct pstore_device_info *dev)
 		pr_err("NULL device info\n");
 		return -EINVAL;
 	}
-	if (!dev->total_size) {
+	if (!dev->zone.total_size) {
 		pr_err("zero sized device\n");
 		return -EINVAL;
 	}
-	if (!dev->read) {
+	if (!dev->zone.read) {
 		pr_err("no read handler for device\n");
 		return -EINVAL;
 	}
-	if (!dev->write) {
+	if (!dev->zone.write) {
 		pr_err("no write handler for device\n");
 		return -EINVAL;
 	}
 
 	/* someone already registered before */
-	if (pstore_zone_info)
+	if (pstore_device_info)
 		return -EBUSY;
 
-	pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL);
-	if (!pstore_zone_info)
-		return -ENOMEM;
-
 	/* zero means not limit on which backends to attempt to store. */
 	if (!dev->flags)
 		dev->flags = UINT_MAX;
 
+	/* Copy in module parameters. */
 	verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG);
 	verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG);
 	verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE);
 	verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE);
+	dev->zone.max_reason = max_reason;
+
+	/* Initialize required zone ownership details. */
+	dev->zone.name = KBUILD_MODNAME;
+	dev->zone.owner = THIS_MODULE;
+
+	ret = register_pstore_zone(&dev->zone);
+	if (ret == 0)
+		pstore_device_info = dev;
 
-	pstore_zone_info->total_size = dev->total_size;
-	pstore_zone_info->max_reason = max_reason;
-	pstore_zone_info->read = dev->read;
-	pstore_zone_info->write = dev->write;
-	pstore_zone_info->erase = dev->erase;
-	pstore_zone_info->panic_write = dev->panic_write;
-	pstore_zone_info->name = KBUILD_MODNAME;
-	pstore_zone_info->owner = THIS_MODULE;
-
-	ret = register_pstore_zone(pstore_zone_info);
-	if (ret) {
-		kfree(pstore_zone_info);
-		pstore_zone_info = NULL;
-	}
 	return ret;
 }
 /**
@@ -174,10 +166,9 @@ EXPORT_SYMBOL_GPL(register_pstore_device);
 static void __unregister_pstore_device(struct pstore_device_info *dev)
 {
 	lockdep_assert_held(&pstore_blk_lock);
-	if (pstore_zone_info && pstore_zone_info->read == dev->read) {
-		unregister_pstore_zone(pstore_zone_info);
-		kfree(pstore_zone_info);
-		pstore_zone_info = NULL;
+	if (pstore_device_info && pstore_device_info == dev) {
+		unregister_pstore_zone(&dev->zone);
+		pstore_device_info = NULL;
 	}
 }
 
@@ -211,12 +202,9 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
 /*
  * This takes its configuration only from the module parameters now.
  */
-static int __register_pstore_blk(const char *devpath)
+static int __register_pstore_blk(struct pstore_device_info *dev,
+				 const char *devpath)
 {
-	struct pstore_device_info dev = {
-		.read = psblk_generic_blk_read,
-		.write = psblk_generic_blk_write,
-	};
 	struct inode *inode;
 	int ret = -ENODEV;
 
@@ -236,9 +224,9 @@ static int __register_pstore_blk(const char *devpath)
 	}
 
 	inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode;
-	dev.total_size = i_size_read(inode);
+	dev->zone.total_size = i_size_read(inode);
 
-	ret = __register_pstore_device(&dev);
+	ret = __register_pstore_device(dev);
 	if (ret)
 		goto err_fput;
 
@@ -252,18 +240,6 @@ err:
 	return ret;
 }
 
-static void __unregister_pstore_blk(struct file *device)
-{
-	struct pstore_device_info dev = { .read = psblk_generic_blk_read };
-
-	lockdep_assert_held(&pstore_blk_lock);
-	if (psblk_file && psblk_file == device) {
-		__unregister_pstore_device(&dev);
-		fput(psblk_file);
-		psblk_file = NULL;
-	}
-}
-
 /* get information of pstore/blk */
 int pstore_blk_get_config(struct pstore_blk_config *info)
 {
@@ -308,18 +284,63 @@ static inline const char *early_boot_devpath(const char *initial_devname)
 }
 #endif
 
+static int __init __best_effort_init(void)
+{
+	struct pstore_device_info *best_effort_dev;
+	int ret;
+
+	/* No best-effort mode requested. */
+	if (!best_effort)
+		return 0;
+
+	/* Reject an empty blkdev. */
+	if (!blkdev[0]) {
+		pr_err("blkdev empty with best_effort=Y\n");
+		return -EINVAL;
+	}
+
+	best_effort_dev = kzalloc(sizeof(*best_effort_dev), GFP_KERNEL);
+	if (!best_effort_dev)
+		return -ENOMEM;
+
+	best_effort_dev->zone.read = psblk_generic_blk_read;
+	best_effort_dev->zone.write = psblk_generic_blk_write;
+
+	ret = __register_pstore_blk(best_effort_dev,
+				    early_boot_devpath(blkdev));
+	if (ret)
+		kfree(best_effort_dev);
+	else
+		pr_info("attached %s (%zu) (no dedicated panic_write!)\n",
+			blkdev, best_effort_dev->zone.total_size);
+
+	return ret;
+}
+
+static void __exit __best_effort_exit(void)
+{
+	/*
+	 * Currently, the only user of psblk_file is best_effort, so
+	 * we can assume that pstore_device_info is associated with it.
+	 * Once there are "real" blk devices, there will need to be a
+	 * dedicated pstore_blk_info, etc.
+	 */
+	if (psblk_file) {
+		struct pstore_device_info *dev = pstore_device_info;
+
+		__unregister_pstore_device(dev);
+		kfree(dev);
+		fput(psblk_file);
+		psblk_file = NULL;
+	}
+}
+
 static int __init pstore_blk_init(void)
 {
-	int ret = 0;
+	int ret;
 
 	mutex_lock(&pstore_blk_lock);
-	if (!pstore_zone_info && best_effort && blkdev[0]) {
-		ret = __register_pstore_blk(early_boot_devpath(blkdev));
-		if (ret == 0 && pstore_zone_info)
-			pr_info("attached %s:%s (%zu) (no dedicated panic_write!)\n",
-				pstore_zone_info->name, blkdev,
-				pstore_zone_info->total_size);
-	}
+	ret = __best_effort_init();
 	mutex_unlock(&pstore_blk_lock);
 
 	return ret;
@@ -329,15 +350,9 @@ late_initcall(pstore_blk_init);
 static void __exit pstore_blk_exit(void)
 {
 	mutex_lock(&pstore_blk_lock);
-	if (psblk_file)
-		__unregister_pstore_blk(psblk_file);
-	else {
-		struct pstore_device_info dev = { };
-
-		if (pstore_zone_info)
-			dev.read = pstore_zone_info->read;
-		__unregister_pstore_device(&dev);
-	}
+	__best_effort_exit();
+	/* If we've been asked to unload, unregister any remaining device. */
+	__unregister_pstore_device(pstore_device_info);
 	mutex_unlock(&pstore_blk_lock);
 }
 module_exit(pstore_blk_exit);
diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h
index 99564f93d774..924ca07aafbd 100644
--- a/include/linux/pstore_blk.h
+++ b/include/linux/pstore_blk.h
@@ -10,36 +10,15 @@
 /**
  * struct pstore_device_info - back-end pstore/blk driver structure.
  *
- * @total_size: The total size in bytes pstore/blk can use. It must be greater
- *		than 4096 and be multiple of 4096.
  * @flags:	Refer to macro starting with PSTORE_FLAGS defined in
  *		linux/pstore.h. It means what front-ends this device support.
  *		Zero means all backends for compatible.
- * @read:	The general read operation. Both of the function parameters
- *		@size and @offset are relative value to bock device (not the
- *		whole disk).
- *		On success, the number of bytes should be returned, others
- *		means error.
- * @write:	The same as @read, but the following error number:
- *		-EBUSY means try to write again later.
- *		-ENOMSG means to try next zone.
- * @erase:	The general erase operation for device with special removing
- *		job. Both of the function parameters @size and @offset are
- *		relative value to storage.
- *		Return 0 on success and others on failure.
- * @panic_write:The write operation only used for panic case. It's optional
- *		if you do not care panic log. The parameters are relative
- *		value to storage.
- *		On success, the number of bytes should be returned, others
- *		excluding -ENOMSG mean error. -ENOMSG means to try next zone.
+ * @zone:	The struct pstore_zone_info details.
+ *
  */
 struct pstore_device_info {
-	unsigned long total_size;
 	unsigned int flags;
-	pstore_zone_read_op read;
-	pstore_zone_write_op write;
-	pstore_zone_erase_op erase;
-	pstore_zone_write_op panic_write;
+	struct pstore_zone_info zone;
 };
 
 int  register_pstore_device(struct pstore_device_info *dev);
-- 
cgit v1.2.3


From 10ff9976d06fc6a11f512755d500ab2860cbe650 Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Fri, 11 Jun 2021 10:01:00 +0800
Subject: crypto: api - remove CRYPTOA_U32 and related functions

According to the advice of Eric and Herbert, type CRYPTOA_U32
has been unused for over a decade, so remove the code related to
CRYPTOA_U32.

After removing CRYPTOA_U32, the type of the variable attrs can be
changed from union to struct.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algapi.c         | 18 ------------------
 crypto/algboss.c        | 31 ++++++-------------------------
 include/crypto/algapi.h |  1 -
 include/linux/crypto.h  |  5 -----
 4 files changed, 6 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/algapi.c b/crypto/algapi.c
index fdabf2675b63..43f999dba4dc 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -868,24 +868,6 @@ const char *crypto_attr_alg_name(struct rtattr *rta)
 }
 EXPORT_SYMBOL_GPL(crypto_attr_alg_name);
 
-int crypto_attr_u32(struct rtattr *rta, u32 *num)
-{
-	struct crypto_attr_u32 *nu32;
-
-	if (!rta)
-		return -ENOENT;
-	if (RTA_PAYLOAD(rta) < sizeof(*nu32))
-		return -EINVAL;
-	if (rta->rta_type != CRYPTOA_U32)
-		return -EINVAL;
-
-	nu32 = RTA_DATA(rta);
-	*num = nu32->num;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_attr_u32);
-
 int crypto_inst_setname(struct crypto_instance *inst, const char *name,
 			struct crypto_alg *alg)
 {
diff --git a/crypto/algboss.c b/crypto/algboss.c
index 5ebccbd6b74e..1814d2c5188a 100644
--- a/crypto/algboss.c
+++ b/crypto/algboss.c
@@ -28,16 +28,9 @@ struct cryptomgr_param {
 		struct crypto_attr_type data;
 	} type;
 
-	union {
+	struct {
 		struct rtattr attr;
-		struct {
-			struct rtattr attr;
-			struct crypto_attr_alg data;
-		} alg;
-		struct {
-			struct rtattr attr;
-			struct crypto_attr_u32 data;
-		} nu32;
+		struct crypto_attr_alg data;
 	} attrs[CRYPTO_MAX_ATTRS];
 
 	char template[CRYPTO_MAX_ALG_NAME];
@@ -104,12 +97,10 @@ static int cryptomgr_schedule_probe(struct crypto_larval *larval)
 
 	i = 0;
 	for (;;) {
-		int notnum = 0;
-
 		name = ++p;
 
 		for (; isalnum(*p) || *p == '-' || *p == '_'; p++)
-			notnum |= !isdigit(*p);
+			;
 
 		if (*p == '(') {
 			int recursion = 0;
@@ -123,7 +114,6 @@ static int cryptomgr_schedule_probe(struct crypto_larval *larval)
 					break;
 			}
 
-			notnum = 1;
 			p++;
 		}
 
@@ -131,18 +121,9 @@ static int cryptomgr_schedule_probe(struct crypto_larval *larval)
 		if (!len)
 			goto err_free_param;
 
-		if (notnum) {
-			param->attrs[i].alg.attr.rta_len =
-				sizeof(param->attrs[i].alg);
-			param->attrs[i].alg.attr.rta_type = CRYPTOA_ALG;
-			memcpy(param->attrs[i].alg.data.name, name, len);
-		} else {
-			param->attrs[i].nu32.attr.rta_len =
-				sizeof(param->attrs[i].nu32);
-			param->attrs[i].nu32.attr.rta_type = CRYPTOA_U32;
-			param->attrs[i].nu32.data.num =
-				simple_strtol(name, NULL, 0);
-		}
+		param->attrs[i].attr.rta_len = sizeof(param->attrs[i]);
+		param->attrs[i].attr.rta_type = CRYPTOA_ALG;
+		memcpy(param->attrs[i].data.name, name, len);
 
 		param->tb[i + 1] = &param->attrs[i].attr;
 		i++;
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 86f0748009af..41d42e649da4 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -118,7 +118,6 @@ void *crypto_spawn_tfm2(struct crypto_spawn *spawn);
 struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb);
 int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret);
 const char *crypto_attr_alg_name(struct rtattr *rta);
-int crypto_attr_u32(struct rtattr *rta, u32 *num);
 int crypto_inst_setname(struct crypto_instance *inst, const char *name,
 			struct crypto_alg *alg);
 
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index da5e0d74bb2f..3b9263d6122f 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -647,7 +647,6 @@ enum {
 	CRYPTOA_UNSPEC,
 	CRYPTOA_ALG,
 	CRYPTOA_TYPE,
-	CRYPTOA_U32,
 	__CRYPTOA_MAX,
 };
 
@@ -665,10 +664,6 @@ struct crypto_attr_type {
 	u32 mask;
 };
 
-struct crypto_attr_u32 {
-	u32 num;
-};
-
 /* 
  * Transform user interface.
  */
-- 
cgit v1.2.3


From 7a2c4cc537fa9f05fe90812e7d789b9faf7eb869 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Tue, 8 Jun 2021 13:09:34 +0300
Subject: devm-helpers: Add resource managed version of work init

A few drivers which need a work-queue must cancel work at driver detach.
Some of those implement remove() solely for this purpose. Help drivers to
avoid unnecessary remove and error-branch implementation by adding managed
verision of work initialization. This will also help drivers to avoid
mixing manual and devm based unwinding when other resources are handled by
devm.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/94ff4175e7f2ff134ed2fa7d6e7641005cc9784b.1623146580.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/devm-helpers.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/devm-helpers.h b/include/linux/devm-helpers.h
index f40f77717a24..74891802200d 100644
--- a/include/linux/devm-helpers.h
+++ b/include/linux/devm-helpers.h
@@ -51,4 +51,29 @@ static inline int devm_delayed_work_autocancel(struct device *dev,
 	return devm_add_action(dev, devm_delayed_work_drop, w);
 }
 
+static inline void devm_work_drop(void *res)
+{
+	cancel_work_sync(res);
+}
+
+/**
+ * devm_work_autocancel - Resource-managed work allocation
+ * @dev:	Device which lifetime work is bound to
+ * @w:		Work to be added (and automatically cancelled)
+ * @worker:	Worker function
+ *
+ * Initialize work which is automatically cancelled when driver is detached.
+ * A few drivers need to queue work which must be cancelled before driver
+ * is detached to avoid accessing removed resources.
+ * devm_work_autocancel() can be used to omit the explicit
+ * cancelleation when driver is detached.
+ */
+static inline int devm_work_autocancel(struct device *dev,
+				       struct work_struct *w,
+				       work_func_t worker)
+{
+	INIT_WORK(w, worker);
+	return devm_add_action(dev, devm_work_drop, w);
+}
+
 #endif
-- 
cgit v1.2.3


From 86c908d90fb17273f5f6d15539ad3d7bf134d892 Mon Sep 17 00:00:00 2001
From: Erik Rosen <erik.rosen@metormote.com>
Date: Fri, 7 May 2021 21:40:21 +0200
Subject: hwmon: (pmbus) Add new flag PMBUS_READ_STATUS_AFTER_FAILED_CHECK

Some PMBus chips end up in an undefined state when trying to read an
unsupported register. For such chips, it is necessary to reset the
chip pmbus controller to a known state after a failed register check.
This can be done by reading a known register. By setting this flag the
driver will try to read the STATUS register after each failed
register check. This read may fail, but it will put the chip into a
known state.

Signed-off-by: Erik Rosen <erik.rosen@metormote.com>
Link: https://lore.kernel.org/r/20210507194023.61138-2-erik.rosen@metormote.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/pmbus/pmbus_core.c |  2 ++
 include/linux/pmbus.h            | 13 +++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index bbd745178147..1f7fa5337974 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -523,6 +523,8 @@ static bool pmbus_check_register(struct i2c_client *client,
 	rv = func(client, page, reg);
 	if (rv >= 0 && !(data->flags & PMBUS_SKIP_STATUS_CHECK))
 		rv = pmbus_check_status_cml(client);
+	if (rv < 0 && (data->flags & PMBUS_READ_STATUS_AFTER_FAILED_CHECK))
+		data->read_status(client, -1);
 	pmbus_clear_fault_page(client, -1);
 	return rv >= 0;
 }
diff --git a/include/linux/pmbus.h b/include/linux/pmbus.h
index 12cbbf305969..edd7c84fef65 100644
--- a/include/linux/pmbus.h
+++ b/include/linux/pmbus.h
@@ -43,6 +43,19 @@
  */
 #define PMBUS_NO_CAPABILITY			BIT(2)
 
+/*
+ * PMBUS_READ_STATUS_AFTER_FAILED_CHECK
+ *
+ * Some PMBus chips end up in an undefined state when trying to read an
+ * unsupported register. For such chips, it is necessary to reset the
+ * chip pmbus controller to a known state after a failed register check.
+ * This can be done by reading a known register. By setting this flag the
+ * driver will try to read the STATUS register after each failed
+ * register check. This read may fail, but it will put the chip in a
+ * known state.
+ */
+#define PMBUS_READ_STATUS_AFTER_FAILED_CHECK	BIT(3)
+
 struct pmbus_platform_data {
 	u32 flags;		/* Device specific flags */
 
-- 
cgit v1.2.3


From dbc0860f7a3d43604c380822a456d26ef6f70a06 Mon Sep 17 00:00:00 2001
From: Erik Rosen <erik.rosen@metormote.com>
Date: Wed, 9 Jun 2021 11:32:05 +0200
Subject: hwmon: (pmbus) Add new pmbus flag NO_WRITE_PROTECT

Some PMBus chips respond with invalid data when reading the WRITE_PROTECT
register. For such chips, this flag should be set so that the PMBus core
driver doesn't use the WRITE_PROTECT command to determine its behavior.

Signed-off-by: Erik Rosen <erik.rosen@metormote.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/pmbus/pmbus_core.c | 8 +++++---
 include/linux/pmbus.h            | 9 +++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index 01a1ffc74bb6..a5367df1cee8 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -2231,9 +2231,11 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
 	 * faults, and we should not try it. Also, in that case, writes into
 	 * limit registers need to be disabled.
 	 */
-	ret = i2c_smbus_read_byte_data(client, PMBUS_WRITE_PROTECT);
-	if (ret > 0 && (ret & PB_WP_ANY))
-		data->flags |= PMBUS_WRITE_PROTECTED | PMBUS_SKIP_STATUS_CHECK;
+	if (!(data->flags & PMBUS_NO_WRITE_PROTECT)) {
+		ret = i2c_smbus_read_byte_data(client, PMBUS_WRITE_PROTECT);
+		if (ret > 0 && (ret & PB_WP_ANY))
+			data->flags |= PMBUS_WRITE_PROTECTED | PMBUS_SKIP_STATUS_CHECK;
+	}
 
 	if (data->info->pages)
 		pmbus_clear_faults(client);
diff --git a/include/linux/pmbus.h b/include/linux/pmbus.h
index edd7c84fef65..12c515a27d3a 100644
--- a/include/linux/pmbus.h
+++ b/include/linux/pmbus.h
@@ -56,6 +56,15 @@
  */
 #define PMBUS_READ_STATUS_AFTER_FAILED_CHECK	BIT(3)
 
+/*
+ * PMBUS_NO_WRITE_PROTECT
+ *
+ * Some PMBus chips respond with invalid data when reading the WRITE_PROTECT
+ * register. For such chips, this flag should be set so that the PMBus core
+ * driver doesn't use the WRITE_PROTECT command to determine its behavior.
+ */
+#define PMBUS_NO_WRITE_PROTECT			BIT(4)
+
 struct pmbus_platform_data {
 	u32 flags;		/* Device specific flags */
 
-- 
cgit v1.2.3


From e8e00c83a268d5b7d2f5bd490c2269c1ede76a07 Mon Sep 17 00:00:00 2001
From: Erik Rosen <erik.rosen@metormote.com>
Date: Wed, 9 Jun 2021 11:32:06 +0200
Subject: hwmon: (pmbus) Add support for reading direct mode coefficients

Add support for reading and decoding direct format coefficients to
the PMBus core driver. If the new flag PMBUS_USE_COEFFICIENTS_CMD
is set, the driver will use the COEFFICIENTS register together with
the information in the pmbus_sensor_attr structs to initialize
relevant coefficients for the direct mode format.

Signed-off-by: Erik Rosen <erik.rosen@metormote.com>
[groeck: Initialize ret with -EINVAL in pmbus_init_coefficients()]
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/pmbus/pmbus_core.c | 116 +++++++++++++++++++++++++++++++++++++++
 include/linux/pmbus.h            |   8 +++
 2 files changed, 124 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index a5367df1cee8..c4f557c8955b 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -2141,6 +2141,111 @@ static int pmbus_find_attributes(struct i2c_client *client,
 	return ret;
 }
 
+/*
+ * The pmbus_class_attr_map structure maps one sensor class to
+ * it's corresponding sensor attributes array.
+ */
+struct pmbus_class_attr_map {
+	enum pmbus_sensor_classes class;
+	int nattr;
+	const struct pmbus_sensor_attr *attr;
+};
+
+static const struct pmbus_class_attr_map class_attr_map[] = {
+	{
+		.class = PSC_VOLTAGE_IN,
+		.attr = voltage_attributes,
+		.nattr = ARRAY_SIZE(voltage_attributes),
+	}, {
+		.class = PSC_VOLTAGE_OUT,
+		.attr = voltage_attributes,
+		.nattr = ARRAY_SIZE(voltage_attributes),
+	}, {
+		.class = PSC_CURRENT_IN,
+		.attr = current_attributes,
+		.nattr = ARRAY_SIZE(current_attributes),
+	}, {
+		.class = PSC_CURRENT_OUT,
+		.attr = current_attributes,
+		.nattr = ARRAY_SIZE(current_attributes),
+	}, {
+		.class = PSC_POWER,
+		.attr = power_attributes,
+		.nattr = ARRAY_SIZE(power_attributes),
+	}, {
+		.class = PSC_TEMPERATURE,
+		.attr = temp_attributes,
+		.nattr = ARRAY_SIZE(temp_attributes),
+	}
+};
+
+/*
+ * Read the coefficients for direct mode.
+ */
+static int pmbus_read_coefficients(struct i2c_client *client,
+				   struct pmbus_driver_info *info,
+				   const struct pmbus_sensor_attr *attr)
+{
+	int rv;
+	union i2c_smbus_data data;
+	enum pmbus_sensor_classes class = attr->class;
+	s8 R;
+	s16 m, b;
+
+	data.block[0] = 2;
+	data.block[1] = attr->reg;
+	data.block[2] = 0x01;
+
+	rv = i2c_smbus_xfer(client->adapter, client->addr, client->flags,
+			    I2C_SMBUS_WRITE, PMBUS_COEFFICIENTS,
+			    I2C_SMBUS_BLOCK_PROC_CALL, &data);
+
+	if (rv < 0)
+		return rv;
+
+	if (data.block[0] != 5)
+		return -EIO;
+
+	m = data.block[1] | (data.block[2] << 8);
+	b = data.block[3] | (data.block[4] << 8);
+	R = data.block[5];
+	info->m[class] = m;
+	info->b[class] = b;
+	info->R[class] = R;
+
+	return rv;
+}
+
+static int pmbus_init_coefficients(struct i2c_client *client,
+				   struct pmbus_driver_info *info)
+{
+	int i, n, ret = -EINVAL;
+	const struct pmbus_class_attr_map *map;
+	const struct pmbus_sensor_attr *attr;
+
+	for (i = 0; i < ARRAY_SIZE(class_attr_map); i++) {
+		map = &class_attr_map[i];
+		if (info->format[map->class] != direct)
+			continue;
+		for (n = 0; n < map->nattr; n++) {
+			attr = &map->attr[n];
+			if (map->class != attr->class)
+				continue;
+			ret = pmbus_read_coefficients(client, info, attr);
+			if (ret >= 0)
+				break;
+		}
+		if (ret < 0) {
+			dev_err(&client->dev,
+				"No coefficients found for sensor class %d\n",
+				map->class);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Identify chip parameters.
  * This function is called for all chips.
@@ -2262,6 +2367,17 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
 			return ret;
 		}
 	}
+
+	if (data->flags & PMBUS_USE_COEFFICIENTS_CMD) {
+		if (!i2c_check_functionality(client->adapter,
+					     I2C_FUNC_SMBUS_BLOCK_PROC_CALL))
+			return -ENODEV;
+
+		ret = pmbus_init_coefficients(client, info);
+		if (ret < 0)
+			return ret;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/pmbus.h b/include/linux/pmbus.h
index 12c515a27d3a..fa9f08164c36 100644
--- a/include/linux/pmbus.h
+++ b/include/linux/pmbus.h
@@ -65,6 +65,14 @@
  */
 #define PMBUS_NO_WRITE_PROTECT			BIT(4)
 
+/*
+ * PMBUS_USE_COEFFICIENTS_CMD
+ *
+ * When this flag is set the PMBus core driver will use the COEFFICIENTS
+ * register to initialize the coefficients for the direct mode format.
+ */
+#define PMBUS_USE_COEFFICIENTS_CMD		BIT(5)
+
 struct pmbus_platform_data {
 	u32 flags;		/* Device specific flags */
 
-- 
cgit v1.2.3


From 8f1b971b4750e83e8fbd2f91a9efd4a38ad0ae51 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Mon, 14 Jun 2021 20:12:38 +0100
Subject: sched/cpufreq: Consider reduced CPU capacity in energy calculation

Energy Aware Scheduling (EAS) needs to predict the decisions made by
SchedUtil. The map_util_freq() exists to do that.

There are corner cases where the max allowed frequency might be reduced
(due to thermal). SchedUtil as a CPUFreq governor, is aware of that
but EAS is not. This patch aims to address it.

SchedUtil stores the maximum allowed frequency in
'sugov_policy::next_freq' field. EAS has to predict that value, which is
the real used frequency. That value is made after a call to
cpufreq_driver_resolve_freq() which clamps to the CPUFreq policy limits.
In the existing code EAS is not able to predict that real frequency.
This leads to energy estimation errors.

To avoid wrong energy estimation in EAS (due to frequency miss prediction)
make sure that the step which calculates Performance Domain frequency,
is also aware of the allowed CPU capacity.

Furthermore, modify map_util_freq() to not extend the frequency value.
Instead, use map_util_perf() to extend the util value in both places:
SchedUtil and EAS, but for EAS clamp it to max allowed CPU capacity.
In the end, we achieve the same desirable behavior for both subsystems
and alignment in regards to the real CPU frequency.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> (For the schedutil part)
Link: https://lore.kernel.org/r/20210614191238.23224-1-lukasz.luba@arm.com
---
 include/linux/energy_model.h     | 16 +++++++++++++---
 include/linux/sched/cpufreq.h    |  2 +-
 kernel/sched/cpufreq_schedutil.c |  1 +
 kernel/sched/fair.c              |  2 +-
 4 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 757fc60658fa..3f221dbf5f95 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -91,6 +91,8 @@ void em_dev_unregister_perf_domain(struct device *dev);
  * @pd		: performance domain for which energy has to be estimated
  * @max_util	: highest utilization among CPUs of the domain
  * @sum_util	: sum of the utilization of all CPUs in the domain
+ * @allowed_cpu_cap	: maximum allowed CPU capacity for the @pd, which
+			  might reflect reduced frequency (due to thermal)
  *
  * This function must be used only for CPU devices. There is no validation,
  * i.e. if the EM is a CPU type and has cpumask allocated. It is called from
@@ -100,7 +102,8 @@ void em_dev_unregister_perf_domain(struct device *dev);
  * a capacity state satisfying the max utilization of the domain.
  */
 static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
-				unsigned long max_util, unsigned long sum_util)
+				unsigned long max_util, unsigned long sum_util,
+				unsigned long allowed_cpu_cap)
 {
 	unsigned long freq, scale_cpu;
 	struct em_perf_state *ps;
@@ -112,11 +115,17 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	/*
 	 * In order to predict the performance state, map the utilization of
 	 * the most utilized CPU of the performance domain to a requested
-	 * frequency, like schedutil.
+	 * frequency, like schedutil. Take also into account that the real
+	 * frequency might be set lower (due to thermal capping). Thus, clamp
+	 * max utilization to the allowed CPU capacity before calculating
+	 * effective frequency.
 	 */
 	cpu = cpumask_first(to_cpumask(pd->cpus));
 	scale_cpu = arch_scale_cpu_capacity(cpu);
 	ps = &pd->table[pd->nr_perf_states - 1];
+
+	max_util = map_util_perf(max_util);
+	max_util = min(max_util, allowed_cpu_cap);
 	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
 
 	/*
@@ -209,7 +218,8 @@ static inline struct em_perf_domain *em_pd_get(struct device *dev)
 	return NULL;
 }
 static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
-			unsigned long max_util, unsigned long sum_util)
+			unsigned long max_util, unsigned long sum_util,
+			unsigned long allowed_cpu_cap)
 {
 	return 0;
 }
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 6205578ab6ee..bdd31ab93bc5 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -26,7 +26,7 @@ bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy);
 static inline unsigned long map_util_freq(unsigned long util,
 					unsigned long freq, unsigned long cap)
 {
-	return (freq + (freq >> 2)) * util / cap;
+	return freq * util / cap;
 }
 
 static inline unsigned long map_util_perf(unsigned long util)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4f09afd2f321..57124614363d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -151,6 +151,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	unsigned int freq = arch_scale_freq_invariant() ?
 				policy->cpuinfo.max_freq : policy->cur;
 
+	util = map_util_perf(util);
 	freq = map_util_freq(util, freq, max);
 
 	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d6d190accb0..ed7df1b9cba9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6592,7 +6592,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 		max_util = max(max_util, min(cpu_util, _cpu_cap));
 	}
 
-	return em_cpu_energy(pd->em_pd, max_util, sum_util);
+	return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
 }
 
 /*
-- 
cgit v1.2.3


From 09aa9aabdcc4966270b031816a16d4641fb45dfa Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 25 Aug 2019 21:57:25 +0200
Subject: soc: ixp4xx: move cpu detection to linux/soc/ixp4xx/cpu.h

Generic drivers are unable to use the feature macros from mach/cpu.h
or the feature bits from mach/hardware.h, so move these into a global
header file along with some dummy helpers that list these features as
disabled elsewhere.

Cc: David S. Miller <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org
Cc: Zoltan HERPAI <wigyori@uid0.hu>
Cc: Raylynn Knight <rayknight@me.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 arch/arm/mach-ixp4xx/common.c                   |  22 +++++
 arch/arm/mach-ixp4xx/include/mach/cpu.h         |  54 ------------
 arch/arm/mach-ixp4xx/include/mach/hardware.h    |   2 +-
 arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h |  54 ------------
 drivers/crypto/ixp4xx_crypto.c                  |   4 +
 drivers/net/ethernet/xscale/ixp4xx_eth.c        |   1 +
 drivers/net/ethernet/xscale/ptp_ixp46x.c        |   3 +-
 drivers/net/wan/ixp4xx_hss.c                    |   1 +
 drivers/soc/ixp4xx/ixp4xx-npe.c                 |   2 +-
 drivers/soc/ixp4xx/ixp4xx-qmgr.c                |   2 +-
 include/linux/soc/ixp4xx/cpu.h                  | 106 ++++++++++++++++++++++++
 11 files changed, 138 insertions(+), 113 deletions(-)
 delete mode 100644 arch/arm/mach-ixp4xx/include/mach/cpu.h
 create mode 100644 include/linux/soc/ixp4xx/cpu.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c
index 431da1b4f6bd..a7faf198e9d4 100644
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -27,6 +27,7 @@
 #include <linux/cpu.h>
 #include <linux/pci.h>
 #include <linux/sched_clock.h>
+#include <linux/soc/ixp4xx/cpu.h>
 #include <linux/irqchip/irq-ixp4xx.h>
 #include <linux/platform_data/timer-ixp4xx.h>
 #include <linux/dma-map-ops.h>
@@ -43,6 +44,27 @@
 
 #include "irqs.h"
 
+u32 ixp4xx_read_feature_bits(void)
+{
+	u32 val = ~__raw_readl(IXP4XX_EXP_CFG2);
+
+	if (cpu_is_ixp42x_rev_a0())
+		return IXP42X_FEATURE_MASK & ~(IXP4XX_FEATURE_RCOMP |
+					       IXP4XX_FEATURE_AES);
+	if (cpu_is_ixp42x())
+		return val & IXP42X_FEATURE_MASK;
+	if (cpu_is_ixp43x())
+		return val & IXP43X_FEATURE_MASK;
+	return val & IXP46X_FEATURE_MASK;
+}
+EXPORT_SYMBOL(ixp4xx_read_feature_bits);
+
+void ixp4xx_write_feature_bits(u32 value)
+{
+	__raw_writel(~value, IXP4XX_EXP_CFG2);
+}
+EXPORT_SYMBOL(ixp4xx_write_feature_bits);
+
 #define IXP4XX_TIMER_FREQ 66666000
 
 /*************************************************************************
diff --git a/arch/arm/mach-ixp4xx/include/mach/cpu.h b/arch/arm/mach-ixp4xx/include/mach/cpu.h
deleted file mode 100644
index b872a5354ddd..000000000000
--- a/arch/arm/mach-ixp4xx/include/mach/cpu.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * arch/arm/mach-ixp4xx/include/mach/cpu.h
- *
- * IXP4XX cpu type detection
- *
- * Copyright (C) 2007 MontaVista Software, Inc.
- */
-
-#ifndef __ASM_ARCH_CPU_H__
-#define __ASM_ARCH_CPU_H__
-
-#include <linux/io.h>
-#include <asm/cputype.h>
-
-/* Processor id value in CP15 Register 0 */
-#define IXP42X_PROCESSOR_ID_VALUE	0x690541c0 /* including unused 0x690541Ex */
-#define IXP42X_PROCESSOR_ID_MASK	0xffffffc0
-
-#define IXP43X_PROCESSOR_ID_VALUE	0x69054040
-#define IXP43X_PROCESSOR_ID_MASK	0xfffffff0
-
-#define IXP46X_PROCESSOR_ID_VALUE	0x69054200 /* including IXP455 */
-#define IXP46X_PROCESSOR_ID_MASK	0xfffffff0
-
-#define cpu_is_ixp42x_rev_a0() ((read_cpuid_id() & (IXP42X_PROCESSOR_ID_MASK | 0xF)) == \
-				IXP42X_PROCESSOR_ID_VALUE)
-#define cpu_is_ixp42x()	((read_cpuid_id() & IXP42X_PROCESSOR_ID_MASK) == \
-			 IXP42X_PROCESSOR_ID_VALUE)
-#define cpu_is_ixp43x()	((read_cpuid_id() & IXP43X_PROCESSOR_ID_MASK) == \
-			 IXP43X_PROCESSOR_ID_VALUE)
-#define cpu_is_ixp46x()	((read_cpuid_id() & IXP46X_PROCESSOR_ID_MASK) == \
-			 IXP46X_PROCESSOR_ID_VALUE)
-
-static inline u32 ixp4xx_read_feature_bits(void)
-{
-	u32 val = ~__raw_readl(IXP4XX_EXP_CFG2);
-
-	if (cpu_is_ixp42x_rev_a0())
-		return IXP42X_FEATURE_MASK & ~(IXP4XX_FEATURE_RCOMP |
-					       IXP4XX_FEATURE_AES);
-	if (cpu_is_ixp42x())
-		return val & IXP42X_FEATURE_MASK;
-	if (cpu_is_ixp43x())
-		return val & IXP43X_FEATURE_MASK;
-	return val & IXP46X_FEATURE_MASK;
-}
-
-static inline void ixp4xx_write_feature_bits(u32 value)
-{
-	__raw_writel(~value, IXP4XX_EXP_CFG2);
-}
-
-#endif  /* _ASM_ARCH_CPU_H */
diff --git a/arch/arm/mach-ixp4xx/include/mach/hardware.h b/arch/arm/mach-ixp4xx/include/mach/hardware.h
index b884eedcd0fc..b2b7301ce503 100644
--- a/arch/arm/mach-ixp4xx/include/mach/hardware.h
+++ b/arch/arm/mach-ixp4xx/include/mach/hardware.h
@@ -23,7 +23,7 @@
 #include "ixp4xx-regs.h"
 
 #ifndef __ASSEMBLER__
-#include <mach/cpu.h>
+#include <linux/soc/ixp4xx/cpu.h>
 #endif
 
 /* Platform helper functions and definitions */
diff --git a/arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h b/arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h
index f375c1c005d4..abb07f105515 100644
--- a/arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h
+++ b/arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h
@@ -300,58 +300,4 @@
 
 #define DCMD_LENGTH	0x01fff		/* length mask (max = 8K - 1) */
 
-/* "fuse" bits of IXP_EXP_CFG2 */
-/* All IXP4xx CPUs */
-#define IXP4XX_FEATURE_RCOMP		(1 << 0)
-#define IXP4XX_FEATURE_USB_DEVICE	(1 << 1)
-#define IXP4XX_FEATURE_HASH		(1 << 2)
-#define IXP4XX_FEATURE_AES		(1 << 3)
-#define IXP4XX_FEATURE_DES		(1 << 4)
-#define IXP4XX_FEATURE_HDLC		(1 << 5)
-#define IXP4XX_FEATURE_AAL		(1 << 6)
-#define IXP4XX_FEATURE_HSS		(1 << 7)
-#define IXP4XX_FEATURE_UTOPIA		(1 << 8)
-#define IXP4XX_FEATURE_NPEB_ETH0	(1 << 9)
-#define IXP4XX_FEATURE_NPEC_ETH		(1 << 10)
-#define IXP4XX_FEATURE_RESET_NPEA	(1 << 11)
-#define IXP4XX_FEATURE_RESET_NPEB	(1 << 12)
-#define IXP4XX_FEATURE_RESET_NPEC	(1 << 13)
-#define IXP4XX_FEATURE_PCI		(1 << 14)
-#define IXP4XX_FEATURE_UTOPIA_PHY_LIMIT	(3 << 16)
-#define IXP4XX_FEATURE_XSCALE_MAX_FREQ	(3 << 22)
-#define IXP42X_FEATURE_MASK		(IXP4XX_FEATURE_RCOMP            | \
-					 IXP4XX_FEATURE_USB_DEVICE       | \
-					 IXP4XX_FEATURE_HASH             | \
-					 IXP4XX_FEATURE_AES              | \
-					 IXP4XX_FEATURE_DES              | \
-					 IXP4XX_FEATURE_HDLC             | \
-					 IXP4XX_FEATURE_AAL              | \
-					 IXP4XX_FEATURE_HSS              | \
-					 IXP4XX_FEATURE_UTOPIA           | \
-					 IXP4XX_FEATURE_NPEB_ETH0        | \
-					 IXP4XX_FEATURE_NPEC_ETH         | \
-					 IXP4XX_FEATURE_RESET_NPEA       | \
-					 IXP4XX_FEATURE_RESET_NPEB       | \
-					 IXP4XX_FEATURE_RESET_NPEC       | \
-					 IXP4XX_FEATURE_PCI              | \
-					 IXP4XX_FEATURE_UTOPIA_PHY_LIMIT | \
-					 IXP4XX_FEATURE_XSCALE_MAX_FREQ)
-
-
-/* IXP43x/46x CPUs */
-#define IXP4XX_FEATURE_ECC_TIMESYNC	(1 << 15)
-#define IXP4XX_FEATURE_USB_HOST		(1 << 18)
-#define IXP4XX_FEATURE_NPEA_ETH		(1 << 19)
-#define IXP43X_FEATURE_MASK		(IXP42X_FEATURE_MASK             | \
-					 IXP4XX_FEATURE_ECC_TIMESYNC     | \
-					 IXP4XX_FEATURE_USB_HOST         | \
-					 IXP4XX_FEATURE_NPEA_ETH)
-
-/* IXP46x CPU (including IXP455) only */
-#define IXP4XX_FEATURE_NPEB_ETH_1_TO_3	(1 << 20)
-#define IXP4XX_FEATURE_RSA		(1 << 21)
-#define IXP46X_FEATURE_MASK		(IXP43X_FEATURE_MASK             | \
-					 IXP4XX_FEATURE_NPEB_ETH_1_TO_3  | \
-					 IXP4XX_FEATURE_RSA)
-
 #endif
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index 0616e369522e..590689a23b35 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -30,6 +30,10 @@
 #include <linux/soc/ixp4xx/npe.h>
 #include <linux/soc/ixp4xx/qmgr.h>
 
+/* Intermittent includes, delete this after v5.14-rc1 */
+#include <linux/soc/ixp4xx/cpu.h>
+#include <mach/ixp4xx-regs.h>
+
 #define MAX_KEYLEN 32
 
 /* hash: cfgword + 2 * digestlen; crypt: keylen + cfgword */
diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c
index 3624d9215a37..468cb5c909c6 100644
--- a/drivers/net/ethernet/xscale/ixp4xx_eth.c
+++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c
@@ -38,6 +38,7 @@
 #include <linux/soc/ixp4xx/npe.h>
 #include <linux/soc/ixp4xx/qmgr.h>
 #include <mach/hardware.h>
+#include <linux/soc/ixp4xx/cpu.h>
 
 #include "ixp46x_ts.h"
 
diff --git a/drivers/net/ethernet/xscale/ptp_ixp46x.c b/drivers/net/ethernet/xscale/ptp_ixp46x.c
index 9ecc395239e9..99d4d9439d05 100644
--- a/drivers/net/ethernet/xscale/ptp_ixp46x.c
+++ b/drivers/net/ethernet/xscale/ptp_ixp46x.c
@@ -12,9 +12,8 @@
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
-
 #include <linux/ptp_clock_kernel.h>
+#include <linux/soc/ixp4xx/cpu.h>
 
 #include "ixp46x_ts.h"
 
diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
index ecea09fd21cb..7ebe627e9392 100644
--- a/drivers/net/wan/ixp4xx_hss.c
+++ b/drivers/net/wan/ixp4xx_hss.c
@@ -22,6 +22,7 @@
 #include <linux/slab.h>
 #include <linux/soc/ixp4xx/npe.h>
 #include <linux/soc/ixp4xx/qmgr.h>
+#include <linux/soc/ixp4xx/cpu.h>
 
 #define DEBUG_DESC		0
 #define DEBUG_RX		0
diff --git a/drivers/soc/ixp4xx/ixp4xx-npe.c b/drivers/soc/ixp4xx/ixp4xx-npe.c
index 0a16ac46ab59..925b49152c77 100644
--- a/drivers/soc/ixp4xx/ixp4xx-npe.c
+++ b/drivers/soc/ixp4xx/ixp4xx-npe.c
@@ -21,7 +21,7 @@
 #include <linux/platform_device.h>
 #include <linux/soc/ixp4xx/npe.h>
 #include <mach/hardware.h>
-#include <mach/cpu.h>
+#include <linux/soc/ixp4xx/cpu.h>
 
 #define DEBUG_MSG			0
 #define DEBUG_FW			0
diff --git a/drivers/soc/ixp4xx/ixp4xx-qmgr.c b/drivers/soc/ixp4xx/ixp4xx-qmgr.c
index 1b1631ac0438..7149510b307e 100644
--- a/drivers/soc/ixp4xx/ixp4xx-qmgr.c
+++ b/drivers/soc/ixp4xx/ixp4xx-qmgr.c
@@ -13,7 +13,7 @@
 #include <linux/platform_device.h>
 #include <linux/soc/ixp4xx/qmgr.h>
 #include <mach/hardware.h>
-#include <mach/cpu.h>
+#include <linux/soc/ixp4xx/cpu.h>
 
 static struct qmgr_regs __iomem *qmgr_regs;
 static int qmgr_irq_1;
diff --git a/include/linux/soc/ixp4xx/cpu.h b/include/linux/soc/ixp4xx/cpu.h
new file mode 100644
index 000000000000..88bd8de0e803
--- /dev/null
+++ b/include/linux/soc/ixp4xx/cpu.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * IXP4XX cpu type detection
+ *
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ */
+
+#ifndef __SOC_IXP4XX_CPU_H__
+#define __SOC_IXP4XX_CPU_H__
+
+#include <linux/io.h>
+#ifdef CONFIG_ARM
+#include <asm/cputype.h>
+#endif
+
+/* Processor id value in CP15 Register 0 */
+#define IXP42X_PROCESSOR_ID_VALUE	0x690541c0 /* including unused 0x690541Ex */
+#define IXP42X_PROCESSOR_ID_MASK	0xffffffc0
+
+#define IXP43X_PROCESSOR_ID_VALUE	0x69054040
+#define IXP43X_PROCESSOR_ID_MASK	0xfffffff0
+
+#define IXP46X_PROCESSOR_ID_VALUE	0x69054200 /* including IXP455 */
+#define IXP46X_PROCESSOR_ID_MASK	0xfffffff0
+
+/* "fuse" bits of IXP_EXP_CFG2 */
+/* All IXP4xx CPUs */
+#define IXP4XX_FEATURE_RCOMP		(1 << 0)
+#define IXP4XX_FEATURE_USB_DEVICE	(1 << 1)
+#define IXP4XX_FEATURE_HASH		(1 << 2)
+#define IXP4XX_FEATURE_AES		(1 << 3)
+#define IXP4XX_FEATURE_DES		(1 << 4)
+#define IXP4XX_FEATURE_HDLC		(1 << 5)
+#define IXP4XX_FEATURE_AAL		(1 << 6)
+#define IXP4XX_FEATURE_HSS		(1 << 7)
+#define IXP4XX_FEATURE_UTOPIA		(1 << 8)
+#define IXP4XX_FEATURE_NPEB_ETH0	(1 << 9)
+#define IXP4XX_FEATURE_NPEC_ETH		(1 << 10)
+#define IXP4XX_FEATURE_RESET_NPEA	(1 << 11)
+#define IXP4XX_FEATURE_RESET_NPEB	(1 << 12)
+#define IXP4XX_FEATURE_RESET_NPEC	(1 << 13)
+#define IXP4XX_FEATURE_PCI		(1 << 14)
+#define IXP4XX_FEATURE_UTOPIA_PHY_LIMIT	(3 << 16)
+#define IXP4XX_FEATURE_XSCALE_MAX_FREQ	(3 << 22)
+#define IXP42X_FEATURE_MASK		(IXP4XX_FEATURE_RCOMP            | \
+					 IXP4XX_FEATURE_USB_DEVICE       | \
+					 IXP4XX_FEATURE_HASH             | \
+					 IXP4XX_FEATURE_AES              | \
+					 IXP4XX_FEATURE_DES              | \
+					 IXP4XX_FEATURE_HDLC             | \
+					 IXP4XX_FEATURE_AAL              | \
+					 IXP4XX_FEATURE_HSS              | \
+					 IXP4XX_FEATURE_UTOPIA           | \
+					 IXP4XX_FEATURE_NPEB_ETH0        | \
+					 IXP4XX_FEATURE_NPEC_ETH         | \
+					 IXP4XX_FEATURE_RESET_NPEA       | \
+					 IXP4XX_FEATURE_RESET_NPEB       | \
+					 IXP4XX_FEATURE_RESET_NPEC       | \
+					 IXP4XX_FEATURE_PCI              | \
+					 IXP4XX_FEATURE_UTOPIA_PHY_LIMIT | \
+					 IXP4XX_FEATURE_XSCALE_MAX_FREQ)
+
+
+/* IXP43x/46x CPUs */
+#define IXP4XX_FEATURE_ECC_TIMESYNC	(1 << 15)
+#define IXP4XX_FEATURE_USB_HOST		(1 << 18)
+#define IXP4XX_FEATURE_NPEA_ETH		(1 << 19)
+#define IXP43X_FEATURE_MASK		(IXP42X_FEATURE_MASK             | \
+					 IXP4XX_FEATURE_ECC_TIMESYNC     | \
+					 IXP4XX_FEATURE_USB_HOST         | \
+					 IXP4XX_FEATURE_NPEA_ETH)
+
+/* IXP46x CPU (including IXP455) only */
+#define IXP4XX_FEATURE_NPEB_ETH_1_TO_3	(1 << 20)
+#define IXP4XX_FEATURE_RSA		(1 << 21)
+#define IXP46X_FEATURE_MASK		(IXP43X_FEATURE_MASK             | \
+					 IXP4XX_FEATURE_NPEB_ETH_1_TO_3  | \
+					 IXP4XX_FEATURE_RSA)
+
+#ifdef CONFIG_ARCH_IXP4XX
+#define cpu_is_ixp42x_rev_a0() ((read_cpuid_id() & (IXP42X_PROCESSOR_ID_MASK | 0xF)) == \
+				IXP42X_PROCESSOR_ID_VALUE)
+#define cpu_is_ixp42x()	((read_cpuid_id() & IXP42X_PROCESSOR_ID_MASK) == \
+			 IXP42X_PROCESSOR_ID_VALUE)
+#define cpu_is_ixp43x()	((read_cpuid_id() & IXP43X_PROCESSOR_ID_MASK) == \
+			 IXP43X_PROCESSOR_ID_VALUE)
+#define cpu_is_ixp46x()	((read_cpuid_id() & IXP46X_PROCESSOR_ID_MASK) == \
+			 IXP46X_PROCESSOR_ID_VALUE)
+
+u32 ixp4xx_read_feature_bits(void);
+void ixp4xx_write_feature_bits(u32 value);
+#else
+#define cpu_is_ixp42x_rev_a0()		0
+#define cpu_is_ixp42x()			0
+#define cpu_is_ixp43x()			0
+#define cpu_is_ixp46x()			0
+static inline u32 ixp4xx_read_feature_bits(void)
+{
+	return 0;
+}
+static inline void ixp4xx_write_feature_bits(u32 value)
+{
+}
+#endif
+
+#endif  /* _ASM_ARCH_CPU_H */
-- 
cgit v1.2.3


From 55712627bffd666c9f25eb23c15c55ec85e5a73f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 25 Aug 2019 22:14:01 +0200
Subject: pata: ixp4xx: split platform data to its own header

Portable drivers cannot use mach/platform.h, so move the
structure into its own header. With this, compile testing
can be enabled.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 arch/arm/mach-ixp4xx/avila-setup.c           |  1 +
 arch/arm/mach-ixp4xx/include/mach/platform.h | 14 --------------
 drivers/ata/Kconfig                          |  2 +-
 drivers/ata/pata_ixp4xx_cf.c                 |  2 +-
 include/linux/platform_data/pata_ixp4xx_cf.h | 21 +++++++++++++++++++++
 5 files changed, 24 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/platform_data/pata_ixp4xx_cf.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-ixp4xx/avila-setup.c b/arch/arm/mach-ixp4xx/avila-setup.c
index 1981b33109cb..ec1d3029f80c 100644
--- a/arch/arm/mach-ixp4xx/avila-setup.c
+++ b/arch/arm/mach-ixp4xx/avila-setup.c
@@ -19,6 +19,7 @@
 #include <linux/tty.h>
 #include <linux/serial_8250.h>
 #include <linux/gpio/machine.h>
+#include <linux/platform_data/pata_ixp4xx_cf.h>
 #include <asm/types.h>
 #include <asm/setup.h>
 #include <asm/memory.h>
diff --git a/arch/arm/mach-ixp4xx/include/mach/platform.h b/arch/arm/mach-ixp4xx/include/mach/platform.h
index 6d403fe0bf52..d8b4df96db08 100644
--- a/arch/arm/mach-ixp4xx/include/mach/platform.h
+++ b/arch/arm/mach-ixp4xx/include/mach/platform.h
@@ -79,20 +79,6 @@ extern unsigned long ixp4xx_exp_bus_size;
 #define IXP4XX_PERIPHERAL_BUS_CLOCK 	(66) /* 66MHzi APB BUS   */ 
 #define IXP4XX_UART_XTAL        	14745600
 
-/*
- * This structure provide a means for the board setup code
- * to give information to th pata_ixp4xx driver. It is
- * passed as platform_data.
- */
-struct ixp4xx_pata_data {
-	volatile u32	*cs0_cfg;
-	volatile u32	*cs1_cfg;
-	unsigned long	cs0_bits;
-	unsigned long	cs1_bits;
-	void __iomem	*cs0;
-	void __iomem	*cs1;
-};
-
 /*
  * Frequency of clock used for primary clocksource
  */
diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
index 030cb32da980..d17c83319e70 100644
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -1058,7 +1058,7 @@ config PATA_ISAPNP
 
 config PATA_IXP4XX_CF
 	tristate "IXP4XX Compact Flash support"
-	depends on ARCH_IXP4XX
+	depends on ARCH_IXP4XX || COMPILE_TEST
 	help
 	  This option enables support for a Compact Flash connected on
 	  the ixp4xx expansion bus. This driver had been written for
diff --git a/drivers/ata/pata_ixp4xx_cf.c b/drivers/ata/pata_ixp4xx_cf.c
index 073c4e2c9d04..5881d64af943 100644
--- a/drivers/ata/pata_ixp4xx_cf.c
+++ b/drivers/ata/pata_ixp4xx_cf.c
@@ -17,8 +17,8 @@
 #include <linux/libata.h>
 #include <linux/irq.h>
 #include <linux/platform_device.h>
+#include <linux/platform_data/pata_ixp4xx_cf.h>
 #include <scsi/scsi_host.h>
-#include <mach/hardware.h>
 
 #define DRV_NAME	"pata_ixp4xx_cf"
 #define DRV_VERSION	"0.2"
diff --git a/include/linux/platform_data/pata_ixp4xx_cf.h b/include/linux/platform_data/pata_ixp4xx_cf.h
new file mode 100644
index 000000000000..601ba97fef57
--- /dev/null
+++ b/include/linux/platform_data/pata_ixp4xx_cf.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PLATFORM_DATA_PATA_IXP4XX_H
+#define __PLATFORM_DATA_PATA_IXP4XX_H
+
+#include <linux/types.h>
+
+/*
+ * This structure provide a means for the board setup code
+ * to give information to th pata_ixp4xx driver. It is
+ * passed as platform_data.
+ */
+struct ixp4xx_pata_data {
+	volatile u32	*cs0_cfg;
+	volatile u32	*cs1_cfg;
+	unsigned long	cs0_bits;
+	unsigned long	cs1_bits;
+	void __iomem	*cs0;
+	void __iomem	*cs1;
+};
+
+#endif
-- 
cgit v1.2.3


From c28a61471c5898e832c6e8634b2659249761b833 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 9 Jun 2021 18:32:48 -0700
Subject: block: export blk_next_bio()

The block layer provides emulation of zone management operations
targeting all zones of a zoned block device only for the zone reset
operation (REQ_OP_ZONE_RESET). In order to correctly implement
exporting of zoned block devices with NVMeOF, emulating zone management
operations targeting all zones of a device is also necessary for the
open, close and finish zone operations (REQ_OP_ZONE_OPEN,
REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH).

Instead of duplicating the code, export the existing helper from block
layer so we can use a bio chaining pattern that is present in the block
layer for REQ_OP_ZONE RESET all emulation in the NVMeOF zoned block
device backend.

Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c     | 1 +
 include/linux/bio.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 7b256131b20b..9f09beadcbe3 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -21,6 +21,7 @@ struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
 
 	return new;
 }
+EXPORT_SYMBOL_GPL(blk_next_bio);
 
 int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, int flags,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a0b4cfdf62a4..b2491ead22a0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -822,4 +822,6 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
 		bio->bi_opf |= REQ_NOWAIT;
 }
 
+struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
+
 #endif /* __LINUX_BIO_H */
-- 
cgit v1.2.3


From ab5d0b38c0475d6ff59f1a6ccf7c668b9ec2e0a4 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 9 Jun 2021 18:32:51 -0700
Subject: nvmet: add Command Set Identifier support

NVMe TP 4056 allows controllers to support different command sets.
NVMeoF target currently only supports namespaces that contain
traditional logical blocks that may be randomly read and written. In
some applications there is a value in exposing namespaces that contain
logical blocks that have special access rules (e.g. sequentially write
required namespace such as Zoned Namespace (ZNS)).

In order to support the Zoned Block Devices (ZBD) backend, controllers
need to have support for ZNS Command Set Identifier (CSI).

In this preparation patch, we adjust the code such that it can now
support the default command set identifier. We update the namespace data
structure to store the CSI value which defaults to NVME_CSI_NVM
that represents traditional logical blocks namespace type.

The CSI support is required to implement the ZBD backend for NVMeOF
with host side NVMe ZNS interface, since ZNS commands belong to
the different command set than the default one.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 75 ++++++++++++++++++++++++++++++++++-------
 drivers/nvme/target/core.c      | 28 ++++++++++++---
 drivers/nvme/target/nvmet.h     |  1 +
 include/linux/nvme.h            |  1 +
 4 files changed, 87 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 3de6a6c99b01..93aaa7479e71 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -162,15 +162,8 @@ out:
 	nvmet_req_complete(req, status);
 }
 
-static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
+static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log)
 {
-	u16 status = NVME_SC_INTERNAL;
-	struct nvme_effects_log *log;
-
-	log = kzalloc(sizeof(*log), GFP_KERNEL);
-	if (!log)
-		goto out;
-
 	log->acs[nvme_admin_get_log_page]	= cpu_to_le32(1 << 0);
 	log->acs[nvme_admin_identify]		= cpu_to_le32(1 << 0);
 	log->acs[nvme_admin_abort_cmd]		= cpu_to_le32(1 << 0);
@@ -184,9 +177,30 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
 	log->iocs[nvme_cmd_flush]		= cpu_to_le32(1 << 0);
 	log->iocs[nvme_cmd_dsm]			= cpu_to_le32(1 << 0);
 	log->iocs[nvme_cmd_write_zeroes]	= cpu_to_le32(1 << 0);
+}
 
-	status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
+{
+	struct nvme_effects_log *log;
+	u16 status = NVME_SC_SUCCESS;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	switch (req->cmd->get_log_page.csi) {
+	case NVME_CSI_NVM:
+		nvmet_get_cmd_effects_nvm(log);
+		break;
+	default:
+		status = NVME_SC_INVALID_LOG_PAGE;
+		goto free;
+	}
 
+	status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+free:
 	kfree(log);
 out:
 	nvmet_req_complete(req, status);
@@ -613,6 +627,12 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req)
 			goto out;
 	}
 
+	status = nvmet_copy_ns_identifier(req, NVME_NIDT_CSI,
+					  NVME_NIDT_CSI_LEN,
+					  &req->ns->csi, &off);
+	if (status)
+		goto out;
+
 	if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off,
 			off) != NVME_IDENTIFY_DATA_SIZE - off)
 		status = NVME_SC_INTERNAL | NVME_SC_DNR;
@@ -621,6 +641,17 @@ out:
 	nvmet_req_complete(req, status);
 }
 
+static bool nvmet_handle_identify_desclist(struct nvmet_req *req)
+{
+	switch (req->cmd->identify.csi) {
+	case NVME_CSI_NVM:
+		nvmet_execute_identify_desclist(req);
+		return true;
+	default:
+		return false;
+	}
+}
+
 static void nvmet_execute_identify(struct nvmet_req *req)
 {
 	if (!nvmet_check_transfer_len(req, NVME_IDENTIFY_DATA_SIZE))
@@ -628,13 +659,31 @@ static void nvmet_execute_identify(struct nvmet_req *req)
 
 	switch (req->cmd->identify.cns) {
 	case NVME_ID_CNS_NS:
-		return nvmet_execute_identify_ns(req);
+		switch (req->cmd->identify.csi) {
+		case NVME_CSI_NVM:
+			return nvmet_execute_identify_ns(req);
+		default:
+			break;
+		}
+		break;
 	case NVME_ID_CNS_CTRL:
-		return nvmet_execute_identify_ctrl(req);
+		switch (req->cmd->identify.csi) {
+		case NVME_CSI_NVM:
+			return nvmet_execute_identify_ctrl(req);
+		}
+		break;
 	case NVME_ID_CNS_NS_ACTIVE_LIST:
-		return nvmet_execute_identify_nslist(req);
+		switch (req->cmd->identify.csi) {
+		case NVME_CSI_NVM:
+			return nvmet_execute_identify_nslist(req);
+		default:
+			break;
+		}
+		break;
 	case NVME_ID_CNS_NS_DESC_LIST:
-		return nvmet_execute_identify_desclist(req);
+		if (nvmet_handle_identify_desclist(req) == true)
+			return;
+		break;
 	}
 
 	nvmet_req_cns_error_complete(req);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index c8708dcaeaa5..77873d56cff5 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -682,6 +682,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 
 	uuid_gen(&ns->uuid);
 	ns->buffered_io = false;
+	ns->csi = NVME_CSI_NVM;
 
 	return ns;
 }
@@ -877,10 +878,14 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 		return ret;
 	}
 
-	if (req->ns->file)
-		return nvmet_file_parse_io_cmd(req);
-
-	return nvmet_bdev_parse_io_cmd(req);
+	switch (req->ns->csi) {
+	case NVME_CSI_NVM:
+		if (req->ns->file)
+			return nvmet_file_parse_io_cmd(req);
+		return nvmet_bdev_parse_io_cmd(req);
+	default:
+		return NVME_SC_INVALID_IO_CMD_SET;
+	}
 }
 
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
@@ -1102,6 +1107,17 @@ static inline u8 nvmet_cc_iocqes(u32 cc)
 	return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
 }
 
+static inline bool nvmet_css_supported(u8 cc_css)
+{
+	switch (cc_css <<= NVME_CC_CSS_SHIFT) {
+	case NVME_CC_CSS_NVM:
+	case NVME_CC_CSS_CSI:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 {
 	lockdep_assert_held(&ctrl->lock);
@@ -1121,7 +1137,7 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 
 	if (nvmet_cc_mps(ctrl->cc) != 0 ||
 	    nvmet_cc_ams(ctrl->cc) != 0 ||
-	    nvmet_cc_css(ctrl->cc) != 0) {
+	    !nvmet_css_supported(nvmet_cc_css(ctrl->cc))) {
 		ctrl->csts = NVME_CSTS_CFS;
 		return;
 	}
@@ -1172,6 +1188,8 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
 {
 	/* command sets supported: NVMe command set: */
 	ctrl->cap = (1ULL << 37);
+	/* Controller supports one or more I/O Command Sets */
+	ctrl->cap |= (1ULL << 43);
 	/* CC.EN timeout in 500msec units: */
 	ctrl->cap |= (15ULL << 24);
 	/* maximum queue entries supported: */
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 29b386de1d07..cc71918ff8fe 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -83,6 +83,7 @@ struct nvmet_ns {
 	struct pci_dev		*p2p_dev;
 	int			pi_type;
 	int			metadata_size;
+	u8			csi;
 };
 
 static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index edcbd60b88b9..c7ba83144d52 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1504,6 +1504,7 @@ enum {
 	NVME_SC_NS_WRITE_PROTECTED	= 0x20,
 	NVME_SC_CMD_INTERRUPTED		= 0x21,
 	NVME_SC_TRANSIENT_TR_ERR	= 0x22,
+	NVME_SC_INVALID_IO_CMD_SET	= 0x2C,
 
 	NVME_SC_LBA_RANGE		= 0x80,
 	NVME_SC_CAP_EXCEEDED		= 0x81,
-- 
cgit v1.2.3


From aaf2e048af2704da5869f27b508b288f36d5c7b7 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 9 Jun 2021 18:32:52 -0700
Subject: nvmet: add ZBD over ZNS backend support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NVMe TP 4053 – Zoned Namespaces (ZNS) allows host software to
communicate with a non-volatile memory subsystem using zones for NVMe
protocol-based controllers. NVMeOF already support the ZNS NVMe
Protocol compliant devices on the target in the passthru mode. There
are generic zoned block devices like  Shingled Magnetic Recording (SMR)
HDDs that are not based on the NVMe protocol.

This patch adds ZNS backend support for non-ZNS zoned block devices as
NVMeOF targets.

This support includes implementing the new command set NVME_CSI_ZNS,
adding different command handlers for ZNS command set such as NVMe
Identify Controller, NVMe Identify Namespace, NVMe Zone Append,
NVMe Zone Management Send and NVMe Zone Management Receive.

With the new command set identifier, we also update the target command
effects logs to reflect the ZNS compliant commands.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/Makefile      |   1 +
 drivers/nvme/target/admin-cmd.c   |  41 +++
 drivers/nvme/target/core.c        |  15 +-
 drivers/nvme/target/io-cmd-bdev.c |  26 +-
 drivers/nvme/target/nvmet.h       |  20 ++
 drivers/nvme/target/zns.c         | 615 ++++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h              |   7 +
 7 files changed, 714 insertions(+), 11 deletions(-)
 create mode 100644 drivers/nvme/target/zns.c

(limited to 'include/linux')

diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index ebf91fc4c72e..9837e580fa7e 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
 			discovery.o io-cmd-file.o io-cmd-bdev.o
 nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
+nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 93aaa7479e71..363e357d2f20 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -179,6 +179,13 @@ static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log)
 	log->iocs[nvme_cmd_write_zeroes]	= cpu_to_le32(1 << 0);
 }
 
+static void nvmet_get_cmd_effects_zns(struct nvme_effects_log *log)
+{
+	log->iocs[nvme_cmd_zone_append]		= cpu_to_le32(1 << 0);
+	log->iocs[nvme_cmd_zone_mgmt_send]	= cpu_to_le32(1 << 0);
+	log->iocs[nvme_cmd_zone_mgmt_recv]	= cpu_to_le32(1 << 0);
+}
+
 static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
 {
 	struct nvme_effects_log *log;
@@ -194,6 +201,14 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
 	case NVME_CSI_NVM:
 		nvmet_get_cmd_effects_nvm(log);
 		break;
+	case NVME_CSI_ZNS:
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			status = NVME_SC_INVALID_IO_CMD_SET;
+			goto free;
+		}
+		nvmet_get_cmd_effects_nvm(log);
+		nvmet_get_cmd_effects_zns(log);
+		break;
 	default:
 		status = NVME_SC_INVALID_LOG_PAGE;
 		goto free;
@@ -647,6 +662,12 @@ static bool nvmet_handle_identify_desclist(struct nvmet_req *req)
 	case NVME_CSI_NVM:
 		nvmet_execute_identify_desclist(req);
 		return true;
+	case NVME_CSI_ZNS:
+		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			nvmet_execute_identify_desclist(req);
+			return true;
+		}
+		return false;
 	default:
 		return false;
 	}
@@ -666,12 +687,32 @@ static void nvmet_execute_identify(struct nvmet_req *req)
 			break;
 		}
 		break;
+	case NVME_ID_CNS_CS_NS:
+		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			switch (req->cmd->identify.csi) {
+			case NVME_CSI_ZNS:
+				return nvmet_execute_identify_cns_cs_ns(req);
+			default:
+				break;
+			}
+		}
+		break;
 	case NVME_ID_CNS_CTRL:
 		switch (req->cmd->identify.csi) {
 		case NVME_CSI_NVM:
 			return nvmet_execute_identify_ctrl(req);
 		}
 		break;
+	case NVME_ID_CNS_CS_CTRL:
+		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			switch (req->cmd->identify.csi) {
+			case NVME_CSI_ZNS:
+				return nvmet_execute_identify_cns_cs_ctrl(req);
+			default:
+				break;
+			}
+		}
+		break;
 	case NVME_ID_CNS_NS_ACTIVE_LIST:
 		switch (req->cmd->identify.csi) {
 		case NVME_CSI_NVM:
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 77873d56cff5..dd16704c9b6b 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -16,6 +16,7 @@
 #include "nvmet.h"
 
 struct workqueue_struct *buffered_io_wq;
+struct workqueue_struct *zbd_wq;
 static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
 static DEFINE_IDA(cntlid_ida);
 
@@ -883,6 +884,10 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 		if (req->ns->file)
 			return nvmet_file_parse_io_cmd(req);
 		return nvmet_bdev_parse_io_cmd(req);
+	case NVME_CSI_ZNS:
+		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+			return nvmet_bdev_zns_parse_io_cmd(req);
+		return NVME_SC_INVALID_IO_CMD_SET;
 	default:
 		return NVME_SC_INVALID_IO_CMD_SET;
 	}
@@ -1592,11 +1597,15 @@ static int __init nvmet_init(void)
 
 	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
 
+	zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
+	if (!zbd_wq)
+		return -ENOMEM;
+
 	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
 			WQ_MEM_RECLAIM, 0);
 	if (!buffered_io_wq) {
 		error = -ENOMEM;
-		goto out;
+		goto out_free_zbd_work_queue;
 	}
 
 	error = nvmet_init_discovery();
@@ -1612,7 +1621,8 @@ out_exit_discovery:
 	nvmet_exit_discovery();
 out_free_work_queue:
 	destroy_workqueue(buffered_io_wq);
-out:
+out_free_zbd_work_queue:
+	destroy_workqueue(zbd_wq);
 	return error;
 }
 
@@ -1622,6 +1632,7 @@ static void __exit nvmet_exit(void)
 	nvmet_exit_discovery();
 	ida_destroy(&cntlid_ida);
 	destroy_workqueue(buffered_io_wq);
+	destroy_workqueue(zbd_wq);
 
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 019cc994efcd..0fc2781ab970 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -47,6 +47,14 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
 	id->nows = to0based(ql->io_opt / ql->logical_block_size);
 }
 
+void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
+{
+	if (ns->bdev) {
+		blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
+		ns->bdev = NULL;
+	}
+}
+
 static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns)
 {
 	struct blk_integrity *bi = bdev_get_integrity(ns->bdev);
@@ -86,15 +94,15 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 	if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10))
 		nvmet_bdev_ns_enable_integrity(ns);
 
-	return 0;
-}
-
-void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
-{
-	if (ns->bdev) {
-		blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
-		ns->bdev = NULL;
+	if (bdev_is_zoned(ns->bdev)) {
+		if (!nvmet_bdev_zns_enable(ns)) {
+			nvmet_bdev_ns_disable(ns);
+			return -EINVAL;
+		}
+		ns->csi = NVME_CSI_ZNS;
 	}
+
+	return 0;
 }
 
 void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns)
@@ -102,7 +110,7 @@ void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns)
 	ns->size = i_size_read(ns->bdev->bd_inode);
 }
 
-static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
+u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
 {
 	u16 status = NVME_SC_SUCCESS;
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index cc71918ff8fe..d719a1cd5dda 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -250,6 +250,10 @@ struct nvmet_subsys {
 	unsigned int		admin_timeout;
 	unsigned int		io_timeout;
 #endif /* CONFIG_NVME_TARGET_PASSTHRU */
+
+#ifdef CONFIG_BLK_DEV_ZONED
+	u8			zasl;
+#endif /* CONFIG_BLK_DEV_ZONED */
 };
 
 static inline struct nvmet_subsys *to_subsys(struct config_item *item)
@@ -335,6 +339,12 @@ struct nvmet_req {
 			struct work_struct      work;
 			bool			use_workqueue;
 		} p;
+#ifdef CONFIG_BLK_DEV_ZONED
+		struct {
+			struct bio		inline_bio;
+			struct work_struct	zmgmt_work;
+		} z;
+#endif /* CONFIG_BLK_DEV_ZONED */
 	};
 	int			sg_cnt;
 	int			metadata_sg_cnt;
@@ -354,6 +364,7 @@ struct nvmet_req {
 };
 
 extern struct workqueue_struct *buffered_io_wq;
+extern struct workqueue_struct *zbd_wq;
 
 static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
 {
@@ -403,6 +414,7 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
 void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id);
 u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
 u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
 u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
@@ -530,6 +542,14 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid);
 void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns);
 int nvmet_file_ns_revalidate(struct nvmet_ns *ns);
 void nvmet_ns_revalidate(struct nvmet_ns *ns);
+u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts);
+
+bool nvmet_bdev_zns_enable(struct nvmet_ns *ns);
+void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req);
+void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req);
+void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req);
+void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req);
+void nvmet_bdev_execute_zone_append(struct nvmet_req *req);
 
 static inline u32 nvmet_rw_data_len(struct nvmet_req *req)
 {
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
new file mode 100644
index 000000000000..17f8b7a45f21
--- /dev/null
+++ b/drivers/nvme/target/zns.c
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe ZNS-ZBD command implementation.
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/nvme.h>
+#include <linux/blkdev.h>
+#include "nvmet.h"
+
+/*
+ * We set the Memory Page Size Minimum (MPSMIN) for target controller to 0
+ * which gets added by 12 in the nvme_enable_ctrl() which results in 2^12 = 4k
+ * as page_shift value. When calculating the ZASL use shift by 12.
+ */
+#define NVMET_MPSMIN_SHIFT	12
+
+static inline u8 nvmet_zasl(unsigned int zone_append_sects)
+{
+	/*
+	 * Zone Append Size Limit (zasl) is expressed as a power of 2 value
+	 * with the minimum memory page size (i.e. 12) as unit.
+	 */
+	return ilog2(zone_append_sects >> (NVMET_MPSMIN_SHIFT - 9));
+}
+
+static int validate_conv_zones_cb(struct blk_zone *z,
+				  unsigned int i, void *data)
+{
+	if (z->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return -EOPNOTSUPP;
+	return 0;
+}
+
+bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
+{
+	struct request_queue *q = ns->bdev->bd_disk->queue;
+	u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
+	struct gendisk *bd_disk = ns->bdev->bd_disk;
+	int ret;
+
+	if (ns->subsys->zasl) {
+		if (ns->subsys->zasl > zasl)
+			return false;
+	}
+	ns->subsys->zasl = zasl;
+
+	/*
+	 * Generic zoned block devices may have a smaller last zone which is
+	 * not supported by ZNS. Exclude zoned drives that have such smaller
+	 * last zone.
+	 */
+	if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1))
+		return false;
+	/*
+	 * ZNS does not define a conventional zone type. If the underlying
+	 * device has a bitmap set indicating the existence of conventional
+	 * zones, reject the device. Otherwise, use report zones to detect if
+	 * the device has conventional zones.
+	 */
+	if (ns->bdev->bd_disk->queue->conv_zones_bitmap)
+		return false;
+
+	ret = blkdev_report_zones(ns->bdev, 0, blkdev_nr_zones(bd_disk),
+				  validate_conv_zones_cb, NULL);
+	if (ret < 0)
+		return false;
+
+	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
+
+	return true;
+}
+
+void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
+{
+	u8 zasl = req->sq->ctrl->subsys->zasl;
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvme_id_ctrl_zns *id;
+	u16 status;
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	if (ctrl->ops->get_mdts)
+		id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl);
+	else
+		id->zasl = zasl;
+
+	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
+
+	kfree(id);
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
+{
+	struct nvme_id_ns_zns *id_zns;
+	u64 zsze;
+	u16 status;
+
+	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
+		req->error_loc = offsetof(struct nvme_identify, nsid);
+		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+		goto out;
+	}
+
+	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
+	if (!id_zns) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	status = nvmet_req_find_ns(req);
+	if (status) {
+		status = NVME_SC_INTERNAL;
+		goto done;
+	}
+
+	if (!bdev_is_zoned(req->ns->bdev)) {
+		req->error_loc = offsetof(struct nvme_identify, nsid);
+		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+		goto done;
+	}
+
+	nvmet_ns_revalidate(req->ns);
+	zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >>
+					req->ns->blksize_shift;
+	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
+	id_zns->mor = cpu_to_le32(bdev_max_open_zones(req->ns->bdev));
+	id_zns->mar = cpu_to_le32(bdev_max_active_zones(req->ns->bdev));
+
+done:
+	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
+	kfree(id_zns);
+out:
+	nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req)
+{
+	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
+	u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
+
+	if (sect >= get_capacity(req->ns->bdev->bd_disk)) {
+		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, slba);
+		return NVME_SC_LBA_RANGE | NVME_SC_DNR;
+	}
+
+	if (out_bufsize < sizeof(struct nvme_zone_report)) {
+		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, numd);
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+
+	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
+		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zra);
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+
+	switch (req->cmd->zmr.pr) {
+	case 0:
+	case 1:
+		break;
+	default:
+		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, pr);
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+
+	switch (req->cmd->zmr.zrasf) {
+	case NVME_ZRASF_ZONE_REPORT_ALL:
+	case NVME_ZRASF_ZONE_STATE_EMPTY:
+	case NVME_ZRASF_ZONE_STATE_IMP_OPEN:
+	case NVME_ZRASF_ZONE_STATE_EXP_OPEN:
+	case NVME_ZRASF_ZONE_STATE_CLOSED:
+	case NVME_ZRASF_ZONE_STATE_FULL:
+	case NVME_ZRASF_ZONE_STATE_READONLY:
+	case NVME_ZRASF_ZONE_STATE_OFFLINE:
+		break;
+	default:
+		req->error_loc =
+			offsetof(struct nvme_zone_mgmt_recv_cmd, zrasf);
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+
+	return NVME_SC_SUCCESS;
+}
+
+struct nvmet_report_zone_data {
+	struct nvmet_req *req;
+	u64 out_buf_offset;
+	u64 out_nr_zones;
+	u64 nr_zones;
+	u8 zrasf;
+};
+
+static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned i, void *d)
+{
+	static const unsigned int nvme_zrasf_to_blk_zcond[] = {
+		[NVME_ZRASF_ZONE_STATE_EMPTY]	 = BLK_ZONE_COND_EMPTY,
+		[NVME_ZRASF_ZONE_STATE_IMP_OPEN] = BLK_ZONE_COND_IMP_OPEN,
+		[NVME_ZRASF_ZONE_STATE_EXP_OPEN] = BLK_ZONE_COND_EXP_OPEN,
+		[NVME_ZRASF_ZONE_STATE_CLOSED]	 = BLK_ZONE_COND_CLOSED,
+		[NVME_ZRASF_ZONE_STATE_READONLY] = BLK_ZONE_COND_READONLY,
+		[NVME_ZRASF_ZONE_STATE_FULL]	 = BLK_ZONE_COND_FULL,
+		[NVME_ZRASF_ZONE_STATE_OFFLINE]	 = BLK_ZONE_COND_OFFLINE,
+	};
+	struct nvmet_report_zone_data *rz = d;
+
+	if (rz->zrasf != NVME_ZRASF_ZONE_REPORT_ALL &&
+	    z->cond != nvme_zrasf_to_blk_zcond[rz->zrasf])
+		return 0;
+
+	if (rz->nr_zones < rz->out_nr_zones) {
+		struct nvme_zone_descriptor zdesc = { };
+		u16 status;
+
+		zdesc.zcap = nvmet_sect_to_lba(rz->req->ns, z->capacity);
+		zdesc.zslba = nvmet_sect_to_lba(rz->req->ns, z->start);
+		zdesc.wp = nvmet_sect_to_lba(rz->req->ns, z->wp);
+		zdesc.za = z->reset ? 1 << 2 : 0;
+		zdesc.zs = z->cond << 4;
+		zdesc.zt = z->type;
+
+		status = nvmet_copy_to_sgl(rz->req, rz->out_buf_offset, &zdesc,
+					   sizeof(zdesc));
+		if (status)
+			return -EINVAL;
+
+		rz->out_buf_offset += sizeof(zdesc);
+	}
+
+	rz->nr_zones++;
+
+	return 0;
+}
+
+static unsigned long nvmet_req_nr_zones_from_slba(struct nvmet_req *req)
+{
+	unsigned int sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
+
+	return blkdev_nr_zones(req->ns->bdev->bd_disk) -
+		(sect >> ilog2(bdev_zone_sectors(req->ns->bdev)));
+}
+
+static unsigned long get_nr_zones_from_buf(struct nvmet_req *req, u32 bufsize)
+{
+	if (bufsize <= sizeof(struct nvme_zone_report))
+		return 0;
+
+	return (bufsize - sizeof(struct nvme_zone_report)) /
+		sizeof(struct nvme_zone_descriptor);
+}
+
+static void nvmet_bdev_zone_zmgmt_recv_work(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
+	sector_t start_sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
+	unsigned long req_slba_nr_zones = nvmet_req_nr_zones_from_slba(req);
+	u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
+	__le64 nr_zones;
+	u16 status;
+	int ret;
+	struct nvmet_report_zone_data rz_data = {
+		.out_nr_zones = get_nr_zones_from_buf(req, out_bufsize),
+		/* leave the place for report zone header */
+		.out_buf_offset = sizeof(struct nvme_zone_report),
+		.zrasf = req->cmd->zmr.zrasf,
+		.nr_zones = 0,
+		.req = req,
+	};
+
+	status = nvmet_bdev_validate_zone_mgmt_recv(req);
+	if (status)
+		goto out;
+
+	if (!req_slba_nr_zones) {
+		status = NVME_SC_SUCCESS;
+		goto out;
+	}
+
+	ret = blkdev_report_zones(req->ns->bdev, start_sect, req_slba_nr_zones,
+				 nvmet_bdev_report_zone_cb, &rz_data);
+	if (ret < 0) {
+		status = NVME_SC_INTERNAL;
+		goto out;
+	}
+
+	/*
+	 * When partial bit is set nr_zones must indicate the number of zone
+	 * descriptors actually transferred.
+	 */
+	if (req->cmd->zmr.pr)
+		rz_data.nr_zones = min(rz_data.nr_zones, rz_data.out_nr_zones);
+
+	nr_zones = cpu_to_le64(rz_data.nr_zones);
+	status = nvmet_copy_to_sgl(req, 0, &nr_zones, sizeof(nr_zones));
+
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
+{
+	INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zone_zmgmt_recv_work);
+	queue_work(zbd_wq, &req->z.zmgmt_work);
+}
+
+static inline enum req_opf zsa_req_op(u8 zsa)
+{
+	switch (zsa) {
+	case NVME_ZONE_OPEN:
+		return REQ_OP_ZONE_OPEN;
+	case NVME_ZONE_CLOSE:
+		return REQ_OP_ZONE_CLOSE;
+	case NVME_ZONE_FINISH:
+		return REQ_OP_ZONE_FINISH;
+	case NVME_ZONE_RESET:
+		return REQ_OP_ZONE_RESET;
+	default:
+		return REQ_OP_LAST;
+	}
+}
+
+static u16 blkdev_zone_mgmt_errno_to_nvme_status(int ret)
+{
+	switch (ret) {
+	case 0:
+		return NVME_SC_SUCCESS;
+	case -EINVAL:
+	case -EIO:
+		return NVME_SC_ZONE_INVALID_TRANSITION | NVME_SC_DNR;
+	default:
+		return NVME_SC_INTERNAL;
+	}
+}
+
+struct nvmet_zone_mgmt_send_all_data {
+	unsigned long *zbitmap;
+	struct nvmet_req *req;
+};
+
+static int zmgmt_send_scan_cb(struct blk_zone *z, unsigned i, void *d)
+{
+	struct nvmet_zone_mgmt_send_all_data *data = d;
+
+	switch (zsa_req_op(data->req->cmd->zms.zsa)) {
+	case REQ_OP_ZONE_OPEN:
+		switch (z->cond) {
+		case BLK_ZONE_COND_CLOSED:
+			break;
+		default:
+			return 0;
+		}
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		switch (z->cond) {
+		case BLK_ZONE_COND_IMP_OPEN:
+		case BLK_ZONE_COND_EXP_OPEN:
+			break;
+		default:
+			return 0;
+		}
+		break;
+	case REQ_OP_ZONE_FINISH:
+		switch (z->cond) {
+		case BLK_ZONE_COND_IMP_OPEN:
+		case BLK_ZONE_COND_EXP_OPEN:
+		case BLK_ZONE_COND_CLOSED:
+			break;
+		default:
+			return 0;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	set_bit(i, data->zbitmap);
+
+	return 0;
+}
+
+static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
+{
+	struct block_device *bdev = req->ns->bdev;
+	unsigned int nr_zones = blkdev_nr_zones(bdev->bd_disk);
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct bio *bio = NULL;
+	sector_t sector = 0;
+	int ret;
+	struct nvmet_zone_mgmt_send_all_data d = {
+		.req = req,
+	};
+
+	d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)),
+				 GFP_NOIO, q->node);
+	if (!d.zbitmap) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Scan and build bitmap of the eligible zones */
+	ret = blkdev_report_zones(bdev, 0, nr_zones, zmgmt_send_scan_cb, &d);
+	if (ret != nr_zones) {
+		if (ret > 0)
+			ret = -EIO;
+		goto out;
+	} else {
+		/* We scanned all the zones */
+		ret = 0;
+	}
+
+	while (sector < get_capacity(bdev->bd_disk)) {
+		if (test_bit(blk_queue_zone_no(q, sector), d.zbitmap)) {
+			bio = blk_next_bio(bio, 0, GFP_KERNEL);
+			bio->bi_opf = zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC;
+			bio->bi_iter.bi_sector = sector;
+			bio_set_dev(bio, bdev);
+			/* This may take a while, so be nice to others */
+			cond_resched();
+		}
+		sector += blk_queue_zone_sectors(q);
+	}
+
+	if (bio) {
+		ret = submit_bio_wait(bio);
+		bio_put(bio);
+	}
+
+out:
+	kfree(d.zbitmap);
+
+	return blkdev_zone_mgmt_errno_to_nvme_status(ret);
+}
+
+static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req)
+{
+	int ret;
+
+	switch (zsa_req_op(req->cmd->zms.zsa)) {
+	case REQ_OP_ZONE_RESET:
+		ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0,
+				       get_capacity(req->ns->bdev->bd_disk),
+				       GFP_KERNEL);
+		if (ret < 0)
+			return blkdev_zone_mgmt_errno_to_nvme_status(ret);
+		break;
+	case REQ_OP_ZONE_OPEN:
+	case REQ_OP_ZONE_CLOSE:
+	case REQ_OP_ZONE_FINISH:
+		return nvmet_bdev_zone_mgmt_emulate_all(req);
+	default:
+		/* this is needed to quiet compiler warning */
+		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa);
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+
+	return NVME_SC_SUCCESS;
+}
+
+static void nvmet_bdev_zmgmt_send_work(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
+	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
+	enum req_opf op = zsa_req_op(req->cmd->zms.zsa);
+	struct block_device *bdev = req->ns->bdev;
+	sector_t zone_sectors = bdev_zone_sectors(bdev);
+	u16 status = NVME_SC_SUCCESS;
+	int ret;
+
+	if (op == REQ_OP_LAST) {
+		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa);
+		status = NVME_SC_ZONE_INVALID_TRANSITION | NVME_SC_DNR;
+		goto out;
+	}
+
+	/* when select all bit is set slba field is ignored */
+	if (req->cmd->zms.select_all) {
+		status = nvmet_bdev_execute_zmgmt_send_all(req);
+		goto out;
+	}
+
+	if (sect >= get_capacity(bdev->bd_disk)) {
+		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba);
+		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+		goto out;
+	}
+
+	if (sect & (zone_sectors - 1)) {
+		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba);
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		goto out;
+	}
+
+	ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors, GFP_KERNEL);
+	if (ret < 0)
+		status = blkdev_zone_mgmt_errno_to_nvme_status(ret);
+
+out:
+	nvmet_req_complete(req, status);
+}
+
+void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
+{
+	INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zmgmt_send_work);
+	queue_work(zbd_wq, &req->z.zmgmt_work);
+}
+
+static void nvmet_bdev_zone_append_bio_done(struct bio *bio)
+{
+	struct nvmet_req *req = bio->bi_private;
+
+	if (bio->bi_status == BLK_STS_OK) {
+		req->cqe->result.u64 =
+			nvmet_sect_to_lba(req->ns, bio->bi_iter.bi_sector);
+	}
+
+	nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
+	nvmet_req_bio_put(req, bio);
+}
+
+void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
+{
+	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
+	u16 status = NVME_SC_SUCCESS;
+	unsigned int total_len = 0;
+	struct scatterlist *sg;
+	struct bio *bio;
+	int sg_cnt;
+
+	/* Request is completed on len mismatch in nvmet_check_transter_len() */
+	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
+		return;
+
+	if (!req->sg_cnt) {
+		nvmet_req_complete(req, 0);
+		return;
+	}
+
+	if (sect >= get_capacity(req->ns->bdev->bd_disk)) {
+		req->error_loc = offsetof(struct nvme_rw_command, slba);
+		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+		goto out;
+	}
+
+	if (sect & (bdev_zone_sectors(req->ns->bdev) - 1)) {
+		req->error_loc = offsetof(struct nvme_rw_command, slba);
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		goto out;
+	}
+
+	if (nvmet_use_inline_bvec(req)) {
+		bio = &req->z.inline_bio;
+		bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
+	} else {
+		bio = bio_alloc(GFP_KERNEL, req->sg_cnt);
+	}
+
+	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
+	bio->bi_end_io = nvmet_bdev_zone_append_bio_done;
+	bio_set_dev(bio, req->ns->bdev);
+	bio->bi_iter.bi_sector = sect;
+	bio->bi_private = req;
+	if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
+		bio->bi_opf |= REQ_FUA;
+
+	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
+		struct page *p = sg_page(sg);
+		unsigned int l = sg->length;
+		unsigned int o = sg->offset;
+		unsigned int ret;
+
+		ret = bio_add_zone_append_page(bio, p, l, o);
+		if (ret != sg->length) {
+			status = NVME_SC_INTERNAL;
+			goto out_put_bio;
+		}
+		total_len += sg->length;
+	}
+
+	if (total_len != nvmet_rw_data_len(req)) {
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		goto out_put_bio;
+	}
+
+	submit_bio(bio);
+	return;
+
+out_put_bio:
+	nvmet_req_bio_put(req, bio);
+out:
+	nvmet_req_complete(req, status);
+}
+
+u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req)
+{
+	struct nvme_command *cmd = req->cmd;
+
+	switch (cmd->common.opcode) {
+	case nvme_cmd_zone_append:
+		req->execute = nvmet_bdev_execute_zone_append;
+		return 0;
+	case nvme_cmd_zone_mgmt_recv:
+		req->execute = nvmet_bdev_execute_zone_mgmt_recv;
+		return 0;
+	case nvme_cmd_zone_mgmt_send:
+		req->execute = nvmet_bdev_execute_zone_mgmt_send;
+		return 0;
+	default:
+		return nvmet_bdev_parse_io_cmd(req);
+	}
+}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index c7ba83144d52..cb1197f1cfed 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -944,6 +944,13 @@ struct nvme_zone_mgmt_recv_cmd {
 enum {
 	NVME_ZRA_ZONE_REPORT		= 0,
 	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
+	NVME_ZRASF_ZONE_STATE_EMPTY	= 0x01,
+	NVME_ZRASF_ZONE_STATE_IMP_OPEN	= 0x02,
+	NVME_ZRASF_ZONE_STATE_EXP_OPEN	= 0x03,
+	NVME_ZRASF_ZONE_STATE_CLOSED	= 0x04,
+	NVME_ZRASF_ZONE_STATE_READONLY	= 0x05,
+	NVME_ZRASF_ZONE_STATE_FULL	= 0x06,
+	NVME_ZRASF_ZONE_STATE_OFFLINE	= 0x07,
 	NVME_REPORT_ZONE_PARTIAL	= 1,
 };
 
-- 
cgit v1.2.3


From 8cf486e131b351db4f224078bef8e1efedcf0340 Mon Sep 17 00:00:00 2001
From: Wesley Sheng <wesley.sheng@amd.com>
Date: Wed, 16 Jun 2021 13:25:08 +0800
Subject: nvme.h: add missing nvme_lba_range_type endianness annotations

Signed-off-by: Wesley Sheng <wesley.sheng@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index cb1197f1cfed..b7c4c4130b65 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -636,8 +636,8 @@ struct nvme_lba_range_type {
 	__u8			type;
 	__u8			attributes;
 	__u8			rsvd2[14];
-	__u64			slba;
-	__u64			nlb;
+	__le64			slba;
+	__le64			nlb;
 	__u8			guid[16];
 	__u8			rsvd48[16];
 };
-- 
cgit v1.2.3


From aff0dbd03d3b750e2331f7cb93e01fe25ed27086 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 16 Jun 2021 16:22:50 +0200
Subject: ACPI: scan: Make acpi_walk_dep_device_list()

Because acpi_walk_dep_device_list() is only called by the code in the
file in which it is defined, make it static, drop the export of it
and drop its header from acpi.h.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/acpi/scan.c  | 7 +++----
 include/linux/acpi.h | 3 ---
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 524d85dc540c..b7f9b7ac0d04 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -2139,9 +2139,9 @@ static int acpi_scan_clear_dep(struct acpi_dep_data *dep, void *data)
  * negative value is returned by the callback then the loop is broken and that
  * value is returned as the final error.
  */
-int acpi_walk_dep_device_list(acpi_handle handle,
-			      int (*callback)(struct acpi_dep_data *, void *),
-			      void *data)
+static int acpi_walk_dep_device_list(acpi_handle handle,
+				int (*callback)(struct acpi_dep_data *, void *),
+				void *data)
 {
 	struct acpi_dep_data *dep, *tmp;
 	int ret = 0;
@@ -2158,7 +2158,6 @@ int acpi_walk_dep_device_list(acpi_handle handle,
 
 	return ret > 0 ? 0 : ret;
 }
-EXPORT_SYMBOL_GPL(acpi_walk_dep_device_list);
 
 /**
  * acpi_dev_clear_dependencies - Inform consumers that the device is now active
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 170b9bebdb2b..0a6d2845fcaf 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -666,9 +666,6 @@ extern bool acpi_driver_match_device(struct device *dev,
 				     const struct device_driver *drv);
 int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *);
 int acpi_device_modalias(struct device *, char *, int);
-int acpi_walk_dep_device_list(acpi_handle handle,
-			      int (*callback)(struct acpi_dep_data *, void *),
-			      void *data);
 
 struct platform_device *acpi_create_platform_device(struct acpi_device *,
 						    struct property_entry *);
-- 
cgit v1.2.3


From b10a038e84d188e15819058b2978b2daa9853aeb Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Tue, 18 May 2021 10:34:11 -0700
Subject: KVM: mmu: Add slots_arch_lock for memslot arch fields

Add a new lock to protect the arch-specific fields of memslots if they
need to be modified in a kvm->srcu read critical section. A future
commit will use this lock to lazily allocate memslot rmaps for x86.

Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210518173414.450044-5-bgardon@google.com>
[Add Documentation/ hunk. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/locking.rst |  5 ++++
 include/linux/kvm_host.h           |  9 +++++++
 virt/kvm/kvm_main.c                | 54 +++++++++++++++++++++++++++++++++-----
 3 files changed, 62 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 1fc860c007a3..35eca377543d 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -16,6 +16,11 @@ The acquisition orders for mutexes are as follows:
 - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
   them together is quite rare.
 
+- Unlike kvm->slots_lock, kvm->slots_arch_lock is released before
+  synchronize_srcu(&kvm->srcu).  Therefore kvm->slots_arch_lock
+  can be taken inside a kvm->srcu read-side critical section,
+  while kvm->slots_lock cannot.
+
 On x86:
 
 - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8583ed3ff344..11b9b11a5e9b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -523,6 +523,15 @@ struct kvm {
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
 	struct mutex slots_lock;
+
+	/*
+	 * Protects the arch-specific fields of struct kvm_memory_slots in
+	 * use by the VM. To be used under the slots_lock (above) or in a
+	 * kvm->srcu critical section where acquiring the slots_lock would
+	 * lead to deadlock with the synchronize_srcu in
+	 * install_new_memslots.
+	 */
+	struct mutex slots_arch_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d65be9461493..fa7e7ebefc79 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -909,6 +909,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
+	mutex_init(&kvm->slots_arch_lock);
 	INIT_LIST_HEAD(&kvm->devices);
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -1281,6 +1282,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
 	rcu_assign_pointer(kvm->memslots[as_id], slots);
+
+	/*
+	 * Acquired in kvm_set_memslot. Must be released before synchronize
+	 * SRCU below in order to avoid deadlock with another thread
+	 * acquiring the slots_arch_lock in an srcu critical section.
+	 */
+	mutex_unlock(&kvm->slots_arch_lock);
+
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*
@@ -1352,9 +1361,27 @@ static int kvm_set_memslot(struct kvm *kvm,
 	struct kvm_memslots *slots;
 	int r;
 
+	/*
+	 * Released in install_new_memslots.
+	 *
+	 * Must be held from before the current memslots are copied until
+	 * after the new memslots are installed with rcu_assign_pointer,
+	 * then released before the synchronize srcu in install_new_memslots.
+	 *
+	 * When modifying memslots outside of the slots_lock, must be held
+	 * before reading the pointer to the current memslots until after all
+	 * changes to those memslots are complete.
+	 *
+	 * These rules ensure that installing new memslots does not lose
+	 * changes made to the previous memslots.
+	 */
+	mutex_lock(&kvm->slots_arch_lock);
+
 	slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
-	if (!slots)
+	if (!slots) {
+		mutex_unlock(&kvm->slots_arch_lock);
 		return -ENOMEM;
+	}
 
 	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
 		/*
@@ -1365,10 +1392,9 @@ static int kvm_set_memslot(struct kvm *kvm,
 		slot->flags |= KVM_MEMSLOT_INVALID;
 
 		/*
-		 * We can re-use the old memslots, the only difference from the
-		 * newly installed memslots is the invalid flag, which will get
-		 * dropped by update_memslots anyway.  We'll also revert to the
-		 * old memslots if preparing the new memory region fails.
+		 * We can re-use the memory from the old memslots.
+		 * It will be overwritten with a copy of the new memslots
+		 * after reacquiring the slots_arch_lock below.
 		 */
 		slots = install_new_memslots(kvm, as_id, slots);
 
@@ -1380,6 +1406,17 @@ static int kvm_set_memslot(struct kvm *kvm,
 		 *	- kvm_is_visible_gfn (mmu_check_root)
 		 */
 		kvm_arch_flush_shadow_memslot(kvm, slot);
+
+		/* Released in install_new_memslots. */
+		mutex_lock(&kvm->slots_arch_lock);
+
+		/*
+		 * The arch-specific fields of the memslots could have changed
+		 * between releasing the slots_arch_lock in
+		 * install_new_memslots and here, so get a fresh copy of the
+		 * slots.
+		 */
+		kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
 	}
 
 	r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
@@ -1395,8 +1432,13 @@ static int kvm_set_memslot(struct kvm *kvm,
 	return 0;
 
 out_slots:
-	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
+	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+		slot = id_to_memslot(slots, old->id);
+		slot->flags &= ~KVM_MEMSLOT_INVALID;
 		slots = install_new_memslots(kvm, as_id, slots);
+	} else {
+		mutex_unlock(&kvm->slots_arch_lock);
+	}
 	kvfree(slots);
 	return r;
 }
-- 
cgit v1.2.3


From 605a140a49099effc069f0fd509db34d91f48496 Mon Sep 17 00:00:00 2001
From: Ilias Stamatis <ilstam@amazon.com>
Date: Wed, 26 May 2021 19:44:08 +0100
Subject: math64.h: Add mul_s64_u64_shr()

This function is needed for KVM's nested virtualization. The nested TSC
scaling implementation requires multiplying the signed TSC offset with
the unsigned TSC multiplier.

Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20210526184418.28881-2-ilstam@amazon.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/math64.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/math64.h b/include/linux/math64.h
index 66deb1fdc2ef..2928f03d6d46 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -3,6 +3,7 @@
 #define _LINUX_MATH64_H
 
 #include <linux/types.h>
+#include <linux/math.h>
 #include <vdso/math64.h>
 #include <asm/div64.h>
 
@@ -234,6 +235,24 @@ static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift)
 
 #endif
 
+#ifndef mul_s64_u64_shr
+static inline u64 mul_s64_u64_shr(s64 a, u64 b, unsigned int shift)
+{
+	u64 ret;
+
+	/*
+	 * Extract the sign before the multiplication and put it back
+	 * afterwards if needed.
+	 */
+	ret = mul_u64_u64_shr(abs(a), b, shift);
+
+	if (a < 0)
+		ret = -((s64) ret);
+
+	return ret;
+}
+#endif /* mul_s64_u64_shr */
+
 #ifndef mul_u64_u32_div
 static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
 {
-- 
cgit v1.2.3


From 2fdef3a2ae01dfd928c4b42c5a3b76546170a74c Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <senozhatsky@chromium.org>
Date: Sun, 6 Jun 2021 11:10:44 +0900
Subject: kvm: add PM-notifier

Add KVM PM-notifier so that architectures can have arch-specific
VM suspend/resume routines. Such architectures need to select
CONFIG_HAVE_KVM_PM_NOTIFIER and implement kvm_arch_pm_notifier().

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Acked-by: Marc Zyngier <maz@kernel.org>
Message-Id: <20210606021045.14159-1-senozhatsky@chromium.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  9 +++++++++
 virt/kvm/Kconfig         |  3 +++
 virt/kvm/kvm_main.c      | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 11b9b11a5e9b..37cbb56ccd09 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -28,6 +28,7 @@
 #include <linux/rcuwait.h>
 #include <linux/refcount.h>
 #include <linux/nospec.h>
+#include <linux/notifier.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -594,6 +595,10 @@ struct kvm {
 	pid_t userspace_pid;
 	unsigned int max_halt_poll_ns;
 	u32 dirty_ring_size;
+
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+	struct notifier_block pm_notifier;
+#endif
 };
 
 #define kvm_err(fmt, ...) \
@@ -1007,6 +1012,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state);
+#endif
+
 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
 void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry);
 #endif
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 1c37ccd5d402..62b39149b8c8 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -63,3 +63,6 @@ config HAVE_KVM_NO_POLL
 
 config KVM_XFER_TO_GUEST_WORK
        bool
+
+config HAVE_KVM_PM_NOTIFIER
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fa7e7ebefc79..fc35ba0ea5d3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
 #include <linux/io.h>
 #include <linux/lockdep.h>
 #include <linux/kthread.h>
+#include <linux/suspend.h>
 
 #include <asm/processor.h>
 #include <asm/ioctl.h>
@@ -780,6 +781,38 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_pm_notifier_call(struct notifier_block *bl,
+				unsigned long state,
+				void *unused)
+{
+	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
+
+	return kvm_arch_pm_notifier(kvm, state);
+}
+
+static void kvm_init_pm_notifier(struct kvm *kvm)
+{
+	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
+	/* Suspend KVM before we suspend ftrace, RCU, etc. */
+	kvm->pm_notifier.priority = INT_MAX;
+	register_pm_notifier(&kvm->pm_notifier);
+}
+
+static void kvm_destroy_pm_notifier(struct kvm *kvm)
+{
+	unregister_pm_notifier(&kvm->pm_notifier);
+}
+#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
+static void kvm_init_pm_notifier(struct kvm *kvm)
+{
+}
+
+static void kvm_destroy_pm_notifier(struct kvm *kvm)
+{
+}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+
 static struct kvm_memslots *kvm_alloc_memslots(void)
 {
 	int i;
@@ -964,6 +997,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_unlock(&kvm_lock);
 
 	preempt_notifier_inc();
+	kvm_init_pm_notifier(kvm);
 
 	return kvm;
 
@@ -1011,6 +1045,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	int i;
 	struct mm_struct *mm = kvm->mm;
 
+	kvm_destroy_pm_notifier(kvm);
 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
 	kvm_destroy_vm_debugfs(kvm);
 	kvm_arch_sync_events(kvm);
-- 
cgit v1.2.3


From 2d8ea148e553e1dd4e80a87741abdfb229e2b323 Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Thu, 17 Jun 2021 11:37:11 +0800
Subject: net: fix mistake path for netdev_features_strings

Th_strings arrays netdev_features_strings, tunable_strings, and
phy_tunable_strings has been moved to file net/ethtool/common.c.
So fixes the comment.

Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 2 +-
 include/uapi/linux/ethtool.h    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 3de38d6a0aea..2c6b9e416225 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -93,7 +93,7 @@ enum {
 
 	/*
 	 * Add your fresh new feature above and remember to update
-	 * netdev_features_strings[] in net/core/ethtool.c and maybe
+	 * netdev_features_strings[] in net/ethtool/common.c and maybe
 	 * some feature mask #defines below. Please also describe it
 	 * in Documentation/networking/netdev-features.rst.
 	 */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index cfef6b08169a..67aa7134b301 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -233,7 +233,7 @@ enum tunable_id {
 	ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */
 	/*
 	 * Add your fresh new tunable attribute above and remember to update
-	 * tunable_strings[] in net/core/ethtool.c
+	 * tunable_strings[] in net/ethtool/common.c
 	 */
 	__ETHTOOL_TUNABLE_COUNT,
 };
@@ -297,7 +297,7 @@ enum phy_tunable_id {
 	ETHTOOL_PHY_EDPD,
 	/*
 	 * Add your fresh new phy tunable attribute above and remember to update
-	 * phy_tunable_strings[] in net/core/ethtool.c
+	 * phy_tunable_strings[] in net/ethtool/common.c
 	 */
 	__ETHTOOL_PHY_TUNABLE_COUNT,
 };
-- 
cgit v1.2.3


From 43e76d463c09a0272b84775bcc727c1eb8b384b2 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Thu, 17 Jun 2021 15:29:04 +0300
Subject: driver core: add a helper to setup both the of_node and fwnode of a
 device

There are many places where both the fwnode_handle and the of_node of a
device need to be populated. Add a function which does both so that we
have consistency.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/base/core.c    | 7 +++++++
 include/linux/device.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 628e33939aca..b6836bfa985c 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4723,6 +4723,13 @@ void device_set_of_node_from_dev(struct device *dev, const struct device *dev2)
 }
 EXPORT_SYMBOL_GPL(device_set_of_node_from_dev);
 
+void device_set_node(struct device *dev, struct fwnode_handle *fwnode)
+{
+	dev->fwnode = fwnode;
+	dev->of_node = to_of_node(fwnode);
+}
+EXPORT_SYMBOL_GPL(device_set_node);
+
 int device_match_name(struct device *dev, const void *name)
 {
 	return sysfs_streq(dev_name(dev), name);
diff --git a/include/linux/device.h b/include/linux/device.h
index 38a2071cf776..a1e7cab2c7bf 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -819,6 +819,7 @@ int device_online(struct device *dev);
 void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
 void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
 void device_set_of_node_from_dev(struct device *dev, const struct device *dev2);
+void device_set_node(struct device *dev, struct fwnode_handle *fwnode);
 
 static inline int dev_num_vf(struct device *dev)
 {
-- 
cgit v1.2.3


From ca2e334232b6cd4ae5af9da2df83c009d042aefb Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 5 Mar 2021 13:19:52 +0100
Subject: lib: add iomem emulation (logic_iomem)

Add IO memory emulation that uses callbacks for read/write to
the allocated regions. The callbacks can be registered by the
users using logic_iomem_alloc().

To use, an architecture must 'select LOGIC_IOMEM' in Kconfig
and then include <asm-generic/logic_io.h> into asm/io.h to get
the __raw_read*/__raw_write* functions.

Optionally, an architecture may 'select LOGIC_IOMEM_FALLBACK'
in which case non-emulated regions will 'fall back' to the
various real_* functions that must then be provided.

Cc: Arnd Bergmann <arnd@kernel.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 include/asm-generic/logic_io.h |  78 ++++++++++
 include/linux/logic_iomem.h    |  62 ++++++++
 lib/Kconfig                    |  14 ++
 lib/Makefile                   |   2 +
 lib/logic_iomem.c              | 318 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 474 insertions(+)
 create mode 100644 include/asm-generic/logic_io.h
 create mode 100644 include/linux/logic_iomem.h
 create mode 100644 lib/logic_iomem.c

(limited to 'include/linux')

diff --git a/include/asm-generic/logic_io.h b/include/asm-generic/logic_io.h
new file mode 100644
index 000000000000..a53116b8c57e
--- /dev/null
+++ b/include/asm-generic/logic_io.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2021 Intel Corporation
+ * Author: johannes@sipsolutions.net
+ */
+#ifndef _LOGIC_IO_H
+#define _LOGIC_IO_H
+#include <linux/types.h>
+
+/* include this file into asm/io.h */
+
+#ifdef CONFIG_INDIRECT_IOMEM
+
+#ifdef CONFIG_INDIRECT_IOMEM_FALLBACK
+/*
+ * If you want emulated IO memory to fall back to 'normal' IO memory
+ * if a region wasn't registered as emulated, then you need to have
+ * all of the real_* functions implemented.
+ */
+#if !defined(real_ioremap) || !defined(real_iounmap) || \
+    !defined(real_raw_readb) || !defined(real_raw_writeb) || \
+    !defined(real_raw_readw) || !defined(real_raw_writew) || \
+    !defined(real_raw_readl) || !defined(real_raw_writel) || \
+    (defined(CONFIG_64BIT) && \
+     (!defined(real_raw_readq) || !defined(real_raw_writeq))) || \
+    !defined(real_memset_io) || \
+    !defined(real_memcpy_fromio) || \
+    !defined(real_memcpy_toio)
+#error "Must provide fallbacks for real IO memory access"
+#endif /* defined ... */
+#endif /* CONFIG_INDIRECT_IOMEM_FALLBACK */
+
+#define ioremap ioremap
+void __iomem *ioremap(phys_addr_t offset, size_t size);
+
+#define iounmap iounmap
+void iounmap(void __iomem *addr);
+
+#define __raw_readb __raw_readb
+u8 __raw_readb(const volatile void __iomem *addr);
+
+#define __raw_readw __raw_readw
+u16 __raw_readw(const volatile void __iomem *addr);
+
+#define __raw_readl __raw_readl
+u32 __raw_readl(const volatile void __iomem *addr);
+
+#ifdef CONFIG_64BIT
+#define __raw_readq __raw_readq
+u64 __raw_readq(const volatile void __iomem *addr);
+#endif /* CONFIG_64BIT */
+
+#define __raw_writeb __raw_writeb
+void __raw_writeb(u8 value, volatile void __iomem *addr);
+
+#define __raw_writew __raw_writew
+void __raw_writew(u16 value, volatile void __iomem *addr);
+
+#define __raw_writel __raw_writel
+void __raw_writel(u32 value, volatile void __iomem *addr);
+
+#ifdef CONFIG_64BIT
+#define __raw_writeq __raw_writeq
+void __raw_writeq(u64 value, volatile void __iomem *addr);
+#endif /* CONFIG_64BIT */
+
+#define memset_io memset_io
+void memset_io(volatile void __iomem *addr, int value, size_t size);
+
+#define memcpy_fromio memcpy_fromio
+void memcpy_fromio(void *buffer, const volatile void __iomem *addr,
+		   size_t size);
+
+#define memcpy_toio memcpy_toio
+void memcpy_toio(volatile void __iomem *addr, const void *buffer, size_t size);
+
+#endif /* CONFIG_INDIRECT_IOMEM */
+#endif /* _LOGIC_IO_H */
diff --git a/include/linux/logic_iomem.h b/include/linux/logic_iomem.h
new file mode 100644
index 000000000000..3fa65c964379
--- /dev/null
+++ b/include/linux/logic_iomem.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2021 Intel Corporation
+ * Author: johannes@sipsolutions.net
+ */
+#ifndef __LOGIC_IOMEM_H
+#define __LOGIC_IOMEM_H
+#include <linux/types.h>
+#include <linux/ioport.h>
+
+/**
+ * struct logic_iomem_ops - emulated IO memory ops
+ * @read: read an 8, 16, 32 or 64 bit quantity from the given offset,
+ *	size is given in bytes (1, 2, 4 or 8)
+ *	(64-bit only necessary if CONFIG_64BIT is set)
+ * @write: write an 8, 16 32 or 64 bit quantity to the given offset,
+ *	size is given in bytes (1, 2, 4 or 8)
+ *	(64-bit only necessary if CONFIG_64BIT is set)
+ * @set: optional, for memset_io()
+ * @copy_from: optional, for memcpy_fromio()
+ * @copy_to: optional, for memcpy_toio()
+ * @unmap: optional, this region is getting unmapped
+ */
+struct logic_iomem_ops {
+	unsigned long (*read)(void *priv, unsigned int offset, int size);
+	void (*write)(void *priv, unsigned int offset, int size,
+		      unsigned long val);
+
+	void (*set)(void *priv, unsigned int offset, u8 value, int size);
+	void (*copy_from)(void *priv, void *buffer, unsigned int offset,
+			  int size);
+	void (*copy_to)(void *priv, unsigned int offset, const void *buffer,
+			int size);
+
+	void (*unmap)(void *priv);
+};
+
+/**
+ * struct logic_iomem_region_ops - ops for an IO memory handler
+ * @map: map a range in the registered IO memory region, must
+ *	fill *ops with the ops and may fill *priv to be passed
+ *	to the ops. The offset is given as the offset into the
+ *	registered resource region.
+ *	The return value is negative for errors, or >= 0 for
+ *	success. On success, the return value is added to the
+ *	offset for later ops, to allow for partial mappings.
+ */
+struct logic_iomem_region_ops {
+	long (*map)(unsigned long offset, size_t size,
+		    const struct logic_iomem_ops **ops,
+		    void **priv);
+};
+
+/**
+ * logic_iomem_add_region - register an IO memory region
+ * @resource: the resource description for this region
+ * @ops: the IO memory mapping ops for this resource
+ */
+int logic_iomem_add_region(struct resource *resource,
+			   const struct logic_iomem_region_ops *ops);
+
+#endif /* __LOGIC_IOMEM_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index ac3b30697b2b..d241fe476fda 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -102,6 +102,20 @@ config INDIRECT_PIO
 
 	  When in doubt, say N.
 
+config INDIRECT_IOMEM
+	bool
+	help
+	  This is selected by other options/architectures to provide the
+	  emulated iomem accessors.
+
+config INDIRECT_IOMEM_FALLBACK
+	bool
+	depends on INDIRECT_IOMEM
+	help
+	  If INDIRECT_IOMEM is selected, this enables falling back to plain
+	  mmio accesses when the IO memory address is not a registered
+	  emulated region.
+
 config CRC_CCITT
 	tristate "CRC-CCITT functions"
 	help
diff --git a/lib/Makefile b/lib/Makefile
index 2cc359ec1fdd..fc38a55e6fe4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -147,6 +147,8 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
 lib-y += logic_pio.o
 
+lib-$(CONFIG_INDIRECT_IOMEM) += logic_iomem.o
+
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
diff --git a/lib/logic_iomem.c b/lib/logic_iomem.c
new file mode 100644
index 000000000000..b76b92dd0f1f
--- /dev/null
+++ b/lib/logic_iomem.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Intel Corporation
+ * Author: Johannes Berg <johannes@sipsolutions.net>
+ */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/logic_iomem.h>
+
+struct logic_iomem_region {
+	const struct resource *res;
+	const struct logic_iomem_region_ops *ops;
+	struct list_head list;
+};
+
+struct logic_iomem_area {
+	const struct logic_iomem_ops *ops;
+	void *priv;
+};
+
+#define AREA_SHIFT	24
+#define MAX_AREA_SIZE	(1 << AREA_SHIFT)
+#define MAX_AREAS	((1ULL<<32) / MAX_AREA_SIZE)
+#define AREA_BITS	((MAX_AREAS - 1) << AREA_SHIFT)
+#define AREA_MASK	(MAX_AREA_SIZE - 1)
+#ifdef CONFIG_64BIT
+#define IOREMAP_BIAS	0xDEAD000000000000UL
+#define IOREMAP_MASK	0xFFFFFFFF00000000UL
+#else
+#define IOREMAP_BIAS	0
+#define IOREMAP_MASK	0
+#endif
+
+static DEFINE_MUTEX(regions_mtx);
+static LIST_HEAD(regions_list);
+static struct logic_iomem_area mapped_areas[MAX_AREAS];
+
+int logic_iomem_add_region(struct resource *resource,
+			   const struct logic_iomem_region_ops *ops)
+{
+	struct logic_iomem_region *rreg;
+	int err;
+
+	if (WARN_ON(!resource || !ops))
+		return -EINVAL;
+
+	if (WARN_ON((resource->flags & IORESOURCE_TYPE_BITS) != IORESOURCE_MEM))
+		return -EINVAL;
+
+	rreg = kzalloc(sizeof(*rreg), GFP_KERNEL);
+	if (!rreg)
+		return -ENOMEM;
+
+	err = request_resource(&iomem_resource, resource);
+	if (err) {
+		kfree(rreg);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&regions_mtx);
+	rreg->res = resource;
+	rreg->ops = ops;
+	list_add_tail(&rreg->list, &regions_list);
+	mutex_unlock(&regions_mtx);
+
+	return 0;
+}
+EXPORT_SYMBOL(logic_iomem_add_region);
+
+#ifndef CONFIG_LOGIC_IOMEM_FALLBACK
+static void __iomem *real_ioremap(phys_addr_t offset, size_t size)
+{
+	WARN(1, "invalid ioremap(0x%llx, 0x%zx)\n",
+	     (unsigned long long)offset, size);
+	return NULL;
+}
+
+static void real_iounmap(void __iomem *addr)
+{
+	WARN(1, "invalid iounmap for addr 0x%llx\n",
+	     (unsigned long long)addr);
+}
+#endif /* CONFIG_LOGIC_IOMEM_FALLBACK */
+
+void __iomem *ioremap(phys_addr_t offset, size_t size)
+{
+	void __iomem *ret = NULL;
+	struct logic_iomem_region *rreg, *found = NULL;
+	int i;
+
+	mutex_lock(&regions_mtx);
+	list_for_each_entry(rreg, &regions_list, list) {
+		if (rreg->res->start > offset)
+			continue;
+		if (rreg->res->end < offset + size - 1)
+			continue;
+		found = rreg;
+		break;
+	}
+
+	if (!found)
+		goto out;
+
+	for (i = 0; i < MAX_AREAS; i++) {
+		long offs;
+
+		if (mapped_areas[i].ops)
+			continue;
+
+		offs = rreg->ops->map(offset - found->res->start,
+				      size, &mapped_areas[i].ops,
+				      &mapped_areas[i].priv);
+		if (offs < 0) {
+			mapped_areas[i].ops = NULL;
+			break;
+		}
+
+		if (WARN_ON(!mapped_areas[i].ops)) {
+			mapped_areas[i].ops = NULL;
+			break;
+		}
+
+		ret = (void __iomem *)(IOREMAP_BIAS + (i << AREA_SHIFT) + offs);
+		break;
+	}
+out:
+	mutex_unlock(&regions_mtx);
+	if (ret)
+		return ret;
+	return real_ioremap(offset, size);
+}
+EXPORT_SYMBOL(ioremap);
+
+static inline struct logic_iomem_area *
+get_area(const volatile void __iomem *addr)
+{
+	unsigned long a = (unsigned long)addr;
+	unsigned int idx;
+
+	if (WARN_ON((a & IOREMAP_MASK) != IOREMAP_BIAS))
+		return NULL;
+
+	idx = (a & AREA_BITS) >> AREA_SHIFT;
+
+	if (mapped_areas[idx].ops)
+		return &mapped_areas[idx];
+
+	return NULL;
+}
+
+void iounmap(void __iomem *addr)
+{
+	struct logic_iomem_area *area = get_area(addr);
+
+	if (!area) {
+		real_iounmap(addr);
+		return;
+	}
+
+	if (area->ops->unmap)
+		area->ops->unmap(area->priv);
+
+	mutex_lock(&regions_mtx);
+	area->ops = NULL;
+	area->priv = NULL;
+	mutex_unlock(&regions_mtx);
+}
+EXPORT_SYMBOL(iounmap);
+
+#ifndef CONFIG_LOGIC_IOMEM_FALLBACK
+#define MAKE_FALLBACK(op, sz) 						\
+static u##sz real_raw_read ## op(const volatile void __iomem *addr)	\
+{									\
+	WARN(1, "Invalid read" #op " at address %llx\n",		\
+	     (unsigned long long)addr);					\
+	return (u ## sz)~0ULL;						\
+}									\
+									\
+void real_raw_write ## op(u ## sz val, volatile void __iomem *addr)	\
+{									\
+	WARN(1, "Invalid writeq" #op " of 0x%llx at address %llx\n",	\
+	     (unsigned long long)val, (unsigned long long)addr);	\
+}									\
+
+MAKE_FALLBACK(b, 8);
+MAKE_FALLBACK(w, 16);
+MAKE_FALLBACK(l, 32);
+#ifdef CONFIG_64BIT
+MAKE_FALLBACK(q, 64);
+#endif
+
+static void real_memset_io(volatile void __iomem *addr, int value, size_t size)
+{
+	WARN(1, "Invalid memset_io at address 0x%llx\n",
+	     (unsigned long long)addr);
+}
+
+static void real_memcpy_fromio(void *buffer, const volatile void __iomem *addr,
+			       size_t size)
+{
+	WARN(1, "Invalid memcpy_fromio at address 0x%llx\n",
+	     (unsigned long long)addr);
+
+	memset(buffer, 0xff, size);
+}
+
+static void real_memcpy_toio(volatile void __iomem *addr, const void *buffer,
+			     size_t size)
+{
+	WARN(1, "Invalid memcpy_toio at address 0x%llx\n",
+	     (unsigned long long)addr);
+}
+#endif /* CONFIG_LOGIC_IOMEM_FALLBACK */
+
+#define MAKE_OP(op, sz) 						\
+u##sz __raw_read ## op(const volatile void __iomem *addr)		\
+{									\
+	struct logic_iomem_area *area = get_area(addr);			\
+									\
+	if (!area)							\
+		return real_raw_read ## op(addr);			\
+									\
+	return (u ## sz) area->ops->read(area->priv,			\
+					 (unsigned long)addr & AREA_MASK,\
+					 sz / 8);			\
+}									\
+EXPORT_SYMBOL(__raw_read ## op);					\
+									\
+void __raw_write ## op(u ## sz val, volatile void __iomem *addr)	\
+{									\
+	struct logic_iomem_area *area = get_area(addr);			\
+									\
+	if (!area) {							\
+		real_raw_write ## op(val, addr);			\
+		return;							\
+	}								\
+									\
+	area->ops->write(area->priv,					\
+			 (unsigned long)addr & AREA_MASK,		\
+			 sz / 8, val);					\
+}									\
+EXPORT_SYMBOL(__raw_write ## op)
+
+MAKE_OP(b, 8);
+MAKE_OP(w, 16);
+MAKE_OP(l, 32);
+#ifdef CONFIG_64BIT
+MAKE_OP(q, 64);
+#endif
+
+void memset_io(volatile void __iomem *addr, int value, size_t size)
+{
+	struct logic_iomem_area *area = get_area(addr);
+	unsigned long offs, start;
+
+	if (!area) {
+		real_memset_io(addr, value, size);
+		return;
+	}
+
+	start = (unsigned long)addr & AREA_MASK;
+
+	if (area->ops->set) {
+		area->ops->set(area->priv, start, value, size);
+		return;
+	}
+
+	for (offs = 0; offs < size; offs++)
+		area->ops->write(area->priv, start + offs, 1, value);
+}
+EXPORT_SYMBOL(memset_io);
+
+void memcpy_fromio(void *buffer, const volatile void __iomem *addr,
+                   size_t size)
+{
+	struct logic_iomem_area *area = get_area(addr);
+	u8 *buf = buffer;
+	unsigned long offs, start;
+
+	if (!area) {
+		real_memcpy_fromio(buffer, addr, size);
+		return;
+	}
+
+	start = (unsigned long)addr & AREA_MASK;
+
+	if (area->ops->copy_from) {
+		area->ops->copy_from(area->priv, buffer, start, size);
+		return;
+	}
+
+	for (offs = 0; offs < size; offs++)
+		buf[offs] = area->ops->read(area->priv, start + offs, 1);
+}
+EXPORT_SYMBOL(memcpy_fromio);
+
+void memcpy_toio(volatile void __iomem *addr, const void *buffer, size_t size)
+{
+	struct logic_iomem_area *area = get_area(addr);
+	const u8 *buf = buffer;
+	unsigned long offs, start;
+
+	if (!area) {
+		real_memcpy_toio(addr, buffer, size);
+		return;
+	}
+
+	start = (unsigned long)addr & AREA_MASK;
+
+	if (area->ops->copy_to) {
+		area->ops->copy_to(area->priv, start, buffer, size);
+		return;
+	}
+
+	for (offs = 0; offs < size; offs++)
+		area->ops->write(area->priv, start + offs, 1, buf[offs]);
+}
+EXPORT_SYMBOL(memcpy_toio);
-- 
cgit v1.2.3


From 738d5ad104bbbe5d1bfb6c0553bb4a1eb91cc433 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 18 Jun 2021 08:38:47 +0200
Subject: Revert "of/platform: Add stubs for
 of_platform_device_create/destroy()"

This reverts commit 412981e06294dac3254d83bbf71d4184ea911d05 as the
patch series is causing build issues in linux-next at the moment.

Cc: Matthias Kaehlcke <mka@chromium.org>
Link: https://lore.kernel.org/r/YMuRcrE8xlWnFSWW@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/of_platform.h | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index d15b6cd5e1c3..84a966623e78 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -61,18 +61,16 @@ static inline struct platform_device *of_find_device_by_node(struct device_node
 }
 #endif
 
-extern int of_platform_bus_probe(struct device_node *root,
-				 const struct of_device_id *matches,
-				 struct device *parent);
-
-#ifdef CONFIG_OF_ADDRESS
 /* Platform devices and busses creation */
 extern struct platform_device *of_platform_device_create(struct device_node *np,
 						   const char *bus_id,
 						   struct device *parent);
 
 extern int of_platform_device_destroy(struct device *dev, void *data);
-
+extern int of_platform_bus_probe(struct device_node *root,
+				 const struct of_device_id *matches,
+				 struct device *parent);
+#ifdef CONFIG_OF_ADDRESS
 extern int of_platform_populate(struct device_node *root,
 				const struct of_device_id *matches,
 				const struct of_dev_auxdata *lookup,
@@ -86,18 +84,6 @@ extern int devm_of_platform_populate(struct device *dev);
 
 extern void devm_of_platform_depopulate(struct device *dev);
 #else
-/* Platform devices and busses creation */
-static inline struct platform_device *of_platform_device_create(struct device_node *np,
-								const char *bus_id,
-								struct device *parent)
-{
-	return NULL;
-}
-static inline int of_platform_device_destroy(struct device *dev, void *data)
-{
-	return -ENODEV;
-}
-
 static inline int of_platform_populate(struct device_node *root,
 					const struct of_device_id *matches,
 					const struct of_dev_auxdata *lookup,
-- 
cgit v1.2.3


From 04d72afa34edd14d99db7536d22819cdbb2b2e4c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 18 Jun 2021 08:39:24 +0200
Subject: Revert "USB: misc: Add onboard_usb_hub driver"

This reverts commit b4e326165e21d6a11483f6a4de2174b933413554 as the
patch series is causing build issues in linux-next at the moment.

Cc: Matthias Kaehlcke <mka@chromium.org>
Link: https://lore.kernel.org/r/YMuRcrE8xlWnFSWW@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../ABI/testing/sysfs-bus-platform-onboard-usb-hub |   8 -
 MAINTAINERS                                        |   7 -
 drivers/usb/misc/Kconfig                           |  17 -
 drivers/usb/misc/Makefile                          |   1 -
 drivers/usb/misc/onboard_usb_hub.c                 | 497 ---------------------
 include/linux/usb/onboard_hub.h                    |  18 -
 6 files changed, 548 deletions(-)
 delete mode 100644 Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
 delete mode 100644 drivers/usb/misc/onboard_usb_hub.c
 delete mode 100644 include/linux/usb/onboard_hub.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
deleted file mode 100644
index e981d83648e6..000000000000
--- a/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
+++ /dev/null
@@ -1,8 +0,0 @@
-What:		/sys/bus/platform/devices/<dev>/always_powered_in_suspend
-Date:		March 2021
-KernelVersion:	5.13
-Contact:	Matthias Kaehlcke <matthias@kaehlcke.net>
-		linux-usb@vger.kernel.org
-Description:
-		(RW) Controls whether the USB hub remains always powered
-		during system suspend or not.
\ No newline at end of file
diff --git a/MAINTAINERS b/MAINTAINERS
index 56930e88e97e..bc0ceef87b73 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13622,13 +13622,6 @@ S:	Maintained
 T:	git git://linuxtv.org/media_tree.git
 F:	drivers/media/i2c/ov9734.c
 
-ONBOARD USB HUB DRIVER
-M:	Matthias Kaehlcke <mka@chromium.org>
-L:	linux-usb@vger.kernel.org
-S:	Maintained
-F:	Documentation/devicetree/bindings/usb/onboard_usb_hub.yaml
-F:	drivers/usb/misc/onboard_usb_hub.c
-
 ONENAND FLASH DRIVER
 M:	Kyungmin Park <kyungmin.park@samsung.com>
 L:	linux-mtd@lists.infradead.org
diff --git a/drivers/usb/misc/Kconfig b/drivers/usb/misc/Kconfig
index f534269fbb20..8f1144359012 100644
--- a/drivers/usb/misc/Kconfig
+++ b/drivers/usb/misc/Kconfig
@@ -284,20 +284,3 @@ config BRCM_USB_PINMAP
 	  This option enables support for remapping some USB external
 	  signals, which are typically on dedicated pins on the chip,
 	  to any gpio.
-
-config USB_ONBOARD_HUB
-	tristate "Onboard USB hub support"
-	depends on OF || COMPILE_TEST
-	help
-	  Say Y here if you want to support discrete onboard USB hubs that
-	  don't require an additional control bus for initialization, but
-	  need some nontrivial form of initialization, such as enabling a
-	  power regulator. An example for such a hub is the Realtek
-	  RTS5411.
-
-	  The driver can be configured to turn off the power of the hub
-	  during system suspend. This may reduce power consumption while
-	  the system is suspended.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called onboard_usb_hub.
diff --git a/drivers/usb/misc/Makefile b/drivers/usb/misc/Makefile
index 2c5aec6f1b26..5f4e598573ab 100644
--- a/drivers/usb/misc/Makefile
+++ b/drivers/usb/misc/Makefile
@@ -32,4 +32,3 @@ obj-$(CONFIG_USB_CHAOSKEY)		+= chaoskey.o
 obj-$(CONFIG_USB_SISUSBVGA)		+= sisusbvga/
 obj-$(CONFIG_USB_LINK_LAYER_TEST)	+= lvstest.o
 obj-$(CONFIG_BRCM_USB_PINMAP)		+= brcmstb-usb-pinmap.o
-obj-$(CONFIG_USB_ONBOARD_HUB)		+= onboard_usb_hub.o
diff --git a/drivers/usb/misc/onboard_usb_hub.c b/drivers/usb/misc/onboard_usb_hub.c
deleted file mode 100644
index ed1f5424ea5f..000000000000
--- a/drivers/usb/misc/onboard_usb_hub.c
+++ /dev/null
@@ -1,497 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  Driver for onboard USB hubs
- *
- * Copyright (c) 2020, Google LLC
- */
-
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/of.h>
-#include <linux/of_platform.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/consumer.h>
-#include <linux/suspend.h>
-#include <linux/sysfs.h>
-#include <linux/usb.h>
-#include <linux/usb/hcd.h>
-#include <linux/usb/of.h>
-#include <linux/usb/onboard_hub.h>
-
-static struct usb_device_driver onboard_hub_usbdev_driver;
-
-/************************** Platform driver **************************/
-
-struct udev_node {
-	struct usb_device *udev;
-	struct list_head list;
-};
-
-struct onboard_hub {
-	struct regulator *vdd;
-	struct device *dev;
-	bool always_powered_in_suspend;
-	bool is_powered_on;
-	bool going_away;
-	struct list_head udev_list;
-	struct mutex lock;
-};
-
-static int onboard_hub_power_on(struct onboard_hub *hub)
-{
-	int err;
-
-	err = regulator_enable(hub->vdd);
-	if (err) {
-		dev_err(hub->dev, "failed to enable regulator: %d\n", err);
-		return err;
-	}
-
-	hub->is_powered_on = true;
-
-	return 0;
-}
-
-static int onboard_hub_power_off(struct onboard_hub *hub)
-{
-	int err;
-
-	err = regulator_disable(hub->vdd);
-	if (err) {
-		dev_err(hub->dev, "failed to disable regulator: %d\n", err);
-		return err;
-	}
-
-	hub->is_powered_on = false;
-
-	return 0;
-}
-
-static int __maybe_unused onboard_hub_suspend(struct device *dev)
-{
-	struct onboard_hub *hub = dev_get_drvdata(dev);
-	struct udev_node *node;
-	bool power_off;
-	int rc = 0;
-
-	if (hub->always_powered_in_suspend)
-		return 0;
-
-	power_off = true;
-
-	mutex_lock(&hub->lock);
-
-	list_for_each_entry(node, &hub->udev_list, list) {
-		if (!device_may_wakeup(node->udev->bus->controller))
-			continue;
-
-		if (usb_wakeup_enabled_descendants(node->udev)) {
-			power_off = false;
-			break;
-		}
-	}
-
-	mutex_unlock(&hub->lock);
-
-	if (power_off)
-		rc = onboard_hub_power_off(hub);
-
-	return rc;
-}
-
-static int __maybe_unused onboard_hub_resume(struct device *dev)
-{
-	struct onboard_hub *hub = dev_get_drvdata(dev);
-	int rc = 0;
-
-	if (!hub->is_powered_on)
-		rc = onboard_hub_power_on(hub);
-
-	return rc;
-}
-
-static int onboard_hub_add_usbdev(struct onboard_hub *hub, struct usb_device *udev)
-{
-	struct udev_node *node;
-	char link_name[64];
-	int ret = 0;
-
-	mutex_lock(&hub->lock);
-
-	if (hub->going_away) {
-		ret = -EINVAL;
-		goto unlock;
-	}
-
-	node = devm_kzalloc(hub->dev, sizeof(*node), GFP_KERNEL);
-	if (!node) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
-
-	node->udev = udev;
-
-	list_add(&node->list, &hub->udev_list);
-
-	snprintf(link_name, sizeof(link_name), "usb_dev.%s", dev_name(&udev->dev));
-	WARN_ON(sysfs_create_link(&hub->dev->kobj, &udev->dev.kobj, link_name));
-
-unlock:
-	mutex_unlock(&hub->lock);
-
-	return ret;
-}
-
-static void onboard_hub_remove_usbdev(struct onboard_hub *hub, struct usb_device *udev)
-{
-	struct udev_node *node;
-	char link_name[64];
-
-	snprintf(link_name, sizeof(link_name), "usb_dev.%s", dev_name(&udev->dev));
-	sysfs_remove_link(&hub->dev->kobj, link_name);
-
-	mutex_lock(&hub->lock);
-
-	list_for_each_entry(node, &hub->udev_list, list) {
-		if (node->udev == udev) {
-			list_del(&node->list);
-			break;
-		}
-	}
-
-	mutex_unlock(&hub->lock);
-}
-
-static ssize_t always_powered_in_suspend_show(struct device *dev, struct device_attribute *attr,
-			   char *buf)
-{
-	struct onboard_hub *hub = dev_get_drvdata(dev);
-
-	return sysfs_emit(buf, "%d\n", hub->always_powered_in_suspend);
-}
-
-static ssize_t always_powered_in_suspend_store(struct device *dev, struct device_attribute *attr,
-			    const char *buf, size_t count)
-{
-	struct onboard_hub *hub = dev_get_drvdata(dev);
-	bool val;
-	int ret;
-
-	ret = kstrtobool(buf, &val);
-	if (ret < 0)
-		return ret;
-
-	hub->always_powered_in_suspend = val;
-
-	return count;
-}
-static DEVICE_ATTR_RW(always_powered_in_suspend);
-
-static struct attribute *onboard_hub_attrs[] = {
-	&dev_attr_always_powered_in_suspend.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(onboard_hub);
-
-static int onboard_hub_probe(struct platform_device *pdev)
-{
-	struct device *dev = &pdev->dev;
-	struct onboard_hub *hub;
-	int err;
-
-	hub = devm_kzalloc(dev, sizeof(*hub), GFP_KERNEL);
-	if (!hub)
-		return -ENOMEM;
-
-	hub->vdd = devm_regulator_get(dev, "vdd");
-	if (IS_ERR(hub->vdd))
-		return PTR_ERR(hub->vdd);
-
-	hub->dev = dev;
-	mutex_init(&hub->lock);
-	INIT_LIST_HEAD(&hub->udev_list);
-
-	dev_set_drvdata(dev, hub);
-
-	err = onboard_hub_power_on(hub);
-	if (err)
-		return err;
-
-	/*
-	 * The USB driver might have been detached from the USB devices by
-	 * onboard_hub_remove(), make sure to re-attach it if needed.
-	 */
-	err = driver_attach(&onboard_hub_usbdev_driver.drvwrap.driver);
-	if (err) {
-		onboard_hub_power_off(hub);
-		return err;
-	}
-
-	return 0;
-}
-
-static int onboard_hub_remove(struct platform_device *pdev)
-{
-	struct onboard_hub *hub = dev_get_drvdata(&pdev->dev);
-	struct udev_node *node;
-	struct usb_device *udev;
-
-	hub->going_away = true;
-
-	mutex_lock(&hub->lock);
-
-	/* unbind the USB devices to avoid dangling references to this device */
-	while (!list_empty(&hub->udev_list)) {
-		node = list_first_entry(&hub->udev_list, struct udev_node, list);
-		udev = node->udev;
-
-		/*
-		 * Unbinding the driver will call onboard_hub_remove_usbdev(),
-		 * which acquires hub->lock.  We must release the lock first.
-		 */
-		get_device(&udev->dev);
-		mutex_unlock(&hub->lock);
-		device_release_driver(&udev->dev);
-		put_device(&udev->dev);
-		mutex_lock(&hub->lock);
-	}
-
-	mutex_unlock(&hub->lock);
-
-	return onboard_hub_power_off(hub);
-}
-
-static const struct of_device_id onboard_hub_match[] = {
-	{ .compatible = "usbbda,411" },
-	{ .compatible = "usbbda,5411" },
-	{}
-};
-MODULE_DEVICE_TABLE(of, onboard_hub_match);
-
-static bool of_is_onboard_usb_hub(const struct device_node *np)
-{
-	return !!of_match_node(onboard_hub_match, np);
-}
-
-static const struct dev_pm_ops __maybe_unused onboard_hub_pm_ops = {
-	SET_LATE_SYSTEM_SLEEP_PM_OPS(onboard_hub_suspend, onboard_hub_resume)
-};
-
-static struct platform_driver onboard_hub_driver = {
-	.probe = onboard_hub_probe,
-	.remove = onboard_hub_remove,
-
-	.driver = {
-		.name = "onboard-usb-hub",
-		.of_match_table = onboard_hub_match,
-		.pm = pm_ptr(&onboard_hub_pm_ops),
-		.dev_groups = onboard_hub_groups,
-	},
-};
-
-/************************** USB driver **************************/
-
-#define VENDOR_ID_REALTEK	0x0bda
-
-/*
- * Returns the onboard_hub platform device that is associated with the USB
- * device passed as parameter.
- */
-static struct onboard_hub *_find_onboard_hub(struct device *dev)
-{
-	struct platform_device *pdev;
-	struct device_node *np;
-	phandle ph;
-
-	pdev = of_find_device_by_node(dev->of_node);
-	if (!pdev) {
-		if (of_property_read_u32(dev->of_node, "companion-hub", &ph)) {
-			dev_err(dev, "failed to read 'companion-hub' property\n");
-			return ERR_PTR(-EINVAL);
-		}
-
-		np = of_find_node_by_phandle(ph);
-		if (!np) {
-			dev_err(dev, "failed to find device node for companion hub\n");
-			return ERR_PTR(-EINVAL);
-		}
-
-		pdev = of_find_device_by_node(np);
-		of_node_put(np);
-
-		if (!pdev)
-			return ERR_PTR(-EPROBE_DEFER);
-	}
-
-	put_device(&pdev->dev);
-
-	return dev_get_drvdata(&pdev->dev);
-}
-
-static int onboard_hub_usbdev_probe(struct usb_device *udev)
-{
-	struct device *dev = &udev->dev;
-	struct onboard_hub *hub;
-	int err;
-
-	/* ignore supported hubs without device tree node */
-	if (!dev->of_node)
-		return -ENODEV;
-
-	hub = _find_onboard_hub(dev);
-	if (IS_ERR(hub))
-		return PTR_ERR(hub);
-
-	dev_set_drvdata(dev, hub);
-
-	err = onboard_hub_add_usbdev(hub, udev);
-	if (err)
-		return err;
-
-	err = sysfs_create_link(&udev->dev.kobj, &hub->dev->kobj, "onboard_hub_dev");
-	if (err)
-		dev_warn(&udev->dev, "failed to create symlink to platform device: %d\n", err);
-
-	return 0;
-}
-
-static void onboard_hub_usbdev_disconnect(struct usb_device *udev)
-{
-	struct onboard_hub *hub = dev_get_drvdata(&udev->dev);
-
-	sysfs_remove_link(&udev->dev.kobj, "onboard_hub_dev");
-
-	onboard_hub_remove_usbdev(hub, udev);
-}
-
-static const struct usb_device_id onboard_hub_id_table[] = {
-	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x0411) }, /* RTS5411 USB 3.0 */
-	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x5411) }, /* RTS5411 USB 2.0 */
-	{},
-};
-
-MODULE_DEVICE_TABLE(usb, onboard_hub_id_table);
-
-static struct usb_device_driver onboard_hub_usbdev_driver = {
-
-	.name = "onboard-usb-hub",
-	.probe = onboard_hub_usbdev_probe,
-	.disconnect = onboard_hub_usbdev_disconnect,
-	.generic_subclass = 1,
-	.supports_autosuspend =	1,
-	.id_table = onboard_hub_id_table,
-};
-
-/*** Helpers for creating/destroying platform devices for onboard hubs ***/
-
-struct pdev_list_entry {
-	struct platform_device *pdev;
-	struct list_head node;
-};
-
-/*
- * Creates a platform device for each supported onboard hub that is connected to
- * the given parent hub. To keep track of the platform devices they are added to
- * a list that is owned by the parent hub.
- */
-void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list)
-{
-	int i;
-	phandle ph;
-	struct device_node *np, *npc;
-	struct platform_device *pdev;
-	struct pdev_list_entry *pdle;
-
-	for (i = 1; i <= parent_hub->maxchild; i++) {
-		np = usb_of_get_device_node(parent_hub, i);
-		if (!np)
-			continue;
-
-		if (!of_is_onboard_usb_hub(np))
-			goto node_put;
-
-		if (of_property_read_u32(np, "companion-hub", &ph))
-			goto node_put;
-
-		npc = of_find_node_by_phandle(ph);
-		if (!npc)
-			goto node_put;
-
-		pdev = of_find_device_by_node(npc);
-		of_node_put(npc);
-
-		if (pdev) {
-			/* the companion hub already has a platform device, nothing to do here */
-			put_device(&pdev->dev);
-			goto node_put;
-		}
-
-		pdev = of_platform_device_create(np, NULL, &parent_hub->dev);
-		if (pdev) {
-			pdle = kzalloc(sizeof(*pdle), GFP_KERNEL);
-			if (!pdle)
-				goto node_put;
-
-			INIT_LIST_HEAD(&pdle->node);
-
-			pdle->pdev = pdev;
-			list_add(&pdle->node, pdev_list);
-		} else {
-			dev_err(&parent_hub->dev,
-				"failed to create platform device for onboard hub '%s'\n",
-				of_node_full_name(np));
-		}
-
-node_put:
-		of_node_put(np);
-	}
-}
-EXPORT_SYMBOL_GPL(onboard_hub_create_pdevs);
-
-/*
- * Destroys the platform devices in the given list and frees the memory associated
- * with the list entry.
- */
-void onboard_hub_destroy_pdevs(struct list_head *pdev_list)
-{
-	struct pdev_list_entry *pdle, *tmp;
-
-	list_for_each_entry_safe(pdle, tmp, pdev_list, node) {
-		of_platform_device_destroy(&pdle->pdev->dev, NULL);
-		kfree(pdle);
-	}
-}
-EXPORT_SYMBOL_GPL(onboard_hub_destroy_pdevs);
-
-/************************** Driver (de)registration **************************/
-
-static int __init onboard_hub_init(void)
-{
-	int ret;
-
-	ret = platform_driver_register(&onboard_hub_driver);
-	if (ret)
-		return ret;
-
-	ret = usb_register_device_driver(&onboard_hub_usbdev_driver, THIS_MODULE);
-	if (ret)
-		platform_driver_unregister(&onboard_hub_driver);
-
-	return ret;
-}
-module_init(onboard_hub_init);
-
-static void __exit onboard_hub_exit(void)
-{
-	usb_deregister_device_driver(&onboard_hub_usbdev_driver);
-	platform_driver_unregister(&onboard_hub_driver);
-}
-module_exit(onboard_hub_exit);
-
-MODULE_AUTHOR("Matthias Kaehlcke <mka@chromium.org>");
-MODULE_DESCRIPTION("Driver for discrete onboard USB hubs");
-MODULE_LICENSE("GPL v2");
diff --git a/include/linux/usb/onboard_hub.h b/include/linux/usb/onboard_hub.h
deleted file mode 100644
index d9373230556e..000000000000
--- a/include/linux/usb/onboard_hub.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __LINUX_USB_ONBOARD_HUB_H
-#define __LINUX_USB_ONBOARD_HUB_H
-
-struct usb_device;
-struct list_head;
-
-#if IS_ENABLED(CONFIG_USB_ONBOARD_HUB)
-void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list);
-void onboard_hub_destroy_pdevs(struct list_head *pdev_list);
-#else
-static inline void onboard_hub_create_pdevs(struct usb_device *parent_hub,
-					    struct list_head *pdev_list) {}
-static inline void onboard_hub_destroy_pdevs(struct list_head *pdev_list) {}
-#endif
-
-#endif /* __LINUX_USB_ONBOARD_HUB_H */
-- 
cgit v1.2.3


From b03fbd4ff24c5f075e58eb19261d5f8b3e40d7c6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 11 Jun 2021 10:28:12 +0200
Subject: sched: Introduce task_is_running()

Replace a bunch of 'p->state == TASK_RUNNING' with a new helper:
task_is_running(p).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210611082838.222401495@infradead.org
---
 arch/alpha/kernel/process.c    | 2 +-
 arch/arc/kernel/stacktrace.c   | 2 +-
 arch/arm/kernel/process.c      | 2 +-
 arch/arm64/kernel/process.c    | 2 +-
 arch/csky/kernel/stacktrace.c  | 2 +-
 arch/h8300/kernel/process.c    | 2 +-
 arch/hexagon/kernel/process.c  | 2 +-
 arch/ia64/kernel/process.c     | 4 ++--
 arch/m68k/kernel/process.c     | 2 +-
 arch/mips/kernel/process.c     | 2 +-
 arch/nds32/kernel/process.c    | 2 +-
 arch/nios2/kernel/process.c    | 2 +-
 arch/parisc/kernel/process.c   | 4 ++--
 arch/powerpc/kernel/process.c  | 4 ++--
 arch/riscv/kernel/stacktrace.c | 2 +-
 arch/s390/kernel/process.c     | 2 +-
 arch/s390/mm/fault.c           | 2 +-
 arch/sh/kernel/process_32.c    | 2 +-
 arch/sparc/kernel/process_32.c | 3 +--
 arch/sparc/kernel/process_64.c | 3 +--
 arch/um/kernel/process.c       | 2 +-
 arch/x86/kernel/process.c      | 4 ++--
 arch/xtensa/kernel/process.c   | 2 +-
 block/blk-mq.c                 | 2 +-
 include/linux/sched.h          | 2 ++
 kernel/kcsan/report.c          | 2 +-
 kernel/locking/lockdep.c       | 2 +-
 kernel/rcu/tree_plugin.h       | 2 +-
 kernel/sched/core.c            | 6 +++---
 kernel/sched/stats.h           | 2 +-
 kernel/signal.c                | 2 +-
 kernel/softirq.c               | 3 +--
 mm/compaction.c                | 2 +-
 33 files changed, 40 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 5112ab996394..ef0c08ed0481 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -380,7 +380,7 @@ get_wchan(struct task_struct *p)
 {
 	unsigned long schedule_frame;
 	unsigned long pc;
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 	/*
 	 * This one depends on the frame size of schedule().  Do a
diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c
index f73da203b170..1b9576d21e24 100644
--- a/arch/arc/kernel/stacktrace.c
+++ b/arch/arc/kernel/stacktrace.c
@@ -83,7 +83,7 @@ seed_unwind_frame_info(struct task_struct *tsk, struct pt_regs *regs,
 		 *    is safe-kept and BLINK at a well known location in there
 		 */
 
-		if (tsk->state == TASK_RUNNING)
+		if (task_is_running(tsk))
 			return -1;
 
 		frame_info->task = tsk;
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 6324f4db9b02..fc9e8b37eaa8 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -288,7 +288,7 @@ unsigned long get_wchan(struct task_struct *p)
 	struct stackframe frame;
 	unsigned long stack_page;
 	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	frame.fp = thread_saved_fp(p);
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index b4bb67f17a2c..14f3c19c6ad2 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -598,7 +598,7 @@ unsigned long get_wchan(struct task_struct *p)
 	struct stackframe frame;
 	unsigned long stack_page, ret = 0;
 	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	stack_page = (unsigned long)try_get_task_stack(p);
diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c
index 16ae20a0af34..1b280ef08004 100644
--- a/arch/csky/kernel/stacktrace.c
+++ b/arch/csky/kernel/stacktrace.c
@@ -115,7 +115,7 @@ unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc = 0;
 
-	if (likely(task && task != current && task->state != TASK_RUNNING))
+	if (likely(task && task != current && !task_is_running(task)))
 		walk_stackframe(task, NULL, save_wchan, &pc);
 	return pc;
 }
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 46b1342ce515..2ac27e4248a4 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -134,7 +134,7 @@ unsigned long get_wchan(struct task_struct *p)
 	unsigned long stack_page;
 	int count = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	stack_page = (unsigned long)p;
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index c61165c99ae0..6a6835fb4242 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -135,7 +135,7 @@ unsigned long get_wchan(struct task_struct *p)
 	unsigned long fp, pc;
 	unsigned long stack_page;
 	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	stack_page = (unsigned long)task_stack_page(p);
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 7e1a1525e202..e56d63f4abf9 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -529,7 +529,7 @@ get_wchan (struct task_struct *p)
 	unsigned long ip;
 	int count = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	/*
@@ -542,7 +542,7 @@ get_wchan (struct task_struct *p)
 	 */
 	unw_init_from_blocked_task(&info, p);
 	do {
-		if (p->state == TASK_RUNNING)
+		if (task_is_running(p))
 			return 0;
 		if (unw_unwind(&info) < 0)
 			return 0;
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index da83cc83e791..db49f9091711 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -268,7 +268,7 @@ unsigned long get_wchan(struct task_struct *p)
 	unsigned long fp, pc;
 	unsigned long stack_page;
 	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	stack_page = (unsigned long)task_stack_page(p);
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index bff080db0294..73c8e7990a97 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -662,7 +662,7 @@ unsigned long get_wchan(struct task_struct *task)
 	unsigned long ra = 0;
 #endif
 
-	if (!task || task == current || task->state == TASK_RUNNING)
+	if (!task || task == current || task_is_running(task))
 		goto out;
 	if (!task_stack_page(task))
 		goto out;
diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c
index c1327e552ec6..391895b54d13 100644
--- a/arch/nds32/kernel/process.c
+++ b/arch/nds32/kernel/process.c
@@ -239,7 +239,7 @@ unsigned long get_wchan(struct task_struct *p)
 	unsigned long stack_start, stack_end;
 	int count = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	if (IS_ENABLED(CONFIG_FRAME_POINTER)) {
diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c
index c5f916ca6845..9ff37ba2bb60 100644
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -223,7 +223,7 @@ unsigned long get_wchan(struct task_struct *p)
 	unsigned long stack_page;
 	int count = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	stack_page = (unsigned long)p;
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index b144fbe29bc1..184ec3c1eae4 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -249,7 +249,7 @@ get_wchan(struct task_struct *p)
 	unsigned long ip;
 	int count = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	/*
@@ -260,7 +260,7 @@ get_wchan(struct task_struct *p)
 	do {
 		if (unwind_once(&info) < 0)
 			return 0;
-		if (p->state == TASK_RUNNING)
+		if (task_is_running(p))
                         return 0;
 		ip = info.ip;
 		if (!in_sched_functions(ip))
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 89e34aa273e2..8935c5696bce 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -2084,7 +2084,7 @@ static unsigned long __get_wchan(struct task_struct *p)
 	unsigned long ip, sp;
 	int count = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	sp = p->thread.ksp;
@@ -2094,7 +2094,7 @@ static unsigned long __get_wchan(struct task_struct *p)
 	do {
 		sp = *(unsigned long *)sp;
 		if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) ||
-		    p->state == TASK_RUNNING)
+		    task_is_running(p))
 			return 0;
 		if (count > 0) {
 			ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE];
diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
index bde85fc53357..ff467b98c3e3 100644
--- a/arch/riscv/kernel/stacktrace.c
+++ b/arch/riscv/kernel/stacktrace.c
@@ -132,7 +132,7 @@ unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc = 0;
 
-	if (likely(task && task != current && task->state != TASK_RUNNING))
+	if (likely(task && task != current && !task_is_running(task)))
 		walk_stackframe(task, NULL, save_wchan, &pc);
 	return pc;
 }
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index e20bed1ed34a..7ae5dde9c54d 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -180,7 +180,7 @@ unsigned long get_wchan(struct task_struct *p)
 	struct unwind_state state;
 	unsigned long ip = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING || !task_stack_page(p))
+	if (!p || p == current || task_is_running(p) || !task_stack_page(p))
 		return 0;
 
 	if (!try_get_task_stack(p))
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 826d01777361..8ae3dc5783fd 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -702,7 +702,7 @@ static void pfault_interrupt(struct ext_code ext_code,
 			 * interrupt since it must be a leftover of a PFAULT
 			 * CANCEL operation which didn't remove all pending
 			 * completion interrupts. */
-			if (tsk->state == TASK_RUNNING)
+			if (task_is_running(tsk))
 				tsk->thread.pfault_wait = -1;
 		}
 	} else {
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 1aa508eb0823..717de05c81f4 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -186,7 +186,7 @@ unsigned long get_wchan(struct task_struct *p)
 {
 	unsigned long pc;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	/*
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 3b9794978e5b..93983d6d431d 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -376,8 +376,7 @@ unsigned long get_wchan(struct task_struct *task)
 	struct reg_window32 *rw;
 	int count = 0;
 
-	if (!task || task == current ||
-            task->state == TASK_RUNNING)
+	if (!task || task == current || task_is_running(task))
 		goto out;
 
 	fp = task_thread_info(task)->ksp + bias;
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 7afd0a859a78..d33c58a58d4f 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -674,8 +674,7 @@ unsigned long get_wchan(struct task_struct *task)
         unsigned long ret = 0;
 	int count = 0; 
 
-	if (!task || task == current ||
-            task->state == TASK_RUNNING)
+	if (!task || task == current || task_is_running(task))
 		goto out;
 
 	tp = task_thread_info(task);
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index c5011064b5dd..457a38db368b 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -369,7 +369,7 @@ unsigned long get_wchan(struct task_struct *p)
 	unsigned long stack_page, sp, ip;
 	bool seen_sched = 0;
 
-	if ((p == NULL) || (p == current) || (p->state == TASK_RUNNING))
+	if ((p == NULL) || (p == current) || task_is_running(p))
 		return 0;
 
 	stack_page = (unsigned long) task_stack_page(p);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5e1f38179f49..e52b208b4641 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -931,7 +931,7 @@ unsigned long get_wchan(struct task_struct *p)
 	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
 	int count = 0;
 
-	if (p == current || p->state == TASK_RUNNING)
+	if (p == current || task_is_running(p))
 		return 0;
 
 	if (!try_get_task_stack(p))
@@ -975,7 +975,7 @@ unsigned long get_wchan(struct task_struct *p)
 			goto out;
 		}
 		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
-	} while (count++ < 16 && p->state != TASK_RUNNING);
+	} while (count++ < 16 && !task_is_running(p));
 
 out:
 	put_task_stack(p);
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 9534ef515d74..060165340612 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -304,7 +304,7 @@ unsigned long get_wchan(struct task_struct *p)
 	unsigned long stack_page = (unsigned long) task_stack_page(p);
 	int count = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	sp = p->thread.sp;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c86c01bfecdb..655db5fb46d0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3926,7 +3926,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 		if (signal_pending_state(state, current))
 			__set_current_state(TASK_RUNNING);
 
-		if (current->state == TASK_RUNNING)
+		if (task_is_running(current))
 			return 1;
 		if (ret < 0 || !spin)
 			break;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ac5a7d29fd4f..2cd56352dae1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -113,6 +113,8 @@ struct task_group;
 					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
 					 TASK_PARKED)
 
+#define task_is_running(task)		(READ_ONCE((task)->state) == TASK_RUNNING)
+
 #define task_is_traced(task)		((task->state & __TASK_TRACED) != 0)
 
 #define task_is_stopped(task)		((task->state & __TASK_STOPPED) != 0)
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 13dce3c664d6..56016e8e7461 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -460,7 +460,7 @@ static void set_other_info_task_blocking(unsigned long *flags,
 	 * We may be instrumenting a code-path where current->state is already
 	 * something other than TASK_RUNNING.
 	 */
-	const bool is_running = current->state == TASK_RUNNING;
+	const bool is_running = task_is_running(current);
 	/*
 	 * To avoid deadlock in case we are in an interrupt here and this is a
 	 * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7641bd407239..4931a93c5162 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -760,7 +760,7 @@ static void lockdep_print_held_locks(struct task_struct *p)
 	 * It's not reliable to print a task's held locks if it's not sleeping
 	 * and it's not the current task.
 	 */
-	if (p->state == TASK_RUNNING && p != current)
+	if (p != current && task_is_running(p))
 		return;
 	for (i = 0; i < depth; i++) {
 		printk(" #%d: ", i);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ad0156b86937..4d6962048c30 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2768,7 +2768,7 @@ EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
 #ifdef CONFIG_SMP
 static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
 {
-	return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
+	return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
 }
 #else // #ifdef CONFIG_SMP
 static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75655cdee3bb..618c2b5a5758 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5974,7 +5974,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 {
 	unsigned int task_flags;
 
-	if (!tsk->state)
+	if (task_is_running(tsk))
 		return;
 
 	task_flags = tsk->flags;
@@ -7949,7 +7949,7 @@ again:
 	if (curr->sched_class != p->sched_class)
 		goto out_unlock;
 
-	if (task_running(p_rq, p) || p->state)
+	if (task_running(p_rq, p) || !task_is_running(p))
 		goto out_unlock;
 
 	yielded = curr->sched_class->yield_to_task(rq, p);
@@ -8152,7 +8152,7 @@ void sched_show_task(struct task_struct *p)
 
 	pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
 
-	if (p->state == TASK_RUNNING)
+	if (task_is_running(p))
 		pr_cont("  running task    ");
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 111072ee9663..d8f8eb0c655b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -217,7 +217,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
 
 	rq_sched_info_depart(rq, delta);
 
-	if (t->state == TASK_RUNNING)
+	if (task_is_running(t))
 		sched_info_enqueue(rq, t);
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index f7c6ffcbd044..5fc8fcf70c24 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -4719,7 +4719,7 @@ void kdb_send_sig(struct task_struct *t, int sig)
 	}
 	new_t = kdb_prev_t != t;
 	kdb_prev_t = t;
-	if (t->state != TASK_RUNNING && new_t) {
+	if (!task_is_running(t) && new_t) {
 		spin_unlock(&t->sighand->siglock);
 		kdb_printf("Process is not RUNNING, sending a signal from "
 			   "kdb risks deadlock\n"
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5ddc3b15a4db..f3a012179f47 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -92,8 +92,7 @@ static bool ksoftirqd_running(unsigned long pending)
 
 	if (pending & SOFTIRQ_NOW_MASK)
 		return false;
-	return tsk && (tsk->state == TASK_RUNNING) &&
-		!__kthread_should_park(tsk);
+	return tsk && task_is_running(tsk) && !__kthread_should_park(tsk);
 }
 
 #ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/mm/compaction.c b/mm/compaction.c
index 84fde270ae74..725f564a5664 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1955,7 +1955,7 @@ static inline bool is_via_compact_memory(int order)
 
 static bool kswapd_is_running(pg_data_t *pgdat)
 {
-	return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+	return pgdat->kswapd && task_is_running(pgdat->kswapd);
 }
 
 /*
-- 
cgit v1.2.3


From d6c23bb3a2ad2f8f7dd46292b8bc54d27f2fb3f1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 11 Jun 2021 10:28:14 +0200
Subject: sched: Add get_current_state()

Remove yet another few p->state accesses.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210611082838.347475156@infradead.org
---
 block/blk-mq.c        | 2 +-
 include/linux/sched.h | 2 ++
 kernel/freezer.c      | 2 +-
 kernel/sched/core.c   | 6 +++---
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 655db5fb46d0..56270bb06365 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3910,7 +3910,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 
 	hctx->poll_considered++;
 
-	state = current->state;
+	state = get_current_state();
 	do {
 		int ret;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2cd56352dae1..395c8906f502 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -213,6 +213,8 @@ struct task_group;
 
 #endif
 
+#define get_current_state()	READ_ONCE(current->state)
+
 /* Task command name length: */
 #define TASK_COMM_LEN			16
 
diff --git a/kernel/freezer.c b/kernel/freezer.c
index dc520f01f99d..45ab36ffd0e7 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -58,7 +58,7 @@ bool __refrigerator(bool check_kthr_stop)
 	/* Hmm, should we be allowed to suspend when there are realtime
 	   processes around? */
 	bool was_frozen = false;
-	long save = current->state;
+	unsigned int save = get_current_state();
 
 	pr_debug("%s entered refrigerator\n", current->comm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 618c2b5a5758..45ebb3cfe86c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9098,15 +9098,15 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
+	unsigned int state = get_current_state();
 	/*
 	 * Blocking primitives will set (and therefore destroy) current->state,
 	 * since we will exit with TASK_RUNNING make sure we enter with it,
 	 * otherwise we will destroy state.
 	 */
-	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+	WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
 			"do not call blocking ops when !TASK_RUNNING; "
-			"state=%lx set at [<%p>] %pS\n",
-			current->state,
+			"state=%x set at [<%p>] %pS\n", state,
 			(void *)current->task_state_change,
 			(void *)current->task_state_change);
 
-- 
cgit v1.2.3


From 2f064a59a11ff9bc22e52e9678bc601404c7cb34 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 11 Jun 2021 10:28:17 +0200
Subject: sched: Change task_struct::state

Change the type and name of task_struct::state. Drop the volatile and
shrink it to an 'unsigned int'. Rename it in order to find all uses
such that we can use READ_ONCE/WRITE_ONCE as appropriate.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20210611082838.550736351@infradead.org
---
 arch/ia64/kernel/mca.c         |  2 +-
 arch/ia64/kernel/ptrace.c      |  8 +++----
 arch/powerpc/xmon/xmon.c       | 13 ++++++-----
 block/blk-mq.c                 |  2 +-
 drivers/md/dm.c                |  6 ++---
 fs/binfmt_elf.c                |  8 ++++---
 fs/binfmt_elf_fdpic.c          |  4 +++-
 fs/userfaultfd.c               |  4 ++--
 include/linux/sched.h          | 31 ++++++++++++------------
 include/linux/sched/debug.h    |  2 +-
 include/linux/sched/signal.h   |  2 +-
 init/init_task.c               |  2 +-
 kernel/cgroup/cgroup-v1.c      |  2 +-
 kernel/debug/kdb/kdb_support.c | 18 +++++++-------
 kernel/fork.c                  |  4 ++--
 kernel/hung_task.c             |  2 +-
 kernel/kthread.c               |  4 ++--
 kernel/locking/mutex.c         |  6 ++---
 kernel/locking/rtmutex.c       |  4 ++--
 kernel/locking/rwsem.c         |  2 +-
 kernel/ptrace.c                | 12 +++++-----
 kernel/rcu/rcutorture.c        |  4 ++--
 kernel/rcu/tree_stall.h        | 12 +++++-----
 kernel/sched/core.c            | 53 ++++++++++++++++++++++--------------------
 kernel/sched/deadline.c        | 10 ++++----
 kernel/sched/fair.c            | 11 +++++----
 lib/syscall.c                  |  4 ++--
 net/core/dev.c                 |  2 +-
 28 files changed, 123 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index cdbac4b52f30..e628a88607bb 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1788,7 +1788,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
 	ti->task = p;
 	ti->cpu = cpu;
 	p->stack = ti;
-	p->state = TASK_UNINTERRUPTIBLE;
+	p->__state = TASK_UNINTERRUPTIBLE;
 	cpumask_set_cpu(cpu, &p->cpus_mask);
 	INIT_LIST_HEAD(&p->tasks);
 	p->parent = p->real_parent = p->group_leader = p;
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index e14f5653393a..df28c7dd164f 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -641,11 +641,11 @@ ptrace_attach_sync_user_rbs (struct task_struct *child)
 	read_lock(&tasklist_lock);
 	if (child->sighand) {
 		spin_lock_irq(&child->sighand->siglock);
-		if (child->state == TASK_STOPPED &&
+		if (READ_ONCE(child->__state) == TASK_STOPPED &&
 		    !test_and_set_tsk_thread_flag(child, TIF_RESTORE_RSE)) {
 			set_notify_resume(child);
 
-			child->state = TASK_TRACED;
+			WRITE_ONCE(child->__state, TASK_TRACED);
 			stopped = 1;
 		}
 		spin_unlock_irq(&child->sighand->siglock);
@@ -665,9 +665,9 @@ ptrace_attach_sync_user_rbs (struct task_struct *child)
 	read_lock(&tasklist_lock);
 	if (child->sighand) {
 		spin_lock_irq(&child->sighand->siglock);
-		if (child->state == TASK_TRACED &&
+		if (READ_ONCE(child->__state) == TASK_TRACED &&
 		    (child->signal->flags & SIGNAL_STOP_STOPPED)) {
-			child->state = TASK_STOPPED;
+			WRITE_ONCE(child->__state, TASK_STOPPED);
 		}
 		spin_unlock_irq(&child->sighand->siglock);
 	}
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index c8173e92f19d..84de2d7c2f40 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3162,6 +3162,7 @@ memzcan(void)
 
 static void show_task(struct task_struct *tsk)
 {
+	unsigned int p_state = READ_ONCE(tsk->__state);
 	char state;
 
 	/*
@@ -3169,14 +3170,14 @@ static void show_task(struct task_struct *tsk)
 	 * appropriate for calling from xmon. This could be moved
 	 * to a common, generic, routine used by both.
 	 */
-	state = (tsk->state == 0) ? 'R' :
-		(tsk->state < 0) ? 'U' :
-		(tsk->state & TASK_UNINTERRUPTIBLE) ? 'D' :
-		(tsk->state & TASK_STOPPED) ? 'T' :
-		(tsk->state & TASK_TRACED) ? 'C' :
+	state = (p_state == 0) ? 'R' :
+		(p_state < 0) ? 'U' :
+		(p_state & TASK_UNINTERRUPTIBLE) ? 'D' :
+		(p_state & TASK_STOPPED) ? 'T' :
+		(p_state & TASK_TRACED) ? 'C' :
 		(tsk->exit_state & EXIT_ZOMBIE) ? 'Z' :
 		(tsk->exit_state & EXIT_DEAD) ? 'E' :
-		(tsk->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+		(p_state & TASK_INTERRUPTIBLE) ? 'S' : '?';
 
 	printf("%16px %16lx %16px %6d %6d %c %2d %s\n", tsk,
 		tsk->thread.ksp, tsk->thread.regs,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 56270bb06365..e41edae97487 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3886,7 +3886,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
 int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
 	struct blk_mq_hw_ctx *hctx;
-	long state;
+	unsigned int state;
 
 	if (!blk_qc_t_valid(cookie) ||
 	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ca2aedd8ee7d..190e714cb565 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2328,7 +2328,7 @@ static bool md_in_flight_bios(struct mapped_device *md)
 	return sum != 0;
 }
 
-static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
+static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
 {
 	int r = 0;
 	DEFINE_WAIT(wait);
@@ -2351,7 +2351,7 @@ static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state
 	return r;
 }
 
-static int dm_wait_for_completion(struct mapped_device *md, long task_state)
+static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
 {
 	int r = 0;
 
@@ -2478,7 +2478,7 @@ static void unlock_fs(struct mapped_device *md)
  * are being added to md->deferred list.
  */
 static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
-			unsigned suspend_flags, long task_state,
+			unsigned suspend_flags, unsigned int task_state,
 			int dmf_suspended_flag)
 {
 	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 187b3f2b9202..3d73cbb439fa 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1537,7 +1537,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 {
 	const struct cred *cred;
 	unsigned int i, len;
-	
+	unsigned int state;
+
 	/* first copy the parameters from user space */
 	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
 
@@ -1559,7 +1560,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
-	i = p->state ? ffz(~p->state) + 1 : 0;
+	state = READ_ONCE(p->__state);
+	i = state ? ffz(~state) + 1 : 0;
 	psinfo->pr_state = i;
 	psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
 	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
@@ -1571,7 +1573,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
 	rcu_read_unlock();
 	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
-	
+
 	return 0;
 }
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c99b102c860..ab9c31ddffda 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1331,6 +1331,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 {
 	const struct cred *cred;
 	unsigned int i, len;
+	unsigned int state;
 
 	/* first copy the parameters from user space */
 	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
@@ -1353,7 +1354,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
-	i = p->state ? ffz(~p->state) + 1 : 0;
+	state = READ_ONCE(p->__state);
+	i = state ? ffz(~state) + 1 : 0;
 	psinfo->pr_state = i;
 	psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
 	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 14f92285d04f..dd7a6c62b56f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -337,7 +337,7 @@ out:
 	return ret;
 }
 
-static inline long userfaultfd_get_blocking_state(unsigned int flags)
+static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
 {
 	if (flags & FAULT_FLAG_INTERRUPTIBLE)
 		return TASK_INTERRUPTIBLE;
@@ -370,7 +370,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	struct userfaultfd_wait_queue uwq;
 	vm_fault_t ret = VM_FAULT_SIGBUS;
 	bool must_wait;
-	long blocking_state;
+	unsigned int blocking_state;
 
 	/*
 	 * We don't do userfault handling for the final child pid update.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 395c8906f502..50db9496c99d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -113,13 +113,13 @@ struct task_group;
 					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
 					 TASK_PARKED)
 
-#define task_is_running(task)		(READ_ONCE((task)->state) == TASK_RUNNING)
+#define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 
-#define task_is_traced(task)		((task->state & __TASK_TRACED) != 0)
+#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
 
-#define task_is_stopped(task)		((task->state & __TASK_STOPPED) != 0)
+#define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
 
-#define task_is_stopped_or_traced(task)	((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
@@ -134,14 +134,14 @@ struct task_group;
 	do {							\
 		WARN_ON_ONCE(is_special_task_state(state_value));\
 		current->task_state_change = _THIS_IP_;		\
-		current->state = (state_value);			\
+		WRITE_ONCE(current->__state, (state_value));	\
 	} while (0)
 
 #define set_current_state(state_value)				\
 	do {							\
 		WARN_ON_ONCE(is_special_task_state(state_value));\
 		current->task_state_change = _THIS_IP_;		\
-		smp_store_mb(current->state, (state_value));	\
+		smp_store_mb(current->__state, (state_value));	\
 	} while (0)
 
 #define set_special_state(state_value)					\
@@ -150,7 +150,7 @@ struct task_group;
 		WARN_ON_ONCE(!is_special_task_state(state_value));	\
 		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
 		current->task_state_change = _THIS_IP_;			\
-		current->state = (state_value);				\
+		WRITE_ONCE(current->__state, (state_value));		\
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
 	} while (0)
 #else
@@ -192,10 +192,10 @@ struct task_group;
  * Also see the comments of try_to_wake_up().
  */
 #define __set_current_state(state_value)				\
-	current->state = (state_value)
+	WRITE_ONCE(current->__state, (state_value))
 
 #define set_current_state(state_value)					\
-	smp_store_mb(current->state, (state_value))
+	smp_store_mb(current->__state, (state_value))
 
 /*
  * set_special_state() should be used for those states when the blocking task
@@ -207,13 +207,13 @@ struct task_group;
 	do {								\
 		unsigned long flags; /* may shadow */			\
 		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
-		current->state = (state_value);				\
+		WRITE_ONCE(current->__state, (state_value));		\
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
 	} while (0)
 
 #endif
 
-#define get_current_state()	READ_ONCE(current->state)
+#define get_current_state()	READ_ONCE(current->__state)
 
 /* Task command name length: */
 #define TASK_COMM_LEN			16
@@ -666,8 +666,7 @@ struct task_struct {
 	 */
 	struct thread_info		thread_info;
 #endif
-	/* -1 unrunnable, 0 runnable, >0 stopped: */
-	volatile long			state;
+	unsigned int			__state;
 
 	/*
 	 * This begins the randomizable portion of task_struct. Only
@@ -1532,7 +1531,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 
 static inline unsigned int task_state_index(struct task_struct *tsk)
 {
-	unsigned int tsk_state = READ_ONCE(tsk->state);
+	unsigned int tsk_state = READ_ONCE(tsk->__state);
 	unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
 
 	BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
@@ -1840,10 +1839,10 @@ static __always_inline void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 }
-extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
+extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
 #else
 static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
 {
 	return 1;
 }
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index ae51f4529fc9..b5035afa2396 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -14,7 +14,7 @@ extern void dump_cpu_task(int cpu);
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
-extern void show_state_filter(unsigned long state_filter);
+extern void show_state_filter(unsigned int state_filter);
 
 static inline void show_state(void)
 {
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 7f4278fa21fe..c9cf678c347d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -382,7 +382,7 @@ static inline int fatal_signal_pending(struct task_struct *p)
 	return task_sigpending(p) && __fatal_signal_pending(p);
 }
 
-static inline int signal_pending_state(long state, struct task_struct *p)
+static inline int signal_pending_state(unsigned int state, struct task_struct *p)
 {
 	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
 		return 0;
diff --git a/init/init_task.c b/init/init_task.c
index 8b08c2e19cbb..562f2ef8d157 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -71,7 +71,7 @@ struct task_struct init_task
 	.thread_info	= INIT_THREAD_INFO(init_task),
 	.stack_refcount	= REFCOUNT_INIT(1),
 #endif
-	.state		= 0,
+	.__state	= 0,
 	.stack		= init_stack,
 	.usage		= REFCOUNT_INIT(2),
 	.flags		= PF_KTHREAD,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1f274d7fc934..ee93b6e89587 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -713,7 +713,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 
 	css_task_iter_start(&cgrp->self, 0, &it);
 	while ((tsk = css_task_iter_next(&it))) {
-		switch (tsk->state) {
+		switch (READ_ONCE(tsk->__state)) {
 		case TASK_RUNNING:
 			stats->nr_running++;
 			break;
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 91bb666d7c03..9f50d22d68e6 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -609,23 +609,25 @@ unsigned long kdb_task_state_string(const char *s)
  */
 char kdb_task_state_char (const struct task_struct *p)
 {
-	int cpu;
-	char state;
+	unsigned int p_state;
 	unsigned long tmp;
+	char state;
+	int cpu;
 
 	if (!p ||
 	    copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
 		return 'E';
 
 	cpu = kdb_process_cpu(p);
-	state = (p->state == 0) ? 'R' :
-		(p->state < 0) ? 'U' :
-		(p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
-		(p->state & TASK_STOPPED) ? 'T' :
-		(p->state & TASK_TRACED) ? 'C' :
+	p_state = READ_ONCE(p->__state);
+	state = (p_state == 0) ? 'R' :
+		(p_state < 0) ? 'U' :
+		(p_state & TASK_UNINTERRUPTIBLE) ? 'D' :
+		(p_state & TASK_STOPPED) ? 'T' :
+		(p_state & TASK_TRACED) ? 'C' :
 		(p->exit_state & EXIT_ZOMBIE) ? 'Z' :
 		(p->exit_state & EXIT_DEAD) ? 'E' :
-		(p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+		(p_state & TASK_INTERRUPTIBLE) ? 'S' : '?';
 	if (is_idle_task(p)) {
 		/* Idle task.  Is it really idle, apart from the kdb
 		 * interrupt? */
diff --git a/kernel/fork.c b/kernel/fork.c
index e595e77913eb..1a9af73b47c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -425,7 +425,7 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
 
 static void release_task_stack(struct task_struct *tsk)
 {
-	if (WARN_ON(tsk->state != TASK_DEAD))
+	if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
 		return;  /* Better to leak the stack than to free prematurely */
 
 	account_kernel_stack(tsk, -1);
@@ -2392,7 +2392,7 @@ bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
 	exit_creds(p);
 bad_fork_free:
-	p->state = TASK_DEAD;
+	WRITE_ONCE(p->__state, TASK_DEAD);
 	put_task_stack(p);
 	delayed_free_task(p);
 fork_out:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 396ebaebea3f..b0ce8b3f3822 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -196,7 +196,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 			last_break = jiffies;
 		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
-		if (t->state == TASK_UNINTERRUPTIBLE)
+		if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, timeout);
 	}
  unlock:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d326833092b..7bbfeeb0e956 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -457,7 +457,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
 {
 	unsigned long flags;
 
@@ -473,7 +473,7 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
 {
 	__kthread_bind_mask(p, cpumask_of(cpu), state);
 }
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 013e1b08a1bf..d2df5e68b503 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -923,7 +923,7 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter,
  * Lock a mutex (possibly interruptible), slowpath:
  */
 static __always_inline int __sched
-__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
 		    struct lockdep_map *nest_lock, unsigned long ip,
 		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
 {
@@ -1098,14 +1098,14 @@ err_early_kill:
 }
 
 static int __sched
-__mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 	     struct lockdep_map *nest_lock, unsigned long ip)
 {
 	return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
 }
 
 static int __sched
-__ww_mutex_lock(struct mutex *lock, long state, unsigned int subclass,
+__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 		struct lockdep_map *nest_lock, unsigned long ip,
 		struct ww_acquire_ctx *ww_ctx)
 {
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 406818196a9f..b5d9bb5202c6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1135,7 +1135,7 @@ void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
  *
  * Must be called with lock->wait_lock held and interrupts disabled
  */
-static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state,
+static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state,
 				       struct hrtimer_sleeper *timeout,
 				       struct rt_mutex_waiter *waiter)
 {
@@ -1190,7 +1190,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
 /*
  * Slow path lock function:
  */
-static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state,
+static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state,
 				     struct hrtimer_sleeper *timeout,
 				     enum rtmutex_chainwalk chwalk)
 {
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 809b0016d344..16bfbb10c74d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -889,7 +889,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
  * Wait for the read lock to be granted
  */
 static struct rw_semaphore __sched *
-rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state)
+rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
 {
 	long adjustment = -RWSEM_READER_BIAS;
 	long rcnt = (count >> RWSEM_READER_SHIFT);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2997ca600d18..f8589bf8d7dc 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -197,7 +197,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
 	spin_lock_irq(&task->sighand->siglock);
 	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
 	    !__fatal_signal_pending(task)) {
-		task->state = __TASK_TRACED;
+		WRITE_ONCE(task->__state, __TASK_TRACED);
 		ret = true;
 	}
 	spin_unlock_irq(&task->sighand->siglock);
@@ -207,7 +207,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
 
 static void ptrace_unfreeze_traced(struct task_struct *task)
 {
-	if (task->state != __TASK_TRACED)
+	if (READ_ONCE(task->__state) != __TASK_TRACED)
 		return;
 
 	WARN_ON(!task->ptrace || task->parent != current);
@@ -217,11 +217,11 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
 	 * Recheck state under the lock to close this race.
 	 */
 	spin_lock_irq(&task->sighand->siglock);
-	if (task->state == __TASK_TRACED) {
+	if (READ_ONCE(task->__state) == __TASK_TRACED) {
 		if (__fatal_signal_pending(task))
 			wake_up_state(task, __TASK_TRACED);
 		else
-			task->state = TASK_TRACED;
+			WRITE_ONCE(task->__state, TASK_TRACED);
 	}
 	spin_unlock_irq(&task->sighand->siglock);
 }
@@ -256,7 +256,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 	 */
 	read_lock(&tasklist_lock);
 	if (child->ptrace && child->parent == current) {
-		WARN_ON(child->state == __TASK_TRACED);
+		WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
 		/*
 		 * child->sighand can't be NULL, release_task()
 		 * does ptrace_unlink() before __exit_signal().
@@ -273,7 +273,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 			 * ptrace_stop() changes ->state back to TASK_RUNNING,
 			 * so we should not worry about leaking __TASK_TRACED.
 			 */
-			WARN_ON(child->state == __TASK_TRACED);
+			WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
 			ret = -ESRCH;
 		}
 	}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 29d2f4c647d3..194b9c145c40 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1831,10 +1831,10 @@ rcu_torture_stats_print(void)
 		srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
 					&flags, &gp_seq);
 		wtp = READ_ONCE(writer_task);
-		pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n",
+		pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
 			 rcu_torture_writer_state_getname(),
 			 rcu_torture_writer_state, gp_seq, flags,
-			 wtp == NULL ? ~0UL : wtp->state,
+			 wtp == NULL ? ~0U : wtp->__state,
 			 wtp == NULL ? -1 : (int)task_cpu(wtp));
 		if (!splatted && wtp) {
 			sched_show_task(wtp);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 59b95cc5cbdf..acb2288063b5 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -460,12 +460,12 @@ static void rcu_check_gp_kthread_starvation(void)
 
 	if (rcu_is_gp_kthread_starving(&j)) {
 		cpu = gpk ? task_cpu(gpk) : -1;
-		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
+		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
 		       rcu_state.name, j,
 		       (long)rcu_seq_current(&rcu_state.gp_seq),
 		       data_race(rcu_state.gp_flags),
 		       gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
-		       gpk ? gpk->state : ~0, cpu);
+		       gpk ? gpk->__state : ~0, cpu);
 		if (gpk) {
 			pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
 			pr_err("RCU grace-period kthread stack dump:\n");
@@ -503,12 +503,12 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
 	    time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
 	    gpk && !READ_ONCE(gpk->on_rq)) {
 		cpu = task_cpu(gpk);
-		pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n",
+		pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
 		       rcu_state.name, (jiffies - jiffies_fqs),
 		       (long)rcu_seq_current(&rcu_state.gp_seq),
 		       data_race(rcu_state.gp_flags),
 		       gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
-		       gpk->state);
+		       gpk->__state);
 		pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
 		       cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
 	}
@@ -735,9 +735,9 @@ void show_rcu_gp_kthreads(void)
 	ja = j - data_race(rcu_state.gp_activity);
 	jr = j - data_race(rcu_state.gp_req_activity);
 	jw = j - data_race(rcu_state.gp_wake_time);
-	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+	pr_info("%s: wait state: %s(%d) ->state: %#x delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
 		rcu_state.name, gp_state_getname(rcu_state.gp_state),
-		rcu_state.gp_state, t ? t->state : 0x1ffffL,
+		rcu_state.gp_state, t ? t->__state : 0x1ffff,
 		ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
 		(long)data_race(rcu_state.gp_seq),
 		(long)data_race(rcu_get_root()->gp_seq_needed),
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 45ebb3cfe86c..309745a7ec51 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2638,7 +2638,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 		return -EINVAL;
 	}
 
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
+	if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
 		/*
 		 * MIGRATE_ENABLE gets here because 'p == current', but for
 		 * anything else we cannot do is_migration_disabled(), punt
@@ -2781,19 +2781,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
+	unsigned int state = READ_ONCE(p->__state);
+
 	/*
 	 * We should never call set_task_cpu() on a blocked task,
 	 * ttwu() will sort out the placement.
 	 */
-	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-			!p->on_rq);
+	WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
 
 	/*
 	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
 	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
 	 * time relying on p->on_rq.
 	 */
-	WARN_ON_ONCE(p->state == TASK_RUNNING &&
+	WARN_ON_ONCE(state == TASK_RUNNING &&
 		     p->sched_class == &fair_sched_class &&
 		     (p->on_rq && !task_on_rq_migrating(p)));
 
@@ -2965,7 +2966,7 @@ out:
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
 {
 	int running, queued;
 	struct rq_flags rf;
@@ -2993,7 +2994,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
-			if (match_state && unlikely(p->state != match_state))
+			if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
 				return 0;
 			cpu_relax();
 		}
@@ -3008,7 +3009,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
-		if (!match_state || p->state == match_state)
+		if (!match_state || READ_ONCE(p->__state) == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
 
@@ -3317,7 +3318,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
 			   struct rq_flags *rf)
 {
 	check_preempt_curr(rq, p, wake_flags);
-	p->state = TASK_RUNNING;
+	WRITE_ONCE(p->__state, TASK_RUNNING);
 	trace_sched_wakeup(p);
 
 #ifdef CONFIG_SMP
@@ -3709,12 +3710,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		 *  - we're serialized against set_special_state() by virtue of
 		 *    it disabling IRQs (this allows not taking ->pi_lock).
 		 */
-		if (!(p->state & state))
+		if (!(READ_ONCE(p->__state) & state))
 			goto out;
 
 		success = 1;
 		trace_sched_waking(p);
-		p->state = TASK_RUNNING;
+		WRITE_ONCE(p->__state, TASK_RUNNING);
 		trace_sched_wakeup(p);
 		goto out;
 	}
@@ -3727,7 +3728,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
-	if (!(p->state & state))
+	if (!(READ_ONCE(p->__state) & state))
 		goto unlock;
 
 	trace_sched_waking(p);
@@ -3793,7 +3794,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * TASK_WAKING such that we can unlock p->pi_lock before doing the
 	 * enqueue, such as ttwu_queue_wakelist().
 	 */
-	p->state = TASK_WAKING;
+	WRITE_ONCE(p->__state, TASK_WAKING);
 
 	/*
 	 * If the owning (remote) CPU is still in the middle of schedule() with
@@ -3886,7 +3887,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t
 			ret = func(p, arg);
 		rq_unlock(rq, &rf);
 	} else {
-		switch (p->state) {
+		switch (READ_ONCE(p->__state)) {
 		case TASK_RUNNING:
 		case TASK_WAKING:
 			break;
@@ -4086,7 +4087,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 * nobody will actually run it, and a signal or other external
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
-	p->state = TASK_NEW;
+	p->__state = TASK_NEW;
 
 	/*
 	 * Make sure we do not leak PI boosting priority to the child.
@@ -4192,7 +4193,7 @@ void wake_up_new_task(struct task_struct *p)
 	struct rq *rq;
 
 	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
-	p->state = TASK_RUNNING;
+	WRITE_ONCE(p->__state, TASK_RUNNING);
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
@@ -4554,7 +4555,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * running on another CPU and we could rave with its RUNNING -> DEAD
 	 * transition, resulting in a double drop.
 	 */
-	prev_state = prev->state;
+	prev_state = READ_ONCE(prev->__state);
 	vtime_task_switch(prev);
 	perf_event_task_sched_in(prev, current);
 	finish_task(prev);
@@ -5248,7 +5249,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
 #endif
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-	if (!preempt && prev->state && prev->non_block_count) {
+	if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
 		printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
 			prev->comm, prev->pid, prev->non_block_count);
 		dump_stack();
@@ -5874,10 +5875,10 @@ static void __sched notrace __schedule(bool preempt)
 	 *  - we form a control dependency vs deactivate_task() below.
 	 *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
 	 */
-	prev_state = prev->state;
+	prev_state = READ_ONCE(prev->__state);
 	if (!preempt && prev_state) {
 		if (signal_pending_state(prev_state, prev)) {
-			prev->state = TASK_RUNNING;
+			WRITE_ONCE(prev->__state, TASK_RUNNING);
 		} else {
 			prev->sched_contributes_to_load =
 				(prev_state & TASK_UNINTERRUPTIBLE) &&
@@ -6049,7 +6050,7 @@ void __sched schedule_idle(void)
 	 * current task can be in any other state. Note, idle is always in the
 	 * TASK_RUNNING state.
 	 */
-	WARN_ON_ONCE(current->state);
+	WARN_ON_ONCE(current->__state);
 	do {
 		__schedule(false);
 	} while (need_resched());
@@ -8176,26 +8177,28 @@ EXPORT_SYMBOL_GPL(sched_show_task);
 static inline bool
 state_filter_match(unsigned long state_filter, struct task_struct *p)
 {
+	unsigned int state = READ_ONCE(p->__state);
+
 	/* no filter, everything matches */
 	if (!state_filter)
 		return true;
 
 	/* filter, but doesn't match */
-	if (!(p->state & state_filter))
+	if (!(state & state_filter))
 		return false;
 
 	/*
 	 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
 	 * TASK_KILLABLE).
 	 */
-	if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+	if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
 		return false;
 
 	return true;
 }
 
 
-void show_state_filter(unsigned long state_filter)
+void show_state_filter(unsigned int state_filter)
 {
 	struct task_struct *g, *p;
 
@@ -8252,7 +8255,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
 	raw_spin_lock_irqsave(&idle->pi_lock, flags);
 	raw_spin_rq_lock(rq);
 
-	idle->state = TASK_RUNNING;
+	idle->__state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 	/*
 	 * PF_KTHREAD should already be set at this point; regardless, make it
@@ -9567,7 +9570,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 		 * has happened. This would lead to problems with PELT, due to
 		 * move wanting to detach+attach while we're not attached yet.
 		 */
-		if (task->state == TASK_NEW)
+		if (READ_ONCE(task->__state) == TASK_NEW)
 			ret = -EINVAL;
 		raw_spin_unlock_irq(&task->pi_lock);
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3829c5a1b936..22878cd5bd70 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -348,10 +348,10 @@ static void task_non_contending(struct task_struct *p)
 	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
 		if (dl_task(p))
 			sub_running_bw(dl_se, dl_rq);
-		if (!dl_task(p) || p->state == TASK_DEAD) {
+		if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
 			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
-			if (p->state == TASK_DEAD)
+			if (READ_ONCE(p->__state) == TASK_DEAD)
 				sub_rq_bw(&p->dl, &rq->dl);
 			raw_spin_lock(&dl_b->lock);
 			__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
@@ -1355,10 +1355,10 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 	sched_clock_tick();
 	update_rq_clock(rq);
 
-	if (!dl_task(p) || p->state == TASK_DEAD) {
+	if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
 		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
-		if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+		if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) {
 			sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
 			sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
 			dl_se->dl_non_contending = 0;
@@ -1722,7 +1722,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
 {
 	struct rq *rq;
 
-	if (p->state != TASK_WAKING)
+	if (READ_ONCE(p->__state) != TASK_WAKING)
 		return;
 
 	rq = task_rq(p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5d1a6aace138..7b8990fd4896 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -993,11 +993,14 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
 		struct task_struct *tsk = task_of(se);
+		unsigned int state;
 
-		if (tsk->state & TASK_INTERRUPTIBLE)
+		/* XXX racy against TTWU */
+		state = READ_ONCE(tsk->__state);
+		if (state & TASK_INTERRUPTIBLE)
 			__schedstat_set(se->statistics.sleep_start,
 				      rq_clock(rq_of(cfs_rq)));
-		if (tsk->state & TASK_UNINTERRUPTIBLE)
+		if (state & TASK_UNINTERRUPTIBLE)
 			__schedstat_set(se->statistics.block_start,
 				      rq_clock(rq_of(cfs_rq)));
 	}
@@ -6888,7 +6891,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 	 * min_vruntime -- the latter is done by enqueue_entity() when placing
 	 * the task on the new runqueue.
 	 */
-	if (p->state == TASK_WAKING) {
+	if (READ_ONCE(p->__state) == TASK_WAKING) {
 		struct sched_entity *se = &p->se;
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 		u64 min_vruntime;
@@ -11053,7 +11056,7 @@ static inline bool vruntime_normalized(struct task_struct *p)
 	 *   waiting for actually being woken up by sched_ttwu_pending().
 	 */
 	if (!se->sum_exec_runtime ||
-	    (p->state == TASK_WAKING && p->sched_remote_wakeup))
+	    (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
 		return true;
 
 	return false;
diff --git a/lib/syscall.c b/lib/syscall.c
index ba13e924c430..006e256d2264 100644
--- a/lib/syscall.c
+++ b/lib/syscall.c
@@ -68,13 +68,13 @@ static int collect_syscall(struct task_struct *target, struct syscall_info *info
  */
 int task_current_syscall(struct task_struct *target, struct syscall_info *info)
 {
-	long state;
 	unsigned long ncsw;
+	unsigned int state;
 
 	if (target == current)
 		return collect_syscall(target, info);
 
-	state = target->state;
+	state = READ_ONCE(target->__state);
 	if (unlikely(!state))
 		return -EAGAIN;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ef8cf7619baf..2512f672bf8a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4363,7 +4363,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 			 * makes sure to proceed with napi polling
 			 * if the thread is explicitly woken from here.
 			 */
-			if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE)
+			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
 				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 			wake_up_process(thread);
 			return;
-- 
cgit v1.2.3


From 52d7e288444906aa5c99888e80a9cc1a1423ed92 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 18 Jun 2021 16:45:22 +0300
Subject: blk-mq: fix an IS_ERR() vs NULL bug

The __blk_mq_alloc_disk() function doesn't return NULLs it returns
error pointers.

Fixes: b461dfc49eb6 ("blk-mq: add the blk_mq_alloc_disk APIs")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/YMyjci35WBqrtqG+@mwanda
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 02a4aab0aeac..fd2de2b422ed 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -431,7 +431,7 @@ enum {
 	static struct lock_class_key __key;				\
 	struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata);	\
 									\
-	if (__disk)							\
+	if (!IS_ERR(__disk))						\
 		lockdep_init_map(&__disk->lockdep_map,			\
 			"(bio completion)", &__key, 0);			\
 	__disk;								\
-- 
cgit v1.2.3


From 60302ce4ea075369641426ef407c110e36ea8ba1 Mon Sep 17 00:00:00 2001
From: Stephan Gerhold <stephan@gerhold.net>
Date: Fri, 18 Jun 2021 19:36:09 +0200
Subject: rpmsg: core: Add driver_data for rpmsg_device_id

Most device_id structs provide a driver_data field that can be used
by drivers to associate data more easily for a particular device ID.
Add the same for the rpmsg_device_id.

Cc: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/rpmsg/rpmsg_core.c      | 4 +++-
 include/linux/mod_devicetable.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c
index e5daee4f9373..c1404d3dae2c 100644
--- a/drivers/rpmsg/rpmsg_core.c
+++ b/drivers/rpmsg/rpmsg_core.c
@@ -459,8 +459,10 @@ static int rpmsg_dev_match(struct device *dev, struct device_driver *drv)
 
 	if (ids)
 		for (i = 0; ids[i].name[0]; i++)
-			if (rpmsg_id_match(rpdev, &ids[i]))
+			if (rpmsg_id_match(rpdev, &ids[i])) {
+				rpdev->id.driver_data = ids[i].driver_data;
 				return 1;
+			}
 
 	return of_driver_match_device(dev, drv);
 }
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 7d45b5f989b0..8e291cfdaf06 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -447,6 +447,7 @@ struct hv_vmbus_device_id {
 
 struct rpmsg_device_id {
 	char name[RPMSG_NAME_SIZE];
+	kernel_ulong_t driver_data;
 };
 
 /* i2c */
-- 
cgit v1.2.3


From 31c143f712750143abaca396236bbe8707700111 Mon Sep 17 00:00:00 2001
From: Stephan Gerhold <stephan@gerhold.net>
Date: Fri, 18 Jun 2021 19:36:11 +0200
Subject: net: wwan: Allow WWAN drivers to provide blocking tx and poll
 function

At the moment, the WWAN core provides wwan_port_txon/off() to implement
blocking writes. The tx() port operation should not block, instead
wwan_port_txon/off() should be called when the TX queue is full or has
free space again.

However, in some cases it is not straightforward to make use of that
functionality. For example, the RPMSG API used by rpmsg_wwan_ctrl.c
does not provide any way to be notified when the TX queue has space
again. Instead, it only provides the following operations:

  - rpmsg_send(): blocking write (wait until there is space)
  - rpmsg_trysend(): non-blocking write (return error if no space)
  - rpmsg_poll(): set poll flags depending on TX queue state

Generally that's totally sufficient for implementing a char device,
but it does not fit well to the currently provided WWAN port ops.

Most of the time, using the non-blocking rpmsg_trysend() in the
WWAN tx() port operation works just fine. However, with high-frequent
writes to the char device it is possible to trigger a situation
where this causes issues. For example, consider the following
(somewhat unrealistic) example:

 # dd if=/dev/zero bs=1000 of=/dev/wwan0qmi0
 dd: error writing '/dev/wwan0qmi0': Resource temporarily unavailable
 1+0 records out

This fails immediately after writing the first record. It's likely
only a matter of time until this triggers issues for some real application
(e.g. ModemManager sending a lot of large QMI packets).

The rpmsg_char device does not have this problem, because it uses
rpmsg_trysend() and rpmsg_poll() to support non-blocking operations.
Make it possible to use the same in the RPMSG WWAN driver by adding
two new optional wwan_port_ops:

  - tx_blocking(): send data blocking if allowed
  - tx_poll(): set additional TX poll flags

This integrates nicely with the RPMSG API and does not require
any change in existing WWAN drivers.

With these changes, the dd example above blocks instead of exiting
with an error.

Cc: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/rpmsg_wwan_ctrl.c | 23 +++++++++++++++++++++++
 drivers/net/wwan/wwan_core.c       | 16 ++++++++++++----
 include/linux/wwan.h               | 13 +++++++++++--
 3 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wwan/rpmsg_wwan_ctrl.c b/drivers/net/wwan/rpmsg_wwan_ctrl.c
index de226cdb69fd..31c24420ab2e 100644
--- a/drivers/net/wwan/rpmsg_wwan_ctrl.c
+++ b/drivers/net/wwan/rpmsg_wwan_ctrl.c
@@ -67,10 +67,33 @@ static int rpmsg_wwan_ctrl_tx(struct wwan_port *port, struct sk_buff *skb)
 	return 0;
 }
 
+static int rpmsg_wwan_ctrl_tx_blocking(struct wwan_port *port, struct sk_buff *skb)
+{
+	struct rpmsg_wwan_dev *rpwwan = wwan_port_get_drvdata(port);
+	int ret;
+
+	ret = rpmsg_send(rpwwan->ept, skb->data, skb->len);
+	if (ret)
+		return ret;
+
+	consume_skb(skb);
+	return 0;
+}
+
+static __poll_t rpmsg_wwan_ctrl_tx_poll(struct wwan_port *port,
+					struct file *filp, poll_table *wait)
+{
+	struct rpmsg_wwan_dev *rpwwan = wwan_port_get_drvdata(port);
+
+	return rpmsg_poll(rpwwan->ept, filp, wait);
+}
+
 static const struct wwan_port_ops rpmsg_wwan_pops = {
 	.start = rpmsg_wwan_ctrl_start,
 	.stop = rpmsg_wwan_ctrl_stop,
 	.tx = rpmsg_wwan_ctrl_tx,
+	.tx_blocking = rpmsg_wwan_ctrl_tx_blocking,
+	.tx_poll = rpmsg_wwan_ctrl_tx_poll,
 };
 
 static struct device *rpmsg_wwan_find_parent(struct device *dev)
diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index 7e728042fc41..165afec1dbd1 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -500,7 +500,8 @@ static void wwan_port_op_stop(struct wwan_port *port)
 	mutex_unlock(&port->ops_lock);
 }
 
-static int wwan_port_op_tx(struct wwan_port *port, struct sk_buff *skb)
+static int wwan_port_op_tx(struct wwan_port *port, struct sk_buff *skb,
+			   bool nonblock)
 {
 	int ret;
 
@@ -510,7 +511,10 @@ static int wwan_port_op_tx(struct wwan_port *port, struct sk_buff *skb)
 		goto out_unlock;
 	}
 
-	ret = port->ops->tx(port, skb);
+	if (nonblock || !port->ops->tx_blocking)
+		ret = port->ops->tx(port, skb);
+	else
+		ret = port->ops->tx_blocking(port, skb);
 
 out_unlock:
 	mutex_unlock(&port->ops_lock);
@@ -637,7 +641,7 @@ static ssize_t wwan_port_fops_write(struct file *filp, const char __user *buf,
 		return -EFAULT;
 	}
 
-	ret = wwan_port_op_tx(port, skb);
+	ret = wwan_port_op_tx(port, skb, !!(filp->f_flags & O_NONBLOCK));
 	if (ret) {
 		kfree_skb(skb);
 		return ret;
@@ -653,12 +657,16 @@ static __poll_t wwan_port_fops_poll(struct file *filp, poll_table *wait)
 
 	poll_wait(filp, &port->waitqueue, wait);
 
-	if (!is_write_blocked(port))
+	mutex_lock(&port->ops_lock);
+	if (port->ops && port->ops->tx_poll)
+		mask |= port->ops->tx_poll(port, filp, wait);
+	else if (!is_write_blocked(port))
 		mask |= EPOLLOUT | EPOLLWRNORM;
 	if (!is_read_blocked(port))
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (!port->ops)
 		mask |= EPOLLHUP | EPOLLERR;
+	mutex_unlock(&port->ops_lock);
 
 	return mask;
 }
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 430a3a0817de..34222230360c 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -6,6 +6,7 @@
 
 #include <linux/device.h>
 #include <linux/kernel.h>
+#include <linux/poll.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 
@@ -40,15 +41,23 @@ struct wwan_port;
 /** struct wwan_port_ops - The WWAN port operations
  * @start: The routine for starting the WWAN port device.
  * @stop: The routine for stopping the WWAN port device.
- * @tx: The routine that sends WWAN port protocol data to the device.
+ * @tx: Non-blocking routine that sends WWAN port protocol data to the device.
+ * @tx_blocking: Optional blocking routine that sends WWAN port protocol data
+ *               to the device.
+ * @tx_poll: Optional routine that sets additional TX poll flags.
  *
  * The wwan_port_ops structure contains a list of low-level operations
- * that control a WWAN port device. All functions are mandatory.
+ * that control a WWAN port device. All functions are mandatory unless specified.
  */
 struct wwan_port_ops {
 	int (*start)(struct wwan_port *port);
 	void (*stop)(struct wwan_port *port);
 	int (*tx)(struct wwan_port *port, struct sk_buff *skb);
+
+	/* Optional operations */
+	int (*tx_blocking)(struct wwan_port *port, struct sk_buff *skb);
+	__poll_t (*tx_poll)(struct wwan_port *port, struct file *filp,
+			    poll_table *wait);
 };
 
 /**
-- 
cgit v1.2.3


From 031e668bc1ad7ccdbfb2b67b838bb6b7cc44ecf3 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Wed, 16 Jun 2021 15:59:01 +0100
Subject: soundwire: bus: Make sdw_nwrite() data pointer argument const

Idiomatically, write functions should take const pointers to the
data buffer, as they don't change the data. They are also likely
to be called from functions that receive a const data pointer.

Internally the pointer is passed to function/structs shared with
the read functions, requiring a cast, but this is an implementation
detail that should be hidden by the public API.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20210616145901.29402-1-rf@opensource.cirrus.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/bus.c       | 8 ++++----
 include/linux/soundwire/sdw.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index 85bcf60f9697..adcbf3969110 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -394,13 +394,13 @@ sdw_nread_no_pm(struct sdw_slave *slave, u32 addr, size_t count, u8 *val)
 }
 
 static int
-sdw_nwrite_no_pm(struct sdw_slave *slave, u32 addr, size_t count, u8 *val)
+sdw_nwrite_no_pm(struct sdw_slave *slave, u32 addr, size_t count, const u8 *val)
 {
 	struct sdw_msg msg;
 	int ret;
 
 	ret = sdw_fill_msg(&msg, slave, addr, count,
-			   slave->dev_num, SDW_MSG_FLAG_WRITE, val);
+			   slave->dev_num, SDW_MSG_FLAG_WRITE, (u8 *)val);
 	if (ret < 0)
 		return ret;
 
@@ -535,9 +535,9 @@ EXPORT_SYMBOL(sdw_nread);
  * @slave: SDW Slave
  * @addr: Register address
  * @count: length
- * @val: Buffer for values to be read
+ * @val: Buffer for values to be written
  */
-int sdw_nwrite(struct sdw_slave *slave, u32 addr, size_t count, u8 *val)
+int sdw_nwrite(struct sdw_slave *slave, u32 addr, size_t count, const u8 *val)
 {
 	int ret;
 
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 8ca736e92d5a..ddbeb00799e4 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -1039,7 +1039,7 @@ int sdw_write(struct sdw_slave *slave, u32 addr, u8 value);
 int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value);
 int sdw_read_no_pm(struct sdw_slave *slave, u32 addr);
 int sdw_nread(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
-int sdw_nwrite(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
+int sdw_nwrite(struct sdw_slave *slave, u32 addr, size_t count, const u8 *val);
 int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id);
 void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id);
 
-- 
cgit v1.2.3


From 0475c3655e6ebd1d6d6f0e705eba97fce39a08e3 Mon Sep 17 00:00:00 2001
From: Hyunchul Lee <hyc.lee@gmail.com>
Date: Tue, 8 Jun 2021 23:53:14 +0900
Subject: cifs: decoding negTokenInit with generic ASN1 decoder

Decode negTokenInit with lib/asn1_decoder. For that,
add OIDs in linux/oid_registry.h and a negTokenInit
ASN1 file, "spnego_negtokeninit.asn1".
And define decoder's callback functions, which
are the gssapi_this_mech for checking SPENGO oid and
the neg_token_init_mech_type for getting authentication
mechanisms supported by a server.

Signed-off-by: Hyunchul Lee <hyc.lee@gmail.com>
Reviewed-by: Aurelien Aptel <aaptel@suse.com>
Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/cifs/Kconfig                       |   2 +
 fs/cifs/Makefile                      |   8 +-
 fs/cifs/asn1.c                        | 623 ++--------------------------------
 fs/cifs/cifs_spnego_negtokeninit.asn1 |  40 +++
 include/linux/oid_registry.h          |   8 +
 5 files changed, 93 insertions(+), 588 deletions(-)
 create mode 100644 fs/cifs/cifs_spnego_negtokeninit.asn1

(limited to 'include/linux')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 55bc57e9f812..7364950a9ef4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -19,6 +19,8 @@ config CIFS
 	select CRYPTO_LIB_DES
 	select KEYS
 	select DNS_RESOLVER
+	select ASN1
+	select OID_REGISTRY
 	help
 	  This is the client VFS module for the SMB3 family of NAS protocols,
 	  (including support for the most recent, most secure dialect SMB3.1.1)
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 3ee3b7de4ded..87fcacdf3de7 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,12 +6,16 @@ ccflags-y += -I$(src)		# needed for trace events
 obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
-	  inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
+	  inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \
 	  cifs_unicode.o nterr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \
 	  smb2ops.o smb2maperror.o smb2transport.o \
 	  smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
-	  dns_resolve.o
+	  dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o
+
+$(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h
+
+$(obj)/cifs_spnego_negtokeninit.asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.c $(obj)/cifs_spnego_negtokeninit.asn1.h
 
 cifs-$(CONFIG_CIFS_XATTR) += xattr.o
 
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 3150c19cdc2f..b5724ef9f182 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -1,612 +1,63 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The ASB.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in
- * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich
- *
- * Copyright (c) 2000 RP Internet (www.rpi.net.au).
- */
 
 #include <linux/module.h>
-#include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include "cifspdu.h"
+#include <linux/oid_registry.h>
 #include "cifsglob.h"
 #include "cifs_debug.h"
 #include "cifsproto.h"
+#include "cifs_spnego_negtokeninit.asn1.h"
 
-/*****************************************************************************
- *
- * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
- *
- *****************************************************************************/
-
-/* Class */
-#define ASN1_UNI	0	/* Universal */
-#define ASN1_APL	1	/* Application */
-#define ASN1_CTX	2	/* Context */
-#define ASN1_PRV	3	/* Private */
-
-/* Tag */
-#define ASN1_EOC	0	/* End Of Contents or N/A */
-#define ASN1_BOL	1	/* Boolean */
-#define ASN1_INT	2	/* Integer */
-#define ASN1_BTS	3	/* Bit String */
-#define ASN1_OTS	4	/* Octet String */
-#define ASN1_NUL	5	/* Null */
-#define ASN1_OJI	6	/* Object Identifier  */
-#define ASN1_OJD	7	/* Object Description */
-#define ASN1_EXT	8	/* External */
-#define ASN1_ENUM	10	/* Enumerated */
-#define ASN1_SEQ	16	/* Sequence */
-#define ASN1_SET	17	/* Set */
-#define ASN1_NUMSTR	18	/* Numerical String */
-#define ASN1_PRNSTR	19	/* Printable String */
-#define ASN1_TEXSTR	20	/* Teletext String */
-#define ASN1_VIDSTR	21	/* Video String */
-#define ASN1_IA5STR	22	/* IA5 String */
-#define ASN1_UNITIM	23	/* Universal Time */
-#define ASN1_GENTIM	24	/* General Time */
-#define ASN1_GRASTR	25	/* Graphical String */
-#define ASN1_VISSTR	26	/* Visible String */
-#define ASN1_GENSTR	27	/* General String */
-
-/* Primitive / Constructed methods*/
-#define ASN1_PRI	0	/* Primitive */
-#define ASN1_CON	1	/* Constructed */
-
-/*
- * Error codes.
- */
-#define ASN1_ERR_NOERROR		0
-#define ASN1_ERR_DEC_EMPTY		2
-#define ASN1_ERR_DEC_EOC_MISMATCH	3
-#define ASN1_ERR_DEC_LENGTH_MISMATCH	4
-#define ASN1_ERR_DEC_BADVALUE		5
-
-#define SPNEGO_OID_LEN 7
-#define NTLMSSP_OID_LEN  10
-#define KRB5_OID_LEN  7
-#define KRB5U2U_OID_LEN  8
-#define MSKRB5_OID_LEN  7
-static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
-static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
-static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
-static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 };
-static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
-
-/*
- * ASN.1 context.
- */
-struct asn1_ctx {
-	int error;		/* Error condition */
-	unsigned char *pointer;	/* Octet just to be decoded */
-	unsigned char *begin;	/* First octet */
-	unsigned char *end;	/* Octet after last octet */
-};
-
-/*
- * Octet string (not null terminated)
- */
-struct asn1_octstr {
-	unsigned char *data;
-	unsigned int len;
-};
-
-static void
-asn1_open(struct asn1_ctx *ctx, unsigned char *buf, unsigned int len)
-{
-	ctx->begin = buf;
-	ctx->end = buf + len;
-	ctx->pointer = buf;
-	ctx->error = ASN1_ERR_NOERROR;
-}
-
-static unsigned char
-asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
-{
-	if (ctx->pointer >= ctx->end) {
-		ctx->error = ASN1_ERR_DEC_EMPTY;
-		return 0;
-	}
-	*ch = *(ctx->pointer)++;
-	return 1;
-}
-
-#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */
-static unsigned char
-asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
-{
-	unsigned char ch;
-
-	if (ctx->pointer >= ctx->end) {
-		ctx->error = ASN1_ERR_DEC_EMPTY;
-		return 0;
-	}
-
-	ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
-	if ((ch) == ASN1_ENUM)  /* if ch value is ENUM, 0xa */
-		*val = *(++(ctx->pointer)); /* value has enum value */
-	else
-		return 0;
-
-	ctx->pointer++;
-	return 1;
-}
-#endif
-
-static unsigned char
-asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
-{
-	unsigned char ch;
-
-	*tag = 0;
-
-	do {
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-		*tag <<= 7;
-		*tag |= ch & 0x7F;
-	} while ((ch & 0x80) == 0x80);
-	return 1;
-}
-
-static unsigned char
-asn1_id_decode(struct asn1_ctx *ctx,
-	       unsigned int *cls, unsigned int *con, unsigned int *tag)
-{
-	unsigned char ch;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	*cls = (ch & 0xC0) >> 6;
-	*con = (ch & 0x20) >> 5;
-	*tag = (ch & 0x1F);
-
-	if (*tag == 0x1F) {
-		if (!asn1_tag_decode(ctx, tag))
-			return 0;
-	}
-	return 1;
-}
-
-static unsigned char
-asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len)
-{
-	unsigned char ch, cnt;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	if (ch == 0x80)
-		*def = 0;
-	else {
-		*def = 1;
-
-		if (ch < 0x80)
-			*len = ch;
-		else {
-			cnt = (unsigned char) (ch & 0x7F);
-			*len = 0;
-
-			while (cnt > 0) {
-				if (!asn1_octet_decode(ctx, &ch))
-					return 0;
-				*len <<= 8;
-				*len |= ch;
-				cnt--;
-			}
-		}
-	}
-
-	/* don't trust len bigger than ctx buffer */
-	if (*len > ctx->end - ctx->pointer)
-		return 0;
-
-	return 1;
-}
-
-static unsigned char
-asn1_header_decode(struct asn1_ctx *ctx,
-		   unsigned char **eoc,
-		   unsigned int *cls, unsigned int *con, unsigned int *tag)
-{
-	unsigned int def = 0;
-	unsigned int len = 0;
-
-	if (!asn1_id_decode(ctx, cls, con, tag))
-		return 0;
-
-	if (!asn1_length_decode(ctx, &def, &len))
-		return 0;
-
-	/* primitive shall be definite, indefinite shall be constructed */
-	if (*con == ASN1_PRI && !def)
-		return 0;
-
-	if (def)
-		*eoc = ctx->pointer + len;
-	else
-		*eoc = NULL;
-	return 1;
-}
-
-static unsigned char
-asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+int
+decode_negTokenInit(unsigned char *security_blob, int length,
+		    struct TCP_Server_Info *server)
 {
-	unsigned char ch;
-
-	if (eoc == NULL) {
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		if (ch != 0x00) {
-			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
-			return 0;
-		}
-
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		if (ch != 0x00) {
-			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
-			return 0;
-		}
-		return 1;
-	} else {
-		if (ctx->pointer != eoc) {
-			ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
-			return 0;
-		}
+	if (asn1_ber_decoder(&cifs_spnego_negtokeninit_decoder, server,
+			     security_blob, length) == 0)
 		return 1;
-	}
-}
-
-/* static unsigned char asn1_null_decode(struct asn1_ctx *ctx,
-				      unsigned char *eoc)
-{
-	ctx->pointer = eoc;
-	return 1;
-}
-
-static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
-				      unsigned char *eoc, long *integer)
-{
-	unsigned char ch;
-	unsigned int len;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	*integer = (signed char) ch;
-	len = 1;
-
-	while (ctx->pointer < eoc) {
-		if (++len > sizeof(long)) {
-			ctx->error = ASN1_ERR_DEC_BADVALUE;
-			return 0;
-		}
-
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		*integer <<= 8;
-		*integer |= ch;
-	}
-	return 1;
-}
-
-static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
-				      unsigned char *eoc,
-				      unsigned int *integer)
-{
-	unsigned char ch;
-	unsigned int len;
-
-	if (!asn1_octet_decode(ctx, &ch))
-		return 0;
-
-	*integer = ch;
-	if (ch == 0)
-		len = 0;
 	else
-		len = 1;
-
-	while (ctx->pointer < eoc) {
-		if (++len > sizeof(unsigned int)) {
-			ctx->error = ASN1_ERR_DEC_BADVALUE;
-			return 0;
-		}
-
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		*integer <<= 8;
-		*integer |= ch;
-	}
-	return 1;
-}
-
-static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
-				       unsigned char *eoc,
-				       unsigned long *integer)
-{
-	unsigned char ch;
-	unsigned int len;
-
-	if (!asn1_octet_decode(ctx, &ch))
 		return 0;
-
-	*integer = ch;
-	if (ch == 0)
-		len = 0;
-	else
-		len = 1;
-
-	while (ctx->pointer < eoc) {
-		if (++len > sizeof(unsigned long)) {
-			ctx->error = ASN1_ERR_DEC_BADVALUE;
-			return 0;
-		}
-
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		*integer <<= 8;
-		*integer |= ch;
-	}
-	return 1;
 }
 
-static unsigned char
-asn1_octets_decode(struct asn1_ctx *ctx,
-		   unsigned char *eoc,
-		   unsigned char **octets, unsigned int *len)
+int cifs_gssapi_this_mech(void *context, size_t hdrlen,
+			  unsigned char tag, const void *value, size_t vlen)
 {
-	unsigned char *ptr;
-
-	*len = 0;
-
-	*octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
-	if (*octets == NULL) {
-		return 0;
-	}
-
-	ptr = *octets;
-	while (ctx->pointer < eoc) {
-		if (!asn1_octet_decode(ctx, (unsigned char *) ptr++)) {
-			kfree(*octets);
-			*octets = NULL;
-			return 0;
-		}
-		(*len)++;
-	}
-	return 1;
-} */
-
-static unsigned char
-asn1_subid_decode(struct asn1_ctx *ctx, unsigned long *subid)
-{
-	unsigned char ch;
-
-	*subid = 0;
-
-	do {
-		if (!asn1_octet_decode(ctx, &ch))
-			return 0;
-
-		*subid <<= 7;
-		*subid |= ch & 0x7F;
-	} while ((ch & 0x80) == 0x80);
-	return 1;
-}
-
-static int
-asn1_oid_decode(struct asn1_ctx *ctx,
-		unsigned char *eoc, unsigned long **oid, unsigned int *len)
-{
-	unsigned long subid;
-	unsigned int size;
-	unsigned long *optr;
-
-	size = eoc - ctx->pointer + 1;
-
-	/* first subid actually encodes first two subids */
-	if (size < 2 || size > UINT_MAX/sizeof(unsigned long))
-		return 0;
-
-	*oid = kmalloc_array(size, sizeof(unsigned long), GFP_ATOMIC);
-	if (*oid == NULL)
-		return 0;
-
-	optr = *oid;
-
-	if (!asn1_subid_decode(ctx, &subid)) {
-		kfree(*oid);
-		*oid = NULL;
-		return 0;
-	}
-
-	if (subid < 40) {
-		optr[0] = 0;
-		optr[1] = subid;
-	} else if (subid < 80) {
-		optr[0] = 1;
-		optr[1] = subid - 40;
-	} else {
-		optr[0] = 2;
-		optr[1] = subid - 80;
-	}
-
-	*len = 2;
-	optr += 2;
+	enum OID oid;
 
-	while (ctx->pointer < eoc) {
-		if (++(*len) > size) {
-			ctx->error = ASN1_ERR_DEC_BADVALUE;
-			kfree(*oid);
-			*oid = NULL;
-			return 0;
-		}
+	oid = look_up_OID(value, vlen);
+	if (oid != OID_spnego) {
+		char buf[50];
 
-		if (!asn1_subid_decode(ctx, optr++)) {
-			kfree(*oid);
-			*oid = NULL;
-			return 0;
-		}
+		sprint_oid(value, vlen, buf, sizeof(buf));
+		cifs_dbg(FYI, "Error decoding negTokenInit header: unexpected OID %s\n",
+			 buf);
+		return -EBADMSG;
 	}
-	return 1;
+	return 0;
 }
 
-static int
-compare_oid(unsigned long *oid1, unsigned int oid1len,
-	    unsigned long *oid2, unsigned int oid2len)
+int cifs_neg_token_init_mech_type(void *context, size_t hdrlen,
+				  unsigned char tag,
+				  const void *value, size_t vlen)
 {
-	unsigned int i;
+	struct TCP_Server_Info *server = context;
+	enum OID oid;
 
-	if (oid1len != oid2len)
-		return 0;
+	oid = look_up_OID(value, vlen);
+	if (oid == OID_mskrb5)
+		server->sec_mskerberos = true;
+	else if (oid == OID_krb5u2u)
+		server->sec_kerberosu2u = true;
+	else if (oid == OID_krb5)
+		server->sec_kerberos = true;
+	else if (oid == OID_ntlmssp)
+		server->sec_ntlmssp = true;
 	else {
-		for (i = 0; i < oid1len; i++) {
-			if (oid1[i] != oid2[i])
-				return 0;
-		}
-		return 1;
-	}
-}
-
-	/* BB check for endian conversion issues here */
-
-int
-decode_negTokenInit(unsigned char *security_blob, int length,
-		    struct TCP_Server_Info *server)
-{
-	struct asn1_ctx ctx;
-	unsigned char *end;
-	unsigned char *sequence_end;
-	unsigned long *oid = NULL;
-	unsigned int cls, con, tag, oidlen, rc;
-
-	/* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
-
-	asn1_open(&ctx, security_blob, length);
+		char buf[50];
 
-	/* GSSAPI header */
-	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cifs_dbg(FYI, "Error decoding negTokenInit header\n");
-		return 0;
-	} else if ((cls != ASN1_APL) || (con != ASN1_CON)
-		   || (tag != ASN1_EOC)) {
-		cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag);
-		return 0;
+		sprint_oid(value, vlen, buf, sizeof(buf));
+		cifs_dbg(FYI, "Decoding negTokenInit: unsupported OID %s\n",
+			 buf);
 	}
-
-	/* Check for SPNEGO OID -- remember to free obj->oid */
-	rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
-	if (rc) {
-		if ((tag == ASN1_OJI) && (con == ASN1_PRI) &&
-		    (cls == ASN1_UNI)) {
-			rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
-			if (rc) {
-				rc = compare_oid(oid, oidlen, SPNEGO_OID,
-						 SPNEGO_OID_LEN);
-				kfree(oid);
-			}
-		} else
-			rc = 0;
-	}
-
-	/* SPNEGO OID not present or garbled -- bail out */
-	if (!rc) {
-		cifs_dbg(FYI, "Error decoding negTokenInit header\n");
-		return 0;
-	}
-
-	/* SPNEGO */
-	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cifs_dbg(FYI, "Error decoding negTokenInit\n");
-		return 0;
-	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
-		   || (tag != ASN1_EOC)) {
-		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n",
-			 cls, con, tag, end);
-		return 0;
-	}
-
-	/* negTokenInit */
-	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cifs_dbg(FYI, "Error decoding negTokenInit\n");
-		return 0;
-	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
-		   || (tag != ASN1_SEQ)) {
-		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n",
-			 cls, con, tag, end);
-		return 0;
-	}
-
-	/* sequence */
-	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
-		return 0;
-	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
-		   || (tag != ASN1_EOC)) {
-		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n",
-			 cls, con, tag, end);
-		return 0;
-	}
-
-	/* sequence of */
-	if (asn1_header_decode
-	    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
-		cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
-		return 0;
-	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
-		   || (tag != ASN1_SEQ)) {
-		cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n",
-			 cls, con, tag, sequence_end);
-		return 0;
-	}
-
-	/* list of security mechanisms */
-	while (!asn1_eoc_decode(&ctx, sequence_end)) {
-		rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
-		if (!rc) {
-			cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n");
-			return 0;
-		}
-		if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
-			if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
-
-				cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n",
-					 oidlen, *oid, *(oid + 1), *(oid + 2),
-					 *(oid + 3));
-
-				if (compare_oid(oid, oidlen, MSKRB5_OID,
-						MSKRB5_OID_LEN))
-					server->sec_mskerberos = true;
-				else if (compare_oid(oid, oidlen, KRB5U2U_OID,
-						     KRB5U2U_OID_LEN))
-					server->sec_kerberosu2u = true;
-				else if (compare_oid(oid, oidlen, KRB5_OID,
-						     KRB5_OID_LEN))
-					server->sec_kerberos = true;
-				else if (compare_oid(oid, oidlen, NTLMSSP_OID,
-						     NTLMSSP_OID_LEN))
-					server->sec_ntlmssp = true;
-
-				kfree(oid);
-			}
-		} else {
-			cifs_dbg(FYI, "Should be an oid what is going on?\n");
-		}
-	}
-
-	/*
-	 * We currently ignore anything at the end of the SPNEGO blob after
-	 * the mechTypes have been parsed, since none of that info is
-	 * used at the moment.
-	 */
-	return 1;
+	return 0;
 }
diff --git a/fs/cifs/cifs_spnego_negtokeninit.asn1 b/fs/cifs/cifs_spnego_negtokeninit.asn1
new file mode 100644
index 000000000000..181c083887d5
--- /dev/null
+++ b/fs/cifs/cifs_spnego_negtokeninit.asn1
@@ -0,0 +1,40 @@
+GSSAPI ::=
+	[APPLICATION 0] IMPLICIT SEQUENCE {
+		thisMech
+			OBJECT IDENTIFIER ({cifs_gssapi_this_mech}),
+		negotiationToken
+			NegotiationToken
+	}
+
+MechType ::= OBJECT IDENTIFIER ({cifs_neg_token_init_mech_type})
+
+MechTypeList ::= SEQUENCE OF MechType
+
+NegHints ::= SEQUENCE {
+	hintName
+		[0] GeneralString OPTIONAL,
+	hintAddress
+		[1] OCTET STRING OPTIONAL
+	}
+
+NegTokenInit2 ::=
+	SEQUENCE {
+		mechTypes
+			[0] MechTypeList OPTIONAL,
+		reqFlags
+			[1] BIT STRING OPTIONAL,
+		mechToken
+			[2] OCTET STRING OPTIONAL,
+		negHints
+			[3] NegHints OPTIONAL,
+		mechListMIC
+			[3] OCTET STRING OPTIONAL
+	}
+
+NegotiationToken ::=
+	CHOICE {
+		negTokenInit
+			[0] NegTokenInit2,
+		negTokenTarg
+			[1] ANY
+	}
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 461b7aa587ba..3d8db1f6a5db 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -54,6 +54,10 @@ enum OID {
 	OID_md4,			/* 1.2.840.113549.2.4 */
 	OID_md5,			/* 1.2.840.113549.2.5 */
 
+	OID_mskrb5,			/* 1.2.840.48018.1.2.2 */
+	OID_krb5,			/* 1.2.840.113554.1.2.2 */
+	OID_krb5u2u,			/* 1.2.840.113554.1.2.2.3 */
+
 	/* Microsoft Authenticode & Software Publishing */
 	OID_msIndirectData,		/* 1.3.6.1.4.1.311.2.1.4 */
 	OID_msStatementType,		/* 1.3.6.1.4.1.311.2.1.11 */
@@ -62,6 +66,10 @@ enum OID {
 	OID_msIndividualSPKeyPurpose,	/* 1.3.6.1.4.1.311.2.1.21 */
 	OID_msOutlookExpress,		/* 1.3.6.1.4.1.311.16.4 */
 
+	OID_ntlmssp,			/* 1.3.6.1.4.1.311.2.2.10 */
+
+	OID_spnego,			/* 1.3.6.1.5.5.2 */
+
 	OID_certAuthInfoAccess,		/* 1.3.6.1.5.5.7.1.1 */
 	OID_sha1,			/* 1.3.14.3.2.26 */
 	OID_id_ansip384r1,		/* 1.3.132.0.34 */
-- 
cgit v1.2.3


From bc65baf73b68448e79e8ff797522d1976788deb1 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Thu, 18 Mar 2021 17:25:06 +0000
Subject: watchdog: Remove MV64x60 watchdog driver

Commit 92c8c16f3457 ("powerpc/embedded6xx: Remove C2K board support")
removed the last selector of CONFIG_MV64X60.

Therefore CONFIG_MV64X60_WDT cannot be selected anymore and
can be removed.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/9c2952bcfaec3b1789909eaa36bbce2afbfab7ab.1616085654.git.christophe.leroy@csgroup.eu
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 drivers/watchdog/Kconfig       |   4 -
 drivers/watchdog/Makefile      |   1 -
 drivers/watchdog/mv64x60_wdt.c | 324 -----------------------------------------
 include/linux/mv643xx.h        |   8 -
 4 files changed, 337 deletions(-)
 delete mode 100644 drivers/watchdog/mv64x60_wdt.c

(limited to 'include/linux')

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 433ca1ffe01c..fcc61c948f01 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1849,10 +1849,6 @@ config 8xxx_WDT
 
 	  For BookE processors (MPC85xx) use the BOOKE_WDT driver instead.
 
-config MV64X60_WDT
-	tristate "MV64X60 (Marvell Discovery) Watchdog Timer"
-	depends on MV64X60 || COMPILE_TEST
-
 config PIKA_WDT
 	tristate "PIKA FPGA Watchdog"
 	depends on WARP || (PPC64 && COMPILE_TEST)
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index a7eade8b4d45..578bd6e5fbd0 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -175,7 +175,6 @@ obj-$(CONFIG_PIC32_DMT) += pic32-dmt.o
 # POWERPC Architecture
 obj-$(CONFIG_GEF_WDT) += gef_wdt.o
 obj-$(CONFIG_8xxx_WDT) += mpc8xxx_wdt.o
-obj-$(CONFIG_MV64X60_WDT) += mv64x60_wdt.o
 obj-$(CONFIG_PIKA_WDT) += pika_wdt.o
 obj-$(CONFIG_BOOKE_WDT) += booke_wdt.o
 obj-$(CONFIG_MEN_A21_WDT) += mena21_wdt.o
diff --git a/drivers/watchdog/mv64x60_wdt.c b/drivers/watchdog/mv64x60_wdt.c
deleted file mode 100644
index 894aa63488d3..000000000000
--- a/drivers/watchdog/mv64x60_wdt.c
+++ /dev/null
@@ -1,324 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * mv64x60_wdt.c - MV64X60 (Marvell Discovery) watchdog userspace interface
- *
- * Author: James Chapman <jchapman@katalix.com>
- *
- * Platform-specific setup code should configure the dog to generate
- * interrupt or reset as required.  This code only enables/disables
- * and services the watchdog.
- *
- * Derived from mpc8xx_wdt.c, with the following copyright.
- *
- * 2002 (c) Florian Schirmer <jolt@tuxbox.org>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/miscdevice.h>
-#include <linux/module.h>
-#include <linux/watchdog.h>
-#include <linux/platform_device.h>
-#include <linux/mv643xx.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-
-#define MV64x60_WDT_WDC_OFFSET	0
-
-/*
- * The watchdog configuration register contains a pair of 2-bit fields,
- *   1.  a reload field, bits 27-26, which triggers a reload of
- *       the countdown register, and
- *   2.  an enable field, bits 25-24, which toggles between
- *       enabling and disabling the watchdog timer.
- * Bit 31 is a read-only field which indicates whether the
- * watchdog timer is currently enabled.
- *
- * The low 24 bits contain the timer reload value.
- */
-#define MV64x60_WDC_ENABLE_SHIFT	24
-#define MV64x60_WDC_SERVICE_SHIFT	26
-#define MV64x60_WDC_ENABLED_SHIFT	31
-
-#define MV64x60_WDC_ENABLED_TRUE	1
-#define MV64x60_WDC_ENABLED_FALSE	0
-
-/* Flags bits */
-#define MV64x60_WDOG_FLAG_OPENED	0
-
-static unsigned long wdt_flags;
-static int wdt_status;
-static void __iomem *mv64x60_wdt_regs;
-static int mv64x60_wdt_timeout;
-static int mv64x60_wdt_count;
-static unsigned int bus_clk;
-static char expect_close;
-static DEFINE_SPINLOCK(mv64x60_wdt_spinlock);
-
-static bool nowayout = WATCHDOG_NOWAYOUT;
-module_param(nowayout, bool, 0);
-MODULE_PARM_DESC(nowayout,
-		"Watchdog cannot be stopped once started (default="
-				__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
-
-static int mv64x60_wdt_toggle_wdc(int enabled_predicate, int field_shift)
-{
-	u32 data;
-	u32 enabled;
-	int ret = 0;
-
-	spin_lock(&mv64x60_wdt_spinlock);
-	data = readl(mv64x60_wdt_regs + MV64x60_WDT_WDC_OFFSET);
-	enabled = (data >> MV64x60_WDC_ENABLED_SHIFT) & 1;
-
-	/* only toggle the requested field if enabled state matches predicate */
-	if ((enabled ^ enabled_predicate) == 0) {
-		/* We write a 1, then a 2 -- to the appropriate field */
-		data = (1 << field_shift) | mv64x60_wdt_count;
-		writel(data, mv64x60_wdt_regs + MV64x60_WDT_WDC_OFFSET);
-
-		data = (2 << field_shift) | mv64x60_wdt_count;
-		writel(data, mv64x60_wdt_regs + MV64x60_WDT_WDC_OFFSET);
-		ret = 1;
-	}
-	spin_unlock(&mv64x60_wdt_spinlock);
-
-	return ret;
-}
-
-static void mv64x60_wdt_service(void)
-{
-	mv64x60_wdt_toggle_wdc(MV64x60_WDC_ENABLED_TRUE,
-			       MV64x60_WDC_SERVICE_SHIFT);
-}
-
-static void mv64x60_wdt_handler_enable(void)
-{
-	if (mv64x60_wdt_toggle_wdc(MV64x60_WDC_ENABLED_FALSE,
-				   MV64x60_WDC_ENABLE_SHIFT)) {
-		mv64x60_wdt_service();
-		pr_notice("watchdog activated\n");
-	}
-}
-
-static void mv64x60_wdt_handler_disable(void)
-{
-	if (mv64x60_wdt_toggle_wdc(MV64x60_WDC_ENABLED_TRUE,
-				   MV64x60_WDC_ENABLE_SHIFT))
-		pr_notice("watchdog deactivated\n");
-}
-
-static void mv64x60_wdt_set_timeout(unsigned int timeout)
-{
-	/* maximum bus cycle count is 0xFFFFFFFF */
-	if (timeout > 0xFFFFFFFF / bus_clk)
-		timeout = 0xFFFFFFFF / bus_clk;
-
-	mv64x60_wdt_count = timeout * bus_clk >> 8;
-	mv64x60_wdt_timeout = timeout;
-}
-
-static int mv64x60_wdt_open(struct inode *inode, struct file *file)
-{
-	if (test_and_set_bit(MV64x60_WDOG_FLAG_OPENED, &wdt_flags))
-		return -EBUSY;
-
-	if (nowayout)
-		__module_get(THIS_MODULE);
-
-	mv64x60_wdt_handler_enable();
-
-	return stream_open(inode, file);
-}
-
-static int mv64x60_wdt_release(struct inode *inode, struct file *file)
-{
-	if (expect_close == 42)
-		mv64x60_wdt_handler_disable();
-	else {
-		pr_crit("unexpected close, not stopping timer!\n");
-		mv64x60_wdt_service();
-	}
-	expect_close = 0;
-
-	clear_bit(MV64x60_WDOG_FLAG_OPENED, &wdt_flags);
-
-	return 0;
-}
-
-static ssize_t mv64x60_wdt_write(struct file *file, const char __user *data,
-				 size_t len, loff_t *ppos)
-{
-	if (len) {
-		if (!nowayout) {
-			size_t i;
-
-			expect_close = 0;
-
-			for (i = 0; i != len; i++) {
-				char c;
-				if (get_user(c, data + i))
-					return -EFAULT;
-				if (c == 'V')
-					expect_close = 42;
-			}
-		}
-		mv64x60_wdt_service();
-	}
-
-	return len;
-}
-
-static long mv64x60_wdt_ioctl(struct file *file,
-					unsigned int cmd, unsigned long arg)
-{
-	int timeout;
-	int options;
-	void __user *argp = (void __user *)arg;
-	static const struct watchdog_info info = {
-		.options =	WDIOF_SETTIMEOUT	|
-				WDIOF_MAGICCLOSE	|
-				WDIOF_KEEPALIVEPING,
-		.firmware_version = 0,
-		.identity = "MV64x60 watchdog",
-	};
-
-	switch (cmd) {
-	case WDIOC_GETSUPPORT:
-		if (copy_to_user(argp, &info, sizeof(info)))
-			return -EFAULT;
-		break;
-
-	case WDIOC_GETSTATUS:
-	case WDIOC_GETBOOTSTATUS:
-		if (put_user(wdt_status, (int __user *)argp))
-			return -EFAULT;
-		wdt_status &= ~WDIOF_KEEPALIVEPING;
-		break;
-
-	case WDIOC_GETTEMP:
-		return -EOPNOTSUPP;
-
-	case WDIOC_SETOPTIONS:
-		if (get_user(options, (int __user *)argp))
-			return -EFAULT;
-
-		if (options & WDIOS_DISABLECARD)
-			mv64x60_wdt_handler_disable();
-
-		if (options & WDIOS_ENABLECARD)
-			mv64x60_wdt_handler_enable();
-		break;
-
-	case WDIOC_KEEPALIVE:
-		mv64x60_wdt_service();
-		wdt_status |= WDIOF_KEEPALIVEPING;
-		break;
-
-	case WDIOC_SETTIMEOUT:
-		if (get_user(timeout, (int __user *)argp))
-			return -EFAULT;
-		mv64x60_wdt_set_timeout(timeout);
-		fallthrough;
-
-	case WDIOC_GETTIMEOUT:
-		if (put_user(mv64x60_wdt_timeout, (int __user *)argp))
-			return -EFAULT;
-		break;
-
-	default:
-		return -ENOTTY;
-	}
-
-	return 0;
-}
-
-static const struct file_operations mv64x60_wdt_fops = {
-	.owner = THIS_MODULE,
-	.llseek = no_llseek,
-	.write = mv64x60_wdt_write,
-	.unlocked_ioctl = mv64x60_wdt_ioctl,
-	.compat_ioctl = compat_ptr_ioctl,
-	.open = mv64x60_wdt_open,
-	.release = mv64x60_wdt_release,
-};
-
-static struct miscdevice mv64x60_wdt_miscdev = {
-	.minor = WATCHDOG_MINOR,
-	.name = "watchdog",
-	.fops = &mv64x60_wdt_fops,
-};
-
-static int mv64x60_wdt_probe(struct platform_device *dev)
-{
-	struct mv64x60_wdt_pdata *pdata = dev_get_platdata(&dev->dev);
-	struct resource *r;
-	int timeout = 10;
-
-	bus_clk = 133;			/* in MHz */
-	if (pdata) {
-		timeout = pdata->timeout;
-		bus_clk = pdata->bus_clk;
-	}
-
-	/* Since bus_clk is truncated MHz, actual frequency could be
-	 * up to 1MHz higher.  Round up, since it's better to time out
-	 * too late than too soon.
-	 */
-	bus_clk++;
-	bus_clk *= 1000000;		/* convert to Hz */
-
-	r = platform_get_resource(dev, IORESOURCE_MEM, 0);
-	if (!r)
-		return -ENODEV;
-
-	mv64x60_wdt_regs = devm_ioremap(&dev->dev, r->start, resource_size(r));
-	if (mv64x60_wdt_regs == NULL)
-		return -ENOMEM;
-
-	mv64x60_wdt_set_timeout(timeout);
-
-	mv64x60_wdt_handler_disable();	/* in case timer was already running */
-
-	return misc_register(&mv64x60_wdt_miscdev);
-}
-
-static int mv64x60_wdt_remove(struct platform_device *dev)
-{
-	misc_deregister(&mv64x60_wdt_miscdev);
-
-	mv64x60_wdt_handler_disable();
-
-	return 0;
-}
-
-static struct platform_driver mv64x60_wdt_driver = {
-	.probe = mv64x60_wdt_probe,
-	.remove = mv64x60_wdt_remove,
-	.driver = {
-		.name = MV64x60_WDT_NAME,
-	},
-};
-
-static int __init mv64x60_wdt_init(void)
-{
-	pr_info("MV64x60 watchdog driver\n");
-
-	return platform_driver_register(&mv64x60_wdt_driver);
-}
-
-static void __exit mv64x60_wdt_exit(void)
-{
-	platform_driver_unregister(&mv64x60_wdt_driver);
-}
-
-module_init(mv64x60_wdt_init);
-module_exit(mv64x60_wdt_exit);
-
-MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
-MODULE_DESCRIPTION("MV64x60 watchdog driver");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:" MV64x60_WDT_NAME);
diff --git a/include/linux/mv643xx.h b/include/linux/mv643xx.h
index 47e5679b48e1..000b126acfb6 100644
--- a/include/linux/mv643xx.h
+++ b/include/linux/mv643xx.h
@@ -918,12 +918,4 @@
 
 extern void mv64340_irq_init(unsigned int base);
 
-/* Watchdog Platform Device, Driver Data */
-#define	MV64x60_WDT_NAME			"mv64x60_wdt"
-
-struct mv64x60_wdt_pdata {
-	int	timeout;	/* watchdog expiry in seconds, default 10 */
-	int	bus_clk;	/* bus clock in MHz, default 133 */
-};
-
 #endif /* __ASM_MV643XX_H */
-- 
cgit v1.2.3


From 4249cb7d920060dfa925d3b9f6a37f0a7c025a16 Mon Sep 17 00:00:00 2001
From: Huilong Deng <denghuilong@cdjrlc.com>
Date: Sun, 20 Jun 2021 22:29:15 +0800
Subject: printk: Remove trailing semicolon in macros

Macros should not use a trailing semicolon.

Signed-off-by: Huilong Deng <denghuilong@cdjrlc.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/dev_printk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h
index 6f009559ee54..82d3d46005a1 100644
--- a/include/linux/dev_printk.h
+++ b/include/linux/dev_printk.h
@@ -236,7 +236,7 @@ do {									\
  * using WARN/WARN_ONCE to include file/line information and a backtrace.
  */
 #define dev_WARN(dev, format, arg...) \
-	WARN(1, "%s %s: " format, dev_driver_string(dev), dev_name(dev), ## arg);
+	WARN(1, "%s %s: " format, dev_driver_string(dev), dev_name(dev), ## arg)
 
 #define dev_WARN_ONCE(dev, condition, format, arg...) \
 	WARN_ONCE(condition, "%s %s: " format, \
-- 
cgit v1.2.3


From d38ebaf2c88442a830d402fa7805ddbb60c4cd0c Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 14 Jun 2021 13:08:11 -0500
Subject: soundwire: export sdw_update() and sdw_update_no_pm()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We currently export sdw_read() and sdw_write() but the sdw_update()
and sdw_update_no_pm() are currently available only to the bus
code. This was missed in an earlier contribution.

Export both functions so that codec drivers can perform
read-modify-write operations without duplicating the code.

Fixes: b04c975e654c ('soundwire: bus: use sdw_update_no_pm when initializing a device')
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Bard Liao <bard.liao@intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Acked-By: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20210614180815.153711-2-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/soundwire/bus.c       | 17 ++++++++++++++++-
 drivers/soundwire/bus.h       | 13 -------------
 include/linux/soundwire/sdw.h |  3 +++
 3 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index a9e0aa72654d..5d5b0bd59ae3 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -492,7 +492,7 @@ int sdw_read_no_pm(struct sdw_slave *slave, u32 addr)
 }
 EXPORT_SYMBOL(sdw_read_no_pm);
 
-static int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val)
+int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val)
 {
 	int tmp;
 
@@ -503,6 +503,21 @@ static int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val)
 	tmp = (tmp & ~mask) | val;
 	return sdw_write_no_pm(slave, addr, tmp);
 }
+EXPORT_SYMBOL(sdw_update_no_pm);
+
+/* Read-Modify-Write Slave register */
+int sdw_update(struct sdw_slave *slave, u32 addr, u8 mask, u8 val)
+{
+	int tmp;
+
+	tmp = sdw_read(slave, addr);
+	if (tmp < 0)
+		return tmp;
+
+	tmp = (tmp & ~mask) | val;
+	return sdw_write(slave, addr, tmp);
+}
+EXPORT_SYMBOL(sdw_update);
 
 /**
  * sdw_nread() - Read "n" contiguous SDW Slave registers
diff --git a/drivers/soundwire/bus.h b/drivers/soundwire/bus.h
index 40354469860a..7631ef5e71fb 100644
--- a/drivers/soundwire/bus.h
+++ b/drivers/soundwire/bus.h
@@ -201,19 +201,6 @@ static inline void sdw_fill_port_params(struct sdw_port_params *params,
 	params->data_mode = data_mode;
 }
 
-/* Read-Modify-Write Slave register */
-static inline int sdw_update(struct sdw_slave *slave, u32 addr, u8 mask, u8 val)
-{
-	int tmp;
-
-	tmp = sdw_read(slave, addr);
-	if (tmp < 0)
-		return tmp;
-
-	tmp = (tmp & ~mask) | val;
-	return sdw_write(slave, addr, tmp);
-}
-
 /* broadcast read/write for tests */
 int sdw_bread_no_pm_unlocked(struct sdw_bus *bus, u16 dev_num, u32 addr);
 int sdw_bwrite_no_pm_unlocked(struct sdw_bus *bus, u16 dev_num, u32 addr, u8 value);
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index ced07f8fde87..de9802a24e7e 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -1041,6 +1041,9 @@ int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value);
 int sdw_read_no_pm(struct sdw_slave *slave, u32 addr);
 int sdw_nread(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
 int sdw_nwrite(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
+int sdw_update(struct sdw_slave *slave, u32 addr, u8 mask, u8 val);
+int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val);
+
 int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id);
 void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id);
 
-- 
cgit v1.2.3


From dfa19b11385d4cf8f0242fd93e2073e25183c331 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 3 Jun 2021 08:40:45 +0300
Subject: reboot: Add hardware protection power-off

There can be few cases when we need to shut-down the system in order to
protect the hardware. Currently this is done at least by the thermal core
when temperature raises over certain limit.

Some PMICs can also generate interrupts for example for over-current or
over-voltage, voltage drops, short-circuit, ... etc. On some systems
these are a sign of hardware failure and only thing to do is try to
protect the rest of the hardware by shutting down the system.

Add shut-down logic which can be used by all subsystems instead of
implementing the shutdown in each subsystem. The logic is stolen from
thermal_core with difference of using atomic_t instead of a mutex in
order to allow calls directly from IRQ context and changing the WARN()
to pr_emerg() as discussed here:
https://lore.kernel.org/lkml/YJuPwAZroVZ%2Fw633@alley/
and here:
https://lore.kernel.org/linux-iommu/20210331093104.383705-4-geert+renesas@glider.be/

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/e83ec1ca9408f90c857ea9dcdc57b14d9037b03f.1622628333.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/reboot.h |  1 +
 kernel/reboot.c        | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 3734cd8f38a8..af907a3d68d1 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -79,6 +79,7 @@ extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
 
 extern void orderly_poweroff(bool force);
 extern void orderly_reboot(void);
+void hw_protection_shutdown(const char *reason, int ms_until_forced);
 
 /*
  * Emergency restart, callable from an interrupt handler.
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a6ad5eb2fa73..f7440c0c7e43 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -7,6 +7,7 @@
 
 #define pr_fmt(fmt)	"reboot: " fmt
 
+#include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/export.h>
 #include <linux/kexec.h>
@@ -518,6 +519,84 @@ void orderly_reboot(void)
 }
 EXPORT_SYMBOL_GPL(orderly_reboot);
 
+/**
+ * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay
+ * @work: work_struct associated with the emergency poweroff function
+ *
+ * This function is called in very critical situations to force
+ * a kernel poweroff after a configurable timeout value.
+ */
+static void hw_failure_emergency_poweroff_func(struct work_struct *work)
+{
+	/*
+	 * We have reached here after the emergency shutdown waiting period has
+	 * expired. This means orderly_poweroff has not been able to shut off
+	 * the system for some reason.
+	 *
+	 * Try to shut down the system immediately using kernel_power_off
+	 * if populated
+	 */
+	pr_emerg("Hardware protection timed-out. Trying forced poweroff\n");
+	kernel_power_off();
+
+	/*
+	 * Worst of the worst case trigger emergency restart
+	 */
+	pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n");
+	emergency_restart();
+}
+
+static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
+			    hw_failure_emergency_poweroff_func);
+
+/**
+ * hw_failure_emergency_poweroff - Trigger an emergency system poweroff
+ *
+ * This may be called from any critical situation to trigger a system shutdown
+ * after a given period of time. If time is negative this is not scheduled.
+ */
+static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
+{
+	if (poweroff_delay_ms <= 0)
+		return;
+	schedule_delayed_work(&hw_failure_emergency_poweroff_work,
+			      msecs_to_jiffies(poweroff_delay_ms));
+}
+
+/**
+ * hw_protection_shutdown - Trigger an emergency system poweroff
+ *
+ * @reason:		Reason of emergency shutdown to be printed.
+ * @ms_until_forced:	Time to wait for orderly shutdown before tiggering a
+ *			forced shudown. Negative value disables the forced
+ *			shutdown.
+ *
+ * Initiate an emergency system shutdown in order to protect hardware from
+ * further damage. Usage examples include a thermal protection or a voltage or
+ * current regulator failures.
+ * NOTE: The request is ignored if protection shutdown is already pending even
+ * if the previous request has given a large timeout for forced shutdown.
+ * Can be called from any context.
+ */
+void hw_protection_shutdown(const char *reason, int ms_until_forced)
+{
+	static atomic_t allow_proceed = ATOMIC_INIT(1);
+
+	pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
+
+	/* Shutdown should be initiated only once. */
+	if (!atomic_dec_and_test(&allow_proceed))
+		return;
+
+	/*
+	 * Queue a backup emergency shutdown in the event of
+	 * orderly_poweroff failure
+	 */
+	hw_failure_emergency_poweroff(ms_until_forced);
+	orderly_poweroff(true);
+}
+EXPORT_SYMBOL_GPL(hw_protection_shutdown);
+
 static int __init reboot_setup(char *str)
 {
 	for (;;) {
-- 
cgit v1.2.3


From e6c3092d43faf0aa095160cc552f8c05490d0962 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 3 Jun 2021 08:41:21 +0300
Subject: regulator: add warning flags

Add 'warning' level events and error flags to regulator core.
Current regulator core notifications are used to inform consumers
about errors where HW is misbehaving in such way it is assumed to
be broken/unrecoverable.

There are PMICs which are designed for system(s) that may have use
for regulator indications sent before HW is damaged so that some
board/consumer specific recovery-event can be performed while
continuing most of the normal operations.

Add new WARNING level events and notifications to be used for
that purpose.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/9b54aa5589ae4b5945d53d114bac3fae55fa4818.1622628333.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/consumer.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 20e84a84fb77..f72ca73631be 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -119,6 +119,16 @@ struct regulator_dev;
 #define REGULATOR_EVENT_PRE_DISABLE		0x400
 #define REGULATOR_EVENT_ABORT_DISABLE		0x800
 #define REGULATOR_EVENT_ENABLE			0x1000
+/*
+ * Following notifications should be emitted only if detected condition
+ * is such that the HW is likely to still be working but consumers should
+ * take a recovery action to prevent problems esacalating into errors.
+ */
+#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN	0x2000
+#define REGULATOR_EVENT_OVER_CURRENT_WARN	0x4000
+#define REGULATOR_EVENT_OVER_VOLTAGE_WARN	0x8000
+#define REGULATOR_EVENT_OVER_TEMP_WARN		0x10000
+#define REGULATOR_EVENT_WARN_MASK		0x1E000
 
 /*
  * Regulator errors that can be queried using regulator_get_error_flags
@@ -138,6 +148,10 @@ struct regulator_dev;
 #define REGULATOR_ERROR_FAIL			BIT(4)
 #define REGULATOR_ERROR_OVER_TEMP		BIT(5)
 
+#define REGULATOR_ERROR_UNDER_VOLTAGE_WARN	BIT(6)
+#define REGULATOR_ERROR_OVER_CURRENT_WARN	BIT(7)
+#define REGULATOR_ERROR_OVER_VOLTAGE_WARN	BIT(8)
+#define REGULATOR_ERROR_OVER_TEMP_WARN		BIT(9)
 
 /**
  * struct pre_voltage_change_data - Data sent with PRE_VOLTAGE_CHANGE event
-- 
cgit v1.2.3


From 157d2230193ae683fcffcc1cd0a2c3aa4479955f Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 3 Jun 2021 08:41:37 +0300
Subject: regulator: move rdev_print helpers to internal.h

The rdev print helpers are a nice way to print messages related to a
specific regulator device. Move them from core.c to internal.h

As the rdev print helpers use rdev_get_name() export it from core.c. Also
move the declaration from coupler.h to driver.h because the rdev name is
not just a coupled regulator property. I guess the main audience for
rdev_get_name() will be the regulator core and drivers.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/dc7fd70dc31de4d0e820b7646bb78eeb04f80735.1622628333.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c          | 12 +-----------
 drivers/regulator/internal.h      | 11 +++++++++++
 include/linux/regulator/coupler.h |  5 -----
 include/linux/regulator/driver.h  | 10 ++++++++++
 4 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index f192bf19492e..a8188f7e5072 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -33,17 +33,6 @@
 #include "dummy.h"
 #include "internal.h"
 
-#define rdev_crit(rdev, fmt, ...)					\
-	pr_crit("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
-#define rdev_err(rdev, fmt, ...)					\
-	pr_err("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
-#define rdev_warn(rdev, fmt, ...)					\
-	pr_warn("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
-#define rdev_info(rdev, fmt, ...)					\
-	pr_info("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
-#define rdev_dbg(rdev, fmt, ...)					\
-	pr_debug("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
-
 static DEFINE_WW_CLASS(regulator_ww_class);
 static DEFINE_MUTEX(regulator_nesting_mutex);
 static DEFINE_MUTEX(regulator_list_mutex);
@@ -117,6 +106,7 @@ const char *rdev_get_name(struct regulator_dev *rdev)
 	else
 		return "";
 }
+EXPORT_SYMBOL_GPL(rdev_get_name);
 
 static bool have_full_constraints(void)
 {
diff --git a/drivers/regulator/internal.h b/drivers/regulator/internal.h
index 2391b565ef11..1e9c71642143 100644
--- a/drivers/regulator/internal.h
+++ b/drivers/regulator/internal.h
@@ -15,6 +15,17 @@
 
 #define REGULATOR_STATES_NUM	(PM_SUSPEND_MAX + 1)
 
+#define rdev_crit(rdev, fmt, ...)					\
+	pr_crit("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
+#define rdev_err(rdev, fmt, ...)					\
+	pr_err("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
+#define rdev_warn(rdev, fmt, ...)					\
+	pr_warn("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
+#define rdev_info(rdev, fmt, ...)					\
+	pr_info("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
+#define rdev_dbg(rdev, fmt, ...)					\
+	pr_debug("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__)
+
 struct regulator_voltage {
 	int min_uV;
 	int max_uV;
diff --git a/include/linux/regulator/coupler.h b/include/linux/regulator/coupler.h
index 5f86824bd117..73291f280a23 100644
--- a/include/linux/regulator/coupler.h
+++ b/include/linux/regulator/coupler.h
@@ -52,7 +52,6 @@ struct regulator_coupler {
 
 #ifdef CONFIG_REGULATOR
 int regulator_coupler_register(struct regulator_coupler *coupler);
-const char *rdev_get_name(struct regulator_dev *rdev);
 int regulator_check_consumers(struct regulator_dev *rdev,
 			      int *min_uV, int *max_uV,
 			      suspend_state_t state);
@@ -69,10 +68,6 @@ static inline int regulator_coupler_register(struct regulator_coupler *coupler)
 {
 	return 0;
 }
-static inline const char *rdev_get_name(struct regulator_dev *rdev)
-{
-	return NULL;
-}
 static inline int regulator_check_consumers(struct regulator_dev *rdev,
 					    int *min_uV, int *max_uV,
 					    suspend_state_t state)
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 4ea520c248e9..7ec0fa79d1a8 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -550,4 +550,14 @@ int regulator_desc_list_voltage_linear_range(const struct regulator_desc *desc,
 
 int regulator_desc_list_voltage_linear(const struct regulator_desc *desc,
 				       unsigned int selector);
+
+#ifdef CONFIG_REGULATOR
+const char *rdev_get_name(struct regulator_dev *rdev);
+#else
+static inline const char *rdev_get_name(struct regulator_dev *rdev)
+{
+	return NULL;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 7111c6d1b31b42c8c758f6681e895a5116e3bad6 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 3 Jun 2021 08:41:55 +0300
Subject: regulator: IRQ based event/error notification helpers

Provide helper function for IC's implementing regulator notifications
when an IRQ fires. The helper also works for IRQs which can not be acked.
Helper can be set to disable the IRQ at handler and then re-enabling it
on delayed work later. The helper also adds regulator_get_error_flags()
errors in cache for the duration of IRQ disabling.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/ebdf86d8c22b924667ec2385330e30fcbfac0119.1622628334.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/Makefile       |   2 +-
 drivers/regulator/core.c         |  29 ++-
 drivers/regulator/devres.c       |  52 +++++
 drivers/regulator/irq_helpers.c  | 397 +++++++++++++++++++++++++++++++++++++++
 include/linux/regulator/driver.h | 135 +++++++++++++
 5 files changed, 607 insertions(+), 8 deletions(-)
 create mode 100644 drivers/regulator/irq_helpers.c

(limited to 'include/linux')

diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index 580b015296ea..534fc0163bc4 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -4,7 +4,7 @@
 #
 
 
-obj-$(CONFIG_REGULATOR) += core.o dummy.o fixed-helper.o helpers.o devres.o
+obj-$(CONFIG_REGULATOR) += core.o dummy.o fixed-helper.o helpers.o devres.o irq_helpers.o
 obj-$(CONFIG_OF) += of_regulator.o
 obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o
 obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index a8188f7e5072..85b6d3960369 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -4370,22 +4370,36 @@ unsigned int regulator_get_mode(struct regulator *regulator)
 }
 EXPORT_SYMBOL_GPL(regulator_get_mode);
 
+static int rdev_get_cached_err_flags(struct regulator_dev *rdev)
+{
+	int ret = 0;
+
+	if (rdev->use_cached_err) {
+		spin_lock(&rdev->err_lock);
+		ret = rdev->cached_err;
+		spin_unlock(&rdev->err_lock);
+	}
+	return ret;
+}
+
 static int _regulator_get_error_flags(struct regulator_dev *rdev,
 					unsigned int *flags)
 {
-	int ret;
+	int cached_flags, ret = 0;
 
 	regulator_lock(rdev);
 
-	/* sanity check */
-	if (!rdev->desc->ops->get_error_flags) {
+	cached_flags = rdev_get_cached_err_flags(rdev);
+
+	if (rdev->desc->ops->get_error_flags)
+		ret = rdev->desc->ops->get_error_flags(rdev, flags);
+	else if (!rdev->use_cached_err)
 		ret = -EINVAL;
-		goto out;
-	}
 
-	ret = rdev->desc->ops->get_error_flags(rdev, flags);
-out:
+	*flags |= cached_flags;
+
 	regulator_unlock(rdev);
+
 	return ret;
 }
 
@@ -5218,6 +5232,7 @@ regulator_register(const struct regulator_desc *regulator_desc,
 		goto rinse;
 	}
 	device_initialize(&rdev->dev);
+	spin_lock_init(&rdev->err_lock);
 
 	/*
 	 * Duplicate the config so the driver could override it after
diff --git a/drivers/regulator/devres.c b/drivers/regulator/devres.c
index 3091210889e3..a8de0aa88bad 100644
--- a/drivers/regulator/devres.c
+++ b/drivers/regulator/devres.c
@@ -481,3 +481,55 @@ void devm_regulator_unregister_notifier(struct regulator *regulator,
 		WARN_ON(rc);
 }
 EXPORT_SYMBOL_GPL(devm_regulator_unregister_notifier);
+
+static void regulator_irq_helper_drop(void *res)
+{
+	regulator_irq_helper_cancel(&res);
+}
+
+/**
+ * devm_regulator_irq_helper - resource managed registration of IRQ based
+ * regulator event/error notifier
+ *
+ * @dev:		device to which lifetime the helper's lifetime is
+ *			bound.
+ * @d:			IRQ helper descriptor.
+ * @irq:		IRQ used to inform events/errors to be notified.
+ * @irq_flags:		Extra IRQ flags to be OR'ed with the default
+ *			IRQF_ONESHOT when requesting the (threaded) irq.
+ * @common_errs:	Errors which can be flagged by this IRQ for all rdevs.
+ *			When IRQ is re-enabled these errors will be cleared
+ *			from all associated regulators
+ * @per_rdev_errs:	Optional error flag array describing errors specific
+ *			for only some of the regulators. These errors will be
+ *			or'ed with common errors. If this is given the array
+ *			should contain rdev_amount flags. Can be set to NULL
+ *			if there is no regulator specific error flags for this
+ *			IRQ.
+ * @rdev:		Array of pointers to regulators associated with this
+ *			IRQ.
+ * @rdev_amount:	Amount of regulators associated with this IRQ.
+ *
+ * Return: handle to irq_helper or an ERR_PTR() encoded error code.
+ */
+void *devm_regulator_irq_helper(struct device *dev,
+				const struct regulator_irq_desc *d, int irq,
+				int irq_flags, int common_errs,
+				int *per_rdev_errs,
+				struct regulator_dev **rdev, int rdev_amount)
+{
+	void *ptr;
+	int ret;
+
+	ptr = regulator_irq_helper(dev, d, irq, irq_flags, common_errs,
+				    per_rdev_errs, rdev, rdev_amount);
+	if (IS_ERR(ptr))
+		return ptr;
+
+	ret = devm_add_action_or_reset(dev, regulator_irq_helper_drop, ptr);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return ptr;
+}
+EXPORT_SYMBOL_GPL(devm_regulator_irq_helper);
diff --git a/drivers/regulator/irq_helpers.c b/drivers/regulator/irq_helpers.c
new file mode 100644
index 000000000000..fabe2e53093e
--- /dev/null
+++ b/drivers/regulator/irq_helpers.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (C) 2021 ROHM Semiconductors
+// regulator IRQ based event notification helpers
+//
+// Logic has been partially adapted from qcom-labibb driver.
+//
+// Author: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/regulator/driver.h>
+
+#include "internal.h"
+
+#define REGULATOR_FORCED_SAFETY_SHUTDOWN_WAIT_MS 10000
+
+struct regulator_irq {
+	struct regulator_irq_data rdata;
+	struct regulator_irq_desc desc;
+	int irq;
+	int retry_cnt;
+	struct delayed_work isr_work;
+};
+
+/*
+ * Should only be called from threaded handler to prevent potential deadlock
+ */
+static void rdev_flag_err(struct regulator_dev *rdev, int err)
+{
+	spin_lock(&rdev->err_lock);
+	rdev->cached_err |= err;
+	spin_unlock(&rdev->err_lock);
+}
+
+static void rdev_clear_err(struct regulator_dev *rdev, int err)
+{
+	spin_lock(&rdev->err_lock);
+	rdev->cached_err &= ~err;
+	spin_unlock(&rdev->err_lock);
+}
+
+static void regulator_notifier_isr_work(struct work_struct *work)
+{
+	struct regulator_irq *h;
+	struct regulator_irq_desc *d;
+	struct regulator_irq_data *rid;
+	int ret = 0;
+	int tmo, i;
+	int num_rdevs;
+
+	h = container_of(work, struct regulator_irq,
+			    isr_work.work);
+	d = &h->desc;
+	rid = &h->rdata;
+	num_rdevs = rid->num_states;
+
+reread:
+	if (d->fatal_cnt && h->retry_cnt > d->fatal_cnt) {
+		if (!d->die)
+			return hw_protection_shutdown("Regulator HW failure? - no IC recovery",
+						      REGULATOR_FORCED_SAFETY_SHUTDOWN_WAIT_MS);
+		ret = d->die(rid);
+		/*
+		 * If the 'last resort' IC recovery failed we will have
+		 * nothing else left to do...
+		 */
+		if (ret)
+			return hw_protection_shutdown("Regulator HW failure. IC recovery failed",
+						      REGULATOR_FORCED_SAFETY_SHUTDOWN_WAIT_MS);
+
+		/*
+		 * If h->die() was implemented we assume recovery has been
+		 * attempted (probably regulator was shut down) and we
+		 * just enable IRQ and bail-out.
+		 */
+		goto enable_out;
+	}
+	if (d->renable) {
+		ret = d->renable(rid);
+
+		if (ret == REGULATOR_FAILED_RETRY) {
+			/* Driver could not get current status */
+			h->retry_cnt++;
+			if (!d->reread_ms)
+				goto reread;
+
+			tmo = d->reread_ms;
+			goto reschedule;
+		}
+
+		if (ret) {
+			/*
+			 * IC status reading succeeded. update error info
+			 * just in case the renable changed it.
+			 */
+			for (i = 0; i < num_rdevs; i++) {
+				struct regulator_err_state *stat;
+				struct regulator_dev *rdev;
+
+				stat = &rid->states[i];
+				rdev = stat->rdev;
+				rdev_clear_err(rdev, (~stat->errors) &
+						      stat->possible_errs);
+			}
+			h->retry_cnt++;
+			/*
+			 * The IC indicated problem is still ON - no point in
+			 * re-enabling the IRQ. Retry later.
+			 */
+			tmo = d->irq_off_ms;
+			goto reschedule;
+		}
+	}
+
+	/*
+	 * Either IC reported problem cleared or no status checker was provided.
+	 * If problems are gone - good. If not - then the IRQ will fire again
+	 * and we'll have a new nice loop. In any case we should clear error
+	 * flags here and re-enable IRQs.
+	 */
+	for (i = 0; i < num_rdevs; i++) {
+		struct regulator_err_state *stat;
+		struct regulator_dev *rdev;
+
+		stat = &rid->states[i];
+		rdev = stat->rdev;
+		rdev_clear_err(rdev, stat->possible_errs);
+	}
+
+	/*
+	 * Things have been seemingly successful => zero retry-counter.
+	 */
+	h->retry_cnt = 0;
+
+enable_out:
+	enable_irq(h->irq);
+
+	return;
+
+reschedule:
+	if (!d->high_prio)
+		mod_delayed_work(system_wq, &h->isr_work,
+				 msecs_to_jiffies(tmo));
+	else
+		mod_delayed_work(system_highpri_wq, &h->isr_work,
+				 msecs_to_jiffies(tmo));
+}
+
+static irqreturn_t regulator_notifier_isr(int irq, void *data)
+{
+	struct regulator_irq *h = data;
+	struct regulator_irq_desc *d;
+	struct regulator_irq_data *rid;
+	unsigned long rdev_map = 0;
+	int num_rdevs;
+	int ret, i;
+
+	d = &h->desc;
+	rid = &h->rdata;
+	num_rdevs = rid->num_states;
+
+	if (d->fatal_cnt)
+		h->retry_cnt++;
+
+	/*
+	 * we spare a few cycles by not clearing statuses prior to this call.
+	 * The IC driver must initialize the status buffers for rdevs
+	 * which it indicates having active events via rdev_map.
+	 *
+	 * Maybe we should just to be on a safer side(?)
+	 */
+	ret = d->map_event(irq, rid, &rdev_map);
+
+	/*
+	 * If status reading fails (which is unlikely) we don't ack/disable
+	 * IRQ but just increase fail count and retry when IRQ fires again.
+	 * If retry_count exceeds the given safety limit we call IC specific die
+	 * handler which can try disabling regulator(s).
+	 *
+	 * If no die handler is given we will just bug() as a last resort.
+	 *
+	 * We could try disabling all associated rdevs - but we might shoot
+	 * ourselves in the head and leave the problematic regulator enabled. So
+	 * if IC has no die-handler populated we just assume the regulator
+	 * can't be disabled.
+	 */
+	if (unlikely(ret == REGULATOR_FAILED_RETRY))
+		goto fail_out;
+
+	h->retry_cnt = 0;
+	/*
+	 * Let's not disable IRQ if there were no status bits for us. We'd
+	 * better leave spurious IRQ handling to genirq
+	 */
+	if (ret || !rdev_map)
+		return IRQ_NONE;
+
+	/*
+	 * Some events are bogus if the regulator is disabled. Skip such events
+	 * if all relevant regulators are disabled
+	 */
+	if (d->skip_off) {
+		for_each_set_bit(i, &rdev_map, num_rdevs) {
+			struct regulator_dev *rdev;
+			const struct regulator_ops *ops;
+
+			rdev = rid->states[i].rdev;
+			ops = rdev->desc->ops;
+
+			/*
+			 * If any of the flagged regulators is enabled we do
+			 * handle this
+			 */
+			if (ops->is_enabled(rdev))
+				break;
+		}
+		if (i == num_rdevs)
+			return IRQ_NONE;
+	}
+
+	/* Disable IRQ if HW keeps line asserted */
+	if (d->irq_off_ms)
+		disable_irq_nosync(irq);
+
+	/*
+	 * IRQ seems to be for us. Let's fire correct notifiers / store error
+	 * flags
+	 */
+	for_each_set_bit(i, &rdev_map, num_rdevs) {
+		struct regulator_err_state *stat;
+		struct regulator_dev *rdev;
+
+		stat = &rid->states[i];
+		rdev = stat->rdev;
+
+		rdev_dbg(rdev, "Sending regulator notification EVT 0x%lx\n",
+			 stat->notifs);
+
+		regulator_notifier_call_chain(rdev, stat->notifs, NULL);
+		rdev_flag_err(rdev, stat->errors);
+	}
+
+	if (d->irq_off_ms) {
+		if (!d->high_prio)
+			schedule_delayed_work(&h->isr_work,
+					      msecs_to_jiffies(d->irq_off_ms));
+		else
+			mod_delayed_work(system_highpri_wq,
+					 &h->isr_work,
+					 msecs_to_jiffies(d->irq_off_ms));
+	}
+
+	return IRQ_HANDLED;
+
+fail_out:
+	if (d->fatal_cnt && h->retry_cnt > d->fatal_cnt) {
+		/* If we have no recovery, just try shut down straight away */
+		if (!d->die) {
+			hw_protection_shutdown("Regulator failure. Retry count exceeded",
+					       REGULATOR_FORCED_SAFETY_SHUTDOWN_WAIT_MS);
+		} else {
+			ret = d->die(rid);
+			/* If die() failed shut down as a last attempt to save the HW */
+			if (ret)
+				hw_protection_shutdown("Regulator failure. Recovery failed",
+						       REGULATOR_FORCED_SAFETY_SHUTDOWN_WAIT_MS);
+		}
+	}
+
+	return IRQ_NONE;
+}
+
+static int init_rdev_state(struct device *dev, struct regulator_irq *h,
+			   struct regulator_dev **rdev, int common_err,
+			   int *rdev_err, int rdev_amount)
+{
+	int i;
+
+	h->rdata.states = devm_kzalloc(dev, sizeof(*h->rdata.states) *
+				       rdev_amount, GFP_KERNEL);
+	if (!h->rdata.states)
+		return -ENOMEM;
+
+	h->rdata.num_states = rdev_amount;
+	h->rdata.data = h->desc.data;
+
+	for (i = 0; i < rdev_amount; i++) {
+		h->rdata.states[i].possible_errs = common_err;
+		if (rdev_err)
+			h->rdata.states[i].possible_errs |= *rdev_err++;
+		h->rdata.states[i].rdev = *rdev++;
+	}
+
+	return 0;
+}
+
+static void init_rdev_errors(struct regulator_irq *h)
+{
+	int i;
+
+	for (i = 0; i < h->rdata.num_states; i++)
+		if (h->rdata.states[i].possible_errs)
+			h->rdata.states[i].rdev->use_cached_err = true;
+}
+
+/**
+ * regulator_irq_helper - register IRQ based regulator event/error notifier
+ *
+ * @dev:		device providing the IRQs
+ * @d:			IRQ helper descriptor.
+ * @irq:		IRQ used to inform events/errors to be notified.
+ * @irq_flags:		Extra IRQ flags to be OR'ed with the default
+ *			IRQF_ONESHOT when requesting the (threaded) irq.
+ * @common_errs:	Errors which can be flagged by this IRQ for all rdevs.
+ *			When IRQ is re-enabled these errors will be cleared
+ *			from all associated regulators
+ * @per_rdev_errs:	Optional error flag array describing errors specific
+ *			for only some of the regulators. These errors will be
+ *			or'ed with common errors. If this is given the array
+ *			should contain rdev_amount flags. Can be set to NULL
+ *			if there is no regulator specific error flags for this
+ *			IRQ.
+ * @rdev:		Array of pointers to regulators associated with this
+ *			IRQ.
+ * @rdev_amount:	Amount of regulators associated with this IRQ.
+ *
+ * Return: handle to irq_helper or an ERR_PTR() encoded error code.
+ */
+void *regulator_irq_helper(struct device *dev,
+			   const struct regulator_irq_desc *d, int irq,
+			   int irq_flags, int common_errs, int *per_rdev_errs,
+			   struct regulator_dev **rdev, int rdev_amount)
+{
+	struct regulator_irq *h;
+	int ret;
+
+	if (!rdev_amount || !d || !d->map_event || !d->name)
+		return ERR_PTR(-EINVAL);
+
+	h = devm_kzalloc(dev, sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return ERR_PTR(-ENOMEM);
+
+	h->irq = irq;
+	h->desc = *d;
+
+	ret = init_rdev_state(dev, h, rdev, common_errs, per_rdev_errs,
+			      rdev_amount);
+	if (ret)
+		return ERR_PTR(ret);
+
+	init_rdev_errors(h);
+
+	if (h->desc.irq_off_ms)
+		INIT_DELAYED_WORK(&h->isr_work, regulator_notifier_isr_work);
+
+	ret = request_threaded_irq(h->irq, NULL, regulator_notifier_isr,
+				   IRQF_ONESHOT | irq_flags, h->desc.name, h);
+	if (ret) {
+		dev_err(dev, "Failed to request IRQ %d\n", irq);
+
+		return ERR_PTR(ret);
+	}
+
+	return h;
+}
+EXPORT_SYMBOL_GPL(regulator_irq_helper);
+
+/**
+ * regulator_irq_helper_cancel - drop IRQ based regulator event/error notifier
+ *
+ * @handle:		Pointer to handle returned by a successful call to
+ *			regulator_irq_helper(). Will be NULLed upon return.
+ *
+ * The associated IRQ is released and work is cancelled when the function
+ * returns.
+ */
+void regulator_irq_helper_cancel(void **handle)
+{
+	if (handle && *handle) {
+		struct regulator_irq *h = *handle;
+
+		free_irq(h->irq, h);
+		if (h->desc.irq_off_ms)
+			cancel_delayed_work_sync(&h->isr_work);
+
+		h = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(regulator_irq_helper_cancel);
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 7ec0fa79d1a8..1d1a8951e740 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -413,6 +413,128 @@ struct regulator_config {
 	struct gpio_desc *ena_gpiod;
 };
 
+/**
+ * struct regulator_err_state - regulator error/notification status
+ *
+ * @rdev:		Regulator which status the struct indicates.
+ * @notifs:		Events which have occurred on the regulator.
+ * @errors:		Errors which are active on the regulator.
+ * @possible_errs:	Errors which can be signaled (by given IRQ).
+ */
+struct regulator_err_state {
+	struct regulator_dev *rdev;
+	unsigned long notifs;
+	unsigned long errors;
+	int possible_errs;
+};
+
+/**
+ * struct regulator_irq_data - regulator error/notification status date
+ *
+ * @states:	Status structs for each of the associated regulators.
+ * @num_states:	Amount of associated regulators.
+ * @data:	Driver data pointer given at regulator_irq_desc.
+ * @opaque:	Value storage for IC driver. Core does not update this. ICs
+ *		may want to store status register value here at map_event and
+ *		compare contents at 'renable' callback to see if new problems
+ *		have been added to status. If that is the case it may be
+ *		desirable to return REGULATOR_ERROR_CLEARED and not
+ *		REGULATOR_ERROR_ON to allow IRQ fire again and to generate
+ *		notifications also for the new issues.
+ *
+ * This structure is passed to 'map_event' and 'renable' callbacks for
+ * reporting regulator status to core.
+ */
+struct regulator_irq_data {
+	struct regulator_err_state *states;
+	int num_states;
+	void *data;
+	long opaque;
+};
+
+/**
+ * struct regulator_irq_desc - notification sender for IRQ based events.
+ *
+ * @name:	The visible name for the IRQ
+ * @fatal_cnt:	If this IRQ is used to signal HW damaging condition it may be
+ *		best to shut-down regulator(s) or reboot the SOC if error
+ *		handling is repeatedly failing. If fatal_cnt is given the IRQ
+ *		handling is aborted if it fails for fatal_cnt times and die()
+ *		callback (if populated) or BUG() is called to try to prevent
+ *		further damage.
+ * @reread_ms:	The time which is waited before attempting to re-read status
+ *		at the worker if IC reading fails. Immediate re-read is done
+ *		if time is not specified.
+ * @irq_off_ms:	The time which IRQ is kept disabled before re-evaluating the
+ *		status for devices which keep IRQ disabled for duration of the
+ *		error. If this is not given the IRQ is left enabled and renable
+ *		is not called.
+ * @skip_off:	If set to true the IRQ handler will attempt to check if any of
+ *		the associated regulators are enabled prior to taking other
+ *		actions. If no regulators are enabled and this is set to true
+ *		a spurious IRQ is assumed and IRQ_NONE is returned.
+ * @high_prio:	Boolean to indicate that high priority WQ should be used.
+ * @data:	Driver private data pointer which will be passed as such to
+ *		the renable, map_event and die callbacks in regulator_irq_data.
+ * @die:	Protection callback. If IC status reading or recovery actions
+ *		fail fatal_cnt times this callback or BUG() is called. This
+ *		callback should implement a final protection attempt like
+ *		disabling the regulator. If protection succeeded this may
+ *		return 0. If anything else is returned the core assumes final
+ *		protection failed and calls BUG() as a last resort.
+ * @map_event:	Driver callback to map IRQ status into regulator devices with
+ *		events / errors. NOTE: callback MUST initialize both the
+ *		errors and notifs for all rdevs which it signals having
+ *		active events as core does not clean the map data.
+ *		REGULATOR_FAILED_RETRY can be returned to indicate that the
+ *		status reading from IC failed. If this is repeated for
+ *		fatal_cnt times the core will call die() callback or BUG()
+ *		as a last resort to protect the HW.
+ * @renable:	Optional callback to check status (if HW supports that) before
+ *		re-enabling IRQ. If implemented this should clear the error
+ *		flags so that errors fetched by regulator_get_error_flags()
+ *		are updated. If callback is not implemented then errors are
+ *		assumed to be cleared and IRQ is re-enabled.
+ *		REGULATOR_FAILED_RETRY can be returned to
+ *		indicate that the status reading from IC failed. If this is
+ *		repeated for 'fatal_cnt' times the core will call die()
+ *		callback or BUG() as a last resort to protect the HW.
+ *		Returning zero indicates that the problem in HW has been solved
+ *		and IRQ will be re-enabled. Returning REGULATOR_ERROR_ON
+ *		indicates the error condition is still active and keeps IRQ
+ *		disabled. Please note that returning REGULATOR_ERROR_ON does
+ *		not retrigger evaluating what events are active or resending
+ *		notifications. If this is needed you probably want to return
+ *		zero and allow IRQ to retrigger causing events to be
+ *		re-evaluated and re-sent.
+ *
+ * This structure is used for registering regulator IRQ notification helper.
+ */
+struct regulator_irq_desc {
+	const char *name;
+	int irq_flags;
+	int fatal_cnt;
+	int reread_ms;
+	int irq_off_ms;
+	bool skip_off;
+	bool high_prio;
+	void *data;
+
+	int (*die)(struct regulator_irq_data *rid);
+	int (*map_event)(int irq, struct regulator_irq_data *rid,
+			  unsigned long *dev_mask);
+	int (*renable)(struct regulator_irq_data *rid);
+};
+
+/*
+ * Return values for regulator IRQ helpers.
+ */
+enum {
+	REGULATOR_ERROR_CLEARED,
+	REGULATOR_FAILED_RETRY,
+	REGULATOR_ERROR_ON,
+};
+
 /*
  * struct coupling_desc
  *
@@ -477,6 +599,9 @@ struct regulator_dev {
 
 	/* time when this regulator was disabled last time */
 	ktime_t last_off;
+	int cached_err;
+	bool use_cached_err;
+	spinlock_t err_lock;
 };
 
 struct regulator_dev *
@@ -491,6 +616,16 @@ void devm_regulator_unregister(struct device *dev, struct regulator_dev *rdev);
 
 int regulator_notifier_call_chain(struct regulator_dev *rdev,
 				  unsigned long event, void *data);
+void *devm_regulator_irq_helper(struct device *dev,
+				const struct regulator_irq_desc *d, int irq,
+				int irq_flags, int common_errs,
+				int *per_rdev_errs, struct regulator_dev **rdev,
+				int rdev_amount);
+void *regulator_irq_helper(struct device *dev,
+			   const struct regulator_irq_desc *d, int irq,
+			   int irq_flags, int common_errs, int *per_rdev_errs,
+			   struct regulator_dev **rdev, int rdev_amount);
+void regulator_irq_helper_cancel(void **handle);
 
 void *rdev_get_drvdata(struct regulator_dev *rdev);
 struct device *rdev_get_dev(struct regulator_dev *rdev);
-- 
cgit v1.2.3


From 89a6a5e56c8248a077d12424a1383a6b18ea840b Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Thu, 3 Jun 2021 08:42:12 +0300
Subject: regulator: add property parsing and callbacks to set protection
 limits

Add DT property parsing code and setting callback for regulator over/under
voltage, over-current and temperature error limits.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/e7b8007ba9eae7076178bf3363fb942ccb1cc9a5.1622628334.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c                  | 122 +++++++++++++++++++++++++++++-
 drivers/regulator/of_regulator.c          |  58 ++++++++++++++
 drivers/regulator/qcom-labibb-regulator.c |  10 ++-
 drivers/regulator/qcom_spmi-regulator.c   |   6 +-
 drivers/regulator/stpmic1_regulator.c     |  20 ++++-
 include/linux/regulator/driver.h          |  41 +++++++++-
 include/linux/regulator/machine.h         |  26 +++++++
 7 files changed, 274 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 85b6d3960369..92fe05178249 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1305,6 +1305,52 @@ static int machine_constraints_current(struct regulator_dev *rdev,
 
 static int _regulator_do_enable(struct regulator_dev *rdev);
 
+static int notif_set_limit(struct regulator_dev *rdev,
+			   int (*set)(struct regulator_dev *, int, int, bool),
+			   int limit, int severity)
+{
+	bool enable;
+
+	if (limit == REGULATOR_NOTIF_LIMIT_DISABLE) {
+		enable = false;
+		limit = 0;
+	} else {
+		enable = true;
+	}
+
+	if (limit == REGULATOR_NOTIF_LIMIT_ENABLE)
+		limit = 0;
+
+	return set(rdev, limit, severity, enable);
+}
+
+static int handle_notify_limits(struct regulator_dev *rdev,
+			int (*set)(struct regulator_dev *, int, int, bool),
+			struct notification_limit *limits)
+{
+	int ret = 0;
+
+	if (!set)
+		return -EOPNOTSUPP;
+
+	if (limits->prot)
+		ret = notif_set_limit(rdev, set, limits->prot,
+				      REGULATOR_SEVERITY_PROT);
+	if (ret)
+		return ret;
+
+	if (limits->err)
+		ret = notif_set_limit(rdev, set, limits->err,
+				      REGULATOR_SEVERITY_ERR);
+	if (ret)
+		return ret;
+
+	if (limits->warn)
+		ret = notif_set_limit(rdev, set, limits->warn,
+				      REGULATOR_SEVERITY_WARN);
+
+	return ret;
+}
 /**
  * set_machine_constraints - sets regulator constraints
  * @rdev: regulator source
@@ -1390,9 +1436,27 @@ static int set_machine_constraints(struct regulator_dev *rdev)
 		}
 	}
 
+	/*
+	 * Existing logic does not warn if over_current_protection is given as
+	 * a constraint but driver does not support that. I think we should
+	 * warn about this type of issues as it is possible someone changes
+	 * PMIC on board to another type - and the another PMIC's driver does
+	 * not support setting protection. Board composer may happily believe
+	 * the DT limits are respected - especially if the new PMIC HW also
+	 * supports protection but the driver does not. I won't change the logic
+	 * without hearing more experienced opinion on this though.
+	 *
+	 * If warning is seen as a good idea then we can merge handling the
+	 * over-curret protection and detection and get rid of this special
+	 * handling.
+	 */
 	if (rdev->constraints->over_current_protection
 		&& ops->set_over_current_protection) {
-		ret = ops->set_over_current_protection(rdev);
+		int lim = rdev->constraints->over_curr_limits.prot;
+
+		ret = ops->set_over_current_protection(rdev, lim,
+						       REGULATOR_SEVERITY_PROT,
+						       true);
 		if (ret < 0) {
 			rdev_err(rdev, "failed to set over current protection: %pe\n",
 				 ERR_PTR(ret));
@@ -1400,6 +1464,62 @@ static int set_machine_constraints(struct regulator_dev *rdev)
 		}
 	}
 
+	if (rdev->constraints->over_current_detection)
+		ret = handle_notify_limits(rdev,
+					   ops->set_over_current_protection,
+					   &rdev->constraints->over_curr_limits);
+	if (ret) {
+		if (ret != -EOPNOTSUPP) {
+			rdev_err(rdev, "failed to set over current limits: %pe\n",
+				 ERR_PTR(ret));
+			return ret;
+		}
+		rdev_warn(rdev,
+			  "IC does not support requested over-current limits\n");
+	}
+
+	if (rdev->constraints->over_voltage_detection)
+		ret = handle_notify_limits(rdev,
+					   ops->set_over_voltage_protection,
+					   &rdev->constraints->over_voltage_limits);
+	if (ret) {
+		if (ret != -EOPNOTSUPP) {
+			rdev_err(rdev, "failed to set over voltage limits %pe\n",
+				 ERR_PTR(ret));
+			return ret;
+		}
+		rdev_warn(rdev,
+			  "IC does not support requested over voltage limits\n");
+	}
+
+	if (rdev->constraints->under_voltage_detection)
+		ret = handle_notify_limits(rdev,
+					   ops->set_under_voltage_protection,
+					   &rdev->constraints->under_voltage_limits);
+	if (ret) {
+		if (ret != -EOPNOTSUPP) {
+			rdev_err(rdev, "failed to set under voltage limits %pe\n",
+				 ERR_PTR(ret));
+			return ret;
+		}
+		rdev_warn(rdev,
+			  "IC does not support requested under voltage limits\n");
+	}
+
+	if (rdev->constraints->over_temp_detection)
+		ret = handle_notify_limits(rdev,
+					   ops->set_thermal_protection,
+					   &rdev->constraints->temp_limits);
+	if (ret) {
+		if (ret != -EOPNOTSUPP) {
+			rdev_err(rdev, "failed to set temperature limits %pe\n",
+				 ERR_PTR(ret));
+			return ret;
+		}
+		rdev_warn(rdev,
+			  "IC does not support requested temperature limits\n");
+	}
+
 	if (rdev->constraints->active_discharge && ops->set_active_discharge) {
 		bool ad_state = (rdev->constraints->active_discharge ==
 			      REGULATOR_ACTIVE_DISCHARGE_ENABLE) ? true : false;
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 49f6c05fee34..f54d4f176882 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -21,6 +21,62 @@ static const char *const regulator_states[PM_SUSPEND_MAX + 1] = {
 	[PM_SUSPEND_MAX]	= "regulator-state-disk",
 };
 
+static void fill_limit(int *limit, int val)
+{
+	if (val)
+		if (val == 1)
+			*limit = REGULATOR_NOTIF_LIMIT_ENABLE;
+		else
+			*limit = val;
+	else
+		*limit = REGULATOR_NOTIF_LIMIT_DISABLE;
+}
+
+static void of_get_regulator_prot_limits(struct device_node *np,
+				struct regulation_constraints *constraints)
+{
+	u32 pval;
+	int i;
+	static const char *const props[] = {
+		"regulator-oc-%s-microamp",
+		"regulator-ov-%s-microvolt",
+		"regulator-temp-%s-kelvin",
+		"regulator-uv-%s-microvolt",
+	};
+	struct notification_limit *limits[] = {
+		&constraints->over_curr_limits,
+		&constraints->over_voltage_limits,
+		&constraints->temp_limits,
+		&constraints->under_voltage_limits,
+	};
+	bool set[4] = {0};
+
+	/* Protection limits: */
+	for (i = 0; i < ARRAY_SIZE(props); i++) {
+		char prop[255];
+		bool found;
+		int j;
+		static const char *const lvl[] = {
+			"protection", "error", "warn"
+		};
+		int *l[] = {
+			&limits[i]->prot, &limits[i]->err, &limits[i]->warn,
+		};
+
+		for (j = 0; j < ARRAY_SIZE(lvl); j++) {
+			snprintf(prop, 255, props[i], lvl[j]);
+			found = !of_property_read_u32(np, prop, &pval);
+			if (found)
+				fill_limit(l[j], pval);
+			set[i] |= found;
+		}
+	}
+	constraints->over_current_detection = set[0];
+	constraints->over_voltage_detection = set[1];
+	constraints->over_temp_detection = set[2];
+	constraints->under_voltage_detection = set[3];
+}
+
 static int of_get_regulation_constraints(struct device *dev,
 					struct device_node *np,
 					struct regulator_init_data **init_data,
@@ -188,6 +244,8 @@ static int of_get_regulation_constraints(struct device *dev,
 	constraints->over_current_protection = of_property_read_bool(np,
 					"regulator-over-current-protection");
 
+	of_get_regulator_prot_limits(np, constraints);
+
 	for (i = 0; i < ARRAY_SIZE(regulator_states); i++) {
 		switch (i) {
 		case PM_SUSPEND_MEM:
diff --git a/drivers/regulator/qcom-labibb-regulator.c b/drivers/regulator/qcom-labibb-regulator.c
index de25e3279b4b..b3da0dc58782 100644
--- a/drivers/regulator/qcom-labibb-regulator.c
+++ b/drivers/regulator/qcom-labibb-regulator.c
@@ -307,13 +307,21 @@ end:
 	return IRQ_HANDLED;
 }
 
-static int qcom_labibb_set_ocp(struct regulator_dev *rdev)
+static int qcom_labibb_set_ocp(struct regulator_dev *rdev, int lim,
+			       int severity, bool enable)
 {
 	struct labibb_regulator *vreg = rdev_get_drvdata(rdev);
 	char *ocp_irq_name;
 	u32 irq_flags = IRQF_ONESHOT;
 	int irq_trig_low, ret;
 
+	/*
+	 * labibb supports only protection - and does not support setting
+	 * limit. Furthermore, we don't support disabling protection.
+	 */
+	if (lim || severity != REGULATOR_SEVERITY_PROT || !enable)
+		return -EINVAL;
+
 	/* If there is no OCP interrupt, there's nothing to set */
 	if (vreg->ocp_irq <= 0)
 		return -EINVAL;
diff --git a/drivers/regulator/qcom_spmi-regulator.c b/drivers/regulator/qcom_spmi-regulator.c
index 95677c51c1fa..41424a3366d0 100644
--- a/drivers/regulator/qcom_spmi-regulator.c
+++ b/drivers/regulator/qcom_spmi-regulator.c
@@ -595,11 +595,15 @@ static int spmi_regulator_vs_enable(struct regulator_dev *rdev)
 	return regulator_enable_regmap(rdev);
 }
 
-static int spmi_regulator_vs_ocp(struct regulator_dev *rdev)
+static int spmi_regulator_vs_ocp(struct regulator_dev *rdev, int lim_uA,
+				 int severity, bool enable)
 {
 	struct spmi_regulator *vreg = rdev_get_drvdata(rdev);
 	u8 reg = SPMI_VS_OCP_OVERRIDE;
 
+	if (lim_uA || !enable || severity != REGULATOR_SEVERITY_PROT)
+		return -EINVAL;
+
 	return spmi_vreg_write(vreg, SPMI_VS_REG_OCP, &reg, 1);
 }
 
diff --git a/drivers/regulator/stpmic1_regulator.c b/drivers/regulator/stpmic1_regulator.c
index cf10fdb72e32..2d7597c76e4a 100644
--- a/drivers/regulator/stpmic1_regulator.c
+++ b/drivers/regulator/stpmic1_regulator.c
@@ -32,7 +32,8 @@ struct stpmic1_regulator_cfg {
 
 static int stpmic1_set_mode(struct regulator_dev *rdev, unsigned int mode);
 static unsigned int stpmic1_get_mode(struct regulator_dev *rdev);
-static int stpmic1_set_icc(struct regulator_dev *rdev);
+static int stpmic1_set_icc(struct regulator_dev *rdev, int lim, int severity,
+			   bool enable);
 static unsigned int stpmic1_map_mode(unsigned int mode);
 
 enum {
@@ -491,11 +492,26 @@ static int stpmic1_set_mode(struct regulator_dev *rdev, unsigned int mode)
 				  STPMIC1_BUCK_MODE_LP, value);
 }
 
-static int stpmic1_set_icc(struct regulator_dev *rdev)
+static int stpmic1_set_icc(struct regulator_dev *rdev, int lim, int severity,
+			   bool enable)
 {
 	struct stpmic1_regulator_cfg *cfg = rdev_get_drvdata(rdev);
 	struct regmap *regmap = rdev_get_regmap(rdev);
 
+	/*
+	 * The code seems like one bit in a register controls whether OCP is
+	 * enabled. So we might be able to turn it off here is if that
+	 * was requested. I won't support this because I don't have the HW.
+	 * Feel free to try and implement if you have the HW and need kernel
+	 * to disable this.
+	 *
+	 * Also, I don't know if limit can be configured or if we support
+	 * error/warning instead of protect. So I just keep existing logic
+	 * and assume no.
+	 */
+	if (lim || severity != REGULATOR_SEVERITY_PROT || !enable)
+		return -EINVAL;
+
 	/* enable switch off in case of over current */
 	return regmap_update_bits(regmap, cfg->icc_reg, cfg->icc_mask,
 				  cfg->icc_mask);
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 1d1a8951e740..4ebfaacf42b7 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -40,6 +40,15 @@ enum regulator_status {
 	REGULATOR_STATUS_UNDEFINED,
 };
 
+enum regulator_detection_severity {
+	/* Hardware shut down voltage outputs if condition is detected */
+	REGULATOR_SEVERITY_PROT,
+	/* Hardware is probably damaged/inoperable */
+	REGULATOR_SEVERITY_ERR,
+	/* Hardware is still recoverable but recovery action must be taken */
+	REGULATOR_SEVERITY_WARN,
+};
+
 /* Initialize struct linear_range for regulators */
 #define REGULATOR_LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV)	\
 {									\
@@ -78,8 +87,25 @@ enum regulator_status {
  * @get_current_limit: Get the configured limit for a current-limited regulator.
  * @set_input_current_limit: Configure an input limit.
  *
- * @set_over_current_protection: Support capability of automatically shutting
- *                               down when detecting an over current event.
+ * @set_over_current_protection: Support enabling of and setting limits for over
+ *	current situation detection. Detection can be configured for three
+ *	levels of severity.
+ *	REGULATOR_SEVERITY_PROT should automatically shut down the regulator(s).
+ *	REGULATOR_SEVERITY_ERR should indicate that over-current situation is
+ *		caused by an unrecoverable error but HW does not perform
+ *		automatic shut down.
+ *	REGULATOR_SEVERITY_WARN should indicate situation where hardware is
+ *		still believed to not be damaged but that a board sepcific
+ *		recovery action is needed. If lim_uA is 0 the limit should not
+ *		be changed but the detection should just be enabled/disabled as
+ *		is requested.
+ * @set_over_voltage_protection: Support enabling of and setting limits for over
+ *	voltage situation detection. Detection can be configured for same
+ *	severities as over current protection.
+ * @set_under_voltage_protection: Support enabling of and setting limits for
+ *	under situation detection.
+ * @set_thermal_protection: Support enabling of and setting limits for over
+ *	temperature situation detection.
  *
  * @set_active_discharge: Set active discharge enable/disable of regulators.
  *
@@ -143,8 +169,15 @@ struct regulator_ops {
 	int (*get_current_limit) (struct regulator_dev *);
 
 	int (*set_input_current_limit) (struct regulator_dev *, int lim_uA);
-	int (*set_over_current_protection) (struct regulator_dev *);
-	int (*set_active_discharge) (struct regulator_dev *, bool enable);
+	int (*set_over_current_protection)(struct regulator_dev *, int lim_uA,
+					   int severity, bool enable);
+	int (*set_over_voltage_protection)(struct regulator_dev *, int lim_uV,
+					   int severity, bool enable);
+	int (*set_under_voltage_protection)(struct regulator_dev *, int lim_uV,
+					    int severity, bool enable);
+	int (*set_thermal_protection)(struct regulator_dev *, int lim,
+				      int severity, bool enable);
+	int (*set_active_discharge)(struct regulator_dev *, bool enable);
 
 	/* enable/disable regulator */
 	int (*enable) (struct regulator_dev *);
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 8a56f033b6cd..68b4a514a410 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -83,6 +83,14 @@ struct regulator_state {
 	bool changeable;
 };
 
+#define REGULATOR_NOTIF_LIMIT_DISABLE -1
+#define REGULATOR_NOTIF_LIMIT_ENABLE -2
+struct notification_limit {
+	int prot;
+	int err;
+	int warn;
+};
+
 /**
  * struct regulation_constraints - regulator operating constraints.
  *
@@ -100,6 +108,11 @@ struct regulator_state {
  * @ilim_uA: Maximum input current.
  * @system_load: Load that isn't captured by any consumer requests.
  *
+ * @over_curr_limits:		Limits for acting on over current.
+ * @over_voltage_limits:	Limits for acting on over voltage.
+ * @under_voltage_limits:	Limits for acting on under voltage.
+ * @temp_limits:		Limits for acting on over temperature.
+
  * @max_spread: Max possible spread between coupled regulators
  * @max_uV_step: Max possible step change in voltage
  * @valid_modes_mask: Mask of modes which may be configured by consumers.
@@ -116,6 +129,11 @@ struct regulator_state {
  * @pull_down: Enable pull down when regulator is disabled.
  * @over_current_protection: Auto disable on over current event.
  *
+ * @over_current_detection: Configure over current limits.
+ * @over_voltage_detection: Configure over voltage limits.
+ * @under_voltage_detection: Configure under voltage limits.
+ * @over_temp_detection: Configure over temperature limits.
+ *
  * @input_uV: Input voltage for regulator when supplied by another regulator.
  *
  * @state_disk: State for regulator when system is suspended in disk mode.
@@ -172,6 +190,10 @@ struct regulation_constraints {
 	struct regulator_state state_disk;
 	struct regulator_state state_mem;
 	struct regulator_state state_standby;
+	struct notification_limit over_curr_limits;
+	struct notification_limit over_voltage_limits;
+	struct notification_limit under_voltage_limits;
+	struct notification_limit temp_limits;
 	suspend_state_t initial_state; /* suspend state to set at init */
 
 	/* mode to set on startup */
@@ -193,6 +215,10 @@ struct regulation_constraints {
 	unsigned soft_start:1;	/* ramp voltage slowly */
 	unsigned pull_down:1;	/* pull down resistor when regulator off */
 	unsigned over_current_protection:1; /* auto disable on over current */
+	unsigned over_current_detection:1; /* notify on over current */
+	unsigned over_voltage_detection:1; /* notify on over voltage */
+	unsigned under_voltage_detection:1; /* notify on under voltage */
+	unsigned over_temp_detection:1; /* notify on over temperature */
 };
 
 /**
-- 
cgit v1.2.3


From 9f2470fbc4cb4583c080bb729a998933ba61aca4 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Mon, 14 Jun 2021 19:13:35 -0700
Subject: skmsg: Improve udp_bpf_recvmsg() accuracy

I tried to reuse sk_msg_wait_data() for different protocols,
but it turns out it can not be simply reused. For example,
UDP actually uses two queues to receive skb:
udp_sk(sk)->reader_queue and sk->sk_receive_queue. So we have
to check both of them to know whether we have received any
packet.

Also, UDP does not lock the sock during BH Rx path, it makes
no sense for its ->recvmsg() to lock the sock. It is always
possible for ->recvmsg() to be called before packets actually
arrive in the receive queue, we just use best effort to make
it accurate here.

Fixes: 1f5be6b3b063 ("udp: Implement udp_bpf_recvmsg() for sockmap")
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210615021342.7416-2-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h |  2 --
 net/core/skmsg.c      | 23 -----------------------
 net/ipv4/tcp_bpf.c    | 24 +++++++++++++++++++++++-
 net/ipv4/udp_bpf.c    | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 65 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index aba0f0f429be..e3d080c299f6 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -126,8 +126,6 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 			      struct sk_msg *msg, u32 bytes);
 int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 			     struct sk_msg *msg, u32 bytes);
-int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
-		     long timeo, int *err);
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		   int len, int flags);
 
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 43ce17a6a585..f9a81b314e4c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,29 +399,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
 
-int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
-		     long timeo, int *err)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	int ret = 0;
-
-	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		return 1;
-
-	if (!timeo)
-		return ret;
-
-	add_wait_queue(sk_sleep(sk), &wait);
-	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	ret = sk_wait_event(sk, &timeo,
-			    !list_empty(&psock->ingress_msg) ||
-			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
-	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	remove_wait_queue(sk_sleep(sk), &wait);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(sk_msg_wait_data);
-
 /* Receive sk_msg from psock->ingress_msg to @msg. */
 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		   int len, int flags)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ad9d17923fc5..bb49b52d7be8 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -163,6 +163,28 @@ static bool tcp_bpf_stream_read(const struct sock *sk)
 	return !empty;
 }
 
+static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+			     long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = sk_wait_event(sk, &timeo,
+			    !list_empty(&psock->ingress_msg) ||
+			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    int nonblock, int flags, int *addr_len)
 {
@@ -188,7 +210,7 @@ msg_bytes_ready:
 		long timeo;
 
 		timeo = sock_rcvtimeo(sk, nonblock);
-		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+		data = tcp_msg_wait_data(sk, psock, flags, timeo, &err);
 		if (data) {
 			if (!sk_psock_queue_empty(psock))
 				goto msg_bytes_ready;
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 954c4591a6fd..565a70040c57 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -21,6 +21,45 @@ static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len);
 }
 
+static bool udp_sk_has_data(struct sock *sk)
+{
+	return !skb_queue_empty(&udp_sk(sk)->reader_queue) ||
+	       !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static bool psock_has_data(struct sk_psock *psock)
+{
+	return !skb_queue_empty(&psock->ingress_skb) ||
+	       !sk_psock_queue_empty(psock);
+}
+
+#define udp_msg_has_data(__sk, __psock)	\
+		({ udp_sk_has_data(__sk) || psock_has_data(__psock); })
+
+static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+			     long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = udp_msg_has_data(sk, psock);
+	if (!ret) {
+		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		ret = udp_msg_has_data(sk, psock);
+	}
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
 static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int nonblock, int flags, int *addr_len)
 {
@@ -34,8 +73,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	if (unlikely(!psock))
 		return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 
-	lock_sock(sk);
-	if (sk_psock_queue_empty(psock)) {
+	if (!psock_has_data(psock)) {
 		ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 		goto out;
 	}
@@ -47,9 +85,9 @@ msg_bytes_ready:
 		long timeo;
 
 		timeo = sock_rcvtimeo(sk, nonblock);
-		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+		data = udp_msg_wait_data(sk, psock, flags, timeo, &err);
 		if (data) {
-			if (!sk_psock_queue_empty(psock))
+			if (psock_has_data(psock))
 				goto msg_bytes_ready;
 			ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 			goto out;
@@ -62,7 +100,6 @@ msg_bytes_ready:
 	}
 	ret = copied;
 out:
-	release_sock(sk);
 	sk_psock_put(sk, psock);
 	return ret;
 }
-- 
cgit v1.2.3


From 67a066b35765d13a55a56edd9b1f54dee9e441e1 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Thu, 10 Jun 2021 19:23:13 +0300
Subject: of: reserved-memory: Add stub for RESERVEDMEM_OF_DECLARE()

The reserved-memory Kconfig could be disabled when drivers are
compile-tested. In this case RESERVEDMEM_OF_DECLARE() produces a
noisy warning about the orphaned __reservedmem_of_table section.
Add the missing stub that fixes the warning. In particular this is
needed for compile-testing of NVIDIA Tegra210 memory driver which
uses reserved-memory.

Reported-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Link: https://lore.kernel.org/r/20210610162313.20942-1-digetx@gmail.com
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h              | 11 +++++++----
 include/linux/of_reserved_mem.h |  8 ++++++--
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index d8db8d3592fd..9c2e71e202d1 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1329,6 +1329,12 @@ static inline int of_get_available_child_count(const struct device_node *np)
 	return num;
 }
 
+#define _OF_DECLARE_STUB(table, name, compat, fn, fn_type)		\
+	static const struct of_device_id __of_table_##name		\
+		__attribute__((unused))					\
+		 = { .compatible = compat,				\
+		     .data = (fn == (fn_type)NULL) ? fn : fn }
+
 #if defined(CONFIG_OF) && !defined(MODULE)
 #define _OF_DECLARE(table, name, compat, fn, fn_type)			\
 	static const struct of_device_id __of_table_##name		\
@@ -1338,10 +1344,7 @@ static inline int of_get_available_child_count(const struct device_node *np)
 		     .data = (fn == (fn_type)NULL) ? fn : fn  }
 #else
 #define _OF_DECLARE(table, name, compat, fn, fn_type)			\
-	static const struct of_device_id __of_table_##name		\
-		__attribute__((unused))					\
-		 = { .compatible = compat,				\
-		     .data = (fn == (fn_type)NULL) ? fn : fn }
+	_OF_DECLARE_STUB(table, name, compat, fn, fn_type)
 #endif
 
 typedef int (*of_init_fn_2)(struct device_node *, struct device_node *);
diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h
index 76e4a0fffba4..4de2a24cadc9 100644
--- a/include/linux/of_reserved_mem.h
+++ b/include/linux/of_reserved_mem.h
@@ -27,11 +27,11 @@ struct reserved_mem_ops {
 
 typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem);
 
+#ifdef CONFIG_OF_RESERVED_MEM
+
 #define RESERVEDMEM_OF_DECLARE(name, compat, init)			\
 	_OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn)
 
-#ifdef CONFIG_OF_RESERVED_MEM
-
 int of_reserved_mem_device_init_by_idx(struct device *dev,
 				       struct device_node *np, int idx);
 int of_reserved_mem_device_init_by_name(struct device *dev,
@@ -41,6 +41,10 @@ void of_reserved_mem_device_release(struct device *dev);
 
 struct reserved_mem *of_reserved_mem_lookup(struct device_node *np);
 #else
+
+#define RESERVEDMEM_OF_DECLARE(name, compat, init)			\
+	_OF_DECLARE_STUB(reservedmem, name, compat, init, reservedmem_of_init_fn)
+
 static inline int of_reserved_mem_device_init_by_idx(struct device *dev,
 					struct device_node *np, int idx)
 {
-- 
cgit v1.2.3


From 0d9f837c6958a4c14e6bcb5c5edf6c851d65f507 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 17 Jun 2021 16:22:13 +0200
Subject: driver core: Export device_driver_attach()

This is intended as a replacement API for device_bind_driver(). It has at
least the following benefits:

- Internal locking. Few of the users of device_bind_driver() follow the
  locking rules

- Calls device driver probe() internally. Notably this means that devm
  support for probe works correctly as probe() error will call
  devres_release_all()

- struct device_driver -> dev_groups is supported

- Simplified calling convention, no need to manually call probe().

The general usage is for situations that already know what driver to bind
and need to ensure the bind is synchronized with other logic. Call
device_driver_attach() after device_add().

If probe() returns a failure then this will be preserved up through to the
error return of device_driver_attach().

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20210617142218.1877096-6-hch@lst.de
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/base/base.h    | 1 -
 drivers/base/dd.c      | 3 +++
 include/linux/device.h | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index e5f9b7e656c3..404db83ee5ec 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -152,7 +152,6 @@ extern int driver_add_groups(struct device_driver *drv,
 			     const struct attribute_group **groups);
 extern void driver_remove_groups(struct device_driver *drv,
 				 const struct attribute_group **groups);
-int device_driver_attach(struct device_driver *drv, struct device *dev);
 void device_driver_detach(struct device *dev);
 
 extern char *make_class_name(const char *name, struct kobject *kobj);
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 1d8012459587..daeb9b5763ae 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -471,6 +471,8 @@ static void driver_sysfs_remove(struct device *dev)
  * (It is ok to call with no other effort from a driver's probe() method.)
  *
  * This function must be called with the device lock held.
+ *
+ * Callers should prefer to use device_driver_attach() instead.
  */
 int device_bind_driver(struct device *dev)
 {
@@ -1065,6 +1067,7 @@ int device_driver_attach(struct device_driver *drv, struct device *dev)
 		return -EAGAIN;
 	return ret;
 }
+EXPORT_SYMBOL_GPL(device_driver_attach);
 
 static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie)
 {
diff --git a/include/linux/device.h b/include/linux/device.h
index f1a00040fa53..d8b9c9e7d493 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -845,6 +845,8 @@ static inline void *dev_get_platdata(const struct device *dev)
  * Manual binding of a device to driver. See drivers/base/bus.c
  * for information on use.
  */
+int __must_check device_driver_attach(struct device_driver *drv,
+				      struct device *dev);
 int __must_check device_bind_driver(struct device *dev);
 void device_release_driver(struct device *dev);
 int  __must_check device_attach(struct device *dev);
-- 
cgit v1.2.3


From 88a21f265ce50a17e6e71e3fb4467625cf234c5a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 17 Jun 2021 16:22:15 +0200
Subject: vfio/mdev: Allow the mdev_parent_ops to specify the device driver to
 bind

This allows a mdev driver to opt out of using vfio_mdev.c, instead the
driver will provide a 'struct mdev_driver' and register directly with the
driver core.

Much of mdev_parent_ops becomes unused in this mode:
- create()/remove() are done via the mdev_driver probe()/remove()
- mdev_attr_groups becomes mdev_driver driver.dev_groups
- Wrapper function callbacks are replaced with the same ones from
  struct vfio_device_ops

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Link: https://lore.kernel.org/r/20210617142218.1877096-8-hch@lst.de
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst | 35 ++++++++---------------
 drivers/vfio/mdev/mdev_core.c                     | 30 +++++++++++++------
 drivers/vfio/mdev/mdev_driver.c                   | 10 +++++++
 include/linux/mdev.h                              |  2 ++
 4 files changed, 46 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index 1779b85f014e..9f26079cacae 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -93,7 +93,7 @@ interfaces:
 Registration Interface for a Mediated Bus Driver
 ------------------------------------------------
 
-The registration interface for a mediated bus driver provides the following
+The registration interface for a mediated device driver provides the following
 structure to represent a mediated device's driver::
 
      /*
@@ -136,37 +136,26 @@ The structures in the mdev_parent_ops structure are as follows:
 * dev_attr_groups: attributes of the parent device
 * mdev_attr_groups: attributes of the mediated device
 * supported_config: attributes to define supported configurations
+* device_driver: device driver to bind for mediated device instances
 
-The functions in the mdev_parent_ops structure are as follows:
+The mdev_parent_ops also still has various functions pointers.  Theses exist
+for historical reasons only and shall not be used for new drivers.
 
-* create: allocate basic resources in a driver for a mediated device
-* remove: free resources in a driver when a mediated device is destroyed
-
-(Note that mdev-core provides no implicit serialization of create/remove
-callbacks per mdev parent device, per mdev type, or any other categorization.
-Vendor drivers are expected to be fully asynchronous in this respect or
-provide their own internal resource protection.)
-
-The callbacks in the mdev_parent_ops structure are as follows:
-
-* open: open callback of mediated device
-* close: close callback of mediated device
-* ioctl: ioctl callback of mediated device
-* read : read emulation callback
-* write: write emulation callback
-* mmap: mmap emulation callback
-
-A driver should use the mdev_parent_ops structure in the function call to
-register itself with the mdev core driver::
+When a driver wants to add the GUID creation sysfs to an existing device it has
+probe'd to then it should call::
 
 	extern int  mdev_register_device(struct device *dev,
 	                                 const struct mdev_parent_ops *ops);
 
-However, the mdev_parent_ops structure is not required in the function call
-that a driver should use to unregister itself with the mdev core driver::
+This will provide the 'mdev_supported_types/XX/create' files which can then be
+used to trigger the creation of a mdev_device. The created mdev_device will be
+attached to the specified driver.
+
+When the driver needs to remove itself it calls::
 
 	extern void mdev_unregister_device(struct device *dev);
 
+Which will unbind and destroy all the created mdevs and remove the sysfs files.
 
 Mediated Device Management Interface Through sysfs
 ==================================================
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index ff8c1a845166..e4581ec093a6 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -94,9 +94,11 @@ static void mdev_device_remove_common(struct mdev_device *mdev)
 	mdev_remove_sysfs_files(mdev);
 	device_del(&mdev->dev);
 	lockdep_assert_held(&parent->unreg_sem);
-	ret = parent->ops->remove(mdev);
-	if (ret)
-		dev_err(&mdev->dev, "Remove failed: err=%d\n", ret);
+	if (parent->ops->remove) {
+		ret = parent->ops->remove(mdev);
+		if (ret)
+			dev_err(&mdev->dev, "Remove failed: err=%d\n", ret);
+	}
 
 	/* Balances with device_initialize() */
 	put_device(&mdev->dev);
@@ -127,7 +129,9 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops)
 	char *envp[] = { env_string, NULL };
 
 	/* check for mandatory ops */
-	if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups)
+	if (!ops || !ops->supported_type_groups)
+		return -EINVAL;
+	if (!ops->device_driver && (!ops->create || !ops->remove))
 		return -EINVAL;
 
 	dev = get_device(dev);
@@ -256,6 +260,7 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid)
 	int ret;
 	struct mdev_device *mdev, *tmp;
 	struct mdev_parent *parent = type->parent;
+	struct mdev_driver *drv = parent->ops->device_driver;
 
 	mutex_lock(&mdev_list_lock);
 
@@ -296,14 +301,22 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid)
 		goto out_put_device;
 	}
 
-	ret = parent->ops->create(mdev);
-	if (ret)
-		goto out_unlock;
+	if (parent->ops->create) {
+		ret = parent->ops->create(mdev);
+		if (ret)
+			goto out_unlock;
+	}
 
 	ret = device_add(&mdev->dev);
 	if (ret)
 		goto out_remove;
 
+	if (!drv)
+		drv = &vfio_mdev_driver;
+	ret = device_driver_attach(&drv->driver, &mdev->dev);
+	if (ret)
+		goto out_del;
+
 	ret = mdev_create_sysfs_files(mdev);
 	if (ret)
 		goto out_del;
@@ -317,7 +330,8 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid)
 out_del:
 	device_del(&mdev->dev);
 out_remove:
-	parent->ops->remove(mdev);
+	if (parent->ops->remove)
+		parent->ops->remove(mdev);
 out_unlock:
 	up_read(&parent->unreg_sem);
 out_put_device:
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
index 041699571b7e..c368ec824e2b 100644
--- a/drivers/vfio/mdev/mdev_driver.c
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -71,10 +71,20 @@ static int mdev_remove(struct device *dev)
 	return 0;
 }
 
+static int mdev_match(struct device *dev, struct device_driver *drv)
+{
+	/*
+	 * No drivers automatically match. Drivers are only bound by explicit
+	 * device_driver_attach()
+	 */
+	return 0;
+}
+
 struct bus_type mdev_bus_type = {
 	.name		= "mdev",
 	.probe		= mdev_probe,
 	.remove		= mdev_remove,
+	.match		= mdev_match,
 };
 EXPORT_SYMBOL_GPL(mdev_bus_type);
 
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 1fb34ea394ad..3a38598c2605 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -55,6 +55,7 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype);
  * register the device to mdev module.
  *
  * @owner:		The module owner.
+ * @device_driver:	Which device driver to probe() on newly created devices
  * @dev_attr_groups:	Attributes of the parent device.
  * @mdev_attr_groups:	Attributes of the mediated device.
  * @supported_type_groups: Attributes to define supported types. It is mandatory
@@ -103,6 +104,7 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype);
  **/
 struct mdev_parent_ops {
 	struct module   *owner;
+	struct mdev_driver *device_driver;
 	const struct attribute_group **dev_attr_groups;
 	const struct attribute_group **mdev_attr_groups;
 	struct attribute_group **supported_type_groups;
-- 
cgit v1.2.3


From a3fa449ffcf5bcf9c3dddf62c11599cdc79ef54a Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Mon, 21 Jun 2021 22:08:49 +0200
Subject: net: handle ARPHRD_IP6GRE in dev_is_mac_header_xmit()

Similar to commit 3b707c3008ca ("net: dev_is_mac_header_xmit() true for
ARPHRD_RAWIP"), add ARPHRD_IP6GRE to dev_is_mac_header_xmit(), to make
ip6gre compatible with act_mirred and __bpf_redirect().

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_arp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h
index bf5c5f32c65e..b712217f7030 100644
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -48,6 +48,7 @@ static inline bool dev_is_mac_header_xmit(const struct net_device *dev)
 	case ARPHRD_TUNNEL6:
 	case ARPHRD_SIT:
 	case ARPHRD_IPGRE:
+	case ARPHRD_IP6GRE:
 	case ARPHRD_VOID:
 	case ARPHRD_NONE:
 	case ARPHRD_RAWIP:
-- 
cgit v1.2.3


From 9a1ac95a59d0724ffac2181a98b232c3f94f49f5 Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Wed, 16 Jun 2021 10:57:38 +0300
Subject: RDMA/mlx5: Refactor get_ts_format functions to simplify code

QPC, SQC and RQC timestamp formats and capabilities are always equal
because they represent general hardware support. So instead of code
duplication, let's merge them into general enum and logic.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/qp.c                    | 84 ++++++++++------------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  8 +--
 .../net/ethernet/mellanox/mlx5/core/lib/clock.h    | 10 +--
 include/linux/mlx5/mlx5_ifc.h                      | 36 ++--------
 include/linux/mlx5/qp.h                            |  4 +-
 5 files changed, 55 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 9282eb10bfae..0e3babac62db 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1173,69 +1173,59 @@ static void destroy_flow_rule_vport_sq(struct mlx5_ib_sq *sq)
 	sq->flow_rule = NULL;
 }
 
-static int get_rq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq)
+static bool fr_supported(int ts_cap)
 {
-	bool fr_supported =
-		MLX5_CAP_GEN(dev->mdev, rq_ts_format) ==
-			MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING ||
-		MLX5_CAP_GEN(dev->mdev, rq_ts_format) ==
-			MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME;
+	return ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING ||
+	       ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME;
+}
 
-	if (send_cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) {
-		if (!fr_supported) {
-			mlx5_ib_dbg(dev, "Free running TS format is not supported\n");
+static int get_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
+			 bool fr_sup)
+{
+	if (cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) {
+		if (!fr_sup) {
+			mlx5_ib_dbg(dev,
+				    "Free running TS format is not supported\n");
 			return -EOPNOTSUPP;
 		}
-		return MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING;
+		return MLX5_TIMESTAMP_FORMAT_FREE_RUNNING;
 	}
-	return fr_supported ? MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING :
-			      MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT;
+	return fr_sup ? MLX5_TIMESTAMP_FORMAT_FREE_RUNNING :
+			MLX5_TIMESTAMP_FORMAT_DEFAULT;
+}
+
+static int get_rq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *recv_cq)
+{
+	u8 ts_cap = MLX5_CAP_GEN(dev->mdev, rq_ts_format);
+
+	return get_ts_format(dev, recv_cq, fr_supported(ts_cap));
 }
 
 static int get_sq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq)
 {
-	bool fr_supported =
-		MLX5_CAP_GEN(dev->mdev, sq_ts_format) ==
-			MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING ||
-		MLX5_CAP_GEN(dev->mdev, sq_ts_format) ==
-			MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME;
+	u8 ts_cap = MLX5_CAP_GEN(dev->mdev, sq_ts_format);
 
-	if (send_cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) {
-		if (!fr_supported) {
-			mlx5_ib_dbg(dev, "Free running TS format is not supported\n");
-			return -EOPNOTSUPP;
-		}
-		return MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING;
-	}
-	return fr_supported ? MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING :
-			      MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT;
+	return get_ts_format(dev, send_cq, fr_supported(ts_cap));
 }
 
 static int get_qp_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq,
 			    struct mlx5_ib_cq *recv_cq)
 {
-	bool fr_supported =
-		MLX5_CAP_ROCE(dev->mdev, qp_ts_format) ==
-			MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING ||
-		MLX5_CAP_ROCE(dev->mdev, qp_ts_format) ==
-			MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME;
-	int ts_format = fr_supported ? MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING :
-				       MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT;
-
-	if (recv_cq &&
-	    recv_cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)
-		ts_format = MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING;
-
-	if (send_cq &&
-	    send_cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)
-		ts_format = MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING;
-
-	if (ts_format == MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING &&
-	    !fr_supported) {
-		mlx5_ib_dbg(dev, "Free running TS format is not supported\n");
+	u8 ts_cap = MLX5_CAP_ROCE(dev->mdev, qp_ts_format);
+	bool fr_sup = fr_supported(ts_cap);
+	u8 default_ts = fr_sup ? MLX5_TIMESTAMP_FORMAT_FREE_RUNNING :
+				 MLX5_TIMESTAMP_FORMAT_DEFAULT;
+	int send_ts_format =
+		send_cq ? get_ts_format(dev, send_cq, fr_sup) :
+			  default_ts;
+	int recv_ts_format =
+		recv_cq ? get_ts_format(dev, recv_cq, fr_sup) :
+			  default_ts;
+
+	if (send_ts_format < 0 || recv_ts_format < 0)
 		return -EOPNOTSUPP;
-	}
-	return ts_format;
+
+	return send_ts_format == default_ts ? recv_ts_format : send_ts_format;
 }
 
 static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ec6bafe7a2e5..039427a2a6c1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -644,8 +644,8 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 		return -ENOMEM;
 
 	ts_format = mlx5_is_real_time_rq(mdev) ?
-		    MLX5_RQC_TIMESTAMP_FORMAT_REAL_TIME :
-		    MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING;
+			    MLX5_TIMESTAMP_FORMAT_REAL_TIME :
+			    MLX5_TIMESTAMP_FORMAT_FREE_RUNNING;
 	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
 	wq  = MLX5_ADDR_OF(rqc, rqc, wq);
 
@@ -1188,8 +1188,8 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
 		return -ENOMEM;
 
 	ts_format = mlx5_is_real_time_sq(mdev) ?
-		    MLX5_SQC_TIMESTAMP_FORMAT_REAL_TIME :
-		    MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING;
+			    MLX5_TIMESTAMP_FORMAT_REAL_TIME :
+			    MLX5_TIMESTAMP_FORMAT_FREE_RUNNING;
 	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
 	wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
index ceae6bc378e0..bd95b9f8d143 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
@@ -37,16 +37,18 @@ static inline bool mlx5_is_real_time_rq(struct mlx5_core_dev *mdev)
 {
 	u8 rq_ts_format_cap = MLX5_CAP_GEN(mdev, rq_ts_format);
 
-	return (rq_ts_format_cap == MLX5_RQ_TIMESTAMP_FORMAT_CAP_REAL_TIME  ||
-		rq_ts_format_cap == MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME);
+	return (rq_ts_format_cap == MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME ||
+		rq_ts_format_cap ==
+			MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME);
 }
 
 static inline bool mlx5_is_real_time_sq(struct mlx5_core_dev *mdev)
 {
 	u8 sq_ts_format_cap = MLX5_CAP_GEN(mdev, sq_ts_format);
 
-	return (sq_ts_format_cap == MLX5_SQ_TIMESTAMP_FORMAT_CAP_REAL_TIME  ||
-		sq_ts_format_cap == MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME);
+	return (sq_ts_format_cap == MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME ||
+		sq_ts_format_cap ==
+			MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME);
 }
 
 typedef ktime_t (*cqe_ts_to_ns)(struct mlx5_clock *, u64);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index eb86e80e4643..668e1d016066 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -953,9 +953,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 };
 
 enum {
-	MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING               = 0x0,
-	MLX5_QP_TIMESTAMP_FORMAT_CAP_REAL_TIME                  = 0x1,
-	MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2,
+	MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING               = 0x0,
+	MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME                  = 0x1,
+	MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2,
 };
 
 struct mlx5_ifc_roce_cap_bits {
@@ -1296,18 +1296,6 @@ enum {
 	MLX5_STEERING_FORMAT_CONNECTX_6DX = 1,
 };
 
-enum {
-	MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING               = 0x0,
-	MLX5_SQ_TIMESTAMP_FORMAT_CAP_REAL_TIME                  = 0x1,
-	MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2,
-};
-
-enum {
-	MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING               = 0x0,
-	MLX5_RQ_TIMESTAMP_FORMAT_CAP_REAL_TIME                  = 0x1,
-	MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2,
-};
-
 struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_0[0x1f];
 	u8         vhca_resource_manager[0x1];
@@ -2944,9 +2932,9 @@ enum {
 };
 
 enum {
-	MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0,
-	MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT      = 0x1,
-	MLX5_QPC_TIMESTAMP_FORMAT_REAL_TIME    = 0x2,
+	MLX5_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0,
+	MLX5_TIMESTAMP_FORMAT_DEFAULT      = 0x1,
+	MLX5_TIMESTAMP_FORMAT_REAL_TIME    = 0x2,
 };
 
 struct mlx5_ifc_qpc_bits {
@@ -3396,12 +3384,6 @@ enum {
 	MLX5_SQC_STATE_ERR  = 0x3,
 };
 
-enum {
-	MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0,
-	MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT      = 0x1,
-	MLX5_SQC_TIMESTAMP_FORMAT_REAL_TIME    = 0x2,
-};
-
 struct mlx5_ifc_sqc_bits {
 	u8         rlky[0x1];
 	u8         cd_master[0x1];
@@ -3507,12 +3489,6 @@ enum {
 	MLX5_RQC_STATE_ERR  = 0x3,
 };
 
-enum {
-	MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0,
-	MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT      = 0x1,
-	MLX5_RQC_TIMESTAMP_FORMAT_REAL_TIME    = 0x2,
-};
-
 struct mlx5_ifc_rqc_bits {
 	u8         rlky[0x1];
 	u8	   delay_drop_en[0x1];
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index b7deb790f257..61e48d459b23 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -550,8 +550,8 @@ static inline const char *mlx5_qp_state_str(int state)
 static inline int mlx5_get_qp_default_ts(struct mlx5_core_dev *dev)
 {
 	return !MLX5_CAP_ROCE(dev, qp_ts_format) ?
-		       MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING :
-		       MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT;
+		       MLX5_TIMESTAMP_FORMAT_FREE_RUNNING :
+		       MLX5_TIMESTAMP_FORMAT_DEFAULT;
 }
 
 #endif /* MLX5_QP_H */
-- 
cgit v1.2.3


From 766c268bc6d39b8124e50d075a36b8a3305bc8e2 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 17 Jun 2021 11:56:50 +0206
Subject: lib/dump_stack: move cpu lock to printk.c

dump_stack() implements its own cpu-reentrant spinning lock to
best-effort serialize stack traces in the printk log. However,
there are other functions (such as show_regs()) that can also
benefit from this serialization.

Move the cpu-reentrant spinning lock (cpu lock) into new helper
functions printk_cpu_lock_irqsave()/printk_cpu_unlock_irqrestore()
so that it is available for others as well. For !CONFIG_SMP the
cpu lock is a NOP.

Note that having multiple cpu locks in the system can easily
lead to deadlock. Code needing a cpu lock should use the
printk cpu lock, since the printk cpu lock could be acquired
from any code and any context.

Also note that it is not necessary for a cpu lock to disable
interrupts. However, in upcoming work this cpu lock will be used
for emergency tasks (for example, atomic consoles during kernel
crashes) and any interruptions while holding the cpu lock should
be avoided if possible.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
[pmladek@suse.com: Backported on top of 5.13-rc1.]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20210617095051.4808-2-john.ogness@linutronix.de
---
 include/linux/printk.h | 41 ++++++++++++++++++++++++++++++
 kernel/printk/printk.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/dump_stack.c       | 38 ++-------------------------
 3 files changed, 112 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index fe7eb2351610..1790a5521fd9 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -282,6 +282,47 @@ static inline void printk_safe_flush_on_panic(void)
 }
 #endif
 
+#ifdef CONFIG_SMP
+extern int __printk_cpu_trylock(void);
+extern void __printk_wait_on_cpu_lock(void);
+extern void __printk_cpu_unlock(void);
+
+/**
+ * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning
+ *                             lock and disable interrupts.
+ * @flags: Stack-allocated storage for saving local interrupt state,
+ *         to be passed to printk_cpu_unlock_irqrestore().
+ *
+ * If the lock is owned by another CPU, spin until it becomes available.
+ * Interrupts are restored while spinning.
+ */
+#define printk_cpu_lock_irqsave(flags)		\
+	for (;;) {				\
+		local_irq_save(flags);		\
+		if (__printk_cpu_trylock())	\
+			break;			\
+		local_irq_restore(flags);	\
+		__printk_wait_on_cpu_lock();	\
+	}
+
+/**
+ * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning
+ *                                  lock and restore interrupts.
+ * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave().
+ */
+#define printk_cpu_unlock_irqrestore(flags)	\
+	do {					\
+		__printk_cpu_unlock();		\
+		local_irq_restore(flags);	\
+	} while (0)				\
+
+#else
+
+#define printk_cpu_lock_irqsave(flags) ((void)flags)
+#define printk_cpu_unlock_irqrestore(flags) ((void)flags)
+
+#endif /* CONFIG_SMP */
+
 extern int kptr_restrict;
 
 /**
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 421c35571797..9dfad0efb67f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3531,3 +3531,72 @@ void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
 EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
 
 #endif
+
+#ifdef CONFIG_SMP
+static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1);
+static atomic_t printk_cpulock_nested = ATOMIC_INIT(0);
+
+/**
+ * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant
+ *                               spinning lock is not owned by any CPU.
+ *
+ * Context: Any context.
+ */
+void __printk_wait_on_cpu_lock(void)
+{
+	do {
+		cpu_relax();
+	} while (atomic_read(&printk_cpulock_owner) != -1);
+}
+EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
+
+/**
+ * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant
+ *                          spinning lock.
+ *
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately.
+ *
+ * Context: Any context. Expects interrupts to be disabled.
+ * Return: 1 on success, otherwise 0.
+ */
+int __printk_cpu_trylock(void)
+{
+	int cpu;
+	int old;
+
+	cpu = smp_processor_id();
+
+	old = atomic_cmpxchg(&printk_cpulock_owner, -1, cpu);
+	if (old == -1) {
+		/* This CPU is now the owner. */
+		return 1;
+	} else if (old == cpu) {
+		/* This CPU is already the owner. */
+		atomic_inc(&printk_cpulock_nested);
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__printk_cpu_trylock);
+
+/**
+ * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock.
+ *
+ * The calling processor must be the owner of the lock.
+ *
+ * Context: Any context. Expects interrupts to be disabled.
+ */
+void __printk_cpu_unlock(void)
+{
+	if (atomic_read(&printk_cpulock_nested)) {
+		atomic_dec(&printk_cpulock_nested);
+		return;
+	}
+
+	atomic_set(&printk_cpulock_owner, -1);
+}
+EXPORT_SYMBOL(__printk_cpu_unlock);
+#endif /* CONFIG_SMP */
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index f5a33b6f773f..5ebf4375fa8c 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -84,50 +84,16 @@ static void __dump_stack(void)
  *
  * Architectures can override this implementation by implementing its own.
  */
-#ifdef CONFIG_SMP
-static atomic_t dump_lock = ATOMIC_INIT(-1);
-
 asmlinkage __visible void dump_stack(void)
 {
 	unsigned long flags;
-	int was_locked;
-	int old;
-	int cpu;
 
 	/*
 	 * Permit this cpu to perform nested stack dumps while serialising
 	 * against other CPUs
 	 */
-retry:
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	old = atomic_cmpxchg(&dump_lock, -1, cpu);
-	if (old == -1) {
-		was_locked = 0;
-	} else if (old == cpu) {
-		was_locked = 1;
-	} else {
-		local_irq_restore(flags);
-		/*
-		 * Wait for the lock to release before jumping to
-		 * atomic_cmpxchg() in order to mitigate the thundering herd
-		 * problem.
-		 */
-		do { cpu_relax(); } while (atomic_read(&dump_lock) != -1);
-		goto retry;
-	}
-
-	__dump_stack();
-
-	if (!was_locked)
-		atomic_set(&dump_lock, -1);
-
-	local_irq_restore(flags);
-}
-#else
-asmlinkage __visible void dump_stack(void)
-{
+	printk_cpu_lock_irqsave(flags);
 	__dump_stack();
+	printk_cpu_unlock_irqrestore(flags);
 }
-#endif
 EXPORT_SYMBOL(dump_stack);
-- 
cgit v1.2.3


From 0c79378c01999bd60057c475f163ec807c24891f Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sebastian.reichel@collabora.com>
Date: Mon, 21 Jun 2021 19:53:55 +0200
Subject: spi: add ancillary device support

Introduce support for ancillary devices, similar to existing
implementation for I2C. This is useful for devices having
multiple chip-selects, for example some microcontrollers
provide a normal SPI interface and a flashing SPI interface.

Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Link: https://lore.kernel.org/r/20210621175359.126729-2-sebastian.reichel@collabora.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 137 +++++++++++++++++++++++++++++++++++++-----------
 include/linux/spi/spi.h |   2 +
 2 files changed, 108 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 8553e7d48f66..572ad95c1d4f 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -564,49 +564,23 @@ static void spi_cleanup(struct spi_device *spi)
 		spi->controller->cleanup(spi);
 }
 
-/**
- * spi_add_device - Add spi_device allocated with spi_alloc_device
- * @spi: spi_device to register
- *
- * Companion function to spi_alloc_device.  Devices allocated with
- * spi_alloc_device can be added onto the spi bus with this function.
- *
- * Return: 0 on success; negative errno on failure
- */
-int spi_add_device(struct spi_device *spi)
+static int __spi_add_device(struct spi_device *spi)
 {
 	struct spi_controller *ctlr = spi->controller;
 	struct device *dev = ctlr->dev.parent;
 	int status;
 
-	/* Chipselects are numbered 0..max; validate. */
-	if (spi->chip_select >= ctlr->num_chipselect) {
-		dev_err(dev, "cs%d >= max %d\n", spi->chip_select,
-			ctlr->num_chipselect);
-		return -EINVAL;
-	}
-
-	/* Set the bus ID string */
-	spi_dev_set_name(spi);
-
-	/* We need to make sure there's no other device with this
-	 * chipselect **BEFORE** we call setup(), else we'll trash
-	 * its configuration.  Lock against concurrent add() calls.
-	 */
-	mutex_lock(&spi_add_lock);
-
 	status = bus_for_each_dev(&spi_bus_type, NULL, spi, spi_dev_check);
 	if (status) {
 		dev_err(dev, "chipselect %d already in use\n",
 				spi->chip_select);
-		goto done;
+		return status;
 	}
 
 	/* Controller may unregister concurrently */
 	if (IS_ENABLED(CONFIG_SPI_DYNAMIC) &&
 	    !device_is_registered(&ctlr->dev)) {
-		status = -ENODEV;
-		goto done;
+		return -ENODEV;
 	}
 
 	/* Descriptors take precedence */
@@ -623,7 +597,7 @@ int spi_add_device(struct spi_device *spi)
 	if (status < 0) {
 		dev_err(dev, "can't setup %s, status %d\n",
 				dev_name(&spi->dev), status);
-		goto done;
+		return status;
 	}
 
 	/* Device may be bound to an active driver when this returns */
@@ -636,12 +610,64 @@ int spi_add_device(struct spi_device *spi)
 		dev_dbg(dev, "registered child %s\n", dev_name(&spi->dev));
 	}
 
-done:
+	return status;
+}
+
+/**
+ * spi_add_device - Add spi_device allocated with spi_alloc_device
+ * @spi: spi_device to register
+ *
+ * Companion function to spi_alloc_device.  Devices allocated with
+ * spi_alloc_device can be added onto the spi bus with this function.
+ *
+ * Return: 0 on success; negative errno on failure
+ */
+int spi_add_device(struct spi_device *spi)
+{
+	struct spi_controller *ctlr = spi->controller;
+	struct device *dev = ctlr->dev.parent;
+	int status;
+
+	/* Chipselects are numbered 0..max; validate. */
+	if (spi->chip_select >= ctlr->num_chipselect) {
+		dev_err(dev, "cs%d >= max %d\n", spi->chip_select,
+			ctlr->num_chipselect);
+		return -EINVAL;
+	}
+
+	/* Set the bus ID string */
+	spi_dev_set_name(spi);
+
+	/* We need to make sure there's no other device with this
+	 * chipselect **BEFORE** we call setup(), else we'll trash
+	 * its configuration.  Lock against concurrent add() calls.
+	 */
+	mutex_lock(&spi_add_lock);
+	status = __spi_add_device(spi);
 	mutex_unlock(&spi_add_lock);
 	return status;
 }
 EXPORT_SYMBOL_GPL(spi_add_device);
 
+static int spi_add_device_locked(struct spi_device *spi)
+{
+	struct spi_controller *ctlr = spi->controller;
+	struct device *dev = ctlr->dev.parent;
+
+	/* Chipselects are numbered 0..max; validate. */
+	if (spi->chip_select >= ctlr->num_chipselect) {
+		dev_err(dev, "cs%d >= max %d\n", spi->chip_select,
+			ctlr->num_chipselect);
+		return -EINVAL;
+	}
+
+	/* Set the bus ID string */
+	spi_dev_set_name(spi);
+
+	WARN_ON(!mutex_is_locked(&spi_add_lock));
+	return __spi_add_device(spi);
+}
+
 /**
  * spi_new_device - instantiate one new SPI device
  * @ctlr: Controller to which device is connected
@@ -2125,6 +2151,55 @@ static void of_register_spi_devices(struct spi_controller *ctlr)
 static void of_register_spi_devices(struct spi_controller *ctlr) { }
 #endif
 
+/**
+ * spi_new_ancillary_device() - Register ancillary SPI device
+ * @spi:         Pointer to the main SPI device registering the ancillary device
+ * @chip_select: Chip Select of the ancillary device
+ *
+ * Register an ancillary SPI device; for example some chips have a chip-select
+ * for normal device usage and another one for setup/firmware upload.
+ *
+ * This may only be called from main SPI device's probe routine.
+ *
+ * Return: 0 on success; negative errno on failure
+ */
+struct spi_device *spi_new_ancillary_device(struct spi_device *spi,
+					     u8 chip_select)
+{
+	struct spi_device *ancillary;
+	int rc = 0;
+
+	/* Alloc an spi_device */
+	ancillary = spi_alloc_device(spi->controller);
+	if (!ancillary) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	strlcpy(ancillary->modalias, "dummy", sizeof(ancillary->modalias));
+
+	/* Use provided chip-select for ancillary device */
+	ancillary->chip_select = chip_select;
+
+	/* Take over SPI mode/speed from SPI main device */
+	ancillary->max_speed_hz = spi->max_speed_hz;
+	ancillary->mode = ancillary->mode;
+
+	/* Register the new device */
+	rc = spi_add_device_locked(ancillary);
+	if (rc) {
+		dev_err(&spi->dev, "failed to register ancillary device\n");
+		goto err_out;
+	}
+
+	return ancillary;
+
+err_out:
+	spi_dev_put(ancillary);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_GPL(spi_new_ancillary_device);
+
 #ifdef CONFIG_ACPI
 struct acpi_spi_lookup {
 	struct spi_controller 	*ctlr;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index f924160e995f..3ada36175e5f 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -299,6 +299,8 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
 		driver_unregister(&sdrv->driver);
 }
 
+extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 chip_select);
+
 /* use a define to avoid include chaining to get THIS_MODULE */
 #define spi_register_driver(driver) \
 	__spi_register_driver(THIS_MODULE, driver)
-- 
cgit v1.2.3


From 49faa77759b211fff344898edc23bb780707fff5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 21 Jun 2021 13:12:38 +0200
Subject: locking/lockdep: Improve noinstr vs errors

Better handle the failure paths.

  vmlinux.o: warning: objtool: debug_locks_off()+0x23: call to console_verbose() leaves .noinstr.text section
  vmlinux.o: warning: objtool: debug_locks_off()+0x19: call to __kasan_check_write() leaves .noinstr.text section

  debug_locks_off+0x19/0x40:
  instrument_atomic_write at include/linux/instrumented.h:86
  (inlined by) __debug_locks_off at include/linux/debug_locks.h:17
  (inlined by) debug_locks_off at lib/debug_locks.c:41

Fixes: 6eebad1ad303 ("lockdep: __always_inline more for noinstr")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210621120120.784404944@infradead.org
---
 include/linux/debug_locks.h | 2 ++
 kernel/locking/lockdep.c    | 4 +++-
 lib/debug_locks.c           | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 2915f56ad421..edb5c186b0b7 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -27,8 +27,10 @@ extern int debug_locks_off(void);
 	int __ret = 0;							\
 									\
 	if (!oops_in_progress && unlikely(c)) {				\
+		instrumentation_begin();				\
 		if (debug_locks_off() && !debug_locks_silent)		\
 			WARN(1, "DEBUG_LOCKS_WARN_ON(%s)", #c);		\
+		instrumentation_end();					\
 		__ret = 1;						\
 	}								\
 	__ret;								\
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7641bd407239..e32313072506 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -843,7 +843,7 @@ static int count_matching_names(struct lock_class *new_class)
 }
 
 /* used from NMI context -- must be lockless */
-static __always_inline struct lock_class *
+static noinstr struct lock_class *
 look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
 {
 	struct lockdep_subclass_key *key;
@@ -851,12 +851,14 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
 	struct lock_class *class;
 
 	if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+		instrumentation_begin();
 		debug_locks_off();
 		printk(KERN_ERR
 			"BUG: looking up invalid subclass: %u\n", subclass);
 		printk(KERN_ERR
 			"turning off the locking correctness validator.\n");
 		dump_stack();
+		instrumentation_end();
 		return NULL;
 	}
 
diff --git a/lib/debug_locks.c b/lib/debug_locks.c
index 06d3135bd184..a75ee30b77cb 100644
--- a/lib/debug_locks.c
+++ b/lib/debug_locks.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(debug_locks_silent);
 /*
  * Generic 'turn off all lock debugging' function:
  */
-noinstr int debug_locks_off(void)
+int debug_locks_off(void)
 {
 	if (debug_locks && __debug_locks_off()) {
 		if (!debug_locks_silent) {
-- 
cgit v1.2.3


From 7560c02bdffb7c52d1457fa551b9e745d4b9e754 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 May 2021 12:01:20 -0700
Subject: clocksource: Check per-CPU clock synchronization when marked unstable

Some sorts of per-CPU clock sources have a history of going out of
synchronization with each other.  However, this problem has purportedy been
solved in the past ten years.  Except that it is all too possible that the
problem has instead simply been made less likely, which might mean that
some of the occasional "Marking clocksource 'tsc' as unstable" messages
might be due to desynchronization.  How would anyone know?

Therefore apply CPU-to-CPU synchronization checking to newly unstable
clocksource that are marked with the new CLOCK_SOURCE_VERIFY_PERCPU flag.
Lists of desynchronized CPUs are printed, with the caveat that if it
is the reporting CPU that is itself desynchronized, it will appear that
all the other clocks are wrong.  Just like in real life.

Reported-by: Chris Mason <clm@fb.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Feng Tang <feng.tang@intel.com>
Link: https://lore.kernel.org/r/20210527190124.440372-2-paulmck@kernel.org
---
 arch/x86/kernel/tsc.c       |  3 ++-
 include/linux/clocksource.h |  2 +-
 kernel/time/clocksource.c   | 60 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57ec01192180..6eb1b097e97e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1152,7 +1152,8 @@ static struct clocksource clocksource_tsc = {
 	.mask			= CLOCKSOURCE_MASK(64),
 	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_VALID_FOR_HRES |
-				  CLOCK_SOURCE_MUST_VERIFY,
+				  CLOCK_SOURCE_MUST_VERIFY |
+				  CLOCK_SOURCE_VERIFY_PERCPU,
 	.vdso_clock_mode	= VDSO_CLOCKMODE_TSC,
 	.enable			= tsc_cs_enable,
 	.resume			= tsc_resume,
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index d6ab416ee2d2..7f83d51c0fd7 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -137,7 +137,7 @@ struct clocksource {
 #define CLOCK_SOURCE_UNSTABLE			0x40
 #define CLOCK_SOURCE_SUSPEND_NONSTOP		0x80
 #define CLOCK_SOURCE_RESELECT			0x100
-
+#define CLOCK_SOURCE_VERIFY_PERCPU		0x200
 /* simplify initialization of mask field */
 #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0)
 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 43243f2be98e..cb12225bf050 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -224,6 +224,60 @@ static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
 	return false;
 }
 
+static u64 csnow_mid;
+static cpumask_t cpus_ahead;
+static cpumask_t cpus_behind;
+
+static void clocksource_verify_one_cpu(void *csin)
+{
+	struct clocksource *cs = (struct clocksource *)csin;
+
+	csnow_mid = cs->read(cs);
+}
+
+static void clocksource_verify_percpu(struct clocksource *cs)
+{
+	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
+	u64 csnow_begin, csnow_end;
+	int cpu, testcpu;
+	s64 delta;
+
+	cpumask_clear(&cpus_ahead);
+	cpumask_clear(&cpus_behind);
+	preempt_disable();
+	testcpu = smp_processor_id();
+	pr_warn("Checking clocksource %s synchronization from CPU %d.\n", cs->name, testcpu);
+	for_each_online_cpu(cpu) {
+		if (cpu == testcpu)
+			continue;
+		csnow_begin = cs->read(cs);
+		smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
+		csnow_end = cs->read(cs);
+		delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
+		if (delta < 0)
+			cpumask_set_cpu(cpu, &cpus_behind);
+		delta = (csnow_end - csnow_mid) & cs->mask;
+		if (delta < 0)
+			cpumask_set_cpu(cpu, &cpus_ahead);
+		delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
+		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+		if (cs_nsec > cs_nsec_max)
+			cs_nsec_max = cs_nsec;
+		if (cs_nsec < cs_nsec_min)
+			cs_nsec_min = cs_nsec;
+	}
+	preempt_enable();
+	if (!cpumask_empty(&cpus_ahead))
+		pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
+			cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
+	if (!cpumask_empty(&cpus_behind))
+		pr_warn("        CPUs %*pbl behind CPU %d for clocksource %s.\n",
+			cpumask_pr_args(&cpus_behind), testcpu, cs->name);
+	if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
+		pr_warn("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+			testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+}
+
 static void clocksource_watchdog(struct timer_list *unused)
 {
 	u64 csnow, wdnow, cslast, wdlast, delta;
@@ -448,6 +502,12 @@ static int __clocksource_watchdog_kthread(void)
 	unsigned long flags;
 	int select = 0;
 
+	/* Do any required per-CPU skew verification. */
+	if (curr_clocksource &&
+	    curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
+	    curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
+		clocksource_verify_percpu(curr_clocksource);
+
 	spin_lock_irqsave(&watchdog_lock, flags);
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
-- 
cgit v1.2.3


From 2e27e793e280ff12cb5c202a1214c08b0d3a0f26 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 May 2021 12:01:22 -0700
Subject: clocksource: Reduce clocksource-skew threshold

Currently, WATCHDOG_THRESHOLD is set to detect a 62.5-millisecond skew in
a 500-millisecond WATCHDOG_INTERVAL.  This requires that clocks be skewed
by more than 12.5% in order to be marked unstable.  Except that a clock
that is skewed by that much is probably destroying unsuspecting software
right and left.  And given that there are now checks for false-positive
skews due to delays between reading the two clocks, it should be possible
to greatly decrease WATCHDOG_THRESHOLD, at least for fine-grained clocks
such as TSC.

Therefore, add a new uncertainty_margin field to the clocksource structure
that contains the maximum uncertainty in nanoseconds for the corresponding
clock.  This field may be initialized manually, as it is for
clocksource_tsc_early and clocksource_jiffies, which is copied to
refined_jiffies.  If the field is not initialized manually, it will be
computed at clock-registry time as the period of the clock in question
based on the scale and freq parameters to __clocksource_update_freq_scale()
function.  If either of those two parameters are zero, the
tens-of-milliseconds WATCHDOG_THRESHOLD is used as a cowardly alternative
to dividing by zero.  No matter how the uncertainty_margin field is
calculated, it is bounded below by twice WATCHDOG_MAX_SKEW, that is, by 100
microseconds.

Note that manually initialized uncertainty_margin fields are not adjusted,
but there is a WARN_ON_ONCE() that triggers if any such field is less than
twice WATCHDOG_MAX_SKEW.  This WARN_ON_ONCE() is intended to discourage
production use of the one-nanosecond uncertainty_margin values that are
used to test the clock-skew code itself.

The actual clock-skew check uses the sum of the uncertainty_margin fields
of the two clocksource structures being compared.  Integer overflow is
avoided because the largest computed value of the uncertainty_margin
fields is one billion (10^9), and double that value fits into an
unsigned int.  However, if someone manually specifies (say) UINT_MAX,
they will get what they deserve.

Note that the refined_jiffies uncertainty_margin field is initialized to
TICK_NSEC, which means that skew checks involving this clocksource will
be sufficently forgiving.  In a similar vein, the clocksource_tsc_early
uncertainty_margin field is initialized to 32*NSEC_PER_MSEC, which
replicates the current behavior and allows custom setting if needed
in order to address the rare skews detected for this clocksource in
current mainline.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Feng Tang <feng.tang@intel.com>
Link: https://lore.kernel.org/r/20210527190124.440372-4-paulmck@kernel.org
---
 arch/x86/kernel/tsc.c       |  1 +
 include/linux/clocksource.h |  3 +++
 kernel/time/clocksource.c   | 48 +++++++++++++++++++++++++++++++++++----------
 kernel/time/jiffies.c       | 15 +++++++-------
 4 files changed, 50 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6eb1b097e97e..2e076a459a0c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1128,6 +1128,7 @@ static int tsc_cs_enable(struct clocksource *cs)
 static struct clocksource clocksource_tsc_early = {
 	.name			= "tsc-early",
 	.rating			= 299,
+	.uncertainty_margin	= 32 * NSEC_PER_MSEC,
 	.read			= read_tsc,
 	.mask			= CLOCKSOURCE_MASK(64),
 	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 7f83d51c0fd7..895203727cb5 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -43,6 +43,8 @@ struct module;
  * @shift:		Cycle to nanosecond divisor (power of two)
  * @max_idle_ns:	Maximum idle time permitted by the clocksource (nsecs)
  * @maxadj:		Maximum adjustment value to mult (~11%)
+ * @uncertainty_margin:	Maximum uncertainty in nanoseconds per half second.
+ *			Zero says to use default WATCHDOG_THRESHOLD.
  * @archdata:		Optional arch-specific data
  * @max_cycles:		Maximum safe cycle value which won't overflow on
  *			multiplication
@@ -98,6 +100,7 @@ struct clocksource {
 	u32			shift;
 	u64			max_idle_ns;
 	u32			maxadj;
+	u32			uncertainty_margin;
 #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
 	struct arch_clocksource_data archdata;
 #endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e4beab21a1fa..9b27888a6e75 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -95,6 +95,20 @@ static char override_name[CS_NAME_LEN];
 static int finished_booting;
 static u64 suspend_start;
 
+/*
+ * Threshold: 0.0312s, when doubled: 0.0625s.
+ * Also a default for cs->uncertainty_margin when registering clocks.
+ */
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
+
+/*
+ * Maximum permissible delay between two readouts of the watchdog
+ * clocksource surrounding a read of the clocksource being validated.
+ * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as
+ * a lower bound for cs->uncertainty_margin values when registering clocks.
+ */
+#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC)
+
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 static void clocksource_watchdog_work(struct work_struct *work);
 static void clocksource_select(void);
@@ -121,17 +135,9 @@ static int clocksource_watchdog_kthread(void *data);
 static void __clocksource_change_rating(struct clocksource *cs, int rating);
 
 /*
- * Interval: 0.5sec Threshold: 0.0625s
+ * Interval: 0.5sec.
  */
 #define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
-
-/*
- * Maximum permissible delay between two readouts of the watchdog
- * clocksource surrounding a read of the clocksource being validated.
- * This delay could be due to SMIs, NMIs, or to VCPU preemptions.
- */
-#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
 
 static void clocksource_watchdog_work(struct work_struct *work)
 {
@@ -348,6 +354,7 @@ static void clocksource_watchdog(struct timer_list *unused)
 	int next_cpu, reset_pending;
 	int64_t wd_nsec, cs_nsec;
 	struct clocksource *cs;
+	u32 md;
 
 	spin_lock(&watchdog_lock);
 	if (!watchdog_running)
@@ -394,7 +401,8 @@ static void clocksource_watchdog(struct timer_list *unused)
 			continue;
 
 		/* Check the deviation from the watchdog clocksource. */
-		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
+		if (abs(cs_nsec - wd_nsec) > md) {
 			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
 				smp_processor_id(), cs->name);
 			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
@@ -1047,6 +1055,26 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
 		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
 				       NSEC_PER_SEC / scale, sec * scale);
 	}
+
+	/*
+	 * If the uncertainty margin is not specified, calculate it.
+	 * If both scale and freq are non-zero, calculate the clock
+	 * period, but bound below at 2*WATCHDOG_MAX_SKEW.  However,
+	 * if either of scale or freq is zero, be very conservative and
+	 * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
+	 * uncertainty margin.  Allow stupidly small uncertainty margins
+	 * to be specified by the caller for testing purposes, but warn
+	 * to discourage production use of this capability.
+	 */
+	if (scale && freq && !cs->uncertainty_margin) {
+		cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
+		if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
+			cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
+	} else if (!cs->uncertainty_margin) {
+		cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+	}
+	WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
+
 	/*
 	 * Ensure clocksources that have large 'mult' values don't overflow
 	 * when adjusted.
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a492e4da69ba..01935aafdb46 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -49,13 +49,14 @@ static u64 jiffies_read(struct clocksource *cs)
  * for "tick-less" systems.
  */
 static struct clocksource clocksource_jiffies = {
-	.name		= "jiffies",
-	.rating		= 1, /* lowest valid rating*/
-	.read		= jiffies_read,
-	.mask		= CLOCKSOURCE_MASK(32),
-	.mult		= TICK_NSEC << JIFFIES_SHIFT, /* details above */
-	.shift		= JIFFIES_SHIFT,
-	.max_cycles	= 10,
+	.name			= "jiffies",
+	.rating			= 1, /* lowest valid rating*/
+	.uncertainty_margin	= 32 * NSEC_PER_MSEC,
+	.read			= jiffies_read,
+	.mask			= CLOCKSOURCE_MASK(32),
+	.mult			= TICK_NSEC << JIFFIES_SHIFT, /* details above */
+	.shift			= JIFFIES_SHIFT,
+	.max_cycles		= 10,
 };
 
 __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
-- 
cgit v1.2.3


From 1253b9b87e42ab6a3d5c2cb27af2bdd67d7e50ff Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 May 2021 12:01:23 -0700
Subject: clocksource: Provide kernel module to test clocksource watchdog

When the clocksource watchdog marks a clock as unstable, this might
be due to that clock being unstable or it might be due to delays that
happen to occur between the reads of the two clocks.  It would be good
to have a way of testing the clocksource watchdog's ability to
distinguish between these two causes of clock skew and instability.

Therefore, provide a new clocksource-wdtest module selected by a new
TEST_CLOCKSOURCE_WATCHDOG Kconfig option.  This module has a single module
parameter named "holdoff" that provides the number of seconds of delay
before testing should start, which defaults to zero when built as a module
and to 10 seconds when built directly into the kernel.  Very large systems
that boot slowly may need to increase the value of this module parameter.

This module uses hand-crafted clocksource structures to do its testing,
thus avoiding messing up timing for the rest of the kernel and for user
applications.  This module first verifies that the ->uncertainty_margin
field of the clocksource structures are set sanely.  It then tests the
delay-detection capability of the clocksource watchdog, increasing the
number of consecutive delays injected, first provoking console messages
complaining about the delays and finally forcing a clock-skew event.
Unexpected test results cause at least one WARN_ON_ONCE() console splat.
If there are no splats, the test has passed.  Finally, it fuzzes the
value returned from a clocksource to test the clocksource watchdog's
ability to detect time skew.

This module checks the state of its clocksource after each test, and
uses WARN_ON_ONCE() to emit a console splat if there are any failures.
This should enable all types of test frameworks to detect any such
failures.

This facility is intended for diagnostic use only, and should be avoided
on production systems.

Reported-by: Chris Mason <clm@fb.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Feng Tang <feng.tang@intel.com>
Link: https://lore.kernel.org/r/20210527190124.440372-5-paulmck@kernel.org
---
 Documentation/admin-guide/kernel-parameters.txt |   6 +
 include/linux/clocksource.h                     |   3 +
 kernel/time/Makefile                            |   1 +
 kernel/time/clocksource-wdtest.c                | 202 ++++++++++++++++++++++++
 kernel/time/clocksource.c                       |   6 +-
 lib/Kconfig.debug                               |  12 ++
 6 files changed, 228 insertions(+), 2 deletions(-)
 create mode 100644 kernel/time/clocksource-wdtest.c

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9ec9ea1a51f2..591048ed1365 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -597,6 +597,12 @@
 			The actual CPUs are chosen randomly, with
 			no replacement if the same CPU is chosen twice.
 
+	clocksource-wdtest.holdoff= [KNL]
+			Set the time in seconds that the clocksource
+			watchdog test waits before commencing its tests.
+			Defaults to zero when built as a module and to
+			10 seconds when built into the kernel.
+
 	clearcpuid=BITNUM[,BITNUM...] [X86]
 			Disable CPUID feature X for the kernel. See
 			arch/x86/include/asm/cpufeatures.h for the valid bit
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 895203727cb5..1d42d4b17327 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -291,4 +291,7 @@ static inline void timer_probe(void) {}
 #define TIMER_ACPI_DECLARE(name, table_id, fn)		\
 	ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn)
 
+extern ulong max_cswd_read_retries;
+void clocksource_verify_percpu(struct clocksource *cs);
+
 #endif /* _LINUX_CLOCKSOURCE_H */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 1fb1c1ef6a19..1ed85b25b096 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_HAVE_GENERIC_VDSO)			+= vsyscall.o
 obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o
 obj-$(CONFIG_TEST_UDELAY)			+= test_udelay.o
 obj-$(CONFIG_TIME_NS)				+= namespace.o
+obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG)		+= clocksource-wdtest.o
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
new file mode 100644
index 000000000000..01df12395c0e
--- /dev/null
+++ b/kernel/time/clocksource-wdtest.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Unit test for the clocksource watchdog.
+ *
+ * Copyright (C) 2021 Facebook, Inc.
+ *
+ * Author: Paul E. McKenney <paulmck@kernel.org>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+
+static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
+module_param(holdoff, int, 0444);
+MODULE_PARM_DESC(holdoff, "Time to wait to start test (s).");
+
+/* Watchdog kthread's task_struct pointer for debug purposes. */
+static struct task_struct *wdtest_task;
+
+static u64 wdtest_jiffies_read(struct clocksource *cs)
+{
+	return (u64)jiffies;
+}
+
+/* Assume HZ > 100. */
+#define JIFFIES_SHIFT	8
+
+static struct clocksource clocksource_wdtest_jiffies = {
+	.name			= "wdtest-jiffies",
+	.rating			= 1, /* lowest valid rating*/
+	.uncertainty_margin	= TICK_NSEC,
+	.read			= wdtest_jiffies_read,
+	.mask			= CLOCKSOURCE_MASK(32),
+	.flags			= CLOCK_SOURCE_MUST_VERIFY,
+	.mult			= TICK_NSEC << JIFFIES_SHIFT, /* details above */
+	.shift			= JIFFIES_SHIFT,
+	.max_cycles		= 10,
+};
+
+static int wdtest_ktime_read_ndelays;
+static bool wdtest_ktime_read_fuzz;
+
+static u64 wdtest_ktime_read(struct clocksource *cs)
+{
+	int wkrn = READ_ONCE(wdtest_ktime_read_ndelays);
+	static int sign = 1;
+	u64 ret;
+
+	if (wkrn) {
+		udelay(cs->uncertainty_margin / 250);
+		WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1);
+	}
+	ret = ktime_get_real_fast_ns();
+	if (READ_ONCE(wdtest_ktime_read_fuzz)) {
+		sign = -sign;
+		ret = ret + sign * 100 * NSEC_PER_MSEC;
+	}
+	return ret;
+}
+
+static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs)
+{
+	pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name);
+}
+
+#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
+		     CLOCK_SOURCE_VALID_FOR_HRES | \
+		     CLOCK_SOURCE_MUST_VERIFY | \
+		     CLOCK_SOURCE_VERIFY_PERCPU)
+
+static struct clocksource clocksource_wdtest_ktime = {
+	.name			= "wdtest-ktime",
+	.rating			= 300,
+	.read			= wdtest_ktime_read,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.flags			= KTIME_FLAGS,
+	.mark_unstable		= wdtest_ktime_cs_mark_unstable,
+	.list			= LIST_HEAD_INIT(clocksource_wdtest_ktime.list),
+};
+
+/* Reset the clocksource if needed. */
+static void wdtest_ktime_clocksource_reset(void)
+{
+	if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) {
+		clocksource_unregister(&clocksource_wdtest_ktime);
+		clocksource_wdtest_ktime.flags = KTIME_FLAGS;
+		schedule_timeout_uninterruptible(HZ / 10);
+		clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+	}
+}
+
+/* Run the specified series of watchdog tests. */
+static int wdtest_func(void *arg)
+{
+	unsigned long j1, j2;
+	char *s;
+	int i;
+
+	schedule_timeout_uninterruptible(holdoff * HZ);
+
+	/*
+	 * Verify that jiffies-like clocksources get the manually
+	 * specified uncertainty margin.
+	 */
+	pr_info("--- Verify jiffies-like uncertainty margin.\n");
+	__clocksource_register(&clocksource_wdtest_jiffies);
+	WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC);
+
+	j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
+	schedule_timeout_uninterruptible(HZ);
+	j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
+	WARN_ON_ONCE(j1 == j2);
+
+	clocksource_unregister(&clocksource_wdtest_jiffies);
+
+	/*
+	 * Verify that tsc-like clocksources are assigned a reasonable
+	 * uncertainty margin.
+	 */
+	pr_info("--- Verify tsc-like uncertainty margin.\n");
+	clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+	WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC);
+
+	j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
+	udelay(1);
+	j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
+	pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
+	WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
+
+	/* Verify tsc-like stability with various numbers of errors injected. */
+	for (i = 0; i <= max_cswd_read_retries + 1; i++) {
+		if (i <= 1 && i < max_cswd_read_retries)
+			s = "";
+		else if (i <= max_cswd_read_retries)
+			s = ", expect message";
+		else
+			s = ", expect clock skew";
+		pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s);
+		WRITE_ONCE(wdtest_ktime_read_ndelays, i);
+		schedule_timeout_uninterruptible(2 * HZ);
+		WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
+		WARN_ON_ONCE((i <= max_cswd_read_retries) !=
+			     !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
+		wdtest_ktime_clocksource_reset();
+	}
+
+	/* Verify tsc-like stability with clock-value-fuzz error injection. */
+	pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n");
+	WRITE_ONCE(wdtest_ktime_read_fuzz, true);
+	schedule_timeout_uninterruptible(2 * HZ);
+	WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
+	clocksource_verify_percpu(&clocksource_wdtest_ktime);
+	WRITE_ONCE(wdtest_ktime_read_fuzz, false);
+
+	clocksource_unregister(&clocksource_wdtest_ktime);
+
+	pr_info("--- Done with test.\n");
+	return 0;
+}
+
+static void wdtest_print_module_parms(void)
+{
+	pr_alert("--- holdoff=%d\n", holdoff);
+}
+
+/* Cleanup function. */
+static void clocksource_wdtest_cleanup(void)
+{
+}
+
+static int __init clocksource_wdtest_init(void)
+{
+	int ret = 0;
+
+	wdtest_print_module_parms();
+
+	/* Create watchdog-test task. */
+	wdtest_task = kthread_run(wdtest_func, NULL, "wdtest");
+	if (IS_ERR(wdtest_task)) {
+		ret = PTR_ERR(wdtest_task);
+		pr_warn("%s: Failed to create wdtest kthread.\n", __func__);
+		wdtest_task = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+module_init(clocksource_wdtest_init);
+module_exit(clocksource_wdtest_cleanup);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9b27888a6e75..74d6a234fd14 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -199,8 +199,9 @@ void clocksource_mark_unstable(struct clocksource *cs)
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-static ulong max_cswd_read_retries = 3;
+ulong max_cswd_read_retries = 3;
 module_param(max_cswd_read_retries, ulong, 0644);
+EXPORT_SYMBOL_GPL(max_cswd_read_retries);
 static int verify_n_cpus = 8;
 module_param(verify_n_cpus, int, 0644);
 
@@ -294,7 +295,7 @@ static void clocksource_verify_one_cpu(void *csin)
 	csnow_mid = cs->read(cs);
 }
 
-static void clocksource_verify_percpu(struct clocksource *cs)
+void clocksource_verify_percpu(struct clocksource *cs)
 {
 	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
 	u64 csnow_begin, csnow_end;
@@ -347,6 +348,7 @@ static void clocksource_verify_percpu(struct clocksource *cs)
 		pr_warn("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
 			testcpu, cs_nsec_min, cs_nsec_max, cs->name);
 }
+EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
 
 static void clocksource_watchdog(struct timer_list *unused)
 {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 678c13967580..0a5a70c742e6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2571,6 +2571,18 @@ config TEST_FPU
 
 	  If unsure, say N.
 
+config TEST_CLOCKSOURCE_WATCHDOG
+	tristate "Test clocksource watchdog in kernel space"
+	depends on CLOCKSOURCE_WATCHDOG
+	help
+	  Enable this option to create a kernel module that will trigger
+	  a test of the clocksource watchdog.  This module may be loaded
+	  via modprobe or insmod in which case it will run upon being
+	  loaded, or it may be built in, in which case it will run
+	  shortly after boot.
+
+	  If unsure, say N.
+
 endif # RUNTIME_TESTING_MENU
 
 config ARCH_USE_MEMTEST
-- 
cgit v1.2.3


From 62a6ef6a996f5eec73d30d079573a1fa8f95fcd9 Mon Sep 17 00:00:00 2001
From: Marcin Wojtas <mw@semihalf.com>
Date: Mon, 21 Jun 2021 19:30:24 +0200
Subject: net: mdiobus: Introduce fwnode_mdbiobus_register()

This patch introduces a new helper function that
wraps acpi_/of_ mdiobus_register() and allows its
usage via common fwnode_ interface.

Fall back to raw mdiobus_register() in case CONFIG_FWNODE_MDIO
is not enabled, in order to satisfy compatibility
in all future user drivers.

Signed-off-by: Marcin Wojtas <mw@semihalf.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mdio/fwnode_mdio.c | 22 ++++++++++++++++++++++
 include/linux/fwnode_mdio.h    | 12 ++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c
index 1becb1a731f6..ae0bf71a9932 100644
--- a/drivers/net/mdio/fwnode_mdio.c
+++ b/drivers/net/mdio/fwnode_mdio.c
@@ -7,8 +7,10 @@
  */
 
 #include <linux/acpi.h>
+#include <linux/acpi_mdio.h>
 #include <linux/fwnode_mdio.h>
 #include <linux/of.h>
+#include <linux/of_mdio.h>
 #include <linux/phy.h>
 
 MODULE_AUTHOR("Calvin Johnson <calvin.johnson@oss.nxp.com>");
@@ -142,3 +144,23 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 	return 0;
 }
 EXPORT_SYMBOL(fwnode_mdiobus_register_phy);
+
+/**
+ * fwnode_mdiobus_register - bring up all the PHYs on a given MDIO bus and
+ *	attach them to it.
+ * @bus: Target MDIO bus.
+ * @fwnode: Pointer to fwnode of the MDIO controller.
+ *
+ * Return values are determined accordingly to acpi_/of_ mdiobus_register()
+ * operation.
+ */
+int fwnode_mdiobus_register(struct mii_bus *bus, struct fwnode_handle *fwnode)
+{
+	if (is_acpi_node(fwnode))
+		return acpi_mdiobus_register(bus, fwnode);
+	else if (is_of_node(fwnode))
+		return of_mdiobus_register(bus, to_of_node(fwnode));
+	else
+		return -EINVAL;
+}
+EXPORT_SYMBOL(fwnode_mdiobus_register);
diff --git a/include/linux/fwnode_mdio.h b/include/linux/fwnode_mdio.h
index faf603c48c86..13d4ae8fee0a 100644
--- a/include/linux/fwnode_mdio.h
+++ b/include/linux/fwnode_mdio.h
@@ -16,6 +16,7 @@ int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio,
 int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 				struct fwnode_handle *child, u32 addr);
 
+int fwnode_mdiobus_register(struct mii_bus *bus, struct fwnode_handle *fwnode);
 #else /* CONFIG_FWNODE_MDIO */
 int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio,
 				       struct phy_device *phy,
@@ -30,6 +31,17 @@ static inline int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 {
 	return -EINVAL;
 }
+
+static inline int fwnode_mdiobus_register(struct mii_bus *bus,
+					  struct fwnode_handle *fwnode)
+{
+	/*
+	 * Fall back to mdiobus_register() function to register a bus.
+	 * This way, we don't have to keep compat bits around in drivers.
+	 */
+
+	return mdiobus_register(mdio);
+}
 #endif
 
 #endif /* __LINUX_FWNODE_MDIO_H */
-- 
cgit v1.2.3


From 9f0248ea476ee59d336d7c8bf1a5d0919d93d030 Mon Sep 17 00:00:00 2001
From: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Date: Tue, 22 Jun 2021 01:50:57 +0300
Subject: wwan: core: no more hold netdev ops owning module

The WWAN netdev ops owner holding was used to protect from the
unexpected memory disappear. This approach causes a dependency cycle
(driver -> core -> driver) and effectively prevents a WWAN driver
unloading. E.g. WWAN hwsim could not be unloaded until all simulated
devices are removed:

~# modprobe wwan_hwsim devices=2
~# lsmod | grep wwan
wwan_hwsim             16384  2
wwan                   20480  1 wwan_hwsim
~# rmmod wwan_hwsim
rmmod: ERROR: Module wwan_hwsim is in use
~# echo > /sys/kernel/debug/wwan_hwsim/hwsim0/destroy
~# echo > /sys/kernel/debug/wwan_hwsim/hwsim1/destroy
~# lsmod | grep wwan
wwan_hwsim             16384  0
wwan                   20480  1 wwan_hwsim
~# rmmod wwan_hwsim

For a real device driver this will cause an inability to unload module
until a served device is physically detached.

Since the last commit we are removing all child netdev(s) when a driver
unregister the netdev ops. This allows us to permit the driver
unloading, since any sane driver will call ops unregistering on a device
deinitialization. So, remove the holding of an ops owner to make it
easier to unload a driver module. The owner field has also beed removed
from the ops structure as there are no more users of this field.

Signed-off-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Reviewed-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mhi/net.c         |  3 +--
 drivers/net/wwan/wwan_core.c  | 10 ----------
 drivers/net/wwan/wwan_hwsim.c |  1 -
 include/linux/wwan.h          |  2 --
 4 files changed, 1 insertion(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mhi/net.c b/drivers/net/mhi/net.c
index 6aa753387372..ffd1c01b3f35 100644
--- a/drivers/net/mhi/net.c
+++ b/drivers/net/mhi/net.c
@@ -383,7 +383,6 @@ static void mhi_net_dellink(void *ctxt, struct net_device *ndev,
 }
 
 static const struct wwan_ops mhi_wwan_ops = {
-	.owner = THIS_MODULE,
 	.priv_size = sizeof(struct mhi_net_dev),
 	.setup = mhi_net_setup,
 	.newlink = mhi_net_newlink,
@@ -436,7 +435,7 @@ static void mhi_net_remove(struct mhi_device *mhi_dev)
 	struct mhi_net_dev *mhi_netdev = dev_get_drvdata(&mhi_dev->dev);
 	struct mhi_controller *cntrl = mhi_dev->mhi_cntrl;
 
-	/* rtnetlink takes care of removing remaining links */
+	/* WWAN core takes care of removing remaining links */
 	wwan_unregister_ops(&cntrl->mhi_dev->dev);
 
 	if (create_default_iface)
diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index ec6a69b23dd1..b634a0ba1196 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -929,11 +929,6 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops,
 		return -EBUSY;
 	}
 
-	if (!try_module_get(ops->owner)) {
-		wwan_remove_dev(wwandev);
-		return -ENODEV;
-	}
-
 	wwandev->ops = ops;
 	wwandev->ops_ctxt = ctxt;
 
@@ -960,7 +955,6 @@ static int wwan_child_dellink(struct device *dev, void *data)
 void wwan_unregister_ops(struct device *parent)
 {
 	struct wwan_device *wwandev = wwan_dev_get_by_parent(parent);
-	struct module *owner;
 	LIST_HEAD(kill_list);
 
 	if (WARN_ON(IS_ERR(wwandev)))
@@ -976,8 +970,6 @@ void wwan_unregister_ops(struct device *parent)
 	 */
 	put_device(&wwandev->dev);
 
-	owner = wwandev->ops->owner;	/* Preserve ops owner */
-
 	rtnl_lock();	/* Prevent concurent netdev(s) creation/destroying */
 
 	/* Remove all child netdev(s), using batch removing */
@@ -989,8 +981,6 @@ void wwan_unregister_ops(struct device *parent)
 
 	rtnl_unlock();
 
-	module_put(owner);
-
 	wwandev->ops_ctxt = NULL;
 	wwan_remove_dev(wwandev);
 }
diff --git a/drivers/net/wwan/wwan_hwsim.c b/drivers/net/wwan/wwan_hwsim.c
index c1e850b9c087..a8582a58a385 100644
--- a/drivers/net/wwan/wwan_hwsim.c
+++ b/drivers/net/wwan/wwan_hwsim.c
@@ -94,7 +94,6 @@ static void wwan_hwsim_netdev_setup(struct net_device *ndev)
 }
 
 static const struct wwan_ops wwan_hwsim_wwan_rtnl_ops = {
-	.owner = THIS_MODULE,
 	.priv_size = 0,			/* No private data */
 	.setup = wwan_hwsim_netdev_setup,
 };
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 34222230360c..e1981ea3a2fd 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -128,14 +128,12 @@ void *wwan_port_get_drvdata(struct wwan_port *port);
 
 /**
  * struct wwan_ops - WWAN device ops
- * @owner: module owner of the WWAN ops
  * @priv_size: size of private netdev data area
  * @setup: set up a new netdev
  * @newlink: register the new netdev
  * @dellink: remove the given netdev
  */
 struct wwan_ops {
-	struct module *owner;
 	unsigned int priv_size;
 	void (*setup)(struct net_device *dev);
 	int (*newlink)(void *ctxt, struct net_device *dev,
-- 
cgit v1.2.3


From ca374290aaade741a4781ae5f6e1ba7515e4e5fa Mon Sep 17 00:00:00 2001
From: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Date: Tue, 22 Jun 2021 01:50:58 +0300
Subject: wwan: core: support default netdev creation

Most, if not each WWAN device driver will create a netdev for the
default data channel. Therefore, add an option for the WWAN netdev ops
registration function to create a default netdev for the WWAN device.

A WWAN device driver should pass a default data channel link id to the
ops registering function to request the creation of a default netdev, or
a special value WWAN_NO_DEFAULT_LINK to inform the WWAN core that the
default netdev should not be created.

For now, only wwan_hwsim utilize the default link creation option. Other
drivers will be reworked next.

Signed-off-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
CC: M Chetan Kumar <m.chetan.kumar@intel.com>
CC: Intel Corporation <linuxwwan@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mhi/net.c                 |  3 +-
 drivers/net/wwan/iosm/iosm_ipc_wwan.c |  3 +-
 drivers/net/wwan/wwan_core.c          | 75 ++++++++++++++++++++++++++++++++++-
 drivers/net/wwan/wwan_hwsim.c         |  2 +-
 include/linux/wwan.h                  |  8 +++-
 5 files changed, 86 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mhi/net.c b/drivers/net/mhi/net.c
index ffd1c01b3f35..f36ca5c0dfe9 100644
--- a/drivers/net/mhi/net.c
+++ b/drivers/net/mhi/net.c
@@ -397,7 +397,8 @@ static int mhi_net_probe(struct mhi_device *mhi_dev,
 	struct net_device *ndev;
 	int err;
 
-	err = wwan_register_ops(&cntrl->mhi_dev->dev, &mhi_wwan_ops, mhi_dev);
+	err = wwan_register_ops(&cntrl->mhi_dev->dev, &mhi_wwan_ops, mhi_dev,
+				WWAN_NO_DEFAULT_LINK);
 	if (err)
 		return err;
 
diff --git a/drivers/net/wwan/iosm/iosm_ipc_wwan.c b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
index bee9b278223d..adb2bd40a404 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_wwan.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
@@ -317,7 +317,8 @@ struct iosm_wwan *ipc_wwan_init(struct iosm_imem *ipc_imem, struct device *dev)
 	ipc_wwan->dev = dev;
 	ipc_wwan->ipc_imem = ipc_imem;
 
-	if (wwan_register_ops(ipc_wwan->dev, &iosm_wwan_ops, ipc_wwan)) {
+	if (wwan_register_ops(ipc_wwan->dev, &iosm_wwan_ops, ipc_wwan,
+			      WWAN_NO_DEFAULT_LINK)) {
 		kfree(ipc_wwan);
 		return NULL;
 	}
diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index b634a0ba1196..ef6ec641d877 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -903,17 +903,81 @@ static struct rtnl_link_ops wwan_rtnl_link_ops __read_mostly = {
 	.policy = wwan_rtnl_policy,
 };
 
+static void wwan_create_default_link(struct wwan_device *wwandev,
+				     u32 def_link_id)
+{
+	struct nlattr *tb[IFLA_MAX + 1], *linkinfo[IFLA_INFO_MAX + 1];
+	struct nlattr *data[IFLA_WWAN_MAX + 1];
+	struct net_device *dev;
+	struct nlmsghdr *nlh;
+	struct sk_buff *msg;
+
+	/* Forge attributes required to create a WWAN netdev. We first
+	 * build a netlink message and then parse it. This looks
+	 * odd, but such approach is less error prone.
+	 */
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (WARN_ON(!msg))
+		return;
+	nlh = nlmsg_put(msg, 0, 0, RTM_NEWLINK, 0, 0);
+	if (WARN_ON(!nlh))
+		goto free_attrs;
+
+	if (nla_put_string(msg, IFLA_PARENT_DEV_NAME, dev_name(&wwandev->dev)))
+		goto free_attrs;
+	tb[IFLA_LINKINFO] = nla_nest_start(msg, IFLA_LINKINFO);
+	if (!tb[IFLA_LINKINFO])
+		goto free_attrs;
+	linkinfo[IFLA_INFO_DATA] = nla_nest_start(msg, IFLA_INFO_DATA);
+	if (!linkinfo[IFLA_INFO_DATA])
+		goto free_attrs;
+	if (nla_put_u32(msg, IFLA_WWAN_LINK_ID, def_link_id))
+		goto free_attrs;
+	nla_nest_end(msg, linkinfo[IFLA_INFO_DATA]);
+	nla_nest_end(msg, tb[IFLA_LINKINFO]);
+
+	nlmsg_end(msg, nlh);
+
+	/* The next three parsing calls can not fail */
+	nlmsg_parse_deprecated(nlh, 0, tb, IFLA_MAX, NULL, NULL);
+	nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO],
+				    NULL, NULL);
+	nla_parse_nested_deprecated(data, IFLA_WWAN_MAX,
+				    linkinfo[IFLA_INFO_DATA], NULL, NULL);
+
+	rtnl_lock();
+
+	dev = rtnl_create_link(&init_net, "wwan%d", NET_NAME_ENUM,
+			       &wwan_rtnl_link_ops, tb, NULL);
+	if (WARN_ON(IS_ERR(dev)))
+		goto unlock;
+
+	if (WARN_ON(wwan_rtnl_newlink(&init_net, dev, tb, data, NULL))) {
+		free_netdev(dev);
+		goto unlock;
+	}
+
+unlock:
+	rtnl_unlock();
+
+free_attrs:
+	nlmsg_free(msg);
+}
+
 /**
  * wwan_register_ops - register WWAN device ops
  * @parent: Device to use as parent and shared by all WWAN ports and
  *	created netdevs
  * @ops: operations to register
  * @ctxt: context to pass to operations
+ * @def_link_id: id of the default link that will be automatically created by
+ *	the WWAN core for the WWAN device. The default link will not be created
+ *	if the passed value is WWAN_NO_DEFAULT_LINK.
  *
  * Returns: 0 on success, a negative error code on failure
  */
 int wwan_register_ops(struct device *parent, const struct wwan_ops *ops,
-		      void *ctxt)
+		      void *ctxt, u32 def_link_id)
 {
 	struct wwan_device *wwandev;
 
@@ -932,6 +996,15 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops,
 	wwandev->ops = ops;
 	wwandev->ops_ctxt = ctxt;
 
+	/* NB: we do not abort ops registration in case of default link
+	 * creation failure. Link ops is the management interface, while the
+	 * default link creation is a service option. And we should not prevent
+	 * a user from manually creating a link latter if service option failed
+	 * now.
+	 */
+	if (def_link_id != WWAN_NO_DEFAULT_LINK)
+		wwan_create_default_link(wwandev, def_link_id);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(wwan_register_ops);
diff --git a/drivers/net/wwan/wwan_hwsim.c b/drivers/net/wwan/wwan_hwsim.c
index a8582a58a385..5b62cf3b3c42 100644
--- a/drivers/net/wwan/wwan_hwsim.c
+++ b/drivers/net/wwan/wwan_hwsim.c
@@ -288,7 +288,7 @@ static struct wwan_hwsim_dev *wwan_hwsim_dev_new(void)
 
 	INIT_WORK(&dev->del_work, wwan_hwsim_dev_del_work);
 
-	err = wwan_register_ops(&dev->dev, &wwan_hwsim_wwan_rtnl_ops, dev);
+	err = wwan_register_ops(&dev->dev, &wwan_hwsim_wwan_rtnl_ops, dev, 1);
 	if (err)
 		goto err_unreg_dev;
 
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index e1981ea3a2fd..91590db70a12 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -126,6 +126,12 @@ void wwan_port_txon(struct wwan_port *port);
  */
 void *wwan_port_get_drvdata(struct wwan_port *port);
 
+/*
+ * Used to indicate that the WWAN core should not create a default network
+ * link.
+ */
+#define WWAN_NO_DEFAULT_LINK		U32_MAX
+
 /**
  * struct wwan_ops - WWAN device ops
  * @priv_size: size of private netdev data area
@@ -143,7 +149,7 @@ struct wwan_ops {
 };
 
 int wwan_register_ops(struct device *parent, const struct wwan_ops *ops,
-		      void *ctxt);
+		      void *ctxt, u32 def_link_id);
 
 void wwan_unregister_ops(struct device *parent);
 
-- 
cgit v1.2.3


From 699409240389c2994e5fa1cb7d7599129bc7cfdf Mon Sep 17 00:00:00 2001
From: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Date: Tue, 22 Jun 2021 01:51:00 +0300
Subject: wwan: core: add WWAN common private data for netdev

The WWAN core not only multiplex the netdev configuration data, but
process it too, and needs some space to store its private data
associated with the netdev. Add a structure to keep common WWAN core
data. The structure will be stored inside the netdev private data before
WWAN driver private data and have a field to make it easier to access
the driver data. Also add a helper function that simplifies drivers
access to their data.

At the moment we use the common WWAN private data to store the WWAN data
link (channel) id at the time the link is created, and report it back to
user using the .fill_info() RTNL callback. This should help the user to
be aware which network interface is bound to which WWAN device data
channel.

Signed-off-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
CC: M Chetan Kumar <m.chetan.kumar@intel.com>
CC: Intel Corporation <linuxwwan@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mhi/net.c                 | 12 ++++++------
 drivers/net/mhi/proto_mbim.c          |  5 +++--
 drivers/net/wwan/iosm/iosm_ipc_wwan.c | 12 ++++++------
 drivers/net/wwan/wwan_core.c          | 29 ++++++++++++++++++++++++++++-
 include/linux/wwan.h                  | 18 ++++++++++++++++++
 5 files changed, 61 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mhi/net.c b/drivers/net/mhi/net.c
index f36ca5c0dfe9..e60e38c1f09d 100644
--- a/drivers/net/mhi/net.c
+++ b/drivers/net/mhi/net.c
@@ -32,7 +32,7 @@ struct mhi_device_info {
 
 static int mhi_ndo_open(struct net_device *ndev)
 {
-	struct mhi_net_dev *mhi_netdev = netdev_priv(ndev);
+	struct mhi_net_dev *mhi_netdev = wwan_netdev_drvpriv(ndev);
 
 	/* Feed the rx buffer pool */
 	schedule_delayed_work(&mhi_netdev->rx_refill, 0);
@@ -47,7 +47,7 @@ static int mhi_ndo_open(struct net_device *ndev)
 
 static int mhi_ndo_stop(struct net_device *ndev)
 {
-	struct mhi_net_dev *mhi_netdev = netdev_priv(ndev);
+	struct mhi_net_dev *mhi_netdev = wwan_netdev_drvpriv(ndev);
 
 	netif_stop_queue(ndev);
 	netif_carrier_off(ndev);
@@ -58,7 +58,7 @@ static int mhi_ndo_stop(struct net_device *ndev)
 
 static netdev_tx_t mhi_ndo_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
-	struct mhi_net_dev *mhi_netdev = netdev_priv(ndev);
+	struct mhi_net_dev *mhi_netdev = wwan_netdev_drvpriv(ndev);
 	const struct mhi_net_proto *proto = mhi_netdev->proto;
 	struct mhi_device *mdev = mhi_netdev->mdev;
 	int err;
@@ -93,7 +93,7 @@ exit_drop:
 static void mhi_ndo_get_stats64(struct net_device *ndev,
 				struct rtnl_link_stats64 *stats)
 {
-	struct mhi_net_dev *mhi_netdev = netdev_priv(ndev);
+	struct mhi_net_dev *mhi_netdev = wwan_netdev_drvpriv(ndev);
 	unsigned int start;
 
 	do {
@@ -322,7 +322,7 @@ static int mhi_net_newlink(void *ctxt, struct net_device *ndev, u32 if_id,
 	if (dev_get_drvdata(&mhi_dev->dev))
 		return -EBUSY;
 
-	mhi_netdev = netdev_priv(ndev);
+	mhi_netdev = wwan_netdev_drvpriv(ndev);
 
 	dev_set_drvdata(&mhi_dev->dev, mhi_netdev);
 	mhi_netdev->ndev = ndev;
@@ -367,7 +367,7 @@ out_err:
 static void mhi_net_dellink(void *ctxt, struct net_device *ndev,
 			    struct list_head *head)
 {
-	struct mhi_net_dev *mhi_netdev = netdev_priv(ndev);
+	struct mhi_net_dev *mhi_netdev = wwan_netdev_drvpriv(ndev);
 	struct mhi_device *mhi_dev = ctxt;
 
 	if (head)
diff --git a/drivers/net/mhi/proto_mbim.c b/drivers/net/mhi/proto_mbim.c
index fc72b3f6ec9e..bf1ad863237d 100644
--- a/drivers/net/mhi/proto_mbim.c
+++ b/drivers/net/mhi/proto_mbim.c
@@ -16,6 +16,7 @@
 #include <linux/ip.h>
 #include <linux/mii.h>
 #include <linux/netdevice.h>
+#include <linux/wwan.h>
 #include <linux/skbuff.h>
 #include <linux/usb.h>
 #include <linux/usb/cdc.h>
@@ -56,7 +57,7 @@ static void __mbim_errors_inc(struct mhi_net_dev *dev)
 
 static int mbim_rx_verify_nth16(struct sk_buff *skb)
 {
-	struct mhi_net_dev *dev = netdev_priv(skb->dev);
+	struct mhi_net_dev *dev = wwan_netdev_drvpriv(skb->dev);
 	struct mbim_context *ctx = dev->proto_data;
 	struct usb_cdc_ncm_nth16 *nth16;
 	int len;
@@ -102,7 +103,7 @@ static int mbim_rx_verify_nth16(struct sk_buff *skb)
 
 static int mbim_rx_verify_ndp16(struct sk_buff *skb, struct usb_cdc_ncm_ndp16 *ndp16)
 {
-	struct mhi_net_dev *dev = netdev_priv(skb->dev);
+	struct mhi_net_dev *dev = wwan_netdev_drvpriv(skb->dev);
 	int ret;
 
 	if (le16_to_cpu(ndp16->wLength) < USB_CDC_NCM_NDP16_LENGTH_MIN) {
diff --git a/drivers/net/wwan/iosm/iosm_ipc_wwan.c b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
index d3cb28107836..c999c64001f4 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_wwan.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
@@ -20,7 +20,7 @@
 #define IOSM_IF_ID_PAYLOAD 2
 
 /**
- * struct iosm_netdev_priv - netdev private data
+ * struct iosm_netdev_priv - netdev WWAN driver specific private data
  * @ipc_wwan:	Pointer to iosm_wwan struct
  * @netdev:	Pointer to network interface device structure
  * @if_id:	Interface id for device.
@@ -51,7 +51,7 @@ struct iosm_wwan {
 /* Bring-up the wwan net link */
 static int ipc_wwan_link_open(struct net_device *netdev)
 {
-	struct iosm_netdev_priv *priv = netdev_priv(netdev);
+	struct iosm_netdev_priv *priv = wwan_netdev_drvpriv(netdev);
 	struct iosm_wwan *ipc_wwan = priv->ipc_wwan;
 	int if_id = priv->if_id;
 	int ret;
@@ -88,7 +88,7 @@ out:
 /* Bring-down the wwan net link */
 static int ipc_wwan_link_stop(struct net_device *netdev)
 {
-	struct iosm_netdev_priv *priv = netdev_priv(netdev);
+	struct iosm_netdev_priv *priv = wwan_netdev_drvpriv(netdev);
 
 	netif_stop_queue(netdev);
 
@@ -105,7 +105,7 @@ static int ipc_wwan_link_stop(struct net_device *netdev)
 static int ipc_wwan_link_transmit(struct sk_buff *skb,
 				  struct net_device *netdev)
 {
-	struct iosm_netdev_priv *priv = netdev_priv(netdev);
+	struct iosm_netdev_priv *priv = wwan_netdev_drvpriv(netdev);
 	struct iosm_wwan *ipc_wwan = priv->ipc_wwan;
 	int if_id = priv->if_id;
 	int ret;
@@ -178,7 +178,7 @@ static int ipc_wwan_newlink(void *ctxt, struct net_device *dev,
 	    if_id >= ARRAY_SIZE(ipc_wwan->sub_netlist))
 		return -EINVAL;
 
-	priv = netdev_priv(dev);
+	priv = wwan_netdev_drvpriv(dev);
 	priv->if_id = if_id;
 	priv->netdev = dev;
 	priv->ipc_wwan = ipc_wwan;
@@ -208,8 +208,8 @@ out_unlock:
 static void ipc_wwan_dellink(void *ctxt, struct net_device *dev,
 			     struct list_head *head)
 {
+	struct iosm_netdev_priv *priv = wwan_netdev_drvpriv(dev);
 	struct iosm_wwan *ipc_wwan = ctxt;
-	struct iosm_netdev_priv *priv = netdev_priv(dev);
 	int if_id = priv->if_id;
 
 	if (WARN_ON(if_id < IP_MUX_SESSION_START ||
diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index ef6ec641d877..3e16c318e705 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -815,6 +815,7 @@ static struct net_device *wwan_rtnl_alloc(struct nlattr *tb[],
 	const char *devname = nla_data(tb[IFLA_PARENT_DEV_NAME]);
 	struct wwan_device *wwandev = wwan_dev_get_by_name(devname);
 	struct net_device *dev;
+	unsigned int priv_size;
 
 	if (IS_ERR(wwandev))
 		return ERR_CAST(wwandev);
@@ -825,7 +826,8 @@ static struct net_device *wwan_rtnl_alloc(struct nlattr *tb[],
 		goto out;
 	}
 
-	dev = alloc_netdev_mqs(wwandev->ops->priv_size, ifname, name_assign_type,
+	priv_size = sizeof(struct wwan_netdev_priv) + wwandev->ops->priv_size;
+	dev = alloc_netdev_mqs(priv_size, ifname, name_assign_type,
 			       wwandev->ops->setup, num_tx_queues, num_rx_queues);
 
 	if (dev) {
@@ -845,6 +847,7 @@ static int wwan_rtnl_newlink(struct net *src_net, struct net_device *dev,
 {
 	struct wwan_device *wwandev = wwan_dev_get_by_parent(dev->dev.parent);
 	u32 link_id = nla_get_u32(data[IFLA_WWAN_LINK_ID]);
+	struct wwan_netdev_priv *priv = netdev_priv(dev);
 	int ret;
 
 	if (IS_ERR(wwandev))
@@ -856,6 +859,7 @@ static int wwan_rtnl_newlink(struct net *src_net, struct net_device *dev,
 		goto out;
 	}
 
+	priv->link_id = link_id;
 	if (wwandev->ops->newlink)
 		ret = wwandev->ops->newlink(wwandev->ops_ctxt, dev,
 					    link_id, extack);
@@ -889,6 +893,27 @@ out:
 	put_device(&wwandev->dev);
 }
 
+static size_t wwan_rtnl_get_size(const struct net_device *dev)
+{
+	return
+		nla_total_size(4) +	/* IFLA_WWAN_LINK_ID */
+		0;
+}
+
+static int wwan_rtnl_fill_info(struct sk_buff *skb,
+			       const struct net_device *dev)
+{
+	struct wwan_netdev_priv *priv = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_WWAN_LINK_ID, priv->link_id))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static const struct nla_policy wwan_rtnl_policy[IFLA_WWAN_MAX + 1] = {
 	[IFLA_WWAN_LINK_ID] = { .type = NLA_U32 },
 };
@@ -900,6 +925,8 @@ static struct rtnl_link_ops wwan_rtnl_link_ops __read_mostly = {
 	.validate = wwan_rtnl_validate,
 	.newlink = wwan_rtnl_newlink,
 	.dellink = wwan_rtnl_dellink,
+	.get_size = wwan_rtnl_get_size,
+	.fill_info = wwan_rtnl_fill_info,
 	.policy = wwan_rtnl_policy,
 };
 
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 91590db70a12..9fac819f92e3 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -9,6 +9,7 @@
 #include <linux/poll.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
+#include <linux/netdevice.h>
 
 /**
  * enum wwan_port_type - WWAN port types
@@ -126,6 +127,23 @@ void wwan_port_txon(struct wwan_port *port);
  */
 void *wwan_port_get_drvdata(struct wwan_port *port);
 
+/**
+ * struct wwan_netdev_priv - WWAN core network device private data
+ * @link_id: WWAN device data link id
+ * @drv_priv: driver private data area, size is determined in &wwan_ops
+ */
+struct wwan_netdev_priv {
+	u32 link_id;
+
+	/* must be last */
+	u8 drv_priv[] __aligned(sizeof(void *));
+};
+
+static inline void *wwan_netdev_drvpriv(struct net_device *dev)
+{
+	return ((struct wwan_netdev_priv *)netdev_priv(dev))->drv_priv;
+}
+
 /*
  * Used to indicate that the WWAN core should not create a default network
  * link.
-- 
cgit v1.2.3


From b8c48be23c2d03834fe01c3ea757d9df8b97013d Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Tue, 22 Jun 2021 09:50:50 +0300
Subject: ethtool: Use kernel data types for internal EEPROM struct

The struct is not visible to user space and therefore should not use the
user visible data types.

Instead, use internal data types like other structures in the file.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index e030f7510cd3..29dbb603bc91 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -401,12 +401,12 @@ struct ethtool_rmon_stats {
  * required information to the driver.
  */
 struct ethtool_module_eeprom {
-	__u32	offset;
-	__u32	length;
-	__u8	page;
-	__u8	bank;
-	__u8	i2c_address;
-	__u8	*data;
+	u32	offset;
+	u32	length;
+	u8	page;
+	u8	bank;
+	u8	i2c_address;
+	u8	*data;
 };
 
 /**
-- 
cgit v1.2.3


From 380d53c45ff21f66870ee965b62613137f9d010d Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Mon, 21 Jun 2021 16:18:20 -0700
Subject: compiler_attributes.h: define __no_profile, add to noinstr

noinstr implies that we would like the compiler to avoid instrumenting a
function.  Add support for the compiler attribute
no_profile_instrument_function to compiler_attributes.h, then add
__no_profile to the definition of noinstr.

Link: https://lore.kernel.org/lkml/20210614162018.GD68749@worktop.programming.kicks-ass.net/
Link: https://reviews.llvm.org/D104257
Link: https://reviews.llvm.org/D104475
Link: https://reviews.llvm.org/D104658
Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80223
Reviewed-by: Fangrui Song <maskray@google.com>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210621231822.2848305-2-ndesaulniers@google.com
---
 include/linux/compiler_attributes.h | 13 +++++++++++++
 include/linux/compiler_types.h      |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index c043b8d2b17b..225511b17223 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -33,6 +33,7 @@
 # define __GCC4_has_attribute___externally_visible__  1
 # define __GCC4_has_attribute___no_caller_saved_registers__ 0
 # define __GCC4_has_attribute___noclone__             1
+# define __GCC4_has_attribute___no_profile_instrument_function__ 0
 # define __GCC4_has_attribute___nonstring__           0
 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8)
 # define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9)
@@ -237,6 +238,18 @@
 # define __nonstring
 #endif
 
+/*
+ * Optional: only supported since GCC >= 7.1, clang >= 13.0.
+ *
+ *      gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-no_005fprofile_005finstrument_005ffunction-function-attribute
+ *    clang: https://clang.llvm.org/docs/AttributeReference.html#no-profile-instrument-function
+ */
+#if __has_attribute(__no_profile_instrument_function__)
+# define __no_profile                  __attribute__((__no_profile_instrument_function__))
+#else
+# define __no_profile
+#endif
+
 /*
  *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noreturn-function-attribute
  * clang: https://clang.llvm.org/docs/AttributeReference.html#noreturn
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index d29bda7f6ebd..d509169860f1 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -210,7 +210,7 @@ struct ftrace_likely_data {
 /* Section for code which can't be instrumented at all */
 #define noinstr								\
 	noinline notrace __attribute((__section__(".noinstr.text")))	\
-	__no_kcsan __no_sanitize_address
+	__no_kcsan __no_sanitize_address __no_profile
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From ae4d682dfd3350d9836dafeed1fc5aa1e27c4963 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Mon, 21 Jun 2021 16:18:21 -0700
Subject: compiler_attributes.h: cleanups for GCC 4.9+

Since
commit 6ec4476ac825 ("Raise gcc version requirement to 4.9")
we no longer support building the kernel with GCC 4.8; drop the
preprocess checks for __GNUC_MINOR__ version. It's implied that if
__GNUC_MAJOR__ is 4, then the only supported version of __GNUC_MINOR__
left is 9.

Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210621231822.2848305-3-ndesaulniers@google.com
---
 include/linux/compiler_attributes.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 225511b17223..84b1c970acb3 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -27,7 +27,7 @@
  */
 #ifndef __has_attribute
 # define __has_attribute(x) __GCC4_has_attribute_##x
-# define __GCC4_has_attribute___assume_aligned__      (__GNUC_MINOR__ >= 9)
+# define __GCC4_has_attribute___assume_aligned__      1
 # define __GCC4_has_attribute___copy__                0
 # define __GCC4_has_attribute___designated_init__     0
 # define __GCC4_has_attribute___externally_visible__  1
@@ -35,8 +35,8 @@
 # define __GCC4_has_attribute___noclone__             1
 # define __GCC4_has_attribute___no_profile_instrument_function__ 0
 # define __GCC4_has_attribute___nonstring__           0
-# define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8)
-# define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9)
+# define __GCC4_has_attribute___no_sanitize_address__ 1
+# define __GCC4_has_attribute___no_sanitize_undefined__ 1
 # define __GCC4_has_attribute___fallthrough__         0
 #endif
 
-- 
cgit v1.2.3


From 745a32117b5a0799ce1dd28d5a74dc2b7bf37692 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 22 Jun 2021 14:04:47 -0400
Subject: sctp: add pad chunk and its make function and event table

This chunk is defined in rfc4820#section-3, and used to pad an
SCTP packet. The receiver must discard this chunk and continue
processing the rest of the chunks in the packet.

Add it now, as it will be bundled with a heartbeat chunk to probe
pmtu in the following patches.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  7 +++++++
 include/net/sctp/sm.h    |  1 +
 net/sctp/sm_make_chunk.c | 26 ++++++++++++++++++++++++++
 net/sctp/sm_statetable.c | 23 +++++++++++++++++++++++
 4 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index bb1926589693..a86e852507b3 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -98,6 +98,7 @@ enum sctp_cid {
 	SCTP_CID_I_FWD_TSN		= 0xC2,
 	SCTP_CID_ASCONF_ACK		= 0x80,
 	SCTP_CID_RECONF			= 0x82,
+	SCTP_CID_PAD			= 0x84,
 }; /* enum */
 
 
@@ -410,6 +411,12 @@ struct sctp_heartbeat_chunk {
 };
 
 
+/* PAD chunk could be bundled with heartbeat chunk to probe pmtu */
+struct sctp_pad_chunk {
+	struct sctp_chunkhdr uh;
+};
+
+
 /* For the abort and shutdown ACK we must carry the init tag in the
  * common header. Just the common header is all that is needed with a
  * chunk descriptor.
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index fd223c94589a..09c59154634d 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -230,6 +230,7 @@ struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
 					   const struct sctp_chunk *chunk,
 					   const void *payload,
 					   const size_t paylen);
+struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len);
 struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc,
 				      const struct sctp_chunk *chunk,
 				      __be16 cause_code, const void *payload,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5b44d228b6ca..e5d470cd7c40 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1218,6 +1218,32 @@ nodata:
 	return retval;
 }
 
+/* RFC4820 3. Padding Chunk (PAD)
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 0x84   |   Flags=0     |             Length            |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                                                               |
+ * \                         Padding Data                          /
+ * /                                                               \
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len)
+{
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC);
+	if (!retval)
+		return NULL;
+
+	skb_put_zero(retval->skb, len);
+	retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len);
+	retval->chunk_end = skb_tail_pointer(retval->skb);
+
+	return retval;
+}
+
 /* Create an Operation Error chunk with the specified space reserved.
  * This routine can be used for containing multiple causes in the chunk.
  */
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 88ea87f4f0e7..c82c4233ec6b 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -526,6 +526,26 @@ auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
 	TYPE_SCTP_AUTH,
 }; /*state_fn_t auth_chunk_event_table[][] */
 
+static const struct sctp_sm_table_entry
+pad_chunk_event_table[SCTP_STATE_NUM_STATES] = {
+	/* SCTP_STATE_CLOSED */
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
+	/* SCTP_STATE_COOKIE_WAIT */
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
+	/* SCTP_STATE_COOKIE_ECHOED */
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
+	/* SCTP_STATE_ESTABLISHED */
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
+	/* SCTP_STATE_SHUTDOWN_PENDING */
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
+	/* SCTP_STATE_SHUTDOWN_SENT */
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
+};	/* chunk pad */
+
 static const struct sctp_sm_table_entry
 chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
 	/* SCTP_STATE_CLOSED */
@@ -992,6 +1012,9 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup(
 
 	case SCTP_CID_AUTH:
 		return &auth_chunk_event_table[0][state];
+
+	case SCTP_CID_PAD:
+		return &pad_chunk_event_table[state];
 	}
 
 	return &chunk_event_table_unknown[state];
-- 
cgit v1.2.3


From 01d5d96542fd4e383da79593f8a3450995ce2257 Mon Sep 17 00:00:00 2001
From: Leah Rumancik <leah.rumancik@gmail.com>
Date: Tue, 18 May 2021 15:13:25 +0000
Subject: ext4: add discard/zeroout flags to journal flush

Add a flags argument to jbd2_journal_flush to enable discarding or
zero-filling the journal blocks while flushing the journal.

Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Link: https://lore.kernel.org/r/20210518151327.130198-1-leah.rumancik@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c      |   4 +-
 fs/ext4/ioctl.c      |   6 +--
 fs/ext4/super.c      |   6 +--
 fs/jbd2/journal.c    | 119 +++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/alloc.c     |   2 +-
 fs/ocfs2/journal.c   |   8 ++--
 include/linux/jbd2.h |   6 ++-
 7 files changed, 134 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 211acfba3af7..e1ff4eb3ccba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
 		journal = EXT4_JOURNAL(inode);
 		jbd2_journal_lock_updates(journal);
-		err = jbd2_journal_flush(journal);
+		err = jbd2_journal_flush(journal, 0);
 		jbd2_journal_unlock_updates(journal);
 
 		if (err)
@@ -6005,7 +6005,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	if (val)
 		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	else {
-		err = jbd2_journal_flush(journal);
+		err = jbd2_journal_flush(journal, 0);
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
 			percpu_up_write(&sbi->s_writepages_rwsem);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a96d6721cef9..93e9419825b8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -706,7 +706,7 @@ static long ext4_ioctl_group_add(struct file *file,
 	err = ext4_group_add(sb, input);
 	if (EXT4_SB(sb)->s_journal) {
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	}
 	if (err == 0)
@@ -884,7 +884,7 @@ setversion_out:
 		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
 		if (EXT4_SB(sb)->s_journal) {
 			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
 			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 		}
 		if (err == 0)
@@ -1027,7 +1027,7 @@ mext_out:
 		if (EXT4_SB(sb)->s_journal) {
 			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
 			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
 			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 		}
 		if (err == 0)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3b6203543607..ad3919dbd49e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5653,7 +5653,7 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
 		return 0;
 	}
 	jbd2_journal_lock_updates(journal);
-	err = jbd2_journal_flush(journal);
+	err = jbd2_journal_flush(journal, 0);
 	if (err < 0)
 		goto out;
 
@@ -5795,7 +5795,7 @@ static int ext4_freeze(struct super_block *sb)
 		 * Don't clear the needs_recovery flag if we failed to
 		 * flush the journal.
 		 */
-		error = jbd2_journal_flush(journal);
+		error = jbd2_journal_flush(journal, 0);
 		if (error < 0)
 			goto out;
 
@@ -6389,7 +6389,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		 * otherwise be livelocked...
 		 */
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 		if (err)
 			return err;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2dc944442802..3a2ed60ea8b7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1686,6 +1686,110 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 	write_unlock(&journal->j_state_lock);
 }
 
+/**
+ * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock)
+ * @journal: The journal to erase.
+ * @flags: A discard/zeroout request is sent for each physically contigous
+ *	region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or
+ *	JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation
+ *	to perform.
+ *
+ * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes
+ * will be explicitly written if no hardware offload is available, see
+ * blkdev_issue_zeroout for more details.
+ */
+static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
+{
+	int err = 0;
+	unsigned long block, log_offset; /* logical */
+	unsigned long long phys_block, block_start, block_stop; /* physical */
+	loff_t byte_start, byte_stop, byte_count;
+	struct request_queue *q = bdev_get_queue(journal->j_dev);
+
+	/* flags must be set to either discard or zeroout */
+	if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
+			((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+			(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+		return -EINVAL;
+
+	if (!q)
+		return -ENXIO;
+
+	if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	/*
+	 * lookup block mapping and issue discard/zeroout for each
+	 * contiguous region
+	 */
+	log_offset = be32_to_cpu(journal->j_superblock->s_first);
+	block_start =  ~0ULL;
+	for (block = log_offset; block < journal->j_total_len; block++) {
+		err = jbd2_journal_bmap(journal, block, &phys_block);
+		if (err) {
+			pr_err("JBD2: bad block at offset %lu", block);
+			return err;
+		}
+
+		if (block_start == ~0ULL) {
+			block_start = phys_block;
+			block_stop = block_start - 1;
+		}
+
+		/*
+		 * last block not contiguous with current block,
+		 * process last contiguous region and return to this block on
+		 * next loop
+		 */
+		if (phys_block != block_stop + 1) {
+			block--;
+		} else {
+			block_stop++;
+			/*
+			 * if this isn't the last block of journal,
+			 * no need to process now because next block may also
+			 * be part of this contiguous region
+			 */
+			if (block != journal->j_total_len - 1)
+				continue;
+		}
+
+		/*
+		 * end of contiguous region or this is last block of journal,
+		 * take care of the region
+		 */
+		byte_start = block_start * journal->j_blocksize;
+		byte_stop = block_stop * journal->j_blocksize;
+		byte_count = (block_stop - block_start + 1) *
+				journal->j_blocksize;
+
+		truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
+				byte_start, byte_stop);
+
+		if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
+			err = blkdev_issue_discard(journal->j_dev,
+					byte_start >> SECTOR_SHIFT,
+					byte_count >> SECTOR_SHIFT,
+					GFP_NOFS, 0);
+		} else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
+			err = blkdev_issue_zeroout(journal->j_dev,
+					byte_start >> SECTOR_SHIFT,
+					byte_count >> SECTOR_SHIFT,
+					GFP_NOFS, 0);
+		}
+
+		if (unlikely(err != 0)) {
+			pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
+					err, block_start, block_stop);
+			return err;
+		}
+
+		/* reset start and stop after processing a region */
+		block_start = ~0ULL;
+	}
+
+	return blkdev_issue_flush(journal->j_dev);
+}
 
 /**
  * jbd2_journal_update_sb_errno() - Update error in the journal.
@@ -2246,13 +2350,18 @@ EXPORT_SYMBOL(jbd2_journal_clear_features);
 /**
  * jbd2_journal_flush() - Flush journal
  * @journal: Journal to act on.
+ * @flags: optional operation on the journal blocks after the flush (see below)
  *
  * Flush all data for a given journal to disk and empty the journal.
  * Filesystems can use this when remounting readonly to ensure that
- * recovery does not need to happen on remount.
+ * recovery does not need to happen on remount. Optionally, a discard or zeroout
+ * can be issued on the journal blocks after flushing.
+ *
+ * flags:
+ *	JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks
+ *	JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks
  */
-
-int jbd2_journal_flush(journal_t *journal)
+int jbd2_journal_flush(journal_t *journal, unsigned int flags)
 {
 	int err = 0;
 	transaction_t *transaction = NULL;
@@ -2306,6 +2415,10 @@ int jbd2_journal_flush(journal_t *journal)
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
 	jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+
+	if (flags)
+		err = __jbd2_journal_erase(journal, flags);
+
 	mutex_unlock(&journal->j_checkpoint_mutex);
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(!journal->j_running_transaction);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e032f2e2c2c5..f1cc8258d34a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6018,7 +6018,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 	 * Then truncate log will be replayed resulting in cluster double free.
 	 */
 	jbd2_journal_lock_updates(journal->j_journal);
-	status = jbd2_journal_flush(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal, 0);
 	jbd2_journal_unlock_updates(journal->j_journal);
 	if (status < 0) {
 		mlog_errno(status);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4e589ce2fce6..4f15750aac5d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -308,7 +308,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 	}
 
 	jbd2_journal_lock_updates(journal->j_journal);
-	status = jbd2_journal_flush(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal, 0);
 	jbd2_journal_unlock_updates(journal->j_journal);
 	if (status < 0) {
 		up_write(&journal->j_trans_barrier);
@@ -1000,7 +1000,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	if (ocfs2_mount_local(osb)) {
 		jbd2_journal_lock_updates(journal->j_journal);
-		status = jbd2_journal_flush(journal->j_journal);
+		status = jbd2_journal_flush(journal->j_journal, 0);
 		jbd2_journal_unlock_updates(journal->j_journal);
 		if (status < 0)
 			mlog_errno(status);
@@ -1070,7 +1070,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 
 	if (replayed) {
 		jbd2_journal_lock_updates(journal->j_journal);
-		status = jbd2_journal_flush(journal->j_journal);
+		status = jbd2_journal_flush(journal->j_journal, 0);
 		jbd2_journal_unlock_updates(journal->j_journal);
 		if (status < 0)
 			mlog_errno(status);
@@ -1666,7 +1666,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 
 	/* wipe the journal */
 	jbd2_journal_lock_updates(journal);
-	status = jbd2_journal_flush(journal);
+	status = jbd2_journal_flush(journal, 0);
 	jbd2_journal_unlock_updates(journal);
 	if (status < 0)
 		mlog_errno(status);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index db0e1920cb12..8543233b0388 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1370,6 +1370,10 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 						 * mode */
 #define JBD2_FAST_COMMIT_ONGOING	0x100	/* Fast commit is ongoing */
 #define JBD2_FULL_COMMIT_ONGOING	0x200	/* Full commit is ongoing */
+#define JBD2_JOURNAL_FLUSH_DISCARD	0x0001
+#define JBD2_JOURNAL_FLUSH_ZEROOUT	0x0002
+#define JBD2_JOURNAL_FLUSH_VALID	(JBD2_JOURNAL_FLUSH_DISCARD | \
+					JBD2_JOURNAL_FLUSH_ZEROOUT)
 
 /*
  * Function declarations for the journaling transaction and buffer
@@ -1500,7 +1504,7 @@ extern int	 jbd2_journal_invalidatepage(journal_t *,
 				struct page *, unsigned int, unsigned int);
 extern int	 jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
 extern int	 jbd2_journal_stop(handle_t *);
-extern int	 jbd2_journal_flush (journal_t *);
+extern int	 jbd2_journal_flush(journal_t *journal, unsigned int flags);
 extern void	 jbd2_journal_lock_updates (journal_t *);
 extern void	 jbd2_journal_unlock_updates (journal_t *);
 
-- 
cgit v1.2.3


From c61404153eb683da9c35aad133131554861ed561 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 25 May 2021 11:39:35 -0700
Subject: f2fs: introduce FI_COMPRESS_RELEASED instead of using IMMUTABLE bit

Once we release compressed blocks, we used to set IMMUTABLE bit. But it turned
out it disallows every fs operations which we don't need for compression.

Let's just prevent writing data only.

Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/compress.c      |  3 ++-
 fs/f2fs/f2fs.h          |  6 ++++++
 fs/f2fs/file.c          | 18 ++++++++++++------
 include/linux/f2fs_fs.h |  1 +
 4 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index bec92ff5ee7d..1c3e98085591 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -928,7 +928,8 @@ static int __f2fs_cluster_blocks(struct inode *inode,
 		}
 
 		f2fs_bug_on(F2FS_I_SB(inode),
-			!compr && ret != cluster_size && !IS_IMMUTABLE(inode));
+			!compr && ret != cluster_size &&
+			!is_inode_flag_set(inode, FI_COMPRESS_RELEASED));
 	}
 fail:
 	f2fs_put_dnode(&dn);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c0bead0df66a..eaf57b5f3c4b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -706,6 +706,7 @@ enum {
 	FI_COMPRESS_CORRUPT,	/* indicate compressed cluster is corrupted */
 	FI_MMAP_FILE,		/* indicate file was mmapped */
 	FI_ENABLE_COMPRESS,	/* enable compression in "user" compression mode */
+	FI_COMPRESS_RELEASED,	/* compressed blocks were released */
 	FI_MAX,			/* max flag, never be used */
 };
 
@@ -2746,6 +2747,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
 	case FI_DATA_EXIST:
 	case FI_INLINE_DOTS:
 	case FI_PIN_FILE:
+	case FI_COMPRESS_RELEASED:
 		f2fs_mark_inode_dirty_sync(inode, true);
 	}
 }
@@ -2867,6 +2869,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
 		set_bit(FI_EXTRA_ATTR, fi->flags);
 	if (ri->i_inline & F2FS_PIN_FILE)
 		set_bit(FI_PIN_FILE, fi->flags);
+	if (ri->i_inline & F2FS_COMPRESS_RELEASED)
+		set_bit(FI_COMPRESS_RELEASED, fi->flags);
 }
 
 static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
@@ -2887,6 +2891,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
 		ri->i_inline |= F2FS_EXTRA_ATTR;
 	if (is_inode_flag_set(inode, FI_PIN_FILE))
 		ri->i_inline |= F2FS_PIN_FILE;
+	if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
+		ri->i_inline |= F2FS_COMPRESS_RELEASED;
 }
 
 static inline int f2fs_has_extra_attr(struct inode *inode)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 4a8c3128b5a5..4714925e1974 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -63,6 +63,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 	if (unlikely(IS_IMMUTABLE(inode)))
 		return VM_FAULT_SIGBUS;
 
+	if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
+		return VM_FAULT_SIGBUS;
+
 	if (unlikely(f2fs_cp_error(sbi))) {
 		err = -EIO;
 		goto err;
@@ -3420,7 +3423,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
 		goto out;
 	}
 
-	if (IS_IMMUTABLE(inode)) {
+	if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -3429,8 +3432,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
 	if (ret)
 		goto out;
 
-	F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL;
-	f2fs_set_inode_flags(inode);
+	set_inode_flag(inode, FI_COMPRESS_RELEASED);
 	inode->i_ctime = current_time(inode);
 	f2fs_mark_inode_dirty_sync(inode, true);
 
@@ -3585,7 +3587,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
 
 	inode_lock(inode);
 
-	if (!IS_IMMUTABLE(inode)) {
+	if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
 		ret = -EINVAL;
 		goto unlock_inode;
 	}
@@ -3630,8 +3632,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
 	up_write(&F2FS_I(inode)->i_mmap_sem);
 
 	if (ret >= 0) {
-		F2FS_I(inode)->i_flags &= ~F2FS_IMMUTABLE_FL;
-		f2fs_set_inode_flags(inode);
+		clear_inode_flag(inode, FI_COMPRESS_RELEASED);
 		inode->i_ctime = current_time(inode);
 		f2fs_mark_inode_dirty_sync(inode, true);
 	}
@@ -4249,6 +4250,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		goto unlock;
 	}
 
+	if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+		ret = -EPERM;
+		goto unlock;
+	}
+
 	ret = generic_write_checks(iocb, from);
 	if (ret > 0) {
 		bool preallocated = false;
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 5487a80617a3..f93000c3a127 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -229,6 +229,7 @@ struct f2fs_extent {
 #define F2FS_INLINE_DOTS	0x10	/* file having implicit dot dentries */
 #define F2FS_EXTRA_ATTR		0x20	/* file having extra attribute */
 #define F2FS_PIN_FILE		0x40	/* file should not be gced */
+#define F2FS_COMPRESS_RELEASED	0x80	/* file released compressed blocks */
 
 struct f2fs_inode {
 	__le16 i_mode;			/* file mode */
-- 
cgit v1.2.3


From 6ce19aff0b8cd386860855185c6cd79337fc4d2b Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Thu, 20 May 2021 19:51:50 +0800
Subject: f2fs: compress: add compress_inode to cache compressed blocks

Support to use address space of inner inode to cache compressed block,
in order to improve cache hit ratio of random read.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 Documentation/filesystems/f2fs.rst |   3 +
 fs/f2fs/compress.c                 | 168 ++++++++++++++++++++++++++++++++++++-
 fs/f2fs/data.c                     |  41 +++++++--
 fs/f2fs/debug.c                    |  13 +++
 fs/f2fs/f2fs.h                     |  71 +++++++++++++++-
 fs/f2fs/gc.c                       |   1 +
 fs/f2fs/inode.c                    |  21 ++++-
 fs/f2fs/node.c                     |  14 ++++
 fs/f2fs/node.h                     |   1 +
 fs/f2fs/segment.c                  |   6 +-
 fs/f2fs/super.c                    |  35 +++++++-
 include/linux/f2fs_fs.h            |   1 +
 12 files changed, 361 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index 992bf91eeec8..809c4d0a696f 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -289,6 +289,9 @@ compress_mode=%s	 Control file compression mode. This supports "fs" and "user"
 			 choosing the target file and the timing. The user can do manual
 			 compression/decompression on the compression enabled files using
 			 ioctls.
+compress_cache		 Support to use address space of a filesystem managed inode to
+			 cache compressed block, in order to improve cache hit ratio of
+			 random read.
 inlinecrypt		 When possible, encrypt/decrypt the contents of encrypted
 			 files using the blk-crypto framework rather than
 			 filesystem-layer encryption. This allows the use of
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 1c3e98085591..455561826c7d 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -12,9 +12,11 @@
 #include <linux/lzo.h>
 #include <linux/lz4.h>
 #include <linux/zstd.h>
+#include <linux/pagevec.h>
 
 #include "f2fs.h"
 #include "node.h"
+#include "segment.h"
 #include <trace/events/f2fs.h>
 
 static struct kmem_cache *cic_entry_slab;
@@ -736,7 +738,7 @@ out:
 	return ret;
 }
 
-static void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
 	struct f2fs_inode_info *fi = F2FS_I(dic->inode);
@@ -835,7 +837,8 @@ out_end_io:
  * page being waited on in the cluster, and if so, it decompresses the cluster
  * (or in the case of a failure, cleans up without actually decompressing).
  */
-void f2fs_end_read_compressed_page(struct page *page, bool failed)
+void f2fs_end_read_compressed_page(struct page *page, bool failed,
+						block_t blkaddr)
 {
 	struct decompress_io_ctx *dic =
 			(struct decompress_io_ctx *)page_private(page);
@@ -845,6 +848,9 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed)
 
 	if (failed)
 		WRITE_ONCE(dic->failed, true);
+	else if (blkaddr)
+		f2fs_cache_compressed_page(sbi, page,
+					dic->inode->i_ino, blkaddr);
 
 	if (atomic_dec_and_test(&dic->remaining_pages))
 		f2fs_decompress_cluster(dic);
@@ -1660,6 +1666,164 @@ void f2fs_put_page_dic(struct page *page)
 	f2fs_put_dic(dic);
 }
 
+const struct address_space_operations f2fs_compress_aops = {
+	.releasepage = f2fs_release_page,
+	.invalidatepage = f2fs_invalidate_page,
+};
+
+struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi)
+{
+	return sbi->compress_inode->i_mapping;
+}
+
+void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+	if (!sbi->compress_inode)
+		return;
+	invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr);
+}
+
+void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+						nid_t ino, block_t blkaddr)
+{
+	struct page *cpage;
+	int ret;
+
+	if (!test_opt(sbi, COMPRESS_CACHE))
+		return;
+
+	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ))
+		return;
+
+	if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE))
+		return;
+
+	cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr);
+	if (cpage) {
+		f2fs_put_page(cpage, 0);
+		return;
+	}
+
+	cpage = alloc_page(__GFP_NOWARN | __GFP_IO);
+	if (!cpage)
+		return;
+
+	ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi),
+						blkaddr, GFP_NOFS);
+	if (ret) {
+		f2fs_put_page(cpage, 0);
+		return;
+	}
+
+	set_page_private_data(cpage, ino);
+
+	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ))
+		goto out;
+
+	memcpy(page_address(cpage), page_address(page), PAGE_SIZE);
+	SetPageUptodate(cpage);
+out:
+	f2fs_put_page(cpage, 1);
+}
+
+bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+								block_t blkaddr)
+{
+	struct page *cpage;
+	bool hitted = false;
+
+	if (!test_opt(sbi, COMPRESS_CACHE))
+		return false;
+
+	cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi),
+				blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS);
+	if (cpage) {
+		if (PageUptodate(cpage)) {
+			atomic_inc(&sbi->compress_page_hit);
+			memcpy(page_address(page),
+				page_address(cpage), PAGE_SIZE);
+			hitted = true;
+		}
+		f2fs_put_page(cpage, 1);
+	}
+
+	return hitted;
+}
+
+void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct address_space *mapping = sbi->compress_inode->i_mapping;
+	struct pagevec pvec;
+	pgoff_t index = 0;
+	pgoff_t end = MAX_BLKADDR(sbi);
+
+	if (!mapping->nrpages)
+		return;
+
+	pagevec_init(&pvec);
+
+	do {
+		unsigned int nr_pages;
+		int i;
+
+		nr_pages = pagevec_lookup_range(&pvec, mapping,
+						&index, end - 1);
+		if (!nr_pages)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (page->index > end)
+				break;
+
+			lock_page(page);
+			if (page->mapping != mapping) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (ino != get_page_private_data(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			generic_error_remove_page(mapping, page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	} while (index < end);
+}
+
+int f2fs_init_compress_inode(struct f2fs_sb_info *sbi)
+{
+	struct inode *inode;
+
+	if (!test_opt(sbi, COMPRESS_CACHE))
+		return 0;
+
+	inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi));
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	sbi->compress_inode = inode;
+
+	sbi->compress_percent = COMPRESS_PERCENT;
+	sbi->compress_watermark = COMPRESS_WATERMARK;
+
+	atomic_set(&sbi->compress_page_hit, 0);
+
+	return 0;
+}
+
+void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi)
+{
+	if (!sbi->compress_inode)
+		return;
+	iput(sbi->compress_inode);
+	sbi->compress_inode = NULL;
+}
+
 int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
 {
 	dev_t dev = sbi->sb->s_bdev->bd_dev;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 37f4ab79d014..53af21ff6196 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -132,7 +132,7 @@ static void f2fs_finish_read_bio(struct bio *bio)
 
 		if (f2fs_is_compressed_page(page)) {
 			if (bio->bi_status)
-				f2fs_end_read_compressed_page(page, true);
+				f2fs_end_read_compressed_page(page, true, 0);
 			f2fs_put_page_dic(page);
 			continue;
 		}
@@ -228,15 +228,19 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
 	struct bio_vec *bv;
 	struct bvec_iter_all iter_all;
 	bool all_compressed = true;
+	block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector);
 
 	bio_for_each_segment_all(bv, ctx->bio, iter_all) {
 		struct page *page = bv->bv_page;
 
 		/* PG_error was set if decryption failed. */
 		if (f2fs_is_compressed_page(page))
-			f2fs_end_read_compressed_page(page, PageError(page));
+			f2fs_end_read_compressed_page(page, PageError(page),
+						blkaddr);
 		else
 			all_compressed = false;
+
+		blkaddr++;
 	}
 
 	/*
@@ -1352,9 +1356,11 @@ alloc:
 	old_blkaddr = dn->data_blkaddr;
 	f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
 				&sum, seg_type, NULL);
-	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
 		invalidate_mapping_pages(META_MAPPING(sbi),
 					old_blkaddr, old_blkaddr);
+		f2fs_invalidate_compress_page(sbi, old_blkaddr);
+	}
 	f2fs_update_data_blkaddr(dn, dn->data_blkaddr);
 
 	/*
@@ -2174,7 +2180,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 		goto out_put_dnode;
 	}
 
-	for (i = 0; i < dic->nr_cpages; i++) {
+	for (i = 0; i < cc->nr_cpages; i++) {
 		struct page *page = dic->cpages[i];
 		block_t blkaddr;
 		struct bio_post_read_ctx *ctx;
@@ -2182,6 +2188,14 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 		blkaddr = data_blkaddr(dn.inode, dn.node_page,
 						dn.ofs_in_node + i + 1);
 
+		f2fs_wait_on_block_writeback(inode, blkaddr);
+
+		if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
+			if (atomic_dec_and_test(&dic->remaining_pages))
+				f2fs_decompress_cluster(dic);
+			continue;
+		}
+
 		if (bio && (!page_is_mergeable(sbi, bio,
 					*last_block_in_bio, blkaddr) ||
 		    !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
@@ -2203,8 +2217,6 @@ submit_and_realloc:
 			}
 		}
 
-		f2fs_wait_on_block_writeback(inode, blkaddr);
-
 		if (bio_add_page(bio, page, blocksize, 0) < blocksize)
 			goto submit_and_realloc;
 
@@ -3618,6 +3630,13 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 
 	clear_page_private_gcing(page);
 
+	if (test_opt(sbi, COMPRESS_CACHE)) {
+		if (f2fs_compressed_file(inode))
+			f2fs_invalidate_compress_pages(sbi, inode->i_ino);
+		if (inode->i_ino == F2FS_COMPRESS_INO(sbi))
+			clear_page_private_data(page);
+	}
+
 	if (page_private_atomic(page))
 		return f2fs_drop_inmem_page(inode, page);
 
@@ -3635,6 +3654,16 @@ int f2fs_release_page(struct page *page, gfp_t wait)
 	if (page_private_atomic(page))
 		return 0;
 
+	if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) {
+		struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+		struct inode *inode = page->mapping->host;
+
+		if (f2fs_compressed_file(inode))
+			f2fs_invalidate_compress_pages(sbi, inode->i_ino);
+		if (inode->i_ino == F2FS_COMPRESS_INO(sbi))
+			clear_page_private_data(page);
+	}
+
 	clear_page_private_gcing(page);
 
 	detach_page_private(page);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index c03949a7ccff..833325038ef3 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -152,6 +152,12 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 		si->node_pages = NODE_MAPPING(sbi)->nrpages;
 	if (sbi->meta_inode)
 		si->meta_pages = META_MAPPING(sbi)->nrpages;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+	if (sbi->compress_inode) {
+		si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages;
+		si->compress_page_hit = atomic_read(&sbi->compress_page_hit);
+	}
+#endif
 	si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT];
 	si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT];
 	si->sits = MAIN_SEGS(sbi);
@@ -309,6 +315,12 @@ get_cache:
 
 		si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
 	}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+	if (sbi->compress_inode) {
+		unsigned npages = COMPRESS_MAPPING(sbi)->nrpages;
+		si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
+	}
+#endif
 }
 
 static int stat_show(struct seq_file *s, void *v)
@@ -476,6 +488,7 @@ static int stat_show(struct seq_file *s, void *v)
 			"volatile IO: %4d (Max. %4d)\n",
 			   si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
 			   si->vw_cnt, si->max_vw_cnt);
+		seq_printf(s, "  - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit);
 		seq_printf(s, "  - nodes: %4d in %4d\n",
 			   si->ndirty_node, si->node_pages);
 		seq_printf(s, "  - dents: %4d in dirs:%4d (%4d)\n",
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index bbc36828a9d9..561e52a07391 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -98,6 +98,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
 #define F2FS_MOUNT_ATGC			0x08000000
 #define F2FS_MOUNT_MERGE_CHECKPOINT	0x10000000
 #define	F2FS_MOUNT_GC_MERGE		0x20000000
+#define F2FS_MOUNT_COMPRESS_CACHE	0x40000000
 
 #define F2FS_OPTION(sbi)	((sbi)->mount_opt)
 #define clear_opt(sbi, option)	(F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -1374,6 +1375,37 @@ PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
 PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE);
 PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE);
 
+static inline unsigned long get_page_private_data(struct page *page)
+{
+	unsigned long data = page_private(page);
+
+	if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data))
+		return 0;
+	return data >> PAGE_PRIVATE_MAX;
+}
+
+static inline void set_page_private_data(struct page *page, unsigned long data)
+{
+	if (!PagePrivate(page)) {
+		get_page(page);
+		SetPagePrivate(page);
+	}
+	set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page));
+	page_private(page) |= data << PAGE_PRIVATE_MAX;
+}
+
+static inline void clear_page_private_data(struct page *page)
+{
+	page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1;
+	if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) {
+		set_page_private(page, 0);
+		if (PagePrivate(page)) {
+			ClearPagePrivate(page);
+			put_page(page);
+		}
+	}
+}
+
 /* For compression */
 enum compress_algorithm_type {
 	COMPRESS_LZO,
@@ -1388,6 +1420,9 @@ enum compress_flag {
 	COMPRESS_MAX_FLAG,
 };
 
+#define	COMPRESS_WATERMARK			20
+#define	COMPRESS_PERCENT			20
+
 #define COMPRESS_DATA_RESERVED_SIZE		4
 struct compress_data {
 	__le32 clen;			/* compressed data size */
@@ -1700,6 +1735,12 @@ struct f2fs_sb_info {
 	u64 compr_written_block;
 	u64 compr_saved_block;
 	u32 compr_new_inode;
+
+	/* For compressed block cache */
+	struct inode *compress_inode;		/* cache compressed blocks */
+	unsigned int compress_percent;		/* cache page percentage */
+	unsigned int compress_watermark;	/* cache page watermark */
+	atomic_t compress_page_hit;		/* cache hit count */
 #endif
 };
 
@@ -3671,7 +3712,8 @@ struct f2fs_stat_info {
 	unsigned int bimodal, avg_vblocks;
 	int util_free, util_valid, util_invalid;
 	int rsvd_segs, overp_segs;
-	int dirty_count, node_pages, meta_pages;
+	int dirty_count, node_pages, meta_pages, compress_pages;
+	int compress_page_hit;
 	int prefree_count, call_count, cp_count, bg_cp_count;
 	int tot_segs, node_segs, data_segs, free_segs, free_secs;
 	int bg_node_segs, bg_data_segs;
@@ -4007,7 +4049,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
 bool f2fs_is_compress_backend_ready(struct inode *inode);
 int f2fs_init_compress_mempool(void);
 void f2fs_destroy_compress_mempool(void);
-void f2fs_end_read_compressed_page(struct page *page, bool failed);
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic);
+void f2fs_end_read_compressed_page(struct page *page, bool failed,
+							block_t blkaddr);
 bool f2fs_cluster_is_empty(struct compress_ctx *cc);
 bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
 void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
@@ -4025,10 +4069,19 @@ void f2fs_put_page_dic(struct page *page);
 int f2fs_init_compress_ctx(struct compress_ctx *cc);
 void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
 void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
+int f2fs_init_compress_inode(struct f2fs_sb_info *sbi);
+void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi);
 int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi);
 void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi);
 int __init f2fs_init_compress_cache(void);
 void f2fs_destroy_compress_cache(void);
+struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi);
+void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+						nid_t ino, block_t blkaddr);
+bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+								block_t blkaddr);
+void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino);
 #define inc_compr_inode_stat(inode)					\
 	do {								\
 		struct f2fs_sb_info *sbi = F2FS_I_SB(inode);		\
@@ -4057,7 +4110,9 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
 }
 static inline int f2fs_init_compress_mempool(void) { return 0; }
 static inline void f2fs_destroy_compress_mempool(void) { }
-static inline void f2fs_end_read_compressed_page(struct page *page, bool failed)
+static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { }
+static inline void f2fs_end_read_compressed_page(struct page *page,
+						bool failed, block_t blkaddr)
 {
 	WARN_ON_ONCE(1);
 }
@@ -4065,10 +4120,20 @@ static inline void f2fs_put_page_dic(struct page *page)
 {
 	WARN_ON_ONCE(1);
 }
+static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { }
 static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
 static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
 static inline int __init f2fs_init_compress_cache(void) { return 0; }
 static inline void f2fs_destroy_compress_cache(void) { }
+static inline void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi,
+				block_t blkaddr) { }
+static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
+				struct page *page, nid_t ino, block_t blkaddr) { }
+static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
+				struct page *page, block_t blkaddr) { return false; }
+static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
+							nid_t ino) { }
 #define inc_compr_inode_stat(inode)		do { } while (0)
 #endif
 
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ab1c0123904f..da5947b30142 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1261,6 +1261,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
 	f2fs_put_page(mpage, 1);
 	invalidate_mapping_pages(META_MAPPING(fio.sbi),
 				fio.old_blkaddr, fio.old_blkaddr);
+	f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr);
 
 	set_page_dirty(fio.encrypted_page);
 	if (clear_page_dirty_for_io(fio.encrypted_page))
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cbda7ca3b3be..9141147b5bb0 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -18,6 +18,10 @@
 
 #include <trace/events/f2fs.h>
 
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+extern const struct address_space_operations f2fs_compress_aops;
+#endif
+
 void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
 {
 	if (is_inode_flag_set(inode, FI_NEW_INODE))
@@ -494,6 +498,11 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
 	if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
 		goto make_now;
 
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+	if (ino == F2FS_COMPRESS_INO(sbi))
+		goto make_now;
+#endif
+
 	ret = do_read_inode(inode);
 	if (ret)
 		goto bad_inode;
@@ -504,6 +513,12 @@ make_now:
 	} else if (ino == F2FS_META_INO(sbi)) {
 		inode->i_mapping->a_ops = &f2fs_meta_aops;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+	} else if (ino == F2FS_COMPRESS_INO(sbi)) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+		inode->i_mapping->a_ops = &f2fs_compress_aops;
+#endif
+		mapping_set_gfp_mask(inode->i_mapping,
+			GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
 	} else if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &f2fs_file_inode_operations;
 		inode->i_fop = &f2fs_file_operations;
@@ -723,8 +738,12 @@ void f2fs_evict_inode(struct inode *inode)
 	trace_f2fs_evict_inode(inode);
 	truncate_inode_pages_final(&inode->i_data);
 
+	if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode))
+		f2fs_invalidate_compress_pages(sbi, inode->i_ino);
+
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
-			inode->i_ino == F2FS_META_INO(sbi))
+			inode->i_ino == F2FS_META_INO(sbi) ||
+			inode->i_ino == F2FS_COMPRESS_INO(sbi))
 		goto out_clear;
 
 	f2fs_bug_on(sbi, get_dirty_pages(inode));
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3a8f7afa5059..dd611efa8aa4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -97,6 +97,20 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
 		mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
 				sizeof(struct discard_cmd)) >> PAGE_SHIFT;
 		res = mem_size < (avail_ram * nm_i->ram_thresh / 100);
+	} else if (type == COMPRESS_PAGE) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+		unsigned long free_ram = val.freeram;
+
+		/*
+		 * free memory is lower than watermark or cached page count
+		 * exceed threshold, deny caching compress page.
+		 */
+		res = (free_ram > avail_ram * sbi->compress_watermark / 100) &&
+			(COMPRESS_MAPPING(sbi)->nrpages <
+			 free_ram * sbi->compress_percent / 100);
+#else
+		res = false;
+#endif
 	} else {
 		if (!sbi->sb->s_bdi->wb.dirty_exceeded)
 			return true;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d85e8659cfda..84d45385d1f2 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -148,6 +148,7 @@ enum mem_type {
 	EXTENT_CACHE,	/* indicates extent cache */
 	INMEM_PAGES,	/* indicates inmemory pages */
 	DISCARD_CACHE,	/* indicates memory of cached discard cmds */
+	COMPRESS_PAGE,	/* indicates memory of cached compressed pages */
 	BASE_CHECK,	/* check kernel status */
 };
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 54847eebc5ca..85fae9f3624d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2322,6 +2322,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
 		return;
 
 	invalidate_mapping_pages(META_MAPPING(sbi), addr, addr);
+	f2fs_invalidate_compress_page(sbi, addr);
 
 	/* add it into sit main buffer */
 	down_write(&sit_i->sentry_lock);
@@ -3469,9 +3470,11 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
 reallocate:
 	f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
 			&fio->new_blkaddr, sum, type, fio);
-	if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
+	if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) {
 		invalidate_mapping_pages(META_MAPPING(fio->sbi),
 					fio->old_blkaddr, fio->old_blkaddr);
+		f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr);
+	}
 
 	/* writeout dirty page into bdev */
 	f2fs_submit_page_write(fio);
@@ -3661,6 +3664,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
 		invalidate_mapping_pages(META_MAPPING(sbi),
 					old_blkaddr, old_blkaddr);
+		f2fs_invalidate_compress_page(sbi, old_blkaddr);
 		if (!from_gc)
 			update_segment_mtime(sbi, old_blkaddr, 0);
 		update_sit_entry(sbi, old_blkaddr, -1);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3e0e34b4680c..d70122da3f99 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -150,6 +150,7 @@ enum {
 	Opt_compress_extension,
 	Opt_compress_chksum,
 	Opt_compress_mode,
+	Opt_compress_cache,
 	Opt_atgc,
 	Opt_gc_merge,
 	Opt_nogc_merge,
@@ -224,6 +225,7 @@ static match_table_t f2fs_tokens = {
 	{Opt_compress_extension, "compress_extension=%s"},
 	{Opt_compress_chksum, "compress_chksum"},
 	{Opt_compress_mode, "compress_mode=%s"},
+	{Opt_compress_cache, "compress_cache"},
 	{Opt_atgc, "atgc"},
 	{Opt_gc_merge, "gc_merge"},
 	{Opt_nogc_merge, "nogc_merge"},
@@ -1066,12 +1068,16 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 			}
 			kfree(name);
 			break;
+		case Opt_compress_cache:
+			set_opt(sbi, COMPRESS_CACHE);
+			break;
 #else
 		case Opt_compress_algorithm:
 		case Opt_compress_log_size:
 		case Opt_compress_extension:
 		case Opt_compress_chksum:
 		case Opt_compress_mode:
+		case Opt_compress_cache:
 			f2fs_info(sbi, "compression options not supported");
 			break;
 #endif
@@ -1409,6 +1415,8 @@ static void f2fs_put_super(struct super_block *sb)
 
 	f2fs_bug_on(sbi, sbi->fsync_node_num);
 
+	f2fs_destroy_compress_inode(sbi);
+
 	iput(sbi->node_inode);
 	sbi->node_inode = NULL;
 
@@ -1678,6 +1686,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
 		seq_printf(seq, ",compress_mode=%s", "fs");
 	else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER)
 		seq_printf(seq, ",compress_mode=%s", "user");
+
+	if (test_opt(sbi, COMPRESS_CACHE))
+		seq_puts(seq, ",compress_cache");
 }
 #endif
 
@@ -1959,6 +1970,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
 	bool no_io_align = !F2FS_IO_ALIGNED(sbi);
 	bool no_atgc = !test_opt(sbi, ATGC);
+	bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
 	bool checkpoint_changed;
 #ifdef CONFIG_QUOTA
 	int i, j;
@@ -2056,6 +2068,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
+	if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) {
+		err = -EINVAL;
+		f2fs_warn(sbi, "switch compress_cache option is not allowed");
+		goto restore_opts;
+	}
+
 	if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
 		err = -EINVAL;
 		f2fs_warn(sbi, "disabling checkpoint not compatible with read-only");
@@ -3965,10 +3983,14 @@ try_onemore:
 		goto free_node_inode;
 	}
 
-	err = f2fs_register_sysfs(sbi);
+	err = f2fs_init_compress_inode(sbi);
 	if (err)
 		goto free_root_inode;
 
+	err = f2fs_register_sysfs(sbi);
+	if (err)
+		goto free_compress_inode;
+
 #ifdef CONFIG_QUOTA
 	/* Enable quota usage during mount */
 	if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) {
@@ -4109,6 +4131,8 @@ free_meta:
 	/* evict some inodes being cached by GC */
 	evict_inodes(sb);
 	f2fs_unregister_sysfs(sbi);
+free_compress_inode:
+	f2fs_destroy_compress_inode(sbi);
 free_root_inode:
 	dput(sb->s_root);
 	sb->s_root = NULL;
@@ -4187,6 +4211,15 @@ static void kill_f2fs_super(struct super_block *sb)
 		f2fs_stop_gc_thread(sbi);
 		f2fs_stop_discard_thread(sbi);
 
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+		/*
+		 * latter evict_inode() can bypass checking and invalidating
+		 * compress inode cache.
+		 */
+		if (test_opt(sbi, COMPRESS_CACHE))
+			truncate_inode_pages_final(COMPRESS_MAPPING(sbi));
+#endif
+
 		if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
 				!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
 			struct cp_control cpc = {
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index f93000c3a127..d445150c5350 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -34,6 +34,7 @@
 #define F2FS_ROOT_INO(sbi)	((sbi)->root_ino_num)
 #define F2FS_NODE_INO(sbi)	((sbi)->node_ino_num)
 #define F2FS_META_INO(sbi)	((sbi)->meta_ino_num)
+#define F2FS_COMPRESS_INO(sbi)	(NM_I(sbi)->max_nid)
 
 #define F2FS_MAX_QUOTAS		3
 
-- 
cgit v1.2.3


From 1806239dec0dacde373f0b53f076319f6c6d95cb Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 19 Jun 2021 15:36:30 +0200
Subject: ieee80211: add the value for Category '6' in "rtw_ieee80211_category"

Preparation work for removing the "enum rtw_ieee80211_category" in
"drivers/staging/rtl8188eu/include/ieee80211.h" and
"drivers/staging/rtl8723bs/include/ieee80211.h".

This enum is similar to "enum ieee80211_category" from
"include/linux/ieee80211.h". However it defines the value '6' as
RTW_WLAN_CATEGORY_FT.

So add a corresponding value in "ieee80211_category"

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/66be0187869bd7dae1c0b0785a32db695ee9872e.1624108556.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 2967437f1b11..67f3e51e7ecc 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2933,6 +2933,7 @@ enum ieee80211_category {
 	WLAN_CATEGORY_BACK = 3,
 	WLAN_CATEGORY_PUBLIC = 4,
 	WLAN_CATEGORY_RADIO_MEASUREMENT = 5,
+	WLAN_CATEGORY_FAST_BBS_TRANSITION = 6,
 	WLAN_CATEGORY_HT = 7,
 	WLAN_CATEGORY_SA_QUERY = 8,
 	WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION = 9,
-- 
cgit v1.2.3


From 7da70d6cdf0dbc2c62e4a5759db9b63ef8d90c32 Mon Sep 17 00:00:00 2001
From: Krishnanand Prabhu <krishnanand.prabhu@intel.com>
Date: Fri, 18 Jun 2021 13:41:28 +0300
Subject: ieee80211: define timing measurement in extended capabilities IE

Define the bit used for timing measurement support in extended
capabilities IE, used for time synchronization.

Signed-off-by: Krishnanand Prabhu <krishnanand.prabhu@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20210618133832.b75f40765538.I92b50e43e29272c97d17ed5f37f216f4caf0f205@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 67f3e51e7ecc..0a0aaa2d5d9e 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -9,7 +9,7 @@
  * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
  * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
- * Copyright (c) 2018 - 2020 Intel Corporation
+ * Copyright (c) 2018 - 2021 Intel Corporation
  */
 
 #ifndef LINUX_IEEE80211_H
@@ -3111,6 +3111,11 @@ enum ieee80211_tdls_actioncode {
  */
 #define WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT	BIT(6)
 
+/* Timing Measurement protocol for time sync is set in the 7th bit of 3rd byte
+ * of the @WLAN_EID_EXT_CAPABILITY information element
+ */
+#define WLAN_EXT_CAPA3_TIMING_MEASUREMENT_SUPPORT	BIT(7)
+
 /* TDLS capabilities in the 4th byte of @WLAN_EID_EXT_CAPABILITY */
 #define WLAN_EXT_CAPA4_TDLS_BUFFER_STA		BIT(4)
 #define WLAN_EXT_CAPA4_TDLS_PEER_PSM		BIT(5)
-- 
cgit v1.2.3


From 9c7c637050b42b6e368bb39b8d0edff728268341 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 18 Jun 2021 13:41:38 +0300
Subject: ieee80211: add defines for HE PHY cap byte 10

One bit out of the previously completely reserved byte 10 in
the PHY capabilities is used since 802.11ax D7.0, add a new
define for it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20210618133832.c026feb3873d.I380f52a05ddb4153bc77ff7f276a3484819f69b2@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 0a0aaa2d5d9e..a6730072d13a 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2179,6 +2179,8 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 #define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_RESERVED		0xc0
 #define IEEE80211_HE_PHY_CAP9_NOMIMAL_PKT_PADDING_MASK			0xc0
 
+#define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF			0x01
+
 /* 802.11ax HE TX/RX MCS NSS Support  */
 #define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS			(3)
 #define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS			(6)
-- 
cgit v1.2.3


From ab5df7b953d87efddba4f9df83862f7dcb39b8d5 Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jcrouse@codeaurora.org>
Date: Thu, 10 Jun 2021 14:44:10 -0700
Subject: iommu/arm-smmu-qcom: Add an adreno-smmu-priv callback to get
 pagefault info

Add a callback in adreno-smmu-priv to read interesting SMMU
registers to provide an opportunity for a richer debug experience
in the GPU driver.

Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
Signed-off-by: Rob Clark <robdclark@chromium.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20210610214431.539029-3-robdclark@gmail.com
Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 17 ++++++++++++++++
 drivers/iommu/arm/arm-smmu/arm-smmu.h      |  2 ++
 include/linux/adreno-smmu-priv.h           | 31 +++++++++++++++++++++++++++++-
 3 files changed, 49 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 98b3a1c2a181..b2e31ea84128 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -32,6 +32,22 @@ static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx,
 	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
 }
 
+static void qcom_adreno_smmu_get_fault_info(const void *cookie,
+		struct adreno_smmu_fault_info *info)
+{
+	struct arm_smmu_domain *smmu_domain = (void *)cookie;
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+	info->fsr = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_FSR);
+	info->fsynr0 = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_FSYNR0);
+	info->fsynr1 = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_FSYNR1);
+	info->far = arm_smmu_cb_readq(smmu, cfg->cbndx, ARM_SMMU_CB_FAR);
+	info->cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(cfg->cbndx));
+	info->ttbr0 = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_TTBR0);
+	info->contextidr = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_CONTEXTIDR);
+}
+
 #define QCOM_ADRENO_SMMU_GPU_SID 0
 
 static bool qcom_adreno_smmu_is_gpu_device(struct device *dev)
@@ -156,6 +172,7 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
 	priv->cookie = smmu_domain;
 	priv->get_ttbr1_cfg = qcom_adreno_smmu_get_ttbr1_cfg;
 	priv->set_ttbr0_cfg = qcom_adreno_smmu_set_ttbr0_cfg;
+	priv->get_fault_info = qcom_adreno_smmu_get_fault_info;
 
 	return 0;
 }
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index c31a59d35c64..84c21c4b0691 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -224,6 +224,8 @@ enum arm_smmu_cbar_type {
 #define ARM_SMMU_CB_FSYNR0		0x68
 #define ARM_SMMU_FSYNR0_WNR		BIT(4)
 
+#define ARM_SMMU_CB_FSYNR1		0x6c
+
 #define ARM_SMMU_CB_S1_TLBIVA		0x600
 #define ARM_SMMU_CB_S1_TLBIASID		0x610
 #define ARM_SMMU_CB_S1_TLBIVAL		0x620
diff --git a/include/linux/adreno-smmu-priv.h b/include/linux/adreno-smmu-priv.h
index a889f28afb42..53fe32fb9214 100644
--- a/include/linux/adreno-smmu-priv.h
+++ b/include/linux/adreno-smmu-priv.h
@@ -8,6 +8,32 @@
 
 #include <linux/io-pgtable.h>
 
+/**
+ * struct adreno_smmu_fault_info - container for key fault information
+ *
+ * @far: The faulting IOVA from ARM_SMMU_CB_FAR
+ * @ttbr0: The current TTBR0 pagetable from ARM_SMMU_CB_TTBR0
+ * @contextidr: The value of ARM_SMMU_CB_CONTEXTIDR
+ * @fsr: The fault status from ARM_SMMU_CB_FSR
+ * @fsynr0: The value of FSYNR0 from ARM_SMMU_CB_FSYNR0
+ * @fsynr1: The value of FSYNR1 from ARM_SMMU_CB_FSYNR0
+ * @cbfrsynra: The value of CBFRSYNRA from ARM_SMMU_GR1_CBFRSYNRA(idx)
+ *
+ * This struct passes back key page fault information to the GPU driver
+ * through the get_fault_info function pointer.
+ * The GPU driver can use this information to print informative
+ * log messages and provide deeper GPU specific insight into the fault.
+ */
+struct adreno_smmu_fault_info {
+	u64 far;
+	u64 ttbr0;
+	u32 contextidr;
+	u32 fsr;
+	u32 fsynr0;
+	u32 fsynr1;
+	u32 cbfrsynra;
+};
+
 /**
  * struct adreno_smmu_priv - private interface between adreno-smmu and GPU
  *
@@ -17,6 +43,8 @@
  * @set_ttbr0_cfg: Set the TTBR0 config for the GPUs context bank.  A
  *                 NULL config disables TTBR0 translation, otherwise
  *                 TTBR0 translation is enabled with the specified cfg
+ * @get_fault_info: Called by the GPU fault handler to get information about
+ *                  the fault
  *
  * The GPU driver (drm/msm) and adreno-smmu work together for controlling
  * the GPU's SMMU instance.  This is by necessity, as the GPU is directly
@@ -31,6 +59,7 @@ struct adreno_smmu_priv {
     const void *cookie;
     const struct io_pgtable_cfg *(*get_ttbr1_cfg)(const void *cookie);
     int (*set_ttbr0_cfg)(const void *cookie, const struct io_pgtable_cfg *cfg);
+    void (*get_fault_info)(const void *cookie, struct adreno_smmu_fault_info *info);
 };
 
-#endif /* __ADRENO_SMMU_PRIV_H */
\ No newline at end of file
+#endif /* __ADRENO_SMMU_PRIV_H */
-- 
cgit v1.2.3


From ba6014a4e480c3c2b169438c47273a113c35ba4e Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Thu, 10 Jun 2021 14:44:12 -0700
Subject: iommu/arm-smmu-qcom: Add stall support

Add, via the adreno-smmu-priv interface, a way for the GPU to request
the SMMU to stall translation on faults, and then later resume the
translation, either retrying or terminating the current translation.

This will be used on the GPU side to "freeze" the GPU while we snapshot
useful state for devcoredump.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Acked-by: Jordan Crouse <jordan@cosmicpenguin.net>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20210610214431.539029-5-robdclark@gmail.com
Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 33 ++++++++++++++++++++++++++++++
 include/linux/adreno-smmu-priv.h           |  7 +++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index b2e31ea84128..61fc645c1325 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -13,6 +13,7 @@ struct qcom_smmu {
 	struct arm_smmu_device smmu;
 	bool bypass_quirk;
 	u8 bypass_cbndx;
+	u32 stall_enabled;
 };
 
 static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu)
@@ -23,12 +24,17 @@ static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu)
 static void qcom_adreno_smmu_write_sctlr(struct arm_smmu_device *smmu, int idx,
 		u32 reg)
 {
+	struct qcom_smmu *qsmmu = to_qcom_smmu(smmu);
+
 	/*
 	 * On the GPU device we want to process subsequent transactions after a
 	 * fault to keep the GPU from hanging
 	 */
 	reg |= ARM_SMMU_SCTLR_HUPCF;
 
+	if (qsmmu->stall_enabled & BIT(idx))
+		reg |= ARM_SMMU_SCTLR_CFCFG;
+
 	arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
 }
 
@@ -48,6 +54,31 @@ static void qcom_adreno_smmu_get_fault_info(const void *cookie,
 	info->contextidr = arm_smmu_cb_read(smmu, cfg->cbndx, ARM_SMMU_CB_CONTEXTIDR);
 }
 
+static void qcom_adreno_smmu_set_stall(const void *cookie, bool enabled)
+{
+	struct arm_smmu_domain *smmu_domain = (void *)cookie;
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	struct qcom_smmu *qsmmu = to_qcom_smmu(smmu_domain->smmu);
+
+	if (enabled)
+		qsmmu->stall_enabled |= BIT(cfg->cbndx);
+	else
+		qsmmu->stall_enabled &= ~BIT(cfg->cbndx);
+}
+
+static void qcom_adreno_smmu_resume_translation(const void *cookie, bool terminate)
+{
+	struct arm_smmu_domain *smmu_domain = (void *)cookie;
+	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	u32 reg = 0;
+
+	if (terminate)
+		reg |= ARM_SMMU_RESUME_TERMINATE;
+
+	arm_smmu_cb_write(smmu, cfg->cbndx, ARM_SMMU_CB_RESUME, reg);
+}
+
 #define QCOM_ADRENO_SMMU_GPU_SID 0
 
 static bool qcom_adreno_smmu_is_gpu_device(struct device *dev)
@@ -173,6 +204,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain,
 	priv->get_ttbr1_cfg = qcom_adreno_smmu_get_ttbr1_cfg;
 	priv->set_ttbr0_cfg = qcom_adreno_smmu_set_ttbr0_cfg;
 	priv->get_fault_info = qcom_adreno_smmu_get_fault_info;
+	priv->set_stall = qcom_adreno_smmu_set_stall;
+	priv->resume_translation = qcom_adreno_smmu_resume_translation;
 
 	return 0;
 }
diff --git a/include/linux/adreno-smmu-priv.h b/include/linux/adreno-smmu-priv.h
index 53fe32fb9214..c637e0997f6d 100644
--- a/include/linux/adreno-smmu-priv.h
+++ b/include/linux/adreno-smmu-priv.h
@@ -45,6 +45,11 @@ struct adreno_smmu_fault_info {
  *                 TTBR0 translation is enabled with the specified cfg
  * @get_fault_info: Called by the GPU fault handler to get information about
  *                  the fault
+ * @set_stall:     Configure whether stall on fault (CFCFG) is enabled.  Call
+ *                 before set_ttbr0_cfg().  If stalling on fault is enabled,
+ *                 the GPU driver must call resume_translation()
+ * @resume_translation: Resume translation after a fault
+ *
  *
  * The GPU driver (drm/msm) and adreno-smmu work together for controlling
  * the GPU's SMMU instance.  This is by necessity, as the GPU is directly
@@ -60,6 +65,8 @@ struct adreno_smmu_priv {
     const struct io_pgtable_cfg *(*get_ttbr1_cfg)(const void *cookie);
     int (*set_ttbr0_cfg)(const void *cookie, const struct io_pgtable_cfg *cfg);
     void (*get_fault_info)(const void *cookie, struct adreno_smmu_fault_info *info);
+    void (*set_stall)(const void *cookie, bool enabled);
+    void (*resume_translation)(const void *cookie, bool terminate);
 };
 
 #endif /* __ADRENO_SMMU_PRIV_H */
-- 
cgit v1.2.3


From 371071131cd1032c1e9172c51234a2a324841cab Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Jun 2021 14:02:11 +0200
Subject: x86/fpu: Use pkru_write_default() in copy_init_fpstate_to_fpregs()

There is no point in using copy_init_pkru_to_fpregs() which in turn calls
write_pkru(). write_pkru() tries to fiddle with the task's xstate buffer
for nothing because the XRSTOR[S](init_fpstate) just cleared the xfeature
flag in the xstate header which makes get_xsave_addr() fail.

It's a useless exercise anyway because the reinitialization activates the
FPU so before the task's xstate buffer can be used again a XRSTOR[S] must
happen which in turn dumps the PKRU value.

Get rid of the now unused copy_init_pkru_to_fpregs().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20210623121455.732508792@linutronix.de
---
 arch/x86/include/asm/pkeys.h |  1 -
 arch/x86/kernel/fpu/core.c   |  3 +--
 arch/x86/mm/pkeys.c          | 17 -----------------
 include/linux/pkeys.h        |  4 ----
 4 files changed, 1 insertion(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 4128f647c755..5c7bcaa79623 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -124,7 +124,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
 extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val);
-extern void copy_init_pkru_to_fpregs(void);
 
 static inline int vma_pkey(struct vm_area_struct *vma)
 {
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 3866954354a4..fedadcb04ba2 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -311,8 +311,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
 	else
 		frstor(&init_fpstate.fsave);
 
-	if (cpu_feature_enabled(X86_FEATURE_OSPKE))
-		copy_init_pkru_to_fpregs();
+	pkru_write_default();
 }
 
 /*
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index a02cfcf02d0d..fb171a5d7f33 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -10,7 +10,6 @@
 
 #include <asm/cpufeature.h>             /* boot_cpu_has, ...            */
 #include <asm/mmu_context.h>            /* vma_pkey()                   */
-#include <asm/pkru.h>			/* read/write_pkru()		*/
 
 int __execute_only_pkey(struct mm_struct *mm)
 {
@@ -125,22 +124,6 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
 		      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
 		      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
 
-/*
- * Called from the FPU code when creating a fresh set of FPU
- * registers.  This is called from a very specific context where
- * we know the FPU registers are safe for use and we can use PKRU
- * directly.
- */
-void copy_init_pkru_to_fpregs(void)
-{
-	u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
-	/*
-	 * Override the PKRU state that came from 'init_fpstate'
-	 * with the baseline from the process.
-	 */
-	write_pkru(init_pkru_value_snapshot);
-}
-
 static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
 			     size_t count, loff_t *ppos)
 {
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
index 2955ba976048..6beb26b7151d 100644
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -44,10 +44,6 @@ static inline bool arch_pkeys_enabled(void)
 	return false;
 }
 
-static inline void copy_init_pkru_to_fpregs(void)
-{
-}
-
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 #endif /* _LINUX_PKEYS_H */
-- 
cgit v1.2.3


From 61d1961adf4bd57d1b2c6d94d97323263c470cb2 Mon Sep 17 00:00:00 2001
From: Stephan Gerhold <stephan@gerhold.net>
Date: Fri, 18 Jun 2021 13:15:54 +0200
Subject: soc: qcom: smem_state: Add devm_qcom_smem_state_get()

It is easy to forget to call qcom_smem_state_put() after
a qcom_smem_state_get(). Introduce a devm_qcom_smem_state_get()
helper function that automates this so that qcom_smem_state_put()
is automatically called when a device is removed.

Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20210618111556.53416-1-stephan@gerhold.net
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/soc/qcom/smem_state.c       | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/smem_state.h |  8 ++++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/smem_state.c b/drivers/soc/qcom/smem_state.c
index d2b558438deb..31faf4aa868e 100644
--- a/drivers/soc/qcom/smem_state.c
+++ b/drivers/soc/qcom/smem_state.c
@@ -151,6 +151,42 @@ void qcom_smem_state_put(struct qcom_smem_state *state)
 }
 EXPORT_SYMBOL_GPL(qcom_smem_state_put);
 
+static void devm_qcom_smem_state_release(struct device *dev, void *res)
+{
+	qcom_smem_state_put(*(struct qcom_smem_state **)res);
+}
+
+/**
+ * devm_qcom_smem_state_get() - acquire handle to a devres managed state
+ * @dev:	client device pointer
+ * @con_id:	name of the state to lookup
+ * @bit:	flags from the state reference, indicating which bit's affected
+ *
+ * Returns handle to the state, or ERR_PTR(). qcom_smem_state_put() is called
+ * automatically when @dev is removed.
+ */
+struct qcom_smem_state *devm_qcom_smem_state_get(struct device *dev,
+						 const char *con_id,
+						 unsigned *bit)
+{
+	struct qcom_smem_state **ptr, *state;
+
+	ptr = devres_alloc(devm_qcom_smem_state_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	state = qcom_smem_state_get(dev, con_id, bit);
+	if (!IS_ERR(state)) {
+		*ptr = state;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return state;
+}
+EXPORT_SYMBOL_GPL(devm_qcom_smem_state_get);
+
 /**
  * qcom_smem_state_register() - register a new state
  * @of_node:	of_node used for matching client lookups
diff --git a/include/linux/soc/qcom/smem_state.h b/include/linux/soc/qcom/smem_state.h
index 63ad8cddad14..652c0158baac 100644
--- a/include/linux/soc/qcom/smem_state.h
+++ b/include/linux/soc/qcom/smem_state.h
@@ -14,6 +14,7 @@ struct qcom_smem_state_ops {
 #ifdef CONFIG_QCOM_SMEM_STATE
 
 struct qcom_smem_state *qcom_smem_state_get(struct device *dev, const char *con_id, unsigned *bit);
+struct qcom_smem_state *devm_qcom_smem_state_get(struct device *dev, const char *con_id, unsigned *bit);
 void qcom_smem_state_put(struct qcom_smem_state *);
 
 int qcom_smem_state_update_bits(struct qcom_smem_state *state, u32 mask, u32 value);
@@ -29,6 +30,13 @@ static inline struct qcom_smem_state *qcom_smem_state_get(struct device *dev,
 	return ERR_PTR(-EINVAL);
 }
 
+static inline struct qcom_smem_state *devm_qcom_smem_state_get(struct device *dev,
+							       const char *con_id,
+							       unsigned *bit)
+{
+	return ERR_PTR(-EINVAL);
+}
+
 static inline void qcom_smem_state_put(struct qcom_smem_state *state)
 {
 }
-- 
cgit v1.2.3


From 5163ab505e489400b4738b2a5547ec83d2dff7bb Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 17 Jun 2021 15:28:10 +0800
Subject: crypto: api - Move crypto attr definitions out of crypto.h

The definitions for crypto_attr-related types and enums are not
needed by most Crypto API users.  This patch moves them out of
crypto.h and into algapi.h/internal.h depending on the extent of
their use.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/internal.h       | 12 ++++++++++++
 include/crypto/algapi.h |  9 +++++++++
 include/linux/crypto.h  | 21 ---------------------
 3 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/internal.h b/crypto/internal.h
index 976ec9dfc76d..f00869af689f 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -29,6 +29,18 @@ struct crypto_larval {
 	u32 mask;
 };
 
+enum {
+	CRYPTOA_UNSPEC,
+	CRYPTOA_ALG,
+	CRYPTOA_TYPE,
+	__CRYPTOA_MAX,
+};
+
+#define CRYPTOA_MAX (__CRYPTOA_MAX - 1)
+
+/* Maximum number of (rtattr) parameters for each template. */
+#define CRYPTO_MAX_ATTRS 32
+
 extern struct list_head crypto_alg_list;
 extern struct rw_semaphore crypto_alg_sem;
 extern struct blocking_notifier_head crypto_chain;
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 41d42e649da4..5f6841c73e5a 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -96,6 +96,15 @@ struct scatter_walk {
 	unsigned int offset;
 };
 
+struct crypto_attr_alg {
+	char name[CRYPTO_MAX_ALG_NAME];
+};
+
+struct crypto_attr_type {
+	u32 type;
+	u32 mask;
+};
+
 void crypto_mod_put(struct crypto_alg *alg);
 
 int crypto_register_template(struct crypto_template *tmpl);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3b9263d6122f..855869e1fd32 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -643,27 +643,6 @@ struct crypto_comp {
 	struct crypto_tfm base;
 };
 
-enum {
-	CRYPTOA_UNSPEC,
-	CRYPTOA_ALG,
-	CRYPTOA_TYPE,
-	__CRYPTOA_MAX,
-};
-
-#define CRYPTOA_MAX (__CRYPTOA_MAX - 1)
-
-/* Maximum number of (rtattr) parameters for each template. */
-#define CRYPTO_MAX_ATTRS 32
-
-struct crypto_attr_alg {
-	char name[CRYPTO_MAX_ALG_NAME];
-};
-
-struct crypto_attr_type {
-	u32 type;
-	u32 mask;
-};
-
 /* 
  * Transform user interface.
  */
-- 
cgit v1.2.3


From 2309a05d2abe713f7debc951640b010370c8befb Mon Sep 17 00:00:00 2001
From: Beata Michalska <beata.michalska@arm.com>
Date: Thu, 3 Jun 2021 15:06:25 +0100
Subject: sched/core: Introduce SD_ASYM_CPUCAPACITY_FULL sched_domain flag

Introducing new, complementary to SD_ASYM_CPUCAPACITY, sched_domain
topology flag, to distinguish between shed_domains where any CPU
capacity asymmetry is detected (SD_ASYM_CPUCAPACITY) and ones where
a full set of CPU capacities is visible to all domain members
(SD_ASYM_CPUCAPACITY_FULL).

With the distinction between full and partial CPU capacity asymmetry,
brought in by the newly introduced flag, the scope of the original
SD_ASYM_CPUCAPACITY flag gets shifted, still maintaining the existing
behaviour when one is detected on a given sched domain, allowing
misfit migrations within sched domains that do not observe full range
of CPU capacities but still do have members with different capacity
values. It loses though it's meaning when it comes to the lowest CPU
asymmetry sched_domain level per-cpu pointer, which is to be now
denoted by SD_ASYM_CPUCAPACITY_FULL flag.

Signed-off-by: Beata Michalska <beata.michalska@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/20210603140627.8409-2-beata.michalska@arm.com
---
 include/linux/sched/sd_flags.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 34b21e971d77..57bde66d95f7 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -90,6 +90,16 @@ SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
  */
 SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
 
+/*
+ * Domain members have different CPU capacities spanning all unique CPU
+ * capacity values.
+ *
+ * SHARED_PARENT: Set from the topmost domain down to the first domain where
+ *		  all available CPU capacities are visible
+ * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
+ */
+SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
 /*
  * Domain members share CPU capacity (i.e. SMT)
  *
-- 
cgit v1.2.3


From 8d11cfb0c37547bd6b1cdc7c2653c1e6b5ec5abb Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vz@mleia.com>
Date: Sun, 20 Jun 2021 22:11:03 +0300
Subject: dmaengine: imx-sdma: Remove platform data header

Since commit 6c5f05a6cd88 ("ARM: imx3: Remove imx3 soc_init()")
there are no more users of struct sdma_script_start_addrs outside
of the driver itself, thus let's move the struct declaration just
to the driver source code and remove the header file as unused one.

Signed-off-by: Vladimir Zapolskiy <vz@mleia.com>
Cc: Fabio Estevam <festevam@gmail.com>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Fabio Estevam <festevam@gmail.com>
Link: https://lore.kernel.org/r/20210620191103.156626-1-vz@mleia.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c                     | 56 +++++++++++++++++++++++++++-
 include/linux/platform_data/dma-imx-sdma.h | 60 ------------------------------
 2 files changed, 55 insertions(+), 61 deletions(-)
 delete mode 100644 include/linux/platform_data/dma-imx-sdma.h

(limited to 'include/linux')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index d5590c08db51..48390ea3c91f 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -35,7 +35,6 @@
 #include <linux/workqueue.h>
 
 #include <asm/irq.h>
-#include <linux/platform_data/dma-imx-sdma.h>
 #include <linux/platform_data/dma-imx.h>
 #include <linux/regmap.h>
 #include <linux/mfd/syscon.h>
@@ -181,6 +180,61 @@
 				 BIT(DMA_MEM_TO_DEV) | \
 				 BIT(DMA_DEV_TO_DEV))
 
+/**
+ * struct sdma_script_start_addrs - SDMA script start pointers
+ *
+ * start addresses of the different functions in the physical
+ * address space of the SDMA engine.
+ */
+struct sdma_script_start_addrs {
+	s32 ap_2_ap_addr;
+	s32 ap_2_bp_addr;
+	s32 ap_2_ap_fixed_addr;
+	s32 bp_2_ap_addr;
+	s32 loopback_on_dsp_side_addr;
+	s32 mcu_interrupt_only_addr;
+	s32 firi_2_per_addr;
+	s32 firi_2_mcu_addr;
+	s32 per_2_firi_addr;
+	s32 mcu_2_firi_addr;
+	s32 uart_2_per_addr;
+	s32 uart_2_mcu_addr;
+	s32 per_2_app_addr;
+	s32 mcu_2_app_addr;
+	s32 per_2_per_addr;
+	s32 uartsh_2_per_addr;
+	s32 uartsh_2_mcu_addr;
+	s32 per_2_shp_addr;
+	s32 mcu_2_shp_addr;
+	s32 ata_2_mcu_addr;
+	s32 mcu_2_ata_addr;
+	s32 app_2_per_addr;
+	s32 app_2_mcu_addr;
+	s32 shp_2_per_addr;
+	s32 shp_2_mcu_addr;
+	s32 mshc_2_mcu_addr;
+	s32 mcu_2_mshc_addr;
+	s32 spdif_2_mcu_addr;
+	s32 mcu_2_spdif_addr;
+	s32 asrc_2_mcu_addr;
+	s32 ext_mem_2_ipu_addr;
+	s32 descrambler_addr;
+	s32 dptc_dvfs_addr;
+	s32 utra_addr;
+	s32 ram_code_start_addr;
+	/* End of v1 array */
+	s32 mcu_2_ssish_addr;
+	s32 ssish_2_mcu_addr;
+	s32 hdmi_dma_addr;
+	/* End of v2 array */
+	s32 zcanfd_2_mcu_addr;
+	s32 zqspi_2_mcu_addr;
+	s32 mcu_2_ecspi_addr;
+	/* End of v3 array */
+	s32 mcu_2_zqspi_addr;
+	/* End of v4 array */
+};
+
 /*
  * Mode/Count of data node descriptors - IPCv2
  */
diff --git a/include/linux/platform_data/dma-imx-sdma.h b/include/linux/platform_data/dma-imx-sdma.h
deleted file mode 100644
index 725602d9df91..000000000000
--- a/include/linux/platform_data/dma-imx-sdma.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __MACH_MXC_SDMA_H__
-#define __MACH_MXC_SDMA_H__
-
-/**
- * struct sdma_script_start_addrs - SDMA script start pointers
- *
- * start addresses of the different functions in the physical
- * address space of the SDMA engine.
- */
-struct sdma_script_start_addrs {
-	s32 ap_2_ap_addr;
-	s32 ap_2_bp_addr;
-	s32 ap_2_ap_fixed_addr;
-	s32 bp_2_ap_addr;
-	s32 loopback_on_dsp_side_addr;
-	s32 mcu_interrupt_only_addr;
-	s32 firi_2_per_addr;
-	s32 firi_2_mcu_addr;
-	s32 per_2_firi_addr;
-	s32 mcu_2_firi_addr;
-	s32 uart_2_per_addr;
-	s32 uart_2_mcu_addr;
-	s32 per_2_app_addr;
-	s32 mcu_2_app_addr;
-	s32 per_2_per_addr;
-	s32 uartsh_2_per_addr;
-	s32 uartsh_2_mcu_addr;
-	s32 per_2_shp_addr;
-	s32 mcu_2_shp_addr;
-	s32 ata_2_mcu_addr;
-	s32 mcu_2_ata_addr;
-	s32 app_2_per_addr;
-	s32 app_2_mcu_addr;
-	s32 shp_2_per_addr;
-	s32 shp_2_mcu_addr;
-	s32 mshc_2_mcu_addr;
-	s32 mcu_2_mshc_addr;
-	s32 spdif_2_mcu_addr;
-	s32 mcu_2_spdif_addr;
-	s32 asrc_2_mcu_addr;
-	s32 ext_mem_2_ipu_addr;
-	s32 descrambler_addr;
-	s32 dptc_dvfs_addr;
-	s32 utra_addr;
-	s32 ram_code_start_addr;
-	/* End of v1 array */
-	s32 mcu_2_ssish_addr;
-	s32 ssish_2_mcu_addr;
-	s32 hdmi_dma_addr;
-	/* End of v2 array */
-	s32 zcanfd_2_mcu_addr;
-	s32 zqspi_2_mcu_addr;
-	s32 mcu_2_ecspi_addr;
-	/* End of v3 array */
-	s32 mcu_2_zqspi_addr;
-	/* End of v4 array */
-};
-
-#endif /* __MACH_MXC_SDMA_H__ */
-- 
cgit v1.2.3


From bcfa8d14570d85c998a9b706b074ab151b286edf Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 15 Jun 2021 23:41:03 +0200
Subject: HID: input: Add support for Programmable Buttons
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Map them to KEY_MACRO# event codes.

These buttons are defined by HID as follows:
"The user defines the function of these buttons to control software applications or GUI objects."

This matches the semantics of the KEY_MACRO# input event codes that Linux supports.

Also add support for HID "Named Array" collections.
Also add hid-debug support for KEY_MACRO#.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-debug.c | 11 +++++++++++
 drivers/hid/hid-input.c | 22 ++++++++++++++++++++++
 include/linux/hid.h     |  1 +
 3 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index 59f8d716d78f..0e76d9b4530a 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -122,6 +122,7 @@ static const struct hid_usage_entry hid_usage_table[] = {
   {  9, 0, "Button" },
   { 10, 0, "Ordinal" },
   { 12, 0, "Consumer" },
+      {0, 0x003, "ProgrammableButtons"},
       {0, 0x238, "HorizontalWheel"},
   { 13, 0, "Digitizers" },
     {0, 0x01, "Digitizer"},
@@ -939,6 +940,16 @@ static const char *keys[KEY_MAX + 1] = {
 	[KEY_KBDINPUTASSIST_NEXTGROUP] = "KbdInputAssistNextGroup",
 	[KEY_KBDINPUTASSIST_ACCEPT] = "KbdInputAssistAccept",
 	[KEY_KBDINPUTASSIST_CANCEL] = "KbdInputAssistCancel",
+	[KEY_MACRO1] = "Macro1", [KEY_MACRO2] = "Macro2", [KEY_MACRO3] = "Macro3",
+	[KEY_MACRO4] = "Macro4", [KEY_MACRO5] = "Macro5", [KEY_MACRO6] = "Macro6",
+	[KEY_MACRO7] = "Macro7", [KEY_MACRO8] = "Macro8", [KEY_MACRO9] = "Macro9",
+	[KEY_MACRO10] = "Macro10", [KEY_MACRO11] = "Macro11", [KEY_MACRO12] = "Macro12",
+	[KEY_MACRO13] = "Macro13", [KEY_MACRO14] = "Macro14", [KEY_MACRO15] = "Macro15",
+	[KEY_MACRO16] = "Macro16", [KEY_MACRO17] = "Macro17", [KEY_MACRO18] = "Macro18",
+	[KEY_MACRO19] = "Macro19", [KEY_MACRO20] = "Macro20", [KEY_MACRO21] = "Macro21",
+	[KEY_MACRO22] = "Macro22", [KEY_MACRO23] = "Macro23", [KEY_MACRO24] = "Macro24",
+	[KEY_MACRO25] = "Macro25", [KEY_MACRO26] = "Macro26", [KEY_MACRO27] = "Macro27",
+	[KEY_MACRO28] = "Macro28", [KEY_MACRO29] = "Macro29", [KEY_MACRO30] = "Macro30",
 };
 
 static const char *relatives[REL_MAX + 1] = {
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index c62c6a9b2132..56bdd55fbfb3 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -567,6 +567,16 @@ static void hidinput_update_battery(struct hid_device *dev, int value)
 }
 #endif	/* CONFIG_HID_BATTERY_STRENGTH */
 
+static bool hidinput_field_in_collection(struct hid_device *device, struct hid_field *field,
+					 unsigned int type, unsigned int usage)
+{
+	struct hid_collection *collection;
+
+	collection = &device->collection[field->usage->collection_index];
+
+	return collection->type == type && collection->usage == usage;
+}
+
 static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_field *field,
 				     struct hid_usage *usage)
 {
@@ -632,6 +642,18 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel
 				else
 					code += BTN_TRIGGER_HAPPY - 0x10;
 				break;
+		case HID_CP_CONSUMER_CONTROL:
+				if (hidinput_field_in_collection(device, field,
+								 HID_COLLECTION_NAMED_ARRAY,
+								 HID_CP_PROGRAMMABLEBUTTONS)) {
+					if (code <= 0x1d)
+						code += KEY_MACRO1;
+					else
+						code += BTN_TRIGGER_HAPPY - 0x1e;
+				} else {
+					goto ignore;
+				}
+				break;
 		default:
 			switch (field->physical) {
 			case HID_GD_MOUSE:
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 271021e20a3f..fb0e4dde6175 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -102,6 +102,7 @@ struct hid_item {
 #define HID_COLLECTION_PHYSICAL		0
 #define HID_COLLECTION_APPLICATION	1
 #define HID_COLLECTION_LOGICAL		2
+#define HID_COLLECTION_NAMED_ARRAY	4
 
 /*
  * HID report descriptor global item tags
-- 
cgit v1.2.3


From d0b371e5fba0ef2b4e3f6a3f1b5fe7f8bd97897e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 21 Jun 2021 18:12:43 +0300
Subject: stm class: Spelling fix

Drop the repeated word "the" in a comment.

Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
[alexander.shishkin: fixed the commit message]
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Link: https://lore.kernel.org/r/20210621151246.31891-2-alexander.shishkin@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/stm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/stm.h b/include/linux/stm.h
index c6f577ab6f21..3b22689512be 100644
--- a/include/linux/stm.h
+++ b/include/linux/stm.h
@@ -57,7 +57,7 @@ struct stm_device;
  *
  * Normally, an STM device will have a range of masters available to software
  * and the rest being statically assigned to various hardware trace sources.
- * The former is defined by the the range [@sw_start..@sw_end] of the device
+ * The former is defined by the range [@sw_start..@sw_end] of the device
  * description. That is, the lowest master that can be allocated to software
  * writers is @sw_start and data from this writer will appear is @sw_start
  * master in the STP stream.
-- 
cgit v1.2.3


From fcf37549ae19e904bc6a5eadf5c25eca36100c5e Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Thu, 10 Jun 2021 19:24:34 +0800
Subject: jbd2: ensure abort the journal if detect IO error when writing
 original buffer back

Although we merged c044f3d8360 ("jbd2: abort journal if free a async
write error metadata buffer"), there is a race between
jbd2_journal_try_to_free_buffers() and jbd2_journal_destroy(), so the
jbd2_log_do_checkpoint() may still fail to detect the buffer write
io error flag which may lead to filesystem inconsistency.

jbd2_journal_try_to_free_buffers()     ext4_put_super()
                                        jbd2_journal_destroy()
  __jbd2_journal_remove_checkpoint()
  detect buffer write error              jbd2_log_do_checkpoint()
                                         jbd2_cleanup_journal_tail()
                                           <--- lead to inconsistency
  jbd2_journal_abort()

Fix this issue by introducing a new atomic flag which only have one
JBD2_CHECKPOINT_IO_ERROR bit now, and set it in
__jbd2_journal_remove_checkpoint() when freeing a checkpoint buffer
which has write_io_error flag. Then jbd2_journal_destroy() will detect
this mark and abort the journal to prevent updating log tail.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210610112440.3438139-3-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 12 ++++++++++++
 fs/jbd2/journal.c    | 14 ++++++++++++++
 include/linux/jbd2.h | 11 +++++++++++
 3 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index bf5511d19ac5..d27c10f4502f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -564,6 +564,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 	struct transaction_chp_stats_s *stats;
 	transaction_t *transaction;
 	journal_t *journal;
+	struct buffer_head *bh = jh2bh(jh);
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -575,6 +576,17 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 	journal = transaction->t_journal;
 
 	JBUFFER_TRACE(jh, "removing from transaction");
+
+	/*
+	 * If we have failed to write the buffer out to disk, the filesystem
+	 * may become inconsistent. We cannot abort the journal here since
+	 * we hold j_list_lock and we have to be careful about races with
+	 * jbd2_journal_destroy(). So mark the writeback IO error in the
+	 * journal here and we abort the journal later from a better context.
+	 */
+	if (buffer_write_io_error(bh))
+		set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags);
+
 	__buffer_unlink(jh);
 	jh->b_cp_transaction = NULL;
 	jbd2_journal_put_journal_head(jh);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f88895b4920c..8b3f5bbd65f9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1610,6 +1610,10 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 
 	if (is_journal_aborted(journal))
 		return -EIO;
+	if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) {
+		jbd2_journal_abort(journal, -EIO);
+		return -EIO;
+	}
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
@@ -2091,6 +2095,16 @@ int jbd2_journal_destroy(journal_t *journal)
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
 
+	/*
+	 * OK, all checkpoint transactions have been checked, now check the
+	 * write out io error flag and abort the journal if some buffer failed
+	 * to write back to the original location, otherwise the filesystem
+	 * may become inconsistent.
+	 */
+	if (!is_journal_aborted(journal) &&
+	    test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags))
+		jbd2_journal_abort(journal, -EIO);
+
 	if (journal->j_sb_buffer) {
 		if (!is_journal_aborted(journal)) {
 			mutex_lock_io(&journal->j_checkpoint_mutex);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 8543233b0388..d5db408ae064 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -779,6 +779,11 @@ struct journal_s
 	 */
 	unsigned long		j_flags;
 
+	/**
+	 * @j_atomic_flags: Atomic journaling state flags.
+	 */
+	unsigned long		j_atomic_flags;
+
 	/**
 	 * @j_errno:
 	 *
@@ -1375,6 +1380,12 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 #define JBD2_JOURNAL_FLUSH_VALID	(JBD2_JOURNAL_FLUSH_DISCARD | \
 					JBD2_JOURNAL_FLUSH_ZEROOUT)
 
+/*
+ * Journal atomic flag definitions
+ */
+#define JBD2_CHECKPOINT_IO_ERROR	0x001	/* Detect io error while writing
+						 * buffer back to disk */
+
 /*
  * Function declarations for the journaling transaction and buffer
  * management
-- 
cgit v1.2.3


From 4ba3fcdde7e36af93610ceb3cc38365b14539865 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Thu, 10 Jun 2021 19:24:37 +0800
Subject: jbd2,ext4: add a shrinker to release checkpointed buffers

Current metadata buffer release logic in bdev_try_to_free_page() have
a lot of use-after-free issues when umount filesystem concurrently, and
it is difficult to fix directly because ext4 is the only user of
s_op->bdev_try_to_free_page callback and we may have to add more special
refcount or lock that is only used by ext4 into the common vfs layer,
which is unacceptable.

One better solution is remove the bdev_try_to_free_page callback, but
the real problem is we cannot easily release journal_head on the
checkpointed buffer, so try_to_free_buffers() cannot release buffers and
page under memory pressure, which is more likely to trigger
out-of-memory. So we cannot remove the callback directly before we find
another way to release journal_head.

This patch introduce a shrinker to free journal_head on the checkpointed
transaction. After the journal_head got freed, try_to_free_buffers()
could free buffer properly.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210610112440.3438139-6-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c             |   8 +++
 fs/jbd2/checkpoint.c        | 147 ++++++++++++++++++++++++++++++++++++++++++++
 fs/jbd2/journal.c           |  87 ++++++++++++++++++++++++++
 include/linux/jbd2.h        |  26 ++++++++
 include/trace/events/jbd2.h | 101 ++++++++++++++++++++++++++++++
 5 files changed, 369 insertions(+)

(limited to 'include/linux')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ad3919dbd49e..7ee2e21537e0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1174,6 +1174,7 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_unregister_sysfs(sb);
 
 	if (sbi->s_journal) {
+		jbd2_journal_unregister_shrinker(sbi->s_journal);
 		aborted = is_journal_aborted(sbi->s_journal);
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
@@ -5186,6 +5187,7 @@ failed_mount_wq:
 	sbi->s_ea_block_cache = NULL;
 
 	if (sbi->s_journal) {
+		jbd2_journal_unregister_shrinker(sbi->s_journal);
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
@@ -5511,6 +5513,12 @@ static int ext4_load_journal(struct super_block *sb,
 		ext4_commit_super(sb);
 	}
 
+	err = jbd2_journal_register_shrinker(journal);
+	if (err) {
+		EXT4_SB(sb)->s_journal = NULL;
+		goto err_out;
+	}
+
 	return 0;
 
 err_out:
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 75a4f622afaf..1abdae44a3d8 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -79,6 +79,18 @@ static inline void __buffer_relink_io(struct journal_head *jh)
 	transaction->t_checkpoint_io_list = jh;
 }
 
+/*
+ * Check a checkpoint buffer could be release or not.
+ *
+ * Requires j_list_lock
+ */
+static inline bool __cp_buffer_busy(struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh));
+}
+
 /*
  * Try to release a checkpointed buffer from its transaction.
  * Returns 1 if we released it and 2 if we also released the
@@ -458,6 +470,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
 	return 0;
 }
 
+/*
+ * journal_shrink_one_cp_list
+ *
+ * Find 'nr_to_scan' written-back checkpoint buffers in the given list
+ * and try to release them. If the whole transaction is released, set
+ * the 'released' parameter. Return the number of released checkpointed
+ * buffers.
+ *
+ * Called with j_list_lock held.
+ */
+static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
+						unsigned long *nr_to_scan,
+						bool *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	unsigned long nr_freed = 0;
+	int ret;
+
+	if (!jh || *nr_to_scan == 0)
+		return 0;
+
+	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+
+		(*nr_to_scan)--;
+		if (__cp_buffer_busy(jh))
+			continue;
+
+		nr_freed++;
+		ret = __jbd2_journal_remove_checkpoint(jh);
+		if (ret) {
+			*released = true;
+			break;
+		}
+
+		if (need_resched())
+			break;
+	} while (jh != last_jh && *nr_to_scan);
+
+	return nr_freed;
+}
+
+/*
+ * jbd2_journal_shrink_checkpoint_list
+ *
+ * Find 'nr_to_scan' written-back checkpoint buffers in the journal
+ * and try to release them. Return the number of released checkpointed
+ * buffers.
+ *
+ * Called with j_list_lock held.
+ */
+unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
+						  unsigned long *nr_to_scan)
+{
+	transaction_t *transaction, *last_transaction, *next_transaction;
+	bool released;
+	tid_t first_tid = 0, last_tid = 0, next_tid = 0;
+	tid_t tid = 0;
+	unsigned long nr_freed = 0;
+	unsigned long nr_scanned = *nr_to_scan;
+
+again:
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions) {
+		spin_unlock(&journal->j_list_lock);
+		goto out;
+	}
+
+	/*
+	 * Get next shrink transaction, resume previous scan or start
+	 * over again. If some others do checkpoint and drop transaction
+	 * from the checkpoint list, we ignore saved j_shrink_transaction
+	 * and start over unconditionally.
+	 */
+	if (journal->j_shrink_transaction)
+		transaction = journal->j_shrink_transaction;
+	else
+		transaction = journal->j_checkpoint_transactions;
+
+	if (!first_tid)
+		first_tid = transaction->t_tid;
+	last_transaction = journal->j_checkpoint_transactions->t_cpprev;
+	next_transaction = transaction;
+	last_tid = last_transaction->t_tid;
+	do {
+		transaction = next_transaction;
+		next_transaction = transaction->t_cpnext;
+		tid = transaction->t_tid;
+		released = false;
+
+		nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+						       nr_to_scan, &released);
+		if (*nr_to_scan == 0)
+			break;
+		if (need_resched() || spin_needbreak(&journal->j_list_lock))
+			break;
+		if (released)
+			continue;
+
+		nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list,
+						       nr_to_scan, &released);
+		if (*nr_to_scan == 0)
+			break;
+		if (need_resched() || spin_needbreak(&journal->j_list_lock))
+			break;
+	} while (transaction != last_transaction);
+
+	if (transaction != last_transaction) {
+		journal->j_shrink_transaction = next_transaction;
+		next_tid = next_transaction->t_tid;
+	} else {
+		journal->j_shrink_transaction = NULL;
+		next_tid = 0;
+	}
+
+	spin_unlock(&journal->j_list_lock);
+	cond_resched();
+
+	if (*nr_to_scan && next_tid)
+		goto again;
+out:
+	nr_scanned -= *nr_to_scan;
+	trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
+					  nr_freed, nr_scanned, next_tid);
+
+	return nr_freed;
+}
+
 /*
  * journal_clean_checkpoint_list
  *
@@ -580,6 +723,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 
 	__buffer_unlink(jh);
 	jh->b_cp_transaction = NULL;
+	percpu_counter_dec(&journal->j_jh_shrink_count);
 	jbd2_journal_put_journal_head(jh);
 
 	/* Is this transaction empty? */
@@ -642,6 +786,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
 		jh->b_cpnext->b_cpprev = jh;
 	}
 	transaction->t_checkpoint_list = jh;
+	percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count);
 }
 
 /*
@@ -657,6 +802,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 {
 	assert_spin_locked(&journal->j_list_lock);
+
+	journal->j_shrink_transaction = NULL;
 	if (transaction->t_cpnext) {
 		transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
 		transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8b3f5bbd65f9..7c52feb6f753 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2050,6 +2050,91 @@ recovery_error:
 	return -EIO;
 }
 
+/**
+ * jbd2_journal_shrink_scan()
+ *
+ * Scan the checkpointed buffer on the checkpoint list and release the
+ * journal_head.
+ */
+static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
+					      struct shrink_control *sc)
+{
+	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+	unsigned long nr_to_scan = sc->nr_to_scan;
+	unsigned long nr_shrunk;
+	unsigned long count;
+
+	count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
+	trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
+
+	nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
+
+	count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
+	trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
+
+	return nr_shrunk;
+}
+
+/**
+ * jbd2_journal_shrink_count()
+ *
+ * Count the number of checkpoint buffers on the checkpoint list.
+ */
+static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
+					       struct shrink_control *sc)
+{
+	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+	unsigned long count;
+
+	count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
+	trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
+
+	return count;
+}
+
+/**
+ * jbd2_journal_register_shrinker()
+ * @journal: Journal to act on.
+ *
+ * Init a percpu counter to record the checkpointed buffers on the checkpoint
+ * list and register a shrinker to release their journal_head.
+ */
+int jbd2_journal_register_shrinker(journal_t *journal)
+{
+	int err;
+
+	journal->j_shrink_transaction = NULL;
+
+	err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL);
+	if (err)
+		return err;
+
+	journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
+	journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
+	journal->j_shrinker.seeks = DEFAULT_SEEKS;
+	journal->j_shrinker.batch = journal->j_max_transaction_buffers;
+
+	err = register_shrinker(&journal->j_shrinker);
+	if (err) {
+		percpu_counter_destroy(&journal->j_jh_shrink_count);
+		return err;
+	}
+
+	return 0;
+}
+
+/**
+ * jbd2_journal_unregister_shrinker()
+ * @journal: Journal to act on.
+ *
+ * Unregister the checkpointed buffer shrinker and destroy the percpu counter.
+ */
+void jbd2_journal_unregister_shrinker(journal_t *journal)
+{
+	percpu_counter_destroy(&journal->j_jh_shrink_count);
+	unregister_shrinker(&journal->j_shrinker);
+}
+
 /**
  * jbd2_journal_destroy() - Release a journal_t structure.
  * @journal: Journal to act on.
@@ -2122,6 +2207,8 @@ int jbd2_journal_destroy(journal_t *journal)
 		brelse(journal->j_sb_buffer);
 	}
 
+	jbd2_journal_unregister_shrinker(journal);
+
 	if (journal->j_proc_entry)
 		jbd2_stats_proc_exit(journal);
 	iput(journal->j_inode);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d5db408ae064..6cc035321562 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -909,6 +909,29 @@ struct journal_s
 	 */
 	struct buffer_head	*j_chkpt_bhs[JBD2_NR_BATCH];
 
+	/**
+	 * @j_shrinker:
+	 *
+	 * Journal head shrinker, reclaim buffer's journal head which
+	 * has been written back.
+	 */
+	struct shrinker		j_shrinker;
+
+	/**
+	 * @j_jh_shrink_count:
+	 *
+	 * Number of journal buffers on the checkpoint list. [j_list_lock]
+	 */
+	struct percpu_counter	j_jh_shrink_count;
+
+	/**
+	 * @j_shrink_transaction:
+	 *
+	 * Record next transaction will shrink on the checkpoint list.
+	 * [j_list_lock]
+	 */
+	transaction_t		*j_shrink_transaction;
+
 	/**
 	 * @j_head:
 	 *
@@ -1422,6 +1445,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
 void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
+unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
 int __jbd2_journal_remove_checkpoint(struct journal_head *);
 void jbd2_journal_destroy_checkpoint(journal_t *journal);
 void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
@@ -1532,6 +1556,8 @@ extern int	   jbd2_journal_set_features
 		   (journal_t *, unsigned long, unsigned long, unsigned long);
 extern void	   jbd2_journal_clear_features
 		   (journal_t *, unsigned long, unsigned long, unsigned long);
+extern int	   jbd2_journal_register_shrinker(journal_t *journal);
+extern void	   jbd2_journal_unregister_shrinker(journal_t *journal);
 extern int	   jbd2_journal_load       (journal_t *journal);
 extern int	   jbd2_journal_destroy    (journal_t *);
 extern int	   jbd2_journal_recover    (journal_t *journal);
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index d16a32867f3a..a4dfe005983d 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -394,6 +394,107 @@ TRACE_EVENT(jbd2_lock_buffer_stall,
 		__entry->stall_ms)
 );
 
+DECLARE_EVENT_CLASS(jbd2_journal_shrink,
+
+	TP_PROTO(journal_t *journal, unsigned long nr_to_scan,
+		 unsigned long count),
+
+	TP_ARGS(journal, nr_to_scan, count),
+
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, nr_to_scan)
+		__field(unsigned long, count)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= journal->j_fs_dev->bd_dev;
+		__entry->nr_to_scan	= nr_to_scan;
+		__entry->count		= count;
+	),
+
+	TP_printk("dev %d,%d nr_to_scan %lu count %lu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->nr_to_scan, __entry->count)
+);
+
+DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count,
+
+	TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count),
+
+	TP_ARGS(journal, nr_to_scan, count)
+);
+
+DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter,
+
+	TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count),
+
+	TP_ARGS(journal, nr_to_scan, count)
+);
+
+TRACE_EVENT(jbd2_shrink_scan_exit,
+
+	TP_PROTO(journal_t *journal, unsigned long nr_to_scan,
+		 unsigned long nr_shrunk, unsigned long count),
+
+	TP_ARGS(journal, nr_to_scan, nr_shrunk, count),
+
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, nr_to_scan)
+		__field(unsigned long, nr_shrunk)
+		__field(unsigned long, count)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= journal->j_fs_dev->bd_dev;
+		__entry->nr_to_scan	= nr_to_scan;
+		__entry->nr_shrunk	= nr_shrunk;
+		__entry->count		= count;
+	),
+
+	TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->nr_to_scan, __entry->nr_shrunk,
+		  __entry->count)
+);
+
+TRACE_EVENT(jbd2_shrink_checkpoint_list,
+
+	TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid,
+		 unsigned long nr_freed, unsigned long nr_scanned,
+		 tid_t next_tid),
+
+	TP_ARGS(journal, first_tid, tid, last_tid, nr_freed,
+		nr_scanned, next_tid),
+
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(tid_t, first_tid)
+		__field(tid_t, tid)
+		__field(tid_t, last_tid)
+		__field(unsigned long, nr_freed)
+		__field(unsigned long, nr_scanned)
+		__field(tid_t, next_tid)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= journal->j_fs_dev->bd_dev;
+		__entry->first_tid	= first_tid;
+		__entry->tid		= tid;
+		__entry->last_tid	= last_tid;
+		__entry->nr_freed	= nr_freed;
+		__entry->nr_scanned	= nr_scanned;
+		__entry->next_tid	= next_tid;
+	),
+
+	TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu "
+		  "scanned %lu next transaction %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->first_tid, __entry->tid, __entry->last_tid,
+		  __entry->nr_freed, __entry->nr_scanned, __entry->next_tid)
+);
+
 #endif /* _TRACE_JBD2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From acc6100d3ffa24bdd2add8ea85fb66811bcce5d4 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Thu, 10 Jun 2021 19:24:40 +0800
Subject: fs: remove bdev_try_to_free_page callback

After remove the unique user of sop->bdev_try_to_free_page() callback,
we could remove the callback and the corresponding blkdev_releasepage()
at all.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210610112440.3438139-9-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/block_dev.c     | 15 ---------------
 include/linux/fs.h |  1 -
 2 files changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6cc4d4cfe0c2..e215da6d49b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1733,20 +1733,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 }
 EXPORT_SYMBOL_GPL(blkdev_read_iter);
 
-/*
- * Try to release a page associated with block device when the system
- * is under memory pressure.
- */
-static int blkdev_releasepage(struct page *page, gfp_t wait)
-{
-	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
-
-	if (super && super->s_op->bdev_try_to_free_page)
-		return super->s_op->bdev_try_to_free_page(super, page, wait);
-
-	return try_to_free_buffers(page);
-}
-
 static int blkdev_writepages(struct address_space *mapping,
 			     struct writeback_control *wbc)
 {
@@ -1760,7 +1746,6 @@ static const struct address_space_operations def_blk_aops = {
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.writepages	= blkdev_writepages,
-	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
 	.migratepage	= buffer_migrate_page_norefs,
 	.is_dirty_writeback = buffer_check_dirty_writeback,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3c88fdb9b2a..c3277b445f96 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2171,7 +2171,6 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 	struct dquot **(*get_dquots)(struct inode *);
 #endif
-	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
 	long (*nr_cached_objects)(struct super_block *,
 				  struct shrink_control *);
 	long (*free_cached_objects)(struct super_block *,
-- 
cgit v1.2.3


From 0193cc908b5ae8aff2e2d2997ca5d4ae26ed24d4 Mon Sep 17 00:00:00 2001
From: Jing Zhang <jingzhangos@google.com>
Date: Fri, 18 Jun 2021 22:27:03 +0000
Subject: KVM: stats: Separate generic stats from architecture specific ones

Generic KVM stats are those collected in architecture independent code
or those supported by all architectures; put all generic statistics in
a separate structure.  This ensures that they are defined the same way
in the statistics API which is being added, removing duplication among
different architectures in the declaration of the descriptors.

No functional change intended.

Reviewed-by: David Matlack <dmatlack@google.com>
Reviewed-by: Ricardo Koller <ricarkol@google.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Signed-off-by: Jing Zhang <jingzhangos@google.com>
Message-Id: <20210618222709.1858088-2-jingzhangos@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/include/asm/kvm_host.h   |  9 ++-------
 arch/arm64/kvm/guest.c              | 12 ++++++------
 arch/mips/include/asm/kvm_host.h    |  9 ++-------
 arch/mips/kvm/mips.c                | 12 ++++++------
 arch/powerpc/include/asm/kvm_host.h |  9 ++-------
 arch/powerpc/kvm/book3s.c           | 12 ++++++------
 arch/powerpc/kvm/book3s_hv.c        | 12 ++++++------
 arch/powerpc/kvm/book3s_pr.c        |  2 +-
 arch/powerpc/kvm/book3s_pr_papr.c   |  2 +-
 arch/powerpc/kvm/booke.c            | 14 +++++++-------
 arch/s390/include/asm/kvm_host.h    |  9 ++-------
 arch/s390/kvm/kvm-s390.c            | 12 ++++++------
 arch/x86/include/asm/kvm_host.h     |  9 ++-------
 arch/x86/kvm/x86.c                  | 14 +++++++-------
 include/linux/kvm_types.h           | 12 ++++++++++++
 virt/kvm/kvm_main.c                 | 14 +++++++-------
 16 files changed, 75 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d56f365b38a8..5a2c82f63baa 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -556,16 +556,11 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
 }
 
 struct kvm_vm_stat {
-	u64 remote_tlb_flush;
+	struct kvm_vm_stat_generic generic;
 };
 
 struct kvm_vcpu_stat {
-	u64 halt_successful_poll;
-	u64 halt_attempted_poll;
-	u64 halt_poll_success_ns;
-	u64 halt_poll_fail_ns;
-	u64 halt_poll_invalid;
-	u64 halt_wakeup;
+	struct kvm_vcpu_stat_generic generic;
 	u64 hvc_exit_stat;
 	u64 wfe_exit_stat;
 	u64 wfi_exit_stat;
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 5cb4a1cd5603..988ead309cbe 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -29,18 +29,18 @@
 #include "trace.h"
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT("halt_successful_poll", halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
 	VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
 	VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
 	VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
 	VCPU_STAT("mmio_exit_user", mmio_exit_user),
 	VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
 	VCPU_STAT("exits", exits),
-	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
 	{ NULL }
 };
 
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 4245c082095f..696f6b009377 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -109,10 +109,11 @@ static inline bool kvm_is_error_hva(unsigned long addr)
 }
 
 struct kvm_vm_stat {
-	u64 remote_tlb_flush;
+	struct kvm_vm_stat_generic generic;
 };
 
 struct kvm_vcpu_stat {
+	struct kvm_vcpu_stat_generic generic;
 	u64 wait_exits;
 	u64 cache_exits;
 	u64 signal_exits;
@@ -142,12 +143,6 @@ struct kvm_vcpu_stat {
 #ifdef CONFIG_CPU_LOONGSON64
 	u64 vz_cpucfg_exits;
 #endif
-	u64 halt_successful_poll;
-	u64 halt_attempted_poll;
-	u64 halt_poll_success_ns;
-	u64 halt_poll_fail_ns;
-	u64 halt_poll_invalid;
-	u64 halt_wakeup;
 };
 
 struct kvm_arch_memory_slot {
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 4d4af97dcc88..2f2969aef60c 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -68,12 +68,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 #ifdef CONFIG_CPU_LOONGSON64
 	VCPU_STAT("vz_cpucfg", vz_cpucfg_exits),
 #endif
-	VCPU_STAT("halt_successful_poll", halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", halt_wakeup),
-	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
+	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
 	{NULL}
 };
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index dd8bd4706259..9f52f282b1aa 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -81,12 +81,13 @@ struct kvmppc_book3s_shadow_vcpu;
 struct kvm_nested_guest;
 
 struct kvm_vm_stat {
-	u64 remote_tlb_flush;
+	struct kvm_vm_stat_generic generic;
 	u64 num_2M_pages;
 	u64 num_1G_pages;
 };
 
 struct kvm_vcpu_stat {
+	struct kvm_vcpu_stat_generic generic;
 	u64 sum_exits;
 	u64 mmio_exits;
 	u64 signal_exits;
@@ -102,14 +103,8 @@ struct kvm_vcpu_stat {
 	u64 emulated_inst_exits;
 	u64 dec_exits;
 	u64 ext_intr_exits;
-	u64 halt_poll_success_ns;
-	u64 halt_poll_fail_ns;
 	u64 halt_wait_ns;
-	u64 halt_successful_poll;
-	u64 halt_attempted_poll;
 	u64 halt_successful_wait;
-	u64 halt_poll_invalid;
-	u64 halt_wakeup;
 	u64 dbell_exits;
 	u64 gdbell_exits;
 	u64 ld;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 5e1e1cff0ee3..ae9f1b855ff9 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -47,14 +47,14 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("dec", dec_exits),
 	VCPU_STAT("ext_intr", ext_intr_exits),
 	VCPU_STAT("queue_intr", queue_intr),
-	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
 	VCPU_STAT("halt_wait_ns", halt_wait_ns),
-	VCPU_STAT("halt_successful_poll", halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
 	VCPU_STAT("halt_successful_wait", halt_successful_wait),
-	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
 	VCPU_STAT("pf_storage", pf_storage),
 	VCPU_STAT("sp_storage", sp_storage),
 	VCPU_STAT("pf_instruc", pf_instruc),
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7e73e5bfe4ba..cd544a46183e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -230,7 +230,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 
 	waitp = kvm_arch_vcpu_get_wait(vcpu);
 	if (rcuwait_wake_up(waitp))
-		++vcpu->stat.halt_wakeup;
+		++vcpu->stat.generic.halt_wakeup;
 
 	cpu = READ_ONCE(vcpu->arch.thread_cpu);
 	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
@@ -4092,7 +4092,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	cur = start_poll = ktime_get();
 	if (vc->halt_poll_ns) {
 		ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
-		++vc->runner->stat.halt_attempted_poll;
+		++vc->runner->stat.generic.halt_attempted_poll;
 
 		vc->vcore_state = VCORE_POLLING;
 		spin_unlock(&vc->lock);
@@ -4109,7 +4109,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 		vc->vcore_state = VCORE_INACTIVE;
 
 		if (!do_sleep) {
-			++vc->runner->stat.halt_successful_poll;
+			++vc->runner->stat.generic.halt_successful_poll;
 			goto out;
 		}
 	}
@@ -4121,7 +4121,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 		do_sleep = 0;
 		/* If we polled, count this as a successful poll */
 		if (vc->halt_poll_ns)
-			++vc->runner->stat.halt_successful_poll;
+			++vc->runner->stat.generic.halt_successful_poll;
 		goto out;
 	}
 
@@ -4148,13 +4148,13 @@ out:
 			ktime_to_ns(cur) - ktime_to_ns(start_wait);
 		/* Attribute failed poll time */
 		if (vc->halt_poll_ns)
-			vc->runner->stat.halt_poll_fail_ns +=
+			vc->runner->stat.generic.halt_poll_fail_ns +=
 				ktime_to_ns(start_wait) -
 				ktime_to_ns(start_poll);
 	} else {
 		/* Attribute successful poll time */
 		if (vc->halt_poll_ns)
-			vc->runner->stat.halt_poll_success_ns +=
+			vc->runner->stat.generic.halt_poll_success_ns +=
 				ktime_to_ns(cur) -
 				ktime_to_ns(start_poll);
 	}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d7733b07f489..71bcb0140461 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -493,7 +493,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 		if (!vcpu->arch.pending_exceptions) {
 			kvm_vcpu_block(vcpu);
 			kvm_clear_request(KVM_REQ_UNHALT, vcpu);
-			vcpu->stat.halt_wakeup++;
+			vcpu->stat.generic.halt_wakeup++;
 
 			/* Unset POW bit after we woke up */
 			msr &= ~MSR_POW;
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 031c8015864a..ac14239f3424 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -378,7 +378,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
 		kvm_vcpu_block(vcpu);
 		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
-		vcpu->stat.halt_wakeup++;
+		vcpu->stat.generic.halt_wakeup++;
 		return EMULATE_DONE;
 	case H_LOGICAL_CI_LOAD:
 		return kvmppc_h_pr_logical_ci_load(vcpu);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 7d5fe43f85c4..7a75559ab51d 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -49,15 +49,15 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("inst_emu", emulated_inst_exits),
 	VCPU_STAT("dec", dec_exits),
 	VCPU_STAT("ext_intr", ext_intr_exits),
-	VCPU_STAT("halt_successful_poll", halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
 	VCPU_STAT("doorbell", dbell_exits),
 	VCPU_STAT("guest doorbell", gdbell_exits),
-	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
-	VM_STAT("remote_tlb_flush", remote_tlb_flush),
+	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
+	VM_STAT("remote_tlb_flush", generic.remote_tlb_flush),
 	{ NULL }
 };
 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8925f3969478..9b4473f76e56 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -361,6 +361,7 @@ struct sie_page {
 };
 
 struct kvm_vcpu_stat {
+	struct kvm_vcpu_stat_generic generic;
 	u64 exit_userspace;
 	u64 exit_null;
 	u64 exit_external_request;
@@ -370,13 +371,7 @@ struct kvm_vcpu_stat {
 	u64 exit_validity;
 	u64 exit_instruction;
 	u64 exit_pei;
-	u64 halt_successful_poll;
-	u64 halt_attempted_poll;
-	u64 halt_poll_invalid;
 	u64 halt_no_poll_steal;
-	u64 halt_wakeup;
-	u64 halt_poll_success_ns;
-	u64 halt_poll_fail_ns;
 	u64 instruction_lctl;
 	u64 instruction_lctlg;
 	u64 instruction_stctl;
@@ -755,12 +750,12 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_vm_stat {
+	struct kvm_vm_stat_generic generic;
 	u64 inject_io;
 	u64 inject_float_mchk;
 	u64 inject_pfault_done;
 	u64 inject_service_signal;
 	u64 inject_virtio;
-	u64 remote_tlb_flush;
 };
 
 struct kvm_arch_memory_slot {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 1296fc10f80c..75ad44c44717 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -72,13 +72,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("exit_program_interruption", exit_program_interruption),
 	VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program),
 	VCPU_STAT("exit_operation_exception", exit_operation_exception),
-	VCPU_STAT("halt_successful_poll", halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
 	VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal),
-	VCPU_STAT("halt_wakeup", halt_wakeup),
-	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
+	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
 	VCPU_STAT("instruction_lctlg", instruction_lctlg),
 	VCPU_STAT("instruction_lctl", instruction_lctl),
 	VCPU_STAT("instruction_stctl", instruction_stctl),
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e11d64aa0bcd..408051552121 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1160,6 +1160,7 @@ struct kvm_arch {
 };
 
 struct kvm_vm_stat {
+	struct kvm_vm_stat_generic generic;
 	u64 mmu_shadow_zapped;
 	u64 mmu_pte_write;
 	u64 mmu_pde_zapped;
@@ -1167,13 +1168,13 @@ struct kvm_vm_stat {
 	u64 mmu_recycled;
 	u64 mmu_cache_miss;
 	u64 mmu_unsync;
-	u64 remote_tlb_flush;
 	u64 lpages;
 	u64 nx_lpage_splits;
 	u64 max_mmu_page_hash_collisions;
 };
 
 struct kvm_vcpu_stat {
+	struct kvm_vcpu_stat_generic generic;
 	u64 pf_fixed;
 	u64 pf_guest;
 	u64 tlb_flush;
@@ -1187,10 +1188,6 @@ struct kvm_vcpu_stat {
 	u64 nmi_window_exits;
 	u64 l1d_flush;
 	u64 halt_exits;
-	u64 halt_successful_poll;
-	u64 halt_attempted_poll;
-	u64 halt_poll_invalid;
-	u64 halt_wakeup;
 	u64 request_irq_exits;
 	u64 irq_exits;
 	u64 host_state_reload;
@@ -1201,8 +1198,6 @@ struct kvm_vcpu_stat {
 	u64 irq_injections;
 	u64 nmi_injections;
 	u64 req_event;
-	u64 halt_poll_success_ns;
-	u64 halt_poll_fail_ns;
 	u64 nested_run;
 	u64 directed_yield_attempted;
 	u64 directed_yield_successful;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 38c003b60339..71202330848a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -235,10 +235,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("irq_window", irq_window_exits),
 	VCPU_STAT("nmi_window", nmi_window_exits),
 	VCPU_STAT("halt_exits", halt_exits),
-	VCPU_STAT("halt_successful_poll", halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
 	VCPU_STAT("hypercalls", hypercalls),
 	VCPU_STAT("request_irq", request_irq_exits),
 	VCPU_STAT("irq_exits", irq_exits),
@@ -250,8 +250,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("nmi_injections", nmi_injections),
 	VCPU_STAT("req_event", req_event),
 	VCPU_STAT("l1d_flush", l1d_flush),
-	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
 	VCPU_STAT("nested_run", nested_run),
 	VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
 	VCPU_STAT("directed_yield_successful", directed_yield_successful),
@@ -263,7 +263,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VM_STAT("mmu_recycled", mmu_recycled),
 	VM_STAT("mmu_cache_miss", mmu_cache_miss),
 	VM_STAT("mmu_unsync", mmu_unsync),
-	VM_STAT("remote_tlb_flush", remote_tlb_flush),
+	VM_STAT("remote_tlb_flush", generic.remote_tlb_flush),
 	VM_STAT("largepages", lpages, .mode = 0444),
 	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
 	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index a7580f69dda0..48db778291b7 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -76,5 +76,17 @@ struct kvm_mmu_memory_cache {
 };
 #endif
 
+struct kvm_vm_stat_generic {
+	u64 remote_tlb_flush;
+};
+
+struct kvm_vcpu_stat_generic {
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
+	u64 halt_poll_success_ns;
+	u64 halt_poll_fail_ns;
+};
 
 #endif /* __KVM_TYPES_H__ */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ed4d1581d502..cec986487b30 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -332,7 +332,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	 */
 	if (!kvm_arch_flush_remote_tlb(kvm)
 	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
-		++kvm->stat.remote_tlb_flush;
+		++kvm->stat.generic.remote_tlb_flush;
 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
@@ -3029,9 +3029,9 @@ static inline void
 update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
 {
 	if (waited)
-		vcpu->stat.halt_poll_fail_ns += poll_ns;
+		vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
 	else
-		vcpu->stat.halt_poll_success_ns += poll_ns;
+		vcpu->stat.generic.halt_poll_success_ns += poll_ns;
 }
 
 /*
@@ -3049,16 +3049,16 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
 		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
-		++vcpu->stat.halt_attempted_poll;
+		++vcpu->stat.generic.halt_attempted_poll;
 		do {
 			/*
 			 * This sets KVM_REQ_UNHALT if an interrupt
 			 * arrives.
 			 */
 			if (kvm_vcpu_check_block(vcpu) < 0) {
-				++vcpu->stat.halt_successful_poll;
+				++vcpu->stat.generic.halt_successful_poll;
 				if (!vcpu_valid_wakeup(vcpu))
-					++vcpu->stat.halt_poll_invalid;
+					++vcpu->stat.generic.halt_poll_invalid;
 				goto out;
 			}
 			poll_end = cur = ktime_get();
@@ -3115,7 +3115,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 	waitp = kvm_arch_vcpu_get_wait(vcpu);
 	if (rcuwait_wake_up(waitp)) {
 		WRITE_ONCE(vcpu->ready, true);
-		++vcpu->stat.halt_wakeup;
+		++vcpu->stat.generic.halt_wakeup;
 		return true;
 	}
 
-- 
cgit v1.2.3


From cb082bfab59a224a49ae803fed52cd03e8d6b5e0 Mon Sep 17 00:00:00 2001
From: Jing Zhang <jingzhangos@google.com>
Date: Fri, 18 Jun 2021 22:27:04 +0000
Subject: KVM: stats: Add fd-based API to read binary stats data

This commit defines the API for userspace and prepare the common
functionalities to support per VM/VCPU binary stats data readings.

The KVM stats now is only accessible by debugfs, which has some
shortcomings this change series are supposed to fix:
1. The current debugfs stats solution in KVM could be disabled
   when kernel Lockdown mode is enabled, which is a potential
   rick for production.
2. The current debugfs stats solution in KVM is organized as "one
   stats per file", it is good for debugging, but not efficient
   for production.
3. The stats read/clear in current debugfs solution in KVM are
   protected by the global kvm_lock.

Besides that, there are some other benefits with this change:
1. All KVM VM/VCPU stats can be read out in a bulk by one copy
   to userspace.
2. A schema is used to describe KVM statistics. From userspace's
   perspective, the KVM statistics are self-describing.
3. With the fd-based solution, a separate telemetry would be able
   to read KVM stats in a less privileged environment.
4. After the initial setup by reading in stats descriptors, a
   telemetry only needs to read the stats data itself, no more
   parsing or setup is needed.

Reviewed-by: David Matlack <dmatlack@google.com>
Reviewed-by: Ricardo Koller <ricarkol@google.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com> #arm64
Signed-off-by: Jing Zhang <jingzhangos@google.com>
Message-Id: <20210618222709.1858088-3-jingzhangos@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/Makefile   |   2 +-
 arch/mips/kvm/Makefile    |   2 +-
 arch/powerpc/kvm/Makefile |   2 +-
 arch/s390/kvm/Makefile    |   3 +-
 arch/x86/kvm/Makefile     |   2 +-
 include/linux/kvm_host.h  |  82 +++++++++++++++++++++++++-
 include/linux/kvm_types.h |   2 +
 include/uapi/linux/kvm.h  |  73 +++++++++++++++++++++++
 virt/kvm/binary_stats.c   | 146 ++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 307 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/binary_stats.c

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 589921392cb1..989bb5dad2c8 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_KVM) += hyp/
 
 kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
-	 $(KVM)/vfio.o $(KVM)/irqchip.o \
+	 $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
 	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
 	 inject_fault.o va_layout.o handle_exit.o \
 	 guest.o debug.o reset.o sys_regs.o \
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 30cc060857c7..c67250a956b8 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for KVM support for MIPS
 #
 
-common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o)
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index ab241317481c..583c14ef596e 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -6,7 +6,7 @@
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm
 
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
 common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
 common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 12decca22e7c..b3aaadc60ead 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -4,7 +4,8 @@
 # Copyright IBM Corp. 2008
 
 KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o
+common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o \
+	      $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
 
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 83331376b779..75dfd27b6e8a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -11,7 +11,7 @@ KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
 				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
-				$(KVM)/dirty_ring.o
+				$(KVM)/dirty_ring.o $(KVM)/binary_stats.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 37cbb56ccd09..9ee7f350473b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1272,16 +1272,94 @@ struct kvm_stats_debugfs_item {
 	int mode;
 };
 
+struct _kvm_stats_desc {
+	struct kvm_stats_desc desc;
+	char name[KVM_STATS_NAME_SIZE];
+};
+
 #define KVM_DBGFS_GET_MODE(dbgfs_item)                                         \
 	((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
 
-#define VM_STAT(n, x, ...) 							\
+#define VM_STAT(n, x, ...)						       \
 	{ n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
-#define VCPU_STAT(n, x, ...)							\
+#define VCPU_STAT(n, x, ...)						       \
 	{ n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }
 
+#define STATS_DESC_COMMON(type, unit, base, exp)			       \
+	.flags = type | unit | base |					       \
+		 BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) |	       \
+		 BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) |	       \
+		 BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK),	       \
+	.exponent = exp,						       \
+	.size = 1
+
+#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp)		       \
+	{								       \
+		{							       \
+			STATS_DESC_COMMON(type, unit, base, exp),	       \
+			.offset = offsetof(struct kvm_vm_stat, generic.stat)   \
+		},							       \
+		.name = #stat,						       \
+	}
+#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp)		       \
+	{								       \
+		{							       \
+			STATS_DESC_COMMON(type, unit, base, exp),	       \
+			.offset = offsetof(struct kvm_vcpu_stat, generic.stat) \
+		},							       \
+		.name = #stat,						       \
+	}
+#define VM_STATS_DESC(stat, type, unit, base, exp)			       \
+	{								       \
+		{							       \
+			STATS_DESC_COMMON(type, unit, base, exp),	       \
+			.offset = offsetof(struct kvm_vm_stat, stat)	       \
+		},							       \
+		.name = #stat,						       \
+	}
+#define VCPU_STATS_DESC(stat, type, unit, base, exp)			       \
+	{								       \
+		{							       \
+			STATS_DESC_COMMON(type, unit, base, exp),	       \
+			.offset = offsetof(struct kvm_vcpu_stat, stat)	       \
+		},							       \
+		.name = #stat,						       \
+	}
+/* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */
+#define STATS_DESC(SCOPE, stat, type, unit, base, exp)			       \
+	SCOPE##_STATS_DESC(stat, type, unit, base, exp)
+
+#define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent)	       \
+	STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent)
+#define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent)		       \
+	STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent)
+#define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent)		       \
+	STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent)
+
+/* Cumulative counter, read/write */
+#define STATS_DESC_COUNTER(SCOPE, name)					       \
+	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE,		       \
+		KVM_STATS_BASE_POW10, 0)
+/* Instantaneous counter, read only */
+#define STATS_DESC_ICOUNTER(SCOPE, name)				       \
+	STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE,		       \
+		KVM_STATS_BASE_POW10, 0)
+/* Peak counter, read/write */
+#define STATS_DESC_PCOUNTER(SCOPE, name)				       \
+	STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE,		       \
+		KVM_STATS_BASE_POW10, 0)
+
+/* Cumulative time in nanosecond */
+#define STATS_DESC_TIME_NSEC(SCOPE, name)				       \
+	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,	       \
+		KVM_STATS_BASE_POW10, -9)
+
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
+ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
+		       const struct _kvm_stats_desc *desc,
+		       void *stats, size_t size_stats,
+		       char __user *user_buffer, size_t size, loff_t *offset);
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 48db778291b7..ed6a985c5680 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -89,4 +89,6 @@ struct kvm_vcpu_stat_generic {
 	u64 halt_poll_fail_ns;
 };
 
+#define KVM_STATS_NAME_SIZE	48
+
 #endif /* __KVM_TYPES_H__ */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 330835f1005b..f1ba602260f6 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1087,6 +1087,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SREGS2 200
 #define KVM_CAP_EXIT_HYPERCALL 201
 #define KVM_CAP_PPC_RPT_INVALIDATE 202
+#define KVM_CAP_BINARY_STATS_FD 203
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1906,4 +1907,76 @@ struct kvm_dirty_gfn {
 #define KVM_BUS_LOCK_DETECTION_OFF             (1 << 0)
 #define KVM_BUS_LOCK_DETECTION_EXIT            (1 << 1)
 
+/**
+ * struct kvm_stats_header - Header of per vm/vcpu binary statistics data.
+ * @flags: Some extra information for header, always 0 for now.
+ * @name_size: The size in bytes of the memory which contains statistics
+ *             name string including trailing '\0'. The memory is allocated
+ *             at the send of statistics descriptor.
+ * @num_desc: The number of statistics the vm or vcpu has.
+ * @id_offset: The offset of the vm/vcpu stats' id string in the file pointed
+ *             by vm/vcpu stats fd.
+ * @desc_offset: The offset of the vm/vcpu stats' descriptor block in the file
+ *               pointd by vm/vcpu stats fd.
+ * @data_offset: The offset of the vm/vcpu stats' data block in the file
+ *               pointed by vm/vcpu stats fd.
+ *
+ * This is the header userspace needs to read from stats fd before any other
+ * readings. It is used by userspace to discover all the information about the
+ * vm/vcpu's binary statistics.
+ * Userspace reads this header from the start of the vm/vcpu's stats fd.
+ */
+struct kvm_stats_header {
+	__u32 flags;
+	__u32 name_size;
+	__u32 num_desc;
+	__u32 id_offset;
+	__u32 desc_offset;
+	__u32 data_offset;
+};
+
+#define KVM_STATS_TYPE_SHIFT		0
+#define KVM_STATS_TYPE_MASK		(0xF << KVM_STATS_TYPE_SHIFT)
+#define KVM_STATS_TYPE_CUMULATIVE	(0x0 << KVM_STATS_TYPE_SHIFT)
+#define KVM_STATS_TYPE_INSTANT		(0x1 << KVM_STATS_TYPE_SHIFT)
+#define KVM_STATS_TYPE_PEAK		(0x2 << KVM_STATS_TYPE_SHIFT)
+#define KVM_STATS_TYPE_MAX		KVM_STATS_TYPE_PEAK
+
+#define KVM_STATS_UNIT_SHIFT		4
+#define KVM_STATS_UNIT_MASK		(0xF << KVM_STATS_UNIT_SHIFT)
+#define KVM_STATS_UNIT_NONE		(0x0 << KVM_STATS_UNIT_SHIFT)
+#define KVM_STATS_UNIT_BYTES		(0x1 << KVM_STATS_UNIT_SHIFT)
+#define KVM_STATS_UNIT_SECONDS		(0x2 << KVM_STATS_UNIT_SHIFT)
+#define KVM_STATS_UNIT_CYCLES		(0x3 << KVM_STATS_UNIT_SHIFT)
+#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_CYCLES
+
+#define KVM_STATS_BASE_SHIFT		8
+#define KVM_STATS_BASE_MASK		(0xF << KVM_STATS_BASE_SHIFT)
+#define KVM_STATS_BASE_POW10		(0x0 << KVM_STATS_BASE_SHIFT)
+#define KVM_STATS_BASE_POW2		(0x1 << KVM_STATS_BASE_SHIFT)
+#define KVM_STATS_BASE_MAX		KVM_STATS_BASE_POW2
+
+/**
+ * struct kvm_stats_desc - Descriptor of a KVM statistics.
+ * @flags: Annotations of the stats, like type, unit, etc.
+ * @exponent: Used together with @flags to determine the unit.
+ * @size: The number of data items for this stats.
+ *        Every data item is of type __u64.
+ * @offset: The offset of the stats to the start of stat structure in
+ *          struture kvm or kvm_vcpu.
+ * @unused: Unused field for future usage. Always 0 for now.
+ * @name: The name string for the stats. Its size is indicated by the
+ *        &kvm_stats_header->name_size.
+ */
+struct kvm_stats_desc {
+	__u32 flags;
+	__s16 exponent;
+	__u16 size;
+	__u32 offset;
+	__u32 unused;
+	char name[];
+};
+
+#define KVM_GET_STATS_FD  _IO(KVMIO,  0xce)
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/binary_stats.c b/virt/kvm/binary_stats.c
new file mode 100644
index 000000000000..e609d428811a
--- /dev/null
+++ b/virt/kvm/binary_stats.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM binary statistics interface implementation
+ *
+ * Copyright 2021 Google LLC
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+
+/**
+ * kvm_stats_read() - Common function to read from the binary statistics
+ * file descriptor.
+ *
+ * @id: identification string of the stats
+ * @header: stats header for a vm or a vcpu
+ * @desc: start address of an array of stats descriptors for a vm or a vcpu
+ * @stats: start address of stats data block for a vm or a vcpu
+ * @size_stats: the size of stats data block pointed by @stats
+ * @user_buffer: start address of userspace buffer
+ * @size: requested read size from userspace
+ * @offset: the start position from which the content will be read for the
+ *          corresponding vm or vcp file descriptor
+ *
+ * The file content of a vm/vcpu file descriptor is now defined as below:
+ * +-------------+
+ * |   Header    |
+ * +-------------+
+ * |  id string  |
+ * +-------------+
+ * | Descriptors |
+ * +-------------+
+ * | Stats Data  |
+ * +-------------+
+ * Although this function allows userspace to read any amount of data (as long
+ * as in the limit) from any position, the typical usage would follow below
+ * steps:
+ * 1. Read header from offset 0. Get the offset of descriptors and stats data
+ *    and some other necessary information. This is a one-time work for the
+ *    lifecycle of the corresponding vm/vcpu stats fd.
+ * 2. Read id string from its offset. This is a one-time work for the lifecycle
+ *    of the corresponding vm/vcpu stats fd.
+ * 3. Read descriptors from its offset and discover all the stats by parsing
+ *    descriptors. This is a one-time work for the lifecycle of the
+ *    corresponding vm/vcpu stats fd.
+ * 4. Periodically read stats data from its offset using pread.
+ *
+ * Return: the number of bytes that has been successfully read
+ */
+ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
+		       const struct _kvm_stats_desc *desc,
+		       void *stats, size_t size_stats,
+		       char __user *user_buffer, size_t size, loff_t *offset)
+{
+	ssize_t len;
+	ssize_t copylen;
+	ssize_t remain = size;
+	size_t size_desc;
+	size_t size_header;
+	void *src;
+	loff_t pos = *offset;
+	char __user *dest = user_buffer;
+
+	size_header = sizeof(*header);
+	size_desc = header->num_desc * sizeof(*desc);
+
+	len = KVM_STATS_NAME_SIZE + size_header + size_desc + size_stats - pos;
+	len = min(len, remain);
+	if (len <= 0)
+		return 0;
+	remain = len;
+
+	/*
+	 * Copy kvm stats header.
+	 * The header is the first block of content userspace usually read out.
+	 * The pos is 0 and the copylen and remain would be the size of header.
+	 * The copy of the header would be skipped if offset is larger than the
+	 * size of header. That usually happens when userspace reads stats
+	 * descriptors and stats data.
+	 */
+	copylen = size_header - pos;
+	copylen = min(copylen, remain);
+	if (copylen > 0) {
+		src = (void *)header + pos;
+		if (copy_to_user(dest, src, copylen))
+			return -EFAULT;
+		remain -= copylen;
+		pos += copylen;
+		dest += copylen;
+	}
+
+	/*
+	 * Copy kvm stats header id string.
+	 * The id string is unique for every vm/vcpu, which is stored in kvm
+	 * and kvm_vcpu structure.
+	 * The id string is part of the stat header from the perspective of
+	 * userspace, it is usually read out together with previous constant
+	 * header part and could be skipped for later descriptors and stats
+	 * data readings.
+	 */
+	copylen = header->id_offset + KVM_STATS_NAME_SIZE - pos;
+	copylen = min(copylen, remain);
+	if (copylen > 0) {
+		src = id + pos - header->id_offset;
+		if (copy_to_user(dest, src, copylen))
+			return -EFAULT;
+		remain -= copylen;
+		pos += copylen;
+		dest += copylen;
+	}
+
+	/*
+	 * Copy kvm stats descriptors.
+	 * The descriptors copy would be skipped in the typical case that
+	 * userspace periodically read stats data, since the pos would be
+	 * greater than the end address of descriptors
+	 * (header->header.desc_offset + size_desc) causing copylen <= 0.
+	 */
+	copylen = header->desc_offset + size_desc - pos;
+	copylen = min(copylen, remain);
+	if (copylen > 0) {
+		src = (void *)desc + pos - header->desc_offset;
+		if (copy_to_user(dest, src, copylen))
+			return -EFAULT;
+		remain -= copylen;
+		pos += copylen;
+		dest += copylen;
+	}
+
+	/* Copy kvm stats values */
+	copylen = header->data_offset + size_stats - pos;
+	copylen = min(copylen, remain);
+	if (copylen > 0) {
+		src = stats + pos - header->data_offset;
+		if (copy_to_user(dest, src, copylen))
+			return -EFAULT;
+		remain -= copylen;
+		pos += copylen;
+		dest += copylen;
+	}
+
+	*offset = pos;
+	return len;
+}
-- 
cgit v1.2.3


From b9964ce74544ea6cbc4eabd2c89a531adf7f291d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 24 Jun 2021 18:05:51 +0200
Subject: rcu: Create an unrcu_pointer() to remove __rcu from a pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The xchg() and cmpxchg() functions are sometimes used to carry out RCU
updates.  Unfortunately, this can result in sparse warnings for both
the old-value and new-value arguments, as well as for the return value.
The arguments can be dealt with using RCU_INITIALIZER():

        old_p = xchg(&p, RCU_INITIALIZER(new_p));

But a sparse warning still remains due to assigning the __rcu pointer
returned from xchg to the (most likely) non-__rcu pointer old_p.

This commit therefore provides an unrcu_pointer() macro that strips
the __rcu.  This macro can be used as follows:

        old_p = unrcu_pointer(xchg(&p, RCU_INITIALIZER(new_p)));

Reported-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-2-toke@redhat.com
---
 include/linux/rcupdate.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 9455476c5ba2..d7895b81264e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -363,6 +363,20 @@ static inline void rcu_preempt_sleep_check(void) { }
 #define rcu_check_sparse(p, space)
 #endif /* #else #ifdef __CHECKER__ */
 
+/**
+ * unrcu_pointer - mark a pointer as not being RCU protected
+ * @p: pointer needing to lose its __rcu property
+ *
+ * Converts @p from an __rcu pointer to a __kernel pointer.
+ * This allows an __rcu pointer to be used with xchg() and friends.
+ */
+#define unrcu_pointer(p)						\
+({									\
+	typeof(*p) *_________p1 = (typeof(*p) *__force)(p);		\
+	rcu_check_sparse(p, __rcu); 					\
+	((typeof(*p) __force __kernel *)(_________p1)); 		\
+})
+
 #define __rcu_access_pointer(p, space) \
 ({ \
 	typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
-- 
cgit v1.2.3


From 782347b6bcad07ddb574422e01e22c92e05928c8 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Thu, 24 Jun 2021 18:05:55 +0200
Subject: xdp: Add proper __rcu annotations to redirect map entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

XDP_REDIRECT works by a three-step process: the bpf_redirect() and
bpf_redirect_map() helpers will lookup the target of the redirect and store
it (along with some other metadata) in a per-CPU struct bpf_redirect_info.
Next, when the program returns the XDP_REDIRECT return code, the driver
will call xdp_do_redirect() which will use the information thus stored to
actually enqueue the frame into a bulk queue structure (that differs
slightly by map type, but shares the same principle). Finally, before
exiting its NAPI poll loop, the driver will call xdp_do_flush(), which will
flush all the different bulk queues, thus completing the redirect.

Pointers to the map entries will be kept around for this whole sequence of
steps, protected by RCU. However, there is no top-level rcu_read_lock() in
the core code; instead drivers add their own rcu_read_lock() around the XDP
portions of the code, but somewhat inconsistently as Martin discovered[0].
However, things still work because everything happens inside a single NAPI
poll sequence, which means it's between a pair of calls to
local_bh_disable()/local_bh_enable(). So Paul suggested[1] that we could
document this intention by using rcu_dereference_check() with
rcu_read_lock_bh_held() as a second parameter, thus allowing sparse and
lockdep to verify that everything is done correctly.

This patch does just that: we add an __rcu annotation to the map entry
pointers and remove the various comments explaining the NAPI poll assurance
strewn through devmap.c in favour of a longer explanation in filter.c. The
goal is to have one coherent documentation of the entire flow, and rely on
the RCU annotations as a "standard" way of communicating the flow in the
map code (which can additionally be understood by sparse and lockdep).

The RCU annotation replacements result in a fairly straight-forward
replacement where READ_ONCE() becomes rcu_dereference_check(), WRITE_ONCE()
becomes rcu_assign_pointer() and xchg() and cmpxchg() gets wrapped in the
proper constructs to cast the pointer back and forth between __rcu and
__kernel address space (for the benefit of sparse). The one complication is
that xskmap has a few constructions where double-pointers are passed back
and forth; these simply all gain __rcu annotations, and only the final
reference/dereference to the inner-most pointer gets changed.

With this, everything can be run through sparse without eliciting
complaints, and lockdep can verify correctness even without the use of
rcu_read_lock() in the drivers. Subsequent patches will clean these up from
the drivers.

[0] https://lore.kernel.org/bpf/20210415173551.7ma4slcbqeyiba2r@kafai-mbp.dhcp.thefacebook.com/
[1] https://lore.kernel.org/bpf/20210419165837.GA975577@paulmck-ThinkPad-P17-Gen-1/

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210624160609.292325-6-toke@redhat.com
---
 include/linux/filter.h |  8 +++-----
 include/net/xdp_sock.h |  2 +-
 kernel/bpf/cpumap.c    | 13 +++++++++----
 kernel/bpf/devmap.c    | 49 +++++++++++++++++++++----------------------------
 net/core/filter.c      | 28 ++++++++++++++++++++++++++++
 net/xdp/xsk.c          |  4 ++--
 net/xdp/xsk.h          |  4 ++--
 net/xdp/xskmap.c       | 29 +++++++++++++++++------------
 8 files changed, 83 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 688856e0b28a..472f97074da0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -763,11 +763,9 @@ DECLARE_BPF_DISPATCHER(xdp)
 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 					    struct xdp_buff *xdp)
 {
-	/* Caller needs to hold rcu_read_lock() (!), otherwise program
-	 * can be released while still running, or map elements could be
-	 * freed early while still having concurrent users. XDP fastpath
-	 * already takes rcu_read_lock() when fetching the program, so
-	 * it's not necessary here anymore.
+	/* Driver XDP hooks are invoked within a single NAPI poll cycle and thus
+	 * under local_bh_disable(), which provides the needed RCU protection
+	 * for accessing map entries.
 	 */
 	return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
 }
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 9c0722c6d7ac..fff069d2ed1b 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -37,7 +37,7 @@ struct xdp_umem {
 struct xsk_map {
 	struct bpf_map map;
 	spinlock_t lock; /* Synchronize map updates */
-	struct xdp_sock *xsk_map[];
+	struct xdp_sock __rcu *xsk_map[];
 };
 
 struct xdp_sock {
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a1a0c4e791c6..480e936c54d0 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -74,7 +74,7 @@ struct bpf_cpu_map_entry {
 struct bpf_cpu_map {
 	struct bpf_map map;
 	/* Below members specific for map type */
-	struct bpf_cpu_map_entry **cpu_map;
+	struct bpf_cpu_map_entry __rcu **cpu_map;
 };
 
 static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
@@ -469,7 +469,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 {
 	struct bpf_cpu_map_entry *old_rcpu;
 
-	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+	old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
 	if (old_rcpu) {
 		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
 		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
@@ -551,7 +551,7 @@ static void cpu_map_free(struct bpf_map *map)
 	for (i = 0; i < cmap->map.max_entries; i++) {
 		struct bpf_cpu_map_entry *rcpu;
 
-		rcpu = READ_ONCE(cmap->cpu_map[i]);
+		rcpu = rcu_dereference_raw(cmap->cpu_map[i]);
 		if (!rcpu)
 			continue;
 
@@ -562,6 +562,10 @@ static void cpu_map_free(struct bpf_map *map)
 	kfree(cmap);
 }
 
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
 static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
@@ -570,7 +574,8 @@ static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 	if (key >= map->max_entries)
 		return NULL;
 
-	rcpu = READ_ONCE(cmap->cpu_map[key]);
+	rcpu = rcu_dereference_check(cmap->cpu_map[key],
+				     rcu_read_lock_bh_held());
 	return rcpu;
 }
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2a75e6c2d27d..2f6bd75cd682 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -73,7 +73,7 @@ struct bpf_dtab_netdev {
 
 struct bpf_dtab {
 	struct bpf_map map;
-	struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
+	struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
 	struct list_head list;
 
 	/* these are only used for DEVMAP_HASH type maps */
@@ -226,7 +226,7 @@ static void dev_map_free(struct bpf_map *map)
 		for (i = 0; i < dtab->map.max_entries; i++) {
 			struct bpf_dtab_netdev *dev;
 
-			dev = dtab->netdev_map[i];
+			dev = rcu_dereference_raw(dtab->netdev_map[i]);
 			if (!dev)
 				continue;
 
@@ -259,6 +259,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
 static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -410,15 +414,9 @@ out:
 	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
 }
 
-/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
- * from the driver before returning from its napi->poll() routine. The poll()
- * routine is called either from busy_poll context or net_rx_action signaled
- * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the flush list
- * is empty before completing to ensure all flush operations have completed.
- * When drivers update the bpf program they may need to ensure any flush ops
- * are also complete. Using synchronize_rcu or call_rcu will suffice for this
- * because both wait for napi context to exit.
+/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
+ * driver before returning from its napi->poll() routine. See the comment above
+ * xdp_do_flush() in filter.c.
  */
 void __dev_flush(void)
 {
@@ -433,9 +431,9 @@ void __dev_flush(void)
 	}
 }
 
-/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put won't happen until after reading
- * the ifindex.
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
  */
 static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
@@ -445,12 +443,14 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	if (key >= map->max_entries)
 		return NULL;
 
-	obj = READ_ONCE(dtab->netdev_map[key]);
+	obj = rcu_dereference_check(dtab->netdev_map[key],
+				    rcu_read_lock_bh_held());
 	return obj;
 }
 
-/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
+ * variable access, and map elements stick around. See comment above
+ * xdp_do_flush() in filter.c.
  */
 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
@@ -735,14 +735,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	if (k >= map->max_entries)
 		return -EINVAL;
 
-	/* Use call_rcu() here to ensure any rcu critical sections have
-	 * completed as well as any flush operations because call_rcu
-	 * will wait for preempt-disable region to complete, NAPI in this
-	 * context.  And additionally, the driver tear down ensures all
-	 * soft irqs are complete before removing the net device in the
-	 * case of dev_put equals zero.
-	 */
-	old_dev = xchg(&dtab->netdev_map[k], NULL);
+	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
 	if (old_dev)
 		call_rcu(&old_dev->rcu, __dev_map_entry_free);
 	return 0;
@@ -851,7 +844,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 	 * Remembering the driver side flush operation will happen before the
 	 * net device is removed.
 	 */
-	old_dev = xchg(&dtab->netdev_map[i], dev);
+	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
 	if (old_dev)
 		call_rcu(&old_dev->rcu, __dev_map_entry_free);
 
@@ -1031,10 +1024,10 @@ static int dev_map_notification(struct notifier_block *notifier,
 			for (i = 0; i < dtab->map.max_entries; i++) {
 				struct bpf_dtab_netdev *dev, *odev;
 
-				dev = READ_ONCE(dtab->netdev_map[i]);
+				dev = rcu_dereference(dtab->netdev_map[i]);
 				if (!dev || netdev != dev->dev)
 					continue;
-				odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+				odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
 				if (dev == odev)
 					call_rcu(&dev->rcu,
 						 __dev_map_entry_free);
diff --git a/net/core/filter.c b/net/core/filter.c
index d062053994c7..d22895caa164 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3897,6 +3897,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+/* XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ *    of the redirect and store it (along with some other metadata) in a per-CPU
+ *    struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ *    call xdp_do_redirect() which will use the information in struct
+ *    bpf_redirect_info to actually enqueue the frame into a map type-specific
+ *    bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
+ *    which will flush all the different bulk queues, thus completing the
+ *    redirect.
+ *
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that to hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
 void xdp_do_flush(void)
 {
 	__dev_flush();
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cd62d4ba87a9..996da915f520 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -749,7 +749,7 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
 }
 
 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
-					      struct xdp_sock ***map_entry)
+					      struct xdp_sock __rcu ***map_entry)
 {
 	struct xsk_map *map = NULL;
 	struct xsk_map_node *node;
@@ -785,7 +785,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
 	 * might be updates to the map between
 	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
 	 */
-	struct xdp_sock **map_entry = NULL;
+	struct xdp_sock __rcu **map_entry = NULL;
 	struct xsk_map *map;
 
 	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index edcf249ad1f1..a4bc4749faac 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -31,7 +31,7 @@ struct xdp_mmap_offsets_v1 {
 struct xsk_map_node {
 	struct list_head node;
 	struct xsk_map *map;
-	struct xdp_sock **map_entry;
+	struct xdp_sock __rcu **map_entry;
 };
 
 static inline struct xdp_sock *xdp_sk(struct sock *sk)
@@ -40,7 +40,7 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk)
 }
 
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
-			     struct xdp_sock **map_entry);
+			     struct xdp_sock __rcu **map_entry);
 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 			u16 queue_id);
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 9df75ea4a567..2e48d0e094d9 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -12,7 +12,7 @@
 #include "xsk.h"
 
 static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
-					       struct xdp_sock **map_entry)
+					       struct xdp_sock __rcu **map_entry)
 {
 	struct xsk_map_node *node;
 
@@ -42,7 +42,7 @@ static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
 }
 
 static void xsk_map_sock_delete(struct xdp_sock *xs,
-				struct xdp_sock **map_entry)
+				struct xdp_sock __rcu **map_entry)
 {
 	struct xsk_map_node *n, *tmp;
 
@@ -124,6 +124,10 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 	return insn - insn_buf;
 }
 
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
 static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
@@ -131,12 +135,11 @@ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
 	if (key >= map->max_entries)
 		return NULL;
 
-	return READ_ONCE(m->xsk_map[key]);
+	return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held());
 }
 
 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held());
 	return __xsk_map_lookup_elem(map, *(u32 *)key);
 }
 
@@ -149,7 +152,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
 			       u64 map_flags)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct xdp_sock *xs, *old_xs, **map_entry;
+	struct xdp_sock __rcu **map_entry;
+	struct xdp_sock *xs, *old_xs;
 	u32 i = *(u32 *)key, fd = *(u32 *)value;
 	struct xsk_map_node *node;
 	struct socket *sock;
@@ -179,7 +183,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
 	}
 
 	spin_lock_bh(&m->lock);
-	old_xs = READ_ONCE(*map_entry);
+	old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock));
 	if (old_xs == xs) {
 		err = 0;
 		goto out;
@@ -191,7 +195,7 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
 		goto out;
 	}
 	xsk_map_sock_add(xs, node);
-	WRITE_ONCE(*map_entry, xs);
+	rcu_assign_pointer(*map_entry, xs);
 	if (old_xs)
 		xsk_map_sock_delete(old_xs, map_entry);
 	spin_unlock_bh(&m->lock);
@@ -208,7 +212,8 @@ out:
 static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct xdp_sock *old_xs, **map_entry;
+	struct xdp_sock __rcu **map_entry;
+	struct xdp_sock *old_xs;
 	int k = *(u32 *)key;
 
 	if (k >= map->max_entries)
@@ -216,7 +221,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 
 	spin_lock_bh(&m->lock);
 	map_entry = &m->xsk_map[k];
-	old_xs = xchg(map_entry, NULL);
+	old_xs = unrcu_pointer(xchg(map_entry, NULL));
 	if (old_xs)
 		xsk_map_sock_delete(old_xs, map_entry);
 	spin_unlock_bh(&m->lock);
@@ -231,11 +236,11 @@ static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 }
 
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
-			     struct xdp_sock **map_entry)
+			     struct xdp_sock __rcu **map_entry)
 {
 	spin_lock_bh(&map->lock);
-	if (READ_ONCE(*map_entry) == xs) {
-		WRITE_ONCE(*map_entry, NULL);
+	if (rcu_access_pointer(*map_entry) == xs) {
+		rcu_assign_pointer(*map_entry, NULL);
 		xsk_map_sock_delete(xs, map_entry);
 	}
 	spin_unlock_bh(&map->lock);
-- 
cgit v1.2.3


From 630161cfdf5cdc696a82b59410d1ff00b23d946e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 24 Jun 2021 14:32:39 +0200
Subject: block: move bdev_disk_changed

Move bdev_disk_changed to block/partitions/core.c, together with the
rest of the partition scanning code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210624123240.441814-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/partitions/core.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/block_dev.c          | 53 -----------------------------------------------
 include/linux/genhd.h   |  1 -
 3 files changed, 54 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/block/partitions/core.c b/block/partitions/core.c
index 186d4fbd9f09..b79785f7027c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -596,7 +596,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
 	return true;
 }
 
-int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
+static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
 {
 	struct parsed_partitions *state;
 	int ret = -EAGAIN, p;
@@ -657,6 +657,59 @@ out_free_state:
 	return ret;
 }
 
+int bdev_disk_changed(struct block_device *bdev, bool invalidate)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	int ret = 0;
+
+	lockdep_assert_held(&disk->open_mutex);
+
+	if (!(disk->flags & GENHD_FL_UP))
+		return -ENXIO;
+
+rescan:
+	if (disk->open_partitions)
+		return -EBUSY;
+	sync_blockdev(bdev);
+	invalidate_bdev(bdev);
+	blk_drop_partitions(disk);
+
+	clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
+	/*
+	 * Historically we only set the capacity to zero for devices that
+	 * support partitions (independ of actually having partitions created).
+	 * Doing that is rather inconsistent, but changing it broke legacy
+	 * udisks polling for legacy ide-cdrom devices.  Use the crude check
+	 * below to get the sane behavior for most device while not breaking
+	 * userspace for this particular setup.
+	 */
+	if (invalidate) {
+		if (disk_part_scan_enabled(disk) ||
+		    !(disk->flags & GENHD_FL_REMOVABLE))
+			set_capacity(disk, 0);
+	}
+
+	if (get_capacity(disk)) {
+		ret = blk_add_partitions(disk, bdev);
+		if (ret == -EAGAIN)
+			goto rescan;
+	} else if (invalidate) {
+		/*
+		 * Tell userspace that the media / partition table may have
+		 * changed.
+		 */
+		kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+	}
+
+	return ret;
+}
+/*
+ * Only exported for loop and dasd for historic reasons.  Don't use in new
+ * code!
+ */
+EXPORT_SYMBOL_GPL(bdev_disk_changed);
+
 void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
 {
 	struct address_space *mapping = state->bdev->bd_inode->i_mapping;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ac9b3c158a77..5b3a73ecb696 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1242,59 +1242,6 @@ static void blkdev_flush_mapping(struct block_device *bdev)
 	bdev_write_inode(bdev);
 }
 
-int bdev_disk_changed(struct block_device *bdev, bool invalidate)
-{
-	struct gendisk *disk = bdev->bd_disk;
-	int ret = 0;
-
-	lockdep_assert_held(&disk->open_mutex);
-
-	if (!(disk->flags & GENHD_FL_UP))
-		return -ENXIO;
-
-rescan:
-	if (disk->open_partitions)
-		return -EBUSY;
-	sync_blockdev(bdev);
-	invalidate_bdev(bdev);
-	blk_drop_partitions(disk);
-
-	clear_bit(GD_NEED_PART_SCAN, &disk->state);
-
-	/*
-	 * Historically we only set the capacity to zero for devices that
-	 * support partitions (independ of actually having partitions created).
-	 * Doing that is rather inconsistent, but changing it broke legacy
-	 * udisks polling for legacy ide-cdrom devices.  Use the crude check
-	 * below to get the sane behavior for most device while not breaking
-	 * userspace for this particular setup.
-	 */
-	if (invalidate) {
-		if (disk_part_scan_enabled(disk) ||
-		    !(disk->flags & GENHD_FL_REMOVABLE))
-			set_capacity(disk, 0);
-	}
-
-	if (get_capacity(disk)) {
-		ret = blk_add_partitions(disk, bdev);
-		if (ret == -EAGAIN)
-			goto rescan;
-	} else if (invalidate) {
-		/*
-		 * Tell userspace that the media / partition table may have
-		 * changed.
-		 */
-		kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
-	}
-
-	return ret;
-}
-/*
- * Only exported for loop and dasd for historic reasons.  Don't use in new
- * code!
- */
-EXPORT_SYMBOL_GPL(bdev_disk_changed);
-
 static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
 {
 	struct gendisk *disk = bdev->bd_disk;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 03d684f0498f..f5f0c9bdf1d2 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -257,7 +257,6 @@ static inline sector_t get_capacity(struct gendisk *disk)
 }
 
 int bdev_disk_changed(struct block_device *bdev, bool invalidate);
-int blk_add_partitions(struct gendisk *disk, struct block_device *bdev);
 void blk_drop_partitions(struct gendisk *disk);
 
 extern struct gendisk *__alloc_disk_node(int minors, int node_id);
-- 
cgit v1.2.3


From 0384264ea8a39bd98c9a3158060565f650c056a6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 24 Jun 2021 14:32:40 +0200
Subject: block: pass a gendisk to bdev_disk_changed

bdev_disk_changed can only operate on whole devices.  Make that clear
by passing a gendisk instead of the struct block_device.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210624123240.441814-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/partitions/core.c         | 22 ++++++++++------------
 drivers/block/loop.c            | 21 ++++++++++-----------
 drivers/s390/block/dasd_genhd.c |  4 ++--
 fs/block_dev.c                  |  4 ++--
 include/linux/genhd.h           |  2 +-
 5 files changed, 25 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/block/partitions/core.c b/block/partitions/core.c
index b79785f7027c..347c56a51d87 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -120,8 +120,7 @@ static void free_partitions(struct parsed_partitions *state)
 	kfree(state);
 }
 
-static struct parsed_partitions *check_partition(struct gendisk *hd,
-		struct block_device *bdev)
+static struct parsed_partitions *check_partition(struct gendisk *hd)
 {
 	struct parsed_partitions *state;
 	int i, res, err;
@@ -136,7 +135,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd,
 	}
 	state->pp_buf[0] = '\0';
 
-	state->bdev = bdev;
+	state->bdev = hd->part0;
 	disk_name(hd, 0, state->name);
 	snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
 	if (isdigit(state->name[strlen(state->name)-1]))
@@ -546,7 +545,7 @@ void blk_drop_partitions(struct gendisk *disk)
 	}
 }
 
-static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
+static bool blk_add_partition(struct gendisk *disk,
 		struct parsed_partitions *state, int p)
 {
 	sector_t size = state->parts[p].size;
@@ -596,7 +595,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
 	return true;
 }
 
-static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
+static int blk_add_partitions(struct gendisk *disk)
 {
 	struct parsed_partitions *state;
 	int ret = -EAGAIN, p;
@@ -604,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
 	if (!disk_part_scan_enabled(disk))
 		return 0;
 
-	state = check_partition(disk, bdev);
+	state = check_partition(disk);
 	if (!state)
 		return 0;
 	if (IS_ERR(state)) {
@@ -648,7 +647,7 @@ static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
 
 	for (p = 1; p < state->limit; p++)
-		if (!blk_add_partition(disk, bdev, state, p))
+		if (!blk_add_partition(disk, state, p))
 			goto out_free_state;
 
 	ret = 0;
@@ -657,9 +656,8 @@ out_free_state:
 	return ret;
 }
 
-int bdev_disk_changed(struct block_device *bdev, bool invalidate)
+int bdev_disk_changed(struct gendisk *disk, bool invalidate)
 {
-	struct gendisk *disk = bdev->bd_disk;
 	int ret = 0;
 
 	lockdep_assert_held(&disk->open_mutex);
@@ -670,8 +668,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
 rescan:
 	if (disk->open_partitions)
 		return -EBUSY;
-	sync_blockdev(bdev);
-	invalidate_bdev(bdev);
+	sync_blockdev(disk->part0);
+	invalidate_bdev(disk->part0);
 	blk_drop_partitions(disk);
 
 	clear_bit(GD_NEED_PART_SCAN, &disk->state);
@@ -691,7 +689,7 @@ rescan:
 	}
 
 	if (get_capacity(disk)) {
-		ret = blk_add_partitions(disk, bdev);
+		ret = blk_add_partitions(disk);
 		if (ret == -EAGAIN)
 			goto rescan;
 	} else if (invalidate) {
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index e90f7d349816..4fb1f9530d5a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -647,14 +647,13 @@ static inline void loop_update_dio(struct loop_device *lo)
 				lo->use_dio);
 }
 
-static void loop_reread_partitions(struct loop_device *lo,
-				   struct block_device *bdev)
+static void loop_reread_partitions(struct loop_device *lo)
 {
 	int rc;
 
-	mutex_lock(&bdev->bd_disk->open_mutex);
-	rc = bdev_disk_changed(bdev, false);
-	mutex_unlock(&bdev->bd_disk->open_mutex);
+	mutex_lock(&lo->lo_disk->open_mutex);
+	rc = bdev_disk_changed(lo->lo_disk, false);
+	mutex_unlock(&lo->lo_disk->open_mutex);
 	if (rc)
 		pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
 			__func__, lo->lo_number, lo->lo_file_name, rc);
@@ -752,7 +751,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	 */
 	fput(old_file);
 	if (partscan)
-		loop_reread_partitions(lo, bdev);
+		loop_reread_partitions(lo);
 	return 0;
 
 out_err:
@@ -1174,7 +1173,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	bdgrab(bdev);
 	mutex_unlock(&lo->lo_mutex);
 	if (partscan)
-		loop_reread_partitions(lo, bdev);
+		loop_reread_partitions(lo);
 	if (!(mode & FMODE_EXCL))
 		bd_abort_claiming(bdev, loop_configure);
 	return 0;
@@ -1268,10 +1267,10 @@ out_unlock:
 		 * current holder is released.
 		 */
 		if (!release)
-			mutex_lock(&bdev->bd_disk->open_mutex);
-		err = bdev_disk_changed(bdev, false);
+			mutex_lock(&lo->lo_disk->open_mutex);
+		err = bdev_disk_changed(lo->lo_disk, false);
 		if (!release)
-			mutex_unlock(&bdev->bd_disk->open_mutex);
+			mutex_unlock(&lo->lo_disk->open_mutex);
 		if (err)
 			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
 				__func__, lo_number, err);
@@ -1416,7 +1415,7 @@ out_unfreeze:
 out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 	if (partscan)
-		loop_reread_partitions(lo, bdev);
+		loop_reread_partitions(lo);
 
 	return err;
 }
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index bf2082d461c7..493e8469893c 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -110,7 +110,7 @@ int dasd_scan_partitions(struct dasd_block *block)
 	}
 
 	mutex_lock(&block->gdp->open_mutex);
-	rc = bdev_disk_changed(bdev, false);
+	rc = bdev_disk_changed(block->gdp, false);
 	mutex_unlock(&block->gdp->open_mutex);
 	if (rc)
 		DBF_DEV_EVENT(DBF_ERR, block->base,
@@ -146,7 +146,7 @@ void dasd_destroy_partitions(struct dasd_block *block)
 	block->bdev = NULL;
 
 	mutex_lock(&bdev->bd_disk->open_mutex);
-	bdev_disk_changed(bdev, true);
+	bdev_disk_changed(bdev->bd_disk, true);
 	mutex_unlock(&bdev->bd_disk->open_mutex);
 
 	/* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5b3a73ecb696..34253d155f5c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1253,7 +1253,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
 			/* avoid ghost partitions on a removed medium */
 			if (ret == -ENOMEDIUM &&
 			     test_bit(GD_NEED_PART_SCAN, &disk->state))
-				bdev_disk_changed(bdev, true);
+				bdev_disk_changed(disk, true);
 			return ret;
 		}
 	}
@@ -1264,7 +1264,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
 			bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
 	}
 	if (test_bit(GD_NEED_PART_SCAN, &disk->state))
-		bdev_disk_changed(bdev, false);
+		bdev_disk_changed(disk, false);
 	bdev->bd_openers++;
 	return 0;;
 }
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index f5f0c9bdf1d2..13b34177cc85 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -256,7 +256,7 @@ static inline sector_t get_capacity(struct gendisk *disk)
 	return bdev_nr_sectors(disk->part0);
 }
 
-int bdev_disk_changed(struct block_device *bdev, bool invalidate);
+int bdev_disk_changed(struct gendisk *disk, bool invalidate);
 void blk_drop_partitions(struct gendisk *disk);
 
 extern struct gendisk *__alloc_disk_node(int minors, int node_id);
-- 
cgit v1.2.3


From c88c192dc3ea209694cc08f4ccf51f920d26bdae Mon Sep 17 00:00:00 2001
From: Marcin Wojtas <mw@semihalf.com>
Date: Thu, 24 Jun 2021 02:51:51 +0200
Subject: net: mdiobus: fix fwnode_mdbiobus_register() fallback case

The fallback case of fwnode_mdbiobus_register()
(relevant for !CONFIG_FWNODE_MDIO) was defined with wrong
argument name, causing a compilation error. Fix that.

Signed-off-by: Marcin Wojtas <mw@semihalf.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/fwnode_mdio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fwnode_mdio.h b/include/linux/fwnode_mdio.h
index 13d4ae8fee0a..f62817c23137 100644
--- a/include/linux/fwnode_mdio.h
+++ b/include/linux/fwnode_mdio.h
@@ -40,7 +40,7 @@ static inline int fwnode_mdiobus_register(struct mii_bus *bus,
 	 * This way, we don't have to keep compat bits around in drivers.
 	 */
 
-	return mdiobus_register(mdio);
+	return mdiobus_register(bus);
 }
 #endif
 
-- 
cgit v1.2.3


From 3c0d0894320cc517fda657c69939cd0313d0b4e2 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 21 Jun 2021 11:53:38 +0200
Subject: libceph: don't pass result into ac->ops->handle_reply()

There is no result to pass in msgr2 case because authentication
failures are reported through auth_bad_method frame and in MAuth
case an error is returned immediately.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/ceph/auth.h |  2 +-
 net/ceph/auth.c           | 15 ++++++++++-----
 net/ceph/auth_none.c      |  4 ++--
 net/ceph/auth_x.c         |  6 ++----
 4 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 71b5d481c653..39425e2f7cb2 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -50,7 +50,7 @@ struct ceph_auth_client_ops {
 	 * another request.
 	 */
 	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
-	int (*handle_reply)(struct ceph_auth_client *ac, int result,
+	int (*handle_reply)(struct ceph_auth_client *ac,
 			    void *buf, void *end, u8 *session_key,
 			    int *session_key_len, u8 *con_secret,
 			    int *con_secret_len);
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index de407e8feb97..3a9d44eee941 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -260,14 +260,19 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 		ac->negotiating = false;
 	}
 
-	ret = ac->ops->handle_reply(ac, result, payload, payload_end,
+	if (result) {
+		pr_err("auth protocol '%s' mauth authentication failed: %d\n",
+		       ceph_auth_proto_name(ac->protocol), result);
+		ret = result;
+		goto out;
+	}
+
+	ret = ac->ops->handle_reply(ac, payload, payload_end,
 				    NULL, NULL, NULL, NULL);
 	if (ret == -EAGAIN) {
 		ret = build_request(ac, true, reply_buf, reply_len);
 		goto out;
 	} else if (ret) {
-		pr_err("auth protocol '%s' mauth authentication failed: %d\n",
-		       ceph_auth_proto_name(ac->protocol), result);
 		goto out;
 	}
 
@@ -480,7 +485,7 @@ int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
 	int ret;
 
 	mutex_lock(&ac->mutex);
-	ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+	ret = ac->ops->handle_reply(ac, reply, reply + reply_len,
 				    NULL, NULL, NULL, NULL);
 	if (ret == -EAGAIN)
 		ret = build_request(ac, false, buf, buf_len);
@@ -498,7 +503,7 @@ int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
 	int ret;
 
 	mutex_lock(&ac->mutex);
-	ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+	ret = ac->ops->handle_reply(ac, reply, reply + reply_len,
 				    session_key, session_key_len,
 				    con_secret, con_secret_len);
 	if (!ret)
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 70e86e462250..aab490a111eb 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -69,7 +69,7 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
  * the generic auth code decode the global_id, and we carry no actual
  * authenticate state, so nothing happens here.
  */
-static int handle_reply(struct ceph_auth_client *ac, int result,
+static int handle_reply(struct ceph_auth_client *ac,
 			void *buf, void *end, u8 *session_key,
 			int *session_key_len, u8 *con_secret,
 			int *con_secret_len)
@@ -77,7 +77,7 @@ static int handle_reply(struct ceph_auth_client *ac, int result,
 	struct ceph_auth_none_info *xi = ac->private;
 
 	xi->starting = false;
-	return result;
+	return 0;
 }
 
 static void ceph_auth_none_destroy_authorizer(struct ceph_authorizer *a)
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 79641c4afee9..cab99c5581b0 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -661,7 +661,7 @@ e_inval:
 	return -EINVAL;
 }
 
-static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+static int ceph_x_handle_reply(struct ceph_auth_client *ac,
 			       void *buf, void *end,
 			       u8 *session_key, int *session_key_len,
 			       u8 *con_secret, int *con_secret_len)
@@ -669,13 +669,11 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 	struct ceph_x_info *xi = ac->private;
 	struct ceph_x_ticket_handler *th;
 	int len = end - buf;
+	int result;
 	void *p;
 	int op;
 	int ret;
 
-	if (result)
-		return result;  /* XXX hmm? */
-
 	if (xi->starting) {
 		/* it's a hello */
 		struct ceph_x_server_challenge *sc = buf;
-- 
cgit v1.2.3


From 03af4c7bad8ca59143bca488b90b3775d10d7f94 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 21 Jun 2021 12:17:40 +0200
Subject: libceph: set global_id as soon as we get an auth ticket

Commit 61ca49a9105f ("libceph: don't set global_id until we get an
auth ticket") delayed the setting of global_id too much.  It is set
only after all tickets are received, but in pre-nautilus clusters an
auth ticket and the service tickets are obtained in separate steps
(for a total of three MAuth replies).  When the service tickets are
requested, global_id is used to build an authorizer; if global_id is
still 0 we never get them and fail to establish the session.

Moving the setting of global_id into protocol implementations.  This
way global_id can be set exactly when an auth ticket is received, not
sooner nor later.

Fixes: 61ca49a9105f ("libceph: don't set global_id until we get an auth ticket")
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/ceph/auth.h |  4 +++-
 net/ceph/auth.c           | 13 +++++--------
 net/ceph/auth_none.c      |  3 ++-
 net/ceph/auth_x.c         | 11 ++++++-----
 4 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 39425e2f7cb2..6b138fa97db8 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -50,7 +50,7 @@ struct ceph_auth_client_ops {
 	 * another request.
 	 */
 	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
-	int (*handle_reply)(struct ceph_auth_client *ac,
+	int (*handle_reply)(struct ceph_auth_client *ac, u64 global_id,
 			    void *buf, void *end, u8 *session_key,
 			    int *session_key_len, u8 *con_secret,
 			    int *con_secret_len);
@@ -104,6 +104,8 @@ struct ceph_auth_client {
 	struct mutex mutex;
 };
 
+void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id);
+
 struct ceph_auth_client *ceph_auth_init(const char *name,
 					const struct ceph_crypto_key *key,
 					const int *con_modes);
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 3a9d44eee941..d2b268a1838e 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -36,7 +36,7 @@ static int init_protocol(struct ceph_auth_client *ac, int proto)
 	}
 }
 
-static void set_global_id(struct ceph_auth_client *ac, u64 global_id)
+void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id)
 {
 	dout("%s global_id %llu\n", __func__, global_id);
 
@@ -267,7 +267,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 		goto out;
 	}
 
-	ret = ac->ops->handle_reply(ac, payload, payload_end,
+	ret = ac->ops->handle_reply(ac, global_id, payload, payload_end,
 				    NULL, NULL, NULL, NULL);
 	if (ret == -EAGAIN) {
 		ret = build_request(ac, true, reply_buf, reply_len);
@@ -276,8 +276,6 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 		goto out;
 	}
 
-	set_global_id(ac, global_id);
-
 out:
 	mutex_unlock(&ac->mutex);
 	return ret;
@@ -485,7 +483,7 @@ int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
 	int ret;
 
 	mutex_lock(&ac->mutex);
-	ret = ac->ops->handle_reply(ac, reply, reply + reply_len,
+	ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
 				    NULL, NULL, NULL, NULL);
 	if (ret == -EAGAIN)
 		ret = build_request(ac, false, buf, buf_len);
@@ -503,11 +501,10 @@ int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
 	int ret;
 
 	mutex_lock(&ac->mutex);
-	ret = ac->ops->handle_reply(ac, reply, reply + reply_len,
+	ret = ac->ops->handle_reply(ac, global_id, reply, reply + reply_len,
 				    session_key, session_key_len,
 				    con_secret, con_secret_len);
-	if (!ret)
-		set_global_id(ac, global_id);
+	WARN_ON(ret == -EAGAIN || ret > 0);
 	mutex_unlock(&ac->mutex);
 	return ret;
 }
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index aab490a111eb..097e9f8d87a7 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -69,7 +69,7 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
  * the generic auth code decode the global_id, and we carry no actual
  * authenticate state, so nothing happens here.
  */
-static int handle_reply(struct ceph_auth_client *ac,
+static int handle_reply(struct ceph_auth_client *ac, u64 global_id,
 			void *buf, void *end, u8 *session_key,
 			int *session_key_len, u8 *con_secret,
 			int *con_secret_len)
@@ -77,6 +77,7 @@ static int handle_reply(struct ceph_auth_client *ac,
 	struct ceph_auth_none_info *xi = ac->private;
 
 	xi->starting = false;
+	ceph_auth_set_global_id(ac, global_id);
 	return 0;
 }
 
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index cab99c5581b0..b71b1635916e 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -597,7 +597,7 @@ bad:
 	return -EINVAL;
 }
 
-static int handle_auth_session_key(struct ceph_auth_client *ac,
+static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id,
 				   void **p, void *end,
 				   u8 *session_key, int *session_key_len,
 				   u8 *con_secret, int *con_secret_len)
@@ -613,6 +613,7 @@ static int handle_auth_session_key(struct ceph_auth_client *ac,
 	if (ret)
 		return ret;
 
+	ceph_auth_set_global_id(ac, global_id);
 	if (*p == end) {
 		/* pre-nautilus (or didn't request service tickets!) */
 		WARN_ON(session_key || con_secret);
@@ -661,7 +662,7 @@ e_inval:
 	return -EINVAL;
 }
 
-static int ceph_x_handle_reply(struct ceph_auth_client *ac,
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, u64 global_id,
 			       void *buf, void *end,
 			       u8 *session_key, int *session_key_len,
 			       u8 *con_secret, int *con_secret_len)
@@ -695,9 +696,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac,
 	switch (op) {
 	case CEPHX_GET_AUTH_SESSION_KEY:
 		/* AUTH ticket + [connection secret] + service tickets */
-		ret = handle_auth_session_key(ac, &p, end, session_key,
-					      session_key_len, con_secret,
-					      con_secret_len);
+		ret = handle_auth_session_key(ac, global_id, &p, end,
+					      session_key, session_key_len,
+					      con_secret, con_secret_len);
 		break;
 
 	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
-- 
cgit v1.2.3


From e3a9b1212b9d6cb20751196e338f4a5138d539d3 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Tue, 22 Jun 2021 19:28:23 -0700
Subject: PCI: Export pci_dev_trylock() and pci_dev_unlock()

Other places in the kernel use this form, and so just
provide a common path for it.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Link: https://lore.kernel.org/r/20210623022824.308041-2-mcgrof@kernel.org
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/pci/pci.c   | 6 ++++--
 include/linux/pci.h | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b717680377a9..10f08ed3589f 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5028,7 +5028,7 @@ static void pci_dev_lock(struct pci_dev *dev)
 }
 
 /* Return 1 on successful lock, 0 on contention */
-static int pci_dev_trylock(struct pci_dev *dev)
+int pci_dev_trylock(struct pci_dev *dev)
 {
 	if (pci_cfg_access_trylock(dev)) {
 		if (device_trylock(&dev->dev))
@@ -5038,12 +5038,14 @@ static int pci_dev_trylock(struct pci_dev *dev)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(pci_dev_trylock);
 
-static void pci_dev_unlock(struct pci_dev *dev)
+void pci_dev_unlock(struct pci_dev *dev)
 {
 	device_unlock(&dev->dev);
 	pci_cfg_access_unlock(dev);
 }
+EXPORT_SYMBOL_GPL(pci_dev_unlock);
 
 static void pci_dev_save_and_disable(struct pci_dev *dev)
 {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 24306504226a..7765c325706a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1621,6 +1621,9 @@ void pci_cfg_access_lock(struct pci_dev *dev);
 bool pci_cfg_access_trylock(struct pci_dev *dev);
 void pci_cfg_access_unlock(struct pci_dev *dev);
 
+int pci_dev_trylock(struct pci_dev *dev);
+void pci_dev_unlock(struct pci_dev *dev);
+
 /*
  * PCI domain support.  Sometimes called PCI segment (eg by ACPI),
  * a PCI domain is defined to be a set of PCI buses which share
-- 
cgit v1.2.3


From fcfe1baeddbf1c7c448b44c82586d0cbc8abc9f5 Mon Sep 17 00:00:00 2001
From: Jing Zhang <jingzhangos@google.com>
Date: Fri, 18 Jun 2021 22:27:05 +0000
Subject: KVM: stats: Support binary stats retrieval for a VM

Add a VM ioctl to get a statistics file descriptor by which a read
functionality is provided for userspace to read out VM stats header,
descriptors and data.
Define VM statistics descriptors and header for all architectures.

Reviewed-by: David Matlack <dmatlack@google.com>
Reviewed-by: Ricardo Koller <ricarkol@google.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com> #arm64
Signed-off-by: Jing Zhang <jingzhangos@google.com>
Message-Id: <20210618222709.1858088-4-jingzhangos@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/guest.c    | 15 +++++++++++++++
 arch/mips/kvm/mips.c      | 15 +++++++++++++++
 arch/powerpc/kvm/book3s.c | 17 +++++++++++++++++
 arch/powerpc/kvm/booke.c  | 17 +++++++++++++++++
 arch/s390/kvm/kvm-s390.c  | 20 ++++++++++++++++++++
 arch/x86/kvm/x86.c        | 25 +++++++++++++++++++++++++
 include/linux/kvm_host.h  |  6 ++++++
 virt/kvm/kvm_main.c       | 42 ++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 157 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 988ead309cbe..d7606a3c449b 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -28,6 +28,21 @@
 
 #include "trace.h"
 
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS()
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+		sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset =  sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
 	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 2f2969aef60c..9f8b203737df 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -38,6 +38,21 @@
 #define VECTORSPACING 0x100	/* for EI/VI mode */
 #endif
 
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS()
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+		sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("wait", wait_exits),
 	VCPU_STAT("cache", cache_exits),
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index ae9f1b855ff9..1f004837f9c5 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -38,6 +38,23 @@
 
 /* #define EXIT_DEBUG */
 
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS(),
+	STATS_DESC_ICOUNTER(VM, num_2M_pages),
+	STATS_DESC_ICOUNTER(VM, num_1G_pages)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+		sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("exits", sum_exits),
 	VCPU_STAT("mmio", mmio_exits),
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 7a75559ab51d..a49ea4dcf963 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -36,6 +36,23 @@
 
 unsigned long kvmppc_booke_handlers;
 
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS(),
+	STATS_DESC_ICOUNTER(VM, num_2M_pages),
+	STATS_DESC_ICOUNTER(VM, num_1G_pages)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+		sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("mmio", mmio_exits),
 	VCPU_STAT("sig", signal_exits),
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 75ad44c44717..c7c7a28af41c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -58,6 +58,26 @@
 #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
 			   (KVM_MAX_VCPUS + LOCAL_IRQS))
 
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS(),
+	STATS_DESC_COUNTER(VM, inject_io),
+	STATS_DESC_COUNTER(VM, inject_float_mchk),
+	STATS_DESC_COUNTER(VM, inject_pfault_done),
+	STATS_DESC_COUNTER(VM, inject_service_signal),
+	STATS_DESC_COUNTER(VM, inject_virtio)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+		sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("userspace_handled", exit_userspace),
 	VCPU_STAT("exit_null", exit_null),
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 71202330848a..570fd0704847 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -223,6 +223,31 @@ EXPORT_SYMBOL_GPL(host_xss);
 u64 __read_mostly supported_xss;
 EXPORT_SYMBOL_GPL(supported_xss);
 
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS(),
+	STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
+	STATS_DESC_COUNTER(VM, mmu_pte_write),
+	STATS_DESC_COUNTER(VM, mmu_pde_zapped),
+	STATS_DESC_COUNTER(VM, mmu_flooded),
+	STATS_DESC_COUNTER(VM, mmu_recycled),
+	STATS_DESC_COUNTER(VM, mmu_cache_miss),
+	STATS_DESC_ICOUNTER(VM, mmu_unsync),
+	STATS_DESC_ICOUNTER(VM, lpages),
+	STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+	STATS_DESC_ICOUNTER(VM, max_mmu_page_hash_collisions)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+		sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("pf_fixed", pf_fixed),
 	VCPU_STAT("pf_guest", pf_guest),
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9ee7f350473b..e79ce64b9f6f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -599,6 +599,7 @@ struct kvm {
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 	struct notifier_block pm_notifier;
 #endif
+	char stats_id[KVM_STATS_NAME_SIZE];
 };
 
 #define kvm_err(fmt, ...) \
@@ -1354,12 +1355,17 @@ struct _kvm_stats_desc {
 	STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,	       \
 		KVM_STATS_BASE_POW10, -9)
 
+#define KVM_GENERIC_VM_STATS()						       \
+	STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush)
+
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
 		       const struct _kvm_stats_desc *desc,
 		       void *stats, size_t size_stats,
 		       char __user *user_buffer, size_t size, loff_t *offset);
+extern const struct kvm_stats_header kvm_vm_stats_header;
+extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cec986487b30..33ec43a87d0f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4055,6 +4055,42 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 	}
 }
 
+static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
+			      size_t size, loff_t *offset)
+{
+	struct kvm *kvm = file->private_data;
+
+	return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
+				&kvm_vm_stats_desc[0], &kvm->stat,
+				sizeof(kvm->stat), user_buffer, size, offset);
+}
+
+static const struct file_operations kvm_vm_stats_fops = {
+	.read = kvm_vm_stats_read,
+	.llseek = noop_llseek,
+};
+
+static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
+{
+	int fd;
+	struct file *file;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("kvm-vm-stats",
+			&kvm_vm_stats_fops, kvm, O_RDONLY);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+	file->f_mode |= FMODE_PREAD;
+	fd_install(fd, file);
+
+	return fd;
+}
+
 static long kvm_vm_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -4237,6 +4273,9 @@ static long kvm_vm_ioctl(struct file *filp,
 	case KVM_RESET_DIRTY_RINGS:
 		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
 		break;
+	case KVM_GET_STATS_FD:
+		r = kvm_vm_ioctl_get_stats_fd(kvm);
+		break;
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
@@ -4316,6 +4355,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 	if (r < 0)
 		goto put_kvm;
 
+	snprintf(kvm->stats_id, sizeof(kvm->stats_id),
+			"kvm-%d", task_pid_nr(current));
+
 	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (IS_ERR(file)) {
 		put_unused_fd(r);
-- 
cgit v1.2.3


From ce55c049459cff0034cc1bcfdce3bf343a2d6317 Mon Sep 17 00:00:00 2001
From: Jing Zhang <jingzhangos@google.com>
Date: Fri, 18 Jun 2021 22:27:06 +0000
Subject: KVM: stats: Support binary stats retrieval for a VCPU

Add a VCPU ioctl to get a statistics file descriptor by which a read
functionality is provided for userspace to read out VCPU stats header,
descriptors and data.
Define VCPU statistics descriptors and header for all architectures.

Reviewed-by: David Matlack <dmatlack@google.com>
Reviewed-by: Ricardo Koller <ricarkol@google.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com> #arm64
Signed-off-by: Jing Zhang <jingzhangos@google.com>
Message-Id: <20210618222709.1858088-5-jingzhangos@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/guest.c    |  21 +++++++++
 arch/mips/kvm/mips.c      |  44 +++++++++++++++++++
 arch/powerpc/kvm/book3s.c |  45 +++++++++++++++++++
 arch/powerpc/kvm/booke.c  |  38 ++++++++++++++++
 arch/s390/kvm/kvm-s390.c  | 108 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c        |  41 ++++++++++++++++++
 include/linux/kvm_host.h  |  13 +++++-
 virt/kvm/kvm_main.c       |  51 +++++++++++++++++++++-
 8 files changed, 359 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index d7606a3c449b..f1dc2092d3a0 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -43,6 +43,27 @@ const struct kvm_stats_header kvm_vm_stats_header = {
 		       sizeof(kvm_vm_stats_desc),
 };
 
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, hvc_exit_stat),
+	STATS_DESC_COUNTER(VCPU, wfe_exit_stat),
+	STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
+	STATS_DESC_COUNTER(VCPU, mmio_exit_user),
+	STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
+	STATS_DESC_COUNTER(VCPU, exits)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+		sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
 	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 9f8b203737df..2aba78c2266d 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -53,6 +53,50 @@ const struct kvm_stats_header kvm_vm_stats_header = {
 		       sizeof(kvm_vm_stats_desc),
 };
 
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, wait_exits),
+	STATS_DESC_COUNTER(VCPU, cache_exits),
+	STATS_DESC_COUNTER(VCPU, signal_exits),
+	STATS_DESC_COUNTER(VCPU, int_exits),
+	STATS_DESC_COUNTER(VCPU, cop_unusable_exits),
+	STATS_DESC_COUNTER(VCPU, tlbmod_exits),
+	STATS_DESC_COUNTER(VCPU, tlbmiss_ld_exits),
+	STATS_DESC_COUNTER(VCPU, tlbmiss_st_exits),
+	STATS_DESC_COUNTER(VCPU, addrerr_st_exits),
+	STATS_DESC_COUNTER(VCPU, addrerr_ld_exits),
+	STATS_DESC_COUNTER(VCPU, syscall_exits),
+	STATS_DESC_COUNTER(VCPU, resvd_inst_exits),
+	STATS_DESC_COUNTER(VCPU, break_inst_exits),
+	STATS_DESC_COUNTER(VCPU, trap_inst_exits),
+	STATS_DESC_COUNTER(VCPU, msa_fpe_exits),
+	STATS_DESC_COUNTER(VCPU, fpe_exits),
+	STATS_DESC_COUNTER(VCPU, msa_disabled_exits),
+	STATS_DESC_COUNTER(VCPU, flush_dcache_exits),
+	STATS_DESC_COUNTER(VCPU, vz_gpsi_exits),
+	STATS_DESC_COUNTER(VCPU, vz_gsfc_exits),
+	STATS_DESC_COUNTER(VCPU, vz_hc_exits),
+	STATS_DESC_COUNTER(VCPU, vz_grr_exits),
+	STATS_DESC_COUNTER(VCPU, vz_gva_exits),
+	STATS_DESC_COUNTER(VCPU, vz_ghfc_exits),
+	STATS_DESC_COUNTER(VCPU, vz_gpa_exits),
+	STATS_DESC_COUNTER(VCPU, vz_resvd_exits),
+#ifdef CONFIG_CPU_LOONGSON64
+	STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits),
+#endif
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+		sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("wait", wait_exits),
 	VCPU_STAT("cache", cache_exits),
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 1f004837f9c5..61229302bce2 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -55,6 +55,51 @@ const struct kvm_stats_header kvm_vm_stats_header = {
 		       sizeof(kvm_vm_stats_desc),
 };
 
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, sum_exits),
+	STATS_DESC_COUNTER(VCPU, mmio_exits),
+	STATS_DESC_COUNTER(VCPU, signal_exits),
+	STATS_DESC_COUNTER(VCPU, light_exits),
+	STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits),
+	STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits),
+	STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits),
+	STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits),
+	STATS_DESC_COUNTER(VCPU, syscall_exits),
+	STATS_DESC_COUNTER(VCPU, isi_exits),
+	STATS_DESC_COUNTER(VCPU, dsi_exits),
+	STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
+	STATS_DESC_COUNTER(VCPU, dec_exits),
+	STATS_DESC_COUNTER(VCPU, ext_intr_exits),
+	STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
+	STATS_DESC_COUNTER(VCPU, halt_successful_wait),
+	STATS_DESC_COUNTER(VCPU, dbell_exits),
+	STATS_DESC_COUNTER(VCPU, gdbell_exits),
+	STATS_DESC_COUNTER(VCPU, ld),
+	STATS_DESC_COUNTER(VCPU, st),
+	STATS_DESC_COUNTER(VCPU, pf_storage),
+	STATS_DESC_COUNTER(VCPU, pf_instruc),
+	STATS_DESC_COUNTER(VCPU, sp_storage),
+	STATS_DESC_COUNTER(VCPU, sp_instruc),
+	STATS_DESC_COUNTER(VCPU, queue_intr),
+	STATS_DESC_COUNTER(VCPU, ld_slow),
+	STATS_DESC_COUNTER(VCPU, st_slow),
+	STATS_DESC_COUNTER(VCPU, pthru_all),
+	STATS_DESC_COUNTER(VCPU, pthru_host),
+	STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+		sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("exits", sum_exits),
 	VCPU_STAT("mmio", mmio_exits),
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a49ea4dcf963..6e8de33bc138 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -53,6 +53,44 @@ const struct kvm_stats_header kvm_vm_stats_header = {
 		       sizeof(kvm_vm_stats_desc),
 };
 
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, sum_exits),
+	STATS_DESC_COUNTER(VCPU, mmio_exits),
+	STATS_DESC_COUNTER(VCPU, signal_exits),
+	STATS_DESC_COUNTER(VCPU, light_exits),
+	STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits),
+	STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits),
+	STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits),
+	STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits),
+	STATS_DESC_COUNTER(VCPU, syscall_exits),
+	STATS_DESC_COUNTER(VCPU, isi_exits),
+	STATS_DESC_COUNTER(VCPU, dsi_exits),
+	STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
+	STATS_DESC_COUNTER(VCPU, dec_exits),
+	STATS_DESC_COUNTER(VCPU, ext_intr_exits),
+	STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
+	STATS_DESC_COUNTER(VCPU, halt_successful_wait),
+	STATS_DESC_COUNTER(VCPU, dbell_exits),
+	STATS_DESC_COUNTER(VCPU, gdbell_exits),
+	STATS_DESC_COUNTER(VCPU, ld),
+	STATS_DESC_COUNTER(VCPU, st),
+	STATS_DESC_COUNTER(VCPU, pthru_all),
+	STATS_DESC_COUNTER(VCPU, pthru_host),
+	STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+		sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("mmio", mmio_exits),
 	VCPU_STAT("sig", signal_exits),
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c7c7a28af41c..8ac10bcaf8ba 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -78,6 +78,114 @@ const struct kvm_stats_header kvm_vm_stats_header = {
 		       sizeof(kvm_vm_stats_desc),
 };
 
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, exit_userspace),
+	STATS_DESC_COUNTER(VCPU, exit_null),
+	STATS_DESC_COUNTER(VCPU, exit_external_request),
+	STATS_DESC_COUNTER(VCPU, exit_io_request),
+	STATS_DESC_COUNTER(VCPU, exit_external_interrupt),
+	STATS_DESC_COUNTER(VCPU, exit_stop_request),
+	STATS_DESC_COUNTER(VCPU, exit_validity),
+	STATS_DESC_COUNTER(VCPU, exit_instruction),
+	STATS_DESC_COUNTER(VCPU, exit_pei),
+	STATS_DESC_COUNTER(VCPU, halt_no_poll_steal),
+	STATS_DESC_COUNTER(VCPU, instruction_lctl),
+	STATS_DESC_COUNTER(VCPU, instruction_lctlg),
+	STATS_DESC_COUNTER(VCPU, instruction_stctl),
+	STATS_DESC_COUNTER(VCPU, instruction_stctg),
+	STATS_DESC_COUNTER(VCPU, exit_program_interruption),
+	STATS_DESC_COUNTER(VCPU, exit_instr_and_program),
+	STATS_DESC_COUNTER(VCPU, exit_operation_exception),
+	STATS_DESC_COUNTER(VCPU, deliver_ckc),
+	STATS_DESC_COUNTER(VCPU, deliver_cputm),
+	STATS_DESC_COUNTER(VCPU, deliver_external_call),
+	STATS_DESC_COUNTER(VCPU, deliver_emergency_signal),
+	STATS_DESC_COUNTER(VCPU, deliver_service_signal),
+	STATS_DESC_COUNTER(VCPU, deliver_virtio),
+	STATS_DESC_COUNTER(VCPU, deliver_stop_signal),
+	STATS_DESC_COUNTER(VCPU, deliver_prefix_signal),
+	STATS_DESC_COUNTER(VCPU, deliver_restart_signal),
+	STATS_DESC_COUNTER(VCPU, deliver_program),
+	STATS_DESC_COUNTER(VCPU, deliver_io),
+	STATS_DESC_COUNTER(VCPU, deliver_machine_check),
+	STATS_DESC_COUNTER(VCPU, exit_wait_state),
+	STATS_DESC_COUNTER(VCPU, inject_ckc),
+	STATS_DESC_COUNTER(VCPU, inject_cputm),
+	STATS_DESC_COUNTER(VCPU, inject_external_call),
+	STATS_DESC_COUNTER(VCPU, inject_emergency_signal),
+	STATS_DESC_COUNTER(VCPU, inject_mchk),
+	STATS_DESC_COUNTER(VCPU, inject_pfault_init),
+	STATS_DESC_COUNTER(VCPU, inject_program),
+	STATS_DESC_COUNTER(VCPU, inject_restart),
+	STATS_DESC_COUNTER(VCPU, inject_set_prefix),
+	STATS_DESC_COUNTER(VCPU, inject_stop_signal),
+	STATS_DESC_COUNTER(VCPU, instruction_epsw),
+	STATS_DESC_COUNTER(VCPU, instruction_gs),
+	STATS_DESC_COUNTER(VCPU, instruction_io_other),
+	STATS_DESC_COUNTER(VCPU, instruction_lpsw),
+	STATS_DESC_COUNTER(VCPU, instruction_lpswe),
+	STATS_DESC_COUNTER(VCPU, instruction_pfmf),
+	STATS_DESC_COUNTER(VCPU, instruction_ptff),
+	STATS_DESC_COUNTER(VCPU, instruction_sck),
+	STATS_DESC_COUNTER(VCPU, instruction_sckpf),
+	STATS_DESC_COUNTER(VCPU, instruction_stidp),
+	STATS_DESC_COUNTER(VCPU, instruction_spx),
+	STATS_DESC_COUNTER(VCPU, instruction_stpx),
+	STATS_DESC_COUNTER(VCPU, instruction_stap),
+	STATS_DESC_COUNTER(VCPU, instruction_iske),
+	STATS_DESC_COUNTER(VCPU, instruction_ri),
+	STATS_DESC_COUNTER(VCPU, instruction_rrbe),
+	STATS_DESC_COUNTER(VCPU, instruction_sske),
+	STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock),
+	STATS_DESC_COUNTER(VCPU, instruction_stsi),
+	STATS_DESC_COUNTER(VCPU, instruction_stfl),
+	STATS_DESC_COUNTER(VCPU, instruction_tb),
+	STATS_DESC_COUNTER(VCPU, instruction_tpi),
+	STATS_DESC_COUNTER(VCPU, instruction_tprot),
+	STATS_DESC_COUNTER(VCPU, instruction_tsch),
+	STATS_DESC_COUNTER(VCPU, instruction_sie),
+	STATS_DESC_COUNTER(VCPU, instruction_essa),
+	STATS_DESC_COUNTER(VCPU, instruction_sthyi),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_sense),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_start),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_stop),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_arch),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_restart),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset),
+	STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown),
+	STATS_DESC_COUNTER(VCPU, diagnose_10),
+	STATS_DESC_COUNTER(VCPU, diagnose_44),
+	STATS_DESC_COUNTER(VCPU, diagnose_9c),
+	STATS_DESC_COUNTER(VCPU, diagnose_9c_ignored),
+	STATS_DESC_COUNTER(VCPU, diagnose_9c_forward),
+	STATS_DESC_COUNTER(VCPU, diagnose_258),
+	STATS_DESC_COUNTER(VCPU, diagnose_308),
+	STATS_DESC_COUNTER(VCPU, diagnose_500),
+	STATS_DESC_COUNTER(VCPU, diagnose_other),
+	STATS_DESC_COUNTER(VCPU, pfault_sync)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+		sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("userspace_handled", exit_userspace),
 	VCPU_STAT("exit_null", exit_null),
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 570fd0704847..53b7c25d6ebc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -248,6 +248,47 @@ const struct kvm_stats_header kvm_vm_stats_header = {
 		       sizeof(kvm_vm_stats_desc),
 };
 
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, pf_fixed),
+	STATS_DESC_COUNTER(VCPU, pf_guest),
+	STATS_DESC_COUNTER(VCPU, tlb_flush),
+	STATS_DESC_COUNTER(VCPU, invlpg),
+	STATS_DESC_COUNTER(VCPU, exits),
+	STATS_DESC_COUNTER(VCPU, io_exits),
+	STATS_DESC_COUNTER(VCPU, mmio_exits),
+	STATS_DESC_COUNTER(VCPU, signal_exits),
+	STATS_DESC_COUNTER(VCPU, irq_window_exits),
+	STATS_DESC_COUNTER(VCPU, nmi_window_exits),
+	STATS_DESC_COUNTER(VCPU, l1d_flush),
+	STATS_DESC_COUNTER(VCPU, halt_exits),
+	STATS_DESC_COUNTER(VCPU, request_irq_exits),
+	STATS_DESC_COUNTER(VCPU, irq_exits),
+	STATS_DESC_COUNTER(VCPU, host_state_reload),
+	STATS_DESC_COUNTER(VCPU, fpu_reload),
+	STATS_DESC_COUNTER(VCPU, insn_emulation),
+	STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
+	STATS_DESC_COUNTER(VCPU, hypercalls),
+	STATS_DESC_COUNTER(VCPU, irq_injections),
+	STATS_DESC_COUNTER(VCPU, nmi_injections),
+	STATS_DESC_COUNTER(VCPU, req_event),
+	STATS_DESC_COUNTER(VCPU, nested_run),
+	STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+	STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+	STATS_DESC_ICOUNTER(VCPU, guest_mode)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+		sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
+};
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("pf_fixed", pf_fixed),
 	VCPU_STAT("pf_guest", pf_guest),
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e79ce64b9f6f..9e75afef16b0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -305,7 +305,6 @@ struct kvm_vcpu {
 	struct pid __rcu *pid;
 	int sigset_active;
 	sigset_t sigset;
-	struct kvm_vcpu_stat stat;
 	unsigned int halt_poll_ns;
 	bool valid_wakeup;
 
@@ -342,6 +341,8 @@ struct kvm_vcpu {
 	bool preempted;
 	bool ready;
 	struct kvm_vcpu_arch arch;
+	struct kvm_vcpu_stat stat;
+	char stats_id[KVM_STATS_NAME_SIZE];
 	struct kvm_dirty_ring dirty_ring;
 };
 
@@ -1358,6 +1359,14 @@ struct _kvm_stats_desc {
 #define KVM_GENERIC_VM_STATS()						       \
 	STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush)
 
+#define KVM_GENERIC_VCPU_STATS()					       \
+	STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll),		       \
+	STATS_DESC_COUNTER(VCPU_GENERIC, halt_attempted_poll),		       \
+	STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid),		       \
+	STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup),			       \
+	STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns),	       \
+	STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns)
+
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
@@ -1366,6 +1375,8 @@ ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
 		       char __user *user_buffer, size_t size, loff_t *offset);
 extern const struct kvm_stats_header kvm_vm_stats_header;
 extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
+extern const struct kvm_stats_header kvm_vcpu_stats_header;
+extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 33ec43a87d0f..c8d0028df4ac 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3448,6 +3448,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
 	BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
 
+	/* Fill the stats id string for the vcpu */
+	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
+		 task_pid_nr(current), id);
+
 	/* Now it's all set up, let userspace reach it */
 	kvm_get_kvm(kvm);
 	r = create_vcpu_fd(vcpu);
@@ -3497,6 +3501,44 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
 	return 0;
 }
 
+static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
+			      size_t size, loff_t *offset)
+{
+	struct kvm_vcpu *vcpu = file->private_data;
+
+	return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
+			&kvm_vcpu_stats_desc[0], &vcpu->stat,
+			sizeof(vcpu->stat), user_buffer, size, offset);
+}
+
+static const struct file_operations kvm_vcpu_stats_fops = {
+	.read = kvm_vcpu_stats_read,
+	.llseek = noop_llseek,
+};
+
+static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
+{
+	int fd;
+	struct file *file;
+	char name[15 + ITOA_MAX_LEN + 1];
+
+	snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+	file->f_mode |= FMODE_PREAD;
+	fd_install(fd, file);
+
+	return fd;
+}
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -3694,6 +3736,10 @@ out_free1:
 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
 		break;
 	}
+	case KVM_GET_STATS_FD: {
+		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
+		break;
+	}
 	default:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
@@ -3952,6 +3998,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #else
 		return 0;
 #endif
+	case KVM_CAP_BINARY_STATS_FD:
+		return 1;
 	default:
 		break;
 	}
@@ -5254,7 +5302,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
 					   SLAB_ACCOUNT,
 					   offsetof(struct kvm_vcpu, arch),
-					   sizeof_field(struct kvm_vcpu, arch),
+					   offsetofend(struct kvm_vcpu, stats_id)
+					   - offsetof(struct kvm_vcpu, arch),
 					   NULL);
 	if (!kvm_vcpu_cache) {
 		r = -ENOMEM;
-- 
cgit v1.2.3


From bc9e9e672df9f16f3825320c53ec01b3d44add28 Mon Sep 17 00:00:00 2001
From: Jing Zhang <jingzhangos@google.com>
Date: Wed, 23 Jun 2021 17:28:46 -0400
Subject: KVM: debugfs: Reuse binary stats descriptors

To remove code duplication, use the binary stats descriptors in the
implementation of the debugfs interface for statistics. This unifies
the definition of statistics for the binary and debugfs interfaces.

Signed-off-by: Jing Zhang <jingzhangos@google.com>
Message-Id: <20210618222709.1858088-8-jingzhangos@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/guest.c    |  16 -------
 arch/mips/kvm/mips.c      |  39 -----------------
 arch/powerpc/kvm/book3s.c |  33 --------------
 arch/powerpc/kvm/booke.c  |  25 -----------
 arch/s390/kvm/kvm-s390.c  | 108 ----------------------------------------------
 arch/x86/kvm/x86.c        |  49 +--------------------
 include/linux/kvm_host.h  |  17 +-------
 virt/kvm/kvm_main.c       | 104 ++++++++++++++++++++++++++++++++------------
 8 files changed, 78 insertions(+), 313 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index f1dc2092d3a0..1512a8007a78 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -64,22 +64,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 		       sizeof(kvm_vcpu_stats_desc),
 };
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
-	VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
-	VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
-	VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
-	VCPU_STAT("mmio_exit_user", mmio_exit_user),
-	VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
-	VCPU_STAT("exits", exits),
-	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
-	{ NULL }
-};
-
 static bool core_reg_offset_is_vreg(u64 off)
 {
 	return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) &&
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 2aba78c2266d..af9dd029a4e1 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -97,45 +97,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 		       sizeof(kvm_vcpu_stats_desc),
 };
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT("wait", wait_exits),
-	VCPU_STAT("cache", cache_exits),
-	VCPU_STAT("signal", signal_exits),
-	VCPU_STAT("interrupt", int_exits),
-	VCPU_STAT("cop_unusable", cop_unusable_exits),
-	VCPU_STAT("tlbmod", tlbmod_exits),
-	VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits),
-	VCPU_STAT("tlbmiss_st", tlbmiss_st_exits),
-	VCPU_STAT("addrerr_st", addrerr_st_exits),
-	VCPU_STAT("addrerr_ld", addrerr_ld_exits),
-	VCPU_STAT("syscall", syscall_exits),
-	VCPU_STAT("resvd_inst", resvd_inst_exits),
-	VCPU_STAT("break_inst", break_inst_exits),
-	VCPU_STAT("trap_inst", trap_inst_exits),
-	VCPU_STAT("msa_fpe", msa_fpe_exits),
-	VCPU_STAT("fpe", fpe_exits),
-	VCPU_STAT("msa_disabled", msa_disabled_exits),
-	VCPU_STAT("flush_dcache", flush_dcache_exits),
-	VCPU_STAT("vz_gpsi", vz_gpsi_exits),
-	VCPU_STAT("vz_gsfc", vz_gsfc_exits),
-	VCPU_STAT("vz_hc", vz_hc_exits),
-	VCPU_STAT("vz_grr", vz_grr_exits),
-	VCPU_STAT("vz_gva", vz_gva_exits),
-	VCPU_STAT("vz_ghfc", vz_ghfc_exits),
-	VCPU_STAT("vz_gpa", vz_gpa_exits),
-	VCPU_STAT("vz_resvd", vz_resvd_exits),
-#ifdef CONFIG_CPU_LOONGSON64
-	VCPU_STAT("vz_cpucfg", vz_cpucfg_exits),
-#endif
-	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
-	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
-	{NULL}
-};
-
 bool kvm_trace_guest_mode_change;
 
 int kvm_guest_mode_change_trace_reg(void)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 61229302bce2..79833f78d1da 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -100,39 +100,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 		       sizeof(kvm_vcpu_stats_desc),
 };
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT("exits", sum_exits),
-	VCPU_STAT("mmio", mmio_exits),
-	VCPU_STAT("sig", signal_exits),
-	VCPU_STAT("sysc", syscall_exits),
-	VCPU_STAT("inst_emu", emulated_inst_exits),
-	VCPU_STAT("dec", dec_exits),
-	VCPU_STAT("ext_intr", ext_intr_exits),
-	VCPU_STAT("queue_intr", queue_intr),
-	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
-	VCPU_STAT("halt_wait_ns", halt_wait_ns),
-	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
-	VCPU_STAT("halt_successful_wait", halt_successful_wait),
-	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
-	VCPU_STAT("pf_storage", pf_storage),
-	VCPU_STAT("sp_storage", sp_storage),
-	VCPU_STAT("pf_instruc", pf_instruc),
-	VCPU_STAT("sp_instruc", sp_instruc),
-	VCPU_STAT("ld", ld),
-	VCPU_STAT("ld_slow", ld_slow),
-	VCPU_STAT("st", st),
-	VCPU_STAT("st_slow", st_slow),
-	VCPU_STAT("pthru_all", pthru_all),
-	VCPU_STAT("pthru_host", pthru_host),
-	VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
-	VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
-	VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
-	{ NULL }
-};
-
 static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 			unsigned long pending_now, unsigned long old_pending)
 {
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6e8de33bc138..551b30d84aee 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -91,31 +91,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 		       sizeof(kvm_vcpu_stats_desc),
 };
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT("mmio", mmio_exits),
-	VCPU_STAT("sig", signal_exits),
-	VCPU_STAT("itlb_r", itlb_real_miss_exits),
-	VCPU_STAT("itlb_v", itlb_virt_miss_exits),
-	VCPU_STAT("dtlb_r", dtlb_real_miss_exits),
-	VCPU_STAT("dtlb_v", dtlb_virt_miss_exits),
-	VCPU_STAT("sysc", syscall_exits),
-	VCPU_STAT("isi", isi_exits),
-	VCPU_STAT("dsi", dsi_exits),
-	VCPU_STAT("inst_emu", emulated_inst_exits),
-	VCPU_STAT("dec", dec_exits),
-	VCPU_STAT("ext_intr", ext_intr_exits),
-	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
-	VCPU_STAT("doorbell", dbell_exits),
-	VCPU_STAT("guest doorbell", gdbell_exits),
-	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
-	VM_STAT("remote_tlb_flush", generic.remote_tlb_flush),
-	{ NULL }
-};
-
 /* TODO: use vcpu_printf() */
 void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8ac10bcaf8ba..1695f0ced5ba 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -186,114 +186,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 		       sizeof(kvm_vcpu_stats_desc),
 };
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT("userspace_handled", exit_userspace),
-	VCPU_STAT("exit_null", exit_null),
-	VCPU_STAT("pfault_sync", pfault_sync),
-	VCPU_STAT("exit_validity", exit_validity),
-	VCPU_STAT("exit_stop_request", exit_stop_request),
-	VCPU_STAT("exit_external_request", exit_external_request),
-	VCPU_STAT("exit_io_request", exit_io_request),
-	VCPU_STAT("exit_external_interrupt", exit_external_interrupt),
-	VCPU_STAT("exit_instruction", exit_instruction),
-	VCPU_STAT("exit_pei", exit_pei),
-	VCPU_STAT("exit_program_interruption", exit_program_interruption),
-	VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program),
-	VCPU_STAT("exit_operation_exception", exit_operation_exception),
-	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
-	VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal),
-	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
-	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
-	VCPU_STAT("instruction_lctlg", instruction_lctlg),
-	VCPU_STAT("instruction_lctl", instruction_lctl),
-	VCPU_STAT("instruction_stctl", instruction_stctl),
-	VCPU_STAT("instruction_stctg", instruction_stctg),
-	VCPU_STAT("deliver_ckc", deliver_ckc),
-	VCPU_STAT("deliver_cputm", deliver_cputm),
-	VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal),
-	VCPU_STAT("deliver_external_call", deliver_external_call),
-	VCPU_STAT("deliver_service_signal", deliver_service_signal),
-	VCPU_STAT("deliver_virtio", deliver_virtio),
-	VCPU_STAT("deliver_stop_signal", deliver_stop_signal),
-	VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal),
-	VCPU_STAT("deliver_restart_signal", deliver_restart_signal),
-	VCPU_STAT("deliver_program", deliver_program),
-	VCPU_STAT("deliver_io", deliver_io),
-	VCPU_STAT("deliver_machine_check", deliver_machine_check),
-	VCPU_STAT("exit_wait_state", exit_wait_state),
-	VCPU_STAT("inject_ckc", inject_ckc),
-	VCPU_STAT("inject_cputm", inject_cputm),
-	VCPU_STAT("inject_external_call", inject_external_call),
-	VM_STAT("inject_float_mchk", inject_float_mchk),
-	VCPU_STAT("inject_emergency_signal", inject_emergency_signal),
-	VM_STAT("inject_io", inject_io),
-	VCPU_STAT("inject_mchk", inject_mchk),
-	VM_STAT("inject_pfault_done", inject_pfault_done),
-	VCPU_STAT("inject_program", inject_program),
-	VCPU_STAT("inject_restart", inject_restart),
-	VM_STAT("inject_service_signal", inject_service_signal),
-	VCPU_STAT("inject_set_prefix", inject_set_prefix),
-	VCPU_STAT("inject_stop_signal", inject_stop_signal),
-	VCPU_STAT("inject_pfault_init", inject_pfault_init),
-	VM_STAT("inject_virtio", inject_virtio),
-	VCPU_STAT("instruction_epsw", instruction_epsw),
-	VCPU_STAT("instruction_gs", instruction_gs),
-	VCPU_STAT("instruction_io_other", instruction_io_other),
-	VCPU_STAT("instruction_lpsw", instruction_lpsw),
-	VCPU_STAT("instruction_lpswe", instruction_lpswe),
-	VCPU_STAT("instruction_pfmf", instruction_pfmf),
-	VCPU_STAT("instruction_ptff", instruction_ptff),
-	VCPU_STAT("instruction_stidp", instruction_stidp),
-	VCPU_STAT("instruction_sck", instruction_sck),
-	VCPU_STAT("instruction_sckpf", instruction_sckpf),
-	VCPU_STAT("instruction_spx", instruction_spx),
-	VCPU_STAT("instruction_stpx", instruction_stpx),
-	VCPU_STAT("instruction_stap", instruction_stap),
-	VCPU_STAT("instruction_iske", instruction_iske),
-	VCPU_STAT("instruction_ri", instruction_ri),
-	VCPU_STAT("instruction_rrbe", instruction_rrbe),
-	VCPU_STAT("instruction_sske", instruction_sske),
-	VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock),
-	VCPU_STAT("instruction_essa", instruction_essa),
-	VCPU_STAT("instruction_stsi", instruction_stsi),
-	VCPU_STAT("instruction_stfl", instruction_stfl),
-	VCPU_STAT("instruction_tb", instruction_tb),
-	VCPU_STAT("instruction_tpi", instruction_tpi),
-	VCPU_STAT("instruction_tprot", instruction_tprot),
-	VCPU_STAT("instruction_tsch", instruction_tsch),
-	VCPU_STAT("instruction_sthyi", instruction_sthyi),
-	VCPU_STAT("instruction_sie", instruction_sie),
-	VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense),
-	VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running),
-	VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call),
-	VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency),
-	VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency),
-	VCPU_STAT("instruction_sigp_start", instruction_sigp_start),
-	VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop),
-	VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status),
-	VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status),
-	VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status),
-	VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch),
-	VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix),
-	VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart),
-	VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset),
-	VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset),
-	VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown),
-	VCPU_STAT("instruction_diag_10", diagnose_10),
-	VCPU_STAT("instruction_diag_44", diagnose_44),
-	VCPU_STAT("instruction_diag_9c", diagnose_9c),
-	VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
-	VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
-	VCPU_STAT("instruction_diag_258", diagnose_258),
-	VCPU_STAT("instruction_diag_308", diagnose_308),
-	VCPU_STAT("instruction_diag_500", diagnose_500),
-	VCPU_STAT("instruction_diag_other", diagnose_other),
-	{ NULL }
-};
-
 /* allow nested virtualization in KVM (if enabled by user space) */
 static int nested;
 module_param(nested, int, S_IRUGO);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 53b7c25d6ebc..5833b8780808 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -234,7 +234,7 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
 	STATS_DESC_ICOUNTER(VM, mmu_unsync),
 	STATS_DESC_ICOUNTER(VM, lpages),
 	STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
-	STATS_DESC_ICOUNTER(VM, max_mmu_page_hash_collisions)
+	STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
 };
 static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
 		sizeof(struct kvm_vm_stat) / sizeof(u64));
@@ -289,53 +289,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
 		       sizeof(kvm_vcpu_stats_desc),
 };
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT("pf_fixed", pf_fixed),
-	VCPU_STAT("pf_guest", pf_guest),
-	VCPU_STAT("tlb_flush", tlb_flush),
-	VCPU_STAT("invlpg", invlpg),
-	VCPU_STAT("exits", exits),
-	VCPU_STAT("io_exits", io_exits),
-	VCPU_STAT("mmio_exits", mmio_exits),
-	VCPU_STAT("signal_exits", signal_exits),
-	VCPU_STAT("irq_window", irq_window_exits),
-	VCPU_STAT("nmi_window", nmi_window_exits),
-	VCPU_STAT("halt_exits", halt_exits),
-	VCPU_STAT("halt_successful_poll", generic.halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", generic.halt_wakeup),
-	VCPU_STAT("hypercalls", hypercalls),
-	VCPU_STAT("request_irq", request_irq_exits),
-	VCPU_STAT("irq_exits", irq_exits),
-	VCPU_STAT("host_state_reload", host_state_reload),
-	VCPU_STAT("fpu_reload", fpu_reload),
-	VCPU_STAT("insn_emulation", insn_emulation),
-	VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
-	VCPU_STAT("irq_injections", irq_injections),
-	VCPU_STAT("nmi_injections", nmi_injections),
-	VCPU_STAT("req_event", req_event),
-	VCPU_STAT("l1d_flush", l1d_flush),
-	VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns),
-	VCPU_STAT("nested_run", nested_run),
-	VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
-	VCPU_STAT("directed_yield_successful", directed_yield_successful),
-	VCPU_STAT("guest_mode", guest_mode),
-	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
-	VM_STAT("mmu_pte_write", mmu_pte_write),
-	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
-	VM_STAT("mmu_flooded", mmu_flooded),
-	VM_STAT("mmu_recycled", mmu_recycled),
-	VM_STAT("mmu_cache_miss", mmu_cache_miss),
-	VM_STAT("mmu_unsync", mmu_unsync),
-	VM_STAT("remote_tlb_flush", generic.remote_tlb_flush),
-	VM_STAT("largepages", lpages, .mode = 0444),
-	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
-	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
-	{ NULL }
-};
-
 u64 __read_mostly host_xcr0;
 u64 __read_mostly supported_xcr0;
 EXPORT_SYMBOL_GPL(supported_xcr0);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9e75afef16b0..ae7735b490b4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1264,14 +1264,8 @@ enum kvm_stat_kind {
 
 struct kvm_stat_data {
 	struct kvm *kvm;
-	struct kvm_stats_debugfs_item *dbgfs_item;
-};
-
-struct kvm_stats_debugfs_item {
-	const char *name;
-	int offset;
+	const struct _kvm_stats_desc *desc;
 	enum kvm_stat_kind kind;
-	int mode;
 };
 
 struct _kvm_stats_desc {
@@ -1279,14 +1273,6 @@ struct _kvm_stats_desc {
 	char name[KVM_STATS_NAME_SIZE];
 };
 
-#define KVM_DBGFS_GET_MODE(dbgfs_item)                                         \
-	((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
-
-#define VM_STAT(n, x, ...)						       \
-	{ n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
-#define VCPU_STAT(n, x, ...)						       \
-	{ n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }
-
 #define STATS_DESC_COMMON(type, unit, base, exp)			       \
 	.flags = type | unit | base |					       \
 		 BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) |	       \
@@ -1367,7 +1353,6 @@ struct _kvm_stats_desc {
 	STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns),	       \
 	STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns)
 
-extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
 		       const struct _kvm_stats_desc *desc,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c8d0028df4ac..3dcc2abbfc60 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -115,7 +115,6 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
 struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
-static int kvm_debugfs_num_entries;
 static const struct file_operations stat_fops_per_vm;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
@@ -860,9 +859,24 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 	kvfree(slots);
 }
 
+static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
+{
+	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
+	case KVM_STATS_TYPE_INSTANT:
+		return 0444;
+	case KVM_STATS_TYPE_CUMULATIVE:
+	case KVM_STATS_TYPE_PEAK:
+	default:
+		return 0644;
+	}
+}
+
+
 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 {
 	int i;
+	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+				      kvm_vcpu_stats_header.num_desc;
 
 	if (!kvm->debugfs_dentry)
 		return;
@@ -880,7 +894,10 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 {
 	char dir_name[ITOA_MAX_LEN * 2];
 	struct kvm_stat_data *stat_data;
-	struct kvm_stats_debugfs_item *p;
+	const struct _kvm_stats_desc *pdesc;
+	int i;
+	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+				      kvm_vcpu_stats_header.num_desc;
 
 	if (!debugfs_initialized())
 		return 0;
@@ -894,15 +911,32 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 	if (!kvm->debugfs_stat_data)
 		return -ENOMEM;
 
-	for (p = debugfs_entries; p->name; p++) {
+	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
+		pdesc = &kvm_vm_stats_desc[i];
 		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 		if (!stat_data)
 			return -ENOMEM;
 
 		stat_data->kvm = kvm;
-		stat_data->dbgfs_item = p;
-		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+		stat_data->desc = pdesc;
+		stat_data->kind = KVM_STAT_VM;
+		kvm->debugfs_stat_data[i] = stat_data;
+		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+				    kvm->debugfs_dentry, stat_data,
+				    &stat_fops_per_vm);
+	}
+
+	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
+		pdesc = &kvm_vcpu_stats_desc[i];
+		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
+		if (!stat_data)
+			return -ENOMEM;
+
+		stat_data->kvm = kvm;
+		stat_data->desc = pdesc;
+		stat_data->kind = KVM_STAT_VCPU;
+		kvm->debugfs_stat_data[i] = stat_data;
+		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
 				    kvm->debugfs_dentry, stat_data,
 				    &stat_fops_per_vm);
 	}
@@ -4900,7 +4934,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
 		return -ENOENT;
 
 	if (simple_attr_open(inode, file, get,
-		    KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
+		    kvm_stats_debugfs_mode(stat_data->desc) & 0222
 		    ? set : NULL,
 		    fmt)) {
 		kvm_put_kvm(stat_data->kvm);
@@ -4923,14 +4957,14 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file)
 
 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
 {
-	*val = *(u64 *)((void *)kvm + offset);
+	*val = *(u64 *)((void *)(&kvm->stat) + offset);
 
 	return 0;
 }
 
 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
 {
-	*(u64 *)((void *)kvm + offset) = 0;
+	*(u64 *)((void *)(&kvm->stat) + offset) = 0;
 
 	return 0;
 }
@@ -4943,7 +4977,7 @@ static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
 	*val = 0;
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
-		*val += *(u64 *)((void *)vcpu + offset);
+		*val += *(u64 *)((void *)(&vcpu->stat) + offset);
 
 	return 0;
 }
@@ -4954,7 +4988,7 @@ static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
 	struct kvm_vcpu *vcpu;
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
-		*(u64 *)((void *)vcpu + offset) = 0;
+		*(u64 *)((void *)(&vcpu->stat) + offset) = 0;
 
 	return 0;
 }
@@ -4964,14 +4998,14 @@ static int kvm_stat_data_get(void *data, u64 *val)
 	int r = -EFAULT;
 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
 
-	switch (stat_data->dbgfs_item->kind) {
+	switch (stat_data->kind) {
 	case KVM_STAT_VM:
 		r = kvm_get_stat_per_vm(stat_data->kvm,
-					stat_data->dbgfs_item->offset, val);
+					stat_data->desc->desc.offset, val);
 		break;
 	case KVM_STAT_VCPU:
 		r = kvm_get_stat_per_vcpu(stat_data->kvm,
-					  stat_data->dbgfs_item->offset, val);
+					  stat_data->desc->desc.offset, val);
 		break;
 	}
 
@@ -4986,14 +5020,14 @@ static int kvm_stat_data_clear(void *data, u64 val)
 	if (val)
 		return -EINVAL;
 
-	switch (stat_data->dbgfs_item->kind) {
+	switch (stat_data->kind) {
 	case KVM_STAT_VM:
 		r = kvm_clear_stat_per_vm(stat_data->kvm,
-					  stat_data->dbgfs_item->offset);
+					  stat_data->desc->desc.offset);
 		break;
 	case KVM_STAT_VCPU:
 		r = kvm_clear_stat_per_vcpu(stat_data->kvm,
-					    stat_data->dbgfs_item->offset);
+					    stat_data->desc->desc.offset);
 		break;
 	}
 
@@ -5050,6 +5084,7 @@ static int vm_stat_clear(void *_offset, u64 val)
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
 
 static int vcpu_stat_get(void *_offset, u64 *val)
 {
@@ -5086,11 +5121,7 @@ static int vcpu_stat_clear(void *_offset, u64 val)
 
 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
 			"%llu\n");
-
-static const struct file_operations *stat_fops[] = {
-	[KVM_STAT_VCPU] = &vcpu_stat_fops,
-	[KVM_STAT_VM]   = &vm_stat_fops,
-};
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
 
 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 {
@@ -5144,15 +5175,32 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 
 static void kvm_init_debug(void)
 {
-	struct kvm_stats_debugfs_item *p;
+	const struct file_operations *fops;
+	const struct _kvm_stats_desc *pdesc;
+	int i;
 
 	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
 
-	kvm_debugfs_num_entries = 0;
-	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
-		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
-				    kvm_debugfs_dir, (void *)(long)p->offset,
-				    stat_fops[p->kind]);
+	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
+		pdesc = &kvm_vm_stats_desc[i];
+		if (kvm_stats_debugfs_mode(pdesc) & 0222)
+			fops = &vm_stat_fops;
+		else
+			fops = &vm_stat_readonly_fops;
+		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+				kvm_debugfs_dir,
+				(void *)(long)pdesc->desc.offset, fops);
+	}
+
+	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
+		pdesc = &kvm_vcpu_stats_desc[i];
+		if (kvm_stats_debugfs_mode(pdesc) & 0222)
+			fops = &vcpu_stat_fops;
+		else
+			fops = &vcpu_stat_readonly_fops;
+		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+				kvm_debugfs_dir,
+				(void *)(long)pdesc->desc.offset, fops);
 	}
 }
 
-- 
cgit v1.2.3


From fd2ef39cc9a6b9c4c41864ac506906c52f94b06a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 23 Jun 2021 11:36:34 +0200
Subject: blk: Fix lock inversion between ioc lock and bfqd lock

Lockdep complains about lock inversion between ioc->lock and bfqd->lock:

bfqd -> ioc:
 put_io_context+0x33/0x90 -> ioc->lock grabbed
 blk_mq_free_request+0x51/0x140
 blk_put_request+0xe/0x10
 blk_attempt_req_merge+0x1d/0x30
 elv_attempt_insert_merge+0x56/0xa0
 blk_mq_sched_try_insert_merge+0x4b/0x60
 bfq_insert_requests+0x9e/0x18c0 -> bfqd->lock grabbed
 blk_mq_sched_insert_requests+0xd6/0x2b0
 blk_mq_flush_plug_list+0x154/0x280
 blk_finish_plug+0x40/0x60
 ext4_writepages+0x696/0x1320
 do_writepages+0x1c/0x80
 __filemap_fdatawrite_range+0xd7/0x120
 sync_file_range+0xac/0xf0

ioc->bfqd:
 bfq_exit_icq+0xa3/0xe0 -> bfqd->lock grabbed
 put_io_context_active+0x78/0xb0 -> ioc->lock grabbed
 exit_io_context+0x48/0x50
 do_exit+0x7e9/0xdd0
 do_group_exit+0x54/0xc0

To avoid this inversion we change blk_mq_sched_try_insert_merge() to not
free the merged request but rather leave that upto the caller similarly
to blk_mq_sched_try_merge(). And in bfq_insert_requests() we make sure
to free all the merged requests after dropping bfqd->lock.

Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler")
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Acked-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210623093634.27879-3-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c      |  6 ++++--
 block/blk-merge.c        | 19 ++++++++-----------
 block/blk-mq-sched.c     |  5 +++--
 block/blk-mq-sched.h     |  3 ++-
 block/blk-mq.h           | 11 +++++++++++
 block/blk.h              |  2 +-
 block/elevator.c         | 11 ++++++++---
 block/mq-deadline-main.c |  5 ++++-
 include/linux/elevator.h |  3 ++-
 9 files changed, 43 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 9433d38e486c..727955918563 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2345,9 +2345,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
 
 	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
 
+	spin_unlock_irq(&bfqd->lock);
 	if (free)
 		blk_mq_free_request(free);
-	spin_unlock_irq(&bfqd->lock);
 
 	return ret;
 }
@@ -5969,14 +5969,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	struct bfq_queue *bfqq;
 	bool idle_timer_disabled = false;
 	unsigned int cmd_flags;
+	LIST_HEAD(free);
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
 		bfqg_stats_update_legacy_io(q, rq);
 #endif
 	spin_lock_irq(&bfqd->lock);
-	if (blk_mq_sched_try_insert_merge(q, rq)) {
+	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
 		spin_unlock_irq(&bfqd->lock);
+		blk_mq_free_requests(&free);
 		return;
 	}
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4d97fb6dd226..1398b52a24b4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -846,18 +846,15 @@ static struct request *attempt_front_merge(struct request_queue *q,
 	return NULL;
 }
 
-int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
-			  struct request *next)
+/*
+ * Try to merge 'next' into 'rq'. Return true if the merge happened, false
+ * otherwise. The caller is responsible for freeing 'next' if the merge
+ * happened.
+ */
+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+			   struct request *next)
 {
-	struct request *free;
-
-	free = attempt_merge(q, rq, next);
-	if (free) {
-		blk_put_request(free);
-		return 1;
-	}
-
-	return 0;
+	return attempt_merge(q, rq, next);
 }
 
 bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2403a5c2b053..c838d81ac058 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -399,9 +399,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
 	return ret;
 }
 
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
+				   struct list_head *free)
 {
-	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index aff037cfd8e7..5246ae040704 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -13,7 +13,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs, struct request **merged_request);
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs);
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
+				   struct list_head *free);
 void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
 void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 4b1ca7b7bbeb..d08779f77a26 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -302,6 +302,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
 	return NULL;
 }
 
+/* Free all requests on the list */
+static inline void blk_mq_free_requests(struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct request *rq = list_entry_rq(list->next);
+
+		list_del_init(&rq->queuelist);
+		blk_mq_free_request(rq);
+	}
+}
+
 /*
  * For shared tag users, we track the number of currently active users
  * and attempt to provide a fair share of the tag depth for each of them.
diff --git a/block/blk.h b/block/blk.h
index 4fcd7a032377..4b885c0f6708 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -224,7 +224,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *,
 void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
 int ll_back_merge_fn(struct request *req, struct bio *bio,
 		unsigned int nr_segs);
-int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 				struct request *next);
 unsigned int blk_recalc_rq_segments(struct request *rq);
 void blk_rq_set_mixed_merge(struct request *rq);
diff --git a/block/elevator.c b/block/elevator.c
index 85d0d4adbb64..52ada14cfe45 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -350,9 +350,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
  * we can append 'rq' to an existing request, so we can throw 'rq' away
  * afterwards.
  *
- * Returns true if we merged, false otherwise
+ * Returns true if we merged, false otherwise. 'free' will contain all
+ * requests that need to be freed.
  */
-bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq,
+			      struct list_head *free)
 {
 	struct request *__rq;
 	bool ret;
@@ -363,8 +365,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
 	/*
 	 * First try one-hit cache.
 	 */
-	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
+	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) {
+		list_add(&rq->queuelist, free);
 		return true;
+	}
 
 	if (blk_queue_noxmerges(q))
 		return false;
@@ -378,6 +382,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
 		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
 			break;
 
+		list_add(&rq->queuelist, free);
 		/* The merged request could be merged with others, try again */
 		ret = true;
 		rq = __rq;
diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c
index 4815e536091f..9db6da9ef4c6 100644
--- a/block/mq-deadline-main.c
+++ b/block/mq-deadline-main.c
@@ -719,6 +719,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	struct dd_per_prio *per_prio;
 	enum dd_prio prio;
 	struct dd_blkcg *blkcg;
+	LIST_HEAD(free);
 
 	lockdep_assert_held(&dd->lock);
 
@@ -742,8 +743,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	WARN_ON_ONCE(rq->elv.priv[0]);
 	rq->elv.priv[0] = blkcg;
 
-	if (blk_mq_sched_try_insert_merge(q, rq))
+	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
+		blk_mq_free_requests(&free);
 		return;
+	}
 
 	trace_block_rq_insert(rq);
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 783ecb3cb77a..ef9ceead3db1 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *,
 		enum elv_merge);
-extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
+extern bool elv_attempt_insert_merge(struct request_queue *, struct request *,
+				     struct list_head *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
 void elevator_init_mq(struct request_queue *q);
-- 
cgit v1.2.3


From 15a64f5a8870b5610b616a4aa753262dfaa5d76e Mon Sep 17 00:00:00 2001
From: Claudio Imbrenda <imbrenda@linux.ibm.com>
Date: Thu, 24 Jun 2021 18:39:36 -0700
Subject: mm/vmalloc: add vmalloc_no_huge

Patch series "mm: add vmalloc_no_huge and use it", v4.

Add vmalloc_no_huge() and export it, so modules can allocate memory with
small pages.

Use the newly added vmalloc_no_huge() in KVM on s390 to get around a
hardware limitation.

This patch (of 2):

Commit 121e6f3258fe3 ("mm/vmalloc: hugepage vmalloc mappings") added
support for hugepage vmalloc mappings, it also added the flag
VM_NO_HUGE_VMAP for __vmalloc_node_range to request the allocation to be
performed with 0-order non-huge pages.

This flag is not accessible when calling vmalloc, the only option is to
call directly __vmalloc_node_range, which is not exported.

This means that a module can't vmalloc memory with small pages.

Case in point: KVM on s390x needs to vmalloc a large area, and it needs
to be mapped with non-huge pages, because of a hardware limitation.

This patch adds the function vmalloc_no_huge, which works like vmalloc,
but it is guaranteed to always back the mapping using small pages.  This
new function is exported, therefore it is usable by modules.

[akpm@linux-foundation.org: whitespace fixes, per Christoph]

Link: https://lkml.kernel.org/r/20210614132357.10202-1-imbrenda@linux.ibm.com
Link: https://lkml.kernel.org/r/20210614132357.10202-2-imbrenda@linux.ibm.com
Fixes: 121e6f3258fe3 ("mm/vmalloc: hugepage vmalloc mappings")
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h |  1 +
 mm/vmalloc.c            | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 4d668abb6391..bfaaf0b6fa76 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -135,6 +135,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			const void *caller);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller);
+void *vmalloc_no_huge(unsigned long size);
 
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a13ac524f6ff..fada19e17814 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2998,6 +2998,23 @@ void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
+/**
+ * vmalloc_no_huge - allocate virtually contiguous memory using small pages
+ * @size:    allocation size
+ *
+ * Allocate enough non-huge pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_no_huge(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
+				    NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_no_huge);
+
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
  * @size:    allocation size
-- 
cgit v1.2.3


From fe19bd3dae3d15d2fbfdb3de8839a6ea0fe94264 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 24 Jun 2021 18:39:52 -0700
Subject: mm, futex: fix shared futex pgoff on shmem huge page

If more than one futex is placed on a shmem huge page, it can happen
that waking the second wakes the first instead, and leaves the second
waiting: the key's shared.pgoff is wrong.

When 3.11 commit 13d60f4b6ab5 ("futex: Take hugepages into account when
generating futex_key"), the only shared huge pages came from hugetlbfs,
and the code added to deal with its exceptional page->index was put into
hugetlb source.  Then that was missed when 4.8 added shmem huge pages.

page_to_pgoff() is what others use for this nowadays: except that, as
currently written, it gives the right answer on hugetlbfs head, but
nonsense on hugetlbfs tails.  Fix that by calling hugetlbfs-specific
hugetlb_basepage_index() on PageHuge tails as well as on head.

Yes, it's unconventional to declare hugetlb_basepage_index() there in
pagemap.h, rather than in hugetlb.h; but I do not expect anything but
page_to_pgoff() ever to need it.

[akpm@linux-foundation.org: give hugetlb_basepage_index() prototype the correct scope]

Link: https://lkml.kernel.org/r/b17d946b-d09-326e-b42a-52884c36df32@google.com
Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
Reported-by: Neel Natu <neelnatu@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Zhang Yi <wetpzy@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 16 ----------------
 include/linux/pagemap.h | 13 +++++++------
 kernel/futex.c          |  3 +--
 mm/hugetlb.c            |  5 +----
 4 files changed, 9 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6504346a1947..3c0117656745 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -741,17 +741,6 @@ static inline int hstate_index(struct hstate *h)
 	return h - hstates;
 }
 
-pgoff_t __basepage_index(struct page *page);
-
-/* Return page->index in PAGE_SIZE units */
-static inline pgoff_t basepage_index(struct page *page)
-{
-	if (!PageCompound(page))
-		return page->index;
-
-	return __basepage_index(page);
-}
-
 extern int dissolve_free_huge_page(struct page *page);
 extern int dissolve_free_huge_pages(unsigned long start_pfn,
 				    unsigned long end_pfn);
@@ -988,11 +977,6 @@ static inline int hstate_index(struct hstate *h)
 	return 0;
 }
 
-static inline pgoff_t basepage_index(struct page *page)
-{
-	return page->index;
-}
-
 static inline int dissolve_free_huge_page(struct page *page)
 {
 	return 0;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e89df447fae3..0f1b34dbf3a2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -516,7 +516,7 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 }
 
 /*
- * Get index of the page with in radix-tree
+ * Get index of the page within radix-tree (but not for hugetlb pages).
  * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
  */
 static inline pgoff_t page_to_index(struct page *page)
@@ -535,15 +535,16 @@ static inline pgoff_t page_to_index(struct page *page)
 	return pgoff;
 }
 
+extern pgoff_t hugetlb_basepage_index(struct page *page);
+
 /*
- * Get the offset in PAGE_SIZE.
- * (TODO: hugepage should have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
+ * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
  */
 static inline pgoff_t page_to_pgoff(struct page *page)
 {
-	if (unlikely(PageHeadHuge(page)))
-		return page->index << compound_order(page);
-
+	if (unlikely(PageHuge(page)))
+		return hugetlb_basepage_index(page);
 	return page_to_index(page);
 }
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 4938a00bc785..408cad5e8968 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -35,7 +35,6 @@
 #include <linux/jhash.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
-#include <linux/hugetlb.h>
 #include <linux/freezer.h>
 #include <linux/memblock.h>
 #include <linux/fault-inject.h>
@@ -650,7 +649,7 @@ again:
 
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 		key->shared.i_seq = get_inode_sequence_number(inode);
-		key->shared.pgoff = basepage_index(tail);
+		key->shared.pgoff = page_to_pgoff(tail);
 		rcu_read_unlock();
 	}
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e0a5f9cbbece..5ba5a0da6d57 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1588,15 +1588,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
 	return NULL;
 }
 
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
 {
 	struct page *page_head = compound_head(page);
 	pgoff_t index = page_index(page_head);
 	unsigned long compound_idx;
 
-	if (!PageHuge(page_head))
-		return page_index(page);
-
 	if (compound_order(page_head) >= MAX_ORDER)
 		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
 	else
-- 
cgit v1.2.3


From 4834446035a1011ff1231626ef33555d64c4fd78 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 25 Jun 2021 09:35:11 +0200
Subject: tty: make linux/tty_flip.h self-contained

If someone includes linux/tty_flip.h before linux/tty.h, they see
many compiler errors like:
 include/linux/tty_flip.h:23:30: error: invalid use of undefined type 'struct tty_port'
 include/linux/tty_flip.h:26:14: error: invalid use of undefined type 'struct tty_buffer'

tty_flip.h actually lexicographically sorts before tty.h. So if people
sort includes (as I tried in amiserial), the compilation suddenly
breaks.

Solve this by including linux/tty.h from linux/tty_flip.h, so that
everything is defined as needed.

Another alternative would be to uninline tty_insert_flip_char and just
insert forward declarations of tty_port and tty_buffer structs into
tty_flip.h as that inline is the only real user. But that would mean
slowing down the fast path without any good reason. (Provided the fix
is that easy and there were no real problems with this until now.)

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210625073511.4514-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_flip.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h
index d6729281ec50..67d78dc553e1 100644
--- a/include/linux/tty_flip.h
+++ b/include/linux/tty_flip.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_TTY_FLIP_H
 #define _LINUX_TTY_FLIP_H
 
+#include <linux/tty.h>
+
 extern int tty_buffer_set_limit(struct tty_port *port, int limit);
 extern unsigned int tty_buffer_space_avail(struct tty_port *port);
 extern int tty_buffer_request_room(struct tty_port *port, size_t size);
-- 
cgit v1.2.3


From b470e10eb43f19e08245cd87dd3192a8141cfbb5 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Fri, 25 Jun 2021 10:52:11 +0530
Subject: spi: core: add dma_map_dev for dma device

Some controllers like qcom geni need the parent device to be used for
dma mapping, so add a dma_map_dev field and let drivers fill this to be
used as mapping device

Signed-off-by: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20210625052213.32260-4-vkoul@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 4 ++++
 include/linux/spi/spi.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index c296f08b36c1..35928d0843d9 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -993,11 +993,15 @@ static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg)
 
 	if (ctlr->dma_tx)
 		tx_dev = ctlr->dma_tx->device->dev;
+	else if (ctlr->dma_map_dev)
+		tx_dev = ctlr->dma_map_dev;
 	else
 		tx_dev = ctlr->dev.parent;
 
 	if (ctlr->dma_rx)
 		rx_dev = ctlr->dma_rx->device->dev;
+	else if (ctlr->dma_map_dev)
+		rx_dev = ctlr->dma_map_dev;
 	else
 		rx_dev = ctlr->dev.parent;
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 3ada36175e5f..97b8d12b5f2b 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -588,6 +588,7 @@ struct spi_controller {
 	bool			(*can_dma)(struct spi_controller *ctlr,
 					   struct spi_device *spi,
 					   struct spi_transfer *xfer);
+	struct device *dma_map_dev;
 
 	/*
 	 * These hooks are for drivers that want to use the generic
-- 
cgit v1.2.3


From 24e166f43e93de0e9b0a460ecfe4bab1f12212d7 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sat, 29 May 2021 17:14:21 +0200
Subject: HID: core: Add hid_hw_may_wakeup() function

Add a hid_hw_may_wakeup() function, which is the equivalent of
device_may_wakeup() for hid devices.

In most cases this just returns device_may_wakeup(hdev->dev.parent), but for
some ll-drivers this is not correct. E.g. usb_hid_driver instantiated hid
devices have their parent set to the usb-interface to which the usb_hid_driver
is bound, but the power/wakeup* sysfs attributes are part of the usb-device,
which is the usb-interface's parent.

For these special cases a new may_wakeup callback is added to
hid_ll_driver, so that ll-drivers can override the default behavior.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/hid.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 10e922cee4eb..51a4dad3565e 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -800,6 +800,7 @@ struct hid_driver {
  * @raw_request: send raw report request to device (e.g. feature report)
  * @output_report: send output report to device
  * @idle: send idle request to device
+ * @may_wakeup: return if device may act as a wakeup source during system-suspend
  */
 struct hid_ll_driver {
 	int (*start)(struct hid_device *hdev);
@@ -824,6 +825,7 @@ struct hid_ll_driver {
 	int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len);
 
 	int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype);
+	bool (*may_wakeup)(struct hid_device *hdev);
 };
 
 extern struct hid_ll_driver i2c_hid_ll_driver;
@@ -1149,6 +1151,22 @@ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle,
 	return 0;
 }
 
+/**
+ * hid_may_wakeup - return if the hid device may act as a wakeup source during system-suspend
+ *
+ * @hdev: hid device
+ */
+static inline bool hid_hw_may_wakeup(struct hid_device *hdev)
+{
+	if (hdev->ll_driver->may_wakeup)
+		return hdev->ll_driver->may_wakeup(hdev);
+
+	if (hdev->dev.parent)
+		return device_may_wakeup(hdev->dev.parent);
+
+	return false;
+}
+
 /**
  * hid_hw_wait - wait for buffered io to complete
  *
-- 
cgit v1.2.3


From db59e1b6e49201beacdbd0622aa3594f2de4f727 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Fri, 18 Jun 2021 17:20:56 +0200
Subject: ACPI: arm64: Move DMA setup operations out of IORT

Extract generic DMA setup code out of IORT, so it can be reused by VIOT.
Keep it in drivers/acpi/arm64 for now, since it could break x86
platforms that haven't run this code so far, if they have invalid
tables.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20210618152059.1194210-2-jean-philippe@linaro.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/Makefile |  1 +
 drivers/acpi/arm64/dma.c    | 50 +++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/arm64/iort.c   | 54 +++++++--------------------------------------
 drivers/acpi/scan.c         |  2 +-
 include/linux/acpi.h        |  3 +++
 include/linux/acpi_iort.h   |  6 ++---
 6 files changed, 66 insertions(+), 50 deletions(-)
 create mode 100644 drivers/acpi/arm64/dma.c

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index 6ff50f4ed947..66acbe77f46e 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_ACPI_IORT) 	+= iort.o
 obj-$(CONFIG_ACPI_GTDT) 	+= gtdt.o
+obj-y				+= dma.o
diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c
new file mode 100644
index 000000000000..f16739ad3cc0
--- /dev/null
+++ b/drivers/acpi/arm64/dma.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
+#include <linux/device.h>
+#include <linux/dma-direct.h>
+
+void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
+{
+	int ret;
+	u64 end, mask;
+	u64 dmaaddr = 0, size = 0, offset = 0;
+
+	/*
+	 * If @dev is expected to be DMA-capable then the bus code that created
+	 * it should have initialised its dma_mask pointer by this point. For
+	 * now, we'll continue the legacy behaviour of coercing it to the
+	 * coherent mask if not, but we'll no longer do so quietly.
+	 */
+	if (!dev->dma_mask) {
+		dev_warn(dev, "DMA mask not set\n");
+		dev->dma_mask = &dev->coherent_dma_mask;
+	}
+
+	if (dev->coherent_dma_mask)
+		size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1);
+	else
+		size = 1ULL << 32;
+
+	ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size);
+	if (ret == -ENODEV)
+		ret = iort_dma_get_ranges(dev, &size);
+	if (!ret) {
+		/*
+		 * Limit coherent and dma mask based on size retrieved from
+		 * firmware.
+		 */
+		end = dmaaddr + size - 1;
+		mask = DMA_BIT_MASK(ilog2(end) + 1);
+		dev->bus_dma_limit = end;
+		dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask);
+		*dev->dma_mask = min(*dev->dma_mask, mask);
+	}
+
+	*dma_addr = dmaaddr;
+	*dma_size = size;
+
+	ret = dma_direct_set_offset(dev, dmaaddr + offset, dmaaddr, size);
+
+	dev_dbg(dev, "dma_offset(%#08llx)%s\n", offset, ret ? " failed!" : "");
+}
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 3912a1f6058e..a940be1cf2af 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1144,56 +1144,18 @@ static int rc_dma_get_range(struct device *dev, u64 *size)
 }
 
 /**
- * iort_dma_setup() - Set-up device DMA parameters.
+ * iort_dma_get_ranges() - Look up DMA addressing limit for the device
+ * @dev: device to lookup
+ * @size: DMA range size result pointer
  *
- * @dev: device to configure
- * @dma_addr: device DMA address result pointer
- * @dma_size: DMA range size result pointer
+ * Return: 0 on success, an error otherwise.
  */
-void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
+int iort_dma_get_ranges(struct device *dev, u64 *size)
 {
-	u64 end, mask, dmaaddr = 0, size = 0, offset = 0;
-	int ret;
-
-	/*
-	 * If @dev is expected to be DMA-capable then the bus code that created
-	 * it should have initialised its dma_mask pointer by this point. For
-	 * now, we'll continue the legacy behaviour of coercing it to the
-	 * coherent mask if not, but we'll no longer do so quietly.
-	 */
-	if (!dev->dma_mask) {
-		dev_warn(dev, "DMA mask not set\n");
-		dev->dma_mask = &dev->coherent_dma_mask;
-	}
-
-	if (dev->coherent_dma_mask)
-		size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1);
+	if (dev_is_pci(dev))
+		return rc_dma_get_range(dev, size);
 	else
-		size = 1ULL << 32;
-
-	ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size);
-	if (ret == -ENODEV)
-		ret = dev_is_pci(dev) ? rc_dma_get_range(dev, &size)
-				      : nc_dma_get_range(dev, &size);
-
-	if (!ret) {
-		/*
-		 * Limit coherent and dma mask based on size retrieved from
-		 * firmware.
-		 */
-		end = dmaaddr + size - 1;
-		mask = DMA_BIT_MASK(ilog2(end) + 1);
-		dev->bus_dma_limit = end;
-		dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask);
-		*dev->dma_mask = min(*dev->dma_mask, mask);
-	}
-
-	*dma_addr = dmaaddr;
-	*dma_size = size;
-
-	ret = dma_direct_set_offset(dev, dmaaddr + offset, dmaaddr, size);
-
-	dev_dbg(dev, "dma_offset(%#08llx)%s\n", offset, ret ? " failed!" : "");
+		return nc_dma_get_range(dev, size);
 }
 
 static void __init acpi_iort_register_irq(int hwirq, const char *name,
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index e10d38ac7cf2..ea613df8f913 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1537,7 +1537,7 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
 		return 0;
 	}
 
-	iort_dma_setup(dev, &dma_addr, &size);
+	acpi_arch_dma_setup(dev, &dma_addr, &size);
 
 	iommu = iort_iommu_configure_id(dev, input_id);
 	if (PTR_ERR(iommu) == -EPROBE_DEFER)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..7aaa9559cc19 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -259,9 +259,12 @@ void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa);
 
 #ifdef CONFIG_ARM64
 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa);
+void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size);
 #else
 static inline void
 acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
+static inline void
+acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { }
 #endif
 
 int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index 1a12baa58e40..f7f054833afd 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -34,7 +34,7 @@ struct irq_domain *iort_get_device_domain(struct device *dev, u32 id,
 void acpi_configure_pmsi_domain(struct device *dev);
 int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
-void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size);
+int iort_dma_get_ranges(struct device *dev, u64 *size);
 const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
 						const u32 *id_in);
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
@@ -48,8 +48,8 @@ static inline struct irq_domain *iort_get_device_domain(
 { return NULL; }
 static inline void acpi_configure_pmsi_domain(struct device *dev) { }
 /* IOMMU interface */
-static inline void iort_dma_setup(struct device *dev, u64 *dma_addr,
-				  u64 *size) { }
+static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
+{ return -ENODEV; }
 static inline const struct iommu_ops *iort_iommu_configure_id(
 				      struct device *dev, const u32 *id_in)
 { return NULL; }
-- 
cgit v1.2.3


From 11a8c5e3a94b12848f24d9c63b5c175ce0b80729 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Fri, 18 Jun 2021 17:20:57 +0200
Subject: ACPI: Move IOMMU setup code out of IORT

Extract the code that sets up the IOMMU infrastructure from IORT, since
it can be reused by VIOT. Move it one level up into a new
acpi_iommu_configure_id() function, which calls the IORT parsing
function which in turn calls the acpi_iommu_fwspec_init() helper.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20210618152059.1194210-3-jean-philippe@linaro.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 74 +++++------------------------------------------
 drivers/acpi/scan.c       | 73 +++++++++++++++++++++++++++++++++++++++++++++-
 include/acpi/acpi_bus.h   |  3 ++
 include/linux/acpi_iort.h |  8 ++---
 4 files changed, 86 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index a940be1cf2af..487d1095030d 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -806,23 +806,6 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
 	return NULL;
 }
 
-static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev)
-{
-	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-
-	return (fwspec && fwspec->ops) ? fwspec->ops : NULL;
-}
-
-static inline int iort_add_device_replay(struct device *dev)
-{
-	int err = 0;
-
-	if (dev->bus && !device_iommu_mapped(dev))
-		err = iommu_probe_device(dev);
-
-	return err;
-}
-
 /**
  * iort_iommu_msi_get_resv_regions - Reserved region driver helper
  * @dev: Device from iommu_get_resv_regions()
@@ -900,18 +883,6 @@ static inline bool iort_iommu_driver_enabled(u8 type)
 	}
 }
 
-static int arm_smmu_iort_xlate(struct device *dev, u32 streamid,
-			       struct fwnode_handle *fwnode,
-			       const struct iommu_ops *ops)
-{
-	int ret = iommu_fwspec_init(dev, fwnode, ops);
-
-	if (!ret)
-		ret = iommu_fwspec_add_ids(dev, &streamid, 1);
-
-	return ret;
-}
-
 static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node)
 {
 	struct acpi_iort_root_complex *pci_rc;
@@ -946,7 +917,7 @@ static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node,
 		return iort_iommu_driver_enabled(node->type) ?
 		       -EPROBE_DEFER : -ENODEV;
 
-	return arm_smmu_iort_xlate(dev, streamid, iort_fwnode, ops);
+	return acpi_iommu_fwspec_init(dev, streamid, iort_fwnode, ops);
 }
 
 struct iort_pci_alias_info {
@@ -1020,24 +991,13 @@ static int iort_nc_iommu_map_id(struct device *dev,
  * @dev: device to configure
  * @id_in: optional input id const value pointer
  *
- * Returns: iommu_ops pointer on configuration success
- *          NULL on configuration failure
+ * Returns: 0 on success, <0 on failure
  */
-const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
-						const u32 *id_in)
+int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 {
 	struct acpi_iort_node *node;
-	const struct iommu_ops *ops;
 	int err = -ENODEV;
 
-	/*
-	 * If we already translated the fwspec there
-	 * is nothing left to do, return the iommu_ops.
-	 */
-	ops = iort_fwspec_iommu_ops(dev);
-	if (ops)
-		return ops;
-
 	if (dev_is_pci(dev)) {
 		struct iommu_fwspec *fwspec;
 		struct pci_bus *bus = to_pci_dev(dev)->bus;
@@ -1046,7 +1006,7 @@ const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
 		node = iort_scan_node(ACPI_IORT_NODE_PCI_ROOT_COMPLEX,
 				      iort_match_node_callback, &bus->dev);
 		if (!node)
-			return NULL;
+			return -ENODEV;
 
 		info.node = node;
 		err = pci_for_each_dma_alias(to_pci_dev(dev),
@@ -1059,7 +1019,7 @@ const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
 		node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT,
 				      iort_match_node_callback, dev);
 		if (!node)
-			return NULL;
+			return -ENODEV;
 
 		err = id_in ? iort_nc_iommu_map_id(dev, node, id_in) :
 			      iort_nc_iommu_map(dev, node);
@@ -1068,32 +1028,14 @@ const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
 			iort_named_component_init(dev, node);
 	}
 
-	/*
-	 * If we have reason to believe the IOMMU driver missed the initial
-	 * add_device callback for dev, replay it to get things in order.
-	 */
-	if (!err) {
-		ops = iort_fwspec_iommu_ops(dev);
-		err = iort_add_device_replay(dev);
-	}
-
-	/* Ignore all other errors apart from EPROBE_DEFER */
-	if (err == -EPROBE_DEFER) {
-		ops = ERR_PTR(err);
-	} else if (err) {
-		dev_dbg(dev, "Adding to IOMMU failed: %d\n", err);
-		ops = NULL;
-	}
-
-	return ops;
+	return err;
 }
 
 #else
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 { return 0; }
-const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
-						const u32 *input_id)
-{ return NULL; }
+int iort_iommu_configure_id(struct device *dev, const u32 *input_id)
+{ return -ENODEV; }
 #endif
 
 static int nc_dma_get_range(struct device *dev, u64 *size)
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index ea613df8f913..2a2e690040e9 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/acpi.h>
 #include <linux/acpi_iort.h>
+#include <linux/iommu.h>
 #include <linux/signal.h>
 #include <linux/kthread.h>
 #include <linux/dmi.h>
@@ -1520,6 +1521,76 @@ int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
 	return ret >= 0 ? 0 : ret;
 }
 
+#ifdef CONFIG_IOMMU_API
+int acpi_iommu_fwspec_init(struct device *dev, u32 id,
+			   struct fwnode_handle *fwnode,
+			   const struct iommu_ops *ops)
+{
+	int ret = iommu_fwspec_init(dev, fwnode, ops);
+
+	if (!ret)
+		ret = iommu_fwspec_add_ids(dev, &id, 1);
+
+	return ret;
+}
+
+static inline const struct iommu_ops *acpi_iommu_fwspec_ops(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+	return fwspec ? fwspec->ops : NULL;
+}
+
+static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
+						       const u32 *id_in)
+{
+	int err;
+	const struct iommu_ops *ops;
+
+	/*
+	 * If we already translated the fwspec there is nothing left to do,
+	 * return the iommu_ops.
+	 */
+	ops = acpi_iommu_fwspec_ops(dev);
+	if (ops)
+		return ops;
+
+	err = iort_iommu_configure_id(dev, id_in);
+
+	/*
+	 * If we have reason to believe the IOMMU driver missed the initial
+	 * iommu_probe_device() call for dev, replay it to get things in order.
+	 */
+	if (!err && dev->bus && !device_iommu_mapped(dev))
+		err = iommu_probe_device(dev);
+
+	/* Ignore all other errors apart from EPROBE_DEFER */
+	if (err == -EPROBE_DEFER) {
+		return ERR_PTR(err);
+	} else if (err) {
+		dev_dbg(dev, "Adding to IOMMU failed: %d\n", err);
+		return NULL;
+	}
+	return acpi_iommu_fwspec_ops(dev);
+}
+
+#else /* !CONFIG_IOMMU_API */
+
+int acpi_iommu_fwspec_init(struct device *dev, u32 id,
+			   struct fwnode_handle *fwnode,
+			   const struct iommu_ops *ops)
+{
+	return -ENODEV;
+}
+
+static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
+						       const u32 *id_in)
+{
+	return NULL;
+}
+
+#endif /* !CONFIG_IOMMU_API */
+
 /**
  * acpi_dma_configure_id - Set-up DMA configuration for the device.
  * @dev: The pointer to the device
@@ -1539,7 +1610,7 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
 
 	acpi_arch_dma_setup(dev, &dma_addr, &size);
 
-	iommu = iort_iommu_configure_id(dev, input_id);
+	iommu = acpi_iommu_configure_id(dev, input_id);
 	if (PTR_ERR(iommu) == -EPROBE_DEFER)
 		return -EPROBE_DEFER;
 
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 3a82faac5767..41f092a269f6 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -588,6 +588,9 @@ struct acpi_pci_root {
 
 bool acpi_dma_supported(struct acpi_device *adev);
 enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
+int acpi_iommu_fwspec_init(struct device *dev, u32 id,
+			   struct fwnode_handle *fwnode,
+			   const struct iommu_ops *ops);
 int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
 		       u64 *size);
 int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index f7f054833afd..f1f0842a2cb2 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -35,8 +35,7 @@ void acpi_configure_pmsi_domain(struct device *dev);
 int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
 int iort_dma_get_ranges(struct device *dev, u64 *size);
-const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
-						const u32 *id_in);
+int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
 phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
 #else
@@ -50,9 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device *dev) { }
 /* IOMMU interface */
 static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
 { return -ENODEV; }
-static inline const struct iommu_ops *iort_iommu_configure_id(
-				      struct device *dev, const u32 *id_in)
-{ return NULL; }
+static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
+{ return -ENODEV; }
 static inline
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 { return 0; }
-- 
cgit v1.2.3


From 3cf485540e7b8550936ce3602edf2f58e4007304 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Fri, 18 Jun 2021 17:20:58 +0200
Subject: ACPI: Add driver for the VIOT table

The ACPI Virtual I/O Translation Table describes topology of
para-virtual platforms, similarly to vendor tables DMAR, IVRS and IORT.
For now it describes the relation between virtio-iommu and the endpoints
it manages.

Three steps are needed to configure DMA of endpoints:

(1) acpi_viot_init(): parse the VIOT table, find or create the fwnode
    associated to each vIOMMU device. This needs to happen after
    acpi_scan_init(), because it relies on the struct device and their
    fwnode to be available.

(2) When probing the vIOMMU device, the driver registers its IOMMU ops
    within the IOMMU subsystem. This step doesn't require any
    intervention from the VIOT driver.

(3) viot_iommu_configure(): before binding the endpoint to a driver,
    find the associated IOMMU ops. Register them, along with the
    endpoint ID, into the device's iommu_fwspec.

If step (3) happens before step (2), it is deferred until the IOMMU is
initialized, then retried.

Tested-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20210618152059.1194210-4-jean-philippe@linaro.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 MAINTAINERS               |   8 +
 drivers/acpi/Kconfig      |   3 +
 drivers/acpi/Makefile     |   2 +
 drivers/acpi/bus.c        |   2 +
 drivers/acpi/scan.c       |   3 +
 drivers/acpi/viot.c       | 366 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/iommu/Kconfig     |   1 +
 include/linux/acpi_viot.h |  19 +++
 8 files changed, 404 insertions(+)
 create mode 100644 drivers/acpi/viot.c
 create mode 100644 include/linux/acpi_viot.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 503fd21901f1..9f06619f7e41 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -431,6 +431,14 @@ W:	https://01.org/linux-acpi
 B:	https://bugzilla.kernel.org
 F:	drivers/acpi/acpi_video.c
 
+ACPI VIOT DRIVER
+M:	Jean-Philippe Brucker <jean-philippe@linaro.org>
+L:	linux-acpi@vger.kernel.org
+L:	iommu@lists.linux-foundation.org
+S:	Maintained
+F:	drivers/acpi/viot.c
+F:	include/linux/acpi_viot.h
+
 ACPI WMI DRIVER
 L:	platform-driver-x86@vger.kernel.org
 S:	Orphan
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index eedec61e3476..3758c6940ed7 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -526,6 +526,9 @@ endif
 
 source "drivers/acpi/pmic/Kconfig"
 
+config ACPI_VIOT
+	bool
+
 endif	# ACPI
 
 config X86_PM_TIMER
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 700b41adf2db..a6e644c48987 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -118,3 +118,5 @@ video-objs			+= acpi_video.o video_detect.o
 obj-y				+= dptf/
 
 obj-$(CONFIG_ARM64)		+= arm64/
+
+obj-$(CONFIG_ACPI_VIOT)		+= viot.o
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index be7da23fad76..4a96b7097a62 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -27,6 +27,7 @@
 #include <linux/dmi.h>
 #endif
 #include <linux/acpi_iort.h>
+#include <linux/acpi_viot.h>
 #include <linux/pci.h>
 #include <acpi/apei.h>
 #include <linux/suspend.h>
@@ -1345,6 +1346,7 @@ static int __init acpi_init(void)
 	acpi_wakeup_device_init();
 	acpi_debugger_init();
 	acpi_setup_sb_notify_handler();
+	acpi_viot_init();
 	return 0;
 }
 
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 2a2e690040e9..3e2bb04ab528 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/acpi.h>
 #include <linux/acpi_iort.h>
+#include <linux/acpi_viot.h>
 #include <linux/iommu.h>
 #include <linux/signal.h>
 #include <linux/kthread.h>
@@ -1556,6 +1557,8 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
 		return ops;
 
 	err = iort_iommu_configure_id(dev, id_in);
+	if (err && err != -EPROBE_DEFER)
+		err = viot_iommu_configure(dev);
 
 	/*
 	 * If we have reason to believe the IOMMU driver missed the initial
diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c
new file mode 100644
index 000000000000..d2256326c73a
--- /dev/null
+++ b/drivers/acpi/viot.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Virtual I/O topology
+ *
+ * The Virtual I/O Translation Table (VIOT) describes the topology of
+ * para-virtual IOMMUs and the endpoints they manage. The OS uses it to
+ * initialize devices in the right order, preventing endpoints from issuing DMA
+ * before their IOMMU is ready.
+ *
+ * When binding a driver to a device, before calling the device driver's probe()
+ * method, the driver infrastructure calls dma_configure(). At that point the
+ * VIOT driver looks for an IOMMU associated to the device in the VIOT table.
+ * If an IOMMU exists and has been initialized, the VIOT driver initializes the
+ * device's IOMMU fwspec, allowing the DMA infrastructure to invoke the IOMMU
+ * ops when the device driver configures DMA mappings. If an IOMMU exists and
+ * hasn't yet been initialized, VIOT returns -EPROBE_DEFER to postpone probing
+ * the device until the IOMMU is available.
+ */
+#define pr_fmt(fmt) "ACPI: VIOT: " fmt
+
+#include <linux/acpi_viot.h>
+#include <linux/dma-iommu.h>
+#include <linux/fwnode.h>
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+struct viot_iommu {
+	/* Node offset within the table */
+	unsigned int			offset;
+	struct fwnode_handle		*fwnode;
+	struct list_head		list;
+};
+
+struct viot_endpoint {
+	union {
+		/* PCI range */
+		struct {
+			u16		segment_start;
+			u16		segment_end;
+			u16		bdf_start;
+			u16		bdf_end;
+		};
+		/* MMIO */
+		u64			address;
+	};
+	u32				endpoint_id;
+	struct viot_iommu		*viommu;
+	struct list_head		list;
+};
+
+static struct acpi_table_viot *viot;
+static LIST_HEAD(viot_iommus);
+static LIST_HEAD(viot_pci_ranges);
+static LIST_HEAD(viot_mmio_endpoints);
+
+static int __init viot_check_bounds(const struct acpi_viot_header *hdr)
+{
+	struct acpi_viot_header *start, *end, *hdr_end;
+
+	start = ACPI_ADD_PTR(struct acpi_viot_header, viot,
+			     max_t(size_t, sizeof(*viot), viot->node_offset));
+	end = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->header.length);
+	hdr_end = ACPI_ADD_PTR(struct acpi_viot_header, hdr, sizeof(*hdr));
+
+	if (hdr < start || hdr_end > end) {
+		pr_err(FW_BUG "Node pointer overflows\n");
+		return -EOVERFLOW;
+	}
+	if (hdr->length < sizeof(*hdr)) {
+		pr_err(FW_BUG "Empty node\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int __init viot_get_pci_iommu_fwnode(struct viot_iommu *viommu,
+					    u16 segment, u16 bdf)
+{
+	struct pci_dev *pdev;
+	struct fwnode_handle *fwnode;
+
+	pdev = pci_get_domain_bus_and_slot(segment, PCI_BUS_NUM(bdf),
+					   bdf & 0xff);
+	if (!pdev) {
+		pr_err("Could not find PCI IOMMU\n");
+		return -ENODEV;
+	}
+
+	fwnode = pdev->dev.fwnode;
+	if (!fwnode) {
+		/*
+		 * PCI devices aren't necessarily described by ACPI. Create a
+		 * fwnode so the IOMMU subsystem can identify this device.
+		 */
+		fwnode = acpi_alloc_fwnode_static();
+		if (!fwnode) {
+			pci_dev_put(pdev);
+			return -ENOMEM;
+		}
+		set_primary_fwnode(&pdev->dev, fwnode);
+	}
+	viommu->fwnode = pdev->dev.fwnode;
+	pci_dev_put(pdev);
+	return 0;
+}
+
+static int __init viot_get_mmio_iommu_fwnode(struct viot_iommu *viommu,
+					     u64 address)
+{
+	struct acpi_device *adev;
+	struct resource res = {
+		.start	= address,
+		.end	= address,
+		.flags	= IORESOURCE_MEM,
+	};
+
+	adev = acpi_resource_consumer(&res);
+	if (!adev) {
+		pr_err("Could not find MMIO IOMMU\n");
+		return -EINVAL;
+	}
+	viommu->fwnode = &adev->fwnode;
+	return 0;
+}
+
+static struct viot_iommu * __init viot_get_iommu(unsigned int offset)
+{
+	int ret;
+	struct viot_iommu *viommu;
+	struct acpi_viot_header *hdr = ACPI_ADD_PTR(struct acpi_viot_header,
+						    viot, offset);
+	union {
+		struct acpi_viot_virtio_iommu_pci pci;
+		struct acpi_viot_virtio_iommu_mmio mmio;
+	} *node = (void *)hdr;
+
+	list_for_each_entry(viommu, &viot_iommus, list)
+		if (viommu->offset == offset)
+			return viommu;
+
+	if (viot_check_bounds(hdr))
+		return NULL;
+
+	viommu = kzalloc(sizeof(*viommu), GFP_KERNEL);
+	if (!viommu)
+		return NULL;
+
+	viommu->offset = offset;
+	switch (hdr->type) {
+	case ACPI_VIOT_NODE_VIRTIO_IOMMU_PCI:
+		if (hdr->length < sizeof(node->pci))
+			goto err_free;
+
+		ret = viot_get_pci_iommu_fwnode(viommu, node->pci.segment,
+						node->pci.bdf);
+		break;
+	case ACPI_VIOT_NODE_VIRTIO_IOMMU_MMIO:
+		if (hdr->length < sizeof(node->mmio))
+			goto err_free;
+
+		ret = viot_get_mmio_iommu_fwnode(viommu,
+						 node->mmio.base_address);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	if (ret)
+		goto err_free;
+
+	list_add(&viommu->list, &viot_iommus);
+	return viommu;
+
+err_free:
+	kfree(viommu);
+	return NULL;
+}
+
+static int __init viot_parse_node(const struct acpi_viot_header *hdr)
+{
+	int ret = -EINVAL;
+	struct list_head *list;
+	struct viot_endpoint *ep;
+	union {
+		struct acpi_viot_mmio mmio;
+		struct acpi_viot_pci_range pci;
+	} *node = (void *)hdr;
+
+	if (viot_check_bounds(hdr))
+		return -EINVAL;
+
+	if (hdr->type == ACPI_VIOT_NODE_VIRTIO_IOMMU_PCI ||
+	    hdr->type == ACPI_VIOT_NODE_VIRTIO_IOMMU_MMIO)
+		return 0;
+
+	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	if (!ep)
+		return -ENOMEM;
+
+	switch (hdr->type) {
+	case ACPI_VIOT_NODE_PCI_RANGE:
+		if (hdr->length < sizeof(node->pci)) {
+			pr_err(FW_BUG "Invalid PCI node size\n");
+			goto err_free;
+		}
+
+		ep->segment_start = node->pci.segment_start;
+		ep->segment_end = node->pci.segment_end;
+		ep->bdf_start = node->pci.bdf_start;
+		ep->bdf_end = node->pci.bdf_end;
+		ep->endpoint_id = node->pci.endpoint_start;
+		ep->viommu = viot_get_iommu(node->pci.output_node);
+		list = &viot_pci_ranges;
+		break;
+	case ACPI_VIOT_NODE_MMIO:
+		if (hdr->length < sizeof(node->mmio)) {
+			pr_err(FW_BUG "Invalid MMIO node size\n");
+			goto err_free;
+		}
+
+		ep->address = node->mmio.base_address;
+		ep->endpoint_id = node->mmio.endpoint;
+		ep->viommu = viot_get_iommu(node->mmio.output_node);
+		list = &viot_mmio_endpoints;
+		break;
+	default:
+		pr_warn("Unsupported node %x\n", hdr->type);
+		ret = 0;
+		goto err_free;
+	}
+
+	if (!ep->viommu) {
+		pr_warn("No IOMMU node found\n");
+		/*
+		 * A future version of the table may use the node for other
+		 * purposes. Keep parsing.
+		 */
+		ret = 0;
+		goto err_free;
+	}
+
+	list_add(&ep->list, list);
+	return 0;
+
+err_free:
+	kfree(ep);
+	return ret;
+}
+
+/**
+ * acpi_viot_init - Parse the VIOT table
+ *
+ * Parse the VIOT table, prepare the list of endpoints to be used during DMA
+ * setup of devices.
+ */
+void __init acpi_viot_init(void)
+{
+	int i;
+	acpi_status status;
+	struct acpi_table_header *hdr;
+	struct acpi_viot_header *node;
+
+	status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);
+	if (ACPI_FAILURE(status)) {
+		if (status != AE_NOT_FOUND) {
+			const char *msg = acpi_format_exception(status);
+
+			pr_err("Failed to get table, %s\n", msg);
+		}
+		return;
+	}
+
+	viot = (void *)hdr;
+
+	node = ACPI_ADD_PTR(struct acpi_viot_header, viot, viot->node_offset);
+	for (i = 0; i < viot->node_count; i++) {
+		if (viot_parse_node(node))
+			return;
+
+		node = ACPI_ADD_PTR(struct acpi_viot_header, node,
+				    node->length);
+	}
+
+	acpi_put_table(hdr);
+}
+
+static int viot_dev_iommu_init(struct device *dev, struct viot_iommu *viommu,
+			       u32 epid)
+{
+	const struct iommu_ops *ops;
+
+	if (!viommu)
+		return -ENODEV;
+
+	/* We're not translating ourself */
+	if (viommu->fwnode == dev->fwnode)
+		return -EINVAL;
+
+	ops = iommu_ops_from_fwnode(viommu->fwnode);
+	if (!ops)
+		return IS_ENABLED(CONFIG_VIRTIO_IOMMU) ?
+			-EPROBE_DEFER : -ENODEV;
+
+	return acpi_iommu_fwspec_init(dev, epid, viommu->fwnode, ops);
+}
+
+static int viot_pci_dev_iommu_init(struct pci_dev *pdev, u16 dev_id, void *data)
+{
+	u32 epid;
+	struct viot_endpoint *ep;
+	u32 domain_nr = pci_domain_nr(pdev->bus);
+
+	list_for_each_entry(ep, &viot_pci_ranges, list) {
+		if (domain_nr >= ep->segment_start &&
+		    domain_nr <= ep->segment_end &&
+		    dev_id >= ep->bdf_start &&
+		    dev_id <= ep->bdf_end) {
+			epid = ((domain_nr - ep->segment_start) << 16) +
+				dev_id - ep->bdf_start + ep->endpoint_id;
+
+			/*
+			 * If we found a PCI range managed by the viommu, we're
+			 * the one that has to request ACS.
+			 */
+			pci_request_acs();
+
+			return viot_dev_iommu_init(&pdev->dev, ep->viommu,
+						   epid);
+		}
+	}
+	return -ENODEV;
+}
+
+static int viot_mmio_dev_iommu_init(struct platform_device *pdev)
+{
+	struct resource *mem;
+	struct viot_endpoint *ep;
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!mem)
+		return -ENODEV;
+
+	list_for_each_entry(ep, &viot_mmio_endpoints, list) {
+		if (ep->address == mem->start)
+			return viot_dev_iommu_init(&pdev->dev, ep->viommu,
+						   ep->endpoint_id);
+	}
+	return -ENODEV;
+}
+
+/**
+ * viot_iommu_configure - Setup IOMMU ops for an endpoint described by VIOT
+ * @dev: the endpoint
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int viot_iommu_configure(struct device *dev)
+{
+	if (dev_is_pci(dev))
+		return pci_for_each_dma_alias(to_pci_dev(dev),
+					      viot_pci_dev_iommu_init, NULL);
+	else if (dev_is_platform(dev))
+		return viot_mmio_dev_iommu_init(to_platform_device(dev));
+	return -ENODEV;
+}
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 1f111b399bca..aff8a4830dd1 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -403,6 +403,7 @@ config VIRTIO_IOMMU
 	depends on ARM64
 	select IOMMU_API
 	select INTERVAL_TREE
+	select ACPI_VIOT if ACPI
 	help
 	  Para-virtualised IOMMU driver with virtio.
 
diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h
new file mode 100644
index 000000000000..1eb8ee5b0e5f
--- /dev/null
+++ b/include/linux/acpi_viot.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ACPI_VIOT_H__
+#define __ACPI_VIOT_H__
+
+#include <linux/acpi.h>
+
+#ifdef CONFIG_ACPI_VIOT
+void __init acpi_viot_init(void);
+int viot_iommu_configure(struct device *dev);
+#else
+static inline void acpi_viot_init(void) {}
+static inline int viot_iommu_configure(struct device *dev)
+{
+	return -ENODEV;
+}
+#endif
+
+#endif /* __ACPI_VIOT_H__ */
-- 
cgit v1.2.3


From ac6d704679d343e55615551f19e9b2e18d68518b Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Fri, 18 Jun 2021 17:20:59 +0200
Subject: iommu/dma: Pass address limit rather than size to
 iommu_setup_dma_ops()

Passing a 64-bit address width to iommu_setup_dma_ops() is valid on
virtual platforms, but isn't currently possible. The overflow check in
iommu_dma_init_domain() prevents this even when @dma_base isn't 0. Pass
a limit address instead of a size, so callers don't have to fake a size
to work around the check.

The base and limit parameters are being phased out, because:
* they are redundant for x86 callers. dma-iommu already reserves the
  first page, and the upper limit is already in domain->geometry.
* they can now be obtained from dev->dma_range_map on Arm.
But removing them on Arm isn't completely straightforward so is left for
future work. As an intermediate step, simplify the x86 callers by
passing dummy limits.

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20210618152059.1194210-5-jean-philippe@linaro.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arm64/mm/dma-mapping.c |  2 +-
 drivers/iommu/amd/iommu.c   |  2 +-
 drivers/iommu/dma-iommu.c   | 12 ++++++------
 drivers/iommu/intel/iommu.c |  5 +----
 include/linux/dma-iommu.h   |  4 ++--
 5 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 4bf1dd3eb041..6719f9efea09 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -50,7 +50,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 
 	dev->dma_coherent = coherent;
 	if (iommu)
-		iommu_setup_dma_ops(dev, dma_base, size);
+		iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1);
 
 #ifdef CONFIG_XEN
 	if (xen_swiotlb_detect())
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 3ac42bbdefc6..216323fb27ef 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1713,7 +1713,7 @@ static void amd_iommu_probe_finalize(struct device *dev)
 	/* Domains are initialized for this device - have a look what we ended up with */
 	domain = iommu_get_domain_for_dev(dev);
 	if (domain->type == IOMMU_DOMAIN_DMA)
-		iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0);
+		iommu_setup_dma_ops(dev, 0, U64_MAX);
 	else
 		set_dma_ops(dev, NULL);
 }
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7bcdd1205535..c62e19bed302 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -319,16 +319,16 @@ static bool dev_is_untrusted(struct device *dev)
  * iommu_dma_init_domain - Initialise a DMA mapping domain
  * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
  * @base: IOVA at which the mappable address space starts
- * @size: Size of IOVA space
+ * @limit: Last address of the IOVA space
  * @dev: Device the domain is being initialised for
  *
- * @base and @size should be exact multiples of IOMMU page granularity to
+ * @base and @limit + 1 should be exact multiples of IOMMU page granularity to
  * avoid rounding surprises. If necessary, we reserve the page at address 0
  * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
  * any change which could make prior IOVAs invalid will fail.
  */
 static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
-		u64 size, struct device *dev)
+				 dma_addr_t limit, struct device *dev)
 {
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
 	unsigned long order, base_pfn;
@@ -346,7 +346,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 	/* Check the domain allows at least some access to the device... */
 	if (domain->geometry.force_aperture) {
 		if (base > domain->geometry.aperture_end ||
-		    base + size <= domain->geometry.aperture_start) {
+		    limit < domain->geometry.aperture_start) {
 			pr_warn("specified DMA range outside IOMMU capability\n");
 			return -EFAULT;
 		}
@@ -1308,7 +1308,7 @@ static const struct dma_map_ops iommu_dma_ops = {
  * The IOMMU core code allocates the default DMA domain, which the underlying
  * IOMMU driver needs to support via the dma-iommu layer.
  */
-void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size)
+void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
 {
 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 
@@ -1320,7 +1320,7 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size)
 	 * underlying IOMMU driver needs to support via the dma-iommu layer.
 	 */
 	if (domain->type == IOMMU_DOMAIN_DMA) {
-		if (iommu_dma_init_domain(domain, dma_base, size, dev))
+		if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
 			goto out_err;
 		dev->dma_ops = &iommu_dma_ops;
 	}
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index be35284a2016..2f7213f0e7a1 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5165,13 +5165,10 @@ static void intel_iommu_release_device(struct device *dev)
 
 static void intel_iommu_probe_finalize(struct device *dev)
 {
-	dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
-	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 
 	if (domain && domain->type == IOMMU_DOMAIN_DMA)
-		iommu_setup_dma_ops(dev, base,
-				    __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base);
+		iommu_setup_dma_ops(dev, 0, U64_MAX);
 	else
 		set_dma_ops(dev, NULL);
 }
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 6e75a2d689b4..758ca4694257 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -19,7 +19,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
 void iommu_put_dma_cookie(struct iommu_domain *domain);
 
 /* Setup call for arch DMA mapping code */
-void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size);
+void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
 
 /* The DMA API isn't _quite_ the whole story, though... */
 /*
@@ -50,7 +50,7 @@ struct msi_msg;
 struct device;
 
 static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base,
-		u64 size)
+				       u64 dma_limit)
 {
 }
 
-- 
cgit v1.2.3


From 87cf5127968ab3c543ebd98253052b928f9b47da Mon Sep 17 00:00:00 2001
From: Quan Nguyen <quan@os.amperecomputing.com>
Date: Wed, 19 May 2021 14:49:28 +0700
Subject: i2c: core-smbus: Expose PEC calculate function for generic use

Expose the PEC calculation i2c_smbus_pec() for generic use.

Signed-off-by: Quan Nguyen <quan@os.amperecomputing.com>
Acked-by: Matt Johnston <matt@codeconstruct.com.au>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-core-smbus.c | 12 ++++++++++--
 include/linux/i2c.h          |  1 +
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-smbus.c b/drivers/i2c/i2c-core-smbus.c
index d2d32c0fd8c3..e5b2d1465e7e 100644
--- a/drivers/i2c/i2c-core-smbus.c
+++ b/drivers/i2c/i2c-core-smbus.c
@@ -37,8 +37,15 @@ static u8 crc8(u16 data)
 	return (u8)(data >> 8);
 }
 
-/* Incremental CRC8 over count bytes in the array pointed to by p */
-static u8 i2c_smbus_pec(u8 crc, u8 *p, size_t count)
+/**
+ * i2c_smbus_pec - Incremental CRC8 over the given input data array
+ * @crc: previous return crc8 value
+ * @p: pointer to data buffer.
+ * @count: number of bytes in data buffer.
+ *
+ * Incremental CRC8 over count bytes in the array pointed to by p
+ */
+u8 i2c_smbus_pec(u8 crc, u8 *p, size_t count)
 {
 	int i;
 
@@ -46,6 +53,7 @@ static u8 i2c_smbus_pec(u8 crc, u8 *p, size_t count)
 		crc = crc8((crc ^ p[i]) << 8);
 	return crc;
 }
+EXPORT_SYMBOL(i2c_smbus_pec);
 
 /* Assume a 7-bit address, which is reasonable for SMBus */
 static u8 i2c_smbus_msg_pec(u8 pec, struct i2c_msg *msg)
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 953a4eecb88f..685f8c73d99e 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -148,6 +148,7 @@ s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
 /* Now follow the 'nice' access routines. These also document the calling
    conventions of i2c_smbus_xfer. */
 
+u8 i2c_smbus_pec(u8 crc, u8 *p, size_t count);
 s32 i2c_smbus_read_byte(const struct i2c_client *client);
 s32 i2c_smbus_write_byte(const struct i2c_client *client, u8 value);
 s32 i2c_smbus_read_byte_data(const struct i2c_client *client, u8 command);
-- 
cgit v1.2.3


From ff70202b2d1ad522275c6aadc8c53519b6a22c57 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 24 Jun 2021 10:05:05 +0200
Subject: dev_forward_skb: do not scrub skb mark within the same name space

The goal is to keep the mark during a bpf_redirect(), like it is done for
legacy encapsulation / decapsulation, when there is no x-netns.
This was initially done in commit 213dd74aee76 ("skbuff: Do not scrub skb
mark within the same name space").

When the call to skb_scrub_packet() was added in dev_forward_skb() (commit
8b27f27797ca ("skb: allow skb_scrub_packet() to be used by tunnels")), the
second argument (xnet) was set to true to force a call to skb_orphan(). At
this time, the mark was always cleanned up by skb_scrub_packet(), whatever
xnet value was.
This call to skb_orphan() was removed later in commit
9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb.").
But this 'true' stayed here without any real reason.

Let's correctly set xnet in ____dev_forward_skb(), this function has access
to the previous interface and to the new interface.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5cbc950b34df..5ab2d1917ca1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4114,7 +4114,7 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev,
 		return NET_RX_DROP;
 	}
 
-	skb_scrub_packet(skb, true);
+	skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev)));
 	skb->priority = 0;
 	return 0;
 }
-- 
cgit v1.2.3


From ac53c26433b51f1835ce5a935970e427d83e3ec5 Mon Sep 17 00:00:00 2001
From: Marcin Wojtas <mw@semihalf.com>
Date: Fri, 25 Jun 2021 12:38:53 +0200
Subject: net: mdiobus: withdraw fwnode_mdbiobus_register

The newly implemented fwnode_mdbiobus_register turned out to be
problematic - in case the fwnode_/of_/acpi_mdio are built as
modules, a dependency cycle can be observed during the depmod phase of
modules_install, eg.:

depmod: ERROR: Cycle detected: fwnode_mdio -> of_mdio -> fwnode_mdio
depmod: ERROR: Found 2 modules in dependency cycles!

OR:

depmod: ERROR: Cycle detected: acpi_mdio -> fwnode_mdio -> acpi_mdio
depmod: ERROR: Found 2 modules in dependency cycles!

A possible solution could be to rework fwnode_mdiobus_register,
so that to merge the contents of acpi_mdiobus_register and
of_mdiobus_register. However feasible, such change would
be very intrusive and affect huge amount of the of_mdiobus_register
users.

Since there are currently 2 users of ACPI and MDIO
(xgmac_mdio and mvmdio), withdraw the fwnode_mdbiobus_register
and roll back to a simple 'if' condition in affected drivers.

Fixes: 62a6ef6a996f ("net: mdiobus: Introduce fwnode_mdbiobus_register()")
Signed-off-by: Marcin Wojtas <mw@semihalf.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/Kconfig      |  4 +++-
 drivers/net/ethernet/freescale/xgmac_mdio.c | 11 +++++++++--
 drivers/net/ethernet/marvell/mvmdio.c       | 10 ++++++++--
 drivers/net/mdio/fwnode_mdio.c              | 22 ----------------------
 include/linux/fwnode_mdio.h                 | 12 ------------
 5 files changed, 20 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig
index 92a390576b88..2d1abdd58fab 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -67,7 +67,9 @@ config FSL_PQ_MDIO
 
 config FSL_XGMAC_MDIO
 	tristate "Freescale XGMAC MDIO"
-	depends on FWNODE_MDIO
+	select PHYLIB
+	depends on OF
+	select OF_MDIO
 	help
 	  This driver supports the MDIO bus on the Fman 10G Ethernet MACs, and
 	  on the FMan mEMAC (which supports both Clauses 22 and 45)
diff --git a/drivers/net/ethernet/freescale/xgmac_mdio.c b/drivers/net/ethernet/freescale/xgmac_mdio.c
index 2d99edc8a647..0b68852379da 100644
--- a/drivers/net/ethernet/freescale/xgmac_mdio.c
+++ b/drivers/net/ethernet/freescale/xgmac_mdio.c
@@ -13,7 +13,7 @@
  */
 
 #include <linux/acpi.h>
-#include <linux/fwnode_mdio.h>
+#include <linux/acpi_mdio.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/mdio.h>
@@ -246,6 +246,7 @@ static int xgmac_mdio_read(struct mii_bus *bus, int phy_id, int regnum)
 
 static int xgmac_mdio_probe(struct platform_device *pdev)
 {
+	struct fwnode_handle *fwnode;
 	struct mdio_fsl_priv *priv;
 	struct resource *res;
 	struct mii_bus *bus;
@@ -290,7 +291,13 @@ static int xgmac_mdio_probe(struct platform_device *pdev)
 	priv->has_a011043 = device_property_read_bool(&pdev->dev,
 						      "fsl,erratum-a011043");
 
-	ret = fwnode_mdiobus_register(bus, pdev->dev.fwnode);
+	fwnode = pdev->dev.fwnode;
+	if (is_of_node(fwnode))
+		ret = of_mdiobus_register(bus, to_of_node(fwnode));
+	else if (is_acpi_node(fwnode))
+		ret = acpi_mdiobus_register(bus, fwnode);
+	else
+		ret = -EINVAL;
 	if (ret) {
 		dev_err(&pdev->dev, "cannot register MDIO bus\n");
 		goto err_registration;
diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
index 7537ee3f6622..62a97c46fba0 100644
--- a/drivers/net/ethernet/marvell/mvmdio.c
+++ b/drivers/net/ethernet/marvell/mvmdio.c
@@ -18,9 +18,9 @@
  */
 
 #include <linux/acpi.h>
+#include <linux/acpi_mdio.h>
 #include <linux/clk.h>
 #include <linux/delay.h>
-#include <linux/fwnode_mdio.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
@@ -371,7 +371,13 @@ static int orion_mdio_probe(struct platform_device *pdev)
 		goto out_mdio;
 	}
 
-	ret = fwnode_mdiobus_register(bus, pdev->dev.fwnode);
+	/* For the platforms not supporting DT/ACPI fall-back
+	 * to mdiobus_register via of_mdiobus_register.
+	 */
+	if (is_acpi_node(pdev->dev.fwnode))
+		ret = acpi_mdiobus_register(bus, pdev->dev.fwnode);
+	else
+		ret = of_mdiobus_register(bus, pdev->dev.of_node);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "Cannot register MDIO bus (%d)\n", ret);
 		goto out_mdio;
diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c
index ae0bf71a9932..1becb1a731f6 100644
--- a/drivers/net/mdio/fwnode_mdio.c
+++ b/drivers/net/mdio/fwnode_mdio.c
@@ -7,10 +7,8 @@
  */
 
 #include <linux/acpi.h>
-#include <linux/acpi_mdio.h>
 #include <linux/fwnode_mdio.h>
 #include <linux/of.h>
-#include <linux/of_mdio.h>
 #include <linux/phy.h>
 
 MODULE_AUTHOR("Calvin Johnson <calvin.johnson@oss.nxp.com>");
@@ -144,23 +142,3 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 	return 0;
 }
 EXPORT_SYMBOL(fwnode_mdiobus_register_phy);
-
-/**
- * fwnode_mdiobus_register - bring up all the PHYs on a given MDIO bus and
- *	attach them to it.
- * @bus: Target MDIO bus.
- * @fwnode: Pointer to fwnode of the MDIO controller.
- *
- * Return values are determined accordingly to acpi_/of_ mdiobus_register()
- * operation.
- */
-int fwnode_mdiobus_register(struct mii_bus *bus, struct fwnode_handle *fwnode)
-{
-	if (is_acpi_node(fwnode))
-		return acpi_mdiobus_register(bus, fwnode);
-	else if (is_of_node(fwnode))
-		return of_mdiobus_register(bus, to_of_node(fwnode));
-	else
-		return -EINVAL;
-}
-EXPORT_SYMBOL(fwnode_mdiobus_register);
diff --git a/include/linux/fwnode_mdio.h b/include/linux/fwnode_mdio.h
index f62817c23137..faf603c48c86 100644
--- a/include/linux/fwnode_mdio.h
+++ b/include/linux/fwnode_mdio.h
@@ -16,7 +16,6 @@ int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio,
 int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 				struct fwnode_handle *child, u32 addr);
 
-int fwnode_mdiobus_register(struct mii_bus *bus, struct fwnode_handle *fwnode);
 #else /* CONFIG_FWNODE_MDIO */
 int fwnode_mdiobus_phy_device_register(struct mii_bus *mdio,
 				       struct phy_device *phy,
@@ -31,17 +30,6 @@ static inline int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 {
 	return -EINVAL;
 }
-
-static inline int fwnode_mdiobus_register(struct mii_bus *bus,
-					  struct fwnode_handle *fwnode)
-{
-	/*
-	 * Fall back to mdiobus_register() function to register a bus.
-	 * This way, we don't have to keep compat bits around in drivers.
-	 */
-
-	return mdiobus_register(bus);
-}
 #endif
 
 #endif /* __LINUX_FWNODE_MDIO_H */
-- 
cgit v1.2.3


From bce29ac9ce0bb0b0b146b687ab978378c21e9078 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Tue, 22 Jun 2021 16:42:27 +0200
Subject: trace: Add osnoise tracer

In the context of high-performance computing (HPC), the Operating System
Noise (*osnoise*) refers to the interference experienced by an application
due to activities inside the operating system. In the context of Linux,
NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the
system. Moreover, hardware-related jobs can also cause noise, for example,
via SMIs.

The osnoise tracer leverages the hwlat_detector by running a similar
loop with preemption, SoftIRQs and IRQs enabled, thus allowing all
the sources of *osnoise* during its execution. Using the same approach
of hwlat, osnoise takes note of the entry and exit point of any
source of interferences, increasing a per-cpu interference counter. The
osnoise tracer also saves an interference counter for each source of
interference. The interference counter for NMI, IRQs, SoftIRQs, and
threads is increased anytime the tool observes these interferences' entry
events. When a noise happens without any interference from the operating
system level, the hardware noise counter increases, pointing to a
hardware-related noise. In this way, osnoise can account for any
source of interference. At the end of the period, the osnoise tracer
prints the sum of all noise, the max single noise, the percentage of CPU
available for the thread, and the counters for the noise sources.

Usage

Write the ASCII text "osnoise" into the current_tracer file of the
tracing system (generally mounted at /sys/kernel/tracing).

For example::

        [root@f32 ~]# cd /sys/kernel/tracing/
        [root@f32 tracing]# echo osnoise > current_tracer

It is possible to follow the trace by reading the trace trace file::

        [root@f32 tracing]# cat trace
        # tracer: osnoise
        #
        #                                _-----=> irqs-off
        #                               / _----=> need-resched
        #                              | / _---=> hardirq/softirq
        #                              || / _--=> preempt-depth                            MAX
        #                              || /                                             SINGLE     Interference counters:
        #                              ||||               RUNTIME      NOISE   % OF CPU  NOISE    +-----------------------------+
        #           TASK-PID      CPU# ||||   TIMESTAMP    IN US       IN US  AVAILABLE  IN US     HW    NMI    IRQ   SIRQ THREAD
        #              | |         |   ||||      |           |             |    |            |      |      |      |      |      |
                   <...>-859     [000] ....    81.637220: 1000000        190  99.98100       9     18      0   1007     18      1
                   <...>-860     [001] ....    81.638154: 1000000        656  99.93440      74     23      0   1006     16      3
                   <...>-861     [002] ....    81.638193: 1000000       5675  99.43250     202      6      0   1013     25     21
                   <...>-862     [003] ....    81.638242: 1000000        125  99.98750      45      1      0   1011     23      0
                   <...>-863     [004] ....    81.638260: 1000000       1721  99.82790     168      7      0   1002     49     41
                   <...>-864     [005] ....    81.638286: 1000000        263  99.97370      57      6      0   1006     26      2
                   <...>-865     [006] ....    81.638302: 1000000        109  99.98910      21      3      0   1006     18      1
                   <...>-866     [007] ....    81.638326: 1000000       7816  99.21840     107      8      0   1016     39     19

In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the
tracer prints a message at the end of each period for each CPU that is
running an osnoise/CPU thread. The osnoise specific fields report:

 - The RUNTIME IN USE reports the amount of time in microseconds that
   the osnoise thread kept looping reading the time.
 - The NOISE IN US reports the sum of noise in microseconds observed
   by the osnoise tracer during the associated runtime.
 - The % OF CPU AVAILABLE reports the percentage of CPU available for
   the osnoise thread during the runtime window.
 - The MAX SINGLE NOISE IN US reports the maximum single noise observed
   during the runtime window.
 - The Interference counters display how many each of the respective
   interference happened during the runtime window.

Note that the example above shows a high number of HW noise samples.
The reason being is that this sample was taken on a virtual machine,
and the host interference is detected as a hardware interference.

Tracer options

The tracer has a set of options inside the osnoise directory, they are:

 - osnoise/cpus: CPUs at which a osnoise thread will execute.
 - osnoise/period_us: the period of the osnoise thread.
 - osnoise/runtime_us: how long an osnoise thread will look for noise.
 - osnoise/stop_tracing_us: stop the system tracing if a single noise
   higher than the configured value happens. Writing 0 disables this
   option.
 - osnoise/stop_tracing_total_us: stop the system tracing if total noise
   higher than the configured value happens. Writing 0 disables this
   option.
 - tracing_threshold: the minimum delta between two time() reads to be
   considered as noise, in us. When set to 0, the default value will
   be used, which is currently 5 us.

Additional Tracing

In addition to the tracer, a set of tracepoints were added to
facilitate the identification of the osnoise source.

 - osnoise:sample_threshold: printed anytime a noise is higher than
   the configurable tolerance_ns.
 - osnoise:nmi_noise: noise from NMI, including the duration.
 - osnoise:irq_noise: noise from an IRQ, including the duration.
 - osnoise:softirq_noise: noise from a SoftIRQ, including the
   duration.
 - osnoise:thread_noise: noise from a thread, including the duration.

Note that all the values are *net values*. For example, if while osnoise
is running, another thread preempts the osnoise thread, it will start a
thread_noise duration at the start. Then, an IRQ takes place, preempting
the thread_noise, starting a irq_noise. When the IRQ ends its execution,
it will compute its duration, and this duration will be subtracted from
the thread_noise, in such a way as to avoid the double accounting of the
IRQ execution. This logic is valid for all sources of noise.

Here is one example of the usage of these tracepoints::

       osnoise/8-961     [008] d.h.  5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns
       osnoise/8-961     [008] dNh.  5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns
     migration/8-54      [008] d...  5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns
       osnoise/8-961     [008] ....  5789.858413: sample_threshold: start 5789.858404555 duration 8723 ns interferences 2

In this example, a noise sample of 8 microseconds was reported in the last
line, pointing to two interferences. Looking backward in the trace, the
two previous entries were about the migration thread running after a
timer IRQ execution. The first event is not part of the noise because
it took place one millisecond before.

It is worth noticing that the sum of the duration reported in the
tracepoints is smaller than eight us reported in the sample_threshold.
The reason roots in the overhead of the entry and exit code that happens
before and after any interference execution. This justifies the dual
approach: measuring thread and tracing.

Link: https://lkml.kernel.org/r/e649467042d60e7b62714c9c6751a56299d15119.1624372313.git.bristot@redhat.com

Cc: Phil Auld <pauld@redhat.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Kate Carcia <kcarcia@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexandre Chartre <alexandre.chartre@oracle.com>
Cc: Clark Willaims <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
[
  Made the following functions static:
   trace_irqentry_callback()
   trace_irqexit_callback()
   trace_intel_irqentry_callback()
   trace_intel_irqexit_callback()

  Added to include/trace.h:
   osnoise_arch_register()
   osnoise_arch_unregister()

  Fixed define logic for LATENCY_FS_NOTIFY

  Reported-by: kernel test robot <lkp@intel.com>
]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/trace/index.rst          |    1 +
 Documentation/trace/osnoise-tracer.rst |  152 ++++
 arch/x86/kernel/Makefile               |    1 +
 arch/x86/kernel/trace.c                |  237 ++++++
 include/linux/ftrace_irq.h             |   13 +
 include/linux/trace.h                  |    5 +
 include/trace/events/osnoise.h         |  142 ++++
 kernel/trace/Kconfig                   |   34 +
 kernel/trace/Makefile                  |    1 +
 kernel/trace/trace.h                   |    9 +-
 kernel/trace/trace_entries.h           |   25 +
 kernel/trace/trace_osnoise.c           | 1384 ++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.c            |   72 +-
 13 files changed, 2072 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/trace/osnoise-tracer.rst
 create mode 100644 arch/x86/kernel/trace.c
 create mode 100644 include/trace/events/osnoise.h
 create mode 100644 kernel/trace/trace_osnoise.c

(limited to 'include/linux')

diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst
index f634b36fd3aa..608107b27cc0 100644
--- a/Documentation/trace/index.rst
+++ b/Documentation/trace/index.rst
@@ -23,6 +23,7 @@ Linux Tracing Technologies
    histogram-design
    boottime-trace
    hwlat_detector
+   osnoise-tracer
    intel_th
    ring-buffer-design
    stm
diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst
new file mode 100644
index 000000000000..37a3c10fb216
--- /dev/null
+++ b/Documentation/trace/osnoise-tracer.rst
@@ -0,0 +1,152 @@
+==============
+OSNOISE Tracer
+==============
+
+In the context of high-performance computing (HPC), the Operating System
+Noise (*osnoise*) refers to the interference experienced by an application
+due to activities inside the operating system. In the context of Linux,
+NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the
+system. Moreover, hardware-related jobs can also cause noise, for example,
+via SMIs.
+
+hwlat_detector is one of the tools used to identify the most complex
+source of noise: *hardware noise*.
+
+In a nutshell, the hwlat_detector creates a thread that runs
+periodically for a given period. At the beginning of a period, the thread
+disables interrupt and starts sampling. While running, the hwlatd
+thread reads the time in a loop. As interrupts are disabled, threads,
+IRQs, and SoftIRQs cannot interfere with the hwlatd thread. Hence, the
+cause of any gap between two different reads of the time roots either on
+NMI or in the hardware itself. At the end of the period, hwlatd enables
+interrupts and reports the max observed gap between the reads. It also
+prints a NMI occurrence counter. If the output does not report NMI
+executions, the user can conclude that the hardware is the culprit for
+the latency. The hwlat detects the NMI execution by observing
+the entry and exit of a NMI.
+
+The osnoise tracer leverages the hwlat_detector by running a
+similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing
+all the sources of *osnoise* during its execution. Using the same approach
+of hwlat, osnoise takes note of the entry and exit point of any
+source of interferences, increasing a per-cpu interference counter. The
+osnoise tracer also saves an interference counter for each source of
+interference. The interference counter for NMI, IRQs, SoftIRQs, and
+threads is increased anytime the tool observes these interferences' entry
+events. When a noise happens without any interference from the operating
+system level, the hardware noise counter increases, pointing to a
+hardware-related noise. In this way, osnoise can account for any
+source of interference. At the end of the period, the osnoise tracer
+prints the sum of all noise, the max single noise, the percentage of CPU
+available for the thread, and the counters for the noise sources.
+
+Usage
+-----
+
+Write the ASCII text "osnoise" into the current_tracer file of the
+tracing system (generally mounted at /sys/kernel/tracing).
+
+For example::
+
+        [root@f32 ~]# cd /sys/kernel/tracing/
+        [root@f32 tracing]# echo osnoise > current_tracer
+
+It is possible to follow the trace by reading the trace trace file::
+
+        [root@f32 tracing]# cat trace
+        # tracer: osnoise
+        #
+        #                                _-----=> irqs-off
+        #                               / _----=> need-resched
+        #                              | / _---=> hardirq/softirq
+        #                              || / _--=> preempt-depth                            MAX
+        #                              || /                                             SINGLE     Interference counters:
+        #                              ||||               RUNTIME      NOISE   % OF CPU  NOISE    +-----------------------------+
+        #           TASK-PID      CPU# ||||   TIMESTAMP    IN US       IN US  AVAILABLE  IN US     HW    NMI    IRQ   SIRQ THREAD
+        #              | |         |   ||||      |           |             |    |            |      |      |      |      |      |
+                   <...>-859     [000] ....    81.637220: 1000000        190  99.98100       9     18      0   1007     18      1
+                   <...>-860     [001] ....    81.638154: 1000000        656  99.93440      74     23      0   1006     16      3
+                   <...>-861     [002] ....    81.638193: 1000000       5675  99.43250     202      6      0   1013     25     21
+                   <...>-862     [003] ....    81.638242: 1000000        125  99.98750      45      1      0   1011     23      0
+                   <...>-863     [004] ....    81.638260: 1000000       1721  99.82790     168      7      0   1002     49     41
+                   <...>-864     [005] ....    81.638286: 1000000        263  99.97370      57      6      0   1006     26      2
+                   <...>-865     [006] ....    81.638302: 1000000        109  99.98910      21      3      0   1006     18      1
+                   <...>-866     [007] ....    81.638326: 1000000       7816  99.21840     107      8      0   1016     39     19
+
+In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the
+tracer prints a message at the end of each period for each CPU that is
+running an osnoise/ thread. The osnoise specific fields report:
+
+ - The RUNTIME IN USE reports the amount of time in microseconds that
+   the osnoise thread kept looping reading the time.
+ - The NOISE IN US reports the sum of noise in microseconds observed
+   by the osnoise tracer during the associated runtime.
+ - The % OF CPU AVAILABLE reports the percentage of CPU available for
+   the osnoise thread during the runtime window.
+ - The MAX SINGLE NOISE IN US reports the maximum single noise observed
+   during the runtime window.
+ - The Interference counters display how many each of the respective
+   interference happened during the runtime window.
+
+Note that the example above shows a high number of HW noise samples.
+The reason being is that this sample was taken on a virtual machine,
+and the host interference is detected as a hardware interference.
+
+Tracer options
+---------------------
+
+The tracer has a set of options inside the osnoise directory, they are:
+
+ - osnoise/cpus: CPUs at which a osnoise thread will execute.
+ - osnoise/period_us: the period of the osnoise thread.
+ - osnoise/runtime_us: how long an osnoise thread will look for noise.
+ - osnoise/stop_tracing_us: stop the system tracing if a single noise
+   higher than the configured value happens. Writing 0 disables this
+   option.
+ - osnoise/stop_tracing_total_us: stop the system tracing if total noise
+   higher than the configured value happens. Writing 0 disables this
+   option.
+ - tracing_threshold: the minimum delta between two time() reads to be
+   considered as noise, in us. When set to 0, the default value will
+   will be used, which is currently 5 us.
+
+Additional Tracing
+------------------
+
+In addition to the tracer, a set of tracepoints were added to
+facilitate the identification of the osnoise source.
+
+ - osnoise:sample_threshold: printed anytime a noise is higher than
+   the configurable tolerance_ns.
+ - osnoise:nmi_noise: noise from NMI, including the duration.
+ - osnoise:irq_noise: noise from an IRQ, including the duration.
+ - osnoise:softirq_noise: noise from a SoftIRQ, including the
+   duration.
+ - osnoise:thread_noise: noise from a thread, including the duration.
+
+Note that all the values are *net values*. For example, if while osnoise
+is running, another thread preempts the osnoise thread, it will start a
+thread_noise duration at the start. Then, an IRQ takes place, preempting
+the thread_noise, starting a irq_noise. When the IRQ ends its execution,
+it will compute its duration, and this duration will be subtracted from
+the thread_noise, in such a way as to avoid the double accounting of the
+IRQ execution. This logic is valid for all sources of noise.
+
+Here is one example of the usage of these tracepoints::
+
+       osnoise/8-961     [008] d.h.  5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns
+       osnoise/8-961     [008] dNh.  5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns
+     migration/8-54      [008] d...  5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns
+       osnoise/8-961     [008] ....  5789.858413: sample_threshold: start 5789.858404555 duration 8812 ns interferences 2
+
+In this example, a noise sample of 8 microseconds was reported in the last
+line, pointing to two interferences. Looking backward in the trace, the
+two previous entries were about the migration thread running after a
+timer IRQ execution. The first event is not part of the noise because
+it took place one millisecond before.
+
+It is worth noticing that the sum of the duration reported in the
+tracepoints is smaller than eight us reported in the sample_threshold.
+The reason roots in the overhead of the entry and exit code that happens
+before and after any interference execution. This justifies the dual
+approach: measuring thread and tracing.
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f66682ac02a..3e625c61f008 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_FUNCTION_TRACER)	+= ftrace_$(BITS).o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
+obj-$(CONFIG_TRACING)		+= trace.o
 obj-$(CONFIG_CRASH_CORE)	+= crash_core_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_CORE)	+= relocate_kernel_$(BITS).o crash.o
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
new file mode 100644
index 000000000000..6912672c33a7
--- /dev/null
+++ b/arch/x86/kernel/trace.c
@@ -0,0 +1,237 @@
+#include <asm/trace/irq_vectors.h>
+#include <linux/trace.h>
+
+#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC)
+extern void osnoise_trace_irq_entry(int id);
+extern void osnoise_trace_irq_exit(int id, const char *desc);
+
+/*
+ * trace_intel_irq_entry - record intel specific IRQ entry
+ */
+static void trace_intel_irq_entry(void *data, int vector)
+{
+	osnoise_trace_irq_entry(vector);
+}
+
+/*
+ * trace_intel_irq_exit - record intel specific IRQ exit
+ */
+static void trace_intel_irq_exit(void *data, int vector)
+{
+	char *vector_desc = (char *) data;
+
+	osnoise_trace_irq_exit(vector, vector_desc);
+}
+
+/*
+ * register_intel_irq_tp - Register intel specific IRQ entry tracepoints
+ */
+int osnoise_arch_register(void)
+{
+	int ret;
+
+	ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_err;
+
+	ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+	if (ret)
+		goto out_timer_entry;
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+	ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_timer_exit;
+
+	ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+	if (ret)
+		goto out_thermal_entry;
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+#ifdef CONFIG_X86_MCE_AMD
+	ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_thermal_exit;
+
+	ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+	if (ret)
+		goto out_deferred_entry;
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+	ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_deferred_exit;
+
+	ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+	if (ret)
+		goto out_threshold_entry;
+#endif /* CONFIG_X86_MCE_THRESHOLD */
+
+#ifdef CONFIG_SMP
+	ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_threshold_exit;
+
+	ret = register_trace_call_function_single_exit(trace_intel_irq_exit,
+						       "call_function_single");
+	if (ret)
+		goto out_call_function_single_entry;
+
+	ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_call_function_single_exit;
+
+	ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+	if (ret)
+		goto out_call_function_entry;
+
+	ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_call_function_exit;
+
+	ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+	if (ret)
+		goto out_reschedule_entry;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_IRQ_WORK
+	ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_reschedule_exit;
+
+	ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+	if (ret)
+		goto out_irq_work_entry;
+#endif
+
+	ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_irq_work_exit;
+
+	ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+	if (ret)
+		goto out_x86_ipi_entry;
+
+	ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_x86_ipi_exit;
+
+	ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+	if (ret)
+		goto out_error_apic_entry;
+
+	ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+	if (ret)
+		goto out_error_apic_exit;
+
+	ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+	if (ret)
+		goto out_spurious_apic_entry;
+
+	return 0;
+
+out_spurious_apic_entry:
+	unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+out_error_apic_exit:
+	unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+out_error_apic_entry:
+	unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+out_x86_ipi_exit:
+	unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+out_x86_ipi_entry:
+	unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+out_irq_work_exit:
+
+#ifdef CONFIG_IRQ_WORK
+	unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+out_irq_work_entry:
+	unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+out_reschedule_exit:
+#endif
+
+#ifdef CONFIG_SMP
+	unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+out_reschedule_entry:
+	unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+out_call_function_exit:
+	unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+out_call_function_entry:
+	unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+out_call_function_single_exit:
+	unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+out_call_function_single_entry:
+	unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+out_threshold_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+	unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+out_threshold_entry:
+	unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+out_deferred_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+	unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+out_deferred_entry:
+	unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+out_thermal_exit:
+#endif /* CONFIG_X86_MCE_AMD */
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+	unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+out_thermal_entry:
+	unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+out_timer_exit:
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+	unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+out_timer_entry:
+	unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+out_err:
+	return -EINVAL;
+}
+
+void osnoise_arch_unregister(void)
+{
+	unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+	unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+	unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+	unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+	unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+	unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+
+#ifdef CONFIG_IRQ_WORK
+	unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+	unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_SMP
+	unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+	unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+	unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+	unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+	unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+	unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+	unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+	unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+	unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+	unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+	unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+	unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+	unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+	unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+}
+#endif /* CONFIG_OSNOISE_TRAECR && CONFIG_X86_LOCAL_APIC */
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index 0abd9a1d2852..f6faa31289ba 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -7,12 +7,21 @@ extern bool trace_hwlat_callback_enabled;
 extern void trace_hwlat_callback(bool enter);
 #endif
 
+#ifdef CONFIG_OSNOISE_TRACER
+extern bool trace_osnoise_callback_enabled;
+extern void trace_osnoise_callback(bool enter);
+#endif
+
 static inline void ftrace_nmi_enter(void)
 {
 #ifdef CONFIG_HWLAT_TRACER
 	if (trace_hwlat_callback_enabled)
 		trace_hwlat_callback(true);
 #endif
+#ifdef CONFIG_OSNOISE_TRACER
+	if (trace_osnoise_callback_enabled)
+		trace_osnoise_callback(true);
+#endif
 }
 
 static inline void ftrace_nmi_exit(void)
@@ -21,6 +30,10 @@ static inline void ftrace_nmi_exit(void)
 	if (trace_hwlat_callback_enabled)
 		trace_hwlat_callback(false);
 #endif
+#ifdef CONFIG_OSNOISE_TRACER
+	if (trace_osnoise_callback_enabled)
+		trace_osnoise_callback(false);
+#endif
 }
 
 #endif /* _LINUX_FTRACE_IRQ_H */
diff --git a/include/linux/trace.h b/include/linux/trace.h
index be1e130ed87c..4e3858640c47 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -41,6 +41,11 @@ int trace_array_init_printk(struct trace_array *tr);
 void trace_array_put(struct trace_array *tr);
 struct trace_array *trace_array_get_by_name(const char *name);
 int trace_array_destroy(struct trace_array *tr);
+
+/* For osnoise tracer */
+int osnoise_arch_register(void);
+void osnoise_arch_unregister(void);
+
 #endif	/* CONFIG_TRACING */
 
 #endif	/* _LINUX_TRACE_H */
diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h
new file mode 100644
index 000000000000..28762c69f6c9
--- /dev/null
+++ b/include/trace/events/osnoise.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM osnoise
+
+#if !defined(_OSNOISE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _OSNOISE_TRACE_H
+
+#include <linux/tracepoint.h>
+TRACE_EVENT(thread_noise,
+
+	TP_PROTO(struct task_struct *t, u64 start, u64 duration),
+
+	TP_ARGS(t, start, duration),
+
+	TP_STRUCT__entry(
+		__array(	char,		comm,	TASK_COMM_LEN)
+		__field(	u64,		start	)
+		__field(	u64,		duration)
+		__field(	pid_t,		pid	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid = t->pid;
+		__entry->start = start;
+		__entry->duration = duration;
+	),
+
+	TP_printk("%8s:%d start %llu.%09u duration %llu ns",
+		__entry->comm,
+		__entry->pid,
+		__print_ns_to_secs(__entry->start),
+		__print_ns_without_secs(__entry->start),
+		__entry->duration)
+);
+
+TRACE_EVENT(softirq_noise,
+
+	TP_PROTO(int vector, u64 start, u64 duration),
+
+	TP_ARGS(vector, start, duration),
+
+	TP_STRUCT__entry(
+		__field(	u64,		start	)
+		__field(	u64,		duration)
+		__field(	int,		vector	)
+	),
+
+	TP_fast_assign(
+		__entry->vector = vector;
+		__entry->start = start;
+		__entry->duration = duration;
+	),
+
+	TP_printk("%8s:%d start %llu.%09u duration %llu ns",
+		show_softirq_name(__entry->vector),
+		__entry->vector,
+		__print_ns_to_secs(__entry->start),
+		__print_ns_without_secs(__entry->start),
+		__entry->duration)
+);
+
+TRACE_EVENT(irq_noise,
+
+	TP_PROTO(int vector, const char *desc, u64 start, u64 duration),
+
+	TP_ARGS(vector, desc, start, duration),
+
+	TP_STRUCT__entry(
+		__field(	u64,		start	)
+		__field(	u64,		duration)
+		__string(	desc,		desc    )
+		__field(	int,		vector	)
+
+	),
+
+	TP_fast_assign(
+		__assign_str(desc, desc);
+		__entry->vector = vector;
+		__entry->start = start;
+		__entry->duration = duration;
+	),
+
+	TP_printk("%s:%d start %llu.%09u duration %llu ns",
+		__get_str(desc),
+		__entry->vector,
+		__print_ns_to_secs(__entry->start),
+		__print_ns_without_secs(__entry->start),
+		__entry->duration)
+);
+
+TRACE_EVENT(nmi_noise,
+
+	TP_PROTO(u64 start, u64 duration),
+
+	TP_ARGS(start, duration),
+
+	TP_STRUCT__entry(
+		__field(	u64,		start	)
+		__field(	u64,		duration)
+	),
+
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->duration = duration;
+	),
+
+	TP_printk("start %llu.%09u duration %llu ns",
+		__print_ns_to_secs(__entry->start),
+		__print_ns_without_secs(__entry->start),
+		__entry->duration)
+);
+
+TRACE_EVENT(sample_threshold,
+
+	TP_PROTO(u64 start, u64 duration, u64 interference),
+
+	TP_ARGS(start, duration, interference),
+
+	TP_STRUCT__entry(
+		__field(	u64,		start	)
+		__field(	u64,		duration)
+		__field(	u64,		interference)
+	),
+
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->duration = duration;
+		__entry->interference = interference;
+	),
+
+	TP_printk("start %llu.%09u duration %llu ns interferences %llu",
+		__print_ns_to_secs(__entry->start),
+		__print_ns_without_secs(__entry->start),
+		__entry->duration,
+		__entry->interference)
+);
+
+#endif /* _TRACE_OSNOISE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7fa82778c3e6..41582ae4682b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -356,6 +356,40 @@ config HWLAT_TRACER
 	 file. Every time a latency is greater than tracing_thresh, it will
 	 be recorded into the ring buffer.
 
+config OSNOISE_TRACER
+	bool "OS Noise tracer"
+	select GENERIC_TRACER
+	help
+	  In the context of high-performance computing (HPC), the Operating
+	  System Noise (osnoise) refers to the interference experienced by an
+	  application due to activities inside the operating system. In the
+	  context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread
+	  can cause noise to the system. Moreover, hardware-related jobs can
+	  also cause noise, for example, via SMIs.
+
+	  The osnoise tracer leverages the hwlat_detector by running a similar
+	  loop with preemption, SoftIRQs and IRQs enabled, thus allowing all
+	  the sources of osnoise during its execution. The osnoise tracer takes
+	  note of the entry and exit point of any source of interferences,
+	  increasing a per-cpu interference counter. It saves an interference
+	  counter for each source of interference. The interference counter for
+	  NMI, IRQs, SoftIRQs, and threads is increased anytime the tool
+	  observes these interferences' entry events. When a noise happens
+	  without any interference from the operating system level, the
+	  hardware noise counter increases, pointing to a hardware-related
+	  noise. In this way, osnoise can account for any source of
+	  interference. At the end of the period, the osnoise tracer prints
+	  the sum of all noise, the max single noise, the percentage of CPU
+	  available for the thread, and the counters for the noise sources.
+
+	  In addition to the tracer, a set of tracepoints were added to
+	  facilitate the identification of the osnoise source.
+
+	  The output will appear in the trace and trace_pipe files.
+
+	  To enable this tracer, echo in "osnoise" into the current_tracer
+          file.
+
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
 	depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index b28d3e5013cd..b1c47ccf4f73 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
+obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o
 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 87588d1e24ca..b959c9ec9711 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -44,6 +44,7 @@ enum trace_type {
 	TRACE_BLK,
 	TRACE_BPUTS,
 	TRACE_HWLAT,
+	TRACE_OSNOISE,
 	TRACE_RAW_DATA,
 	TRACE_FUNC_REPEATS,
 
@@ -297,7 +298,8 @@ struct trace_array {
 	struct array_buffer	max_buffer;
 	bool			allocated_snapshot;
 #endif
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+	|| defined(CONFIG_OSNOISE_TRACER)
 	unsigned long		max_latency;
 #ifdef CONFIG_FSNOTIFY
 	struct dentry		*d_max_latency;
@@ -445,6 +447,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
 		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\
 		IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT);	\
+		IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\
 		IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
@@ -675,8 +678,8 @@ void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 #endif /* CONFIG_TRACER_MAX_TRACE */
 
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
-	defined(CONFIG_FSNOTIFY)
+#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+	|| defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY)
 #define LATENCY_FS_NOTIFY
 #endif
 
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 251c819cf0c5..158c0984b59b 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -360,3 +360,28 @@ FTRACE_ENTRY(func_repeats, func_repeats_entry,
 		 __entry->count,
 		 FUNC_REPEATS_GET_DELTA_TS(__entry))
 );
+
+FTRACE_ENTRY(osnoise, osnoise_entry,
+
+	TRACE_OSNOISE,
+
+	F_STRUCT(
+		__field(	u64,			noise		)
+		__field(	u64,			runtime		)
+		__field(	u64,			max_sample	)
+		__field(	unsigned int,		hw_count	)
+		__field(	unsigned int,		nmi_count	)
+		__field(	unsigned int,		irq_count	)
+		__field(	unsigned int,		softirq_count	)
+		__field(	unsigned int,		thread_count	)
+	),
+
+	F_printk("noise:%llu\tmax_sample:%llu\thw:%u\tnmi:%u\tirq:%u\tsoftirq:%u\tthread:%u\n",
+		 __entry->noise,
+		 __entry->max_sample,
+		 __entry->hw_count,
+		 __entry->nmi_count,
+		 __entry->irq_count,
+		 __entry->softirq_count,
+		 __entry->thread_count)
+);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
new file mode 100644
index 000000000000..4e2c47dc4f19
--- /dev/null
+++ b/kernel/trace/trace_osnoise.c
@@ -0,0 +1,1384 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OS Noise Tracer: computes the OS Noise suffered by a running thread.
+ *
+ * Based on "hwlat_detector" tracer by:
+ *   Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
+ *   Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
+ *   With feedback from Clark Williams <williams@redhat.com>
+ *
+ * And also based on the rtsl tracer presented on:
+ *  DE OLIVEIRA, Daniel Bristot, et al. Demystifying the real-time linux
+ *  scheduling latency. In: 32nd Euromicro Conference on Real-Time Systems
+ *  (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum fur Informatik, 2020.
+ *
+ * Copyright (C) 2021 Daniel Bristot de Oliveira, Red Hat, Inc. <bristot@redhat.com>
+ */
+
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/sched/clock.h>
+#include <linux/sched.h>
+#include "trace.h"
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#include <trace/events/irq.h>
+#include <trace/events/sched.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/osnoise.h>
+
+static struct trace_array	*osnoise_trace;
+
+/*
+ * Default values.
+ */
+#define BANNER			"osnoise: "
+#define DEFAULT_SAMPLE_PERIOD	1000000			/* 1s */
+#define DEFAULT_SAMPLE_RUNTIME	1000000			/* 1s */
+
+/*
+ * NMI runtime info.
+ */
+struct osn_nmi {
+	u64	count;
+	u64	delta_start;
+};
+
+/*
+ * IRQ runtime info.
+ */
+struct osn_irq {
+	u64	count;
+	u64	arrival_time;
+	u64	delta_start;
+};
+
+/*
+ * sofirq runtime info.
+ */
+struct osn_softirq {
+	u64	count;
+	u64	arrival_time;
+	u64	delta_start;
+};
+
+/*
+ * thread runtime info.
+ */
+struct osn_thread {
+	u64	count;
+	u64	arrival_time;
+	u64	delta_start;
+};
+
+/*
+ * Runtime information: this structure saves the runtime information used by
+ * one sampling thread.
+ */
+struct osnoise_variables {
+	struct task_struct	*kthread;
+	bool			sampling;
+	pid_t			pid;
+	struct osn_nmi		nmi;
+	struct osn_irq		irq;
+	struct osn_softirq	softirq;
+	struct osn_thread	thread;
+	local_t			int_counter;
+};
+
+/*
+ * Per-cpu runtime information.
+ */
+DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var);
+
+/*
+ * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU
+ */
+static inline struct osnoise_variables *this_cpu_osn_var(void)
+{
+	return this_cpu_ptr(&per_cpu_osnoise_var);
+}
+
+/*
+ * osn_var_reset - Reset the values of the given osnoise_variables
+ */
+static inline void osn_var_reset(struct osnoise_variables *osn_var)
+{
+	/*
+	 * So far, all the values are initialized as 0, so
+	 * zeroing the structure is perfect.
+	 */
+	memset(osn_var, 0, sizeof(*osn_var));
+}
+
+/*
+ * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables
+ */
+static inline void osn_var_reset_all(void)
+{
+	struct osnoise_variables *osn_var;
+	int cpu;
+
+	for_each_cpu(cpu, cpu_online_mask) {
+		osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
+		osn_var_reset(osn_var);
+	}
+}
+
+/*
+ * Tells NMIs to call back to the osnoise tracer to record timestamps.
+ */
+bool trace_osnoise_callback_enabled;
+
+/*
+ * osnoise sample structure definition. Used to store the statistics of a
+ * sample run.
+ */
+struct osnoise_sample {
+	u64			runtime;	/* runtime */
+	u64			noise;		/* noise */
+	u64			max_sample;	/* max single noise sample */
+	int			hw_count;	/* # HW (incl. hypervisor) interference */
+	int			nmi_count;	/* # NMIs during this sample */
+	int			irq_count;	/* # IRQs during this sample */
+	int			softirq_count;	/* # softirqs during this sample */
+	int			thread_count;	/* # threads during this sample */
+};
+
+/*
+ * Protect the interface.
+ */
+struct mutex interface_lock;
+
+/*
+ * Tracer data.
+ */
+static struct osnoise_data {
+	u64	sample_period;		/* total sampling period */
+	u64	sample_runtime;		/* active sampling portion of period */
+	u64	stop_tracing;		/* stop trace in the inside operation (loop) */
+	u64	stop_tracing_total;	/* stop trace in the outside operation (report) */
+	bool	tainted;		/* infor users and developers about a problem */
+} osnoise_data = {
+	.sample_period			= DEFAULT_SAMPLE_PERIOD,
+	.sample_runtime			= DEFAULT_SAMPLE_RUNTIME,
+	.stop_tracing			= 0,
+	.stop_tracing_total		= 0,
+};
+
+/*
+ * Boolean variable used to inform that the tracer is currently sampling.
+ */
+static bool osnoise_busy;
+
+/*
+ * Print the osnoise header info.
+ */
+static void print_osnoise_headers(struct seq_file *s)
+{
+	if (osnoise_data.tainted)
+		seq_puts(s, "# osnoise is tainted!\n");
+
+	seq_puts(s, "#                                _-----=> irqs-off\n");
+	seq_puts(s, "#                               / _----=> need-resched\n");
+	seq_puts(s, "#                              | / _---=> hardirq/softirq\n");
+	seq_puts(s, "#                              || / _--=> preempt-depth     ");
+	seq_puts(s, "                       MAX\n");
+
+	seq_puts(s, "#                              || /                         ");
+	seq_puts(s, "                    SINGLE      Interference counters:\n");
+
+	seq_puts(s, "#                              ||||               RUNTIME   ");
+	seq_puts(s, "   NOISE  %% OF CPU  NOISE    +-----------------------------+\n");
+
+	seq_puts(s, "#           TASK-PID      CPU# ||||   TIMESTAMP    IN US    ");
+	seq_puts(s, "   IN US  AVAILABLE  IN US     HW    NMI    IRQ   SIRQ THREAD\n");
+
+	seq_puts(s, "#              | |         |   ||||      |           |      ");
+	seq_puts(s, "       |    |            |      |      |      |      |      |\n");
+}
+
+/*
+ * osnoise_taint - report an osnoise error.
+ */
+#define osnoise_taint(msg) ({							\
+	struct trace_array *tr = osnoise_trace;					\
+										\
+	trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg);	\
+	osnoise_data.tainted = true;						\
+})
+
+/*
+ * Record an osnoise_sample into the tracer buffer.
+ */
+static void trace_osnoise_sample(struct osnoise_sample *sample)
+{
+	struct trace_array *tr = osnoise_trace;
+	struct trace_buffer *buffer = tr->array_buffer.buffer;
+	struct trace_event_call *call = &event_osnoise;
+	struct ring_buffer_event *event;
+	struct osnoise_entry *entry;
+
+	event = trace_buffer_lock_reserve(buffer, TRACE_OSNOISE, sizeof(*entry),
+					  tracing_gen_ctx());
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	entry->runtime		= sample->runtime;
+	entry->noise		= sample->noise;
+	entry->max_sample	= sample->max_sample;
+	entry->hw_count		= sample->hw_count;
+	entry->nmi_count	= sample->nmi_count;
+	entry->irq_count	= sample->irq_count;
+	entry->softirq_count	= sample->softirq_count;
+	entry->thread_count	= sample->thread_count;
+
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+/*
+ * Macros to encapsulate the time capturing infrastructure.
+ */
+#define time_get()	trace_clock_local()
+#define time_to_us(x)	div_u64(x, 1000)
+#define time_sub(a, b)	((a) - (b))
+
+/*
+ * cond_move_irq_delta_start - Forward the delta_start of a running IRQ
+ *
+ * If an IRQ is preempted by an NMI, its delta_start is pushed forward
+ * to discount the NMI interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_irq_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+	if (osn_var->irq.delta_start)
+		osn_var->irq.delta_start += duration;
+}
+
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * cond_move_softirq_delta_start - Forward the delta_start of a running softirq.
+ *
+ * If a softirq is preempted by an IRQ or NMI, its delta_start is pushed
+ * forward to discount the interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_softirq_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+	if (osn_var->softirq.delta_start)
+		osn_var->softirq.delta_start += duration;
+}
+#else /* CONFIG_PREEMPT_RT */
+#define cond_move_softirq_delta_start(osn_var, duration) do {} while (0)
+#endif
+
+/*
+ * cond_move_thread_delta_start - Forward the delta_start of a running thread
+ *
+ * If a noisy thread is preempted by an softirq, IRQ or NMI, its delta_start
+ * is pushed forward to discount the interference.
+ *
+ * See get_int_safe_duration().
+ */
+static inline void
+cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration)
+{
+	if (osn_var->thread.delta_start)
+		osn_var->thread.delta_start += duration;
+}
+
+/*
+ * get_int_safe_duration - Get the duration of a window
+ *
+ * The irq, softirq and thread varaibles need to have its duration without
+ * the interference from higher priority interrupts. Instead of keeping a
+ * variable to discount the interrupt interference from these variables, the
+ * starting time of these variables are pushed forward with the interrupt's
+ * duration. In this way, a single variable is used to:
+ *
+ *   - Know if a given window is being measured.
+ *   - Account its duration.
+ *   - Discount the interference.
+ *
+ * To avoid getting inconsistent values, e.g.,:
+ *
+ *	now = time_get()
+ *		--->	interrupt!
+ *			delta_start -= int duration;
+ *		<---
+ *	duration = now - delta_start;
+ *
+ *	result: negative duration if the variable duration before the
+ *	interrupt was smaller than the interrupt execution.
+ *
+ * A counter of interrupts is used. If the counter increased, try
+ * to capture an interference safe duration.
+ */
+static inline s64
+get_int_safe_duration(struct osnoise_variables *osn_var, u64 *delta_start)
+{
+	u64 int_counter, now;
+	s64 duration;
+
+	do {
+		int_counter = local_read(&osn_var->int_counter);
+		/* synchronize with interrupts */
+		barrier();
+
+		now = time_get();
+		duration = (now - *delta_start);
+
+		/* synchronize with interrupts */
+		barrier();
+	} while (int_counter != local_read(&osn_var->int_counter));
+
+	/*
+	 * This is an evidence of race conditions that cause
+	 * a value to be "discounted" too much.
+	 */
+	if (duration < 0)
+		osnoise_taint("Negative duration!\n");
+
+	*delta_start = 0;
+
+	return duration;
+}
+
+/*
+ *
+ * set_int_safe_time - Save the current time on *time, aware of interference
+ *
+ * Get the time, taking into consideration a possible interference from
+ * higher priority interrupts.
+ *
+ * See get_int_safe_duration() for an explanation.
+ */
+static u64
+set_int_safe_time(struct osnoise_variables *osn_var, u64 *time)
+{
+	u64 int_counter;
+
+	do {
+		int_counter = local_read(&osn_var->int_counter);
+		/* synchronize with interrupts */
+		barrier();
+
+		*time = time_get();
+
+		/* synchronize with interrupts */
+		barrier();
+	} while (int_counter != local_read(&osn_var->int_counter));
+
+	return int_counter;
+}
+
+/*
+ * trace_osnoise_callback - NMI entry/exit callback
+ *
+ * This function is called at the entry and exit NMI code. The bool enter
+ * distinguishes between either case. This function is used to note a NMI
+ * occurrence, compute the noise caused by the NMI, and to remove the noise
+ * it is potentially causing on other interference variables.
+ */
+void trace_osnoise_callback(bool enter)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+	u64 duration;
+
+	if (!osn_var->sampling)
+		return;
+
+	/*
+	 * Currently trace_clock_local() calls sched_clock() and the
+	 * generic version is not NMI safe.
+	 */
+	if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
+		if (enter) {
+			osn_var->nmi.delta_start = time_get();
+			local_inc(&osn_var->int_counter);
+		} else {
+			duration = time_get() - osn_var->nmi.delta_start;
+
+			trace_nmi_noise(osn_var->nmi.delta_start, duration);
+
+			cond_move_irq_delta_start(osn_var, duration);
+			cond_move_softirq_delta_start(osn_var, duration);
+			cond_move_thread_delta_start(osn_var, duration);
+		}
+	}
+
+	if (enter)
+		osn_var->nmi.count++;
+}
+
+/*
+ * osnoise_trace_irq_entry - Note the starting of an IRQ
+ *
+ * Save the starting time of an IRQ. As IRQs are non-preemptive to other IRQs,
+ * it is safe to use a single variable (ons_var->irq) to save the statistics.
+ * The arrival_time is used to report... the arrival time. The delta_start
+ * is used to compute the duration at the IRQ exit handler. See
+ * cond_move_irq_delta_start().
+ */
+void osnoise_trace_irq_entry(int id)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+
+	if (!osn_var->sampling)
+		return;
+	/*
+	 * This value will be used in the report, but not to compute
+	 * the execution time, so it is safe to get it unsafe.
+	 */
+	osn_var->irq.arrival_time = time_get();
+	set_int_safe_time(osn_var, &osn_var->irq.delta_start);
+	osn_var->irq.count++;
+
+	local_inc(&osn_var->int_counter);
+}
+
+/*
+ * osnoise_irq_exit - Note the end of an IRQ, sava data and trace
+ *
+ * Computes the duration of the IRQ noise, and trace it. Also discounts the
+ * interference from other sources of noise could be currently being accounted.
+ */
+void osnoise_trace_irq_exit(int id, const char *desc)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+	int duration;
+
+	if (!osn_var->sampling)
+		return;
+
+	duration = get_int_safe_duration(osn_var, &osn_var->irq.delta_start);
+	trace_irq_noise(id, desc, osn_var->irq.arrival_time, duration);
+	osn_var->irq.arrival_time = 0;
+	cond_move_softirq_delta_start(osn_var, duration);
+	cond_move_thread_delta_start(osn_var, duration);
+}
+
+/*
+ * trace_irqentry_callback - Callback to the irq:irq_entry traceevent
+ *
+ * Used to note the starting of an IRQ occurece.
+ */
+static void trace_irqentry_callback(void *data, int irq,
+				    struct irqaction *action)
+{
+	osnoise_trace_irq_entry(irq);
+}
+
+/*
+ * trace_irqexit_callback - Callback to the irq:irq_exit traceevent
+ *
+ * Used to note the end of an IRQ occurece.
+ */
+static void trace_irqexit_callback(void *data, int irq,
+				   struct irqaction *action, int ret)
+{
+	osnoise_trace_irq_exit(irq, action->name);
+}
+
+/*
+ * arch specific register function.
+ */
+int __weak osnoise_arch_register(void)
+{
+	return 0;
+}
+
+/*
+ * arch specific unregister function.
+ */
+void __weak osnoise_arch_unregister(void)
+{
+	return;
+}
+
+/*
+ * hook_irq_events - Hook IRQ handling events
+ *
+ * This function hooks the IRQ related callbacks to the respective trace
+ * events.
+ */
+int hook_irq_events(void)
+{
+	int ret;
+
+	ret = register_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+	if (ret)
+		goto out_err;
+
+	ret = register_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+	if (ret)
+		goto out_unregister_entry;
+
+	ret = osnoise_arch_register();
+	if (ret)
+		goto out_irq_exit;
+
+	return 0;
+
+out_irq_exit:
+	unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+out_unregister_entry:
+	unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+out_err:
+	return -EINVAL;
+}
+
+/*
+ * unhook_irq_events - Unhook IRQ handling events
+ *
+ * This function unhooks the IRQ related callbacks to the respective trace
+ * events.
+ */
+void unhook_irq_events(void)
+{
+	osnoise_arch_unregister();
+	unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
+	unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL);
+}
+
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * trace_softirq_entry_callback - Note the starting of a softirq
+ *
+ * Save the starting time of a softirq. As softirqs are non-preemptive to
+ * other softirqs, it is safe to use a single variable (ons_var->softirq)
+ * to save the statistics. The arrival_time is used to report... the
+ * arrival time. The delta_start is used to compute the duration at the
+ * softirq exit handler. See cond_move_softirq_delta_start().
+ */
+void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+
+	if (!osn_var->sampling)
+		return;
+	/*
+	 * This value will be used in the report, but not to compute
+	 * the execution time, so it is safe to get it unsafe.
+	 */
+	osn_var->softirq.arrival_time = time_get();
+	set_int_safe_time(osn_var, &osn_var->softirq.delta_start);
+	osn_var->softirq.count++;
+
+	local_inc(&osn_var->int_counter);
+}
+
+/*
+ * trace_softirq_exit_callback - Note the end of an softirq
+ *
+ * Computes the duration of the softirq noise, and trace it. Also discounts the
+ * interference from other sources of noise could be currently being accounted.
+ */
+void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+	int duration;
+
+	if (!osn_var->sampling)
+		return;
+
+	duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start);
+	trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration);
+	cond_move_thread_delta_start(osn_var, duration);
+	osn_var->softirq.arrival_time = 0;
+}
+
+/*
+ * hook_softirq_events - Hook softirq handling events
+ *
+ * This function hooks the softirq related callbacks to the respective trace
+ * events.
+ */
+static int hook_softirq_events(void)
+{
+	int ret;
+
+	ret = register_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+	if (ret)
+		goto out_err;
+
+	ret = register_trace_softirq_exit(trace_softirq_exit_callback, NULL);
+	if (ret)
+		goto out_unreg_entry;
+
+	return 0;
+
+out_unreg_entry:
+	unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+out_err:
+	return -EINVAL;
+}
+
+/*
+ * unhook_softirq_events - Unhook softirq handling events
+ *
+ * This function hooks the softirq related callbacks to the respective trace
+ * events.
+ */
+static void unhook_softirq_events(void)
+{
+	unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL);
+	unregister_trace_softirq_exit(trace_softirq_exit_callback, NULL);
+}
+#else /* CONFIG_PREEMPT_RT */
+/*
+ * softirq are threads on the PREEMPT_RT mode.
+ */
+static int hook_softirq_events(void)
+{
+	return 0;
+}
+static void unhook_softirq_events(void)
+{
+}
+#endif
+
+/*
+ * thread_entry - Record the starting of a thread noise window
+ *
+ * It saves the context switch time for a noisy thread, and increments
+ * the interference counters.
+ */
+static void
+thread_entry(struct osnoise_variables *osn_var, struct task_struct *t)
+{
+	if (!osn_var->sampling)
+		return;
+	/*
+	 * The arrival time will be used in the report, but not to compute
+	 * the execution time, so it is safe to get it unsafe.
+	 */
+	osn_var->thread.arrival_time = time_get();
+
+	set_int_safe_time(osn_var, &osn_var->thread.delta_start);
+
+	osn_var->thread.count++;
+	local_inc(&osn_var->int_counter);
+}
+
+/*
+ * thread_exit - Report the end of a thread noise window
+ *
+ * It computes the total noise from a thread, tracing if needed.
+ */
+static void
+thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
+{
+	int duration;
+
+	if (!osn_var->sampling)
+		return;
+
+	duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start);
+
+	trace_thread_noise(t, osn_var->thread.arrival_time, duration);
+
+	osn_var->thread.arrival_time = 0;
+}
+
+/*
+ * trace_sched_switch - sched:sched_switch trace event handler
+ *
+ * This function is hooked to the sched:sched_switch trace event, and it is
+ * used to record the beginning and to report the end of a thread noise window.
+ */
+void
+trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+			    struct task_struct *n)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+
+	if (p->pid != osn_var->pid)
+		thread_exit(osn_var, p);
+
+	if (n->pid != osn_var->pid)
+		thread_entry(osn_var, n);
+}
+
+/*
+ * hook_thread_events - Hook the insturmentation for thread noise
+ *
+ * Hook the osnoise tracer callbacks to handle the noise from other
+ * threads on the necessary kernel events.
+ */
+int hook_thread_events(void)
+{
+	int ret;
+
+	ret = register_trace_sched_switch(trace_sched_switch_callback, NULL);
+	if (ret)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * unhook_thread_events - *nhook the insturmentation for thread noise
+ *
+ * Unook the osnoise tracer callbacks to handle the noise from other
+ * threads on the necessary kernel events.
+ */
+void unhook_thread_events(void)
+{
+	unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
+}
+
+/*
+ * save_osn_sample_stats - Save the osnoise_sample statistics
+ *
+ * Save the osnoise_sample statistics before the sampling phase. These
+ * values will be used later to compute the diff betwneen the statistics
+ * before and after the osnoise sampling.
+ */
+void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+{
+	s->nmi_count = osn_var->nmi.count;
+	s->irq_count = osn_var->irq.count;
+	s->softirq_count = osn_var->softirq.count;
+	s->thread_count = osn_var->thread.count;
+}
+
+/*
+ * diff_osn_sample_stats - Compute the osnoise_sample statistics
+ *
+ * After a sample period, compute the difference on the osnoise_sample
+ * statistics. The struct osnoise_sample *s contains the statistics saved via
+ * save_osn_sample_stats() before the osnoise sampling.
+ */
+void diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+{
+	s->nmi_count = osn_var->nmi.count - s->nmi_count;
+	s->irq_count = osn_var->irq.count - s->irq_count;
+	s->softirq_count = osn_var->softirq.count - s->softirq_count;
+	s->thread_count = osn_var->thread.count - s->thread_count;
+}
+
+/*
+ * osnoise_stop_tracing - Stop tracing and the tracer.
+ */
+static void osnoise_stop_tracing(void)
+{
+	struct trace_array *tr = osnoise_trace;
+	tracer_tracing_off(tr);
+}
+
+/*
+ * run_osnoise - Sample the time and look for osnoise
+ *
+ * Used to capture the time, looking for potential osnoise latency repeatedly.
+ * Different from hwlat_detector, it is called with preemption and interrupts
+ * enabled. This allows irqs, softirqs and threads to run, interfering on the
+ * osnoise sampling thread, as they would do with a regular thread.
+ */
+static int run_osnoise(void)
+{
+	struct osnoise_variables *osn_var = this_cpu_osn_var();
+	u64 noise = 0, sum_noise = 0, max_noise = 0;
+	struct trace_array *tr = osnoise_trace;
+	u64 start, sample, last_sample;
+	u64 last_int_count, int_count;
+	s64 total, last_total = 0;
+	struct osnoise_sample s;
+	unsigned int threshold;
+	int hw_count = 0;
+	u64 runtime, stop_in;
+	int ret = -1;
+
+	/*
+	 * Considers the current thread as the workload.
+	 */
+	osn_var->pid = current->pid;
+
+	/*
+	 * Save the current stats for the diff
+	 */
+	save_osn_sample_stats(osn_var, &s);
+
+	/*
+	 * if threshold is 0, use the default value of 5 us.
+	 */
+	threshold = tracing_thresh ? : 5000;
+
+	/*
+	 * Make sure NMIs see sampling first
+	 */
+	osn_var->sampling = true;
+	barrier();
+
+	/*
+	 * Transform the *_us config to nanoseconds to avoid the
+	 * division on the main loop.
+	 */
+	runtime = osnoise_data.sample_runtime * NSEC_PER_USEC;
+	stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC;
+
+	/*
+	 * Start timestemp
+	 */
+	start = time_get();
+
+	/*
+	 * "previous" loop.
+	 */
+	last_int_count = set_int_safe_time(osn_var, &last_sample);
+
+	do {
+		/*
+		 * Get sample!
+		 */
+		int_count = set_int_safe_time(osn_var, &sample);
+
+		noise = time_sub(sample, last_sample);
+
+		/*
+		 * This shouldn't happen.
+		 */
+		if (noise < 0) {
+			osnoise_taint("negative noise!");
+			goto out;
+		}
+
+		/*
+		 * Sample runtime.
+		 */
+		total = time_sub(sample, start);
+
+		/*
+		 * Check for possible overflows.
+		 */
+		if (total < last_total) {
+			osnoise_taint("total overflow!");
+			break;
+		}
+
+		last_total = total;
+
+		if (noise >= threshold) {
+			int interference = int_count - last_int_count;
+
+			if (noise > max_noise)
+				max_noise = noise;
+
+			if (!interference)
+				hw_count++;
+
+			sum_noise += noise;
+
+			trace_sample_threshold(last_sample, noise, interference);
+
+			if (osnoise_data.stop_tracing)
+				if (noise > stop_in)
+					osnoise_stop_tracing();
+		}
+
+		/*
+		 * For the non-preemptive kernel config: let threads runs, if
+		 * they so wish.
+		 */
+		cond_resched();
+
+		last_sample = sample;
+		last_int_count = int_count;
+
+	} while (total < runtime && !kthread_should_stop());
+
+	/*
+	 * Finish the above in the view for interrupts.
+	 */
+	barrier();
+
+	osn_var->sampling = false;
+
+	/*
+	 * Make sure sampling data is no longer updated.
+	 */
+	barrier();
+
+	/*
+	 * Save noise info.
+	 */
+	s.noise = time_to_us(sum_noise);
+	s.runtime = time_to_us(total);
+	s.max_sample = time_to_us(max_noise);
+	s.hw_count = hw_count;
+
+	/* Save interference stats info */
+	diff_osn_sample_stats(osn_var, &s);
+
+	trace_osnoise_sample(&s);
+
+	/* Keep a running maximum ever recorded osnoise "latency" */
+	if (max_noise > tr->max_latency) {
+		tr->max_latency = max_noise;
+		latency_fsnotify(tr);
+	}
+
+	if (osnoise_data.stop_tracing_total)
+		if (s.noise > osnoise_data.stop_tracing_total)
+			osnoise_stop_tracing();
+
+	return 0;
+out:
+	return ret;
+}
+
+static struct cpumask osnoise_cpumask;
+static struct cpumask save_cpumask;
+
+/*
+ * osnoise_main - The osnoise detection kernel thread
+ *
+ * Calls run_osnoise() function to measure the osnoise for the configured runtime,
+ * every period.
+ */
+static int osnoise_main(void *data)
+{
+	s64 interval;
+
+	while (!kthread_should_stop()) {
+
+		run_osnoise();
+
+		mutex_lock(&interface_lock);
+		interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
+		mutex_unlock(&interface_lock);
+
+		do_div(interval, USEC_PER_MSEC);
+
+		/*
+		 * differently from hwlat_detector, the osnoise tracer can run
+		 * without a pause because preemption is on.
+		 */
+		if (interval < 1)
+			continue;
+
+		if (msleep_interruptible(interval))
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * stop_per_cpu_kthread - stop per-cpu threads
+ *
+ * Stop the osnoise sampling htread. Use this on unload and at system
+ * shutdown.
+ */
+static void stop_per_cpu_kthreads(void)
+{
+	struct task_struct *kthread;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
+		if (kthread)
+			kthread_stop(kthread);
+		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+	}
+}
+
+/*
+ * start_per_cpu_kthread - Kick off per-cpu osnoise sampling kthreads
+ *
+ * This starts the kernel thread that will look for osnoise on many
+ * cpus.
+ */
+static int start_per_cpu_kthreads(struct trace_array *tr)
+{
+	struct cpumask *current_mask = &save_cpumask;
+	struct task_struct *kthread;
+	char comm[24];
+	int cpu;
+
+	get_online_cpus();
+	/*
+	 * Run only on CPUs in which trace and osnoise are allowed to run.
+	 */
+	cpumask_and(current_mask, tr->tracing_cpumask, &osnoise_cpumask);
+	/*
+	 * And the CPU is online.
+	 */
+	cpumask_and(current_mask, cpu_online_mask, current_mask);
+	put_online_cpus();
+
+	for_each_online_cpu(cpu)
+		per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+
+	for_each_cpu(cpu, current_mask) {
+		snprintf(comm, 24, "osnoise/%d", cpu);
+
+		kthread = kthread_create_on_cpu(osnoise_main, NULL, cpu, comm);
+
+		if (IS_ERR(kthread)) {
+			pr_err(BANNER "could not start sampling thread\n");
+			stop_per_cpu_kthreads();
+			return -ENOMEM;
+		}
+
+		per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+		wake_up_process(kthread);
+	}
+
+	return 0;
+}
+
+/*
+ * osnoise_cpus_read - Read function for reading the "cpus" file
+ * @filp: The active open file structure
+ * @ubuf: The userspace provided buffer to read value into
+ * @cnt: The maximum number of bytes to read
+ * @ppos: The current "file" position
+ *
+ * Prints the "cpus" output into the user-provided buffer.
+ */
+static ssize_t
+osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
+		  loff_t *ppos)
+{
+	char *mask_str;
+	int len;
+
+	mutex_lock(&interface_lock);
+
+	len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1;
+	mask_str = kmalloc(len, GFP_KERNEL);
+	if (!mask_str) {
+		count = -ENOMEM;
+		goto out_unlock;
+	}
+
+	len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask));
+	if (len >= count) {
+		count = -EINVAL;
+		goto out_free;
+	}
+
+	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
+
+out_free:
+	kfree(mask_str);
+out_unlock:
+	mutex_unlock(&interface_lock);
+
+	return count;
+}
+
+static void osnoise_tracer_start(struct trace_array *tr);
+static void osnoise_tracer_stop(struct trace_array *tr);
+
+/*
+ * osnoise_cpus_write - Write function for "cpus" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * This function provides a write implementation for the "cpus"
+ * interface to the osnoise trace. By default, it lists all  CPUs,
+ * in this way, allowing osnoise threads to run on any online CPU
+ * of the system. It serves to restrict the execution of osnoise to the
+ * set of CPUs writing via this interface. Note that osnoise also
+ * respects the "tracing_cpumask." Hence, osnoise threads will run only
+ * on the set of CPUs allowed here AND on "tracing_cpumask." Why not
+ * have just "tracing_cpumask?" Because the user might be interested
+ * in tracing what is running on other CPUs. For instance, one might
+ * run osnoise in one HT CPU while observing what is running on the
+ * sibling HT CPU.
+ */
+static ssize_t
+osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
+		   loff_t *ppos)
+{
+	struct trace_array *tr = osnoise_trace;
+	cpumask_var_t osnoise_cpumask_new;
+	int running, err;
+	char buf[256];
+
+	if (count >= 256)
+		return -EINVAL;
+
+	if (copy_from_user(buf, ubuf, count))
+		return -EFAULT;
+
+	if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = cpulist_parse(buf, osnoise_cpumask_new);
+	if (err)
+		goto err_free;
+
+	/*
+	 * trace_types_lock is taken to avoid concurrency on start/stop
+	 * and osnoise_busy.
+	 */
+	mutex_lock(&trace_types_lock);
+	running = osnoise_busy;
+	if (running)
+		osnoise_tracer_stop(tr);
+
+	mutex_lock(&interface_lock);
+	cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
+	mutex_unlock(&interface_lock);
+
+	if (running)
+		osnoise_tracer_start(tr);
+	mutex_unlock(&trace_types_lock);
+
+	free_cpumask_var(osnoise_cpumask_new);
+	return count;
+
+err_free:
+	free_cpumask_var(osnoise_cpumask_new);
+
+	return err;
+}
+
+/*
+ * osnoise/runtime_us: cannot be greater than the period.
+ */
+static struct trace_min_max_param osnoise_runtime = {
+	.lock	= &interface_lock,
+	.val	= &osnoise_data.sample_runtime,
+	.max	= &osnoise_data.sample_period,
+	.min	= NULL,
+};
+
+/*
+ * osnoise/period_us: cannot be smaller than the runtime.
+ */
+static struct trace_min_max_param osnoise_period = {
+	.lock	= &interface_lock,
+	.val	= &osnoise_data.sample_period,
+	.max	= NULL,
+	.min	= &osnoise_data.sample_runtime,
+};
+
+/*
+ * osnoise/stop_tracing_us: no limit.
+ */
+static struct trace_min_max_param osnoise_stop_tracing_in = {
+	.lock	= &interface_lock,
+	.val	= &osnoise_data.stop_tracing,
+	.max	= NULL,
+	.min	= NULL,
+};
+
+/*
+ * osnoise/stop_tracing_total_us: no limit.
+ */
+static struct trace_min_max_param osnoise_stop_tracing_total = {
+	.lock	= &interface_lock,
+	.val	= &osnoise_data.stop_tracing_total,
+	.max	= NULL,
+	.min	= NULL,
+};
+
+static const struct file_operations cpus_fops = {
+	.open		= tracing_open_generic,
+	.read		= osnoise_cpus_read,
+	.write		= osnoise_cpus_write,
+	.llseek		= generic_file_llseek,
+};
+
+/*
+ * init_tracefs - A function to initialize the tracefs interface files
+ *
+ * This function creates entries in tracefs for "osnoise". It creates the
+ * "osnoise" directory in the tracing directory, and within that
+ * directory is the count, runtime and period files to change and view
+ * those values.
+ */
+static int init_tracefs(void)
+{
+	struct dentry *top_dir;
+	struct dentry *tmp;
+	int ret;
+
+	ret = tracing_init_dentry();
+	if (ret)
+		return -ENOMEM;
+
+	top_dir = tracefs_create_dir("osnoise", NULL);
+	if (!top_dir)
+		return -ENOMEM;
+
+	tmp = tracefs_create_file("period_us", 0640, top_dir,
+				  &osnoise_period, &trace_min_max_fops);
+	if (!tmp)
+		goto err;
+
+	tmp = tracefs_create_file("runtime_us", 0644, top_dir,
+				  &osnoise_runtime, &trace_min_max_fops);
+	if (!tmp)
+		goto err;
+
+	tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir,
+				  &osnoise_stop_tracing_in, &trace_min_max_fops);
+	if (!tmp)
+		goto err;
+
+	tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir,
+				  &osnoise_stop_tracing_total, &trace_min_max_fops);
+	if (!tmp)
+		goto err;
+
+	tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops);
+	if (!tmp)
+		goto err;
+
+	return 0;
+
+err:
+	tracefs_remove(top_dir);
+	return -ENOMEM;
+}
+
+static int osnoise_hook_events(void)
+{
+	int retval;
+
+	/*
+	 * Trace is already hooked, we are re-enabling from
+	 * a stop_tracing_*.
+	 */
+	if (trace_osnoise_callback_enabled)
+		return 0;
+
+	retval = hook_irq_events();
+	if (retval)
+		return -EINVAL;
+
+	retval = hook_softirq_events();
+	if (retval)
+		goto out_unhook_irq;
+
+	retval = hook_thread_events();
+	/*
+	 * All fine!
+	 */
+	if (!retval)
+		return 0;
+
+	unhook_softirq_events();
+out_unhook_irq:
+	unhook_irq_events();
+	return -EINVAL;
+}
+
+static void osnoise_tracer_start(struct trace_array *tr)
+{
+	int retval;
+
+	if (osnoise_busy)
+		return;
+
+	osn_var_reset_all();
+
+	retval = osnoise_hook_events();
+	if (retval)
+		goto out_err;
+	/*
+	 * Make sure NMIs see reseted values.
+	 */
+	barrier();
+	trace_osnoise_callback_enabled = true;
+
+	retval = start_per_cpu_kthreads(tr);
+	/*
+	 * all fine!
+	 */
+	if (!retval)
+		return;
+
+out_err:
+	unhook_irq_events();
+	pr_err(BANNER "Error starting osnoise tracer\n");
+}
+
+static void osnoise_tracer_stop(struct trace_array *tr)
+{
+	if (!osnoise_busy)
+		return;
+
+	trace_osnoise_callback_enabled = false;
+	barrier();
+
+	stop_per_cpu_kthreads();
+
+	unhook_irq_events();
+	unhook_softirq_events();
+	unhook_thread_events();
+
+	osnoise_busy = false;
+}
+
+static int osnoise_tracer_init(struct trace_array *tr)
+{
+	/* Only allow one instance to enable this */
+	if (osnoise_busy)
+		return -EBUSY;
+
+	osnoise_trace = tr;
+
+	tr->max_latency = 0;
+
+	osnoise_tracer_start(tr);
+
+	osnoise_busy = true;
+
+	return 0;
+}
+
+static void osnoise_tracer_reset(struct trace_array *tr)
+{
+	osnoise_tracer_stop(tr);
+}
+
+static struct tracer osnoise_tracer __read_mostly = {
+	.name		= "osnoise",
+	.init		= osnoise_tracer_init,
+	.reset		= osnoise_tracer_reset,
+	.start		= osnoise_tracer_start,
+	.stop		= osnoise_tracer_stop,
+	.print_header	= print_osnoise_headers,
+	.allow_instances = true,
+};
+
+__init static int init_osnoise_tracer(void)
+{
+	int ret;
+
+	mutex_init(&interface_lock);
+
+	cpumask_copy(&osnoise_cpumask, cpu_all_mask);
+
+	ret = register_tracer(&osnoise_tracer);
+	if (ret)
+		return ret;
+
+	init_tracefs();
+
+	return 0;
+}
+late_initcall(init_osnoise_tracer);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d0368a569bfa..642b6584eba5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1202,7 +1202,6 @@ trace_hwlat_print(struct trace_iterator *iter, int flags,
 	return trace_handle_return(s);
 }
 
-
 static enum print_line_t
 trace_hwlat_raw(struct trace_iterator *iter, int flags,
 		struct trace_event *event)
@@ -1232,6 +1231,76 @@ static struct trace_event trace_hwlat_event = {
 	.funcs		= &trace_hwlat_funcs,
 };
 
+/* TRACE_OSNOISE */
+static enum print_line_t
+trace_osnoise_print(struct trace_iterator *iter, int flags,
+		    struct trace_event *event)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct osnoise_entry *field;
+	u64 ratio, ratio_dec;
+	u64 net_runtime;
+
+	trace_assign_type(field, entry);
+
+	/*
+	 * compute the available % of cpu time.
+	 */
+	net_runtime = field->runtime - field->noise;
+	ratio = net_runtime * 10000000;
+	do_div(ratio, field->runtime);
+	ratio_dec = do_div(ratio, 100000);
+
+	trace_seq_printf(s, "%llu %10llu %3llu.%05llu %7llu",
+			 field->runtime,
+			 field->noise,
+			 ratio, ratio_dec,
+			 field->max_sample);
+
+	trace_seq_printf(s, " %6u", field->hw_count);
+	trace_seq_printf(s, " %6u", field->nmi_count);
+	trace_seq_printf(s, " %6u", field->irq_count);
+	trace_seq_printf(s, " %6u", field->softirq_count);
+	trace_seq_printf(s, " %6u", field->thread_count);
+
+	trace_seq_putc(s, '\n');
+
+	return trace_handle_return(s);
+}
+
+static enum print_line_t
+trace_osnoise_raw(struct trace_iterator *iter, int flags,
+		  struct trace_event *event)
+{
+	struct osnoise_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+
+	trace_seq_printf(s, "%lld %llu %llu %u %u %u %u %u\n",
+			 field->runtime,
+			 field->noise,
+			 field->max_sample,
+			 field->hw_count,
+			 field->nmi_count,
+			 field->irq_count,
+			 field->softirq_count,
+			 field->thread_count);
+
+	return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_osnoise_funcs = {
+	.trace		= trace_osnoise_print,
+	.raw		= trace_osnoise_raw,
+};
+
+static struct trace_event trace_osnoise_event = {
+	.type		= TRACE_OSNOISE,
+	.funcs		= &trace_osnoise_funcs,
+};
+
 /* TRACE_BPUTS */
 static enum print_line_t
 trace_bputs_print(struct trace_iterator *iter, int flags,
@@ -1442,6 +1511,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_bprint_event,
 	&trace_print_event,
 	&trace_hwlat_event,
+	&trace_osnoise_event,
 	&trace_raw_data_event,
 	&trace_func_repeats_event,
 	NULL
-- 
cgit v1.2.3


From 1ab6dc35e9148e3cb4a837fdd08f1ca56b55eda0 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Mon, 19 Apr 2021 16:23:49 +0300
Subject: net/mlx5: DR, Add support for flow sampler offload

Add SW steering support for sFlow / flow sampler action.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../mellanox/mlx5/core/steering/dr_action.c        | 55 ++++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/steering/dr_cmd.c  | 33 +++++++++++++
 .../mellanox/mlx5/core/steering/dr_types.h         | 14 ++++++
 .../ethernet/mellanox/mlx5/core/steering/fs_dr.c   | 17 +++++--
 .../ethernet/mellanox/mlx5/core/steering/mlx5dr.h  |  3 ++
 include/linux/mlx5/mlx5_ifc.h                      |  5 ++
 6 files changed, 124 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
index de68c0ec2143..6475ba35cf6b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
@@ -31,6 +31,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_QP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_TAG]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_TNL_L2_TO_L2]	= DR_ACTION_STATE_DECAP,
@@ -45,6 +46,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_QP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_TAG]		= DR_ACTION_STATE_DECAP,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_DECAP,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
@@ -57,6 +59,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_QP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_TAG]		= DR_ACTION_STATE_ENCAP,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_ENCAP,
 		},
@@ -64,6 +67,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_QP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_TAG]		= DR_ACTION_STATE_MODIFY_HDR,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_MODIFY_HDR,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
@@ -74,6 +78,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_QP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_TAG]		= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_POP_VLAN]	= DR_ACTION_STATE_MODIFY_VLAN,
@@ -86,6 +91,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_QP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_TAG]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_TNL_L2_TO_L2]	= DR_ACTION_STATE_DECAP,
@@ -104,6 +110,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_NO_ACTION] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
 			[DR_ACTION_TYP_L2_TO_TNL_L3]	= DR_ACTION_STATE_ENCAP,
@@ -114,11 +121,13 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_ENCAP] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_ENCAP,
 		},
 		[DR_ACTION_STATE_MODIFY_HDR] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_MODIFY_HDR,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
 			[DR_ACTION_TYP_L2_TO_TNL_L3]	= DR_ACTION_STATE_ENCAP,
@@ -128,6 +137,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_MODIFY_VLAN] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_PUSH_VLAN]	= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
@@ -137,6 +147,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_NON_TERM] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
 			[DR_ACTION_TYP_L2_TO_TNL_L3]	= DR_ACTION_STATE_ENCAP,
@@ -152,6 +163,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_NO_ACTION] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_TNL_L2_TO_L2]	= DR_ACTION_STATE_DECAP,
 			[DR_ACTION_TYP_TNL_L3_TO_L2]	= DR_ACTION_STATE_DECAP,
@@ -166,6 +178,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_DECAP,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_MODIFY_HDR]	= DR_ACTION_STATE_MODIFY_HDR,
 			[DR_ACTION_TYP_POP_VLAN]	= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_VPORT]		= DR_ACTION_STATE_TERM,
@@ -178,11 +191,13 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_QP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_VPORT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_ENCAP,
 		},
 		[DR_ACTION_STATE_MODIFY_HDR] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_MODIFY_HDR,
 			[DR_ACTION_TYP_VPORT]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
@@ -192,6 +207,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_MODIFY_VLAN] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_POP_VLAN]	= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_VPORT]		= DR_ACTION_STATE_TERM,
@@ -203,6 +219,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_NON_TERM] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_TNL_L2_TO_L2]	= DR_ACTION_STATE_DECAP,
 			[DR_ACTION_TYP_TNL_L3_TO_L2]	= DR_ACTION_STATE_DECAP,
@@ -221,6 +238,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_NO_ACTION] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_MODIFY_HDR]	= DR_ACTION_STATE_MODIFY_HDR,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
@@ -233,11 +251,13 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_ENCAP,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_VPORT]		= DR_ACTION_STATE_TERM,
 		},
 		[DR_ACTION_STATE_MODIFY_HDR] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_MODIFY_HDR,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
 			[DR_ACTION_TYP_L2_TO_TNL_L3]	= DR_ACTION_STATE_ENCAP,
@@ -248,6 +268,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_MODIFY_VLAN] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_PUSH_VLAN]	= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_MODIFY_VLAN,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
@@ -258,6 +279,7 @@ next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX]
 		[DR_ACTION_STATE_NON_TERM] = {
 			[DR_ACTION_TYP_DROP]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_FT]		= DR_ACTION_STATE_TERM,
+			[DR_ACTION_TYP_SAMPLER]		= DR_ACTION_STATE_TERM,
 			[DR_ACTION_TYP_CTR]		= DR_ACTION_STATE_NON_TERM,
 			[DR_ACTION_TYP_MODIFY_HDR]	= DR_ACTION_STATE_MODIFY_HDR,
 			[DR_ACTION_TYP_L2_TO_TNL_L2]	= DR_ACTION_STATE_ENCAP,
@@ -519,6 +541,10 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
 			attr.reformat.size = action->reformat->size;
 			attr.reformat.id = action->reformat->id;
 			break;
+		case DR_ACTION_TYP_SAMPLER:
+			attr.final_icm_addr = rx_rule ? action->sampler->rx_icm_addr :
+							action->sampler->tx_icm_addr;
+			break;
 		case DR_ACTION_TYP_VPORT:
 			attr.hit_gvmi = action->vport->caps->vhca_gvmi;
 			dest_action = action;
@@ -612,6 +638,7 @@ static unsigned int action_size[DR_ACTION_TYP_MAX] = {
 	[DR_ACTION_TYP_VPORT]        = sizeof(struct mlx5dr_action_vport),
 	[DR_ACTION_TYP_PUSH_VLAN]    = sizeof(struct mlx5dr_action_push_vlan),
 	[DR_ACTION_TYP_INSERT_HDR]   = sizeof(struct mlx5dr_action_reformat),
+	[DR_ACTION_TYP_SAMPLER]      = sizeof(struct mlx5dr_action_sampler),
 };
 
 static struct mlx5dr_action *
@@ -824,6 +851,31 @@ struct mlx5dr_action *mlx5dr_action_create_tag(u32 tag_value)
 	return action;
 }
 
+struct mlx5dr_action *
+mlx5dr_action_create_flow_sampler(struct mlx5dr_domain *dmn, u32 sampler_id)
+{
+	struct mlx5dr_action *action;
+	u64 icm_rx, icm_tx;
+	int ret;
+
+	ret = mlx5dr_cmd_query_flow_sampler(dmn->mdev, sampler_id,
+					    &icm_rx, &icm_tx);
+	if (ret)
+		return NULL;
+
+	action = dr_action_create_generic(DR_ACTION_TYP_SAMPLER);
+	if (!action)
+		return NULL;
+
+	action->sampler->dmn = dmn;
+	action->sampler->sampler_id = sampler_id;
+	action->sampler->rx_icm_addr = icm_rx;
+	action->sampler->tx_icm_addr = icm_tx;
+
+	refcount_inc(&dmn->refcount);
+	return action;
+}
+
 static int
 dr_action_verify_reformat_params(enum mlx5dr_action_type reformat_type,
 				 struct mlx5dr_domain *dmn,
@@ -1624,6 +1676,9 @@ int mlx5dr_action_destroy(struct mlx5dr_action *action)
 		kfree(action->rewrite->data);
 		refcount_dec(&action->rewrite->dmn->refcount);
 		break;
+	case DR_ACTION_TYP_SAMPLER:
+		refcount_dec(&action->sampler->dmn->refcount);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
index 6314f50efbd4..54e1f5438bbe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
@@ -228,6 +228,36 @@ int mlx5dr_cmd_query_flow_table(struct mlx5_core_dev *dev,
 	return 0;
 }
 
+int mlx5dr_cmd_query_flow_sampler(struct mlx5_core_dev *dev,
+				  u32 sampler_id,
+				  u64 *rx_icm_addr,
+				  u64 *tx_icm_addr)
+{
+	u32 out[MLX5_ST_SZ_DW(query_sampler_obj_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+	void *attr;
+	int ret;
+
+	MLX5_SET(general_obj_in_cmd_hdr, in, opcode,
+		 MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type,
+		 MLX5_GENERAL_OBJECT_TYPES_SAMPLER);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, sampler_id);
+
+	ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (ret)
+		return ret;
+
+	attr = MLX5_ADDR_OF(query_sampler_obj_out, out, sampler_object);
+
+	*rx_icm_addr = MLX5_GET64(sampler_obj, attr,
+				  sw_steering_icm_address_rx);
+	*tx_icm_addr = MLX5_GET64(sampler_obj, attr,
+				  sw_steering_icm_address_tx);
+
+	return 0;
+}
+
 int mlx5dr_cmd_sync_steering(struct mlx5_core_dev *mdev)
 {
 	u32 in[MLX5_ST_SZ_DW(sync_steering_in)] = {};
@@ -711,6 +741,9 @@ int mlx5dr_cmd_set_fte(struct mlx5_core_dev *dev,
 						 fte->dest_arr[i].vport.reformat_id);
 				}
 				break;
+			case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER:
+				id = fte->dest_arr[i].sampler_id;
+				break;
 			default:
 				id = fte->dest_arr[i].tir_num;
 			}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 60b8c04e165e..f5e93fa87aff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -124,6 +124,7 @@ enum mlx5dr_action_type {
 	DR_ACTION_TYP_POP_VLAN,
 	DR_ACTION_TYP_PUSH_VLAN,
 	DR_ACTION_TYP_INSERT_HDR,
+	DR_ACTION_TYP_SAMPLER,
 	DR_ACTION_TYP_MAX,
 };
 
@@ -919,6 +920,13 @@ struct mlx5dr_action_reformat {
 	u8 param_1;
 };
 
+struct mlx5dr_action_sampler {
+	struct mlx5dr_domain *dmn;
+	u64 rx_icm_addr;
+	u64 tx_icm_addr;
+	u32 sampler_id;
+};
+
 struct mlx5dr_action_dest_tbl {
 	u8 is_fw_tbl:1;
 	union {
@@ -962,6 +970,7 @@ struct mlx5dr_action {
 		void *data;
 		struct mlx5dr_action_rewrite *rewrite;
 		struct mlx5dr_action_reformat *reformat;
+		struct mlx5dr_action_sampler *sampler;
 		struct mlx5dr_action_dest_tbl *dest_tbl;
 		struct mlx5dr_action_ctr *ctr;
 		struct mlx5dr_action_vport *vport;
@@ -1116,6 +1125,10 @@ int mlx5dr_cmd_query_gvmi(struct mlx5_core_dev *mdev,
 			  bool other_vport, u16 vport_number, u16 *gvmi);
 int mlx5dr_cmd_query_esw_caps(struct mlx5_core_dev *mdev,
 			      struct mlx5dr_esw_caps *caps);
+int mlx5dr_cmd_query_flow_sampler(struct mlx5_core_dev *dev,
+				  u32 sampler_id,
+				  u64 *rx_icm_addr,
+				  u64 *tx_icm_addr);
 int mlx5dr_cmd_sync_steering(struct mlx5_core_dev *mdev);
 int mlx5dr_cmd_set_fte_modify_and_vport(struct mlx5_core_dev *mdev,
 					u32 table_type,
@@ -1303,6 +1316,7 @@ struct mlx5dr_cmd_flow_destination_hw_info {
 		u32 ft_num;
 		u32 ft_id;
 		u32 counter_id;
+		u32 sampler_id;
 		struct {
 			u16 num;
 			u16 vhca_id;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
index 00b4c753cae2..d5926dd7e972 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
@@ -387,7 +387,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns,
 	if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
 		list_for_each_entry(dst, &fte->node.children, node.list) {
 			enum mlx5_flow_destination_type type = dst->dest_attr.type;
-			u32 ft_id;
+			u32 id;
 
 			if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX ||
 			    num_term_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX) {
@@ -425,9 +425,20 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns,
 				num_term_actions++;
 				break;
 			case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM:
-				ft_id = dst->dest_attr.ft_num;
+				id = dst->dest_attr.ft_num;
 				tmp_action = mlx5dr_action_create_dest_table_num(domain,
-										 ft_id);
+										 id);
+				if (!tmp_action) {
+					err = -ENOMEM;
+					goto free_actions;
+				}
+				fs_dr_actions[fs_dr_num_actions++] = tmp_action;
+				term_actions[num_term_actions++].dest = tmp_action;
+				break;
+			case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER:
+				id = dst->dest_attr.sampler_id;
+				tmp_action = mlx5dr_action_create_flow_sampler(domain,
+									       id);
 				if (!tmp_action) {
 					err = -ENOMEM;
 					goto free_actions;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
index b2aa6c93c3a1..bbfe101d4e57 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
@@ -100,6 +100,9 @@ struct mlx5dr_action *mlx5dr_action_create_drop(void);
 
 struct mlx5dr_action *mlx5dr_action_create_tag(u32 tag_value);
 
+struct mlx5dr_action *
+mlx5dr_action_create_flow_sampler(struct mlx5dr_domain *dmn, u32 sampler_id);
+
 struct mlx5dr_action *
 mlx5dr_action_create_flow_counter(u32 counter_id);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 2d1ed78289ff..e32a0d61929b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -11083,6 +11083,11 @@ struct mlx5_ifc_create_sampler_obj_in_bits {
 	struct mlx5_ifc_sampler_obj_bits sampler_object;
 };
 
+struct mlx5_ifc_query_sampler_obj_out_bits {
+	struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
+	struct mlx5_ifc_sampler_obj_bits sampler_object;
+};
+
 enum {
 	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_128 = 0x0,
 	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_256 = 0x1,
-- 
cgit v1.2.3


From b3c0d72b092e52ae7369b52fb97f63eb2ea7f16a Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Mon, 15 Mar 2021 07:33:21 +0800
Subject: mailbox: mtk-cmdq: Remove cmdq_cb_status

cmdq_cb_status is an error status. Use the standard error number
instead of cmdq_cb_status to prevent status duplication.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Reviewed-by: Yongqiang Niu <yongqiang.niu@mediatek.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/mtk-cmdq-mailbox.c       | 10 +++++-----
 include/linux/mailbox/mtk-cmdq-mailbox.h |  7 +------
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c
index 3900ea63b28d..1f2edc9521c1 100644
--- a/drivers/mailbox/mtk-cmdq-mailbox.c
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -180,7 +180,7 @@ static bool cmdq_thread_is_in_wfe(struct cmdq_thread *thread)
 	return readl(thread->base + CMDQ_THR_WAIT_TOKEN) & CMDQ_THR_IS_WAITING;
 }
 
-static void cmdq_task_exec_done(struct cmdq_task *task, enum cmdq_cb_status sta)
+static void cmdq_task_exec_done(struct cmdq_task *task, int sta)
 {
 	struct cmdq_task_cb *cb = &task->pkt->async_cb;
 	struct cmdq_cb_data data;
@@ -244,10 +244,10 @@ static void cmdq_thread_irq_handler(struct cmdq *cmdq,
 			curr_task = task;
 
 		if (!curr_task || curr_pa == task_end_pa - CMDQ_INST_SIZE) {
-			cmdq_task_exec_done(task, CMDQ_CB_NORMAL);
+			cmdq_task_exec_done(task, 0);
 			kfree(task);
 		} else if (err) {
-			cmdq_task_exec_done(task, CMDQ_CB_ERROR);
+			cmdq_task_exec_done(task, -ENOEXEC);
 			cmdq_task_handle_error(curr_task);
 			kfree(task);
 		}
@@ -415,7 +415,7 @@ static void cmdq_mbox_shutdown(struct mbox_chan *chan)
 
 	list_for_each_entry_safe(task, tmp, &thread->task_busy_list,
 				 list_entry) {
-		cmdq_task_exec_done(task, CMDQ_CB_ERROR);
+		cmdq_task_exec_done(task, -ECONNABORTED);
 		kfree(task);
 	}
 
@@ -453,7 +453,7 @@ static int cmdq_mbox_flush(struct mbox_chan *chan, unsigned long timeout)
 				 list_entry) {
 		cb = &task->pkt->async_cb;
 		if (cb->cb) {
-			data.sta = CMDQ_CB_ERROR;
+			data.sta = -ECONNABORTED;
 			data.data = cb->data;
 			cb->cb(data);
 		}
diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
index d5a983d65f05..2f7d9a37d611 100644
--- a/include/linux/mailbox/mtk-cmdq-mailbox.h
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -65,13 +65,8 @@ enum cmdq_code {
 	CMDQ_CODE_LOGIC = 0xa0,
 };
 
-enum cmdq_cb_status {
-	CMDQ_CB_NORMAL = 0,
-	CMDQ_CB_ERROR
-};
-
 struct cmdq_cb_data {
-	enum cmdq_cb_status	sta;
+	int			sta;
 	void			*data;
 };
 
-- 
cgit v1.2.3


From 8ebc3b5aa4cfafd8b9d58e2595a12f0715594619 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Mon, 15 Mar 2021 07:33:23 +0800
Subject: mailbox: mtk-cmdq: Add struct cmdq_pkt in struct cmdq_cb_data

Current client use 'struct cmdq_pkt' as callback data, so
change 'void *data' to 'struct cmdq_pkt *pkt'. Keep data
until client use pkt instead of data.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Reviewed-by: Yongqiang Niu <yongqiang.niu@mediatek.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/mtk-cmdq-mailbox.c       | 2 ++
 include/linux/mailbox/mtk-cmdq-mailbox.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c
index 7dfd0c9d7bbd..301e65b9527a 100644
--- a/drivers/mailbox/mtk-cmdq-mailbox.c
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -188,6 +188,7 @@ static void cmdq_task_exec_done(struct cmdq_task *task, int sta)
 	WARN_ON(cb->cb == (cmdq_async_flush_cb)NULL);
 	data.sta = sta;
 	data.data = cb->data;
+	data.pkt = task->pkt;
 	if (cb->cb)
 		cb->cb(data);
 
@@ -456,6 +457,7 @@ static int cmdq_mbox_flush(struct mbox_chan *chan, unsigned long timeout)
 				 list_entry) {
 		data.sta = -ECONNABORTED;
 		data.data = cb->data;
+		data.pkt = task->pkt;
 		cb = &task->pkt->async_cb;
 		if (cb->cb)
 			cb->cb(data);
diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
index 2f7d9a37d611..44365aab043c 100644
--- a/include/linux/mailbox/mtk-cmdq-mailbox.h
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -68,6 +68,7 @@ enum cmdq_code {
 struct cmdq_cb_data {
 	int			sta;
 	void			*data;
+	struct cmdq_pkt		*pkt;
 };
 
 typedef void (*cmdq_async_flush_cb)(struct cmdq_cb_data data);
-- 
cgit v1.2.3


From b4b27b9eed8ebdbf9f3046197d29d733c8c944f3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 27 Jun 2021 13:32:54 -0700
Subject: Revert "signal: Allow tasks to cache one sigqueue struct"

This reverts commits 4bad58ebc8bc4f20d89cff95417c9b4674769709 (and
399f8dd9a866e107639eabd3c1979cd526ca3a98, which tried to fix it).

I do not believe these are correct, and I'm about to release 5.13, so am
reverting them out of an abundance of caution.

The locking is odd, and appears broken.

On the allocation side (in __sigqueue_alloc()), the locking is somewhat
straightforward: it depends on sighand->siglock.  Since one caller
doesn't hold that lock, it further then tests 'sigqueue_flags' to avoid
the case with no locks held.

On the freeing side (in sigqueue_cache_or_free()), there is no locking
at all, and the logic instead depends on 'current' being a single
thread, and not able to race with itself.

To make things more exciting, there's also the data race between freeing
a signal and allocating one, which is handled by using WRITE_ONCE() and
READ_ONCE(), and being mutually exclusive wrt the initial state (ie
freeing will only free if the old state was NULL, while allocating will
obviously only use the value if it was non-NULL, so only one or the
other will actually act on the value).

However, while the free->alloc paths do seem mutually exclusive thanks
to just the data value dependency, it's not clear what the memory
ordering constraints are on it.  Could writes from the previous
allocation possibly be delayed and seen by the new allocation later,
causing logical inconsistencies?

So it's all very exciting and unusual.

And in particular, it seems that the freeing side is incorrect in
depending on "current" being single-threaded.  Yes, 'current' is a
single thread, but in the presense of asynchronous events even a single
thread can have data races.

And such asynchronous events can and do happen, with interrupts causing
signals to be flushed and thus free'd (for example - sending a
SIGCONT/SIGSTOP can happen from interrupt context, and can flush
previously queued process control signals).

So regardless of all the other questions about the memory ordering and
locking for this new cached allocation, the sigqueue_cache_or_free()
assumptions seem to be fundamentally incorrect.

It may be that people will show me the errors of my ways, and tell me
why this is all safe after all.  We can reinstate it if so.  But my
current belief is that the WRITE_ONCE() that sets the cached entry needs
to be a smp_store_release(), and the READ_ONCE() that finds a cached
entry needs to be a smp_load_acquire() to handle memory ordering
correctly.

And the sequence in sigqueue_cache_or_free() would need to either use a
lock or at least be interrupt-safe some way (perhaps by using something
like the percpu 'cmpxchg': it doesn't need to be SMP-safe, but like the
percpu operations it needs to be interrupt-safe).

Fixes: 399f8dd9a866 ("signal: Prevent sigqueue caching after task got released")
Fixes: 4bad58ebc8bc ("signal: Allow tasks to cache one sigqueue struct")
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h  |  1 -
 include/linux/signal.h |  1 -
 kernel/exit.c          |  1 -
 kernel/fork.c          |  1 -
 kernel/signal.c        | 59 ++------------------------------------------------
 5 files changed, 2 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 28a98fc4ded4..32813c345115 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -997,7 +997,6 @@ struct task_struct {
 	/* Signal handlers: */
 	struct signal_struct		*signal;
 	struct sighand_struct __rcu		*sighand;
-	struct sigqueue			*sigqueue_cache;
 	sigset_t			blocked;
 	sigset_t			real_blocked;
 	/* Restored if set_restore_sigmask() was used: */
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 201f88e3738b..5160fd45e5ca 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -267,7 +267,6 @@ static inline void init_sigpending(struct sigpending *sig)
 }
 
 extern void flush_sigqueue(struct sigpending *queue);
-extern void exit_task_sigqueue_cache(struct task_struct *tsk);
 
 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
 static inline int valid_signal(unsigned long sig)
diff --git a/kernel/exit.c b/kernel/exit.c
index fd1c04193e18..65809fac3038 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -162,7 +162,6 @@ static void __exit_signal(struct task_struct *tsk)
 		flush_sigqueue(&sig->shared_pending);
 		tty_kref_put(tty);
 	}
-	exit_task_sigqueue_cache(tsk);
 }
 
 static void delayed_put_task_struct(struct rcu_head *rhp)
diff --git a/kernel/fork.c b/kernel/fork.c
index dc06afd725cb..a070caed5c8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2008,7 +2008,6 @@ static __latent_entropy struct task_struct *copy_process(
 	spin_lock_init(&p->alloc_lock);
 
 	init_sigpending(&p->pending);
-	p->sigqueue_cache = NULL;
 
 	p->utime = p->stime = p->gtime = 0;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
diff --git a/kernel/signal.c b/kernel/signal.c
index f1ecd8f0c11d..30a0bee5ff9b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -431,22 +431,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	rcu_read_unlock();
 
 	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		/*
-		 * Preallocation does not hold sighand::siglock so it can't
-		 * use the cache. The lockless caching requires that only
-		 * one consumer and only one producer run at a time.
-		 *
-		 * For the regular allocation case it is sufficient to
-		 * check @q for NULL because this code can only be called
-		 * if the target task @t has not been reaped yet; which
-		 * means this code can never observe the error pointer which is
-		 * written to @t->sigqueue_cache in exit_task_sigqueue_cache().
-		 */
-		q = READ_ONCE(t->sigqueue_cache);
-		if (!q || sigqueue_flags)
-			q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
-		else
-			WRITE_ONCE(t->sigqueue_cache, NULL);
+		q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
 	} else {
 		print_dropped_signal(sig);
 	}
@@ -463,53 +448,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	return q;
 }
 
-void exit_task_sigqueue_cache(struct task_struct *tsk)
-{
-	/* Race free because @tsk is mopped up */
-	struct sigqueue *q = tsk->sigqueue_cache;
-
-	if (q) {
-		/*
-		 * Hand it back to the cache as the task might
-		 * be self reaping which would leak the object.
-		 */
-		 kmem_cache_free(sigqueue_cachep, q);
-	}
-
-	/*
-	 * Set an error pointer to ensure that @tsk will not cache a
-	 * sigqueue when it is reaping it's child tasks
-	 */
-	tsk->sigqueue_cache = ERR_PTR(-1);
-}
-
-static void sigqueue_cache_or_free(struct sigqueue *q)
-{
-	/*
-	 * Cache one sigqueue per task. This pairs with the consumer side
-	 * in __sigqueue_alloc() and needs READ/WRITE_ONCE() to prevent the
-	 * compiler from store tearing and to tell KCSAN that the data race
-	 * is intentional when run without holding current->sighand->siglock,
-	 * which is fine as current obviously cannot run __sigqueue_free()
-	 * concurrently.
-	 *
-	 * The NULL check is safe even if current has been reaped already,
-	 * in which case exit_task_sigqueue_cache() wrote an error pointer
-	 * into current->sigqueue_cache.
-	 */
-	if (!READ_ONCE(current->sigqueue_cache))
-		WRITE_ONCE(current->sigqueue_cache, q);
-	else
-		kmem_cache_free(sigqueue_cachep, q);
-}
-
 static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
 		return;
 	if (atomic_dec_and_test(&q->user->sigpending))
 		free_uid(q->user);
-	sigqueue_cache_or_free(q);
+	kmem_cache_free(sigqueue_cachep, q);
 }
 
 void flush_sigqueue(struct sigpending *queue)
-- 
cgit v1.2.3


From bcda91bf86c1ff7647df85029d69f2aed80f210e Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Wed, 7 Apr 2021 10:01:54 +0200
Subject: pwm: Add a device-managed function to add PWM chips
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This potentially simplifies low-level PWM drivers.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 19 +++++++++++++++++++
 include/linux/pwm.h |  3 +++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index a42999f877d2..83db178f16bb 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -335,6 +335,25 @@ out:
 }
 EXPORT_SYMBOL_GPL(pwmchip_remove);
 
+static void devm_pwmchip_remove(void *data)
+{
+	struct pwm_chip *chip = data;
+
+	pwmchip_remove(chip);
+}
+
+int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip)
+{
+	int ret;
+
+	ret = pwmchip_add(chip);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(dev, devm_pwmchip_remove, chip);
+}
+EXPORT_SYMBOL_GPL(devm_pwmchip_add);
+
 /**
  * pwm_request() - request a PWM device
  * @pwm: global PWM device index
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 5a73251d28e3..892ece4d4cfa 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -405,6 +405,9 @@ void *pwm_get_chip_data(struct pwm_device *pwm);
 
 int pwmchip_add(struct pwm_chip *chip);
 int pwmchip_remove(struct pwm_chip *chip);
+
+int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip);
+
 struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
 					 unsigned int index,
 					 const char *label);
-- 
cgit v1.2.3


From f7d9f6370e006400655ff96cb148f56598492d91 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Mon, 28 Jun 2021 11:45:47 +0200
Subject: trace/osnoise: Fix 'no previous prototype' warnings

kernel test robot reported some osnoise functions with "no previous
prototype."

Fix these warnings by making local functions static, and by adding:

 void osnoise_trace_irq_entry(int id);
 void osnoise_trace_irq_exit(int id, const char *desc);

to include/linux/trace.h.

Link: https://lkml.kernel.org/r/e40d3cb4be8bde921f4b40fa6a095cf85ab807bd.1624872608.git.bristot@redhat.com

Fixes: bce29ac9ce0b ("trace: Add osnoise tracer")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/kernel/trace.c      |  3 ---
 include/linux/trace.h        |  2 ++
 kernel/trace/trace_osnoise.c | 20 +++++++++++---------
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
index 6912672c33a7..6b73b6f92ad3 100644
--- a/arch/x86/kernel/trace.c
+++ b/arch/x86/kernel/trace.c
@@ -2,9 +2,6 @@
 #include <linux/trace.h>
 
 #if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC)
-extern void osnoise_trace_irq_entry(int id);
-extern void osnoise_trace_irq_exit(int id, const char *desc);
-
 /*
  * trace_intel_irq_entry - record intel specific IRQ entry
  */
diff --git a/include/linux/trace.h b/include/linux/trace.h
index 4e3858640c47..bf169612ffe1 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -45,6 +45,8 @@ int trace_array_destroy(struct trace_array *tr);
 /* For osnoise tracer */
 int osnoise_arch_register(void);
 void osnoise_arch_unregister(void);
+void osnoise_trace_irq_entry(int id);
+void osnoise_trace_irq_exit(int id, const char *desc);
 
 #endif	/* CONFIG_TRACING */
 
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 556d530af805..9c3109e3ffeb 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -736,7 +736,7 @@ void __weak osnoise_arch_unregister(void)
  * This function hooks the IRQ related callbacks to the respective trace
  * events.
  */
-int hook_irq_events(void)
+static int hook_irq_events(void)
 {
 	int ret;
 
@@ -768,7 +768,7 @@ out_err:
  * This function unhooks the IRQ related callbacks to the respective trace
  * events.
  */
-void unhook_irq_events(void)
+static void unhook_irq_events(void)
 {
 	osnoise_arch_unregister();
 	unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
@@ -785,7 +785,7 @@ void unhook_irq_events(void)
  * arrival time. The delta_start is used to compute the duration at the
  * softirq exit handler. See cond_move_softirq_delta_start().
  */
-void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
+static void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
 {
 	struct osnoise_variables *osn_var = this_cpu_osn_var();
 
@@ -808,7 +808,7 @@ void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
  * Computes the duration of the softirq noise, and trace it. Also discounts the
  * interference from other sources of noise could be currently being accounted.
  */
-void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
+static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
 {
 	struct osnoise_variables *osn_var = this_cpu_osn_var();
 	int duration;
@@ -949,7 +949,7 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
  * This function is hooked to the sched:sched_switch trace event, and it is
  * used to record the beginning and to report the end of a thread noise window.
  */
-void
+static void
 trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
 			    struct task_struct *n)
 {
@@ -968,7 +968,7 @@ trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
  * Hook the osnoise tracer callbacks to handle the noise from other
  * threads on the necessary kernel events.
  */
-int hook_thread_events(void)
+static int hook_thread_events(void)
 {
 	int ret;
 
@@ -985,7 +985,7 @@ int hook_thread_events(void)
  * Unook the osnoise tracer callbacks to handle the noise from other
  * threads on the necessary kernel events.
  */
-void unhook_thread_events(void)
+static void unhook_thread_events(void)
 {
 	unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
 }
@@ -997,7 +997,8 @@ void unhook_thread_events(void)
  * values will be used later to compute the diff betwneen the statistics
  * before and after the osnoise sampling.
  */
-void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+static void
+save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
 {
 	s->nmi_count = osn_var->nmi.count;
 	s->irq_count = osn_var->irq.count;
@@ -1012,7 +1013,8 @@ void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sam
  * statistics. The struct osnoise_sample *s contains the statistics saved via
  * save_osn_sample_stats() before the osnoise sampling.
  */
-void diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
+static void
+diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
 {
 	s->nmi_count = osn_var->nmi.count - s->nmi_count;
 	s->irq_count = osn_var->irq.count - s->irq_count;
-- 
cgit v1.2.3


From 0d2cfbd41c4a5a0ca5598d1874b1081138cd64c6 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 27 Jun 2021 14:54:25 +0300
Subject: net: bridge: ignore switchdev events for LAG ports which didn't
 request replay

There is a slight inconvenience in the switchdev replay helpers added
recently, and this is when:

ip link add br0 type bridge
ip link add bond0 type bond
ip link set bond0 master br0
bridge vlan add dev bond0 vid 100
ip link set swp0 master bond0
ip link set swp1 master bond0

Since the underlying driver (currently only DSA) asks for a replay of
VLANs when swp0 and swp1 join the LAG because it is bridged, what will
happen is that DSA will try to react twice on the VLAN event for swp0.
This is not really a huge problem right now, because most drivers accept
duplicates since the bridge itself does, but it will become a problem
when we add support for replaying switchdev object deletions.

Let's fix this by adding a blank void *ctx in the replay helpers, which
will be passed on by the bridge in the switchdev notifications. If the
context is NULL, everything is the same as before. But if the context is
populated with a valid pointer, the underlying switchdev driver
(currently DSA) can use the pointer to 'see through' the bridge port
(which in the example above is bond0) and 'know' that the event is only
for a particular physical port offloading that bridge port, and not for
all of them.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot_net.c | 19 +++++++++++++++++--
 include/linux/if_bridge.h              | 14 ++++++++------
 net/bridge/br_fdb.c                    |  7 ++++---
 net/bridge/br_mdb.c                    |  8 +++++---
 net/bridge/br_vlan.c                   |  8 +++++---
 net/dsa/port.c                         |  6 +++---
 net/dsa/slave.c                        |  9 +++++++++
 7 files changed, 51 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 456541640feb..166d851962d2 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -948,6 +948,9 @@ static int ocelot_port_attr_set(struct net_device *dev, const void *ctx,
 	int port = priv->chip_port;
 	int err = 0;
 
+	if (ctx && ctx != priv)
+		return 0;
+
 	switch (attr->id) {
 	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
 		ocelot_port_attr_stp_state_set(ocelot, port, attr->u.stp_state);
@@ -1062,8 +1065,12 @@ static int ocelot_port_obj_add(struct net_device *dev, const void *ctx,
 			       const struct switchdev_obj *obj,
 			       struct netlink_ext_ack *extack)
 {
+	struct ocelot_port_private *priv = netdev_priv(dev);
 	int ret = 0;
 
+	if (ctx && ctx != priv)
+		return 0;
+
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
 		ret = ocelot_port_obj_add_vlan(dev,
@@ -1089,8 +1096,12 @@ static int ocelot_port_obj_add(struct net_device *dev, const void *ctx,
 static int ocelot_port_obj_del(struct net_device *dev, const void *ctx,
 			       const struct switchdev_obj *obj)
 {
+	struct ocelot_port_private *priv = netdev_priv(dev);
 	int ret = 0;
 
+	if (ctx && ctx != priv)
+		return 0;
+
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
 		ret = ocelot_vlan_vid_del(dev,
@@ -1143,10 +1154,14 @@ static int ocelot_switchdev_sync(struct ocelot *ocelot, int port,
 				 struct net_device *bridge_dev,
 				 struct netlink_ext_ack *extack)
 {
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct ocelot_port_private *priv;
 	clock_t ageing_time;
 	u8 stp_state;
 	int err;
 
+	priv = container_of(ocelot_port, struct ocelot_port_private, port);
+
 	ocelot_inherit_brport_flags(ocelot, port, brport_dev);
 
 	stp_state = br_port_get_stp_state(brport_dev);
@@ -1160,12 +1175,12 @@ static int ocelot_switchdev_sync(struct ocelot *ocelot, int port,
 	ageing_time = br_get_ageing_time(bridge_dev);
 	ocelot_port_attr_ageing_set(ocelot, port, ageing_time);
 
-	err = br_mdb_replay(bridge_dev, brport_dev,
+	err = br_mdb_replay(bridge_dev, brport_dev, priv,
 			    &ocelot_switchdev_blocking_nb, extack);
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_vlan_replay(bridge_dev, brport_dev,
+	err = br_vlan_replay(bridge_dev, brport_dev, priv,
 			     &ocelot_switchdev_blocking_nb, extack);
 	if (err && err != -EOPNOTSUPP)
 		return err;
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 12e9a32dbca0..57df761b6f4a 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -71,7 +71,8 @@ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto);
 bool br_multicast_enabled(const struct net_device *dev);
 bool br_multicast_router(const struct net_device *dev);
 int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
-		  struct notifier_block *nb, struct netlink_ext_ack *extack);
+		  const void *ctx, struct notifier_block *nb,
+		  struct netlink_ext_ack *extack);
 #else
 static inline int br_multicast_list_adjacent(struct net_device *dev,
 					     struct list_head *br_ip_list)
@@ -104,7 +105,7 @@ static inline bool br_multicast_router(const struct net_device *dev)
 	return false;
 }
 static inline int br_mdb_replay(struct net_device *br_dev,
-				struct net_device *dev,
+				struct net_device *dev, const void *ctx,
 				struct notifier_block *nb,
 				struct netlink_ext_ack *extack)
 {
@@ -120,7 +121,8 @@ int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
 int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
-		   struct notifier_block *nb, struct netlink_ext_ack *extack);
+		   const void *ctx, struct notifier_block *nb,
+		   struct netlink_ext_ack *extack);
 #else
 static inline bool br_vlan_enabled(const struct net_device *dev)
 {
@@ -149,7 +151,7 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
 }
 
 static inline int br_vlan_replay(struct net_device *br_dev,
-				 struct net_device *dev,
+				 struct net_device *dev, const void *ctx,
 				 struct notifier_block *nb,
 				 struct netlink_ext_ack *extack)
 {
@@ -166,7 +168,7 @@ bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
 u8 br_port_get_stp_state(const struct net_device *dev);
 clock_t br_get_ageing_time(struct net_device *br_dev);
 int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
-		  struct notifier_block *nb);
+		  const void *ctx, struct notifier_block *nb);
 #else
 static inline struct net_device *
 br_fdb_find_port(const struct net_device *br_dev,
@@ -197,7 +199,7 @@ static inline clock_t br_get_ageing_time(struct net_device *br_dev)
 }
 
 static inline int br_fdb_replay(struct net_device *br_dev,
-				struct net_device *dev,
+				struct net_device *dev, const void *ctx,
 				struct notifier_block *nb)
 {
 	return -EOPNOTSUPP;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b8d3ddfe5853..9d164a518e38 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -728,7 +728,7 @@ static inline size_t fdb_nlmsg_size(void)
 
 static int br_fdb_replay_one(struct notifier_block *nb,
 			     struct net_bridge_fdb_entry *fdb,
-			     struct net_device *dev)
+			     struct net_device *dev, const void *ctx)
 {
 	struct switchdev_notifier_fdb_info item;
 	int err;
@@ -739,13 +739,14 @@ static int br_fdb_replay_one(struct notifier_block *nb,
 	item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags);
 	item.is_local = test_bit(BR_FDB_LOCAL, &fdb->flags);
 	item.info.dev = dev;
+	item.info.ctx = ctx;
 
 	err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item);
 	return notifier_to_errno(err);
 }
 
 int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
-		  struct notifier_block *nb)
+		  const void *ctx, struct notifier_block *nb)
 {
 	struct net_bridge_fdb_entry *fdb;
 	struct net_bridge *br;
@@ -766,7 +767,7 @@ int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
 		if (dst_dev != br_dev && dst_dev != dev)
 			continue;
 
-		err = br_fdb_replay_one(nb, fdb, dst_dev);
+		err = br_fdb_replay_one(nb, fdb, dst_dev, ctx);
 		if (err)
 			break;
 	}
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 3f839a8cc9fb..8bc6afca5e8c 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -568,12 +568,13 @@ static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb,
 
 static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
 			     struct switchdev_obj_port_mdb *mdb,
-			     struct netlink_ext_ack *extack)
+			     const void *ctx, struct netlink_ext_ack *extack)
 {
 	struct switchdev_notifier_port_obj_info obj_info = {
 		.info = {
 			.dev = dev,
 			.extack = extack,
+			.ctx = ctx,
 		},
 		.obj = &mdb->obj,
 	};
@@ -603,7 +604,8 @@ static int br_mdb_queue_one(struct list_head *mdb_list,
 }
 
 int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
-		  struct notifier_block *nb, struct netlink_ext_ack *extack)
+		  const void *ctx, struct notifier_block *nb,
+		  struct netlink_ext_ack *extack)
 {
 	struct net_bridge_mdb_entry *mp;
 	struct switchdev_obj *obj, *tmp;
@@ -664,7 +666,7 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
 
 	list_for_each_entry(obj, &mdb_list, list) {
 		err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj),
-					extack);
+					ctx, extack);
 		if (err)
 			goto out_free_mdb;
 	}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8789a57af543..2bfa2a00e193 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1807,12 +1807,13 @@ out_kfree:
 static int br_vlan_replay_one(struct notifier_block *nb,
 			      struct net_device *dev,
 			      struct switchdev_obj_port_vlan *vlan,
-			      struct netlink_ext_ack *extack)
+			      const void *ctx, struct netlink_ext_ack *extack)
 {
 	struct switchdev_notifier_port_obj_info obj_info = {
 		.info = {
 			.dev = dev,
 			.extack = extack,
+			.ctx = ctx,
 		},
 		.obj = &vlan->obj,
 	};
@@ -1823,7 +1824,8 @@ static int br_vlan_replay_one(struct notifier_block *nb,
 }
 
 int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
-		   struct notifier_block *nb, struct netlink_ext_ack *extack)
+		   const void *ctx, struct notifier_block *nb,
+		   struct netlink_ext_ack *extack)
 {
 	struct net_bridge_vlan_group *vg;
 	struct net_bridge_vlan *v;
@@ -1868,7 +1870,7 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
 		if (!br_vlan_should_use(v))
 			continue;
 
-		err = br_vlan_replay_one(nb, dev, &vlan, extack);
+		err = br_vlan_replay_one(nb, dev, &vlan, ctx, extack);
 		if (err)
 			return err;
 	}
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 5c93f1e1a03d..339781c98de1 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -194,17 +194,17 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp,
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_mdb_replay(br, brport_dev,
+	err = br_mdb_replay(br, brport_dev, dp,
 			    &dsa_slave_switchdev_blocking_notifier,
 			    extack);
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_fdb_replay(br, brport_dev, &dsa_slave_switchdev_notifier);
+	err = br_fdb_replay(br, brport_dev, dp, &dsa_slave_switchdev_notifier);
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_vlan_replay(br, brport_dev,
+	err = br_vlan_replay(br, brport_dev, dp,
 			     &dsa_slave_switchdev_blocking_notifier,
 			     extack);
 	if (err && err != -EOPNOTSUPP)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 3692259a025f..2f0d0a6b1f9c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -278,6 +278,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx,
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	int ret;
 
+	if (ctx && ctx != dp)
+		return 0;
+
 	switch (attr->id) {
 	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
 		if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
@@ -401,6 +404,9 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const void *ctx,
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	int err;
 
+	if (ctx && ctx != dp)
+		return 0;
+
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
 		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
@@ -475,6 +481,9 @@ static int dsa_slave_port_obj_del(struct net_device *dev, const void *ctx,
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	int err;
 
+	if (ctx && ctx != dp)
+		return 0;
+
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
 		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
-- 
cgit v1.2.3


From bdf123b455ce596aec6e410ec36fe3687b6a2140 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 27 Jun 2021 14:54:26 +0300
Subject: net: bridge: constify variables in the replay helpers

Some of the arguments and local variables for the newly added switchdev
replay helpers can be const, so let's make them so.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h | 14 +++++++-------
 net/bridge/br_fdb.c       |  6 +++---
 net/bridge/br_mdb.c       |  8 ++++----
 net/bridge/br_stp.c       |  4 ++--
 4 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 57df761b6f4a..6b54da2c65ba 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -104,8 +104,8 @@ static inline bool br_multicast_router(const struct net_device *dev)
 {
 	return false;
 }
-static inline int br_mdb_replay(struct net_device *br_dev,
-				struct net_device *dev, const void *ctx,
+static inline int br_mdb_replay(const struct net_device *br_dev,
+				const struct net_device *dev, const void *ctx,
 				struct notifier_block *nb,
 				struct netlink_ext_ack *extack)
 {
@@ -166,8 +166,8 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev,
 void br_fdb_clear_offload(const struct net_device *dev, u16 vid);
 bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
 u8 br_port_get_stp_state(const struct net_device *dev);
-clock_t br_get_ageing_time(struct net_device *br_dev);
-int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
+clock_t br_get_ageing_time(const struct net_device *br_dev);
+int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
 		  const void *ctx, struct notifier_block *nb);
 #else
 static inline struct net_device *
@@ -193,13 +193,13 @@ static inline u8 br_port_get_stp_state(const struct net_device *dev)
 	return BR_STATE_DISABLED;
 }
 
-static inline clock_t br_get_ageing_time(struct net_device *br_dev)
+static inline clock_t br_get_ageing_time(const struct net_device *br_dev)
 {
 	return 0;
 }
 
-static inline int br_fdb_replay(struct net_device *br_dev,
-				struct net_device *dev, const void *ctx,
+static inline int br_fdb_replay(const struct net_device *br_dev,
+				const struct net_device *dev, const void *ctx,
 				struct notifier_block *nb)
 {
 	return -EOPNOTSUPP;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 9d164a518e38..2e777c8b0921 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -727,7 +727,7 @@ static inline size_t fdb_nlmsg_size(void)
 }
 
 static int br_fdb_replay_one(struct notifier_block *nb,
-			     struct net_bridge_fdb_entry *fdb,
+			     const struct net_bridge_fdb_entry *fdb,
 			     struct net_device *dev, const void *ctx)
 {
 	struct switchdev_notifier_fdb_info item;
@@ -745,7 +745,7 @@ static int br_fdb_replay_one(struct notifier_block *nb,
 	return notifier_to_errno(err);
 }
 
-int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
+int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
 		  const void *ctx, struct notifier_block *nb)
 {
 	struct net_bridge_fdb_entry *fdb;
@@ -760,7 +760,7 @@ int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) {
-		struct net_bridge_port *dst = READ_ONCE(fdb->dst);
+		const struct net_bridge_port *dst = READ_ONCE(fdb->dst);
 		struct net_device *dst_dev;
 
 		dst_dev = dst ? dst->dev : br->dev;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 8bc6afca5e8c..cebdbff17b54 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -567,7 +567,7 @@ static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb,
 }
 
 static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
-			     struct switchdev_obj_port_mdb *mdb,
+			     const struct switchdev_obj_port_mdb *mdb,
 			     const void *ctx, struct netlink_ext_ack *extack)
 {
 	struct switchdev_notifier_port_obj_info obj_info = {
@@ -607,7 +607,7 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
 		  const void *ctx, struct notifier_block *nb,
 		  struct netlink_ext_ack *extack)
 {
-	struct net_bridge_mdb_entry *mp;
+	const struct net_bridge_mdb_entry *mp;
 	struct switchdev_obj *obj, *tmp;
 	struct net_bridge *br;
 	LIST_HEAD(mdb_list);
@@ -634,8 +634,8 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
-		struct net_bridge_port_group __rcu **pp;
-		struct net_bridge_port_group *p;
+		struct net_bridge_port_group __rcu * const *pp;
+		const struct net_bridge_port_group *p;
 
 		if (mp->host_joined) {
 			err = br_mdb_queue_one(&mdb_list,
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 3dafb6143cff..1d80f34a139c 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -639,9 +639,9 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 	return 0;
 }
 
-clock_t br_get_ageing_time(struct net_device *br_dev)
+clock_t br_get_ageing_time(const struct net_device *br_dev)
 {
-	struct net_bridge *br;
+	const struct net_bridge *br;
 
 	if (!netif_is_bridge_master(br_dev))
 		return 0;
-- 
cgit v1.2.3


From 7e8c18586daf7c1653c4b43a8119bc9662ed8fa6 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 27 Jun 2021 14:54:27 +0300
Subject: net: bridge: allow the switchdev replay functions to be called for
 deletion

When a switchdev port leaves a LAG that is a bridge port, the switchdev
objects and port attributes offloaded to that port are not removed:

ip link add br0 type bridge
ip link add bond0 type bond mode 802.3ad
ip link set swp0 master bond0
ip link set bond0 master br0
bridge vlan add dev bond0 vid 100
ip link set swp0 nomaster

VLAN 100 will remain installed on swp0 despite it going into standalone
mode, because as far as the bridge is concerned, nothing ever happened
to its bridge port.

Let's extend the bridge vlan, fdb and mdb replay functions to take a
'bool adding' argument, and make DSA and ocelot call the replay
functions with 'adding' as false from the switchdev unsync path, for the
switch port that leaves the bridge.

Note that this patch in itself does not salvage anything, because in the
current pull mode of operation, DSA still needs to call the replay
helpers with adding=false. This will be done in another patch.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot_net.c |  4 ++--
 include/linux/if_bridge.h              | 12 ++++++------
 net/bridge/br_fdb.c                    | 15 +++++++++++----
 net/bridge/br_mdb.c                    | 15 +++++++++++----
 net/bridge/br_vlan.c                   | 15 +++++++++++----
 net/dsa/port.c                         | 13 ++++++-------
 6 files changed, 47 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 166d851962d2..3e89e34f86d5 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1175,12 +1175,12 @@ static int ocelot_switchdev_sync(struct ocelot *ocelot, int port,
 	ageing_time = br_get_ageing_time(bridge_dev);
 	ocelot_port_attr_ageing_set(ocelot, port, ageing_time);
 
-	err = br_mdb_replay(bridge_dev, brport_dev, priv,
+	err = br_mdb_replay(bridge_dev, brport_dev, priv, true,
 			    &ocelot_switchdev_blocking_nb, extack);
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_vlan_replay(bridge_dev, brport_dev, priv,
+	err = br_vlan_replay(bridge_dev, brport_dev, priv, true,
 			     &ocelot_switchdev_blocking_nb, extack);
 	if (err && err != -EOPNOTSUPP)
 		return err;
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 6b54da2c65ba..b651c5e32a28 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -71,7 +71,7 @@ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto);
 bool br_multicast_enabled(const struct net_device *dev);
 bool br_multicast_router(const struct net_device *dev);
 int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
-		  const void *ctx, struct notifier_block *nb,
+		  const void *ctx, bool adding, struct notifier_block *nb,
 		  struct netlink_ext_ack *extack);
 #else
 static inline int br_multicast_list_adjacent(struct net_device *dev,
@@ -106,7 +106,7 @@ static inline bool br_multicast_router(const struct net_device *dev)
 }
 static inline int br_mdb_replay(const struct net_device *br_dev,
 				const struct net_device *dev, const void *ctx,
-				struct notifier_block *nb,
+				bool adding, struct notifier_block *nb,
 				struct netlink_ext_ack *extack)
 {
 	return -EOPNOTSUPP;
@@ -121,7 +121,7 @@ int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
 int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
-		   const void *ctx, struct notifier_block *nb,
+		   const void *ctx, bool adding, struct notifier_block *nb,
 		   struct netlink_ext_ack *extack);
 #else
 static inline bool br_vlan_enabled(const struct net_device *dev)
@@ -152,7 +152,7 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
 
 static inline int br_vlan_replay(struct net_device *br_dev,
 				 struct net_device *dev, const void *ctx,
-				 struct notifier_block *nb,
+				 bool adding, struct notifier_block *nb,
 				 struct netlink_ext_ack *extack)
 {
 	return -EOPNOTSUPP;
@@ -168,7 +168,7 @@ bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
 u8 br_port_get_stp_state(const struct net_device *dev);
 clock_t br_get_ageing_time(const struct net_device *br_dev);
 int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
-		  const void *ctx, struct notifier_block *nb);
+		  const void *ctx, bool adding, struct notifier_block *nb);
 #else
 static inline struct net_device *
 br_fdb_find_port(const struct net_device *br_dev,
@@ -200,7 +200,7 @@ static inline clock_t br_get_ageing_time(const struct net_device *br_dev)
 
 static inline int br_fdb_replay(const struct net_device *br_dev,
 				const struct net_device *dev, const void *ctx,
-				struct notifier_block *nb)
+				bool adding, struct notifier_block *nb)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 2e777c8b0921..16f9434fdb5d 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -728,7 +728,8 @@ static inline size_t fdb_nlmsg_size(void)
 
 static int br_fdb_replay_one(struct notifier_block *nb,
 			     const struct net_bridge_fdb_entry *fdb,
-			     struct net_device *dev, const void *ctx)
+			     struct net_device *dev, unsigned long action,
+			     const void *ctx)
 {
 	struct switchdev_notifier_fdb_info item;
 	int err;
@@ -741,15 +742,16 @@ static int br_fdb_replay_one(struct notifier_block *nb,
 	item.info.dev = dev;
 	item.info.ctx = ctx;
 
-	err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item);
+	err = nb->notifier_call(nb, action, &item);
 	return notifier_to_errno(err);
 }
 
 int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
-		  const void *ctx, struct notifier_block *nb)
+		  const void *ctx, bool adding, struct notifier_block *nb)
 {
 	struct net_bridge_fdb_entry *fdb;
 	struct net_bridge *br;
+	unsigned long action;
 	int err = 0;
 
 	if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev))
@@ -757,6 +759,11 @@ int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
 
 	br = netdev_priv(br_dev);
 
+	if (adding)
+		action = SWITCHDEV_FDB_ADD_TO_DEVICE;
+	else
+		action = SWITCHDEV_FDB_DEL_TO_DEVICE;
+
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) {
@@ -767,7 +774,7 @@ int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev,
 		if (dst_dev != br_dev && dst_dev != dev)
 			continue;
 
-		err = br_fdb_replay_one(nb, fdb, dst_dev, ctx);
+		err = br_fdb_replay_one(nb, fdb, dst_dev, action, ctx);
 		if (err)
 			break;
 	}
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index cebdbff17b54..17a720b4473f 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -568,7 +568,8 @@ static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb,
 
 static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
 			     const struct switchdev_obj_port_mdb *mdb,
-			     const void *ctx, struct netlink_ext_ack *extack)
+			     unsigned long action, const void *ctx,
+			     struct netlink_ext_ack *extack)
 {
 	struct switchdev_notifier_port_obj_info obj_info = {
 		.info = {
@@ -580,7 +581,7 @@ static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
 	};
 	int err;
 
-	err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info);
+	err = nb->notifier_call(nb, action, &obj_info);
 	return notifier_to_errno(err);
 }
 
@@ -604,12 +605,13 @@ static int br_mdb_queue_one(struct list_head *mdb_list,
 }
 
 int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
-		  const void *ctx, struct notifier_block *nb,
+		  const void *ctx, bool adding, struct notifier_block *nb,
 		  struct netlink_ext_ack *extack)
 {
 	const struct net_bridge_mdb_entry *mp;
 	struct switchdev_obj *obj, *tmp;
 	struct net_bridge *br;
+	unsigned long action;
 	LIST_HEAD(mdb_list);
 	int err = 0;
 
@@ -664,9 +666,14 @@ int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
 
 	rcu_read_unlock();
 
+	if (adding)
+		action = SWITCHDEV_PORT_OBJ_ADD;
+	else
+		action = SWITCHDEV_PORT_OBJ_DEL;
+
 	list_for_each_entry(obj, &mdb_list, list) {
 		err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj),
-					ctx, extack);
+					action, ctx, extack);
 		if (err)
 			goto out_free_mdb;
 	}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 2bfa2a00e193..a08e9f193009 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1807,7 +1807,8 @@ out_kfree:
 static int br_vlan_replay_one(struct notifier_block *nb,
 			      struct net_device *dev,
 			      struct switchdev_obj_port_vlan *vlan,
-			      const void *ctx, struct netlink_ext_ack *extack)
+			      const void *ctx, unsigned long action,
+			      struct netlink_ext_ack *extack)
 {
 	struct switchdev_notifier_port_obj_info obj_info = {
 		.info = {
@@ -1819,18 +1820,19 @@ static int br_vlan_replay_one(struct notifier_block *nb,
 	};
 	int err;
 
-	err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info);
+	err = nb->notifier_call(nb, action, &obj_info);
 	return notifier_to_errno(err);
 }
 
 int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
-		   const void *ctx, struct notifier_block *nb,
+		   const void *ctx, bool adding, struct notifier_block *nb,
 		   struct netlink_ext_ack *extack)
 {
 	struct net_bridge_vlan_group *vg;
 	struct net_bridge_vlan *v;
 	struct net_bridge_port *p;
 	struct net_bridge *br;
+	unsigned long action;
 	int err = 0;
 	u16 pvid;
 
@@ -1857,6 +1859,11 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
 	if (!vg)
 		return 0;
 
+	if (adding)
+		action = SWITCHDEV_PORT_OBJ_ADD;
+	else
+		action = SWITCHDEV_PORT_OBJ_DEL;
+
 	pvid = br_get_pvid(vg);
 
 	list_for_each_entry(v, &vg->vlan_list, vlist) {
@@ -1870,7 +1877,7 @@ int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
 		if (!br_vlan_should_use(v))
 			continue;
 
-		err = br_vlan_replay_one(nb, dev, &vlan, ctx, extack);
+		err = br_vlan_replay_one(nb, dev, &vlan, ctx, action, extack);
 		if (err)
 			return err;
 	}
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 339781c98de1..4e58d07ececd 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -194,19 +194,18 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp,
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_mdb_replay(br, brport_dev, dp,
-			    &dsa_slave_switchdev_blocking_notifier,
-			    extack);
+	err = br_mdb_replay(br, brport_dev, dp, true,
+			    &dsa_slave_switchdev_blocking_notifier, extack);
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_fdb_replay(br, brport_dev, dp, &dsa_slave_switchdev_notifier);
+	err = br_fdb_replay(br, brport_dev, dp, true,
+			    &dsa_slave_switchdev_notifier);
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	err = br_vlan_replay(br, brport_dev, dp,
-			     &dsa_slave_switchdev_blocking_notifier,
-			     extack);
+	err = br_vlan_replay(br, brport_dev, dp, true,
+			     &dsa_slave_switchdev_blocking_notifier, extack);
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-- 
cgit v1.2.3


From a358f40600b3b39ae3906b6118625b99c0aa7a34 Mon Sep 17 00:00:00 2001
From: Tanner Love <tannerlove@google.com>
Date: Mon, 28 Jun 2021 09:50:06 -0400
Subject: once: implement DO_ONCE_LITE for non-fast-path "do once"
 functionality

Certain uses of "do once" functionality reside outside of fast path,
and so do not require jump label patching via static keys, making
existing DO_ONCE undesirable in such cases.

Replace uses of __section(".data.once") with DO_ONCE_LITE(_IF)?

This patch changes the return values of xfs_printk_once, printk_once,
and printk_deferred_once. Before, they returned whether the print was
performed, but now, they always return true. This is okay because the
return values of the following macros are entirely ignored throughout
the kernel:
- xfs_printk_once
- xfs_warn_once
- xfs_notice_once
- xfs_info_once
- printk_once
- pr_emerg_once
- pr_alert_once
- pr_crit_once
- pr_err_once
- pr_warn_once
- pr_notice_once
- pr_info_once
- pr_devel_once
- pr_debug_once
- printk_deferred_once
- orc_warn

Changes
v3:
  - Expand commit message to explain why changing return values of
    xfs_printk_once, printk_once, printk_deferred_once is benign
v2:
  - Fix i386 build warnings

Signed-off-by: Tanner Love <tannerlove@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Mahesh Bandewar <maheshb@google.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/xfs/xfs_message.h      | 13 +++----------
 include/asm-generic/bug.h | 37 +++++++------------------------------
 include/linux/once_lite.h | 24 ++++++++++++++++++++++++
 include/linux/printk.h    | 23 +++--------------------
 kernel/trace/trace.h      | 13 +++----------
 5 files changed, 40 insertions(+), 70 deletions(-)
 create mode 100644 include/linux/once_lite.h

(limited to 'include/linux')

diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 7ec1a9207517..bb9860ec9a93 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -2,6 +2,8 @@
 #ifndef __XFS_MESSAGE_H
 #define __XFS_MESSAGE_H 1
 
+#include <linux/once_lite.h>
+
 struct xfs_mount;
 
 extern __printf(2, 3)
@@ -41,16 +43,7 @@ do {									\
 } while (0)
 
 #define xfs_printk_once(func, dev, fmt, ...)			\
-({								\
-	static bool __section(".data.once") __print_once;	\
-	bool __ret_print_once = !__print_once; 			\
-								\
-	if (!__print_once) {					\
-		__print_once = true;				\
-		func(dev, fmt, ##__VA_ARGS__);			\
-	}							\
-	unlikely(__ret_print_once);				\
-})
+	DO_ONCE_LITE(func, dev, fmt, ##__VA_ARGS__)
 
 #define xfs_emerg_ratelimited(dev, fmt, ...)				\
 	xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index b402494883b6..bafc51f483c4 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -4,6 +4,7 @@
 
 #include <linux/compiler.h>
 #include <linux/instrumentation.h>
+#include <linux/once_lite.h>
 
 #define CUT_HERE		"------------[ cut here ]------------\n"
 
@@ -140,39 +141,15 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 })
 
 #ifndef WARN_ON_ONCE
-#define WARN_ON_ONCE(condition)	({				\
-	static bool __section(".data.once") __warned;		\
-	int __ret_warn_once = !!(condition);			\
-								\
-	if (unlikely(__ret_warn_once && !__warned)) {		\
-		__warned = true;				\
-		WARN_ON(1);					\
-	}							\
-	unlikely(__ret_warn_once);				\
-})
+#define WARN_ON_ONCE(condition)					\
+	DO_ONCE_LITE_IF(condition, WARN_ON, 1)
 #endif
 
-#define WARN_ONCE(condition, format...)	({			\
-	static bool __section(".data.once") __warned;		\
-	int __ret_warn_once = !!(condition);			\
-								\
-	if (unlikely(__ret_warn_once && !__warned)) {		\
-		__warned = true;				\
-		WARN(1, format);				\
-	}							\
-	unlikely(__ret_warn_once);				\
-})
+#define WARN_ONCE(condition, format...)				\
+	DO_ONCE_LITE_IF(condition, WARN, 1, format)
 
-#define WARN_TAINT_ONCE(condition, taint, format...)	({	\
-	static bool __section(".data.once") __warned;		\
-	int __ret_warn_once = !!(condition);			\
-								\
-	if (unlikely(__ret_warn_once && !__warned)) {		\
-		__warned = true;				\
-		WARN_TAINT(1, taint, format);			\
-	}							\
-	unlikely(__ret_warn_once);				\
-})
+#define WARN_TAINT_ONCE(condition, taint, format...)		\
+	DO_ONCE_LITE_IF(condition, WARN_TAINT, 1, taint, format)
 
 #else /* !CONFIG_BUG */
 #ifndef HAVE_ARCH_BUG
diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h
new file mode 100644
index 000000000000..861e606b820f
--- /dev/null
+++ b/include/linux/once_lite.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ONCE_LITE_H
+#define _LINUX_ONCE_LITE_H
+
+#include <linux/types.h>
+
+/* Call a function once. Similar to DO_ONCE(), but does not use jump label
+ * patching via static keys.
+ */
+#define DO_ONCE_LITE(func, ...)						\
+	DO_ONCE_LITE_IF(true, func, ##__VA_ARGS__)
+#define DO_ONCE_LITE_IF(condition, func, ...)				\
+	({								\
+		static bool __section(".data.once") __already_done;	\
+		bool __ret_do_once = !!(condition);			\
+									\
+		if (unlikely(__ret_do_once && !__already_done)) {	\
+			__already_done = true;				\
+			func(__VA_ARGS__);				\
+		}							\
+		unlikely(__ret_do_once);				\
+	})
+
+#endif /* _LINUX_ONCE_LITE_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index fe7eb2351610..885379a1c9a1 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -8,6 +8,7 @@
 #include <linux/linkage.h>
 #include <linux/cache.h>
 #include <linux/ratelimit_types.h>
+#include <linux/once_lite.h>
 
 extern const char linux_banner[];
 extern const char linux_proc_banner[];
@@ -436,27 +437,9 @@ extern int kptr_restrict;
 
 #ifdef CONFIG_PRINTK
 #define printk_once(fmt, ...)					\
-({								\
-	static bool __section(".data.once") __print_once;	\
-	bool __ret_print_once = !__print_once;			\
-								\
-	if (!__print_once) {					\
-		__print_once = true;				\
-		printk(fmt, ##__VA_ARGS__);			\
-	}							\
-	unlikely(__ret_print_once);				\
-})
+	DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__)
 #define printk_deferred_once(fmt, ...)				\
-({								\
-	static bool __section(".data.once") __print_once;	\
-	bool __ret_print_once = !__print_once;			\
-								\
-	if (!__print_once) {					\
-		__print_once = true;				\
-		printk_deferred(fmt, ##__VA_ARGS__);		\
-	}							\
-	unlikely(__ret_print_once);				\
-})
+	DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__)
 #else
 #define printk_once(fmt, ...)					\
 	no_printk(fmt, ##__VA_ARGS__)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cd80d046c7a5..d5d8c088a55d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -20,6 +20,7 @@
 #include <linux/irq_work.h>
 #include <linux/workqueue.h>
 #include <linux/ctype.h>
+#include <linux/once_lite.h>
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 #include <asm/unistd.h>		/* For NR_SYSCALLS	     */
@@ -99,16 +100,8 @@ enum trace_type {
 #include "trace_entries.h"
 
 /* Use this for memory failure errors */
-#define MEM_FAIL(condition, fmt, ...) ({			\
-	static bool __section(".data.once") __warned;		\
-	int __ret_warn_once = !!(condition);			\
-								\
-	if (unlikely(__ret_warn_once && !__warned)) {		\
-		__warned = true;				\
-		pr_err("ERROR: " fmt, ##__VA_ARGS__);		\
-	}							\
-	unlikely(__ret_warn_once);				\
-})
+#define MEM_FAIL(condition, fmt, ...)					\
+	DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__)
 
 /*
  * syscalls are special, and need special handling, this is why
-- 
cgit v1.2.3


From 9913d5745bd720c4266805c8d29952a3702e4eca Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 29 Jun 2021 09:40:10 -0400
Subject: tracepoint: Add tracepoint_probe_register_may_exist() for BPF tracing

All internal use cases for tracepoint_probe_register() is set to not ever
be called with the same function and data. If it is, it is considered a
bug, as that means the accounting of handling tracepoints is corrupted.
If the function and data for a tracepoint is already registered when
tracepoint_probe_register() is called, it will call WARN_ON_ONCE() and
return with EEXISTS.

The BPF system call can end up calling tracepoint_probe_register() with
the same data, which now means that this can trigger the warning because
of a user space process. As WARN_ON_ONCE() should not be called because
user space called a system call with bad data, there needs to be a way to
register a tracepoint without triggering a warning.

Enter tracepoint_probe_register_may_exist(), which can be called, but will
not cause a WARN_ON() if the probe already exists. It will still error out
with EEXIST, which will then be sent to the user space that performed the
BPF system call.

This keeps the previous testing for issues with other users of the
tracepoint code, while letting BPF call it with duplicated data and not
warn about it.

Link: https://lore.kernel.org/lkml/20210626135845.4080-1-penguin-kernel@I-love.SAKURA.ne.jp/
Link: https://syzkaller.appspot.com/bug?id=41f4318cf01762389f4d1c1c459da4f542fe5153

Cc: stable@vger.kernel.org
Fixes: c4f6699dfcb85 ("bpf: introduce BPF_RAW_TRACEPOINT")
Reported-by: syzbot <syzbot+721aa903751db87aa244@syzkaller.appspotmail.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot+721aa903751db87aa244@syzkaller.appspotmail.com
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 10 ++++++++++
 kernel/trace/bpf_trace.c   |  3 ++-
 kernel/tracepoint.c        | 33 ++++++++++++++++++++++++++++++---
 3 files changed, 42 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 13f65420f188..ab58696d0ddd 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -41,7 +41,17 @@ extern int
 tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data,
 			       int prio);
 extern int
+tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data,
+					 int prio);
+extern int
 tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data);
+static inline int
+tracepoint_probe_register_may_exist(struct tracepoint *tp, void *probe,
+				    void *data)
+{
+	return tracepoint_probe_register_prio_may_exist(tp, probe, data,
+							TRACEPOINT_DEFAULT_PRIO);
+}
 extern void
 for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
 		void *priv);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7a52bc172841..f0568b3d6bd1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1840,7 +1840,8 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
 	if (prog->aux->max_tp_access > btp->writable_size)
 		return -EINVAL;
 
-	return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
+	return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
+						   prog);
 }
 
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 9f478d29b926..976bf8ce8039 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -273,7 +273,8 @@ static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func
  * Add the probe function to a tracepoint.
  */
 static int tracepoint_add_func(struct tracepoint *tp,
-			       struct tracepoint_func *func, int prio)
+			       struct tracepoint_func *func, int prio,
+			       bool warn)
 {
 	struct tracepoint_func *old, *tp_funcs;
 	int ret;
@@ -288,7 +289,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
 			lockdep_is_held(&tracepoints_mutex));
 	old = func_add(&tp_funcs, func, prio);
 	if (IS_ERR(old)) {
-		WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+		WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM);
 		return PTR_ERR(old);
 	}
 
@@ -343,6 +344,32 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 	return 0;
 }
 
+/**
+ * tracepoint_probe_register_prio_may_exist -  Connect a probe to a tracepoint with priority
+ * @tp: tracepoint
+ * @probe: probe handler
+ * @data: tracepoint data
+ * @prio: priority of this function over other registered functions
+ *
+ * Same as tracepoint_probe_register_prio() except that it will not warn
+ * if the tracepoint is already registered.
+ */
+int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe,
+					     void *data, int prio)
+{
+	struct tracepoint_func tp_func;
+	int ret;
+
+	mutex_lock(&tracepoints_mutex);
+	tp_func.func = probe;
+	tp_func.data = data;
+	tp_func.prio = prio;
+	ret = tracepoint_add_func(tp, &tp_func, prio, false);
+	mutex_unlock(&tracepoints_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist);
+
 /**
  * tracepoint_probe_register_prio -  Connect a probe to a tracepoint with priority
  * @tp: tracepoint
@@ -366,7 +393,7 @@ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
 	tp_func.func = probe;
 	tp_func.data = data;
 	tp_func.prio = prio;
-	ret = tracepoint_add_func(tp, &tp_func, prio);
+	ret = tracepoint_add_func(tp, &tp_func, prio, true);
 	mutex_unlock(&tracepoints_mutex);
 	return ret;
 }
-- 
cgit v1.2.3


From 82c850c12fc250bdba25e7e66f54adab2ffcfcd6 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 27 Jun 2021 17:40:12 -0700
Subject: <linux/dma-resv.h>: correct a function name in kernel-doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix kernel-doc function name warning:

../include/linux/dma-resv.h:227: warning: expecting prototype for dma_resv_exclusive(). Prototype was for dma_resv_excl_fence() instead

Fixes: 6edbd6abb783d ("dma-buf: rename and cleanup dma_resv_get_excl v3")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Christian König <christian.koenig@amd.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linaro-mm-sig@lists.linaro.org
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210628004012.6792-1-rdunlap@infradead.org
---
 include/linux/dma-resv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h
index 562b885cf9c3..e1ca2080a1ff 100644
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -212,7 +212,7 @@ static inline void dma_resv_unlock(struct dma_resv *obj)
 }
 
 /**
- * dma_resv_exclusive - return the object's exclusive fence
+ * dma_resv_excl_fence - return the object's exclusive fence
  * @obj: the reservation object
  *
  * Returns the exclusive fence (if any). Caller must either hold the objects
-- 
cgit v1.2.3


From e97bc66377bca097e1f3349ca18ca17f202ff659 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 11 May 2021 23:41:10 -0400
Subject: NFS: nfs_find_open_context() may only select open files

If a file has already been closed, then it should not be selected to
support further I/O.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
[Trond: Fix an invalid pointer deref reported by Colin Ian King]
---
 fs/nfs/inode.c         | 4 ++++
 include/linux/nfs_fs.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 529c4099f482..d5d022f92c37 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1101,6 +1101,7 @@ EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
 void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
 {
 	filp->private_data = get_nfs_open_context(ctx);
+	set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
 	if (list_empty(&ctx->list))
 		nfs_inode_attach_open_context(ctx);
 }
@@ -1120,6 +1121,8 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct
 			continue;
 		if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
 			continue;
+		if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags))
+			continue;
 		ctx = get_nfs_open_context(pos);
 		if (ctx)
 			break;
@@ -1135,6 +1138,7 @@ void nfs_file_clear_open_context(struct file *filp)
 	if (ctx) {
 		struct inode *inode = d_inode(ctx->dentry);
 
+		clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags);
 		/*
 		 * We fatal error on write before. Try to writeback
 		 * every page again.
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ffba254d2098..ce6474594872 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -84,6 +84,7 @@ struct nfs_open_context {
 #define NFS_CONTEXT_RESEND_WRITES	(1)
 #define NFS_CONTEXT_BAD			(2)
 #define NFS_CONTEXT_UNLOCK	(3)
+#define NFS_CONTEXT_FILE_OPEN		(4)
 	int error;
 
 	struct list_head list;
-- 
cgit v1.2.3


From 122e093c1734361dedb64f65c99b93e28e4624f4 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 28 Jun 2021 19:33:26 -0700
Subject: mm/page_alloc: fix memory map initialization for descending nodes

On systems with memory nodes sorted in descending order, for instance Dell
Precision WorkStation T5500, the struct pages for higher PFNs and
respectively lower nodes, could be overwritten by the initialization of
struct pages corresponding to the holes in the memory sections.

For example for the below memory layout

[    0.245624] Early memory node ranges
[    0.248496]   node   1: [mem 0x0000000000001000-0x0000000000090fff]
[    0.251376]   node   1: [mem 0x0000000000100000-0x00000000dbdf8fff]
[    0.254256]   node   1: [mem 0x0000000100000000-0x0000001423ffffff]
[    0.257144]   node   0: [mem 0x0000001424000000-0x0000002023ffffff]

the range 0x1424000000 - 0x1428000000 in the beginning of node 0 starts in
the middle of a section and will be considered as a hole during the
initialization of the last section in node 1.

The wrong initialization of the memory map causes panic on boot when
CONFIG_DEBUG_VM is enabled.

Reorder loop order of the memory map initialization so that the outer loop
will always iterate over populated memory regions in the ascending order
and the inner loop will select the zone corresponding to the PFN range.

This way initialization of the struct pages for the memory holes will be
always done for the ranges that are actually not populated.

[akpm@linux-foundation.org: coding style fixes]

Link: https://lkml.kernel.org/r/YNXlMqBbL+tBG7yq@kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=213073
Link: https://lkml.kernel.org/r/20210624062305.10940-1-rppt@kernel.org
Fixes: 0740a50b9baa ("mm/page_alloc.c: refactor initialization of struct page for holes in memory layout")
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Boris Petkov <bp@alien8.de>
Cc: Robert Shteynfeld <robert.shteynfeld@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  1 -
 mm/page_alloc.c    | 94 +++++++++++++++++++++++++++++++++---------------------
 2 files changed, 58 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ae31622deef..9afb8998e7e5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2474,7 +2474,6 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_range(unsigned long, int, unsigned long,
 		unsigned long, unsigned long, enum meminit_context,
 		struct vmem_altmap *, int migratetype);
-extern void memmap_init_zone(struct zone *zone);
 extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef2265f86b91..5b5c9f5813b9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6400,7 +6400,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 		return;
 
 	/*
-	 * The call to memmap_init_zone should have already taken care
+	 * The call to memmap_init should have already taken care
 	 * of the pages reserved for the memmap, so we can just jump to
 	 * the end of that region and start processing the device pages.
 	 */
@@ -6465,7 +6465,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 /*
  * Only struct pages that correspond to ranges defined by memblock.memory
  * are zeroed and initialized by going through __init_single_page() during
- * memmap_init_zone().
+ * memmap_init_zone_range().
  *
  * But, there could be struct pages that correspond to holes in
  * memblock.memory. This can happen because of the following reasons:
@@ -6484,9 +6484,9 @@ static void __meminit zone_init_free_lists(struct zone *zone)
  *   zone/node above the hole except for the trailing pages in the last
  *   section that will be appended to the zone/node below.
  */
-static u64 __meminit init_unavailable_range(unsigned long spfn,
-					    unsigned long epfn,
-					    int zone, int node)
+static void __init init_unavailable_range(unsigned long spfn,
+					  unsigned long epfn,
+					  int zone, int node)
 {
 	unsigned long pfn;
 	u64 pgcnt = 0;
@@ -6502,56 +6502,77 @@ static u64 __meminit init_unavailable_range(unsigned long spfn,
 		pgcnt++;
 	}
 
-	return pgcnt;
+	if (pgcnt)
+		pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
+			node, zone_names[zone], pgcnt);
 }
 #else
-static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
-					 int zone, int node)
+static inline void init_unavailable_range(unsigned long spfn,
+					  unsigned long epfn,
+					  int zone, int node)
 {
-	return 0;
 }
 #endif
 
-void __meminit __weak memmap_init_zone(struct zone *zone)
+static void __init memmap_init_zone_range(struct zone *zone,
+					  unsigned long start_pfn,
+					  unsigned long end_pfn,
+					  unsigned long *hole_pfn)
 {
 	unsigned long zone_start_pfn = zone->zone_start_pfn;
 	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
-	int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
-	static unsigned long hole_pfn;
+	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+
+	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+
+	if (start_pfn >= end_pfn)
+		return;
+
+	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
+			  zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+	if (*hole_pfn < start_pfn)
+		init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
+
+	*hole_pfn = end_pfn;
+}
+
+static void __init memmap_init(void)
+{
 	unsigned long start_pfn, end_pfn;
-	u64 pgcnt = 0;
+	unsigned long hole_pfn = 0;
+	int i, j, zone_id, nid;
 
-	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
-		start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
-		end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		struct pglist_data *node = NODE_DATA(nid);
+
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone *zone = node->node_zones + j;
 
-		if (end_pfn > start_pfn)
-			memmap_init_range(end_pfn - start_pfn, nid,
-					zone_id, start_pfn, zone_end_pfn,
-					MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+			if (!populated_zone(zone))
+				continue;
 
-		if (hole_pfn < start_pfn)
-			pgcnt += init_unavailable_range(hole_pfn, start_pfn,
-							zone_id, nid);
-		hole_pfn = end_pfn;
+			memmap_init_zone_range(zone, start_pfn, end_pfn,
+					       &hole_pfn);
+			zone_id = j;
+		}
 	}
 
 #ifdef CONFIG_SPARSEMEM
 	/*
-	 * Initialize the hole in the range [zone_end_pfn, section_end].
-	 * If zone boundary falls in the middle of a section, this hole
-	 * will be re-initialized during the call to this function for the
-	 * higher zone.
+	 * Initialize the memory map for hole in the range [memory_end,
+	 * section_end].
+	 * Append the pages in this hole to the highest zone in the last
+	 * node.
+	 * The call to init_unavailable_range() is outside the ifdef to
+	 * silence the compiler warining about zone_id set but not used;
+	 * for FLATMEM it is a nop anyway
 	 */
-	end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
+	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
 	if (hole_pfn < end_pfn)
-		pgcnt += init_unavailable_range(hole_pfn, end_pfn,
-						zone_id, nid);
 #endif
-
-	if (pgcnt)
-		pr_info("  %s zone: %llu pages in unavailable ranges\n",
-			zone->name, pgcnt);
+		init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
 }
 
 static int zone_batchsize(struct zone *zone)
@@ -7254,7 +7275,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 		set_pageblock_order();
 		setup_usemap(zone);
 		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
-		memmap_init_zone(zone);
 	}
 }
 
@@ -7780,6 +7800,8 @@ void __init free_area_init(unsigned long *max_zone_pfn)
 			node_set_state(nid, N_MEMORY);
 		check_for_memory(pgdat, nid);
 	}
+
+	memmap_init();
 }
 
 static int __init cmdline_parse_core(char *p, unsigned long *core,
-- 
cgit v1.2.3


From 20ce0c2d5a303c41c0e02ceb596837868e290dcc Mon Sep 17 00:00:00 2001
From: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Date: Mon, 28 Jun 2021 19:33:32 -0700
Subject: kthread: switch to new kerneldoc syntax for named variable macro
 argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The syntax without dots is available since commit 43756e347f21
("scripts/kernel-doc: Add support for named variable macro arguments").

The same HTML output is produced with and without this patch.

Link: https://lkml.kernel.org/r/20210513161702.1721039-1-j.neuschaefer@gmx.net
Signed-off-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kthread.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2484ed97e72f..db3eafea168f 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -18,7 +18,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
  * @threadfn: the function to run in the thread
  * @data: data pointer for @threadfn()
  * @namefmt: printf-style format string for the thread name
- * @arg...: arguments for @namefmt.
+ * @arg: arguments for @namefmt.
  *
  * This macro will create a kthread on the current node, leaving it in
  * the stopped state.  This is just a helper for kthread_create_on_node();
-- 
cgit v1.2.3


From 588c7fa022d7b2361500ead5660d9a1a2ecd9b7d Mon Sep 17 00:00:00 2001
From: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Date: Mon, 28 Jun 2021 19:34:39 -0700
Subject: mm, slub: change run-time assertion in kmalloc_index() to
 compile-time

Currently when size is not supported by kmalloc_index, compiler will
generate a run-time BUG() while compile-time error is also possible, and
better.  So change BUG to BUILD_BUG_ON_MSG to make compile-time check
possible.

Also remove code that allocates more than 32MB because current
implementation supports only up to 32MB.

[42.hyeyoo@gmail.com: fix support for clang 10]
  Link: https://lkml.kernel.org/r/20210518181247.GA10062@hyeyoo
[vbabka@suse.cz: fix false-positive assert in kernel/bpf/local_storage.c]
  Link: https://lkml.kernel.org/r/bea97388-01df-8eac-091b-a3c89b4a4a09@suse.czLink: https://lkml.kernel.org/r/20210511173448.GA54466@hyeyoo
[elver@google.com: kfence fix]
  Link: https://lkml.kernel.org/r/20210512195227.245000695c9014242e9a00e5@linux-foundation.org

Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Marco Elver <elver@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h    | 17 ++++++++++++++---
 mm/kfence/kfence_test.c |  5 +++--
 mm/slab_common.c        |  7 +++----
 3 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0c97d788762c..bc9ab3a5a017 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -346,8 +346,14 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
  * 1 =  65 .. 96 bytes
  * 2 = 129 .. 192 bytes
  * n = 2^(n-1)+1 .. 2^n
+ *
+ * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized;
+ * typical usage is via kmalloc_index() and therefore evaluated at compile-time.
+ * Callers where !size_is_constant should only be test modules, where runtime
+ * overheads of __kmalloc_index() can be tolerated.  Also see kmalloc_slab().
  */
-static __always_inline unsigned int kmalloc_index(size_t size)
+static __always_inline unsigned int __kmalloc_index(size_t size,
+						    bool size_is_constant)
 {
 	if (!size)
 		return 0;
@@ -382,12 +388,17 @@ static __always_inline unsigned int kmalloc_index(size_t size)
 	if (size <=  8 * 1024 * 1024) return 23;
 	if (size <=  16 * 1024 * 1024) return 24;
 	if (size <=  32 * 1024 * 1024) return 25;
-	if (size <=  64 * 1024 * 1024) return 26;
-	BUG();
+
+	if ((IS_ENABLED(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 110000)
+	    && !IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
+		BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
+	else
+		BUG();
 
 	/* Will never be reached. Needed because the compiler may complain */
 	return -1;
 }
+#define kmalloc_index(s) __kmalloc_index(s, true)
 #endif /* !CONFIG_SLOB */
 
 void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 4acf4251ee04..7f24b9bcb2ec 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -197,7 +197,7 @@ static void test_cache_destroy(void)
 
 static inline size_t kmalloc_cache_alignment(size_t size)
 {
-	return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align;
+	return kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]->align;
 }
 
 /* Must always inline to match stack trace against caller. */
@@ -267,7 +267,8 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
 
 		if (is_kfence_address(alloc)) {
 			struct page *page = virt_to_head_page(alloc);
-			struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)];
+			struct kmem_cache *s = test_cache ?:
+					kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)];
 
 			/*
 			 * Verify that various helpers return the right values
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1ded52592b56..b97b6fa8a7c6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -754,8 +754,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 
 /*
  * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
- * kmalloc-67108864.
+ * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
+ * kmalloc-32M.
  */
 const struct kmalloc_info_struct kmalloc_info[] __initconst = {
 	INIT_KMALLOC_INFO(0, 0),
@@ -783,8 +783,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
 	INIT_KMALLOC_INFO(4194304, 4M),
 	INIT_KMALLOC_INFO(8388608, 8M),
 	INIT_KMALLOC_INFO(16777216, 16M),
-	INIT_KMALLOC_INFO(33554432, 32M),
-	INIT_KMALLOC_INFO(67108864, 64M)
+	INIT_KMALLOC_INFO(33554432, 32M)
 };
 
 /*
-- 
cgit v1.2.3


From 792702911f581f7793962fbeb99d5c3a1b28f4c3 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Mon, 28 Jun 2021 19:34:52 -0700
Subject: slub: force on no_hash_pointers when slub_debug is enabled

Obscuring the pointers that slub shows when debugging makes for some
confusing slub debug messages:

 Padding overwritten. 0x0000000079f0674a-0x000000000d4dce17

Those addresses are hashed for kernel security reasons.  If we're trying
to be secure with slub_debug on the commandline we have some big problems
given that we dump whole chunks of kernel memory to the kernel logs.
Let's force on the no_hash_pointers commandline flag when slub_debug is on
the commandline.  This makes slub debug messages more meaningful and if by
chance a kernel address is in some slub debug object dump we will have a
better chance of figuring out what went wrong.

Note that we don't use %px in the slub code because we want to reduce the
number of places that %px is used in the kernel.  This also nicely prints
a big fat warning at kernel boot if slub_debug is on the commandline so
that we know that this kernel shouldn't be used on production systems.

[akpm@linux-foundation.org: fix build with CONFIG_SLUB_DEBUG=n]

Link: https://lkml.kernel.org/r/20210601182202.3011020-5-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Petr Mladek <pmladek@suse.com>
Cc: Joe Perches <joe@perches.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h |  2 ++
 lib/vsprintf.c         |  2 +-
 mm/slub.c              | 20 +++++++++++++++++++-
 3 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 15d8bad3d2f2..bf950621febf 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -357,6 +357,8 @@ int sscanf(const char *, const char *, ...);
 extern __scanf(2, 0)
 int vsscanf(const char *, const char *, va_list);
 
+extern int no_hash_pointers_enable(char *str);
+
 extern int get_option(char **str, int *pint);
 extern char *get_options(const char *str, int nints, int *ints);
 extern unsigned long long memparse(const char *ptr, char **retptr);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index f0c35d9b65bf..cc281f5895f9 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2186,7 +2186,7 @@ char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
 bool no_hash_pointers __ro_after_init;
 EXPORT_SYMBOL_GPL(no_hash_pointers);
 
-static int __init no_hash_pointers_enable(char *str)
+int __init no_hash_pointers_enable(char *str)
 {
 	if (no_hash_pointers)
 		return 0;
diff --git a/mm/slub.c b/mm/slub.c
index f8e4d37c4641..4b2ba9c099c9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -118,12 +118,26 @@
  */
 
 #ifdef CONFIG_SLUB_DEBUG
+
 #ifdef CONFIG_SLUB_DEBUG_ON
 DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
 #else
 DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
 #endif
-#endif
+
+static inline bool __slub_debug_enabled(void)
+{
+	return static_branch_unlikely(&slub_debug_enabled);
+}
+
+#else		/* CONFIG_SLUB_DEBUG */
+
+static inline bool __slub_debug_enabled(void)
+{
+	return false;
+}
+
+#endif		/* CONFIG_SLUB_DEBUG */
 
 static inline bool kmem_cache_debug(struct kmem_cache *s)
 {
@@ -4487,6 +4501,10 @@ void __init kmem_cache_init(void)
 	if (debug_guardpage_minorder())
 		slub_max_order = 0;
 
+	/* Print slub debugging pointers without hashing */
+	if (__slub_debug_enabled())
+		no_hash_pointers_enable(NULL);
+
 	kmem_cache_node = &boot_kmem_cache_node;
 	kmem_cache = &boot_kmem_cache;
 
-- 
cgit v1.2.3


From 9f849c6f9572d8cef407f55928d3dc68fc42ad3e Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Mon, 28 Jun 2021 19:35:22 -0700
Subject: mm/page_reporting: allow driver to specify reporting order

The page reporting order (threshold) is sticky to @pageblock_order by
default.  The page reporting can never be triggered because the freeing
page can't come up with a free area like that huge.  The situation becomes
worse when the system memory becomes heavily fragmented.

For example, the following configurations are used on ARM64 when 64KB base
page size is enabled.  In this specific case, the page reporting won't be
triggered until the freeing page comes up with a 512MB free area.  That's
hard to be met, especially when the system memory becomes heavily
fragmented.

   PAGE_SIZE:          64KB
   HPAGE_SIZE:         512MB
   pageblock_order:    13       (512MB)
   MAX_ORDER:          14

This allows the drivers to specify the page reporting order when the page
reporting device is registered.  It falls back to @pageblock_order if it's
not specified by the driver.  The existing users (hv_balloon and
virtio_balloon) don't specify it and @pageblock_order is still taken as
their page reporting order.  So this shouldn't introduce any functional
changes.

Link: https://lkml.kernel.org/r/20210625014710.42954-4-gshan@redhat.com
Signed-off-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_reporting.h | 3 +++
 mm/page_reporting.c            | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
index 3b99e0ec24f2..fe648dfa3a7c 100644
--- a/include/linux/page_reporting.h
+++ b/include/linux/page_reporting.h
@@ -18,6 +18,9 @@ struct page_reporting_dev_info {
 
 	/* Current state of page reporting */
 	atomic_t state;
+
+	/* Minimal order of page reporting */
+	unsigned int order;
 };
 
 /* Tear-down and bring-up for page reporting devices */
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 34bf4d26c2c4..382958eef8a9 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -329,6 +329,12 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
 		goto err_out;
 	}
 
+	/*
+	 * Update the page reporting order if it's specified by driver.
+	 * Otherwise, it falls back to @pageblock_order.
+	 */
+	page_reporting_order = prdev->order ? : pageblock_order;
+
 	/* initialize state and work structures */
 	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
 	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
-- 
cgit v1.2.3


From f3b6a6df38aa514d97e8c6fcc748be1d4142bec9 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 28 Jun 2021 19:35:53 -0700
Subject: writeback, cgroup: keep list of inodes attached to bdi_writeback

Currently there is no way to iterate over inodes attached to a specific
cgwb structure.  It limits the ability to efficiently reclaim the
writeback structure itself and associated memory and block cgroup
structures without scanning all inodes belonging to a sb, which can be
prohibitively expensive.

While dirty/in-active-writeback an inode belongs to one of the
bdi_writeback's io lists: b_dirty, b_io, b_more_io and b_dirty_time.  Once
cleaned up, it's removed from all io lists.  So the inode->i_io_list can
be reused to maintain the list of inodes, attached to a bdi_writeback
structure.

This patch introduces a new wb->b_attached list, which contains all inodes
which were dirty at least once and are attached to the given cgwb.  Inodes
attached to the root bdi_writeback structures are never placed on such
list.  The following patch will use this list to try to release cgwbs
structures more efficiently.

Link: https://lkml.kernel.org/r/20210608230225.2078447-6-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Dennis Zhou <dennis@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c                | 93 +++++++++++++++++++++++++---------------
 include/linux/backing-dev-defs.h |  1 +
 mm/backing-dev.c                 |  2 +
 3 files changed, 62 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 96974e13a203..87b305ee5348 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode,
 	return false;
 }
 
-/**
- * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
- * @inode: inode to be removed
- * @wb: bdi_writeback @inode is being removed from
- *
- * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
- * clear %WB_has_dirty_io if all are empty afterwards.
- */
-static void inode_io_list_del_locked(struct inode *inode,
-				     struct bdi_writeback *wb)
-{
-	assert_spin_locked(&wb->list_lock);
-	assert_spin_locked(&inode->i_lock);
-
-	inode->i_state &= ~I_SYNC_QUEUED;
-	list_del_init(&inode->i_io_list);
-	wb_io_lists_depopulated(wb);
-}
-
 static void wb_wakeup(struct bdi_writeback *wb)
 {
 	spin_lock_bh(&wb->work_lock);
@@ -278,6 +259,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 }
 EXPORT_SYMBOL_GPL(__inode_attach_wb);
 
+/**
+ * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
+ * @inode: inode of interest with i_lock held
+ * @wb: target bdi_writeback
+ *
+ * Remove the inode from wb's io lists and if necessarily put onto b_attached
+ * list.  Only inodes attached to cgwb's are kept on this list.
+ */
+static void inode_cgwb_move_to_attached(struct inode *inode,
+					struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+	assert_spin_locked(&inode->i_lock);
+
+	inode->i_state &= ~I_SYNC_QUEUED;
+	if (wb != &wb->bdi->wb)
+		list_move(&inode->i_io_list, &wb->b_attached);
+	else
+		list_del_init(&inode->i_io_list);
+	wb_io_lists_depopulated(wb);
+}
+
 /**
  * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
  * @inode: inode of interest with i_lock held
@@ -418,21 +421,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	wb_get(new_wb);
 
 	/*
-	 * Transfer to @new_wb's IO list if necessary.  The specific list
-	 * @inode was on is ignored and the inode is put on ->b_dirty which
-	 * is always correct including from ->b_dirty_time.  The transfer
-	 * preserves @inode->dirtied_when ordering.
+	 * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
+	 * the specific list @inode was on is ignored and the @inode is put on
+	 * ->b_dirty which is always correct including from ->b_dirty_time.
+	 * The transfer preserves @inode->dirtied_when ordering.  If the @inode
+	 * was clean, it means it was on the b_attached list, so move it onto
+	 * the b_attached list of @new_wb.
 	 */
 	if (!list_empty(&inode->i_io_list)) {
-		struct inode *pos;
-
-		inode_io_list_del_locked(inode, old_wb);
 		inode->i_wb = new_wb;
-		list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
-			if (time_after_eq(inode->dirtied_when,
-					  pos->dirtied_when))
-				break;
-		inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
+
+		if (inode->i_state & I_DIRTY_ALL) {
+			struct inode *pos;
+
+			list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
+				if (time_after_eq(inode->dirtied_when,
+						  pos->dirtied_when))
+					break;
+			inode_io_list_move_locked(inode, new_wb,
+						  pos->i_io_list.prev);
+		} else {
+			inode_cgwb_move_to_attached(inode, new_wb);
+		}
 	} else {
 		inode->i_wb = new_wb;
 	}
@@ -1021,6 +1031,17 @@ fs_initcall(cgroup_writeback_init);
 static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
 static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
 
+static void inode_cgwb_move_to_attached(struct inode *inode,
+					struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+	assert_spin_locked(&inode->i_lock);
+
+	inode->i_state &= ~I_SYNC_QUEUED;
+	list_del_init(&inode->i_io_list);
+	wb_io_lists_depopulated(wb);
+}
+
 static struct bdi_writeback *
 locked_inode_to_wb_and_lock_list(struct inode *inode)
 	__releases(&inode->i_lock)
@@ -1121,7 +1142,11 @@ void inode_io_list_del(struct inode *inode)
 
 	wb = inode_to_wb_and_lock_list(inode);
 	spin_lock(&inode->i_lock);
-	inode_io_list_del_locked(inode, wb);
+
+	inode->i_state &= ~I_SYNC_QUEUED;
+	list_del_init(&inode->i_io_list);
+	wb_io_lists_depopulated(wb);
+
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&wb->list_lock);
 }
@@ -1434,7 +1459,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		inode->i_state &= ~I_SYNC_QUEUED;
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
-		inode_io_list_del_locked(inode, wb);
+		inode_cgwb_move_to_attached(inode, wb);
 	}
 }
 
@@ -1586,7 +1611,7 @@ static int writeback_single_inode(struct inode *inode,
 	 * responsible for the writeback lists.
 	 */
 	if (!(inode->i_state & I_DIRTY_ALL))
-		inode_io_list_del_locked(inode, wb);
+		inode_cgwb_move_to_attached(inode, wb);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
 out:
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index fff9367a6348..e5dc238ebe4f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -154,6 +154,7 @@ struct bdi_writeback {
 	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
 	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
 	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
+	struct list_head b_attached;	/* attached inodes, protected by list_lock */
 
 	union {
 		struct work_struct release_work;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 576220acd686..54c5dc4b8c24 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -396,6 +396,7 @@ static void cgwb_release_workfn(struct work_struct *work)
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
 	wb_exit(wb);
+	WARN_ON_ONCE(!list_empty(&wb->b_attached));
 	kfree_rcu(wb, rcu);
 }
 
@@ -472,6 +473,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 
 	wb->memcg_css = memcg_css;
 	wb->blkcg_css = blkcg_css;
+	INIT_LIST_HEAD(&wb->b_attached);
 	INIT_WORK(&wb->release_work, cgwb_release_workfn);
 	set_bit(WB_registered, &wb->state);
 
-- 
cgit v1.2.3


From f5fbe6b7ad6ef1fbdf8074a6ca9fdab739bf86d4 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 28 Jun 2021 19:35:59 -0700
Subject: writeback, cgroup: support switching multiple inodes at once

Currently only a single inode can be switched to another writeback
structure at once.  That means to switch an inode a separate
inode_switch_wbs_context structure must be allocated, and a separate rcu
callback and work must be scheduled.

It's fine for the existing ad-hoc switching, which is not happening that
often, but sub-optimal for massive switching required in order to release
a writeback structure.  To prepare for it, let's add a support for
switching multiple inodes at once.

Instead of containing a single inode pointer, inode_switch_wbs_context
will contain a NULL-terminated array of inode pointers.
inode_do_switch_wbs() will be called for each inode.

To optimize the locking bdi->wb_switch_rwsem, old_wb's and new_wb's
list_locks will be acquired and released only once altogether for all
inodes.  wb_wakeup() will be also be called only once.  Instead of calling
wb_put(old_wb) after each successful switch, wb_put_many() is introduced
and used.

Link: https://lkml.kernel.org/r/20210608230225.2078447-8-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Dennis Zhou <dennis@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c                | 106 +++++++++++++++++++++++----------------
 include/linux/backing-dev-defs.h |  18 ++++++-
 2 files changed, 80 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5520a6b5cc4d..737ac27adb77 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -335,10 +335,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
 }
 
 struct inode_switch_wbs_context {
-	struct inode		*inode;
-	struct bdi_writeback	*new_wb;
-
 	struct rcu_work		work;
+
+	/*
+	 * Multiple inodes can be switched at once.  The switching procedure
+	 * consists of two parts, separated by a RCU grace period.  To make
+	 * sure that the second part is executed for each inode gone through
+	 * the first part, all inode pointers are placed into a NULL-terminated
+	 * array embedded into struct inode_switch_wbs_context.  Otherwise
+	 * an inode could be left in a non-consistent state.
+	 */
+	struct bdi_writeback	*new_wb;
+	struct inode		*inodes[];
 };
 
 static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
@@ -351,39 +359,15 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
 	up_write(&bdi->wb_switch_rwsem);
 }
 
-static void inode_do_switch_wbs(struct inode *inode,
+static bool inode_do_switch_wbs(struct inode *inode,
+				struct bdi_writeback *old_wb,
 				struct bdi_writeback *new_wb)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct address_space *mapping = inode->i_mapping;
-	struct bdi_writeback *old_wb = inode->i_wb;
 	XA_STATE(xas, &mapping->i_pages, 0);
 	struct page *page;
 	bool switched = false;
 
-	/*
-	 * If @inode switches cgwb membership while sync_inodes_sb() is
-	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
-	 */
-	down_read(&bdi->wb_switch_rwsem);
-
-	/*
-	 * By the time control reaches here, RCU grace period has passed
-	 * since I_WB_SWITCH assertion and all wb stat update transactions
-	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
-	 * synchronizing against the i_pages lock.
-	 *
-	 * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
-	 * gives us exclusion against all wb related operations on @inode
-	 * including IO list manipulations and stat updates.
-	 */
-	if (old_wb < new_wb) {
-		spin_lock(&old_wb->list_lock);
-		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
-	} else {
-		spin_lock(&new_wb->list_lock);
-		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
-	}
 	spin_lock(&inode->i_lock);
 	xa_lock_irq(&mapping->i_pages);
 
@@ -458,25 +442,63 @@ skip_switch:
 
 	xa_unlock_irq(&mapping->i_pages);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&new_wb->list_lock);
-	spin_unlock(&old_wb->list_lock);
-
-	up_read(&bdi->wb_switch_rwsem);
 
-	if (switched) {
-		wb_wakeup(new_wb);
-		wb_put(old_wb);
-	}
+	return switched;
 }
 
 static void inode_switch_wbs_work_fn(struct work_struct *work)
 {
 	struct inode_switch_wbs_context *isw =
 		container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+	struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
+	struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
+	struct bdi_writeback *new_wb = isw->new_wb;
+	unsigned long nr_switched = 0;
+	struct inode **inodep;
+
+	/*
+	 * If @inode switches cgwb membership while sync_inodes_sb() is
+	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
+	 */
+	down_read(&bdi->wb_switch_rwsem);
+
+	/*
+	 * By the time control reaches here, RCU grace period has passed
+	 * since I_WB_SWITCH assertion and all wb stat update transactions
+	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+	 * synchronizing against the i_pages lock.
+	 *
+	 * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
+	 * gives us exclusion against all wb related operations on @inode
+	 * including IO list manipulations and stat updates.
+	 */
+	if (old_wb < new_wb) {
+		spin_lock(&old_wb->list_lock);
+		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&new_wb->list_lock);
+		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+	}
+
+	for (inodep = isw->inodes; *inodep; inodep++) {
+		WARN_ON_ONCE((*inodep)->i_wb != old_wb);
+		if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
+			nr_switched++;
+	}
+
+	spin_unlock(&new_wb->list_lock);
+	spin_unlock(&old_wb->list_lock);
+
+	up_read(&bdi->wb_switch_rwsem);
+
+	if (nr_switched) {
+		wb_wakeup(new_wb);
+		wb_put_many(old_wb, nr_switched);
+	}
 
-	inode_do_switch_wbs(isw->inode, isw->new_wb);
-	wb_put(isw->new_wb);
-	iput(isw->inode);
+	for (inodep = isw->inodes; *inodep; inodep++)
+		iput(*inodep);
+	wb_put(new_wb);
 	kfree(isw);
 	atomic_dec(&isw_nr_in_flight);
 }
@@ -503,7 +525,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
 		return;
 
-	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+	isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
 	if (!isw)
 		return;
 
@@ -530,7 +552,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	__iget(inode);
 	spin_unlock(&inode->i_lock);
 
-	isw->inode = inode;
+	isw->inodes[0] = inode;
 
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e5dc238ebe4f..63f52ad2ce7a 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -240,8 +240,9 @@ static inline void wb_get(struct bdi_writeback *wb)
 /**
  * wb_put - decrement a wb's refcount
  * @wb: bdi_writeback to put
+ * @nr: number of references to put
  */
-static inline void wb_put(struct bdi_writeback *wb)
+static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
 {
 	if (WARN_ON_ONCE(!wb->bdi)) {
 		/*
@@ -252,7 +253,16 @@ static inline void wb_put(struct bdi_writeback *wb)
 	}
 
 	if (wb != &wb->bdi->wb)
-		percpu_ref_put(&wb->refcnt);
+		percpu_ref_put_many(&wb->refcnt, nr);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	wb_put_many(wb, 1);
 }
 
 /**
@@ -281,6 +291,10 @@ static inline void wb_put(struct bdi_writeback *wb)
 {
 }
 
+static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
+{
+}
+
 static inline bool wb_dying(struct bdi_writeback *wb)
 {
 	return false;
-- 
cgit v1.2.3


From c22d70a162d3cc177282c4487be4d54876ca55c8 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 28 Jun 2021 19:36:03 -0700
Subject: writeback, cgroup: release dying cgwbs by switching attached inodes

Asynchronously try to release dying cgwbs by switching attached inodes to
the nearest living ancestor wb.  It helps to get rid of per-cgroup
writeback structures themselves and of pinned memory and block cgroups,
which are significantly larger structures (mostly due to large per-cpu
statistics data).  This prevents memory waste and helps to avoid different
scalability problems caused by large piles of dying cgroups.

Reuse the existing mechanism of inode switching used for foreign inode
detection.  To speed things up batch up to 115 inode switching in a single
operation (the maximum number is selected so that the resulting struct
inode_switch_wbs_context can fit into 1024 bytes).  Because every
switching consists of two steps divided by an RCU grace period, it would
be too slow without batching.  Please note that the whole batch counts as
a single operation (when increasing/decreasing isw_nr_in_flight).  This
allows to keep umounting working (flush the switching queue), however
prevents cleanups from consuming the whole switching quota and effectively
blocking the frn switching.

A cgwb cleanup operation can fail due to different reasons (e.g.  not
enough memory, the cgwb has an in-flight/pending io, an attached inode in
a wrong state, etc).  In this case the next scheduled cleanup will make a
new attempt.  An attempt is made each time a new cgwb is offlined (in
other words a memcg and/or a blkcg is deleted by a user).  In the future
an additional attempt scheduled by a timer can be implemented.

[guro@fb.com: replace open-coded "115" with arithmetic]
  Link: https://lkml.kernel.org/r/YMEcSBcq/VXMiPPO@carbon.dhcp.thefacebook.com
[guro@fb.com: add smp_mb() to inode_prepare_wbs_switch()]
  Link: https://lkml.kernel.org/r/YMFa+guFw7OFjf3X@carbon.dhcp.thefacebook.com
[willy@infradead.org: fix documentation]
  Link: https://lkml.kernel.org/r/20210615200242.1716568-2-willy@infradead.org

Link: https://lkml.kernel.org/r/20210608230225.2078447-9-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c                | 111 +++++++++++++++++++++++++++++++++++----
 include/linux/backing-dev-defs.h |   1 +
 include/linux/writeback.h        |   1 +
 mm/backing-dev.c                 |  64 +++++++++++++++++++++-
 4 files changed, 165 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 737ac27adb77..62193106683d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -225,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done)
 					/* one round can affect upto 5 slots */
 #define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */
 
+/*
+ * Maximum inodes per isw.  A specific value has been chosen to make
+ * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
+ */
+#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
+                                / sizeof(struct inode *))
+
 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;
 
@@ -503,6 +510,32 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	atomic_dec(&isw_nr_in_flight);
 }
 
+static bool inode_prepare_wbs_switch(struct inode *inode,
+				     struct bdi_writeback *new_wb)
+{
+	/*
+	 * Paired with smp_mb() in cgroup_writeback_umount().
+	 * isw_nr_in_flight must be increased before checking SB_ACTIVE and
+	 * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
+	 * in cgroup_writeback_umount() and the isw_wq will be not flushed.
+	 */
+	smp_mb();
+
+	/* while holding I_WB_SWITCH, no one else can update the association */
+	spin_lock(&inode->i_lock);
+	if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
+	    inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
+	    inode_to_wb(inode) == new_wb) {
+		spin_unlock(&inode->i_lock);
+		return false;
+	}
+	inode->i_state |= I_WB_SWITCH;
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+
+	return true;
+}
+
 /**
  * inode_switch_wbs - change the wb association of an inode
  * @inode: target inode
@@ -540,17 +573,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	if (!isw->new_wb)
 		goto out_free;
 
-	/* while holding I_WB_SWITCH, no one else can update the association */
-	spin_lock(&inode->i_lock);
-	if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
-	    inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
-	    inode_to_wb(inode) == isw->new_wb) {
-		spin_unlock(&inode->i_lock);
+	if (!inode_prepare_wbs_switch(inode, isw->new_wb))
 		goto out_free;
-	}
-	inode->i_state |= I_WB_SWITCH;
-	__iget(inode);
-	spin_unlock(&inode->i_lock);
 
 	isw->inodes[0] = inode;
 
@@ -571,6 +595,73 @@ out_free:
 	kfree(isw);
 }
 
+/**
+ * cleanup_offline_cgwb - detach associated inodes
+ * @wb: target wb
+ *
+ * Switch all inodes attached to @wb to a nearest living ancestor's wb in order
+ * to eventually release the dying @wb.  Returns %true if not all inodes were
+ * switched and the function has to be restarted.
+ */
+bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+{
+	struct cgroup_subsys_state *memcg_css;
+	struct inode_switch_wbs_context *isw;
+	struct inode *inode;
+	int nr;
+	bool restart = false;
+
+	isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
+		      sizeof(struct inode *), GFP_KERNEL);
+	if (!isw)
+		return restart;
+
+	atomic_inc(&isw_nr_in_flight);
+
+	for (memcg_css = wb->memcg_css->parent; memcg_css;
+	     memcg_css = memcg_css->parent) {
+		isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+		if (isw->new_wb)
+			break;
+	}
+	if (unlikely(!isw->new_wb))
+		isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+
+	nr = 0;
+	spin_lock(&wb->list_lock);
+	list_for_each_entry(inode, &wb->b_attached, i_io_list) {
+		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+			continue;
+
+		isw->inodes[nr++] = inode;
+
+		if (nr >= WB_MAX_INODES_PER_ISW - 1) {
+			restart = true;
+			break;
+		}
+	}
+	spin_unlock(&wb->list_lock);
+
+	/* no attached inodes? bail out */
+	if (nr == 0) {
+		atomic_dec(&isw_nr_in_flight);
+		wb_put(isw->new_wb);
+		kfree(isw);
+		return restart;
+	}
+
+	/*
+	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
+	 * the RCU protected stat update paths to grab the i_page
+	 * lock so that stat transfer can synchronize against them.
+	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+	 */
+	INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+	queue_rcu_work(isw_wq, &isw->work);
+
+	return restart;
+}
+
 /**
  * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
  * @wbc: writeback_control of interest
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 63f52ad2ce7a..1d7edad9914f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -155,6 +155,7 @@ struct bdi_writeback {
 	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
 	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
 	struct list_head b_attached;	/* attached inodes, protected by list_lock */
+	struct list_head offline_node;	/* anchored at offline_cgwbs */
 
 	union {
 		struct work_struct release_work;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8e5c5bb16e2d..95de51c10248 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -221,6 +221,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages,
 			   enum wb_reason reason, struct wb_completion *done);
 void cgroup_writeback_umount(void);
+bool cleanup_offline_cgwb(struct bdi_writeback *wb);
 
 /**
  * inode_attach_wb - associate an inode with its wb
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 54c5dc4b8c24..271f2ca862c8 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -371,12 +371,16 @@ static void wb_exit(struct bdi_writeback *wb)
 #include <linux/memcontrol.h>
 
 /*
- * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
- * bdi->cgwb_tree is also RCU protected.
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
+ * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
  */
 static DEFINE_SPINLOCK(cgwb_lock);
 static struct workqueue_struct *cgwb_release_wq;
 
+static LIST_HEAD(offline_cgwbs);
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
+static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
+
 static void cgwb_release_workfn(struct work_struct *work)
 {
 	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
@@ -395,6 +399,11 @@ static void cgwb_release_workfn(struct work_struct *work)
 
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
+
+	spin_lock_irq(&cgwb_lock);
+	list_del(&wb->offline_node);
+	spin_unlock_irq(&cgwb_lock);
+
 	wb_exit(wb);
 	WARN_ON_ONCE(!list_empty(&wb->b_attached));
 	kfree_rcu(wb, rcu);
@@ -414,6 +423,7 @@ static void cgwb_kill(struct bdi_writeback *wb)
 	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
 	list_del(&wb->memcg_node);
 	list_del(&wb->blkcg_node);
+	list_add(&wb->offline_node, &offline_cgwbs);
 	percpu_ref_kill(&wb->refcnt);
 }
 
@@ -635,6 +645,54 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 	mutex_unlock(&bdi->cgwb_release_mutex);
 }
 
+/*
+ * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
+ *
+ * Try to release dying cgwbs by switching attached inodes to the nearest
+ * living ancestor's writeback. Processed wbs are placed at the end
+ * of the list to guarantee the forward progress.
+ */
+static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
+{
+	struct bdi_writeback *wb;
+	LIST_HEAD(processed);
+
+	spin_lock_irq(&cgwb_lock);
+
+	while (!list_empty(&offline_cgwbs)) {
+		wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
+				      offline_node);
+		list_move(&wb->offline_node, &processed);
+
+		/*
+		 * If wb is dirty, cleaning up the writeback by switching
+		 * attached inodes will result in an effective removal of any
+		 * bandwidth restrictions, which isn't the goal.  Instead,
+		 * it can be postponed until the next time, when all io
+		 * will be likely completed.  If in the meantime some inodes
+		 * will get re-dirtied, they should be eventually switched to
+		 * a new cgwb.
+		 */
+		if (wb_has_dirty_io(wb))
+			continue;
+
+		if (!wb_tryget(wb))
+			continue;
+
+		spin_unlock_irq(&cgwb_lock);
+		while (cleanup_offline_cgwb(wb))
+			cond_resched();
+		spin_lock_irq(&cgwb_lock);
+
+		wb_put(wb);
+	}
+
+	if (!list_empty(&processed))
+		list_splice_tail(&processed, &offline_cgwbs);
+
+	spin_unlock_irq(&cgwb_lock);
+}
+
 /**
  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
  * @memcg: memcg being offlined
@@ -651,6 +709,8 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
 		cgwb_kill(wb);
 	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
 	spin_unlock_irq(&cgwb_lock);
+
+	queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
 }
 
 /**
-- 
cgit v1.2.3


From c1e3dbe9818e3caa4e467255a348df56912ca549 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Jun 2021 19:36:09 -0700
Subject: fs: move ramfs_aops to libfs

Move the ramfs aops to libfs and reuse them for kernfs and configfs.
Thosw two did not wire up ->set_page_dirty before and now get
__set_page_dirty_no_writeback, which is the right one for no-writeback
address_space usage.

Drop the now unused exports of the libfs helpers only used for ramfs-style
pagecache usage.

Link: https://lkml.kernel.org/r/20210614061512.3966143-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/configfs/inode.c |  8 +-------
 fs/kernfs/inode.c   |  8 +-------
 fs/libfs.c          | 17 +++++++++++++----
 fs/ramfs/inode.c    |  9 +--------
 include/linux/fs.h  |  5 +----
 5 files changed, 17 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index eb5ec3e46283..b601610e9907 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -28,12 +28,6 @@
 static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
 #endif
 
-static const struct address_space_operations configfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-};
-
 static const struct inode_operations configfs_inode_operations ={
 	.setattr	= configfs_setattr,
 };
@@ -114,7 +108,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
 	struct inode * inode = new_inode(s);
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode->i_mapping->a_ops = &configfs_aops;
+		inode->i_mapping->a_ops = &ram_aops;
 		inode->i_op = &configfs_inode_operations;
 
 		if (sd->s_iattr) {
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index d73950fc3d57..26f2aa3586f9 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -17,12 +17,6 @@
 
 #include "kernfs-internal.h"
 
-static const struct address_space_operations kernfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-};
-
 static const struct inode_operations kernfs_iops = {
 	.permission	= kernfs_iop_permission,
 	.setattr	= kernfs_iop_setattr,
@@ -203,7 +197,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
 {
 	kernfs_get(kn);
 	inode->i_private = kn;
-	inode->i_mapping->a_ops = &kernfs_aops;
+	inode->i_mapping->a_ops = &ram_aops;
 	inode->i_op = &kernfs_iops;
 	inode->i_generation = kernfs_gen(kn);
 
diff --git a/fs/libfs.c b/fs/libfs.c
index e9b29c6ffccb..2d7f086b93d6 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -512,7 +512,7 @@ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 }
 EXPORT_SYMBOL(simple_setattr);
 
-int simple_readpage(struct file *file, struct page *page)
+static int simple_readpage(struct file *file, struct page *page)
 {
 	clear_highpage(page);
 	flush_dcache_page(page);
@@ -520,7 +520,6 @@ int simple_readpage(struct file *file, struct page *page)
 	unlock_page(page);
 	return 0;
 }
-EXPORT_SYMBOL(simple_readpage);
 
 int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
@@ -568,7 +567,7 @@ EXPORT_SYMBOL(simple_write_begin);
  *
  * Use *ONLY* with simple_readpage()
  */
-int simple_write_end(struct file *file, struct address_space *mapping,
+static int simple_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
@@ -597,7 +596,17 @@ int simple_write_end(struct file *file, struct address_space *mapping,
 
 	return copied;
 }
-EXPORT_SYMBOL(simple_write_end);
+
+/*
+ * Provides ramfs-style behavior: data in the pagecache, but no writeback.
+ */
+const struct address_space_operations ram_aops = {
+	.readpage	= simple_readpage,
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
+	.set_page_dirty	= __set_page_dirty_no_writeback,
+};
+EXPORT_SYMBOL(ram_aops);
 
 /*
  * the inodes created here are not hashed. If you use iunique to generate
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 9ebd17d7befb..65e7e56005b8 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -53,13 +53,6 @@ struct ramfs_fs_info {
 static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
-static const struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-	.set_page_dirty	= __set_page_dirty_no_writeback,
-};
-
 struct inode *ramfs_get_inode(struct super_block *sb,
 				const struct inode *dir, umode_t mode, dev_t dev)
 {
@@ -68,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode_init_owner(&init_user_ns, inode, dir, mode);
-		inode->i_mapping->a_ops = &ramfs_aops;
+		inode->i_mapping->a_ops = &ram_aops;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3c88fdb9b2a..869909345420 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3422,13 +3422,10 @@ extern void noop_invalidatepage(struct page *page, unsigned int offset,
 		unsigned int length);
 extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);
-extern int simple_readpage(struct file *file, struct page *page);
 extern int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata);
-extern int simple_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct page *page, void *fsdata);
+extern const struct address_space_operations ram_aops;
 extern int always_delete_dentry(const struct dentry *);
 extern struct inode *alloc_anon_inode(struct super_block *);
 extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
-- 
cgit v1.2.3


From 6e1cae881a0646f31fe2bda90297d820da1137eb Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:36:15 -0700
Subject: mm/writeback: move __set_page_dirty() to core mm

Patch series "Further set_page_dirty cleanups".

Prompted by Christoph's recent patches, here are some more patches to
improve the state of set_page_dirty().  They're all from the folio tree,
so they've been tested to a certain extent.

This patch (of 6):

Nothing in __set_page_dirty() is specific to buffer_head, so move it to
mm/page-writeback.c.  That removes the only caller of
account_page_dirtied() outside of page-writeback.c, so make it static.

Link: https://lkml.kernel.org/r/20210615162342.1669332-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20210615162342.1669332-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c         | 24 ------------------------
 include/linux/mm.h  |  1 -
 mm/page-writeback.c | 27 ++++++++++++++++++++++++++-
 3 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 3d18831c7ad8..6290c3afdba4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -588,30 +588,6 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 }
 EXPORT_SYMBOL(mark_buffer_dirty_inode);
 
-/*
- * Mark the page dirty, and set it dirty in the page cache, and mark the inode
- * dirty.
- *
- * If warn is true, then emit a warning if the page is not uptodate and has
- * not been truncated.
- *
- * The caller must hold lock_page_memcg().
- */
-void __set_page_dirty(struct page *page, struct address_space *mapping,
-			     int warn)
-{
-	unsigned long flags;
-
-	xa_lock_irqsave(&mapping->i_pages, flags);
-	if (page->mapping) {	/* Race with truncate? */
-		WARN_ON_ONCE(warn && !PageUptodate(page));
-		account_page_dirtied(page, mapping);
-		__xa_set_mark(&mapping->i_pages, page_index(page),
-				PAGECACHE_TAG_DIRTY);
-	}
-	xa_unlock_irqrestore(&mapping->i_pages, flags);
-}
-
 /*
  * Add a page to the dirty page list.
  *
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9afb8998e7e5..12589b811555 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1855,7 +1855,6 @@ int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
 void account_page_cleaned(struct page *page, struct address_space *mapping,
 			  struct bdi_writeback *wb);
 int set_page_dirty(struct page *page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 21f4b5972311..1345882c428b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2425,7 +2425,8 @@ int __set_page_dirty_no_writeback(struct page *page)
  *
  * NOTE: This relies on being atomic wrt interrupts.
  */
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+static void account_page_dirtied(struct page *page,
+		struct address_space *mapping)
 {
 	struct inode *inode = mapping->host;
 
@@ -2466,6 +2467,30 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 	}
 }
 
+/*
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
+ * dirty.
+ *
+ * If warn is true, then emit a warning if the page is not uptodate and has
+ * not been truncated.
+ *
+ * The caller must hold lock_page_memcg().
+ */
+void __set_page_dirty(struct page *page, struct address_space *mapping,
+			     int warn)
+{
+	unsigned long flags;
+
+	xa_lock_irqsave(&mapping->i_pages, flags);
+	if (page->mapping) {	/* Race with truncate? */
+		WARN_ON_ONCE(warn && !PageUptodate(page));
+		account_page_dirtied(page, mapping);
+		__xa_set_mark(&mapping->i_pages, page_index(page),
+				PAGECACHE_TAG_DIRTY);
+	}
+	xa_unlock_irqrestore(&mapping->i_pages, flags);
+}
+
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
  * the xarray.
-- 
cgit v1.2.3


From fd7353f88bde80d557b6d74a5351979fc8b1b8db Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:36:21 -0700
Subject: iomap: use __set_page_dirty_nobuffers

The only difference between iomap_set_page_dirty() and
__set_page_dirty_nobuffers() is that the latter includes a debugging check
that a !Uptodate page has private data.

Link: https://lkml.kernel.org/r/20210615162342.1669332-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/gfs2/aops.c         |  2 +-
 fs/iomap/buffered-io.c | 27 +--------------------------
 fs/xfs/xfs_aops.c      |  2 +-
 fs/zonefs/super.c      |  2 +-
 include/linux/iomap.h  |  1 -
 5 files changed, 4 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 23b5be3db044..81d8f064126e 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -784,7 +784,7 @@ static const struct address_space_operations gfs2_aops = {
 	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
 	.readahead = gfs2_readahead,
-	.set_page_dirty = iomap_set_page_dirty,
+	.set_page_dirty = __set_page_dirty_nobuffers,
 	.releasepage = iomap_releasepage,
 	.invalidatepage = iomap_invalidatepage,
 	.bmap = gfs2_bmap,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9023717c5188..0065781935c7 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -640,31 +640,6 @@ out_no_page:
 	return status;
 }
 
-int
-iomap_set_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-	int newly_dirty;
-
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
-
-	/*
-	 * Lock out page's memcg migration to keep PageDirty
-	 * synchronized with per-memcg dirty page counters.
-	 */
-	lock_page_memcg(page);
-	newly_dirty = !TestSetPageDirty(page);
-	if (newly_dirty)
-		__set_page_dirty(page, mapping, 0);
-	unlock_page_memcg(page);
-
-	if (newly_dirty)
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-	return newly_dirty;
-}
-EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
-
 static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 		size_t copied, struct page *page)
 {
@@ -684,7 +659,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !PageUptodate(page)))
 		return 0;
 	iomap_set_range_uptodate(page, offset_in_page(pos), len);
-	iomap_set_page_dirty(page);
+	__set_page_dirty_nobuffers(page);
 	return copied;
 }
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 826caa6b4a5a..a335d79dcff8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -561,7 +561,7 @@ const struct address_space_operations xfs_address_space_operations = {
 	.readahead		= xfs_vm_readahead,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
-	.set_page_dirty		= iomap_set_page_dirty,
+	.set_page_dirty		= __set_page_dirty_nobuffers,
 	.releasepage		= iomap_releasepage,
 	.invalidatepage		= iomap_invalidatepage,
 	.bmap			= xfs_vm_bmap,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index cd145d318b17..3aacf016c7c2 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -185,7 +185,7 @@ static const struct address_space_operations zonefs_file_aops = {
 	.readahead		= zonefs_readahead,
 	.writepage		= zonefs_writepage,
 	.writepages		= zonefs_writepages,
-	.set_page_dirty		= iomap_set_page_dirty,
+	.set_page_dirty		= __set_page_dirty_nobuffers,
 	.releasepage		= iomap_releasepage,
 	.invalidatepage		= iomap_invalidatepage,
 	.migratepage		= iomap_migrate_page,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index c87d0cb0de6d..479c1da3e221 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -159,7 +159,6 @@ ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
 int iomap_readpage(struct page *page, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
-int iomap_set_page_dirty(struct page *page);
 int iomap_is_partially_uptodate(struct page *page, unsigned long from,
 		unsigned long count);
 int iomap_releasepage(struct page *page, gfp_t gfp_mask);
-- 
cgit v1.2.3


From b82a96c9253333a8834b2df5f262a39cccf4f6c7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:36:27 -0700
Subject: fs: remove noop_set_page_dirty()

Use __set_page_dirty_no_writeback() instead.  This will set the dirty bit
on the page, which will be used to avoid calling set_page_dirty() in the
future.  It will have no effect on actually writing the page back, as the
pages are not on any LRU lists.

[akpm@linux-foundation.org: export __set_page_dirty_no_writeback() to modules]

Link: https://lkml.kernel.org/r/20210615162342.1669332-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/dax/device.c |  2 +-
 fs/ext2/inode.c      |  2 +-
 fs/ext4/inode.c      |  2 +-
 fs/fuse/dax.c        |  2 +-
 fs/libfs.c           | 16 ----------------
 fs/xfs/xfs_aops.c    |  2 +-
 include/linux/fs.h   |  1 -
 mm/page-writeback.c  |  1 +
 8 files changed, 6 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index db92573c94e8..dd8222a42808 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -337,7 +337,7 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
 }
 
 static const struct address_space_operations dev_dax_aops = {
-	.set_page_dirty		= noop_set_page_dirty,
+	.set_page_dirty		= __set_page_dirty_no_writeback,
 	.invalidatepage		= noop_invalidatepage,
 };
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index bf41f579ed3e..dadb121beb22 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -992,7 +992,7 @@ const struct address_space_operations ext2_nobh_aops = {
 static const struct address_space_operations ext2_dax_aops = {
 	.writepages		= ext2_dax_writepages,
 	.direct_IO		= noop_direct_IO,
-	.set_page_dirty		= noop_set_page_dirty,
+	.set_page_dirty		= __set_page_dirty_no_writeback,
 	.invalidatepage		= noop_invalidatepage,
 };
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fe6045a46599..b8170a008590 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3701,7 +3701,7 @@ static const struct address_space_operations ext4_da_aops = {
 static const struct address_space_operations ext4_dax_aops = {
 	.writepages		= ext4_dax_writepages,
 	.direct_IO		= noop_direct_IO,
-	.set_page_dirty		= noop_set_page_dirty,
+	.set_page_dirty		= __set_page_dirty_no_writeback,
 	.bmap			= ext4_bmap,
 	.invalidatepage		= noop_invalidatepage,
 	.swap_activate		= ext4_iomap_swap_activate,
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index ff99ab2a3c43..515ad0895345 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -1329,7 +1329,7 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
 static const struct address_space_operations fuse_dax_file_aops  = {
 	.writepages	= fuse_dax_writepages,
 	.direct_IO	= noop_direct_IO,
-	.set_page_dirty	= noop_set_page_dirty,
+	.set_page_dirty	= __set_page_dirty_no_writeback,
 	.invalidatepage	= noop_invalidatepage,
 };
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 3fdd89b156d6..51b4de3b3447 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1171,22 +1171,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 }
 EXPORT_SYMBOL(noop_fsync);
 
-int noop_set_page_dirty(struct page *page)
-{
-	/*
-	 * Unlike __set_page_dirty_no_writeback that handles dirty page
-	 * tracking in the page object, dax does all dirty tracking in
-	 * the inode address_space in response to mkwrite faults. In the
-	 * dax case we only need to worry about potentially dirty CPU
-	 * caches, not dirty page cache pages to write back.
-	 *
-	 * This callback is defined to prevent fallback to
-	 * __set_page_dirty_buffers() in set_page_dirty().
-	 */
-	return 0;
-}
-EXPORT_SYMBOL_GPL(noop_set_page_dirty);
-
 void noop_invalidatepage(struct page *page, unsigned int offset,
 		unsigned int length)
 {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a335d79dcff8..cb4e0fcf4c76 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -575,7 +575,7 @@ const struct address_space_operations xfs_address_space_operations = {
 const struct address_space_operations xfs_dax_aops = {
 	.writepages		= xfs_dax_writepages,
 	.direct_IO		= noop_direct_IO,
-	.set_page_dirty		= noop_set_page_dirty,
+	.set_page_dirty		= __set_page_dirty_no_writeback,
 	.invalidatepage		= noop_invalidatepage,
 	.swap_activate		= xfs_iomap_swapfile_activate,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 869909345420..fad6663cd1b0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3417,7 +3417,6 @@ extern int simple_rename(struct user_namespace *, struct inode *,
 extern void simple_recursive_removal(struct dentry *,
                               void (*callback)(struct dentry *));
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
-extern int noop_set_page_dirty(struct page *page);
 extern void noop_invalidatepage(struct page *page, unsigned int offset,
 		unsigned int length);
 extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8bd69dc5379a..e5b38ffe9fca 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2417,6 +2417,7 @@ int __set_page_dirty_no_writeback(struct page *page)
 		return !TestSetPageDirty(page);
 	return 0;
 }
+EXPORT_SYMBOL(__set_page_dirty_no_writeback);
 
 /*
  * Helper function for set_page_dirty family.
-- 
cgit v1.2.3


From 3a6b2162005f24c7caa10d7f10dba487629787f2 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:36:30 -0700
Subject: mm: move page dirtying prototypes from mm.h

These functions implement the address_space ->set_page_dirty operation and
should live in pagemap.h, not mm.h so that the rest of the kernel doesn't
get funny ideas about calling them directly.

Link: https://lkml.kernel.org/r/20210615162342.1669332-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dax.c           | 1 +
 fs/zonefs/super.c       | 2 +-
 include/linux/mm.h      | 3 ---
 include/linux/pagemap.h | 4 ++++
 4 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 515ad0895345..fb733eb5aead 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -9,6 +9,7 @@
 #include <linux/delay.h>
 #include <linux/dax.h>
 #include <linux/uio.h>
+#include <linux/pagemap.h>
 #include <linux/pfn_t.h>
 #include <linux/iomap.h>
 #include <linux/interval_tree.h>
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 3aacf016c7c2..dbf03635869c 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -5,7 +5,7 @@
  * Copyright (C) 2019 Western Digital Corporation or its affiliates.
  */
 #include <linux/module.h>
-#include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/magic.h>
 #include <linux/iomap.h>
 #include <linux/init.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 12589b811555..e39ed497578b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1850,9 +1850,6 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned int offset,
 			      unsigned int length);
 
-void __set_page_dirty(struct page *, struct address_space *, int warn);
-int __set_page_dirty_nobuffers(struct page *page);
-int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
 void account_page_cleaned(struct page *page, struct address_space *mapping,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0f1b34dbf3a2..ed02aa522263 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -702,6 +702,10 @@ int wait_on_page_writeback_killable(struct page *page);
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
 
+void __set_page_dirty(struct page *, struct address_space *, int warn);
+int __set_page_dirty_nobuffers(struct page *page);
+int __set_page_dirty_no_writeback(struct page *page);
+
 void page_endio(struct page *page, bool is_write, int err);
 
 /**
-- 
cgit v1.2.3


From a458b76a4171f893efa7657dc079924580a8746a Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Mon, 28 Jun 2021 19:36:40 -0700
Subject: mm: gup: pack has_pinned in MMF_HAS_PINNED

has_pinned 32bit can be packed in the MMF_HAS_PINNED bit as a noop
cleanup.

Any atomic_inc/dec to the mm cacheline shared by all threads in pin-fast
would reintroduce a loss of SMP scalability to pin-fast, so there's no
future potential usefulness to keep an atomic in the mm for this.

set_bit(MMF_HAS_PINNED) will be theoretically a bit slower than WRITE_ONCE
(atomic_set is equivalent to WRITE_ONCE), but the set_bit (just like
atomic_set after this commit) has to be still issued only once per "mm",
so the difference between the two will be lost in the noise.

will-it-scale "mmap2" shows no change in performance with enterprise
config as expected.

will-it-scale "pin_fast" retains the > 4000% SMP scalability performance
improvement against upstream as expected.

This is a noop as far as overall performance and SMP scalability are
concerned.

[peterx@redhat.com: pack has_pinned in MMF_HAS_PINNED]
  Link: https://lkml.kernel.org/r/YJqWESqyxa8OZA+2@t490s
[akpm@linux-foundation.org: coding style fixes]
[peterx@redhat.com: fix build for task_mmu.c, introduce mm_set_has_pinned_flag, fix comments]

Link: https://lkml.kernel.org/r/20210507150553.208763-4-peterx@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Kirill Shutemov <kirill@shutemov.name>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c             |  2 +-
 include/linux/mm.h             |  2 +-
 include/linux/mm_types.h       | 10 ----------
 include/linux/sched/coredump.h |  8 ++++++++
 kernel/fork.c                  |  1 -
 mm/gup.c                       | 19 +++++++++++++++----
 6 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fc9784544b24..66965ad88d8b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1047,7 +1047,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
 		return false;
 	if (!is_cow_mapping(vma->vm_flags))
 		return false;
-	if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
+	if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
 		return false;
 	page = vm_normal_page(vma, addr, pte);
 	if (!page)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e39ed497578b..79f32962d7ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1341,7 +1341,7 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
 	if (!is_cow_mapping(vma->vm_flags))
 		return false;
 
-	if (!atomic_read(&vma->vm_mm->has_pinned))
+	if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
 		return false;
 
 	return page_maybe_dma_pinned(page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8f0fb62e8975..b66d0225414e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -435,16 +435,6 @@ struct mm_struct {
 		 */
 		atomic_t mm_count;
 
-		/**
-		 * @has_pinned: Whether this mm has pinned any pages.  This can
-		 * be either replaced in the future by @pinned_vm when it
-		 * becomes stable, or grow into a counter on its own. We're
-		 * aggresive on this bit now - even if the pinned pages were
-		 * unpinned later on, we'll still keep this bit set for the
-		 * lifecycle of this mm just for simplicity.
-		 */
-		atomic_t has_pinned;
-
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* PTE page table pages */
 #endif
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index dfd82eab2902..4d9e3a656875 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -73,6 +73,14 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_OOM_VICTIM		25	/* mm is the oom victim */
 #define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
 #define MMF_MULTIPROCESS	27	/* mm is shared between processes */
+/*
+ * MMF_HAS_PINNED: Whether this mm has pinned any pages.  This can be either
+ * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
+ * a counter on its own. We're aggresive on this bit for now: even if the
+ * pinned pages were unpinned later on, we'll still keep this bit set for the
+ * lifecycle of this mm, just for simplicity.
+ */
+#define MMF_HAS_PINNED		28	/* FOLL_PIN has run, never cleared */
 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/kernel/fork.c b/kernel/fork.c
index a070caed5c8e..c6747d556ef9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1029,7 +1029,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
-	atomic_set(&mm->has_pinned, 0);
 	atomic64_set(&mm->pinned_vm, 0);
 	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);
diff --git a/mm/gup.c b/mm/gup.c
index a6c20a7b3c49..8651309f8ec3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -420,6 +420,17 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
 }
 EXPORT_SYMBOL(unpin_user_pages);
 
+/*
+ * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
+ * lifecycle.  Avoid setting the bit unless necessary, or it might cause write
+ * cache bouncing on large SMP machines for concurrent pinned gups.
+ */
+static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
+{
+	if (!test_bit(MMF_HAS_PINNED, mm_flags))
+		set_bit(MMF_HAS_PINNED, mm_flags);
+}
+
 #ifdef CONFIG_MMU
 static struct page *no_page_table(struct vm_area_struct *vma,
 		unsigned int flags)
@@ -1320,8 +1331,8 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
 		BUG_ON(*locked != 1);
 	}
 
-	if ((flags & FOLL_PIN) && !atomic_read(&mm->has_pinned))
-		atomic_set(&mm->has_pinned, 1);
+	if (flags & FOLL_PIN)
+		mm_set_has_pinned_flag(&mm->flags);
 
 	/*
 	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
@@ -2641,8 +2652,8 @@ static int internal_get_user_pages_fast(unsigned long start,
 				       FOLL_FAST_ONLY)))
 		return -EINVAL;
 
-	if ((gup_flags & FOLL_PIN) && !atomic_read(&current->mm->has_pinned))
-		atomic_set(&current->mm->has_pinned, 1);
+	if (gup_flags & FOLL_PIN)
+		mm_set_has_pinned_flag(&current->mm->flags);
 
 	if (!(gup_flags & FOLL_FAST_ONLY))
 		might_lock_read(&current->mm->mmap_lock);
-- 
cgit v1.2.3


From 63d8620ecf93b5d8d0a254471184d08f8e8f538d Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 28 Jun 2021 19:36:46 -0700
Subject: mm/swapfile: use percpu_ref to serialize against concurrent swapoff

Patch series "close various race windows for swap", v6.

When I was investigating the swap code, I found some possible race
windows.  This series aims to fix all these races.  But using current
get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take really
long time.  And to reduce the performance overhead on the hot-path as much
as possible, it appears we can use the percpu_ref to close this race
window(as suggested by Huang, Ying).  The patch 1 adds percpu_ref support
for swap and most of the remaining patches try to use this to close
various race windows.  More details can be found in the respective
changelogs.

This patch (of 4):

Using current get/put_swap_device() to guard against concurrent swapoff
for some swap ops, e.g.  swap_readpage(), looks terrible because they
might take really long time.  This patch adds the percpu_ref support to
serialize against concurrent swapoff(as suggested by Huang, Ying).  Also
we remove the SWP_VALID flag because it's used together with RCU solution.

Link: https://lkml.kernel.org/r/20210426123316.806267-1-linmiaohe@huawei.com
Link: https://lkml.kernel.org/r/20210426123316.806267-2-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  5 ++--
 mm/swapfile.c        | 79 ++++++++++++++++++++++++++++++++--------------------
 2 files changed, 52 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..c9e7fea10b83 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -177,7 +177,6 @@ enum {
 	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
-	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
 					/* add others here before... */
 	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
 };
@@ -240,6 +239,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+	struct percpu_ref users;	/* indicate and keep swap device valid. */
 	unsigned long	flags;		/* SWP_USED etc: see above */
 	signed short	prio;		/* swap priority of this type */
 	struct plist_node list;		/* entry in swap_active_head */
@@ -260,6 +260,7 @@ struct swap_info_struct {
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
+	struct completion comp;		/* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
 	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
 	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
@@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-	rcu_read_unlock();
+	percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 996afa8131c8..a9a04a5360d9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include <linux/export.h>
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
+#include <linux/completion.h>
 
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
@@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
 	spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+	struct swap_info_struct *si;
+
+	si = container_of(ref, struct swap_info_struct, users);
+	complete(&si->comp);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
 	struct swap_cluster_info *ci = si->cluster_info;
@@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1				CPU2
  *   do_swap_page()
@@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	si = swp_swap_info(entry);
 	if (!si)
 		goto bad_nofile;
-
-	rcu_read_lock();
-	if (data_race(!(si->flags & SWP_VALID)))
-		goto unlock_out;
+	if (!percpu_ref_tryget_live(&si->users))
+		goto out;
+	/*
+	 * Guarantee the si->users are checked before accessing other
+	 * fields of swap_info_struct.
+	 *
+	 * Paired with the spin_unlock() after setup_swap_info() in
+	 * enable_swap_info().
+	 */
+	smp_rmb();
 	offset = swp_offset(entry);
 	if (offset >= si->max)
-		goto unlock_out;
+		goto put_out;
 
 	return si;
 bad_nofile:
 	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
 out:
 	return NULL;
-unlock_out:
-	rcu_read_unlock();
+put_out:
+	percpu_ref_put(&si->users);
 	return NULL;
 }
 
@@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
 
 static void _enable_swap_info(struct swap_info_struct *p)
 {
-	p->flags |= SWP_WRITEOK | SWP_VALID;
+	p->flags |= SWP_WRITEOK;
 	atomic_long_add(p->pages, &nr_swap_pages);
 	total_swap_pages += p->pages;
 
@@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 	/*
-	 * Guarantee swap_map, cluster_info, etc. fields are valid
-	 * between get/put_swap_device() if SWP_VALID bit is set
+	 * Finished initializing swap device, now it's safe to reference it.
 	 */
-	synchronize_rcu();
+	percpu_ref_resurrect(&p->users);
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
 	_enable_swap_info(p);
@@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	reenable_swap_slots_cache_unlock();
 
-	spin_lock(&swap_lock);
-	spin_lock(&p->lock);
-	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
-	spin_unlock(&p->lock);
-	spin_unlock(&swap_lock);
 	/*
-	 * wait for swap operations protected by get/put_swap_device()
-	 * to complete
+	 * Wait for swap operations protected by get/put_swap_device()
+	 * to complete.
+	 *
+	 * We need synchronize_rcu() here to protect the accessing to
+	 * the swap cache data structure.
 	 */
+	percpu_ref_kill(&p->users);
 	synchronize_rcu();
+	wait_for_completion(&p->comp);
 
 	flush_work(&p->discard_work);
 
@@ -2857,6 +2865,12 @@ static struct swap_info_struct *alloc_swap_info(void)
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
+	if (percpu_ref_init(&p->users, swap_users_ref_free,
+			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
+		kvfree(p);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	spin_lock(&swap_lock);
 	for (type = 0; type < nr_swapfiles; type++) {
 		if (!(swap_info[type]->flags & SWP_USED))
@@ -2864,6 +2878,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	}
 	if (type >= MAX_SWAPFILES) {
 		spin_unlock(&swap_lock);
+		percpu_ref_exit(&p->users);
 		kvfree(p);
 		return ERR_PTR(-EPERM);
 	}
@@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void)
 		plist_node_init(&p->avail_lists[i], 0);
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
-	kvfree(defer);
+	if (defer) {
+		percpu_ref_exit(&defer->users);
+		kvfree(defer);
+	}
 	spin_lock_init(&p->lock);
 	spin_lock_init(&p->cont_lock);
+	init_completion(&p->comp);
 
 	return p;
 }
-- 
cgit v1.2.3


From 2799e77529c2a25492a4395db93996e3dacd762d Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 28 Jun 2021 19:36:50 -0700
Subject: swap: fix do_swap_page() race with swapoff

When I was investigating the swap code, I found the below possible race
window:

CPU 1                                   	CPU 2
-----                                   	-----
do_swap_page
  if (data_race(si->flags & SWP_SYNCHRONOUS_IO)
  swap_readpage
    if (data_race(sis->flags & SWP_FS_OPS)) {
                                        	swapoff
					  	  ..
					  	  p->swap_file = NULL;
					  	  ..
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[oops!]

Note that for the pages that are swapped in through swap cache, this isn't
an issue. Because the page is locked, and the swap entry will be marked
with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
unlocked.

Fix this race by using get/put_swap_device() to guard against concurrent
swapoff.

Link: https://lkml.kernel.org/r/20210426123316.806267-3-linmiaohe@huawei.com
Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  9 +++++++++
 mm/memory.c          | 11 +++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c9e7fea10b83..46d51d058d05 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -527,6 +527,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
 	return NULL;
 }
 
+static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+	return NULL;
+}
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+}
+
 #define swap_address_space(entry)		(NULL)
 #define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
diff --git a/mm/memory.c b/mm/memory.c
index 486f4a2874e7..b15367c285bd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3353,6 +3353,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = NULL, *swapcache;
+	struct swap_info_struct *si = NULL;
 	swp_entry_t entry;
 	pte_t pte;
 	int locked;
@@ -3380,14 +3381,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		goto out;
 	}
 
+	/* Prevent swapoff from happening to us. */
+	si = get_swap_device(entry);
+	if (unlikely(!si))
+		goto out;
 
 	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry, vma, vmf->address);
 	swapcache = page;
 
 	if (!page) {
-		struct swap_info_struct *si = swp_swap_info(entry);
-
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
 		    __swap_count(entry) == 1) {
 			/* skip swapcache */
@@ -3556,6 +3559,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+	if (si)
+		put_swap_device(si);
 	return ret;
 out_nomap:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3567,6 +3572,8 @@ out_release:
 		unlock_page(swapcache);
 		put_page(swapcache);
 	}
+	if (si)
+		put_swap_device(si);
 	return ret;
 }
 
-- 
cgit v1.2.3


From f4c4a3f48480730214c4f02ffa480f6bf5b0718f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 28 Jun 2021 19:37:12 -0700
Subject: mm: free idle swap cache page after COW

With commit 09854ba94c6a ("mm: do_wp_page() simplification"), after COW,
the idle swap cache page (neither the page nor the corresponding swap
entry is mapped by any process) will be left in the LRU list, even if it's
in the active list or the head of the inactive list.  So, the page
reclaimer may take quite some overhead to reclaim these actually unused
pages.

To help the page reclaiming, in this patch, after COW, the idle swap cache
page will be tried to be freed.  To avoid to introduce much overhead to
the hot COW code path,

a) there's almost zero overhead for non-swap case via checking
   PageSwapCache() firstly.

b) the page lock is acquired via trylock only.

To test the patch, we used pmbench memory accessing benchmark with
working-set larger than available memory on a 2-socket Intel server with a
NVMe SSD as swap device.  Test results shows that the pmbench score
increases up to 23.8% with the decreased size of swap cache and swapin
throughput.

Link: https://lkml.kernel.org/r/20210601053143.1380078-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>	[use free_swap_cache()]
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@surriel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 5 +++++
 mm/memory.c          | 2 ++
 mm/swap_state.c      | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 46d51d058d05..49b1dd2c100b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -446,6 +446,7 @@ extern void __delete_from_swap_cache(struct page *page,
 extern void delete_from_swap_cache(struct page *);
 extern void clear_shadow_from_swap_cache(int type, unsigned long begin,
 				unsigned long end);
+extern void free_swap_cache(struct page *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t entry,
@@ -551,6 +552,10 @@ static inline void put_swap_device(struct swap_info_struct *si)
 #define free_pages_and_swap_cache(pages, nr) \
 	release_pages((pages), (nr));
 
+static inline void free_swap_cache(struct page *page)
+{
+}
+
 static inline void show_swap_cache_info(void)
 {
 }
diff --git a/mm/memory.c b/mm/memory.c
index b15367c285bd..a4d82a6de000 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3023,6 +3023,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 				munlock_vma_page(old_page);
 			unlock_page(old_page);
 		}
+		if (page_copied)
+			free_swap_cache(old_page);
 		put_page(old_page);
 	}
 	return page_copied ? VM_FAULT_WRITE : 0;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1a2ba4056f37..4f8a912ff692 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -286,7 +286,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
  * try_to_free_swap() _with_ the lock.
  * 					- Marcelo
  */
-static inline void free_swap_cache(struct page *page)
+void free_swap_cache(struct page *page)
 {
 	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
 		try_to_free_swap(page);
-- 
cgit v1.2.3


From 494c1dfe855ec1f70f89552fce5eadf4a1717552 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 28 Jun 2021 19:37:38 -0700
Subject: mm: memcg/slab: create a new set of kmalloc-cg-<n> caches

There are currently two problems in the way the objcg pointer array
(memcg_data) in the page structure is being allocated and freed.

On its allocation, it is possible that the allocated objcg pointer
array comes from the same slab that requires memory accounting. If this
happens, the slab will never become empty again as there is at least
one object left (the obj_cgroup array) in the slab.

When it is freed, the objcg pointer array object may be the last one
in its slab and hence causes kfree() to be called again. With the
right workload, the slab cache may be set up in a way that allows the
recursive kfree() calling loop to nest deep enough to cause a kernel
stack overflow and panic the system.

One way to solve this problem is to split the kmalloc-<n> caches
(KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
(KMALLOC_NORMAL) caches for unaccounted objects only and a new set of
kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
the other caches can still allow a mix of accounted and unaccounted
objects.

With this change, all the objcg pointer array objects will come from
KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
both the recursive kfree() problem and non-freeable slab problem are
gone.

Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer have
mixed accounted and unaccounted objects, this will slightly reduce the
number of objcg pointer arrays that need to be allocated and save a bit
of memory. On the other hand, creating a new set of kmalloc caches does
have the effect of reducing cache utilization. So it is properly a wash.

The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
will include the newly added caches without change.

[vbabka@suse.cz: don't create kmalloc-cg caches with cgroup.memory=nokmem]
  Link: https://lkml.kernel.org/r/20210512145107.6208-1-longman@redhat.com
[akpm@linux-foundation.org: un-fat-finger v5 delta creation]
[longman@redhat.com: disable cache merging for KMALLOC_NORMAL caches]
  Link: https://lkml.kernel.org/r/20210505200610.13943-4-longman@redhat.com

Link: https://lkml.kernel.org/r/20210512145107.6208-1-longman@redhat.com
Link: https://lkml.kernel.org/r/20210505200610.13943-3-longman@redhat.com
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
[longman@redhat.com: fix for CONFIG_ZONE_DMA=n]
Suggested-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 42 +++++++++++++++++++++++++++++++++---------
 mm/internal.h        |  5 +++++
 mm/memcontrol.c      |  2 +-
 mm/slab_common.c     | 32 +++++++++++++++++++++++---------
 4 files changed, 62 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index bc9ab3a5a017..083f3ce550bc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -305,9 +305,21 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
 /*
  * Whenever changing this, take care of that kmalloc_type() and
  * create_kmalloc_caches() still work as intended.
+ *
+ * KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP
+ * is for accounted but unreclaimable and non-dma objects. All the other
+ * kmem caches can have both accounted and unaccounted objects.
  */
 enum kmalloc_cache_type {
 	KMALLOC_NORMAL = 0,
+#ifndef CONFIG_ZONE_DMA
+	KMALLOC_DMA = KMALLOC_NORMAL,
+#endif
+#ifndef CONFIG_MEMCG_KMEM
+	KMALLOC_CGROUP = KMALLOC_NORMAL,
+#else
+	KMALLOC_CGROUP,
+#endif
 	KMALLOC_RECLAIM,
 #ifdef CONFIG_ZONE_DMA
 	KMALLOC_DMA,
@@ -319,24 +331,36 @@ enum kmalloc_cache_type {
 extern struct kmem_cache *
 kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
 
+/*
+ * Define gfp bits that should not be set for KMALLOC_NORMAL.
+ */
+#define KMALLOC_NOT_NORMAL_BITS					\
+	(__GFP_RECLAIMABLE |					\
+	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
+	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
+
 static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
 {
-#ifdef CONFIG_ZONE_DMA
 	/*
 	 * The most common case is KMALLOC_NORMAL, so test for it
-	 * with a single branch for both flags.
+	 * with a single branch for all the relevant flags.
 	 */
-	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
+	if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0))
 		return KMALLOC_NORMAL;
 
 	/*
-	 * At least one of the flags has to be set. If both are, __GFP_DMA
-	 * is more important.
+	 * At least one of the flags has to be set. Their priorities in
+	 * decreasing order are:
+	 *  1) __GFP_DMA
+	 *  2) __GFP_RECLAIMABLE
+	 *  3) __GFP_ACCOUNT
 	 */
-	return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM;
-#else
-	return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL;
-#endif
+	if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
+		return KMALLOC_DMA;
+	if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
+		return KMALLOC_RECLAIM;
+	else
+		return KMALLOC_CGROUP;
 }
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index e8fdb531f887..2946dfa0f245 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -115,6 +115,11 @@ extern void putback_lru_page(struct page *page);
  */
 extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 
+/*
+ * in mm/memcontrol.c:
+ */
+extern bool cgroup_memory_nokmem;
+
 /*
  * in mm/page_alloc.c
  */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2508bd97349c..b913950b9f64 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -83,7 +83,7 @@ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 static bool cgroup_memory_nosocket;
 
 /* Kernel memory accounting disabled? */
-static bool cgroup_memory_nokmem;
+bool cgroup_memory_nokmem;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6c0db9f9bd8a..db3f356bf725 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -738,21 +738,25 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 }
 
 #ifdef CONFIG_ZONE_DMA
-#define INIT_KMALLOC_INFO(__size, __short_size)			\
-{								\
-	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
-	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
-	.name[KMALLOC_DMA]     = "dma-kmalloc-" #__short_size,	\
-	.size = __size,						\
-}
+#define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
+#else
+#define KMALLOC_DMA_NAME(sz)
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+#define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
 #else
+#define KMALLOC_CGROUP_NAME(sz)
+#endif
+
 #define INIT_KMALLOC_INFO(__size, __short_size)			\
 {								\
 	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
 	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
+	KMALLOC_CGROUP_NAME(__short_size)			\
+	KMALLOC_DMA_NAME(__short_size)				\
 	.size = __size,						\
 }
-#endif
 
 /*
  * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
@@ -838,8 +842,15 @@ void __init setup_kmalloc_cache_index_table(void)
 static void __init
 new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
 {
-	if (type == KMALLOC_RECLAIM)
+	if (type == KMALLOC_RECLAIM) {
 		flags |= SLAB_RECLAIM_ACCOUNT;
+	} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
+		if (cgroup_memory_nokmem) {
+			kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
+			return;
+		}
+		flags |= SLAB_ACCOUNT;
+	}
 
 	kmalloc_caches[type][idx] = create_kmalloc_cache(
 					kmalloc_info[idx].name[type],
@@ -857,6 +868,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
 	int i;
 	enum kmalloc_cache_type type;
 
+	/*
+	 * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
+	 */
 	for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
 		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
 			if (!kmalloc_caches[type][i])
-- 
cgit v1.2.3


From a984226f457f849eb9c4ce727eeaa3b5080597d8 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Mon, 28 Jun 2021 19:37:53 -0700
Subject: mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec

All the callers of mem_cgroup_page_lruvec() just pass page_pgdat(page) as
the 2nd parameter to it (except isolate_migratepages_block()).  But for
isolate_migratepages_block(), the page_pgdat(page) is also equal to the
local variable of @pgdat.  So mem_cgroup_page_lruvec() do not need the
pgdat parameter.  Just remove it to simplify the code.

Link: https://lkml.kernel.org/r/20210417043538.9793-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 10 +++++-----
 mm/compaction.c            |  2 +-
 mm/memcontrol.c            |  9 +++------
 mm/swap.c                  |  2 +-
 mm/workingset.c            |  2 +-
 5 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c193be760709..f2a5aaba3577 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -743,13 +743,12 @@ out:
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
- * @pgdat: pgdat of the page
  *
  * This function relies on page->mem_cgroup being stable.
  */
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-						struct pglist_data *pgdat)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
 {
+	pg_data_t *pgdat = page_pgdat(page);
 	struct mem_cgroup *memcg = page_memcg(page);
 
 	VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
@@ -1221,9 +1220,10 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
 	return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-						    struct pglist_data *pgdat)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
 {
+	pg_data_t *pgdat = page_pgdat(page);
+
 	return &pgdat->__lruvec;
 }
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 84fde270ae74..7d41b58fb17c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1028,7 +1028,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (!TestClearPageLRU(page))
 			goto isolate_fail_put;
 
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+		lruvec = mem_cgroup_page_lruvec(page);
 
 		/* If we already hold the lock, we can skip some rechecking */
 		if (lruvec != locked) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index babbaf49ee36..946a9a483e71 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1199,9 +1199,8 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
 struct lruvec *lock_page_lruvec(struct page *page)
 {
 	struct lruvec *lruvec;
-	struct pglist_data *pgdat = page_pgdat(page);
 
-	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	lruvec = mem_cgroup_page_lruvec(page);
 	spin_lock(&lruvec->lru_lock);
 
 	lruvec_memcg_debug(lruvec, page);
@@ -1212,9 +1211,8 @@ struct lruvec *lock_page_lruvec(struct page *page)
 struct lruvec *lock_page_lruvec_irq(struct page *page)
 {
 	struct lruvec *lruvec;
-	struct pglist_data *pgdat = page_pgdat(page);
 
-	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	lruvec = mem_cgroup_page_lruvec(page);
 	spin_lock_irq(&lruvec->lru_lock);
 
 	lruvec_memcg_debug(lruvec, page);
@@ -1225,9 +1223,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
 struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
 {
 	struct lruvec *lruvec;
-	struct pglist_data *pgdat = page_pgdat(page);
 
-	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	lruvec = mem_cgroup_page_lruvec(page);
 	spin_lock_irqsave(&lruvec->lru_lock, *flags);
 
 	lruvec_memcg_debug(lruvec, page);
diff --git a/mm/swap.c b/mm/swap.c
index dfb48cf9c2c9..18cc9e63515b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -313,7 +313,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
 
 void lru_note_cost_page(struct page *page)
 {
-	lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+	lru_note_cost(mem_cgroup_page_lruvec(page),
 		      page_is_file_lru(page), thp_nr_pages(page));
 }
 
diff --git a/mm/workingset.c b/mm/workingset.c
index b7cdeca5a76d..4f7a306ce75a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -408,7 +408,7 @@ void workingset_activation(struct page *page)
 	memcg = page_memcg_rcu(page);
 	if (!mem_cgroup_disabled() && !memcg)
 		goto out;
-	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+	lruvec = mem_cgroup_page_lruvec(page);
 	workingset_age_nonresident(lruvec, thp_nr_pages(page));
 out:
 	rcu_read_unlock();
-- 
cgit v1.2.3


From f2e4d28dd9f6478dd54d47b91edc3fe62c019968 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Mon, 28 Jun 2021 19:37:56 -0700
Subject: mm: memcontrol: simplify lruvec_holds_page_lru_lock

We already have a helper lruvec_memcg() to get the memcg from lruvec, we
do not need to do it ourselves in the lruvec_holds_page_lru_lock().  So
use lruvec_memcg() instead.  And if mem_cgroup_disabled() returns false,
the page_memcg(page) (the LRU pages) cannot be NULL.  So remove the odd
logic of "memcg = page_memcg(page) ?  : root_mem_cgroup".  And use
lruvec_pgdat to simplify the code.  We can have a single definition for
this function that works for !CONFIG_MEMCG, CONFIG_MEMCG +
mem_cgroup_disabled() and CONFIG_MEMCG.

Link: https://lkml.kernel.org/r/20210417043538.9793-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 31 +++++++------------------------
 1 file changed, 7 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f2a5aaba3577..2fc728492c9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -755,22 +755,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
 	return mem_cgroup_lruvec(memcg, pgdat);
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
-					      struct lruvec *lruvec)
-{
-	pg_data_t *pgdat = page_pgdat(page);
-	const struct mem_cgroup *memcg;
-	struct mem_cgroup_per_node *mz;
-
-	if (mem_cgroup_disabled())
-		return lruvec == &pgdat->__lruvec;
-
-	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	memcg = page_memcg(page) ? : root_mem_cgroup;
-
-	return lruvec->pgdat == pgdat && mz->memcg == memcg;
-}
-
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
@@ -1227,14 +1211,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
 	return &pgdat->__lruvec;
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
-					      struct lruvec *lruvec)
-{
-	pg_data_t *pgdat = page_pgdat(page);
-
-	return lruvec == &pgdat->__lruvec;
-}
-
 static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
 {
 }
@@ -1516,6 +1492,13 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
 	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
 }
 
+static inline bool lruvec_holds_page_lru_lock(struct page *page,
+					      struct lruvec *lruvec)
+{
+	return lruvec_pgdat(lruvec) == page_pgdat(page) &&
+	       lruvec_memcg(lruvec) == page_memcg(page);
+}
+
 /* Don't lock again iff page's lruvec locked */
 static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
 		struct lruvec *locked_lruvec)
-- 
cgit v1.2.3


From 7467c39128bda1d58af08aaeb0c7ba54d0ec87ae Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Mon, 28 Jun 2021 19:37:59 -0700
Subject: mm: memcontrol: rename lruvec_holds_page_lru_lock to
 page_matches_lruvec

lruvec_holds_page_lru_lock() doesn't check anything about locking and is
used to check whether the page belongs to the lruvec.  So rename it to
page_matches_lruvec().

Link: https://lkml.kernel.org/r/20210417043538.9793-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 8 ++++----
 mm/vmscan.c                | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2fc728492c9b..0ce97eff79e2 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1492,8 +1492,8 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
 	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
-					      struct lruvec *lruvec)
+/* Test requires a stable page->memcg binding, see page_memcg() */
+static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec)
 {
 	return lruvec_pgdat(lruvec) == page_pgdat(page) &&
 	       lruvec_memcg(lruvec) == page_memcg(page);
@@ -1504,7 +1504,7 @@ static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
 		struct lruvec *locked_lruvec)
 {
 	if (locked_lruvec) {
-		if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+		if (page_matches_lruvec(page, locked_lruvec))
 			return locked_lruvec;
 
 		unlock_page_lruvec_irq(locked_lruvec);
@@ -1518,7 +1518,7 @@ static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
 		struct lruvec *locked_lruvec, unsigned long *flags)
 {
 	if (locked_lruvec) {
-		if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+		if (page_matches_lruvec(page, locked_lruvec))
 			return locked_lruvec;
 
 		unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5199b9696bab..ec93d4fd5a6b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2063,7 +2063,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		 * All pages were isolated from the same lruvec (and isolation
 		 * inhibits memcg migration).
 		 */
-		VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
+		VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
 		add_page_to_lru_list(page, lruvec);
 		nr_pages = thp_nr_pages(page);
 		nr_moved += nr_pages;
-- 
cgit v1.2.3


From b51478a0b3c7040bfcadf6e2e04df5ddde59fd98 Mon Sep 17 00:00:00 2001
From: wenhuizhang <wenhui@gwmail.gwu.edu>
Date: Mon, 28 Jun 2021 19:38:12 -0700
Subject: memcontrol: use flexible-array member

Change deprecated zero-length-and-one-element-arrays into flexible array
member.Zero-length and one-element arrays detected by Lukas's CodeChecker.
Zero/one element arrays cause undefined behaviours if sizeof() used.

Link: https://lkml.kernel.org/r/20210518200910.29912-1-wenhui@gwmail.gwu.edu
Signed-off-by: wenhuizhang <wenhui@gwmail.gwu.edu>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0ce97eff79e2..3cc18c2176e7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -349,8 +349,7 @@ struct mem_cgroup {
 	struct deferred_split deferred_split_queue;
 #endif
 
-	struct mem_cgroup_per_node *nodeinfo[0];
-	/* WARNING: nodeinfo must be the last member here */
+	struct mem_cgroup_per_node *nodeinfo[];
 };
 
 /*
-- 
cgit v1.2.3


From c74d40e8b5e2ac5eee1ca45b12d3e174915f1d88 Mon Sep 17 00:00:00 2001
From: Dan Schatzberg <schatzberg.dan@gmail.com>
Date: Mon, 28 Jun 2021 19:38:21 -0700
Subject: loop: charge i/o to mem and blk cg

The current code only associates with the existing blkcg when aio is used
to access the backing file.  This patch covers all types of i/o to the
backing file and also associates the memcg so if the backing file is on
tmpfs, memory is charged appropriately.

This patch also exports cgroup_get_e_css and int_active_memcg so it can be
used by the loop module.

Link: https://lkml.kernel.org/r/20210610173944.1203706-4-schatzberg.dan@gmail.com
Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Chris Down <chris@chrisdown.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/loop.c       | 61 +++++++++++++++++++++++++++++++---------------
 drivers/block/loop.h       |  3 ++-
 include/linux/memcontrol.h |  6 +++++
 kernel/cgroup/cgroup.c     |  1 +
 mm/memcontrol.c            |  1 +
 5 files changed, 51 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 54ed3ebbbc37..452c7437e1f0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -78,6 +78,7 @@
 #include <linux/uio.h>
 #include <linux/ioprio.h>
 #include <linux/blk-cgroup.h>
+#include <linux/sched/mm.h>
 
 #include "loop.h"
 
@@ -516,8 +517,6 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
-	if (cmd->css)
-		css_put(cmd->css);
 	cmd->ret = ret;
 	lo_rw_aio_do_completion(cmd);
 }
@@ -578,8 +577,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	cmd->iocb.ki_complete = lo_rw_aio_complete;
 	cmd->iocb.ki_flags = IOCB_DIRECT;
 	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
-	if (cmd->css)
-		kthread_associate_blkcg(cmd->css);
 
 	if (rw == WRITE)
 		ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -587,7 +584,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		ret = call_read_iter(file, &cmd->iocb, &iter);
 
 	lo_rw_aio_do_completion(cmd);
-	kthread_associate_blkcg(NULL);
 
 	if (ret != -EIOCBQUEUED)
 		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -928,7 +924,7 @@ struct loop_worker {
 	struct list_head cmd_list;
 	struct list_head idle_list;
 	struct loop_device *lo;
-	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *blkcg_css;
 	unsigned long last_ran_at;
 };
 
@@ -957,7 +953,7 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 
 	spin_lock_irq(&lo->lo_work_lock);
 
-	if (queue_on_root_worker(cmd->css))
+	if (queue_on_root_worker(cmd->blkcg_css))
 		goto queue_work;
 
 	node = &lo->worker_tree.rb_node;
@@ -965,10 +961,10 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	while (*node) {
 		parent = *node;
 		cur_worker = container_of(*node, struct loop_worker, rb_node);
-		if (cur_worker->css == cmd->css) {
+		if (cur_worker->blkcg_css == cmd->blkcg_css) {
 			worker = cur_worker;
 			break;
-		} else if ((long)cur_worker->css < (long)cmd->css) {
+		} else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) {
 			node = &(*node)->rb_left;
 		} else {
 			node = &(*node)->rb_right;
@@ -980,13 +976,18 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 	worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN);
 	/*
 	 * In the event we cannot allocate a worker, just queue on the
-	 * rootcg worker
+	 * rootcg worker and issue the I/O as the rootcg
 	 */
-	if (!worker)
+	if (!worker) {
+		cmd->blkcg_css = NULL;
+		if (cmd->memcg_css)
+			css_put(cmd->memcg_css);
+		cmd->memcg_css = NULL;
 		goto queue_work;
+	}
 
-	worker->css = cmd->css;
-	css_get(worker->css);
+	worker->blkcg_css = cmd->blkcg_css;
+	css_get(worker->blkcg_css);
 	INIT_WORK(&worker->work, loop_workfn);
 	INIT_LIST_HEAD(&worker->cmd_list);
 	INIT_LIST_HEAD(&worker->idle_list);
@@ -1306,7 +1307,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 				idle_list) {
 		list_del(&worker->idle_list);
 		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->css);
+		css_put(worker->blkcg_css);
 		kfree(worker);
 	}
 	spin_unlock_irq(&lo->lo_work_lock);
@@ -2100,13 +2101,18 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	/* always use the first bio's css */
+	cmd->blkcg_css = NULL;
+	cmd->memcg_css = NULL;
 #ifdef CONFIG_BLK_CGROUP
-	if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
-		cmd->css = &bio_blkcg(rq->bio)->css;
-		css_get(cmd->css);
-	} else
+	if (rq->bio && rq->bio->bi_blkg) {
+		cmd->blkcg_css = &bio_blkcg(rq->bio)->css;
+#ifdef CONFIG_MEMCG
+		cmd->memcg_css =
+			cgroup_get_e_css(cmd->blkcg_css->cgroup,
+					&memory_cgrp_subsys);
+#endif
+	}
 #endif
-		cmd->css = NULL;
 	loop_queue_work(lo, cmd);
 
 	return BLK_STS_OK;
@@ -2118,13 +2124,28 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	const bool write = op_is_write(req_op(rq));
 	struct loop_device *lo = rq->q->queuedata;
 	int ret = 0;
+	struct mem_cgroup *old_memcg = NULL;
 
 	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
 		ret = -EIO;
 		goto failed;
 	}
 
+	if (cmd->blkcg_css)
+		kthread_associate_blkcg(cmd->blkcg_css);
+	if (cmd->memcg_css)
+		old_memcg = set_active_memcg(
+			mem_cgroup_from_css(cmd->memcg_css));
+
 	ret = do_req_filebacked(lo, rq);
+
+	if (cmd->blkcg_css)
+		kthread_associate_blkcg(NULL);
+
+	if (cmd->memcg_css) {
+		set_active_memcg(old_memcg);
+		css_put(cmd->memcg_css);
+	}
  failed:
 	/* complete non-aio request */
 	if (!cmd->use_aio || ret) {
@@ -2203,7 +2224,7 @@ static void loop_free_idle_workers(struct timer_list *timer)
 			break;
 		list_del(&worker->idle_list);
 		rb_erase(&worker->rb_node, &lo->worker_tree);
-		css_put(worker->css);
+		css_put(worker->blkcg_css);
 		kfree(worker);
 	}
 	if (!list_empty(&lo->idle_worker_list))
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index f81c01bde5c0..1988899db63a 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -77,7 +77,8 @@ struct loop_cmd {
 	long ret;
 	struct kiocb iocb;
 	struct bio_vec *bvec;
-	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *blkcg_css;
+	struct cgroup_subsys_state *memcg_css;
 };
 
 /* Support for loadable transfer modules */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3cc18c2176e7..1de3859233a6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1230,6 +1230,12 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return NULL;
 }
 
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
+{
+	return NULL;
+}
+
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 21ecc6ee6a6d..9cc8c3a686b1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -577,6 +577,7 @@ out_unlock:
 	rcu_read_unlock();
 	return css;
 }
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
 
 static void cgroup_get_live(struct cgroup *cgrp)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8f3244e59b30..4ee243ce6135 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -78,6 +78,7 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 /* Active memory cgroup to use from an interrupt context */
 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
 
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
-- 
cgit v1.2.3


From 6a1803bb582c50909a7f6cc4153360eaf5ae8fc8 Mon Sep 17 00:00:00 2001
From: Huilong Deng <denghuilong@cdjrlc.com>
Date: Mon, 28 Jun 2021 19:38:24 -0700
Subject: mm: memcontrol: remove trailing semicolon in macros

Macros should not use a trailing semicolon.

Link: https://lkml.kernel.org/r/20210614091530.22117-1-denghuilong@cdjrlc.com
Signed-off-by: Huilong Deng <denghuilong@cdjrlc.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1de3859233a6..6d66037be646 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -192,7 +192,7 @@ enum memcg_kmem_state {
 struct memcg_padding {
 	char x[0];
 } ____cacheline_internodealigned_in_smp;
-#define MEMCG_PADDING(name)      struct memcg_padding name;
+#define MEMCG_PADDING(name)      struct memcg_padding name
 #else
 #define MEMCG_PADDING(name)
 #endif
-- 
cgit v1.2.3


From 3b8db39fad98cbb1d36e079236a446fad710daea Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 28 Jun 2021 19:38:35 -0700
Subject: mm: ignore MAP_EXECUTABLE in ksys_mmap_pgoff()

Let's also remove masking off MAP_EXECUTABLE from ksys_mmap_pgoff(): the
last in-tree occurrence of MAP_EXECUTABLE is now in LEGACY_MAP_MASK, which
accepts the flag e.g., for MAP_SHARED_VALIDATE; however, the flag is
ignored throughout the kernel now.

Add a comment to LEGACY_MAP_MASK stating that MAP_EXECUTABLE is ignored.

Link: https://lkml.kernel.org/r/20210421093453.6904-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Ungerer <gerg@linux-m68k.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kevin Brodsky <Kevin.Brodsky@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mman.h | 2 ++
 mm/mmap.c            | 2 +-
 mm/nommu.c           | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 629cefc4ecba..ebb09a964272 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -31,6 +31,8 @@
 /*
  * The historical set of flags that all mmap implementations implicitly
  * support when a ->mmap_validate() op is not provided in file_operations.
+ *
+ * MAP_EXECUTABLE is completely ignored throughout the kernel.
  */
 #define LEGACY_MAP_MASK (MAP_SHARED \
 		| MAP_PRIVATE \
diff --git a/mm/mmap.c b/mm/mmap.c
index 0584e540246e..f9a61f7dc540 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1633,7 +1633,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 			return PTR_ERR(file);
 	}
 
-	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	flags &= ~MAP_DENYWRITE;
 
 	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 out_fput:
diff --git a/mm/nommu.c b/mm/nommu.c
index 85a3a68dffb6..affda71641ca 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1296,7 +1296,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 			goto out;
 	}
 
-	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	flags &= ~MAP_DENYWRITE;
 
 	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 
-- 
cgit v1.2.3


From ce6d42f2e4a2d98898419743b037a95661e3ac9d Mon Sep 17 00:00:00 2001
From: Liam Howlett <liam.howlett@oracle.com>
Date: Mon, 28 Jun 2021 19:38:50 -0700
Subject: mm: add vma_lookup(), update find_vma_intersection() comments

Patch series "mm: Add vma_lookup()", v2.

Many places in the kernel use find_vma() to get a vma and then check the
start address of the vma to ensure the next vma was not returned.

Other places use the find_vma_intersection() call with add, addr + 1 as
the range; looking for just the vma at a specific address.

The third use of find_vma() is by developers who do not know that the
function starts searching at the provided address upwards for the next
vma.  This results in a bug that is often overlooked for a long time.

Adding the new vma_lookup() function will allow for cleaner code by
removing the find_vma() calls which check limits, making
find_vma_intersection() calls of a single address to be shorter, and
potentially reduce the incorrect uses of find_vma().

This patch (of 22):

Many places in the kernel use find_vma() to get a vma and then check the
start address of the vma to ensure the next vma was not returned.

Other places use the find_vma_intersection() call with add, addr + 1 as
the range; looking for just the vma at a specific address.

The third use of find_vma() is by developers who do not know that the
function starts searching at the provided address upwards for the next
vma.  This results in a bug that is often overlooked for a long time.

Adding the new vma_lookup() function will allow for cleaner code by
removing the find_vma() calls which check limits, making
find_vma_intersection() calls of a single address to be shorter, and
potentially reduce the incorrect uses of find_vma().

Also change find_vma_intersection() comments and declaration to be of the
correct length and add kernel documentation style comment.

Link: https://lkml.kernel.org/r/20210521174745.2219620-1-Liam.Howlett@Oracle.com
Link: https://lkml.kernel.org/r/20210521174745.2219620-2-Liam.Howlett@Oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 79f32962d7ae..1a98b5447a3b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2676,17 +2676,45 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 					     struct vm_area_struct **pprev);
 
-/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
-   NULL if none.  Assume start_addr < end_addr. */
-static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise.  Assumes
+ * start_addr < end_addr.
+ */
+static inline
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+					     unsigned long start_addr,
+					     unsigned long end_addr)
 {
-	struct vm_area_struct * vma = find_vma(mm,start_addr);
+	struct vm_area_struct *vma = find_vma(mm, start_addr);
 
 	if (vma && end_addr <= vma->vm_start)
 		vma = NULL;
 	return vma;
 }
 
+/**
+ * vma_lookup() - Find a VMA at a specific address
+ * @mm: The process address space.
+ * @addr: The user address.
+ *
+ * Return: The vm_area_struct at the given address, %NULL otherwise.
+ */
+static inline
+struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma = find_vma(mm, addr);
+
+	if (vma && addr < vma->vm_start)
+		vma = NULL;
+
+	return vma;
+}
+
 static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
 {
 	unsigned long vm_start = vma->vm_start;
-- 
cgit v1.2.3


From a2afc59fb25027749bd41c44f47382522232019e Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 28 Jun 2021 19:40:11 -0700
Subject: mm/page_alloc: add an alloc_pages_bulk_array_node() helper

Patch series "vmalloc() vs bulk allocator", v2.

This patch (of 3):

Add a "node" variant of the alloc_pages_bulk_array() function.  The helper
guarantees that a __alloc_pages_bulk() is invoked with a valid NUMA node
ID.

Link: https://lkml.kernel.org/r/20210516202056.2120-1-urezki@gmail.com
Link: https://lkml.kernel.org/r/20210516202056.2120-2-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 11da8af06704..94f0b8b1cb55 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -536,6 +536,15 @@ alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_arr
 	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
 }
 
+static inline unsigned long
+alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array)
+{
+	if (nid == NUMA_NO_NODE)
+		nid = numa_mem_id();
+
+	return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
  * online. For more general interface, see alloc_pages_node().
-- 
cgit v1.2.3


From 4469c0f17ec63dcc8c9ed512f4330b566c2c0d34 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Mon, 28 Jun 2021 19:40:30 -0700
Subject: printk: introduce dump_stack_lvl()

dump_stack() is used for many different cases, which may require a log
level consistent with other kernel messages surrounding the dump_stack()
call.  Without that, certain systems that are configured to ignore the
default level messages will miss stack traces in critical error reports.

This patch introduces dump_stack_lvl() that behaves similarly to
dump_stack(), but accepts a custom log level.  The old dump_stack()
becomes equal to dump_stack_lvl(KERN_DEFAULT).

A somewhat similar patch has been proposed in 2012:
https://lore.kernel.org/lkml/1332493269.2359.9.camel@hebo/ , but wasn't
merged.

[elver@google.com: add missing dump_stack_lvl() stub if CONFIG_PRINTK=n]
  Link: https://lkml.kernel.org/r/YJ0KAM0hQev1AmWe@elver.google.com

Link: https://lkml.kernel.org/r/20210506105405.3535023-1-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: he, bo <bo.he@intel.com>
Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Cc: Prasad Sodagudi <psodagud@quicinc.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h |  5 +++++
 lib/dump_stack.c       | 20 +++++++++++++-------
 2 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index fe7eb2351610..f589b8b60806 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -206,6 +206,7 @@ void __init setup_log_buf(int early);
 __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
 void dump_stack_print_info(const char *log_lvl);
 void show_regs_print_info(const char *log_lvl);
+extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
 extern asmlinkage void dump_stack(void) __cold;
 extern void printk_safe_flush(void);
 extern void printk_safe_flush_on_panic(void);
@@ -269,6 +270,10 @@ static inline void show_regs_print_info(const char *log_lvl)
 {
 }
 
+static inline void dump_stack_lvl(const char *log_lvl)
+{
+}
+
 static inline void dump_stack(void)
 {
 }
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index f5a33b6f773f..586e3f2c6a15 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -73,10 +73,10 @@ void show_regs_print_info(const char *log_lvl)
 	dump_stack_print_info(log_lvl);
 }
 
-static void __dump_stack(void)
+static void __dump_stack(const char *log_lvl)
 {
-	dump_stack_print_info(KERN_DEFAULT);
-	show_stack(NULL, NULL, KERN_DEFAULT);
+	dump_stack_print_info(log_lvl);
+	show_stack(NULL, NULL, log_lvl);
 }
 
 /**
@@ -87,7 +87,7 @@ static void __dump_stack(void)
 #ifdef CONFIG_SMP
 static atomic_t dump_lock = ATOMIC_INIT(-1);
 
-asmlinkage __visible void dump_stack(void)
+asmlinkage __visible void dump_stack_lvl(const char *log_lvl)
 {
 	unsigned long flags;
 	int was_locked;
@@ -117,7 +117,7 @@ retry:
 		goto retry;
 	}
 
-	__dump_stack();
+	__dump_stack(log_lvl);
 
 	if (!was_locked)
 		atomic_set(&dump_lock, -1);
@@ -125,9 +125,15 @@ retry:
 	local_irq_restore(flags);
 }
 #else
-asmlinkage __visible void dump_stack(void)
+asmlinkage __visible void dump_stack_lvl(const char *log_lvl)
 {
-	__dump_stack();
+	__dump_stack(log_lvl);
 }
 #endif
+EXPORT_SYMBOL(dump_stack_lvl);
+
+asmlinkage __visible void dump_stack(void)
+{
+	dump_stack_lvl(KERN_DEFAULT);
+}
 EXPORT_SYMBOL(dump_stack);
-- 
cgit v1.2.3


From 3ff16d30f593d80a958104ee06a94562a12c5879 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Mon, 28 Jun 2021 19:40:36 -0700
Subject: kasan: test: improve failure message in KUNIT_EXPECT_KASAN_FAIL()

The KUNIT_EXPECT_KASAN_FAIL() macro currently uses KUNIT_EXPECT_EQ() to
compare fail_data.report_expected and fail_data.report_found.  This always
gave a somewhat useless error message on failure, but the addition of
extra compile-time checking with READ_ONCE() has caused it to get much
longer, and be truncated before anything useful is displayed.

Instead, just check fail_data.report_found by hand (we've just set
report_expected to 'true'), and print a better failure message with
KUNIT_FAIL().  Because of this, report_expected is no longer used
anywhere, and can be removed.

Beforehand, a failure in:
KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]);
would have looked like:
[22:00:34] [FAILED] vmalloc_oob
[22:00:34]     # vmalloc_oob: EXPECTATION FAILED at lib/test_kasan.c:991
[22:00:34]     Expected ({ do { extern void __compiletime_assert_705(void) __attribute__((__error__("Unsupported access size for {READ,WRITE}_ONCE()."))); if (!((sizeof(fail_data.report_expected) == sizeof(char) || sizeof(fail_data.repp
[22:00:34]     not ok 45 - vmalloc_oob

With this change, it instead looks like:
[22:04:04] [FAILED] vmalloc_oob
[22:04:04]     # vmalloc_oob: EXPECTATION FAILED at lib/test_kasan.c:993
[22:04:04]     KASAN failure expected in "((volatile char *)area)[3100]", but none occurred
[22:04:04]     not ok 45 - vmalloc_oob

Also update the example failure in the documentation to reflect this.

Link: https://lkml.kernel.org/r/20210606005531.165954-1-davidgow@google.com
Signed-off-by: David Gow <davidgow@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Reviewed-by: Marco Elver <elver@google.com>
Acked-by: Brendan Higgins <brendanhiggins@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Daniel Axtens <dja@axtens.net>
Cc: David Gow <davidgow@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/dev-tools/kasan.rst |  9 ++++-----
 include/linux/kasan.h             |  1 -
 lib/test_kasan.c                  | 11 +++++------
 3 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index d3f335ffc751..83ec4a556c19 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -447,11 +447,10 @@ When a test fails due to a failed ``kmalloc``::
 
 When a test fails due to a missing KASAN report::
 
-        # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:629
-        Expected kasan_data->report_expected == kasan_data->report_found, but
-        kasan_data->report_expected == 1
-        kasan_data->report_found == 0
-        not ok 28 - kmalloc_double_kzfree
+        # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974
+        KASAN failure expected in "kfree_sensitive(ptr)", but none occurred
+        not ok 44 - kmalloc_double_kzfree
+
 
 At the end the cumulative status of all KASAN tests is printed. On success::
 
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index b1678a61e6a7..18cd5ec2f469 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -17,7 +17,6 @@ struct task_struct;
 
 /* kasan_data struct is used in KUnit tests for KASAN expected failures */
 struct kunit_kasan_expectation {
-	bool report_expected;
 	bool report_found;
 };
 
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index cacbbbdef768..44e08f4d9c52 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -55,7 +55,6 @@ static int kasan_test_init(struct kunit *test)
 	multishot = kasan_save_enable_multi_shot();
 	kasan_set_tagging_report_once(false);
 	fail_data.report_found = false;
-	fail_data.report_expected = false;
 	kunit_add_named_resource(test, NULL, NULL, &resource,
 					"kasan_data", &fail_data);
 	return 0;
@@ -94,20 +93,20 @@ static void kasan_test_exit(struct kunit *test)
 	    !kasan_async_mode_enabled())				\
 		migrate_disable();					\
 	KUNIT_EXPECT_FALSE(test, READ_ONCE(fail_data.report_found));	\
-	WRITE_ONCE(fail_data.report_expected, true);			\
 	barrier();							\
 	expression;							\
 	barrier();							\
-	KUNIT_EXPECT_EQ(test,						\
-			READ_ONCE(fail_data.report_expected),		\
-			READ_ONCE(fail_data.report_found));		\
+	if (!READ_ONCE(fail_data.report_found)) {			\
+		KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure "	\
+				"expected in \"" #expression		\
+				 "\", but none occurred");		\
+	}								\
 	if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) {				\
 		if (READ_ONCE(fail_data.report_found))			\
 			kasan_enable_tagging_sync();			\
 		migrate_enable();					\
 	}								\
 	WRITE_ONCE(fail_data.report_found, false);			\
-	WRITE_ONCE(fail_data.report_expected, false);			\
 } while (0)
 
 #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do {			\
-- 
cgit v1.2.3


From c0f8aa4fa815daacb6eca52cae04820d6aecb7c2 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Mon, 28 Jun 2021 19:40:46 -0700
Subject: mm: define default MAX_PTRS_PER_* in include/pgtable.h

Commit c65e774fb3f6 ("x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable")
made PTRS_PER_P4D variable on x86 and introduced MAX_PTRS_PER_P4D as a
constant for cases which need a compile-time constant (e.g.  fixed-size
arrays).

powerpc likewise has boot-time selectable MMU features which can cause
other mm "constants" to vary.  For KASAN, we have some static
PTE/PMD/PUD/P4D arrays so we need compile-time maximums for all these
constants.  Extend the MAX_PTRS_PER_ idiom, and place default definitions
in include/pgtable.h.  These define MAX_PTRS_PER_x to be PTRS_PER_x unless
an architecture has defined MAX_PTRS_PER_x in its arch headers.

Clean up pgtable-nop4d.h and s390's MAX_PTRS_PER_P4D definitions while
we're at it: both can just pick up the default now.

Link: https://lkml.kernel.org/r/20210624034050.511391-4-dja@axtens.net
Signed-off-by: Daniel Axtens <dja@axtens.net>
Acked-by: Andrey Konovalov <andreyknvl@gmail.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/include/asm/pgtable.h     |  2 --
 include/asm-generic/pgtable-nop4d.h |  1 -
 include/linux/pgtable.h             | 22 ++++++++++++++++++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 29c7ecd5ad1d..b38f7b781564 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -344,8 +344,6 @@ static inline int is_module_addr(void *addr)
 #define PTRS_PER_P4D	_CRST_ENTRIES
 #define PTRS_PER_PGD	_CRST_ENTRIES
 
-#define MAX_PTRS_PER_P4D	PTRS_PER_P4D
-
 /*
  * Segment table and region3 table entry encoding
  * (R = read-only, I = invalid, y = young bit):
diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
index ce2cbb3c380f..2f6b1befb129 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -9,7 +9,6 @@
 typedef struct { pgd_t pgd; } p4d_t;
 
 #define P4D_SHIFT		PGDIR_SHIFT
-#define MAX_PTRS_PER_P4D	1
 #define PTRS_PER_P4D		1
 #define P4D_SIZE		(1UL << P4D_SHIFT)
 #define P4D_MASK		(~(P4D_SIZE-1))
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a43047b1030d..c32600c9e1ad 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1592,4 +1592,26 @@ typedef unsigned int pgtbl_mod_mask;
 #define pte_leaf_size(x) PAGE_SIZE
 #endif
 
+/*
+ * Some architectures have MMUs that are configurable or selectable at boot
+ * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
+ * helps to have a static maximum value.
+ */
+
+#ifndef MAX_PTRS_PER_PTE
+#define MAX_PTRS_PER_PTE PTRS_PER_PTE
+#endif
+
+#ifndef MAX_PTRS_PER_PMD
+#define MAX_PTRS_PER_PMD PTRS_PER_PMD
+#endif
+
+#ifndef MAX_PTRS_PER_PUD
+#define MAX_PTRS_PER_PUD PTRS_PER_PUD
+#endif
+
+#ifndef MAX_PTRS_PER_P4D
+#define MAX_PTRS_PER_P4D PTRS_PER_P4D
+#endif
+
 #endif /* _LINUX_PGTABLE_H */
-- 
cgit v1.2.3


From cb32c9c5d45662770160e0055cb672fd6e0813e8 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Mon, 28 Jun 2021 19:40:49 -0700
Subject: kasan: use MAX_PTRS_PER_* for early shadow tables

powerpc has a variable number of PTRS_PER_*, set at runtime based on the
MMU that the kernel is booted under.

This means the PTRS_PER_* are no longer constants, and therefore breaks
the build.  Switch to using MAX_PTRS_PER_*, which are constant.

Link: https://lkml.kernel.org/r/20210624034050.511391-5-dja@axtens.net
Signed-off-by: Daniel Axtens <dja@axtens.net>
Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Suggested-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 6 +++---
 mm/kasan/init.c       | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 18cd5ec2f469..8d83bbffcfbb 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -40,9 +40,9 @@ struct kunit_kasan_expectation {
 #endif
 
 extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
-extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS];
-extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
-extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD];
+extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS];
+extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD];
+extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD];
 extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 
 int kasan_populate_early_shadow(const void *shadow_start,
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 348f31d15a97..cc64ed6858c6 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -41,7 +41,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
 }
 #endif
 #if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
 static inline bool kasan_pud_table(p4d_t p4d)
 {
 	return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
@@ -53,7 +53,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
 }
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
 static inline bool kasan_pmd_table(pud_t pud)
 {
 	return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
@@ -64,7 +64,7 @@ static inline bool kasan_pmd_table(pud_t pud)
 	return false;
 }
 #endif
-pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
+pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]
 	__page_aligned_bss;
 
 static inline bool kasan_pte_table(pmd_t pmd)
-- 
cgit v1.2.3


From c5a54c706e04a4ba7c4e3428776ac9e44aec17ea Mon Sep 17 00:00:00 2001
From: Jungseung Lee <js07.lee@samsung.com>
Date: Mon, 28 Jun 2021 19:41:02 -0700
Subject: mm: report which part of mem is being freed on initmem case

Add the details for figuring out which parts of the kernel image is being
freed on initmem case.

Before:
   Freeing unused kernel memory: 1024K

After:
   Freeing unused kernel image (initmem) memory: 1024K

Link: https://lkml.kernel.org/r/1622706274-4533-1-git-send-email-js07.lee@samsung.com
Signed-off-by: Jungseung Lee <js07.lee@samsung.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1a98b5447a3b..f08e9de92fc5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2416,7 +2416,7 @@ static inline unsigned long free_initmem_default(int poison)
 	extern char __init_begin[], __init_end[];
 
 	return free_reserved_area(&__init_begin, &__init_end,
-				  poison, "unused kernel");
+				  poison, "unused kernel image (initmem)");
 }
 
 static inline unsigned long get_num_physpages(void)
-- 
cgit v1.2.3


From b19bd1c976afeefc2ebba3d4dae8a4c296dae67f Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 28 Jun 2021 19:41:04 -0700
Subject: mm/mmzone.h: simplify is_highmem_idx()

There is a lot of historical ifdefery in is_highmem_idx() and its helper
zone_movable_is_highmem() that was required because of two different paths
for nodes and zones initialization that were selected at compile time.

Until commit 3f08a302f533 ("mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP
option") the movable_zone variable was only available for configurations
that had CONFIG_HAVE_MEMBLOCK_NODE_MAP enabled so the test in
zone_movable_is_highmem() used that variable only for such configurations.
For other configurations the test checked if the index of ZONE_MOVABLE
was greater by 1 than the index of ZONE_HIGMEM and then movable zone was
considered a highmem zone.  Needless to say, ZONE_MOVABLE - 1 equals
ZONE_HIGHMEM by definition when CONFIG_HIGHMEM=y.

Commit 3f08a302f533 ("mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option")
made movable_zone variable always available.  Since this variable is set
to ZONE_HIGHMEM if CONFIG_HIGHMEM is enabled and highmem zone is
populated, it is enough to check whether

	zone_idx == ZONE_MOVABLE && movable_zone == ZONE_HIGMEM

to test if zone index points to a highmem zone.

Remove zone_movable_is_highmem() that is not used anywhere except
is_highmem_idx() and use the test above in is_highmem_idx() instead.

Link: https://lkml.kernel.org/r/20210426141927.1314326-3-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d53eba1c383..c2bfefd34b59 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -982,22 +982,11 @@ static inline void zone_set_nid(struct zone *zone, int nid) {}
 
 extern int movable_zone;
 
-#ifdef CONFIG_HIGHMEM
-static inline int zone_movable_is_highmem(void)
-{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-	return movable_zone == ZONE_HIGHMEM;
-#else
-	return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
-#endif
-}
-#endif
-
 static inline int is_highmem_idx(enum zone_type idx)
 {
 #ifdef CONFIG_HIGHMEM
 	return (idx == ZONE_HIGHMEM ||
-		(idx == ZONE_MOVABLE && zone_movable_is_highmem()));
+		(idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM));
 #else
 	return 0;
 #endif
-- 
cgit v1.2.3


From d2f07ec052ac1a720d6f1919e3dee7d73f04d495 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:41:07 -0700
Subject: mm: make __dump_page static

Patch series "Constify struct page arguments".

While working on various solutions to the 32-bit struct page size
regression, one of the problems I found was the networking stack expects
to be able to pass const struct page pointers around, and the mm doesn't
provide a lot of const-friendly functions to call.  The root tangle of
problems is that a lot of functions call VM_BUG_ON_PAGE(), which calls
dump_page(), which calls a lot of functions which don't take a const
struct page (but could be const).

This patch (of 6):

The only caller of __dump_page() now opencodes dump_page(), so remove it
as an externally visible symbol.

Link: https://lkml.kernel.org/r/20210416231531.2521383-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20210416231531.2521383-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmdebug.h | 3 +--
 mm/debug.c              | 2 +-
 mm/page_alloc.c         | 3 +--
 3 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 5d0767cb424a..1935d4c72d10 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -9,8 +9,7 @@ struct page;
 struct vm_area_struct;
 struct mm_struct;
 
-extern void dump_page(struct page *page, const char *reason);
-extern void __dump_page(struct page *page, const char *reason);
+void dump_page(struct page *page, const char *reason);
 void dump_vma(const struct vm_area_struct *vma);
 void dump_mm(const struct mm_struct *mm);
 
diff --git a/mm/debug.c b/mm/debug.c
index 0bdda8407f71..84cdcd0f7bd3 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,7 +42,7 @@ const struct trace_print_flags vmaflag_names[] = {
 	{0, NULL}
 };
 
-void __dump_page(struct page *page, const char *reason)
+static void __dump_page(struct page *page, const char *reason)
 {
 	struct page *head = compound_head(page);
 	struct address_space *mapping;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2bf03c76504b..4087340fca32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -658,8 +658,7 @@ static void bad_page(struct page *page, const char *reason)
 
 	pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
 		current->comm, page_to_pfn(page));
-	__dump_page(page, reason);
-	dump_page_owner(page);
+	dump_page(page, reason);
 
 	print_modules();
 	dump_stack();
-- 
cgit v1.2.3


From 8bf6f451bded5db7840b3b2932ef48be5dce6b38 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:41:16 -0700
Subject: mm/page_owner: constify dump_page_owner

dump_page_owner() only uses struct page to find the page_ext, and
lookup_page_ext() already takes a const argument.

Link: https://lkml.kernel.org/r/20210416231531.2521383-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_owner.h | 6 +++---
 mm/page_owner.c            | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 3468794f83d2..719bfe5108c5 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -14,7 +14,7 @@ extern void __set_page_owner(struct page *page,
 extern void __split_page_owner(struct page *page, unsigned int nr);
 extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
-extern void __dump_page_owner(struct page *page);
+extern void __dump_page_owner(const struct page *page);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 					pg_data_t *pgdat, struct zone *zone);
 
@@ -46,7 +46,7 @@ static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 	if (static_branch_unlikely(&page_owner_inited))
 		__set_page_owner_migrate_reason(page, reason);
 }
-static inline void dump_page_owner(struct page *page)
+static inline void dump_page_owner(const struct page *page)
 {
 	if (static_branch_unlikely(&page_owner_inited))
 		__dump_page_owner(page);
@@ -69,7 +69,7 @@ static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 {
 }
-static inline void dump_page_owner(struct page *page)
+static inline void dump_page_owner(const struct page *page)
 {
 }
 #endif /* CONFIG_PAGE_OWNER */
diff --git a/mm/page_owner.c b/mm/page_owner.c
index adfabb560eb9..f51a57e92aa3 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -392,7 +392,7 @@ err:
 	return -ENOMEM;
 }
 
-void __dump_page_owner(struct page *page)
+void __dump_page_owner(const struct page *page)
 {
 	struct page_ext *page_ext = lookup_page_ext(page);
 	struct page_owner *page_owner;
-- 
cgit v1.2.3


From 0f2317e34e2c7b97efd4600122115410795ebeea Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:41:19 -0700
Subject: mm: make compound_head const-preserving

If you pass a const pointer to compound_head(), you get a const pointer
back; if you pass a mutable pointer, you get a mutable pointer back.  Also
remove an unnecessary forward definition of struct page; we're about to
dereference page->compound_head, so it must already have been defined.

Link: https://lkml.kernel.org/r/20210416231531.2521383-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 04a34c08e0a6..d8e26243db25 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -177,17 +177,17 @@ enum pageflags {
 
 #ifndef __GENERATING_BOUNDS_H
 
-struct page;	/* forward declaration */
-
-static inline struct page *compound_head(struct page *page)
+static inline unsigned long _compound_head(const struct page *page)
 {
 	unsigned long head = READ_ONCE(page->compound_head);
 
 	if (unlikely(head & 1))
-		return (struct page *) (head - 1);
-	return page;
+		return head - 1;
+	return (unsigned long)page;
 }
 
+#define compound_head(page)	((typeof(page))_compound_head(page))
+
 static __always_inline int PageTail(struct page *page)
 {
 	return READ_ONCE(page->compound_head) & 1;
-- 
cgit v1.2.3


From ca891f41c4c7921a03dfd0fa1faf324393724480 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:41:22 -0700
Subject: mm: constify get_pfnblock_flags_mask and get_pfnblock_migratetype

The struct page is not modified by these routines, so it can be marked
const.

Link: https://lkml.kernel.org/r/20210416231531.2521383-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pageblock-flags.h |  2 +-
 mm/page_alloc.c                 | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index fff52ad370c1..973fd731a520 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -54,7 +54,7 @@ extern unsigned int pageblock_order;
 /* Forward declaration */
 struct page;
 
-unsigned long get_pfnblock_flags_mask(struct page *page,
+unsigned long get_pfnblock_flags_mask(const struct page *page,
 				unsigned long pfn,
 				unsigned long mask);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ea1efbb06e40..4f5eedb6593a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -474,7 +474,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 #endif
 
 /* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct page *page,
+static inline unsigned long *get_pageblock_bitmap(const struct page *page,
 							unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
@@ -484,7 +484,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
 #endif /* CONFIG_SPARSEMEM */
 }
 
-static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
 	pfn &= (PAGES_PER_SECTION-1);
@@ -495,7 +495,7 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
 }
 
 static __always_inline
-unsigned long __get_pfnblock_flags_mask(struct page *page,
+unsigned long __get_pfnblock_flags_mask(const struct page *page,
 					unsigned long pfn,
 					unsigned long mask)
 {
@@ -520,13 +520,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page,
  *
  * Return: pageblock_bits flags
  */
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
-					unsigned long mask)
+unsigned long get_pfnblock_flags_mask(const struct page *page,
+					unsigned long pfn, unsigned long mask)
 {
 	return __get_pfnblock_flags_mask(page, pfn, mask);
 }
 
-static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+static __always_inline int get_pfnblock_migratetype(const struct page *page,
+					unsigned long pfn)
 {
 	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
 }
-- 
cgit v1.2.3


From 5f7dadf3958f882b393d3c4c60da232dbac66424 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:41:25 -0700
Subject: mm: constify page_count and page_ref_count

Now that compound_head() accepts a const struct page pointer, these two
functions can be marked as not modifying the page pointer they are passed.

Link: https://lkml.kernel.org/r/20210416231531.2521383-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_ref.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index f3318f34fc54..7ad46f45df39 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -62,12 +62,12 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
 
 #endif
 
-static inline int page_ref_count(struct page *page)
+static inline int page_ref_count(const struct page *page)
 {
 	return atomic_read(&page->_refcount);
 }
 
-static inline int page_count(struct page *page)
+static inline int page_count(const struct page *page)
 {
 	return atomic_read(&compound_head(page)->_refcount);
 }
-- 
cgit v1.2.3


From 1cfcee728391ece94a75e4b17fa87253d40c2185 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 28 Jun 2021 19:41:28 -0700
Subject: mm: optimise nth_page for contiguous memmap

If the memmap is virtually contiguous (either because we're using a
virtually mapped memmap or because we don't support a discontig memmap at
all), then we can implement nth_page() by simple addition.  Contrary to
popular belief, the compiler is not able to optimise this itself for a
vmemmap configuration.  This reduces one example user (sg.c) by four
instructions:

        struct page *page = nth_page(rsv_schp->pages[k], offset >> PAGE_SHIFT);

before:
   49 8b 45 70             mov    0x70(%r13),%rax
   48 63 c9                movslq %ecx,%rcx
   48 c1 eb 0c             shr    $0xc,%rbx
   48 8b 04 c8             mov    (%rax,%rcx,8),%rax
   48 2b 05 00 00 00 00    sub    0x0(%rip),%rax
           R_X86_64_PC32      vmemmap_base-0x4
   48 c1 f8 06             sar    $0x6,%rax
   48 01 d8                add    %rbx,%rax
   48 c1 e0 06             shl    $0x6,%rax
   48 03 05 00 00 00 00    add    0x0(%rip),%rax
           R_X86_64_PC32      vmemmap_base-0x4

after:
   49 8b 45 70             mov    0x70(%r13),%rax
   48 63 c9                movslq %ecx,%rcx
   48 c1 eb 0c             shr    $0xc,%rbx
   48 c1 e3 06             shl    $0x6,%rbx
   48 03 1c c8             add    (%rax,%rcx,8),%rbx

Link: https://lkml.kernel.org/r/20210413194625.1472345-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Douglas Gilbert <dougg@torque.net>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f08e9de92fc5..9bd21e6fad6a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -234,7 +234,11 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
 int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp, void **shadowp);
 
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
+#else
+#define nth_page(page,n) ((page) + (n))
+#endif
 
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
-- 
cgit v1.2.3


From 28f836b6777b6f42dce068a40d83a891deaaca37 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:41:38 -0700
Subject: mm/page_alloc: split per cpu page lists and zone stats

The PCP (per-cpu page allocator in page_alloc.c) shares locking
requirements with vmstat and the zone lock which is inconvenient and
causes some issues.  For example, the PCP list and vmstat share the same
per-cpu space meaning that it's possible that vmstat updates dirty cache
lines holding per-cpu lists across CPUs unless padding is used.  Second,
PREEMPT_RT does not want to disable IRQs for too long in the page
allocator.

This series splits the locking requirements and uses locks types more
suitable for PREEMPT_RT, reduces the time when special locking is required
for stats and reduces the time when IRQs need to be disabled on
!PREEMPT_RT kernels.

Why local_lock?  PREEMPT_RT considers the following sequence to be unsafe
as documented in Documentation/locking/locktypes.rst

   local_irq_disable();
   spin_lock(&lock);

The pcp allocator has this sequence for rmqueue_pcplist (local_irq_save)
-> __rmqueue_pcplist -> rmqueue_bulk (spin_lock).  While it's possible to
separate this out, it generally means there are points where we enable
IRQs and reenable them again immediately.  To prevent a migration and the
per-cpu pointer going stale, migrate_disable is also needed.  That is a
custom lock that is similar, but worse, than local_lock.  Furthermore, on
PREEMPT_RT, it's undesirable to leave IRQs disabled for too long.  By
converting to local_lock which disables migration on PREEMPT_RT, the
locking requirements can be separated and start moving the protections for
PCP, stats and the zone lock to PREEMPT_RT-safe equivalent locking.  As a
bonus, local_lock also means that PROVE_LOCKING does something useful.

After that, it's obvious that zone_statistics incurs too much overhead and
leaves IRQs disabled for longer than necessary on !PREEMPT_RT kernels.
zone_statistics uses perfectly accurate counters requiring IRQs be
disabled for parallel RMW sequences when inaccurate ones like vm_events
would do.  The series makes the NUMA statistics (NUMA_HIT and friends)
inaccurate counters that then require no special protection on
!PREEMPT_RT.

The bulk page allocator can then do stat updates in bulk with IRQs enabled
which should improve the efficiency.  Technically, this could have been
done without the local_lock and vmstat conversion work and the order
simply reflects the timing of when different series were implemented.

Finally, there are places where we conflate IRQs being disabled for the
PCP with the IRQ-safe zone spinlock.  The remainder of the series reduces
the scope of what is protected by disabled IRQs on !PREEMPT_RT kernels.
By the end of the series, page_alloc.c does not call local_irq_save so the
locking scope is a bit clearer.  The one exception is that modifying
NR_FREE_PAGES still happens in places where it's known the IRQs are
disabled as it's harmless for PREEMPT_RT and would be expensive to split
the locking there.

No performance data is included because despite the overhead of the stats,
it's within the noise for most workloads on !PREEMPT_RT.  However, Jesper
Dangaard Brouer ran a page allocation microbenchmark on a E5-1650 v4 @
3.60GHz CPU on the first version of this series.  Focusing on the array
variant of the bulk page allocator reveals the following.

(CPU: Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz)
ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size

         Baseline        Patched
 1       56.383          54.225 (+3.83%)
 2       40.047          35.492 (+11.38%)
 3       37.339          32.643 (+12.58%)
 4       35.578          30.992 (+12.89%)
 8       33.592          29.606 (+11.87%)
 16      32.362          28.532 (+11.85%)
 32      31.476          27.728 (+11.91%)
 64      30.633          27.252 (+11.04%)
 128     30.596          27.090 (+11.46%)

While this is a positive outcome, the series is more likely to be
interesting to the RT people in terms of getting parts of the PREEMPT_RT
tree into mainline.

This patch (of 9):

The per-cpu page allocator lists and the per-cpu vmstat deltas are stored
in the same struct per_cpu_pages even though vmstats have no direct impact
on the per-cpu page lists.  This is inconsistent because the vmstats for a
node are stored on a dedicated structure.  The bigger issue is that the
per_cpu_pages structure is not cache-aligned and stat updates either cache
conflict with adjacent per-cpu lists incurring a runtime cost or padding
is required incurring a memory cost.

This patch splits the per-cpu pagelists and the vmstat deltas into
separate structures.  It's mostly a mechanical conversion but some
variable renaming is done to clearly distinguish the per-cpu pages
structure (pcp) from the vmstats (pzstats).

Superficially, this appears to increase the size of the per_cpu_pages
structure but the movement of expire fills a structure hole so there is no
impact overall.

[mgorman@techsingularity.net: make it W=1 cleaner]
  Link: https://lkml.kernel.org/r/20210514144622.GA3735@techsingularity.net
[mgorman@techsingularity.net: make it W=1 even cleaner]
  Link: https://lkml.kernel.org/r/20210516140705.GB3735@techsingularity.net
[lkp@intel.com: check struct per_cpu_zonestat has a non-zero size]
[vbabka@suse.cz: Init zone->per_cpu_zonestats properly]

Link: https://lkml.kernel.org/r/20210512095458.30632-1-mgorman@techsingularity.net
Link: https://lkml.kernel.org/r/20210512095458.30632-2-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 18 +++++-----
 include/linux/vmstat.h |  8 ++---
 mm/page_alloc.c        | 85 +++++++++++++++++++++++--------------------
 mm/vmstat.c            | 98 ++++++++++++++++++++++++++------------------------
 4 files changed, 113 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c2bfefd34b59..a50b123ab7ae 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -341,20 +341,21 @@ struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
+#ifdef CONFIG_NUMA
+	int expire;		/* When 0, remote pagesets are drained */
+#endif
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[MIGRATE_PCPTYPES];
 };
 
-struct per_cpu_pageset {
-	struct per_cpu_pages pcp;
-#ifdef CONFIG_NUMA
-	s8 expire;
-	u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
-#endif
+struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
-	s8 stat_threshold;
 	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+	s8 stat_threshold;
+#endif
+#ifdef CONFIG_NUMA
+	u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 };
 
@@ -484,7 +485,8 @@ struct zone {
 	int node;
 #endif
 	struct pglist_data	*zone_pgdat;
-	struct per_cpu_pageset __percpu *pageset;
+	struct per_cpu_pages	__percpu *per_cpu_pageset;
+	struct per_cpu_zonestat	__percpu *per_cpu_zonestats;
 	/*
 	 * the high and batch values are copied to individual pagesets for
 	 * faster access
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 3299cd69e4ca..0c5f36504613 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
 	int cpu;
 
 	for_each_online_cpu(cpu)
-		x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
+		x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item];
 
 	return x;
 }
@@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 #ifdef CONFIG_SMP
 	int cpu;
 	for_each_online_cpu(cpu)
-		x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+		x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item];
 
 	if (x < 0)
 		x = 0;
@@ -291,7 +291,7 @@ struct ctl_table;
 int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
 		loff_t *ppos);
 
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);
 
 int calculate_pressure_threshold(struct zone *zone);
 int calculate_normal_threshold(struct zone *zone);
@@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { }
 static inline void quiet_vmstat(void) { }
 
 static inline void drain_zonestat(struct zone *zone,
-			struct per_cpu_pageset *pset) { }
+			struct per_cpu_zonestat *pzstats) { }
 #endif		/* CONFIG_SMP */
 
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 902f889a324d..330c7307a92b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3026,15 +3026,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
 	unsigned long flags;
-	struct per_cpu_pageset *pset;
 	struct per_cpu_pages *pcp;
 
 	local_irq_save(flags);
-	pset = per_cpu_ptr(zone->pageset, cpu);
 
-	pcp = &pset->pcp;
+	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
 	if (pcp->count)
 		free_pcppages_bulk(zone, pcp->count, pcp);
+
 	local_irq_restore(flags);
 }
 
@@ -3133,7 +3132,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 	 * disables preemption as part of its processing
 	 */
 	for_each_online_cpu(cpu) {
-		struct per_cpu_pageset *pcp;
+		struct per_cpu_pages *pcp;
 		struct zone *z;
 		bool has_pcps = false;
 
@@ -3144,13 +3143,13 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 			 */
 			has_pcps = true;
 		} else if (zone) {
-			pcp = per_cpu_ptr(zone->pageset, cpu);
-			if (pcp->pcp.count)
+			pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+			if (pcp->count)
 				has_pcps = true;
 		} else {
 			for_each_populated_zone(z) {
-				pcp = per_cpu_ptr(z->pageset, cpu);
-				if (pcp->pcp.count) {
+				pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
+				if (pcp->count) {
 					has_pcps = true;
 					break;
 				}
@@ -3280,7 +3279,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
-	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= READ_ONCE(pcp->high))
@@ -3496,7 +3495,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	unsigned long flags;
 
 	local_irq_save(flags);
-	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
 	if (page) {
@@ -5105,7 +5104,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 
 	/* Attempt the batch allocation */
 	local_irq_save(flags);
-	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list = &pcp->lists[ac.migratetype];
 
 	while (nr_populated < nr_pages) {
@@ -5720,7 +5719,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			continue;
 
 		for_each_online_cpu(cpu)
-			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+			free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
 	}
 
 	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
@@ -5812,7 +5811,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 
 		free_pcp = 0;
 		for_each_online_cpu(cpu)
-			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+			free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
 
 		show_node(zone);
 		printk(KERN_CONT
@@ -5853,7 +5852,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_BOUNCE)),
 			K(free_pcp),
-			K(this_cpu_read(zone->pageset->pcp.count)),
+			K(this_cpu_read(zone->per_cpu_pageset->count)),
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
@@ -6180,11 +6179,12 @@ static void build_zonelists(pg_data_t *pgdat)
  * not check if the processor is online before following the pageset pointer.
  * Other parts of the kernel may not check if the zone is available.
  */
-static void pageset_init(struct per_cpu_pageset *p);
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
 /* These effectively disable the pcplists in the boot pageset completely */
 #define BOOT_PAGESET_HIGH	0
 #define BOOT_PAGESET_BATCH	1
-static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
 static void __build_all_zonelists(void *data)
@@ -6251,7 +6251,7 @@ build_all_zonelists_init(void)
 	 * (a chicken-egg dilemma).
 	 */
 	for_each_possible_cpu(cpu)
-		pageset_init(&per_cpu(boot_pageset, cpu));
+		per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
 
 	mminit_verify_zonelist();
 	cpuset_init_current_mems_allowed();
@@ -6650,14 +6650,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
 	WRITE_ONCE(pcp->high, high);
 }
 
-static void pageset_init(struct per_cpu_pageset *p)
+static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
 {
-	struct per_cpu_pages *pcp;
 	int migratetype;
 
-	memset(p, 0, sizeof(*p));
+	memset(pcp, 0, sizeof(*pcp));
+	memset(pzstats, 0, sizeof(*pzstats));
 
-	pcp = &p->pcp;
 	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
 		INIT_LIST_HEAD(&pcp->lists[migratetype]);
 
@@ -6674,12 +6673,12 @@ static void pageset_init(struct per_cpu_pageset *p)
 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
 		unsigned long batch)
 {
-	struct per_cpu_pageset *p;
+	struct per_cpu_pages *pcp;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		p = per_cpu_ptr(zone->pageset, cpu);
-		pageset_update(&p->pcp, high, batch);
+		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+		pageset_update(pcp, high, batch);
 	}
 }
 
@@ -6714,13 +6713,20 @@ static void zone_set_pageset_high_and_batch(struct zone *zone)
 
 void __meminit setup_zone_pageset(struct zone *zone)
 {
-	struct per_cpu_pageset *p;
 	int cpu;
 
-	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+	/* Size may be 0 on !SMP && !NUMA */
+	if (sizeof(struct per_cpu_zonestat) > 0)
+		zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
+
+	zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
 	for_each_possible_cpu(cpu) {
-		p = per_cpu_ptr(zone->pageset, cpu);
-		pageset_init(p);
+		struct per_cpu_pages *pcp;
+		struct per_cpu_zonestat *pzstats;
+
+		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+		per_cpu_pages_init(pcp, pzstats);
 	}
 
 	zone_set_pageset_high_and_batch(zone);
@@ -6747,9 +6753,9 @@ void __init setup_per_cpu_pageset(void)
 	 * the nodes these zones are associated with.
 	 */
 	for_each_possible_cpu(cpu) {
-		struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
-		memset(pcp->vm_numa_stat_diff, 0,
-		       sizeof(pcp->vm_numa_stat_diff));
+		struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
+		memset(pzstats->vm_numa_stat_diff, 0,
+		       sizeof(pzstats->vm_numa_stat_diff));
 	}
 #endif
 
@@ -6765,7 +6771,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
 	 * relies on the ability of the linker to provide the
 	 * offset of a (static) per cpu variable into the per cpu area.
 	 */
-	zone->pageset = &boot_pageset;
+	zone->per_cpu_pageset = &boot_pageset;
+	zone->per_cpu_zonestats = &boot_zonestats;
 	zone->pageset_high = BOOT_PAGESET_HIGH;
 	zone->pageset_batch = BOOT_PAGESET_BATCH;
 
@@ -9046,15 +9053,17 @@ void zone_pcp_enable(struct zone *zone)
 void zone_pcp_reset(struct zone *zone)
 {
 	int cpu;
-	struct per_cpu_pageset *pset;
+	struct per_cpu_zonestat *pzstats;
 
-	if (zone->pageset != &boot_pageset) {
+	if (zone->per_cpu_pageset != &boot_pageset) {
 		for_each_online_cpu(cpu) {
-			pset = per_cpu_ptr(zone->pageset, cpu);
-			drain_zonestat(zone, pset);
+			pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+			drain_zonestat(zone, pzstats);
 		}
-		free_percpu(zone->pageset);
-		zone->pageset = &boot_pageset;
+		free_percpu(zone->per_cpu_pageset);
+		free_percpu(zone->per_cpu_zonestats);
+		zone->per_cpu_pageset = &boot_pageset;
+		zone->per_cpu_zonestats = &boot_zonestats;
 	}
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index cccee36b289c..f1400ba46beb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -44,7 +44,7 @@ static void zero_zone_numa_counters(struct zone *zone)
 	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
 		atomic_long_set(&zone->vm_numa_stat[item], 0);
 		for_each_online_cpu(cpu)
-			per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
+			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item]
 						= 0;
 	}
 }
@@ -266,7 +266,7 @@ void refresh_zone_stat_thresholds(void)
 		for_each_online_cpu(cpu) {
 			int pgdat_threshold;
 
-			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
 							= threshold;
 
 			/* Base nodestat threshold on the largest populated zone. */
@@ -303,7 +303,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 
 		threshold = (*calculate_pressure)(zone);
 		for_each_online_cpu(cpu)
-			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
 							= threshold;
 	}
 }
@@ -316,7 +316,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 			   long delta)
 {
-	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 	s8 __percpu *p = pcp->vm_stat_diff + item;
 	long x;
 	long t;
@@ -389,7 +389,7 @@ EXPORT_SYMBOL(__mod_node_page_state);
  */
 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 	s8 __percpu *p = pcp->vm_stat_diff + item;
 	s8 v, t;
 
@@ -435,7 +435,7 @@ EXPORT_SYMBOL(__inc_node_page_state);
 
 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 	s8 __percpu *p = pcp->vm_stat_diff + item;
 	s8 v, t;
 
@@ -495,7 +495,7 @@ EXPORT_SYMBOL(__dec_node_page_state);
 static inline void mod_zone_state(struct zone *zone,
        enum zone_stat_item item, long delta, int overstep_mode)
 {
-	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 	s8 __percpu *p = pcp->vm_stat_diff + item;
 	long o, n, t, z;
 
@@ -781,19 +781,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 	int changes = 0;
 
 	for_each_populated_zone(zone) {
-		struct per_cpu_pageset __percpu *p = zone->pageset;
+		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+#ifdef CONFIG_NUMA
+		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
+#endif
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 			int v;
 
-			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
+			v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
 			if (v) {
 
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_zone_diff[i] += v;
 #ifdef CONFIG_NUMA
 				/* 3 seconds idle till flush */
-				__this_cpu_write(p->expire, 3);
+				__this_cpu_write(pcp->expire, 3);
 #endif
 			}
 		}
@@ -801,12 +804,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
 			int v;
 
-			v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
+			v = this_cpu_xchg(pzstats->vm_numa_stat_diff[i], 0);
 			if (v) {
 
 				atomic_long_add(v, &zone->vm_numa_stat[i]);
 				global_numa_diff[i] += v;
-				__this_cpu_write(p->expire, 3);
+				__this_cpu_write(pcp->expire, 3);
 			}
 		}
 
@@ -819,23 +822,23 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 			 * Check if there are pages remaining in this pageset
 			 * if not then there is nothing to expire.
 			 */
-			if (!__this_cpu_read(p->expire) ||
-			       !__this_cpu_read(p->pcp.count))
+			if (!__this_cpu_read(pcp->expire) ||
+			       !__this_cpu_read(pcp->count))
 				continue;
 
 			/*
 			 * We never drain zones local to this processor.
 			 */
 			if (zone_to_nid(zone) == numa_node_id()) {
-				__this_cpu_write(p->expire, 0);
+				__this_cpu_write(pcp->expire, 0);
 				continue;
 			}
 
-			if (__this_cpu_dec_return(p->expire))
+			if (__this_cpu_dec_return(pcp->expire))
 				continue;
 
-			if (__this_cpu_read(p->pcp.count)) {
-				drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+			if (__this_cpu_read(pcp->count)) {
+				drain_zone_pages(zone, this_cpu_ptr(pcp));
 				changes++;
 			}
 		}
@@ -882,27 +885,27 @@ void cpu_vm_stats_fold(int cpu)
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 
 	for_each_populated_zone(zone) {
-		struct per_cpu_pageset *p;
+		struct per_cpu_zonestat *pzstats;
 
-		p = per_cpu_ptr(zone->pageset, cpu);
+		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-			if (p->vm_stat_diff[i]) {
+			if (pzstats->vm_stat_diff[i]) {
 				int v;
 
-				v = p->vm_stat_diff[i];
-				p->vm_stat_diff[i] = 0;
+				v = pzstats->vm_stat_diff[i];
+				pzstats->vm_stat_diff[i] = 0;
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_zone_diff[i] += v;
 			}
 
 #ifdef CONFIG_NUMA
 		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-			if (p->vm_numa_stat_diff[i]) {
+			if (pzstats->vm_numa_stat_diff[i]) {
 				int v;
 
-				v = p->vm_numa_stat_diff[i];
-				p->vm_numa_stat_diff[i] = 0;
+				v = pzstats->vm_numa_stat_diff[i];
+				pzstats->vm_numa_stat_diff[i] = 0;
 				atomic_long_add(v, &zone->vm_numa_stat[i]);
 				global_numa_diff[i] += v;
 			}
@@ -936,24 +939,24 @@ void cpu_vm_stats_fold(int cpu)
  * this is only called if !populated_zone(zone), which implies no other users of
  * pset->vm_stat_diff[] exist.
  */
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
 {
 	int i;
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		if (pset->vm_stat_diff[i]) {
-			int v = pset->vm_stat_diff[i];
-			pset->vm_stat_diff[i] = 0;
+		if (pzstats->vm_stat_diff[i]) {
+			int v = pzstats->vm_stat_diff[i];
+			pzstats->vm_stat_diff[i] = 0;
 			atomic_long_add(v, &zone->vm_stat[i]);
 			atomic_long_add(v, &vm_zone_stat[i]);
 		}
 
 #ifdef CONFIG_NUMA
 	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-		if (pset->vm_numa_stat_diff[i]) {
-			int v = pset->vm_numa_stat_diff[i];
+		if (pzstats->vm_numa_stat_diff[i]) {
+			int v = pzstats->vm_numa_stat_diff[i];
 
-			pset->vm_numa_stat_diff[i] = 0;
+			pzstats->vm_numa_stat_diff[i] = 0;
 			atomic_long_add(v, &zone->vm_numa_stat[i]);
 			atomic_long_add(v, &vm_numa_stat[i]);
 		}
@@ -965,8 +968,8 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 void __inc_numa_state(struct zone *zone,
 				 enum numa_stat_item item)
 {
-	struct per_cpu_pageset __percpu *pcp = zone->pageset;
-	u16 __percpu *p = pcp->vm_numa_stat_diff + item;
+	struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+	u16 __percpu *p = pzstats->vm_numa_stat_diff + item;
 	u16 v;
 
 	v = __this_cpu_inc_return(*p);
@@ -1693,21 +1696,23 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 
 	seq_printf(m, "\n  pagesets");
 	for_each_online_cpu(i) {
-		struct per_cpu_pageset *pageset;
+		struct per_cpu_pages *pcp;
+		struct per_cpu_zonestat __maybe_unused *pzstats;
 
-		pageset = per_cpu_ptr(zone->pageset, i);
+		pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
 		seq_printf(m,
 			   "\n    cpu: %i"
 			   "\n              count: %i"
 			   "\n              high:  %i"
 			   "\n              batch: %i",
 			   i,
-			   pageset->pcp.count,
-			   pageset->pcp.high,
-			   pageset->pcp.batch);
+			   pcp->count,
+			   pcp->high,
+			   pcp->batch);
 #ifdef CONFIG_SMP
+		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
 		seq_printf(m, "\n  vm stats threshold: %d",
-				pageset->stat_threshold);
+				pzstats->stat_threshold);
 #endif
 	}
 	seq_printf(m,
@@ -1927,17 +1932,18 @@ static bool need_update(int cpu)
 	struct zone *zone;
 
 	for_each_populated_zone(zone) {
-		struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+		struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
 		struct per_cpu_nodestat *n;
+
 		/*
 		 * The fast way of checking if there are any vmstat diffs.
 		 */
-		if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
-			       sizeof(p->vm_stat_diff[0])))
+		if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+			       sizeof(pzstats->vm_stat_diff[0])))
 			return true;
 #ifdef CONFIG_NUMA
-		if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
-			       sizeof(p->vm_numa_stat_diff[0])))
+		if (memchr_inv(pzstats->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
+			       sizeof(pzstats->vm_numa_stat_diff[0])))
 			return true;
 #endif
 		if (last_pgdat == zone->zone_pgdat)
-- 
cgit v1.2.3


From dbbee9d5cd83f9d0a29639e260516907ceb2ac3d Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:41:41 -0700
Subject: mm/page_alloc: convert per-cpu list protection to local_lock

There is a lack of clarity of what exactly
local_irq_save/local_irq_restore protects in page_alloc.c .  It conflates
the protection of per-cpu page allocation structures with per-cpu vmstat
deltas.

This patch protects the PCP structure using local_lock which for most
configurations is identical to IRQ enabling/disabling.  The scope of the
lock is still wider than it should be but this is decreased later.

It is possible for the local_lock to be embedded safely within struct
per_cpu_pages but it adds complexity to free_unref_page_list.

[akpm@linux-foundation.org: coding style fixes]
[mgorman@techsingularity.net: work around a pahole limitation with zero-sized struct pagesets]
  Link: https://lkml.kernel.org/r/20210526080741.GW30378@techsingularity.net
[lkp@intel.com: Make pagesets static]

Link: https://lkml.kernel.org/r/20210512095458.30632-3-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  2 ++
 lib/Kconfig.debug      |  3 +++
 mm/page_alloc.c        | 61 +++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 51 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a50b123ab7ae..0d6bb737e5a2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -20,6 +20,7 @@
 #include <linux/atomic.h>
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
+#include <linux/local_lock.h>
 #include <asm/page.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -337,6 +338,7 @@ enum zone_watermarks {
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
+/* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7723f58a9394..deca67d28abb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -313,6 +313,9 @@ config DEBUG_INFO_BTF
 config PAHOLE_HAS_SPLIT_BTF
 	def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "119")
 
+config PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT
+	def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "122")
+
 config DEBUG_INFO_BTF_MODULES
 	def_bool y
 	depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 330c7307a92b..89872ad5e872 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -122,6 +122,24 @@ typedef int __bitwise fpi_t;
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
+struct pagesets {
+	local_lock_t lock;
+#if defined(CONFIG_DEBUG_INFO_BTF) &&				\
+	!defined(CONFIG_DEBUG_LOCK_ALLOC) &&			\
+	!defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT)
+	/*
+	 * pahole 1.21 and earlier gets confused by zero-sized per-CPU
+	 * variables and produces invalid BTF. Ensure that
+	 * sizeof(struct pagesets) != 0 for older versions of pahole.
+	 */
+	char __pahole_hack;
+	#warning "pahole too old to support zero-sized struct pagesets"
+#endif
+};
+static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+	.lock = INIT_LOCAL_LOCK(lock),
+};
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -1453,6 +1471,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 		} while (--count && --batch_free && !list_empty(list));
 	}
 
+	/*
+	 * local_lock_irq held so equivalent to spin_lock_irqsave for
+	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
+	 */
 	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
 
@@ -1573,6 +1595,11 @@ static void __free_pages_ok(struct page *page, unsigned int order,
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
+
+	/*
+	 * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock
+	 * and protect vmstat updates.
+	 */
 	local_irq_save(flags);
 	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype,
@@ -2955,6 +2982,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 {
 	int i, allocated = 0;
 
+	/*
+	 * local_lock_irq held so equivalent to spin_lock_irqsave for
+	 * both PREEMPT_RT and non-PREEMPT_RT configurations.
+	 */
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
@@ -3007,12 +3038,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	unsigned long flags;
 	int to_drain, batch;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
 	if (to_drain > 0)
 		free_pcppages_bulk(zone, to_drain, pcp);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 #endif
 
@@ -3028,13 +3059,13 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
 	if (pcp->count)
 		free_pcppages_bulk(zone, pcp->count, pcp);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3297,9 +3328,9 @@ void free_unref_page(struct page *page)
 	if (!free_unref_page_prepare(page, pfn))
 		return;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	free_unref_page_commit(page, pfn);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3319,7 +3350,7 @@ void free_unref_page_list(struct list_head *list)
 		set_page_private(page, pfn);
 	}
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn = page_private(page);
 
@@ -3332,12 +3363,12 @@ void free_unref_page_list(struct list_head *list)
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count = 0;
-			local_irq_save(flags);
+			local_lock_irqsave(&pagesets.lock, flags);
 		}
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
 /*
@@ -3494,7 +3525,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct page *page;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
@@ -3502,7 +3533,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
 		zone_statistics(preferred_zone, zone);
 	}
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 	return page;
 }
 
@@ -5103,7 +5134,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		goto failed;
 
 	/* Attempt the batch allocation */
-	local_irq_save(flags);
+	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list = &pcp->lists[ac.migratetype];
 
@@ -5141,12 +5172,12 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	return nr_populated;
 
 failed_irq:
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&pagesets.lock, flags);
 
 failed:
 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
-- 
cgit v1.2.3


From f19298b9516c1a031b34b4147773457e3efe743b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:41:44 -0700
Subject: mm/vmstat: convert NUMA statistics to basic NUMA counters

NUMA statistics are maintained on the zone level for hits, misses, foreign
etc but nothing relies on them being perfectly accurate for functional
correctness.  The counters are used by userspace to get a general overview
of a workloads NUMA behaviour but the page allocator incurs a high cost to
maintain perfect accuracy similar to what is required for a vmstat like
NR_FREE_PAGES.  There even is a sysctl vm.numa_stat to allow userspace to
turn off the collection of NUMA statistics like NUMA_HIT.

This patch converts NUMA_HIT and friends to be NUMA events with similar
accuracy to VM events.  There is a possibility that slight errors will be
introduced but the overall trend as seen by userspace will be similar.
The counters are no longer updated from vmstat_refresh context as it is
unnecessary overhead for counters that may never be read by userspace.
Note that counters could be maintained at the node level to save space but
it would have a user-visible impact due to /proc/zoneinfo.

[lkp@intel.com: Fix misplaced closing brace for !CONFIG_NUMA]

Link: https://lkml.kernel.org/r/20210512095458.30632-4-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    |  18 ++---
 include/linux/mmzone.h |  13 ++--
 include/linux/vmstat.h |  43 ++++++------
 mm/mempolicy.c         |   2 +-
 mm/page_alloc.c        |  12 ++--
 mm/vmstat.c            | 173 +++++++++++++++++++------------------------------
 6 files changed, 113 insertions(+), 148 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 2c36f61d30bc..9db297431b97 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -482,6 +482,7 @@ static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);
 static ssize_t node_read_numastat(struct device *dev,
 				  struct device_attribute *attr, char *buf)
 {
+	fold_vm_numa_events();
 	return sysfs_emit(buf,
 			  "numa_hit %lu\n"
 			  "numa_miss %lu\n"
@@ -489,12 +490,12 @@ static ssize_t node_read_numastat(struct device *dev,
 			  "interleave_hit %lu\n"
 			  "local_node %lu\n"
 			  "other_node %lu\n",
-			  sum_zone_numa_state(dev->id, NUMA_HIT),
-			  sum_zone_numa_state(dev->id, NUMA_MISS),
-			  sum_zone_numa_state(dev->id, NUMA_FOREIGN),
-			  sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
-			  sum_zone_numa_state(dev->id, NUMA_LOCAL),
-			  sum_zone_numa_state(dev->id, NUMA_OTHER));
+			  sum_zone_numa_event_state(dev->id, NUMA_HIT),
+			  sum_zone_numa_event_state(dev->id, NUMA_MISS),
+			  sum_zone_numa_event_state(dev->id, NUMA_FOREIGN),
+			  sum_zone_numa_event_state(dev->id, NUMA_INTERLEAVE_HIT),
+			  sum_zone_numa_event_state(dev->id, NUMA_LOCAL),
+			  sum_zone_numa_event_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);
 
@@ -512,10 +513,11 @@ static ssize_t node_read_vmstat(struct device *dev,
 				     sum_zone_node_page_state(nid, i));
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+	fold_vm_numa_events();
+	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
 		len += sysfs_emit_at(buf, len, "%s %lu\n",
 				     numa_stat_name(i),
-				     sum_zone_numa_state(nid, i));
+				     sum_zone_numa_event_state(nid, i));
 
 #endif
 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d6bb737e5a2..f86018d5e362 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -135,10 +135,10 @@ enum numa_stat_item {
 	NUMA_INTERLEAVE_HIT,	/* interleaver preferred this zone */
 	NUMA_LOCAL,		/* allocation from local node */
 	NUMA_OTHER,		/* allocation from other node */
-	NR_VM_NUMA_STAT_ITEMS
+	NR_VM_NUMA_EVENT_ITEMS
 };
 #else
-#define NR_VM_NUMA_STAT_ITEMS 0
+#define NR_VM_NUMA_EVENT_ITEMS 0
 #endif
 
 enum zone_stat_item {
@@ -357,7 +357,12 @@ struct per_cpu_zonestat {
 	s8 stat_threshold;
 #endif
 #ifdef CONFIG_NUMA
-	u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
+	/*
+	 * Low priority inaccurate counters that are only folded
+	 * on demand. Use a large type to avoid the overhead of
+	 * folding during refresh_cpu_vm_stats.
+	 */
+	unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
 #endif
 };
 
@@ -623,7 +628,7 @@ struct zone {
 	ZONE_PADDING(_pad3_)
 	/* Zone statistics */
 	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
-	atomic_long_t		vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
+	atomic_long_t		vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
 } ____cacheline_internodealigned_in_smp;
 
 enum pgdat_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 0c5f36504613..59748bbbba4c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -138,34 +138,27 @@ static inline void vm_events_fold_cpu(int cpu)
  * Zone and node-based page accounting with per cpu differentials.
  */
 extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
-extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
 extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
+extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
 
 #ifdef CONFIG_NUMA
-static inline void zone_numa_state_add(long x, struct zone *zone,
-				 enum numa_stat_item item)
+static inline void zone_numa_event_add(long x, struct zone *zone,
+				enum numa_stat_item item)
 {
-	atomic_long_add(x, &zone->vm_numa_stat[item]);
-	atomic_long_add(x, &vm_numa_stat[item]);
+	atomic_long_add(x, &zone->vm_numa_event[item]);
+	atomic_long_add(x, &vm_numa_event[item]);
 }
 
-static inline unsigned long global_numa_state(enum numa_stat_item item)
+static inline unsigned long zone_numa_event_state(struct zone *zone,
+					enum numa_stat_item item)
 {
-	long x = atomic_long_read(&vm_numa_stat[item]);
-
-	return x;
+	return atomic_long_read(&zone->vm_numa_event[item]);
 }
 
-static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
-					enum numa_stat_item item)
+static inline unsigned long
+global_numa_event_state(enum numa_stat_item item)
 {
-	long x = atomic_long_read(&zone->vm_numa_stat[item]);
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item];
-
-	return x;
+	return atomic_long_read(&vm_numa_event[item]);
 }
 #endif /* CONFIG_NUMA */
 
@@ -245,18 +238,22 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 }
 
 #ifdef CONFIG_NUMA
-extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
+extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
 					      enum zone_stat_item item);
-extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
+extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
 extern unsigned long node_page_state(struct pglist_data *pgdat,
 						enum node_stat_item item);
 extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
 					   enum node_stat_item item);
+extern void fold_vm_numa_events(void);
 #else
 #define sum_zone_node_page_state(node, item) global_zone_page_state(item)
 #define node_page_state(node, item) global_node_page_state(item)
 #define node_page_state_pages(node, item) global_node_page_state_pages(item)
+static inline void fold_vm_numa_events(void)
+{
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
@@ -428,7 +425,7 @@ static inline const char *numa_stat_name(enum numa_stat_item item)
 static inline const char *node_stat_name(enum node_stat_item item)
 {
 	return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
-			   NR_VM_NUMA_STAT_ITEMS +
+			   NR_VM_NUMA_EVENT_ITEMS +
 			   item];
 }
 
@@ -440,7 +437,7 @@ static inline const char *lru_list_name(enum lru_list lru)
 static inline const char *writeback_stat_name(enum writeback_stat_item item)
 {
 	return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
-			   NR_VM_NUMA_STAT_ITEMS +
+			   NR_VM_NUMA_EVENT_ITEMS +
 			   NR_VM_NODE_STAT_ITEMS +
 			   item];
 }
@@ -449,7 +446,7 @@ static inline const char *writeback_stat_name(enum writeback_stat_item item)
 static inline const char *vm_event_name(enum vm_event_item item)
 {
 	return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
-			   NR_VM_NUMA_STAT_ITEMS +
+			   NR_VM_NUMA_EVENT_ITEMS +
 			   NR_VM_NODE_STAT_ITEMS +
 			   NR_VM_WRITEBACK_STAT_ITEMS +
 			   item];
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 325771bef5e2..b5d95bf1025d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2150,7 +2150,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 		return page;
 	if (page && page_to_nid(page) == nid) {
 		preempt_disable();
-		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
+		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
 		preempt_enable();
 	}
 	return page;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 89872ad5e872..4e03109bdae5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3480,12 +3480,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 		local_stat = NUMA_OTHER;
 
 	if (zone_to_nid(z) == zone_to_nid(preferred_zone))
-		__inc_numa_state(z, NUMA_HIT);
+		__count_numa_event(z, NUMA_HIT);
 	else {
-		__inc_numa_state(z, NUMA_MISS);
-		__inc_numa_state(preferred_zone, NUMA_FOREIGN);
+		__count_numa_event(z, NUMA_MISS);
+		__count_numa_event(preferred_zone, NUMA_FOREIGN);
 	}
-	__inc_numa_state(z, local_stat);
+	__count_numa_event(z, local_stat);
 #endif
 }
 
@@ -6785,8 +6785,8 @@ void __init setup_per_cpu_pageset(void)
 	 */
 	for_each_possible_cpu(cpu) {
 		struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
-		memset(pzstats->vm_numa_stat_diff, 0,
-		       sizeof(pzstats->vm_numa_stat_diff));
+		memset(pzstats->vm_numa_event, 0,
+		       sizeof(pzstats->vm_numa_event));
 	}
 #endif
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f1400ba46beb..0e27b62e487d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -31,8 +31,6 @@
 
 #include "internal.h"
 
-#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
-
 #ifdef CONFIG_NUMA
 int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
 
@@ -41,11 +39,12 @@ static void zero_zone_numa_counters(struct zone *zone)
 {
 	int item, cpu;
 
-	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
-		atomic_long_set(&zone->vm_numa_stat[item], 0);
-		for_each_online_cpu(cpu)
-			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item]
+	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
+		atomic_long_set(&zone->vm_numa_event[item], 0);
+		for_each_online_cpu(cpu) {
+			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
 						= 0;
+		}
 	}
 }
 
@@ -63,8 +62,8 @@ static void zero_global_numa_counters(void)
 {
 	int item;
 
-	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
-		atomic_long_set(&vm_numa_stat[item], 0);
+	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+		atomic_long_set(&vm_numa_event[item], 0);
 }
 
 static void invalid_numa_statistics(void)
@@ -161,10 +160,9 @@ void vm_events_fold_cpu(int cpu)
  * vm_stat contains the global counters
  */
 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
-atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
 EXPORT_SYMBOL(vm_zone_stat);
-EXPORT_SYMBOL(vm_numa_stat);
 EXPORT_SYMBOL(vm_node_stat);
 
 #ifdef CONFIG_SMP
@@ -706,8 +704,7 @@ EXPORT_SYMBOL(dec_node_page_state);
  * Fold a differential into the global counters.
  * Returns the number of counters updated.
  */
-#ifdef CONFIG_NUMA
-static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+static int fold_diff(int *zone_diff, int *node_diff)
 {
 	int i;
 	int changes = 0;
@@ -718,12 +715,6 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
 			changes++;
 	}
 
-	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-		if (numa_diff[i]) {
-			atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
-			changes++;
-	}
-
 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 		if (node_diff[i]) {
 			atomic_long_add(node_diff[i], &vm_node_stat[i]);
@@ -731,26 +722,34 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
 	}
 	return changes;
 }
-#else
-static int fold_diff(int *zone_diff, int *node_diff)
+
+#ifdef CONFIG_NUMA
+static void fold_vm_zone_numa_events(struct zone *zone)
 {
-	int i;
-	int changes = 0;
+	unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
+	int cpu;
+	enum numa_stat_item item;
 
-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		if (zone_diff[i]) {
-			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
-			changes++;
-	}
+	for_each_online_cpu(cpu) {
+		struct per_cpu_zonestat *pzstats;
 
-	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-		if (node_diff[i]) {
-			atomic_long_add(node_diff[i], &vm_node_stat[i]);
-			changes++;
+		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+		for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+			zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
 	}
-	return changes;
+
+	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+		zone_numa_event_add(zone_numa_events[item], zone, item);
 }
-#endif /* CONFIG_NUMA */
+
+void fold_vm_numa_events(void)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone)
+		fold_vm_zone_numa_events(zone);
+}
+#endif
 
 /*
  * Update the zone counters for the current cpu.
@@ -774,9 +773,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
-#ifdef CONFIG_NUMA
-	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
-#endif
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 	int changes = 0;
 
@@ -801,17 +797,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 			}
 		}
 #ifdef CONFIG_NUMA
-		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
-			int v;
-
-			v = this_cpu_xchg(pzstats->vm_numa_stat_diff[i], 0);
-			if (v) {
-
-				atomic_long_add(v, &zone->vm_numa_stat[i]);
-				global_numa_diff[i] += v;
-				__this_cpu_write(pcp->expire, 3);
-			}
-		}
 
 		if (do_pagesets) {
 			cond_resched();
@@ -859,12 +844,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 		}
 	}
 
-#ifdef CONFIG_NUMA
-	changes += fold_diff(global_zone_diff, global_numa_diff,
-			     global_node_diff);
-#else
 	changes += fold_diff(global_zone_diff, global_node_diff);
-#endif
 	return changes;
 }
 
@@ -879,9 +859,6 @@ void cpu_vm_stats_fold(int cpu)
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
-#ifdef CONFIG_NUMA
-	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
-#endif
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 
 	for_each_populated_zone(zone) {
@@ -889,7 +866,7 @@ void cpu_vm_stats_fold(int cpu)
 
 		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
 
-		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 			if (pzstats->vm_stat_diff[i]) {
 				int v;
 
@@ -898,17 +875,17 @@ void cpu_vm_stats_fold(int cpu)
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_zone_diff[i] += v;
 			}
-
+		}
 #ifdef CONFIG_NUMA
-		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-			if (pzstats->vm_numa_stat_diff[i]) {
-				int v;
+		for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
+			if (pzstats->vm_numa_event[i]) {
+				unsigned long v;
 
-				v = pzstats->vm_numa_stat_diff[i];
-				pzstats->vm_numa_stat_diff[i] = 0;
-				atomic_long_add(v, &zone->vm_numa_stat[i]);
-				global_numa_diff[i] += v;
+				v = pzstats->vm_numa_event[i];
+				pzstats->vm_numa_event[i] = 0;
+				zone_numa_event_add(v, zone, i);
 			}
+		}
 #endif
 	}
 
@@ -928,11 +905,7 @@ void cpu_vm_stats_fold(int cpu)
 			}
 	}
 
-#ifdef CONFIG_NUMA
-	fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
-#else
 	fold_diff(global_zone_diff, global_node_diff);
-#endif
 }
 
 /*
@@ -941,43 +914,37 @@ void cpu_vm_stats_fold(int cpu)
  */
 void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
 {
+	unsigned long v;
 	int i;
 
-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 		if (pzstats->vm_stat_diff[i]) {
-			int v = pzstats->vm_stat_diff[i];
+			v = pzstats->vm_stat_diff[i];
 			pzstats->vm_stat_diff[i] = 0;
-			atomic_long_add(v, &zone->vm_stat[i]);
-			atomic_long_add(v, &vm_zone_stat[i]);
+			zone_page_state_add(v, zone, i);
 		}
+	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-		if (pzstats->vm_numa_stat_diff[i]) {
-			int v = pzstats->vm_numa_stat_diff[i];
-
-			pzstats->vm_numa_stat_diff[i] = 0;
-			atomic_long_add(v, &zone->vm_numa_stat[i]);
-			atomic_long_add(v, &vm_numa_stat[i]);
+	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
+		if (pzstats->vm_numa_event[i]) {
+			v = pzstats->vm_numa_event[i];
+			pzstats->vm_numa_event[i] = 0;
+			zone_numa_event_add(v, zone, i);
 		}
+	}
 #endif
 }
 #endif
 
 #ifdef CONFIG_NUMA
-void __inc_numa_state(struct zone *zone,
+/* See __count_vm_event comment on why raw_cpu_inc is used. */
+void __count_numa_event(struct zone *zone,
 				 enum numa_stat_item item)
 {
 	struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-	u16 __percpu *p = pzstats->vm_numa_stat_diff + item;
-	u16 v;
-
-	v = __this_cpu_inc_return(*p);
 
-	if (unlikely(v > NUMA_STATS_THRESHOLD)) {
-		zone_numa_state_add(v, zone, item);
-		__this_cpu_write(*p, 0);
-	}
+	raw_cpu_inc(pzstats->vm_numa_event[item]);
 }
 
 /*
@@ -998,19 +965,16 @@ unsigned long sum_zone_node_page_state(int node,
 	return count;
 }
 
-/*
- * Determine the per node value of a numa stat item. To avoid deviation,
- * the per cpu stat number in vm_numa_stat_diff[] is also included.
- */
-unsigned long sum_zone_numa_state(int node,
+/* Determine the per node value of a numa stat item. */
+unsigned long sum_zone_numa_event_state(int node,
 				 enum numa_stat_item item)
 {
 	struct zone *zones = NODE_DATA(node)->node_zones;
-	int i;
 	unsigned long count = 0;
+	int i;
 
 	for (i = 0; i < MAX_NR_ZONES; i++)
-		count += zone_numa_state_snapshot(zones + i, item);
+		count += zone_numa_event_state(zones + i, item);
 
 	return count;
 }
@@ -1689,9 +1653,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 			   zone_page_state(zone, i));
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
 		seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
-			   zone_numa_state_snapshot(zone, i));
+			   zone_numa_event_state(zone, i));
 #endif
 
 	seq_printf(m, "\n  pagesets");
@@ -1745,7 +1709,7 @@ static const struct seq_operations zoneinfo_op = {
 };
 
 #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
-			 NR_VM_NUMA_STAT_ITEMS + \
+			 NR_VM_NUMA_EVENT_ITEMS + \
 			 NR_VM_NODE_STAT_ITEMS + \
 			 NR_VM_WRITEBACK_STAT_ITEMS + \
 			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
@@ -1760,6 +1724,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 		return NULL;
 
 	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+	fold_vm_numa_events();
 	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
 	m->private = v;
 	if (!v)
@@ -1769,9 +1734,9 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	v += NR_VM_ZONE_STAT_ITEMS;
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-		v[i] = global_numa_state(i);
-	v += NR_VM_NUMA_STAT_ITEMS;
+	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
+		v[i] = global_numa_event_state(i);
+	v += NR_VM_NUMA_EVENT_ITEMS;
 #endif
 
 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
@@ -1941,11 +1906,7 @@ static bool need_update(int cpu)
 		if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
 			       sizeof(pzstats->vm_stat_diff[0])))
 			return true;
-#ifdef CONFIG_NUMA
-		if (memchr_inv(pzstats->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
-			       sizeof(pzstats->vm_numa_stat_diff[0])))
-			return true;
-#endif
+
 		if (last_pgdat == zone->zone_pgdat)
 			continue;
 		last_pgdat = zone->zone_pgdat;
-- 
cgit v1.2.3


From 3ac44a346a50988131db124a7e4bb99d3ec71706 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:41:47 -0700
Subject: mm/vmstat: inline NUMA event counter updates

__count_numa_event is small enough to be treated similarly to
__count_vm_event so inline it.

Link: https://lkml.kernel.org/r/20210512095458.30632-5-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 10 +++++++++-
 mm/vmstat.c            |  9 ---------
 2 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 59748bbbba4c..fe32a2210e73 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -238,7 +238,15 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 }
 
 #ifdef CONFIG_NUMA
-extern void __count_numa_event(struct zone *zone, enum numa_stat_item item);
+/* See __count_vm_event comment on why raw_cpu_inc is used. */
+static inline void
+__count_numa_event(struct zone *zone, enum numa_stat_item item)
+{
+	struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+	raw_cpu_inc(pzstats->vm_numa_event[item]);
+}
+
 extern unsigned long sum_zone_node_page_state(int node,
 					      enum zone_stat_item item);
 extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0e27b62e487d..b0534e068166 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -938,15 +938,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
 #endif
 
 #ifdef CONFIG_NUMA
-/* See __count_vm_event comment on why raw_cpu_inc is used. */
-void __count_numa_event(struct zone *zone,
-				 enum numa_stat_item item)
-{
-	struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
-
-	raw_cpu_inc(pzstats->vm_numa_event[item]);
-}
-
 /*
  * Determine the per node value of a stat item. This function
  * is called frequently in a NUMA machine, so try to be as
-- 
cgit v1.2.3


From 3e23060b2d0b7eebf37b3b6043ea68da0ebc0646 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:41:50 -0700
Subject: mm/page_alloc: batch the accounting updates in the bulk allocator

Now that the zone_statistics are simple counters that do not require
special protection, the bulk allocator accounting updates can be batch
updated without adding too much complexity with protected RMW updates or
using xchg.

Link: https://lkml.kernel.org/r/20210512095458.30632-6-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h |  8 ++++++++
 mm/page_alloc.c        | 30 +++++++++++++-----------------
 2 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fe32a2210e73..d6a6cf53b127 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -247,6 +247,14 @@ __count_numa_event(struct zone *zone, enum numa_stat_item item)
 	raw_cpu_inc(pzstats->vm_numa_event[item]);
 }
 
+static inline void
+__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
+{
+	struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
+
+	raw_cpu_add(pzstats->vm_numa_event[item], delta);
+}
+
 extern unsigned long sum_zone_node_page_state(int node,
 					      enum zone_stat_item item);
 extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e03109bdae5..6bb9b87cf7d5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3467,7 +3467,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
  *
  * Must be called with interrupts disabled.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+				   long nr_account)
 {
 #ifdef CONFIG_NUMA
 	enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -3480,12 +3481,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 		local_stat = NUMA_OTHER;
 
 	if (zone_to_nid(z) == zone_to_nid(preferred_zone))
-		__count_numa_event(z, NUMA_HIT);
+		__count_numa_events(z, NUMA_HIT, nr_account);
 	else {
-		__count_numa_event(z, NUMA_MISS);
-		__count_numa_event(preferred_zone, NUMA_FOREIGN);
+		__count_numa_events(z, NUMA_MISS, nr_account);
+		__count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
 	}
-	__count_numa_event(z, local_stat);
+	__count_numa_events(z, local_stat, nr_account);
 #endif
 }
 
@@ -3531,7 +3532,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
-		zone_statistics(preferred_zone, zone);
+		zone_statistics(preferred_zone, zone, 1);
 	}
 	local_unlock_irqrestore(&pagesets.lock, flags);
 	return page;
@@ -3592,7 +3593,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 				  get_pcppage_migratetype(page));
 
 	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-	zone_statistics(preferred_zone, zone);
+	zone_statistics(preferred_zone, zone, 1);
 	local_irq_restore(flags);
 
 out:
@@ -5077,7 +5078,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	struct alloc_context ac;
 	gfp_t alloc_gfp;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
-	int nr_populated = 0;
+	int nr_populated = 0, nr_account = 0;
 
 	if (unlikely(nr_pages <= 0))
 		return 0;
@@ -5154,15 +5155,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 				goto failed_irq;
 			break;
 		}
-
-		/*
-		 * Ideally this would be batched but the best way to do
-		 * that cheaply is to first convert zone_statistics to
-		 * be inaccurate per-cpu counter like vm_events to avoid
-		 * a RMW cycle then do the accounting with IRQs enabled.
-		 */
-		__count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
-		zone_statistics(ac.preferred_zoneref->zone, zone);
+		nr_account++;
 
 		prep_new_page(page, 0, gfp, 0);
 		if (page_list)
@@ -5172,6 +5165,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
+	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
+	zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+
 	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	return nr_populated;
-- 
cgit v1.2.3


From bbbecb35a41cb5c63ef78e14cc8b95fa9130bc1a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:42:09 -0700
Subject: mm/page_alloc: delete vm.percpu_pagelist_fraction

Patch series "Calculate pcp->high based on zone sizes and active CPUs", v2.

The per-cpu page allocator (PCP) is meant to reduce contention on the zone
lock but the sizing of batch and high is archaic and neither takes the
zone size into account or the number of CPUs local to a zone.  With larger
zones and more CPUs per node, the contention is getting worse.
Furthermore, the fact that vm.percpu_pagelist_fraction adjusts both batch
and high values means that the sysctl can reduce zone lock contention but
also increase allocation latencies.

This series disassociates pcp->high from pcp->batch and then scales
pcp->high based on the size of the local zone with limited impact to
reclaim and accounting for active CPUs but leaves pcp->batch static.  It
also adapts the number of pages that can be on the pcp list based on
recent freeing patterns.

The motivation is partially to adjust to larger memory sizes but is also
driven by the fact that large batches of page freeing via release_pages()
often shows zone contention as a major part of the problem.  Another is a
bug report based on an older kernel where a multi-terabyte process can
takes several minutes to exit.  A workaround was to use
vm.percpu_pagelist_fraction to increase the pcp->high value but testing
indicated that a production workload could not use the same values because
of an increase in allocation latencies.  Unfortunately, I cannot reproduce
this test case myself as the multi-terabyte machines are in active use but
it should alleviate the problem.

The series aims to address both and partially acts as a pre-requisite.
pcp only works with order-0 which is useless for SLUB (when using high
orders) and THP (unconditionally).  To store high-order pages on PCP, the
pcp->high values need to be increased first.

This patch (of 6):

The vm.percpu_pagelist_fraction is used to increase the batch and high
limits for the per-cpu page allocator (PCP).  The intent behind the sysctl
is to reduce zone lock acquisition when allocating/freeing pages but it
has a problem.  While it can decrease contention, it can also increase
latency on the allocation side due to unreasonably large batch sizes.
This leads to games where an administrator adjusts
percpu_pagelist_fraction on the fly to work around contention and
allocation latency problems.

This series aims to alleviate the problems with zone lock contention while
avoiding the allocation-side latency problems.  For the purposes of
review, it's easier to remove this sysctl now and reintroduce a similar
sysctl later in the series that deals only with pcp->high.

Link: https://lkml.kernel.org/r/20210525080119.5455-1-mgorman@techsingularity.net
Link: https://lkml.kernel.org/r/20210525080119.5455-2-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/sysctl/vm.rst | 19 ------------
 include/linux/mmzone.h                  |  3 --
 kernel/sysctl.c                         |  8 -----
 mm/page_alloc.c                         | 55 +++------------------------------
 4 files changed, 4 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 586cd4b86428..2fcafccb53a8 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -64,7 +64,6 @@ Currently, these files are in /proc/sys/vm:
 - overcommit_ratio
 - page-cluster
 - panic_on_oom
-- percpu_pagelist_fraction
 - stat_interval
 - stat_refresh
 - numa_stat
@@ -790,24 +789,6 @@ panic_on_oom=2+kdump gives you very strong tool to investigate
 why oom happens. You can get snapshot.
 
 
-percpu_pagelist_fraction
-========================
-
-This is the fraction of pages at most (high mark pcp->high) in each zone that
-are allocated for each per cpu page list.  The min value for this is 8.  It
-means that we don't allow more than 1/8th of pages in each zone to be
-allocated in any single per_cpu_pagelist.  This entry only changes the value
-of hot per cpu pagelists.  User can specify a number like 100 to allocate
-1/100th of each zone to each per cpu page list.
-
-The batch value of each per cpu pagelist is also updated as a result.  It is
-set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)
-
-The initial value is zero.  Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list.  If the user writes '0' to this
-sysctl, it will revert to this default behavior.
-
-
 stat_interval
 =============
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f86018d5e362..7937a1d1d166 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1027,15 +1027,12 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
 		size_t *, loff_t *);
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
-		void *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 int numa_zonelist_order_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
-extern int percpu_pagelist_fraction;
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN	16
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d4a78e08f6d8..51213c33171e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2908,14 +2908,6 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &one_thousand,
 	},
-	{
-		.procname	= "percpu_pagelist_fraction",
-		.data		= &percpu_pagelist_fraction,
-		.maxlen		= sizeof(percpu_pagelist_fraction),
-		.mode		= 0644,
-		.proc_handler	= percpu_pagelist_fraction_sysctl_handler,
-		.extra1		= SYSCTL_ZERO,
-	},
 	{
 		.procname	= "page_lock_unfairness",
 		.data		= &sysctl_page_lock_unfairness,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 941a75b9fb5a..5abf2c1d4c58 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -120,7 +120,6 @@ typedef int __bitwise fpi_t;
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
-#define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
 struct pagesets {
 	local_lock_t lock;
@@ -193,7 +192,6 @@ EXPORT_SYMBOL(_totalram_pages);
 unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
 
-int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
 EXPORT_SYMBOL(init_on_alloc);
@@ -6735,22 +6733,15 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h
 
 /*
  * Calculate and set new high and batch values for all per-cpu pagesets of a
- * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
+ * zone based on the zone's size.
  */
 static void zone_set_pageset_high_and_batch(struct zone *zone)
 {
 	unsigned long new_high, new_batch;
 
-	if (percpu_pagelist_fraction) {
-		new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
-		new_batch = max(1UL, new_high / 4);
-		if ((new_high / 4) > (PAGE_SHIFT * 8))
-			new_batch = PAGE_SHIFT * 8;
-	} else {
-		new_batch = zone_batchsize(zone);
-		new_high = 6 * new_batch;
-		new_batch = max(1UL, 1 * new_batch);
-	}
+	new_batch = zone_batchsize(zone);
+	new_high = 6 * new_batch;
+	new_batch = max(1UL, 1 * new_batch);
 
 	if (zone->pageset_high == new_high &&
 	    zone->pageset_batch == new_batch)
@@ -8413,44 +8404,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu.  It is the fraction of total pages in each zone that a hot per cpu
- * pagelist can have before it gets flushed back to buddy allocator.
- */
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *length, loff_t *ppos)
-{
-	struct zone *zone;
-	int old_percpu_pagelist_fraction;
-	int ret;
-
-	mutex_lock(&pcp_batch_high_lock);
-	old_percpu_pagelist_fraction = percpu_pagelist_fraction;
-
-	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-	if (!write || ret < 0)
-		goto out;
-
-	/* Sanity checking to avoid pcp imbalance */
-	if (percpu_pagelist_fraction &&
-	    percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
-		percpu_pagelist_fraction = old_percpu_pagelist_fraction;
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* No change? */
-	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
-		goto out;
-
-	for_each_populated_zone(zone)
-		zone_set_pageset_high_and_batch(zone);
-out:
-	mutex_unlock(&pcp_batch_high_lock);
-	return ret;
-}
-
 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
 /*
  * Returns the number of pages that arch has reserved but
-- 
cgit v1.2.3


From 04f8cfeaed0849e702278378bce3867577ca45fb Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:42:15 -0700
Subject: mm/page_alloc: adjust pcp->high after CPU hotplug events

The PCP high watermark is based on the number of online CPUs so the
watermarks must be adjusted during CPU hotplug.  At the time of
hot-remove, the number of online CPUs is already adjusted but during
hot-add, a delta needs to be applied to update PCP to the correct value.
After this patch is applied, the high watermarks are adjusted correctly.

  # grep high: /proc/zoneinfo  | tail -1
              high:  649
  # echo 0 > /sys/devices/system/cpu/cpu4/online
  # grep high: /proc/zoneinfo  | tail -1
              high:  664
  # echo 1 > /sys/devices/system/cpu/cpu4/online
  # grep high: /proc/zoneinfo  | tail -1
              high:  649

Link: https://lkml.kernel.org/r/20210525080119.5455-4-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuhotplug.h |  2 +-
 mm/internal.h              |  2 +-
 mm/page_alloc.c            | 38 +++++++++++++++++++++++++++-----------
 3 files changed, 29 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4a62b3980642..47e13582d9fc 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -54,7 +54,7 @@ enum cpuhp_state {
 	CPUHP_MM_MEMCQ_DEAD,
 	CPUHP_PERCPU_CNT_DEAD,
 	CPUHP_RADIX_DEAD,
-	CPUHP_PAGE_ALLOC_DEAD,
+	CPUHP_PAGE_ALLOC,
 	CPUHP_NET_DEV_DEAD,
 	CPUHP_PCI_XGENE_DEAD,
 	CPUHP_IOMMU_IOVA_DEAD,
diff --git a/mm/internal.h b/mm/internal.h
index 2946dfa0f245..18e5fb4d225f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -206,7 +206,7 @@ extern int user_min_free_kbytes;
 extern void free_unref_page(struct page *page);
 extern void free_unref_page_list(struct list_head *list);
 
-extern void zone_pcp_update(struct zone *zone);
+extern void zone_pcp_update(struct zone *zone, int cpu_online);
 extern void zone_pcp_reset(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);
 extern void zone_pcp_enable(struct zone *zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 19ec81d403a0..8d196a803820 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6667,7 +6667,7 @@ static int zone_batchsize(struct zone *zone)
 #endif
 }
 
-static int zone_highsize(struct zone *zone, int batch)
+static int zone_highsize(struct zone *zone, int batch, int cpu_online)
 {
 #ifdef CONFIG_MMU
 	int high;
@@ -6678,9 +6678,10 @@ static int zone_highsize(struct zone *zone, int batch)
 	 * so that if they are full then background reclaim will not be
 	 * started prematurely. The value is split across all online CPUs
 	 * local to the zone. Note that early in boot that CPUs may not be
-	 * online yet.
+	 * online yet and that during CPU hotplug that the cpumask is not
+	 * yet updated when a CPU is being onlined.
 	 */
-	nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone))));
+	nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online;
 	high = low_wmark_pages(zone) / nr_local_cpus;
 
 	/*
@@ -6754,12 +6755,12 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h
  * Calculate and set new high and batch values for all per-cpu pagesets of a
  * zone based on the zone's size.
  */
-static void zone_set_pageset_high_and_batch(struct zone *zone)
+static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
 {
 	int new_high, new_batch;
 
 	new_batch = max(1, zone_batchsize(zone));
-	new_high = zone_highsize(zone, new_batch);
+	new_high = zone_highsize(zone, new_batch, cpu_online);
 
 	if (zone->pageset_high == new_high &&
 	    zone->pageset_batch == new_batch)
@@ -6789,7 +6790,7 @@ void __meminit setup_zone_pageset(struct zone *zone)
 		per_cpu_pages_init(pcp, pzstats);
 	}
 
-	zone_set_pageset_high_and_batch(zone);
+	zone_set_pageset_high_and_batch(zone, 0);
 }
 
 /*
@@ -8044,6 +8045,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 
 static int page_alloc_cpu_dead(unsigned int cpu)
 {
+	struct zone *zone;
 
 	lru_add_drain_cpu(cpu);
 	drain_pages(cpu);
@@ -8064,6 +8066,19 @@ static int page_alloc_cpu_dead(unsigned int cpu)
 	 * race with what we are doing.
 	 */
 	cpu_vm_stats_fold(cpu);
+
+	for_each_populated_zone(zone)
+		zone_pcp_update(zone, 0);
+
+	return 0;
+}
+
+static int page_alloc_cpu_online(unsigned int cpu)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone)
+		zone_pcp_update(zone, 1);
 	return 0;
 }
 
@@ -8089,8 +8104,9 @@ void __init page_alloc_init(void)
 		hashdist = 0;
 #endif
 
-	ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
-					"mm/page_alloc:dead", NULL,
+	ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
+					"mm/page_alloc:pcp",
+					page_alloc_cpu_online,
 					page_alloc_cpu_dead);
 	WARN_ON(ret < 0);
 }
@@ -8252,7 +8268,7 @@ void setup_per_zone_wmarks(void)
 	 * and high limits or the limits may be inappropriate.
 	 */
 	for_each_zone(zone)
-		zone_pcp_update(zone);
+		zone_pcp_update(zone, 0);
 }
 
 /*
@@ -9053,10 +9069,10 @@ EXPORT_SYMBOL(free_contig_range);
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
  * page high values need to be recalculated.
  */
-void __meminit zone_pcp_update(struct zone *zone)
+void zone_pcp_update(struct zone *zone, int cpu_online)
 {
 	mutex_lock(&pcp_batch_high_lock);
-	zone_set_pageset_high_and_batch(zone);
+	zone_set_pageset_high_and_batch(zone, cpu_online);
 	mutex_unlock(&pcp_batch_high_lock);
 }
 
-- 
cgit v1.2.3


From 3b12e7e97938424de2bb1b95ba0bd6a49bad39f9 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:42:18 -0700
Subject: mm/page_alloc: scale the number of pages that are batch freed

When a task is freeing a large number of order-0 pages, it may acquire the
zone->lock multiple times freeing pages in batches.  This may
unnecessarily contend on the zone lock when freeing very large number of
pages.  This patch adapts the size of the batch based on the recent
pattern to scale the batch size for subsequent frees.

As the machines I used were not large enough to test this are not large
enough to illustrate a problem, a debugging patch shows patterns like the
following (slightly editted for clarity)

Baseline vanilla kernel
  time-unmap-14426   [...] free_pcppages_bulk: free   63 count  378 high  378
  time-unmap-14426   [...] free_pcppages_bulk: free   63 count  378 high  378
  time-unmap-14426   [...] free_pcppages_bulk: free   63 count  378 high  378
  time-unmap-14426   [...] free_pcppages_bulk: free   63 count  378 high  378
  time-unmap-14426   [...] free_pcppages_bulk: free   63 count  378 high  378

With patches
  time-unmap-7724    [...] free_pcppages_bulk: free  126 count  814 high  814
  time-unmap-7724    [...] free_pcppages_bulk: free  252 count  814 high  814
  time-unmap-7724    [...] free_pcppages_bulk: free  504 count  814 high  814
  time-unmap-7724    [...] free_pcppages_bulk: free  751 count  814 high  814
  time-unmap-7724    [...] free_pcppages_bulk: free  751 count  814 high  814

Link: https://lkml.kernel.org/r/20210525080119.5455-5-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  3 ++-
 mm/page_alloc.c        | 41 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7937a1d1d166..0a86b2890a16 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -343,8 +343,9 @@ struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
+	short free_factor;	/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
-	int expire;		/* When 0, remote pagesets are drained */
+	short expire;		/* When 0, remote pagesets are drained */
 #endif
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d196a803820..e1d1825a2611 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3278,18 +3278,47 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
 	return true;
 }
 
+static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
+{
+	int min_nr_free, max_nr_free;
+
+	/* Check for PCP disabled or boot pageset */
+	if (unlikely(high < batch))
+		return 1;
+
+	/* Leave at least pcp->batch pages on the list */
+	min_nr_free = batch;
+	max_nr_free = high - batch;
+
+	/*
+	 * Double the number of pages freed each time there is subsequent
+	 * freeing of pages without any allocation.
+	 */
+	batch <<= pcp->free_factor;
+	if (batch < max_nr_free)
+		pcp->free_factor++;
+	batch = clamp(batch, min_nr_free, max_nr_free);
+
+	return batch;
+}
+
 static void free_unref_page_commit(struct page *page, unsigned long pfn,
 				   int migratetype)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
+	int high;
 
 	__count_vm_event(PGFREE);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
-	if (pcp->count >= READ_ONCE(pcp->high))
-		free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
+	high = READ_ONCE(pcp->high);
+	if (pcp->count >= high) {
+		int batch = READ_ONCE(pcp->batch);
+
+		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
+	}
 }
 
 /*
@@ -3541,7 +3570,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	unsigned long flags;
 
 	local_lock_irqsave(&pagesets.lock, flags);
+
+	/*
+	 * On allocation, reduce the number of pages that are batch freed.
+	 * See nr_pcp_free() where free_factor is increased for subsequent
+	 * frees.
+	 */
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
+	pcp->free_factor >>= 1;
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
 	local_unlock_irqrestore(&pagesets.lock, flags);
@@ -6737,6 +6773,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	 */
 	pcp->high = BOOT_PAGESET_HIGH;
 	pcp->batch = BOOT_PAGESET_BATCH;
+	pcp->free_factor = 0;
 }
 
 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
-- 
cgit v1.2.3


From c49c2c47dab6b8d45022b3fabf0642a0e62e3109 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:42:21 -0700
Subject: mm/page_alloc: limit the number of pages on PCP lists when reclaim is
 active

When kswapd is active then direct reclaim is potentially active.  In
either case, it is possible that a zone would be balanced if pages were
not trapped on PCP lists.  Instead of draining remote pages, simply limit
the size of the PCP lists while kswapd is active.

Link: https://lkml.kernel.org/r/20210525080119.5455-6-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        | 19 ++++++++++++++++++-
 mm/vmscan.c            | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0a86b2890a16..b2f40d64bc4b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -647,6 +647,7 @@ enum zone_flags {
 	ZONE_BOOSTED_WATERMARK,		/* zone recently boosted watermarks.
 					 * Cleared when kswapd is woken.
 					 */
+	ZONE_RECLAIM_ACTIVE,		/* kswapd may be scanning the zone. */
 };
 
 static inline unsigned long zone_managed_pages(struct zone *zone)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e1d1825a2611..adf35ccfd8e5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3302,6 +3302,23 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
 	return batch;
 }
 
+static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
+{
+	int high = READ_ONCE(pcp->high);
+
+	if (unlikely(!high))
+		return 0;
+
+	if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+		return high;
+
+	/*
+	 * If reclaim is active, limit the number of pages that can be
+	 * stored on pcp lists
+	 */
+	return min(READ_ONCE(pcp->batch) << 2, high);
+}
+
 static void free_unref_page_commit(struct page *page, unsigned long pfn,
 				   int migratetype)
 {
@@ -3313,7 +3330,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
-	high = READ_ONCE(pcp->high);
+	high = nr_pcp_high(pcp, zone);
 	if (pcp->count >= high) {
 		int batch = READ_ONCE(pcp->batch);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f96d62159720..d7c3cb8688dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3722,6 +3722,38 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
+/* Page allocator PCP high watermark is lowered if reclaim is active. */
+static inline void
+update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
+{
+	int i;
+	struct zone *zone;
+
+	for (i = 0; i <= highest_zoneidx; i++) {
+		zone = pgdat->node_zones + i;
+
+		if (!managed_zone(zone))
+			continue;
+
+		if (active)
+			set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
+		else
+			clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
+	}
+}
+
+static inline void
+set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
+{
+	update_reclaim_active(pgdat, highest_zoneidx, true);
+}
+
+static inline void
+clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
+{
+	update_reclaim_active(pgdat, highest_zoneidx, false);
+}
+
 /*
  * For kswapd, balance_pgdat() will reclaim pages across a node from zones
  * that are eligible for use by the caller until at least one zone is
@@ -3774,6 +3806,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 	boosted = nr_boost_reclaim;
 
 restart:
+	set_reclaim_active(pgdat, highest_zoneidx);
 	sc.priority = DEF_PRIORITY;
 	do {
 		unsigned long nr_reclaimed = sc.nr_reclaimed;
@@ -3907,6 +3940,8 @@ restart:
 		pgdat->kswapd_failures++;
 
 out:
+	clear_reclaim_active(pgdat, highest_zoneidx);
+
 	/* If reclaim was boosted, account for the reclaim done in this pass */
 	if (boosted) {
 		unsigned long flags;
-- 
cgit v1.2.3


From 74f44822097c665041010994502b5971d6cd9f04 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:42:24 -0700
Subject: mm/page_alloc: introduce vm.percpu_pagelist_high_fraction

This introduces a new sysctl vm.percpu_pagelist_high_fraction.  It is
similar to the old vm.percpu_pagelist_fraction.  The old sysctl increased
both pcp->batch and pcp->high with the higher pcp->high potentially
reducing zone->lock contention.  However, the higher pcp->batch value also
potentially increased allocation latency while the PCP was refilled.  This
sysctl only adjusts pcp->high so that zone->lock contention is potentially
reduced but allocation latency during a PCP refill remains the same.

  # grep -E "high:|batch" /proc/zoneinfo | tail -2
              high:  649
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=8
  # grep -E "high:|batch" /proc/zoneinfo | tail -2
              high:  35071
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=64
              high:  4383
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=0
              high:  649
              batch: 63

[mgorman@techsingularity.net: fix documentation]
  Link: https://lkml.kernel.org/r/20210528151010.GQ30378@techsingularity.net

Link: https://lkml.kernel.org/r/20210525080119.5455-7-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/sysctl/vm.rst | 21 ++++++++++
 include/linux/mmzone.h                  |  3 ++
 kernel/sysctl.c                         |  8 ++++
 mm/page_alloc.c                         | 69 +++++++++++++++++++++++++++++----
 4 files changed, 94 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 2fcafccb53a8..2da25735a629 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -64,6 +64,7 @@ Currently, these files are in /proc/sys/vm:
 - overcommit_ratio
 - page-cluster
 - panic_on_oom
+- percpu_pagelist_high_fraction
 - stat_interval
 - stat_refresh
 - numa_stat
@@ -789,6 +790,26 @@ panic_on_oom=2+kdump gives you very strong tool to investigate
 why oom happens. You can get snapshot.
 
 
+percpu_pagelist_high_fraction
+=============================
+
+This is the fraction of pages in each zone that are can be stored to
+per-cpu page lists. It is an upper boundary that is divided depending
+on the number of online CPUs. The min value for this is 8 which means
+that we do not allow more than 1/8th of pages in each zone to be stored
+on per-cpu page lists. This entry only changes the value of hot per-cpu
+page lists. A user can specify a number like 100 to allocate 1/100th of
+each zone between per-cpu lists.
+
+The batch value of each per-cpu page list remains the same regardless of
+the value of the high fraction so allocation latencies are unaffected.
+
+The initial value is zero. Kernel uses this value to set the high pcp->high
+mark based on the low watermark for the zone and the number of local
+online CPUs.  If the user writes '0' to this sysctl, it will revert to
+this default behavior.
+
+
 stat_interval
 =============
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b2f40d64bc4b..7d206ca850c7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1029,12 +1029,15 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
 		size_t *, loff_t *);
+int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int,
+		void *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 int numa_zonelist_order_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
+extern int percpu_pagelist_high_fraction;
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN	16
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 51213c33171e..69d925f1e5da 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2908,6 +2908,14 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &one_thousand,
 	},
+	{
+		.procname	= "percpu_pagelist_high_fraction",
+		.data		= &percpu_pagelist_high_fraction,
+		.maxlen		= sizeof(percpu_pagelist_high_fraction),
+		.mode		= 0644,
+		.proc_handler	= percpu_pagelist_high_fraction_sysctl_handler,
+		.extra1		= SYSCTL_ZERO,
+	},
 	{
 		.procname	= "page_lock_unfairness",
 		.data		= &sysctl_page_lock_unfairness,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index adf35ccfd8e5..cfc4071310fb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -120,6 +120,7 @@ typedef int __bitwise fpi_t;
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
 
 struct pagesets {
 	local_lock_t lock;
@@ -192,6 +193,7 @@ EXPORT_SYMBOL(_totalram_pages);
 unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
 
+int percpu_pagelist_high_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
 EXPORT_SYMBOL(init_on_alloc);
@@ -6725,17 +6727,32 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
 #ifdef CONFIG_MMU
 	int high;
 	int nr_local_cpus;
+	unsigned long total_pages;
+
+	if (!percpu_pagelist_high_fraction) {
+		/*
+		 * By default, the high value of the pcp is based on the zone
+		 * low watermark so that if they are full then background
+		 * reclaim will not be started prematurely.
+		 */
+		total_pages = low_wmark_pages(zone);
+	} else {
+		/*
+		 * If percpu_pagelist_high_fraction is configured, the high
+		 * value is based on a fraction of the managed pages in the
+		 * zone.
+		 */
+		total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+	}
 
 	/*
-	 * The high value of the pcp is based on the zone low watermark
-	 * so that if they are full then background reclaim will not be
-	 * started prematurely. The value is split across all online CPUs
-	 * local to the zone. Note that early in boot that CPUs may not be
-	 * online yet and that during CPU hotplug that the cpumask is not
-	 * yet updated when a CPU is being onlined.
+	 * Split the high value across all online CPUs local to the zone. Note
+	 * that early in boot that CPUs may not be online yet and that during
+	 * CPU hotplug that the cpumask is not yet updated when a CPU is being
+	 * onlined.
 	 */
 	nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online;
-	high = low_wmark_pages(zone) / nr_local_cpus;
+	high = total_pages / nr_local_cpus;
 
 	/*
 	 * Ensure high is at least batch*4. The multiple is based on the
@@ -8500,6 +8517,44 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
+/*
+ * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu
+ * pagelist can have before it gets flushed back to buddy allocator.
+ */
+int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+		int write, void *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int old_percpu_pagelist_high_fraction;
+	int ret;
+
+	mutex_lock(&pcp_batch_high_lock);
+	old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (!write || ret < 0)
+		goto out;
+
+	/* Sanity checking to avoid pcp imbalance */
+	if (percpu_pagelist_high_fraction &&
+	    percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
+		percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* No change? */
+	if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
+		goto out;
+
+	for_each_populated_zone(zone)
+		zone_set_pageset_high_and_batch(zone, 0);
+out:
+	mutex_unlock(&pcp_batch_high_lock);
+	return ret;
+}
+
 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
 /*
  * Returns the number of pages that arch has reserved but
-- 
cgit v1.2.3


From 777c00f5ede4fcb9ae49a2a957bec26d4d8f4c29 Mon Sep 17 00:00:00 2001
From: Dong Aisheng <aisheng.dong@nxp.com>
Date: Mon, 28 Jun 2021 19:42:27 -0700
Subject: mm: drop SECTION_SHIFT in code comments

Actually SECTIONS_SHIFT is used in the kernel code, so the code comments
is strictly incorrect.  And since commit bbeae5b05ef6 ("mm: move page
flags layout to separate header"), SECTIONS_SHIFT definition has been
moved to include/linux/page-flags-layout.h, since code itself looks quite
straighforward, instead of moving the code comment into the new place as
well, we just simply remove it.

This also fixed a checkpatch complain derived from the original code:
WARNING: please, no space before tabs
+ * SECTIONS_SHIFT    ^I^I#bits space required to store a section #$

Link: https://lkml.kernel.org/r/20210531091908.1738465-2-aisheng.dong@nxp.com
Signed-off-by: Dong Aisheng <aisheng.dong@nxp.com>
Suggested-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Yu Zhao <yuzhao@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7d206ca850c7..3e62e8ef68b5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1200,8 +1200,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 #ifdef CONFIG_SPARSEMEM
 
 /*
- * SECTION_SHIFT    		#bits space required to store a section #
- *
  * PA_SECTION_SHIFT		physical address to/from section number
  * PFN_SECTION_SHIFT		pfn to/from section number
  */
-- 
cgit v1.2.3


From bb1c50d3967f69f413b333713c2718d48d1ab7ea Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 28 Jun 2021 19:42:52 -0700
Subject: mm: remove CONFIG_DISCONTIGMEM

There are no architectures that support DISCONTIGMEM left.

Remove the configuration option and the dead code it was guarding in the
generic memory management code.

Link: https://lkml.kernel.org/r/20210608091316.3622-6-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Vineet Gupta <vgupta@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/memory_model.h | 37 ++++---------------------------------
 include/linux/mmzone.h             |  8 +++++---
 mm/Kconfig                         | 25 +++----------------------
 mm/page_alloc.c                    | 13 -------------
 4 files changed, 12 insertions(+), 71 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 7637fb46ba4f..a2c8ed60233a 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -6,47 +6,18 @@
 
 #ifndef __ASSEMBLY__
 
+/*
+ * supports 3 memory models.
+ */
 #if defined(CONFIG_FLATMEM)
 
 #ifndef ARCH_PFN_OFFSET
 #define ARCH_PFN_OFFSET		(0UL)
 #endif
 
-#elif defined(CONFIG_DISCONTIGMEM)
-
-#ifndef arch_pfn_to_nid
-#define arch_pfn_to_nid(pfn)	pfn_to_nid(pfn)
-#endif
-
-#ifndef arch_local_page_offset
-#define arch_local_page_offset(pfn, nid)	\
-	((pfn) - NODE_DATA(nid)->node_start_pfn)
-#endif
-
-#endif /* CONFIG_DISCONTIGMEM */
-
-/*
- * supports 3 memory models.
- */
-#if defined(CONFIG_FLATMEM)
-
 #define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
 #define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
 				 ARCH_PFN_OFFSET)
-#elif defined(CONFIG_DISCONTIGMEM)
-
-#define __pfn_to_page(pfn)			\
-({	unsigned long __pfn = (pfn);		\
-	unsigned long __nid = arch_pfn_to_nid(__pfn);  \
-	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
-})
-
-#define __page_to_pfn(pg)						\
-({	const struct page *__pg = (pg);					\
-	struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));	\
-	(unsigned long)(__pg - __pgdat->node_mem_map) +			\
-	 __pgdat->node_start_pfn;					\
-})
 
 #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
 
@@ -70,7 +41,7 @@
 	struct mem_section *__sec = __pfn_to_section(__pfn);	\
 	__section_mem_map_addr(__sec) + __pfn;		\
 })
-#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+#endif /* CONFIG_FLATMEM/SPARSEMEM */
 
 /*
  * Convert a physical address to a Page Frame Number and back
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e62e8ef68b5..6f9829562af2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -749,10 +749,12 @@ struct zonelist {
 	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
 };
 
-#ifndef CONFIG_DISCONTIGMEM
-/* The array of struct pages - for discontigmem use pgdat->lmem_map */
+/*
+ * The array of struct pages for flatmem.
+ * It must be declared for SPARSEMEM as well because there are configurations
+ * that rely on that.
+ */
 extern struct page *mem_map;
-#endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 struct deferred_split {
diff --git a/mm/Kconfig b/mm/Kconfig
index 02d44e3420f5..218b96ccc84a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -19,7 +19,7 @@ choice
 
 config FLATMEM_MANUAL
 	bool "Flat Memory"
-	depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
+	depends on !ARCH_SPARSEMEM_ENABLE || ARCH_FLATMEM_ENABLE
 	help
 	  This option is best suited for non-NUMA systems with
 	  flat address space. The FLATMEM is the most efficient
@@ -32,21 +32,6 @@ config FLATMEM_MANUAL
 
 	  If unsure, choose this option (Flat Memory) over any other.
 
-config DISCONTIGMEM_MANUAL
-	bool "Discontiguous Memory"
-	depends on ARCH_DISCONTIGMEM_ENABLE
-	help
-	  This option provides enhanced support for discontiguous
-	  memory systems, over FLATMEM.  These systems have holes
-	  in their physical address spaces, and this option provides
-	  more efficient handling of these holes.
-
-	  Although "Discontiguous Memory" is still used by several
-	  architectures, it is considered deprecated in favor of
-	  "Sparse Memory".
-
-	  If unsure, choose "Sparse Memory" over this option.
-
 config SPARSEMEM_MANUAL
 	bool "Sparse Memory"
 	depends on ARCH_SPARSEMEM_ENABLE
@@ -62,17 +47,13 @@ config SPARSEMEM_MANUAL
 
 endchoice
 
-config DISCONTIGMEM
-	def_bool y
-	depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
-
 config SPARSEMEM
 	def_bool y
 	depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
 
 config FLATMEM
 	def_bool y
-	depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+	depends on !SPARSEMEM || FLATMEM_MANUAL
 
 config FLAT_NODE_MEM_MAP
 	def_bool y
@@ -85,7 +66,7 @@ config FLAT_NODE_MEM_MAP
 #
 config NEED_MULTIPLE_NODES
 	def_bool y
-	depends on DISCONTIGMEM || NUMA
+	depends on NUMA
 
 #
 # SPARSEMEM_EXTREME (which is the default) does some bootmem
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 58f7a321598f..8926f3fd3bcf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -349,20 +349,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
- * are not on separate NUMA nodes. Functionally this works but with
- * watermark_boost_factor, it can reclaim prematurely as the ranges can be
- * quite small. By default, do not boost watermarks on discontigmem as in
- * many cases very high-order allocations like THP are likely to be
- * unsupported and the premature reclaim offsets the advantage of long-term
- * fragmentation avoidance.
- */
-int watermark_boost_factor __read_mostly;
-#else
 int watermark_boost_factor __read_mostly = 15000;
-#endif
 int watermark_scale_factor = 10;
 
 static unsigned long nr_kernel_pages __initdata;
-- 
cgit v1.2.3


From d3c251ab95b69f3dc189c4657baeac1b4c050789 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 28 Jun 2021 19:42:55 -0700
Subject: arch, mm: remove stale mentions of DISCONIGMEM

There are several places that mention DISCONIGMEM in comments or have
stale code guarded by CONFIG_DISCONTIGMEM.

Remove the dead code and update the comments.

Link: https://lkml.kernel.org/r/20210608091316.3622-7-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Vineet Gupta <vgupta@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/topology.c     | 5 ++---
 arch/ia64/mm/numa.c             | 5 ++---
 arch/mips/include/asm/mmzone.h  | 6 ------
 arch/mips/mm/init.c             | 3 ---
 arch/nds32/include/asm/memory.h | 6 ------
 arch/xtensa/include/asm/page.h  | 4 ----
 include/linux/gfp.h             | 4 ++--
 7 files changed, 6 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 09fc385c2acd..3639e0a7cb3b 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -3,9 +3,8 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * This file contains NUMA specific variables and functions which can
- * be split away from DISCONTIGMEM and are used on NUMA machines with
- * contiguous memory.
+ * This file contains NUMA specific variables and functions which are used on
+ * NUMA machines with contiguous memory.
  * 		2002/08/07 Erich Focht <efocht@ess.nec.de>
  * Populate cpu entries in sysfs for non-numa systems as well
  *  	Intel Corporation - Ashok Raj
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
index 46b6e5f3a40f..d6579ec3ea32 100644
--- a/arch/ia64/mm/numa.c
+++ b/arch/ia64/mm/numa.c
@@ -3,9 +3,8 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * This file contains NUMA specific variables and functions which can
- * be split away from DISCONTIGMEM and are used on NUMA machines with
- * contiguous memory.
+ * This file contains NUMA specific variables and functions which are used on
+ * NUMA machines with contiguous memory.
  * 
  *                         2002/08/07 Erich Focht <efocht@ess.nec.de>
  */
diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h
index b826b8473e95..7649ab45e80c 100644
--- a/arch/mips/include/asm/mmzone.h
+++ b/arch/mips/include/asm/mmzone.h
@@ -20,10 +20,4 @@
 #define nid_to_addrbase(nid) 0
 #endif
 
-#ifdef CONFIG_DISCONTIGMEM
-
-#define pfn_to_nid(pfn)		pa_to_nid((pfn) << PAGE_SHIFT)
-
-#endif /* CONFIG_DISCONTIGMEM */
-
 #endif /* _ASM_MMZONE_H_ */
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index c36358758969..97f6ca341448 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -454,9 +454,6 @@ void __init mem_init(void)
 	BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (_PFN_SHIFT > PAGE_SHIFT));
 
 #ifdef CONFIG_HIGHMEM
-#ifdef CONFIG_DISCONTIGMEM
-#error "CONFIG_HIGHMEM and CONFIG_DISCONTIGMEM dont work together yet"
-#endif
 	max_mapnr = highend_pfn ? highend_pfn : max_low_pfn;
 #else
 	max_mapnr = max_low_pfn;
diff --git a/arch/nds32/include/asm/memory.h b/arch/nds32/include/asm/memory.h
index 940d32842793..62faafbc28e4 100644
--- a/arch/nds32/include/asm/memory.h
+++ b/arch/nds32/include/asm/memory.h
@@ -76,18 +76,12 @@
  *  virt_to_page(k)	convert a _valid_ virtual address to struct page *
  *  virt_addr_valid(k)	indicates whether a virtual address is valid
  */
-#ifndef CONFIG_DISCONTIGMEM
-
 #define ARCH_PFN_OFFSET		PHYS_PFN_OFFSET
 #define pfn_valid(pfn)		((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr))
 
 #define virt_to_page(kaddr)	(pfn_to_page(__pa(kaddr) >> PAGE_SHIFT))
 #define virt_addr_valid(kaddr)	((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory)
 
-#else /* CONFIG_DISCONTIGMEM */
-#error CONFIG_DISCONTIGMEM is not supported yet.
-#endif /* !CONFIG_DISCONTIGMEM */
-
 #define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
 
 #endif
diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h
index 37ce25ef92d6..493eb7083b1a 100644
--- a/arch/xtensa/include/asm/page.h
+++ b/arch/xtensa/include/asm/page.h
@@ -192,10 +192,6 @@ static inline unsigned long ___pa(unsigned long va)
 #define pfn_valid(pfn) \
 	((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr)
 
-#ifdef CONFIG_DISCONTIGMEM
-# error CONFIG_DISCONTIGMEM not supported
-#endif
-
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define page_to_virt(page)	__va(page_to_pfn(page) << PAGE_SHIFT)
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 94f0b8b1cb55..0bec15b0691f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -494,8 +494,8 @@ static inline int gfp_zonelist(gfp_t flags)
  * There are two zonelists per node, one for all zones with memory and
  * one containing just zones from the node the zonelist belongs to.
  *
- * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
- * optimized to &contig_page_data at compile-time.
+ * For the case of non-NUMA systems the NODE_DATA() gets optimized to
+ * &contig_page_data at compile-time.
  */
 static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
 {
-- 
cgit v1.2.3


From a9ee6cf5c60ed1070e786e53665f9b2f23f2bd11 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 28 Jun 2021 19:43:01 -0700
Subject: mm: replace CONFIG_NEED_MULTIPLE_NODES with CONFIG_NUMA

After removal of DISCINTIGMEM the NEED_MULTIPLE_NODES and NUMA
configuration options are equivalent.

Drop CONFIG_NEED_MULTIPLE_NODES and use CONFIG_NUMA instead.

Done with

	$ sed -i 's/CONFIG_NEED_MULTIPLE_NODES/CONFIG_NUMA/' \
		$(git grep -wl CONFIG_NEED_MULTIPLE_NODES)
	$ sed -i 's/NEED_MULTIPLE_NODES/NUMA/' \
		$(git grep -wl NEED_MULTIPLE_NODES)

with manual tweaks afterwards.

[rppt@linux.ibm.com: fix arm boot crash]
  Link: https://lkml.kernel.org/r/YMj9vHhHOiCVN4BF@linux.ibm.com

Link: https://lkml.kernel.org/r/20210608091316.3622-9-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Vineet Gupta <vgupta@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/Kconfig                |  2 +-
 arch/ia64/Kconfig                 |  2 +-
 arch/mips/Kconfig                 |  2 +-
 arch/mips/include/asm/mmzone.h    |  2 +-
 arch/mips/include/asm/page.h      |  2 +-
 arch/mips/mm/init.c               |  4 ++--
 arch/powerpc/Kconfig              |  2 +-
 arch/powerpc/include/asm/mmzone.h |  4 ++--
 arch/powerpc/kernel/setup_64.c    |  2 +-
 arch/powerpc/kernel/smp.c         |  2 +-
 arch/powerpc/kexec/core.c         |  4 ++--
 arch/powerpc/mm/Makefile          |  2 +-
 arch/powerpc/mm/mem.c             |  4 ++--
 arch/riscv/Kconfig                |  2 +-
 arch/s390/Kconfig                 |  2 +-
 arch/sh/include/asm/mmzone.h      |  4 ++--
 arch/sh/kernel/topology.c         |  2 +-
 arch/sh/mm/Kconfig                |  2 +-
 arch/sh/mm/init.c                 |  2 +-
 arch/sparc/Kconfig                |  2 +-
 arch/sparc/include/asm/mmzone.h   |  4 ++--
 arch/sparc/kernel/smp_64.c        |  2 +-
 arch/sparc/mm/init_64.c           | 12 ++++++------
 arch/x86/Kconfig                  |  2 +-
 arch/x86/kernel/setup_percpu.c    |  6 +++---
 arch/x86/mm/init_32.c             |  4 ++--
 include/asm-generic/topology.h    |  2 +-
 include/linux/memblock.h          |  6 +++---
 include/linux/mm.h                |  4 ++--
 include/linux/mmzone.h            |  6 +++---
 kernel/crash_core.c               |  2 +-
 mm/Kconfig                        |  9 ---------
 mm/memblock.c                     |  8 ++++----
 mm/memory.c                       |  3 +--
 mm/page_alloc.c                   |  6 +++---
 mm/sparse.c                       |  2 +-
 36 files changed, 59 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f1d8566bbf9..d01a1545ab8f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1035,7 +1035,7 @@ config NODES_SHIFT
 	int "Maximum NUMA Nodes (as a power of 2)"
 	range 1 10
 	default "4"
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 	help
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 279252e3e0f7..da22a35e6f03 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -302,7 +302,7 @@ config NODES_SHIFT
 	int "Max num nodes shift(3-10)"
 	range 3 10
 	default "10"
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 	help
 	  This option specifies the maximum number of nodes in your SSI system.
 	  MAX_NUMNODES will be 2^(This value).
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index ed51970c08e7..4704a16c2e44 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2867,7 +2867,7 @@ config RANDOMIZE_BASE_MAX_OFFSET
 config NODES_SHIFT
 	int
 	default "6"
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 
 config HW_PERF_EVENTS
 	bool "Enable hardware performance counter support for perf events"
diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h
index 7649ab45e80c..602a21aee9d4 100644
--- a/arch/mips/include/asm/mmzone.h
+++ b/arch/mips/include/asm/mmzone.h
@@ -8,7 +8,7 @@
 
 #include <asm/page.h>
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 # include <mmzone.h>
 #endif
 
diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h
index 195ff4e9771f..96bc798c1ec1 100644
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -239,7 +239,7 @@ static inline int pfn_valid(unsigned long pfn)
 
 /* pfn_valid is defined in linux/mmzone.h */
 
-#elif defined(CONFIG_NEED_MULTIPLE_NODES)
+#elif defined(CONFIG_NUMA)
 
 #define pfn_valid(pfn)							\
 ({									\
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 97f6ca341448..19347dc6bbf8 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -394,7 +394,7 @@ void maar_init(void)
 	}
 }
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -473,7 +473,7 @@ void __init mem_init(void)
 				0x80000000 - 4, KCORE_TEXT);
 #endif
 }
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+#endif /* !CONFIG_NUMA */
 
 void free_init_pages(const char *what, unsigned long begin, unsigned long end)
 {
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 088dd2afcfe4..14b132cf95e2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -671,7 +671,7 @@ config NODES_SHIFT
 	int
 	default "8" if PPC64
 	default "4"
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 
 config USE_PERCPU_NUMA_NODE_ID
 	def_bool y
diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h
index 6cda76b57c5d..4c6c6dbd182f 100644
--- a/arch/powerpc/include/asm/mmzone.h
+++ b/arch/powerpc/include/asm/mmzone.h
@@ -18,7 +18,7 @@
  *    flags field of the struct page
  */
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 
 extern struct pglist_data *node_data[];
 /*
@@ -41,7 +41,7 @@ u64 memory_hotplug_max(void);
 
 #else
 #define memory_hotplug_max() memblock_end_of_DRAM()
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
+#endif /* CONFIG_NUMA */
 #ifdef CONFIG_FA_DUMP
 #define __HAVE_ARCH_RESERVED_KERNEL_PAGES
 #endif
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index e42b85e4f1aa..a35fbf4d0bce 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -788,7 +788,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
 					size_t align)
 {
 	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	int node = early_cpu_to_node(cpu);
 	void *ptr;
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2e05c783440a..a5209ea3859e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1047,7 +1047,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 			zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
 						GFP_KERNEL, cpu_to_node(cpu));
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 		/*
 		 * numa_node_id() works after this.
 		 */
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 56da5eb2b923..48525e8b5730 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -68,11 +68,11 @@ void machine_kexec_cleanup(struct kimage *image)
 void arch_crash_save_vmcoreinfo(void)
 {
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(node_data);
 	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
 #endif
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(contig_page_data);
 #endif
 #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index c3df3a8501d4..2ffcf540f08b 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -13,7 +13,7 @@ obj-y				:= fault.o mem.o pgtable.o mmap.o maccess.o \
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= nohash/
 obj-$(CONFIG_PPC_BOOK3S_32)	+= book3s32/
 obj-$(CONFIG_PPC_BOOK3S_64)	+= book3s64/
-obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
+obj-$(CONFIG_NUMA) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index a6b36a40897a..c5e520c6f13b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -127,7 +127,7 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 }
 #endif
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 void __init mem_topology_setup(void)
 {
 	max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
@@ -162,7 +162,7 @@ static int __init mark_nonram_nosave(void)
 
 	return 0;
 }
-#else /* CONFIG_NEED_MULTIPLE_NODES */
+#else /* CONFIG_NUMA */
 static int __init mark_nonram_nosave(void)
 {
 	return 0;
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 18ec0f9bb8d5..15f9490a7aad 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -332,7 +332,7 @@ config NODES_SHIFT
 	int "Maximum NUMA Nodes (as a power of 2)"
 	range 1 10
 	default "2"
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 	help
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b4c7c34069f8..707afbcd81c2 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -475,7 +475,7 @@ config NUMA
 
 config NODES_SHIFT
 	int
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 	default "1"
 
 config SCHED_SMT
diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h
index 6552a088dc97..7b8dead2723d 100644
--- a/arch/sh/include/asm/mmzone.h
+++ b/arch/sh/include/asm/mmzone.h
@@ -2,7 +2,7 @@
 #ifndef __ASM_SH_MMZONE_H
 #define __ASM_SH_MMZONE_H
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 #include <linux/numa.h>
 
 extern struct pglist_data *node_data[];
@@ -31,7 +31,7 @@ static inline void
 setup_bootmem_node(int nid, unsigned long start, unsigned long end)
 {
 }
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
+#endif /* CONFIG_NUMA */
 
 /* Platform specific mem init */
 void __init plat_mem_setup(void);
diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c
index 7a989eed3b18..76af6db9daa2 100644
--- a/arch/sh/kernel/topology.c
+++ b/arch/sh/kernel/topology.c
@@ -46,7 +46,7 @@ static int __init topology_init(void)
 {
 	int i, ret;
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	for_each_online_node(i)
 		register_one_node(i);
 #endif
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index d551a9cac41e..ba569cfb4368 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -120,7 +120,7 @@ config NODES_SHIFT
 	int
 	default "3" if CPU_SUBTYPE_SHX3
 	default "1"
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 
 config ARCH_FLATMEM_ENABLE
 	def_bool y
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 168d7d4dd735..ce26c7f8950a 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -211,7 +211,7 @@ void __init allocate_pgdat(unsigned int nid)
 
 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	NODE_DATA(nid) = memblock_alloc_try_nid(
 				sizeof(struct pglist_data),
 				SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 164a5254c91c..c72f52c704cd 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -265,7 +265,7 @@ config NODES_SHIFT
 	int "Maximum NUMA Nodes (as a power of 2)"
 	range 4 5 if SPARC64
 	default "5"
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 	help
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h
index 6543fb97a849..a236d8aa893a 100644
--- a/arch/sparc/include/asm/mmzone.h
+++ b/arch/sparc/include/asm/mmzone.h
@@ -2,7 +2,7 @@
 #ifndef _SPARC64_MMZONE_H
 #define _SPARC64_MMZONE_H
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 
 #include <linux/cpumask.h>
 
@@ -13,6 +13,6 @@ extern struct pglist_data *node_data[];
 extern int numa_cpu_lookup_table[];
 extern cpumask_t numa_cpumask_lookup_table[];
 
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
+#endif /* CONFIG_NUMA */
 
 #endif /* _SPARC64_MMZONE_H */
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index e38d8bf454e8..c89a5971fb0d 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1546,7 +1546,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
 					size_t align)
 {
 	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	int node = cpu_to_node(cpu);
 	void *ptr;
 
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index e454f179cf5d..06e938d03f3b 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -903,7 +903,7 @@ struct node_mem_mask {
 static struct node_mem_mask node_masks[MAX_NUMNODES];
 static int num_node_masks;
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 
 struct mdesc_mlgroup {
 	u64	node;
@@ -1059,7 +1059,7 @@ static void __init allocate_node_data(int nid)
 {
 	struct pglist_data *p;
 	unsigned long start_pfn, end_pfn;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 
 	NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data),
 					     SMP_CACHE_BYTES, nid);
@@ -1080,7 +1080,7 @@ static void __init allocate_node_data(int nid)
 
 static void init_node_masks_nonnuma(void)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	int i;
 #endif
 
@@ -1090,7 +1090,7 @@ static void init_node_masks_nonnuma(void)
 	node_masks[0].match = 0;
 	num_node_masks = 1;
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	for (i = 0; i < NR_CPUS; i++)
 		numa_cpu_lookup_table[i] = 0;
 
@@ -1098,7 +1098,7 @@ static void init_node_masks_nonnuma(void)
 #endif
 }
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 struct pglist_data *node_data[MAX_NUMNODES];
 
 EXPORT_SYMBOL(numa_cpu_lookup_table);
@@ -2487,7 +2487,7 @@ int page_in_phys_avail(unsigned long paddr)
 
 static void __init register_page_bootmem_info(void)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	int i;
 
 	for_each_online_node(i)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b44190..5d523ff70fe7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1597,7 +1597,7 @@ config NODES_SHIFT
 	default "10" if MAXSMP
 	default "6" if X86_64
 	default "3"
-	depends on NEED_MULTIPLE_NODES
+	depends on NUMA
 	help
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0941d2f44f2a..78a32b956e81 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
  */
 static bool __init pcpu_need_numa(void)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	pg_data_t *last = NULL;
 	unsigned int cpu;
 
@@ -101,7 +101,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
 					unsigned long align)
 {
 	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	int node = early_cpu_to_node(cpu);
 	void *ptr;
 
@@ -140,7 +140,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
 
 static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	if (early_cpu_to_node(from) == early_cpu_to_node(to))
 		return LOCAL_DISTANCE;
 	else
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 21ffb03f6c72..74b78840182d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -651,7 +651,7 @@ void __init find_low_pfn_range(void)
 		highmem_pfn_init();
 }
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 void __init initmem_init(void)
 {
 #ifdef CONFIG_HIGHMEM
@@ -677,7 +677,7 @@ void __init initmem_init(void)
 
 	setup_bootmem_allocator();
 }
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+#endif /* !CONFIG_NUMA */
 
 void __init setup_bootmem_allocator(void)
 {
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index 5aa8705df87e..4dbe715be65b 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -45,7 +45,7 @@
 #endif
 
 #ifndef cpumask_of_node
-  #ifdef CONFIG_NEED_MULTIPLE_NODES
+  #ifdef CONFIG_NUMA
     #define cpumask_of_node(node)	((node) == 0 ? cpu_online_mask : cpu_none_mask)
   #else
     #define cpumask_of_node(node)	((void)(node), cpu_online_mask)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5984fff3f175..552309342c38 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -50,7 +50,7 @@ struct memblock_region {
 	phys_addr_t base;
 	phys_addr_t size;
 	enum memblock_flags flags;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	int nid;
 #endif
 };
@@ -347,7 +347,7 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
 int memblock_set_node(phys_addr_t base, phys_addr_t size,
 		      struct memblock_type *type, int nid);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 static inline void memblock_set_region_node(struct memblock_region *r, int nid)
 {
 	r->nid = nid;
@@ -366,7 +366,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
 {
 	return 0;
 }
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
+#endif /* CONFIG_NUMA */
 
 /* Flags for memblock allocation APIs */
 #define MEMBLOCK_ALLOC_ANYWHERE	(~(phys_addr_t)0)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9bd21e6fad6a..07922ee1477e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -46,7 +46,7 @@ extern int sysctl_page_lock_unfairness;
 
 void init_mm_internals(void);
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
+#ifndef CONFIG_NUMA		/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
 
 static inline void set_max_mapnr(unsigned long limit)
@@ -2460,7 +2460,7 @@ extern void get_pfn_range_for_nid(unsigned int nid,
 			unsigned long *start_pfn, unsigned long *end_pfn);
 extern unsigned long find_min_pfn_with_active_regions(void);
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 static inline int early_pfn_to_nid(unsigned long pfn)
 {
 	return 0;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6f9829562af2..4bd420ed3961 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1043,17 +1043,17 @@ extern int percpu_pagelist_high_fraction;
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN	16
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 
 extern struct pglist_data contig_page_data;
 #define NODE_DATA(nid)		(&contig_page_data)
 #define NODE_MEM_MAP(nid)	mem_map
 
-#else /* CONFIG_NEED_MULTIPLE_NODES */
+#else /* CONFIG_NUMA */
 
 #include <asm/mmzone.h>
 
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+#endif /* !CONFIG_NUMA */
 
 extern struct pglist_data *first_online_pgdat(void);
 extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 684a6061a13a..0a4780c047c9 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -455,7 +455,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL(_stext);
 	VMCOREINFO_SYMBOL(vmap_area_list);
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(mem_map);
 	VMCOREINFO_SYMBOL(contig_page_data);
 #endif
diff --git a/mm/Kconfig b/mm/Kconfig
index 218b96ccc84a..bffe4bd859f3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -59,15 +59,6 @@ config FLAT_NODE_MEM_MAP
 	def_bool y
 	depends on !SPARSEMEM
 
-#
-# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
-# to represent different areas of memory.  This variable allows
-# those dependencies to exist individually.
-#
-config NEED_MULTIPLE_NODES
-	def_bool y
-	depends on NUMA
-
 #
 # SPARSEMEM_EXTREME (which is the default) does some bootmem
 # allocations when sparse_init() is called.  If this cannot
diff --git a/mm/memblock.c b/mm/memblock.c
index afaefa8fc6ab..123feef5259d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,7 +92,7 @@
  * system initialization completes.
  */
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 struct pglist_data __refdata contig_page_data;
 EXPORT_SYMBOL(contig_page_data);
 #endif
@@ -607,7 +607,7 @@ repeat:
 		 * area, insert that portion.
 		 */
 		if (rbase > base) {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 			WARN_ON(nid != memblock_get_region_node(rgn));
 #endif
 			WARN_ON(flags != rgn->flags);
@@ -1205,7 +1205,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 				      struct memblock_type *type, int nid)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 	int start_rgn, end_rgn;
 	int i, ret;
 
@@ -1849,7 +1849,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
 		size = rgn->size;
 		end = base + size - 1;
 		flags = rgn->flags;
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 		if (memblock_get_region_node(rgn) != MAX_NUMNODES)
 			snprintf(nid_buf, sizeof(nid_buf), " on node %d",
 				 memblock_get_region_node(rgn));
diff --git a/mm/memory.c b/mm/memory.c
index 3dd6b2e73e1d..48c4576df898 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -90,8 +90,7 @@
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-/* use the per-pgdat data instead for discontigmem - mbligh */
+#ifndef CONFIG_NUMA
 unsigned long max_mapnr;
 EXPORT_SYMBOL(max_mapnr);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8926f3fd3bcf..c4069f9e3968 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1634,7 +1634,7 @@ void __free_pages_core(struct page *page, unsigned int order)
 	__free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
 }
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
 
 /*
  * During memory init memblocks map pfns to nids. The search is expensive and
@@ -1684,7 +1684,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
 
 	return nid;
 }
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
+#endif /* CONFIG_NUMA */
 
 void __init memblock_free_pages(struct page *page, unsigned long pfn,
 							unsigned int order)
@@ -7438,7 +7438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
 				__func__, pgdat->node_id, (unsigned long)pgdat,
 				(unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 	/*
 	 * With no DISCONTIG, the global mem_map is just set as node 0's
 	 */
diff --git a/mm/sparse.c b/mm/sparse.c
index 55c18aff3e42..7272f7a1449d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -346,7 +346,7 @@ size_t mem_section_usage_size(void)
 
 static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
 {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
 	return __pa_symbol(pgdat);
 #else
 	return __pa(pgdat);
-- 
cgit v1.2.3


From 43b02ba93b25b1caff7a3457fc5d005485e78da5 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 28 Jun 2021 19:43:05 -0700
Subject: mm: replace CONFIG_FLAT_NODE_MEM_MAP with CONFIG_FLATMEM

After removal of the DISCONTIGMEM memory model the FLAT_NODE_MEM_MAP
configuration option is equivalent to FLATMEM.

Drop CONFIG_FLAT_NODE_MEM_MAP and use CONFIG_FLATMEM instead.

Link: https://lkml.kernel.org/r/20210608091316.3622-10-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Vineet Gupta <vgupta@synopsys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 4 ++--
 kernel/crash_core.c    | 2 +-
 mm/Kconfig             | 4 ----
 mm/page_alloc.c        | 6 +++---
 mm/page_ext.c          | 2 +-
 5 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4bd420ed3961..578588d4afc9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -788,7 +788,7 @@ typedef struct pglist_data {
 	struct zonelist node_zonelists[MAX_ZONELISTS];
 
 	int nr_zones; /* number of populated zones in this node */
-#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
+#ifdef CONFIG_FLATMEM	/* means !SPARSEMEM */
 	struct page *node_mem_map;
 #ifdef CONFIG_PAGE_EXTENSION
 	struct page_ext *node_page_ext;
@@ -878,7 +878,7 @@ typedef struct pglist_data {
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
 #define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLATMEM
 #define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
 #else
 #define pgdat_page_nr(pgdat, pagenr)	pfn_to_page((pgdat)->node_start_pfn + (pagenr))
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 0a4780c047c9..da449c1cdca7 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -484,7 +484,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(page, compound_head);
 	VMCOREINFO_OFFSET(pglist_data, node_zones);
 	VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLATMEM
 	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
 #endif
 	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
diff --git a/mm/Kconfig b/mm/Kconfig
index bffe4bd859f3..ded98fb859ab 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -55,10 +55,6 @@ config FLATMEM
 	def_bool y
 	depends on !SPARSEMEM || FLATMEM_MANUAL
 
-config FLAT_NODE_MEM_MAP
-	def_bool y
-	depends on !SPARSEMEM
-
 #
 # SPARSEMEM_EXTREME (which is the default) does some bootmem
 # allocations when sparse_init() is called.  If this cannot
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c4069f9e3968..0e441f1677f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6547,7 +6547,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	}
 }
 
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+#if !defined(CONFIG_FLATMEM)
 /*
  * Only struct pages that correspond to ranges defined by memblock.memory
  * are zeroed and initialized by going through __init_single_page() during
@@ -7403,7 +7403,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 	}
 }
 
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLATMEM
 static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 {
 	unsigned long __maybe_unused start = 0;
@@ -7451,7 +7451,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 }
 #else
 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+#endif /* CONFIG_FLATMEM */
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
diff --git a/mm/page_ext.c b/mm/page_ext.c
index df6f74aac8e1..293b2685fc48 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -191,7 +191,7 @@ fail:
 	panic("Out of memory");
 }
 
-#else /* CONFIG_FLAT_NODE_MEM_MAP */
+#else /* CONFIG_FLATMEM */
 
 struct page_ext *lookup_page_ext(const struct page *page)
 {
-- 
cgit v1.2.3


From 44042b4498728f4376e84bae1ac8016d146d850b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 28 Jun 2021 19:43:08 -0700
Subject: mm/page_alloc: allow high-order pages to be stored on the per-cpu
 lists

The per-cpu page allocator (PCP) only stores order-0 pages.  This means
that all THP and "cheap" high-order allocations including SLUB contends on
the zone->lock.  This patch extends the PCP allocator to store THP and
"cheap" high-order pages.  Note that struct per_cpu_pages increases in
size to 256 bytes (4 cache lines) on x86-64.

Note that this is not necessarily a universal performance win because of
how it is implemented.  High-order pages can cause pcp->high to be
exceeded prematurely for lower-orders so for example, a large number of
THP pages being freed could release order-0 pages from the PCP lists.
Hence, much depends on the allocation/free pattern as observed by a single
CPU to determine if caching helps or hurts a particular workload.

That said, basic performance testing passed.  The following is a netperf
UDP_STREAM test which hits the relevant patches as some of the network
allocations are high-order.

netperf-udp
                                 5.13.0-rc2             5.13.0-rc2
                           mm-pcpburst-v3r4   mm-pcphighorder-v1r7
Hmean     send-64         261.46 (   0.00%)      266.30 *   1.85%*
Hmean     send-128        516.35 (   0.00%)      536.78 *   3.96%*
Hmean     send-256       1014.13 (   0.00%)     1034.63 *   2.02%*
Hmean     send-1024      3907.65 (   0.00%)     4046.11 *   3.54%*
Hmean     send-2048      7492.93 (   0.00%)     7754.85 *   3.50%*
Hmean     send-3312     11410.04 (   0.00%)    11772.32 *   3.18%*
Hmean     send-4096     13521.95 (   0.00%)    13912.34 *   2.89%*
Hmean     send-8192     21660.50 (   0.00%)    22730.72 *   4.94%*
Hmean     send-16384    31902.32 (   0.00%)    32637.50 *   2.30%*

Functionally, a patch like this is necessary to make bulk allocation of
high-order pages work with similar performance to order-0 bulk
allocations.  The bulk allocator is not updated in this series as it would
have to be determined by bulk allocation users how they want to track the
order of pages allocated with the bulk allocator.

Link: https://lkml.kernel.org/r/20210611135753.GC30378@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  20 +++++-
 mm/internal.h          |   2 +-
 mm/page_alloc.c        | 169 +++++++++++++++++++++++++++++++++++--------------
 mm/swap.c              |   2 +-
 4 files changed, 144 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 578588d4afc9..265a32e1ff74 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -333,6 +333,24 @@ enum zone_watermarks {
 	NR_WMARK
 };
 
+/*
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional
+ * for pageblock size for THP if configured.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define NR_PCP_THP 1
+#else
+#define NR_PCP_THP 0
+#endif
+#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))
+
+/*
+ * Shift to encode migratetype and order in the same integer, with order
+ * in the least significant bits.
+ */
+#define NR_PCP_ORDER_WIDTH 8
+#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1)
+
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
 #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
@@ -349,7 +367,7 @@ struct per_cpu_pages {
 #endif
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
-	struct list_head lists[MIGRATE_PCPTYPES];
+	struct list_head lists[NR_PCP_LISTS];
 };
 
 struct per_cpu_zonestat {
diff --git a/mm/internal.h b/mm/internal.h
index 18e5fb4d225f..6ec2cea9926b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -203,7 +203,7 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
 
-extern void free_unref_page(struct page *page);
+extern void free_unref_page(struct page *page, unsigned int order);
 extern void free_unref_page_list(struct list_head *list);
 
 extern void zone_pcp_update(struct zone *zone, int cpu_online);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0e441f1677f3..34f097ecfe08 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -674,10 +674,53 @@ out:
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 }
 
+static inline unsigned int order_to_pindex(int migratetype, int order)
+{
+	int base = order;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (order > PAGE_ALLOC_COSTLY_ORDER) {
+		VM_BUG_ON(order != pageblock_order);
+		base = PAGE_ALLOC_COSTLY_ORDER + 1;
+	}
+#else
+	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+	return (MIGRATE_PCPTYPES * base) + migratetype;
+}
+
+static inline int pindex_to_order(unsigned int pindex)
+{
+	int order = pindex / MIGRATE_PCPTYPES;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (order > PAGE_ALLOC_COSTLY_ORDER) {
+		order = pageblock_order;
+		VM_BUG_ON(order != pageblock_order);
+	}
+#else
+	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+	return order;
+}
+
+static inline bool pcp_allowed_order(unsigned int order)
+{
+	if (order <= PAGE_ALLOC_COSTLY_ORDER)
+		return true;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (order == pageblock_order)
+		return true;
+#endif
+	return false;
+}
+
 static inline void free_the_page(struct page *page, unsigned int order)
 {
-	if (order == 0)		/* Via pcp? */
-		free_unref_page(page);
+	if (pcp_allowed_order(order))		/* Via pcp? */
+		free_unref_page(page, order);
 	else
 		__free_pages_ok(page, order, FPI_NONE);
 }
@@ -700,7 +743,7 @@ static inline void free_the_page(struct page *page, unsigned int order)
 void free_compound_page(struct page *page)
 {
 	mem_cgroup_uncharge(page);
-	__free_pages_ok(page, compound_order(page), FPI_NONE);
+	free_the_page(page, compound_order(page));
 }
 
 void prep_compound_page(struct page *page, unsigned int order)
@@ -1350,9 +1393,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
  * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
  * moved from pcp lists to free lists.
  */
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
 {
-	return free_pages_prepare(page, 0, true, FPI_NONE);
+	return free_pages_prepare(page, order, true, FPI_NONE);
 }
 
 static bool bulkfree_pcp_prepare(struct page *page)
@@ -1369,12 +1412,12 @@ static bool bulkfree_pcp_prepare(struct page *page)
  * debug_pagealloc enabled, they are checked also immediately when being freed
  * to the pcp lists.
  */
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
 {
 	if (debug_pagealloc_enabled_static())
-		return free_pages_prepare(page, 0, true, FPI_NONE);
+		return free_pages_prepare(page, order, true, FPI_NONE);
 	else
-		return free_pages_prepare(page, 0, false, FPI_NONE);
+		return free_pages_prepare(page, order, false, FPI_NONE);
 }
 
 static bool bulkfree_pcp_prepare(struct page *page)
@@ -1406,8 +1449,10 @@ static inline void prefetch_buddy(struct page *page)
 static void free_pcppages_bulk(struct zone *zone, int count,
 					struct per_cpu_pages *pcp)
 {
-	int migratetype = 0;
+	int pindex = 0;
 	int batch_free = 0;
+	int nr_freed = 0;
+	unsigned int order;
 	int prefetch_nr = READ_ONCE(pcp->batch);
 	bool isolated_pageblocks;
 	struct page *page, *tmp;
@@ -1418,7 +1463,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	 * below while (list_empty(list)) loop.
 	 */
 	count = min(pcp->count, count);
-	while (count) {
+	while (count > 0) {
 		struct list_head *list;
 
 		/*
@@ -1430,24 +1475,31 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 		 */
 		do {
 			batch_free++;
-			if (++migratetype == MIGRATE_PCPTYPES)
-				migratetype = 0;
-			list = &pcp->lists[migratetype];
+			if (++pindex == NR_PCP_LISTS)
+				pindex = 0;
+			list = &pcp->lists[pindex];
 		} while (list_empty(list));
 
 		/* This is the only non-empty list. Free them all. */
-		if (batch_free == MIGRATE_PCPTYPES)
+		if (batch_free == NR_PCP_LISTS)
 			batch_free = count;
 
+		order = pindex_to_order(pindex);
+		BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
 		do {
 			page = list_last_entry(list, struct page, lru);
 			/* must delete to avoid corrupting pcp list */
 			list_del(&page->lru);
-			pcp->count--;
+			nr_freed += 1 << order;
+			count -= 1 << order;
 
 			if (bulkfree_pcp_prepare(page))
 				continue;
 
+			/* Encode order with the migratetype */
+			page->index <<= NR_PCP_ORDER_WIDTH;
+			page->index |= order;
+
 			list_add_tail(&page->lru, &head);
 
 			/*
@@ -1463,8 +1515,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 				prefetch_buddy(page);
 				prefetch_nr--;
 			}
-		} while (--count && --batch_free && !list_empty(list));
+		} while (count > 0 && --batch_free && !list_empty(list));
 	}
+	pcp->count -= nr_freed;
 
 	/*
 	 * local_lock_irq held so equivalent to spin_lock_irqsave for
@@ -1479,14 +1532,19 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	 */
 	list_for_each_entry_safe(page, tmp, &head, lru) {
 		int mt = get_pcppage_migratetype(page);
+
+		/* mt has been encoded with the order (see above) */
+		order = mt & NR_PCP_ORDER_MASK;
+		mt >>= NR_PCP_ORDER_WIDTH;
+
 		/* MIGRATE_ISOLATE page should not go to pcplists */
 		VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
 		/* Pageblock could have been isolated meanwhile */
 		if (unlikely(isolated_pageblocks))
 			mt = get_pageblock_migratetype(page);
 
-		__free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
-		trace_mm_page_pcpu_drain(page, 0, mt);
+		__free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+		trace_mm_page_pcpu_drain(page, order, mt);
 	}
 	spin_unlock(&zone->lock);
 }
@@ -3263,11 +3321,12 @@ void mark_free_pages(struct zone *zone)
 }
 #endif /* CONFIG_PM */
 
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
+							unsigned int order)
 {
 	int migratetype;
 
-	if (!free_pcp_prepare(page))
+	if (!free_pcp_prepare(page, order))
 		return false;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
@@ -3317,16 +3376,18 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
 }
 
 static void free_unref_page_commit(struct page *page, unsigned long pfn,
-				   int migratetype)
+				   int migratetype, unsigned int order)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	int high;
+	int pindex;
 
 	__count_vm_event(PGFREE);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
-	list_add(&page->lru, &pcp->lists[migratetype]);
-	pcp->count++;
+	pindex = order_to_pindex(migratetype, order);
+	list_add(&page->lru, &pcp->lists[pindex]);
+	pcp->count += 1 << order;
 	high = nr_pcp_high(pcp, zone);
 	if (pcp->count >= high) {
 		int batch = READ_ONCE(pcp->batch);
@@ -3336,15 +3397,15 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
 }
 
 /*
- * Free a 0-order page
+ * Free a pcp page
  */
-void free_unref_page(struct page *page)
+void free_unref_page(struct page *page, unsigned int order)
 {
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
-	if (!free_unref_page_prepare(page, pfn))
+	if (!free_unref_page_prepare(page, pfn, order))
 		return;
 
 	/*
@@ -3357,14 +3418,14 @@ void free_unref_page(struct page *page)
 	migratetype = get_pcppage_migratetype(page);
 	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
 		if (unlikely(is_migrate_isolate(migratetype))) {
-			free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+			free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
 			return;
 		}
 		migratetype = MIGRATE_MOVABLE;
 	}
 
 	local_lock_irqsave(&pagesets.lock, flags);
-	free_unref_page_commit(page, pfn, migratetype);
+	free_unref_page_commit(page, pfn, migratetype, order);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3381,7 +3442,7 @@ void free_unref_page_list(struct list_head *list)
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
 		pfn = page_to_pfn(page);
-		if (!free_unref_page_prepare(page, pfn))
+		if (!free_unref_page_prepare(page, pfn, 0))
 			list_del(&page->lru);
 
 		/*
@@ -3413,7 +3474,7 @@ void free_unref_page_list(struct list_head *list)
 		set_page_private(page, 0);
 		migratetype = get_pcppage_migratetype(page);
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, pfn, migratetype);
+		free_unref_page_commit(page, pfn, migratetype, 0);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
@@ -3549,7 +3610,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
 
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
-struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
+			int migratetype,
 			unsigned int alloc_flags,
 			struct per_cpu_pages *pcp,
 			struct list_head *list)
@@ -3558,16 +3620,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 
 	do {
 		if (list_empty(list)) {
-			pcp->count += rmqueue_bulk(zone, 0,
-					READ_ONCE(pcp->batch), list,
+			int batch = READ_ONCE(pcp->batch);
+			int alloced;
+
+			/*
+			 * Scale batch relative to order if batch implies
+			 * free pages can be stored on the PCP. Batch can
+			 * be 1 for small zones or for boot pagesets which
+			 * should never store free pages as the pages may
+			 * belong to arbitrary zones.
+			 */
+			if (batch > 1)
+				batch = max(batch >> order, 2);
+			alloced = rmqueue_bulk(zone, order,
+					batch, list,
 					migratetype, alloc_flags);
+
+			pcp->count += alloced << order;
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
 
 		page = list_first_entry(list, struct page, lru);
 		list_del(&page->lru);
-		pcp->count--;
+		pcp->count -= 1 << order;
 	} while (check_new_pcp(page));
 
 	return page;
@@ -3575,8 +3651,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 
 /* Lock and remove page from the per-cpu list */
 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
-			struct zone *zone, gfp_t gfp_flags,
-			int migratetype, unsigned int alloc_flags)
+			struct zone *zone, unsigned int order,
+			gfp_t gfp_flags, int migratetype,
+			unsigned int alloc_flags)
 {
 	struct per_cpu_pages *pcp;
 	struct list_head *list;
@@ -3592,8 +3669,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	 */
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp->free_factor >>= 1;
-	list = &pcp->lists[migratetype];
-	page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
+	list = &pcp->lists[order_to_pindex(migratetype, order)];
+	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3614,15 +3691,15 @@ struct page *rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 
-	if (likely(order == 0)) {
+	if (likely(pcp_allowed_order(order))) {
 		/*
 		 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
 		 * we need to skip it when CMA area isn't allowed.
 		 */
 		if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
 				migratetype != MIGRATE_MOVABLE) {
-			page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
-					migratetype, alloc_flags);
+			page = rmqueue_pcplist(preferred_zone, zone, order,
+					gfp_flags, migratetype, alloc_flags);
 			goto out;
 		}
 	}
@@ -5201,7 +5278,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	/* Attempt the batch allocation */
 	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
-	pcp_list = &pcp->lists[ac.migratetype];
+	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
 
 	while (nr_populated < nr_pages) {
 
@@ -5211,7 +5288,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 			continue;
 		}
 
-		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+		page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
 								pcp, pcp_list);
 		if (unlikely(!page)) {
 			/* Try and get at least one page */
@@ -6778,13 +6855,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
 
 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
 {
-	int migratetype;
+	int pindex;
 
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
 
-	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
-		INIT_LIST_HEAD(&pcp->lists[migratetype]);
+	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
+		INIT_LIST_HEAD(&pcp->lists[pindex]);
 
 	/*
 	 * Set batch and high values safe for a boot pageset. A true percpu
diff --git a/mm/swap.c b/mm/swap.c
index 18cc9e63515b..6c11db780467 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -95,7 +95,7 @@ static void __put_single_page(struct page *page)
 {
 	__page_cache_release(page);
 	mem_cgroup_uncharge(page);
-	free_unref_page(page);
+	free_unref_page(page, 0);
 }
 
 static void __put_compound_page(struct page *page)
-- 
cgit v1.2.3


From a3f5d80ea401ac857f2910e28b15f35b2cf902f4 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Mon, 28 Jun 2021 19:43:14 -0700
Subject: mm,hwpoison: send SIGBUS with error virutal address

Now an action required MCE in already hwpoisoned address surely sends a
SIGBUS to current process, but the SIGBUS doesn't convey error virtual
address.  That's not optimal for hwpoison-aware applications.

To fix the issue, make memory_failure() call kill_accessing_process(),
that does pagetable walk to find the error virtual address.  It could find
multiple virtual addresses for the same error page, and it seems hard to
tell which virtual address is correct one.  But that's rare and sending
incorrect virtual address could be better than no address.  So let's
report the first found virtual address for now.

[naoya.horiguchi@nec.com: fix walk_page_range() return]
  Link: https://lkml.kernel.org/r/20210603051055.GA244241@hori.linux.bs1.fc.nec.co.jp

Link: https://lkml.kernel.org/r/20210521030156.2612074-4-nao.horiguchi@gmail.com
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Aili Yao <yaoaili@kingsoft.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Jue Wang <juew@google.com>
Cc: Borislav Petkov <bp@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mce/core.c |  13 +++-
 include/linux/swapops.h        |   5 ++
 mm/memory-failure.c            | 150 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 165 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index bf7fe87a7e88..22791aadc085 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1257,19 +1257,28 @@ static void kill_me_maybe(struct callback_head *cb)
 {
 	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
 	int flags = MF_ACTION_REQUIRED;
+	int ret;
 
 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
 
 	if (!p->mce_ripv)
 		flags |= MF_MUST_KILL;
 
-	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
-	    !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+	ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+	if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
 		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
 		sync_core();
 		return;
 	}
 
+	/*
+	 * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+	 * to the current process with the proper error info, so no need to
+	 * send SIGBUS here again.
+	 */
+	if (ret == -EHWPOISON)
+		return;
+
 	if (p->mce_vaddr != (void __user *)-1l) {
 		force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
 	} else {
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 6430a94c6981..5907205c712c 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -330,6 +330,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
 	return swp_type(entry) == SWP_HWPOISON;
 }
 
+static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
+{
+	return swp_offset(entry);
+}
+
 static inline void num_poisoned_pages_inc(void)
 {
 	atomic_long_inc(&num_poisoned_pages);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6f5f78885ab4..4d151ce3e50d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include <linux/page-isolation.h>
+#include <linux/pagewalk.h>
 #include "internal.h"
 #include "ras/ras_event.h"
 
@@ -554,6 +555,148 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 		collect_procs_file(page, tokill, force_early);
 }
 
+struct hwp_walk {
+	struct to_kill tk;
+	unsigned long pfn;
+	int flags;
+};
+
+static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
+{
+	tk->addr = addr;
+	tk->size_shift = shift;
+}
+
+static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
+				unsigned long poisoned_pfn, struct to_kill *tk)
+{
+	unsigned long pfn = 0;
+
+	if (pte_present(pte)) {
+		pfn = pte_pfn(pte);
+	} else {
+		swp_entry_t swp = pte_to_swp_entry(pte);
+
+		if (is_hwpoison_entry(swp))
+			pfn = hwpoison_entry_to_pfn(swp);
+	}
+
+	if (!pfn || pfn != poisoned_pfn)
+		return 0;
+
+	set_to_kill(tk, addr, shift);
+	return 1;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+				      struct hwp_walk *hwp)
+{
+	pmd_t pmd = *pmdp;
+	unsigned long pfn;
+	unsigned long hwpoison_vaddr;
+
+	if (!pmd_present(pmd))
+		return 0;
+	pfn = pmd_pfn(pmd);
+	if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
+		hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
+		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
+		return 1;
+	}
+	return 0;
+}
+#else
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+				      struct hwp_walk *hwp)
+{
+	return 0;
+}
+#endif
+
+static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
+			      unsigned long end, struct mm_walk *walk)
+{
+	struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+	int ret = 0;
+	pte_t *ptep;
+	spinlock_t *ptl;
+
+	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
+	if (ptl) {
+		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
+		spin_unlock(ptl);
+		goto out;
+	}
+
+	if (pmd_trans_unstable(pmdp))
+		goto out;
+
+	ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
+	for (; addr != end; ptep++, addr += PAGE_SIZE) {
+		ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
+					     hwp->pfn, &hwp->tk);
+		if (ret == 1)
+			break;
+	}
+	pte_unmap_unlock(ptep - 1, ptl);
+out:
+	cond_resched();
+	return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
+			    unsigned long addr, unsigned long end,
+			    struct mm_walk *walk)
+{
+	struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+	pte_t pte = huge_ptep_get(ptep);
+	struct hstate *h = hstate_vma(walk->vma);
+
+	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+				      hwp->pfn, &hwp->tk);
+}
+#else
+#define hwpoison_hugetlb_range	NULL
+#endif
+
+static struct mm_walk_ops hwp_walk_ops = {
+	.pmd_entry = hwpoison_pte_range,
+	.hugetlb_entry = hwpoison_hugetlb_range,
+};
+
+/*
+ * Sends SIGBUS to the current process with error info.
+ *
+ * This function is intended to handle "Action Required" MCEs on already
+ * hardware poisoned pages. They could happen, for example, when
+ * memory_failure() failed to unmap the error page at the first call, or
+ * when multiple local machine checks happened on different CPUs.
+ *
+ * MCE handler currently has no easy access to the error virtual address,
+ * so this function walks page table to find it. The returned virtual address
+ * is proper in most cases, but it could be wrong when the application
+ * process has multiple entries mapping the error page.
+ */
+static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
+				  int flags)
+{
+	int ret;
+	struct hwp_walk priv = {
+		.pfn = pfn,
+	};
+	priv.tk.tsk = p;
+
+	mmap_read_lock(p->mm);
+	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
+			      (void *)&priv);
+	if (ret == 1 && priv.tk.addr)
+		kill_proc(&priv.tk, pfn, flags);
+	mmap_read_unlock(p->mm);
+	return ret ? -EFAULT : -EHWPOISON;
+}
+
 static const char *action_name[] = {
 	[MF_IGNORED] = "Ignored",
 	[MF_FAILED] = "Failed",
@@ -1267,7 +1410,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 	if (TestSetPageHWPoison(head)) {
 		pr_err("Memory failure: %#lx: already hardware poisoned\n",
 		       pfn);
-		return -EHWPOISON;
+		res = -EHWPOISON;
+		if (flags & MF_ACTION_REQUIRED)
+			res = kill_accessing_process(current, page_to_pfn(head), flags);
+		return res;
 	}
 
 	num_poisoned_pages_inc();
@@ -1476,6 +1622,8 @@ try_again:
 		pr_err("Memory failure: %#lx: already hardware poisoned\n",
 			pfn);
 		res = -EHWPOISON;
+		if (flags & MF_ACTION_REQUIRED)
+			res = kill_accessing_process(current, pfn, flags);
 		goto unlock_mutex;
 	}
 
-- 
cgit v1.2.3


From e3ae2365efc14269170a6326477e669332271ab3 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Sun, 27 Jun 2021 18:48:21 -0400
Subject: net: sock: introduce sk_error_report

This patch introduces a function wrapper to call the sk_error_report
callback. That will prepare to add additional handling whenever
sk_error_report is called, for example to trace socket errors.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/chelsio/inline_crypto/chtls/chtls_cm.c  |  2 +-
 drivers/vhost/vsock.c                                |  2 +-
 include/linux/skmsg.h                                |  2 +-
 include/net/sock.h                                   |  2 ++
 include/net/tls.h                                    |  2 +-
 net/caif/caif_socket.c                               |  2 +-
 net/can/bcm.c                                        |  4 ++--
 net/can/isotp.c                                      | 20 ++++++++++----------
 net/can/j1939/socket.c                               |  4 ++--
 net/can/raw.c                                        |  6 +++---
 net/core/skbuff.c                                    |  6 +++---
 net/core/sock.c                                      |  6 ++++++
 net/dccp/ipv4.c                                      |  4 ++--
 net/dccp/ipv6.c                                      |  4 ++--
 net/dccp/proto.c                                     |  2 +-
 net/dccp/timer.c                                     |  2 +-
 net/ipv4/ping.c                                      |  2 +-
 net/ipv4/raw.c                                       |  4 ++--
 net/ipv4/tcp.c                                       |  4 ++--
 net/ipv4/tcp_input.c                                 |  2 +-
 net/ipv4/tcp_ipv4.c                                  |  4 ++--
 net/ipv4/tcp_timer.c                                 |  2 +-
 net/ipv4/udp.c                                       |  4 ++--
 net/ipv6/raw.c                                       |  2 +-
 net/ipv6/tcp_ipv6.c                                  |  4 ++--
 net/ipv6/udp.c                                       |  2 +-
 net/kcm/kcmsock.c                                    |  2 +-
 net/mptcp/subflow.c                                  |  2 +-
 net/netlink/af_netlink.c                             |  8 ++++----
 net/nfc/rawsock.c                                    |  2 +-
 net/packet/af_packet.c                               |  4 ++--
 net/qrtr/qrtr.c                                      |  2 +-
 net/sctp/input.c                                     |  2 +-
 net/sctp/ipv6.c                                      |  2 +-
 net/smc/af_smc.c                                     |  2 +-
 net/strparser/strparser.c                            |  2 +-
 net/unix/af_unix.c                                   |  2 +-
 net/vmw_vsock/af_vsock.c                             |  2 +-
 net/vmw_vsock/virtio_transport.c                     |  2 +-
 net/vmw_vsock/virtio_transport_common.c              |  2 +-
 net/vmw_vsock/vmci_transport.c                       |  4 ++--
 net/xdp/xsk.c                                        |  2 +-
 42 files changed, 75 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
index 19dc7dc054a2..bcad69c48074 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
@@ -2134,7 +2134,7 @@ static void chtls_abort_req_rss(struct sock *sk, struct sk_buff *skb)
 		sk->sk_err = ETIMEDOUT;
 
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 
 		if (sk->sk_state == TCP_SYN_RECV && !abort_syn_rcv(sk, skb))
 			return;
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 119f08491d3c..d38c996b4f46 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -734,7 +734,7 @@ static void vhost_vsock_reset_orphans(struct sock *sk)
 	vsk->peer_shutdown = SHUTDOWN_MASK;
 	sk->sk_state = SS_UNCONNECTED;
 	sk->sk_err = ECONNRESET;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 }
 
 static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index fcaa9a7996c8..31866031e370 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -347,7 +347,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 	struct sock *sk = psock->sk;
 
 	sk->sk_err = err;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
diff --git a/include/net/sock.h b/include/net/sock.h
index ced2fc965ec7..8bdd80027ffb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2281,6 +2281,8 @@ static inline int sock_error(struct sock *sk)
 	return -err;
 }
 
+void sk_error_report(struct sock *sk);
+
 static inline unsigned long sock_wspace(struct sock *sk)
 {
 	int amt = 0;
diff --git a/include/net/tls.h b/include/net/tls.h
index 8d398a5de3ee..be4b3e1cac46 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -469,7 +469,7 @@ static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk)
 static inline void tls_err_abort(struct sock *sk, int err)
 {
 	sk->sk_err = err;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 }
 
 static inline bool tls_bigint_increment(unsigned char *seq, int len)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 3ad0a1df6712..647554c9813b 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -243,7 +243,7 @@ static void caif_ctrl_cb(struct cflayer *layr,
 		cf_sk->sk.sk_shutdown = SHUTDOWN_MASK;
 		cf_sk->sk.sk_err = ECONNRESET;
 		set_rx_flow_on(cf_sk);
-		cf_sk->sk.sk_error_report(&cf_sk->sk);
+		sk_error_report(&cf_sk->sk);
 		break;
 
 	default:
diff --git a/net/can/bcm.c b/net/can/bcm.c
index f3e4d9528fa3..e15a7dbe5f6c 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1417,7 +1417,7 @@ static void bcm_notify(struct bcm_sock *bo, unsigned long msg,
 		if (notify_enodev) {
 			sk->sk_err = ENODEV;
 			if (!sock_flag(sk, SOCK_DEAD))
-				sk->sk_error_report(sk);
+				sk_error_report(sk);
 		}
 		break;
 
@@ -1425,7 +1425,7 @@ static void bcm_notify(struct bcm_sock *bo, unsigned long msg,
 		if (bo->bound && bo->ifindex == dev->ifindex) {
 			sk->sk_err = ENETDOWN;
 			if (!sock_flag(sk, SOCK_DEAD))
-				sk->sk_error_report(sk);
+				sk_error_report(sk);
 		}
 	}
 }
diff --git a/net/can/isotp.c b/net/can/isotp.c
index bd49299319a1..9fd274cf166b 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -168,7 +168,7 @@ static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer)
 		/* report 'connection timed out' */
 		sk->sk_err = ETIMEDOUT;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 
 		/* reset rx state */
 		so->rx.state = ISOTP_IDLE;
@@ -339,7 +339,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
 		/* malformed PDU - report 'not a data message' */
 		sk->sk_err = EBADMSG;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 
 		so->tx.state = ISOTP_IDLE;
 		wake_up_interruptible(&so->wait);
@@ -392,7 +392,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
 		/* overflow on receiver side - report 'message too long' */
 		sk->sk_err = EMSGSIZE;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 		fallthrough;
 
 	default:
@@ -420,7 +420,7 @@ static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen,
 		/* malformed PDU - report 'not a data message' */
 		sk->sk_err = EBADMSG;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 		return 1;
 	}
 
@@ -535,7 +535,7 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae,
 		/* wrong sn detected - report 'illegal byte sequence' */
 		sk->sk_err = EILSEQ;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 
 		/* reset rx state */
 		so->rx.state = ISOTP_IDLE;
@@ -559,7 +559,7 @@ static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae,
 			/* malformed PDU - report 'not a data message' */
 			sk->sk_err = EBADMSG;
 			if (!sock_flag(sk, SOCK_DEAD))
-				sk->sk_error_report(sk);
+				sk_error_report(sk);
 			return 1;
 		}
 
@@ -758,7 +758,7 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer)
 		/* report 'communication error on send' */
 		sk->sk_err = ECOMM;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 
 		/* reset tx state */
 		so->tx.state = ISOTP_IDLE;
@@ -1157,7 +1157,7 @@ out:
 	if (notify_enetdown) {
 		sk->sk_err = ENETDOWN;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 	}
 
 	return err;
@@ -1356,13 +1356,13 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg,
 
 		sk->sk_err = ENODEV;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 		break;
 
 	case NETDEV_DOWN:
 		sk->sk_err = ENETDOWN;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 		break;
 	}
 }
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 56aa66147d5a..bf18a32dc6ae 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -1009,7 +1009,7 @@ void j1939_sk_send_loop_abort(struct sock *sk, int err)
 {
 	sk->sk_err = err;
 
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 }
 
 static int j1939_sk_send_loop(struct j1939_priv *priv,  struct sock *sk,
@@ -1189,7 +1189,7 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)
 	list_for_each_entry(jsk, &priv->j1939_socks, list) {
 		jsk->sk.sk_err = error_code;
 		if (!sock_flag(&jsk->sk, SOCK_DEAD))
-			jsk->sk.sk_error_report(&jsk->sk);
+			sk_error_report(&jsk->sk);
 
 		j1939_sk_queue_drop_all(priv, jsk, error_code);
 	}
diff --git a/net/can/raw.c b/net/can/raw.c
index ac96fc210025..ed4fcb7ab0c3 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -295,13 +295,13 @@ static void raw_notify(struct raw_sock *ro, unsigned long msg,
 
 		sk->sk_err = ENODEV;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 		break;
 
 	case NETDEV_DOWN:
 		sk->sk_err = ENETDOWN;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 		break;
 	}
 }
@@ -488,7 +488,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
 	if (notify_enetdown) {
 		sk->sk_err = ENETDOWN;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 	}
 
 	return err;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2531ac4ffa69..12aabcda6db2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1294,7 +1294,7 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg)
 	}
 	spin_unlock_irqrestore(&q->lock, flags);
 
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 
 release:
 	consume_skb(skb);
@@ -4685,7 +4685,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
 
 	skb_queue_tail(&sk->sk_error_queue, skb);
 	if (!sock_flag(sk, SOCK_DEAD))
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	return 0;
 }
 EXPORT_SYMBOL(sock_queue_err_skb);
@@ -4716,7 +4716,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
 		sk->sk_err = 0;
 
 	if (skb_next)
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 
 	return skb;
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index a2337b37eba6..c30f8f4cbb22 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -331,6 +331,12 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(__sk_backlog_rcv);
 
+void sk_error_report(struct sock *sk)
+{
+	sk->sk_error_report(sk);
+}
+EXPORT_SYMBOL(sk_error_report);
+
 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 {
 	struct __kernel_sock_timeval tv;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index f81c1df761d3..0ea29270d7e5 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -329,7 +329,7 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info)
 			__DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
 			sk->sk_err = err;
 
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 
 			dccp_done(sk);
 		} else
@@ -356,7 +356,7 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info)
 	inet = inet_sk(sk);
 	if (!sock_owned_by_user(sk) && inet->recverr) {
 		sk->sk_err = err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	} else /* Only an error on timeout */
 		sk->sk_err_soft = err;
 out:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6f5304db5a67..fa663518fa0e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -172,7 +172,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			 * Wake people up to see the error
 			 * (see connect in sock.c)
 			 */
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 			dccp_done(sk);
 		} else
 			sk->sk_err_soft = err;
@@ -181,7 +181,7 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	if (!sock_owned_by_user(sk) && np->recverr) {
 		sk->sk_err = err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	} else
 		sk->sk_err_soft = err;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 6d705d90c614..7eb0fb231940 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -302,7 +302,7 @@ int dccp_disconnect(struct sock *sk, int flags)
 
 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 	return 0;
 }
 
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index db768f223ef7..27a3b37acd2e 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -20,7 +20,7 @@ int  sysctl_dccp_retries2		__read_mostly = TCP_RETR2;
 static void dccp_write_err(struct sock *sk)
 {
 	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 
 	dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
 	dccp_done(sk);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 95a718397fd1..1e44a43acfe2 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -573,7 +573,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
 		}
 	}
 	sk->sk_err = err;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 out:
 	sock_put(sk);
 }
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 50a73178d63a..bb446e60cf58 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -280,7 +280,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
 
 	if (inet->recverr || harderr) {
 		sk->sk_err = err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	}
 }
 
@@ -929,7 +929,7 @@ int raw_abort(struct sock *sk, int err)
 	lock_sock(sk);
 
 	sk->sk_err = err;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 	__udp_disconnect(sk, 0);
 
 	release_sock(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0e3f0e0e5b51..a0a96eb826c4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3059,7 +3059,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 		sk->sk_frag.offset = 0;
 	}
 
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 	return 0;
 }
 EXPORT_SYMBOL(tcp_disconnect);
@@ -4448,7 +4448,7 @@ int tcp_abort(struct sock *sk, int err)
 		sk->sk_err = err;
 		/* This barrier is coupled with smp_rmb() in tcp_poll() */
 		smp_wmb();
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 		if (tcp_need_reset(sk->sk_state))
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 		tcp_done(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7d5e59f688de..e6ca5a1f3b59 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4270,7 +4270,7 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
 	tcp_done(sk);
 
 	if (!sock_flag(sk, SOCK_DEAD))
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 }
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6cb8e269f1ab..e66ad6bfe808 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -585,7 +585,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 		if (!sock_owned_by_user(sk)) {
 			sk->sk_err = err;
 
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 
 			tcp_done(sk);
 		} else {
@@ -613,7 +613,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 	inet = inet_sk(sk);
 	if (!sock_owned_by_user(sk) && inet->recverr) {
 		sk->sk_err = err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	} else	{ /* Only an error on timeout */
 		sk->sk_err_soft = err;
 	}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 56b9d648f054..20cf4a98c69d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -68,7 +68,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
 static void tcp_write_err(struct sock *sk)
 {
 	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 
 	tcp_write_queue_purge(sk);
 	tcp_done(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1307ad0d3b9e..f86ccbf7c135 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -776,7 +776,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 		ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
 
 	sk->sk_err = err;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 out:
 	return 0;
 }
@@ -2867,7 +2867,7 @@ int udp_abort(struct sock *sk, int err)
 		goto out;
 
 	sk->sk_err = err;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 	__udp_disconnect(sk, 0);
 
 out:
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index bf3646b57c68..60f1e4f5be5a 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -354,7 +354,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
 
 	if (np->recverr || harderr) {
 		sk->sk_err = err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	}
 }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4d71464094b3..578ab6305c3f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -467,7 +467,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 		if (!sock_owned_by_user(sk)) {
 			sk->sk_err = err;
-			sk->sk_error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
+			sk_error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
 
 			tcp_done(sk);
 		} else
@@ -486,7 +486,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	if (!sock_owned_by_user(sk) && np->recverr) {
 		sk->sk_err = err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	} else
 		sk->sk_err_soft = err;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3fcd86f4dfdc..368972dbd919 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -610,7 +610,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	sk->sk_err = err;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 out:
 	return 0;
 }
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 6201965bd822..11a715d76a4f 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -47,7 +47,7 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
 static void report_csk_error(struct sock *csk, int err)
 {
 	csk->sk_err = EPIPE;
-	csk->sk_error_report(csk);
+	sk_error_report(csk);
 }
 
 static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index d55f4ef736a5..706a26a1b0fe 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1240,7 +1240,7 @@ void __mptcp_error_report(struct sock *sk)
 
 		/* This barrier is coupled with smp_rmb() in mptcp_poll() */
 		smp_wmb();
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 		break;
 	}
 }
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 6133e412b948..d233ac4a91b6 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -351,7 +351,7 @@ static void netlink_overrun(struct sock *sk)
 		if (!test_and_set_bit(NETLINK_S_CONGESTED,
 				      &nlk_sk(sk)->state)) {
 			sk->sk_err = ENOBUFS;
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 		}
 	}
 	atomic_inc(&sk->sk_drops);
@@ -1576,7 +1576,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
 	}
 
 	sk->sk_err = p->code;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 out:
 	return ret;
 }
@@ -2012,7 +2012,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		ret = netlink_dump(sk);
 		if (ret) {
 			sk->sk_err = -ret;
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 		}
 	}
 
@@ -2439,7 +2439,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
 	skb = nlmsg_new(payload + tlvlen, GFP_KERNEL);
 	if (!skb) {
 		NETLINK_CB(in_skb).sk->sk_err = ENOBUFS;
-		NETLINK_CB(in_skb).sk->sk_error_report(NETLINK_CB(in_skb).sk);
+		sk_error_report(NETLINK_CB(in_skb).sk);
 		return;
 	}
 
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 5f1d438a0a23..5e39640becdb 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -49,7 +49,7 @@ static void rawsock_report_error(struct sock *sk, int err)
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
 	sk->sk_err = -err;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 
 	rawsock_write_queue_purge(sk);
 }
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 77b0cdab3810..77476184741d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3206,7 +3206,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 	} else {
 		sk->sk_err = ENETDOWN;
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_error_report(sk);
+			sk_error_report(sk);
 	}
 
 out_unlock:
@@ -4103,7 +4103,7 @@ static int packet_notifier(struct notifier_block *this,
 					__unregister_prot_hook(sk, false);
 					sk->sk_err = ENETDOWN;
 					if (!sock_flag(sk, SOCK_DEAD))
-						sk->sk_error_report(sk);
+						sk_error_report(sk);
 				}
 				if (msg == NETDEV_UNREGISTER) {
 					packet_cached_dev_reset(po);
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index f2efaa4225f9..e6f4a6202f82 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -751,7 +751,7 @@ static void qrtr_reset_ports(void)
 	xa_for_each_start(&qrtr_ports, index, ipc, 1) {
 		sock_hold(&ipc->sk);
 		ipc->sk.sk_err = ENETRESET;
-		ipc->sk.sk_error_report(&ipc->sk);
+		sk_error_report(&ipc->sk);
 		sock_put(&ipc->sk);
 	}
 	rcu_read_unlock();
diff --git a/net/sctp/input.c b/net/sctp/input.c
index fe6429cc012f..76dcc137f761 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -593,7 +593,7 @@ static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb,
 	}
 	if (!sock_owned_by_user(sk) && inet_sk(sk)->recverr) {
 		sk->sk_err = err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	} else {  /* Only an error on timeout */
 		sk->sk_err_soft = err;
 	}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 05f81a4d0ee7..d041bed86322 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -152,7 +152,7 @@ static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb,
 	icmpv6_err_convert(type, code, &err);
 	if (!sock_owned_by_user(sk) && np->recverr) {
 		sk->sk_err = err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	} else {
 		sk->sk_err_soft = err;
 	}
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e41fdac606d4..898389611ae8 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -2218,7 +2218,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 						   optval, optlen);
 	if (smc->clcsock->sk->sk_err) {
 		sk->sk_err = smc->clcsock->sk->sk_err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	}
 
 	if (optlen < sizeof(int))
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index b3815c1e8f2e..9c0343568d2a 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -58,7 +58,7 @@ static void strp_abort_strp(struct strparser *strp, int err)
 
 		/* Report an error on the lower socket */
 		sk->sk_err = -err;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 	}
 }
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 58c2f318b0a8..23c92ad15c61 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -491,7 +491,7 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 		 */
 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 			other->sk_err = ECONNRESET;
-			other->sk_error_report(other);
+			sk_error_report(other);
 		}
 	}
 }
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 21ccf450e249..9f12da1ff406 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1281,7 +1281,7 @@ static void vsock_connect_timeout(struct work_struct *work)
 	    (sk->sk_shutdown != SHUTDOWN_MASK)) {
 		sk->sk_state = TCP_CLOSE;
 		sk->sk_err = ETIMEDOUT;
-		sk->sk_error_report(sk);
+		sk_error_report(sk);
 		vsock_transport_cancel_pkt(vsk);
 	}
 	release_sock(sk);
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index ed1664e7bd88..e0c2c992ad9c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -360,7 +360,7 @@ static void virtio_vsock_reset_sock(struct sock *sk)
 	lock_sock(sk);
 	sk->sk_state = TCP_CLOSE;
 	sk->sk_err = ECONNRESET;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 	release_sock(sk);
 }
 
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index f014ccfdd9c2..169ba8b72a63 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1007,7 +1007,7 @@ destroy:
 	virtio_transport_reset(vsk, pkt);
 	sk->sk_state = TCP_CLOSE;
 	sk->sk_err = skerr;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 	return err;
 }
 
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index e617ed93f06b..7aef34e32bdf 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -831,7 +831,7 @@ static void vmci_transport_handle_detach(struct sock *sk)
 
 				sk->sk_state = TCP_CLOSE;
 				sk->sk_err = ECONNRESET;
-				sk->sk_error_report(sk);
+				sk_error_report(sk);
 				return;
 			}
 			sk->sk_state = TCP_CLOSE;
@@ -1365,7 +1365,7 @@ destroy:
 
 	sk->sk_state = TCP_CLOSE;
 	sk->sk_err = skerr;
-	sk->sk_error_report(sk);
+	sk_error_report(sk);
 	return err;
 }
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 996da915f520..d6b500dc4208 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1313,7 +1313,7 @@ static int xsk_notifier(struct notifier_block *this,
 			if (xs->dev == dev) {
 				sk->sk_err = ENETDOWN;
 				if (!sock_flag(sk, SOCK_DEAD))
-					sk->sk_error_report(sk);
+					sk_error_report(sk);
 
 				xsk_unbind_dev(xs);
 
-- 
cgit v1.2.3


From 5a9b876e9d76810536bac70c78d961198612919c Mon Sep 17 00:00:00 2001
From: Ling Pei Lee <pei.lee.ling@intel.com>
Date: Tue, 29 Jun 2021 11:08:57 +0800
Subject: net: stmmac: option to enable PHY WOL with PMT enabled

The current stmmac driver WOL implementation will enable MAC WOL
if MAC HW PMT feature is on. Else, the driver will check for
PHY WOL support. There is another case where MAC HW PMT is
enabled but the platform still goes for the PHY WOL option.
E.g, Intel platform are designed for PHY WOL but not MAC WOL
although HW MAC PMT features are enabled.

Introduce use_phy_wol platform data to select PHY WOL
instead of depending on HW PMT features. Set use_phy_wol
will disable the plat->pmt which currently used to
determine the system to wake up by MAC WOL or PHY WOL.

Signed-off-by: Ling Pei Lee <pei.lee.ling@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 ++-
 include/linux/stmmac.h                            | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 219535ab2c0c..8d9d6ecf8c63 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -6529,7 +6529,8 @@ static int stmmac_hw_init(struct stmmac_priv *priv)
 		 * register (if supported).
 		 */
 		priv->plat->enh_desc = priv->dma_cap.enh_desc;
-		priv->plat->pmt = priv->dma_cap.pmt_remote_wake_up;
+		priv->plat->pmt = priv->dma_cap.pmt_remote_wake_up &&
+				!priv->plat->use_phy_wol;
 		priv->hw->pmt = priv->plat->pmt;
 		if (priv->dma_cap.hash_tb_sz) {
 			priv->hw->multicast_filter_bins =
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 3867980d1447..d5ae621d66ba 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -265,5 +265,6 @@ struct plat_stmmacenet_data {
 	int msi_sfty_ue_vec;
 	int msi_rx_base_vec;
 	int msi_tx_base_vec;
+	bool use_phy_wol;
 };
 #endif
-- 
cgit v1.2.3


From 3e0f897fd92662f0ff21ca1759d724a9ad574858 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 23 Jun 2021 09:54:42 +0530
Subject: cpufreq: Remove the ->stop_cpu() driver callback

Now that all users of ->stop_cpu() have been migrated to using other
callbacks, drop it from the core.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Minor edits in the subject and changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/cpu-drivers.rst                    | 3 ---
 Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 3 ---
 drivers/cpufreq/cpufreq.c                                 | 3 ---
 include/linux/cpufreq.h                                   | 1 -
 4 files changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst
index a697278ce190..74fac797c396 100644
--- a/Documentation/cpu-freq/cpu-drivers.rst
+++ b/Documentation/cpu-freq/cpu-drivers.rst
@@ -71,9 +71,6 @@ And optionally
  .exit - A pointer to a per-policy cleanup function called during
  CPU_POST_DEAD phase of cpu hotplug process.
 
- .stop_cpu - A pointer to a per-policy stop function called during
- CPU_DOWN_PREPARE phase of cpu hotplug process.
-
  .suspend - A pointer to a per-policy suspend function which is called
  with interrupts disabled and _after_ the governor is stopped for the
  policy.
diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst
index 0ca2cb646666..9570e9c9e939 100644
--- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst
+++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst
@@ -76,9 +76,6 @@ CPUfreq核心层注册一个cpufreq_driver结构体。
  .exit - 一个指向per-policy清理函数的指针，该函数在cpu热插拔过程的CPU_POST_DEAD
  阶段被调用。
 
- .stop_cpu - 一个指向per-policy停止函数的指针，该函数在cpu热插拔过程的CPU_DOWN_PREPARE
- 阶段被调用。
-
  .suspend - 一个指向per-policy暂停函数的指针，该函数在关中断且在该策略的调节器停止
  后被调用。
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index cbab834c37a0..5e4b5316d254 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1606,9 +1606,6 @@ static int cpufreq_offline(unsigned int cpu)
 		policy->cdev = NULL;
 	}
 
-	if (cpufreq_driver->stop_cpu)
-		cpufreq_driver->stop_cpu(policy);
-
 	if (has_target())
 		cpufreq_exit_governor(policy);
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 353969c7acd3..2e2267a36502 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -371,7 +371,6 @@ struct cpufreq_driver {
 	int		(*online)(struct cpufreq_policy *policy);
 	int		(*offline)(struct cpufreq_policy *policy);
 	int		(*exit)(struct cpufreq_policy *policy);
-	void		(*stop_cpu)(struct cpufreq_policy *policy);
 	int		(*suspend)(struct cpufreq_policy *policy);
 	int		(*resume)(struct cpufreq_policy *policy);
 
-- 
cgit v1.2.3


From c333b936c1530e76eba4e81091874d1217046131 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 7 Jun 2021 15:24:57 +0300
Subject: pwm: core: Remove unused devm_pwm_put()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no users and seems no will come of the devm_pwm_put().
Remove the function.

While at it, slightly update documentation.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 Documentation/driver-api/driver-model/devres.rst |  3 ++-
 Documentation/driver-api/pwm.rst                 |  3 ++-
 drivers/pwm/core.c                               | 25 ------------------------
 include/linux/pwm.h                              |  5 -----
 4 files changed, 4 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index e0814d214048..772216f7c6ff 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -401,7 +401,8 @@ POWER
 
 PWM
   devm_pwm_get()
-  devm_pwm_put()
+  devm_of_pwm_get()
+  devm_fwnode_pwm_get()
 
 REGULATOR
   devm_regulator_bulk_get()
diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst
index 750734a7f874..ccb06e485756 100644
--- a/Documentation/driver-api/pwm.rst
+++ b/Documentation/driver-api/pwm.rst
@@ -40,7 +40,8 @@ after usage with pwm_free().
 
 New users should use the pwm_get() function and pass to it the consumer
 device or a consumer name. pwm_put() is used to free the PWM device. Managed
-variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist.
+variants of the getter, devm_pwm_get(), devm_of_pwm_get(),
+devm_fwnode_pwm_get(), also exist.
 
 After being requested, a PWM has to be configured using::
 
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index e823a4eba79e..4a173a00014e 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -1172,31 +1172,6 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_fwnode_pwm_get);
 
-static int devm_pwm_match(struct device *dev, void *res, void *data)
-{
-	struct pwm_device **p = res;
-
-	if (WARN_ON(!p || !*p))
-		return 0;
-
-	return *p == data;
-}
-
-/**
- * devm_pwm_put() - resource managed pwm_put()
- * @dev: device for PWM consumer
- * @pwm: PWM device
- *
- * Release a PWM previously allocated using devm_pwm_get(). Calling this
- * function is usually not needed because devm-allocated resources are
- * automatically released on driver detach.
- */
-void devm_pwm_put(struct device *dev, struct pwm_device *pwm)
-{
-	WARN_ON(devres_release(dev, devm_pwm_release, devm_pwm_match, pwm));
-}
-EXPORT_SYMBOL_GPL(devm_pwm_put);
-
 #ifdef CONFIG_DEBUG_FS
 static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
 {
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 892ece4d4cfa..a0b7e43049d5 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -426,7 +426,6 @@ struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np,
 struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 				       struct fwnode_handle *fwnode,
 				       const char *con_id);
-void devm_pwm_put(struct device *dev, struct pwm_device *pwm);
 #else
 static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
 {
@@ -533,10 +532,6 @@ devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode,
 {
 	return ERR_PTR(-ENODEV);
 }
-
-static inline void devm_pwm_put(struct device *dev, struct pwm_device *pwm)
-{
-}
 #endif
 
 static inline void pwm_apply_args(struct pwm_device *pwm)
-- 
cgit v1.2.3


From b3beca76181681fce9cf72f37d19c3030e3353c0 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 29 Jun 2021 11:57:08 +0530
Subject: cpufreq: Remove ->resolve_freq()

Commit e3c062360870 ("cpufreq: add cpufreq_driver_resolve_freq()")
introduced this callback, back in 2016, for drivers that provide the
->target() callback.

The kernel hasn't seen a single user of it in the past 5 years and
it is not likely to be used any time soon.

Remove it for now.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/cpu-drivers.rst              |  3 ---
 .../translations/zh_CN/cpu-freq/cpu-drivers.rst     |  2 --
 drivers/cpufreq/cpufreq.c                           | 21 ++++++++-------------
 include/linux/cpufreq.h                             |  9 ---------
 4 files changed, 8 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst
index 74fac797c396..d84ededb66f9 100644
--- a/Documentation/cpu-freq/cpu-drivers.rst
+++ b/Documentation/cpu-freq/cpu-drivers.rst
@@ -58,9 +58,6 @@ And optionally
 
  .driver_data - cpufreq driver specific data.
 
- .resolve_freq - Returns the most appropriate frequency for a target
- frequency. Doesn't change the frequency though.
-
  .get_intermediate and target_intermediate - Used to switch to stable
  frequency while changing CPU frequency.
 
diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst
index 9570e9c9e939..5ae9cfa2ec55 100644
--- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst
+++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst
@@ -64,8 +64,6 @@ CPUfreq核心层注册一个cpufreq_driver结构体。
 
  .driver_data - cpufreq驱动程序的特定数据。
 
- .resolve_freq - 返回最适合目标频率的频率。不过并不能改变频率。
-
  .get_intermediate 和 target_intermediate - 用于在改变CPU频率时切换到稳定
  的频率。
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 8271ed1a4947..45f3416988f1 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -527,22 +527,17 @@ EXPORT_SYMBOL_GPL(cpufreq_disable_fast_switch);
 static unsigned int __resolve_freq(struct cpufreq_policy *policy,
 		unsigned int target_freq, unsigned int relation)
 {
-	target_freq = clamp_val(target_freq, policy->min, policy->max);
-	policy->cached_target_freq = target_freq;
+	unsigned int idx;
 
-	if (cpufreq_driver->target_index) {
-		unsigned int idx;
-
-		idx = cpufreq_frequency_table_target(policy, target_freq,
-						     relation);
-		policy->cached_resolved_idx = idx;
-		return policy->freq_table[idx].frequency;
-	}
+	target_freq = clamp_val(target_freq, policy->min, policy->max);
 
-	if (cpufreq_driver->resolve_freq)
-		return cpufreq_driver->resolve_freq(policy, target_freq);
+	if (!cpufreq_driver->target_index)
+		return target_freq;
 
-	return target_freq;
+	idx = cpufreq_frequency_table_target(policy, target_freq, relation);
+	policy->cached_resolved_idx = idx;
+	policy->cached_target_freq = target_freq;
+	return policy->freq_table[idx].frequency;
 }
 
 /**
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 2e2267a36502..9fd719475fcd 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -330,15 +330,6 @@ struct cpufreq_driver {
 				       unsigned long target_perf,
 				       unsigned long capacity);
 
-	/*
-	 * Caches and returns the lowest driver-supported frequency greater than
-	 * or equal to the target frequency, subject to any driver limitations.
-	 * Does not set the frequency. Only to be implemented for drivers with
-	 * target().
-	 */
-	unsigned int	(*resolve_freq)(struct cpufreq_policy *policy,
-					unsigned int target_freq);
-
 	/*
 	 * Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION
 	 * unset.
-- 
cgit v1.2.3


From bbd7a6cc382f4317b08ba71151b23abf76fc4c34 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Mon, 28 Jun 2021 00:39:57 +0200
Subject: clk: divider: Add re-usable determine_rate implementations

These are useful when running on 32-bit systems to increase the upper
supported frequency limit. clk_ops.round_rate returns a signed long
which limits the maximum rate on 32-bit systems to 2^31 (or approx.
2.14GHz). clk_ops.determine_rate internally uses an unsigned long so
the maximum rate on 32-bit systems is 2^32 or approx. 4.29GHz.

To avoid code-duplication switch over divider_{ro_,}round_rate_parent
to use the new divider_{ro_,}determine_rate functions.

Reviewed-by: Jerome Brunet <jbrunet@baylibre.com>
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Link: https://lore.kernel.org/r/20210627223959.188139-2-martin.blumenstingl@googlemail.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-divider.c    | 75 +++++++++++++++++++++++++++++++++++---------
 include/linux/clk-provider.h |  6 ++++
 2 files changed, 67 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c
index 344997203f0e..87ba4966b0e8 100644
--- a/drivers/clk/clk-divider.c
+++ b/drivers/clk/clk-divider.c
@@ -343,16 +343,63 @@ static int clk_divider_bestdiv(struct clk_hw *hw, struct clk_hw *parent,
 	return bestdiv;
 }
 
+int divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req,
+			   const struct clk_div_table *table, u8 width,
+			   unsigned long flags)
+{
+	int div;
+
+	div = clk_divider_bestdiv(hw, req->best_parent_hw, req->rate,
+				  &req->best_parent_rate, table, width, flags);
+
+	req->rate = DIV_ROUND_UP_ULL((u64)req->best_parent_rate, div);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(divider_determine_rate);
+
+int divider_ro_determine_rate(struct clk_hw *hw, struct clk_rate_request *req,
+			      const struct clk_div_table *table, u8 width,
+			      unsigned long flags, unsigned int val)
+{
+	int div;
+
+	div = _get_div(table, val, flags, width);
+
+	/* Even a read-only clock can propagate a rate change */
+	if (clk_hw_get_flags(hw) & CLK_SET_RATE_PARENT) {
+		if (!req->best_parent_hw)
+			return -EINVAL;
+
+		req->best_parent_rate = clk_hw_round_rate(req->best_parent_hw,
+							  req->rate * div);
+	}
+
+	req->rate = DIV_ROUND_UP_ULL((u64)req->best_parent_rate, div);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(divider_ro_determine_rate);
+
 long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent,
 			       unsigned long rate, unsigned long *prate,
 			       const struct clk_div_table *table,
 			       u8 width, unsigned long flags)
 {
-	int div;
+	struct clk_rate_request req = {
+		.rate = rate,
+		.best_parent_rate = *prate,
+		.best_parent_hw = parent,
+	};
+	int ret;
 
-	div = clk_divider_bestdiv(hw, parent, rate, prate, table, width, flags);
+	ret = divider_determine_rate(hw, &req, table, width, flags);
+	if (ret)
+		return ret;
 
-	return DIV_ROUND_UP_ULL((u64)*prate, div);
+	*prate = req.best_parent_rate;
+
+	return req.rate;
 }
 EXPORT_SYMBOL_GPL(divider_round_rate_parent);
 
@@ -361,23 +408,23 @@ long divider_ro_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent,
 				  const struct clk_div_table *table, u8 width,
 				  unsigned long flags, unsigned int val)
 {
-	int div;
-
-	div = _get_div(table, val, flags, width);
+	struct clk_rate_request req = {
+		.rate = rate,
+		.best_parent_rate = *prate,
+		.best_parent_hw = parent,
+	};
+	int ret;
 
-	/* Even a read-only clock can propagate a rate change */
-	if (clk_hw_get_flags(hw) & CLK_SET_RATE_PARENT) {
-		if (!parent)
-			return -EINVAL;
+	ret = divider_ro_determine_rate(hw, &req, table, width, flags, val);
+	if (ret)
+		return ret;
 
-		*prate = clk_hw_round_rate(parent, rate * div);
-	}
+	*prate = req.best_parent_rate;
 
-	return DIV_ROUND_UP_ULL((u64)*prate, div);
+	return req.rate;
 }
 EXPORT_SYMBOL_GPL(divider_ro_round_rate_parent);
 
-
 static long clk_divider_round_rate(struct clk_hw *hw, unsigned long rate,
 				unsigned long *prate)
 {
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 162a2e5546a3..d83b829305c0 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -629,6 +629,12 @@ long divider_ro_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent,
 				  unsigned long rate, unsigned long *prate,
 				  const struct clk_div_table *table, u8 width,
 				  unsigned long flags, unsigned int val);
+int divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req,
+			   const struct clk_div_table *table, u8 width,
+			   unsigned long flags);
+int divider_ro_determine_rate(struct clk_hw *hw, struct clk_rate_request *req,
+			      const struct clk_div_table *table, u8 width,
+			      unsigned long flags, unsigned int val);
 int divider_get_val(unsigned long rate, unsigned long parent_rate,
 		const struct clk_div_table *table, u8 width,
 		unsigned long flags);
-- 
cgit v1.2.3


From 5ec780a6eddacbbbc1c5d5838753c3ca43f93526 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 24 Jun 2021 10:10:12 +0200
Subject: block: mark blk_mq_init_queue_data static

All driver uses are gone now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20210624081012.256464-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 3 +--
 include/linux/blk-mq.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2e9fd0ec63d7..2c4ac51e54eb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3109,7 +3109,7 @@ void blk_mq_release(struct request_queue *q)
 	blk_mq_sysfs_deinit(q);
 }
 
-struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
+static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 		void *queuedata)
 {
 	struct request_queue *q;
@@ -3126,7 +3126,6 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 	}
 	return q;
 }
-EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index fd2de2b422ed..1d18447ebebc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -439,8 +439,6 @@ enum {
 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
 		void *queuedata);
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
-struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
-		void *queuedata);
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		struct request_queue *q);
 void blk_mq_unregister_dev(struct device *, struct request_queue *);
-- 
cgit v1.2.3


From da6269da4cfe29f484e8fd27c1496b81b47e2499 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 24 Jun 2021 14:39:34 +0200
Subject: block: remove REQ_OP_SCSI_{IN,OUT}

With the legacy IDE driver gone drivers now use either REQ_OP_DRV_*
or REQ_OP_SCSI_*, so unify the two concepts of passthrough requests
into a single one.

Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c                   |  2 --
 block/bsg-lib.c                    |  2 +-
 block/bsg.c                        |  2 +-
 block/scsi_ioctl.c                 |  6 +++---
 drivers/block/pktcdvd.c            |  2 +-
 drivers/cdrom/cdrom.c              |  2 +-
 drivers/scsi/scsi_error.c          |  2 +-
 drivers/scsi/scsi_lib.c            |  8 ++++----
 drivers/scsi/sg.c                  |  2 +-
 drivers/scsi/st.c                  |  2 +-
 drivers/target/target_core_pscsi.c |  2 +-
 fs/nfsd/blocklayout.c              |  2 +-
 include/linux/blk_types.h          |  3 ---
 include/linux/blkdev.h             | 33 +++------------------------------
 14 files changed, 19 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 514838ccab2d..3eea8d795565 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -142,8 +142,6 @@ static const char *const blk_op_name[] = {
 	REQ_OP_NAME(ZONE_APPEND),
 	REQ_OP_NAME(WRITE_SAME),
 	REQ_OP_NAME(WRITE_ZEROES),
-	REQ_OP_NAME(SCSI_IN),
-	REQ_OP_NAME(SCSI_OUT),
 	REQ_OP_NAME(DRV_IN),
 	REQ_OP_NAME(DRV_OUT),
 };
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 330fede77271..57b082bc9017 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -45,7 +45,7 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
 		return PTR_ERR(job->request);
 
 	if (hdr->dout_xfer_len && hdr->din_xfer_len) {
-		job->bidi_rq = blk_get_request(rq->q, REQ_OP_SCSI_IN, 0);
+		job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0);
 		if (IS_ERR(job->bidi_rq)) {
 			ret = PTR_ERR(job->bidi_rq);
 			goto out;
diff --git a/block/bsg.c b/block/bsg.c
index bd10922d5cbb..323e45878362 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -152,7 +152,7 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
 		return ret;
 
 	rq = blk_get_request(q, hdr.dout_xfer_len ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 1b3fe99b83a6..41ca95bfe607 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -311,7 +311,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		at_head = 1;
 
 	ret = -ENOMEM;
-	rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+	rq = blk_get_request(q, writing ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	req = scsi_req(rq);
@@ -433,7 +433,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	}
 
-	rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+	rq = blk_get_request(q, in_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto error_free_buffer;
@@ -521,7 +521,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	struct request *rq;
 	int err;
 
-	rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0);
+	rq = blk_get_request(q, REQ_OP_DRV_OUT, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index f69b5c69c2a6..538446b652de 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -704,7 +704,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 	int ret = 0;
 
 	rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
-			     REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+			     REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 90ad34c6ef8e..feb827eefd1a 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2186,7 +2186,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 
 		len = nr * CD_FRAMESIZE_RAW;
 
-		rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
+		rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
 		if (IS_ERR(rq)) {
 			ret = PTR_ERR(rq);
 			break;
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index d8fafe77dbbe..03a2ff547b22 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2011,7 +2011,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 	struct request *req;
 	struct scsi_request *rq;
 
-	req = blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN, 0);
+	req = blk_get_request(sdev->request_queue, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req))
 		return;
 	rq = scsi_req(req);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 532304d42f00..6cc7dad923cb 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -215,7 +215,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 
 	req = blk_get_request(sdev->request_queue,
 			data_direction == DMA_TO_DEVICE ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
+			REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
 			rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0);
 	if (IS_ERR(req))
 		return ret;
@@ -540,7 +540,7 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
 	if (blk_queue_add_random(q))
 		add_disk_randomness(req->rq_disk);
 
-	if (!blk_rq_is_scsi(req)) {
+	if (!blk_rq_is_passthrough(req)) {
 		WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
 		cmd->flags &= ~SCMD_INITIALIZED;
 	}
@@ -1115,7 +1115,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 	bool in_flight;
 	int budget_token = cmd->budget_token;
 
-	if (!blk_rq_is_scsi(rq) && !(flags & SCMD_INITIALIZED)) {
+	if (!blk_rq_is_passthrough(rq) && !(flags & SCMD_INITIALIZED)) {
 		flags |= SCMD_INITIALIZED;
 		scsi_initialize_rq(rq);
 	}
@@ -1556,7 +1556,7 @@ static blk_status_t scsi_prepare_cmd(struct request *req)
 	 * Special handling for passthrough commands, which don't go to the ULP
 	 * at all:
 	 */
-	if (blk_rq_is_scsi(req))
+	if (blk_rq_is_passthrough(req))
 		return scsi_setup_scsi_cmnd(sdev, req);
 
 	if (sdev->handler && sdev->handler->prep_fn) {
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index def7ec3bbaf9..f84fa550dd15 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1756,7 +1756,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
 	 * not expect an EWOULDBLOCK from this condition.
 	 */
 	rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq)) {
 		kfree(long_cmdp);
 		return PTR_ERR(rq);
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 3b1afe1d5b27..86c951c654a8 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -549,7 +549,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 
 	req = blk_get_request(SRpnt->stp->device->request_queue,
 			data_direction == DMA_TO_DEVICE ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req))
 		return DRIVER_ERROR << 24;
 	rq = scsi_req(req);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index f2a11414366d..4531cf47d24e 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -982,7 +982,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
 
 	req = blk_get_request(pdv->pdv_sd->request_queue,
 			cmd->data_direction == DMA_TO_DEVICE ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req)) {
 		pr_err("PSCSI: blk_get_request() failed\n");
 		ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 1058659a8d31..c99dee99a3c1 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -236,7 +236,7 @@ again:
 	if (!buf)
 		return -ENOMEM;
 
-	rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq)) {
 		error = -ENOMEM;
 		goto out_free_buf;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index fd3860d18d7e..db61f7df1823 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -350,9 +350,6 @@ enum req_opf {
 	/* reset all the zone present on the device */
 	REQ_OP_ZONE_RESET_ALL	= 17,
 
-	/* SCSI passthrough using struct scsi_request */
-	REQ_OP_SCSI_IN		= 32,
-	REQ_OP_SCSI_OUT		= 33,
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= 34,
 	REQ_OP_DRV_OUT		= 35,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d66d0da72529..d199e51524eb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -240,42 +240,15 @@ struct request {
 	void *end_io_data;
 };
 
-static inline bool blk_op_is_scsi(unsigned int op)
-{
-	return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
-}
-
-static inline bool blk_op_is_private(unsigned int op)
+static inline bool blk_op_is_passthrough(unsigned int op)
 {
+	op &= REQ_OP_MASK;
 	return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
 }
 
-static inline bool blk_rq_is_scsi(struct request *rq)
-{
-	return blk_op_is_scsi(req_op(rq));
-}
-
-static inline bool blk_rq_is_private(struct request *rq)
-{
-	return blk_op_is_private(req_op(rq));
-}
-
 static inline bool blk_rq_is_passthrough(struct request *rq)
 {
-	return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
-}
-
-static inline bool bio_is_passthrough(struct bio *bio)
-{
-	unsigned op = bio_op(bio);
-
-	return blk_op_is_scsi(op) || blk_op_is_private(op);
-}
-
-static inline bool blk_op_is_passthrough(unsigned int op)
-{
-	return (blk_op_is_scsi(op & REQ_OP_MASK) ||
-			blk_op_is_private(op & REQ_OP_MASK));
+	return blk_op_is_passthrough(req_op(rq));
 }
 
 static inline unsigned short req_get_ioprio(struct request *req)
-- 
cgit v1.2.3


From fb9b16e15cd70e21d8af7f03d700deb9509c2ce8 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 10 Jun 2021 14:44:36 -0700
Subject: block: return errors from blk_execute_rq()

The synchronous blk_execute_rq() had not provided a way for its callers
to know if its request was successful or not. Return the blk_status_t
result of the request.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Link: https://lore.kernel.org/r/20210610214437.641245-4-kbusch@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-exec.c       | 7 +++++--
 include/linux/blkdev.h | 4 +++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 38f88552aa31..d6cd501c0d34 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -21,7 +21,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
 {
 	struct completion *waiting = rq->end_io_data;
 
-	rq->end_io_data = NULL;
+	rq->end_io_data = (void *)(uintptr_t)error;
 
 	/*
 	 * complete last, if this is a stack request the process (and thus
@@ -85,8 +85,9 @@ static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
  * Description:
  *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution and wait for completion.
+ * Return: The blk_status_t result provided to blk_mq_end_request().
  */
-void blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head)
+blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	unsigned long hang_check;
@@ -103,5 +104,7 @@ void blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head)
 		while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
 	else
 		wait_for_completion_io(&wait);
+
+	return (blk_status_t)(uintptr_t)rq->end_io_data;
 }
 EXPORT_SYMBOL(blk_execute_rq);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d199e51524eb..c454fb446fd0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -909,10 +909,12 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
 			       struct rq_map_data *, const struct iov_iter *,
 			       gfp_t);
-extern void blk_execute_rq(struct gendisk *, struct request *, int);
 extern void blk_execute_rq_nowait(struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 
+blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
+			    int at_head);
+
 /* Helper to convert REQ_OP_XXX to its string format XXX */
 extern const char *blk_op_str(unsigned int op);
 
-- 
cgit v1.2.3


From 1eb5dde674f57b1a1918dab33f09e35cdd64eb07 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 23 Jun 2020 15:49:40 +0530
Subject: cpufreq: CPPC: Add support for frequency invariance

The Frequency Invariance Engine (FIE) is providing a frequency scaling
correction factor that helps achieve more accurate load-tracking.

Normally, this scaling factor can be obtained directly with the help of
the cpufreq drivers as they know the exact frequency the hardware is
running at. But that isn't the case for CPPC cpufreq driver.

Another way of obtaining that is using the arch specific counter
support, which is already present in kernel, but that hardware is
optional for platforms.

This patch updates the CPPC driver to register itself with the topology
core to provide its own implementation (cppc_scale_freq_tick()) of
topology_scale_freq_tick() which gets called by the scheduler on every
tick. Note that the arch specific counters have higher priority than
CPPC counters, if available, though the CPPC driver doesn't need to have
any special handling for that.

On an invocation of cppc_scale_freq_tick(), we schedule an irq work
(since we reach here from hard-irq context), which then schedules a
normal work item and cppc_scale_freq_workfn() updates the per_cpu
arch_freq_scale variable based on the counter updates since the last
tick.

To allow platforms to disable this CPPC counter-based frequency
invariance support, this is all done under CONFIG_ACPI_CPPC_CPUFREQ_FIE,
which is enabled by default.

This also exports sched_setattr_nocheck() as the CPPC driver can be
built as a module.

Cc: linux-acpi@vger.kernel.org
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Qian Cai <quic_qiancai@quicinc.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/Kconfig.arm    |  10 ++
 drivers/cpufreq/cppc_cpufreq.c | 252 ++++++++++++++++++++++++++++++++++++++---
 include/linux/arch_topology.h  |   1 +
 kernel/sched/core.c            |   1 +
 4 files changed, 251 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index e65e0a43be64..a5c5f70acfc9 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ
 
 	  If in doubt, say N.
 
+config ACPI_CPPC_CPUFREQ_FIE
+	bool "Frequency Invariance support for CPPC cpufreq driver"
+	depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
+	default y
+	help
+	  This extends frequency invariance support in the CPPC cpufreq driver,
+	  by using CPPC delivered and reference performance counters.
+
+	  If in doubt, say N.
+
 config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
 	tristate "Allwinner nvmem based SUN50I CPUFreq driver"
 	depends on ARCH_SUNXI
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 4a7f0f9b8c60..d4c27022b9c9 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -10,14 +10,18 @@
 
 #define pr_fmt(fmt)	"CPPC Cpufreq:"	fmt
 
+#include <linux/arch_topology.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/dmi.h>
+#include <linux/irq_work.h>
+#include <linux/kthread.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
+#include <uapi/linux/sched/types.h>
 
 #include <asm/unaligned.h>
 
@@ -57,6 +61,216 @@ static struct cppc_workaround_oem_info wa_info[] = {
 	}
 };
 
+#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
+
+/* Frequency invariance support */
+struct cppc_freq_invariance {
+	int cpu;
+	struct irq_work irq_work;
+	struct kthread_work work;
+	struct cppc_perf_fb_ctrs prev_perf_fb_ctrs;
+	struct cppc_cpudata *cpu_data;
+};
+
+static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
+static struct kthread_worker *kworker_fie;
+
+static struct cpufreq_driver cppc_cpufreq_driver;
+static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu);
+static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
+				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
+				 struct cppc_perf_fb_ctrs *fb_ctrs_t1);
+
+/**
+ * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance
+ * @work: The work item.
+ *
+ * The CPPC driver register itself with the topology core to provide its own
+ * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which
+ * gets called by the scheduler on every tick.
+ *
+ * Note that the arch specific counters have higher priority than CPPC counters,
+ * if available, though the CPPC driver doesn't need to have any special
+ * handling for that.
+ *
+ * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we
+ * reach here from hard-irq context), which then schedules a normal work item
+ * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable
+ * based on the counter updates since the last tick.
+ */
+static void cppc_scale_freq_workfn(struct kthread_work *work)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	struct cppc_perf_fb_ctrs fb_ctrs = {0};
+	struct cppc_cpudata *cpu_data;
+	unsigned long local_freq_scale;
+	u64 perf;
+
+	cppc_fi = container_of(work, struct cppc_freq_invariance, work);
+	cpu_data = cppc_fi->cpu_data;
+
+	if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) {
+		pr_warn("%s: failed to read perf counters\n", __func__);
+		return;
+	}
+
+	perf = cppc_perf_from_fbctrs(cpu_data, &cppc_fi->prev_perf_fb_ctrs,
+				     &fb_ctrs);
+	cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
+
+	perf <<= SCHED_CAPACITY_SHIFT;
+	local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf);
+
+	/* This can happen due to counter's overflow */
+	if (unlikely(local_freq_scale > 1024))
+		local_freq_scale = 1024;
+
+	per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale;
+}
+
+static void cppc_irq_work(struct irq_work *irq_work)
+{
+	struct cppc_freq_invariance *cppc_fi;
+
+	cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work);
+	kthread_queue_work(kworker_fie, &cppc_fi->work);
+}
+
+static void cppc_scale_freq_tick(void)
+{
+	struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id());
+
+	/*
+	 * cppc_get_perf_ctrs() can potentially sleep, call that from the right
+	 * context.
+	 */
+	irq_work_queue(&cppc_fi->irq_work);
+}
+
+static struct scale_freq_data cppc_sftd = {
+	.source = SCALE_FREQ_SOURCE_CPPC,
+	.set_freq_scale = cppc_scale_freq_tick,
+};
+
+static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	int cpu, ret;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	for_each_cpu(cpu, policy->cpus) {
+		cppc_fi = &per_cpu(cppc_freq_inv, cpu);
+		cppc_fi->cpu = cpu;
+		cppc_fi->cpu_data = policy->driver_data;
+		kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn);
+		init_irq_work(&cppc_fi->irq_work, cppc_irq_work);
+
+		ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs);
+		if (ret) {
+			pr_warn("%s: failed to read perf counters for cpu:%d: %d\n",
+				__func__, cpu, ret);
+
+			/*
+			 * Don't abort if the CPU was offline while the driver
+			 * was getting registered.
+			 */
+			if (cpu_online(cpu))
+				return;
+		}
+	}
+
+	/* Register for freq-invariance */
+	topology_set_scale_freq_source(&cppc_sftd, policy->cpus);
+}
+
+/*
+ * We free all the resources on policy's removal and not on CPU removal as the
+ * irq-work are per-cpu and the hotplug core takes care of flushing the pending
+ * irq-works (hint: smpcfd_dying_cpu()) on CPU hotplug. Even if the kthread-work
+ * fires on another CPU after the concerned CPU is removed, it won't harm.
+ *
+ * We just need to make sure to remove them all on policy->exit().
+ */
+static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	int cpu;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	/* policy->cpus will be empty here, use related_cpus instead */
+	topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, policy->related_cpus);
+
+	for_each_cpu(cpu, policy->related_cpus) {
+		cppc_fi = &per_cpu(cppc_freq_inv, cpu);
+		irq_work_sync(&cppc_fi->irq_work);
+		kthread_cancel_work_sync(&cppc_fi->work);
+	}
+}
+
+static void __init cppc_freq_invariance_init(void)
+{
+	struct sched_attr attr = {
+		.size		= sizeof(struct sched_attr),
+		.sched_policy	= SCHED_DEADLINE,
+		.sched_nice	= 0,
+		.sched_priority	= 0,
+		/*
+		 * Fake (unused) bandwidth; workaround to "fix"
+		 * priority inheritance.
+		 */
+		.sched_runtime	= 1000000,
+		.sched_deadline = 10000000,
+		.sched_period	= 10000000,
+	};
+	int ret;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	kworker_fie = kthread_create_worker(0, "cppc_fie");
+	if (IS_ERR(kworker_fie))
+		return;
+
+	ret = sched_setattr_nocheck(kworker_fie->task, &attr);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__,
+			ret);
+		kthread_destroy_worker(kworker_fie);
+		return;
+	}
+}
+
+static void cppc_freq_invariance_exit(void)
+{
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	kthread_destroy_worker(kworker_fie);
+	kworker_fie = NULL;
+}
+
+#else
+static inline void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy)
+{
+}
+
+static inline void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy)
+{
+}
+
+static inline void cppc_freq_invariance_init(void)
+{
+}
+
+static inline void cppc_freq_invariance_exit(void)
+{
+}
+#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */
+
 /* Callback function used to retrieve the max frequency from DMI */
 static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private)
 {
@@ -341,6 +555,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		goto out;
 	}
 
+	cppc_cpufreq_cpu_fie_init(policy);
 	return 0;
 
 out:
@@ -355,6 +570,8 @@ static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 	unsigned int cpu = policy->cpu;
 	int ret;
 
+	cppc_cpufreq_cpu_fie_exit(policy);
+
 	cpu_data->perf_ctrls.desired_perf = caps->lowest_perf;
 
 	ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
@@ -374,12 +591,12 @@ static inline u64 get_delta(u64 t1, u64 t0)
 	return (u32)t1 - (u32)t0;
 }
 
-static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
-				     struct cppc_perf_fb_ctrs *fb_ctrs_t0,
-				     struct cppc_perf_fb_ctrs *fb_ctrs_t1)
+static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
+				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
+				 struct cppc_perf_fb_ctrs *fb_ctrs_t1)
 {
 	u64 delta_reference, delta_delivered;
-	u64 reference_perf, delivered_perf;
+	u64 reference_perf;
 
 	reference_perf = fb_ctrs_t0->reference_perf;
 
@@ -388,14 +605,11 @@ static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
 	delta_delivered = get_delta(fb_ctrs_t1->delivered,
 				    fb_ctrs_t0->delivered);
 
-	/* Check to avoid divide-by zero */
-	if (delta_reference || delta_delivered)
-		delivered_perf = (reference_perf * delta_delivered) /
-					delta_reference;
-	else
-		delivered_perf = cpu_data->perf_ctrls.desired_perf;
+	/* Check to avoid divide-by zero and invalid delivered_perf */
+	if (!delta_reference || !delta_delivered)
+		return cpu_data->perf_ctrls.desired_perf;
 
-	return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf);
+	return (reference_perf * delta_delivered) / delta_reference;
 }
 
 static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
@@ -403,6 +617,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
 	struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0};
 	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
 	struct cppc_cpudata *cpu_data = policy->driver_data;
+	u64 delivered_perf;
 	int ret;
 
 	cpufreq_cpu_put(policy);
@@ -417,7 +632,10 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
 	if (ret)
 		return ret;
 
-	return cppc_get_rate_from_fbctrs(cpu_data, &fb_ctrs_t0, &fb_ctrs_t1);
+	delivered_perf = cppc_perf_from_fbctrs(cpu_data, &fb_ctrs_t0,
+					       &fb_ctrs_t1);
+
+	return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf);
 }
 
 static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state)
@@ -518,14 +736,21 @@ static void cppc_check_hisi_workaround(void)
 
 static int __init cppc_cpufreq_init(void)
 {
+	int ret;
+
 	if ((acpi_disabled) || !acpi_cpc_valid())
 		return -ENODEV;
 
 	INIT_LIST_HEAD(&cpu_data_list);
 
 	cppc_check_hisi_workaround();
+	cppc_freq_invariance_init();
 
-	return cpufreq_register_driver(&cppc_cpufreq_driver);
+	ret = cpufreq_register_driver(&cppc_cpufreq_driver);
+	if (ret)
+		cppc_freq_invariance_exit();
+
+	return ret;
 }
 
 static inline void free_cpu_data(void)
@@ -543,6 +768,7 @@ static inline void free_cpu_data(void)
 static void __exit cppc_cpufreq_exit(void)
 {
 	cpufreq_unregister_driver(&cppc_cpufreq_driver);
+	cppc_freq_invariance_exit();
 
 	free_cpu_data();
 }
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 11e555cfaecb..f180240dc95f 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -37,6 +37,7 @@ bool topology_scale_freq_invariant(void);
 enum scale_freq_source {
 	SCALE_FREQ_SOURCE_CPUFREQ = 0,
 	SCALE_FREQ_SOURCE_ARCH,
+	SCALE_FREQ_SOURCE_CPPC,
 };
 
 struct scale_freq_data {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cf16f8fda9a6..2d9ff40f4661 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7182,6 +7182,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, false, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
-- 
cgit v1.2.3


From 426e5c429d16e4cd5ded46e21ff8e939bf8abd0f Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:47:00 -0700
Subject: mm: memory_hotplug: factor out bootmem core functions to
 bootmem_info.c

Patch series "Free some vmemmap pages of HugeTLB page", v23.

This patch series will free some vmemmap pages(struct page structures)
associated with each HugeTLB page when preallocated to save memory.

In order to reduce the difficulty of the first version of code review.  In
this version, we disable PMD/huge page mapping of vmemmap if this feature
was enabled.  This acutely eliminates a bunch of the complex code doing
page table manipulation.  When this patch series is solid, we cam add the
code of vmemmap page table manipulation in the future.

The struct page structures (page structs) are used to describe a physical
page frame.  By default, there is an one-to-one mapping from a page frame
to it's corresponding page struct.

The HugeTLB pages consist of multiple base page size pages and is
supported by many architectures.  See hugetlbpage.rst in the Documentation
directory for more details.  On the x86 architecture, HugeTLB pages of
size 2MB and 1GB are currently supported.  Since the base page size on x86
is 4KB, a 2MB HugeTLB page consists of 512 base pages and a 1GB HugeTLB
page consists of 4096 base pages.  For each base page, there is a
corresponding page struct.

Within the HugeTLB subsystem, only the first 4 page structs are used to
contain unique information about a HugeTLB page.  HUGETLB_CGROUP_MIN_ORDER
provides this upper limit.  The only 'useful' information in the remaining
page structs is the compound_head field, and this field is the same for
all tail pages.

By removing redundant page structs for HugeTLB pages, memory can returned
to the buddy allocator for other uses.

When the system boot up, every 2M HugeTLB has 512 struct page structs which
size is 8 pages(sizeof(struct page) * 512 / PAGE_SIZE).

    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | -------------> |     2     |
 |           |                     +-----------+                +-----------+
 |           |                     |     3     | -------------> |     3     |
 |           |                     +-----------+                +-----------+
 |           |                     |     4     | -------------> |     4     |
 |    2MB    |                     +-----------+                +-----------+
 |           |                     |     5     | -------------> |     5     |
 |           |                     +-----------+                +-----------+
 |           |                     |     6     | -------------> |     6     |
 |           |                     +-----------+                +-----------+
 |           |                     |     7     | -------------> |     7     |
 |           |                     +-----------+                +-----------+
 |           |
 |           |
 |           |
 +-----------+

The value of page->compound_head is the same for all tail pages.  The
first page of page structs (page 0) associated with the HugeTLB page
contains the 4 page structs necessary to describe the HugeTLB.  The only
use of the remaining pages of page structs (page 1 to page 7) is to point
to page->compound_head.  Therefore, we can remap pages 2 to 7 to page 1.
Only 2 pages of page structs will be used for each HugeTLB page.  This
will allow us to free the remaining 6 pages to the buddy allocator.

Here is how things look after remapping.

    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 |    2MB    |                     +-----------+                       | | |
 |           |                     |     5     | ----------------------+ | |
 |           |                     +-----------+                         | |
 |           |                     |     6     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |     7     | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+

When a HugeTLB is freed to the buddy system, we should allocate 6 pages
for vmemmap pages and restore the previous mapping relationship.

Apart from 2MB HugeTLB page, we also have 1GB HugeTLB page.  It is similar
to the 2MB HugeTLB page.  We also can use this approach to free the
vmemmap pages.

In this case, for the 1GB HugeTLB page, we can save 4094 pages.  This is a
very substantial gain.  On our server, run some SPDK/QEMU applications
which will use 1024GB HugeTLB page.  With this feature enabled, we can
save ~16GB (1G hugepage)/~12GB (2MB hugepage) memory.

Because there are vmemmap page tables reconstruction on the
freeing/allocating path, it increases some overhead.  Here are some
overhead analysis.

1) Allocating 10240 2MB HugeTLB pages.

   a) With this patch series applied:
   # time echo 10240 > /proc/sys/vm/nr_hugepages

   real     0m0.166s
   user     0m0.000s
   sys      0m0.166s

   # bpftrace -e 'kprobe:alloc_fresh_huge_page { @start[tid] = nsecs; }
     kretprobe:alloc_fresh_huge_page /@start[tid]/ { @latency = hist(nsecs -
     @start[tid]); delete(@start[tid]); }'
   Attaching 2 probes...

   @latency:
   [8K, 16K)           5476 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
   [16K, 32K)          4760 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@       |
   [32K, 64K)             4 |                                                    |

   b) Without this patch series:
   # time echo 10240 > /proc/sys/vm/nr_hugepages

   real     0m0.067s
   user     0m0.000s
   sys      0m0.067s

   # bpftrace -e 'kprobe:alloc_fresh_huge_page { @start[tid] = nsecs; }
     kretprobe:alloc_fresh_huge_page /@start[tid]/ { @latency = hist(nsecs -
     @start[tid]); delete(@start[tid]); }'
   Attaching 2 probes...

   @latency:
   [4K, 8K)           10147 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
   [8K, 16K)             93 |                                                    |

   Summarize: this feature is about ~2x slower than before.

2) Freeing 10240 2MB HugeTLB pages.

   a) With this patch series applied:
   # time echo 0 > /proc/sys/vm/nr_hugepages

   real     0m0.213s
   user     0m0.000s
   sys      0m0.213s

   # bpftrace -e 'kprobe:free_pool_huge_page { @start[tid] = nsecs; }
     kretprobe:free_pool_huge_page /@start[tid]/ { @latency = hist(nsecs -
     @start[tid]); delete(@start[tid]); }'
   Attaching 2 probes...

   @latency:
   [8K, 16K)              6 |                                                    |
   [16K, 32K)         10227 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
   [32K, 64K)             7 |                                                    |

   b) Without this patch series:
   # time echo 0 > /proc/sys/vm/nr_hugepages

   real     0m0.081s
   user     0m0.000s
   sys      0m0.081s

   # bpftrace -e 'kprobe:free_pool_huge_page { @start[tid] = nsecs; }
     kretprobe:free_pool_huge_page /@start[tid]/ { @latency = hist(nsecs -
     @start[tid]); delete(@start[tid]); }'
   Attaching 2 probes...

   @latency:
   [4K, 8K)            6805 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
   [8K, 16K)           3427 |@@@@@@@@@@@@@@@@@@@@@@@@@@                          |
   [16K, 32K)             8 |                                                    |

   Summary: The overhead of __free_hugepage is about ~2-3x slower than before.

Although the overhead has increased, the overhead is not significant.
Like Mike said, "However, remember that the majority of use cases create
HugeTLB pages at or shortly after boot time and add them to the pool.  So,
additional overhead is at pool creation time.  There is no change to
'normal run time' operations of getting a page from or returning a page to
the pool (think page fault/unmap)".

Despite the overhead and in addition to the memory gains from this series.
The following data is obtained by Joao Martins.  Very thanks to his
effort.

There's an additional benefit which is page (un)pinners will see an improvement
and Joao presumes because there are fewer memmap pages and thus the tail/head
pages are staying in cache more often.

Out of the box Joao saw (when comparing linux-next against linux-next +
this series) with gup_test and pinning a 16G HugeTLB file (with 1G pages):

	get_user_pages(): ~32k -> ~9k
	unpin_user_pages(): ~75k -> ~70k

Usually any tight loop fetching compound_head(), or reading tail pages
data (e.g.  compound_head) benefit a lot.  There's some unpinning
inefficiencies Joao was fixing[2], but with that in added it shows even
more:

	unpin_user_pages(): ~27k -> ~3.8k

[1] https://lore.kernel.org/linux-mm/20210409205254.242291-1-mike.kravetz@oracle.com/
[2] https://lore.kernel.org/linux-mm/20210204202500.26474-1-joao.m.martins@oracle.com/

This patch (of 9):

Move bootmem info registration common API to individual bootmem_info.c.
And we will use {get,put}_page_bootmem() to initialize the page for the
vmemmap pages or free the vmemmap pages to buddy in the later patch.  So
move them out of CONFIG_MEMORY_HOTPLUG_SPARSE.  This is just code movement
without any functional change.

Link: https://lkml.kernel.org/r/20210510030027.56044-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210510030027.56044-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Chen Huang <chenhuang5@huawei.com>
Tested-by: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Mina Almasry <almasrymina@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: HORIGUCHI NAOYA <naoya.horiguchi@nec.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/sparc/mm/init_64.c        |   1 +
 arch/x86/mm/init_64.c          |   3 +-
 include/linux/bootmem_info.h   |  40 +++++++++++++
 include/linux/memory_hotplug.h |  27 ---------
 mm/Makefile                    |   1 +
 mm/bootmem_info.c              | 127 +++++++++++++++++++++++++++++++++++++++++
 mm/memory_hotplug.c            | 116 -------------------------------------
 mm/sparse.c                    |   1 +
 8 files changed, 172 insertions(+), 144 deletions(-)
 create mode 100644 include/linux/bootmem_info.h
 create mode 100644 mm/bootmem_info.c

(limited to 'include/linux')

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 06e938d03f3b..1b23639e2fcd 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -27,6 +27,7 @@
 #include <linux/percpu.h>
 #include <linux/mmzone.h>
 #include <linux/gfp.h>
+#include <linux/bootmem_info.h>
 
 #include <asm/head.h>
 #include <asm/page.h>
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e527d829e1ed..3aaf1d30c777 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -33,6 +33,7 @@
 #include <linux/nmi.h>
 #include <linux/gfp.h>
 #include <linux/kcore.h>
+#include <linux/bootmem_info.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -1623,7 +1624,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 	return err;
 }
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
 void register_page_bootmem_memmap(unsigned long section_nr,
 				  struct page *start_page, unsigned long nr_pages)
 {
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
new file mode 100644
index 000000000000..4ed6dee1adc9
--- /dev/null
+++ b/include/linux/bootmem_info.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_BOOTMEM_INFO_H
+#define __LINUX_BOOTMEM_INFO_H
+
+#include <linux/mmzone.h>
+
+/*
+ * Types for free bootmem stored in page->lru.next. These have to be in
+ * some random range in unsigned long space for debugging purposes.
+ */
+enum {
+	MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
+	SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
+	MIX_SECTION_INFO,
+	NODE_INFO,
+	MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
+};
+
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
+
+void get_page_bootmem(unsigned long info, struct page *page,
+		      unsigned long type);
+void put_page_bootmem(struct page *page);
+#else
+static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+}
+
+static inline void put_page_bootmem(struct page *page)
+{
+}
+
+static inline void get_page_bootmem(unsigned long info, struct page *page,
+				    unsigned long type)
+{
+}
+#endif
+
+#endif /* __LINUX_BOOTMEM_INFO_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 28f32fd00fe9..a7fd2c3ccb77 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -18,18 +18,6 @@ struct vmem_altmap;
 #ifdef CONFIG_MEMORY_HOTPLUG
 struct page *pfn_to_online_page(unsigned long pfn);
 
-/*
- * Types for free bootmem stored in page->lru.next. These have to be in
- * some random range in unsigned long space for debugging purposes.
- */
-enum {
-	MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
-	SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
-	MIX_SECTION_INFO,
-	NODE_INFO,
-	MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
-};
-
 /* Types for control the zone type of onlined and offlined memory */
 enum {
 	/* Offline the memory. */
@@ -222,17 +210,6 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
-#else
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-#endif
-extern void put_page_bootmem(struct page *page);
-extern void get_page_bootmem(unsigned long ingo, struct page *page,
-			     unsigned long type);
-
 void get_online_mems(void);
 void put_online_mems(void);
 
@@ -260,10 +237,6 @@ static inline void zone_span_writelock(struct zone *zone) {}
 static inline void zone_span_writeunlock(struct zone *zone) {}
 static inline void zone_seqlock_init(struct zone *zone) {}
 
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-
 static inline int try_online_node(int nid)
 {
 	return 0;
diff --git a/mm/Makefile b/mm/Makefile
index bf71e295e9f6..61ce71ddf7bb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
 obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
 obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
 obj-$(CONFIG_IO_MAPPING) += io-mapping.o
+obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
new file mode 100644
index 000000000000..5b152dba7344
--- /dev/null
+++ b/mm/bootmem_info.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Bootmem core functions.
+ *
+ * Copyright (c) 2020, Bytedance.
+ *
+ *     Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ */
+#include <linux/mm.h>
+#include <linux/compiler.h>
+#include <linux/memblock.h>
+#include <linux/bootmem_info.h>
+#include <linux/memory_hotplug.h>
+
+void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
+{
+	page->freelist = (void *)type;
+	SetPagePrivate(page);
+	set_page_private(page, info);
+	page_ref_inc(page);
+}
+
+void put_page_bootmem(struct page *page)
+{
+	unsigned long type;
+
+	type = (unsigned long) page->freelist;
+	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
+
+	if (page_ref_dec_return(page) == 1) {
+		page->freelist = NULL;
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		INIT_LIST_HEAD(&page->lru);
+		free_reserved_page(page);
+	}
+}
+
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+	unsigned long mapsize, section_nr, i;
+	struct mem_section *ms;
+	struct page *page, *memmap;
+	struct mem_section_usage *usage;
+
+	section_nr = pfn_to_section_nr(start_pfn);
+	ms = __nr_to_section(section_nr);
+
+	/* Get section's memmap address */
+	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+	/*
+	 * Get page for the memmap's phys address
+	 * XXX: need more consideration for sparse_vmemmap...
+	 */
+	page = virt_to_page(memmap);
+	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
+
+	/* remember memmap's page */
+	for (i = 0; i < mapsize; i++, page++)
+		get_page_bootmem(section_nr, page, SECTION_INFO);
+
+	usage = ms->usage;
+	page = virt_to_page(usage);
+
+	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+	for (i = 0; i < mapsize; i++, page++)
+		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+
+}
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+	unsigned long mapsize, section_nr, i;
+	struct mem_section *ms;
+	struct page *page, *memmap;
+	struct mem_section_usage *usage;
+
+	section_nr = pfn_to_section_nr(start_pfn);
+	ms = __nr_to_section(section_nr);
+
+	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+	usage = ms->usage;
+	page = virt_to_page(usage);
+
+	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+	for (i = 0; i < mapsize; i++, page++)
+		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+	unsigned long i, pfn, end_pfn, nr_pages;
+	int node = pgdat->node_id;
+	struct page *page;
+
+	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+	page = virt_to_page(pgdat);
+
+	for (i = 0; i < nr_pages; i++, page++)
+		get_page_bootmem(node, page, NODE_INFO);
+
+	pfn = pgdat->node_start_pfn;
+	end_pfn = pgdat_end_pfn(pgdat);
+
+	/* register section info */
+	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		/*
+		 * Some platforms can assign the same pfn to multiple nodes - on
+		 * node0 as well as nodeN.  To avoid registering a pfn against
+		 * multiple nodes we check that this pfn does not already
+		 * reside in some other nodes.
+		 */
+		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
+			register_page_bootmem_info_section(pfn);
+	}
+}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 974a565797d8..ae7a07b02049 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -154,122 +154,6 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-void get_page_bootmem(unsigned long info,  struct page *page,
-		      unsigned long type)
-{
-	page->freelist = (void *)type;
-	SetPagePrivate(page);
-	set_page_private(page, info);
-	page_ref_inc(page);
-}
-
-void put_page_bootmem(struct page *page)
-{
-	unsigned long type;
-
-	type = (unsigned long) page->freelist;
-	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
-	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
-
-	if (page_ref_dec_return(page) == 1) {
-		page->freelist = NULL;
-		ClearPagePrivate(page);
-		set_page_private(page, 0);
-		INIT_LIST_HEAD(&page->lru);
-		free_reserved_page(page);
-	}
-}
-
-#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
-	unsigned long mapsize, section_nr, i;
-	struct mem_section *ms;
-	struct page *page, *memmap;
-	struct mem_section_usage *usage;
-
-	section_nr = pfn_to_section_nr(start_pfn);
-	ms = __nr_to_section(section_nr);
-
-	/* Get section's memmap address */
-	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
-	/*
-	 * Get page for the memmap's phys address
-	 * XXX: need more consideration for sparse_vmemmap...
-	 */
-	page = virt_to_page(memmap);
-	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
-	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
-
-	/* remember memmap's page */
-	for (i = 0; i < mapsize; i++, page++)
-		get_page_bootmem(section_nr, page, SECTION_INFO);
-
-	usage = ms->usage;
-	page = virt_to_page(usage);
-
-	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
-	for (i = 0; i < mapsize; i++, page++)
-		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-
-}
-#else /* CONFIG_SPARSEMEM_VMEMMAP */
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
-	unsigned long mapsize, section_nr, i;
-	struct mem_section *ms;
-	struct page *page, *memmap;
-	struct mem_section_usage *usage;
-
-	section_nr = pfn_to_section_nr(start_pfn);
-	ms = __nr_to_section(section_nr);
-
-	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
-	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
-
-	usage = ms->usage;
-	page = virt_to_page(usage);
-
-	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
-	for (i = 0; i < mapsize; i++, page++)
-		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-
-void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-	unsigned long i, pfn, end_pfn, nr_pages;
-	int node = pgdat->node_id;
-	struct page *page;
-
-	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
-	page = virt_to_page(pgdat);
-
-	for (i = 0; i < nr_pages; i++, page++)
-		get_page_bootmem(node, page, NODE_INFO);
-
-	pfn = pgdat->node_start_pfn;
-	end_pfn = pgdat_end_pfn(pgdat);
-
-	/* register section info */
-	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-		/*
-		 * Some platforms can assign the same pfn to multiple nodes - on
-		 * node0 as well as nodeN.  To avoid registering a pfn against
-		 * multiple nodes we check that this pfn does not already
-		 * reside in some other nodes.
-		 */
-		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
-			register_page_bootmem_info_section(pfn);
-	}
-}
-#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-
 static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
 		const char *reason)
 {
diff --git a/mm/sparse.c b/mm/sparse.c
index 7272f7a1449d..6326cdf36c4f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -13,6 +13,7 @@
 #include <linux/vmalloc.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/bootmem_info.h>
 
 #include "internal.h"
 #include <asm/dma.h>
-- 
cgit v1.2.3


From cd39d4e9e71c5437b67c819c3d53032145bf2879 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:47:09 -0700
Subject: mm: hugetlb: gather discrete indexes of tail page

For HugeTLB page, there are more metadata to save in the struct page.  But
the head struct page cannot meet our needs, so we have to abuse other tail
struct page to store the metadata.  In order to avoid conflicts caused by
subsequent use of more tail struct pages, we can gather these discrete
indexes of tail struct page.  In this case, it will be easier to add a new
tail page index later.

Link: https://lkml.kernel.org/r/20210510030027.56044-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Chen Huang <chenhuang5@huawei.com>
Tested-by: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: HORIGUCHI NAOYA <naoya.horiguchi@nec.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h        | 21 +++++++++++++++++++--
 include/linux/hugetlb_cgroup.h | 19 +++++++++++--------
 2 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3c0117656745..0c8c96481259 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -29,6 +29,23 @@ typedef struct { unsigned long pd; } hugepd_t;
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
+/*
+ * For HugeTLB page, there are more metadata to save in the struct page. But
+ * the head struct page cannot meet our needs, so we have to abuse other tail
+ * struct page to store the metadata. In order to avoid conflicts caused by
+ * subsequent use of more tail struct pages, we gather these discrete indexes
+ * of tail struct page here.
+ */
+enum {
+	SUBPAGE_INDEX_SUBPOOL = 1,	/* reuse page->private */
+#ifdef CONFIG_CGROUP_HUGETLB
+	SUBPAGE_INDEX_CGROUP,		/* reuse page->private */
+	SUBPAGE_INDEX_CGROUP_RSVD,	/* reuse page->private */
+	__MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD,
+#endif
+	__NR_USED_SUBPAGE,
+};
+
 struct hugepage_subpool {
 	spinlock_t lock;
 	long count;
@@ -635,13 +652,13 @@ extern unsigned int default_hstate_idx;
  */
 static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
 {
-	return (struct hugepage_subpool *)(hpage+1)->private;
+	return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL);
 }
 
 static inline void hugetlb_set_page_subpool(struct page *hpage,
 					struct hugepage_subpool *subpool)
 {
-	set_page_private(hpage+1, (unsigned long)subpool);
+	set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool);
 }
 
 static inline struct hstate *hstate_file(struct file *f)
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 0bff345c4bc6..0b8d1fdda3a1 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -21,15 +21,16 @@ struct hugetlb_cgroup;
 struct resv_map;
 struct file_region;
 
+#ifdef CONFIG_CGROUP_HUGETLB
 /*
  * Minimum page order trackable by hugetlb cgroup.
  * At least 4 pages are necessary for all the tracking information.
- * The second tail page (hpage[2]) is the fault usage cgroup.
- * The third tail page (hpage[3]) is the reservation usage cgroup.
+ * The second tail page (hpage[SUBPAGE_INDEX_CGROUP]) is the fault
+ * usage cgroup. The third tail page (hpage[SUBPAGE_INDEX_CGROUP_RSVD])
+ * is the reservation usage cgroup.
  */
-#define HUGETLB_CGROUP_MIN_ORDER	2
+#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__MAX_CGROUP_SUBPAGE_INDEX + 1)
 
-#ifdef CONFIG_CGROUP_HUGETLB
 enum hugetlb_memory_event {
 	HUGETLB_MAX,
 	HUGETLB_NR_MEMORY_EVENTS,
@@ -66,9 +67,9 @@ __hugetlb_cgroup_from_page(struct page *page, bool rsvd)
 	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
 		return NULL;
 	if (rsvd)
-		return (struct hugetlb_cgroup *)page[3].private;
+		return (void *)page_private(page + SUBPAGE_INDEX_CGROUP_RSVD);
 	else
-		return (struct hugetlb_cgroup *)page[2].private;
+		return (void *)page_private(page + SUBPAGE_INDEX_CGROUP);
 }
 
 static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
@@ -90,9 +91,11 @@ static inline int __set_hugetlb_cgroup(struct page *page,
 	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
 		return -1;
 	if (rsvd)
-		page[3].private = (unsigned long)h_cg;
+		set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD,
+				 (unsigned long)h_cg);
 	else
-		page[2].private = (unsigned long)h_cg;
+		set_page_private(page + SUBPAGE_INDEX_CGROUP,
+				 (unsigned long)h_cg);
 	return 0;
 }
 
-- 
cgit v1.2.3


From f41f2ed43ca5258d70d53290d1951a21621f95c8 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:47:13 -0700
Subject: mm: hugetlb: free the vmemmap pages associated with each HugeTLB page

Every HugeTLB has more than one struct page structure.  We __know__ that
we only use the first 4 (__NR_USED_SUBPAGE) struct page structures to
store metadata associated with each HugeTLB.

There are a lot of struct page structures associated with each HugeTLB
page.  For tail pages, the value of compound_head is the same.  So we can
reuse first page of tail page structures.  We map the virtual addresses of
the remaining pages of tail page structures to the first tail page struct,
and then free these page frames.  Therefore, we need to reserve two pages
as vmemmap areas.

When we allocate a HugeTLB page from the buddy, we can free some vmemmap
pages associated with each HugeTLB page.  It is more appropriate to do it
in the prep_new_huge_page().

The free_vmemmap_pages_per_hpage(), which indicates how many vmemmap pages
associated with a HugeTLB page can be freed, returns zero for now, which
means the feature is disabled.  We will enable it once all the
infrastructure is there.

[willy@infradead.org: fix documentation warning]
  Link: https://lkml.kernel.org/r/20210615200242.1716568-5-willy@infradead.org

Link: https://lkml.kernel.org/r/20210510030027.56044-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Tested-by: Chen Huang <chenhuang5@huawei.com>
Tested-by: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: HORIGUCHI NAOYA <naoya.horiguchi@nec.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem_info.h |  28 +++++-
 include/linux/mm.h           |   3 +
 mm/Makefile                  |   1 +
 mm/hugetlb.c                 |  22 ++---
 mm/hugetlb_vmemmap.c         | 218 +++++++++++++++++++++++++++++++++++++++++++
 mm/hugetlb_vmemmap.h         |  20 ++++
 mm/sparse-vmemmap.c          | 194 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 473 insertions(+), 13 deletions(-)
 create mode 100644 mm/hugetlb_vmemmap.c
 create mode 100644 mm/hugetlb_vmemmap.h

(limited to 'include/linux')

diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index 4ed6dee1adc9..2bc8b1f69c93 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -2,7 +2,7 @@
 #ifndef __LINUX_BOOTMEM_INFO_H
 #define __LINUX_BOOTMEM_INFO_H
 
-#include <linux/mmzone.h>
+#include <linux/mm.h>
 
 /*
  * Types for free bootmem stored in page->lru.next. These have to be in
@@ -22,6 +22,27 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
 void get_page_bootmem(unsigned long info, struct page *page,
 		      unsigned long type);
 void put_page_bootmem(struct page *page);
+
+/*
+ * Any memory allocated via the memblock allocator and not via the
+ * buddy will be marked reserved already in the memmap. For those
+ * pages, we can call this function to free it to buddy allocator.
+ */
+static inline void free_bootmem_page(struct page *page)
+{
+	unsigned long magic = (unsigned long)page->freelist;
+
+	/*
+	 * The reserve_bootmem_region sets the reserved flag on bootmem
+	 * pages.
+	 */
+	VM_BUG_ON_PAGE(page_ref_count(page) != 2, page);
+
+	if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
+		put_page_bootmem(page);
+	else
+		VM_BUG_ON_PAGE(1, page);
+}
 #else
 static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -35,6 +56,11 @@ static inline void get_page_bootmem(unsigned long info, struct page *page,
 				    unsigned long type)
 {
 }
+
+static inline void free_bootmem_page(struct page *page)
+{
+	free_reserved_page(page);
+}
 #endif
 
 #endif /* __LINUX_BOOTMEM_INFO_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 07922ee1477e..3437aa7c6c91 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3076,6 +3076,9 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 }
 #endif
 
+void vmemmap_remap_free(unsigned long start, unsigned long end,
+			unsigned long reuse);
+
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
diff --git a/mm/Makefile b/mm/Makefile
index 61ce71ddf7bb..74b47c354682 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
+obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)	+= hugetlb_vmemmap.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 103f1187043f..5f5493f0f003 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -41,6 +41,7 @@
 #include <linux/node.h>
 #include <linux/page_owner.h>
 #include "internal.h"
+#include "hugetlb_vmemmap.h"
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
@@ -1493,8 +1494,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 	h->nr_huge_pages_node[nid]++;
 }
 
-static void __prep_new_huge_page(struct page *page)
+static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
+	free_huge_page_vmemmap(h, page);
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	hugetlb_set_page_subpool(page, NULL);
@@ -1504,7 +1506,7 @@ static void __prep_new_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
-	__prep_new_huge_page(page);
+	__prep_new_huge_page(h, page);
 	spin_lock_irq(&hugetlb_lock);
 	__prep_account_new_huge_page(h, nid);
 	spin_unlock_irq(&hugetlb_lock);
@@ -2351,14 +2353,15 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
 
 	/*
 	 * Before dissolving the page, we need to allocate a new one for the
-	 * pool to remain stable. Using alloc_buddy_huge_page() allows us to
-	 * not having to deal with prep_new_huge_page() and avoids dealing of any
-	 * counters. This simplifies and let us do the whole thing under the
-	 * lock.
+	 * pool to remain stable.  Here, we allocate the page and 'prep' it
+	 * by doing everything but actually updating counters and adding to
+	 * the pool.  This simplifies and let us do most of the processing
+	 * under the lock.
 	 */
 	new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
 	if (!new_page)
 		return -ENOMEM;
+	__prep_new_huge_page(h, new_page);
 
 retry:
 	spin_lock_irq(&hugetlb_lock);
@@ -2397,14 +2400,9 @@ retry:
 		remove_hugetlb_page(h, old_page, false);
 
 		/*
-		 * new_page needs to be initialized with the standard hugetlb
-		 * state. This is normally done by prep_new_huge_page() but
-		 * that takes hugetlb_lock which is already held so we need to
-		 * open code it here.
 		 * Reference count trick is needed because allocator gives us
 		 * referenced page but the pool requires pages with 0 refcount.
 		 */
-		__prep_new_huge_page(new_page);
 		__prep_account_new_huge_page(h, nid);
 		page_ref_dec(new_page);
 		enqueue_huge_page(h, new_page);
@@ -2420,7 +2418,7 @@ retry:
 
 free_new:
 	spin_unlock_irq(&hugetlb_lock);
-	__free_pages(new_page, huge_page_order(h));
+	update_and_free_page(h, new_page);
 
 	return ret;
 }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
new file mode 100644
index 000000000000..e45a138a7f85
--- /dev/null
+++ b/mm/hugetlb_vmemmap.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ *     Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ * The struct page structures (page structs) are used to describe a physical
+ * page frame. By default, there is a one-to-one mapping from a page frame to
+ * it's corresponding page struct.
+ *
+ * HugeTLB pages consist of multiple base page size pages and is supported by
+ * many architectures. See hugetlbpage.rst in the Documentation directory for
+ * more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB
+ * are currently supported. Since the base page size on x86 is 4KB, a 2MB
+ * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
+ * 4096 base pages. For each base page, there is a corresponding page struct.
+ *
+ * Within the HugeTLB subsystem, only the first 4 page structs are used to
+ * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
+ * this upper limit. The only 'useful' information in the remaining page structs
+ * is the compound_head field, and this field is the same for all tail pages.
+ *
+ * By removing redundant page structs for HugeTLB pages, memory can be returned
+ * to the buddy allocator for other uses.
+ *
+ * Different architectures support different HugeTLB pages. For example, the
+ * following table is the HugeTLB page size supported by x86 and arm64
+ * architectures. Because arm64 supports 4k, 16k, and 64k base pages and
+ * supports contiguous entries, so it supports many kinds of sizes of HugeTLB
+ * page.
+ *
+ * +--------------+-----------+-----------------------------------------------+
+ * | Architecture | Page Size |                HugeTLB Page Size              |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ * |    x86-64    |    4KB    |    2MB    |    1GB    |           |           |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ * |              |    4KB    |   64KB    |    2MB    |    32MB   |    1GB    |
+ * |              +-----------+-----------+-----------+-----------+-----------+
+ * |    arm64     |   16KB    |    2MB    |   32MB    |     1GB   |           |
+ * |              +-----------+-----------+-----------+-----------+-----------+
+ * |              |   64KB    |    2MB    |  512MB    |    16GB   |           |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ *
+ * When the system boot up, every HugeTLB page has more than one struct page
+ * structs which size is (unit: pages):
+ *
+ *    struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+ *
+ * Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
+ * of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
+ * relationship.
+ *
+ *    HugeTLB_Size = n * PAGE_SIZE
+ *
+ * Then,
+ *
+ *    struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+ *                = n * sizeof(struct page) / PAGE_SIZE
+ *
+ * We can use huge mapping at the pud/pmd level for the HugeTLB page.
+ *
+ * For the HugeTLB page of the pmd level mapping, then
+ *
+ *    struct_size = n * sizeof(struct page) / PAGE_SIZE
+ *                = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
+ *                = sizeof(struct page) / sizeof(pte_t)
+ *                = 64 / 8
+ *                = 8 (pages)
+ *
+ * Where n is how many pte entries which one page can contains. So the value of
+ * n is (PAGE_SIZE / sizeof(pte_t)).
+ *
+ * This optimization only supports 64-bit system, so the value of sizeof(pte_t)
+ * is 8. And this optimization also applicable only when the size of struct page
+ * is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
+ * x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
+ * size of struct page structs of it is 8 page frames which size depends on the
+ * size of the base page.
+ *
+ * For the HugeTLB page of the pud level mapping, then
+ *
+ *    struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
+ *                = PAGE_SIZE / 8 * 8 (pages)
+ *                = PAGE_SIZE (pages)
+ *
+ * Where the struct_size(pmd) is the size of the struct page structs of a
+ * HugeTLB page of the pmd level mapping.
+ *
+ * E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
+ * HugeTLB page consists in 4096.
+ *
+ * Next, we take the pmd level mapping of the HugeTLB page as an example to
+ * show the internal implementation of this optimization. There are 8 pages
+ * struct page structs associated with a HugeTLB page which is pmd mapped.
+ *
+ * Here is how things look before optimization.
+ *
+ *    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ * |           |                     |     0     | -------------> |     0     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     1     | -------------> |     1     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     2     | -------------> |     2     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     3     | -------------> |     3     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     4     | -------------> |     4     |
+ * |    PMD    |                     +-----------+                +-----------+
+ * |   level   |                     |     5     | -------------> |     5     |
+ * |  mapping  |                     +-----------+                +-----------+
+ * |           |                     |     6     | -------------> |     6     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     7     | -------------> |     7     |
+ * |           |                     +-----------+                +-----------+
+ * |           |
+ * |           |
+ * |           |
+ * +-----------+
+ *
+ * The value of page->compound_head is the same for all tail pages. The first
+ * page of page structs (page 0) associated with the HugeTLB page contains the 4
+ * page structs necessary to describe the HugeTLB. The only use of the remaining
+ * pages of page structs (page 1 to page 7) is to point to page->compound_head.
+ * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
+ * will be used for each HugeTLB page. This will allow us to free the remaining
+ * 6 pages to the buddy allocator.
+ *
+ * Here is how things look after remapping.
+ *
+ *    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ * |           |                     |     0     | -------------> |     0     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     1     | -------------> |     1     |
+ * |           |                     +-----------+                +-----------+
+ * |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ * |           |                     +-----------+                   | | | | |
+ * |           |                     |     3     | ------------------+ | | | |
+ * |           |                     +-----------+                     | | | |
+ * |           |                     |     4     | --------------------+ | | |
+ * |    PMD    |                     +-----------+                       | | |
+ * |   level   |                     |     5     | ----------------------+ | |
+ * |  mapping  |                     +-----------+                         | |
+ * |           |                     |     6     | ------------------------+ |
+ * |           |                     +-----------+                           |
+ * |           |                     |     7     | --------------------------+
+ * |           |                     +-----------+
+ * |           |
+ * |           |
+ * |           |
+ * +-----------+
+ *
+ * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
+ * vmemmap pages and restore the previous mapping relationship.
+ *
+ * For the HugeTLB page of the pud level mapping. It is similar to the former.
+ * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages.
+ *
+ * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
+ * (e.g. aarch64) provides a contiguous bit in the translation table entries
+ * that hints to the MMU to indicate that it is one of a contiguous set of
+ * entries that can be cached in a single TLB entry.
+ *
+ * The contiguous bit is used to increase the mapping size at the pmd and pte
+ * (last) level. So this type of HugeTLB page can be optimized only when its
+ * size of the struct page structs is greater than 2 pages.
+ */
+#include "hugetlb_vmemmap.h"
+
+/*
+ * There are a lot of struct page structures associated with each HugeTLB page.
+ * For tail pages, the value of compound_head is the same. So we can reuse first
+ * page of tail page structures. We map the virtual addresses of the remaining
+ * pages of tail page structures to the first tail page struct, and then free
+ * these page frames. Therefore, we need to reserve two pages as vmemmap areas.
+ */
+#define RESERVE_VMEMMAP_NR		2U
+#define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
+
+/*
+ * How many vmemmap pages associated with a HugeTLB page that can be freed
+ * to the buddy allocator.
+ *
+ * Todo: Returns zero for now, which means the feature is disabled. We will
+ * enable it once all the infrastructure is there.
+ */
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+	return 0;
+}
+
+static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
+{
+	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
+}
+
+void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	unsigned long vmemmap_addr = (unsigned long)head;
+	unsigned long vmemmap_end, vmemmap_reuse;
+
+	if (!free_vmemmap_pages_per_hpage(h))
+		return;
+
+	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+
+	/*
+	 * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
+	 * to the page which @vmemmap_reuse is mapped to, then free the pages
+	 * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
+	 */
+	vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse);
+}
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
new file mode 100644
index 000000000000..6923f03534d5
--- /dev/null
+++ b/mm/hugetlb_vmemmap.h
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ *     Author: Muchun Song <songmuchun@bytedance.com>
+ */
+#ifndef _LINUX_HUGETLB_VMEMMAP_H
+#define _LINUX_HUGETLB_VMEMMAP_H
+#include <linux/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+#else
+static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* _LINUX_HUGETLB_VMEMMAP_H */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 16183d85a7d5..3ec5488c815c 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,8 +27,202 @@
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/pgtable.h>
+#include <linux/bootmem_info.h>
+
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct vmemmap_remap_walk - walk vmemmap page table
+ *
+ * @remap_pte:		called for each lowest-level entry (PTE).
+ * @reuse_page:		the page which is reused for the tail vmemmap pages.
+ * @reuse_addr:		the virtual address of the @reuse_page page.
+ * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
+ */
+struct vmemmap_remap_walk {
+	void (*remap_pte)(pte_t *pte, unsigned long addr,
+			  struct vmemmap_remap_walk *walk);
+	struct page *reuse_page;
+	unsigned long reuse_addr;
+	struct list_head *vmemmap_pages;
+};
+
+static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
+			      unsigned long end,
+			      struct vmemmap_remap_walk *walk)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	/*
+	 * The reuse_page is found 'first' in table walk before we start
+	 * remapping (which is calling @walk->remap_pte).
+	 */
+	if (!walk->reuse_page) {
+		walk->reuse_page = pte_page(*pte);
+		/*
+		 * Because the reuse address is part of the range that we are
+		 * walking, skip the reuse address range.
+		 */
+		addr += PAGE_SIZE;
+		pte++;
+	}
+
+	for (; addr != end; addr += PAGE_SIZE, pte++)
+		walk->remap_pte(pte, addr, walk);
+}
+
+static void vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+			      unsigned long end,
+			      struct vmemmap_remap_walk *walk)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		BUG_ON(pmd_leaf(*pmd));
+
+		next = pmd_addr_end(addr, end);
+		vmemmap_pte_range(pmd, addr, next, walk);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static void vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+			      unsigned long end,
+			      struct vmemmap_remap_walk *walk)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(p4d, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		vmemmap_pmd_range(pud, addr, next, walk);
+	} while (pud++, addr = next, addr != end);
+}
+
+static void vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+			      unsigned long end,
+			      struct vmemmap_remap_walk *walk)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		vmemmap_pud_range(p4d, addr, next, walk);
+	} while (p4d++, addr = next, addr != end);
+}
+
+static void vmemmap_remap_range(unsigned long start, unsigned long end,
+				struct vmemmap_remap_walk *walk)
+{
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgd;
+
+	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
+	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		vmemmap_p4d_range(pgd, addr, next, walk);
+	} while (pgd++, addr = next, addr != end);
+
+	/*
+	 * We only change the mapping of the vmemmap virtual address range
+	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
+	 * belongs to the range.
+	 */
+	flush_tlb_kernel_range(start + PAGE_SIZE, end);
+}
+
+/*
+ * Free a vmemmap page. A vmemmap page can be allocated from the memblock
+ * allocator or buddy allocator. If the PG_reserved flag is set, it means
+ * that it allocated from the memblock allocator, just free it via the
+ * free_bootmem_page(). Otherwise, use __free_page().
+ */
+static inline void free_vmemmap_page(struct page *page)
+{
+	if (PageReserved(page))
+		free_bootmem_page(page);
+	else
+		__free_page(page);
+}
+
+/* Free a list of the vmemmap pages */
+static void free_vmemmap_page_list(struct list_head *list)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		list_del(&page->lru);
+		free_vmemmap_page(page);
+	}
+}
+
+static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
+			      struct vmemmap_remap_walk *walk)
+{
+	/*
+	 * Remap the tail pages as read-only to catch illegal write operation
+	 * to the tail pages.
+	 */
+	pgprot_t pgprot = PAGE_KERNEL_RO;
+	pte_t entry = mk_pte(walk->reuse_page, pgprot);
+	struct page *page = pte_page(*pte);
+
+	list_add(&page->lru, walk->vmemmap_pages);
+	set_pte_at(&init_mm, addr, pte, entry);
+}
+
+/**
+ * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
+ *			to the page which @reuse is mapped to, then free vmemmap
+ *			which the range are mapped to.
+ * @start:	start address of the vmemmap virtual address range that we want
+ *		to remap.
+ * @end:	end address of the vmemmap virtual address range that we want to
+ *		remap.
+ * @reuse:	reuse address.
+ *
+ * Note: This function depends on vmemmap being base page mapped. Please make
+ * sure that we disable PMD mapping of vmemmap pages when calling this function.
+ */
+void vmemmap_remap_free(unsigned long start, unsigned long end,
+			unsigned long reuse)
+{
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_remap_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	/*
+	 * In order to make remapping routine most efficient for the huge pages,
+	 * the routine of vmemmap page table walking has the following rules
+	 * (see more details from the vmemmap_pte_range()):
+	 *
+	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
+	 *   should be continuous.
+	 * - The @reuse address is part of the range [@reuse, @end) that we are
+	 *   walking which is passed to vmemmap_remap_range().
+	 * - The @reuse address is the first in the complete range.
+	 *
+	 * So we need to make sure that @start and @reuse meet the above rules.
+	 */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	vmemmap_remap_range(reuse, end, &walk);
+	free_vmemmap_page_list(&vmemmap_pages);
+}
 
 /*
  * Allocate a block of memory to be used to back the virtual memory map
-- 
cgit v1.2.3


From ad2fa3717b74994a22519dbe045757135db00dbb Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:47:21 -0700
Subject: mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB
 page

When we free a HugeTLB page to the buddy allocator, we need to allocate
the vmemmap pages associated with it.  However, we may not be able to
allocate the vmemmap pages when the system is under memory pressure.  In
this case, we just refuse to free the HugeTLB page.  This changes behavior
in some corner cases as listed below:

 1) Failing to free a huge page triggered by the user (decrease nr_pages).

    User needs to try again later.

 2) Failing to free a surplus huge page when freed by the application.

    Try again later when freeing a huge page next time.

 3) Failing to dissolve a free huge page on ZONE_MOVABLE via
    offline_pages().

    This can happen when we have plenty of ZONE_MOVABLE memory, but
    not enough kernel memory to allocate vmemmmap pages.  We may even
    be able to migrate huge page contents, but will not be able to
    dissolve the source huge page.  This will prevent an offline
    operation and is unfortunate as memory offlining is expected to
    succeed on movable zones.  Users that depend on memory hotplug
    to succeed for movable zones should carefully consider whether the
    memory savings gained from this feature are worth the risk of
    possibly not being able to offline memory in certain situations.

 4) Failing to dissolve a huge page on CMA/ZONE_MOVABLE via
    alloc_contig_range() - once we have that handling in place. Mainly
    affects CMA and virtio-mem.

    Similar to 3). virito-mem will handle migration errors gracefully.
    CMA might be able to fallback on other free areas within the CMA
    region.

Vmemmap pages are allocated from the page freeing context.  In order for
those allocations to be not disruptive (e.g.  trigger oom killer)
__GFP_NORETRY is used.  hugetlb_lock is dropped for the allocation because
a non sleeping allocation would be too fragile and it could fail too
easily under memory pressure.  GFP_ATOMIC or other modes to access memory
reserves is not used because we want to prevent consuming reserves under
heavy hugetlb freeing.

[mike.kravetz@oracle.com: fix dissolve_free_huge_page use of tail/head page]
  Link: https://lkml.kernel.org/r/20210527231225.226987-1-mike.kravetz@oracle.com
[willy@infradead.org: fix alloc_vmemmap_page_list documentation warning]
  Link: https://lkml.kernel.org/r/20210615200242.1716568-6-willy@infradead.org

Link: https://lkml.kernel.org/r/20210510030027.56044-7-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: HORIGUCHI NAOYA <naoya.horiguchi@nec.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/mm/hugetlbpage.rst    |  8 ++
 Documentation/admin-guide/mm/memory-hotplug.rst | 13 ++++
 include/linux/hugetlb.h                         |  3 +
 include/linux/mm.h                              |  2 +
 mm/hugetlb.c                                    | 98 +++++++++++++++++++++----
 mm/hugetlb_vmemmap.c                            | 34 +++++++++
 mm/hugetlb_vmemmap.h                            |  6 ++
 mm/migrate.c                                    |  5 +-
 mm/sparse-vmemmap.c                             | 75 ++++++++++++++++++-
 9 files changed, 227 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index f7b1c7462991..6988895d09a8 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -60,6 +60,10 @@ HugePages_Surp
         the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
         maximum number of surplus huge pages is controlled by
         ``/proc/sys/vm/nr_overcommit_hugepages``.
+	Note: When the feature of freeing unused vmemmap pages associated
+	with each hugetlb page is enabled, the number of surplus huge pages
+	may be temporarily larger than the maximum number of surplus huge
+	pages when the system is under memory pressure.
 Hugepagesize
 	is the default hugepage size (in Kb).
 Hugetlb
@@ -80,6 +84,10 @@ returned to the huge page pool when freed by a task.  A user with root
 privileges can dynamically allocate more or free some persistent huge pages
 by increasing or decreasing the value of ``nr_hugepages``.
 
+Note: When the feature of freeing unused vmemmap pages associated with each
+hugetlb page is enabled, we can fail to free the huge pages triggered by
+the user when ths system is under memory pressure.  Please try again later.
+
 Pages that are used as huge pages are reserved inside the kernel and cannot
 be used for other purposes.  Huge pages cannot be swapped out under
 memory pressure.
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 05d51d2d8beb..c6bae2d77160 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -357,6 +357,19 @@ creates ZONE_MOVABLE as following.
    Unfortunately, there is no information to show which memory block belongs
    to ZONE_MOVABLE. This is TBD.
 
+   Memory offlining can fail when dissolving a free huge page on ZONE_MOVABLE
+   and the feature of freeing unused vmemmap pages associated with each hugetlb
+   page is enabled.
+
+   This can happen when we have plenty of ZONE_MOVABLE memory, but not enough
+   kernel memory to allocate vmemmmap pages.  We may even be able to migrate
+   huge page contents, but will not be able to dissolve the source huge page.
+   This will prevent an offline operation and is unfortunate as memory offlining
+   is expected to succeed on movable zones.  Users that depend on memory hotplug
+   to succeed for movable zones should carefully consider whether the memory
+   savings gained from this feature are worth the risk of possibly not being
+   able to offline memory in certain situations.
+
 .. note::
    Techniques that rely on long-term pinnings of memory (especially, RDMA and
    vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0c8c96481259..3578d9d708fe 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -532,12 +532,14 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  *	modifications require hugetlb_lock.
  * HPG_freed - Set when page is on the free lists.
  *	Synchronization: hugetlb_lock held for examination and modification.
+ * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
 	HPG_migratable,
 	HPG_temporary,
 	HPG_freed,
+	HPG_vmemmap_optimized,
 	__NR_HPAGEFLAGS,
 };
 
@@ -583,6 +585,7 @@ HPAGEFLAG(RestoreReserve, restore_reserve)
 HPAGEFLAG(Migratable, migratable)
 HPAGEFLAG(Temporary, temporary)
 HPAGEFLAG(Freed, freed)
+HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3437aa7c6c91..706bee98d965 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3078,6 +3078,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 
 void vmemmap_remap_free(unsigned long start, unsigned long end,
 			unsigned long reuse);
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			unsigned long reuse, gfp_t gfp_mask);
 
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e7eb1ab8c78a..778db5de6232 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1376,6 +1376,39 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
 	h->nr_huge_pages_node[nid]--;
 }
 
+static void add_hugetlb_page(struct hstate *h, struct page *page,
+			     bool adjust_surplus)
+{
+	int zeroed;
+	int nid = page_to_nid(page);
+
+	VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+
+	lockdep_assert_held(&hugetlb_lock);
+
+	INIT_LIST_HEAD(&page->lru);
+	h->nr_huge_pages++;
+	h->nr_huge_pages_node[nid]++;
+
+	if (adjust_surplus) {
+		h->surplus_huge_pages++;
+		h->surplus_huge_pages_node[nid]++;
+	}
+
+	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+	set_page_private(page, 0);
+	SetHPageVmemmapOptimized(page);
+
+	/*
+	 * This page is now managed by the hugetlb allocator and has
+	 * no users -- drop the last reference.
+	 */
+	zeroed = put_page_testzero(page);
+	VM_BUG_ON_PAGE(!zeroed, page);
+	arch_clear_hugepage_flags(page);
+	enqueue_huge_page(h, page);
+}
+
 static void __update_and_free_page(struct hstate *h, struct page *page)
 {
 	int i;
@@ -1384,6 +1417,18 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
 
+	if (alloc_huge_page_vmemmap(h, page)) {
+		spin_lock_irq(&hugetlb_lock);
+		/*
+		 * If we cannot allocate vmemmap pages, just refuse to free the
+		 * page and put the page back on the hugetlb free list and treat
+		 * as a surplus page.
+		 */
+		add_hugetlb_page(h, page, true);
+		spin_unlock_irq(&hugetlb_lock);
+		return;
+	}
+
 	for (i = 0; i < pages_per_huge_page(h);
 	     i++, subpage = mem_map_next(subpage, page, i)) {
 		subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1450,7 +1495,7 @@ static inline void flush_free_hpage_work(struct hstate *h)
 static void update_and_free_page(struct hstate *h, struct page *page,
 				 bool atomic)
 {
-	if (!free_vmemmap_pages_per_hpage(h) || !atomic) {
+	if (!HPageVmemmapOptimized(page) || !atomic) {
 		__update_and_free_page(h, page);
 		return;
 	}
@@ -1806,10 +1851,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
  * nothing for in-use hugepages and non-hugepages.
  * This function returns values like below:
  *
- *  -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- *          (allocated or reserved.)
- *       0: successfully dissolved free hugepages or the page is not a
- *          hugepage (considered as already dissolved)
+ *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
+ *           when the system is under memory pressure and the feature of
+ *           freeing unused vmemmap pages associated with each hugetlb page
+ *           is enabled.
+ *  -EBUSY:  failed to dissolved free hugepages or the hugepage is in-use
+ *           (allocated or reserved.)
+ *       0:  successfully dissolved free hugepages or the page is not a
+ *           hugepage (considered as already dissolved)
  */
 int dissolve_free_huge_page(struct page *page)
 {
@@ -1851,19 +1900,38 @@ retry:
 			goto retry;
 		}
 
-		/*
-		 * Move PageHWPoison flag from head page to the raw error page,
-		 * which makes any subpages rather than the error page reusable.
-		 */
-		if (PageHWPoison(head) && page != head) {
-			SetPageHWPoison(page);
-			ClearPageHWPoison(head);
-		}
 		remove_hugetlb_page(h, head, false);
 		h->max_huge_pages--;
 		spin_unlock_irq(&hugetlb_lock);
-		update_and_free_page(h, head, false);
-		return 0;
+
+		/*
+		 * Normally update_and_free_page will allocate required vmemmmap
+		 * before freeing the page.  update_and_free_page will fail to
+		 * free the page if it can not allocate required vmemmap.  We
+		 * need to adjust max_huge_pages if the page is not freed.
+		 * Attempt to allocate vmemmmap here so that we can take
+		 * appropriate action on failure.
+		 */
+		rc = alloc_huge_page_vmemmap(h, head);
+		if (!rc) {
+			/*
+			 * Move PageHWPoison flag from head page to the raw
+			 * error page, which makes any subpages rather than
+			 * the error page reusable.
+			 */
+			if (PageHWPoison(head) && page != head) {
+				SetPageHWPoison(page);
+				ClearPageHWPoison(head);
+			}
+			update_and_free_page(h, head, false);
+		} else {
+			spin_lock_irq(&hugetlb_lock);
+			add_hugetlb_page(h, head, false);
+			h->max_huge_pages++;
+			spin_unlock_irq(&hugetlb_lock);
+		}
+
+		return rc;
 	}
 out:
 	spin_unlock_irq(&hugetlb_lock);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index cb28c5b6c9ff..a897c7778246 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -185,6 +185,38 @@ static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
 }
 
+/*
+ * Previously discarded vmemmap pages will be allocated and remapping
+ * after this function returns zero.
+ */
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	int ret;
+	unsigned long vmemmap_addr = (unsigned long)head;
+	unsigned long vmemmap_end, vmemmap_reuse;
+
+	if (!HPageVmemmapOptimized(head))
+		return 0;
+
+	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+	/*
+	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
+	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
+	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
+	 * When a HugeTLB page is freed to the buddy allocator, previously
+	 * discarded vmemmap pages must be allocated and remapping.
+	 */
+	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
+
+	if (!ret)
+		ClearHPageVmemmapOptimized(head);
+
+	return ret;
+}
+
 void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 	unsigned long vmemmap_addr = (unsigned long)head;
@@ -203,4 +235,6 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 	 * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
 	 */
 	vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse);
+
+	SetHPageVmemmapOptimized(head);
 }
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 01f8637adbe0..a37771b0b82a 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,6 +11,7 @@
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
 
 /*
@@ -25,6 +26,11 @@ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 	return 0;
 }
 #else
+static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	return 0;
+}
+
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 380ca57b9031..cc4d6af41683 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -626,7 +626,10 @@ void migrate_page_states(struct page *newpage, struct page *page)
 	if (PageSwapCache(page))
 		ClearPageSwapCache(page);
 	ClearPagePrivate(page);
-	set_page_private(page, 0);
+
+	/* page->private contains hugetlb specific flags */
+	if (!PageHuge(page))
+		set_page_private(page, 0);
 
 	/*
 	 * If any waiters have accumulated on the new page then
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 3ec5488c815c..a3aa275e2668 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@
  * @remap_pte:		called for each lowest-level entry (PTE).
  * @reuse_page:		the page which is reused for the tail vmemmap pages.
  * @reuse_addr:		the virtual address of the @reuse_page page.
- * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
+ * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
+ *			or is mapped from.
  */
 struct vmemmap_remap_walk {
 	void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -224,6 +225,78 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+				struct vmemmap_remap_walk *walk)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct page *page;
+	void *to;
+
+	BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+	list_del(&page->lru);
+	to = page_to_virt(page);
+	copy_page(to, (void *)walk->reuse_addr);
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+				   gfp_t gfp_mask, struct list_head *list)
+{
+	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+	int nid = page_to_nid((struct page *)start);
+	struct page *page, *next;
+
+	while (nr_pages--) {
+		page = alloc_pages_node(nid, gfp_mask, 0);
+		if (!page)
+			goto out;
+		list_add_tail(&page->lru, list);
+	}
+
+	return 0;
+out:
+	list_for_each_entry_safe(page, next, list, lru)
+		__free_pages(page, 0);
+	return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ *			 to the page which is from the @vmemmap_pages
+ *			 respectively.
+ * @start:	start address of the vmemmap virtual address range that we want
+ *		to remap.
+ * @end:	end address of the vmemmap virtual address range that we want to
+ *		remap.
+ * @reuse:	reuse address.
+ * @gfp_mask:	GFP flag for allocating vmemmap pages.
+ */
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			unsigned long reuse, gfp_t gfp_mask)
+{
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_restore_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	/* See the comment in the vmemmap_remap_free(). */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+
+	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+		return -ENOMEM;
+
+	vmemmap_remap_range(reuse, end, &walk);
+
+	return 0;
+}
+
 /*
  * Allocate a block of memory to be used to back the virtual memory map
  * or to back the page tables that are used to create the mapping.
-- 
cgit v1.2.3


From e9fdff87e893ec5b7c32836675db80cf691b2a8b Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:47:25 -0700
Subject: mm: hugetlb: add a kernel parameter hugetlb_free_vmemmap

Add a kernel parameter hugetlb_free_vmemmap to enable the feature of
freeing unused vmemmap pages associated with each hugetlb page on boot.

We disable PMD mapping of vmemmap pages for x86-64 arch when this feature
is enabled.  Because vmemmap_remap_free() depends on vmemmap being base
page mapped.

Link: https://lkml.kernel.org/r/20210510030027.56044-8-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Barry Song <song.bao.hua@hisilicon.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Chen Huang <chenhuang5@huawei.com>
Tested-by: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: HORIGUCHI NAOYA <naoya.horiguchi@nec.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++
 Documentation/admin-guide/mm/hugetlbpage.rst    |  3 +++
 arch/x86/mm/init_64.c                           |  8 ++++++--
 include/linux/hugetlb.h                         | 19 +++++++++++++++++++
 mm/hugetlb_vmemmap.c                            | 24 ++++++++++++++++++++++++
 5 files changed, 69 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 566c4b9af3cd..b787a9953f2e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1567,6 +1567,23 @@
 			Documentation/admin-guide/mm/hugetlbpage.rst.
 			Format: size[KMG]
 
+	hugetlb_free_vmemmap=
+			[KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+			enabled.
+			Allows heavy hugetlb users to free up some more
+			memory (6 * PAGE_SIZE for each 2MB hugetlb page).
+			This feauture is not free though. Large page
+			tables are not used to back vmemmap pages which
+			can lead to a performance degradation for some
+			workloads. Also there will be memory allocation
+			required when hugetlb pages are freed from the
+			pool which can lead to corner cases under heavy
+			memory pressure.
+			Format: { on | off (default) }
+
+			on:  enable the feature
+			off: disable the feature
+
 	hung_task_panic=
 			[KNL] Should the hung task detector generate panics.
 			Format: 0 | 1
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 6988895d09a8..8abaeb144e44 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -153,6 +153,9 @@ default_hugepagesz
 
 	will all result in 256 2M huge pages being allocated.  Valid default
 	huge page size is architecture dependent.
+hugetlb_free_vmemmap
+	When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing
+	unused vmemmap pages associated with each HugeTLB page.
 
 When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
 indicates the current number of pre-allocated huge pages of the default size.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 65ea58527176..9d9d18d0c2a1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,6 +34,7 @@
 #include <linux/gfp.h>
 #include <linux/kcore.h>
 #include <linux/bootmem_info.h>
+#include <linux/hugetlb.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -1609,7 +1610,8 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
 	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
 
-	if (end - start < PAGES_PER_SECTION * sizeof(struct page))
+	if ((is_hugetlb_free_vmemmap_enabled()  && !altmap) ||
+	    end - start < PAGES_PER_SECTION * sizeof(struct page))
 		err = vmemmap_populate_basepages(start, end, node, NULL);
 	else if (boot_cpu_has(X86_FEATURE_PSE))
 		err = vmemmap_populate_hugepages(start, end, node, altmap);
@@ -1637,6 +1639,8 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 	pmd_t *pmd;
 	unsigned int nr_pmd_pages;
 	struct page *page;
+	bool base_mapping = !boot_cpu_has(X86_FEATURE_PSE) ||
+			    is_hugetlb_free_vmemmap_enabled();
 
 	for (; addr < end; addr = next) {
 		pte_t *pte = NULL;
@@ -1662,7 +1666,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 		}
 		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
 
-		if (!boot_cpu_has(X86_FEATURE_PSE)) {
+		if (base_mapping) {
 			next = (addr + PAGE_SIZE) & PAGE_MASK;
 			pmd = pmd_offset(pud, addr);
 			if (pmd_none(*pmd))
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3578d9d708fe..9ad99848f9f0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -892,6 +892,20 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 }
 #endif
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+extern bool hugetlb_free_vmemmap_enabled;
+
+static inline bool is_hugetlb_free_vmemmap_enabled(void)
+{
+	return hugetlb_free_vmemmap_enabled;
+}
+#else
+static inline bool is_hugetlb_free_vmemmap_enabled(void)
+{
+	return false;
+}
+#endif
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
@@ -1046,6 +1060,11 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
 					pte_t *ptep, pte_t pte, unsigned long sz)
 {
 }
+
+static inline bool is_hugetlb_free_vmemmap_enabled(void)
+{
+	return false;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
 static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index a897c7778246..3070e1465b1b 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -168,6 +168,8 @@
  * (last) level. So this type of HugeTLB page can be optimized only when its
  * size of the struct page structs is greater than 2 pages.
  */
+#define pr_fmt(fmt)	"HugeTLB: " fmt
+
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -180,6 +182,28 @@
 #define RESERVE_VMEMMAP_NR		2U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
+bool hugetlb_free_vmemmap_enabled;
+
+static int __init early_hugetlb_free_vmemmap_param(char *buf)
+{
+	/* We cannot optimize if a "struct page" crosses page boundaries. */
+	if ((!is_power_of_2(sizeof(struct page)))) {
+		pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
+		return 0;
+	}
+
+	if (!buf)
+		return -EINVAL;
+
+	if (!strcmp(buf, "on"))
+		hugetlb_free_vmemmap_enabled = true;
+	else if (strcmp(buf, "off"))
+		return -EINVAL;
+
+	return 0;
+}
+early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param);
+
 static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 {
 	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
-- 
cgit v1.2.3


From 774905878fc9b0b9a5ee4a889b97f773a077aeee Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:47:33 -0700
Subject: mm: hugetlb: introduce nr_free_vmemmap_pages in the struct hstate

All the infrastructure is ready, so we introduce nr_free_vmemmap_pages
field in the hstate to indicate how many vmemmap pages associated with a
HugeTLB page that can be freed to buddy allocator.  And initialize it in
the hugetlb_vmemmap_init().  This patch is actual enablement of the
feature.

There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct page
structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP, so add a
BUILD_BUG_ON to catch invalid usage of the tail struct page.

Link: https://lkml.kernel.org/r/20210510030027.56044-10-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Chen Huang <chenhuang5@huawei.com>
Tested-by: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: HORIGUCHI NAOYA <naoya.horiguchi@nec.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  3 +++
 mm/hugetlb.c            |  1 +
 mm/hugetlb_vmemmap.c    | 33 +++++++++++++++++++++++++++++++++
 mm/hugetlb_vmemmap.h    | 10 ++++++----
 4 files changed, 43 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 9ad99848f9f0..8c1920844236 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -608,6 +608,9 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+	unsigned int nr_free_vmemmap_pages;
+#endif
 #ifdef CONFIG_CGROUP_HUGETLB
 	/* cgroup control files */
 	struct cftype cgroup_files_dfl[7];
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 778db5de6232..dd9c90c082fc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3585,6 +3585,7 @@ void __init hugetlb_add_hstate(unsigned int order)
 	h->next_nid_to_free = first_memory_node;
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
+	hugetlb_vmemmap_init(h);
 
 	parsed_hstate = h;
 }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 3070e1465b1b..f9f9bb212319 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -262,3 +262,36 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 
 	SetHPageVmemmapOptimized(head);
 }
+
+void __init hugetlb_vmemmap_init(struct hstate *h)
+{
+	unsigned int nr_pages = pages_per_huge_page(h);
+	unsigned int vmemmap_pages;
+
+	/*
+	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
+	 * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP,
+	 * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
+	 */
+	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
+		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
+
+	if (!hugetlb_free_vmemmap_enabled)
+		return;
+
+	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
+	/*
+	 * The head page and the first tail page are not to be freed to buddy
+	 * allocator, the other pages will map to the first tail page, so they
+	 * can be freed.
+	 *
+	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
+	 * on some architectures (e.g. aarch64). See Documentation/arm64/
+	 * hugetlbpage.rst for more details.
+	 */
+	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
+		h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+
+	pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
+		h->name);
+}
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index a37771b0b82a..cb2bef8f9e73 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -13,17 +13,15 @@
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+void hugetlb_vmemmap_init(struct hstate *h);
 
 /*
  * How many vmemmap pages associated with a HugeTLB page that can be freed
  * to the buddy allocator.
- *
- * Todo: Returns zero for now, which means the feature is disabled. We will
- * enable it once all the infrastructure is there.
  */
 static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 {
-	return 0;
+	return h->nr_free_vmemmap_pages;
 }
 #else
 static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
@@ -35,6 +33,10 @@ static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
 
+static inline void hugetlb_vmemmap_init(struct hstate *h)
+{
+}
+
 static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 {
 	return 0;
-- 
cgit v1.2.3


From b2bd53f18bb7f7cfc91b3bb527d7809376700a8e Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 30 Jun 2021 18:47:43 -0700
Subject: mm/huge_memory.c: remove dedicated macro HPAGE_CACHE_INDEX_MASK

Patch series "Cleanup and fixup for huge_memory:, v3.

This series contains cleanups to remove dedicated macro and remove
unnecessary tlb_remove_page_size() for huge zero pmd.  Also this adds
missing read-only THP checking for transparent_hugepage_enabled() and
avoids discarding hugepage if other processes are mapping it.  More
details can be found in the respective changelogs.

Thi patch (of 5):

Rewrite the pgoff checking logic to remove macro HPAGE_CACHE_INDEX_MASK
which is only used here to simplify the code.

Link: https://lkml.kernel.org/r/20210511134857.1581273-1-linmiaohe@huawei.com
Link: https://lkml.kernel.org/r/20210511134857.1581273-2-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2a8ebe6c222e..8a5f49abcfa2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -152,15 +152,13 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 
 bool transparent_hugepage_enabled(struct vm_area_struct *vma);
 
-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
-
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 		unsigned long haddr)
 {
 	/* Don't have to check pgoff for anonymous vma */
 	if (!vma_is_anonymous(vma)) {
-		if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
-			(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
+		if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+				HPAGE_PMD_NR))
 			return false;
 	}
 
-- 
cgit v1.2.3


From e6be37b2e7bddfe0c76585ee7c7eee5acc8efeab Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 30 Jun 2021 18:47:50 -0700
Subject: mm/huge_memory.c: add missing read-only THP checking in
 transparent_hugepage_enabled()

Since commit 99cb0dbd47a1 ("mm,thp: add read-only THP support for
(non-shmem) FS"), read-only THP file mapping is supported.  But it forgot
to add checking for it in transparent_hugepage_enabled().  To fix it, we
add checking for read-only THP file mapping and also introduce helper
transhuge_vma_enabled() to check whether thp is enabled for specified vma
to reduce duplicated code.  We rename transparent_hugepage_enabled to
transparent_hugepage_active to make the code easier to follow as suggested
by David Hildenbrand.

[linmiaohe@huawei.com: define transhuge_vma_enabled next to transhuge_vma_suitable]
  Link: https://lkml.kernel.org/r/20210514093007.4117906-1-linmiaohe@huawei.com

Link: https://lkml.kernel.org/r/20210511134857.1581273-4-linmiaohe@huawei.com
Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c      |  2 +-
 include/linux/huge_mm.h | 57 ++++++++++++++++++++++++++++++-------------------
 mm/huge_memory.c        | 11 +++++++++-
 mm/khugepaged.c         |  4 +---
 mm/shmem.c              |  3 +--
 5 files changed, 48 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 66965ad88d8b..9feda7792882 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -832,7 +832,7 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible:    %d\n",
-		   transparent_hugepage_enabled(vma));
+		   transparent_hugepage_active(vma));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8a5f49abcfa2..b4e1ebaae825 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -115,9 +115,34 @@ extern struct kobj_attribute shmem_enabled_attr;
 
 extern unsigned long transparent_hugepage_flags;
 
+static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
+		unsigned long haddr)
+{
+	/* Don't have to check pgoff for anonymous vma */
+	if (!vma_is_anonymous(vma)) {
+		if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+				HPAGE_PMD_NR))
+			return false;
+	}
+
+	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+		return false;
+	return true;
+}
+
+static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
+					  unsigned long vm_flags)
+{
+	/* Explicitly disabled through madvise. */
+	if ((vm_flags & VM_NOHUGEPAGE) ||
+	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+		return false;
+	return true;
+}
+
 /*
  * to be used on vmas which are known to support THP.
- * Use transparent_hugepage_enabled otherwise
+ * Use transparent_hugepage_active otherwise
  */
 static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 {
@@ -128,15 +153,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
 		return false;
 
-	if (vma->vm_flags & VM_NOHUGEPAGE)
+	if (!transhuge_vma_enabled(vma, vma->vm_flags))
 		return false;
 
 	if (vma_is_temporary_stack(vma))
 		return false;
 
-	if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
-		return false;
-
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
 		return true;
 
@@ -150,22 +172,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 	return false;
 }
 
-bool transparent_hugepage_enabled(struct vm_area_struct *vma);
-
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long haddr)
-{
-	/* Don't have to check pgoff for anonymous vma */
-	if (!vma_is_anonymous(vma)) {
-		if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
-				HPAGE_PMD_NR))
-			return false;
-	}
-
-	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
-		return false;
-	return true;
-}
+bool transparent_hugepage_active(struct vm_area_struct *vma);
 
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
@@ -352,7 +359,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 	return false;
 }
 
-static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool transparent_hugepage_active(struct vm_area_struct *vma)
 {
 	return false;
 }
@@ -363,6 +370,12 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 	return false;
 }
 
+static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
+					  unsigned long vm_flags)
+{
+	return false;
+}
+
 static inline void prep_transhuge_page(struct page *page) {}
 
 static inline bool is_transparent_hugepage(struct page *page)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a76b3835251d..8529ba6af151 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -64,7 +64,14 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+	return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+	       !inode_is_open_for_write(vma->vm_file->f_inode) &&
+	       (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
 {
 	/* The addr is used to check if the vma size fits */
 	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
@@ -75,6 +82,8 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
 		return __transparent_hugepage_enabled(vma);
 	if (vma_is_shmem(vma))
 		return shmem_huge_enabled(vma);
+	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+		return file_thp_enabled(vma);
 
 	return false;
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c0185fdd815..d97b20fad6e8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -442,9 +442,7 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
 static bool hugepage_vma_check(struct vm_area_struct *vma,
 			       unsigned long vm_flags)
 {
-	/* Explicitly disabled through madvise. */
-	if ((vm_flags & VM_NOHUGEPAGE) ||
-	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+	if (!transhuge_vma_enabled(vma, vm_flags))
 		return false;
 
 	/* Enabled via shmem mount options or sysfs settings. */
diff --git a/mm/shmem.c b/mm/shmem.c
index e72931b9246c..1155279a6276 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -4040,8 +4040,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
 	loff_t i_size;
 	pgoff_t off;
 
-	if ((vma->vm_flags & VM_NOHUGEPAGE) ||
-	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+	if (!transhuge_vma_enabled(vma, vma->vm_flags))
 		return false;
 	if (shmem_huge == SHMEM_HUGE_FORCE)
 		return true;
-- 
cgit v1.2.3


From 79c1c594f49a88fba9744cb5c85978c6b1b365ec Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 30 Jun 2021 18:48:00 -0700
Subject: mm/hugetlb: change parameters of arch_make_huge_pte()

Patch series "Subject: [PATCH v2 0/5] Implement huge VMAP and VMALLOC on powerpc 8xx", v2.

This series implements huge VMAP and VMALLOC on powerpc 8xx.

Powerpc 8xx has 4 page sizes:
- 4k
- 16k
- 512k
- 8M

At the time being, vmalloc and vmap only support huge pages which are
leaf at PMD level.

Here the PMD level is 4M, it doesn't correspond to any supported
page size.

For now, implement use of 16k and 512k pages which is done
at PTE level.

Support of 8M pages will be implemented later, it requires use of
hugepd tables.

To allow this, the architecture provides two functions:
- arch_vmap_pte_range_map_size() which tells vmap_pte_range() what
page size to use. A stub returning PAGE_SIZE is provided when the
architecture doesn't provide this function.
- arch_vmap_pte_supported_shift() which tells __vmalloc_node_range()
what page shift to use for a given area size. A stub returning
PAGE_SHIFT is provided when the architecture doesn't provide this
function.

This patch (of 5):

At the time being, arch_make_huge_pte() has the following prototype:

  pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
			   struct page *page, int writable);

vma is used to get the pages shift or size.
vma is also used on Sparc to get vm_flags.
page is not used.
writable is not used.

In order to use this function without a vma, replace vma by shift and
flags.  Also remove the used parameters.

Link: https://lkml.kernel.org/r/cover.1620795204.git.christophe.leroy@csgroup.eu
Link: https://lkml.kernel.org/r/f4633ac6a7da2f22f31a04a89e0a7026bb78b15b.1620795204.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/hugetlb.h                 | 3 +--
 arch/arm64/mm/hugetlbpage.c                      | 5 ++---
 arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 5 ++---
 arch/sparc/include/asm/pgtable_64.h              | 3 +--
 arch/sparc/mm/hugetlbpage.c                      | 6 ++----
 include/linux/hugetlb.h                          | 4 ++--
 mm/hugetlb.c                                     | 6 ++++--
 mm/migrate.c                                     | 4 +++-
 8 files changed, 17 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 5abf91e3494c..1242f71937f8 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -23,8 +23,7 @@ static inline void arch_clear_hugepage_flags(struct page *page)
 }
 #define arch_clear_hugepage_flags arch_clear_hugepage_flags
 
-extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
-				struct page *page, int writable);
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
 #define arch_make_huge_pte arch_make_huge_pte
 #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 58987a98e179..23505fc35324 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -339,10 +339,9 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 	return NULL;
 }
 
-pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
-			 struct page *page, int writable)
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
 {
-	size_t pagesize = huge_page_size(hstate_vma(vma));
+	size_t pagesize = 1UL << shift;
 
 	if (pagesize == CONT_PTE_SIZE) {
 		entry = pte_mkcont(entry);
diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index 39be9aea86db..64b6c608eca4 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -66,10 +66,9 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 }
 
 #ifdef CONFIG_PPC_4K_PAGES
-static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
-				       struct page *page, int writable)
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
 {
-	size_t size = huge_page_size(hstate_vma(vma));
+	size_t size = 1UL << shift;
 
 	if (size == SZ_16K)
 		return __pte(pte_val(entry) & ~_PAGE_HUGE);
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 550d3904de65..2cd80a0a9795 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -377,8 +377,7 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot)
 #define pgprot_noncached pgprot_noncached
 
 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
-extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
-				struct page *page, int writable);
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
 #define arch_make_huge_pte arch_make_huge_pte
 static inline unsigned long __pte_default_huge_mask(void)
 {
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 04d8790f6c32..0f49fada2093 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -177,10 +177,8 @@ static pte_t hugepage_shift_to_tte(pte_t entry, unsigned int shift)
 		return sun4u_hugepage_shift_to_tte(entry, shift);
 }
 
-pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
-			 struct page *page, int writeable)
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
 {
-	unsigned int shift = huge_page_shift(hstate_vma(vma));
 	pte_t pte;
 
 	pte = hugepage_shift_to_tte(entry, shift);
@@ -188,7 +186,7 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
 #ifdef CONFIG_SPARC64
 	/* If this vma has ADI enabled on it, turn on TTE.mcd
 	 */
-	if (vma->vm_flags & VM_SPARC_ADI)
+	if (flags & VM_SPARC_ADI)
 		return pte_mkmcd(pte);
 	else
 		return pte_mknotmcd(pte);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8c1920844236..cfde3bec2261 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -741,8 +741,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { }
 #endif
 
 #ifndef arch_make_huge_pte
-static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
-				       struct page *page, int writable)
+static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
+				       vm_flags_t flags)
 {
 	return entry;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd9c90c082fc..88f2178ad7c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4060,6 +4060,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
 				int writable)
 {
 	pte_t entry;
+	unsigned int shift = huge_page_shift(hstate_vma(vma));
 
 	if (writable) {
 		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
@@ -4070,7 +4071,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
 	}
 	entry = pte_mkyoung(entry);
 	entry = pte_mkhuge(entry);
-	entry = arch_make_huge_pte(entry, vma, page, writable);
+	entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
 
 	return entry;
 }
@@ -5468,10 +5469,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 		if (!huge_pte_none(pte)) {
 			pte_t old_pte;
+			unsigned int shift = huge_page_shift(hstate_vma(vma));
 
 			old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
 			pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
-			pte = arch_make_huge_pte(pte, vma, NULL, 0);
+			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
 			pages++;
 		}
diff --git a/mm/migrate.c b/mm/migrate.c
index cc4d6af41683..75a15f0a2698 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -226,8 +226,10 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 
 #ifdef CONFIG_HUGETLB_PAGE
 		if (PageHuge(new)) {
+			unsigned int shift = huge_page_shift(hstate_vma(vma));
+
 			pte = pte_mkhuge(pte);
-			pte = arch_make_huge_pte(pte, vma, new, 0);
+			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
 			if (PageAnon(new))
 				hugepage_add_anon_rmap(new, vma, pvmw.address);
-- 
cgit v1.2.3


From c742199a014de23ee92055c2473d91fe5561ffdf Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 30 Jun 2021 18:48:03 -0700
Subject: mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge

For architectures with no PMD and/or no PUD, add stubs similar to what we
have for architectures without P4D.

[christophe.leroy@csgroup.eu: arm64: define only {pud/pmd}_{set/clear}_huge when useful]
  Link: https://lkml.kernel.org/r/73ec95f40cafbbb69bdfb43a7f53876fd845b0ce.1620990479.git.christophe.leroy@csgroup.eu
[christophe.leroy@csgroup.eu: x86: define only {pud/pmd}_{set/clear}_huge when useful]
  Link: https://lkml.kernel.org/r/7fbf1b6bc3e15c07c24fa45278d57064f14c896b.1620930415.git.christophe.leroy@csgroup.eu

Link: https://lkml.kernel.org/r/5ac5976419350e8e048d463a64cae449eb3ba4b0.1620795204.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/mm/mmu.c     | 20 ++++++++++++--------
 arch/x86/mm/pgtable.c   | 34 +++++++++++++++++++---------------
 include/linux/pgtable.h | 26 +++++++++++++++++++++++++-
 3 files changed, 56 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 89b66ef43a0f..add67deb4910 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1338,6 +1338,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
 	return dt_virt;
 }
 
+#if CONFIG_PGTABLE_LEVELS > 3
 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
 	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
@@ -1352,6 +1353,16 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 	return 1;
 }
 
+int pud_clear_huge(pud_t *pudp)
+{
+	if (!pud_sect(READ_ONCE(*pudp)))
+		return 0;
+	pud_clear(pudp);
+	return 1;
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 {
 	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
@@ -1366,14 +1377,6 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 	return 1;
 }
 
-int pud_clear_huge(pud_t *pudp)
-{
-	if (!pud_sect(READ_ONCE(*pudp)))
-		return 0;
-	pud_clear(pudp);
-	return 1;
-}
-
 int pmd_clear_huge(pmd_t *pmdp)
 {
 	if (!pmd_sect(READ_ONCE(*pmdp)))
@@ -1381,6 +1384,7 @@ int pmd_clear_huge(pmd_t *pmdp)
 	pmd_clear(pmdp);
 	return 1;
 }
+#endif
 
 int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
 {
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d27cf69e811d..1303ff6ef7be 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -682,6 +682,7 @@ int p4d_clear_huge(p4d_t *p4d)
 }
 #endif
 
+#if CONFIG_PGTABLE_LEVELS > 3
 /**
  * pud_set_huge - setup kernel PUD mapping
  *
@@ -720,6 +721,23 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
+int pud_clear_huge(pud_t *pud)
+{
+	if (pud_large(*pud)) {
+		pud_clear(pud);
+		return 1;
+	}
+
+	return 0;
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
 /**
  * pmd_set_huge - setup kernel PMD mapping
  *
@@ -750,21 +768,6 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
-/**
- * pud_clear_huge - clear kernel PUD mapping when it is set
- *
- * Returns 1 on success and 0 on failure (no PUD map is found).
- */
-int pud_clear_huge(pud_t *pud)
-{
-	if (pud_large(*pud)) {
-		pud_clear(pud);
-		return 1;
-	}
-
-	return 0;
-}
-
 /**
  * pmd_clear_huge - clear kernel PMD mapping when it is set
  *
@@ -779,6 +782,7 @@ int pmd_clear_huge(pmd_t *pmd)
 
 	return 0;
 }
+#endif
 
 #ifdef CONFIG_X86_64
 /**
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c32600c9e1ad..2b0d02291178 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1379,10 +1379,34 @@ static inline int p4d_clear_huge(p4d_t *p4d)
 }
 #endif /* !__PAGETABLE_P4D_FOLDED */
 
+#ifndef __PAGETABLE_PUD_FOLDED
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
-int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
+#else
+static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+	return 0;
+}
+static inline int pud_clear_huge(pud_t *pud)
+{
+	return 0;
+}
+#endif /* !__PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_PMD_FOLDED
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pmd_clear_huge(pmd_t *pmd);
+#else
+static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+	return 0;
+}
+static inline int pmd_clear_huge(pmd_t *pmd)
+{
+	return 0;
+}
+#endif /* !__PAGETABLE_PMD_FOLDED */
+
 int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
 int pud_free_pmd_page(pud_t *pud, unsigned long addr);
 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
-- 
cgit v1.2.3


From f7ee1f13d606c1b1be3bdaf1609f3991bc06da87 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 30 Jun 2021 18:48:06 -0700
Subject: mm/vmalloc: enable mapping of huge pages at pte level in vmap

On some architectures like powerpc, there are huge pages that are mapped
at pte level.

Enable it in vmap.

For that, architectures can provide arch_vmap_pte_range_map_size() that
returns the size of pages to map at pte level.

Link: https://lkml.kernel.org/r/fb3ccc73377832ac6708181ec419128a2f98ce36.1620795204.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h |  8 ++++++++
 mm/vmalloc.c            | 21 ++++++++++++++++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index bfaaf0b6fa76..54ec0736a656 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -104,6 +104,14 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
 }
 #endif
 
+#ifndef arch_vmap_pte_range_map_size
+static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end,
+							 u64 pfn, unsigned int max_page_shift)
+{
+	return PAGE_SIZE;
+}
+#endif
+
 /*
  *	Highlevel APIs for driver use
  */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b2ec7f751bd0..fe0af8db71e0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -36,6 +36,7 @@
 #include <linux/overflow.h>
 #include <linux/pgtable.h>
 #include <linux/uaccess.h>
+#include <linux/hugetlb.h>
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
 
@@ -83,10 +84,11 @@ static void free_work(struct work_struct *w)
 /*** Page table manipulation functions ***/
 static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
 {
 	pte_t *pte;
 	u64 pfn;
+	unsigned long size = PAGE_SIZE;
 
 	pfn = phys_addr >> PAGE_SHIFT;
 	pte = pte_alloc_kernel_track(pmd, addr, mask);
@@ -94,9 +96,22 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		return -ENOMEM;
 	do {
 		BUG_ON(!pte_none(*pte));
+
+#ifdef CONFIG_HUGETLB_PAGE
+		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
+		if (size != PAGE_SIZE) {
+			pte_t entry = pfn_pte(pfn, prot);
+
+			entry = pte_mkhuge(entry);
+			entry = arch_make_huge_pte(entry, ilog2(size), 0);
+			set_huge_pte_at(&init_mm, addr, pte, entry);
+			pfn += PFN_DOWN(size);
+			continue;
+		}
+#endif
 		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	} while (pte += PFN_DOWN(size), addr += size, addr != end);
 	*mask |= PGTBL_PTE_MODIFIED;
 	return 0;
 }
@@ -145,7 +160,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 			continue;
 		}
 
-		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
 			return -ENOMEM;
 	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
-- 
cgit v1.2.3


From 3382bbee0464bf31e63853c6ec2a83ead77a01cc Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 30 Jun 2021 18:48:09 -0700
Subject: mm/vmalloc: enable mapping of huge pages at pte level in vmalloc

On some architectures like powerpc, there are huge pages that are mapped
at pte level.

Enable it in vmalloc.

For that, architectures can provide arch_vmap_pte_supported_shift() that
returns the shift for pages to map at pte level.

Link: https://lkml.kernel.org/r/2c717e3b1fba1894d890feb7669f83025bfa314d.1620795204.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h |  7 +++++++
 mm/vmalloc.c            | 13 +++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 54ec0736a656..1dabd6f22486 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -112,6 +112,13 @@ static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, uns
 }
 #endif
 
+#ifndef arch_vmap_pte_supported_shift
+static inline int arch_vmap_pte_supported_shift(unsigned long size)
+{
+	return PAGE_SHIFT;
+}
+#endif
+
 /*
  *	Highlevel APIs for driver use
  */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fe0af8db71e0..71dd29fe618b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2927,8 +2927,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 		return NULL;
 	}
 
-	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
-			arch_vmap_pmd_supported(prot)) {
+	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
 		unsigned long size_per_node;
 
 		/*
@@ -2941,11 +2940,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 		size_per_node = size;
 		if (node == NUMA_NO_NODE)
 			size_per_node /= num_online_nodes();
-		if (size_per_node >= PMD_SIZE) {
+		if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
 			shift = PMD_SHIFT;
-			align = max(real_align, 1UL << shift);
-			size = ALIGN(real_size, 1UL << shift);
-		}
+		else
+			shift = arch_vmap_pte_supported_shift(size_per_node);
+
+		align = max(real_align, 1UL << shift);
+		size = ALIGN(real_size, 1UL << shift);
 	}
 
 again:
-- 
cgit v1.2.3


From 8cc5fcbb5be814c115085549b700e473685b11e9 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Wed, 30 Jun 2021 18:48:19 -0700
Subject: mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY

On UFFDIO_COPY, if we fail to copy the page contents while holding the
hugetlb_fault_mutex, we will drop the mutex and return to the caller after
allocating a page that consumed a reservation.  In this case there may be
a fault that double consumes the reservation.  To handle this, we free the
allocated page, fix the reservations, and allocate a temporary hugetlb
page and return that to the caller.  When the caller does the copy outside
of the lock, we again check the cache, and allocate a page consuming the
reservation, and copy over the contents.

Test:
Hacked the code locally such that resv_huge_pages underflows produce
a warning and the copy_huge_page_from_user() always fails, then:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10
        2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10
	2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success

Both tests succeed and produce no warnings. After the
test runs number of free/resv hugepages is correct.

[yuehaibing@huawei.com: remove set but not used variable 'vm_alloc_shared']
  Link: https://lkml.kernel.org/r/20210601141610.28332-1-yuehaibing@huawei.com
[almasrymina@google.com: fix allocation error check and copy func name]
  Link: https://lkml.kernel.org/r/20210605010626.1459873-1-almasrymina@google.com

Link: https://lkml.kernel.org/r/20210528005029.88088-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h |  4 ++++
 mm/hugetlb.c            | 48 ++++++++++++++++++++++++++++++++++++++---------
 mm/migrate.c            |  2 +-
 mm/userfaultfd.c        | 50 +------------------------------------------------
 4 files changed, 45 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4bb4e519e3f5..7b7b73977278 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -51,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
+extern void copy_huge_page(struct page *dst, struct page *src);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -77,6 +78,9 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 	return -ENOSYS;
 }
 
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88f2178ad7c9..b14f4d1749b2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
 #include <linux/numa.h>
 #include <linux/llist.h>
 #include <linux/cma.h>
+#include <linux/migrate.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -5076,20 +5077,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct page **pagep)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
-	struct address_space *mapping;
-	pgoff_t idx;
+	struct hstate *h = hstate_vma(dst_vma);
+	struct address_space *mapping = dst_vma->vm_file->f_mapping;
+	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
 	unsigned long size;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
-	struct hstate *h = hstate_vma(dst_vma);
 	pte_t _dst_pte;
 	spinlock_t *ptl;
-	int ret;
+	int ret = -ENOMEM;
 	struct page *page;
 	int writable;
 
-	mapping = dst_vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
 	if (is_continue) {
 		ret = -EFAULT;
 		page = find_lock_page(mapping, idx);
@@ -5118,12 +5116,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		/* fallback to copy_from_user outside mmap_lock */
 		if (unlikely(ret)) {
 			ret = -ENOENT;
+			/* Free the allocated page which may have
+			 * consumed a reservation.
+			 */
+			restore_reserve_on_error(h, dst_vma, dst_addr, page);
+			put_page(page);
+
+			/* Allocate a temporary page to hold the copied
+			 * contents.
+			 */
+			page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+			if (!page) {
+				ret = -ENOMEM;
+				goto out;
+			}
 			*pagep = page;
-			/* don't free the page */
+			/* Set the outparam pagep and return to the caller to
+			 * copy the contents outside the lock. Don't free the
+			 * page.
+			 */
 			goto out;
 		}
 	} else {
-		page = *pagep;
+		if (vm_shared &&
+		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+			put_page(*pagep);
+			ret = -EEXIST;
+			*pagep = NULL;
+			goto out;
+		}
+
+		page = alloc_huge_page(dst_vma, dst_addr, 0);
+		if (IS_ERR(page)) {
+			ret = -ENOMEM;
+			*pagep = NULL;
+			goto out;
+		}
+		copy_huge_page(page, *pagep);
+		put_page(*pagep);
 		*pagep = NULL;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 75a15f0a2698..8fc766e52e52 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -553,7 +553,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src,
 	}
 }
 
-static void copy_huge_page(struct page *dst, struct page *src)
+void copy_huge_page(struct page *dst, struct page *src)
 {
 	int i;
 	int nr_pages;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 63a73e164d55..da5535d2d990 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -209,7 +209,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long len,
 					      enum mcopy_atomic_mode mode)
 {
-	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
 	pte_t *dst_pte;
@@ -308,7 +307,6 @@ retry:
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		i_mmap_unlock_read(mapping);
-		vm_alloc_shared = vm_shared;
 
 		cond_resched();
 
@@ -346,54 +344,8 @@ retry:
 out_unlock:
 	mmap_read_unlock(dst_mm);
 out:
-	if (page) {
-		/*
-		 * We encountered an error and are about to free a newly
-		 * allocated huge page.
-		 *
-		 * Reservation handling is very subtle, and is different for
-		 * private and shared mappings.  See the routine
-		 * restore_reserve_on_error for details.  Unfortunately, we
-		 * can not call restore_reserve_on_error now as it would
-		 * require holding mmap_lock.
-		 *
-		 * If a reservation for the page existed in the reservation
-		 * map of a private mapping, the map was modified to indicate
-		 * the reservation was consumed when the page was allocated.
-		 * We clear the HPageRestoreReserve flag now so that the global
-		 * reserve count will not be incremented in free_huge_page.
-		 * The reservation map will still indicate the reservation
-		 * was consumed and possibly prevent later page allocation.
-		 * This is better than leaking a global reservation.  If no
-		 * reservation existed, it is still safe to clear
-		 * HPageRestoreReserve as no adjustments to reservation counts
-		 * were made during allocation.
-		 *
-		 * The reservation map for shared mappings indicates which
-		 * pages have reservations.  When a huge page is allocated
-		 * for an address with a reservation, no change is made to
-		 * the reserve map.  In this case HPageRestoreReserve will be
-		 * set to indicate that the global reservation count should be
-		 * incremented when the page is freed.  This is the desired
-		 * behavior.  However, when a huge page is allocated for an
-		 * address without a reservation a reservation entry is added
-		 * to the reservation map, and HPageRestoreReserve will not be
-		 * set. When the page is freed, the global reserve count will
-		 * NOT be incremented and it will appear as though we have
-		 * leaked reserved page.  In this case, set HPageRestoreReserve
-		 * so that the global reserve count will be incremented to
-		 * match the reservation map entry which was created.
-		 *
-		 * Note that vm_alloc_shared is based on the flags of the vma
-		 * for which the page was originally allocated.  dst_vma could
-		 * be different or NULL on error.
-		 */
-		if (vm_alloc_shared)
-			SetHPageRestoreReserve(page);
-		else
-			ClearHPageRestoreReserve(page);
+	if (page)
 		put_page(page);
-	}
 	BUG_ON(copied < 0);
 	BUG_ON(err > 0);
 	BUG_ON(!copied && !err);
-- 
cgit v1.2.3


From 3bc2b6a725963bb1b441356873da890e397c1a3f Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:48:22 -0700
Subject: mm: sparsemem: split the huge PMD mapping of vmemmap pages

Patch series "Split huge PMD mapping of vmemmap pages", v4.

In order to reduce the difficulty of code review in series[1].  We disable
huge PMD mapping of vmemmap pages when that feature is enabled.  In this
series, we do not disable huge PMD mapping of vmemmap pages anymore.  We
will split huge PMD mapping when needed.  When HugeTLB pages are freed
from the pool we do not attempt coalasce and move back to a PMD mapping
because it is much more complex.

[1] https://lore.kernel.org/linux-doc/20210510030027.56044-1-songmuchun@bytedance.com/

This patch (of 3):

In [1], PMD mappings of vmemmap pages were disabled if the the feature
hugetlb_free_vmemmap was enabled.  This was done to simplify the initial
implementation of vmmemap freeing for hugetlb pages.  Now, remove this
simplification by allowing PMD mapping and switching to PTE mappings as
needed for allocated hugetlb pages.

When a hugetlb page is allocated, the vmemmap page tables are walked to
free vmemmap pages.  During this walk, split huge PMD mappings to PTE
mappings as required.  In the unlikely case PTE pages can not be
allocated, return error(ENOMEM) and do not optimize vmemmap of the hugetlb
page.

When HugeTLB pages are freed from the pool, we do not attempt to
coalesce and move back to a PMD mapping because it is much more complex.

[1] https://lkml.kernel.org/r/20210510030027.56044-8-songmuchun@bytedance.com

Link: https://lkml.kernel.org/r/20210616094915.34432-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210616094915.34432-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h   |   4 +-
 mm/hugetlb_vmemmap.c |   5 +-
 mm/sparse-vmemmap.c  | 163 +++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 129 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 706bee98d965..aa875dacd9c3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3076,8 +3076,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 }
 #endif
 
-void vmemmap_remap_free(unsigned long start, unsigned long end,
-			unsigned long reuse);
+int vmemmap_remap_free(unsigned long start, unsigned long end,
+		       unsigned long reuse);
 int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 			unsigned long reuse, gfp_t gfp_mask);
 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index f9f9bb212319..06802056f296 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -258,9 +258,8 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 	 * to the page which @vmemmap_reuse is mapped to, then free the pages
 	 * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
 	 */
-	vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse);
-
-	SetHPageVmemmapOptimized(head);
+	if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+		SetHPageVmemmapOptimized(head);
 }
 
 void __init hugetlb_vmemmap_init(struct hstate *h)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a3aa275e2668..bdce883f9286 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -38,6 +38,7 @@
  * struct vmemmap_remap_walk - walk vmemmap page table
  *
  * @remap_pte:		called for each lowest-level entry (PTE).
+ * @nr_walked:		the number of walked pte.
  * @reuse_page:		the page which is reused for the tail vmemmap pages.
  * @reuse_addr:		the virtual address of the @reuse_page page.
  * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
@@ -46,11 +47,44 @@
 struct vmemmap_remap_walk {
 	void (*remap_pte)(pte_t *pte, unsigned long addr,
 			  struct vmemmap_remap_walk *walk);
+	unsigned long nr_walked;
 	struct page *reuse_page;
 	unsigned long reuse_addr;
 	struct list_head *vmemmap_pages;
 };
 
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
+				  struct vmemmap_remap_walk *walk)
+{
+	pmd_t __pmd;
+	int i;
+	unsigned long addr = start;
+	struct page *page = pmd_page(*pmd);
+	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+
+	if (!pgtable)
+		return -ENOMEM;
+
+	pmd_populate_kernel(&init_mm, &__pmd, pgtable);
+
+	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
+		pte_t entry, *pte;
+		pgprot_t pgprot = PAGE_KERNEL;
+
+		entry = mk_pte(page + i, pgprot);
+		pte = pte_offset_kernel(&__pmd, addr);
+		set_pte_at(&init_mm, addr, pte, entry);
+	}
+
+	/* Make pte visible before pmd. See comment in __pte_alloc(). */
+	smp_wmb();
+	pmd_populate_kernel(&init_mm, pmd, pgtable);
+
+	flush_tlb_kernel_range(start, start + PMD_SIZE);
+
+	return 0;
+}
+
 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 			      unsigned long end,
 			      struct vmemmap_remap_walk *walk)
@@ -69,58 +103,80 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 		 */
 		addr += PAGE_SIZE;
 		pte++;
+		walk->nr_walked++;
 	}
 
-	for (; addr != end; addr += PAGE_SIZE, pte++)
+	for (; addr != end; addr += PAGE_SIZE, pte++) {
 		walk->remap_pte(pte, addr, walk);
+		walk->nr_walked++;
+	}
 }
 
-static void vmemmap_pmd_range(pud_t *pud, unsigned long addr,
-			      unsigned long end,
-			      struct vmemmap_remap_walk *walk)
+static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
 {
 	pmd_t *pmd;
 	unsigned long next;
 
 	pmd = pmd_offset(pud, addr);
 	do {
-		BUG_ON(pmd_leaf(*pmd));
+		if (pmd_leaf(*pmd)) {
+			int ret;
 
+			ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
+			if (ret)
+				return ret;
+		}
 		next = pmd_addr_end(addr, end);
 		vmemmap_pte_range(pmd, addr, next, walk);
 	} while (pmd++, addr = next, addr != end);
+
+	return 0;
 }
 
-static void vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
-			      unsigned long end,
-			      struct vmemmap_remap_walk *walk)
+static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
 {
 	pud_t *pud;
 	unsigned long next;
 
 	pud = pud_offset(p4d, addr);
 	do {
+		int ret;
+
 		next = pud_addr_end(addr, end);
-		vmemmap_pmd_range(pud, addr, next, walk);
+		ret = vmemmap_pmd_range(pud, addr, next, walk);
+		if (ret)
+			return ret;
 	} while (pud++, addr = next, addr != end);
+
+	return 0;
 }
 
-static void vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
-			      unsigned long end,
-			      struct vmemmap_remap_walk *walk)
+static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
 {
 	p4d_t *p4d;
 	unsigned long next;
 
 	p4d = p4d_offset(pgd, addr);
 	do {
+		int ret;
+
 		next = p4d_addr_end(addr, end);
-		vmemmap_pud_range(p4d, addr, next, walk);
+		ret = vmemmap_pud_range(p4d, addr, next, walk);
+		if (ret)
+			return ret;
 	} while (p4d++, addr = next, addr != end);
+
+	return 0;
 }
 
-static void vmemmap_remap_range(unsigned long start, unsigned long end,
-				struct vmemmap_remap_walk *walk)
+static int vmemmap_remap_range(unsigned long start, unsigned long end,
+			       struct vmemmap_remap_walk *walk)
 {
 	unsigned long addr = start;
 	unsigned long next;
@@ -131,8 +187,12 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end,
 
 	pgd = pgd_offset_k(addr);
 	do {
+		int ret;
+
 		next = pgd_addr_end(addr, end);
-		vmemmap_p4d_range(pgd, addr, next, walk);
+		ret = vmemmap_p4d_range(pgd, addr, next, walk);
+		if (ret)
+			return ret;
 	} while (pgd++, addr = next, addr != end);
 
 	/*
@@ -141,6 +201,8 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end,
 	 * belongs to the range.
 	 */
 	flush_tlb_kernel_range(start + PAGE_SIZE, end);
+
+	return 0;
 }
 
 /*
@@ -179,10 +241,27 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
 	pte_t entry = mk_pte(walk->reuse_page, pgprot);
 	struct page *page = pte_page(*pte);
 
-	list_add(&page->lru, walk->vmemmap_pages);
+	list_add_tail(&page->lru, walk->vmemmap_pages);
 	set_pte_at(&init_mm, addr, pte, entry);
 }
 
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+				struct vmemmap_remap_walk *walk)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct page *page;
+	void *to;
+
+	BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+	list_del(&page->lru);
+	to = page_to_virt(page);
+	copy_page(to, (void *)walk->reuse_addr);
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
 /**
  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
  *			to the page which @reuse is mapped to, then free vmemmap
@@ -193,12 +272,12 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
  *		remap.
  * @reuse:	reuse address.
  *
- * Note: This function depends on vmemmap being base page mapped. Please make
- * sure that we disable PMD mapping of vmemmap pages when calling this function.
+ * Return: %0 on success, negative error code otherwise.
  */
-void vmemmap_remap_free(unsigned long start, unsigned long end,
-			unsigned long reuse)
+int vmemmap_remap_free(unsigned long start, unsigned long end,
+		       unsigned long reuse)
 {
+	int ret;
 	LIST_HEAD(vmemmap_pages);
 	struct vmemmap_remap_walk walk = {
 		.remap_pte	= vmemmap_remap_pte,
@@ -221,25 +300,31 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
 	 */
 	BUG_ON(start - reuse != PAGE_SIZE);
 
-	vmemmap_remap_range(reuse, end, &walk);
-	free_vmemmap_page_list(&vmemmap_pages);
-}
+	mmap_write_lock(&init_mm);
+	ret = vmemmap_remap_range(reuse, end, &walk);
+	mmap_write_downgrade(&init_mm);
 
-static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
-				struct vmemmap_remap_walk *walk)
-{
-	pgprot_t pgprot = PAGE_KERNEL;
-	struct page *page;
-	void *to;
+	if (ret && walk.nr_walked) {
+		end = reuse + walk.nr_walked * PAGE_SIZE;
+		/*
+		 * vmemmap_pages contains pages from the previous
+		 * vmemmap_remap_range call which failed.  These
+		 * are pages which were removed from the vmemmap.
+		 * They will be restored in the following call.
+		 */
+		walk = (struct vmemmap_remap_walk) {
+			.remap_pte	= vmemmap_restore_pte,
+			.reuse_addr	= reuse,
+			.vmemmap_pages	= &vmemmap_pages,
+		};
 
-	BUG_ON(pte_page(*pte) != walk->reuse_page);
+		vmemmap_remap_range(reuse, end, &walk);
+	}
+	mmap_read_unlock(&init_mm);
 
-	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
-	list_del(&page->lru);
-	to = page_to_virt(page);
-	copy_page(to, (void *)walk->reuse_addr);
+	free_vmemmap_page_list(&vmemmap_pages);
 
-	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+	return ret;
 }
 
 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
@@ -273,6 +358,8 @@ out:
  *		remap.
  * @reuse:	reuse address.
  * @gfp_mask:	GFP flag for allocating vmemmap pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
  */
 int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 			unsigned long reuse, gfp_t gfp_mask)
@@ -287,12 +374,12 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 	/* See the comment in the vmemmap_remap_free(). */
 	BUG_ON(start - reuse != PAGE_SIZE);
 
-	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
-
 	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
 		return -ENOMEM;
 
+	mmap_read_lock(&init_mm);
 	vmemmap_remap_range(reuse, end, &walk);
+	mmap_read_unlock(&init_mm);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 2d7a21715f25122779e2bed17db8c57aa01e922f Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:48:25 -0700
Subject: mm: sparsemem: use huge PMD mapping for vmemmap pages

The preparation of splitting huge PMD mapping of vmemmap pages is ready,
so switch the mapping from PTE to PMD.

Link: https://lkml.kernel.org/r/20210616094915.34432-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  7 -------
 arch/x86/mm/init_64.c                           |  8 ++------
 include/linux/hugetlb.h                         | 25 ++++++-------------------
 mm/memory_hotplug.c                             |  2 +-
 4 files changed, 9 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 597572ceb8ba..7a7da02b3bc6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1572,13 +1572,6 @@
 			enabled.
 			Allows heavy hugetlb users to free up some more
 			memory (6 * PAGE_SIZE for each 2MB hugetlb page).
-			This feauture is not free though. Large page
-			tables are not used to back vmemmap pages which
-			can lead to a performance degradation for some
-			workloads. Also there will be memory allocation
-			required when hugetlb pages are freed from the
-			pool which can lead to corner cases under heavy
-			memory pressure.
 			Format: { on | off (default) }
 
 			on:  enable the feature
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9d9d18d0c2a1..65ea58527176 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,7 +34,6 @@
 #include <linux/gfp.h>
 #include <linux/kcore.h>
 #include <linux/bootmem_info.h>
-#include <linux/hugetlb.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -1610,8 +1609,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
 	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
 
-	if ((is_hugetlb_free_vmemmap_enabled()  && !altmap) ||
-	    end - start < PAGES_PER_SECTION * sizeof(struct page))
+	if (end - start < PAGES_PER_SECTION * sizeof(struct page))
 		err = vmemmap_populate_basepages(start, end, node, NULL);
 	else if (boot_cpu_has(X86_FEATURE_PSE))
 		err = vmemmap_populate_hugepages(start, end, node, altmap);
@@ -1639,8 +1637,6 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 	pmd_t *pmd;
 	unsigned int nr_pmd_pages;
 	struct page *page;
-	bool base_mapping = !boot_cpu_has(X86_FEATURE_PSE) ||
-			    is_hugetlb_free_vmemmap_enabled();
 
 	for (; addr < end; addr = next) {
 		pte_t *pte = NULL;
@@ -1666,7 +1662,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 		}
 		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
 
-		if (base_mapping) {
+		if (!boot_cpu_has(X86_FEATURE_PSE)) {
 			next = (addr + PAGE_SIZE) & PAGE_MASK;
 			pmd = pmd_offset(pud, addr);
 			if (pmd_none(*pmd))
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index cfde3bec2261..f11ba701e199 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -895,20 +895,6 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 }
 #endif
 
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-extern bool hugetlb_free_vmemmap_enabled;
-
-static inline bool is_hugetlb_free_vmemmap_enabled(void)
-{
-	return hugetlb_free_vmemmap_enabled;
-}
-#else
-static inline bool is_hugetlb_free_vmemmap_enabled(void)
-{
-	return false;
-}
-#endif
-
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
@@ -1063,13 +1049,14 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
 					pte_t *ptep, pte_t pte, unsigned long sz)
 {
 }
-
-static inline bool is_hugetlb_free_vmemmap_enabled(void)
-{
-	return false;
-}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+extern bool hugetlb_free_vmemmap_enabled;
+#else
+#define hugetlb_free_vmemmap_enabled	false
+#endif
+
 static inline spinlock_t *huge_pte_lock(struct hstate *h,
 					struct mm_struct *mm, pte_t *pte)
 {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 29d1d742b565..492560b61999 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1056,7 +1056,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 	 *       populate a single PMD.
 	 */
 	return memmap_on_memory &&
-	       !is_hugetlb_free_vmemmap_enabled() &&
+	       !hugetlb_free_vmemmap_enabled &&
 	       IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
 	       size == memory_block_size_bytes() &&
 	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-- 
cgit v1.2.3


From 8f34f1eac3820fc2722e5159acceb22545b30b0d Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 30 Jun 2021 18:49:02 -0700
Subject: mm/userfaultfd: fix uffd-wp special cases for fork()

We tried to do something similar in b569a1760782 ("userfaultfd: wp: drop
_PAGE_UFFD_WP properly when fork") previously, but it's not doing it all
right..  A few fixes around the code path:

1. We were referencing VM_UFFD_WP vm_flags on the _old_ vma rather
   than the new vma.  That's overlooked in b569a1760782, so it won't work
   as expected.  Thanks to the recent rework on fork code
   (7a4830c380f3a8b3), we can easily get the new vma now, so switch the
   checks to that.

2. Dropping the uffd-wp bit in copy_huge_pmd() could be wrong if the
   huge pmd is a migration huge pmd.  When it happens, instead of using
   pmd_uffd_wp(), we should use pmd_swp_uffd_wp().  The fix is simply to
   handle them separately.

3. Forget to carry over uffd-wp bit for a write migration huge pmd
   entry.  This also happens in copy_huge_pmd(), where we converted a
   write huge migration entry into a read one.

4. In copy_nonpresent_pte(), drop uffd-wp if necessary for swap ptes.

5. In copy_present_page() when COW is enforced when fork(), we also
   need to pass over the uffd-wp bit if VM_UFFD_WP is armed on the new
   vma, and when the pte to be copied has uffd-wp bit set.

Remove the comment in copy_present_pte() about this.  It won't help a huge
lot to only comment there, but comment everywhere would be an overkill.
Let's assume the commit messages would help.

[peterx@redhat.com: fix a few thp pmd missing uffd-wp bit]
  Link: https://lkml.kernel.org/r/20210428225030.9708-4-peterx@redhat.com

Link: https://lkml.kernel.org/r/20210428225030.9708-3-peterx@redhat.com
Fixes: b569a1760782f ("userfaultfd: wp: drop _PAGE_UFFD_WP properly when fork")
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Joe Perches <joe@perches.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Wang Qing <wangqing@vivo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h |  2 +-
 include/linux/swapops.h |  2 ++
 mm/huge_memory.c        | 27 ++++++++++++++-------------
 mm/memory.c             | 25 +++++++++++++------------
 4 files changed, 30 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b4e1ebaae825..939f21b69ead 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -10,7 +10,7 @@
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
-		  struct vm_area_struct *vma);
+		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 5907205c712c..708fbeb21dd3 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -265,6 +265,8 @@ static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
 
 	if (pmd_swp_soft_dirty(pmd))
 		pmd = pmd_swp_clear_soft_dirty(pmd);
+	if (pmd_swp_uffd_wp(pmd))
+		pmd = pmd_swp_clear_uffd_wp(pmd);
 	arch_entry = __pmd_to_swp_entry(pmd);
 	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b72d919ab13a..40a90ff18180 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1026,7 +1026,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
-		  struct vm_area_struct *vma)
+		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
 	spinlock_t *dst_ptl, *src_ptl;
 	struct page *src_page;
@@ -1035,7 +1035,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	int ret = -ENOMEM;
 
 	/* Skip if can be re-fill on fault */
-	if (!vma_is_anonymous(vma))
+	if (!vma_is_anonymous(dst_vma))
 		return 0;
 
 	pgtable = pte_alloc_one(dst_mm);
@@ -1049,14 +1049,6 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	ret = -EAGAIN;
 	pmd = *src_pmd;
 
-	/*
-	 * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
-	 * does not have the VM_UFFD_WP, which means that the uffd
-	 * fork event is not enabled.
-	 */
-	if (!(vma->vm_flags & VM_UFFD_WP))
-		pmd = pmd_clear_uffd_wp(pmd);
-
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 	if (unlikely(is_swap_pmd(pmd))) {
 		swp_entry_t entry = pmd_to_swp_entry(pmd);
@@ -1067,11 +1059,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			pmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*src_pmd))
 				pmd = pmd_swp_mksoft_dirty(pmd);
+			if (pmd_swp_uffd_wp(*src_pmd))
+				pmd = pmd_swp_mkuffd_wp(pmd);
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+		if (!userfaultfd_wp(dst_vma))
+			pmd = pmd_swp_clear_uffd_wp(pmd);
 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 		ret = 0;
 		goto out_unlock;
@@ -1107,11 +1103,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * best effort that the pinned pages won't be replaced by another
 	 * random page during the coming copy-on-write.
 	 */
-	if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
+	if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
 		pte_free(dst_mm, pgtable);
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
-		__split_huge_pmd(vma, src_pmd, addr, false, NULL);
+		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
 		return -EAGAIN;
 	}
 
@@ -1121,8 +1117,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 out_zero_page:
 	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
+	if (!userfaultfd_wp(dst_vma))
+		pmd = pmd_clear_uffd_wp(pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 
@@ -1835,6 +1832,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			newpmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*pmd))
 				newpmd = pmd_swp_mksoft_dirty(newpmd);
+			if (pmd_swp_uffd_wp(*pmd))
+				newpmd = pmd_swp_mkuffd_wp(newpmd);
 			set_pmd_at(mm, addr, pmd, newpmd);
 		}
 		goto unlock;
@@ -3245,6 +3244,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 		pmde = pmd_mksoft_dirty(pmde);
 	if (is_write_migration_entry(entry))
 		pmde = maybe_pmd_mkwrite(pmde, vma);
+	if (pmd_swp_uffd_wp(*pvmw->pmd))
+		pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
 
 	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
 	if (PageAnon(new))
diff --git a/mm/memory.c b/mm/memory.c
index 48c4576df898..07a71a016c18 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -707,10 +707,10 @@ out:
 
 static unsigned long
 copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr, int *rss)
+		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
 {
-	unsigned long vm_flags = vma->vm_flags;
+	unsigned long vm_flags = dst_vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 	swp_entry_t entry = pte_to_swp_entry(pte);
@@ -779,6 +779,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
 	}
+	if (!userfaultfd_wp(dst_vma))
+		pte = pte_swp_clear_uffd_wp(pte);
 	set_pte_at(dst_mm, addr, dst_pte, pte);
 	return 0;
 }
@@ -844,6 +846,9 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 	/* All done, just insert the new page copy in the child */
 	pte = mk_pte(new_page, dst_vma->vm_page_prot);
 	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+	if (userfaultfd_pte_wp(dst_vma, *src_pte))
+		/* Uffd-wp needs to be delivered to dest pte as well */
+		pte = pte_wrprotect(pte_mkuffd_wp(pte));
 	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
 	return 0;
 }
@@ -893,12 +898,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
 
-	/*
-	 * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
-	 * does not have the VM_UFFD_WP, which means that the uffd
-	 * fork event is not enabled.
-	 */
-	if (!(vm_flags & VM_UFFD_WP))
+	if (!userfaultfd_wp(dst_vma))
 		pte = pte_clear_uffd_wp(pte);
 
 	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
@@ -973,7 +973,8 @@ again:
 		if (unlikely(!pte_present(*src_pte))) {
 			entry.val = copy_nonpresent_pte(dst_mm, src_mm,
 							dst_pte, src_pte,
-							src_vma, addr, rss);
+							dst_vma, src_vma,
+							addr, rss);
 			if (entry.val)
 				break;
 			progress += 8;
@@ -1050,8 +1051,8 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 			|| pmd_devmap(*src_pmd)) {
 			int err;
 			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
-			err = copy_huge_pmd(dst_mm, src_mm,
-					    dst_pmd, src_pmd, addr, src_vma);
+			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+					    addr, dst_vma, src_vma);
 			if (err == -ENOMEM)
 				return -ENOMEM;
 			if (!err)
-- 
cgit v1.2.3


From 3460f6e5c1ed94c2ab7c1ccc032a5bebd88deaa7 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 30 Jun 2021 18:49:17 -0700
Subject: userfaultfd/shmem: combine shmem_{mcopy_atomic,mfill_zeropage}_pte

Patch series "userfaultfd: add minor fault handling for shmem", v6.

Overview
========

See the series which added minor faults for hugetlbfs [3] for a detailed
overview of minor fault handling in general.  This series adds the same
support for shmem-backed areas.

This series is structured as follows:

- Commits 1 and 2 are cleanups.
- Commits 3 and 4 implement the new feature (minor fault handling for shmem).
- Commit 5 advertises that the feature is now available since at this point it's
  fully implemented.
- Commit 6 is a final cleanup, modifying an existing code path to re-use a new
  helper we've introduced.
- Commits 7, 8, 9, 10 update the userfaultfd selftest to exercise the feature.

Use Case
========

In some cases it is useful to have VM memory backed by tmpfs instead of
hugetlbfs.  So, this feature will be used to support the same VM live
migration use case described in my original series.

Additionally, Android folks (Lokesh Gidra <lokeshgidra@google.com>) hope
to optimize the Android Runtime garbage collector using this feature:

"The plan is to use userfaultfd for concurrently compacting the heap.
With this feature, the heap can be shared-mapped at another location where
the GC-thread(s) could continue the compaction operation without the need
to invoke userfault ioctl(UFFDIO_COPY) each time.  OTOH, if and when Java
threads get faults on the heap, UFFDIO_CONTINUE can be used to resume
execution.  Furthermore, this feature enables updating references in the
'non-moving' portion of the heap efficiently.  Without this feature,
uneccessary page copying (ioctl(UFFDIO_COPY)) would be required."

[1] https://lore.kernel.org/patchwork/cover/1388144/
[2] https://lore.kernel.org/patchwork/patch/1408161/
[3] https://lore.kernel.org/linux-fsdevel/20210301222728.176417-1-axelrasmussen@google.com/T/#t

This patch (of 9):

Previously, we did a dance where we had one calling path in userfaultfd.c
(mfill_atomic_pte), but then we split it into two in shmem_fs.h
(shmem_{mcopy_atomic,mfill_zeropage}_pte), and then rejoined into a single
shared function in shmem.c (shmem_mfill_atomic_pte).

This is all a bit overly complex.  Just call the single combined shmem
function directly, allowing us to clean up various branches, boilerplate,
etc.

While we're touching this function, two other small cleanup changes:
- offset is equivalent to pgoff, so we can get rid of offset entirely.
- Split two VM_BUG_ON cases into two statements. This means the line
  number reported when the BUG is hit specifies exactly which condition
  was true.

Link: https://lkml.kernel.org/r/20210503180737.2487560-1-axelrasmussen@google.com
Link: https://lkml.kernel.org/r/20210503180737.2487560-3-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Joe Perches <joe@perches.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Wang Qing <wangqing@vivo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shmem_fs.h | 19 ++++++++----------
 mm/shmem.c               | 52 +++++++++++++++---------------------------------
 mm/userfaultfd.c         | 10 +++-------
 3 files changed, 27 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d82b6f396588..a69ea4d97fdd 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -122,21 +122,18 @@ static inline bool shmem_file(struct file *file)
 extern bool shmem_charge(struct inode *inode, long pages);
 extern void shmem_uncharge(struct inode *inode, long pages);
 
+#ifdef CONFIG_USERFAULTFD
 #ifdef CONFIG_SHMEM
-extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 				  struct vm_area_struct *dst_vma,
 				  unsigned long dst_addr,
 				  unsigned long src_addr,
+				  bool zeropage,
 				  struct page **pagep);
-extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
-				    pmd_t *dst_pmd,
-				    struct vm_area_struct *dst_vma,
-				    unsigned long dst_addr);
-#else
-#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
-			       src_addr, pagep)        ({ BUG(); 0; })
-#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
-				 dst_addr)      ({ BUG(); 0; })
-#endif
+#else /* !CONFIG_SHMEM */
+#define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
+			       src_addr, zeropage, pagep)       ({ BUG(); 0; })
+#endif /* CONFIG_SHMEM */
+#endif /* CONFIG_USERFAULTFD */
 
 #endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 1155279a6276..9b66a2b2680c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2352,13 +2352,14 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 	return inode;
 }
 
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
-				  pmd_t *dst_pmd,
-				  struct vm_area_struct *dst_vma,
-				  unsigned long dst_addr,
-				  unsigned long src_addr,
-				  bool zeropage,
-				  struct page **pagep)
+#ifdef CONFIG_USERFAULTFD
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+			   pmd_t *dst_pmd,
+			   struct vm_area_struct *dst_vma,
+			   unsigned long dst_addr,
+			   unsigned long src_addr,
+			   bool zeropage,
+			   struct page **pagep)
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2370,7 +2371,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	struct page *page;
 	pte_t _dst_pte, *dst_pte;
 	int ret;
-	pgoff_t offset, max_off;
+	pgoff_t max_off;
 
 	ret = -ENOMEM;
 	if (!shmem_inode_acct_block(inode, 1)) {
@@ -2391,7 +2392,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 		if (!page)
 			goto out_unacct_blocks;
 
-		if (!zeropage) {	/* mcopy_atomic */
+		if (!zeropage) {	/* COPY */
 			page_kaddr = kmap_atomic(page);
 			ret = copy_from_user(page_kaddr,
 					     (const void __user *)src_addr,
@@ -2405,7 +2406,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 				/* don't free the page */
 				return -ENOENT;
 			}
-		} else {		/* mfill_zeropage_atomic */
+		} else {		/* ZEROPAGE */
 			clear_highpage(page);
 		}
 	} else {
@@ -2413,15 +2414,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 		*pagep = NULL;
 	}
 
-	VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+	VM_BUG_ON(PageLocked(page));
+	VM_BUG_ON(PageSwapBacked(page));
 	__SetPageLocked(page);
 	__SetPageSwapBacked(page);
 	__SetPageUptodate(page);
 
 	ret = -EFAULT;
-	offset = linear_page_index(dst_vma, dst_addr);
 	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-	if (unlikely(offset >= max_off))
+	if (unlikely(pgoff >= max_off))
 		goto out_release;
 
 	ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
@@ -2447,7 +2448,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 
 	ret = -EFAULT;
 	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-	if (unlikely(offset >= max_off))
+	if (unlikely(pgoff >= max_off))
 		goto out_release_unlock;
 
 	ret = -EEXIST;
@@ -2484,28 +2485,7 @@ out_unacct_blocks:
 	shmem_inode_unacct_blocks(inode, 1);
 	goto out;
 }
-
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
-			   pmd_t *dst_pmd,
-			   struct vm_area_struct *dst_vma,
-			   unsigned long dst_addr,
-			   unsigned long src_addr,
-			   struct page **pagep)
-{
-	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
-				      dst_addr, src_addr, false, pagep);
-}
-
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
-			     pmd_t *dst_pmd,
-			     struct vm_area_struct *dst_vma,
-			     unsigned long dst_addr)
-{
-	struct page *page = NULL;
-
-	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
-				      dst_addr, 0, true, &page);
-}
+#endif /* CONFIG_USERFAULTFD */
 
 #ifdef CONFIG_TMPFS
 static const struct inode_operations shmem_symlink_inode_operations;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index da5535d2d990..57b228b9b6a4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -392,13 +392,9 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 						 dst_vma, dst_addr);
 	} else {
 		VM_WARN_ON_ONCE(wp_copy);
-		if (!zeropage)
-			err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
-						     dst_vma, dst_addr,
-						     src_addr, page);
-		else
-			err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
-						       dst_vma, dst_addr);
+		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+					     dst_addr, src_addr, zeropage,
+					     page);
 	}
 
 	return err;
-- 
cgit v1.2.3


From 7d64ae3ab648a967b7ba5cc3e89281d76742c34e Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Wed, 30 Jun 2021 18:49:31 -0700
Subject: userfaultfd/shmem: modify shmem_mfill_atomic_pte to use install_pte()

In a previous commit, we added the mfill_atomic_install_pte() helper.
This helper does the job of setting up PTEs for an existing page, to map
it into a given VMA.  It deals with both the anon and shmem cases, as well
as the shared and private cases.

In other words, shmem_mfill_atomic_pte() duplicates a case it already
handles.  So, expose it, and let shmem_mfill_atomic_pte() use it directly,
to reduce code duplication.

This requires that we refactor shmem_mfill_atomic_pte() a bit:

Instead of doing accounting (shmem_recalc_inode() et al) part-way through
the PTE setup, do it afterward.  This frees up mfill_atomic_install_pte()
from having to care about this accounting, and means we don't need to e.g.
shmem_uncharge() in the error path.

A side effect is this switches shmem_mfill_atomic_pte() to use
lru_cache_add_inactive_or_unevictable() instead of just lru_cache_add().
This wrapper does some extra accounting in an exceptional case, if
appropriate, so it's actually the more correct thing to use.

Link: https://lkml.kernel.org/r/20210503180737.2487560-7-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Joe Perches <joe@perches.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Wang Qing <wangqing@vivo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/userfaultfd_k.h |  5 ++++
 mm/shmem.c                    | 58 +++++++++----------------------------------
 mm/userfaultfd.c              | 17 +++++--------
 3 files changed, 23 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 794d1538b8ba..331d2ccf0bcc 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -53,6 +53,11 @@ enum mcopy_atomic_mode {
 	MCOPY_ATOMIC_CONTINUE,
 };
 
+extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+				    struct vm_area_struct *dst_vma,
+				    unsigned long dst_addr, struct page *page,
+				    bool newly_allocated, bool wp_copy);
+
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 			    unsigned long src_start, unsigned long len,
 			    bool *mmap_changing, __u64 mode);
diff --git a/mm/shmem.c b/mm/shmem.c
index 0d186b91a2f9..bd2ffd861ddb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2376,14 +2376,11 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	struct address_space *mapping = inode->i_mapping;
 	gfp_t gfp = mapping_gfp_mask(mapping);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
-	spinlock_t *ptl;
 	void *page_kaddr;
 	struct page *page;
-	pte_t _dst_pte, *dst_pte;
 	int ret;
 	pgoff_t max_off;
 
-	ret = -ENOMEM;
 	if (!shmem_inode_acct_block(inode, 1)) {
 		/*
 		 * We may have got a page, returned -ENOENT triggering a retry,
@@ -2394,10 +2391,11 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 			put_page(*pagep);
 			*pagep = NULL;
 		}
-		goto out;
+		return -ENOMEM;
 	}
 
 	if (!*pagep) {
+		ret = -ENOMEM;
 		page = shmem_alloc_page(gfp, info, pgoff);
 		if (!page)
 			goto out_unacct_blocks;
@@ -2412,9 +2410,9 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 			/* fallback to copy_from_user outside mmap_lock */
 			if (unlikely(ret)) {
 				*pagep = page;
-				shmem_inode_unacct_blocks(inode, 1);
+				ret = -ENOENT;
 				/* don't free the page */
-				return -ENOENT;
+				goto out_unacct_blocks;
 			}
 		} else {		/* ZEROPAGE */
 			clear_highpage(page);
@@ -2440,32 +2438,10 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	if (ret)
 		goto out_release;
 
-	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
-	if (dst_vma->vm_flags & VM_WRITE)
-		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
-	else {
-		/*
-		 * We don't set the pte dirty if the vma has no
-		 * VM_WRITE permission, so mark the page dirty or it
-		 * could be freed from under us. We could do it
-		 * unconditionally before unlock_page(), but doing it
-		 * only if VM_WRITE is not set is faster.
-		 */
-		set_page_dirty(page);
-	}
-
-	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-
-	ret = -EFAULT;
-	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
-	if (unlikely(pgoff >= max_off))
-		goto out_release_unlock;
-
-	ret = -EEXIST;
-	if (!pte_none(*dst_pte))
-		goto out_release_unlock;
-
-	lru_cache_add(page);
+	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+				       page, true, false);
+	if (ret)
+		goto out_delete_from_cache;
 
 	spin_lock_irq(&info->lock);
 	info->alloced++;
@@ -2473,27 +2449,17 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	shmem_recalc_inode(inode);
 	spin_unlock_irq(&info->lock);
 
-	inc_mm_counter(dst_mm, mm_counter_file(page));
-	page_add_file_rmap(page, false);
-	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
-	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(dst_vma, dst_addr, dst_pte);
-	pte_unmap_unlock(dst_pte, ptl);
+	SetPageDirty(page);
 	unlock_page(page);
-	ret = 0;
-out:
-	return ret;
-out_release_unlock:
-	pte_unmap_unlock(dst_pte, ptl);
-	ClearPageDirty(page);
+	return 0;
+out_delete_from_cache:
 	delete_from_page_cache(page);
 out_release:
 	unlock_page(page);
 	put_page(page);
 out_unacct_blocks:
 	shmem_inode_unacct_blocks(inode, 1);
-	goto out;
+	return ret;
 }
 #endif /* CONFIG_USERFAULTFD */
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 218ec2af615f..0e2132834bc7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -51,18 +51,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
 /*
  * Install PTEs, to map dst_addr (within dst_vma) to page.
  *
- * This function handles MCOPY_ATOMIC_CONTINUE (which is always file-backed),
- * whether or not dst_vma is VM_SHARED. It also handles the more general
- * MCOPY_ATOMIC_NORMAL case, when dst_vma is *not* VM_SHARED (it may be file
- * backed, or not).
- *
- * Note that MCOPY_ATOMIC_NORMAL for a VM_SHARED dst_vma is handled by
- * shmem_mcopy_atomic_pte instead.
+ * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
+ * and anon, and for both shared and private VMAs.
  */
-static int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
-				    struct vm_area_struct *dst_vma,
-				    unsigned long dst_addr, struct page *page,
-				    bool newly_allocated, bool wp_copy)
+int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+			     struct vm_area_struct *dst_vma,
+			     unsigned long dst_addr, struct page *page,
+			     bool newly_allocated, bool wp_copy)
 {
 	int ret;
 	pte_t _dst_pte, *dst_pte;
-- 
cgit v1.2.3


From 3c36b419b111e28a657e6534aae07964a98a5ca9 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 30 Jun 2021 18:50:03 -0700
Subject: fs/proc/kcore: drop KCORE_REMAP and KCORE_OTHER

Patch series "fs/proc/kcore: don't read offline sections, logically offline pages and hwpoisoned pages", v3.

Looking for places where the kernel might unconditionally read
PageOffline() pages, I stumbled over /proc/kcore; turns out /proc/kcore
needs some more love to not touch some other pages we really don't want to
read -- i.e., hwpoisoned ones.

Examples for PageOffline() pages are pages inflated in a balloon, memory
unplugged via virtio-mem, and partially-present sections in memory added
by the Hyper-V balloon.

When reading pages inflated in a balloon, we essentially produce
unnecessary load in the hypervisor; holes in partially present sections in
case of Hyper-V are not accessible and already were a problem for
/proc/vmcore, fixed in makedumpfile by detecting PageOffline() pages.  In
the future, virtio-mem might disallow reading unplugged memory -- marked
as PageOffline() -- in some environments, resulting in undefined behavior
when accessed; therefore, I'm trying to identify and rework all these
(corner) cases.

With this series, there is really only access via /dev/mem, /proc/vmcore
and kdb left after I ripped out /dev/kmem.  kdb is an advanced corner-case
use case -- we won't care for now if someone explicitly tries to do nasty
things by reading from/writing to physical addresses we better not touch.
/dev/mem is a use case we won't support for virtio-mem, at least for now,
so we'll simply disallow mapping any virtio-mem memory via /dev/mem next.
/proc/vmcore is really only a problem when dumping the old kernel via
something that's not makedumpfile (read: basically never), however, we'll
try sanitizing that as well in the second kernel in the future.

Tested via kcore_dump:
	https://github.com/schlafwandler/kcore_dump

This patch (of 6):

Commit db779ef67ffe ("proc/kcore: Remove unused kclist_add_remap()")
removed the last user of KCORE_REMAP.

Commit 595dd46ebfc1 ("vfs/proc/kcore, x86/mm/kcore: Fix SMAP fault when
dumping vsyscall user page") removed the last user of KCORE_OTHER.

Let's drop both types.  While at it, also drop vaddr in "struct
kcore_list", used by KCORE_REMAP only.

Link: https://lkml.kernel.org/r/20210526093041.8800-1-david@redhat.com
Link: https://lkml.kernel.org/r/20210526093041.8800-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Aili Yao <yaoaili@kingsoft.com>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c       | 7 ++-----
 include/linux/kcore.h | 3 ---
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 4d2e64e9016c..09f77d3c6e15 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -380,11 +380,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			phdr->p_type = PT_LOAD;
 			phdr->p_flags = PF_R | PF_W | PF_X;
 			phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
-			if (m->type == KCORE_REMAP)
-				phdr->p_vaddr = (size_t)m->vaddr;
-			else
-				phdr->p_vaddr = (size_t)m->addr;
-			if (m->type == KCORE_RAM || m->type == KCORE_REMAP)
+			phdr->p_vaddr = (size_t)m->addr;
+			if (m->type == KCORE_RAM)
 				phdr->p_paddr = __pa(m->addr);
 			else if (m->type == KCORE_TEXT)
 				phdr->p_paddr = __pa_symbol(m->addr);
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index da676cdbd727..86c0f1d18998 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -11,14 +11,11 @@ enum kcore_type {
 	KCORE_RAM,
 	KCORE_VMEMMAP,
 	KCORE_USER,
-	KCORE_OTHER,
-	KCORE_REMAP,
 };
 
 struct kcore_list {
 	struct list_head list;
 	unsigned long addr;
-	unsigned long vaddr;
 	size_t size;
 	int type;
 };
-- 
cgit v1.2.3


From 0daa322b8ff94d8ee4081c2c6868a1aaf1309642 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 30 Jun 2021 18:50:10 -0700
Subject: fs/proc/kcore: don't read offline sections, logically offline pages
 and hwpoisoned pages

Let's avoid reading:

1) Offline memory sections: the content of offline memory sections is
   stale as the memory is effectively unused by the kernel.  On s390x with
   standby memory, offline memory sections (belonging to offline storage
   increments) are not accessible.  With virtio-mem and the hyper-v
   balloon, we can have unavailable memory chunks that should not be
   accessed inside offline memory sections.  Last but not least, offline
   memory sections might contain hwpoisoned pages which we can no longer
   identify because the memmap is stale.

2) PG_offline pages: logically offline pages that are documented as
   "The content of these pages is effectively stale.  Such pages should
   not be touched (read/write/dump/save) except by their owner.".
   Examples include pages inflated in a balloon or unavailble memory
   ranges inside hotplugged memory sections with virtio-mem or the hyper-v
   balloon.

3) PG_hwpoison pages: Reading pages marked as hwpoisoned can be fatal.
   As documented: "Accessing is not safe since it may cause another
   machine check.  Don't touch!"

Introduce is_page_hwpoison(), adding a comment that it is inherently racy
but best we can really do.

Reading /proc/kcore now performs similar checks as when reading
/proc/vmcore for kdump via makedumpfile: problematic pages are exclude.
It's also similar to hibernation code, however, we don't skip hwpoisoned
pages when processing pages in kernel/power/snapshot.c:saveable_page()
yet.

Note 1: we can race against memory offlining code, especially memory going
offline and getting unplugged: however, we will properly tear down the
identity mapping and handle faults gracefully when accessing this memory
from kcore code.

Note 2: we can race against drivers setting PageOffline() and turning
memory inaccessible in the hypervisor.  We'll handle this in a follow-up
patch.

Link: https://lkml.kernel.org/r/20210526093041.8800-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Aili Yao <yaoaili@kingsoft.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c            | 14 +++++++++++++-
 include/linux/page-flags.h | 12 ++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ed6fbb3bd50c..92ff1e4436cb 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -465,6 +465,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 
 	m = NULL;
 	while (buflen) {
+		struct page *page;
+		unsigned long pfn;
+
 		/*
 		 * If this is the first iteration or the address is not within
 		 * the previous entry, search for a matching entry.
@@ -503,7 +506,16 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			}
 			break;
 		case KCORE_RAM:
-			if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
+			pfn = __pa(start) >> PAGE_SHIFT;
+			page = pfn_to_online_page(pfn);
+
+			/*
+			 * Don't read offline sections, logically offline pages
+			 * (e.g., inflated in a balloon), hwpoisoned pages,
+			 * and explicitly excluded physical ranges.
+			 */
+			if (!page || PageOffline(page) ||
+			    is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
 				if (clear_user(buffer, tsz)) {
 					ret = -EFAULT;
 					goto out;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d8e26243db25..613295588848 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -694,6 +694,18 @@ PAGEFLAG_FALSE(DoubleMap)
 	TESTSCFLAG_FALSE(DoubleMap)
 #endif
 
+/*
+ * Check if a page is currently marked HWPoisoned. Note that this check is
+ * best effort only and inherently racy: there is no way to synchronize with
+ * failing hardware.
+ */
+static inline bool is_page_hwpoison(struct page *page)
+{
+	if (PageHWPoison(page))
+		return true;
+	return PageHuge(page) && PageHWPoison(compound_head(page));
+}
+
 /*
  * For pages that are never mapped to userspace (and aren't PageSlab),
  * page_type may be used.  Because it is initialised to -1, we invert the
-- 
cgit v1.2.3


From 82840451936f0301781ece80322230fd8edfc648 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 30 Jun 2021 18:50:14 -0700
Subject: mm: introduce page_offline_(begin|end|freeze|thaw) to synchronize
 setting PageOffline()

A driver might set a page logically offline -- PageOffline() -- and turn
the page inaccessible in the hypervisor; after that, access to page
content can be fatal.  One example is virtio-mem; while unplugged memory
-- marked as PageOffline() can currently be read in the hypervisor, this
will no longer be the case in the future; for example, when having a
virtio-mem device backed by huge pages in the hypervisor.

Some special PFN walkers -- i.e., /proc/kcore -- read content of random
pages after checking PageOffline(); however, these PFN walkers can race
with drivers that set PageOffline().

Let's introduce page_offline_(begin|end|freeze|thaw) for synchronizing.

page_offline_freeze()/page_offline_thaw() allows for a subsystem to
synchronize with such drivers, achieving that a page cannot be set
PageOffline() while frozen.

page_offline_begin()/page_offline_end() is used by drivers that care about
such races when setting a page PageOffline().

For simplicity, use a rwsem for now; neither drivers nor users are
performance sensitive.

Link: https://lkml.kernel.org/r/20210526093041.8800-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Aili Yao <yaoaili@kingsoft.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 10 ++++++++++
 mm/util.c                  | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 613295588848..3e7e616067fc 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -769,9 +769,19 @@ PAGE_TYPE_OPS(Buddy, buddy)
  * relies on this feature is aware that re-onlining the memory block will
  * require to re-set the pages PageOffline() and not giving them to the
  * buddy via online_page_callback_t.
+ *
+ * There are drivers that mark a page PageOffline() and expect there won't be
+ * any further access to page content. PFN walkers that read content of random
+ * pages should check PageOffline() and synchronize with such drivers using
+ * page_offline_freeze()/page_offline_thaw().
  */
 PAGE_TYPE_OPS(Offline, offline)
 
+extern void page_offline_freeze(void);
+extern void page_offline_thaw(void);
+extern void page_offline_begin(void);
+extern void page_offline_end(void);
+
 /*
  * Marks pages in use as page tables.
  */
diff --git a/mm/util.c b/mm/util.c
index a8bf17f18a81..a034525e7ba2 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1010,3 +1010,43 @@ void mem_dump_obj(void *object)
 }
 EXPORT_SYMBOL_GPL(mem_dump_obj);
 #endif
+
+/*
+ * A driver might set a page logically offline -- PageOffline() -- and
+ * turn the page inaccessible in the hypervisor; after that, access to page
+ * content can be fatal.
+ *
+ * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
+ * pages after checking PageOffline(); however, these PFN walkers can race
+ * with drivers that set PageOffline().
+ *
+ * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
+ * synchronize with such drivers, achieving that a page cannot be set
+ * PageOffline() while frozen.
+ *
+ * page_offline_begin()/page_offline_end() is used by drivers that care about
+ * such races when setting a page PageOffline().
+ */
+static DECLARE_RWSEM(page_offline_rwsem);
+
+void page_offline_freeze(void)
+{
+	down_read(&page_offline_rwsem);
+}
+
+void page_offline_thaw(void)
+{
+	up_read(&page_offline_rwsem);
+}
+
+void page_offline_begin(void)
+{
+	down_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_begin);
+
+void page_offline_end(void)
+{
+	up_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_end);
-- 
cgit v1.2.3


From 2a03085ce88792bac2e25319fc2874a885e7e102 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 30 Jun 2021 18:50:45 -0700
Subject: mm/zbud: don't export any zbud API

The zbud doesn't need to export any API and it is meant to be used via
zpool API since the commit 12d79d64bfd3 ("mm/zpool: update zswap to use
zpool").  So we can remove the unneeded zbud.h and move down zpool API to
avoid any forward declaration.

[linmiaohe@huawei.com: fix unused function warnings when CONFIG_ZPOOL is disabled]
  Link: https://lkml.kernel.org/r/20210619025508.1239386-1-linmiaohe@huawei.com

Link: https://lkml.kernel.org/r/20210608114515.206992-3-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS          |   1 -
 include/linux/zbud.h |  23 ------
 mm/Kconfig           |   1 +
 mm/zbud.c            | 223 +++++++++++++++++++++++++--------------------------
 4 files changed, 110 insertions(+), 138 deletions(-)
 delete mode 100644 include/linux/zbud.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 0cce91cd5624..6657f01ddbd5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20172,7 +20172,6 @@ M:	Seth Jennings <sjenning@redhat.com>
 M:	Dan Streetman <ddstreet@ieee.org>
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	include/linux/zbud.h
 F:	mm/zbud.c
 
 ZD1211RW WIRELESS DRIVER
diff --git a/include/linux/zbud.h b/include/linux/zbud.h
deleted file mode 100644
index b1eaf6e31735..000000000000
--- a/include/linux/zbud.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ZBUD_H_
-#define _ZBUD_H_
-
-#include <linux/types.h>
-
-struct zbud_pool;
-
-struct zbud_ops {
-	int (*evict)(struct zbud_pool *pool, unsigned long handle);
-};
-
-struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
-void zbud_destroy_pool(struct zbud_pool *pool);
-int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
-	unsigned long *handle);
-void zbud_free(struct zbud_pool *pool, unsigned long handle);
-int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
-void *zbud_map(struct zbud_pool *pool, unsigned long handle);
-void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
-u64 zbud_get_pool_size(struct zbud_pool *pool);
-
-#endif /* _ZBUD_H_ */
diff --git a/mm/Kconfig b/mm/Kconfig
index 055639e52459..a0b2c37f851b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -674,6 +674,7 @@ config ZPOOL
 
 config ZBUD
 	tristate "Low (Up to 2x) density storage for compressed pages"
+	depends on ZPOOL
 	help
 	  A special purpose allocator for storing compressed pages.
 	  It is designed to store up to two compressed pages per physical
diff --git a/mm/zbud.c b/mm/zbud.c
index c48e60bae4a1..48e80a3f7976 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -51,7 +51,6 @@
 #include <linux/preempt.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/zbud.h>
 #include <linux/zpool.h>
 
 /*****************
@@ -73,6 +72,12 @@
 #define ZHDR_SIZE_ALIGNED CHUNK_SIZE
 #define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
 
+struct zbud_pool;
+
+struct zbud_ops {
+	int (*evict)(struct zbud_pool *pool, unsigned long handle);
+};
+
 /**
  * struct zbud_pool - stores metadata for each zbud pool
  * @lock:	protects all pool fields and first|last_chunk fields of any
@@ -104,10 +109,8 @@ struct zbud_pool {
 	struct list_head lru;
 	u64 pages_nr;
 	const struct zbud_ops *ops;
-#ifdef CONFIG_ZPOOL
 	struct zpool *zpool;
 	const struct zpool_ops *zpool_ops;
-#endif
 };
 
 /*
@@ -126,104 +129,6 @@ struct zbud_header {
 	bool under_reclaim;
 };
 
-/*****************
- * zpool
- ****************/
-
-#ifdef CONFIG_ZPOOL
-
-static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
-{
-	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
-		return pool->zpool_ops->evict(pool->zpool, handle);
-	else
-		return -ENOENT;
-}
-
-static const struct zbud_ops zbud_zpool_ops = {
-	.evict =	zbud_zpool_evict
-};
-
-static void *zbud_zpool_create(const char *name, gfp_t gfp,
-			       const struct zpool_ops *zpool_ops,
-			       struct zpool *zpool)
-{
-	struct zbud_pool *pool;
-
-	pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
-	if (pool) {
-		pool->zpool = zpool;
-		pool->zpool_ops = zpool_ops;
-	}
-	return pool;
-}
-
-static void zbud_zpool_destroy(void *pool)
-{
-	zbud_destroy_pool(pool);
-}
-
-static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
-			unsigned long *handle)
-{
-	return zbud_alloc(pool, size, gfp, handle);
-}
-static void zbud_zpool_free(void *pool, unsigned long handle)
-{
-	zbud_free(pool, handle);
-}
-
-static int zbud_zpool_shrink(void *pool, unsigned int pages,
-			unsigned int *reclaimed)
-{
-	unsigned int total = 0;
-	int ret = -EINVAL;
-
-	while (total < pages) {
-		ret = zbud_reclaim_page(pool, 8);
-		if (ret < 0)
-			break;
-		total++;
-	}
-
-	if (reclaimed)
-		*reclaimed = total;
-
-	return ret;
-}
-
-static void *zbud_zpool_map(void *pool, unsigned long handle,
-			enum zpool_mapmode mm)
-{
-	return zbud_map(pool, handle);
-}
-static void zbud_zpool_unmap(void *pool, unsigned long handle)
-{
-	zbud_unmap(pool, handle);
-}
-
-static u64 zbud_zpool_total_size(void *pool)
-{
-	return zbud_get_pool_size(pool) * PAGE_SIZE;
-}
-
-static struct zpool_driver zbud_zpool_driver = {
-	.type =		"zbud",
-	.sleep_mapped = true,
-	.owner =	THIS_MODULE,
-	.create =	zbud_zpool_create,
-	.destroy =	zbud_zpool_destroy,
-	.malloc =	zbud_zpool_malloc,
-	.free =		zbud_zpool_free,
-	.shrink =	zbud_zpool_shrink,
-	.map =		zbud_zpool_map,
-	.unmap =	zbud_zpool_unmap,
-	.total_size =	zbud_zpool_total_size,
-};
-
-MODULE_ALIAS("zpool-zbud");
-#endif /* CONFIG_ZPOOL */
-
 /*****************
  * Helpers
 *****************/
@@ -310,7 +215,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
  * Return: pointer to the new zbud pool or NULL if the metadata allocation
  * failed.
  */
-struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
+static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
 {
 	struct zbud_pool *pool;
 	int i;
@@ -334,7 +239,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
  *
  * The pool should be emptied before this function is called.
  */
-void zbud_destroy_pool(struct zbud_pool *pool)
+static void zbud_destroy_pool(struct zbud_pool *pool)
 {
 	kfree(pool);
 }
@@ -358,7 +263,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
  * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
  * a new page.
  */
-int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
+static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
 			unsigned long *handle)
 {
 	int chunks, i, freechunks;
@@ -433,7 +338,7 @@ found:
  * only sets the first|last_chunks to 0.  The page is actually freed
  * once both buddies are evicted (see zbud_reclaim_page() below).
  */
-void zbud_free(struct zbud_pool *pool, unsigned long handle)
+static void zbud_free(struct zbud_pool *pool, unsigned long handle)
 {
 	struct zbud_header *zhdr;
 	int freechunks;
@@ -505,7 +410,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
  * no pages to evict or an eviction handler is not registered, -EAGAIN if
  * the retry limit was hit.
  */
-int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
+static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
 {
 	int i, ret, freechunks;
 	struct zbud_header *zhdr;
@@ -587,7 +492,7 @@ next:
  *
  * Returns: a pointer to the mapped allocation
  */
-void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+static void *zbud_map(struct zbud_pool *pool, unsigned long handle)
 {
 	return (void *)(handle);
 }
@@ -597,7 +502,7 @@ void *zbud_map(struct zbud_pool *pool, unsigned long handle)
  * @pool:	pool in which the allocation resides
  * @handle:	handle associated with the allocation to be unmapped
  */
-void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
 {
 }
 
@@ -608,30 +513,120 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
  * Returns: size in pages of the given pool.  The pool lock need not be
  * taken to access pages_nr.
  */
-u64 zbud_get_pool_size(struct zbud_pool *pool)
+static u64 zbud_get_pool_size(struct zbud_pool *pool)
 {
 	return pool->pages_nr;
 }
 
+/*****************
+ * zpool
+ ****************/
+
+static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
+{
+	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+		return pool->zpool_ops->evict(pool->zpool, handle);
+	else
+		return -ENOENT;
+}
+
+static const struct zbud_ops zbud_zpool_ops = {
+	.evict =	zbud_zpool_evict
+};
+
+static void *zbud_zpool_create(const char *name, gfp_t gfp,
+			       const struct zpool_ops *zpool_ops,
+			       struct zpool *zpool)
+{
+	struct zbud_pool *pool;
+
+	pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+	if (pool) {
+		pool->zpool = zpool;
+		pool->zpool_ops = zpool_ops;
+	}
+	return pool;
+}
+
+static void zbud_zpool_destroy(void *pool)
+{
+	zbud_destroy_pool(pool);
+}
+
+static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	return zbud_alloc(pool, size, gfp, handle);
+}
+static void zbud_zpool_free(void *pool, unsigned long handle)
+{
+	zbud_free(pool, handle);
+}
+
+static int zbud_zpool_shrink(void *pool, unsigned int pages,
+			unsigned int *reclaimed)
+{
+	unsigned int total = 0;
+	int ret = -EINVAL;
+
+	while (total < pages) {
+		ret = zbud_reclaim_page(pool, 8);
+		if (ret < 0)
+			break;
+		total++;
+	}
+
+	if (reclaimed)
+		*reclaimed = total;
+
+	return ret;
+}
+
+static void *zbud_zpool_map(void *pool, unsigned long handle,
+			enum zpool_mapmode mm)
+{
+	return zbud_map(pool, handle);
+}
+static void zbud_zpool_unmap(void *pool, unsigned long handle)
+{
+	zbud_unmap(pool, handle);
+}
+
+static u64 zbud_zpool_total_size(void *pool)
+{
+	return zbud_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver zbud_zpool_driver = {
+	.type =		"zbud",
+	.sleep_mapped = true,
+	.owner =	THIS_MODULE,
+	.create =	zbud_zpool_create,
+	.destroy =	zbud_zpool_destroy,
+	.malloc =	zbud_zpool_malloc,
+	.free =		zbud_zpool_free,
+	.shrink =	zbud_zpool_shrink,
+	.map =		zbud_zpool_map,
+	.unmap =	zbud_zpool_unmap,
+	.total_size =	zbud_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-zbud");
+
 static int __init init_zbud(void)
 {
 	/* Make sure the zbud header will fit in one chunk */
 	BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
 	pr_info("loaded\n");
 
-#ifdef CONFIG_ZPOOL
 	zpool_register_driver(&zbud_zpool_driver);
-#endif
 
 	return 0;
 }
 
 static void __exit exit_zbud(void)
 {
-#ifdef CONFIG_ZPOOL
 	zpool_unregister_driver(&zbud_zpool_driver);
-#endif
-
 	pr_info("unloaded\n");
 }
 
-- 
cgit v1.2.3


From b26e517a058bd40c790a1d9868c896842f2e4155 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Wed, 30 Jun 2021 18:50:56 -0700
Subject: mm/mempolicy: cleanup nodemask intersection check for oom

Patch series "mm/mempolicy: some fix and semantics cleanup", v4.

Current memory policy code has some confusing and ambiguous part about
MPOL_LOCAL policy, as it is handled as a faked MPOL_PREFERRED one, and
there are many places having to distinguish them.  Also the nodemask
intersection check needs cleanup to be more explicit for OOM use, and
handle MPOL_INTERLEAVE correctly.  This patchset cleans up these and
unifies the parameter sanity check for mbind() and set_mempolicy().

This patch (of 3):

mempolicy_nodemask_intersects seem to be a general purpose mempolicy
function.  In fact it is partially tailored for the OOM purpose
instead.  The oom proper is the only existing user so rename the
function to make that purpose explicit.

While at it drop the MPOL_INTERLEAVE as those allocations never has a
nodemask defined (see alloc_page_interleave) so this is a dead code and
a confusing one because MPOL_INTERLEAVE is a hint rather than a hard
requirement so it shouldn't be considered during the OOM.

The final code can be reduced to a check for MPOL_BIND which is the
only memory policy that is a hard requirement and thus relevant to a
constrained OOM logic.

[mhocko@suse.com: changelog edits]

Link: https://lkml.kernel.org/r/1622560492-1294-1-git-send-email-feng.tang@intel.com
Link: https://lkml.kernel.org/r/1622560492-1294-2-git-send-email-feng.tang@intel.com
Link: https://lkml.kernel.org/r/1622469956-82897-1-git-send-email-feng.tang@intel.com
Link: https://lkml.kernel.org/r/1622469956-82897-2-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ben Widawsky <ben.widawsky@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h |  2 +-
 mm/mempolicy.c            | 34 +++++++++-------------------------
 mm/oom_kill.c             |  2 +-
 3 files changed, 11 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5f1c74df264d..8773c55c7744 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -150,7 +150,7 @@ extern int huge_node(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
 				struct mempolicy **mpol, nodemask_t **nodemask);
 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
-extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
 				const nodemask_t *mask);
 extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b5d95bf1025d..bd213b900e71 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2094,16 +2094,16 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 #endif
 
 /*
- * mempolicy_nodemask_intersects
+ * mempolicy_in_oom_domain
  *
- * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
- * policy.  Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
- * policy, always return true since it may allocate elsewhere on fallback.
+ * If tsk's mempolicy is "bind", check for intersection between mask and
+ * the policy nodemask. Otherwise, return true for all other policies
+ * including "interleave", as a tsk with "interleave" policy may have
+ * memory allocated from all nodes in system.
  *
  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
  */
-bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+bool mempolicy_in_oom_domain(struct task_struct *tsk,
 					const nodemask_t *mask)
 {
 	struct mempolicy *mempolicy;
@@ -2111,29 +2111,13 @@ bool mempolicy_nodemask_intersects(struct task_struct *tsk,
 
 	if (!mask)
 		return ret;
+
 	task_lock(tsk);
 	mempolicy = tsk->mempolicy;
-	if (!mempolicy)
-		goto out;
-
-	switch (mempolicy->mode) {
-	case MPOL_PREFERRED:
-		/*
-		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
-		 * allocate from, they may fallback to other nodes when oom.
-		 * Thus, it's possible for tsk to have allocated memory from
-		 * nodes in mask.
-		 */
-		break;
-	case MPOL_BIND:
-	case MPOL_INTERLEAVE:
+	if (mempolicy && mempolicy->mode == MPOL_BIND)
 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
-		break;
-	default:
-		BUG();
-	}
-out:
 	task_unlock(tsk);
+
 	return ret;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eefd3f5fde46..fcc29e9a3064 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -104,7 +104,7 @@ static bool oom_cpuset_eligible(struct task_struct *start,
 			 * mempolicy intersects current, otherwise it may be
 			 * needlessly killed.
 			 */
-			ret = mempolicy_nodemask_intersects(tsk, mask);
+			ret = mempolicy_in_oom_domain(tsk, mask);
 		} else {
 			/*
 			 * This is not a mempolicy constrained oom, so only
-- 
cgit v1.2.3


From 269fbe72cded0afce0090103e90d2ae8ef8ac5b5 Mon Sep 17 00:00:00 2001
From: Ben Widawsky <ben.widawsky@intel.com>
Date: Wed, 30 Jun 2021 18:51:10 -0700
Subject: mm/mempolicy: use unified 'nodes' for bind/interleave/prefer policies

Current structure 'mempolicy' uses a union to store the node info for
bind/interleave/perfer policies.

	union {
		short 		 preferred_node; /* preferred */
		nodemask_t	 nodes;		/* interleave/bind */
		/* undefined for default */
	} v;

Since preferred node can also be represented by a nodemask_t with only ont
bit set, unify these policies with using one nodemask_t 'nodes', which can
remove a union, simplify the code and make it easier to support future's
new policy's node info.

Link: https://lore.kernel.org/r/20200630212517.308045-7-ben.widawsky@intel.com
Link: https://lkml.kernel.org/r/1623399825-75651-1-git-send-email-feng.tang@intel.com
Co-developed-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h |  7 +---
 mm/mempolicy.c            | 96 ++++++++++++++++++++++-------------------------
 2 files changed, 46 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8773c55c7744..0aaf91b496e2 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -46,11 +46,8 @@ struct mempolicy {
 	atomic_t refcnt;
 	unsigned short mode; 	/* See MPOL_* above */
 	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
-	union {
-		short 		 preferred_node; /* preferred */
-		nodemask_t	 nodes;		/* interleave/bind */
-		/* undefined for default */
-	} v;
+	nodemask_t nodes;	/* interleave/bind/perfer */
+
 	union {
 		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
 		nodemask_t user_nodemask;	/* nodemask passed by user */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8739bfd967e5..e32360e90274 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -193,7 +193,7 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	if (nodes_empty(*nodes))
 		return -EINVAL;
-	pol->v.nodes = *nodes;
+	pol->nodes = *nodes;
 	return 0;
 }
 
@@ -201,7 +201,9 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	if (nodes_empty(*nodes))
 		return -EINVAL;
-	pol->v.preferred_node = first_node(*nodes);
+
+	nodes_clear(pol->nodes);
+	node_set(first_node(*nodes), pol->nodes);
 	return 0;
 }
 
@@ -209,7 +211,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	if (nodes_empty(*nodes))
 		return -EINVAL;
-	pol->v.nodes = *nodes;
+	pol->nodes = *nodes;
 	return 0;
 }
 
@@ -324,7 +326,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 	else {
-		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
+		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
 								*nodes);
 		pol->w.cpuset_mems_allowed = *nodes;
 	}
@@ -332,7 +334,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 	if (nodes_empty(tmp))
 		tmp = *nodes;
 
-	pol->v.nodes = tmp;
+	pol->nodes = tmp;
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -897,15 +899,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 	switch (p->mode) {
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
-		*nodes = p->v.nodes;
+	case MPOL_PREFERRED:
+		*nodes = p->nodes;
 		break;
 	case MPOL_LOCAL:
 		/* return empty node mask for local allocation */
 		break;
-
-	case MPOL_PREFERRED:
-		node_set(p->v.preferred_node, *nodes);
-		break;
 	default:
 		BUG();
 	}
@@ -989,7 +988,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->mode == MPOL_INTERLEAVE) {
-			*policy = next_node_in(current->il_prev, pol->v.nodes);
+			*policy = next_node_in(current->il_prev, pol->nodes);
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -1857,14 +1856,14 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
 
 	/*
-	 * if policy->v.nodes has movable memory only,
+	 * if policy->nodes has movable memory only,
 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
 	 *
-	 * policy->v.nodes is intersect with node_states[N_MEMORY].
+	 * policy->nodes is intersect with node_states[N_MEMORY].
 	 * so if the following test fails, it implies
-	 * policy->v.nodes has movable memory only.
+	 * policy->nodes has movable memory only.
 	 */
-	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
 		dynamic_policy_zone = ZONE_MOVABLE;
 
 	return zone >= dynamic_policy_zone;
@@ -1879,8 +1878,8 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
 			apply_policy_zone(policy, gfp_zone(gfp)) &&
-			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
-		return &policy->v.nodes;
+			cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+		return &policy->nodes;
 
 	return NULL;
 }
@@ -1889,7 +1888,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
 	if (policy->mode == MPOL_PREFERRED) {
-		nd = policy->v.preferred_node;
+		nd = first_node(policy->nodes);
 	} else {
 		/*
 		 * __GFP_THISNODE shouldn't even be used with the bind policy
@@ -1908,7 +1907,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	unsigned next;
 	struct task_struct *me = current;
 
-	next = next_node_in(me->il_prev, policy->v.nodes);
+	next = next_node_in(me->il_prev, policy->nodes);
 	if (next < MAX_NUMNODES)
 		me->il_prev = next;
 	return next;
@@ -1932,7 +1931,7 @@ unsigned int mempolicy_slab_node(void)
 
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
-		return policy->v.preferred_node;
+		return first_node(policy->nodes);
 
 	case MPOL_INTERLEAVE:
 		return interleave_nodes(policy);
@@ -1948,7 +1947,7 @@ unsigned int mempolicy_slab_node(void)
 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
 		z = first_zones_zonelist(zonelist, highest_zoneidx,
-							&policy->v.nodes);
+							&policy->nodes);
 		return z->zone ? zone_to_nid(z->zone) : node;
 	}
 	case MPOL_LOCAL:
@@ -1961,12 +1960,12 @@ unsigned int mempolicy_slab_node(void)
 
 /*
  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
- * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
  * number of present nodes.
  */
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
-	unsigned nnodes = nodes_weight(pol->v.nodes);
+	unsigned nnodes = nodes_weight(pol->nodes);
 	unsigned target;
 	int i;
 	int nid;
@@ -1974,9 +1973,9 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	if (!nnodes)
 		return numa_node_id();
 	target = (unsigned int)n % nnodes;
-	nid = first_node(pol->v.nodes);
+	nid = first_node(pol->nodes);
 	for (i = 0; i < target; i++)
-		nid = next_node(nid, pol->v.nodes);
+		nid = next_node(nid, pol->nodes);
 	return nid;
 }
 
@@ -2032,7 +2031,7 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 	} else {
 		nid = policy_node(gfp_flags, *mpol, numa_node_id());
 		if ((*mpol)->mode == MPOL_BIND)
-			*nodemask = &(*mpol)->v.nodes;
+			*nodemask = &(*mpol)->nodes;
 	}
 	return nid;
 }
@@ -2056,7 +2055,6 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 bool init_nodemask_of_mempolicy(nodemask_t *mask)
 {
 	struct mempolicy *mempolicy;
-	int nid;
 
 	if (!(mask && current->mempolicy))
 		return false;
@@ -2065,18 +2063,13 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	mempolicy = current->mempolicy;
 	switch (mempolicy->mode) {
 	case MPOL_PREFERRED:
-		nid = mempolicy->v.preferred_node;
-		init_nodemask_of_node(mask, nid);
-		break;
-
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
-		*mask = mempolicy->v.nodes;
+		*mask = mempolicy->nodes;
 		break;
 
 	case MPOL_LOCAL:
-		nid = numa_node_id();
-		init_nodemask_of_node(mask, nid);
+		init_nodemask_of_node(mask, numa_node_id());
 		break;
 
 	default:
@@ -2110,7 +2103,7 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk,
 	task_lock(tsk);
 	mempolicy = tsk->mempolicy;
 	if (mempolicy && mempolicy->mode == MPOL_BIND)
-		ret = nodes_intersects(mempolicy->v.nodes, *mask);
+		ret = nodes_intersects(mempolicy->nodes, *mask);
 	task_unlock(tsk);
 
 	return ret;
@@ -2184,7 +2177,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		 * node in its nodemask, we allocate the standard way.
 		 */
 		if (pol->mode == MPOL_PREFERRED)
-			hpage_node = pol->v.preferred_node;
+			hpage_node = first_node(pol->nodes);
 
 		nmask = policy_nodemask(gfp, pol);
 		if (!nmask || node_isset(hpage_node, *nmask)) {
@@ -2317,9 +2310,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	switch (a->mode) {
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
-		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
-		return a->v.preferred_node == b->v.preferred_node;
+		return !!nodes_equal(a->nodes, b->nodes);
 	case MPOL_LOCAL:
 		return true;
 	default:
@@ -2459,7 +2451,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		break;
 
 	case MPOL_PREFERRED:
-		polnid = pol->v.preferred_node;
+		polnid = first_node(pol->nodes);
 		break;
 
 	case MPOL_LOCAL:
@@ -2469,7 +2461,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 	case MPOL_BIND:
 		/* Optimize placement among multiple nodes via NUMA balancing */
 		if (pol->flags & MPOL_F_MORON) {
-			if (node_isset(thisnid, pol->v.nodes))
+			if (node_isset(thisnid, pol->nodes))
 				break;
 			goto out;
 		}
@@ -2480,12 +2472,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 * else select nearest allowed node, if any.
 		 * If no allowed nodes, use current [!misplaced].
 		 */
-		if (node_isset(curnid, pol->v.nodes))
+		if (node_isset(curnid, pol->nodes))
 			goto out;
 		z = first_zones_zonelist(
 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
 				gfp_zone(GFP_HIGHUSER),
-				&pol->v.nodes);
+				&pol->nodes);
 		polnid = zone_to_nid(z->zone);
 		break;
 
@@ -2688,7 +2680,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 		 vma->vm_pgoff,
 		 sz, npol ? npol->mode : -1,
 		 npol ? npol->flags : -1,
-		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
+		 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -2786,7 +2778,7 @@ void __init numa_policy_init(void)
 			.refcnt = ATOMIC_INIT(1),
 			.mode = MPOL_PREFERRED,
 			.flags = MPOL_F_MOF | MPOL_F_MORON,
-			.v = { .preferred_node = nid, },
+			.nodes = nodemask_of_node(nid),
 		};
 	}
 
@@ -2945,12 +2937,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
 	 */
-	if (mode != MPOL_PREFERRED)
-		new->v.nodes = nodes;
-	else if (nodelist)
-		new->v.preferred_node = first_node(nodes);
-	else
+	if (mode != MPOL_PREFERRED) {
+		new->nodes = nodes;
+	} else if (nodelist) {
+		nodes_clear(new->nodes);
+		node_set(first_node(nodes), new->nodes);
+	} else {
 		new->mode = MPOL_LOCAL;
+	}
 
 	/*
 	 * Save nodes for contextualization: this will be used to "clone"
@@ -2999,11 +2993,9 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	case MPOL_LOCAL:
 		break;
 	case MPOL_PREFERRED:
-		node_set(pol->v.preferred_node, nodes);
-		break;
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
-		nodes = pol->v.nodes;
+		nodes = pol->nodes;
 		break;
 	default:
 		WARN_ON_ONCE(1);
-- 
cgit v1.2.3


From 51c656aef629bae94f2b07fcee7eabe280b905ea Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 30 Jun 2021 18:51:13 -0700
Subject: include/linux/mmzone.h: add documentation for pfn_valid()

Patch series "arm64: drop pfn_valid_within() and simplify pfn_valid()", v4.

These patches aim to remove CONFIG_HOLES_IN_ZONE and essentially hardwire
pfn_valid_within() to 1.

The idea is to mark NOMAP pages as reserved in the memory map and restore
the intended semantics of pfn_valid() to designate availability of struct
page for a pfn.

With this the core mm will be able to cope with the fact that it cannot
use NOMAP pages and the holes created by NOMAP ranges within MAX_ORDER
blocks will be treated correctly even without the need for
pfn_valid_within.

This patch (of 4):

Add comment describing the semantics of pfn_valid() that clarifies that
pfn_valid() only checks for availability of a memory map entry (i.e.
struct page) for a PFN rather than availability of usable memory backing
that PFN.

The most "generic" version of pfn_valid() used by the configurations with
SPARSEMEM enabled resides in include/linux/mmzone.h so this is the most
suitable place for documentation about semantics of pfn_valid().

Link: https://lkml.kernel.org/r/20210511100550.28178-1-rppt@kernel.org
Link: https://lkml.kernel.org/r/20210511100550.28178-2-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Suggested-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 265a32e1ff74..7da43337ad23 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1445,6 +1445,17 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
 #endif
 
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
+/**
+ * pfn_valid - check if there is a valid memory map entry for a PFN
+ * @pfn: the page frame number to check
+ *
+ * Check if there is a valid memory map entry aka struct page for the @pfn.
+ * Note, that availability of the memory map entry does not imply that
+ * there is actual usable memory at that @pfn. The struct page may
+ * represent a hole or an unusable page frame.
+ *
+ * Return: 1 for PFNs that have memory map entries and 0 otherwise
+ */
 static inline int pfn_valid(unsigned long pfn)
 {
 	struct mem_section *ms;
-- 
cgit v1.2.3


From 9092d4f7a1f846bcc72e9aace4ed64ed3fc4aa32 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 30 Jun 2021 18:51:16 -0700
Subject: memblock: update initialization of reserved pages

The struct pages representing a reserved memory region are initialized
using reserve_bootmem_range() function.  This function is called for each
reserved region just before the memory is freed from memblock to the buddy
page allocator.

The struct pages for MEMBLOCK_NOMAP regions are kept with the default
values set by the memory map initialization which makes it necessary to
have a special treatment for such pages in pfn_valid() and
pfn_valid_within().

Split out initialization of the reserved pages to a function with a
meaningful name and treat the MEMBLOCK_NOMAP regions the same way as the
reserved regions and mark struct pages for the NOMAP regions as
PageReserved.

Link: https://lkml.kernel.org/r/20210511100550.28178-3-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  4 +++-
 mm/memblock.c            | 28 ++++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 552309342c38..cbf46f56d105 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -30,7 +30,9 @@ extern unsigned long long max_possible_pfn;
  * @MEMBLOCK_NONE: no special request
  * @MEMBLOCK_HOTPLUG: hotpluggable region
  * @MEMBLOCK_MIRROR: mirrored region
- * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
+ * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as
+ * reserved in the memory map; refer to memblock_mark_nomap() description
+ * for further details
  */
 enum memblock_flags {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
diff --git a/mm/memblock.c b/mm/memblock.c
index 123feef5259d..3e4acbf03ab7 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -906,6 +906,11 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
+ * The memory regions marked with %MEMBLOCK_NOMAP will not be added to the
+ * direct mapping of the physical memory. These regions will still be
+ * covered by the memory map. The struct page representing NOMAP memory
+ * frames in the memory map will be PageReserved()
+ *
  * Return: 0 on success, -errno on failure.
  */
 int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
@@ -2002,6 +2007,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
 	return end_pfn - start_pfn;
 }
 
+static void __init memmap_init_reserved_pages(void)
+{
+	struct memblock_region *region;
+	phys_addr_t start, end;
+	u64 i;
+
+	/* initialize struct pages for the reserved regions */
+	for_each_reserved_mem_range(i, &start, &end)
+		reserve_bootmem_region(start, end);
+
+	/* and also treat struct pages for the NOMAP regions as PageReserved */
+	for_each_mem_region(region) {
+		if (memblock_is_nomap(region)) {
+			start = region->base;
+			end = start + region->size;
+			reserve_bootmem_region(start, end);
+		}
+	}
+}
+
 static unsigned long __init free_low_memory_core_early(void)
 {
 	unsigned long count = 0;
@@ -2010,8 +2035,7 @@ static unsigned long __init free_low_memory_core_early(void)
 
 	memblock_clear_hotplug(0, -1);
 
-	for_each_reserved_mem_range(i, &start, &end)
-		reserve_bootmem_region(start, end);
+	memmap_init_reserved_pages();
 
 	/*
 	 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
-- 
cgit v1.2.3


From 16c9afc776608324ca71c0bc354987bab532f51d Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Wed, 30 Jun 2021 18:51:26 -0700
Subject: arm64/mm: drop HAVE_ARCH_PFN_VALID

CONFIG_SPARSEMEM_VMEMMAP is now the only available memory model on arm64
platforms and free_unused_memmap() would just return without creating any
holes in the memmap mapping.  There is no need for any special handling in
pfn_valid() and HAVE_ARCH_PFN_VALID can just be dropped.  This also moves
the pfn upper bits sanity check into generic pfn_valid().

Link: https://lkml.kernel.org/r/1621947349-25421-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/Kconfig            |  1 -
 arch/arm64/include/asm/page.h |  1 -
 arch/arm64/mm/init.c          | 37 -------------------------------------
 include/linux/mmzone.h        |  9 +++++++++
 4 files changed, 9 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 59c9255ab9d0..84f0216d3c5c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -154,7 +154,6 @@ config ARM64
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
-	select HAVE_ARCH_PFN_VALID
 	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 75ddfe671393..fcbef3eec4b2 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -37,7 +37,6 @@ void copy_highpage(struct page *to, struct page *from);
 
 typedef struct page *pgtable_t;
 
-int pfn_valid(unsigned long pfn);
 int pfn_is_map_memory(unsigned long pfn);
 
 #include <asm/memory.h>
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 725aa84f2faa..49019ea0c8a8 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -219,43 +219,6 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
 	free_area_init(max_zone_pfns);
 }
 
-int pfn_valid(unsigned long pfn)
-{
-	phys_addr_t addr = PFN_PHYS(pfn);
-	struct mem_section *ms;
-
-	/*
-	 * Ensure the upper PAGE_SHIFT bits are clear in the
-	 * pfn. Else it might lead to false positives when
-	 * some of the upper bits are set, but the lower bits
-	 * match a valid pfn.
-	 */
-	if (PHYS_PFN(addr) != pfn)
-		return 0;
-
-	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
-		return 0;
-
-	ms = __pfn_to_section(pfn);
-	if (!valid_section(ms))
-		return 0;
-
-	/*
-	 * ZONE_DEVICE memory does not have the memblock entries.
-	 * memblock_is_map_memory() check for ZONE_DEVICE based
-	 * addresses will always fail. Even the normal hotplugged
-	 * memory will never have MEMBLOCK_NOMAP flag set in their
-	 * memblock entries. Skip memblock search for all non early
-	 * memory sections covering all of hotplug memory including
-	 * both normal and ZONE_DEVICE based.
-	 */
-	if (!early_section(ms))
-		return pfn_section_valid(ms, pfn);
-
-	return memblock_is_memory(addr);
-}
-EXPORT_SYMBOL(pfn_valid);
-
 int pfn_is_map_memory(unsigned long pfn)
 {
 	phys_addr_t addr = PFN_PHYS(pfn);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7da43337ad23..7bc7e41b6c31 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1460,6 +1460,15 @@ static inline int pfn_valid(unsigned long pfn)
 {
 	struct mem_section *ms;
 
+	/*
+	 * Ensure the upper PAGE_SHIFT bits are clear in the
+	 * pfn. Else it might lead to false positives when
+	 * some of the upper bits are set, but the lower bits
+	 * match a valid pfn.
+	 */
+	if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
+		return 0;
+
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
 	ms = __nr_to_section(pfn_to_section_nr(pfn));
-- 
cgit v1.2.3


From 6acfb5ba150cf75005ce85e0e25d79ef2fec287c Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 30 Jun 2021 18:51:29 -0700
Subject: mm: migrate: fix missing update page_private to hugetlb_page_subpool

Since commit d6995da31122 ("hugetlb: use page.private for hugetlb specific
page flags") converts page.private for hugetlb specific page flags.  We
should use hugetlb_page_subpool() to get the subpool pointer instead of
page_private().

This 'could' prevent the migration of hugetlb pages.  page_private(hpage)
is now used for hugetlb page specific flags.  At migration time, the only
flag which could be set is HPageVmemmapOptimized.  This flag will only be
set if the new vmemmap reduction feature is enabled.  In addition,
!page_mapping() implies an anonymous mapping.  So, this will prevent
migration of hugetb pages in anonymous mappings if the vmemmap reduction
feature is enabled.

In addition, that if statement checked for the rare race condition of a
page being migrated while in the process of being freed.  Since that check
is now wrong, we could leak hugetlb subpool usage counts.

The commit forgot to update it in the page migration routine.  So fix it.

[songmuchun@bytedance.com: fix compiler error when !CONFIG_HUGETLB_PAGE reported by Randy]
  Link: https://lkml.kernel.org/r/20210521022747.35736-1-songmuchun@bytedance.com

Link: https://lkml.kernel.org/r/20210520025949.1866-1-songmuchun@bytedance.com
Fixes: d6995da31122 ("hugetlb: use page.private for hugetlb specific page flags")
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reported-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Tested-by: Anshuman Khandual <anshuman.khandual@arm.com>	[arm64]
Cc: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 5 +++++
 mm/migrate.c            | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f11ba701e199..a58e11f2db15 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -898,6 +898,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
+static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
+{
+	return NULL;
+}
+
 static inline int isolate_or_dissolve_huge_page(struct page *page,
 						struct list_head *list)
 {
diff --git a/mm/migrate.c b/mm/migrate.c
index 8fc766e52e52..9bd6bc12aa72 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1293,7 +1293,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	 * page_mapping() set, hugetlbfs specific move page routine will not
 	 * be called and we could leak usage counts for subpools.
 	 */
-	if (page_private(hpage) && !page_mapping(hpage)) {
+	if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
 		rc = -EBUSY;
 		goto out_unlock;
 	}
-- 
cgit v1.2.3


From 5db4f15c4fd7ae74dd40c6f84bf56dfcf13d10cf Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Wed, 30 Jun 2021 18:51:35 -0700
Subject: mm: memory: add orig_pmd to struct vm_fault

Pach series "mm: thp: use generic THP migration for NUMA hinting fault", v3.

When the THP NUMA fault support was added THP migration was not supported
yet.  So the ad hoc THP migration was implemented in NUMA fault handling.
Since v4.14 THP migration has been supported so it doesn't make too much
sense to still keep another THP migration implementation rather than using
the generic migration code.  It is definitely a maintenance burden to keep
two THP migration implementation for different code paths and it is more
error prone.  Using the generic THP migration implementation allows us
remove the duplicate code and some hacks needed by the old ad hoc
implementation.

A quick grep shows x86_64, PowerPC (book3s), ARM64 ans S390 support both
THP and NUMA balancing.  The most of them support THP migration except for
S390.  Zi Yan tried to add THP migration support for S390 before but it
was not accepted due to the design of S390 PMD.  For the discussion,
please see: https://lkml.org/lkml/2018/4/27/953.

Per the discussion with Gerald Schaefer in v1 it is acceptible to skip
huge PMD for S390 for now.

I saw there were some hacks about gup from git history, but I didn't
figure out if they have been removed or not since I just found FOLL_NUMA
code in the current gup implementation and they seems useful.

Patch #1 ~ #2 are preparation patches.
Patch #3 is the real meat.
Patch #4 ~ #6 keep consistent counters and behaviors with before.
Patch #7 skips change huge PMD to prot_none if thp migration is not supported.

Test
----
Did some tests to measure the latency of do_huge_pmd_numa_page.  The test
VM has 80 vcpus and 64G memory.  The test would create 2 processes to
consume 128G memory together which would incur memory pressure to cause
THP splits.  And it also creates 80 processes to hog cpu, and the memory
consumer processes are bound to different nodes periodically in order to
increase NUMA faults.

The below test script is used:

echo 3 > /proc/sys/vm/drop_caches

# Run stress-ng for 24 hours
./stress-ng/stress-ng --vm 2 --vm-bytes 64G --timeout 24h &
PID=$!

./stress-ng/stress-ng --cpu $NR_CPUS --timeout 24h &

# Wait for vm stressors forked
sleep 5

PID_1=`pgrep -P $PID | awk 'NR == 1'`
PID_2=`pgrep -P $PID | awk 'NR == 2'`

JOB1=`pgrep -P $PID_1`
JOB2=`pgrep -P $PID_2`

# Bind load jobs to different nodes periodically to force generate
# cross node memory access
while [ -d "/proc/$PID" ]
do
        taskset -apc 8 $JOB1
        taskset -apc 8 $JOB2
        sleep 300
        taskset -apc 58 $JOB1
        taskset -apc 58 $JOB2
        sleep 300
done

With the above test the histogram of latency of do_huge_pmd_numa_page is
as shown below.  Since the number of do_huge_pmd_numa_page varies
drastically for each run (should be due to scheduler), so I converted the
raw number to percentage.

                             patched               base
@us[stress-ng]:
[0]                          3.57%                 0.16%
[1]                          55.68%                18.36%
[2, 4)                       10.46%                40.44%
[4, 8)                       7.26%                 17.82%
[8, 16)                      21.12%                13.41%
[16, 32)                     1.06%                 4.27%
[32, 64)                     0.56%                 4.07%
[64, 128)                    0.16%                 0.35%
[128, 256)                   < 0.1%                < 0.1%
[256, 512)                   < 0.1%                < 0.1%
[512, 1K)                    < 0.1%                < 0.1%
[1K, 2K)                     < 0.1%                < 0.1%
[2K, 4K)                     < 0.1%                < 0.1%
[4K, 8K)                     < 0.1%                < 0.1%
[8K, 16K)                    < 0.1%                < 0.1%
[16K, 32K)                   < 0.1%                < 0.1%
[32K, 64K)                   < 0.1%                < 0.1%

Per the result, patched kernel is even slightly better than the base
kernel.  I think this is because the lock contention against THP split is
less than base kernel due to the refactor.

To exclude the affect from THP split, I also did test w/o memory pressure.
No obvious regression is spotted.  The below is the test result *w/o*
memory pressure.

                           patched                  base
@us[stress-ng]:
[0]                        7.97%                   18.4%
[1]                        69.63%                  58.24%
[2, 4)                     4.18%                   2.63%
[4, 8)                     0.22%                   0.17%
[8, 16)                    1.03%                   0.92%
[16, 32)                   0.14%                   < 0.1%
[32, 64)                   < 0.1%                  < 0.1%
[64, 128)                  < 0.1%                  < 0.1%
[128, 256)                 < 0.1%                  < 0.1%
[256, 512)                 0.45%                   1.19%
[512, 1K)                  15.45%                  17.27%
[1K, 2K)                   < 0.1%                  < 0.1%
[2K, 4K)                   < 0.1%                  < 0.1%
[4K, 8K)                   < 0.1%                  < 0.1%
[8K, 16K)                  0.86%                   0.88%
[16K, 32K)                 < 0.1%                  0.15%
[32K, 64K)                 < 0.1%                  < 0.1%
[64K, 128K)                < 0.1%                  < 0.1%
[128K, 256K)               < 0.1%                  < 0.1%

The series also survived a series of tests that exercise NUMA balancing
migrations by Mel.

This patch (of 7):

Add orig_pmd to struct vm_fault so the "orig_pmd" parameter used by huge
page fault could be removed, just like its PTE counterpart does.

Link: https://lkml.kernel.org/r/20210518200801.7413-1-shy828301@gmail.com
Link: https://lkml.kernel.org/r/20210518200801.7413-2-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h |  9 ++++-----
 include/linux/mm.h      |  7 ++++++-
 mm/huge_memory.c        |  9 ++++++---
 mm/memory.c             | 26 +++++++++++++-------------
 4 files changed, 29 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 939f21b69ead..f123e15d966e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
+void huge_pmd_set_accessed(struct vm_fault *vmf);
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
 		  struct vm_area_struct *vma);
@@ -24,7 +24,7 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 }
 #endif
 
-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr, pmd_t *pmd,
 				   unsigned int flags);
@@ -288,7 +288,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 		pud_t *pud, int flags, struct dev_pagemap **pgmap);
 
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
 extern struct page *huge_zero_page;
 extern unsigned long huge_zero_pfn;
@@ -441,8 +441,7 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 	return NULL;
 }
 
-static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf,
-		pmd_t orig_pmd)
+static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
 	return 0;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index aa875dacd9c3..3cbd2d6d248e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -550,7 +550,12 @@ struct vm_fault {
 	pud_t *pud;			/* Pointer to pud entry matching
 					 * the 'address'
 					 */
-	pte_t orig_pte;			/* Value of PTE at the time of fault */
+	union {
+		pte_t orig_pte;		/* Value of PTE at the time of fault */
+		pmd_t orig_pmd;		/* Value of PMD at the time of fault,
+					 * used by PMD fault only.
+					 */
+	};
 
 	struct page *cow_page;		/* Page handler may use for COW fault */
 	struct page *page;		/* ->fault handlers should return a
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40a90ff18180..f3a3138d33e9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1257,11 +1257,12 @@ unlock:
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf)
 {
 	pmd_t entry;
 	unsigned long haddr;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	pmd_t orig_pmd = vmf->orig_pmd;
 
 	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1278,11 +1279,12 @@ unlock:
 	spin_unlock(vmf->ptl);
 }
 
-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	pmd_t orig_pmd = vmf->orig_pmd;
 
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1418,9 +1420,10 @@ out:
 }
 
 /* NUMA hinting page fault entry point for trans huge pmds */
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	pmd_t pmd = vmf->orig_pmd;
 	struct anon_vma *anon_vma = NULL;
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
diff --git a/mm/memory.c b/mm/memory.c
index b535a1d0317d..5029d793dc3e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4298,12 +4298,12 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 }
 
 /* `inline' is required to avoid gcc 4.1.2 build error */
-static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
 	if (vma_is_anonymous(vmf->vma)) {
-		if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+		if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
 			return handle_userfault(vmf, VM_UFFD_WP);
-		return do_huge_pmd_wp_page(vmf, orig_pmd);
+		return do_huge_pmd_wp_page(vmf);
 	}
 	if (vmf->vma->vm_ops->huge_fault) {
 		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
@@ -4530,26 +4530,26 @@ retry_pud:
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
 	} else {
-		pmd_t orig_pmd = *vmf.pmd;
+		vmf.orig_pmd = *vmf.pmd;
 
 		barrier();
-		if (unlikely(is_swap_pmd(orig_pmd))) {
+		if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
 			VM_BUG_ON(thp_migration_supported() &&
-					  !is_pmd_migration_entry(orig_pmd));
-			if (is_pmd_migration_entry(orig_pmd))
+					  !is_pmd_migration_entry(vmf.orig_pmd));
+			if (is_pmd_migration_entry(vmf.orig_pmd))
 				pmd_migration_entry_wait(mm, vmf.pmd);
 			return 0;
 		}
-		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
-				return do_huge_pmd_numa_page(&vmf, orig_pmd);
+		if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+				return do_huge_pmd_numa_page(&vmf);
 
-			if (dirty && !pmd_write(orig_pmd)) {
-				ret = wp_huge_pmd(&vmf, orig_pmd);
+			if (dirty && !pmd_write(vmf.orig_pmd)) {
+				ret = wp_huge_pmd(&vmf);
 				if (!(ret & VM_FAULT_FALLBACK))
 					return ret;
 			} else {
-				huge_pmd_set_accessed(&vmf, orig_pmd);
+				huge_pmd_set_accessed(&vmf);
 				return 0;
 			}
 		}
-- 
cgit v1.2.3


From c5b5a3dd2c1fa61049b7789ce596faff4d659a61 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Wed, 30 Jun 2021 18:51:42 -0700
Subject: mm: thp: refactor NUMA fault handling

When the THP NUMA fault support was added THP migration was not supported
yet.  So the ad hoc THP migration was implemented in NUMA fault handling.
Since v4.14 THP migration has been supported so it doesn't make too much
sense to still keep another THP migration implementation rather than using
the generic migration code.

This patch reworks the NUMA fault handling to use generic migration
implementation to migrate misplaced page.  There is no functional change.

After the refactor the flow of NUMA fault handling looks just like its
PTE counterpart:
  Acquire ptl
  Prepare for migration (elevate page refcount)
  Release ptl
  Isolate page from lru and elevate page refcount
  Migrate the misplaced THP

If migration fails just restore the old normal PMD.

In the old code anon_vma lock was needed to serialize THP migration
against THP split, but since then the THP code has been reworked a lot, it
seems anon_vma lock is not required anymore to avoid the race.

The page refcount elevation when holding ptl should prevent from THP
split.

Use migrate_misplaced_page() for both base page and THP NUMA hinting fault
and remove all the dead and duplicate code.

[dan.carpenter@oracle.com: fix a double unlock bug]
  Link: https://lkml.kernel.org/r/YLX8uYN01JmfLnlK@mwanda

Link: https://lkml.kernel.org/r/20210518200801.7413-4-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h |  23 -------
 mm/huge_memory.c        | 145 ++++++++++++---------------------------
 mm/internal.h           |  18 -----
 mm/migrate.c            | 177 +++++++++---------------------------------------
 4 files changed, 77 insertions(+), 286 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7b7b73977278..9b7b7cd3bae9 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -99,14 +99,9 @@ static inline void __ClearPageMovable(struct page *page)
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
-extern bool pmd_trans_migrating(pmd_t pmd);
 extern int migrate_misplaced_page(struct page *page,
 				  struct vm_area_struct *vma, int node);
 #else
-static inline bool pmd_trans_migrating(pmd_t pmd)
-{
-	return false;
-}
 static inline int migrate_misplaced_page(struct page *page,
 					 struct vm_area_struct *vma, int node)
 {
@@ -114,24 +109,6 @@ static inline int migrate_misplaced_page(struct page *page,
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
-			struct vm_area_struct *vma,
-			pmd_t *pmd, pmd_t entry,
-			unsigned long address,
-			struct page *page, int node);
-#else
-static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
-			struct vm_area_struct *vma,
-			pmd_t *pmd, pmd_t entry,
-			unsigned long address,
-			struct page *page, int node)
-{
-	return -EAGAIN;
-}
-#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
-
-
 #ifdef CONFIG_MIGRATION
 
 /*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f3a3138d33e9..de7a8c2a0c80 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1423,92 +1423,20 @@ out:
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	pmd_t pmd = vmf->orig_pmd;
-	struct anon_vma *anon_vma = NULL;
+	pmd_t oldpmd = vmf->orig_pmd;
+	pmd_t pmd;
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
+	int page_nid = NUMA_NO_NODE;
 	int target_nid, last_cpupid = -1;
-	bool page_locked;
 	bool migrated = false;
-	bool was_writable;
+	bool was_writable = pmd_savedwrite(oldpmd);
 	int flags = 0;
 
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-	if (unlikely(!pmd_same(pmd, *vmf->pmd)))
-		goto out_unlock;
-
-	/*
-	 * If there are potential migrations, wait for completion and retry
-	 * without disrupting NUMA hinting information. Do not relock and
-	 * check_same as the page may no longer be mapped.
-	 */
-	if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
-		page = pmd_page(*vmf->pmd);
-		if (!get_page_unless_zero(page))
-			goto out_unlock;
-		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
-		goto out;
-	}
-
-	page = pmd_page(pmd);
-	BUG_ON(is_huge_zero_page(page));
-	page_nid = page_to_nid(page);
-	last_cpupid = page_cpupid_last(page);
-	count_vm_numa_event(NUMA_HINT_FAULTS);
-	if (page_nid == this_nid) {
-		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
-		flags |= TNF_FAULT_LOCAL;
-	}
-
-	/* See similar comment in do_numa_page for explanation */
-	if (!pmd_savedwrite(pmd))
-		flags |= TNF_NO_GROUP;
-
-	/*
-	 * Acquire the page lock to serialise THP migrations but avoid dropping
-	 * page_table_lock if at all possible
-	 */
-	page_locked = trylock_page(page);
-	target_nid = mpol_misplaced(page, vma, haddr);
-	/* Migration could have started since the pmd_trans_migrating check */
-	if (!page_locked) {
-		page_nid = NUMA_NO_NODE;
-		if (!get_page_unless_zero(page))
-			goto out_unlock;
+	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
 		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 		goto out;
-	} else if (target_nid == NUMA_NO_NODE) {
-		/* There are no parallel migrations and page is in the right
-		 * node. Clear the numa hinting info in this pmd.
-		 */
-		goto clear_pmdnuma;
-	}
-
-	/*
-	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
-	 * to serialises splits
-	 */
-	get_page(page);
-	spin_unlock(vmf->ptl);
-	anon_vma = page_lock_anon_vma_read(page);
-
-	/* Confirm the PMD did not change while page_table_lock was released */
-	spin_lock(vmf->ptl);
-	if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
-		unlock_page(page);
-		put_page(page);
-		page_nid = NUMA_NO_NODE;
-		goto out_unlock;
-	}
-
-	/* Bail if we fail to protect against THP splits for any reason */
-	if (unlikely(!anon_vma)) {
-		put_page(page);
-		page_nid = NUMA_NO_NODE;
-		goto clear_pmdnuma;
 	}
 
 	/*
@@ -1537,43 +1465,58 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 					      haddr + HPAGE_PMD_SIZE);
 	}
 
-	/*
-	 * Migrate the THP to the requested node, returns with page unlocked
-	 * and access rights restored.
-	 */
+	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+	page = vm_normal_page_pmd(vma, haddr, pmd);
+	if (!page)
+		goto out_map;
+
+	/* See similar comment in do_numa_page for explanation */
+	if (!was_writable)
+		flags |= TNF_NO_GROUP;
+
+	page_nid = page_to_nid(page);
+	last_cpupid = page_cpupid_last(page);
+	target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
+				       &flags);
+
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out_map;
+	}
+
 	spin_unlock(vmf->ptl);
 
-	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
-				vmf->pmd, pmd, vmf->address, page, target_nid);
+	migrated = migrate_misplaced_page(page, vma, target_nid);
 	if (migrated) {
 		flags |= TNF_MIGRATED;
 		page_nid = target_nid;
-	} else
+	} else {
 		flags |= TNF_MIGRATE_FAIL;
-
-	goto out;
-clear_pmdnuma:
-	BUG_ON(!PageLocked(page));
-	was_writable = pmd_savedwrite(pmd);
-	pmd = pmd_modify(pmd, vma->vm_page_prot);
-	pmd = pmd_mkyoung(pmd);
-	if (was_writable)
-		pmd = pmd_mkwrite(pmd);
-	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
-	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-	unlock_page(page);
-out_unlock:
-	spin_unlock(vmf->ptl);
+		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+			spin_unlock(vmf->ptl);
+			goto out;
+		}
+		goto out_map;
+	}
 
 out:
-	if (anon_vma)
-		page_unlock_anon_vma_read(anon_vma);
-
 	if (page_nid != NUMA_NO_NODE)
 		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
 				flags);
 
 	return 0;
+
+out_map:
+	/* Restore the PMD */
+	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+	pmd = pmd_mkyoung(pmd);
+	if (was_writable)
+		pmd = pmd_mkwrite(pmd);
+	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+	spin_unlock(vmf->ptl);
+	goto out;
 }
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index a4942f9867a3..d565eb94d6ec 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -369,23 +369,6 @@ extern unsigned int munlock_vma_page(struct page *page);
  */
 extern void clear_page_mlock(struct page *page);
 
-/*
- * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
- * (because that does not go through the full procedure of migration ptes):
- * to migrate the Mlocked page flag; update statistics.
- */
-static inline void mlock_migrate_page(struct page *newpage, struct page *page)
-{
-	if (TestClearPageMlocked(page)) {
-		int nr_pages = thp_nr_pages(page);
-
-		/* Holding pmd lock, no change in irq context: __mod is safe */
-		__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
-		SetPageMlocked(newpage);
-		__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
-	}
-}
-
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
 /*
@@ -461,7 +444,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
 #else /* !CONFIG_MMU */
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
-static inline void mlock_migrate_page(struct page *new, struct page *old) { }
 static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
 {
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 9bd6bc12aa72..b7e330900b86 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2048,6 +2048,23 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 	return newpage;
 }
 
+static struct page *alloc_misplaced_dst_page_thp(struct page *page,
+						 unsigned long data)
+{
+	int nid = (int) data;
+	struct page *newpage;
+
+	newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
+				   HPAGE_PMD_ORDER);
+	if (!newpage)
+		goto out;
+
+	prep_transhuge_page(newpage);
+
+out:
+	return newpage;
+}
+
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
 	int page_lru;
@@ -2086,12 +2103,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 	return 1;
 }
 
-bool pmd_trans_migrating(pmd_t pmd)
-{
-	struct page *page = pmd_page(pmd);
-	return PageLocked(page);
-}
-
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -2104,6 +2115,20 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	int isolated;
 	int nr_remaining;
 	LIST_HEAD(migratepages);
+	new_page_t *new;
+	bool compound;
+
+	/*
+	 * PTE mapped THP or HugeTLB page can't reach here so the page could
+	 * be either base page or THP.  And it must be head page if it is
+	 * THP.
+	 */
+	compound = PageTransHuge(page);
+
+	if (compound)
+		new = alloc_misplaced_dst_page_thp;
+	else
+		new = alloc_misplaced_dst_page;
 
 	/*
 	 * Don't migrate file pages that are mapped in multiple processes
@@ -2125,9 +2150,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 		goto out;
 
 	list_add(&page->lru, &migratepages);
-	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-				     NULL, node, MIGRATE_ASYNC,
-				     MR_NUMA_MISPLACED);
+	nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
+				     MIGRATE_ASYNC, MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
@@ -2146,141 +2170,6 @@ out:
 	return 0;
 }
 #endif /* CONFIG_NUMA_BALANCING */
-
-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * Migrates a THP to a given target node. page must be locked and is unlocked
- * before returning.
- */
-int migrate_misplaced_transhuge_page(struct mm_struct *mm,
-				struct vm_area_struct *vma,
-				pmd_t *pmd, pmd_t entry,
-				unsigned long address,
-				struct page *page, int node)
-{
-	spinlock_t *ptl;
-	pg_data_t *pgdat = NODE_DATA(node);
-	int isolated = 0;
-	struct page *new_page = NULL;
-	int page_lru = page_is_file_lru(page);
-	unsigned long start = address & HPAGE_PMD_MASK;
-
-	new_page = alloc_pages_node(node,
-		(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
-		HPAGE_PMD_ORDER);
-	if (!new_page)
-		goto out_fail;
-	prep_transhuge_page(new_page);
-
-	isolated = numamigrate_isolate_page(pgdat, page);
-	if (!isolated) {
-		put_page(new_page);
-		goto out_fail;
-	}
-
-	/* Prepare a page as a migration target */
-	__SetPageLocked(new_page);
-	if (PageSwapBacked(page))
-		__SetPageSwapBacked(new_page);
-
-	/* anon mapping, we can simply copy page->mapping to the new page: */
-	new_page->mapping = page->mapping;
-	new_page->index = page->index;
-	/* flush the cache before copying using the kernel virtual address */
-	flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
-	migrate_page_copy(new_page, page);
-	WARN_ON(PageLRU(new_page));
-
-	/* Recheck the target PMD */
-	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
-		spin_unlock(ptl);
-
-		/* Reverse changes made by migrate_page_copy() */
-		if (TestClearPageActive(new_page))
-			SetPageActive(page);
-		if (TestClearPageUnevictable(new_page))
-			SetPageUnevictable(page);
-
-		unlock_page(new_page);
-		put_page(new_page);		/* Free it */
-
-		/* Retake the callers reference and putback on LRU */
-		get_page(page);
-		putback_lru_page(page);
-		mod_node_page_state(page_pgdat(page),
-			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-
-		goto out_unlock;
-	}
-
-	entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-
-	/*
-	 * Overwrite the old entry under pagetable lock and establish
-	 * the new PTE. Any parallel GUP will either observe the old
-	 * page blocking on the page lock, block on the page table
-	 * lock or observe the new page. The SetPageUptodate on the
-	 * new page and page_add_new_anon_rmap guarantee the copy is
-	 * visible before the pagetable update.
-	 */
-	page_add_anon_rmap(new_page, vma, start, true);
-	/*
-	 * At this point the pmd is numa/protnone (i.e. non present) and the TLB
-	 * has already been flushed globally.  So no TLB can be currently
-	 * caching this non present pmd mapping.  There's no need to clear the
-	 * pmd before doing set_pmd_at(), nor to flush the TLB after
-	 * set_pmd_at().  Clearing the pmd here would introduce a race
-	 * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
-	 * mmap_lock for reading.  If the pmd is set to NULL at any given time,
-	 * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
-	 * pmd.
-	 */
-	set_pmd_at(mm, start, pmd, entry);
-	update_mmu_cache_pmd(vma, address, &entry);
-
-	page_ref_unfreeze(page, 2);
-	mlock_migrate_page(new_page, page);
-	page_remove_rmap(page, true);
-	set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
-
-	spin_unlock(ptl);
-
-	/* Take an "isolate" reference and put new page on the LRU. */
-	get_page(new_page);
-	putback_lru_page(new_page);
-
-	unlock_page(new_page);
-	unlock_page(page);
-	put_page(page);			/* Drop the rmap reference */
-	put_page(page);			/* Drop the LRU isolation reference */
-
-	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
-	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-
-	mod_node_page_state(page_pgdat(page),
-			NR_ISOLATED_ANON + page_lru,
-			-HPAGE_PMD_NR);
-	return isolated;
-
-out_fail:
-	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-	ptl = pmd_lock(mm, pmd);
-	if (pmd_same(*pmd, entry)) {
-		entry = pmd_modify(entry, vma->vm_page_prot);
-		set_pmd_at(mm, start, pmd, entry);
-		update_mmu_cache_pmd(vma, address, &entry);
-	}
-	spin_unlock(ptl);
-
-out_unlock:
-	unlock_page(page);
-	put_page(page);
-	return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DEVICE_PRIVATE
-- 
cgit v1.2.3


From 1fb08ac63beedf58e2ae9f229ea1f9474949a185 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Wed, 30 Jun 2021 18:52:01 -0700
Subject: mm: rmap: make try_to_unmap() void function

Currently try_to_unmap() return bool value by checking page_mapcount(),
however this may return false positive since page_mapcount() doesn't check
all subpages of compound page.  The total_mapcount() could be used
instead, but its cost is higher since it traverses all subpages.

Actually the most callers of try_to_unmap() don't care about the return
value at all.  So just need check if page is still mapped by page_mapped()
when necessary.  And page_mapped() does bail out early when it finds
mapped subpage.

Link: https://lkml.kernel.org/r/bb27e3fe-6036-b637-5086-272befbfe3da@google.com
Suggested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jue Wang <juew@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Wang Yugui <wangyugui@e16-tech.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  2 +-
 mm/memory-failure.c  | 15 +++++++--------
 mm/rmap.c            | 15 ++++-----------
 mm/vmscan.c          |  3 ++-
 4 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 8d04e7deedc6..ed31a559e857 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -195,7 +195,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
 int page_referenced(struct page *, int is_locked,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
 
-bool try_to_unmap(struct page *, enum ttu_flags flags);
+void try_to_unmap(struct page *, enum ttu_flags flags);
 
 /* Avoid racy checks */
 #define PVMW_SYNC		(1 << 0)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9d2d31ffe8a4..419d92b3225d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1269,7 +1269,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	enum ttu_flags ttu = TTU_IGNORE_MLOCK;
 	struct address_space *mapping;
 	LIST_HEAD(tokill);
-	bool unmap_success = true;
+	bool unmap_success;
 	int kill = 1, forcekill;
 	struct page *hpage = *hpagep;
 	bool mlocked = PageMlocked(hpage);
@@ -1332,7 +1332,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
 	if (!PageHuge(hpage)) {
-		unmap_success = try_to_unmap(hpage, ttu);
+		try_to_unmap(hpage, ttu);
 	} else {
 		if (!PageAnon(hpage)) {
 			/*
@@ -1344,17 +1344,16 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 			 */
 			mapping = hugetlb_page_mapping_lock_write(hpage);
 			if (mapping) {
-				unmap_success = try_to_unmap(hpage,
-						     ttu|TTU_RMAP_LOCKED);
+				try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
 				i_mmap_unlock_write(mapping);
-			} else {
+			} else
 				pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
-				unmap_success = false;
-			}
 		} else {
-			unmap_success = try_to_unmap(hpage, ttu);
+			try_to_unmap(hpage, ttu);
 		}
 	}
+
+	unmap_success = !page_mapped(hpage);
 	if (!unmap_success)
 		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
 		       pfn, page_mapcount(hpage));
diff --git a/mm/rmap.c b/mm/rmap.c
index e05c300048e6..f9fd5bc54f0a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1405,7 +1405,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	/*
 	 * When racing against e.g. zap_pte_range() on another cpu,
 	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
-	 * try_to_unmap() may return false when it is about to become true,
+	 * try_to_unmap() may return before page_mapped() has become false,
 	 * if page table locking is skipped: use TTU_SYNC to wait for that.
 	 */
 	if (flags & TTU_SYNC)
@@ -1756,9 +1756,10 @@ static int page_not_mapped(struct page *page)
  * Tries to remove all the page table entries which are mapping this
  * page, used in the pageout path.  Caller must hold the page lock.
  *
- * If unmap is successful, return true. Otherwise, false.
+ * It is the caller's responsibility to check if the page is still
+ * mapped when needed (use TTU_SYNC to prevent accounting races).
  */
-bool try_to_unmap(struct page *page, enum ttu_flags flags)
+void try_to_unmap(struct page *page, enum ttu_flags flags)
 {
 	struct rmap_walk_control rwc = {
 		.rmap_one = try_to_unmap_one,
@@ -1783,14 +1784,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
 		rmap_walk_locked(page, &rwc);
 	else
 		rmap_walk(page, &rwc);
-
-	/*
-	 * When racing against e.g. zap_pte_range() on another cpu,
-	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
-	 * try_to_unmap() may return false when it is about to become true,
-	 * if page table locking is skipped: use TTU_SYNC to wait for that.
-	 */
-	return !page_mapcount(page);
 }
 
 /**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7b52ab166aae..e1d75e6f9ff4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1499,7 +1499,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 			if (unlikely(PageTransHuge(page)))
 				flags |= TTU_SPLIT_HUGE_PMD;
 
-			if (!try_to_unmap(page, flags)) {
+			try_to_unmap(page, flags);
+			if (page_mapped(page)) {
 				stat->nr_unmap_fail += nr_pages;
 				if (!was_swapbacked && PageSwapBacked(page))
 					stat->nr_lazyfree_fail += nr_pages;
-- 
cgit v1.2.3


From c4ffefd16daba0f29fa7d9534de20949b673eca0 Mon Sep 17 00:00:00 2001
From: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Date: Wed, 30 Jun 2021 18:53:10 -0700
Subject: mm: fix typos and grammar error in comments

We moves tha -> We move that in mm/swap.c
statments -> statements in include/linux/mm.h

Link: https://lkml.kernel.org/r/20210509063444.GA24745@hyeyoo
Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/swap.c          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3cbd2d6d248e..714ad9b26ed2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -155,7 +155,7 @@ extern int mmap_rnd_compat_bits __read_mostly;
 /* This function must be updated when the size of struct page grows above 80
  * or reduces below 56. The idea that compiler optimizes out switch()
  * statement, and only leaves move/store instructions. Also the compiler can
- * combine write statments if they are both assignments and can be reordered,
+ * combine write statements if they are both assignments and can be reordered,
  * this can result in several of the writes here being dropped.
  */
 #define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
diff --git a/mm/swap.c b/mm/swap.c
index 6c11db780467..19600430e536 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -554,7 +554,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 	} else {
 		/*
 		 * The page's writeback ends up during pagevec
-		 * We moves tha page into tail of inactive.
+		 * We move that page into tail of inactive.
 		 */
 		add_page_to_lru_list_tail(page, lruvec);
 		__count_vm_events(PGROTATED, nr_pages);
-- 
cgit v1.2.3


From fac7757e1fb05b75c8e22d4f8fe2f6c9c4d7edca Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Wed, 30 Jun 2021 18:53:13 -0700
Subject: mm: define default value for FIRST_USER_ADDRESS

Currently most platforms define FIRST_USER_ADDRESS as 0UL duplication the
same code all over.  Instead just define a generic default value (i.e 0UL)
for FIRST_USER_ADDRESS and let the platforms override when required.  This
makes it much cleaner with reduced code.

The default FIRST_USER_ADDRESS here would be skipped in <linux/pgtable.h>
when the given platform overrides its value via <asm/pgtable.h>.

Link: https://lkml.kernel.org/r/1620615725-24623-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>	[m68k]
Acked-by: Guo Ren <guoren@kernel.org>			[csky]
Acked-by: Stafford Horne <shorne@gmail.com>		[openrisc]
Acked-by: Catalin Marinas <catalin.marinas@arm.com>	[arm64]
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Palmer Dabbelt <palmerdabbelt@google.com>	[RISC-V]
Cc: Richard Henderson <rth@twiddle.net>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Brian Cain <bcain@codeaurora.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Ley Foon Tan <ley.foon.tan@intel.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Stafford Horne <shorne@gmail.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Zankel <chris@zankel.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/pgtable.h             | 1 -
 arch/arc/include/asm/pgtable.h               | 6 ------
 arch/arm64/include/asm/pgtable.h             | 2 --
 arch/csky/include/asm/pgtable.h              | 1 -
 arch/hexagon/include/asm/pgtable.h           | 3 ---
 arch/ia64/include/asm/pgtable.h              | 1 -
 arch/m68k/include/asm/pgtable_mm.h           | 1 -
 arch/microblaze/include/asm/pgtable.h        | 2 --
 arch/mips/include/asm/pgtable-32.h           | 1 -
 arch/mips/include/asm/pgtable-64.h           | 1 -
 arch/nios2/include/asm/pgtable.h             | 2 --
 arch/openrisc/include/asm/pgtable.h          | 1 -
 arch/parisc/include/asm/pgtable.h            | 2 --
 arch/powerpc/include/asm/book3s/pgtable.h    | 1 -
 arch/powerpc/include/asm/nohash/32/pgtable.h | 1 -
 arch/powerpc/include/asm/nohash/64/pgtable.h | 2 --
 arch/riscv/include/asm/pgtable.h             | 2 --
 arch/s390/include/asm/pgtable.h              | 2 --
 arch/sh/include/asm/pgtable.h                | 2 --
 arch/sparc/include/asm/pgtable_32.h          | 1 -
 arch/sparc/include/asm/pgtable_64.h          | 3 ---
 arch/um/include/asm/pgtable-2level.h         | 1 -
 arch/um/include/asm/pgtable-3level.h         | 1 -
 arch/x86/include/asm/pgtable_types.h         | 2 --
 arch/xtensa/include/asm/pgtable.h            | 1 -
 include/linux/pgtable.h                      | 9 +++++++++
 26 files changed, 9 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index e1757b7cfe3d..ff690846465e 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -46,7 +46,6 @@ struct vm_area_struct;
 #define PTRS_PER_PMD	(1UL << (PAGE_SHIFT-3))
 #define PTRS_PER_PGD	(1UL << (PAGE_SHIFT-3))
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0UL
 
 /* Number of pointers that fit on a page:  this will go away. */
 #define PTRS_PER_PAGE	(1UL << (PAGE_SHIFT-3))
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 5878846f00cf..577682e8cc7f 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -222,12 +222,6 @@
  */
 #define	USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
 
-/*
- * No special requirements for lowest virtual address we permit any user space
- * mapping to be mapped at.
- */
-#define FIRST_USER_ADDRESS      0UL
-
 
 /****************************************************************
  * Bucket load of VM Helpers
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0b10204e72fc..25f5c04b43ce 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -26,8 +26,6 @@
 
 #define vmemmap			((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
 
-#define FIRST_USER_ADDRESS	0UL
-
 #ifndef __ASSEMBLY__
 
 #include <asm/cmpxchg.h>
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index 0d60367b6bfa..151607ed5158 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -14,7 +14,6 @@
 #define PGDIR_MASK		(~(PGDIR_SIZE-1))
 
 #define USER_PTRS_PER_PGD	(PAGE_OFFSET/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0UL
 
 /*
  * C-SKY is two-level paging structure:
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index dbb22b80b8c4..e4979508cddf 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -155,9 +155,6 @@ extern unsigned long _dflt_cache_att;
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];  /* located in head.S */
 
-/* Seems to be zero even in architectures where the zero page is firewalled? */
-#define FIRST_USER_ADDRESS 0UL
-
 /*  HUGETLB not working currently  */
 #ifdef CONFIG_HUGETLB_PAGE
 #define pte_mkhuge(pte) __pte((pte_val(pte) & ~0x3) | HVM_HUGEPAGE_SIZE)
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index d765fd948fae..3f5dbbd8b9d8 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -128,7 +128,6 @@
 #define PTRS_PER_PGD_SHIFT	PTRS_PER_PTD_SHIFT
 #define PTRS_PER_PGD		(1UL << PTRS_PER_PGD_SHIFT)
 #define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/8)	/* regions 0-4 are user regions */
-#define FIRST_USER_ADDRESS	0UL
 
 /*
  * All the normal masks have the "page accessed" bits on, as any time
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index aca22c2c1ee2..143ba7de9bda 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -72,7 +72,6 @@
 #define PTRS_PER_PGD	128
 #endif
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0UL
 
 /* Virtual address region for use by kernel_map() */
 #ifdef CONFIG_SUN3
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 9ae8d2c17dd5..71cd547655d9 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -25,8 +25,6 @@ extern int mem_init_done;
 #include <asm/mmu.h>
 #include <asm/page.h>
 
-#define FIRST_USER_ADDRESS	0UL
-
 extern unsigned long va_to_phys(unsigned long address);
 extern pte_t *va_to_pte(unsigned long address);
 
diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h
index 6c0532d7b211..95df9c293d8d 100644
--- a/arch/mips/include/asm/pgtable-32.h
+++ b/arch/mips/include/asm/pgtable-32.h
@@ -93,7 +93,6 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1,
 #endif
 
 #define USER_PTRS_PER_PGD	(0x80000000UL/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0UL
 
 #define VMALLOC_START	  MAP_BASE
 
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 1e7d6ce9d8d6..046465906c82 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -137,7 +137,6 @@
 #define PTRS_PER_PTE	((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t))
 
 #define USER_PTRS_PER_PGD       ((TASK_SIZE64 / PGDIR_SIZE)?(TASK_SIZE64 / PGDIR_SIZE):1)
-#define FIRST_USER_ADDRESS	0UL
 
 /*
  * TLB refill handlers also map the vmalloc area into xuseg.  Avoid
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 2600d76c310c..4a995fa628ee 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -24,8 +24,6 @@
 #include <asm/pgtable-bits.h>
 #include <asm-generic/pgtable-nopmd.h>
 
-#define FIRST_USER_ADDRESS	0UL
-
 #define VMALLOC_START		CONFIG_NIOS2_KERNEL_MMU_REGION_BASE
 #define VMALLOC_END		(CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
 
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 9425bedab4fc..4ac591c9ca33 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -73,7 +73,6 @@ extern void paging_init(void);
  */
 
 #define USER_PTRS_PER_PGD       (TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS      0UL
 
 /*
  * Kernels own virtual memory area.
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 39017210dbf0..7f33c29764cc 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -171,8 +171,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
  * pgd entries used up by user/kernel:
  */
 
-#define FIRST_USER_ADDRESS	0UL
-
 /* NB: The tlb miss handlers make certain assumptions about the order */
 /*     of the following bits, so be careful (One example, bits 25-31  */
 /*     are moved together in one instruction).                        */
diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index 0e1263455d73..ad130e15a126 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -8,7 +8,6 @@
 #include <asm/book3s/32/pgtable.h>
 #endif
 
-#define FIRST_USER_ADDRESS	0UL
 #ifndef __ASSEMBLY__
 /* Insert a PTE, top-level function is out of line. It uses an inline
  * low level function in the respective pgtable-* files
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 96522f7f0618..f06ae00f2a65 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -54,7 +54,6 @@ extern int icache_44x_need_flush;
 #define PGD_MASKED_BITS		0
 
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0UL
 
 #define pte_ERROR(e) \
 	pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 57cd3892bfe0..53fbfdfac93d 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -12,8 +12,6 @@
 #include <asm/barrier.h>
 #include <asm/asm-const.h>
 
-#define FIRST_USER_ADDRESS	0UL
-
 /*
  * Size of EA range mapped by our pagetables.
  */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 380cd3a7e548..62f3fe7368f3 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -536,8 +536,6 @@ void setup_bootmem(void);
 void paging_init(void);
 void misc_mem_init(void);
 
-#define FIRST_USER_ADDRESS  0
-
 /*
  * ZERO_PAGE is a global shared page that is always zero,
  * used for zero-mapped memory areas, etc.
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index b38f7b781564..7c86bf03fc8f 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -65,8 +65,6 @@ extern unsigned long zero_page_mask;
 
 /* TODO: s390 cannot support io_remap_pfn_range... */
 
-#define FIRST_USER_ADDRESS  0UL
-
 #define pte_ERROR(e) \
 	printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e))
 #define pmd_ERROR(e) \
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 27751e9470df..d7ddb1ec86a0 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -59,8 +59,6 @@ static inline unsigned long long neff_sign_extend(unsigned long val)
 /* Entries per level */
 #define PTRS_PER_PTE	(PAGE_SIZE / (1 << PTE_MAGNITUDE))
 
-#define FIRST_USER_ADDRESS	0UL
-
 #define PHYS_ADDR_MASK29		0x1fffffff
 #define PHYS_ADDR_MASK32		0xffffffff
 
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index a5cf79c149fe..0888bda245f5 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -48,7 +48,6 @@ unsigned long __init bootmem_init(unsigned long *pages_avail);
 #define PTRS_PER_PMD    	64
 #define PTRS_PER_PGD    	256
 #define USER_PTRS_PER_PGD	PAGE_OFFSET / PGDIR_SIZE
-#define FIRST_USER_ADDRESS	0UL
 #define PTE_SIZE		(PTRS_PER_PTE*4)
 
 #define PAGE_NONE	SRMMU_PAGE_NONE
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 2cd80a0a9795..a400e0f23046 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -95,9 +95,6 @@ bool kern_addr_valid(unsigned long addr);
 #define PTRS_PER_PUD	(1UL << PUD_BITS)
 #define PTRS_PER_PGD	(1UL << PGDIR_BITS)
 
-/* Kernel has a separate 44bit address space. */
-#define FIRST_USER_ADDRESS	0UL
-
 #define pmd_ERROR(e)							\
 	pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n",		\
 	       __FILE__, __LINE__, &(e), pmd_val(e), __builtin_return_address(0))
diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h
index 32106d31e4ab..8256ecc5b919 100644
--- a/arch/um/include/asm/pgtable-2level.h
+++ b/arch/um/include/asm/pgtable-2level.h
@@ -23,7 +23,6 @@
 #define PTRS_PER_PTE	1024
 #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE)
 #define PTRS_PER_PGD	1024
-#define FIRST_USER_ADDRESS	0UL
 
 #define pte_ERROR(e) \
         printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h
index 7e6a4180db9d..9289a86643a9 100644
--- a/arch/um/include/asm/pgtable-3level.h
+++ b/arch/um/include/asm/pgtable-3level.h
@@ -41,7 +41,6 @@
 #endif
 
 #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0UL
 
 #define pte_ERROR(e) \
         printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f24d7ef8fffa..40497a9020c6 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -7,8 +7,6 @@
 
 #include <asm/page_types.h>
 
-#define FIRST_USER_ADDRESS	0UL
-
 #define _PAGE_BIT_PRESENT	0	/* is present */
 #define _PAGE_BIT_RW		1	/* writeable */
 #define _PAGE_BIT_USER		2	/* userspace addressable */
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index d7fc45c920c2..bd5aeb795567 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -59,7 +59,6 @@
 #define PTRS_PER_PGD		1024
 #define PGD_ORDER		0
 #define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS	0UL
 #define FIRST_USER_PGD_NR	(FIRST_USER_ADDRESS >> PGDIR_SHIFT)
 
 #ifdef CONFIG_MMU
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2b0d02291178..69700e3e615f 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -28,6 +28,15 @@
 #define USER_PGTABLES_CEILING	0UL
 #endif
 
+/*
+ * This defines the first usable user address. Platforms
+ * can override its value with custom FIRST_USER_ADDRESS
+ * defined in their respective <asm/pgtable.h>.
+ */
+#ifndef FIRST_USER_ADDRESS
+#define FIRST_USER_ADDRESS	0UL
+#endif
+
 /*
  * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
  *
-- 
cgit v1.2.3


From 041711ce7cdf023f53d76f64d82b75210248e18d Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 30 Jun 2021 18:53:17 -0700
Subject: mm: fix spelling mistakes

Fix some spelling mistakes in comments:
each having differents usage ==> each has a different usage
statments ==> statements
adresses ==> addresses
aggresive ==> aggressive
datas ==> data
posion ==> poison
higer ==> higher
precisly ==> precisely
wont ==> won't
We moves tha ==> We move the
endianess ==> endianness

Link: https://lkml.kernel.org/r/20210519065853.7723-2-thunder.leizhen@huawei.com
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Reviewed-by: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memremap.h | 2 +-
 include/linux/mm_types.h | 2 +-
 include/linux/mmzone.h   | 2 +-
 mm/memory-failure.c      | 2 +-
 mm/memory_hotplug.c      | 4 ++--
 mm/page_alloc.c          | 2 +-
 mm/swapfile.c            | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 45a79da89c5f..c0e9d35889e8 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -26,7 +26,7 @@ struct vmem_altmap {
 };
 
 /*
- * Specialize ZONE_DEVICE memory into multiple types each having differents
+ * Specialize ZONE_DEVICE memory into multiple types each has a different
  * usage.
  *
  * MEMORY_DEVICE_PRIVATE:
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b66d0225414e..748617780924 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -397,7 +397,7 @@ struct mm_struct {
 		unsigned long mmap_base;	/* base of mmap area */
 		unsigned long mmap_legacy_base;	/* base of mmap area in bottom-up allocations */
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
-		/* Base adresses for compatible mmap() */
+		/* Base addresses for compatible mmap() */
 		unsigned long mmap_compat_base;
 		unsigned long mmap_compat_legacy_base;
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7bc7e41b6c31..0ed2c23ed3fb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,7 +114,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
 struct pglist_data;
 
 /*
- * Add a wild amount of padding here to ensure datas fall into separate
+ * Add a wild amount of padding here to ensure data fall into separate
  * cachelines.  There are very few zone structures in the machine, so space
  * consumption is not a concern here.
  */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ee02d8b06839..eefd823deb67 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1340,7 +1340,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 			 * could potentially call huge_pmd_unshare.  Because of
 			 * this, take semaphore in write mode here and set
 			 * TTU_RMAP_LOCKED to indicate we have taken the lock
-			 * at this higer level.
+			 * at this higher level.
 			 */
 			mapping = hugetlb_page_mapping_lock_write(hpage);
 			if (mapping) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 553c52751249..79f6ce92b6b6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -783,7 +783,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z
 
 	/*
 	 * {on,off}lining is constrained to full memory sections (or more
-	 * precisly to memory blocks from the user space POV).
+	 * precisely to memory blocks from the user space POV).
 	 * memmap_on_memory is an exception because it reserves initial part
 	 * of the physical memory space for vmemmaps. That space is pageblock
 	 * aligned.
@@ -1580,7 +1580,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 
 	/*
 	 * {on,off}lining is constrained to full memory sections (or more
-	 * precisly to memory blocks from the user space POV).
+	 * precisely to memory blocks from the user space POV).
 	 * memmap_on_memory is an exception because it reserves initial part
 	 * of the physical memory space for vmemmaps. That space is pageblock
 	 * aligned.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eeff64843718..6700cfcfab46 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3180,7 +3180,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 	int cpu;
 
 	/*
-	 * Allocate in the BSS so we wont require allocation in
+	 * Allocate in the BSS so we won't require allocation in
 	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
 	 */
 	static cpumask_t cpus_with_pcps;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e898c879a434..1e07d1c776f2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2967,7 +2967,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 		return 0;
 	}
 
-	/* swap partition endianess hack... */
+	/* swap partition endianness hack... */
 	if (swab32(swap_header->info.version) == 1) {
 		swab32s(&swap_header->info.version);
 		swab32s(&swap_header->info.last_page);
-- 
cgit v1.2.3


From 2bb6a033fb4078f1c528ee575f551064ed738d6f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 30 Jun 2021 18:53:47 -0700
Subject: mm/swap: make swap_address_space an inline function

make W=1 generates the following warning in page_mapping() for allnoconfig

  mm/util.c:700:15: warning: variable `entry' set but not used [-Wunused-but-set-variable]
     swp_entry_t entry;
                 ^~~~~

swap_address is a #define on !CONFIG_SWAP configurations.  Make the helper
an inline function to suppress the warning, add type checking and to apply
any side-effects in the parameter list.

Link: https://lkml.kernel.org/r/20210520084809.8576-12-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 49b1dd2c100b..ac9bd84c905e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -537,7 +537,11 @@ static inline void put_swap_device(struct swap_info_struct *si)
 {
 }
 
-#define swap_address_space(entry)		(NULL)
+static inline struct address_space *swap_address_space(swp_entry_t entry)
+{
+	return NULL;
+}
+
 #define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
 #define total_swapcache_pages()			0UL
-- 
cgit v1.2.3


From 351de44fde5afc3b0b23294ebf404e78065c2745 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 30 Jun 2021 18:53:56 -0700
Subject: mm/swap: make NODE_DATA an inline function on CONFIG_FLATMEM

make W=1 generates the following warning in mm/workingset.c for allnoconfig

  mm/workingset.c: In function `unpack_shadow':
  mm/workingset.c:201:15: warning: variable `nid' set but not used [-Wunused-but-set-variable]
    int memcgid, nid;
                 ^~~

On FLATMEM, NODE_DATA returns a global pglist_data without dereferencing
nid.  Make the helper an inline function to suppress the warning, add type
checking and to apply any side-effects in the parameter list.

Link: https://lkml.kernel.org/r/20210520084809.8576-15-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0ed2c23ed3fb..fcb535560028 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1064,7 +1064,10 @@ extern char numa_zonelist_order[];
 #ifndef CONFIG_NUMA
 
 extern struct pglist_data contig_page_data;
-#define NODE_DATA(nid)		(&contig_page_data)
+static inline struct pglist_data *NODE_DATA(int nid)
+{
+	return &contig_page_data;
+}
 #define NODE_MEM_MAP(nid)	mem_map
 
 #else /* CONFIG_NUMA */
-- 
cgit v1.2.3


From 1c2f7d14d84f767a797558609eb034511e02f41e Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Wed, 30 Jun 2021 18:53:59 -0700
Subject: mm/thp: define default pmd_pgtable()

Currently most platforms define pmd_pgtable() as pmd_page() duplicating
the same code all over.  Instead just define a default value i.e
pmd_page() for pmd_pgtable() and let platforms override when required via
<asm/pgtable.h>.  All the existing platform that override pmd_pgtable()
have been moved into their respective <asm/pgtable.h> header in order to
precede before the new generic definition.  This makes it much cleaner
with reduced code.

Link: https://lkml.kernel.org/r/1623646133-20306-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Nick Hu <nickhu@andestech.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Brian Cain <bcain@codeaurora.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Ley Foon Tan <ley.foon.tan@intel.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Stafford Horne <shorne@gmail.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Zankel <chris@zankel.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/pgalloc.h         | 1 -
 arch/arc/include/asm/pgalloc.h           | 2 --
 arch/arc/include/asm/pgtable.h           | 2 ++
 arch/arm/include/asm/pgalloc.h           | 1 -
 arch/arm64/include/asm/pgalloc.h         | 1 -
 arch/csky/include/asm/pgalloc.h          | 2 --
 arch/hexagon/include/asm/pgtable.h       | 1 -
 arch/ia64/include/asm/pgalloc.h          | 1 -
 arch/m68k/include/asm/mcf_pgalloc.h      | 2 --
 arch/m68k/include/asm/mcf_pgtable.h      | 2 ++
 arch/m68k/include/asm/motorola_pgalloc.h | 1 -
 arch/m68k/include/asm/motorola_pgtable.h | 2 ++
 arch/m68k/include/asm/sun3_pgalloc.h     | 1 -
 arch/microblaze/include/asm/pgalloc.h    | 2 --
 arch/mips/include/asm/pgalloc.h          | 1 -
 arch/nds32/include/asm/pgalloc.h         | 5 -----
 arch/nios2/include/asm/pgalloc.h         | 1 -
 arch/openrisc/include/asm/pgalloc.h      | 2 --
 arch/parisc/include/asm/pgalloc.h        | 1 -
 arch/powerpc/include/asm/pgalloc.h       | 5 -----
 arch/powerpc/include/asm/pgtable.h       | 6 ++++++
 arch/riscv/include/asm/pgalloc.h         | 2 --
 arch/s390/include/asm/pgalloc.h          | 3 ---
 arch/s390/include/asm/pgtable.h          | 3 +++
 arch/sh/include/asm/pgalloc.h            | 1 -
 arch/sparc/include/asm/pgalloc_32.h      | 1 -
 arch/sparc/include/asm/pgalloc_64.h      | 1 -
 arch/sparc/include/asm/pgtable_32.h      | 2 ++
 arch/sparc/include/asm/pgtable_64.h      | 2 ++
 arch/um/include/asm/pgalloc.h            | 1 -
 arch/x86/include/asm/pgalloc.h           | 2 --
 arch/xtensa/include/asm/pgalloc.h        | 2 --
 include/linux/pgtable.h                  | 9 +++++++++
 33 files changed, 28 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h
index 9c6a24fe493d..68be7adbfe58 100644
--- a/arch/alpha/include/asm/pgalloc.h
+++ b/arch/alpha/include/asm/pgalloc.h
@@ -18,7 +18,6 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte)
 {
 	pmd_set(pmd, (pte_t *)(page_to_pa(pte) + PAGE_OFFSET));
 }
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 static inline void
 pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h
index 6147db925248..a32ca3104ced 100644
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@@ -129,6 +129,4 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptep)
 
 #define __pte_free_tlb(tlb, pte, addr)  pte_free((tlb)->mm, pte)
 
-#define pmd_pgtable(pmd)	((pgtable_t) pmd_page_vaddr(pmd))
-
 #endif /* _ASM_ARC_PGALLOC_H */
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 577682e8cc7f..320cc0ae8a08 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -350,6 +350,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 
 #define kern_addr_valid(addr)	(1)
 
+#define pmd_pgtable(pmd)       ((pgtable_t) pmd_page_vaddr(pmd))
+
 /*
  * remap a physical page `pfn' of size `size' with page protection `prot'
  * into virtual address `from'
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index fdee1f04f4f3..a17f01235c29 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -143,7 +143,6 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
 
 	__pmd_populate(pmdp, page_to_phys(ptep), prot);
 }
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 #endif /* CONFIG_MMU */
 
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 31fbab3d6f99..8433a2058eb1 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -86,6 +86,5 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
 	VM_BUG_ON(mm == &init_mm);
 	__pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN);
 }
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 #endif
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index cd211aabbefd..bbbd0698b397 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -22,8 +22,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 	set_pmd(pmd, __pmd(__pa(page_address(pte))));
 }
 
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
 extern void pgd_init(unsigned long *p);
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index e4979508cddf..18cd6ea9ab23 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -239,7 +239,6 @@ static inline int pmd_bad(pmd_t pmd)
  * pmd_page - converts a PMD entry to a page pointer
  */
 #define pmd_page(pmd)  (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 /**
  * pte_none - check if pte is mapped
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index 9601cfe83c94..0fb2b6291d58 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -52,7 +52,6 @@ pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, pgtable_t pte)
 {
 	pmd_val(*pmd_entry) = page_to_phys(pte);
 }
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 static inline void
 pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h
index bc1228e00518..5c2c0a864524 100644
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@@ -32,8 +32,6 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address)
 
 #define pmd_populate_kernel pmd_populate
 
-#define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT)
-
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
 				  unsigned long address)
 {
diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h
index 8d4ec05996c5..6f2b87d7a50d 100644
--- a/arch/m68k/include/asm/mcf_pgtable.h
+++ b/arch/m68k/include/asm/mcf_pgtable.h
@@ -150,6 +150,8 @@
 
 #ifndef __ASSEMBLY__
 
+#define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT)
+
 /*
  * Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h
index b4fc3b4f6bb3..74a817d9387f 100644
--- a/arch/m68k/include/asm/motorola_pgalloc.h
+++ b/arch/m68k/include/asm/motorola_pgalloc.h
@@ -88,7 +88,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page
 {
 	pmd_set(pmd, page);
 }
-#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h
index 8076467eff4b..a2908164ee6f 100644
--- a/arch/m68k/include/asm/motorola_pgtable.h
+++ b/arch/m68k/include/asm/motorola_pgtable.h
@@ -105,6 +105,8 @@ extern unsigned long mm_cachebits;
 #define __S110	PAGE_SHARED_C
 #define __S111	PAGE_SHARED_C
 
+#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
+
 /*
  * Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h
index 000f64869b91..198036aff519 100644
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@@ -32,7 +32,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page
 {
 	pmd_val(*pmd) = __pa((unsigned long)page_address(page));
 }
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h
index d56b9f670ad1..6c33b05f730f 100644
--- a/arch/microblaze/include/asm/pgalloc.h
+++ b/arch/microblaze/include/asm/pgalloc.h
@@ -28,8 +28,6 @@ static inline pgd_t *get_pgd(void)
 
 #define pgd_alloc(mm)		get_pgd()
 
-#define pmd_pgtable(pmd)	pmd_page(pmd)
-
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
 
 #define __pte_free_tlb(tlb, pte, addr)	pte_free((tlb)->mm, (pte))
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index 8b18424b3120..dd53d0f79cb3 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -28,7 +28,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 {
 	set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
 }
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 /*
  * Initialize a new pmd table with invalid pointers.
diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h
index 85c117347c86..a08e1ebca70e 100644
--- a/arch/nds32/include/asm/pgalloc.h
+++ b/arch/nds32/include/asm/pgalloc.h
@@ -12,11 +12,6 @@
 #define __HAVE_ARCH_PTE_ALLOC_ONE
 #include <asm-generic/pgalloc.h>	/* for pte_{alloc,free}_one */
 
-/*
- * Since we have only two-level page tables, these are trivial
- */
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t * pgd);
 
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index e6600d2a5ae0..3c4ae74d5798 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -25,7 +25,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 {
 	set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
 }
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 /*
  * Initialize a new pmd table with invalid pointers.
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h
index 88820299ecc4..b7b2b8d16fad 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -72,6 +72,4 @@ do {					\
 	tlb_remove_page((tlb), (pte));	\
 } while (0)
 
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
 #endif
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index dda557085311..6a7e98e71f1d 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -69,6 +69,5 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
 
 #define pmd_populate(mm, pmd, pte_page) \
 	pmd_populate_kernel(mm, pmd, page_address(pte_page))
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 #endif
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index 6dd78a2dc03a..3360cad78ace 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -70,9 +70,4 @@ extern struct kmem_cache *pgtable_cache[];
 #include <asm/nohash/pgalloc.h>
 #endif
 
-static inline pgtable_t pmd_pgtable(pmd_t pmd)
-{
-	return (pgtable_t)pmd_page_vaddr(pmd);
-}
-
 #endif /* _ASM_POWERPC_PGALLOC_H */
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index c6a676714f04..5969743719bc 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -152,6 +152,12 @@ static inline bool p4d_is_leaf(p4d_t p4d)
 }
 #endif
 
+#define pmd_pgtable pmd_pgtable
+static inline pgtable_t pmd_pgtable(pmd_t pmd)
+{
+	return (pgtable_t)pmd_page_vaddr(pmd);
+}
+
 #ifdef CONFIG_PPC64
 #define is_ioremap_addr is_ioremap_addr
 static inline bool is_ioremap_addr(const void *x)
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 23b1544e0ca5..0af6933a7100 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -38,8 +38,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-#define pmd_pgtable(pmd)	pmd_page(pmd)
-
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 6b187cd72251..f14a555eff74 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -134,9 +134,6 @@ static inline void pmd_populate(struct mm_struct *mm,
 
 #define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte)
 
-#define pmd_pgtable(pmd) \
-	((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
-
 /*
  * page table entry allocation/free routines.
  */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7c86bf03fc8f..1f8f5da53262 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1709,4 +1709,7 @@ extern void s390_reset_cmma(struct mm_struct *mm);
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
+#define pmd_pgtable(pmd) \
+	((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
+
 #endif /* _S390_PAGE_H */
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index 0e6b0be25e33..a9e98233c4d4 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -30,7 +30,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 {
 	set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
 }
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 #define __pte_free_tlb(tlb,pte,addr)			\
 do {							\
diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h
index 9d353e6dc5a9..4f73e87b22a3 100644
--- a/arch/sparc/include/asm/pgalloc_32.h
+++ b/arch/sparc/include/asm/pgalloc_32.h
@@ -51,7 +51,6 @@ static inline void free_pmd_fast(pmd_t * pmd)
 #define __pmd_free_tlb(tlb, pmd, addr)	pmd_free((tlb)->mm, pmd)
 
 #define pmd_populate(mm, pmd, pte)	pmd_set(pmd, pte)
-#define pmd_pgtable(pmd)		(pgtable_t)__pmd_page(pmd)
 
 void pmd_set(pmd_t *pmdp, pte_t *ptep);
 #define pmd_populate_kernel		pmd_populate
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index a8dafc550985..7b5561d17ab1 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -67,7 +67,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage);
 
 #define pmd_populate_kernel(MM, PMD, PTE)	pmd_set(MM, PMD, PTE)
 #define pmd_populate(MM, PMD, PTE)		pmd_set(MM, PMD, PTE)
-#define pmd_pgtable(PMD)			((pte_t *)pmd_page_vaddr(PMD))
 
 void pgtable_free(void *table, bool is_page);
 
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 0888bda245f5..ebaf374b55ab 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -432,4 +432,6 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
 /* We provide our own get_unmapped_area to cope with VA holes for userland */
 #define HAVE_ARCH_UNMAPPED_AREA
 
+#define pmd_pgtable(pmd)	((pgtable_t)__pmd_page(pmd))
+
 #endif /* !(_SPARC_PGTABLE_H) */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index a400e0f23046..e0ee48ec3903 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1117,6 +1117,8 @@ extern unsigned long cmdline_memory_size;
 
 asmlinkage void do_sparc64_fault(struct pt_regs *regs);
 
+#define pmd_pgtable(PMD)	((pte_t *)pmd_page_vaddr(PMD))
+
 #ifdef CONFIG_HUGETLB_PAGE
 
 #define pud_leaf_size pud_leaf_size
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 2bbf28cf3aa9..8ec7cd46dd96 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -19,7 +19,6 @@
 	set_pmd(pmd, __pmd(_PAGE_TABLE +			\
 		((unsigned long long)page_to_pfn(pte) <<	\
 			(unsigned long long) PAGE_SHIFT)))
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 /*
  * Allocate and free page tables.
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 62ad61d6fefc..c7ec5bb88334 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -84,8 +84,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 	set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
 }
 
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
 #if CONFIG_PGTABLE_LEVELS > 2
 extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
 
diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h
index d3a22da4d2c9..eeb2de3a89e5 100644
--- a/arch/xtensa/include/asm/pgalloc.h
+++ b/arch/xtensa/include/asm/pgalloc.h
@@ -25,7 +25,6 @@
 	(pmd_val(*(pmdp)) = ((unsigned long)ptep))
 #define pmd_populate(mm, pmdp, page)					     \
 	(pmd_val(*(pmdp)) = ((unsigned long)page_to_virt(page)))
-#define pmd_pgtable(pmd) pmd_page(pmd)
 
 static inline pgd_t*
 pgd_alloc(struct mm_struct *mm)
@@ -63,7 +62,6 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
 	return page;
 }
 
-#define pmd_pgtable(pmd) pmd_page(pmd)
 #endif /* CONFIG_MMU */
 
 #endif /* _XTENSA_PGALLOC_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 69700e3e615f..e82660f7b9e4 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -37,6 +37,15 @@
 #define FIRST_USER_ADDRESS	0UL
 #endif
 
+/*
+ * This defines the generic helper for accessing PMD page
+ * table page. Although platforms can still override this
+ * via their respective <asm/pgtable.h>.
+ */
+#ifndef pmd_pgtable
+#define pmd_pgtable(pmd) pmd_page(pmd)
+#endif
+
 /*
  * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
  *
-- 
cgit v1.2.3


From af5cdaf82238fb3637a0d0fff4670e5be71c611c Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Wed, 30 Jun 2021 18:54:06 -0700
Subject: mm: remove special swap entry functions

Patch series "Add support for SVM atomics in Nouveau", v11.

Introduction
============

Some devices have features such as atomic PTE bits that can be used to
implement atomic access to system memory.  To support atomic operations to
a shared virtual memory page such a device needs access to that page which
is exclusive of the CPU.  This series introduces a mechanism to
temporarily unmap pages granting exclusive access to a device.

These changes are required to support OpenCL atomic operations in Nouveau
to shared virtual memory (SVM) regions allocated with the
CL_MEM_SVM_ATOMICS clSVMAlloc flag.  A more complete description of the
OpenCL SVM feature is available at
https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/
OpenCL_API.html#_shared_virtual_memory .

Implementation
==============

Exclusive device access is implemented by adding a new swap entry type
(SWAP_DEVICE_EXCLUSIVE) which is similar to a migration entry.  The main
difference is that on fault the original entry is immediately restored by
the fault handler instead of waiting.

Restoring the entry triggers calls to MMU notifers which allows a device
driver to revoke the atomic access permission from the GPU prior to the
CPU finalising the entry.

Patches
=======

Patches 1 & 2 refactor existing migration and device private entry
functions.

Patches 3 & 4 rework try_to_unmap_one() by splitting out unrelated
functionality into separate functions - try_to_migrate_one() and
try_to_munlock_one().

Patch 5 renames some existing code but does not introduce functionality.

Patch 6 is a small clean-up to swap entry handling in copy_pte_range().

Patch 7 contains the bulk of the implementation for device exclusive
memory.

Patch 8 contains some additions to the HMM selftests to ensure everything
works as expected.

Patch 9 is a cleanup for the Nouveau SVM implementation.

Patch 10 contains the implementation of atomic access for the Nouveau
driver.

Testing
=======

This has been tested with upstream Mesa 21.1.0 and a simple OpenCL program
which checks that GPU atomic accesses to system memory are atomic.
Without this series the test fails as there is no way of write-protecting
the page mapping which results in the device clobbering CPU writes.  For
reference the test is available at
https://ozlabs.org/~apopple/opencl_svm_atomics/

Further testing has been performed by adding support for testing exclusive
access to the hmm-tests kselftests.

This patch (of 10):

Remove multiple similar inline functions for dealing with different types
of special swap entries.

Both migration and device private swap entries use the swap offset to
store a pfn.  Instead of multiple inline functions to obtain a struct page
for each swap entry type use a common function pfn_swap_entry_to_page().
Also open-code the various entry_to_pfn() functions as this results is
shorter code that is easier to understand.

Link: https://lkml.kernel.org/r/20210616105937.23201-1-apopple@nvidia.com
Link: https://lkml.kernel.org/r/20210616105937.23201-2-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/mm/pgtable.c  |  2 +-
 fs/proc/task_mmu.c      | 23 ++++++-----------
 include/linux/swap.h    |  4 +--
 include/linux/swapops.h | 69 +++++++++++++++++--------------------------------
 mm/hmm.c                |  5 ++--
 mm/huge_memory.c        |  6 ++---
 mm/memcontrol.c         |  2 +-
 mm/memory.c             | 10 +++----
 mm/migrate.c            |  6 ++---
 mm/page_vma_mapped.c    |  6 ++---
 10 files changed, 51 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 18205f851c24..eec3a9d7176e 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -691,7 +691,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 	if (!non_swap_entry(entry))
 		dec_mm_counter(mm, MM_SWAPENTS);
 	else if (is_migration_entry(entry)) {
-		struct page *page = migration_entry_to_page(entry);
+		struct page *page = pfn_swap_entry_to_page(entry);
 
 		dec_mm_counter(mm, mm_counter(page));
 	}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 95c8f1e8fea6..eb97468dfe4c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -514,10 +514,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			} else {
 				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
 			}
-		} else if (is_migration_entry(swpent))
-			page = migration_entry_to_page(swpent);
-		else if (is_device_private_entry(swpent))
-			page = device_private_entry_to_page(swpent);
+		} else if (is_pfn_swap_entry(swpent))
+			page = pfn_swap_entry_to_page(swpent);
 	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
 							&& pte_none(*pte))) {
 		page = xa_load(&vma->vm_file->f_mapping->i_pages,
@@ -549,7 +547,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
 
 		if (is_migration_entry(entry))
-			page = migration_entry_to_page(entry);
+			page = pfn_swap_entry_to_page(entry);
 	}
 	if (IS_ERR_OR_NULL(page))
 		return;
@@ -694,10 +692,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
 	} else if (is_swap_pte(*pte)) {
 		swp_entry_t swpent = pte_to_swp_entry(*pte);
 
-		if (is_migration_entry(swpent))
-			page = migration_entry_to_page(swpent);
-		else if (is_device_private_entry(swpent))
-			page = device_private_entry_to_page(swpent);
+		if (is_pfn_swap_entry(swpent))
+			page = pfn_swap_entry_to_page(swpent);
 	}
 	if (page) {
 		int mapcount = page_mapcount(page);
@@ -1389,11 +1385,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 			frame = swp_type(entry) |
 				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
 		flags |= PM_SWAP;
-		if (is_migration_entry(entry))
-			page = migration_entry_to_page(entry);
-
-		if (is_device_private_entry(entry))
-			page = device_private_entry_to_page(entry);
+		if (is_pfn_swap_entry(entry))
+			page = pfn_swap_entry_to_page(entry);
 	}
 
 	if (page && !PageAnon(page))
@@ -1454,7 +1447,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 			if (pmd_swp_uffd_wp(pmd))
 				flags |= PM_UFFD_WP;
 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
-			page = migration_entry_to_page(entry);
+			page = pfn_swap_entry_to_page(entry);
 		}
 #endif
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ac9bd84c905e..df7cbb6b3d3e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -564,8 +564,8 @@ static inline void show_swap_cache_info(void)
 {
 }
 
-#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
-#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
+#define free_swap_and_cache(e) is_pfn_swap_entry(e)
 
 static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 {
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 708fbeb21dd3..c24c79812bc1 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -128,16 +128,6 @@ static inline bool is_write_device_private_entry(swp_entry_t entry)
 {
 	return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
 }
-
-static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
-{
-	return swp_offset(entry);
-}
-
-static inline struct page *device_private_entry_to_page(swp_entry_t entry)
-{
-	return pfn_to_page(swp_offset(entry));
-}
 #else /* CONFIG_DEVICE_PRIVATE */
 static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
 {
@@ -157,16 +147,6 @@ static inline bool is_write_device_private_entry(swp_entry_t entry)
 {
 	return false;
 }
-
-static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
-{
-	return 0;
-}
-
-static inline struct page *device_private_entry_to_page(swp_entry_t entry)
-{
-	return NULL;
-}
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 #ifdef CONFIG_MIGRATION
@@ -189,22 +169,6 @@ static inline int is_write_migration_entry(swp_entry_t entry)
 	return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
 }
 
-static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
-{
-	return swp_offset(entry);
-}
-
-static inline struct page *migration_entry_to_page(swp_entry_t entry)
-{
-	struct page *p = pfn_to_page(swp_offset(entry));
-	/*
-	 * Any use of migration entries may only occur while the
-	 * corresponding page is locked
-	 */
-	BUG_ON(!PageLocked(compound_head(p)));
-	return p;
-}
-
 static inline void make_migration_entry_read(swp_entry_t *entry)
 {
 	*entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
@@ -224,16 +188,6 @@ static inline int is_migration_entry(swp_entry_t swp)
 	return 0;
 }
 
-static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
-{
-	return 0;
-}
-
-static inline struct page *migration_entry_to_page(swp_entry_t entry)
-{
-	return NULL;
-}
-
 static inline void make_migration_entry_read(swp_entry_t *entryp) { }
 static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 					spinlock_t *ptl) { }
@@ -248,6 +202,29 @@ static inline int is_write_migration_entry(swp_entry_t entry)
 
 #endif
 
+static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
+{
+	struct page *p = pfn_to_page(swp_offset(entry));
+
+	/*
+	 * Any use of migration entries may only occur while the
+	 * corresponding page is locked
+	 */
+	BUG_ON(is_migration_entry(entry) && !PageLocked(p));
+
+	return p;
+}
+
+/*
+ * A pfn swap entry is a special type of swap entry that always has a pfn stored
+ * in the swap offset. They are used to represent unaddressable device memory
+ * and to restrict access to a page undergoing migration.
+ */
+static inline bool is_pfn_swap_entry(swp_entry_t entry)
+{
+	return is_migration_entry(entry) || is_device_private_entry(entry);
+}
+
 struct page_vma_mapped_walk;
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
diff --git a/mm/hmm.c b/mm/hmm.c
index 943cb2ba4442..3b2dda71d0ed 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -214,7 +214,7 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
 		swp_entry_t entry)
 {
 	return is_device_private_entry(entry) &&
-		device_private_entry_to_page(entry)->pgmap->owner ==
+		pfn_swap_entry_to_page(entry)->pgmap->owner ==
 		range->dev_private_owner;
 }
 
@@ -257,8 +257,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 			cpu_flags = HMM_PFN_VALID;
 			if (is_write_device_private_entry(entry))
 				cpu_flags |= HMM_PFN_WRITE;
-			*hmm_pfn = device_private_entry_to_pfn(entry) |
-					cpu_flags;
+			*hmm_pfn = swp_offset(entry) | cpu_flags;
 			return 0;
 		}
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d513b0cd1161..327b8d9d8d2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1643,7 +1643,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
 			entry = pmd_to_swp_entry(orig_pmd);
-			page = migration_entry_to_page(entry);
+			page = pfn_swap_entry_to_page(entry);
 			flush_needed = 0;
 		} else
 			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -2012,7 +2012,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			swp_entry_t entry;
 
 			entry = pmd_to_swp_entry(old_pmd);
-			page = migration_entry_to_page(entry);
+			page = pfn_swap_entry_to_page(entry);
 		} else {
 			page = pmd_page(old_pmd);
 			if (!PageDirty(page) && pmd_dirty(old_pmd))
@@ -2066,7 +2066,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		swp_entry_t entry;
 
 		entry = pmd_to_swp_entry(old_pmd);
-		page = migration_entry_to_page(entry);
+		page = pfn_swap_entry_to_page(entry);
 		write = is_write_migration_entry(entry);
 		young = false;
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f63399bff018..b826dad6fa36 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5532,7 +5532,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 	 * as special swap entry in the CPU page table.
 	 */
 	if (is_device_private_entry(ent)) {
-		page = device_private_entry_to_page(ent);
+		page = pfn_swap_entry_to_page(ent);
 		/*
 		 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
 		 * a refcount of 1 when free (unlike normal page)
diff --git a/mm/memory.c b/mm/memory.c
index 64eda960b75e..6723931085c7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -729,7 +729,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		}
 		rss[MM_SWAPENTS]++;
 	} else if (is_migration_entry(entry)) {
-		page = migration_entry_to_page(entry);
+		page = pfn_swap_entry_to_page(entry);
 
 		rss[mm_counter(page)]++;
 
@@ -748,7 +748,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
 	} else if (is_device_private_entry(entry)) {
-		page = device_private_entry_to_page(entry);
+		page = pfn_swap_entry_to_page(entry);
 
 		/*
 		 * Update rss count even for unaddressable pages, as
@@ -1280,7 +1280,7 @@ again:
 
 		entry = pte_to_swp_entry(ptent);
 		if (is_device_private_entry(entry)) {
-			struct page *page = device_private_entry_to_page(entry);
+			struct page *page = pfn_swap_entry_to_page(entry);
 
 			if (unlikely(details && details->check_mapping)) {
 				/*
@@ -1309,7 +1309,7 @@ again:
 		else if (is_migration_entry(entry)) {
 			struct page *page;
 
-			page = migration_entry_to_page(entry);
+			page = pfn_swap_entry_to_page(entry);
 			rss[mm_counter(page)]--;
 		}
 		if (unlikely(!free_swap_and_cache(entry)))
@@ -3372,7 +3372,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
 					     vmf->address);
 		} else if (is_device_private_entry(entry)) {
-			vmf->page = device_private_entry_to_page(entry);
+			vmf->page = pfn_swap_entry_to_page(entry);
 			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
diff --git a/mm/migrate.c b/mm/migrate.c
index 8810c1421f5d..b4abb87249e1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -296,7 +296,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 	if (!is_migration_entry(entry))
 		goto out;
 
-	page = migration_entry_to_page(entry);
+	page = pfn_swap_entry_to_page(entry);
 	page = compound_head(page);
 
 	/*
@@ -337,7 +337,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 	ptl = pmd_lock(mm, pmd);
 	if (!is_pmd_migration_entry(*pmd))
 		goto unlock;
-	page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
+	page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
 	if (!get_page_unless_zero(page))
 		goto unlock;
 	spin_unlock(ptl);
@@ -2289,7 +2289,7 @@ again:
 			if (!is_device_private_entry(entry))
 				goto next;
 
-			page = device_private_entry_to_page(entry);
+			page = pfn_swap_entry_to_page(entry);
 			if (!(migrate->flags &
 				MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
 			    page->pgmap->owner != migrate->pgmap_owner)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index a4435311754b..4df6093e759b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -96,7 +96,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
 		if (!is_migration_entry(entry))
 			return false;
 
-		pfn = migration_entry_to_pfn(entry);
+		pfn = swp_offset(entry);
 	} else if (is_swap_pte(*pvmw->pte)) {
 		swp_entry_t entry;
 
@@ -105,7 +105,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
 		if (!is_device_private_entry(entry))
 			return false;
 
-		pfn = device_private_entry_to_pfn(entry);
+		pfn = swp_offset(entry);
 	} else {
 		if (!pte_present(*pvmw->pte))
 			return false;
@@ -233,7 +233,7 @@ restart:
 					return not_found(pvmw);
 				entry = pmd_to_swp_entry(pmde);
 				if (!is_migration_entry(entry) ||
-				    migration_entry_to_page(entry) != page)
+				    pfn_swap_entry_to_page(entry) != page)
 					return not_found(pvmw);
 				return true;
 			}
-- 
cgit v1.2.3


From 4dd845b5a3e57ad07f26ef808707b064696fe34b Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Wed, 30 Jun 2021 18:54:09 -0700
Subject: mm/swapops: rework swap entry manipulation code

Both migration and device private pages use special swap entries that are
manipluated by a range of inline functions.  The arguments to these are
somewhat inconsistent so rework them to remove flag type arguments and to
make the arguments similar for both read and write entry creation.

Link: https://lkml.kernel.org/r/20210616105937.23201-3-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swapops.h | 56 ++++++++++++++++++++++++++-----------------------
 mm/debug_vm_pgtable.c   | 12 +++++------
 mm/hmm.c                |  2 +-
 mm/huge_memory.c        | 26 ++++++++++++++++-------
 mm/hugetlb.c            | 10 +++++----
 mm/memory.c             | 10 +++++----
 mm/migrate.c            | 26 +++++++++++++++++------
 mm/mprotect.c           | 10 +++++----
 mm/rmap.c               | 10 ++++++---
 9 files changed, 100 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index c24c79812bc1..04d76357aa0c 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -107,35 +107,35 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
 }
 
 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
 {
-	return swp_entry(write ? SWP_DEVICE_WRITE : SWP_DEVICE_READ,
-			 page_to_pfn(page));
+	return swp_entry(SWP_DEVICE_READ, offset);
 }
 
-static inline bool is_device_private_entry(swp_entry_t entry)
+static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
 {
-	int type = swp_type(entry);
-	return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
+	return swp_entry(SWP_DEVICE_WRITE, offset);
 }
 
-static inline void make_device_private_entry_read(swp_entry_t *entry)
+static inline bool is_device_private_entry(swp_entry_t entry)
 {
-	*entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry));
+	int type = swp_type(entry);
+	return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
 }
 
-static inline bool is_write_device_private_entry(swp_entry_t entry)
+static inline bool is_writable_device_private_entry(swp_entry_t entry)
 {
 	return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
 }
 #else /* CONFIG_DEVICE_PRIVATE */
-static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
 {
 	return swp_entry(0, 0);
 }
 
-static inline void make_device_private_entry_read(swp_entry_t *entry)
+static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
 {
+	return swp_entry(0, 0);
 }
 
 static inline bool is_device_private_entry(swp_entry_t entry)
@@ -143,35 +143,32 @@ static inline bool is_device_private_entry(swp_entry_t entry)
 	return false;
 }
 
-static inline bool is_write_device_private_entry(swp_entry_t entry)
+static inline bool is_writable_device_private_entry(swp_entry_t entry)
 {
 	return false;
 }
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 #ifdef CONFIG_MIGRATION
-static inline swp_entry_t make_migration_entry(struct page *page, int write)
-{
-	BUG_ON(!PageLocked(compound_head(page)));
-
-	return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
-			page_to_pfn(page));
-}
-
 static inline int is_migration_entry(swp_entry_t entry)
 {
 	return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
 			swp_type(entry) == SWP_MIGRATION_WRITE);
 }
 
-static inline int is_write_migration_entry(swp_entry_t entry)
+static inline int is_writable_migration_entry(swp_entry_t entry)
 {
 	return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
 }
 
-static inline void make_migration_entry_read(swp_entry_t *entry)
+static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
 {
-	*entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
+	return swp_entry(SWP_MIGRATION_READ, offset);
+}
+
+static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
+{
+	return swp_entry(SWP_MIGRATION_WRITE, offset);
 }
 
 extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
@@ -181,21 +178,28 @@ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 extern void migration_entry_wait_huge(struct vm_area_struct *vma,
 		struct mm_struct *mm, pte_t *pte);
 #else
+static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
+{
+	return swp_entry(0, 0);
+}
+
+static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
+{
+	return swp_entry(0, 0);
+}
 
-#define make_migration_entry(page, write) swp_entry(0, 0)
 static inline int is_migration_entry(swp_entry_t swp)
 {
 	return 0;
 }
 
-static inline void make_migration_entry_read(swp_entry_t *entryp) { }
 static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 					spinlock_t *ptl) { }
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					 unsigned long address) { }
 static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
 		struct mm_struct *mm, pte_t *pte) { }
-static inline int is_write_migration_entry(swp_entry_t entry)
+static inline int is_writable_migration_entry(swp_entry_t entry)
 {
 	return 0;
 }
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index f7b23565a04f..1c922691aa61 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -843,17 +843,17 @@ static void __init swap_migration_tests(void)
 	 * locked, otherwise it stumbles upon a BUG_ON().
 	 */
 	__SetPageLocked(page);
-	swp = make_migration_entry(page, 1);
+	swp = make_writable_migration_entry(page_to_pfn(page));
 	WARN_ON(!is_migration_entry(swp));
-	WARN_ON(!is_write_migration_entry(swp));
+	WARN_ON(!is_writable_migration_entry(swp));
 
-	make_migration_entry_read(&swp);
+	swp = make_readable_migration_entry(swp_offset(swp));
 	WARN_ON(!is_migration_entry(swp));
-	WARN_ON(is_write_migration_entry(swp));
+	WARN_ON(is_writable_migration_entry(swp));
 
-	swp = make_migration_entry(page, 0);
+	swp = make_readable_migration_entry(page_to_pfn(page));
 	WARN_ON(!is_migration_entry(swp));
-	WARN_ON(is_write_migration_entry(swp));
+	WARN_ON(is_writable_migration_entry(swp));
 	__ClearPageLocked(page);
 	__free_page(page);
 }
diff --git a/mm/hmm.c b/mm/hmm.c
index 3b2dda71d0ed..11df3ca30b82 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -255,7 +255,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		 */
 		if (hmm_is_device_private_entry(range, entry)) {
 			cpu_flags = HMM_PFN_VALID;
-			if (is_write_device_private_entry(entry))
+			if (is_writable_device_private_entry(entry))
 				cpu_flags |= HMM_PFN_WRITE;
 			*hmm_pfn = swp_offset(entry) | cpu_flags;
 			return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 327b8d9d8d2f..1c81aa11d618 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1054,8 +1054,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		swp_entry_t entry = pmd_to_swp_entry(pmd);
 
 		VM_BUG_ON(!is_pmd_migration_entry(pmd));
-		if (is_write_migration_entry(entry)) {
-			make_migration_entry_read(&entry);
+		if (is_writable_migration_entry(entry)) {
+			entry = make_readable_migration_entry(
+							swp_offset(entry));
 			pmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*src_pmd))
 				pmd = pmd_swp_mksoft_dirty(pmd);
@@ -1772,13 +1773,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
 
 		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
-		if (is_write_migration_entry(entry)) {
+		if (is_writable_migration_entry(entry)) {
 			pmd_t newpmd;
 			/*
 			 * A protection check is difficult so
 			 * just be safe and disable write
 			 */
-			make_migration_entry_read(&entry);
+			entry = make_readable_migration_entry(
+							swp_offset(entry));
 			newpmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*pmd))
 				newpmd = pmd_swp_mksoft_dirty(newpmd);
@@ -2067,7 +2069,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 		entry = pmd_to_swp_entry(old_pmd);
 		page = pfn_swap_entry_to_page(entry);
-		write = is_write_migration_entry(entry);
+		write = is_writable_migration_entry(entry);
 		young = false;
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
@@ -2099,7 +2101,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		if (freeze || pmd_migration) {
 			swp_entry_t swp_entry;
-			swp_entry = make_migration_entry(page + i, write);
+			if (write)
+				swp_entry = make_writable_migration_entry(
+							page_to_pfn(page + i));
+			else
+				swp_entry = make_readable_migration_entry(
+							page_to_pfn(page + i));
 			entry = swp_entry_to_pte(swp_entry);
 			if (soft_dirty)
 				entry = pte_swp_mksoft_dirty(entry);
@@ -3171,7 +3178,10 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
 	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
 	if (pmd_dirty(pmdval))
 		set_page_dirty(page);
-	entry = make_migration_entry(page, pmd_write(pmdval));
+	if (pmd_write(pmdval))
+		entry = make_writable_migration_entry(page_to_pfn(page));
+	else
+		entry = make_readable_migration_entry(page_to_pfn(page));
 	pmdswp = swp_entry_to_pmd(entry);
 	if (pmd_soft_dirty(pmdval))
 		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
@@ -3197,7 +3207,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 	pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
 	if (pmd_swp_soft_dirty(*pvmw->pmd))
 		pmde = pmd_mksoft_dirty(pmde);
-	if (is_write_migration_entry(entry))
+	if (is_writable_migration_entry(entry))
 		pmde = maybe_pmd_mkwrite(pmde, vma);
 	if (pmd_swp_uffd_wp(*pvmw->pmd))
 		pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 89ba5147206e..924553aa8f78 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4242,12 +4242,13 @@ again:
 				    is_hugetlb_entry_hwpoisoned(entry))) {
 			swp_entry_t swp_entry = pte_to_swp_entry(entry);
 
-			if (is_write_migration_entry(swp_entry) && cow) {
+			if (is_writable_migration_entry(swp_entry) && cow) {
 				/*
 				 * COW mappings require pages in both
 				 * parent and child to be set to read.
 				 */
-				make_migration_entry_read(&swp_entry);
+				swp_entry = make_readable_migration_entry(
+							swp_offset(swp_entry));
 				entry = swp_entry_to_pte(swp_entry);
 				set_huge_swap_pte_at(src, addr, src_pte,
 						     entry, sz);
@@ -5532,10 +5533,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		if (unlikely(is_hugetlb_entry_migration(pte))) {
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			if (is_write_migration_entry(entry)) {
+			if (is_writable_migration_entry(entry)) {
 				pte_t newpte;
 
-				make_migration_entry_read(&entry);
+				entry = make_readable_migration_entry(
+							swp_offset(entry));
 				newpte = swp_entry_to_pte(entry);
 				set_huge_swap_pte_at(mm, address, ptep,
 						     newpte, huge_page_size(h));
diff --git a/mm/memory.c b/mm/memory.c
index 6723931085c7..e30488e9202f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -733,13 +733,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 		rss[mm_counter(page)]++;
 
-		if (is_write_migration_entry(entry) &&
+		if (is_writable_migration_entry(entry) &&
 				is_cow_mapping(vm_flags)) {
 			/*
 			 * COW mappings require pages in both
 			 * parent and child to be set to read.
 			 */
-			make_migration_entry_read(&entry);
+			entry = make_readable_migration_entry(
+							swp_offset(entry));
 			pte = swp_entry_to_pte(entry);
 			if (pte_swp_soft_dirty(*src_pte))
 				pte = pte_swp_mksoft_dirty(pte);
@@ -770,9 +771,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * when a device driver is involved (you cannot easily
 		 * save and restore device driver state).
 		 */
-		if (is_write_device_private_entry(entry) &&
+		if (is_writable_device_private_entry(entry) &&
 		    is_cow_mapping(vm_flags)) {
-			make_device_private_entry_read(&entry);
+			entry = make_readable_device_private_entry(
+							swp_offset(entry));
 			pte = swp_entry_to_pte(entry);
 			if (pte_swp_uffd_wp(*src_pte))
 				pte = pte_swp_mkuffd_wp(pte);
diff --git a/mm/migrate.c b/mm/migrate.c
index b4abb87249e1..de47bdfbd2f8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -210,13 +210,18 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 		 * Recheck VMA as permissions can change since migration started
 		 */
 		entry = pte_to_swp_entry(*pvmw.pte);
-		if (is_write_migration_entry(entry))
+		if (is_writable_migration_entry(entry))
 			pte = maybe_mkwrite(pte, vma);
 		else if (pte_swp_uffd_wp(*pvmw.pte))
 			pte = pte_mkuffd_wp(pte);
 
 		if (unlikely(is_device_private_page(new))) {
-			entry = make_device_private_entry(new, pte_write(pte));
+			if (pte_write(pte))
+				entry = make_writable_device_private_entry(
+							page_to_pfn(new));
+			else
+				entry = make_readable_device_private_entry(
+							page_to_pfn(new));
 			pte = swp_entry_to_pte(entry);
 			if (pte_swp_soft_dirty(*pvmw.pte))
 				pte = pte_swp_mksoft_dirty(pte);
@@ -2297,7 +2302,7 @@ again:
 
 			mpfn = migrate_pfn(page_to_pfn(page)) |
 					MIGRATE_PFN_MIGRATE;
-			if (is_write_device_private_entry(entry))
+			if (is_writable_device_private_entry(entry))
 				mpfn |= MIGRATE_PFN_WRITE;
 		} else {
 			if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
@@ -2343,8 +2348,12 @@ again:
 			ptep_get_and_clear(mm, addr, ptep);
 
 			/* Setup special migration page table entry */
-			entry = make_migration_entry(page, mpfn &
-						     MIGRATE_PFN_WRITE);
+			if (mpfn & MIGRATE_PFN_WRITE)
+				entry = make_writable_migration_entry(
+							page_to_pfn(page));
+			else
+				entry = make_readable_migration_entry(
+							page_to_pfn(page));
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_present(pte)) {
 				if (pte_soft_dirty(pte))
@@ -2817,7 +2826,12 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 		if (is_device_private_page(page)) {
 			swp_entry_t swp_entry;
 
-			swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+			if (vma->vm_flags & VM_WRITE)
+				swp_entry = make_writable_device_private_entry(
+							page_to_pfn(page));
+			else
+				swp_entry = make_readable_device_private_entry(
+							page_to_pfn(page));
 			entry = swp_entry_to_pte(swp_entry);
 		} else {
 			/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e7a443157988..ee5961888e70 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -143,23 +143,25 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 			pte_t newpte;
 
-			if (is_write_migration_entry(entry)) {
+			if (is_writable_migration_entry(entry)) {
 				/*
 				 * A protection check is difficult so
 				 * just be safe and disable write
 				 */
-				make_migration_entry_read(&entry);
+				entry = make_readable_migration_entry(
+							swp_offset(entry));
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_soft_dirty(oldpte))
 					newpte = pte_swp_mksoft_dirty(newpte);
 				if (pte_swp_uffd_wp(oldpte))
 					newpte = pte_swp_mkuffd_wp(newpte);
-			} else if (is_write_device_private_entry(entry)) {
+			} else if (is_writable_device_private_entry(entry)) {
 				/*
 				 * We do not preserve soft-dirtiness. See
 				 * copy_one_pte() for explanation.
 				 */
-				make_device_private_entry_read(&entry);
+				entry = make_readable_device_private_entry(
+							swp_offset(entry));
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_uffd_wp(oldpte))
 					newpte = pte_swp_mkuffd_wp(newpte);
diff --git a/mm/rmap.c b/mm/rmap.c
index f9fd5bc54f0a..b9986c8db524 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1533,7 +1533,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * pte. do_swap_page() will wait until the migration
 			 * pte is removed and then restart fault handling.
 			 */
-			entry = make_migration_entry(page, 0);
+			entry = make_readable_migration_entry(page_to_pfn(page));
 			swp_pte = swp_entry_to_pte(entry);
 
 			/*
@@ -1629,8 +1629,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 * pte. do_swap_page() will wait until the migration
 			 * pte is removed and then restart fault handling.
 			 */
-			entry = make_migration_entry(subpage,
-					pte_write(pteval));
+			if (pte_write(pteval))
+				entry = make_writable_migration_entry(
+							page_to_pfn(subpage));
+			else
+				entry = make_readable_migration_entry(
+							page_to_pfn(subpage));
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-- 
cgit v1.2.3


From cd62734ca60dbb2ab5bb19c8d837dd9990955310 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Wed, 30 Jun 2021 18:54:12 -0700
Subject: mm/rmap: split try_to_munlock from try_to_unmap

The behaviour of try_to_unmap_one() is difficult to follow because it
performs different operations based on a fairly large set of flags used in
different combinations.

TTU_MUNLOCK is one such flag.  However it is exclusively used by
try_to_munlock() which specifies no other flags.  Therefore rather than
overload try_to_unmap_one() with unrelated behaviour split this out into
it's own function and remove the flag.

Link: https://lkml.kernel.org/r/20210616105937.23201-4-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/unevictable-lru.rst | 33 +++++++-----------
 include/linux/rmap.h                 |  3 +-
 mm/mlock.c                           | 12 +++----
 mm/rmap.c                            | 66 ++++++++++++++++++++++++++----------
 4 files changed, 69 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
index 0e1490524f53..eae3af17f2d9 100644
--- a/Documentation/vm/unevictable-lru.rst
+++ b/Documentation/vm/unevictable-lru.rst
@@ -389,14 +389,14 @@ mlocked, munlock_vma_page() updates that zone statistics for the number of
 mlocked pages.  Note, however, that at this point we haven't checked whether
 the page is mapped by other VM_LOCKED VMAs.
 
-We can't call try_to_munlock(), the function that walks the reverse map to
+We can't call page_mlock(), the function that walks the reverse map to
 check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
-try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
+page_mlock() is a variant of try_to_unmap() and thus requires that the page
 not be on an LRU list [more on these below].  However, the call to
-isolate_lru_page() could fail, in which case we couldn't try_to_munlock().  So,
+isolate_lru_page() could fail, in which case we can't call page_mlock().  So,
 we go ahead and clear PG_mlocked up front, as this might be the only chance we
-have.  If we can successfully isolate the page, we go ahead and
-try_to_munlock(), which will restore the PG_mlocked flag and update the zone
+have.  If we can successfully isolate the page, we go ahead and call
+page_mlock(), which will restore the PG_mlocked flag and update the zone
 page statistics if it finds another VMA holding the page mlocked.  If we fail
 to isolate the page, we'll have left a potentially mlocked page on the LRU.
 This is fine, because we'll catch it later if and if vmscan tries to reclaim
@@ -545,31 +545,24 @@ munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
 holepunching, and truncation of file pages and their anonymous COWed pages.
 
 
-try_to_munlock() Reverse Map Scan
+page_mlock() Reverse Map Scan
 ---------------------------------
 
-.. warning::
-   [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
-   page_referenced() reverse map walker.
-
 When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call
 Handling <munlock_munlockall_handling>` above] tries to munlock a
 page, it needs to determine whether or not the page is mapped by any
 VM_LOCKED VMA without actually attempting to unmap all PTEs from the
 page.  For this purpose, the unevictable/mlock infrastructure
-introduced a variant of try_to_unmap() called try_to_munlock().
+introduced a variant of try_to_unmap() called page_mlock().
 
-try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
-mapped file and KSM pages with a flag argument specifying unlock versus unmap
-processing.  Again, these functions walk the respective reverse maps looking
-for VM_LOCKED VMAs.  When such a VMA is found, as in the try_to_unmap() case,
-the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK.  This
-undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page.
+page_mlock() walks the respective reverse maps looking for VM_LOCKED VMAs. When
+such a VMA is found the page is mlocked via mlock_vma_page(). This undoes the
+pre-clearing of the page's PG_mlocked done by munlock_vma_page.
 
-Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
+Note that page_mlock()'s reverse map walk must visit every VMA in a page's
 reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
 However, the scan can terminate when it encounters a VM_LOCKED VMA.
-Although try_to_munlock() might be called a great many times when munlocking a
+Although page_mlock() might be called a great many times when munlocking a
 large region or tearing down a large address space that has been mlocked via
 mlockall(), overall this is a fairly rare event.
 
@@ -602,7 +595,7 @@ inactive lists to the appropriate node's unevictable list.
 shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
 after shrink_active_list() had moved them to the inactive list, or pages mapped
 into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
-recheck via try_to_munlock().  shrink_inactive_list() won't notice the latter,
+recheck via page_mlock().  shrink_inactive_list() won't notice the latter,
 but will pass on to shrink_page_list().
 
 shrink_page_list() again culls obviously unevictable pages that it could
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index ed31a559e857..69190efbd842 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -87,7 +87,6 @@ struct anon_vma_chain {
 
 enum ttu_flags {
 	TTU_MIGRATION		= 0x1,	/* migration mode */
-	TTU_MUNLOCK		= 0x2,	/* munlock mode */
 
 	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
 	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
@@ -240,7 +239,7 @@ int page_mkclean(struct page *);
  * called in munlock()/munmap() path to check for other vmas holding
  * the page mlocked.
  */
-void try_to_munlock(struct page *);
+void page_mlock(struct page *page);
 
 void remove_migration_ptes(struct page *old, struct page *new, bool locked);
 
diff --git a/mm/mlock.c b/mm/mlock.c
index df590fda5688..4ab757ab6fe8 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -108,7 +108,7 @@ void mlock_vma_page(struct page *page)
 /*
  * Finish munlock after successful page isolation
  *
- * Page must be locked. This is a wrapper for try_to_munlock()
+ * Page must be locked. This is a wrapper for page_mlock()
  * and putback_lru_page() with munlock accounting.
  */
 static void __munlock_isolated_page(struct page *page)
@@ -118,7 +118,7 @@ static void __munlock_isolated_page(struct page *page)
 	 * and we don't need to check all the other vmas.
 	 */
 	if (page_mapcount(page) > 1)
-		try_to_munlock(page);
+		page_mlock(page);
 
 	/* Did try_to_unlock() succeed or punt? */
 	if (!PageMlocked(page))
@@ -158,7 +158,7 @@ static void __munlock_isolation_failed(struct page *page)
  * munlock()ed or munmap()ed, we want to check whether other vmas hold the
  * page locked so that we can leave it on the unevictable lru list and not
  * bother vmscan with it.  However, to walk the page's rmap list in
- * try_to_munlock() we must isolate the page from the LRU.  If some other
+ * page_mlock() we must isolate the page from the LRU.  If some other
  * task has removed the page from the LRU, we won't be able to do that.
  * So we clear the PageMlocked as we might not get another chance.  If we
  * can't isolate the page, we leave it for putback_lru_page() and vmscan
@@ -168,7 +168,7 @@ unsigned int munlock_vma_page(struct page *page)
 {
 	int nr_pages;
 
-	/* For try_to_munlock() and to serialize with page migration */
+	/* For page_mlock() and to serialize with page migration */
 	BUG_ON(!PageLocked(page));
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
@@ -205,7 +205,7 @@ static int __mlock_posix_error_return(long retval)
  *
  * The fast path is available only for evictable pages with single mapping.
  * Then we can bypass the per-cpu pvec and get better performance.
- * when mapcount > 1 we need try_to_munlock() which can fail.
+ * when mapcount > 1 we need page_mlock() which can fail.
  * when !page_evictable(), we need the full redo logic of putback_lru_page to
  * avoid leaving evictable page in unevictable list.
  *
@@ -414,7 +414,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
  *
  * We don't save and restore VM_LOCKED here because pages are
  * still on lru.  In unmap path, pages might be scanned by reclaim
- * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * and re-mlocked by page_mlock/try_to_unmap before we unmap and
  * free them.  This will result in freeing mlocked pages.
  */
 void munlock_vma_pages_range(struct vm_area_struct *vma,
diff --git a/mm/rmap.c b/mm/rmap.c
index b9986c8db524..73c0ff1c73ab 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1411,10 +1411,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	if (flags & TTU_SYNC)
 		pvmw.flags = PVMW_SYNC;
 
-	/* munlock has nothing to gain from examining un-locked vmas */
-	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
-		return true;
-
 	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
 	    is_zone_device_page(page) && !is_device_private_page(page))
 		return true;
@@ -1476,8 +1472,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				page_vma_mapped_walk_done(&pvmw);
 				break;
 			}
-			if (flags & TTU_MUNLOCK)
-				continue;
 		}
 
 		/* Unexpected PMD-mapped THP? */
@@ -1790,20 +1784,58 @@ void try_to_unmap(struct page *page, enum ttu_flags flags)
 		rmap_walk(page, &rwc);
 }
 
+/*
+ * Walks the vma's mapping a page and mlocks the page if any locked vma's are
+ * found. Once one is found the page is locked and the scan can be terminated.
+ */
+static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
+				 unsigned long address, void *unused)
+{
+	struct page_vma_mapped_walk pvmw = {
+		.page = page,
+		.vma = vma,
+		.address = address,
+	};
+
+	/* An un-locked vma doesn't have any pages to lock, continue the scan */
+	if (!(vma->vm_flags & VM_LOCKED))
+		return true;
+
+	while (page_vma_mapped_walk(&pvmw)) {
+		/*
+		 * Need to recheck under the ptl to serialise with
+		 * __munlock_pagevec_fill() after VM_LOCKED is cleared in
+		 * munlock_vma_pages_range().
+		 */
+		if (vma->vm_flags & VM_LOCKED) {
+			/* PTE-mapped THP are never mlocked */
+			if (!PageTransCompound(page))
+				mlock_vma_page(page);
+			page_vma_mapped_walk_done(&pvmw);
+		}
+
+		/*
+		 * no need to continue scanning other vma's if the page has
+		 * been locked.
+		 */
+		return false;
+	}
+
+	return true;
+}
+
 /**
- * try_to_munlock - try to munlock a page
- * @page: the page to be munlocked
+ * page_mlock - try to mlock a page
+ * @page: the page to be mlocked
  *
- * Called from munlock code.  Checks all of the VMAs mapping the page
- * to make sure nobody else has this page mlocked. The page will be
- * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ * Called from munlock code. Checks all of the VMAs mapping the page and mlocks
+ * the page if any are found. The page will be returned with PG_mlocked cleared
+ * if it is not mapped by any locked vmas.
  */
-
-void try_to_munlock(struct page *page)
+void page_mlock(struct page *page)
 {
 	struct rmap_walk_control rwc = {
-		.rmap_one = try_to_unmap_one,
-		.arg = (void *)TTU_MUNLOCK,
+		.rmap_one = page_mlock_one,
 		.done = page_not_mapped,
 		.anon_lock = page_lock_anon_vma_read,
 
@@ -1855,7 +1887,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the anon_vma struct it points to.
  *
- * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
+ * When called from page_mlock(), the mmap_lock of the mm containing the vma
  * where the page was found will be held for write.  So, we won't recheck
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * LOCKED.
@@ -1908,7 +1940,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the address_space struct it points to.
  *
- * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
+ * When called from page_mlock(), the mmap_lock of the mm containing the vma
  * where the page was found will be held for write.  So, we won't recheck
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * LOCKED.
-- 
cgit v1.2.3


From a98a2f0c8ce1b2138cb8e3ae410444dedcc14809 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Wed, 30 Jun 2021 18:54:16 -0700
Subject: mm/rmap: split migration into its own function

Migration is currently implemented as a mode of operation for
try_to_unmap_one() generally specified by passing the TTU_MIGRATION flag
or in the case of splitting a huge anonymous page TTU_SPLIT_FREEZE.

However it does not have much in common with the rest of the unmap
functionality of try_to_unmap_one() and thus splitting it into a separate
function reduces the complexity of try_to_unmap_one() making it more
readable.

Several simplifications can also be made in try_to_migrate_one() based on
the following observations:

 - All users of TTU_MIGRATION also set TTU_IGNORE_MLOCK.
 - No users of TTU_MIGRATION ever set TTU_IGNORE_HWPOISON.
 - No users of TTU_MIGRATION ever set TTU_BATCH_FLUSH.

TTU_SPLIT_FREEZE is a special case of migration used when splitting an
anonymous page.  This is most easily dealt with by calling the correct
function from unmap_page() in mm/huge_memory.c - either try_to_migrate()
for PageAnon or try_to_unmap().

Link: https://lkml.kernel.org/r/20210616105937.23201-5-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |   4 +-
 mm/huge_memory.c     |  16 ++-
 mm/migrate.c         |   9 +-
 mm/rmap.c            | 367 ++++++++++++++++++++++++++++++++++++++-------------
 4 files changed, 289 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 69190efbd842..b0ea9d98302f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -86,8 +86,6 @@ struct anon_vma_chain {
 };
 
 enum ttu_flags {
-	TTU_MIGRATION		= 0x1,	/* migration mode */
-
 	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
 	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
 	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
@@ -97,7 +95,6 @@ enum ttu_flags {
 					 * do a final flush if necessary */
 	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
 					 * caller holds it */
-	TTU_SPLIT_FREEZE	= 0x100,		/* freeze pte under splitting thp */
 };
 
 #ifdef CONFIG_MMU
@@ -194,6 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
 int page_referenced(struct page *, int is_locked,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
 
+void try_to_migrate(struct page *page, enum ttu_flags flags);
 void try_to_unmap(struct page *, enum ttu_flags flags);
 
 /* Avoid racy checks */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1c81aa11d618..8b731d53e9f4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2309,16 +2309,20 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 
 static void unmap_page(struct page *page)
 {
-	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
-		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+		TTU_SYNC;
 
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
-	/* If TTU_SPLIT_FREEZE is ever extended to file, update remap_page() */
+	/*
+	 * Anon pages need migration entries to preserve them, but file
+	 * pages can simply be left unmapped, then faulted back on demand.
+	 * If that is ever changed (perhaps for mlock), update remap_page().
+	 */
 	if (PageAnon(page))
-		ttu_flags |= TTU_SPLIT_FREEZE;
-
-	try_to_unmap(page, ttu_flags);
+		try_to_migrate(page, ttu_flags);
+	else
+		try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
 
 	VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index de47bdfbd2f8..a69f54bd92b9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1109,7 +1109,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		/* Establish migration ptes */
 		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
 				page);
-		try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
+		try_to_migrate(page, 0);
 		page_was_mapped = 1;
 	}
 
@@ -1311,7 +1311,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 
 	if (page_mapped(hpage)) {
 		bool mapping_locked = false;
-		enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
+		enum ttu_flags ttu = 0;
 
 		if (!PageAnon(hpage)) {
 			/*
@@ -1328,7 +1328,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 			ttu |= TTU_RMAP_LOCKED;
 		}
 
-		try_to_unmap(hpage, ttu);
+		try_to_migrate(hpage, ttu);
 		page_was_mapped = 1;
 
 		if (mapping_locked)
@@ -2602,7 +2602,6 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
  */
 static void migrate_vma_unmap(struct migrate_vma *migrate)
 {
-	int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
 	const unsigned long npages = migrate->npages;
 	const unsigned long start = migrate->start;
 	unsigned long addr, i, restore = 0;
@@ -2614,7 +2613,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
 			continue;
 
 		if (page_mapped(page)) {
-			try_to_unmap(page, flags);
+			try_to_migrate(page, 0);
 			if (page_mapped(page))
 				goto restore;
 		}
diff --git a/mm/rmap.c b/mm/rmap.c
index 73c0ff1c73ab..b922c7074cdd 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1411,14 +1411,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	if (flags & TTU_SYNC)
 		pvmw.flags = PVMW_SYNC;
 
-	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
-	    is_zone_device_page(page) && !is_device_private_page(page))
-		return true;
-
-	if (flags & TTU_SPLIT_HUGE_PMD) {
-		split_huge_pmd_address(vma, address,
-				flags & TTU_SPLIT_FREEZE, page);
-	}
+	if (flags & TTU_SPLIT_HUGE_PMD)
+		split_huge_pmd_address(vma, address, false, page);
 
 	/*
 	 * For THP, we have to assume the worse case ie pmd for invalidation.
@@ -1443,16 +1437,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(&range);
 
 	while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-		/* PMD-mapped THP migration entry */
-		if (!pvmw.pte && (flags & TTU_MIGRATION)) {
-			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
-
-			set_pmd_migration_entry(&pvmw, page);
-			continue;
-		}
-#endif
-
 		/*
 		 * If the page is mlock()d, we cannot swap it out.
 		 * If it's recently referenced (perhaps page_referenced
@@ -1514,46 +1498,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			}
 		}
 
-		if (IS_ENABLED(CONFIG_MIGRATION) &&
-		    (flags & TTU_MIGRATION) &&
-		    is_zone_device_page(page)) {
-			swp_entry_t entry;
-			pte_t swp_pte;
-
-			pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
-
-			/*
-			 * Store the pfn of the page in a special migration
-			 * pte. do_swap_page() will wait until the migration
-			 * pte is removed and then restart fault handling.
-			 */
-			entry = make_readable_migration_entry(page_to_pfn(page));
-			swp_pte = swp_entry_to_pte(entry);
-
-			/*
-			 * pteval maps a zone device page and is therefore
-			 * a swap pte.
-			 */
-			if (pte_swp_soft_dirty(pteval))
-				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-			if (pte_swp_uffd_wp(pteval))
-				swp_pte = pte_swp_mkuffd_wp(swp_pte);
-			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
-			/*
-			 * No need to invalidate here it will synchronize on
-			 * against the special swap migration pte.
-			 *
-			 * The assignment to subpage above was computed from a
-			 * swap PTE which results in an invalid pointer.
-			 * Since only PAGE_SIZE pages can currently be
-			 * migrated, just set it to page. This will need to be
-			 * changed when hugepage migrations to device private
-			 * memory are supported.
-			 */
-			subpage = page;
-			goto discard;
-		}
-
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
 		if (should_defer_flush(mm, flags)) {
@@ -1606,39 +1550,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			/* We have to invalidate as we cleared the pte */
 			mmu_notifier_invalidate_range(mm, address,
 						      address + PAGE_SIZE);
-		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
-				(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
-			swp_entry_t entry;
-			pte_t swp_pte;
-
-			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
-				set_pte_at(mm, address, pvmw.pte, pteval);
-				ret = false;
-				page_vma_mapped_walk_done(&pvmw);
-				break;
-			}
-
-			/*
-			 * Store the pfn of the page in a special migration
-			 * pte. do_swap_page() will wait until the migration
-			 * pte is removed and then restart fault handling.
-			 */
-			if (pte_write(pteval))
-				entry = make_writable_migration_entry(
-							page_to_pfn(subpage));
-			else
-				entry = make_readable_migration_entry(
-							page_to_pfn(subpage));
-			swp_pte = swp_entry_to_pte(entry);
-			if (pte_soft_dirty(pteval))
-				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-			if (pte_uffd_wp(pteval))
-				swp_pte = pte_swp_mkuffd_wp(swp_pte);
-			set_pte_at(mm, address, pvmw.pte, swp_pte);
-			/*
-			 * No need to invalidate here it will synchronize on
-			 * against the special swap migration pte.
-			 */
 		} else if (PageAnon(page)) {
 			swp_entry_t entry = { .val = page_private(subpage) };
 			pte_t swp_pte;
@@ -1766,6 +1677,277 @@ void try_to_unmap(struct page *page, enum ttu_flags flags)
 		.anon_lock = page_lock_anon_vma_read,
 	};
 
+	if (flags & TTU_RMAP_LOCKED)
+		rmap_walk_locked(page, &rwc);
+	else
+		rmap_walk(page, &rwc);
+}
+
+/*
+ * @arg: enum ttu_flags will be passed to this argument.
+ *
+ * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
+ * containing migration entries. This and TTU_RMAP_LOCKED are the only supported
+ * flags.
+ */
+static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
+		     unsigned long address, void *arg)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page_vma_mapped_walk pvmw = {
+		.page = page,
+		.vma = vma,
+		.address = address,
+	};
+	pte_t pteval;
+	struct page *subpage;
+	bool ret = true;
+	struct mmu_notifier_range range;
+	enum ttu_flags flags = (enum ttu_flags)(long)arg;
+
+	if (is_zone_device_page(page) && !is_device_private_page(page))
+		return true;
+
+	/*
+	 * When racing against e.g. zap_pte_range() on another cpu,
+	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+	 * try_to_migrate() may return before page_mapped() has become false,
+	 * if page table locking is skipped: use TTU_SYNC to wait for that.
+	 */
+	if (flags & TTU_SYNC)
+		pvmw.flags = PVMW_SYNC;
+
+	/*
+	 * unmap_page() in mm/huge_memory.c is the only user of migration with
+	 * TTU_SPLIT_HUGE_PMD and it wants to freeze.
+	 */
+	if (flags & TTU_SPLIT_HUGE_PMD)
+		split_huge_pmd_address(vma, address, true, page);
+
+	/*
+	 * For THP, we have to assume the worse case ie pmd for invalidation.
+	 * For hugetlb, it could be much worse if we need to do pud
+	 * invalidation in the case of pmd sharing.
+	 *
+	 * Note that the page can not be free in this function as call of
+	 * try_to_unmap() must hold a reference on the page.
+	 */
+	range.end = PageKsm(page) ?
+			address + PAGE_SIZE : vma_address_end(page, vma);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				address, range.end);
+	if (PageHuge(page)) {
+		/*
+		 * If sharing is possible, start and end will be adjusted
+		 * accordingly.
+		 */
+		adjust_range_if_pmd_sharing_possible(vma, &range.start,
+						     &range.end);
+	}
+	mmu_notifier_invalidate_range_start(&range);
+
+	while (page_vma_mapped_walk(&pvmw)) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+		/* PMD-mapped THP migration entry */
+		if (!pvmw.pte) {
+			VM_BUG_ON_PAGE(PageHuge(page) ||
+				       !PageTransCompound(page), page);
+
+			set_pmd_migration_entry(&pvmw, page);
+			continue;
+		}
+#endif
+
+		/* Unexpected PMD-mapped THP? */
+		VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+		address = pvmw.address;
+
+		if (PageHuge(page) && !PageAnon(page)) {
+			/*
+			 * To call huge_pmd_unshare, i_mmap_rwsem must be
+			 * held in write mode.  Caller needs to explicitly
+			 * do this outside rmap routines.
+			 */
+			VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+			if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+				/*
+				 * huge_pmd_unshare unmapped an entire PMD
+				 * page.  There is no way of knowing exactly
+				 * which PMDs may be cached for this mm, so
+				 * we must flush them all.  start/end were
+				 * already adjusted above to cover this range.
+				 */
+				flush_cache_range(vma, range.start, range.end);
+				flush_tlb_range(vma, range.start, range.end);
+				mmu_notifier_invalidate_range(mm, range.start,
+							      range.end);
+
+				/*
+				 * The ref count of the PMD page was dropped
+				 * which is part of the way map counting
+				 * is done for shared PMDs.  Return 'true'
+				 * here.  When there is no other sharing,
+				 * huge_pmd_unshare returns false and we will
+				 * unmap the actual page and drop map count
+				 * to zero.
+				 */
+				page_vma_mapped_walk_done(&pvmw);
+				break;
+			}
+		}
+
+		/* Nuke the page table entry. */
+		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+		pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+		/* Move the dirty bit to the page. Now the pte is gone. */
+		if (pte_dirty(pteval))
+			set_page_dirty(page);
+
+		/* Update high watermark before we lower rss */
+		update_hiwater_rss(mm);
+
+		if (is_zone_device_page(page)) {
+			swp_entry_t entry;
+			pte_t swp_pte;
+
+			/*
+			 * Store the pfn of the page in a special migration
+			 * pte. do_swap_page() will wait until the migration
+			 * pte is removed and then restart fault handling.
+			 */
+			entry = make_readable_migration_entry(
+							page_to_pfn(page));
+			swp_pte = swp_entry_to_pte(entry);
+
+			/*
+			 * pteval maps a zone device page and is therefore
+			 * a swap pte.
+			 */
+			if (pte_swp_soft_dirty(pteval))
+				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_swp_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
+			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+			/*
+			 * No need to invalidate here it will synchronize on
+			 * against the special swap migration pte.
+			 *
+			 * The assignment to subpage above was computed from a
+			 * swap PTE which results in an invalid pointer.
+			 * Since only PAGE_SIZE pages can currently be
+			 * migrated, just set it to page. This will need to be
+			 * changed when hugepage migrations to device private
+			 * memory are supported.
+			 */
+			subpage = page;
+		} else if (PageHWPoison(page)) {
+			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
+			if (PageHuge(page)) {
+				hugetlb_count_sub(compound_nr(page), mm);
+				set_huge_swap_pte_at(mm, address,
+						     pvmw.pte, pteval,
+						     vma_mmu_pagesize(vma));
+			} else {
+				dec_mm_counter(mm, mm_counter(page));
+				set_pte_at(mm, address, pvmw.pte, pteval);
+			}
+
+		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+			/*
+			 * The guest indicated that the page content is of no
+			 * interest anymore. Simply discard the pte, vmscan
+			 * will take care of the rest.
+			 * A future reference will then fault in a new zero
+			 * page. When userfaultfd is active, we must not drop
+			 * this page though, as its main user (postcopy
+			 * migration) will not expect userfaults on already
+			 * copied pages.
+			 */
+			dec_mm_counter(mm, mm_counter(page));
+			/* We have to invalidate as we cleared the pte */
+			mmu_notifier_invalidate_range(mm, address,
+						      address + PAGE_SIZE);
+		} else {
+			swp_entry_t entry;
+			pte_t swp_pte;
+
+			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+				set_pte_at(mm, address, pvmw.pte, pteval);
+				ret = false;
+				page_vma_mapped_walk_done(&pvmw);
+				break;
+			}
+
+			/*
+			 * Store the pfn of the page in a special migration
+			 * pte. do_swap_page() will wait until the migration
+			 * pte is removed and then restart fault handling.
+			 */
+			if (pte_write(pteval))
+				entry = make_writable_migration_entry(
+							page_to_pfn(subpage));
+			else
+				entry = make_readable_migration_entry(
+							page_to_pfn(subpage));
+
+			swp_pte = swp_entry_to_pte(entry);
+			if (pte_soft_dirty(pteval))
+				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
+			set_pte_at(mm, address, pvmw.pte, swp_pte);
+			/*
+			 * No need to invalidate here it will synchronize on
+			 * against the special swap migration pte.
+			 */
+		}
+
+		/*
+		 * No need to call mmu_notifier_invalidate_range() it has be
+		 * done above for all cases requiring it to happen under page
+		 * table lock before mmu_notifier_invalidate_range_end()
+		 *
+		 * See Documentation/vm/mmu_notifier.rst
+		 */
+		page_remove_rmap(subpage, PageHuge(page));
+		put_page(page);
+	}
+
+	mmu_notifier_invalidate_range_end(&range);
+
+	return ret;
+}
+
+/**
+ * try_to_migrate - try to replace all page table mappings with swap entries
+ * @page: the page to replace page table entries for
+ * @flags: action and flags
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special swap entries. Caller must hold the page lock.
+ *
+ * If is successful, return true. Otherwise, false.
+ */
+void try_to_migrate(struct page *page, enum ttu_flags flags)
+{
+	struct rmap_walk_control rwc = {
+		.rmap_one = try_to_migrate_one,
+		.arg = (void *)flags,
+		.done = page_not_mapped,
+		.anon_lock = page_lock_anon_vma_read,
+	};
+
+	/*
+	 * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
+	 * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags.
+	 */
+	if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+					TTU_SYNC)))
+		return;
+
 	/*
 	 * During exec, a temporary VMA is setup and later moved.
 	 * The VMA is moved under the anon_vma lock but not the
@@ -1774,8 +1956,7 @@ void try_to_unmap(struct page *page, enum ttu_flags flags)
 	 * locking requirements of exec(), migration skips
 	 * temporary VMAs until after exec() completes.
 	 */
-	if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
-	    && !PageKsm(page) && PageAnon(page))
+	if (!PageKsm(page) && PageAnon(page))
 		rwc.invalid_vma = invalid_migration_vma;
 
 	if (flags & TTU_RMAP_LOCKED)
-- 
cgit v1.2.3


From 6b49bf6ddbb0d7992c816846acfa5fd1cf751c36 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Wed, 30 Jun 2021 18:54:19 -0700
Subject: mm: rename migrate_pgmap_owner

MMU notifier ranges have a migrate_pgmap_owner field which is used by
drivers to store a pointer.  This is subsequently used by the driver
callback to filter MMU_NOTIFY_MIGRATE events.  Other notifier event types
can also benefit from this filtering, so rename the 'migrate_pgmap_owner'
field to 'owner' and create a new notifier initialisation function to
initialise this field.

Link: https://lkml.kernel.org/r/20210616105937.23201-6-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Suggested-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/hmm.rst              |  2 +-
 drivers/gpu/drm/nouveau/nouveau_svm.c |  2 +-
 include/linux/mmu_notifier.h          | 20 ++++++++++----------
 lib/test_hmm.c                        |  2 +-
 mm/migrate.c                          | 10 +++++-----
 5 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 09e28507f5b2..3df79307a797 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -332,7 +332,7 @@ between device driver specific code and shared common code:
    walks to fill in the ``args->src`` array with PFNs to be migrated.
    The ``invalidate_range_start()`` callback is passed a
    ``struct mmu_notifier_range`` with the ``event`` field set to
-   ``MMU_NOTIFY_MIGRATE`` and the ``migrate_pgmap_owner`` field set to
+   ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to
    the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is
    allows the device driver to skip the invalidation callback and only
    invalidate device private MMU mappings that are actually migrating.
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 1c3f890377d2..e963f7d81953 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -265,7 +265,7 @@ nouveau_svmm_invalidate_range_start(struct mmu_notifier *mn,
 	 * the invalidation is handled as part of the migration process.
 	 */
 	if (update->event == MMU_NOTIFY_MIGRATE &&
-	    update->migrate_pgmap_owner == svmm->vmm->cli->drm->dev)
+	    update->owner == svmm->vmm->cli->drm->dev)
 		goto out;
 
 	if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) {
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 1a6a9eb6d3fa..8e428eb813b8 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -41,7 +41,7 @@ struct mmu_interval_notifier;
  *
  * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
  * a device driver to possibly ignore the invalidation if the
- * migrate_pgmap_owner field matches the driver's device private pgmap owner.
+ * owner field matches the driver's device private pgmap owner.
  */
 enum mmu_notifier_event {
 	MMU_NOTIFY_UNMAP = 0,
@@ -269,7 +269,7 @@ struct mmu_notifier_range {
 	unsigned long end;
 	unsigned flags;
 	enum mmu_notifier_event event;
-	void *migrate_pgmap_owner;
+	void *owner;
 };
 
 static inline int mm_has_notifiers(struct mm_struct *mm)
@@ -521,14 +521,14 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
 	range->flags = flags;
 }
 
-static inline void mmu_notifier_range_init_migrate(
-			struct mmu_notifier_range *range, unsigned int flags,
+static inline void mmu_notifier_range_init_owner(
+			struct mmu_notifier_range *range,
+			enum mmu_notifier_event event, unsigned int flags,
 			struct vm_area_struct *vma, struct mm_struct *mm,
-			unsigned long start, unsigned long end, void *pgmap)
+			unsigned long start, unsigned long end, void *owner)
 {
-	mmu_notifier_range_init(range, MMU_NOTIFY_MIGRATE, flags, vma, mm,
-				start, end);
-	range->migrate_pgmap_owner = pgmap;
+	mmu_notifier_range_init(range, event, flags, vma, mm, start, end);
+	range->owner = owner;
 }
 
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\
@@ -655,8 +655,8 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
 
 #define mmu_notifier_range_init(range,event,flags,vma,mm,start,end)  \
 	_mmu_notifier_range_init(range, start, end)
-#define mmu_notifier_range_init_migrate(range, flags, vma, mm, start, end, \
-					pgmap) \
+#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \
+					end, owner) \
 	_mmu_notifier_range_init(range, start, end)
 
 static inline bool
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 15f2e2db77bc..fc7a20bc9b42 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -218,7 +218,7 @@ static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
 	 * the invalidation is handled as part of the migration process.
 	 */
 	if (range->event == MMU_NOTIFY_MIGRATE &&
-	    range->migrate_pgmap_owner == dmirror->mdevice)
+	    range->owner == dmirror->mdevice)
 		return true;
 
 	if (mmu_notifier_range_blockable(range))
diff --git a/mm/migrate.c b/mm/migrate.c
index a69f54bd92b9..23cbd9de030b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2416,8 +2416,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
 	 * that the registered device driver can skip invalidating device
 	 * private page mappings that won't be migrated.
 	 */
-	mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
-		migrate->vma->vm_mm, migrate->start, migrate->end,
+	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
+		migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
 		migrate->pgmap_owner);
 	mmu_notifier_invalidate_range_start(&range);
 
@@ -2927,9 +2927,9 @@ void migrate_vma_pages(struct migrate_vma *migrate)
 			if (!notified) {
 				notified = true;
 
-				mmu_notifier_range_init_migrate(&range, 0,
-					migrate->vma, migrate->vma->vm_mm,
-					addr, migrate->end,
+				mmu_notifier_range_init_owner(&range,
+					MMU_NOTIFY_MIGRATE, 0, migrate->vma,
+					migrate->vma->vm_mm, addr, migrate->end,
 					migrate->pgmap_owner);
 				mmu_notifier_invalidate_range_start(&range);
 			}
-- 
cgit v1.2.3


From b756a3b5e7ead8f6f4b03cea8ac22478ce04c8a8 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Wed, 30 Jun 2021 18:54:25 -0700
Subject: mm: device exclusive memory access

Some devices require exclusive write access to shared virtual memory (SVM)
ranges to perform atomic operations on that memory.  This requires CPU
page tables to be updated to deny access whilst atomic operations are
occurring.

In order to do this introduce a new swap entry type
(SWP_DEVICE_EXCLUSIVE).  When a SVM range needs to be marked for exclusive
access by a device all page table mappings for the particular range are
replaced with device exclusive swap entries.  This causes any CPU access
to the page to result in a fault.

Faults are resovled by replacing the faulting entry with the original
mapping.  This results in MMU notifiers being called which a driver uses
to update access permissions such as revoking atomic access.  After
notifiers have been called the device will no longer have exclusive access
to the region.

Walking of the page tables to find the target pages is handled by
get_user_pages() rather than a direct page table walk.  A direct page
table walk similar to what migrate_vma_collect()/unmap() does could also
have been utilised.  However this resulted in more code similar in
functionality to what get_user_pages() provides as page faulting is
required to make the PTEs present and to break COW.

[dan.carpenter@oracle.com: fix signedness bug in make_device_exclusive_range()]
  Link: https://lkml.kernel.org/r/YNIz5NVnZ5GiZ3u1@mwanda

Link: https://lkml.kernel.org/r/20210616105937.23201-8-apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/hmm.rst     |  17 ++++
 include/linux/mmu_notifier.h |   6 ++
 include/linux/rmap.h         |   4 +
 include/linux/swap.h         |   9 ++-
 include/linux/swapops.h      |  44 +++++++++-
 mm/hmm.c                     |   5 ++
 mm/memory.c                  | 127 ++++++++++++++++++++++++++++-
 mm/mprotect.c                |   8 ++
 mm/page_vma_mapped.c         |   9 ++-
 mm/rmap.c                    | 186 +++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 405 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 3df79307a797..a14c2938e7af 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -405,6 +405,23 @@ between device driver specific code and shared common code:
 
    The lock can now be released.
 
+Exclusive access memory
+=======================
+
+Some devices have features such as atomic PTE bits that can be used to implement
+atomic access to system memory. To support atomic operations to a shared virtual
+memory page such a device needs access to that page which is exclusive of any
+userspace access from the CPU. The ``make_device_exclusive_range()`` function
+can be used to make a memory range inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries. Any attempt to access the swap entry results in a fault which is
+resovled by replacing the entry with the original mapping. A driver gets
+notified that the mapping has been changed by MMU notifiers, after which point
+it will no longer have exclusive access to the page. Exclusive access is
+guranteed to last until the driver drops the page lock and page reference, at
+which point any CPU faults on the page may proceed as described.
+
 Memory cgroup (memcg) and rss accounting
 ========================================
 
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 8e428eb813b8..6692da8d121d 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -42,6 +42,11 @@ struct mmu_interval_notifier;
  * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
  * a device driver to possibly ignore the invalidation if the
  * owner field matches the driver's device private pgmap owner.
+ *
+ * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
+ * longer have exclusive access to the page. When sent during creation of an
+ * exclusive range the owner will be initialised to the value provided by the
+ * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
  */
 enum mmu_notifier_event {
 	MMU_NOTIFY_UNMAP = 0,
@@ -51,6 +56,7 @@ enum mmu_notifier_event {
 	MMU_NOTIFY_SOFT_DIRTY,
 	MMU_NOTIFY_RELEASE,
 	MMU_NOTIFY_MIGRATE,
+	MMU_NOTIFY_EXCLUSIVE,
 };
 
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b0ea9d98302f..83fb86133fe1 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -194,6 +194,10 @@ int page_referenced(struct page *, int is_locked,
 void try_to_migrate(struct page *page, enum ttu_flags flags);
 void try_to_unmap(struct page *, enum ttu_flags flags);
 
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+				unsigned long end, struct page **pages,
+				void *arg);
+
 /* Avoid racy checks */
 #define PVMW_SYNC		(1 << 0)
 /* Look for migarion entries rather than present PTEs */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index df7cbb6b3d3e..6f5a43251593 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -62,12 +62,17 @@ static inline int current_is_kswapd(void)
  * migrate part of a process memory to device memory.
  *
  * When a page is migrated from CPU to device, we set the CPU page table entry
- * to a special SWP_DEVICE_* entry.
+ * to a special SWP_DEVICE_{READ|WRITE} entry.
+ *
+ * When a page is mapped by the device for exclusive access we set the CPU page
+ * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
  */
 #ifdef CONFIG_DEVICE_PRIVATE
-#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_NUM 4
 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
 #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
+#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
 #else
 #define SWP_DEVICE_NUM 0
 #endif
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 04d76357aa0c..d356ab4047f7 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -127,6 +127,27 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry)
 {
 	return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
 }
+
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+{
+	return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset);
+}
+
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+{
+	return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset);
+}
+
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
+		swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
+}
+
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+	return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE);
+}
 #else /* CONFIG_DEVICE_PRIVATE */
 static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
 {
@@ -147,6 +168,26 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry)
 {
 	return false;
 }
+
+static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
+{
+	return swp_entry(0, 0);
+}
+
+static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
+{
+	return swp_entry(0, 0);
+}
+
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+	return false;
+}
+
+static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
+{
+	return false;
+}
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 #ifdef CONFIG_MIGRATION
@@ -226,7 +267,8 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
  */
 static inline bool is_pfn_swap_entry(swp_entry_t entry)
 {
-	return is_migration_entry(entry) || is_device_private_entry(entry);
+	return is_migration_entry(entry) || is_device_private_entry(entry) ||
+	       is_device_exclusive_entry(entry);
 }
 
 struct page_vma_mapped_walk;
diff --git a/mm/hmm.c b/mm/hmm.c
index 11df3ca30b82..fad6be2bf072 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -26,6 +26,8 @@
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
+#include "internal.h"
+
 struct hmm_vma_walk {
 	struct hmm_range	*range;
 	unsigned long		last;
@@ -271,6 +273,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		if (!non_swap_entry(entry))
 			goto fault;
 
+		if (is_device_exclusive_entry(entry))
+			goto fault;
+
 		if (is_migration_entry(entry)) {
 			pte_unmap(ptep);
 			hmm_vma_walk->last = addr;
diff --git a/mm/memory.c b/mm/memory.c
index 75feddbf0190..747a01d495f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -699,6 +699,68 @@ out:
 }
 #endif
 
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+				  struct page *page, unsigned long address,
+				  pte_t *ptep)
+{
+	pte_t pte;
+	swp_entry_t entry;
+
+	pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+	if (pte_swp_soft_dirty(*ptep))
+		pte = pte_mksoft_dirty(pte);
+
+	entry = pte_to_swp_entry(*ptep);
+	if (pte_swp_uffd_wp(*ptep))
+		pte = pte_mkuffd_wp(pte);
+	else if (is_writable_device_exclusive_entry(entry))
+		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+	set_pte_at(vma->vm_mm, address, ptep, pte);
+
+	/*
+	 * No need to take a page reference as one was already
+	 * created when the swap entry was made.
+	 */
+	if (PageAnon(page))
+		page_add_anon_rmap(page, vma, address, false);
+	else
+		/*
+		 * Currently device exclusive access only supports anonymous
+		 * memory so the entry shouldn't point to a filebacked page.
+		 */
+		WARN_ON_ONCE(!PageAnon(page));
+
+	if (vma->vm_flags & VM_LOCKED)
+		mlock_vma_page(page);
+
+	/*
+	 * No need to invalidate - it was non-present before. However
+	 * secondary CPUs may have mappings that need invalidating.
+	 */
+	update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+			unsigned long addr)
+{
+	swp_entry_t entry = pte_to_swp_entry(*src_pte);
+	struct page *page = pfn_swap_entry_to_page(entry);
+
+	if (trylock_page(page)) {
+		restore_exclusive_pte(vma, page, addr, src_pte);
+		unlock_page(page);
+		return 0;
+	}
+
+	return -EBUSY;
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -780,6 +842,17 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				pte = pte_swp_mkuffd_wp(pte);
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
+	} else if (is_device_exclusive_entry(entry)) {
+		/*
+		 * Make device exclusive entries present by restoring the
+		 * original entry then copying as for a present pte. Device
+		 * exclusive entries currently only support private writable
+		 * (ie. COW) mappings.
+		 */
+		VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+		if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+			return -EBUSY;
+		return -ENOENT;
 	}
 	if (!userfaultfd_wp(dst_vma))
 		pte = pte_swp_clear_uffd_wp(pte);
@@ -980,9 +1053,18 @@ again:
 			if (ret == -EIO) {
 				entry = pte_to_swp_entry(*src_pte);
 				break;
+			} else if (ret == -EBUSY) {
+				break;
+			} else if (!ret) {
+				progress += 8;
+				continue;
 			}
-			progress += 8;
-			continue;
+
+			/*
+			 * Device exclusive entry restored, continue by copying
+			 * the now present pte.
+			 */
+			WARN_ON_ONCE(ret != -ENOENT);
 		}
 		/* copy_present_pte() will clear `*prealloc' if consumed */
 		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
@@ -1020,6 +1102,8 @@ again:
 			goto out;
 		}
 		entry.val = 0;
+	} else if (ret == -EBUSY) {
+		goto out;
 	} else if (ret ==  -EAGAIN) {
 		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
 		if (!prealloc)
@@ -1287,7 +1371,8 @@ again:
 		}
 
 		entry = pte_to_swp_entry(ptent);
-		if (is_device_private_entry(entry)) {
+		if (is_device_private_entry(entry) ||
+		    is_device_exclusive_entry(entry)) {
 			struct page *page = pfn_swap_entry_to_page(entry);
 
 			if (unlikely(details && details->check_mapping)) {
@@ -1303,7 +1388,10 @@ again:
 
 			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 			rss[mm_counter(page)]--;
-			page_remove_rmap(page, false);
+
+			if (is_device_private_entry(entry))
+				page_remove_rmap(page, false);
+
 			put_page(page);
 			continue;
 		}
@@ -3351,6 +3439,34 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
+/*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct vm_area_struct *vma = vmf->vma;
+	struct mmu_notifier_range range;
+
+	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+		return VM_FAULT_RETRY;
+	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+				vma->vm_mm, vmf->address & PAGE_MASK,
+				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+	mmu_notifier_invalidate_range_start(&range);
+
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+				&vmf->ptl);
+	if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+		restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	unlock_page(page);
+
+	mmu_notifier_invalidate_range_end(&range);
+	return 0;
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3379,6 +3495,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		if (is_migration_entry(entry)) {
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
 					     vmf->address);
+		} else if (is_device_exclusive_entry(entry)) {
+			vmf->page = pfn_swap_entry_to_page(entry);
+			ret = remove_device_exclusive_entry(vmf);
 		} else if (is_device_private_entry(entry)) {
 			vmf->page = pfn_swap_entry_to_page(entry);
 			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ee5961888e70..883e2cc85cad 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -165,6 +165,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_uffd_wp(oldpte))
 					newpte = pte_swp_mkuffd_wp(newpte);
+			} else if (is_writable_device_exclusive_entry(entry)) {
+				entry = make_readable_device_exclusive_entry(
+							swp_offset(entry));
+				newpte = swp_entry_to_pte(entry);
+				if (pte_swp_soft_dirty(oldpte))
+					newpte = pte_swp_mksoft_dirty(newpte);
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
 			} else {
 				newpte = oldpte;
 			}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 4df6093e759b..f7b331081791 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -41,7 +41,8 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
 
 				/* Handle un-addressable ZONE_DEVICE memory */
 				entry = pte_to_swp_entry(*pvmw->pte);
-				if (!is_device_private_entry(entry))
+				if (!is_device_private_entry(entry) &&
+				    !is_device_exclusive_entry(entry))
 					return false;
 			} else if (!pte_present(*pvmw->pte))
 				return false;
@@ -93,7 +94,8 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
 			return false;
 		entry = pte_to_swp_entry(*pvmw->pte);
 
-		if (!is_migration_entry(entry))
+		if (!is_migration_entry(entry) &&
+		    !is_device_exclusive_entry(entry))
 			return false;
 
 		pfn = swp_offset(entry);
@@ -102,7 +104,8 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
 
 		/* Handle un-addressable ZONE_DEVICE memory */
 		entry = pte_to_swp_entry(*pvmw->pte);
-		if (!is_device_private_entry(entry))
+		if (!is_device_private_entry(entry) &&
+		    !is_device_exclusive_entry(entry))
 			return false;
 
 		pfn = swp_offset(entry);
diff --git a/mm/rmap.c b/mm/rmap.c
index b922c7074cdd..37c24672125c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2028,6 +2028,192 @@ void page_mlock(struct page *page)
 	rmap_walk(page, &rwc);
 }
 
+#ifdef CONFIG_DEVICE_PRIVATE
+struct make_exclusive_args {
+	struct mm_struct *mm;
+	unsigned long address;
+	void *owner;
+	bool valid;
+};
+
+static bool page_make_device_exclusive_one(struct page *page,
+		struct vm_area_struct *vma, unsigned long address, void *priv)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page_vma_mapped_walk pvmw = {
+		.page = page,
+		.vma = vma,
+		.address = address,
+	};
+	struct make_exclusive_args *args = priv;
+	pte_t pteval;
+	struct page *subpage;
+	bool ret = true;
+	struct mmu_notifier_range range;
+	swp_entry_t entry;
+	pte_t swp_pte;
+
+	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+				      vma->vm_mm, address, min(vma->vm_end,
+				      address + page_size(page)), args->owner);
+	mmu_notifier_invalidate_range_start(&range);
+
+	while (page_vma_mapped_walk(&pvmw)) {
+		/* Unexpected PMD-mapped THP? */
+		VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+		if (!pte_present(*pvmw.pte)) {
+			ret = false;
+			page_vma_mapped_walk_done(&pvmw);
+			break;
+		}
+
+		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+		address = pvmw.address;
+
+		/* Nuke the page table entry. */
+		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+		pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+		/* Move the dirty bit to the page. Now the pte is gone. */
+		if (pte_dirty(pteval))
+			set_page_dirty(page);
+
+		/*
+		 * Check that our target page is still mapped at the expected
+		 * address.
+		 */
+		if (args->mm == mm && args->address == address &&
+		    pte_write(pteval))
+			args->valid = true;
+
+		/*
+		 * Store the pfn of the page in a special migration
+		 * pte. do_swap_page() will wait until the migration
+		 * pte is removed and then restart fault handling.
+		 */
+		if (pte_write(pteval))
+			entry = make_writable_device_exclusive_entry(
+							page_to_pfn(subpage));
+		else
+			entry = make_readable_device_exclusive_entry(
+							page_to_pfn(subpage));
+		swp_pte = swp_entry_to_pte(entry);
+		if (pte_soft_dirty(pteval))
+			swp_pte = pte_swp_mksoft_dirty(swp_pte);
+		if (pte_uffd_wp(pteval))
+			swp_pte = pte_swp_mkuffd_wp(swp_pte);
+
+		set_pte_at(mm, address, pvmw.pte, swp_pte);
+
+		/*
+		 * There is a reference on the page for the swap entry which has
+		 * been removed, so shouldn't take another.
+		 */
+		page_remove_rmap(subpage, false);
+	}
+
+	mmu_notifier_invalidate_range_end(&range);
+
+	return ret;
+}
+
+/**
+ * page_make_device_exclusive - mark the page exclusively owned by a device
+ * @page: the page to replace page table entries for
+ * @mm: the mm_struct where the page is expected to be mapped
+ * @address: address where the page is expected to be mapped
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special device exclusive swap entries to grant a device
+ * exclusive access to the page. Caller must hold the page lock.
+ *
+ * Returns false if the page is still mapped, or if it could not be unmapped
+ * from the expected address. Otherwise returns true (success).
+ */
+static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
+				unsigned long address, void *owner)
+{
+	struct make_exclusive_args args = {
+		.mm = mm,
+		.address = address,
+		.owner = owner,
+		.valid = false,
+	};
+	struct rmap_walk_control rwc = {
+		.rmap_one = page_make_device_exclusive_one,
+		.done = page_not_mapped,
+		.anon_lock = page_lock_anon_vma_read,
+		.arg = &args,
+	};
+
+	/*
+	 * Restrict to anonymous pages for now to avoid potential writeback
+	 * issues. Also tail pages shouldn't be passed to rmap_walk so skip
+	 * those.
+	 */
+	if (!PageAnon(page) || PageTail(page))
+		return false;
+
+	rmap_walk(page, &rwc);
+
+	return args.valid && !page_mapcount(page);
+}
+
+/**
+ * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * @mm: mm_struct of assoicated target process
+ * @start: start of the region to mark for exclusive device access
+ * @end: end address of region
+ * @pages: returns the pages which were successfully marked for exclusive access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ *
+ * Returns: number of pages found in the range by GUP. A page is marked for
+ * exclusive access only if the page pointer is non-NULL.
+ *
+ * This function finds ptes mapping page(s) to the given address range, locks
+ * them and replaces mappings with special swap entries preventing userspace CPU
+ * access. On fault these entries are replaced with the original mapping after
+ * calling MMU notifiers.
+ *
+ * A driver using this to program access from a device must use a mmu notifier
+ * critical section to hold a device specific lock during programming. Once
+ * programming is complete it should drop the page lock and reference after
+ * which point CPU access to the page will revoke the exclusive access.
+ */
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+				unsigned long end, struct page **pages,
+				void *owner)
+{
+	long npages = (end - start) >> PAGE_SHIFT;
+	long i;
+
+	npages = get_user_pages_remote(mm, start, npages,
+				       FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+				       pages, NULL, NULL);
+	if (npages < 0)
+		return npages;
+
+	for (i = 0; i < npages; i++, start += PAGE_SIZE) {
+		if (!trylock_page(pages[i])) {
+			put_page(pages[i]);
+			pages[i] = NULL;
+			continue;
+		}
+
+		if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
+			unlock_page(pages[i]);
+			put_page(pages[i]);
+			pages[i] = NULL;
+		}
+	}
+
+	return npages;
+}
+EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+#endif
+
 void __put_anon_vma(struct anon_vma *anon_vma)
 {
 	struct anon_vma *root = anon_vma->root;
-- 
cgit v1.2.3


From 070c46505a265d54eba7f713760fa6ed984f2921 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:54:56 -0700
Subject: drm: include only needed headers in ascii85.h

The ascii85.h is user of exactly two headers, i.e.  math.h and types.h.
There is no need to carry on entire kernel.h.

Link: https://lkml.kernel.org/r/20210611185915.44181-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ascii85.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ascii85.h b/include/linux/ascii85.h
index 4cc40201273e..83ad775ad0aa 100644
--- a/include/linux/ascii85.h
+++ b/include/linux/ascii85.h
@@ -8,7 +8,8 @@
 #ifndef _ASCII85_H_
 #define _ASCII85_H_
 
-#include <linux/kernel.h>
+#include <linux/math.h>
+#include <linux/types.h>
 
 #define ASCII85_BUFSZ 6
 
-- 
cgit v1.2.3


From f39650de687e35766572ac89dbcd16a5911e2f0a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:54:59 -0700
Subject: kernel.h: split out panic and oops helpers

kernel.h is being used as a dump for all kinds of stuff for a long time.
Here is the attempt to start cleaning it up by splitting out panic and
oops helpers.

There are several purposes of doing this:
- dropping dependency in bug.h
- dropping a loop by moving out panic_notifier.h
- unload kernel.h from something which has its own domain

At the same time convert users tree-wide to use new headers, although for
the time being include new header back to kernel.h to avoid twisted
indirected includes for existing users.

[akpm@linux-foundation.org: thread_info.h needs limits.h]
[andriy.shevchenko@linux.intel.com: ia64 fix]
  Link: https://lkml.kernel.org/r/20210520130557.55277-1-andriy.shevchenko@linux.intel.com

Link: https://lkml.kernel.org/r/20210511074137.33666-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Co-developed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Corey Minyard <cminyard@mvista.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Wei Liu <wei.liu@kernel.org>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Sebastian Reichel <sre@kernel.org>
Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Acked-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Acked-by: Helge Deller <deller@gmx.de> # parisc
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/setup.c                         |  2 +-
 arch/arm64/kernel/setup.c                         |  1 +
 arch/ia64/include/asm/pal.h                       |  1 +
 arch/mips/kernel/relocate.c                       |  1 +
 arch/mips/sgi-ip22/ip22-reset.c                   |  1 +
 arch/mips/sgi-ip32/ip32-reset.c                   |  1 +
 arch/parisc/kernel/pdc_chassis.c                  |  1 +
 arch/powerpc/kernel/setup-common.c                |  1 +
 arch/s390/kernel/ipl.c                            |  1 +
 arch/sparc/kernel/sstate.c                        |  1 +
 arch/um/drivers/mconsole_kern.c                   |  1 +
 arch/um/kernel/um_arch.c                          |  1 +
 arch/x86/include/asm/desc.h                       |  1 +
 arch/x86/kernel/cpu/mshyperv.c                    |  1 +
 arch/x86/kernel/setup.c                           |  1 +
 arch/x86/purgatory/purgatory.c                    |  2 +
 arch/x86/xen/enlighten.c                          |  1 +
 arch/xtensa/platforms/iss/setup.c                 |  1 +
 drivers/bus/brcmstb_gisb.c                        |  1 +
 drivers/char/ipmi/ipmi_msghandler.c               |  1 +
 drivers/clk/analogbits/wrpll-cln28hpc.c           |  4 +
 drivers/edac/altera_edac.c                        |  1 +
 drivers/firmware/google/gsmi.c                    |  1 +
 drivers/hv/vmbus_drv.c                            |  1 +
 drivers/hwtracing/coresight/coresight-cpu-debug.c |  1 +
 drivers/leds/trigger/ledtrig-activity.c           |  1 +
 drivers/leds/trigger/ledtrig-heartbeat.c          |  1 +
 drivers/leds/trigger/ledtrig-panic.c              |  1 +
 drivers/misc/bcm-vk/bcm_vk_dev.c                  |  1 +
 drivers/misc/ibmasm/heartbeat.c                   |  1 +
 drivers/misc/pvpanic/pvpanic.c                    |  1 +
 drivers/net/ipa/ipa_smp2p.c                       |  1 +
 drivers/parisc/power.c                            |  1 +
 drivers/power/reset/ltc2952-poweroff.c            |  1 +
 drivers/remoteproc/remoteproc_core.c              |  1 +
 drivers/s390/char/con3215.c                       |  1 +
 drivers/s390/char/con3270.c                       |  1 +
 drivers/s390/char/sclp.c                          |  1 +
 drivers/s390/char/sclp_con.c                      |  1 +
 drivers/s390/char/sclp_vt220.c                    |  1 +
 drivers/s390/char/zcore.c                         |  1 +
 drivers/soc/bcm/brcmstb/pm/pm-arm.c               |  1 +
 drivers/staging/olpc_dcon/olpc_dcon.c             |  1 +
 drivers/video/fbdev/hyperv_fb.c                   |  1 +
 include/asm-generic/bug.h                         |  3 +-
 include/linux/kernel.h                            | 84 +------------------
 include/linux/panic.h                             | 98 +++++++++++++++++++++++
 include/linux/panic_notifier.h                    | 12 +++
 include/linux/thread_info.h                       |  1 +
 kernel/hung_task.c                                |  1 +
 kernel/kexec_core.c                               |  1 +
 kernel/panic.c                                    |  1 +
 kernel/rcu/tree.c                                 |  2 +
 kernel/sysctl.c                                   |  1 +
 kernel/trace/trace.c                              |  1 +
 55 files changed, 169 insertions(+), 85 deletions(-)
 create mode 100644 include/linux/panic.h
 create mode 100644 include/linux/panic_notifier.h

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 5f6858e9dc28..7d56c217b235 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/ioport.h>
+#include <linux/panic_notifier.h>
 #include <linux/platform_device.h>
 #include <linux/memblock.h>
 #include <linux/pci.h>
@@ -46,7 +47,6 @@
 #include <linux/log2.h>
 #include <linux/export.h>
 
-extern struct atomic_notifier_head panic_notifier_list;
 static int alpha_panic_event(struct notifier_block *, unsigned long, void *);
 static struct notifier_block alpha_panic_block = {
 	alpha_panic_event,
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 61845c0821d9..787bc0f601b3 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -23,6 +23,7 @@
 #include <linux/interrupt.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
+#include <linux/panic_notifier.h>
 #include <linux/proc_fs.h>
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
diff --git a/arch/ia64/include/asm/pal.h b/arch/ia64/include/asm/pal.h
index 5c51fceedaf9..e6b652f9e45e 100644
--- a/arch/ia64/include/asm/pal.h
+++ b/arch/ia64/include/asm/pal.h
@@ -99,6 +99,7 @@
 
 #include <linux/types.h>
 #include <asm/fpu.h>
+#include <asm/intrinsics.h>
 
 /*
  * Data types needed to pass information into PAL procedures and
diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c
index 499a5357c09f..56b51de2dc51 100644
--- a/arch/mips/kernel/relocate.c
+++ b/arch/mips/kernel/relocate.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/libfdt.h>
 #include <linux/of_fdt.h>
+#include <linux/panic_notifier.h>
 #include <linux/sched/task.h>
 #include <linux/start_kernel.h>
 #include <linux/string.h>
diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c
index c374f3ceec38..9028dbbb45dd 100644
--- a/arch/mips/sgi-ip22/ip22-reset.c
+++ b/arch/mips/sgi-ip22/ip22-reset.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/sched/signal.h>
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include <linux/pm.h>
 #include <linux/timer.h>
 
diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c
index 20d8637340be..18d1c115cd53 100644
--- a/arch/mips/sgi-ip32/ip32-reset.c
+++ b/arch/mips/sgi-ip32/ip32-reset.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/panic_notifier.h>
 #include <linux/sched.h>
 #include <linux/sched/signal.h>
 #include <linux/notifier.h>
diff --git a/arch/parisc/kernel/pdc_chassis.c b/arch/parisc/kernel/pdc_chassis.c
index 75ae88d13909..da154406d368 100644
--- a/arch/parisc/kernel/pdc_chassis.c
+++ b/arch/parisc/kernel/pdc_chassis.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 #include <linux/notifier.h>
 #include <linux/cache.h>
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 74a98fff2c2f..046fe21b5c3b 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -9,6 +9,7 @@
 #undef DEBUG
 
 #include <linux/export.h>
+#include <linux/panic_notifier.h>
 #include <linux/string.h>
 #include <linux/sched.h>
 #include <linux/init.h>
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index dba04fbc37a2..36f870dc944f 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/delay.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 #include <linux/ctype.h>
 #include <linux/fs.h>
diff --git a/arch/sparc/kernel/sstate.c b/arch/sparc/kernel/sstate.c
index ac8677c3841e..3bcc4ddc6911 100644
--- a/arch/sparc/kernel/sstate.c
+++ b/arch/sparc/kernel/sstate.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 #include <linux/init.h>
 
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 6d00af25ec6b..328b16f99b30 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 #include <linux/sched/debug.h>
 #include <linux/proc_fs.h>
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 74e07e748a9b..9512253947d5 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/panic_notifier.h>
 #include <linux/seq_file.h>
 #include <linux/string.h>
 #include <linux/utsname.h>
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 476082a83d1c..ceb12683b6d1 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -9,6 +9,7 @@
 #include <asm/irq_vectors.h>
 #include <asm/cpu_entry_area.h>
 
+#include <linux/debug_locks.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
 
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 22f13343b5da..9e5c6f2b044d 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -17,6 +17,7 @@
 #include <linux/irq.h>
 #include <linux/kexec.h>
 #include <linux/i8253.h>
+#include <linux/panic_notifier.h>
 #include <linux/random.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1e720626069a..d21ba6fa39d9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -14,6 +14,7 @@
 #include <linux/initrd.h>
 #include <linux/iscsi_ibft.h>
 #include <linux/memblock.h>
+#include <linux/panic_notifier.h>
 #include <linux/pci.h>
 #include <linux/root_dev.h>
 #include <linux/hugetlb.h>
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index f03b64d9cb51..7558139920f8 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -9,6 +9,8 @@
  */
 
 #include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
 #include <crypto/sha2.h>
 #include <asm/purgatory.h>
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index aa9f50fccc5d..c79bd0af2e8c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -6,6 +6,7 @@
 #include <linux/cpu.h>
 #include <linux/kexec.h>
 #include <linux/slab.h>
+#include <linux/panic_notifier.h>
 
 #include <xen/xen.h>
 #include <xen/features.h>
diff --git a/arch/xtensa/platforms/iss/setup.c b/arch/xtensa/platforms/iss/setup.c
index ed519aee0ec8..d3433e1bb94e 100644
--- a/arch/xtensa/platforms/iss/setup.c
+++ b/arch/xtensa/platforms/iss/setup.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include <linux/printk.h>
 #include <linux/string.h>
 
diff --git a/drivers/bus/brcmstb_gisb.c b/drivers/bus/brcmstb_gisb.c
index 7355fa2cb439..6551286a60cc 100644
--- a/drivers/bus/brcmstb_gisb.c
+++ b/drivers/bus/brcmstb_gisb.c
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/module.h>
+#include <linux/panic_notifier.h>
 #include <linux/platform_device.h>
 #include <linux/interrupt.h>
 #include <linux/sysfs.h>
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 8a0e97b33cae..e96cb5c4f97a 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -16,6 +16,7 @@
 
 #include <linux/module.h>
 #include <linux/errno.h>
+#include <linux/panic_notifier.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
diff --git a/drivers/clk/analogbits/wrpll-cln28hpc.c b/drivers/clk/analogbits/wrpll-cln28hpc.c
index 776ead319ae9..7c64ea52a8d5 100644
--- a/drivers/clk/analogbits/wrpll-cln28hpc.c
+++ b/drivers/clk/analogbits/wrpll-cln28hpc.c
@@ -23,8 +23,12 @@
 
 #include <linux/bug.h>
 #include <linux/err.h>
+#include <linux/limits.h>
 #include <linux/log2.h>
 #include <linux/math64.h>
+#include <linux/math.h>
+#include <linux/minmax.h>
+
 #include <linux/clk/analogbits-wrpll-cln28hpc.h>
 
 /* MIN_INPUT_FREQ: minimum input clock frequency, in Hz (Fref_min) */
diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index 5f7fd79ec82f..61c21bd880a4 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -20,6 +20,7 @@
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
+#include <linux/panic_notifier.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/types.h>
diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c
index bb6e77ee3898..adaa492c3d2d 100644
--- a/drivers/firmware/google/gsmi.c
+++ b/drivers/firmware/google/gsmi.c
@@ -19,6 +19,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/panic_notifier.h>
 #include <linux/ioctl.h>
 #include <linux/acpi.h>
 #include <linux/io.h>
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 92cb3f7d21d9..57bbbaa4e8f7 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -25,6 +25,7 @@
 
 #include <linux/delay.h>
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include <linux/ptrace.h>
 #include <linux/screen_info.h>
 #include <linux/kdebug.h>
diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c
index 2dcf13de751f..9731d3a96073 100644
--- a/drivers/hwtracing/coresight/coresight-cpu-debug.c
+++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/panic_notifier.h>
 #include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
diff --git a/drivers/leds/trigger/ledtrig-activity.c b/drivers/leds/trigger/ledtrig-activity.c
index 14ba7faaed9e..30bc9df03636 100644
--- a/drivers/leds/trigger/ledtrig-activity.c
+++ b/drivers/leds/trigger/ledtrig-activity.c
@@ -11,6 +11,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/leds.h>
 #include <linux/module.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c
index 36b6709afe9f..7fe0a05574d2 100644
--- a/drivers/leds/trigger/ledtrig-heartbeat.c
+++ b/drivers/leds/trigger/ledtrig-heartbeat.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/panic_notifier.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/sched.h>
diff --git a/drivers/leds/trigger/ledtrig-panic.c b/drivers/leds/trigger/ledtrig-panic.c
index 5751cd032f9d..64abf2e91608 100644
--- a/drivers/leds/trigger/ledtrig-panic.c
+++ b/drivers/leds/trigger/ledtrig-panic.c
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include <linux/leds.h>
 #include "../leds.h"
 
diff --git a/drivers/misc/bcm-vk/bcm_vk_dev.c b/drivers/misc/bcm-vk/bcm_vk_dev.c
index 6bfea3210389..ad639ee85b2a 100644
--- a/drivers/misc/bcm-vk/bcm_vk_dev.c
+++ b/drivers/misc/bcm-vk/bcm_vk_dev.c
@@ -9,6 +9,7 @@
 #include <linux/fs.h>
 #include <linux/idr.h>
 #include <linux/interrupt.h>
+#include <linux/panic_notifier.h>
 #include <linux/kref.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
diff --git a/drivers/misc/ibmasm/heartbeat.c b/drivers/misc/ibmasm/heartbeat.c
index 4f5f3bdc814d..59c9a0d95659 100644
--- a/drivers/misc/ibmasm/heartbeat.c
+++ b/drivers/misc/ibmasm/heartbeat.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include "ibmasm.h"
 #include "dot_command.h"
 #include "lowlevel.h"
diff --git a/drivers/misc/pvpanic/pvpanic.c b/drivers/misc/pvpanic/pvpanic.c
index 65f70a4da8c0..793ea0c01193 100644
--- a/drivers/misc/pvpanic/pvpanic.c
+++ b/drivers/misc/pvpanic/pvpanic.c
@@ -13,6 +13,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/panic_notifier.h>
 #include <linux/types.h>
 #include <linux/cdev.h>
 #include <linux/list.h>
diff --git a/drivers/net/ipa/ipa_smp2p.c b/drivers/net/ipa/ipa_smp2p.c
index a5f7a79a1923..34b68dc43886 100644
--- a/drivers/net/ipa/ipa_smp2p.c
+++ b/drivers/net/ipa/ipa_smp2p.c
@@ -8,6 +8,7 @@
 #include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include <linux/soc/qcom/smem.h>
 #include <linux/soc/qcom/smem_state.h>
 
diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c
index ebaf6867b457..456776bd8ee6 100644
--- a/drivers/parisc/power.c
+++ b/drivers/parisc/power.c
@@ -38,6 +38,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/notifier.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 #include <linux/sched/signal.h>
 #include <linux/kthread.h>
diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c
index d1495af30081..8688c8ba8894 100644
--- a/drivers/power/reset/ltc2952-poweroff.c
+++ b/drivers/power/reset/ltc2952-poweroff.c
@@ -52,6 +52,7 @@
 #include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/module.h>
+#include <linux/panic_notifier.h>
 #include <linux/mod_devicetable.h>
 #include <linux/gpio/consumer.h>
 #include <linux/reboot.h>
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 626a6b90fba2..76dd8e2b1e7e 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/panic_notifier.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/dma-map-ops.h>
diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index 1fd5bca9fa20..02523f4e29f4 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -19,6 +19,7 @@
 #include <linux/console.h>
 #include <linux/interrupt.h>
 #include <linux/err.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 #include <linux/serial.h> /* ASYNC_* flags */
 #include <linux/slab.h>
diff --git a/drivers/s390/char/con3270.c b/drivers/s390/char/con3270.c
index e21962c0fd94..87cdbace1453 100644
--- a/drivers/s390/char/con3270.c
+++ b/drivers/s390/char/con3270.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/list.h>
+#include <linux/panic_notifier.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/err.h>
diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
index 986bbbc23d0a..6627820a5eb9 100644
--- a/drivers/s390/char/sclp.c
+++ b/drivers/s390/char/sclp.c
@@ -11,6 +11,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/err.h>
+#include <linux/panic_notifier.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/timer.h>
diff --git a/drivers/s390/char/sclp_con.c b/drivers/s390/char/sclp_con.c
index 9b852a47ccc1..cc01a7b8595d 100644
--- a/drivers/s390/char/sclp_con.c
+++ b/drivers/s390/char/sclp_con.c
@@ -10,6 +10,7 @@
 #include <linux/kmod.h>
 #include <linux/console.h>
 #include <linux/init.h>
+#include <linux/panic_notifier.h>
 #include <linux/timer.h>
 #include <linux/jiffies.h>
 #include <linux/termios.h>
diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
index 7f4445b0f819..5b8a7b090a97 100644
--- a/drivers/s390/char/sclp_vt220.c
+++ b/drivers/s390/char/sclp_vt220.c
@@ -9,6 +9,7 @@
 
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/panic_notifier.h>
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/timer.h>
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index bd3c724bf695..b5b0848da93b 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/debugfs.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 
 #include <asm/asm-offsets.h>
diff --git a/drivers/soc/bcm/brcmstb/pm/pm-arm.c b/drivers/soc/bcm/brcmstb/pm/pm-arm.c
index a673fdffe216..3cbb165d6e30 100644
--- a/drivers/soc/bcm/brcmstb/pm/pm-arm.c
+++ b/drivers/soc/bcm/brcmstb/pm/pm-arm.c
@@ -28,6 +28,7 @@
 #include <linux/notifier.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/panic_notifier.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
 #include <linux/printk.h>
diff --git a/drivers/staging/olpc_dcon/olpc_dcon.c b/drivers/staging/olpc_dcon/olpc_dcon.c
index 6d8e9a481786..7284cb4ac395 100644
--- a/drivers/staging/olpc_dcon/olpc_dcon.c
+++ b/drivers/staging/olpc_dcon/olpc_dcon.c
@@ -22,6 +22,7 @@
 #include <linux/device.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
+#include <linux/panic_notifier.h>
 #include <linux/reboot.h>
 #include <linux/olpc-ec.h>
 #include <asm/tsc.h>
diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index a7e6eea2c4a1..23999df52739 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -52,6 +52,7 @@
 #include <linux/completion.h>
 #include <linux/fb.h>
 #include <linux/pci.h>
+#include <linux/panic_notifier.h>
 #include <linux/efi.h>
 #include <linux/console.h>
 
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index b402494883b6..f152b9bb916f 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -17,7 +17,8 @@
 #endif
 
 #ifndef __ASSEMBLY__
-#include <linux/kernel.h>
+#include <linux/panic.h>
+#include <linux/printk.h>
 
 #ifdef CONFIG_BUG
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index bf950621febf..baea2eb763d0 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -14,6 +14,7 @@
 #include <linux/math.h>
 #include <linux/minmax.h>
 #include <linux/typecheck.h>
+#include <linux/panic.h>
 #include <linux/printk.h>
 #include <linux/build_bug.h>
 #include <linux/static_call_types.h>
@@ -72,7 +73,6 @@
 #define lower_32_bits(n) ((u32)((n) & 0xffffffff))
 
 struct completion;
-struct pt_regs;
 struct user;
 
 #ifdef CONFIG_PREEMPT_VOLUNTARY
@@ -177,14 +177,6 @@ void __might_fault(const char *file, int line);
 static inline void might_fault(void) { }
 #endif
 
-extern struct atomic_notifier_head panic_notifier_list;
-extern long (*panic_blink)(int state);
-__printf(1, 2)
-void panic(const char *fmt, ...) __noreturn __cold;
-void nmi_panic(struct pt_regs *regs, const char *msg);
-extern void oops_enter(void);
-extern void oops_exit(void);
-extern bool oops_may_print(void);
 void do_exit(long error_code) __noreturn;
 void complete_and_exit(struct completion *, long) __noreturn;
 
@@ -372,52 +364,8 @@ extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
 extern int func_ptr_is_kernel_text(void *ptr);
 
-#ifdef CONFIG_SMP
-extern unsigned int sysctl_oops_all_cpu_backtrace;
-#else
-#define sysctl_oops_all_cpu_backtrace 0
-#endif /* CONFIG_SMP */
-
 extern void bust_spinlocks(int yes);
-extern int panic_timeout;
-extern unsigned long panic_print;
-extern int panic_on_oops;
-extern int panic_on_unrecovered_nmi;
-extern int panic_on_io_nmi;
-extern int panic_on_warn;
-extern unsigned long panic_on_taint;
-extern bool panic_on_taint_nousertaint;
-extern int sysctl_panic_on_rcu_stall;
-extern int sysctl_max_rcu_stall_to_panic;
-extern int sysctl_panic_on_stackoverflow;
-
-extern bool crash_kexec_post_notifiers;
 
-/*
- * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
- * holds a CPU number which is executing panic() currently. A value of
- * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
- */
-extern atomic_t panic_cpu;
-#define PANIC_CPU_INVALID	-1
-
-/*
- * Only to be used by arch init code. If the user over-wrote the default
- * CONFIG_PANIC_TIMEOUT, honor it.
- */
-static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
-{
-	if (panic_timeout == arch_default_timeout)
-		panic_timeout = timeout;
-}
-extern const char *print_tainted(void);
-enum lockdep_ok {
-	LOCKDEP_STILL_OK,
-	LOCKDEP_NOW_UNRELIABLE
-};
-extern void add_taint(unsigned flag, enum lockdep_ok);
-extern int test_taint(unsigned flag);
-extern unsigned long get_taint(void);
 extern int root_mountflags;
 
 extern bool early_boot_irqs_disabled;
@@ -436,36 +384,6 @@ extern enum system_states {
 	SYSTEM_SUSPEND,
 } system_state;
 
-/* This cannot be an enum because some may be used in assembly source. */
-#define TAINT_PROPRIETARY_MODULE	0
-#define TAINT_FORCED_MODULE		1
-#define TAINT_CPU_OUT_OF_SPEC		2
-#define TAINT_FORCED_RMMOD		3
-#define TAINT_MACHINE_CHECK		4
-#define TAINT_BAD_PAGE			5
-#define TAINT_USER			6
-#define TAINT_DIE			7
-#define TAINT_OVERRIDDEN_ACPI_TABLE	8
-#define TAINT_WARN			9
-#define TAINT_CRAP			10
-#define TAINT_FIRMWARE_WORKAROUND	11
-#define TAINT_OOT_MODULE		12
-#define TAINT_UNSIGNED_MODULE		13
-#define TAINT_SOFTLOCKUP		14
-#define TAINT_LIVEPATCH			15
-#define TAINT_AUX			16
-#define TAINT_RANDSTRUCT		17
-#define TAINT_FLAGS_COUNT		18
-#define TAINT_FLAGS_MAX			((1UL << TAINT_FLAGS_COUNT) - 1)
-
-struct taint_flag {
-	char c_true;	/* character printed when tainted */
-	char c_false;	/* character printed when not tainted */
-	bool module;	/* also show as a per-module taint flag */
-};
-
-extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT];
-
 extern const char hex_asc[];
 #define hex_asc_lo(x)	hex_asc[((x) & 0x0f)]
 #define hex_asc_hi(x)	hex_asc[((x) & 0xf0) >> 4]
diff --git a/include/linux/panic.h b/include/linux/panic.h
new file mode 100644
index 000000000000..f5844908a089
--- /dev/null
+++ b/include/linux/panic.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PANIC_H
+#define _LINUX_PANIC_H
+
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
+
+struct pt_regs;
+
+extern long (*panic_blink)(int state);
+__printf(1, 2)
+void panic(const char *fmt, ...) __noreturn __cold;
+void nmi_panic(struct pt_regs *regs, const char *msg);
+extern void oops_enter(void);
+extern void oops_exit(void);
+extern bool oops_may_print(void);
+
+#ifdef CONFIG_SMP
+extern unsigned int sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
+#endif /* CONFIG_SMP */
+
+extern int panic_timeout;
+extern unsigned long panic_print;
+extern int panic_on_oops;
+extern int panic_on_unrecovered_nmi;
+extern int panic_on_io_nmi;
+extern int panic_on_warn;
+
+extern unsigned long panic_on_taint;
+extern bool panic_on_taint_nousertaint;
+
+extern int sysctl_panic_on_rcu_stall;
+extern int sysctl_max_rcu_stall_to_panic;
+extern int sysctl_panic_on_stackoverflow;
+
+extern bool crash_kexec_post_notifiers;
+
+/*
+ * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
+ * holds a CPU number which is executing panic() currently. A value of
+ * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
+ */
+extern atomic_t panic_cpu;
+#define PANIC_CPU_INVALID	-1
+
+/*
+ * Only to be used by arch init code. If the user over-wrote the default
+ * CONFIG_PANIC_TIMEOUT, honor it.
+ */
+static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
+{
+	if (panic_timeout == arch_default_timeout)
+		panic_timeout = timeout;
+}
+
+/* This cannot be an enum because some may be used in assembly source. */
+#define TAINT_PROPRIETARY_MODULE	0
+#define TAINT_FORCED_MODULE		1
+#define TAINT_CPU_OUT_OF_SPEC		2
+#define TAINT_FORCED_RMMOD		3
+#define TAINT_MACHINE_CHECK		4
+#define TAINT_BAD_PAGE			5
+#define TAINT_USER			6
+#define TAINT_DIE			7
+#define TAINT_OVERRIDDEN_ACPI_TABLE	8
+#define TAINT_WARN			9
+#define TAINT_CRAP			10
+#define TAINT_FIRMWARE_WORKAROUND	11
+#define TAINT_OOT_MODULE		12
+#define TAINT_UNSIGNED_MODULE		13
+#define TAINT_SOFTLOCKUP		14
+#define TAINT_LIVEPATCH			15
+#define TAINT_AUX			16
+#define TAINT_RANDSTRUCT		17
+#define TAINT_FLAGS_COUNT		18
+#define TAINT_FLAGS_MAX			((1UL << TAINT_FLAGS_COUNT) - 1)
+
+struct taint_flag {
+	char c_true;	/* character printed when tainted */
+	char c_false;	/* character printed when not tainted */
+	bool module;	/* also show as a per-module taint flag */
+};
+
+extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT];
+
+enum lockdep_ok {
+	LOCKDEP_STILL_OK,
+	LOCKDEP_NOW_UNRELIABLE,
+};
+
+extern const char *print_tainted(void);
+extern void add_taint(unsigned flag, enum lockdep_ok);
+extern int test_taint(unsigned flag);
+extern unsigned long get_taint(void);
+
+#endif	/* _LINUX_PANIC_H */
diff --git a/include/linux/panic_notifier.h b/include/linux/panic_notifier.h
new file mode 100644
index 000000000000..41e32483d7a7
--- /dev/null
+++ b/include/linux/panic_notifier.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PANIC_NOTIFIERS_H
+#define _LINUX_PANIC_NOTIFIERS_H
+
+#include <linux/notifier.h>
+#include <linux/types.h>
+
+extern struct atomic_notifier_head panic_notifier_list;
+
+extern bool crash_kexec_post_notifiers;
+
+#endif	/* _LINUX_PANIC_NOTIFIERS_H */
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 157762db9d4b..0999f6317978 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -9,6 +9,7 @@
 #define _LINUX_THREAD_INFO_H
 
 #include <linux/types.h>
+#include <linux/limits.h>
 #include <linux/bug.h>
 #include <linux/restart_block.h>
 #include <linux/errno.h>
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 396ebaebea3f..ebc641664eb0 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
 #include <linux/kthread.h>
 #include <linux/lockdep.h>
 #include <linux/export.h>
+#include <linux/panic_notifier.h>
 #include <linux/sysctl.h>
 #include <linux/suspend.h>
 #include <linux/utsname.h>
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index f099baee3578..4b34a9aa32bc 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -26,6 +26,7 @@
 #include <linux/suspend.h>
 #include <linux/device.h>
 #include <linux/freezer.h>
+#include <linux/panic_notifier.h>
 #include <linux/pm.h>
 #include <linux/cpu.h>
 #include <linux/uaccess.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index 332736a72a58..edad89660a2b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,7 @@
 #include <linux/reboot.h>
 #include <linux/delay.h>
 #include <linux/kexec.h>
+#include <linux/panic_notifier.h>
 #include <linux/sched.h>
 #include <linux/sysrq.h>
 #include <linux/init.h>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e78b2430c16..f12056beb916 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -32,6 +32,8 @@
 #include <linux/export.h>
 #include <linux/completion.h>
 #include <linux/moduleparam.h>
+#include <linux/panic.h>
+#include <linux/panic_notifier.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bb37739f5731..70e286c83d6b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
 #include <linux/sysctl.h>
 #include <linux/bitmap.h>
 #include <linux/signal.h>
+#include <linux/panic.h>
 #include <linux/printk.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d23a09d3eb37..3c1384bc5c5a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
+#include <linux/panic_notifier.h>
 #include <linux/poll.h>
 #include <linux/nmi.h>
 #include <linux/fs.h>
-- 
cgit v1.2.3


From 994b69703e86ed0ab2228fc606761a3b08d48af3 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:55:05 -0700
Subject: lib/string_helpers: switch to use BIT() macro

Patch series "lib/string_helpers: get rid of ugly *_escape_mem_ascii()", v3.

Get rid of ugly *_escape_mem_ascii() API since it's not flexible and has
the only single user.  Provide better approach based on usage of the
string_escape_mem() with appropriate flags.

Test cases has been expanded accordingly to cover new functionality.

This patch (of 15):

Switch to use BIT() macro for flag definitions.  No changes implied.

Link: https://lkml.kernel.org/r/20210504180819.73127-1-andriy.shevchenko@linux.intel.com
Link: https://lkml.kernel.org/r/20210504180819.73127-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string_helpers.h | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index fa06dcdc481e..bf01e24edd89 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_STRING_HELPERS_H_
 #define _LINUX_STRING_HELPERS_H_
 
+#include <linux/bits.h>
 #include <linux/ctype.h>
 #include <linux/types.h>
 
@@ -18,10 +19,10 @@ enum string_size_units {
 void string_get_size(u64 size, u64 blk_size, enum string_size_units units,
 		     char *buf, int len);
 
-#define UNESCAPE_SPACE		0x01
-#define UNESCAPE_OCTAL		0x02
-#define UNESCAPE_HEX		0x04
-#define UNESCAPE_SPECIAL	0x08
+#define UNESCAPE_SPACE		BIT(0)
+#define UNESCAPE_OCTAL		BIT(1)
+#define UNESCAPE_HEX		BIT(2)
+#define UNESCAPE_SPECIAL	BIT(3)
 #define UNESCAPE_ANY		\
 	(UNESCAPE_SPACE | UNESCAPE_OCTAL | UNESCAPE_HEX | UNESCAPE_SPECIAL)
 
@@ -42,15 +43,15 @@ static inline int string_unescape_any_inplace(char *buf)
 	return string_unescape_any(buf, buf, 0);
 }
 
-#define ESCAPE_SPACE		0x01
-#define ESCAPE_SPECIAL		0x02
-#define ESCAPE_NULL		0x04
-#define ESCAPE_OCTAL		0x08
+#define ESCAPE_SPACE		BIT(0)
+#define ESCAPE_SPECIAL		BIT(1)
+#define ESCAPE_NULL		BIT(2)
+#define ESCAPE_OCTAL		BIT(3)
 #define ESCAPE_ANY		\
 	(ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_SPECIAL | ESCAPE_NULL)
-#define ESCAPE_NP		0x10
+#define ESCAPE_NP		BIT(4)
 #define ESCAPE_ANY_NP		(ESCAPE_ANY | ESCAPE_NP)
-#define ESCAPE_HEX		0x20
+#define ESCAPE_HEX		BIT(5)
 
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		unsigned int flags, const char *only);
-- 
cgit v1.2.3


From a0809783355cfe1cc1b2fa7f881c3a79df0b2a27 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:55:14 -0700
Subject: lib/string_helpers: introduce ESCAPE_NA for escaping non-ASCII

Some users may want to have an ASCII based filter, provided by isascii()
function.  Here is the addition of a such.

Link: https://lkml.kernel.org/r/20210504180819.73127-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string_helpers.h |  1 +
 lib/string_helpers.c           | 21 +++++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index bf01e24edd89..d6cf6fe10f74 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -52,6 +52,7 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_NP		BIT(4)
 #define ESCAPE_ANY_NP		(ESCAPE_ANY | ESCAPE_NP)
 #define ESCAPE_HEX		BIT(5)
+#define ESCAPE_NA		BIT(6)
 
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		unsigned int flags, const char *only);
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index e3ef9f86cc34..a963404b8c16 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -454,8 +454,8 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *
  *	1. The character is not matched to the one from @only string and thus
  *	   must go as-is to the output.
- *	2. The character is matched to the printable class, if asked, and in
- *	   case of match it passes through to the output.
+ *	2. The character is matched to the printable or ASCII class, if asked,
+ *	   and in case of match it passes through to the output.
  *	3. The character is checked if it falls into the class given by @flags.
  *	   %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
  *	   character. Note that they actually can't go together, otherwise
@@ -463,7 +463,7 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *
  * Caller must provide valid source and destination pointers. Be aware that
  * destination buffer will not be NULL-terminated, thus caller have to append
- * it if needs.   The supported flags are::
+ * it if needs. The supported flags are::
  *
  *	%ESCAPE_SPACE: (special white space, not space itself)
  *		'\f' - form feed
@@ -482,11 +482,18 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *	%ESCAPE_ANY:
  *		all previous together
  *	%ESCAPE_NP:
- *		escape only non-printable characters (checked by isprint)
+ *		escape only non-printable characters, checked by isprint()
  *	%ESCAPE_ANY_NP:
  *		all previous together
  *	%ESCAPE_HEX:
  *		'\xHH' - byte with hexadecimal value HH (2 digits)
+ *	%ESCAPE_NA:
+ *		escape only non-ascii characters, checked by isascii()
+ *
+ * One notable caveat, the %ESCAPE_NP and %ESCAPE_NA have higher priority
+ * than the rest of the flags (%ESCAPE_NP is higher than %ESCAPE_NA).
+ * It doesn't make much sense to use either of them without %ESCAPE_OCTAL
+ * or %ESCAPE_HEX, because they cover most of the other character classes.
  *
  * Return:
  * The total size of the escaped output that would be generated for
@@ -510,6 +517,8 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		 *	  character under question
 		 *	- the character is printable, when @flags has
 		 *	  %ESCAPE_NP bit set
+		 *	- the character is ASCII, when @flags has
+		 *	  %ESCAPE_NA bit set
 		 *	- the character doesn't fall into a class of symbols
 		 *	  defined by given @flags
 		 * In these cases we just pass through a character to the
@@ -523,6 +532,10 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		    flags & ESCAPE_NP && escape_passthrough(c, &p, end))
 			continue;
 
+		if (isascii(c) &&
+		    flags & ESCAPE_NA && escape_passthrough(c, &p, end))
+			continue;
+
 		if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
 			continue;
 
-- 
cgit v1.2.3


From 0362c27fb373ea04eace9e7a70e61036ab81f09f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:55:17 -0700
Subject: lib/string_helpers: introduce ESCAPE_NAP to escape non-ASCII and
 non-printable

Some users may want to have an ASCII based filter for printable only
characters, provided by conjunction of isascii() and isprint() functions.

Here is the addition of a such.

Link: https://lkml.kernel.org/r/20210504180819.73127-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string_helpers.h |  1 +
 lib/string_helpers.c           | 20 ++++++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index d6cf6fe10f74..811c6a627620 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -53,6 +53,7 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_ANY_NP		(ESCAPE_ANY | ESCAPE_NP)
 #define ESCAPE_HEX		BIT(5)
 #define ESCAPE_NA		BIT(6)
+#define ESCAPE_NAP		BIT(7)
 
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		unsigned int flags, const char *only);
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index a963404b8c16..ceca5fbbd92c 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -454,9 +454,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *
  *	1. The character is not matched to the one from @only string and thus
  *	   must go as-is to the output.
- *	2. The character is matched to the printable or ASCII class, if asked,
+ *	2. The character is matched to the printable and ASCII classes, if asked,
  *	   and in case of match it passes through to the output.
- *	3. The character is checked if it falls into the class given by @flags.
+ *	3. The character is matched to the printable or ASCII class, if asked,
+ *	   and in case of match it passes through to the output.
+ *	4. The character is checked if it falls into the class given by @flags.
  *	   %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
  *	   character. Note that they actually can't go together, otherwise
  *	   %ESCAPE_HEX will be ignored.
@@ -489,11 +491,15 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *		'\xHH' - byte with hexadecimal value HH (2 digits)
  *	%ESCAPE_NA:
  *		escape only non-ascii characters, checked by isascii()
+ *	%ESCAPE_NAP:
+ *		escape only non-printable or non-ascii characters
  *
- * One notable caveat, the %ESCAPE_NP and %ESCAPE_NA have higher priority
- * than the rest of the flags (%ESCAPE_NP is higher than %ESCAPE_NA).
+ * One notable caveat, the %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have the
+ * higher priority than the rest of the flags (%ESCAPE_NAP is the highest).
  * It doesn't make much sense to use either of them without %ESCAPE_OCTAL
  * or %ESCAPE_HEX, because they cover most of the other character classes.
+ * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to
+ * the above.
  *
  * Return:
  * The total size of the escaped output that would be generated for
@@ -515,6 +521,8 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		 * Apply rules in the following sequence:
 		 *	- the @only string is supplied and does not contain a
 		 *	  character under question
+		 *	- the character is printable and ASCII, when @flags has
+		 *	  %ESCAPE_NAP bit set
 		 *	- the character is printable, when @flags has
 		 *	  %ESCAPE_NP bit set
 		 *	- the character is ASCII, when @flags has
@@ -528,6 +536,10 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 					  escape_passthrough(c, &p, end))
 			continue;
 
+		if (isascii(c) && isprint(c) &&
+		    flags & ESCAPE_NAP && escape_passthrough(c, &p, end))
+			continue;
+
 		if (isprint(c) &&
 		    flags & ESCAPE_NP && escape_passthrough(c, &p, end))
 			continue;
-- 
cgit v1.2.3


From aec0d0966f20d131cc4ff6927b02d448a478a6d4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:55:20 -0700
Subject: lib/string_helpers: allow to append additional characters to be
 escaped

Introduce a new flag to append additional characters, passed in 'only'
parameter, to be escaped if they fall in the corresponding class.

Link: https://lkml.kernel.org/r/20210504180819.73127-7-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string_helpers.h |  1 +
 lib/string_helpers.c           | 19 +++++++++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 811c6a627620..f8728ed4d563 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -54,6 +54,7 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_HEX		BIT(5)
 #define ESCAPE_NA		BIT(6)
 #define ESCAPE_NAP		BIT(7)
+#define ESCAPE_APPEND		BIT(8)
 
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		unsigned int flags, const char *only);
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index ceca5fbbd92c..c15aea7a82e9 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -493,6 +493,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *		escape only non-ascii characters, checked by isascii()
  *	%ESCAPE_NAP:
  *		escape only non-printable or non-ascii characters
+ *	%ESCAPE_APPEND:
+ *		append characters from @only to be escaped by the given classes
+ *
+ * %ESCAPE_APPEND would help to pass additional characters to the escaped, when
+ * one of %ESCAPE_NP, %ESCAPE_NA, or %ESCAPE_NAP is provided.
  *
  * One notable caveat, the %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have the
  * higher priority than the rest of the flags (%ESCAPE_NAP is the highest).
@@ -513,9 +518,11 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 	char *p = dst;
 	char *end = p + osz;
 	bool is_dict = only && *only;
+	bool is_append = flags & ESCAPE_APPEND;
 
 	while (isz--) {
 		unsigned char c = *src++;
+		bool in_dict = is_dict && strchr(only, c);
 
 		/*
 		 * Apply rules in the following sequence:
@@ -531,20 +538,24 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		 *	  defined by given @flags
 		 * In these cases we just pass through a character to the
 		 * output buffer.
+		 *
+		 * When %ESCAPE_APPEND is passed, the characters from @only
+		 * have been excluded from the %ESCAPE_NAP, %ESCAPE_NP, and
+		 * %ESCAPE_NA cases.
 		 */
-		if (is_dict && !strchr(only, c) &&
+		if (!(is_append || in_dict) && is_dict &&
 					  escape_passthrough(c, &p, end))
 			continue;
 
-		if (isascii(c) && isprint(c) &&
+		if (!(is_append && in_dict) && isascii(c) && isprint(c) &&
 		    flags & ESCAPE_NAP && escape_passthrough(c, &p, end))
 			continue;
 
-		if (isprint(c) &&
+		if (!(is_append && in_dict) && isprint(c) &&
 		    flags & ESCAPE_NP && escape_passthrough(c, &p, end))
 			continue;
 
-		if (isascii(c) &&
+		if (!(is_append && in_dict) && isascii(c) &&
 		    flags & ESCAPE_NA && escape_passthrough(c, &p, end))
 			continue;
 
-- 
cgit v1.2.3


From 259fa5d7d825122c30ad4122c6a1cc937eb74c2d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:55:29 -0700
Subject: lib/test-string_helpers: add test cases for new features

We have got new flags and hence new features of string_escape_mem().
Add test cases for that.

Link: https://lkml.kernel.org/r/20210504180819.73127-10-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string_helpers.h |   4 ++
 lib/test-string_helpers.c      | 141 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 137 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index f8728ed4d563..9b0eca2badf2 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -26,6 +26,8 @@ void string_get_size(u64 size, u64 blk_size, enum string_size_units units,
 #define UNESCAPE_ANY		\
 	(UNESCAPE_SPACE | UNESCAPE_OCTAL | UNESCAPE_HEX | UNESCAPE_SPECIAL)
 
+#define UNESCAPE_ALL_MASK	GENMASK(3, 0)
+
 int string_unescape(char *src, char *dst, size_t size, unsigned int flags);
 
 static inline int string_unescape_inplace(char *buf, unsigned int flags)
@@ -56,6 +58,8 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_NAP		BIT(7)
 #define ESCAPE_APPEND		BIT(8)
 
+#define ESCAPE_ALL_MASK		GENMASK(8, 0)
+
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		unsigned int flags, const char *only);
 
diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c
index 3e2def9ccfac..2185d71704f0 100644
--- a/lib/test-string_helpers.c
+++ b/lib/test-string_helpers.c
@@ -202,11 +202,25 @@ static const struct test_string_2 escape0[] __initconst = {{
 	},{
 		/* terminator */
 	}}
+},{
+	.in = "\007 \eb\"\x90\xCF\r",
+	.s1 = {{
+		.out = "\007 \eb\"\\220\\317\r",
+		.flags = ESCAPE_OCTAL | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\\x90\\xcf\r",
+		.flags = ESCAPE_HEX | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\xCF\r",
+		.flags = ESCAPE_NA,
+	},{
+		/* terminator */
+	}}
 },{
 	/* terminator */
 }};
 
-#define	TEST_STRING_2_DICT_1		"b\\ \t\r"
+#define	TEST_STRING_2_DICT_1		"b\\ \t\r\xCF"
 static const struct test_string_2 escape1[] __initconst = {{
 	.in = "\f\\ \n\r\t\v",
 	.s1 = {{
@@ -215,14 +229,38 @@ static const struct test_string_2 escape1[] __initconst = {{
 	},{
 		.out = "\f\\x5c\\x20\n\\x0d\\x09\v",
 		.flags = ESCAPE_HEX,
+	},{
+		.out = "\f\\134\\040\n\\015\\011\v",
+		.flags = ESCAPE_ANY | ESCAPE_APPEND,
+	},{
+		.out = "\\014\\134\\040\\012\\015\\011\\013",
+		.flags = ESCAPE_OCTAL | ESCAPE_APPEND | ESCAPE_NAP,
+	},{
+		.out = "\\x0c\\x5c\\x20\\x0a\\x0d\\x09\\x0b",
+		.flags = ESCAPE_HEX | ESCAPE_APPEND | ESCAPE_NAP,
+	},{
+		.out = "\f\\134\\040\n\\015\\011\v",
+		.flags = ESCAPE_OCTAL | ESCAPE_APPEND | ESCAPE_NA,
+	},{
+		.out = "\f\\x5c\\x20\n\\x0d\\x09\v",
+		.flags = ESCAPE_HEX | ESCAPE_APPEND | ESCAPE_NA,
 	},{
 		/* terminator */
 	}}
 },{
-	.in = "\\h\\\"\a\e\\",
+	.in = "\\h\\\"\a\xCF\e\\",
 	.s1 = {{
-		.out = "\\134h\\134\"\a\e\\134",
+		.out = "\\134h\\134\"\a\\317\e\\134",
 		.flags = ESCAPE_OCTAL,
+	},{
+		.out = "\\134h\\134\"\a\\317\e\\134",
+		.flags = ESCAPE_ANY | ESCAPE_APPEND,
+	},{
+		.out = "\\134h\\134\"\\007\\317\\033\\134",
+		.flags = ESCAPE_OCTAL | ESCAPE_APPEND | ESCAPE_NAP,
+	},{
+		.out = "\\134h\\134\"\a\\317\e\\134",
+		.flags = ESCAPE_OCTAL | ESCAPE_APPEND | ESCAPE_NA,
 	},{
 		/* terminator */
 	}}
@@ -234,6 +272,88 @@ static const struct test_string_2 escape1[] __initconst = {{
 	},{
 		/* terminator */
 	}}
+},{
+	.in = "\007 \eb\"\x90\xCF\r",
+	.s1 = {{
+		.out = "\007 \eb\"\x90\xCF\r",
+		.flags = ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\xCF\r",
+		.flags = ESCAPE_SPACE | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\xCF\r",
+		.flags = ESCAPE_SPECIAL | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\xCF\r",
+		.flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\\317\r",
+		.flags = ESCAPE_OCTAL | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\\317\r",
+		.flags = ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\\317\r",
+		.flags = ESCAPE_SPECIAL | ESCAPE_OCTAL | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\\317\r",
+		.flags = ESCAPE_ANY | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\\xcf\r",
+		.flags = ESCAPE_HEX | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\\xcf\r",
+		.flags = ESCAPE_SPACE | ESCAPE_HEX | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\\xcf\r",
+		.flags = ESCAPE_SPECIAL | ESCAPE_HEX | ESCAPE_NA,
+	},{
+		.out = "\007 \eb\"\x90\\xcf\r",
+		.flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_HEX | ESCAPE_NA,
+	},{
+		/* terminator */
+	}}
+},{
+	.in = "\007 \eb\"\x90\xCF\r",
+	.s1 = {{
+		.out = "\007 \eb\"\x90\xCF\r",
+		.flags = ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\xCF\\r",
+		.flags = ESCAPE_SPACE | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\xCF\r",
+		.flags = ESCAPE_SPECIAL | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\xCF\\r",
+		.flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\\317\\015",
+		.flags = ESCAPE_OCTAL | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\\317\\r",
+		.flags = ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\\317\\015",
+		.flags = ESCAPE_SPECIAL | ESCAPE_OCTAL | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\\317\r",
+		.flags = ESCAPE_ANY | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\\xcf\\x0d",
+		.flags = ESCAPE_HEX | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\\xcf\\r",
+		.flags = ESCAPE_SPACE | ESCAPE_HEX | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\\xcf\\x0d",
+		.flags = ESCAPE_SPECIAL | ESCAPE_HEX | ESCAPE_NAP,
+	},{
+		.out = "\007 \eb\"\x90\\xcf\\r",
+		.flags = ESCAPE_SPACE | ESCAPE_SPECIAL | ESCAPE_HEX | ESCAPE_NAP,
+	},{
+		/* terminator */
+	}}
 },{
 	/* terminator */
 }};
@@ -315,8 +435,13 @@ static __init void test_string_escape(const char *name,
 		/* NULL injection */
 		if (flags & ESCAPE_NULL) {
 			in[p++] = '\0';
-			out_test[q_test++] = '\\';
-			out_test[q_test++] = '0';
+			/* '\0' passes isascii() test */
+			if (flags & ESCAPE_NA && !(flags & ESCAPE_APPEND && esc)) {
+				out_test[q_test++] = '\0';
+			} else {
+				out_test[q_test++] = '\\';
+				out_test[q_test++] = '0';
+			}
 		}
 
 		/* Don't try strings that have no output */
@@ -459,17 +584,17 @@ static int __init test_string_helpers_init(void)
 	unsigned int i;
 
 	pr_info("Running tests...\n");
-	for (i = 0; i < UNESCAPE_ANY + 1; i++)
+	for (i = 0; i < UNESCAPE_ALL_MASK + 1; i++)
 		test_string_unescape("unescape", i, false);
 	test_string_unescape("unescape inplace",
 			     get_random_int() % (UNESCAPE_ANY + 1), true);
 
 	/* Without dictionary */
-	for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++)
+	for (i = 0; i < ESCAPE_ALL_MASK + 1; i++)
 		test_string_escape("escape 0", escape0, i, TEST_STRING_2_DICT_0);
 
 	/* With dictionary */
-	for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++)
+	for (i = 0; i < ESCAPE_ALL_MASK + 1; i++)
 		test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1);
 
 	/* Test string_get_size() */
-- 
cgit v1.2.3


From 1d31aa172a4e6728918a06ee7f1d6bcb7507172c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:55:34 -0700
Subject: seq_file: introduce seq_escape_mem()

Introduce seq_escape_mem() to allow users to pass additional parameters to
string_escape_mem().

Link: https://lkml.kernel.org/r/20210504180819.73127-12-andriy.shevchenko@linux.intel.com
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c            | 25 +++++++++++++++++++++++++
 include/linux/seq_file.h |  2 ++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5059248f2d64..532cac2eae0f 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -355,6 +355,31 @@ int seq_release(struct inode *inode, struct file *file)
 }
 EXPORT_SYMBOL(seq_release);
 
+/**
+ * seq_escape_mem - print data into buffer, escaping some characters
+ * @m: target buffer
+ * @src: source buffer
+ * @len: size of source buffer
+ * @flags: flags to pass to string_escape_mem()
+ * @esc: set of characters that need escaping
+ *
+ * Puts data into buffer, replacing each occurrence of character from
+ * given class (defined by @flags and @esc) with printable escaped sequence.
+ *
+ * Use seq_has_overflowed() to check for errors.
+ */
+void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
+		    unsigned int flags, const char *esc)
+{
+	char *buf;
+	size_t size = seq_get_buf(m, &buf);
+	int ret;
+
+	ret = string_escape_mem(src, len, buf, size, flags, esc);
+	seq_commit(m, ret < size ? ret : -1);
+}
+EXPORT_SYMBOL(seq_escape_mem);
+
 /**
  *	seq_escape -	print string into buffer, escaping some characters
  *	@m:	target buffer
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 723b1fa1177e..6de442182784 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -126,6 +126,8 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num
 void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
 		    unsigned long long v, unsigned int width);
 
+void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
+		    unsigned int flags, const char *esc);
 void seq_escape(struct seq_file *m, const char *s, const char *esc);
 void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz);
 
-- 
cgit v1.2.3


From e7ed4a3b922b04d2042cd2e19d1096fa457b6c11 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:55:37 -0700
Subject: seq_file: add seq_escape_str() as replica of string_escape_str()

In some cases we want to escape characters from NULL-terminated strings.
Add seq_escape_str() as replica of string_escape_str() for that.

Link: https://lkml.kernel.org/r/20210504180819.73127-13-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/seq_file.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 6de442182784..63f021cb1b12 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -128,6 +128,13 @@ void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
 
 void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
 		    unsigned int flags, const char *esc);
+
+static inline void seq_escape_str(struct seq_file *m, const char *src,
+				  unsigned int flags, const char *esc)
+{
+	seq_escape_mem(m, src, strlen(src), flags, esc);
+}
+
 void seq_escape(struct seq_file *m, const char *s, const char *esc);
 void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz);
 
-- 
cgit v1.2.3


From cc72181a65990193f54284417efa01d4580014e6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:55:46 -0700
Subject: seq_file: drop unused *_escape_mem_ascii()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no more users of the seq_escape_mem_ascii() followed by
string_escape_mem_ascii().

Remove them for good.

Link: https://lkml.kernel.org/r/20210504180819.73127-16-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c                  | 11 -----------
 include/linux/seq_file.h       |  1 -
 include/linux/string_helpers.h |  3 ---
 lib/string_helpers.c           | 19 -------------------
 4 files changed, 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 08f54029c2b1..b117b212ef28 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -396,17 +396,6 @@ void seq_escape(struct seq_file *m, const char *s, const char *esc)
 }
 EXPORT_SYMBOL(seq_escape);
 
-void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz)
-{
-	char *buf;
-	size_t size = seq_get_buf(m, &buf);
-	int ret;
-
-	ret = string_escape_mem_ascii(src, isz, buf, size);
-	seq_commit(m, ret < size ? ret : -1);
-}
-EXPORT_SYMBOL(seq_escape_mem_ascii);
-
 void seq_vprintf(struct seq_file *m, const char *f, va_list args)
 {
 	int len;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 63f021cb1b12..dd99569595fd 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -136,7 +136,6 @@ static inline void seq_escape_str(struct seq_file *m, const char *src,
 }
 
 void seq_escape(struct seq_file *m, const char *s, const char *esc);
-void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz);
 
 void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
 		  int rowsize, int groupsize, const void *buf, size_t len,
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 9b0eca2badf2..68189c4a2eb1 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -63,9 +63,6 @@ static inline int string_unescape_any_inplace(char *buf)
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		unsigned int flags, const char *only);
 
-int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
-					size_t osz);
-
 static inline int string_escape_mem_any_np(const char *src, size_t isz,
 		char *dst, size_t osz, const char *only)
 {
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index c15aea7a82e9..5a35c7e16e96 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -582,25 +582,6 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 }
 EXPORT_SYMBOL(string_escape_mem);
 
-int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
-					size_t osz)
-{
-	char *p = dst;
-	char *end = p + osz;
-
-	while (isz--) {
-		unsigned char c = *src++;
-
-		if (!isprint(c) || !isascii(c) || c == '"' || c == '\\')
-			escape_hex(c, &p, end);
-		else
-			escape_passthrough(c, &p, end);
-	}
-
-	return p - dst;
-}
-EXPORT_SYMBOL(string_escape_mem_ascii);
-
 /*
  * Return an allocated string that has been escaped of special characters
  * and double quotes, making it safe to log in quotes.
-- 
cgit v1.2.3


From 478485f6c0e5936b62c0c9393a865bfb00f037a5 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 30 Jun 2021 18:55:58 -0700
Subject: lib/mpi: fix spelling mistakes

Fix some spelling mistakes in comments:
flaged ==> flagged
bufer ==> buffer
multipler ==> multiplier
MULTIPLER ==> MULTIPLIER
leaset ==> least
chnage ==> change

Link: https://lkml.kernel.org/r/20210604074401.12198-1-thunder.leizhen@huawei.com
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mpi.h | 4 ++--
 lib/mpi/longlong.h  | 4 ++--
 lib/mpi/mpicoder.c  | 6 +++---
 lib/mpi/mpiutil.c   | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mpi.h b/include/linux/mpi.h
index 3e5358f4de2f..eb0d1c1db208 100644
--- a/include/linux/mpi.h
+++ b/include/linux/mpi.h
@@ -200,7 +200,7 @@ struct mpi_ec_ctx {
 	unsigned int nbits;            /* Number of bits.  */
 
 	/* Domain parameters.  Note that they may not all be set and if set
-	 * the MPIs may be flaged as constant.
+	 * the MPIs may be flagged as constant.
 	 */
 	MPI p;         /* Prime specifying the field GF(p).  */
 	MPI a;         /* First coefficient of the Weierstrass equation.  */
@@ -267,7 +267,7 @@ int mpi_ec_curve_point(MPI_POINT point, struct mpi_ec_ctx *ctx);
 /**
  * mpi_get_size() - returns max size required to store the number
  *
- * @a:	A multi precision integer for which we want to allocate a bufer
+ * @a:	A multi precision integer for which we want to allocate a buffer
  *
  * Return: size required to store the number
  */
diff --git a/lib/mpi/longlong.h b/lib/mpi/longlong.h
index afbd99987cf8..b6fa1d08fb55 100644
--- a/lib/mpi/longlong.h
+++ b/lib/mpi/longlong.h
@@ -48,8 +48,8 @@
 
 /* Define auxiliary asm macros.
  *
- * 1) umul_ppmm(high_prod, low_prod, multipler, multiplicand) multiplies two
- * UWtype integers MULTIPLER and MULTIPLICAND, and generates a two UWtype
+ * 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
+ * UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
  * word product in HIGH_PROD and LOW_PROD.
  *
  * 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c
index 7ea225b2204f..39c4c6731094 100644
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -234,11 +234,11 @@ static int count_lzeros(MPI a)
 }
 
 /**
- * mpi_read_buffer() - read MPI to a bufer provided by user (msb first)
+ * mpi_read_buffer() - read MPI to a buffer provided by user (msb first)
  *
  * @a:		a multi precision integer
- * @buf:	bufer to which the output will be written to. Needs to be at
- *		leaset mpi_get_size(a) long.
+ * @buf:	buffer to which the output will be written to. Needs to be at
+ *		least mpi_get_size(a) long.
  * @buf_len:	size of the buf.
  * @nbytes:	receives the actual length of the data written on success and
  *		the data to-be-written on -EOVERFLOW in case buf_len was too
diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c
index 3c63710c20c6..9a75ca3f7edf 100644
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(mpi_const);
 /****************
  * Note:  It was a bad idea to use the number of limbs to allocate
  *	  because on a alpha the limbs are large but we normally need
- *	  integers of n bits - So we should chnage this to bits (or bytes).
+ *	  integers of n bits - So we should change this to bits (or bytes).
  *
  *	  But mpi_alloc is used in a lot of places :-)
  */
-- 
cgit v1.2.3


From 4c52729377eab025b238caeed48994a39c3b73f2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 30 Jun 2021 18:56:10 -0700
Subject: kernel.h: split out kstrtox() and simple_strtox() to a separate
 header

kernel.h is being used as a dump for all kinds of stuff for a long time.
Here is the attempt to start cleaning it up by splitting out kstrtox() and
simple_strtox() helpers.

At the same time convert users in header and lib folders to use new
header.  Though for time being include new header back to kernel.h to
avoid twisted indirected includes for existing users.

[andy.shevchenko@gmail.com: fix documentation references]
  Link: https://lkml.kernel.org/r/20210615220003.377901-1-andy.shevchenko@gmail.com

Link: https://lkml.kernel.org/r/20210611185815.44103-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Francis Laniel <laniel_francis@privacyrequired.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Kars Mulder <kerneldev@karsmulder.nl>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/kernel-api.rst |   7 +-
 include/linux/kernel.h                | 143 +------------------------------
 include/linux/kstrtox.h               | 155 ++++++++++++++++++++++++++++++++++
 include/linux/string.h                |   7 --
 include/linux/sunrpc/cache.h          |   1 +
 lib/kstrtox.c                         |   5 +-
 lib/parser.c                          |   1 +
 7 files changed, 163 insertions(+), 156 deletions(-)
 create mode 100644 include/linux/kstrtox.h

(limited to 'include/linux')

diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 741aa37dc181..2a7444e3a4c2 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -24,11 +24,8 @@ String Conversions
 .. kernel-doc:: lib/vsprintf.c
    :export:
 
-.. kernel-doc:: include/linux/kernel.h
-   :functions: kstrtol
-
-.. kernel-doc:: include/linux/kernel.h
-   :functions: kstrtoul
+.. kernel-doc:: include/linux/kstrtox.h
+   :functions: kstrtol kstrtoul
 
 .. kernel-doc:: lib/kstrtox.c
    :export:
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index baea2eb763d0..7bb0a5cb7d57 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/bitops.h>
+#include <linux/kstrtox.h>
 #include <linux/log2.h>
 #include <linux/math.h>
 #include <linux/minmax.h>
@@ -180,148 +181,6 @@ static inline void might_fault(void) { }
 void do_exit(long error_code) __noreturn;
 void complete_and_exit(struct completion *, long) __noreturn;
 
-/* Internal, do not use. */
-int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
-int __must_check _kstrtol(const char *s, unsigned int base, long *res);
-
-int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
-int __must_check kstrtoll(const char *s, unsigned int base, long long *res);
-
-/**
- * kstrtoul - convert a string to an unsigned long
- * @s: The start of the string. The string must be null-terminated, and may also
- *  include a single newline before its terminating null. The first character
- *  may also be a plus sign, but not a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- *  given as 0, then the base of the string is automatically detected with the
- *  conventional semantics - If it begins with 0x the number will be parsed as a
- *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- *  parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Preferred over simple_strtoul(). Return code must be checked.
-*/
-static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
-{
-	/*
-	 * We want to shortcut function call, but
-	 * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
-	 */
-	if (sizeof(unsigned long) == sizeof(unsigned long long) &&
-	    __alignof__(unsigned long) == __alignof__(unsigned long long))
-		return kstrtoull(s, base, (unsigned long long *)res);
-	else
-		return _kstrtoul(s, base, res);
-}
-
-/**
- * kstrtol - convert a string to a long
- * @s: The start of the string. The string must be null-terminated, and may also
- *  include a single newline before its terminating null. The first character
- *  may also be a plus sign or a minus sign.
- * @base: The number base to use. The maximum supported base is 16. If base is
- *  given as 0, then the base of the string is automatically detected with the
- *  conventional semantics - If it begins with 0x the number will be parsed as a
- *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
- *  parsed as an octal number. Otherwise it will be parsed as a decimal.
- * @res: Where to write the result of the conversion on success.
- *
- * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
- * Preferred over simple_strtol(). Return code must be checked.
- */
-static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
-{
-	/*
-	 * We want to shortcut function call, but
-	 * __builtin_types_compatible_p(long, long long) = 0.
-	 */
-	if (sizeof(long) == sizeof(long long) &&
-	    __alignof__(long) == __alignof__(long long))
-		return kstrtoll(s, base, (long long *)res);
-	else
-		return _kstrtol(s, base, res);
-}
-
-int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
-int __must_check kstrtoint(const char *s, unsigned int base, int *res);
-
-static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
-{
-	return kstrtoull(s, base, res);
-}
-
-static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
-{
-	return kstrtoll(s, base, res);
-}
-
-static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
-{
-	return kstrtouint(s, base, res);
-}
-
-static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
-{
-	return kstrtoint(s, base, res);
-}
-
-int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
-int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
-int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
-int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
-int __must_check kstrtobool(const char *s, bool *res);
-
-int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
-int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
-int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
-int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
-int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
-int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
-int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
-int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
-int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
-int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
-int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res);
-
-static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
-{
-	return kstrtoull_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
-{
-	return kstrtoll_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
-{
-	return kstrtouint_from_user(s, count, base, res);
-}
-
-static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
-{
-	return kstrtoint_from_user(s, count, base, res);
-}
-
-/*
- * Use kstrto<foo> instead.
- *
- * NOTE: simple_strto<foo> does not check for the range overflow and,
- *	 depending on the input, may give interesting results.
- *
- * Use these functions if and only if you cannot use kstrto<foo>, because
- * the conversion ends on the first non-digit character, which may be far
- * beyond the supported range. It might be useful to parse the strings like
- * 10x50 or 12:21 without altering original string or temporary buffer in use.
- * Keep in mind above caveat.
- */
-
-extern unsigned long simple_strtoul(const char *,char **,unsigned int);
-extern long simple_strtol(const char *,char **,unsigned int);
-extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
-extern long long simple_strtoll(const char *,char **,unsigned int);
-
 extern int num_to_str(char *buf, int size,
 		      unsigned long long num, unsigned int width);
 
diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h
new file mode 100644
index 000000000000..529974e22ea7
--- /dev/null
+++ b/include/linux/kstrtox.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KSTRTOX_H
+#define _LINUX_KSTRTOX_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/* Internal, do not use. */
+int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
+int __must_check _kstrtol(const char *s, unsigned int base, long *res);
+
+int __must_check kstrtoull(const char *s, unsigned int base, unsigned long long *res);
+int __must_check kstrtoll(const char *s, unsigned int base, long long *res);
+
+/**
+ * kstrtoul - convert a string to an unsigned long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ *  include a single newline before its terminating null. The first character
+ *  may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ *  given as 0, then the base of the string is automatically detected with the
+ *  conventional semantics - If it begins with 0x the number will be parsed as a
+ *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ *  parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Preferred over simple_strtoul(). Return code must be checked.
+*/
+static inline int __must_check kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+	/*
+	 * We want to shortcut function call, but
+	 * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
+	 */
+	if (sizeof(unsigned long) == sizeof(unsigned long long) &&
+	    __alignof__(unsigned long) == __alignof__(unsigned long long))
+		return kstrtoull(s, base, (unsigned long long *)res);
+	else
+		return _kstrtoul(s, base, res);
+}
+
+/**
+ * kstrtol - convert a string to a long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ *  include a single newline before its terminating null. The first character
+ *  may also be a plus sign or a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ *  given as 0, then the base of the string is automatically detected with the
+ *  conventional semantics - If it begins with 0x the number will be parsed as a
+ *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ *  parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Preferred over simple_strtol(). Return code must be checked.
+ */
+static inline int __must_check kstrtol(const char *s, unsigned int base, long *res)
+{
+	/*
+	 * We want to shortcut function call, but
+	 * __builtin_types_compatible_p(long, long long) = 0.
+	 */
+	if (sizeof(long) == sizeof(long long) &&
+	    __alignof__(long) == __alignof__(long long))
+		return kstrtoll(s, base, (long long *)res);
+	else
+		return _kstrtol(s, base, res);
+}
+
+int __must_check kstrtouint(const char *s, unsigned int base, unsigned int *res);
+int __must_check kstrtoint(const char *s, unsigned int base, int *res);
+
+static inline int __must_check kstrtou64(const char *s, unsigned int base, u64 *res)
+{
+	return kstrtoull(s, base, res);
+}
+
+static inline int __must_check kstrtos64(const char *s, unsigned int base, s64 *res)
+{
+	return kstrtoll(s, base, res);
+}
+
+static inline int __must_check kstrtou32(const char *s, unsigned int base, u32 *res)
+{
+	return kstrtouint(s, base, res);
+}
+
+static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *res)
+{
+	return kstrtoint(s, base, res);
+}
+
+int __must_check kstrtou16(const char *s, unsigned int base, u16 *res);
+int __must_check kstrtos16(const char *s, unsigned int base, s16 *res);
+int __must_check kstrtou8(const char *s, unsigned int base, u8 *res);
+int __must_check kstrtos8(const char *s, unsigned int base, s8 *res);
+int __must_check kstrtobool(const char *s, bool *res);
+
+int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res);
+int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res);
+int __must_check kstrtoul_from_user(const char __user *s, size_t count, unsigned int base, unsigned long *res);
+int __must_check kstrtol_from_user(const char __user *s, size_t count, unsigned int base, long *res);
+int __must_check kstrtouint_from_user(const char __user *s, size_t count, unsigned int base, unsigned int *res);
+int __must_check kstrtoint_from_user(const char __user *s, size_t count, unsigned int base, int *res);
+int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigned int base, u16 *res);
+int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res);
+int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res);
+int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res);
+int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res);
+
+static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res)
+{
+	return kstrtoull_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtos64_from_user(const char __user *s, size_t count, unsigned int base, s64 *res)
+{
+	return kstrtoll_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtou32_from_user(const char __user *s, size_t count, unsigned int base, u32 *res)
+{
+	return kstrtouint_from_user(s, count, base, res);
+}
+
+static inline int __must_check kstrtos32_from_user(const char __user *s, size_t count, unsigned int base, s32 *res)
+{
+	return kstrtoint_from_user(s, count, base, res);
+}
+
+/*
+ * Use kstrto<foo> instead.
+ *
+ * NOTE: simple_strto<foo> does not check for the range overflow and,
+ *	 depending on the input, may give interesting results.
+ *
+ * Use these functions if and only if you cannot use kstrto<foo>, because
+ * the conversion ends on the first non-digit character, which may be far
+ * beyond the supported range. It might be useful to parse the strings like
+ * 10x50 or 12:21 without altering original string or temporary buffer in use.
+ * Keep in mind above caveat.
+ */
+
+extern unsigned long simple_strtoul(const char *,char **,unsigned int);
+extern long simple_strtol(const char *,char **,unsigned int);
+extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
+extern long long simple_strtoll(const char *,char **,unsigned int);
+
+static inline int strtobool(const char *s, bool *res)
+{
+	return kstrtobool(s, res);
+}
+
+#endif	/* _LINUX_KSTRTOX_H */
diff --git a/include/linux/string.h b/include/linux/string.h
index 9521d8cab18e..b48d2d28e0b1 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -2,7 +2,6 @@
 #ifndef _LINUX_STRING_H_
 #define _LINUX_STRING_H_
 
-
 #include <linux/compiler.h>	/* for inline */
 #include <linux/types.h>	/* for size_t */
 #include <linux/stddef.h>	/* for NULL */
@@ -184,12 +183,6 @@ extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
 extern void argv_free(char **argv);
 
 extern bool sysfs_streq(const char *s1, const char *s2);
-extern int kstrtobool(const char *s, bool *res);
-static inline int strtobool(const char *s, bool *res)
-{
-	return kstrtobool(s, res);
-}
-
 int match_string(const char * const *array, size_t n, const char *string);
 int __sysfs_match_string(const char * const *array, size_t n, const char *s);
 
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index d0965e2997b0..b134b2b3371c 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -14,6 +14,7 @@
 #include <linux/kref.h>
 #include <linux/slab.h>
 #include <linux/atomic.h>
+#include <linux/kstrtox.h>
 #include <linux/proc_fs.h>
 
 /*
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index a118b0b1e9b2..a0cc8dcf4a35 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -14,11 +14,12 @@
  */
 #include <linux/ctype.h>
 #include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/math64.h>
 #include <linux/export.h>
+#include <linux/kstrtox.h>
+#include <linux/math64.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
+
 #include "kstrtox.h"
 
 const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
diff --git a/lib/parser.c b/lib/parser.c
index f1a6d90b8c34..bcb23484100e 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -6,6 +6,7 @@
 #include <linux/ctype.h>
 #include <linux/types.h>
 #include <linux/export.h>
+#include <linux/kstrtox.h>
 #include <linux/parser.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-- 
cgit v1.2.3


From 66ce75144d4b33e376f187df3dec495fe47d2ad0 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 30 Jun 2021 18:56:31 -0700
Subject: kprobes: remove duplicated strong free_insn_page in x86 and s390

free_insn_page() in x86 and s390 is same with the common weak function in
kernel/kprobes.c.  Plus, the comment "Recover page to RW mode before
releasing it" in x86 seems insensible to be there since resetting mapping
is done by common code in vfree() of module_memfree().  So drop these two
duplicated strong functions and related comment, then mark the common one
in kernel/kprobes.c strong.

Link: https://lkml.kernel.org/r/20210608065736.32656-1-song.bao.hua@hisilicon.com
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Qi Liu <liuqi115@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/kernel/kprobes.c     | 5 -----
 arch/x86/kernel/kprobes/core.c | 6 ------
 include/linux/kprobes.h        | 1 -
 kernel/kprobes.c               | 2 +-
 4 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index aae24dc75df6..60cfbd24229b 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -44,11 +44,6 @@ void *alloc_insn_page(void)
 	return page;
 }
 
-void free_insn_page(void *page)
-{
-	module_memfree(page);
-}
-
 static void *alloc_s390_insn_page(void)
 {
 	if (xchg(&insn_page_in_use, 1) == 1)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d3d65545cb8b..3bce67d3a03c 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -422,12 +422,6 @@ void *alloc_insn_page(void)
 	return page;
 }
 
-/* Recover page to RW mode before releasing it */
-void free_insn_page(void *page)
-{
-	module_memfree(page);
-}
-
 /* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
 
 static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 1883a4a9f16a..c98a35a75f40 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -407,7 +407,6 @@ int enable_kprobe(struct kprobe *kp);
 void dump_kprobe(struct kprobe *kp);
 
 void *alloc_insn_page(void);
-void free_insn_page(void *page);
 
 int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		       char *sym);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 745f08fdd7a6..e0c4c9d57299 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -106,7 +106,7 @@ void __weak *alloc_insn_page(void)
 	return module_alloc(PAGE_SIZE);
 }
 
-void __weak free_insn_page(void *page)
+static void free_insn_page(void *page)
 {
 	module_memfree(page);
 }
-- 
cgit v1.2.3


From 97c885d585c53d3f1ad4545b0ee10f0bdfaa1a4d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 30 Jun 2021 18:56:43 -0700
Subject: x86: signal: don't do sas_ss_reset() until we are certain that
 sigframe won't be abandoned

Currently we handle SS_AUTODISARM as soon as we have stored the altstack
settings into sigframe - that's the point when we have set the things up
for eventual sigreturn to restore the old settings.  And if we manage to
set the sigframe up (we are not done with that yet), everything's fine.
However, in case of failure we end up with sigframe-to-be abandoned and
SIGSEGV force-delivered.  And in that case we end up with inconsistent
rules - late failures have altstack reset, early ones do not.

It's trivial to get consistent behaviour - just handle SS_AUTODISARM once
we have set the sigframe up and are committed to entering the handler,
i.e.  in signal_delivered().

Link: https://lore.kernel.org/lkml/20200404170604.GN23230@ZenIV.linux.org.uk/
Link: https://github.com/ClangBuiltLinux/linux/issues/876
Link: https://lkml.kernel.org/r/20210422230846.1756380-1-ndesaulniers@google.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Tested-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h |  2 --
 include/linux/signal.h |  2 --
 kernel/signal.c        | 14 ++++----------
 3 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 8855b1b702b2..c270124e4402 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -532,8 +532,6 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long);
 			&__uss->ss_sp, label); \
 	unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
 	unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
-	if (t->sas_ss_flags & SS_AUTODISARM) \
-		sas_ss_reset(t); \
 } while (0);
 
 /*
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 5160fd45e5ca..3454c7ff0778 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -462,8 +462,6 @@ int __save_altstack(stack_t __user *, unsigned long);
 	unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \
 	unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
 	unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
-	if (t->sas_ss_flags & SS_AUTODISARM) \
-		sas_ss_reset(t); \
 } while (0);
 
 #ifdef CONFIG_PROC_FS
diff --git a/kernel/signal.c b/kernel/signal.c
index 30a0bee5ff9b..6b4df88cc630 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2829,6 +2829,8 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
 	if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
 		sigaddset(&blocked, ksig->sig);
 	set_current_blocked(&blocked);
+	if (current->sas_ss_flags & SS_AUTODISARM)
+		sas_ss_reset(current);
 	tracehook_signal_handler(stepping);
 }
 
@@ -4147,11 +4149,7 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
 	int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
 		__put_user(t->sas_ss_flags, &uss->ss_flags) |
 		__put_user(t->sas_ss_size, &uss->ss_size);
-	if (err)
-		return err;
-	if (t->sas_ss_flags & SS_AUTODISARM)
-		sas_ss_reset(t);
-	return 0;
+	return err;
 }
 
 #ifdef CONFIG_COMPAT
@@ -4206,11 +4204,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
 			 &uss->ss_sp) |
 		__put_user(t->sas_ss_flags, &uss->ss_flags) |
 		__put_user(t->sas_ss_size, &uss->ss_size);
-	if (err)
-		return err;
-	if (t->sas_ss_flags & SS_AUTODISARM)
-		sas_ss_reset(t);
-	return 0;
+	return err;
 }
 #endif
 
-- 
cgit v1.2.3


From 540540d06e9d9b3769b46d88def90f7e7c002322 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 30 Jun 2021 18:56:49 -0700
Subject: kcov: add __no_sanitize_coverage to fix noinstr for all architectures

Until now no compiler supported an attribute to disable coverage
instrumentation as used by KCOV.

To work around this limitation on x86, noinstr functions have their
coverage instrumentation turned into nops by objtool.  However, this
solution doesn't scale automatically to other architectures, such as
arm64, which are migrating to use the generic entry code.

Clang [1] and GCC [2] have added support for the attribute recently.
[1] https://github.com/llvm/llvm-project/commit/280333021e9550d80f5c1152a34e33e81df1e178
[2] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=cec4d4a6782c9bd8d071839c50a239c49caca689
The changes will appear in Clang 13 and GCC 12.

Add __no_sanitize_coverage for both compilers, and add it to noinstr.

Note: In the Clang case, __has_feature(coverage_sanitizer) is only true if
the feature is enabled, and therefore we do not require an additional
defined(CONFIG_KCOV) (like in the GCC case where __has_attribute(..) is
always true) to avoid adding redundant attributes to functions if KCOV is
off.  That being said, compilers that support the attribute will not
generate errors/warnings if the attribute is redundantly used; however,
where possible let's avoid it as it reduces preprocessed code size and
associated compile-time overheads.

[elver@google.com: Implement __has_feature(coverage_sanitizer) in Clang]
  Link: https://lkml.kernel.org/r/20210527162655.3246381-1-elver@google.com
[elver@google.com: add comment explaining __has_feature() in Clang]
  Link: https://lkml.kernel.org/r/20210527194448.3470080-1-elver@google.com

Link: https://lkml.kernel.org/r/20210525175819.699786-1-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Will Deacon <will@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Arvind Sankar <nivedita@alum.mit.edu>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-clang.h | 17 +++++++++++++++++
 include/linux/compiler-gcc.h   |  6 ++++++
 include/linux/compiler_types.h |  2 +-
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index adbe76b203e2..49b0ac8b6fd3 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -13,6 +13,12 @@
 /* all clang versions usable with the kernel support KASAN ABI version 5 */
 #define KASAN_ABI_VERSION 5
 
+/*
+ * Note: Checking __has_feature(*_sanitizer) is only true if the feature is
+ * enabled. Therefore it is not required to additionally check defined(CONFIG_*)
+ * to avoid adding redundant attributes in other configurations.
+ */
+
 #if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer)
 /* Emulate GCC's __SANITIZE_ADDRESS__ flag */
 #define __SANITIZE_ADDRESS__
@@ -45,6 +51,17 @@
 #define __no_sanitize_undefined
 #endif
 
+/*
+ * Support for __has_feature(coverage_sanitizer) was added in Clang 13 together
+ * with no_sanitize("coverage"). Prior versions of Clang support coverage
+ * instrumentation, but cannot be queried for support by the preprocessor.
+ */
+#if __has_feature(coverage_sanitizer)
+#define __no_sanitize_coverage __attribute__((no_sanitize("coverage")))
+#else
+#define __no_sanitize_coverage
+#endif
+
 /*
  * Not all versions of clang implement the type-generic versions
  * of the builtin overflow checkers. Fortunately, clang implements
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 5d97ef738a57..cb9217fc60af 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -122,6 +122,12 @@
 #define __no_sanitize_undefined
 #endif
 
+#if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__)
+#define __no_sanitize_coverage __attribute__((no_sanitize_coverage))
+#else
+#define __no_sanitize_coverage
+#endif
+
 #if GCC_VERSION >= 50100
 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
 #endif
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index d29bda7f6ebd..cc2bee7f0977 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -210,7 +210,7 @@ struct ftrace_likely_data {
 /* Section for code which can't be instrumented at all */
 #define noinstr								\
 	noinline notrace __attribute((__section__(".noinstr.text")))	\
-	__no_kcsan __no_sanitize_address
+	__no_kcsan __no_sanitize_address __no_sanitize_coverage
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From bd7a94c0fa41dfbea8564556c7a28b05e353c5da Mon Sep 17 00:00:00 2001
From: Aubrey Li <aubrey.li@intel.com>
Date: Fri, 2 Jul 2021 15:03:49 +0800
Subject: ACPI: Correct \_SB._OSC bit definition for PRM

According to Platform Runtime Mechanism Specification v1.0 [1],
Page 42, \_SB._OSC bit 21 is used to indicate OS support for PRM.

Update the definition of the PRM support bit in the code to match the
specification.

Link: https://uefi.org/sites/default/files/resources/Platform%20Runtime%20Mechanism%20-%20with%20legal%20notice.pdf # [1]
Fixes: 60faa8f1ac6e ("ACPI: Add \_SB._OSC bit for PRM")
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a618ba698a5c..81786864566c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -551,8 +551,8 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_OSLPI_SUPPORT			0x00000100
 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
 #define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00002000
-#define OSC_SB_PRM_SUPPORT			0x00020000
 #define OSC_SB_NATIVE_USB4_SUPPORT		0x00040000
+#define OSC_SB_PRM_SUPPORT			0x00200000
 
 extern bool osc_sb_apei_support_acked;
 extern bool osc_pc_lpi_support_confirmed;
-- 
cgit v1.2.3


From e13cd45d352dedac53529fb49e7d7e293f74fb90 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Mon, 31 May 2021 19:04:04 +0300
Subject: vdpa/mlx5: Support creating resources with uid == 0

Currently all resources must be created with uid != 0 which is essential
when userspace processes are allocating virtquueue resources. Since this
is a kernel implementation, it is perfectly legal to open resources with
uid == 0.

In case firmware supports, avoid allocating user context.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20210531160404.31368-1-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/mlx5/core/resources.c | 6 ++++++
 include/linux/mlx5/mlx5_ifc.h      | 4 +++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c
index 6521cbd0f5c2..836ab9ef0fa6 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -54,6 +54,9 @@ static int create_uctx(struct mlx5_vdpa_dev *mvdev, u16 *uid)
 	void *in;
 	int err;
 
+	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0))
+		return 0;
+
 	/* 0 means not supported */
 	if (!MLX5_CAP_GEN(mvdev->mdev, log_max_uctx))
 		return -EOPNOTSUPP;
@@ -79,6 +82,9 @@ static void destroy_uctx(struct mlx5_vdpa_dev *mvdev, u32 uid)
 	u32 out[MLX5_ST_SZ_DW(destroy_uctx_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {};
 
+	if (!uid)
+		return;
+
 	MLX5_SET(destroy_uctx_in, in, opcode, MLX5_CMD_OP_DESTROY_UCTX);
 	MLX5_SET(destroy_uctx_in, in, uid, uid);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b4546e29e561..b0009aa3647f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1512,7 +1512,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         uar_4k[0x1];
 	u8         reserved_at_241[0x9];
 	u8         uar_sz[0x6];
-	u8         reserved_at_250[0x8];
+	u8         reserved_at_248[0x2];
+	u8         umem_uid_0[0x1];
+	u8         reserved_at_250[0x5];
 	u8         log_pg_sz[0x8];
 
 	u8         bf[0x1];
-- 
cgit v1.2.3


From a029a4eab39e4bf542907a3263773fce3d48c983 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Fri, 25 Jun 2021 15:17:01 +0200
Subject: s390/cpumf: Allow concurrent access for CPU Measurement Counter
 Facility

Commit cf6acb8bdb1d ("s390/cpumf: Add support for complete counter set extraction")
allows access to the CPU Measurement Counter Facility via character
device /dev/hwctr. The access was exclusive via this device or
via perf_event_open() system call. Only one path at a time was
permitted. The CPU Measurement Counter Facility device driver blocked
access to other processes.

This patch removes this restriction and allows concurrent access to
the CPU Measurement Counter Facility from multiple processes at the same
time via perf_event_open() SVC and via /dev/hwctr device. The access
via /dev/hwctr device is still exclusive, only one process is allowed to
access this device.

This patch
- moves the /dev/hwctr device access from file perf_cpum_cf_diag.c.
  to file perf_cpum_cf.c.
- use only one trace buffer .../s390dbf/cpum_cf.
- remove cfset_csd structure and includes its members it into the
  structure cpu_cf_events. This results in one data structure and
  simplifies the access.
- rework function familiy ctr_set_enable, ctr_set_disable, ctr_set_start
  and ctr_set_stop which operate on a counter set number.
  Now they operate on a counter set bit mask.
- move CF_DIAG event functionality to file perf_cpum_cf.c. It now
  contains the complete functionality of the CPU Measurement Counter
  Facility:
  - Performance measurement support for counters using perf stat.
  - Support for complete counter set extraction with device /dev/hwctr.
  - Support for counter set extraction event CF_DIAG attached to
    samples using perf record.
- removes file perf_cpum_cf_diag.c

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/cpu_mcf.h        |   36 +-
 arch/s390/kernel/Makefile              |    1 -
 arch/s390/kernel/perf_cpum_cf.c        | 1026 +++++++++++++++++++++++++++-
 arch/s390/kernel/perf_cpum_cf_common.c |   27 +-
 arch/s390/kernel/perf_cpum_cf_diag.c   | 1148 --------------------------------
 include/linux/cpuhotplug.h             |    1 -
 6 files changed, 1016 insertions(+), 1223 deletions(-)
 delete mode 100644 arch/s390/kernel/perf_cpum_cf_diag.c

(limited to 'include/linux')

diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h
index 4dcefddb7751..ca0e0e5ddbc4 100644
--- a/arch/s390/include/asm/cpu_mcf.h
+++ b/arch/s390/include/asm/cpu_mcf.h
@@ -32,39 +32,22 @@ static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
 	[CPUMF_CTR_SET_MT_DIAG] = 0x20,
 };
 
-static inline void ctr_set_enable(u64 *state, int ctr_set)
-{
-	*state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT;
-}
-static inline void ctr_set_disable(u64 *state, int ctr_set)
-{
-	*state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT);
-}
-static inline void ctr_set_start(u64 *state, int ctr_set)
-{
-	*state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT;
-}
-static inline void ctr_set_stop(u64 *state, int ctr_set)
-{
-	*state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT);
-}
-
-static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets)
+static inline void ctr_set_enable(u64 *state, u64 ctrsets)
 {
 	*state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
 }
 
-static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets)
+static inline void ctr_set_disable(u64 *state, u64 ctrsets)
 {
 	*state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
 }
 
-static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets)
+static inline void ctr_set_start(u64 *state, u64 ctrsets)
 {
 	*state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
 }
 
-static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets)
+static inline void ctr_set_stop(u64 *state, u64 ctrsets)
 {
 	*state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
 }
@@ -92,8 +75,15 @@ struct cpu_cf_events {
 	struct cpumf_ctr_info	info;
 	atomic_t		ctr_set[CPUMF_CTR_SET_MAX];
 	atomic64_t		alert;
-	u64			state;
+	u64			state;		/* For perf_event_open SVC */
+	u64			dev_state;	/* For /dev/hwctr */
 	unsigned int		flags;
+	size_t used;			/* Bytes used in data */
+	size_t usedss;			/* Bytes used in start/stop */
+	unsigned char start[PAGE_SIZE];	/* Counter set at event add */
+	unsigned char stop[PAGE_SIZE];	/* Counter set at event delete */
+	unsigned char data[PAGE_SIZE];	/* Counter set at /dev/hwctr */
+	unsigned int sets;		/* # Counter set saved in memory */
 };
 DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events);
 
@@ -124,4 +114,6 @@ static inline int stccm_avail(void)
 
 size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
 			   struct cpumf_ctr_info *info);
+int cfset_online_cpu(unsigned int cpu);
+int cfset_offline_cpu(unsigned int cpu);
 #endif /* _ASM_S390_CPU_MCF_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 68ca1834316f..7a77f7f6f9d8 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -71,7 +71,6 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)	+= ima_arch.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_cpum_cf_common.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf.o perf_cpum_sf.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o perf_regs.o
-obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_diag.o
 
 obj-$(CONFIG_TRACEPOINTS)	+= trace.o
 obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 1b7a0525fbed..975a00c8c564 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -2,8 +2,9 @@
 /*
  * Performance event support for s390x - CPU-measurement Counter Facility
  *
- *  Copyright IBM Corp. 2012, 2019
+ *  Copyright IBM Corp. 2012, 2021
  *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
+ *	       Thomas Richter <tmricht@linux.ibm.com>
  */
 #define KMSG_COMPONENT	"cpum_cf"
 #define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
@@ -14,7 +15,223 @@
 #include <linux/notifier.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/miscdevice.h>
+
 #include <asm/cpu_mcf.h>
+#include <asm/hwctrset.h>
+#include <asm/debug.h>
+
+static unsigned int cfdiag_cpu_speed;	/* CPU speed for CF_DIAG trailer */
+static debug_info_t *cf_dbg;
+
+#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
+						/* interval in seconds */
+
+/* Counter sets are stored as data stream in a page sized memory buffer and
+ * exported to user space via raw data attached to the event sample data.
+ * Each counter set starts with an eight byte header consisting of:
+ * - a two byte eye catcher (0xfeef)
+ * - a one byte counter set number
+ * - a two byte counter set size (indicates the number of counters in this set)
+ * - a three byte reserved value (must be zero) to make the header the same
+ *   size as a counter value.
+ * All counter values are eight byte in size.
+ *
+ * All counter sets are followed by a 64 byte trailer.
+ * The trailer consists of a:
+ * - flag field indicating valid fields when corresponding bit set
+ * - the counter facility first and second version number
+ * - the CPU speed if nonzero
+ * - the time stamp the counter sets have been collected
+ * - the time of day (TOD) base value
+ * - the machine type.
+ *
+ * The counter sets are saved when the process is prepared to be executed on a
+ * CPU and saved again when the process is going to be removed from a CPU.
+ * The difference of both counter sets are calculated and stored in the event
+ * sample data area.
+ */
+struct cf_ctrset_entry {	/* CPU-M CF counter set entry (8 byte) */
+	unsigned int def:16;	/* 0-15  Data Entry Format */
+	unsigned int set:16;	/* 16-31 Counter set identifier */
+	unsigned int ctr:16;	/* 32-47 Number of stored counters */
+	unsigned int res1:16;	/* 48-63 Reserved */
+};
+
+struct cf_trailer_entry {	/* CPU-M CF_DIAG trailer (64 byte) */
+	/* 0 - 7 */
+	union {
+		struct {
+			unsigned int clock_base:1;	/* TOD clock base set */
+			unsigned int speed:1;		/* CPU speed set */
+			/* Measurement alerts */
+			unsigned int mtda:1;	/* Loss of MT ctr. data alert */
+			unsigned int caca:1;	/* Counter auth. change alert */
+			unsigned int lcda:1;	/* Loss of counter data alert */
+		};
+		unsigned long flags;	/* 0-63    All indicators */
+	};
+	/* 8 - 15 */
+	unsigned int cfvn:16;			/* 64-79   Ctr First Version */
+	unsigned int csvn:16;			/* 80-95   Ctr Second Version */
+	unsigned int cpu_speed:32;		/* 96-127  CPU speed */
+	/* 16 - 23 */
+	unsigned long timestamp;		/* 128-191 Timestamp (TOD) */
+	/* 24 - 55 */
+	union {
+		struct {
+			unsigned long progusage1;
+			unsigned long progusage2;
+			unsigned long progusage3;
+			unsigned long tod_base;
+		};
+		unsigned long progusage[4];
+	};
+	/* 56 - 63 */
+	unsigned int mach_type:16;		/* Machine type */
+	unsigned int res1:16;			/* Reserved */
+	unsigned int res2:32;			/* Reserved */
+};
+
+/* Create the trailer data at the end of a page. */
+static void cfdiag_trailer(struct cf_trailer_entry *te)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cpuid cpuid;
+
+	te->cfvn = cpuhw->info.cfvn;		/* Counter version numbers */
+	te->csvn = cpuhw->info.csvn;
+
+	get_cpu_id(&cpuid);			/* Machine type */
+	te->mach_type = cpuid.machine;
+	te->cpu_speed = cfdiag_cpu_speed;
+	if (te->cpu_speed)
+		te->speed = 1;
+	te->clock_base = 1;			/* Save clock base */
+	te->tod_base = tod_clock_base.tod;
+	te->timestamp = get_tod_clock_fast();
+}
+
+/* Read a counter set. The counter set number determines the counter set and
+ * the CPUM-CF first and second version number determine the number of
+ * available counters in each counter set.
+ * Each counter set starts with header containing the counter set number and
+ * the number of eight byte counters.
+ *
+ * The functions returns the number of bytes occupied by this counter set
+ * including the header.
+ * If there is no counter in the counter set, this counter set is useless and
+ * zero is returned on this case.
+ *
+ * Note that the counter sets may not be enabled or active and the stcctm
+ * instruction might return error 3. Depending on error_ok value this is ok,
+ * for example when called from cpumf_pmu_start() call back function.
+ */
+static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
+			       size_t room, bool error_ok)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	size_t ctrset_size, need = 0;
+	int rc = 3;				/* Assume write failure */
+
+	ctrdata->def = CF_DIAG_CTRSET_DEF;
+	ctrdata->set = ctrset;
+	ctrdata->res1 = 0;
+	ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
+
+	if (ctrset_size) {			/* Save data */
+		need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
+		if (need <= room) {
+			rc = ctr_stcctm(ctrset, ctrset_size,
+					(u64 *)(ctrdata + 1));
+		}
+		if (rc != 3 || error_ok)
+			ctrdata->ctr = ctrset_size;
+		else
+			need = 0;
+	}
+
+	debug_sprintf_event(cf_dbg, 3,
+			    "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
+			    " need %zd rc %d\n", __func__, ctrset, ctrset_size,
+			    cpuhw->info.cfvn, cpuhw->info.csvn, need, rc);
+	return need;
+}
+
+/* Read out all counter sets and save them in the provided data buffer.
+ * The last 64 byte host an artificial trailer entry.
+ */
+static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
+			    bool error_ok)
+{
+	struct cf_trailer_entry *trailer;
+	size_t offset = 0, done;
+	int i;
+
+	memset(data, 0, sz);
+	sz -= sizeof(*trailer);		/* Always room for trailer */
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		struct cf_ctrset_entry *ctrdata = data + offset;
+
+		if (!(auth & cpumf_ctr_ctl[i]))
+			continue;	/* Counter set not authorized */
+
+		done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok);
+		offset += done;
+	}
+	trailer = data + offset;
+	cfdiag_trailer(trailer);
+	return offset + sizeof(*trailer);
+}
+
+/* Calculate the difference for each counter in a counter set. */
+static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
+{
+	for (; --counters >= 0; ++pstart, ++pstop)
+		if (*pstop >= *pstart)
+			*pstop -= *pstart;
+		else
+			*pstop = *pstart - *pstop + 1;
+}
+
+/* Scan the counter sets and calculate the difference of each counter
+ * in each set. The result is the increment of each counter during the
+ * period the counter set has been activated.
+ *
+ * Return true on success.
+ */
+static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
+{
+	struct cf_trailer_entry *trailer_start, *trailer_stop;
+	struct cf_ctrset_entry *ctrstart, *ctrstop;
+	size_t offset = 0;
+
+	auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
+	do {
+		ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
+		ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
+
+		if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
+			pr_err_once("cpum_cf_diag counter set compare error "
+				    "in set %i\n", ctrstart->set);
+			return 0;
+		}
+		auth &= ~cpumf_ctr_ctl[ctrstart->set];
+		if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
+			cfdiag_diffctrset((u64 *)(ctrstart + 1),
+					  (u64 *)(ctrstop + 1), ctrstart->ctr);
+			offset += ctrstart->ctr * sizeof(u64) +
+							sizeof(*ctrstart);
+		}
+	} while (ctrstart->def && auth);
+
+	/* Save time_stamp from start of event in stop's trailer */
+	trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
+	trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
+	trailer_stop->progusage[0] = trailer_start->timestamp;
+
+	return 1;
+}
 
 static enum cpumf_ctr_set get_counter_set(u64 event)
 {
@@ -34,7 +251,8 @@ static enum cpumf_ctr_set get_counter_set(u64 event)
 	return set;
 }
 
-static int validate_ctr_version(const struct hw_perf_event *hwc)
+static int validate_ctr_version(const struct hw_perf_event *hwc,
+				enum cpumf_ctr_set set)
 {
 	struct cpu_cf_events *cpuhw;
 	int err = 0;
@@ -43,7 +261,7 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
 	cpuhw = &get_cpu_var(cpu_cf_events);
 
 	/* check required version for counter sets */
-	switch (hwc->config_base) {
+	switch (set) {
 	case CPUMF_CTR_SET_BASIC:
 	case CPUMF_CTR_SET_USER:
 		if (cpuhw->info.cfvn < 1)
@@ -86,6 +304,8 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
 		      (cpuhw->info.act_ctl & mtdiag_ctl)))
 			err = -EOPNOTSUPP;
 		break;
+	case CPUMF_CTR_SET_MAX:
+		err = -EOPNOTSUPP;
 	}
 
 	put_cpu_var(cpu_cf_events);
@@ -95,7 +315,6 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
 static int validate_ctr_auth(const struct hw_perf_event *hwc)
 {
 	struct cpu_cf_events *cpuhw;
-	u64 ctrs_state;
 	int err = 0;
 
 	cpuhw = &get_cpu_var(cpu_cf_events);
@@ -105,8 +324,7 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc)
 	 * return with -ENOENT in order to fall back to other
 	 * PMUs that might suffice the event request.
 	 */
-	ctrs_state = cpumf_ctr_ctl[hwc->config_base];
-	if (!(ctrs_state & cpuhw->info.auth_ctl))
+	if (!(hwc->config_base & cpuhw->info.auth_ctl))
 		err = -ENOENT;
 
 	put_cpu_var(cpu_cf_events);
@@ -126,7 +344,7 @@ static void cpumf_pmu_enable(struct pmu *pmu)
 	if (cpuhw->flags & PMU_F_ENABLED)
 		return;
 
-	err = lcctl(cpuhw->state);
+	err = lcctl(cpuhw->state | cpuhw->dev_state);
 	if (err) {
 		pr_err("Enabling the performance measuring unit "
 		       "failed with rc=%x\n", err);
@@ -151,6 +369,7 @@ static void cpumf_pmu_disable(struct pmu *pmu)
 		return;
 
 	inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
+	inactive |= cpuhw->dev_state;
 	err = lcctl(inactive);
 	if (err) {
 		pr_err("Disabling the performance measuring unit "
@@ -199,6 +418,14 @@ static const int cpumf_generic_events_user[] = {
 	[PERF_COUNT_HW_BUS_CYCLES]	    = -1,
 };
 
+static void cpumf_hw_inuse(void)
+{
+	mutex_lock(&pmc_reserve_mutex);
+	if (atomic_inc_return(&num_events) == 1)
+		__kernel_cpumcf_begin();
+	mutex_unlock(&pmc_reserve_mutex);
+}
+
 static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -258,11 +485,11 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
 		/*
 		 * Use the hardware perf event structure to store the
 		 * counter number in the 'config' member and the counter
-		 * set number in the 'config_base'.  The counter set number
-		 * is then later used to enable/disable the counter(s).
+		 * set number in the 'config_base' as bit mask.
+		 * It is later used to enable/disable the counter(s).
 		 */
 		hwc->config = ev;
-		hwc->config_base = set;
+		hwc->config_base = cpumf_ctr_ctl[set];
 		break;
 	case CPUMF_CTR_SET_MAX:
 		/* The counter could not be associated to a counter set */
@@ -270,22 +497,13 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
 	}
 
 	/* Initialize for using the CPU-measurement counter facility */
-	if (!atomic_inc_not_zero(&num_events)) {
-		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin())
-			err = -EBUSY;
-		else
-			atomic_inc(&num_events);
-		mutex_unlock(&pmc_reserve_mutex);
-	}
-	if (err)
-		return err;
+	cpumf_hw_inuse();
 	event->destroy = hw_perf_event_destroy;
 
 	/* Finally, validate version and authorization of the counter set */
 	err = validate_ctr_auth(hwc);
 	if (!err)
-		err = validate_ctr_version(hwc);
+		err = validate_ctr_version(hwc, set);
 
 	return err;
 }
@@ -361,6 +579,7 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
 	struct hw_perf_event *hwc = &event->hw;
+	int i;
 
 	if (!(hwc->state & PERF_HES_STOPPED))
 		return;
@@ -376,29 +595,92 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
 	 * needs to be synchronized.  At this point, the counter set can be in
 	 * the inactive or disabled state.
 	 */
-	hw_perf_event_reset(event);
+	if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+		cpuhw->usedss = cfdiag_getctr(cpuhw->start,
+					      sizeof(cpuhw->start),
+					      hwc->config_base, true);
+	} else {
+		hw_perf_event_reset(event);
+	}
 
-	/* increment refcount for this counter set */
-	atomic_inc(&cpuhw->ctr_set[hwc->config_base]);
+	/* Increment refcount for counter sets */
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+		if ((hwc->config_base & cpumf_ctr_ctl[i]))
+			atomic_inc(&cpuhw->ctr_set[i]);
+}
+
+/* Create perf event sample with the counter sets as raw data.	The sample
+ * is then pushed to the event subsystem and the function checks for
+ * possible event overflows. If an event overflow occurs, the PMU is
+ * stopped.
+ *
+ * Return non-zero if an event overflow occurred.
+ */
+static int cfdiag_push_sample(struct perf_event *event,
+			      struct cpu_cf_events *cpuhw)
+{
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	int overflow;
+
+	/* Setup perf sample */
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+	memset(&regs, 0, sizeof(regs));
+	memset(&raw, 0, sizeof(raw));
+
+	if (event->attr.sample_type & PERF_SAMPLE_CPU)
+		data.cpu_entry.cpu = event->cpu;
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.frag.size = cpuhw->usedss;
+		raw.frag.data = cpuhw->stop;
+		raw.size = raw.frag.size;
+		data.raw = &raw;
+	}
+
+	overflow = perf_event_overflow(event, &data, &regs);
+	debug_sprintf_event(cf_dbg, 3,
+			    "%s event %#llx sample_type %#llx raw %d ov %d\n",
+			    __func__, event->hw.config,
+			    event->attr.sample_type, raw.size, overflow);
+	if (overflow)
+		event->pmu->stop(event, 0);
+
+	perf_event_update_userpage(event);
+	return overflow;
 }
 
 static void cpumf_pmu_stop(struct perf_event *event, int flags)
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
 	struct hw_perf_event *hwc = &event->hw;
+	int i;
 
 	if (!(hwc->state & PERF_HES_STOPPED)) {
 		/* Decrement reference count for this counter set and if this
 		 * is the last used counter in the set, clear activation
 		 * control and set the counter set state to inactive.
 		 */
-		if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
-			ctr_set_stop(&cpuhw->state, hwc->config_base);
+		for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+			if (!(hwc->config_base & cpumf_ctr_ctl[i]))
+				continue;
+			if (!atomic_dec_return(&cpuhw->ctr_set[i]))
+				ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
+		}
 		hwc->state |= PERF_HES_STOPPED;
 	}
 
 	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-		hw_perf_event_update(event);
+		if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+			local64_inc(&event->count);
+			cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
+						      sizeof(cpuhw->stop),
+						      event->hw.config_base,
+						      false);
+			if (cfdiag_diffctr(cpuhw, event->hw.config_base))
+				cfdiag_push_sample(event, cpuhw);
+		} else
+			hw_perf_event_update(event);
 		hwc->state |= PERF_HES_UPTODATE;
 	}
 }
@@ -419,6 +701,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags)
 static void cpumf_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	int i;
 
 	cpumf_pmu_stop(event, PERF_EF_UPDATE);
 
@@ -430,8 +713,9 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
 	 * clear enable control and resets all counters in a set.  Therefore,
 	 * cpumf_pmu_start() always has to reenable a counter set.
 	 */
-	if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
-		ctr_set_disable(&cpuhw->state, event->hw.config_base);
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+		if (!atomic_read(&cpuhw->ctr_set[i]))
+			ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
 }
 
 /* Performance monitoring unit for s390x */
@@ -448,6 +732,7 @@ static struct pmu cpumf_pmu = {
 	.read	      = cpumf_pmu_read,
 };
 
+static int cfset_init(void);
 static int __init cpumf_pmu_init(void)
 {
 	int rc;
@@ -455,10 +740,689 @@ static int __init cpumf_pmu_init(void)
 	if (!kernel_cpumcf_avail())
 		return -ENODEV;
 
+	/* Setup s390dbf facility */
+	cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
+	if (!cf_dbg) {
+		pr_err("Registration of s390dbf(cpum_cf) failed\n");
+		return -ENOMEM;
+	};
+	debug_register_view(cf_dbg, &debug_sprintf_view);
+
 	cpumf_pmu.attr_groups = cpumf_cf_event_group();
 	rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
-	if (rc)
+	if (rc) {
+		debug_unregister_view(cf_dbg, &debug_sprintf_view);
+		debug_unregister(cf_dbg);
 		pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+	} else if (stccm_avail()) {	/* Setup counter set device */
+		cfset_init();
+	}
+	return rc;
+}
+
+/* Support for the CPU Measurement Facility counter set extraction using
+ * device /dev/hwctr. This allows user space programs to extract complete
+ * counter set via normal file operations.
+ */
+
+static atomic_t cfset_opencnt = ATOMIC_INIT(0);	/* Excl. access */
+static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */
+struct cfset_call_on_cpu_parm {		/* Parm struct for smp_call_on_cpu */
+	unsigned int sets;		/* Counter set bit mask */
+	atomic_t cpus_ack;		/* # CPUs successfully executed func */
+};
+
+static struct cfset_request {		/* CPUs and counter set bit mask */
+	unsigned long ctrset;		/* Bit mask of counter set to read */
+	cpumask_t mask;			/* CPU mask to read from */
+} cfset_request;
+
+static void cfset_ctrset_clear(void)
+{
+	cpumask_clear(&cfset_request.mask);
+	cfset_request.ctrset = 0;
+}
+
+/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access
+ * path is currently used.
+ * The cpu_cf_events::dev_state is used to denote counter sets in use by this
+ * interface. It is always or'ed in. If this interface is not active, its
+ * value is zero and no additional counter sets will be included.
+ *
+ * The cpu_cf_events::state is used by the perf_event_open SVC and remains
+ * unchanged.
+ *
+ * perf_pmu_enable() and perf_pmu_enable() and its call backs
+ * cpumf_pmu_enable() and  cpumf_pmu_disable() are called by the
+ * performance measurement subsystem to enable per process
+ * CPU Measurement counter facility.
+ * The XXX_enable() and XXX_disable functions are used to turn off
+ * x86 performance monitoring interrupt (PMI) during scheduling.
+ * s390 uses these calls to temporarily stop and resume the active CPU
+ * counters sets during scheduling.
+ *
+ * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr
+ * device access.  The perf_event_open() SVC interface makes a lot of effort
+ * to only run the counters while the calling process is actively scheduled
+ * to run.
+ * When /dev/hwctr interface is also used at the same time, the counter sets
+ * will keep running, even when the process is scheduled off a CPU.
+ * However this is not a problem and does not lead to wrong counter values
+ * for the perf_event_open() SVC. The current counter value will be recorded
+ * during schedule-in. At schedule-out time the current counter value is
+ * extracted again and the delta is calculated and added to the event.
+ */
+/* Stop all counter sets via ioctl interface */
+static void cfset_ioctl_off(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int rc;
+
+	cpuhw->dev_state = 0;
+	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+		if ((p->sets & cpumf_ctr_ctl[rc]))
+			atomic_dec(&cpuhw->ctr_set[rc]);
+	rc = lcctl(cpuhw->state);	/* Keep perf_event_open counter sets */
+	if (rc)
+		pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->state, S390_HWCTR_DEVICE, rc);
+	cpuhw->flags &= ~PMU_F_IN_USE;
+	debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
+			    __func__, rc, cpuhw->state, cpuhw->dev_state);
+}
+
+/* Start counter sets on particular CPU */
+static void cfset_ioctl_on(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int rc;
+
+	cpuhw->flags |= PMU_F_IN_USE;
+	ctr_set_enable(&cpuhw->dev_state, p->sets);
+	ctr_set_start(&cpuhw->dev_state, p->sets);
+	for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+		if ((p->sets & cpumf_ctr_ctl[rc]))
+			atomic_inc(&cpuhw->ctr_set[rc]);
+	rc = lcctl(cpuhw->dev_state | cpuhw->state);	/* Start counter sets */
+	if (!rc)
+		atomic_inc(&p->cpus_ack);
+	else
+		pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
+	debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
+			    __func__, rc, cpuhw->state, cpuhw->dev_state);
+}
+
+static void cfset_release_cpu(void *p)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	int rc;
+
+	debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n",
+			    __func__, cpuhw->state, cpuhw->dev_state);
+	rc = lcctl(cpuhw->state);	/* Keep perf_event_open counter sets */
+	if (rc)
+		pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
+		       cpuhw->state, S390_HWCTR_DEVICE, rc);
+	cpuhw->dev_state = 0;
+}
+
+/* Release function is also called when application gets terminated without
+ * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
+ */
+static int cfset_release(struct inode *inode, struct file *file)
+{
+	on_each_cpu(cfset_release_cpu, NULL, 1);
+	hw_perf_event_destroy(NULL);
+	cfset_ctrset_clear();
+	atomic_set(&cfset_opencnt, 0);
+	return 0;
+}
+
+static int cfset_open(struct inode *inode, struct file *file)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/* Only one user space program can open /dev/hwctr */
+	if (atomic_xchg(&cfset_opencnt, 1))
+		return -EBUSY;
+
+	cpumf_hw_inuse();
+	file->private_data = NULL;
+	/* nonseekable_open() never fails */
+	return nonseekable_open(inode, file);
+}
+
+static int cfset_all_stop(void)
+{
+	struct cfset_call_on_cpu_parm p = {
+		.sets = cfset_request.ctrset,
+	};
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
+	free_cpumask_var(mask);
+	return 0;
+}
+
+static int cfset_all_start(void)
+{
+	struct cfset_call_on_cpu_parm p = {
+		.sets = cfset_request.ctrset,
+		.cpus_ack = ATOMIC_INIT(0),
+	};
+	cpumask_var_t mask;
+	int rc = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
+	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
+		on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
+		rc = -EIO;
+		debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__);
+	}
+	free_cpumask_var(mask);
+	return rc;
+}
+
+
+/* Return the maximum required space for all possible CPUs in case one
+ * CPU will be onlined during the START, READ, STOP cycles.
+ * To find out the size of the counter sets, any one CPU will do. They
+ * all have the same counter sets.
+ */
+static size_t cfset_needspace(unsigned int sets)
+{
+	struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
+	size_t bytes = 0;
+	int i;
+
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (!(sets & cpumf_ctr_ctl[i]))
+			continue;
+		bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
+	}
+	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
+		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
+		     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
+	put_cpu_ptr(&cpu_cf_events);
+	return bytes;
+}
+
+static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
+{
+	struct s390_ctrset_read __user *ctrset_read;
+	unsigned int cpu, cpus, rc;
+	void __user *uptr;
+
+	ctrset_read = (struct s390_ctrset_read __user *)arg;
+	uptr = ctrset_read->data;
+	for_each_cpu(cpu, mask) {
+		struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu);
+		struct s390_ctrset_cpudata __user *ctrset_cpudata;
+
+		ctrset_cpudata = uptr;
+		rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
+		rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets);
+		rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data,
+				   cpuhw->used);
+		if (rc)
+			return -EFAULT;
+		uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used;
+		cond_resched();
+	}
+	cpus = cpumask_weight(mask);
+	if (put_user(cpus, &ctrset_read->no_cpus))
+		return -EFAULT;
+	debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__,
+			    uptr - (void __user *)ctrset_read->data);
+	return 0;
+}
+
+static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
+				int ctrset_size, size_t room)
+{
+	size_t need = 0;
+	int rc = -1;
+
+	need = sizeof(*p) + sizeof(u64) * ctrset_size;
+	if (need <= room) {
+		p->set = cpumf_ctr_ctl[ctrset];
+		p->no_cnts = ctrset_size;
+		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
+		if (rc == 3)		/* Nothing stored */
+			need = 0;
+	}
+	return need;
+}
+
+/* Read all counter sets. */
+static void cfset_cpu_read(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cfset_call_on_cpu_parm *p = parm;
+	int set, set_size;
+	size_t space;
+
+	/* No data saved yet */
+	cpuhw->used = 0;
+	cpuhw->sets = 0;
+	memset(cpuhw->data, 0, sizeof(cpuhw->data));
+
+	/* Scan the counter sets */
+	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+		struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
+						 cpuhw->used;
+
+		if (!(p->sets & cpumf_ctr_ctl[set]))
+			continue;	/* Counter set not in list */
+		set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
+		space = sizeof(cpuhw->data) - cpuhw->used;
+		space = cfset_cpuset_read(sp, set, set_size, space);
+		if (space) {
+			cpuhw->used += space;
+			cpuhw->sets += 1;
+		}
+	}
+	debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__,
+			    cpuhw->sets, cpuhw->used);
+}
+
+static int cfset_all_read(unsigned long arg)
+{
+	struct cfset_call_on_cpu_parm p;
+	cpumask_var_t mask;
+	int rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	p.sets = cfset_request.ctrset;
+	cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
+	rc = cfset_all_copy(arg, mask);
+	free_cpumask_var(mask);
 	return rc;
 }
-subsys_initcall(cpumf_pmu_init);
+
+static long cfset_ioctl_read(unsigned long arg)
+{
+	struct s390_ctrset_read read;
+	int ret = 0;
+
+	if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
+		return -EFAULT;
+	ret = cfset_all_read(arg);
+	return ret;
+}
+
+static long cfset_ioctl_stop(void)
+{
+	int ret = ENXIO;
+
+	if (cfset_request.ctrset) {
+		ret = cfset_all_stop();
+		cfset_ctrset_clear();
+	}
+	return ret;
+}
+
+static long cfset_ioctl_start(unsigned long arg)
+{
+	struct s390_ctrset_start __user *ustart;
+	struct s390_ctrset_start start;
+	void __user *umask;
+	unsigned int len;
+	int ret = 0;
+	size_t need;
+
+	if (cfset_request.ctrset)
+		return -EBUSY;
+	ustart = (struct s390_ctrset_start __user *)arg;
+	if (copy_from_user(&start, ustart, sizeof(start)))
+		return -EFAULT;
+	if (start.version != S390_HWCTR_START_VERSION)
+		return -EINVAL;
+	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
+		return -EINVAL;		/* Invalid counter set */
+	if (!start.counter_sets)
+		return -EINVAL;		/* No counter set at all? */
+	cpumask_clear(&cfset_request.mask);
+	len = min_t(u64, start.cpumask_len, cpumask_size());
+	umask = (void __user *)start.cpumask;
+	if (copy_from_user(&cfset_request.mask, umask, len))
+		return -EFAULT;
+	if (cpumask_empty(&cfset_request.mask))
+		return -EINVAL;
+	need = cfset_needspace(start.counter_sets);
+	if (put_user(need, &ustart->data_bytes))
+		ret = -EFAULT;
+	if (ret)
+		goto out;
+	cfset_request.ctrset = start.counter_sets;
+	ret = cfset_all_start();
+out:
+	if (ret)
+		cfset_ctrset_clear();
+	debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n",
+			    __func__, cfset_request.ctrset, need, ret);
+	return ret;
+}
+
+/* Entry point to the /dev/hwctr device interface.
+ * The ioctl system call supports three subcommands:
+ * S390_HWCTR_START: Start the specified counter sets on a CPU list. The
+ *    counter set keeps running until explicitly stopped. Returns the number
+ *    of bytes needed to store the counter values. If another S390_HWCTR_START
+ *    ioctl subcommand is called without a previous S390_HWCTR_STOP stop
+ *    command, -EBUSY is returned.
+ * S390_HWCTR_READ: Read the counter set values from specified CPU list given
+ *    with the S390_HWCTR_START command.
+ * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the
+ *    previous S390_HWCTR_START subcommand.
+ */
+static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	get_online_cpus();
+	mutex_lock(&cfset_ctrset_mutex);
+	switch (cmd) {
+	case S390_HWCTR_START:
+		ret = cfset_ioctl_start(arg);
+		break;
+	case S390_HWCTR_STOP:
+		ret = cfset_ioctl_stop();
+		break;
+	case S390_HWCTR_READ:
+		ret = cfset_ioctl_read(arg);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	put_online_cpus();
+	return ret;
+}
+
+static const struct file_operations cfset_fops = {
+	.owner = THIS_MODULE,
+	.open = cfset_open,
+	.release = cfset_release,
+	.unlocked_ioctl	= cfset_ioctl,
+	.compat_ioctl = cfset_ioctl,
+	.llseek = no_llseek
+};
+
+static struct miscdevice cfset_dev = {
+	.name	= S390_HWCTR_DEVICE,
+	.minor	= MISC_DYNAMIC_MINOR,
+	.fops	= &cfset_fops,
+};
+
+int cfset_online_cpu(unsigned int cpu)
+{
+	struct cfset_call_on_cpu_parm p;
+
+	mutex_lock(&cfset_ctrset_mutex);
+	if (cfset_request.ctrset) {
+		p.sets = cfset_request.ctrset;
+		cfset_ioctl_on(&p);
+		cpumask_set_cpu(cpu, &cfset_request.mask);
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	return 0;
+}
+
+int cfset_offline_cpu(unsigned int cpu)
+{
+	struct cfset_call_on_cpu_parm p;
+
+	mutex_lock(&cfset_ctrset_mutex);
+	if (cfset_request.ctrset) {
+		p.sets = cfset_request.ctrset;
+		cfset_ioctl_off(&p);
+		cpumask_clear_cpu(cpu, &cfset_request.mask);
+	}
+	mutex_unlock(&cfset_ctrset_mutex);
+	return 0;
+}
+
+static void cfdiag_read(struct perf_event *event)
+{
+	debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__,
+			    event->attr.config, local64_read(&event->count));
+}
+
+static int get_authctrsets(void)
+{
+	struct cpu_cf_events *cpuhw;
+	unsigned long auth = 0;
+	enum cpumf_ctr_set i;
+
+	cpuhw = &get_cpu_var(cpu_cf_events);
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
+			auth |= cpumf_ctr_ctl[i];
+	}
+	put_cpu_var(cpu_cf_events);
+	return auth;
+}
+
+/* Setup the event. Test for authorized counter sets and only include counter
+ * sets which are authorized at the time of the setup. Including unauthorized
+ * counter sets result in specification exception (and panic).
+ */
+static int cfdiag_event_init2(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	int err = 0;
+
+	/* Set sample_period to indicate sampling */
+	event->hw.config = attr->config;
+	event->hw.sample_period = attr->sample_period;
+	local64_set(&event->hw.period_left, event->hw.sample_period);
+	local64_set(&event->count, 0);
+	event->hw.last_period = event->hw.sample_period;
+
+	/* Add all authorized counter sets to config_base. The
+	 * the hardware init function is either called per-cpu or just once
+	 * for all CPUS (event->cpu == -1).  This depends on the whether
+	 * counting is started for all CPUs or on a per workload base where
+	 * the perf event moves from one CPU to another CPU.
+	 * Checking the authorization on any CPU is fine as the hardware
+	 * applies the same authorization settings to all CPUs.
+	 */
+	event->hw.config_base = get_authctrsets();
+
+	/* No authorized counter sets, nothing to count/sample */
+	if (!event->hw.config_base)
+		err = -EINVAL;
+
+	debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n",
+			    __func__, err, event->hw.config_base);
+	return err;
+}
+
+static int cfdiag_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	int err = -ENOENT;
+
+	if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
+	    event->attr.type != event->pmu->type)
+		goto out;
+
+	/* Raw events are used to access counters directly,
+	 * hence do not permit excludes.
+	 * This event is useless without PERF_SAMPLE_RAW to return counter set
+	 * values as raw data.
+	 */
+	if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
+	    !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* Initialize for using the CPU-measurement counter facility */
+	cpumf_hw_inuse();
+	event->destroy = hw_perf_event_destroy;
+
+	err = cfdiag_event_init2(event);
+	if (unlikely(err))
+		event->destroy(event);
+out:
+	return err;
+}
+
+/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used
+ * to collect the complete counter sets for a scheduled process. Target
+ * are complete counter sets attached as raw data to the artificial event.
+ * This results in complete counter sets available when a process is
+ * scheduled. Contains the delta of every counter while the process was
+ * running.
+ */
+CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
+
+static struct attribute *cfdiag_events_attr[] = {
+	CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
+	NULL,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *cfdiag_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group cfdiag_events_group = {
+	.name = "events",
+	.attrs = cfdiag_events_attr,
+};
+static struct attribute_group cfdiag_format_group = {
+	.name = "format",
+	.attrs = cfdiag_format_attr,
+};
+static const struct attribute_group *cfdiag_attr_groups[] = {
+	&cfdiag_events_group,
+	&cfdiag_format_group,
+	NULL,
+};
+
+/* Performance monitoring unit for event CF_DIAG. Since this event
+ * is also started and stopped via the perf_event_open() system call, use
+ * the same event enable/disable call back functions. They do not
+ * have a pointer to the perf_event strcture as first parameter.
+ *
+ * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
+ * Reuse them and distinguish the event (always first parameter) via
+ * 'config' member.
+ */
+static struct pmu cf_diag = {
+	.task_ctx_nr  = perf_sw_context,
+	.event_init   = cfdiag_event_init,
+	.pmu_enable   = cpumf_pmu_enable,
+	.pmu_disable  = cpumf_pmu_disable,
+	.add	      = cpumf_pmu_add,
+	.del	      = cpumf_pmu_del,
+	.start	      = cpumf_pmu_start,
+	.stop	      = cpumf_pmu_stop,
+	.read	      = cfdiag_read,
+
+	.attr_groups  = cfdiag_attr_groups
+};
+
+/* Calculate memory needed to store all counter sets together with header and
+ * trailer data. This is independent of the counter set authorization which
+ * can vary depending on the configuration.
+ */
+static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
+{
+	size_t max_size = sizeof(struct cf_trailer_entry);
+	enum cpumf_ctr_set i;
+
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		size_t size = cpum_cf_ctrset_size(i, info);
+
+		if (size)
+			max_size += size * sizeof(u64) +
+				    sizeof(struct cf_ctrset_entry);
+	}
+	return max_size;
+}
+
+/* Get the CPU speed, try sampling facility first and CPU attributes second. */
+static void cfdiag_get_cpu_speed(void)
+{
+	if (cpum_sf_avail()) {			/* Sampling facility first */
+		struct hws_qsi_info_block si;
+
+		memset(&si, 0, sizeof(si));
+		if (!qsi(&si)) {
+			cfdiag_cpu_speed = si.cpu_speed;
+			return;
+		}
+	}
+
+	/* Fallback: CPU speed extract static part. Used in case
+	 * CPU Measurement Sampling Facility is turned off.
+	 */
+	if (test_facility(34)) {
+		unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
+
+		if (mhz != -1UL)
+			cfdiag_cpu_speed = mhz & 0xffffffff;
+	}
+}
+
+static int cfset_init(void)
+{
+	struct cpumf_ctr_info info;
+	size_t need;
+	int rc;
+
+	if (qctri(&info))
+		return -ENODEV;
+
+	cfdiag_get_cpu_speed();
+	/* Make sure the counter set data fits into predefined buffer. */
+	need = cfdiag_maxsize(&info);
+	if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
+		pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
+		       need);
+		return -ENOMEM;
+	}
+
+	rc = misc_register(&cfset_dev);
+	if (rc) {
+		pr_err("Registration of /dev/%s failed rc=%i\n",
+		       cfset_dev.name, rc);
+		goto out;
+	}
+
+	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
+	if (rc) {
+		misc_deregister(&cfset_dev);
+		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
+		       rc);
+	}
+out:
+	return rc;
+}
+
+device_initcall(cpumf_pmu_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c
index 2300fbaac556..30f0242de4a5 100644
--- a/arch/s390/kernel/perf_cpum_cf_common.c
+++ b/arch/s390/kernel/perf_cpum_cf_common.c
@@ -29,7 +29,11 @@ DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = {
 	},
 	.alert = ATOMIC64_INIT(0),
 	.state = 0,
+	.dev_state = 0,
 	.flags = 0,
+	.used = 0,
+	.usedss = 0,
+	.sets = 0
 };
 /* Indicator whether the CPU-Measurement Counter Facility Support is ready */
 static bool cpum_cf_initalized;
@@ -96,25 +100,10 @@ bool kernel_cpumcf_avail(void)
 }
 EXPORT_SYMBOL(kernel_cpumcf_avail);
 
-
-/* Reserve/release functions for sharing perf hardware */
-static DEFINE_SPINLOCK(cpumcf_owner_lock);
-static void *cpumcf_owner;
-
 /* Initialize the CPU-measurement counter facility */
 int __kernel_cpumcf_begin(void)
 {
 	int flags = PMC_INIT;
-	int err = 0;
-
-	spin_lock(&cpumcf_owner_lock);
-	if (cpumcf_owner)
-		err = -EBUSY;
-	else
-		cpumcf_owner = __builtin_return_address(0);
-	spin_unlock(&cpumcf_owner_lock);
-	if (err)
-		return err;
 
 	on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
 	irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
@@ -144,10 +133,6 @@ void __kernel_cpumcf_end(void)
 
 	on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
 	irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
-	spin_lock(&cpumcf_owner_lock);
-	cpumcf_owner = NULL;
-	spin_unlock(&cpumcf_owner_lock);
 }
 EXPORT_SYMBOL(__kernel_cpumcf_end);
 
@@ -161,11 +146,13 @@ static int cpum_cf_setup(unsigned int cpu, int flags)
 
 static int cpum_cf_online_cpu(unsigned int cpu)
 {
-	return cpum_cf_setup(cpu, PMC_INIT);
+	cpum_cf_setup(cpu, PMC_INIT);
+	return cfset_online_cpu(cpu);
 }
 
 static int cpum_cf_offline_cpu(unsigned int cpu)
 {
+	cfset_offline_cpu(cpu);
 	return cpum_cf_setup(cpu, PMC_RELEASE);
 }
 
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
deleted file mode 100644
index 08c985c1097c..000000000000
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ /dev/null
@@ -1,1148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Performance event support for s390x - CPU-measurement Counter Sets
- *
- *  Copyright IBM Corp. 2019, 2021
- *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- *	       Thomas Richer <tmricht@linux.ibm.com>
- */
-#define KMSG_COMPONENT	"cpum_cf_diag"
-#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/processor.h>
-#include <linux/miscdevice.h>
-#include <linux/mutex.h>
-
-#include <asm/ctl_reg.h>
-#include <asm/irq.h>
-#include <asm/cpu_mcf.h>
-#include <asm/timex.h>
-#include <asm/debug.h>
-
-#include <asm/hwctrset.h>
-
-#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
-						/* interval in seconds */
-static unsigned int cf_diag_cpu_speed;
-static debug_info_t *cf_diag_dbg;
-
-struct cf_diag_csd {			/* Counter set data per CPU */
-	size_t used;			/* Bytes used in data/start */
-	unsigned char start[PAGE_SIZE];	/* Counter set at event start */
-	unsigned char data[PAGE_SIZE];	/* Counter set at event delete */
-	unsigned int sets;		/* # Counter set saved in data */
-};
-static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
-
-/* Counter sets are stored as data stream in a page sized memory buffer and
- * exported to user space via raw data attached to the event sample data.
- * Each counter set starts with an eight byte header consisting of:
- * - a two byte eye catcher (0xfeef)
- * - a one byte counter set number
- * - a two byte counter set size (indicates the number of counters in this set)
- * - a three byte reserved value (must be zero) to make the header the same
- *   size as a counter value.
- * All counter values are eight byte in size.
- *
- * All counter sets are followed by a 64 byte trailer.
- * The trailer consists of a:
- * - flag field indicating valid fields when corresponding bit set
- * - the counter facility first and second version number
- * - the CPU speed if nonzero
- * - the time stamp the counter sets have been collected
- * - the time of day (TOD) base value
- * - the machine type.
- *
- * The counter sets are saved when the process is prepared to be executed on a
- * CPU and saved again when the process is going to be removed from a CPU.
- * The difference of both counter sets are calculated and stored in the event
- * sample data area.
- */
-
-struct cf_ctrset_entry {	/* CPU-M CF counter set entry (8 byte) */
-	unsigned int def:16;	/* 0-15  Data Entry Format */
-	unsigned int set:16;	/* 16-31 Counter set identifier */
-	unsigned int ctr:16;	/* 32-47 Number of stored counters */
-	unsigned int res1:16;	/* 48-63 Reserved */
-};
-
-struct cf_trailer_entry {	/* CPU-M CF_DIAG trailer (64 byte) */
-	/* 0 - 7 */
-	union {
-		struct {
-			unsigned int clock_base:1;	/* TOD clock base set */
-			unsigned int speed:1;		/* CPU speed set */
-			/* Measurement alerts */
-			unsigned int mtda:1;	/* Loss of MT ctr. data alert */
-			unsigned int caca:1;	/* Counter auth. change alert */
-			unsigned int lcda:1;	/* Loss of counter data alert */
-		};
-		unsigned long flags;	/* 0-63    All indicators */
-	};
-	/* 8 - 15 */
-	unsigned int cfvn:16;			/* 64-79   Ctr First Version */
-	unsigned int csvn:16;			/* 80-95   Ctr Second Version */
-	unsigned int cpu_speed:32;		/* 96-127  CPU speed */
-	/* 16 - 23 */
-	unsigned long timestamp;		/* 128-191 Timestamp (TOD) */
-	/* 24 - 55 */
-	union {
-		struct {
-			unsigned long progusage1;
-			unsigned long progusage2;
-			unsigned long progusage3;
-			unsigned long tod_base;
-		};
-		unsigned long progusage[4];
-	};
-	/* 56 - 63 */
-	unsigned int mach_type:16;		/* Machine type */
-	unsigned int res1:16;			/* Reserved */
-	unsigned int res2:32;			/* Reserved */
-};
-
-/* Create the trailer data at the end of a page. */
-static void cf_diag_trailer(struct cf_trailer_entry *te)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cpuid cpuid;
-
-	te->cfvn = cpuhw->info.cfvn;		/* Counter version numbers */
-	te->csvn = cpuhw->info.csvn;
-
-	get_cpu_id(&cpuid);			/* Machine type */
-	te->mach_type = cpuid.machine;
-	te->cpu_speed = cf_diag_cpu_speed;
-	if (te->cpu_speed)
-		te->speed = 1;
-	te->clock_base = 1;			/* Save clock base */
-	te->tod_base = tod_clock_base.tod;
-	te->timestamp = get_tod_clock_fast();
-}
-
-/*
- * Change the CPUMF state to active.
- * Enable and activate the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_enable(struct pmu *pmu)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	int err;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s pmu %p cpu %d flags %#x state %#llx\n",
-			    __func__, pmu, smp_processor_id(), cpuhw->flags,
-			    cpuhw->state);
-	if (cpuhw->flags & PMU_F_ENABLED)
-		return;
-
-	err = lcctl(cpuhw->state);
-	if (err) {
-		pr_err("Enabling the performance measuring unit "
-		       "failed with rc=%x\n", err);
-		return;
-	}
-	cpuhw->flags |= PMU_F_ENABLED;
-}
-
-/*
- * Change the CPUMF state to inactive.
- * Disable and enable (inactive) the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_disable(struct pmu *pmu)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	u64 inactive;
-	int err;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s pmu %p cpu %d flags %#x state %#llx\n",
-			    __func__, pmu, smp_processor_id(), cpuhw->flags,
-			    cpuhw->state);
-	if (!(cpuhw->flags & PMU_F_ENABLED))
-		return;
-
-	inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
-	err = lcctl(inactive);
-	if (err) {
-		pr_err("Disabling the performance measuring unit "
-		       "failed with rc=%x\n", err);
-		return;
-	}
-	cpuhw->flags &= ~PMU_F_ENABLED;
-}
-
-/* Number of perf events counting hardware events */
-static atomic_t cf_diag_events = ATOMIC_INIT(0);
-/* Used to avoid races in calling reserve/release_cpumf_hardware */
-static DEFINE_MUTEX(cf_diag_reserve_mutex);
-
-/* Release the PMU if event is the last perf event */
-static void cf_diag_perf_event_destroy(struct perf_event *event)
-{
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d cf_diag_events %d\n",
-			    __func__, event, smp_processor_id(),
-			    atomic_read(&cf_diag_events));
-	if (atomic_dec_return(&cf_diag_events) == 0)
-		__kernel_cpumcf_end();
-}
-
-static int get_authctrsets(void)
-{
-	struct cpu_cf_events *cpuhw;
-	unsigned long auth = 0;
-	enum cpumf_ctr_set i;
-
-	cpuhw = &get_cpu_var(cpu_cf_events);
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
-			auth |= cpumf_ctr_ctl[i];
-	}
-	put_cpu_var(cpu_cf_events);
-	return auth;
-}
-
-/* Setup the event. Test for authorized counter sets and only include counter
- * sets which are authorized at the time of the setup. Including unauthorized
- * counter sets result in specification exception (and panic).
- */
-static int __hw_perf_event_init(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	int err = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
-			    event, event->cpu);
-
-	event->hw.config = attr->config;
-
-	/* Add all authorized counter sets to config_base. The
-	 * the hardware init function is either called per-cpu or just once
-	 * for all CPUS (event->cpu == -1).  This depends on the whether
-	 * counting is started for all CPUs or on a per workload base where
-	 * the perf event moves from one CPU to another CPU.
-	 * Checking the authorization on any CPU is fine as the hardware
-	 * applies the same authorization settings to all CPUs.
-	 */
-	event->hw.config_base = get_authctrsets();
-
-	/* No authorized counter sets, nothing to count/sample */
-	if (!event->hw.config_base) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	/* Set sample_period to indicate sampling */
-	event->hw.sample_period = attr->sample_period;
-	local64_set(&event->hw.period_left, event->hw.sample_period);
-	event->hw.last_period  = event->hw.sample_period;
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n",
-			    __func__, err, event->hw.config_base);
-	return err;
-}
-
-/* Return 0 if the CPU-measurement counter facility is currently free
- * and an error otherwise.
- */
-static int cf_diag_perf_event_inuse(void)
-{
-	int err = 0;
-
-	if (!atomic_inc_not_zero(&cf_diag_events)) {
-		mutex_lock(&cf_diag_reserve_mutex);
-		if (atomic_read(&cf_diag_events) == 0 &&
-		    __kernel_cpumcf_begin())
-			err = -EBUSY;
-		else
-			err = atomic_inc_return(&cf_diag_events);
-		mutex_unlock(&cf_diag_reserve_mutex);
-	}
-	return err;
-}
-
-static int cf_diag_event_init(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	int err = -ENOENT;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d config %#llx type:%u "
-			    "sample_type %#llx cf_diag_events %d\n", __func__,
-			    event, event->cpu, attr->config, event->pmu->type,
-			    attr->sample_type, atomic_read(&cf_diag_events));
-
-	if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
-	    event->attr.type != event->pmu->type)
-		goto out;
-
-	/* Raw events are used to access counters directly,
-	 * hence do not permit excludes.
-	 * This event is usesless without PERF_SAMPLE_RAW to return counter set
-	 * values as raw data.
-	 */
-	if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
-	    !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	/* Initialize for using the CPU-measurement counter facility */
-	err = cf_diag_perf_event_inuse();
-	if (err < 0)
-		goto out;
-	event->destroy = cf_diag_perf_event_destroy;
-
-	err = __hw_perf_event_init(event);
-	if (unlikely(err))
-		event->destroy(event);
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
-	return err;
-}
-
-static void cf_diag_read(struct perf_event *event)
-{
-	debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
-}
-
-/* Calculate memory needed to store all counter sets together with header and
- * trailer data. This is independend of the counter set authorization which
- * can vary depending on the configuration.
- */
-static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
-{
-	size_t max_size = sizeof(struct cf_trailer_entry);
-	enum cpumf_ctr_set i;
-
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		size_t size = cpum_cf_ctrset_size(i, info);
-
-		if (size)
-			max_size += size * sizeof(u64) +
-				    sizeof(struct cf_ctrset_entry);
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__,
-			    max_size);
-
-	return max_size;
-}
-
-/* Read a counter set. The counter set number determines which counter set and
- * the CPUM-CF first and second version number determine the number of
- * available counters in this counter set.
- * Each counter set starts with header containing the counter set number and
- * the number of 8 byte counters.
- *
- * The functions returns the number of bytes occupied by this counter set
- * including the header.
- * If there is no counter in the counter set, this counter set is useless and
- * zero is returned on this case.
- */
-static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
-				size_t room)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	size_t ctrset_size, need = 0;
-	int rc = 3;				/* Assume write failure */
-
-	ctrdata->def = CF_DIAG_CTRSET_DEF;
-	ctrdata->set = ctrset;
-	ctrdata->res1 = 0;
-	ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info);
-
-	if (ctrset_size) {			/* Save data */
-		need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
-		if (need <= room)
-			rc = ctr_stcctm(ctrset, ctrset_size,
-					(u64 *)(ctrdata + 1));
-		if (rc != 3)
-			ctrdata->ctr = ctrset_size;
-		else
-			need = 0;
-	}
-
-	debug_sprintf_event(cf_diag_dbg, 6,
-			    "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
-			    " need %zd rc %d\n",
-			    __func__, ctrset, ctrset_size, cpuhw->info.cfvn,
-			    cpuhw->info.csvn, need, rc);
-	return need;
-}
-
-/* Read out all counter sets and save them in the provided data buffer.
- * The last 64 byte host an artificial trailer entry.
- */
-static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth)
-{
-	struct cf_trailer_entry *trailer;
-	size_t offset = 0, done;
-	int i;
-
-	memset(data, 0, sz);
-	sz -= sizeof(*trailer);			/* Always room for trailer */
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		struct cf_ctrset_entry *ctrdata = data + offset;
-
-		if (!(auth & cpumf_ctr_ctl[i]))
-			continue;	/* Counter set not authorized */
-
-		done = cf_diag_getctrset(ctrdata, i, sz - offset);
-		offset += done;
-		debug_sprintf_event(cf_diag_dbg, 6,
-				    "%s ctrset %d offset %zu done %zu\n",
-				     __func__, i, offset, done);
-	}
-	trailer = data + offset;
-	cf_diag_trailer(trailer);
-	return offset + sizeof(*trailer);
-}
-
-/* Calculate the difference for each counter in a counter set. */
-static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters)
-{
-	for (; --counters >= 0; ++pstart, ++pstop)
-		if (*pstop >= *pstart)
-			*pstop -= *pstart;
-		else
-			*pstop = *pstart - *pstop;
-}
-
-/* Scan the counter sets and calculate the difference of each counter
- * in each set. The result is the increment of each counter during the
- * period the counter set has been activated.
- *
- * Return true on success.
- */
-static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth)
-{
-	struct cf_trailer_entry *trailer_start, *trailer_stop;
-	struct cf_ctrset_entry *ctrstart, *ctrstop;
-	size_t offset = 0;
-
-	auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
-	do {
-		ctrstart = (struct cf_ctrset_entry *)(csd->start + offset);
-		ctrstop = (struct cf_ctrset_entry *)(csd->data + offset);
-
-		if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
-			pr_err("cpum_cf_diag counter set compare error "
-				"in set %i\n", ctrstart->set);
-			return 0;
-		}
-		auth &= ~cpumf_ctr_ctl[ctrstart->set];
-		if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
-			cf_diag_diffctrset((u64 *)(ctrstart + 1),
-					  (u64 *)(ctrstop + 1), ctrstart->ctr);
-			offset += ctrstart->ctr * sizeof(u64) +
-				  sizeof(*ctrstart);
-		}
-		debug_sprintf_event(cf_diag_dbg, 6,
-				    "%s set %d ctr %d offset %zu auth %lx\n",
-				    __func__, ctrstart->set, ctrstart->ctr,
-				    offset, auth);
-	} while (ctrstart->def && auth);
-
-	/* Save time_stamp from start of event in stop's trailer */
-	trailer_start = (struct cf_trailer_entry *)(csd->start + offset);
-	trailer_stop = (struct cf_trailer_entry *)(csd->data + offset);
-	trailer_stop->progusage[0] = trailer_start->timestamp;
-
-	return 1;
-}
-
-/* Create perf event sample with the counter sets as raw data.	The sample
- * is then pushed to the event subsystem and the function checks for
- * possible event overflows. If an event overflow occurs, the PMU is
- * stopped.
- *
- * Return non-zero if an event overflow occurred.
- */
-static int cf_diag_push_sample(struct perf_event *event,
-			       struct cf_diag_csd *csd)
-{
-	struct perf_sample_data data;
-	struct perf_raw_record raw;
-	struct pt_regs regs;
-	int overflow;
-
-	/* Setup perf sample */
-	perf_sample_data_init(&data, 0, event->hw.last_period);
-	memset(&regs, 0, sizeof(regs));
-	memset(&raw, 0, sizeof(raw));
-
-	if (event->attr.sample_type & PERF_SAMPLE_CPU)
-		data.cpu_entry.cpu = event->cpu;
-	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		raw.frag.size = csd->used;
-		raw.frag.data = csd->data;
-		raw.size = csd->used;
-		data.raw = &raw;
-	}
-
-	overflow = perf_event_overflow(event, &data, &regs);
-	debug_sprintf_event(cf_diag_dbg, 6,
-			    "%s event %p cpu %d sample_type %#llx raw %d "
-			    "ov %d\n", __func__, event, event->cpu,
-			    event->attr.sample_type, raw.size, overflow);
-	if (overflow)
-		event->pmu->stop(event, 0);
-
-	perf_event_update_userpage(event);
-	return overflow;
-}
-
-static void cf_diag_start(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct hw_perf_event *hwc = &event->hw;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x hwc-state %#x\n",
-			    __func__, event, event->cpu, flags, hwc->state);
-	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
-		return;
-
-	/* (Re-)enable and activate all counter sets */
-	lcctl(0);		/* Reset counter sets */
-	hwc->state = 0;
-	ctr_set_multiple_enable(&cpuhw->state, hwc->config_base);
-	lcctl(cpuhw->state);	/* Enable counter sets */
-	csd->used = cf_diag_getctr(csd->start, sizeof(csd->start),
-				   event->hw.config_base);
-	ctr_set_multiple_start(&cpuhw->state, hwc->config_base);
-	/* Function cf_diag_enable() starts the counter sets. */
-}
-
-static void cf_diag_stop(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct hw_perf_event *hwc = &event->hw;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x hwc-state %#x\n",
-			    __func__, event, event->cpu, flags, hwc->state);
-
-	/* Deactivate all counter sets */
-	ctr_set_multiple_stop(&cpuhw->state, hwc->config_base);
-	local64_inc(&event->count);
-	csd->used = cf_diag_getctr(csd->data, sizeof(csd->data),
-				   event->hw.config_base);
-	if (cf_diag_diffctr(csd, event->hw.config_base))
-		cf_diag_push_sample(event, csd);
-	hwc->state |= PERF_HES_STOPPED;
-}
-
-static int cf_diag_add(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	int err = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x cpuhw %p\n",
-			    __func__, event, event->cpu, flags, cpuhw);
-
-	if (cpuhw->flags & PMU_F_IN_USE) {
-		err = -EAGAIN;
-		goto out;
-	}
-
-	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
-	cpuhw->flags |= PMU_F_IN_USE;
-	if (flags & PERF_EF_START)
-		cf_diag_start(event, PERF_EF_RELOAD);
-out:
-	debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
-	return err;
-}
-
-static void cf_diag_del(struct perf_event *event, int flags)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s event %p cpu %d flags %#x\n",
-			   __func__, event, event->cpu, flags);
-
-	cf_diag_stop(event, PERF_EF_UPDATE);
-	ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base);
-	ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base);
-	cpuhw->flags &= ~PMU_F_IN_USE;
-}
-
-/* Default counter set events and format attribute groups */
-
-CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
-
-static struct attribute *cf_diag_events_attr[] = {
-	CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
-	NULL,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-
-static struct attribute *cf_diag_format_attr[] = {
-	&format_attr_event.attr,
-	NULL,
-};
-
-static struct attribute_group cf_diag_events_group = {
-	.name = "events",
-	.attrs = cf_diag_events_attr,
-};
-static struct attribute_group cf_diag_format_group = {
-	.name = "format",
-	.attrs = cf_diag_format_attr,
-};
-static const struct attribute_group *cf_diag_attr_groups[] = {
-	&cf_diag_events_group,
-	&cf_diag_format_group,
-	NULL,
-};
-
-/* Performance monitoring unit for s390x */
-static struct pmu cf_diag = {
-	.task_ctx_nr  = perf_sw_context,
-	.pmu_enable   = cf_diag_enable,
-	.pmu_disable  = cf_diag_disable,
-	.event_init   = cf_diag_event_init,
-	.add	      = cf_diag_add,
-	.del	      = cf_diag_del,
-	.start	      = cf_diag_start,
-	.stop	      = cf_diag_stop,
-	.read	      = cf_diag_read,
-
-	.attr_groups  = cf_diag_attr_groups
-};
-
-/* Get the CPU speed, try sampling facility first and CPU attributes second. */
-static void cf_diag_get_cpu_speed(void)
-{
-	if (cpum_sf_avail()) {			/* Sampling facility first */
-		struct hws_qsi_info_block si;
-
-		memset(&si, 0, sizeof(si));
-		if (!qsi(&si)) {
-			cf_diag_cpu_speed = si.cpu_speed;
-			return;
-		}
-	}
-
-	if (test_facility(34)) {		/* CPU speed extract static part */
-		unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
-
-		if (mhz != -1UL)
-			cf_diag_cpu_speed = mhz & 0xffffffff;
-	}
-}
-
-/* Code to create device and file I/O operations */
-static atomic_t ctrset_opencnt = ATOMIC_INIT(0);	/* Excl. access */
-
-static int cf_diag_open(struct inode *inode, struct file *file)
-{
-	int err = 0;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	if (atomic_xchg(&ctrset_opencnt, 1))
-		return -EBUSY;
-
-	/* Avoid concurrent access with perf_event_open() system call */
-	mutex_lock(&cf_diag_reserve_mutex);
-	if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin())
-		err = -EBUSY;
-	mutex_unlock(&cf_diag_reserve_mutex);
-	if (err) {
-		atomic_set(&ctrset_opencnt, 0);
-		return err;
-	}
-	file->private_data = NULL;
-	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
-	/* nonseekable_open() never fails */
-	return nonseekable_open(inode, file);
-}
-
-/* Variables for ioctl() interface support */
-static DEFINE_MUTEX(cf_diag_ctrset_mutex);
-static struct cf_diag_ctrset {
-	unsigned long ctrset;		/* Bit mask of counter set to read */
-	cpumask_t mask;			/* CPU mask to read from */
-} cf_diag_ctrset;
-
-static void cf_diag_ctrset_clear(void)
-{
-	cpumask_clear(&cf_diag_ctrset.mask);
-	cf_diag_ctrset.ctrset = 0;
-}
-
-static void cf_diag_release_cpu(void *p)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
-	debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__,
-			    smp_processor_id());
-	lcctl(0);		/* Reset counter sets */
-	cpuhw->state = 0;	/* Save state in CPU hardware state */
-}
-
-/* Release function is also called when application gets terminated without
- * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
- * Since only one application is allowed to open the device, simple stop all
- * CPU counter sets.
- */
-static int cf_diag_release(struct inode *inode, struct file *file)
-{
-	on_each_cpu(cf_diag_release_cpu, NULL, 1);
-	cf_diag_ctrset_clear();
-	atomic_set(&ctrset_opencnt, 0);
-	__kernel_cpumcf_end();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
-	return 0;
-}
-
-struct cf_diag_call_on_cpu_parm {	/* Parm struct for smp_call_on_cpu */
-	unsigned int sets;		/* Counter set bit mask */
-	atomic_t cpus_ack;		/* # CPUs successfully executed func */
-};
-
-static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask)
-{
-	struct s390_ctrset_read __user *ctrset_read;
-	unsigned int cpu, cpus, rc;
-	void __user *uptr;
-
-	ctrset_read = (struct s390_ctrset_read __user *)arg;
-	uptr = ctrset_read->data;
-	for_each_cpu(cpu, mask) {
-		struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu);
-		struct s390_ctrset_cpudata __user *ctrset_cpudata;
-
-		ctrset_cpudata = uptr;
-		debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n",
-				    __func__, cpu, csd->used);
-		rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
-		rc |= put_user(csd->sets, &ctrset_cpudata->no_sets);
-		rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used);
-		if (rc)
-			return -EFAULT;
-		uptr += sizeof(struct s390_ctrset_cpudata) + csd->used;
-		cond_resched();
-	}
-	cpus = cpumask_weight(mask);
-	if (put_user(cpus, &ctrset_read->no_cpus))
-		return -EFAULT;
-	debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n",
-			    __func__, uptr - (void __user *)ctrset_read->data);
-	return 0;
-}
-
-static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
-				  int ctrset_size, size_t room)
-{
-	size_t need = 0;
-	int rc = -1;
-
-	need = sizeof(*p) + sizeof(u64) * ctrset_size;
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s room %zd need %zd set %#x set_size %d\n",
-			    __func__, room, need, ctrset, ctrset_size);
-	if (need <= room) {
-		p->set = cpumf_ctr_ctl[ctrset];
-		p->no_cnts = ctrset_size;
-		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
-		if (rc == 3)		/* Nothing stored */
-			need = 0;
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__,
-			    need, rc);
-	return need;
-}
-
-/* Read all counter sets. Since the perf_event_open() system call with
- * event cpum_cf_diag/.../ is blocked when this interface is active, reuse
- * the perf_event_open() data buffer to store the counter sets.
- */
-static void cf_diag_cpu_read(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int set, set_size;
-	size_t space;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-	/* No data saved yet */
-	csd->used = 0;
-	csd->sets = 0;
-	memset(csd->data, 0, sizeof(csd->data));
-
-	/* Scan the counter sets */
-	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
-		struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used;
-
-		if (!(p->sets & cpumf_ctr_ctl[set]))
-			continue;	/* Counter set not in list */
-		set_size = cpum_cf_ctrset_size(set, &cpuhw->info);
-		space = sizeof(csd->data) - csd->used;
-		space = cf_diag_cpuset_read(sp, set, set_size, space);
-		if (space) {
-			csd->used += space;
-			csd->sets += 1;
-		}
-		debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n",
-				    __func__, sp, space);
-	}
-	debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__,
-			    csd->sets, csd->used);
-}
-
-static int cf_diag_all_read(unsigned long arg)
-{
-	struct cf_diag_call_on_cpu_parm p;
-	cpumask_var_t mask;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-
-	p.sets = cf_diag_ctrset.ctrset;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1);
-	rc = cf_diag_all_copy(arg, mask);
-	free_cpumask_var(mask);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc);
-	return rc;
-}
-
-/* Stop all counter sets via ioctl interface */
-static void cf_diag_ioctl_off(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-
-	ctr_set_multiple_disable(&cpuhw->state, p->sets);
-	ctr_set_multiple_stop(&cpuhw->state, p->sets);
-	rc = lcctl(cpuhw->state);		/* Stop counter sets */
-	if (!cpuhw->state)
-		cpuhw->flags &= ~PMU_F_IN_USE;
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s rc %d flags %#x state %#llx\n", __func__,
-			     rc, cpuhw->flags, cpuhw->state);
-}
-
-/* Start counter sets on particular CPU */
-static void cf_diag_ioctl_on(void *parm)
-{
-	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-	struct cf_diag_call_on_cpu_parm *p = parm;
-	int rc;
-
-	debug_sprintf_event(cf_diag_dbg, 5,
-			    "%s new %#x flags %#x state %#llx\n",
-			    __func__, p->sets, cpuhw->flags,
-			    cpuhw->state);
-
-	if (!(cpuhw->flags & PMU_F_IN_USE))
-		cpuhw->state = 0;
-	cpuhw->flags |= PMU_F_IN_USE;
-	rc = lcctl(cpuhw->state);		/* Reset unused counter sets */
-	ctr_set_multiple_enable(&cpuhw->state, p->sets);
-	ctr_set_multiple_start(&cpuhw->state, p->sets);
-	rc |= lcctl(cpuhw->state);		/* Start counter sets */
-	if (!rc)
-		atomic_inc(&p->cpus_ack);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n",
-			    __func__, rc, cpuhw->state);
-}
-
-static int cf_diag_all_stop(void)
-{
-	struct cf_diag_call_on_cpu_parm p = {
-		.sets = cf_diag_ctrset.ctrset,
-	};
-	cpumask_var_t mask;
-
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
-	free_cpumask_var(mask);
-	return 0;
-}
-
-static int cf_diag_all_start(void)
-{
-	struct cf_diag_call_on_cpu_parm p = {
-		.sets = cf_diag_ctrset.ctrset,
-		.cpus_ack = ATOMIC_INIT(0),
-	};
-	cpumask_var_t mask;
-	int rc = 0;
-
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
-	on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1);
-	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
-		on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
-		rc = -EIO;
-	}
-	free_cpumask_var(mask);
-	return rc;
-}
-
-/* Return the maximum required space for all possible CPUs in case one
- * CPU will be onlined during the START, READ, STOP cycles.
- * To find out the size of the counter sets, any one CPU will do. They
- * all have the same counter sets.
- */
-static size_t cf_diag_needspace(unsigned int sets)
-{
-	struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
-	size_t bytes = 0;
-	int i;
-
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
-		if (!(sets & cpumf_ctr_ctl[i]))
-			continue;
-		bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
-			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
-			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
-	}
-	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
-		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
-		     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
-	debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__,
-			    bytes);
-	put_cpu_ptr(&cpu_cf_events);
-	return bytes;
-}
-
-static long cf_diag_ioctl_read(unsigned long arg)
-{
-	struct s390_ctrset_read read;
-	int ret = 0;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
-		return -EFAULT;
-	ret = cf_diag_all_read(arg);
-	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl_stop(void)
-{
-	int ret;
-
-	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
-	ret = cf_diag_all_stop();
-	cf_diag_ctrset_clear();
-	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl_start(unsigned long arg)
-{
-	struct s390_ctrset_start __user *ustart;
-	struct s390_ctrset_start start;
-	void __user *umask;
-	unsigned int len;
-	int ret = 0;
-	size_t need;
-
-	if (cf_diag_ctrset.ctrset)
-		return -EBUSY;
-	ustart = (struct s390_ctrset_start __user *)arg;
-	if (copy_from_user(&start, ustart, sizeof(start)))
-		return -EFAULT;
-	if (start.version != S390_HWCTR_START_VERSION)
-		return -EINVAL;
-	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
-				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
-		return -EINVAL;		/* Invalid counter set */
-	if (!start.counter_sets)
-		return -EINVAL;		/* No counter set at all? */
-	cpumask_clear(&cf_diag_ctrset.mask);
-	len = min_t(u64, start.cpumask_len, cpumask_size());
-	umask = (void __user *)start.cpumask;
-	if (copy_from_user(&cf_diag_ctrset.mask, umask, len))
-		return -EFAULT;
-	if (cpumask_empty(&cf_diag_ctrset.mask))
-		return -EINVAL;
-	need = cf_diag_needspace(start.counter_sets);
-	if (put_user(need, &ustart->data_bytes))
-		ret = -EFAULT;
-	if (ret)
-		goto out;
-	cf_diag_ctrset.ctrset = start.counter_sets;
-	ret = cf_diag_all_start();
-out:
-	if (ret)
-		cf_diag_ctrset_clear();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n",
-			    __func__, cf_diag_ctrset.ctrset, need, ret);
-	return ret;
-}
-
-static long cf_diag_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	int ret;
-
-	debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__,
-			    cmd, arg);
-	get_online_cpus();
-	mutex_lock(&cf_diag_ctrset_mutex);
-	switch (cmd) {
-	case S390_HWCTR_START:
-		ret = cf_diag_ioctl_start(arg);
-		break;
-	case S390_HWCTR_STOP:
-		ret = cf_diag_ioctl_stop();
-		break;
-	case S390_HWCTR_READ:
-		ret = cf_diag_ioctl_read(arg);
-		break;
-	default:
-		ret = -ENOTTY;
-		break;
-	}
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	put_online_cpus();
-	debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-static const struct file_operations cf_diag_fops = {
-	.owner = THIS_MODULE,
-	.open = cf_diag_open,
-	.release = cf_diag_release,
-	.unlocked_ioctl	= cf_diag_ioctl,
-	.compat_ioctl = cf_diag_ioctl,
-	.llseek = no_llseek
-};
-
-static struct miscdevice cf_diag_dev = {
-	.name	= S390_HWCTR_DEVICE,
-	.minor	= MISC_DYNAMIC_MINOR,
-	.fops	= &cf_diag_fops,
-};
-
-static int cf_diag_online_cpu(unsigned int cpu)
-{
-	struct cf_diag_call_on_cpu_parm p;
-
-	mutex_lock(&cf_diag_ctrset_mutex);
-	if (!cf_diag_ctrset.ctrset)
-		goto out;
-	p.sets = cf_diag_ctrset.ctrset;
-	cf_diag_ioctl_on(&p);
-out:
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	return 0;
-}
-
-static int cf_diag_offline_cpu(unsigned int cpu)
-{
-	struct cf_diag_call_on_cpu_parm p;
-
-	mutex_lock(&cf_diag_ctrset_mutex);
-	if (!cf_diag_ctrset.ctrset)
-		goto out;
-	p.sets = cf_diag_ctrset.ctrset;
-	cf_diag_ioctl_off(&p);
-out:
-	mutex_unlock(&cf_diag_ctrset_mutex);
-	return 0;
-}
-
-/* Initialize the counter set PMU to generate complete counter set data as
- * event raw data. This relies on the CPU Measurement Counter Facility device
- * already being loaded and initialized.
- */
-static int __init cf_diag_init(void)
-{
-	struct cpumf_ctr_info info;
-	size_t need;
-	int rc;
-
-	if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info))
-		return -ENODEV;
-	cf_diag_get_cpu_speed();
-
-	/* Make sure the counter set data fits into predefined buffer. */
-	need = cf_diag_ctrset_maxsize(&info);
-	if (need > sizeof(((struct cf_diag_csd *)0)->start)) {
-		pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
-		       need);
-		return -ENOMEM;
-	}
-
-	rc = misc_register(&cf_diag_dev);
-	if (rc) {
-		pr_err("Registration of /dev/" S390_HWCTR_DEVICE
-		       "failed rc=%d\n", rc);
-		goto out;
-	}
-
-	/* Setup s390dbf facility */
-	cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
-	if (!cf_diag_dbg) {
-		pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
-		rc = -ENOMEM;
-		goto out_dbf;
-	}
-	debug_register_view(cf_diag_dbg, &debug_sprintf_view);
-
-	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
-	if (rc) {
-		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
-		       rc);
-		goto out_perf;
-	}
-	rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE,
-				       "perf/s390/cfd:online",
-				       cf_diag_online_cpu, cf_diag_offline_cpu);
-	if (!rc)
-		goto out;
-
-	pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n",
-	       rc);
-	perf_pmu_unregister(&cf_diag);
-out_perf:
-	debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
-	debug_unregister(cf_diag_dbg);
-out_dbf:
-	misc_deregister(&cf_diag_dev);
-out:
-	return rc;
-}
-device_initcall(cf_diag_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 47e13582d9fc..f39b34b13871 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -171,7 +171,6 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
 	CPUHP_AP_PERF_X86_IDXD_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
-	CPUHP_AP_PERF_S390_CFD_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
 	CPUHP_AP_PERF_ARM_CCN_ONLINE,
-- 
cgit v1.2.3


From 347269c113f10fbe893f11dd3ae5f44aa15d3111 Mon Sep 17 00:00:00 2001
From: Krzysztof Wilczyński <kw@linux.com>
Date: Sat, 3 Jul 2021 15:13:02 +0000
Subject: PCI: Fix kernel-doc formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix kernel-doc formatting throughout drivers/pci and related include files.
No change to functionality intended.

Check for warnings:

  $ find include drivers/pci -type f -path "*pci*.[ch]" | xargs scripts/kernel-doc -none

[bhelgaas: squashed to one commit]
Link: https://lore.kernel.org/r/20210509030237.368540-1-kw@linux.com
Link: https://lore.kernel.org/r/20210703151306.1922450-1-kw@linux.com
Link: https://lore.kernel.org/r/20210703151306.1922450-2-kw@linux.com
Link: https://lore.kernel.org/r/20210703151306.1922450-3-kw@linux.com
Link: https://lore.kernel.org/r/20210703151306.1922450-4-kw@linux.com
Link: https://lore.kernel.org/r/20210703151306.1922450-5-kw@linux.com
Signed-off-by: Krzysztof Wilczyński <kw@linux.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/controller/cadence/pcie-cadence.h |  7 +++++--
 drivers/pci/controller/pci-xgene.c            |  2 +-
 drivers/pci/controller/pcie-iproc-msi.c       |  4 ++--
 drivers/pci/controller/pcie-iproc.c           | 24 +++++++++++-------------
 drivers/pci/controller/pcie-iproc.h           | 16 +++++++++++-----
 drivers/pci/hotplug/cpqphp_core.c             |  7 ++++---
 drivers/pci/hotplug/cpqphp_ctrl.c             |  2 +-
 drivers/pci/hotplug/pciehp.h                  |  3 +++
 drivers/pci/pci.h                             |  4 ++--
 include/linux/pci-ep-cfs.h                    |  2 +-
 include/linux/pci-epc.h                       |  5 ++++-
 include/linux/pci-epf.h                       |  5 ++++-
 include/linux/pci_hotplug.h                   |  2 ++
 include/uapi/linux/pcitest.h                  |  2 +-
 14 files changed, 52 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/controller/cadence/pcie-cadence.h b/drivers/pci/controller/cadence/pcie-cadence.h
index 254d2570f8c9..30db2d68c17a 100644
--- a/drivers/pci/controller/cadence/pcie-cadence.h
+++ b/drivers/pci/controller/cadence/pcie-cadence.h
@@ -263,9 +263,12 @@ struct cdns_pcie_ops {
  * struct cdns_pcie - private data for Cadence PCIe controller drivers
  * @reg_base: IO mapped register base
  * @mem_res: start/end offsets in the physical system memory to map PCI accesses
+ * @dev: PCIe controller
  * @is_rc: tell whether the PCIe controller mode is Root Complex or Endpoint.
- * @bus: In Root Complex mode, the bus number
- * @ops: Platform specific ops to control various inputs from Cadence PCIe
+ * @phy_count: number of supported PHY devices
+ * @phy: list of pointers to specific PHY control blocks
+ * @link: list of pointers to corresponding device link representations
+ * @ops: Platform-specific ops to control various inputs from Cadence PCIe
  *       wrapper
  */
 struct cdns_pcie {
diff --git a/drivers/pci/controller/pci-xgene.c b/drivers/pci/controller/pci-xgene.c
index 7f503dd4ff81..8dc5140dd80d 100644
--- a/drivers/pci/controller/pci-xgene.c
+++ b/drivers/pci/controller/pci-xgene.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0+
-/**
+/*
  * APM X-Gene PCIe Driver
  *
  * Copyright (c) 2014 Applied Micro Circuits Corporation.
diff --git a/drivers/pci/controller/pcie-iproc-msi.c b/drivers/pci/controller/pcie-iproc-msi.c
index eede4e8f3f75..21d28dc0b901 100644
--- a/drivers/pci/controller/pcie-iproc-msi.c
+++ b/drivers/pci/controller/pcie-iproc-msi.c
@@ -49,7 +49,7 @@ enum iproc_msi_reg {
 struct iproc_msi;
 
 /**
- * iProc MSI group
+ * struct iproc_msi_grp - iProc MSI group
  *
  * One MSI group is allocated per GIC interrupt, serviced by one iProc MSI
  * event queue.
@@ -65,7 +65,7 @@ struct iproc_msi_grp {
 };
 
 /**
- * iProc event queue based MSI
+ * struct iproc_msi - iProc event queue based MSI
  *
  * Only meant to be used on platforms without MSI support integrated into the
  * GIC.
diff --git a/drivers/pci/controller/pcie-iproc.c b/drivers/pci/controller/pcie-iproc.c
index 02e52f698eeb..30ac5fbefbbf 100644
--- a/drivers/pci/controller/pcie-iproc.c
+++ b/drivers/pci/controller/pcie-iproc.c
@@ -89,8 +89,8 @@
 #define IPROC_PCIE_REG_INVALID		0xffff
 
 /**
- * iProc PCIe outbound mapping controller specific parameters
- *
+ * struct iproc_pcie_ob_map - iProc PCIe outbound mapping controller-specific
+ * parameters
  * @window_sizes: list of supported outbound mapping window sizes in MB
  * @nr_sizes: number of supported outbound mapping window sizes
  */
@@ -136,22 +136,20 @@ static const struct iproc_pcie_ob_map paxb_v2_ob_map[] = {
 };
 
 /**
- * iProc PCIe inbound mapping type
+ * enum iproc_pcie_ib_map_type - iProc PCIe inbound mapping type
+ * @IPROC_PCIE_IB_MAP_MEM: DDR memory
+ * @IPROC_PCIE_IB_MAP_IO: device I/O memory
+ * @IPROC_PCIE_IB_MAP_INVALID: invalid or unused
  */
 enum iproc_pcie_ib_map_type {
-	/* for DDR memory */
 	IPROC_PCIE_IB_MAP_MEM = 0,
-
-	/* for device I/O memory */
 	IPROC_PCIE_IB_MAP_IO,
-
-	/* invalid or unused */
 	IPROC_PCIE_IB_MAP_INVALID
 };
 
 /**
- * iProc PCIe inbound mapping controller specific parameters
- *
+ * struct iproc_pcie_ib_map - iProc PCIe inbound mapping controller-specific
+ * parameters
  * @type: inbound mapping region type
  * @size_unit: inbound mapping region size unit, could be SZ_1K, SZ_1M, or
  * SZ_1G
@@ -437,7 +435,7 @@ static inline void iproc_pcie_write_reg(struct iproc_pcie *pcie,
 	writel(val, pcie->base + offset);
 }
 
-/**
+/*
  * APB error forwarding can be disabled during access of configuration
  * registers of the endpoint device, to prevent unsupported requests
  * (typically seen during enumeration with multi-function devices) from
@@ -619,7 +617,7 @@ static int iproc_pcie_config_read(struct pci_bus *bus, unsigned int devfn,
 	return PCIBIOS_SUCCESSFUL;
 }
 
-/**
+/*
  * Note access to the configuration registers are protected at the higher layer
  * by 'pci_lock' in drivers/pci/access.c
  */
@@ -897,7 +895,7 @@ static inline int iproc_pcie_ob_write(struct iproc_pcie *pcie, int window_idx,
 	return 0;
 }
 
-/**
+/*
  * Some iProc SoCs require the SW to configure the outbound address mapping
  *
  * Outbound address translation:
diff --git a/drivers/pci/controller/pcie-iproc.h b/drivers/pci/controller/pcie-iproc.h
index c2676e442f55..dcca315897c8 100644
--- a/drivers/pci/controller/pcie-iproc.h
+++ b/drivers/pci/controller/pcie-iproc.h
@@ -7,7 +7,13 @@
 #define _PCIE_IPROC_H
 
 /**
- * iProc PCIe interface type
+ * enum iproc_pcie_type - iProc PCIe interface type
+ * @IPROC_PCIE_PAXB_BCMA: BCMA-based host controllers
+ * @IPROC_PCIE_PAXB:	  PAXB-based host controllers for
+ *			  NS, NSP, Cygnus, NS2, and Pegasus SOCs
+ * @IPROC_PCIE_PAXB_V2:   PAXB-based host controllers for Stingray SoCs
+ * @IPROC_PCIE_PAXC:	  PAXC-based host controllers
+ * @IPROC_PCIE_PAXC_V2:   PAXC-based host controllers (second generation)
  *
  * PAXB is the wrapper used in root complex that can be connected to an
  * external endpoint device.
@@ -24,7 +30,7 @@ enum iproc_pcie_type {
 };
 
 /**
- * iProc PCIe outbound mapping
+ * struct iproc_pcie_ob - iProc PCIe outbound mapping
  * @axi_offset: offset from the AXI address to the internal address used by
  * the iProc PCIe core
  * @nr_windows: total number of supported outbound mapping windows
@@ -35,7 +41,7 @@ struct iproc_pcie_ob {
 };
 
 /**
- * iProc PCIe inbound mapping
+ * struct iproc_pcie_ib - iProc PCIe inbound mapping
  * @nr_regions: total number of supported inbound mapping regions
  */
 struct iproc_pcie_ib {
@@ -47,13 +53,13 @@ struct iproc_pcie_ib_map;
 struct iproc_msi;
 
 /**
- * iProc PCIe device
- *
+ * struct iproc_pcie - iProc PCIe device
  * @dev: pointer to device data structure
  * @type: iProc PCIe interface type
  * @reg_offsets: register offsets
  * @base: PCIe host controller I/O register base
  * @base_addr: PCIe host controller register base physical address
+ * @mem: host bridge memory window resource
  * @phy: optional PHY device that controls the Serdes
  * @map_irq: function callback to map interrupts
  * @ep_is_internal: indicates an internal emulated endpoint device is connected
diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c
index b8aacb41a83c..f99a7927e5a8 100644
--- a/drivers/pci/hotplug/cpqphp_core.c
+++ b/drivers/pci/hotplug/cpqphp_core.c
@@ -296,9 +296,10 @@ static int ctrl_slot_cleanup(struct controller *ctrl)
  *
  * Won't work for more than one PCI-PCI bridge in a slot.
  *
- * @bus_num - bus number of PCI device
- * @dev_num - device number of PCI device
- * @slot - Pointer to u8 where slot number will	be returned
+ * @bus: pointer to the PCI bus structure
+ * @bus_num: bus number of PCI device
+ * @dev_num: device number of PCI device
+ * @slot: Pointer to u8 where slot number will	be returned
  *
  * Output:	SUCCESS or FAILURE
  */
diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c
index 68de958a9be8..1b26ca0b3701 100644
--- a/drivers/pci/hotplug/cpqphp_ctrl.c
+++ b/drivers/pci/hotplug/cpqphp_ctrl.c
@@ -1877,7 +1877,7 @@ static void interrupt_event_handler(struct controller *ctrl)
 
 /**
  * cpqhp_pushbutton_thread - handle pushbutton events
- * @slot: target slot (struct)
+ * @t: pointer to struct timer_list which holds all timer-related callbacks
  *
  * Scheduled procedure to handle blocking stuff for the pushbuttons.
  * Handles all pending events and exits.
diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 4fd200d8b0a9..d4a930881054 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -47,6 +47,9 @@ extern int pciehp_poll_time;
  * struct controller - PCIe hotplug controller
  * @pcie: pointer to the controller's PCIe port service device
  * @slot_cap: cached copy of the Slot Capabilities register
+ * @inband_presence_disabled: In-Band Presence Detect Disable supported by
+ *	controller and disabled per spec recommendation (PCIe r5.0, appendix I
+ *	implementation note)
  * @slot_ctrl: cached copy of the Slot Control register
  * @ctrl_lock: serializes writes to the Slot Control register
  * @cmd_started: jiffies when the Slot Control register was last written;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 37c913bbc6e1..8b385eaf2043 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -324,8 +324,8 @@ struct pci_sriov {
 /**
  * pci_dev_set_io_state - Set the new error state if possible.
  *
- * @dev - pci device to set new error_state
- * @new - the state we want dev to be in
+ * @dev: PCI device to set new error_state
+ * @new: the state we want dev to be in
  *
  * Must be called with device_lock held.
  *
diff --git a/include/linux/pci-ep-cfs.h b/include/linux/pci-ep-cfs.h
index 662881335c7e..3e2140d7e31d 100644
--- a/include/linux/pci-ep-cfs.h
+++ b/include/linux/pci-ep-cfs.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0+ */
-/**
+/*
  * PCI Endpoint ConfigFS header file
  *
  * Copyright (C) 2017 Texas Instruments
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index b82c9b100e97..50a649d33e68 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/**
+/*
  * PCI Endpoint *Controller* (EPC) header file
  *
  * Copyright (C) 2017 Texas Instruments
@@ -58,6 +58,7 @@ pci_epc_interface_string(enum pci_epc_interface_type type)
  * @map_msi_irq: ops to map physical address to MSI address and return MSI data
  * @start: ops to start the PCI link
  * @stop: ops to stop the PCI link
+ * @get_features: ops to get the features supported by the EPC
  * @owner: the module owner containing the ops
  */
 struct pci_epc_ops {
@@ -150,6 +151,8 @@ struct pci_epc {
 /**
  * struct pci_epc_features - features supported by a EPC device per function
  * @linkup_notifier: indicate if the EPC device can notify EPF driver on link up
+ * @core_init_notifier: indicate cores that can notify about their availability
+ *			for initialization
  * @msi_capable: indicate if the endpoint function has MSI capability
  * @msix_capable: indicate if the endpoint function has MSI-X capability
  * @reserved_bar: bitmap to indicate reserved BAR unavailable to function driver
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 6833e2160ef1..2debc27ba95e 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/**
+/*
  * PCI Endpoint *Function* (EPF) header file
  *
  * Copyright (C) 2017 Texas Instruments
@@ -102,6 +102,8 @@ struct pci_epf_driver {
  * @phys_addr: physical address that should be mapped to the BAR
  * @addr: virtual address corresponding to the @phys_addr
  * @size: the size of the address space present in BAR
+ * @barno: BAR number
+ * @flags: flags that are set for the BAR
  */
 struct pci_epf_bar {
 	dma_addr_t	phys_addr;
@@ -118,6 +120,7 @@ struct pci_epf_bar {
  * @header: represents standard configuration header
  * @bar: represents the BAR of EPF device
  * @msi_interrupts: number of MSI interrupts required by this function
+ * @msix_interrupts: number of MSI-X interrupts required by this function
  * @func_no: unique function number within this endpoint device
  * @epc: the EPC device to which this EPF device is bound
  * @driver: the EPF driver to which this EPF device is bound
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index b482e42d7153..2dac431d94ac 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -50,6 +50,8 @@ struct hotplug_slot_ops {
 /**
  * struct hotplug_slot - used to register a physical slot with the hotplug pci core
  * @ops: pointer to the &struct hotplug_slot_ops to be used for this slot
+ * @slot_list: internal list used to track hotplug PCI slots
+ * @pci_slot: represents a physical slot
  * @owner: The module owner of this structure
  * @mod_name: The module name (KBUILD_MODNAME) of this structure
  */
diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h
index c3ab4c826297..f9c1af8d141b 100644
--- a/include/uapi/linux/pcitest.h
+++ b/include/uapi/linux/pcitest.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/**
+/*
  * pcitest.h - PCI test uapi defines
  *
  * Copyright (C) 2017 Texas Instruments
-- 
cgit v1.2.3


From ae21f835a5bda0ef1d00940373445693a764d89e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 1 Jul 2021 16:48:23 -0500
Subject: PCI/P2PDMA: Finish RCU conversion of pdev->p2pdma
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While looking at pci_alloc_p2pmem() I found RCU protection was not properly
applied there, as pdev->p2pdma was potentially read multiple times.

Fix pci_alloc_p2pmem(), add __rcu qualifier to p2pdma field of struct
pci_dev, and fix all other accesses to this field with proper RCU verbs.

Link: https://lore.kernel.org/r/20210701095621.3129283-1-eric.dumazet@gmail.com
Fixes: 1570175abd16 ("PCI/P2PDMA: track pgmap references per resource, not globally")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
---
 drivers/pci/p2pdma.c | 97 ++++++++++++++++++++++++++++++++++++++--------------
 include/linux/pci.h  |  2 +-
 2 files changed, 73 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index ca2574debb2d..69c25e71590a 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -48,10 +48,14 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_p2pdma *p2pdma;
 	size_t size = 0;
 
-	if (pdev->p2pdma->pool)
-		size = gen_pool_size(pdev->p2pdma->pool);
+	rcu_read_lock();
+	p2pdma = rcu_dereference(pdev->p2pdma);
+	if (p2pdma && p2pdma->pool)
+		size = gen_pool_size(p2pdma->pool);
+	rcu_read_unlock();
 
 	return scnprintf(buf, PAGE_SIZE, "%zd\n", size);
 }
@@ -61,10 +65,14 @@ static ssize_t available_show(struct device *dev, struct device_attribute *attr,
 			      char *buf)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_p2pdma *p2pdma;
 	size_t avail = 0;
 
-	if (pdev->p2pdma->pool)
-		avail = gen_pool_avail(pdev->p2pdma->pool);
+	rcu_read_lock();
+	p2pdma = rcu_dereference(pdev->p2pdma);
+	if (p2pdma && p2pdma->pool)
+		avail = gen_pool_avail(p2pdma->pool);
+	rcu_read_unlock();
 
 	return scnprintf(buf, PAGE_SIZE, "%zd\n", avail);
 }
@@ -74,9 +82,16 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
 			      char *buf)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_p2pdma *p2pdma;
+	bool published = false;
+
+	rcu_read_lock();
+	p2pdma = rcu_dereference(pdev->p2pdma);
+	if (p2pdma)
+		published = p2pdma->p2pmem_published;
+	rcu_read_unlock();
 
-	return scnprintf(buf, PAGE_SIZE, "%d\n",
-			 pdev->p2pdma->p2pmem_published);
+	return scnprintf(buf, PAGE_SIZE, "%d\n", published);
 }
 static DEVICE_ATTR_RO(published);
 
@@ -95,8 +110,9 @@ static const struct attribute_group p2pmem_group = {
 static void pci_p2pdma_release(void *data)
 {
 	struct pci_dev *pdev = data;
-	struct pci_p2pdma *p2pdma = pdev->p2pdma;
+	struct pci_p2pdma *p2pdma;
 
+	p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
 	if (!p2pdma)
 		return;
 
@@ -128,16 +144,14 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
 	if (error)
 		goto out_pool_destroy;
 
-	pdev->p2pdma = p2p;
-
 	error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
 	if (error)
 		goto out_pool_destroy;
 
+	rcu_assign_pointer(pdev->p2pdma, p2p);
 	return 0;
 
 out_pool_destroy:
-	pdev->p2pdma = NULL;
 	gen_pool_destroy(p2p->pool);
 out:
 	devm_kfree(&pdev->dev, p2p);
@@ -159,6 +173,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 {
 	struct pci_p2pdma_pagemap *p2p_pgmap;
 	struct dev_pagemap *pgmap;
+	struct pci_p2pdma *p2pdma;
 	void *addr;
 	int error;
 
@@ -200,7 +215,8 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 		goto pgmap_free;
 	}
 
-	error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
+	p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
+	error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
 			pci_bus_address(pdev, bar) + offset,
 			range_len(&pgmap->range), dev_to_node(&pdev->dev),
 			pgmap->ref);
@@ -437,6 +453,7 @@ calc_map_type_and_dist(struct pci_dev *provider, struct pci_dev *client,
 	enum pci_p2pdma_map_type map_type = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
 	struct pci_dev *a = provider, *b = client, *bb;
 	bool acs_redirects = false;
+	struct pci_p2pdma *p2pdma;
 	struct seq_buf acs_list;
 	int acs_cnt = 0;
 	int dist_a = 0;
@@ -515,9 +532,12 @@ map_through_host_bridge:
 		map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
 	}
 done:
-	if (provider->p2pdma)
-		xa_store(&provider->p2pdma->map_types, map_types_idx(client),
+	rcu_read_lock();
+	p2pdma = rcu_dereference(provider->p2pdma);
+	if (p2pdma)
+		xa_store(&p2pdma->map_types, map_types_idx(client),
 			 xa_mk_value(map_type), GFP_KERNEL);
+	rcu_read_unlock();
 	return map_type;
 }
 
@@ -586,7 +606,15 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many);
  */
 bool pci_has_p2pmem(struct pci_dev *pdev)
 {
-	return pdev->p2pdma && pdev->p2pdma->p2pmem_published;
+	struct pci_p2pdma *p2pdma;
+	bool res;
+
+	rcu_read_lock();
+	p2pdma = rcu_dereference(pdev->p2pdma);
+	res = p2pdma && p2pdma->p2pmem_published;
+	rcu_read_unlock();
+
+	return res;
 }
 EXPORT_SYMBOL_GPL(pci_has_p2pmem);
 
@@ -666,6 +694,7 @@ void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
 {
 	void *ret = NULL;
 	struct percpu_ref *ref;
+	struct pci_p2pdma *p2pdma;
 
 	/*
 	 * Pairs with synchronize_rcu() in pci_p2pdma_release() to
@@ -673,16 +702,16 @@ void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
 	 * read-lock.
 	 */
 	rcu_read_lock();
-	if (unlikely(!pdev->p2pdma))
+	p2pdma = rcu_dereference(pdev->p2pdma);
+	if (unlikely(!p2pdma))
 		goto out;
 
-	ret = (void *)gen_pool_alloc_owner(pdev->p2pdma->pool, size,
-			(void **) &ref);
+	ret = (void *)gen_pool_alloc_owner(p2pdma->pool, size, (void **) &ref);
 	if (!ret)
 		goto out;
 
 	if (unlikely(!percpu_ref_tryget_live(ref))) {
-		gen_pool_free(pdev->p2pdma->pool, (unsigned long) ret, size);
+		gen_pool_free(p2pdma->pool, (unsigned long) ret, size);
 		ret = NULL;
 		goto out;
 	}
@@ -701,8 +730,9 @@ EXPORT_SYMBOL_GPL(pci_alloc_p2pmem);
 void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size)
 {
 	struct percpu_ref *ref;
+	struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
 
-	gen_pool_free_owner(pdev->p2pdma->pool, (uintptr_t)addr, size,
+	gen_pool_free_owner(p2pdma->pool, (uintptr_t)addr, size,
 			(void **) &ref);
 	percpu_ref_put(ref);
 }
@@ -716,9 +746,13 @@ EXPORT_SYMBOL_GPL(pci_free_p2pmem);
  */
 pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr)
 {
+	struct pci_p2pdma *p2pdma;
+
 	if (!addr)
 		return 0;
-	if (!pdev->p2pdma)
+
+	p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
+	if (!p2pdma)
 		return 0;
 
 	/*
@@ -726,7 +760,7 @@ pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr)
 	 * bus address as the physical address. So gen_pool_virt_to_phys()
 	 * actually returns the bus address despite the misleading name.
 	 */
-	return gen_pool_virt_to_phys(pdev->p2pdma->pool, (unsigned long)addr);
+	return gen_pool_virt_to_phys(p2pdma->pool, (unsigned long)addr);
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_virt_to_bus);
 
@@ -797,16 +831,23 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_free_sgl);
  */
 void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 {
-	if (pdev->p2pdma)
-		pdev->p2pdma->p2pmem_published = publish;
+	struct pci_p2pdma *p2pdma;
+
+	rcu_read_lock();
+	p2pdma = rcu_dereference(pdev->p2pdma);
+	if (p2pdma)
+		p2pdma->p2pmem_published = publish;
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 
 static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
 						    struct device *dev)
 {
+	enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
 	struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider;
 	struct pci_dev *client;
+	struct pci_p2pdma *p2pdma;
 
 	if (!provider->p2pdma)
 		return PCI_P2PDMA_MAP_NOT_SUPPORTED;
@@ -816,8 +857,14 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
 
 	client = to_pci_dev(dev);
 
-	return xa_to_value(xa_load(&provider->p2pdma->map_types,
-				   map_types_idx(client)));
+	rcu_read_lock();
+	p2pdma = rcu_dereference(provider->p2pdma);
+
+	if (p2pdma)
+		type = xa_to_value(xa_load(&p2pdma->map_types,
+					   map_types_idx(client)));
+	rcu_read_unlock();
+	return type;
 }
 
 static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c20211e59a57..58a39c7239f3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -497,7 +497,7 @@ struct pci_dev {
 	u16		pasid_features;
 #endif
 #ifdef CONFIG_PCI_P2PDMA
-	struct pci_p2pdma *p2pdma;
+	struct pci_p2pdma __rcu *p2pdma;
 #endif
 	u16		acs_cap;	/* ACS Capability offset */
 	phys_addr_t	rom;		/* Physical address if not from BAR */
-- 
cgit v1.2.3


From 99cdf57b33e68df7afc876739c93a11f0b1ba807 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 3 Jun 2021 16:50:40 -0400
Subject: lockd: Remove stale comments

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/lockd/xdr.h  | 6 ------
 include/linux/lockd/xdr4.h | 7 +------
 2 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index 7ab9f264313f..a98309c0121c 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -109,11 +109,5 @@ int	nlmsvc_decode_shareargs(struct svc_rqst *, __be32 *);
 int	nlmsvc_encode_shareres(struct svc_rqst *, __be32 *);
 int	nlmsvc_decode_notify(struct svc_rqst *, __be32 *);
 int	nlmsvc_decode_reboot(struct svc_rqst *, __be32 *);
-/*
-int	nlmclt_encode_testargs(struct rpc_rqst *, u32 *, struct nlm_args *);
-int	nlmclt_encode_lockargs(struct rpc_rqst *, u32 *, struct nlm_args *);
-int	nlmclt_encode_cancargs(struct rpc_rqst *, u32 *, struct nlm_args *);
-int	nlmclt_encode_unlockargs(struct rpc_rqst *, u32 *, struct nlm_args *);
- */
 
 #endif /* LOCKD_XDR_H */
diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h
index e709fe5924f2..5ae766f26e04 100644
--- a/include/linux/lockd/xdr4.h
+++ b/include/linux/lockd/xdr4.h
@@ -37,12 +37,7 @@ int	nlm4svc_decode_shareargs(struct svc_rqst *, __be32 *);
 int	nlm4svc_encode_shareres(struct svc_rqst *, __be32 *);
 int	nlm4svc_decode_notify(struct svc_rqst *, __be32 *);
 int	nlm4svc_decode_reboot(struct svc_rqst *, __be32 *);
-/*
-int	nlmclt_encode_testargs(struct rpc_rqst *, u32 *, struct nlm_args *);
-int	nlmclt_encode_lockargs(struct rpc_rqst *, u32 *, struct nlm_args *);
-int	nlmclt_encode_cancargs(struct rpc_rqst *, u32 *, struct nlm_args *);
-int	nlmclt_encode_unlockargs(struct rpc_rqst *, u32 *, struct nlm_args *);
- */
+
 extern const struct rpc_version nlm_version4;
 
 #endif /* LOCKD_XDR4_H */
-- 
cgit v1.2.3


From 530a5678bc0083e84f99f38f77ced8fbb3d18434 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 2 Jun 2021 10:15:33 +0800
Subject: vdpa: support packed virtqueue for set/get_vq_state()

This patch extends the vdpa_vq_state to support packed virtqueue
state which is basically the device/driver ring wrap counters and the
avail and used index. This will be used for the virito-vdpa support
for the packed virtqueue and the future vhost/vhost-vdpa support for
the packed virtqueue.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210602021536.39525-2-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
---
 drivers/vdpa/ifcvf/ifcvf_main.c   |  4 ++--
 drivers/vdpa/mlx5/net/mlx5_vnet.c |  8 ++++----
 drivers/vdpa/vdpa_sim/vdpa_sim.c  |  4 ++--
 drivers/vhost/vdpa.c              |  4 ++--
 include/linux/vdpa.h              | 25 +++++++++++++++++++++++--
 5 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 916a97df7477..21b78f1cd521 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -264,7 +264,7 @@ static int ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
 {
 	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
 
-	state->avail_index = ifcvf_get_vq_state(vf, qid);
+	state->split.avail_index = ifcvf_get_vq_state(vf, qid);
 	return 0;
 }
 
@@ -273,7 +273,7 @@ static int ifcvf_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
 {
 	struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
 
-	return ifcvf_set_vq_state(vf, qid, state->avail_index);
+	return ifcvf_set_vq_state(vf, qid, state->split.avail_index);
 }
 
 static void ifcvf_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid,
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index f4b362332f3f..2a31467f7ac5 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1423,8 +1423,8 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
 		return -EINVAL;
 	}
 
-	mvq->used_idx = state->avail_index;
-	mvq->avail_idx = state->avail_index;
+	mvq->used_idx = state->split.avail_index;
+	mvq->avail_idx = state->split.avail_index;
 	return 0;
 }
 
@@ -1445,7 +1445,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa
 		 * Since both values should be identical, we take the value of
 		 * used_idx which is reported correctly.
 		 */
-		state->avail_index = mvq->used_idx;
+		state->split.avail_index = mvq->used_idx;
 		return 0;
 	}
 
@@ -1454,7 +1454,7 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa
 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
 		return err;
 	}
-	state->avail_index = attr.used_index;
+	state->split.avail_index = attr.used_index;
 	return 0;
 }
 
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 98f793bc9376..14e024de5cbf 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -374,7 +374,7 @@ static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx,
 	struct vringh *vrh = &vq->vring;
 
 	spin_lock(&vdpasim->lock);
-	vrh->last_avail_idx = state->avail_index;
+	vrh->last_avail_idx = state->split.avail_index;
 	spin_unlock(&vdpasim->lock);
 
 	return 0;
@@ -387,7 +387,7 @@ static int vdpasim_get_vq_state(struct vdpa_device *vdpa, u16 idx,
 	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
 	struct vringh *vrh = &vq->vring;
 
-	state->avail_index = vrh->last_avail_idx;
+	state->split.avail_index = vrh->last_avail_idx;
 	return 0;
 }
 
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index fb41db3da611..210ab35a7ebf 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -383,7 +383,7 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 		if (r)
 			return r;
 
-		vq->last_avail_idx = vq_state.avail_index;
+		vq->last_avail_idx = vq_state.split.avail_index;
 		break;
 	}
 
@@ -401,7 +401,7 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 		break;
 
 	case VHOST_SET_VRING_BASE:
-		vq_state.avail_index = vq->last_avail_idx;
+		vq_state.split.avail_index = vq->last_avail_idx;
 		if (ops->set_vq_state(vdpa, idx, &vq_state))
 			r = -EINVAL;
 		break;
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index f311d227aa1b..3357ac98878d 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -28,13 +28,34 @@ struct vdpa_notification_area {
 };
 
 /**
- * struct vdpa_vq_state - vDPA vq_state definition
+ * struct vdpa_vq_state_split - vDPA split virtqueue state
  * @avail_index: available index
  */
-struct vdpa_vq_state {
+struct vdpa_vq_state_split {
 	u16	avail_index;
 };
 
+/**
+ * struct vdpa_vq_state_packed - vDPA packed virtqueue state
+ * @last_avail_counter: last driver ring wrap counter observed by device
+ * @last_avail_idx: device available index
+ * @last_used_counter: device ring wrap counter
+ * @last_used_idx: used index
+ */
+struct vdpa_vq_state_packed {
+        u16	last_avail_counter:1;
+        u16	last_avail_idx:15;
+        u16	last_used_counter:1;
+        u16	last_used_idx:15;
+};
+
+struct vdpa_vq_state {
+     union {
+          struct vdpa_vq_state_split split;
+          struct vdpa_vq_state_packed packed;
+     };
+};
+
 struct vdpa_mgmt_dev;
 
 /**
-- 
cgit v1.2.3


From 0140b3d07617e71a8d9509776434ced107572fc8 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 2 Jun 2021 10:15:34 +0800
Subject: virtio-pci library: introduce vp_modern_get_driver_features()

This patch introduce a helper to get driver/guest features from the
device.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210602021536.39525-3-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
---
 drivers/virtio/virtio_pci_modern_dev.c | 21 +++++++++++++++++++++
 include/linux/virtio_pci_modern.h      |  1 +
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 54f297028586..e11ed748e661 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -383,6 +383,27 @@ u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
 }
 EXPORT_SYMBOL_GPL(vp_modern_get_features);
 
+/*
+ * vp_modern_get_driver_features - get driver features from device
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the driver features read from the device
+ */
+u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	u64 features;
+
+	vp_iowrite32(0, &cfg->guest_feature_select);
+	features = vp_ioread32(&cfg->guest_feature);
+	vp_iowrite32(1, &cfg->guest_feature_select);
+	features |= ((u64)vp_ioread32(&cfg->guest_feature) << 32);
+
+	return features;
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_driver_features);
+
 /*
  * vp_modern_set_features - set features to device
  * @mdev: the modern virtio-pci device
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 6a95b58fd0f4..eb2bd9b4077d 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -79,6 +79,7 @@ static inline void vp_iowrite64_twopart(u64 val,
 }
 
 u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev);
+u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev);
 void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
 		     u64 features);
 u32 vp_modern_generation(struct virtio_pci_modern_device *mdev);
-- 
cgit v1.2.3


From 0705e8d1e2207ceeb83dc6e1751b6b82718b353a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 2 Jul 2021 18:05:03 -0400
Subject: ext4: inline jbd2_journal_[un]register_shrinker()

The function jbd2_journal_unregister_shrinker() was getting called
twice when the file system was getting unmounted.  On Power and ARM
platforms this was causing kernel crash when unmounting the file
system, when a percpu_counter was destroyed twice.

Fix this by removing jbd2_journal_[un]register_shrinker() functions,
and inlining the shrinker setup and teardown into
journal_init_common() and jbd2_journal_destroy().  This means that
ext4 and ocfs2 now no longer need to know about registering and
unregistering jbd2's shrinker.

Also, while we're at it, rename the percpu counter from
j_jh_shrink_count to j_checkpoint_jh_count, since this makes it
clearer what this counter is intended to track.

Link: https://lore.kernel.org/r/20210705145025.3363130-1-tytso@mit.edu
Fixes: 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to release checkpointed buffers")
Reported-by: Jon Hunter <jonathanh@nvidia.com>
Reported-by: Sachin Sant <sachinp@linux.vnet.ibm.com>
Tested-by: Sachin Sant <sachinp@linux.vnet.ibm.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c      |   8 ---
 fs/jbd2/checkpoint.c |   4 +-
 fs/jbd2/journal.c    | 149 +++++++++++++++++++++------------------------------
 include/linux/jbd2.h |   6 +--
 4 files changed, 64 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b8ff0399e171..dfa09a277b56 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1184,7 +1184,6 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_unregister_sysfs(sb);
 
 	if (sbi->s_journal) {
-		jbd2_journal_unregister_shrinker(sbi->s_journal);
 		aborted = is_journal_aborted(sbi->s_journal);
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
@@ -5176,7 +5175,6 @@ failed_mount_wq:
 	sbi->s_ea_block_cache = NULL;
 
 	if (sbi->s_journal) {
-		jbd2_journal_unregister_shrinker(sbi->s_journal);
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
@@ -5502,12 +5500,6 @@ static int ext4_load_journal(struct super_block *sb,
 		ext4_commit_super(sb);
 	}
 
-	err = jbd2_journal_register_shrinker(journal);
-	if (err) {
-		EXT4_SB(sb)->s_journal = NULL;
-		goto err_out;
-	}
-
 	return 0;
 
 err_out:
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 51d1eb2ffeb9..746132998c57 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -701,7 +701,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 
 	__buffer_unlink(jh);
 	jh->b_cp_transaction = NULL;
-	percpu_counter_dec(&journal->j_jh_shrink_count);
+	percpu_counter_dec(&journal->j_checkpoint_jh_count);
 	jbd2_journal_put_journal_head(jh);
 
 	/* Is this transaction empty? */
@@ -764,7 +764,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
 		jh->b_cpnext->b_cpprev = jh;
 	}
 	transaction->t_checkpoint_list = jh;
-	percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count);
+	percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count);
 }
 
 /*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 152880c298ca..35302bc192eb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1283,6 +1283,48 @@ static int jbd2_min_tag_size(void)
 	return sizeof(journal_block_tag_t) - 4;
 }
 
+/**
+ * jbd2_journal_shrink_scan()
+ *
+ * Scan the checkpointed buffer on the checkpoint list and release the
+ * journal_head.
+ */
+static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
+					      struct shrink_control *sc)
+{
+	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+	unsigned long nr_to_scan = sc->nr_to_scan;
+	unsigned long nr_shrunk;
+	unsigned long count;
+
+	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+	trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
+
+	nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
+
+	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+	trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
+
+	return nr_shrunk;
+}
+
+/**
+ * jbd2_journal_shrink_count()
+ *
+ * Count the number of checkpoint buffers on the checkpoint list.
+ */
+static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
+					       struct shrink_control *sc)
+{
+	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+	unsigned long count;
+
+	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
+	trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
+
+	return count;
+}
+
 /*
  * Management for journal control blocks: functions to create and
  * destroy journal_t structures, and to initialise and read existing
@@ -1361,9 +1403,23 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
 
+	journal->j_shrink_transaction = NULL;
+	journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
+	journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
+	journal->j_shrinker.seeks = DEFAULT_SEEKS;
+	journal->j_shrinker.batch = journal->j_max_transaction_buffers;
+
+	if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
+		goto err_cleanup;
+
+	if (register_shrinker(&journal->j_shrinker)) {
+		percpu_counter_destroy(&journal->j_checkpoint_jh_count);
+		goto err_cleanup;
+	}
 	return journal;
 
 err_cleanup:
+	brelse(journal->j_sb_buffer);
 	kfree(journal->j_wbuf);
 	jbd2_journal_destroy_revoke(journal);
 	kfree(journal);
@@ -2050,93 +2106,6 @@ recovery_error:
 	return -EIO;
 }
 
-/**
- * jbd2_journal_shrink_scan()
- *
- * Scan the checkpointed buffer on the checkpoint list and release the
- * journal_head.
- */
-static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
-					      struct shrink_control *sc)
-{
-	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
-	unsigned long nr_to_scan = sc->nr_to_scan;
-	unsigned long nr_shrunk;
-	unsigned long count;
-
-	count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
-	trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
-
-	nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
-
-	count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
-	trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
-
-	return nr_shrunk;
-}
-
-/**
- * jbd2_journal_shrink_count()
- *
- * Count the number of checkpoint buffers on the checkpoint list.
- */
-static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
-					       struct shrink_control *sc)
-{
-	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
-	unsigned long count;
-
-	count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
-	trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
-
-	return count;
-}
-
-/**
- * jbd2_journal_register_shrinker()
- * @journal: Journal to act on.
- *
- * Init a percpu counter to record the checkpointed buffers on the checkpoint
- * list and register a shrinker to release their journal_head.
- */
-int jbd2_journal_register_shrinker(journal_t *journal)
-{
-	int err;
-
-	journal->j_shrink_transaction = NULL;
-
-	err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL);
-	if (err)
-		return err;
-
-	journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
-	journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
-	journal->j_shrinker.seeks = DEFAULT_SEEKS;
-	journal->j_shrinker.batch = journal->j_max_transaction_buffers;
-
-	err = register_shrinker(&journal->j_shrinker);
-	if (err) {
-		percpu_counter_destroy(&journal->j_jh_shrink_count);
-		return err;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(jbd2_journal_register_shrinker);
-
-/**
- * jbd2_journal_unregister_shrinker()
- * @journal: Journal to act on.
- *
- * Unregister the checkpointed buffer shrinker and destroy the percpu counter.
- */
-void jbd2_journal_unregister_shrinker(journal_t *journal)
-{
-	percpu_counter_destroy(&journal->j_jh_shrink_count);
-	unregister_shrinker(&journal->j_shrinker);
-}
-EXPORT_SYMBOL(jbd2_journal_unregister_shrinker);
-
 /**
  * jbd2_journal_destroy() - Release a journal_t structure.
  * @journal: Journal to act on.
@@ -2209,8 +2178,10 @@ int jbd2_journal_destroy(journal_t *journal)
 		brelse(journal->j_sb_buffer);
 	}
 
-	jbd2_journal_unregister_shrinker(journal);
-
+	if (journal->j_shrinker.flags & SHRINKER_REGISTERED) {
+		percpu_counter_destroy(&journal->j_checkpoint_jh_count);
+		unregister_shrinker(&journal->j_shrinker);
+	}
 	if (journal->j_proc_entry)
 		jbd2_stats_proc_exit(journal);
 	iput(journal->j_inode);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 6cc035321562..fd933c45281a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -918,11 +918,11 @@ struct journal_s
 	struct shrinker		j_shrinker;
 
 	/**
-	 * @j_jh_shrink_count:
+	 * @j_checkpoint_jh_count:
 	 *
 	 * Number of journal buffers on the checkpoint list. [j_list_lock]
 	 */
-	struct percpu_counter	j_jh_shrink_count;
+	struct percpu_counter	j_checkpoint_jh_count;
 
 	/**
 	 * @j_shrink_transaction:
@@ -1556,8 +1556,6 @@ extern int	   jbd2_journal_set_features
 		   (journal_t *, unsigned long, unsigned long, unsigned long);
 extern void	   jbd2_journal_clear_features
 		   (journal_t *, unsigned long, unsigned long, unsigned long);
-extern int	   jbd2_journal_register_shrinker(journal_t *journal);
-extern void	   jbd2_journal_unregister_shrinker(journal_t *journal);
 extern int	   jbd2_journal_load       (journal_t *journal);
 extern int	   jbd2_journal_destroy    (journal_t *);
 extern int	   jbd2_journal_recover    (journal_t *journal);
-- 
cgit v1.2.3


From c5a382ebdbdaac27ec109993e29f9045d70297f2 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 8 Jun 2021 15:59:12 -0400
Subject: sunrpc: Create per-rpc_clnt sysfs kobjects

These will eventually have files placed under them for sysfs operations.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           |  5 ++++
 net/sunrpc/sysfs.c          | 61 +++++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/sysfs.h          |  8 ++++++
 4 files changed, 76 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 02e7a5863d28..8b5d5c97553e 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -29,6 +29,7 @@
 #include <linux/sunrpc/xprtmultipath.h>
 
 struct rpc_inode;
+struct rpc_sysfs_client;
 
 /*
  * The high-level client handle
@@ -71,6 +72,7 @@ struct rpc_clnt {
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	struct dentry		*cl_debugfs;	/* debugfs directory */
 #endif
+	struct rpc_sysfs_client *cl_sysfs;	/* sysfs directory */
 	/* cl_work is only needed after cl_xpi is no longer used,
 	 * and that are of similar size
 	 */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 42623d6b8f0e..6f3f840a2ea3 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -41,6 +41,7 @@
 #include <trace/events/sunrpc.h>
 
 #include "sunrpc.h"
+#include "sysfs.h"
 #include "netns.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -300,6 +301,7 @@ static int rpc_client_register(struct rpc_clnt *clnt,
 	int err;
 
 	rpc_clnt_debugfs_register(clnt);
+	rpc_sysfs_client_setup(clnt, net);
 
 	pipefs_sb = rpc_get_sb_net(net);
 	if (pipefs_sb) {
@@ -327,6 +329,7 @@ err_auth:
 out:
 	if (pipefs_sb)
 		rpc_put_sb_net(net);
+	rpc_sysfs_client_destroy(clnt);
 	rpc_clnt_debugfs_unregister(clnt);
 	return err;
 }
@@ -733,6 +736,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt,
 
 	rpc_unregister_client(clnt);
 	__rpc_clnt_remove_pipedir(clnt);
+	rpc_sysfs_client_destroy(clnt);
 	rpc_clnt_debugfs_unregister(clnt);
 
 	/*
@@ -879,6 +883,7 @@ static void rpc_free_client_work(struct work_struct *work)
 	 * so they cannot be called in rpciod, so they are handled separately
 	 * here.
 	 */
+	rpc_sysfs_client_destroy(clnt);
 	rpc_clnt_debugfs_unregister(clnt);
 	rpc_free_clid(clnt);
 	rpc_clnt_remove_pipedir(clnt);
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index fa03e2ef836a..f3b7547ee218 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -4,6 +4,7 @@
  */
 #include <linux/sunrpc/clnt.h>
 #include <linux/kobject.h>
+#include "sysfs.h"
 
 static struct kset *rpc_sunrpc_kset;
 static struct kobject *rpc_sunrpc_client_kobj;
@@ -56,8 +57,68 @@ int rpc_sysfs_init(void)
 	return 0;
 }
 
+static void rpc_sysfs_client_release(struct kobject *kobj)
+{
+	struct rpc_sysfs_client *c;
+
+	c = container_of(kobj, struct rpc_sysfs_client, kobject);
+	kfree(c);
+}
+
+static const void *rpc_sysfs_client_namespace(struct kobject *kobj)
+{
+	return container_of(kobj, struct rpc_sysfs_client, kobject)->net;
+}
+
+static struct kobj_type rpc_sysfs_client_type = {
+	.release = rpc_sysfs_client_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.namespace = rpc_sysfs_client_namespace,
+};
+
 void rpc_sysfs_exit(void)
 {
 	kobject_put(rpc_sunrpc_client_kobj);
 	kset_unregister(rpc_sunrpc_kset);
 }
+
+static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent,
+						       struct net *net,
+						       int clid)
+{
+	struct rpc_sysfs_client *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (p) {
+		p->net = net;
+		p->kobject.kset = rpc_sunrpc_kset;
+		if (kobject_init_and_add(&p->kobject, &rpc_sysfs_client_type,
+					 parent, "clnt-%d", clid) == 0)
+			return p;
+		kobject_put(&p->kobject);
+	}
+	return NULL;
+}
+
+void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net)
+{
+	struct rpc_sysfs_client *rpc_client;
+
+	rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, net, clnt->cl_clid);
+	if (rpc_client) {
+		clnt->cl_sysfs = rpc_client;
+		kobject_uevent(&rpc_client->kobject, KOBJ_ADD);
+	}
+}
+
+void rpc_sysfs_client_destroy(struct rpc_clnt *clnt)
+{
+	struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs;
+
+	if (rpc_client) {
+		kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE);
+		kobject_del(&rpc_client->kobject);
+		kobject_put(&rpc_client->kobject);
+		clnt->cl_sysfs = NULL;
+	}
+}
diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h
index f181c650aab8..c46afc848993 100644
--- a/net/sunrpc/sysfs.h
+++ b/net/sunrpc/sysfs.h
@@ -5,7 +5,15 @@
 #ifndef __SUNRPC_SYSFS_H
 #define __SUNRPC_SYSFS_H
 
+struct rpc_sysfs_client {
+	struct kobject kobject;
+	struct net *net;
+};
+
 int rpc_sysfs_init(void);
 void rpc_sysfs_exit(void);
 
+void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net);
+void rpc_sysfs_client_destroy(struct rpc_clnt *clnt);
+
 #endif
-- 
cgit v1.2.3


From 572caba402e10b35a080d1b43c0193da364f3a17 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 8 Jun 2021 15:59:13 -0400
Subject: sunrpc: add xprt id

This adds a unique identifier for a sunrpc transport in sysfs, which is
similarly managed to the unique IDs of clients.

Signed-off-by: Dan Aloni <dan@kernelim.com>
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  2 ++
 net/sunrpc/sunrpc_syms.c    |  1 +
 net/sunrpc/xprt.c           | 26 ++++++++++++++++++++++++++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 61b622e334ee..1fbc470ce205 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -185,6 +185,7 @@ enum xprt_transports {
 struct rpc_xprt {
 	struct kref		kref;		/* Reference count */
 	const struct rpc_xprt_ops *ops;		/* transport methods */
+	unsigned int		id;		/* transport id */
 
 	const struct rpc_timeout *timeout;	/* timeout parms */
 	struct sockaddr_storage	addr;		/* server address */
@@ -370,6 +371,7 @@ struct rpc_xprt *	xprt_alloc(struct net *net, size_t size,
 void			xprt_free(struct rpc_xprt *);
 void			xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task);
 bool			xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req);
+void			xprt_cleanup_ids(void);
 
 static inline int
 xprt_enable_swap(struct rpc_xprt *xprt)
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 3b57efc692ec..b61b74c00483 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -133,6 +133,7 @@ cleanup_sunrpc(void)
 {
 	rpc_sysfs_exit();
 	rpc_cleanup_clids();
+	xprt_cleanup_ids();
 	rpcauth_remove_module();
 	cleanup_socket_xprt();
 	svc_cleanup_xprt_sock();
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3509a7f139b9..20b9bd705014 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1746,6 +1746,30 @@ static void xprt_free_all_slots(struct rpc_xprt *xprt)
 	}
 }
 
+static DEFINE_IDA(rpc_xprt_ids);
+
+void xprt_cleanup_ids(void)
+{
+	ida_destroy(&rpc_xprt_ids);
+}
+
+static int xprt_alloc_id(struct rpc_xprt *xprt)
+{
+	int id;
+
+	id = ida_simple_get(&rpc_xprt_ids, 0, 0, GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	xprt->id = id;
+	return 0;
+}
+
+static void xprt_free_id(struct rpc_xprt *xprt)
+{
+	ida_simple_remove(&rpc_xprt_ids, xprt->id);
+}
+
 struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
 		unsigned int num_prealloc,
 		unsigned int max_alloc)
@@ -1758,6 +1782,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
 	if (xprt == NULL)
 		goto out;
 
+	xprt_alloc_id(xprt);
 	xprt_init(xprt, net);
 
 	for (i = 0; i < num_prealloc; i++) {
@@ -1786,6 +1811,7 @@ void xprt_free(struct rpc_xprt *xprt)
 {
 	put_net(xprt->xprt_net);
 	xprt_free_all_slots(xprt);
+	xprt_free_id(xprt);
 	kfree_rcu(xprt, rcu);
 }
 EXPORT_SYMBOL_GPL(xprt_free);
-- 
cgit v1.2.3


From 5b9268727f299f87432e8b035e9e8bec8ba13e8d Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 8 Jun 2021 15:59:14 -0400
Subject: sunrpc: add IDs to multipath

This is used to uniquely identify sunrpc multipath objects in /sys.

Signed-off-by: Dan Aloni <dan@kernelim.com>
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtmultipath.h |  4 ++++
 net/sunrpc/sunrpc_syms.c             |  1 +
 net/sunrpc/xprtmultipath.c           | 26 ++++++++++++++++++++++++++
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index c6cce3fbf29d..ef95a6f18ccf 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -14,6 +14,7 @@ struct rpc_xprt_switch {
 	spinlock_t		xps_lock;
 	struct kref		xps_kref;
 
+	unsigned int		xps_id;
 	unsigned int		xps_nxprts;
 	unsigned int		xps_nactive;
 	atomic_long_t		xps_queuelen;
@@ -71,4 +72,7 @@ extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi);
 
 extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
 		const struct sockaddr *sap);
+
+extern void xprt_multipath_cleanup_ids(void);
+
 #endif
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index b61b74c00483..691c0000e9ea 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -134,6 +134,7 @@ cleanup_sunrpc(void)
 	rpc_sysfs_exit();
 	rpc_cleanup_clids();
 	xprt_cleanup_ids();
+	xprt_multipath_cleanup_ids();
 	rpcauth_remove_module();
 	cleanup_socket_xprt();
 	svc_cleanup_xprt_sock();
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 78c075a68c04..4969a4c216f7 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -86,6 +86,30 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
 	xprt_put(xprt);
 }
 
+static DEFINE_IDA(rpc_xprtswitch_ids);
+
+void xprt_multipath_cleanup_ids(void)
+{
+	ida_destroy(&rpc_xprtswitch_ids);
+}
+
+static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags)
+{
+	int id;
+
+	id = ida_simple_get(&rpc_xprtswitch_ids, 0, 0, gfp_flags);
+	if (id < 0)
+		return id;
+
+	xps->xps_id = id;
+	return 0;
+}
+
+static void xprt_switch_free_id(struct rpc_xprt_switch *xps)
+{
+	ida_simple_remove(&rpc_xprtswitch_ids, xps->xps_id);
+}
+
 /**
  * xprt_switch_alloc - Allocate a new struct rpc_xprt_switch
  * @xprt: pointer to struct rpc_xprt
@@ -103,6 +127,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt,
 	if (xps != NULL) {
 		spin_lock_init(&xps->xps_lock);
 		kref_init(&xps->xps_kref);
+		xprt_switch_alloc_id(xps, gfp_flags);
 		xps->xps_nxprts = xps->xps_nactive = 0;
 		atomic_long_set(&xps->xps_queuelen, 0);
 		xps->xps_net = NULL;
@@ -136,6 +161,7 @@ static void xprt_switch_free(struct kref *kref)
 			struct rpc_xprt_switch, xps_kref);
 
 	xprt_switch_free_entries(xps);
+	xprt_switch_free_id(xps);
 	kfree_rcu(xps, xps_rcu);
 }
 
-- 
cgit v1.2.3


From d3abc73987fd2a5992a9bdae9f44fa43d1b4db70 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 8 Jun 2021 15:59:15 -0400
Subject: sunrpc: keep track of the xprt_class in rpc_xprt structure

We need to keep track of the type for a given transport.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h     | 2 ++
 net/sunrpc/xprtrdma/transport.c | 2 ++
 net/sunrpc/xprtsock.c           | 9 +++++++++
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 1fbc470ce205..7efc6c0a5455 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -53,6 +53,7 @@ enum rpc_display_format_t {
 
 struct rpc_task;
 struct rpc_xprt;
+struct xprt_class;
 struct seq_file;
 struct svc_serv;
 struct net;
@@ -289,6 +290,7 @@ struct rpc_xprt {
 	atomic_t		inject_disconnect;
 #endif
 	struct rcu_head		rcu;
+	const struct xprt_class	*xprt_class;
 };
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 19a49d26b1e4..9c2ffc67c0fd 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,6 +73,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 unsigned int xprt_rdma_memreg_strategy		= RPCRDMA_FRWR;
 int xprt_rdma_pad_optimize;
+static struct xprt_class xprt_rdma;
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 
@@ -349,6 +350,7 @@ xprt_setup_rdma(struct xprt_create *args)
 	/* Ensure xprt->addr holds valid server TCP (not RDMA)
 	 * address, for any side protocols which peek at it */
 	xprt->prot = IPPROTO_TCP;
+	xprt->xprt_class = &xprt_rdma;
 	xprt->addrlen = args->addrlen;
 	memcpy(&xprt->addr, sap, xprt->addrlen);
 
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 316d04945587..2ad4d0df45fe 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -91,6 +91,11 @@ static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT;
 
 static struct ctl_table_header *sunrpc_table_header;
 
+static struct xprt_class xs_local_transport;
+static struct xprt_class xs_udp_transport;
+static struct xprt_class xs_tcp_transport;
+static struct xprt_class xs_bc_tcp_transport;
+
 /*
  * FIXME: changing the UDP slot table size should also resize the UDP
  *        socket buffers for existing UDP transports
@@ -2779,6 +2784,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
 	xprt->prot = 0;
+	xprt->xprt_class = &xs_local_transport;
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
 	xprt->bind_timeout = XS_BIND_TO;
@@ -2848,6 +2854,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
 	xprt->prot = IPPROTO_UDP;
+	xprt->xprt_class = &xs_udp_transport;
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
 	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
 
@@ -2928,6 +2935,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
 	xprt->prot = IPPROTO_TCP;
+	xprt->xprt_class = &xs_tcp_transport;
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
 	xprt->bind_timeout = XS_BIND_TO;
@@ -3001,6 +3009,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
 	xprt->prot = IPPROTO_TCP;
+	xprt->xprt_class = &xs_bc_tcp_transport;
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 	xprt->timeout = &xs_tcp_default_timeout;
 
-- 
cgit v1.2.3


From baea99445dd4675a834e8a5987d2f368adb62e6c Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 8 Jun 2021 15:59:16 -0400
Subject: sunrpc: add xprt_switch direcotry to sunrpc's sysfs

Add xprt_switch directory to the sysfs and create individual
xprt_swith subdirectories for multipath transport group.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtmultipath.h |  2 +
 net/sunrpc/sysfs.c                   | 99 +++++++++++++++++++++++++++++++++---
 net/sunrpc/sysfs.h                   | 10 ++++
 net/sunrpc/xprtmultipath.c           |  4 ++
 4 files changed, 108 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index ef95a6f18ccf..b19addc8b715 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -10,6 +10,7 @@
 #define _NET_SUNRPC_XPRTMULTIPATH_H
 
 struct rpc_xprt_iter_ops;
+struct rpc_sysfs_xprt_switch;
 struct rpc_xprt_switch {
 	spinlock_t		xps_lock;
 	struct kref		xps_kref;
@@ -24,6 +25,7 @@ struct rpc_xprt_switch {
 
 	const struct rpc_xprt_iter_ops *xps_iter_ops;
 
+	struct rpc_sysfs_xprt_switch *xps_sysfs;
 	struct rcu_head		xps_rcu;
 };
 
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index f3b7547ee218..ed9f7131543f 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -7,7 +7,7 @@
 #include "sysfs.h"
 
 static struct kset *rpc_sunrpc_kset;
-static struct kobject *rpc_sunrpc_client_kobj;
+static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj;
 
 static void rpc_sysfs_object_release(struct kobject *kobj)
 {
@@ -48,13 +48,22 @@ int rpc_sysfs_init(void)
 	rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj);
 	if (!rpc_sunrpc_kset)
 		return -ENOMEM;
-	rpc_sunrpc_client_kobj = rpc_sysfs_object_alloc("client", rpc_sunrpc_kset, NULL);
-	if (!rpc_sunrpc_client_kobj) {
-		kset_unregister(rpc_sunrpc_kset);
-		rpc_sunrpc_client_kobj = NULL;
-		return -ENOMEM;
-	}
+	rpc_sunrpc_client_kobj =
+		rpc_sysfs_object_alloc("rpc-clients", rpc_sunrpc_kset, NULL);
+	if (!rpc_sunrpc_client_kobj)
+		goto err_client;
+	rpc_sunrpc_xprt_switch_kobj =
+		rpc_sysfs_object_alloc("xprt-switches", rpc_sunrpc_kset, NULL);
+	if (!rpc_sunrpc_xprt_switch_kobj)
+		goto err_switch;
 	return 0;
+err_switch:
+	kobject_put(rpc_sunrpc_client_kobj);
+	rpc_sunrpc_client_kobj = NULL;
+err_client:
+	kset_unregister(rpc_sunrpc_kset);
+	rpc_sunrpc_kset = NULL;
+	return -ENOMEM;
 }
 
 static void rpc_sysfs_client_release(struct kobject *kobj)
@@ -65,20 +74,40 @@ static void rpc_sysfs_client_release(struct kobject *kobj)
 	kfree(c);
 }
 
+static void rpc_sysfs_xprt_switch_release(struct kobject *kobj)
+{
+	struct rpc_sysfs_xprt_switch *xprt_switch;
+
+	xprt_switch = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject);
+	kfree(xprt_switch);
+}
+
 static const void *rpc_sysfs_client_namespace(struct kobject *kobj)
 {
 	return container_of(kobj, struct rpc_sysfs_client, kobject)->net;
 }
 
+static const void *rpc_sysfs_xprt_switch_namespace(struct kobject *kobj)
+{
+	return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net;
+}
+
 static struct kobj_type rpc_sysfs_client_type = {
 	.release = rpc_sysfs_client_release,
 	.sysfs_ops = &kobj_sysfs_ops,
 	.namespace = rpc_sysfs_client_namespace,
 };
 
+static struct kobj_type rpc_sysfs_xprt_switch_type = {
+	.release = rpc_sysfs_xprt_switch_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.namespace = rpc_sysfs_xprt_switch_namespace,
+};
+
 void rpc_sysfs_exit(void)
 {
 	kobject_put(rpc_sunrpc_client_kobj);
+	kobject_put(rpc_sunrpc_xprt_switch_kobj);
 	kset_unregister(rpc_sunrpc_kset);
 }
 
@@ -100,6 +129,28 @@ static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent,
 	return NULL;
 }
 
+static struct rpc_sysfs_xprt_switch *
+rpc_sysfs_xprt_switch_alloc(struct kobject *parent,
+			    struct rpc_xprt_switch *xprt_switch,
+			    struct net *net,
+			    gfp_t gfp_flags)
+{
+	struct rpc_sysfs_xprt_switch *p;
+
+	p = kzalloc(sizeof(*p), gfp_flags);
+	if (p) {
+		p->net = net;
+		p->kobject.kset = rpc_sunrpc_kset;
+		if (kobject_init_and_add(&p->kobject,
+					 &rpc_sysfs_xprt_switch_type,
+					 parent, "switch-%d",
+					 xprt_switch->xps_id) == 0)
+			return p;
+		kobject_put(&p->kobject);
+	}
+	return NULL;
+}
+
 void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net)
 {
 	struct rpc_sysfs_client *rpc_client;
@@ -111,6 +162,28 @@ void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net)
 	}
 }
 
+void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch,
+				 struct rpc_xprt *xprt,
+				 gfp_t gfp_flags)
+{
+	struct rpc_sysfs_xprt_switch *rpc_xprt_switch;
+	struct net *net;
+
+	if (xprt_switch->xps_net)
+		net = xprt_switch->xps_net;
+	else
+		net = xprt->xprt_net;
+	rpc_xprt_switch =
+		rpc_sysfs_xprt_switch_alloc(rpc_sunrpc_xprt_switch_kobj,
+					    xprt_switch, net, gfp_flags);
+	if (rpc_xprt_switch) {
+		xprt_switch->xps_sysfs = rpc_xprt_switch;
+		rpc_xprt_switch->xprt_switch = xprt_switch;
+		rpc_xprt_switch->xprt = xprt;
+		kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD);
+	}
+}
+
 void rpc_sysfs_client_destroy(struct rpc_clnt *clnt)
 {
 	struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs;
@@ -122,3 +195,15 @@ void rpc_sysfs_client_destroy(struct rpc_clnt *clnt)
 		clnt->cl_sysfs = NULL;
 	}
 }
+
+void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch)
+{
+	struct rpc_sysfs_xprt_switch *rpc_xprt_switch = xprt_switch->xps_sysfs;
+
+	if (rpc_xprt_switch) {
+		kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_REMOVE);
+		kobject_del(&rpc_xprt_switch->kobject);
+		kobject_put(&rpc_xprt_switch->kobject);
+		xprt_switch->xps_sysfs = NULL;
+	}
+}
diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h
index c46afc848993..52ec472bd4a9 100644
--- a/net/sunrpc/sysfs.h
+++ b/net/sunrpc/sysfs.h
@@ -10,10 +10,20 @@ struct rpc_sysfs_client {
 	struct net *net;
 };
 
+struct rpc_sysfs_xprt_switch {
+	struct kobject kobject;
+	struct net *net;
+	struct rpc_xprt_switch *xprt_switch;
+	struct rpc_xprt *xprt;
+};
+
 int rpc_sysfs_init(void);
 void rpc_sysfs_exit(void);
 
 void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net);
 void rpc_sysfs_client_destroy(struct rpc_clnt *clnt);
+void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch,
+				 struct rpc_xprt *xprt, gfp_t gfp_flags);
+void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt);
 
 #endif
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 4969a4c216f7..2d73a35df9ee 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -19,6 +19,8 @@
 #include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/xprtmultipath.h>
 
+#include "sysfs.h"
+
 typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps,
 		const struct rpc_xprt *cur);
 
@@ -133,6 +135,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt,
 		xps->xps_net = NULL;
 		INIT_LIST_HEAD(&xps->xps_xprt_list);
 		xps->xps_iter_ops = &rpc_xprt_iter_singular;
+		rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags);
 		xprt_switch_add_xprt_locked(xps, xprt);
 	}
 
@@ -161,6 +164,7 @@ static void xprt_switch_free(struct kref *kref)
 			struct rpc_xprt_switch, xps_kref);
 
 	xprt_switch_free_entries(xps);
+	rpc_sysfs_xprt_switch_destroy(xps);
 	xprt_switch_free_id(xps);
 	kfree_rcu(xps, xps_rcu);
 }
-- 
cgit v1.2.3


From d408ebe04ac58eb370e2d264e88edbab746adda6 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 8 Jun 2021 15:59:18 -0400
Subject: sunrpc: add add sysfs directory per xprt under each xprt_switch

Add individual transport directories under each transport switch
group. For instance, for each nconnect=X connections there will be
a transport directory. Naming conventions also identifies transport
type -- xprt-<id>-<type> where type is udp, tcp, rdma, local, bc.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  2 ++
 net/sunrpc/sysfs.c          | 67 +++++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/sysfs.h          |  8 ++++++
 net/sunrpc/xprtmultipath.c  |  4 +++
 4 files changed, 81 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 7efc6c0a5455..8360db664e5f 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -183,6 +183,7 @@ enum xprt_transports {
 	XPRT_TRANSPORT_LOCAL	= 257,
 };
 
+struct rpc_sysfs_xprt;
 struct rpc_xprt {
 	struct kref		kref;		/* Reference count */
 	const struct rpc_xprt_ops *ops;		/* transport methods */
@@ -291,6 +292,7 @@ struct rpc_xprt {
 #endif
 	struct rcu_head		rcu;
 	const struct xprt_class	*xprt_class;
+	struct rpc_sysfs_xprt	*xprt_sysfs;
 };
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index 0aa63747f496..20f75708594f 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -82,6 +82,14 @@ static void rpc_sysfs_xprt_switch_release(struct kobject *kobj)
 	kfree(xprt_switch);
 }
 
+static void rpc_sysfs_xprt_release(struct kobject *kobj)
+{
+	struct rpc_sysfs_xprt *xprt;
+
+	xprt = container_of(kobj, struct rpc_sysfs_xprt, kobject);
+	kfree(xprt);
+}
+
 static const void *rpc_sysfs_client_namespace(struct kobject *kobj)
 {
 	return container_of(kobj, struct rpc_sysfs_client, kobject)->net;
@@ -92,6 +100,12 @@ static const void *rpc_sysfs_xprt_switch_namespace(struct kobject *kobj)
 	return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net;
 }
 
+static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj)
+{
+	return container_of(kobj, struct rpc_sysfs_xprt,
+			    kobject)->xprt->xprt_net;
+}
+
 static struct kobj_type rpc_sysfs_client_type = {
 	.release = rpc_sysfs_client_release,
 	.sysfs_ops = &kobj_sysfs_ops,
@@ -104,6 +118,12 @@ static struct kobj_type rpc_sysfs_xprt_switch_type = {
 	.namespace = rpc_sysfs_xprt_switch_namespace,
 };
 
+static struct kobj_type rpc_sysfs_xprt_type = {
+	.release = rpc_sysfs_xprt_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.namespace = rpc_sysfs_xprt_namespace,
+};
+
 void rpc_sysfs_exit(void)
 {
 	kobject_put(rpc_sunrpc_client_kobj);
@@ -151,6 +171,25 @@ rpc_sysfs_xprt_switch_alloc(struct kobject *parent,
 	return NULL;
 }
 
+static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent,
+						   struct rpc_xprt *xprt,
+						   gfp_t gfp_flags)
+{
+	struct rpc_sysfs_xprt *p;
+
+	p = kzalloc(sizeof(*p), gfp_flags);
+	if (!p)
+		goto out;
+	p->kobject.kset = rpc_sunrpc_kset;
+	if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_type,
+				 parent, "xprt-%d-%s", xprt->id,
+				 xprt->address_strings[RPC_DISPLAY_PROTO]) == 0)
+		return p;
+	kobject_put(&p->kobject);
+out:
+	return NULL;
+}
+
 void rpc_sysfs_client_setup(struct rpc_clnt *clnt,
 			    struct rpc_xprt_switch *xprt_switch,
 			    struct net *net)
@@ -199,6 +238,22 @@ void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch,
 	}
 }
 
+void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch,
+			  struct rpc_xprt *xprt,
+			  gfp_t gfp_flags)
+{
+	struct rpc_sysfs_xprt *rpc_xprt;
+	struct rpc_sysfs_xprt_switch *switch_obj =
+		(struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs;
+
+	rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags);
+	if (rpc_xprt) {
+		xprt->xprt_sysfs = rpc_xprt;
+		rpc_xprt->xprt = xprt;
+		kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD);
+	}
+}
+
 void rpc_sysfs_client_destroy(struct rpc_clnt *clnt)
 {
 	struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs;
@@ -225,3 +280,15 @@ void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch)
 		xprt_switch->xps_sysfs = NULL;
 	}
 }
+
+void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt)
+{
+	struct rpc_sysfs_xprt *rpc_xprt = xprt->xprt_sysfs;
+
+	if (rpc_xprt) {
+		kobject_uevent(&rpc_xprt->kobject, KOBJ_REMOVE);
+		kobject_del(&rpc_xprt->kobject);
+		kobject_put(&rpc_xprt->kobject);
+		xprt->xprt_sysfs = NULL;
+	}
+}
diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h
index 760f0582aee5..ff10451de6fa 100644
--- a/net/sunrpc/sysfs.h
+++ b/net/sunrpc/sysfs.h
@@ -19,6 +19,11 @@ struct rpc_sysfs_xprt_switch {
 	struct rpc_xprt *xprt;
 };
 
+struct rpc_sysfs_xprt {
+	struct kobject kobject;
+	struct rpc_xprt *xprt;
+};
+
 int rpc_sysfs_init(void);
 void rpc_sysfs_exit(void);
 
@@ -29,5 +34,8 @@ void rpc_sysfs_client_destroy(struct rpc_clnt *clnt);
 void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch,
 				 struct rpc_xprt *xprt, gfp_t gfp_flags);
 void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt);
+void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch,
+			  struct rpc_xprt *xprt, gfp_t gfp_flags);
+void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt);
 
 #endif
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 2d73a35df9ee..e7973c1ff70c 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -57,6 +57,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
 	if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
 		xprt_switch_add_xprt_locked(xps, xprt);
 	spin_unlock(&xps->xps_lock);
+	rpc_sysfs_xprt_setup(xps, xprt, GFP_KERNEL);
 }
 
 static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
@@ -85,6 +86,7 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
 	spin_lock(&xps->xps_lock);
 	xprt_switch_remove_xprt_locked(xps, xprt);
 	spin_unlock(&xps->xps_lock);
+	rpc_sysfs_xprt_destroy(xprt);
 	xprt_put(xprt);
 }
 
@@ -137,6 +139,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt,
 		xps->xps_iter_ops = &rpc_xprt_iter_singular;
 		rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags);
 		xprt_switch_add_xprt_locked(xps, xprt);
+		rpc_sysfs_xprt_setup(xps, xprt, gfp_flags);
 	}
 
 	return xps;
@@ -152,6 +155,7 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps)
 				struct rpc_xprt, xprt_switch);
 		xprt_switch_remove_xprt_locked(xps, xprt);
 		spin_unlock(&xps->xps_lock);
+		rpc_sysfs_xprt_destroy(xprt);
 		xprt_put(xprt);
 		spin_lock(&xps->xps_lock);
 	}
-- 
cgit v1.2.3


From e091853ebdb486fd8bde86b87178fdf3850914fc Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 23 Jun 2021 23:28:46 -0400
Subject: SUNRPC mark the first transport

When an RPC client gets created it's first transport is special
and should be marked a main transport.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h | 1 +
 net/sunrpc/clnt.c           | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 13a4eaf385cf..692e5946c029 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -293,6 +293,7 @@ struct rpc_xprt {
 	struct rcu_head		rcu;
 	const struct xprt_class	*xprt_class;
 	struct rpc_sysfs_xprt	*xprt_sysfs;
+	bool			main; /*mark if this is the 1st transport */
 };
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9bf820bad84c..408618765aa5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -412,6 +412,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
 	}
 
 	rpc_clnt_set_transport(clnt, xprt, timeout);
+	xprt->main = true;
 	xprt_iter_init(&clnt->cl_xpi, xps);
 	xprt_switch_put(xps);
 
-- 
cgit v1.2.3


From a8482488a7d6d320f63a9ee1912dbb5ae5b80a61 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 23 Jun 2021 23:28:48 -0400
Subject: SUNRPC query transport's source port

Provide ability to query transport's source port.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtsock.h | 1 +
 net/sunrpc/xprtsock.c           | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 3c1423ee74b4..8c2a712cb242 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -10,6 +10,7 @@
 
 int		init_socket_xprt(void);
 void		cleanup_socket_xprt(void);
+unsigned short	get_srcport(struct rpc_xprt *);
 
 #define RPC_MIN_RESVPORT	(1U)
 #define RPC_MAX_RESVPORT	(65535U)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 2ad4d0df45fe..4611845ec1eb 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1653,6 +1653,13 @@ static int xs_get_srcport(struct sock_xprt *transport)
 	return port;
 }
 
+unsigned short get_srcport(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt);
+	return sock->srcport;
+}
+EXPORT_SYMBOL(get_srcport);
+
 static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port)
 {
 	if (transport->srcport != 0)
-- 
cgit v1.2.3


From 587bc7255d26ca80b58026881db5fb3bf770cc43 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 8 Jun 2021 15:59:19 -0400
Subject: sunrpc: add dst_attr attributes to the sysfs xprt directory

Allow to query and set the destination's address of a transport.
Setting of the destination address is allowed only for TCP or RDMA
based connections.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |   1 +
 net/sunrpc/sysfs.c          | 105 ++++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprt.c           |   4 +-
 net/sunrpc/xprtmultipath.c  |   2 -
 4 files changed, 109 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 8360db664e5f..13a4eaf385cf 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -414,6 +414,7 @@ void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
 
 bool			xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
 void			xprt_unlock_connect(struct rpc_xprt *, void *);
+void			xprt_release_write(struct rpc_xprt *, struct rpc_task *);
 
 /*
  * Reserved bit positions in xprt->state
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index 20f75708594f..402924bbd743 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -4,8 +4,23 @@
  */
 #include <linux/sunrpc/clnt.h>
 #include <linux/kobject.h>
+#include <linux/sunrpc/addr.h>
+
 #include "sysfs.h"
 
+struct xprt_addr {
+	const char *addr;
+	struct rcu_head rcu;
+};
+
+static void free_xprt_addr(struct rcu_head *head)
+{
+	struct xprt_addr *addr = container_of(head, struct xprt_addr, rcu);
+
+	kfree(addr->addr);
+	kfree(addr);
+}
+
 static struct kset *rpc_sunrpc_kset;
 static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj;
 
@@ -43,6 +58,87 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name,
 	return NULL;
 }
 
+static inline struct rpc_xprt *
+rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj)
+{
+	struct rpc_sysfs_xprt *x = container_of(kobj,
+		struct rpc_sysfs_xprt, kobject);
+
+	return xprt_get(x->xprt);
+}
+
+static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   char *buf)
+{
+	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+	ssize_t ret;
+
+	if (!xprt)
+		return 0;
+	ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]);
+	xprt_put(xprt);
+	return ret + 1;
+}
+
+static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj,
+					    struct kobj_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+	struct sockaddr *saddr;
+	char *dst_addr;
+	int port;
+	struct xprt_addr *saved_addr;
+	size_t buf_len;
+
+	if (!xprt)
+		return 0;
+	if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP ||
+	      xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) {
+		xprt_put(xprt);
+		return -EOPNOTSUPP;
+	}
+
+	if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) {
+		count = -EINTR;
+		goto out_put;
+	}
+	saddr = (struct sockaddr *)&xprt->addr;
+	port = rpc_get_port(saddr);
+
+	/* buf_len is the len until the first occurence of either
+	 * '\n' or '\0'
+	 */
+	buf_len = strcspn(buf, "\n");
+
+	dst_addr = kstrndup(buf, buf_len, GFP_KERNEL);
+	if (!dst_addr)
+		goto out_err;
+	saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL);
+	if (!saved_addr)
+		goto out_err_free;
+	saved_addr->addr =
+		rcu_dereference_raw(xprt->address_strings[RPC_DISPLAY_ADDR]);
+	rcu_assign_pointer(xprt->address_strings[RPC_DISPLAY_ADDR], dst_addr);
+	call_rcu(&saved_addr->rcu, free_xprt_addr);
+	xprt->addrlen = rpc_pton(xprt->xprt_net, buf, buf_len, saddr,
+				 sizeof(*saddr));
+	rpc_set_port(saddr, port);
+
+	xprt_force_disconnect(xprt);
+out:
+	xprt_release_write(xprt, NULL);
+out_put:
+	xprt_put(xprt);
+	return count;
+out_err_free:
+	kfree(dst_addr);
+out_err:
+	count = -ENOMEM;
+	goto out;
+}
+
 int rpc_sysfs_init(void)
 {
 	rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj);
@@ -106,6 +202,14 @@ static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj)
 			    kobject)->xprt->xprt_net;
 }
 
+static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr,
+	0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store);
+
+static struct attribute *rpc_sysfs_xprt_attrs[] = {
+	&rpc_sysfs_xprt_dstaddr.attr,
+	NULL,
+};
+
 static struct kobj_type rpc_sysfs_client_type = {
 	.release = rpc_sysfs_client_release,
 	.sysfs_ops = &kobj_sysfs_ops,
@@ -120,6 +224,7 @@ static struct kobj_type rpc_sysfs_xprt_switch_type = {
 
 static struct kobj_type rpc_sysfs_xprt_type = {
 	.release = rpc_sysfs_xprt_release,
+	.default_attrs = rpc_sysfs_xprt_attrs,
 	.sysfs_ops = &kobj_sysfs_ops,
 	.namespace = rpc_sysfs_xprt_namespace,
 };
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 20b9bd705014..fb6db09725c7 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -55,6 +55,7 @@
 #include <trace/events/sunrpc.h>
 
 #include "sunrpc.h"
+#include "sysfs.h"
 
 /*
  * Local variables
@@ -443,7 +444,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 }
 EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
 
-static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
+void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	if (xprt->snd_task != task)
 		return;
@@ -1812,6 +1813,7 @@ void xprt_free(struct rpc_xprt *xprt)
 	put_net(xprt->xprt_net);
 	xprt_free_all_slots(xprt);
 	xprt_free_id(xprt);
+	rpc_sysfs_xprt_destroy(xprt);
 	kfree_rcu(xprt, rcu);
 }
 EXPORT_SYMBOL_GPL(xprt_free);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index e7973c1ff70c..07e76ae1028a 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -86,7 +86,6 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
 	spin_lock(&xps->xps_lock);
 	xprt_switch_remove_xprt_locked(xps, xprt);
 	spin_unlock(&xps->xps_lock);
-	rpc_sysfs_xprt_destroy(xprt);
 	xprt_put(xprt);
 }
 
@@ -155,7 +154,6 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps)
 				struct rpc_xprt, xprt_switch);
 		xprt_switch_remove_xprt_locked(xps, xprt);
 		spin_unlock(&xps->xps_lock);
-		rpc_sysfs_xprt_destroy(xprt);
 		xprt_put(xprt);
 		spin_lock(&xps->xps_lock);
 	}
-- 
cgit v1.2.3


From 5b7eb78486cd9ac58bfbd6d84ea0fe2d9fead03b Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 23 Jun 2021 23:28:50 -0400
Subject: SUNRPC: take a xprt offline using sysfs

Using sysfs's xprt_state attribute, mark a particular transport offline.
It will not be picked during the round-robin selection. It's not allowed
to take the main (1st created transport associated with the rpc_client)
offline. Also bring a transport back online via sysfs by writing "online"
and that would allow for this transport to be picked during the round-
robin selection.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/sysfs.c          | 66 ++++++++++++++++++++++++++++++++++++++++++---
 net/sunrpc/sysfs.h          |  1 +
 net/sunrpc/xprtmultipath.c  |  6 +++--
 4 files changed, 68 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 692e5946c029..b8ed7fa1b4ca 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -427,6 +427,7 @@ void			xprt_release_write(struct rpc_xprt *, struct rpc_task *);
 #define XPRT_BOUND		(4)
 #define XPRT_BINDING		(5)
 #define XPRT_CLOSING		(6)
+#define XPRT_OFFLINE		(7)
 #define XPRT_CONGESTED		(9)
 #define XPRT_CWND_WAIT		(10)
 #define XPRT_WRITE_SPACE	(11)
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index 08aa503295b7..a30ad18aa7dc 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -68,6 +68,15 @@ rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj)
 	return xprt_get(x->xprt);
 }
 
+static inline struct rpc_xprt_switch *
+rpc_sysfs_xprt_kobj_get_xprt_switch(struct kobject *kobj)
+{
+	struct rpc_sysfs_xprt *x = container_of(kobj,
+		struct rpc_sysfs_xprt, kobject);
+
+	return xprt_switch_get(x->xprt_switch);
+}
+
 static inline struct rpc_xprt_switch *
 rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj)
 {
@@ -122,7 +131,7 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
 	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
 	ssize_t ret;
 	int locked, connected, connecting, close_wait, bound, binding,
-	    closing, congested, cwnd_wait, write_space;
+	    closing, congested, cwnd_wait, write_space, offline;
 
 	if (!xprt)
 		return 0;
@@ -140,8 +149,9 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
 		congested = test_bit(XPRT_CONGESTED, &xprt->state);
 		cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state);
 		write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state);
+		offline = test_bit(XPRT_OFFLINE, &xprt->state);
 
-		ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s\n",
+		ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s\n",
 			      locked ? "LOCKED" : "",
 			      connected ? "CONNECTED" : "",
 			      connecting ? "CONNECTING" : "",
@@ -151,7 +161,8 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
 			      closing ? "CLOSING" : "",
 			      congested ? "CONGESTED" : "",
 			      cwnd_wait ? "CWND_WAIT" : "",
-			      write_space ? "WRITE_SPACE" : "");
+			      write_space ? "WRITE_SPACE" : "",
+			      offline ? "OFFLINE" : "");
 	}
 
 	xprt_put(xprt);
@@ -233,6 +244,52 @@ out_err:
 	goto out;
 }
 
+static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   const char *buf, size_t count)
+{
+	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+	int offline = 0, online = 0;
+	struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj);
+
+	if (!xprt)
+		return 0;
+
+	if (!strncmp(buf, "offline", 7))
+		offline = 1;
+	else if (!strncmp(buf, "online", 6))
+		online = 1;
+	else
+		return -EINVAL;
+
+	if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) {
+		count = -EINTR;
+		goto out_put;
+	}
+	if (xprt->main) {
+		count = -EINVAL;
+		goto release_tasks;
+	}
+	if (offline) {
+		set_bit(XPRT_OFFLINE, &xprt->state);
+		spin_lock(&xps->xps_lock);
+		xps->xps_nactive--;
+		spin_unlock(&xps->xps_lock);
+	} else if (online) {
+		clear_bit(XPRT_OFFLINE, &xprt->state);
+		spin_lock(&xps->xps_lock);
+		xps->xps_nactive++;
+		spin_unlock(&xps->xps_lock);
+	}
+
+release_tasks:
+	xprt_release_write(xprt, NULL);
+out_put:
+	xprt_put(xprt);
+	xprt_switch_put(xps);
+	return count;
+}
+
 int rpc_sysfs_init(void)
 {
 	rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj);
@@ -303,7 +360,7 @@ static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info,
 	0444, rpc_sysfs_xprt_info_show, NULL);
 
 static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state,
-	0644, rpc_sysfs_xprt_state_show, NULL);
+	0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change);
 
 static struct attribute *rpc_sysfs_xprt_attrs[] = {
 	&rpc_sysfs_xprt_dstaddr.attr,
@@ -466,6 +523,7 @@ void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch,
 	if (rpc_xprt) {
 		xprt->xprt_sysfs = rpc_xprt;
 		rpc_xprt->xprt = xprt;
+		rpc_xprt->xprt_switch = xprt_switch;
 		kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD);
 	}
 }
diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h
index ff10451de6fa..6620cebd1037 100644
--- a/net/sunrpc/sysfs.h
+++ b/net/sunrpc/sysfs.h
@@ -22,6 +22,7 @@ struct rpc_sysfs_xprt_switch {
 struct rpc_sysfs_xprt {
 	struct kobject kobject;
 	struct rpc_xprt *xprt;
+	struct rpc_xprt_switch *xprt_switch;
 };
 
 int rpc_sysfs_init(void);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 07e76ae1028a..5f4845d1e922 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -65,7 +65,8 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
 {
 	if (unlikely(xprt == NULL))
 		return;
-	xps->xps_nactive--;
+	if (!test_bit(XPRT_OFFLINE, &xprt->state))
+		xps->xps_nactive--;
 	xps->xps_nxprts--;
 	if (xps->xps_nxprts == 0)
 		xps->xps_net = NULL;
@@ -230,7 +231,8 @@ void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi)
 static
 bool xprt_is_active(const struct rpc_xprt *xprt)
 {
-	return kref_read(&xprt->kref) != 0;
+	return (kref_read(&xprt->kref) != 0 &&
+		!test_bit(XPRT_OFFLINE, &xprt->state));
 }
 
 static
-- 
cgit v1.2.3


From 85e39feead948bdf8322c961d7a9bebc20d629f3 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 23 Jun 2021 23:28:51 -0400
Subject: NFSv4.1 identify and mark RPC tasks that can move between transports

In preparation for when we can re-try a task on a different transport,
identify and mark such RPC tasks as moveable. Only 4.1+ operarations can
be re-tried on a different transport.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c            | 38 +++++++++++++++++++++++++++++++++-----
 fs/nfs/pagelist.c            |  8 ++++++--
 fs/nfs/write.c               |  6 +++++-
 include/linux/sunrpc/sched.h |  2 ++
 4 files changed, 46 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e653654c10bc..d3ee3700c9dd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1155,7 +1155,11 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 				   struct nfs4_sequence_args *args,
 				   struct nfs4_sequence_res *res)
 {
-	return nfs4_do_call_sync(clnt, server, msg, args, res, 0);
+	unsigned short task_flags = 0;
+
+	if (server->nfs_client->cl_minorversion)
+		task_flags = RPC_TASK_MOVEABLE;
+	return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags);
 }
 
 
@@ -2569,6 +2573,9 @@ static int nfs4_run_open_task(struct nfs4_opendata *data,
 	};
 	int status;
 
+	if (server->nfs_client->cl_minorversion)
+		task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
 	kref_get(&data->kref);
 	data->rpc_done = false;
 	data->rpc_status = 0;
@@ -3749,6 +3756,9 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
 	};
 	int status = -ENOMEM;
 
+	if (server->nfs_client->cl_minorversion)
+		task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
 	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
 		&task_setup_data.rpc_client, &msg);
 
@@ -4188,6 +4198,9 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 	};
 	unsigned short task_flags = 0;
 
+	if (nfs4_has_session(server->nfs_client))
+		task_flags = RPC_TASK_MOVEABLE;
+
 	/* Is this is an attribute revalidation, subject to softreval? */
 	if (inode && (server->flags & NFS_MOUNT_SOFTREVAL))
 		task_flags |= RPC_TASK_TIMEOUT;
@@ -4307,6 +4320,9 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 	};
 	unsigned short task_flags = 0;
 
+	if (server->nfs_client->cl_minorversion)
+		task_flags = RPC_TASK_MOVEABLE;
+
 	/* Is this is an attribute revalidation, subject to softreval? */
 	if (nfs_lookup_is_soft_revalidate(dentry))
 		task_flags |= RPC_TASK_TIMEOUT;
@@ -6538,7 +6554,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
 		.rpc_client = server->client,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_delegreturn_ops,
-		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE,
 	};
 	int status = 0;
 
@@ -6856,6 +6872,11 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
+	struct nfs_client *client =
+		NFS_SERVER(lsp->ls_state->inode)->nfs_client;
+
+	if (client->cl_minorversion)
+		task_setup_data.flags |= RPC_TASK_MOVEABLE;
 
 	nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
 		NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg);
@@ -7130,6 +7151,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
 	};
 	int ret;
+	struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client;
+
+	if (client->cl_minorversion)
+		task_setup_data.flags |= RPC_TASK_MOVEABLE;
 
 	dprintk("%s: begin!\n", __func__);
 	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
@@ -9186,7 +9211,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
 		.rpc_client = clp->cl_rpcclient,
 		.rpc_message = &msg,
 		.callback_ops = &nfs41_sequence_ops,
-		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE,
 	};
 	struct rpc_task *ret;
 
@@ -9509,7 +9534,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_layoutget_call_ops,
 		.callback_data = lgp,
-		.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF |
+			 RPC_TASK_MOVEABLE,
 	};
 	struct pnfs_layout_segment *lseg = NULL;
 	struct nfs4_exception exception = {
@@ -9650,6 +9676,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_layoutreturn_call_ops,
 		.callback_data = lrp,
+		.flags = RPC_TASK_MOVEABLE,
 	};
 	int status = 0;
 
@@ -9804,6 +9831,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_layoutcommit_ops,
 		.callback_data = data,
+		.flags = RPC_TASK_MOVEABLE,
 	};
 	struct rpc_task *task;
 	int status = 0;
@@ -10131,7 +10159,7 @@ static int nfs41_free_stateid(struct nfs_server *server,
 		.rpc_client = server->client,
 		.rpc_message = &msg,
 		.callback_ops = &nfs41_free_stateid_ops,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE,
 	};
 	struct nfs_free_stateid_data *data;
 	struct rpc_task *task;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index cf9cc62ec48e..cc232d1f16f2 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -954,6 +954,7 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
 {
 	struct nfs_pgio_header *hdr;
 	int ret;
+	unsigned short task_flags = 0;
 
 	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
 	if (!hdr) {
@@ -962,14 +963,17 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
 	}
 	nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
 	ret = nfs_generic_pgio(desc, hdr);
-	if (ret == 0)
+	if (ret == 0) {
+		if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion)
+			task_flags = RPC_TASK_MOVEABLE;
 		ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
 					hdr,
 					hdr->cred,
 					NFS_PROTO(hdr->inode),
 					desc->pg_rpc_callops,
 					desc->pg_ioflags,
-					RPC_TASK_CRED_NOREF);
+					RPC_TASK_CRED_NOREF | task_flags);
+	}
 	return ret;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3bf82178166a..eae9bf114041 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1810,6 +1810,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 		struct nfs_commit_info *cinfo)
 {
 	struct nfs_commit_data	*data;
+	unsigned short task_flags = 0;
 
 	/* another commit raced with us */
 	if (list_empty(head))
@@ -1820,8 +1821,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 	/* Set up the argument struct */
 	nfs_init_commit(data, head, NULL, cinfo);
 	atomic_inc(&cinfo->mds->rpcs_out);
+	if (NFS_SERVER(inode)->nfs_client->cl_minorversion)
+		task_flags = RPC_TASK_MOVEABLE;
 	return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
-				   data->mds_ops, how, RPC_TASK_CRED_NOREF);
+				   data->mds_ops, how,
+				   RPC_TASK_CRED_NOREF | task_flags);
 }
 
 /*
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index df696efdd675..a237b8dbf608 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -121,6 +121,7 @@ struct rpc_task_setup {
  */
 #define RPC_TASK_ASYNC		0x0001		/* is an async task */
 #define RPC_TASK_SWAPPER	0x0002		/* is swapping in/out */
+#define RPC_TASK_MOVEABLE	0x0004		/* nfs4.1+ rpc tasks */
 #define RPC_TASK_NULLCREDS	0x0010		/* Use AUTH_NULL credential */
 #define RPC_CALL_MAJORSEEN	0x0020		/* major timeout seen */
 #define RPC_TASK_ROOTCREDS	0x0040		/* force root creds */
@@ -139,6 +140,7 @@ struct rpc_task_setup {
 #define RPC_IS_SOFT(t)		((t)->tk_flags & (RPC_TASK_SOFT|RPC_TASK_TIMEOUT))
 #define RPC_IS_SOFTCONN(t)	((t)->tk_flags & RPC_TASK_SOFTCONN)
 #define RPC_WAS_SENT(t)		((t)->tk_flags & RPC_TASK_SENT)
+#define RPC_IS_MOVEABLE(t)	((t)->tk_flags & RPC_TASK_MOVEABLE)
 
 #define RPC_TASK_RUNNING	0
 #define RPC_TASK_QUEUED		1
-- 
cgit v1.2.3


From 6f081693e7b2ba63422b735684b05a850a6351ba Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 23 Jun 2021 23:28:53 -0400
Subject: sunrpc: remove an offlined xprt using sysfs

Once a transport has been put offline, this transport can be also
removed from the list of transports. Any tasks that have been stuck
on this transport would find the next available active transport
and be re-tried. This transport would be removed from the xprt_switch
list and freed.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/clnt.c           | 24 ++++++++++++++++++++++++
 net/sunrpc/sysfs.c          | 26 ++++++++++++++++++++++----
 3 files changed, 47 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index b8ed7fa1b4ca..c8c39f22d3b1 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -428,6 +428,7 @@ void			xprt_release_write(struct rpc_xprt *, struct rpc_task *);
 #define XPRT_BINDING		(5)
 #define XPRT_CLOSING		(6)
 #define XPRT_OFFLINE		(7)
+#define XPRT_REMOVE		(8)
 #define XPRT_CONGESTED		(9)
 #define XPRT_CWND_WAIT		(10)
 #define XPRT_WRITE_SPACE	(11)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 408618765aa5..8b4de70e8ead 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2106,6 +2106,30 @@ call_connect_status(struct rpc_task *task)
 	case -ENOTCONN:
 	case -EAGAIN:
 	case -ETIMEDOUT:
+		if (!(task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) &&
+		    (task->tk_flags & RPC_TASK_MOVEABLE) &&
+		    test_bit(XPRT_REMOVE, &xprt->state)) {
+			struct rpc_xprt *saved = task->tk_xprt;
+			struct rpc_xprt_switch *xps;
+
+			rcu_read_lock();
+			xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+			rcu_read_unlock();
+			if (xps->xps_nxprts > 1) {
+				long value;
+
+				xprt_release(task);
+				value = atomic_long_dec_return(&xprt->queuelen);
+				if (value == 0)
+					rpc_xprt_switch_remove_xprt(xps, saved);
+				xprt_put(saved);
+				task->tk_xprt = NULL;
+				task->tk_action = call_start;
+			}
+			xprt_switch_put(xps);
+			if (!task->tk_xprt)
+				return;
+		}
 		goto out_retry;
 	case -ENOBUFS:
 		rpc_delay(task, HZ >> 2);
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index b576c7f06829..64da3bfd28e6 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -133,7 +133,7 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
 	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
 	ssize_t ret;
 	int locked, connected, connecting, close_wait, bound, binding,
-	    closing, congested, cwnd_wait, write_space, offline;
+	    closing, congested, cwnd_wait, write_space, offline, remove;
 
 	if (!xprt)
 		return 0;
@@ -152,8 +152,9 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
 		cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state);
 		write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state);
 		offline = test_bit(XPRT_OFFLINE, &xprt->state);
+		remove = test_bit(XPRT_REMOVE, &xprt->state);
 
-		ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s\n",
+		ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s %s\n",
 			      locked ? "LOCKED" : "",
 			      connected ? "CONNECTED" : "",
 			      connecting ? "CONNECTING" : "",
@@ -164,7 +165,8 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
 			      congested ? "CONGESTED" : "",
 			      cwnd_wait ? "CWND_WAIT" : "",
 			      write_space ? "WRITE_SPACE" : "",
-			      offline ? "OFFLINE" : "");
+			      offline ? "OFFLINE" : "",
+			      remove ? "REMOVE" : "");
 	}
 
 	xprt_put(xprt);
@@ -251,7 +253,7 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj,
 					   const char *buf, size_t count)
 {
 	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
-	int offline = 0, online = 0;
+	int offline = 0, online = 0, remove = 0;
 	struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj);
 
 	if (!xprt)
@@ -261,6 +263,8 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj,
 		offline = 1;
 	else if (!strncmp(buf, "online", 6))
 		online = 1;
+	else if (!strncmp(buf, "remove", 6))
+		remove = 1;
 	else
 		return -EINVAL;
 
@@ -282,6 +286,20 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj,
 		spin_lock(&xps->xps_lock);
 		xps->xps_nactive++;
 		spin_unlock(&xps->xps_lock);
+	} else if (remove) {
+		if (test_bit(XPRT_OFFLINE, &xprt->state)) {
+			set_bit(XPRT_REMOVE, &xprt->state);
+			xprt_force_disconnect(xprt);
+			if (test_bit(XPRT_CONNECTED, &xprt->state)) {
+				if (!xprt->sending.qlen &&
+				    !xprt->pending.qlen &&
+				    !xprt->backlog.qlen &&
+				    !atomic_long_read(&xprt->queuelen))
+					rpc_xprt_switch_remove_xprt(xps, xprt);
+			}
+		} else {
+			count = -EINVAL;
+		}
 	}
 
 release_tasks:
-- 
cgit v1.2.3


From b4e89bcba2b3a966e043107cb52c682bb860cee7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 2 Jul 2021 17:24:22 -0400
Subject: NFSv4/pnfs: Clean up layout get on open

Cache the layout in the arguments so we don't have to keep looking it up
from the inode.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c       |  6 +-----
 fs/nfs/pnfs.c           | 28 ++++++++++++++++------------
 include/linux/nfs_xdr.h |  1 +
 3 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2031d2b9b6e3..78d5f15ee852 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -9419,7 +9419,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
 {
 	struct inode *inode = lgp->args.inode;
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct pnfs_layout_hdr *lo;
+	struct pnfs_layout_hdr *lo = lgp->lo;
 	int nfs4err = task->tk_status;
 	int err, status = 0;
 	LIST_HEAD(head);
@@ -9471,7 +9471,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
 	case -NFS4ERR_BAD_STATEID:
 		exception->timeout = 0;
 		spin_lock(&inode->i_lock);
-		lo = NFS_I(inode)->layout;
 		/* If the open stateid was bad, then recover it. */
 		if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
 		    !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
@@ -9554,9 +9553,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
 
 	dprintk("--> %s\n", __func__);
 
-	/* nfs4_layoutget_release calls pnfs_put_layout_hdr */
-	pnfs_get_layout_hdr(NFS_I(inode)->layout);
-
 	nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
 
 	task = rpc_run_task(&task_setup_data);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index be960e47d7f6..ef14ea0b6ab8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1128,8 +1128,7 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
 	size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE;
 
 	nfs4_free_pages(lgp->args.layout.pages, max_pages);
-	if (lgp->args.inode)
-		pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
+	pnfs_put_layout_hdr(lgp->lo);
 	put_nfs_open_context(lgp->args.ctx);
 	kfree(lgp);
 }
@@ -2124,6 +2123,9 @@ lookup_again:
 		goto out_put_layout_hdr;
 	}
 
+	lgp->lo = lo;
+	pnfs_get_layout_hdr(lo);
+
 	lseg = nfs4_proc_layoutget(lgp, &timeout);
 	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
@@ -2255,6 +2257,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data,
 		pnfs_put_layout_hdr(lo);
 		return;
 	}
+	lgp->lo = lo;
 	data->lgp = lgp;
 	data->o_arg.lg_args = &lgp->args;
 	data->o_res.lg_res = &lgp->res;
@@ -2263,6 +2266,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data,
 static void _lgopen_prepare_floating(struct nfs4_opendata *data,
 				     struct nfs_open_context *ctx)
 {
+	struct inode *ino = data->dentry->d_inode;
 	struct pnfs_layout_range rng = {
 		.iomode = (data->o_arg.fmode & FMODE_WRITE) ?
 			  IOMODE_RW: IOMODE_READ,
@@ -2271,7 +2275,7 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data,
 	};
 	struct nfs4_layoutget *lgp;
 
-	lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid,
+	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
 					     &rng, GFP_KERNEL);
 	if (!lgp)
 		return;
@@ -2291,6 +2295,8 @@ void pnfs_lgopen_prepare(struct nfs4_opendata *data,
 	/* Could check on max_ops, but currently hardcoded high enough */
 	if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN))
 		return;
+	if (data->lgp)
+		return;
 	if (data->state)
 		_lgopen_prepare_attached(data, ctx);
 	else
@@ -2330,13 +2336,13 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
 		}
 		return;
 	}
-	if (!lgp->args.inode) {
+	if (!lgp->lo) {
 		lo = _pnfs_grab_empty_layout(ino, ctx);
 		if (!lo)
 			return;
-		lgp->args.inode = ino;
+		lgp->lo = lo;
 	} else
-		lo = NFS_I(lgp->args.inode)->layout;
+		lo = lgp->lo;
 
 	lseg = pnfs_layout_process(lgp);
 	if (!IS_ERR(lseg)) {
@@ -2349,11 +2355,9 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
 void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
 {
 	if (lgp != NULL) {
-		struct inode *inode = lgp->args.inode;
-		if (inode) {
-			struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
-			pnfs_clear_first_layoutget(lo);
-			nfs_layoutget_end(lo);
+		if (lgp->lo) {
+			pnfs_clear_first_layoutget(lgp->lo);
+			nfs_layoutget_end(lgp->lo);
 		}
 		pnfs_layoutget_free(lgp);
 	}
@@ -2362,7 +2366,7 @@ void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
 struct pnfs_layout_segment *
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 {
-	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+	struct pnfs_layout_hdr *lo = lgp->lo;
 	struct nfs4_layoutget_res *res = &lgp->res;
 	struct pnfs_layout_segment *lseg;
 	struct inode *ino = lo->plh_inode;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 717ecc87c9e7..e9698b6278a5 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -277,6 +277,7 @@ struct nfs4_layoutget {
 	struct nfs4_layoutget_args args;
 	struct nfs4_layoutget_res res;
 	const struct cred *cred;
+	struct pnfs_layout_hdr *lo;
 	gfp_t gfp_flags;
 };
 
-- 
cgit v1.2.3


From c23c80822fbdf69c1aacbca50b8339972697f850 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 7 Jul 2021 18:07:34 -0700
Subject: lib: fix spelling mistakes in header files

Fix some spelling mistakes in comments found by "codespell":
Hoever ==> However
poiter ==> pointer
representaion ==> representation
uppon ==> upon
independend ==> independent
aquired ==> acquired
mis-match ==> mismatch
scrach ==> scratch
struture ==> structure
Analagous ==> Analogous
interation ==> iteration

And some were discovered manually by Joe Perches and Christoph Lameter:
stroed ==> stored
arch independent ==> an architecture independent
A example structure for ==> Example structure for

Link: https://lkml.kernel.org/r/20210609150027.14805-2-thunder.leizhen@huawei.com
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Christoph Lameter <cl@gentwo.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootconfig.h      | 4 ++--
 include/linux/cpumask.h         | 2 +-
 include/linux/debugobjects.h    | 2 +-
 include/linux/lru_cache.h       | 8 ++++----
 include/linux/nodemask.h        | 6 +++---
 include/linux/percpu-refcount.h | 2 +-
 include/linux/scatterlist.h     | 2 +-
 7 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 6bdd94cff4e2..abe089c27529 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -214,10 +214,10 @@ static inline struct xbc_node * __init xbc_node_get_subkey(struct xbc_node *node
  * @value: Iterated value of array entry.
  *
  * Iterate array entries of given @key under @node. Each array entry node
- * is stroed to @anode and @value. If the @node doesn't have @key node,
+ * is stored to @anode and @value. If the @node doesn't have @key node,
  * it does nothing.
  * Note that even if the found key node has only one value (not array)
- * this executes block once. Hoever, if the found key node has no value
+ * this executes block once. However, if the found key node has no value
  * (key-only node), this does nothing. So don't use this for testing the
  * key-value pair existence.
  */
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bfc4690de4f4..f3689a52bfd0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -259,7 +259,7 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
  * @cpu: the (optionally unsigned) integer iterator
- * @mask: the cpumask poiter
+ * @mask: the cpumask pointer
  * @start: the start location
  *
  * The implementation does not assume any bit in @mask is set (including @start).
diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 8d2dde23e9fb..32444686b6ff 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -18,7 +18,7 @@ enum debug_obj_state {
 struct debug_obj_descr;
 
 /**
- * struct debug_obj - representaion of an tracked object
+ * struct debug_obj - representation of an tracked object
  * @node:	hlist node to link the object into the tracker list
  * @state:	tracked object state
  * @astate:	current active state
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index 429d67d815ce..07add7882a5d 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -32,7 +32,7 @@ This header file (and its .c file; kernel-doc of functions see there)
   Because of this later property, it is called "lru_cache".
   As it actually Tracks Objects in an Active SeT, we could also call it
   toast (incidentally that is what may happen to the data on the
-  backend storage uppon next resync, if we don't get it right).
+  backend storage upon next resync, if we don't get it right).
 
 What for?
 
@@ -152,7 +152,7 @@ struct lc_element {
 	 * for paranoia, and for "lc_element_to_index" */
 	unsigned lc_index;
 	/* if we want to track a larger set of objects,
-	 * it needs to become arch independend u64 */
+	 * it needs to become an architecture independent u64 */
 	unsigned lc_number;
 	/* special label when on free list */
 #define LC_FREE (~0U)
@@ -263,7 +263,7 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char
  *
  * Allows (expects) the set to be "dirty".  Note that the reference counts and
  * order on the active and lru lists may still change.  Used to serialize
- * changing transactions.  Returns true if we aquired the lock.
+ * changing transactions.  Returns true if we acquired the lock.
  */
 static inline int lc_try_lock_for_transaction(struct lru_cache *lc)
 {
@@ -275,7 +275,7 @@ static inline int lc_try_lock_for_transaction(struct lru_cache *lc)
  * @lc: the lru cache to operate on
  *
  * Note that the reference counts and order on the active and lru lists may
- * still change.  Only works on a "clean" set.  Returns true if we aquired the
+ * still change.  Only works on a "clean" set.  Returns true if we acquired the
  * lock, which means there are no pending changes, and any further attempt to
  * change the set will not succeed until the next lc_unlock().
  */
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index ac398e143c9a..567c3ddba2c4 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -119,7 +119,7 @@ static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
  * The inline keyword gives the compiler room to decide to inline, or
  * not inline a function as it sees best.  However, as these functions
  * are called in both __init and non-__init functions, if they are not
- * inlined we will end up with a section mis-match error (of the type of
+ * inlined we will end up with a section mismatch error (of the type of
  * freeable items not being freed).  So we must use __always_inline here
  * to fix the problem.  If other functions in the future also end up in
  * this situation they will also need to be annotated as __always_inline
@@ -515,7 +515,7 @@ static inline int node_random(const nodemask_t *mask)
 #define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
 
 /*
- * For nodemask scrach area.
+ * For nodemask scratch area.
  * NODEMASK_ALLOC(type, name) allocates an object with a specified type and
  * name.
  */
@@ -528,7 +528,7 @@ static inline int node_random(const nodemask_t *mask)
 #define NODEMASK_FREE(m)			do {} while (0)
 #endif
 
-/* A example struture for using NODEMASK_ALLOC, used in mempolicy. */
+/* Example structure for using NODEMASK_ALLOC, used in mempolicy. */
 struct nodemask_scratch {
 	nodemask_t	mask1;
 	nodemask_t	mask2;
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 16c35a728b4c..ae16a9856305 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -213,7 +213,7 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
  * percpu_ref_get - increment a percpu refcount
  * @ref: percpu_ref to get
  *
- * Analagous to atomic_long_inc().
+ * Analogous to atomic_long_inc().
  *
  * This function is safe to call as long as @ref is between init and exit.
  */
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 6f70572b2938..ecf87484814f 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -474,7 +474,7 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter)
  * Iterates over sg entries mapping page-by-page.  On each successful
  * iteration, @miter->page points to the mapped page and
  * @miter->length bytes of data can be accessed at @miter->addr.  As
- * long as an interation is enclosed between start and stop, the user
+ * long as an iteration is enclosed between start and stop, the user
  * is free to choose control structure and when to stop.
  *
  * @miter->consumed is set to @miter->length on each iteration.  It
-- 
cgit v1.2.3


From 6d47c23b16aa78ff93a3050ccf4b1bd1c064b8b3 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 7 Jul 2021 18:07:59 -0700
Subject: set_memory: allow querying whether set_direct_map_*() is actually
 enabled

On arm64, set_direct_map_*() functions may return 0 without actually
changing the linear map.  This behaviour can be controlled using kernel
parameters, so we need a way to determine at runtime whether calls to
set_direct_map_invalid_noflush() and set_direct_map_default_noflush() have
any effect.

Extend set_memory API with can_set_direct_map() function that allows
checking if calling set_direct_map_*() will actually change the page
table, replace several occurrences of open coded checks in arm64 with the
new function and provide a generic stub for architectures that always
modify page tables upon calls to set_direct_map APIs.

[arnd@arndb.de: arm64: kfence: fix header inclusion ]

Link: https://lkml.kernel.org/r/20210518072034.31572-4-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Bottomley <jejb@linux.ibm.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Palmer Dabbelt <palmerdabbelt@google.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Will Deacon <will@kernel.org>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/Kbuild       |  1 -
 arch/arm64/include/asm/cacheflush.h |  6 ------
 arch/arm64/include/asm/kfence.h     |  2 +-
 arch/arm64/include/asm/set_memory.h | 17 +++++++++++++++++
 arch/arm64/kernel/machine_kexec.c   |  1 +
 arch/arm64/mm/mmu.c                 |  7 +++----
 arch/arm64/mm/pageattr.c            | 13 +++++++++----
 include/linux/set_memory.h          | 12 ++++++++++++
 8 files changed, 43 insertions(+), 16 deletions(-)
 create mode 100644 arch/arm64/include/asm/set_memory.h

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 26889dbfe904..64202010b700 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -3,7 +3,6 @@ generic-y += early_ioremap.h
 generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
-generic-y += set_memory.h
 generic-y += user.h
 
 generated-y += cpucaps.h
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 543c997eb3b7..5a228e203ef9 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -144,12 +144,6 @@ static __always_inline void icache_inval_all_pou(void)
 	dsb(ish);
 }
 
-int set_memory_valid(unsigned long addr, int numpages, int enable);
-
-int set_direct_map_invalid_noflush(struct page *page);
-int set_direct_map_default_noflush(struct page *page);
-bool kernel_page_present(struct page *page);
-
 #include <asm-generic/cacheflush.h>
 
 #endif /* __ASM_CACHEFLUSH_H */
diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
index d061176d57ea..aa855c6a0ae6 100644
--- a/arch/arm64/include/asm/kfence.h
+++ b/arch/arm64/include/asm/kfence.h
@@ -8,7 +8,7 @@
 #ifndef __ASM_KFENCE_H
 #define __ASM_KFENCE_H
 
-#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
 
 static inline bool arch_kfence_init_pool(void) { return true; }
 
diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
new file mode 100644
index 000000000000..0f740b781187
--- /dev/null
+++ b/arch/arm64/include/asm/set_memory.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _ASM_ARM64_SET_MEMORY_H
+#define _ASM_ARM64_SET_MEMORY_H
+
+#include <asm-generic/set_memory.h>
+
+bool can_set_direct_map(void);
+#define can_set_direct_map can_set_direct_map
+
+int set_memory_valid(unsigned long addr, int numpages, int enable);
+
+int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_default_noflush(struct page *page);
+bool kernel_page_present(struct page *page);
+
+#endif /* _ASM_ARM64_SET_MEMORY_H */
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 03ceabe4d912..213d56c14f60 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/kexec.h>
 #include <linux/page-flags.h>
+#include <linux/set_memory.h>
 #include <linux/smp.h>
 
 #include <asm/cacheflush.h>
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 595fde9a47dd..d74586508448 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -22,6 +22,7 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
+#include <linux/set_memory.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -515,8 +516,7 @@ static void __init map_mem(pgd_t *pgdp)
 	 */
 	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
 
-	if (rodata_full || crash_mem_map || debug_pagealloc_enabled() ||
-	    IS_ENABLED(CONFIG_KFENCE))
+	if (can_set_direct_map() || crash_mem_map || IS_ENABLED(CONFIG_KFENCE))
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	/*
@@ -1489,8 +1489,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	 * KFENCE requires linear map to be mapped at page granularity, so that
 	 * it is possible to protect/unprotect single pages in the KFENCE pool.
 	 */
-	if (rodata_full || debug_pagealloc_enabled() ||
-	    IS_ENABLED(CONFIG_KFENCE))
+	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 92eccaf595c8..a3bacd79507a 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -19,6 +19,11 @@ struct page_change_data {
 
 bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
 
+bool can_set_direct_map(void)
+{
+	return rodata_full || debug_pagealloc_enabled();
+}
+
 static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
 {
 	struct page_change_data *cdata = data;
@@ -155,7 +160,7 @@ int set_direct_map_invalid_noflush(struct page *page)
 		.clear_mask = __pgprot(PTE_VALID),
 	};
 
-	if (!debug_pagealloc_enabled() && !rodata_full)
+	if (!can_set_direct_map())
 		return 0;
 
 	return apply_to_page_range(&init_mm,
@@ -170,7 +175,7 @@ int set_direct_map_default_noflush(struct page *page)
 		.clear_mask = __pgprot(PTE_RDONLY),
 	};
 
-	if (!debug_pagealloc_enabled() && !rodata_full)
+	if (!can_set_direct_map())
 		return 0;
 
 	return apply_to_page_range(&init_mm,
@@ -181,7 +186,7 @@ int set_direct_map_default_noflush(struct page *page)
 #ifdef CONFIG_DEBUG_PAGEALLOC
 void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
-	if (!debug_pagealloc_enabled() && !rodata_full)
+	if (!can_set_direct_map())
 		return;
 
 	set_memory_valid((unsigned long)page_address(page), numpages, enable);
@@ -206,7 +211,7 @@ bool kernel_page_present(struct page *page)
 	pte_t *ptep;
 	unsigned long addr = (unsigned long)page_address(page);
 
-	if (!debug_pagealloc_enabled() && !rodata_full)
+	if (!can_set_direct_map())
 		return true;
 
 	pgdp = pgd_offset_k(addr);
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index fe1aa4e54680..f36be5166c19 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -28,7 +28,19 @@ static inline bool kernel_page_present(struct page *page)
 {
 	return true;
 }
+#else /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
+/*
+ * Some architectures, e.g. ARM64 can disable direct map modifications at
+ * boot time. Let them overrive this query.
+ */
+#ifndef can_set_direct_map
+static inline bool can_set_direct_map(void)
+{
+	return true;
+}
+#define can_set_direct_map can_set_direct_map
 #endif
+#endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
 
 #ifndef set_mce_nospec
 static inline int set_mce_nospec(unsigned long pfn, bool unmap)
-- 
cgit v1.2.3


From 1507f51255c9ff07d75909a84e7c0d7f3c4b2f49 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 7 Jul 2021 18:08:03 -0700
Subject: mm: introduce memfd_secret system call to create "secret" memory
 areas

Introduce "memfd_secret" system call with the ability to create memory
areas visible only in the context of the owning process and not mapped not
only to other processes but in the kernel page tables as well.

The secretmem feature is off by default and the user must explicitly
enable it at the boot time.

Once secretmem is enabled, the user will be able to create a file
descriptor using the memfd_secret() system call.  The memory areas created
by mmap() calls from this file descriptor will be unmapped from the kernel
direct map and they will be only mapped in the page table of the processes
that have access to the file descriptor.

Secretmem is designed to provide the following protections:

* Enhanced protection (in conjunction with all the other in-kernel
  attack prevention systems) against ROP attacks.  Seceretmem makes
  "simple" ROP insufficient to perform exfiltration, which increases the
  required complexity of the attack.  Along with other protections like
  the kernel stack size limit and address space layout randomization which
  make finding gadgets is really hard, absence of any in-kernel primitive
  for accessing secret memory means the one gadget ROP attack can't work.
  Since the only way to access secret memory is to reconstruct the missing
  mapping entry, the attacker has to recover the physical page and insert
  a PTE pointing to it in the kernel and then retrieve the contents.  That
  takes at least three gadgets which is a level of difficulty beyond most
  standard attacks.

* Prevent cross-process secret userspace memory exposures.  Once the
  secret memory is allocated, the user can't accidentally pass it into the
  kernel to be transmitted somewhere.  The secreremem pages cannot be
  accessed via the direct map and they are disallowed in GUP.

* Harden against exploited kernel flaws.  In order to access secretmem,
  a kernel-side attack would need to either walk the page tables and
  create new ones, or spawn a new privileged uiserspace process to perform
  secrets exfiltration using ptrace.

The file descriptor based memory has several advantages over the
"traditional" mm interfaces, such as mlock(), mprotect(), madvise().  File
descriptor approach allows explicit and controlled sharing of the memory
areas, it allows to seal the operations.  Besides, file descriptor based
memory paves the way for VMMs to remove the secret memory range from the
userspace hipervisor process, for instance QEMU.  Andy Lutomirski says:

  "Getting fd-backed memory into a guest will take some possibly major
  work in the kernel, but getting vma-backed memory into a guest without
  mapping it in the host user address space seems much, much worse."

memfd_secret() is made a dedicated system call rather than an extension to
memfd_create() because it's purpose is to allow the user to create more
secure memory mappings rather than to simply allow file based access to
the memory.  Nowadays a new system call cost is negligible while it is way
simpler for userspace to deal with a clear-cut system calls than with a
multiplexer or an overloaded syscall.  Moreover, the initial
implementation of memfd_secret() is completely distinct from
memfd_create() so there is no much sense in overloading memfd_create() to
begin with.  If there will be a need for code sharing between these
implementation it can be easily achieved without a need to adjust user
visible APIs.

The secret memory remains accessible in the process context using uaccess
primitives, but it is not exposed to the kernel otherwise; secret memory
areas are removed from the direct map and functions in the
follow_page()/get_user_page() family will refuse to return a page that
belongs to the secret memory area.

Once there will be a use case that will require exposing secretmem to the
kernel it will be an opt-in request in the system call flags so that user
would have to decide what data can be exposed to the kernel.

Removing of the pages from the direct map may cause its fragmentation on
architectures that use large pages to map the physical memory which
affects the system performance.  However, the original Kconfig text for
CONFIG_DIRECT_GBPAGES said that gigabyte pages in the direct map "...  can
improve the kernel's performance a tiny bit ..." (commit 00d1c5e05736
("x86: add gbpages switches")) and the recent report [1] showed that "...
although 1G mappings are a good default choice, there is no compelling
evidence that it must be the only choice".  Hence, it is sufficient to
have secretmem disabled by default with the ability of a system
administrator to enable it at boot time.

Pages in the secretmem regions are unevictable and unmovable to avoid
accidental exposure of the sensitive data via swap or during page
migration.

Since the secretmem mappings are locked in memory they cannot exceed
RLIMIT_MEMLOCK.  Since these mappings are already locked independently
from mlock(), an attempt to mlock()/munlock() secretmem range would fail
and mlockall()/munlockall() will ignore secretmem mappings.

However, unlike mlock()ed memory, secretmem currently behaves more like
long-term GUP: secretmem mappings are unmovable mappings directly consumed
by user space.  With default limits, there is no excessive use of
secretmem and it poses no real problem in combination with
ZONE_MOVABLE/CMA, but in the future this should be addressed to allow
balanced use of large amounts of secretmem along with ZONE_MOVABLE/CMA.

A page that was a part of the secret memory area is cleared when it is
freed to ensure the data is not exposed to the next user of that page.

The following example demonstrates creation of a secret mapping (error
handling is omitted):

	fd = memfd_secret(0);
	ftruncate(fd, MAP_SIZE);
	ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);

[1] https://lore.kernel.org/linux-mm/213b4567-46ce-f116-9cdf-bbd0c884eb3c@linux.intel.com/

[akpm@linux-foundation.org: suppress Kconfig whine]

Link: https://lkml.kernel.org/r/20210518072034.31572-5-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Hagen Paul Pfeifer <hagen@jauu.net>
Acked-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Bottomley <jejb@linux.ibm.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Palmer Dabbelt <palmerdabbelt@google.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Will Deacon <will@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/secretmem.h  |  48 +++++++++
 include/uapi/linux/magic.h |   1 +
 kernel/sys_ni.c            |   2 +
 mm/Kconfig                 |   4 +
 mm/Makefile                |   1 +
 mm/gup.c                   |  12 +++
 mm/mlock.c                 |   3 +-
 mm/secretmem.c             | 239 +++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 309 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/secretmem.h
 create mode 100644 mm/secretmem.c

(limited to 'include/linux')

diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
new file mode 100644
index 000000000000..e617b4afcc62
--- /dev/null
+++ b/include/linux/secretmem.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_SECRETMEM_H
+#define _LINUX_SECRETMEM_H
+
+#ifdef CONFIG_SECRETMEM
+
+extern const struct address_space_operations secretmem_aops;
+
+static inline bool page_is_secretmem(struct page *page)
+{
+	struct address_space *mapping;
+
+	/*
+	 * Using page_mapping() is quite slow because of the actual call
+	 * instruction and repeated compound_head(page) inside the
+	 * page_mapping() function.
+	 * We know that secretmem pages are not compound and LRU so we can
+	 * save a couple of cycles here.
+	 */
+	if (PageCompound(page) || !PageLRU(page))
+		return false;
+
+	mapping = (struct address_space *)
+		((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
+
+	if (mapping != page->mapping)
+		return false;
+
+	return mapping->a_ops == &secretmem_aops;
+}
+
+bool vma_is_secretmem(struct vm_area_struct *vma);
+
+#else
+
+static inline bool vma_is_secretmem(struct vm_area_struct *vma)
+{
+	return false;
+}
+
+static inline bool page_is_secretmem(struct page *page)
+{
+	return false;
+}
+
+#endif /* CONFIG_SECRETMEM */
+
+#endif /* _LINUX_SECRETMEM_H */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index f3956fc11de6..35687dcb1a42 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -97,5 +97,6 @@
 #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
 #define Z3FOLD_MAGIC		0x33
 #define PPC_CMM_MAGIC		0xc7571590
+#define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index dad4d994641e..30971b1dd4a9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -358,6 +358,8 @@ COND_SYSCALL(pkey_mprotect);
 COND_SYSCALL(pkey_alloc);
 COND_SYSCALL(pkey_free);
 
+/* memfd_secret */
+COND_SYSCALL(memfd_secret);
 
 /*
  * Architecture specific weak syscall entries.
diff --git a/mm/Kconfig b/mm/Kconfig
index a02498c0e13d..40a9bfcd5062 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -885,4 +885,8 @@ config KMAP_LOCAL
 # struct io_mapping based helper.  Selected by drivers that need them
 config IO_MAPPING
 	bool
+
+config SECRETMEM
+	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 74b47c354682..e3436741d539 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_SECRETMEM) += secretmem.o
 obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
diff --git a/mm/gup.c b/mm/gup.c
index 728d996767cb..42b8b1fa6521 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/secretmem.h>
 
 #include <linux/sched/signal.h>
 #include <linux/rwsem.h>
@@ -855,6 +856,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	struct follow_page_context ctx = { NULL };
 	struct page *page;
 
+	if (vma_is_secretmem(vma))
+		return NULL;
+
 	page = follow_page_mask(vma, address, foll_flags, &ctx);
 	if (ctx.pgmap)
 		put_dev_pagemap(ctx.pgmap);
@@ -988,6 +992,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
 		return -EOPNOTSUPP;
 
+	if (vma_is_secretmem(vma))
+		return -EFAULT;
+
 	if (write) {
 		if (!(vm_flags & VM_WRITE)) {
 			if (!(gup_flags & FOLL_FORCE))
@@ -2170,6 +2177,11 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 		if (!head)
 			goto pte_unmap;
 
+		if (unlikely(page_is_secretmem(page))) {
+			put_compound_head(head, 1, flags);
+			goto pte_unmap;
+		}
+
 		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 			put_compound_head(head, 1, flags);
 			goto pte_unmap;
diff --git a/mm/mlock.c b/mm/mlock.c
index 0d639bf48794..16d2ee160d43 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -23,6 +23,7 @@
 #include <linux/hugetlb.h>
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h>
+#include <linux/secretmem.h>
 
 #include "internal.h"
 
@@ -503,7 +504,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
 	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
-	    vma_is_dax(vma))
+	    vma_is_dax(vma) || vma_is_secretmem(vma))
 		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
 		goto out;
 
diff --git a/mm/secretmem.c b/mm/secretmem.c
new file mode 100644
index 000000000000..972cd1bbc3cc
--- /dev/null
+++ b/mm/secretmem.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2021
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+#include <linux/mount.h>
+#include <linux/memfd.h>
+#include <linux/bitops.h>
+#include <linux/printk.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/pseudo_fs.h>
+#include <linux/secretmem.h>
+#include <linux/set_memory.h>
+#include <linux/sched/signal.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "secretmem: " fmt
+
+/*
+ * Define mode and flag masks to allow validation of the system call
+ * parameters.
+ */
+#define SECRETMEM_MODE_MASK	(0x0)
+#define SECRETMEM_FLAGS_MASK	SECRETMEM_MODE_MASK
+
+static bool secretmem_enable __ro_after_init;
+module_param_named(enable, secretmem_enable, bool, 0400);
+MODULE_PARM_DESC(secretmem_enable,
+		 "Enable secretmem and memfd_secret(2) system call");
+
+static vm_fault_t secretmem_fault(struct vm_fault *vmf)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	pgoff_t offset = vmf->pgoff;
+	gfp_t gfp = vmf->gfp_mask;
+	unsigned long addr;
+	struct page *page;
+	int err;
+
+	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+		return vmf_error(-EINVAL);
+
+retry:
+	page = find_lock_page(mapping, offset);
+	if (!page) {
+		page = alloc_page(gfp | __GFP_ZERO);
+		if (!page)
+			return VM_FAULT_OOM;
+
+		err = set_direct_map_invalid_noflush(page);
+		if (err) {
+			put_page(page);
+			return vmf_error(err);
+		}
+
+		__SetPageUptodate(page);
+		err = add_to_page_cache_lru(page, mapping, offset, gfp);
+		if (unlikely(err)) {
+			put_page(page);
+			/*
+			 * If a split of large page was required, it
+			 * already happened when we marked the page invalid
+			 * which guarantees that this call won't fail
+			 */
+			set_direct_map_default_noflush(page);
+			if (err == -EEXIST)
+				goto retry;
+
+			return vmf_error(err);
+		}
+
+		addr = (unsigned long)page_address(page);
+		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+	}
+
+	vmf->page = page;
+	return VM_FAULT_LOCKED;
+}
+
+static const struct vm_operations_struct secretmem_vm_ops = {
+	.fault = secretmem_fault,
+};
+
+static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long len = vma->vm_end - vma->vm_start;
+
+	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+		return -EINVAL;
+
+	if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+		return -EAGAIN;
+
+	vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
+	vma->vm_ops = &secretmem_vm_ops;
+
+	return 0;
+}
+
+bool vma_is_secretmem(struct vm_area_struct *vma)
+{
+	return vma->vm_ops == &secretmem_vm_ops;
+}
+
+static const struct file_operations secretmem_fops = {
+	.mmap		= secretmem_mmap,
+};
+
+static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode)
+{
+	return false;
+}
+
+static int secretmem_migratepage(struct address_space *mapping,
+				 struct page *newpage, struct page *page,
+				 enum migrate_mode mode)
+{
+	return -EBUSY;
+}
+
+static void secretmem_freepage(struct page *page)
+{
+	set_direct_map_default_noflush(page);
+	clear_highpage(page);
+}
+
+const struct address_space_operations secretmem_aops = {
+	.freepage	= secretmem_freepage,
+	.migratepage	= secretmem_migratepage,
+	.isolate_page	= secretmem_isolate_page,
+};
+
+static struct vfsmount *secretmem_mnt;
+
+static struct file *secretmem_file_create(unsigned long flags)
+{
+	struct file *file = ERR_PTR(-ENOMEM);
+	struct inode *inode;
+
+	inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
+				 O_RDWR, &secretmem_fops);
+	if (IS_ERR(file))
+		goto err_free_inode;
+
+	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+	mapping_set_unevictable(inode->i_mapping);
+
+	inode->i_mapping->a_ops = &secretmem_aops;
+
+	/* pretend we are a normal file with zero size */
+	inode->i_mode |= S_IFREG;
+	inode->i_size = 0;
+
+	return file;
+
+err_free_inode:
+	iput(inode);
+	return file;
+}
+
+SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
+{
+	struct file *file;
+	int fd, err;
+
+	/* make sure local flags do not confict with global fcntl.h */
+	BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
+
+	if (!secretmem_enable)
+		return -ENOSYS;
+
+	if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	file = secretmem_file_create(flags);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_put_fd;
+	}
+
+	file->f_flags |= O_LARGEFILE;
+
+	fd_install(fd, file);
+	return fd;
+
+err_put_fd:
+	put_unused_fd(fd);
+	return err;
+}
+
+static int secretmem_init_fs_context(struct fs_context *fc)
+{
+	return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type secretmem_fs = {
+	.name		= "secretmem",
+	.init_fs_context = secretmem_init_fs_context,
+	.kill_sb	= kill_anon_super,
+};
+
+static int secretmem_init(void)
+{
+	int ret = 0;
+
+	if (!secretmem_enable)
+		return ret;
+
+	secretmem_mnt = kern_mount(&secretmem_fs);
+	if (IS_ERR(secretmem_mnt))
+		ret = PTR_ERR(secretmem_mnt);
+
+	/* prevent secretmem mappings from ever getting PROT_EXEC */
+	secretmem_mnt->mnt_flags |= MNT_NOEXEC;
+
+	return ret;
+}
+fs_initcall(secretmem_init);
-- 
cgit v1.2.3


From 9a436f8ff6316c3c1a21a758e14ded930bd615d9 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 7 Jul 2021 18:08:07 -0700
Subject: PM: hibernate: disable when there are active secretmem users

It is unsafe to allow saving of secretmem areas to the hibernation
snapshot as they would be visible after the resume and this essentially
will defeat the purpose of secret memory mappings.

Prevent hibernation whenever there are active secret memory users.

Link: https://lkml.kernel.org/r/20210518072034.31572-6-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Bottomley <jejb@linux.ibm.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Palmer Dabbelt <palmerdabbelt@google.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Will Deacon <will@kernel.org>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/secretmem.h |  6 ++++++
 kernel/power/hibernate.c  |  5 ++++-
 mm/secretmem.c            | 15 +++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index e617b4afcc62..21c3771e6a56 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -30,6 +30,7 @@ static inline bool page_is_secretmem(struct page *page)
 }
 
 bool vma_is_secretmem(struct vm_area_struct *vma);
+bool secretmem_active(void);
 
 #else
 
@@ -43,6 +44,11 @@ static inline bool page_is_secretmem(struct page *page)
 	return false;
 }
 
+static inline bool secretmem_active(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_SECRETMEM */
 
 #endif /* _LINUX_SECRETMEM_H */
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index da0b41914177..559acef3fddb 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -31,6 +31,7 @@
 #include <linux/genhd.h>
 #include <linux/ktime.h>
 #include <linux/security.h>
+#include <linux/secretmem.h>
 #include <trace/events/power.h>
 
 #include "power.h"
@@ -81,7 +82,9 @@ void hibernate_release(void)
 
 bool hibernation_available(void)
 {
-	return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);
+	return nohibernate == 0 &&
+		!security_locked_down(LOCKDOWN_HIBERNATION) &&
+		!secretmem_active();
 }
 
 /**
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 972cd1bbc3cc..f77d25467a14 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -40,6 +40,13 @@ module_param_named(enable, secretmem_enable, bool, 0400);
 MODULE_PARM_DESC(secretmem_enable,
 		 "Enable secretmem and memfd_secret(2) system call");
 
+static atomic_t secretmem_users;
+
+bool secretmem_active(void)
+{
+	return !!atomic_read(&secretmem_users);
+}
+
 static vm_fault_t secretmem_fault(struct vm_fault *vmf)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -94,6 +101,12 @@ static const struct vm_operations_struct secretmem_vm_ops = {
 	.fault = secretmem_fault,
 };
 
+static int secretmem_release(struct inode *inode, struct file *file)
+{
+	atomic_dec(&secretmem_users);
+	return 0;
+}
+
 static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	unsigned long len = vma->vm_end - vma->vm_start;
@@ -116,6 +129,7 @@ bool vma_is_secretmem(struct vm_area_struct *vma)
 }
 
 static const struct file_operations secretmem_fops = {
+	.release	= secretmem_release,
 	.mmap		= secretmem_mmap,
 };
 
@@ -202,6 +216,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
 	file->f_flags |= O_LARGEFILE;
 
 	fd_install(fd, file);
+	atomic_inc(&secretmem_users);
 	return fd;
 
 err_put_fd:
-- 
cgit v1.2.3


From 7bb7f2ac24a028b20fca466b9633847b289b156a Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 7 Jul 2021 18:08:11 -0700
Subject: arch, mm: wire up memfd_secret system call where relevant

Wire up memfd_secret system call on architectures that define
ARCH_HAS_SET_DIRECT_MAP, namely arm64, risc-v and x86.

Link: https://lkml.kernel.org/r/20210518072034.31572-7-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Palmer Dabbelt <palmerdabbelt@google.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Bottomley <jejb@linux.ibm.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Will Deacon <will@kernel.org>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/uapi/asm/unistd.h   | 1 +
 arch/riscv/include/asm/unistd.h        | 1 +
 arch/x86/entry/syscalls/syscall_32.tbl | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 include/linux/syscalls.h               | 1 +
 include/uapi/asm-generic/unistd.h      | 7 ++++++-
 scripts/checksyscalls.sh               | 4 ++++
 7 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h
index f83a70e07df8..ce2ee8f1e361 100644
--- a/arch/arm64/include/uapi/asm/unistd.h
+++ b/arch/arm64/include/uapi/asm/unistd.h
@@ -20,5 +20,6 @@
 #define __ARCH_WANT_SET_GET_RLIMIT
 #define __ARCH_WANT_TIME32_SYSCALLS
 #define __ARCH_WANT_SYS_CLONE3
+#define __ARCH_WANT_MEMFD_SECRET
 
 #include <asm-generic/unistd.h>
diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h
index 977ee6181dab..6c316093a1e5 100644
--- a/arch/riscv/include/asm/unistd.h
+++ b/arch/riscv/include/asm/unistd.h
@@ -9,6 +9,7 @@
  */
 
 #define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_MEMFD_SECRET
 
 #include <uapi/asm/unistd.h>
 
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index fba2f615119a..ce763a12311c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -451,3 +451,4 @@
 444	i386	landlock_create_ruleset	sys_landlock_create_ruleset
 445	i386	landlock_add_rule	sys_landlock_add_rule
 446	i386	landlock_restrict_self	sys_landlock_restrict_self
+447	i386	memfd_secret		sys_memfd_secret
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index af973e400053..f6b57799c1ea 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -368,6 +368,7 @@
 444	common	landlock_create_ruleset	sys_landlock_create_ruleset
 445	common	landlock_add_rule	sys_landlock_add_rule
 446	common	landlock_restrict_self	sys_landlock_restrict_self
+447	common	memfd_secret		sys_memfd_secret
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 586128d5c3b8..69c9a7010081 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1050,6 +1050,7 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _
 asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type,
 		const void __user *rule_attr, __u32 flags);
 asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
+asmlinkage long sys_memfd_secret(unsigned int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f211961ce1da..a9d6fcd95f42 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -873,8 +873,13 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
 #define __NR_landlock_restrict_self 446
 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
 
+#ifdef __ARCH_WANT_MEMFD_SECRET
+#define __NR_memfd_secret 447
+__SYSCALL(__NR_memfd_secret, sys_memfd_secret)
+#endif
+
 #undef __NR_syscalls
-#define __NR_syscalls 447
+#define __NR_syscalls 448
 
 /*
  * 32 bit systems traditionally used different
diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh
index a18b47695f55..b7609958ee36 100755
--- a/scripts/checksyscalls.sh
+++ b/scripts/checksyscalls.sh
@@ -40,6 +40,10 @@ cat << EOF
 #define __IGNORE_setrlimit	/* setrlimit */
 #endif
 
+#ifndef __ARCH_WANT_MEMFD_SECRET
+#define __IGNORE_memfd_secret
+#endif
+
 /* Missing flags argument */
 #define __IGNORE_renameat	/* renameat2 */
 
-- 
cgit v1.2.3


From 06c8839815ac7aa2b44ea3bb3ee1820b08418f55 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 7 Jul 2021 18:08:19 -0700
Subject: mm: fix spelling mistakes in header files

Fix some spelling mistakes in comments:
successfull ==> successful
potentialy ==> potentially
alloced ==> allocated
indicies ==> indices
wont ==> won't
resposible ==> responsible
dirtyness ==> dirtiness
droppped ==> dropped
alread ==> already
occured ==> occurred
interupts ==> interrupts
extention ==> extension
slighly ==> slightly
Dont't ==> Don't

Link: https://lkml.kernel.org/r/20210531034849.9549-2-thunder.leizhen@huawei.com
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h   | 4 ++--
 include/linux/hmm.h          | 2 +-
 include/linux/hugetlb.h      | 6 +++---
 include/linux/list_lru.h     | 4 ++--
 include/linux/mmu_notifier.h | 8 ++++----
 include/linux/percpu-defs.h  | 2 +-
 include/linux/shrinker.h     | 2 +-
 include/linux/vmalloc.h      | 4 ++--
 8 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 4221888bdcd6..c24098c7acca 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -35,12 +35,12 @@ enum compact_result {
 	COMPACT_CONTINUE,
 
 	/*
-	 * The full zone was compacted scanned but wasn't successfull to compact
+	 * The full zone was compacted scanned but wasn't successful to compact
 	 * suitable pages.
 	 */
 	COMPACT_COMPLETE,
 	/*
-	 * direct compaction has scanned part of the zone but wasn't successfull
+	 * direct compaction has scanned part of the zone but wasn't successful
 	 * to compact suitable pages.
 	 */
 	COMPACT_PARTIAL_SKIPPED,
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 866a0fa104c4..2fd2e91d5107 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -113,7 +113,7 @@ int hmm_range_fault(struct hmm_range *range);
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
  *
  * When waiting for mmu notifiers we need some kind of time out otherwise we
- * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to
+ * could potentially wait for ever, 1000ms ie 1s sounds like a long time to
  * wait already.
  */
 #define HMM_RANGE_DEFAULT_TIMEOUT 1000
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e0f32f935bd..f7ca1a3870ea 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -51,7 +51,7 @@ struct hugepage_subpool {
 	long count;
 	long max_hpages;	/* Maximum huge pages or -1 if no maximum. */
 	long used_hpages;	/* Used count against maximum, includes */
-				/* both alloced and reserved pages. */
+				/* both allocated and reserved pages. */
 	struct hstate *hstate;
 	long min_hpages;	/* Minimum huge pages or -1 if no minimum. */
 	long rsv_hpages;	/* Pages reserved against global pool to */
@@ -85,7 +85,7 @@ struct resv_map {
  * by a resv_map's lock.  The set of regions within the resv_map represent
  * reservations for huge pages, or huge pages that have already been
  * instantiated within the map.  The from and to elements are huge page
- * indicies into the associated mapping.  from indicates the starting index
+ * indices into the associated mapping.  from indicates the starting index
  * of the region.  to represents the first index past the end of  the region.
  *
  * For example, a file region structure with from == 0 and to == 4 represents
@@ -797,7 +797,7 @@ static inline bool hugepage_migration_supported(struct hstate *h)
  * It determines whether or not a huge page should be placed on
  * movable zone or not. Movability of any huge page should be
  * required only if huge page size is supported for migration.
- * There wont be any reason for the huge page to be movable if
+ * There won't be any reason for the huge page to be movable if
  * it is not migratable to start with. Also the size of the huge
  * page should be large enough to be placed under a movable zone
  * and still feasible enough to be migratable. Just the presence
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 9dcaa3e582c9..1b5fceb565df 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -146,7 +146,7 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
  * @memcg: the cgroup to scan from.
- * @isolate: callback function that is resposible for deciding what to do with
+ * @isolate: callback function that is responsible for deciding what to do with
  *  the item currently being scanned
  * @cb_arg: opaque type that will be passed to @isolate
  * @nr_to_walk: how many items to scan.
@@ -172,7 +172,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru,
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
  * @memcg: the cgroup to scan from.
- * @isolate: callback function that is resposible for deciding what to do with
+ * @isolate: callback function that is responsible for deciding what to do with
  *  the item currently being scanned
  * @cb_arg: opaque type that will be passed to @isolate
  * @nr_to_walk: how many items to scan.
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 6692da8d121d..45fc2c81e370 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -33,7 +33,7 @@ struct mmu_interval_notifier;
  *
  * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
  * access flags). User should soft dirty the page in the end callback to make
- * sure that anyone relying on soft dirtyness catch pages that might be written
+ * sure that anyone relying on soft dirtiness catch pages that might be written
  * through non CPU mappings.
  *
  * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
@@ -167,7 +167,7 @@ struct mmu_notifier_ops {
 	 * decrease the refcount. If the refcount is decreased on
 	 * invalidate_range_start() then the VM can free pages as page
 	 * table entries are removed.  If the refcount is only
-	 * droppped on invalidate_range_end() then the driver itself
+	 * dropped on invalidate_range_end() then the driver itself
 	 * will drop the last refcount but it must take care to flush
 	 * any secondary tlb before doing the final free on the
 	 * page. Pages will no longer be referenced by the linux
@@ -196,7 +196,7 @@ struct mmu_notifier_ops {
 	 * If invalidate_range() is used to manage a non-CPU TLB with
 	 * shared page-tables, it not necessary to implement the
 	 * invalidate_range_start()/end() notifiers, as
-	 * invalidate_range() alread catches the points in time when an
+	 * invalidate_range() already catches the points in time when an
 	 * external TLB range needs to be flushed. For more in depth
 	 * discussion on this see Documentation/vm/mmu_notifier.rst
 	 *
@@ -369,7 +369,7 @@ mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
  * mmu_interval_read_retry() will return true.
  *
  * False is not reliable and only suggests a collision may not have
- * occured. It can be called many times and does not have to hold the user
+ * occurred. It can be called many times and does not have to hold the user
  * provided lock.
  *
  * This call can be used as part of loops and other expensive operations to
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index dff7040f629a..af1071535de8 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -412,7 +412,7 @@ do {									\
  * instead.
  *
  * If there is no other protection through preempt disable and/or disabling
- * interupts then one of these RMW operations can show unexpected behavior
+ * interrupts then one of these RMW operations can show unexpected behavior
  * because the execution thread was rescheduled on another processor or an
  * interrupt occurred and the same percpu variable was modified from the
  * interrupt context.
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 1eac79ce57d4..9814fff58a69 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,7 +4,7 @@
 
 /*
  * This struct is used to pass information from page reclaim to the shrinkers.
- * We consolidate the values for easier extention later.
+ * We consolidate the values for easier extension later.
  *
  * The 'gfpmask' refers to the allocation we are currently trying to
  * fulfil.
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 1dabd6f22486..2644425b6dce 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -29,7 +29,7 @@ struct notifier_block;		/* in notifier.h */
 #define VM_NO_HUGE_VMAP		0x00000400	/* force PAGE_SIZE pte mapping */
 
 /*
- * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
+ * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC.
  *
  * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
  * shadow memory has been mapped. It's used to handle allocation errors so that
@@ -247,7 +247,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
 extern long vread(char *buf, char *addr, unsigned long count);
 
 /*
- *	Internals.  Dont't use..
+ *	Internals.  Don't use..
  */
 extern struct list_head vmap_area_list;
 extern __init void vm_area_add_early(struct vm_struct *vm);
-- 
cgit v1.2.3


From 5748fbc533a32459582535b759887c45ca0fe556 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 7 Jul 2021 18:08:22 -0700
Subject: mm: add setup_initial_init_mm() helper

Patch series "init_mm: cleanup ARCH's text/data/brk setup code", v3.

Add setup_initial_init_mm() helper, then use it to cleanup the text, data
and brk setup code.

This patch (of 15):

Add setup_initial_init_mm() helper to setup kernel text, data and brk.

Link: https://lkml.kernel.org/r/20210608083418.137226-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20210608083418.137226-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Ungerer <gerg@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Ley Foon Tan <ley.foon.tan@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nick Hu <nickhu@andestech.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 3 +++
 mm/init-mm.c       | 9 +++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 788a0b1323d0..57453dba41b9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -238,6 +238,9 @@ int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 
 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
 
+void setup_initial_init_mm(void *start_code, void *end_code,
+			   void *end_data, void *brk);
+
 /*
  * Linux kernel virtual memory manager primitives.
  * The idea being to have a "virtual" mm in the same way
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 153162669f80..b4a6f38fb51d 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,3 +40,12 @@ struct mm_struct init_mm = {
 	.cpu_bitmap	= CPU_BITS_NONE,
 	INIT_MM_CONTEXT(init_mm)
 };
+
+void setup_initial_init_mm(void *start_code, void *end_code,
+			   void *end_data, void *brk)
+{
+	init_mm.start_code = (unsigned long)start_code;
+	init_mm.end_code = (unsigned long)end_code;
+	init_mm.end_data = (unsigned long)end_data;
+	init_mm.brk = (unsigned long)brk;
+}
-- 
cgit v1.2.3


From 7eaf3cf3b7c5a49b3ca60e1ceb3d1d7430cc9d0e Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 7 Jul 2021 18:09:10 -0700
Subject: buildid: add API to parse build ID out of buffer

Add an API that can parse the build ID out of a buffer, instead of a vma,
to support printing a kernel module's build ID for stack traces.

Link: https://lkml.kernel.org/r/20210511003845.2429846-3-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Evan Green <evgreen@chromium.org>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buildid.h |  1 +
 lib/buildid.c           | 50 ++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 40232f90db6e..ebce93f26d06 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -8,5 +8,6 @@
 
 int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
 		   __u32 *size);
+int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
 
 #endif
diff --git a/lib/buildid.c b/lib/buildid.c
index e014636ec3eb..6aea1c4e5e85 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -2,30 +2,23 @@
 
 #include <linux/buildid.h>
 #include <linux/elf.h>
+#include <linux/kernel.h>
 #include <linux/pagemap.h>
 
 #define BUILD_ID 3
+
 /*
  * Parse build id from the note segment. This logic can be shared between
  * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
  * identical.
  */
-static inline int parse_build_id(void *page_addr,
-				 unsigned char *build_id,
-				 __u32 *size,
-				 void *note_start,
-				 Elf32_Word note_size)
+static int parse_build_id_buf(unsigned char *build_id,
+			      __u32 *size,
+			      const void *note_start,
+			      Elf32_Word note_size)
 {
 	Elf32_Word note_offs = 0, new_offs;
 
-	/* check for overflow */
-	if (note_start < page_addr || note_start + note_size < note_start)
-		return -EINVAL;
-
-	/* only supports note that fits in the first page */
-	if (note_start + note_size > page_addr + PAGE_SIZE)
-		return -EINVAL;
-
 	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
 		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
 
@@ -50,9 +43,27 @@ static inline int parse_build_id(void *page_addr,
 			break;
 		note_offs = new_offs;
 	}
+
 	return -EINVAL;
 }
 
+static inline int parse_build_id(void *page_addr,
+				 unsigned char *build_id,
+				 __u32 *size,
+				 void *note_start,
+				 Elf32_Word note_size)
+{
+	/* check for overflow */
+	if (note_start < page_addr || note_start + note_size < note_start)
+		return -EINVAL;
+
+	/* only supports note that fits in the first page */
+	if (note_start + note_size > page_addr + PAGE_SIZE)
+		return -EINVAL;
+
+	return parse_build_id_buf(build_id, size, note_start, note_size);
+}
+
 /* Parse build ID from 32-bit ELF */
 static int get_build_id_32(void *page_addr, unsigned char *build_id,
 			   __u32 *size)
@@ -148,3 +159,16 @@ out:
 	put_page(page);
 	return ret;
 }
+
+/**
+ * build_id_parse_buf - Get build ID from a buffer
+ * @buf:      Elf note section(s) to parse
+ * @buf_size: Size of @buf in bytes
+ * @build_id: Build ID parsed from @buf, at least BUILD_ID_SIZE_MAX long
+ *
+ * Return: 0 on success, -EINVAL otherwise
+ */
+int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size)
+{
+	return parse_build_id_buf(build_id, NULL, buf, buf_size);
+}
-- 
cgit v1.2.3


From 83cc6fa0049d7c5333a53f4d959a9457340284ea Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 7 Jul 2021 18:09:13 -0700
Subject: buildid: stash away kernels build ID on init

Parse the kernel's build ID at initialization so that other code can print
a hex format string representation of the running kernel's build ID.  This
will be used in the kdump and dump_stack code so that developers can
easily locate the vmlinux debug symbols for a crash/stacktrace.

[swboyd@chromium.org: fix implicit declaration of init_vmlinux_build_id()]
  Link: https://lkml.kernel.org/r/CAE-0n51UjTbay8N9FXAyE7_aR2+ePrQnKSRJ0gbmRsXtcLBVaw@mail.gmail.com

Link: https://lkml.kernel.org/r/20210511003845.2429846-4-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Evan Green <evgreen@chromium.org>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buildid.h |  3 +++
 init/main.c             |  2 ++
 lib/buildid.c           | 15 +++++++++++++++
 3 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index ebce93f26d06..f375900cf9ed 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -10,4 +10,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
 		   __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
 
+extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX];
+void init_vmlinux_build_id(void);
+
 #endif
diff --git a/init/main.c b/init/main.c
index af521b30a3b8..f5b8246e8aa1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -45,6 +45,7 @@
 #include <linux/srcu.h>
 #include <linux/moduleparam.h>
 #include <linux/kallsyms.h>
+#include <linux/buildid.h>
 #include <linux/writeback.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -913,6 +914,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 	set_task_stack_end_magic(&init_task);
 	smp_setup_processor_id();
 	debug_objects_early_init();
+	init_vmlinux_build_id();
 
 	cgroup_init_early();
 
diff --git a/lib/buildid.c b/lib/buildid.c
index 6aea1c4e5e85..1103ed46214f 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/buildid.h>
+#include <linux/cache.h>
 #include <linux/elf.h>
 #include <linux/kernel.h>
 #include <linux/pagemap.h>
@@ -172,3 +173,17 @@ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size)
 {
 	return parse_build_id_buf(build_id, NULL, buf, buf_size);
 }
+
+unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init;
+
+/**
+ * init_vmlinux_build_id - Compute and stash the running kernel's build ID
+ */
+void __init init_vmlinux_build_id(void)
+{
+	extern const void __start_notes __weak;
+	extern const void __stop_notes __weak;
+	unsigned int size = &__stop_notes - &__start_notes;
+
+	build_id_parse_buf(&__start_notes, vmlinux_build_id, size);
+}
-- 
cgit v1.2.3


From 22f4e66df79d0a730fcd6c17f3403b5ab8c72ced Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 7 Jul 2021 18:09:17 -0700
Subject: dump_stack: add vmlinux build ID to stack traces

Add the running kernel's build ID[1] to the stacktrace information header.
This makes it simpler for developers to locate the vmlinux with full
debuginfo for a particular kernel stacktrace.  Combined with
scripts/decode_stracktrace.sh, a developer can download the correct
vmlinux from a debuginfod[2] server and find the exact file and line
number for the functions plus offsets in a stacktrace.

This is especially useful for pstore crash debugging where the kernel
crashes are recorded in the pstore logs and the recovery kernel is
different or the debuginfo doesn't exist on the device due to space
concerns (the data can be large and a security concern).  The stacktrace
can be analyzed after the crash by using the build ID to find the matching
vmlinux and understand where in the function something went wrong.

Example stacktrace from lkdtm:

 WARNING: CPU: 4 PID: 3255 at drivers/misc/lkdtm/bugs.c:83 lkdtm_WARNING+0x28/0x30 [lkdtm]
 Modules linked in: lkdtm rfcomm algif_hash algif_skcipher af_alg xt_cgroup uinput xt_MASQUERADE
 CPU: 4 PID: 3255 Comm: bash Not tainted 5.11 #3 aa23f7a1231c229de205662d5a9e0d4c580f19a1
 Hardware name: Google Lazor (rev3+) with KB Backlight (DT)
 pstate: 00400009 (nzcv daif +PAN -UAO -TCO BTYPE=--)
 pc : lkdtm_WARNING+0x28/0x30 [lkdtm]

The hex string aa23f7a1231c229de205662d5a9e0d4c580f19a1 is the build ID,
following the kernel version number. Put it all behind a config option,
STACKTRACE_BUILD_ID, so that kernel developers can remove this
information if they decide it is too much.

Link: https://lkml.kernel.org/r/20210511003845.2429846-5-swboyd@chromium.org
Link: https://fedoraproject.org/wiki/Releases/FeatureBuildId [1]
Link: https://sourceware.org/elfutils/Debuginfod.html [2]
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Evan Green <evgreen@chromium.org>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buildid.h |  4 ++++
 lib/Kconfig.debug       | 11 +++++++++++
 lib/buildid.c           |  2 ++
 lib/dump_stack.c        | 13 +++++++++++--
 4 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index f375900cf9ed..3e8d77a93ec4 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -10,7 +10,11 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
 		   __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
 
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
 extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX];
 void init_vmlinux_build_id(void);
+#else
+static inline void init_vmlinux_build_id(void) { }
+#endif
 
 #endif
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4cd6af3f89b3..2987925efe7d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -35,6 +35,17 @@ config PRINTK_CALLER
 	  no option to enable/disable at the kernel command line parameter or
 	  sysfs interface.
 
+config STACKTRACE_BUILD_ID
+	bool "Show build ID information in stacktraces"
+	depends on PRINTK
+	help
+	  Selecting this option adds build ID information for symbols in
+	  stacktraces printed with the printk format '%p[SR]b'.
+
+	  This option is intended for distros where debuginfo is not easily
+	  accessible but can be downloaded given the build ID of the vmlinux or
+	  kernel module where the function is located.
+
 config CONSOLE_LOGLEVEL_DEFAULT
 	int "Default console loglevel (1-15)"
 	range 1 15
diff --git a/lib/buildid.c b/lib/buildid.c
index 1103ed46214f..6f1e2903740b 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -174,6 +174,7 @@ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size)
 	return parse_build_id_buf(build_id, NULL, buf, buf_size);
 }
 
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
 unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init;
 
 /**
@@ -187,3 +188,4 @@ void __init init_vmlinux_build_id(void)
 
 	build_id_parse_buf(&__start_notes, vmlinux_build_id, size);
 }
+#endif
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index 27f16872320d..cd3387bb34e5 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/buildid.h>
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
@@ -36,6 +37,14 @@ void __init dump_stack_set_arch_desc(const char *fmt, ...)
 	va_end(args);
 }
 
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+#define BUILD_ID_FMT " %20phN"
+#define BUILD_ID_VAL vmlinux_build_id
+#else
+#define BUILD_ID_FMT "%s"
+#define BUILD_ID_VAL ""
+#endif
+
 /**
  * dump_stack_print_info - print generic debug info for dump_stack()
  * @log_lvl: log level
@@ -45,13 +54,13 @@ void __init dump_stack_set_arch_desc(const char *fmt, ...)
  */
 void dump_stack_print_info(const char *log_lvl)
 {
-	printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s\n",
+	printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s" BUILD_ID_FMT "\n",
 	       log_lvl, raw_smp_processor_id(), current->pid, current->comm,
 	       kexec_crash_loaded() ? "Kdump: loaded " : "",
 	       print_tainted(),
 	       init_utsname()->release,
 	       (int)strcspn(init_utsname()->version, " "),
-	       init_utsname()->version);
+	       init_utsname()->version, BUILD_ID_VAL);
 
 	if (dump_stack_arch_desc_str[0] != '\0')
 		printk("%sHardware name: %s\n",
-- 
cgit v1.2.3


From 9294523e3768030ae8afb84110bcecc66425a647 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 7 Jul 2021 18:09:20 -0700
Subject: module: add printk formats to add module build ID to stacktraces

Let's make kernel stacktraces easier to identify by including the build
ID[1] of a module if the stacktrace is printing a symbol from a module.
This makes it simpler for developers to locate a kernel module's full
debuginfo for a particular stacktrace.  Combined with
scripts/decode_stracktrace.sh, a developer can download the matching
debuginfo from a debuginfod[2] server and find the exact file and line
number for the functions plus offsets in a stacktrace that match the
module.  This is especially useful for pstore crash debugging where the
kernel crashes are recorded in something like console-ramoops and the
recovery kernel/modules are different or the debuginfo doesn't exist on
the device due to space concerns (the debuginfo can be too large for space
limited devices).

Originally, I put this on the %pS format, but that was quickly rejected
given that %pS is used in other places such as ftrace where build IDs
aren't meaningful.  There was some discussions on the list to put every
module build ID into the "Modules linked in:" section of the stacktrace
message but that quickly becomes very hard to read once you have more than
three or four modules linked in.  It also provides too much information
when we don't expect each module to be traversed in a stacktrace.  Having
the build ID for modules that aren't important just makes things messy.
Splitting it to multiple lines for each module quickly explodes the number
of lines printed in an oops too, possibly wrapping the warning off the
console.  And finally, trying to stash away each module used in a
callstack to provide the ID of each symbol printed is cumbersome and would
require changes to each architecture to stash away modules and return
their build IDs once unwinding has completed.

Instead, we opt for the simpler approach of introducing new printk formats
'%pS[R]b' for "pointer symbolic backtrace with module build ID" and '%pBb'
for "pointer backtrace with module build ID" and then updating the few
places in the architecture layer where the stacktrace is printed to use
this new format.

Before:

 Call trace:
  lkdtm_WARNING+0x28/0x30 [lkdtm]
  direct_entry+0x16c/0x1b4 [lkdtm]
  full_proxy_write+0x74/0xa4
  vfs_write+0xec/0x2e8

After:

 Call trace:
  lkdtm_WARNING+0x28/0x30 [lkdtm 6c2215028606bda50de823490723dc4bc5bf46f9]
  direct_entry+0x16c/0x1b4 [lkdtm 6c2215028606bda50de823490723dc4bc5bf46f9]
  full_proxy_write+0x74/0xa4
  vfs_write+0xec/0x2e8

[akpm@linux-foundation.org: fix build with CONFIG_MODULES=n, tweak code layout]
[rdunlap@infradead.org: fix build when CONFIG_MODULES is not set]
  Link: https://lkml.kernel.org/r/20210513171510.20328-1-rdunlap@infradead.org
[akpm@linux-foundation.org: make kallsyms_lookup_buildid() static]
[cuibixuan@huawei.com: fix build error when CONFIG_SYSFS is disabled]
  Link: https://lkml.kernel.org/r/20210525105049.34804-1-cuibixuan@huawei.com

Link: https://lkml.kernel.org/r/20210511003845.2429846-6-swboyd@chromium.org
Link: https://fedoraproject.org/wiki/Releases/FeatureBuildId [1]
Link: https://sourceware.org/elfutils/Debuginfod.html [2]
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Bixuan Cui <cuibixuan@huawei.com>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Evan Green <evgreen@chromium.org>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/printk-formats.rst |  11 ++++
 include/linux/kallsyms.h                  |  21 +++++-
 include/linux/module.h                    |   9 ++-
 kernel/kallsyms.c                         | 104 ++++++++++++++++++++++++------
 kernel/module.c                           |  42 ++++++++++--
 lib/vsprintf.c                            |   8 ++-
 6 files changed, 166 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index 4346ae17a72c..d941717a191b 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -125,6 +125,17 @@ used when printing stack backtraces. The specifier takes into
 consideration the effect of compiler optimisations which may occur
 when tail-calls are used and marked with the noreturn GCC attribute.
 
+If the pointer is within a module, the module name and optionally build ID is
+printed after the symbol name with an extra ``b`` appended to the end of the
+specifier.
+
+::
+	%pS	versatile_init+0x0/0x110 [module_name]
+	%pSb	versatile_init+0x0/0x110 [module_name ed5019fdf5e53be37cb1ba7899292d7e143b259e]
+	%pSRb	versatile_init+0x9/0x110 [module_name ed5019fdf5e53be37cb1ba7899292d7e143b259e]
+		(with __builtin_extract_return_addr() translation)
+	%pBb	prev_fn_of_versatile_init+0x88/0x88 [module_name ed5019fdf5e53be37cb1ba7899292d7e143b259e]
+
 Probed Pointers from BPF / tracing
 ----------------------------------
 
diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 465060acc981..a1d6fc82d7f0 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -7,6 +7,7 @@
 #define _LINUX_KALLSYMS_H
 
 #include <linux/errno.h>
+#include <linux/buildid.h>
 #include <linux/kernel.h>
 #include <linux/stddef.h>
 #include <linux/mm.h>
@@ -15,8 +16,10 @@
 #include <asm/sections.h>
 
 #define KSYM_NAME_LEN 128
-#define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \
-			 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1)
+#define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \
+			(KSYM_NAME_LEN - 1) + \
+			2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \
+			(BUILD_ID_SIZE_MAX * 2) + 1)
 
 struct cred;
 struct module;
@@ -91,8 +94,10 @@ const char *kallsyms_lookup(unsigned long addr,
 
 /* Look up a kernel symbol and return it in a text buffer. */
 extern int sprint_symbol(char *buffer, unsigned long address);
+extern int sprint_symbol_build_id(char *buffer, unsigned long address);
 extern int sprint_symbol_no_offset(char *buffer, unsigned long address);
 extern int sprint_backtrace(char *buffer, unsigned long address);
+extern int sprint_backtrace_build_id(char *buffer, unsigned long address);
 
 int lookup_symbol_name(unsigned long addr, char *symname);
 int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name);
@@ -128,6 +133,12 @@ static inline int sprint_symbol(char *buffer, unsigned long addr)
 	return 0;
 }
 
+static inline int sprint_symbol_build_id(char *buffer, unsigned long address)
+{
+	*buffer = '\0';
+	return 0;
+}
+
 static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr)
 {
 	*buffer = '\0';
@@ -140,6 +151,12 @@ static inline int sprint_backtrace(char *buffer, unsigned long addr)
 	return 0;
 }
 
+static inline int sprint_backtrace_build_id(char *buffer, unsigned long addr)
+{
+	*buffer = '\0';
+	return 0;
+}
+
 static inline int lookup_symbol_name(unsigned long addr, char *symname)
 {
 	return -ERANGE;
diff --git a/include/linux/module.h b/include/linux/module.h
index 8100bb477d86..8a298d820dbc 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -11,6 +11,7 @@
 
 #include <linux/list.h>
 #include <linux/stat.h>
+#include <linux/buildid.h>
 #include <linux/compiler.h>
 #include <linux/cache.h>
 #include <linux/kmod.h>
@@ -369,6 +370,11 @@ struct module {
 	/* Unique handle for this module */
 	char name[MODULE_NAME_LEN];
 
+#ifdef CONFIG_STACKTRACE_BUILD_ID
+	/* Module build ID */
+	unsigned char build_id[BUILD_ID_SIZE_MAX];
+#endif
+
 	/* Sysfs stuff. */
 	struct module_kobject mkobj;
 	struct module_attribute *modinfo_attrs;
@@ -636,7 +642,7 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr);
 const char *module_address_lookup(unsigned long addr,
 			    unsigned long *symbolsize,
 			    unsigned long *offset,
-			    char **modname,
+			    char **modname, const unsigned char **modbuildid,
 			    char *namebuf);
 int lookup_module_symbol_name(unsigned long addr, char *symname);
 int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name);
@@ -740,6 +746,7 @@ static inline const char *module_address_lookup(unsigned long addr,
 					  unsigned long *symbolsize,
 					  unsigned long *offset,
 					  char **modname,
+					  const unsigned char **modbuildid,
 					  char *namebuf)
 {
 	return NULL;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index c851ca0ed357..0ba87982d017 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -25,7 +25,10 @@
 #include <linux/filter.h>
 #include <linux/ftrace.h>
 #include <linux/kprobes.h>
+#include <linux/build_bug.h>
 #include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
 
 /*
  * These will be re-linked against their real values
@@ -297,21 +300,14 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
 		get_symbol_pos(addr, symbolsize, offset);
 		return 1;
 	}
-	return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
+	return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) ||
 	       !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
 }
 
-/*
- * Lookup an address
- * - modname is set to NULL if it's in the kernel.
- * - We guarantee that the returned name is valid until we reschedule even if.
- *   It resides in a module.
- * - We also guarantee that modname will be valid until rescheduled.
- */
-const char *kallsyms_lookup(unsigned long addr,
-			    unsigned long *symbolsize,
-			    unsigned long *offset,
-			    char **modname, char *namebuf)
+static const char *kallsyms_lookup_buildid(unsigned long addr,
+			unsigned long *symbolsize,
+			unsigned long *offset, char **modname,
+			const unsigned char **modbuildid, char *namebuf)
 {
 	const char *ret;
 
@@ -327,6 +323,8 @@ const char *kallsyms_lookup(unsigned long addr,
 				       namebuf, KSYM_NAME_LEN);
 		if (modname)
 			*modname = NULL;
+		if (modbuildid)
+			*modbuildid = NULL;
 
 		ret = namebuf;
 		goto found;
@@ -334,7 +332,7 @@ const char *kallsyms_lookup(unsigned long addr,
 
 	/* See if it's in a module or a BPF JITed image. */
 	ret = module_address_lookup(addr, symbolsize, offset,
-				    modname, namebuf);
+				    modname, modbuildid, namebuf);
 	if (!ret)
 		ret = bpf_address_lookup(addr, symbolsize,
 					 offset, modname, namebuf);
@@ -348,6 +346,22 @@ found:
 	return ret;
 }
 
+/*
+ * Lookup an address
+ * - modname is set to NULL if it's in the kernel.
+ * - We guarantee that the returned name is valid until we reschedule even if.
+ *   It resides in a module.
+ * - We also guarantee that modname will be valid until rescheduled.
+ */
+const char *kallsyms_lookup(unsigned long addr,
+			    unsigned long *symbolsize,
+			    unsigned long *offset,
+			    char **modname, char *namebuf)
+{
+	return kallsyms_lookup_buildid(addr, symbolsize, offset, modname,
+				       NULL, namebuf);
+}
+
 int lookup_symbol_name(unsigned long addr, char *symname)
 {
 	int res;
@@ -404,15 +418,17 @@ found:
 
 /* Look up a kernel symbol and return it in a text buffer. */
 static int __sprint_symbol(char *buffer, unsigned long address,
-			   int symbol_offset, int add_offset)
+			   int symbol_offset, int add_offset, int add_buildid)
 {
 	char *modname;
+	const unsigned char *buildid;
 	const char *name;
 	unsigned long offset, size;
 	int len;
 
 	address += symbol_offset;
-	name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
+	name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid,
+				       buffer);
 	if (!name)
 		return sprintf(buffer, "0x%lx", address - symbol_offset);
 
@@ -424,8 +440,19 @@ static int __sprint_symbol(char *buffer, unsigned long address,
 	if (add_offset)
 		len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
 
-	if (modname)
-		len += sprintf(buffer + len, " [%s]", modname);
+	if (modname) {
+		len += sprintf(buffer + len, " [%s", modname);
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+		if (add_buildid && buildid) {
+			/* build ID should match length of sprintf */
+#if IS_ENABLED(CONFIG_MODULES)
+			static_assert(sizeof(typeof_member(struct module, build_id)) == 20);
+#endif
+			len += sprintf(buffer + len, " %20phN", buildid);
+		}
+#endif
+		len += sprintf(buffer + len, "]");
+	}
 
 	return len;
 }
@@ -443,10 +470,27 @@ static int __sprint_symbol(char *buffer, unsigned long address,
  */
 int sprint_symbol(char *buffer, unsigned long address)
 {
-	return __sprint_symbol(buffer, address, 0, 1);
+	return __sprint_symbol(buffer, address, 0, 1, 0);
 }
 EXPORT_SYMBOL_GPL(sprint_symbol);
 
+/**
+ * sprint_symbol_build_id - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size, module name and module build ID to @buffer if possible. If no
+ * symbol was found, just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol_build_id(char *buffer, unsigned long address)
+{
+	return __sprint_symbol(buffer, address, 0, 1, 1);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol_build_id);
+
 /**
  * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
  * @buffer: buffer to be stored
@@ -460,7 +504,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
  */
 int sprint_symbol_no_offset(char *buffer, unsigned long address)
 {
-	return __sprint_symbol(buffer, address, 0, 0);
+	return __sprint_symbol(buffer, address, 0, 0, 0);
 }
 EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
 
@@ -480,7 +524,27 @@ EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
  */
 int sprint_backtrace(char *buffer, unsigned long address)
 {
-	return __sprint_symbol(buffer, address, -1, 1);
+	return __sprint_symbol(buffer, address, -1, 1, 0);
+}
+
+/**
+ * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address. This function also appends the module build ID to
+ * the @buffer if @address is within a kernel module.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace_build_id(char *buffer, unsigned long address)
+{
+	return __sprint_symbol(buffer, address, -1, 1, 1);
 }
 
 /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
diff --git a/kernel/module.c b/kernel/module.c
index 64bd61b2d3ad..ed13917ea5f3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -13,6 +13,7 @@
 #include <linux/trace_events.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
+#include <linux/buildid.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/sysfs.h>
@@ -1465,6 +1466,13 @@ resolve_symbol_wait(struct module *mod,
 	return ksym;
 }
 
+#ifdef CONFIG_KALLSYMS
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#endif
+
 /*
  * /sys/module/foo/sections stuff
  * J. Corbet <corbet@lwn.net>
@@ -1472,11 +1480,6 @@ resolve_symbol_wait(struct module *mod,
 #ifdef CONFIG_SYSFS
 
 #ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
-	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-
 struct module_sect_attr {
 	struct bin_attribute battr;
 	unsigned long address;
@@ -2797,6 +2800,26 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
 }
 #endif /* CONFIG_KALLSYMS */
 
+#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+static void init_build_id(struct module *mod, const struct load_info *info)
+{
+	const Elf_Shdr *sechdr;
+	unsigned int i;
+
+	for (i = 0; i < info->hdr->e_shnum; i++) {
+		sechdr = &info->sechdrs[i];
+		if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+		    !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+					sechdr->sh_size))
+			break;
+	}
+}
+#else
+static void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
 static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
 {
 	if (!debug)
@@ -4021,6 +4044,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 		goto free_arch_cleanup;
 	}
 
+	init_build_id(mod, info);
 	dynamic_debug_setup(mod, info->debug, info->num_debug);
 
 	/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
@@ -4254,6 +4278,7 @@ const char *module_address_lookup(unsigned long addr,
 			    unsigned long *size,
 			    unsigned long *offset,
 			    char **modname,
+			    const unsigned char **modbuildid,
 			    char *namebuf)
 {
 	const char *ret = NULL;
@@ -4264,6 +4289,13 @@ const char *module_address_lookup(unsigned long addr,
 	if (mod) {
 		if (modname)
 			*modname = mod->name;
+		if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+			*modbuildid = mod->build_id;
+#else
+			*modbuildid = NULL;
+#endif
+		}
 
 		ret = find_kallsyms_symbol(mod, addr, size, offset);
 	}
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 77ba1c40a99d..26c83943748a 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -993,8 +993,12 @@ char *symbol_string(char *buf, char *end, void *ptr,
 	value = (unsigned long)ptr;
 
 #ifdef CONFIG_KALLSYMS
-	if (*fmt == 'B')
+	if (*fmt == 'B' && fmt[1] == 'b')
+		sprint_backtrace_build_id(sym, value);
+	else if (*fmt == 'B')
 		sprint_backtrace(sym, value);
+	else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b')))
+		sprint_symbol_build_id(sym, value);
 	else if (*fmt != 's')
 		sprint_symbol(sym, value);
 	else
@@ -2263,9 +2267,11 @@ early_param("no_hash_pointers", no_hash_pointers_enable);
  * - 'S' For symbolic direct pointers (or function descriptors) with offset
  * - 's' For symbolic direct pointers (or function descriptors) without offset
  * - '[Ss]R' as above with __builtin_extract_return_addr() translation
+ * - 'S[R]b' as above with module build ID (for use in backtraces)
  * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of
  *	    %ps and %pS. Be careful when re-using these specifiers.
  * - 'B' For backtraced symbolic direct pointers with offset
+ * - 'Bb' as above with module build ID (for use in backtraces)
  * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref]
  * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201]
  * - 'b[l]' For a bitmap, the number of bits is determined by the field
-- 
cgit v1.2.3


From 44e8a5e9120bf4fc1ab046b648b0598e6652c36e Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 7 Jul 2021 18:09:49 -0700
Subject: kdump: use vmlinux_build_id to simplify

We can use the vmlinux_build_id array here now instead of open coding it.
This mostly consolidates code.

Link: https://lkml.kernel.org/r/20210511003845.2429846-14-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Evan Green <evgreen@chromium.org>
Cc: Hsin-Yi Wang <hsinyi@chromium.org>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buildid.h    |  2 +-
 include/linux/crash_core.h | 12 +++++------
 kernel/crash_core.c        | 50 ++--------------------------------------------
 lib/buildid.c              |  2 +-
 4 files changed, 10 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 3e8d77a93ec4..3b7a0ff4642f 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -10,7 +10,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
 		   __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
 
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE)
 extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX];
 void init_vmlinux_build_id(void);
 #else
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 206bde8308b2..de62a722431e 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -38,8 +38,12 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 
 #define VMCOREINFO_OSRELEASE(value) \
 	vmcoreinfo_append_str("OSRELEASE=%s\n", value)
-#define VMCOREINFO_BUILD_ID(value) \
-	vmcoreinfo_append_str("BUILD-ID=%s\n", value)
+#define VMCOREINFO_BUILD_ID()						\
+	({								\
+		static_assert(sizeof(vmlinux_build_id) == 20);		\
+		vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \
+	})
+
 #define VMCOREINFO_PAGESIZE(value) \
 	vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
 #define VMCOREINFO_SYMBOL(name) \
@@ -69,10 +73,6 @@ extern unsigned char *vmcoreinfo_data;
 extern size_t vmcoreinfo_size;
 extern u32 *vmcoreinfo_note;
 
-/* raw contents of kernel .notes section */
-extern const void __start_notes __weak;
-extern const void __stop_notes __weak;
-
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index da449c1cdca7..eb53f5ec62c9 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
  */
 
+#include <linux/buildid.h>
 #include <linux/crash_core.h>
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
@@ -378,53 +379,6 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
 }
 EXPORT_SYMBOL(paddr_vmcoreinfo_note);
 
-#define NOTES_SIZE (&__stop_notes - &__start_notes)
-#define BUILD_ID_MAX SHA1_DIGEST_SIZE
-#define NT_GNU_BUILD_ID 3
-
-struct elf_note_section {
-	struct elf_note	n_hdr;
-	u8 n_data[];
-};
-
-/*
- * Add build ID from .notes section as generated by the GNU ld(1)
- * or LLVM lld(1) --build-id option.
- */
-static void add_build_id_vmcoreinfo(void)
-{
-	char build_id[BUILD_ID_MAX * 2 + 1];
-	int n_remain = NOTES_SIZE;
-
-	while (n_remain >= sizeof(struct elf_note)) {
-		const struct elf_note_section *note_sec =
-			&__start_notes + NOTES_SIZE - n_remain;
-		const u32 n_namesz = note_sec->n_hdr.n_namesz;
-
-		if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID &&
-		    n_namesz != 0 &&
-		    !strcmp((char *)&note_sec->n_data[0], "GNU")) {
-			if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) {
-				const u32 n_descsz = note_sec->n_hdr.n_descsz;
-				const u8 *s = &note_sec->n_data[n_namesz];
-
-				s = PTR_ALIGN(s, 4);
-				bin2hex(build_id, s, n_descsz);
-				build_id[2 * n_descsz] = '\0';
-				VMCOREINFO_BUILD_ID(build_id);
-				return;
-			}
-			pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n",
-				note_sec->n_hdr.n_descsz,
-				BUILD_ID_MAX);
-			return;
-		}
-		n_remain -= sizeof(struct elf_note) +
-			ALIGN(note_sec->n_hdr.n_namesz, 4) +
-			ALIGN(note_sec->n_hdr.n_descsz, 4);
-	}
-}
-
 static int __init crash_save_vmcoreinfo_init(void)
 {
 	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -443,7 +397,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	}
 
 	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
-	add_build_id_vmcoreinfo();
+	VMCOREINFO_BUILD_ID();
 	VMCOREINFO_PAGESIZE(PAGE_SIZE);
 
 	VMCOREINFO_SYMBOL(init_uts_ns);
diff --git a/lib/buildid.c b/lib/buildid.c
index 180a1a9b3b76..dfc62625cae4 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -174,7 +174,7 @@ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size)
 	return parse_build_id_buf(build_id, NULL, buf, buf_size);
 }
 
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE)
 unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init;
 
 /**
-- 
cgit v1.2.3


From 9cf6fa2458443118b84090aa1bf7a3630b5940e8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Wed, 7 Jul 2021 18:09:53 -0700
Subject: mm: rename pud_page_vaddr to pud_pgtable and make it return pmd_t *

No functional change in this patch.

[aneesh.kumar@linux.ibm.com: fix]
  Link: https://lkml.kernel.org/r/87wnqtnb60.fsf@linux.ibm.com
[sfr@canb.auug.org.au: another fix]
  Link: https://lkml.kernel.org/r/20210619134410.89559-1-aneesh.kumar@linux.ibm.com

Link: https://lkml.kernel.org/r/20210615110859.320299-1-aneesh.kumar@linux.ibm.com
Link: https://lore.kernel.org/linuxppc-dev/CAHk-=wi+J+iodze9FtjM3Zi4j4OeS+qqbKxME9QN4roxPEXH9Q@mail.gmail.com/
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Hugh Dickins <hughd@google.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/pgtable.h             | 8 +++++---
 arch/arm/include/asm/pgtable-3level.h        | 2 +-
 arch/arm64/include/asm/pgtable.h             | 4 ++--
 arch/ia64/include/asm/pgtable.h              | 2 +-
 arch/m68k/include/asm/motorola_pgtable.h     | 2 +-
 arch/mips/include/asm/pgtable-64.h           | 4 ++--
 arch/parisc/include/asm/pgtable.h            | 4 ++--
 arch/powerpc/include/asm/book3s/64/pgtable.h | 6 +++++-
 arch/powerpc/include/asm/nohash/64/pgtable.h | 6 +++++-
 arch/powerpc/mm/book3s64/radix_pgtable.c     | 4 ++--
 arch/powerpc/mm/pgtable_64.c                 | 2 +-
 arch/riscv/include/asm/pgtable-64.h          | 4 ++--
 arch/sh/include/asm/pgtable-3level.h         | 4 ++--
 arch/sparc/include/asm/pgtable_32.h          | 6 +++---
 arch/sparc/include/asm/pgtable_64.h          | 6 +++---
 arch/um/include/asm/pgtable-3level.h         | 2 +-
 arch/x86/include/asm/pgtable.h               | 4 ++--
 arch/x86/mm/pat/set_memory.c                 | 4 ++--
 arch/x86/mm/pgtable.c                        | 2 +-
 include/asm-generic/pgtable-nopmd.h          | 2 +-
 include/asm-generic/pgtable-nopud.h          | 2 +-
 include/linux/pgtable.h                      | 2 +-
 22 files changed, 46 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index ff690846465e..02f0429f1068 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -236,8 +236,10 @@ pmd_page_vaddr(pmd_t pmd)
 #define pmd_page(pmd)	(pfn_to_page(pmd_val(pmd) >> 32))
 #define pud_page(pud)	(pfn_to_page(pud_val(pud) >> 32))
 
-extern inline unsigned long pud_page_vaddr(pud_t pgd)
-{ return PAGE_OFFSET + ((pud_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); }
+extern inline pmd_t *pud_pgtable(pud_t pgd)
+{
+	return (pmd_t *)(PAGE_OFFSET + ((pud_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)));
+}
 
 extern inline int pte_none(pte_t pte)		{ return !pte_val(pte); }
 extern inline int pte_present(pte_t pte)	{ return pte_val(pte) & _PAGE_VALID; }
@@ -287,7 +289,7 @@ extern inline pte_t pte_mkyoung(pte_t pte)	{ pte_val(pte) |= __ACCESS_BITS; retu
 /* Find an entry in the second-level page table.. */
 extern inline pmd_t * pmd_offset(pud_t * dir, unsigned long address)
 {
-	pmd_t *ret = (pmd_t *) pud_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
+	pmd_t *ret = pud_pgtable(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
 	smp_rmb(); /* see above */
 	return ret;
 }
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index d4edab51a77c..eabe72ff7381 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -130,7 +130,7 @@
 		flush_pmd_entry(pudp);	\
 	} while (0)
 
-static inline pmd_t *pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
 {
 	return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK);
 }
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 508c7ffad515..d6b8e74f3018 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -649,9 +649,9 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
 	return __pud_to_phys(pud);
 }
 
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
 {
-	return (unsigned long)__va(pud_page_paddr(pud));
+	return (pmd_t *)__va(pud_page_paddr(pud));
 }
 
 /* Find an entry in the second-level page table. */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 3f5dbbd8b9d8..7599d04ae192 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -273,7 +273,7 @@ ia64_phys_addr_valid (unsigned long addr)
 #define pud_bad(pud)			(!ia64_phys_addr_valid(pud_val(pud)))
 #define pud_present(pud)		(pud_val(pud) != 0UL)
 #define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
-#define pud_page_vaddr(pud)		((unsigned long) __va(pud_val(pud) & _PFN_MASK))
+#define pud_pgtable(pud)		((pmd_t *) __va(pud_val(pud) & _PFN_MASK))
 #define pud_page(pud)			virt_to_page((pud_val(pud) + PAGE_OFFSET))
 
 #if CONFIG_PGTABLE_LEVELS == 4
diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h
index a2908164ee6f..022c3abc280d 100644
--- a/arch/m68k/include/asm/motorola_pgtable.h
+++ b/arch/m68k/include/asm/motorola_pgtable.h
@@ -131,7 +131,7 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp)
 
 #define __pte_page(pte) ((unsigned long)__va(pte_val(pte) & PAGE_MASK))
 #define pmd_page_vaddr(pmd) ((unsigned long)__va(pmd_val(pmd) & _TABLE_MASK))
-#define pud_page_vaddr(pud) ((unsigned long)__va(pud_val(pud) & _TABLE_MASK))
+#define pud_pgtable(pud) ((pmd_t *)__va(pud_val(pud) & _TABLE_MASK))
 
 
 #define pte_none(pte)		(!pte_val(pte))
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 046465906c82..1136b6f3e41e 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -313,9 +313,9 @@ static inline void pud_clear(pud_t *pudp)
 #endif
 
 #ifndef __PAGETABLE_PMD_FOLDED
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
 {
-	return pud_val(pud);
+	return (pmd_t *)pud_val(pud);
 }
 #define pud_phys(pud)		virt_to_phys((void *)pud_val(pud))
 #define pud_page(pud)		(pfn_to_page(pud_phys(pud) >> PAGE_SHIFT))
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 7f33c29764cc..43937af127b1 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -322,8 +322,8 @@ static inline void pmd_clear(pmd_t *pmd) {
 
 
 #if CONFIG_PGTABLE_LEVELS == 3
-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_address(pud)))
-#define pud_page(pud)	virt_to_page((void *)pud_page_vaddr(pud))
+#define pud_pgtable(pud) ((pmd_t *) __va(pud_address(pud)))
+#define pud_page(pud)	virt_to_page((void *)pud_pgtable(pud))
 
 /* For 64 bit we have three level tables */
 
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 4d9941b2fe51..536037017c3e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1051,9 +1051,13 @@ extern struct page *p4d_page(p4d_t p4d);
 /* Pointers in the page table tree are physical addresses */
 #define __pgtable_ptr_val(ptr)	__pa(ptr)
 
-#define pud_page_vaddr(pud)	__va(pud_val(pud) & ~PUD_MASKED_BITS)
 #define p4d_page_vaddr(p4d)	__va(p4d_val(p4d) & ~P4D_MASKED_BITS)
 
+static inline pmd_t *pud_pgtable(pud_t pud)
+{
+	return (pmd_t *)__va(pud_val(pud) & ~PUD_MASKED_BITS);
+}
+
 #define pte_ERROR(e) \
 	pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 53fbfdfac93d..d081704b13fb 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -162,7 +162,11 @@ static inline void pud_clear(pud_t *pudp)
 #define	pud_bad(pud)		(!is_kernel_addr(pud_val(pud)) \
 				 || (pud_val(pud) & PUD_BAD_BITS))
 #define pud_present(pud)	(pud_val(pud) != 0)
-#define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
+
+static inline pmd_t *pud_pgtable(pud_t pud)
+{
+	return (pmd_t *)(pud_val(pud) & ~PUD_MASKED_BITS);
+}
 
 extern struct page *pud_page(pud_t pud);
 
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 2176a5f70746..54f135b1ee60 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -820,7 +820,7 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
 			continue;
 		}
 
-		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+		pmd_base = pud_pgtable(*pud);
 		remove_pmd_table(pmd_base, addr, next);
 		free_pmd_table(pmd_base, pud);
 	}
@@ -1105,7 +1105,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 	pmd_t *pmd;
 	int i;
 
-	pmd = (pmd_t *)pud_page_vaddr(*pud);
+	pmd = pud_pgtable(*pud);
 	pud_clear(pud);
 
 	flush_tlb_kernel_range(addr, addr + PUD_SIZE);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index cc6e2f94517f..4ba311808bdb 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -115,7 +115,7 @@ struct page *pud_page(pud_t pud)
 		VM_WARN_ON(!pud_huge(pud));
 		return pte_page(pud_pte(pud));
 	}
-	return virt_to_page(pud_page_vaddr(pud));
+	return virt_to_page(pud_pgtable(pud));
 }
 
 /*
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index f3b0da64c6c8..0e863f3f7187 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -60,9 +60,9 @@ static inline void pud_clear(pud_t *pudp)
 	set_pud(pudp, __pud(0));
 }
 
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
 {
-	return (unsigned long)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
+	return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
 }
 
 static inline struct page *pud_page(pud_t pud)
diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h
index 82d74472dfcd..56bf35c2f29c 100644
--- a/arch/sh/include/asm/pgtable-3level.h
+++ b/arch/sh/include/asm/pgtable-3level.h
@@ -32,9 +32,9 @@ typedef struct { unsigned long long pmd; } pmd_t;
 #define pmd_val(x)	((x).pmd)
 #define __pmd(x)	((pmd_t) { (x) } )
 
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
 {
-	return pud_val(pud);
+	return (pmd_t *)pud_val(pud);
 }
 
 /* only used by the stubbed out hugetlb gup code, should never be called */
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index ebaf374b55ab..ffccfe3b22ed 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -151,13 +151,13 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 	return (unsigned long)__nocache_va(v << 4);
 }
 
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
 {
 	if (srmmu_device_memory(pud_val(pud))) {
-		return ~0;
+		return (pmd_t *)~0;
 	} else {
 		unsigned long v = pud_val(pud) & SRMMU_PTD_PMASK;
-		return (unsigned long)__nocache_va(v << 4);
+		return (pmd_t *)__nocache_va(v << 4);
 	}
 }
 
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index e0ee48ec3903..23da25d4765c 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -841,18 +841,18 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 	return ((unsigned long) __va(pfn << PAGE_SHIFT));
 }
 
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
 {
 	pte_t pte = __pte(pud_val(pud));
 	unsigned long pfn;
 
 	pfn = pte_pfn(pte);
 
-	return ((unsigned long) __va(pfn << PAGE_SHIFT));
+	return ((pmd_t *) __va(pfn << PAGE_SHIFT));
 }
 
 #define pmd_page(pmd) 			virt_to_page((void *)pmd_page_vaddr(pmd))
-#define pud_page(pud) 			virt_to_page((void *)pud_page_vaddr(pud))
+#define pud_page(pud)			virt_to_page((void *)pud_pgtable(pud))
 #define pmd_clear(pmdp)			(pmd_val(*(pmdp)) = 0UL)
 #define pud_present(pud)		(pud_val(pud) != 0U)
 #define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h
index 9289a86643a9..cb896e6121c8 100644
--- a/arch/um/include/asm/pgtable-3level.h
+++ b/arch/um/include/asm/pgtable-3level.h
@@ -83,7 +83,7 @@ static inline void pud_clear (pud_t *pud)
 }
 
 #define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK)
-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PAGE_MASK))
+#define pud_pgtable(pud) ((pmd_t *) __va(pud_val(pud) & PAGE_MASK))
 
 static inline unsigned long pte_pfn(pte_t pte)
 {
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ec0d8e106646..e064c5fef0f3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -836,9 +836,9 @@ static inline int pud_present(pud_t pud)
 	return pud_flags(pud) & _PAGE_PRESENT;
 }
 
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
 {
-	return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud));
+	return (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud));
 }
 
 /*
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 156cd235659f..ad8a5c586a35 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1134,7 +1134,7 @@ static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
 			      unsigned long start, unsigned long end)
 {
 	if (unmap_pte_range(pmd, start, end))
-		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+		if (try_to_free_pmd_page(pud_pgtable(*pud)))
 			pud_clear(pud);
 }
 
@@ -1178,7 +1178,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 	 * Try again to free the PMD page if haven't succeeded above.
 	 */
 	if (!pud_none(*pud))
-		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+		if (try_to_free_pmd_page(pud_pgtable(*pud)))
 			pud_clear(pud);
 }
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 1303ff6ef7be..3364fe62b903 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -801,7 +801,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 	pte_t *pte;
 	int i;
 
-	pmd = (pmd_t *)pud_page_vaddr(*pud);
+	pmd = pud_pgtable(*pud);
 	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
 	if (!pmd_sv)
 		return 0;
diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h
index 3e13acd019ae..10789cf51d16 100644
--- a/include/asm-generic/pgtable-nopmd.h
+++ b/include/asm-generic/pgtable-nopmd.h
@@ -51,7 +51,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address)
 #define __pmd(x)				((pmd_t) { __pud(x) } )
 
 #define pud_page(pud)				(pmd_page((pmd_t){ pud }))
-#define pud_page_vaddr(pud)			(pmd_page_vaddr((pmd_t){ pud }))
+#define pud_pgtable(pud)			((pmd_t *)(pmd_page_vaddr((pmd_t){ pud })))
 
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index a9d751fbda9e..7cbd15f70bf5 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -49,7 +49,7 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 #define __pud(x)				((pud_t) { __p4d(x) })
 
 #define p4d_page(p4d)				(pud_page((pud_t){ p4d }))
-#define p4d_page_vaddr(p4d)			(pud_page_vaddr((pud_t){ p4d }))
+#define p4d_page_vaddr(p4d)			(pud_pgtable((pud_t){ p4d }))
 
 /*
  * allocating and freeing a pud is trivial: the 1-entry pud is
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e82660f7b9e4..c7c992ada1fe 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -106,7 +106,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 #ifndef pmd_offset
 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
 {
-	return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
+	return pud_pgtable(*pud) + pmd_index(address);
 }
 #define pmd_offset pmd_offset
 #endif
-- 
cgit v1.2.3


From dc4875f0e791de554bdc45aa1dbd6e45e107e50f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Wed, 7 Jul 2021 18:09:56 -0700
Subject: mm: rename p4d_page_vaddr to p4d_pgtable and make it return pud_t *

No functional change in this patch.

[aneesh.kumar@linux.ibm.com: m68k build error reported by kernel robot]
  Link: https://lkml.kernel.org/r/87tulxnb2v.fsf@linux.ibm.com

Link: https://lkml.kernel.org/r/20210615110859.320299-2-aneesh.kumar@linux.ibm.com
Link: https://lore.kernel.org/linuxppc-dev/CAHk-=wi+J+iodze9FtjM3Zi4j4OeS+qqbKxME9QN4roxPEXH9Q@mail.gmail.com/
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Hugh Dickins <hughd@google.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/pgtable.h                | 4 ++--
 arch/ia64/include/asm/pgtable.h                 | 2 +-
 arch/mips/include/asm/pgtable-64.h              | 4 ++--
 arch/powerpc/include/asm/book3s/64/pgtable.h    | 5 ++++-
 arch/powerpc/include/asm/nohash/64/pgtable-4k.h | 6 +++++-
 arch/powerpc/mm/book3s64/radix_pgtable.c        | 2 +-
 arch/powerpc/mm/pgtable_64.c                    | 2 +-
 arch/sparc/include/asm/pgtable_64.h             | 4 ++--
 arch/x86/include/asm/pgtable.h                  | 4 ++--
 arch/x86/mm/init_64.c                           | 4 ++--
 include/asm-generic/pgtable-nop4d.h             | 2 +-
 include/asm-generic/pgtable-nopud.h             | 2 +-
 include/linux/pgtable.h                         | 2 +-
 13 files changed, 25 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index d6b8e74f3018..f09bf5c02891 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -710,9 +710,9 @@ static inline phys_addr_t p4d_page_paddr(p4d_t p4d)
 	return __p4d_to_phys(p4d);
 }
 
-static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+static inline pud_t *p4d_pgtable(p4d_t p4d)
 {
-	return (unsigned long)__va(p4d_page_paddr(p4d));
+	return (pud_t *)__va(p4d_page_paddr(p4d));
 }
 
 /* Find an entry in the frst-level page table. */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 7599d04ae192..9584b2c5f394 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -281,7 +281,7 @@ ia64_phys_addr_valid (unsigned long addr)
 #define p4d_bad(p4d)			(!ia64_phys_addr_valid(p4d_val(p4d)))
 #define p4d_present(p4d)		(p4d_val(p4d) != 0UL)
 #define p4d_clear(p4dp)			(p4d_val(*(p4dp)) = 0UL)
-#define p4d_page_vaddr(p4d)		((unsigned long) __va(p4d_val(p4d) & _PFN_MASK))
+#define p4d_pgtable(p4d)		((pud_t *) __va(p4d_val(p4d) & _PFN_MASK))
 #define p4d_page(p4d)			virt_to_page((p4d_val(p4d) + PAGE_OFFSET))
 #endif
 
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 1136b6f3e41e..41921acdc9d8 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -209,9 +209,9 @@ static inline void p4d_clear(p4d_t *p4dp)
 	p4d_val(*p4dp) = (unsigned long)invalid_pud_table;
 }
 
-static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+static inline pud_t *p4d_pgtable(p4d_t p4d)
 {
-	return p4d_val(p4d);
+	return (pud_t *)p4d_val(p4d);
 }
 
 #define p4d_phys(p4d)		virt_to_phys((void *)p4d_val(p4d))
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 536037017c3e..5d34a8646f08 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1051,7 +1051,10 @@ extern struct page *p4d_page(p4d_t p4d);
 /* Pointers in the page table tree are physical addresses */
 #define __pgtable_ptr_val(ptr)	__pa(ptr)
 
-#define p4d_page_vaddr(p4d)	__va(p4d_val(p4d) & ~P4D_MASKED_BITS)
+static inline pud_t *p4d_pgtable(p4d_t p4d)
+{
+	return (pud_t *)__va(p4d_val(p4d) & ~P4D_MASKED_BITS);
+}
 
 static inline pmd_t *pud_pgtable(pud_t pud)
 {
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
index fe2f4c9acd9e..10f5cf444d72 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
@@ -56,10 +56,14 @@
 #define p4d_none(p4d)		(!p4d_val(p4d))
 #define p4d_bad(p4d)		(p4d_val(p4d) == 0)
 #define p4d_present(p4d)	(p4d_val(p4d) != 0)
-#define p4d_page_vaddr(p4d)	(p4d_val(p4d) & ~P4D_MASKED_BITS)
 
 #ifndef __ASSEMBLY__
 
+static inline pud_t *p4d_pgtable(p4d_t p4d)
+{
+	return (pud_t *) (p4d_val(p4d) & ~P4D_MASKED_BITS);
+}
+
 static inline void p4d_clear(p4d_t *p4dp)
 {
 	*p4dp = __p4d(0);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 54f135b1ee60..e50ddf129c15 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -854,7 +854,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
 			continue;
 		}
 
-		pud_base = (pud_t *)p4d_page_vaddr(*p4d);
+		pud_base = p4d_pgtable(*p4d);
 		remove_pud_table(pud_base, addr, next);
 		free_pud_table(pud_base, p4d);
 	}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 4ba311808bdb..78c8cf01db5f 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -105,7 +105,7 @@ struct page *p4d_page(p4d_t p4d)
 		VM_WARN_ON(!p4d_huge(p4d));
 		return pte_page(p4d_pte(p4d));
 	}
-	return virt_to_page(p4d_page_vaddr(p4d));
+	return virt_to_page(p4d_pgtable(p4d));
 }
 #endif
 
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 23da25d4765c..4679e45c8348 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -856,8 +856,8 @@ static inline pmd_t *pud_pgtable(pud_t pud)
 #define pmd_clear(pmdp)			(pmd_val(*(pmdp)) = 0UL)
 #define pud_present(pud)		(pud_val(pud) != 0U)
 #define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
-#define p4d_page_vaddr(p4d)		\
-	((unsigned long) __va(p4d_val(p4d)))
+#define p4d_pgtable(p4d)		\
+	((pud_t *) __va(p4d_val(p4d)))
 #define p4d_present(p4d)		(p4d_val(p4d) != 0U)
 #define p4d_clear(p4dp)			(p4d_val(*(p4dp)) = 0UL)
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e064c5fef0f3..448cd01eb3ec 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -877,9 +877,9 @@ static inline int p4d_present(p4d_t p4d)
 	return p4d_flags(p4d) & _PAGE_PRESENT;
 }
 
-static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+static inline pud_t *p4d_pgtable(p4d_t p4d)
 {
-	return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
+	return (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
 }
 
 /*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 65ea58527176..ddeaba947eb3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -194,8 +194,8 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
 			spin_lock(pgt_lock);
 
 			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
-				BUG_ON(p4d_page_vaddr(*p4d)
-				       != p4d_page_vaddr(*p4d_ref));
+				BUG_ON(p4d_pgtable(*p4d)
+				       != p4d_pgtable(*p4d_ref));
 
 			if (p4d_none(*p4d))
 				set_p4d(p4d, *p4d_ref);
diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h
index 2f6b1befb129..03b7dae47dd4 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -41,7 +41,7 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 #define __p4d(x)				((p4d_t) { __pgd(x) })
 
 #define pgd_page(pgd)				(p4d_page((p4d_t){ pgd }))
-#define pgd_page_vaddr(pgd)			(p4d_page_vaddr((p4d_t){ pgd }))
+#define pgd_page_vaddr(pgd)			((unsigned long)(p4d_pgtable((p4d_t){ pgd })))
 
 /*
  * allocating and freeing a p4d is trivial: the 1-entry p4d is
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index 7cbd15f70bf5..eb70c6d7ceff 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -49,7 +49,7 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 #define __pud(x)				((pud_t) { __p4d(x) })
 
 #define p4d_page(p4d)				(pud_page((pud_t){ p4d }))
-#define p4d_page_vaddr(p4d)			(pud_pgtable((pud_t){ p4d }))
+#define p4d_pgtable(p4d)			((pud_t *)(pud_pgtable((pud_t){ p4d })))
 
 /*
  * allocating and freeing a pud is trivial: the 1-entry pud is
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c7c992ada1fe..d147480cdefc 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -114,7 +114,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
 #ifndef pud_offset
 static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 {
-	return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
+	return p4d_pgtable(*p4d) + pud_index(address);
 }
 #define pud_offset pud_offset
 #endif
-- 
cgit v1.2.3