From e71da1fd0e84bc5c87a78b405e40713840eecc80 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Sat, 6 Feb 2021 16:13:48 +0100 Subject: HID: intel-ish-hid: Make remove callback return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver core ignores the return value of struct bus_type::remove() because there is only little that can be done. To simplify the quest to make this function return void, let struct ishtp_cl_driver::remove() return void, too. All users already unconditionally return 0, this commit makes it obvious that returning an error value is a bad idea. Signed-off-by: Uwe Kleine-König Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index 0d6b4bc191c5..94669e21dc8b 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -36,7 +36,7 @@ struct ishtp_cl_driver { const char *name; const guid_t *guid; int (*probe)(struct ishtp_cl_device *dev); - int (*remove)(struct ishtp_cl_device *dev); + void (*remove)(struct ishtp_cl_device *dev); int (*reset)(struct ishtp_cl_device *dev); const struct dev_pm_ops *pm; }; -- cgit v1.2.3 From c57179c73562e31d39139ac245b8a2d337e1823b Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 26 Mar 2021 14:34:58 +0000 Subject: HID: ishtp-hid-client: Fix 'suggest-attribute=format' compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): drivers/hid/intel-ish-hid/ishtp/bus.c: In function ‘ishtp_trace_callback’: drivers/hid/intel-ish-hid/ishtp/bus.c:876:29: warning: return type might be a candidate for a format attribute [-Wsuggest-attribute=format] 876 | return cl_device->ishtp_dev->print_log; | ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ Cc: Srinivas Pandruvada Cc: Jiri Kosina Cc: Benjamin Tissoires Cc: Daniel Drubin Cc: linux-input@vger.kernel.org Suggested-by: Arnd Bergmann Signed-off-by: Lee Jones Acked-by: Srinivas Pandruvada Signed-off-by: Benjamin Tissoires --- drivers/hid/intel-ish-hid/ishtp-hid-client.c | 4 ++-- drivers/hid/intel-ish-hid/ishtp-hid.h | 4 ++-- drivers/hid/intel-ish-hid/ishtp/bus.c | 4 ++-- drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h | 4 ++-- include/linux/intel-ish-client-if.h | 8 +++++++- 5 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c index 042a7091802d..6b1fa971b33e 100644 --- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c +++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c @@ -784,7 +784,7 @@ static void hid_ishtp_cl_reset_handler(struct work_struct *work) } } -void (*hid_print_trace)(void *unused, const char *format, ...); +ishtp_print_log ishtp_hid_print_trace; /** * hid_ishtp_cl_probe() - ISHTP client driver probe @@ -823,7 +823,7 @@ static int hid_ishtp_cl_probe(struct ishtp_cl_device *cl_device) INIT_WORK(&client_data->work, hid_ishtp_cl_reset_handler); - hid_print_trace = ishtp_trace_callback(cl_device); + ishtp_hid_print_trace = ishtp_trace_callback(cl_device); rv = hid_ishtp_cl_init(hid_ishtp_cl, 0); if (rv) { diff --git a/drivers/hid/intel-ish-hid/ishtp-hid.h b/drivers/hid/intel-ish-hid/ishtp-hid.h index e2423f7d2b54..f88443a7d935 100644 --- a/drivers/hid/intel-ish-hid/ishtp-hid.h +++ b/drivers/hid/intel-ish-hid/ishtp-hid.h @@ -16,9 +16,9 @@ #define IS_RESPONSE 0x80 /* Used to dump to Linux trace buffer, if enabled */ -extern void (*hid_print_trace)(void *unused, const char *format, ...); +extern ishtp_print_log ishtp_hid_print_trace; #define hid_ishtp_trace(client, ...) \ - (hid_print_trace)(NULL, __VA_ARGS__) + (ishtp_hid_print_trace)(NULL, __VA_ARGS__) /* ISH HID message structure */ struct hostif_msg_hdr { diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c index c1c7d5356208..f0802b047ed8 100644 --- a/drivers/hid/intel-ish-hid/ishtp/bus.c +++ b/drivers/hid/intel-ish-hid/ishtp/bus.c @@ -869,9 +869,9 @@ EXPORT_SYMBOL(ishtp_get_pci_device); * * This interface is used to return trace callback function pointer. * - * Return: void *. + * Return: *ishtp_print_log() */ -void *ishtp_trace_callback(struct ishtp_cl_device *cl_device) +ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device) { return cl_device->ishtp_dev->print_log; } diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h index 1cc6364aa957..f579b16e6d7a 100644 --- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h +++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h @@ -10,6 +10,7 @@ #include #include +#include #include "bus.h" #include "hbm.h" @@ -202,8 +203,7 @@ struct ishtp_device { uint64_t ishtp_host_dma_rx_buf_phys; /* Dump to trace buffers if enabled*/ - __printf(2, 3) void (*print_log)(struct ishtp_device *dev, - const char *format, ...); + ishtp_print_log print_log; /* Debug stats */ unsigned int ipc_rx_cnt; diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index 94669e21dc8b..25e2b4e80502 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -8,11 +8,17 @@ #ifndef _INTEL_ISH_CLIENT_IF_H_ #define _INTEL_ISH_CLIENT_IF_H_ +#include +#include + struct ishtp_cl_device; struct ishtp_device; struct ishtp_cl; struct ishtp_fw_client; +typedef __printf(2, 3) void (*ishtp_print_log)(struct ishtp_device *dev, + const char *format, ...); + /* Client state */ enum cl_state { ISHTP_CL_INITIALIZING = 0, @@ -76,7 +82,7 @@ int ishtp_register_event_cb(struct ishtp_cl_device *device, /* Get the device * from ishtp device instance */ struct device *ishtp_device(struct ishtp_cl_device *cl_device); /* Trace interface for clients */ -void *ishtp_trace_callback(struct ishtp_cl_device *cl_device); +ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device); /* Get device pointer of PCI device for DMA acces */ struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device); -- cgit v1.2.3 From ffb37ca3bd16ce6ea2df2f87fde9a31e94ebb54b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Apr 2021 19:00:57 -0400 Subject: switch file_open_root() to struct path ... and provide file_open_root_mnt(), using the root of given mount. Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 9 +++++++++ arch/um/drivers/mconsole_kern.c | 2 +- fs/coredump.c | 4 ++-- fs/fhandle.c | 2 +- fs/internal.h | 2 +- fs/kernel_read_file.c | 2 +- fs/namei.c | 8 +++----- fs/open.c | 4 ++-- fs/proc/proc_sysctl.c | 2 +- include/linux/fs.h | 8 +++++++- kernel/usermode_driver.c | 2 +- 11 files changed, 29 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 0302035781be..9bb2b35f90bb 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -890,3 +890,12 @@ been called or returned with non -EIOCBQUEUED code. mnt_want_write_file() can now only be paired with mnt_drop_write_file(), whereas previously it could be paired with mnt_drop_write() as well. + +--- + +**mandatory** + +Calling conventions for file_open_root() changed; now it takes struct path * +instead of passing mount and dentry separately. For callers that used to +pass mnt_root> pair (i.e. the root of given mount), a new helper +is provided - file_open_root_mnt(). In-tree users adjusted. diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 6d00af25ec6b..c42b10024e26 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req) mconsole_reply(req, "Proc not available", 1, 0); goto out; } - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); + file = file_open_root_mnt(mnt, ptr, O_RDONLY, 0); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file)); diff --git a/fs/coredump.c b/fs/coredump.c index 1c0fdc1aa70b..087db444b06b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -755,8 +755,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cprm.file = file_open_root(root.dentry, root.mnt, - cn.corename, open_flags, 0600); + cprm.file = file_open_root(&root, cn.corename, + open_flags, 0600); path_put(&root); } else { cprm.file = filp_open(cn.corename, open_flags, 0600); diff --git a/fs/fhandle.c b/fs/fhandle.c index ec6feeccc276..6630c69c23a2 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -229,7 +229,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); + file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/internal.h b/fs/internal.h index 6aeae7ef3380..3ce8edbaa3ca 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -129,7 +129,7 @@ struct open_flags { }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); -extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, +extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 90d255fbdd9b..87aac4c72c37 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -160,7 +160,7 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, get_fs_root(init_task.fs, &root); task_unlock(&init_task); - file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); + file = file_open_root(&root, path, O_RDONLY, 0); path_put(&root); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/namei.c b/fs/namei.c index 48a2f288e802..4b6cf4974dd7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3533,7 +3533,7 @@ struct file *do_filp_open(int dfd, struct filename *pathname, return filp; } -struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; @@ -3541,16 +3541,14 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, struct filename *filename; int flags = op->lookup_flags | LOOKUP_ROOT; - nd.root.mnt = mnt; - nd.root.dentry = dentry; - - if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) + if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); + nd.root = *root; set_nameidata(&nd, -1, filename); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) diff --git a/fs/open.c b/fs/open.c index e53af13b5835..b3c904e82e2a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1156,7 +1156,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) } EXPORT_SYMBOL(filp_open); -struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; @@ -1164,7 +1164,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); - return do_file_open_root(dentry, mnt, filename, &op); + return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 984e42f8cb11..6606f21fc195 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1806,7 +1806,7 @@ static int process_sysctl_arg(char *param, char *val, panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); - file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) diff --git a/include/linux/fs.h b/include/linux/fs.h index ec8f3ddf4a6a..1acea2bb9d60 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2632,8 +2632,14 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); -extern struct file *file_open_root(struct dentry *, struct vfsmount *, +extern struct file *file_open_root(const struct path *, const char *, int, umode_t); +static inline struct file *file_open_root_mnt(struct vfsmount *mnt, + const char *name, int flags, umode_t mode) +{ + return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, + name, flags, mode); +} extern struct file * dentry_open(const struct path *, int, const struct cred *); extern struct file * open_with_fake_path(const struct path *, int, struct inode*, const struct cred *); diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 0b35212ffc3d..78353cb73836 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -26,7 +26,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na if (IS_ERR(mnt)) return mnt; - file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); + file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); if (IS_ERR(file)) { mntput(mnt); return ERR_CAST(file); -- cgit v1.2.3 From bcba1e7d0d520adba895d9e0800a056f734b0a6a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Apr 2021 22:03:41 -0400 Subject: take LOOKUP_{ROOT,ROOT_GRABBED,JUMPED} out of LOOKUP_... space Separate field in nameidata (nd->state) holding the flags that should be internal-only - that way we both get some spare bits in LOOKUP_... and get simpler rules for nd->root lifetime rules, since we can set the replacement of LOOKUP_ROOT (ND_ROOT_PRESET) at the same time we set nd->root. Signed-off-by: Al Viro --- Documentation/filesystems/path-lookup.rst | 6 ++-- fs/namei.c | 54 ++++++++++++++++++------------- fs/nfs/nfstrace.h | 4 --- include/linux/namei.h | 3 -- 4 files changed, 34 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst index c482e1619e77..ede67f705787 100644 --- a/Documentation/filesystems/path-lookup.rst +++ b/Documentation/filesystems/path-lookup.rst @@ -1321,18 +1321,18 @@ to lookup: RCU-walk, REF-walk, and REF-walk with forced revalidation. yet. This is primarily used to tell the audit subsystem the full context of a particular access being audited. -``LOOKUP_ROOT`` indicates that the ``root`` field in the ``nameidata`` was +``ND_ROOT_PRESET`` indicates that the ``root`` field in the ``nameidata`` was provided by the caller, so it shouldn't be released when it is no longer needed. -``LOOKUP_JUMPED`` means that the current dentry was chosen not because +``ND_JUMPED`` means that the current dentry was chosen not because it had the right name but for some other reason. This happens when following "``..``", following a symlink to ``/``, crossing a mount point or accessing a "``/proc/$PID/fd/$FD``" symlink (also known as a "magic link"). In this case the filesystem has not been asked to revalidate the name (with ``d_revalidate()``). In such cases the inode may still need to be revalidated, so ``d_op->d_weak_revalidate()`` is called if -``LOOKUP_JUMPED`` is set when the look completes - which may be at the +``ND_JUMPED`` is set when the look completes - which may be at the final component or, when creating, unlinking, or renaming, at the penultimate component. Resolution-restriction flags diff --git a/fs/namei.c b/fs/namei.c index 4b6cf4974dd7..622b9f15bf1c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -554,7 +554,7 @@ struct nameidata { struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ - unsigned int flags; + unsigned int flags, state; unsigned seq, m_seq, r_seq; int last_type; unsigned depth; @@ -573,6 +573,10 @@ struct nameidata { umode_t dir_mode; } __randomize_layout; +#define ND_ROOT_PRESET 1 +#define ND_ROOT_GRABBED 2 +#define ND_JUMPED 4 + static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; @@ -583,6 +587,7 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; + p->state = 0; current->nameidata = p; } @@ -645,9 +650,9 @@ static void terminate_walk(struct nameidata *nd) path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); - if (nd->flags & LOOKUP_ROOT_GRABBED) { + if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); - nd->flags &= ~LOOKUP_ROOT_GRABBED; + nd->state &= ~ND_ROOT_GRABBED; } } else { nd->flags &= ~LOOKUP_RCU; @@ -710,9 +715,9 @@ static bool legitimize_root(struct nameidata *nd) if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) return false; /* Nothing to do if nd->root is zero or is managed by the VFS user. */ - if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) + if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } @@ -849,8 +854,9 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ - if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) - nd->root.mnt = NULL; + if (!(nd->state & ND_ROOT_PRESET)) + if (!(nd->flags & LOOKUP_IS_SCOPED)) + nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; @@ -877,7 +883,7 @@ static int complete_walk(struct nameidata *nd) return -EXDEV; } - if (likely(!(nd->flags & LOOKUP_JUMPED))) + if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) @@ -915,7 +921,7 @@ static int set_root(struct nameidata *nd) } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } return 0; } @@ -948,7 +954,7 @@ static int nd_jump_root(struct nameidata *nd) path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; } @@ -976,7 +982,7 @@ int nd_jump_link(struct path *path) path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; err: @@ -1424,7 +1430,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; *seqp = read_seqcount_begin(&dentry->d_seq); *inode = dentry->d_inode; /* @@ -1469,7 +1475,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); @@ -2220,7 +2226,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) case 2: if (name[1] == '.') { type = LAST_DOTDOT; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } break; case 1: @@ -2228,7 +2234,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) } if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; - nd->flags &= ~LOOKUP_JUMPED; + nd->state &= ~ND_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); @@ -2302,14 +2308,15 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->flags = flags | LOOKUP_JUMPED; + nd->flags = flags; + nd->state |= ND_JUMPED; nd->depth = 0; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (flags & LOOKUP_ROOT) { + if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2384,7 +2391,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root_seq = nd->seq; } else { path_get(&nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } } return s; @@ -2423,7 +2430,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); @@ -2447,11 +2454,11 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); + set_nameidata(&nd, dfd, name); if (unlikely(root)) { nd.root = *root; - flags |= LOOKUP_ROOT; + nd.state = ND_ROOT_PRESET; } - set_nameidata(&nd, dfd, name); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); @@ -3539,7 +3546,7 @@ struct file *do_file_open_root(const struct path *root, struct nameidata nd; struct file *file; struct filename *filename; - int flags = op->lookup_flags | LOOKUP_ROOT; + int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); @@ -3548,8 +3555,9 @@ struct file *do_file_open_root(const struct path *root, if (IS_ERR(filename)) return ERR_CAST(filename); - nd.root = *root; set_nameidata(&nd, -1, filename); + nd.root = *root; + nd.state = ND_ROOT_PRESET; file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 5a59dcdce0b2..cb7f49723bbf 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -271,8 +271,6 @@ TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET); -TRACE_DEFINE_ENUM(LOOKUP_JUMPED); -TRACE_DEFINE_ENUM(LOOKUP_ROOT); TRACE_DEFINE_ENUM(LOOKUP_EMPTY); TRACE_DEFINE_ENUM(LOOKUP_DOWN); @@ -288,8 +286,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \ - { LOOKUP_JUMPED, "JUMPED" }, \ - { LOOKUP_ROOT, "ROOT" }, \ { LOOKUP_EMPTY, "EMPTY" }, \ { LOOKUP_DOWN, "DOWN" }) diff --git a/include/linux/namei.h b/include/linux/namei.h index b9605b2b46e7..be9a2b349ca7 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -36,9 +36,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* internal use only */ #define LOOKUP_PARENT 0x0010 -#define LOOKUP_JUMPED 0x1000 -#define LOOKUP_ROOT 0x2000 -#define LOOKUP_ROOT_GRABBED 0x0008 /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ -- cgit v1.2.3 From f9c82a4ea89c384d49ce03768ba88d049ed3f1f0 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:08 +0200 Subject: Increase size of ucounts to atomic_long_t RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK use unsigned long to store their counters. As a preparation for moving rlimits based on ucounts, we need to increase the size of the variable to long. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/257aa5fb1a7d81cf0f4c34f39ada2320c4284771.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 4 ++-- kernel/ucount.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index f6c5f784be5a..c242c10906c5 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -88,7 +88,7 @@ struct user_namespace { struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; - int ucount_max[UCOUNT_COUNTS]; + long ucount_max[UCOUNT_COUNTS]; } __randomize_layout; struct ucounts { @@ -96,7 +96,7 @@ struct ucounts { struct user_namespace *ns; kuid_t uid; int count; - atomic_t ucount[UCOUNT_COUNTS]; + atomic_long_t ucount[UCOUNT_COUNTS]; }; extern struct user_namespace init_user_ns; diff --git a/kernel/ucount.c b/kernel/ucount.c index 11b1596e2542..04c561751af1 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -175,14 +175,14 @@ static void put_ucounts(struct ucounts *ucounts) kfree(ucounts); } -static inline bool atomic_inc_below(atomic_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { - int c, old; - c = atomic_read(v); + long c, old; + c = atomic_long_read(v); for (;;) { if (unlikely(c >= u)) return false; - old = atomic_cmpxchg(v, c, c+1); + old = atomic_long_cmpxchg(v, c, c+1); if (likely(old == c)) return true; c = old; @@ -196,17 +196,17 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, struct user_namespace *tns; ucounts = get_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { - int max; + long max; tns = iter->ns; max = READ_ONCE(tns->ucount_max[type]); - if (!atomic_inc_below(&iter->ucount[type], max)) + if (!atomic_long_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) - atomic_dec(&iter->ucount[type]); + atomic_long_dec(&iter->ucount[type]); put_ucounts(ucounts); return NULL; @@ -216,7 +216,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - int dec = atomic_dec_if_positive(&iter->ucount[type]); + long dec = atomic_long_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); -- cgit v1.2.3 From 905ae01c4ae2ae3df05bb141801b1db4b7d83c61 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:09 +0200 Subject: Add a reference to ucounts for each cred For RLIMIT_NPROC and some other rlimits the user_struct that holds the global limit is kept alive for the lifetime of a process by keeping it in struct cred. Adding a pointer to ucounts in the struct cred will allow to track RLIMIT_NPROC not only for user in the system, but for user in the user_namespace. Updating ucounts may require memory allocation which may fail. So, we cannot change cred.ucounts in the commit_creds() because this function cannot fail and it should always return 0. For this reason, we modify cred.ucounts before calling the commit_creds(). Changelog v6: * Fix null-ptr-deref in is_ucounts_overlimit() detected by trinity. This error was caused by the fact that cred_alloc_blank() left the ucounts pointer empty. Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/b37aaef28d8b9b0d757e07ba6dd27281bbe39259.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- fs/exec.c | 4 ++++ include/linux/cred.h | 2 ++ include/linux/user_namespace.h | 4 ++++ kernel/cred.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 ++++++ kernel/sys.c | 12 ++++++++++++ kernel/ucount.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/user_namespace.c | 3 +++ 8 files changed, 108 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 18594f11c31f..d7c4187ca023 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1360,6 +1360,10 @@ int begin_new_exec(struct linux_binprm * bprm) WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); + retval = set_cred_ucounts(bprm->cred); + if (retval < 0) + goto out_unlock; + /* * install the new credentials for this executable */ diff --git a/include/linux/cred.h b/include/linux/cred.h index 4c6350503697..66436e655032 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -144,6 +144,7 @@ struct cred { #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ + struct ucounts *ucounts; struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { @@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); +extern int set_cred_ucounts(struct cred *); /* * check for validity of credentials diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index c242c10906c5..7919b80d57ed 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -100,11 +100,15 @@ struct ucounts { }; extern struct user_namespace init_user_ns; +extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); +struct ucounts *get_ucounts(struct ucounts *ucounts); +void put_ucounts(struct ucounts *ucounts); #ifdef CONFIG_USER_NS diff --git a/kernel/cred.c b/kernel/cred.c index 421b1149c651..58a8a9e24347 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -60,6 +60,7 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, + .ucounts = &init_ucounts, }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu) if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); + if (cred->ucounts) + put_ucounts(cred->ucounts); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } @@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; #endif + new->ucounts = get_ucounts(&init_ucounts); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -284,6 +288,11 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; + + new->ucounts = get_ucounts(new->ucounts); + if (!new->ucounts) + goto error; + validate_creds(new); return new; @@ -363,6 +372,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ret = create_user_ns(new); if (ret < 0) goto error_put; + if (set_cred_ucounts(new) < 0) + goto error_put; } #ifdef CONFIG_KEYS @@ -653,6 +664,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b) } EXPORT_SYMBOL(cred_fscmp); +int set_cred_ucounts(struct cred *new) +{ + struct task_struct *task = current; + const struct cred *old = task->real_cred; + struct ucounts *old_ucounts = new->ucounts; + + if (new->user == old->user && new->user_ns == old->user_ns) + return 0; + + /* + * This optimization is needed because alloc_ucounts() uses locks + * for table lookups. + */ + if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + return 0; + + if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid))) + return -EAGAIN; + + if (old_ucounts) + put_ucounts(old_ucounts); + + return 0; +} + /* * initialise the credentials stuff */ @@ -719,6 +755,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; + new->ucounts = get_ucounts(new->ucounts); + if (!new->ucounts) + goto error; + put_cred(old); validate_creds(new); return new; diff --git a/kernel/fork.c b/kernel/fork.c index 426cd0c51f9e..321a5e31d817 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2995,6 +2995,12 @@ int ksys_unshare(unsigned long unshare_flags) if (err) goto bad_unshare_cleanup_cred; + if (new_cred) { + err = set_cred_ucounts(new_cred); + if (err) + goto bad_unshare_cleanup_cred; + } + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* diff --git a/kernel/sys.c b/kernel/sys.c index 2e2e3f378d97..cabfc5b86175 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -552,6 +552,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -610,6 +614,10 @@ long __sys_setuid(uid_t uid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -685,6 +693,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: diff --git a/kernel/ucount.c b/kernel/ucount.c index 04c561751af1..50cc1dfb7d28 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -8,6 +8,12 @@ #include #include +struct ucounts init_ucounts = { + .ns = &init_user_ns, + .uid = GLOBAL_ROOT_UID, + .count = 1, +}; + #define UCOUNTS_HASHTABLE_BITS 10 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static DEFINE_SPINLOCK(ucounts_lock); @@ -125,7 +131,15 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc return NULL; } -static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +static void hlist_add_ucounts(struct ucounts *ucounts) +{ + struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); + hlist_add_head(&ucounts->node, hashent); + spin_unlock_irq(&ucounts_lock); +} + +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; @@ -160,7 +174,26 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) return ucounts; } -static void put_ucounts(struct ucounts *ucounts) +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + unsigned long flags; + + if (!ucounts) + return NULL; + + spin_lock_irqsave(&ucounts_lock, flags); + if (ucounts->count == INT_MAX) { + WARN_ONCE(1, "ucounts: counter has reached its maximum value"); + ucounts = NULL; + } else { + ucounts->count += 1; + } + spin_unlock_irqrestore(&ucounts_lock, flags); + + return ucounts; +} + +void put_ucounts(struct ucounts *ucounts) { unsigned long flags; @@ -194,7 +227,7 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; - ucounts = get_ucounts(ns, uid); + ucounts = alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { long max; tns = iter->ns; @@ -237,6 +270,7 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif + hlist_add_ucounts(&init_ucounts); return 0; } subsys_initcall(user_namespace_sysctl_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9a4b980d695b..f1b7b4b8ffa2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1340,6 +1340,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns) put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); + if (set_cred_ucounts(cred) < 0) + return -EINVAL; + return 0; } -- cgit v1.2.3 From b6c336528926ef73b0f70260f2636de2c3b94c14 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:10 +0200 Subject: Use atomic_t for ucounts reference counting The current implementation of the ucounts reference counter requires the use of spin_lock. We're going to use get_ucounts() in more performance critical areas like a handling of RLIMIT_SIGPENDING. Now we need to use spin_lock only if we want to change the hashtable. v10: * Always try to put ucounts in case we cannot increase ucounts->count. This will allow to cover the case when all consumers will return ucounts at once. v9: * Use a negative value to check that the ucounts->count is close to overflow. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/94d1dbecab060a6b116b0a2d1accd8ca1bbb4f5f.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 4 ++-- kernel/ucount.c | 53 +++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 7919b80d57ed..80b5bf12feae 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -95,7 +95,7 @@ struct ucounts { struct hlist_node node; struct user_namespace *ns; kuid_t uid; - int count; + atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; }; @@ -107,7 +107,7 @@ void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); -struct ucounts *get_ucounts(struct ucounts *ucounts); +struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); #ifdef CONFIG_USER_NS diff --git a/kernel/ucount.c b/kernel/ucount.c index 50cc1dfb7d28..365865f368ec 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,7 +11,7 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = 1, + .count = ATOMIC_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 @@ -139,6 +139,15 @@ static void hlist_add_ucounts(struct ucounts *ucounts) spin_unlock_irq(&ucounts_lock); } +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + if (ucounts && atomic_add_negative(1, &ucounts->count)) { + put_ucounts(ucounts); + ucounts = NULL; + } + return ucounts; +} + struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); @@ -155,7 +164,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) new->ns = ns; new->uid = uid; - new->count = 0; + atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -163,33 +172,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) kfree(new); } else { hlist_add_head(&new->node, hashent); - ucounts = new; + spin_unlock_irq(&ucounts_lock); + return new; } } - if (ucounts->count == INT_MAX) - ucounts = NULL; - else - ucounts->count += 1; spin_unlock_irq(&ucounts_lock); - return ucounts; -} - -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - unsigned long flags; - - if (!ucounts) - return NULL; - - spin_lock_irqsave(&ucounts_lock, flags); - if (ucounts->count == INT_MAX) { - WARN_ONCE(1, "ucounts: counter has reached its maximum value"); - ucounts = NULL; - } else { - ucounts->count += 1; - } - spin_unlock_irqrestore(&ucounts_lock, flags); - + ucounts = get_ucounts(ucounts); return ucounts; } @@ -197,15 +185,12 @@ void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - spin_lock_irqsave(&ucounts_lock, flags); - ucounts->count -= 1; - if (!ucounts->count) + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); hlist_del_init(&ucounts->node); - else - ucounts = NULL; - spin_unlock_irqrestore(&ucounts_lock, flags); - - kfree(ucounts); + spin_unlock_irqrestore(&ucounts_lock, flags); + kfree(ucounts); + } } static inline bool atomic_long_inc_below(atomic_long_t *v, int u) -- cgit v1.2.3 From 21d1c5e386bc751f1953b371d72cd5b7d9c9e270 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:11 +0200 Subject: Reimplement RLIMIT_NPROC on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. To illustrate the impact of rlimits, let's say there is a program that does not fork. Some service-A wants to run this program as user X in multiple containers. Since the program never fork the service wants to set RLIMIT_NPROC=1. service-A \- program (uid=1000, container1, rlimit_nproc=1) \- program (uid=1000, container2, rlimit_nproc=1) The service-A sets RLIMIT_NPROC=1 and runs the program in container1. When the service-A tries to run a program with RLIMIT_NPROC=1 in container2 it fails since user X already has one running process. We cannot use existing inc_ucounts / dec_ucounts because they do not allow us to exceed the maximum for the counter. Some rlimits can be overlimited by root or if the user has the appropriate capability. Changelog v11: * Change inc_rlimit_ucounts() which now returns top value of ucounts. * Drop inc_rlimit_ucounts_and_test() because the return code of inc_rlimit_ucounts() can be checked. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- fs/exec.c | 2 +- include/linux/cred.h | 2 ++ include/linux/sched/user.h | 1 - include/linux/user_namespace.h | 12 ++++++++++++ kernel/cred.c | 10 +++++----- kernel/exit.c | 2 +- kernel/fork.c | 9 +++++---- kernel/sys.c | 2 +- kernel/ucount.c | 44 ++++++++++++++++++++++++++++++++++++++++++ kernel/user.c | 1 - kernel/user_namespace.c | 3 ++- 11 files changed, 73 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index d7c4187ca023..f2bcdbeb3afb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1878,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename, * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && - atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { + is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } diff --git a/include/linux/cred.h b/include/linux/cred.h index 66436e655032..5ca1e8a1d035 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -372,6 +372,7 @@ static inline void put_cred(const struct cred *_cred) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) +#define task_ucounts(task) (task_cred_xxx((task), ucounts)) #define current_cred_xxx(xxx) \ ({ \ @@ -388,6 +389,7 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) +#define current_ucounts() (current_cred_xxx(ucounts)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index a8ec3b6093fc..d33d867ad6c1 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -12,7 +12,6 @@ */ struct user_struct { refcount_t __count; /* reference count */ - atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_FANOTIFY atomic_t fanotify_listeners; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 80b5bf12feae..4a97acc35990 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -50,9 +50,12 @@ enum ucount_type { UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif + UCOUNT_RLIMIT_NPROC, UCOUNT_COUNTS, }; +#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC + struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; @@ -110,6 +113,15 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); +static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type) +{ + return atomic_long_read(&ucounts->ucount[type]); +} + +long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); + #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) diff --git a/kernel/cred.c b/kernel/cred.c index 58a8a9e24347..dcfa30b337c5 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -360,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) kdebug("share_creds(%p{%d,%d})", p->cred, atomic_read(&p->cred->usage), read_cred_subscribers(p->cred)); - atomic_inc(&p->cred->user->processes); + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } @@ -395,8 +395,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) } #endif - atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(new, 2); validate_creds(new); return 0; @@ -496,12 +496,12 @@ int commit_creds(struct cred *new) * in set_user(). */ alter_cred_subscribers(new, 2); - if (new->user != old->user) - atomic_inc(&new->user->processes); + if (new->user != old->user || new->user_ns != old->user_ns) + inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) - atomic_dec(&old->user->processes); + dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(old, -2); /* send notifications */ diff --git a/kernel/exit.c b/kernel/exit.c index 04029e35e69a..61c0fe902b50 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -188,7 +188,7 @@ repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); - atomic_dec(&__task_cred(p)->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); cgroup_release(p); diff --git a/kernel/fork.c b/kernel/fork.c index 321a5e31d817..ed7dfb07178d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,9 +819,11 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < UCOUNT_COUNTS; i++) + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; + init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); + #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); @@ -1978,8 +1980,7 @@ static __latent_entropy struct task_struct *copy_process( DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; - if (atomic_read(&p->real_cred->user->processes) >= - task_rlimit(p, RLIMIT_NPROC)) { + if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; @@ -2382,7 +2383,7 @@ bad_fork_cleanup_threadgroup_lock: #endif delayacct_tsk_free(p); bad_fork_cleanup_count: - atomic_dec(&p->cred->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_creds(p); bad_fork_free: p->state = TASK_DEAD; diff --git a/kernel/sys.c b/kernel/sys.c index cabfc5b86175..00266a65a000 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -473,7 +473,7 @@ static int set_user(struct cred *new) * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ - if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && + if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new_user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else diff --git a/kernel/ucount.c b/kernel/ucount.c index 365865f368ec..6caa56f7dec8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { } }; #endif /* CONFIG_SYSCTL */ @@ -240,6 +241,48 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } +long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long ret = 0; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long max = READ_ONCE(iter->ns->ucount_max[type]); + long new = atomic_long_add_return(v, &iter->ucount[type]); + if (new < 0 || new > max) + ret = LONG_MAX; + else if (iter == ucounts) + ret = new; + } + return ret; +} + +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long new; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long dec = atomic_long_add_return(-v, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + if (iter == ucounts) + new = dec; + } + return (new == 0); +} + +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) +{ + struct ucounts *iter; + if (get_ucounts_value(ucounts, type) > max) + return true; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + max = READ_ONCE(iter->ns->ucount_max[type]); + if (get_ucounts_value(iter, type) > max) + return true; + } + return false; +} + static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL @@ -256,6 +299,7 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif hlist_add_ucounts(&init_ucounts); + inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init); diff --git a/kernel/user.c b/kernel/user.c index a2478cddf536..7f5ff498207a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f1b7b4b8ffa2..e6577c835072 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -119,9 +119,10 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); - for (i = 0; i < UCOUNT_COUNTS; i++) { + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } + ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From 6e52a9f0532f912af37bab4caf18b57d1b9845f4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:12 +0200 Subject: Reimplement RLIMIT_MSGQUEUE on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/2531f42f7884bbfee56a978040b3e0d25cdf6cde.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/sched/user.h | 4 ---- include/linux/user_namespace.h | 1 + ipc/mqueue.c | 40 +++++++++++++++++++++------------------- kernel/fork.c | 1 + kernel/ucount.c | 1 + kernel/user_namespace.c | 1 + 6 files changed, 25 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index d33d867ad6c1..8a34446681aa 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -18,10 +18,6 @@ struct user_struct { #endif #ifdef CONFIG_EPOLL atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ -#endif -#ifdef CONFIG_POSIX_MQUEUE - /* protected by mq_lock */ - unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4a97acc35990..5eeb86b00e68 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -51,6 +51,7 @@ enum ucount_type { UCOUNT_INOTIFY_WATCHES, #endif UCOUNT_RLIMIT_NPROC, + UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_COUNTS, }; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 8031464ed4ae..461fcf8c873d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -144,7 +144,7 @@ struct mqueue_inode_info { struct pid *notify_owner; u32 notify_self_exec_id; struct user_namespace *notify_user_ns; - struct user_struct *user; /* user who created, for accounting */ + struct ucounts *ucounts; /* user who created, for accounting */ struct sock *notify_sock; struct sk_buff *notify_cookie; @@ -292,7 +292,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, struct ipc_namespace *ipc_ns, umode_t mode, struct mq_attr *attr) { - struct user_struct *u = current_user(); struct inode *inode; int ret = -ENOMEM; @@ -321,7 +320,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, info->notify_owner = NULL; info->notify_user_ns = NULL; info->qsize = 0; - info->user = NULL; /* set when all is ok */ + info->ucounts = NULL; /* set when all is ok */ info->msg_tree = RB_ROOT; info->msg_tree_rightmost = NULL; info->node_cache = NULL; @@ -371,19 +370,23 @@ static struct inode *mqueue_get_inode(struct super_block *sb, if (mq_bytes + mq_treesize < mq_bytes) goto out_inode; mq_bytes += mq_treesize; - spin_lock(&mq_lock); - if (u->mq_bytes + mq_bytes < u->mq_bytes || - u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) { + info->ucounts = get_ucounts(current_ucounts()); + if (info->ucounts) { + long msgqueue; + + spin_lock(&mq_lock); + msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) { + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + spin_unlock(&mq_lock); + put_ucounts(info->ucounts); + info->ucounts = NULL; + /* mqueue_evict_inode() releases info->messages */ + ret = -EMFILE; + goto out_inode; + } spin_unlock(&mq_lock); - /* mqueue_evict_inode() releases info->messages */ - ret = -EMFILE; - goto out_inode; } - u->mq_bytes += mq_bytes; - spin_unlock(&mq_lock); - - /* all is ok */ - info->user = get_uid(u); } else if (S_ISDIR(mode)) { inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ @@ -497,7 +500,6 @@ static void mqueue_free_inode(struct inode *inode) static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; - struct user_struct *user; struct ipc_namespace *ipc_ns; struct msg_msg *msg, *nmsg; LIST_HEAD(tmp_msg); @@ -520,8 +522,7 @@ static void mqueue_evict_inode(struct inode *inode) free_msg(msg); } - user = info->user; - if (user) { + if (info->ucounts) { unsigned long mq_bytes, mq_treesize; /* Total amount of bytes accounted for the mqueue */ @@ -533,7 +534,7 @@ static void mqueue_evict_inode(struct inode *inode) info->attr.mq_msgsize); spin_lock(&mq_lock); - user->mq_bytes -= mq_bytes; + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); /* * get_ns_from_inode() ensures that the * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns @@ -543,7 +544,8 @@ static void mqueue_evict_inode(struct inode *inode) if (ipc_ns) ipc_ns->mq_queues_count--; spin_unlock(&mq_lock); - free_uid(user); + put_ucounts(info->ucounts); + info->ucounts = NULL; } if (ipc_ns) put_ipc_ns(ipc_ns); diff --git a/kernel/fork.c b/kernel/fork.c index ed7dfb07178d..a9c5097dfc86 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -823,6 +823,7 @@ void __init fork_init(void) init_user_ns.ucount_max[i] = max_threads/2; init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); + init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/ucount.c b/kernel/ucount.c index 6caa56f7dec8..6e6f936a5963 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { }, { } }; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e6577c835072..7eccc4f84549 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -123,6 +123,7 @@ int create_user_ns(struct cred *new) ns->ucount_max[i] = INT_MAX; } ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); + ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From d64696905554e919321e31afc210606653b8f6a4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:13 +0200 Subject: Reimplement RLIMIT_SIGPENDING on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Changelog v11: * Revert most of changes to fix performance issues. v10: * Fix memory leak on get_ucounts failure. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/df9d7764dddd50f28616b7840de74ec0f81711a8.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- fs/proc/array.c | 2 +- include/linux/sched/user.h | 1 - include/linux/signal_types.h | 4 +++- include/linux/user_namespace.h | 1 + kernel/fork.c | 1 + kernel/signal.c | 25 +++++++++++++------------ kernel/ucount.c | 1 + kernel/user.c | 1 - kernel/user_namespace.c | 1 + 9 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index bb87e4d89cd8..74b0ea4b7e38 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ - qsize = atomic_read(&__task_cred(p)->user->sigpending); + qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 8a34446681aa..8ba9cec4fb99 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -12,7 +12,6 @@ */ struct user_struct { refcount_t __count; /* reference count */ - atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_FANOTIFY atomic_t fanotify_listeners; #endif diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index 68e06c75c5b2..34cb28b8f16c 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -13,6 +13,8 @@ typedef struct kernel_siginfo { __SIGINFO; } kernel_siginfo_t; +struct ucounts; + /* * Real Time signals may be queued. */ @@ -21,7 +23,7 @@ struct sigqueue { struct list_head list; int flags; kernel_siginfo_t info; - struct user_struct *user; + struct ucounts *ucounts; }; /* flags values. */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 5eeb86b00e68..58f417986472 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -52,6 +52,7 @@ enum ucount_type { #endif UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, + UCOUNT_RLIMIT_SIGPENDING, UCOUNT_COUNTS, }; diff --git a/kernel/fork.c b/kernel/fork.c index a9c5097dfc86..03119926b27d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -824,6 +824,7 @@ void __init fork_init(void) init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); + init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/signal.c b/kernel/signal.c index f2718350bf4b..9a6dab712123 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,8 +413,8 @@ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; - struct user_struct *user; - int sigpending; + struct ucounts *ucounts = NULL; + long sigpending; /* * Protect access to @t credentials. This can go away when all @@ -425,27 +425,26 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi * changes from/to zero. */ rcu_read_lock(); - user = __task_cred(t)->user; - sigpending = atomic_inc_return(&user->sigpending); + ucounts = task_ucounts(t); + sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); if (sigpending == 1) - get_uid(user); + ucounts = get_ucounts(ucounts); rcu_read_unlock(); - if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { + if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - if (atomic_dec_and_test(&user->sigpending)) - free_uid(user); + if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) + put_ucounts(ucounts); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = user; + q->ucounts = ucounts; } - return q; } @@ -453,8 +452,10 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - if (atomic_dec_and_test(&q->user->sigpending)) - free_uid(q->user); + if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) { + put_ucounts(q->ucounts); + q->ucounts = NULL; + } kmem_cache_free(sigqueue_cachep, q); } diff --git a/kernel/ucount.c b/kernel/ucount.c index 6e6f936a5963..8ce62da6a62c 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { }, { }, { } diff --git a/kernel/user.c b/kernel/user.c index 7f5ff498207a..6737327f83be 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 7eccc4f84549..822eacee4588 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -124,6 +124,7 @@ int create_user_ns(struct cred *new) } ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); + ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From d7c9e99aee48e6bc0b427f3e3c658a6aba15001e Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:14 +0200 Subject: Reimplement RLIMIT_MEMLOCK on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Changelog v11: * Fix issue found by lkp robot. v8: * Fix issues found by lkp-tests project. v7: * Keep only ucounts for RLIMIT_MEMLOCK checks instead of struct cred. v6: * Fix bug in hugetlb_file_setup() detected by trinity. Reported-by: kernel test robot Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/970d50c70c71bfd4496e0e8d2a0a32feebebb350.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- fs/hugetlbfs/inode.c | 16 ++++++++-------- include/linux/hugetlb.h | 4 ++-- include/linux/mm.h | 4 ++-- include/linux/sched/user.h | 1 - include/linux/shmem_fs.h | 2 +- include/linux/user_namespace.h | 1 + ipc/shm.c | 26 +++++++++++++------------- kernel/fork.c | 1 + kernel/ucount.c | 1 + kernel/user.c | 1 - kernel/user_namespace.c | 1 + mm/memfd.c | 4 ++-- mm/mlock.c | 22 ++++++++++++++-------- mm/mmap.c | 4 ++-- mm/shmem.c | 10 +++++----- 15 files changed, 53 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 701c82c36138..be519fc9559a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1443,7 +1443,7 @@ static int get_hstate_idx(int page_size_log) * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, struct user_struct **user, + vm_flags_t acctflag, struct ucounts **ucounts, int creat_flags, int page_size_log) { struct inode *inode; @@ -1455,20 +1455,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size, if (hstate_idx < 0) return ERR_PTR(-ENODEV); - *user = NULL; + *ucounts = NULL; mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { - *user = current_user(); - if (user_shm_lock(size, *user)) { + *ucounts = current_ucounts(); + if (user_shm_lock(size, *ucounts)) { task_lock(current); pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", current->comm, current->pid); task_unlock(current); } else { - *user = NULL; + *ucounts = NULL; return ERR_PTR(-EPERM); } } @@ -1495,9 +1495,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, iput(inode); out: - if (*user) { - user_shm_unlock(size, *user); - *user = NULL; + if (*ucounts) { + user_shm_unlock(size, *ucounts); + *ucounts = NULL; } return file; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cccd1aab69dd..96d63dbdec65 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -434,7 +434,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, - struct user_struct **user, int creat_flags, + struct ucounts **ucounts, int creat_flags, int page_size_log); static inline bool is_file_hugepages(struct file *file) @@ -454,7 +454,7 @@ static inline struct hstate *hstate_inode(struct inode *i) #define is_file_hugepages(file) false static inline struct file * hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, - struct user_struct **user, int creat_flags, + struct ucounts **ucounts, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8ba434287387..3b4e24738ce4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1670,8 +1670,8 @@ extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif -extern int user_shm_lock(size_t, struct user_struct *); -extern void user_shm_unlock(size_t, struct user_struct *); +extern int user_shm_lock(size_t, struct ucounts *); +extern void user_shm_unlock(size_t, struct ucounts *); /* * Parameter block passed down to zap_pte_range in exceptional cases. diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 8ba9cec4fb99..82bd2532da6b 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -18,7 +18,6 @@ struct user_struct { #ifdef CONFIG_EPOLL atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ #endif - unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d82b6f396588..aa77dcd1646f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -65,7 +65,7 @@ extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -extern int shmem_lock(struct file *file, int lock, struct user_struct *user); +extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM extern const struct address_space_operations shmem_aops; static inline bool shmem_mapping(struct address_space *mapping) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 58f417986472..2a3177b9b8bf 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -53,6 +53,7 @@ enum ucount_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, + UCOUNT_RLIMIT_MEMLOCK, UCOUNT_COUNTS, }; diff --git a/ipc/shm.c b/ipc/shm.c index febd88daba8c..003234fbbd17 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -60,7 +60,7 @@ struct shmid_kernel /* private to the kernel */ time64_t shm_ctim; struct pid *shm_cprid; struct pid *shm_lprid; - struct user_struct *mlock_user; + struct ucounts *mlock_ucounts; /* The task created the shm object. NULL if the task is dead. */ struct task_struct *shm_creator; @@ -286,10 +286,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_rmid(ns, shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) - shmem_lock(shm_file, 0, shp->mlock_user); - else if (shp->mlock_user) + shmem_lock(shm_file, 0, shp->mlock_ucounts); + else if (shp->mlock_ucounts) user_shm_unlock(i_size_read(file_inode(shm_file)), - shp->mlock_user); + shp->mlock_ucounts); fput(shm_file); ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); @@ -625,7 +625,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_perm.key = key; shp->shm_perm.mode = (shmflg & S_IRWXUGO); - shp->mlock_user = NULL; + shp->mlock_ucounts = NULL; shp->shm_perm.security = NULL; error = security_shm_alloc(&shp->shm_perm); @@ -650,7 +650,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; file = hugetlb_file_setup(name, hugesize, acctflag, - &shp->mlock_user, HUGETLB_SHMFS_INODE, + &shp->mlock_ucounts, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { /* @@ -698,8 +698,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) no_id: ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); - if (is_file_hugepages(file) && shp->mlock_user) - user_shm_unlock(size, shp->mlock_user); + if (is_file_hugepages(file) && shp->mlock_ucounts) + user_shm_unlock(size, shp->mlock_ucounts); fput(file); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); return error; @@ -1105,12 +1105,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) goto out_unlock0; if (cmd == SHM_LOCK) { - struct user_struct *user = current_user(); + struct ucounts *ucounts = current_ucounts(); - err = shmem_lock(shm_file, 1, user); + err = shmem_lock(shm_file, 1, ucounts); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { shp->shm_perm.mode |= SHM_LOCKED; - shp->mlock_user = user; + shp->mlock_ucounts = ucounts; } goto out_unlock0; } @@ -1118,9 +1118,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) goto out_unlock0; - shmem_lock(shm_file, 0, shp->mlock_user); + shmem_lock(shm_file, 0, shp->mlock_ucounts); shp->shm_perm.mode &= ~SHM_LOCKED; - shp->mlock_user = NULL; + shp->mlock_ucounts = NULL; get_file(shm_file); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); diff --git a/kernel/fork.c b/kernel/fork.c index 03119926b27d..610fd4de60d7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -825,6 +825,7 @@ void __init fork_init(void) init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); + init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/ucount.c b/kernel/ucount.c index 8ce62da6a62c..d316bac3e520 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -83,6 +83,7 @@ static struct ctl_table user_table[] = { { }, { }, { }, + { }, { } }; #endif /* CONFIG_SYSCTL */ diff --git a/kernel/user.c b/kernel/user.c index 6737327f83be..c82399c1618a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .locked_shm = 0, .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 822eacee4588..892da1360862 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -125,6 +125,7 @@ int create_user_ns(struct cred *new) ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); + ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ diff --git a/mm/memfd.c b/mm/memfd.c index 2647c898990c..081dd33e6a61 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create, } if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; + struct ucounts *ucounts = NULL; - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); diff --git a/mm/mlock.c b/mm/mlock.c index f8f8cc32d03d..dd411aabf695 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall) */ static DEFINE_SPINLOCK(shmlock_user_lock); -int user_shm_lock(size_t size, struct user_struct *user) +int user_shm_lock(size_t size, struct ucounts *ucounts) { unsigned long lock_limit, locked; + long memlock; int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user) allowed = 1; lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); - if (!allowed && - locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + + if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + goto out; + } + if (!get_ucounts(ucounts)) { + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; - get_uid(user); - user->locked_shm += locked; + } allowed = 1; out: spin_unlock(&shmlock_user_lock); return allowed; } -void user_shm_unlock(size_t size, struct user_struct *user) +void user_shm_unlock(size_t size, struct ucounts *ucounts) { spin_lock(&shmlock_user_lock); - user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); spin_unlock(&shmlock_user_lock); - free_uid(user); + put_ucounts(ucounts); } diff --git a/mm/mmap.c b/mm/mmap.c index 3f287599a7a3..99f97d200aa4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1605,7 +1605,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, goto out_fput; } } else if (flags & MAP_HUGETLB) { - struct user_struct *user = NULL; + struct ucounts *ucounts = NULL; struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); @@ -1621,7 +1621,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE, + &ucounts, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..7ee6d27222e9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2227,7 +2227,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, } #endif -int shmem_lock(struct file *file, int lock, struct user_struct *user) +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2239,13 +2239,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) * no serialization needed when called from shm_destroy(). */ if (lock && !(info->flags & VM_LOCKED)) { - if (!user_shm_lock(inode->i_size, user)) + if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; info->flags |= VM_LOCKED; mapping_set_unevictable(file->f_mapping); } - if (!lock && (info->flags & VM_LOCKED) && user) { - user_shm_unlock(inode->i_size, user); + if (!lock && (info->flags & VM_LOCKED) && ucounts) { + user_shm_unlock(inode->i_size, ucounts); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); } @@ -4093,7 +4093,7 @@ int shmem_unuse(unsigned int type, bool frontswap, return 0; } -int shmem_lock(struct file *file, int lock, struct user_struct *user) +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } -- cgit v1.2.3 From c1ada3dc7219b02b3467aa906c2f5f8b098578d1 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:16 +0200 Subject: ucounts: Set ucount_max to the largest positive value the type can hold The ns->ucount_max[] is signed long which is less than the rlimit size. We have to protect ucount_max[] from overflow and only use the largest value that we can hold. On 32bit using "long" instead of "unsigned long" to hold the counts has the downside that RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK are limited to 2GiB instead of 4GiB. I don't think anyone cares but it should be mentioned in case someone does. The RLIMIT_NPROC and RLIMIT_SIGPENDING used atomic_t so their maximum hasn't changed. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/1825a5dfa18bc5a570e79feb05e2bd07fd57e7e3.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 6 ++++++ kernel/fork.c | 8 ++++---- kernel/user_namespace.c | 8 ++++---- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 2a3177b9b8bf..61794ae32fa8 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -125,6 +125,12 @@ long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); +static inline void set_rlimit_ucount_max(struct user_namespace *ns, + enum ucount_type type, unsigned long max) +{ + ns->ucount_max[type] = max <= LONG_MAX ? max : LONG_MAX; +} + #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) diff --git a/kernel/fork.c b/kernel/fork.c index 610fd4de60d7..c41820481b2e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -822,10 +822,10 @@ void __init fork_init(void) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; - init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); - init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); - init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); - init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK)); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 892da1360862..d4a545bbab7f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -122,10 +122,10 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); - ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); - ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); - ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From 6be388f4a35d2ce5ef7dbf635a8964a5da7f799f Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sun, 25 Apr 2021 23:03:53 +0530 Subject: HID: usbhid: fix info leak in hid_submit_ctrl In hid_submit_ctrl(), the way of calculating the report length doesn't take into account that report->size can be zero. When running the syzkaller reproducer, a report of size 0 causes hid_submit_ctrl) to calculate transfer_buffer_length as 16384. When this urb is passed to the usb core layer, KMSAN reports an info leak of 16384 bytes. To fix this, first modify hid_report_len() to account for the zero report size case by using DIV_ROUND_UP for the division. Then, call it from hid_submit_ctrl(). Reported-by: syzbot+7c2bb71996f95a82524c@syzkaller.appspotmail.com Signed-off-by: Anirudh Rayabharam Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 2 +- include/linux/hid.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 86257ce6d619..4e9077363c96 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -374,7 +374,7 @@ static int hid_submit_ctrl(struct hid_device *hid) raw_report = usbhid->ctrl[usbhid->ctrltail].raw_report; dir = usbhid->ctrl[usbhid->ctrltail].dir; - len = ((report->size - 1) >> 3) + 1 + (report->id > 0); + len = hid_report_len(report); if (dir == USB_DIR_OUT) { usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); usbhid->urbctrl->transfer_buffer_length = len; diff --git a/include/linux/hid.h b/include/linux/hid.h index 271021e20a3f..10e922cee4eb 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1167,8 +1167,7 @@ static inline void hid_hw_wait(struct hid_device *hdev) */ static inline u32 hid_report_len(struct hid_report *report) { - /* equivalent to DIV_ROUND_UP(report->size, 8) + !!(report->id > 0) */ - return ((report->size - 1) >> 3) + 1 + (report->id > 0); + return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, -- cgit v1.2.3 From 1333a6779501f4cc662ff5c8b36b0a22f3a7ddc6 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Sat, 24 Apr 2021 13:06:04 +0200 Subject: nvmem: core: allow specifying of_node Until now, the of_node of the parent device is used. Some devices provide more than just the nvmem provider. To avoid name space clashes, add a way to allow specifying the nvmem cells in subnodes. Consider the following example: flash@0 { compatible = "jedec,spi-nor"; partitions { compatible = "fixed-partitions"; #address-cells = <1>; #size-cells = <1>; partition@0 { reg = <0x000000 0x010000>; }; }; otp { compatible = "user-otp"; #address-cells = <1>; #size-cells = <1>; serial-number@0 { reg = <0x0 0x8>; }; }; }; There the nvmem provider might be the MTD partition or the OTP region of the flash. Add a new config->of_node parameter, which if set, will be used instead of the parent's of_node. Signed-off-by: Michael Walle Acked-by: Srinivas Kandagatla Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210424110608.15748-2-michael@walle.cc --- drivers/nvmem/core.c | 4 +++- include/linux/nvmem-provider.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index bca671ff4e54..62d363a399d3 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -789,7 +789,9 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) nvmem->reg_write = config->reg_write; nvmem->keepout = config->keepout; nvmem->nkeepout = config->nkeepout; - if (!config->no_of_node) + if (config->of_node) + nvmem->dev.of_node = config->of_node; + else if (!config->no_of_node) nvmem->dev.of_node = config->dev->of_node; switch (config->id) { diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index e162b757b6d5..471cb7b9e896 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -57,6 +57,7 @@ struct nvmem_keepout { * @type: Type of the nvmem storage * @read_only: Device is read-only. * @root_only: Device is accessibly to root only. + * @of_node: If given, this will be used instead of the parent's of_node. * @no_of_node: Device should not use the parent's of_node even if it's !NULL. * @reg_read: Callback to read data. * @reg_write: Callback to write data. @@ -86,6 +87,7 @@ struct nvmem_config { enum nvmem_type type; bool read_only; bool root_only; + struct device_node *of_node; bool no_of_node; nvmem_reg_read_t reg_read; nvmem_reg_write_t reg_write; -- cgit v1.2.3 From 4b361cfa862479fbb1d14ddf01de4dbc7146dcc5 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Sat, 24 Apr 2021 13:06:08 +0200 Subject: mtd: core: add OTP nvmem provider support Flash OTP regions can already be read via user space. Some boards have their serial number or MAC addresses stored in the OTP regions. Add support for them being a (read-only) nvmem provider. The API to read the OTP data is already in place. It distinguishes between factory and user OTP, thus there are up to two different providers. Signed-off-by: Michael Walle Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210424110608.15748-6-michael@walle.cc --- drivers/mtd/mtdcore.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/mtd.h | 2 + 2 files changed, 150 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 1fb43eea3599..3f2e20e22501 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -776,6 +776,146 @@ static void mtd_set_dev_defaults(struct mtd_info *mtd) mutex_init(&mtd->master.chrdev_lock); } +static ssize_t mtd_otp_size(struct mtd_info *mtd, bool is_user) +{ + struct otp_info *info = kmalloc(PAGE_SIZE, GFP_KERNEL); + ssize_t size = 0; + unsigned int i; + size_t retlen; + int ret; + + if (is_user) + ret = mtd_get_user_prot_info(mtd, PAGE_SIZE, &retlen, info); + else + ret = mtd_get_fact_prot_info(mtd, PAGE_SIZE, &retlen, info); + if (ret) + goto err; + + for (i = 0; i < retlen / sizeof(*info); i++) { + size += info->length; + info++; + } + + kfree(info); + return size; + +err: + kfree(info); + return ret; +} + +static struct nvmem_device *mtd_otp_nvmem_register(struct mtd_info *mtd, + const char *compatible, + int size, + nvmem_reg_read_t reg_read) +{ + struct nvmem_device *nvmem = NULL; + struct nvmem_config config = {}; + struct device_node *np; + + /* DT binding is optional */ + np = of_get_compatible_child(mtd->dev.of_node, compatible); + + /* OTP nvmem will be registered on the physical device */ + config.dev = mtd->dev.parent; + /* just reuse the compatible as name */ + config.name = compatible; + config.id = NVMEM_DEVID_NONE; + config.owner = THIS_MODULE; + config.type = NVMEM_TYPE_OTP; + config.root_only = true; + config.reg_read = reg_read; + config.size = size; + config.of_node = np; + config.priv = mtd; + + nvmem = nvmem_register(&config); + /* Just ignore if there is no NVMEM support in the kernel */ + if (IS_ERR(nvmem) && PTR_ERR(nvmem) == -EOPNOTSUPP) + nvmem = NULL; + + of_node_put(np); + + return nvmem; +} + +static int mtd_nvmem_user_otp_reg_read(void *priv, unsigned int offset, + void *val, size_t bytes) +{ + struct mtd_info *mtd = priv; + size_t retlen; + int ret; + + ret = mtd_read_user_prot_reg(mtd, offset, bytes, &retlen, val); + if (ret) + return ret; + + return retlen == bytes ? 0 : -EIO; +} + +static int mtd_nvmem_fact_otp_reg_read(void *priv, unsigned int offset, + void *val, size_t bytes) +{ + struct mtd_info *mtd = priv; + size_t retlen; + int ret; + + ret = mtd_read_fact_prot_reg(mtd, offset, bytes, &retlen, val); + if (ret) + return ret; + + return retlen == bytes ? 0 : -EIO; +} + +static int mtd_otp_nvmem_add(struct mtd_info *mtd) +{ + struct nvmem_device *nvmem; + ssize_t size; + int err; + + if (mtd->_get_user_prot_info && mtd->_read_user_prot_reg) { + size = mtd_otp_size(mtd, true); + if (size < 0) + return size; + + if (size > 0) { + nvmem = mtd_otp_nvmem_register(mtd, "user-otp", size, + mtd_nvmem_user_otp_reg_read); + if (IS_ERR(nvmem)) { + dev_err(&mtd->dev, "Failed to register OTP NVMEM device\n"); + return PTR_ERR(nvmem); + } + mtd->otp_user_nvmem = nvmem; + } + } + + if (mtd->_get_fact_prot_info && mtd->_read_fact_prot_reg) { + size = mtd_otp_size(mtd, false); + if (size < 0) { + err = size; + goto err; + } + + if (size > 0) { + nvmem = mtd_otp_nvmem_register(mtd, "factory-otp", size, + mtd_nvmem_fact_otp_reg_read); + if (IS_ERR(nvmem)) { + dev_err(&mtd->dev, "Failed to register OTP NVMEM device\n"); + err = PTR_ERR(nvmem); + goto err; + } + mtd->otp_factory_nvmem = nvmem; + } + } + + return 0; + +err: + if (mtd->otp_user_nvmem) + nvmem_unregister(mtd->otp_user_nvmem); + return err; +} + /** * mtd_device_parse_register - parse partitions and register an MTD device. * @@ -851,6 +991,8 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, register_reboot_notifier(&mtd->reboot_notifier); } + ret = mtd_otp_nvmem_add(mtd); + out: if (ret && device_is_registered(&mtd->dev)) del_mtd_device(mtd); @@ -872,6 +1014,12 @@ int mtd_device_unregister(struct mtd_info *master) if (master->_reboot) unregister_reboot_notifier(&master->reboot_notifier); + if (master->otp_user_nvmem) + nvmem_unregister(master->otp_user_nvmem); + + if (master->otp_factory_nvmem) + nvmem_unregister(master->otp_factory_nvmem); + err = del_mtd_partitions(master); if (err) return err; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index a89955f3cbc8..88227044fc86 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -380,6 +380,8 @@ struct mtd_info { int usecount; struct mtd_debug_info dbg; struct nvmem_device *nvmem; + struct nvmem_device *otp_user_nvmem; + struct nvmem_device *otp_factory_nvmem; /* * Parent device from the MTD partition point of view. -- cgit v1.2.3 From 0e4768713e71dd224633fd7e00ad358bc48f433a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Apr 2021 21:24:31 +0300 Subject: spi: pxa2xx: Replace header inclusions by forward declarations When the data structure is only referred by pointer, compiler may not need to see the contents of the data type. Thus, we may replace header inclusions by respective forward declarations. Due to above add missed headers as well. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210423182441.50272-5-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-dma.c | 4 ++-- drivers/spi/spi-pxa2xx-pci.c | 1 + drivers/spi/spi-pxa2xx.c | 2 ++ drivers/spi/spi-pxa2xx.h | 18 ++++++++++-------- include/linux/spi/pxa2xx_spi.h | 2 ++ 5 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c index 2e4a49567146..e00dbadd39ec 100644 --- a/drivers/spi/spi-pxa2xx-dma.c +++ b/drivers/spi/spi-pxa2xx-dma.c @@ -9,11 +9,11 @@ #include #include #include -#include #include #include -#include + #include +#include #include "spi-pxa2xx.h" diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c index f588fad77fc0..a259be12d326 100644 --- a/drivers/spi/spi-pxa2xx-pci.c +++ b/drivers/spi/spi-pxa2xx-pci.c @@ -8,6 +8,7 @@ #include #include #include + #include #include diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 0f3f7d725937..1d4c7f4217ed 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include + #include #include diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h index 6724d7e056ce..739e264feaa6 100644 --- a/drivers/spi/spi-pxa2xx.h +++ b/drivers/spi/spi-pxa2xx.h @@ -7,16 +7,18 @@ #ifndef SPI_PXA2XX_H #define SPI_PXA2XX_H -#include -#include -#include -#include #include -#include -#include +#include +#include #include -#include -#include + +#include + +struct gpio_desc; +struct pxa2xx_spi_controller; +struct spi_controller; +struct spi_device; +struct spi_transfer; struct driver_data { /* SSP Info */ diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index 31f00c7f4f59..1e0e2f136319 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -5,6 +5,8 @@ #ifndef __linux_pxa2xx_spi_h #define __linux_pxa2xx_spi_h +#include + #include #define PXA2XX_CS_ASSERT (0x01) -- cgit v1.2.3 From 5edc24901f4d469f8fc943004f73655933e89dbf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Apr 2021 21:24:32 +0300 Subject: spi: pxa2xx: Unify ifdeffery used in the headers The two headers have quite different ifdeffery to prevent multiple inclusion. Unify them with the pattern that in particular reflects their location. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210423182441.50272-6-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 6 +++--- include/linux/spi/pxa2xx_spi.h | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 7f73b26ed22e..14b049840faf 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -11,8 +11,8 @@ * PXA3xx SSP1, SSP2, SSP3, SSP4 */ -#ifndef __LINUX_SSP_H -#define __LINUX_SSP_H +#ifndef __LINUX_PXA2XX_SSP_H +#define __LINUX_PXA2XX_SSP_H #include #include @@ -270,4 +270,4 @@ static inline struct ssp_device *pxa_ssp_request_of(const struct device_node *n, static inline void pxa_ssp_free(struct ssp_device *ssp) {} #endif -#endif +#endif /* __LINUX_PXA2XX_SSP_H */ diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index 1e0e2f136319..12ef04d0896d 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -2,8 +2,8 @@ /* * Copyright (C) 2005 Stephen Street / StreetFire Sound Labs */ -#ifndef __linux_pxa2xx_spi_h -#define __linux_pxa2xx_spi_h +#ifndef __LINUX_SPI_PXA2XX_SPI_H +#define __LINUX_SPI_PXA2XX_SPI_H #include @@ -51,4 +51,5 @@ struct pxa2xx_spi_chip { extern void pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_controller *info); #endif -#endif + +#endif /* __LINUX_SPI_PXA2XX_SPI_H */ -- cgit v1.2.3 From 1beb37b0e3f98708bfb37778049764b4500756da Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Apr 2021 21:24:33 +0300 Subject: spi: pxa2xx: Group Intel Quark specific definitions DDS_RATE is Intel Quark specific definition. Move it to the rest Intel Quark related. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210423182441.50272-7-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 14b049840faf..1b6c1a0922bd 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -38,7 +38,6 @@ struct device_node; #define SSDR (0x10) /* SSP Data Write/Data Read Register */ #define SSTO (0x28) /* SSP Time Out Register */ -#define DDS_RATE (0x28) /* SSP DDS Clock Rate Register (Intel Quark) */ #define SSPSP (0x2C) /* SSP Programmable Serial Protocol */ #define SSTSA (0x30) /* SSP Tx Timeslot Active */ #define SSRSA (0x34) /* SSP Rx Timeslot Active */ @@ -105,6 +104,9 @@ struct device_node; #define CE4100_SSCR1_RFT GENMASK(11, 10) /* Receive FIFO Threshold (mask) */ #define CE4100_SSCR1_RxTresh(x) (((x) - 1) << 10) /* level [1..4] */ +/* Intel Quark X1000 */ +#define DDS_RATE 0x28 /* SSP DDS Clock Rate Register */ + /* QUARK_X1000 SSCR0 bit definition */ #define QUARK_X1000_SSCR0_DSS GENMASK(4, 0) /* Data Size Select (mask) */ #define QUARK_X1000_SSCR0_DataSize(x) ((x) - 1) /* Data Size Select [4..32] */ -- cgit v1.2.3 From 661ee6280931548f7b3b887ad26a157474ae5ac4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 8 May 2021 14:15:38 +0200 Subject: cgroup: introduce cgroup.kill Introduce the cgroup.kill file. It does what it says on the tin and allows a caller to kill a cgroup by writing "1" into cgroup.kill. The file is available in non-root cgroups. Killing cgroups is a process directed operation, i.e. the whole thread-group is affected. Consequently trying to write to cgroup.kill in threaded cgroups will be rejected and EOPNOTSUPP returned. This behavior aligns with cgroup.procs where reads in threaded-cgroups are rejected with EOPNOTSUPP. The cgroup.kill file is write-only since killing a cgroup is an event not which makes it different from e.g. freezer where a cgroup transitions between the two states. As with all new cgroup features cgroup.kill is recursive by default. Killing a cgroup is protected against concurrent migrations through the cgroup mutex. To protect against forkbombs and to mitigate the effect of racing forks a new CGRP_KILL css set lock protected flag is introduced that is set prior to killing a cgroup and unset after the cgroup has been killed. We can then check in cgroup_post_fork() where we hold the css set lock already whether the cgroup is currently being killed. If so we send the child a SIGKILL signal immediately taking it down as soon as it returns to userspace. To make the killing of the child semantically clean it is killed after all cgroup attachment operations have been finalized. There are various use-cases of this interface: - Containers usually have a conservative layout where each container usually has a delegated cgroup. For such layouts there is a 1:1 mapping between container and cgroup. If the container in addition uses a separate pid namespace then killing a container usually becomes a simple kill -9 from an ancestor pid namespace. However, there are quite a few scenarios where that isn't true. For example, there are containers that share the cgroup with other processes on purpose that are supposed to be bound to the lifetime of the container but are not in the same pidns of the container. Containers that are in a delegated cgroup but share the pid namespace with the host or other containers. - Service managers such as systemd use cgroups to group and organize processes belonging to a service. They usually rely on a recursive algorithm now to kill a service. With cgroup.kill this becomes a simple write to cgroup.kill. - Userspace OOM implementations can make good use of this feature to efficiently take down whole cgroups quickly. - The kill program can gain a new kill --cgroup /sys/fs/cgroup/delegated flag to take down cgroups. A few observations about the semantics: - If parent and child are in the same cgroup and CLONE_INTO_CGROUP is not specified we are not taking cgroup mutex meaning the cgroup can be killed while a process in that cgroup is forking. If the kill request happens right before cgroup_can_fork() and before the parent grabs its siglock the parent is guaranteed to see the pending SIGKILL. In addition we perform another check in cgroup_post_fork() whether the cgroup is being killed and is so take down the child (see above). This is robust enough and protects gainst forkbombs. If userspace really really wants to have stricter protection the simple solution would be to grab the write side of the cgroup threadgroup rwsem which will force all ongoing forks to complete before killing starts. We concluded that this is not necessary as the semantics for concurrent forking should simply align with freezer where a similar check as cgroup_post_fork() is performed. For all other cases CLONE_INTO_CGROUP is required. In this case we will grab the cgroup mutex so the cgroup can't be killed while we fork. Once we're done with the fork and have dropped cgroup mutex we are visible and will be found by any subsequent kill request. - We obviously don't kill kthreads. This means a cgroup that has a kthread will not become empty after killing and consequently no unpopulated event will be generated. The assumption is that kthreads should be in the root cgroup only anyway so this is not an issue. - We skip killing tasks that already have pending fatal signals. - Freezer doesn't care about tasks in different pid namespaces, i.e. if you have two tasks in different pid namespaces the cgroup would still be frozen. The cgroup.kill mechanism consequently behaves the same way, i.e. we kill all processes and ignore in which pid namespace they exist. - If the caller is located in a cgroup that is killed the caller will obviously be killed as well. Link: https://lore.kernel.org/r/20210503143922.3093755-1-brauner@kernel.org Cc: Shakeel Butt Cc: Roman Gushchin Cc: Tejun Heo Cc: cgroups@vger.kernel.org Reviewed-by: Shakeel Butt Reviewed-by: Serge Hallyn Acked-by: Roman Gushchin Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 3 ++ kernel/cgroup/cgroup.c | 127 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 116 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 559ee05f86b2..43fef771009a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -71,6 +71,9 @@ enum { /* Cgroup is frozen. */ CGRP_FROZEN, + + /* Control group has to be killed. */ + CGRP_KILL, }; /* cgroup_root->flags */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e049edd66776..e640fc78d731 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3667,6 +3667,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, return nbytes; } +static void __cgroup_kill(struct cgroup *cgrp) +{ + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&cgroup_mutex); + + spin_lock_irq(&css_set_lock); + set_bit(CGRP_KILL, &cgrp->flags); + spin_unlock_irq(&css_set_lock); + + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); + while ((task = css_task_iter_next(&it))) { + /* Ignore kernel threads here. */ + if (task->flags & PF_KTHREAD) + continue; + + /* Skip tasks that are already dying. */ + if (__fatal_signal_pending(task)) + continue; + + send_sig(SIGKILL, task, 0); + } + css_task_iter_end(&it); + + spin_lock_irq(&css_set_lock); + clear_bit(CGRP_KILL, &cgrp->flags); + spin_unlock_irq(&css_set_lock); +} + +static void cgroup_kill(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *css; + struct cgroup *dsct; + + lockdep_assert_held(&cgroup_mutex); + + cgroup_for_each_live_descendant_pre(dsct, css, cgrp) + __cgroup_kill(dsct); +} + +static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + ssize_t ret = 0; + int kill; + struct cgroup *cgrp; + + ret = kstrtoint(strstrip(buf), 0, &kill); + if (ret) + return ret; + + if (kill != 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* + * Killing is a process directed operation, i.e. the whole thread-group + * is taken down so act like we do for cgroup.procs and only make this + * writable in non-threaded cgroups. + */ + if (cgroup_is_threaded(cgrp)) + ret = -EOPNOTSUPP; + else + cgroup_kill(cgrp); + + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); @@ -4859,6 +4933,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_freeze_show, .write = cgroup_freeze_write, }, + { + .name = "cgroup.kill", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_kill_write, + }, { .name = "cpu.stat", .seq_show = cpu_stat_show, @@ -6085,6 +6164,8 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned long cgrp_flags = 0; + bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; @@ -6096,6 +6177,11 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { + if (kargs->cgrp) + cgrp_flags = kargs->cgrp->flags; + else + cgrp_flags = cset->dfl_cgrp->flags; + WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); @@ -6104,23 +6190,32 @@ void cgroup_post_fork(struct task_struct *child, cset = NULL; } - /* - * If the cgroup has to be frozen, the new task has too. Let's set - * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the - * frozen state. - */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); + if (!(child->flags & PF_KTHREAD)) { + if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { + /* + * If the cgroup has to be frozen, the new task has + * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to + * get the task into the frozen state. + */ + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient + * switch from the frozen state and back. + */ + } /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later from - * do_freezer_trap(). So we avoid cgroup's transient switch - * from the frozen state and back. + * If the cgroup is to be killed notice it now and take the + * child down right after we finished preparing it for + * userspace. */ + kill = test_bit(CGRP_KILL, &cgrp_flags); } spin_unlock_irq(&css_set_lock); @@ -6143,6 +6238,10 @@ void cgroup_post_fork(struct task_struct *child, put_css_set(rcset); } + /* Cgroup has to be killed so take down child immediately. */ + if (unlikely(kill)) + do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); + cgroup_css_set_put_fork(kargs); } -- cgit v1.2.3 From bf067edf5d2f5b2948ee7197974a719aae3e526c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 8 May 2021 00:07:47 +0200 Subject: openrisc: always use unaligned-struct header openrisc is the only architecture using the linux/unaligned/*memmove infrastructure. There is a comment saying that this version is more efficient, but this was added in 2011 before the openrisc gcc port was merged upstream. I checked a couple of files to see what the actual difference is with the mainline gcc (9.4 and 11.1), and found that the generic header seems to produce better code now, regardless of the gcc version. Specifically, the be_memmove leads to allocating a stack slot and copying the data one byte at a time, then reading the whole word from the stack: 00000000 : 0: 9c 21 ff f4 l.addi r1,r1,-12 4: d4 01 10 04 l.sw 4(r1),r2 8: 8e 63 00 00 l.lbz r19,0(r3) c: 9c 41 00 0c l.addi r2,r1,12 10: 8e 23 00 01 l.lbz r17,1(r3) 14: db e2 9f f4 l.sb -12(r2),r19 18: db e2 8f f5 l.sb -11(r2),r17 1c: 8e 63 00 02 l.lbz r19,2(r3) 20: 8e 23 00 03 l.lbz r17,3(r3) 24: d4 01 48 08 l.sw 8(r1),r9 28: db e2 9f f6 l.sb -10(r2),r19 2c: db e2 8f f7 l.sb -9(r2),r17 30: 85 62 ff f4 l.lwz r11,-12(r2) 34: 85 21 00 08 l.lwz r9,8(r1) 38: 84 41 00 04 l.lwz r2,4(r1) 3c: 44 00 48 00 l.jr r9 40: 9c 21 00 0c l.addi r1,r1,12 while the be_struct version reads each byte into a register and does a shift to the right position: 00000000 : 0: 9c 21 ff f8 l.addi r1,r1,-8 4: 8e 63 00 00 l.lbz r19,0(r3) 8: aa 20 00 18 l.ori r17,r0,0x18 c: e2 73 88 08 l.sll r19,r19,r17 10: 8d 63 00 01 l.lbz r11,1(r3) 14: aa 20 00 10 l.ori r17,r0,0x10 18: e1 6b 88 08 l.sll r11,r11,r17 1c: e1 6b 98 04 l.or r11,r11,r19 20: 8e 23 00 02 l.lbz r17,2(r3) 24: aa 60 00 08 l.ori r19,r0,0x8 28: e2 31 98 08 l.sll r17,r17,r19 2c: d4 01 10 00 l.sw 0(r1),r2 30: d4 01 48 04 l.sw 4(r1),r9 34: 9c 41 00 08 l.addi r2,r1,8 38: e2 31 58 04 l.or r17,r17,r11 3c: 8d 63 00 03 l.lbz r11,3(r3) 40: e1 6b 88 04 l.or r11,r11,r17 44: 84 41 00 00 l.lwz r2,0(r1) 48: 85 21 00 04 l.lwz r9,4(r1) 4c: 44 00 48 00 l.jr r9 50: 9c 21 00 08 l.addi r1,r1,8 According to Stafford Horne, the new version should in fact perform better. In the trivial example, the struct version is a few instructions longer, but building a whole kernel shows an overall reduction in code size, presumably because it now has to manage fewer stack slots: text data bss dec hex filename 4792010 181480 82324 5055814 4d2546 vmlinux-unaligned-memmove 4790642 181480 82324 5054446 4d1fee vmlinux-unaligned-struct Remove the memmove version completely and let openrisc use the same code as everyone else, as a simplification. Signed-off-by: Arnd Bergmann Acked-by: Stafford Horne --- arch/openrisc/include/asm/unaligned.h | 47 ----------------------------------- include/linux/unaligned/be_memmove.h | 37 --------------------------- include/linux/unaligned/le_memmove.h | 37 --------------------------- include/linux/unaligned/memmove.h | 46 ---------------------------------- 4 files changed, 167 deletions(-) delete mode 100644 arch/openrisc/include/asm/unaligned.h delete mode 100644 include/linux/unaligned/be_memmove.h delete mode 100644 include/linux/unaligned/le_memmove.h delete mode 100644 include/linux/unaligned/memmove.h (limited to 'include/linux') diff --git a/arch/openrisc/include/asm/unaligned.h b/arch/openrisc/include/asm/unaligned.h deleted file mode 100644 index 14353f2101f2..000000000000 --- a/arch/openrisc/include/asm/unaligned.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * OpenRISC Linux - * - * Linux architectural port borrowing liberally from similar works of - * others. All original copyrights apply as per the original source - * declaration. - * - * OpenRISC implementation: - * Copyright (C) 2003 Matjaz Breskvar - * Copyright (C) 2010-2011 Jonas Bonn - * et al. - */ - -#ifndef __ASM_OPENRISC_UNALIGNED_H -#define __ASM_OPENRISC_UNALIGNED_H - -/* - * This is copied from the generic implementation and the C-struct - * variant replaced with the memmove variant. The GCC compiler - * for the OR32 arch optimizes too aggressively for the C-struct - * variant to work, so use the memmove variant instead. - * - * It may be worth considering implementing the unaligned access - * exception handler and allowing unaligned accesses (access_ok.h)... - * not sure if it would be much of a performance win without further - * investigation. - */ -#include - -#if defined(__LITTLE_ENDIAN) -# include -# include -# include -# define get_unaligned __get_unaligned_le -# define put_unaligned __put_unaligned_le -#elif defined(__BIG_ENDIAN) -# include -# include -# include -# define get_unaligned __get_unaligned_be -# define put_unaligned __put_unaligned_be -#else -# error need to define endianess -#endif - -#endif /* __ASM_OPENRISC_UNALIGNED_H */ diff --git a/include/linux/unaligned/be_memmove.h b/include/linux/unaligned/be_memmove.h deleted file mode 100644 index 7164214a4ba1..000000000000 --- a/include/linux/unaligned/be_memmove.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_UNALIGNED_BE_MEMMOVE_H -#define _LINUX_UNALIGNED_BE_MEMMOVE_H - -#include - -static inline u16 get_unaligned_be16(const void *p) -{ - return __get_unaligned_memmove16((const u8 *)p); -} - -static inline u32 get_unaligned_be32(const void *p) -{ - return __get_unaligned_memmove32((const u8 *)p); -} - -static inline u64 get_unaligned_be64(const void *p) -{ - return __get_unaligned_memmove64((const u8 *)p); -} - -static inline void put_unaligned_be16(u16 val, void *p) -{ - __put_unaligned_memmove16(val, p); -} - -static inline void put_unaligned_be32(u32 val, void *p) -{ - __put_unaligned_memmove32(val, p); -} - -static inline void put_unaligned_be64(u64 val, void *p) -{ - __put_unaligned_memmove64(val, p); -} - -#endif /* _LINUX_UNALIGNED_LE_MEMMOVE_H */ diff --git a/include/linux/unaligned/le_memmove.h b/include/linux/unaligned/le_memmove.h deleted file mode 100644 index 9202e864d026..000000000000 --- a/include/linux/unaligned/le_memmove.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_UNALIGNED_LE_MEMMOVE_H -#define _LINUX_UNALIGNED_LE_MEMMOVE_H - -#include - -static inline u16 get_unaligned_le16(const void *p) -{ - return __get_unaligned_memmove16((const u8 *)p); -} - -static inline u32 get_unaligned_le32(const void *p) -{ - return __get_unaligned_memmove32((const u8 *)p); -} - -static inline u64 get_unaligned_le64(const void *p) -{ - return __get_unaligned_memmove64((const u8 *)p); -} - -static inline void put_unaligned_le16(u16 val, void *p) -{ - __put_unaligned_memmove16(val, p); -} - -static inline void put_unaligned_le32(u32 val, void *p) -{ - __put_unaligned_memmove32(val, p); -} - -static inline void put_unaligned_le64(u64 val, void *p) -{ - __put_unaligned_memmove64(val, p); -} - -#endif /* _LINUX_UNALIGNED_LE_MEMMOVE_H */ diff --git a/include/linux/unaligned/memmove.h b/include/linux/unaligned/memmove.h deleted file mode 100644 index ac71b53bc6dc..000000000000 --- a/include/linux/unaligned/memmove.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_UNALIGNED_MEMMOVE_H -#define _LINUX_UNALIGNED_MEMMOVE_H - -#include -#include - -/* Use memmove here, so gcc does not insert a __builtin_memcpy. */ - -static inline u16 __get_unaligned_memmove16(const void *p) -{ - u16 tmp; - memmove(&tmp, p, 2); - return tmp; -} - -static inline u32 __get_unaligned_memmove32(const void *p) -{ - u32 tmp; - memmove(&tmp, p, 4); - return tmp; -} - -static inline u64 __get_unaligned_memmove64(const void *p) -{ - u64 tmp; - memmove(&tmp, p, 8); - return tmp; -} - -static inline void __put_unaligned_memmove16(u16 val, void *p) -{ - memmove(p, &val, 2); -} - -static inline void __put_unaligned_memmove32(u32 val, void *p) -{ - memmove(p, &val, 4); -} - -static inline void __put_unaligned_memmove64(u64 val, void *p) -{ - memmove(p, &val, 8); -} - -#endif /* _LINUX_UNALIGNED_MEMMOVE_H */ -- cgit v1.2.3 From 0652035a57945e14e611dafae2ec5b46a05bc1d1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 8 May 2021 00:07:51 +0200 Subject: asm-generic: unaligned: remove byteshift helpers In theory, compilers should be able to work this out themselves so we can use a simpler version based on the swab() helpers. I have verified that this works on all supported compiler versions (gcc-4.9 and up, clang-10 and up). Looking at the object code produced by gcc-11, I found that the impact is mostly a change in inlining decisions that lead to slightly larger code. In other cases, this version produces explicit byte swaps in place of separate byte access, or comparing against pre-swapped constants. While the source code is clearly simpler, I have not seen an indication of the new version actually producing better code on Arm, so maybe we want to skip this after all. From what I can tell, gcc recognizes the byteswap pattern in the byteshift.h header and can turn it into explicit instructions, but it does not turn a __builtin_bswap32() back into individual bytes when that would result in better output, e.g. when storing a byte-reversed constant. Suggested-by: Linus Torvalds Signed-off-by: Arnd Bergmann --- arch/arm/include/asm/unaligned.h | 2 - include/asm-generic/unaligned.h | 2 - include/linux/unaligned/be_byteshift.h | 71 ---------------------------------- include/linux/unaligned/be_struct.h | 30 ++++++++++++++ include/linux/unaligned/le_byteshift.h | 71 ---------------------------------- include/linux/unaligned/le_struct.h | 30 ++++++++++++++ 6 files changed, 60 insertions(+), 146 deletions(-) delete mode 100644 include/linux/unaligned/be_byteshift.h delete mode 100644 include/linux/unaligned/le_byteshift.h (limited to 'include/linux') diff --git a/arch/arm/include/asm/unaligned.h b/arch/arm/include/asm/unaligned.h index ab905ffcf193..3c5248fb4cdc 100644 --- a/arch/arm/include/asm/unaligned.h +++ b/arch/arm/include/asm/unaligned.h @@ -10,13 +10,11 @@ #if defined(__LITTLE_ENDIAN) # include -# include # include # define get_unaligned __get_unaligned_le # define put_unaligned __put_unaligned_le #elif defined(__BIG_ENDIAN) # include -# include # include # define get_unaligned __get_unaligned_be # define put_unaligned __put_unaligned_be diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index 374c940e9be1..d79df721ae60 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h @@ -16,7 +16,6 @@ #if defined(__LITTLE_ENDIAN) # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS # include -# include # endif # include # define get_unaligned __get_unaligned_le @@ -24,7 +23,6 @@ #elif defined(__BIG_ENDIAN) # ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS # include -# include # endif # include # define get_unaligned __get_unaligned_be diff --git a/include/linux/unaligned/be_byteshift.h b/include/linux/unaligned/be_byteshift.h deleted file mode 100644 index c43ff5918c8a..000000000000 --- a/include/linux/unaligned/be_byteshift.h +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H -#define _LINUX_UNALIGNED_BE_BYTESHIFT_H - -#include - -static inline u16 __get_unaligned_be16(const u8 *p) -{ - return p[0] << 8 | p[1]; -} - -static inline u32 __get_unaligned_be32(const u8 *p) -{ - return p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3]; -} - -static inline u64 __get_unaligned_be64(const u8 *p) -{ - return (u64)__get_unaligned_be32(p) << 32 | - __get_unaligned_be32(p + 4); -} - -static inline void __put_unaligned_be16(u16 val, u8 *p) -{ - *p++ = val >> 8; - *p++ = val; -} - -static inline void __put_unaligned_be32(u32 val, u8 *p) -{ - __put_unaligned_be16(val >> 16, p); - __put_unaligned_be16(val, p + 2); -} - -static inline void __put_unaligned_be64(u64 val, u8 *p) -{ - __put_unaligned_be32(val >> 32, p); - __put_unaligned_be32(val, p + 4); -} - -static inline u16 get_unaligned_be16(const void *p) -{ - return __get_unaligned_be16(p); -} - -static inline u32 get_unaligned_be32(const void *p) -{ - return __get_unaligned_be32(p); -} - -static inline u64 get_unaligned_be64(const void *p) -{ - return __get_unaligned_be64(p); -} - -static inline void put_unaligned_be16(u16 val, void *p) -{ - __put_unaligned_be16(val, p); -} - -static inline void put_unaligned_be32(u32 val, void *p) -{ - __put_unaligned_be32(val, p); -} - -static inline void put_unaligned_be64(u64 val, void *p) -{ - __put_unaligned_be64(val, p); -} - -#endif /* _LINUX_UNALIGNED_BE_BYTESHIFT_H */ diff --git a/include/linux/unaligned/be_struct.h b/include/linux/unaligned/be_struct.h index 15ea503a13fc..76d9fe297c33 100644 --- a/include/linux/unaligned/be_struct.h +++ b/include/linux/unaligned/be_struct.h @@ -34,4 +34,34 @@ static inline void put_unaligned_be64(u64 val, void *p) __put_unaligned_cpu64(val, p); } +static inline u16 get_unaligned_le16(const void *p) +{ + return swab16(__get_unaligned_cpu16((const u8 *)p)); +} + +static inline u32 get_unaligned_le32(const void *p) +{ + return swab32(__get_unaligned_cpu32((const u8 *)p)); +} + +static inline u64 get_unaligned_le64(const void *p) +{ + return swab64(__get_unaligned_cpu64((const u8 *)p)); +} + +static inline void put_unaligned_le16(u16 val, void *p) +{ + __put_unaligned_cpu16(swab16(val), p); +} + +static inline void put_unaligned_le32(u32 val, void *p) +{ + __put_unaligned_cpu32(swab32(val), p); +} + +static inline void put_unaligned_le64(u64 val, void *p) +{ + __put_unaligned_cpu64(swab64(val), p); +} + #endif /* _LINUX_UNALIGNED_BE_STRUCT_H */ diff --git a/include/linux/unaligned/le_byteshift.h b/include/linux/unaligned/le_byteshift.h deleted file mode 100644 index 2248dcb0df76..000000000000 --- a/include/linux/unaligned/le_byteshift.h +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H -#define _LINUX_UNALIGNED_LE_BYTESHIFT_H - -#include - -static inline u16 __get_unaligned_le16(const u8 *p) -{ - return p[0] | p[1] << 8; -} - -static inline u32 __get_unaligned_le32(const u8 *p) -{ - return p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24; -} - -static inline u64 __get_unaligned_le64(const u8 *p) -{ - return (u64)__get_unaligned_le32(p + 4) << 32 | - __get_unaligned_le32(p); -} - -static inline void __put_unaligned_le16(u16 val, u8 *p) -{ - *p++ = val; - *p++ = val >> 8; -} - -static inline void __put_unaligned_le32(u32 val, u8 *p) -{ - __put_unaligned_le16(val >> 16, p + 2); - __put_unaligned_le16(val, p); -} - -static inline void __put_unaligned_le64(u64 val, u8 *p) -{ - __put_unaligned_le32(val >> 32, p + 4); - __put_unaligned_le32(val, p); -} - -static inline u16 get_unaligned_le16(const void *p) -{ - return __get_unaligned_le16(p); -} - -static inline u32 get_unaligned_le32(const void *p) -{ - return __get_unaligned_le32(p); -} - -static inline u64 get_unaligned_le64(const void *p) -{ - return __get_unaligned_le64(p); -} - -static inline void put_unaligned_le16(u16 val, void *p) -{ - __put_unaligned_le16(val, p); -} - -static inline void put_unaligned_le32(u32 val, void *p) -{ - __put_unaligned_le32(val, p); -} - -static inline void put_unaligned_le64(u64 val, void *p) -{ - __put_unaligned_le64(val, p); -} - -#endif /* _LINUX_UNALIGNED_LE_BYTESHIFT_H */ diff --git a/include/linux/unaligned/le_struct.h b/include/linux/unaligned/le_struct.h index 9977987883a6..22f90a4afaa5 100644 --- a/include/linux/unaligned/le_struct.h +++ b/include/linux/unaligned/le_struct.h @@ -34,4 +34,34 @@ static inline void put_unaligned_le64(u64 val, void *p) __put_unaligned_cpu64(val, p); } +static inline u16 get_unaligned_be16(const void *p) +{ + return swab16(__get_unaligned_cpu16((const u8 *)p)); +} + +static inline u32 get_unaligned_be32(const void *p) +{ + return swab32(__get_unaligned_cpu32((const u8 *)p)); +} + +static inline u64 get_unaligned_be64(const void *p) +{ + return swab64(__get_unaligned_cpu64((const u8 *)p)); +} + +static inline void put_unaligned_be16(u16 val, void *p) +{ + __put_unaligned_cpu16(swab16(val), p); +} + +static inline void put_unaligned_be32(u32 val, void *p) +{ + __put_unaligned_cpu32(swab32(val), p); +} + +static inline void put_unaligned_be64(u64 val, void *p) +{ + __put_unaligned_cpu64(swab64(val), p); +} + #endif /* _LINUX_UNALIGNED_LE_STRUCT_H */ -- cgit v1.2.3 From 778aaefb8e864fc61f850539ea479554dd4caea1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 8 May 2021 00:07:52 +0200 Subject: asm-generic: unaligned always use struct helpers As found by Vineet Gupta and Linus Torvalds, gcc has somewhat unexpected behavior when faced with overlapping unaligned pointers. The kernel's unaligned/access-ok.h header technically invokes undefined behavior that happens to usually work on the architectures using it, but if the compiler optimizes code based on the assumption that undefined behavior doesn't happen, it can create output that actually causes data corruption. A related problem was previously found on 32-bit ARMv7, where most instructions can be used on unaligned data, but 64-bit ldrd/strd causes an exception. The workaround was to always use the unaligned/le_struct.h helper instead of unaligned/access-ok.h, in commit 1cce91dfc8f7 ("ARM: 8715/1: add a private asm/unaligned.h"). The same solution should work on all other architectures as well, so remove the access-ok.h variant and use the other one unconditionally on all architectures, picking either the big-endian or little-endian version. With this, the arm specific header can be removed as well, and the only file including linux/unaligned/access_ok.h gets moved to including the normal file. Fortunately, this made almost no difference to the object code produced by gcc-11. On x86, s390, powerpc, and arc, the resulting binary appears to be identical to the previous version, while on arm64 and m68k there are minimal differences that looks like an optimization pass went into a different direction, usually using fewer stack spills on the new version. Signed-off-by: Arnd Bergmann Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363 --- arch/arm/include/asm/unaligned.h | 25 -------------- arch/mips/crypto/crc32-mips.c | 2 +- include/asm-generic/unaligned.h | 13 ++----- include/linux/unaligned/access_ok.h | 68 ------------------------------------- 4 files changed, 3 insertions(+), 105 deletions(-) delete mode 100644 arch/arm/include/asm/unaligned.h delete mode 100644 include/linux/unaligned/access_ok.h (limited to 'include/linux') diff --git a/arch/arm/include/asm/unaligned.h b/arch/arm/include/asm/unaligned.h deleted file mode 100644 index 3c5248fb4cdc..000000000000 --- a/arch/arm/include/asm/unaligned.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __ASM_ARM_UNALIGNED_H -#define __ASM_ARM_UNALIGNED_H - -/* - * We generally want to set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS on ARMv6+, - * but we don't want to use linux/unaligned/access_ok.h since that can lead - * to traps on unaligned stm/ldm or strd/ldrd. - */ -#include - -#if defined(__LITTLE_ENDIAN) -# include -# include -# define get_unaligned __get_unaligned_le -# define put_unaligned __put_unaligned_le -#elif defined(__BIG_ENDIAN) -# include -# include -# define get_unaligned __get_unaligned_be -# define put_unaligned __put_unaligned_be -#else -# error need to define endianess -#endif - -#endif /* __ASM_ARM_UNALIGNED_H */ diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c index faa88a6a74c0..0a03529cf317 100644 --- a/arch/mips/crypto/crc32-mips.c +++ b/arch/mips/crypto/crc32-mips.c @@ -8,13 +8,13 @@ * Copyright (C) 2018 MIPS Tech, LLC */ -#include #include #include #include #include #include #include +#include #include diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index d79df721ae60..36bf03aaa674 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h @@ -8,22 +8,13 @@ */ #include -/* Set by the arch if it can handle unaligned accesses in hardware. */ -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -# include -#endif - #if defined(__LITTLE_ENDIAN) -# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -# include -# endif +# include # include # define get_unaligned __get_unaligned_le # define put_unaligned __put_unaligned_le #elif defined(__BIG_ENDIAN) -# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -# include -# endif +# include # include # define get_unaligned __get_unaligned_be # define put_unaligned __put_unaligned_be diff --git a/include/linux/unaligned/access_ok.h b/include/linux/unaligned/access_ok.h deleted file mode 100644 index 167aa849c0ce..000000000000 --- a/include/linux/unaligned/access_ok.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_UNALIGNED_ACCESS_OK_H -#define _LINUX_UNALIGNED_ACCESS_OK_H - -#include -#include - -static __always_inline u16 get_unaligned_le16(const void *p) -{ - return le16_to_cpup((__le16 *)p); -} - -static __always_inline u32 get_unaligned_le32(const void *p) -{ - return le32_to_cpup((__le32 *)p); -} - -static __always_inline u64 get_unaligned_le64(const void *p) -{ - return le64_to_cpup((__le64 *)p); -} - -static __always_inline u16 get_unaligned_be16(const void *p) -{ - return be16_to_cpup((__be16 *)p); -} - -static __always_inline u32 get_unaligned_be32(const void *p) -{ - return be32_to_cpup((__be32 *)p); -} - -static __always_inline u64 get_unaligned_be64(const void *p) -{ - return be64_to_cpup((__be64 *)p); -} - -static __always_inline void put_unaligned_le16(u16 val, void *p) -{ - *((__le16 *)p) = cpu_to_le16(val); -} - -static __always_inline void put_unaligned_le32(u32 val, void *p) -{ - *((__le32 *)p) = cpu_to_le32(val); -} - -static __always_inline void put_unaligned_le64(u64 val, void *p) -{ - *((__le64 *)p) = cpu_to_le64(val); -} - -static __always_inline void put_unaligned_be16(u16 val, void *p) -{ - *((__be16 *)p) = cpu_to_be16(val); -} - -static __always_inline void put_unaligned_be32(u32 val, void *p) -{ - *((__be32 *)p) = cpu_to_be32(val); -} - -static __always_inline void put_unaligned_be64(u64 val, void *p) -{ - *((__be64 *)p) = cpu_to_be64(val); -} - -#endif /* _LINUX_UNALIGNED_ACCESS_OK_H */ -- cgit v1.2.3 From 9d9d415f0048e4f7a6109595e2d1657850569c6c Mon Sep 17 00:00:00 2001 From: "Radu Pirea (NXP OSS)" Date: Mon, 10 May 2021 18:34:32 +0300 Subject: ptp: ptp_clock: make scaled_ppm_to_ppb static inline Make scaled_ppm_to_ppb static inline to be able to build drivers that use this function even with PTP_1588_CLOCK disabled. Signed-off-by: Radu Pirea (NXP OSS) Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 21 --------------------- include/linux/ptp_clock_kernel.h | 34 ++++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 03a246e60fd9..a780435331c8 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -63,27 +63,6 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue, spin_unlock_irqrestore(&queue->lock, flags); } -s32 scaled_ppm_to_ppb(long ppm) -{ - /* - * The 'freq' field in the 'struct timex' is in parts per - * million, but with a 16 bit binary fractional field. - * - * We want to calculate - * - * ppb = scaled_ppm * 1000 / 2^16 - * - * which simplifies to - * - * ppb = scaled_ppm * 125 / 2^13 - */ - s64 ppb = 1 + ppm; - ppb *= 125; - ppb >>= 13; - return (s32) ppb; -} -EXPORT_SYMBOL(scaled_ppm_to_ppb); - /* posix clock implementation */ static int ptp_clock_getres(struct posix_clock *pc, struct timespec64 *tp) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 0d47fd33b228..a311bddd9e85 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -186,6 +186,32 @@ struct ptp_clock_event { }; }; +/** + * scaled_ppm_to_ppb() - convert scaled ppm to ppb + * + * @ppm: Parts per million, but with a 16 bit binary fractional field + */ +static inline s32 scaled_ppm_to_ppb(long ppm) +{ + /* + * The 'freq' field in the 'struct timex' is in parts per + * million, but with a 16 bit binary fractional field. + * + * We want to calculate + * + * ppb = scaled_ppm * 1000 / 2^16 + * + * which simplifies to + * + * ppb = scaled_ppm * 125 / 2^13 + */ + s64 ppb = 1 + ppm; + + ppb *= 125; + ppb >>= 13; + return (s32)ppb; +} + #if IS_REACHABLE(CONFIG_PTP_1588_CLOCK) /** @@ -229,14 +255,6 @@ extern void ptp_clock_event(struct ptp_clock *ptp, extern int ptp_clock_index(struct ptp_clock *ptp); -/** - * scaled_ppm_to_ppb() - convert scaled ppm to ppb - * - * @ppm: Parts per million, but with a 16 bit binary fractional field - */ - -extern s32 scaled_ppm_to_ppb(long ppm); - /** * ptp_find_pin() - obtain the pin index of a given auxiliary function * -- cgit v1.2.3 From 258ca95e2cd9a0fcc4508a1bf1742b1a3e9a7bbb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:04 +0100 Subject: timer: Revert "timer: Add timer_curr_running()" This reverts commit dcd42591ebb8a25895b551a5297ea9c24414ba54. The only user was RCU/nocb. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- include/linux/timer.h | 2 -- kernel/time/timer.c | 14 -------------- 2 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 4118a97e62fb..fda13c9d1256 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -192,8 +192,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer); #define del_singleshot_timer_sync(t) del_timer_sync(t) -extern bool timer_curr_running(struct timer_list *timer); - extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d111adf4a0cb..84332f01dc57 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1237,20 +1237,6 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); -bool timer_curr_running(struct timer_list *timer) -{ - int i; - - for (i = 0; i < NR_BASES; i++) { - struct timer_base *base = this_cpu_ptr(&timer_bases[i]); - - if (base->running_timer == timer) - return true; - } - - return false; -} - #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { -- cgit v1.2.3 From 7bf0a6141ab9c1d113bd85d6d13d43903a4278ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2021 00:38:58 +0200 Subject: srcu: Unconditionally embed struct lockdep_map Since struct lockdep_map has zero size when CONFIG_DEBUG_LOCK_ALLOC=n, this commit removes the #ifdef from the srcu_struct structure's ->dep_map. This change will simplify further manipulations of this field. Signed-off-by: Frederic Weisbecker Cc: Uladzislau Rezki Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 9cfcc8a756ae..cb1f4351e8ba 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -82,9 +82,7 @@ struct srcu_struct { /* callback for the barrier */ /* operation. */ struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; /* Values for state variable (bottom bits of ->srcu_gp_seq). */ -- cgit v1.2.3 From 8e9c01c717df7e05c5bd1ca86aaa3a74b31f37f1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2021 00:38:59 +0200 Subject: srcu: Initialize SRCU after timers Once srcu_init() is called, the SRCU core will make use of delayed workqueues, which rely on timers. However init_timers() is called several steps after rcu_init(). This means that a call_srcu() after rcu_init() but before init_timers() would find itself within a dangerously uninitialized timer core. This commit therefore creates a separate call to srcu_init() after init_timer() completes, which ensures that we stay in early SRCU mode until timers are safe(r). Signed-off-by: Frederic Weisbecker Cc: Uladzislau Rezki Cc: Boqun Feng Cc: Lai Jiangshan Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Joel Fernandes Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 6 ++++++ init/main.c | 2 ++ kernel/rcu/rcu.h | 6 ------ kernel/rcu/srcutree.c | 5 +++++ kernel/rcu/tiny.c | 1 - kernel/rcu/tree.c | 1 - 6 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a0895bbf71ce..e6011a9975af 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -64,6 +64,12 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); +#ifdef CONFIG_SRCU +void srcu_init(void); +#else /* #ifdef CONFIG_SRCU */ +static inline void srcu_init(void) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC /** diff --git a/init/main.c b/init/main.c index eb01e121d2f1..7b6f49c4d388 100644 --- a/init/main.c +++ b/init/main.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -979,6 +980,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) tick_init(); rcu_init_nohz(); init_timers(); + srcu_init(); hrtimers_init(); softirq_init(); timekeeping_init(); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bf0827d4b659..ca3f2af32bf8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -422,12 +422,6 @@ do { \ #endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */ -#ifdef CONFIG_SRCU -void srcu_init(void); -#else /* #ifdef CONFIG_SRCU */ -static inline void srcu_init(void) { } -#endif /* #else #ifdef CONFIG_SRCU */ - #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ static inline bool rcu_gp_is_normal(void) { return true; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f4f0cbf7a02b..9ec35c158740 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1384,6 +1384,11 @@ void __init srcu_init(void) { struct srcu_struct *ssp; + /* + * Once that is set, call_srcu() can follow the normal path and + * queue delayed work. This must follow RCU workqueues creation + * and timers initialization. + */ srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c8a029fbb114..340b3f8b090d 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -221,5 +221,4 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); rcu_early_boot_tests(); - srcu_init(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..c35b15229cff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4735,7 +4735,6 @@ void __init rcu_init(void) WARN_ON(!rcu_gp_wq); rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); - srcu_init(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ -- cgit v1.2.3 From c9e73e3d2b1eb1ea7ff068e05007eec3bd8ef1c9 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 29 Apr 2021 14:46:56 +0100 Subject: bpf: verifier: Allocate idmap scratch in verifier env func_states_equal makes a very short lived allocation for idmap, probably because it's too large to fit on the stack. However the function is called quite often, leading to a lot of alloc / free churn. Replace the temporary allocation with dedicated scratch space in struct bpf_verifier_env. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Link: https://lore.kernel.org/bpf/20210429134656.122225-4-lmb@cloudflare.com --- include/linux/bpf_verifier.h | 8 ++++++++ kernel/bpf/verifier.c | 46 +++++++++++++++----------------------------- 2 files changed, 23 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 06841517ab1e..d4632aa3ca50 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -215,6 +215,13 @@ struct bpf_idx_pair { u32 idx; }; +struct bpf_id_pair { + u32 old; + u32 cur; +}; + +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 struct bpf_verifier_state { /* call stack tracking */ @@ -418,6 +425,7 @@ struct bpf_verifier_env { const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; + struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE]; struct { int *insn_state; int *insn_stack; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 380c8ad49b7f..bdfdb54676ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9748,13 +9748,6 @@ static bool range_within(struct bpf_reg_state *old, old->s32_max_value >= cur->s32_max_value; } -/* Maximum number of register states that can exist at once */ -#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) -struct idpair { - u32 old; - u32 cur; -}; - /* If in the old state two registers had the same id, then they need to have * the same id in the new state as well. But that id could be different from * the old state, so we need to track the mapping from old to new ids. @@ -9765,11 +9758,11 @@ struct idpair { * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. */ -static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) +static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) { unsigned int i; - for (i = 0; i < ID_MAP_SIZE; i++) { + for (i = 0; i < BPF_ID_MAP_SIZE; i++) { if (!idmap[i].old) { /* Reached an empty slot; haven't seen this id before */ idmap[i].old = old_id; @@ -9882,7 +9875,7 @@ next: /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { bool equal; @@ -10000,7 +9993,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, static bool stacksafe(struct bpf_func_state *old, struct bpf_func_state *cur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { int i, spi; @@ -10097,32 +10090,23 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool func_states_equal(struct bpf_func_state *old, +static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, struct bpf_func_state *cur) { - struct idpair *idmap; - bool ret = false; int i; - idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); - /* If we failed to allocate the idmap, just say it's not safe */ - if (!idmap) - return false; - - for (i = 0; i < MAX_BPF_REG; i++) { - if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) - goto out_free; - } + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + for (i = 0; i < MAX_BPF_REG; i++) + if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + return false; - if (!stacksafe(old, cur, idmap)) - goto out_free; + if (!stacksafe(old, cur, env->idmap_scratch)) + return false; if (!refsafe(old, cur)) - goto out_free; - ret = true; -out_free: - kfree(idmap); - return ret; + return false; + + return true; } static bool states_equal(struct bpf_verifier_env *env, @@ -10149,7 +10133,7 @@ static bool states_equal(struct bpf_verifier_env *env, for (i = 0; i <= old->curframe; i++) { if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(old->frame[i], cur->frame[i])) + if (!func_states_equal(env, old->frame[i], cur->frame[i])) return false; } return true; -- cgit v1.2.3 From ce7c169dee28866539abb0e603b9a23055d30fdc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Mar 2021 13:23:49 -0700 Subject: rcu: Remove the unused rcu_irq_exit_preempt() function Commit 9ee01e0f69a9 ("x86/entry: Clean up idtentry_enter/exit() leftovers") left the rcu_irq_exit_preempt() in place in order to avoid conflicts with the -rcu tree. Now that this change has long since hit mainline, this commit removes the no-longer-used rcu_irq_exit_preempt() function. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 - include/linux/rcutree.h | 1 - kernel/rcu/tree.c | 22 ---------------------- 3 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 35e0be326ffc..953e70fafe38 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -86,7 +86,6 @@ static inline void rcu_irq_enter(void) { } static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } -static inline void rcu_irq_exit_preempt(void) { } static inline void rcu_irq_exit_check_preempt(void) { } #define rcu_is_idle_cpu(cpu) \ (is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq()) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index b89b54130f49..53209d669400 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -49,7 +49,6 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); -void rcu_irq_exit_preempt(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); bool rcu_is_idle_cpu(int cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..f6543b8004c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -833,28 +833,6 @@ void noinstr rcu_irq_exit(void) rcu_nmi_exit(); } -/** - * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq - * towards in kernel preemption - * - * Same as rcu_irq_exit() but has a sanity check that scheduling is safe - * from RCU point of view. Invoked from return from interrupt before kernel - * preemption. - */ -void rcu_irq_exit_preempt(void) -{ - lockdep_assert_irqs_disabled(); - rcu_nmi_exit(); - - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, - "RCU dynticks_nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != - DYNTICK_IRQ_NONIDLE, - "Bad RCU dynticks_nmi_nesting counter\n"); - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), - "RCU in extended quiescent state!"); -} - #ifdef CONFIG_PROVE_RCU /** * rcu_irq_exit_check_preempt - Validate that scheduling is possible -- cgit v1.2.3 From 3066820034b5dd4e89bd74a7739c51c2d6f5e554 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Apr 2021 09:51:05 -0700 Subject: rcu: Reject RCU_LOCKDEP_WARN() false positives If another lockdep report runs concurrently with an RCU lockdep report from RCU_LOCKDEP_WARN(), the following sequence of events can occur: 1. debug_lockdep_rcu_enabled() sees that lockdep is enabled when called from (say) synchronize_rcu(). 2. Lockdep is disabled by a concurrent lockdep report. 3. debug_lockdep_rcu_enabled() evaluates its lockdep-expression argument, for example, lock_is_held(&rcu_bh_lock_map). 4. Because lockdep is now disabled, lock_is_held() plays it safe and returns the constant 1. 5. But in this case, the constant 1 is not safe, because invoking synchronize_rcu() under rcu_read_lock_bh() is disallowed. 6. debug_lockdep_rcu_enabled() wrongly invokes lockdep_rcu_suspicious(), resulting in a false-positive splat. This commit therefore changes RCU_LOCKDEP_WARN() to check debug_lockdep_rcu_enabled() after checking the lockdep expression, so that any "safe" returns from lock_is_held() are rejected by debug_lockdep_rcu_enabled(). This requires memory ordering, which is supplied by READ_ONCE(debug_locks). The resulting volatile accesses prevent the compiler from reordering and the fact that only one variable is being accessed prevents the underlying hardware from reordering. The combination works for IA64, which can reorder reads to the same location, but this is defeated by the volatile accesses, which compile to load instructions that provide ordering. Reported-by: syzbot+dde0cc33951735441301@syzkaller.appspotmail.com Reported-by: Matthew Wilcox Reported-by: syzbot+88e4f02896967fe1ab0d@syzkaller.appspotmail.com Reported-by: Thomas Gleixner Suggested-by: Boqun Feng Reviewed-by: Boqun Feng Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- kernel/rcu/update.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9455476c5ba2..1199ffd305d1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -315,7 +315,7 @@ static inline int rcu_read_lock_any_held(void) #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data.unlikely") __warned; \ - if (debug_lockdep_rcu_enabled() && !__warned && (c)) { \ + if ((c) && debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index b95ae86c40a7..dd94a602a6d2 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -277,7 +277,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map); noinstr int notrace debug_lockdep_rcu_enabled(void) { - return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && + return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); -- cgit v1.2.3 From f4f809f66b7545b89bff4b132cdb37adc2d2c157 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 May 2021 14:39:46 -0700 Subject: cgroup: inline cgroup_task_freeze() After the introduction of the cgroup.kill there is only one call site of cgroup_task_freeze() left: cgroup_exit(). cgroup_task_freeze() is currently taking rcu_read_lock() to read task's cgroup flags, but because it's always called with css_set_lock locked, the rcu protection is excessive. Simplify the code by inlining cgroup_task_freeze(). v2: fix build Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 18 ------------------ kernel/cgroup/cgroup.c | 3 ++- 2 files changed, 2 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4f2f79de083e..a72764287cb5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -906,20 +906,6 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); -static inline bool cgroup_task_freeze(struct task_struct *task) -{ - bool ret; - - if (task->flags & PF_KTHREAD) - return false; - - rcu_read_lock(); - ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags); - rcu_read_unlock(); - - return ret; -} - static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; @@ -929,10 +915,6 @@ static inline bool cgroup_task_frozen(struct task_struct *task) static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } -static inline bool cgroup_task_freeze(struct task_struct *task) -{ - return false; -} static inline bool cgroup_task_frozen(struct task_struct *task) { return false; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e640fc78d731..8e0d7092afbb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6267,7 +6267,8 @@ void cgroup_exit(struct task_struct *tsk) cset->nr_tasks--; WARN_ON_ONCE(cgroup_task_frozen(tsk)); - if (unlikely(cgroup_task_freeze(tsk))) + if (unlikely(!(tsk->flags & PF_KTHREAD) && + test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); -- cgit v1.2.3 From 8a922805fb0950187ff037801e337aec010a6ccb Mon Sep 17 00:00:00 2001 From: Zhongjun Tan Date: Fri, 9 Apr 2021 13:48:41 +0800 Subject: selinux: delete selinux_xfrm_policy_lookup() useless argument seliunx_xfrm_policy_lookup() is hooks of security_xfrm_policy_lookup(). The dir argument is uselss in security_xfrm_policy_lookup(). So remove the dir argument from selinux_xfrm_policy_lookup() and security_xfrm_policy_lookup(). Signed-off-by: Zhongjun Tan [PM: reformat the subject line] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 +-- include/linux/security.h | 4 ++-- net/xfrm/xfrm_policy.c | 6 ++---- security/security.c | 4 ++-- security/selinux/include/xfrm.h | 2 +- security/selinux/xfrm.c | 2 +- 6 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 04c01794de83..2adeea44c0d5 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -358,8 +358,7 @@ LSM_HOOK(int, 0, xfrm_state_alloc_acquire, struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) LSM_HOOK(void, LSM_RET_VOID, xfrm_state_free_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x) -LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid, - u8 dir) +LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid) LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid, diff --git a/include/linux/security.h b/include/linux/security.h index 06f7c50ce77f..24eda04221e9 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1681,7 +1681,7 @@ int security_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid); int security_xfrm_state_delete(struct xfrm_state *x); void security_xfrm_state_free(struct xfrm_state *x); -int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); +int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid); int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic); @@ -1732,7 +1732,7 @@ static inline int security_xfrm_state_delete(struct xfrm_state *x) return 0; } -static inline int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) +static inline int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) { return 0; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ce500f847b99..e70cf1d2c0e0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1902,8 +1902,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, match = xfrm_selector_match(sel, fl, family); if (match) - ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, - dir); + ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); return ret; } @@ -2181,8 +2180,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, goto out; } err = security_xfrm_policy_lookup(pol->security, - fl->flowi_secid, - dir); + fl->flowi_secid); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; diff --git a/security/security.c b/security/security.c index b38155b2de83..0c1c9796e3e4 100644 --- a/security/security.c +++ b/security/security.c @@ -2466,9 +2466,9 @@ void security_xfrm_state_free(struct xfrm_state *x) call_void_hook(xfrm_state_free_security, x); } -int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) +int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) { - return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid, dir); + return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid); } int security_xfrm_state_pol_flow_match(struct xfrm_state *x, diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 0a6f34a7a971..74159400eeee 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -23,7 +23,7 @@ int selinux_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid); void selinux_xfrm_state_free(struct xfrm_state *x); int selinux_xfrm_state_delete(struct xfrm_state *x); -int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); +int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid); int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic); diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 634f3db24da6..be83e5ce4469 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -150,7 +150,7 @@ static int selinux_xfrm_delete(struct xfrm_sec_ctx *ctx) * LSM hook implementation that authorizes that a flow can use a xfrm policy * rule. */ -int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) +int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) { int rc; -- cgit v1.2.3 From 0c8ccd8b267fc735e4621774ce62728f27d42863 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 May 2021 15:41:29 +0300 Subject: spi: pxa2xx: Use pxa_ssp_enable()/pxa_ssp_disable() in the driver There are few places that repeat the logic of pxa_ssp_enable() and pxa_ssp_disable(). Use them instead of open coded variants. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210510124134.24638-10-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-dma.c | 4 +--- drivers/spi/spi-pxa2xx.c | 36 ++++++++++++++++++------------------ include/linux/pxa2xx_ssp.h | 16 ++++++++++++++++ sound/soc/pxa/pxa-ssp.c | 16 ---------------- 4 files changed, 35 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c index e00dbadd39ec..5ca01ad7f460 100644 --- a/drivers/spi/spi-pxa2xx-dma.c +++ b/drivers/spi/spi-pxa2xx-dma.c @@ -50,9 +50,7 @@ static void pxa2xx_spi_dma_transfer_complete(struct driver_data *drv_data, if (error) { /* In case we got an error we disable the SSP now */ - pxa2xx_spi_write(drv_data, SSCR0, - pxa2xx_spi_read(drv_data, SSCR0) - & ~SSCR0_SSE); + pxa_ssp_disable(drv_data->ssp); msg->status = -EIO; } diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 087c84e605b9..a27f51f5db65 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -286,13 +286,11 @@ static u32 pxa2xx_configure_sscr0(const struct driver_data *drv_data, case QUARK_X1000_SSP: return clk_div | QUARK_X1000_SSCR0_Motorola - | QUARK_X1000_SSCR0_DataSize(bits > 32 ? 8 : bits) - | SSCR0_SSE; + | QUARK_X1000_SSCR0_DataSize(bits > 32 ? 8 : bits); default: return clk_div | SSCR0_Motorola | SSCR0_DataSize(bits > 16 ? bits - 16 : bits) - | SSCR0_SSE | (bits > 16 ? SSCR0_EDSS : 0); } } @@ -498,8 +496,7 @@ static void pxa2xx_spi_off(struct driver_data *drv_data) if (is_mmp2_ssp(drv_data)) return; - pxa2xx_spi_write(drv_data, SSCR0, - pxa2xx_spi_read(drv_data, SSCR0) & ~SSCR0_SSE); + pxa_ssp_disable(drv_data->ssp); } static int null_writer(struct driver_data *drv_data) @@ -1098,25 +1095,26 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller, (pxa2xx_spi_read(drv_data, DDS_RATE) != chip->dds_rate)) pxa2xx_spi_write(drv_data, DDS_RATE, chip->dds_rate); + /* Stop the SSP */ + if (!is_mmp2_ssp(drv_data)) + pxa_ssp_disable(drv_data->ssp); + + if (!pxa25x_ssp_comp(drv_data)) + pxa2xx_spi_write(drv_data, SSTO, chip->timeout); + /* see if we need to reload the config registers */ if ((pxa2xx_spi_read(drv_data, SSCR0) != cr0) || (pxa2xx_spi_read(drv_data, SSCR1) & change_mask) != (cr1 & change_mask)) { - /* stop the SSP, and update the other bits */ - if (!is_mmp2_ssp(drv_data)) - pxa2xx_spi_write(drv_data, SSCR0, cr0 & ~SSCR0_SSE); - if (!pxa25x_ssp_comp(drv_data)) - pxa2xx_spi_write(drv_data, SSTO, chip->timeout); /* first set CR1 without interrupt and service enables */ pxa2xx_spi_write(drv_data, SSCR1, cr1 & change_mask); - /* restart the SSP */ + /* Update the other bits */ pxa2xx_spi_write(drv_data, SSCR0, cr0); - - } else { - if (!pxa25x_ssp_comp(drv_data)) - pxa2xx_spi_write(drv_data, SSTO, chip->timeout); } + /* Restart the SSP */ + pxa_ssp_enable(drv_data->ssp); + if (is_mmp2_ssp(drv_data)) { u8 tx_level = (pxa2xx_spi_read(drv_data, SSSR) & SSSR_TFL_MASK) >> 8; @@ -1786,8 +1784,9 @@ static int pxa2xx_spi_probe(struct platform_device *pdev) controller->min_speed_hz = DIV_ROUND_UP(controller->max_speed_hz, 512); + pxa_ssp_disable(ssp); + /* Load default SSP configuration */ - pxa2xx_spi_write(drv_data, SSCR0, 0); switch (drv_data->ssp_type) { case QUARK_X1000_SSP: tmp = QUARK_X1000_SSCR1_RxTresh(RX_THRESH_QUARK_X1000_DFLT) | @@ -1928,7 +1927,7 @@ static int pxa2xx_spi_remove(struct platform_device *pdev) spi_unregister_controller(drv_data->controller); /* Disable the SSP at the peripheral and SOC level */ - pxa2xx_spi_write(drv_data, SSCR0, 0); + pxa_ssp_disable(ssp); clk_disable_unprepare(ssp->clk); /* Release DMA */ @@ -1957,7 +1956,8 @@ static int pxa2xx_spi_suspend(struct device *dev) status = spi_controller_suspend(drv_data->controller); if (status != 0) return status; - pxa2xx_spi_write(drv_data, SSCR0, 0); + + pxa_ssp_disable(ssp); if (!pm_runtime_suspended(dev)) clk_disable_unprepare(ssp->clk); diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 1b6c1a0922bd..fdfbe17e15f4 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -254,6 +254,22 @@ static inline u32 pxa_ssp_read_reg(struct ssp_device *dev, u32 reg) return __raw_readl(dev->mmio_base + reg); } +static inline void pxa_ssp_enable(struct ssp_device *ssp) +{ + u32 sscr0; + + sscr0 = pxa_ssp_read_reg(ssp, SSCR0) | SSCR0_SSE; + pxa_ssp_write_reg(ssp, SSCR0, sscr0); +} + +static inline void pxa_ssp_disable(struct ssp_device *ssp) +{ + u32 sscr0; + + sscr0 = pxa_ssp_read_reg(ssp, SSCR0) & ~SSCR0_SSE; + pxa_ssp_write_reg(ssp, SSCR0, sscr0); +} + #if IS_ENABLED(CONFIG_PXA_SSP) struct ssp_device *pxa_ssp_request(int port, const char *label); void pxa_ssp_free(struct ssp_device *); diff --git a/sound/soc/pxa/pxa-ssp.c b/sound/soc/pxa/pxa-ssp.c index b941adcbb8f9..939e7e28486a 100644 --- a/sound/soc/pxa/pxa-ssp.c +++ b/sound/soc/pxa/pxa-ssp.c @@ -61,22 +61,6 @@ static void dump_registers(struct ssp_device *ssp) pxa_ssp_read_reg(ssp, SSACD)); } -static void pxa_ssp_enable(struct ssp_device *ssp) -{ - uint32_t sscr0; - - sscr0 = __raw_readl(ssp->mmio_base + SSCR0) | SSCR0_SSE; - __raw_writel(sscr0, ssp->mmio_base + SSCR0); -} - -static void pxa_ssp_disable(struct ssp_device *ssp) -{ - uint32_t sscr0; - - sscr0 = __raw_readl(ssp->mmio_base + SSCR0) & ~SSCR0_SSE; - __raw_writel(sscr0, ssp->mmio_base + SSCR0); -} - static void pxa_ssp_set_dma_params(struct ssp_device *ssp, int width4, int out, struct snd_dmaengine_dai_dma_data *dma) { -- cgit v1.2.3 From 3fdb59cf10b020b32b9f1dfc78611320623dcb3e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 May 2021 15:41:34 +0300 Subject: spi: pxa2xx: Introduce special type for Merrifield SPIs Intel Merrifield SPI is actually more closer to PXA3xx. It has extended FIFO (32 bytes) and additional registers to get or set FIFO thresholds. Introduce new type for Intel Merrifield SPI host controllers and handle bigger FIFO size. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210510124134.24638-15-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-pci.c | 2 +- drivers/spi/spi-pxa2xx.c | 32 +++++++++++++++++++++++++++++--- include/linux/pxa2xx_ssp.h | 16 ++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c index a259be12d326..dce9ade9a4df 100644 --- a/drivers/spi/spi-pxa2xx-pci.c +++ b/drivers/spi/spi-pxa2xx-pci.c @@ -179,7 +179,7 @@ static struct pxa_spi_info spi_info_configs[] = { .rx_param = &bsw2_rx_param, }, [PORT_MRFLD] = { - .type = PXA27x_SSP, + .type = MRFLD_SSP, .max_clk_rate = 25000000, .setup = mrfld_spi_setup, }, diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index af3f01de8f5b..5985b39e2dd6 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -200,6 +200,11 @@ static bool is_mmp2_ssp(const struct driver_data *drv_data) return drv_data->ssp_type == MMP2_SSP; } +static bool is_mrfld_ssp(const struct driver_data *drv_data) +{ + return drv_data->ssp_type == MRFLD_SSP; +} + static void pxa2xx_spi_update(const struct driver_data *drv_data, u32 reg, u32 mask, u32 value) { if ((pxa2xx_spi_read(drv_data, reg) & mask) != value) @@ -1087,6 +1092,15 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller, pxa2xx_spi_update(drv_data, SSITF, GENMASK(15, 0), chip->lpss_tx_threshold); } + if (is_mrfld_ssp(drv_data)) { + u32 thresh = 0; + + thresh |= SFIFOTT_RxThresh(chip->lpss_rx_threshold); + thresh |= SFIFOTT_TxThresh(chip->lpss_tx_threshold); + + pxa2xx_spi_update(drv_data, SFIFOTT, 0xffffffff, thresh); + } + if (is_quark_x1000_ssp(drv_data)) pxa2xx_spi_update(drv_data, DDS_RATE, GENMASK(23, 0), chip->dds_rate); @@ -1253,6 +1267,11 @@ static int setup(struct spi_device *spi) tx_hi_thres = 0; rx_thres = RX_THRESH_QUARK_X1000_DFLT; break; + case MRFLD_SSP: + tx_thres = TX_THRESH_MRFLD_DFLT; + tx_hi_thres = 0; + rx_thres = RX_THRESH_MRFLD_DFLT; + break; case CE4100_SSP: tx_thres = TX_THRESH_CE4100_DFLT; tx_hi_thres = 0; @@ -1328,9 +1347,16 @@ static int setup(struct spi_device *spi) chip->cr1 |= SSCR1_SPH; } - chip->lpss_rx_threshold = SSIRF_RxThresh(rx_thres); - chip->lpss_tx_threshold = SSITF_TxLoThresh(tx_thres) - | SSITF_TxHiThresh(tx_hi_thres); + if (is_lpss_ssp(drv_data)) { + chip->lpss_rx_threshold = SSIRF_RxThresh(rx_thres); + chip->lpss_tx_threshold = SSITF_TxLoThresh(tx_thres) | + SSITF_TxHiThresh(tx_hi_thres); + } + + if (is_mrfld_ssp(drv_data)) { + chip->lpss_rx_threshold = rx_thres; + chip->lpss_tx_threshold = tx_thres; + } /* set dma burst and threshold outside of chip_info path so that if * chip_info goes away after setting chip->enable_dma, the diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index fdfbe17e15f4..2b21bc1f3c73 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -183,6 +183,21 @@ struct device_node; #define SSACD_ACPS(x) ((x) << 4) /* Audio clock PLL select */ #define SSACD_SCDX8 BIT(7) /* SYSCLK division ratio select */ +/* Intel Merrifield SSP */ +#define SFIFOL 0x68 /* FIFO level */ +#define SFIFOTT 0x6c /* FIFO trigger threshold */ + +#define RX_THRESH_MRFLD_DFLT 16 +#define TX_THRESH_MRFLD_DFLT 16 + +#define SFIFOL_TFL_MASK GENMASK(15, 0) /* Transmit FIFO Level mask */ +#define SFIFOL_RFL_MASK GENMASK(31, 16) /* Receive FIFO Level mask */ + +#define SFIFOTT_TFT GENMASK(15, 0) /* Transmit FIFO Threshold (mask) */ +#define SFIFOTT_TxThresh(x) (((x) - 1) << 0) /* TX FIFO trigger threshold / level */ +#define SFIFOTT_RFT GENMASK(31, 16) /* Receive FIFO Threshold (mask) */ +#define SFIFOTT_RxThresh(x) (((x) - 1) << 16) /* RX FIFO trigger threshold / level */ + /* LPSS SSP */ #define SSITF 0x44 /* TX FIFO trigger level */ #define SSITF_TxHiThresh(x) (((x) - 1) << 0) @@ -205,6 +220,7 @@ enum pxa_ssp_type { MMP2_SSP, PXA910_SSP, CE4100_SSP, + MRFLD_SSP, QUARK_X1000_SSP, LPSS_LPT_SSP, /* Keep LPSS types sorted with lpss_platforms[] */ LPSS_BYT_SSP, -- cgit v1.2.3 From 35f3f8504c3b60a1ae5576e178b27fc0ddd6157d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 May 2021 16:12:42 +0300 Subject: spi: Switch to signed types for *_native_cs SPI controller fields While fixing undefined behaviour the commit f60d7270c8a3 ("spi: Avoid undefined behaviour when counting unused native CSs") missed the case when all CSs are GPIOs and thus unused_native_cs will be evaluated to -1 in unsigned representation. This will falsely trigger a condition in the spi_get_gpio_descs(). Switch to signed types for *_native_cs SPI controller fields to fix above. Fixes: f60d7270c8a3 ("spi: Avoid undefined behaviour when counting unused native CSs") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210510131242.49455-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 360a3bc767ca..74239d65c7fd 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -644,8 +644,8 @@ struct spi_controller { int *cs_gpios; struct gpio_desc **cs_gpiods; bool use_gpio_descriptors; - u8 unused_native_cs; - u8 max_native_cs; + s8 unused_native_cs; + s8 max_native_cs; /* statistics */ struct spi_statistics statistics; -- cgit v1.2.3 From 345e9f5ca798600e44c0843646621f2804eb99f4 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 11 May 2021 11:00:45 +0800 Subject: soundwire: bus: only use CLOCK_STOP_MODE0 and fix confusions Existing devices and implementations only support the required CLOCK_STOP_MODE0. All the code related to CLOCK_STOP_MODE1 has not been tested and is highly questionable, with a clear confusion between CLOCK_STOP_MODE1 and the simple clock stop state machine. This patch removes all usages of CLOCK_STOP_MODE1 - which has no impact on any solution - and fixes the use of the simple clock stop state machine. The resulting code should be a lot more symmetrical and easier to maintain. Note that CLOCK_STOP_MODE1 is not supported in the SoundWire Device Class specification so it's rather unlikely that we need to re-add this mode later. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Guennadi Liakhovetski Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20210511030048.25622-2-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/bus.c | 100 +++++++++++++++++------------------------- include/linux/soundwire/sdw.h | 2 - 2 files changed, 40 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index a9e0aa72654d..dc4033b6f2e9 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -821,26 +821,6 @@ static void sdw_modify_slave_status(struct sdw_slave *slave, mutex_unlock(&bus->bus_lock); } -static enum sdw_clk_stop_mode sdw_get_clk_stop_mode(struct sdw_slave *slave) -{ - enum sdw_clk_stop_mode mode; - - /* - * Query for clock stop mode if Slave implements - * ops->get_clk_stop_mode, else read from property. - */ - if (slave->ops && slave->ops->get_clk_stop_mode) { - mode = slave->ops->get_clk_stop_mode(slave); - } else { - if (slave->prop.clk_stop_mode1) - mode = SDW_CLK_STOP_MODE1; - else - mode = SDW_CLK_STOP_MODE0; - } - - return mode; -} - static int sdw_slave_clk_stop_callback(struct sdw_slave *slave, enum sdw_clk_stop_mode mode, enum sdw_clk_stop_type type) @@ -933,7 +913,6 @@ static int sdw_bus_wait_for_clk_prep_deprep(struct sdw_bus *bus, u16 dev_num) */ int sdw_bus_prep_clk_stop(struct sdw_bus *bus) { - enum sdw_clk_stop_mode slave_mode; bool simple_clk_stop = true; struct sdw_slave *slave; bool is_slave = false; @@ -955,10 +934,8 @@ int sdw_bus_prep_clk_stop(struct sdw_bus *bus) /* Identify if Slave(s) are available on Bus */ is_slave = true; - slave_mode = sdw_get_clk_stop_mode(slave); - slave->curr_clk_stop_mode = slave_mode; - - ret = sdw_slave_clk_stop_callback(slave, slave_mode, + ret = sdw_slave_clk_stop_callback(slave, + SDW_CLK_STOP_MODE0, SDW_CLK_PRE_PREPARE); if (ret < 0) { dev_err(&slave->dev, @@ -966,22 +943,29 @@ int sdw_bus_prep_clk_stop(struct sdw_bus *bus) return ret; } - ret = sdw_slave_clk_stop_prepare(slave, - slave_mode, true); - if (ret < 0) { - dev_err(&slave->dev, - "pre-prepare failed:%d", ret); - return ret; - } - - if (slave_mode == SDW_CLK_STOP_MODE1) + /* Only prepare a Slave device if needed */ + if (!slave->prop.simple_clk_stop_capable) { simple_clk_stop = false; + + ret = sdw_slave_clk_stop_prepare(slave, + SDW_CLK_STOP_MODE0, + true); + if (ret < 0) { + dev_err(&slave->dev, + "pre-prepare failed:%d", ret); + return ret; + } + } } /* Skip remaining clock stop preparation if no Slave is attached */ if (!is_slave) return ret; + /* + * Don't wait for all Slaves to be ready if they follow the simple + * state machine + */ if (!simple_clk_stop) { ret = sdw_bus_wait_for_clk_prep_deprep(bus, SDW_BROADCAST_DEV_NUM); @@ -998,17 +982,13 @@ int sdw_bus_prep_clk_stop(struct sdw_bus *bus) slave->status != SDW_SLAVE_ALERT) continue; - slave_mode = slave->curr_clk_stop_mode; - - if (slave_mode == SDW_CLK_STOP_MODE1) { - ret = sdw_slave_clk_stop_callback(slave, - slave_mode, - SDW_CLK_POST_PREPARE); + ret = sdw_slave_clk_stop_callback(slave, + SDW_CLK_STOP_MODE0, + SDW_CLK_POST_PREPARE); - if (ret < 0) { - dev_err(&slave->dev, - "post-prepare failed:%d", ret); - } + if (ret < 0) { + dev_err(&slave->dev, + "post-prepare failed:%d", ret); } } @@ -1059,7 +1039,6 @@ EXPORT_SYMBOL(sdw_bus_clk_stop); */ int sdw_bus_exit_clk_stop(struct sdw_bus *bus) { - enum sdw_clk_stop_mode mode; bool simple_clk_stop = true; struct sdw_slave *slave; bool is_slave = false; @@ -1081,31 +1060,33 @@ int sdw_bus_exit_clk_stop(struct sdw_bus *bus) /* Identify if Slave(s) are available on Bus */ is_slave = true; - mode = slave->curr_clk_stop_mode; - - if (mode == SDW_CLK_STOP_MODE1) { - simple_clk_stop = false; - continue; - } - - ret = sdw_slave_clk_stop_callback(slave, mode, + ret = sdw_slave_clk_stop_callback(slave, SDW_CLK_STOP_MODE0, SDW_CLK_PRE_DEPREPARE); if (ret < 0) dev_warn(&slave->dev, "clk stop deprep failed:%d", ret); - ret = sdw_slave_clk_stop_prepare(slave, mode, - false); + /* Only de-prepare a Slave device if needed */ + if (!slave->prop.simple_clk_stop_capable) { + simple_clk_stop = false; - if (ret < 0) - dev_warn(&slave->dev, - "clk stop deprep failed:%d", ret); + ret = sdw_slave_clk_stop_prepare(slave, SDW_CLK_STOP_MODE0, + false); + + if (ret < 0) + dev_warn(&slave->dev, + "clk stop deprep failed:%d", ret); + } } /* Skip remaining clock stop de-preparation if no Slave is attached */ if (!is_slave) return 0; + /* + * Don't wait for all Slaves to be ready if they follow the simple + * state machine + */ if (!simple_clk_stop) sdw_bus_wait_for_clk_prep_deprep(bus, SDW_BROADCAST_DEV_NUM); @@ -1117,8 +1098,7 @@ int sdw_bus_exit_clk_stop(struct sdw_bus *bus) slave->status != SDW_SLAVE_ALERT) continue; - mode = slave->curr_clk_stop_mode; - sdw_slave_clk_stop_callback(slave, mode, + sdw_slave_clk_stop_callback(slave, SDW_CLK_STOP_MODE0, SDW_CLK_POST_DEPREPARE); } diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index ced07f8fde87..5d93d9949653 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -624,7 +624,6 @@ struct sdw_slave_ops { int (*port_prep)(struct sdw_slave *slave, struct sdw_prepare_ch *prepare_ch, enum sdw_port_prep_ops pre_ops); - int (*get_clk_stop_mode)(struct sdw_slave *slave); int (*clk_stop)(struct sdw_slave *slave, enum sdw_clk_stop_mode mode, enum sdw_clk_stop_type type); @@ -675,7 +674,6 @@ struct sdw_slave { struct list_head node; struct completion port_ready[SDW_MAX_PORTS]; unsigned int m_port_map[SDW_MAX_PORTS]; - enum sdw_clk_stop_mode curr_clk_stop_mode; u16 dev_num; u16 dev_num_sticky; bool probed; -- cgit v1.2.3 From 448df2d8fcab6cc50e0de4679ce3afe2ece282f2 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 11 May 2021 11:00:46 +0800 Subject: soundwire: add missing kernel-doc description For some reason we never added a description for the clk_stop callback. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Guennadi Liakhovetski Reviewed-by: Rander Wang Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20210511030048.25622-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 5d93d9949653..8ca736e92d5a 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -612,6 +612,7 @@ struct sdw_bus_params { * @update_status: Update Slave status * @bus_config: Update the bus config for Slave * @port_prep: Prepare the port with parameters + * @clk_stop: handle imp-def sequences before and after prepare and de-prepare */ struct sdw_slave_ops { int (*read_prop)(struct sdw_slave *sdw); -- cgit v1.2.3 From bf30396cdf8132a199af5f8f0e60367876f455df Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Tue, 11 May 2021 16:42:22 +0200 Subject: net: wwan: Add unknown port type Some devices may have ports with unknown type/protocol which need to be tagged (though not supported by WWAN core). This will be the case for cdc-wdm based drivers. Signed-off-by: Loic Poulain Signed-off-by: David S. Miller --- include/linux/wwan.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index aa05a253dcf9..7216c114d758 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -15,6 +15,7 @@ * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface * @WWAN_PORT_FIREHOSE: XML based command protocol + * @WWAN_PORT_UNKNOWN: Unknown port type * @WWAN_PORT_MAX: Number of supported port types */ enum wwan_port_type { @@ -23,7 +24,8 @@ enum wwan_port_type { WWAN_PORT_QMI, WWAN_PORT_QCDM, WWAN_PORT_FIREHOSE, - WWAN_PORT_MAX, + WWAN_PORT_UNKNOWN, + WWAN_PORT_MAX = WWAN_PORT_UNKNOWN, }; struct wwan_port; -- cgit v1.2.3 From cac6fb015f719104e60b1c68c15ca5b734f57b9c Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Tue, 11 May 2021 16:42:23 +0200 Subject: usb: class: cdc-wdm: WWAN framework integration The WWAN framework provides a unified way to handle WWAN/modems and its control port(s). It has initially been introduced to support MHI/PCI modems, offering the same control protocols as the USB variants such as MBIM, QMI, AT... The WWAN framework exposes these control protocols as character devices, similarly to cdc-wdm, but in a bus agnostic fashion. This change adds registration of the USB modem cdc-wdm control endpoints to the WWAN framework as standard control ports (wwanXpY...). Exposing cdc-wdm through WWAN framework normally maintains backward compatibility, e.g: $ qmicli --device-open-qmi -d /dev/wwan0p1QMI --dms-get-ids instead of $ qmicli --device-open-qmi -d /dev/cdc-wdm0 --dms-get-ids However, some tools may rely on cdc-wdm driver/device name for device detection. It is then safer to keep the 'legacy' cdc-wdm character device to prevent any breakage. This is handled in this change by API mutual exclusion, only one access method can be used at a time, either cdc-wdm chardev or WWAN API. Note that unknown channel types (other than MBIM, AT or MBIM) are not registered to the WWAN framework. Signed-off-by: Loic Poulain Signed-off-by: David S. Miller --- drivers/net/usb/cdc_mbim.c | 1 + drivers/net/usb/huawei_cdc_ncm.c | 1 + drivers/net/usb/qmi_wwan.c | 3 +- drivers/usb/class/cdc-wdm.c | 180 ++++++++++++++++++++++++++++++++++++++- include/linux/usb/cdc-wdm.h | 3 +- 5 files changed, 182 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 5db66272fc82..42fb75057c15 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -168,6 +168,7 @@ static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf) subdriver = usb_cdc_wdm_register(ctx->control, &dev->status->desc, le16_to_cpu(ctx->mbim_desc->wMaxControlMessage), + WWAN_PORT_MBIM, cdc_mbim_wdm_manage_power); if (IS_ERR(subdriver)) { ret = PTR_ERR(subdriver); diff --git a/drivers/net/usb/huawei_cdc_ncm.c b/drivers/net/usb/huawei_cdc_ncm.c index a87f0dabcdb7..849b77330bf2 100644 --- a/drivers/net/usb/huawei_cdc_ncm.c +++ b/drivers/net/usb/huawei_cdc_ncm.c @@ -96,6 +96,7 @@ static int huawei_cdc_ncm_bind(struct usbnet *usbnet_dev, subdriver = usb_cdc_wdm_register(ctx->control, &usbnet_dev->status->desc, 1024, /* wMaxCommand */ + WWAN_PORT_AT, huawei_cdc_ncm_wdm_manage_power); if (IS_ERR(subdriver)) { ret = PTR_ERR(subdriver); diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 6700f1970b24..db157f21a322 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -710,7 +710,8 @@ static int qmi_wwan_register_subdriver(struct usbnet *dev) /* register subdriver */ subdriver = usb_cdc_wdm_register(info->control, &dev->status->desc, - 4096, &qmi_wwan_cdc_wdm_manage_power); + 4096, WWAN_PORT_QMI, + &qmi_wwan_cdc_wdm_manage_power); if (IS_ERR(subdriver)) { dev_err(&info->control->dev, "subdriver registration failed\n"); rv = PTR_ERR(subdriver); diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 508b1c3f8b73..457b00c6e984 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -21,8 +21,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -55,6 +57,7 @@ MODULE_DEVICE_TABLE (usb, wdm_ids); #define WDM_SUSPENDING 8 #define WDM_RESETTING 9 #define WDM_OVERFLOW 10 +#define WDM_WWAN_IN_USE 11 #define WDM_MAX 16 @@ -106,6 +109,9 @@ struct wdm_device { struct list_head device_list; int (*manage_power)(struct usb_interface *, int); + + enum wwan_port_type wwanp_type; + struct wwan_port *wwanp; }; static struct usb_driver wdm_driver; @@ -157,6 +163,8 @@ static void wdm_out_callback(struct urb *urb) wake_up_all(&desc->wait); } +static void wdm_wwan_rx(struct wdm_device *desc, int length); + static void wdm_in_callback(struct urb *urb) { unsigned long flags; @@ -192,6 +200,11 @@ static void wdm_in_callback(struct urb *urb) } } + if (test_bit(WDM_WWAN_IN_USE, &desc->flags)) { + wdm_wwan_rx(desc, length); + goto out; + } + /* * only set a new error if there is no previous error. * Errors are only cleared during read/open @@ -226,6 +239,7 @@ skip_error: set_bit(WDM_READ, &desc->flags); wake_up(&desc->wait); } +out: spin_unlock_irqrestore(&desc->iuspin, flags); } @@ -697,6 +711,11 @@ static int wdm_open(struct inode *inode, struct file *file) goto out; file->private_data = desc; + if (test_bit(WDM_WWAN_IN_USE, &desc->flags)) { + rv = -EBUSY; + goto out; + } + rv = usb_autopm_get_interface(desc->intf); if (rv < 0) { dev_err(&desc->intf->dev, "Error autopm - %d\n", rv); @@ -792,6 +811,151 @@ static struct usb_class_driver wdm_class = { .minor_base = WDM_MINOR_BASE, }; +/* --- WWAN framework integration --- */ +#ifdef CONFIG_WWAN +static int wdm_wwan_port_start(struct wwan_port *port) +{ + struct wdm_device *desc = wwan_port_get_drvdata(port); + + /* The interface is both exposed via the WWAN framework and as a + * legacy usbmisc chardev. If chardev is already open, just fail + * to prevent concurrent usage. Otherwise, switch to WWAN mode. + */ + mutex_lock(&wdm_mutex); + if (desc->count) { + mutex_unlock(&wdm_mutex); + return -EBUSY; + } + set_bit(WDM_WWAN_IN_USE, &desc->flags); + mutex_unlock(&wdm_mutex); + + desc->manage_power(desc->intf, 1); + + /* tx is allowed */ + wwan_port_txon(port); + + /* Start getting events */ + return usb_submit_urb(desc->validity, GFP_KERNEL); +} + +static void wdm_wwan_port_stop(struct wwan_port *port) +{ + struct wdm_device *desc = wwan_port_get_drvdata(port); + + /* Stop all transfers and disable WWAN mode */ + kill_urbs(desc); + desc->manage_power(desc->intf, 0); + clear_bit(WDM_READ, &desc->flags); + clear_bit(WDM_WWAN_IN_USE, &desc->flags); +} + +static void wdm_wwan_port_tx_complete(struct urb *urb) +{ + struct sk_buff *skb = urb->context; + struct wdm_device *desc = skb_shinfo(skb)->destructor_arg; + + usb_autopm_put_interface(desc->intf); + wwan_port_txon(desc->wwanp); + kfree_skb(skb); +} + +static int wdm_wwan_port_tx(struct wwan_port *port, struct sk_buff *skb) +{ + struct wdm_device *desc = wwan_port_get_drvdata(port); + struct usb_interface *intf = desc->intf; + struct usb_ctrlrequest *req = desc->orq; + int rv; + + rv = usb_autopm_get_interface(intf); + if (rv) + return rv; + + usb_fill_control_urb( + desc->command, + interface_to_usbdev(intf), + usb_sndctrlpipe(interface_to_usbdev(intf), 0), + (unsigned char *)req, + skb->data, + skb->len, + wdm_wwan_port_tx_complete, + skb + ); + + req->bRequestType = (USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE); + req->bRequest = USB_CDC_SEND_ENCAPSULATED_COMMAND; + req->wValue = 0; + req->wIndex = desc->inum; + req->wLength = cpu_to_le16(skb->len); + + skb_shinfo(skb)->destructor_arg = desc; + + rv = usb_submit_urb(desc->command, GFP_KERNEL); + if (rv) + usb_autopm_put_interface(intf); + else /* One transfer at a time, stop TX until URB completion */ + wwan_port_txoff(port); + + return rv; +} + +static struct wwan_port_ops wdm_wwan_port_ops = { + .start = wdm_wwan_port_start, + .stop = wdm_wwan_port_stop, + .tx = wdm_wwan_port_tx, +}; + +static void wdm_wwan_init(struct wdm_device *desc) +{ + struct usb_interface *intf = desc->intf; + struct wwan_port *port; + + /* Only register to WWAN core if protocol/type is known */ + if (desc->wwanp_type == WWAN_PORT_UNKNOWN) { + dev_info(&intf->dev, "Unknown control protocol\n"); + return; + } + + port = wwan_create_port(&intf->dev, desc->wwanp_type, &wdm_wwan_port_ops, desc); + if (IS_ERR(port)) { + dev_err(&intf->dev, "%s: Unable to create WWAN port\n", + dev_name(intf->usb_dev)); + return; + } + + desc->wwanp = port; +} + +static void wdm_wwan_deinit(struct wdm_device *desc) +{ + if (!desc->wwanp) + return; + + wwan_remove_port(desc->wwanp); + desc->wwanp = NULL; +} + +static void wdm_wwan_rx(struct wdm_device *desc, int length) +{ + struct wwan_port *port = desc->wwanp; + struct sk_buff *skb; + + /* Forward data to WWAN port */ + skb = alloc_skb(length, GFP_ATOMIC); + if (!skb) + return; + + memcpy(skb_put(skb, length), desc->inbuf, length); + wwan_port_rx(port, skb); + + /* inbuf has been copied, it is safe to check for outstanding data */ + schedule_work(&desc->service_outs_intr); +} +#else /* CONFIG_WWAN */ +static void wdm_wwan_init(struct wdm_device *desc) {} +static void wdm_wwan_deinit(struct wdm_device *desc) {} +static void wdm_wwan_rx(struct wdm_device *desc, int length) {} +#endif /* CONFIG_WWAN */ + /* --- error handling --- */ static void wdm_rxwork(struct work_struct *work) { @@ -836,7 +1000,8 @@ static void service_interrupt_work(struct work_struct *work) /* --- hotplug --- */ static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor *ep, - u16 bufsize, int (*manage_power)(struct usb_interface *, int)) + u16 bufsize, enum wwan_port_type type, + int (*manage_power)(struct usb_interface *, int)) { int rv = -ENOMEM; struct wdm_device *desc; @@ -853,6 +1018,7 @@ static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor /* this will be expanded and needed in hardware endianness */ desc->inum = cpu_to_le16((u16)intf->cur_altsetting->desc.bInterfaceNumber); desc->intf = intf; + desc->wwanp_type = type; INIT_WORK(&desc->rxwork, wdm_rxwork); INIT_WORK(&desc->service_outs_intr, service_interrupt_work); @@ -933,6 +1099,9 @@ static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor goto err; else dev_info(&intf->dev, "%s: USB WDM device\n", dev_name(intf->usb_dev)); + + wdm_wwan_init(desc); + out: return rv; err: @@ -977,7 +1146,7 @@ static int wdm_probe(struct usb_interface *intf, const struct usb_device_id *id) goto err; ep = &iface->endpoint[0].desc; - rv = wdm_create(intf, ep, maxcom, &wdm_manage_power); + rv = wdm_create(intf, ep, maxcom, WWAN_PORT_UNKNOWN, &wdm_manage_power); err: return rv; @@ -988,6 +1157,7 @@ err: * @intf: usb interface the subdriver will associate with * @ep: interrupt endpoint to monitor for notifications * @bufsize: maximum message size to support for read/write + * @type: Type/protocol of the transported data (MBIM, QMI...) * @manage_power: call-back invoked during open and release to * manage the device's power * Create WDM usb class character device and associate it with intf @@ -1005,12 +1175,12 @@ err: */ struct usb_driver *usb_cdc_wdm_register(struct usb_interface *intf, struct usb_endpoint_descriptor *ep, - int bufsize, + int bufsize, enum wwan_port_type type, int (*manage_power)(struct usb_interface *, int)) { int rv; - rv = wdm_create(intf, ep, bufsize, manage_power); + rv = wdm_create(intf, ep, bufsize, type, manage_power); if (rv < 0) goto err; @@ -1029,6 +1199,8 @@ static void wdm_disconnect(struct usb_interface *intf) desc = wdm_find_device(intf); mutex_lock(&wdm_mutex); + wdm_wwan_deinit(desc); + /* the spinlock makes sure no new urbs are generated in the callbacks */ spin_lock_irqsave(&desc->iuspin, flags); set_bit(WDM_DISCONNECTING, &desc->flags); diff --git a/include/linux/usb/cdc-wdm.h b/include/linux/usb/cdc-wdm.h index 9b895f93d8de..9f5a51f79ba5 100644 --- a/include/linux/usb/cdc-wdm.h +++ b/include/linux/usb/cdc-wdm.h @@ -12,11 +12,12 @@ #ifndef __LINUX_USB_CDC_WDM_H #define __LINUX_USB_CDC_WDM_H +#include #include extern struct usb_driver *usb_cdc_wdm_register(struct usb_interface *intf, struct usb_endpoint_descriptor *ep, - int bufsize, + int bufsize, enum wwan_port_type type, int (*manage_power)(struct usb_interface *, int)); #endif /* __LINUX_USB_CDC_WDM_H */ -- cgit v1.2.3 From c5895d3f06cbb80ccb311f1dcb37074651030cb6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2021 22:43:42 +0200 Subject: sched: Simplify sched_info_on() The situation around sched_info is somewhat complicated, it is used by sched_stats and delayacct and, indirectly, kvm. If SCHEDSTATS=Y (but disabled by default) sched_info_on() is unconditionally true -- this is the case for all distro kernel configs I checked. If for some reason SCHEDSTATS=N, but TASK_DELAY_ACCT=Y, then sched_info_on() can return false when delayacct is disabled, presumably because there would be no other users left; except kvm is. Instead of complicating matters further by accurately accounting sched_stat and kvm state, simply unconditionally enable when SCHED_INFO=Y, matching the common distro case. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210505111525.121458839@infradead.org --- include/linux/sched/stat.h | 10 ++-------- kernel/delayacct.c | 1 - kernel/sched/stats.h | 37 ++++++++++--------------------------- 3 files changed, 12 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 568286411b43..939c3ec9e1b9 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -3,6 +3,7 @@ #define _LINUX_SCHED_STAT_H #include +#include /* * Various counters maintained by the scheduler and fork(), @@ -23,14 +24,7 @@ extern unsigned long nr_iowait_cpu(int cpu); static inline int sched_info_on(void) { -#ifdef CONFIG_SCHEDSTATS - return 1; -#elif defined(CONFIG_TASK_DELAY_ACCT) - extern int delayacct_on; - return delayacct_on; -#else - return 0; -#endif + return IS_ENABLED(CONFIG_SCHED_INFO); } #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3fe7cd52b459..3a0b910386d1 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -15,7 +15,6 @@ #include int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ -EXPORT_SYMBOL_GPL(delayacct_on); struct kmem_cache *delayacct_cache; static int __init delayacct_setup_disable(char *str) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index ee7da12a7056..33ffd41935ba 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -150,11 +150,6 @@ static inline void psi_sched_switch(struct task_struct *prev, #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO -static inline void sched_info_reset_dequeued(struct task_struct *t) -{ - t->sched_info.last_queued = 0; -} - /* * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a CPU, we call this routine @@ -163,13 +158,12 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) */ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long delta = 0; - if (sched_info_on()) { - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; + if (t->sched_info.last_queued) { + delta = rq_clock(rq) - t->sched_info.last_queued; + t->sched_info.last_queued = 0; } - sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; rq_sched_info_dequeue(rq, delta); @@ -184,9 +178,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) { unsigned long long now = rq_clock(rq), delta = 0; - if (t->sched_info.last_queued) + if (t->sched_info.last_queued) { delta = now - t->sched_info.last_queued; - sched_info_reset_dequeued(t); + t->sched_info.last_queued = 0; + } t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; @@ -201,10 +196,8 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) */ static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t) { - if (sched_info_on()) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(rq); - } + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); } /* @@ -231,7 +224,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { /* * prev now departs the CPU. It's not interesting to record @@ -245,18 +238,8 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct sched_info_arrive(rq, next); } -static inline void -sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) -{ - if (sched_info_on()) - __sched_info_switch(rq, prev, next); -} - #else /* !CONFIG_SCHED_INFO: */ # define sched_info_enqueue(rq, t) do { } while (0) -# define sched_info_reset_dequeued(t) do { } while (0) # define sched_info_dequeue(rq, t) do { } while (0) -# define sched_info_depart(rq, t) do { } while (0) -# define sched_info_arrive(rq, next) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ -- cgit v1.2.3 From eee4d9fee2544389e5ce5697ed92db67c86d7a9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2021 22:43:36 +0200 Subject: delayacct: Add static_branch in scheduler hooks Cheaper when delayacct is disabled. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Balbir Singh Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210505111525.248028369@infradead.org --- include/linux/delayacct.h | 8 ++++++++ kernel/delayacct.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 21651f946751..57fefa54b53a 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -58,8 +58,10 @@ struct task_delay_info { #include #include +#include #ifdef CONFIG_TASK_DELAY_ACCT +DECLARE_STATIC_KEY_TRUE(delayacct_key); extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); @@ -114,6 +116,9 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void delayacct_blkio_start(void) { + if (!static_branch_likely(&delayacct_key)) + return; + delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); @@ -121,6 +126,9 @@ static inline void delayacct_blkio_start(void) static inline void delayacct_blkio_end(struct task_struct *p) { + if (!static_branch_likely(&delayacct_key)) + return; + if (p->delays) __delayacct_blkio_end(p); delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3a0b910386d1..63012fd39ae2 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,6 +14,7 @@ #include #include +DEFINE_STATIC_KEY_TRUE(delayacct_key); int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; @@ -28,6 +29,8 @@ void delayacct_init(void) { delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); + if (!delayacct_on) + static_branch_disable(&delayacct_key); } void __delayacct_tsk_init(struct task_struct *tsk) -- cgit v1.2.3 From e4042ad492357fa995921376462b04a025dd53b6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2021 22:43:32 +0200 Subject: delayacct: Default disabled Assuming this stuff isn't actually used much; disable it by default and avoid allocating and tracking the task_delay_info structure. taskstats is changed to still report the regular sched and sched_info and only skip the missing task_delay_info fields instead of not reporting anything. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210505111525.308018373@infradead.org --- Documentation/accounting/delay-accounting.rst | 8 ++++---- Documentation/admin-guide/kernel-parameters.txt | 2 +- include/linux/delayacct.h | 16 ++++------------ kernel/delayacct.c | 19 +++++++++++-------- 4 files changed, 20 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 7cc7f5852da0..f20b282d6de0 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -69,13 +69,13 @@ Compile the kernel with:: CONFIG_TASK_DELAY_ACCT=y CONFIG_TASKSTATS=y -Delay accounting is enabled by default at boot up. -To disable, add:: +Delay accounting is disabled by default at boot up. +To enable, add:: - nodelayacct + delayacct to the kernel boot options. The rest of the instructions -below assume this has not been done. +below assume this has been done. After the system has booted up, use a utility similar to getdelays.c to access the delays diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cb89dbdedc46..ef5048c127a3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3244,7 +3244,7 @@ noclflush [BUGS=X86] Don't use the CLFLUSH instruction - nodelayacct [KNL] Disable per-task delay accounting + delayacct [KNL] Enable per-task delay accounting nodsp [SH] Disable hardware DSP at boot time. diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 57fefa54b53a..225c8e01a111 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -61,7 +61,7 @@ struct task_delay_info { #include #ifdef CONFIG_TASK_DELAY_ACCT -DECLARE_STATIC_KEY_TRUE(delayacct_key); +DECLARE_STATIC_KEY_FALSE(delayacct_key); extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); @@ -69,7 +69,7 @@ extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(struct task_struct *); -extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); +extern int delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); @@ -116,7 +116,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void delayacct_blkio_start(void) { - if (!static_branch_likely(&delayacct_key)) + if (!static_branch_unlikely(&delayacct_key)) return; delayacct_set_flag(current, DELAYACCT_PF_BLKIO); @@ -126,7 +126,7 @@ static inline void delayacct_blkio_start(void) static inline void delayacct_blkio_end(struct task_struct *p) { - if (!static_branch_likely(&delayacct_key)) + if (!static_branch_unlikely(&delayacct_key)) return; if (p->delays) @@ -134,14 +134,6 @@ static inline void delayacct_blkio_end(struct task_struct *p) delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } -static inline int delayacct_add_tsk(struct taskstats *d, - struct task_struct *tsk) -{ - if (!delayacct_on || !tsk->delays) - return 0; - return __delayacct_add_tsk(d, tsk); -} - static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { if (tsk->delays) diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 63012fd39ae2..3f086903b295 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,23 +14,23 @@ #include #include -DEFINE_STATIC_KEY_TRUE(delayacct_key); -int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ +DEFINE_STATIC_KEY_FALSE(delayacct_key); +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; -static int __init delayacct_setup_disable(char *str) +static int __init delayacct_setup_enable(char *str) { - delayacct_on = 0; + delayacct_on = 1; return 1; } -__setup("nodelayacct", delayacct_setup_disable); +__setup("delayacct", delayacct_setup_enable); void delayacct_init(void) { delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); - if (!delayacct_on) - static_branch_disable(&delayacct_key); + if (delayacct_on) + static_branch_enable(&delayacct_key); } void __delayacct_tsk_init(struct task_struct *tsk) @@ -83,7 +83,7 @@ void __delayacct_blkio_end(struct task_struct *p) delayacct_end(&delays->lock, &delays->blkio_start, total, count); } -int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { u64 utime, stime, stimescaled, utimescaled; unsigned long long t2, t3; @@ -118,6 +118,9 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + if (!tsk->delays) + return 0; + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ raw_spin_lock_irqsave(&tsk->delays->lock, flags); -- cgit v1.2.3 From 0cd7c741f01de13dc1eecf22557593b3514639bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 May 2021 14:01:00 +0200 Subject: delayacct: Add sysctl to enable at runtime Just like sched_schedstats, allow runtime enabling (and disabling) of delayacct. This is useful if one forgot to add the delayacct boot time option. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YJkhebGJAywaZowX@hirez.programming.kicks-ass.net --- Documentation/accounting/delay-accounting.rst | 6 +++-- include/linux/delayacct.h | 4 +++ kernel/delayacct.c | 36 +++++++++++++++++++++++++-- kernel/sysctl.c | 12 +++++++++ 4 files changed, 54 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index f20b282d6de0..1b8b46deeb29 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -74,8 +74,10 @@ To enable, add:: delayacct -to the kernel boot options. The rest of the instructions -below assume this has been done. +to the kernel boot options. The rest of the instructions below assume this has +been done. Alternatively, use sysctl kernel.task_delayacct to switch the state +at runtime. Note however that only tasks started after enabling it will have +delayacct information. After the system has booted up, use a utility similar to getdelays.c to access the delays diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 225c8e01a111..af7e6eb50283 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -65,6 +65,10 @@ DECLARE_STATIC_KEY_FALSE(delayacct_key); extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); + +extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); + extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3f086903b295..51530d5b15a8 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -18,6 +18,17 @@ DEFINE_STATIC_KEY_FALSE(delayacct_key); int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; +static void set_delayacct(bool enabled) +{ + if (enabled) { + static_branch_enable(&delayacct_key); + delayacct_on = 1; + } else { + delayacct_on = 0; + static_branch_disable(&delayacct_key); + } +} + static int __init delayacct_setup_enable(char *str) { delayacct_on = 1; @@ -29,9 +40,30 @@ void delayacct_init(void) { delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); - if (delayacct_on) - static_branch_enable(&delayacct_key); + set_delayacct(delayacct_on); +} + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int state = delayacct_on; + struct ctl_table t; + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_delayacct(state); + return err; } +#endif void __delayacct_tsk_init(struct task_struct *tsk) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 14edf84cc571..0afbfc83157a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -71,6 +71,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1727,6 +1728,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_TASK_DELAY_ACCT + { + .procname = "task_delayacct", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_delayacct, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_TASK_DELAY_ACCT */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing", -- cgit v1.2.3 From 8a311c740b53324ec584e0e3bb7077d56b123c28 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 18:19:36 -0500 Subject: sched: Basic tracking of matching tasks Introduce task_struct::core_cookie as an opaque identifier for core scheduling. When enabled; core scheduling will only allow matching task to be on the core; where idle matches everything. When task_struct::core_cookie is set (and core scheduling is enabled) these tasks are indexed in a second RB-tree, first on cookie value then on scheduling function, such that matching task selection always finds the most elegible match. NOTE: *shudder* at the overhead... NOTE: *sigh*, a 3rd copy of the scheduling function; the alternative is per class tracking of cookies and that just duplicates a lot of stuff for no raisin (the 2nd copy lives in the rt-mutex PI code). [Joel: folded fixes] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123308.496975854@infradead.org --- include/linux/sched.h | 8 ++- kernel/sched/core.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 46 --------------- kernel/sched/sched.h | 55 ++++++++++++++++++ 4 files changed, 210 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d2c881384517..45eedccf86aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -700,10 +700,16 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; + struct sched_dl_entity dl; + +#ifdef CONFIG_SCHED_CORE + struct rb_node core_node; + unsigned long core_cookie; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif - struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 85147bea9d93..c057d471c025 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -88,6 +88,133 @@ __read_mostly int scheduler_running; DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); +/* kernel prio, less is more */ +static inline int __task_prio(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + + if (rt_prio(p->prio)) /* includes deadline */ + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) + return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ + + return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ +} + +/* + * l(a,b) + * le(a,b) := !l(b,a) + * g(a,b) := l(b,a) + * ge(a,b) := !l(a,b) + */ + +/* real prio, less is less */ +static inline bool prio_less(struct task_struct *a, struct task_struct *b) +{ + + int pa = __task_prio(a), pb = __task_prio(b); + + if (-pa < -pb) + return true; + + if (-pb < -pa) + return false; + + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ + return !dl_time_before(a->dl.deadline, b->dl.deadline); + + if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */ + u64 vruntime = b->se.vruntime; + + /* + * Normalize the vruntime if tasks are in different cpus. + */ + if (task_cpu(a) != task_cpu(b)) { + vruntime -= task_cfs_rq(b)->min_vruntime; + vruntime += task_cfs_rq(a)->min_vruntime; + } + + return !((s64)(a->se.vruntime - vruntime) <= 0); + } + + return false; +} + +static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +{ + if (a->core_cookie < b->core_cookie) + return true; + + if (a->core_cookie > b->core_cookie) + return false; + + /* flip prio, so high prio is leftmost */ + if (prio_less(b, a)) + return true; + + return false; +} + +#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node) + +static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b) +{ + return __sched_core_less(__node_2_sc(a), __node_2_sc(b)); +} + +static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) +{ + const struct task_struct *p = __node_2_sc(node); + unsigned long cookie = (unsigned long)key; + + if (cookie < p->core_cookie) + return -1; + + if (cookie > p->core_cookie) + return 1; + + return 0; +} + +static void sched_core_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); +} + +static void sched_core_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_erase(&p->core_node, &rq->core_tree); +} + +/* + * Find left-most (aka, highest priority) task matching @cookie. + */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct rb_node *node; + + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); + /* + * The idle task always matches any cookie! + */ + if (!node) + return idle_sched_class.pick_task(rq); + + return __node_2_sc(node); +} + /* * Magic required such that: * @@ -147,10 +274,16 @@ static void __sched_core_flip(bool enabled) cpus_read_unlock(); } -static void __sched_core_enable(void) +static void sched_core_assert_empty(void) { - // XXX verify there are no cookie tasks (yet) + int cpu; + for_each_possible_cpu(cpu) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree)); +} + +static void __sched_core_enable(void) +{ static_branch_enable(&__sched_core_enabled); /* * Ensure all previous instances of raw_spin_rq_*lock() have finished @@ -158,12 +291,12 @@ static void __sched_core_enable(void) */ synchronize_rcu(); __sched_core_flip(true); + sched_core_assert_empty(); } static void __sched_core_disable(void) { - // XXX verify there are no cookie tasks (left) - + sched_core_assert_empty(); __sched_core_flip(false); static_branch_disable(&__sched_core_enabled); } @@ -205,6 +338,11 @@ void sched_core_put(void) schedule_work(&_work); } +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } +static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } + #endif /* CONFIG_SCHED_CORE */ /* @@ -1797,10 +1935,16 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p); + if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51d72ab5b5ae..08be7a2eb05b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -268,33 +268,11 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (!path) @@ -455,33 +433,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (path) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fa990cd259ce..e43a2176d88f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1080,6 +1080,10 @@ struct rq { /* per rq */ struct rq *core; unsigned int core_enabled; + struct rb_root core_tree; + + /* shared state */ + unsigned int core_task_seq; #endif }; @@ -1243,6 +1247,57 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + extern void update_rq_clock(struct rq *rq); static inline u64 __rq_clock_broken(struct rq *rq) -- cgit v1.2.3 From d2dfa17bc7de67e99685c4d6557837bf801a102c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 18:19:43 -0500 Subject: sched: Trivial forced-newidle balancer When a sibling is forced-idle to match the core-cookie; search for matching tasks to fill the core. rcu_read_unlock() can incur an infrequent deadlock in sched_core_balance(). Fix this by using the RCU-sched flavor instead. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123308.800048269@infradead.org --- include/linux/sched.h | 1 + kernel/sched/core.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/idle.c | 1 + kernel/sched/sched.h | 6 +++ 4 files changed, 137 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 45eedccf86aa..9b822e383212 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -705,6 +705,7 @@ struct task_struct { #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; + unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e45c1d21b371..b4988887510f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -204,6 +204,21 @@ static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) return __node_2_sc(node); } +static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) +{ + struct rb_node *node = &p->core_node; + + node = rb_next(node); + if (!node) + return NULL; + + p = container_of(node, struct task_struct, core_node); + if (p->core_cookie != cookie) + return NULL; + + return p; +} + /* * Magic required such that: * @@ -5389,8 +5404,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; const struct cpumask *smt_mask; bool fi_before = false; + int i, j, cpu, occ = 0; bool need_sync; - int i, j, cpu; if (!sched_core_enabled(rq)) return __pick_next_task(rq, prev, rf); @@ -5512,6 +5527,9 @@ again: if (!p) continue; + if (!is_task_rq_idle(p)) + occ++; + rq_i->core_pick = p; if (rq_i->idle == p && rq_i->nr_running) { rq->core->core_forceidle = true; @@ -5543,6 +5561,7 @@ again: cpu_rq(j)->core_pick = NULL; } + occ = 1; goto again; } } @@ -5588,6 +5607,8 @@ again: if (!(fi_before && rq->core->core_forceidle)) task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle); + rq_i->core_pick->core_occupation = occ; + if (i == cpu) { rq_i->core_pick = NULL; continue; @@ -5609,6 +5630,113 @@ done: return next; } +static bool try_steal_cookie(int this, int that) +{ + struct rq *dst = cpu_rq(this), *src = cpu_rq(that); + struct task_struct *p; + unsigned long cookie; + bool success = false; + + local_irq_disable(); + double_rq_lock(dst, src); + + cookie = dst->core->core_cookie; + if (!cookie) + goto unlock; + + if (dst->curr != dst->idle) + goto unlock; + + p = sched_core_find(src, cookie); + if (p == src->idle) + goto unlock; + + do { + if (p == src->core_pick || p == src->curr) + goto next; + + if (!cpumask_test_cpu(this, &p->cpus_mask)) + goto next; + + if (p->core_occupation > dst->idle->core_occupation) + goto next; + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(src, p, 0); + set_task_cpu(p, this); + activate_task(dst, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + + resched_curr(dst); + + success = true; + break; + +next: + p = sched_core_next(p, cookie); + } while (p); + +unlock: + double_rq_unlock(dst, src); + local_irq_enable(); + + return success; +} + +static bool steal_cookie_task(int cpu, struct sched_domain *sd) +{ + int i; + + for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + if (i == cpu) + continue; + + if (need_resched()) + break; + + if (try_steal_cookie(cpu, i)) + return true; + } + + return false; +} + +static void sched_core_balance(struct rq *rq) +{ + struct sched_domain *sd; + int cpu = cpu_of(rq); + + preempt_disable(); + rcu_read_lock(); + raw_spin_rq_unlock_irq(rq); + for_each_domain(cpu, sd) { + if (need_resched()) + break; + + if (steal_cookie_task(cpu, sd)) + break; + } + raw_spin_rq_lock_irq(rq); + rcu_read_unlock(); + preempt_enable(); +} + +static DEFINE_PER_CPU(struct callback_head, core_balance_head); + +void queue_core_balance(struct rq *rq) +{ + if (!sched_core_enabled(rq)) + return; + + if (!rq->core->core_cookie) + return; + + if (!rq->nr_running) /* not forced idle */ + return; + + queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); +} + static inline void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 43646e7876d9..912b47aa99d8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -437,6 +437,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir { update_idle_core(rq); schedstat_inc(rq->sched_goidle); + queue_core_balance(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a898abc60ce..91ca1fee9fec 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1170,6 +1170,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); +extern void queue_core_balance(struct rq *rq); + #else /* !CONFIG_SCHED_CORE */ static inline bool sched_core_enabled(struct rq *rq) @@ -1192,6 +1194,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } +static inline void queue_core_balance(struct rq *rq) +{ +} + #endif /* CONFIG_SCHED_CORE */ static inline void lockdep_assert_rq_held(struct rq *rq) -- cgit v1.2.3 From 6e33cad0af49336952e5541464bd02f5b5fd433e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 18:55:06 +0100 Subject: sched: Trivial core scheduling cookie management In order to not have to use pid_struct, create a new, smaller, structure to manage task cookies for core scheduling. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123308.919768100@infradead.org --- include/linux/sched.h | 6 +++ kernel/fork.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 7 +-- kernel/sched/core_sched.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 16 +++++++ 6 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 kernel/sched/core_sched.c (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9b822e383212..eab3f7c4251b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2179,4 +2179,10 @@ int sched_trace_rq_nr_running(struct rq *rq); const struct cpumask *sched_trace_rd_span(struct root_domain *rd); +#ifdef CONFIG_SCHED_CORE +extern void sched_core_free(struct task_struct *tsk); +#else +static inline void sched_core_free(struct task_struct *tsk) { } +#endif + #endif diff --git a/kernel/fork.c b/kernel/fork.c index dc06afd725cb..d16c60c9daca 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -742,6 +742,7 @@ void __put_task_struct(struct task_struct *tsk) exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); + sched_core_free(tsk); if (!profile_handoff_task(tsk)) free_task(tsk); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862..978fcfca5871 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_SCHED_CORE) += core_sched.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b4988887510f..55b2d9399e12 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -167,7 +167,7 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) return 0; } -static void sched_core_enqueue(struct rq *rq, struct task_struct *p) +void sched_core_enqueue(struct rq *rq, struct task_struct *p) { rq->core->core_task_seq++; @@ -177,14 +177,15 @@ static void sched_core_enqueue(struct rq *rq, struct task_struct *p) rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); } -static void sched_core_dequeue(struct rq *rq, struct task_struct *p) +void sched_core_dequeue(struct rq *rq, struct task_struct *p) { rq->core->core_task_seq++; - if (!p->core_cookie) + if (!sched_core_enqueued(p)) return; rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); } /* diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c new file mode 100644 index 000000000000..8d0869a9eb8c --- /dev/null +++ b/kernel/sched/core_sched.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "sched.h" + +/* + * A simple wrapper around refcount. An allocated sched_core_cookie's + * address is used to compute the cookie of the task. + */ +struct sched_core_cookie { + refcount_t refcnt; +}; + +unsigned long sched_core_alloc_cookie(void) +{ + struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL); + if (!ck) + return 0; + + refcount_set(&ck->refcnt, 1); + sched_core_get(); + + return (unsigned long)ck; +} + +void sched_core_put_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr && refcount_dec_and_test(&ptr->refcnt)) { + kfree(ptr); + sched_core_put(); + } +} + +unsigned long sched_core_get_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr) + refcount_inc(&ptr->refcnt); + + return cookie; +} + +/* + * sched_core_update_cookie - replace the cookie on a task + * @p: the task to update + * @cookie: the new cookie + * + * Effectively exchange the task cookie; caller is responsible for lifetimes on + * both ends. + * + * Returns: the old cookie + */ +unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie) +{ + unsigned long old_cookie; + struct rq_flags rf; + struct rq *rq; + bool enqueued; + + rq = task_rq_lock(p, &rf); + + /* + * Since creating a cookie implies sched_core_get(), and we cannot set + * a cookie until after we've created it, similarly, we cannot destroy + * a cookie until after we've removed it, we must have core scheduling + * enabled here. + */ + SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + + enqueued = sched_core_enqueued(p); + if (enqueued) + sched_core_dequeue(rq, p); + + old_cookie = p->core_cookie; + p->core_cookie = cookie; + + if (enqueued) + sched_core_enqueue(rq, p); + + /* + * If task is currently running, it may not be compatible anymore after + * the cookie change, so enter the scheduler on its CPU to schedule it + * away. + */ + if (task_running(rq, p)) + resched_curr(rq); + + task_rq_unlock(rq, p, &rf); + + return old_cookie; +} + +static unsigned long sched_core_clone_cookie(struct task_struct *p) +{ + unsigned long cookie, flags; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cookie = sched_core_get_cookie(p->core_cookie); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return cookie; +} + +void sched_core_free(struct task_struct *p) +{ + sched_core_put_cookie(p->core_cookie); +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3878386a0a02..904c52b560d1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1229,6 +1229,22 @@ static inline bool sched_group_cookie_match(struct rq *rq, extern void queue_core_balance(struct rq *rq); +static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +extern unsigned long sched_core_alloc_cookie(void); +extern void sched_core_put_cookie(unsigned long cookie); +extern unsigned long sched_core_get_cookie(unsigned long cookie); +extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie); + #else /* !CONFIG_SCHED_CORE */ static inline bool sched_core_enabled(struct rq *rq) -- cgit v1.2.3 From 85dd3f61203c5cfa72b308ff327b5fbf3fc1ce5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 29 Mar 2021 15:18:35 +0200 Subject: sched: Inherit task cookie on fork() Note that sched_core_fork() is called from under tasklist_lock, and not from sched_fork() earlier. This avoids a few races later. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123308.980003687@infradead.org --- include/linux/sched.h | 2 ++ kernel/fork.c | 3 +++ kernel/sched/core_sched.c | 6 ++++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index eab3f7c4251b..fba47e52e482 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2181,8 +2181,10 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); +extern void sched_core_fork(struct task_struct *p); #else static inline void sched_core_free(struct task_struct *tsk) { } +static inline void sched_core_fork(struct task_struct *p) { } #endif #endif diff --git a/kernel/fork.c b/kernel/fork.c index d16c60c9daca..e7fd928fcafe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2251,6 +2251,8 @@ static __latent_entropy struct task_struct *copy_process( klp_copy_process(p); + sched_core_fork(p); + spin_lock(¤t->sighand->siglock); /* @@ -2338,6 +2340,7 @@ static __latent_entropy struct task_struct *copy_process( return p; bad_fork_cancel_cgroup: + sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p, args); diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 8d0869a9eb8c..dcbbeaefaaa3 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -103,6 +103,12 @@ static unsigned long sched_core_clone_cookie(struct task_struct *p) return cookie; } +void sched_core_fork(struct task_struct *p) +{ + RB_CLEAR_NODE(&p->core_node); + p->core_cookie = sched_core_clone_cookie(current); +} + void sched_core_free(struct task_struct *p) { sched_core_put_cookie(p->core_cookie); -- cgit v1.2.3 From 7ac592aa35a684ff1858fb9ec282886b9e3575ac Mon Sep 17 00:00:00 2001 From: Chris Hyser Date: Wed, 24 Mar 2021 17:40:15 -0400 Subject: sched: prctl() core-scheduling interface This patch provides support for setting and copying core scheduling 'task cookies' between threads (PID), processes (TGID), and process groups (PGID). The value of core scheduling isn't that tasks don't share a core, 'nosmt' can do that. The value lies in exploiting all the sharing opportunities that exist to recover possible lost performance and that requires a degree of flexibility in the API. From a security perspective (and there are others), the thread, process and process group distinction is an existent hierarchal categorization of tasks that reflects many of the security concerns about 'data sharing'. For example, protecting against cache-snooping by a thread that can just read the memory directly isn't all that useful. With this in mind, subcommands to CREATE/SHARE (TO/FROM) provide a mechanism to create and share cookies. CREATE/SHARE_TO specify a target pid with enum pidtype used to specify the scope of the targeted tasks. For example, PIDTYPE_TGID will share the cookie with the process and all of it's threads as typically desired in a security scenario. API: prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, tgtpid, pidtype, &cookie) prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, tgtpid, pidtype, NULL) prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, tgtpid, pidtype, NULL) prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, srcpid, pidtype, NULL) where 'tgtpid/srcpid == 0' implies the current process and pidtype is kernel enum pid_type {PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, ...}. For return values, EINVAL, ENOMEM are what they say. ESRCH means the tgtpid/srcpid was not found. EPERM indicates lack of PTRACE permission access to tgtpid/srcpid. ENODEV indicates your machines lacks SMT. [peterz: complete rewrite] Signed-off-by: Chris Hyser Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123309.039845339@infradead.org --- include/linux/sched.h | 2 + include/uapi/linux/prctl.h | 8 +++ kernel/sched/core_sched.c | 114 +++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 5 ++ tools/include/uapi/linux/prctl.h | 8 +++ 5 files changed, 137 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fba47e52e482..c7e7d50e2fdc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2182,6 +2182,8 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 18a9f59dc067..967d9c55323d 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -259,4 +259,12 @@ struct prctl_mm_map { #define PR_PAC_SET_ENABLED_KEYS 60 #define PR_PAC_GET_ENABLED_KEYS 61 +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index dcbbeaefaaa3..9a80e9a474c0 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include "sched.h" /* @@ -113,3 +114,116 @@ void sched_core_free(struct task_struct *p) { sched_core_put_cookie(p->core_cookie); } + +static void __sched_core_set(struct task_struct *p, unsigned long cookie) +{ + cookie = sched_core_get_cookie(cookie); + cookie = sched_core_update_cookie(p, cookie); + sched_core_put_cookie(cookie); +} + +/* Called from prctl interface: PR_SCHED_CORE */ +int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr) +{ + unsigned long cookie = 0, id = 0; + struct task_struct *task, *p; + struct pid *grp; + int err = 0; + + if (!static_branch_likely(&sched_smt_present)) + return -ENODEV; + + if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 || + (cmd != PR_SCHED_CORE_GET && uaddr)) + return -EINVAL; + + rcu_read_lock(); + if (pid == 0) { + task = current; + } else { + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } + } + get_task_struct(task); + rcu_read_unlock(); + + /* + * Check if this process has the right to modify the specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out; + } + + switch (cmd) { + case PR_SCHED_CORE_GET: + if (type != PIDTYPE_PID || uaddr & 7) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + if (cookie) { + /* XXX improve ? */ + ptr_to_hashval((void *)cookie, &id); + } + err = put_user(id, (u64 __user *)uaddr); + goto out; + + case PR_SCHED_CORE_CREATE: + cookie = sched_core_alloc_cookie(); + if (!cookie) { + err = -ENOMEM; + goto out; + } + break; + + case PR_SCHED_CORE_SHARE_TO: + cookie = sched_core_clone_cookie(current); + break; + + case PR_SCHED_CORE_SHARE_FROM: + if (type != PIDTYPE_PID) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + __sched_core_set(current, cookie); + goto out; + + default: + err = -EINVAL; + goto out; + }; + + if (type == PIDTYPE_PID) { + __sched_core_set(task, cookie); + goto out; + } + + read_lock(&tasklist_lock); + grp = task_pid_type(task, type); + + do_each_pid_thread(grp, type, p) { + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out_tasklist; + } + } while_each_pid_thread(grp, type, p); + + do_each_pid_thread(grp, type, p) { + __sched_core_set(p, cookie); + } while_each_pid_thread(grp, type, p); +out_tasklist: + read_unlock(&tasklist_lock); + +out: + sched_core_put_cookie(cookie); + put_task_struct(task); + return err; +} + diff --git a/kernel/sys.c b/kernel/sys.c index 3a583a29815f..9de46a4bf492 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2550,6 +2550,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = set_syscall_user_dispatch(arg2, arg3, arg4, (char __user *) arg5); break; +#ifdef CONFIG_SCHED_CORE + case PR_SCHED_CORE: + error = sched_core_share_pid(arg2, arg3, arg4, arg5); + break; +#endif default: error = -EINVAL; break; diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 18a9f59dc067..967d9c55323d 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -259,4 +259,12 @@ struct prctl_mm_map { #define PR_PAC_SET_ENABLED_KEYS 60 #define PR_PAC_GET_ENABLED_KEYS 61 +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 + #endif /* _LINUX_PRCTL_H */ -- cgit v1.2.3 From fa5e5dc39669b4427830c546ede8709323b8276c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:33:58 +0200 Subject: jump_label, x86: Introduce jump_entry_size() This allows architectures to have variable sized jumps. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.786777050@infradead.org --- arch/x86/include/asm/jump_label.h | 4 ++-- arch/x86/kernel/jump_label.c | 7 +++++++ include/linux/jump_label.h | 9 +++++++++ kernel/jump_label.c | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index dfdc2b1c17dd..d85802a00629 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -4,8 +4,6 @@ #define HAVE_JUMP_LABEL_BATCH -#define JUMP_LABEL_NOP_SIZE 5 - #include #include @@ -47,6 +45,8 @@ l_yes: return true; } +extern int arch_jump_entry_size(struct jump_entry *entry); + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 638d3b9be0ad..a29eecc14c94 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -16,6 +16,13 @@ #include #include +#define JUMP_LABEL_NOP_SIZE JMP32_INSN_SIZE + +int arch_jump_entry_size(struct jump_entry *entry) +{ + return JMP32_INSN_SIZE; +} + static const void * __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type) { diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 05f5554d860f..8c45f58292ac 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -176,6 +176,15 @@ static inline void jump_entry_set_init(struct jump_entry *entry) entry->key |= 2; } +static inline int jump_entry_size(struct jump_entry *entry) +{ +#ifdef JUMP_LABEL_NOP_SIZE + return JUMP_LABEL_NOP_SIZE; +#else + return arch_jump_entry_size(entry); +#endif +} + #endif #endif diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ba39fbb1f8e7..521cafcfcb69 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -309,7 +309,7 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && - jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; -- cgit v1.2.3 From 5af0ea293d78c8b8f0b87ae2b13f7ac584057bc3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:34:00 +0200 Subject: jump_label: Free jump_entry::key bit1 for build use Have jump_label_init() set jump_entry::key bit1 to either 0 ot 1 unconditionally. This makes it available for build-time games. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.906893264@infradead.org --- include/linux/jump_label.h | 7 +++++-- kernel/jump_label.c | 10 ++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 8c45f58292ac..48b9b2a82767 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -171,9 +171,12 @@ static inline bool jump_entry_is_init(const struct jump_entry *entry) return (unsigned long)entry->key & 2UL; } -static inline void jump_entry_set_init(struct jump_entry *entry) +static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { - entry->key |= 2; + if (set) + entry->key |= 2; + else + entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 521cafcfcb69..bdb0681bece8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -483,13 +483,14 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); - if (init_section_contains((void *)jump_entry_code(iter), 1)) - jump_entry_set_init(iter); + in_init = init_section_contains((void *)jump_entry_code(iter), 1); + jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) @@ -634,9 +635,10 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + bool in_init; - if (within_module_init(jump_entry_code(iter), mod)) - jump_entry_set_init(iter); + in_init = within_module_init(jump_entry_code(iter), mod); + jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) -- cgit v1.2.3 From cc00c1988801dc71f63bb7bad019e85046865095 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 May 2021 19:51:31 +0200 Subject: sched: Fix leftover comment typos A few more snuck in. Also capitalize 'CPU' while at it. Signed-off-by: Ingo Molnar --- include/linux/sched_clock.h | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 528718e4ed52..835ee87ed792 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -14,7 +14,7 @@ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit * clocks. * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. + * @mult: Multiplier for scaled math conversion. * @shift: Shift value for scaled math conversion. * * Care must be taken when updating this structure; it is read by diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d00f4958bde..ac8882da5daf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5506,7 +5506,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } /* - * Try and select tasks for each sibling in decending sched_class + * Try and select tasks for each sibling in descending sched_class * order. */ for_each_class(class) { @@ -5520,7 +5520,7 @@ again: /* * If this sibling doesn't yet have a suitable task to - * run; ask for the most elegible task, given the + * run; ask for the most eligible task, given the * highest priority task already selected for this * core. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2635e1048421..161b92aa1c79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10808,11 +10808,11 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * sched_slice() considers only this active rq and it gets the * whole slice. But during force idle, we have siblings acting * like a single runqueue and hence we need to consider runnable - * tasks on this cpu and the forced idle cpu. Ideally, we should + * tasks on this CPU and the forced idle CPU. Ideally, we should * go through the forced idle rq, but that would be a perf hit. - * We can assume that the forced idle cpu has atleast + * We can assume that the forced idle CPU has at least * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check - * if we need to give up the cpu. + * if we need to give up the CPU. */ if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) -- cgit v1.2.3 From 93d0955e6cf562d02aae37f5f8d98d9d9d16e0d4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 May 2021 20:04:28 +0200 Subject: locking: Fix comment typos A few snuck through. Signed-off-by: Ingo Molnar --- include/linux/lockdep_types.h | 2 +- kernel/futex.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 2ec9ff5a7fff..3e726ace5c62 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -52,7 +52,7 @@ enum lockdep_lock_type { * NR_LOCKDEP_CACHING_CLASSES ... Number of classes * cached in the instance of lockdep_map * - * Currently main class (subclass == 0) and signle depth subclass + * Currently main class (subclass == 0) and single depth subclass * are cached in lockdep_map. This optimization is mainly targeting * on rq->lock. double_rq_lock() acquires this highly competitive with * single depth. diff --git a/kernel/futex.c b/kernel/futex.c index 4938a00bc785..2f386f012900 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1874,7 +1874,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * If the caller intends to requeue more than 1 waiter to pifutex, * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, * as we have means to handle the possible fault. If not, don't set - * the bit unecessarily as it will force the subsequent unlock to enter + * the bit unnecessarily as it will force the subsequent unlock to enter * the kernel. */ top_waiter = futex_top_waiter(hb1, key1); @@ -2103,7 +2103,7 @@ retry_private: continue; /* - * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always * be paired with each other and no other futex ops. * * We should never be requeueing a futex_q with a pi_state, @@ -2318,7 +2318,7 @@ retry: } /* - * PI futexes can not be requeued and must remove themself from the + * PI futexes can not be requeued and must remove themselves from the * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. */ static void unqueue_me_pi(struct futex_q *q) @@ -2903,7 +2903,7 @@ no_block: */ res = fixup_owner(uaddr, &q, !ret); /* - * If fixup_owner() returned an error, proprogate that. If it acquired + * If fixup_owner() returned an error, propagate that. If it acquired * the lock, clear our -ETIMEDOUT or -EINTR. */ if (res) @@ -3280,7 +3280,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ res = fixup_owner(uaddr2, &q, !ret); /* - * If fixup_owner() returned an error, proprogate that. If it + * If fixup_owner() returned an error, propagate that. If it * acquired the lock, clear -ETIMEDOUT or -EINTR. */ if (res) @@ -3678,7 +3678,7 @@ void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of - * exec() there is no way to prevent futher damage as the PID stays + * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. -- cgit v1.2.3 From ca0760e7d79e2bb9c342e6b3f925b1ef01c6303e Mon Sep 17 00:00:00 2001 From: Wei Ming Chen Date: Thu, 6 May 2021 20:30:51 +0800 Subject: Compiler Attributes: Add continue in comment Add "continue;" for switch/case block according to Doc[1] [1] https://www.kernel.org/doc/html/latest/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Signed-off-by: Wei Ming Chen Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index c043b8d2b17b..183ddd5fd072 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -199,6 +199,7 @@ * must end with any of these keywords: * break; * fallthrough; + * continue; * goto