From f9c82a4ea89c384d49ce03768ba88d049ed3f1f0 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:08 +0200 Subject: Increase size of ucounts to atomic_long_t RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK use unsigned long to store their counters. In preparation for moving rlimits to be based on ucounts, we need to increase the size of the variable to long. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/257aa5fb1a7d81cf0f4c34f39ada2320c4284771.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index f6c5f784be5a..c242c10906c5 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -88,7 +88,7 @@ struct user_namespace { struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; - int ucount_max[UCOUNT_COUNTS]; + long ucount_max[UCOUNT_COUNTS]; } __randomize_layout; struct ucounts { @@ -96,7 +96,7 @@ struct ucounts { struct user_namespace *ns; kuid_t uid; int count; - atomic_t ucount[UCOUNT_COUNTS]; + atomic_long_t ucount[UCOUNT_COUNTS]; }; extern struct user_namespace init_user_ns; -- cgit v1.2.3 From 905ae01c4ae2ae3df05bb141801b1db4b7d83c61 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:09 +0200 Subject: Add a reference to ucounts for each cred For RLIMIT_NPROC and some other rlimits the user_struct that holds the global limit is kept alive for the lifetime of a process by keeping it in struct cred. Adding a pointer to ucounts in struct cred will allow us to track RLIMIT_NPROC not only per user in the system, but per user in each user_namespace. Updating ucounts may require a memory allocation, which may fail. We therefore cannot change cred.ucounts in commit_creds(), because that function cannot fail and always returns 0. For this reason, we modify cred.ucounts before calling commit_creds(). Changelog v6: * Fix null-ptr-deref in is_ucounts_overlimit() detected by trinity. This error was caused by the fact that cred_alloc_blank() left the ucounts pointer empty. Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/b37aaef28d8b9b0d757e07ba6dd27281bbe39259.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- fs/exec.c | 4 ++++ include/linux/cred.h | 2 ++ include/linux/user_namespace.h | 4 ++++ kernel/cred.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 ++++++ kernel/sys.c | 12 ++++++++++++ kernel/ucount.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/user_namespace.c | 3 +++ 8 files changed, 108 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 18594f11c31f..d7c4187ca023 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1360,6 +1360,10 @@ int begin_new_exec(struct linux_binprm * bprm) WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); + retval = set_cred_ucounts(bprm->cred); + if (retval < 0) + goto out_unlock; + /* * install the new credentials for this executable */ diff --git a/include/linux/cred.h b/include/linux/cred.h index 4c6350503697..66436e655032 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -144,6 +144,7 @@ struct cred { #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. 
*/ + struct ucounts *ucounts; struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { @@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); +extern int set_cred_ucounts(struct cred *); /* * check for validity of credentials diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index c242c10906c5..7919b80d57ed 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -100,11 +100,15 @@ struct ucounts { }; extern struct user_namespace init_user_ns; +extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); +struct ucounts *get_ucounts(struct ucounts *ucounts); +void put_ucounts(struct ucounts *ucounts); #ifdef CONFIG_USER_NS diff --git a/kernel/cred.c b/kernel/cred.c index 421b1149c651..58a8a9e24347 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -60,6 +60,7 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, + .ucounts = &init_ucounts, }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu) if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); + if (cred->ucounts) + put_ucounts(cred->ucounts); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } @@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; #endif + new->ucounts = get_ucounts(&init_ucounts); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -284,6 +288,11 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; + + new->ucounts = get_ucounts(new->ucounts); + if (!new->ucounts) + goto error; + validate_creds(new); return new; @@ -363,6 +372,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ret = create_user_ns(new); if (ret < 0) goto error_put; + if (set_cred_ucounts(new) < 0) + goto error_put; } #ifdef CONFIG_KEYS @@ -653,6 +664,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b) } EXPORT_SYMBOL(cred_fscmp); +int set_cred_ucounts(struct cred *new) +{ + struct task_struct *task = current; + const struct cred *old = task->real_cred; + struct ucounts *old_ucounts = new->ucounts; + + if (new->user == old->user && new->user_ns == old->user_ns) + return 0; + + /* + * This optimization is needed because alloc_ucounts() uses locks + * for table lookups. 
+ */ + if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + return 0; + + if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid))) + return -EAGAIN; + + if (old_ucounts) + put_ucounts(old_ucounts); + + return 0; +} + /* * initialise the credentials stuff */ @@ -719,6 +755,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; + new->ucounts = get_ucounts(new->ucounts); + if (!new->ucounts) + goto error; + put_cred(old); validate_creds(new); return new; diff --git a/kernel/fork.c b/kernel/fork.c index 426cd0c51f9e..321a5e31d817 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2995,6 +2995,12 @@ int ksys_unshare(unsigned long unshare_flags) if (err) goto bad_unshare_cleanup_cred; + if (new_cred) { + err = set_cred_ucounts(new_cred); + if (err) + goto bad_unshare_cleanup_cred; + } + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* diff --git a/kernel/sys.c b/kernel/sys.c index 2e2e3f378d97..cabfc5b86175 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -552,6 +552,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -610,6 +614,10 @@ long __sys_setuid(uid_t uid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -685,6 +693,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: diff --git a/kernel/ucount.c b/kernel/ucount.c index 04c561751af1..50cc1dfb7d28 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -8,6 +8,12 @@ #include #include +struct ucounts init_ucounts = { + .ns = &init_user_ns, + .uid = GLOBAL_ROOT_UID, + .count = 1, +}; + #define UCOUNTS_HASHTABLE_BITS 10 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static DEFINE_SPINLOCK(ucounts_lock); @@ -125,7 +131,15 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc return NULL; } -static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +static void hlist_add_ucounts(struct ucounts *ucounts) +{ + struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); + hlist_add_head(&ucounts->node, hashent); + spin_unlock_irq(&ucounts_lock); +} + +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; @@ -160,7 +174,26 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) return ucounts; } -static void put_ucounts(struct ucounts *ucounts) +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + unsigned long flags; + + if (!ucounts) + return NULL; + + spin_lock_irqsave(&ucounts_lock, flags); + if (ucounts->count == INT_MAX) { + WARN_ONCE(1, "ucounts: counter has reached its maximum value"); + ucounts = NULL; + } else { + ucounts->count += 1; + } + spin_unlock_irqrestore(&ucounts_lock, flags); + + return ucounts; +} + +void put_ucounts(struct ucounts *ucounts) { unsigned long flags; @@ -194,7 +227,7 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; - ucounts = get_ucounts(ns, uid); + ucounts = 
alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { long max; tns = iter->ns; @@ -237,6 +270,7 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif + hlist_add_ucounts(&init_ucounts); return 0; } subsys_initcall(user_namespace_sysctl_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9a4b980d695b..f1b7b4b8ffa2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1340,6 +1340,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns) put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); + if (set_cred_ucounts(cred) < 0) + return -EINVAL; + return 0; } -- cgit v1.2.3 From b6c336528926ef73b0f70260f2636de2c3b94c14 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:10 +0200 Subject: Use atomic_t for ucounts reference counting The current implementation of the ucounts reference counter requires the use of spin_lock. We're going to use get_ucounts() in more performance-critical areas, such as the handling of RLIMIT_SIGPENDING. Now we only need to take the spin_lock when we want to change the hashtable. v10: * Always try to put ucounts in case we cannot increase ucounts->count. This covers the case where all consumers return their ucounts at once. v9: * Use a negative value to check that ucounts->count is close to overflow. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/94d1dbecab060a6b116b0a2d1accd8ca1bbb4f5f.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 4 ++-- kernel/ucount.c | 53 +++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 7919b80d57ed..80b5bf12feae 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -95,7 +95,7 @@ struct ucounts { struct hlist_node node; struct user_namespace *ns; kuid_t uid; - int count; + atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; }; @@ -107,7 +107,7 @@ void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); -struct ucounts *get_ucounts(struct ucounts *ucounts); +struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); #ifdef CONFIG_USER_NS diff --git a/kernel/ucount.c b/kernel/ucount.c index 50cc1dfb7d28..365865f368ec 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,7 +11,7 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = 1, + .count = ATOMIC_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 @@ -139,6 +139,15 @@ static void hlist_add_ucounts(struct ucounts *ucounts) spin_unlock_irq(&ucounts_lock); } +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + if (ucounts && atomic_add_negative(1, &ucounts->count)) { + put_ucounts(ucounts); + ucounts = NULL; + } + return ucounts; +} + struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); @@ -155,7 +164,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) new->ns = ns; new->uid = uid; - new->count = 0; + atomic_set(&new->count, 1); 
spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -163,33 +172,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) kfree(new); } else { hlist_add_head(&new->node, hashent); - ucounts = new; + spin_unlock_irq(&ucounts_lock); + return new; } } - if (ucounts->count == INT_MAX) - ucounts = NULL; - else - ucounts->count += 1; spin_unlock_irq(&ucounts_lock); - return ucounts; -} - -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - unsigned long flags; - - if (!ucounts) - return NULL; - - spin_lock_irqsave(&ucounts_lock, flags); - if (ucounts->count == INT_MAX) { - WARN_ONCE(1, "ucounts: counter has reached its maximum value"); - ucounts = NULL; - } else { - ucounts->count += 1; - } - spin_unlock_irqrestore(&ucounts_lock, flags); - + ucounts = get_ucounts(ucounts); return ucounts; } @@ -197,15 +185,12 @@ void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - spin_lock_irqsave(&ucounts_lock, flags); - ucounts->count -= 1; - if (!ucounts->count) + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); hlist_del_init(&ucounts->node); - else - ucounts = NULL; - spin_unlock_irqrestore(&ucounts_lock, flags); - - kfree(ucounts); + spin_unlock_irqrestore(&ucounts_lock, flags); + kfree(ucounts); + } } static inline bool atomic_long_inc_below(atomic_long_t *v, int u) -- cgit v1.2.3 From 21d1c5e386bc751f1953b371d72cd5b7d9c9e270 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:11 +0200 Subject: Reimplement RLIMIT_NPROC on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. To illustrate the impact of rlimits, let's say there is a program that does not fork. Some service-A wants to run this program as user X in multiple containers. Since the program never forks, the service wants to set RLIMIT_NPROC=1. service-A \- program (uid=1000, container1, rlimit_nproc=1) \- program (uid=1000, container2, rlimit_nproc=1) Service-A sets RLIMIT_NPROC=1 and runs the program in container1. When service-A then tries to run the program with RLIMIT_NPROC=1 in container2, it fails, since user X already has one running process. We cannot use the existing inc_ucount / dec_ucount because they do not allow us to exceed the maximum for the counter. Some rlimits may be exceeded by root or by a user with the appropriate capability. Changelog v11: * Change inc_rlimit_ucounts(), which now returns the top value of ucounts. * Drop inc_rlimit_ucounts_and_test() because the return code of inc_rlimit_ucounts() can be checked. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org Signed-off-by: Eric W. 
Biederman --- fs/exec.c | 2 +- include/linux/cred.h | 2 ++ include/linux/sched/user.h | 1 - include/linux/user_namespace.h | 12 ++++++++++++ kernel/cred.c | 10 +++++----- kernel/exit.c | 2 +- kernel/fork.c | 9 +++++---- kernel/sys.c | 2 +- kernel/ucount.c | 44 ++++++++++++++++++++++++++++++++++++++++++ kernel/user.c | 1 - kernel/user_namespace.c | 3 ++- 11 files changed, 73 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index d7c4187ca023..f2bcdbeb3afb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1878,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename, * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && - atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { + is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } diff --git a/include/linux/cred.h b/include/linux/cred.h index 66436e655032..5ca1e8a1d035 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -372,6 +372,7 @@ static inline void put_cred(const struct cred *_cred) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) +#define task_ucounts(task) (task_cred_xxx((task), ucounts)) #define current_cred_xxx(xxx) \ ({ \ @@ -388,6 +389,7 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) +#define current_ucounts() (current_cred_xxx(ucounts)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index a8ec3b6093fc..d33d867ad6c1 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -12,7 +12,6 @@ */ struct user_struct { refcount_t __count; /* reference count */ - atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? 
*/ #ifdef CONFIG_FANOTIFY atomic_t fanotify_listeners; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 80b5bf12feae..4a97acc35990 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -50,9 +50,12 @@ enum ucount_type { UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif + UCOUNT_RLIMIT_NPROC, UCOUNT_COUNTS, }; +#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC + struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; @@ -110,6 +113,15 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); +static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type) +{ + return atomic_long_read(&ucounts->ucount[type]); +} + +long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); + #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) diff --git a/kernel/cred.c b/kernel/cred.c index 58a8a9e24347..dcfa30b337c5 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -360,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) kdebug("share_creds(%p{%d,%d})", p->cred, atomic_read(&p->cred->usage), read_cred_subscribers(p->cred)); - atomic_inc(&p->cred->user->processes); + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } @@ -395,8 +395,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) } #endif - atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(new, 2); validate_creds(new); return 0; @@ -496,12 +496,12 @@ int commit_creds(struct cred *new) * in set_user(). */ alter_cred_subscribers(new, 2); - if (new->user != old->user) - atomic_inc(&new->user->processes); + if (new->user != old->user || new->user_ns != old->user_ns) + inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) - atomic_dec(&old->user->processes); + dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(old, -2); /* send notifications */ diff --git a/kernel/exit.c b/kernel/exit.c index 04029e35e69a..61c0fe902b50 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -188,7 +188,7 @@ repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. 
But shut RCU-lockdep up */ rcu_read_lock(); - atomic_dec(&__task_cred(p)->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); cgroup_release(p); diff --git a/kernel/fork.c b/kernel/fork.c index 321a5e31d817..ed7dfb07178d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,9 +819,11 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < UCOUNT_COUNTS; i++) + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; + init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); + #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); @@ -1978,8 +1980,7 @@ static __latent_entropy struct task_struct *copy_process( DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; - if (atomic_read(&p->real_cred->user->processes) >= - task_rlimit(p, RLIMIT_NPROC)) { + if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; @@ -2382,7 +2383,7 @@ bad_fork_cleanup_threadgroup_lock: #endif delayacct_tsk_free(p); bad_fork_cleanup_count: - atomic_dec(&p->cred->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_creds(p); bad_fork_free: p->state = TASK_DEAD; diff --git a/kernel/sys.c b/kernel/sys.c index cabfc5b86175..00266a65a000 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -473,7 +473,7 @@ static int set_user(struct cred *new) * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ - if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && + if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new_user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else diff --git a/kernel/ucount.c b/kernel/ucount.c index 365865f368ec..6caa56f7dec8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { } }; #endif /* CONFIG_SYSCTL */ @@ -240,6 +241,48 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } +long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long ret = 0; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long max = READ_ONCE(iter->ns->ucount_max[type]); + long new = atomic_long_add_return(v, &iter->ucount[type]); + if (new < 0 || new > max) + ret = LONG_MAX; + else if (iter == ucounts) + ret = new; + } + return ret; +} + +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long new; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long dec = atomic_long_add_return(-v, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + if (iter == ucounts) + new = dec; + } + return (new == 0); +} + +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) +{ + struct ucounts *iter; + if (get_ucounts_value(ucounts, type) > max) + return true; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + max = READ_ONCE(iter->ns->ucount_max[type]); + if (get_ucounts_value(iter, type) > max) + return true; + } + return false; +} + static __init int 
user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL @@ -256,6 +299,7 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif hlist_add_ucounts(&init_ucounts); + inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init); diff --git a/kernel/user.c b/kernel/user.c index a2478cddf536..7f5ff498207a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f1b7b4b8ffa2..e6577c835072 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -119,9 +119,10 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); - for (i = 0; i < UCOUNT_COUNTS; i++) { + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } + ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From 6e52a9f0532f912af37bab4caf18b57d1b9845f4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:12 +0200 Subject: Reimplement RLIMIT_MSGQUEUE on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/2531f42f7884bbfee56a978040b3e0d25cdf6cde.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/sched/user.h | 4 ---- include/linux/user_namespace.h | 1 + ipc/mqueue.c | 40 +++++++++++++++++++++------------------- kernel/fork.c | 1 + kernel/ucount.c | 1 + kernel/user_namespace.c | 1 + 6 files changed, 25 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index d33d867ad6c1..8a34446681aa 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -18,10 +18,6 @@ struct user_struct { #endif #ifdef CONFIG_EPOLL atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ -#endif -#ifdef CONFIG_POSIX_MQUEUE - /* protected by mq_lock */ - unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ #endif unsigned long locked_shm; /* How many pages of mlocked shm ? 
*/ unsigned long unix_inflight; /* How many files in flight in unix sockets */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4a97acc35990..5eeb86b00e68 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -51,6 +51,7 @@ enum ucount_type { UCOUNT_INOTIFY_WATCHES, #endif UCOUNT_RLIMIT_NPROC, + UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_COUNTS, }; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 8031464ed4ae..461fcf8c873d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -144,7 +144,7 @@ struct mqueue_inode_info { struct pid *notify_owner; u32 notify_self_exec_id; struct user_namespace *notify_user_ns; - struct user_struct *user; /* user who created, for accounting */ + struct ucounts *ucounts; /* user who created, for accounting */ struct sock *notify_sock; struct sk_buff *notify_cookie; @@ -292,7 +292,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, struct ipc_namespace *ipc_ns, umode_t mode, struct mq_attr *attr) { - struct user_struct *u = current_user(); struct inode *inode; int ret = -ENOMEM; @@ -321,7 +320,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, info->notify_owner = NULL; info->notify_user_ns = NULL; info->qsize = 0; - info->user = NULL; /* set when all is ok */ + info->ucounts = NULL; /* set when all is ok */ info->msg_tree = RB_ROOT; info->msg_tree_rightmost = NULL; info->node_cache = NULL; @@ -371,19 +370,23 @@ static struct inode *mqueue_get_inode(struct super_block *sb, if (mq_bytes + mq_treesize < mq_bytes) goto out_inode; mq_bytes += mq_treesize; - spin_lock(&mq_lock); - if (u->mq_bytes + mq_bytes < u->mq_bytes || - u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) { + info->ucounts = get_ucounts(current_ucounts()); + if (info->ucounts) { + long msgqueue; + + spin_lock(&mq_lock); + msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) { + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + spin_unlock(&mq_lock); + put_ucounts(info->ucounts); + info->ucounts = NULL; + /* mqueue_evict_inode() releases info->messages */ + ret = -EMFILE; + goto out_inode; + } spin_unlock(&mq_lock); - /* mqueue_evict_inode() releases info->messages */ - ret = -EMFILE; - goto out_inode; } - u->mq_bytes += mq_bytes; - spin_unlock(&mq_lock); - - /* all is ok */ - info->user = get_uid(u); } else if (S_ISDIR(mode)) { inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ @@ -497,7 +500,6 @@ static void mqueue_free_inode(struct inode *inode) static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; - struct user_struct *user; struct ipc_namespace *ipc_ns; struct msg_msg *msg, *nmsg; LIST_HEAD(tmp_msg); @@ -520,8 +522,7 @@ static void mqueue_evict_inode(struct inode *inode) free_msg(msg); } - user = info->user; - if (user) { + if (info->ucounts) { unsigned long mq_bytes, mq_treesize; /* Total amount of bytes accounted for the mqueue */ @@ -533,7 +534,7 @@ static void mqueue_evict_inode(struct inode *inode) info->attr.mq_msgsize); spin_lock(&mq_lock); - user->mq_bytes -= mq_bytes; + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); /* * get_ns_from_inode() ensures that the * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns @@ -543,7 +544,8 @@ static void mqueue_evict_inode(struct inode *inode) if (ipc_ns) ipc_ns->mq_queues_count--; spin_unlock(&mq_lock); - free_uid(user); + put_ucounts(info->ucounts); + info->ucounts = NULL; } if (ipc_ns) 
put_ipc_ns(ipc_ns); diff --git a/kernel/fork.c b/kernel/fork.c index ed7dfb07178d..a9c5097dfc86 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -823,6 +823,7 @@ void __init fork_init(void) init_user_ns.ucount_max[i] = max_threads/2; init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); + init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/ucount.c b/kernel/ucount.c index 6caa56f7dec8..6e6f936a5963 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { }, { } }; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e6577c835072..7eccc4f84549 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -123,6 +123,7 @@ int create_user_ns(struct cred *new) ns->ucount_max[i] = INT_MAX; } ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); + ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From d64696905554e919321e31afc210606653b8f6a4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:13 +0200 Subject: Reimplement RLIMIT_SIGPENDING on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Changelog v11: * Revert most of changes to fix performance issues. v10: * Fix memory leak on get_ucounts failure. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/df9d7764dddd50f28616b7840de74ec0f81711a8.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- fs/proc/array.c | 2 +- include/linux/sched/user.h | 1 - include/linux/signal_types.h | 4 +++- include/linux/user_namespace.h | 1 + kernel/fork.c | 1 + kernel/signal.c | 25 +++++++++++++------------ kernel/ucount.c | 1 + kernel/user.c | 1 - kernel/user_namespace.c | 1 + 9 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index bb87e4d89cd8..74b0ea4b7e38 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ - qsize = atomic_read(&__task_cred(p)->user->sigpending); + qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 8a34446681aa..8ba9cec4fb99 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -12,7 +12,6 @@ */ struct user_struct { refcount_t __count; /* reference count */ - atomic_t sigpending; /* How many pending signals does this user have? 
*/ #ifdef CONFIG_FANOTIFY atomic_t fanotify_listeners; #endif diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index 68e06c75c5b2..34cb28b8f16c 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -13,6 +13,8 @@ typedef struct kernel_siginfo { __SIGINFO; } kernel_siginfo_t; +struct ucounts; + /* * Real Time signals may be queued. */ @@ -21,7 +23,7 @@ struct sigqueue { struct list_head list; int flags; kernel_siginfo_t info; - struct user_struct *user; + struct ucounts *ucounts; }; /* flags values. */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 5eeb86b00e68..58f417986472 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -52,6 +52,7 @@ enum ucount_type { #endif UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, + UCOUNT_RLIMIT_SIGPENDING, UCOUNT_COUNTS, }; diff --git a/kernel/fork.c b/kernel/fork.c index a9c5097dfc86..03119926b27d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -824,6 +824,7 @@ void __init fork_init(void) init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); + init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/signal.c b/kernel/signal.c index f2718350bf4b..9a6dab712123 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,8 +413,8 @@ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; - struct user_struct *user; - int sigpending; + struct ucounts *ucounts = NULL; + long sigpending; /* * Protect access to @t credentials. This can go away when all @@ -425,27 +425,26 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi * changes from/to zero. 
*/ rcu_read_lock(); - user = __task_cred(t)->user; - sigpending = atomic_inc_return(&user->sigpending); + ucounts = task_ucounts(t); + sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); if (sigpending == 1) - get_uid(user); + ucounts = get_ucounts(ucounts); rcu_read_unlock(); - if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { + if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - if (atomic_dec_and_test(&user->sigpending)) - free_uid(user); + if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) + put_ucounts(ucounts); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = user; + q->ucounts = ucounts; } - return q; } @@ -453,8 +452,10 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - if (atomic_dec_and_test(&q->user->sigpending)) - free_uid(q->user); + if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) { + put_ucounts(q->ucounts); + q->ucounts = NULL; + } kmem_cache_free(sigqueue_cachep, q); } diff --git a/kernel/ucount.c b/kernel/ucount.c index 6e6f936a5963..8ce62da6a62c 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { }, { }, { } diff --git a/kernel/user.c b/kernel/user.c index 7f5ff498207a..6737327f83be 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 7eccc4f84549..822eacee4588 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -124,6 +124,7 @@ int create_user_ns(struct cred *new) } ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); + ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From d7c9e99aee48e6bc0b427f3e3c658a6aba15001e Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:14 +0200 Subject: Reimplement RLIMIT_MEMLOCK on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Changelog v11: * Fix issue found by lkp robot. v8: * Fix issues found by lkp-tests project. v7: * Keep only ucounts for RLIMIT_MEMLOCK checks instead of struct cred. v6: * Fix bug in hugetlb_file_setup() detected by trinity. Reported-by: kernel test robot Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/970d50c70c71bfd4496e0e8d2a0a32feebebb350.1619094428.git.legion@kernel.org Signed-off-by: Eric W. 
Biederman --- fs/hugetlbfs/inode.c | 16 ++++++++-------- include/linux/hugetlb.h | 4 ++-- include/linux/mm.h | 4 ++-- include/linux/sched/user.h | 1 - include/linux/shmem_fs.h | 2 +- include/linux/user_namespace.h | 1 + ipc/shm.c | 26 +++++++++++++------------- kernel/fork.c | 1 + kernel/ucount.c | 1 + kernel/user.c | 1 - kernel/user_namespace.c | 1 + mm/memfd.c | 4 ++-- mm/mlock.c | 22 ++++++++++++++-------- mm/mmap.c | 4 ++-- mm/shmem.c | 10 +++++----- 15 files changed, 53 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 701c82c36138..be519fc9559a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1443,7 +1443,7 @@ static int get_hstate_idx(int page_size_log) * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, struct user_struct **user, + vm_flags_t acctflag, struct ucounts **ucounts, int creat_flags, int page_size_log) { struct inode *inode; @@ -1455,20 +1455,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size, if (hstate_idx < 0) return ERR_PTR(-ENODEV); - *user = NULL; + *ucounts = NULL; mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { - *user = current_user(); - if (user_shm_lock(size, *user)) { + *ucounts = current_ucounts(); + if (user_shm_lock(size, *ucounts)) { task_lock(current); pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", current->comm, current->pid); task_unlock(current); } else { - *user = NULL; + *ucounts = NULL; return ERR_PTR(-EPERM); } } @@ -1495,9 +1495,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, iput(inode); out: - if (*user) { - user_shm_unlock(size, *user); - *user = NULL; + if (*ucounts) { + user_shm_unlock(size, *ucounts); + *ucounts = NULL; } return file; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cccd1aab69dd..96d63dbdec65 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -434,7 +434,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, - struct user_struct **user, int creat_flags, + struct ucounts **ucounts, int creat_flags, int page_size_log); static inline bool is_file_hugepages(struct file *file) @@ -454,7 +454,7 @@ static inline struct hstate *hstate_inode(struct inode *i) #define is_file_hugepages(file) false static inline struct file * hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, - struct user_struct **user, int creat_flags, + struct ucounts **ucounts, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8ba434287387..3b4e24738ce4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1670,8 +1670,8 @@ extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif -extern int user_shm_lock(size_t, struct user_struct *); -extern void user_shm_unlock(size_t, struct user_struct *); +extern int user_shm_lock(size_t, struct ucounts *); +extern void user_shm_unlock(size_t, struct ucounts *); /* * Parameter block passed down to zap_pte_range in exceptional cases. 
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 8ba9cec4fb99..82bd2532da6b 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -18,7 +18,6 @@ struct user_struct { #ifdef CONFIG_EPOLL atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ #endif - unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d82b6f396588..aa77dcd1646f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -65,7 +65,7 @@ extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -extern int shmem_lock(struct file *file, int lock, struct user_struct *user); +extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM extern const struct address_space_operations shmem_aops; static inline bool shmem_mapping(struct address_space *mapping) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 58f417986472..2a3177b9b8bf 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -53,6 +53,7 @@ enum ucount_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, + UCOUNT_RLIMIT_MEMLOCK, UCOUNT_COUNTS, }; diff --git a/ipc/shm.c b/ipc/shm.c index febd88daba8c..003234fbbd17 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -60,7 +60,7 @@ struct shmid_kernel /* private to the kernel */ time64_t shm_ctim; struct pid *shm_cprid; struct pid *shm_lprid; - struct user_struct *mlock_user; + struct ucounts *mlock_ucounts; /* The task created the shm object. NULL if the task is dead. 
*/ struct task_struct *shm_creator; @@ -286,10 +286,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_rmid(ns, shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) - shmem_lock(shm_file, 0, shp->mlock_user); - else if (shp->mlock_user) + shmem_lock(shm_file, 0, shp->mlock_ucounts); + else if (shp->mlock_ucounts) user_shm_unlock(i_size_read(file_inode(shm_file)), - shp->mlock_user); + shp->mlock_ucounts); fput(shm_file); ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); @@ -625,7 +625,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_perm.key = key; shp->shm_perm.mode = (shmflg & S_IRWXUGO); - shp->mlock_user = NULL; + shp->mlock_ucounts = NULL; shp->shm_perm.security = NULL; error = security_shm_alloc(&shp->shm_perm); @@ -650,7 +650,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; file = hugetlb_file_setup(name, hugesize, acctflag, - &shp->mlock_user, HUGETLB_SHMFS_INODE, + &shp->mlock_ucounts, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { /* @@ -698,8 +698,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) no_id: ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); - if (is_file_hugepages(file) && shp->mlock_user) - user_shm_unlock(size, shp->mlock_user); + if (is_file_hugepages(file) && shp->mlock_ucounts) + user_shm_unlock(size, shp->mlock_ucounts); fput(file); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); return error; @@ -1105,12 +1105,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) goto out_unlock0; if (cmd == SHM_LOCK) { - struct user_struct *user = current_user(); + struct ucounts *ucounts = current_ucounts(); - err = shmem_lock(shm_file, 1, user); + err = shmem_lock(shm_file, 1, ucounts); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { shp->shm_perm.mode |= SHM_LOCKED; - shp->mlock_user = user; + shp->mlock_ucounts = ucounts; } goto out_unlock0; } @@ -1118,9 +1118,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) goto out_unlock0; - shmem_lock(shm_file, 0, shp->mlock_user); + shmem_lock(shm_file, 0, shp->mlock_ucounts); shp->shm_perm.mode &= ~SHM_LOCKED; - shp->mlock_user = NULL; + shp->mlock_ucounts = NULL; get_file(shm_file); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); diff --git a/kernel/fork.c b/kernel/fork.c index 03119926b27d..610fd4de60d7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -825,6 +825,7 @@ void __init fork_init(void) init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); + init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/ucount.c b/kernel/ucount.c index 8ce62da6a62c..d316bac3e520 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -83,6 +83,7 @@ static struct ctl_table user_table[] = { { }, { }, { }, + { }, { } }; #endif /* CONFIG_SYSCTL */ diff --git a/kernel/user.c b/kernel/user.c index 6737327f83be..c82399c1618a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); 
/* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .locked_shm = 0, .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 822eacee4588..892da1360862 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -125,6 +125,7 @@ int create_user_ns(struct cred *new) ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); + ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ diff --git a/mm/memfd.c b/mm/memfd.c index 2647c898990c..081dd33e6a61 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create, } if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; + struct ucounts *ucounts = NULL; - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); diff --git a/mm/mlock.c b/mm/mlock.c index f8f8cc32d03d..dd411aabf695 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall) */ static DEFINE_SPINLOCK(shmlock_user_lock); -int user_shm_lock(size_t size, struct user_struct *user) +int user_shm_lock(size_t size, struct ucounts *ucounts) { unsigned long lock_limit, locked; + long memlock; int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user) allowed = 1; lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); - if (!allowed && - locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + + if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + goto out; + } + if (!get_ucounts(ucounts)) { + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; - get_uid(user); - user->locked_shm += locked; + } allowed = 1; out: spin_unlock(&shmlock_user_lock); return allowed; } -void user_shm_unlock(size_t size, struct user_struct *user) +void user_shm_unlock(size_t size, struct ucounts *ucounts) { spin_lock(&shmlock_user_lock); - user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); spin_unlock(&shmlock_user_lock); - free_uid(user); + put_ucounts(ucounts); } diff --git a/mm/mmap.c b/mm/mmap.c index 3f287599a7a3..99f97d200aa4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1605,7 +1605,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, goto out_fput; } } else if (flags & MAP_HUGETLB) { - struct user_struct *user = NULL; + struct ucounts *ucounts = NULL; struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); @@ -1621,7 +1621,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE, + &ucounts, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..7ee6d27222e9 100644 --- a/mm/shmem.c 
+++ b/mm/shmem.c @@ -2227,7 +2227,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, } #endif -int shmem_lock(struct file *file, int lock, struct user_struct *user) +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2239,13 +2239,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) * no serialization needed when called from shm_destroy(). */ if (lock && !(info->flags & VM_LOCKED)) { - if (!user_shm_lock(inode->i_size, user)) + if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; info->flags |= VM_LOCKED; mapping_set_unevictable(file->f_mapping); } - if (!lock && (info->flags & VM_LOCKED) && user) { - user_shm_unlock(inode->i_size, user); + if (!lock && (info->flags & VM_LOCKED) && ucounts) { + user_shm_unlock(inode->i_size, ucounts); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); } @@ -4093,7 +4093,7 @@ int shmem_unuse(unsigned int type, bool frontswap, return 0; } -int shmem_lock(struct file *file, int lock, struct user_struct *user) +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } -- cgit v1.2.3 From c1ada3dc7219b02b3467aa906c2f5f8b098578d1 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:16 +0200 Subject: ucounts: Set ucount_max to the largest positive value the type can hold The ns->ucount_max[] is signed long which is less than the rlimit size. We have to protect ucount_max[] from overflow and only use the largest value that we can hold. On 32bit using "long" instead of "unsigned long" to hold the counts has the downside that RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK are limited to 2GiB instead of 4GiB. I don't think anyone cares but it should be mentioned in case someone does. The RLIMIT_NPROC and RLIMIT_SIGPENDING used atomic_t so their maximum hasn't changed. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/1825a5dfa18bc5a570e79feb05e2bd07fd57e7e3.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 6 ++++++ kernel/fork.c | 8 ++++---- kernel/user_namespace.c | 8 ++++---- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 2a3177b9b8bf..61794ae32fa8 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -125,6 +125,12 @@ long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); +static inline void set_rlimit_ucount_max(struct user_namespace *ns, + enum ucount_type type, unsigned long max) +{ + ns->ucount_max[type] = max <= LONG_MAX ? 
max : LONG_MAX; +} + #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) diff --git a/kernel/fork.c b/kernel/fork.c index 610fd4de60d7..c41820481b2e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -822,10 +822,10 @@ void __init fork_init(void) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; - init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); - init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); - init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); - init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK)); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 892da1360862..d4a545bbab7f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -122,10 +122,10 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); - ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); - ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); - ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From 6be388f4a35d2ce5ef7dbf635a8964a5da7f799f Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sun, 25 Apr 2021 23:03:53 +0530 Subject: HID: usbhid: fix info leak in hid_submit_ctrl In hid_submit_ctrl(), the way of calculating the report length doesn't take into account that report->size can be zero. When running the syzkaller reproducer, a report of size 0 causes hid_submit_ctrl) to calculate transfer_buffer_length as 16384. When this urb is passed to the usb core layer, KMSAN reports an info leak of 16384 bytes. To fix this, first modify hid_report_len() to account for the zero report size case by using DIV_ROUND_UP for the division. Then, call it from hid_submit_ctrl(). 
Reported-by: syzbot+7c2bb71996f95a82524c@syzkaller.appspotmail.com Signed-off-by: Anirudh Rayabharam Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 2 +- include/linux/hid.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 86257ce6d619..4e9077363c96 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -374,7 +374,7 @@ static int hid_submit_ctrl(struct hid_device *hid) raw_report = usbhid->ctrl[usbhid->ctrltail].raw_report; dir = usbhid->ctrl[usbhid->ctrltail].dir; - len = ((report->size - 1) >> 3) + 1 + (report->id > 0); + len = hid_report_len(report); if (dir == USB_DIR_OUT) { usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); usbhid->urbctrl->transfer_buffer_length = len; diff --git a/include/linux/hid.h b/include/linux/hid.h index 271021e20a3f..10e922cee4eb 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1167,8 +1167,7 @@ static inline void hid_hw_wait(struct hid_device *hdev) */ static inline u32 hid_report_len(struct hid_report *report) { - /* equivalent to DIV_ROUND_UP(report->size, 8) + !!(report->id > 0) */ - return ((report->size - 1) >> 3) + 1 + (report->id > 0); + return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, -- cgit v1.2.3 From 0e4768713e71dd224633fd7e00ad358bc48f433a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Apr 2021 21:24:31 +0300 Subject: spi: pxa2xx: Replace header inclusions by forward declarations When a data structure is only referred to by pointer, the compiler may not need to see the contents of the data type. Thus, we may replace header inclusions with the respective forward declarations. Because of the above, add the missing headers as well.
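As a generic illustration of the technique (hypothetical header and type names, not code from the patch), a header that handles its structures only through pointers can forward-declare them and leave the real #include to the .c file:

/* foo_client.h -- hypothetical header: every use below is via pointer,
 * so forward declarations suffice and the heavy headers stay out of
 * the include graph. */
struct spi_device;	/* instead of #include <linux/spi/spi.h> */
struct gpio_desc;	/* instead of #include <linux/gpio/consumer.h> */

struct foo_client {
	struct spi_device *spi;		/* pointer only: size not needed */
	struct gpio_desc *reset_gpio;
};

void foo_client_reset(struct foo_client *fc);

/* foo_client.c includes the real headers before dereferencing the pointers. */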
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210423182441.50272-5-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-dma.c | 4 ++-- drivers/spi/spi-pxa2xx-pci.c | 1 + drivers/spi/spi-pxa2xx.c | 2 ++ drivers/spi/spi-pxa2xx.h | 18 ++++++++++-------- include/linux/spi/pxa2xx_spi.h | 2 ++ 5 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c index 2e4a49567146..e00dbadd39ec 100644 --- a/drivers/spi/spi-pxa2xx-dma.c +++ b/drivers/spi/spi-pxa2xx-dma.c @@ -9,11 +9,11 @@ #include #include #include -#include #include #include -#include + #include +#include #include "spi-pxa2xx.h" diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c index f588fad77fc0..a259be12d326 100644 --- a/drivers/spi/spi-pxa2xx-pci.c +++ b/drivers/spi/spi-pxa2xx-pci.c @@ -8,6 +8,7 @@ #include #include #include + #include #include diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 0f3f7d725937..1d4c7f4217ed 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include + #include #include diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h index 6724d7e056ce..739e264feaa6 100644 --- a/drivers/spi/spi-pxa2xx.h +++ b/drivers/spi/spi-pxa2xx.h @@ -7,16 +7,18 @@ #ifndef SPI_PXA2XX_H #define SPI_PXA2XX_H -#include -#include -#include -#include #include -#include -#include +#include +#include #include -#include -#include + +#include + +struct gpio_desc; +struct pxa2xx_spi_controller; +struct spi_controller; +struct spi_device; +struct spi_transfer; struct driver_data { /* SSP Info */ diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index 31f00c7f4f59..1e0e2f136319 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -5,6 +5,8 @@ #ifndef __linux_pxa2xx_spi_h #define __linux_pxa2xx_spi_h +#include + #include #define PXA2XX_CS_ASSERT (0x01) -- cgit v1.2.3 From 5edc24901f4d469f8fc943004f73655933e89dbf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Apr 2021 21:24:32 +0300 Subject: spi: pxa2xx: Unify ifdeffery used in the headers The two headers have quite different ifdeffery to prevent multiple inclusion. Unify them with the pattern that in particular reflects their location. 
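Schematically (a hypothetical header, shown only to spell out the pattern the renames below follow), the guard macro encodes the header's location in the tree:

/* include/linux/spi/foo.h -- hypothetical file: the guard name is
 * derived from the path, so guards stay unique and self-describing. */
#ifndef __LINUX_SPI_FOO_H
#define __LINUX_SPI_FOO_H

/* declarations ... */

#endif /* __LINUX_SPI_FOO_H */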
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210423182441.50272-6-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 6 +++--- include/linux/spi/pxa2xx_spi.h | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 7f73b26ed22e..14b049840faf 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -11,8 +11,8 @@ * PXA3xx SSP1, SSP2, SSP3, SSP4 */ -#ifndef __LINUX_SSP_H -#define __LINUX_SSP_H +#ifndef __LINUX_PXA2XX_SSP_H +#define __LINUX_PXA2XX_SSP_H #include #include @@ -270,4 +270,4 @@ static inline struct ssp_device *pxa_ssp_request_of(const struct device_node *n, static inline void pxa_ssp_free(struct ssp_device *ssp) {} #endif -#endif +#endif /* __LINUX_PXA2XX_SSP_H */ diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index 1e0e2f136319..12ef04d0896d 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -2,8 +2,8 @@ /* * Copyright (C) 2005 Stephen Street / StreetFire Sound Labs */ -#ifndef __linux_pxa2xx_spi_h -#define __linux_pxa2xx_spi_h +#ifndef __LINUX_SPI_PXA2XX_SPI_H +#define __LINUX_SPI_PXA2XX_SPI_H #include @@ -51,4 +51,5 @@ struct pxa2xx_spi_chip { extern void pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_controller *info); #endif -#endif + +#endif /* __LINUX_SPI_PXA2XX_SPI_H */ -- cgit v1.2.3 From 1beb37b0e3f98708bfb37778049764b4500756da Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Apr 2021 21:24:33 +0300 Subject: spi: pxa2xx: Group Intel Quark specific definitions DDS_RATE is an Intel Quark specific definition. Move it next to the rest of the Intel Quark related definitions. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210423182441.50272-7-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 14b049840faf..1b6c1a0922bd 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -38,7 +38,6 @@ struct device_node; #define SSDR (0x10) /* SSP Data Write/Data Read Register */ #define SSTO (0x28) /* SSP Time Out Register */ -#define DDS_RATE (0x28) /* SSP DDS Clock Rate Register (Intel Quark) */ #define SSPSP (0x2C) /* SSP Programmable Serial Protocol */ #define SSTSA (0x30) /* SSP Tx Timeslot Active */ #define SSRSA (0x34) /* SSP Rx Timeslot Active */ @@ -105,6 +104,9 @@ struct device_node; #define CE4100_SSCR1_RFT GENMASK(11, 10) /* Receive FIFO Threshold (mask) */ #define CE4100_SSCR1_RxTresh(x) (((x) - 1) << 10) /* level [1..4] */ +/* Intel Quark X1000 */ +#define DDS_RATE 0x28 /* SSP DDS Clock Rate Register */ + /* QUARK_X1000 SSCR0 bit definition */ #define QUARK_X1000_SSCR0_DSS GENMASK(4, 0) /* Data Size Select (mask) */ #define QUARK_X1000_SSCR0_DataSize(x) ((x) - 1) /* Data Size Select [4..32] */ -- cgit v1.2.3 From 28ec344bb8911bb0d4910456b22ba0dd4f662521 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Wed, 5 May 2021 17:44:22 -0700 Subject: usb: typec: tcpm: Don't block probing of consumers of "connector" nodes fw_devlink expects DT device nodes with a "compatible" property to have struct devices created for them. Since the connector node might not be populated as a device, mark it as such so that fw_devlink knows not to wait on this fwnode being populated as a struct device.
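A minimal sketch of the resulting driver-side pattern (hypothetical driver; the only interface taken from this patch is fw_devlink_purge_absent_suppliers()):

#include <linux/device.h>
#include <linux/fwnode.h>
#include <linux/property.h>

/* Hypothetical probe helper: the "connector" child has a "compatible"
 * string but is parsed in place and never becomes a struct device, so
 * its fwnode links must be purged or fw_devlink=on stalls consumers. */
static int foo_parse_connector(struct device *dev)
{
	struct fwnode_handle *fwnode;

	fwnode = device_get_named_child_node(dev, "connector");
	if (!fwnode)
		return -EINVAL;

	fw_devlink_purge_absent_suppliers(fwnode);

	/* ... fwnode_property_read_*() calls as usual ... */

	fwnode_handle_put(fwnode);
	return 0;
}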
Without this patch, USB functionality can be broken on some boards. Fixes: f7514a663016 ("of: property: fw_devlink: Add support for remote-endpoint") Reported-by: John Stultz Tested-by: John Stultz Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20210506004423.345199-1-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 3 ++- drivers/usb/typec/tcpm/tcpm.c | 9 +++++++++ include/linux/fwnode.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 4a8bf8cda52b..628e33939aca 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -150,7 +150,7 @@ void fwnode_links_purge(struct fwnode_handle *fwnode) fwnode_links_purge_consumers(fwnode); } -static void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode) +void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode) { struct fwnode_handle *child; @@ -164,6 +164,7 @@ static void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode) fwnode_for_each_available_child_node(fwnode, child) fw_devlink_purge_absent_suppliers(child); } +EXPORT_SYMBOL_GPL(fw_devlink_purge_absent_suppliers); #ifdef CONFIG_SRCU static DEFINE_MUTEX(device_links_lock); diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index c4fdc00a3bc8..bffa342d4e38 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5754,6 +5754,15 @@ static int tcpm_fw_get_caps(struct tcpm_port *port, if (!fwnode) return -EINVAL; + /* + * This fwnode has a "compatible" property, but is never populated as a + * struct device. Instead we simply parse it to read the properties. + * This breaks fw_devlink=on. To maintain backward compatibility + * with existing DT files, we work around this by deleting any + * fwnode_links to/from this fwnode. + */ + fw_devlink_purge_absent_suppliers(fwnode); + /* USB data support is optional */ ret = fwnode_property_read_string(fwnode, "data-role", &cap_str); if (ret == 0) { diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index ed4e67a7ff1c..59828516ebaf 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -187,5 +187,6 @@ extern u32 fw_devlink_get_flags(void); extern bool fw_devlink_is_strict(void); int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); void fwnode_links_purge(struct fwnode_handle *fwnode); +void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); #endif -- cgit v1.2.3 From c745253e2a691a40c66790defe85c104a887e14a Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 5 May 2021 14:09:15 +0300 Subject: PM: runtime: Fix unpaired parent child_count for force_resume As pm_runtime_need_not_resume() also relies on usage_count, it can return a different value in pm_runtime_force_suspend() compared to when called in pm_runtime_force_resume(). Different return values can happen if anything calls PM runtime functions in between, and causes the parent child_count to increase on every resume.
So far I've seen the issue only for omapdrm that does complicated things with PM runtime calls during system suspend for legacy reasons: omap_atomic_commit_tail() for omapdrm.0 dispc_runtime_get() wakes up 58000000.dss as it's the dispc parent dispc_runtime_resume() rpm_resume() increases parent child_count dispc_runtime_put() won't idle, PM runtime suspend blocked pm_runtime_force_suspend() for 58000000.dss, !pm_runtime_need_not_resume() __update_runtime_status() system suspended pm_runtime_force_resume() for 58000000.dss, pm_runtime_need_not_resume() pm_runtime_enable() only called because of pm_runtime_need_not_resume() omap_atomic_commit_tail() for omapdrm.0 dispc_runtime_get() wakes up 58000000.dss as it's the dispc parent dispc_runtime_resume() rpm_resume() increases parent child_count dispc_runtime_put() won't idle, PM runtime suspend blocked ... rpm_suspend for 58000000.dss but parent child_count is now unbalanced Let's fix the issue by adding a flag for needs_force_resume and use it in pm_runtime_force_resume() instead of pm_runtime_need_not_resume(). Additionally omapdrm system suspend could be simplified later on to avoid lots of unnecessary PM runtime calls and the complexity it adds. The driver can just use internal functions that are shared between the PM runtime and system suspend related functions. Fixes: 4918e1f87c5f ("PM / runtime: Rework pm_runtime_force_suspend/resume()") Signed-off-by: Tony Lindgren Reviewed-by: Ulf Hansson Tested-by: Tomi Valkeinen Cc: 4.16+ # 4.16+ Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime.c | 10 +++++++--- include/linux/pm.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 1fc1a992f90c..b570848d23e0 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1637,6 +1637,7 @@ void pm_runtime_init(struct device *dev) dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; + dev->power.needs_force_resume = 0; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; @@ -1804,10 +1805,12 @@ int pm_runtime_force_suspend(struct device *dev) * its parent, but set its status to RPM_SUSPENDED anyway in case this * function will be called again for it in the meantime. 
*/ - if (pm_runtime_need_not_resume(dev)) + if (pm_runtime_need_not_resume(dev)) { pm_runtime_set_suspended(dev); - else + } else { __update_runtime_status(dev, RPM_SUSPENDED); + dev->power.needs_force_resume = 1; + } return 0; @@ -1834,7 +1837,7 @@ int pm_runtime_force_resume(struct device *dev) int (*callback)(struct device *); int ret = 0; - if (!pm_runtime_status_suspended(dev) || pm_runtime_need_not_resume(dev)) + if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume) goto out; /* @@ -1853,6 +1856,7 @@ int pm_runtime_force_resume(struct device *dev) pm_runtime_mark_last_busy(dev); out: + dev->power.needs_force_resume = 0; pm_runtime_enable(dev); return ret; } diff --git a/include/linux/pm.h b/include/linux/pm.h index c9657408fee1..1d8209c09686 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -601,6 +601,7 @@ struct dev_pm_info { unsigned int idle_notification:1; unsigned int request_pending:1; unsigned int deferred_resume:1; + unsigned int needs_force_resume:1; unsigned int runtime_auto:1; bool ignore_children:1; unsigned int no_callbacks:1; -- cgit v1.2.3 From 2515dd6ce8e545b0b2eece84920048ef9ed846c4 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 19 Apr 2021 16:17:41 -0700 Subject: stack: Replace "o" output with "r" input constraint "o" isn't a common asm() constraint to use; it triggers an assertion in assert-enabled builds of LLVM that it's not recognized when targeting aarch64 (though it appears to fall back to "m"). It's fixed in LLVM 13 now, but there isn't really a good reason to use "o" in particular here. To avoid causing build issues for those using assert-enabled builds of earlier LLVM versions, the constraint needs changing. Instead, if the point is to retain the __builtin_alloca(), make ptr appear to "escape" via being an input to an empty inline asm block. This is preferable anyways, since otherwise this looks like a dead store. While the use of "r" was considered in https://lore.kernel.org/lkml/202104011447.2E7F543@keescook/ it was only tested as an output (which looks like a dead store, and wasn't sufficient). Use "r" as an input constraint instead, which behaves correctly across compilers and architectures. Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Signed-off-by: Nick Desaulniers Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Tested-by: Kees Cook Tested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Link: https://reviews.llvm.org/D100412 Link: https://bugs.llvm.org/show_bug.cgi?id=49956 Link: https://lore.kernel.org/r/20210419231741.4084415-1-keescook@chromium.org --- include/linux/randomize_kstack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index fd80fab663a9..bebc911161b6 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -38,7 +38,7 @@ void *__builtin_alloca(size_t size); u32 offset = raw_cpu_read(kstack_offset); \ u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset)); \ /* Keep allocation even after "ptr" loses scope. 
*/ \ - asm volatile("" : "=o"(*ptr) :: "memory"); \ + asm volatile("" :: "r"(ptr) : "memory"); \ } \ } while (0) -- cgit v1.2.3 From 0c8ccd8b267fc735e4621774ce62728f27d42863 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 May 2021 15:41:29 +0300 Subject: spi: pxa2xx: Use pxa_ssp_enable()/pxa_ssp_disable() in the driver There are a few places that repeat the logic of pxa_ssp_enable() and pxa_ssp_disable(). Use them instead of open-coded variants. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210510124134.24638-10-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-dma.c | 4 +--- drivers/spi/spi-pxa2xx.c | 36 ++++++++++++++++++------------------ include/linux/pxa2xx_ssp.h | 16 ++++++++++++++++ sound/soc/pxa/pxa-ssp.c | 16 ---------------- 4 files changed, 35 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c index e00dbadd39ec..5ca01ad7f460 100644 --- a/drivers/spi/spi-pxa2xx-dma.c +++ b/drivers/spi/spi-pxa2xx-dma.c @@ -50,9 +50,7 @@ static void pxa2xx_spi_dma_transfer_complete(struct driver_data *drv_data, if (error) { /* In case we got an error we disable the SSP now */ - pxa2xx_spi_write(drv_data, SSCR0, - pxa2xx_spi_read(drv_data, SSCR0) - & ~SSCR0_SSE); + pxa_ssp_disable(drv_data->ssp); msg->status = -EIO; } diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 087c84e605b9..a27f51f5db65 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -286,13 +286,11 @@ static u32 pxa2xx_configure_sscr0(const struct driver_data *drv_data, case QUARK_X1000_SSP: return clk_div | QUARK_X1000_SSCR0_Motorola - | QUARK_X1000_SSCR0_DataSize(bits > 32 ? 8 : bits) - | SSCR0_SSE; + | QUARK_X1000_SSCR0_DataSize(bits > 32 ? 8 : bits); default: return clk_div | SSCR0_Motorola | SSCR0_DataSize(bits > 16 ? bits - 16 : bits) - | SSCR0_SSE | (bits > 16 ?
SSCR0_EDSS : 0); } } @@ -498,8 +496,7 @@ static void pxa2xx_spi_off(struct driver_data *drv_data) if (is_mmp2_ssp(drv_data)) return; - pxa2xx_spi_write(drv_data, SSCR0, - pxa2xx_spi_read(drv_data, SSCR0) & ~SSCR0_SSE); + pxa_ssp_disable(drv_data->ssp); } static int null_writer(struct driver_data *drv_data) @@ -1098,25 +1095,26 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller, (pxa2xx_spi_read(drv_data, DDS_RATE) != chip->dds_rate)) pxa2xx_spi_write(drv_data, DDS_RATE, chip->dds_rate); + /* Stop the SSP */ + if (!is_mmp2_ssp(drv_data)) + pxa_ssp_disable(drv_data->ssp); + + if (!pxa25x_ssp_comp(drv_data)) + pxa2xx_spi_write(drv_data, SSTO, chip->timeout); + /* see if we need to reload the config registers */ if ((pxa2xx_spi_read(drv_data, SSCR0) != cr0) || (pxa2xx_spi_read(drv_data, SSCR1) & change_mask) != (cr1 & change_mask)) { - /* stop the SSP, and update the other bits */ - if (!is_mmp2_ssp(drv_data)) - pxa2xx_spi_write(drv_data, SSCR0, cr0 & ~SSCR0_SSE); - if (!pxa25x_ssp_comp(drv_data)) - pxa2xx_spi_write(drv_data, SSTO, chip->timeout); /* first set CR1 without interrupt and service enables */ pxa2xx_spi_write(drv_data, SSCR1, cr1 & change_mask); - /* restart the SSP */ + /* Update the other bits */ pxa2xx_spi_write(drv_data, SSCR0, cr0); - - } else { - if (!pxa25x_ssp_comp(drv_data)) - pxa2xx_spi_write(drv_data, SSTO, chip->timeout); } + /* Restart the SSP */ + pxa_ssp_enable(drv_data->ssp); + if (is_mmp2_ssp(drv_data)) { u8 tx_level = (pxa2xx_spi_read(drv_data, SSSR) & SSSR_TFL_MASK) >> 8; @@ -1786,8 +1784,9 @@ static int pxa2xx_spi_probe(struct platform_device *pdev) controller->min_speed_hz = DIV_ROUND_UP(controller->max_speed_hz, 512); + pxa_ssp_disable(ssp); + /* Load default SSP configuration */ - pxa2xx_spi_write(drv_data, SSCR0, 0); switch (drv_data->ssp_type) { case QUARK_X1000_SSP: tmp = QUARK_X1000_SSCR1_RxTresh(RX_THRESH_QUARK_X1000_DFLT) | @@ -1928,7 +1927,7 @@ static int pxa2xx_spi_remove(struct platform_device *pdev) spi_unregister_controller(drv_data->controller); /* Disable the SSP at the peripheral and SOC level */ - pxa2xx_spi_write(drv_data, SSCR0, 0); + pxa_ssp_disable(ssp); clk_disable_unprepare(ssp->clk); /* Release DMA */ @@ -1957,7 +1956,8 @@ static int pxa2xx_spi_suspend(struct device *dev) status = spi_controller_suspend(drv_data->controller); if (status != 0) return status; - pxa2xx_spi_write(drv_data, SSCR0, 0); + + pxa_ssp_disable(ssp); if (!pm_runtime_suspended(dev)) clk_disable_unprepare(ssp->clk); diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 1b6c1a0922bd..fdfbe17e15f4 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -254,6 +254,22 @@ static inline u32 pxa_ssp_read_reg(struct ssp_device *dev, u32 reg) return __raw_readl(dev->mmio_base + reg); } +static inline void pxa_ssp_enable(struct ssp_device *ssp) +{ + u32 sscr0; + + sscr0 = pxa_ssp_read_reg(ssp, SSCR0) | SSCR0_SSE; + pxa_ssp_write_reg(ssp, SSCR0, sscr0); +} + +static inline void pxa_ssp_disable(struct ssp_device *ssp) +{ + u32 sscr0; + + sscr0 = pxa_ssp_read_reg(ssp, SSCR0) & ~SSCR0_SSE; + pxa_ssp_write_reg(ssp, SSCR0, sscr0); +} + #if IS_ENABLED(CONFIG_PXA_SSP) struct ssp_device *pxa_ssp_request(int port, const char *label); void pxa_ssp_free(struct ssp_device *); diff --git a/sound/soc/pxa/pxa-ssp.c b/sound/soc/pxa/pxa-ssp.c index b941adcbb8f9..939e7e28486a 100644 --- a/sound/soc/pxa/pxa-ssp.c +++ b/sound/soc/pxa/pxa-ssp.c @@ -61,22 +61,6 @@ static void dump_registers(struct ssp_device *ssp) pxa_ssp_read_reg(ssp, 
SSACD)); } -static void pxa_ssp_enable(struct ssp_device *ssp) -{ - uint32_t sscr0; - - sscr0 = __raw_readl(ssp->mmio_base + SSCR0) | SSCR0_SSE; - __raw_writel(sscr0, ssp->mmio_base + SSCR0); -} - -static void pxa_ssp_disable(struct ssp_device *ssp) -{ - uint32_t sscr0; - - sscr0 = __raw_readl(ssp->mmio_base + SSCR0) & ~SSCR0_SSE; - __raw_writel(sscr0, ssp->mmio_base + SSCR0); -} - static void pxa_ssp_set_dma_params(struct ssp_device *ssp, int width4, int out, struct snd_dmaengine_dai_dma_data *dma) { -- cgit v1.2.3 From 3fdb59cf10b020b32b9f1dfc78611320623dcb3e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 May 2021 15:41:34 +0300 Subject: spi: pxa2xx: Introduce special type for Merrifield SPIs Intel Merrifield SPI is actually much closer to PXA3xx. It has extended FIFO (32 bytes) and additional registers to get or set FIFO thresholds. Introduce a new type for Intel Merrifield SPI host controllers and handle the bigger FIFO size. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210510124134.24638-15-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-pci.c | 2 +- drivers/spi/spi-pxa2xx.c | 32 +++++++++++++++++++++++++++++--- include/linux/pxa2xx_ssp.h | 16 ++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c index a259be12d326..dce9ade9a4df 100644 --- a/drivers/spi/spi-pxa2xx-pci.c +++ b/drivers/spi/spi-pxa2xx-pci.c @@ -179,7 +179,7 @@ static struct pxa_spi_info spi_info_configs[] = { .rx_param = &bsw2_rx_param, }, [PORT_MRFLD] = { - .type = PXA27x_SSP, + .type = MRFLD_SSP, .max_clk_rate = 25000000, .setup = mrfld_spi_setup, }, diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index af3f01de8f5b..5985b39e2dd6 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -200,6 +200,11 @@ static bool is_mmp2_ssp(const struct driver_data *drv_data) return drv_data->ssp_type == MMP2_SSP; } +static bool is_mrfld_ssp(const struct driver_data *drv_data) +{ + return drv_data->ssp_type == MRFLD_SSP; +} + static void pxa2xx_spi_update(const struct driver_data *drv_data, u32 reg, u32 mask, u32 value) { if ((pxa2xx_spi_read(drv_data, reg) & mask) != value) @@ -1087,6 +1092,15 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller, pxa2xx_spi_update(drv_data, SSITF, GENMASK(15, 0), chip->lpss_tx_threshold); } + if (is_mrfld_ssp(drv_data)) { + u32 thresh = 0; + + thresh |= SFIFOTT_RxThresh(chip->lpss_rx_threshold); + thresh |= SFIFOTT_TxThresh(chip->lpss_tx_threshold); + + pxa2xx_spi_update(drv_data, SFIFOTT, 0xffffffff, thresh); + } + if (is_quark_x1000_ssp(drv_data)) pxa2xx_spi_update(drv_data, DDS_RATE, GENMASK(23, 0), chip->dds_rate); @@ -1253,6 +1267,11 @@ static int setup(struct spi_device *spi) tx_hi_thres = 0; rx_thres = RX_THRESH_QUARK_X1000_DFLT; break; + case MRFLD_SSP: + tx_thres = TX_THRESH_MRFLD_DFLT; + tx_hi_thres = 0; + rx_thres = RX_THRESH_MRFLD_DFLT; + break; case CE4100_SSP: tx_thres = TX_THRESH_CE4100_DFLT; tx_hi_thres = 0; @@ -1328,9 +1347,16 @@ static int setup(struct spi_device *spi) chip->cr1 |= SSCR1_SPH; } - chip->lpss_rx_threshold = SSIRF_RxThresh(rx_thres); - chip->lpss_tx_threshold = SSITF_TxLoThresh(tx_thres) - | SSITF_TxHiThresh(tx_hi_thres); + if (is_lpss_ssp(drv_data)) { + chip->lpss_rx_threshold = SSIRF_RxThresh(rx_thres); + chip->lpss_tx_threshold = SSITF_TxLoThresh(tx_thres) | + SSITF_TxHiThresh(tx_hi_thres); + } + + if (is_mrfld_ssp(drv_data)) { +
chip->lpss_rx_threshold = rx_thres; + chip->lpss_tx_threshold = tx_thres; + } /* set dma burst and threshold outside of chip_info path so that if * chip_info goes away after setting chip->enable_dma, the diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index fdfbe17e15f4..2b21bc1f3c73 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -183,6 +183,21 @@ struct device_node; #define SSACD_ACPS(x) ((x) << 4) /* Audio clock PLL select */ #define SSACD_SCDX8 BIT(7) /* SYSCLK division ratio select */ +/* Intel Merrifield SSP */ +#define SFIFOL 0x68 /* FIFO level */ +#define SFIFOTT 0x6c /* FIFO trigger threshold */ + +#define RX_THRESH_MRFLD_DFLT 16 +#define TX_THRESH_MRFLD_DFLT 16 + +#define SFIFOL_TFL_MASK GENMASK(15, 0) /* Transmit FIFO Level mask */ +#define SFIFOL_RFL_MASK GENMASK(31, 16) /* Receive FIFO Level mask */ + +#define SFIFOTT_TFT GENMASK(15, 0) /* Transmit FIFO Threshold (mask) */ +#define SFIFOTT_TxThresh(x) (((x) - 1) << 0) /* TX FIFO trigger threshold / level */ +#define SFIFOTT_RFT GENMASK(31, 16) /* Receive FIFO Threshold (mask) */ +#define SFIFOTT_RxThresh(x) (((x) - 1) << 16) /* RX FIFO trigger threshold / level */ + /* LPSS SSP */ #define SSITF 0x44 /* TX FIFO trigger level */ #define SSITF_TxHiThresh(x) (((x) - 1) << 0) @@ -205,6 +220,7 @@ enum pxa_ssp_type { MMP2_SSP, PXA910_SSP, CE4100_SSP, + MRFLD_SSP, QUARK_X1000_SSP, LPSS_LPT_SSP, /* Keep LPSS types sorted with lpss_platforms[] */ LPSS_BYT_SSP, -- cgit v1.2.3 From 35f3f8504c3b60a1ae5576e178b27fc0ddd6157d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 May 2021 16:12:42 +0300 Subject: spi: Switch to signed types for *_native_cs SPI controller fields While fixing undefined behaviour the commit f60d7270c8a3 ("spi: Avoid undefined behaviour when counting unused native CSs") missed the case when all CSs are GPIOs and thus unused_native_cs will be evaluated to -1 in unsigned representation. This will falsely trigger a condition in the spi_get_gpio_descs(). Switch to signed types for *_native_cs SPI controller fields to fix above. Fixes: f60d7270c8a3 ("spi: Avoid undefined behaviour when counting unused native CSs") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210510131242.49455-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 360a3bc767ca..74239d65c7fd 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -644,8 +644,8 @@ struct spi_controller { int *cs_gpios; struct gpio_desc **cs_gpiods; bool use_gpio_descriptors; - u8 unused_native_cs; - u8 max_native_cs; + s8 unused_native_cs; + s8 max_native_cs; /* statistics */ struct spi_statistics statistics; -- cgit v1.2.3 From efed9a3337e341bd0989161b97453b52567bc59d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 10 May 2021 17:05:35 -0700 Subject: kyber: fix out of bounds access when preempted __blk_mq_sched_bio_merge() gets the ctx and hctx for the current CPU and passes the hctx to ->bio_merge(). kyber_bio_merge() then gets the ctx for the current CPU again and uses that to get the corresponding Kyber context in the passed hctx. However, the thread may be preempted between the two calls to blk_mq_get_ctx(), and the ctx returned the second time may no longer correspond to the passed hctx. 
This "works" accidentally most of the time, but it can cause us to read garbage if the second ctx came from an hctx with more ctx's than the first one (i.e., if ctx->index_hw[hctx->type] > hctx->nr_ctx). This manifested as this UBSAN array index out of bounds error reported by Jakub: UBSAN: array-index-out-of-bounds in ../kernel/locking/qspinlock.c:130:9 index 13106 is out of range for type 'long unsigned int [128]' Call Trace: dump_stack+0xa4/0xe5 ubsan_epilogue+0x5/0x40 __ubsan_handle_out_of_bounds.cold.13+0x2a/0x34 queued_spin_lock_slowpath+0x476/0x480 do_raw_spin_lock+0x1c2/0x1d0 kyber_bio_merge+0x112/0x180 blk_mq_submit_bio+0x1f5/0x1100 submit_bio_noacct+0x7b0/0x870 submit_bio+0xc2/0x3a0 btrfs_map_bio+0x4f0/0x9d0 btrfs_submit_data_bio+0x24e/0x310 submit_one_bio+0x7f/0xb0 submit_extent_page+0xc4/0x440 __extent_writepage_io+0x2b8/0x5e0 __extent_writepage+0x28d/0x6e0 extent_write_cache_pages+0x4d7/0x7a0 extent_writepages+0xa2/0x110 do_writepages+0x8f/0x180 __writeback_single_inode+0x99/0x7f0 writeback_sb_inodes+0x34e/0x790 __writeback_inodes_wb+0x9e/0x120 wb_writeback+0x4d2/0x660 wb_workfn+0x64d/0xa10 process_one_work+0x53a/0xa80 worker_thread+0x69/0x5b0 kthread+0x20b/0x240 ret_from_fork+0x1f/0x30 Only Kyber uses the hctx, so fix it by passing the request_queue to ->bio_merge() instead. BFQ and mq-deadline just use that, and Kyber can map the queues itself to avoid the mismatch. Fixes: a6088845c2bf ("block: kyber: make kyber more friendly with merging") Reported-by: Jakub Kicinski Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/c7598605401a48d5cfeadebb678abd10af22b83f.1620691329.git.osandov@fb.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +-- block/blk-mq-sched.c | 8 +++++--- block/kyber-iosched.c | 5 +++-- block/mq-deadline.c | 3 +-- include/linux/elevator.h | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0270cd7ca165..59b2499d3f8b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2263,10 +2263,9 @@ static void bfq_remove_request(struct request_queue *q, } -static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; struct request *free = NULL; /* diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 42a365b1b9c0..996a4b2f73aa 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -358,14 +358,16 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; bool ret = false; enum hctx_type type; if (e && e->type->ops.bio_merge) - return e->type->ops.bio_merge(hctx, bio, nr_segs); + return e->type->ops.bio_merge(q, bio, nr_segs); + ctx = blk_mq_get_ctx(q); + hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || list_empty_careful(&ctx->rq_lists[type])) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 8969e122f081..81e3279ecd57 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -561,11 +561,12 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) } } -static bool 
kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); struct kyber_hctx_data *khd = hctx->sched_data; - struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); struct list_head *rq_list = &kcq->rq_list[sched_domain]; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 04aded71ead2..8eea2cbf2bf4 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -461,10 +461,9 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, return ELEVATOR_NO_MERGE; } -static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1fe8e105b83b..dcb2f9022c1d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -34,7 +34,7 @@ struct elevator_mq_ops { void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); - bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *, unsigned int); + bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); -- cgit v1.2.3 From c5895d3f06cbb80ccb311f1dcb37074651030cb6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2021 22:43:42 +0200 Subject: sched: Simplify sched_info_on() The situation around sched_info is somewhat complicated, it is used by sched_stats and delayacct and, indirectly, kvm. If SCHEDSTATS=Y (but disabled by default) sched_info_on() is unconditionally true -- this is the case for all distro kernel configs I checked. If for some reason SCHEDSTATS=N, but TASK_DELAY_ACCT=Y, then sched_info_on() can return false when delayacct is disabled, presumably because there would be no other users left; except kvm is. Instead of complicating matters further by accurately accounting sched_stat and kvm state, simply unconditionally enable when SCHED_INFO=Y, matching the common distro case. 
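For reference, a user-space mock of the IS_ENABLED() plumbing (simplified to the built-in case from the kernel's kconfig.h; CONFIG_SCHED_INFO is defined by hand here) showing how the new sched_info_on() folds to a compile-time constant:

#include <stdio.h>

/* A config option set to 1 expands via the placeholder trick to 1,
 * anything else (including an undefined macro) collapses to 0. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED(option) __is_defined(option)

#define CONFIG_SCHED_INFO 1	/* pretend the option is set */

static int sched_info_on(void)
{
	return IS_ENABLED(CONFIG_SCHED_INFO);
}

int main(void)
{
	printf("sched_info_on() = %d\n", sched_info_on());	/* prints 1 */
	return 0;
}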
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210505111525.121458839@infradead.org --- include/linux/sched/stat.h | 10 ++-------- kernel/delayacct.c | 1 - kernel/sched/stats.h | 37 ++++++++++--------------------------- 3 files changed, 12 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 568286411b43..939c3ec9e1b9 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -3,6 +3,7 @@ #define _LINUX_SCHED_STAT_H #include +#include /* * Various counters maintained by the scheduler and fork(), @@ -23,14 +24,7 @@ extern unsigned long nr_iowait_cpu(int cpu); static inline int sched_info_on(void) { -#ifdef CONFIG_SCHEDSTATS - return 1; -#elif defined(CONFIG_TASK_DELAY_ACCT) - extern int delayacct_on; - return delayacct_on; -#else - return 0; -#endif + return IS_ENABLED(CONFIG_SCHED_INFO); } #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3fe7cd52b459..3a0b910386d1 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -15,7 +15,6 @@ #include int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ -EXPORT_SYMBOL_GPL(delayacct_on); struct kmem_cache *delayacct_cache; static int __init delayacct_setup_disable(char *str) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index ee7da12a7056..33ffd41935ba 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -150,11 +150,6 @@ static inline void psi_sched_switch(struct task_struct *prev, #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO -static inline void sched_info_reset_dequeued(struct task_struct *t) -{ - t->sched_info.last_queued = 0; -} - /* * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a CPU, we call this routine @@ -163,13 +158,12 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) */ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long delta = 0; - if (sched_info_on()) { - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; + if (t->sched_info.last_queued) { + delta = rq_clock(rq) - t->sched_info.last_queued; + t->sched_info.last_queued = 0; } - sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; rq_sched_info_dequeue(rq, delta); @@ -184,9 +178,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) { unsigned long long now = rq_clock(rq), delta = 0; - if (t->sched_info.last_queued) + if (t->sched_info.last_queued) { delta = now - t->sched_info.last_queued; - sched_info_reset_dequeued(t); + t->sched_info.last_queued = 0; + } t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; @@ -201,10 +196,8 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) */ static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t) { - if (sched_info_on()) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(rq); - } + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); } /* @@ -231,7 +224,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. 
*/ static inline void -__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { /* * prev now departs the CPU. It's not interesting to record @@ -245,18 +238,8 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct sched_info_arrive(rq, next); } -static inline void -sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) -{ - if (sched_info_on()) - __sched_info_switch(rq, prev, next); -} - #else /* !CONFIG_SCHED_INFO: */ # define sched_info_enqueue(rq, t) do { } while (0) -# define sched_info_reset_dequeued(t) do { } while (0) # define sched_info_dequeue(rq, t) do { } while (0) -# define sched_info_depart(rq, t) do { } while (0) -# define sched_info_arrive(rq, next) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ -- cgit v1.2.3 From eee4d9fee2544389e5ce5697ed92db67c86d7a9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2021 22:43:36 +0200 Subject: delayacct: Add static_branch in scheduler hooks Cheaper when delayacct is disabled. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Balbir Singh Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210505111525.248028369@infradead.org --- include/linux/delayacct.h | 8 ++++++++ kernel/delayacct.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 21651f946751..57fefa54b53a 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -58,8 +58,10 @@ struct task_delay_info { #include #include +#include #ifdef CONFIG_TASK_DELAY_ACCT +DECLARE_STATIC_KEY_TRUE(delayacct_key); extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); @@ -114,6 +116,9 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void delayacct_blkio_start(void) { + if (!static_branch_likely(&delayacct_key)) + return; + delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); @@ -121,6 +126,9 @@ static inline void delayacct_blkio_start(void) static inline void delayacct_blkio_end(struct task_struct *p) { + if (!static_branch_likely(&delayacct_key)) + return; + if (p->delays) __delayacct_blkio_end(p); delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3a0b910386d1..63012fd39ae2 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,6 +14,7 @@ #include #include +DEFINE_STATIC_KEY_TRUE(delayacct_key); int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; @@ -28,6 +29,8 @@ void delayacct_init(void) { delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); + if (!delayacct_on) + static_branch_disable(&delayacct_key); } void __delayacct_tsk_init(struct task_struct *tsk) -- cgit v1.2.3 From e4042ad492357fa995921376462b04a025dd53b6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2021 22:43:32 +0200 Subject: delayacct: Default disabled Assuming this stuff isn't actually used much; disable it by default and avoid allocating and tracking the task_delay_info structure. 
taskstats is changed to still report the regular sched and sched_info and only skip the missing task_delay_info fields instead of not reporting anything. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210505111525.308018373@infradead.org --- Documentation/accounting/delay-accounting.rst | 8 ++++---- Documentation/admin-guide/kernel-parameters.txt | 2 +- include/linux/delayacct.h | 16 ++++------------ kernel/delayacct.c | 19 +++++++++++-------- 4 files changed, 20 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 7cc7f5852da0..f20b282d6de0 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -69,13 +69,13 @@ Compile the kernel with:: CONFIG_TASK_DELAY_ACCT=y CONFIG_TASKSTATS=y -Delay accounting is enabled by default at boot up. -To disable, add:: +Delay accounting is disabled by default at boot up. +To enable, add:: - nodelayacct + delayacct to the kernel boot options. The rest of the instructions -below assume this has not been done. +below assume this has been done. After the system has booted up, use a utility similar to getdelays.c to access the delays diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cb89dbdedc46..ef5048c127a3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3244,7 +3244,7 @@ noclflush [BUGS=X86] Don't use the CLFLUSH instruction - nodelayacct [KNL] Disable per-task delay accounting + delayacct [KNL] Enable per-task delay accounting nodsp [SH] Disable hardware DSP at boot time. 
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 57fefa54b53a..225c8e01a111 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -61,7 +61,7 @@ struct task_delay_info { #include #ifdef CONFIG_TASK_DELAY_ACCT -DECLARE_STATIC_KEY_TRUE(delayacct_key); +DECLARE_STATIC_KEY_FALSE(delayacct_key); extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); @@ -69,7 +69,7 @@ extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(struct task_struct *); -extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); +extern int delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); @@ -116,7 +116,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void delayacct_blkio_start(void) { - if (!static_branch_likely(&delayacct_key)) + if (!static_branch_unlikely(&delayacct_key)) return; delayacct_set_flag(current, DELAYACCT_PF_BLKIO); @@ -126,7 +126,7 @@ static inline void delayacct_blkio_start(void) static inline void delayacct_blkio_end(struct task_struct *p) { - if (!static_branch_likely(&delayacct_key)) + if (!static_branch_unlikely(&delayacct_key)) return; if (p->delays) @@ -134,14 +134,6 @@ static inline void delayacct_blkio_end(struct task_struct *p) delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } -static inline int delayacct_add_tsk(struct taskstats *d, - struct task_struct *tsk) -{ - if (!delayacct_on || !tsk->delays) - return 0; - return __delayacct_add_tsk(d, tsk); -} - static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { if (tsk->delays) diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 63012fd39ae2..3f086903b295 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,23 +14,23 @@ #include #include -DEFINE_STATIC_KEY_TRUE(delayacct_key); -int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ +DEFINE_STATIC_KEY_FALSE(delayacct_key); +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; -static int __init delayacct_setup_disable(char *str) +static int __init delayacct_setup_enable(char *str) { - delayacct_on = 0; + delayacct_on = 1; return 1; } -__setup("nodelayacct", delayacct_setup_disable); +__setup("delayacct", delayacct_setup_enable); void delayacct_init(void) { delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); - if (!delayacct_on) - static_branch_disable(&delayacct_key); + if (delayacct_on) + static_branch_enable(&delayacct_key); } void __delayacct_tsk_init(struct task_struct *tsk) @@ -83,7 +83,7 @@ void __delayacct_blkio_end(struct task_struct *p) delayacct_end(&delays->lock, &delays->blkio_start, total, count); } -int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { u64 utime, stime, stimescaled, utimescaled; unsigned long long t2, t3; @@ -118,6 +118,9 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 
0 : tmp; + if (!tsk->delays) + return 0; + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ raw_spin_lock_irqsave(&tsk->delays->lock, flags); -- cgit v1.2.3 From 0cd7c741f01de13dc1eecf22557593b3514639bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 May 2021 14:01:00 +0200 Subject: delayacct: Add sysctl to enable at runtime Just like sched_schedstats, allow runtime enabling (and disabling) of delayacct. This is useful if one forgot to add the delayacct boot time option. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YJkhebGJAywaZowX@hirez.programming.kicks-ass.net --- Documentation/accounting/delay-accounting.rst | 6 +++-- include/linux/delayacct.h | 4 +++ kernel/delayacct.c | 36 +++++++++++++++++++++++++-- kernel/sysctl.c | 12 +++++++++ 4 files changed, 54 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index f20b282d6de0..1b8b46deeb29 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -74,8 +74,10 @@ To enable, add:: delayacct -to the kernel boot options. The rest of the instructions -below assume this has been done. +to the kernel boot options. The rest of the instructions below assume this has +been done. Alternatively, use sysctl kernel.task_delayacct to switch the state +at runtime. Note however that only tasks started after enabling it will have +delayacct information. After the system has booted up, use a utility similar to getdelays.c to access the delays diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 225c8e01a111..af7e6eb50283 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -65,6 +65,10 @@ DECLARE_STATIC_KEY_FALSE(delayacct_key); extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); + +extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); + extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3f086903b295..51530d5b15a8 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -18,6 +18,17 @@ DEFINE_STATIC_KEY_FALSE(delayacct_key); int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; +static void set_delayacct(bool enabled) +{ + if (enabled) { + static_branch_enable(&delayacct_key); + delayacct_on = 1; + } else { + delayacct_on = 0; + static_branch_disable(&delayacct_key); + } +} + static int __init delayacct_setup_enable(char *str) { delayacct_on = 1; @@ -29,9 +40,30 @@ void delayacct_init(void) { delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); - if (delayacct_on) - static_branch_enable(&delayacct_key); + set_delayacct(delayacct_on); +} + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int state = delayacct_on; + struct ctl_table t; + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_delayacct(state); + return err; } +#endif void 
__delayacct_tsk_init(struct task_struct *tsk) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 14edf84cc571..0afbfc83157a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -71,6 +71,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1727,6 +1728,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_TASK_DELAY_ACCT + { + .procname = "task_delayacct", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_delayacct, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_TASK_DELAY_ACCT */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing", -- cgit v1.2.3 From 8a311c740b53324ec584e0e3bb7077d56b123c28 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 18:19:36 -0500 Subject: sched: Basic tracking of matching tasks Introduce task_struct::core_cookie as an opaque identifier for core scheduling. When enabled, core scheduling will only allow matching tasks to be on the core, where idle matches everything. When task_struct::core_cookie is set (and core scheduling is enabled) these tasks are indexed in a second RB-tree, first on cookie value then on scheduling function, such that matching task selection always finds the most eligible match. NOTE: *shudder* at the overhead... NOTE: *sigh*, a 3rd copy of the scheduling function; the alternative is per class tracking of cookies and that just duplicates a lot of stuff for no raisin (the 2nd copy lives in the rt-mutex PI code). [Joel: folded fixes] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123308.496975854@infradead.org --- include/linux/sched.h | 8 ++- kernel/sched/core.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 46 --------------- kernel/sched/sched.h | 55 ++++++++++++++++++ 4 files changed, 210 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d2c881384517..45eedccf86aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -700,10 +700,16 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; + struct sched_dl_entity dl; + +#ifdef CONFIG_SCHED_CORE + struct rb_node core_node; + unsigned long core_cookie; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif - struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 85147bea9d93..c057d471c025 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -88,6 +88,133 @@ __read_mostly int scheduler_running; DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); +/* kernel prio, less is more */ +static inline int __task_prio(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + + if (rt_prio(p->prio)) /* includes deadline */ + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) + return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ + + return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ +} + +/* + * l(a,b) + * le(a,b) := !l(b,a) + * g(a,b) := l(b,a) + * ge(a,b) := !l(a,b) + */ + +/* real prio, less is less */ +static inline bool prio_less(struct task_struct *a, struct task_struct *b) +{ + + int pa = __task_prio(a), pb =
__task_prio(b); + + if (-pa < -pb) + return true; + + if (-pb < -pa) + return false; + + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ + return !dl_time_before(a->dl.deadline, b->dl.deadline); + + if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */ + u64 vruntime = b->se.vruntime; + + /* + * Normalize the vruntime if tasks are in different cpus. + */ + if (task_cpu(a) != task_cpu(b)) { + vruntime -= task_cfs_rq(b)->min_vruntime; + vruntime += task_cfs_rq(a)->min_vruntime; + } + + return !((s64)(a->se.vruntime - vruntime) <= 0); + } + + return false; +} + +static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +{ + if (a->core_cookie < b->core_cookie) + return true; + + if (a->core_cookie > b->core_cookie) + return false; + + /* flip prio, so high prio is leftmost */ + if (prio_less(b, a)) + return true; + + return false; +} + +#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node) + +static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b) +{ + return __sched_core_less(__node_2_sc(a), __node_2_sc(b)); +} + +static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) +{ + const struct task_struct *p = __node_2_sc(node); + unsigned long cookie = (unsigned long)key; + + if (cookie < p->core_cookie) + return -1; + + if (cookie > p->core_cookie) + return 1; + + return 0; +} + +static void sched_core_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); +} + +static void sched_core_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_erase(&p->core_node, &rq->core_tree); +} + +/* + * Find left-most (aka, highest priority) task matching @cookie. + */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct rb_node *node; + + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); + /* + * The idle task always matches any cookie! 
+ */ + if (!node) + return idle_sched_class.pick_task(rq); + + return __node_2_sc(node); +} + /* * Magic required such that: * @@ -147,10 +274,16 @@ static void __sched_core_flip(bool enabled) cpus_read_unlock(); } -static void __sched_core_enable(void) +static void sched_core_assert_empty(void) { - // XXX verify there are no cookie tasks (yet) + int cpu; + for_each_possible_cpu(cpu) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree)); +} + +static void __sched_core_enable(void) +{ static_branch_enable(&__sched_core_enabled); /* * Ensure all previous instances of raw_spin_rq_*lock() have finished @@ -158,12 +291,12 @@ static void __sched_core_enable(void) */ synchronize_rcu(); __sched_core_flip(true); + sched_core_assert_empty(); } static void __sched_core_disable(void) { - // XXX verify there are no cookie tasks (left) - + sched_core_assert_empty(); __sched_core_flip(false); static_branch_disable(&__sched_core_enabled); } @@ -205,6 +338,11 @@ void sched_core_put(void) schedule_work(&_work); } +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } +static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } + #endif /* CONFIG_SCHED_CORE */ /* @@ -1797,10 +1935,16 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p); + if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51d72ab5b5ae..08be7a2eb05b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -268,33 +268,11 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (!path) @@ -455,33 +433,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (path) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fa990cd259ce..e43a2176d88f 
100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1080,6 +1080,10 @@ struct rq { /* per rq */ struct rq *core; unsigned int core_enabled; + struct rb_root core_tree; + + /* shared state */ + unsigned int core_task_seq; #endif }; @@ -1243,6 +1247,57 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + extern void update_rq_clock(struct rq *rq); static inline u64 __rq_clock_broken(struct rq *rq) -- cgit v1.2.3 From d2dfa17bc7de67e99685c4d6557837bf801a102c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 18:19:43 -0500 Subject: sched: Trivial forced-newidle balancer When a sibling is forced-idle to match the core-cookie, search for matching tasks to fill the core. rcu_read_unlock() can incur an infrequent deadlock in sched_core_balance(). Fix this by using the RCU-sched flavor instead.
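For orientation, the balancer works as follows (an illustrative condensation of the sched_core_balance()/try_steal_cookie() hunks in the diff below, not additional kernel code; locking, rq pinning and the migration mechanics are elided):

/*
 * Illustrative sketch only -- paraphrases the patch below.
 */
static void core_balance_sketch(struct rq *rq)
{
        struct sched_domain *sd;
        int this_cpu = cpu_of(rq), cpu;

        for_each_domain(this_cpu, sd) {         /* widen the search outward */
                for_each_cpu_wrap(cpu, sched_domain_span(sd), this_cpu) {
                        if (cpu == this_cpu)
                                continue;
                        if (need_resched())
                                return;         /* bail out, other work pending */
                        /* pull one task whose cookie matches rq->core->core_cookie */
                        if (try_steal_cookie(this_cpu, cpu))
                                return;         /* forced-idle sibling filled, done */
                }
        }
}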
Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123308.800048269@infradead.org --- include/linux/sched.h | 1 + kernel/sched/core.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/idle.c | 1 + kernel/sched/sched.h | 6 +++ 4 files changed, 137 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 45eedccf86aa..9b822e383212 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -705,6 +705,7 @@ struct task_struct { #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; + unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e45c1d21b371..b4988887510f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -204,6 +204,21 @@ static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) return __node_2_sc(node); } +static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) +{ + struct rb_node *node = &p->core_node; + + node = rb_next(node); + if (!node) + return NULL; + + p = container_of(node, struct task_struct, core_node); + if (p->core_cookie != cookie) + return NULL; + + return p; +} + /* * Magic required such that: * @@ -5389,8 +5404,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; const struct cpumask *smt_mask; bool fi_before = false; + int i, j, cpu, occ = 0; bool need_sync; - int i, j, cpu; if (!sched_core_enabled(rq)) return __pick_next_task(rq, prev, rf); @@ -5512,6 +5527,9 @@ again: if (!p) continue; + if (!is_task_rq_idle(p)) + occ++; + rq_i->core_pick = p; if (rq_i->idle == p && rq_i->nr_running) { rq->core->core_forceidle = true; @@ -5543,6 +5561,7 @@ again: cpu_rq(j)->core_pick = NULL; } + occ = 1; goto again; } } @@ -5588,6 +5607,8 @@ again: if (!(fi_before && rq->core->core_forceidle)) task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle); + rq_i->core_pick->core_occupation = occ; + if (i == cpu) { rq_i->core_pick = NULL; continue; @@ -5609,6 +5630,113 @@ done: return next; } +static bool try_steal_cookie(int this, int that) +{ + struct rq *dst = cpu_rq(this), *src = cpu_rq(that); + struct task_struct *p; + unsigned long cookie; + bool success = false; + + local_irq_disable(); + double_rq_lock(dst, src); + + cookie = dst->core->core_cookie; + if (!cookie) + goto unlock; + + if (dst->curr != dst->idle) + goto unlock; + + p = sched_core_find(src, cookie); + if (p == src->idle) + goto unlock; + + do { + if (p == src->core_pick || p == src->curr) + goto next; + + if (!cpumask_test_cpu(this, &p->cpus_mask)) + goto next; + + if (p->core_occupation > dst->idle->core_occupation) + goto next; + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(src, p, 0); + set_task_cpu(p, this); + activate_task(dst, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + + resched_curr(dst); + + success = true; + break; + +next: + p = sched_core_next(p, cookie); + } while (p); + +unlock: + double_rq_unlock(dst, src); + local_irq_enable(); + + return success; +} + +static bool steal_cookie_task(int cpu, struct sched_domain *sd) +{ + int i; + + for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + if (i == cpu) + continue; + + if (need_resched()) + break; + + if (try_steal_cookie(cpu, i)) + return true; + } + + return false; +} + +static void sched_core_balance(struct rq *rq) +{ + struct 
sched_domain *sd; + int cpu = cpu_of(rq); + + preempt_disable(); + rcu_read_lock(); + raw_spin_rq_unlock_irq(rq); + for_each_domain(cpu, sd) { + if (need_resched()) + break; + + if (steal_cookie_task(cpu, sd)) + break; + } + raw_spin_rq_lock_irq(rq); + rcu_read_unlock(); + preempt_enable(); +} + +static DEFINE_PER_CPU(struct callback_head, core_balance_head); + +void queue_core_balance(struct rq *rq) +{ + if (!sched_core_enabled(rq)) + return; + + if (!rq->core->core_cookie) + return; + + if (!rq->nr_running) /* not forced idle */ + return; + + queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); +} + static inline void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 43646e7876d9..912b47aa99d8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -437,6 +437,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir { update_idle_core(rq); schedstat_inc(rq->sched_goidle); + queue_core_balance(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a898abc60ce..91ca1fee9fec 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1170,6 +1170,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); +extern void queue_core_balance(struct rq *rq); + #else /* !CONFIG_SCHED_CORE */ static inline bool sched_core_enabled(struct rq *rq) @@ -1192,6 +1194,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } +static inline void queue_core_balance(struct rq *rq) +{ +} + #endif /* CONFIG_SCHED_CORE */ static inline void lockdep_assert_rq_held(struct rq *rq) -- cgit v1.2.3 From 6e33cad0af49336952e5541464bd02f5b5fd433e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 18:55:06 +0100 Subject: sched: Trivial core scheduling cookie management To avoid having to use struct pid, create a new, smaller structure to manage task cookies for core scheduling.
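The scheme, sketched here for orientation (the real hunks are in the kernel/sched/core_sched.c diff below): a cookie is simply the kernel address of a small refcounted allocation, so two tasks share a cookie exactly when they hold references to the same object, and 0 means "no cookie".

/*
 * Orientation sketch only; the actual implementation follows below.
 */
struct cookie_sketch {
        refcount_t refcnt;
};

static unsigned long cookie_alloc_sketch(void)
{
        struct cookie_sketch *ck = kmalloc(sizeof(*ck), GFP_KERNEL);

        if (!ck)
                return 0;                       /* 0 == no cookie */
        refcount_set(&ck->refcnt, 1);
        return (unsigned long)ck;               /* the address is the cookie */
}

static void cookie_put_sketch(unsigned long cookie)
{
        struct cookie_sketch *ck = (void *)cookie;

        if (ck && refcount_dec_and_test(&ck->refcnt))
                kfree(ck);                      /* last reference gone */
}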
Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123308.919768100@infradead.org --- include/linux/sched.h | 6 +++ kernel/fork.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 7 +-- kernel/sched/core_sched.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 16 +++++++ 6 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 kernel/sched/core_sched.c (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9b822e383212..eab3f7c4251b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2179,4 +2179,10 @@ int sched_trace_rq_nr_running(struct rq *rq); const struct cpumask *sched_trace_rd_span(struct root_domain *rd); +#ifdef CONFIG_SCHED_CORE +extern void sched_core_free(struct task_struct *tsk); +#else +static inline void sched_core_free(struct task_struct *tsk) { } +#endif + #endif diff --git a/kernel/fork.c b/kernel/fork.c index dc06afd725cb..d16c60c9daca 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -742,6 +742,7 @@ void __put_task_struct(struct task_struct *tsk) exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); + sched_core_free(tsk); if (!profile_handoff_task(tsk)) free_task(tsk); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862..978fcfca5871 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_SCHED_CORE) += core_sched.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b4988887510f..55b2d9399e12 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -167,7 +167,7 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) return 0; } -static void sched_core_enqueue(struct rq *rq, struct task_struct *p) +void sched_core_enqueue(struct rq *rq, struct task_struct *p) { rq->core->core_task_seq++; @@ -177,14 +177,15 @@ static void sched_core_enqueue(struct rq *rq, struct task_struct *p) rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); } -static void sched_core_dequeue(struct rq *rq, struct task_struct *p) +void sched_core_dequeue(struct rq *rq, struct task_struct *p) { rq->core->core_task_seq++; - if (!p->core_cookie) + if (!sched_core_enqueued(p)) return; rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); } /* diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c new file mode 100644 index 000000000000..8d0869a9eb8c --- /dev/null +++ b/kernel/sched/core_sched.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "sched.h" + +/* + * A simple wrapper around refcount. An allocated sched_core_cookie's + * address is used to compute the cookie of the task. 
+ */ +struct sched_core_cookie { + refcount_t refcnt; +}; + +unsigned long sched_core_alloc_cookie(void) +{ + struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL); + if (!ck) + return 0; + + refcount_set(&ck->refcnt, 1); + sched_core_get(); + + return (unsigned long)ck; +} + +void sched_core_put_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr && refcount_dec_and_test(&ptr->refcnt)) { + kfree(ptr); + sched_core_put(); + } +} + +unsigned long sched_core_get_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr) + refcount_inc(&ptr->refcnt); + + return cookie; +} + +/* + * sched_core_update_cookie - replace the cookie on a task + * @p: the task to update + * @cookie: the new cookie + * + * Effectively exchange the task cookie; caller is responsible for lifetimes on + * both ends. + * + * Returns: the old cookie + */ +unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie) +{ + unsigned long old_cookie; + struct rq_flags rf; + struct rq *rq; + bool enqueued; + + rq = task_rq_lock(p, &rf); + + /* + * Since creating a cookie implies sched_core_get(), and we cannot set + * a cookie until after we've created it, similarly, we cannot destroy + * a cookie until after we've removed it, we must have core scheduling + * enabled here. + */ + SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + + enqueued = sched_core_enqueued(p); + if (enqueued) + sched_core_dequeue(rq, p); + + old_cookie = p->core_cookie; + p->core_cookie = cookie; + + if (enqueued) + sched_core_enqueue(rq, p); + + /* + * If task is currently running, it may not be compatible anymore after + * the cookie change, so enter the scheduler on its CPU to schedule it + * away. + */ + if (task_running(rq, p)) + resched_curr(rq); + + task_rq_unlock(rq, p, &rf); + + return old_cookie; +} + +static unsigned long sched_core_clone_cookie(struct task_struct *p) +{ + unsigned long cookie, flags; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cookie = sched_core_get_cookie(p->core_cookie); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return cookie; +} + +void sched_core_free(struct task_struct *p) +{ + sched_core_put_cookie(p->core_cookie); +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3878386a0a02..904c52b560d1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1229,6 +1229,22 @@ static inline bool sched_group_cookie_match(struct rq *rq, extern void queue_core_balance(struct rq *rq); +static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +extern unsigned long sched_core_alloc_cookie(void); +extern void sched_core_put_cookie(unsigned long cookie); +extern unsigned long sched_core_get_cookie(unsigned long cookie); +extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie); + #else /* !CONFIG_SCHED_CORE */ static inline bool sched_core_enabled(struct rq *rq) -- cgit v1.2.3 From 85dd3f61203c5cfa72b308ff327b5fbf3fc1ce5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 29 Mar 2021 15:18:35 +0200 Subject: sched: Inherit task cookie on fork() Note that sched_core_fork() is called from under tasklist_lock, and not from sched_fork() earlier. 
This avoids a few races later. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123308.980003687@infradead.org --- include/linux/sched.h | 2 ++ kernel/fork.c | 3 +++ kernel/sched/core_sched.c | 6 ++++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index eab3f7c4251b..fba47e52e482 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2181,8 +2181,10 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); +extern void sched_core_fork(struct task_struct *p); #else static inline void sched_core_free(struct task_struct *tsk) { } +static inline void sched_core_fork(struct task_struct *p) { } #endif #endif diff --git a/kernel/fork.c b/kernel/fork.c index d16c60c9daca..e7fd928fcafe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2251,6 +2251,8 @@ static __latent_entropy struct task_struct *copy_process( klp_copy_process(p); + sched_core_fork(p); + spin_lock(&current->sighand->siglock); /* @@ -2338,6 +2340,7 @@ static __latent_entropy struct task_struct *copy_process( return p; bad_fork_cancel_cgroup: + sched_core_free(p); spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p, args); diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 8d0869a9eb8c..dcbbeaefaaa3 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -103,6 +103,12 @@ static unsigned long sched_core_clone_cookie(struct task_struct *p) return cookie; } +void sched_core_fork(struct task_struct *p) +{ + RB_CLEAR_NODE(&p->core_node); + p->core_cookie = sched_core_clone_cookie(current); +} + void sched_core_free(struct task_struct *p) { sched_core_put_cookie(p->core_cookie); -- cgit v1.2.3 From 7ac592aa35a684ff1858fb9ec282886b9e3575ac Mon Sep 17 00:00:00 2001 From: Chris Hyser Date: Wed, 24 Mar 2021 17:40:15 -0400 Subject: sched: prctl() core-scheduling interface This patch provides support for setting and copying core scheduling 'task cookies' between threads (PID), processes (TGID), and process groups (PGID). The value of core scheduling isn't that tasks don't share a core; 'nosmt' can do that. The value lies in exploiting all the sharing opportunities that exist to recover possible lost performance, and that requires a degree of flexibility in the API. From a security perspective (and there are others), the thread, process and process group distinction is an existing hierarchical categorization of tasks that reflects many of the security concerns about 'data sharing'. For example, protecting against cache-snooping by a thread that can just read the memory directly isn't all that useful. With this in mind, subcommands to CREATE/SHARE (TO/FROM) provide a mechanism to create and share cookies. CREATE/SHARE_TO specify a target pid with enum pidtype used to specify the scope of the targeted tasks. For example, PIDTYPE_TGID will share the cookie with the process and all of its threads as typically desired in a security scenario.
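The API is summarized next; for concreteness, here is a minimal userspace sketch of how a process might drive it (illustrative only: the helper pid is made up, the constants are hand-defined in case <sys/prctl.h> predates this patch, and PIDTYPE_TGID is hardcoded to the kernel's enum pid_type value 1):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SCHED_CORE
#define PR_SCHED_CORE           62
#define PR_SCHED_CORE_CREATE    1
#define PR_SCHED_CORE_SHARE_TO  2
#endif
#define PIDTYPE_TGID            1       /* kernel enum pid_type value, assumed */

int main(void)
{
        /* Tag the whole current process (pid 0 == self) with a fresh cookie. */
        if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, PIDTYPE_TGID, 0))
                perror("PR_SCHED_CORE_CREATE");  /* e.g. ENODEV without SMT */

        /* Push our cookie to a hypothetical helper process; 1234 is made up. */
        if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, 1234, PIDTYPE_TGID, 0))
                perror("PR_SCHED_CORE_SHARE_TO");

        return 0;
}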
API: prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, tgtpid, pidtype, &cookie) prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, tgtpid, pidtype, NULL) prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, tgtpid, pidtype, NULL) prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, srcpid, pidtype, NULL) where 'tgtpid/srcpid == 0' implies the current process and pidtype is kernel enum pid_type {PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, ...}. For return values, EINVAL and ENOMEM mean what they say. ESRCH means the tgtpid/srcpid was not found. EPERM indicates lack of ptrace permission to access tgtpid/srcpid. ENODEV indicates your machine lacks SMT. [peterz: complete rewrite] Signed-off-by: Chris Hyser Signed-off-by: Peter Zijlstra (Intel) Tested-by: Don Hiatt Tested-by: Hongyu Ning Tested-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210422123309.039845339@infradead.org --- include/linux/sched.h | 2 + include/uapi/linux/prctl.h | 8 +++ kernel/sched/core_sched.c | 114 +++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 5 ++ tools/include/uapi/linux/prctl.h | 8 +++ 5 files changed, 137 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fba47e52e482..c7e7d50e2fdc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2182,6 +2182,8 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 18a9f59dc067..967d9c55323d 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -259,4 +259,12 @@ struct prctl_mm_map { #define PR_PAC_SET_ENABLED_KEYS 60 #define PR_PAC_GET_ENABLED_KEYS 61 +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index dcbbeaefaaa3..9a80e9a474c0 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/prctl.h> #include "sched.h" /* @@ -113,3 +114,116 @@ void sched_core_free(struct task_struct *p) { sched_core_put_cookie(p->core_cookie); } + +static void __sched_core_set(struct task_struct *p, unsigned long cookie) +{ + cookie = sched_core_get_cookie(cookie); + cookie = sched_core_update_cookie(p, cookie); + sched_core_put_cookie(cookie); +} + +/* Called from prctl interface: PR_SCHED_CORE */ +int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr) +{ + unsigned long cookie = 0, id = 0; + struct task_struct *task, *p; + struct pid *grp; + int err = 0; + + if (!static_branch_likely(&sched_smt_present)) + return -ENODEV; + + if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 || + (cmd != PR_SCHED_CORE_GET && uaddr)) + return -EINVAL; + + rcu_read_lock(); + if (pid == 0) { + task = current; + } else { + task = find_task_by_vpid(pid); + if (!task) { +
rcu_read_unlock(); + return -ESRCH; + } + } + get_task_struct(task); + rcu_read_unlock(); + + /* + * Check if this process has the right to modify the specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out; + } + + switch (cmd) { + case PR_SCHED_CORE_GET: + if (type != PIDTYPE_PID || uaddr & 7) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + if (cookie) { + /* XXX improve ? */ + ptr_to_hashval((void *)cookie, &id); + } + err = put_user(id, (u64 __user *)uaddr); + goto out; + + case PR_SCHED_CORE_CREATE: + cookie = sched_core_alloc_cookie(); + if (!cookie) { + err = -ENOMEM; + goto out; + } + break; + + case PR_SCHED_CORE_SHARE_TO: + cookie = sched_core_clone_cookie(current); + break; + + case PR_SCHED_CORE_SHARE_FROM: + if (type != PIDTYPE_PID) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + __sched_core_set(current, cookie); + goto out; + + default: + err = -EINVAL; + goto out; + }; + + if (type == PIDTYPE_PID) { + __sched_core_set(task, cookie); + goto out; + } + + read_lock(&tasklist_lock); + grp = task_pid_type(task, type); + + do_each_pid_thread(grp, type, p) { + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out_tasklist; + } + } while_each_pid_thread(grp, type, p); + + do_each_pid_thread(grp, type, p) { + __sched_core_set(p, cookie); + } while_each_pid_thread(grp, type, p); +out_tasklist: + read_unlock(&tasklist_lock); + +out: + sched_core_put_cookie(cookie); + put_task_struct(task); + return err; +} + diff --git a/kernel/sys.c b/kernel/sys.c index 3a583a29815f..9de46a4bf492 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2550,6 +2550,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = set_syscall_user_dispatch(arg2, arg3, arg4, (char __user *) arg5); break; +#ifdef CONFIG_SCHED_CORE + case PR_SCHED_CORE: + error = sched_core_share_pid(arg2, arg3, arg4, arg5); + break; +#endif default: error = -EINVAL; break; diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 18a9f59dc067..967d9c55323d 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -259,4 +259,12 @@ struct prctl_mm_map { #define PR_PAC_SET_ENABLED_KEYS 60 #define PR_PAC_GET_ENABLED_KEYS 61 +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE 62 +# define PR_SCHED_CORE_GET 0 +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */ +# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */ +# define PR_SCHED_CORE_MAX 4 + #endif /* _LINUX_PRCTL_H */ -- cgit v1.2.3 From fa5e5dc39669b4427830c546ede8709323b8276c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:33:58 +0200 Subject: jump_label, x86: Introduce jump_entry_size() This allows architectures to have variable sized jumps. 
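A hypothetical example of what the hook enables (not from this patch -- the x86 hunk below still reports a fixed JMP32_INSN_SIZE): an architecture with both 2-byte and 5-byte jump instructions could report a per-entry size, where insn_length_at() is a made-up helper standing in for whatever instruction-decoding facility the architecture has:

/*
 * Hypothetical arch hook, for illustration only; insn_length_at()
 * is an assumed helper that decodes the instruction at an address
 * and returns its length (2 or 5 here).
 */
int arch_jump_entry_size(struct jump_entry *entry)
{
        /* Report the size of the instruction actually emitted at the site. */
        return insn_length_at((void *)jump_entry_code(entry));
}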
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.786777050@infradead.org --- arch/x86/include/asm/jump_label.h | 4 ++-- arch/x86/kernel/jump_label.c | 7 +++++++ include/linux/jump_label.h | 9 +++++++++ kernel/jump_label.c | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index dfdc2b1c17dd..d85802a00629 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -4,8 +4,6 @@ #define HAVE_JUMP_LABEL_BATCH -#define JUMP_LABEL_NOP_SIZE 5 - #include #include @@ -47,6 +45,8 @@ l_yes: return true; } +extern int arch_jump_entry_size(struct jump_entry *entry); + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 638d3b9be0ad..a29eecc14c94 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -16,6 +16,13 @@ #include #include +#define JUMP_LABEL_NOP_SIZE JMP32_INSN_SIZE + +int arch_jump_entry_size(struct jump_entry *entry) +{ + return JMP32_INSN_SIZE; +} + static const void * __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type) { diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 05f5554d860f..8c45f58292ac 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -176,6 +176,15 @@ static inline void jump_entry_set_init(struct jump_entry *entry) entry->key |= 2; } +static inline int jump_entry_size(struct jump_entry *entry) +{ +#ifdef JUMP_LABEL_NOP_SIZE + return JUMP_LABEL_NOP_SIZE; +#else + return arch_jump_entry_size(entry); +#endif +} + #endif #endif diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ba39fbb1f8e7..521cafcfcb69 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -309,7 +309,7 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && - jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; -- cgit v1.2.3 From 5af0ea293d78c8b8f0b87ae2b13f7ac584057bc3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:34:00 +0200 Subject: jump_label: Free jump_entry::key bit1 for build use Have jump_label_init() set jump_entry::key bit1 to either 0 or 1 unconditionally. This makes it available for build-time games.
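The underlying trick, sketched generically for orientation (illustrative only; the real accessors are in the include/linux/jump_label.h hunks below): jump_entry::key holds a word-aligned pointer, so its two low bits are free to carry flags, and this patch turns bit1 ("entry is in an init section") into an explicit input rather than a sticky set-only bit.

/*
 * Illustrative pointer-tagging sketch of the jump_entry::key encoding;
 * a word-aligned pointee guarantees bits 0-1 of the address are zero.
 */
static inline struct static_key *key_ptr(unsigned long key)
{
        return (struct static_key *)(key & ~3UL);       /* strip flag bits 0-1 */
}

static inline bool key_is_init(unsigned long key)
{
        return key & 2UL;                               /* bit1: init section */
}

static inline unsigned long key_set_init(unsigned long key, bool set)
{
        return set ? (key | 2UL) : (key & ~2UL);        /* written both ways now */
}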
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.906893264@infradead.org --- include/linux/jump_label.h | 7 +++++-- kernel/jump_label.c | 10 ++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 8c45f58292ac..48b9b2a82767 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -171,9 +171,12 @@ static inline bool jump_entry_is_init(const struct jump_entry *entry) return (unsigned long)entry->key & 2UL; } -static inline void jump_entry_set_init(struct jump_entry *entry) +static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { - entry->key |= 2; + if (set) + entry->key |= 2; + else + entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 521cafcfcb69..bdb0681bece8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -483,13 +483,14 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); - if (init_section_contains((void *)jump_entry_code(iter), 1)) - jump_entry_set_init(iter); + in_init = init_section_contains((void *)jump_entry_code(iter), 1); + jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) @@ -634,9 +635,10 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + bool in_init; - if (within_module_init(jump_entry_code(iter), mod)) - jump_entry_set_init(iter); + in_init = within_module_init(jump_entry_code(iter), mod); + jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) -- cgit v1.2.3 From 190515f610946db025cdedebde93958b725fb583 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Wed, 12 May 2021 18:01:24 +0800 Subject: blkdev.h: remove unused code blk_account_rq The last users of blk_account_rq() went away with commit a1ce35fa49852db ("block: remove dead elevator code"); it now has no callers and can be safely removed. Signed-off-by: Lin Feng Link: https://lore.kernel.org/r/20210512100124.173769-1-linf@wangsu.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b91ba6207365..26c3e368656f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -677,11 +677,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -static inline bool blk_account_rq(struct request *rq) -{ - return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq); -} - #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) -- cgit v1.2.3 From cc00c1988801dc71f63bb7bad019e85046865095 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 May 2021 19:51:31 +0200 Subject: sched: Fix leftover comment typos A few more snuck in. Also capitalize 'CPU' while at it.
Signed-off-by: Ingo Molnar --- include/linux/sched_clock.h | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 528718e4ed52..835ee87ed792 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -14,7 +14,7 @@ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit * clocks. * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. + * @mult: Multiplier for scaled math conversion. * @shift: Shift value for scaled math conversion. * * Care must be taken when updating this structure; it is read by diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d00f4958bde..ac8882da5daf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5506,7 +5506,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } /* - * Try and select tasks for each sibling in decending sched_class + * Try and select tasks for each sibling in descending sched_class * order. */ for_each_class(class) { @@ -5520,7 +5520,7 @@ again: /* * If this sibling doesn't yet have a suitable task to - * run; ask for the most elegible task, given the + * run; ask for the most eligible task, given the * highest priority task already selected for this * core. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2635e1048421..161b92aa1c79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10808,11 +10808,11 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * sched_slice() considers only this active rq and it gets the * whole slice. But during force idle, we have siblings acting * like a single runqueue and hence we need to consider runnable - * tasks on this cpu and the forced idle cpu. Ideally, we should + * tasks on this CPU and the forced idle CPU. Ideally, we should * go through the forced idle rq, but that would be a perf hit. - * We can assume that the forced idle cpu has atleast + * We can assume that the forced idle CPU has at least * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check - * if we need to give up the cpu. + * if we need to give up the CPU. */ if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) -- cgit v1.2.3 From 93d0955e6cf562d02aae37f5f8d98d9d9d16e0d4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 May 2021 20:04:28 +0200 Subject: locking: Fix comment typos A few snuck through. Signed-off-by: Ingo Molnar --- include/linux/lockdep_types.h | 2 +- kernel/futex.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 2ec9ff5a7fff..3e726ace5c62 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -52,7 +52,7 @@ enum lockdep_lock_type { * NR_LOCKDEP_CACHING_CLASSES ... Number of classes * cached in the instance of lockdep_map * - * Currently main class (subclass == 0) and signle depth subclass + * Currently main class (subclass == 0) and single depth subclass * are cached in lockdep_map. This optimization is mainly targeting * on rq->lock. double_rq_lock() acquires this highly competitive with * single depth.
diff --git a/kernel/futex.c b/kernel/futex.c index 4938a00bc785..2f386f012900 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1874,7 +1874,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * If the caller intends to requeue more than 1 waiter to pifutex, * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, * as we have means to handle the possible fault. If not, don't set - * the bit unecessarily as it will force the subsequent unlock to enter + * the bit unnecessarily as it will force the subsequent unlock to enter * the kernel. */ top_waiter = futex_top_waiter(hb1, key1); @@ -2103,7 +2103,7 @@ retry_private: continue; /* - * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always * be paired with each other and no other futex ops. * * We should never be requeueing a futex_q with a pi_state, @@ -2318,7 +2318,7 @@ retry: } /* - * PI futexes can not be requeued and must remove themself from the + * PI futexes can not be requeued and must remove themselves from the * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. */ static void unqueue_me_pi(struct futex_q *q) @@ -2903,7 +2903,7 @@ no_block: */ res = fixup_owner(uaddr, &q, !ret); /* - * If fixup_owner() returned an error, proprogate that. If it acquired + * If fixup_owner() returned an error, propagate that. If it acquired * the lock, clear our -ETIMEDOUT or -EINTR. */ if (res) @@ -3280,7 +3280,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ res = fixup_owner(uaddr2, &q, !ret); /* - * If fixup_owner() returned an error, proprogate that. If it + * If fixup_owner() returned an error, propagate that. If it * acquired the lock, clear -ETIMEDOUT or -EINTR. */ if (res) @@ -3678,7 +3678,7 @@ void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of - * exec() there is no way to prevent futher damage as the PID stays + * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. -- cgit v1.2.3 From ca0760e7d79e2bb9c342e6b3f925b1ef01c6303e Mon Sep 17 00:00:00 2001 From: Wei Ming Chen Date: Thu, 6 May 2021 20:30:51 +0800 Subject: Compiler Attributes: Add continue in comment Add "continue;" to the list of keywords that may end a switch/case block, per the documentation [1]. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Signed-off-by: Wei Ming Chen Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index c043b8d2b17b..183ddd5fd072 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -199,6 +199,7 @@ * must end with any of these keywords: * break; * fallthrough; + * continue; * goto