diff options
Diffstat (limited to 'kernel')
44 files changed, 1609 insertions, 1572 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 718fb8afab7a..e0d7587e7684 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,7 +45,6 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -65,7 +64,7 @@ obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o @@ -100,6 +99,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_HAS_IOMEM) += memremap.o + $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. @@ -112,99 +113,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) - -############################################################################### -# -# Roll all the X.509 certificates that we can find together and pull them into -# the kernel so that they get loaded into the system trusted keyring during -# boot. -# -# We look in the source root and the build root for all files whose name ends -# in ".x509". Unfortunately, this will generate duplicate filenames, so we -# have make canonicalise the pathnames and then sort them to discard the -# duplicates. -# -############################################################################### -ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) -X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) -X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 -X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ - $(or $(realpath $(CERT)),$(CERT)))) -X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) - -ifeq ($(X509_CERTIFICATES),) -$(warning *** No X.509 certificates found ***) -endif - -ifneq ($(wildcard $(obj)/.x509.list),) -ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) -$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)") -$(shell rm $(obj)/.x509.list) -endif -endif - -kernel/system_certificates.o: $(obj)/x509_certificate_list - -quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") - -targets += $(obj)/x509_certificate_list -$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list - $(call if_changed,x509certs) - -targets += $(obj)/.x509.list -$(obj)/.x509.list: - @echo $(X509_CERTIFICATES) >$@ -endif - -clean-files := x509_certificate_list .x509.list - -ifeq ($(CONFIG_MODULE_SIG),y) -############################################################################### -# -# If module signing is requested, say by allyesconfig, but a key has not been -# supplied, then one will need to be generated to make sure the build does not -# fail and that the kernel may be used afterwards. -# -############################################################################### -ifndef CONFIG_MODULE_SIG_HASH -$(error Could not determine digest type to use from kernel config) -endif - -signing_key.priv signing_key.x509: x509.genkey - @echo "###" - @echo "### Now generating an X.509 key pair to be used for signing modules." - @echo "###" - @echo "### If this takes a long time, you might wish to run rngd in the" - @echo "### background to keep the supply of entropy topped up. It" - @echo "### needs to be run as root, and uses a hardware random" - @echo "### number generator if one is available." - @echo "###" - openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ - -batch -x509 -config x509.genkey \ - -outform DER -out signing_key.x509 \ - -keyout signing_key.priv 2>&1 - @echo "###" - @echo "### Key pair generated." - @echo "###" - -x509.genkey: - @echo Generating X.509 key generation config - @echo >x509.genkey "[ req ]" - @echo >>x509.genkey "default_bits = 4096" - @echo >>x509.genkey "distinguished_name = req_distinguished_name" - @echo >>x509.genkey "prompt = no" - @echo >>x509.genkey "string_mask = utf8only" - @echo >>x509.genkey "x509_extensions = myexts" - @echo >>x509.genkey - @echo >>x509.genkey "[ req_distinguished_name ]" - @echo >>x509.genkey "#O = Unspecified company" - @echo >>x509.genkey "CN = Build time autogenerated kernel key" - @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company" - @echo >>x509.genkey - @echo >>x509.genkey "[ myexts ]" - @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" - @echo >>x509.genkey "keyUsage=digitalSignature" - @echo >>x509.genkey "subjectKeyIdentifier=hash" - @echo >>x509.genkey "authorityKeyIdentifier=keyid" -endif diff --git a/kernel/audit.c b/kernel/audit.c index f9e6065346db..662c007635fb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } else audit_log_format(ab, " name=(null)"); - if (n->ino != (unsigned long)-1) + if (n->ino != AUDIT_INO_UNSET) audit_log_format(ab, " inode=%lu" " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", diff --git a/kernel/audit.h b/kernel/audit.h index d641f9bb3ed0..dadf86a0e59e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -50,6 +50,7 @@ enum audit_state { /* Rule lists */ struct audit_watch; +struct audit_fsnotify_mark; struct audit_tree; struct audit_chunk; @@ -252,6 +253,7 @@ struct audit_net { extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; +extern int audit_del_rule(struct audit_entry *); extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; @@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); + +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern char *audit_mark_path(struct audit_fsnotify_mark *mark); +extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); +extern void audit_remove_mark_rule(struct audit_krule *krule); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); +extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); + #else #define audit_put_watch(w) {} #define audit_get_watch(w) {} @@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev #define audit_watch_path(w) "" #define audit_watch_compare(w, i, d) 0 +#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL)) +#define audit_mark_path(m) "" +#define audit_remove_mark(m) +#define audit_remove_mark_rule(k) +#define audit_mark_compare(m, i, d) 0 +#define audit_exe_compare(t, m) (-EINVAL) +#define audit_dupe_exe(n, o) (-EINVAL) #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c new file mode 100644 index 000000000000..27c6046c2c3d --- /dev/null +++ b/kernel/audit_fsnotify.c @@ -0,0 +1,216 @@ +/* audit_fsnotify.c -- tracking inodes + * + * Copyright 2003-2009,2014-2015 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/audit.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/fsnotify_backend.h> +#include <linux/namei.h> +#include <linux/netlink.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/security.h> +#include "audit.h" + +/* + * this mark lives on the parent directory of the inode in question. + * but dev, ino, and path are about the child + */ +struct audit_fsnotify_mark { + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + char *path; /* insertion path */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ + struct audit_krule *rule; +}; + +/* fsnotify handle. */ +static struct fsnotify_group *audit_fsnotify_group; + +/* fsnotify events we care about. */ +#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) + +static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) +{ + kfree(audit_mark->path); + kfree(audit_mark); +} + +static void audit_fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct audit_fsnotify_mark *audit_mark; + + audit_mark = container_of(mark, struct audit_fsnotify_mark, mark); + audit_fsnotify_mark_free(audit_mark); +} + +char *audit_mark_path(struct audit_fsnotify_mark *mark) +{ + return mark->path; +} + +int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) +{ + if (mark->ino == AUDIT_INO_UNSET) + return 0; + return (mark->ino == ino) && (mark->dev == dev); +} + +static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, + struct inode *inode) +{ + audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; + audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; +} + +struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len) +{ + struct audit_fsnotify_mark *audit_mark; + struct path path; + struct dentry *dentry; + struct inode *inode; + int ret; + + if (pathname[0] != '/' || pathname[len-1] == '/') + return ERR_PTR(-EINVAL); + + dentry = kern_path_locked(pathname, &path); + if (IS_ERR(dentry)) + return (void *)dentry; /* returning an error */ + inode = path.dentry->d_inode; + mutex_unlock(&inode->i_mutex); + + audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); + if (unlikely(!audit_mark)) { + audit_mark = ERR_PTR(-ENOMEM); + goto out; + } + + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); + audit_mark->mark.mask = AUDIT_FS_EVENTS; + audit_mark->path = pathname; + audit_update_mark(audit_mark, dentry->d_inode); + audit_mark->rule = krule; + + ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); + if (ret < 0) { + audit_fsnotify_mark_free(audit_mark); + audit_mark = ERR_PTR(ret); + } +out: + dput(dentry); + path_put(&path); + return audit_mark; +} + +static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op) +{ + struct audit_buffer *ab; + struct audit_krule *rule = audit_mark->rule; + + if (!audit_enabled) + return; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + audit_log_format(ab, "auid=%u ses=%u op=", + from_kuid(&init_user_ns, audit_get_loginuid(current)), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, audit_mark->path); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); +} + +void audit_remove_mark(struct audit_fsnotify_mark *audit_mark) +{ + fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group); + fsnotify_put_mark(&audit_mark->mark); +} + +void audit_remove_mark_rule(struct audit_krule *krule) +{ + struct audit_fsnotify_mark *mark = krule->exe; + + audit_remove_mark(mark); +} + +static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) +{ + struct audit_krule *rule = audit_mark->rule; + struct audit_entry *entry = container_of(rule, struct audit_entry, rule); + + audit_mark_log_rule_change(audit_mark, "autoremove_rule"); + audit_del_rule(entry); +} + +/* Update mark data in audit rules based on fsnotify events. */ +static int audit_mark_handle_event(struct fsnotify_group *group, + struct inode *to_tell, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *dname, u32 cookie) +{ + struct audit_fsnotify_mark *audit_mark; + struct inode *inode = NULL; + + audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); + + BUG_ON(group != audit_fsnotify_group); + + switch (data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = ((struct path *)data)->dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = (struct inode *)data; + break; + default: + BUG(); + return 0; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { + if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) + return 0; + audit_update_mark(audit_mark, inode); + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + audit_autoremove_mark_rule(audit_mark); + + return 0; +} + +static const struct fsnotify_ops audit_mark_fsnotify_ops = { + .handle_event = audit_mark_handle_event, +}; + +static int __init audit_fsnotify_init(void) +{ + audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); + if (IS_ERR(audit_fsnotify_group)) { + audit_fsnotify_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } + return 0; +} +device_initcall(audit_fsnotify_init); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0f9877273fc..94ecdabda8e6 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ audit_tree_log_remove_rule(rule); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 6e30024d9aac..656c7e93ac0d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch) int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return (watch->ino != (unsigned long)-1) && + return (watch->ino != AUDIT_INO_UNSET) && (watch->ino == ino) && (watch->dev == dev); } @@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path) INIT_LIST_HEAD(&watch->rules); atomic_set(&watch->count, 1); watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; + watch->dev = AUDIT_DEV_UNSET; + watch->ino = AUDIT_INO_UNSET; return watch; } @@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) if (IS_ERR(watch)) return PTR_ERR(watch); - audit_get_watch(watch); krule->watch = watch; return 0; @@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent, list_replace(&oentry->rule.list, &nentry->rule.list); } + if (oentry->rule.exe) + audit_remove_mark(oentry->rule.exe); audit_watch_log_rule_change(r, owatch, "updated_rules"); @@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); audit_watch_log_rule_change(r, w, "remove_rule"); + if (e->rule.exe) + audit_remove_mark(e->rule.exe); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); @@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule, watch_found = 1; - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); + /* put krule's ref to temporary watch */ audit_put_watch(watch); audit_get_watch(w); krule->watch = watch = w; + + audit_put_parent(parent); break; } if (!watch_found) { - audit_get_parent(parent); watch->parent = parent; + audit_get_watch(watch); list_add(&watch->wlist, &parent->watches); } list_add(&krule->rlist, &watch->rules); @@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) audit_add_to_parent(krule, parent); - /* match get in audit_find_parent or audit_init_parent */ - audit_put_parent(parent); - h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: @@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); @@ -517,3 +518,36 @@ static int __init audit_watch_init(void) return 0; } device_initcall(audit_watch_init); + +int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) +{ + struct audit_fsnotify_mark *audit_mark; + char *pathname; + + pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL); + if (!pathname) + return -ENOMEM; + + audit_mark = audit_alloc_mark(new, pathname, strlen(pathname)); + if (IS_ERR(audit_mark)) { + kfree(pathname); + return PTR_ERR(audit_mark); + } + new->exe = audit_mark; + + return 0; +} + +int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) +{ + struct file *exe_file; + unsigned long ino; + dev_t dev; + + rcu_read_lock(); + exe_file = rcu_dereference(tsk->mm->exe_file); + ino = exe_file->f_inode->i_ino; + dev = exe_file->f_inode->i_sb->s_dev; + rcu_read_unlock(); + return audit_mark_compare(mark, ino, dev); +} diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 72e1660a79a3..7714d93edb85 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; + case AUDIT_EXE: + if (f->op != Audit_equal) + return -EINVAL; + if (entry->rule.listnr != AUDIT_FILTER_EXIT) + return -EINVAL; + break; }; return 0; } @@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t remain = datasz - sizeof(struct audit_rule_data); int i; char *str; + struct audit_fsnotify_mark *audit_mark; entry = audit_to_entry_common(data); if (IS_ERR(entry)) @@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; + case AUDIT_EXE: + if (entry->rule.exe || f->val > PATH_MAX) + goto exit_free; + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) { + err = PTR_ERR(str); + goto exit_free; + } + entry->rule.buflen += f->val; + + audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + if (IS_ERR(audit_mark)) { + kfree(str); + err = PTR_ERR(audit_mark); + goto exit_free; + } + entry->rule.exe = audit_mark; + break; } } @@ -549,10 +574,10 @@ exit_nofree: return entry; exit_free: - if (entry->rule.watch) - audit_put_watch(entry->rule.watch); /* matches initial get */ if (entry->rule.tree) audit_put_tree(entry->rule.tree); /* that's the temporary one */ + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); /* that's the template one */ audit_free_rule(entry); return ERR_PTR(err); } @@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_EXE: + data->buflen += data->values[i] = + audit_pack_string(&bufp, audit_mark_path(krule->exe)); + break; case AUDIT_LOGINUID_SET: if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { data->fields[i] = AUDIT_LOGINUID; @@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_EXE: + /* both paths exist based on above type compare */ + if (strcmp(audit_mark_path(a->exe), + audit_mark_path(b->exe))) + return 1; + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) err = -ENOMEM; else new->filterkey = fk; + break; + case AUDIT_EXE: + err = audit_dupe_exe(new, old); + break; } if (err) { + if (new->exe) + audit_remove_mark(new->exe); audit_free_rule(entry); return ERR_PTR(err); } @@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int err; + int err = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* normally audit_add_tree_rule() will free it on failure */ if (tree) audit_put_tree(tree); - goto error; + return err; } if (watch) { @@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry) */ if (tree) audit_put_tree(tree); - goto error; + return err; } } if (tree) { err = audit_add_tree_rule(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); - goto error; + return err; } } @@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - return 0; - -error: - if (watch) - audit_put_watch(watch); /* tmp watch, matches initial get */ return err; } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry) +int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int ret = 0; @@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry) mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (!e) { - mutex_unlock(&audit_filter_mutex); ret = -ENOENT; goto out; } @@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry) if (e->rule.tree) audit_remove_tree_rule(&e->rule); - list_del_rcu(&e->list); - list_del(&e->rule.list); - call_rcu(&e->rcu, audit_free_rule_rcu); + if (e->rule.exe) + audit_remove_mark_rule(&e->rule); #ifdef CONFIG_AUDITSYSCALL if (!dont_count) @@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry) if (!audit_match_signal(entry)) audit_signals--; #endif - mutex_unlock(&audit_filter_mutex); + + list_del_rcu(&e->list); + list_del(&e->rule.list); + call_rcu(&e->rcu, audit_free_rule_rcu); out: - if (watch) - audit_put_watch(watch); /* match initial get */ + mutex_unlock(&audit_filter_mutex); + if (tree) audit_put_tree(tree); /* that's the temporary one */ @@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, WARN_ON(1); } - if (err || type == AUDIT_DEL_RULE) + if (err || type == AUDIT_DEL_RULE) { + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); audit_free_rule(entry); + } return err; } @@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r) return 0; nentry = audit_dupe_rule(r); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e85bdfd15fed..b86cc04959de 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val) return 0; list_for_each_entry(n, &ctx->names_list, list) { - if ((n->ino != -1) && + if ((n->ino != AUDIT_INO_UNSET) && ((n->mode & S_IFMT) == mode)) return 1; } @@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(ctx->ppid, f->op, f->val); } break; + case AUDIT_EXE: + result = audit_exe_compare(tsk, rule->exe); + break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; @@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, aname->should_free = true; } - aname->ino = (unsigned long)-1; + aname->ino = AUDIT_INO_UNSET; aname->type = type; list_add_tail(&aname->list, &context->names_list); @@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent, if (inode) audit_copy_inode(found_child, dentry, inode); else - found_child->ino = (unsigned long)-1; + found_child->ino = AUDIT_INO_UNSET; } EXPORT_SYMBOL_GPL(__audit_inode_child); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f3f5cd5e2c0d..2cf0f79f1fc9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq, if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) - seq_printf(seq, ",%s", ss->legacy_name); + seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq, spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); + seq_show_option(seq, "release_agent", + root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); + seq_show_option(seq, "name", root->name); return 0; } diff --git a/kernel/fork.c b/kernel/fork.c index 03aa2e6de7a4..7d5f0f118a63 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; + tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); diff --git a/kernel/futex.c b/kernel/futex.c index c4a182f5357e..6e443efc65f4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -64,6 +64,7 @@ #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/bootmem.h> +#include <linux/fault-inject.h> #include <asm/futex.h> @@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize; static struct futex_hash_bucket *futex_queues; +/* + * Fault injections for futexes. + */ +#ifdef CONFIG_FAIL_FUTEX + +static struct { + struct fault_attr attr; + + u32 ignore_private; +} fail_futex = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_private = 0, +}; + +static int __init setup_fail_futex(char *str) +{ + return setup_fault_attr(&fail_futex.attr, str); +} +__setup("fail_futex=", setup_fail_futex); + +static bool should_fail_futex(bool fshared) +{ + if (fail_futex.ignore_private && !fshared) + return false; + + return should_fail(&fail_futex.attr, 1); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_futex_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_futex", NULL, + &fail_futex.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + if (!debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +late_initcall(fail_futex_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else +static inline bool should_fail_futex(bool fshared) +{ + return false; +} +#endif /* CONFIG_FAIL_FUTEX */ + static inline void futex_get_mm(union futex_key *key) { atomic_inc(&key->private.mm->mm_count); @@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: + /* Ignore any VERIFY_READ mapping (futex common case) */ + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + err = get_user_pages_fast(address, 1, 1, &page); /* * If write access is not required (eg. FUTEX_WAIT), try @@ -516,7 +584,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (ro) { + if (unlikely(should_fail_futex(fshared)) || ro) { err = -EFAULT; goto out; } @@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { u32 uninitialized_var(curval); + if (unlikely(should_fail_futex(true))) + return -EFAULT; + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; @@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, if (get_futex_value_locked(&uval, uaddr)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Detect deadlocks. */ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; + if ((unlikely(should_fail_futex(true)))) + return -EDEADLK; + /* * Lookup existing state first. If it exists, try to attach to * its pi_state. @@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + if (unlikely(should_fail_futex(true))) + ret = -EFAULT; + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) @@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Find the top_waiter and determine if there are additional waiters. * If the caller intends to requeue more than 1 waiter to pifutex, @@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart) /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block, it does PI, etc. (Due to - * races the kernel might see a 0 value of the futex too.) + * if there are waiters then it will block as a consequence of relying + * on rt-mutexes, it does PI, etc. (Due to races the kernel might see + * a 0 value of the futex too.). + * + * Also serves as futex trylock_pi()'ing, and due semantics. */ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) @@ -2300,6 +2386,10 @@ retry_private: ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. + */ switch (ret) { case 1: /* We got the lock. */ @@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. + * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all @@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 52ebaca1b9fc..f7dd15d537f9 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct static_key *key, int enable); +static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { @@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_ENABLE); - else - jump_label_update(key, JUMP_LABEL_DISABLE); - } - atomic_inc(&key->enabled); + if (atomic_inc_return(&key->enabled) == 1) + jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key, atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); } else { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_DISABLE); - else - jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_update(key); } jump_label_unlock(); } @@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, return 0; } -/* +/* * Update code which is definitely not currently executing. * Architectures which need heavyweight synchronization to modify * running code can override this to make the non-live update case @@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - arch_jump_label_transform(entry, type); + arch_jump_label_transform(entry, type); +} + +static inline struct jump_entry *static_key_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); +} + +static inline bool static_key_type(struct static_key *key) +{ + return (unsigned long)key->entries & JUMP_TYPE_MASK; +} + +static inline struct static_key *jump_entry_key(struct jump_entry *entry) +{ + return (struct static_key *)((unsigned long)entry->key & ~1UL); +} + +static bool jump_entry_branch(struct jump_entry *entry) +{ + return (unsigned long)entry->key & 1UL; +} + +static enum jump_label_type jump_label_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool enabled = static_key_enabled(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return enabled ^ branch; } static void __jump_label_update(struct static_key *key, struct jump_entry *entry, - struct jump_entry *stop, int enable) + struct jump_entry *stop) { - for (; (entry < stop) && - (entry->key == (jump_label_t)(unsigned long)key); - entry++) { + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { /* * entry->code set to 0 invalidates module init text sections * kernel_text_address() verifies we are not in core kernel * init code, see jump_label_invalidate_module_init(). */ if (entry->code && kernel_text_address(entry->code)) - arch_jump_label_transform(entry, enable); + arch_jump_label_transform(entry, jump_label_type(entry)); } } -static enum jump_label_type jump_label_type(struct static_key *key) -{ - bool true_branch = jump_label_get_branch_default(key); - bool state = static_key_enabled(key); - - if ((!true_branch && state) || (true_branch && !state)) - return JUMP_LABEL_ENABLE; - - return JUMP_LABEL_DISABLE; -} - void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; @@ -202,8 +211,11 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_type(iterk)); + /* rewrite NOPs */ + if (jump_label_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); + + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -222,6 +234,16 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES +static enum jump_label_type jump_label_init_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool type = static_key_type(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return type ^ branch; +} + struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; @@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct static_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key) { - struct static_key_mod *mod = key->next; + struct static_key_mod *mod; - while (mod) { + for (mod = key->next; mod; mod = mod->next) { struct module *m = mod->mod; __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries, - enable); - mod = mod->next; + m->jump_entries + m->num_jump_entries); } } @@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); + /* Only write NOPs for arch_branch_static(). */ + if (jump_label_init_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); } } @@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod) jlm->next = key->next; key->next = jlm; - if (jump_label_type(key) == JUMP_LABEL_ENABLE) - __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); + /* Only update if we've changed from our initial state */ + if (jump_label_type(iter) != jump_label_init_type(iter)) + __jump_label_update(key, iter, iter_stop); } return 0; @@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod) struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) + if (jump_entry_key(iter) == key) continue; - key = (struct static_key *)(unsigned long)iter->key; + key = jump_entry_key(iter); if (within_module(iter->key, mod)) continue; @@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct static_key *key, int enable) +static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; - struct jump_entry *entry = jump_label_get_entries(key); + struct jump_entry *entry = static_key_entries(key); #ifdef CONFIG_MODULES struct module *mod; - __jump_label_mod_update(key, enable); + __jump_label_mod_update(key); preempt_disable(); mod = __module_address((unsigned long)key); @@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable) #endif /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, stop, enable); + __jump_label_update(key, entry, stop); } -#endif +#ifdef CONFIG_STATIC_KEYS_SELFTEST +static DEFINE_STATIC_KEY_TRUE(sk_true); +static DEFINE_STATIC_KEY_FALSE(sk_false); + +static __init int jump_label_test(void) +{ + int i; + + for (i = 0; i < 2; i++) { + WARN_ON(static_key_enabled(&sk_true.key) != true); + WARN_ON(static_key_enabled(&sk_false.key) != false); + + WARN_ON(!static_branch_likely(&sk_true)); + WARN_ON(!static_branch_unlikely(&sk_true)); + WARN_ON(static_branch_likely(&sk_false)); + WARN_ON(static_branch_unlikely(&sk_false)); + + static_branch_disable(&sk_true); + static_branch_enable(&sk_false); + + WARN_ON(static_key_enabled(&sk_true.key) == true); + WARN_ON(static_key_enabled(&sk_false.key) == false); + + WARN_ON(static_branch_likely(&sk_true)); + WARN_ON(static_branch_unlikely(&sk_true)); + WARN_ON(!static_branch_likely(&sk_false)); + WARN_ON(!static_branch_unlikely(&sk_false)); + + static_branch_enable(&sk_true); + static_branch_disable(&sk_false); + } + + return 0; +} +late_initcall(jump_label_test); +#endif /* STATIC_KEYS_SELFTEST */ + +#endif /* HAVE_JUMP_LABEL */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 490924cc9e7c..9ff173dca1ae 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create) * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. - * @node: memory node number. + * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give -1. + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which no one will call kthread_stop(), or diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c..8e96f6cc2a4a 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe9..f32567254867 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) __up_read(&brw->rw_sem); } +int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) +{ + if (unlikely(!update_fast_ctr(brw, +1))) { + if (!__down_read_trylock(&brw->rw_sem)) + return 0; + atomic_inc(&brw->slow_read_ctr); + __up_read(&brw->rw_sem); + } + + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + return 1; +} + void percpu_up_read(struct percpu_rw_semaphore *brw) { rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 6c5da483966b..f17a3e3b3550 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { cpu_relax_lowlatency(); - cnts = smp_load_acquire((u32 *)&lock->cnts); + cnts = atomic_read_acquire(&lock->cnts); } } /** - * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure + * @cnts: Current qrwlock lock value */ -void queue_read_lock_slowpath(struct qrwlock *lock) +void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) { - u32 cnts; - /* * Readers come here when they cannot get the lock without waiting */ if (unlikely(in_interrupt())) { /* - * Readers in interrupt context will spin until the lock is - * available without waiting in the queue. + * Readers in interrupt context will get the lock immediately + * if the writer is just waiting (not holding the lock yet). + * The rspin_until_writer_unlock() function returns immediately + * in this case. Otherwise, they will spin (with ACQUIRE + * semantics) until the lock is available without waiting in + * the queue. */ - cnts = smp_load_acquire((u32 *)&lock->cnts); rspin_until_writer_unlock(lock, cnts); return; } @@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock) arch_spin_lock(&lock->lock); /* - * At the head of the wait queue now, wait until the writer state - * goes to 0 and then try to increment the reader count and get - * the lock. It is possible that an incoming writer may steal the - * lock in the interim, so it is necessary to check the writer byte - * to make sure that the write lock isn't taken. + * The ACQUIRE semantics of the following spinning code ensure + * that accesses can't leak upwards out of our subsequent critical + * section in the case that the lock is currently held for write. */ - while (atomic_read(&lock->cnts) & _QW_WMASK) - cpu_relax_lowlatency(); - - cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); /* @@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock) */ arch_spin_unlock(&lock->lock); } -EXPORT_SYMBOL(queue_read_lock_slowpath); +EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * queued_write_lock_slowpath - acquire write lock of a queue rwlock * @lock : Pointer to queue rwlock structure */ -void queue_write_lock_slowpath(struct qrwlock *lock) +void queued_write_lock_slowpath(struct qrwlock *lock) { u32 cnts; @@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) /* Try to acquire the lock directly if no reader is present */ if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; /* @@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) struct __qrwlock *l = (struct __qrwlock *)lock; if (!READ_ONCE(l->wmode) && - (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) + (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax_lowlatency(); @@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock) for (;;) { cnts = atomic_read(&lock->cnts); if ((cnts == _QW_WAITING) && - (atomic_cmpxchg(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) + (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) break; cpu_relax_lowlatency(); @@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock) unlock: arch_spin_unlock(&lock->lock); } -EXPORT_SYMBOL(queue_write_lock_slowpath); +EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38c49202d532..337c8818541d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock) static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } - +static __always_inline void __pv_kick_node(struct qspinlock *lock, + struct mcs_spinlock *node) { } static __always_inline void __pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { } @@ -440,7 +440,7 @@ queue: cpu_relax(); arch_mcs_spin_unlock_contended(&next->locked); - pv_kick_node(next); + pv_kick_node(lock, next); release: /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index df19ae4debd0..c8e6e9a596f5 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -22,9 +22,14 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) +/* + * Queue node uses: vcpu_running & vcpu_halted. + * Queue head uses: vcpu_running & vcpu_hashed. + */ enum vcpu_state { vcpu_running = 0, - vcpu_halted, + vcpu_halted, /* Used only in pv_wait_node */ + vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ }; struct pv_node { @@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node) /* * Wait for node->locked to become true, halt the vcpu after a short spin. - * pv_kick_node() is used to wake the vcpu again. + * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its + * behalf. */ static void pv_wait_node(struct mcs_spinlock *node) { @@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node) * * [S] pn->state = vcpu_halted [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_running + * [L] pn->locked [RmW] pn->state = vcpu_hashed * - * Matches the xchg() from pv_kick_node(). + * Matches the cmpxchg() from pv_kick_node(). */ smp_store_mb(pn->state, vcpu_halted); @@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node) pv_wait(&pn->state, vcpu_halted); /* - * Reset the vCPU state to avoid unncessary CPU kicking + * If pv_kick_node() changed us to vcpu_hashed, retain that value + * so that pv_wait_head() knows to not also try to hash this lock. */ - WRITE_ONCE(pn->state, vcpu_running); + cmpxchg(&pn->state, vcpu_halted, vcpu_running); /* * If the locked flag is still not set after wakeup, it is a @@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * MCS lock will be released soon. */ } + /* * By now our node->locked should be 1 and our caller will not actually * spin-wait for it. We do however rely on our caller to do a @@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node) } /* - * Called after setting next->locked = 1, used to wake those stuck in - * pv_wait_node(). + * Called after setting next->locked = 1 when we're the lock owner. + * + * Instead of waking the waiters stuck in pv_wait_node() advance their state such + * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. */ -static void pv_kick_node(struct mcs_spinlock *node) +static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; /* - * Note that because node->locked is already set, this actual - * mcs_spinlock entry could be re-used already. + * If the vCPU is indeed halted, advance its state to match that of + * pv_wait_node(). If OTOH this fails, the vCPU was running and will + * observe its next->locked value and advance itself. * - * This should be fine however, kicking people for no reason is - * harmless. + * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + */ + if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + return; + + /* + * Put the lock into the hash table and set the _Q_SLOW_VAL. * - * See the comment in pv_wait_node(). + * As this is the same vCPU that will check the _Q_SLOW_VAL value and + * the hash table later on at unlock time, no atomic instruction is + * needed. */ - if (xchg(&pn->state, vcpu_running) == vcpu_halted) - pv_kick(pn->cpu); + WRITE_ONCE(l->locked, _Q_SLOW_VAL); + (void)pv_hash(lock, pn); } /* @@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) struct qspinlock **lp = NULL; int loop; + /* + * If pv_kick_node() already advanced our state, we don't need to + * insert ourselves into the hash table anymore. + */ + if (READ_ONCE(pn->state) == vcpu_hashed) + lp = (struct qspinlock **)1; + for (;;) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (!READ_ONCE(l->locked)) @@ -240,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) cpu_relax(); } - WRITE_ONCE(pn->state, vcpu_halted); if (!lp) { /* ONCE */ + WRITE_ONCE(pn->state, vcpu_hashed); lp = pv_hash(lock, pn); + /* - * lp must be set before setting _Q_SLOW_VAL + * We must hash before setting _Q_SLOW_VAL, such that + * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() + * we'll be sure to be able to observe our hash entry. * - * [S] lp = lock [RmW] l = l->locked = 0 - * MB MB - * [S] l->locked = _Q_SLOW_VAL [L] lp + * [S] pn->state + * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL + * MB RMB + * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> + * [L] pn->state * - * Matches the cmpxchg() in __pv_queued_spin_unlock(). + * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { /* @@ -287,24 +318,34 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + u8 locked; /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - if (likely(lockval == _Q_LOCKED_VAL)) + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) return; - if (unlikely(lockval != _Q_SLOW_VAL)) { - if (debug_locks_silent) - return; - WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val)); + if (unlikely(locked != _Q_SLOW_VAL)) { + WARN(!debug_locks_silent, + "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n", + (unsigned long)lock, atomic_read(&lock->val)); return; } /* + * A failed cmpxchg doesn't provide any memory-ordering guarantees, + * so we need a barrier to order the read of the node data in + * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. + * + * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + */ + smp_rmb(); + + /* * Since the above failed to release, this must be the SLOW path. * Therefore start by looking up the blocked node and unhashing it. */ @@ -319,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) /* * At this point the memory pointed at by lock can be freed/reused, * however we can still use the pv_node to kick the CPU. + * The other vCPU may not really be halted, but kicking an active + * vCPU is harmless other than the additional latency in completing + * the unlock. */ - if (READ_ONCE(node->state) == vcpu_halted) + if (READ_ONCE(node->state) == vcpu_hashed) pv_kick(node->cpu); } /* diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c deleted file mode 100644 index 1d96dd0d93c1..000000000000 --- a/kernel/locking/rtmutex-tester.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - */ -#include <linux/device.h> -#include <linux/kthread.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/spinlock.h> -#include <linux/timer.h> -#include <linux/freezer.h> -#include <linux/stat.h> - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int event; - struct device dev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - /* 9, 10 - reserved for BKL commemoration */ - RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, dev); - tid = td->dev.id; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, dev); - tsk = threads[td->dev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->dev.id].owner); - - return curr - buf; -} - -static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); -static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); - -static struct bus_type rttest_subsys = { - .name = "rttest", - .dev_name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].dev.bus = &rttest_subsys; - thread_data[id].dev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return device_register(&thread_data[id].dev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = subsys_system_register(&rttest_subsys, NULL); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_status); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5674b073473c..7781d801212f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_print_deadlock(waiter); - schedule_rt_mutex(lock); + schedule(); raw_spin_lock(&lock->wait_lock); set_current_state(state); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7844f8f0e639..4f5f83c7d2d3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -15,28 +15,6 @@ #include <linux/rtmutex.h> /* - * The rtmutex in kernel tester is independent of rtmutex debugging. We - * call schedule_rt_mutex_test() instead of schedule() for the tasks which - * belong to the tester. That way we can delay the wakeup path of those - * threads to provoke lock stealing and testing of complex boosting scenarios. - */ -#ifdef CONFIG_RT_MUTEX_TESTER - -extern void schedule_rt_mutex_test(struct rt_mutex *lock); - -#define schedule_rt_mutex(_lock) \ - do { \ - if (!(current->flags & PF_MUTEX_TESTER)) \ - schedule(); \ - else \ - schedule_rt_mutex_test(_lock); \ - } while (0) - -#else -# define schedule_rt_mutex(_lock) schedule() -#endif - -/* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * diff --git a/kernel/memremap.c b/kernel/memremap.c new file mode 100644 index 000000000000..72b0c66628b6 --- /dev/null +++ b/kernel/memremap.c @@ -0,0 +1,190 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/memory_hotplug.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: either MEMREMAP_WB or MEMREMAP_WT + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. + * + * MEMREMAP_WB - matches the default mapping for "System RAM" on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map "System RAM" with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, "System RAM"); + void *addr = NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + flags &= ~MEMREMAP_WB; + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in "System RAM" + */ + if (is_ram == REGION_INTERSECTS) + addr = __va(offset); + else + addr = ioremap_cache(offset, size); + } + + /* + * If we don't have a mapping yet and more request flags are + * pending then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * "System RAM" + */ + if (!addr && is_ram == REGION_INTERSECTS && flags) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) { + flags &= ~MEMREMAP_WT; + addr = ioremap_wt(offset, size); + } + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, + addr)); + memunmap(addr); +} +EXPORT_SYMBOL(devm_memunmap); + +#ifdef CONFIG_ZONE_DEVICE +struct page_map { + struct resource res; +}; + +static void devm_memremap_pages_release(struct device *dev, void *res) +{ + struct page_map *page_map = res; + + /* pages are dead and unused, undo the arch mapping */ + arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); +} + +void *devm_memremap_pages(struct device *dev, struct resource *res) +{ + int is_ram = region_intersects(res->start, resource_size(res), + "System RAM"); + struct page_map *page_map; + int error, nid; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, res); + return ERR_PTR(-ENXIO); + } + + if (is_ram == REGION_INTERSECTS) + return __va(res->start); + + page_map = devres_alloc(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL); + if (!page_map) + return ERR_PTR(-ENOMEM); + + memcpy(&page_map->res, res, sizeof(*res)); + + nid = dev_to_node(dev); + if (nid < 0) + nid = 0; + + error = arch_add_memory(nid, res->start, resource_size(res), true); + if (error) { + devres_free(page_map); + return ERR_PTR(error); + } + + devres_add(dev, page_map); + return __va(res->start); +} +EXPORT_SYMBOL(devm_memremap_pages); +#endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module_signing.c b/kernel/module_signing.c index be5b8fac4bd0..bd62f5cda746 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -10,11 +10,8 @@ */ #include <linux/kernel.h> -#include <linux/err.h> -#include <crypto/public_key.h> -#include <crypto/hash.h> -#include <keys/asymmetric-type.h> #include <keys/system_keyring.h> +#include <crypto/public_key.h> #include "module-internal.h" /* @@ -28,170 +25,22 @@ * - Information block */ struct module_signature { - u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum hash_algo] */ - u8 id_type; /* Key identifier type [enum pkey_id_type] */ - u8 signer_len; /* Length of signer's name */ - u8 key_id_len; /* Length of key identifier */ + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ u8 __pad[3]; __be32 sig_len; /* Length of signature data */ }; /* - * Digest the module contents. - */ -static struct public_key_signature *mod_make_digest(enum hash_algo hash, - const void *mod, - unsigned long modlen) -{ - struct public_key_signature *pks; - struct crypto_shash *tfm; - struct shash_desc *desc; - size_t digest_size, desc_size; - int ret; - - pr_devel("==>%s()\n", __func__); - - /* Allocate the hashing algorithm we're going to need and find out how - * big the hash operational data will be. - */ - tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); - if (IS_ERR(tfm)) - return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - digest_size = crypto_shash_digestsize(tfm); - - /* We allocate the hash operational data storage on the end of our - * context data and the digest output buffer on the end of that. - */ - ret = -ENOMEM; - pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); - if (!pks) - goto error_no_pks; - - pks->pkey_hash_algo = hash; - pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; - pks->digest_size = digest_size; - - desc = (void *)pks + sizeof(*pks); - desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto error; - - ret = crypto_shash_finup(desc, mod, modlen, pks->digest); - if (ret < 0) - goto error; - - crypto_free_shash(tfm); - pr_devel("<==%s() = ok\n", __func__); - return pks; - -error: - kfree(pks); -error_no_pks: - crypto_free_shash(tfm); - pr_devel("<==%s() = %d\n", __func__, ret); - return ERR_PTR(ret); -} - -/* - * Extract an MPI array from the signature data. This represents the actual - * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the - * size of the MPI in bytes. - * - * RSA signatures only have one MPI, so currently we only read one. - */ -static int mod_extract_mpi_array(struct public_key_signature *pks, - const void *data, size_t len) -{ - size_t nbytes; - MPI mpi; - - if (len < 3) - return -EBADMSG; - nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; - data += 2; - len -= 2; - if (len != nbytes) - return -EBADMSG; - - mpi = mpi_read_raw_data(data, nbytes); - if (!mpi) - return -ENOMEM; - pks->mpi[0] = mpi; - pks->nr_mpi = 1; - return 0; -} - -/* - * Request an asymmetric key. - */ -static struct key *request_asymmetric_key(const char *signer, size_t signer_len, - const u8 *key_id, size_t key_id_len) -{ - key_ref_t key; - size_t i; - char *id, *q; - - pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); - - /* Construct an identifier. */ - id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); - if (!id) - return ERR_PTR(-ENOKEY); - - memcpy(id, signer, signer_len); - - q = id + signer_len; - *q++ = ':'; - *q++ = ' '; - for (i = 0; i < key_id_len; i++) { - *q++ = hex_asc[*key_id >> 4]; - *q++ = hex_asc[*key_id++ & 0x0f]; - } - - *q = 0; - - pr_debug("Look up: \"%s\"\n", id); - - key = keyring_search(make_key_ref(system_trusted_keyring, 1), - &key_type_asymmetric, id); - if (IS_ERR(key)) - pr_warn("Request for unknown module key '%s' err %ld\n", - id, PTR_ERR(key)); - kfree(id); - - if (IS_ERR(key)) { - switch (PTR_ERR(key)) { - /* Hide some search errors */ - case -EACCES: - case -ENOTDIR: - case -EAGAIN: - return ERR_PTR(-ENOKEY); - default: - return ERR_CAST(key); - } - } - - pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); - return key_ref_to_ptr(key); -} - -/* * Verify the signature on a module. */ int mod_verify_sig(const void *mod, unsigned long *_modlen) { - struct public_key_signature *pks; struct module_signature ms; - struct key *key; - const void *sig; size_t modlen = *_modlen, sig_len; - int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) if (sig_len >= modlen) return -EBADMSG; modlen -= sig_len; - if ((size_t)ms.signer_len + ms.key_id_len >= modlen) - return -EBADMSG; - modlen -= (size_t)ms.signer_len + ms.key_id_len; - *_modlen = modlen; - sig = mod + modlen; - - /* For the moment, only support RSA and X.509 identifiers */ - if (ms.algo != PKEY_ALGO_RSA || - ms.id_type != PKEY_ID_X509) - return -ENOPKG; - if (ms.hash >= PKEY_HASH__LAST || - !hash_algo_name[ms.hash]) + if (ms.id_type != PKEY_ID_PKCS7) { + pr_err("Module is not signed with expected PKCS#7 message\n"); return -ENOPKG; - - key = request_asymmetric_key(sig, ms.signer_len, - sig + ms.signer_len, ms.key_id_len); - if (IS_ERR(key)) - return PTR_ERR(key); - - pks = mod_make_digest(ms.hash, mod, modlen); - if (IS_ERR(pks)) { - ret = PTR_ERR(pks); - goto error_put_key; } - ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, - sig_len); - if (ret < 0) - goto error_free_pks; - - ret = verify_signature(key, pks); - pr_devel("verify_signature() = %d\n", ret); + if (ms.algo != 0 || + ms.hash != 0 || + ms.signer_len != 0 || + ms.key_id_len != 0 || + ms.__pad[0] != 0 || + ms.__pad[1] != 0 || + ms.__pad[2] != 0) { + pr_err("PKCS#7 signature info has unexpected non-zero params\n"); + return -EBADMSG; + } -error_free_pks: - mpi_free(pks->rsa.s); - kfree(pks); -error_put_key: - key_put(key); - pr_devel("<==%s() = %d\n", __func__, ret); - return ret; + return system_verify_data(mod, modlen, mod + modlen, sig_len, + VERIFYING_MODULE_SIGNATURE); } diff --git a/kernel/profile.c b/kernel/profile.c index a7bcd28d6e9f..99513e1160e5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info, node = cpu_to_mem(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -543,14 +543,14 @@ static int create_hash_tables(void) int node = cpu_to_mem(cpu); struct page *page; - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c8e0e050a36a..787320de68e0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || + !config_enabled(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); diff --git a/kernel/resource.c b/kernel/resource.c index fed052a1bc9f..f150dbbe6f62 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -/* - * Search for a resouce entry that fully contains the specified region. - * If found, return 1 if it is RAM, 0 if not. - * If not found, or region is not fully contained, return -1 +/** + * region_intersects() - determine intersection of region with known resources + * @start: region start address + * @size: size of region + * @name: name of resource (in iomem_resource) * - * Used by the ioremap functions to ensure the user is not remapping RAM and is - * a vast speed up over walking through the resource table page by page. + * Check if the specified region partially overlaps or fully eclipses a + * resource identified by @name. Return REGION_DISJOINT if the region + * does not overlap @name, return REGION_MIXED if the region overlaps + * @type and another resource, and return REGION_INTERSECTS if the + * region overlaps @type and no other defined resource. Note, that + * REGION_INTERSECTS is also returned in the case when the specified + * region overlaps RAM and undefined memory holes. + * + * region_intersect() is used by memory remapping functions to ensure + * the user is not remapping RAM and is a vast speed up over walking + * through the resource table page by page. */ -int region_is_ram(resource_size_t start, unsigned long size) +int region_intersects(resource_size_t start, size_t size, const char *name) { - struct resource *p; - resource_size_t end = start + size - 1; unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - const char *name = "System RAM"; - int ret = -1; + resource_size_t end = start + size - 1; + int type = 0; int other = 0; + struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - if (p->end < start) - continue; - - if (p->start <= start && end <= p->end) { - /* resource fully contains region */ - if ((p->flags != flags) || strcmp(p->name, name)) - ret = 0; - else - ret = 1; - break; - } - if (end < p->start) - break; /* not found */ + bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + + if (start >= p->start && start <= p->end) + is_type ? type++ : other++; + if (end >= p->start && end <= p->end) + is_type ? type++ : other++; + if (p->start >= start && p->end <= end) + is_type ? type++ : other++; } read_unlock(&resource_lock); - return ret; + + if (other == 0) + return type ? REGION_INTERSECTS : REGION_DISJOINT; + + if (type) + return REGION_MIXED; + + return REGION_DISJOINT; } void __weak arch_remove_reservations(struct resource *avail) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8420c233ff7..3595403921bd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (static_key_enabled(&sched_feat_keys[i])) - static_key_slow_dec(&sched_feat_keys[i]); + static_key_disable(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!static_key_enabled(&sched_feat_keys[i])) - static_key_slow_inc(&sched_feat_keys[i]); + static_key_enable(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e02672d12..272d9322bc5d 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, + void *key) { - __wake_up_common(q, mode, 1, 0, key); + __wake_up_common(q, mode, nr, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); + __wake_up_locked_key(q, mode, 1, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 245df6b32b81..5bd4779282df 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(struct seccomp_data *sd) { - struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. */ + struct seccomp_filter *f = + lockless_dereference(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; - /* Make sure cross-thread synced filter points somewhere sane. */ - smp_read_barrier_depends(); - if (!sd) { populate_seccomp_data(&sd_local); sd = &sd_local; @@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (mode == 0) + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return; + + if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); @@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd) int this_syscall = sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current)); + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return SECCOMP_PHASE1_OK; + switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 7c434c39f02a..a818cbc73e14 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data) if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); preempt_enable(); - if (ht->cleanup) + /* cleanup must mirror setup */ + if (ht->cleanup && td->status != HP_THREAD_NONE) ht->cleanup(td->cpu, cpu_online(td->cpu)); kfree(td); return 0; @@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) { unsigned int cpu; - /* Unpark any threads that were voluntarily parked. */ - for_each_cpu_not(cpu, ht->cpumask) { - if (cpu_online(cpu)) { - struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (tsk) - kthread_unpark(tsk); - } - } - /* We need to destroy also the parked threads of offline cpus */ for_each_possible_cpu(cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); @@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * to hotplug * @plug_thread: Hotplug thread descriptor + * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. */ -int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, + const struct cpumask *cpumask) { unsigned int cpu; int ret = 0; if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpu_possible_mask); + cpumask_copy(plug_thread->cpumask, cpumask); get_online_cpus(); mutex_lock(&smpboot_threads_lock); @@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); + free_cpumask_var(plug_thread->cpumask); goto out; } - smpboot_unpark_thread(plug_thread, cpu); + if (cpumask_test_cpu(cpu, cpumask)) + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -311,7 +308,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ca7d84f438f1..03c3875d9958 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); cond_syscall(sys_memfd_create); +cond_syscall(sys_userfaultfd); /* performance counters: */ cond_syscall(sys_perf_event_open); diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S deleted file mode 100644 index 3e9868d47535..000000000000 --- a/kernel/system_certificates.S +++ /dev/null @@ -1,20 +0,0 @@ -#include <linux/export.h> -#include <linux/init.h> - - __INITRODATA - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list) -VMLINUX_SYMBOL(system_certificate_list): -__cert_list_start: - .incbin "kernel/x509_certificate_list" -__cert_list_end: - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list_size) -VMLINUX_SYMBOL(system_certificate_list_size): -#ifdef CONFIG_64BIT - .quad __cert_list_end - __cert_list_start -#else - .long __cert_list_end - __cert_list_start -#endif diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c deleted file mode 100644 index 875f64e8935b..000000000000 --- a/kernel/system_keyring.c +++ /dev/null @@ -1,106 +0,0 @@ -/* System trusted keyring for trusted public keys - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/export.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include <keys/system_keyring.h> -#include "module-internal.h" - -struct key *system_trusted_keyring; -EXPORT_SYMBOL_GPL(system_trusted_keyring); - -extern __initconst const u8 system_certificate_list[]; -extern __initconst const unsigned long system_certificate_list_size; - -/* - * Load the compiled-in keys - */ -static __init int system_trusted_keyring_init(void) -{ - pr_notice("Initialise system trusted keyring\n"); - - system_trusted_keyring = - keyring_alloc(".system_keyring", - KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(system_trusted_keyring)) - panic("Can't allocate system trusted keyring\n"); - - set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(system_trusted_keyring_init); - -/* - * Load the compiled-in list of X.509 certificates. - */ -static __init int load_system_certificate_list(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading compiled-in X.509 certificates\n"); - - p = system_certificate_list; - end = p + system_certificate_list_size; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), - "asymmetric", - NULL, - p, - plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_TRUSTED); - if (IS_ERR(key)) { - pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - } else { - set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); - pr_notice("Loaded X.509 cert '%s'\n", - key_ref_to_ptr(key)->description); - key_ref_put(key); - } - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_system_certificate_list); diff --git a/kernel/task_work.c b/kernel/task_work.c index 8727032e3a6f..53fa971d000d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * This is like the signal handler which runs in kernel mode, but it doesn't * try to wake up the @task. * + * Note: there is no ordering guarantee on works queued here. + * * RETURNS: * 0 if succeeds or -ESRCH. */ @@ -108,16 +110,6 @@ void task_work_run(void) raw_spin_unlock_wait(&task->pi_lock); smp_mb(); - /* Reverse the list to run the works in fifo order */ - head = NULL; - do { - next = work->next; - work->next = head; - head = work; - work = next; - } while (work); - - work = head; do { next = work->next; work->func(work); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb11011b5292..b0623ac785a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -630,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3..fc347f8b1bca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -400,6 +400,17 @@ struct rb_irq_work { }; /* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + +/* * Used for which event context the event is in. * NMI = 0 * IRQ = 1 @@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; @@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. - */ -static void -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) -{ - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length++; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } -static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) { - struct buffer_page *tail_page; - struct ring_buffer_event *event; - unsigned long tail, write; + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + return skip_time_extend(event); +} - /* set write to only the index of the write */ - write &= RB_WRITE_MASK; - tail = write - length; +static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; + + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. + * If we need to add a timestamp, then we + * add it to the start of the resevered space. */ - if (!tail) + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; delta = 0; + } - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} - /* We reserved something on the buffer */ +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + /* zero length can cause confusions */ + if (!length) + length++; - local_inc(&tail_page->entries); + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); /* - * If this is the first commit on the page, then update - * its timestamp. + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). */ - if (!tail) - tail_page->page->time_stamp = ts; + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; - /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + return length; +} - return event; +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; } +#endif static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, @@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } } -static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer *buffer, - struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length) +static inline void rb_event_discard(struct ring_buffer_event *event) { - struct ring_buffer_event *event; - u64 ts, delta; - int nr_loops = 0; - int add_timestamp; - u64 diff; + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); - rb_start_commit(cpu_buffer); + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - /* - * Due to the ability to swap a cpu buffer from a buffer - * it is possible it was swapped before we committed. - * (committing stops a swap). We check for it here and - * if it happened, we have to fail the write. - */ - barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { - local_dec(&cpu_buffer->committing); - local_dec(&cpu_buffer->commits); - return NULL; - } -#endif +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; - length = rb_calculate_event_length(length); - again: - add_timestamp = 0; - delta = 0; + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; /* - * We allow for interrupts to reenter here and do a trace. - * If one does, it will cause this original code to loop - * back here. Even with heavy interrupts happening, this - * should only happen a few times in a row. If this happens - * 1000 times in a row, there must be either an interrupt - * storm or we have something buggy. - * Bail! + * The event first in the commit queue updates the + * time stamp. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - goto out_fail; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} - /* make sure this diff is calculated here */ - barrier(); +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; - /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; - } + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); - if (unlikely(PTR_ERR(event) == -EAGAIN)) - goto again; - - if (!event) - goto out_fail; + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } - return event; + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - out_fail: - rb_end_commit(cpu_buffer); - return NULL; + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } /* @@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) } /** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + struct ring_buffer_event *event; + struct buffer_page *tail_page; + unsigned long tail, write; + + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; + + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; + tail = write - info->length; + + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + info->delta = 0; + + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) + return rb_move_tail(cpu_buffer, tail, info); + + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); + + /* + * If this is the first commit on the page, then update + * its timestamp. + */ + if (!tail) + tail_page->page->time_stamp = info->ts; + + /* account for these added bytes */ + local_add(info->length, &cpu_buffer->entries_bytes); + + return event; +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length) +{ + struct ring_buffer_event *event; + struct rb_event_info info; + int nr_loops = 0; + u64 diff; + + rb_start_commit(cpu_buffer); + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } +#endif + + info.length = rb_calculate_event_length(length); + again: + info.add_timestamp = 0; + info.delta = 0; + + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) + goto out_fail; + + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; + + /* make sure this diff is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); + } + + event = __rb_reserve_next(cpu_buffer, &info); + + if (unlikely(PTR_ERR(event) == -EAGAIN)) + goto again; + + if (!event) + goto out_fail; + + return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; +} + +/** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) @@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. - */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) -{ - bool pagebusy; - - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); - } - - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } - - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. - */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - rb_wakeups(buffer, cpu_buffer); - - trace_recursive_unlock(cpu_buffer); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff8743..6e79408674aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; @@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 404a372ad85a..7ca09cdc20c2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -30,6 +30,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; + field = __find_event_field(&ftrace_generic_fields, name); + if (field) + return field; + field = __find_event_field(&ftrace_common_fields, name); if (field) return field; @@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, } EXPORT_SYMBOL_GPL(trace_define_field); +#define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ + filter_type); \ + if (ret) \ + return ret; + #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ @@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; +static int trace_define_generic_fields(void) +{ + int ret; + + __generic_field(int, cpu, FILTER_OTHER); + __generic_field(char *, comm, FILTER_PTR_STRING); + + return ret; +} + static int trace_define_common_fields(void) { int ret; @@ -2671,6 +2693,9 @@ static __init int event_trace_init(void) if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d81d6f302b14..bd1bf184c5c9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* Filter predicate for CPUs. */ +static int filter_pred_cpu(struct filter_pred *pred, void *event) +{ + int cpu, cmp; + int match = 0; + + cpu = raw_smp_processor_id(); + cmp = pred->val; + + switch (pred->op) { + case OP_EQ: + match = cpu == cmp; + break; + case OP_LT: + match = cpu < cmp; + break; + case OP_LE: + match = cpu <= cmp; + break; + case OP_GT: + match = cpu > cmp; + break; + case OP_GE: + match = cpu >= cmp; + break; + default: + break; + } + + return !!match == !pred->not; +} + +/* Filter predicate for COMM. */ +static int filter_pred_comm(struct filter_pred *pred, void *event) +{ + int cmp, match; + + cmp = pred->regex.match(current->comm, &pred->regex, + pred->regex.field_len); + match = cmp ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; @@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps, if (is_string_field(field)) { filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) { + if (!strcmp(field->name, "comm")) { + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - fn = select_comparison_fn(pred->op, field->size, + if (!strcmp(field->name, "cpu")) + fn = filter_pred_cpu; + else + fn = select_comparison_fn(pred->op, field->size, field->is_signed); if (!fn) { parse_error(ps, FILT_ERR_INVALID_OP, 0); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c12..ca98445782ac 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc..8e481a84aeea 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -496,6 +496,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e9..b746399ab59c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,12 +18,6 @@ #define STACK_TRACE_ENTRIES 500 -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; @@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; */ static struct stack_trace max_stack_trace = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; static unsigned long max_stack_size; @@ -55,7 +49,7 @@ static inline void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); for (i = 0; i < max_stack_trace.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) @@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack) unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; @@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; max_stack_trace.nr_entries = 0; - - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); - /* - * Add the passed in ip from the function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. - */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack) while (i < max_stack_trace.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_dump_index[x] = this_size; p = start; for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_dump_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. */ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); max_stack_size -= tracer_frame; @@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + max_stack_trace.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); @@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. - * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. - */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f65a0a06a8c0..88fefa68c516 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; + cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6ffa43f2993..64ed1c37bd1f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include <asm/irq_regs.h> #include <linux/kvm_para.h> #include <linux/perf_event.h> +#include <linux/kthread.h> /* * The run state of the lockup detectors is controlled by the content of the @@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +/* + * The 'watchdog_running' variable is set to 1 when the watchdog threads + * are registered/started and is set to 0 when the watchdog threads are + * unregistered/stopped, so it is an indicator whether the threads exist. + */ static int __read_mostly watchdog_running; +/* + * If a subsystem has a need to deactivate the watchdog temporarily, it + * can use the suspend/resume interface to achieve this. The content of + * the 'watchdog_suspended' variable reflects this state. Existing threads + * are parked/unparked by the lockup_detector_{suspend|resume} functions + * (see comment blocks pertaining to those functions for further details). + * + * 'watchdog_suspended' also prevents threads from being registered/started + * or unregistered/stopped via parameters in /proc/sys/kernel, so the state + * of 'watchdog_running' cannot change while the watchdog is deactivated + * temporarily (see related code in 'proc' handlers). + */ +static int __read_mostly watchdog_suspended; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu) } } -void watchdog_nmi_enable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_enable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} - -void watchdog_nmi_disable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!watchdog_running) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_disable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -void watchdog_nmi_enable_all(void) {} -void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { @@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static void restart_watchdog_hrtimer(void *info) +/* + * park all watchdog threads that are specified in 'watchdog_cpumask' + */ +static int watchdog_park_threads(void) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); - int ret; + int cpu, ret = 0; + get_online_cpus(); + for_each_watchdog_cpu(cpu) { + ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); + if (ret) + break; + } + if (ret) { + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } + put_online_cpus(); + + return ret; +} + +/* + * unpark all watchdog threads that are specified in 'watchdog_cpumask' + */ +static void watchdog_unpark_threads(void) +{ + int cpu; + + get_online_cpus(); + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + put_online_cpus(); +} + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + mutex_lock(&watchdog_proc_mutex); /* - * No need to cancel and restart hrtimer if it is currently executing - * because it will reprogram itself with the new period now. - * We should never see it unqueued here because we are running per-cpu - * with interrupts disabled. + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). */ - ret = hrtimer_try_to_cancel(hrtimer); - if (ret == 1) - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + + mutex_unlock(&watchdog_proc_mutex); + + return ret; } -static void update_watchdog(int cpu) +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) { + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; /* - * Make sure that perf event counter will adopt to a new - * sampling period. Updating the sampling period directly would - * be much nicer but we do not have an API for that now so - * let's use a big hammer. - * Hrtimer will adopt the new period on the next tick but this - * might be late already so we have to restart the timer as well. + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. */ - watchdog_nmi_disable(cpu); - smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); - watchdog_nmi_enable(cpu); + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); } static void update_watchdog_all_cpus(void) { - int cpu; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - update_watchdog(cpu); - put_online_cpus(); + watchdog_park_threads(); + watchdog_unpark_threads(); } static int watchdog_enable_all_cpus(void) @@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void) int err = 0; if (!watchdog_running) { - err = smpboot_register_percpu_thread(&watchdog_threads); + err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_cpumask); if (err) pr_err("Failed to create watchdog threads, disabled\n"); - else { - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask)) - pr_err("Failed to set cpumask for watchdog threads\n"); + else watchdog_running = 1; - } } else { /* * Enable/disable the lockup detectors or @@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, int err; mutex_lock(&watchdog_proc_mutex); + + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { /* Remove impossible cpus to keep sysctl output cleaner. */ @@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, pr_err("cpumask update failed\n"); } } +out: mutex_unlock(&watchdog_proc_mutex); return err; } @@ -932,10 +975,8 @@ void __init lockup_detector_init(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { - if (!cpumask_empty(tick_nohz_full_mask)) - pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, - tick_nohz_full_mask); + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_copy(&watchdog_cpumask, housekeeping_mask); } else cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #else |