diff options
Diffstat (limited to 'kernel')
45 files changed, 1374 insertions, 632 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 5404911eaee9..86e3285ae7e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -130,3 +131,77 @@ quiet_cmd_timeconst = TIMEC $@ targets += timeconst.h $(obj)/timeconst.h: $(src)/timeconst.pl FORCE $(call if_changed,timeconst) + +ifeq ($(CONFIG_MODULE_SIG),y) +# +# Pull the signing certificate and any extra certificates into the kernel +# +extra_certificates: + touch $@ + +kernel/modsign_pubkey.o: signing_key.x509 extra_certificates + +############################################################################### +# +# If module signing is requested, say by allyesconfig, but a key has not been +# supplied, then one will need to be generated to make sure the build does not +# fail and that the kernel may be used afterwards. +# +############################################################################### +sign_key_with_hash := +ifeq ($(CONFIG_MODULE_SIG_SHA1),y) +sign_key_with_hash := -sha1 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA224),y) +sign_key_with_hash := -sha224 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA256),y) +sign_key_with_hash := -sha256 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA384),y) +sign_key_with_hash := -sha384 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA512),y) +sign_key_with_hash := -sha512 +endif +ifeq ($(sign_key_with_hash),) +$(error Could not determine digest type to use from kernel config) +endif + +signing_key.priv signing_key.x509: x509.genkey + @echo "###" + @echo "### Now generating an X.509 key pair to be used for signing modules." + @echo "###" + @echo "### If this takes a long time, you might wish to run rngd in the" + @echo "### background to keep the supply of entropy topped up. It" + @echo "### needs to be run as root, and uses a hardware random" + @echo "### number generator if one is available." + @echo "###" + openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ + -x509 -config x509.genkey \ + -outform DER -out signing_key.x509 \ + -keyout signing_key.priv + @echo "###" + @echo "### Key pair generated." + @echo "###" + +x509.genkey: + @echo Generating X.509 key generation config + @echo >x509.genkey "[ req ]" + @echo >>x509.genkey "default_bits = 4096" + @echo >>x509.genkey "distinguished_name = req_distinguished_name" + @echo >>x509.genkey "prompt = no" + @echo >>x509.genkey "string_mask = utf8only" + @echo >>x509.genkey "x509_extensions = myexts" + @echo >>x509.genkey + @echo >>x509.genkey "[ req_distinguished_name ]" + @echo >>x509.genkey "O = Magrathea" + @echo >>x509.genkey "CN = Glacier signing key" + @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2" + @echo >>x509.genkey + @echo >>x509.genkey "[ myexts ]" + @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" + @echo >>x509.genkey "keyUsage=digitalSignature" + @echo >>x509.genkey "subjectKeyIdentifier=hash" + @echo >>x509.genkey "authorityKeyIdentifier=keyid" +endif diff --git a/kernel/acct.c b/kernel/acct.c index 6cd7529c9e6a..051e071a06e7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, } } -static int acct_on(char *name) +static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt; @@ -201,7 +201,7 @@ static int acct_on(char *name) struct bsd_acct_struct *acct = NULL; /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); @@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return -EPERM; if (name) { - char *tmp = getname(name); + struct filename *tmp = getname(name); if (IS_ERR(tmp)) return (PTR_ERR(tmp)); error = acct_on(tmp); diff --git a/kernel/audit.c b/kernel/audit.c index 4d0ceede3319..40414e9143db 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1440,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link) ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); + if (!ab) + return; audit_log_format(ab, "op=%s action=denied", operation); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/audit.h b/kernel/audit.h index 9eb3d79482b6..d51cba868e1b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -74,12 +74,15 @@ static inline int audit_hash_ino(u32 ino) return (ino & (AUDIT_INODE_BUCKETS-1)); } +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + extern int audit_match_class(int class, unsigned syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); -extern int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen); +extern int parent_len(const char *path); +extern int audit_compare_dname_path(const char *dname, const char *path, int plen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, const void *payload, int size); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1c22ec3d87bc..9a9ae6e3d290 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent, /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) + if (audit_compare_dname_path(dname, owatch->path, + AUDIT_NAME_FULL)) continue; /* If the update involves invalidating rules, do the inode-based diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index c4bcdbaf4d4d..7f19f23d38a3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1298,41 +1298,60 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) } } -/* Compare given dentry name with last component in given path, - * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen) +/** + * parent_len - find the length of the parent portion of a pathname + * @path: pathname of which to determine length + */ +int parent_len(const char *path) { - int dlen, plen; + int plen; const char *p; - if (!dname || !path) - return 1; - - dlen = strlen(dname); plen = strlen(path); - if (plen < dlen) - return 1; + + if (plen == 0) + return plen; /* disregard trailing slashes */ p = path + plen - 1; while ((*p == '/') && (p > path)) p--; - /* find last path component */ - p = p - dlen + 1; - if (p < path) + /* walk backward until we find the next slash or hit beginning */ + while ((*p != '/') && (p > path)) + p--; + + /* did we find a slash? Then increment to include it in path */ + if (*p == '/') + p++; + + return p - path; +} + +/** + * audit_compare_dname_path - compare given dentry name with last component in + * given path. Return of 0 indicates a match. + * @dname: dentry name that we're comparing + * @path: full pathname that we're comparing + * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL + * here indicates that we must compute this value. + */ +int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +{ + int dlen, pathlen; + const char *p; + + dlen = strlen(dname); + pathlen = strlen(path); + if (pathlen < dlen) return 1; - else if (p > path) { - if (*--p != '/') - return 1; - else - p++; - } - /* return length of path's directory component */ - if (dirlen) - *dirlen = p - path; + parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; + if (pathlen - parentlen != dlen) + return 1; + + p = path + parentlen; + return strncmp(p, dname, dlen); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 29e090cc0e46..2f186ed80c40 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -81,9 +81,6 @@ * a name dynamically and also add those to the list anchored by names_list. */ #define AUDIT_NAMES 5 -/* Indicates that audit should log the full pathname. */ -#define AUDIT_NAME_FULL -1 - /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 @@ -106,27 +103,29 @@ struct audit_cap_data { * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). * - * Further, in fs/namei.c:path_lookup() we store the inode and device. */ + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */ struct audit_names { - struct list_head list; /* audit_context->names_list */ - const char *name; - unsigned long ino; - dev_t dev; - umode_t mode; - kuid_t uid; - kgid_t gid; - dev_t rdev; - u32 osid; - struct audit_cap_data fcap; - unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - bool name_put; /* call __putname() for this name */ + struct list_head list; /* audit_context->names_list */ + struct filename *name; + unsigned long ino; + dev_t dev; + umode_t mode; + kuid_t uid; + kgid_t gid; + dev_t rdev; + u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + unsigned char type; /* record type */ + bool name_put; /* call __putname() for this name */ /* * This was an allocated audit_names and not from the array of * names allocated in the task audit context. Thus this name * should be freed on syscall exit */ - bool should_free; + bool should_free; }; struct audit_aux_data { @@ -998,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count); list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + n->name, n->name->name ?: "(null)"); } dump_stack(); return; @@ -1151,7 +1150,6 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) const struct cred *cred; char name[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; - struct vm_area_struct *vma; char *tty; if (!ab) @@ -1191,16 +1189,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) if (mm) { down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, " exe=", - &vma->vm_file->f_path); - break; - } - vma = vma->vm_next; - } + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); up_read(&mm->mmap_sem); } audit_log_task_context(ab); @@ -1564,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, case AUDIT_NAME_FULL: /* log the full path */ audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); + audit_log_untrustedstring(ab, n->name->name); break; case 0: /* name was specified as a relative path and the @@ -1574,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, default: /* log the name's directory component */ audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, + audit_log_n_untrustedstring(ab, n->name->name, n->name_len); } } else @@ -2004,7 +1994,8 @@ retry: #endif } -static struct audit_names *audit_alloc_name(struct audit_context *context) +static struct audit_names *audit_alloc_name(struct audit_context *context, + unsigned char type) { struct audit_names *aname; @@ -2019,6 +2010,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) } aname->ino = (unsigned long)-1; + aname->type = type; list_add_tail(&aname->list, &context->names_list); context->name_count++; @@ -2029,13 +2021,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) } /** + * audit_reusename - fill out filename with info from existing entry + * @uptr: userland ptr to pathname + * + * Search the audit_names list for the current audit context. If there is an + * existing entry with a matching "uptr" then return the filename + * associated with that audit_name. If not, return NULL. + */ +struct filename * +__audit_reusename(const __user char *uptr) +{ + struct audit_context *context = current->audit_context; + struct audit_names *n; + + list_for_each_entry(n, &context->names_list, list) { + if (!n->name) + continue; + if (n->name->uptr == uptr) + return n->name; + } + return NULL; +} + +/** * audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ -void __audit_getname(const char *name) +void __audit_getname(struct filename *name) { struct audit_context *context = current->audit_context; struct audit_names *n; @@ -2049,13 +2064,19 @@ void __audit_getname(const char *name) return; } - n = audit_alloc_name(context); +#if AUDIT_DEBUG + /* The filename _must_ have a populated ->name */ + BUG_ON(!name->name); +#endif + + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; n->name = name; n->name_len = AUDIT_NAME_FULL; n->name_put = true; + name->aname = n; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); @@ -2068,7 +2089,7 @@ void __audit_getname(const char *name) * then we delay the putname until syscall exit. * Called from include/linux/fs.h:putname(). */ -void audit_putname(const char *name) +void audit_putname(struct filename *name) { struct audit_context *context = current->audit_context; @@ -2083,7 +2104,7 @@ void audit_putname(const char *name) list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + n->name, n->name->name ?: "(null)"); } #endif __putname(name); @@ -2097,8 +2118,8 @@ void audit_putname(const char *name) " put_count=%d\n", __FILE__, __LINE__, context->serial, context->major, - context->in_syscall, name, context->name_count, - context->put_count); + context->in_syscall, name->name, + context->name_count, context->put_count); dump_stack(); } } @@ -2141,13 +2162,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent } /** - * audit_inode - store the inode and device from a lookup + * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * - * Called from fs/namei.c:path_lookup(). + * @parent: does this dentry represent the parent? */ -void __audit_inode(const char *name, const struct dentry *dentry) +void __audit_inode(struct filename *name, const struct dentry *dentry, + unsigned int parent) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; @@ -2156,24 +2177,69 @@ void __audit_inode(const char *name, const struct dentry *dentry) if (!context->in_syscall) return; + if (!name) + goto out_alloc; + +#if AUDIT_DEBUG + /* The struct filename _must_ have a populated ->name */ + BUG_ON(!name->name); +#endif + /* + * If we have a pointer to an audit_names entry already, then we can + * just use it directly if the type is correct. + */ + n = name->aname; + if (n) { + if (parent) { + if (n->type == AUDIT_TYPE_PARENT || + n->type == AUDIT_TYPE_UNKNOWN) + goto out; + } else { + if (n->type != AUDIT_TYPE_PARENT) + goto out; + } + } + list_for_each_entry_reverse(n, &context->names_list, list) { - if (n->name && (n->name == name)) - goto out; + /* does the name pointer match? */ + if (!n->name || n->name->name != name->name) + continue; + + /* match the correct record type */ + if (parent) { + if (n->type == AUDIT_TYPE_PARENT || + n->type == AUDIT_TYPE_UNKNOWN) + goto out; + } else { + if (n->type != AUDIT_TYPE_PARENT) + goto out; + } } - /* unable to find the name from a previous getname() */ - n = audit_alloc_name(context); +out_alloc: + /* unable to find the name from a previous getname(). Allocate a new + * anonymous entry. + */ + n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); if (!n) return; out: + if (parent) { + n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; + n->type = AUDIT_TYPE_PARENT; + } else { + n->name_len = AUDIT_NAME_FULL; + n->type = AUDIT_TYPE_NORMAL; + } handle_path(dentry); audit_copy_inode(n, dentry, inode); } /** - * audit_inode_child - collect inode info for created/removed objects - * @dentry: dentry being audited + * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent + * @dentry: dentry being audited + * @type: AUDIT_TYPE_* value that we're looking for * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. @@ -2183,15 +2249,14 @@ out: * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const struct dentry *dentry, - const struct inode *parent) +void __audit_inode_child(const struct inode *parent, + const struct dentry *dentry, + const unsigned char type) { struct audit_context *context = current->audit_context; - const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; - struct audit_names *n; - int dirlen = 0; + struct audit_names *n, *found_parent = NULL, *found_child = NULL; if (!context->in_syscall) return; @@ -2199,62 +2264,65 @@ void __audit_inode_child(const struct dentry *dentry, if (inode) handle_one(inode); - /* parent is more likely, look for it first */ + /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name) + if (!n->name || n->type != AUDIT_TYPE_PARENT) continue; if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name, &dirlen)) { - n->name_len = dirlen; /* update parent data in place */ - found_parent = n->name; - goto add_names; + !audit_compare_dname_path(dname, n->name->name, n->name_len)) { + found_parent = n; + break; } } - /* no matching parent, look for matching child */ + /* is there a matching child entry? */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name) + /* can only match entries that have a name */ + if (!n->name || n->type != type) continue; - /* strcmp() is the more likely scenario */ - if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name, &dirlen)) { - if (inode) - audit_copy_inode(n, NULL, inode); - else - n->ino = (unsigned long)-1; - found_child = n->name; - goto add_names; + /* if we found a parent, make sure this one is a child of it */ + if (found_parent && (n->name != found_parent->name)) + continue; + + if (!strcmp(dname, n->name->name) || + !audit_compare_dname_path(dname, n->name->name, + found_parent ? + found_parent->name_len : + AUDIT_NAME_FULL)) { + found_child = n; + break; } } -add_names: if (!found_parent) { - n = audit_alloc_name(context); + /* create a new, "anonymous" parent record */ + n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; audit_copy_inode(n, NULL, parent); } if (!found_child) { - n = audit_alloc_name(context); - if (!n) + found_child = audit_alloc_name(context, type); + if (!found_child) return; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - n->name = found_parent; - n->name_len = AUDIT_NAME_FULL; + found_child->name = found_parent->name; + found_child->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - n->name_put = false; + found_child->name_put = false; } - - if (inode) - audit_copy_inode(n, NULL, inode); } + if (inode) + audit_copy_inode(found_child, dentry, inode); + else + found_child->ino = (unsigned long)-1; } EXPORT_SYMBOL_GPL(__audit_inode_child); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13774b3b39aa..f24f724620dd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1962,9 +1962,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * trading it for newcg is protected by cgroup_mutex, we're safe to drop * it here; it will be freed under RCU. */ - put_css_set(oldcg); - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + put_css_set(oldcg); } /** @@ -4815,31 +4814,20 @@ static const struct file_operations proc_cgroupstats_operations = { * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU, cgroup_mutex or - * threadgroup_change_begin(), so it might no longer be a valid - * cgroup pointer. cgroup_attach_task() might have already changed - * current->cgroups, allowing the previously referenced cgroup - * group to be removed and freed. - * - * Outside the pointer validity we also need to process the css_set - * inheritance between threadgoup_change_begin() and - * threadgoup_change_end(), this way there is no leak in any process - * wide migration performed by cgroup_attach_proc() that could otherwise - * miss a thread because it is too early or too late in the fork stage. + * it was not made under the protection of RCU or cgroup_mutex, so + * might no longer be a valid cgroup pointer. cgroup_attach_task() might + * have already changed current->cgroups, allowing the previously + * referenced cgroup group to be removed and freed. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { - /* - * We don't need to task_lock() current because current->cgroups - * can't be changed concurrently here. The parent obviously hasn't - * exited and called cgroup_exit(), and we are synchronized against - * cgroup migration through threadgroup_change_begin(). - */ + task_lock(current); child->cgroups = current->cgroups; get_css_set(child->cgroups); + task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } @@ -4895,19 +4883,10 @@ void cgroup_post_fork(struct task_struct *child) */ if (use_task_css_set_links) { write_lock(&css_set_lock); - if (list_empty(&child->cg_list)) { - /* - * It's safe to use child->cgroups without task_lock() - * here because we are protected through - * threadgroup_change_begin() against concurrent - * css_set change in cgroup_task_migrate(). Also - * the task can't exit at that point until - * wake_up_new_task() is called, so we are protected - * against cgroup_exit() setting child->cgroup to - * init_css_set. - */ + task_lock(child); + if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); - } + task_unlock(child); write_unlock(&css_set_lock); } } diff --git a/kernel/cpu.c b/kernel/cpu.c index f560598807c1..42bd331ee0ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,6 +80,10 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; mutex_lock(&cpu_hotplug.lock); + + if (WARN_ON(!cpu_hotplug.refcount)) + cpu_hotplug.refcount++; /* try to fix things up */ + if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 17e073c309e6..9a61738cefc8 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -696,6 +696,22 @@ out: return ret; } +/* + * GDB places a breakpoint at this function to know dynamically + * loaded objects. It's not defined static so that only one instance with this + * name exists in the kernel. + */ + +static int module_event(struct notifier_block *self, unsigned long val, + void *data) +{ + return 0; +} + +static struct notifier_block dbg_module_load_nb = { + .notifier_call = module_event, +}; + int kgdb_nmicallback(int cpu, void *regs) { #ifdef CONFIG_SMP @@ -824,6 +840,7 @@ static void kgdb_register_callbacks(void) kgdb_arch_init(); if (!dbg_is_early) kgdb_arch_late(); + register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_register(&panic_notifier_list, &kgdb_panic_event_nb); @@ -847,6 +864,7 @@ static void kgdb_unregister_callbacks(void) if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); + unregister_module_notifier(&dbg_module_load_nb); atomic_notifier_chain_unregister(&panic_notifier_list, &kgdb_panic_event_nb); kgdb_arch_exit(); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 07c9bbb94a0b..b03e0e814e43 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv) } /* Now the inactive tasks */ kdb_do_each_thread(g, p) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; if (task_curr(p)) continue; if (kdb_bt1(p, mask, argcount, btaprompt)) diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0a69d2adc4f3..14ff4849262c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap) { int diag; int linecount; + int colcount; int logging, saved_loglevel = 0; int saved_trap_printk; int got_printf_lock = 0; @@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap) if (diag || linecount <= 1) linecount = 24; + diag = kdbgetintenv("COLUMNS", &colcount); + if (diag || colcount <= 1) + colcount = 80; + diag = kdbgetintenv("LOGGING", &logging); if (diag) logging = 0; @@ -690,7 +695,7 @@ kdb_printit: gdbstub_msg_write(kdb_buffer, retlen); } else { if (dbg_io_ops && !dbg_io_ops->is_console) { - len = strlen(kdb_buffer); + len = retlen; cp = kdb_buffer; while (len--) { dbg_io_ops->write_char(*cp); @@ -709,11 +714,29 @@ kdb_printit: printk(KERN_INFO "%s", kdb_buffer); } - if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) - kdb_nextline++; + if (KDB_STATE(PAGER)) { + /* + * Check printed string to decide how to bump the + * kdb_nextline to control when the more prompt should + * show up. + */ + int got = 0; + len = retlen; + while (len--) { + if (kdb_buffer[len] == '\n') { + kdb_nextline++; + got = 0; + } else if (kdb_buffer[len] == '\r') { + got = 0; + } else { + got++; + } + } + kdb_nextline += got / (colcount + 1); + } /* check for having reached the LINES number of printed lines */ - if (kdb_nextline == linecount) { + if (kdb_nextline >= linecount) { char buf1[16] = ""; /* Watch out for recursion here. Any routine that calls @@ -765,7 +788,7 @@ kdb_printit: kdb_grepping_flag = 0; kdb_printf("\n"); } else if (buf1[0] == ' ') { - kdb_printf("\n"); + kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ } else if (buf1[0] == '\n') { kdb_nextline = linecount - 1; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 1261dc7eaeb9..4d5f8d5612f3 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2101,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv) } if (!lines--) break; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; kdb_printf("%.*s\n", (int)len - 1, buf); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f16f3c58f11a..dbccf83c134d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -372,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + continue; /* ensure we process each cpuctx once */ /* * perf_cgroup_events says at least one @@ -395,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode) if (mode & PERF_CGROUP_SWIN) { WARN_ON_ONCE(cpuctx->cgrp); - /* set cgrp before ctxsw in to - * allow event_filter_match() to not - * have to pass task around + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around */ cpuctx->cgrp = perf_cgroup_from_task(task); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); @@ -3671,7 +3674,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; @@ -4412,7 +4415,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); @@ -4558,7 +4561,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_comm_ctx(&cpuctx->ctx, comm_event); @@ -4754,7 +4757,7 @@ got_name: rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); @@ -5855,8 +5858,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - if (cpuctx->active_pmu == old_pmu) - cpuctx->active_pmu = pmu; + if (cpuctx->unique_pmu == old_pmu) + cpuctx->unique_pmu = pmu; } } @@ -5991,7 +5994,7 @@ skip_type: cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; INIT_LIST_HEAD(&cpuctx->rotation_list); - cpuctx->active_pmu = pmu; + cpuctx->unique_pmu = pmu; } got_cpu_context: diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28ab..5cc4e7e42e68 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -78,15 +78,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); +/* Have a copy of original instruction */ +#define UPROBE_COPY_INSN 0 +/* Dont run handlers when first register/ last unregister in progress*/ +#define UPROBE_RUN_HANDLER 1 +/* Can skip singlestep */ +#define UPROBE_SKIP_SSTEP 2 + struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; struct rw_semaphore consumer_rwsem; + struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ loff_t offset; - int flags; + unsigned long flags; struct arch_uprobe arch; }; @@ -100,17 +108,12 @@ struct uprobe { */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { - if (!vma->vm_file) - return false; - - if (!is_register) - return true; + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; - if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) - == (VM_READ|VM_EXEC)) - return true; + if (is_register) + flags |= VM_WRITE; - return false; + return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; } static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) @@ -141,10 +144,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; pte_t *ptep; int err; + /* For mmu_notifiers */ + const unsigned long mmun_start = addr; + const unsigned long mmun_end = addr + PAGE_SIZE; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) @@ -173,6 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; } @@ -188,19 +196,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } +static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +{ + void *kaddr = kmap_atomic(page); + memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); + kunmap_atomic(kaddr); +} + +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +{ + uprobe_opcode_t old_opcode; + bool is_swbp; + + copy_opcode(page, vaddr, &old_opcode); + is_swbp = is_swbp_insn(&old_opcode); + + if (is_swbp_insn(new_opcode)) { + if (is_swbp) /* register: already installed? */ + return 0; + } else { + if (!is_swbp) /* unregister: was it changed by us? */ + return 0; + } + + return 1; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify read_opcode / + * supported by that architecture then we need to modify is_swbp_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ /* * write_opcode - write the opcode at a given virtual address. - * @auprobe: arch breakpointing information. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. @@ -211,8 +244,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. */ -static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +static int write_opcode(struct mm_struct *mm, unsigned long vaddr, + uprobe_opcode_t opcode) { struct page *old_page, *new_page; void *vaddr_old, *vaddr_new; @@ -221,10 +254,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); if (ret <= 0) return ret; + ret = verify_opcode(old_page, vaddr, &opcode); + if (ret <= 0) + goto put_old; + ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) @@ -259,63 +296,6 @@ put_old: } /** - * read_opcode - read the opcode at a given virtual address. - * @mm: the probed process address space. - * @vaddr: the virtual address to read the opcode. - * @opcode: location to store the read opcode. - * - * Called with mm->mmap_sem held (for read and with a reference to - * mm. - * - * For mm @mm, read the opcode at @vaddr and store it in @opcode. - * Return 0 (success) or a negative errno. - */ -static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) -{ - struct page *page; - void *vaddr_new; - int ret; - - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); - if (ret <= 0) - return ret; - - vaddr_new = kmap_atomic(page); - vaddr &= ~PAGE_MASK; - memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); - kunmap_atomic(vaddr_new); - - put_page(page); - - return 0; -} - -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) -{ - uprobe_opcode_t opcode; - int result; - - if (current->mm == mm) { - pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); - pagefault_enable(); - - if (likely(result == 0)) - goto out; - } - - result = read_opcode(mm, vaddr, &opcode); - if (result) - return result; -out: - if (is_swbp_insn(&opcode)) - return 1; - - return 0; -} - -/** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @mm: the probed process address space. @@ -326,18 +306,7 @@ out: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - int result; - /* - * See the comment near uprobes_hash(). - */ - result = is_swbp_at_addr(mm, vaddr); - if (result == 1) - return 0; - - if (result) - return result; - - return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -352,16 +321,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - int result; - - result = is_swbp_at_addr(mm, vaddr); - if (!result) - return -EINVAL; - - if (result != 1) - return result; - - return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -468,7 +428,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) spin_unlock(&uprobes_treelock); /* For now assume that the instruction need not be single-stepped */ - uprobe->flags |= UPROBE_SKIP_SSTEP; + __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); return u; } @@ -490,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); + mutex_init(&uprobe->copy_mutex); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -510,7 +471,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - if (!(uprobe->flags & UPROBE_RUN_HANDLER)) + if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) return; down_read(&uprobe->consumer_rwsem); @@ -616,29 +577,43 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } -/* - * How mm->uprobes_state.count gets updated - * uprobe_mmap() increments the count if - * - it successfully adds a breakpoint. - * - it cannot add a breakpoint, but sees that there is a underlying - * breakpoint (via a is_swbp_at_addr()). - * - * uprobe_munmap() decrements the count if - * - it sees a underlying breakpoint, (via is_swbp_at_addr) - * (Subsequent uprobe_unregister wouldnt find the breakpoint - * unless a uprobe_mmap kicks in, since the old vma would be - * dropped just after uprobe_munmap.) - * - * uprobe_register increments the count if: - * - it successfully adds a breakpoint. - * - * uprobe_unregister decrements the count if: - * - it sees a underlying breakpoint and removes successfully. - * (via is_swbp_at_addr) - * (Subsequent uprobe_munmap wouldnt find the breakpoint - * since there is no underlying breakpoint after the - * breakpoint removal.) - */ +static int prepare_uprobe(struct uprobe *uprobe, struct file *file, + struct mm_struct *mm, unsigned long vaddr) +{ + int ret = 0; + + if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) + return ret; + + mutex_lock(&uprobe->copy_mutex); + if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) + goto out; + + ret = copy_insn(uprobe, file); + if (ret) + goto out; + + ret = -ENOTSUPP; + if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + goto out; + + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); + if (ret) + goto out; + + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + + smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ + set_bit(UPROBE_COPY_INSN, &uprobe->flags); + + out: + mutex_unlock(&uprobe->copy_mutex); + + return ret; +} + static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) @@ -656,24 +631,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (!uprobe->consumers) return 0; - if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma->vm_file); - if (ret) - return ret; - - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -ENOTSUPP; - - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); - if (ret) - return ret; - - /* write_opcode() assumes we don't cross page boundary */ - BUG_ON((uprobe->offset & ~PAGE_MASK) + - UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - - uprobe->flags |= UPROBE_COPY_INSN; - } + ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); + if (ret) + return ret; /* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), @@ -692,15 +652,15 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; } -static void +static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { /* can happen if uprobe_register() fails */ if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) - return; + return 0; set_bit(MMF_RECALC_UPROBES, &mm->flags); - set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_insn(&uprobe->arch, mm, vaddr); } /* @@ -735,7 +695,6 @@ static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; - struct prio_tree_iter iter; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; @@ -744,7 +703,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) again: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; @@ -816,7 +775,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) struct mm_struct *mm = info->mm; struct vm_area_struct *vma; - if (err) + if (err && is_register) goto free; down_write(&mm->mmap_sem); @@ -832,7 +791,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) err = install_breakpoint(uprobe, mm, vma, info->vaddr); else - remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, mm, info->vaddr); unlock: up_write(&mm->mmap_sem); @@ -889,13 +848,15 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); - if (uprobe && !consumer_add(uprobe, uc)) { + if (!uprobe) { + ret = -ENOMEM; + } else if (!consumer_add(uprobe, uc)) { ret = __uprobe_register(uprobe); if (ret) { uprobe->consumers = NULL; __uprobe_unregister(uprobe); } else { - uprobe->flags |= UPROBE_RUN_HANDLER; + set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } } @@ -928,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume if (consumer_del(uprobe, uc)) { if (!uprobe->consumers) { __uprobe_unregister(uprobe); - uprobe->flags &= ~UPROBE_RUN_HANDLER; + clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } } @@ -1389,10 +1350,11 @@ bool uprobe_deny_signal(void) */ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - - uprobe->flags &= ~UPROBE_SKIP_SSTEP; + if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) + return true; + clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); + } return false; } @@ -1415,6 +1377,30 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } +static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +{ + struct page *page; + uprobe_opcode_t opcode; + int result; + + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + + result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + if (result < 0) + return result; + + copy_opcode(page, vaddr, &opcode); + put_page(page); + out: + return is_swbp_insn(&opcode); +} + static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; @@ -1485,38 +1471,41 @@ static void handle_swbp(struct pt_regs *regs) } return; } + /* + * TODO: move copy_insn/etc into _register and remove this hack. + * After we hit the bp, _unregister + _register can install the + * new and not-yet-analyzed uprobe at the same address, restart. + */ + smp_rmb(); /* pairs with wmb() in install_breakpoint() */ + if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) + goto restart; utask = current->utask; if (!utask) { utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ if (!utask) - goto cleanup_ret; + goto restart; } - utask->active_uprobe = uprobe; + handler_chain(uprobe, regs); - if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) - goto cleanup_ret; + if (can_skip_sstep(uprobe, regs)) + goto out; - utask->state = UTASK_SSTEP; if (!pre_ssout(uprobe, regs, bp_vaddr)) { arch_uprobe_enable_step(&uprobe->arch); + utask->active_uprobe = uprobe; + utask->state = UTASK_SSTEP; return; } -cleanup_ret: - if (utask) { - utask->active_uprobe = NULL; - utask->state = UTASK_RUNNING; - } - if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) - - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); - +restart: + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. + */ + instruction_pointer_set(regs, bp_vaddr); +out: put_uprobe(uprobe); } @@ -1548,13 +1537,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) } /* - * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on - * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and - * allows the thread to return from interrupt. + * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and + * allows the thread to return from interrupt. After that handle_swbp() + * sets utask->active_uprobe. * - * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and - * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from - * interrupt. + * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag + * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). @@ -1563,11 +1551,13 @@ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; + clear_thread_flag(TIF_UPROBE); + utask = current->utask; - if (!utask || utask->state == UTASK_BP_HIT) - handle_swbp(regs); - else + if (utask && utask->active_uprobe) handle_singlestep(utask, regs); + else + handle_swbp(regs); } /* @@ -1576,17 +1566,10 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - struct uprobe_task *utask; - if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) return 0; - utask = current->utask; - if (utask) - utask->state = UTASK_BP_HIT; - set_thread_flag(TIF_UPROBE); - return 1; } diff --git a/kernel/fork.c b/kernel/fork.c index a2b1efc20928..8b20ab7d3aa2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); + if (unlikely(tmp->vm_flags & VM_NONLINEAR)) + vma_nonlinear_insert(tmp, + &mapping->i_mmap_nonlinear); + else + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -622,26 +627,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { if (new_exe_file) @@ -649,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) if (mm->exe_file) fput(mm->exe_file); mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; } struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ + /* We need mmap_sem to protect against races with removal of exe_file */ down_read(&mm->mmap_sem); exe_file = mm->exe_file; if (exe_file) @@ -1078,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_rwsem(&sig->group_rwsem); #endif - sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1602,7 +1584,7 @@ long do_fork(unsigned long clone_flags, * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ - if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1652,6 +1634,17 @@ long do_fork(unsigned long clone_flags, return nr; } +#ifdef CONFIG_GENERIC_KERNEL_THREAD +/* + * Create a kernel thread. + */ +pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, + (unsigned long)arg, NULL, NULL); +} +#endif + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 49a77727db42..4e69e24d3d7d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * @host_data: Controller private data pointer * * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. + * domain otherwise. For the legacy domain, IRQ descriptors will also + * be allocated. * * This is intended to implement the expected behaviour for most * interrupt controllers which is that a linear mapping should @@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) - return irq_domain_add_legacy(of_node, size, first_irq, 0, + if (first_irq > 0) { + int irq_base; + + if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { + /* + * Set the descriptor allocator to search for a + * 1-to-1 mapping, such as irq_alloc_desc_at(). + * Use of_node_to_nid() which is defined to + * numa_node_id() on platforms that have no custom + * implementation. + */ + irq_base = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (irq_base < 0) { + WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + first_irq); + irq_base = first_irq; + } + } else + irq_base = first_irq; + + return irq_domain_add_legacy(of_node, size, irq_base, 0, ops, host_data); - else - return irq_domain_add_linear(of_node, size, ops, host_data); + } + + /* A linear domain is the default */ + return irq_domain_add_linear(of_node, size, ops, host_data); } /** diff --git a/kernel/kexec.c b/kernel/kexec.c index 0668d58d6413..5e4bd7864c5d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,6 @@ #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <generated/utsrelease.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> diff --git a/kernel/kmod.c b/kernel/kmod.c index 6f99aead66c6..1c317e386831 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,7 @@ #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> #include <trace/events/module.h> @@ -221,11 +222,13 @@ static int ____call_usermodehelper(void *data) retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); + if (!retval) + return 0; /* Exec failed? */ fail: sub_info->retval = retval; - return 0; + do_exit(0); } static int call_helper(void *data) @@ -292,7 +295,7 @@ static int wait_for_helper(void *data) } umh_complete(sub_info); - return 0; + do_exit(0); } /* This is run by khelper thread */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 146a6fa96825..29fb60caecb5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,6 +16,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> +#include <linux/ptrace.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c new file mode 100644 index 000000000000..4646eb2c3820 --- /dev/null +++ b/kernel/modsign_pubkey.c @@ -0,0 +1,113 @@ +/* Public keys for module signature verification + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +struct key *modsign_keyring; + +extern __initdata const u8 modsign_certificate_list[]; +extern __initdata const u8 modsign_certificate_list_end[]; +asm(".section .init.data,\"aw\"\n" + "modsign_certificate_list:\n" + ".incbin \"signing_key.x509\"\n" + ".incbin \"extra_certificates\"\n" + "modsign_certificate_list_end:" + ); + +/* + * We need to make sure ccache doesn't cache the .o file as it doesn't notice + * if modsign.pub changes. + */ +static __initdata const char annoy_ccache[] = __TIME__ "foo"; + +/* + * Load the compiled-in keys + */ +static __init int module_verify_init(void) +{ + pr_notice("Initialise module verification\n"); + + modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", + KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(modsign_keyring)) + panic("Can't allocate module signing keyring\n"); + + if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0) + panic("Can't instantiate module signing keyring\n"); + + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(module_verify_init); + +/* + * Load the compiled-in keys + */ +static __init int load_module_signing_keys(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading module verification certificates\n"); + + end = modsign_certificate_list_end; + p = modsign_certificate_list; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(modsign_keyring, 1), + "asymmetric", + NULL, + p, + plen, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) + pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + else + pr_notice("MODSIGN: Loaded cert '%s'\n", + key_ref_to_ptr(key)->description); + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h new file mode 100644 index 000000000000..24f9247b7d02 --- /dev/null +++ b/kernel/module-internal.h @@ -0,0 +1,14 @@ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +extern struct key *modsign_keyring; + +extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module.c b/kernel/module.c index 4edbd9c11aca..6085f5ef88ea 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -58,6 +58,8 @@ #include <linux/jump_label.h> #include <linux/pfn.h> #include <linux/bsearch.h> +#include <linux/fips.h> +#include "module-internal.h" #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -102,6 +104,43 @@ static LIST_HEAD(modules); struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ #endif /* CONFIG_KGDB_KDB */ +#ifdef CONFIG_MODULE_SIG +#ifdef CONFIG_MODULE_SIG_FORCE +static bool sig_enforce = true; +#else +static bool sig_enforce = false; + +static int param_set_bool_enable_only(const char *val, + const struct kernel_param *kp) +{ + int err; + bool test; + struct kernel_param dummy_kp = *kp; + + dummy_kp.arg = &test; + + err = param_set_bool(val, &dummy_kp); + if (err) + return err; + + /* Don't let them unset it once it's set! */ + if (!test && sig_enforce) + return -EROFS; + + if (test) + sig_enforce = true; + return 0; +} + +static const struct kernel_param_ops param_ops_bool_enable_only = { + .set = param_set_bool_enable_only, + .get = param_get_bool, +}; +#define param_check_bool_enable_only param_check_bool + +module_param(sig_enforce, bool_enable_only, 0644); +#endif /* !CONFIG_MODULE_SIG_FORCE */ +#endif /* CONFIG_MODULE_SIG */ /* Block module loading/unloading? */ int modules_disabled = 0; @@ -136,6 +175,7 @@ struct load_info { unsigned long symoffs, stroffs; struct _ddebug *debug; unsigned int num_debug; + bool sig_ok; struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) return ret; } -int __weak apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - pr_err("module %s: REL relocation unsupported\n", me->name); - return -ENOEXEC; -} - -int __weak apply_relocate_add(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - pr_err("module %s: RELA relocation unsupported\n", me->name); - return -ENOEXEC; -} - static int apply_relocations(struct module *mod, const struct load_info *info) { unsigned int i; @@ -2399,7 +2419,44 @@ static inline void kmemleak_load_module(const struct module *mod, } #endif -/* Sets info->hdr and info->len. */ +#ifdef CONFIG_MODULE_SIG +static int module_sig_check(struct load_info *info, + const void *mod, unsigned long *_len) +{ + int err = -ENOKEY; + unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + unsigned long len = *_len; + + if (len > markerlen && + memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + /* We truncate the module to discard the signature */ + *_len -= markerlen; + err = mod_verify_sig(mod, _len); + } + + if (!err) { + info->sig_ok = true; + return 0; + } + + /* Not having a signature is only an error if we're strict. */ + if (err < 0 && fips_enabled) + panic("Module verification failed with error %d in FIPS mode\n", + err); + if (err == -ENOKEY && !sig_enforce) + err = 0; + + return err; +} +#else /* !CONFIG_MODULE_SIG */ +static int module_sig_check(struct load_info *info, + void *mod, unsigned long *len) +{ + return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +/* Sets info->hdr, info->len and info->sig_ok. */ static int copy_and_check(struct load_info *info, const void __user *umod, unsigned long len, const char __user *uargs) @@ -2419,6 +2476,10 @@ static int copy_and_check(struct load_info *info, goto free_hdr; } + err = module_sig_check(info, hdr, &len); + if (err) + goto free_hdr; + /* Sanity checks against insmoding binaries or wrong arch, weird elf version */ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 @@ -2730,6 +2791,10 @@ static int check_module_license_and_versions(struct module *mod) if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + /* lve claims to be GPL but upstream won't provide source */ + if (strcmp(mod->name, "lve") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2861,6 +2926,20 @@ static int post_relocation(struct module *mod, const struct load_info *info) return module_finalize(info->hdr, info->sechdrs, mod); } +/* Is this module of this name done loading? No locks held. */ +static bool finished_loading(const char *name) +{ + struct module *mod; + bool ret; + + mutex_lock(&module_mutex); + mod = find_module(name); + ret = !mod || mod->state != MODULE_STATE_COMING; + mutex_unlock(&module_mutex); + + return ret; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static struct module *load_module(void __user *umod, @@ -2868,7 +2947,7 @@ static struct module *load_module(void __user *umod, const char __user *uargs) { struct load_info info = { NULL, }; - struct module *mod; + struct module *mod, *old; long err; pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2886,6 +2965,12 @@ static struct module *load_module(void __user *umod, goto free_copy; } +#ifdef CONFIG_MODULE_SIG + mod->sig_ok = info.sig_ok; + if (!mod->sig_ok) + add_taint_module(mod, TAINT_FORCED_MODULE); +#endif + /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) @@ -2934,8 +3019,18 @@ static struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ +again: mutex_lock(&module_mutex); - if (find_module(mod->name)) { + if ((old = find_module(mod->name)) != NULL) { + if (old->state == MODULE_STATE_COMING) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto free_arch_cleanup; + goto again; + } err = -EEXIST; goto unlock; } @@ -2975,7 +3070,7 @@ static struct module *load_module(void __user *umod, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); module_bug_cleanup(mod); - + wake_up_all(&module_wq); ddebug: dynamic_debug_remove(info.debug); unlock: @@ -3050,7 +3145,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); free_module(mod); - wake_up(&module_wq); + wake_up_all(&module_wq); return ret; } if (ret > 0) { @@ -3062,9 +3157,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, dump_stack(); } - /* Now it's a first class citizen! Wake up anyone waiting for it. */ + /* Now it's a first class citizen! */ mod->state = MODULE_STATE_LIVE; - wake_up(&module_wq); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); @@ -3087,6 +3181,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mod->init_ro_size = 0; mod->init_text_size = 0; mutex_unlock(&module_mutex); + wake_up_all(&module_wq); return 0; } diff --git a/kernel/module_signing.c b/kernel/module_signing.c new file mode 100644 index 000000000000..ea1b1df5dbb0 --- /dev/null +++ b/kernel/module_signing.c @@ -0,0 +1,249 @@ +/* Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/err.h> +#include <crypto/public_key.h> +#include <crypto/hash.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + enum pkey_algo algo : 8; /* Public-key crypto algorithm */ + enum pkey_hash_algo hash : 8; /* Digest algorithm */ + enum pkey_id_type id_type : 8; /* Key identifier type */ + u8 signer_len; /* Length of signer's name */ + u8 key_id_len; /* Length of key identifier */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; + +/* + * Digest the module contents. + */ +static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, + const void *mod, + unsigned long modlen) +{ + struct public_key_signature *pks; + struct crypto_shash *tfm; + struct shash_desc *desc; + size_t digest_size, desc_size; + int ret; + + pr_devel("==>%s()\n", __func__); + + /* Allocate the hashing algorithm we're going to need and find out how + * big the hash operational data will be. + */ + tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + if (IS_ERR(tfm)) + return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + digest_size = crypto_shash_digestsize(tfm); + + /* We allocate the hash operational data storage on the end of our + * context data and the digest output buffer on the end of that. + */ + ret = -ENOMEM; + pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); + if (!pks) + goto error_no_pks; + + pks->pkey_hash_algo = hash; + pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; + pks->digest_size = digest_size; + + desc = (void *)pks + sizeof(*pks); + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto error; + + ret = crypto_shash_finup(desc, mod, modlen, pks->digest); + if (ret < 0) + goto error; + + crypto_free_shash(tfm); + pr_devel("<==%s() = ok\n", __func__); + return pks; + +error: + kfree(pks); +error_no_pks: + crypto_free_shash(tfm); + pr_devel("<==%s() = %d\n", __func__, ret); + return ERR_PTR(ret); +} + +/* + * Extract an MPI array from the signature data. This represents the actual + * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the + * size of the MPI in bytes. + * + * RSA signatures only have one MPI, so currently we only read one. + */ +static int mod_extract_mpi_array(struct public_key_signature *pks, + const void *data, size_t len) +{ + size_t nbytes; + MPI mpi; + + if (len < 3) + return -EBADMSG; + nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; + data += 2; + len -= 2; + if (len != nbytes) + return -EBADMSG; + + mpi = mpi_read_raw_data(data, nbytes); + if (!mpi) + return -ENOMEM; + pks->mpi[0] = mpi; + pks->nr_mpi = 1; + return 0; +} + +/* + * Request an asymmetric key. + */ +static struct key *request_asymmetric_key(const char *signer, size_t signer_len, + const u8 *key_id, size_t key_id_len) +{ + key_ref_t key; + size_t i; + char *id, *q; + + pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); + + /* Construct an identifier. */ + id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); + if (!id) + return ERR_PTR(-ENOKEY); + + memcpy(id, signer, signer_len); + + q = id + signer_len; + *q++ = ':'; + *q++ = ' '; + for (i = 0; i < key_id_len; i++) { + *q++ = hex_asc[*key_id >> 4]; + *q++ = hex_asc[*key_id++ & 0x0f]; + } + + *q = 0; + + pr_debug("Look up: \"%s\"\n", id); + + key = keyring_search(make_key_ref(modsign_keyring, 1), + &key_type_asymmetric, id); + if (IS_ERR(key)) + pr_warn("Request for unknown module key '%s' err %ld\n", + id, PTR_ERR(key)); + kfree(id); + + if (IS_ERR(key)) { + switch (PTR_ERR(key)) { + /* Hide some search errors */ + case -EACCES: + case -ENOTDIR: + case -EAGAIN: + return ERR_PTR(-ENOKEY); + default: + return ERR_CAST(key); + } + } + + pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); + return key_ref_to_ptr(key); +} + +/* + * Verify the signature on a module. + */ +int mod_verify_sig(const void *mod, unsigned long *_modlen) +{ + struct public_key_signature *pks; + struct module_signature ms; + struct key *key; + const void *sig; + size_t modlen = *_modlen, sig_len; + int ret; + + pr_devel("==>%s(,%zu)\n", __func__, modlen); + + if (modlen <= sizeof(ms)) + return -EBADMSG; + + memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); + modlen -= sizeof(ms); + + sig_len = be32_to_cpu(ms.sig_len); + if (sig_len >= modlen) + return -EBADMSG; + modlen -= sig_len; + if ((size_t)ms.signer_len + ms.key_id_len >= modlen) + return -EBADMSG; + modlen -= (size_t)ms.signer_len + ms.key_id_len; + + *_modlen = modlen; + sig = mod + modlen; + + /* For the moment, only support RSA and X.509 identifiers */ + if (ms.algo != PKEY_ALGO_RSA || + ms.id_type != PKEY_ID_X509) + return -ENOPKG; + + if (ms.hash >= PKEY_HASH__LAST || + !pkey_hash_algo[ms.hash]) + return -ENOPKG; + + key = request_asymmetric_key(sig, ms.signer_len, + sig + ms.signer_len, ms.key_id_len); + if (IS_ERR(key)) + return PTR_ERR(key); + + pks = mod_make_digest(ms.hash, mod, modlen); + if (IS_ERR(pks)) { + ret = PTR_ERR(pks); + goto error_put_key; + } + + ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, + sig_len); + if (ret < 0) + goto error_free_pks; + + ret = verify_signature(key, pks); + pr_devel("verify_signature() = %d\n", ret); + +error_free_pks: + mpi_free(pks->rsa.s); + kfree(pks); +error_put_key: + key_put(key); + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 478bad2745e3..7b07cc0dfb75 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -71,12 +71,22 @@ err_alloc: return NULL; } +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 + static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i, err = -ENOMEM; + int i; + int err; + + if (level > MAX_PID_NS_LEVEL) { + err = -EINVAL; + goto out; + } + err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out; @@ -133,19 +143,26 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old return create_pid_namespace(old_ns); } -void free_pid_ns(struct kref *kref) +static void free_pid_ns(struct kref *kref) { - struct pid_namespace *ns, *parent; + struct pid_namespace *ns; ns = container_of(kref, struct pid_namespace, kref); - - parent = ns->parent; destroy_pid_namespace(ns); +} + +void put_pid_ns(struct pid_namespace *ns) +{ + struct pid_namespace *parent; - if (parent != NULL) - put_pid_ns(parent); + while (ns != &init_pid_ns) { + parent = ns->parent; + if (!kref_put(&ns->kref, free_pid_ns)) + break; + ns = parent; + } } -EXPORT_SYMBOL_GPL(free_pid_ns); +EXPORT_SYMBOL_GPL(put_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { diff --git a/kernel/printk.c b/kernel/printk.c index 66a2ea37b576..2d607f4d1797 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1890,7 +1890,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, switch (action) { case CPU_ONLINE: case CPU_DEAD: - case CPU_DYING: case CPU_DOWN_FAILED: case CPU_UP_CANCELED: console_lock(); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4fb2376ddf06..74df86bd9204 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ } @@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ - get_online_cpus(); + mutex_lock(&rsp->onoff_mutex); /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp) cond_resched(); } - put_online_cpus(); + mutex_unlock(&rsp->onoff_mutex); return 1; } @@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ /* Exclude any attempts to start a new grace period. */ + mutex_lock(&rsp->onoff_mutex); raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ @@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) init_callback_list(rdp); /* Disallow further callbacks on this CPU. */ rdp->nxttail[RCU_NEXT_TAIL] = NULL; + mutex_unlock(&rsp->onoff_mutex); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); + /* Exclude new grace periods. */ + mutex_lock(&rsp->onoff_mutex); + /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ @@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - /* - * A new grace period might start here. If so, we won't be part - * of it, but that is OK, as we are currently in a quiescent state. - */ - - /* Exclude any attempts to start a new GP on large systems. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - /* Add CPU to rcu_node bitmasks. */ rnp = rdp->mynode; mask = rdp->grpmask; @@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); + local_irq_restore(flags); - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + mutex_unlock(&rsp->onoff_mutex); } static void __cpuinit rcu_prepare_cpu(int cpu) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5faf05d68326..a240f032848e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -394,11 +394,17 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ + /* End of fields guarded by onofflock. */ + + struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ unsigned long n_barrier_done; /* ++ at start and end of */ /* _rcu_barrier(). */ + /* End of fields guarded by barrier_mutex. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ diff --git a/kernel/resource.c b/kernel/resource.c index 34d45886ee84..73f35d4b30b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root, struct resource *parent = root; struct resource *conflict; struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *next_res = NULL; if (!res) return; @@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root, res->end = end; res->flags = IORESOURCE_BUSY; - conflict = __request_resource(parent, res); - if (!conflict) - return; + while (1) { - /* failed, split and try again */ - kfree(res); + conflict = __request_resource(parent, res); + if (!conflict) { + if (!next_res) + break; + res = next_res; + next_res = NULL; + continue; + } - /* conflict covered whole area */ - if (conflict->start <= start && conflict->end >= end) - return; + /* conflict covered whole area */ + if (conflict->start <= res->start && + conflict->end >= res->end) { + kfree(res); + WARN_ON(next_res); + break; + } + + /* failed, split and try again */ + if (conflict->start > res->start) { + end = res->end; + res->end = conflict->start - 1; + if (conflict->end < end) { + next_res = kzalloc(sizeof(*next_res), + GFP_ATOMIC); + if (!next_res) { + kfree(res); + break; + } + next_res->name = name; + next_res->start = conflict->end + 1; + next_res->end = end; + next_res->flags = IORESOURCE_BUSY; + } + } else { + res->start = conflict->end + 1; + } + } - if (conflict->start > start) - __reserve_region_with_split(root, start, conflict->start-1, name); - if (conflict->end < end) - __reserve_region_with_split(root, conflict->end+1, end, name); } void __init reserve_region_with_split(struct resource *root, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c17747236438..2d8927fda712 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,7 +505,7 @@ static inline void init_hrtick(void) #ifdef CONFIG_SMP #ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#define tsk_is_polling(t) 0 #endif void resched_task(struct task_struct *p) @@ -6122,6 +6122,17 @@ static void sched_init_numa(void) * numbers. */ + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -6176,11 +6187,68 @@ static void sched_init_numa(void) } sched_domain_topology = tl; + + sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ + int i, j; + int node = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + sched_domains_numa_masks_set(cpu); + break; + + case CPU_DEAD: + sched_domains_numa_masks_clear(cpu); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; } #else static inline void sched_init_numa(void) { } + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + return 0; +} #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -6629,6 +6697,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); + hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); diff --git a/kernel/signal.c b/kernel/signal.c index 2c681f11b7d2..0af8868525d6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/tty.h> #include <linux/binfmts.h> +#include <linux/coredump.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> @@ -2359,7 +2360,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info->si_signo, info->si_signo, regs); + do_coredump(info, regs); } /* diff --git a/kernel/sys.c b/kernel/sys.c index f9492284e5d2..e6e0ece5f6a0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + disable_nonboot_cpus(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else @@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem); * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 */ -static int override_release(char __user *release, int len) +static int override_release(char __user *release, size_t len) { int ret = 0; - char buf[65]; if (current->personality & UNAME26) { - char *rest = UTS_RELEASE; + const char *rest = UTS_RELEASE; + char buf[65] = { 0 }; int ndots = 0; unsigned v; + size_t copy; while (*rest) { if (*rest == '.' && ++ndots >= 3) @@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len) rest++; } v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; - snprintf(buf, len, "2.6.%u%s", v, rest); - ret = copy_to_user(release, buf, len); + copy = clamp_t(size_t, len, 1, sizeof(buf)); + copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, copy + 1); } return ret; } @@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void) return -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, NULL, argv_cleanup, NULL); if (ret == -ENOMEM) argv_free(argv); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 84c76a34e41c..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,10 +97,12 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int max_threads; -extern int core_uses_pid; extern int suid_dumpable; +#ifdef CONFIG_COREDUMP +extern int core_uses_pid; extern char core_pattern[]; extern unsigned int core_pipe_limit; +#endif extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_COREDUMP { .procname = "core_uses_pid", .data = &core_uses_pid, @@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -1543,8 +1549,7 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64) +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, @@ -2036,12 +2041,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, static void validate_coredump_safety(void) { +#ifdef CONFIG_COREDUMP if (suid_dumpable == SUID_DUMPABLE_SAFE && core_pattern[0] != '/' && core_pattern[0] != '|') { printk(KERN_WARNING "Unsafe core_pattern used with "\ "suid_dumpable=2. Pipe handler or fully qualified "\ "core dump path required.\n"); } +#endif } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, @@ -2053,6 +2060,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, return error; } +#ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2061,6 +2069,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, validate_coredump_safety(); return error; } +#endif static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 610f0838d555..145bb4d3bd4d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -445,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); if (na == NULL) { + nlmsg_free(rep_skb); rc = -EMSGSIZE; goto err; } diff --git a/kernel/time.c b/kernel/time.c index ba744cf80696..d226c6a3fd28 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -30,7 +30,7 @@ #include <linux/export.h> #include <linux/timex.h> #include <linux/capability.h> -#include <linux/clocksource.h> +#include <linux/timekeeper_internal.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/security.h> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fd42bd452b75..8601f0db1261 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA config GENERIC_TIME_VSYSCALL bool +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL_OLD + bool + # ktime_t scalar 64bit nsec representation config KTIME_SCALAR bool diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index aa27d391bfc8..f11d83b12949 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,7 +37,6 @@ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - struct hrtimer timer; ktime_t (*gettime)(void); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -46,6 +45,8 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +static struct wakeup_source *ws; + #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; @@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { } * @base: pointer to the base where the timer is being run * @alarm: pointer to alarm being enqueued. * - * Adds alarm to a alarm_base timerqueue and if necessary sets - * an hrtimer to run. + * Adds alarm to a alarm_base timerqueue * * Must hold base->lock when calling. */ static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) { + if (alarm->state & ALARMTIMER_STATE_ENQUEUED) + timerqueue_del(&base->timerqueue, &alarm->node); + timerqueue_add(&base->timerqueue, &alarm->node); alarm->state |= ALARMTIMER_STATE_ENQUEUED; - - if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { - hrtimer_try_to_cancel(&base->timer); - hrtimer_start(&base->timer, alarm->node.expires, - HRTIMER_MODE_ABS); - } } /** - * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue * @base: pointer to the base where the timer is running * @alarm: pointer to alarm being removed * - * Removes alarm to a alarm_base timerqueue and if necessary sets - * a new timer to run. + * Removes alarm to a alarm_base timerqueue * * Must hold base->lock when calling. */ -static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) { - struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); - if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) return; timerqueue_del(&base->timerqueue, &alarm->node); alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - if (next == &alarm->node) { - hrtimer_try_to_cancel(&base->timer); - next = timerqueue_getnext(&base->timerqueue); - if (!next) - return; - hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); - } } @@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { - struct alarm_base *base = container_of(timer, struct alarm_base, timer); - struct timerqueue_node *next; + struct alarm *alarm = container_of(timer, struct alarm, timer); + struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - ktime_t now; int ret = HRTIMER_NORESTART; int restart = ALARMTIMER_NORESTART; spin_lock_irqsave(&base->lock, flags); - now = base->gettime(); - while ((next = timerqueue_getnext(&base->timerqueue))) { - struct alarm *alarm; - ktime_t expired = next->expires; - - if (expired.tv64 > now.tv64) - break; - - alarm = container_of(next, struct alarm, node); - - timerqueue_del(&base->timerqueue, &alarm->node); - alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - alarm->state |= ALARMTIMER_STATE_CALLBACK; - spin_unlock_irqrestore(&base->lock, flags); - if (alarm->function) - restart = alarm->function(alarm, now); - spin_lock_irqsave(&base->lock, flags); - alarm->state &= ~ALARMTIMER_STATE_CALLBACK; + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); - if (restart != ALARMTIMER_NORESTART) { - timerqueue_add(&base->timerqueue, &alarm->node); - alarm->state |= ALARMTIMER_STATE_ENQUEUED; - } - } + if (alarm->function) + restart = alarm->function(alarm, base->gettime()); - if (next) { - hrtimer_set_expires(&base->timer, next->expires); + spin_lock_irqsave(&base->lock, flags); + if (restart != ALARMTIMER_NORESTART) { + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + alarmtimer_enqueue(base, alarm); ret = HRTIMER_RESTART; } spin_unlock_irqrestore(&base->lock, flags); @@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev) unsigned long flags; struct rtc_device *rtc; int i; + int ret; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; @@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev) if (min.tv64 == 0) return 0; - /* XXX - Should we enforce a minimum sleep time? */ - WARN_ON(min.tv64 < NSEC_PER_SEC); + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); @@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev) now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); - rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); - - return 0; + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + return ret; } #else static int alarmtimer_suspend(struct device *dev) @@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); + hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + alarm->timer.function = alarmtimer_fired; alarm->function = function; alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, * @alarm: ptr to alarm to set * @start: time to run the alarm */ -void alarm_start(struct alarm *alarm, ktime_t start) +int alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; + int ret; spin_lock_irqsave(&base->lock, flags); - if (alarmtimer_active(alarm)) - alarmtimer_remove(base, alarm); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); + ret = hrtimer_start(&alarm->timer, alarm->node.expires, + HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); + return ret; } /** @@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret = -1; - spin_lock_irqsave(&base->lock, flags); - - if (alarmtimer_callback_running(alarm)) - goto out; + int ret; - if (alarmtimer_is_queued(alarm)) { - alarmtimer_remove(base, alarm); - ret = 1; - } else - ret = 0; -out: + spin_lock_irqsave(&base->lock, flags); + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -802,10 +773,6 @@ static int __init alarmtimer_init(void) for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); - hrtimer_init(&alarm_bases[i].timer, - alarm_bases[i].base_clockid, - HRTIMER_MODE_ABS); - alarm_bases[i].timer.function = alarmtimer_fired; } error = alarmtimer_rtc_interface_setup(); @@ -821,6 +788,7 @@ static int __init alarmtimer_init(void) error = PTR_ERR(pdev); goto out_drv; } + ws = wakeup_source_register("alarmtimer"); return 0; out_drv: diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 46da0537c10b..6629bf7b5285 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) +#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However @@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void) { return &clocksource_jiffies; } + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ + u64 nsec_per_tick, shift_hz; + long cycles_per_tick; + + + + refined_jiffies = clocksource_jiffies; + refined_jiffies.name = "refined-jiffies"; + refined_jiffies.rating++; + + /* Calc cycles per tick */ + cycles_per_tick = (cycles_per_second + HZ/2)/HZ; + /* shift_hz stores hz<<8 for extra accuracy */ + shift_hz = (u64)cycles_per_second << 8; + shift_hz += cycles_per_tick/2; + do_div(shift_hz, cycles_per_tick); + /* Calculate nsec_per_tick using shift_hz */ + nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick += (u32)shift_hz/2; + do_div(nsec_per_tick, (u32)shift_hz); + + refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + + clocksource_register(&refined_jiffies); + return 0; +} diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f423bdd035c2..a40260885265 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - if (idle_cpu(cpu)) + if (is_idle_task(current)) ts->idle_jiffies++; } update_process_times(user_mode(regs)); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5ce06a3fa91e..e424970bb562 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -8,6 +8,7 @@ * */ +#include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/percpu.h> @@ -21,61 +22,6 @@ #include <linux/tick.h> #include <linux/stop_machine.h> -/* Structure holding internal timekeeping values. */ -struct timekeeper { - /* Current clocksource used for timekeeping. */ - struct clocksource *clock; - /* NTP adjusted clock multiplier */ - u32 mult; - /* The shift value of the current clocksource. */ - u32 shift; - /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; - /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; - /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; - /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; - - /* Current CLOCK_REALTIME time in seconds */ - u64 xtime_sec; - /* Clock shifted nano seconds */ - u64 xtime_nsec; - - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. */ - s64 ntp_error; - /* Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. */ - u32 ntp_error_shift; - - /* - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the - * monotonic time not to jump. We need to add total_sleep_time to - * wall_to_monotonic to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. - */ - struct timespec wall_to_monotonic; - /* Offset clock monotonic -> clock realtime */ - ktime_t offs_real; - /* time spent in suspend */ - struct timespec total_sleep_time; - /* Offset clock monotonic -> clock boottime */ - ktime_t offs_boot; - /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; - /* Seqlock for all timekeeper values */ - seqlock_t lock; -}; static struct timekeeper timekeeper; @@ -96,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) } } -static struct timespec tk_xtime(struct timekeeper *tk) -{ - struct timespec ts; - - ts.tv_sec = tk->xtime_sec; - ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); - return ts; -} - static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec = ts->tv_sec; @@ -246,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { - struct timespec xt; - if (clearntp) { tk->ntp_error = 0; ntp_clear(); } - xt = tk_xtime(tk); - update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); + update_vsyscall(tk); } /** @@ -1113,7 +1047,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = tk->raw_interval << shift; + raw_nsecs = (u64)tk->raw_interval << shift; raw_nsecs += tk->raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; @@ -1130,6 +1064,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, return offset; } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. + */ + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1ULL << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + +} +#else +#define old_vsyscall_fixup(tk) +#endif + + + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -1141,7 +1102,6 @@ static void update_wall_time(void) cycle_t offset; int shift = 0, maxshift; unsigned long flags; - s64 remainder; write_seqlock_irqsave(&tk->lock, flags); @@ -1183,20 +1143,11 @@ static void update_wall_time(void) /* correct the clock when NTP error is too big */ timekeeping_adjust(tk, offset); - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), this can be killed. - */ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; - tk->ntp_error += remainder << tk->ntp_error_shift; + * XXX This can be killed once everyone converts + * to the new update_vsyscall. + */ + old_vsyscall_fixup(tk); /* * Finally, make sure that after the rounding diff --git a/kernel/timer.c b/kernel/timer.c index d5de1b2292aa..367d00858482 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64); #define TVR_SIZE (1 << TVR_BITS) #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { struct list_head vec[TVN_SIZE]; @@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); } else { int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: + /* If the timeout is larger than MAX_TVAL (on 64-bit + * architectures or with CONFIG_BASE_SMALL=1) then we + * use the maximum timeout. */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; + if (idx > MAX_TVAL) { + idx = MAX_TVAL; expires = idx + base->timer_jiffies; } i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b32ed0e385a5..b979426d16c6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1567,6 +1567,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, put_online_cpus(); } else { + /* Make sure this CPU has been intitialized */ + if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) + goto out; + cpu_buffer = buffer->buffers[cpu_id]; if (nr_pages == cpu_buffer->nr_pages) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cdcb59450b49..31e4f55773f1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4200,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -4221,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = buffer_pipe_buf_steal, + .steal = generic_pipe_buf_steal, .get = buffer_pipe_buf_get, }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 483162a9f908..507a7a9630bf 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,7 +13,6 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <linux/pstore.h> #include <linux/fs.h> #include "trace.h" @@ -76,10 +75,9 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); } -/* Our two options */ +/* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, - TRACE_FUNC_OPT_PSTORE = 0x2, }; static struct tracer_flags func_flags; @@ -109,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { - /* - * So far tracing doesn't support multiple buffers, so - * we make an explicit call for now. - */ - if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) - pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -181,9 +173,6 @@ static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif -#ifdef CONFIG_PSTORE_FTRACE - { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, -#endif { } /* Always set a last empty entry */ }; @@ -236,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) } break; - case TRACE_FUNC_OPT_PSTORE: - break; default: return -EINVAL; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d951daa0ca9a..042d221d33cc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2982,7 +2982,7 @@ bool cancel_delayed_work(struct delayed_work *dwork) set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); local_irq_restore(flags); - return true; + return ret; } EXPORT_SYMBOL(cancel_delayed_work); |