summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile75
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/audit.h7
-rw-r--r--kernel/audit_watch.c3
-rw-r--r--kernel/auditfilter.c65
-rw-r--r--kernel/auditsc.c230
-rw-r--r--kernel/cgroup.c41
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/debug/debug_core.c18
-rw-r--r--kernel/debug/kdb/kdb_bt.c2
-rw-r--r--kernel/debug/kdb/kdb_io.c33
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/events/core.c23
-rw-r--r--kernel/events/uprobes.c353
-rw-r--r--kernel/fork.c45
-rw-r--r--kernel/irq/irqdomain.c33
-rw-r--r--kernel/kexec.c1
-rw-r--r--kernel/kmod.c7
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/modsign_pubkey.c113
-rw-r--r--kernel/module-internal.h14
-rw-r--r--kernel/module.c149
-rw-r--r--kernel/module_signing.c249
-rw-r--r--kernel/pid_namespace.c33
-rw-r--r--kernel/printk.c1
-rw-r--r--kernel/rcutree.c21
-rw-r--r--kernel/rcutree.h6
-rw-r--r--kernel/resource.c50
-rw-r--r--kernel/sched/core.c71
-rw-r--r--kernel/signal.c3
-rw-r--r--kernel/sys.c15
-rw-r--r--kernel/sysctl.c15
-rw-r--r--kernel/taskstats.c1
-rw-r--r--kernel/time.c2
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/alarmtimer.c118
-rw-r--r--kernel/time/jiffies.c32
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timekeeping.c117
-rw-r--r--kernel/timer.c10
-rw-r--r--kernel/trace/ring_buffer.c4
-rw-r--r--kernel/trace/trace.c8
-rw-r--r--kernel/trace/trace_functions.c15
-rw-r--r--kernel/workqueue.c2
45 files changed, 1374 insertions, 632 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 5404911eaee9..86e3285ae7e5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
@@ -130,3 +131,77 @@ quiet_cmd_timeconst = TIMEC $@
targets += timeconst.h
$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
$(call if_changed,timeconst)
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+#
+# Pull the signing certificate and any extra certificates into the kernel
+#
+extra_certificates:
+ touch $@
+
+kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
+
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+sign_key_with_hash :=
+ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
+sign_key_with_hash := -sha1
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
+sign_key_with_hash := -sha224
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
+sign_key_with_hash := -sha256
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
+sign_key_with_hash := -sha384
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
+sign_key_with_hash := -sha512
+endif
+ifeq ($(sign_key_with_hash),)
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+ @echo "###"
+ @echo "### Now generating an X.509 key pair to be used for signing modules."
+ @echo "###"
+ @echo "### If this takes a long time, you might wish to run rngd in the"
+ @echo "### background to keep the supply of entropy topped up. It"
+ @echo "### needs to be run as root, and uses a hardware random"
+ @echo "### number generator if one is available."
+ @echo "###"
+ openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
+ -x509 -config x509.genkey \
+ -outform DER -out signing_key.x509 \
+ -keyout signing_key.priv
+ @echo "###"
+ @echo "### Key pair generated."
+ @echo "###"
+
+x509.genkey:
+ @echo Generating X.509 key generation config
+ @echo >x509.genkey "[ req ]"
+ @echo >>x509.genkey "default_bits = 4096"
+ @echo >>x509.genkey "distinguished_name = req_distinguished_name"
+ @echo >>x509.genkey "prompt = no"
+ @echo >>x509.genkey "string_mask = utf8only"
+ @echo >>x509.genkey "x509_extensions = myexts"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ req_distinguished_name ]"
+ @echo >>x509.genkey "O = Magrathea"
+ @echo >>x509.genkey "CN = Glacier signing key"
+ @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ myexts ]"
+ @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+ @echo >>x509.genkey "keyUsage=digitalSignature"
+ @echo >>x509.genkey "subjectKeyIdentifier=hash"
+ @echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 6cd7529c9e6a..051e071a06e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
}
}
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt;
@@ -201,7 +201,7 @@ static int acct_on(char *name)
struct bsd_acct_struct *acct = NULL;
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
return -EPERM;
if (name) {
- char *tmp = getname(name);
+ struct filename *tmp = getname(name);
if (IS_ERR(tmp))
return (PTR_ERR(tmp));
error = acct_on(tmp);
diff --git a/kernel/audit.c b/kernel/audit.c
index 4d0ceede3319..40414e9143db 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1440,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link)
ab = audit_log_start(current->audit_context, GFP_KERNEL,
AUDIT_ANOM_LINK);
+ if (!ab)
+ return;
audit_log_format(ab, "op=%s action=denied", operation);
audit_log_format(ab, " pid=%d comm=", current->pid);
audit_log_untrustedstring(ab, current->comm);
diff --git a/kernel/audit.h b/kernel/audit.h
index 9eb3d79482b6..d51cba868e1b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,12 +74,15 @@ static inline int audit_hash_ino(u32 ino)
return (ino & (AUDIT_INODE_BUCKETS-1));
}
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
-extern int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
int done, int multi,
const void *payload, int size);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 1c22ec3d87bc..9a9ae6e3d290 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent,
/* Run all of the watches on this parent looking for the one that
* matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
- if (audit_compare_dname_path(dname, owatch->path, NULL))
+ if (audit_compare_dname_path(dname, owatch->path,
+ AUDIT_NAME_FULL))
continue;
/* If the update involves invalidating rules, do the inode-based
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index c4bcdbaf4d4d..7f19f23d38a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1298,41 +1298,60 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
}
}
-/* Compare given dentry name with last component in given path,
- * return of 0 indicates a match. */
-int audit_compare_dname_path(const char *dname, const char *path,
- int *dirlen)
+/**
+ * parent_len - find the length of the parent portion of a pathname
+ * @path: pathname of which to determine length
+ */
+int parent_len(const char *path)
{
- int dlen, plen;
+ int plen;
const char *p;
- if (!dname || !path)
- return 1;
-
- dlen = strlen(dname);
plen = strlen(path);
- if (plen < dlen)
- return 1;
+
+ if (plen == 0)
+ return plen;
/* disregard trailing slashes */
p = path + plen - 1;
while ((*p == '/') && (p > path))
p--;
- /* find last path component */
- p = p - dlen + 1;
- if (p < path)
+ /* walk backward until we find the next slash or hit beginning */
+ while ((*p != '/') && (p > path))
+ p--;
+
+ /* did we find a slash? Then increment to include it in path */
+ if (*p == '/')
+ p++;
+
+ return p - path;
+}
+
+/**
+ * audit_compare_dname_path - compare given dentry name with last component in
+ * given path. Return of 0 indicates a match.
+ * @dname: dentry name that we're comparing
+ * @path: full pathname that we're comparing
+ * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
+ * here indicates that we must compute this value.
+ */
+int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+{
+ int dlen, pathlen;
+ const char *p;
+
+ dlen = strlen(dname);
+ pathlen = strlen(path);
+ if (pathlen < dlen)
return 1;
- else if (p > path) {
- if (*--p != '/')
- return 1;
- else
- p++;
- }
- /* return length of path's directory component */
- if (dirlen)
- *dirlen = p - path;
+ parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
+ if (pathlen - parentlen != dlen)
+ return 1;
+
+ p = path + parentlen;
+
return strncmp(p, dname, dlen);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 29e090cc0e46..2f186ed80c40 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -81,9 +81,6 @@
* a name dynamically and also add those to the list anchored by names_list. */
#define AUDIT_NAMES 5
-/* Indicates that audit should log the full pathname. */
-#define AUDIT_NAME_FULL -1
-
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
@@ -106,27 +103,29 @@ struct audit_cap_data {
* we don't let putname() free it (instead we free all of the saved
* pointers at syscall exit time).
*
- * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+ * Further, in fs/namei.c:path_lookup() we store the inode and device.
+ */
struct audit_names {
- struct list_head list; /* audit_context->names_list */
- const char *name;
- unsigned long ino;
- dev_t dev;
- umode_t mode;
- kuid_t uid;
- kgid_t gid;
- dev_t rdev;
- u32 osid;
- struct audit_cap_data fcap;
- unsigned int fcap_ver;
- int name_len; /* number of name's characters to log */
- bool name_put; /* call __putname() for this name */
+ struct list_head list; /* audit_context->names_list */
+ struct filename *name;
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+ dev_t rdev;
+ u32 osid;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ int name_len; /* number of name's characters to log */
+ unsigned char type; /* record type */
+ bool name_put; /* call __putname() for this name */
/*
* This was an allocated audit_names and not from the array of
* names allocated in the task audit context. Thus this name
* should be freed on syscall exit
*/
- bool should_free;
+ bool should_free;
};
struct audit_aux_data {
@@ -998,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context)
context->ino_count);
list_for_each_entry(n, &context->names_list, list) {
printk(KERN_ERR "names[%d] = %p = %s\n", i,
- n->name, n->name ?: "(null)");
+ n->name, n->name->name ?: "(null)");
}
dump_stack();
return;
@@ -1151,7 +1150,6 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
const struct cred *cred;
char name[sizeof(tsk->comm)];
struct mm_struct *mm = tsk->mm;
- struct vm_area_struct *vma;
char *tty;
if (!ab)
@@ -1191,16 +1189,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
if (mm) {
down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
- if ((vma->vm_flags & VM_EXECUTABLE) &&
- vma->vm_file) {
- audit_log_d_path(ab, " exe=",
- &vma->vm_file->f_path);
- break;
- }
- vma = vma->vm_next;
- }
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
up_read(&mm->mmap_sem);
}
audit_log_task_context(ab);
@@ -1564,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
case AUDIT_NAME_FULL:
/* log the full path */
audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name);
+ audit_log_untrustedstring(ab, n->name->name);
break;
case 0:
/* name was specified as a relative path and the
@@ -1574,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
default:
/* log the name's directory component */
audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name,
+ audit_log_n_untrustedstring(ab, n->name->name,
n->name_len);
}
} else
@@ -2004,7 +1994,8 @@ retry:
#endif
}
-static struct audit_names *audit_alloc_name(struct audit_context *context)
+static struct audit_names *audit_alloc_name(struct audit_context *context,
+ unsigned char type)
{
struct audit_names *aname;
@@ -2019,6 +2010,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
}
aname->ino = (unsigned long)-1;
+ aname->type = type;
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
@@ -2029,13 +2021,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)
}
/**
+ * audit_reusename - fill out filename with info from existing entry
+ * @uptr: userland ptr to pathname
+ *
+ * Search the audit_names list for the current audit context. If there is an
+ * existing entry with a matching "uptr" then return the filename
+ * associated with that audit_name. If not, return NULL.
+ */
+struct filename *
+__audit_reusename(const __user char *uptr)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_names *n;
+
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name)
+ continue;
+ if (n->name->uptr == uptr)
+ return n->name;
+ }
+ return NULL;
+}
+
+/**
* audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
* Called from fs/namei.c:getname().
*/
-void __audit_getname(const char *name)
+void __audit_getname(struct filename *name)
{
struct audit_context *context = current->audit_context;
struct audit_names *n;
@@ -2049,13 +2064,19 @@ void __audit_getname(const char *name)
return;
}
- n = audit_alloc_name(context);
+#if AUDIT_DEBUG
+ /* The filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
n->name = name;
n->name_len = AUDIT_NAME_FULL;
n->name_put = true;
+ name->aname = n;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
@@ -2068,7 +2089,7 @@ void __audit_getname(const char *name)
* then we delay the putname until syscall exit.
* Called from include/linux/fs.h:putname().
*/
-void audit_putname(const char *name)
+void audit_putname(struct filename *name)
{
struct audit_context *context = current->audit_context;
@@ -2083,7 +2104,7 @@ void audit_putname(const char *name)
list_for_each_entry(n, &context->names_list, list)
printk(KERN_ERR "name[%d] = %p = %s\n", i,
- n->name, n->name ?: "(null)");
+ n->name, n->name->name ?: "(null)");
}
#endif
__putname(name);
@@ -2097,8 +2118,8 @@ void audit_putname(const char *name)
" put_count=%d\n",
__FILE__, __LINE__,
context->serial, context->major,
- context->in_syscall, name, context->name_count,
- context->put_count);
+ context->in_syscall, name->name,
+ context->name_count, context->put_count);
dump_stack();
}
}
@@ -2141,13 +2162,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
}
/**
- * audit_inode - store the inode and device from a lookup
+ * __audit_inode - store the inode and device from a lookup
* @name: name being audited
* @dentry: dentry being audited
- *
- * Called from fs/namei.c:path_lookup().
+ * @parent: does this dentry represent the parent?
*/
-void __audit_inode(const char *name, const struct dentry *dentry)
+void __audit_inode(struct filename *name, const struct dentry *dentry,
+ unsigned int parent)
{
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
@@ -2156,24 +2177,69 @@ void __audit_inode(const char *name, const struct dentry *dentry)
if (!context->in_syscall)
return;
+ if (!name)
+ goto out_alloc;
+
+#if AUDIT_DEBUG
+ /* The struct filename _must_ have a populated ->name */
+ BUG_ON(!name->name);
+#endif
+ /*
+ * If we have a pointer to an audit_names entry already, then we can
+ * just use it directly if the type is correct.
+ */
+ n = name->aname;
+ if (n) {
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
list_for_each_entry_reverse(n, &context->names_list, list) {
- if (n->name && (n->name == name))
- goto out;
+ /* does the name pointer match? */
+ if (!n->name || n->name->name != name->name)
+ continue;
+
+ /* match the correct record type */
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
}
- /* unable to find the name from a previous getname() */
- n = audit_alloc_name(context);
+out_alloc:
+ /* unable to find the name from a previous getname(). Allocate a new
+ * anonymous entry.
+ */
+ n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
if (!n)
return;
out:
+ if (parent) {
+ n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_PARENT;
+ } else {
+ n->name_len = AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_NORMAL;
+ }
handle_path(dentry);
audit_copy_inode(n, dentry, inode);
}
/**
- * audit_inode_child - collect inode info for created/removed objects
- * @dentry: dentry being audited
+ * __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
+ * @dentry: dentry being audited
+ * @type: AUDIT_TYPE_* value that we're looking for
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
@@ -2183,15 +2249,14 @@ out:
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct dentry *dentry,
- const struct inode *parent)
+void __audit_inode_child(const struct inode *parent,
+ const struct dentry *dentry,
+ const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const char *found_parent = NULL, *found_child = NULL;
const struct inode *inode = dentry->d_inode;
const char *dname = dentry->d_name.name;
- struct audit_names *n;
- int dirlen = 0;
+ struct audit_names *n, *found_parent = NULL, *found_child = NULL;
if (!context->in_syscall)
return;
@@ -2199,62 +2264,65 @@ void __audit_inode_child(const struct dentry *dentry,
if (inode)
handle_one(inode);
- /* parent is more likely, look for it first */
+ /* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
+ if (!n->name || n->type != AUDIT_TYPE_PARENT)
continue;
if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- n->name_len = dirlen; /* update parent data in place */
- found_parent = n->name;
- goto add_names;
+ !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ found_parent = n;
+ break;
}
}
- /* no matching parent, look for matching child */
+ /* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name)
+ /* can only match entries that have a name */
+ if (!n->name || n->type != type)
continue;
- /* strcmp() is the more likely scenario */
- if (!strcmp(dname, n->name) ||
- !audit_compare_dname_path(dname, n->name, &dirlen)) {
- if (inode)
- audit_copy_inode(n, NULL, inode);
- else
- n->ino = (unsigned long)-1;
- found_child = n->name;
- goto add_names;
+ /* if we found a parent, make sure this one is a child of it */
+ if (found_parent && (n->name != found_parent->name))
+ continue;
+
+ if (!strcmp(dname, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
+ found_parent ?
+ found_parent->name_len :
+ AUDIT_NAME_FULL)) {
+ found_child = n;
+ break;
}
}
-add_names:
if (!found_parent) {
- n = audit_alloc_name(context);
+ /* create a new, "anonymous" parent record */
+ n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
audit_copy_inode(n, NULL, parent);
}
if (!found_child) {
- n = audit_alloc_name(context);
- if (!n)
+ found_child = audit_alloc_name(context, type);
+ if (!found_child)
return;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
- n->name = found_parent;
- n->name_len = AUDIT_NAME_FULL;
+ found_child->name = found_parent->name;
+ found_child->name_len = AUDIT_NAME_FULL;
/* don't call __putname() */
- n->name_put = false;
+ found_child->name_put = false;
}
-
- if (inode)
- audit_copy_inode(n, NULL, inode);
}
+ if (inode)
+ audit_copy_inode(found_child, dentry, inode);
+ else
+ found_child->ino = (unsigned long)-1;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13774b3b39aa..f24f724620dd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1962,9 +1962,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
* trading it for newcg is protected by cgroup_mutex, we're safe to drop
* it here; it will be freed under RCU.
*/
- put_css_set(oldcg);
-
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ put_css_set(oldcg);
}
/**
@@ -4815,31 +4814,20 @@ static const struct file_operations proc_cgroupstats_operations = {
*
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU, cgroup_mutex or
- * threadgroup_change_begin(), so it might no longer be a valid
- * cgroup pointer. cgroup_attach_task() might have already changed
- * current->cgroups, allowing the previously referenced cgroup
- * group to be removed and freed.
- *
- * Outside the pointer validity we also need to process the css_set
- * inheritance between threadgoup_change_begin() and
- * threadgoup_change_end(), this way there is no leak in any process
- * wide migration performed by cgroup_attach_proc() that could otherwise
- * miss a thread because it is too early or too late in the fork stage.
+ * it was not made under the protection of RCU or cgroup_mutex, so
+ * might no longer be a valid cgroup pointer. cgroup_attach_task() might
+ * have already changed current->cgroups, allowing the previously
+ * referenced cgroup group to be removed and freed.
*
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
void cgroup_fork(struct task_struct *child)
{
- /*
- * We don't need to task_lock() current because current->cgroups
- * can't be changed concurrently here. The parent obviously hasn't
- * exited and called cgroup_exit(), and we are synchronized against
- * cgroup migration through threadgroup_change_begin().
- */
+ task_lock(current);
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
+ task_unlock(current);
INIT_LIST_HEAD(&child->cg_list);
}
@@ -4895,19 +4883,10 @@ void cgroup_post_fork(struct task_struct *child)
*/
if (use_task_css_set_links) {
write_lock(&css_set_lock);
- if (list_empty(&child->cg_list)) {
- /*
- * It's safe to use child->cgroups without task_lock()
- * here because we are protected through
- * threadgroup_change_begin() against concurrent
- * css_set change in cgroup_task_migrate(). Also
- * the task can't exit at that point until
- * wake_up_new_task() is called, so we are protected
- * against cgroup_exit() setting child->cgroup to
- * init_css_set.
- */
+ task_lock(child);
+ if (list_empty(&child->cg_list))
list_add(&child->cg_list, &child->cgroups->tasks);
- }
+ task_unlock(child);
write_unlock(&css_set_lock);
}
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f560598807c1..42bd331ee0ab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,6 +80,10 @@ void put_online_cpus(void)
if (cpu_hotplug.active_writer == current)
return;
mutex_lock(&cpu_hotplug.lock);
+
+ if (WARN_ON(!cpu_hotplug.refcount))
+ cpu_hotplug.refcount++; /* try to fix things up */
+
if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
wake_up_process(cpu_hotplug.active_writer);
mutex_unlock(&cpu_hotplug.lock);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 17e073c309e6..9a61738cefc8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -696,6 +696,22 @@ out:
return ret;
}
+/*
+ * GDB places a breakpoint at this function to know dynamically
+ * loaded objects. It's not defined static so that only one instance with this
+ * name exists in the kernel.
+ */
+
+static int module_event(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ return 0;
+}
+
+static struct notifier_block dbg_module_load_nb = {
+ .notifier_call = module_event,
+};
+
int kgdb_nmicallback(int cpu, void *regs)
{
#ifdef CONFIG_SMP
@@ -824,6 +840,7 @@ static void kgdb_register_callbacks(void)
kgdb_arch_init();
if (!dbg_is_early)
kgdb_arch_late();
+ register_module_notifier(&dbg_module_load_nb);
register_reboot_notifier(&dbg_reboot_notifier);
atomic_notifier_chain_register(&panic_notifier_list,
&kgdb_panic_event_nb);
@@ -847,6 +864,7 @@ static void kgdb_unregister_callbacks(void)
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
unregister_reboot_notifier(&dbg_reboot_notifier);
+ unregister_module_notifier(&dbg_module_load_nb);
atomic_notifier_chain_unregister(&panic_notifier_list,
&kgdb_panic_event_nb);
kgdb_arch_exit();
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 07c9bbb94a0b..b03e0e814e43 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv)
}
/* Now the inactive tasks */
kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
if (task_curr(p))
continue;
if (kdb_bt1(p, mask, argcount, btaprompt))
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 0a69d2adc4f3..14ff4849262c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)
{
int diag;
int linecount;
+ int colcount;
int logging, saved_loglevel = 0;
int saved_trap_printk;
int got_printf_lock = 0;
@@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)
if (diag || linecount <= 1)
linecount = 24;
+ diag = kdbgetintenv("COLUMNS", &colcount);
+ if (diag || colcount <= 1)
+ colcount = 80;
+
diag = kdbgetintenv("LOGGING", &logging);
if (diag)
logging = 0;
@@ -690,7 +695,7 @@ kdb_printit:
gdbstub_msg_write(kdb_buffer, retlen);
} else {
if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = strlen(kdb_buffer);
+ len = retlen;
cp = kdb_buffer;
while (len--) {
dbg_io_ops->write_char(*cp);
@@ -709,11 +714,29 @@ kdb_printit:
printk(KERN_INFO "%s", kdb_buffer);
}
- if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
- kdb_nextline++;
+ if (KDB_STATE(PAGER)) {
+ /*
+ * Check printed string to decide how to bump the
+ * kdb_nextline to control when the more prompt should
+ * show up.
+ */
+ int got = 0;
+ len = retlen;
+ while (len--) {
+ if (kdb_buffer[len] == '\n') {
+ kdb_nextline++;
+ got = 0;
+ } else if (kdb_buffer[len] == '\r') {
+ got = 0;
+ } else {
+ got++;
+ }
+ }
+ kdb_nextline += got / (colcount + 1);
+ }
/* check for having reached the LINES number of printed lines */
- if (kdb_nextline == linecount) {
+ if (kdb_nextline >= linecount) {
char buf1[16] = "";
/* Watch out for recursion here. Any routine that calls
@@ -765,7 +788,7 @@ kdb_printit:
kdb_grepping_flag = 0;
kdb_printf("\n");
} else if (buf1[0] == ' ') {
- kdb_printf("\n");
+ kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
} else if (buf1[0] == '\n') {
kdb_nextline = linecount - 1;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 1261dc7eaeb9..4d5f8d5612f3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2101,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv)
}
if (!lines--)
break;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
kdb_printf("%.*s\n", (int)len - 1, buf);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f16f3c58f11a..dbccf83c134d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -372,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ continue; /* ensure we process each cpuctx once */
/*
* perf_cgroup_events says at least one
@@ -395,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
if (mode & PERF_CGROUP_SWIN) {
WARN_ON_ONCE(cpuctx->cgrp);
- /* set cgrp before ctxsw in to
- * allow event_filter_match() to not
- * have to pass task around
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
*/
cpuctx->cgrp = perf_cgroup_from_task(task);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -3671,7 +3674,7 @@ unlock:
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
- vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
return ret;
@@ -4412,7 +4415,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
+ if (cpuctx->unique_pmu != pmu)
goto next;
perf_event_task_ctx(&cpuctx->ctx, task_event);
@@ -4558,7 +4561,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
+ if (cpuctx->unique_pmu != pmu)
goto next;
perf_event_comm_ctx(&cpuctx->ctx, comm_event);
@@ -4754,7 +4757,7 @@ got_name:
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
+ if (cpuctx->unique_pmu != pmu)
goto next;
perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
vma->vm_flags & VM_EXEC);
@@ -5855,8 +5858,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- if (cpuctx->active_pmu == old_pmu)
- cpuctx->active_pmu = pmu;
+ if (cpuctx->unique_pmu == old_pmu)
+ cpuctx->unique_pmu = pmu;
}
}
@@ -5991,7 +5994,7 @@ skip_type:
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
INIT_LIST_HEAD(&cpuctx->rotation_list);
- cpuctx->active_pmu = pmu;
+ cpuctx->unique_pmu = pmu;
}
got_cpu_context:
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 912ef48d28ab..5cc4e7e42e68 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -78,15 +78,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
*/
static atomic_t uprobe_events = ATOMIC_INIT(0);
+/* Have a copy of original instruction */
+#define UPROBE_COPY_INSN 0
+/* Dont run handlers when first register/ last unregister in progress*/
+#define UPROBE_RUN_HANDLER 1
+/* Can skip singlestep */
+#define UPROBE_SKIP_SSTEP 2
+
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
atomic_t ref;
struct rw_semaphore consumer_rwsem;
+ struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
loff_t offset;
- int flags;
+ unsigned long flags;
struct arch_uprobe arch;
};
@@ -100,17 +108,12 @@ struct uprobe {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- if (!vma->vm_file)
- return false;
-
- if (!is_register)
- return true;
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
- if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED))
- == (VM_READ|VM_EXEC))
- return true;
+ if (is_register)
+ flags |= VM_WRITE;
- return false;
+ return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
}
static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
@@ -141,10 +144,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
spinlock_t *ptl;
pte_t *ptep;
int err;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = addr;
+ const unsigned long mmun_end = addr + PAGE_SIZE;
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(page);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
@@ -173,6 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
}
@@ -188,19 +196,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
return *insn == UPROBE_SWBP_INSN;
}
+static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
+ kunmap_atomic(kaddr);
+}
+
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+{
+ uprobe_opcode_t old_opcode;
+ bool is_swbp;
+
+ copy_opcode(page, vaddr, &old_opcode);
+ is_swbp = is_swbp_insn(&old_opcode);
+
+ if (is_swbp_insn(new_opcode)) {
+ if (is_swbp) /* register: already installed? */
+ return 0;
+ } else {
+ if (!is_swbp) /* unregister: was it changed by us? */
+ return 0;
+ }
+
+ return 1;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
- * supported by that architecture then we need to modify read_opcode /
+ * supported by that architecture then we need to modify is_swbp_at_addr and
* write_opcode accordingly. This would never be a problem for archs that
* have fixed length instructions.
*/
/*
* write_opcode - write the opcode at a given virtual address.
- * @auprobe: arch breakpointing information.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
@@ -211,8 +244,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
* For mm @mm, write the opcode at @vaddr.
* Return 0 (success) or a negative errno.
*/
-static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
- unsigned long vaddr, uprobe_opcode_t opcode)
+static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
+ uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
void *vaddr_old, *vaddr_new;
@@ -221,10 +254,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
retry:
/* Read the page with vaddr into memory */
- ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
if (ret <= 0)
return ret;
+ ret = verify_opcode(old_page, vaddr, &opcode);
+ if (ret <= 0)
+ goto put_old;
+
ret = -ENOMEM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (!new_page)
@@ -259,63 +296,6 @@ put_old:
}
/**
- * read_opcode - read the opcode at a given virtual address.
- * @mm: the probed process address space.
- * @vaddr: the virtual address to read the opcode.
- * @opcode: location to store the read opcode.
- *
- * Called with mm->mmap_sem held (for read and with a reference to
- * mm.
- *
- * For mm @mm, read the opcode at @vaddr and store it in @opcode.
- * Return 0 (success) or a negative errno.
- */
-static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
-{
- struct page *page;
- void *vaddr_new;
- int ret;
-
- ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
- if (ret <= 0)
- return ret;
-
- vaddr_new = kmap_atomic(page);
- vaddr &= ~PAGE_MASK;
- memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
- kunmap_atomic(vaddr_new);
-
- put_page(page);
-
- return 0;
-}
-
-static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
-{
- uprobe_opcode_t opcode;
- int result;
-
- if (current->mm == mm) {
- pagefault_disable();
- result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
- sizeof(opcode));
- pagefault_enable();
-
- if (likely(result == 0))
- goto out;
- }
-
- result = read_opcode(mm, vaddr, &opcode);
- if (result)
- return result;
-out:
- if (is_swbp_insn(&opcode))
- return 1;
-
- return 0;
-}
-
-/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
* @mm: the probed process address space.
@@ -326,18 +306,7 @@ out:
*/
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- int result;
- /*
- * See the comment near uprobes_hash().
- */
- result = is_swbp_at_addr(mm, vaddr);
- if (result == 1)
- return 0;
-
- if (result)
- return result;
-
- return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+ return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
}
/**
@@ -352,16 +321,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- int result;
-
- result = is_swbp_at_addr(mm, vaddr);
- if (!result)
- return -EINVAL;
-
- if (result != 1)
- return result;
-
- return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+ return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
}
static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -468,7 +428,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
spin_unlock(&uprobes_treelock);
/* For now assume that the instruction need not be single-stepped */
- uprobe->flags |= UPROBE_SKIP_SSTEP;
+ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
return u;
}
@@ -490,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->inode = igrab(inode);
uprobe->offset = offset;
init_rwsem(&uprobe->consumer_rwsem);
+ mutex_init(&uprobe->copy_mutex);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
@@ -510,7 +471,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
- if (!(uprobe->flags & UPROBE_RUN_HANDLER))
+ if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
return;
down_read(&uprobe->consumer_rwsem);
@@ -616,29 +577,43 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
}
-/*
- * How mm->uprobes_state.count gets updated
- * uprobe_mmap() increments the count if
- * - it successfully adds a breakpoint.
- * - it cannot add a breakpoint, but sees that there is a underlying
- * breakpoint (via a is_swbp_at_addr()).
- *
- * uprobe_munmap() decrements the count if
- * - it sees a underlying breakpoint, (via is_swbp_at_addr)
- * (Subsequent uprobe_unregister wouldnt find the breakpoint
- * unless a uprobe_mmap kicks in, since the old vma would be
- * dropped just after uprobe_munmap.)
- *
- * uprobe_register increments the count if:
- * - it successfully adds a breakpoint.
- *
- * uprobe_unregister decrements the count if:
- * - it sees a underlying breakpoint and removes successfully.
- * (via is_swbp_at_addr)
- * (Subsequent uprobe_munmap wouldnt find the breakpoint
- * since there is no underlying breakpoint after the
- * breakpoint removal.)
- */
+static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
+ struct mm_struct *mm, unsigned long vaddr)
+{
+ int ret = 0;
+
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ return ret;
+
+ mutex_lock(&uprobe->copy_mutex);
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ goto out;
+
+ ret = copy_insn(uprobe, file);
+ if (ret)
+ goto out;
+
+ ret = -ENOTSUPP;
+ if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+ goto out;
+
+ ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
+ if (ret)
+ goto out;
+
+ /* write_opcode() assumes we don't cross page boundary */
+ BUG_ON((uprobe->offset & ~PAGE_MASK) +
+ UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+
+ smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ set_bit(UPROBE_COPY_INSN, &uprobe->flags);
+
+ out:
+ mutex_unlock(&uprobe->copy_mutex);
+
+ return ret;
+}
+
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long vaddr)
@@ -656,24 +631,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
if (!uprobe->consumers)
return 0;
- if (!(uprobe->flags & UPROBE_COPY_INSN)) {
- ret = copy_insn(uprobe, vma->vm_file);
- if (ret)
- return ret;
-
- if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
- return -ENOTSUPP;
-
- ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
- if (ret)
- return ret;
-
- /* write_opcode() assumes we don't cross page boundary */
- BUG_ON((uprobe->offset & ~PAGE_MASK) +
- UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
- uprobe->flags |= UPROBE_COPY_INSN;
- }
+ ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
+ if (ret)
+ return ret;
/*
* set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
@@ -692,15 +652,15 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
-static void
+static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
/* can happen if uprobe_register() fails */
if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
- return;
+ return 0;
set_bit(MMF_RECALC_UPROBES, &mm->flags);
- set_orig_insn(&uprobe->arch, mm, vaddr);
+ return set_orig_insn(&uprobe->arch, mm, vaddr);
}
/*
@@ -735,7 +695,6 @@ static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
unsigned long pgoff = offset >> PAGE_SHIFT;
- struct prio_tree_iter iter;
struct vm_area_struct *vma;
struct map_info *curr = NULL;
struct map_info *prev = NULL;
@@ -744,7 +703,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
again:
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
continue;
@@ -816,7 +775,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
struct mm_struct *mm = info->mm;
struct vm_area_struct *vma;
- if (err)
+ if (err && is_register)
goto free;
down_write(&mm->mmap_sem);
@@ -832,7 +791,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
if (is_register)
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
else
- remove_breakpoint(uprobe, mm, info->vaddr);
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
unlock:
up_write(&mm->mmap_sem);
@@ -889,13 +848,15 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
mutex_lock(uprobes_hash(inode));
uprobe = alloc_uprobe(inode, offset);
- if (uprobe && !consumer_add(uprobe, uc)) {
+ if (!uprobe) {
+ ret = -ENOMEM;
+ } else if (!consumer_add(uprobe, uc)) {
ret = __uprobe_register(uprobe);
if (ret) {
uprobe->consumers = NULL;
__uprobe_unregister(uprobe);
} else {
- uprobe->flags |= UPROBE_RUN_HANDLER;
+ set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
}
}
@@ -928,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
if (consumer_del(uprobe, uc)) {
if (!uprobe->consumers) {
__uprobe_unregister(uprobe);
- uprobe->flags &= ~UPROBE_RUN_HANDLER;
+ clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
}
}
@@ -1389,10 +1350,11 @@ bool uprobe_deny_signal(void)
*/
static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
{
- if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
- return true;
-
- uprobe->flags &= ~UPROBE_SKIP_SSTEP;
+ if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+ return true;
+ clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
+ }
return false;
}
@@ -1415,6 +1377,30 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
clear_bit(MMF_HAS_UPROBES, &mm->flags);
}
+static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+ struct page *page;
+ uprobe_opcode_t opcode;
+ int result;
+
+ pagefault_disable();
+ result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
+ sizeof(opcode));
+ pagefault_enable();
+
+ if (likely(result == 0))
+ goto out;
+
+ result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ if (result < 0)
+ return result;
+
+ copy_opcode(page, vaddr, &opcode);
+ put_page(page);
+ out:
+ return is_swbp_insn(&opcode);
+}
+
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
@@ -1485,38 +1471,41 @@ static void handle_swbp(struct pt_regs *regs)
}
return;
}
+ /*
+ * TODO: move copy_insn/etc into _register and remove this hack.
+ * After we hit the bp, _unregister + _register can install the
+ * new and not-yet-analyzed uprobe at the same address, restart.
+ */
+ smp_rmb(); /* pairs with wmb() in install_breakpoint() */
+ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
+ goto restart;
utask = current->utask;
if (!utask) {
utask = add_utask();
/* Cannot allocate; re-execute the instruction. */
if (!utask)
- goto cleanup_ret;
+ goto restart;
}
- utask->active_uprobe = uprobe;
+
handler_chain(uprobe, regs);
- if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs))
- goto cleanup_ret;
+ if (can_skip_sstep(uprobe, regs))
+ goto out;
- utask->state = UTASK_SSTEP;
if (!pre_ssout(uprobe, regs, bp_vaddr)) {
arch_uprobe_enable_step(&uprobe->arch);
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
return;
}
-cleanup_ret:
- if (utask) {
- utask->active_uprobe = NULL;
- utask->state = UTASK_RUNNING;
- }
- if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
-
- /*
- * cannot singlestep; cannot skip instruction;
- * re-execute the instruction.
- */
- instruction_pointer_set(regs, bp_vaddr);
-
+restart:
+ /*
+ * cannot singlestep; cannot skip instruction;
+ * re-execute the instruction.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+out:
put_uprobe(uprobe);
}
@@ -1548,13 +1537,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
}
/*
- * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on
- * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and
- * allows the thread to return from interrupt.
+ * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
+ * allows the thread to return from interrupt. After that handle_swbp()
+ * sets utask->active_uprobe.
*
- * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and
- * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from
- * interrupt.
+ * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
+ * and allows the thread to return from interrupt.
*
* While returning to userspace, thread notices the TIF_UPROBE flag and calls
* uprobe_notify_resume().
@@ -1563,11 +1551,13 @@ void uprobe_notify_resume(struct pt_regs *regs)
{
struct uprobe_task *utask;
+ clear_thread_flag(TIF_UPROBE);
+
utask = current->utask;
- if (!utask || utask->state == UTASK_BP_HIT)
- handle_swbp(regs);
- else
+ if (utask && utask->active_uprobe)
handle_singlestep(utask, regs);
+ else
+ handle_swbp(regs);
}
/*
@@ -1576,17 +1566,10 @@ void uprobe_notify_resume(struct pt_regs *regs)
*/
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
- struct uprobe_task *utask;
-
if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
return 0;
- utask = current->utask;
- if (utask)
- utask->state = UTASK_BP_HIT;
-
set_thread_flag(TIF_UPROBE);
-
return 1;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index a2b1efc20928..8b20ab7d3aa2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -423,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mapping->i_mmap_writable++;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- vma_prio_tree_add(tmp, mpnt);
+ if (unlikely(tmp->vm_flags & VM_NONLINEAR))
+ vma_nonlinear_insert(tmp,
+ &mapping->i_mmap_nonlinear);
+ else
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -622,26 +627,6 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas--;
- if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
- fput(mm->exe_file);
- mm->exe_file = NULL;
- }
-
-}
-
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
if (new_exe_file)
@@ -649,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
if (mm->exe_file)
fput(mm->exe_file);
mm->exe_file = new_exe_file;
- mm->num_exe_file_vmas = 0;
}
struct file *get_mm_exe_file(struct mm_struct *mm)
{
struct file *exe_file;
- /* We need mmap_sem to protect against races with removal of
- * VM_EXECUTABLE vmas */
+ /* We need mmap_sem to protect against races with removal of exe_file */
down_read(&mm->mmap_sem);
exe_file = mm->exe_file;
if (exe_file)
@@ -1078,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_rwsem(&sig->group_rwsem);
#endif
- sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1602,7 +1584,7 @@ long do_fork(unsigned long clone_flags,
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
- if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+ if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1652,6 +1634,17 @@ long do_fork(unsigned long clone_flags,
return nr;
}
+#ifdef CONFIG_GENERIC_KERNEL_THREAD
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL,
+ (unsigned long)arg, NULL, NULL);
+}
+#endif
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 49a77727db42..4e69e24d3d7d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
* @host_data: Controller private data pointer
*
* Allocates a legacy irq_domain if irq_base is positive or a linear
- * domain otherwise.
+ * domain otherwise. For the legacy domain, IRQ descriptors will also
+ * be allocated.
*
* This is intended to implement the expected behaviour for most
* interrupt controllers which is that a linear mapping should
@@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
const struct irq_domain_ops *ops,
void *host_data)
{
- if (first_irq > 0)
- return irq_domain_add_legacy(of_node, size, first_irq, 0,
+ if (first_irq > 0) {
+ int irq_base;
+
+ if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
+ /*
+ * Set the descriptor allocator to search for a
+ * 1-to-1 mapping, such as irq_alloc_desc_at().
+ * Use of_node_to_nid() which is defined to
+ * numa_node_id() on platforms that have no custom
+ * implementation.
+ */
+ irq_base = irq_alloc_descs(first_irq, first_irq, size,
+ of_node_to_nid(of_node));
+ if (irq_base < 0) {
+ WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
+ first_irq);
+ irq_base = first_irq;
+ }
+ } else
+ irq_base = first_irq;
+
+ return irq_domain_add_legacy(of_node, size, irq_base, 0,
ops, host_data);
- else
- return irq_domain_add_linear(of_node, size, ops, host_data);
+ }
+
+ /* A linear domain is the default */
+ return irq_domain_add_linear(of_node, size, ops, host_data);
}
/**
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 0668d58d6413..5e4bd7864c5d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,6 @@
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <generated/utsrelease.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6f99aead66c6..1c317e386831 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,7 @@
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
+#include <linux/ptrace.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
@@ -221,11 +222,13 @@ static int ____call_usermodehelper(void *data)
retval = kernel_execve(sub_info->path,
(const char *const *)sub_info->argv,
(const char *const *)sub_info->envp);
+ if (!retval)
+ return 0;
/* Exec failed? */
fail:
sub_info->retval = retval;
- return 0;
+ do_exit(0);
}
static int call_helper(void *data)
@@ -292,7 +295,7 @@ static int wait_for_helper(void *data)
}
umh_complete(sub_info);
- return 0;
+ do_exit(0);
}
/* This is run by khelper thread */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 146a6fa96825..29fb60caecb5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,6 +16,7 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
+#include <linux/ptrace.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
new file mode 100644
index 000000000000..4646eb2c3820
--- /dev/null
+++ b/kernel/modsign_pubkey.c
@@ -0,0 +1,113 @@
+/* Public keys for module signature verification
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <keys/asymmetric-type.h>
+#include "module-internal.h"
+
+struct key *modsign_keyring;
+
+extern __initdata const u8 modsign_certificate_list[];
+extern __initdata const u8 modsign_certificate_list_end[];
+asm(".section .init.data,\"aw\"\n"
+ "modsign_certificate_list:\n"
+ ".incbin \"signing_key.x509\"\n"
+ ".incbin \"extra_certificates\"\n"
+ "modsign_certificate_list_end:"
+ );
+
+/*
+ * We need to make sure ccache doesn't cache the .o file as it doesn't notice
+ * if modsign.pub changes.
+ */
+static __initdata const char annoy_ccache[] = __TIME__ "foo";
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int module_verify_init(void)
+{
+ pr_notice("Initialise module verification\n");
+
+ modsign_keyring = key_alloc(&key_type_keyring, ".module_sign",
+ KUIDT_INIT(0), KGIDT_INIT(0),
+ current_cred(),
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(modsign_keyring))
+ panic("Can't allocate module signing keyring\n");
+
+ if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
+ panic("Can't instantiate module signing keyring\n");
+
+ return 0;
+}
+
+/*
+ * Must be initialised before we try and load the keys into the keyring.
+ */
+device_initcall(module_verify_init);
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int load_module_signing_keys(void)
+{
+ key_ref_t key;
+ const u8 *p, *end;
+ size_t plen;
+
+ pr_notice("Loading module verification certificates\n");
+
+ end = modsign_certificate_list_end;
+ p = modsign_certificate_list;
+ while (p < end) {
+ /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+ * than 256 bytes in size.
+ */
+ if (end - p < 4)
+ goto dodgy_cert;
+ if (p[0] != 0x30 &&
+ p[1] != 0x82)
+ goto dodgy_cert;
+ plen = (p[2] << 8) | p[3];
+ plen += 4;
+ if (plen > end - p)
+ goto dodgy_cert;
+
+ key = key_create_or_update(make_key_ref(modsign_keyring, 1),
+ "asymmetric",
+ NULL,
+ p,
+ plen,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(key))
+ pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
+ PTR_ERR(key));
+ else
+ pr_notice("MODSIGN: Loaded cert '%s'\n",
+ key_ref_to_ptr(key)->description);
+ p += plen;
+ }
+
+ return 0;
+
+dodgy_cert:
+ pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
+ return 0;
+}
+late_initcall(load_module_signing_keys);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
new file mode 100644
index 000000000000..24f9247b7d02
--- /dev/null
+++ b/kernel/module-internal.h
@@ -0,0 +1,14 @@
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+extern struct key *modsign_keyring;
+
+extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index 4edbd9c11aca..6085f5ef88ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -58,6 +58,8 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
+#include <linux/fips.h>
+#include "module-internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
@@ -102,6 +104,43 @@ static LIST_HEAD(modules);
struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
#endif /* CONFIG_KGDB_KDB */
+#ifdef CONFIG_MODULE_SIG
+#ifdef CONFIG_MODULE_SIG_FORCE
+static bool sig_enforce = true;
+#else
+static bool sig_enforce = false;
+
+static int param_set_bool_enable_only(const char *val,
+ const struct kernel_param *kp)
+{
+ int err;
+ bool test;
+ struct kernel_param dummy_kp = *kp;
+
+ dummy_kp.arg = &test;
+
+ err = param_set_bool(val, &dummy_kp);
+ if (err)
+ return err;
+
+ /* Don't let them unset it once it's set! */
+ if (!test && sig_enforce)
+ return -EROFS;
+
+ if (test)
+ sig_enforce = true;
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_bool_enable_only = {
+ .set = param_set_bool_enable_only,
+ .get = param_get_bool,
+};
+#define param_check_bool_enable_only param_check_bool
+
+module_param(sig_enforce, bool_enable_only, 0644);
+#endif /* !CONFIG_MODULE_SIG_FORCE */
+#endif /* CONFIG_MODULE_SIG */
/* Block module loading/unloading? */
int modules_disabled = 0;
@@ -136,6 +175,7 @@ struct load_info {
unsigned long symoffs, stroffs;
struct _ddebug *debug;
unsigned int num_debug;
+ bool sig_ok;
struct {
unsigned int sym, str, mod, vers, info, pcpu;
} index;
@@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
return ret;
}
-int __weak apply_relocate(Elf_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- pr_err("module %s: REL relocation unsupported\n", me->name);
- return -ENOEXEC;
-}
-
-int __weak apply_relocate_add(Elf_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- pr_err("module %s: RELA relocation unsupported\n", me->name);
- return -ENOEXEC;
-}
-
static int apply_relocations(struct module *mod, const struct load_info *info)
{
unsigned int i;
@@ -2399,7 +2419,44 @@ static inline void kmemleak_load_module(const struct module *mod,
}
#endif
-/* Sets info->hdr and info->len. */
+#ifdef CONFIG_MODULE_SIG
+static int module_sig_check(struct load_info *info,
+ const void *mod, unsigned long *_len)
+{
+ int err = -ENOKEY;
+ unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ unsigned long len = *_len;
+
+ if (len > markerlen &&
+ memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ *_len -= markerlen;
+ err = mod_verify_sig(mod, _len);
+ }
+
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+
+ /* Not having a signature is only an error if we're strict. */
+ if (err < 0 && fips_enabled)
+ panic("Module verification failed with error %d in FIPS mode\n",
+ err);
+ if (err == -ENOKEY && !sig_enforce)
+ err = 0;
+
+ return err;
+}
+#else /* !CONFIG_MODULE_SIG */
+static int module_sig_check(struct load_info *info,
+ void *mod, unsigned long *len)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+/* Sets info->hdr, info->len and info->sig_ok. */
static int copy_and_check(struct load_info *info,
const void __user *umod, unsigned long len,
const char __user *uargs)
@@ -2419,6 +2476,10 @@ static int copy_and_check(struct load_info *info,
goto free_hdr;
}
+ err = module_sig_check(info, hdr, &len);
+ if (err)
+ goto free_hdr;
+
/* Sanity checks against insmoding binaries or wrong arch,
weird elf version */
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
@@ -2730,6 +2791,10 @@ static int check_module_license_and_versions(struct module *mod)
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ /* lve claims to be GPL but upstream won't provide source */
+ if (strcmp(mod->name, "lve") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2861,6 +2926,20 @@ static int post_relocation(struct module *mod, const struct load_info *info)
return module_finalize(info->hdr, info->sechdrs, mod);
}
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ mutex_lock(&module_mutex);
+ mod = find_module(name);
+ ret = !mod || mod->state != MODULE_STATE_COMING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
static struct module *load_module(void __user *umod,
@@ -2868,7 +2947,7 @@ static struct module *load_module(void __user *umod,
const char __user *uargs)
{
struct load_info info = { NULL, };
- struct module *mod;
+ struct module *mod, *old;
long err;
pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2886,6 +2965,12 @@ static struct module *load_module(void __user *umod,
goto free_copy;
}
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info.sig_ok;
+ if (!mod->sig_ok)
+ add_taint_module(mod, TAINT_FORCED_MODULE);
+#endif
+
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
@@ -2934,8 +3019,18 @@ static struct module *load_module(void __user *umod,
* function to insert in a way safe to concurrent readers.
* The mutex protects against concurrent writers.
*/
+again:
mutex_lock(&module_mutex);
- if (find_module(mod->name)) {
+ if ((old = find_module(mod->name)) != NULL) {
+ if (old->state == MODULE_STATE_COMING) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto free_arch_cleanup;
+ goto again;
+ }
err = -EEXIST;
goto unlock;
}
@@ -2975,7 +3070,7 @@ static struct module *load_module(void __user *umod,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
module_bug_cleanup(mod);
-
+ wake_up_all(&module_wq);
ddebug:
dynamic_debug_remove(info.debug);
unlock:
@@ -3050,7 +3145,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
free_module(mod);
- wake_up(&module_wq);
+ wake_up_all(&module_wq);
return ret;
}
if (ret > 0) {
@@ -3062,9 +3157,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
dump_stack();
}
- /* Now it's a first class citizen! Wake up anyone waiting for it. */
+ /* Now it's a first class citizen! */
mod->state = MODULE_STATE_LIVE;
- wake_up(&module_wq);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);
@@ -3087,6 +3181,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
mod->init_ro_size = 0;
mod->init_text_size = 0;
mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
return 0;
}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
new file mode 100644
index 000000000000..ea1b1df5dbb0
--- /dev/null
+++ b/kernel/module_signing.c
@@ -0,0 +1,249 @@
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <crypto/public_key.h>
+#include <crypto/hash.h>
+#include <keys/asymmetric-type.h>
+#include "module-internal.h"
+
+/*
+ * Module signature information block.
+ *
+ * The constituents of the signature section are, in order:
+ *
+ * - Signer's name
+ * - Key identifier
+ * - Signature data
+ * - Information block
+ */
+struct module_signature {
+ enum pkey_algo algo : 8; /* Public-key crypto algorithm */
+ enum pkey_hash_algo hash : 8; /* Digest algorithm */
+ enum pkey_id_type id_type : 8; /* Key identifier type */
+ u8 signer_len; /* Length of signer's name */
+ u8 key_id_len; /* Length of key identifier */
+ u8 __pad[3];
+ __be32 sig_len; /* Length of signature data */
+};
+
+/*
+ * Digest the module contents.
+ */
+static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
+ const void *mod,
+ unsigned long modlen)
+{
+ struct public_key_signature *pks;
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ size_t digest_size, desc_size;
+ int ret;
+
+ pr_devel("==>%s()\n", __func__);
+
+ /* Allocate the hashing algorithm we're going to need and find out how
+ * big the hash operational data will be.
+ */
+ tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
+ if (IS_ERR(tfm))
+ return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ digest_size = crypto_shash_digestsize(tfm);
+
+ /* We allocate the hash operational data storage on the end of our
+ * context data and the digest output buffer on the end of that.
+ */
+ ret = -ENOMEM;
+ pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
+ if (!pks)
+ goto error_no_pks;
+
+ pks->pkey_hash_algo = hash;
+ pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
+ pks->digest_size = digest_size;
+
+ desc = (void *)pks + sizeof(*pks);
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto error;
+
+ ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
+ if (ret < 0)
+ goto error;
+
+ crypto_free_shash(tfm);
+ pr_devel("<==%s() = ok\n", __func__);
+ return pks;
+
+error:
+ kfree(pks);
+error_no_pks:
+ crypto_free_shash(tfm);
+ pr_devel("<==%s() = %d\n", __func__, ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Extract an MPI array from the signature data. This represents the actual
+ * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
+ * size of the MPI in bytes.
+ *
+ * RSA signatures only have one MPI, so currently we only read one.
+ */
+static int mod_extract_mpi_array(struct public_key_signature *pks,
+ const void *data, size_t len)
+{
+ size_t nbytes;
+ MPI mpi;
+
+ if (len < 3)
+ return -EBADMSG;
+ nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
+ data += 2;
+ len -= 2;
+ if (len != nbytes)
+ return -EBADMSG;
+
+ mpi = mpi_read_raw_data(data, nbytes);
+ if (!mpi)
+ return -ENOMEM;
+ pks->mpi[0] = mpi;
+ pks->nr_mpi = 1;
+ return 0;
+}
+
+/*
+ * Request an asymmetric key.
+ */
+static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
+ const u8 *key_id, size_t key_id_len)
+{
+ key_ref_t key;
+ size_t i;
+ char *id, *q;
+
+ pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
+
+ /* Construct an identifier. */
+ id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
+ if (!id)
+ return ERR_PTR(-ENOKEY);
+
+ memcpy(id, signer, signer_len);
+
+ q = id + signer_len;
+ *q++ = ':';
+ *q++ = ' ';
+ for (i = 0; i < key_id_len; i++) {
+ *q++ = hex_asc[*key_id >> 4];
+ *q++ = hex_asc[*key_id++ & 0x0f];
+ }
+
+ *q = 0;
+
+ pr_debug("Look up: \"%s\"\n", id);
+
+ key = keyring_search(make_key_ref(modsign_keyring, 1),
+ &key_type_asymmetric, id);
+ if (IS_ERR(key))
+ pr_warn("Request for unknown module key '%s' err %ld\n",
+ id, PTR_ERR(key));
+ kfree(id);
+
+ if (IS_ERR(key)) {
+ switch (PTR_ERR(key)) {
+ /* Hide some search errors */
+ case -EACCES:
+ case -ENOTDIR:
+ case -EAGAIN:
+ return ERR_PTR(-ENOKEY);
+ default:
+ return ERR_CAST(key);
+ }
+ }
+
+ pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
+ return key_ref_to_ptr(key);
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, unsigned long *_modlen)
+{
+ struct public_key_signature *pks;
+ struct module_signature ms;
+ struct key *key;
+ const void *sig;
+ size_t modlen = *_modlen, sig_len;
+ int ret;
+
+ pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+ modlen -= sizeof(ms);
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ if (sig_len >= modlen)
+ return -EBADMSG;
+ modlen -= sig_len;
+ if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
+ return -EBADMSG;
+ modlen -= (size_t)ms.signer_len + ms.key_id_len;
+
+ *_modlen = modlen;
+ sig = mod + modlen;
+
+ /* For the moment, only support RSA and X.509 identifiers */
+ if (ms.algo != PKEY_ALGO_RSA ||
+ ms.id_type != PKEY_ID_X509)
+ return -ENOPKG;
+
+ if (ms.hash >= PKEY_HASH__LAST ||
+ !pkey_hash_algo[ms.hash])
+ return -ENOPKG;
+
+ key = request_asymmetric_key(sig, ms.signer_len,
+ sig + ms.signer_len, ms.key_id_len);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ pks = mod_make_digest(ms.hash, mod, modlen);
+ if (IS_ERR(pks)) {
+ ret = PTR_ERR(pks);
+ goto error_put_key;
+ }
+
+ ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
+ sig_len);
+ if (ret < 0)
+ goto error_free_pks;
+
+ ret = verify_signature(key, pks);
+ pr_devel("verify_signature() = %d\n", ret);
+
+error_free_pks:
+ mpi_free(pks->rsa.s);
+ kfree(pks);
+error_put_key:
+ key_put(key);
+ pr_devel("<==%s() = %d\n", __func__, ret);
+ return ret;
+}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 478bad2745e3..7b07cc0dfb75 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -71,12 +71,22 @@ err_alloc:
return NULL;
}
+/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
+#define MAX_PID_NS_LEVEL 32
+
static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
- int i, err = -ENOMEM;
+ int i;
+ int err;
+
+ if (level > MAX_PID_NS_LEVEL) {
+ err = -EINVAL;
+ goto out;
+ }
+ err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
goto out;
@@ -133,19 +143,26 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
return create_pid_namespace(old_ns);
}
-void free_pid_ns(struct kref *kref)
+static void free_pid_ns(struct kref *kref)
{
- struct pid_namespace *ns, *parent;
+ struct pid_namespace *ns;
ns = container_of(kref, struct pid_namespace, kref);
-
- parent = ns->parent;
destroy_pid_namespace(ns);
+}
+
+void put_pid_ns(struct pid_namespace *ns)
+{
+ struct pid_namespace *parent;
- if (parent != NULL)
- put_pid_ns(parent);
+ while (ns != &init_pid_ns) {
+ parent = ns->parent;
+ if (!kref_put(&ns->kref, free_pid_ns))
+ break;
+ ns = parent;
+ }
}
-EXPORT_SYMBOL_GPL(free_pid_ns);
+EXPORT_SYMBOL_GPL(put_pid_ns);
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
diff --git a/kernel/printk.c b/kernel/printk.c
index 66a2ea37b576..2d607f4d1797 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1890,7 +1890,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_ONLINE:
case CPU_DEAD:
- case CPU_DYING:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
console_lock();
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4fb2376ddf06..74df86bd9204 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = #sname, \
}
@@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
raw_spin_unlock_irq(&rnp->lock);
/* Exclude any concurrent CPU-hotplug operations. */
- get_online_cpus();
+ mutex_lock(&rsp->onoff_mutex);
/*
* Set the quiescent-state-needed bits in all the rcu_node
@@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
cond_resched();
}
- put_online_cpus();
+ mutex_unlock(&rsp->onoff_mutex);
return 1;
}
@@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
/* Exclude any attempts to start a new grace period. */
+ mutex_lock(&rsp->onoff_mutex);
raw_spin_lock_irqsave(&rsp->onofflock, flags);
/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
@@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
init_callback_list(rdp);
/* Disallow further callbacks on this CPU. */
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ mutex_unlock(&rsp->onoff_mutex);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
+ /* Exclude new grace periods. */
+ mutex_lock(&rsp->onoff_mutex);
+
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
@@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rcu_prepare_for_idle_init(cpu);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- /*
- * A new grace period might start here. If so, we won't be part
- * of it, but that is OK, as we are currently in a quiescent state.
- */
-
- /* Exclude any attempts to start a new GP on large systems. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
-
/* Add CPU to rcu_node bitmasks. */
rnp = rdp->mynode;
mask = rdp->grpmask;
@@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
rnp = rnp->parent;
} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+ local_irq_restore(flags);
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ mutex_unlock(&rsp->onoff_mutex);
}
static void __cpuinit rcu_prepare_cpu(int cpu)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5faf05d68326..a240f032848e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -394,11 +394,17 @@ struct rcu_state {
struct rcu_head **orphan_donetail; /* Tail of above. */
long qlen_lazy; /* Number of lazy callbacks. */
long qlen; /* Total number of callbacks. */
+ /* End of fields guarded by onofflock. */
+
+ struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
+
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
unsigned long n_barrier_done; /* ++ at start and end of */
/* _rcu_barrier(). */
+ /* End of fields guarded by barrier_mutex. */
+
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
unsigned long n_force_qs; /* Number of calls to */
diff --git a/kernel/resource.c b/kernel/resource.c
index 34d45886ee84..73f35d4b30b9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root,
struct resource *parent = root;
struct resource *conflict;
struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+ struct resource *next_res = NULL;
if (!res)
return;
@@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root,
res->end = end;
res->flags = IORESOURCE_BUSY;
- conflict = __request_resource(parent, res);
- if (!conflict)
- return;
+ while (1) {
- /* failed, split and try again */
- kfree(res);
+ conflict = __request_resource(parent, res);
+ if (!conflict) {
+ if (!next_res)
+ break;
+ res = next_res;
+ next_res = NULL;
+ continue;
+ }
- /* conflict covered whole area */
- if (conflict->start <= start && conflict->end >= end)
- return;
+ /* conflict covered whole area */
+ if (conflict->start <= res->start &&
+ conflict->end >= res->end) {
+ kfree(res);
+ WARN_ON(next_res);
+ break;
+ }
+
+ /* failed, split and try again */
+ if (conflict->start > res->start) {
+ end = res->end;
+ res->end = conflict->start - 1;
+ if (conflict->end < end) {
+ next_res = kzalloc(sizeof(*next_res),
+ GFP_ATOMIC);
+ if (!next_res) {
+ kfree(res);
+ break;
+ }
+ next_res->name = name;
+ next_res->start = conflict->end + 1;
+ next_res->end = end;
+ next_res->flags = IORESOURCE_BUSY;
+ }
+ } else {
+ res->start = conflict->end + 1;
+ }
+ }
- if (conflict->start > start)
- __reserve_region_with_split(root, start, conflict->start-1, name);
- if (conflict->end < end)
- __reserve_region_with_split(root, conflict->end+1, end, name);
}
void __init reserve_region_with_split(struct resource *root,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c17747236438..2d8927fda712 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -505,7 +505,7 @@ static inline void init_hrtick(void)
#ifdef CONFIG_SMP
#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#define tsk_is_polling(t) 0
#endif
void resched_task(struct task_struct *p)
@@ -6122,6 +6122,17 @@ static void sched_init_numa(void)
* numbers.
*/
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
if (!sched_domains_numa_masks)
return;
@@ -6176,11 +6187,68 @@ static void sched_init_numa(void)
}
sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+ int i, j;
+ int node = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+ int i, j;
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ sched_domains_numa_masks_set(cpu);
+ break;
+
+ case CPU_DEAD:
+ sched_domains_numa_masks_clear(cpu);
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
}
#else
static inline void sched_init_numa(void)
{
}
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ return 0;
+}
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6629,6 +6697,7 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
+ hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
diff --git a/kernel/signal.c b/kernel/signal.c
index 2c681f11b7d2..0af8868525d6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
+#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
@@ -2359,7 +2360,7 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(info->si_signo, info->si_signo, regs);
+ do_coredump(info, regs);
}
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index f9492284e5d2..e6e0ece5f6a0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
+ disable_nonboot_cpus();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
else
@@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem);
* Work around broken programs that cannot handle "Linux 3.0".
* Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
*/
-static int override_release(char __user *release, int len)
+static int override_release(char __user *release, size_t len)
{
int ret = 0;
- char buf[65];
if (current->personality & UNAME26) {
- char *rest = UTS_RELEASE;
+ const char *rest = UTS_RELEASE;
+ char buf[65] = { 0 };
int ndots = 0;
unsigned v;
+ size_t copy;
while (*rest) {
if (*rest == '.' && ++ndots >= 3)
@@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len)
rest++;
}
v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
- snprintf(buf, len, "2.6.%u%s", v, rest);
- ret = copy_to_user(release, buf, len);
+ copy = clamp_t(size_t, len, 1, sizeof(buf));
+ copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
+ ret = copy_to_user(release, buf, copy + 1);
}
return ret;
}
@@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void)
return -ENOMEM;
}
- ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
+ ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
NULL, argv_cleanup, NULL);
if (ret == -ENOMEM)
argv_free(argv);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 84c76a34e41c..26f65eaa01f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,10 +97,12 @@
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern int max_threads;
-extern int core_uses_pid;
extern int suid_dumpable;
+#ifdef CONFIG_COREDUMP
+extern int core_uses_pid;
extern char core_pattern[];
extern unsigned int core_pipe_limit;
+#endif
extern int pid_max;
extern int min_free_kbytes;
extern int pid_max_min, pid_max_max;
@@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_COREDUMP
static int proc_dostring_coredump(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
@@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_COREDUMP
{
.procname = "core_uses_pid",
.data = &core_uses_pid,
@@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
@@ -1543,8 +1549,7 @@ static struct ctl_table fs_table[] = {
};
static struct ctl_table debug_table[] = {
-#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
- defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64)
+#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
{
.procname = "exception-trace",
.data = &show_unhandled_signals,
@@ -2036,12 +2041,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
static void validate_coredump_safety(void)
{
+#ifdef CONFIG_COREDUMP
if (suid_dumpable == SUID_DUMPABLE_SAFE &&
core_pattern[0] != '/' && core_pattern[0] != '|') {
printk(KERN_WARNING "Unsafe core_pattern used with "\
"suid_dumpable=2. Pipe handler or fully qualified "\
"core dump path required.\n");
}
+#endif
}
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
@@ -2053,6 +2060,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
return error;
}
+#ifdef CONFIG_COREDUMP
static int proc_dostring_coredump(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2061,6 +2069,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
validate_coredump_safety();
return error;
}
+#endif
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
void __user *buffer,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 610f0838d555..145bb4d3bd4d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -445,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
sizeof(struct cgroupstats));
if (na == NULL) {
+ nlmsg_free(rep_skb);
rc = -EMSGSIZE;
goto err;
}
diff --git a/kernel/time.c b/kernel/time.c
index ba744cf80696..d226c6a3fd28 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -30,7 +30,7 @@
#include <linux/export.h>
#include <linux/timex.h>
#include <linux/capability.h>
-#include <linux/clocksource.h>
+#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fd42bd452b75..8601f0db1261 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA
config GENERIC_TIME_VSYSCALL
bool
+# Timekeeping vsyscall support
+config GENERIC_TIME_VSYSCALL_OLD
+ bool
+
# ktime_t scalar 64bit nsec representation
config KTIME_SCALAR
bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index aa27d391bfc8..f11d83b12949 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,7 +37,6 @@
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
- struct hrtimer timer;
ktime_t (*gettime)(void);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
@@ -46,6 +45,8 @@ static struct alarm_base {
static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
+static struct wakeup_source *ws;
+
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
@@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }
* @base: pointer to the base where the timer is being run
* @alarm: pointer to alarm being enqueued.
*
- * Adds alarm to a alarm_base timerqueue and if necessary sets
- * an hrtimer to run.
+ * Adds alarm to a alarm_base timerqueue
*
* Must hold base->lock when calling.
*/
static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
{
+ if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
+ timerqueue_del(&base->timerqueue, &alarm->node);
+
timerqueue_add(&base->timerqueue, &alarm->node);
alarm->state |= ALARMTIMER_STATE_ENQUEUED;
-
- if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
- hrtimer_try_to_cancel(&base->timer);
- hrtimer_start(&base->timer, alarm->node.expires,
- HRTIMER_MODE_ABS);
- }
}
/**
- * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
+ * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
* @base: pointer to the base where the timer is running
* @alarm: pointer to alarm being removed
*
- * Removes alarm to a alarm_base timerqueue and if necessary sets
- * a new timer to run.
+ * Removes alarm to a alarm_base timerqueue
*
* Must hold base->lock when calling.
*/
-static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
+static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
{
- struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
-
if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
return;
timerqueue_del(&base->timerqueue, &alarm->node);
alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
-
- if (next == &alarm->node) {
- hrtimer_try_to_cancel(&base->timer);
- next = timerqueue_getnext(&base->timerqueue);
- if (!next)
- return;
- hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
- }
}
@@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
*/
static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
- struct alarm_base *base = container_of(timer, struct alarm_base, timer);
- struct timerqueue_node *next;
+ struct alarm *alarm = container_of(timer, struct alarm, timer);
+ struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- ktime_t now;
int ret = HRTIMER_NORESTART;
int restart = ALARMTIMER_NORESTART;
spin_lock_irqsave(&base->lock, flags);
- now = base->gettime();
- while ((next = timerqueue_getnext(&base->timerqueue))) {
- struct alarm *alarm;
- ktime_t expired = next->expires;
-
- if (expired.tv64 > now.tv64)
- break;
-
- alarm = container_of(next, struct alarm, node);
-
- timerqueue_del(&base->timerqueue, &alarm->node);
- alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
-
- alarm->state |= ALARMTIMER_STATE_CALLBACK;
- spin_unlock_irqrestore(&base->lock, flags);
- if (alarm->function)
- restart = alarm->function(alarm, now);
- spin_lock_irqsave(&base->lock, flags);
- alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
+ alarmtimer_dequeue(base, alarm);
+ spin_unlock_irqrestore(&base->lock, flags);
- if (restart != ALARMTIMER_NORESTART) {
- timerqueue_add(&base->timerqueue, &alarm->node);
- alarm->state |= ALARMTIMER_STATE_ENQUEUED;
- }
- }
+ if (alarm->function)
+ restart = alarm->function(alarm, base->gettime());
- if (next) {
- hrtimer_set_expires(&base->timer, next->expires);
+ spin_lock_irqsave(&base->lock, flags);
+ if (restart != ALARMTIMER_NORESTART) {
+ hrtimer_set_expires(&alarm->timer, alarm->node.expires);
+ alarmtimer_enqueue(base, alarm);
ret = HRTIMER_RESTART;
}
spin_unlock_irqrestore(&base->lock, flags);
@@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)
unsigned long flags;
struct rtc_device *rtc;
int i;
+ int ret;
spin_lock_irqsave(&freezer_delta_lock, flags);
min = freezer_delta;
@@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)
if (min.tv64 == 0)
return 0;
- /* XXX - Should we enforce a minimum sleep time? */
- WARN_ON(min.tv64 < NSEC_PER_SEC);
+ if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ return -EBUSY;
+ }
/* Setup an rtc timer to fire that far in the future */
rtc_timer_cancel(rtc, &rtctimer);
@@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)
now = rtc_tm_to_ktime(tm);
now = ktime_add(now, min);
- rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
-
- return 0;
+ /* Set alarm, if in the past reject suspend briefly to handle */
+ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ if (ret < 0)
+ __pm_wakeup_event(ws, MSEC_PER_SEC);
+ return ret;
}
#else
static int alarmtimer_suspend(struct device *dev)
@@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
{
timerqueue_init(&alarm->node);
+ hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
+ HRTIMER_MODE_ABS);
+ alarm->timer.function = alarmtimer_fired;
alarm->function = function;
alarm->type = type;
alarm->state = ALARMTIMER_STATE_INACTIVE;
@@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
* @alarm: ptr to alarm to set
* @start: time to run the alarm
*/
-void alarm_start(struct alarm *alarm, ktime_t start)
+int alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
+ int ret;
spin_lock_irqsave(&base->lock, flags);
- if (alarmtimer_active(alarm))
- alarmtimer_remove(base, alarm);
alarm->node.expires = start;
alarmtimer_enqueue(base, alarm);
+ ret = hrtimer_start(&alarm->timer, alarm->node.expires,
+ HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
+ return ret;
}
/**
@@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- int ret = -1;
- spin_lock_irqsave(&base->lock, flags);
-
- if (alarmtimer_callback_running(alarm))
- goto out;
+ int ret;
- if (alarmtimer_is_queued(alarm)) {
- alarmtimer_remove(base, alarm);
- ret = 1;
- } else
- ret = 0;
-out:
+ spin_lock_irqsave(&base->lock, flags);
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
@@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
- hrtimer_init(&alarm_bases[i].timer,
- alarm_bases[i].base_clockid,
- HRTIMER_MODE_ABS);
- alarm_bases[i].timer.function = alarmtimer_fired;
}
error = alarmtimer_rtc_interface_setup();
@@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)
error = PTR_ERR(pdev);
goto out_drv;
}
+ ws = wakeup_source_register("alarmtimer");
return 0;
out_drv:
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 46da0537c10b..6629bf7b5285 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
* requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
-#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ))
+#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
* conversion, the .shift value could be zero. However
@@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)
{
return &clocksource_jiffies;
}
+
+struct clocksource refined_jiffies;
+
+int register_refined_jiffies(long cycles_per_second)
+{
+ u64 nsec_per_tick, shift_hz;
+ long cycles_per_tick;
+
+
+
+ refined_jiffies = clocksource_jiffies;
+ refined_jiffies.name = "refined-jiffies";
+ refined_jiffies.rating++;
+
+ /* Calc cycles per tick */
+ cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
+ /* shift_hz stores hz<<8 for extra accuracy */
+ shift_hz = (u64)cycles_per_second << 8;
+ shift_hz += cycles_per_tick/2;
+ do_div(shift_hz, cycles_per_tick);
+ /* Calculate nsec_per_tick using shift_hz */
+ nsec_per_tick = (u64)NSEC_PER_SEC << 8;
+ nsec_per_tick += (u32)shift_hz/2;
+ do_div(nsec_per_tick, (u32)shift_hz);
+
+ refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
+
+ clocksource_register(&refined_jiffies);
+ return 0;
+}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f423bdd035c2..a40260885265 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog();
- if (idle_cpu(cpu))
+ if (is_idle_task(current))
ts->idle_jiffies++;
}
update_process_times(user_mode(regs));
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5ce06a3fa91e..e424970bb562 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,6 +8,7 @@
*
*/
+#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
@@ -21,61 +22,6 @@
#include <linux/tick.h>
#include <linux/stop_machine.h>
-/* Structure holding internal timekeeping values. */
-struct timekeeper {
- /* Current clocksource used for timekeeping. */
- struct clocksource *clock;
- /* NTP adjusted clock multiplier */
- u32 mult;
- /* The shift value of the current clocksource. */
- u32 shift;
- /* Number of clock cycles in one NTP interval. */
- cycle_t cycle_interval;
- /* Number of clock shifted nano seconds in one NTP interval. */
- u64 xtime_interval;
- /* shifted nano seconds left over when rounding cycle_interval */
- s64 xtime_remainder;
- /* Raw nano seconds accumulated per NTP interval. */
- u32 raw_interval;
-
- /* Current CLOCK_REALTIME time in seconds */
- u64 xtime_sec;
- /* Clock shifted nano seconds */
- u64 xtime_nsec;
-
- /* Difference between accumulated time and NTP time in ntp
- * shifted nano seconds. */
- s64 ntp_error;
- /* Shift conversion between clock shifted nano seconds and
- * ntp shifted nano seconds. */
- u32 ntp_error_shift;
-
- /*
- * wall_to_monotonic is what we need to add to xtime (or xtime corrected
- * for sub jiffie times) to get to monotonic time. Monotonic is pegged
- * at zero at system boot time, so wall_to_monotonic will be negative,
- * however, we will ALWAYS keep the tv_nsec part positive so we can use
- * the usual normalization.
- *
- * wall_to_monotonic is moved after resume from suspend for the
- * monotonic time not to jump. We need to add total_sleep_time to
- * wall_to_monotonic to get the real boot based time offset.
- *
- * - wall_to_monotonic is no longer the boot time, getboottime must be
- * used instead.
- */
- struct timespec wall_to_monotonic;
- /* Offset clock monotonic -> clock realtime */
- ktime_t offs_real;
- /* time spent in suspend */
- struct timespec total_sleep_time;
- /* Offset clock monotonic -> clock boottime */
- ktime_t offs_boot;
- /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
- struct timespec raw_time;
- /* Seqlock for all timekeeper values */
- seqlock_t lock;
-};
static struct timekeeper timekeeper;
@@ -96,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
}
}
-static struct timespec tk_xtime(struct timekeeper *tk)
-{
- struct timespec ts;
-
- ts.tv_sec = tk->xtime_sec;
- ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
- return ts;
-}
-
static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
{
tk->xtime_sec = ts->tv_sec;
@@ -246,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
/* must hold write on timekeeper.lock */
static void timekeeping_update(struct timekeeper *tk, bool clearntp)
{
- struct timespec xt;
-
if (clearntp) {
tk->ntp_error = 0;
ntp_clear();
}
- xt = tk_xtime(tk);
- update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
+ update_vsyscall(tk);
}
/**
@@ -1113,7 +1047,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = tk->raw_interval << shift;
+ raw_nsecs = (u64)tk->raw_interval << shift;
raw_nsecs += tk->raw_time.tv_nsec;
if (raw_nsecs >= NSEC_PER_SEC) {
u64 raw_secs = raw_nsecs;
@@ -1130,6 +1064,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
return offset;
}
+#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
+static inline void old_vsyscall_fixup(struct timekeeper *tk)
+{
+ s64 remainder;
+
+ /*
+ * Store only full nanoseconds into xtime_nsec after rounding
+ * it up and add the remainder to the error difference.
+ * XXX - This is necessary to avoid small 1ns inconsistnecies caused
+ * by truncating the remainder in vsyscalls. However, it causes
+ * additional work to be done in timekeeping_adjust(). Once
+ * the vsyscall implementations are converted to use xtime_nsec
+ * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
+ * users are removed, this can be killed.
+ */
+ remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
+ tk->xtime_nsec -= remainder;
+ tk->xtime_nsec += 1ULL << tk->shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
+
+}
+#else
+#define old_vsyscall_fixup(tk)
+#endif
+
+
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -1141,7 +1102,6 @@ static void update_wall_time(void)
cycle_t offset;
int shift = 0, maxshift;
unsigned long flags;
- s64 remainder;
write_seqlock_irqsave(&tk->lock, flags);
@@ -1183,20 +1143,11 @@ static void update_wall_time(void)
/* correct the clock when NTP error is too big */
timekeeping_adjust(tk, offset);
-
/*
- * Store only full nanoseconds into xtime_nsec after rounding
- * it up and add the remainder to the error difference.
- * XXX - This is necessary to avoid small 1ns inconsistnecies caused
- * by truncating the remainder in vsyscalls. However, it causes
- * additional work to be done in timekeeping_adjust(). Once
- * the vsyscall implementations are converted to use xtime_nsec
- * (shifted nanoseconds), this can be killed.
- */
- remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
- tk->xtime_nsec -= remainder;
- tk->xtime_nsec += 1ULL << tk->shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
+ * XXX This can be killed once everyone converts
+ * to the new update_vsyscall.
+ */
+ old_vsyscall_fixup(tk);
/*
* Finally, make sure that after the rounding
diff --git a/kernel/timer.c b/kernel/timer.c
index d5de1b2292aa..367d00858482 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64);
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
+#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
struct tvec {
struct list_head vec[TVN_SIZE];
@@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
} else {
int i;
- /* If the timeout is larger than 0xffffffff on 64-bit
- * architectures then we use the maximum timeout:
+ /* If the timeout is larger than MAX_TVAL (on 64-bit
+ * architectures or with CONFIG_BASE_SMALL=1) then we
+ * use the maximum timeout.
*/
- if (idx > 0xffffffffUL) {
- idx = 0xffffffffUL;
+ if (idx > MAX_TVAL) {
+ idx = MAX_TVAL;
expires = idx + base->timer_jiffies;
}
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b32ed0e385a5..b979426d16c6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1567,6 +1567,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
put_online_cpus();
} else {
+ /* Make sure this CPU has been intitialized */
+ if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
+ goto out;
+
cpu_buffer = buffer->buffers[cpu_id];
if (nr_pages == cpu_buffer->nr_pages)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cdcb59450b49..31e4f55773f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4200,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
buf->private = 0;
}
-static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- return 1;
-}
-
static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -4221,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = buffer_pipe_buf_steal,
+ .steal = generic_pipe_buf_steal,
.get = buffer_pipe_buf_get,
};
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 483162a9f908..507a7a9630bf 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,7 +13,6 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
-#include <linux/pstore.h>
#include <linux/fs.h>
#include "trace.h"
@@ -76,10 +75,9 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
}
-/* Our two options */
+/* Our option */
enum {
TRACE_FUNC_OPT_STACK = 0x1,
- TRACE_FUNC_OPT_PSTORE = 0x2,
};
static struct tracer_flags func_flags;
@@ -109,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
- /*
- * So far tracing doesn't support multiple buffers, so
- * we make an explicit call for now.
- */
- if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
- pstore_ftrace_call(ip, parent_ip);
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
}
@@ -181,9 +173,6 @@ static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
-#ifdef CONFIG_PSTORE_FTRACE
- { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
-#endif
{ } /* Always set a last empty entry */
};
@@ -236,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
}
break;
- case TRACE_FUNC_OPT_PSTORE:
- break;
default:
return -EINVAL;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d951daa0ca9a..042d221d33cc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2982,7 +2982,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
local_irq_restore(flags);
- return true;
+ return ret;
}
EXPORT_SYMBOL(cancel_delayed_work);